1 /*
2 * Copyright (c) 1995-2021 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * Copyright (c) 1989, 1993
30 * The Regents of the University of California. All rights reserved.
31 * (c) UNIX System Laboratories, Inc.
32 * All or some portions of this file are derived from material licensed
33 * to the University of California by American Telephone and Telegraph
34 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
35 * the permission of UNIX System Laboratories, Inc.
36 *
37 * Redistribution and use in source and binary forms, with or without
38 * modification, are permitted provided that the following conditions
39 * are met:
40 * 1. Redistributions of source code must retain the above copyright
41 * notice, this list of conditions and the following disclaimer.
42 * 2. Redistributions in binary form must reproduce the above copyright
43 * notice, this list of conditions and the following disclaimer in the
44 * documentation and/or other materials provided with the distribution.
45 * 3. All advertising materials mentioning features or use of this software
46 * must display the following acknowledgement:
47 * This product includes software developed by the University of
48 * California, Berkeley and its contributors.
49 * 4. Neither the name of the University nor the names of its contributors
50 * may be used to endorse or promote products derived from this software
51 * without specific prior written permission.
52 *
53 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63 * SUCH DAMAGE.
64 *
65 * @(#)vfs_syscalls.c 8.41 (Berkeley) 6/15/95
66 */
67 /*
68 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
69 * support for mandatory and extensible security protections. This notice
70 * is included in support of clause 2.2 (b) of the Apple Public License,
71 * Version 2.0.
72 */
73
74 #include <sys/param.h>
75 #include <sys/systm.h>
76 #include <sys/namei.h>
77 #include <sys/filedesc.h>
78 #include <sys/kernel.h>
79 #include <sys/file_internal.h>
80 #include <sys/stat.h>
81 #include <sys/vnode_internal.h>
82 #include <sys/mount_internal.h>
83 #include <sys/proc_internal.h>
84 #include <sys/kauth.h>
85 #include <sys/uio_internal.h>
86 #include <kern/kalloc.h>
87 #include <sys/mman.h>
88 #include <sys/dirent.h>
89 #include <sys/attr.h>
90 #include <sys/sysctl.h>
91 #include <sys/ubc.h>
92 #include <sys/quota.h>
93 #include <sys/kdebug.h>
94 #include <sys/fsevents.h>
95 #include <sys/imgsrc.h>
96 #include <sys/sysproto.h>
97 #include <sys/sysctl.h>
98 #include <sys/xattr.h>
99 #include <sys/fcntl.h>
100 #include <sys/stdio.h>
101 #include <sys/fsctl.h>
102 #include <sys/ubc_internal.h>
103 #include <sys/disk.h>
104 #include <sys/content_protection.h>
105 #include <sys/clonefile.h>
106 #include <sys/snapshot.h>
107 #include <sys/priv.h>
108 #include <sys/fsgetpath.h>
109 #include <machine/cons.h>
110 #include <machine/limits.h>
111 #include <miscfs/specfs/specdev.h>
112
113 #include <vfs/vfs_disk_conditioner.h>
114
115 #include <security/audit/audit.h>
116 #include <bsm/audit_kevents.h>
117
118 #include <mach/mach_types.h>
119 #include <kern/kern_types.h>
120 #include <kern/kalloc.h>
121 #include <kern/task.h>
122
123 #include <vm/vm_pageout.h>
124 #include <vm/vm_protos.h>
125
126 #include <libkern/OSAtomic.h>
127 #include <os/atomic_private.h>
128 #include <pexpert/pexpert.h>
129 #include <IOKit/IOBSD.h>
130
131 // deps for MIG call
132 #include <kern/host.h>
133 #include <kern/ipc_misc.h>
134 #include <mach/host_priv.h>
135 #include <mach/vfs_nspace.h>
136 #include <os/log.h>
137
138 #include <nfs/nfs_conf.h>
139
140 #if ROUTEFS
141 #include <miscfs/routefs/routefs.h>
142 #endif /* ROUTEFS */
143
144 #if CONFIG_MACF
145 #include <security/mac.h>
146 #include <security/mac_framework.h>
147 #endif
148
149 #if CONFIG_FSE
150 #define GET_PATH(x) \
151 ((x) = get_pathbuff())
152 #define RELEASE_PATH(x) \
153 release_pathbuff(x)
154 #else
155 #define GET_PATH(x) \
156 ((x) = zalloc(ZV_NAMEI))
157 #define RELEASE_PATH(x) \
158 zfree(ZV_NAMEI, x)
159 #endif /* CONFIG_FSE */
160
161 #ifndef HFS_GET_BOOT_INFO
162 #define HFS_GET_BOOT_INFO (FCNTL_FS_SPECIFIC_BASE + 0x00004)
163 #endif
164
165 #ifndef HFS_SET_BOOT_INFO
166 #define HFS_SET_BOOT_INFO (FCNTL_FS_SPECIFIC_BASE + 0x00005)
167 #endif
168
169 #ifndef APFSIOC_REVERT_TO_SNAPSHOT
170 #define APFSIOC_REVERT_TO_SNAPSHOT _IOW('J', 1, u_int64_t)
171 #endif
172
173 extern void disk_conditioner_unmount(mount_t mp);
174
175 /* struct for checkdirs iteration */
176 struct cdirargs {
177 vnode_t olddp;
178 vnode_t newdp;
179 };
180 /* callback for checkdirs iteration */
181 static int checkdirs_callback(proc_t p, void * arg);
182
183 static int change_dir(struct nameidata *ndp, vfs_context_t ctx);
184 static int checkdirs(vnode_t olddp, vfs_context_t ctx);
185 void enablequotas(struct mount *mp, vfs_context_t ctx);
186 static int getfsstat_callback(mount_t mp, void * arg);
187 static int getutimes(user_addr_t usrtvp, struct timespec *tsp);
188 static int setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts, int nullflag);
189 static int sync_callback(mount_t, void *);
190 static int munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
191 user_addr_t bufp, int *sizep, boolean_t is_64_bit,
192 boolean_t partial_copy);
193 static int fsync_common(proc_t p, struct fsync_args *uap, int flags);
194 static int mount_common(const char *fstypename, vnode_t pvp, vnode_t vp,
195 struct componentname *cnp, user_addr_t fsmountargs,
196 int flags, uint32_t internal_flags, char *labelstr, vfs_context_t ctx);
197 void vfs_notify_mount(vnode_t pdvp);
198
199 int prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, uint32_t internal_flags);
200
201 struct fd_vn_data * fg_vn_data_alloc(void);
202
203 /*
204 * Max retries for ENOENT returns from vn_authorize_{rmdir, unlink, rename}
205 * Concurrent lookups (or lookups by ids) on hard links can cause the
206 * vn_getpath (which does not re-enter the filesystem as vn_getpath_fsenter
207 * does) to return ENOENT as the path cannot be returned from the name cache
208 * alone. We have no option but to retry and hope to get one namei->reverse path
209 * generation done without an intervening lookup, lookup by id on the hard link
210 * item. This is only an issue for MAC hooks which cannot reenter the filesystem
211 * which currently are the MAC hooks for rename, unlink and rmdir.
212 */
213 #define MAX_AUTHORIZE_ENOENT_RETRIES 1024
214
215 /* Max retry limit for rename due to vnode recycling. */
216 #define MAX_RENAME_ERECYCLE_RETRIES 1024
217
218 static int rmdirat_internal(vfs_context_t, int, user_addr_t, enum uio_seg,
219 int unlink_flags);
220
221 #ifdef CONFIG_IMGSRC_ACCESS
222 static int authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx);
223 static int place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx);
224 static void undo_place_on_covered_vp(mount_t mp, vnode_t vp);
225 static int mount_begin_update(mount_t mp, vfs_context_t ctx, int flags);
226 static void mount_end_update(mount_t mp);
227 static int relocate_imageboot_source(vnode_t pvp, vnode_t vp, struct componentname *cnp, const char *fsname, vfs_context_t ctx, boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index);
228 #endif /* CONFIG_IMGSRC_ACCESS */
229
230 //snapshot functions
231 #if CONFIG_MNT_ROOTSNAP
232 static int __attribute__ ((noinline)) snapshot_root(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx);
233 #else
234 static int __attribute__ ((noinline)) snapshot_root(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx) __attribute__((unused));
235 #endif
236
237 __private_extern__
238 int sync_internal(void);
239
240 __private_extern__
241 int unlink1(vfs_context_t, vnode_t, user_addr_t, enum uio_seg, int);
242
243 static LCK_GRP_DECLARE(fd_vn_lck_grp, "fd_vnode_data");
244 static LCK_ATTR_DECLARE(fd_vn_lck_attr, 0, 0);
245
246 /* vars for sync mutex */
247 static LCK_GRP_DECLARE(sync_mtx_lck_grp, "sync thread");
248 static LCK_MTX_DECLARE(sync_mtx_lck, &sync_mtx_lck_grp);
249
250 extern lck_rw_t rootvnode_rw_lock;
251
252 VFS_SMR_DECLARE;
253 extern uint32_t nc_smr_enabled;
254
255 /*
256 * incremented each time a mount or unmount operation occurs
257 * used to invalidate the cached value of the rootvp in the
258 * mount structure utilized by cache_lookup_path
259 */
260 uint32_t mount_generation = 0;
261
262 /* counts number of mount and unmount operations */
263 unsigned int vfs_nummntops = 0;
264
265 /* system-wide, per-boot unique mount ID */
266 static _Atomic uint64_t mount_unique_id = 1;
267
268 extern const struct fileops vnops;
269 #if CONFIG_APPLEDOUBLE
270 extern errno_t rmdir_remove_orphaned_appleDouble(vnode_t, vfs_context_t, int *);
271 #endif /* CONFIG_APPLEDOUBLE */
272
273 /* Maximum buffer length supported by fsgetpath(2) */
274 #define FSGETPATH_MAXBUFLEN 8192
275
276 /*
277 * Virtual File System System Calls
278 */
279
280 /*
281 * Private in-kernel mounting spi (specific use-cases only)
282 */
283 boolean_t
vfs_iskernelmount(mount_t mp)284 vfs_iskernelmount(mount_t mp)
285 {
286 return (mp->mnt_kern_flag & MNTK_KERNEL_MOUNT) ? TRUE : FALSE;
287 }
288
/*
 * kernel_mount: in-kernel entry point for mounting a filesystem.
 *
 * Parameters:	fstype		filesystem type name
 *		pvp		parent of the vnode to cover (only used when
 *				vp is supplied by the caller)
 *		vp		vnode to cover; NULLVP means "look up `path`
 *				via namei() and use the result"
 *		path		mount-on path (kernel address space)
 *		data		filesystem-specific mount arguments
 *		datalen		(unused)
 *		syscall_flags	generic MNT_* mount flags
 *		kern_flags	KERNEL_MOUNT_* flags (sanitized below)
 *		ctx		caller's VFS context
 *
 * Returns:	0		Success
 *		!0		errno from namei() or mount_common()
 */
__private_extern__
int
kernel_mount(const char *fstype, vnode_t pvp, vnode_t vp, const char *path,
    void *data, __unused size_t datalen, int syscall_flags, uint32_t kern_flags,
    vfs_context_t ctx)
{
	struct nameidata nd;
	boolean_t did_namei;
	int error;

	NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(path), ctx);

	/* Restrict callers to the kernel-mount flag bits that may be passed in. */
	kern_flags &= KERNEL_MOUNT_SANITIZE_MASK;

	/*
	 * Get the vnode to be covered if it's not supplied
	 */
	if (vp == NULLVP) {
		error = namei(&nd);
		if (error) {
			/* Log lookup failures for snapshot / by-role mounts to aid triage. */
			if (kern_flags & (KERNEL_MOUNT_SNAPSHOT | KERNEL_MOUNT_VOLBYROLE_MASK)) {
				printf("failed to locate mount-on path: %s ", path);
			}
			return error;
		}
		/* namei() returned iocounted vp and dvp; released below via did_namei. */
		vp = nd.ni_vp;
		pvp = nd.ni_dvp;
		did_namei = TRUE;
	} else {
		/*
		 * Caller supplied the (iocounted) vnodes; just fill in the
		 * component name so mount_common() has the path string.
		 */
		char *pnbuf = CAST_DOWN(char *, path);

		nd.ni_cnd.cn_pnbuf = pnbuf;
		nd.ni_cnd.cn_pnlen = (int)(strlen(pnbuf) + 1);
		did_namei = FALSE;
	}

	/* Mark this as a kernel-initiated mount for mount_common(). */
	kern_flags |= KERNEL_MOUNT_KMOUNT;
	error = mount_common(fstype, pvp, vp, &nd.ni_cnd, CAST_USER_ADDR_T(data),
	    syscall_flags, kern_flags, NULL, ctx);

	/* Only drop iocounts / finish the lookup if we took them ourselves. */
	if (did_namei) {
		vnode_put(vp);
		vnode_put(pvp);
		nameidone(&nd);
	}

	return error;
}
338
339 int
vfs_mount_at_path(const char * fstype,const char * path,vnode_t pvp,vnode_t vp,void * data,size_t datalen,int mnt_flags,int flags)340 vfs_mount_at_path(const char *fstype, const char *path,
341 vnode_t pvp, vnode_t vp, void *data, size_t datalen,
342 int mnt_flags, int flags)
343 {
344 int syscall_flags = MNT_AUTOMOUNTED | mnt_flags;
345 int error, km_flags = 0;
346 vfs_context_t ctx = (flags & VFS_MOUNT_FLAG_CURRENT_CONTEXT) ? vfs_context_current() : vfs_context_kernel();
347
348 /*
349 * This call is currently restricted to specific use cases.
350 */
351 if ((strcmp(fstype, "lifs") != 0) && (strcmp(fstype, "nfs") != 0)) {
352 return ENOTSUP;
353 }
354
355 #if !defined(XNU_TARGET_OS_OSX)
356 if (strcmp(fstype, "lifs") == 0) {
357 syscall_flags |= MNT_NOEXEC;
358 }
359 #endif
360
361 if (flags & VFS_MOUNT_FLAG_NOAUTH) {
362 km_flags |= KERNEL_MOUNT_NOAUTH;
363 }
364 if (flags & VFS_MOUNT_FLAG_PERMIT_UNMOUNT) {
365 km_flags |= KERNEL_MOUNT_PERMIT_UNMOUNT;
366 }
367
368 error = kernel_mount(fstype, pvp, vp, path, data, datalen,
369 syscall_flags, km_flags, ctx);
370 if (error) {
371 printf("%s: mount on %s failed, error %d\n", __func__, path,
372 error);
373 }
374
375 return error;
376 }
377
378 int
vfs_mount_override_type_name(mount_t mp,const char * name)379 vfs_mount_override_type_name(mount_t mp, const char *name)
380 {
381 if (mp == NULL || name == NULL) {
382 return EINVAL;
383 }
384
385 /* Override the FS type name. */
386 mount_lock_spin(mp);
387 strlcpy(mp->fstypename_override, name, sizeof(mp->fstypename_override));
388 mp->mnt_kern_flag |= MNTK_TYPENAME_OVERRIDE;
389 mount_unlock(mp);
390
391 return 0;
392 }
393
394 /*
395 * Mount a file system.
396 */
397 /* ARGSUSED */
398 int
mount(proc_t p,struct mount_args * uap,__unused int32_t * retval)399 mount(proc_t p, struct mount_args *uap, __unused int32_t *retval)
400 {
401 struct __mac_mount_args muap;
402
403 muap.type = uap->type;
404 muap.path = uap->path;
405 muap.flags = uap->flags;
406 muap.data = uap->data;
407 muap.mac_p = USER_ADDR_NULL;
408 return __mac_mount(p, &muap, retval);
409 }
410
/*
 * fmount: mount a filesystem on the directory referenced by an open
 * file descriptor (uap->fd) instead of a path.
 *
 * Returns:	0	Success
 *		!0	errno on failure (ENOTSUP/EPERM for disallowed
 *			flags, EBUSY/EINVAL when no parent can be found)
 */
int
fmount(__unused proc_t p, struct fmount_args *uap, __unused int32_t *retval)
{
	struct componentname cn;
	vfs_context_t ctx = vfs_context_current();
	size_t dummy = 0;
	int error;
	int flags = uap->flags;
	char fstypename[MFSNAMELEN];
	char *labelstr = NULL; /* regular mount call always sets it to NULL for __mac_mount() */
	vnode_t pvp;
	vnode_t vp;

	AUDIT_ARG(fd, uap->fd);
	AUDIT_ARG(fflags, flags);
	/* fstypename will get audited by mount_common */

	/* Sanity check the flags */
	if (flags & (MNT_IMGSRC_BY_INDEX | MNT_ROOTFS)) {
		/* imgsrc and rootfs mounts are not supported through an fd. */
		return ENOTSUP;
	}

	if (flags & MNT_UNION) {
		return EPERM;
	}

	/* Copy the filesystem type name in from user space. */
	error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy);
	if (error) {
		return error;
	}

	/* Resolve the fd to its vnode (takes a file reference). */
	if ((error = file_vnode(uap->fd, &vp)) != 0) {
		return error;
	}

	/* Take an iocount so the vnode cannot be recycled under us. */
	if ((error = vnode_getwithref(vp)) != 0) {
		file_drop(uap->fd);
		return error;
	}

	/*
	 * mount_common() needs the covered vnode's parent; if none is
	 * available (already covered, or a filesystem root), fail early.
	 */
	pvp = vnode_getparent(vp);
	if (pvp == NULL) {
		if (vp->v_mountedhere || (vp->v_flag & VROOT) != 0) {
			error = EBUSY;
		} else {
			error = EINVAL;
		}
		vnode_put(vp);
		file_drop(uap->fd);
		return error;
	}

	/* Build a componentname carrying the full path of the mount point. */
	memset(&cn, 0, sizeof(struct componentname));
	cn.cn_pnbuf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	cn.cn_pnlen = MAXPATHLEN;

	if ((error = vn_getpath(vp, cn.cn_pnbuf, &cn.cn_pnlen)) != 0) {
		zfree(ZV_NAMEI, cn.cn_pnbuf);
		vnode_put(pvp);
		vnode_put(vp);
		file_drop(uap->fd);
		return error;
	}

	error = mount_common(fstypename, pvp, vp, &cn, uap->data, flags, KERNEL_MOUNT_FMOUNT, labelstr, ctx);

	/* Release the path buffer, both iocounts, and the fd reference. */
	zfree(ZV_NAMEI, cn.cn_pnbuf);
	vnode_put(pvp);
	vnode_put(vp);
	file_drop(uap->fd);

	return error;
}
484
485 #define MAX_GRAFT_METADATA_SIZE 16384 /* bytes */
486
487 /*
488 * Get the size of a graft file (a manifest or payload file).
489 * The vp should be an iocounted vnode.
490 */
491 static int
get_and_verify_graft_metadata_vp_size(vnode_t graft_vp,vfs_context_t vctx,size_t * size)492 get_and_verify_graft_metadata_vp_size(vnode_t graft_vp, vfs_context_t vctx, size_t *size)
493 {
494 struct stat64 sb = {};
495 int error;
496
497 *size = 0;
498
499 error = vn_stat(graft_vp, &sb, NULL, 1, 0, vctx);
500 if (error) {
501 return error;
502 }
503
504 if (sb.st_size == 0) {
505 error = ENODATA;
506 } else if ((size_t) sb.st_size > MAX_GRAFT_METADATA_SIZE) {
507 error = EFBIG;
508 } else {
509 *size = (size_t) sb.st_size;
510 }
511
512 return error;
513 }
514
515 /*
516 * Read in a graft file (a manifest or payload file) of size `size` into `buf`.
517 * `size` must already be validated.
518 */
519 static int
read_graft_metadata_vp(vnode_t graft_vp,vfs_context_t vctx,size_t size,void * buf)520 read_graft_metadata_vp(vnode_t graft_vp, vfs_context_t vctx, size_t size, void *buf)
521 {
522 return vn_rdwr(UIO_READ, graft_vp,
523 (caddr_t) buf, (int) size, /* offset */ 0,
524 UIO_SYSSPACE, IO_NOCACHE | IO_RAOFF | IO_UNIT,
525 vfs_context_ucred(vctx), /* resid */ NULL,
526 vfs_context_proc(vctx));
527 }
528
529 /*
530 * Convert a single graft file descriptor into a vnode, get its size (saving it to `size`),
531 * and read it into `buf`.
532 */
533 static int
graft_secureboot_read_fd(int fd,vfs_context_t vctx,size_t * size,void * buf)534 graft_secureboot_read_fd(int fd, vfs_context_t vctx, size_t *size, void *buf)
535 {
536 vnode_t metadata_vp = NULLVP;
537 int error;
538
539 // Convert this graft fd to a vnode.
540 if ((error = vnode_getfromfd(vctx, fd, &metadata_vp)) != 0) {
541 goto out;
542 }
543
544 // Get (and validate) size information.
545 if ((error = get_and_verify_graft_metadata_vp_size(metadata_vp, vctx, size)) != 0) {
546 goto out;
547 }
548
549 // Read each file into the provided buffer - we must get the expected amount of bytes.
550 if ((error = read_graft_metadata_vp(metadata_vp, vctx, *size, buf)) != 0) {
551 goto out;
552 }
553
554 out:
555 if (metadata_vp) {
556 vnode_put(metadata_vp);
557 metadata_vp = NULLVP;
558 }
559
560 return error;
561 }
562
563 /*
564 * Read graft file descriptors into buffers of size MAX_GRAFT_METADATA_SIZE
565 * provided in `gfs`, saving the size of data read in `gfs`.
566 */
567 static int
graft_secureboot_read_metadata(secure_boot_cryptex_args_t * sbc_args,vfs_context_t vctx,fsioc_graft_fs_t * gfs)568 graft_secureboot_read_metadata(secure_boot_cryptex_args_t *sbc_args, vfs_context_t vctx,
569 fsioc_graft_fs_t *gfs)
570 {
571 int error;
572
573 // Read the authentic manifest.
574 if ((error = graft_secureboot_read_fd(sbc_args->sbc_authentic_manifest_fd, vctx,
575 &gfs->authentic_manifest_size, gfs->authentic_manifest))) {
576 return error;
577 }
578
579 // The user manifest is currently unused, but set its size.
580 gfs->user_manifest_size = 0;
581
582 // Read the payload.
583 if ((error = graft_secureboot_read_fd(sbc_args->sbc_payload_fd, vctx,
584 &gfs->payload_size, gfs->payload))) {
585 return error;
586 }
587
588 return 0;
589 }
590
591 /*
592 * Call into the filesystem to verify and graft a cryptex.
593 */
594 static int
graft_secureboot_cryptex(uint32_t graft_type,secure_boot_cryptex_args_t * sbc_args,vfs_context_t vctx,vnode_t cryptex_vp,vnode_t mounton_vp)595 graft_secureboot_cryptex(uint32_t graft_type, secure_boot_cryptex_args_t *sbc_args,
596 vfs_context_t vctx, vnode_t cryptex_vp, vnode_t mounton_vp)
597 {
598 fsioc_graft_fs_t gfs = {};
599 uint64_t graft_dir_ino = 0;
600 struct stat64 sb = {};
601 int error;
602
603 // Pre-flight arguments.
604 if (sbc_args->sbc_version != GRAFTDMG_SECURE_BOOT_CRYPTEX_ARGS_VERSION) {
605 // Make sure that this graft version matches what we support.
606 return ENOTSUP;
607 } else if (mounton_vp && cryptex_vp->v_mount != mounton_vp->v_mount) {
608 // For this type, cryptex VP must live on same volume as the target of graft.
609 return EXDEV;
610 } else if (mounton_vp && mounton_vp->v_type != VDIR) {
611 // We cannot graft upon non-directories.
612 return ENOTDIR;
613 } else if (sbc_args->sbc_authentic_manifest_fd < 0 ||
614 sbc_args->sbc_payload_fd < 0) {
615 // We cannot graft without a manifest and payload.
616 return EINVAL;
617 }
618
619 if (mounton_vp) {
620 // Get the mounton's inode number.
621 error = vn_stat(mounton_vp, &sb, NULL, 1, 0, vctx);
622 if (error) {
623 return error;
624 }
625 graft_dir_ino = (uint64_t) sb.st_ino;
626 }
627
628 // Create buffers (of our maximum-defined size) to store authentication info.
629 gfs.authentic_manifest = kalloc_data(MAX_GRAFT_METADATA_SIZE, Z_WAITOK | Z_ZERO);
630 gfs.payload = kalloc_data(MAX_GRAFT_METADATA_SIZE, Z_WAITOK | Z_ZERO);
631
632 if (!gfs.authentic_manifest || !gfs.payload) {
633 error = ENOMEM;
634 goto out;
635 }
636
637 // Read our fd's into our buffers.
638 // (Note that this will set the buffer size fields in `gfs`.)
639 error = graft_secureboot_read_metadata(sbc_args, vctx, &gfs);
640 if (error) {
641 goto out;
642 }
643
644 gfs.graft_version = FSIOC_GRAFT_VERSION;
645 gfs.graft_type = graft_type;
646 gfs.graft_4cc = sbc_args->sbc_4cc;
647 if (sbc_args->sbc_flags & SBC_PRESERVE_MOUNT) {
648 gfs.graft_flags |= FSCTL_GRAFT_PRESERVE_MOUNT;
649 }
650 if (sbc_args->sbc_flags & SBC_ALTERNATE_SHARED_REGION) {
651 gfs.graft_flags |= FSCTL_GRAFT_ALTERNATE_SHARED_REGION;
652 }
653 if (sbc_args->sbc_flags & SBC_SYSTEM_CONTENT) {
654 gfs.graft_flags |= FSCTL_GRAFT_SYSTEM_CONTENT;
655 }
656 if (sbc_args->sbc_flags & SBC_PANIC_ON_AUTHFAIL) {
657 gfs.graft_flags |= FSCTL_GRAFT_PANIC_ON_AUTHFAIL;
658 }
659 if (sbc_args->sbc_flags & SBC_STRICT_AUTH) {
660 gfs.graft_flags |= FSCTL_GRAFT_STRICT_AUTH;
661 }
662 if (sbc_args->sbc_flags & SBC_PRESERVE_GRAFT) {
663 gfs.graft_flags |= FSCTL_GRAFT_PRESERVE_GRAFT;
664 }
665 gfs.dir_ino = graft_dir_ino; // ino from mounton_vp (if not provided, the parent directory)
666
667 // Call into the FS to perform the graft (and validation).
668 error = VNOP_IOCTL(cryptex_vp, FSIOC_GRAFT_FS, (caddr_t)&gfs, 0, vctx);
669
670 out:
671 if (gfs.authentic_manifest) {
672 kfree_data(gfs.authentic_manifest, MAX_GRAFT_METADATA_SIZE);
673 gfs.authentic_manifest = NULL;
674 }
675 if (gfs.payload) {
676 kfree_data(gfs.payload, MAX_GRAFT_METADATA_SIZE);
677 gfs.payload = NULL;
678 }
679
680 return error;
681 }
682
683 #define GRAFTDMG_ENTITLEMENT "com.apple.private.vfs.graftdmg"
684
685 /*
686 * Graft a cryptex disk image (via FD) onto the appropriate mount-point
687 * { int graftdmg(int dmg_fd, const char *mountdir, uint32_t graft_type, graftdmg_args_un *gda); }
688 */
int
graftdmg(__unused proc_t p, struct graftdmg_args *uap, __unused int32_t *retval)
{
	int ua_dmgfd = uap->dmg_fd;
	user_addr_t ua_mountdir = uap->mountdir;
	uint32_t ua_grafttype = uap->graft_type;
	user_addr_t ua_graftargs = uap->gda;

	graftdmg_args_un kern_gda = {};
	int error = 0;
	secure_boot_cryptex_args_t *sbc_args = NULL;

	vnode_t cryptex_vp = NULLVP;
	vnode_t mounton_vp = NULLVP;
	struct nameidata nd = {};
	vfs_context_t ctx = vfs_context_current();

	/* Grafting requires the private graftdmg entitlement. */
	if (!IOTaskHasEntitlement(vfs_context_task(ctx), GRAFTDMG_ENTITLEMENT)) {
		return EPERM;
	}

	/* Copy in the graft arguments union from user space. */
	error = copyin(ua_graftargs, &kern_gda, sizeof(graftdmg_args_un));
	if (error) {
		return error;
	}

	// Copy mount dir in, if provided.
	if (ua_mountdir != USER_ADDR_NULL) {
		// Acquire vnode for mount-on path
		NDINIT(&nd, LOOKUP, OP_MOUNT, (FOLLOW | AUDITVNPATH1),
		    UIO_USERSPACE, ua_mountdir, ctx);

		error = namei(&nd);
		if (error) {
			/* namei() cleans up after itself on failure. */
			return error;
		}
		mounton_vp = nd.ni_vp;
	}

	// Convert fd to vnode.
	error = vnode_getfromfd(ctx, ua_dmgfd, &cryptex_vp);
	if (error) {
		goto graftout;
	}

	/* Only known secure-boot cryptex graft types are accepted. */
	if (ua_grafttype == 0 || ua_grafttype > GRAFTDMG_CRYPTEX_DOWNLEVEL) {
		error = EINVAL;
	} else {
		sbc_args = &kern_gda.sbc_args;
		error = graft_secureboot_cryptex(ua_grafttype, sbc_args, ctx, cryptex_vp, mounton_vp);
	}

graftout:
	/* Drop any iocounts; finish the lookup only if one was started. */
	if (cryptex_vp) {
		vnode_put(cryptex_vp);
		cryptex_vp = NULLVP;
	}
	if (mounton_vp) {
		vnode_put(mounton_vp);
		mounton_vp = NULLVP;
	}
	if (ua_mountdir != USER_ADDR_NULL) {
		nameidone(&nd);
	}

	return error;
}
756
757 /*
758 * Ungraft a cryptex disk image (via mount dir FD)
759 * { int ungraftdmg(const char *mountdir, uint64_t flags); }
760 */
761 int
ungraftdmg(__unused proc_t p,struct ungraftdmg_args * uap,__unused int32_t * retval)762 ungraftdmg(__unused proc_t p, struct ungraftdmg_args *uap, __unused int32_t *retval)
763 {
764 int error = 0;
765 user_addr_t ua_mountdir = uap->mountdir;
766 fsioc_ungraft_fs_t ugfs;
767 vnode_t mounton_vp = NULLVP;
768 struct nameidata nd = {};
769 vfs_context_t ctx = vfs_context_current();
770
771 if (!IOTaskHasEntitlement(vfs_context_task(ctx), GRAFTDMG_ENTITLEMENT)) {
772 return EPERM;
773 }
774
775 if (uap->flags != 0 || ua_mountdir == USER_ADDR_NULL) {
776 return EINVAL;
777 }
778
779 ugfs.ungraft_flags = 0;
780
781 // Acquire vnode for mount-on path
782 NDINIT(&nd, LOOKUP, OP_MOUNT, (FOLLOW | AUDITVNPATH1),
783 UIO_USERSPACE, ua_mountdir, ctx);
784
785 error = namei(&nd);
786 if (error) {
787 return error;
788 }
789 mounton_vp = nd.ni_vp;
790
791 // Call into the FS to perform the ungraft
792 error = VNOP_IOCTL(mounton_vp, FSIOC_UNGRAFT_FS, (caddr_t)&ugfs, 0, ctx);
793
794 vnode_put(mounton_vp);
795 nameidone(&nd);
796
797 return error;
798 }
799
800
/*
 * Notify interested parties that a mount has occurred: post a VQ_MOUNT
 * vfs event, then post a NOTE_WRITE knote on the covered vnode's parent
 * directory (pdvp).
 */
void
vfs_notify_mount(vnode_t pdvp)
{
	vfs_event_signal(NULL, VQ_MOUNT, (intptr_t)NULL);
	lock_vnode_and_post(pdvp, NOTE_WRITE);
}
807
808 /*
809 * __mac_mount:
810 * Mount a file system taking into account MAC label behavior.
811 * See mount(2) man page for more information
812 *
813 * Parameters: p Process requesting the mount
814 * uap User argument descriptor (see below)
815 * retval (ignored)
816 *
817 * Indirect: uap->type Filesystem type
818 * uap->path Path to mount
819 * uap->data Mount arguments
820 * uap->mac_p MAC info
821 * uap->flags Mount flags
822 *
823 *
824 * Returns: 0 Success
825 * !0 Not success
826 */
/*
 * Set to TRUE when something attempts to mount the root filesystem
 * read/write; consulted by the code-signing validation bitmap logic
 * (see CHECK_CS_VALIDATION_BITMAP below).
 */
boolean_t root_fs_upgrade_try = FALSE;

int
__mac_mount(struct proc *p, register struct __mac_mount_args *uap, __unused int32_t *retval)
{
	vnode_t pvp = NULL;             /* parent of the vnode to be covered */
	vnode_t vp = NULL;              /* vnode to be covered */
	int need_nameidone = 0;
	vfs_context_t ctx = vfs_context_current();
	char fstypename[MFSNAMELEN];
	struct nameidata nd;
	size_t dummy = 0;
	char *labelstr = NULL;          /* MAC label string copied from user space */
	size_t labelsz = 0;
	int flags = uap->flags;
	int error;
#if CONFIG_IMGSRC_ACCESS || CONFIG_MACF
	boolean_t is_64bit = IS_64BIT_PROCESS(p);
#else
#pragma unused(p)
#endif
	/*
	 * Get the fs type name from user space
	 */
	error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy);
	if (error) {
		return error;
	}

	/*
	 * Get the vnode to be covered (WANTPARENT also returns its parent)
	 */
	NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
	    UIO_USERSPACE, uap->path, ctx);
	if (flags & MNT_NOFOLLOW) {
		/* MNT_NOFOLLOW: fail rather than traverse any symlink. */
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}
	error = namei(&nd);
	if (error) {
		goto out;
	}
	need_nameidone = 1;
	vp = nd.ni_vp;
	pvp = nd.ni_dvp;

#ifdef CONFIG_IMGSRC_ACCESS
	/* Mounting image source cannot be batched with other operations */
	if (flags == MNT_IMGSRC_BY_INDEX) {
		/*
		 * NOTE: exact equality is intentional — MNT_IMGSRC_BY_INDEX
		 * may not be combined with any other mount flag.
		 */
		error = relocate_imageboot_source(pvp, vp, &nd.ni_cnd, fstypename,
		    ctx, is_64bit, uap->data, (flags == MNT_IMGSRC_BY_INDEX));
		goto out;
	}
#endif /* CONFIG_IMGSRC_ACCESS */

#if CONFIG_MACF
	/*
	 * Get the label string (if any) from user space
	 */
	if (uap->mac_p != USER_ADDR_NULL) {
		struct user_mac mac;
		size_t ulen = 0;

		/* The user_mac struct layout differs for 32/64-bit callers. */
		if (is_64bit) {
			struct user64_mac mac64;
			error = copyin(uap->mac_p, &mac64, sizeof(mac64));
			mac.m_buflen = (user_size_t)mac64.m_buflen;
			mac.m_string = (user_addr_t)mac64.m_string;
		} else {
			struct user32_mac mac32;
			error = copyin(uap->mac_p, &mac32, sizeof(mac32));
			mac.m_buflen = mac32.m_buflen;
			mac.m_string = mac32.m_string;
		}
		if (error) {
			goto out;
		}
		/* Label must be at least one char plus NUL, and bounded. */
		if ((mac.m_buflen > MAC_MAX_LABEL_BUF_LEN) ||
		    (mac.m_buflen < 2)) {
			error = EINVAL;
			goto out;
		}
		labelsz = mac.m_buflen;
		labelstr = kalloc_data(labelsz, Z_WAITOK);
		error = copyinstr(mac.m_string, labelstr, mac.m_buflen, &ulen);
		if (error) {
			goto out;
		}
		AUDIT_ARG(mac_string, labelstr);
	}
#endif /* CONFIG_MACF */

	AUDIT_ARG(fflags, flags);

#if !CONFIG_UNION_MOUNTS
	/* Union mounts are compiled out; refuse them outright. */
	if (flags & MNT_UNION) {
		error = EPERM;
		goto out;
	}
#endif

	/* Mounting over the root directory of the root filesystem? */
	if ((vp->v_flag & VROOT) &&
	    (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
#if CONFIG_UNION_MOUNTS
		if (!(flags & MNT_UNION)) {
			flags |= MNT_UPDATE;
		} else {
			/*
			 * For a union mount on '/', treat it as fresh
			 * mount instead of update.
			 * Otherwise, union mounting on '/' used to panic the
			 * system before, since mnt_vnodecovered was found to
			 * be NULL for '/' which is required for unionlookup
			 * after it gets ENOENT on union mount.
			 */
			flags = (flags & ~(MNT_UPDATE));
		}
#else
		flags |= MNT_UPDATE;
#endif /* CONFIG_UNION_MOUNTS */

#if SECURE_KERNEL
		if ((flags & MNT_RDONLY) == 0) {
			/* Release kernels are not allowed to mount "/" as rw */
			error = EPERM;
			goto out;
		}
#endif

		/*
		 * See 7392553 for more details on why this check exists.
		 * Suffice to say: If this check is ON and something tries
		 * to mount the rootFS RW, we'll turn off the codesign
		 * bitmap optimization.
		 */
#if CHECK_CS_VALIDATION_BITMAP
		if ((flags & MNT_RDONLY) == 0) {
			root_fs_upgrade_try = TRUE;
		}
#endif
	}

	error = mount_common(fstypename, pvp, vp, &nd.ni_cnd, uap->data, flags, 0,
	    labelstr, ctx);

out:

#if CONFIG_MACF
	/* kfree_data() is a no-op on a NULL pointer / zero size. */
	kfree_data(labelstr, labelsz);
#endif /* CONFIG_MACF */

	if (vp) {
		vnode_put(vp);
	}
	if (pvp) {
		vnode_put(pvp);
	}
	if (need_nameidone) {
		nameidone(&nd);
	}

	return error;
}
989
990 /*
991 * common mount implementation (final stage of mounting)
992 *
993 * Arguments:
994 * fstypename file system type (ie it's vfs name)
995 * pvp parent of covered vnode
996 * vp covered vnode
997 * cnp component name (ie path) of covered vnode
998 * flags generic mount flags
999 * fsmountargs file system specific data
1000 * labelstr optional MAC label
1001 * kernelmount TRUE for mounts initiated from inside the kernel
1002 * ctx caller's context
1003 */
1004 static int
mount_common(const char * fstypename,vnode_t pvp,vnode_t vp,struct componentname * cnp,user_addr_t fsmountargs,int flags,uint32_t internal_flags,char * labelstr,vfs_context_t ctx)1005 mount_common(const char *fstypename, vnode_t pvp, vnode_t vp,
1006 struct componentname *cnp, user_addr_t fsmountargs, int flags, uint32_t internal_flags,
1007 char *labelstr, vfs_context_t ctx)
1008 {
1009 #if !CONFIG_MACF
1010 #pragma unused(labelstr)
1011 #endif
1012 struct vnode *devvp = NULLVP;
1013 struct vnode *device_vnode = NULLVP;
1014 #if CONFIG_MACF
1015 struct vnode *rvp;
1016 #endif
1017 struct mount *mp = NULL;
1018 struct vfstable *vfsp = (struct vfstable *)0;
1019 struct proc *p = vfs_context_proc(ctx);
1020 int error, flag = 0;
1021 bool flag_set = false;
1022 user_addr_t devpath = USER_ADDR_NULL;
1023 int ronly = 0;
1024 int mntalloc = 0;
1025 boolean_t vfsp_ref = FALSE;
1026 boolean_t is_rwlock_locked = FALSE;
1027 boolean_t did_rele = FALSE;
1028 boolean_t have_usecount = FALSE;
1029 boolean_t did_set_lmount = FALSE;
1030 boolean_t kernelmount = !!(internal_flags & KERNEL_MOUNT_KMOUNT);
1031
1032 #if CONFIG_ROSV_STARTUP || CONFIG_MOUNT_VM || CONFIG_BASESYSTEMROOT
1033 /* Check for mutually-exclusive flag bits */
1034 uint32_t checkflags = (internal_flags & (KERNEL_MOUNT_VOLBYROLE_MASK | KERNEL_MOUNT_BASESYSTEMROOT));
1035 int bitcount = 0;
1036 while (checkflags != 0) {
1037 checkflags &= (checkflags - 1);
1038 bitcount++;
1039 }
1040
1041 if (bitcount > 1) {
1042 //not allowed to request multiple mount-by-role flags
1043 error = EINVAL;
1044 goto out1;
1045 }
1046 #endif
1047
1048 /*
1049 * Process an update for an existing mount
1050 */
1051 if (flags & MNT_UPDATE) {
1052 if ((vp->v_flag & VROOT) == 0) {
1053 error = EINVAL;
1054 goto out1;
1055 }
1056 mp = vp->v_mount;
1057
1058 /* if unmount or mount in progress, return error */
1059 mount_lock_spin(mp);
1060 if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
1061 mount_unlock(mp);
1062 error = EBUSY;
1063 goto out1;
1064 }
1065 mp->mnt_lflag |= MNT_LMOUNT;
1066 did_set_lmount = TRUE;
1067 mount_unlock(mp);
1068 lck_rw_lock_exclusive(&mp->mnt_rwlock);
1069 is_rwlock_locked = TRUE;
1070 /*
1071 * We only allow the filesystem to be reloaded if it
1072 * is currently mounted read-only.
1073 */
1074 if ((flags & MNT_RELOAD) &&
1075 ((mp->mnt_flag & MNT_RDONLY) == 0)) {
1076 error = ENOTSUP;
1077 goto out1;
1078 }
1079
1080 /*
1081 * If content protection is enabled, update mounts are not
1082 * allowed to turn it off.
1083 */
1084 if ((mp->mnt_flag & MNT_CPROTECT) &&
1085 ((flags & MNT_CPROTECT) == 0)) {
1086 error = EINVAL;
1087 goto out1;
1088 }
1089
1090 /*
1091 * can't turn off MNT_REMOVABLE either but it may be an unexpected
1092 * failure to return an error for this so we'll just silently
1093 * add it if it is not passed in.
1094 */
1095 if ((mp->mnt_flag & MNT_REMOVABLE) &&
1096 ((flags & MNT_REMOVABLE) == 0)) {
1097 flags |= MNT_REMOVABLE;
1098 }
1099
1100 /* Can't downgrade the backer of the root FS */
1101 if ((mp->mnt_kern_flag & MNTK_BACKS_ROOT) &&
1102 (!vfs_isrdonly(mp)) && (flags & MNT_RDONLY)) {
1103 error = ENOTSUP;
1104 goto out1;
1105 }
1106
1107 /*
1108 * Only root, or the user that did the original mount is
1109 * permitted to update it.
1110 */
1111 if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
1112 (error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
1113 goto out1;
1114 }
1115 #if CONFIG_MACF
1116 error = mac_mount_check_remount(ctx, mp);
1117 if (error != 0) {
1118 goto out1;
1119 }
1120 #endif
1121 /*
1122 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV,
1123 * and MNT_NOEXEC if mount point is already MNT_NOEXEC.
1124 */
1125 if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
1126 flags |= MNT_NOSUID | MNT_NODEV;
1127 if (mp->mnt_flag & MNT_NOEXEC) {
1128 flags |= MNT_NOEXEC;
1129 }
1130 }
1131 flag = mp->mnt_flag;
1132 flag_set = true;
1133
1134
1135
1136 mp->mnt_flag |= flags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE);
1137
1138 vfsp = mp->mnt_vtable;
1139 goto update;
1140 } // MNT_UPDATE
1141
1142 /*
1143 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV, and
1144 * MNT_NOEXEC if mount point is already MNT_NOEXEC.
1145 */
1146 if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
1147 flags |= MNT_NOSUID | MNT_NODEV;
1148 if (vp->v_mount->mnt_flag & MNT_NOEXEC) {
1149 flags |= MNT_NOEXEC;
1150 }
1151 }
1152
1153 /* XXXAUDIT: Should we capture the type on the error path as well? */
1154 /* XXX cast-away const (audit_arg_text() does not modify its input) */
1155 AUDIT_ARG(text, (char *)(uintptr_t)fstypename);
1156 mount_list_lock();
1157 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
1158 if (!strncmp(vfsp->vfc_name, fstypename, MFSNAMELEN)) {
1159 vfsp->vfc_refcount++;
1160 vfsp_ref = TRUE;
1161 break;
1162 }
1163 }
1164 mount_list_unlock();
1165 if (vfsp == NULL) {
1166 error = ENODEV;
1167 goto out1;
1168 }
1169
1170 /*
1171 * VFC_VFSLOCALARGS is not currently supported for kernel mounts,
1172 * except in ROSV configs and for the initial BaseSystem root.
1173 */
1174 if (kernelmount && (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) &&
1175 ((internal_flags & KERNEL_MOUNT_VOLBYROLE_MASK) == 0) &&
1176 ((internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) == 0)) {
1177 error = EINVAL; /* unsupported request */
1178 goto out1;
1179 }
1180
1181 error = prepare_coveredvp(vp, ctx, cnp, fstypename, internal_flags);
1182 if (error != 0) {
1183 goto out1;
1184 }
1185
1186 /*
1187 * Allocate and initialize the filesystem (mount_t)
1188 */
1189 mp = zalloc_flags(mount_zone, Z_WAITOK | Z_ZERO);
1190 mntalloc = 1;
1191
1192 /* Initialize the default IO constraints */
1193 mp->mnt_maxreadcnt = mp->mnt_maxwritecnt = MAXPHYS;
1194 mp->mnt_segreadcnt = mp->mnt_segwritecnt = 32;
1195 mp->mnt_maxsegreadsize = mp->mnt_maxreadcnt;
1196 mp->mnt_maxsegwritesize = mp->mnt_maxwritecnt;
1197 mp->mnt_devblocksize = DEV_BSIZE;
1198 mp->mnt_alignmentmask = PAGE_MASK;
1199 mp->mnt_ioqueue_depth = MNT_DEFAULT_IOQUEUE_DEPTH;
1200 mp->mnt_ioscale = 1;
1201 mp->mnt_ioflags = 0;
1202 mp->mnt_realrootvp = NULLVP;
1203 mp->mnt_authcache_ttl = CACHED_LOOKUP_RIGHT_TTL;
1204
1205 mp->mnt_lflag |= MNT_LMOUNT;
1206 did_set_lmount = TRUE;
1207
1208 TAILQ_INIT(&mp->mnt_vnodelist);
1209 TAILQ_INIT(&mp->mnt_workerqueue);
1210 TAILQ_INIT(&mp->mnt_newvnodes);
1211 mount_lock_init(mp);
1212 lck_rw_lock_exclusive(&mp->mnt_rwlock);
1213 is_rwlock_locked = TRUE;
1214 mp->mnt_op = vfsp->vfc_vfsops;
1215 mp->mnt_vtable = vfsp;
1216 //mp->mnt_stat.f_type = vfsp->vfc_typenum;
1217 mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
1218 strlcpy(mp->mnt_vfsstat.f_fstypename, vfsp->vfc_name, MFSTYPENAMELEN);
1219 do {
1220 size_t pathlen = MAXPATHLEN;
1221
1222 if (vn_getpath_ext(vp, pvp, mp->mnt_vfsstat.f_mntonname, &pathlen, VN_GETPATH_FSENTER)) {
1223 strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);
1224 }
1225 } while (0);
1226 mp->mnt_vnodecovered = vp;
1227 mp->mnt_vfsstat.f_owner = kauth_cred_getuid(vfs_context_ucred(ctx));
1228 mp->mnt_throttle_mask = LOWPRI_MAX_NUM_DEV - 1;
1229 mp->mnt_devbsdunit = 0;
1230 mp->mnt_mount_id = os_atomic_inc_orig(&mount_unique_id, relaxed);
1231
1232 /* XXX 3762912 hack to support HFS filesystem 'owner' - filesystem may update later */
1233 vfs_setowner(mp, KAUTH_UID_NONE, KAUTH_GID_NONE);
1234
1235 if (kernelmount) {
1236 mp->mnt_kern_flag |= MNTK_KERNEL_MOUNT;
1237 }
1238 if ((internal_flags & KERNEL_MOUNT_PERMIT_UNMOUNT) != 0) {
1239 mp->mnt_kern_flag |= MNTK_PERMIT_UNMOUNT;
1240 }
1241
1242 if (KERNEL_MOUNT_DEVFS & internal_flags) {
1243 // kernel mounted devfs
1244 mp->mnt_kern_flag |= MNTK_SYSTEM;
1245 }
1246
1247 update:
1248
1249 /*
1250 * Set the mount level flags.
1251 */
1252 if (flags & MNT_RDONLY) {
1253 mp->mnt_flag |= MNT_RDONLY;
1254 } else if (mp->mnt_flag & MNT_RDONLY) {
1255 // disallow read/write upgrades of file systems that
1256 // had the TYPENAME_OVERRIDE feature set.
1257 if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
1258 error = EPERM;
1259 goto out1;
1260 }
1261 mp->mnt_kern_flag |= MNTK_WANTRDWR;
1262 }
1263 mp->mnt_flag &= ~(MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
1264 MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
1265 MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
1266 MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME | MNT_STRICTATIME |
1267 MNT_QUARANTINE | MNT_CPROTECT);
1268
1269 #if SECURE_KERNEL
1270 #if !CONFIG_MNT_SUID
1271 /*
1272 * On release builds of iOS based platforms, always enforce NOSUID on
1273 * all mounts. We do this here because we can catch update mounts as well as
1274 * non-update mounts in this case.
1275 */
1276 mp->mnt_flag |= (MNT_NOSUID);
1277 #endif
1278 #endif
1279
1280 mp->mnt_flag |= flags & (MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
1281 MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
1282 MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
1283 MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME | MNT_STRICTATIME |
1284 MNT_QUARANTINE | MNT_CPROTECT);
1285
1286 #if CONFIG_MACF
1287 if (flags & MNT_MULTILABEL) {
1288 if (vfsp->vfc_vfsflags & VFC_VFSNOMACLABEL) {
1289 error = EINVAL;
1290 goto out1;
1291 }
1292 mp->mnt_flag |= MNT_MULTILABEL;
1293 }
1294 #endif
1295 /*
1296 * Process device path for local file systems if requested.
1297 *
1298 * Snapshot and mount-by-role mounts do not use this path; they are
1299 * passing other opaque data in the device path field.
1300 *
1301 * Basesystemroot mounts pass a device path to be resolved here,
1302 * but it's just a char * already inside the kernel, which
1303 * kernel_mount() shoved into a user_addr_t to call us. So for such
1304 * mounts we must skip copyin (both of the address and of the string
1305 * (in NDINIT).
1306 */
1307 if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS &&
1308 !(internal_flags & (KERNEL_MOUNT_SNAPSHOT | KERNEL_MOUNT_VOLBYROLE_MASK))) {
1309 boolean_t do_copyin_devpath = true;
1310 #if CONFIG_BASESYSTEMROOT
1311 if (internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) {
1312 // KERNEL_MOUNT_BASESYSTEMROOT implies subtle behavior worh nothing:
1313 // We have been passed fsmountargs, which is typed as a user_addr_t,
1314 // but is actually a char ** pointing to a (kernelspace) string.
1315 // We manually unpack it with a series of casts and dereferences
1316 // that reverses what was done just above us on the stack in
1317 // imageboot_pivot_image().
1318 // After retrieving the path to the dev node (which we will NDINIT
1319 // in a moment), we pass NULL fsmountargs on to the filesystem.
1320 _Static_assert(sizeof(char **) == sizeof(fsmountargs), "fsmountargs should fit a (kernel) address");
1321 char **devnamepp = (char **)fsmountargs;
1322 char *devnamep = *devnamepp;
1323 devpath = CAST_USER_ADDR_T(devnamep);
1324 do_copyin_devpath = false;
1325 fsmountargs = USER_ADDR_NULL;
1326
1327 //Now that we have a mp, denote that this mount is for the basesystem.
1328 mp->mnt_supl_kern_flag |= MNTK_SUPL_BASESYSTEM;
1329 }
1330 #endif // CONFIG_BASESYSTEMROOT
1331
1332 if (do_copyin_devpath) {
1333 if (vfs_context_is64bit(ctx)) {
1334 if ((error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath)))) {
1335 goto out1;
1336 }
1337 fsmountargs += sizeof(devpath);
1338 } else {
1339 user32_addr_t tmp;
1340 if ((error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp)))) {
1341 goto out1;
1342 }
1343 /* munge into LP64 addr */
1344 devpath = CAST_USER_ADDR_T(tmp);
1345 fsmountargs += sizeof(tmp);
1346 }
1347 }
1348
1349 /* Lookup device and authorize access to it */
1350 if ((devpath)) {
1351 struct nameidata nd;
1352
1353 enum uio_seg seg = UIO_USERSPACE;
1354 #if CONFIG_BASESYSTEMROOT
1355 if (internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) {
1356 seg = UIO_SYSSPACE;
1357 }
1358 #endif // CONFIG_BASESYSTEMROOT
1359
1360 NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW, seg, devpath, ctx);
1361 if ((error = namei(&nd))) {
1362 goto out1;
1363 }
1364
1365 strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);
1366 devvp = nd.ni_vp;
1367
1368 nameidone(&nd);
1369
1370 if (devvp->v_type != VBLK) {
1371 error = ENOTBLK;
1372 goto out2;
1373 }
1374 if (major(devvp->v_rdev) >= nblkdev) {
1375 error = ENXIO;
1376 goto out2;
1377 }
1378 /*
1379 * If mount by non-root, then verify that user has necessary
1380 * permissions on the device.
1381 */
1382 if (suser(vfs_context_ucred(ctx), NULL) != 0) {
1383 mode_t accessmode = KAUTH_VNODE_READ_DATA;
1384
1385 if ((mp->mnt_flag & MNT_RDONLY) == 0) {
1386 accessmode |= KAUTH_VNODE_WRITE_DATA;
1387 }
1388 if ((error = vnode_authorize(devvp, NULL, accessmode, ctx)) != 0) {
1389 goto out2;
1390 }
1391 }
1392 }
1393 /* On first mount, preflight and open device */
1394 if (devpath && ((flags & MNT_UPDATE) == 0)) {
1395 if ((error = vnode_ref(devvp))) {
1396 goto out2;
1397 }
1398 /*
1399 * Disallow multiple mounts of the same device.
1400 * Disallow mounting of a device that is currently in use
1401 * (except for root, which might share swap device for miniroot).
1402 * Flush out any old buffers remaining from a previous use.
1403 */
1404 if ((error = vfs_mountedon(devvp))) {
1405 goto out3;
1406 }
1407
1408 if (vcount(devvp) > 1 && !(vfs_flags(mp) & MNT_ROOTFS)) {
1409 error = EBUSY;
1410 goto out3;
1411 }
1412 if ((error = VNOP_FSYNC(devvp, MNT_WAIT, ctx))) {
1413 error = ENOTBLK;
1414 goto out3;
1415 }
1416 if ((error = buf_invalidateblks(devvp, BUF_WRITE_DATA, 0, 0))) {
1417 goto out3;
1418 }
1419
1420 ronly = (mp->mnt_flag & MNT_RDONLY) != 0;
1421 #if CONFIG_MACF
1422 error = mac_vnode_check_open(ctx,
1423 devvp,
1424 ronly ? FREAD : FREAD | FWRITE);
1425 if (error) {
1426 goto out3;
1427 }
1428 #endif /* MAC */
1429 if ((error = VNOP_OPEN(devvp, ronly ? FREAD : FREAD | FWRITE, ctx))) {
1430 goto out3;
1431 }
1432
1433 mp->mnt_devvp = devvp;
1434 device_vnode = devvp;
1435 } else if ((mp->mnt_flag & MNT_RDONLY) &&
1436 (mp->mnt_kern_flag & MNTK_WANTRDWR) &&
1437 (device_vnode = mp->mnt_devvp)) {
1438 dev_t dev;
1439 int maj;
1440 /*
1441 * If upgrade to read-write by non-root, then verify
1442 * that user has necessary permissions on the device.
1443 */
1444 vnode_getalways(device_vnode);
1445
1446 if (suser(vfs_context_ucred(ctx), NULL) &&
1447 (error = vnode_authorize(device_vnode, NULL,
1448 KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA,
1449 ctx)) != 0) {
1450 vnode_put(device_vnode);
1451 goto out2;
1452 }
1453
1454 /* Tell the device that we're upgrading */
1455 dev = (dev_t)device_vnode->v_rdev;
1456 maj = major(dev);
1457
1458 if ((u_int)maj >= (u_int)nblkdev) {
1459 panic("Volume mounted on a device with invalid major number.");
1460 }
1461
1462 error = bdevsw[maj].d_open(dev, FREAD | FWRITE, S_IFBLK, p);
1463 vnode_put(device_vnode);
1464 device_vnode = NULLVP;
1465 if (error != 0) {
1466 goto out2;
1467 }
1468 }
1469 } // localargs && !(snapshot | data | vm)
1470
1471 #if CONFIG_MACF
1472 if ((flags & MNT_UPDATE) == 0) {
1473 mac_mount_label_init(mp);
1474 mac_mount_label_associate(ctx, mp);
1475 }
1476 if (labelstr) {
1477 if ((flags & MNT_UPDATE) != 0) {
1478 error = mac_mount_check_label_update(ctx, mp);
1479 if (error != 0) {
1480 goto out3;
1481 }
1482 }
1483 }
1484 #endif
1485 /*
1486 * Mount the filesystem. We already asserted that internal_flags
1487 * cannot have more than one mount-by-role bit set.
1488 */
1489 if (internal_flags & KERNEL_MOUNT_SNAPSHOT) {
1490 error = VFS_IOCTL(mp, VFSIOC_MOUNT_SNAPSHOT,
1491 (caddr_t)fsmountargs, 0, ctx);
1492 } else if (internal_flags & KERNEL_MOUNT_DATAVOL) {
1493 #if CONFIG_ROSV_STARTUP
1494 struct mount *origin_mp = (struct mount*)fsmountargs;
1495 fs_role_mount_args_t frma = {origin_mp, VFS_DATA_ROLE};
1496 error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
1497 if (error) {
1498 printf("MOUNT-BY-ROLE (%d) failed! (%d)", VFS_DATA_ROLE, error);
1499 } else {
1500 /* Mark volume associated with system volume */
1501 mp->mnt_kern_flag |= MNTK_SYSTEM;
1502
1503 /* Attempt to acquire the mnt_devvp and set it up */
1504 struct vnode *mp_devvp = NULL;
1505 if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
1506 errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
1507 0, &mp_devvp, vfs_context_kernel());
1508 if (!lerr) {
1509 mp->mnt_devvp = mp_devvp;
1510 //vnode_lookup took an iocount, need to drop it.
1511 vnode_put(mp_devvp);
1512 // now set `device_vnode` to the devvp that was acquired.
1513 // this is needed in order to ensure vfs_init_io_attributes is invoked.
1514 // note that though the iocount above was dropped, the mount acquires
1515 // an implicit reference against the device.
1516 device_vnode = mp_devvp;
1517 }
1518 }
1519 }
1520 #else
1521 error = EINVAL;
1522 #endif
1523 } else if (internal_flags & KERNEL_MOUNT_VMVOL) {
1524 #if CONFIG_MOUNT_VM
1525 struct mount *origin_mp = (struct mount*)fsmountargs;
1526 fs_role_mount_args_t frma = {origin_mp, VFS_VM_ROLE};
1527 error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
1528 if (error) {
1529 printf("MOUNT-BY-ROLE (%d) failed! (%d)", VFS_VM_ROLE, error);
1530 } else {
1531 /* Mark volume associated with system volume and a swap mount */
1532 mp->mnt_kern_flag |= (MNTK_SYSTEM | MNTK_SWAP_MOUNT);
1533 /* Attempt to acquire the mnt_devvp and set it up */
1534 struct vnode *mp_devvp = NULL;
1535 if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
1536 errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
1537 0, &mp_devvp, vfs_context_kernel());
1538 if (!lerr) {
1539 mp->mnt_devvp = mp_devvp;
1540 //vnode_lookup took an iocount, need to drop it.
1541 vnode_put(mp_devvp);
1542
1543 // now set `device_vnode` to the devvp that was acquired.
1544 // note that though the iocount above was dropped, the mount acquires
1545 // an implicit reference against the device.
1546 device_vnode = mp_devvp;
1547 }
1548 }
1549 }
1550 #else
1551 error = EINVAL;
1552 #endif
1553 } else if ((internal_flags & KERNEL_MOUNT_PREBOOTVOL) || (internal_flags & KERNEL_MOUNT_RECOVERYVOL)) {
1554 #if CONFIG_MOUNT_PREBOOTRECOVERY
1555 struct mount *origin_mp = (struct mount*)fsmountargs;
1556 uint32_t mount_role = 0;
1557 if (internal_flags & KERNEL_MOUNT_PREBOOTVOL) {
1558 mount_role = VFS_PREBOOT_ROLE;
1559 } else if (internal_flags & KERNEL_MOUNT_RECOVERYVOL) {
1560 mount_role = VFS_RECOVERY_ROLE;
1561 }
1562
1563 if (mount_role != 0) {
1564 fs_role_mount_args_t frma = {origin_mp, mount_role};
1565 error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
1566 if (error) {
1567 printf("MOUNT-BY-ROLE (%d) failed! (%d)", mount_role, error);
1568 } else {
1569 // NOT YET - need to qualify how this interacts with shutdown, ERP/ERB, etc
1570 /* Mark volume associated with system volume */
1571 //mp->mnt_kern_flag |= MNTK_SYSTEM;
1572 /* Attempt to acquire the mnt_devvp and set it up */
1573 struct vnode *mp_devvp = NULL;
1574 if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
1575 errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
1576 0, &mp_devvp, vfs_context_kernel());
1577 if (!lerr) {
1578 mp->mnt_devvp = mp_devvp;
1579 //vnode_lookup took an iocount, need to drop it.
1580 vnode_put(mp_devvp);
1581
1582 // now set `device_vnode` to the devvp that was acquired.
1583 // note that though the iocount above was dropped, the mount acquires
1584 // an implicit reference against the device.
1585 device_vnode = mp_devvp;
1586 }
1587 }
1588 }
1589 } else {
1590 printf("MOUNT-BY-ROLE (%d) failed - ROLE UNRECOGNIZED! (%d)", mount_role, error);
1591 error = EINVAL;
1592 }
1593 #else
1594 error = EINVAL;
1595 #endif
1596 } else {
1597 error = VFS_MOUNT(mp, device_vnode, fsmountargs, ctx);
1598 }
1599
1600 if (flags & MNT_UPDATE) {
1601 if (mp->mnt_kern_flag & MNTK_WANTRDWR) {
1602 mp->mnt_flag &= ~MNT_RDONLY;
1603 }
1604 mp->mnt_flag &= ~
1605 (MNT_UPDATE | MNT_RELOAD | MNT_FORCE);
1606 mp->mnt_kern_flag &= ~MNTK_WANTRDWR;
1607 if (error) {
1608 mp->mnt_flag = flag; /* restore flag value */
1609 }
1610 vfs_event_signal(NULL, VQ_UPDATE, (intptr_t)NULL);
1611 lck_rw_done(&mp->mnt_rwlock);
1612 is_rwlock_locked = FALSE;
1613 if (!error) {
1614 enablequotas(mp, ctx);
1615 }
1616 goto exit;
1617 }
1618
1619 /*
1620 * Put the new filesystem on the mount list after root.
1621 */
1622 if (error == 0) {
1623 struct vfs_attr vfsattr;
1624 if (device_vnode) {
1625 /*
1626 * cache the IO attributes for the underlying physical media...
1627 * an error return indicates the underlying driver doesn't
1628 * support all the queries necessary... however, reasonable
1629 * defaults will have been set, so no reason to bail or care
1630 *
1631 * Need to do this before calling the MAC hook as it needs
1632 * information from this call.
1633 */
1634 vfs_init_io_attributes(device_vnode, mp);
1635 }
1636
1637 #if CONFIG_MACF
1638 error = mac_mount_check_mount_late(ctx, mp);
1639 if (error != 0) {
1640 goto out4;
1641 }
1642
1643 if (vfs_flags(mp) & MNT_MULTILABEL) {
1644 error = VFS_ROOT(mp, &rvp, ctx);
1645 if (error) {
1646 printf("%s() VFS_ROOT returned %d\n", __func__, error);
1647 goto out4;
1648 }
1649 error = vnode_label(mp, NULL, rvp, NULL, 0, ctx);
1650 /*
1651 * drop reference provided by VFS_ROOT
1652 */
1653 vnode_put(rvp);
1654
1655 if (error) {
1656 goto out4;
1657 }
1658 }
1659 #endif /* MAC */
1660
1661 vnode_lock_spin(vp);
1662 CLR(vp->v_flag, VMOUNT);
1663 vp->v_mountedhere = mp;
1664 vnode_unlock(vp);
1665
1666 /*
1667 * taking the name_cache_lock exclusively will
1668 * insure that everyone is out of the fast path who
1669 * might be trying to use a now stale copy of
1670 * vp->v_mountedhere->mnt_realrootvp
1671 * bumping mount_generation causes the cached values
1672 * to be invalidated
1673 */
1674 name_cache_lock();
1675 mount_generation++;
1676 name_cache_unlock();
1677
1678 error = vnode_ref(vp);
1679 if (error != 0) {
1680 goto out4;
1681 }
1682
1683 have_usecount = TRUE;
1684
1685 error = checkdirs(vp, ctx);
1686 if (error != 0) {
1687 /* Unmount the filesystem as cdir/rdirs cannot be updated */
1688 goto out4;
1689 }
1690 /*
1691 * there is no cleanup code here so I have made it void
1692 * we need to revisit this
1693 */
1694 (void)VFS_START(mp, 0, ctx);
1695
1696 if (mount_list_add(mp) != 0) {
1697 /*
1698 * The system is shutting down trying to umount
1699 * everything, so fail with a plausible errno.
1700 */
1701 error = EBUSY;
1702 goto out4;
1703 }
1704 lck_rw_done(&mp->mnt_rwlock);
1705 is_rwlock_locked = FALSE;
1706
1707 /* Check if this mounted file system supports EAs or named streams. */
1708 /* Skip WebDAV file systems for now since they hang in VFS_GETATTR here. */
1709 VFSATTR_INIT(&vfsattr);
1710 VFSATTR_WANTED(&vfsattr, f_capabilities);
1711 if (strncmp(mp->mnt_vfsstat.f_fstypename, "webdav", sizeof("webdav")) != 0 &&
1712 vfs_getattr(mp, &vfsattr, ctx) == 0 &&
1713 VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
1714 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR) &&
1715 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR)) {
1716 mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
1717 }
1718 #if NAMEDSTREAMS
1719 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS) &&
1720 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS)) {
1721 mp->mnt_kern_flag |= MNTK_NAMED_STREAMS;
1722 }
1723 #endif
1724 /* Check if this file system supports path from id lookups. */
1725 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID) &&
1726 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID)) {
1727 mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
1728 } else if (mp->mnt_flag & MNT_DOVOLFS) {
1729 /* Legacy MNT_DOVOLFS flag also implies path from id lookups. */
1730 mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
1731 }
1732
1733 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS) &&
1734 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS)) {
1735 mp->mnt_kern_flag |= MNTK_DIR_HARDLINKS;
1736 }
1737 }
1738 if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSNATIVEXATTR) {
1739 mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
1740 }
1741 if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSPREFLIGHT) {
1742 mp->mnt_kern_flag |= MNTK_UNMOUNT_PREFLIGHT;
1743 }
1744 /* increment the operations count */
1745 OSAddAtomic(1, &vfs_nummntops);
1746 enablequotas(mp, ctx);
1747
1748 if (device_vnode) {
1749 device_vnode->v_specflags |= SI_MOUNTEDON;
1750 }
1751
1752 /* Now that mount is setup, notify the listeners */
1753 vfs_notify_mount(pvp);
1754 IOBSDMountChange(mp, kIOMountChangeMount);
1755 } else {
1756 /* If we fail a fresh mount, there should be no vnodes left hooked into the mountpoint. */
1757 if (mp->mnt_vnodelist.tqh_first != NULL) {
1758 panic("mount_common(): mount of %s filesystem failed with %d, but vnode list is not empty.",
1759 mp->mnt_vtable->vfc_name, error);
1760 }
1761
1762 vnode_lock_spin(vp);
1763 CLR(vp->v_flag, VMOUNT);
1764 vnode_unlock(vp);
1765 mount_list_lock();
1766 mp->mnt_vtable->vfc_refcount--;
1767 mount_list_unlock();
1768
1769 if (device_vnode) {
1770 vnode_rele(device_vnode);
1771 VNOP_CLOSE(device_vnode, ronly ? FREAD : FREAD | FWRITE, ctx);
1772 }
1773 lck_rw_done(&mp->mnt_rwlock);
1774 is_rwlock_locked = FALSE;
1775
1776 if (nc_smr_enabled) {
1777 vfs_smr_synchronize();
1778 }
1779
1780 /*
1781 * if we get here, we have a mount structure that needs to be freed,
1782 * but since the coveredvp hasn't yet been updated to point at it,
1783 * no need to worry about other threads holding a crossref on this mp
1784 * so it's ok to just free it
1785 */
1786 mount_lock_destroy(mp);
1787 #if CONFIG_MACF
1788 mac_mount_label_destroy(mp);
1789 #endif
1790 zfree(mount_zone, mp);
1791 did_set_lmount = false;
1792 }
1793 exit:
1794 /*
1795 * drop I/O count on the device vp if there was one
1796 */
1797 if (devpath && devvp) {
1798 vnode_put(devvp);
1799 }
1800
1801 if (did_set_lmount) {
1802 mount_lock_spin(mp);
1803 mp->mnt_lflag &= ~MNT_LMOUNT;
1804 mount_unlock(mp);
1805 }
1806
1807 return error;
1808
1809 /* Error condition exits */
1810 out4:
1811 (void)VFS_UNMOUNT(mp, MNT_FORCE, ctx);
1812
1813 /*
1814 * If the mount has been placed on the covered vp,
1815 * it may have been discovered by now, so we have
1816 * to treat this just like an unmount
1817 */
1818 mount_lock_spin(mp);
1819 mp->mnt_lflag |= MNT_LDEAD;
1820 mount_unlock(mp);
1821
1822 if (device_vnode != NULLVP) {
1823 vnode_rele(device_vnode);
1824 VNOP_CLOSE(device_vnode, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD | FWRITE,
1825 ctx);
1826 did_rele = TRUE;
1827 }
1828
1829 vnode_lock_spin(vp);
1830
1831 mp->mnt_crossref++;
1832 vp->v_mountedhere = (mount_t) 0;
1833
1834 vnode_unlock(vp);
1835
1836 if (have_usecount) {
1837 vnode_rele(vp);
1838 }
1839 out3:
1840 if (devpath && ((flags & MNT_UPDATE) == 0) && (!did_rele)) {
1841 vnode_rele(devvp);
1842 }
1843 out2:
1844 if (devpath && devvp) {
1845 vnode_put(devvp);
1846 }
1847 out1:
1848 /* Release mnt_rwlock only when it was taken */
1849 if (is_rwlock_locked == TRUE) {
1850 if (flag_set) {
1851 mp->mnt_flag = flag; /* restore mnt_flag value */
1852 }
1853 lck_rw_done(&mp->mnt_rwlock);
1854 }
1855
1856 if (did_set_lmount) {
1857 mount_lock_spin(mp);
1858 mp->mnt_lflag &= ~MNT_LMOUNT;
1859 mount_unlock(mp);
1860 }
1861
1862 if (mntalloc) {
1863 if (mp->mnt_crossref) {
1864 mount_dropcrossref(mp, vp, 0);
1865 } else {
1866 if (nc_smr_enabled) {
1867 vfs_smr_synchronize();
1868 }
1869
1870 mount_lock_destroy(mp);
1871 #if CONFIG_MACF
1872 mac_mount_label_destroy(mp);
1873 #endif
1874 zfree(mount_zone, mp);
1875 }
1876 }
1877 if (vfsp_ref) {
1878 mount_list_lock();
1879 vfsp->vfc_refcount--;
1880 mount_list_unlock();
1881 }
1882
1883 return error;
1884 }
1885
1886 /*
1887 * Flush in-core data, check for competing mount attempts,
1888 * and set VMOUNT
1889 */
1890 int
prepare_coveredvp(vnode_t vp,vfs_context_t ctx,struct componentname * cnp,const char * fsname,uint32_t internal_flags)1891 prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, uint32_t internal_flags)
1892 {
1893 #if !CONFIG_MACF
1894 #pragma unused(cnp,fsname)
1895 #endif
1896 struct vnode_attr va;
1897 int error;
1898 boolean_t skip_auth = !!(internal_flags & KERNEL_MOUNT_NOAUTH);
1899 boolean_t is_fmount = !!(internal_flags & KERNEL_MOUNT_FMOUNT);
1900 boolean_t is_busy;
1901
1902 if (!skip_auth) {
1903 /*
1904 * If the user is not root, ensure that they own the directory
1905 * onto which we are attempting to mount.
1906 */
1907 VATTR_INIT(&va);
1908 VATTR_WANTED(&va, va_uid);
1909 if ((error = vnode_getattr(vp, &va, ctx)) ||
1910 (va.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
1911 (!vfs_context_issuser(ctx)))) {
1912 error = EPERM;
1913 goto out;
1914 }
1915 }
1916
1917 if ((error = VNOP_FSYNC(vp, MNT_WAIT, ctx))) {
1918 goto out;
1919 }
1920
1921 if ((error = buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0))) {
1922 goto out;
1923 }
1924
1925 if (vp->v_type != VDIR) {
1926 error = ENOTDIR;
1927 goto out;
1928 }
1929
1930 vnode_lock_spin(vp);
1931 is_busy = is_fmount ?
1932 (ISSET(vp->v_flag, VMOUNT) || (vp->v_mountedhere != NULL)) :
1933 (ISSET(vp->v_flag, VMOUNT) && (vp->v_mountedhere != NULL));
1934 if (is_busy) {
1935 vnode_unlock(vp);
1936 error = EBUSY;
1937 goto out;
1938 }
1939 SET(vp->v_flag, VMOUNT);
1940 vnode_unlock(vp);
1941
1942 #if CONFIG_MACF
1943 error = mac_mount_check_mount(ctx, vp,
1944 cnp, fsname);
1945 if (error != 0) {
1946 vnode_lock_spin(vp);
1947 CLR(vp->v_flag, VMOUNT);
1948 vnode_unlock(vp);
1949 }
1950 #endif
1951
1952 out:
1953 return error;
1954 }
1955
1956 #if CONFIG_IMGSRC_ACCESS
1957
1958 #define DEBUG_IMGSRC 0
1959
1960 #if DEBUG_IMGSRC
1961 #define IMGSRC_DEBUG(args...) printf("imgsrc: " args)
1962 #else
1963 #define IMGSRC_DEBUG(args...) do { } while(0)
1964 #endif
1965
1966 static int
authorize_devpath_and_update_mntfromname(mount_t mp,user_addr_t devpath,vnode_t * devvpp,vfs_context_t ctx)1967 authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx)
1968 {
1969 struct nameidata nd;
1970 vnode_t vp, realdevvp;
1971 mode_t accessmode;
1972 int error;
1973 enum uio_seg uio = UIO_USERSPACE;
1974
1975 if (ctx == vfs_context_kernel()) {
1976 uio = UIO_SYSSPACE;
1977 }
1978
1979 NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW, uio, devpath, ctx);
1980 if ((error = namei(&nd))) {
1981 IMGSRC_DEBUG("namei() failed with %d\n", error);
1982 return error;
1983 }
1984
1985 vp = nd.ni_vp;
1986
1987 if (!vnode_isblk(vp)) {
1988 IMGSRC_DEBUG("Not block device.\n");
1989 error = ENOTBLK;
1990 goto out;
1991 }
1992
1993 realdevvp = mp->mnt_devvp;
1994 if (realdevvp == NULLVP) {
1995 IMGSRC_DEBUG("No device backs the mount.\n");
1996 error = ENXIO;
1997 goto out;
1998 }
1999
2000 error = vnode_getwithref(realdevvp);
2001 if (error != 0) {
2002 IMGSRC_DEBUG("Coudn't get iocount on device.\n");
2003 goto out;
2004 }
2005
2006 if (vnode_specrdev(vp) != vnode_specrdev(realdevvp)) {
2007 IMGSRC_DEBUG("Wrong dev_t.\n");
2008 error = ENXIO;
2009 goto out1;
2010 }
2011
2012 strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);
2013
2014 /*
2015 * If mount by non-root, then verify that user has necessary
2016 * permissions on the device.
2017 */
2018 if (!vfs_context_issuser(ctx)) {
2019 accessmode = KAUTH_VNODE_READ_DATA;
2020 if ((mp->mnt_flag & MNT_RDONLY) == 0) {
2021 accessmode |= KAUTH_VNODE_WRITE_DATA;
2022 }
2023 if ((error = vnode_authorize(vp, NULL, accessmode, ctx)) != 0) {
2024 IMGSRC_DEBUG("Access denied.\n");
2025 goto out1;
2026 }
2027 }
2028
2029 *devvpp = vp;
2030
2031 out1:
2032 vnode_put(realdevvp);
2033
2034 out:
2035 nameidone(&nd);
2036
2037 if (error) {
2038 vnode_put(vp);
2039 }
2040
2041 return error;
2042 }
2043
2044 /*
2045 * Clear VMOUNT, set v_mountedhere, and mnt_vnodecovered, ref the vnode,
2046 * and call checkdirs()
2047 */
/*
 * Publish mp on covered vnode vp: clear VMOUNT (mount-in-progress),
 * set vp->v_mountedhere and mp->mnt_vnodecovered, take a usecount on
 * vp, invalidate name-cache fast-path state, and run checkdirs() to
 * re-point any process cwd/rootdir references.  On any failure,
 * mnt_vnodecovered is reset to NULLVP (but VMOUNT stays cleared and
 * v_mountedhere stays set; see undo_place_on_covered_vp()).
 *
 * Returns 0 on success or an errno from vnode_ref()/checkdirs().
 */
static int
place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx)
{
	int error;

	mp->mnt_vnodecovered = vp; /* XXX This is normally only set at init-time ... */

	IMGSRC_DEBUG("placing: fsname = %s, vp = %s\n",
	    mp->mnt_vtable->vfc_name, vnode_getname(vp));

	/* Atomically flip the vnode from "mount in progress" to "mounted here" */
	vnode_lock_spin(vp);
	CLR(vp->v_flag, VMOUNT);
	vp->v_mountedhere = mp;
	vnode_unlock(vp);

	/*
	 * taking the name_cache_lock exclusively will
	 * insure that everyone is out of the fast path who
	 * might be trying to use a now stale copy of
	 * vp->v_mountedhere->mnt_realrootvp
	 * bumping mount_generation causes the cached values
	 * to be invalidated
	 */
	name_cache_lock();
	mount_generation++;
	name_cache_unlock();

	/* Hold a usecount on the covered vnode for the lifetime of the mount */
	error = vnode_ref(vp);
	if (error != 0) {
		goto out;
	}

	error = checkdirs(vp, ctx);
	if (error != 0) {
		/* Unmount the filesystem as cdir/rdirs cannot be updated */
		vnode_rele(vp);
		goto out;
	}

out:
	/* Success also falls through here with error == 0 */
	if (error != 0) {
		mp->mnt_vnodecovered = NULLVP;
	}
	return error;
}
2093
/*
 * Reverse place_mount_and_checkdirs(): drop the usecount it took on
 * the covered vnode and detach the mount from both sides of the link.
 * NOTE(review): any cwd/rootdir updates made by checkdirs() are not
 * rolled back here -- confirm callers unmount shortly after.
 */
static void
undo_place_on_covered_vp(mount_t mp, vnode_t vp)
{
	vnode_rele(vp);
	vnode_lock_spin(vp);
	vp->v_mountedhere = (mount_t)NULL;
	vnode_unlock(vp);

	mp->mnt_vnodecovered = NULLVP;
}
2104
2105 static int
mount_begin_update(mount_t mp,vfs_context_t ctx,int flags)2106 mount_begin_update(mount_t mp, vfs_context_t ctx, int flags)
2107 {
2108 int error;
2109
2110 /* unmount in progress return error */
2111 mount_lock_spin(mp);
2112 if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
2113 mount_unlock(mp);
2114 return EBUSY;
2115 }
2116 mount_unlock(mp);
2117 lck_rw_lock_exclusive(&mp->mnt_rwlock);
2118
2119 /*
2120 * We only allow the filesystem to be reloaded if it
2121 * is currently mounted read-only.
2122 */
2123 if ((flags & MNT_RELOAD) &&
2124 ((mp->mnt_flag & MNT_RDONLY) == 0)) {
2125 error = ENOTSUP;
2126 goto out;
2127 }
2128
2129 /*
2130 * Only root, or the user that did the original mount is
2131 * permitted to update it.
2132 */
2133 if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
2134 (!vfs_context_issuser(ctx))) {
2135 error = EPERM;
2136 goto out;
2137 }
2138 #if CONFIG_MACF
2139 error = mac_mount_check_remount(ctx, mp);
2140 if (error != 0) {
2141 goto out;
2142 }
2143 #endif
2144
2145 out:
2146 if (error) {
2147 lck_rw_done(&mp->mnt_rwlock);
2148 }
2149
2150 return error;
2151 }
2152
/*
 * End a mount update started by mount_begin_update(): release the
 * exclusive mnt_rwlock.
 */
static void
mount_end_update(mount_t mp)
{
	lck_rw_done(&mp->mnt_rwlock);
}
2158
2159 static int
get_imgsrc_rootvnode(uint32_t height,vnode_t * rvpp)2160 get_imgsrc_rootvnode(uint32_t height, vnode_t *rvpp)
2161 {
2162 vnode_t vp;
2163
2164 if (height >= MAX_IMAGEBOOT_NESTING) {
2165 return EINVAL;
2166 }
2167
2168 vp = imgsrc_rootvnodes[height];
2169 if ((vp != NULLVP) && (vnode_get(vp) == 0)) {
2170 *rvpp = vp;
2171 return 0;
2172 } else {
2173 return ENOENT;
2174 }
2175 }
2176
/*
 * Relocate the imageboot source filesystem (the mount whose root is
 * imgsrc_rootvnodes[height]) so that it covers vnode vp instead,
 * updating f_mntonname/f_mntfromname and adding it to the mount list.
 * Root-only.  A given mount may only be moved once (MNTK_HAS_MOVED).
 *
 * pvp/vp: parent and covered vnodes from the caller's lookup (both
 * with iocounts held by the caller); cnp/fsname: lookup component and
 * expected fs type name; fsmountargs: user pointer to either a
 * mnt_imgsrc_args struct (by_index) or a bare device path.
 */
static int
relocate_imageboot_source(vnode_t pvp, vnode_t vp,
    struct componentname *cnp, const char *fsname, vfs_context_t ctx,
    boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index)
{
	int error;
	mount_t mp;
	boolean_t placed = FALSE;
	struct vfstable *vfsp;
	user_addr_t devpath;
	char *old_mntonname;
	vnode_t rvp;
	vnode_t devvp;
	uint32_t height;
	uint32_t flags;

	/* If we didn't imageboot, nothing to move */
	if (imgsrc_rootvnodes[0] == NULLVP) {
		return EINVAL;
	}

	/* Only root can do this */
	if (!vfs_context_issuser(ctx)) {
		return EPERM;
	}

	IMGSRC_DEBUG("looking for root vnode.\n");

	/*
	 * Get root vnode of filesystem we're moving.
	 */
	if (by_index) {
		/* Copy in height/flags/devpath from the sized args struct */
		if (is64bit) {
			struct user64_mnt_imgsrc_args mia64;
			error = copyin(fsmountargs, &mia64, sizeof(mia64));
			if (error != 0) {
				IMGSRC_DEBUG("Failed to copy in arguments.\n");
				return error;
			}

			height = mia64.mi_height;
			flags = mia64.mi_flags;
			devpath = (user_addr_t)mia64.mi_devpath;
		} else {
			struct user32_mnt_imgsrc_args mia32;
			error = copyin(fsmountargs, &mia32, sizeof(mia32));
			if (error != 0) {
				IMGSRC_DEBUG("Failed to copy in arguments.\n");
				return error;
			}

			height = mia32.mi_height;
			flags = mia32.mi_flags;
			devpath = mia32.mi_devpath;
		}
	} else {
		/*
		 * For binary compatibility--assumes one level of nesting.
		 */
		if (is64bit) {
			if ((error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath)))) {
				return error;
			}
		} else {
			user32_addr_t tmp;
			if ((error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp)))) {
				return error;
			}

			/* munge into LP64 addr */
			devpath = CAST_USER_ADDR_T(tmp);
		}

		height = 0;
		flags = 0;
	}

	/* No flag bits are defined yet; reject anything nonzero */
	if (flags != 0) {
		IMGSRC_DEBUG("%s: Got nonzero flags.\n", __FUNCTION__);
		return EINVAL;
	}

	/* On success rvp carries an iocount we must drop on all paths */
	error = get_imgsrc_rootvnode(height, &rvp);
	if (error != 0) {
		IMGSRC_DEBUG("getting old root vnode failed with %d\n", error);
		return error;
	}

	IMGSRC_DEBUG("got old root vnode\n");

	old_mntonname = zalloc_flags(ZV_NAMEI, Z_WAITOK);

	/* Can only move once */
	mp = vnode_mount(rvp);
	if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
		IMGSRC_DEBUG("Already moved.\n");
		error = EBUSY;
		goto out0;
	}

	IMGSRC_DEBUG("moving rvp: fsname = %s\n", mp->mnt_vtable->vfc_name);
	IMGSRC_DEBUG("Starting updated.\n");

	/* Get exclusive rwlock on mount, authorize update on mp */
	error = mount_begin_update(mp, ctx, 0);
	if (error != 0) {
		IMGSRC_DEBUG("Starting updated failed with %d\n", error);
		goto out0;
	}

	/*
	 * It can only be moved once. Flag is set under the rwlock,
	 * so we're now safe to proceed.
	 * NOTE(review): error is still 0 here, so this path returns 0
	 * (unlike the EBUSY returned by the pre-lock check above) --
	 * confirm whether a success return is intended for this race.
	 */
	if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
		IMGSRC_DEBUG("Already moved [2]\n");
		goto out1;
	}

	IMGSRC_DEBUG("Preparing coveredvp.\n");

	/* Mark covered vnode as mount in progress, authorize placing mount on top */
	error = prepare_coveredvp(vp, ctx, cnp, fsname, 0);
	if (error != 0) {
		IMGSRC_DEBUG("Preparing coveredvp failed with %d.\n", error);
		goto out1;
	}

	IMGSRC_DEBUG("Covered vp OK.\n");

	/* Sanity check the name caller has provided */
	vfsp = mp->mnt_vtable;
	if (strncmp(vfsp->vfc_name, fsname, MFSNAMELEN) != 0) {
		IMGSRC_DEBUG("Wrong fs name: actual = %s, expected = %s\n",
		    vfsp->vfc_name, fsname);
		error = EINVAL;
		goto out2;
	}

	/* Check the device vnode and update mount-from name, for local filesystems */
	if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) {
		IMGSRC_DEBUG("Local, doing device validation.\n");

		if (devpath != USER_ADDR_NULL) {
			error = authorize_devpath_and_update_mntfromname(mp, devpath, &devvp, ctx);
			if (error) {
				IMGSRC_DEBUG("authorize_devpath_and_update_mntfromname() failed.\n");
				goto out2;
			}

			/* Only needed the iocount for the validation above */
			vnode_put(devvp);
		}
	}

	/*
	 * Place mp on top of vnode, ref the vnode,  call checkdirs(),
	 * and increment the name cache's mount generation
	 */

	IMGSRC_DEBUG("About to call place_mount_and_checkdirs().\n");
	error = place_mount_and_checkdirs(mp, vp, ctx);
	if (error != 0) {
		goto out2;
	}

	/* From here on, failure cleanup must undo the placement */
	placed = TRUE;

	/* Save old mount-on name so it can be restored on failure */
	strlcpy(old_mntonname, mp->mnt_vfsstat.f_mntonname, MAXPATHLEN);
	strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);

	/* Forbid future moves */
	mount_lock(mp);
	mp->mnt_kern_flag |= MNTK_HAS_MOVED;
	mount_unlock(mp);

	/* Finally, add to mount list, completely ready to go */
	if (mount_list_add(mp) != 0) {
		/*
		 * The system is shutting down trying to umount
		 * everything, so fail with a plausible errno.
		 */
		error = EBUSY;
		goto out3;
	}

	mount_end_update(mp);
	vnode_put(rvp);
	zfree(ZV_NAMEI, old_mntonname);

	vfs_notify_mount(pvp);

	return 0;

	/* Error unwinding: each label undoes one more stage of the move */
out3:
	strlcpy(mp->mnt_vfsstat.f_mntonname, old_mntonname, MAXPATHLEN);

	mount_lock(mp);
	mp->mnt_kern_flag &= ~(MNTK_HAS_MOVED);
	mount_unlock(mp);

out2:
	/*
	 * Placing the mp on the vnode clears VMOUNT,
	 * so cleanup is different after that point
	 */
	if (placed) {
		/* Rele the vp, clear VMOUNT and v_mountedhere */
		undo_place_on_covered_vp(mp, vp);
	} else {
		vnode_lock_spin(vp);
		CLR(vp->v_flag, VMOUNT);
		vnode_unlock(vp);
	}
out1:
	mount_end_update(mp);

out0:
	vnode_put(rvp);
	zfree(ZV_NAMEI, old_mntonname);
	return error;
}
2397
2398 #endif /* CONFIG_IMGSRC_ACCESS */
2399
/*
 * Turn on disk quotas for an HFS mount if the per-type quota trigger
 * files exist under the mount point.  Lookup or quotactl failures are
 * deliberately ignored: quota setup must not interfere with the mount.
 */
void
enablequotas(struct mount *mp, vfs_context_t ctx)
{
	struct nameidata qnd;
	int type;
	char qfpath[MAXPATHLEN];
	const char *qfname = QUOTAFILENAME;
	const char *qfopsname = QUOTAOPSNAME;
	const char *qfextension[] = INITQFNAMES;

	/* XXX Should be an MNTK_ flag, instead of strncmp()'s */
	if (strncmp(mp->mnt_vfsstat.f_fstypename, "hfs", sizeof("hfs")) != 0) {
		return;
	}
	/*
	 * Enable filesystem disk quotas if necessary.
	 * We ignore errors as this should not interfere with final mount
	 */
	for (type = 0; type < MAXQUOTAS; type++) {
		/* Probe for the "<mnton>/<qfopsname>.<ext>" trigger file */
		snprintf(qfpath, sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfopsname, qfextension[type]);
		NDINIT(&qnd, LOOKUP, OP_MOUNT, FOLLOW, UIO_SYSSPACE,
		    CAST_USER_ADDR_T(qfpath), ctx);
		if (namei(&qnd) != 0) {
			continue;           /* option file to trigger quotas is not present */
		}
		vnode_put(qnd.ni_vp);
		nameidone(&qnd);
		/* Trigger found; hand the actual quota file path to the fs */
		snprintf(qfpath, sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfname, qfextension[type]);

		(void) VFS_QUOTACTL(mp, QCMD(Q_QUOTAON, type), 0, qfpath, ctx);
	}
	return;
}
2433
2434
/*
 * Per-process callback for checkdirs(): if the process's current or
 * root directory is the newly covered vnode (olddp), repoint it at
 * the root of the covering mount (newdp).  Always returns
 * PROC_RETURNED so iteration continues; errors (failed vnode_ref)
 * simply leave the process untouched.
 */
static int
checkdirs_callback(proc_t p, void * arg)
{
	struct cdirargs *cdrp = (struct cdirargs *)arg;
	vnode_t olddp = cdrp->olddp;
	vnode_t newdp = cdrp->newdp;
	struct filedesc *fdp = &p->p_fd;
	/* new_* start non-NULL and are cleared when a ref is consumed;
	 * old_* start NULL and are set when an old dir is displaced */
	vnode_t new_cvp = newdp;
	vnode_t new_rvp = newdp;
	vnode_t old_cvp = NULL;
	vnode_t old_rvp = NULL;

	/*
	 * XXX Also needs to iterate each thread in the process to see if it
	 * XXX is using a per-thread current working directory, and, if so,
	 * XXX update that as well.
	 */

	/*
	 * First, with the proc_fdlock held, check to see if we will need
	 * to do any work.  If not, we will get out fast.
	 */
	proc_fdlock(p);
	if (fdp->fd_cdir != olddp && fdp->fd_rdir != olddp) {
		proc_fdunlock(p);
		return PROC_RETURNED;
	}
	proc_fdunlock(p);

	/*
	 * Ok, we will have to do some work.  Always take two refs
	 * because we might need that many.  We'll dispose of whatever
	 * we ended up not using.
	 */
	if (vnode_ref(newdp) != 0) {
		return PROC_RETURNED;
	}
	if (vnode_ref(newdp) != 0) {
		vnode_rele(newdp);
		return PROC_RETURNED;
	}

	proc_dirs_lock_exclusive(p);
	/*
	 * Now do the work.  Note: we dropped the proc_fdlock, so we
	 * have to do all of the checks again.
	 */
	proc_fdlock(p);
	if (fdp->fd_cdir == olddp) {
		old_cvp = olddp;
		fdp->fd_cdir = newdp;
		new_cvp = NULL;		/* this ref now owned by fd_cdir */
	}
	if (fdp->fd_rdir == olddp) {
		old_rvp = olddp;
		fdp->fd_rdir = newdp;
		new_rvp = NULL;		/* this ref now owned by fd_rdir */
	}
	proc_fdunlock(p);
	proc_dirs_unlock_exclusive(p);

	/*
	 * Dispose of any references that are no longer needed.
	 */
	if (old_cvp != NULL) {
		vnode_rele(old_cvp);
	}
	if (old_rvp != NULL) {
		vnode_rele(old_rvp);
	}
	if (new_cvp != NULL) {
		vnode_rele(new_cvp);
	}
	if (new_rvp != NULL) {
		vnode_rele(new_rvp);
	}

	return PROC_RETURNED;
}
2514
2515
2516
2517 /*
2518 * Scan all active processes to see if any of them have a current
2519 * or root directory onto which the new filesystem has just been
2520 * mounted. If so, replace them with the new mount point.
2521 */
static int
checkdirs(vnode_t olddp, vfs_context_t ctx)
{
	vnode_t newdp;
	vnode_t tvp;
	int err;
	struct cdirargs cdr;

	/* A sole usecount means no process cwd/rootdir references olddp */
	if (olddp->v_usecount == 1) {
		return 0;
	}
	/* newdp comes back with an iocount; dropped at the end */
	err = VFS_ROOT(olddp->v_mountedhere, &newdp, ctx);

	if (err != 0) {
#if DIAGNOSTIC
		panic("mount: lost mount: error %d", err);
#endif
		return err;
	}

	cdr.olddp = olddp;
	cdr.newdp = newdp;
	/* do not block for exec/fork trans as the vp in cwd & rootdir are not changing */
	proc_iterate(PROC_ALLPROCLIST | PROC_NOWAITTRANS, checkdirs_callback, (void *)&cdr, NULL, NULL);

	/* If the system root itself was just covered, swap rootvnode too */
	if (rootvnode == olddp) {
		vnode_ref(newdp);
		lck_rw_lock_exclusive(&rootvnode_rw_lock);
		tvp = rootvnode;
		rootvnode = newdp;
		lck_rw_unlock_exclusive(&rootvnode_rw_lock);
		vnode_rele(tvp);
	}

	vnode_put(newdp);
	return 0;
}
2559
2560 #define ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT \
2561 "com.apple.private.vfs.role-account-unmount"
2562
2563 /*
2564 * Unmount a file system.
2565 *
2566 * Note: unmount takes a path to the vnode mounted on as argument,
2567 * not special file (as before).
2568 */
2569 /* ARGSUSED */
int
unmount(__unused proc_t p, struct unmount_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	struct mount *mp;
	int error;
	struct nameidata nd;
	vfs_context_t ctx;

	/*
	 * If the process has the entitlement, use the kernel's context when
	 * performing lookup on the mount path as the process might lack proper
	 * permission to access the directory.
	 */
	ctx = IOCurrentTaskHasEntitlement(ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT) ?
	    vfs_context_kernel() : vfs_context_current();

	NDINIT(&nd, LOOKUP, OP_UNMOUNT, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;
	mp = vp->v_mount;
	nameidone(&nd);

#if CONFIG_MACF
	error = mac_mount_check_umount(ctx, mp);
	if (error != 0) {
		vnode_put(vp);
		return error;
	}
#endif
	/*
	 * Must be the root of the filesystem
	 */
	if ((vp->v_flag & VROOT) == 0) {
		vnode_put(vp);
		return EINVAL;
	}
	/* Trade the vnode iocount for a mount ref before heading down */
	mount_ref(mp, 0);
	vnode_put(vp);
	/* safedounmount consumes the mount ref */
	return safedounmount(mp, uap->flags, ctx);
}
2616
2617 int
vfs_unmountbyfsid(fsid_t * fsid,int flags,vfs_context_t ctx)2618 vfs_unmountbyfsid(fsid_t *fsid, int flags, vfs_context_t ctx)
2619 {
2620 mount_t mp;
2621
2622 mp = mount_list_lookupby_fsid(fsid, 0, 1);
2623 if (mp == (mount_t)0) {
2624 return ENOENT;
2625 }
2626 mount_ref(mp, 0);
2627 mount_iterdrop(mp);
2628 /* safedounmount consumes the mount ref */
2629 return safedounmount(mp, flags, ctx);
2630 }
2631
2632 /*
2633 * The mount struct comes with a mount ref which will be consumed.
2634 * Do the actual file system unmount, prevent some common foot shooting.
2635 */
int
safedounmount(struct mount *mp, int flags, vfs_context_t ctx)
{
	int error;
	proc_t p = vfs_context_proc(ctx);

	/*
	 * If the file system is not responding and MNT_NOBLOCK
	 * is set and not a forced unmount then return EBUSY.
	 * NOTE(review): MNT_LNOTRESP is tested against mnt_kern_flag
	 * here -- confirm this bit is defined for kern_flag and not
	 * only for mnt_lflag.
	 */
	if ((mp->mnt_kern_flag & MNT_LNOTRESP) &&
	    (flags & MNT_NOBLOCK) && ((flags & MNT_FORCE) == 0)) {
		error = EBUSY;
		goto out;
	}

	/*
	 * Skip authorization in two cases:
	 * - If the process running the unmount has ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT.
	 *   This entitlement allows non-root processes unmount volumes mounted by
	 *   other processes.
	 * - If the mount is tagged as permissive and this is not a forced-unmount
	 *   attempt.
	 */
	if (!IOCurrentTaskHasEntitlement(ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT) &&
	    (!(((mp->mnt_kern_flag & MNTK_PERMIT_UNMOUNT) != 0) && ((flags & MNT_FORCE) == 0)))) {
		/*
		 * Only root, or the user that did the original mount is
		 * permitted to unmount this filesystem.
		 */
		if ((mp->mnt_vfsstat.f_owner != kauth_cred_getuid(kauth_cred_get())) &&
		    (error = suser(kauth_cred_get(), &p->p_acflag))) {
			goto out;
		}
	}
	/*
	 * Don't allow unmounting the root file system, or other volumes
	 * associated with it (for example, the associated VM or DATA mounts) .
	 */
	if ((mp->mnt_flag & MNT_ROOTFS) || (mp->mnt_kern_flag & MNTK_SYSTEM)) {
		if (!(mp->mnt_flag & MNT_ROOTFS)) {
			printf("attempt to unmount a system mount (%s), will return EBUSY\n",
			    mp->mnt_vfsstat.f_mntonname);
		}
		error = EBUSY; /* the root (or associated volumes) is always busy */
		goto out;
	}

	/*
	 * If the mount is providing the root filesystem's disk image
	 * (i.e. imageboot), don't allow unmounting
	 */
	if (mp->mnt_kern_flag & MNTK_BACKS_ROOT) {
		error = EBUSY;
		goto out;
	}

	/* dounmount() takes over the mount ref on success or failure */
	return dounmount(mp, flags, 1, ctx);

out:
	/* Early exit: drop the caller-supplied mount ref ourselves */
	mount_drop(mp, 0);
	return error;
}
2699
2700 /*
2701 * Do the actual file system unmount.
2702 */
/*
 * Tear down mount mp.  flags: MNT_FORCE forces the unmount (and
 * recursively unmounts submounts unless MNT_LNOSUB), MNT_NOBLOCK sets
 * P_NOREMOTEHANG for the duration.  withref != 0 means the caller
 * passed in a mount ref to be consumed.  Returns 0 or an errno;
 * on error the mount is restored to a usable state.
 */
int
dounmount(struct mount *mp, int flags, int withref, vfs_context_t ctx)
{
	vnode_t coveredvp = (vnode_t)0;
	int error;
	int needwakeup = 0;
	int forcedunmount = 0;
	int lflags = 0;
	struct vnode *devvp = NULLVP;
#if CONFIG_TRIGGERS
	proc_t p = vfs_context_proc(ctx);
	int did_vflush = 0;
	int pflags_save = 0;
#endif /* CONFIG_TRIGGERS */

#if CONFIG_FSE
	if (!(flags & MNT_FORCE)) {
		fsevent_unmount(mp, ctx); /* has to come first! */
	}
#endif

	mount_lock(mp);

	/*
	 * If already an unmount in progress just return EBUSY.
	 * Even a forced unmount cannot override.
	 */
	if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
		if (withref != 0) {
			mount_drop(mp, 1);
		}
		mount_unlock(mp);
		return EBUSY;
	}

	if (flags & MNT_FORCE) {
		forcedunmount = 1;
		mp->mnt_lflag |= MNT_LFORCE;
	}

#if CONFIG_TRIGGERS
	/* Don't let unresponsive remote filesystems hang the caller */
	if (flags & MNT_NOBLOCK && p != kernproc) {
		pflags_save = OSBitOrAtomic(P_NOREMOTEHANG, &p->p_flag);
	}
#endif

	mp->mnt_kern_flag |= MNTK_UNMOUNT;
	mp->mnt_lflag |= MNT_LUNMOUNT;
	mp->mnt_flag &= ~MNT_ASYNC;
	/*
	 * anyone currently in the fast path that
	 * trips over the cached rootvp will be
	 * dumped out and forced into the slow path
	 * to regenerate a new cached value
	 */
	mp->mnt_realrootvp = NULLVP;
	mount_unlock(mp);

	if (forcedunmount && (flags & MNT_LNOSUB) == 0) {
		/*
		 * Force unmount any mounts in this filesystem.
		 * If any unmounts fail - just leave them dangling.
		 * Avoids recursion.
		 */
		(void) dounmount_submounts(mp, flags | MNT_LNOSUB, ctx);
	}

	/*
	 * taking the name_cache_lock exclusively will
	 * insure that everyone is out of the fast path who
	 * might be trying to use a now stale copy of
	 * vp->v_mountedhere->mnt_realrootvp
	 * bumping mount_generation causes the cached values
	 * to be invalidated
	 */
	name_cache_lock();
	mount_generation++;
	name_cache_unlock();


	lck_rw_lock_exclusive(&mp->mnt_rwlock);
	if (withref != 0) {
		mount_drop(mp, 0);
	}
	error = 0;
	if (forcedunmount == 0) {
		ubc_umount(mp); /* release cached vnodes */
		if ((mp->mnt_flag & MNT_RDONLY) == 0) {
			error = VFS_SYNC(mp, MNT_WAIT, ctx);
			if (error) {
				/* Sync failed: undo the unmount-in-progress state */
				mount_lock(mp);
				mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
				mp->mnt_lflag &= ~MNT_LUNMOUNT;
				mp->mnt_lflag &= ~MNT_LFORCE;
				goto out;
			}
		}
	}

	IOBSDMountChange(mp, kIOMountChangeUnmount);

#if CONFIG_TRIGGERS
	vfs_nested_trigger_unmounts(mp, flags, ctx);
	did_vflush = 1;
#endif
	if (forcedunmount) {
		lflags |= FORCECLOSE;
	}
	error = vflush(mp, NULLVP, SKIPSWAP | SKIPSYSTEM | SKIPROOT | lflags);
	if ((forcedunmount == 0) && error) {
		/* Busy vnodes remain: undo the unmount-in-progress state */
		mount_lock(mp);
		mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
		mp->mnt_lflag &= ~MNT_LUNMOUNT;
		mp->mnt_lflag &= ~MNT_LFORCE;
		goto out;
	}

	/* make sure there are no one in the mount iterations or lookup */
	mount_iterdrain(mp);

	error = VFS_UNMOUNT(mp, flags, ctx);
	if (error) {
		/* Filesystem refused: re-enable iteration and undo state */
		mount_iterreset(mp);
		mount_lock(mp);
		mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
		mp->mnt_lflag &= ~MNT_LUNMOUNT;
		mp->mnt_lflag &= ~MNT_LFORCE;
		goto out;
	}

	/* increment the operations count */
	if (!error) {
		OSAddAtomic(1, &vfs_nummntops);
	}

	if (mp->mnt_devvp && mp->mnt_vtable->vfc_vfsflags & VFC_VFSLOCALARGS) {
		/* hold an io reference and drop the usecount before close */
		devvp = mp->mnt_devvp;
		vnode_getalways(devvp);
		vnode_rele(devvp);
		VNOP_CLOSE(devvp, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD | FWRITE,
		    ctx);
		vnode_clearmountedon(devvp);
		vnode_put(devvp);
	}
	/* Drop the rwlock across the mount-list removal, then retake it */
	lck_rw_done(&mp->mnt_rwlock);
	mount_list_remove(mp);
	lck_rw_lock_exclusive(&mp->mnt_rwlock);

	/* mark the mount point hook in the vp but not drop the ref yet */
	if ((coveredvp = mp->mnt_vnodecovered) != NULLVP) {
		/*
		 * The covered vnode needs special handling. Trying to get an
		 * iocount must not block here as this may lead to deadlocks
		 * if the Filesystem to which the covered vnode belongs is
		 * undergoing forced unmounts. Since we hold a usecount, the
		 * vnode cannot be reused (it can, however, still be terminated)
		 */
		vnode_getalways(coveredvp);
		vnode_lock_spin(coveredvp);

		mp->mnt_crossref++;
		coveredvp->v_mountedhere = (struct mount *)0;
		CLR(coveredvp->v_flag, VMOUNT);

		vnode_unlock(coveredvp);
		vnode_put(coveredvp);
	}

	mount_list_lock();
	mp->mnt_vtable->vfc_refcount--;
	mount_list_unlock();

	cache_purgevfs(mp);     /* remove cache entries for this file sys */
	vfs_event_signal(NULL, VQ_UNMOUNT, (intptr_t)NULL);
	mount_lock(mp);
	mp->mnt_lflag |= MNT_LDEAD;

	if (mp->mnt_lflag & MNT_LWAIT) {
		/*
		 * do the wakeup here
		 * in case we block in mount_refdrain
		 * which will drop the mount lock
		 * and allow anyone blocked in vfs_busy
		 * to wakeup and see the LDEAD state
		 */
		mp->mnt_lflag &= ~MNT_LWAIT;
		wakeup((caddr_t)mp);
	}
	mount_refdrain(mp);

	/* free disk_conditioner_info structure for this mount */
	disk_conditioner_unmount(mp);

out:
	/* Common exit for both success and the undo paths above */
	if (mp->mnt_lflag & MNT_LWAIT) {
		mp->mnt_lflag &= ~MNT_LWAIT;
		needwakeup = 1;
	}

#if CONFIG_TRIGGERS
	if (flags & MNT_NOBLOCK && p != kernproc) {
		// Restore P_NOREMOTEHANG bit to its previous value
		if ((pflags_save & P_NOREMOTEHANG) == 0) {
			OSBitAndAtomic(~((uint32_t) P_NOREMOTEHANG), &p->p_flag);
		}
	}

	/*
	 * Callback and context are set together under the mount lock, and
	 * never cleared, so we're safe to examine them here, drop the lock,
	 * and call out.
	 */
	if (mp->mnt_triggercallback != NULL) {
		mount_unlock(mp);
		if (error == 0) {
			mp->mnt_triggercallback(mp, VTC_RELEASE, mp->mnt_triggerdata, ctx);
		} else if (did_vflush) {
			mp->mnt_triggercallback(mp, VTC_REPLACE, mp->mnt_triggerdata, ctx);
		}
	} else {
		mount_unlock(mp);
	}
#else
	mount_unlock(mp);
#endif /* CONFIG_TRIGGERS */

	lck_rw_done(&mp->mnt_rwlock);

	if (needwakeup) {
		wakeup((caddr_t)mp);
	}

	if (!error) {
		if ((coveredvp != NULLVP)) {
			vnode_t pvp = NULLVP;

			/*
			 * The covered vnode needs special handling. Trying to
			 * get an iocount must not block here as this may lead
			 * to deadlocks if the Filesystem to which the covered
			 * vnode belongs is undergoing forced unmounts. Since we
			 * hold a usecount, the vnode cannot be reused
			 * (it can, however, still be terminated).
			 */
			vnode_getalways(coveredvp);

			mount_dropcrossref(mp, coveredvp, 0);
			/*
			 * We'll _try_ to detect if this really needs to be
			 * done. The coveredvp can only be in termination (or
			 * terminated) if the coveredvp's mount point is in a
			 * forced unmount (or has been) since we still hold the
			 * ref.
			 */
			if (!vnode_isrecycled(coveredvp)) {
				pvp = vnode_getparent(coveredvp);
#if CONFIG_TRIGGERS
				if (coveredvp->v_resolve) {
					vnode_trigger_rearm(coveredvp, ctx);
				}
#endif
			}

			vnode_rele(coveredvp);
			vnode_put(coveredvp);
			coveredvp = NULLVP;

			if (pvp) {
				lock_vnode_and_post(pvp, NOTE_WRITE);
				vnode_put(pvp);
			}
		} else if (mp->mnt_flag & MNT_ROOTFS) {
			/* Root mount has no covered vnode; free mp directly */
			if (nc_smr_enabled) {
				vfs_smr_synchronize();
			}

			mount_lock_destroy(mp);
#if CONFIG_MACF
			mac_mount_label_destroy(mp);
#endif
			zfree(mount_zone, mp);
		} else {
			panic("dounmount: no coveredvp");
		}
	}
	return error;
}
2991
2992 /*
2993 * Unmount any mounts in this filesystem.
2994 */
void
dounmount_submounts(struct mount *mp, int flags, vfs_context_t ctx)
{
	mount_t smp;
	fsid_t *fsids, fsid;
	int fsids_sz;
	int count = 0, i, m = 0;
	vnode_t vp;

	mount_list_lock();

	// Get an array to hold the submounts fsids.
	TAILQ_FOREACH(smp, &mountlist, mnt_list)
	count++;
	fsids_sz = count * sizeof(fsid_t);
	/* Z_NOWAIT: cannot block for memory while holding mount_list_lock */
	fsids = kalloc_data(fsids_sz, Z_NOWAIT);
	if (fsids == NULL) {
		mount_list_unlock();
		goto out;
	}
	fsids[0] = mp->mnt_vfsstat.f_fsid;      // Prime the pump

	/*
	 * Fill the array with submount fsids.
	 * Since mounts are always added to the tail of the mount list, the
	 * list is always in mount order.
	 * For each mount check if the mounted-on vnode belongs to a
	 * mount that's already added to our array of mounts to be unmounted.
	 */
	for (smp = TAILQ_NEXT(mp, mnt_list); smp; smp = TAILQ_NEXT(smp, mnt_list)) {
		vp = smp->mnt_vnodecovered;
		if (vp == NULL) {
			continue;
		}
		fsid = vnode_mount(vp)->mnt_vfsstat.f_fsid;     // Underlying fsid
		/* m is the index of the last fsid recorded so far */
		for (i = 0; i <= m; i++) {
			if (fsids[i].val[0] == fsid.val[0] &&
			    fsids[i].val[1] == fsid.val[1]) {
				fsids[++m] = smp->mnt_vfsstat.f_fsid;
				break;
			}
		}
	}
	mount_list_unlock();

	// Unmount the submounts in reverse order. Ignore errors.
	/* i > 0: slot 0 is mp itself, which the caller unmounts */
	for (i = m; i > 0; i--) {
		smp = mount_list_lookupby_fsid(&fsids[i], 0, 1);
		if (smp) {
			mount_ref(smp, 0);
			mount_iterdrop(smp);
			(void) dounmount(smp, flags, 1, ctx);
		}
	}
out:
	kfree_data(fsids, fsids_sz);
}
3052
/*
 * Drop one crossref on mp taken while tearing down its link to the
 * covered vnode dp.  When the count reaches zero and dp no longer
 * points at mp, the mount structure itself is freed.  need_put != 0
 * also releases the caller's iocount on dp.
 */
void
mount_dropcrossref(mount_t mp, vnode_t dp, int need_put)
{
	vnode_hold(dp);
	vnode_lock(dp);
	mp->mnt_crossref--;

	if (mp->mnt_crossref < 0) {
		panic("mount cross refs -ve");
	}

	/* Last crossref gone and mount detached from dp: free mp */
	if ((mp != dp->v_mountedhere) && (mp->mnt_crossref == 0)) {
		if (need_put) {
			vnode_put_locked(dp);
		}
		vnode_drop_and_unlock(dp);

		if (nc_smr_enabled) {
			vfs_smr_synchronize();
		}

		mount_lock_destroy(mp);
#if CONFIG_MACF
		mac_mount_label_destroy(mp);
#endif
		zfree(mount_zone, mp);
		return;
	}
	if (need_put) {
		vnode_put_locked(dp);
	}
	vnode_drop_and_unlock(dp);
}
3086
3087
3088 /*
3089 * Sync each mounted filesystem.
3090 */
#if DIAGNOSTIC
int syncprt = 0;        /* nonzero: dump buffer stats after sync (debug) */
#endif

int print_vmpage_stat = 0;      /* nonzero: report dirty-page counts after sync */
3096
3097 /*
3098 * sync_callback: simple wrapper that calls VFS_SYNC() on volumes
3099 * mounted read-write with the passed waitfor value.
3100 *
3101 * Parameters: mp mount-point descriptor per mounted file-system instance.
3102 * arg user argument (please see below)
3103 *
3104 * User argument is a pointer to 32 bit unsigned integer which describes the
3105 * type of waitfor value to set for calling VFS_SYNC(). If user argument is
3106 * passed as NULL, VFS_SYNC() is called with MNT_NOWAIT set as the default
3107 * waitfor value.
3108 *
3109 * Returns: VFS_RETURNED
3110 */
3111 static int
sync_callback(mount_t mp,void * arg)3112 sync_callback(mount_t mp, void *arg)
3113 {
3114 if ((mp->mnt_flag & MNT_RDONLY) == 0) {
3115 int asyncflag = mp->mnt_flag & MNT_ASYNC;
3116 unsigned waitfor = MNT_NOWAIT;
3117
3118 if (arg) {
3119 waitfor = *(uint32_t*)arg;
3120 }
3121
3122 /* Sanity check for flags - these are the only valid combinations for the flag bits*/
3123 if (waitfor != MNT_WAIT &&
3124 waitfor != (MNT_WAIT | MNT_VOLUME) &&
3125 waitfor != MNT_NOWAIT &&
3126 waitfor != (MNT_NOWAIT | MNT_VOLUME) &&
3127 waitfor != MNT_DWAIT &&
3128 waitfor != (MNT_DWAIT | MNT_VOLUME)) {
3129 panic("Passed inappropriate waitfor %u to "
3130 "sync_callback()", waitfor);
3131 }
3132
3133 mp->mnt_flag &= ~MNT_ASYNC;
3134 (void)VFS_SYNC(mp, waitfor, vfs_context_kernel());
3135 if (asyncflag) {
3136 mp->mnt_flag |= MNT_ASYNC;
3137 }
3138 }
3139
3140 return VFS_RETURNED;
3141 }
3142
3143 /* ARGSUSED */
/*
 * sync(2) system call: flush every read-write mounted filesystem
 * without waiting (MNT_NOWAIT via sync_callback's NULL-arg default).
 * Always returns 0.
 */
int
sync(__unused proc_t p, __unused struct sync_args *uap, __unused int32_t *retval)
{
	vfs_iterate(LK_NOWAIT, sync_callback, NULL);

	if (print_vmpage_stat) {
		vm_countdirtypages();
	}

#if DIAGNOSTIC
	if (syncprt) {
		vfs_bufstats();
	}
#endif /* DIAGNOSTIC */
	return 0;
}
3160
/*
 * Media-class filter for sync_internal_callback(); sync_thread()
 * syncs reliable media first, then unreliable media.
 */
typedef enum {
	SYNC_ALL = 0,                   /* no filtering: sync every volume */
	SYNC_ONLY_RELIABLE_MEDIA = 1,   /* local, non-virtual devices only */
	SYNC_ONLY_UNRELIABLE_MEDIA = 2  /* virtual or non-local devices only */
} sync_type_t;
3166
3167 static int
sync_internal_callback(mount_t mp,void * arg)3168 sync_internal_callback(mount_t mp, void *arg)
3169 {
3170 if (arg) {
3171 int is_reliable = !(mp->mnt_kern_flag & MNTK_VIRTUALDEV) &&
3172 (mp->mnt_flag & MNT_LOCAL);
3173 sync_type_t sync_type = *((sync_type_t *)arg);
3174
3175 if ((sync_type == SYNC_ONLY_RELIABLE_MEDIA) && !is_reliable) {
3176 return VFS_RETURNED;
3177 } else if ((sync_type == SYNC_ONLY_UNRELIABLE_MEDIA) && is_reliable) {
3178 return VFS_RETURNED;
3179 }
3180 }
3181
3182 (void)sync_callback(mp, NULL);
3183
3184 return VFS_RETURNED;
3185 }
3186
/* State shared between sync_internal() and sync_thread(); protected by sync_mtx_lck */
int sync_thread_state = 0;
/* Upper bound on how long sync_internal() waits for the sync thread */
int sync_timeout_seconds = 5;

#define SYNC_THREAD_RUN 0x0001          /* work pending: thread should (re)run a pass */
#define SYNC_THREAD_RUNNING 0x0002      /* a sync_thread instance is alive */

#if CONFIG_PHYS_WRITE_ACCT
thread_t pm_sync_thread;        /* current sync_thread, for write accounting */
#endif /* CONFIG_PHYS_WRITE_ACCT */
3196
/*
 * Worker thread started by sync_internal(): loops while SYNC_THREAD_RUN
 * is set, syncing reliable media first and then unreliable media, then
 * wakes any waiter and exits.  All state transitions happen under
 * sync_mtx_lck.
 */
static void
sync_thread(__unused void *arg, __unused wait_result_t wr)
{
	sync_type_t sync_type;
#if CONFIG_PHYS_WRITE_ACCT
	pm_sync_thread = current_thread();
#endif /* CONFIG_PHYS_WRITE_ACCT */

	lck_mtx_lock(&sync_mtx_lck);
	while (sync_thread_state & SYNC_THREAD_RUN) {
		/* Consume the run request; a new one can arrive while we sync */
		sync_thread_state &= ~SYNC_THREAD_RUN;
		lck_mtx_unlock(&sync_mtx_lck);

		sync_type = SYNC_ONLY_RELIABLE_MEDIA;
		vfs_iterate(LK_NOWAIT, sync_internal_callback, &sync_type);
		sync_type = SYNC_ONLY_UNRELIABLE_MEDIA;
		vfs_iterate(LK_NOWAIT, sync_internal_callback, &sync_type);

		lck_mtx_lock(&sync_mtx_lck);
	}
	/*
	 * This wakeup _has_ to be issued before the lock is released otherwise
	 * we may end up waking up a thread in sync_internal which is
	 * expecting a wakeup from a thread it just created and not from this
	 * thread which is about to exit.
	 */
	wakeup(&sync_thread_state);
	sync_thread_state &= ~SYNC_THREAD_RUNNING;
#if CONFIG_PHYS_WRITE_ACCT
	pm_sync_thread = NULL;
#endif /* CONFIG_PHYS_WRITE_ACCT */
	lck_mtx_unlock(&sync_mtx_lck);

	if (print_vmpage_stat) {
		vm_countdirtypages();
	}

#if DIAGNOSTIC
	if (syncprt) {
		vfs_bufstats();
	}
#endif /* DIAGNOSTIC */
}
3240
/* Rate-limits the "sync timed out" message to once every 120 seconds. */
struct timeval sync_timeout_last_print = {.tv_sec = 0, .tv_usec = 0};

/*
 * An in-kernel sync for power management to call.
 * This function always returns within sync_timeout seconds.
 */
__private_extern__ int
sync_internal(void)
{
	thread_t thd = NULL;
	int error;
	int thread_created = FALSE;
	struct timespec ts = {.tv_sec = sync_timeout_seconds, .tv_nsec = 0};

	lck_mtx_lock(&sync_mtx_lck);
	/* Request a flush pass; spawn the worker thread if none is alive. */
	sync_thread_state |= SYNC_THREAD_RUN;
	if (!(sync_thread_state & SYNC_THREAD_RUNNING)) {
		int kr;

		sync_thread_state |= SYNC_THREAD_RUNNING;
		kr = kernel_thread_start(sync_thread, NULL, &thd);
		if (kr != KERN_SUCCESS) {
			sync_thread_state &= ~SYNC_THREAD_RUNNING;
			lck_mtx_unlock(&sync_mtx_lck);
			printf("sync_thread failed\n");
			/* Best effort: report success so the caller can proceed. */
			return 0;
		}
		thread_created = TRUE;
	}

	/*
	 * Wait for the sync thread's wakeup, bounded by sync_timeout_seconds.
	 * PDROP releases sync_mtx_lck when msleep returns; PCATCH allows
	 * signal interruption.
	 */
	error = msleep((caddr_t)&sync_thread_state, &sync_mtx_lck,
	    (PVFS | PDROP | PCATCH), "sync_thread", &ts);
	if (error) {
		struct timeval now;

		microtime(&now);
		if (now.tv_sec - sync_timeout_last_print.tv_sec > 120) {
			printf("sync timed out: %d sec\n", sync_timeout_seconds);
			sync_timeout_last_print.tv_sec = now.tv_sec;
		}
	}

	/* Drop the reference kernel_thread_start() returned on the new thread. */
	if (thread_created) {
		thread_deallocate(thd);
	}

	return 0;
} /* end of sync_internal call */
3289
3290 /*
3291 * Change filesystem quotas.
3292 */
3293 #if QUOTA
/*
 * quotactl: manipulate filesystem quotas for the volume containing
 * uap->path.  The sub-command (upper bits of uap->cmd) determines how
 * uap->arg is interpreted: a quota-file path, a dqblk pointer, or an
 * int pointer.
 */
int
quotactl(proc_t p, struct quotactl_args *uap, __unused int32_t *retval)
{
	struct mount *mp;
	int error, quota_cmd, quota_status = 0;
	caddr_t datap;
	size_t fnamelen;
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();
	struct dqblk my_dqblk = {};

	AUDIT_ARG(uid, uap->uid);
	AUDIT_ARG(cmd, uap->cmd);
	/* Resolve the path only to identify the mount it belongs to. */
	NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
	    uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	mp = nd.ni_vp->v_mount;
	/* Keep the mount referenced for the duration; the vnode itself is not needed. */
	mount_ref(mp, 0);
	vnode_put(nd.ni_vp);
	nameidone(&nd);

#if CONFIG_MACF
	error = mac_mount_check_quotactl(ctx, mp, uap->cmd, uap->uid);
	if (error != 0) {
		goto out;
	}
#endif

	/* copyin any data we will need for downstream code */
	quota_cmd = uap->cmd >> SUBCMDSHIFT;

	switch (quota_cmd) {
	case Q_QUOTAON:
		/* uap->arg specifies a file from which to take the quotas */
		fnamelen = MAXPATHLEN;
		datap = zalloc(ZV_NAMEI);
		error = copyinstr(uap->arg, datap, MAXPATHLEN, &fnamelen);
		break;
	case Q_GETQUOTA:
		/* uap->arg is a pointer to a dqblk structure. */
		datap = (caddr_t) &my_dqblk;
		break;
	case Q_SETQUOTA:
	case Q_SETUSE:
		/* uap->arg is a pointer to a dqblk structure. */
		datap = (caddr_t) &my_dqblk;
		if (proc_is64bit(p)) {
			struct user_dqblk my_dqblk64;
			error = copyin(uap->arg, (caddr_t)&my_dqblk64, sizeof(my_dqblk64));
			if (error == 0) {
				/* Convert the 64-bit user layout to the kernel dqblk. */
				munge_dqblk(&my_dqblk, &my_dqblk64, FALSE);
			}
		} else {
			error = copyin(uap->arg, (caddr_t)&my_dqblk, sizeof(my_dqblk));
		}
		break;
	case Q_QUOTASTAT:
		/* uap->arg is a pointer to an integer */
		datap = (caddr_t) &quota_status;
		break;
	default:
		datap = NULL;
		break;
	} /* switch */

	/* Only issue the filesystem call if argument copyin succeeded. */
	if (error == 0) {
		error = VFS_QUOTACTL(mp, uap->cmd, uap->uid, datap, ctx);
	}

	/* Copy results back out / release per-command resources. */
	switch (quota_cmd) {
	case Q_QUOTAON:
		if (datap != NULL) {
			zfree(ZV_NAMEI, datap);
		}
		break;
	case Q_GETQUOTA:
		/* uap->arg is a pointer to a dqblk structure we need to copy out to */
		if (error == 0) {
			if (proc_is64bit(p)) {
				struct user_dqblk my_dqblk64;

				memset(&my_dqblk64, 0, sizeof(my_dqblk64));
				munge_dqblk(&my_dqblk, &my_dqblk64, TRUE);
				error = copyout((caddr_t)&my_dqblk64, uap->arg, sizeof(my_dqblk64));
			} else {
				error = copyout(datap, uap->arg, sizeof(struct dqblk));
			}
		}
		break;
	case Q_QUOTASTAT:
		/* uap->arg is a pointer to an integer */
		if (error == 0) {
			error = copyout(datap, uap->arg, sizeof(quota_status));
		}
		break;
	default:
		break;
	} /* switch */

out:
	mount_drop(mp, 0);
	return error;
}
3400 #else
int
quotactl(__unused proc_t p, __unused struct quotactl_args *uap, __unused int32_t *retval)
{
	/* Quota support was not compiled into this kernel. */
	return EOPNOTSUPP;
}
3406 #endif /* QUOTA */
3407
3408 static int
statfs_internal(proc_t p,struct mount * mp,user_addr_t bufp)3409 statfs_internal(proc_t p, struct mount *mp, user_addr_t bufp)
3410 {
3411 int error;
3412 vfs_context_t ctx = vfs_context_current();
3413
3414 #if CONFIG_MACF
3415 error = mac_mount_check_stat(ctx, mp);
3416 if (error != 0) {
3417 return error;
3418 }
3419 #endif
3420
3421 error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT);
3422 if (error != 0) {
3423 return error;
3424 }
3425
3426 return munge_statfs(mp, &mp->mnt_vfsstat, bufp, NULL, IS_64BIT_PROCESS(p), TRUE);
3427 }
3428
3429 /*
3430 * Get filesystem statistics.
3431 *
3432 * Returns: 0 Success
3433 * namei:???
3434 * vfs_update_vfsstat:???
3435 * munge_statfs:EFAULT
3436 */
3437 /* ARGSUSED */
3438 int
statfs(proc_t p,struct statfs_args * uap,__unused int32_t * retval)3439 statfs(proc_t p, struct statfs_args *uap, __unused int32_t *retval)
3440 {
3441 int error;
3442 struct mount *mp;
3443 struct nameidata nd;
3444 vfs_context_t ctx = vfs_context_current();
3445 vnode_t vp;
3446
3447 NDINIT(&nd, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
3448 UIO_USERSPACE, uap->path, ctx);
3449 error = namei(&nd);
3450 if (error != 0) {
3451 return error;
3452 }
3453 vp = nd.ni_vp;
3454 mp = vp->v_mount;
3455 nameidone(&nd);
3456
3457 error = statfs_internal(p, mp, uap->buf);
3458 vnode_put(vp);
3459
3460 return error;
3461 }
3462
3463 /*
3464 * Get filesystem statistics.
3465 */
3466 /* ARGSUSED */
int
fstatfs(proc_t p, struct fstatfs_args *uap, __unused int32_t *retval)
{
	int error;
	vnode_t vp = NULL;
	struct mount *mp;

	AUDIT_ARG(fd, uap->fd);

	/*
	 * vp stays NULL if file_vnode() fails; the "out" path uses that to
	 * decide whether a file_drop() is owed.  If vnode_getwithref() fails
	 * we skip out_vnode (no iocount was taken) but still drop the fd ref.
	 */
	if ((error = file_vnode(uap->fd, &vp)) ||
	    (error = vnode_getwithref(vp))) {
		goto out;
	}

	AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);

	mp = vp->v_mount;
	if (!mp) {
		error = EBADF;
		goto out_vnode;
	}

	error = statfs_internal(p, mp, uap->buf);

out_vnode:
	vnode_put(vp);

out:
	if (vp != NULL) {
		file_drop(uap->fd);
	}

	return error;
}
3501
3502 void
vfs_get_statfs64(struct mount * mp,struct statfs64 * sfs)3503 vfs_get_statfs64(struct mount *mp, struct statfs64 *sfs)
3504 {
3505 struct vfsstatfs *vsfs = &mp->mnt_vfsstat;
3506
3507 bzero(sfs, sizeof(*sfs));
3508
3509 sfs->f_bsize = vsfs->f_bsize;
3510 sfs->f_iosize = (int32_t)vsfs->f_iosize;
3511 sfs->f_blocks = vsfs->f_blocks;
3512 sfs->f_bfree = vsfs->f_bfree;
3513 sfs->f_bavail = vsfs->f_bavail;
3514 sfs->f_files = vsfs->f_files;
3515 sfs->f_ffree = vsfs->f_ffree;
3516 sfs->f_fsid = vsfs->f_fsid;
3517 sfs->f_owner = vsfs->f_owner;
3518 sfs->f_type = mp->mnt_vtable->vfc_typenum;
3519 sfs->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
3520 sfs->f_fssubtype = vsfs->f_fssubtype;
3521 sfs->f_flags_ext = (mp->mnt_kern_flag & MNTK_SYSTEMDATA) ? MNT_EXT_ROOT_DATA_VOL : 0;
3522 if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
3523 strlcpy(&sfs->f_fstypename[0], &mp->fstypename_override[0], MFSTYPENAMELEN);
3524 } else {
3525 strlcpy(&sfs->f_fstypename[0], &vsfs->f_fstypename[0], MFSTYPENAMELEN);
3526 }
3527 strlcpy(&sfs->f_mntonname[0], &vsfs->f_mntonname[0], MAXPATHLEN);
3528 strlcpy(&sfs->f_mntfromname[0], &vsfs->f_mntfromname[0], MAXPATHLEN);
3529 }
3530
3531 /*
3532 * Get file system statistics in 64-bit mode
3533 */
3534 int
statfs64(__unused struct proc * p,struct statfs64_args * uap,__unused int32_t * retval)3535 statfs64(__unused struct proc *p, struct statfs64_args *uap, __unused int32_t *retval)
3536 {
3537 struct mount *mp;
3538 int error;
3539 struct nameidata *ndp;
3540 struct statfs64 *sfsp;
3541 vfs_context_t ctxp = vfs_context_current();
3542 vnode_t vp;
3543 struct {
3544 struct nameidata nd;
3545 struct statfs64 sfs;
3546 } *__nameidata_statfs64;
3547
3548 __nameidata_statfs64 = kalloc_type(typeof(*__nameidata_statfs64),
3549 Z_WAITOK);
3550 ndp = &__nameidata_statfs64->nd;
3551
3552 NDINIT(ndp, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
3553 UIO_USERSPACE, uap->path, ctxp);
3554 error = namei(ndp);
3555 if (error != 0) {
3556 goto out;
3557 }
3558 vp = ndp->ni_vp;
3559 mp = vp->v_mount;
3560 nameidone(ndp);
3561
3562 #if CONFIG_MACF
3563 error = mac_mount_check_stat(ctxp, mp);
3564 if (error != 0) {
3565 vnode_put(vp);
3566 goto out;
3567 }
3568 #endif
3569
3570 error = vfs_update_vfsstat(mp, ctxp, VFS_USER_EVENT);
3571 if (error != 0) {
3572 vnode_put(vp);
3573 goto out;
3574 }
3575
3576 sfsp = &__nameidata_statfs64->sfs;
3577 vfs_get_statfs64(mp, sfsp);
3578 if ((mp->mnt_kern_flag & MNTK_SYSTEMDATA) &&
3579 (p->p_vfs_iopolicy & P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME)) {
3580 /* This process does not want to see a seperate data volume mountpoint */
3581 strlcpy(&sfsp->f_mntonname[0], "/", sizeof("/"));
3582 }
3583 error = copyout(sfsp, uap->buf, sizeof(*sfsp));
3584 vnode_put(vp);
3585
3586 out:
3587 kfree_type(typeof(*__nameidata_statfs64), __nameidata_statfs64);
3588
3589 return error;
3590 }
3591
3592 /*
3593 * Get file system statistics in 64-bit mode
3594 */
3595 int
fstatfs64(__unused struct proc * p,struct fstatfs64_args * uap,__unused int32_t * retval)3596 fstatfs64(__unused struct proc *p, struct fstatfs64_args *uap, __unused int32_t *retval)
3597 {
3598 struct vnode *vp;
3599 struct mount *mp;
3600 struct statfs64 sfs;
3601 int error;
3602
3603 AUDIT_ARG(fd, uap->fd);
3604
3605 if ((error = file_vnode(uap->fd, &vp))) {
3606 return error;
3607 }
3608
3609 error = vnode_getwithref(vp);
3610 if (error) {
3611 file_drop(uap->fd);
3612 return error;
3613 }
3614
3615 AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
3616
3617 mp = vp->v_mount;
3618 if (!mp) {
3619 error = EBADF;
3620 goto out;
3621 }
3622
3623 #if CONFIG_MACF
3624 error = mac_mount_check_stat(vfs_context_current(), mp);
3625 if (error != 0) {
3626 goto out;
3627 }
3628 #endif
3629
3630 if ((error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)) != 0) {
3631 goto out;
3632 }
3633
3634 vfs_get_statfs64(mp, &sfs);
3635 if ((mp->mnt_kern_flag & MNTK_SYSTEMDATA) &&
3636 (p->p_vfs_iopolicy & P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME)) {
3637 /* This process does not want to see a seperate data volume mountpoint */
3638 strlcpy(&sfs.f_mntonname[0], "/", sizeof("/"));
3639 }
3640 error = copyout(&sfs, uap->buf, sizeof(sfs));
3641
3642 out:
3643 file_drop(uap->fd);
3644 vnode_put(vp);
3645
3646 return error;
3647 }
3648
/* Accumulator passed through vfs_iterate() by the getfsstat family. */
struct getfsstat_struct {
	user_addr_t sfsp;       /* user buffer cursor; advanced per mount copied */
	user_addr_t *mp;        /* optional array of user MAC-label buffers */
	int count;              /* mounts seen so far */
	int maxcount;           /* capacity of the user buffer, in entries */
	int flags;              /* MNT_NOWAIT / MNT_WAIT / MNT_DWAIT */
	int error;              /* first error encountered, if any */
};
3657
3658
/*
 * vfs_iterate() callback for getfsstat()/__mac_getfsstat().  Copies one
 * mount's statistics (and optionally its MAC label) out to user space
 * while there is room, and counts every mount it reports.
 */
static int
getfsstat_callback(mount_t mp, void * arg)
{
	struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
	struct vfsstatfs *sp;
	int error, my_size;
	vfs_context_t ctx = vfs_context_current();

	/* Only copy data out while there is room; otherwise just count. */
	if (fstp->sfsp && fstp->count < fstp->maxcount) {
#if CONFIG_MACF
		error = mac_mount_check_stat(ctx, mp);
		if (error != 0) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
#endif
		sp = &mp->mnt_vfsstat;
		/*
		 * If MNT_NOWAIT is specified, do not refresh the
		 * fsstat cache. MNT_WAIT/MNT_DWAIT overrides MNT_NOWAIT.
		 */
		if ((mp->mnt_lflag & MNT_LDEAD) ||
		    (((fstp->flags & MNT_NOWAIT) == 0 || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
		    (!(mp->mnt_lflag & MNT_LUNMOUNT)) &&
		    (error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT)))) {
			KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
			/* Skip this mount (it is not counted) but keep iterating. */
			return VFS_RETURNED;
		}

		/*
		 * Need to handle LP64 version of struct statfs
		 */
		error = munge_statfs(mp, sp, fstp->sfsp, &my_size, IS_64BIT_PROCESS(vfs_context_proc(ctx)), FALSE);
		if (error) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
		/* Advance by the size actually written (32- vs 64-bit layout). */
		fstp->sfsp += my_size;

		if (fstp->mp) {
#if CONFIG_MACF
			error = mac_mount_label_get(mp, *fstp->mp);
			if (error) {
				fstp->error = error;
				return VFS_RETURNED_DONE;
			}
#endif
			fstp->mp++;
		}
	}
	fstp->count++;
	return VFS_RETURNED;
}
3712
3713 /*
3714 * Get statistics on all filesystems.
3715 */
3716 int
getfsstat(__unused proc_t p,struct getfsstat_args * uap,int * retval)3717 getfsstat(__unused proc_t p, struct getfsstat_args *uap, int *retval)
3718 {
3719 struct __mac_getfsstat_args muap;
3720
3721 muap.buf = uap->buf;
3722 muap.bufsize = uap->bufsize;
3723 muap.mac = USER_ADDR_NULL;
3724 muap.macsize = 0;
3725 muap.flags = uap->flags;
3726
3727 return __mac_getfsstat(p, &muap, retval);
3728 }
3729
3730 /*
3731 * __mac_getfsstat: Get MAC-related file system statistics
3732 *
3733 * Parameters: p (ignored)
3734 * uap User argument descriptor (see below)
3735 * retval Count of file system statistics (N stats)
3736 *
3737 * Indirect: uap->bufsize Buffer size
3738 * uap->macsize MAC info size
3739 * uap->buf Buffer where information will be returned
3740 * uap->mac MAC info
3741 * uap->flags File system flags
3742 *
3743 *
3744 * Returns: 0 Success
3745 * !0 Not success
3746 *
3747 */
3748 int
__mac_getfsstat(__unused proc_t p,struct __mac_getfsstat_args * uap,int * retval)3749 __mac_getfsstat(__unused proc_t p, struct __mac_getfsstat_args *uap, int *retval)
3750 {
3751 user_addr_t sfsp;
3752 user_addr_t *mp;
3753 size_t count, maxcount, bufsize, macsize;
3754 struct getfsstat_struct fst;
3755
3756 if ((unsigned)uap->bufsize > INT_MAX || (unsigned)uap->macsize > INT_MAX) {
3757 return EINVAL;
3758 }
3759
3760 bufsize = (size_t) uap->bufsize;
3761 macsize = (size_t) uap->macsize;
3762
3763 if (IS_64BIT_PROCESS(p)) {
3764 maxcount = bufsize / sizeof(struct user64_statfs);
3765 } else {
3766 maxcount = bufsize / sizeof(struct user32_statfs);
3767 }
3768 sfsp = uap->buf;
3769 count = 0;
3770
3771 mp = NULL;
3772
3773 #if CONFIG_MACF
3774 if (uap->mac != USER_ADDR_NULL) {
3775 u_int32_t *mp0;
3776 int error;
3777 unsigned int i;
3778
3779 count = (macsize / (IS_64BIT_PROCESS(p) ? 8 : 4));
3780 if (count != maxcount) {
3781 return EINVAL;
3782 }
3783
3784 /* Copy in the array */
3785 mp0 = kalloc_data(macsize, Z_WAITOK);
3786 if (mp0 == NULL) {
3787 return ENOMEM;
3788 }
3789
3790 error = copyin(uap->mac, mp0, macsize);
3791 if (error) {
3792 kfree_data(mp0, macsize);
3793 return error;
3794 }
3795
3796 /* Normalize to an array of user_addr_t */
3797 mp = kalloc_data(count * sizeof(user_addr_t), Z_WAITOK);
3798 if (mp == NULL) {
3799 kfree_data(mp0, macsize);
3800 return ENOMEM;
3801 }
3802
3803 for (i = 0; i < count; i++) {
3804 if (IS_64BIT_PROCESS(p)) {
3805 mp[i] = ((user_addr_t *)mp0)[i];
3806 } else {
3807 mp[i] = (user_addr_t)mp0[i];
3808 }
3809 }
3810 kfree_data(mp0, macsize);
3811 }
3812 #endif
3813
3814
3815 fst.sfsp = sfsp;
3816 fst.mp = mp;
3817 fst.flags = uap->flags;
3818 fst.count = 0;
3819 fst.error = 0;
3820 fst.maxcount = (int)maxcount;
3821
3822
3823 vfs_iterate(VFS_ITERATE_NOSKIP_UNMOUNT, getfsstat_callback, &fst);
3824
3825 if (mp) {
3826 kfree_data(mp, count * sizeof(user_addr_t));
3827 }
3828
3829 if (fst.error) {
3830 KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
3831 return fst.error;
3832 }
3833
3834 if (fst.sfsp && fst.count > fst.maxcount) {
3835 *retval = fst.maxcount;
3836 } else {
3837 *retval = fst.count;
3838 }
3839 return 0;
3840 }
3841
3842 static int
getfsstat64_callback(mount_t mp,void * arg)3843 getfsstat64_callback(mount_t mp, void * arg)
3844 {
3845 struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
3846 struct vfsstatfs *sp;
3847 struct statfs64 sfs;
3848 int error;
3849
3850 if (fstp->sfsp && fstp->count < fstp->maxcount) {
3851 #if CONFIG_MACF
3852 error = mac_mount_check_stat(vfs_context_current(), mp);
3853 if (error != 0) {
3854 fstp->error = error;
3855 return VFS_RETURNED_DONE;
3856 }
3857 #endif
3858 sp = &mp->mnt_vfsstat;
3859 /*
3860 * If MNT_NOWAIT is specified, do not refresh the fsstat
3861 * cache. MNT_WAIT overrides MNT_NOWAIT.
3862 *
3863 * We treat MNT_DWAIT as MNT_WAIT for all instances of
3864 * getfsstat, since the constants are out of the same
3865 * namespace.
3866 */
3867 if ((mp->mnt_lflag & MNT_LDEAD) ||
3868 ((((fstp->flags & MNT_NOWAIT) == 0) || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
3869 (!(mp->mnt_lflag & MNT_LUNMOUNT)) &&
3870 (error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)))) {
3871 KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
3872 return VFS_RETURNED;
3873 }
3874
3875 vfs_get_statfs64(mp, &sfs);
3876 error = copyout(&sfs, fstp->sfsp, sizeof(sfs));
3877 if (error) {
3878 fstp->error = error;
3879 return VFS_RETURNED_DONE;
3880 }
3881 fstp->sfsp += sizeof(sfs);
3882 }
3883 fstp->count++;
3884 return VFS_RETURNED;
3885 }
3886
3887 /*
3888 * Get statistics on all file systems in 64 bit mode.
3889 */
3890 int
getfsstat64(__unused proc_t p,struct getfsstat64_args * uap,int * retval)3891 getfsstat64(__unused proc_t p, struct getfsstat64_args *uap, int *retval)
3892 {
3893 user_addr_t sfsp;
3894 int count, maxcount;
3895 struct getfsstat_struct fst;
3896
3897 maxcount = uap->bufsize / sizeof(struct statfs64);
3898
3899 sfsp = uap->buf;
3900 count = 0;
3901
3902 fst.sfsp = sfsp;
3903 fst.flags = uap->flags;
3904 fst.count = 0;
3905 fst.error = 0;
3906 fst.maxcount = maxcount;
3907
3908 vfs_iterate(VFS_ITERATE_NOSKIP_UNMOUNT, getfsstat64_callback, &fst);
3909
3910 if (fst.error) {
3911 KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
3912 return fst.error;
3913 }
3914
3915 if (fst.sfsp && fst.count > fst.maxcount) {
3916 *retval = fst.maxcount;
3917 } else {
3918 *retval = fst.count;
3919 }
3920
3921 return 0;
3922 }
3923
3924 /*
3925 * gets the associated vnode with the file descriptor passed.
3926 * as input
3927 *
3928 * INPUT
3929 * ctx - vfs context of caller
3930 * fd - file descriptor for which vnode is required.
3931 * vpp - Pointer to pointer to vnode to be returned.
3932 *
3933 * The vnode is returned with an iocount so any vnode obtained
3934 * by this call needs a vnode_put
3935 *
3936 */
3937 int
vnode_getfromfd(vfs_context_t ctx,int fd,vnode_t * vpp)3938 vnode_getfromfd(vfs_context_t ctx, int fd, vnode_t *vpp)
3939 {
3940 int error;
3941 vnode_t vp;
3942 struct fileproc *fp;
3943 proc_t p = vfs_context_proc(ctx);
3944
3945 *vpp = NULLVP;
3946
3947 error = fp_getfvp(p, fd, &fp, &vp);
3948 if (error) {
3949 return error;
3950 }
3951
3952 error = vnode_getwithref(vp);
3953 if (error) {
3954 (void)fp_drop(p, fd, fp, 0);
3955 return error;
3956 }
3957
3958 (void)fp_drop(p, fd, fp, 0);
3959 *vpp = vp;
3960 return error;
3961 }
3962
3963 /*
3964 * Wrapper function around namei to start lookup from a directory
3965 * specified by a file descriptor ni_dirfd.
3966 *
3967 * In addition to all the errors returned by namei, this call can
3968 * return ENOTDIR if the file descriptor does not refer to a directory.
3969 * and EBADF if the file descriptor is not valid.
3970 */
int
nameiat(struct nameidata *ndp, int dirfd)
{
	/*
	 * Only honor dirfd for fresh, relative lookups: continued lookups
	 * (NAMEI_CONTLOOKUP) and callers that pre-supplied a directory vnode
	 * (USEDVP) already have their starting point established.
	 */
	if ((dirfd != AT_FDCWD) &&
	    !(ndp->ni_flag & NAMEI_CONTLOOKUP) &&
	    !(ndp->ni_cnd.cn_flags & USEDVP)) {
		int error = 0;
		char c;

		/* Peek at the path's first byte to detect an absolute path. */
		if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
			error = copyin(ndp->ni_dirp, &c, sizeof(char));
			if (error) {
				return error;
			}
		} else {
			c = *((char *)(ndp->ni_dirp));
		}

		if (c != '/') {
			vnode_t dvp_at;

			error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
			    &dvp_at);
			if (error) {
				return error;
			}

			/* dirfd must name a directory for a ...at() lookup. */
			if (vnode_vtype(dvp_at) != VDIR) {
				vnode_put(dvp_at);
				return ENOTDIR;
			}

			/* Start the lookup at dvp_at instead of the CWD. */
			ndp->ni_dvp = dvp_at;
			ndp->ni_cnd.cn_flags |= USEDVP;
			error = namei(ndp);
			ndp->ni_cnd.cn_flags &= ~USEDVP;
			vnode_put(dvp_at);
			return error;
		}
	}

	/* Absolute path or AT_FDCWD: plain namei semantics. */
	return namei(ndp);
}
4014
4015 /*
4016 * Change current working directory to a given file descriptor.
4017 */
4018 /* ARGSUSED */
/*
 * Shared implementation of fchdir()/__pthread_fchdir(): change the
 * process-wide (per_thread == 0) or per-thread (per_thread != 0) working
 * directory to the directory open on uap->fd.
 */
static int
common_fchdir(proc_t p, struct fchdir_args *uap, int per_thread)
{
	vnode_t vp;
	vnode_t tdp;
	vnode_t tvp;
	struct mount *mp;
	int error, should_put = 1;
	vfs_context_t ctx = vfs_context_current();

	AUDIT_ARG(fd, uap->fd);
	if (per_thread && uap->fd == -1) {
		/*
		 * Switching back from per-thread to per process CWD; verify we
		 * in fact have one before proceeding. The only success case
		 * for this code path is to return 0 preemptively after zapping
		 * the thread structure contents.
		 */
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = NULLVP;
			if (tvp != NULLVP) {
				vnode_rele(tvp);
				return 0;
			}
		}
		return EBADF;
	}

	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	/* The new working directory must be a directory. */
	if (vp->v_type != VDIR) {
		error = ENOTDIR;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_chdir(ctx, vp);
	if (error) {
		goto out;
	}
#endif
	/* The caller needs search permission on the directory. */
	error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
	if (error) {
		goto out;
	}

	/*
	 * If the directory is a mount point, descend through the mounted
	 * filesystem(s) so the CWD ends up at the topmost mounted root.
	 */
	while (!error && (mp = vp->v_mountedhere) != NULL) {
		if (vfs_busy(mp, LK_NOWAIT)) {
			error = EACCES;
			goto out;
		}
		error = VFS_ROOT(mp, &tdp, ctx);
		vfs_unbusy(mp);
		if (error) {
			break;
		}
		vnode_put(vp);
		vp = tdp;
	}
	if (error) {
		goto out;
	}
	/* Trade our transient iocount for a persistent usecount on the CWD. */
	if ((error = vnode_ref(vp))) {
		goto out;
	}
	vnode_put(vp);
	should_put = 0;

	if (per_thread) {
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = vp;
			OSBitOrAtomic(P_THCWD, &p->p_flag);
		} else {
			vnode_rele(vp);
			error = ENOENT;
			goto out;
		}
	} else {
		/* Lock order: dirs lock before fdlock. */
		proc_dirs_lock_exclusive(p);
		proc_fdlock(p);
		tvp = p->p_fd.fd_cdir;
		p->p_fd.fd_cdir = vp;
		proc_fdunlock(p);
		proc_dirs_unlock_exclusive(p);
	}

	/* Release the previous working directory, if any. */
	if (tvp) {
		vnode_rele(tvp);
	}

out:
	if (should_put) {
		vnode_put(vp);
	}
	file_drop(uap->fd);

	return error;
}
4131
int
fchdir(proc_t p, struct fchdir_args *uap, __unused int32_t *retval)
{
	/* Change the process-wide working directory to the fd's vnode. */
	return common_fchdir(p, uap, 0);
}
4137
int
__pthread_fchdir(proc_t p, struct __pthread_fchdir_args *uap, __unused int32_t *retval)
{
	/*
	 * Per-thread variant.  The (void *) cast relies on the args struct
	 * matching struct fchdir_args' layout.
	 */
	return common_fchdir(p, (void *)uap, 1);
}
4143
4144
4145 /*
4146 * Change current working directory (".").
4147 *
4148 * Returns: 0 Success
4149 * change_dir:ENOTDIR
4150 * change_dir:???
4151 * vnode_ref:ENOENT No such file or directory
4152 */
4153 /* ARGSUSED */
int
chdir_internal(proc_t p, vfs_context_t ctx, struct nameidata *ndp, int per_thread)
{
	int error;
	vnode_t tvp;

	/* change_dir returns with an iocount on ndp->ni_vp on success. */
	error = change_dir(ndp, ctx);
	if (error) {
		return error;
	}
	/* Take a persistent usecount on the new directory for the CWD. */
	if ((error = vnode_ref(ndp->ni_vp))) {
		vnode_put(ndp->ni_vp);
		return error;
	}
	/*
	 * drop the iocount we picked up in change_dir
	 */
	vnode_put(ndp->ni_vp);
	/* ndp->ni_vp stays valid below: the usecount taken above keeps it alive. */

	if (per_thread) {
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = ndp->ni_vp;
			OSBitOrAtomic(P_THCWD, &p->p_flag);
		} else {
			vnode_rele(ndp->ni_vp);
			return ENOENT;
		}
	} else {
		/* Lock order: dirs lock before fdlock. */
		proc_dirs_lock_exclusive(p);
		proc_fdlock(p);
		tvp = p->p_fd.fd_cdir;
		p->p_fd.fd_cdir = ndp->ni_vp;
		proc_fdunlock(p);
		proc_dirs_unlock_exclusive(p);
	}

	/* Release the previous working directory, if any. */
	if (tvp) {
		vnode_rele(tvp);
	}

	return 0;
}
4199
4200
4201 /*
4202 * Change current working directory (".").
4203 *
4204 * Returns: 0 Success
4205 * chdir_internal:ENOTDIR
4206 * chdir_internal:ENOENT No such file or directory
4207 * chdir_internal:???
4208 */
4209 /* ARGSUSED */
4210 static int
common_chdir(proc_t p,struct chdir_args * uap,int per_thread)4211 common_chdir(proc_t p, struct chdir_args *uap, int per_thread)
4212 {
4213 struct nameidata nd;
4214 vfs_context_t ctx = vfs_context_current();
4215
4216 NDINIT(&nd, LOOKUP, OP_CHDIR, FOLLOW | AUDITVNPATH1,
4217 UIO_USERSPACE, uap->path, ctx);
4218
4219 return chdir_internal(p, ctx, &nd, per_thread);
4220 }
4221
4222
4223 /*
4224 * chdir
4225 *
4226 * Change current working directory (".") for the entire process
4227 *
4228 * Parameters: p Process requesting the call
4229 * uap User argument descriptor (see below)
4230 * retval (ignored)
4231 *
4232 * Indirect parameters: uap->path Directory path
4233 *
4234 * Returns: 0 Success
4235 * common_chdir: ENOTDIR
4236 * common_chdir: ENOENT No such file or directory
4237 * common_chdir: ???
4238 *
4239 */
int
chdir(proc_t p, struct chdir_args *uap, __unused int32_t *retval)
{
	/* Process-wide chdir; see common_chdir(). */
	return common_chdir(p, (void *)uap, 0);
}
4245
4246 /*
4247 * __pthread_chdir
4248 *
4249 * Change current working directory (".") for a single thread
4250 *
4251 * Parameters: p Process requesting the call
4252 * uap User argument descriptor (see below)
4253 * retval (ignored)
4254 *
4255 * Indirect parameters: uap->path Directory path
4256 *
4257 * Returns: 0 Success
4258 * common_chdir: ENOTDIR
4259 * common_chdir: ENOENT No such file or directory
4260 * common_chdir: ???
4261 *
4262 */
int
__pthread_chdir(proc_t p, struct __pthread_chdir_args *uap, __unused int32_t *retval)
{
	/* Per-thread chdir; see common_chdir(). */
	return common_chdir(p, (void *)uap, 1);
}
4268
4269
4270 /*
4271 * Change notion of root (``/'') directory.
4272 */
4273 /* ARGSUSED */
int
chroot(proc_t p, struct chroot_args *uap, __unused int32_t *retval)
{
	struct filedesc *fdp = &p->p_fd;
	int error;
	struct nameidata nd;
	vnode_t tvp;
	vfs_context_t ctx = vfs_context_current();

	/* Only the superuser may change the root directory. */
	if ((error = suser(kauth_cred_get(), &p->p_acflag))) {
		return error;
	}

	NDINIT(&nd, LOOKUP, OP_CHROOT, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	/* change_dir returns with an iocount on nd.ni_vp on success. */
	error = change_dir(&nd, ctx);
	if (error) {
		return error;
	}

#if CONFIG_MACF
	error = mac_vnode_check_chroot(ctx, nd.ni_vp,
	    &nd.ni_cnd);
	if (error) {
		vnode_put(nd.ni_vp);
		return error;
	}
#endif

	/* Trade the iocount for a persistent usecount on the new root. */
	if ((error = vnode_ref(nd.ni_vp))) {
		vnode_put(nd.ni_vp);
		return error;
	}
	vnode_put(nd.ni_vp);

	/*
	 * This lock provides the guarantee that as long as you hold the lock
	 * fdp->fd_rdir has a usecount on it. This is used to take an iocount
	 * on a referenced vnode in namei when determining the rootvnode for
	 * a process.
	 */
	/* needed for synchronization with lookup */
	proc_dirs_lock_exclusive(p);
	/* needed for setting the flag and other activities on the fd itself */
	proc_fdlock(p);
	tvp = fdp->fd_rdir;
	fdp->fd_rdir = nd.ni_vp;
	fdt_flag_set(fdp, FD_CHROOT);
	proc_fdunlock(p);
	proc_dirs_unlock_exclusive(p);

	/* Release the previous root directory, if one was set. */
	if (tvp != NULL) {
		vnode_rele(tvp);
	}

	return 0;
}
4331
4332 #define PATHSTATICBUFLEN 256
4333 #define PIVOT_ROOT_ENTITLEMENT \
4334 "com.apple.private.vfs.pivot-root"
4335
4336 #if defined(XNU_TARGET_OS_OSX)
int
pivot_root(proc_t p, struct pivot_root_args *uap, __unused int *retval)
{
	int error;
	/*
	 * Static buffers handle the common case; paths longer than
	 * PATHSTATICBUFLEN fall back to ZV_NAMEI heap allocations.
	 */
	char new_rootfs_path_before[PATHSTATICBUFLEN] = {0};
	char old_rootfs_path_after[PATHSTATICBUFLEN] = {0};
	char *new_rootfs_path_before_buf = NULL;
	char *old_rootfs_path_after_buf = NULL;
	char *incoming = NULL;
	char *outgoing = NULL;
	vnode_t incoming_rootvp = NULLVP;
	size_t bytes_copied;

	/*
	 * XXX : Additional restrictions needed
	 * - perhaps callable only once.
	 */
	if ((error = suser(kauth_cred_get(), &p->p_acflag))) {
		return error;
	}

	/*
	 * pivot_root can be executed by launchd only.
	 * Enforce entitlement.
	 */
	if ((proc_getpid(p) != 1) || !IOCurrentTaskHasEntitlement(PIVOT_ROOT_ENTITLEMENT)) {
		return EPERM;
	}

	/* First copyin attempt uses the stack buffer; retry on overflow. */
	error = copyinstr(uap->new_rootfs_path_before, &new_rootfs_path_before[0], PATHSTATICBUFLEN, &bytes_copied);
	if (error == ENAMETOOLONG) {
		new_rootfs_path_before_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
		error = copyinstr(uap->new_rootfs_path_before, new_rootfs_path_before_buf, MAXPATHLEN, &bytes_copied);
	}

	if (error) {
		goto out;
	}

	error = copyinstr(uap->old_rootfs_path_after, &old_rootfs_path_after[0], PATHSTATICBUFLEN, &bytes_copied);
	if (error == ENAMETOOLONG) {
		old_rootfs_path_after_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
		error = copyinstr(uap->old_rootfs_path_after, old_rootfs_path_after_buf, MAXPATHLEN, &bytes_copied);
	}
	if (error) {
		goto out;
	}

	/* Point at whichever copy (static or heap) holds each path. */
	if (new_rootfs_path_before_buf) {
		incoming = new_rootfs_path_before_buf;
	} else {
		incoming = &new_rootfs_path_before[0];
	}

	if (old_rootfs_path_after_buf) {
		outgoing = old_rootfs_path_after_buf;
	} else {
		outgoing = &old_rootfs_path_after[0];
	}

	/*
	 * The proposed incoming FS MUST be authenticated (i.e. not a chunklist DMG).
	 * Userland is not allowed to pivot to an image.
	 */
	error = vnode_lookup(incoming, 0, &incoming_rootvp, vfs_context_kernel());
	if (error) {
		goto out;
	}
	error = VNOP_IOCTL(incoming_rootvp, FSIOC_KERNEL_ROOTAUTH, NULL, 0, vfs_context_kernel());
	if (error) {
		goto out;
	}

	error = vfs_switch_root(incoming, outgoing, VFSSR_VIRTUALDEV_PROHIBITED);

out:
	if (incoming_rootvp != NULLVP) {
		vnode_put(incoming_rootvp);
		incoming_rootvp = NULLVP;
	}

	if (old_rootfs_path_after_buf) {
		zfree(ZV_NAMEI, old_rootfs_path_after_buf);
	}

	if (new_rootfs_path_before_buf) {
		zfree(ZV_NAMEI, new_rootfs_path_before_buf);
	}

	return error;
}
4428 #else
4429 int
pivot_root(proc_t p,__unused struct pivot_root_args * uap,int * retval)4430 pivot_root(proc_t p, __unused struct pivot_root_args *uap, int *retval)
4431 {
4432 return nosys(p, NULL, retval);
4433 }
4434 #endif /* XNU_TARGET_OS_OSX */
4435
4436 /*
4437 * Common routine for chroot and chdir.
4438 *
4439 * Returns: 0 Success
4440 * ENOTDIR Not a directory
4441 * namei:??? [anything namei can return]
4442 * vnode_authorize:??? [anything vnode_authorize can return]
4443 */
4444 static int
change_dir(struct nameidata * ndp,vfs_context_t ctx)4445 change_dir(struct nameidata *ndp, vfs_context_t ctx)
4446 {
4447 vnode_t vp;
4448 int error;
4449
4450 if ((error = namei(ndp))) {
4451 return error;
4452 }
4453 nameidone(ndp);
4454 vp = ndp->ni_vp;
4455
4456 if (vp->v_type != VDIR) {
4457 vnode_put(vp);
4458 return ENOTDIR;
4459 }
4460
4461 #if CONFIG_MACF
4462 error = mac_vnode_check_chdir(ctx, vp);
4463 if (error) {
4464 vnode_put(vp);
4465 return error;
4466 }
4467 #endif
4468
4469 error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
4470 if (error) {
4471 vnode_put(vp);
4472 return error;
4473 }
4474
4475 return error;
4476 }
4477
/*
 * Allocate the per-fd vnode data (for directories) associated with the file glob.
 */
4481 struct fd_vn_data *
fg_vn_data_alloc(void)4482 fg_vn_data_alloc(void)
4483 {
4484 struct fd_vn_data *fvdata;
4485
4486 /* Allocate per fd vnode data */
4487 fvdata = kalloc_type(struct fd_vn_data, Z_WAITOK | Z_ZERO);
4488 lck_mtx_init(&fvdata->fv_lock, &fd_vn_lck_grp, &fd_vn_lck_attr);
4489 return fvdata;
4490 }
4491
4492 /*
4493 * Free the vnode data (for directories) associated with the file glob.
4494 */
4495 void
fg_vn_data_free(void * fgvndata)4496 fg_vn_data_free(void *fgvndata)
4497 {
4498 struct fd_vn_data *fvdata = (struct fd_vn_data *)fgvndata;
4499
4500 kfree_data(fvdata->fv_buf, fvdata->fv_bufallocsiz);
4501 lck_mtx_destroy(&fvdata->fv_lock, &fd_vn_lck_grp);
4502 kfree_type(struct fd_vn_data, fvdata);
4503 }
4504
4505 /*
4506 * Check permissions, allocate an open file structure,
4507 * and call the device open routine if any.
4508 *
4509 * Returns: 0 Success
4510 * EINVAL
4511 * EINTR
4512 * falloc:ENFILE
4513 * falloc:EMFILE
4514 * falloc:ENOMEM
4515 * vn_open_auth:???
4516 * dupfdopen:???
4517 * VNOP_ADVLOCK:???
4518 * vnode_setsize:???
4519 *
4520 * XXX Need to implement uid, gid
4521 */
4522 int
open1(vfs_context_t ctx,struct nameidata * ndp,int uflags,struct vnode_attr * vap,fp_initfn_t fp_init,void * initarg,int32_t * retval,int authfd)4523 open1(vfs_context_t ctx, struct nameidata *ndp, int uflags,
4524 struct vnode_attr *vap, fp_initfn_t fp_init, void *initarg, int32_t *retval, int authfd)
4525 {
4526 proc_t p = vfs_context_proc(ctx);
4527 uthread_t uu = get_bsdthread_info(vfs_context_thread(ctx));
4528 struct fileproc *fp;
4529 vnode_t vp;
4530 int flags, oflags, amode;
4531 int type, indx, error;
4532 struct vfs_context context;
4533 vnode_t authvp = NULLVP;
4534
4535 oflags = uflags;
4536
4537 amode = oflags & O_ACCMODE;
4538 /*
4539 * Because O_RDONLY is 0, it is not possible to distinguish between
4540 * O_EXEC | O_RDONLY and O_EXEC, therefore FEXEC/FSEARCH can't be set together
4541 * with FREAD/FWRITE.
4542 */
4543 if ((amode == O_ACCMODE) || (amode && (oflags & O_EXEC))) {
4544 return EINVAL;
4545 }
4546
4547 flags = FFLAGS(uflags);
4548 CLR(flags, FENCRYPTED);
4549 CLR(flags, FUNENCRYPTED);
4550
4551 AUDIT_ARG(fflags, oflags);
4552 AUDIT_ARG(mode, vap->va_mode);
4553
4554 if ((error = falloc_withinit(p, &fp, &indx, ctx, fp_init, initarg)) != 0) {
4555 return error;
4556 }
4557 if (flags & O_CLOEXEC) {
4558 fp->fp_flags |= FP_CLOEXEC;
4559 }
4560 if (flags & O_CLOFORK) {
4561 fp->fp_flags |= FP_CLOFORK;
4562 }
4563
4564 /* setup state to recognize when fdesc_open was called */
4565 uu->uu_dupfd = -1;
4566
4567 /*
4568 * Disable read/write access if file is opened with O_EVTONLY and
4569 * the process has requested to deny read/write access.
4570 */
4571 if ((flags & O_EVTONLY) && proc_disallow_rw_for_o_evtonly(p)) {
4572 flags &= ~(FREAD | FWRITE);
4573 }
4574
4575 if (authfd != AUTH_OPEN_NOAUTHFD) {
4576 error = vnode_getfromfd(ctx, authfd, &authvp);
4577 if (error) {
4578 fp_free(p, indx, fp);
4579 return error;
4580 }
4581 }
4582
4583 if ((error = vn_open_auth(ndp, &flags, vap, authvp))) {
4584 if (authvp != NULLVP) {
4585 vnode_put(authvp);
4586 }
4587 if ((error == ENODEV || error == ENXIO) && (uu->uu_dupfd >= 0)) {
4588 if ((error = dupfdopen(p, indx, uu->uu_dupfd, flags, error)) == 0) {
4589 *retval = indx;
4590 return 0;
4591 }
4592 }
4593 if (error == ERESTART) {
4594 error = EINTR;
4595 }
4596 fp_free(p, indx, fp);
4597 return error;
4598 }
4599
4600 if (authvp != NULLVP) {
4601 vnode_put(authvp);
4602 }
4603
4604 uu->uu_dupfd = 0;
4605 vp = ndp->ni_vp;
4606
4607 fp->fp_glob->fg_flag = flags & (FMASK | O_EVTONLY | FENCRYPTED | FUNENCRYPTED);
4608 fp->fp_glob->fg_ops = &vnops;
4609 fp_set_data(fp, vp);
4610
4611 #if CONFIG_FILE_LEASES
4612 /*
4613 * If we are creating a file or open with truncate, we need to break the
4614 * lease if there is a read lease placed on the parent dir.
4615 */
4616 if ((vnode_vtype(vp) == VREG) && (flags & (O_CREAT | O_TRUNC))) {
4617 vnode_breakdirlease(vp, true, oflags);
4618 }
4619 /* Now check if there is a lease placed on the file itself. */
4620 error = vnode_breaklease(vp, oflags, ctx);
4621 if (error) {
4622 goto bad;
4623 }
4624 #endif /* CONFIG_FILE_LEASES */
4625
4626 if (flags & (O_EXLOCK | O_SHLOCK)) {
4627 struct flock lf = {
4628 .l_whence = SEEK_SET,
4629 };
4630
4631 if (flags & O_EXLOCK) {
4632 lf.l_type = F_WRLCK;
4633 } else {
4634 lf.l_type = F_RDLCK;
4635 }
4636 type = F_FLOCK;
4637 if ((flags & FNONBLOCK) == 0) {
4638 type |= F_WAIT;
4639 }
4640 #if CONFIG_MACF
4641 error = mac_file_check_lock(vfs_context_ucred(ctx), fp->fp_glob,
4642 F_SETLK, &lf);
4643 if (error) {
4644 goto bad;
4645 }
4646 #endif
4647 if ((error = VNOP_ADVLOCK(vp, (caddr_t)fp->fp_glob, F_SETLK, &lf, type, ctx, NULL))) {
4648 goto bad;
4649 }
4650 fp->fp_glob->fg_flag |= FWASLOCKED;
4651 }
4652
4653 /* try to truncate by setting the size attribute */
4654 if ((flags & O_TRUNC) && ((error = vnode_setsize(vp, (off_t)0, 0, ctx)) != 0)) {
4655 goto bad;
4656 }
4657
4658 /*
4659 * For directories we hold some additional information in the fd.
4660 */
4661 if (vnode_vtype(vp) == VDIR) {
4662 fp->fp_glob->fg_vn_data = fg_vn_data_alloc();
4663 } else {
4664 fp->fp_glob->fg_vn_data = NULL;
4665 }
4666
4667 #if CONFIG_SECLUDED_MEMORY
4668 if (secluded_for_filecache && vnode_vtype(vp) == VREG) {
4669 memory_object_control_t moc;
4670 const char *v_name;
4671
4672 moc = ubc_getobject(vp, UBC_FLAGS_NONE);
4673
4674 if (moc == MEMORY_OBJECT_CONTROL_NULL) {
4675 /* nothing to do... */
4676 } else if (fp->fp_glob->fg_flag & FWRITE) {
4677 /* writable -> no longer eligible for secluded pages */
4678 memory_object_mark_eligible_for_secluded(moc,
4679 FALSE);
4680 } else if (secluded_for_filecache == SECLUDED_FILECACHE_APPS) {
4681 char pathname[32] = { 0, };
4682 size_t copied;
4683 /* XXX FBDP: better way to detect /Applications/ ? */
4684 if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
4685 (void)copyinstr(ndp->ni_dirp,
4686 pathname,
4687 sizeof(pathname),
4688 &copied);
4689 } else {
4690 copystr(CAST_DOWN(void *, ndp->ni_dirp),
4691 pathname,
4692 sizeof(pathname),
4693 &copied);
4694 }
4695 pathname[sizeof(pathname) - 1] = '\0';
4696 if (strncmp(pathname,
4697 "/Applications/",
4698 strlen("/Applications/")) == 0 &&
4699 strncmp(pathname,
4700 "/Applications/Camera.app/",
4701 strlen("/Applications/Camera.app/")) != 0) {
4702 /*
4703 * not writable
4704 * AND from "/Applications/"
4705 * AND not from "/Applications/Camera.app/"
4706 * ==> eligible for secluded
4707 */
4708 memory_object_mark_eligible_for_secluded(moc,
4709 TRUE);
4710 }
4711 } else if (secluded_for_filecache == SECLUDED_FILECACHE_RDONLY &&
4712 (v_name = vnode_getname(vp))) {
4713 size_t len = strlen(v_name);
4714
4715 if (!strncmp(v_name, "dyld", len) ||
4716 !strncmp(v_name, "launchd", len) ||
4717 !strncmp(v_name, "Camera", len) ||
4718 !strncmp(v_name, "SpringBoard", len) ||
4719 !strncmp(v_name, "backboardd", len)) {
4720 /*
4721 * This file matters when launching Camera:
4722 * do not store its contents in the secluded
4723 * pool that will be drained on Camera launch.
4724 */
4725 memory_object_mark_eligible_for_secluded(moc,
4726 FALSE);
4727 } else if (!strncmp(v_name, "mediaserverd", len)) {
4728 memory_object_mark_eligible_for_secluded(moc,
4729 FALSE);
4730 memory_object_mark_for_realtime(moc,
4731 true);
4732 } else if (!strncmp(v_name, "bluetoothd", len)) {
4733 /*
4734 * bluetoothd might be needed for realtime audio
4735 * playback.
4736 */
4737 memory_object_mark_eligible_for_secluded(moc,
4738 FALSE);
4739 memory_object_mark_for_realtime(moc,
4740 true);
4741 } else {
4742 char pathname[64] = { 0, };
4743 size_t copied;
4744 if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
4745 (void)copyinstr(ndp->ni_dirp,
4746 pathname,
4747 sizeof(pathname),
4748 &copied);
4749 } else {
4750 copystr(CAST_DOWN(void *, ndp->ni_dirp),
4751 pathname,
4752 sizeof(pathname),
4753 &copied);
4754 }
4755 pathname[sizeof(pathname) - 1] = '\0';
4756 if (strncmp(pathname,
4757 "/Library/Audio/Plug-Ins/",
4758 strlen("/Library/Audio/Plug-Ins/")) == 0 ||
4759 strncmp(pathname,
4760 "/System/Library/Audio/Plug-Ins/",
4761 strlen("/System/Library/Audio/Plug-Ins/")) == 0) {
4762 /*
4763 * This may be an audio plugin required
4764 * for realtime playback.
4765 * ==> NOT eligible for secluded.
4766 */
4767 memory_object_mark_eligible_for_secluded(moc,
4768 FALSE);
4769 memory_object_mark_for_realtime(moc,
4770 true);
4771 }
4772 }
4773 vnode_putname(v_name);
4774 }
4775 }
4776 #endif /* CONFIG_SECLUDED_MEMORY */
4777
4778 vnode_put(vp);
4779
4780 /*
4781 * The first terminal open (without a O_NOCTTY) by a session leader
4782 * results in it being set as the controlling terminal.
4783 */
4784 if (vnode_istty(vp) && !(p->p_flag & P_CONTROLT) &&
4785 !(flags & O_NOCTTY)) {
4786 int tmp = 0;
4787
4788 (void)(*fp->fp_glob->fg_ops->fo_ioctl)(fp, (int)TIOCSCTTY,
4789 (caddr_t)&tmp, ctx);
4790 }
4791
4792 proc_fdlock(p);
4793 procfdtbl_releasefd(p, indx, NULL);
4794
4795 fp_drop(p, indx, fp, 1);
4796 proc_fdunlock(p);
4797
4798 *retval = indx;
4799
4800 return 0;
4801 bad:
4802 context = *vfs_context_current();
4803 context.vc_ucred = fp->fp_glob->fg_cred;
4804
4805 if ((fp->fp_glob->fg_flag & FWASLOCKED) &&
4806 (FILEGLOB_DTYPE(fp->fp_glob) == DTYPE_VNODE)) {
4807 struct flock lf = {
4808 .l_whence = SEEK_SET,
4809 .l_type = F_UNLCK,
4810 };
4811
4812 (void)VNOP_ADVLOCK(
4813 vp, (caddr_t)fp->fp_glob, F_UNLCK, &lf, F_FLOCK, ctx, NULL);
4814 }
4815
4816 vn_close(vp, fp->fp_glob->fg_flag, &context);
4817 vnode_put(vp);
4818 fp_free(p, indx, fp);
4819
4820 return error;
4821 }
4822
4823 /*
4824 * While most of the *at syscall handlers can call nameiat() which
4825 * is a wrapper around namei, the use of namei and initialisation
4826 * of nameidata are far removed and in different functions - namei
4827 * gets called in vn_open_auth for open1. So we'll just do here what
4828 * nameiat() does.
4829 */
4830 static int
open1at(vfs_context_t ctx,struct nameidata * ndp,int uflags,struct vnode_attr * vap,fp_initfn_t fp_init,void * initarg,int32_t * retval,int dirfd,int authfd)4831 open1at(vfs_context_t ctx, struct nameidata *ndp, int uflags,
4832 struct vnode_attr *vap, fp_initfn_t fp_init, void *initarg, int32_t *retval,
4833 int dirfd, int authfd)
4834 {
4835 if ((dirfd != AT_FDCWD) && !(ndp->ni_cnd.cn_flags & USEDVP)) {
4836 int error;
4837 char c;
4838
4839 if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
4840 error = copyin(ndp->ni_dirp, &c, sizeof(char));
4841 if (error) {
4842 return error;
4843 }
4844 } else {
4845 c = *((char *)(ndp->ni_dirp));
4846 }
4847
4848 if (c != '/') {
4849 vnode_t dvp_at;
4850
4851 error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
4852 &dvp_at);
4853 if (error) {
4854 return error;
4855 }
4856
4857 if (vnode_vtype(dvp_at) != VDIR) {
4858 vnode_put(dvp_at);
4859 return ENOTDIR;
4860 }
4861
4862 ndp->ni_dvp = dvp_at;
4863 ndp->ni_cnd.cn_flags |= USEDVP;
4864 error = open1(ctx, ndp, uflags, vap, fp_init, initarg,
4865 retval, authfd);
4866 vnode_put(dvp_at);
4867 return error;
4868 }
4869 }
4870
4871 return open1(ctx, ndp, uflags, vap, fp_init, initarg, retval, authfd);
4872 }
4873
4874 /*
4875 * open_extended: open a file given a path name; with extended argument list (including extended security (ACL)).
4876 *
4877 * Parameters: p Process requesting the open
4878 * uap User argument descriptor (see below)
4879 * retval Pointer to an area to receive the
 *				return value from the system call
4881 *
4882 * Indirect: uap->path Path to open (same as 'open')
 *		uap->flags			Flags to open (same as 'open')
4884 * uap->uid UID to set, if creating
4885 * uap->gid GID to set, if creating
4886 * uap->mode File mode, if creating (same as 'open')
4887 * uap->xsecurity ACL to set, if creating
4888 *
4889 * Returns: 0 Success
4890 * !0 errno value
4891 *
4892 * Notes: The kauth_filesec_t in 'va', if any, is in host byte order.
4893 *
 * XXX:	We should enumerate the possible errno values here, and where
4895 * in the code they originated.
4896 */
4897 int
open_extended(proc_t p,struct open_extended_args * uap,int32_t * retval)4898 open_extended(proc_t p, struct open_extended_args *uap, int32_t *retval)
4899 {
4900 int ciferror;
4901 kauth_filesec_t xsecdst;
4902 struct vnode_attr va;
4903 struct nameidata nd;
4904 int cmode;
4905
4906 AUDIT_ARG(owner, uap->uid, uap->gid);
4907
4908 xsecdst = NULL;
4909 if ((uap->xsecurity != USER_ADDR_NULL) &&
4910 ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)) {
4911 return ciferror;
4912 }
4913
4914 VATTR_INIT(&va);
4915 cmode = ((uap->mode & ~p->p_fd.fd_cmask) & ALLPERMS) & ~S_ISTXT;
4916 VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
4917 if (uap->uid != KAUTH_UID_NONE) {
4918 VATTR_SET(&va, va_uid, uap->uid);
4919 }
4920 if (uap->gid != KAUTH_GID_NONE) {
4921 VATTR_SET(&va, va_gid, uap->gid);
4922 }
4923 if (xsecdst != NULL) {
4924 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
4925 va.va_vaflags |= VA_FILESEC_ACL;
4926 }
4927
4928 NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
4929 uap->path, vfs_context_current());
4930
4931 ciferror = open1(vfs_context_current(), &nd, uap->flags, &va,
4932 NULL, NULL, retval, AUTH_OPEN_NOAUTHFD);
4933 if (xsecdst != NULL) {
4934 kauth_filesec_free(xsecdst);
4935 }
4936
4937 return ciferror;
4938 }
4939
4940 /*
4941 * Go through the data-protected atomically controlled open (2)
4942 *
4943 * int open_dprotected_np(user_addr_t path, int flags, int class, int dpflags, int mode)
4944 */
4945 static int
openat_dprotected_internal(vfs_context_t ctx,user_addr_t path,int flags,int mode,int class,int dpflags,int fd,int authfd,enum uio_seg segflg,int * retval)4946 openat_dprotected_internal(vfs_context_t ctx, user_addr_t path, int flags, int mode,
4947 int class, int dpflags, int fd, int authfd, enum uio_seg segflg, int *retval)
4948 {
4949 /*
4950 * Follow the same path as normal open(2)
4951 * Look up the item if it exists, and acquire the vnode.
4952 */
4953 struct vnode_attr va;
4954 struct nameidata nd;
4955 int cmode;
4956 int error;
4957 struct filedesc *fdp = &vfs_context_proc(ctx)->p_fd;
4958
4959 VATTR_INIT(&va);
4960 /* Mask off all but regular access permissions */
4961 cmode = ((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
4962 VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
4963
4964 NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, segflg,
4965 path, ctx);
4966
4967 /*
4968 * Initialize the extra fields in vnode_attr to pass down our
4969 * extra fields.
4970 * 1. target cprotect class.
4971 * 2. set a flag to mark it as requiring open-raw-encrypted semantics.
4972 */
4973 if (flags & O_CREAT) {
4974 /* lower level kernel code validates that the class is valid before applying it. */
4975 if (class != PROTECTION_CLASS_DEFAULT) {
4976 /*
4977 * PROTECTION_CLASS_DEFAULT implies that we make the class for this
4978 * file behave the same as open (2)
4979 */
4980 VATTR_SET(&va, va_dataprotect_class, class);
4981 }
4982 }
4983
4984 if (dpflags & (O_DP_GETRAWENCRYPTED | O_DP_GETRAWUNENCRYPTED | O_DP_AUTHENTICATE)) {
4985 if (flags & (O_RDWR | O_WRONLY)) {
4986 /*
4987 * Not allowed to write raw encrypted bytes or when opening authenticated.
4988 */
4989 return EINVAL;
4990 }
4991 if (dpflags & O_DP_GETRAWENCRYPTED) {
4992 VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWENCRYPTED);
4993 }
4994 if (dpflags & O_DP_GETRAWUNENCRYPTED) {
4995 VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWUNENCRYPTED);
4996 }
4997 if (dpflags & O_DP_AUTHENTICATE) {
4998 VATTR_SET(&va, va_dataprotect_flags, VA_DP_AUTHENTICATE);
4999 }
5000 }
5001
5002 error = open1at(vfs_context_current(), &nd, flags, &va,
5003 NULL, NULL, retval, fd, authfd);
5004
5005 return error;
5006 }
5007
5008 int
openat_dprotected_np(__unused proc_t p,struct openat_dprotected_np_args * uap,int32_t * retval)5009 openat_dprotected_np(__unused proc_t p, struct openat_dprotected_np_args *uap, int32_t *retval)
5010 {
5011 if ((uap->dpflags & O_DP_AUTHENTICATE) && (uap->flags & O_CREAT)) {
5012 return EINVAL;
5013 }
5014
5015 return openat_dprotected_internal(vfs_context_current(), uap->path, uap->flags, uap->mode,
5016 uap->class, uap->dpflags, uap->fd, uap->authfd, UIO_USERSPACE, retval);
5017 }
5018
5019 int
open_dprotected_np(__unused proc_t p,struct open_dprotected_np_args * uap,int32_t * retval)5020 open_dprotected_np(__unused proc_t p, struct open_dprotected_np_args *uap, int32_t *retval)
5021 {
5022 if (uap->dpflags & O_DP_AUTHENTICATE) {
5023 return EINVAL;
5024 }
5025
5026 return openat_dprotected_internal(vfs_context_current(), uap->path, uap->flags, uap->mode,
5027 uap->class, uap->dpflags, AT_FDCWD, AUTH_OPEN_NOAUTHFD, UIO_USERSPACE, retval);
5028 }
5029
5030 static int
openat_internal(vfs_context_t ctx,user_addr_t path,int flags,int mode,int fd,enum uio_seg segflg,int * retval)5031 openat_internal(vfs_context_t ctx, user_addr_t path, int flags, int mode,
5032 int fd, enum uio_seg segflg, int *retval)
5033 {
5034 struct filedesc *fdp = &vfs_context_proc(ctx)->p_fd;
5035 struct {
5036 struct vnode_attr va;
5037 struct nameidata nd;
5038 } *__open_data;
5039 struct vnode_attr *vap;
5040 struct nameidata *ndp;
5041 int cmode;
5042 int error;
5043
5044 __open_data = kalloc_type(typeof(*__open_data), Z_WAITOK);
5045 vap = &__open_data->va;
5046 ndp = &__open_data->nd;
5047
5048 VATTR_INIT(vap);
5049 /* Mask off all but regular access permissions */
5050 cmode = ((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
5051 VATTR_SET(vap, va_mode, cmode & ACCESSPERMS);
5052
5053 NDINIT(ndp, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1,
5054 segflg, path, ctx);
5055
5056 error = open1at(ctx, ndp, flags, vap, NULL, NULL, retval, fd, AUTH_OPEN_NOAUTHFD);
5057
5058 kfree_type(typeof(*__open_data), __open_data);
5059
5060 return error;
5061 }
5062
5063 int
open(proc_t p,struct open_args * uap,int32_t * retval)5064 open(proc_t p, struct open_args *uap, int32_t *retval)
5065 {
5066 __pthread_testcancel(1);
5067 return open_nocancel(p, (struct open_nocancel_args *)uap, retval);
5068 }
5069
5070 int
open_nocancel(__unused proc_t p,struct open_nocancel_args * uap,int32_t * retval)5071 open_nocancel(__unused proc_t p, struct open_nocancel_args *uap,
5072 int32_t *retval)
5073 {
5074 return openat_internal(vfs_context_current(), uap->path, uap->flags,
5075 uap->mode, AT_FDCWD, UIO_USERSPACE, retval);
5076 }
5077
5078 int
openat_nocancel(__unused proc_t p,struct openat_nocancel_args * uap,int32_t * retval)5079 openat_nocancel(__unused proc_t p, struct openat_nocancel_args *uap,
5080 int32_t *retval)
5081 {
5082 return openat_internal(vfs_context_current(), uap->path, uap->flags,
5083 uap->mode, uap->fd, UIO_USERSPACE, retval);
5084 }
5085
5086 int
openat(proc_t p,struct openat_args * uap,int32_t * retval)5087 openat(proc_t p, struct openat_args *uap, int32_t *retval)
5088 {
5089 __pthread_testcancel(1);
5090 return openat_nocancel(p, (struct openat_nocancel_args *)uap, retval);
5091 }
5092
5093 #define OPEN_BY_ID_ENTITLEMENT "com.apple.private.vfs.open-by-id"
5094
5095 static boolean_t
vfs_context_can_open_by_id(vfs_context_t ctx)5096 vfs_context_can_open_by_id(vfs_context_t ctx)
5097 {
5098 if (csproc_get_platform_binary(vfs_context_proc(ctx))) {
5099 return TRUE;
5100 }
5101
5102 return IOTaskHasEntitlement(vfs_context_task(ctx),
5103 OPEN_BY_ID_ENTITLEMENT);
5104 }
5105
5106 /*
5107 * openbyid_np: open a file given a file system id and a file system object id
5108 * the hfs file system object id is an fsobj_id_t {uint32, uint32}
5109 * file systems that don't support object ids it is a node id (uint64_t).
5110 *
5111 * Parameters: p Process requesting the open
5112 * uap User argument descriptor (see below)
5113 * retval Pointer to an area to receive the
 *				return value from the system call
5115 *
5116 * Indirect: uap->path Path to open (same as 'open')
5117 *
5118 * uap->fsid id of target file system
5119 * uap->objid id of target file system object
5120 * uap->flags Flags to open (same as 'open')
5121 *
5122 * Returns: 0 Success
5123 * !0 errno value
5124 *
5125 *
 * XXX:	We should enumerate the possible errno values here, and where
5127 * in the code they originated.
5128 */
5129 int
openbyid_np(__unused proc_t p,struct openbyid_np_args * uap,int * retval)5130 openbyid_np(__unused proc_t p, struct openbyid_np_args *uap, int *retval)
5131 {
5132 fsid_t fsid;
5133 uint64_t objid;
5134 int error;
5135 char *buf = NULL;
5136 int buflen = MAXPATHLEN;
5137 int pathlen = 0;
5138 vfs_context_t ctx = vfs_context_current();
5139
5140 if (!vfs_context_can_open_by_id(ctx)) {
5141 return EPERM;
5142 }
5143
5144 if ((error = copyin(uap->fsid, (caddr_t)&fsid, sizeof(fsid)))) {
5145 return error;
5146 }
5147
5148 /*uap->obj is an fsobj_id_t defined as struct {uint32_t, uint32_t} */
5149 if ((error = copyin(uap->objid, (caddr_t)&objid, sizeof(uint64_t)))) {
5150 return error;
5151 }
5152
5153 AUDIT_ARG(value32, fsid.val[0]);
5154 AUDIT_ARG(value64, objid);
5155
5156 /*resolve path from fsis, objid*/
5157 do {
5158 buf = kalloc_data(buflen + 1, Z_WAITOK);
5159 if (buf == NULL) {
5160 return ENOMEM;
5161 }
5162
5163 error = fsgetpath_internal( ctx, fsid.val[0], objid, buflen,
5164 buf, FSOPT_ISREALFSID, &pathlen);
5165
5166 if (error) {
5167 kfree_data(buf, buflen + 1);
5168 buf = NULL;
5169 }
5170 } while (error == ENOSPC && (buflen += MAXPATHLEN));
5171
5172 if (error) {
5173 return error;
5174 }
5175
5176 buf[pathlen] = 0;
5177
5178 error = openat_internal(
5179 ctx, (user_addr_t)buf, uap->oflags, 0, AT_FDCWD, UIO_SYSSPACE, retval);
5180
5181 kfree_data(buf, buflen + 1);
5182
5183 return error;
5184 }
5185
5186
5187 /*
5188 * Create a special file.
5189 */
5190 static int mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap,
5191 int fd);
5192
5193 static int
mknodat_internal(proc_t p,user_addr_t upath,struct vnode_attr * vap,mode_t mode,int fd)5194 mknodat_internal(proc_t p, user_addr_t upath, struct vnode_attr *vap,
5195 mode_t mode, int fd)
5196 {
5197 vfs_context_t ctx = vfs_context_current();
5198 struct nameidata nd;
5199 vnode_t vp, dvp;
5200 int error;
5201
5202 /* If it's a mknod() of a FIFO, call mkfifo1() instead */
5203 if ((mode & S_IFMT) == S_IFIFO) {
5204 return mkfifo1(ctx, upath, vap, fd);
5205 }
5206
5207 AUDIT_ARG(mode, mode);
5208 AUDIT_ARG(value32, vap->va_rdev);
5209
5210 if ((error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
5211 return error;
5212 }
5213 NDINIT(&nd, CREATE, OP_MKNOD, LOCKPARENT | AUDITVNPATH1,
5214 UIO_USERSPACE, upath, ctx);
5215 error = nameiat(&nd, fd);
5216 if (error) {
5217 return error;
5218 }
5219 dvp = nd.ni_dvp;
5220 vp = nd.ni_vp;
5221
5222 if (vp != NULL) {
5223 error = EEXIST;
5224 goto out;
5225 }
5226
5227 switch (mode & S_IFMT) {
5228 case S_IFCHR:
5229 VATTR_SET(vap, va_type, VCHR);
5230 break;
5231 case S_IFBLK:
5232 VATTR_SET(vap, va_type, VBLK);
5233 break;
5234 default:
5235 error = EINVAL;
5236 goto out;
5237 }
5238
5239 #if CONFIG_MACF
5240 error = mac_vnode_check_create(ctx,
5241 nd.ni_dvp, &nd.ni_cnd, vap);
5242 if (error) {
5243 goto out;
5244 }
5245 #endif
5246
5247 if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
5248 goto out;
5249 }
5250
5251 #if CONFIG_FILE_LEASES
5252 vnode_breakdirlease(dvp, false, O_WRONLY);
5253 #endif
5254
5255 if ((error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx)) != 0) {
5256 goto out;
5257 }
5258
5259 if (vp) {
5260 int update_flags = 0;
5261
5262 // Make sure the name & parent pointers are hooked up
5263 if (vp->v_name == NULL) {
5264 update_flags |= VNODE_UPDATE_NAME;
5265 }
5266 if (vp->v_parent == NULLVP) {
5267 update_flags |= VNODE_UPDATE_PARENT;
5268 }
5269
5270 if (update_flags) {
5271 vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
5272 }
5273
5274 #if CONFIG_FSE
5275 add_fsevent(FSE_CREATE_FILE, ctx,
5276 FSE_ARG_VNODE, vp,
5277 FSE_ARG_DONE);
5278 #endif
5279 }
5280
5281 out:
5282 /*
5283 * nameidone has to happen before we vnode_put(dvp)
5284 * since it may need to release the fs_nodelock on the dvp
5285 */
5286 nameidone(&nd);
5287
5288 if (vp) {
5289 vnode_put(vp);
5290 }
5291 vnode_put(dvp);
5292
5293 return error;
5294 }
5295
5296 int
mknod(proc_t p,struct mknod_args * uap,__unused int32_t * retval)5297 mknod(proc_t p, struct mknod_args *uap, __unused int32_t *retval)
5298 {
5299 struct vnode_attr va;
5300
5301 VATTR_INIT(&va);
5302 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5303 VATTR_SET(&va, va_rdev, uap->dev);
5304
5305 return mknodat_internal(p, uap->path, &va, (mode_t)uap->mode, AT_FDCWD);
5306 }
5307
5308 int
mknodat(proc_t p,struct mknodat_args * uap,__unused int32_t * retval)5309 mknodat(proc_t p, struct mknodat_args *uap, __unused int32_t *retval)
5310 {
5311 struct vnode_attr va;
5312
5313 VATTR_INIT(&va);
5314 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5315 VATTR_SET(&va, va_rdev, uap->dev);
5316
5317 return mknodat_internal(p, uap->path, &va, (mode_t)uap->mode, uap->fd);
5318 }
5319
5320 /*
5321 * Create a named pipe.
5322 *
5323 * Returns: 0 Success
5324 * EEXIST
5325 * namei:???
5326 * vnode_authorize:???
5327 * vn_create:???
5328 */
5329 static int
mkfifo1(vfs_context_t ctx,user_addr_t upath,struct vnode_attr * vap,int fd)5330 mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap, int fd)
5331 {
5332 vnode_t vp, dvp;
5333 int error;
5334 struct nameidata nd;
5335
5336 NDINIT(&nd, CREATE, OP_MKFIFO, LOCKPARENT | AUDITVNPATH1,
5337 UIO_USERSPACE, upath, ctx);
5338 error = nameiat(&nd, fd);
5339 if (error) {
5340 return error;
5341 }
5342 dvp = nd.ni_dvp;
5343 vp = nd.ni_vp;
5344
5345 /* check that this is a new file and authorize addition */
5346 if (vp != NULL) {
5347 error = EEXIST;
5348 goto out;
5349 }
5350 VATTR_SET(vap, va_type, VFIFO);
5351
5352 if ((error = vn_authorize_create(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0) {
5353 goto out;
5354 }
5355
5356 error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx);
5357 out:
5358 /*
5359 * nameidone has to happen before we vnode_put(dvp)
5360 * since it may need to release the fs_nodelock on the dvp
5361 */
5362 nameidone(&nd);
5363
5364 if (vp) {
5365 vnode_put(vp);
5366 }
5367 vnode_put(dvp);
5368
5369 return error;
5370 }
5371
5372
5373 /*
5374 * mkfifo_extended: Create a named pipe; with extended argument list (including extended security (ACL)).
5375 *
5376 * Parameters: p Process requesting the open
5377 * uap User argument descriptor (see below)
5378 * retval (Ignored)
5379 *
5380 * Indirect: uap->path Path to fifo (same as 'mkfifo')
5381 * uap->uid UID to set
5382 * uap->gid GID to set
5383 * uap->mode File mode to set (same as 'mkfifo')
5384 * uap->xsecurity ACL to set, if creating
5385 *
5386 * Returns: 0 Success
5387 * !0 errno value
5388 *
5389 * Notes: The kauth_filesec_t in 'va', if any, is in host byte order.
5390 *
5391 * XXX: We should enummerate the possible errno values here, and where
5392 * in the code they originated.
5393 */
5394 int
mkfifo_extended(proc_t p,struct mkfifo_extended_args * uap,__unused int32_t * retval)5395 mkfifo_extended(proc_t p, struct mkfifo_extended_args *uap, __unused int32_t *retval)
5396 {
5397 int ciferror;
5398 kauth_filesec_t xsecdst;
5399 struct vnode_attr va;
5400
5401 AUDIT_ARG(owner, uap->uid, uap->gid);
5402
5403 xsecdst = KAUTH_FILESEC_NONE;
5404 if (uap->xsecurity != USER_ADDR_NULL) {
5405 if ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0) {
5406 return ciferror;
5407 }
5408 }
5409
5410 VATTR_INIT(&va);
5411 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5412 if (uap->uid != KAUTH_UID_NONE) {
5413 VATTR_SET(&va, va_uid, uap->uid);
5414 }
5415 if (uap->gid != KAUTH_GID_NONE) {
5416 VATTR_SET(&va, va_gid, uap->gid);
5417 }
5418 if (xsecdst != KAUTH_FILESEC_NONE) {
5419 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
5420 va.va_vaflags |= VA_FILESEC_ACL;
5421 }
5422
5423 ciferror = mkfifo1(vfs_context_current(), uap->path, &va, AT_FDCWD);
5424
5425 if (xsecdst != KAUTH_FILESEC_NONE) {
5426 kauth_filesec_free(xsecdst);
5427 }
5428 return ciferror;
5429 }
5430
5431 /* ARGSUSED */
5432 int
mkfifo(proc_t p,struct mkfifo_args * uap,__unused int32_t * retval)5433 mkfifo(proc_t p, struct mkfifo_args *uap, __unused int32_t *retval)
5434 {
5435 struct vnode_attr va;
5436
5437 VATTR_INIT(&va);
5438 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5439
5440 return mkfifo1(vfs_context_current(), uap->path, &va, AT_FDCWD);
5441 }
5442
5443 int
mkfifoat(proc_t p,struct mkfifoat_args * uap,__unused int32_t * retval)5444 mkfifoat(proc_t p, struct mkfifoat_args *uap, __unused int32_t *retval)
5445 {
5446 struct vnode_attr va;
5447
5448 VATTR_INIT(&va);
5449 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5450
5451 return mkfifo1(vfs_context_current(), uap->path, &va, uap->fd);
5452 }
5453
5454 extern int safe_getpath_new(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path, int firmlink);
5455 extern int safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path);
5456 extern int safe_getpath_no_firmlink(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path);
5457
/*
 * Build the full path for 'dvp' into 'path' (a buffer of '_len' bytes,
 * expected to be MAXPATHLEN) and, when 'leafname' is non-NULL, append
 * "/<leafname>".  'firmlink' selects whether firmlinks are resolved.
 * '*truncated_path' is set to 1 whenever the result had to be shortened
 * or is only a best-effort ancestor path.  Returns the length of the
 * string produced, including the NUL terminator.
 */
int
safe_getpath_new(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path, int firmlink)
{
	int ret, len = _len;

	*truncated_path = 0;

	/* On success, 'len' is the path length including the NUL terminator. */
	if (firmlink) {
		ret = vn_getpath(dvp, path, &len);
	} else {
		ret = vn_getpath_no_firmlink(dvp, path, &len);
	}
	if (ret == 0 && len < (MAXPATHLEN - 1)) {
		if (leafname) {
			/* Overwrite the NUL with '/' and append the leaf name. */
			path[len - 1] = '/';
			len += strlcpy(&path[len], leafname, MAXPATHLEN - len) + 1;
			if (len > MAXPATHLEN) {
				char *ptr;

				// the string got truncated!
				*truncated_path = 1;
				ptr = strrchr(path, '/');
				if (ptr) {
					*ptr = '\0';  // chop off the string at the last directory component
				}
				len = (int)strlen(path) + 1;
			}
		}
	} else if (ret == 0) {
		/* Path resolved but left no room to append anything. */
		*truncated_path = 1;
	} else if (ret != 0) {
		struct vnode *mydvp = dvp;

		if (ret != ENOSPC) {
			printf("safe_getpath: failed to get the path for vp %p (%s) : err %d\n",
			    dvp, dvp->v_name ? dvp->v_name : "no-name", ret);
		}
		*truncated_path = 1;

		/*
		 * Fallback: walk up the parent chain until some ancestor's
		 * path fits in the buffer, punting to the mount point (or
		 * "/") when we run out of parents.
		 */
		do {
			if (mydvp->v_parent != NULL) {
				mydvp = mydvp->v_parent;
			} else if (mydvp->v_mount) {
				strlcpy(path, mydvp->v_mount->mnt_vfsstat.f_mntonname, _len);
				break;
			} else {
				// no parent and no mount point?  only thing is to punt and say "/" changed
				strlcpy(path, "/", _len);
				len = 2;
				mydvp = NULL;
			}

			if (mydvp == NULL) {
				break;
			}

			/* Reset the buffer size before retrying on the parent. */
			len = _len;
			if (firmlink) {
				ret = vn_getpath(mydvp, path, &len);
			} else {
				ret = vn_getpath_no_firmlink(mydvp, path, &len);
			}
		} while (ret == ENOSPC);
	}

	return len;
}
5525
int
safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path)
{
	/* Firmlink-resolving flavor: delegate with firmlink enabled. */
	const int resolve_firmlinks = 1;

	return safe_getpath_new(dvp, leafname, path, _len, truncated_path,
	           resolve_firmlinks);
}
5531
int
safe_getpath_no_firmlink(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path)
{
	/* Non-firmlink flavor: delegate with firmlink disabled. */
	const int resolve_firmlinks = 0;

	return safe_getpath_new(dvp, leafname, path, _len, truncated_path,
	           resolve_firmlinks);
}
5537
5538 /*
5539 * Make a hard file link.
5540 *
5541 * Returns: 0 Success
5542 * EPERM
5543 * EEXIST
5544 * EXDEV
5545 * namei:???
5546 * vnode_authorize:???
5547 * VNOP_LINK:???
5548 */
5549 /* ARGSUSED */
static int
linkat_internal(vfs_context_t ctx, int fd1, user_addr_t path, int fd2,
    user_addr_t link, int flag, enum uio_seg segflg)
{
	vnode_t vp, pvp, dvp, lvp;
	struct nameidata nd;
	int follow;
	int error;
#if CONFIG_FSE
	fse_info finfo;
#endif
	int need_event, has_listeners, need_kpath2;
	char *target_path = NULL;
	char *no_firmlink_path = NULL;
	int truncated = 0;
	int truncated_no_firmlink_path = 0;

	vp = dvp = lvp = NULLVP;

	/* look up the object we are linking to */
	follow = (flag & AT_SYMLINK_FOLLOW) ? FOLLOW : NOFOLLOW;
	NDINIT(&nd, LOOKUP, OP_LOOKUP, AUDITVNPATH1 | follow,
	    segflg, path, ctx);

	error = nameiat(&nd, fd1);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;  /* holds an iocount until the final vnode_put() */

	nameidone(&nd);

	/*
	 * Normally, linking to directories is not supported.
	 * However, some file systems may have limited support.
	 */
	if (vp->v_type == VDIR) {
		if (!ISSET(vp->v_mount->mnt_kern_flag, MNTK_DIR_HARDLINKS)) {
			error = EPERM;   /* POSIX */
			goto out;
		}

		/* Linking to a directory requires ownership. */
		if (!kauth_cred_issuser(vfs_context_ucred(ctx))) {
			struct vnode_attr dva;

			VATTR_INIT(&dva);
			VATTR_WANTED(&dva, va_uid);
			if (vnode_getattr(vp, &dva, ctx) != 0 ||
			    !VATTR_IS_SUPPORTED(&dva, va_uid) ||
			    (dva.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)))) {
				error = EACCES;
				goto out;
			}
		}
	}

	/*
	 * lookup the target node; 'nd' is re-armed in place for a CREATE
	 * lookup of the new link name, resolved relative to fd2.
	 */
#if CONFIG_TRIGGERS
	nd.ni_op = OP_LINK;
#endif
	nd.ni_cnd.cn_nameiop = CREATE;
	nd.ni_cnd.cn_flags = LOCKPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK;
	nd.ni_dirp = link;
	error = nameiat(&nd, fd2);
	if (error != 0) {
		goto out;
	}
	dvp = nd.ni_dvp;  /* parent directory of the new link */
	lvp = nd.ni_vp;   /* existing vnode at the link name, if any */

#if CONFIG_MACF
	if ((error = mac_vnode_check_link(ctx, dvp, vp, &nd.ni_cnd)) != 0) {
		goto out2;
	}
#endif

	/* or to anything that kauth doesn't want us to (eg. immutable items) */
	if ((error = vnode_authorize(vp, NULL, KAUTH_VNODE_LINKTARGET, ctx)) != 0) {
		goto out2;
	}

	/* target node must not exist */
	if (lvp != NULLVP) {
		error = EEXIST;
		goto out2;
	}
	/* cannot link across mountpoints */
	if (vnode_mount(vp) != vnode_mount(dvp)) {
		error = EXDEV;
		goto out2;
	}

	/* authorize creation of the target node */
	if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
		goto out2;
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

	/* and finally make the link */
	error = VNOP_LINK(vp, dvp, &nd.ni_cnd, ctx);
	if (error) {
		goto out2;
	}

#if CONFIG_MACF
	(void)mac_vnode_notify_link(ctx, vp, dvp, &nd.ni_cnd);
#endif

#if CONFIG_FSE
	need_event = need_fsevent(FSE_CREATE_FILE, dvp);
#else
	need_event = 0;
#endif
	has_listeners = kauth_authorize_fileop_has_listeners();

	need_kpath2 = 0;
#if CONFIG_AUDIT
	if (AUDIT_RECORD_EXISTS()) {
		need_kpath2 = 1;
	}
#endif

	/* Path strings are only materialized if someone will consume them. */
	if (need_event || has_listeners || need_kpath2) {
		char *link_to_path = NULL;
		int len, link_name_len;
		int len_no_firmlink_path = 0;

		/* build the path to the new link file */
		GET_PATH(target_path);

		len = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, target_path, MAXPATHLEN, &truncated);
		if (no_firmlink_path == NULL) {
			GET_PATH(no_firmlink_path);
		}
		len_no_firmlink_path = safe_getpath_no_firmlink(dvp, nd.ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);

		AUDIT_ARG(kpath, target_path, ARG_KPATH2);

		if (has_listeners) {
			/* build the path to file we are linking to */
			GET_PATH(link_to_path);

			link_name_len = MAXPATHLEN;
			if (vn_getpath(vp, link_to_path, &link_name_len) == 0) {
				/*
				 * Call out to allow 3rd party notification of rename.
				 * Ignore result of kauth_authorize_fileop call.
				 */
				kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_LINK,
				    (uintptr_t)link_to_path,
				    (uintptr_t)target_path);
			}
			if (link_to_path != NULL) {
				RELEASE_PATH(link_to_path);
			}
		}
#if CONFIG_FSE
		if (need_event) {
			/* construct fsevent */
			if (get_fse_info(vp, &finfo, ctx) == 0) {
				if (truncated_no_firmlink_path) {
					finfo.mode |= FSE_TRUNCATED_PATH;
				}

				// build the path to the destination of the link
				add_fsevent(FSE_CREATE_FILE, ctx,
				    FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
				    FSE_ARG_FINFO, &finfo,
				    FSE_ARG_DONE);
			}

			pvp = vp->v_parent;
			// need an iocount on parent vnode in this case
			// (when pvp == dvp we rely on dvp's existing iocount instead)
			if (pvp && pvp != dvp) {
				pvp = vnode_getparent_if_different(vp, dvp);
			}
			if (pvp) {
				add_fsevent(FSE_STAT_CHANGED, ctx,
				    FSE_ARG_VNODE, pvp, FSE_ARG_DONE);
			}
			if (pvp && pvp != dvp) {
				vnode_put(pvp);
			}
		}
#endif
	}
out2:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);
	if (target_path != NULL) {
		RELEASE_PATH(target_path);
	}
	if (no_firmlink_path != NULL) {
		RELEASE_PATH(no_firmlink_path);
		no_firmlink_path = NULL;
	}
out:
	if (lvp) {
		vnode_put(lvp);
	}
	if (dvp) {
		vnode_put(dvp);
	}
	vnode_put(vp);
	return error;
}
5763
5764 int
link(__unused proc_t p,struct link_args * uap,__unused int32_t * retval)5765 link(__unused proc_t p, struct link_args *uap, __unused int32_t *retval)
5766 {
5767 return linkat_internal(vfs_context_current(), AT_FDCWD, uap->path,
5768 AT_FDCWD, uap->link, AT_SYMLINK_FOLLOW, UIO_USERSPACE);
5769 }
5770
5771 int
linkat(__unused proc_t p,struct linkat_args * uap,__unused int32_t * retval)5772 linkat(__unused proc_t p, struct linkat_args *uap, __unused int32_t *retval)
5773 {
5774 if (uap->flag & ~AT_SYMLINK_FOLLOW) {
5775 return EINVAL;
5776 }
5777
5778 return linkat_internal(vfs_context_current(), uap->fd1, uap->path,
5779 uap->fd2, uap->link, uap->flag, UIO_USERSPACE);
5780 }
5781
5782 /*
5783 * Make a symbolic link.
5784 *
5785 * We could add support for ACLs here too...
5786 */
5787 /* ARGSUSED */
/*
 * Create a symbolic link whose contents are 'path_data', at the name
 * 'link' resolved relative to 'fd'.  'segflg' says whether the addresses
 * are in user or kernel space.
 */
static int
symlinkat_internal(vfs_context_t ctx, user_addr_t path_data, int fd,
    user_addr_t link, enum uio_seg segflg)
{
	struct vnode_attr va;
	char *path;
	int error;
	struct nameidata nd;
	vnode_t vp, dvp;
	size_t dummy = 0;
	proc_t p;

	error = 0;
	/* Copy the link contents into a kernel buffer if they are in userspace. */
	if (UIO_SEG_IS_USER_SPACE(segflg)) {
		path = zalloc(ZV_NAMEI);
		error = copyinstr(path_data, path, MAXPATHLEN, &dummy);
	} else {
		path = (char *)path_data;
	}
	if (error) {
		goto out;
	}
	AUDIT_ARG(text, path);  /* This is the link string */

	NDINIT(&nd, CREATE, OP_SYMLINK, LOCKPARENT | AUDITVNPATH1,
	    segflg, link, ctx);

	error = nameiat(&nd, fd);
	if (error) {
		goto out;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* New symlinks get mode 0777 masked by the process umask. */
	p = vfs_context_proc(ctx);
	VATTR_INIT(&va);
	VATTR_SET(&va, va_type, VLNK);
	VATTR_SET(&va, va_mode, ACCESSPERMS & ~p->p_fd.fd_cmask);

#if CONFIG_MACF
	error = mac_vnode_check_create(ctx,
	    dvp, &nd.ni_cnd, &va);
#endif
	if (error != 0) {
		goto skipit;
	}

	/* The name must not already exist. */
	if (vp != NULL) {
		error = EEXIST;
		goto skipit;
	}

	/* authorize */
	if (error == 0) {
		error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx);
	}
	/* get default ownership, etc. */
	if (error == 0) {
		error = vnode_authattr_new(dvp, &va, 0, ctx);
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

	if (error == 0) {
		/* The filesystem may or may not return the new vnode in 'vp'. */
		error = VNOP_SYMLINK(dvp, &vp, &nd.ni_cnd, &va, path, ctx);
	}

	/* do fallback attribute handling */
	if (error == 0 && vp) {
		error = vnode_setattr_fallback(vp, &va, ctx);
	}

#if CONFIG_MACF
	if (error == 0 && vp) {
		error = vnode_label(vnode_mount(vp), dvp, vp, &nd.ni_cnd, VNODE_LABEL_CREATE, ctx);
	}
#endif

	if (error == 0) {
		int update_flags = 0;

		/* check if a new vnode was created, else try to get one */
		if (vp == NULL) {
			nd.ni_cnd.cn_nameiop = LOOKUP;
#if CONFIG_TRIGGERS
			nd.ni_op = OP_LOOKUP;
#endif
			/*
			 * Clear all flags except HASBUF to prevent 'cn_pnbuf' buffer to be
			 * reallocated again in namei().
			 */
			nd.ni_cnd.cn_flags &= HASBUF;
			error = nameiat(&nd, fd);
			if (error) {
				goto skipit;
			}
			vp = nd.ni_vp;
		}

#if 0  /* XXX - kauth_todo - is KAUTH_FILEOP_SYMLINK needed? */
		/* call out to allow 3rd party notification of rename.
		 * Ignore result of kauth_authorize_fileop call.
		 */
		if (kauth_authorize_fileop_has_listeners() &&
		    namei(&nd) == 0) {
			char *new_link_path = NULL;
			int len;

			/* build the path to the new link file */
			new_link_path = get_pathbuff();
			len = MAXPATHLEN;
			vn_getpath(dvp, new_link_path, &len);
			if ((len + 1 + nd.ni_cnd.cn_namelen + 1) < MAXPATHLEN) {
				new_link_path[len - 1] = '/';
				strlcpy(&new_link_path[len], nd.ni_cnd.cn_nameptr, MAXPATHLEN - len);
			}

			kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_SYMLINK,
			    (uintptr_t)path, (uintptr_t)new_link_path);
			if (new_link_path != NULL) {
				release_pathbuff(new_link_path);
			}
		}
#endif
		// Make sure the name & parent pointers are hooked up
		if (vp->v_name == NULL) {
			update_flags |= VNODE_UPDATE_NAME;
		}
		if (vp->v_parent == NULLVP) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		if (update_flags) {
			vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
		}

#if CONFIG_FSE
		add_fsevent(FSE_CREATE_FILE, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
#endif
	}

skipit:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	vnode_put(dvp);
out:
	/* Only free the buffer we allocated; kernel-space callers own theirs. */
	if (path && (path != (char *)path_data)) {
		zfree(ZV_NAMEI, path);
	}

	return error;
}
5951
5952 int
symlink(__unused proc_t p,struct symlink_args * uap,__unused int32_t * retval)5953 symlink(__unused proc_t p, struct symlink_args *uap, __unused int32_t *retval)
5954 {
5955 return symlinkat_internal(vfs_context_current(), uap->path, AT_FDCWD,
5956 uap->link, UIO_USERSPACE);
5957 }
5958
5959 int
symlinkat(__unused proc_t p,struct symlinkat_args * uap,__unused int32_t * retval)5960 symlinkat(__unused proc_t p, struct symlinkat_args *uap,
5961 __unused int32_t *retval)
5962 {
5963 return symlinkat_internal(vfs_context_current(), uap->path1, uap->fd,
5964 uap->path2, UIO_USERSPACE);
5965 }
5966
5967 /*
5968 * Delete a whiteout from the filesystem.
5969 * No longer supported.
5970 */
int
undelete(__unused proc_t p, __unused struct undelete_args *uap, __unused int32_t *retval)
{
	/* Whiteout deletion is no longer supported; always refuse. */
	return ENOTSUP;
}
5976
5977 /*
5978 * Delete a name from the filesystem.
5979 */
5980 /* ARGSUSED */
/*
 * Remove 'path_arg' (segment indicated by 'segflg'), resolved relative to
 * 'start_dvp' when non-NULL, otherwise relative to 'fd'.  'unlink_flags'
 * carries VNODE_REMOVE_* modifiers.  Supports filesystems with compound
 * (lookup+remove) VNOPs and redrives the lookup on ENOENT races.
 */
static int
unlinkat_internal(vfs_context_t ctx, int fd, vnode_t start_dvp,
    user_addr_t path_arg, enum uio_seg segflg, int unlink_flags)
{
	/*
	 * The nameidata (large) plus fsevent scratch state is heap-allocated
	 * in one bundle rather than placed on the kernel stack.
	 */
	struct {
		struct nameidata nd;
#if CONFIG_FSE
		struct vnode_attr va;
		fse_info finfo;
#endif
	} *__unlink_data;
	struct nameidata *ndp;
	vnode_t vp, dvp;
	int error;
	struct componentname *cnp;
	char *path = NULL;
	char *no_firmlink_path = NULL;
	int len_path = 0;
	int len_no_firmlink_path = 0;
	int flags;
	int need_event;
	int has_listeners;
	int truncated_path;
	int truncated_no_firmlink_path;
	int batched;
	struct vnode_attr *vap;
	int do_retry;
	int retry_count = 0;
	int cn_flags;

	cn_flags = LOCKPARENT;
	if (!(unlink_flags & VNODE_REMOVE_NO_AUDIT_PATH)) {
		cn_flags |= AUDITVNPATH1;
	}
	/* If a starting dvp is passed, it trumps any fd passed. */
	if (start_dvp) {
		cn_flags |= USEDVP;
	}

#if NAMEDRSRCFORK
	/* unlink or delete is allowed on rsrc forks and named streams */
	cn_flags |= CN_ALLOWRSRCFORK;
#endif

	__unlink_data = kalloc_type(typeof(*__unlink_data), Z_WAITOK);
	ndp = &__unlink_data->nd;
#if CONFIG_FSE
	fse_info *finfop = &__unlink_data->finfo;
#endif

retry:
	/* Per-attempt state; reset each time the lookup is redriven. */
	do_retry = 0;
	flags = 0;
	need_event = 0;
	has_listeners = 0;
	truncated_path = 0;
	truncated_no_firmlink_path = 0;
	vap = NULL;

	NDINIT(ndp, DELETE, OP_UNLINK, cn_flags, segflg, path_arg, ctx);

	ndp->ni_dvp = start_dvp;
	ndp->ni_flag |= NAMEI_COMPOUNDREMOVE;
	cnp = &ndp->ni_cnd;

continue_lookup:
	error = nameiat(ndp, fd);
	if (error) {
		goto early_out;
	}

	dvp = ndp->ni_dvp;
	vp = ndp->ni_vp;  /* may be NULLVP when the remove is batched */

	/* With Carbon delete semantics, busy files cannot be deleted */
	if (unlink_flags & VNODE_REMOVE_NODELETEBUSY) {
		flags |= VNODE_REMOVE_NODELETEBUSY;
	}

	/* Skip any potential upcalls if told to. */
	if (unlink_flags & VNODE_REMOVE_SKIP_NAMESPACE_EVENT) {
		flags |= VNODE_REMOVE_SKIP_NAMESPACE_EVENT;
	}

	if (vp) {
		batched = vnode_compound_remove_available(vp);
		/*
		 * The root of a mounted filesystem cannot be deleted.
		 */
		if ((vp->v_flag & VROOT) || (dvp->v_mount != vp->v_mount)) {
			error = EBUSY;
			goto out;
		}

#if DEVELOPMENT || DEBUG
		/*
		 * XXX VSWAP: Check for entitlements or special flag here
		 * so we can restrict access appropriately.
		 */
#else /* DEVELOPMENT || DEBUG */

		if (vnode_isswap(vp) && (ctx != vfs_context_kernel())) {
			error = EPERM;
			goto out;
		}
#endif /* DEVELOPMENT || DEBUG */

		if (!batched) {
			error = vn_authorize_unlink(dvp, vp, cnp, ctx, NULL);
			if (error) {
				/* ENOENT here can be a name-cache race; redrive the lookup. */
				if (error == ENOENT) {
					if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
						do_retry = 1;
						retry_count++;
					}
				}
				goto out;
			}
		}
	} else {
		batched = 1;

		if (!vnode_compound_remove_available(dvp)) {
			panic("No vp, but no compound remove?");
		}
	}

#if CONFIG_FSE
	need_event = need_fsevent(FSE_DELETE, dvp);
	if (need_event) {
		if (!batched) {
			if ((vp->v_flag & VISHARDLINK) == 0) {
				/* XXX need to get these data in batched VNOP */
				get_fse_info(vp, finfop, ctx);
			}
		} else {
			/* Batched case: ask vn_remove to gather the attributes. */
			error =
			    vfs_get_notify_attributes(&__unlink_data->va);
			if (error) {
				goto out;
			}

			vap = &__unlink_data->va;
		}
	}
#endif
	has_listeners = kauth_authorize_fileop_has_listeners();
	if (need_event || has_listeners) {
		if (path == NULL) {
			GET_PATH(path);
		}
		len_path = safe_getpath(dvp, ndp->ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path);
		if (no_firmlink_path == NULL) {
			GET_PATH(no_firmlink_path);
		}
		len_no_firmlink_path = safe_getpath_no_firmlink(dvp, ndp->ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);
	}

#if NAMEDRSRCFORK
	if (ndp->ni_cnd.cn_flags & CN_WANTSRSRCFORK) {
		error = vnode_removenamedstream(dvp, vp, XATTR_RESOURCEFORK_NAME, 0, ctx);
	} else
#endif
	{
#if CONFIG_FILE_LEASES
		vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

		error = vn_remove(dvp, &ndp->ni_vp, ndp, flags, vap, ctx);
		vp = ndp->ni_vp;  /* compound VNOPs may have produced the vnode */
		if (error == EKEEPLOOKING) {
			/* Filesystem asked us to continue the compound lookup. */
			if (!batched) {
				panic("EKEEPLOOKING, but not a filesystem that supports compound VNOPs?");
			}

			if ((ndp->ni_flag & NAMEI_CONTLOOKUP) == 0) {
				panic("EKEEPLOOKING, but continue flag not set?");
			}

			if (vnode_isdir(vp)) {
				error = EISDIR;
				goto out;
			}
			goto continue_lookup;
		} else if (error == ENOENT && batched) {
			if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
				/*
				 * For compound VNOPs, the authorization callback may
				 * return ENOENT in case of racing hardlink lookups
				 * hitting the name cache, redrive the lookup.
				 */
				do_retry = 1;
				retry_count += 1;
				goto out;
			}
		}
	}

	/*
	 * Call out to allow 3rd party notification of delete.
	 * Ignore result of kauth_authorize_fileop call.
	 */
	if (!error) {
		if (has_listeners) {
			kauth_authorize_fileop(vfs_context_ucred(ctx),
			    KAUTH_FILEOP_DELETE,
			    (uintptr_t)vp,
			    (uintptr_t)path);
		}

		if (vp->v_flag & VISHARDLINK) {
			//
			// if a hardlink gets deleted we want to blow away the
			// v_parent link because the path that got us to this
			// instance of the link is no longer valid.  this will
			// force the next call to get the path to ask the file
			// system instead of just following the v_parent link.
			//
			vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
		}

#if CONFIG_FSE
		if (need_event) {
			if (vp->v_flag & VISHARDLINK) {
				get_fse_info(vp, finfop, ctx);
			} else if (vap) {
				vnode_get_fse_info_from_vap(vp, finfop, vap);
			}
			if (truncated_path) {
				finfop->mode |= FSE_TRUNCATED_PATH;
			}
			add_fsevent(FSE_DELETE, ctx,
			    FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
			    FSE_ARG_FINFO, finfop,
			    FSE_ARG_DONE);
		}
#endif

#if CONFIG_MACF
		mac_vnode_notify_unlink(ctx, dvp, vp, cnp);
#endif
	}

out:
	if (path != NULL) {
		RELEASE_PATH(path);
		path = NULL;
	}

	if (no_firmlink_path != NULL) {
		RELEASE_PATH(no_firmlink_path);
		no_firmlink_path = NULL;
	}
#if NAMEDRSRCFORK
	/* recycle the deleted rsrc fork vnode to force a reclaim, which
	 * will cause its shadow file to go away if necessary.
	 */
	if (vp && (vnode_isnamedstream(vp)) &&
	    (vp->v_parent != NULLVP) &&
	    vnode_isshadow(vp)) {
		vnode_recycle(vp);
	}
#endif
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(ndp);
	vnode_put(dvp);
	if (vp) {
		vnode_put(vp);
	}

	if (do_retry) {
		goto retry;
	}

early_out:
	kfree_type(typeof(*__unlink_data), __unlink_data);
	return error;
}
6262
/*
 * Kernel-internal unlink entry point: removes 'path_arg' relative to
 * 'start_dvp' (or the current working directory when start_dvp is NULLVP).
 * 'segflg' identifies the address space of 'path_arg'; 'unlink_flags'
 * are VNODE_REMOVE_* modifiers passed through unchanged.
 */
int
unlink1(vfs_context_t ctx, vnode_t start_dvp, user_addr_t path_arg,
    enum uio_seg segflg, int unlink_flags)
{
	return unlinkat_internal(ctx, AT_FDCWD, start_dvp, path_arg, segflg,
	    unlink_flags);
}
6270
6271 /*
6272 * Delete a name from the filesystem using Carbon semantics.
6273 */
6274 int
delete(__unused proc_t p,struct delete_args * uap,__unused int32_t * retval)6275 delete(__unused proc_t p, struct delete_args *uap, __unused int32_t *retval)
6276 {
6277 return unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
6278 uap->path, UIO_USERSPACE, VNODE_REMOVE_NODELETEBUSY);
6279 }
6280
6281 /*
6282 * Delete a name from the filesystem using POSIX semantics.
6283 */
6284 int
unlink(__unused proc_t p,struct unlink_args * uap,__unused int32_t * retval)6285 unlink(__unused proc_t p, struct unlink_args *uap, __unused int32_t *retval)
6286 {
6287 return unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
6288 uap->path, UIO_USERSPACE, 0);
6289 }
6290
6291 int
unlinkat(__unused proc_t p,struct unlinkat_args * uap,__unused int32_t * retval)6292 unlinkat(__unused proc_t p, struct unlinkat_args *uap, __unused int32_t *retval)
6293 {
6294 if (uap->flag & ~(AT_REMOVEDIR | AT_REMOVEDIR_DATALESS)) {
6295 return EINVAL;
6296 }
6297
6298 if (uap->flag & (AT_REMOVEDIR | AT_REMOVEDIR_DATALESS)) {
6299 int unlink_flags = 0;
6300
6301 if (uap->flag & AT_REMOVEDIR_DATALESS) {
6302 unlink_flags |= VNODE_REMOVE_DATALESS_DIR;
6303 }
6304 return rmdirat_internal(vfs_context_current(), uap->fd,
6305 uap->path, UIO_USERSPACE, unlink_flags);
6306 } else {
6307 return unlinkat_internal(vfs_context_current(), uap->fd,
6308 NULLVP, uap->path, UIO_USERSPACE, 0);
6309 }
6310 }
6311
6312 /*
6313 * Reposition read/write file offset.
6314 */
int
lseek(proc_t p, struct lseek_args *uap, off_t *retval)
{
	struct fileproc *fp;
	vnode_t vp;
	struct vfs_context *ctx;
	off_t offset = uap->offset, file_size;
	int error;

	/* Resolve the fd to a vnode-backed fileproc; non-vnode fds seek like pipes. */
	if ((error = fp_getfvp(p, uap->fd, &fp, &vp))) {
		if (error == ENOTSUP) {
			return ESPIPE;
		}
		return error;
	}
	if (vnode_isfifo(vp)) {
		file_drop(uap->fd);
		return ESPIPE;
	}


	ctx = vfs_context_current();
#if CONFIG_MACF
	/* lseek(fd, 0, SEEK_CUR) only reads the offset; check accordingly. */
	if (uap->whence == L_INCR && uap->offset == 0) {
		error = mac_file_check_get_offset(vfs_context_ucred(ctx),
		    fp->fp_glob);
	} else {
		error = mac_file_check_change_offset(vfs_context_ucred(ctx),
		    fp->fp_glob);
	}
	if (error) {
		file_drop(uap->fd);
		return error;
	}
#endif
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	/* Compute the candidate absolute offset according to 'whence'. */
	switch (uap->whence) {
	case L_INCR:
		offset += fp->fp_glob->fg_offset;
		break;
	case L_XTND:
		if ((error = vnode_size(vp, &file_size, ctx)) != 0) {
			break;
		}
		offset += file_size;
		break;
	case L_SET:
		break;
	case SEEK_HOLE:
		/* Filesystem locates the next hole/data run via an ioctl. */
		error = VNOP_IOCTL(vp, FSIOC_FIOSEEKHOLE, (caddr_t)&offset, 0, ctx);
		break;
	case SEEK_DATA:
		error = VNOP_IOCTL(vp, FSIOC_FIOSEEKDATA, (caddr_t)&offset, 0, ctx);
		break;
	default:
		error = EINVAL;
	}
	if (error == 0) {
		if (uap->offset > 0 && offset < 0) {
			/* Incremented/relative move past max size */
			error = EOVERFLOW;
		} else {
			/*
			 * Allow negative offsets on character devices, per
			 * POSIX 1003.1-2001.  Most likely for writing disk
			 * labels.
			 */
			if (offset < 0 && vp->v_type != VCHR) {
				/* Decremented/relative move before start */
				error = EINVAL;
			} else {
				/* Success */
				fp->fp_glob->fg_offset = offset;
				*retval = fp->fp_glob->fg_offset;
			}
		}
	}

	/*
	 * An lseek can affect whether data is "available to read."  Use
	 * hint of NOTE_NONE so no EVFILT_VNODE events fire
	 */
	post_event_if_success(vp, error, NOTE_NONE);
	(void)vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
6406
6407
6408 /*
6409 * Check access permissions.
6410 *
6411 * Returns: 0 Success
6412 * vnode_authorize:???
6413 */
6414 static int
access1(vnode_t vp,vnode_t dvp,int uflags,vfs_context_t ctx)6415 access1(vnode_t vp, vnode_t dvp, int uflags, vfs_context_t ctx)
6416 {
6417 kauth_action_t action;
6418 int error;
6419
6420 /*
6421 * If just the regular access bits, convert them to something
6422 * that vnode_authorize will understand.
6423 */
6424 if (!(uflags & _ACCESS_EXTENDED_MASK)) {
6425 action = 0;
6426 if (uflags & R_OK) {
6427 action |= KAUTH_VNODE_READ_DATA; /* aka KAUTH_VNODE_LIST_DIRECTORY */
6428 }
6429 if (uflags & W_OK) {
6430 if (vnode_isdir(vp)) {
6431 action |= KAUTH_VNODE_ADD_FILE |
6432 KAUTH_VNODE_ADD_SUBDIRECTORY;
6433 /* might want delete rights here too */
6434 } else {
6435 action |= KAUTH_VNODE_WRITE_DATA;
6436 }
6437 }
6438 if (uflags & X_OK) {
6439 if (vnode_isdir(vp)) {
6440 action |= KAUTH_VNODE_SEARCH;
6441 } else {
6442 action |= KAUTH_VNODE_EXECUTE;
6443 }
6444 }
6445 } else {
6446 /* take advantage of definition of uflags */
6447 action = uflags >> 8;
6448 }
6449
6450 #if CONFIG_MACF
6451 error = mac_vnode_check_access(ctx, vp, uflags);
6452 if (error) {
6453 return error;
6454 }
6455 #endif /* MAC */
6456
6457 /* action == 0 means only check for existence */
6458 if (action != 0) {
6459 error = vnode_authorize(vp, dvp, action | KAUTH_VNODE_ACCESS, ctx);
6460 } else {
6461 error = 0;
6462 }
6463
6464 return error;
6465 }
6466
6467
6468
6469 /*
6470 * access_extended: Check access permissions in bulk.
6471 *
6472 * Description: uap->entries Pointer to an array of accessx
6473 * descriptor structs, plus one or
6474 * more NULL terminated strings (see
6475 * "Notes" section below).
6476 * uap->size Size of the area pointed to by
6477 * uap->entries.
6478 * uap->results Pointer to the results array.
6479 *
6480 * Returns: 0 Success
6481 * ENOMEM Insufficient memory
6482 * EINVAL Invalid arguments
6483 * namei:EFAULT Bad address
6484 * namei:ENAMETOOLONG Filename too long
6485 * namei:ENOENT No such file or directory
6486 * namei:ELOOP Too many levels of symbolic links
6487 * namei:EBADF Bad file descriptor
6488 * namei:ENOTDIR Not a directory
6489 * namei:???
6490 * access1:
6491 *
6492 * Implicit returns:
6493 * uap->results Array contents modified
6494 *
6495 * Notes: The uap->entries are structured as an arbitrary length array
6496 * of accessx descriptors, followed by one or more NULL terminated
6497 * strings
6498 *
6499 * struct accessx_descriptor[0]
6500 * ...
6501 * struct accessx_descriptor[n]
6502 * char name_data[0];
6503 *
6504 * We determine the entry count by walking the buffer containing
6505 * the uap->entries argument descriptor. For each descriptor we
6506 * see, the valid values for the offset ad_name_offset will be
6507 * in the byte range:
6508 *
6509 * [ uap->entries + sizeof(struct accessx_descriptor) ]
6510 * to
6511 * [ uap->entries + uap->size - 2 ]
6512 *
6513 * since we must have at least one string, and the string must
6514 * be at least one character plus the NULL terminator in length.
6515 *
6516 * XXX: Need to support the check-as uid argument
6517 */
6518 int
access_extended(__unused proc_t p,struct access_extended_args * uap,__unused int32_t * retval)6519 access_extended(__unused proc_t p, struct access_extended_args *uap, __unused int32_t *retval)
6520 {
6521 struct accessx_descriptor *input = NULL;
6522 errno_t *result = NULL;
6523 errno_t error = 0;
6524 int wantdelete = 0;
6525 size_t desc_max, desc_actual = 0;
6526 unsigned int i, j;
6527 struct vfs_context context;
6528 struct nameidata nd;
6529 int niopts;
6530 vnode_t vp = NULL;
6531 vnode_t dvp = NULL;
6532 #define ACCESSX_MAX_DESCR_ON_STACK 10
6533 struct accessx_descriptor stack_input[ACCESSX_MAX_DESCR_ON_STACK];
6534
6535 context.vc_ucred = NULL;
6536
6537 /*
6538 * Validate parameters; if valid, copy the descriptor array and string
6539 * arguments into local memory. Before proceeding, the following
6540 * conditions must have been met:
6541 *
6542 * o The total size is not permitted to exceed ACCESSX_MAX_TABLESIZE
6543 * o There must be sufficient room in the request for at least one
6544 * descriptor and a one-byte NUL terminated string.
6545 * o The allocation of local storage must not fail.
6546 */
6547 if (uap->size > ACCESSX_MAX_TABLESIZE) {
6548 return ENOMEM;
6549 }
6550 if (uap->size < (sizeof(struct accessx_descriptor) + 2)) {
6551 return EINVAL;
6552 }
6553 if (uap->size <= sizeof(stack_input)) {
6554 input = stack_input;
6555 } else {
6556 input = kalloc_data(uap->size, Z_WAITOK);
6557 if (input == NULL) {
6558 error = ENOMEM;
6559 goto out;
6560 }
6561 }
6562 error = copyin(uap->entries, input, uap->size);
6563 if (error) {
6564 goto out;
6565 }
6566
6567 AUDIT_ARG(opaque, input, uap->size);
6568
6569 /*
6570 * Force NUL termination of the copyin buffer to avoid namei() running
6571 * off the end. If the caller passes us bogus data, they may get a
6572 * bogus result.
6573 */
6574 ((char *)input)[uap->size - 1] = 0;
6575
6576 /*
6577 * Access is defined as checking against the process' real identity,
6578 * even if operations are checking the effective identity. This
6579 * requires that we use a local vfs context.
6580 */
6581 context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
6582 context.vc_thread = current_thread();
6583
6584 /*
6585 * Find out how many entries we have, so we can allocate the result
6586 * array by walking the list and adjusting the count downward by the
6587 * earliest string offset we see.
6588 */
6589 desc_max = (uap->size - 2) / sizeof(struct accessx_descriptor);
6590 desc_actual = desc_max;
6591 for (i = 0; i < desc_actual; i++) {
6592 /*
6593 * Take the offset to the name string for this entry and
6594 * convert to an input array index, which would be one off
6595 * the end of the array if this entry was the lowest-addressed
6596 * name string.
6597 */
6598 j = input[i].ad_name_offset / sizeof(struct accessx_descriptor);
6599
6600 /*
6601 * An offset greater than the max allowable offset is an error.
6602 * It is also an error for any valid entry to point
6603 * to a location prior to the end of the current entry, if
6604 * it's not a reference to the string of the previous entry.
6605 */
6606 if (j > desc_max || (j != 0 && j <= i)) {
6607 error = EINVAL;
6608 goto out;
6609 }
6610
6611 /* Also do not let ad_name_offset point to something beyond the size of the input */
6612 if (input[i].ad_name_offset >= uap->size) {
6613 error = EINVAL;
6614 goto out;
6615 }
6616
6617 /*
6618 * An offset of 0 means use the previous descriptor's offset;
6619 * this is used to chain multiple requests for the same file
6620 * to avoid multiple lookups.
6621 */
6622 if (j == 0) {
6623 /* This is not valid for the first entry */
6624 if (i == 0) {
6625 error = EINVAL;
6626 goto out;
6627 }
6628 continue;
6629 }
6630
6631 /*
6632 * If the offset of the string for this descriptor is before
6633 * what we believe is the current actual last descriptor,
6634 * then we need to adjust our estimate downward; this permits
6635 * the string table following the last descriptor to be out
6636 * of order relative to the descriptor list.
6637 */
6638 if (j < desc_actual) {
6639 desc_actual = j;
6640 }
6641 }
6642
6643 /*
6644 * We limit the actual number of descriptors we are willing to process
6645 * to a hard maximum of ACCESSX_MAX_DESCRIPTORS. If the number being
6646 * requested does not exceed this limit,
6647 */
6648 if (desc_actual > ACCESSX_MAX_DESCRIPTORS) {
6649 error = ENOMEM;
6650 goto out;
6651 }
6652 result = kalloc_data(desc_actual * sizeof(errno_t), Z_WAITOK | Z_ZERO);
6653 if (result == NULL) {
6654 error = ENOMEM;
6655 goto out;
6656 }
6657
6658 /*
6659 * Do the work by iterating over the descriptor entries we know to
6660 * at least appear to contain valid data.
6661 */
6662 error = 0;
6663 for (i = 0; i < desc_actual; i++) {
6664 /*
6665 * If the ad_name_offset is 0, then we use the previous
6666 * results to make the check; otherwise, we are looking up
6667 * a new file name.
6668 */
6669 if (input[i].ad_name_offset != 0) {
6670 /* discard old vnodes */
6671 if (vp) {
6672 vnode_put(vp);
6673 vp = NULL;
6674 }
6675 if (dvp) {
6676 vnode_put(dvp);
6677 dvp = NULL;
6678 }
6679
6680 /*
6681 * Scan forward in the descriptor list to see if we
6682 * need the parent vnode. We will need it if we are
6683 * deleting, since we must have rights to remove
6684 * entries in the parent directory, as well as the
6685 * rights to delete the object itself.
6686 */
6687 wantdelete = input[i].ad_flags & _DELETE_OK;
6688 for (j = i + 1; (j < desc_actual) && (input[j].ad_name_offset == 0); j++) {
6689 if (input[j].ad_flags & _DELETE_OK) {
6690 wantdelete = 1;
6691 }
6692 }
6693
6694 niopts = FOLLOW | AUDITVNPATH1;
6695
6696 /* need parent for vnode_authorize for deletion test */
6697 if (wantdelete) {
6698 niopts |= WANTPARENT;
6699 }
6700
6701 /* do the lookup */
6702 NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, UIO_SYSSPACE,
6703 CAST_USER_ADDR_T(((const char *)input) + input[i].ad_name_offset),
6704 &context);
6705 error = namei(&nd);
6706 if (!error) {
6707 vp = nd.ni_vp;
6708 if (wantdelete) {
6709 dvp = nd.ni_dvp;
6710 }
6711 }
6712 nameidone(&nd);
6713 }
6714
6715 /*
6716 * Handle lookup errors.
6717 */
6718 switch (error) {
6719 case ENOENT:
6720 case EACCES:
6721 case EPERM:
6722 case ENOTDIR:
6723 result[i] = error;
6724 break;
6725 case 0:
6726 /* run this access check */
6727 result[i] = access1(vp, dvp, input[i].ad_flags, &context);
6728 break;
6729 default:
6730 /* fatal lookup error */
6731
6732 goto out;
6733 }
6734 }
6735
6736 AUDIT_ARG(data, result, sizeof(errno_t), desc_actual);
6737
6738 /* copy out results */
6739 error = copyout(result, uap->results, desc_actual * sizeof(errno_t));
6740
6741 out:
6742 if (input && input != stack_input) {
6743 kfree_data(input, uap->size);
6744 }
6745 if (result) {
6746 kfree_data(result, desc_actual * sizeof(errno_t));
6747 }
6748 if (vp) {
6749 vnode_put(vp);
6750 }
6751 if (dvp) {
6752 vnode_put(dvp);
6753 }
6754 if (IS_VALID_CRED(context.vc_ucred)) {
6755 kauth_cred_unref(&context.vc_ucred);
6756 }
6757 return error;
6758 }
6759
6760
6761 /*
6762 * Returns: 0 Success
6763 * namei:EFAULT Bad address
6764 * namei:ENAMETOOLONG Filename too long
6765 * namei:ENOENT No such file or directory
6766 * namei:ELOOP Too many levels of symbolic links
6767 * namei:EBADF Bad file descriptor
6768 * namei:ENOTDIR Not a directory
6769 * namei:???
6770 * access1:
6771 */
/*
 * Common implementation of access(2)/faccessat(2).
 *
 * Looks up `path` relative to `fd` and checks the requested access mode
 * `amode` against the resulting vnode.  Unless AT_EACCESS is set, the
 * check is made against the process' real (not effective) credentials,
 * as POSIX requires for access(2).
 */
static int
faccessat_internal(vfs_context_t ctx, int fd, user_addr_t path, int amode,
    int flag, enum uio_seg segflg)
{
	int error;
	struct nameidata nd;
	int niopts;
	struct vfs_context context;
#if NAMEDRSRCFORK
	int is_namedstream = 0;
#endif

	/*
	 * Unless the AT_EACCESS option is used, Access is defined as checking
	 * against the process' real identity, even if operations are checking
	 * the effective identity. So we need to tweak the credential
	 * in the context for that case.
	 */
	if (!(flag & AT_EACCESS)) {
		/* Takes a reference on the real-identity credential; dropped at out:. */
		context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
	} else {
		context.vc_ucred = ctx->vc_ucred;
	}
	context.vc_thread = ctx->vc_thread;


	niopts = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY) ? NOFOLLOW : FOLLOW) | AUDITVNPATH1;
	/* need parent for vnode_authorize for deletion test */
	if (amode & _DELETE_OK) {
		niopts |= WANTPARENT;
	}
	NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, segflg,
	    path, &context);
	if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
		/* Refuse to traverse any symlink in the path, not just the last component. */
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}

#if NAMEDRSRCFORK
	/* access(F_OK) calls are allowed for resource forks. */
	if (amode == F_OK) {
		nd.ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
	}
#endif
	error = nameiat(&nd, fd);
	if (error) {
		goto out;
	}

#if NAMEDRSRCFORK
	/* Grab reference on the shadow stream file vnode to
	 * force an inactive on release which will mark it
	 * for recycle.
	 */
	if (vnode_isnamedstream(nd.ni_vp) &&
	    (nd.ni_vp->v_parent != NULLVP) &&
	    vnode_isshadow(nd.ni_vp)) {
		is_namedstream = 1;
		vnode_ref(nd.ni_vp);
	}
#endif

	/* Perform the actual permission check with the (possibly tweaked) context. */
	error = access1(nd.ni_vp, nd.ni_dvp, amode, &context);

#if NAMEDRSRCFORK
	if (is_namedstream) {
		vnode_rele(nd.ni_vp);
	}
#endif

	vnode_put(nd.ni_vp);
	if (amode & _DELETE_OK) {
		/* WANTPARENT was set above, so a parent iocount must be released too. */
		vnode_put(nd.ni_dvp);
	}
	nameidone(&nd);

out:
	if (!(flag & AT_EACCESS)) {
		/* Drop the real-credential reference taken above. */
		kauth_cred_unref(&context.vc_ucred);
	}
	return error;
}
6853
6854 int
access(__unused proc_t p,struct access_args * uap,__unused int32_t * retval)6855 access(__unused proc_t p, struct access_args *uap, __unused int32_t *retval)
6856 {
6857 return faccessat_internal(vfs_context_current(), AT_FDCWD,
6858 uap->path, uap->flags, 0, UIO_USERSPACE);
6859 }
6860
6861 int
faccessat(__unused proc_t p,struct faccessat_args * uap,__unused int32_t * retval)6862 faccessat(__unused proc_t p, struct faccessat_args *uap,
6863 __unused int32_t *retval)
6864 {
6865 if (uap->flag & ~(AT_EACCESS | AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) {
6866 return EINVAL;
6867 }
6868
6869 return faccessat_internal(vfs_context_current(), uap->fd,
6870 uap->path, uap->amode, uap->flag, UIO_USERSPACE);
6871 }
6872
6873 /*
6874 * Returns: 0 Success
6875 * EFAULT
6876 * copyout:EFAULT
6877 * namei:???
6878 * vn_stat:???
6879 */
6880 static int
fstatat_internal(vfs_context_t ctx,user_addr_t path,user_addr_t ub,user_addr_t xsecurity,user_addr_t xsecurity_size,int isstat64,enum uio_seg segflg,int fd,int flag)6881 fstatat_internal(vfs_context_t ctx, user_addr_t path, user_addr_t ub,
6882 user_addr_t xsecurity, user_addr_t xsecurity_size, int isstat64,
6883 enum uio_seg segflg, int fd, int flag)
6884 {
6885 struct nameidata nd;
6886 int follow;
6887 union {
6888 struct stat sb;
6889 struct stat64 sb64;
6890 } source = {};
6891 union {
6892 struct user64_stat user64_sb;
6893 struct user32_stat user32_sb;
6894 struct user64_stat64 user64_sb64;
6895 struct user32_stat64 user32_sb64;
6896 } dest = {};
6897 caddr_t sbp;
6898 int error, my_size;
6899 kauth_filesec_t fsec;
6900 size_t xsecurity_bufsize;
6901 void * statptr;
6902 struct fileproc *fp = NULL;
6903 int needsrealdev = 0;
6904
6905 follow = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) ? NOFOLLOW : FOLLOW;
6906 NDINIT(&nd, LOOKUP, OP_GETATTR, follow | AUDITVNPATH1,
6907 segflg, path, ctx);
6908 if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
6909 nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
6910 }
6911
6912 #if NAMEDRSRCFORK
6913 int is_namedstream = 0;
6914 /* stat calls are allowed for resource forks. */
6915 nd.ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
6916 #endif
6917
6918 if (flag & AT_FDONLY) {
6919 vnode_t fvp;
6920
6921 error = fp_getfvp(vfs_context_proc(ctx), fd, &fp, &fvp);
6922 if (error) {
6923 return error;
6924 }
6925 if ((error = vnode_getwithref(fvp))) {
6926 file_drop(fd);
6927 return error;
6928 }
6929 nd.ni_vp = fvp;
6930 } else {
6931 error = nameiat(&nd, fd);
6932 if (error) {
6933 return error;
6934 }
6935 }
6936 fsec = KAUTH_FILESEC_NONE;
6937
6938 statptr = (void *)&source;
6939
6940 #if NAMEDRSRCFORK
6941 /* Grab reference on the shadow stream file vnode to
6942 * force an inactive on release which will mark it
6943 * for recycle.
6944 */
6945 if (vnode_isnamedstream(nd.ni_vp) &&
6946 (nd.ni_vp->v_parent != NULLVP) &&
6947 vnode_isshadow(nd.ni_vp)) {
6948 is_namedstream = 1;
6949 vnode_ref(nd.ni_vp);
6950 }
6951 #endif
6952
6953 needsrealdev = flag & AT_REALDEV ? 1 : 0;
6954 if (fp && (xsecurity == USER_ADDR_NULL)) {
6955 /*
6956 * If the caller has the file open, and is not
6957 * requesting extended security information, we are
6958 * going to let them get the basic stat information.
6959 */
6960 error = vn_stat_noauth(nd.ni_vp, statptr, NULL, isstat64, needsrealdev, ctx,
6961 fp->fp_glob->fg_cred);
6962 } else {
6963 error = vn_stat(nd.ni_vp, statptr, (xsecurity != USER_ADDR_NULL ? &fsec : NULL),
6964 isstat64, needsrealdev, ctx);
6965 }
6966
6967 #if NAMEDRSRCFORK
6968 if (is_namedstream) {
6969 vnode_rele(nd.ni_vp);
6970 }
6971 #endif
6972 vnode_put(nd.ni_vp);
6973 nameidone(&nd);
6974 if (fp) {
6975 file_drop(fd);
6976 fp = NULL;
6977 }
6978
6979 if (error) {
6980 return error;
6981 }
6982 /* Zap spare fields */
6983 if (isstat64 != 0) {
6984 source.sb64.st_lspare = 0;
6985 source.sb64.st_qspare[0] = 0LL;
6986 source.sb64.st_qspare[1] = 0LL;
6987 if (vfs_context_is64bit(ctx)) {
6988 munge_user64_stat64(&source.sb64, &dest.user64_sb64);
6989 my_size = sizeof(dest.user64_sb64);
6990 sbp = (caddr_t)&dest.user64_sb64;
6991 } else {
6992 munge_user32_stat64(&source.sb64, &dest.user32_sb64);
6993 my_size = sizeof(dest.user32_sb64);
6994 sbp = (caddr_t)&dest.user32_sb64;
6995 }
6996 /*
6997 * Check if we raced (post lookup) against the last unlink of a file.
6998 */
6999 if ((source.sb64.st_nlink == 0) && S_ISREG(source.sb64.st_mode)) {
7000 source.sb64.st_nlink = 1;
7001 }
7002 } else {
7003 source.sb.st_lspare = 0;
7004 source.sb.st_qspare[0] = 0LL;
7005 source.sb.st_qspare[1] = 0LL;
7006 if (vfs_context_is64bit(ctx)) {
7007 munge_user64_stat(&source.sb, &dest.user64_sb);
7008 my_size = sizeof(dest.user64_sb);
7009 sbp = (caddr_t)&dest.user64_sb;
7010 } else {
7011 munge_user32_stat(&source.sb, &dest.user32_sb);
7012 my_size = sizeof(dest.user32_sb);
7013 sbp = (caddr_t)&dest.user32_sb;
7014 }
7015
7016 /*
7017 * Check if we raced (post lookup) against the last unlink of a file.
7018 */
7019 if ((source.sb.st_nlink == 0) && S_ISREG(source.sb.st_mode)) {
7020 source.sb.st_nlink = 1;
7021 }
7022 }
7023 if ((error = copyout(sbp, ub, my_size)) != 0) {
7024 goto out;
7025 }
7026
7027 /* caller wants extended security information? */
7028 if (xsecurity != USER_ADDR_NULL) {
7029 /* did we get any? */
7030 if (fsec == KAUTH_FILESEC_NONE) {
7031 if (susize(xsecurity_size, 0) != 0) {
7032 error = EFAULT;
7033 goto out;
7034 }
7035 } else {
7036 /* find the user buffer size */
7037 xsecurity_bufsize = fusize(xsecurity_size);
7038
7039 /* copy out the actual data size */
7040 if (susize(xsecurity_size, KAUTH_FILESEC_COPYSIZE(fsec)) != 0) {
7041 error = EFAULT;
7042 goto out;
7043 }
7044
7045 /* if the caller supplied enough room, copy out to it */
7046 if (xsecurity_bufsize >= KAUTH_FILESEC_COPYSIZE(fsec)) {
7047 error = copyout(fsec, xsecurity, KAUTH_FILESEC_COPYSIZE(fsec));
7048 }
7049 }
7050 }
7051 out:
7052 if (fsec != KAUTH_FILESEC_NONE) {
7053 kauth_filesec_free(fsec);
7054 }
7055 return error;
7056 }
7057
7058 /*
7059 * stat_extended: Get file status; with extended security (ACL).
7060 *
7061 * Parameters: p (ignored)
7062 * uap User argument descriptor (see below)
7063 * retval (ignored)
7064 *
7065 * Indirect: uap->path Path of file to get status from
7066 * uap->ub User buffer (holds file status info)
7067 * uap->xsecurity ACL to get (extended security)
7068 * uap->xsecurity_size Size of ACL
7069 *
7070 * Returns: 0 Success
7071 * !0 errno value
7072 *
7073 */
7074 int
stat_extended(__unused proc_t p,struct stat_extended_args * uap,__unused int32_t * retval)7075 stat_extended(__unused proc_t p, struct stat_extended_args *uap,
7076 __unused int32_t *retval)
7077 {
7078 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7079 uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
7080 0);
7081 }
7082
7083 /*
7084 * Returns: 0 Success
7085 * fstatat_internal:??? [see fstatat_internal() in this file]
7086 */
7087 int
stat(__unused proc_t p,struct stat_args * uap,__unused int32_t * retval)7088 stat(__unused proc_t p, struct stat_args *uap, __unused int32_t *retval)
7089 {
7090 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7091 0, 0, 0, UIO_USERSPACE, AT_FDCWD, 0);
7092 }
7093
7094 int
stat64(__unused proc_t p,struct stat64_args * uap,__unused int32_t * retval)7095 stat64(__unused proc_t p, struct stat64_args *uap, __unused int32_t *retval)
7096 {
7097 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7098 0, 0, 1, UIO_USERSPACE, AT_FDCWD, 0);
7099 }
7100
7101 /*
7102 * stat64_extended: Get file status; can handle large inode numbers; with extended security (ACL).
7103 *
7104 * Parameters: p (ignored)
7105 * uap User argument descriptor (see below)
7106 * retval (ignored)
7107 *
7108 * Indirect: uap->path Path of file to get status from
7109 * uap->ub User buffer (holds file status info)
7110 * uap->xsecurity ACL to get (extended security)
7111 * uap->xsecurity_size Size of ACL
7112 *
7113 * Returns: 0 Success
7114 * !0 errno value
7115 *
7116 */
7117 int
stat64_extended(__unused proc_t p,struct stat64_extended_args * uap,__unused int32_t * retval)7118 stat64_extended(__unused proc_t p, struct stat64_extended_args *uap, __unused int32_t *retval)
7119 {
7120 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7121 uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
7122 0);
7123 }
7124
7125 /*
7126 * lstat_extended: Get file status; does not follow links; with extended security (ACL).
7127 *
7128 * Parameters: p (ignored)
7129 * uap User argument descriptor (see below)
7130 * retval (ignored)
7131 *
7132 * Indirect: uap->path Path of file to get status from
7133 * uap->ub User buffer (holds file status info)
7134 * uap->xsecurity ACL to get (extended security)
7135 * uap->xsecurity_size Size of ACL
7136 *
7137 * Returns: 0 Success
7138 * !0 errno value
7139 *
7140 */
7141 int
lstat_extended(__unused proc_t p,struct lstat_extended_args * uap,__unused int32_t * retval)7142 lstat_extended(__unused proc_t p, struct lstat_extended_args *uap, __unused int32_t *retval)
7143 {
7144 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7145 uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
7146 AT_SYMLINK_NOFOLLOW);
7147 }
7148
7149 /*
7150 * Get file status; this version does not follow links.
7151 */
7152 int
lstat(__unused proc_t p,struct lstat_args * uap,__unused int32_t * retval)7153 lstat(__unused proc_t p, struct lstat_args *uap, __unused int32_t *retval)
7154 {
7155 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7156 0, 0, 0, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW);
7157 }
7158
7159 int
lstat64(__unused proc_t p,struct lstat64_args * uap,__unused int32_t * retval)7160 lstat64(__unused proc_t p, struct lstat64_args *uap, __unused int32_t *retval)
7161 {
7162 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7163 0, 0, 1, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW);
7164 }
7165
7166 /*
7167 * lstat64_extended: Get file status; can handle large inode numbers; does not
7168 * follow links; with extended security (ACL).
7169 *
7170 * Parameters: p (ignored)
7171 * uap User argument descriptor (see below)
7172 * retval (ignored)
7173 *
7174 * Indirect: uap->path Path of file to get status from
7175 * uap->ub User buffer (holds file status info)
7176 * uap->xsecurity ACL to get (extended security)
7177 * uap->xsecurity_size Size of ACL
7178 *
7179 * Returns: 0 Success
7180 * !0 errno value
7181 *
7182 */
7183 int
lstat64_extended(__unused proc_t p,struct lstat64_extended_args * uap,__unused int32_t * retval)7184 lstat64_extended(__unused proc_t p, struct lstat64_extended_args *uap, __unused int32_t *retval)
7185 {
7186 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7187 uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
7188 AT_SYMLINK_NOFOLLOW);
7189 }
7190
7191 int
fstatat(__unused proc_t p,struct fstatat_args * uap,__unused int32_t * retval)7192 fstatat(__unused proc_t p, struct fstatat_args *uap, __unused int32_t *retval)
7193 {
7194 if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_REALDEV | AT_FDONLY | AT_SYMLINK_NOFOLLOW_ANY)) {
7195 return EINVAL;
7196 }
7197
7198 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7199 0, 0, 0, UIO_USERSPACE, uap->fd, uap->flag);
7200 }
7201
7202 int
fstatat64(__unused proc_t p,struct fstatat64_args * uap,__unused int32_t * retval)7203 fstatat64(__unused proc_t p, struct fstatat64_args *uap,
7204 __unused int32_t *retval)
7205 {
7206 if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_REALDEV | AT_FDONLY | AT_SYMLINK_NOFOLLOW_ANY)) {
7207 return EINVAL;
7208 }
7209
7210 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7211 0, 0, 1, UIO_USERSPACE, uap->fd, uap->flag);
7212 }
7213
7214 /*
7215 * Get configurable pathname variables.
7216 *
7217 * Returns: 0 Success
7218 * namei:???
7219 * vn_pathconf:???
7220 *
7221 * Notes: Global implementation constants are intended to be
7222 * implemented in this function directly; all other constants
7223 * are per-FS implementation, and therefore must be handled in
7224 * each respective FS, instead.
7225 *
7226 * XXX We implement some things globally right now that should actually be
7227 * XXX per-FS; we will need to deal with this at some point.
7228 */
7229 /* ARGSUSED */
7230 int
pathconf(__unused proc_t p,struct pathconf_args * uap,int32_t * retval)7231 pathconf(__unused proc_t p, struct pathconf_args *uap, int32_t *retval)
7232 {
7233 int error;
7234 struct nameidata nd;
7235 vfs_context_t ctx = vfs_context_current();
7236
7237 NDINIT(&nd, LOOKUP, OP_PATHCONF, FOLLOW | AUDITVNPATH1,
7238 UIO_USERSPACE, uap->path, ctx);
7239 error = namei(&nd);
7240 if (error) {
7241 return error;
7242 }
7243
7244 error = vn_pathconf(nd.ni_vp, uap->name, retval, ctx);
7245
7246 vnode_put(nd.ni_vp);
7247 nameidone(&nd);
7248 return error;
7249 }
7250
7251 /*
7252 * Return target name of a symbolic link.
7253 */
7254 /* ARGSUSED */
/*
 * Core of readlink(2)/readlinkat(2)/freadlink(2): read the target of a
 * symbolic link into a caller-supplied buffer.
 *
 * If `lnk_vp` is non-NULL it is used directly and the caller retains
 * responsibility for its iocount; otherwise `path` is looked up
 * relative to `fd` without following the final symlink, and the
 * resulting vnode is released here.  `*retval` receives the number of
 * bytes placed in `buf`; note it is assigned even on error, matching
 * this call's historical behavior.
 */
static int
readlinkat_internal(vfs_context_t ctx, int fd, vnode_t lnk_vp, user_addr_t path,
    enum uio_seg seg, user_addr_t buf, size_t bufsize, enum uio_seg bufseg,
    int *retval)
{
	vnode_t vp;
	uio_t auio;
	int error;
	struct nameidata nd;
	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];
	bool put_vnode;

	/* retval is an int, so the transfer size must fit in one. */
	if (bufsize > INT32_MAX) {
		return EINVAL;
	}

	if (lnk_vp) {
		/* Caller supplied the vnode (freadlink path); don't put it here. */
		vp = lnk_vp;
		put_vnode = false;
	} else {
		NDINIT(&nd, LOOKUP, OP_READLINK, NOFOLLOW | AUDITVNPATH1,
		    seg, path, ctx);

		error = nameiat(&nd, fd);
		if (error) {
			return error;
		}
		vp = nd.ni_vp;
		put_vnode = true;
		nameidone(&nd);
	}

	/* Build a single-iovec uio over the destination buffer. */
	auio = uio_createwithbuffer(1, 0, bufseg, UIO_READ,
	    &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, buf, bufsize);
	if (vp->v_type != VLNK) {
		error = EINVAL;
	} else {
		/*
		 * NOTE(review): on the lnk_vp path with CONFIG_MACF disabled,
		 * `error` below would be read before being assigned — benign
		 * in shipping builds, which always enable MACF; confirm if
		 * that configuration ever changes.
		 */
#if CONFIG_MACF
		error = mac_vnode_check_readlink(ctx, vp);
#endif
		if (error == 0) {
			/* Reading a link's target requires read-data permission. */
			error = vnode_authorize(vp, NULL, KAUTH_VNODE_READ_DATA,
			    ctx);
		}
		if (error == 0) {
			error = VNOP_READLINK(vp, auio, ctx);
		}
	}

	if (put_vnode) {
		vnode_put(vp);
	}

	/* Bytes transferred = requested size minus what the uio has left. */
	*retval = (int)(bufsize - uio_resid(auio));
	return error;
}
7312
7313 int
freadlink(proc_t p,struct freadlink_args * uap,int32_t * retval)7314 freadlink(proc_t p, struct freadlink_args *uap, int32_t *retval)
7315 {
7316 enum uio_seg procseg;
7317 vnode_t vp;
7318 int error;
7319
7320 procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
7321
7322 AUDIT_ARG(fd, uap->fd);
7323
7324 if ((error = file_vnode(uap->fd, &vp))) {
7325 return error;
7326 }
7327 if ((error = vnode_getwithref(vp))) {
7328 file_drop(uap->fd);
7329 return error;
7330 }
7331
7332 error = readlinkat_internal(vfs_context_current(), -1,
7333 vp, 0, procseg, CAST_USER_ADDR_T(uap->buf),
7334 uap->bufsize, procseg, retval);
7335
7336 vnode_put(vp);
7337 file_drop(uap->fd);
7338 return error;
7339 }
7340
7341 int
readlink(proc_t p,struct readlink_args * uap,int32_t * retval)7342 readlink(proc_t p, struct readlink_args *uap, int32_t *retval)
7343 {
7344 enum uio_seg procseg;
7345
7346 procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
7347 return readlinkat_internal(vfs_context_current(), AT_FDCWD, NULL,
7348 CAST_USER_ADDR_T(uap->path), procseg, CAST_USER_ADDR_T(uap->buf),
7349 uap->count, procseg, retval);
7350 }
7351
7352 int
readlinkat(proc_t p,struct readlinkat_args * uap,int32_t * retval)7353 readlinkat(proc_t p, struct readlinkat_args *uap, int32_t *retval)
7354 {
7355 enum uio_seg procseg;
7356
7357 procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
7358 return readlinkat_internal(vfs_context_current(), uap->fd, NULL,
7359 CAST_USER_ADDR_T(uap->path), procseg, uap->buf, uap->bufsize, procseg,
7360 retval);
7361 }
7362
7363 /*
7364 * Change file flags, the deep inner layer.
7365 */
7366 static int
chflags0(vnode_t vp,struct vnode_attr * va,int (* setattr)(vnode_t,void *,vfs_context_t),void * arg,vfs_context_t ctx)7367 chflags0(vnode_t vp, struct vnode_attr *va,
7368 int (*setattr)(vnode_t, void *, vfs_context_t),
7369 void *arg, vfs_context_t ctx)
7370 {
7371 kauth_action_t action = 0;
7372 int error;
7373
7374 #if CONFIG_MACF
7375 error = mac_vnode_check_setflags(ctx, vp, va->va_flags);
7376 if (error) {
7377 goto out;
7378 }
7379 #endif
7380
7381 /* request authorisation, disregard immutability */
7382 if ((error = vnode_authattr(vp, va, &action, ctx)) != 0) {
7383 goto out;
7384 }
7385 /*
7386 * Request that the auth layer disregard those file flags it's allowed to when
7387 * authorizing this operation; we need to do this in order to be able to
7388 * clear immutable flags.
7389 */
7390 if (action && ((error = vnode_authorize(vp, NULL, action | KAUTH_VNODE_NOIMMUTABLE, ctx)) != 0)) {
7391 goto out;
7392 }
7393 error = (*setattr)(vp, arg, ctx);
7394
7395 #if CONFIG_MACF
7396 if (error == 0) {
7397 mac_vnode_notify_setflags(ctx, vp, va->va_flags);
7398 }
7399 #endif
7400
7401 out:
7402 return error;
7403 }
7404
7405 /*
7406 * Change file flags.
7407 *
7408 * NOTE: this will vnode_put() `vp'
7409 */
7410 static int
chflags1(vnode_t vp,int flags,vfs_context_t ctx)7411 chflags1(vnode_t vp, int flags, vfs_context_t ctx)
7412 {
7413 struct vnode_attr va;
7414 int error;
7415
7416 VATTR_INIT(&va);
7417 VATTR_SET(&va, va_flags, flags);
7418
7419 error = chflags0(vp, &va, (void *)vnode_setattr, &va, ctx);
7420 vnode_put(vp);
7421
7422 if ((error == 0) && !VATTR_IS_SUPPORTED(&va, va_flags)) {
7423 error = ENOTSUP;
7424 }
7425
7426 return error;
7427 }
7428
7429 /*
7430 * Change flags of a file given a path name.
7431 */
7432 /* ARGSUSED */
7433 int
chflags(__unused proc_t p,struct chflags_args * uap,__unused int32_t * retval)7434 chflags(__unused proc_t p, struct chflags_args *uap, __unused int32_t *retval)
7435 {
7436 vnode_t vp;
7437 vfs_context_t ctx = vfs_context_current();
7438 int error;
7439 struct nameidata nd;
7440 uint32_t wantparent = 0;
7441
7442 #if CONFIG_FILE_LEASES
7443 wantparent = WANTPARENT;
7444 #endif
7445
7446 AUDIT_ARG(fflags, uap->flags);
7447 NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1 | wantparent,
7448 UIO_USERSPACE, uap->path, ctx);
7449 error = namei(&nd);
7450 if (error) {
7451 return error;
7452 }
7453 vp = nd.ni_vp;
7454
7455 #if CONFIG_FILE_LEASES
7456 vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
7457 vnode_put(nd.ni_dvp);
7458 #endif
7459
7460 nameidone(&nd);
7461
7462 /* we don't vnode_put() here because chflags1 does internally */
7463 error = chflags1(vp, uap->flags, ctx);
7464
7465 return error;
7466 }
7467
7468 /*
7469 * Change flags of a file given a file descriptor.
7470 */
7471 /* ARGSUSED */
7472 int
fchflags(__unused proc_t p,struct fchflags_args * uap,__unused int32_t * retval)7473 fchflags(__unused proc_t p, struct fchflags_args *uap, __unused int32_t *retval)
7474 {
7475 vnode_t vp;
7476 int error;
7477
7478 AUDIT_ARG(fd, uap->fd);
7479 AUDIT_ARG(fflags, uap->flags);
7480 if ((error = file_vnode(uap->fd, &vp))) {
7481 return error;
7482 }
7483
7484 if ((error = vnode_getwithref(vp))) {
7485 file_drop(uap->fd);
7486 return error;
7487 }
7488
7489 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
7490
7491 #if CONFIG_FILE_LEASES
7492 vnode_breakdirlease(vp, true, O_WRONLY);
7493 #endif
7494
7495 /* we don't vnode_put() here because chflags1 does internally */
7496 error = chflags1(vp, uap->flags, vfs_context_current());
7497
7498 file_drop(uap->fd);
7499 return error;
7500 }
7501
7502 /*
7503 * Change security information on a filesystem object.
7504 *
7505 * Returns: 0 Success
7506 * EPERM Operation not permitted
7507 * vnode_authattr:??? [anything vnode_authattr can return]
7508 * vnode_authorize:??? [anything vnode_authorize can return]
7509 * vnode_setattr:??? [anything vnode_setattr can return]
7510 *
7511 * Notes: If vnode_authattr or vnode_authorize return EACCES, it will be
7512 * translated to EPERM before being returned.
7513 */
static int
chmod_vnode(vfs_context_t ctx, vnode_t vp, struct vnode_attr *vap)
{
	kauth_action_t action;
	int error;

	AUDIT_ARG(mode, vap->va_mode);
	/* XXX audit new args */

#if NAMEDSTREAMS
	/* chmod calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		return EPERM;
	}
#endif

#if CONFIG_MACF
	/* MAC pre-checks, one per attribute class the caller wants to set. */
	if (VATTR_IS_ACTIVE(vap, va_mode) &&
	    (error = mac_vnode_check_setmode(ctx, vp, (mode_t)vap->va_mode)) != 0) {
		return error;
	}

	if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid)) {
		/* -1 signals "not changing" for whichever id is inactive. */
		if ((error = mac_vnode_check_setowner(ctx, vp,
		    VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : -1,
		    VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : -1))) {
			return error;
		}
	}

	if (VATTR_IS_ACTIVE(vap, va_acl) &&
	    (error = mac_vnode_check_setacl(ctx, vp, vap->va_acl))) {
		return error;
	}
#endif

	/* make sure that the caller is allowed to set this security information */
	if (((error = vnode_authattr(vp, vap, &action, ctx)) != 0) ||
	    ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		/* EACCES from the auth layer is reported as EPERM (see header comment). */
		if (error == EACCES) {
			error = EPERM;
		}
		return error;
	}

	if ((error = vnode_setattr(vp, vap, ctx)) != 0) {
		return error;
	}

#if CONFIG_MACF
	/* MAC post-change notifications, mirroring the pre-checks above. */
	if (VATTR_IS_ACTIVE(vap, va_mode)) {
		mac_vnode_notify_setmode(ctx, vp, (mode_t)vap->va_mode);
	}

	if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid)) {
		mac_vnode_notify_setowner(ctx, vp,
		    VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : -1,
		    VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : -1);
	}

	if (VATTR_IS_ACTIVE(vap, va_acl)) {
		mac_vnode_notify_setacl(ctx, vp, vap->va_acl);
	}
#endif

	return error;
}
7581
7582
7583 /*
7584 * Change mode of a file given a path name.
7585 *
7586 * Returns: 0 Success
7587 * namei:??? [anything namei can return]
7588 * chmod_vnode:??? [anything chmod_vnode can return]
7589 */
7590 static int
chmodat(vfs_context_t ctx,user_addr_t path,struct vnode_attr * vap,int fd,int flag,enum uio_seg segflg)7591 chmodat(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap,
7592 int fd, int flag, enum uio_seg segflg)
7593 {
7594 struct nameidata nd;
7595 int follow, error;
7596 uint32_t wantparent = 0;
7597
7598 #if CONFIG_FILE_LEASES
7599 wantparent = WANTPARENT;
7600 #endif
7601
7602 follow = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) ? NOFOLLOW : FOLLOW;
7603 NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1 | wantparent,
7604 segflg, path, ctx);
7605 if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
7606 nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
7607 }
7608 if ((error = nameiat(&nd, fd))) {
7609 return error;
7610 }
7611
7612 #if CONFIG_FILE_LEASES
7613 vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
7614 vnode_put(nd.ni_dvp);
7615 #endif
7616
7617 error = chmod_vnode(ctx, nd.ni_vp, vap);
7618 vnode_put(nd.ni_vp);
7619 nameidone(&nd);
7620 return error;
7621 }
7622
7623 static int
chmod_extended_init(struct vnode_attr * pva,kauth_filesec_t * pxsecdst,int mode,uid_t uid,gid_t gid,user_addr_t xsecurity)7624 chmod_extended_init(struct vnode_attr *pva, kauth_filesec_t *pxsecdst, int mode, uid_t uid,
7625 gid_t gid, user_addr_t xsecurity)
7626 {
7627 int error;
7628
7629 VATTR_INIT(pva);
7630
7631 if (mode != -1) {
7632 VATTR_SET(pva, va_mode, mode & ALLPERMS);
7633 } else {
7634 pva->va_mode = 0;
7635 }
7636
7637 if (uid != KAUTH_UID_NONE) {
7638 VATTR_SET(pva, va_uid, uid);
7639 }
7640
7641 if (gid != KAUTH_GID_NONE) {
7642 VATTR_SET(pva, va_gid, gid);
7643 }
7644
7645 *pxsecdst = NULL;
7646 switch (xsecurity) {
7647 case USER_ADDR_NULL:
7648 break;
7649
7650 case CAST_USER_ADDR_T((void *)1): /* _FILESEC_REMOVE_ACL */
7651 VATTR_SET(pva, va_acl, NULL);
7652 break;
7653
7654 default:
7655 if ((error = kauth_copyinfilesec(xsecurity, pxsecdst)) != 0) {
7656 return error;
7657 }
7658
7659 VATTR_SET(pva, va_acl, &(*pxsecdst)->fsec_acl);
7660 pva->va_vaflags |= VA_FILESEC_ACL;
7661 KAUTH_DEBUG("CHMOD - setting ACL with %d entries", pva->va_acl->acl_entrycount);
7662 break;
7663 }
7664
7665 return 0;
7666 }
7667
7668 /*
7669 * chmod_extended: Change the mode of a file given a path name; with extended
7670 * argument list (including extended security (ACL)).
7671 *
7672 * Parameters: p Process requesting the open
7673 * uap User argument descriptor (see below)
7674 * retval (ignored)
7675 *
7676 * Indirect: uap->path Path to object (same as 'chmod')
7677 * uap->uid UID to set
7678 * uap->gid GID to set
7679 * uap->mode File mode to set (same as 'chmod')
7680 * uap->xsecurity ACL to set (or delete)
7681 *
7682 * Returns: 0 Success
7683 * !0 errno value
7684 *
7685 * Notes: The kauth_filesec_t in 'va', if any, is in host byte order.
7686 *
7687 * XXX: We should enummerate the possible errno values here, and where
7688 * in the code they originated.
7689 */
7690 int
chmod_extended(__unused proc_t p,struct chmod_extended_args * uap,__unused int32_t * retval)7691 chmod_extended(__unused proc_t p, struct chmod_extended_args *uap, __unused int32_t *retval)
7692 {
7693 int error;
7694 struct vnode_attr va;
7695 kauth_filesec_t xsecdst = NULL;
7696
7697 AUDIT_ARG(owner, uap->uid, uap->gid);
7698
7699 error = chmod_extended_init(&va, &xsecdst, uap->mode, uap->uid,
7700 uap->gid, uap->xsecurity);
7701
7702 if (error) {
7703 return error;
7704 }
7705
7706 error = chmodat(vfs_context_current(), uap->path, &va, AT_FDCWD, 0,
7707 UIO_USERSPACE);
7708
7709 if (xsecdst != NULL) {
7710 kauth_filesec_free(xsecdst);
7711 }
7712 return error;
7713 }
7714
7715 /*
7716 * Returns: 0 Success
7717 * chmodat:??? [anything chmodat can return]
7718 */
7719 static int
fchmodat_internal(vfs_context_t ctx,user_addr_t path,int mode,int fd,int flag,enum uio_seg segflg)7720 fchmodat_internal(vfs_context_t ctx, user_addr_t path, int mode, int fd,
7721 int flag, enum uio_seg segflg)
7722 {
7723 struct vnode_attr va;
7724
7725 VATTR_INIT(&va);
7726 VATTR_SET(&va, va_mode, mode & ALLPERMS);
7727
7728 return chmodat(ctx, path, &va, fd, flag, segflg);
7729 }
7730
7731 int
chmod(__unused proc_t p,struct chmod_args * uap,__unused int32_t * retval)7732 chmod(__unused proc_t p, struct chmod_args *uap, __unused int32_t *retval)
7733 {
7734 return fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
7735 AT_FDCWD, 0, UIO_USERSPACE);
7736 }
7737
7738 int
fchmodat(__unused proc_t p,struct fchmodat_args * uap,__unused int32_t * retval)7739 fchmodat(__unused proc_t p, struct fchmodat_args *uap, __unused int32_t *retval)
7740 {
7741 if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) {
7742 return EINVAL;
7743 }
7744
7745 return fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
7746 uap->fd, uap->flag, UIO_USERSPACE);
7747 }
7748
7749 /*
7750 * Change mode of a file given a file descriptor.
7751 */
7752 static int
fchmod1(__unused proc_t p,int fd,struct vnode_attr * vap)7753 fchmod1(__unused proc_t p, int fd, struct vnode_attr *vap)
7754 {
7755 vnode_t vp;
7756 int error;
7757
7758 AUDIT_ARG(fd, fd);
7759
7760 if ((error = file_vnode(fd, &vp)) != 0) {
7761 return error;
7762 }
7763 if ((error = vnode_getwithref(vp)) != 0) {
7764 file_drop(fd);
7765 return error;
7766 }
7767 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
7768
7769 #if CONFIG_FILE_LEASES
7770 vnode_breakdirlease(vp, true, O_WRONLY);
7771 #endif
7772
7773 error = chmod_vnode(vfs_context_current(), vp, vap);
7774 (void)vnode_put(vp);
7775 file_drop(fd);
7776
7777 return error;
7778 }
7779
7780 /*
7781 * fchmod_extended: Change mode of a file given a file descriptor; with
7782 * extended argument list (including extended security (ACL)).
7783 *
7784 * Parameters: p Process requesting to change file mode
7785 * uap User argument descriptor (see below)
7786 * retval (ignored)
7787 *
7788 * Indirect: uap->mode File mode to set (same as 'chmod')
7789 * uap->uid UID to set
7790 * uap->gid GID to set
7791 * uap->xsecurity ACL to set (or delete)
7792 * uap->fd File descriptor of file to change mode
7793 *
7794 * Returns: 0 Success
7795 * !0 errno value
7796 *
7797 */
7798 int
fchmod_extended(proc_t p,struct fchmod_extended_args * uap,__unused int32_t * retval)7799 fchmod_extended(proc_t p, struct fchmod_extended_args *uap, __unused int32_t *retval)
7800 {
7801 int error;
7802 struct vnode_attr va;
7803 kauth_filesec_t xsecdst = NULL;
7804
7805 AUDIT_ARG(owner, uap->uid, uap->gid);
7806
7807 error = chmod_extended_init(&va, &xsecdst, uap->mode, uap->uid,
7808 uap->gid, uap->xsecurity);
7809
7810 if (error) {
7811 return error;
7812 }
7813
7814 error = fchmod1(p, uap->fd, &va);
7815
7816 if (xsecdst != NULL) {
7817 kauth_filesec_free(xsecdst);
7818 }
7819 return error;
7820 }
7821
7822 int
fchmod(proc_t p,struct fchmod_args * uap,__unused int32_t * retval)7823 fchmod(proc_t p, struct fchmod_args *uap, __unused int32_t *retval)
7824 {
7825 struct vnode_attr va;
7826
7827 VATTR_INIT(&va);
7828 VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
7829
7830 return fchmod1(p, uap->fd, &va);
7831 }
7832
7833
7834 /*
7835 * Set ownership given a path name.
7836 */
7837 /* ARGSUSED */
7838 static int
fchownat_internal(vfs_context_t ctx,int fd,user_addr_t path,uid_t uid,gid_t gid,int flag,enum uio_seg segflg)7839 fchownat_internal(vfs_context_t ctx, int fd, user_addr_t path, uid_t uid,
7840 gid_t gid, int flag, enum uio_seg segflg)
7841 {
7842 vnode_t vp;
7843 struct vnode_attr va;
7844 int error;
7845 struct nameidata nd;
7846 int follow;
7847 kauth_action_t action;
7848 uint32_t wantparent = 0;
7849
7850 #if CONFIG_FILE_LEASES
7851 wantparent = WANTPARENT;
7852 #endif
7853
7854 AUDIT_ARG(owner, uid, gid);
7855
7856 follow = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) ? NOFOLLOW : FOLLOW;
7857 NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1 | wantparent, segflg,
7858 path, ctx);
7859 if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
7860 nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
7861 }
7862 error = nameiat(&nd, fd);
7863 if (error) {
7864 return error;
7865 }
7866 vp = nd.ni_vp;
7867
7868 VATTR_INIT(&va);
7869 if (uid != (uid_t)VNOVAL) {
7870 VATTR_SET(&va, va_uid, uid);
7871 }
7872 if (gid != (gid_t)VNOVAL) {
7873 VATTR_SET(&va, va_gid, gid);
7874 }
7875
7876 #if CONFIG_MACF
7877 error = mac_vnode_check_setowner(ctx, vp, uid, gid);
7878 if (error) {
7879 goto out;
7880 }
7881 #endif
7882
7883 /* preflight and authorize attribute changes */
7884 if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
7885 goto out;
7886 }
7887 if (action && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
7888 goto out;
7889 }
7890
7891 #if CONFIG_FILE_LEASES
7892 vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
7893 #endif
7894
7895 error = vnode_setattr(vp, &va, ctx);
7896
7897 #if CONFIG_MACF
7898 if (error == 0) {
7899 mac_vnode_notify_setowner(ctx, vp, uid, gid);
7900 }
7901 #endif
7902
7903 out:
7904 /*
7905 * EACCES is only allowed from namei(); permissions failure should
7906 * return EPERM, so we need to translate the error code.
7907 */
7908 if (error == EACCES) {
7909 error = EPERM;
7910 }
7911
7912 #if CONFIG_FILE_LEASES
7913 vnode_put(nd.ni_dvp);
7914 #endif
7915 nameidone(&nd);
7916 vnode_put(vp);
7917 return error;
7918 }
7919
7920 int
chown(__unused proc_t p,struct chown_args * uap,__unused int32_t * retval)7921 chown(__unused proc_t p, struct chown_args *uap, __unused int32_t *retval)
7922 {
7923 return fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
7924 uap->uid, uap->gid, 0, UIO_USERSPACE);
7925 }
7926
7927 int
lchown(__unused proc_t p,struct lchown_args * uap,__unused int32_t * retval)7928 lchown(__unused proc_t p, struct lchown_args *uap, __unused int32_t *retval)
7929 {
7930 return fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
7931 uap->owner, uap->group, AT_SYMLINK_NOFOLLOW, UIO_USERSPACE);
7932 }
7933
7934 int
fchownat(__unused proc_t p,struct fchownat_args * uap,__unused int32_t * retval)7935 fchownat(__unused proc_t p, struct fchownat_args *uap, __unused int32_t *retval)
7936 {
7937 if (uap->flag & ~AT_SYMLINK_NOFOLLOW) {
7938 return EINVAL;
7939 }
7940
7941 return fchownat_internal(vfs_context_current(), uap->fd, uap->path,
7942 uap->uid, uap->gid, uap->flag, UIO_USERSPACE);
7943 }
7944
7945 /*
7946 * Set ownership given a file descriptor.
7947 */
7948 /* ARGSUSED */
7949 int
fchown(__unused proc_t p,struct fchown_args * uap,__unused int32_t * retval)7950 fchown(__unused proc_t p, struct fchown_args *uap, __unused int32_t *retval)
7951 {
7952 struct vnode_attr va;
7953 vfs_context_t ctx = vfs_context_current();
7954 vnode_t vp;
7955 int error;
7956 kauth_action_t action;
7957
7958 AUDIT_ARG(owner, uap->uid, uap->gid);
7959 AUDIT_ARG(fd, uap->fd);
7960
7961 if ((error = file_vnode(uap->fd, &vp))) {
7962 return error;
7963 }
7964
7965 if ((error = vnode_getwithref(vp))) {
7966 file_drop(uap->fd);
7967 return error;
7968 }
7969 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
7970
7971 VATTR_INIT(&va);
7972 if (uap->uid != VNOVAL) {
7973 VATTR_SET(&va, va_uid, uap->uid);
7974 }
7975 if (uap->gid != VNOVAL) {
7976 VATTR_SET(&va, va_gid, uap->gid);
7977 }
7978
7979 #if NAMEDSTREAMS
7980 /* chown calls are not allowed for resource forks. */
7981 if (vp->v_flag & VISNAMEDSTREAM) {
7982 error = EPERM;
7983 goto out;
7984 }
7985 #endif
7986
7987 #if CONFIG_MACF
7988 error = mac_vnode_check_setowner(ctx, vp, uap->uid, uap->gid);
7989 if (error) {
7990 goto out;
7991 }
7992 #endif
7993
7994 /* preflight and authorize attribute changes */
7995 if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
7996 goto out;
7997 }
7998 if (action && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
7999 if (error == EACCES) {
8000 error = EPERM;
8001 }
8002 goto out;
8003 }
8004
8005 #if CONFIG_FILE_LEASES
8006 vnode_breakdirlease(vp, true, O_WRONLY);
8007 #endif
8008
8009 error = vnode_setattr(vp, &va, ctx);
8010
8011 #if CONFIG_MACF
8012 if (error == 0) {
8013 mac_vnode_notify_setowner(ctx, vp, uap->uid, uap->gid);
8014 }
8015 #endif
8016
8017 out:
8018 (void)vnode_put(vp);
8019 file_drop(uap->fd);
8020 return error;
8021 }
8022
8023 static int
getutimes(user_addr_t usrtvp,struct timespec * tsp)8024 getutimes(user_addr_t usrtvp, struct timespec *tsp)
8025 {
8026 int error;
8027
8028 if (usrtvp == USER_ADDR_NULL) {
8029 struct timeval old_tv;
8030 /* XXX Y2038 bug because of microtime argument */
8031 microtime(&old_tv);
8032 TIMEVAL_TO_TIMESPEC(&old_tv, &tsp[0]);
8033 tsp[1] = tsp[0];
8034 } else {
8035 if (IS_64BIT_PROCESS(current_proc())) {
8036 struct user64_timeval tv[2];
8037 error = copyin(usrtvp, (void *)tv, sizeof(tv));
8038 if (error) {
8039 return error;
8040 }
8041 TIMEVAL64_TO_TIMESPEC(&tv[0], &tsp[0]);
8042 TIMEVAL64_TO_TIMESPEC(&tv[1], &tsp[1]);
8043 } else {
8044 struct user32_timeval tv[2];
8045 error = copyin(usrtvp, (void *)tv, sizeof(tv));
8046 if (error) {
8047 return error;
8048 }
8049 TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]);
8050 TIMEVAL_TO_TIMESPEC(&tv[1], &tsp[1]);
8051 }
8052 }
8053 return 0;
8054 }
8055
/*
 * Apply access/modification times ts[0]/ts[1] to 'vp'.  'nullflag' is set
 * when the caller supplied no explicit times (utimes(path, NULL)); that
 * relaxes the permission model (VA_UTIMES_NULL) and keeps EACCES as-is.
 */
static int
setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts,
    int nullflag)
{
	int error;
	struct vnode_attr va;
	kauth_action_t action;

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	VATTR_INIT(&va);
	VATTR_SET(&va, va_access_time, ts[0]);
	VATTR_SET(&va, va_modify_time, ts[1]);
	if (nullflag) {
		va.va_vaflags |= VA_UTIMES_NULL;
	}

#if NAMEDSTREAMS
	/* utimes calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		error = EPERM;
		goto out;
	}
#endif

#if CONFIG_MACF
	error = mac_vnode_check_setutimes(ctx, vp, ts[0], ts[1]);
	if (error) {
		goto out;
	}
#endif
	/* With explicit times, a permission failure reports EPERM, not EACCES. */
	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
		if (!nullflag && error == EACCES) {
			error = EPERM;
		}
		goto out;
	}

	/* since we may not need to auth anything, check here */
	if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		if (!nullflag && error == EACCES) {
			error = EPERM;
		}
		goto out;
	}
	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	if (error == 0) {
		mac_vnode_notify_setutimes(ctx, vp, ts[0], ts[1]);
	}
#endif

out:
	return error;
}
8112
8113 /*
8114 * Set the access and modification times of a file.
8115 */
8116 /* ARGSUSED */
8117 int
utimes(__unused proc_t p,struct utimes_args * uap,__unused int32_t * retval)8118 utimes(__unused proc_t p, struct utimes_args *uap, __unused int32_t *retval)
8119 {
8120 struct timespec ts[2];
8121 user_addr_t usrtvp;
8122 int error;
8123 struct nameidata nd;
8124 vfs_context_t ctx = vfs_context_current();
8125 uint32_t wantparent = 0;
8126
8127 #if CONFIG_FILE_LEASES
8128 wantparent = WANTPARENT;
8129 #endif
8130
8131 /*
8132 * AUDIT: Needed to change the order of operations to do the
8133 * name lookup first because auditing wants the path.
8134 */
8135 NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1 | wantparent,
8136 UIO_USERSPACE, uap->path, ctx);
8137 error = namei(&nd);
8138 if (error) {
8139 return error;
8140 }
8141
8142 /*
8143 * Fetch the user-supplied time. If usrtvp is USER_ADDR_NULL, we fetch
8144 * the current time instead.
8145 */
8146 usrtvp = uap->tptr;
8147 if ((error = getutimes(usrtvp, ts)) != 0) {
8148 goto out;
8149 }
8150
8151 #if CONFIG_FILE_LEASES
8152 vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
8153 #endif
8154
8155 error = setutimes(ctx, nd.ni_vp, ts, usrtvp == USER_ADDR_NULL);
8156
8157 out:
8158 #if CONFIG_FILE_LEASES
8159 vnode_put(nd.ni_dvp);
8160 #endif
8161 nameidone(&nd);
8162 vnode_put(nd.ni_vp);
8163 return error;
8164 }
8165
8166 /*
8167 * Set the access and modification times of a file.
8168 */
8169 /* ARGSUSED */
8170 int
futimes(__unused proc_t p,struct futimes_args * uap,__unused int32_t * retval)8171 futimes(__unused proc_t p, struct futimes_args *uap, __unused int32_t *retval)
8172 {
8173 struct timespec ts[2];
8174 vnode_t vp;
8175 user_addr_t usrtvp;
8176 int error;
8177
8178 AUDIT_ARG(fd, uap->fd);
8179 usrtvp = uap->tptr;
8180 if ((error = getutimes(usrtvp, ts)) != 0) {
8181 return error;
8182 }
8183 if ((error = file_vnode(uap->fd, &vp)) != 0) {
8184 return error;
8185 }
8186 if ((error = vnode_getwithref(vp))) {
8187 file_drop(uap->fd);
8188 return error;
8189 }
8190
8191 #if CONFIG_FILE_LEASES
8192 vnode_breakdirlease(vp, true, O_WRONLY);
8193 #endif
8194
8195 error = setutimes(vfs_context_current(), vp, ts, usrtvp == 0);
8196
8197 vnode_put(vp);
8198 file_drop(uap->fd);
8199 return error;
8200 }
8201
8202 static int
truncate_validate_common(proc_t p,off_t length)8203 truncate_validate_common(proc_t p, off_t length)
8204 {
8205 rlim_t fsize_limit;
8206
8207 if (length < 0) {
8208 return EINVAL;
8209 }
8210
8211 fsize_limit = proc_limitgetcur(p, RLIMIT_FSIZE);
8212 if ((rlim_t)length > fsize_limit) {
8213 psignal(p, SIGXFSZ);
8214 return EFBIG;
8215 }
8216
8217 return 0;
8218 }
8219
/*
 * Truncate 'vp' to 'length' bytes by setting va_data_size.  'need_auth'
 * is false when the caller holds an already-authorised open (ftruncate);
 * 'cred' is used only for the MAC check/notify hooks.
 */
static int
truncate_internal(vnode_t vp, off_t length, kauth_cred_t cred,
    vfs_context_t ctx, boolean_t need_auth)
{
	struct vnode_attr va;
	kauth_action_t action;
	int error;

	VATTR_INIT(&va);
	VATTR_SET(&va, va_data_size, length);

#if CONFIG_MACF
	error = mac_vnode_check_truncate(ctx, cred, vp);
	if (error) {
		return error;
	}
#endif

	/*
	 * If we reached here from `ftruncate` then we already did an effective
	 * `vnode_authorize` upon open. We honour the result from then.
	 */
	if (need_auth) {
		if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
			return error;
		}

		if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
			return error;
		}
	}

#if CONFIG_FILE_LEASES
	/* Check if there is a lease placed on the parent directory. */
	vnode_breakdirlease(vp, true, O_WRONLY);

	/* Now check if there is a lease placed on the file itself. */
	(void)vnode_breaklease(vp, O_WRONLY, ctx);
#endif

	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	if (error == 0) {
		mac_vnode_notify_truncate(ctx, cred, vp);
	}
#endif

	return error;
}
8270
8271 /*
8272 * Truncate a file given its path name.
8273 */
8274 /* ARGSUSED */
8275 int
truncate(proc_t p,struct truncate_args * uap,__unused int32_t * retval)8276 truncate(proc_t p, struct truncate_args *uap, __unused int32_t *retval)
8277 {
8278 vfs_context_t ctx = vfs_context_current();
8279 vnode_t vp;
8280 int error;
8281 struct nameidata nd;
8282
8283 if ((error = truncate_validate_common(p, uap->length))) {
8284 return error;
8285 }
8286
8287 NDINIT(&nd, LOOKUP, OP_TRUNCATE, FOLLOW | AUDITVNPATH1,
8288 UIO_USERSPACE, uap->path, ctx);
8289
8290 if ((error = namei(&nd))) {
8291 return error;
8292 }
8293
8294 vp = nd.ni_vp;
8295 nameidone(&nd);
8296
8297 error = truncate_internal(vp, uap->length, NOCRED, ctx, true);
8298 vnode_put(vp);
8299
8300 return error;
8301 }
8302
8303 /*
8304 * Truncate a file given a file descriptor.
8305 */
8306 /* ARGSUSED */
8307 int
ftruncate(proc_t p,struct ftruncate_args * uap,int32_t * retval)8308 ftruncate(proc_t p, struct ftruncate_args *uap, int32_t *retval)
8309 {
8310 vnode_t vp;
8311 struct fileproc *fp;
8312 int error;
8313
8314 AUDIT_ARG(fd, uap->fd);
8315
8316 if ((error = truncate_validate_common(p, uap->length))) {
8317 return error;
8318 }
8319
8320 if ((error = fp_lookup(p, uap->fd, &fp, 0))) {
8321 return error;
8322 }
8323
8324 switch (FILEGLOB_DTYPE(fp->fp_glob)) {
8325 case DTYPE_PSXSHM:
8326 error = pshm_truncate(p, fp, uap->fd, uap->length, retval);
8327 goto out;
8328 case DTYPE_VNODE:
8329 break;
8330 default:
8331 error = EINVAL;
8332 goto out;
8333 }
8334
8335 vp = (vnode_t)fp_get_data(fp);
8336
8337 if ((fp->fp_glob->fg_flag & FWRITE) == 0) {
8338 AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
8339 error = EINVAL;
8340 goto out;
8341 }
8342
8343 if ((error = vnode_getwithref(vp)) != 0) {
8344 goto out;
8345 }
8346
8347 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
8348
8349 error = truncate_internal(vp, uap->length, fp->fp_glob->fg_cred,
8350 vfs_context_current(), false);
8351 vnode_put(vp);
8352
8353 out:
8354 file_drop(uap->fd);
8355 return error;
8356 }
8357
8358
8359 /*
8360 * Sync an open file with synchronized I/O _file_ integrity completion
8361 */
8362 /* ARGSUSED */
8363 int
fsync(proc_t p,struct fsync_args * uap,__unused int32_t * retval)8364 fsync(proc_t p, struct fsync_args *uap, __unused int32_t *retval)
8365 {
8366 __pthread_testcancel(1);
8367 return fsync_common(p, uap, MNT_WAIT);
8368 }
8369
8370
8371 /*
8372 * Sync an open file with synchronized I/O _file_ integrity completion
8373 *
8374 * Notes: This is a legacy support function that does not test for
8375 * thread cancellation points.
8376 */
8377 /* ARGSUSED */
8378 int
fsync_nocancel(proc_t p,struct fsync_nocancel_args * uap,__unused int32_t * retval)8379 fsync_nocancel(proc_t p, struct fsync_nocancel_args *uap, __unused int32_t *retval)
8380 {
8381 return fsync_common(p, (struct fsync_args *)uap, MNT_WAIT);
8382 }
8383
8384
8385 /*
8386 * Sync an open file with synchronized I/O _data_ integrity completion
8387 */
8388 /* ARGSUSED */
8389 int
fdatasync(proc_t p,struct fdatasync_args * uap,__unused int32_t * retval)8390 fdatasync(proc_t p, struct fdatasync_args *uap, __unused int32_t *retval)
8391 {
8392 __pthread_testcancel(1);
8393 return fsync_common(p, (struct fsync_args *)uap, MNT_DWAIT);
8394 }
8395
8396
8397 /*
8398 * fsync_common
8399 *
8400 * Common fsync code to support both synchronized I/O file integrity completion
8401 * (normal fsync) and synchronized I/O data integrity completion (fdatasync).
8402 *
8403 * If 'flags' is MNT_DWAIT, the caller is requesting data integrity, which
8404 * will only guarantee that the file data contents are retrievable. If
8405 * 'flags' is MNT_WAIT, the caller is rewuesting file integrity, which also
8406 * includes additional metadata unnecessary for retrieving the file data
8407 * contents, such as atime, mtime, ctime, etc., also be committed to stable
8408 * storage.
8409 *
8410 * Parameters: p The process
8411 * uap->fd The descriptor to synchronize
8412 * flags The data integrity flags
8413 *
8414 * Returns: int Success
8415 * fp_getfvp:EBADF Bad file descriptor
8416 * fp_getfvp:ENOTSUP fd does not refer to a vnode
8417 * VNOP_FSYNC:??? unspecified
8418 *
8419 * Notes: We use struct fsync_args because it is a short name, and all
8420 * caller argument structures are otherwise identical.
8421 */
8422 static int
fsync_common(proc_t p,struct fsync_args * uap,int flags)8423 fsync_common(proc_t p, struct fsync_args *uap, int flags)
8424 {
8425 vnode_t vp;
8426 struct fileproc *fp;
8427 vfs_context_t ctx = vfs_context_current();
8428 int error;
8429
8430 AUDIT_ARG(fd, uap->fd);
8431
8432 if ((error = fp_getfvp(p, uap->fd, &fp, &vp))) {
8433 return error;
8434 }
8435 if ((error = vnode_getwithref(vp))) {
8436 file_drop(uap->fd);
8437 return error;
8438 }
8439
8440 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
8441
8442 error = VNOP_FSYNC(vp, flags, ctx);
8443
8444 #if NAMEDRSRCFORK
8445 /* Sync resource fork shadow file if necessary. */
8446 if ((error == 0) &&
8447 (vp->v_flag & VISNAMEDSTREAM) &&
8448 (vp->v_parent != NULLVP) &&
8449 vnode_isshadow(vp) &&
8450 (fp->fp_glob->fg_flag & FWASWRITTEN)) {
8451 (void) vnode_flushnamedstream(vp->v_parent, vp, ctx);
8452 }
8453 #endif
8454
8455 (void)vnode_put(vp);
8456 file_drop(uap->fd);
8457 return error;
8458 }
8459
8460 /*
8461 * Duplicate files. Source must be a file, target must be a file or
8462 * must not exist.
8463 *
8464 * XXX Copyfile authorisation checking is woefully inadequate, and will not
8465 * perform inheritance correctly.
8466 */
8467 /* ARGSUSED */
8468 int
copyfile(__unused proc_t p,struct copyfile_args * uap,__unused int32_t * retval)8469 copyfile(__unused proc_t p, struct copyfile_args *uap, __unused int32_t *retval)
8470 {
8471 vnode_t tvp, fvp, tdvp, sdvp;
8472 struct nameidata fromnd, tond;
8473 int error;
8474 vfs_context_t ctx = vfs_context_current();
8475
8476 /* Check that the flags are valid. */
8477 if (uap->flags & ~CPF_MASK) {
8478 return EINVAL;
8479 }
8480
8481 NDINIT(&fromnd, LOOKUP, OP_COPYFILE, AUDITVNPATH1,
8482 UIO_USERSPACE, uap->from, ctx);
8483 if ((error = namei(&fromnd))) {
8484 return error;
8485 }
8486 fvp = fromnd.ni_vp;
8487
8488 NDINIT(&tond, CREATE, OP_LINK,
8489 LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART | AUDITVNPATH2 | CN_NBMOUNTLOOK,
8490 UIO_USERSPACE, uap->to, ctx);
8491 if ((error = namei(&tond))) {
8492 goto out1;
8493 }
8494 tdvp = tond.ni_dvp;
8495 tvp = tond.ni_vp;
8496
8497 if (tvp != NULL) {
8498 if (!(uap->flags & CPF_OVERWRITE)) {
8499 error = EEXIST;
8500 goto out;
8501 }
8502 }
8503
8504 if (fvp->v_type == VDIR || (tvp && tvp->v_type == VDIR)) {
8505 error = EISDIR;
8506 goto out;
8507 }
8508
8509 if (fvp->v_type == VSOCK && fvp->v_tag != VT_FDESC) {
8510 error = EOPNOTSUPP;
8511 goto out;
8512 }
8513
8514 #if CONFIG_MACF
8515 if ((error = mac_vnode_check_copyfile(ctx, tdvp, tvp, fvp, &tond.ni_cnd, (mode_t)uap->mode, uap->flags)) != 0) {
8516 goto out;
8517 }
8518 #endif /* CONFIG_MACF */
8519
8520 if ((error = vnode_authorize(fvp, NULL, KAUTH_VNODE_READ_DATA, ctx)) != 0) {
8521 goto out;
8522 }
8523 if (tvp) {
8524 if ((error = vnode_authorize(tvp, tdvp, KAUTH_VNODE_DELETE, ctx)) != 0) {
8525 goto out;
8526 }
8527 }
8528 if ((error = vnode_authorize(tdvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
8529 goto out;
8530 }
8531
8532 if (fvp == tdvp) {
8533 error = EINVAL;
8534 }
8535 /*
8536 * If source is the same as the destination (that is the
8537 * same inode number) then there is nothing to do.
8538 * (fixed to have POSIX semantics - CSM 3/2/98)
8539 */
8540 if (fvp == tvp) {
8541 error = -1;
8542 }
8543
8544 #if CONFIG_FILE_LEASES
8545 vnode_breakdirlease(tdvp, false, O_WRONLY);
8546 #endif
8547
8548 if (!error) {
8549 error = VNOP_COPYFILE(fvp, tdvp, tvp, &tond.ni_cnd, uap->mode, uap->flags, ctx);
8550 }
8551 out:
8552 sdvp = tond.ni_startdir;
8553 /*
8554 * nameidone has to happen before we vnode_put(tdvp)
8555 * since it may need to release the fs_nodelock on the tdvp
8556 */
8557 nameidone(&tond);
8558
8559 if (tvp) {
8560 vnode_put(tvp);
8561 }
8562 vnode_put(tdvp);
8563 vnode_put(sdvp);
8564 out1:
8565 vnode_put(fvp);
8566
8567 nameidone(&fromnd);
8568
8569 if (error == -1) {
8570 return 0;
8571 }
8572 return error;
8573 }
8574
8575 #define CLONE_SNAPSHOT_FALLBACKS_ENABLED 1
8576
8577 /*
8578 * Helper function for doing clones. The caller is expected to provide an
8579 * iocounted source vnode and release it.
8580 */
8581 static int
clonefile_internal(vnode_t fvp,boolean_t data_read_authorised,int dst_dirfd,user_addr_t dst,uint32_t flags,vfs_context_t ctx)8582 clonefile_internal(vnode_t fvp, boolean_t data_read_authorised, int dst_dirfd,
8583 user_addr_t dst, uint32_t flags, vfs_context_t ctx)
8584 {
8585 vnode_t tvp, tdvp;
8586 struct nameidata tond;
8587 int error;
8588 int follow;
8589 boolean_t free_src_acl;
8590 boolean_t attr_cleanup;
8591 enum vtype v_type;
8592 kauth_action_t action;
8593 struct componentname *cnp;
8594 uint32_t defaulted = 0;
8595 struct vnode_attr va;
8596 struct vnode_attr nva;
8597 uint32_t vnop_flags;
8598
8599 v_type = vnode_vtype(fvp);
8600 switch (v_type) {
8601 case VLNK:
8602 /* FALLTHRU */
8603 case VREG:
8604 action = KAUTH_VNODE_ADD_FILE;
8605 break;
8606 case VDIR:
8607 if (vnode_isvroot(fvp) || vnode_ismount(fvp) ||
8608 fvp->v_mountedhere) {
8609 return EINVAL;
8610 }
8611 action = KAUTH_VNODE_ADD_SUBDIRECTORY;
8612 break;
8613 default:
8614 return EINVAL;
8615 }
8616
8617 AUDIT_ARG(fd2, dst_dirfd);
8618 AUDIT_ARG(value32, flags);
8619
8620 follow = (flags & CLONE_NOFOLLOW) ? NOFOLLOW : FOLLOW;
8621 NDINIT(&tond, CREATE, OP_LINK, follow | WANTPARENT | AUDITVNPATH2,
8622 UIO_USERSPACE, dst, ctx);
8623 if ((error = nameiat(&tond, dst_dirfd))) {
8624 return error;
8625 }
8626 cnp = &tond.ni_cnd;
8627 tdvp = tond.ni_dvp;
8628 tvp = tond.ni_vp;
8629
8630 free_src_acl = FALSE;
8631 attr_cleanup = FALSE;
8632
8633 if (tvp != NULL) {
8634 error = EEXIST;
8635 goto out;
8636 }
8637
8638 if (vnode_mount(tdvp) != vnode_mount(fvp)) {
8639 error = EXDEV;
8640 goto out;
8641 }
8642
8643 #if CONFIG_MACF
8644 if ((error = mac_vnode_check_clone(ctx, tdvp, fvp, cnp))) {
8645 goto out;
8646 }
8647 #endif
8648 if ((error = vnode_authorize(tdvp, NULL, action, ctx))) {
8649 goto out;
8650 }
8651
8652 action = KAUTH_VNODE_GENERIC_READ_BITS;
8653 if (data_read_authorised) {
8654 action &= ~KAUTH_VNODE_READ_DATA;
8655 }
8656 if ((error = vnode_authorize(fvp, NULL, action, ctx))) {
8657 goto out;
8658 }
8659
8660 /*
8661 * certain attributes may need to be changed from the source, we ask for
8662 * those here with the exception of source file's ACLs unless the CLONE_ACL
8663 * flag is specified. By default, the clone file will inherit the target
8664 * directory's ACLs unless the the CLONE_ACL flag is specified then it
8665 * will inherit the source file's ACLs instead.
8666 */
8667 VATTR_INIT(&va);
8668 VATTR_WANTED(&va, va_uid);
8669 VATTR_WANTED(&va, va_gid);
8670 VATTR_WANTED(&va, va_mode);
8671 VATTR_WANTED(&va, va_flags);
8672 if (flags & CLONE_ACL) {
8673 VATTR_WANTED(&va, va_acl);
8674 }
8675
8676 if ((error = vnode_getattr(fvp, &va, ctx)) != 0) {
8677 goto out;
8678 }
8679
8680 VATTR_INIT(&nva);
8681 VATTR_SET(&nva, va_type, v_type);
8682 if (VATTR_IS_SUPPORTED(&va, va_acl) && va.va_acl != NULL) {
8683 VATTR_SET(&nva, va_acl, va.va_acl);
8684 free_src_acl = TRUE;
8685 }
8686
8687 /* Handle ACL inheritance, initialize vap. */
8688 if (v_type == VLNK) {
8689 error = vnode_authattr_new(tdvp, &nva, 0, ctx);
8690 } else {
8691 error = vn_attribute_prepare(tdvp, &nva, &defaulted, ctx);
8692 if (error) {
8693 goto out;
8694 }
8695 attr_cleanup = TRUE;
8696 }
8697
8698 vnop_flags = VNODE_CLONEFILE_DEFAULT;
8699 /*
8700 * We've got initial values for all security parameters,
8701 * If we are superuser, then we can change owners to be the
8702 * same as the source. Both superuser and the owner have default
8703 * WRITE_SECURITY privileges so all other fields can be taken
8704 * from source as well.
8705 */
8706 if (!(flags & CLONE_NOOWNERCOPY) && vfs_context_issuser(ctx)) {
8707 if (VATTR_IS_SUPPORTED(&va, va_uid)) {
8708 VATTR_SET(&nva, va_uid, va.va_uid);
8709 }
8710 if (VATTR_IS_SUPPORTED(&va, va_gid)) {
8711 VATTR_SET(&nva, va_gid, va.va_gid);
8712 }
8713 } else {
8714 vnop_flags |= VNODE_CLONEFILE_NOOWNERCOPY;
8715 }
8716
8717 if (VATTR_IS_SUPPORTED(&va, va_mode)) {
8718 VATTR_SET(&nva, va_mode, va.va_mode);
8719 }
8720 if (VATTR_IS_SUPPORTED(&va, va_flags)) {
8721 VATTR_SET(&nva, va_flags,
8722 ((va.va_flags & ~(UF_DATAVAULT | SF_RESTRICTED)) | /* Turn off from source */
8723 (nva.va_flags & (UF_DATAVAULT | SF_RESTRICTED))));
8724 }
8725
8726 #if CONFIG_FILE_LEASES
8727 vnode_breakdirlease(tdvp, false, O_WRONLY);
8728 #endif
8729
8730 error = VNOP_CLONEFILE(fvp, tdvp, &tvp, cnp, &nva, vnop_flags, ctx);
8731
8732 if (!error && tvp) {
8733 int update_flags = 0;
8734 #if CONFIG_FSE
8735 int fsevent;
8736 #endif /* CONFIG_FSE */
8737
8738 /*
8739 * If some of the requested attributes weren't handled by the
8740 * VNOP, use our fallback code.
8741 */
8742 if (!VATTR_ALL_SUPPORTED(&nva)) {
8743 (void)vnode_setattr_fallback(tvp, &nva, ctx);
8744 }
8745
8746 #if CONFIG_MACF
8747 (void)vnode_label(vnode_mount(tvp), tdvp, tvp, cnp,
8748 VNODE_LABEL_CREATE, ctx);
8749 #endif
8750
8751 // Make sure the name & parent pointers are hooked up
8752 if (tvp->v_name == NULL) {
8753 update_flags |= VNODE_UPDATE_NAME;
8754 }
8755 if (tvp->v_parent == NULLVP) {
8756 update_flags |= VNODE_UPDATE_PARENT;
8757 }
8758
8759 if (update_flags) {
8760 (void)vnode_update_identity(tvp, tdvp, cnp->cn_nameptr,
8761 cnp->cn_namelen, cnp->cn_hash, update_flags);
8762 }
8763
8764 #if CONFIG_FSE
8765 switch (vnode_vtype(tvp)) {
8766 case VLNK:
8767 /* FALLTHRU */
8768 case VREG:
8769 fsevent = FSE_CREATE_FILE;
8770 break;
8771 case VDIR:
8772 fsevent = FSE_CREATE_DIR;
8773 break;
8774 default:
8775 goto out;
8776 }
8777
8778 if (need_fsevent(fsevent, tvp)) {
8779 /*
8780 * The following is a sequence of three explicit events.
8781 * A pair of FSE_CLONE events representing the source and destination
8782 * followed by an FSE_CREATE_[FILE | DIR] for the destination.
8783 * fseventsd may coalesce the destination clone and create events
8784 * into a single event resulting in the following sequence for a client
8785 * FSE_CLONE (src)
8786 * FSE_CLONE | FSE_CREATE (dst)
8787 */
8788 add_fsevent(FSE_CLONE, ctx, FSE_ARG_VNODE, fvp, FSE_ARG_VNODE, tvp,
8789 FSE_ARG_DONE);
8790 add_fsevent(fsevent, ctx, FSE_ARG_VNODE, tvp,
8791 FSE_ARG_DONE);
8792 }
8793 #endif /* CONFIG_FSE */
8794 }
8795
8796 out:
8797 if (attr_cleanup) {
8798 vn_attribute_cleanup(&nva, defaulted);
8799 }
8800 if (free_src_acl && va.va_acl) {
8801 kauth_acl_free(va.va_acl);
8802 }
8803 nameidone(&tond);
8804 if (tvp) {
8805 vnode_put(tvp);
8806 }
8807 vnode_put(tdvp);
8808 return error;
8809 }
8810
8811 /*
8812 * clone files or directories, target must not exist.
8813 */
8814 /* ARGSUSED */
8815 int
clonefileat(__unused proc_t p,struct clonefileat_args * uap,__unused int32_t * retval)8816 clonefileat(__unused proc_t p, struct clonefileat_args *uap,
8817 __unused int32_t *retval)
8818 {
8819 vnode_t fvp;
8820 struct nameidata fromnd;
8821 int follow;
8822 int error;
8823 vfs_context_t ctx = vfs_context_current();
8824
8825 /* Check that the flags are valid. */
8826 if (uap->flags & ~(CLONE_NOFOLLOW | CLONE_NOOWNERCOPY | CLONE_ACL)) {
8827 return EINVAL;
8828 }
8829
8830 AUDIT_ARG(fd, uap->src_dirfd);
8831
8832 follow = (uap->flags & CLONE_NOFOLLOW) ? NOFOLLOW : FOLLOW;
8833 NDINIT(&fromnd, LOOKUP, OP_COPYFILE, follow | AUDITVNPATH1,
8834 UIO_USERSPACE, uap->src, ctx);
8835 if ((error = nameiat(&fromnd, uap->src_dirfd))) {
8836 return error;
8837 }
8838
8839 fvp = fromnd.ni_vp;
8840 nameidone(&fromnd);
8841
8842 error = clonefile_internal(fvp, FALSE, uap->dst_dirfd, uap->dst,
8843 uap->flags, ctx);
8844
8845 vnode_put(fvp);
8846 return error;
8847 }
8848
8849 int
fclonefileat(__unused proc_t p,struct fclonefileat_args * uap,__unused int32_t * retval)8850 fclonefileat(__unused proc_t p, struct fclonefileat_args *uap,
8851 __unused int32_t *retval)
8852 {
8853 vnode_t fvp;
8854 struct fileproc *fp;
8855 int error;
8856 vfs_context_t ctx = vfs_context_current();
8857
8858 /* Check that the flags are valid. */
8859 if (uap->flags & ~(CLONE_NOFOLLOW | CLONE_NOOWNERCOPY | CLONE_ACL)) {
8860 return EINVAL;
8861 }
8862
8863 AUDIT_ARG(fd, uap->src_fd);
8864 error = fp_getfvp(p, uap->src_fd, &fp, &fvp);
8865 if (error) {
8866 return error;
8867 }
8868
8869 if ((fp->fp_glob->fg_flag & FREAD) == 0) {
8870 AUDIT_ARG(vnpath_withref, fvp, ARG_VNODE1);
8871 error = EBADF;
8872 goto out;
8873 }
8874
8875 if ((error = vnode_getwithref(fvp))) {
8876 goto out;
8877 }
8878
8879 AUDIT_ARG(vnpath, fvp, ARG_VNODE1);
8880
8881 error = clonefile_internal(fvp, TRUE, uap->dst_dirfd, uap->dst,
8882 uap->flags, ctx);
8883
8884 vnode_put(fvp);
8885 out:
8886 file_drop(uap->src_fd);
8887 return error;
8888 }
8889
8890 static int
rename_submounts_callback(mount_t mp,void * arg)8891 rename_submounts_callback(mount_t mp, void *arg)
8892 {
8893 int error = 0;
8894 mount_t pmp = (mount_t)arg;
8895 int prefix_len = (int)strlen(pmp->mnt_vfsstat.f_mntonname);
8896
8897 if (strncmp(mp->mnt_vfsstat.f_mntonname, pmp->mnt_vfsstat.f_mntonname, prefix_len) != 0) {
8898 return 0;
8899 }
8900
8901 if (mp->mnt_vfsstat.f_mntonname[prefix_len] != '/') {
8902 return 0;
8903 }
8904
8905 if ((error = vfs_busy(mp, LK_NOWAIT))) {
8906 printf("vfs_busy failed with %d for %s\n", error, mp->mnt_vfsstat.f_mntonname);
8907 return -1;
8908 }
8909
8910 size_t pathlen = MAXPATHLEN;
8911 if ((error = vn_getpath_ext(mp->mnt_vnodecovered, NULL, mp->mnt_vfsstat.f_mntonname, &pathlen, VN_GETPATH_FSENTER))) {
8912 printf("vn_getpath_ext failed with %d for mnt_vnodecovered of %s\n", error, mp->mnt_vfsstat.f_mntonname);
8913 }
8914
8915 vfs_unbusy(mp);
8916
8917 return error;
8918 }
8919
8920 /*
8921 * Rename files. Source and destination must either both be directories,
8922 * or both not be directories. If target is a directory, it must be empty.
8923 */
8924 /* ARGSUSED */
/*
 * Common implementation for rename(2), renameat(2) and renameatx_np(2).
 *
 * Looks up both pathnames (relative to fromfd/tofd), authorizes and then
 * performs the rename, handling compound-VNOP continuation (EKEEPLOOKING),
 * mount-point renames, cross-directory serialization via the per-mount
 * rename lock, dataless-file materialization, and several retry races.
 *
 * ctx      caller's VFS context (credentials)
 * fromfd   directory fd that 'from' is resolved against (or AT_FDCWD)
 * from     source pathname (segflg tells user vs. system space)
 * tofd     directory fd that 'to' is resolved against (or AT_FDCWD)
 * to       destination pathname
 * segflg   UIO segment for both paths
 * uflags   RENAME_* user flags (masked to VFS_RENAME_FLAGS_MASK for the FS)
 */
static int
renameat_internal(vfs_context_t ctx, int fromfd, user_addr_t from,
    int tofd, user_addr_t to, int segflg, u_int uflags)
{
	vnode_t tvp, tdvp;
	vnode_t fvp, fdvp;
	vnode_t mnt_fvp;
	struct nameidata *fromnd, *tond;
	int error = 0;
	int do_retry;
	int retry_count;
	int mntrename;
	int need_event;
	int need_kpath2;
	int has_listeners;
	const char *oname = NULL;
	char *from_name = NULL, *to_name = NULL;
	char *from_name_no_firmlink = NULL, *to_name_no_firmlink = NULL;
	int from_len = 0, to_len = 0;
	int from_len_no_firmlink = 0, to_len_no_firmlink = 0;
	int holding_mntlock;
	int vn_authorize_skipped;
	mount_t locked_mp = NULL;
	vnode_t oparent = NULLVP;
#if CONFIG_FSE
	fse_info from_finfo = {}, to_finfo;
#endif
	int from_truncated = 0, to_truncated = 0;
	int from_truncated_no_firmlink = 0, to_truncated_no_firmlink = 0;
	int batched = 0;
	struct vnode_attr *fvap, *tvap;
	int continuing = 0;
	vfs_rename_flags_t flags = uflags & VFS_RENAME_FLAGS_MASK;
	int32_t nofollow_any = 0;
	/* carving out a chunk for structs that are too big to be on stack. */
	struct {
		struct nameidata from_node, to_node;
		struct vnode_attr fv_attr, tv_attr;
	} * __rename_data;

	__rename_data = kalloc_type(typeof(*__rename_data), Z_WAITOK);
	fromnd = &__rename_data->from_node;
	tond = &__rename_data->to_node;

	holding_mntlock = 0;
	do_retry = 0;
	retry_count = 0;
retry:
	/* Reset per-attempt state; we may come back here several times. */
	fvp = tvp = NULL;
	fdvp = tdvp = NULL;
	fvap = tvap = NULL;
	mnt_fvp = NULLVP;
	mntrename = FALSE;
	vn_authorize_skipped = FALSE;

	if (uflags & RENAME_NOFOLLOW_ANY) {
		nofollow_any = NAMEI_NOFOLLOW_ANY;
	}
	/* Source lookup: DELETE-style, keeping the parent (fdvp). */
	NDINIT(fromnd, DELETE, OP_UNLINK, WANTPARENT | AUDITVNPATH1,
	    segflg, from, ctx);
	fromnd->ni_flag = NAMEI_COMPOUNDRENAME | nofollow_any;

	/* Destination lookup: RENAME-style, keeping the parent (tdvp). */
	NDINIT(tond, RENAME, OP_RENAME, WANTPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK,
	    segflg, to, ctx);
	tond->ni_flag = NAMEI_COMPOUNDRENAME | nofollow_any;

continue_lookup:
	/* On EKEEPLOOKING we re-enter here and only redo the flagged lookup(s). */
	if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
		if ((error = nameiat(fromnd, fromfd))) {
			goto out1;
		}
		fdvp = fromnd->ni_dvp;
		fvp = fromnd->ni_vp;

		if (fvp && fvp->v_type == VDIR) {
			tond->ni_cnd.cn_flags |= WILLBEDIR;
		}
	}

	if ((tond->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
		if ((error = nameiat(tond, tofd))) {
			/*
			 * Translate error code for rename("dir1", "dir2/.").
			 */
			if (error == EISDIR && fvp->v_type == VDIR) {
				error = EINVAL;
			}
			goto out1;
		}
		tdvp = tond->ni_dvp;
		tvp = tond->ni_vp;
	}

#if DEVELOPMENT || DEBUG
	/*
	 * XXX VSWAP: Check for entitlements or special flag here
	 * so we can restrict access appropriately.
	 */
#else /* DEVELOPMENT || DEBUG */

	/* Swap-backing files may only be renamed by the kernel itself. */
	if (fromnd->ni_vp && vnode_isswap(fromnd->ni_vp) && (ctx != vfs_context_kernel())) {
		error = EPERM;
		goto out1;
	}

	if (tond->ni_vp && vnode_isswap(tond->ni_vp) && (ctx != vfs_context_kernel())) {
		error = EPERM;
		goto out1;
	}
#endif /* DEVELOPMENT || DEBUG */

	/* RENAME_SWAP requires that the destination already exist. */
	if (!tvp && ISSET(flags, VFS_RENAME_SWAP)) {
		error = ENOENT;
		goto out1;
	}

	if (tvp && ISSET(flags, VFS_RENAME_EXCL)) {
		int32_t pval = 0;
		int err = 0;

		/*
		 * We allow rename with VFS_RENAME_EXCL flag for an existing file which
		 * has the same name as target iff the following conditions are met:
		 * 1. the target file system is case insensitive
		 * 2. source and target directories are the same
		 * 3. source and target files are the same
		 * 4. name only differs in case (determined by underlying filesystem)
		 */
		if (fvp != tvp || fdvp != tdvp) {
			error = EEXIST;
			goto out1;
		}

		/*
		 * Assume that the target file system is case sensitive if
		 * _PC_CASE_SENSITIVE selector isn't supported.
		 */
		err = VNOP_PATHCONF(tvp, _PC_CASE_SENSITIVE, &pval, ctx);
		if (err != 0 || pval != 0) {
			error = EEXIST;
			goto out1;
		}
	}

	/* "batched" == the FS implements the compound (lookup+rename) VNOP. */
	batched = vnode_compound_rename_available(fdvp);

#if CONFIG_FSE
	need_event = need_fsevent(FSE_RENAME, fdvp);
	if (need_event) {
		if (fvp) {
			get_fse_info(fvp, &from_finfo, ctx);
		} else {
			error = vfs_get_notify_attributes(&__rename_data->fv_attr);
			if (error) {
				goto out1;
			}

			fvap = &__rename_data->fv_attr;
		}

		if (tvp) {
			get_fse_info(tvp, &to_finfo, ctx);
		} else if (batched) {
			error = vfs_get_notify_attributes(&__rename_data->tv_attr);
			if (error) {
				goto out1;
			}

			tvap = &__rename_data->tv_attr;
		}
	}
#else
	need_event = 0;
#endif /* CONFIG_FSE */

	has_listeners = kauth_authorize_fileop_has_listeners();

	need_kpath2 = 0;
#if CONFIG_AUDIT
	if (AUDIT_RECORD_EXISTS()) {
		need_kpath2 = 1;
	}
#endif

	/* Build full source paths only if fsevents/kauth listeners need them. */
	if (need_event || has_listeners) {
		if (from_name == NULL) {
			GET_PATH(from_name);
		}

		from_len = safe_getpath(fdvp, fromnd->ni_cnd.cn_nameptr, from_name, MAXPATHLEN, &from_truncated);

		if (from_name_no_firmlink == NULL) {
			GET_PATH(from_name_no_firmlink);
		}

		from_len_no_firmlink = safe_getpath_no_firmlink(fdvp, fromnd->ni_cnd.cn_nameptr, from_name_no_firmlink, MAXPATHLEN, &from_truncated_no_firmlink);
	}

	if (need_event || need_kpath2 || has_listeners) {
		if (to_name == NULL) {
			GET_PATH(to_name);
		}

		to_len = safe_getpath(tdvp, tond->ni_cnd.cn_nameptr, to_name, MAXPATHLEN, &to_truncated);

		if (to_name_no_firmlink == NULL) {
			GET_PATH(to_name_no_firmlink);
		}

		to_len_no_firmlink = safe_getpath_no_firmlink(tdvp, tond->ni_cnd.cn_nameptr, to_name_no_firmlink, MAXPATHLEN, &to_truncated_no_firmlink);
		if (to_name && need_kpath2) {
			AUDIT_ARG(kpath, to_name, ARG_KPATH2);
		}
	}
	if (!fvp) {
		/*
		 * Claim: this check will never reject a valid rename.
		 * For success, either fvp must be on the same mount as tdvp, or fvp must sit atop a vnode on the same mount as tdvp.
		 * Suppose fdvp and tdvp are not on the same mount.
		 * If fvp is on the same mount as tdvp, then fvp is not on the same mount as fdvp, so fvp is the root of its filesystem. If fvp is the root,
		 * then you can't move it to within another dir on the same mountpoint.
		 * If fvp sits atop a vnode on the same mount as fdvp, then that vnode must be part of the same mount as fdvp, which is a contradiction.
		 *
		 * If this check passes, then we are safe to pass these vnodes to the same FS.
		 */
		if (fdvp->v_mount != tdvp->v_mount) {
			error = EXDEV;
			goto out1;
		}
		goto skipped_lookup;
	}

	/*
	 * If the source and destination are the same (i.e. they're
	 * links to the same vnode) and the target file system is
	 * case sensitive, then there is nothing to do.
	 *
	 * XXX Come back to this.
	 */
	if (fvp == tvp) {
		int pathconf_val;

		/*
		 * Note: if _PC_CASE_SENSITIVE selector isn't supported,
		 * then assume that this file system is case sensitive.
		 */
		if (VNOP_PATHCONF(fvp, _PC_CASE_SENSITIVE, &pathconf_val, ctx) != 0 ||
		    pathconf_val != 0) {
			vn_authorize_skipped = TRUE;
			goto out1;
		}
	}

	/*
	 * Allow the renaming of mount points.
	 * - target must not exist
	 * - target must reside in the same directory as source
	 * - union mounts cannot be renamed
	 * - the root fs, and tightly-linked system volumes, cannot be renamed
	 *
	 * XXX Handle this in VFS after a continued lookup (if we missed
	 * in the cache to start off)
	 *
	 * N.B. If RENAME_SWAP is being used, then @tvp != NULL and so
	 * we'll skip past here.  The file system is responsible for
	 * checking that @tvp is not a descendent of @fvp and vice versa
	 * so it should always return EINVAL if either @tvp or @fvp is the
	 * root of a volume.
	 */
	if ((fvp->v_flag & VROOT) &&
	    (fvp->v_type == VDIR) &&
	    (tvp == NULL) &&
	    (fvp->v_mountedhere == NULL) &&
	    (fdvp == tdvp) &&
	    ((fvp->v_mount->mnt_flag & (MNT_UNION | MNT_ROOTFS)) == 0) &&
	    ((fvp->v_mount->mnt_kern_flag & MNTK_SYSTEM) == 0) &&
	    (fvp->v_mount->mnt_vnodecovered != NULLVP)) {
		vnode_t coveredvp;

		/* switch fvp to the covered vnode */
		coveredvp = fvp->v_mount->mnt_vnodecovered;
		if ((vnode_getwithref(coveredvp))) {
			error = ENOENT;
			goto out1;
		}
		/*
		 * Save the 'fvp' as it is needed for vn_authorize_renamex_with_paths()
		 * later.
		 */
		mnt_fvp = fvp;

		fvp = coveredvp;
		mntrename = TRUE;
	}
	/*
	 * Check for cross-device rename.
	 */
	if ((fvp->v_mount != tdvp->v_mount) ||
	    (tvp && (fvp->v_mount != tvp->v_mount))) {
		error = EXDEV;
		goto out1;
	}

	/*
	 * If source is the same as the destination (that is the
	 * same inode number) then there is nothing to do...
	 * EXCEPT if the underlying file system supports case
	 * insensitivity and is case preserving.  In this case
	 * the file system needs to handle the special case of
	 * getting the same vnode as target (fvp) and source (tvp).
	 *
	 * Only file systems that support pathconf selectors _PC_CASE_SENSITIVE
	 * and _PC_CASE_PRESERVING can have this exception, and they need to
	 * handle the special case of getting the same vnode as target and
	 * source.  NOTE: Then the target is unlocked going into vnop_rename,
	 * so not to cause locking problems. There is a single reference on tvp.
	 *
	 * NOTE - that fvp == tvp also occurs if they are hard linked and
	 * that correct behaviour then is just to return success without doing
	 * anything.
	 *
	 * XXX filesystem should take care of this itself, perhaps...
	 */
	if (fvp == tvp && fdvp == tdvp) {
		if (fromnd->ni_cnd.cn_namelen == tond->ni_cnd.cn_namelen &&
		    !bcmp(fromnd->ni_cnd.cn_nameptr, tond->ni_cnd.cn_nameptr,
		    fromnd->ni_cnd.cn_namelen)) {
			vn_authorize_skipped = TRUE;
			goto out1;
		}
	}

	if (holding_mntlock && fvp->v_mount != locked_mp) {
		/*
		 * we're holding a reference and lock
		 * on locked_mp, but it no longer matches
		 * what we want to do... so drop our hold
		 */
		mount_unlock_renames(locked_mp);
		mount_drop(locked_mp, 0);
		holding_mntlock = 0;
	}
	if (tdvp != fdvp && fvp->v_type == VDIR) {
		/*
		 * serialize renames that re-shape
		 * the tree... if holding_mntlock is
		 * set, then we're ready to go...
		 * otherwise we
		 * first need to drop the iocounts
		 * we picked up, second take the
		 * lock to serialize the access,
		 * then finally start the lookup
		 * process over with the lock held
		 */
		if (!holding_mntlock) {
			/*
			 * need to grab a reference on
			 * the mount point before we
			 * drop all the iocounts... once
			 * the iocounts are gone, the mount
			 * could follow
			 */
			locked_mp = fvp->v_mount;
			mount_ref(locked_mp, 0);

			/*
			 * nameidone has to happen before we vnode_put(tvp)
			 * since it may need to release the fs_nodelock on the tvp
			 */
			nameidone(tond);

			if (tvp) {
				vnode_put(tvp);
			}
			vnode_put(tdvp);

			/*
			 * nameidone has to happen before we vnode_put(fdvp)
			 * since it may need to release the fs_nodelock on the fvp
			 */
			nameidone(fromnd);

			vnode_put(fvp);
			vnode_put(fdvp);

			if (mnt_fvp != NULLVP) {
				vnode_put(mnt_fvp);
			}

			mount_lock_renames(locked_mp);
			holding_mntlock = 1;

			goto retry;
		}
	} else {
		/*
		 * when we dropped the iocounts to take
		 * the lock, we allowed the identity of
		 * the various vnodes to change... if they did,
		 * we may no longer be dealing with a rename
		 * that reshapes the tree... once we're holding
		 * the iocounts, the vnodes can't change type
		 * so we're free to drop the lock at this point
		 * and continue on
		 */
		if (holding_mntlock) {
			mount_unlock_renames(locked_mp);
			mount_drop(locked_mp, 0);
			holding_mntlock = 0;
		}
	}

	/* For non-compound VNOPs, VFS authorizes here; compound FSes do it themselves. */
	if (!batched) {
		error = vn_authorize_renamex_with_paths(fdvp, mntrename ? mnt_fvp : fvp,
		    &fromnd->ni_cnd, from_name, tdvp, tvp, &tond->ni_cnd, to_name, ctx,
		    flags, NULL);
		if (error) {
			if (error == ENOENT) {
				if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
					/*
					 * We encountered a race where after doing the namei,
					 * tvp stops being valid. If so, simply re-drive the rename
					 * call from the top.
					 */
					do_retry = 1;
					retry_count += 1;
				}
			}
			goto out1;
		}
	}

	/* Release the 'mnt_fvp' now that it is no longer needed. */
	if (mnt_fvp != NULLVP) {
		vnode_put(mnt_fvp);
		mnt_fvp = NULLVP;
	}

	// save these off so we can later verify that fvp is the same
	oname = fvp->v_name;
	oparent = fvp->v_parent;

skipped_lookup:
#if CONFIG_FILE_LEASES
	/* Lease break needed for source's parent dir? */
	vnode_breakdirlease(fdvp, false, O_WRONLY);

	/* Lease break needed for target's parent dir? */
	vnode_breakdirlease(tdvp, false, O_WRONLY);
#endif

	error = vn_rename(fdvp, &fvp, &fromnd->ni_cnd, fvap,
	    tdvp, &tvp, &tond->ni_cnd, tvap,
	    flags, ctx);

	if (holding_mntlock) {
		/*
		 * we can drop our serialization
		 * lock now
		 */
		mount_unlock_renames(locked_mp);
		mount_drop(locked_mp, 0);
		holding_mntlock = 0;
	}
	if (error) {
		if (error == EDATALESS) {
			/*
			 * If we've been here before, something has gone
			 * horribly wrong and we should just get out lest
			 * we spiral around the drain forever.
			 */
			if (flags & VFS_RENAME_DATALESS) {
				error = EIO;
				goto out1;
			}

			/*
			 * The object we're renaming is dataless (or has a
			 * dataless descendent) and requires materialization
			 * before the rename occurs.  But we're holding the
			 * mount point's rename lock, so it's not safe to
			 * make the upcall.
			 *
			 * In this case, we release the lock, perform the
			 * materialization, and start the whole thing over.
			 */
			error = vnode_materialize_dataless_file(fvp,
			    NAMESPACE_HANDLER_RENAME_OP);

			if (error == 0) {
				/*
				 * The next time around we need to tell the
				 * file system that the materializtaion has
				 * been performed.
				 */
				flags |= VFS_RENAME_DATALESS;
				do_retry = 1;
			}
			goto out1;
		}
		if (error == EKEEPLOOKING) {
			if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) == 0) {
				if ((tond->ni_flag & NAMEI_CONTLOOKUP) == 0) {
					panic("EKEEPLOOKING without NAMEI_CONTLOOKUP on either ndp?");
				}
			}

			fromnd->ni_vp = fvp;
			tond->ni_vp = tvp;

			goto continue_lookup;
		}

		/*
		 * We may encounter a race in the VNOP where the destination didn't
		 * exist when we did the namei, but it does by the time we go and
		 * try to create the entry. In this case, we should re-drive this rename
		 * call from the top again.  Currently, only HFS bubbles out ERECYCLE,
		 * but other filesystems susceptible to this race could return it, too.
		 */
		if (error == ERECYCLE) {
			if (retry_count < MAX_RENAME_ERECYCLE_RETRIES) {
				do_retry = 1;
				retry_count += 1;
			} else {
				printf("rename retry limit due to ERECYCLE reached\n");
				error = ENOENT;
			}
		}

		/*
		 * For compound VNOPs, the authorization callback may return
		 * ENOENT in case of racing hardlink lookups hitting the name
		 * cache, redrive the lookup.
		 */
		if (batched && error == ENOENT) {
			if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
				do_retry = 1;
				retry_count += 1;
			}
		}

		goto out1;
	}

	/* call out to allow 3rd party notification of rename.
	 * Ignore result of kauth_authorize_fileop call.
	 */
	kauth_authorize_fileop(vfs_context_ucred(ctx),
	    KAUTH_FILEOP_RENAME,
	    (uintptr_t)from_name, (uintptr_t)to_name);
	if (flags & VFS_RENAME_SWAP) {
		kauth_authorize_fileop(vfs_context_ucred(ctx),
		    KAUTH_FILEOP_RENAME,
		    (uintptr_t)to_name, (uintptr_t)from_name);
	}

#if CONFIG_FSE
	if (from_name != NULL && to_name != NULL) {
		if (from_truncated || to_truncated) {
			// set it here since only the from_finfo gets reported up to user space
			from_finfo.mode |= FSE_TRUNCATED_PATH;
		}

		if (tvap && tvp) {
			vnode_get_fse_info_from_vap(tvp, &to_finfo, tvap);
		}
		if (fvap) {
			vnode_get_fse_info_from_vap(fvp, &from_finfo, fvap);
		}

		if (tvp) {
			add_fsevent(FSE_RENAME, ctx,
			    FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
			    FSE_ARG_FINFO, &from_finfo,
			    FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
			    FSE_ARG_FINFO, &to_finfo,
			    FSE_ARG_DONE);
			if (flags & VFS_RENAME_SWAP) {
				/*
				 * Strictly speaking, swap is the equivalent of
				 * *three* renames.  FSEvents clients should only take
				 * the events as a hint, so we only bother reporting
				 * two.
				 */
				add_fsevent(FSE_RENAME, ctx,
				    FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
				    FSE_ARG_FINFO, &to_finfo,
				    FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
				    FSE_ARG_FINFO, &from_finfo,
				    FSE_ARG_DONE);
			}
		} else {
			add_fsevent(FSE_RENAME, ctx,
			    FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
			    FSE_ARG_FINFO, &from_finfo,
			    FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
			    FSE_ARG_DONE);
		}
	}
#endif /* CONFIG_FSE */

	/*
	 * update filesystem's mount point data
	 */
	if (mntrename) {
		char *cp, *pathend, *mpname;
		char * tobuf;
		struct mount *mp;
		int maxlen;
		size_t len = 0;

		mp = fvp->v_mountedhere;

		if (vfs_busy(mp, LK_NOWAIT)) {
			error = EBUSY;
			goto out1;
		}
		tobuf = zalloc(ZV_NAMEI);

		if (UIO_SEG_IS_USER_SPACE(segflg)) {
			error = copyinstr(to, tobuf, MAXPATHLEN, &len);
		} else {
			error = copystr((void *)to, tobuf, MAXPATHLEN, &len);
		}
		if (!error) {
			/* find current mount point prefix */
			pathend = &mp->mnt_vfsstat.f_mntonname[0];
			for (cp = pathend; *cp != '\0'; ++cp) {
				if (*cp == '/') {
					pathend = cp + 1;
				}
			}
			/* find last component of target name */
			for (mpname = cp = tobuf; *cp != '\0'; ++cp) {
				if (*cp == '/') {
					mpname = cp + 1;
				}
			}

			/* Update f_mntonname of sub mounts */
			vfs_iterate(0, rename_submounts_callback, (void *)mp);

			/* append name to prefix */
			maxlen = MAXPATHLEN - (int)(pathend - mp->mnt_vfsstat.f_mntonname);
			bzero(pathend, maxlen);

			strlcpy(pathend, mpname, maxlen);
		}
		zfree(ZV_NAMEI, tobuf);

		vfs_unbusy(mp);

		vfs_event_signal(NULL, VQ_UPDATE, (intptr_t)NULL);
	}
	/*
	 * fix up name & parent pointers.  note that we first
	 * check that fvp has the same name/parent pointers it
	 * had before the rename call... this is a 'weak' check
	 * at best...
	 *
	 * XXX oparent and oname may not be set in the compound vnop case
	 */
	if (batched || (oname == fvp->v_name && oparent == fvp->v_parent)) {
		int update_flags;

		update_flags = VNODE_UPDATE_NAME;

		if (fdvp != tdvp) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		vnode_update_identity(fvp, tdvp, tond->ni_cnd.cn_nameptr, tond->ni_cnd.cn_namelen, tond->ni_cnd.cn_hash, update_flags);
	}
out1:
	/*
	 * There are some cases (for e.g. 'fvp == tvp') when vn_authorize was
	 * skipped earlier as no actual rename was performed.
	 */
	if (vn_authorize_skipped && error == 0) {
		error = vn_authorize_renamex_with_paths(fdvp, fvp,
		    &fromnd->ni_cnd, from_name, tdvp, tvp, &tond->ni_cnd, to_name, ctx,
		    flags, NULL);
		if (error && error == ENOENT) {
			if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
				do_retry = 1;
				retry_count += 1;
			}
		}
	}
	/* Release any path buffers taken above (re-acquired on retry). */
	if (to_name != NULL) {
		RELEASE_PATH(to_name);
		to_name = NULL;
	}
	if (to_name_no_firmlink != NULL) {
		RELEASE_PATH(to_name_no_firmlink);
		to_name_no_firmlink = NULL;
	}
	if (from_name != NULL) {
		RELEASE_PATH(from_name);
		from_name = NULL;
	}
	if (from_name_no_firmlink != NULL) {
		RELEASE_PATH(from_name_no_firmlink);
		from_name_no_firmlink = NULL;
	}
	if (holding_mntlock) {
		mount_unlock_renames(locked_mp);
		mount_drop(locked_mp, 0);
		holding_mntlock = 0;
	}
	if (tdvp) {
		/*
		 * nameidone has to happen before we vnode_put(tdvp)
		 * since it may need to release the fs_nodelock on the tdvp
		 */
		nameidone(tond);

		if (tvp) {
			vnode_put(tvp);
		}
		vnode_put(tdvp);
	}
	if (fdvp) {
		/*
		 * nameidone has to happen before we vnode_put(fdvp)
		 * since it may need to release the fs_nodelock on the fdvp
		 */
		nameidone(fromnd);

		if (fvp) {
			vnode_put(fvp);
		}
		vnode_put(fdvp);
	}
	if (mnt_fvp != NULLVP) {
		vnode_put(mnt_fvp);
	}
	/*
	 * If things changed after we did the namei, then we will re-drive
	 * this rename call from the top.
	 */
	if (do_retry) {
		do_retry = 0;
		goto retry;
	}

	kfree_type(typeof(*__rename_data), __rename_data);
	return error;
}
9675
9676 int
rename(__unused proc_t p,struct rename_args * uap,__unused int32_t * retval)9677 rename(__unused proc_t p, struct rename_args *uap, __unused int32_t *retval)
9678 {
9679 return renameat_internal(vfs_context_current(), AT_FDCWD, uap->from,
9680 AT_FDCWD, uap->to, UIO_USERSPACE, 0);
9681 }
9682
9683 int
renameatx_np(__unused proc_t p,struct renameatx_np_args * uap,__unused int32_t * retval)9684 renameatx_np(__unused proc_t p, struct renameatx_np_args *uap, __unused int32_t *retval)
9685 {
9686 if (uap->flags & ~(RENAME_SECLUDE | RENAME_EXCL | RENAME_SWAP | RENAME_NOFOLLOW_ANY)) {
9687 return EINVAL;
9688 }
9689
9690 if ((uap->flags & (RENAME_EXCL | RENAME_SWAP)) == (RENAME_EXCL | RENAME_SWAP)) {
9691 return EINVAL;
9692 }
9693
9694 return renameat_internal(vfs_context_current(), uap->fromfd, uap->from,
9695 uap->tofd, uap->to, UIO_USERSPACE, uap->flags);
9696 }
9697
9698 int
renameat(__unused proc_t p,struct renameat_args * uap,__unused int32_t * retval)9699 renameat(__unused proc_t p, struct renameat_args *uap, __unused int32_t *retval)
9700 {
9701 return renameat_internal(vfs_context_current(), uap->fromfd, uap->from,
9702 uap->tofd, uap->to, UIO_USERSPACE, 0);
9703 }
9704
9705 /*
9706 * Make a directory file.
9707 *
9708 * Returns: 0 Success
9709 * EEXIST
9710 * namei:???
9711 * vnode_authorize:???
9712 * vn_create:???
9713 */
9714 /* ARGSUSED */
/*
 * Common implementation for mkdir(2), mkdirat(2) and mkdir_extended(2).
 *
 * Looks up 'path' relative to 'fd' (or AT_FDCWD), authorizes, and creates
 * the directory with the attributes in 'vap'.  Supports compound-VNOP
 * filesystems (lookup+mkdir in one call), re-entering the lookup on
 * EKEEPLOOKING.
 *
 * ctx     caller's VFS context (credentials)
 * path    pathname of the directory to create
 * vap     attributes to apply (va_mode must be set by the caller)
 * fd      directory fd 'path' is resolved against (or AT_FDCWD)
 * segflg  UIO segment of 'path' (user vs. system space)
 */
static int
mkdir1at(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap, int fd,
    enum uio_seg segflg)
{
	vnode_t vp, dvp;
	int error;
	int update_flags = 0;
	int batched;
	struct nameidata nd;

	AUDIT_ARG(mode, vap->va_mode);
	NDINIT(&nd, CREATE, OP_MKDIR, LOCKPARENT | AUDITVNPATH1, segflg,
	    path, ctx);
	nd.ni_cnd.cn_flags |= WILLBEDIR;
	nd.ni_flag = NAMEI_COMPOUNDMKDIR;

continue_lookup:
	/* Re-entered on EKEEPLOOKING from a compound mkdir VNOP. */
	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* A non-NULL vp means the target already exists. */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}

	batched = vnode_compound_mkdir_available(dvp);

	VATTR_SET(vap, va_type, VDIR);

	/*
	 * XXX
	 * Don't authorize in VFS for compound VNOP.... mkdir -p today assumes that it will
	 * only get EXISTS or EISDIR for existing path components, and not that it could see
	 * EACCESS/EPERM--so if we authorize for mkdir on "/" for "mkdir -p /tmp/foo/bar/baz"
	 * it will fail in a spurious manner.  Need to figure out if this is valid behavior.
	 */
	if ((error = vn_authorize_mkdir(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0) {
		if (error == EACCES || error == EPERM) {
			int error2;

			nameidone(&nd);
			vnode_put(dvp);
			dvp = NULLVP;

			/*
			 * Try a lookup without "NAMEI_COMPOUNDVNOP" to make sure we return EEXIST
			 * rather than EACCESS if the target exists.
			 */
			NDINIT(&nd, LOOKUP, OP_MKDIR, AUDITVNPATH1, segflg,
			    path, ctx);
			error2 = nameiat(&nd, fd);
			if (error2) {
				goto out;
			} else {
				/* Target exists: report EEXIST, not the auth failure. */
				vp = nd.ni_vp;
				error = EEXIST;
				goto out;
			}
		}

		goto out;
	}

#if CONFIG_FILE_LEASES
	/* Creating an entry writes the parent dir; break any dir lease. */
	vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

	/*
	 * make the directory
	 */
	if ((error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx)) != 0) {
		if (error == EKEEPLOOKING) {
			nd.ni_vp = vp;
			goto continue_lookup;
		}

		goto out;
	}

	// Make sure the name & parent pointers are hooked up
	if (vp->v_name == NULL) {
		update_flags |= VNODE_UPDATE_NAME;
	}
	if (vp->v_parent == NULLVP) {
		update_flags |= VNODE_UPDATE_PARENT;
	}

	if (update_flags) {
		vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
	}

#if CONFIG_FSE
	add_fsevent(FSE_CREATE_DIR, ctx, FSE_ARG_VNODE, vp, FSE_ARG_DONE);
#endif

out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	if (dvp) {
		vnode_put(dvp);
	}

	return error;
}
9830
9831 /*
9832 * mkdir_extended: Create a directory; with extended security (ACL).
9833 *
9834 * Parameters: p Process requesting to create the directory
9835 * uap User argument descriptor (see below)
9836 * retval (ignored)
9837 *
9838 * Indirect: uap->path Path of directory to create
9839 * uap->mode Access permissions to set
9840 * uap->xsecurity ACL to set
9841 *
9842 * Returns: 0 Success
9843 * !0 Not success
9844 *
9845 */
9846 int
mkdir_extended(proc_t p,struct mkdir_extended_args * uap,__unused int32_t * retval)9847 mkdir_extended(proc_t p, struct mkdir_extended_args *uap, __unused int32_t *retval)
9848 {
9849 int ciferror;
9850 kauth_filesec_t xsecdst;
9851 struct vnode_attr va;
9852
9853 AUDIT_ARG(owner, uap->uid, uap->gid);
9854
9855 xsecdst = NULL;
9856 if ((uap->xsecurity != USER_ADDR_NULL) &&
9857 ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)) {
9858 return ciferror;
9859 }
9860
9861 VATTR_INIT(&va);
9862 VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd.fd_cmask);
9863 if (xsecdst != NULL) {
9864 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
9865 va.va_vaflags |= VA_FILESEC_ACL;
9866 }
9867
9868 ciferror = mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
9869 UIO_USERSPACE);
9870 if (xsecdst != NULL) {
9871 kauth_filesec_free(xsecdst);
9872 }
9873 return ciferror;
9874 }
9875
9876 int
mkdir(proc_t p,struct mkdir_args * uap,__unused int32_t * retval)9877 mkdir(proc_t p, struct mkdir_args *uap, __unused int32_t *retval)
9878 {
9879 struct vnode_attr va;
9880
9881 VATTR_INIT(&va);
9882 VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd.fd_cmask);
9883
9884 return mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
9885 UIO_USERSPACE);
9886 }
9887
9888 int
mkdirat(proc_t p,struct mkdirat_args * uap,__unused int32_t * retval)9889 mkdirat(proc_t p, struct mkdirat_args *uap, __unused int32_t *retval)
9890 {
9891 struct vnode_attr va;
9892
9893 VATTR_INIT(&va);
9894 VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd.fd_cmask);
9895
9896 return mkdir1at(vfs_context_current(), uap->path, &va, uap->fd,
9897 UIO_USERSPACE);
9898 }
9899
9900 static int
rmdirat_internal(vfs_context_t ctx,int fd,user_addr_t dirpath,enum uio_seg segflg,int unlink_flags)9901 rmdirat_internal(vfs_context_t ctx, int fd, user_addr_t dirpath,
9902 enum uio_seg segflg, int unlink_flags)
9903 {
9904 struct {
9905 struct nameidata nd;
9906 #if CONFIG_FSE
9907 struct vnode_attr va;
9908 #endif /* CONFIG_FSE */
9909 } *__rmdir_data;
9910 vnode_t vp, dvp;
9911 int error;
9912 struct nameidata *ndp;
9913 char *path = NULL;
9914 char *no_firmlink_path = NULL;
9915 int len_path = 0;
9916 int len_no_firmlink_path = 0;
9917 int has_listeners = 0;
9918 int need_event = 0;
9919 int truncated_path = 0;
9920 int truncated_no_firmlink_path = 0;
9921 struct vnode_attr *vap = NULL;
9922 int restart_count = 0;
9923 int batched;
9924
9925 int restart_flag;
9926
9927 __rmdir_data = kalloc_type(typeof(*__rmdir_data), Z_WAITOK);
9928 ndp = &__rmdir_data->nd;
9929
9930 /*
9931 * This loop exists to restart rmdir in the unlikely case that two
9932 * processes are simultaneously trying to remove the same directory
9933 * containing orphaned appleDouble files.
9934 */
9935 do {
9936 NDINIT(ndp, DELETE, OP_RMDIR, LOCKPARENT | AUDITVNPATH1,
9937 segflg, dirpath, ctx);
9938 ndp->ni_flag = NAMEI_COMPOUNDRMDIR;
9939 continue_lookup:
9940 restart_flag = 0;
9941 vap = NULL;
9942
9943 error = nameiat(ndp, fd);
9944 if (error) {
9945 goto err_out;
9946 }
9947
9948 dvp = ndp->ni_dvp;
9949 vp = ndp->ni_vp;
9950
9951 if (vp) {
9952 batched = vnode_compound_rmdir_available(vp);
9953
9954 if (vp->v_flag & VROOT) {
9955 /*
9956 * The root of a mounted filesystem cannot be deleted.
9957 */
9958 error = EBUSY;
9959 goto out;
9960 }
9961
9962 #if DEVELOPMENT || DEBUG
9963 /*
9964 * XXX VSWAP: Check for entitlements or special flag here
9965 * so we can restrict access appropriately.
9966 */
9967 #else /* DEVELOPMENT || DEBUG */
9968
9969 if (vnode_isswap(vp) && (ctx != vfs_context_kernel())) {
9970 error = EPERM;
9971 goto out;
9972 }
9973 #endif /* DEVELOPMENT || DEBUG */
9974
9975 /*
9976 * Removed a check here; we used to abort if vp's vid
9977 * was not the same as what we'd seen the last time around.
9978 * I do not think that check was valid, because if we retry
9979 * and all dirents are gone, the directory could legitimately
9980 * be recycled but still be present in a situation where we would
9981 * have had permission to delete. Therefore, we won't make
9982 * an effort to preserve that check now that we may not have a
9983 * vp here.
9984 */
9985
9986 if (!batched) {
9987 error = vn_authorize_rmdir(dvp, vp, &ndp->ni_cnd, ctx, NULL);
9988 if (error) {
9989 if (error == ENOENT) {
9990 if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
9991 restart_flag = 1;
9992 restart_count += 1;
9993 }
9994 }
9995 goto out;
9996 }
9997 }
9998 } else {
9999 batched = 1;
10000
10001 if (!vnode_compound_rmdir_available(dvp)) {
10002 panic("No error, but no compound rmdir?");
10003 }
10004 }
10005
10006 #if CONFIG_FSE
10007 fse_info finfo = {0};
10008
10009 need_event = need_fsevent(FSE_DELETE, dvp);
10010 if (need_event) {
10011 if (!batched) {
10012 get_fse_info(vp, &finfo, ctx);
10013 } else {
10014 error = vfs_get_notify_attributes(&__rmdir_data->va);
10015 if (error) {
10016 goto out;
10017 }
10018
10019 vap = &__rmdir_data->va;
10020 }
10021 }
10022 #endif
10023 has_listeners = kauth_authorize_fileop_has_listeners();
10024 if (need_event || has_listeners) {
10025 if (path == NULL) {
10026 GET_PATH(path);
10027 }
10028
10029 len_path = safe_getpath(dvp, ndp->ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path);
10030
10031 if (no_firmlink_path == NULL) {
10032 GET_PATH(no_firmlink_path);
10033 }
10034
10035 len_no_firmlink_path = safe_getpath_no_firmlink(dvp, ndp->ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);
10036 #if CONFIG_FSE
10037 if (truncated_no_firmlink_path) {
10038 finfo.mode |= FSE_TRUNCATED_PATH;
10039 }
10040 #endif
10041 }
10042
10043 #if CONFIG_FILE_LEASES
10044 vnode_breakdirlease(dvp, false, O_WRONLY);
10045 #endif
10046
10047 error = vn_rmdir(dvp, &vp, ndp, vap, ctx);
10048 ndp->ni_vp = vp;
10049 if (vp == NULLVP) {
10050 /* Couldn't find a vnode */
10051 goto out;
10052 }
10053
10054 if (error == EKEEPLOOKING) {
10055 goto continue_lookup;
10056 } else if (batched && error == ENOENT) {
10057 if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
10058 /*
10059 * For compound VNOPs, the authorization callback
10060 * may return ENOENT in case of racing hard link lookups
10061 * redrive the lookup.
10062 */
10063 restart_flag = 1;
10064 restart_count += 1;
10065 goto out;
10066 }
10067 }
10068
10069 /*
10070 * XXX There's no provision for passing flags
10071 * to VNOP_RMDIR(). So, if vn_rmdir() fails
10072 * because it's not empty, then we try again
10073 * with VNOP_REMOVE(), passing in a special
10074 * flag that clever file systems will know
10075 * how to handle.
10076 */
10077 if (error == ENOTEMPTY &&
10078 (unlink_flags & VNODE_REMOVE_DATALESS_DIR) != 0) {
10079 /*
10080 * If this fails, we want to keep the original
10081 * error.
10082 */
10083 if (vn_remove(dvp, &vp, ndp,
10084 VNODE_REMOVE_DATALESS_DIR, vap, ctx) == 0) {
10085 error = 0;
10086 }
10087 }
10088
10089 #if CONFIG_APPLEDOUBLE
10090 /*
10091 * Special case to remove orphaned AppleDouble
10092 * files. I don't like putting this in the kernel,
10093 * but carbon does not like putting this in carbon either,
10094 * so here we are.
10095 */
10096 if (error == ENOTEMPTY) {
10097 int ad_error = rmdir_remove_orphaned_appleDouble(vp, ctx, &restart_flag);
10098 if (ad_error == EBUSY) {
10099 error = ad_error;
10100 goto out;
10101 }
10102
10103
10104 /*
10105 * Assuming everything went well, we will try the RMDIR again
10106 */
10107 if (!ad_error) {
10108 error = vn_rmdir(dvp, &vp, ndp, vap, ctx);
10109 }
10110 }
10111 #endif /* CONFIG_APPLEDOUBLE */
10112 /*
10113 * Call out to allow 3rd party notification of delete.
10114 * Ignore result of kauth_authorize_fileop call.
10115 */
10116 if (!error) {
10117 if (has_listeners) {
10118 kauth_authorize_fileop(vfs_context_ucred(ctx),
10119 KAUTH_FILEOP_DELETE,
10120 (uintptr_t)vp,
10121 (uintptr_t)path);
10122 }
10123
10124 if (vp->v_flag & VISHARDLINK) {
10125 // see the comment in unlink1() about why we update
10126 // the parent of a hard link when it is removed
10127 vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
10128 }
10129
10130 #if CONFIG_FSE
10131 if (need_event) {
10132 if (vap) {
10133 vnode_get_fse_info_from_vap(vp, &finfo, vap);
10134 }
10135 add_fsevent(FSE_DELETE, ctx,
10136 FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
10137 FSE_ARG_FINFO, &finfo,
10138 FSE_ARG_DONE);
10139 }
10140 #endif
10141
10142 #if CONFIG_MACF
10143 mac_vnode_notify_unlink(ctx, dvp, vp, &ndp->ni_cnd);
10144 #endif
10145 }
10146
10147 out:
10148 if (path != NULL) {
10149 RELEASE_PATH(path);
10150 path = NULL;
10151 }
10152
10153 if (no_firmlink_path != NULL) {
10154 RELEASE_PATH(no_firmlink_path);
10155 no_firmlink_path = NULL;
10156 }
10157
10158 /*
10159 * nameidone has to happen before we vnode_put(dvp)
10160 * since it may need to release the fs_nodelock on the dvp
10161 */
10162 nameidone(ndp);
10163 vnode_put(dvp);
10164
10165 if (vp) {
10166 vnode_put(vp);
10167 }
10168
10169 if (restart_flag == 0) {
10170 wakeup_one((caddr_t)vp);
10171 goto err_out;
10172 }
10173 tsleep(vp, PVFS, "rm AD", 1);
10174 } while (restart_flag != 0);
10175
10176 err_out:
10177 kfree_type(typeof(*__rmdir_data), __rmdir_data);
10178
10179 return error;
10180 }
10181
10182 /*
10183 * Remove a directory file.
10184 */
10185 /* ARGSUSED */
10186 int
rmdir(__unused proc_t p,struct rmdir_args * uap,__unused int32_t * retval)10187 rmdir(__unused proc_t p, struct rmdir_args *uap, __unused int32_t *retval)
10188 {
10189 return rmdirat_internal(vfs_context_current(), AT_FDCWD,
10190 CAST_USER_ADDR_T(uap->path), UIO_USERSPACE, 0);
10191 }
10192
/*
 * Get direntry (the extended, 64-bit struct direntry form) length for a
 * name of namlen bytes, padded to 8 byte alignment.
 */
#define DIRENT64_LEN(namlen) \
	((sizeof(struct direntry) + (namlen) - (MAXPATHLEN-1) + 7) & ~7)

/*
 * Get dirent (the legacy struct dirent form) length for a name of namelen
 * bytes (plus its NUL terminator), padded to 4 byte alignment.
 */
#define DIRENT_LEN(namelen) \
	((sizeof(struct dirent) + (namelen + 1) - (__DARWIN_MAXNAMLEN + 1) + 3) & ~3)

/* Get the end of this dirent (address of its last valid byte, per d_reclen) */
#define DIRENT_END(dep) \
	(((char *)(dep)) + (dep)->d_reclen - 1)
10204
10205 errno_t
vnode_readdir64(struct vnode * vp,struct uio * uio,int flags,int * eofflag,int * numdirent,vfs_context_t ctxp)10206 vnode_readdir64(struct vnode *vp, struct uio *uio, int flags, int *eofflag,
10207 int *numdirent, vfs_context_t ctxp)
10208 {
10209 /* Check if fs natively supports VNODE_READDIR_EXTENDED */
10210 if ((vp->v_mount->mnt_vtable->vfc_vfsflags & VFC_VFSREADDIR_EXTENDED) &&
10211 ((vp->v_mount->mnt_kern_flag & MNTK_DENY_READDIREXT) == 0)) {
10212 return VNOP_READDIR(vp, uio, flags, eofflag, numdirent, ctxp);
10213 } else {
10214 size_t bufsize;
10215 void * bufptr;
10216 uio_t auio;
10217 struct direntry *entry64;
10218 struct dirent *dep;
10219 size_t bytesread;
10220 int error;
10221
10222 /*
10223 * We're here because the underlying file system does not
10224 * support direnties or we mounted denying support so we must
10225 * fall back to dirents and convert them to direntries.
10226 *
10227 * Our kernel buffer needs to be smaller since re-packing will
10228 * expand each dirent. The worse case (when the name length
10229 * is 3 or less) corresponds to a struct direntry size of 32
10230 * bytes (8-byte aligned) and a struct dirent size of 12 bytes
10231 * (4-byte aligned). So having a buffer that is 3/8 the size
10232 * will prevent us from reading more than we can pack.
10233 *
10234 * Since this buffer is wired memory, we will limit the
10235 * buffer size to a maximum of 32K. We would really like to
10236 * use 32K in the MIN(), but we use magic number 87371 to
10237 * prevent uio_resid() * 3 / 8 from overflowing.
10238 */
10239 bufsize = 3 * MIN((user_size_t)uio_resid(uio), 87371u) / 8;
10240 bufptr = kalloc_data(bufsize, Z_WAITOK);
10241 if (bufptr == NULL) {
10242 return ENOMEM;
10243 }
10244
10245 auio = uio_create(1, 0, UIO_SYSSPACE, UIO_READ);
10246 uio_addiov(auio, (uintptr_t)bufptr, bufsize);
10247 auio->uio_offset = uio->uio_offset;
10248
10249 error = VNOP_READDIR(vp, auio, 0, eofflag, numdirent, ctxp);
10250
10251 dep = (struct dirent *)bufptr;
10252 bytesread = bufsize - uio_resid(auio);
10253
10254 entry64 = kalloc_type(struct direntry, Z_WAITOK);
10255 /*
10256 * Convert all the entries and copy them out to user's buffer.
10257 */
10258 while (error == 0 && (char *)dep < ((char *)bufptr + bytesread)) {
10259 /* First check that the dirent struct up to d_name is within the buffer */
10260 if ((char*)dep + offsetof(struct dirent, d_name) > ((char *)bufptr + bytesread) ||
10261 /* Check that the length of the entire dirent is within the buffer */
10262 DIRENT_END(dep) > ((char *)bufptr + bytesread) ||
10263 /* Check that the actual length including the name doesn't exceed d_reclen */
10264 DIRENT_LEN(dep->d_namlen) > dep->d_reclen) {
10265 printf("%s: %s: Bad dirent recived from directory %s\n", __func__,
10266 vp->v_mount->mnt_vfsstat.f_mntonname,
10267 vp->v_name ? vp->v_name : "<unknown>");
10268 error = EIO;
10269 break;
10270 }
10271
10272 size_t enbufsize = DIRENT64_LEN(dep->d_namlen);
10273
10274 bzero(entry64, enbufsize);
10275 /* Convert a dirent to a dirent64. */
10276 entry64->d_ino = dep->d_ino;
10277 entry64->d_seekoff = 0;
10278 entry64->d_reclen = (uint16_t)enbufsize;
10279 entry64->d_namlen = dep->d_namlen;
10280 entry64->d_type = dep->d_type;
10281 bcopy(dep->d_name, entry64->d_name, dep->d_namlen + 1);
10282
10283 /* Move to next entry. */
10284 dep = (struct dirent *)((char *)dep + dep->d_reclen);
10285
10286 /* Copy entry64 to user's buffer. */
10287 error = uiomove((caddr_t)entry64, entry64->d_reclen, uio);
10288 }
10289
10290 /* Update the real offset using the offset we got from VNOP_READDIR. */
10291 if (error == 0) {
10292 uio->uio_offset = auio->uio_offset;
10293 }
10294 uio_free(auio);
10295 kfree_data(bufptr, bufsize);
10296 kfree_type(struct direntry, entry64);
10297 return error;
10298 }
10299 }
10300
/* Cap for a single getdirentries request: 128 MiB of user buffer. */
#define GETDIRENTRIES_MAXBUFSIZE (128 * 1024 * 1024U)
10302
10303 /*
10304 * Read a block of directory entries in a file system independent format.
10305 */
10306 static int
getdirentries_common(int fd,user_addr_t bufp,user_size_t bufsize,ssize_t * bytesread,off_t * offset,int * eofflag,int flags)10307 getdirentries_common(int fd, user_addr_t bufp, user_size_t bufsize, ssize_t *bytesread,
10308 off_t *offset, int *eofflag, int flags)
10309 {
10310 vnode_t vp;
10311 struct vfs_context context = *vfs_context_current(); /* local copy */
10312 struct fileproc *fp;
10313 uio_t auio;
10314 int spacetype = proc_is64bit(vfs_context_proc(&context)) ? UIO_USERSPACE64 : UIO_USERSPACE32;
10315 off_t loff;
10316 int error, numdirent;
10317 uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];
10318
10319 get_from_fd:
10320 error = fp_getfvp(vfs_context_proc(&context), fd, &fp, &vp);
10321 if (error) {
10322 return error;
10323 }
10324
10325 vn_offset_lock(fp->fp_glob);
10326 if (((vnode_t)fp_get_data(fp)) != vp) {
10327 vn_offset_unlock(fp->fp_glob);
10328 file_drop(fd);
10329 goto get_from_fd;
10330 }
10331
10332 if ((fp->fp_glob->fg_flag & FREAD) == 0) {
10333 AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
10334 error = EBADF;
10335 goto out;
10336 }
10337
10338 if (bufsize > GETDIRENTRIES_MAXBUFSIZE) {
10339 bufsize = GETDIRENTRIES_MAXBUFSIZE;
10340 }
10341
10342 #if CONFIG_MACF
10343 error = mac_file_check_change_offset(vfs_context_ucred(&context), fp->fp_glob);
10344 if (error) {
10345 goto out;
10346 }
10347 #endif
10348
10349 if ((error = vnode_getwithref(vp))) {
10350 goto out;
10351 }
10352 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
10353
10354 #if CONFIG_UNION_MOUNTS
10355 unionread:
10356 #endif /* CONFIG_UNION_MOUNTS */
10357 if (vp->v_type != VDIR) {
10358 (void)vnode_put(vp);
10359 error = EINVAL;
10360 goto out;
10361 }
10362
10363 #if CONFIG_MACF
10364 error = mac_vnode_check_readdir(&context, vp);
10365 if (error != 0) {
10366 (void)vnode_put(vp);
10367 goto out;
10368 }
10369 #endif /* MAC */
10370
10371 loff = fp->fp_glob->fg_offset;
10372 auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
10373 uio_addiov(auio, bufp, bufsize);
10374
10375 if (flags & VNODE_READDIR_EXTENDED) {
10376 error = vnode_readdir64(vp, auio, flags, eofflag, &numdirent, &context);
10377 fp->fp_glob->fg_offset = uio_offset(auio);
10378 } else {
10379 error = VNOP_READDIR(vp, auio, 0, eofflag, &numdirent, &context);
10380 fp->fp_glob->fg_offset = uio_offset(auio);
10381 }
10382 if (error) {
10383 (void)vnode_put(vp);
10384 goto out;
10385 }
10386
10387 #if CONFIG_UNION_MOUNTS
10388 if ((user_ssize_t)bufsize == uio_resid(auio) &&
10389 (vp->v_mount->mnt_flag & MNT_UNION)) {
10390 vnode_t uvp;
10391
10392 if (lookup_traverse_union(vp, &uvp, &context) == 0) {
10393 if (vnode_ref(uvp) == 0) {
10394 fp_set_data(fp, uvp);
10395 fp->fp_glob->fg_offset = 0;
10396 vnode_rele(vp);
10397 vnode_put(vp);
10398 vp = uvp;
10399 goto unionread;
10400 } else {
10401 /* could not get a ref, can't replace in fd */
10402 vnode_put(uvp);
10403 }
10404 }
10405 }
10406 #endif /* CONFIG_UNION_MOUNTS */
10407
10408 vnode_put(vp);
10409 if (offset) {
10410 *offset = loff;
10411 }
10412
10413 *bytesread = bufsize - uio_resid(auio);
10414 out:
10415 vn_offset_unlock(fp->fp_glob);
10416 file_drop(fd);
10417 return error;
10418 }
10419
10420
10421 int
getdirentries(__unused struct proc * p,struct getdirentries_args * uap,int32_t * retval)10422 getdirentries(__unused struct proc *p, struct getdirentries_args *uap, int32_t *retval)
10423 {
10424 off_t offset;
10425 ssize_t bytesread;
10426 int error, eofflag;
10427
10428 AUDIT_ARG(fd, uap->fd);
10429 error = getdirentries_common(uap->fd, uap->buf, uap->count,
10430 &bytesread, &offset, &eofflag, 0);
10431
10432 if (error == 0) {
10433 if (proc_is64bit(p)) {
10434 user64_long_t base = (user64_long_t)offset;
10435 error = copyout((caddr_t)&base, uap->basep, sizeof(user64_long_t));
10436 } else {
10437 user32_long_t base = (user32_long_t)offset;
10438 error = copyout((caddr_t)&base, uap->basep, sizeof(user32_long_t));
10439 }
10440 *retval = (int)bytesread;
10441 }
10442 return error;
10443 }
10444
10445 int
getdirentries64(__unused struct proc * p,struct getdirentries64_args * uap,user_ssize_t * retval)10446 getdirentries64(__unused struct proc *p, struct getdirentries64_args *uap, user_ssize_t *retval)
10447 {
10448 off_t offset;
10449 ssize_t bytesread;
10450 int error, eofflag;
10451 user_size_t bufsize;
10452
10453 AUDIT_ARG(fd, uap->fd);
10454
10455 /*
10456 * If the buffer is at least GETDIRENTRIES64_EXTENDED_BUFSIZE large,
10457 * then the kernel carves out the last 4 bytes to return extended
10458 * information to userspace (namely whether we reached EOF with this call).
10459 */
10460 if (uap->bufsize >= GETDIRENTRIES64_EXTENDED_BUFSIZE) {
10461 bufsize = uap->bufsize - sizeof(getdirentries64_flags_t);
10462 } else {
10463 bufsize = uap->bufsize;
10464 }
10465
10466 error = getdirentries_common(uap->fd, uap->buf, bufsize,
10467 &bytesread, &offset, &eofflag, VNODE_READDIR_EXTENDED);
10468
10469 if (error == 0) {
10470 *retval = bytesread;
10471 error = copyout((caddr_t)&offset, uap->position, sizeof(off_t));
10472
10473 if (error == 0 && uap->bufsize >= GETDIRENTRIES64_EXTENDED_BUFSIZE) {
10474 getdirentries64_flags_t flags = 0;
10475 if (eofflag) {
10476 flags |= GETDIRENTRIES64_EOF;
10477 }
10478 error = copyout(&flags, (user_addr_t)uap->buf + bufsize,
10479 sizeof(flags));
10480 }
10481 }
10482 return error;
10483 }
10484
10485
10486 /*
10487 * Set the mode mask for creation of filesystem nodes.
10488 * XXX implement xsecurity
10489 */
10490 #define UMASK_NOXSECURITY (void *)1 /* leave existing xsecurity alone */
10491 static int
umask1(proc_t p,int newmask,__unused kauth_filesec_t fsec,int32_t * retval)10492 umask1(proc_t p, int newmask, __unused kauth_filesec_t fsec, int32_t *retval)
10493 {
10494 AUDIT_ARG(mask, newmask);
10495 proc_fdlock(p);
10496 *retval = p->p_fd.fd_cmask;
10497 p->p_fd.fd_cmask = newmask & ALLPERMS;
10498 proc_fdunlock(p);
10499 return 0;
10500 }
10501
10502 /*
10503 * umask_extended: Set the mode mask for creation of filesystem nodes; with extended security (ACL).
10504 *
10505 * Parameters: p Process requesting to set the umask
10506 * uap User argument descriptor (see below)
10507 * retval umask of the process (parameter p)
10508 *
10509 * Indirect: uap->newmask umask to set
10510 * uap->xsecurity ACL to set
10511 *
10512 * Returns: 0 Success
10513 * !0 Not success
10514 *
10515 */
10516 int
umask_extended(proc_t p,struct umask_extended_args * uap,int32_t * retval)10517 umask_extended(proc_t p, struct umask_extended_args *uap, int32_t *retval)
10518 {
10519 return umask1(p, uap->newmask, KAUTH_FILESEC_NONE, retval);
10520 }
10521
10522 int
umask(proc_t p,struct umask_args * uap,int32_t * retval)10523 umask(proc_t p, struct umask_args *uap, int32_t *retval)
10524 {
10525 return umask1(p, uap->newmask, UMASK_NOXSECURITY, retval);
10526 }
10527
10528 #define REVOKE_MOUNTED_DEVICE_ENTITLEMENT \
10529 "com.apple.private.vfs.revoke-mounted-device"
10530
10531 /*
10532 * Void all references to file by ripping underlying filesystem
10533 * away from vnode.
10534 */
10535 /* ARGSUSED */
10536 int
revoke(proc_t p,struct revoke_args * uap,__unused int32_t * retval)10537 revoke(proc_t p, struct revoke_args *uap, __unused int32_t *retval)
10538 {
10539 vnode_t vp;
10540 struct vnode_attr va;
10541 vfs_context_t ctx = vfs_context_current();
10542 int error;
10543 struct nameidata nd;
10544
10545 NDINIT(&nd, LOOKUP, OP_REVOKE, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
10546 uap->path, ctx);
10547 error = namei(&nd);
10548 if (error) {
10549 return error;
10550 }
10551 vp = nd.ni_vp;
10552
10553 nameidone(&nd);
10554
10555 if (!(vnode_ischr(vp) || vnode_isblk(vp))) {
10556 error = ENOTSUP;
10557 goto out;
10558 }
10559
10560 if (vnode_isblk(vp) && vnode_ismountedon(vp)) {
10561 error = EBUSY;
10562 goto out;
10563 }
10564
10565 #if CONFIG_MACF
10566 error = mac_vnode_check_revoke(ctx, vp);
10567 if (error) {
10568 goto out;
10569 }
10570 #endif
10571
10572 VATTR_INIT(&va);
10573 VATTR_WANTED(&va, va_uid);
10574 if ((error = vnode_getattr(vp, &va, ctx))) {
10575 goto out;
10576 }
10577 if (kauth_cred_getuid(vfs_context_ucred(ctx)) != va.va_uid &&
10578 (error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
10579 goto out;
10580 }
10581 if (vp->v_usecount > 0 || (vnode_isaliased(vp))) {
10582 VNOP_REVOKE(vp, REVOKEALL, ctx);
10583 }
10584 out:
10585 vnode_put(vp);
10586 return error;
10587 }
10588
10589
10590 /*
10591 * HFS/HFS PlUS SPECIFIC SYSTEM CALLS
10592 * The following system calls are designed to support features
10593 * which are specific to the HFS & HFS Plus volume formats
10594 */
10595
10596
10597 /*
10598 * Obtain attribute information on objects in a directory while enumerating
10599 * the directory.
10600 */
10601 /* ARGSUSED */
10602 int
getdirentriesattr(proc_t p,struct getdirentriesattr_args * uap,int32_t * retval)10603 getdirentriesattr(proc_t p, struct getdirentriesattr_args *uap, int32_t *retval)
10604 {
10605 vnode_t vp;
10606 struct fileproc *fp;
10607 uio_t auio = NULL;
10608 int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
10609 uint32_t count = 0, savecount = 0;
10610 uint32_t newstate = 0;
10611 int error, eofflag = 0;
10612 off_t loff = 0;
10613 struct attrlist attributelist;
10614 vfs_context_t ctx = vfs_context_current();
10615 int fd = uap->fd;
10616 uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];
10617 kauth_action_t action;
10618
10619 AUDIT_ARG(fd, fd);
10620
10621 /* Get the attributes into kernel space */
10622 if ((error = copyin(uap->alist, (caddr_t)&attributelist, sizeof(attributelist)))) {
10623 return error;
10624 }
10625 if ((error = copyin(uap->count, (caddr_t)&count, sizeof(count)))) {
10626 return error;
10627 }
10628 savecount = count;
10629
10630 get_from_fd:
10631 if ((error = fp_getfvp(p, fd, &fp, &vp))) {
10632 return error;
10633 }
10634
10635 vn_offset_lock(fp->fp_glob);
10636 if (((vnode_t)fp_get_data(fp)) != vp) {
10637 vn_offset_unlock(fp->fp_glob);
10638 file_drop(fd);
10639 goto get_from_fd;
10640 }
10641
10642 if ((fp->fp_glob->fg_flag & FREAD) == 0) {
10643 AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
10644 error = EBADF;
10645 goto out;
10646 }
10647
10648
10649 #if CONFIG_MACF
10650 error = mac_file_check_change_offset(vfs_context_ucred(ctx),
10651 fp->fp_glob);
10652 if (error) {
10653 goto out;
10654 }
10655 #endif
10656
10657
10658 if ((error = vnode_getwithref(vp))) {
10659 goto out;
10660 }
10661
10662 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
10663
10664 #if CONFIG_UNION_MOUNTS
10665 unionread:
10666 #endif /* CONFIG_UNION_MOUNTS */
10667 if (vp->v_type != VDIR) {
10668 (void)vnode_put(vp);
10669 error = EINVAL;
10670 goto out;
10671 }
10672
10673 #if CONFIG_MACF
10674 error = mac_vnode_check_readdir(ctx, vp);
10675 if (error != 0) {
10676 (void)vnode_put(vp);
10677 goto out;
10678 }
10679 #endif /* MAC */
10680
10681 /* set up the uio structure which will contain the users return buffer */
10682 loff = fp->fp_glob->fg_offset;
10683 auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
10684 uio_addiov(auio, uap->buffer, uap->buffersize);
10685
10686 /*
10687 * If the only item requested is file names, we can let that past with
10688 * just LIST_DIRECTORY. If they want any other attributes, that means
10689 * they need SEARCH as well.
10690 */
10691 action = KAUTH_VNODE_LIST_DIRECTORY;
10692 if ((attributelist.commonattr & ~ATTR_CMN_NAME) ||
10693 attributelist.fileattr || attributelist.dirattr) {
10694 action |= KAUTH_VNODE_SEARCH;
10695 }
10696
10697 if ((error = vnode_authorize(vp, NULL, action, ctx)) == 0) {
10698 /* Believe it or not, uap->options only has 32-bits of valid
10699 * info, so truncate before extending again */
10700
10701 error = VNOP_READDIRATTR(vp, &attributelist, auio, count,
10702 (uint32_t)uap->options, &newstate, &eofflag, &count, ctx);
10703 }
10704
10705 if (error) {
10706 (void) vnode_put(vp);
10707 goto out;
10708 }
10709
10710 #if CONFIG_UNION_MOUNTS
10711 /*
10712 * If we've got the last entry of a directory in a union mount
10713 * then reset the eofflag and pretend there's still more to come.
10714 * The next call will again set eofflag and the buffer will be empty,
10715 * so traverse to the underlying directory and do the directory
10716 * read there.
10717 */
10718 if (eofflag && vp->v_mount->mnt_flag & MNT_UNION) {
10719 if (uio_resid(auio) < (user_ssize_t) uap->buffersize) { // Got some entries
10720 eofflag = 0;
10721 } else { // Empty buffer
10722 vnode_t uvp;
10723 if (lookup_traverse_union(vp, &uvp, ctx) == 0) {
10724 if (vnode_ref_ext(uvp, fp->fp_glob->fg_flag & O_EVTONLY, 0) == 0) {
10725 fp_set_data(fp, uvp);
10726 fp->fp_glob->fg_offset = 0; // reset index for new dir
10727 count = savecount;
10728 vnode_rele_internal(vp, fp->fp_glob->fg_flag & O_EVTONLY, 0, 0);
10729 vnode_put(vp);
10730 vp = uvp;
10731 goto unionread;
10732 } else {
10733 /* could not get a ref, can't replace in fd */
10734 vnode_put(uvp);
10735 }
10736 }
10737 }
10738 }
10739 #endif /* CONFIG_UNION_MOUNTS */
10740
10741 (void)vnode_put(vp);
10742
10743 if (error) {
10744 goto out;
10745 }
10746 fp->fp_glob->fg_offset = uio_offset(auio); /* should be multiple of dirent, not variable */
10747
10748 if ((error = copyout((caddr_t) &count, uap->count, sizeof(count)))) {
10749 goto out;
10750 }
10751 if ((error = copyout((caddr_t) &newstate, uap->newstate, sizeof(newstate)))) {
10752 goto out;
10753 }
10754 if ((error = copyout((caddr_t) &loff, uap->basep, sizeof(loff)))) {
10755 goto out;
10756 }
10757
10758 *retval = eofflag; /* similar to getdirentries */
10759 error = 0;
10760 out:
10761 vn_offset_unlock(fp->fp_glob);
10762 file_drop(fd);
10763 return error; /* return error earlier, an retval of 0 or 1 now */
10764 } /* end of getdirentriesattr system call */
10765
10766 /*
10767 * Exchange data between two files
10768 */
10769
10770 /* ARGSUSED */
10771 int
exchangedata(__unused proc_t p,struct exchangedata_args * uap,__unused int32_t * retval)10772 exchangedata(__unused proc_t p, struct exchangedata_args *uap, __unused int32_t *retval)
10773 {
10774 struct nameidata fnd, snd;
10775 vfs_context_t ctx = vfs_context_current();
10776 vnode_t fvp;
10777 vnode_t svp;
10778 int error;
10779 u_int32_t nameiflags;
10780 char *fpath = NULL;
10781 char *spath = NULL;
10782 int flen = 0, slen = 0;
10783 int from_truncated = 0, to_truncated = 0;
10784 #if CONFIG_FSE
10785 fse_info f_finfo, s_finfo;
10786 #endif
10787
10788 nameiflags = 0;
10789 if ((uap->options & FSOPT_NOFOLLOW) == 0) {
10790 nameiflags |= FOLLOW;
10791 }
10792
10793 NDINIT(&fnd, LOOKUP, OP_EXCHANGEDATA, nameiflags | AUDITVNPATH1,
10794 UIO_USERSPACE, uap->path1, ctx);
10795
10796 error = namei(&fnd);
10797 if (error) {
10798 goto out2;
10799 }
10800
10801 nameidone(&fnd);
10802 fvp = fnd.ni_vp;
10803
10804 NDINIT(&snd, LOOKUP, OP_EXCHANGEDATA, CN_NBMOUNTLOOK | nameiflags | AUDITVNPATH2,
10805 UIO_USERSPACE, uap->path2, ctx);
10806
10807 error = namei(&snd);
10808 if (error) {
10809 vnode_put(fvp);
10810 goto out2;
10811 }
10812 nameidone(&snd);
10813 svp = snd.ni_vp;
10814
10815 /*
10816 * if the files are the same, return an inval error
10817 */
10818 if (svp == fvp) {
10819 error = EINVAL;
10820 goto out;
10821 }
10822
10823 /*
10824 * if the files are on different volumes, return an error
10825 */
10826 if (svp->v_mount != fvp->v_mount) {
10827 error = EXDEV;
10828 goto out;
10829 }
10830
10831 /* If they're not files, return an error */
10832 if ((vnode_isreg(fvp) == 0) || (vnode_isreg(svp) == 0)) {
10833 error = EINVAL;
10834 goto out;
10835 }
10836
10837 #if CONFIG_MACF
10838 error = mac_vnode_check_exchangedata(ctx,
10839 fvp, svp);
10840 if (error) {
10841 goto out;
10842 }
10843 #endif
10844 if (((error = vnode_authorize(fvp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0) ||
10845 ((error = vnode_authorize(svp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0)) {
10846 goto out;
10847 }
10848
10849 if (
10850 #if CONFIG_FSE
10851 need_fsevent(FSE_EXCHANGE, fvp) ||
10852 #endif
10853 kauth_authorize_fileop_has_listeners()) {
10854 GET_PATH(fpath);
10855 GET_PATH(spath);
10856
10857 flen = safe_getpath(fvp, NULL, fpath, MAXPATHLEN, &from_truncated);
10858 slen = safe_getpath(svp, NULL, spath, MAXPATHLEN, &to_truncated);
10859
10860 #if CONFIG_FSE
10861 get_fse_info(fvp, &f_finfo, ctx);
10862 get_fse_info(svp, &s_finfo, ctx);
10863 if (from_truncated || to_truncated) {
10864 // set it here since only the f_finfo gets reported up to user space
10865 f_finfo.mode |= FSE_TRUNCATED_PATH;
10866 }
10867 #endif
10868 }
10869 /* Ok, make the call */
10870 error = VNOP_EXCHANGE(fvp, svp, 0, ctx);
10871
10872 if (error == 0) {
10873 const char *tmpname;
10874
10875 if (fpath != NULL && spath != NULL) {
10876 /* call out to allow 3rd party notification of exchangedata.
10877 * Ignore result of kauth_authorize_fileop call.
10878 */
10879 kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_EXCHANGE,
10880 (uintptr_t)fpath, (uintptr_t)spath);
10881 }
10882 name_cache_lock();
10883
10884 tmpname = fvp->v_name;
10885 fvp->v_name = svp->v_name;
10886 svp->v_name = tmpname;
10887
10888 if (fvp->v_parent != svp->v_parent) {
10889 vnode_t tmp;
10890
10891 tmp = fvp->v_parent;
10892 fvp->v_parent = svp->v_parent;
10893 svp->v_parent = tmp;
10894 }
10895 name_cache_unlock();
10896
10897 #if CONFIG_FSE
10898 if (fpath != NULL && spath != NULL) {
10899 add_fsevent(FSE_EXCHANGE, ctx,
10900 FSE_ARG_STRING, flen, fpath,
10901 FSE_ARG_FINFO, &f_finfo,
10902 FSE_ARG_STRING, slen, spath,
10903 FSE_ARG_FINFO, &s_finfo,
10904 FSE_ARG_DONE);
10905 }
10906 #endif
10907 }
10908
10909 out:
10910 if (fpath != NULL) {
10911 RELEASE_PATH(fpath);
10912 }
10913 if (spath != NULL) {
10914 RELEASE_PATH(spath);
10915 }
10916 vnode_put(svp);
10917 vnode_put(fvp);
10918 out2:
10919 return error;
10920 }
10921
10922 /*
10923 * Return (in MB) the amount of freespace on the given vnode's volume.
10924 */
10925 uint32_t freespace_mb(vnode_t vp);
10926
10927 uint32_t
freespace_mb(vnode_t vp)10928 freespace_mb(vnode_t vp)
10929 {
10930 vfs_update_vfsstat(vp->v_mount, vfs_context_current(), VFS_USER_EVENT);
10931 return (uint32_t)(((uint64_t)vp->v_mount->mnt_vfsstat.f_bavail *
10932 vp->v_mount->mnt_vfsstat.f_bsize) >> 20);
10933 }
10934
10935 #if CONFIG_SEARCHFS
10936
10937 /* ARGSUSED */
10938
10939 int
searchfs(proc_t p,struct searchfs_args * uap,__unused int32_t * retval)10940 searchfs(proc_t p, struct searchfs_args *uap, __unused int32_t *retval)
10941 {
10942 vnode_t vp, tvp;
10943 int i, error = 0;
10944 int fserror = 0;
10945 struct nameidata nd;
10946 struct user64_fssearchblock searchblock;
10947 struct searchstate *state;
10948 struct attrlist *returnattrs;
10949 struct timeval timelimit;
10950 void *searchparams1, *searchparams2;
10951 uio_t auio = NULL;
10952 int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
10953 uint32_t nummatches;
10954 size_t mallocsize;
10955 uint32_t nameiflags;
10956 vfs_context_t ctx = vfs_context_current();
10957 uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];
10958
10959 /* Start by copying in fsearchblock parameter list */
10960 if (IS_64BIT_PROCESS(p)) {
10961 error = copyin(uap->searchblock, (caddr_t) &searchblock, sizeof(searchblock));
10962 timelimit.tv_sec = searchblock.timelimit.tv_sec;
10963 timelimit.tv_usec = searchblock.timelimit.tv_usec;
10964 } else {
10965 struct user32_fssearchblock tmp_searchblock;
10966
10967 error = copyin(uap->searchblock, (caddr_t) &tmp_searchblock, sizeof(tmp_searchblock));
10968 // munge into 64-bit version
10969 searchblock.returnattrs = CAST_USER_ADDR_T(tmp_searchblock.returnattrs);
10970 searchblock.returnbuffer = CAST_USER_ADDR_T(tmp_searchblock.returnbuffer);
10971 searchblock.returnbuffersize = tmp_searchblock.returnbuffersize;
10972 searchblock.maxmatches = tmp_searchblock.maxmatches;
10973 /*
10974 * These casts are safe. We will promote the tv_sec into a 64 bit long if necessary
10975 * from a 32 bit long, and tv_usec is already a signed 32 bit int.
10976 */
10977 timelimit.tv_sec = (__darwin_time_t) tmp_searchblock.timelimit.tv_sec;
10978 timelimit.tv_usec = (__darwin_useconds_t) tmp_searchblock.timelimit.tv_usec;
10979 searchblock.searchparams1 = CAST_USER_ADDR_T(tmp_searchblock.searchparams1);
10980 searchblock.sizeofsearchparams1 = tmp_searchblock.sizeofsearchparams1;
10981 searchblock.searchparams2 = CAST_USER_ADDR_T(tmp_searchblock.searchparams2);
10982 searchblock.sizeofsearchparams2 = tmp_searchblock.sizeofsearchparams2;
10983 searchblock.searchattrs = tmp_searchblock.searchattrs;
10984 }
10985 if (error) {
10986 return error;
10987 }
10988
10989 /* Do a sanity check on sizeofsearchparams1 and sizeofsearchparams2.
10990 */
10991 if (searchblock.sizeofsearchparams1 > SEARCHFS_MAX_SEARCHPARMS ||
10992 searchblock.sizeofsearchparams2 > SEARCHFS_MAX_SEARCHPARMS) {
10993 return EINVAL;
10994 }
10995
10996 /* Now malloc a big bunch of space to hold the search parameters, the attrlists and the search state. */
10997 /* It all has to do into local memory and it's not that big so we might as well put it all together. */
10998 /* Searchparams1 shall be first so we might as well use that to hold the base address of the allocated*/
10999 /* block. */
11000 /* */
11001 /* NOTE: we allocate an extra 8 bytes to account for the difference in size of the searchstate */
11002 /* due to the changes in rdar://problem/12438273. That way if a 3rd party file system */
11003 /* assumes the size is still 556 bytes it will continue to work */
11004
11005 mallocsize = searchblock.sizeofsearchparams1 + searchblock.sizeofsearchparams2 +
11006 sizeof(struct attrlist) + sizeof(struct searchstate) + (2 * sizeof(uint32_t));
11007
11008 searchparams1 = kalloc_data(mallocsize, Z_WAITOK);
11009
11010 /* Now set up the various pointers to the correct place in our newly allocated memory */
11011
11012 searchparams2 = (void *) (((caddr_t) searchparams1) + searchblock.sizeofsearchparams1);
11013 returnattrs = (struct attrlist *) (((caddr_t) searchparams2) + searchblock.sizeofsearchparams2);
11014 state = (struct searchstate *) (((caddr_t) returnattrs) + sizeof(struct attrlist));
11015
11016 /* Now copy in the stuff given our local variables. */
11017
11018 if ((error = copyin(searchblock.searchparams1, searchparams1, searchblock.sizeofsearchparams1))) {
11019 goto freeandexit;
11020 }
11021
11022 if ((error = copyin(searchblock.searchparams2, searchparams2, searchblock.sizeofsearchparams2))) {
11023 goto freeandexit;
11024 }
11025
11026 if ((error = copyin(searchblock.returnattrs, (caddr_t) returnattrs, sizeof(struct attrlist)))) {
11027 goto freeandexit;
11028 }
11029
11030 if ((error = copyin(uap->state, (caddr_t) state, sizeof(struct searchstate)))) {
11031 goto freeandexit;
11032 }
11033
11034 /*
11035 * When searching a union mount, need to set the
11036 * start flag at the first call on each layer to
11037 * reset state for the new volume.
11038 */
11039 if (uap->options & SRCHFS_START) {
11040 state->ss_union_layer = 0;
11041 } else {
11042 uap->options |= state->ss_union_flags;
11043 }
11044 state->ss_union_flags = 0;
11045
11046 /*
11047 * Because searchparams1 and searchparams2 may contain an ATTR_CMN_NAME search parameter,
11048 * which is passed in with an attrreference_t, we need to inspect the buffer manually here.
11049 * The KPI does not provide us the ability to pass in the length of the buffers searchparams1
11050 * and searchparams2. To obviate the need for all searchfs-supporting filesystems to
11051 * validate the user-supplied data offset of the attrreference_t, we'll do it here.
11052 */
11053
11054 if (searchblock.searchattrs.commonattr & ATTR_CMN_NAME) {
11055 attrreference_t* string_ref;
11056 u_int32_t* start_length;
11057 user64_size_t param_length;
11058
11059 /* validate searchparams1 */
11060 param_length = searchblock.sizeofsearchparams1;
11061 /* skip the word that specifies length of the buffer */
11062 start_length = (u_int32_t*) searchparams1;
11063 start_length = start_length + 1;
11064 string_ref = (attrreference_t*) start_length;
11065
11066 /* ensure no negative offsets or too big offsets */
11067 if (string_ref->attr_dataoffset < 0) {
11068 error = EINVAL;
11069 goto freeandexit;
11070 }
11071 if (string_ref->attr_length > MAXPATHLEN) {
11072 error = EINVAL;
11073 goto freeandexit;
11074 }
11075
11076 /* Check for pointer overflow in the string ref */
11077 if (((char*) string_ref + string_ref->attr_dataoffset) < (char*) string_ref) {
11078 error = EINVAL;
11079 goto freeandexit;
11080 }
11081
11082 if (((char*) string_ref + string_ref->attr_dataoffset) > ((char*)searchparams1 + param_length)) {
11083 error = EINVAL;
11084 goto freeandexit;
11085 }
11086 if (((char*)string_ref + string_ref->attr_dataoffset + string_ref->attr_length) > ((char*)searchparams1 + param_length)) {
11087 error = EINVAL;
11088 goto freeandexit;
11089 }
11090 }
11091
11092 /* set up the uio structure which will contain the users return buffer */
11093 auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
11094 uio_addiov(auio, searchblock.returnbuffer, searchblock.returnbuffersize);
11095
11096 nameiflags = 0;
11097 if ((uap->options & FSOPT_NOFOLLOW) == 0) {
11098 nameiflags |= FOLLOW;
11099 }
11100 NDINIT(&nd, LOOKUP, OP_SEARCHFS, nameiflags | AUDITVNPATH1,
11101 UIO_USERSPACE, uap->path, ctx);
11102
11103 error = namei(&nd);
11104 if (error) {
11105 goto freeandexit;
11106 }
11107 vp = nd.ni_vp;
11108 nameidone(&nd);
11109
11110 /*
11111 * Switch to the root vnode for the volume
11112 */
11113 error = VFS_ROOT(vnode_mount(vp), &tvp, ctx);
11114 vnode_put(vp);
11115 if (error) {
11116 goto freeandexit;
11117 }
11118 vp = tvp;
11119
11120 #if CONFIG_UNION_MOUNTS
11121 /*
11122 * If it's a union mount, the path lookup takes
11123 * us to the top layer. But we may need to descend
11124 * to a lower layer. For non-union mounts the layer
11125 * is always zero.
11126 */
11127 for (i = 0; i < (int) state->ss_union_layer; i++) {
11128 if ((vp->v_mount->mnt_flag & MNT_UNION) == 0) {
11129 break;
11130 }
11131 tvp = vp;
11132 vp = vp->v_mount->mnt_vnodecovered;
11133 if (vp == NULL) {
11134 vnode_put(tvp);
11135 error = ENOENT;
11136 goto freeandexit;
11137 }
11138 error = vnode_getwithref(vp);
11139 vnode_put(tvp);
11140 if (error) {
11141 goto freeandexit;
11142 }
11143 }
11144 #endif /* CONFIG_UNION_MOUNTS */
11145
11146 #if CONFIG_MACF
11147 error = mac_vnode_check_searchfs(ctx, vp, returnattrs, &searchblock.searchattrs);
11148 if (error) {
11149 vnode_put(vp);
11150 goto freeandexit;
11151 }
11152 #endif
11153
11154
11155 /*
11156 * If searchblock.maxmatches == 0, then skip the search. This has happened
11157 * before and sometimes the underlying code doesnt deal with it well.
11158 */
11159 if (searchblock.maxmatches == 0) {
11160 nummatches = 0;
11161 goto saveandexit;
11162 }
11163
11164 /*
11165 * Allright, we have everything we need, so lets make that call.
11166 *
11167 * We keep special track of the return value from the file system:
11168 * EAGAIN is an acceptable error condition that shouldn't keep us
11169 * from copying out any results...
11170 */
11171
11172 fserror = VNOP_SEARCHFS(vp,
11173 searchparams1,
11174 searchparams2,
11175 &searchblock.searchattrs,
11176 (uint32_t)searchblock.maxmatches,
11177 &timelimit,
11178 returnattrs,
11179 &nummatches,
11180 (uint32_t)uap->scriptcode,
11181 (uint32_t)uap->options,
11182 auio,
11183 (struct searchstate *) &state->ss_fsstate,
11184 ctx);
11185
11186 #if CONFIG_UNION_MOUNTS
11187 /*
11188 * If it's a union mount we need to be called again
11189 * to search the mounted-on filesystem.
11190 */
11191 if ((vp->v_mount->mnt_flag & MNT_UNION) && fserror == 0) {
11192 state->ss_union_flags = SRCHFS_START;
11193 state->ss_union_layer++; // search next layer down
11194 fserror = EAGAIN;
11195 }
11196 #endif /* CONFIG_UNION_MOUNTS */
11197
11198 saveandexit:
11199
11200 vnode_put(vp);
11201
11202 /* Now copy out the stuff that needs copying out. That means the number of matches, the
11203 * search state. Everything was already put into he return buffer by the vop call. */
11204
11205 if ((error = copyout((caddr_t) state, uap->state, sizeof(struct searchstate))) != 0) {
11206 goto freeandexit;
11207 }
11208
11209 if ((error = suulong(uap->nummatches, (uint64_t)nummatches)) != 0) {
11210 goto freeandexit;
11211 }
11212
11213 error = fserror;
11214
11215 freeandexit:
11216
11217 kfree_data(searchparams1, mallocsize);
11218
11219 return error;
11220 } /* end of searchfs system call */
11221
11222 #else /* CONFIG_SEARCHFS */
11223
/*
 * searchfs stub for kernels built without CONFIG_SEARCHFS:
 * the syscall exists but always fails with ENOTSUP.
 */
int
searchfs(__unused proc_t p, __unused struct searchfs_args *uap, __unused int32_t *retval)
{
	return ENOTSUP;
}
11229
11230 #endif /* CONFIG_SEARCHFS */
11231
11232
11233 #if CONFIG_DATALESS_FILES
11234
11235 /*
11236 * === Namespace Resolver Up-call Mechanism ===
11237 *
11238 * When I/O is performed to a dataless file or directory (read, write,
11239 * lookup-in, etc.), the file system performs an upcall to the namespace
11240 * resolver (filecoordinationd) to materialize the object.
11241 *
11242 * We need multiple up-calls to be in flight at once, and we need these
11243 * up-calls to be interruptible, thus the following implementation:
11244 *
11245 * => The nspace_resolver_request represents the in-kernel request state.
11246 * It contains a request ID, storage space for the errno code returned
11247 * by filecoordinationd, and flags.
11248 *
11249 * => The request ID is simply a global monotonically incrementing 32-bit
11250 * number. Outstanding requests are stored in a hash table, and the
11251 * hash function is extremely simple.
11252 *
11253 * => When an upcall is to be made to filecoordinationd, a request structure
11254 * is allocated on the stack (it is small, and needs to live only during
11255 * the duration of the call to resolve_nspace_item_ext()). It is
11256 * initialized and inserted into the table. Some backpressure from
11257 * filecoordinationd is applied by limiting the numnber of entries that
11258 * can be inserted into the table (and thus limiting the number of
11259 * outstanding requests issued to filecoordinationd); waiting for an
11260 * available slot is interruptible.
11261 *
11262 * => Once the request has been inserted into the table, the up-call is made
11263 * to filecoordinationd via a MiG-generated stub. The up-call returns
11264 * immediately and filecoordinationd processes the request asynchronously.
11265 *
11266 * => The caller now waits for the request to complete. Tnis is achieved by
11267 * sleeping on the address of the request structure and waiting for
11268 * filecoordinationd to mark the request structure as complete. This
11269 * is an interruptible sleep call; if interrupted, the request structure
11270 * is removed from the table and EINTR is returned to the caller. If
11271 * this occurs, an advisory up-call is made to filecoordinationd with
11272 * the request ID to indicate that the request can be aborted or
11273 * de-prioritized at the discretion of filecoordinationd.
11274 *
11275 * => When filecoordinationd has completed the request, it signals completion
11276 * by writing to the vfs.nspace.complete sysctl node. Only a process
11277 * decorated as a namespace resolver can write to this sysctl node. The
11278 * value is a request ID / errno tuple passed as an array of 2 uint32_t's.
11279 * The request ID is looked up in the table, and if the request is found,
11280 * the error code is stored in the request structure and a wakeup()
11281 * issued on the address of the request structure. If the request is not
11282 * found, we simply drop the completion notification, assuming that the
11283 * caller was interrupted.
11284 *
11285 * => When the waiting thread wakes up, it extracts the error code from the
11286 * request structure, removes the request from the table, and returns the
11287 * error code to the calling function. Fini!
11288 */
11289
/*
 * In-kernel state for one outstanding up-call to the namespace
 * resolver (filecoordinationd).  Lives on the requesting thread's
 * stack for the duration of the call; linked into the request hash
 * table while outstanding.
 */
struct nspace_resolver_request {
	LIST_ENTRY(nspace_resolver_request) r_hashlink; /* hash bucket linkage */
	vnode_t r_vp;                   /* vnode being materialized */
	uint32_t r_req_id;              /* ID used to match the completion */
	int r_resolver_error;           /* errno reported by the resolver */
	int r_flags;                    /* RRF_* flags below */
};

/* Request has been completed by the resolver; waiter may proceed. */
#define RRF_COMPLETE 0x0001
11299
/*
 * Hand out the next resolver request ID.
 *
 * NOTE(review): OSAddAtomic() returns the value *prior* to the
 * addition, so IDs start at 0 and wrap at UINT32_MAX; uniqueness
 * relies on requests being short-lived relative to the wrap period.
 */
static uint32_t
next_nspace_req_id(void)
{
	static uint32_t next_req_id;

	return OSAddAtomic(1, &next_req_id);
}
11307
/* Sizing knobs for the outstanding-request table. */
#define NSPACE_RESOLVER_REQ_HASHSIZE 32 /* XXX tune */
#define NSPACE_RESOLVER_MAX_OUTSTANDING 256 /* XXX tune */

/*
 * Hash table of outstanding resolver requests (keyed by request ID),
 * the count of outstanding requests, and a flag noting that some
 * thread is waiting for a table slot.  All of this state is guarded
 * by nspace_resolver_request_hash_mutex.
 */
static LIST_HEAD(nspace_resolver_requesthead,
    nspace_resolver_request) * nspace_resolver_request_hashtbl;
static u_long nspace_resolver_request_hashmask;
static u_int nspace_resolver_request_count;
static bool nspace_resolver_request_wait_slot;
static LCK_GRP_DECLARE(nspace_resolver_request_lck_grp, "file namespace resolver");
static LCK_MTX_DECLARE(nspace_resolver_request_hash_mutex,
    &nspace_resolver_request_lck_grp);

/* Acquire/release the request-table mutex. */
#define NSPACE_REQ_LOCK() \
	lck_mtx_lock(&nspace_resolver_request_hash_mutex)
#define NSPACE_REQ_UNLOCK() \
	lck_mtx_unlock(&nspace_resolver_request_hash_mutex)

/* Map a request ID to its hash bucket (hashmask is a power-of-2 - 1). */
#define NSPACE_RESOLVER_HASH(req_id) \
	(&nspace_resolver_request_hashtbl[(req_id) & \
	nspace_resolver_request_hashmask])
11328
11329 static struct nspace_resolver_request *
nspace_resolver_req_lookup(uint32_t req_id)11330 nspace_resolver_req_lookup(uint32_t req_id)
11331 {
11332 struct nspace_resolver_requesthead *bucket;
11333 struct nspace_resolver_request *req;
11334
11335 bucket = NSPACE_RESOLVER_HASH(req_id);
11336 LIST_FOREACH(req, bucket, r_hashlink) {
11337 if (req->r_req_id == req_id) {
11338 return req;
11339 }
11340 }
11341
11342 return NULL;
11343 }
11344
/*
 * Insert a request into the outstanding-request table.
 *
 * Applies backpressure on filecoordinationd: if the table already
 * holds NSPACE_RESOLVER_MAX_OUTSTANDING requests, sleep
 * (interruptibly) until a slot frees up.  Returns 0 on success, or
 * the error from msleep() (e.g. EINTR) if the wait was interrupted.
 *
 * NOTE(review): msleep() drops and re-acquires the request-table
 * mutex around the wait, which is why the limit is re-checked on
 * every loop iteration; caller is expected to hold NSPACE_REQ_LOCK().
 */
static int
nspace_resolver_req_add(struct nspace_resolver_request *req)
{
	struct nspace_resolver_requesthead *bucket;
	int error;

	/* Wait for a free slot if the table is full. */
	while (nspace_resolver_request_count >=
	    NSPACE_RESOLVER_MAX_OUTSTANDING) {
		nspace_resolver_request_wait_slot = true;
		error = msleep(&nspace_resolver_request_count,
		    &nspace_resolver_request_hash_mutex,
		    PVFS | PCATCH, "nspacerq", NULL);
		if (error) {
			return error;
		}
	}

	bucket = NSPACE_RESOLVER_HASH(req->r_req_id);
#if DIAGNOSTIC
	assert(nspace_resolver_req_lookup(req->r_req_id) == NULL);
#endif /* DIAGNOSTIC */
	LIST_INSERT_HEAD(bucket, req, r_hashlink);
	nspace_resolver_request_count++;

	return 0;
}
11371
11372 static void
nspace_resolver_req_remove(struct nspace_resolver_request * req)11373 nspace_resolver_req_remove(struct nspace_resolver_request *req)
11374 {
11375 struct nspace_resolver_requesthead *bucket;
11376
11377 bucket = NSPACE_RESOLVER_HASH(req->r_req_id);
11378 #if DIAGNOSTIC
11379 assert(nspace_resolver_req_lookup(req->r_req_id) != NULL);
11380 #endif /* DIAGNOSTIC */
11381 LIST_REMOVE(req, r_hashlink);
11382 nspace_resolver_request_count--;
11383
11384 if (nspace_resolver_request_wait_slot) {
11385 nspace_resolver_request_wait_slot = false;
11386 wakeup(&nspace_resolver_request_count);
11387 }
11388 }
11389
11390 static void
nspace_resolver_req_cancel(uint32_t req_id)11391 nspace_resolver_req_cancel(uint32_t req_id)
11392 {
11393 kern_return_t kr;
11394 mach_port_t mp;
11395
11396 // Failures here aren't fatal -- the cancellation message
11397 // sent to the resolver is merely advisory.
11398
11399 kr = host_get_filecoordinationd_port(host_priv_self(), &mp);
11400 if (kr != KERN_SUCCESS || !IPC_PORT_VALID(mp)) {
11401 return;
11402 }
11403
11404 kr = send_nspace_resolve_cancel(mp, req_id);
11405 if (kr != KERN_SUCCESS) {
11406 os_log_error(OS_LOG_DEFAULT,
11407 "NSPACE send_nspace_resolve_cancel failure: %d", kr);
11408 }
11409
11410 ipc_port_release_send(mp);
11411 }
11412
/*
 * Wait for the resolver to complete the given request.
 *
 * Sleeps interruptibly until the request is marked RRF_COMPLETE.  If
 * the sleep fails for any reason other than ERESTART, the request's
 * result is forced to EINTR (signal) or ETIMEDOUT (anything else) and
 * an advisory cancel message is sent to the resolver -- after the
 * table lock has been dropped, since the cancel involves Mach IPC.
 * The request is removed from the table in all cases.
 *
 * Returns the request's resolver errno (0 on success).
 */
static int
nspace_resolver_req_wait(struct nspace_resolver_request *req)
{
	bool send_cancel_message = false;
	int error;

	NSPACE_REQ_LOCK();

	while ((req->r_flags & RRF_COMPLETE) == 0) {
		error = msleep(req, &nspace_resolver_request_hash_mutex,
		    PVFS | PCATCH, "nspace", NULL);
		if (error && error != ERESTART) {
			/* Interrupted: stop waiting and report why. */
			req->r_resolver_error = (error == EINTR) ? EINTR :
			    ETIMEDOUT;
			send_cancel_message = true;
			break;
		}
	}

	nspace_resolver_req_remove(req);

	NSPACE_REQ_UNLOCK();

	/* Advisory only; performed unlocked because it does Mach IPC. */
	if (send_cancel_message) {
		nspace_resolver_req_cancel(req->r_req_id);
	}

	return req->r_resolver_error;
}
11442
/*
 * Record the resolver's result in the request and wake the thread
 * sleeping in nspace_resolver_req_wait().  Caller is expected to
 * hold NSPACE_REQ_LOCK() (all visible callers do).
 */
static void
nspace_resolver_req_mark_complete(
	struct nspace_resolver_request *req,
	int resolver_error)
{
	req->r_resolver_error = resolver_error;
	req->r_flags |= RRF_COMPLETE;
	wakeup(req);
}
11452
/*
 * Handle a completion notification from the resolver (arrives via the
 * vfs.nspace.complete sysctl).
 *
 * Looks up the outstanding request by ID; if it is gone, the waiter
 * was most likely interrupted and the notification is dropped.  If
 * the resolver reported success and supplied an orig_gencount, the
 * vnode's recursive gencount is re-checked under the mount rename
 * lock; a mismatch means the object changed during materialization,
 * and the waiting thread gets EBUSY instead.  Finally the request is
 * marked complete and the waiter woken.
 */
static void
nspace_resolver_req_completed(uint32_t req_id, int resolver_error, uint64_t orig_gencount)
{
	struct nspace_resolver_request *req;

	NSPACE_REQ_LOCK();

	// If we don't find the request corresponding to our req_id,
	// just drop the completion signal on the floor; it's likely
	// that the requester interrupted with a signal.

	req = nspace_resolver_req_lookup(req_id);
	if (req) {
		mount_t locked_mp = NULL;

		/* Hold off renames on the volume while we re-validate. */
		locked_mp = req->r_vp->v_mount;
		mount_ref(locked_mp, 0);
		mount_lock_renames(locked_mp);

		//
		// if the resolver isn't already returning an error and we have an
		// orig_gencount, then get an iocount on the request vnode and check
		// that the gencount on req->r_vp has not changed.
		//
		// note: a ref was taken on req->r_vp when the request was created
		// and that ref will be dropped by that thread when it wakes up.
		//
		if (resolver_error == 0 &&
		    orig_gencount != 0 &&
		    vnode_getwithref(req->r_vp) == 0) {
			struct vnode_attr va;
			uint64_t cur_gencount;

			VATTR_INIT(&va);
			VATTR_WANTED(&va, va_recursive_gencount);

			if (vnode_getattr(req->r_vp, &va, vfs_context_kernel()) == 0) {
				cur_gencount = va.va_recursive_gencount;
			} else {
				/* Attribute fetch failed: nothing to compare against. */
				cur_gencount = 0;
			}

			/*
			 * NOTE(review): resolver_error == 0 and orig_gencount != 0
			 * are already guaranteed by the enclosing if; the re-checks
			 * below are redundant but harmless.
			 */
			if (resolver_error == 0 && cur_gencount && orig_gencount && cur_gencount != orig_gencount) {
				printf("nspace.complete: gencount changed! (orig %llu cur %llu)\n", orig_gencount, cur_gencount);

				// this error will be returned to the thread that initiated the
				// materialization of req->r_vp.
				resolver_error = EBUSY;

				// note: we explicitly do not return an error to the caller (i.e.
				// the thread that did the materialization) because they said they
				// don't want one.
			}

			vnode_put(req->r_vp);
		}

		mount_unlock_renames(locked_mp);
		mount_drop(locked_mp, 0);

		nspace_resolver_req_mark_complete(req, resolver_error);
	}

	NSPACE_REQ_UNLOCK();

	return;
}
11520
/* The (single) process currently decorated as the namespace resolver. */
static struct proc *nspace_resolver_proc;

/*
 * Report whether process p is the registered namespace resolver:
 * *is_resolver is set to 1 or 0.  Always returns 0.
 */
static int
nspace_resolver_get_proc_state(struct proc *p, int *is_resolver)
{
	*is_resolver = ((p->p_lflag & P_LNSPACE_RESOLVER) &&
	    p == nspace_resolver_proc) ? 1 : 0;
	return 0;
}
11530
11531 static boolean_t vfs_context_is_dataless_resolver(vfs_context_t);
11532
/*
 * Decorate (or un-decorate) process p as the namespace resolver.
 *
 * The caller must be uid 0 and hold the dataless-resolver entitlement
 * (EPERM otherwise).  Only one resolver may be registered at a time;
 * a second registration fails with EBUSY.  Un-decoration is routed
 * through nspace_resolver_exited(), which validates the process and
 * clears the global.
 */
static int
nspace_resolver_set_proc_state(struct proc *p, int is_resolver)
{
	vfs_context_t ctx = vfs_context_current();
	int error = 0;

	//
	// The system filecoordinationd runs as uid == 0. This also
	// has the nice side-effect of filtering out filecoordinationd
	// running in the simulator.
	//
	if (!vfs_context_issuser(ctx) ||
	    !vfs_context_is_dataless_resolver(ctx)) {
		return EPERM;
	}

	if (is_resolver) {
		NSPACE_REQ_LOCK();

		if (nspace_resolver_proc == NULL) {
			proc_lock(p);
			p->p_lflag |= P_LNSPACE_RESOLVER;
			proc_unlock(p);
			nspace_resolver_proc = p;
		} else {
			error = EBUSY;
		}

		NSPACE_REQ_UNLOCK();
	} else {
		// This is basically just like the exit case.
		// nspace_resolver_exited() will verify that the
		// process is the resolver, and will clear the
		// global.
		nspace_resolver_exited(p);
	}

	return error;
}
11572
11573 static int
nspace_materialization_get_proc_state(struct proc * p,int * is_prevented)11574 nspace_materialization_get_proc_state(struct proc *p, int *is_prevented)
11575 {
11576 if ((p->p_lflag & P_LNSPACE_RESOLVER) != 0 ||
11577 (p->p_vfs_iopolicy &
11578 P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES) == 0) {
11579 *is_prevented = 1;
11580 } else {
11581 *is_prevented = 0;
11582 }
11583 return 0;
11584 }
11585
/*
 * Set or clear the per-process "prevent materialization" policy.
 *
 * The resolver process always reads as prevented (see
 * nspace_materialization_get_proc_state()), so for the resolver this
 * only succeeds when is_prevented is set; clearing it fails with EBUSY.
 */
static int
nspace_materialization_set_proc_state(struct proc *p, int is_prevented)
{
	if (p->p_lflag & P_LNSPACE_RESOLVER) {
		return is_prevented ? 0 : EBUSY;
	}

	/* iopolicy bit set == materialization allowed; flip it atomically. */
	if (is_prevented) {
		OSBitAndAtomic16(~((uint16_t)P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES), &p->p_vfs_iopolicy);
	} else {
		OSBitOrAtomic16((uint16_t)P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES, &p->p_vfs_iopolicy);
	}
	return 0;
}
11600
11601 static int
nspace_materialization_get_thread_state(int * is_prevented)11602 nspace_materialization_get_thread_state(int *is_prevented)
11603 {
11604 uthread_t ut = current_uthread();
11605
11606 *is_prevented = (ut->uu_flag & UT_NSPACE_NODATALESSFAULTS) ? 1 : 0;
11607 return 0;
11608 }
11609
11610 static int
nspace_materialization_set_thread_state(int is_prevented)11611 nspace_materialization_set_thread_state(int is_prevented)
11612 {
11613 uthread_t ut = current_uthread();
11614
11615 if (is_prevented) {
11616 ut->uu_flag |= UT_NSPACE_NODATALESSFAULTS;
11617 } else {
11618 ut->uu_flag &= ~UT_NSPACE_NODATALESSFAULTS;
11619 }
11620 return 0;
11621 }
11622
/* the vfs.nspace branch: parent node for the resolver sysctls below */
SYSCTL_NODE(_vfs, OID_AUTO, nspace, CTLFLAG_RW | CTLFLAG_LOCKED, NULL, "vfs nspace hinge");
11625
11626 static int
sysctl_nspace_resolver(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)11627 sysctl_nspace_resolver(__unused struct sysctl_oid *oidp,
11628 __unused void *arg1, __unused int arg2, struct sysctl_req *req)
11629 {
11630 struct proc *p = req->p;
11631 int new_value, old_value, changed = 0;
11632 int error;
11633
11634 error = nspace_resolver_get_proc_state(p, &old_value);
11635 if (error) {
11636 return error;
11637 }
11638
11639 error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
11640 &changed);
11641 if (error == 0 && changed) {
11642 error = nspace_resolver_set_proc_state(p, new_value);
11643 }
11644 return error;
11645 }
11646
/*
 * vfs.nspace.resolver: decorate this process as the dataless file
 * resolver.  CTLFLAG_ANYBODY lets any process attempt the write;
 * the handler itself enforces uid 0 + the resolver entitlement.
 */
SYSCTL_PROC(_vfs_nspace, OID_AUTO, resolver,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
    0, 0, sysctl_nspace_resolver, "I", "");
11651
/*
 * sysctl handler for vfs.nspace.prevent_materialization: read or set
 * the calling process's "don't materialize dataless files" policy.
 */
static int
sysctl_nspace_prevent_materialization(__unused struct sysctl_oid *oidp,
    __unused void *arg1, __unused int arg2, struct sysctl_req *req)
{
	struct proc *p = req->p;
	int new_value, old_value, changed = 0;
	int error;

	error = nspace_materialization_get_proc_state(p, &old_value);
	if (error) {
		return error;
	}

	error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
	    &changed);
	if (error == 0 && changed) {
		error = nspace_materialization_set_proc_state(p, new_value);
	}
	return error;
}

/* decorate this process as not wanting to materialize dataless files */
SYSCTL_PROC(_vfs_nspace, OID_AUTO, prevent_materialization,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
    0, 0, sysctl_nspace_prevent_materialization, "I", "");
11677
/*
 * sysctl handler for vfs.nspace.thread_prevent_materialization: read
 * or set the calling *thread's* "don't materialize dataless files"
 * decoration (per-thread analogue of prevent_materialization above).
 */
static int
sysctl_nspace_thread_prevent_materialization(__unused struct sysctl_oid *oidp,
    __unused void *arg1, __unused int arg2, struct sysctl_req *req)
{
	int new_value, old_value, changed = 0;
	int error;

	error = nspace_materialization_get_thread_state(&old_value);
	if (error) {
		return error;
	}

	error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
	    &changed);
	if (error == 0 && changed) {
		error = nspace_materialization_set_thread_state(new_value);
	}
	return error;
}

/* decorate this thread as not wanting to materialize dataless files */
SYSCTL_PROC(_vfs_nspace, OID_AUTO, thread_prevent_materialization,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
    0, 0, sysctl_nspace_thread_prevent_materialization, "I", "");
11702
/*
 * sysctl handler for vfs.nspace.complete: the resolver reports a
 * completed request here.  Only the decorated resolver process may
 * write (EPERM otherwise).  The payload is two uint32_t's -- request
 * ID and errno -- optionally followed by a uint64_t generation count
 * of the materialized object.
 */
static int
sysctl_nspace_complete(__unused struct sysctl_oid *oidp, __unused void *arg1,
    __unused int arg2, struct sysctl_req *req)
{
	struct proc *p = req->p;
	uint32_t req_status[2] = { 0, 0 };
	uint64_t gencount = 0;
	int error, is_resolver, changed = 0, gencount_changed;

	error = nspace_resolver_get_proc_state(p, &is_resolver);
	if (error) {
		return error;
	}

	if (!is_resolver) {
		return EPERM;
	}

	error = sysctl_io_opaque(req, req_status, sizeof(req_status),
	    &changed);
	if (error) {
		return error;
	}

	// get the gencount if it was passed
	// NOTE(review): this second sysctl_io_opaque() appears to continue
	// reading the writer's buffer where the first left off; if only the
	// 8-byte tuple was sent, it fails and we fall back to gencount 0 --
	// confirm against sysctl_io_opaque()/SYSCTL_IN semantics.
	error = sysctl_io_opaque(req, &gencount, sizeof(gencount),
	    &gencount_changed);
	if (error) {
		gencount = 0;
		// we ignore the error because the gencount was optional
		error = 0;
	}

	/*
	 * req_status[0] is the req_id
	 *
	 * req_status[1] is the errno
	 */
	if (error == 0 && changed) {
		nspace_resolver_req_completed(req_status[0],
		    (int)req_status[1], gencount);
	}
	return error;
}

/* Resolver reports completed reqs here. */
SYSCTL_PROC(_vfs_nspace, OID_AUTO, complete,
    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
    0, 0, sysctl_nspace_complete, "-", "");
11752
11753 #endif /* CONFIG_DATALESS_FILES */
11754
11755 #if CONFIG_DATALESS_FILES
11756 #define __no_dataless_unused /* nothing */
11757 #else
11758 #define __no_dataless_unused __unused
11759 #endif
11760
/*
 * Decide whether the given vfs context may materialize dataless
 * objects.  Returns:
 *
 *   0           - materialization may proceed
 *   EDEADLK     - materialization is prevented (kernel context, a
 *                 thread/process decorated no-materialize, or the
 *                 default policy)
 *   EJUSTRETURN - context holds the dataless-manipulation
 *                 entitlement; proceed as if the object were not
 *                 dataless
 *
 * When CONFIG_DATALESS_FILES is disabled, always returns EDEADLK.
 */
int
vfs_context_dataless_materialization_is_prevented(
	vfs_context_t const ctx __no_dataless_unused)
{
#if CONFIG_DATALESS_FILES
	proc_t const p = vfs_context_proc(ctx);
	thread_t const t = vfs_context_thread(ctx);
	uthread_t const ut = t ? get_bsdthread_info(t) : NULL;

	/*
	 * Kernel context ==> return EDEADLK, as we would with any random
	 * process decorated as no-materialize.
	 */
	if (ctx == vfs_context_kernel()) {
		return EDEADLK;
	}

	/*
	 * If the process has the dataless-manipulation entitlement,
	 * materialization is prevented, and depending on the kind
	 * of file system operation, things get to proceed as if the
	 * object is not dataless.
	 */
	if (vfs_context_is_dataless_manipulator(ctx)) {
		return EJUSTRETURN;
	}

	/*
	 * Per-thread decorations override any process-wide decorations.
	 * (Foundation uses this, and this overrides even the dataless-
	 * manipulation entitlement so as to make API contracts consistent.)
	 */
	if (ut != NULL) {
		if (ut->uu_flag & UT_NSPACE_NODATALESSFAULTS) {
			return EDEADLK;
		}
		if (ut->uu_flag & UT_NSPACE_FORCEDATALESSFAULTS) {
			return 0;
		}
	}

	/*
	 * If the process's iopolicy specifies that dataless files
	 * can be materialized, then we let it go ahead.
	 */
	if (p->p_vfs_iopolicy & P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES) {
		return 0;
	}
#endif /* CONFIG_DATALESS_FILES */

	/*
	 * The default behavior is to not materialize dataless files;
	 * return to the caller that deadlock was detected.
	 */
	return EDEADLK;
}
11817
/*
 * Allocate the namespace-resolver request hash table.  A no-op when
 * CONFIG_DATALESS_FILES is disabled.
 */
void
nspace_resolver_init(void)
{
#if CONFIG_DATALESS_FILES
	nspace_resolver_request_hashtbl =
	    hashinit(NSPACE_RESOLVER_REQ_HASHSIZE,
	    M_VNODE /* XXX */, &nspace_resolver_request_hashmask);
#endif /* CONFIG_DATALESS_FILES */
}
11827
/*
 * Tear down resolver state for process p (used on process exit and
 * on explicit un-decoration -- see nspace_resolver_set_proc_state()).
 *
 * If p is the registered resolver, every outstanding request is
 * completed with ETIMEDOUT so that waiting threads do not hang
 * forever, and the global resolver pointer is cleared.  Harmless for
 * processes that are not the resolver.
 */
void
nspace_resolver_exited(struct proc *p __no_dataless_unused)
{
#if CONFIG_DATALESS_FILES
	struct nspace_resolver_requesthead *bucket;
	struct nspace_resolver_request *req;
	u_long idx;

	NSPACE_REQ_LOCK();

	if ((p->p_lflag & P_LNSPACE_RESOLVER) &&
	    p == nspace_resolver_proc) {
		/* Fail every outstanding request; the resolver is gone. */
		for (idx = 0; idx <= nspace_resolver_request_hashmask; idx++) {
			bucket = &nspace_resolver_request_hashtbl[idx];
			LIST_FOREACH(req, bucket, r_hashlink) {
				nspace_resolver_req_mark_complete(req,
				    ETIMEDOUT);
			}
		}
		nspace_resolver_proc = NULL;
	}

	NSPACE_REQ_UNLOCK();
#endif /* CONFIG_DATALESS_FILES */
}
11853
/*
 * Materialize a dataless item: thin wrapper around
 * resolve_nspace_item_ext() with no extended arguments.
 */
int
resolve_nspace_item(struct vnode *vp, uint64_t op)
{
	return resolve_nspace_item_ext(vp, op, NULL);
}
11859
11860 #define DATALESS_RESOLVER_ENTITLEMENT \
11861 "com.apple.private.vfs.dataless-resolver"
11862 #define DATALESS_MANIPULATION_ENTITLEMENT \
11863 "com.apple.private.vfs.dataless-manipulation"
11864
#if CONFIG_DATALESS_FILES
/*
 * Return TRUE if the vfs context is associated with the dataless
 * resolver.  This checks the entitlement only; the uid == 0
 * requirement is enforced separately in nspace_resolver_set_proc_state().
 */
static boolean_t
vfs_context_is_dataless_resolver(vfs_context_t ctx __no_dataless_unused)
{
	return IOTaskHasEntitlement(vfs_context_task(ctx),
	    DATALESS_RESOLVER_ENTITLEMENT);
}
#endif /* CONFIG_DATALESS_FILES */
11877
11878 /*
11879 * Return TRUE if the vfs context is associated with a process entitled
11880 * for dataless manipulation.
11881 *
11882 * XXX Arguably belongs in vfs_subr.c, but is here because of the
11883 * complication around CONFIG_DATALESS_FILES.
11884 */
11885 boolean_t
vfs_context_is_dataless_manipulator(vfs_context_t ctx __no_dataless_unused)11886 vfs_context_is_dataless_manipulator(vfs_context_t ctx __no_dataless_unused)
11887 {
11888 #if CONFIG_DATALESS_FILES
11889 task_t task = vfs_context_task(ctx);
11890 return IOTaskHasEntitlement(task, DATALESS_MANIPULATION_ENTITLEMENT) ||
11891 IOTaskHasEntitlement(task, DATALESS_RESOLVER_ENTITLEMENT);
11892 #else
11893 return false;
11894 #endif /* CONFIG_DATALESS_FILES */
11895 }
11896
11897 #if CONFIG_DATALESS_FILES
11898 static void
log_materialization_prevented(vnode_t vp,uint64_t op)11899 log_materialization_prevented(vnode_t vp, uint64_t op)
11900 {
11901 char p_name[MAXCOMLEN + 1];
11902 char *vntype;
11903 proc_selfname(&p_name[0], sizeof(p_name));
11904
11905 if (vp->v_type == VREG) {
11906 vntype = "File";
11907 } else if (vp->v_type == VDIR) {
11908 vntype = "Dir";
11909 } else if (vp->v_type == VLNK) {
11910 vntype = "SymLink";
11911 } else {
11912 vntype = "Other";
11913 }
11914
11915 #if DEVELOPMENT
11916 char *path = NULL;
11917 int len;
11918
11919 path = get_pathbuff();
11920 len = MAXPATHLEN;
11921 if (path) {
11922 vn_getpath(vp, path, &len);
11923 }
11924
11925 os_log_debug(OS_LOG_DEFAULT,
11926 "NSPACE process %s (pid %d) is decorated as no-materialization (op %lld; %s) path: %s",
11927 p_name, proc_selfpid(),
11928 op, vntype, path ? path : "<unknown-path>");
11929 if (path) {
11930 release_pathbuff(path);
11931 }
11932 #else
11933 os_log_debug(OS_LOG_DEFAULT,
11934 "NSPACE process %s (pid %d) is decorated as no-materialization (op %lld; %s)",
11935 p_name, proc_selfpid(),
11936 op, vntype);
11937 #endif
11938 }
11939 #endif /* CONFIG_DATALESS_FILES */
11940
11941 static int
vfs_materialize_item(struct vnode * vp __no_dataless_unused,uint64_t op __no_dataless_unused,int64_t offset __no_dataless_unused,int64_t size __no_dataless_unused,char * lookup_name __no_dataless_unused,size_t const namelen __no_dataless_unused)11942 vfs_materialize_item(
11943 struct vnode *vp __no_dataless_unused,
11944 uint64_t op __no_dataless_unused,
11945 int64_t offset __no_dataless_unused,
11946 int64_t size __no_dataless_unused,
11947 char *lookup_name __no_dataless_unused,
11948 size_t const namelen __no_dataless_unused)
11949 {
11950 #if CONFIG_DATALESS_FILES
11951 struct nspace_resolver_request req;
11952 kern_return_t kern_ret;
11953 mach_port_t mach_port;
11954 char *path = NULL;
11955 vfs_context_t context;
11956 int path_len;
11957 int error;
11958 audit_token_t atoken;
11959
11960 /*
11961 * If this is a snapshot event and the vnode is on a disk image just
11962 * pretend nothing happened since any change to the disk image will
11963 * cause the disk image itself to get backed up and this avoids multi-
11964 * way deadlocks between the snapshot handler and the ever popular
11965 * diskimages-helper process. The variable nspace_allow_virtual_devs
11966 * allows this behavior to be overridden (for use by the Mobile
11967 * TimeMachine testing infrastructure which uses disk images).
11968 */
11969 if (op & NAMESPACE_HANDLER_SNAPSHOT_EVENT) {
11970 os_log_debug(OS_LOG_DEFAULT, "NSPACE SNAPSHOT not handled");
11971 return ENOTSUP;
11972 }
11973
11974 context = vfs_context_current();
11975
11976 error = vfs_context_dataless_materialization_is_prevented(context);
11977 if (error) {
11978 log_materialization_prevented(vp, op);
11979 return error;
11980 }
11981
11982 kern_ret = host_get_filecoordinationd_port(host_priv_self(),
11983 &mach_port);
11984 if (kern_ret != KERN_SUCCESS || !IPC_PORT_VALID(mach_port)) {
11985 os_log_error(OS_LOG_DEFAULT, "NSPACE no port");
11986 /*
11987 * Treat this like being unable to access the backing store
11988 * server.
11989 */
11990 return ETIMEDOUT;
11991 }
11992
11993 int path_alloc_len = MAXPATHLEN;
11994 do {
11995 path = kalloc_data(path_alloc_len, Z_WAITOK | Z_ZERO);
11996 if (path == NULL) {
11997 return ENOMEM;
11998 }
11999
12000 path_len = path_alloc_len;
12001 error = vn_getpath(vp, path, &path_len);
12002 if (error == 0) {
12003 break;
12004 } else if (error == ENOSPC) {
12005 kfree_data(path, path_alloc_len);
12006 path = NULL;
12007 } else {
12008 goto out_release_port;
12009 }
12010 } while (error == ENOSPC && (path_alloc_len += MAXPATHLEN) && path_alloc_len <= FSGETPATH_MAXBUFLEN);
12011
12012 error = vfs_context_copy_audit_token(context, &atoken);
12013 if (error) {
12014 goto out_release_port;
12015 }
12016
12017 req.r_req_id = next_nspace_req_id();
12018 req.r_resolver_error = 0;
12019 req.r_flags = 0;
12020 req.r_vp = vp;
12021
12022 NSPACE_REQ_LOCK();
12023 error = nspace_resolver_req_add(&req);
12024 NSPACE_REQ_UNLOCK();
12025 if (error) {
12026 goto out_release_port;
12027 }
12028
12029 os_log_debug(OS_LOG_DEFAULT, "NSPACE resolve_path call");
12030 if (vp->v_type == VDIR) {
12031 char *tmpname = NULL;
12032
12033 /*
12034 * If the caller provided a lookup_name *and* a name length,
12035 * then we assume the lookup_name is not NUL-terminated.
12036 * Allocate a temporary buffer in this case to provide
12037 * a NUL-terminated path name to the IPC call.
12038 */
12039 if (lookup_name != NULL && namelen != 0) {
12040 if (namelen >= PATH_MAX) {
12041 error = EINVAL;
12042 goto out_release_port;
12043 }
12044 tmpname = zalloc(ZV_NAMEI);
12045 strlcpy(tmpname, lookup_name, namelen + 1);
12046 lookup_name = tmpname;
12047 } else if (lookup_name != NULL) {
12048 /*
12049 * If the caller provided a lookup_name with a
12050 * zero name length, then we assume it's NUL-
12051 * terminated. Verify it has a valid length.
12052 */
12053 if (strlen(lookup_name) >= PATH_MAX) {
12054 error = EINVAL;
12055 goto out_release_port;
12056 }
12057 }
12058
12059 kern_ret = send_vfs_resolve_dir_with_audit_token(mach_port,
12060 req.r_req_id, (uint32_t)(op & 0xffffffff),
12061 lookup_name == NULL ? "" : lookup_name, path, atoken);
12062
12063 if (tmpname != NULL) {
12064 zfree(ZV_NAMEI, tmpname);
12065
12066 /*
12067 * Poison lookup_name rather than reference
12068 * freed memory.
12069 */
12070 lookup_name = NULL;
12071 }
12072 } else {
12073 kern_ret = send_vfs_resolve_file_with_audit_token(mach_port,
12074 req.r_req_id, (uint32_t)(op & 0xffffffff),
12075 offset, size, path, atoken);
12076 }
12077 if (kern_ret != KERN_SUCCESS) {
12078 /*
12079 * Also treat this like being unable to access the backing
12080 * store server.
12081 */
12082 os_log_error(OS_LOG_DEFAULT, "NSPACE resolve failure: %d",
12083 kern_ret);
12084 error = ETIMEDOUT;
12085
12086 NSPACE_REQ_LOCK();
12087 nspace_resolver_req_remove(&req);
12088 NSPACE_REQ_UNLOCK();
12089 goto out_release_port;
12090 }
12091
12092 /*
12093 * Give back the memory we allocated earlier while we wait; we
12094 * no longer need it.
12095 */
12096 kfree_data(path, path_alloc_len);
12097 path = NULL;
12098
12099 /*
12100 * Request has been submitted to the resolver. Now (interruptibly)
12101 * wait for completion. Upon requrn, the request will have been
12102 * removed from the lookup table.
12103 */
12104 error = nspace_resolver_req_wait(&req);
12105
12106 out_release_port:
12107 if (path != NULL) {
12108 kfree_data(path, path_alloc_len);
12109 path = NULL;
12110 }
12111 ipc_port_release_send(mach_port);
12112
12113 return error;
12114 #else
12115 return ENOTSUP;
12116 #endif /* CONFIG_DATALESS_FILES */
12117 }
12118
12119 /*
12120 * vfs_materialize_file: Materialize a regular file.
12121 *
12122 * Inputs:
12123 * vp The dataless file to be materialized.
12124 *
12125 * op What kind of operation is being performed:
12126 * -> NAMESPACE_HANDLER_READ_OP
12127 * -> NAMESPACE_HANDLER_WRITE_OP
12128 * -> NAMESPACE_HANDLER_LINK_CREATE
12129 * -> NAMESPACE_HANDLER_DELETE_OP
12130 * -> NAMESPACE_HANDLER_TRUNCATE_OP
12131 * -> NAMESPACE_HANDLER_RENAME_OP
12132 *
12133 * offset offset of I/O for READ or WRITE. Ignored for
12134 * other ops.
12135 *
12136 * size size of I/O for READ or WRITE Ignored for
12137 * other ops.
12138 *
12139 * If offsize or size are -1 for a READ or WRITE, then the resolver should
12140 * consider the range to be unknown.
12141 *
12142 * Upon successful return, the caller may proceed with the operation.
12143 * N.B. the file may still be "dataless" in this case.
12144 */
12145 int
vfs_materialize_file(struct vnode * vp,uint64_t op,int64_t offset,int64_t size)12146 vfs_materialize_file(
12147 struct vnode *vp,
12148 uint64_t op,
12149 int64_t offset,
12150 int64_t size)
12151 {
12152 if (vp->v_type != VREG) {
12153 return EFTYPE;
12154 }
12155 return vfs_materialize_item(vp, op, offset, size, NULL, 0);
12156 }
12157
12158 /*
12159 * vfs_materialize_dir:
12160 *
12161 * Inputs:
12162 * vp The dataless directory to be materialized.
12163 *
12164 * op What kind of operation is being performed:
12165 * -> NAMESPACE_HANDLER_READ_OP
12166 * -> NAMESPACE_HANDLER_WRITE_OP
12167 * -> NAMESPACE_HANDLER_DELETE_OP
12168 * -> NAMESPACE_HANDLER_RENAME_OP
12169 * -> NAMESPACE_HANDLER_LOOKUP_OP
12170 *
12171 * lookup_name Name being looked up for a LOOKUP op. Ignored for
12172 * other ops. May or may not be NUL-terminated; see below.
12173 *
12174 * namelen If non-zero, then lookup_name is assumed to not be NUL-
12175 * terminated and namelen is the number of valid bytes in
12176 * lookup_name. If zero, then lookup_name is assumed to be
12177 * NUL-terminated.
12178 *
12179 * Upon successful return, the caller may proceed with the operation.
12180 * N.B. the directory may still be "dataless" in this case.
12181 */
12182 int
vfs_materialize_dir(struct vnode * vp,uint64_t op,char * lookup_name,size_t namelen)12183 vfs_materialize_dir(
12184 struct vnode *vp,
12185 uint64_t op,
12186 char *lookup_name,
12187 size_t namelen)
12188 {
12189 if (vp->v_type != VDIR) {
12190 return EFTYPE;
12191 }
12192 if (op == NAMESPACE_HANDLER_LOOKUP_OP && lookup_name == NULL) {
12193 return EINVAL;
12194 }
12195 return vfs_materialize_item(vp, op, 0, 0, lookup_name, namelen);
12196 }
12197
12198 int
resolve_nspace_item_ext(struct vnode * vp __no_dataless_unused,uint64_t op __no_dataless_unused,void * arg __unused)12199 resolve_nspace_item_ext(
12200 struct vnode *vp __no_dataless_unused,
12201 uint64_t op __no_dataless_unused,
12202 void *arg __unused)
12203 {
12204 #if CONFIG_DATALESS_FILES
12205 int error;
12206 mach_port_t mp;
12207 char *path = NULL;
12208 int path_len;
12209 kern_return_t kr;
12210 struct nspace_resolver_request req;
12211
12212 // only allow namespace events on regular files, directories and symlinks.
12213 if (vp->v_type != VREG && vp->v_type != VDIR && vp->v_type != VLNK) {
12214 return EFTYPE;
12215 }
12216
12217 //
12218 // if this is a snapshot event and the vnode is on a
12219 // disk image just pretend nothing happened since any
12220 // change to the disk image will cause the disk image
12221 // itself to get backed up and this avoids multi-way
12222 // deadlocks between the snapshot handler and the ever
12223 // popular diskimages-helper process. the variable
12224 // nspace_allow_virtual_devs allows this behavior to
12225 // be overridden (for use by the Mobile TimeMachine
12226 // testing infrastructure which uses disk images)
12227 //
12228 if (op & NAMESPACE_HANDLER_SNAPSHOT_EVENT) {
12229 os_log_debug(OS_LOG_DEFAULT, "NSPACE SNAPSHOT not handled");
12230 return ENOTSUP;
12231 }
12232
12233 error = vfs_context_dataless_materialization_is_prevented(
12234 vfs_context_current());
12235 if (error) {
12236 log_materialization_prevented(vp, op);
12237 return error;
12238 }
12239
12240 kr = host_get_filecoordinationd_port(host_priv_self(), &mp);
12241 if (kr != KERN_SUCCESS || !IPC_PORT_VALID(mp)) {
12242 os_log_error(OS_LOG_DEFAULT, "NSPACE no port");
12243 // Treat this like being unable to access the backing
12244 // store server.
12245 return ETIMEDOUT;
12246 }
12247
12248 int path_alloc_len = MAXPATHLEN;
12249 do {
12250 path = kalloc_data(path_alloc_len, Z_WAITOK | Z_ZERO);
12251 if (path == NULL) {
12252 return ENOMEM;
12253 }
12254
12255 path_len = path_alloc_len;
12256 error = vn_getpath(vp, path, &path_len);
12257 if (error == 0) {
12258 break;
12259 } else if (error == ENOSPC) {
12260 kfree_data(path, path_alloc_len);
12261 path = NULL;
12262 } else {
12263 goto out_release_port;
12264 }
12265 } while (error == ENOSPC && (path_alloc_len += MAXPATHLEN) && path_alloc_len <= FSGETPATH_MAXBUFLEN);
12266
12267 if (error == 0) {
12268 int xxx_rdar44371223; /* XXX Mig bug */
12269 req.r_req_id = next_nspace_req_id();
12270 req.r_resolver_error = 0;
12271 req.r_flags = 0;
12272
12273 if ((error = vnode_ref(vp)) == 0) { // take a ref so that the vnode doesn't go away
12274 req.r_vp = vp;
12275 } else {
12276 goto out_release_port;
12277 }
12278
12279 NSPACE_REQ_LOCK();
12280 error = nspace_resolver_req_add(&req);
12281 NSPACE_REQ_UNLOCK();
12282 if (error) {
12283 vnode_rele(req.r_vp);
12284 goto out_release_port;
12285 }
12286
12287 os_log_debug(OS_LOG_DEFAULT, "NSPACE resolve_path call");
12288 kr = send_nspace_resolve_path(mp, req.r_req_id,
12289 proc_getpid(current_proc()), (uint32_t)(op & 0xffffffff),
12290 path, &xxx_rdar44371223);
12291 if (kr != KERN_SUCCESS) {
12292 // Also treat this like being unable to access
12293 // the backing store server.
12294 os_log_error(OS_LOG_DEFAULT,
12295 "NSPACE resolve_path failure: %d", kr);
12296 error = ETIMEDOUT;
12297
12298 NSPACE_REQ_LOCK();
12299 nspace_resolver_req_remove(&req);
12300 NSPACE_REQ_UNLOCK();
12301 vnode_rele(req.r_vp);
12302 goto out_release_port;
12303 }
12304
12305 // Give back the memory we allocated earlier while
12306 // we wait; we no longer need it.
12307 kfree_data(path, path_alloc_len);
12308 path = NULL;
12309
12310 // Request has been submitted to the resolver.
12311 // Now (interruptibly) wait for completion.
12312 // Upon requrn, the request will have been removed
12313 // from the lookup table.
12314 error = nspace_resolver_req_wait(&req);
12315
12316 vnode_rele(req.r_vp);
12317 }
12318
12319 out_release_port:
12320 if (path != NULL) {
12321 kfree_data(path, path_alloc_len);
12322 path = NULL;
12323 }
12324 ipc_port_release_send(mp);
12325
12326 return error;
12327 #else
12328 return ENOTSUP;
12329 #endif /* CONFIG_DATALESS_FILES */
12330 }
12331
/*
 * No-op handler for namespace snapshot events: all arguments are
 * ignored and success is always reported.
 */
int
nspace_snapshot_event(__unused vnode_t vp, __unused time_t ctime,
    __unused uint64_t op_type, __unused void *arg)
{
	return 0;
}
12338
12339 #if 0
/*
 * (Compiled out via the surrounding #if 0.)
 * Build a volfs-style "/.vol/<fsid>/<fileid>" path for vp into the
 * caller-supplied buffer; *len is both the buffer size in and the
 * generated length (including the NUL) out.  Returns 0 on success,
 * -1 if the vnode's attributes could not be fetched.
 */
static int
build_volfs_path(struct vnode *vp, char *path, int *len)
{
	struct vnode_attr va;
	int ret;

	VATTR_INIT(&va);
	VATTR_WANTED(&va, va_fsid);
	VATTR_WANTED(&va, va_fileid);

	if (vnode_getattr(vp, &va, vfs_context_kernel()) != 0) {
		/* Fall back to a recognizable bogus path on failure. */
		*len = snprintf(path, *len, "/non/existent/path/because/vnode_getattr/failed") + 1;
		ret = -1;
	} else {
		*len = snprintf(path, *len, "/.vol/%d/%lld", (dev_t)va.va_fsid, va.va_fileid) + 1;
		ret = 0;
	}

	return ret;
}
12360 #endif
12361
12362 static unsigned long
fsctl_bogus_command_compat(unsigned long cmd)12363 fsctl_bogus_command_compat(unsigned long cmd)
12364 {
12365 switch (cmd) {
12366 case IOCBASECMD(FSIOC_SYNC_VOLUME):
12367 return FSIOC_SYNC_VOLUME;
12368 case IOCBASECMD(FSIOC_ROUTEFS_SETROUTEID):
12369 return FSIOC_ROUTEFS_SETROUTEID;
12370 case IOCBASECMD(FSIOC_SET_PACKAGE_EXTS):
12371 return FSIOC_SET_PACKAGE_EXTS;
12372 case IOCBASECMD(FSIOC_SET_FSTYPENAME_OVERRIDE):
12373 return FSIOC_SET_FSTYPENAME_OVERRIDE;
12374 case IOCBASECMD(DISK_CONDITIONER_IOC_GET):
12375 return DISK_CONDITIONER_IOC_GET;
12376 case IOCBASECMD(DISK_CONDITIONER_IOC_SET):
12377 return DISK_CONDITIONER_IOC_SET;
12378 case IOCBASECMD(FSIOC_FIOSEEKHOLE):
12379 return FSIOC_FIOSEEKHOLE;
12380 case IOCBASECMD(FSIOC_FIOSEEKDATA):
12381 return FSIOC_FIOSEEKDATA;
12382 case IOCBASECMD(SPOTLIGHT_IOC_GET_MOUNT_TIME):
12383 return SPOTLIGHT_IOC_GET_MOUNT_TIME;
12384 case IOCBASECMD(SPOTLIGHT_IOC_GET_LAST_MTIME):
12385 return SPOTLIGHT_IOC_GET_LAST_MTIME;
12386 }
12387
12388 return cmd;
12389 }
12390
/*
 * chflags0() callback used by handle_flags(): hand the
 * fsioc_cas_bsdflags argument to the filesystem's FSIOC_CAS_BSDFLAGS
 * ioctl to perform the compare-and-swap of the BSD flags.
 */
static int
cas_bsdflags_setattr(vnode_t vp, void *arg, vfs_context_t ctx)
{
	return VNOP_IOCTL(vp, FSIOC_CAS_BSDFLAGS, arg, FWRITE, ctx);
}
12396
/*
 * FSIOC_SYNC_VOLUME: sync the volume containing vp.  The caller's
 * iocount on vp is dropped (and *arg_vp NULLed to tell the caller),
 * since holding the root vnode across the sync can deadlock against
 * vnode iteration.
 */
static int __attribute__((noinline))
handle_sync_volume(vnode_t vp, vnode_t *arg_vp, caddr_t data, vfs_context_t ctx)
{
	struct vfs_attr vfa;
	mount_t mp = vp->v_mount;
	unsigned arg;
	int error;

	/* record vid of vp so we can drop it below. */
	uint32_t vvid = vp->v_id;

	/*
	 * Then grab mount_iterref so that we can release the vnode.
	 * Without this, a thread may call vnode_iterate_prepare then
	 * get into a deadlock because we've never released the root vp
	 */
	error = mount_iterref(mp, 0);
	if (error) {
		return error;
	}
	/* hold vp so the pointer stays valid after the iocount is dropped */
	vnode_hold(vp);
	vnode_put(vp);

	arg = MNT_NOWAIT;
	if (*(uint32_t*)data & FSCTL_SYNC_WAIT) {
		arg = MNT_WAIT;
	}

	/*
	 * If the filesystem supports multiple filesystems in a
	 * partition (e.g. APFS volumes in a container), it knows
	 * that the waitfor argument to VFS_SYNC are flags.
	 */
	VFSATTR_INIT(&vfa);
	VFSATTR_WANTED(&vfa, f_capabilities);
	if ((vfs_getattr(mp, &vfa, vfs_context_current()) == 0) &&
	    VFSATTR_IS_SUPPORTED(&vfa, f_capabilities) &&
	    ((vfa.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_SHARED_SPACE)) &&
	    ((vfa.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_SHARED_SPACE))) {
		arg |= MNT_VOLUME;
	}

	/* issue the sync for this volume */
	(void)sync_callback(mp, &arg);

	/*
	 * Then release the mount_iterref once we're done syncing; it's not
	 * needed for the VNOP_IOCTL below
	 */
	mount_iterdrop(mp);

	/*
	 * NOTE(review): at this point `arg` holds MNT_* sync flags, yet it
	 * is tested against FSCTL_SYNC_FULLSYNC, a caller-interface flag
	 * that arrives in *data.  Confirm whether this relies on the two
	 * flag namespaces intentionally aliasing.
	 */
	if (arg & FSCTL_SYNC_FULLSYNC) {
		/* re-obtain vnode iocount on the root vp, if possible */
		error = vnode_getwithvid(vp, vvid);
		if (error == 0) {
			error = VNOP_IOCTL(vp, F_FULLFSYNC, (caddr_t)NULL, 0, ctx);
			vnode_put(vp);
		}
	}
	vnode_drop(vp);
	/* mark the argument VP as having been released */
	*arg_vp = NULL;
	return error;
}
12461
12462 #if ROUTEFS
12463 static int __attribute__((noinline))
handle_routes(user_addr_t udata)12464 handle_routes(user_addr_t udata)
12465 {
12466 char routepath[MAXPATHLEN];
12467 size_t len = 0;
12468 int error;
12469
12470 if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
12471 return error;
12472 }
12473 bzero(routepath, MAXPATHLEN);
12474 error = copyinstr(udata, &routepath[0], MAXPATHLEN, &len);
12475 if (error) {
12476 return error;
12477 }
12478 error = routefs_kernel_mount(routepath);
12479 return error;
12480 }
12481 #endif
12482
12483 static int __attribute__((noinline))
handle_flags(vnode_t vp,caddr_t data,vfs_context_t ctx)12484 handle_flags(vnode_t vp, caddr_t data, vfs_context_t ctx)
12485 {
12486 struct fsioc_cas_bsdflags *cas = (struct fsioc_cas_bsdflags *)data;
12487 struct vnode_attr va;
12488 int error;
12489
12490 VATTR_INIT(&va);
12491 VATTR_SET(&va, va_flags, cas->new_flags);
12492
12493 error = chflags0(vp, &va, cas_bsdflags_setattr, cas, ctx);
12494
12495 #if CONFIG_FSE
12496 if (error == 0 && cas->expected_flags == cas->actual_flags && need_fsevent(FSE_STAT_CHANGED, vp)) {
12497 add_fsevent(FSE_STAT_CHANGED, ctx, FSE_ARG_VNODE, vp, FSE_ARG_DONE);
12498 }
12499 #endif
12500
12501 return error;
12502 }
12503
12504 static int __attribute__((noinline))
handle_auth(vnode_t vp,u_long cmd,caddr_t data,u_long options,vfs_context_t ctx)12505 handle_auth(vnode_t vp, u_long cmd, caddr_t data, u_long options, vfs_context_t ctx)
12506 {
12507 struct mount *mp = NULL;
12508 errno_t rootauth = 0;
12509
12510 mp = vp->v_mount;
12511
12512 /*
12513 * query the underlying FS and see if it reports something
12514 * sane for this vnode. If volume is authenticated via
12515 * chunklist, leave that for the caller to determine.
12516 */
12517 rootauth = VNOP_IOCTL(vp, cmd, data, (int)options, ctx);
12518
12519 return rootauth;
12520 }
12521
12522 #define SET_PACKAGE_EXTENSION_ENTITLEMENT \
12523 "com.apple.private.kernel.set-package-extensions"
12524
12525 /*
12526 * Make a filesystem-specific control call:
12527 */
12528 /* ARGSUSED */
/*
 * Guts of the fsctl()/ffsctl() system calls: marshal the ioctl-style
 * argument into a kernel buffer, handle the generic FSIOC_* commands
 * in-line, and pass everything else to the filesystem via VNOP_IOCTL().
 *
 * *arg_vp is set to NULL when the vnode's iocount has been consumed
 * (currently only by FSIOC_SYNC_VOLUME) so the caller knows not to
 * vnode_put() it again.
 */
static int
fsctl_internal(proc_t p, vnode_t *arg_vp, u_long cmd, user_addr_t udata, u_long options, vfs_context_t ctx)
{
	int error = 0;
	boolean_t is64bit;
	u_int size;
#define STK_PARAMS 128
	char stkbuf[STK_PARAMS] = {0};
	caddr_t data, memp;
	vnode_t vp = *arg_vp;

	/* fsctl does not apply to device special files */
	if (vp->v_type == VCHR || vp->v_type == VBLK) {
		return ENOTTY;
	}

	/* map legacy base-command encodings onto the full FSIOC_* values */
	cmd = fsctl_bogus_command_compat(cmd);

	size = IOCPARM_LEN(cmd);
	if (size > IOCPARM_MAX) {
		return EINVAL;
	}

	is64bit = proc_is64bit(p);

	memp = NULL;

	/* use the stack buffer when the argument fits, else heap-allocate */
	if (size > sizeof(stkbuf)) {
		if ((memp = (caddr_t)kalloc_data(size, Z_WAITOK)) == 0) {
			return ENOMEM;
		}
		data = memp;
	} else {
		data = &stkbuf[0];
	};

	if (cmd & IOC_IN) {
		if (size) {
			/* copy the input argument in from user space */
			error = copyin(udata, data, size);
			if (error) {
				if (memp) {
					kfree_data(memp, size);
				}
				return error;
			}
		} else {
			/* zero-size IOC_IN: the "argument" is udata itself */
			if (is64bit) {
				*(user_addr_t *)data = udata;
			} else {
				*(uint32_t *)data = (uint32_t)udata;
			}
		};
	} else if ((cmd & IOC_OUT) && size) {
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		bzero(data, size);
	} else if (cmd & IOC_VOID) {
		if (is64bit) {
			*(user_addr_t *)data = udata;
		} else {
			*(uint32_t *)data = (uint32_t)udata;
		}
	}

	/* Check to see if it's a generic command */
	switch (cmd) {
	case FSIOC_SYNC_VOLUME:
		error = handle_sync_volume(vp, arg_vp, data, ctx);
		break;

	case FSIOC_ROUTEFS_SETROUTEID:
#if ROUTEFS
		error = handle_routes(udata);
#endif
		break;

	case FSIOC_SET_PACKAGE_EXTS: {
		user_addr_t ext_strings;
		uint32_t num_entries;
		uint32_t max_width;

		/* setting the package-extension table is entitlement-gated */
		if (!IOTaskHasEntitlement(vfs_context_task(ctx),
		    SET_PACKAGE_EXTENSION_ENTITLEMENT)) {
			error = EPERM;
			break;
		}

		if ((is64bit && size != sizeof(user64_package_ext_info))
		    || (is64bit == 0 && size != sizeof(user32_package_ext_info))) {
			// either you're 64-bit and passed a 64-bit struct or
			// you're 32-bit and passed a 32-bit struct. otherwise
			// it's not ok.
			error = EINVAL;
			break;
		}

		if (is64bit) {
			if (sizeof(user64_addr_t) > sizeof(user_addr_t)) {
				assert(((user64_package_ext_info *)data)->strings <= UINT32_MAX);
			}
			ext_strings = (user_addr_t)((user64_package_ext_info *)data)->strings;
			num_entries = ((user64_package_ext_info *)data)->num_entries;
			max_width = ((user64_package_ext_info *)data)->max_width;
		} else {
			ext_strings = CAST_USER_ADDR_T(((user32_package_ext_info *)data)->strings);
			num_entries = ((user32_package_ext_info *)data)->num_entries;
			max_width = ((user32_package_ext_info *)data)->max_width;
		}
		error = set_package_extensions_table(ext_strings, num_entries, max_width);
	}
	break;

	case FSIOC_SET_FSTYPENAME_OVERRIDE:
	{
		/* superuser-only: override the reported filesystem type name */
		if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
			break;
		}
		if (vp->v_mount) {
			mount_lock(vp->v_mount);
			if (data[0] != 0) {
				int i;
				for (i = 0; i < MFSTYPENAMELEN; i++) {
					if (!data[i]) {
						goto continue_copy;
					}
				}
				/*
				 * Getting here means we have a user data string which has no
				 * NULL termination in its first MFSTYPENAMELEN bytes.
				 * This is bogus, let's avoid strlcpy-ing the read data and
				 * return an error.
				 */
				error = EINVAL;
				goto unlock;
continue_copy:
				strlcpy(&vp->v_mount->fstypename_override[0], data, MFSTYPENAMELEN);
				vp->v_mount->mnt_kern_flag |= MNTK_TYPENAME_OVERRIDE;
				if (vfs_isrdonly(vp->v_mount) && strcmp(vp->v_mount->fstypename_override, "mtmfs") == 0) {
					vp->v_mount->mnt_kern_flag |= MNTK_EXTENDED_SECURITY;
					vp->v_mount->mnt_kern_flag &= ~MNTK_AUTH_OPAQUE;
				}
			} else {
				/* an empty string clears any existing override */
				if (strcmp(vp->v_mount->fstypename_override, "mtmfs") == 0) {
					vp->v_mount->mnt_kern_flag &= ~MNTK_EXTENDED_SECURITY;
				}
				vp->v_mount->mnt_kern_flag &= ~MNTK_TYPENAME_OVERRIDE;
				vp->v_mount->fstypename_override[0] = '\0';
			}
unlock:
			mount_unlock(vp->v_mount);
		}
	}
	break;

	case DISK_CONDITIONER_IOC_GET: {
		error = disk_conditioner_get_info(vp->v_mount, (disk_conditioner_info *)data);
	}
	break;

	case DISK_CONDITIONER_IOC_SET: {
		error = disk_conditioner_set_info(vp->v_mount, (disk_conditioner_info *)data);
	}
	break;

	case FSIOC_CAS_BSDFLAGS:
		error = handle_flags(vp, data, ctx);
		break;

	case FSIOC_FD_ONLY_OPEN_ONCE: {
		/* succeed only if ours is the sole use reference on the vnode */
		error = 0;
		if (vnode_usecount(vp) > 1) {
			vnode_lock_spin(vp);
			if (vp->v_lflag & VL_HASSTREAMS) {
				if (vnode_isinuse_locked(vp, 1, 1)) {
					error = EBUSY;
				}
			} else if (vnode_usecount(vp) > 1) {
				error = EBUSY;
			}
			vnode_unlock(vp);
		}
	}
	break;

	case FSIOC_EVAL_ROOTAUTH:
		error = handle_auth(vp, cmd, data, options, ctx);
		break;

	case FSIOC_TEST_FSE_ACCESS_GRANTED:
		error = test_fse_access_granted(vp, (unsigned long)udata, ctx);
		break;

	default: {
		/* other, known commands shouldn't be passed down here */
		switch (cmd) {
		case F_PUNCHHOLE:
		case F_TRIM_ACTIVE_FILE:
		case F_RDADVISE:
		case F_TRANSCODEKEY:
		case F_GETPROTECTIONLEVEL:
		case F_GETDEFAULTPROTLEVEL:
		case F_MAKECOMPRESSED:
		case F_SET_GREEDY_MODE:
		case F_SETSTATICCONTENT:
		case F_SETIOTYPE:
		case F_SETBACKINGSTORE:
		case F_GETPATH_MTMINFO:
		case APFSIOC_REVERT_TO_SNAPSHOT:
		case FSIOC_FIOSEEKHOLE:
		case FSIOC_FIOSEEKDATA:
		case HFS_GET_BOOT_INFO:
		case HFS_SET_BOOT_INFO:
		case FIOPINSWAP:
		case F_CHKCLEAN:
		case F_FULLFSYNC:
		case F_BARRIERFSYNC:
		case F_FREEZE_FS:
		case F_THAW_FS:
		case FSIOC_KERNEL_ROOTAUTH:
		case FSIOC_GRAFT_FS:
		case FSIOC_UNGRAFT_FS:
		case FSIOC_AUTH_FS:
			error = EINVAL;
			goto outdrop;
		}
		/* Invoke the filesystem-specific code */
		error = VNOP_IOCTL(vp, cmd, data, (int)options, ctx);
	}
	} /* end switch stmt */

	/*
	 * if no errors, copy any data to user. Size was
	 * already set and checked above.
	 */
	if (error == 0 && (cmd & IOC_OUT) && size) {
		error = copyout(data, udata, size);
	}

outdrop:
	if (memp) {
		kfree_data(memp, size);
	}

	return error;
}
12775
12776 /* ARGSUSED */
int
fsctl(proc_t p, struct fsctl_args *uap, __unused int32_t *retval)
{
	int error;
	struct nameidata nd;
	uint32_t nameiflags;
	vnode_t vp = NULL;
	vfs_context_t ctx = vfs_context_current();

	AUDIT_ARG(cmd, (int)uap->cmd);
	AUDIT_ARG(value32, uap->options);
	/* Get the vnode for the file we are getting info on: */
	nameiflags = 0;
	//
	// if we come through fsctl() then the file is by definition not open.
	// therefore for the FSIOC_FD_ONLY_OPEN_ONCE selector we return an error
	// lest the caller mistakenly thinks the only open is their own (but in
	// reality it's someone else's).
	//
	if (uap->cmd == FSIOC_FD_ONLY_OPEN_ONCE) {
		return EINVAL;
	}
	if ((uap->options & FSOPT_NOFOLLOW) == 0) {
		nameiflags |= FOLLOW;
	}
	/* firmlink control operates on the firmlink itself, bypassing cache */
	if (uap->cmd == FSIOC_FIRMLINK_CTL) {
		nameiflags |= (CN_FIRMLINK_NOFOLLOW | NOCACHE);
	}
	NDINIT(&nd, LOOKUP, OP_FSCTL, nameiflags | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	if ((error = namei(&nd))) {
		goto done;
	}
	vp = nd.ni_vp;
	nameidone(&nd);

#if CONFIG_MACF
	error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd);
	if (error) {
		goto done;
	}
#endif

	error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);

done:
	/* fsctl_internal() may have consumed the iocount and NULLed vp */
	if (vp) {
		vnode_put(vp);
	}
	return error;
}
12828 /* ARGSUSED */
/*
 * fd-based variant of fsctl(): same dispatch through fsctl_internal(),
 * but the target vnode comes from an already-open file descriptor.
 */
int
ffsctl(proc_t p, struct ffsctl_args *uap, __unused int32_t *retval)
{
	int error;
	vnode_t vp = NULL;
	vfs_context_t ctx = vfs_context_current();
	int fd = -1;

	AUDIT_ARG(fd, uap->fd);
	AUDIT_ARG(cmd, (int)uap->cmd);
	AUDIT_ARG(value32, uap->options);

	/* Get the vnode for the file we are getting info on: */
	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}
	fd = uap->fd;
	if ((error = vnode_getwithref(vp))) {
		file_drop(fd);
		return error;
	}

#if CONFIG_MACF
	if ((error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd))) {
		file_drop(fd);
		vnode_put(vp);
		return error;
	}
#endif

	error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);

	file_drop(fd);

	/*validate vp; fsctl_internal() can drop iocount and reset vp to NULL*/
	if (vp) {
		vnode_put(vp);
	}

	return error;
}
12870 /* end of fsctl system call */
12871
12872 #define FILESEC_ACCESS_ENTITLEMENT \
12873 "com.apple.private.vfs.filesec-access"
12874
12875 static int
xattr_entitlement_check(const char * attrname,vfs_context_t ctx,bool setting)12876 xattr_entitlement_check(const char *attrname, vfs_context_t ctx, bool setting)
12877 {
12878 if (strcmp(attrname, KAUTH_FILESEC_XATTR) == 0) {
12879 /*
12880 * get: root and tasks with FILESEC_ACCESS_ENTITLEMENT.
12881 * set: only tasks with FILESEC_ACCESS_ENTITLEMENT.
12882 */
12883 if ((!setting && vfs_context_issuser(ctx)) ||
12884 IOTaskHasEntitlement(vfs_context_task(ctx),
12885 FILESEC_ACCESS_ENTITLEMENT)) {
12886 return 0;
12887 }
12888 }
12889
12890 return EPERM;
12891 }
12892
12893 /*
12894 * Retrieve the data of an extended attribute.
12895 */
int
getxattr(proc_t p, struct getxattr_args *uap, user_ssize_t *retval)
{
	vnode_t vp;
	struct nameidata nd;
	char attrname[XATTR_MAXNAMELEN + 1];
	vfs_context_t ctx = vfs_context_current();
	uio_t auio = NULL;
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	size_t attrsize = 0;
	size_t namelen;
	u_int32_t nameiflags;
	int error;
	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];

	/* XATTR_NOSECURITY and XATTR_NODEFAULT are kernel-internal options */
	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_GETXATTR, nameiflags, spacetype, uap->path, ctx);
	if ((error = namei(&nd))) {
		return error;
	}
	vp = nd.ni_vp;
	nameidone(&nd);

	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
	if (error != 0) {
		goto out;
	}
	/* protected attributes require root or an entitlement */
	if (xattr_protected(attrname) &&
	    (error = xattr_entitlement_check(attrname, ctx, false)) != 0) {
		goto out;
	}
	/*
	 * the specific check for 0xffffffff is a hack to preserve
	 * binary compatibility in K64 with applications that discovered
	 * that passing in a buf pointer and a size of -1 resulted in
	 * just the size of the indicated extended attribute being returned.
	 * this isn't part of the documented behavior, but because of the
	 * original implementation's check for "uap->size > 0", this behavior
	 * was allowed. In K32 that check turned into a signed comparison
	 * even though uap->size is unsigned... in K64, we blow by that
	 * check because uap->size is unsigned and doesn't get sign smeared
	 * in the munger for a 32 bit user app. we also need to add a
	 * check to limit the maximum size of the buffer being passed in...
	 * unfortunately, the underlying filesystems seem to just malloc
	 * the requested size even if the actual extended attribute is tiny.
	 * because that malloc is for kernel wired memory, we have to put a
	 * sane limit on it.
	 *
	 * U32 running on K64 will yield 0x00000000ffffffff for uap->size
	 * U64 running on K64 will yield -1 (64 bits wide)
	 * U32/U64 running on K32 will yield -1 (32 bits wide)
	 */
	if (uap->size == 0xffffffff || uap->size == (size_t)-1) {
		goto no_uio;
	}

	if (uap->value) {
		/* clamp to keep the FS from wiring an absurd amount of memory */
		if (uap->size > (size_t)XATTR_MAXSIZE) {
			uap->size = XATTR_MAXSIZE;
		}

		auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
		    &uio_buf[0], sizeof(uio_buf));
		uio_addiov(auio, uap->value, uap->size);
	}
no_uio:
	/* with a NULL uio, vn_getxattr() only reports the attribute's size */
	error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, ctx);
out:
	vnode_put(vp);

	/* return bytes transferred, or the attribute size in size-only mode */
	if (auio) {
		*retval = uap->size - uio_resid(auio);
	} else {
		*retval = (user_ssize_t)attrsize;
	}

	return error;
}
12978
12979 /*
12980 * Retrieve the data of an extended attribute.
12981 */
12982 int
fgetxattr(proc_t p,struct fgetxattr_args * uap,user_ssize_t * retval)12983 fgetxattr(proc_t p, struct fgetxattr_args *uap, user_ssize_t *retval)
12984 {
12985 vnode_t vp;
12986 char attrname[XATTR_MAXNAMELEN + 1];
12987 vfs_context_t ctx = vfs_context_current();
12988 uio_t auio = NULL;
12989 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
12990 size_t attrsize = 0;
12991 size_t namelen;
12992 int error;
12993 uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];
12994
12995 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
12996 return EINVAL;
12997 }
12998
12999 if ((error = file_vnode(uap->fd, &vp))) {
13000 return error;
13001 }
13002 if ((error = vnode_getwithref(vp))) {
13003 file_drop(uap->fd);
13004 return error;
13005 }
13006 error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
13007 if (error != 0) {
13008 goto out;
13009 }
13010 if (xattr_protected(attrname) &&
13011 (error = xattr_entitlement_check(attrname, ctx, false)) != 0) {
13012 goto out;
13013 }
13014 if (uap->value && uap->size > 0) {
13015 if (uap->size > (size_t)XATTR_MAXSIZE) {
13016 uap->size = XATTR_MAXSIZE;
13017 }
13018
13019 auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
13020 &uio_buf[0], sizeof(uio_buf));
13021 uio_addiov(auio, uap->value, uap->size);
13022 }
13023
13024 error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, vfs_context_current());
13025 out:
13026 (void)vnode_put(vp);
13027 file_drop(uap->fd);
13028
13029 if (auio) {
13030 *retval = uap->size - uio_resid(auio);
13031 } else {
13032 *retval = (user_ssize_t)attrsize;
13033 }
13034 return error;
13035 }
13036
/*
 * Heap-allocated scratch state for setxattr(): a nameidata plus the
 * attribute name and the uio backing store are too large to keep on the
 * kernel stack.  (The old "checkdirs iteration" comment here was stale.)
 */
struct setxattr_ctx {
	struct nameidata nd;                    /* path-lookup state for namei() */
	char attrname[XATTR_MAXNAMELEN + 1];    /* NUL-terminated attribute name */
	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];  /* backing store for the value uio */
};
13043
/*
 * Set the data of an extended attribute on the file named by uap->path.
 *
 * The attribute value is read from uap->value (uap->size bytes, capped at
 * INT_MAX).  On success an FSE_XATTR_MODIFIED event is posted (CONFIG_FSE).
 */
int
setxattr(proc_t p, struct setxattr_args *uap, int *retval)
{
	vnode_t vp;
	vfs_context_t ctx = vfs_context_current();
	uio_t auio = NULL;
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	size_t namelen;
	u_int32_t nameiflags;
	int error;
	struct setxattr_ctx *sactx;

	/* XATTR_NOSECURITY and XATTR_NODEFAULT are kernel-internal flags. */
	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	/* Lookup state is too big for the kernel stack; allocate it. */
	sactx = kalloc_type(struct setxattr_ctx, Z_WAITOK);
	if (sactx == NULL) {
		return ENOMEM;
	}

	error = copyinstr(uap->attrname, sactx->attrname, sizeof(sactx->attrname), &namelen);
	if (error != 0) {
		if (error == EPERM) {
			/* if the string won't fit in attrname, copyinstr emits EPERM */
			error = ENAMETOOLONG;
		}
		/* Otherwise return the default error from copyinstr to detect ERANGE, etc */
		goto out;
	}
	/* Protected (system-reserved) attributes require an entitlement to write. */
	if (xattr_protected(sactx->attrname) &&
	    (error = xattr_entitlement_check(sactx->attrname, ctx, true)) != 0) {
		goto out;
	}
	/* A non-zero size requires a buffer to read the value from. */
	if (uap->size != 0 && uap->value == 0) {
		error = EINVAL;
		goto out;
	}
	if (uap->size > INT_MAX) {
		error = E2BIG;
		goto out;
	}

	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
#if CONFIG_FILE_LEASES
	/* Need the parent directory so its lease can be broken before the write. */
	nameiflags |= WANTPARENT;
#endif
	NDINIT(&sactx->nd, LOOKUP, OP_SETXATTR, nameiflags, spacetype, uap->path, ctx);
	if ((error = namei(&sactx->nd))) {
		goto out;
	}
	vp = sactx->nd.ni_vp;
#if CONFIG_FILE_LEASES
	vnode_breakdirlease(sactx->nd.ni_dvp, false, O_WRONLY);
	vnode_put(sactx->nd.ni_dvp);
#endif
	nameidone(&sactx->nd);

	auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
	    &sactx->uio_buf[0], sizeof(sactx->uio_buf));
	uio_addiov(auio, uap->value, uap->size);

	error = vn_setxattr(vp, sactx->attrname, auio, uap->options, ctx);
#if CONFIG_FSE
	if (error == 0) {
		add_fsevent(FSE_XATTR_MODIFIED, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
	}
#endif
	vnode_put(vp);
out:
	kfree_type(struct setxattr_ctx, sactx);
	*retval = 0;
	return error;
}
13123
13124 /*
13125 * Set the data of an extended attribute.
13126 */
13127 int
fsetxattr(proc_t p,struct fsetxattr_args * uap,int * retval)13128 fsetxattr(proc_t p, struct fsetxattr_args *uap, int *retval)
13129 {
13130 vnode_t vp;
13131 char attrname[XATTR_MAXNAMELEN + 1];
13132 vfs_context_t ctx = vfs_context_current();
13133 uio_t auio = NULL;
13134 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
13135 size_t namelen;
13136 int error;
13137 uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];
13138
13139 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
13140 return EINVAL;
13141 }
13142
13143 error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
13144 if (error != 0) {
13145 if (error == EPERM) {
13146 /* if the string won't fit in attrname, copyinstr emits EPERM */
13147 return ENAMETOOLONG;
13148 }
13149 /* Otherwise return the default error from copyinstr to detect ERANGE, etc */
13150 return error;
13151 }
13152 if (xattr_protected(attrname) &&
13153 (error = xattr_entitlement_check(attrname, ctx, true)) != 0) {
13154 return error;
13155 }
13156 if (uap->size != 0 && uap->value == 0) {
13157 return EINVAL;
13158 }
13159 if (uap->size > INT_MAX) {
13160 return E2BIG;
13161 }
13162 if ((error = file_vnode(uap->fd, &vp))) {
13163 return error;
13164 }
13165 if ((error = vnode_getwithref(vp))) {
13166 file_drop(uap->fd);
13167 return error;
13168 }
13169
13170 #if CONFIG_FILE_LEASES
13171 vnode_breakdirlease(vp, true, O_WRONLY);
13172 #endif
13173
13174 auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
13175 &uio_buf[0], sizeof(uio_buf));
13176 uio_addiov(auio, uap->value, uap->size);
13177
13178 error = vn_setxattr(vp, attrname, auio, uap->options, vfs_context_current());
13179 #if CONFIG_FSE
13180 if (error == 0) {
13181 add_fsevent(FSE_XATTR_MODIFIED, ctx,
13182 FSE_ARG_VNODE, vp,
13183 FSE_ARG_DONE);
13184 }
13185 #endif
13186 vnode_put(vp);
13187 file_drop(uap->fd);
13188 *retval = 0;
13189 return error;
13190 }
13191
/*
 * Remove an extended attribute from the file named by uap->path.
 * XXX Code duplication here (see fremovexattr).
 */
int
removexattr(proc_t p, struct removexattr_args *uap, int *retval)
{
	vnode_t vp;
	struct nameidata nd;
	char attrname[XATTR_MAXNAMELEN + 1];
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	vfs_context_t ctx = vfs_context_current();
	size_t namelen;
	u_int32_t nameiflags;
	int error;

	/* XATTR_NOSECURITY and XATTR_NODEFAULT are kernel-internal flags. */
	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
	if (error != 0) {
		return error;
	}
	/* Protected (system-reserved) attributes may never be removed. */
	if (xattr_protected(attrname)) {
		return EPERM;
	}
	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
#if CONFIG_FILE_LEASES
	/* Need the parent directory so its lease can be broken before the change. */
	nameiflags |= WANTPARENT;
#endif
	NDINIT(&nd, LOOKUP, OP_REMOVEXATTR, nameiflags, spacetype, uap->path, ctx);
	if ((error = namei(&nd))) {
		return error;
	}
	vp = nd.ni_vp;
#if CONFIG_FILE_LEASES
	vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
	vnode_put(nd.ni_dvp);
#endif
	nameidone(&nd);

	error = vn_removexattr(vp, attrname, uap->options, ctx);
#if CONFIG_FSE
	if (error == 0) {
		/* Notify fsevents listeners of the removal. */
		add_fsevent(FSE_XATTR_REMOVED, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
	}
#endif
	vnode_put(vp);
	*retval = 0;
	return error;
}
13246
13247 /*
13248 * Remove an extended attribute.
13249 * XXX Code duplication here.
13250 */
13251 int
fremovexattr(__unused proc_t p,struct fremovexattr_args * uap,int * retval)13252 fremovexattr(__unused proc_t p, struct fremovexattr_args *uap, int *retval)
13253 {
13254 vnode_t vp;
13255 char attrname[XATTR_MAXNAMELEN + 1];
13256 size_t namelen;
13257 int error;
13258 #if CONFIG_FSE
13259 vfs_context_t ctx = vfs_context_current();
13260 #endif
13261
13262 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
13263 return EINVAL;
13264 }
13265
13266 error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
13267 if (error != 0) {
13268 return error;
13269 }
13270 if (xattr_protected(attrname)) {
13271 return EPERM;
13272 }
13273 if ((error = file_vnode(uap->fd, &vp))) {
13274 return error;
13275 }
13276 if ((error = vnode_getwithref(vp))) {
13277 file_drop(uap->fd);
13278 return error;
13279 }
13280
13281 #if CONFIG_FILE_LEASES
13282 vnode_breakdirlease(vp, true, O_WRONLY);
13283 #endif
13284
13285 error = vn_removexattr(vp, attrname, uap->options, vfs_context_current());
13286 #if CONFIG_FSE
13287 if (error == 0) {
13288 add_fsevent(FSE_XATTR_REMOVED, ctx,
13289 FSE_ARG_VNODE, vp,
13290 FSE_ARG_DONE);
13291 }
13292 #endif
13293 vnode_put(vp);
13294 file_drop(uap->fd);
13295 *retval = 0;
13296 return error;
13297 }
13298
13299 /*
13300 * Retrieve the list of extended attribute names.
13301 * XXX Code duplication here.
13302 */
13303 int
listxattr(proc_t p,struct listxattr_args * uap,user_ssize_t * retval)13304 listxattr(proc_t p, struct listxattr_args *uap, user_ssize_t *retval)
13305 {
13306 vnode_t vp;
13307 struct nameidata nd;
13308 vfs_context_t ctx = vfs_context_current();
13309 uio_t auio = NULL;
13310 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
13311 size_t attrsize = 0;
13312 u_int32_t nameiflags;
13313 int error;
13314 uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];
13315
13316 if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
13317 return EINVAL;
13318 }
13319
13320 nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
13321 NDINIT(&nd, LOOKUP, OP_LISTXATTR, nameiflags, spacetype, uap->path, ctx);
13322 if ((error = namei(&nd))) {
13323 return error;
13324 }
13325 vp = nd.ni_vp;
13326 nameidone(&nd);
13327 if (uap->namebuf != 0 && uap->bufsize > 0) {
13328 auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ,
13329 &uio_buf[0], sizeof(uio_buf));
13330 uio_addiov(auio, uap->namebuf, uap->bufsize);
13331 }
13332
13333 error = vn_listxattr(vp, auio, &attrsize, uap->options, ctx);
13334
13335 vnode_put(vp);
13336 if (auio) {
13337 *retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
13338 } else {
13339 *retval = (user_ssize_t)attrsize;
13340 }
13341 return error;
13342 }
13343
13344 /*
13345 * Retrieve the list of extended attribute names.
13346 * XXX Code duplication here.
13347 */
13348 int
flistxattr(proc_t p,struct flistxattr_args * uap,user_ssize_t * retval)13349 flistxattr(proc_t p, struct flistxattr_args *uap, user_ssize_t *retval)
13350 {
13351 vnode_t vp;
13352 uio_t auio = NULL;
13353 int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
13354 size_t attrsize = 0;
13355 int error;
13356 uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];
13357
13358 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
13359 return EINVAL;
13360 }
13361
13362 if ((error = file_vnode(uap->fd, &vp))) {
13363 return error;
13364 }
13365 if ((error = vnode_getwithref(vp))) {
13366 file_drop(uap->fd);
13367 return error;
13368 }
13369 if (uap->namebuf != 0 && uap->bufsize > 0) {
13370 auio = uio_createwithbuffer(1, 0, spacetype,
13371 UIO_READ, &uio_buf[0], sizeof(uio_buf));
13372 uio_addiov(auio, uap->namebuf, uap->bufsize);
13373 }
13374
13375 error = vn_listxattr(vp, auio, &attrsize, uap->options, vfs_context_current());
13376
13377 vnode_put(vp);
13378 file_drop(uap->fd);
13379 if (auio) {
13380 *retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
13381 } else {
13382 *retval = (user_ssize_t)attrsize;
13383 }
13384 return error;
13385 }
13386
/*
 * Resolve a (volfs id, object id) pair to an absolute path.
 *
 * On success *pathlen is the length (including NUL) written into buf.
 * Retries build_path() a bounded number of times if the namespace moves
 * underneath it, and falls through union mounts looking for the fileid.
 */
int
fsgetpath_internal(vfs_context_t ctx, int volfs_id, uint64_t objid,
    vm_size_t bufsize, caddr_t buf, uint32_t options, int *pathlen)
{
	int error;
	struct mount *mp = NULL;
	vnode_t vp;
	int length;
	int bpflags;
	/* maximum number of times to retry build_path */
	unsigned int retries = 0x10;

	if (bufsize > FSGETPATH_MAXBUFLEN) {
		return EINVAL;
	}

	if (buf == NULL) {
		return ENOMEM;
	}

retry:
	/* Translate the volfs id to a busied mount point. */
	if ((mp = mount_lookupby_volfsid(volfs_id, 1)) == NULL) {
		error = ENOTSUP;  /* unexpected failure */
		return ENOTSUP;
	}

#if CONFIG_UNION_MOUNTS
unionget:
#endif /* CONFIG_UNION_MOUNTS */
	if (objid == 2) {
		/*
		 * Objid 2 conventionally names the filesystem root.  Use the
		 * VFS root vnode unless the volume advertises volume groups
		 * and the caller did not pass FSOPT_ISREALFSID, in which case
		 * ask the filesystem for the object directly.
		 */
		struct vfs_attr vfsattr;
		int use_vfs_root = TRUE;

		VFSATTR_INIT(&vfsattr);
		VFSATTR_WANTED(&vfsattr, f_capabilities);
		if (!(options & FSOPT_ISREALFSID) &&
		    vfs_getattr(mp, &vfsattr, vfs_context_kernel()) == 0 &&
		    VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS) &&
			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS)) {
				use_vfs_root = FALSE;
			}
		}

		if (use_vfs_root) {
			error = VFS_ROOT(mp, &vp, ctx);
		} else {
			error = VFS_VGET(mp, objid, &vp, ctx);
		}
	} else {
		error = VFS_VGET(mp, (ino64_t)objid, &vp, ctx);
	}

#if CONFIG_UNION_MOUNTS
	if (error == ENOENT && (mp->mnt_flag & MNT_UNION)) {
		/*
		 * If the fileid isn't found and we're in a union
		 * mount volume, then see if the fileid is in the
		 * mounted-on volume.
		 */
		struct mount *tmp = mp;
		mp = vnode_mount(tmp->mnt_vnodecovered);
		vfs_unbusy(tmp);
		if (vfs_busy(mp, LK_NOWAIT) == 0) {
			goto unionget;
		}
	} else {
		vfs_unbusy(mp);
	}
#else
	vfs_unbusy(mp);
#endif /* CONFIG_UNION_MOUNTS */

	if (error) {
		return error;
	}

#if CONFIG_MACF
	error = mac_vnode_check_fsgetpath(ctx, vp);
	if (error) {
		vnode_put(vp);
		return error;
	}
#endif

	/* Obtain the absolute path to this vnode. */
	bpflags = vfs_context_suser(ctx) ? BUILDPATH_CHECKACCESS : 0;
	if (options & FSOPT_NOFIRMLINKPATH) {
		bpflags |= BUILDPATH_NO_FIRMLINK;
	}
	bpflags |= BUILDPATH_CHECK_MOVED;
	error = build_path(vp, buf, (int)bufsize, &length, bpflags, ctx);
	vnode_put(vp);

	if (error) {
		/* there was a race building the path, try a few more times */
		if (error == EAGAIN) {
			--retries;
			if (retries > 0) {
				goto retry;
			}

			error = ENOENT;
		}
		goto out;
	}

	AUDIT_ARG(text, buf);

	/* Emit the resolved path to the trace stream when VFS lookup tracing is on. */
	if (kdebug_debugid_enabled(VFS_LOOKUP) && length > 0) {
		unsigned long path_words[NUMPARMS];
		size_t path_len = sizeof(path_words);

		if ((size_t)length < path_len) {
			memcpy((char *)path_words, buf, length);
			memset((char *)path_words + length, 0, path_len - length);

			path_len = length;
		} else {
			/* too long: keep the tail, the most specific portion of the path */
			memcpy((char *)path_words, buf + (length - path_len), path_len);
		}

		kdebug_vfs_lookup(path_words, (int)path_len, vp,
		    KDBG_VFS_LOOKUP_FLAG_LOOKUP);
	}

	*pathlen = length; /* may be superseded by error */

out:
	return error;
}
13518
13519 /*
13520 * Obtain the full pathname of a file system object by id.
13521 */
13522 static int
fsgetpath_extended(user_addr_t buf,user_size_t bufsize,user_addr_t user_fsid,uint64_t objid,uint32_t options,user_ssize_t * retval)13523 fsgetpath_extended(user_addr_t buf, user_size_t bufsize, user_addr_t user_fsid, uint64_t objid,
13524 uint32_t options, user_ssize_t *retval)
13525 {
13526 vfs_context_t ctx = vfs_context_current();
13527 fsid_t fsid;
13528 char *realpath;
13529 int length;
13530 int error;
13531
13532 if (options & ~(FSOPT_NOFIRMLINKPATH | FSOPT_ISREALFSID)) {
13533 return EINVAL;
13534 }
13535
13536 if ((error = copyin(user_fsid, (caddr_t)&fsid, sizeof(fsid)))) {
13537 return error;
13538 }
13539 AUDIT_ARG(value32, fsid.val[0]);
13540 AUDIT_ARG(value64, objid);
13541 /* Restrict output buffer size for now. */
13542
13543 if (bufsize > FSGETPATH_MAXBUFLEN || bufsize <= 0) {
13544 return EINVAL;
13545 }
13546 realpath = kalloc_data(bufsize, Z_WAITOK | Z_ZERO);
13547 if (realpath == NULL) {
13548 return ENOMEM;
13549 }
13550
13551 error = fsgetpath_internal(ctx, fsid.val[0], objid, bufsize, realpath,
13552 options, &length);
13553
13554 if (error) {
13555 goto out;
13556 }
13557
13558 error = copyout((caddr_t)realpath, buf, length);
13559
13560 *retval = (user_ssize_t)length; /* may be superseded by error */
13561 out:
13562 kfree_data(realpath, bufsize);
13563 return error;
13564 }
13565
13566 int
fsgetpath(__unused proc_t p,struct fsgetpath_args * uap,user_ssize_t * retval)13567 fsgetpath(__unused proc_t p, struct fsgetpath_args *uap, user_ssize_t *retval)
13568 {
13569 return fsgetpath_extended(uap->buf, uap->bufsize, uap->fsid, uap->objid,
13570 0, retval);
13571 }
13572
13573 int
fsgetpath_ext(__unused proc_t p,struct fsgetpath_ext_args * uap,user_ssize_t * retval)13574 fsgetpath_ext(__unused proc_t p, struct fsgetpath_ext_args *uap, user_ssize_t *retval)
13575 {
13576 return fsgetpath_extended(uap->buf, uap->bufsize, uap->fsid, uap->objid,
13577 uap->options, retval);
13578 }
13579
/*
 * Common routine to handle various flavors of statfs data heading out
 * to user space.
 *
 * Fills the 32- or 64-bit user statfs layout from the kernel vfsstatfs,
 * copies it to bufp, and (optionally, via *sizep) reports the full
 * structure size even when partial_copy trimmed the trailing reserved
 * fields from the copyout.
 *
 * Returns: 0 Success
 *	EFAULT
 */
static int
munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
    user_addr_t bufp, int *sizep, boolean_t is_64_bit,
    boolean_t partial_copy)
{
	int error;
	int my_size, copy_size;

	if (is_64_bit) {
		struct user64_statfs sfs;
		my_size = copy_size = sizeof(sfs);
		/* zero first so padding never leaks kernel stack contents */
		bzero(&sfs, my_size);
		sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
		sfs.f_type = (short)mp->mnt_vtable->vfc_typenum;
		sfs.f_reserved1 = (short)sfsp->f_fssubtype;
		sfs.f_bsize = (user64_long_t)sfsp->f_bsize;
		sfs.f_iosize = (user64_long_t)sfsp->f_iosize;
		sfs.f_blocks = (user64_long_t)sfsp->f_blocks;
		sfs.f_bfree = (user64_long_t)sfsp->f_bfree;
		sfs.f_bavail = (user64_long_t)sfsp->f_bavail;
		sfs.f_files = (user64_long_t)sfsp->f_files;
		sfs.f_ffree = (user64_long_t)sfsp->f_ffree;
		sfs.f_fsid = sfsp->f_fsid;
		sfs.f_owner = sfsp->f_owner;
		if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
			strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSNAMELEN);
		} else {
			strlcpy(&sfs.f_fstypename[0], &sfsp->f_fstypename[0], MFSNAMELEN);
		}
		strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
		strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);

		if (partial_copy) {
			/* legacy callers expect the layout without trailing reserved fields */
			copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
		}
		error = copyout((caddr_t)&sfs, bufp, copy_size);
	} else {
		struct user32_statfs sfs;

		my_size = copy_size = sizeof(sfs);
		bzero(&sfs, my_size);

		sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
		sfs.f_type = (short)mp->mnt_vtable->vfc_typenum;
		sfs.f_reserved1 = (short)sfsp->f_fssubtype;

		/*
		 * It's possible for there to be more than 2^^31 blocks in the filesystem, so we
		 * have to fudge the numbers here in that case. We inflate the blocksize in order
		 * to reflect the filesystem size as best we can.
		 */
		if ((sfsp->f_blocks > INT_MAX)
		    /* Hack for 4061702 . I think the real fix is for Carbon to
		     * look for some volume capability and not depend on hidden
		     * semantics agreed between a FS and carbon.
		     * f_blocks, f_bfree, and f_bavail set to -1 is the trigger
		     * for Carbon to set bNoVolumeSizes volume attribute.
		     * Without this the webdavfs files cannot be copied onto
		     * disk as they look huge. This change should not affect
		     * XSAN as they should not be setting these to -1..
		     */
		    && (sfsp->f_blocks != 0xffffffffffffffffULL)
		    && (sfsp->f_bfree != 0xffffffffffffffffULL)
		    && (sfsp->f_bavail != 0xffffffffffffffffULL)) {
			int shift;

			/*
			 * Work out how far we have to shift the block count down to make it fit.
			 * Note that it's possible to have to shift so far that the resulting
			 * blocksize would be unreportably large. At that point, we will clip
			 * any values that don't fit.
			 *
			 * For safety's sake, we also ensure that f_iosize is never reported as
			 * being smaller than f_bsize.
			 */
			for (shift = 0; shift < 32; shift++) {
				if ((sfsp->f_blocks >> shift) <= INT_MAX) {
					break;
				}
				if ((sfsp->f_bsize << (shift + 1)) > INT_MAX) {
					break;
				}
			}
#define __SHIFT_OR_CLIP(x, s) ((((x) >> (s)) > INT_MAX) ? INT_MAX : ((x) >> (s)))
			sfs.f_blocks = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_blocks, shift);
			sfs.f_bfree = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bfree, shift);
			sfs.f_bavail = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bavail, shift);
#undef __SHIFT_OR_CLIP
			sfs.f_bsize = (user32_long_t)(sfsp->f_bsize << shift);
			sfs.f_iosize = (int)lmax(sfsp->f_iosize, sfsp->f_bsize);
		} else {
			/* filesystem is small enough to be reported honestly */
			sfs.f_bsize = (user32_long_t)sfsp->f_bsize;
			sfs.f_iosize = (user32_long_t)sfsp->f_iosize;
			sfs.f_blocks = (user32_long_t)sfsp->f_blocks;
			sfs.f_bfree = (user32_long_t)sfsp->f_bfree;
			sfs.f_bavail = (user32_long_t)sfsp->f_bavail;
		}
		sfs.f_files = (user32_long_t)sfsp->f_files;
		sfs.f_ffree = (user32_long_t)sfsp->f_ffree;
		sfs.f_fsid = sfsp->f_fsid;
		sfs.f_owner = sfsp->f_owner;
		if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
			strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSNAMELEN);
		} else {
			strlcpy(&sfs.f_fstypename[0], &sfsp->f_fstypename[0], MFSNAMELEN);
		}
		strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
		strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);

		if (partial_copy) {
			copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
		}
		error = copyout((caddr_t)&sfs, bufp, copy_size);
	}

	if (sizep != NULL) {
		*sizep = my_size;  /* full structure size, even on a partial copy */
	}
	return error;
}
13708
/*
 * Copy a kernel `struct stat` into the 64-bit user-space stat layout.
 * The destination is zeroed first so padding and unset fields never leak
 * kernel memory to user space.  The timestamp field names differ by the
 * _POSIX_C_SOURCE flavor the user structure was built with.
 */
void
munge_user64_stat(struct stat *sbp, struct user64_stat *usbp)
{
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
#ifndef _POSIX_C_SOURCE
	usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
#else
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
13748
/*
 * Copy a kernel `struct stat` into the 32-bit user-space stat layout.
 * Timestamps are explicitly narrowed to the 32-bit user types
 * (user32_time_t / user32_long_t), truncating values that do not fit.
 */
void
munge_user32_stat(struct stat *sbp, struct user32_stat *usbp)
{
	/* zero first so padding never leaks kernel memory to user space */
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
#ifndef _POSIX_C_SOURCE
	usbp->st_atimespec.tv_sec = (user32_time_t)sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = (user32_long_t)sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = (user32_time_t)sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = (user32_long_t)sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = (user32_time_t)sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = (user32_long_t)sbp->st_ctimespec.tv_nsec;
#else
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
13785
/*
 * Copy a kernel `struct stat64` into the 64-bit user-space stat64 layout.
 * Like munge_user64_stat() but also carries the birth (creation) time.
 */
void
munge_user64_stat64(struct stat64 *sbp, struct user64_stat64 *usbp)
{
	/* zero first so padding never leaks kernel memory to user space */
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
#ifndef _POSIX_C_SOURCE
	usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
	usbp->st_birthtimespec.tv_sec = sbp->st_birthtimespec.tv_sec;
	usbp->st_birthtimespec.tv_nsec = sbp->st_birthtimespec.tv_nsec;
#else
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
	usbp->st_birthtime = sbp->st_birthtime;
	usbp->st_birthtimensec = sbp->st_birthtimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
13829
/*
 * Copy a kernel `struct stat64` into the 32-bit user-space stat64 layout.
 * Timestamps (including birth time) are explicitly narrowed to the 32-bit
 * user types, truncating values that do not fit.
 */
void
munge_user32_stat64(struct stat64 *sbp, struct user32_stat64 *usbp)
{
	/* zero first so padding never leaks kernel memory to user space */
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
#ifndef _POSIX_C_SOURCE
	usbp->st_atimespec.tv_sec = (user32_time_t)sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = (user32_long_t)sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = (user32_time_t)sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = (user32_long_t)sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = (user32_time_t)sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = (user32_long_t)sbp->st_ctimespec.tv_nsec;
	usbp->st_birthtimespec.tv_sec = (user32_time_t)sbp->st_birthtimespec.tv_sec;
	usbp->st_birthtimespec.tv_nsec = (user32_long_t)sbp->st_birthtimespec.tv_nsec;
#else
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
	usbp->st_birthtime = sbp->st_birthtime;
	usbp->st_birthtimensec = sbp->st_birthtimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
13870
/*
 * Purge buffer cache for simulating cold starts.
 *
 * Per-vnode callback for vnode_iterate(): push any dirty pages to disk and
 * invalidate the vnode's cached pages in the unified buffer cache.
 */
static int
vnode_purge_callback(struct vnode *vp, __unused void *cargs)
{
	ubc_msync(vp, (off_t)0, ubc_getsize(vp), NULL /* off_t *resid_off */, UBC_PUSHALL | UBC_INVALIDATE);

	return VNODE_RETURNED;
}
13881
/*
 * Per-mount callback for vfs_iterate(): run vnode_purge_callback() over
 * every vnode of the mount, waiting for busy vnodes.
 */
static int
vfs_purge_callback(mount_t mp, __unused void * arg)
{
	vnode_iterate(mp, VNODE_WAIT | VNODE_ITERATE_ALL, vnode_purge_callback, NULL);

	return VFS_RETURNED;
}
13889
/*
 * vfs_purge(2): flush and invalidate the cached pages of every vnode on
 * every mounted filesystem, simulating a cold start.  Superuser only.
 */
int
vfs_purge(__unused struct proc *p, __unused struct vfs_purge_args *uap, __unused int32_t *retval)
{
	if (!kauth_cred_issuser(kauth_cred_get())) {
		return EPERM;
	}

	vfs_iterate(0 /* flags */, vfs_purge_callback, NULL);

	return 0;
}
13901
/*
 * gets the vnode associated with the (unnamed) snapshot directory
 * for a Filesystem. The snapshot directory vnode is returned with
 * an iocount on it.
 *
 * rvp must be the filesystem's root vnode; the request is forwarded to
 * the filesystem via the VFS_VGET_SNAPDIR operation on its mount.
 */
int
vnode_get_snapdir(vnode_t rvp, vnode_t *sdvpp, vfs_context_t ctx)
{
	return VFS_VGET_SNAPDIR(vnode_mount(rvp), sdvpp, ctx);
}
13912
/*
 * Get the snapshot vnode.
 *
 * If successful, the call returns with an iocount on *rvpp, *sdvpp and
 * needs nameidone() on ndp.
 *
 * If the snapshot vnode exists it is returned in ndp->ni_vp.
 *
 * If it returns with an error, *rvpp, *sdvpp are NULL and nameidone() is
 * not needed.
 *
 * dirfd must resolve to the root vnode of a volume whose filesystem
 * advertises VOL_CAP_INT_SNAPSHOT.  name is a user-space snapshot name
 * (must not be empty, ".", ".." or contain '/').  op/pathop select the
 * namei() operation (CREATE, DELETE or LOOKUP).
 */
static int
vnode_get_snapshot(int dirfd, vnode_t *rvpp, vnode_t *sdvpp,
    user_addr_t name, struct nameidata *ndp, int32_t op,
#if !CONFIG_TRIGGERS
    __unused
#endif
    enum path_operation pathop,
    vfs_context_t ctx)
{
	int error, i;
	caddr_t name_buf;
	size_t name_len;
	struct vfs_attr vfa;

	*sdvpp = NULLVP;
	*rvpp = NULLVP;

	/* Take an iocount on the vnode backing dirfd. */
	error = vnode_getfromfd(ctx, dirfd, rvpp);
	if (error) {
		return error;
	}

	/* Snapshot operations are only valid on a volume root. */
	if (!vnode_isvroot(*rvpp)) {
		error = EINVAL;
		goto out;
	}

	/* Make sure the filesystem supports snapshots */
	VFSATTR_INIT(&vfa);
	VFSATTR_WANTED(&vfa, f_capabilities);
	if ((vfs_getattr(vnode_mount(*rvpp), &vfa, ctx) != 0) ||
	    !VFSATTR_IS_SUPPORTED(&vfa, f_capabilities) ||
	    !((vfa.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] &
	    VOL_CAP_INT_SNAPSHOT)) ||
	    !((vfa.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] &
	    VOL_CAP_INT_SNAPSHOT))) {
		error = ENOTSUP;
		goto out;
	}

	error = vnode_get_snapdir(*rvpp, sdvpp, ctx);
	if (error) {
		goto out;
	}

	name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
	if (error) {
		goto out1;
	}

	/*
	 * Some sanity checks- name can't be empty, "." or ".." or have slashes.
	 * (the length returned by copyinstr includes the terminating NUL)
	 */
	if ((name_len == 1) || (name_len == 2 && name_buf[0] == '.') ||
	    (name_len == 3 && name_buf[0] == '.' && name_buf[1] == '.')) {
		error = EINVAL;
		goto out1;
	}
	/* Scan for a '/'; if the loop stops early, one was found. */
	for (i = 0; i < (int)name_len && name_buf[i] != '/'; i++) {
		;
	}
	if (i < (int)name_len) {
		error = EINVAL;
		goto out1;
	}

#if CONFIG_MACF
	/* MAC policy checks for modifying operations (none for LOOKUP). */
	if (op == CREATE) {
		error = mac_mount_check_snapshot_create(ctx, vnode_mount(*rvpp),
		    name_buf);
	} else if (op == DELETE) {
		error = mac_mount_check_snapshot_delete(ctx, vnode_mount(*rvpp),
		    name_buf);
	}
	if (error) {
		goto out1;
	}
#endif

	/* Check if the snapshot already exists ... */
	NDINIT(ndp, op, pathop, USEDVP | NOCACHE | AUDITVNPATH1,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(name_buf), ctx);
	ndp->ni_dvp = *sdvpp;

	error = namei(ndp);
out1:
	zfree(ZV_NAMEI, name_buf);
out:
	/* On any failure, drop both iocounts and NULL the out parameters. */
	if (error) {
		if (*sdvpp) {
			vnode_put(*sdvpp);
			*sdvpp = NULLVP;
		}
		if (*rvpp) {
			vnode_put(*rvpp);
			*rvpp = NULLVP;
		}
	}
	return error;
}
14026
/*
 * create a filesystem snapshot (for supporting filesystems)
 *
 * A much simplified version of openat(dirfd, name, O_CREAT | O_EXCL)
 * We get to the (unnamed) snapshot directory vnode and create the vnode
 * for the snapshot in it.
 *
 * Restrictions:
 *
 * a) Passed in name for snapshot cannot have slashes.
 * b) name can't be "." or ".."
 *
 * Since this requires superuser privileges, vnode_authorize calls are not
 * made.
 */
static int __attribute__((noinline))
snapshot_create(int dirfd, user_addr_t name, __unused uint32_t flags,
    vfs_context_t ctx)
{
	vnode_t rvp, snapdvp;
	int error;
	struct nameidata *ndp;

	/* struct nameidata is too large for the kernel stack; heap-allocate. */
	ndp = kalloc_type(struct nameidata, Z_WAITOK);

	/* On success: iocounts held on rvp and snapdvp; nameidone needed. */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, ndp, CREATE,
	    OP_LINK, ctx);
	if (error) {
		goto out;
	}

	if (ndp->ni_vp) {
		/* O_EXCL semantics: the snapshot name is already taken. */
		vnode_put(ndp->ni_vp);
		error = EEXIST;
	} else {
		struct vnode_attr *vap;
		vnode_t vp = NULLVP;

		vap = kalloc_type(struct vnode_attr, Z_WAITOK);

		/* Snapshots are created as mode-0 regular files. */
		VATTR_INIT(vap);
		VATTR_SET(vap, va_type, VREG);
		VATTR_SET(vap, va_mode, 0);

		/* Superuser-only path: skip authorization and inheritance. */
		error = vn_create(snapdvp, &vp, ndp, vap,
		    VN_CREATE_NOAUTH | VN_CREATE_NOINHERIT, 0, NULL, ctx);
		if (!error && vp) {
			vnode_put(vp);
		}

		kfree_type(struct vnode_attr, vap);
	}

	/* Release what vnode_get_snapshot() left us holding. */
	nameidone(ndp);
	vnode_put(snapdvp);
	vnode_put(rvp);
out:
	kfree_type(struct nameidata, ndp);

	return error;
}
14088
/*
 * Delete a Filesystem snapshot
 *
 * get the vnode for the unnamed snapshot directory and the snapshot and
 * delete the snapshot.
 */
static int __attribute__((noinline))
snapshot_delete(int dirfd, user_addr_t name, __unused uint32_t flags,
    vfs_context_t ctx)
{
	vnode_t rvp, snapdvp;
	int error;
	struct nameidata *ndp;

	/* struct nameidata is too large for the kernel stack; heap-allocate. */
	ndp = kalloc_type(struct nameidata, Z_WAITOK);

	/* DELETE lookup: on success the snapshot vnode is in ndp->ni_vp. */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, ndp, DELETE,
	    OP_UNLINK, ctx);
	if (error) {
		goto out;
	}

	/* VNODE_REMOVE_SKIP_NAMESPACE_EVENT: no namespace notification. */
	error = VNOP_REMOVE(snapdvp, ndp->ni_vp, &ndp->ni_cnd,
	    VNODE_REMOVE_SKIP_NAMESPACE_EVENT, ctx);

	/* Release the iocounts taken by vnode_get_snapshot(). */
	vnode_put(ndp->ni_vp);
	nameidone(ndp);
	vnode_put(snapdvp);
	vnode_put(rvp);
out:
	kfree_type(struct nameidata, ndp);

	return error;
}
14123
/*
 * Revert a filesystem to a snapshot
 *
 * Marks the filesystem to revert to the given snapshot on next mount.
 */
static int __attribute__((noinline))
snapshot_revert(int dirfd, user_addr_t name, __unused uint32_t flags,
    vfs_context_t ctx)
{
	int error;
	vnode_t rvp;
	mount_t mp;
	struct fs_snapshot_revert_args revert_data;
	struct componentname cnp;
	caddr_t name_buf;
	size_t name_len;

	/* Take an iocount on the vnode backing dirfd. */
	error = vnode_getfromfd(ctx, dirfd, &rvp);
	if (error) {
		return error;
	}
	mp = vnode_mount(rvp);

	/* Copy the snapshot name in from user space. */
	name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
	if (error) {
		zfree(ZV_NAMEI, name_buf);
		vnode_put(rvp);
		return error;
	}

#if CONFIG_MACF
	error = mac_mount_check_snapshot_revert(ctx, mp, name_buf);
	if (error) {
		zfree(ZV_NAMEI, name_buf);
		vnode_put(rvp);
		return error;
	}
#endif

	/*
	 * Grab mount_iterref so that we can release the vnode,
	 * since VFSIOC_REVERT_SNAPSHOT could conceivably cause a sync.
	 */
	error = mount_iterref(mp, 0);
	vnode_put(rvp);
	if (error) {
		zfree(ZV_NAMEI, name_buf);
		return error;
	}

	/* Hand the name to the filesystem as a bare LOOKUP componentname. */
	memset(&cnp, 0, sizeof(cnp));
	cnp.cn_pnbuf = (char *)name_buf;
	cnp.cn_nameiop = LOOKUP;
	cnp.cn_flags = ISLASTCN | HASBUF;
	cnp.cn_pnlen = MAXPATHLEN;
	cnp.cn_nameptr = cnp.cn_pnbuf;
	cnp.cn_namelen = (int)name_len;
	revert_data.sr_cnp = &cnp;

	error = VFS_IOCTL(mp, VFSIOC_REVERT_SNAPSHOT, (caddr_t)&revert_data, 0, ctx);
	mount_iterdrop(mp);
	zfree(ZV_NAMEI, name_buf);

	if (error) {
		/* If there was any error, try again using VNOP_IOCTL */

		vnode_t snapdvp;
		struct nameidata namend;

		/* Re-resolve the snapshot; fresh iocounts on rvp/snapdvp. */
		error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, &namend, LOOKUP,
		    OP_LOOKUP, ctx);
		if (error) {
			return error;
		}

		/* Fall back to the snapshot vnode's revert ioctl. */
		error = VNOP_IOCTL(namend.ni_vp, APFSIOC_REVERT_TO_SNAPSHOT, (caddr_t) NULL,
		    0, ctx);

		vnode_put(namend.ni_vp);
		nameidone(&namend);
		vnode_put(snapdvp);
		vnode_put(rvp);
	}

	return error;
}
14212
/*
 * rename a Filesystem snapshot
 *
 * get the vnode for the unnamed snapshot directory and the snapshot and
 * rename the snapshot. This is a very specialised (and simple) case of
 * rename(2) (which has to deal with a lot more complications). It differs
 * slightly from rename(2) in that EEXIST is returned if the new name exists.
 */
static int __attribute__((noinline))
snapshot_rename(int dirfd, user_addr_t old, user_addr_t new,
    __unused uint32_t flags, vfs_context_t ctx)
{
	vnode_t rvp, snapdvp;
	int error, i;
	caddr_t newname_buf;
	size_t name_len;
	vnode_t fvp;
	struct nameidata *fromnd, *tond;
	/* carving out a chunk for structs that are too big to be on stack. */
	struct {
		struct nameidata from_node;
		struct nameidata to_node;
	} * __rename_data;

	__rename_data = kalloc_type(typeof(*__rename_data), Z_WAITOK);
	fromnd = &__rename_data->from_node;
	tond = &__rename_data->to_node;

	/* Source must exist: DELETE-style lookup of the old name. */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, old, fromnd, DELETE,
	    OP_UNLINK, ctx);
	if (error) {
		goto out;
	}
	fvp = fromnd->ni_vp;

	newname_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	error = copyinstr(new, newname_buf, MAXPATHLEN, &name_len);
	if (error) {
		goto out1;
	}

	/*
	 * Some sanity checks- new name can't be empty, "." or ".." or have
	 * slashes.
	 * (the length returned by copyinstr includes the terminating NUL)
	 *
	 * The FS rename VNOP is supposed to handle this but we'll pick it
	 * off here itself.
	 */
	if ((name_len == 1) || (name_len == 2 && newname_buf[0] == '.') ||
	    (name_len == 3 && newname_buf[0] == '.' && newname_buf[1] == '.')) {
		error = EINVAL;
		goto out1;
	}
	/* Scan for a '/'; if the loop stops early, one was found. */
	for (i = 0; i < (int)name_len && newname_buf[i] != '/'; i++) {
		;
	}
	if (i < (int)name_len) {
		error = EINVAL;
		goto out1;
	}

#if CONFIG_MACF
	/* Renaming to a new name is policy-checked as a snapshot create. */
	error = mac_mount_check_snapshot_create(ctx, vnode_mount(rvp),
	    newname_buf);
	if (error) {
		goto out1;
	}
#endif

	NDINIT(tond, RENAME, OP_RENAME, USEDVP | NOCACHE | AUDITVNPATH2,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(newname_buf), ctx);
	tond->ni_dvp = snapdvp;

	error = namei(tond);
	if (error) {
		goto out2;
	} else if (tond->ni_vp) {
		/*
		 * snapshot rename behaves differently than rename(2) - if the
		 * new name exists, EEXIST is returned.
		 */
		vnode_put(tond->ni_vp);
		error = EEXIST;
		goto out2;
	}

	/* Same directory for source and target: rename within snapdvp. */
	error = VNOP_RENAME(snapdvp, fvp, &fromnd->ni_cnd, snapdvp, NULLVP,
	    &tond->ni_cnd, ctx);

out2:
	nameidone(tond);
out1:
	zfree(ZV_NAMEI, newname_buf);
	vnode_put(fvp);
	vnode_put(snapdvp);
	vnode_put(rvp);
	nameidone(fromnd);
out:
	kfree_type(typeof(*__rename_data), __rename_data);
	return error;
}
14315
14316 /*
14317 * Mount a Filesystem snapshot
14318 *
14319 * get the vnode for the unnamed snapshot directory and the snapshot and
14320 * mount the snapshot.
14321 */
14322 static int __attribute__((noinline))
snapshot_mount(int dirfd,user_addr_t name,user_addr_t directory,__unused user_addr_t mnt_data,__unused uint32_t flags,vfs_context_t ctx)14323 snapshot_mount(int dirfd, user_addr_t name, user_addr_t directory,
14324 __unused user_addr_t mnt_data, __unused uint32_t flags, vfs_context_t ctx)
14325 {
14326 mount_t mp;
14327 vnode_t rvp, snapdvp, snapvp, vp, pvp;
14328 struct fs_snapshot_mount_args smnt_data;
14329 int error;
14330 struct nameidata *snapndp, *dirndp;
14331 /* carving out a chunk for structs that are too big to be on stack. */
14332 struct {
14333 struct nameidata snapnd;
14334 struct nameidata dirnd;
14335 } * __snapshot_mount_data;
14336
14337 __snapshot_mount_data = kalloc_type(typeof(*__snapshot_mount_data), Z_WAITOK);
14338 snapndp = &__snapshot_mount_data->snapnd;
14339 dirndp = &__snapshot_mount_data->dirnd;
14340
14341 error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, snapndp, LOOKUP,
14342 OP_LOOKUP, ctx);
14343 if (error) {
14344 goto out;
14345 }
14346
14347 snapvp = snapndp->ni_vp;
14348 if (!vnode_mount(rvp) || (vnode_mount(rvp) == dead_mountp)) {
14349 error = EIO;
14350 goto out1;
14351 }
14352
14353 /* Get the vnode to be covered */
14354 NDINIT(dirndp, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
14355 UIO_USERSPACE, directory, ctx);
14356 error = namei(dirndp);
14357 if (error) {
14358 goto out1;
14359 }
14360
14361 vp = dirndp->ni_vp;
14362 pvp = dirndp->ni_dvp;
14363 mp = vnode_mount(rvp);
14364
14365 if ((vp->v_flag & VROOT) && (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
14366 error = EINVAL;
14367 goto out2;
14368 }
14369
14370 #if CONFIG_MACF
14371 error = mac_mount_check_snapshot_mount(ctx, rvp, vp, &dirndp->ni_cnd, snapndp->ni_cnd.cn_nameptr,
14372 mp->mnt_vfsstat.f_fstypename);
14373 if (error) {
14374 goto out2;
14375 }
14376 #endif
14377
14378 smnt_data.sm_mp = mp;
14379 smnt_data.sm_cnp = &snapndp->ni_cnd;
14380 error = mount_common(mp->mnt_vfsstat.f_fstypename, pvp, vp,
14381 &dirndp->ni_cnd, CAST_USER_ADDR_T(&smnt_data), flags & MNT_DONTBROWSE,
14382 KERNEL_MOUNT_SNAPSHOT, NULL, ctx);
14383
14384 out2:
14385 vnode_put(vp);
14386 vnode_put(pvp);
14387 nameidone(dirndp);
14388 out1:
14389 vnode_put(snapvp);
14390 vnode_put(snapdvp);
14391 vnode_put(rvp);
14392 nameidone(snapndp);
14393 out:
14394 kfree_type(typeof(*__snapshot_mount_data), __snapshot_mount_data);
14395 return error;
14396 }
14397
14398 /*
14399 * Root from a snapshot of the filesystem
14400 *
14401 * Marks the filesystem to root from the given snapshot on next boot.
14402 */
14403 static int __attribute__((noinline))
snapshot_root(int dirfd,user_addr_t name,__unused uint32_t flags,vfs_context_t ctx)14404 snapshot_root(int dirfd, user_addr_t name, __unused uint32_t flags,
14405 vfs_context_t ctx)
14406 {
14407 int error;
14408 vnode_t rvp;
14409 mount_t mp;
14410 struct fs_snapshot_root_args root_data;
14411 struct componentname cnp;
14412 caddr_t name_buf;
14413 size_t name_len;
14414
14415 error = vnode_getfromfd(ctx, dirfd, &rvp);
14416 if (error) {
14417 return error;
14418 }
14419 mp = vnode_mount(rvp);
14420
14421 name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
14422 error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
14423 if (error) {
14424 zfree(ZV_NAMEI, name_buf);
14425 vnode_put(rvp);
14426 return error;
14427 }
14428
14429 // XXX MAC checks ?
14430
14431 /*
14432 * Grab mount_iterref so that we can release the vnode,
14433 * since VFSIOC_ROOT_SNAPSHOT could conceivably cause a sync.
14434 */
14435 error = mount_iterref(mp, 0);
14436 vnode_put(rvp);
14437 if (error) {
14438 zfree(ZV_NAMEI, name_buf);
14439 return error;
14440 }
14441
14442 memset(&cnp, 0, sizeof(cnp));
14443 cnp.cn_pnbuf = (char *)name_buf;
14444 cnp.cn_nameiop = LOOKUP;
14445 cnp.cn_flags = ISLASTCN | HASBUF;
14446 cnp.cn_pnlen = MAXPATHLEN;
14447 cnp.cn_nameptr = cnp.cn_pnbuf;
14448 cnp.cn_namelen = (int)name_len;
14449 root_data.sr_cnp = &cnp;
14450
14451 error = VFS_IOCTL(mp, VFSIOC_ROOT_SNAPSHOT, (caddr_t)&root_data, 0, ctx);
14452
14453 mount_iterdrop(mp);
14454 zfree(ZV_NAMEI, name_buf);
14455
14456 return error;
14457 }
14458
14459 static boolean_t
vfs_context_can_snapshot(vfs_context_t ctx)14460 vfs_context_can_snapshot(vfs_context_t ctx)
14461 {
14462 static const char * const snapshot_entitlements[] = {
14463 "com.apple.private.vfs.snapshot",
14464 "com.apple.developer.vfs.snapshot",
14465 "com.apple.private.apfs.arv.limited.snapshot",
14466 };
14467 static const size_t nentitlements =
14468 sizeof(snapshot_entitlements) / sizeof(snapshot_entitlements[0]);
14469 size_t i;
14470
14471 task_t task = vfs_context_task(ctx);
14472 for (i = 0; i < nentitlements; i++) {
14473 if (IOTaskHasEntitlement(task, snapshot_entitlements[i])) {
14474 return TRUE;
14475 }
14476 }
14477 return FALSE;
14478 }
14479
/*
 * FS snapshot operations dispatcher
 *
 * Gates the syscall on a snapshot entitlement, enforces per-user
 * authorization for every operation except SNAPSHOT_OP_MOUNT, then
 * dispatches on uap->op to the snapshot_* helpers above.
 */
int
fs_snapshot(__unused proc_t p, struct fs_snapshot_args *uap,
    __unused int32_t *retval)
{
	int error;
	vfs_context_t ctx = vfs_context_current();

	AUDIT_ARG(fd, uap->dirfd);
	AUDIT_ARG(value32, uap->op);

	/* All snapshot ops require one of the snapshot entitlements. */
	if (!vfs_context_can_snapshot(ctx)) {
		return EPERM;
	}

	/*
	 * Enforce user authorization for snapshot modification operations,
	 * or if trying to root from snapshot.
	 */
	if (uap->op != SNAPSHOT_OP_MOUNT) {
		vnode_t dvp = NULLVP;
		vnode_t devvp = NULLVP;
		mount_t mp;

		error = vnode_getfromfd(ctx, uap->dirfd, &dvp);
		if (error) {
			return error;
		}
		mp = vnode_mount(dvp);
		devvp = mp->mnt_devvp;

		/* get an iocount on devvp */
		if (devvp == NULLVP) {
			/* No device vnode cached; resolve from f_mntfromname. */
			error = vnode_lookup(mp->mnt_vfsstat.f_mntfromname, 0, &devvp, ctx);
			/* for mounts which arent block devices */
			if (error == ENOENT) {
				error = ENXIO;
			}
		} else {
			error = vnode_getwithref(devvp);
		}

		if (error) {
			vnode_put(dvp);
			return error;
		}

		/*
		 * Allowed if: superuser, OR authorized to write the backing
		 * device, OR holder of the user snapshot entitlement.
		 */
		if ((vfs_context_issuser(ctx) == 0) &&
		    (vnode_authorize(devvp, NULL, KAUTH_VNODE_WRITE_DATA, ctx) != 0) &&
		    (!IOTaskHasEntitlement(vfs_context_task(ctx), "com.apple.private.vfs.snapshot.user"))) {
			error = EPERM;
		}
		vnode_put(dvp);
		vnode_put(devvp);

		if (error) {
			return error;
		}
	}

	switch (uap->op) {
	case SNAPSHOT_OP_CREATE:
		error = snapshot_create(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
	case SNAPSHOT_OP_DELETE:
		error = snapshot_delete(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
	case SNAPSHOT_OP_RENAME:
		/* name1 = old snapshot name, name2 = new snapshot name. */
		error = snapshot_rename(uap->dirfd, uap->name1, uap->name2,
		    uap->flags, ctx);
		break;
	case SNAPSHOT_OP_MOUNT:
		/* name1 = snapshot name, name2 = mount point path. */
		error = snapshot_mount(uap->dirfd, uap->name1, uap->name2,
		    uap->data, uap->flags, ctx);
		break;
	case SNAPSHOT_OP_REVERT:
		error = snapshot_revert(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
#if CONFIG_MNT_ROOTSNAP
	case SNAPSHOT_OP_ROOT:
		error = snapshot_root(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
#endif /* CONFIG_MNT_ROOTSNAP */
	default:
		error = ENOSYS;
	}

	return error;
}
14571