1 /*
2 * Copyright (c) 1995-2021 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * Copyright (c) 1989, 1993
30 * The Regents of the University of California. All rights reserved.
31 * (c) UNIX System Laboratories, Inc.
32 * All or some portions of this file are derived from material licensed
33 * to the University of California by American Telephone and Telegraph
34 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
35 * the permission of UNIX System Laboratories, Inc.
36 *
37 * Redistribution and use in source and binary forms, with or without
38 * modification, are permitted provided that the following conditions
39 * are met:
40 * 1. Redistributions of source code must retain the above copyright
41 * notice, this list of conditions and the following disclaimer.
42 * 2. Redistributions in binary form must reproduce the above copyright
43 * notice, this list of conditions and the following disclaimer in the
44 * documentation and/or other materials provided with the distribution.
45 * 3. All advertising materials mentioning features or use of this software
46 * must display the following acknowledgement:
47 * This product includes software developed by the University of
48 * California, Berkeley and its contributors.
49 * 4. Neither the name of the University nor the names of its contributors
50 * may be used to endorse or promote products derived from this software
51 * without specific prior written permission.
52 *
53 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63 * SUCH DAMAGE.
64 *
65 * @(#)vfs_syscalls.c 8.41 (Berkeley) 6/15/95
66 */
67 /*
68 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
69 * support for mandatory and extensible security protections. This notice
70 * is included in support of clause 2.2 (b) of the Apple Public License,
71 * Version 2.0.
72 */
73
74 #include <sys/param.h>
75 #include <sys/systm.h>
76 #include <sys/namei.h>
77 #include <sys/filedesc.h>
78 #include <sys/kernel.h>
79 #include <sys/file_internal.h>
80 #include <sys/stat.h>
81 #include <sys/vnode_internal.h>
82 #include <sys/mount_internal.h>
83 #include <sys/proc_internal.h>
84 #include <sys/kauth.h>
85 #include <sys/uio_internal.h>
86 #include <kern/kalloc.h>
87 #include <sys/mman.h>
88 #include <sys/dirent.h>
89 #include <sys/attr.h>
90 #include <sys/sysctl.h>
91 #include <sys/ubc.h>
92 #include <sys/quota.h>
93 #include <sys/kdebug.h>
94 #include <sys/fsevents.h>
95 #include <sys/imgsrc.h>
96 #include <sys/sysproto.h>
97 #include <sys/sysctl.h>
98 #include <sys/xattr.h>
99 #include <sys/fcntl.h>
100 #include <sys/stdio.h>
101 #include <sys/fsctl.h>
102 #include <sys/ubc_internal.h>
103 #include <sys/disk.h>
104 #include <sys/content_protection.h>
105 #include <sys/clonefile.h>
106 #include <sys/snapshot.h>
107 #include <sys/priv.h>
108 #include <sys/fsgetpath.h>
109 #include <machine/cons.h>
110 #include <machine/limits.h>
111 #include <miscfs/specfs/specdev.h>
112
113 #include <vfs/vfs_disk_conditioner.h>
114
115 #include <security/audit/audit.h>
116 #include <bsm/audit_kevents.h>
117
118 #include <mach/mach_types.h>
119 #include <kern/kern_types.h>
120 #include <kern/kalloc.h>
121 #include <kern/task.h>
122
123 #include <vm/vm_pageout.h>
124 #include <vm/vm_protos.h>
125
126 #include <libkern/OSAtomic.h>
127 #include <os/atomic_private.h>
128 #include <pexpert/pexpert.h>
129 #include <IOKit/IOBSD.h>
130
131 // deps for MIG call
132 #include <kern/host.h>
133 #include <kern/ipc_misc.h>
134 #include <mach/host_priv.h>
135 #include <mach/vfs_nspace.h>
136 #include <os/log.h>
137
138 #include <nfs/nfs_conf.h>
139
140 #if ROUTEFS
141 #include <miscfs/routefs/routefs.h>
142 #endif /* ROUTEFS */
143
144 #if CONFIG_MACF
145 #include <security/mac.h>
146 #include <security/mac_framework.h>
147 #endif
148
149 #if CONFIG_FSE
150 #define GET_PATH(x) \
151 ((x) = get_pathbuff())
152 #define RELEASE_PATH(x) \
153 release_pathbuff(x)
154 #else
155 #define GET_PATH(x) \
156 ((x) = zalloc(ZV_NAMEI))
157 #define RELEASE_PATH(x) \
158 zfree(ZV_NAMEI, x)
159 #endif /* CONFIG_FSE */
160
161 #ifndef HFS_GET_BOOT_INFO
162 #define HFS_GET_BOOT_INFO (FCNTL_FS_SPECIFIC_BASE + 0x00004)
163 #endif
164
165 #ifndef HFS_SET_BOOT_INFO
166 #define HFS_SET_BOOT_INFO (FCNTL_FS_SPECIFIC_BASE + 0x00005)
167 #endif
168
169 #ifndef APFSIOC_REVERT_TO_SNAPSHOT
170 #define APFSIOC_REVERT_TO_SNAPSHOT _IOW('J', 1, u_int64_t)
171 #endif
172
173 extern void disk_conditioner_unmount(mount_t mp);
174
175 /* struct for checkdirs iteration */
176 struct cdirargs {
177 vnode_t olddp;
178 vnode_t newdp;
179 };
180 /* callback for checkdirs iteration */
181 static int checkdirs_callback(proc_t p, void * arg);
182
183 static int change_dir(struct nameidata *ndp, vfs_context_t ctx);
184 static int checkdirs(vnode_t olddp, vfs_context_t ctx);
185 void enablequotas(struct mount *mp, vfs_context_t ctx);
186 static int getfsstat_callback(mount_t mp, void * arg);
187 static int getutimes(user_addr_t usrtvp, struct timespec *tsp);
188 static int setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts, int nullflag);
189 static int sync_callback(mount_t, void *);
190 static int munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
191 user_addr_t bufp, int *sizep, boolean_t is_64_bit,
192 boolean_t partial_copy);
193 static int fsync_common(proc_t p, struct fsync_args *uap, int flags);
194 static int mount_common(const char *fstypename, vnode_t pvp, vnode_t vp,
195 struct componentname *cnp, user_addr_t fsmountargs,
196 int flags, uint32_t internal_flags, char *labelstr, vfs_context_t ctx);
197 void vfs_notify_mount(vnode_t pdvp);
198
199 int prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, uint32_t internal_flags);
200
201 struct fd_vn_data * fg_vn_data_alloc(void);
202
203 /*
204 * Max retries for ENOENT returns from vn_authorize_{rmdir, unlink, rename}
205 * Concurrent lookups (or lookups by ids) on hard links can cause the
206 * vn_getpath (which does not re-enter the filesystem as vn_getpath_fsenter
207 * does) to return ENOENT as the path cannot be returned from the name cache
208 * alone. We have no option but to retry and hope to get one namei->reverse path
209 * generation done without an intervening lookup, lookup by id on the hard link
210 * item. This is only an issue for MAC hooks which cannot reenter the filesystem
211 * which currently are the MAC hooks for rename, unlink and rmdir.
212 */
213 #define MAX_AUTHORIZE_ENOENT_RETRIES 1024
214
215 /* Max retry limit for rename due to vnode recycling. */
216 #define MAX_RENAME_ERECYCLE_RETRIES 1024
217
218 static int rmdirat_internal(vfs_context_t, int, user_addr_t, enum uio_seg,
219 int unlink_flags);
220
221 #ifdef CONFIG_IMGSRC_ACCESS
222 static int authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx);
223 static int place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx);
224 static void undo_place_on_covered_vp(mount_t mp, vnode_t vp);
225 static int mount_begin_update(mount_t mp, vfs_context_t ctx, int flags);
226 static void mount_end_update(mount_t mp);
227 static int relocate_imageboot_source(vnode_t pvp, vnode_t vp, struct componentname *cnp, const char *fsname, vfs_context_t ctx, boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index);
228 #endif /* CONFIG_IMGSRC_ACCESS */
229
230 //snapshot functions
231 #if CONFIG_MNT_ROOTSNAP
232 static int __attribute__ ((noinline)) snapshot_root(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx);
233 #else
234 static int __attribute__ ((noinline)) snapshot_root(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx) __attribute__((unused));
235 #endif
236
237 __private_extern__
238 int sync_internal(void);
239
240 __private_extern__
241 int unlink1(vfs_context_t, vnode_t, user_addr_t, enum uio_seg, int);
242
243 static LCK_GRP_DECLARE(fd_vn_lck_grp, "fd_vnode_data");
244 static LCK_ATTR_DECLARE(fd_vn_lck_attr, 0, 0);
245
246 /* vars for sync mutex */
247 static LCK_GRP_DECLARE(sync_mtx_lck_grp, "sync thread");
248 static LCK_MTX_DECLARE(sync_mtx_lck, &sync_mtx_lck_grp);
249
250 extern lck_rw_t rootvnode_rw_lock;
251
252 VFS_SMR_DECLARE;
253 extern uint32_t nc_smr_enabled;
254
255 /*
256 * incremented each time a mount or unmount operation occurs
257 * used to invalidate the cached value of the rootvp in the
258 * mount structure utilized by cache_lookup_path
259 */
260 uint32_t mount_generation = 0;
261
262 /* counts number of mount and unmount operations */
263 unsigned int vfs_nummntops = 0;
264
265 /* system-wide, per-boot unique mount ID */
266 static _Atomic uint64_t mount_unique_id = 1;
267
268 extern const struct fileops vnops;
269 #if CONFIG_APPLEDOUBLE
270 extern errno_t rmdir_remove_orphaned_appleDouble(vnode_t, vfs_context_t, int *);
271 #endif /* CONFIG_APPLEDOUBLE */
272
273 /* Maximum buffer length supported by fsgetpath(2) */
274 #define FSGETPATH_MAXBUFLEN 8192
275
276 /*
277 * Virtual File System System Calls
278 */
279
280 /*
281 * Private in-kernel mounting spi (specific use-cases only)
282 */
283 boolean_t
vfs_iskernelmount(mount_t mp)284 vfs_iskernelmount(mount_t mp)
285 {
286 return (mp->mnt_kern_flag & MNTK_KERNEL_MOUNT) ? TRUE : FALSE;
287 }
288
/*
 * kernel_mount:
 *	Perform a mount on behalf of in-kernel callers (kext / kernel SPI).
 *
 * Parameters:
 *	fstype		filesystem type name to mount
 *	pvp		parent of the covered vnode, or NULLVP to look it up
 *	vp		vnode to cover, or NULLVP to look it up from `path`
 *	path		mount-on path (kernel address space)
 *	data		filesystem-specific mount arguments (kernel pointer,
 *			passed through to mount_common)
 *	datalen		unused
 *	syscall_flags	generic MNT_* mount flags
 *	kern_flags	KERNEL_MOUNT_* flags (sanitized below)
 *	ctx		caller's vfs context
 *
 * Returns:	0 on success, an errno value otherwise.
 *
 * Notes:	If `vp` is supplied by the caller, it (and `pvp`) must carry
 *		iocounts owned by the caller; when `vp` is NULLVP the lookup
 *		performed here takes the iocounts and they are released before
 *		returning.
 */
__private_extern__
int
kernel_mount(const char *fstype, vnode_t pvp, vnode_t vp, const char *path,
    void *data, __unused size_t datalen, int syscall_flags, uint32_t kern_flags,
    vfs_context_t ctx)
{
	struct nameidata nd;
	boolean_t did_namei;
	int error;

	NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(path), ctx);

	/* Drop any kernel-mount flag bits the caller may not request. */
	kern_flags &= KERNEL_MOUNT_SANITIZE_MASK;

	/*
	 * Get the vnode to be covered if it's not supplied
	 */
	if (vp == NULLVP) {
		error = namei(&nd);
		if (error) {
			if (kern_flags & (KERNEL_MOUNT_SNAPSHOT | KERNEL_MOUNT_VOLBYROLE_MASK)) {
				printf("failed to locate mount-on path: %s ", path);
			}
			return error;
		}
		vp = nd.ni_vp;
		pvp = nd.ni_dvp;
		did_namei = TRUE;
	} else {
		/*
		 * Caller supplied the vnodes; fill in just enough of the
		 * componentname (path buffer and length) for mount_common.
		 */
		char *pnbuf = CAST_DOWN(char *, path);

		nd.ni_cnd.cn_pnbuf = pnbuf;
		nd.ni_cnd.cn_pnlen = (int)(strlen(pnbuf) + 1);
		did_namei = FALSE;
	}

	/* Mark this as an in-kernel mount for mount_common. */
	kern_flags |= KERNEL_MOUNT_KMOUNT;
	error = mount_common(fstype, pvp, vp, &nd.ni_cnd, CAST_USER_ADDR_T(data),
	    syscall_flags, kern_flags, NULL, ctx);

	/* Only release iocounts/namei state we acquired ourselves. */
	if (did_namei) {
		vnode_put(vp);
		vnode_put(pvp);
		nameidone(&nd);
	}

	return error;
}
338
339 int
vfs_mount_at_path(const char * fstype,const char * path,vnode_t pvp,vnode_t vp,void * data,size_t datalen,int mnt_flags,int flags)340 vfs_mount_at_path(const char *fstype, const char *path,
341 vnode_t pvp, vnode_t vp, void *data, size_t datalen,
342 int mnt_flags, int flags)
343 {
344 int syscall_flags = MNT_AUTOMOUNTED | mnt_flags;
345 int error, km_flags = 0;
346 vfs_context_t ctx = (flags & VFS_MOUNT_FLAG_CURRENT_CONTEXT) ? vfs_context_current() : vfs_context_kernel();
347
348 /*
349 * This call is currently restricted to specific use cases.
350 */
351 if ((strcmp(fstype, "lifs") != 0) && (strcmp(fstype, "nfs") != 0)) {
352 return ENOTSUP;
353 }
354
355 #if !defined(XNU_TARGET_OS_OSX)
356 if (strcmp(fstype, "lifs") == 0) {
357 syscall_flags |= MNT_NOEXEC;
358 }
359 #endif
360
361 if (flags & VFS_MOUNT_FLAG_NOAUTH) {
362 km_flags |= KERNEL_MOUNT_NOAUTH;
363 }
364 if (flags & VFS_MOUNT_FLAG_PERMIT_UNMOUNT) {
365 km_flags |= KERNEL_MOUNT_PERMIT_UNMOUNT;
366 }
367
368 error = kernel_mount(fstype, pvp, vp, path, data, datalen,
369 syscall_flags, km_flags, ctx);
370 if (error) {
371 printf("%s: mount on %s failed, error %d\n", __func__, path,
372 error);
373 }
374
375 return error;
376 }
377
378 int
vfs_mount_override_type_name(mount_t mp,const char * name)379 vfs_mount_override_type_name(mount_t mp, const char *name)
380 {
381 if (mp == NULL || name == NULL) {
382 return EINVAL;
383 }
384
385 /* Override the FS type name. */
386 mount_lock_spin(mp);
387 strlcpy(mp->fstypename_override, name, sizeof(mp->fstypename_override));
388 mp->mnt_kern_flag |= MNTK_TYPENAME_OVERRIDE;
389 mount_unlock(mp);
390
391 return 0;
392 }
393
394 /*
395 * Mount a file system.
396 */
397 /* ARGSUSED */
398 int
mount(proc_t p,struct mount_args * uap,__unused int32_t * retval)399 mount(proc_t p, struct mount_args *uap, __unused int32_t *retval)
400 {
401 struct __mac_mount_args muap;
402
403 muap.type = uap->type;
404 muap.path = uap->path;
405 muap.flags = uap->flags;
406 muap.data = uap->data;
407 muap.mac_p = USER_ADDR_NULL;
408 return __mac_mount(p, &muap, retval);
409 }
410
/*
 * fmount:
 *	Mount a filesystem on the directory referenced by the open file
 *	descriptor uap->fd, rather than by path.
 *
 * Parameters:
 *	p	(unused) calling process
 *	uap	fd: descriptor for the vnode to cover
 *		type: user pointer to the filesystem type name
 *		flags: generic MNT_* mount flags
 *		data: user pointer to filesystem-specific arguments
 *	retval	(unused)
 *
 * Returns:	0 on success, an errno value otherwise
 *		(ENOTSUP for imgsrc/rootfs flags, EPERM for union mounts,
 *		EBUSY/EINVAL when no parent vnode can be obtained).
 */
int
fmount(__unused proc_t p, struct fmount_args *uap, __unused int32_t *retval)
{
	struct componentname cn;
	vfs_context_t ctx = vfs_context_current();
	size_t dummy = 0;
	int error;
	int flags = uap->flags;
	char fstypename[MFSNAMELEN];
	char *labelstr = NULL; /* regular mount call always sets it to NULL for __mac_mount() */
	vnode_t pvp;
	vnode_t vp;

	AUDIT_ARG(fd, uap->fd);
	AUDIT_ARG(fflags, flags);
	/* fstypename will get audited by mount_common */

	/* Sanity check the flags */
	if (flags & (MNT_IMGSRC_BY_INDEX | MNT_ROOTFS)) {
		return ENOTSUP;
	}

	if (flags & MNT_UNION) {
		return EPERM;
	}

	/* Fetch the filesystem type name from user space. */
	error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy);
	if (error) {
		return error;
	}

	/* Resolve the fd to its vnode; takes a file reference we must drop. */
	if ((error = file_vnode(uap->fd, &vp)) != 0) {
		return error;
	}

	/* Take an iocount on the vnode for the duration of the mount. */
	if ((error = vnode_getwithref(vp)) != 0) {
		file_drop(uap->fd);
		return error;
	}

	/*
	 * mount_common needs the covered vnode's parent; if it cannot be
	 * obtained, distinguish "already covered / is a root" (EBUSY) from
	 * other failures (EINVAL).
	 */
	pvp = vnode_getparent(vp);
	if (pvp == NULL) {
		if (vp->v_mountedhere || (vp->v_flag & VROOT) != 0) {
			error = EBUSY;
		} else {
			error = EINVAL;
		}
		vnode_put(vp);
		file_drop(uap->fd);
		return error;
	}

	/* Build a componentname holding the vnode's path for mount_common. */
	memset(&cn, 0, sizeof(struct componentname));
	cn.cn_pnbuf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	cn.cn_pnlen = MAXPATHLEN;

	if ((error = vn_getpath(vp, cn.cn_pnbuf, &cn.cn_pnlen)) != 0) {
		zfree(ZV_NAMEI, cn.cn_pnbuf);
		vnode_put(pvp);
		vnode_put(vp);
		file_drop(uap->fd);
		return error;
	}

	error = mount_common(fstypename, pvp, vp, &cn, uap->data, flags, KERNEL_MOUNT_FMOUNT, labelstr, ctx);

	/* Release the path buffer, both iocounts, and the file reference. */
	zfree(ZV_NAMEI, cn.cn_pnbuf);
	vnode_put(pvp);
	vnode_put(vp);
	file_drop(uap->fd);

	return error;
}
484
485 #define MAX_GRAFT_METADATA_SIZE 16384 /* bytes */
486
487 /*
488 * Get the size of a graft file (a manifest or payload file).
489 * The vp should be an iocounted vnode.
490 */
491 static int
get_and_verify_graft_metadata_vp_size(vnode_t graft_vp,vfs_context_t vctx,size_t * size)492 get_and_verify_graft_metadata_vp_size(vnode_t graft_vp, vfs_context_t vctx, size_t *size)
493 {
494 struct stat64 sb = {};
495 int error;
496
497 *size = 0;
498
499 error = vn_stat(graft_vp, &sb, NULL, 1, 0, vctx);
500 if (error) {
501 return error;
502 }
503
504 if (sb.st_size == 0) {
505 error = ENODATA;
506 } else if ((size_t) sb.st_size > MAX_GRAFT_METADATA_SIZE) {
507 error = EFBIG;
508 } else {
509 *size = (size_t) sb.st_size;
510 }
511
512 return error;
513 }
514
515 /*
516 * Read in a graft file (a manifest or payload file) of size `size` into `buf`.
517 * `size` must already be validated.
518 */
519 static int
read_graft_metadata_vp(vnode_t graft_vp,vfs_context_t vctx,size_t size,void * buf)520 read_graft_metadata_vp(vnode_t graft_vp, vfs_context_t vctx, size_t size, void *buf)
521 {
522 return vn_rdwr(UIO_READ, graft_vp,
523 (caddr_t) buf, (int) size, /* offset */ 0,
524 UIO_SYSSPACE, IO_NOCACHE | IO_RAOFF | IO_UNIT,
525 vfs_context_ucred(vctx), /* resid */ NULL,
526 vfs_context_proc(vctx));
527 }
528
529 /*
530 * Convert a single graft file descriptor into a vnode, get its size (saving it to `size`),
531 * and read it into `buf`.
532 */
533 static int
graft_secureboot_read_fd(int fd,vfs_context_t vctx,size_t * size,void * buf)534 graft_secureboot_read_fd(int fd, vfs_context_t vctx, size_t *size, void *buf)
535 {
536 vnode_t metadata_vp = NULLVP;
537 int error;
538
539 // Convert this graft fd to a vnode.
540 if ((error = vnode_getfromfd(vctx, fd, &metadata_vp)) != 0) {
541 goto out;
542 }
543
544 // Get (and validate) size information.
545 if ((error = get_and_verify_graft_metadata_vp_size(metadata_vp, vctx, size)) != 0) {
546 goto out;
547 }
548
549 // Read each file into the provided buffer - we must get the expected amount of bytes.
550 if ((error = read_graft_metadata_vp(metadata_vp, vctx, *size, buf)) != 0) {
551 goto out;
552 }
553
554 out:
555 if (metadata_vp) {
556 vnode_put(metadata_vp);
557 metadata_vp = NULLVP;
558 }
559
560 return error;
561 }
562
563 /*
564 * Read graft file descriptors into buffers of size MAX_GRAFT_METADATA_SIZE
565 * provided in `gfs`, saving the size of data read in `gfs`.
566 */
567 static int
graft_secureboot_read_metadata(secure_boot_cryptex_args_t * sbc_args,vfs_context_t vctx,fsioc_graft_fs_t * gfs)568 graft_secureboot_read_metadata(secure_boot_cryptex_args_t *sbc_args, vfs_context_t vctx,
569 fsioc_graft_fs_t *gfs)
570 {
571 int error;
572
573 // Read the authentic manifest.
574 if ((error = graft_secureboot_read_fd(sbc_args->sbc_authentic_manifest_fd, vctx,
575 &gfs->authentic_manifest_size, gfs->authentic_manifest))) {
576 return error;
577 }
578
579 // The user manifest is currently unused, but set its size.
580 gfs->user_manifest_size = 0;
581
582 // Read the payload.
583 if ((error = graft_secureboot_read_fd(sbc_args->sbc_payload_fd, vctx,
584 &gfs->payload_size, gfs->payload))) {
585 return error;
586 }
587
588 return 0;
589 }
590
591 /*
592 * Call into the filesystem to verify and graft a cryptex.
593 */
594 static int
graft_secureboot_cryptex(uint32_t graft_type,secure_boot_cryptex_args_t * sbc_args,vfs_context_t vctx,vnode_t cryptex_vp,vnode_t mounton_vp)595 graft_secureboot_cryptex(uint32_t graft_type, secure_boot_cryptex_args_t *sbc_args,
596 vfs_context_t vctx, vnode_t cryptex_vp, vnode_t mounton_vp)
597 {
598 fsioc_graft_fs_t gfs = {};
599 uint64_t graft_dir_ino = 0;
600 struct stat64 sb = {};
601 int error;
602
603 // Pre-flight arguments.
604 if (sbc_args->sbc_version != GRAFTDMG_SECURE_BOOT_CRYPTEX_ARGS_VERSION) {
605 // Make sure that this graft version matches what we support.
606 return ENOTSUP;
607 } else if (mounton_vp && cryptex_vp->v_mount != mounton_vp->v_mount) {
608 // For this type, cryptex VP must live on same volume as the target of graft.
609 return EXDEV;
610 } else if (mounton_vp && mounton_vp->v_type != VDIR) {
611 // We cannot graft upon non-directories.
612 return ENOTDIR;
613 } else if (sbc_args->sbc_authentic_manifest_fd < 0 ||
614 sbc_args->sbc_payload_fd < 0) {
615 // We cannot graft without a manifest and payload.
616 return EINVAL;
617 }
618
619 if (mounton_vp) {
620 // Get the mounton's inode number.
621 error = vn_stat(mounton_vp, &sb, NULL, 1, 0, vctx);
622 if (error) {
623 return error;
624 }
625 graft_dir_ino = (uint64_t) sb.st_ino;
626 }
627
628 // Create buffers (of our maximum-defined size) to store authentication info.
629 gfs.authentic_manifest = kalloc_data(MAX_GRAFT_METADATA_SIZE, Z_WAITOK | Z_ZERO);
630 gfs.payload = kalloc_data(MAX_GRAFT_METADATA_SIZE, Z_WAITOK | Z_ZERO);
631
632 if (!gfs.authentic_manifest || !gfs.payload) {
633 error = ENOMEM;
634 goto out;
635 }
636
637 // Read our fd's into our buffers.
638 // (Note that this will set the buffer size fields in `gfs`.)
639 error = graft_secureboot_read_metadata(sbc_args, vctx, &gfs);
640 if (error) {
641 goto out;
642 }
643
644 gfs.graft_version = FSIOC_GRAFT_VERSION;
645 gfs.graft_type = graft_type;
646 gfs.graft_4cc = sbc_args->sbc_4cc;
647 if (sbc_args->sbc_flags & SBC_PRESERVE_MOUNT) {
648 gfs.graft_flags |= FSCTL_GRAFT_PRESERVE_MOUNT;
649 }
650 if (sbc_args->sbc_flags & SBC_ALTERNATE_SHARED_REGION) {
651 gfs.graft_flags |= FSCTL_GRAFT_ALTERNATE_SHARED_REGION;
652 }
653 if (sbc_args->sbc_flags & SBC_SYSTEM_CONTENT) {
654 gfs.graft_flags |= FSCTL_GRAFT_SYSTEM_CONTENT;
655 }
656 if (sbc_args->sbc_flags & SBC_PANIC_ON_AUTHFAIL) {
657 gfs.graft_flags |= FSCTL_GRAFT_PANIC_ON_AUTHFAIL;
658 }
659 if (sbc_args->sbc_flags & SBC_STRICT_AUTH) {
660 gfs.graft_flags |= FSCTL_GRAFT_STRICT_AUTH;
661 }
662 if (sbc_args->sbc_flags & SBC_PRESERVE_GRAFT) {
663 gfs.graft_flags |= FSCTL_GRAFT_PRESERVE_GRAFT;
664 }
665 gfs.dir_ino = graft_dir_ino; // ino from mounton_vp (if not provided, the parent directory)
666
667 // Call into the FS to perform the graft (and validation).
668 error = VNOP_IOCTL(cryptex_vp, FSIOC_GRAFT_FS, (caddr_t)&gfs, 0, vctx);
669
670 out:
671 if (gfs.authentic_manifest) {
672 kfree_data(gfs.authentic_manifest, MAX_GRAFT_METADATA_SIZE);
673 gfs.authentic_manifest = NULL;
674 }
675 if (gfs.payload) {
676 kfree_data(gfs.payload, MAX_GRAFT_METADATA_SIZE);
677 gfs.payload = NULL;
678 }
679
680 return error;
681 }
682
683 #define GRAFTDMG_ENTITLEMENT "com.apple.private.vfs.graftdmg"
684
685 /*
686 * Graft a cryptex disk image (via FD) onto the appropriate mount-point
687 * { int graftdmg(int dmg_fd, const char *mountdir, uint32_t graft_type, graftdmg_args_un *gda); }
688 */
/*
 * graftdmg:
 *	System call entry point. Validates the caller's entitlement, copies
 *	in the graft arguments, resolves the optional mount-on directory and
 *	the disk-image fd to vnodes, then hands off to
 *	graft_secureboot_cryptex() to perform the graft.
 *
 * Returns:	0 on success; EPERM without the entitlement; EINVAL for an
 *		out-of-range graft type; otherwise an error from copyin,
 *		namei, vnode_getfromfd, or the graft itself.
 */
int
graftdmg(__unused proc_t p, struct graftdmg_args *uap, __unused int32_t *retval)
{
	int ua_dmgfd = uap->dmg_fd;
	user_addr_t ua_mountdir = uap->mountdir;
	uint32_t ua_grafttype = uap->graft_type;
	user_addr_t ua_graftargs = uap->gda;

	graftdmg_args_un kern_gda = {};
	int error = 0;
	secure_boot_cryptex_args_t *sbc_args = NULL;

	vnode_t cryptex_vp = NULLVP;
	vnode_t mounton_vp = NULLVP;
	struct nameidata nd = {};
	vfs_context_t ctx = vfs_context_current();

	/* Grafting is restricted to entitled tasks. */
	if (!IOTaskHasEntitlement(vfs_context_task(ctx), GRAFTDMG_ENTITLEMENT)) {
		return EPERM;
	}

	/* Copy in the (union of) graft argument structures. */
	error = copyin(ua_graftargs, &kern_gda, sizeof(graftdmg_args_un));
	if (error) {
		return error;
	}

	// Copy mount dir in, if provided.
	if (ua_mountdir != USER_ADDR_NULL) {
		// Acquire vnode for mount-on path
		NDINIT(&nd, LOOKUP, OP_MOUNT, (FOLLOW | AUDITVNPATH1),
		    UIO_USERSPACE, ua_mountdir, ctx);

		error = namei(&nd);
		if (error) {
			return error;
		}
		mounton_vp = nd.ni_vp;
	}

	// Convert fd to vnode.
	error = vnode_getfromfd(ctx, ua_dmgfd, &cryptex_vp);
	if (error) {
		goto graftout;
	}

	/* Validate the graft type before dispatching to the filesystem. */
	if (ua_grafttype == 0 || ua_grafttype > GRAFTDMG_CRYPTEX_MAX) {
		error = EINVAL;
	} else {
		sbc_args = &kern_gda.sbc_args;
		error = graft_secureboot_cryptex(ua_grafttype, sbc_args, ctx, cryptex_vp, mounton_vp);
	}

graftout:
	/* Release iocounts and namei state acquired above (in that order). */
	if (cryptex_vp) {
		vnode_put(cryptex_vp);
		cryptex_vp = NULLVP;
	}
	if (mounton_vp) {
		vnode_put(mounton_vp);
		mounton_vp = NULLVP;
	}
	if (ua_mountdir != USER_ADDR_NULL) {
		nameidone(&nd);
	}

	return error;
}
756
757 /*
758 * Ungraft a cryptex disk image (via mount dir FD)
759 * { int ungraftdmg(const char *mountdir, uint64_t flags); }
760 */
761 int
ungraftdmg(__unused proc_t p,struct ungraftdmg_args * uap,__unused int32_t * retval)762 ungraftdmg(__unused proc_t p, struct ungraftdmg_args *uap, __unused int32_t *retval)
763 {
764 int error = 0;
765 user_addr_t ua_mountdir = uap->mountdir;
766 fsioc_ungraft_fs_t ugfs;
767 vnode_t mounton_vp = NULLVP;
768 struct nameidata nd = {};
769 vfs_context_t ctx = vfs_context_current();
770
771 if (!IOTaskHasEntitlement(vfs_context_task(ctx), GRAFTDMG_ENTITLEMENT)) {
772 return EPERM;
773 }
774
775 if (uap->flags != 0 || ua_mountdir == USER_ADDR_NULL) {
776 return EINVAL;
777 }
778
779 ugfs.ungraft_flags = 0;
780
781 // Acquire vnode for mount-on path
782 NDINIT(&nd, LOOKUP, OP_MOUNT, (FOLLOW | AUDITVNPATH1),
783 UIO_USERSPACE, ua_mountdir, ctx);
784
785 error = namei(&nd);
786 if (error) {
787 return error;
788 }
789 mounton_vp = nd.ni_vp;
790
791 // Call into the FS to perform the ungraft
792 error = VNOP_IOCTL(mounton_vp, FSIOC_UNGRAFT_FS, (caddr_t)&ugfs, 0, ctx);
793
794 vnode_put(mounton_vp);
795 nameidone(&nd);
796
797 return error;
798 }
799
800
/*
 * Notify interested parties that a mount has occurred: broadcast a
 * VQ_MOUNT vfs event and post a NOTE_WRITE knote on the parent
 * directory of the covered vnode.
 */
void
vfs_notify_mount(vnode_t pdvp)
{
	vfs_event_signal(NULL, VQ_MOUNT, (intptr_t)NULL);
	lock_vnode_and_post(pdvp, NOTE_WRITE);
}
807
808 /*
809 * __mac_mount:
810 * Mount a file system taking into account MAC label behavior.
811 * See mount(2) man page for more information
812 *
813 * Parameters: p Process requesting the mount
814 * uap User argument descriptor (see below)
815 * retval (ignored)
816 *
817 * Indirect: uap->type Filesystem type
818 * uap->path Path to mount
819 * uap->data Mount arguments
820 * uap->mac_p MAC info
821 * uap->flags Mount flags
822 *
823 *
824 * Returns: 0 Success
825 * !0 Not success
826 */
827 boolean_t root_fs_upgrade_try = FALSE;
828
int
__mac_mount(struct proc *p, register struct __mac_mount_args *uap, __unused int32_t *retval)
{
	vnode_t pvp = NULL;
	vnode_t vp = NULL;
	int need_nameidone = 0;
	vfs_context_t ctx = vfs_context_current();
	char fstypename[MFSNAMELEN];
	struct nameidata nd;
	size_t dummy = 0;
	char *labelstr = NULL;
	size_t labelsz = 0;
	int flags = uap->flags;
	int error;
#if CONFIG_IMGSRC_ACCESS || CONFIG_MACF
	boolean_t is_64bit = IS_64BIT_PROCESS(p);
#else
#pragma unused(p)
#endif
	/*
	 * Get the fs type name from user space
	 */
	error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy);
	if (error) {
		return error;
	}

	/*
	 * Get the vnode to be covered
	 */
	NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
	    UIO_USERSPACE, uap->path, ctx);
	if (flags & MNT_NOFOLLOW) {
		/* MNT_NOFOLLOW: refuse to traverse any symlink in the path. */
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}
	error = namei(&nd);
	if (error) {
		goto out;
	}
	need_nameidone = 1;
	vp = nd.ni_vp;
	pvp = nd.ni_dvp;

#ifdef CONFIG_IMGSRC_ACCESS
	/* Mounting image source cannot be batched with other operations */
	/*
	 * NOTE(review): this tests flags for exact equality with
	 * MNT_IMGSRC_BY_INDEX (so the third argument below is always TRUE
	 * when reached) — looks intentional given the "cannot be batched"
	 * comment, but confirm no other flag bits are ever expected here.
	 */
	if (flags == MNT_IMGSRC_BY_INDEX) {
		error = relocate_imageboot_source(pvp, vp, &nd.ni_cnd, fstypename,
		    ctx, is_64bit, uap->data, (flags == MNT_IMGSRC_BY_INDEX));
		goto out;
	}
#endif /* CONFIG_IMGSRC_ACCESS */

#if CONFIG_MACF
	/*
	 * Get the label string (if any) from user space
	 */
	if (uap->mac_p != USER_ADDR_NULL) {
		struct user_mac mac;
		size_t ulen = 0;

		/* The user_mac layout differs between 32- and 64-bit callers. */
		if (is_64bit) {
			struct user64_mac mac64;
			error = copyin(uap->mac_p, &mac64, sizeof(mac64));
			mac.m_buflen = (user_size_t)mac64.m_buflen;
			mac.m_string = (user_addr_t)mac64.m_string;
		} else {
			struct user32_mac mac32;
			error = copyin(uap->mac_p, &mac32, sizeof(mac32));
			mac.m_buflen = mac32.m_buflen;
			mac.m_string = mac32.m_string;
		}
		if (error) {
			goto out;
		}
		/* Bound the label length before allocating for it. */
		if ((mac.m_buflen > MAC_MAX_LABEL_BUF_LEN) ||
		    (mac.m_buflen < 2)) {
			error = EINVAL;
			goto out;
		}
		labelsz = mac.m_buflen;
		labelstr = kalloc_data(labelsz, Z_WAITOK);
		error = copyinstr(mac.m_string, labelstr, mac.m_buflen, &ulen);
		if (error) {
			goto out;
		}
		AUDIT_ARG(mac_string, labelstr);
	}
#endif /* CONFIG_MACF */

	AUDIT_ARG(fflags, flags);

#if !CONFIG_UNION_MOUNTS
	/* Union mounts are not built into this kernel. */
	if (flags & MNT_UNION) {
		error = EPERM;
		goto out;
	}
#endif

	/* Special handling when the target is the root of the root filesystem. */
	if ((vp->v_flag & VROOT) &&
	    (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
#if CONFIG_UNION_MOUNTS
		if (!(flags & MNT_UNION)) {
			flags |= MNT_UPDATE;
		} else {
			/*
			 * For a union mount on '/', treat it as fresh
			 * mount instead of update.
			 * Otherwise, union mounting on '/' used to panic the
			 * system before, since mnt_vnodecovered was found to
			 * be NULL for '/' which is required for unionlookup
			 * after it gets ENOENT on union mount.
			 */
			flags = (flags & ~(MNT_UPDATE));
		}
#else
		flags |= MNT_UPDATE;
#endif /* CONFIG_UNION_MOUNTS */

#if SECURE_KERNEL
		if ((flags & MNT_RDONLY) == 0) {
			/* Release kernels are not allowed to mount "/" as rw */
			error = EPERM;
			goto out;
		}
#endif

		/*
		 * See 7392553 for more details on why this check exists.
		 * Suffice to say: If this check is ON and something tries
		 * to mount the rootFS RW, we'll turn off the codesign
		 * bitmap optimization.
		 */
#if CHECK_CS_VALIDATION_BITMAP
		if ((flags & MNT_RDONLY) == 0) {
			root_fs_upgrade_try = TRUE;
		}
#endif
	}

	error = mount_common(fstypename, pvp, vp, &nd.ni_cnd, uap->data, flags, 0,
	    labelstr, ctx);

out:

#if CONFIG_MACF
	/* kfree_data is a no-op for a NULL labelstr. */
	kfree_data(labelstr, labelsz);
#endif /* CONFIG_MACF */

	/* Drop the iocounts and namei state taken by the lookup above. */
	if (vp) {
		vnode_put(vp);
	}
	if (pvp) {
		vnode_put(pvp);
	}
	if (need_nameidone) {
		nameidone(&nd);
	}

	return error;
}
989
990 /*
991 * common mount implementation (final stage of mounting)
992 *
993 * Arguments:
994 * fstypename file system type (ie it's vfs name)
995 * pvp parent of covered vnode
996 * vp covered vnode
997 * cnp component name (ie path) of covered vnode
998 * flags generic mount flags
999 * fsmountargs file system specific data
1000 * labelstr optional MAC label
1001 * kernelmount TRUE for mounts initiated from inside the kernel
1002 * ctx caller's context
1003 */
static int
mount_common(const char *fstypename, vnode_t pvp, vnode_t vp,
    struct componentname *cnp, user_addr_t fsmountargs, int flags, uint32_t internal_flags,
    char *labelstr, vfs_context_t ctx)
{
#if !CONFIG_MACF
#pragma unused(labelstr)
#endif
	struct vnode *devvp = NULLVP;           /* block device vnode looked up from devpath */
	struct vnode *device_vnode = NULLVP;    /* device vnode actually passed down to VFS_MOUNT */
#if CONFIG_MACF
	struct vnode *rvp;
#endif
	struct mount *mp = NULL;
	struct vfstable *vfsp = (struct vfstable *)0;
	struct proc *p = vfs_context_proc(ctx);
	int error, flag = 0;                    /* 'flag' saves mnt_flag for restore on a failed update */
	bool flag_set = false;
	user_addr_t devpath = USER_ADDR_NULL;
	int ronly = 0;
	int mntalloc = 0;                       /* nonzero once 'mp' was allocated here (vs. MNT_UPDATE) */
	boolean_t vfsp_ref = FALSE;             /* we bumped vfsp->vfc_refcount */
	boolean_t is_rwlock_locked = FALSE;     /* we hold mnt_rwlock exclusive */
	boolean_t did_rele = FALSE;
	boolean_t have_usecount = FALSE;        /* we hold a usecount on the covered vnode */
	boolean_t did_set_lmount = FALSE;       /* we set MNT_LMOUNT and must clear it on exit */
	boolean_t kernelmount = !!(internal_flags & KERNEL_MOUNT_KMOUNT);

#if CONFIG_ROSV_STARTUP || CONFIG_MOUNT_VM || CONFIG_BASESYSTEMROOT
	/* Check for mutually-exclusive flag bits */
	uint32_t checkflags = (internal_flags & (KERNEL_MOUNT_VOLBYROLE_MASK | KERNEL_MOUNT_BASESYSTEMROOT));
	int bitcount = 0;
	while (checkflags != 0) {
		/* Kernighan popcount: each pass clears the lowest set bit */
		checkflags &= (checkflags - 1);
		bitcount++;
	}

	if (bitcount > 1) {
		//not allowed to request multiple mount-by-role flags
		error = EINVAL;
		goto out1;
	}
#endif

	/*
	 * Process an update for an existing mount
	 */
	if (flags & MNT_UPDATE) {
		/* updates must target the root vnode of an already-mounted fs */
		if ((vp->v_flag & VROOT) == 0) {
			error = EINVAL;
			goto out1;
		}
		mp = vp->v_mount;

		/* if unmount or mount in progress, return error */
		mount_lock_spin(mp);
		if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
			mount_unlock(mp);
			error = EBUSY;
			goto out1;
		}
		mp->mnt_lflag |= MNT_LMOUNT;
		did_set_lmount = TRUE;
		mount_unlock(mp);
		lck_rw_lock_exclusive(&mp->mnt_rwlock);
		is_rwlock_locked = TRUE;
		/*
		 * We only allow the filesystem to be reloaded if it
		 * is currently mounted read-only.
		 */
		if ((flags & MNT_RELOAD) &&
		    ((mp->mnt_flag & MNT_RDONLY) == 0)) {
			error = ENOTSUP;
			goto out1;
		}

		/*
		 * If content protection is enabled, update mounts are not
		 * allowed to turn it off.
		 */
		if ((mp->mnt_flag & MNT_CPROTECT) &&
		    ((flags & MNT_CPROTECT) == 0)) {
			error = EINVAL;
			goto out1;
		}

		/*
		 * can't turn off MNT_REMOVABLE either but it may be an unexpected
		 * failure to return an error for this so we'll just silently
		 * add it if it is not passed in.
		 */
		if ((mp->mnt_flag & MNT_REMOVABLE) &&
		    ((flags & MNT_REMOVABLE) == 0)) {
			flags |= MNT_REMOVABLE;
		}

		/* Can't downgrade the backer of the root FS */
		if ((mp->mnt_kern_flag & MNTK_BACKS_ROOT) &&
		    (!vfs_isrdonly(mp)) && (flags & MNT_RDONLY)) {
			error = ENOTSUP;
			goto out1;
		}

		/*
		 * Only root, or the user that did the original mount is
		 * permitted to update it.
		 */
		if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
		    (error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
			goto out1;
		}
#if CONFIG_MACF
		error = mac_mount_check_remount(ctx, mp);
		if (error != 0) {
			goto out1;
		}
#endif
		/*
		 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV,
		 * and MNT_NOEXEC if mount point is already MNT_NOEXEC.
		 */
		if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
			flags |= MNT_NOSUID | MNT_NODEV;
			if (mp->mnt_flag & MNT_NOEXEC) {
				flags |= MNT_NOEXEC;
			}
		}
		/* remember the pre-update flags so a failed update can restore them */
		flag = mp->mnt_flag;
		flag_set = true;



		mp->mnt_flag |= flags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE);

		vfsp = mp->mnt_vtable;
		goto update;
	} // MNT_UPDATE

	/*
	 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV, and
	 * MNT_NOEXEC if mount point is already MNT_NOEXEC.
	 */
	if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
		flags |= MNT_NOSUID | MNT_NODEV;
		if (vp->v_mount->mnt_flag & MNT_NOEXEC) {
			flags |= MNT_NOEXEC;
		}
	}

	/* XXXAUDIT: Should we capture the type on the error path as well? */
	/* XXX cast-away const (audit_arg_text() does not modify its input) */
	AUDIT_ARG(text, (char *)(uintptr_t)fstypename);
	/* find the filesystem type entry by name; take a ref so it can't unload */
	mount_list_lock();
	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
		if (!strncmp(vfsp->vfc_name, fstypename, MFSNAMELEN)) {
			vfsp->vfc_refcount++;
			vfsp_ref = TRUE;
			break;
		}
	}
	mount_list_unlock();
	if (vfsp == NULL) {
		error = ENODEV;
		goto out1;
	}

	/*
	 * VFC_VFSLOCALARGS is not currently supported for kernel mounts,
	 * except in ROSV configs and for the initial BaseSystem root.
	 */
	if (kernelmount && (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) &&
	    ((internal_flags & KERNEL_MOUNT_VOLBYROLE_MASK) == 0) &&
	    ((internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) == 0)) {
		error = EINVAL;         /* unsupported request */
		goto out1;
	}

	/* flush the covered vnode and mark it VMOUNT (fails if already busy) */
	error = prepare_coveredvp(vp, ctx, cnp, fstypename, internal_flags);
	if (error != 0) {
		goto out1;
	}

	/*
	 * Allocate and initialize the filesystem (mount_t)
	 */
	mp = zalloc_flags(mount_zone, Z_WAITOK | Z_ZERO);
	mntalloc = 1;

	/* Initialize the default IO constraints */
	mp->mnt_maxreadcnt = mp->mnt_maxwritecnt = MAXPHYS;
	mp->mnt_segreadcnt = mp->mnt_segwritecnt = 32;
	mp->mnt_maxsegreadsize = mp->mnt_maxreadcnt;
	mp->mnt_maxsegwritesize = mp->mnt_maxwritecnt;
	mp->mnt_devblocksize = DEV_BSIZE;
	mp->mnt_alignmentmask = PAGE_MASK;
	mp->mnt_ioqueue_depth = MNT_DEFAULT_IOQUEUE_DEPTH;
	mp->mnt_ioscale = 1;
	mp->mnt_ioflags = 0;
	mp->mnt_realrootvp = NULLVP;
	mp->mnt_authcache_ttl = CACHED_LOOKUP_RIGHT_TTL;

	/* mark a mount in progress; cleared in 'exit'/'out1' paths */
	mp->mnt_lflag |= MNT_LMOUNT;
	did_set_lmount = TRUE;

	TAILQ_INIT(&mp->mnt_vnodelist);
	TAILQ_INIT(&mp->mnt_workerqueue);
	TAILQ_INIT(&mp->mnt_newvnodes);
	mount_lock_init(mp);
	lck_rw_lock_exclusive(&mp->mnt_rwlock);
	is_rwlock_locked = TRUE;
	mp->mnt_op = vfsp->vfc_vfsops;
	mp->mnt_vtable = vfsp;
	//mp->mnt_stat.f_type = vfsp->vfc_typenum;
	mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
	strlcpy(mp->mnt_vfsstat.f_fstypename, vfsp->vfc_name, MFSTYPENAMELEN);
	do {
		size_t pathlen = MAXPATHLEN;

		/* prefer the resolved path; fall back to the caller-supplied pathname */
		if (vn_getpath_ext(vp, pvp, mp->mnt_vfsstat.f_mntonname, &pathlen, VN_GETPATH_FSENTER)) {
			strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);
		}
	} while (0);
	mp->mnt_vnodecovered = vp;
	mp->mnt_vfsstat.f_owner = kauth_cred_getuid(vfs_context_ucred(ctx));
	mp->mnt_throttle_mask = LOWPRI_MAX_NUM_DEV - 1;
	mp->mnt_devbsdunit = 0;
	mp->mnt_mount_id = os_atomic_inc_orig(&mount_unique_id, relaxed);

	/* XXX 3762912 hack to support HFS filesystem 'owner' - filesystem may update later */
	vfs_setowner(mp, KAUTH_UID_NONE, KAUTH_GID_NONE);

	if (kernelmount) {
		mp->mnt_kern_flag |= MNTK_KERNEL_MOUNT;
	}
	if ((internal_flags & KERNEL_MOUNT_PERMIT_UNMOUNT) != 0) {
		mp->mnt_kern_flag |= MNTK_PERMIT_UNMOUNT;
	}

	if (KERNEL_MOUNT_DEVFS & internal_flags) {
		// kernel mounted devfs
		mp->mnt_kern_flag |= MNTK_SYSTEM;
	}

update:
	/* fresh mounts and MNT_UPDATE converge here with mnt_rwlock held */

	/*
	 * Set the mount level flags.
	 */
	if (flags & MNT_RDONLY) {
		mp->mnt_flag |= MNT_RDONLY;
	} else if (mp->mnt_flag & MNT_RDONLY) {
		// disallow read/write upgrades of file systems that
		// had the TYPENAME_OVERRIDE feature set.
		if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
			error = EPERM;
			goto out1;
		}
		mp->mnt_kern_flag |= MNTK_WANTRDWR;
	}
	mp->mnt_flag &= ~(MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
	    MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
	    MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
	    MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME | MNT_STRICTATIME |
	    MNT_QUARANTINE | MNT_CPROTECT);

#if SECURE_KERNEL
#if !CONFIG_MNT_SUID
	/*
	 * On release builds of iOS based platforms, always enforce NOSUID on
	 * all mounts. We do this here because we can catch update mounts as well as
	 * non-update mounts in this case.
	 */
	mp->mnt_flag |= (MNT_NOSUID);
#endif
#endif

	mp->mnt_flag |= flags & (MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
	    MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
	    MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
	    MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME | MNT_STRICTATIME |
	    MNT_QUARANTINE | MNT_CPROTECT);

#if CONFIG_MACF
	if (flags & MNT_MULTILABEL) {
		if (vfsp->vfc_vfsflags & VFC_VFSNOMACLABEL) {
			error = EINVAL;
			goto out1;
		}
		mp->mnt_flag |= MNT_MULTILABEL;
	}
#endif
	/*
	 * Process device path for local file systems if requested.
	 *
	 * Snapshot and mount-by-role mounts do not use this path; they are
	 * passing other opaque data in the device path field.
	 *
	 * Basesystemroot mounts pass a device path to be resolved here,
	 * but it's just a char * already inside the kernel, which
	 * kernel_mount() shoved into a user_addr_t to call us. So for such
	 * mounts we must skip copyin (both of the address and of the string
	 * (in NDINIT).
	 */
	if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS &&
	    !(internal_flags & (KERNEL_MOUNT_SNAPSHOT | KERNEL_MOUNT_VOLBYROLE_MASK))) {
		boolean_t do_copyin_devpath = true;
#if CONFIG_BASESYSTEMROOT
		if (internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) {
			// KERNEL_MOUNT_BASESYSTEMROOT implies subtle behavior worth noting:
			// We have been passed fsmountargs, which is typed as a user_addr_t,
			// but is actually a char ** pointing to a (kernelspace) string.
			// We manually unpack it with a series of casts and dereferences
			// that reverses what was done just above us on the stack in
			// imageboot_pivot_image().
			// After retrieving the path to the dev node (which we will NDINIT
			// in a moment), we pass NULL fsmountargs on to the filesystem.
			_Static_assert(sizeof(char **) == sizeof(fsmountargs), "fsmountargs should fit a (kernel) address");
			char **devnamepp = (char **)fsmountargs;
			char *devnamep = *devnamepp;
			devpath = CAST_USER_ADDR_T(devnamep);
			do_copyin_devpath = false;
			fsmountargs = USER_ADDR_NULL;

			//Now that we have a mp, denote that this mount is for the basesystem.
			mp->mnt_supl_kern_flag |= MNTK_SUPL_BASESYSTEM;
		}
#endif // CONFIG_BASESYSTEMROOT

		if (do_copyin_devpath) {
			/* first element of fsmountargs is the user pointer to the device path */
			if (vfs_context_is64bit(ctx)) {
				if ((error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath)))) {
					goto out1;
				}
				fsmountargs += sizeof(devpath);
			} else {
				user32_addr_t tmp;
				if ((error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp)))) {
					goto out1;
				}
				/* munge into LP64 addr */
				devpath = CAST_USER_ADDR_T(tmp);
				fsmountargs += sizeof(tmp);
			}
		}

		/* Lookup device and authorize access to it */
		if ((devpath)) {
			struct nameidata nd;

			enum uio_seg seg = UIO_USERSPACE;
#if CONFIG_BASESYSTEMROOT
			if (internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) {
				/* devpath is a kernel string in this case; see above */
				seg = UIO_SYSSPACE;
			}
#endif // CONFIG_BASESYSTEMROOT

			NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW, seg, devpath, ctx);
			if ((error = namei(&nd))) {
				goto out1;
			}

			strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);
			devvp = nd.ni_vp;

			nameidone(&nd);

			if (devvp->v_type != VBLK) {
				error = ENOTBLK;
				goto out2;
			}
			if (major(devvp->v_rdev) >= nblkdev) {
				error = ENXIO;
				goto out2;
			}
			/*
			 * If mount by non-root, then verify that user has necessary
			 * permissions on the device.
			 */
			if (suser(vfs_context_ucred(ctx), NULL) != 0) {
				mode_t accessmode = KAUTH_VNODE_READ_DATA;

				if ((mp->mnt_flag & MNT_RDONLY) == 0) {
					accessmode |= KAUTH_VNODE_WRITE_DATA;
				}
				if ((error = vnode_authorize(devvp, NULL, accessmode, ctx)) != 0) {
					goto out2;
				}
			}
		}
		/* On first mount, preflight and open device */
		if (devpath && ((flags & MNT_UPDATE) == 0)) {
			if ((error = vnode_ref(devvp))) {
				goto out2;
			}
			/*
			 * Disallow multiple mounts of the same device.
			 * Disallow mounting of a device that is currently in use
			 * (except for root, which might share swap device for miniroot).
			 * Flush out any old buffers remaining from a previous use.
			 */
			if ((error = vfs_mountedon(devvp))) {
				goto out3;
			}

			if (vcount(devvp) > 1 && !(vfs_flags(mp) & MNT_ROOTFS)) {
				error = EBUSY;
				goto out3;
			}
			if ((error = VNOP_FSYNC(devvp, MNT_WAIT, ctx))) {
				error = ENOTBLK;
				goto out3;
			}
			if ((error = buf_invalidateblks(devvp, BUF_WRITE_DATA, 0, 0))) {
				goto out3;
			}

			ronly = (mp->mnt_flag & MNT_RDONLY) != 0;
#if CONFIG_MACF
			error = mac_vnode_check_open(ctx,
			    devvp,
			    ronly ? FREAD : FREAD | FWRITE);
			if (error) {
				goto out3;
			}
#endif /* MAC */
			if ((error = VNOP_OPEN(devvp, ronly ? FREAD : FREAD | FWRITE, ctx))) {
				goto out3;
			}

			mp->mnt_devvp = devvp;
			device_vnode = devvp;
		} else if ((mp->mnt_flag & MNT_RDONLY) &&
		    (mp->mnt_kern_flag & MNTK_WANTRDWR) &&
		    (device_vnode = mp->mnt_devvp)) {
			dev_t dev;
			int maj;
			/*
			 * If upgrade to read-write by non-root, then verify
			 * that user has necessary permissions on the device.
			 */
			vnode_getalways(device_vnode);

			if (suser(vfs_context_ucred(ctx), NULL) &&
			    (error = vnode_authorize(device_vnode, NULL,
			    KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA,
			    ctx)) != 0) {
				vnode_put(device_vnode);
				goto out2;
			}

			/* Tell the device that we're upgrading */
			dev = (dev_t)device_vnode->v_rdev;
			maj = major(dev);

			if ((u_int)maj >= (u_int)nblkdev) {
				panic("Volume mounted on a device with invalid major number.");
			}

			error = bdevsw[maj].d_open(dev, FREAD | FWRITE, S_IFBLK, p);
			vnode_put(device_vnode);
			device_vnode = NULLVP;
			if (error != 0) {
				goto out2;
			}
		}
	} // localargs && !(snapshot | data | vm)

#if CONFIG_MACF
	if ((flags & MNT_UPDATE) == 0) {
		mac_mount_label_init(mp);
		mac_mount_label_associate(ctx, mp);
	}
	if (labelstr) {
		if ((flags & MNT_UPDATE) != 0) {
			error = mac_mount_check_label_update(ctx, mp);
			if (error != 0) {
				goto out3;
			}
		}
	}
#endif
	/*
	 * Mount the filesystem. We already asserted that internal_flags
	 * cannot have more than one mount-by-role bit set.
	 */
	if (internal_flags & KERNEL_MOUNT_SNAPSHOT) {
		error = VFS_IOCTL(mp, VFSIOC_MOUNT_SNAPSHOT,
		    (caddr_t)fsmountargs, 0, ctx);
	} else if (internal_flags & KERNEL_MOUNT_DATAVOL) {
#if CONFIG_ROSV_STARTUP
		/* fsmountargs carries the system volume's mount_t for role mounts */
		struct mount *origin_mp = (struct mount*)fsmountargs;
		fs_role_mount_args_t frma = {origin_mp, VFS_DATA_ROLE};
		error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
		if (error) {
			printf("MOUNT-BY-ROLE (%d) failed! (%d)", VFS_DATA_ROLE, error);
		} else {
			/* Mark volume associated with system volume */
			mp->mnt_kern_flag |= MNTK_SYSTEM;

			/* Attempt to acquire the mnt_devvp and set it up */
			struct vnode *mp_devvp = NULL;
			if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
				errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
				    0, &mp_devvp, vfs_context_kernel());
				if (!lerr) {
					mp->mnt_devvp = mp_devvp;
					//vnode_lookup took an iocount, need to drop it.
					vnode_put(mp_devvp);
					// now set `device_vnode` to the devvp that was acquired.
					// this is needed in order to ensure vfs_init_io_attributes is invoked.
					// note that though the iocount above was dropped, the mount acquires
					// an implicit reference against the device.
					device_vnode = mp_devvp;
				}
			}
		}
#else
		error = EINVAL;
#endif
	} else if (internal_flags & KERNEL_MOUNT_VMVOL) {
#if CONFIG_MOUNT_VM
		struct mount *origin_mp = (struct mount*)fsmountargs;
		fs_role_mount_args_t frma = {origin_mp, VFS_VM_ROLE};
		error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
		if (error) {
			printf("MOUNT-BY-ROLE (%d) failed! (%d)", VFS_VM_ROLE, error);
		} else {
			/* Mark volume associated with system volume and a swap mount */
			mp->mnt_kern_flag |= (MNTK_SYSTEM | MNTK_SWAP_MOUNT);
			/* Attempt to acquire the mnt_devvp and set it up */
			struct vnode *mp_devvp = NULL;
			if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
				errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
				    0, &mp_devvp, vfs_context_kernel());
				if (!lerr) {
					mp->mnt_devvp = mp_devvp;
					//vnode_lookup took an iocount, need to drop it.
					vnode_put(mp_devvp);

					// now set `device_vnode` to the devvp that was acquired.
					// note that though the iocount above was dropped, the mount acquires
					// an implicit reference against the device.
					device_vnode = mp_devvp;
				}
			}
		}
#else
		error = EINVAL;
#endif
	} else if ((internal_flags & KERNEL_MOUNT_PREBOOTVOL) || (internal_flags & KERNEL_MOUNT_RECOVERYVOL)) {
#if CONFIG_MOUNT_PREBOOTRECOVERY
		struct mount *origin_mp = (struct mount*)fsmountargs;
		uint32_t mount_role = 0;
		if (internal_flags & KERNEL_MOUNT_PREBOOTVOL) {
			mount_role = VFS_PREBOOT_ROLE;
		} else if (internal_flags & KERNEL_MOUNT_RECOVERYVOL) {
			mount_role = VFS_RECOVERY_ROLE;
		}

		if (mount_role != 0) {
			fs_role_mount_args_t frma = {origin_mp, mount_role};
			error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
			if (error) {
				printf("MOUNT-BY-ROLE (%d) failed! (%d)", mount_role, error);
			} else {
				// NOT YET - need to qualify how this interacts with shutdown, ERP/ERB, etc
				/* Mark volume associated with system volume */
				//mp->mnt_kern_flag |= MNTK_SYSTEM;
				/* Attempt to acquire the mnt_devvp and set it up */
				struct vnode *mp_devvp = NULL;
				if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
					errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
					    0, &mp_devvp, vfs_context_kernel());
					if (!lerr) {
						mp->mnt_devvp = mp_devvp;
						//vnode_lookup took an iocount, need to drop it.
						vnode_put(mp_devvp);

						// now set `device_vnode` to the devvp that was acquired.
						// note that though the iocount above was dropped, the mount acquires
						// an implicit reference against the device.
						device_vnode = mp_devvp;
					}
				}
			}
		} else {
			printf("MOUNT-BY-ROLE (%d) failed - ROLE UNRECOGNIZED! (%d)", mount_role, error);
			error = EINVAL;
		}
#else
		error = EINVAL;
#endif
	} else {
		/* ordinary mount: hand the (possibly NULL) device vnode to the fs */
		error = VFS_MOUNT(mp, device_vnode, fsmountargs, ctx);
	}

	if (flags & MNT_UPDATE) {
		if (mp->mnt_kern_flag & MNTK_WANTRDWR) {
			mp->mnt_flag &= ~MNT_RDONLY;
		}
		mp->mnt_flag &= ~
		    (MNT_UPDATE | MNT_RELOAD | MNT_FORCE);
		mp->mnt_kern_flag &= ~MNTK_WANTRDWR;
		if (error) {
			mp->mnt_flag = flag;    /* restore flag value */
		}
		vfs_event_signal(NULL, VQ_UPDATE, (intptr_t)NULL);
		lck_rw_done(&mp->mnt_rwlock);
		is_rwlock_locked = FALSE;
		if (!error) {
			enablequotas(mp, ctx);
		}
		goto exit;
	}

	/*
	 * Put the new filesystem on the mount list after root.
	 */
	if (error == 0) {
		struct vfs_attr vfsattr;
		if (device_vnode) {
			/*
			 * cache the IO attributes for the underlying physical media...
			 * an error return indicates the underlying driver doesn't
			 * support all the queries necessary... however, reasonable
			 * defaults will have been set, so no reason to bail or care
			 *
			 * Need to do this before calling the MAC hook as it needs
			 * information from this call.
			 */
			vfs_init_io_attributes(device_vnode, mp);
		}

#if CONFIG_MACF
		error = mac_mount_check_mount_late(ctx, mp);
		if (error != 0) {
			goto out4;
		}

		if (vfs_flags(mp) & MNT_MULTILABEL) {
			error = VFS_ROOT(mp, &rvp, ctx);
			if (error) {
				printf("%s() VFS_ROOT returned %d\n", __func__, error);
				goto out4;
			}
			error = vnode_label(mp, NULL, rvp, NULL, 0, ctx);
			/*
			 * drop reference provided by VFS_ROOT
			 */
			vnode_put(rvp);

			if (error) {
				goto out4;
			}
		}
#endif /* MAC */

		/* hook the new mount into the covered vnode */
		vnode_lock_spin(vp);
		CLR(vp->v_flag, VMOUNT);
		vp->v_mountedhere = mp;
		SET(vp->v_flag, VMOUNTEDHERE);
		vnode_unlock(vp);

		/*
		 * taking the name_cache_lock exclusively will
		 * ensure that everyone is out of the fast path who
		 * might be trying to use a now stale copy of
		 * vp->v_mountedhere->mnt_realrootvp
		 * bumping mount_generation causes the cached values
		 * to be invalidated
		 */
		name_cache_lock();
		mount_generation++;
		name_cache_unlock();

		error = vnode_ref(vp);
		if (error != 0) {
			goto out4;
		}

		have_usecount = TRUE;

		error = checkdirs(vp, ctx);
		if (error != 0) {
			/* Unmount the filesystem as cdir/rdirs cannot be updated */
			goto out4;
		}
		/*
		 * there is no cleanup code here so I have made it void
		 * we need to revisit this
		 */
		(void)VFS_START(mp, 0, ctx);

		if (mount_list_add(mp) != 0) {
			/*
			 * The system is shutting down trying to umount
			 * everything, so fail with a plausible errno.
			 */
			error = EBUSY;
			goto out4;
		}
		lck_rw_done(&mp->mnt_rwlock);
		is_rwlock_locked = FALSE;

		/* Check if this mounted file system supports EAs or named streams. */
		/* Skip WebDAV file systems for now since they hang in VFS_GETATTR here. */
		VFSATTR_INIT(&vfsattr);
		VFSATTR_WANTED(&vfsattr, f_capabilities);
		if (strncmp(mp->mnt_vfsstat.f_fstypename, "webdav", sizeof("webdav")) != 0 &&
		    vfs_getattr(mp, &vfsattr, ctx) == 0 &&
		    VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR) &&
			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR)) {
				mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
			}
#if NAMEDSTREAMS
			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS) &&
			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS)) {
				mp->mnt_kern_flag |= MNTK_NAMED_STREAMS;
			}
#endif
			/* Check if this file system supports path from id lookups. */
			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID) &&
			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID)) {
				mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
			} else if (mp->mnt_flag & MNT_DOVOLFS) {
				/* Legacy MNT_DOVOLFS flag also implies path from id lookups. */
				mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
			}

			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS) &&
			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS)) {
				mp->mnt_kern_flag |= MNTK_DIR_HARDLINKS;
			}
		}
		if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSNATIVEXATTR) {
			mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
		}
		if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSPREFLIGHT) {
			mp->mnt_kern_flag |= MNTK_UNMOUNT_PREFLIGHT;
		}
		/* increment the operations count */
		OSAddAtomic(1, &vfs_nummntops);
		enablequotas(mp, ctx);

		if (device_vnode) {
			device_vnode->v_specflags |= SI_MOUNTEDON;
		}

		/* Now that mount is setup, notify the listeners */
		vfs_notify_mount(pvp);
		IOBSDMountChange(mp, kIOMountChangeMount);
	} else {
		/* If we fail a fresh mount, there should be no vnodes left hooked into the mountpoint. */
		if (mp->mnt_vnodelist.tqh_first != NULL) {
			panic("mount_common(): mount of %s filesystem failed with %d, but vnode list is not empty.",
			    mp->mnt_vtable->vfc_name, error);
		}

		vnode_lock_spin(vp);
		CLR(vp->v_flag, VMOUNT);
		vnode_unlock(vp);
		mount_list_lock();
		mp->mnt_vtable->vfc_refcount--;
		mount_list_unlock();

		if (device_vnode) {
			vnode_rele(device_vnode);
			VNOP_CLOSE(device_vnode, ronly ? FREAD : FREAD | FWRITE, ctx);
		}
		lck_rw_done(&mp->mnt_rwlock);
		is_rwlock_locked = FALSE;

		if (nc_smr_enabled) {
			vfs_smr_synchronize();
		}

		/*
		 * if we get here, we have a mount structure that needs to be freed,
		 * but since the coveredvp hasn't yet been updated to point at it,
		 * no need to worry about other threads holding a crossref on this mp
		 * so it's ok to just free it
		 */
		mount_lock_destroy(mp);
#if CONFIG_MACF
		mac_mount_label_destroy(mp);
#endif
		zfree(mount_zone, mp);
		/* mp is gone; don't touch mnt_lflag in the exit path */
		did_set_lmount = false;
	}
exit:
	/*
	 * drop I/O count on the device vp if there was one
	 */
	if (devpath && devvp) {
		vnode_put(devvp);
	}

	if (did_set_lmount) {
		mount_lock_spin(mp);
		mp->mnt_lflag &= ~MNT_LMOUNT;
		mount_unlock(mp);
	}

	return error;

	/* Error condition exits */
out4:
	(void)VFS_UNMOUNT(mp, MNT_FORCE, ctx);

	/*
	 * If the mount has been placed on the covered vp,
	 * it may have been discovered by now, so we have
	 * to treat this just like an unmount
	 */
	mount_lock_spin(mp);
	mp->mnt_lflag |= MNT_LDEAD;
	mount_unlock(mp);

	if (device_vnode != NULLVP) {
		vnode_rele(device_vnode);
		VNOP_CLOSE(device_vnode, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD | FWRITE,
		    ctx);
		did_rele = TRUE;
	}

	vnode_lock_spin(vp);

	/* other threads may have found the mount via the covered vp; crossref it */
	mp->mnt_crossref++;
	CLR(vp->v_flag, VMOUNTEDHERE);
	vp->v_mountedhere = (mount_t) 0;

	vnode_unlock(vp);

	if (have_usecount) {
		vnode_rele(vp);
	}
out3:
	if (devpath && ((flags & MNT_UPDATE) == 0) && (!did_rele)) {
		vnode_rele(devvp);
	}
out2:
	if (devpath && devvp) {
		vnode_put(devvp);
	}
out1:
	/* Release mnt_rwlock only when it was taken */
	if (is_rwlock_locked == TRUE) {
		if (flag_set) {
			mp->mnt_flag = flag;    /* restore mnt_flag value */
		}
		lck_rw_done(&mp->mnt_rwlock);
	}

	if (did_set_lmount) {
		mount_lock_spin(mp);
		mp->mnt_lflag &= ~MNT_LMOUNT;
		mount_unlock(mp);
	}

	if (mntalloc) {
		if (mp->mnt_crossref) {
			/* let the last crossref holder free the mount */
			mount_dropcrossref(mp, vp, 0);
		} else {
			if (nc_smr_enabled) {
				vfs_smr_synchronize();
			}

			mount_lock_destroy(mp);
#if CONFIG_MACF
			mac_mount_label_destroy(mp);
#endif
			zfree(mount_zone, mp);
		}
	}
	if (vfsp_ref) {
		mount_list_lock();
		vfsp->vfc_refcount--;
		mount_list_unlock();
	}

	return error;
}
1887
1888 /*
1889 * Flush in-core data, check for competing mount attempts,
1890 * and set VMOUNT
1891 */
1892 int
prepare_coveredvp(vnode_t vp,vfs_context_t ctx,struct componentname * cnp,const char * fsname,uint32_t internal_flags)1893 prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, uint32_t internal_flags)
1894 {
1895 #if !CONFIG_MACF
1896 #pragma unused(cnp,fsname)
1897 #endif
1898 struct vnode_attr va;
1899 int error;
1900 boolean_t skip_auth = !!(internal_flags & KERNEL_MOUNT_NOAUTH);
1901 boolean_t is_fmount = !!(internal_flags & KERNEL_MOUNT_FMOUNT);
1902 boolean_t is_busy;
1903
1904 if (!skip_auth) {
1905 /*
1906 * If the user is not root, ensure that they own the directory
1907 * onto which we are attempting to mount.
1908 */
1909 VATTR_INIT(&va);
1910 VATTR_WANTED(&va, va_uid);
1911 if ((error = vnode_getattr(vp, &va, ctx)) ||
1912 (va.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
1913 (!vfs_context_issuser(ctx)))) {
1914 error = EPERM;
1915 goto out;
1916 }
1917 }
1918
1919 if ((error = VNOP_FSYNC(vp, MNT_WAIT, ctx))) {
1920 goto out;
1921 }
1922
1923 if ((error = buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0))) {
1924 goto out;
1925 }
1926
1927 if (vp->v_type != VDIR) {
1928 error = ENOTDIR;
1929 goto out;
1930 }
1931
1932 vnode_lock_spin(vp);
1933 is_busy = is_fmount ?
1934 (ISSET(vp->v_flag, VMOUNT) || (vp->v_mountedhere != NULL)) :
1935 (ISSET(vp->v_flag, VMOUNT) && (vp->v_mountedhere != NULL));
1936 if (is_busy) {
1937 vnode_unlock(vp);
1938 error = EBUSY;
1939 goto out;
1940 }
1941 SET(vp->v_flag, VMOUNT);
1942 vnode_unlock(vp);
1943
1944 #if CONFIG_MACF
1945 error = mac_mount_check_mount(ctx, vp,
1946 cnp, fsname);
1947 if (error != 0) {
1948 vnode_lock_spin(vp);
1949 CLR(vp->v_flag, VMOUNT);
1950 vnode_unlock(vp);
1951 }
1952 #endif
1953
1954 out:
1955 return error;
1956 }
1957
1958 #if CONFIG_IMGSRC_ACCESS
1959
1960 #define DEBUG_IMGSRC 0
1961
1962 #if DEBUG_IMGSRC
1963 #define IMGSRC_DEBUG(args...) printf("imgsrc: " args)
1964 #else
1965 #define IMGSRC_DEBUG(args...) do { } while(0)
1966 #endif
1967
/*
 * Resolve 'devpath' to a vnode, confirm it names the same block device
 * that backs 'mp', authorize the caller's access to it, and refresh
 * mp->mnt_vfsstat.f_mntfromname with the resolved path.
 *
 * On success, returns 0 and hands back the device vnode (with the
 * iocount taken by namei) through *devvpp; the caller must vnode_put()
 * it.  On failure the iocount is dropped here and *devvpp is untouched.
 */
static int
authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx)
{
	struct nameidata nd;
	vnode_t vp, realdevvp;
	mode_t accessmode;
	int error;
	enum uio_seg uio = UIO_USERSPACE;

	/* kernel-context callers pass a kernel-space path string */
	if (ctx == vfs_context_kernel()) {
		uio = UIO_SYSSPACE;
	}

	NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW, uio, devpath, ctx);
	if ((error = namei(&nd))) {
		IMGSRC_DEBUG("namei() failed with %d\n", error);
		return error;
	}

	vp = nd.ni_vp;  /* holds an iocount from namei until we put it below */

	if (!vnode_isblk(vp)) {
		IMGSRC_DEBUG("Not block device.\n");
		error = ENOTBLK;
		goto out;
	}

	realdevvp = mp->mnt_devvp;
	if (realdevvp == NULLVP) {
		IMGSRC_DEBUG("No device backs the mount.\n");
		error = ENXIO;
		goto out;
	}

	error = vnode_getwithref(realdevvp);
	if (error != 0) {
		IMGSRC_DEBUG("Coudn't get iocount on device.\n");
		goto out;
	}

	/* the supplied path must resolve to the very device backing the mount */
	if (vnode_specrdev(vp) != vnode_specrdev(realdevvp)) {
		IMGSRC_DEBUG("Wrong dev_t.\n");
		error = ENXIO;
		goto out1;
	}

	strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);

	/*
	 * If mount by non-root, then verify that user has necessary
	 * permissions on the device.
	 */
	if (!vfs_context_issuser(ctx)) {
		accessmode = KAUTH_VNODE_READ_DATA;
		if ((mp->mnt_flag & MNT_RDONLY) == 0) {
			accessmode |= KAUTH_VNODE_WRITE_DATA;
		}
		if ((error = vnode_authorize(vp, NULL, accessmode, ctx)) != 0) {
			IMGSRC_DEBUG("Access denied.\n");
			goto out1;
		}
	}

	*devvpp = vp;

out1:
	/* drop the iocount taken on the mount's backing device */
	vnode_put(realdevvp);

out:
	nameidone(&nd);

	/* on any failure, release the iocount namei gave us on 'vp' */
	if (error) {
		vnode_put(vp);
	}

	return error;
}
2045
2046 /*
2047 * Clear VMOUNT, set v_mountedhere, and mnt_vnodecovered, ref the vnode,
2048 * and call checkdirs()
2049 */
2050 static int
place_mount_and_checkdirs(mount_t mp,vnode_t vp,vfs_context_t ctx)2051 place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx)
2052 {
2053 int error;
2054
2055 mp->mnt_vnodecovered = vp; /* XXX This is normally only set at init-time ... */
2056
2057 IMGSRC_DEBUG("placing: fsname = %s, vp = %s\n",
2058 mp->mnt_vtable->vfc_name, vnode_getname(vp));
2059
2060 vnode_lock_spin(vp);
2061 CLR(vp->v_flag, VMOUNT);
2062 vp->v_mountedhere = mp;
2063 SET(vp->v_flag, VMOUNTEDHERE);
2064 vnode_unlock(vp);
2065
2066 /*
2067 * taking the name_cache_lock exclusively will
2068 * insure that everyone is out of the fast path who
2069 * might be trying to use a now stale copy of
2070 * vp->v_mountedhere->mnt_realrootvp
2071 * bumping mount_generation causes the cached values
2072 * to be invalidated
2073 */
2074 name_cache_lock();
2075 mount_generation++;
2076 name_cache_unlock();
2077
2078 error = vnode_ref(vp);
2079 if (error != 0) {
2080 goto out;
2081 }
2082
2083 error = checkdirs(vp, ctx);
2084 if (error != 0) {
2085 /* Unmount the filesystem as cdir/rdirs cannot be updated */
2086 vnode_rele(vp);
2087 goto out;
2088 }
2089
2090 out:
2091 if (error != 0) {
2092 mp->mnt_vnodecovered = NULLVP;
2093 }
2094 return error;
2095 }
2096
/*
 * Reverse place_mount_and_checkdirs(): drop the usecount it took on the
 * covered vnode and clear the mounted-here state published on it.
 */
static void
undo_place_on_covered_vp(mount_t mp, vnode_t vp)
{
	vnode_rele(vp);
	vnode_lock_spin(vp);
	CLR(vp->v_flag, (VMOUNT | VMOUNTEDHERE));
	vp->v_mountedhere = (mount_t)NULL;
	vnode_unlock(vp);

	mp->mnt_vnodecovered = NULLVP;
}
2108
2109 static int
mount_begin_update(mount_t mp,vfs_context_t ctx,int flags)2110 mount_begin_update(mount_t mp, vfs_context_t ctx, int flags)
2111 {
2112 int error;
2113
2114 /* unmount in progress return error */
2115 mount_lock_spin(mp);
2116 if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
2117 mount_unlock(mp);
2118 return EBUSY;
2119 }
2120 mount_unlock(mp);
2121 lck_rw_lock_exclusive(&mp->mnt_rwlock);
2122
2123 /*
2124 * We only allow the filesystem to be reloaded if it
2125 * is currently mounted read-only.
2126 */
2127 if ((flags & MNT_RELOAD) &&
2128 ((mp->mnt_flag & MNT_RDONLY) == 0)) {
2129 error = ENOTSUP;
2130 goto out;
2131 }
2132
2133 /*
2134 * Only root, or the user that did the original mount is
2135 * permitted to update it.
2136 */
2137 if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
2138 (!vfs_context_issuser(ctx))) {
2139 error = EPERM;
2140 goto out;
2141 }
2142 #if CONFIG_MACF
2143 error = mac_mount_check_remount(ctx, mp);
2144 if (error != 0) {
2145 goto out;
2146 }
2147 #endif
2148
2149 out:
2150 if (error) {
2151 lck_rw_done(&mp->mnt_rwlock);
2152 }
2153
2154 return error;
2155 }
2156
/* Release the exclusive mount rwlock taken by mount_begin_update(). */
static void
mount_end_update(mount_t mp)
{
	lck_rw_done(&mp->mnt_rwlock);
}
2162
2163 static int
get_imgsrc_rootvnode(uint32_t height,vnode_t * rvpp)2164 get_imgsrc_rootvnode(uint32_t height, vnode_t *rvpp)
2165 {
2166 vnode_t vp;
2167
2168 if (height >= MAX_IMAGEBOOT_NESTING) {
2169 return EINVAL;
2170 }
2171
2172 vp = imgsrc_rootvnodes[height];
2173 if ((vp != NULLVP) && (vnode_get(vp) == 0)) {
2174 *rvpp = vp;
2175 return 0;
2176 } else {
2177 return ENOENT;
2178 }
2179 }
2180
/*
 * Relocate ("move") an imageboot source filesystem so that it is covered by
 * vnode "vp" instead of its boot-time location.  The mount's root vnode is
 * found via imgsrc_rootvnodes[height]; a mount may be moved at most once
 * (MNTK_HAS_MOVED).  Cleanup is staged through the out0..out3 labels in
 * reverse order of acquisition.
 *
 * Parameters:
 *	pvp         parent of the covered vnode (gets a mount notification)
 *	vp          vnode the mount will be placed on
 *	cnp/fsname  lookup state and expected filesystem name (sanity checked)
 *	is64bit     caller's address-space size, controls copyin layout
 *	fsmountargs user pointer to mnt_imgsrc_args (by_index) or devpath
 *	by_index    TRUE for the modern args struct, FALSE for legacy layout
 */
static int
relocate_imageboot_source(vnode_t pvp, vnode_t vp,
    struct componentname *cnp, const char *fsname, vfs_context_t ctx,
    boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index)
{
	int error;
	mount_t mp;
	boolean_t placed = FALSE;
	struct vfstable *vfsp;
	user_addr_t devpath;
	char *old_mntonname;
	vnode_t rvp;
	vnode_t devvp;
	uint32_t height;
	uint32_t flags;

	/* If we didn't imageboot, nothing to move */
	if (imgsrc_rootvnodes[0] == NULLVP) {
		return EINVAL;
	}

	/* Only root can do this */
	if (!vfs_context_issuser(ctx)) {
		return EPERM;
	}

	IMGSRC_DEBUG("looking for root vnode.\n");

	/*
	 * Get root vnode of filesystem we're moving.
	 */
	if (by_index) {
		if (is64bit) {
			struct user64_mnt_imgsrc_args mia64;
			error = copyin(fsmountargs, &mia64, sizeof(mia64));
			if (error != 0) {
				IMGSRC_DEBUG("Failed to copy in arguments.\n");
				return error;
			}

			height = mia64.mi_height;
			flags = mia64.mi_flags;
			devpath = (user_addr_t)mia64.mi_devpath;
		} else {
			struct user32_mnt_imgsrc_args mia32;
			error = copyin(fsmountargs, &mia32, sizeof(mia32));
			if (error != 0) {
				IMGSRC_DEBUG("Failed to copy in arguments.\n");
				return error;
			}

			height = mia32.mi_height;
			flags = mia32.mi_flags;
			devpath = mia32.mi_devpath;
		}
	} else {
		/*
		 * For binary compatibility--assumes one level of nesting.
		 */
		if (is64bit) {
			if ((error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath)))) {
				return error;
			}
		} else {
			user32_addr_t tmp;
			if ((error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp)))) {
				return error;
			}

			/* munge into LP64 addr */
			devpath = CAST_USER_ADDR_T(tmp);
		}

		height = 0;
		flags = 0;
	}

	/* No flag bits are defined yet; reject anything nonzero. */
	if (flags != 0) {
		IMGSRC_DEBUG("%s: Got nonzero flags.\n", __FUNCTION__);
		return EINVAL;
	}

	/* Returns rvp with an iocount held; released at out0/success. */
	error = get_imgsrc_rootvnode(height, &rvp);
	if (error != 0) {
		IMGSRC_DEBUG("getting old root vnode failed with %d\n", error);
		return error;
	}

	IMGSRC_DEBUG("got old root vnode\n");

	old_mntonname = zalloc_flags(ZV_NAMEI, Z_WAITOK);

	/* Can only move once */
	mp = vnode_mount(rvp);
	if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
		IMGSRC_DEBUG("Already moved.\n");
		error = EBUSY;
		goto out0;
	}

	IMGSRC_DEBUG("moving rvp: fsname = %s\n", mp->mnt_vtable->vfc_name);
	IMGSRC_DEBUG("Starting updated.\n");

	/* Get exclusive rwlock on mount, authorize update on mp */
	error = mount_begin_update(mp, ctx, 0);
	if (error != 0) {
		IMGSRC_DEBUG("Starting updated failed with %d\n", error);
		goto out0;
	}

	/*
	 * It can only be moved once. Flag is set under the rwlock,
	 * so we're now safe to proceed.
	 */
	if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
		IMGSRC_DEBUG("Already moved [2]\n");
		/*
		 * NOTE(review): error is still 0 here, so this path returns
		 * success (unlike the EBUSY returned by the pre-lock check
		 * above) — confirm that treating a racing move as success
		 * is intended.
		 */
		goto out1;
	}

	IMGSRC_DEBUG("Preparing coveredvp.\n");

	/* Mark covered vnode as mount in progress, authorize placing mount on top */
	error = prepare_coveredvp(vp, ctx, cnp, fsname, 0);
	if (error != 0) {
		IMGSRC_DEBUG("Preparing coveredvp failed with %d.\n", error);
		goto out1;
	}

	IMGSRC_DEBUG("Covered vp OK.\n");

	/* Sanity check the name caller has provided */
	vfsp = mp->mnt_vtable;
	if (strncmp(vfsp->vfc_name, fsname, MFSNAMELEN) != 0) {
		IMGSRC_DEBUG("Wrong fs name: actual = %s, expected = %s\n",
		    vfsp->vfc_name, fsname);
		error = EINVAL;
		goto out2;
	}

	/* Check the device vnode and update mount-from name, for local filesystems */
	if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) {
		IMGSRC_DEBUG("Local, doing device validation.\n");

		if (devpath != USER_ADDR_NULL) {
			error = authorize_devpath_and_update_mntfromname(mp, devpath, &devvp, ctx);
			if (error) {
				IMGSRC_DEBUG("authorize_devpath_and_update_mntfromname() failed.\n");
				goto out2;
			}

			/* Only needed the iocount for the validation above. */
			vnode_put(devvp);
		}
	}

	/*
	 * Place mp on top of vnode, ref the vnode,  call checkdirs(),
	 * and increment the name cache's mount generation
	 */

	IMGSRC_DEBUG("About to call place_mount_and_checkdirs().\n");
	error = place_mount_and_checkdirs(mp, vp, ctx);
	if (error != 0) {
		goto out2;
	}

	/* From here on, unwinding requires undo_place_on_covered_vp(). */
	placed = TRUE;

	strlcpy(old_mntonname, mp->mnt_vfsstat.f_mntonname, MAXPATHLEN);
	strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);

	/* Forbid future moves */
	mount_lock(mp);
	mp->mnt_kern_flag |= MNTK_HAS_MOVED;
	mount_unlock(mp);

	/* Finally, add to mount list, completely ready to go */
	if (mount_list_add(mp) != 0) {
		/*
		 * The system is shutting down trying to umount
		 * everything, so fail with a plausible errno.
		 */
		error = EBUSY;
		goto out3;
	}

	mount_end_update(mp);
	vnode_put(rvp);
	zfree(ZV_NAMEI, old_mntonname);

	vfs_notify_mount(pvp);

	return 0;
out3:
	/* Undo the mnt_on-name change and the no-more-moves flag. */
	strlcpy(mp->mnt_vfsstat.f_mntonname, old_mntonname, MAXPATHLEN);

	mount_lock(mp);
	mp->mnt_kern_flag &= ~(MNTK_HAS_MOVED);
	mount_unlock(mp);

out2:
	/*
	 * Placing the mp on the vnode clears VMOUNT,
	 * so cleanup is different after that point
	 */
	if (placed) {
		/* Rele the vp, clear VMOUNT and v_mountedhere */
		undo_place_on_covered_vp(mp, vp);
	} else {
		vnode_lock_spin(vp);
		CLR(vp->v_flag, VMOUNT);
		vnode_unlock(vp);
	}
out1:
	mount_end_update(mp);

out0:
	vnode_put(rvp);
	zfree(ZV_NAMEI, old_mntonname);
	return error;
}
2401
2402 #endif /* CONFIG_IMGSRC_ACCESS */
2403
/*
 * Enable disk quotas on "mp" (HFS only) if the per-type quota trigger files
 * exist under the mount point.  Errors are deliberately ignored so that
 * quota setup never interferes with completing a mount.
 */
void
enablequotas(struct mount *mp, vfs_context_t ctx)
{
	struct nameidata qnd;
	int type;
	char qfpath[MAXPATHLEN];
	const char *qfname = QUOTAFILENAME;
	const char *qfopsname = QUOTAOPSNAME;
	const char *qfextension[] = INITQFNAMES;

	/* XXX Should be an MNTK_ flag, instead of strncmp()'s */
	if (strncmp(mp->mnt_vfsstat.f_fstypename, "hfs", sizeof("hfs")) != 0) {
		return;
	}
	/*
	 * Enable filesystem disk quotas if necessary.
	 * We ignore errors as this should not interfere with final mount
	 */
	for (type = 0; type < MAXQUOTAS; type++) {
		/* The ".opsname.<type>" file's mere existence enables quotas. */
		snprintf(qfpath, sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfopsname, qfextension[type]);
		NDINIT(&qnd, LOOKUP, OP_MOUNT, FOLLOW, UIO_SYSSPACE,
		    CAST_USER_ADDR_T(qfpath), ctx);
		if (namei(&qnd) != 0) {
			continue;           /* option file to trigger quotas is not present */
		}
		vnode_put(qnd.ni_vp);
		nameidone(&qnd);
		/* Point quotactl at the actual quota data file. */
		snprintf(qfpath, sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfname, qfextension[type]);

		(void) VFS_QUOTACTL(mp, QCMD(Q_QUOTAON, type), 0, qfpath, ctx);
	}
	return;
}
2437
2438
/*
 * Per-process callback for checkdirs(): if the process's current or root
 * directory is the just-covered vnode (olddp), swap it for the new mount's
 * root (newdp), transferring usecount references accordingly.
 *
 * Two references on newdp are taken up front because both cdir and rdir may
 * need one; unused references are released at the end.  Always returns
 * PROC_RETURNED so iteration continues even on reference failures.
 */
static int
checkdirs_callback(proc_t p, void * arg)
{
	struct cdirargs *cdrp = (struct cdirargs *)arg;
	vnode_t olddp = cdrp->olddp;
	vnode_t newdp = cdrp->newdp;
	struct filedesc *fdp = &p->p_fd;
	vnode_t new_cvp = newdp;   /* newdp ref not yet consumed for cdir */
	vnode_t new_rvp = newdp;   /* newdp ref not yet consumed for rdir */
	vnode_t old_cvp = NULL;    /* old cdir to release, if replaced */
	vnode_t old_rvp = NULL;    /* old rdir to release, if replaced */

	/*
	 * XXX Also needs to iterate each thread in the process to see if it
	 * XXX is using a per-thread current working directory, and, if so,
	 * XXX update that as well.
	 */

	/*
	 * First, with the proc_fdlock held, check to see if we will need
	 * to do any work.  If not, we will get out fast.
	 */
	proc_fdlock(p);
	if (fdp->fd_cdir != olddp && fdp->fd_rdir != olddp) {
		proc_fdunlock(p);
		return PROC_RETURNED;
	}
	proc_fdunlock(p);

	/*
	 * Ok, we will have to do some work.  Always take two refs
	 * because we might need that many.  We'll dispose of whatever
	 * we ended up not using.
	 */
	if (vnode_ref(newdp) != 0) {
		return PROC_RETURNED;
	}
	if (vnode_ref(newdp) != 0) {
		vnode_rele(newdp);
		return PROC_RETURNED;
	}

	proc_dirs_lock_exclusive(p);
	/*
	 * Now do the work.  Note: we dropped the proc_fdlock, so we
	 * have to do all of the checks again.
	 */
	proc_fdlock(p);
	if (fdp->fd_cdir == olddp) {
		old_cvp = olddp;
		fdp->fd_cdir = newdp;
		new_cvp = NULL;    /* this newdp ref is now owned by fd_cdir */
	}
	if (fdp->fd_rdir == olddp) {
		old_rvp = olddp;
		fdp->fd_rdir = newdp;
		new_rvp = NULL;    /* this newdp ref is now owned by fd_rdir */
	}
	proc_fdunlock(p);
	proc_dirs_unlock_exclusive(p);

	/*
	 * Dispose of any references that are no longer needed.
	 */
	if (old_cvp != NULL) {
		vnode_rele(old_cvp);
	}
	if (old_rvp != NULL) {
		vnode_rele(old_rvp);
	}
	if (new_cvp != NULL) {
		vnode_rele(new_cvp);
	}
	if (new_rvp != NULL) {
		vnode_rele(new_rvp);
	}

	return PROC_RETURNED;
}
2518
2519
2520
2521 /*
2522 * Scan all active processes to see if any of them have a current
2523 * or root directory onto which the new filesystem has just been
2524 * mounted. If so, replace them with the new mount point.
2525 */
2526 static int
checkdirs(vnode_t olddp,vfs_context_t ctx)2527 checkdirs(vnode_t olddp, vfs_context_t ctx)
2528 {
2529 vnode_t newdp;
2530 vnode_t tvp;
2531 int err;
2532 struct cdirargs cdr;
2533
2534 if (olddp->v_usecount == 1) {
2535 return 0;
2536 }
2537 err = VFS_ROOT(olddp->v_mountedhere, &newdp, ctx);
2538
2539 if (err != 0) {
2540 #if DIAGNOSTIC
2541 panic("mount: lost mount: error %d", err);
2542 #endif
2543 return err;
2544 }
2545
2546 cdr.olddp = olddp;
2547 cdr.newdp = newdp;
2548 /* do not block for exec/fork trans as the vp in cwd & rootdir are not changing */
2549 proc_iterate(PROC_ALLPROCLIST | PROC_NOWAITTRANS, checkdirs_callback, (void *)&cdr, NULL, NULL);
2550
2551 if (rootvnode == olddp) {
2552 vnode_ref(newdp);
2553 lck_rw_lock_exclusive(&rootvnode_rw_lock);
2554 tvp = rootvnode;
2555 rootvnode = newdp;
2556 lck_rw_unlock_exclusive(&rootvnode_rw_lock);
2557 vnode_rele(tvp);
2558 }
2559
2560 vnode_put(newdp);
2561 return 0;
2562 }
2563
/*
 * Entitlement that lets a non-root role-account process unmount volumes it
 * did not mount, and have the unmount path lookup run in the kernel context.
 */
#define ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT \
	"com.apple.private.vfs.role-account-unmount"
2566
2567 /*
2568 * Unmount a file system.
2569 *
2570 * Note: unmount takes a path to the vnode mounted on as argument,
2571 * not special file (as before).
2572 */
2573 /* ARGSUSED */
2574 int
unmount(__unused proc_t p,struct unmount_args * uap,__unused int32_t * retval)2575 unmount(__unused proc_t p, struct unmount_args *uap, __unused int32_t *retval)
2576 {
2577 vnode_t vp;
2578 struct mount *mp;
2579 int error;
2580 struct nameidata nd;
2581 vfs_context_t ctx;
2582
2583 /*
2584 * If the process has the entitlement, use the kernel's context when
2585 * performing lookup on the mount path as the process might lack proper
2586 * permission to access the directory.
2587 */
2588 ctx = IOCurrentTaskHasEntitlement(ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT) ?
2589 vfs_context_kernel() : vfs_context_current();
2590
2591 NDINIT(&nd, LOOKUP, OP_UNMOUNT, FOLLOW | AUDITVNPATH1,
2592 UIO_USERSPACE, uap->path, ctx);
2593 error = namei(&nd);
2594 if (error) {
2595 return error;
2596 }
2597 vp = nd.ni_vp;
2598 mp = vp->v_mount;
2599 nameidone(&nd);
2600
2601 #if CONFIG_MACF
2602 error = mac_mount_check_umount(ctx, mp);
2603 if (error != 0) {
2604 vnode_put(vp);
2605 return error;
2606 }
2607 #endif
2608 /*
2609 * Must be the root of the filesystem
2610 */
2611 if ((vp->v_flag & VROOT) == 0) {
2612 vnode_put(vp);
2613 return EINVAL;
2614 }
2615 mount_ref(mp, 0);
2616 vnode_put(vp);
2617 /* safedounmount consumes the mount ref */
2618 return safedounmount(mp, uap->flags, ctx);
2619 }
2620
2621 int
vfs_unmountbyfsid(fsid_t * fsid,int flags,vfs_context_t ctx)2622 vfs_unmountbyfsid(fsid_t *fsid, int flags, vfs_context_t ctx)
2623 {
2624 mount_t mp;
2625
2626 mp = mount_list_lookupby_fsid(fsid, 0, 1);
2627 if (mp == (mount_t)0) {
2628 return ENOENT;
2629 }
2630 mount_ref(mp, 0);
2631 mount_iterdrop(mp);
2632 /* safedounmount consumes the mount ref */
2633 return safedounmount(mp, flags, ctx);
2634 }
2635
2636 /*
2637 * The mount struct comes with a mount ref which will be consumed.
2638 * Do the actual file system unmount, prevent some common foot shooting.
2639 */
2640 int
safedounmount(struct mount * mp,int flags,vfs_context_t ctx)2641 safedounmount(struct mount *mp, int flags, vfs_context_t ctx)
2642 {
2643 int error;
2644 proc_t p = vfs_context_proc(ctx);
2645
2646 /*
2647 * If the file system is not responding and MNT_NOBLOCK
2648 * is set and not a forced unmount then return EBUSY.
2649 */
2650 if ((mp->mnt_kern_flag & MNT_LNOTRESP) &&
2651 (flags & MNT_NOBLOCK) && ((flags & MNT_FORCE) == 0)) {
2652 error = EBUSY;
2653 goto out;
2654 }
2655
2656 /*
2657 * Skip authorization in two cases:
2658 * - If the process running the unmount has ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT.
2659 * This entitlement allows non-root processes unmount volumes mounted by
2660 * other processes.
2661 * - If the mount is tagged as permissive and this is not a forced-unmount
2662 * attempt.
2663 */
2664 if (!IOCurrentTaskHasEntitlement(ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT) &&
2665 (!(((mp->mnt_kern_flag & MNTK_PERMIT_UNMOUNT) != 0) && ((flags & MNT_FORCE) == 0)))) {
2666 /*
2667 * Only root, or the user that did the original mount is
2668 * permitted to unmount this filesystem.
2669 */
2670 if ((mp->mnt_vfsstat.f_owner != kauth_cred_getuid(kauth_cred_get())) &&
2671 (error = suser(kauth_cred_get(), &p->p_acflag))) {
2672 goto out;
2673 }
2674 }
2675 /*
2676 * Don't allow unmounting the root file system, or other volumes
2677 * associated with it (for example, the associated VM or DATA mounts) .
2678 */
2679 if ((mp->mnt_flag & MNT_ROOTFS) || (mp->mnt_kern_flag & MNTK_SYSTEM)) {
2680 if (!(mp->mnt_flag & MNT_ROOTFS)) {
2681 printf("attempt to unmount a system mount (%s), will return EBUSY\n",
2682 mp->mnt_vfsstat.f_mntonname);
2683 }
2684 error = EBUSY; /* the root (or associated volumes) is always busy */
2685 goto out;
2686 }
2687
2688 /*
2689 * If the mount is providing the root filesystem's disk image
2690 * (i.e. imageboot), don't allow unmounting
2691 */
2692 if (mp->mnt_kern_flag & MNTK_BACKS_ROOT) {
2693 error = EBUSY;
2694 goto out;
2695 }
2696
2697 return dounmount(mp, flags, 1, ctx);
2698
2699 out:
2700 mount_drop(mp, 0);
2701 return error;
2702 }
2703
2704 /*
2705 * Do the actual file system unmount.
2706 */
2707 int
dounmount(struct mount * mp,int flags,int withref,vfs_context_t ctx)2708 dounmount(struct mount *mp, int flags, int withref, vfs_context_t ctx)
2709 {
2710 vnode_t coveredvp = (vnode_t)0;
2711 int error;
2712 int needwakeup = 0;
2713 int forcedunmount = 0;
2714 int lflags = 0;
2715 struct vnode *devvp = NULLVP;
2716 #if CONFIG_TRIGGERS
2717 proc_t p = vfs_context_proc(ctx);
2718 int did_vflush = 0;
2719 int pflags_save = 0;
2720 #endif /* CONFIG_TRIGGERS */
2721
2722 #if CONFIG_FSE
2723 if (!(flags & MNT_FORCE)) {
2724 fsevent_unmount(mp, ctx); /* has to come first! */
2725 }
2726 #endif
2727
2728 mount_lock(mp);
2729
2730 /*
2731 * If already an unmount in progress just return EBUSY.
2732 * Even a forced unmount cannot override.
2733 */
2734 if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
2735 if (withref != 0) {
2736 mount_drop(mp, 1);
2737 }
2738 mount_unlock(mp);
2739 return EBUSY;
2740 }
2741
2742 if (flags & MNT_FORCE) {
2743 forcedunmount = 1;
2744 mp->mnt_lflag |= MNT_LFORCE;
2745 }
2746
2747 #if CONFIG_TRIGGERS
2748 if (flags & MNT_NOBLOCK && p != kernproc) {
2749 pflags_save = OSBitOrAtomic(P_NOREMOTEHANG, &p->p_flag);
2750 }
2751 #endif
2752
2753 mp->mnt_kern_flag |= MNTK_UNMOUNT;
2754 mp->mnt_lflag |= MNT_LUNMOUNT;
2755 mp->mnt_flag &= ~MNT_ASYNC;
2756 /*
2757 * anyone currently in the fast path that
2758 * trips over the cached rootvp will be
2759 * dumped out and forced into the slow path
2760 * to regenerate a new cached value
2761 */
2762 mp->mnt_realrootvp = NULLVP;
2763 mount_unlock(mp);
2764
2765 if (forcedunmount && (flags & MNT_LNOSUB) == 0) {
2766 /*
2767 * Force unmount any mounts in this filesystem.
2768 * If any unmounts fail - just leave them dangling.
2769 * Avoids recursion.
2770 */
2771 (void) dounmount_submounts(mp, flags | MNT_LNOSUB, ctx);
2772 }
2773
2774 /*
2775 * taking the name_cache_lock exclusively will
2776 * insure that everyone is out of the fast path who
2777 * might be trying to use a now stale copy of
2778 * vp->v_mountedhere->mnt_realrootvp
2779 * bumping mount_generation causes the cached values
2780 * to be invalidated
2781 */
2782 name_cache_lock();
2783 mount_generation++;
2784 name_cache_unlock();
2785
2786
2787 lck_rw_lock_exclusive(&mp->mnt_rwlock);
2788 if (withref != 0) {
2789 mount_drop(mp, 0);
2790 }
2791 error = 0;
2792 if (forcedunmount == 0) {
2793 ubc_umount(mp); /* release cached vnodes */
2794 if ((mp->mnt_flag & MNT_RDONLY) == 0) {
2795 error = VFS_SYNC(mp, MNT_WAIT, ctx);
2796 if (error) {
2797 mount_lock(mp);
2798 mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
2799 mp->mnt_lflag &= ~MNT_LUNMOUNT;
2800 mp->mnt_lflag &= ~MNT_LFORCE;
2801 goto out;
2802 }
2803 }
2804 }
2805
2806 IOBSDMountChange(mp, kIOMountChangeUnmount);
2807
2808 #if CONFIG_TRIGGERS
2809 vfs_nested_trigger_unmounts(mp, flags, ctx);
2810 did_vflush = 1;
2811 #endif
2812 if (forcedunmount) {
2813 lflags |= FORCECLOSE;
2814 }
2815 error = vflush(mp, NULLVP, SKIPSWAP | SKIPSYSTEM | SKIPROOT | lflags);
2816 if ((forcedunmount == 0) && error) {
2817 mount_lock(mp);
2818 mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
2819 mp->mnt_lflag &= ~MNT_LUNMOUNT;
2820 mp->mnt_lflag &= ~MNT_LFORCE;
2821 goto out;
2822 }
2823
2824 /* make sure there are no one in the mount iterations or lookup */
2825 mount_iterdrain(mp);
2826
2827 error = VFS_UNMOUNT(mp, flags, ctx);
2828 if (error) {
2829 mount_iterreset(mp);
2830 mount_lock(mp);
2831 mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
2832 mp->mnt_lflag &= ~MNT_LUNMOUNT;
2833 mp->mnt_lflag &= ~MNT_LFORCE;
2834 goto out;
2835 }
2836
2837 /* increment the operations count */
2838 if (!error) {
2839 OSAddAtomic(1, &vfs_nummntops);
2840 }
2841
2842 if (mp->mnt_devvp && mp->mnt_vtable->vfc_vfsflags & VFC_VFSLOCALARGS) {
2843 /* hold an io reference and drop the usecount before close */
2844 devvp = mp->mnt_devvp;
2845 vnode_getalways(devvp);
2846 vnode_rele(devvp);
2847 VNOP_CLOSE(devvp, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD | FWRITE,
2848 ctx);
2849 vnode_clearmountedon(devvp);
2850 vnode_put(devvp);
2851 }
2852 lck_rw_done(&mp->mnt_rwlock);
2853 mount_list_remove(mp);
2854 lck_rw_lock_exclusive(&mp->mnt_rwlock);
2855
2856 /* mark the mount point hook in the vp but not drop the ref yet */
2857 if ((coveredvp = mp->mnt_vnodecovered) != NULLVP) {
2858 /*
2859 * The covered vnode needs special handling. Trying to get an
2860 * iocount must not block here as this may lead to deadlocks
2861 * if the Filesystem to which the covered vnode belongs is
2862 * undergoing forced unmounts. Since we hold a usecount, the
2863 * vnode cannot be reused (it can, however, still be terminated)
2864 */
2865 vnode_getalways(coveredvp);
2866 vnode_lock_spin(coveredvp);
2867
2868 mp->mnt_crossref++;
2869 coveredvp->v_mountedhere = (struct mount *)0;
2870 CLR(coveredvp->v_flag, VMOUNT | VMOUNTEDHERE);
2871 vnode_unlock(coveredvp);
2872 vnode_put(coveredvp);
2873 }
2874
2875 mount_list_lock();
2876 mp->mnt_vtable->vfc_refcount--;
2877 mount_list_unlock();
2878
2879 cache_purgevfs(mp); /* remove cache entries for this file sys */
2880 vfs_event_signal(NULL, VQ_UNMOUNT, (intptr_t)NULL);
2881 mount_lock(mp);
2882 mp->mnt_lflag |= MNT_LDEAD;
2883
2884 if (mp->mnt_lflag & MNT_LWAIT) {
2885 /*
2886 * do the wakeup here
2887 * in case we block in mount_refdrain
2888 * which will drop the mount lock
2889 * and allow anyone blocked in vfs_busy
2890 * to wakeup and see the LDEAD state
2891 */
2892 mp->mnt_lflag &= ~MNT_LWAIT;
2893 wakeup((caddr_t)mp);
2894 }
2895 mount_refdrain(mp);
2896
2897 /* free disk_conditioner_info structure for this mount */
2898 disk_conditioner_unmount(mp);
2899
2900 out:
2901 if (mp->mnt_lflag & MNT_LWAIT) {
2902 mp->mnt_lflag &= ~MNT_LWAIT;
2903 needwakeup = 1;
2904 }
2905
2906 #if CONFIG_TRIGGERS
2907 if (flags & MNT_NOBLOCK && p != kernproc) {
2908 // Restore P_NOREMOTEHANG bit to its previous value
2909 if ((pflags_save & P_NOREMOTEHANG) == 0) {
2910 OSBitAndAtomic(~((uint32_t) P_NOREMOTEHANG), &p->p_flag);
2911 }
2912 }
2913
2914 /*
2915 * Callback and context are set together under the mount lock, and
2916 * never cleared, so we're safe to examine them here, drop the lock,
2917 * and call out.
2918 */
2919 if (mp->mnt_triggercallback != NULL) {
2920 mount_unlock(mp);
2921 if (error == 0) {
2922 mp->mnt_triggercallback(mp, VTC_RELEASE, mp->mnt_triggerdata, ctx);
2923 } else if (did_vflush) {
2924 mp->mnt_triggercallback(mp, VTC_REPLACE, mp->mnt_triggerdata, ctx);
2925 }
2926 } else {
2927 mount_unlock(mp);
2928 }
2929 #else
2930 mount_unlock(mp);
2931 #endif /* CONFIG_TRIGGERS */
2932
2933 lck_rw_done(&mp->mnt_rwlock);
2934
2935 if (needwakeup) {
2936 wakeup((caddr_t)mp);
2937 }
2938
2939 if (!error) {
2940 if ((coveredvp != NULLVP)) {
2941 vnode_t pvp = NULLVP;
2942
2943 /*
2944 * The covered vnode needs special handling. Trying to
2945 * get an iocount must not block here as this may lead
2946 * to deadlocks if the Filesystem to which the covered
2947 * vnode belongs is undergoing forced unmounts. Since we
2948 * hold a usecount, the vnode cannot be reused
2949 * (it can, however, still be terminated).
2950 */
2951 vnode_getalways(coveredvp);
2952
2953 mount_dropcrossref(mp, coveredvp, 0);
2954 /*
2955 * We'll _try_ to detect if this really needs to be
2956 * done. The coveredvp can only be in termination (or
2957 * terminated) if the coveredvp's mount point is in a
2958 * forced unmount (or has been) since we still hold the
2959 * ref.
2960 */
2961 if (!vnode_isrecycled(coveredvp)) {
2962 pvp = vnode_getparent(coveredvp);
2963 #if CONFIG_TRIGGERS
2964 if (coveredvp->v_resolve) {
2965 vnode_trigger_rearm(coveredvp, ctx);
2966 }
2967 #endif
2968 }
2969
2970 vnode_rele(coveredvp);
2971 vnode_put(coveredvp);
2972 coveredvp = NULLVP;
2973
2974 if (pvp) {
2975 lock_vnode_and_post(pvp, NOTE_WRITE);
2976 vnode_put(pvp);
2977 }
2978 } else if (mp->mnt_flag & MNT_ROOTFS) {
2979 if (nc_smr_enabled) {
2980 vfs_smr_synchronize();
2981 }
2982
2983 mount_lock_destroy(mp);
2984 #if CONFIG_MACF
2985 mac_mount_label_destroy(mp);
2986 #endif
2987 zfree(mount_zone, mp);
2988 } else {
2989 panic("dounmount: no coveredvp");
2990 }
2991 }
2992 return error;
2993 }
2994
2995 /*
2996 * Unmount any mounts in this filesystem.
2997 */
2998 void
dounmount_submounts(struct mount * mp,int flags,vfs_context_t ctx)2999 dounmount_submounts(struct mount *mp, int flags, vfs_context_t ctx)
3000 {
3001 mount_t smp;
3002 fsid_t *fsids, fsid;
3003 int fsids_sz;
3004 int count = 0, i, m = 0;
3005 vnode_t vp;
3006
3007 mount_list_lock();
3008
3009 // Get an array to hold the submounts fsids.
3010 TAILQ_FOREACH(smp, &mountlist, mnt_list)
3011 count++;
3012 fsids_sz = count * sizeof(fsid_t);
3013 fsids = kalloc_data(fsids_sz, Z_NOWAIT);
3014 if (fsids == NULL) {
3015 mount_list_unlock();
3016 goto out;
3017 }
3018 fsids[0] = mp->mnt_vfsstat.f_fsid; // Prime the pump
3019
3020 /*
3021 * Fill the array with submount fsids.
3022 * Since mounts are always added to the tail of the mount list, the
3023 * list is always in mount order.
3024 * For each mount check if the mounted-on vnode belongs to a
3025 * mount that's already added to our array of mounts to be unmounted.
3026 */
3027 for (smp = TAILQ_NEXT(mp, mnt_list); smp; smp = TAILQ_NEXT(smp, mnt_list)) {
3028 vp = smp->mnt_vnodecovered;
3029 if (vp == NULL) {
3030 continue;
3031 }
3032 fsid = vnode_mount(vp)->mnt_vfsstat.f_fsid; // Underlying fsid
3033 for (i = 0; i <= m; i++) {
3034 if (fsids[i].val[0] == fsid.val[0] &&
3035 fsids[i].val[1] == fsid.val[1]) {
3036 fsids[++m] = smp->mnt_vfsstat.f_fsid;
3037 break;
3038 }
3039 }
3040 }
3041 mount_list_unlock();
3042
3043 // Unmount the submounts in reverse order. Ignore errors.
3044 for (i = m; i > 0; i--) {
3045 smp = mount_list_lookupby_fsid(&fsids[i], 0, 1);
3046 if (smp) {
3047 mount_ref(smp, 0);
3048 mount_iterdrop(smp);
3049 (void) dounmount(smp, flags, 1, ctx);
3050 }
3051 }
3052 out:
3053 kfree_data(fsids, fsids_sz);
3054 }
3055
/*
 * Drop one crossref on "mp" taken against covered vnode "dp".  When the
 * last crossref goes away and dp no longer points at mp, the mount
 * structure itself is destroyed.  "need_put" additionally releases an
 * iocount on dp while the vnode lock is still held.
 */
void
mount_dropcrossref(mount_t mp, vnode_t dp, int need_put)
{
	vnode_hold(dp);
	vnode_lock(dp);
	mp->mnt_crossref--;

	if (mp->mnt_crossref < 0) {
		panic("mount cross refs -ve");
	}

	if ((mp != dp->v_mountedhere) && (mp->mnt_crossref == 0)) {
		/* Last reference and mp is detached from dp: free the mount. */
		if (need_put) {
			vnode_put_locked(dp);
		}
		vnode_drop_and_unlock(dp);

		if (nc_smr_enabled) {
			vfs_smr_synchronize();
		}

		mount_lock_destroy(mp);
#if CONFIG_MACF
		mac_mount_label_destroy(mp);
#endif
		zfree(mount_zone, mp);
		return;
	}
	if (need_put) {
		vnode_put_locked(dp);
	}
	vnode_drop_and_unlock(dp);
}
3089
3090
3091 /*
3092 * Sync each mounted filesystem.
3093 */
3094 #if DIAGNOSTIC
3095 int syncprt = 0;
3096 #endif
3097
3098 int print_vmpage_stat = 0;
3099
3100 /*
3101 * sync_callback: simple wrapper that calls VFS_SYNC() on volumes
3102 * mounted read-write with the passed waitfor value.
3103 *
3104 * Parameters: mp mount-point descriptor per mounted file-system instance.
3105 * arg user argument (please see below)
3106 *
3107 * User argument is a pointer to 32 bit unsigned integer which describes the
3108 * type of waitfor value to set for calling VFS_SYNC(). If user argument is
3109 * passed as NULL, VFS_SYNC() is called with MNT_NOWAIT set as the default
3110 * waitfor value.
3111 *
3112 * Returns: VFS_RETURNED
3113 */
3114 static int
sync_callback(mount_t mp,void * arg)3115 sync_callback(mount_t mp, void *arg)
3116 {
3117 if ((mp->mnt_flag & MNT_RDONLY) == 0) {
3118 int asyncflag = mp->mnt_flag & MNT_ASYNC;
3119 unsigned waitfor = MNT_NOWAIT;
3120
3121 if (arg) {
3122 waitfor = *(uint32_t*)arg;
3123 }
3124
3125 /* Sanity check for flags - these are the only valid combinations for the flag bits*/
3126 if (waitfor != MNT_WAIT &&
3127 waitfor != (MNT_WAIT | MNT_VOLUME) &&
3128 waitfor != MNT_NOWAIT &&
3129 waitfor != (MNT_NOWAIT | MNT_VOLUME) &&
3130 waitfor != MNT_DWAIT &&
3131 waitfor != (MNT_DWAIT | MNT_VOLUME)) {
3132 panic("Passed inappropriate waitfor %u to "
3133 "sync_callback()", waitfor);
3134 }
3135
3136 mp->mnt_flag &= ~MNT_ASYNC;
3137 (void)VFS_SYNC(mp, waitfor, vfs_context_kernel());
3138 if (asyncflag) {
3139 mp->mnt_flag |= MNT_ASYNC;
3140 }
3141 }
3142
3143 return VFS_RETURNED;
3144 }
3145
/*
 * sync() system call: start a flush of every read-write mounted filesystem.
 * Uses MNT_NOWAIT semantics (NULL arg to sync_callback), so I/O is initiated
 * but not waited on.  Always returns 0.
 */
/* ARGSUSED */
int
sync(__unused proc_t p, __unused struct sync_args *uap, __unused int32_t *retval)
{
	vfs_iterate(LK_NOWAIT, sync_callback, NULL);

	if (print_vmpage_stat) {
		vm_countdirtypages();
	}

#if DIAGNOSTIC
	if (syncprt) {
		vfs_bufstats();
	}
#endif /* DIAGNOSTIC */
	return 0;
}
3163
/* Media-class selector for sync_internal_callback(). */
typedef enum {
	SYNC_ALL = 0,                    /* sync every volume */
	SYNC_ONLY_RELIABLE_MEDIA = 1,    /* only local, non-virtual devices */
	SYNC_ONLY_UNRELIABLE_MEDIA = 2   /* only virtual or non-local devices */
} sync_type_t;
3169
3170 static int
sync_internal_callback(mount_t mp,void * arg)3171 sync_internal_callback(mount_t mp, void *arg)
3172 {
3173 if (arg) {
3174 int is_reliable = !(mp->mnt_kern_flag & MNTK_VIRTUALDEV) &&
3175 (mp->mnt_flag & MNT_LOCAL);
3176 sync_type_t sync_type = *((sync_type_t *)arg);
3177
3178 if ((sync_type == SYNC_ONLY_RELIABLE_MEDIA) && !is_reliable) {
3179 return VFS_RETURNED;
3180 } else if ((sync_type == SYNC_ONLY_UNRELIABLE_MEDIA) && is_reliable) {
3181 return VFS_RETURNED;
3182 }
3183 }
3184
3185 (void)sync_callback(mp, NULL);
3186
3187 return VFS_RETURNED;
3188 }
3189
int sync_thread_state = 0;      /* SYNC_THREAD_* bits, protected by sync_mtx_lck */
int sync_timeout_seconds = 5;   /* how long sync_internal() waits for the thread */

#define SYNC_THREAD_RUN 0x0001          /* a(nother) sync pass has been requested */
#define SYNC_THREAD_RUNNING 0x0002      /* sync_thread is currently alive */

#if CONFIG_PHYS_WRITE_ACCT
thread_t pm_sync_thread;
#endif /* CONFIG_PHYS_WRITE_ACCT */
3199
/*
 * Body of the kernel sync thread started by sync_internal().  Loops while
 * SYNC_THREAD_RUN is set (coalescing repeated requests), syncing reliable
 * media first, then unreliable media.  State transitions and the exit
 * wakeup are all performed under sync_mtx_lck.
 */
static void
sync_thread(__unused void *arg, __unused wait_result_t wr)
{
	sync_type_t sync_type;
#if CONFIG_PHYS_WRITE_ACCT
	pm_sync_thread = current_thread();
#endif /* CONFIG_PHYS_WRITE_ACCT */

	lck_mtx_lock(&sync_mtx_lck);
	while (sync_thread_state & SYNC_THREAD_RUN) {
		sync_thread_state &= ~SYNC_THREAD_RUN;
		lck_mtx_unlock(&sync_mtx_lck);

		/* Sync fast/reliable media before slow/unreliable media. */
		sync_type = SYNC_ONLY_RELIABLE_MEDIA;
		vfs_iterate(LK_NOWAIT, sync_internal_callback, &sync_type);
		sync_type = SYNC_ONLY_UNRELIABLE_MEDIA;
		vfs_iterate(LK_NOWAIT, sync_internal_callback, &sync_type);

		lck_mtx_lock(&sync_mtx_lck);
	}
	/*
	 * This wakeup _has_ to be issued before the lock is released otherwise
	 * we may end up waking up a thread in sync_internal which is
	 * expecting a wakeup from a thread it just created and not from this
	 * thread which is about to exit.
	 */
	wakeup(&sync_thread_state);
	sync_thread_state &= ~SYNC_THREAD_RUNNING;
#if CONFIG_PHYS_WRITE_ACCT
	pm_sync_thread = NULL;
#endif /* CONFIG_PHYS_WRITE_ACCT */
	lck_mtx_unlock(&sync_mtx_lck);

	if (print_vmpage_stat) {
		vm_countdirtypages();
	}

#if DIAGNOSTIC
	if (syncprt) {
		vfs_bufstats();
	}
#endif /* DIAGNOSTIC */
}
3243
/* Time of the last "sync timed out" message; rate-limits it to one per 120s. */
struct timeval sync_timeout_last_print = {.tv_sec = 0, .tv_usec = 0};
3245
3246 /*
3247 * An in-kernel sync for power management to call.
3248 * This function always returns within sync_timeout seconds.
3249 */
3250 __private_extern__ int
sync_internal(void)3251 sync_internal(void)
3252 {
3253 thread_t thd = NULL;
3254 int error;
3255 int thread_created = FALSE;
3256 struct timespec ts = {.tv_sec = sync_timeout_seconds, .tv_nsec = 0};
3257
3258 lck_mtx_lock(&sync_mtx_lck);
3259 sync_thread_state |= SYNC_THREAD_RUN;
3260 if (!(sync_thread_state & SYNC_THREAD_RUNNING)) {
3261 int kr;
3262
3263 sync_thread_state |= SYNC_THREAD_RUNNING;
3264 kr = kernel_thread_start(sync_thread, NULL, &thd);
3265 if (kr != KERN_SUCCESS) {
3266 sync_thread_state &= ~SYNC_THREAD_RUNNING;
3267 lck_mtx_unlock(&sync_mtx_lck);
3268 printf("sync_thread failed\n");
3269 return 0;
3270 }
3271 thread_created = TRUE;
3272 }
3273
3274 error = msleep((caddr_t)&sync_thread_state, &sync_mtx_lck,
3275 (PVFS | PDROP | PCATCH), "sync_thread", &ts);
3276 if (error) {
3277 struct timeval now;
3278
3279 microtime(&now);
3280 if (now.tv_sec - sync_timeout_last_print.tv_sec > 120) {
3281 printf("sync timed out: %d sec\n", sync_timeout_seconds);
3282 sync_timeout_last_print.tv_sec = now.tv_sec;
3283 }
3284 }
3285
3286 if (thread_created) {
3287 thread_deallocate(thd);
3288 }
3289
3290 return 0;
3291 } /* end of sync_internal call */
3292
3293 /*
3294 * Change filesystem quotas.
3295 */
3296 #if QUOTA
/*
 * quotactl: manipulate filesystem quotas.
 *
 * uap->path names any file on the target filesystem; the mount is
 * pinned with mount_ref() for the duration of the call and the
 * looked-up vnode released immediately.  The meaning of uap->arg
 * depends on the quota command (see the copyin switch below).
 */
int
quotactl(proc_t p, struct quotactl_args *uap, __unused int32_t *retval)
{
	struct mount *mp;
	int error, quota_cmd, quota_status = 0;
	caddr_t datap;
	size_t fnamelen;
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();
	struct dqblk my_dqblk = {};

	AUDIT_ARG(uid, uap->uid);
	AUDIT_ARG(cmd, uap->cmd);
	NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
	    uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	mp = nd.ni_vp->v_mount;
	/* pin the mount; the vnode itself is no longer needed */
	mount_ref(mp, 0);
	vnode_put(nd.ni_vp);
	nameidone(&nd);

#if CONFIG_MACF
	error = mac_mount_check_quotactl(ctx, mp, uap->cmd, uap->uid);
	if (error != 0) {
		goto out;
	}
#endif

	/* copyin any data we will need for downstream code */
	quota_cmd = uap->cmd >> SUBCMDSHIFT;

	switch (quota_cmd) {
	case Q_QUOTAON:
		/* uap->arg specifies a file from which to take the quotas */
		fnamelen = MAXPATHLEN;
		datap = zalloc(ZV_NAMEI);
		error = copyinstr(uap->arg, datap, MAXPATHLEN, &fnamelen);
		break;
	case Q_GETQUOTA:
		/* uap->arg is a pointer to a dqblk structure. */
		datap = (caddr_t) &my_dqblk;
		break;
	case Q_SETQUOTA:
	case Q_SETUSE:
		/* uap->arg is a pointer to a dqblk structure. */
		datap = (caddr_t) &my_dqblk;
		if (proc_is64bit(p)) {
			struct user_dqblk my_dqblk64;
			error = copyin(uap->arg, (caddr_t)&my_dqblk64, sizeof(my_dqblk64));
			if (error == 0) {
				munge_dqblk(&my_dqblk, &my_dqblk64, FALSE);
			}
		} else {
			error = copyin(uap->arg, (caddr_t)&my_dqblk, sizeof(my_dqblk));
		}
		break;
	case Q_QUOTASTAT:
		/* uap->arg is a pointer to an integer */
		datap = (caddr_t) &quota_status;
		break;
	default:
		/* unknown subcommand: let the filesystem decide, no data */
		datap = NULL;
		break;
	} /* switch */

	if (error == 0) {
		error = VFS_QUOTACTL(mp, uap->cmd, uap->uid, datap, ctx);
	}

	/* copy results back out / release resources acquired above */
	switch (quota_cmd) {
	case Q_QUOTAON:
		if (datap != NULL) {
			zfree(ZV_NAMEI, datap);
		}
		break;
	case Q_GETQUOTA:
		/* uap->arg is a pointer to a dqblk structure we need to copy out to */
		if (error == 0) {
			if (proc_is64bit(p)) {
				struct user_dqblk my_dqblk64;

				memset(&my_dqblk64, 0, sizeof(my_dqblk64));
				munge_dqblk(&my_dqblk, &my_dqblk64, TRUE);
				error = copyout((caddr_t)&my_dqblk64, uap->arg, sizeof(my_dqblk64));
			} else {
				error = copyout(datap, uap->arg, sizeof(struct dqblk));
			}
		}
		break;
	case Q_QUOTASTAT:
		/* uap->arg is a pointer to an integer */
		if (error == 0) {
			error = copyout(datap, uap->arg, sizeof(quota_status));
		}
		break;
	default:
		break;
	} /* switch */

out:
	mount_drop(mp, 0);
	return error;
}
3403 #else
int
quotactl(__unused proc_t p, __unused struct quotactl_args *uap, __unused int32_t *retval)
{
	/* quota support compiled out (!QUOTA) */
	return EOPNOTSUPP;
}
3409 #endif /* QUOTA */
3410
3411 static int
statfs_internal(proc_t p,struct mount * mp,user_addr_t bufp)3412 statfs_internal(proc_t p, struct mount *mp, user_addr_t bufp)
3413 {
3414 int error;
3415 vfs_context_t ctx = vfs_context_current();
3416
3417 #if CONFIG_MACF
3418 error = mac_mount_check_stat(ctx, mp);
3419 if (error != 0) {
3420 return error;
3421 }
3422 #endif
3423
3424 error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT);
3425 if (error != 0) {
3426 return error;
3427 }
3428
3429 return munge_statfs(mp, &mp->mnt_vfsstat, bufp, NULL, IS_64BIT_PROCESS(p), TRUE);
3430 }
3431
3432 /*
3433 * Get filesystem statistics.
3434 *
3435 * Returns: 0 Success
3436 * namei:???
3437 * vfs_update_vfsstat:???
3438 * munge_statfs:EFAULT
3439 */
3440 /* ARGSUSED */
3441 int
statfs(proc_t p,struct statfs_args * uap,__unused int32_t * retval)3442 statfs(proc_t p, struct statfs_args *uap, __unused int32_t *retval)
3443 {
3444 int error;
3445 struct mount *mp;
3446 struct nameidata nd;
3447 vfs_context_t ctx = vfs_context_current();
3448 vnode_t vp;
3449
3450 NDINIT(&nd, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
3451 UIO_USERSPACE, uap->path, ctx);
3452 error = namei(&nd);
3453 if (error != 0) {
3454 return error;
3455 }
3456 vp = nd.ni_vp;
3457 mp = vp->v_mount;
3458 nameidone(&nd);
3459
3460 error = statfs_internal(p, mp, uap->buf);
3461 vnode_put(vp);
3462
3463 return error;
3464 }
3465
3466 /*
3467 * Get filesystem statistics.
3468 */
/* ARGSUSED */
int
fstatfs(proc_t p, struct fstatfs_args *uap, __unused int32_t *retval)
{
	int error;
	vnode_t vp = NULL;
	struct mount *mp;

	AUDIT_ARG(fd, uap->fd);

	/*
	 * NOTE(review): the "out" cleanup assumes file_vnode() leaves vp
	 * NULL on failure, so file_drop() is only called when the fd
	 * reference was actually taken -- confirm against file_vnode().
	 */
	if ((error = file_vnode(uap->fd, &vp)) ||
	    (error = vnode_getwithref(vp))) {
		goto out;
	}

	AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);

	mp = vp->v_mount;
	if (!mp) {
		error = EBADF;
		goto out_vnode;
	}

	error = statfs_internal(p, mp, uap->buf);

out_vnode:
	vnode_put(vp);

out:
	if (vp != NULL) {
		file_drop(uap->fd);
	}

	return error;
}
3504
3505 void
vfs_get_statfs64(struct mount * mp,struct statfs64 * sfs)3506 vfs_get_statfs64(struct mount *mp, struct statfs64 *sfs)
3507 {
3508 struct vfsstatfs *vsfs = &mp->mnt_vfsstat;
3509
3510 bzero(sfs, sizeof(*sfs));
3511
3512 sfs->f_bsize = vsfs->f_bsize;
3513 sfs->f_iosize = (int32_t)vsfs->f_iosize;
3514 sfs->f_blocks = vsfs->f_blocks;
3515 sfs->f_bfree = vsfs->f_bfree;
3516 sfs->f_bavail = vsfs->f_bavail;
3517 sfs->f_files = vsfs->f_files;
3518 sfs->f_ffree = vsfs->f_ffree;
3519 sfs->f_fsid = vsfs->f_fsid;
3520 sfs->f_owner = vsfs->f_owner;
3521 sfs->f_type = mp->mnt_vtable->vfc_typenum;
3522 sfs->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
3523 sfs->f_fssubtype = vsfs->f_fssubtype;
3524 sfs->f_flags_ext = (mp->mnt_kern_flag & MNTK_SYSTEMDATA) ? MNT_EXT_ROOT_DATA_VOL : 0;
3525 if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
3526 strlcpy(&sfs->f_fstypename[0], &mp->fstypename_override[0], MFSTYPENAMELEN);
3527 } else {
3528 strlcpy(&sfs->f_fstypename[0], &vsfs->f_fstypename[0], MFSTYPENAMELEN);
3529 }
3530 strlcpy(&sfs->f_mntonname[0], &vsfs->f_mntonname[0], MAXPATHLEN);
3531 strlcpy(&sfs->f_mntfromname[0], &vsfs->f_mntfromname[0], MAXPATHLEN);
3532 }
3533
3534 /*
3535 * Get file system statistics in 64-bit mode
3536 */
3537 int
statfs64(__unused struct proc * p,struct statfs64_args * uap,__unused int32_t * retval)3538 statfs64(__unused struct proc *p, struct statfs64_args *uap, __unused int32_t *retval)
3539 {
3540 struct mount *mp;
3541 int error;
3542 struct nameidata *ndp;
3543 struct statfs64 *sfsp;
3544 vfs_context_t ctxp = vfs_context_current();
3545 vnode_t vp;
3546 struct {
3547 struct nameidata nd;
3548 struct statfs64 sfs;
3549 } *__nameidata_statfs64;
3550
3551 __nameidata_statfs64 = kalloc_type(typeof(*__nameidata_statfs64),
3552 Z_WAITOK);
3553 ndp = &__nameidata_statfs64->nd;
3554
3555 NDINIT(ndp, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
3556 UIO_USERSPACE, uap->path, ctxp);
3557 error = namei(ndp);
3558 if (error != 0) {
3559 goto out;
3560 }
3561 vp = ndp->ni_vp;
3562 mp = vp->v_mount;
3563 nameidone(ndp);
3564
3565 #if CONFIG_MACF
3566 error = mac_mount_check_stat(ctxp, mp);
3567 if (error != 0) {
3568 vnode_put(vp);
3569 goto out;
3570 }
3571 #endif
3572
3573 error = vfs_update_vfsstat(mp, ctxp, VFS_USER_EVENT);
3574 if (error != 0) {
3575 vnode_put(vp);
3576 goto out;
3577 }
3578
3579 sfsp = &__nameidata_statfs64->sfs;
3580 vfs_get_statfs64(mp, sfsp);
3581 if ((mp->mnt_kern_flag & MNTK_SYSTEMDATA) &&
3582 (p->p_vfs_iopolicy & P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME)) {
3583 /* This process does not want to see a seperate data volume mountpoint */
3584 strlcpy(&sfsp->f_mntonname[0], "/", sizeof("/"));
3585 }
3586 error = copyout(sfsp, uap->buf, sizeof(*sfsp));
3587 vnode_put(vp);
3588
3589 out:
3590 kfree_type(typeof(*__nameidata_statfs64), __nameidata_statfs64);
3591
3592 return error;
3593 }
3594
3595 /*
3596 * Get file system statistics in 64-bit mode
3597 */
3598 int
fstatfs64(__unused struct proc * p,struct fstatfs64_args * uap,__unused int32_t * retval)3599 fstatfs64(__unused struct proc *p, struct fstatfs64_args *uap, __unused int32_t *retval)
3600 {
3601 struct vnode *vp;
3602 struct mount *mp;
3603 struct statfs64 sfs;
3604 int error;
3605
3606 AUDIT_ARG(fd, uap->fd);
3607
3608 if ((error = file_vnode(uap->fd, &vp))) {
3609 return error;
3610 }
3611
3612 error = vnode_getwithref(vp);
3613 if (error) {
3614 file_drop(uap->fd);
3615 return error;
3616 }
3617
3618 AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
3619
3620 mp = vp->v_mount;
3621 if (!mp) {
3622 error = EBADF;
3623 goto out;
3624 }
3625
3626 #if CONFIG_MACF
3627 error = mac_mount_check_stat(vfs_context_current(), mp);
3628 if (error != 0) {
3629 goto out;
3630 }
3631 #endif
3632
3633 if ((error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)) != 0) {
3634 goto out;
3635 }
3636
3637 vfs_get_statfs64(mp, &sfs);
3638 if ((mp->mnt_kern_flag & MNTK_SYSTEMDATA) &&
3639 (p->p_vfs_iopolicy & P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME)) {
3640 /* This process does not want to see a seperate data volume mountpoint */
3641 strlcpy(&sfs.f_mntonname[0], "/", sizeof("/"));
3642 }
3643 error = copyout(&sfs, uap->buf, sizeof(sfs));
3644
3645 out:
3646 file_drop(uap->fd);
3647 vnode_put(vp);
3648
3649 return error;
3650 }
3651
/*
 * Iteration state shared between [__mac_]getfsstat()/getfsstat64()
 * and their vfs_iterate() callbacks.
 */
struct getfsstat_struct {
	user_addr_t sfsp;   /* user buffer cursor, advanced per entry copied */
	user_addr_t *mp;    /* optional array of user MAC-label pointers */
	int count;          /* mounts visited so far (may exceed maxcount) */
	int maxcount;       /* entries that fit in the user buffer */
	int flags;          /* caller's MNT_NOWAIT / MNT_WAIT / MNT_DWAIT */
	int error;          /* first error encountered, if any */
};
3660
3661
3662 static int
getfsstat_callback(mount_t mp,void * arg)3663 getfsstat_callback(mount_t mp, void * arg)
3664 {
3665 struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
3666 struct vfsstatfs *sp;
3667 int error, my_size;
3668 vfs_context_t ctx = vfs_context_current();
3669
3670 if (fstp->sfsp && fstp->count < fstp->maxcount) {
3671 #if CONFIG_MACF
3672 error = mac_mount_check_stat(ctx, mp);
3673 if (error != 0) {
3674 fstp->error = error;
3675 return VFS_RETURNED_DONE;
3676 }
3677 #endif
3678 sp = &mp->mnt_vfsstat;
3679 /*
3680 * If MNT_NOWAIT is specified, do not refresh the
3681 * fsstat cache. MNT_WAIT/MNT_DWAIT overrides MNT_NOWAIT.
3682 */
3683 if ((mp->mnt_lflag & MNT_LDEAD) ||
3684 (((fstp->flags & MNT_NOWAIT) == 0 || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
3685 (!(mp->mnt_lflag & MNT_LUNMOUNT)) &&
3686 (error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT)))) {
3687 KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
3688 return VFS_RETURNED;
3689 }
3690
3691 /*
3692 * Need to handle LP64 version of struct statfs
3693 */
3694 error = munge_statfs(mp, sp, fstp->sfsp, &my_size, IS_64BIT_PROCESS(vfs_context_proc(ctx)), FALSE);
3695 if (error) {
3696 fstp->error = error;
3697 return VFS_RETURNED_DONE;
3698 }
3699 fstp->sfsp += my_size;
3700
3701 if (fstp->mp) {
3702 #if CONFIG_MACF
3703 error = mac_mount_label_get(mp, *fstp->mp);
3704 if (error) {
3705 fstp->error = error;
3706 return VFS_RETURNED_DONE;
3707 }
3708 #endif
3709 fstp->mp++;
3710 }
3711 }
3712 fstp->count++;
3713 return VFS_RETURNED;
3714 }
3715
3716 /*
3717 * Get statistics on all filesystems.
3718 */
3719 int
getfsstat(__unused proc_t p,struct getfsstat_args * uap,int * retval)3720 getfsstat(__unused proc_t p, struct getfsstat_args *uap, int *retval)
3721 {
3722 struct __mac_getfsstat_args muap;
3723
3724 muap.buf = uap->buf;
3725 muap.bufsize = uap->bufsize;
3726 muap.mac = USER_ADDR_NULL;
3727 muap.macsize = 0;
3728 muap.flags = uap->flags;
3729
3730 return __mac_getfsstat(p, &muap, retval);
3731 }
3732
3733 /*
3734 * __mac_getfsstat: Get MAC-related file system statistics
3735 *
3736 * Parameters: p (ignored)
3737 * uap User argument descriptor (see below)
3738 * retval Count of file system statistics (N stats)
3739 *
3740 * Indirect: uap->bufsize Buffer size
3741 * uap->macsize MAC info size
3742 * uap->buf Buffer where information will be returned
3743 * uap->mac MAC info
3744 * uap->flags File system flags
3745 *
3746 *
3747 * Returns: 0 Success
3748 * !0 Not success
3749 *
3750 */
int
__mac_getfsstat(__unused proc_t p, struct __mac_getfsstat_args *uap, int *retval)
{
	user_addr_t sfsp;
	user_addr_t *mp;
	size_t count, maxcount, bufsize, macsize;
	struct getfsstat_struct fst;

	/* reject sizes that would overflow the int fields used below */
	if ((unsigned)uap->bufsize > INT_MAX || (unsigned)uap->macsize > INT_MAX) {
		return EINVAL;
	}

	bufsize = (size_t) uap->bufsize;
	macsize = (size_t) uap->macsize;

	/* how many statfs entries fit in the caller's buffer (ABI-dependent) */
	if (IS_64BIT_PROCESS(p)) {
		maxcount = bufsize / sizeof(struct user64_statfs);
	} else {
		maxcount = bufsize / sizeof(struct user32_statfs);
	}
	sfsp = uap->buf;
	count = 0;

	mp = NULL;

#if CONFIG_MACF
	if (uap->mac != USER_ADDR_NULL) {
		u_int32_t *mp0;
		int error;
		unsigned int i;

		/* one user MAC-label pointer (4 or 8 bytes) per statfs entry */
		count = (macsize / (IS_64BIT_PROCESS(p) ? 8 : 4));
		if (count != maxcount) {
			return EINVAL;
		}

		/* Copy in the array */
		mp0 = kalloc_data(macsize, Z_WAITOK);
		if (mp0 == NULL) {
			return ENOMEM;
		}

		error = copyin(uap->mac, mp0, macsize);
		if (error) {
			kfree_data(mp0, macsize);
			return error;
		}

		/* Normalize to an array of user_addr_t */
		mp = kalloc_data(count * sizeof(user_addr_t), Z_WAITOK);
		if (mp == NULL) {
			kfree_data(mp0, macsize);
			return ENOMEM;
		}

		for (i = 0; i < count; i++) {
			if (IS_64BIT_PROCESS(p)) {
				mp[i] = ((user_addr_t *)mp0)[i];
			} else {
				mp[i] = (user_addr_t)mp0[i];
			}
		}
		kfree_data(mp0, macsize);
	}
#endif


	fst.sfsp = sfsp;
	fst.mp = mp;
	fst.flags = uap->flags;
	fst.count = 0;
	fst.error = 0;
	fst.maxcount = (int)maxcount;


	/* mounts mid-unmount are still visited; the callback filters them */
	vfs_iterate(VFS_ITERATE_NOSKIP_UNMOUNT, getfsstat_callback, &fst);

	if (mp) {
		kfree_data(mp, count * sizeof(user_addr_t));
	}

	if (fst.error) {
		KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
		return fst.error;
	}

	/* the callback counts every mount; report only what fit in the buffer */
	if (fst.sfsp && fst.count > fst.maxcount) {
		*retval = fst.maxcount;
	} else {
		*retval = fst.count;
	}
	return 0;
}
3844
3845 static int
getfsstat64_callback(mount_t mp,void * arg)3846 getfsstat64_callback(mount_t mp, void * arg)
3847 {
3848 struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
3849 struct vfsstatfs *sp;
3850 struct statfs64 sfs;
3851 int error;
3852
3853 if (fstp->sfsp && fstp->count < fstp->maxcount) {
3854 #if CONFIG_MACF
3855 error = mac_mount_check_stat(vfs_context_current(), mp);
3856 if (error != 0) {
3857 fstp->error = error;
3858 return VFS_RETURNED_DONE;
3859 }
3860 #endif
3861 sp = &mp->mnt_vfsstat;
3862 /*
3863 * If MNT_NOWAIT is specified, do not refresh the fsstat
3864 * cache. MNT_WAIT overrides MNT_NOWAIT.
3865 *
3866 * We treat MNT_DWAIT as MNT_WAIT for all instances of
3867 * getfsstat, since the constants are out of the same
3868 * namespace.
3869 */
3870 if ((mp->mnt_lflag & MNT_LDEAD) ||
3871 ((((fstp->flags & MNT_NOWAIT) == 0) || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
3872 (!(mp->mnt_lflag & MNT_LUNMOUNT)) &&
3873 (error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)))) {
3874 KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
3875 return VFS_RETURNED;
3876 }
3877
3878 vfs_get_statfs64(mp, &sfs);
3879 error = copyout(&sfs, fstp->sfsp, sizeof(sfs));
3880 if (error) {
3881 fstp->error = error;
3882 return VFS_RETURNED_DONE;
3883 }
3884 fstp->sfsp += sizeof(sfs);
3885 }
3886 fstp->count++;
3887 return VFS_RETURNED;
3888 }
3889
3890 /*
3891 * Get statistics on all file systems in 64 bit mode.
3892 */
3893 int
getfsstat64(__unused proc_t p,struct getfsstat64_args * uap,int * retval)3894 getfsstat64(__unused proc_t p, struct getfsstat64_args *uap, int *retval)
3895 {
3896 user_addr_t sfsp;
3897 int count, maxcount;
3898 struct getfsstat_struct fst;
3899
3900 maxcount = uap->bufsize / sizeof(struct statfs64);
3901
3902 sfsp = uap->buf;
3903 count = 0;
3904
3905 fst.sfsp = sfsp;
3906 fst.flags = uap->flags;
3907 fst.count = 0;
3908 fst.error = 0;
3909 fst.maxcount = maxcount;
3910
3911 vfs_iterate(VFS_ITERATE_NOSKIP_UNMOUNT, getfsstat64_callback, &fst);
3912
3913 if (fst.error) {
3914 KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
3915 return fst.error;
3916 }
3917
3918 if (fst.sfsp && fst.count > fst.maxcount) {
3919 *retval = fst.maxcount;
3920 } else {
3921 *retval = fst.count;
3922 }
3923
3924 return 0;
3925 }
3926
3927 /*
3928 * gets the associated vnode with the file descriptor passed.
3929 * as input
3930 *
3931 * INPUT
3932 * ctx - vfs context of caller
3933 * fd - file descriptor for which vnode is required.
3934 * vpp - Pointer to pointer to vnode to be returned.
3935 *
3936 * The vnode is returned with an iocount so any vnode obtained
3937 * by this call needs a vnode_put
3938 *
3939 */
3940 int
vnode_getfromfd(vfs_context_t ctx,int fd,vnode_t * vpp)3941 vnode_getfromfd(vfs_context_t ctx, int fd, vnode_t *vpp)
3942 {
3943 int error;
3944 vnode_t vp;
3945 struct fileproc *fp;
3946 proc_t p = vfs_context_proc(ctx);
3947
3948 *vpp = NULLVP;
3949
3950 error = fp_getfvp(p, fd, &fp, &vp);
3951 if (error) {
3952 return error;
3953 }
3954
3955 error = vnode_getwithref(vp);
3956 if (error) {
3957 (void)fp_drop(p, fd, fp, 0);
3958 return error;
3959 }
3960
3961 (void)fp_drop(p, fd, fp, 0);
3962 *vpp = vp;
3963 return error;
3964 }
3965
3966 /*
3967 * Wrapper function around namei to start lookup from a directory
3968 * specified by a file descriptor ni_dirfd.
3969 *
3970 * In addition to all the errors returned by namei, this call can
3971 * return ENOTDIR if the file descriptor does not refer to a directory.
3972 * and EBADF if the file descriptor is not valid.
3973 */
int
nameiat(struct nameidata *ndp, int dirfd)
{
	/*
	 * Only redirect the lookup when dirfd is meaningful: not AT_FDCWD,
	 * not a continued lookup, and no starting vnode already supplied.
	 */
	if ((dirfd != AT_FDCWD) &&
	    !(ndp->ni_flag & NAMEI_CONTLOOKUP) &&
	    !(ndp->ni_cnd.cn_flags & USEDVP)) {
		int error = 0;
		char c;

		/* peek at the first byte of the path to detect absolute paths */
		if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
			error = copyin(ndp->ni_dirp, &c, sizeof(char));
			if (error) {
				return error;
			}
		} else {
			c = *((char *)(ndp->ni_dirp));
		}

		if (c != '/') {
			/* relative path: resolve it against dirfd's directory */
			vnode_t dvp_at;

			error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
			    &dvp_at);
			if (error) {
				return error;
			}

			if (vnode_vtype(dvp_at) != VDIR) {
				vnode_put(dvp_at);
				return ENOTDIR;
			}

			/* USEDVP is set only for the duration of this namei call */
			ndp->ni_dvp = dvp_at;
			ndp->ni_cnd.cn_flags |= USEDVP;
			error = namei(ndp);
			ndp->ni_cnd.cn_flags &= ~USEDVP;
			vnode_put(dvp_at);
			return error;
		}
	}

	/* absolute path, AT_FDCWD, or continued lookup: plain namei */
	return namei(ndp);
}
4017
4018 /*
4019 * Change current working directory to a given file descriptor.
4020 */
/*
 * Common implementation of fchdir() and __pthread_fchdir(): make the
 * directory open on uap->fd the working directory, either for the whole
 * process (per_thread == 0, p_fd.fd_cdir) or for the calling thread
 * only (per_thread != 0, uthread uu_cdir).
 *
 * In the per-thread case, fd == -1 means "revert to the per-process
 * working directory" by dropping the thread's private cwd vnode.
 */
/* ARGSUSED */
static int
common_fchdir(proc_t p, struct fchdir_args *uap, int per_thread)
{
	vnode_t vp;
	vnode_t tdp;
	vnode_t tvp;
	struct mount *mp;
	int error, should_put = 1;
	vfs_context_t ctx = vfs_context_current();

	AUDIT_ARG(fd, uap->fd);
	if (per_thread && uap->fd == -1) {
		/*
		 * Switching back from per-thread to per process CWD; verify we
		 * in fact have one before proceeding. The only success case
		 * for this code path is to return 0 preemptively after zapping
		 * the thread structure contents.
		 */
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = NULLVP;
			if (tvp != NULLVP) {
				vnode_rele(tvp);
				return 0;
			}
		}
		return EBADF;
	}

	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	if (vp->v_type != VDIR) {
		error = ENOTDIR;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_chdir(ctx, vp);
	if (error) {
		goto out;
	}
#endif
	/* caller needs search permission on the new directory */
	error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
	if (error) {
		goto out;
	}

	/* if filesystems are mounted on the directory, descend to the root */
	while (!error && (mp = vp->v_mountedhere) != NULL) {
		if (vfs_busy(mp, LK_NOWAIT)) {
			error = EACCES;
			goto out;
		}
		error = VFS_ROOT(mp, &tdp, ctx);
		vfs_unbusy(mp);
		if (error) {
			break;
		}
		vnode_put(vp);
		vp = tdp;
	}
	if (error) {
		goto out;
	}
	/* trade the iocount for a long-term usecount before installing vp */
	if ((error = vnode_ref(vp))) {
		goto out;
	}
	vnode_put(vp);
	should_put = 0;

	if (per_thread) {
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = vp;
			OSBitOrAtomic(P_THCWD, &p->p_flag);
		} else {
			vnode_rele(vp);
			error = ENOENT;
			goto out;
		}
	} else {
		proc_dirs_lock_exclusive(p);
		proc_fdlock(p);
		tvp = p->p_fd.fd_cdir;
		p->p_fd.fd_cdir = vp;
		proc_fdunlock(p);
		proc_dirs_unlock_exclusive(p);
	}

	/* release the previous working directory, if any */
	if (tvp) {
		vnode_rele(tvp);
	}

out:
	if (should_put) {
		vnode_put(vp);
	}
	file_drop(uap->fd);

	return error;
}
4134
/*
 * fchdir: change the per-process working directory to the directory
 * referenced by uap->fd.
 */
int
fchdir(proc_t p, struct fchdir_args *uap, __unused int32_t *retval)
{
	return common_fchdir(p, uap, 0);
}
4140
/*
 * __pthread_fchdir: change the working directory for the calling thread
 * only (fd == -1 reverts to the per-process working directory).
 *
 * NOTE(review): the (void *) cast relies on struct __pthread_fchdir_args
 * being layout-compatible with struct fchdir_args -- confirm if either
 * args structure changes.
 */
int
__pthread_fchdir(proc_t p, struct __pthread_fchdir_args *uap, __unused int32_t *retval)
{
	return common_fchdir(p, (void *)uap, 1);
}
4146
4147
4148 /*
4149 * Change current working directory (".").
4150 *
4151 * Returns: 0 Success
4152 * change_dir:ENOTDIR
4153 * change_dir:???
4154 * vnode_ref:ENOENT No such file or directory
4155 */
/* ARGSUSED */
int
chdir_internal(proc_t p, vfs_context_t ctx, struct nameidata *ndp, int per_thread)
{
	int error;
	vnode_t tvp;

	/* change_dir returns ndp->ni_vp with an iocount held on success */
	error = change_dir(ndp, ctx);
	if (error) {
		return error;
	}
	/* trade the iocount for a long-term usecount before installing it */
	if ((error = vnode_ref(ndp->ni_vp))) {
		vnode_put(ndp->ni_vp);
		return error;
	}
	/*
	 * drop the iocount we picked up in change_dir
	 */
	vnode_put(ndp->ni_vp);

	if (per_thread) {
		/* install as the thread-private cwd (see __pthread_chdir) */
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = ndp->ni_vp;
			OSBitOrAtomic(P_THCWD, &p->p_flag);
		} else {
			vnode_rele(ndp->ni_vp);
			return ENOENT;
		}
	} else {
		/* install as the per-process cwd */
		proc_dirs_lock_exclusive(p);
		proc_fdlock(p);
		tvp = p->p_fd.fd_cdir;
		p->p_fd.fd_cdir = ndp->ni_vp;
		proc_fdunlock(p);
		proc_dirs_unlock_exclusive(p);
	}

	/* release the previous working directory, if any */
	if (tvp) {
		vnode_rele(tvp);
	}

	return 0;
}
4202
4203
4204 /*
4205 * Change current working directory (".").
4206 *
4207 * Returns: 0 Success
4208 * chdir_internal:ENOTDIR
4209 * chdir_internal:ENOENT No such file or directory
4210 * chdir_internal:???
4211 */
/* ARGSUSED */
static int
common_chdir(proc_t p, struct chdir_args *uap, int per_thread)
{
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();

	/* build the lookup request for uap->path; chdir_internal does the rest */
	NDINIT(&nd, LOOKUP, OP_CHDIR, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);

	return chdir_internal(p, ctx, &nd, per_thread);
}
4224
4225
4226 /*
4227 * chdir
4228 *
4229 * Change current working directory (".") for the entire process
4230 *
4231 * Parameters: p Process requesting the call
4232 * uap User argument descriptor (see below)
4233 * retval (ignored)
4234 *
4235 * Indirect parameters: uap->path Directory path
4236 *
4237 * Returns: 0 Success
4238 * common_chdir: ENOTDIR
4239 * common_chdir: ENOENT No such file or directory
4240 * common_chdir: ???
4241 *
4242 */
int
chdir(proc_t p, struct chdir_args *uap, __unused int32_t *retval)
{
	/*
	 * NOTE(review): the (void *) cast assumes struct chdir_args is
	 * layout-compatible with what common_chdir expects -- confirm if
	 * either args structure changes.
	 */
	return common_chdir(p, (void *)uap, 0);
}
4248
4249 /*
4250 * __pthread_chdir
4251 *
4252 * Change current working directory (".") for a single thread
4253 *
4254 * Parameters: p Process requesting the call
4255 * uap User argument descriptor (see below)
4256 * retval (ignored)
4257 *
4258 * Indirect parameters: uap->path Directory path
4259 *
4260 * Returns: 0 Success
4261 * common_chdir: ENOTDIR
4262 * common_chdir: ENOENT No such file or directory
4263 * common_chdir: ???
4264 *
4265 */
int
__pthread_chdir(proc_t p, struct __pthread_chdir_args *uap, __unused int32_t *retval)
{
	/*
	 * NOTE(review): the (void *) cast assumes struct __pthread_chdir_args
	 * is layout-compatible with struct chdir_args -- confirm if either
	 * args structure changes.
	 */
	return common_chdir(p, (void *)uap, 1);
}
4271
4272
4273 /*
4274 * Change notion of root (``/'') directory.
4275 */
/* ARGSUSED */
int
chroot(proc_t p, struct chroot_args *uap, __unused int32_t *retval)
{
	struct filedesc *fdp = &p->p_fd;
	int error;
	struct nameidata nd;
	vnode_t tvp;
	vfs_context_t ctx = vfs_context_current();

	/* chroot is restricted to the superuser */
	if ((error = suser(kauth_cred_get(), &p->p_acflag))) {
		return error;
	}

	NDINIT(&nd, LOOKUP, OP_CHROOT, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	/* change_dir returns nd.ni_vp with an iocount held on success */
	error = change_dir(&nd, ctx);
	if (error) {
		return error;
	}

#if CONFIG_MACF
	error = mac_vnode_check_chroot(ctx, nd.ni_vp,
	    &nd.ni_cnd);
	if (error) {
		vnode_put(nd.ni_vp);
		return error;
	}
#endif

	/* trade the iocount for a long-term usecount before installing */
	if ((error = vnode_ref(nd.ni_vp))) {
		vnode_put(nd.ni_vp);
		return error;
	}
	vnode_put(nd.ni_vp);

	/*
	 * This lock provides the guarantee that as long as you hold the lock
	 * fdp->fd_rdir has a usecount on it. This is used to take an iocount
	 * on a referenced vnode in namei when determining the rootvnode for
	 * a process.
	 */
	/* needed for synchronization with lookup */
	proc_dirs_lock_exclusive(p);
	/* needed for setting the flag and other activities on the fd itself */
	proc_fdlock(p);
	tvp = fdp->fd_rdir;
	fdp->fd_rdir = nd.ni_vp;
	fdt_flag_set(fdp, FD_CHROOT);
	proc_fdunlock(p);
	proc_dirs_unlock_exclusive(p);

	/* release the previous root directory, if any */
	if (tvp != NULL) {
		vnode_rele(tvp);
	}

	return 0;
}
4334
4335 #define PATHSTATICBUFLEN 256
4336 #define PIVOT_ROOT_ENTITLEMENT \
4337 "com.apple.private.vfs.pivot-root"
4338
4339 #if defined(XNU_TARGET_OS_OSX)
/*
 * pivot_root: switch the root filesystem to the volume mounted at
 * uap->new_rootfs_path_before, remounting the old root at
 * uap->old_rootfs_path_after.  Restricted to PID 1 with the
 * com.apple.private.vfs.pivot-root entitlement.
 */
int
pivot_root(proc_t p, struct pivot_root_args *uap, __unused int *retval)
{
	int error;
	/* small stack buffers; heap fallback below on ENAMETOOLONG */
	char new_rootfs_path_before[PATHSTATICBUFLEN] = {0};
	char old_rootfs_path_after[PATHSTATICBUFLEN] = {0};
	char *new_rootfs_path_before_buf = NULL;
	char *old_rootfs_path_after_buf = NULL;
	char *incoming = NULL;
	char *outgoing = NULL;
	vnode_t incoming_rootvp = NULLVP;
	size_t bytes_copied;

	/*
	 * XXX : Additional restrictions needed
	 * - perhaps callable only once.
	 */
	if ((error = suser(kauth_cred_get(), &p->p_acflag))) {
		return error;
	}

	/*
	 * pivot_root can be executed by launchd only.
	 * Enforce entitlement.
	 */
	if ((proc_getpid(p) != 1) || !IOCurrentTaskHasEntitlement(PIVOT_ROOT_ENTITLEMENT)) {
		return EPERM;
	}

	/* copy in the new-root path, retrying with a MAXPATHLEN heap buffer */
	error = copyinstr(uap->new_rootfs_path_before, &new_rootfs_path_before[0], PATHSTATICBUFLEN, &bytes_copied);
	if (error == ENAMETOOLONG) {
		new_rootfs_path_before_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
		error = copyinstr(uap->new_rootfs_path_before, new_rootfs_path_before_buf, MAXPATHLEN, &bytes_copied);
	}

	if (error) {
		goto out;
	}

	/* same two-step copyin for the old-root destination path */
	error = copyinstr(uap->old_rootfs_path_after, &old_rootfs_path_after[0], PATHSTATICBUFLEN, &bytes_copied);
	if (error == ENAMETOOLONG) {
		old_rootfs_path_after_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
		error = copyinstr(uap->old_rootfs_path_after, old_rootfs_path_after_buf, MAXPATHLEN, &bytes_copied);
	}
	if (error) {
		goto out;
	}

	if (new_rootfs_path_before_buf) {
		incoming = new_rootfs_path_before_buf;
	} else {
		incoming = &new_rootfs_path_before[0];
	}

	if (old_rootfs_path_after_buf) {
		outgoing = old_rootfs_path_after_buf;
	} else {
		outgoing = &old_rootfs_path_after[0];
	}

	/*
	 * The proposed incoming FS MUST be authenticated (i.e. not a chunklist DMG).
	 * Userland is not allowed to pivot to an image.
	 */
	error = vnode_lookup(incoming, 0, &incoming_rootvp, vfs_context_kernel());
	if (error) {
		goto out;
	}
	error = VNOP_IOCTL(incoming_rootvp, FSIOC_KERNEL_ROOTAUTH, NULL, 0, vfs_context_kernel());
	if (error) {
		goto out;
	}

	error = vfs_switch_root(incoming, outgoing, VFSSR_VIRTUALDEV_PROHIBITED);

out:
	if (incoming_rootvp != NULLVP) {
		vnode_put(incoming_rootvp);
		incoming_rootvp = NULLVP;
	}

	if (old_rootfs_path_after_buf) {
		zfree(ZV_NAMEI, old_rootfs_path_after_buf);
	}

	if (new_rootfs_path_before_buf) {
		zfree(ZV_NAMEI, new_rootfs_path_before_buf);
	}

	return error;
}
4431 #else
int
pivot_root(proc_t p, __unused struct pivot_root_args *uap, int *retval)
{
	/* pivot_root is only implemented for XNU_TARGET_OS_OSX: fail as an unimplemented syscall. */
	return nosys(p, NULL, retval);
}
4437 #endif /* XNU_TARGET_OS_OSX */
4438
4439 /*
4440 * Common routine for chroot and chdir.
4441 *
4442 * Returns: 0 Success
4443 * ENOTDIR Not a directory
4444 * namei:??? [anything namei can return]
4445 * vnode_authorize:??? [anything vnode_authorize can return]
4446 */
4447 static int
change_dir(struct nameidata * ndp,vfs_context_t ctx)4448 change_dir(struct nameidata *ndp, vfs_context_t ctx)
4449 {
4450 vnode_t vp;
4451 int error;
4452
4453 if ((error = namei(ndp))) {
4454 return error;
4455 }
4456 nameidone(ndp);
4457 vp = ndp->ni_vp;
4458
4459 if (vp->v_type != VDIR) {
4460 vnode_put(vp);
4461 return ENOTDIR;
4462 }
4463
4464 #if CONFIG_MACF
4465 error = mac_vnode_check_chdir(ctx, vp);
4466 if (error) {
4467 vnode_put(vp);
4468 return error;
4469 }
4470 #endif
4471
4472 error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
4473 if (error) {
4474 vnode_put(vp);
4475 return error;
4476 }
4477
4478 return error;
4479 }
4480
/*
 * Allocate the vnode data (for directories) associated with the file glob.
 */
4484 struct fd_vn_data *
fg_vn_data_alloc(void)4485 fg_vn_data_alloc(void)
4486 {
4487 struct fd_vn_data *fvdata;
4488
4489 /* Allocate per fd vnode data */
4490 fvdata = kalloc_type(struct fd_vn_data, Z_WAITOK | Z_ZERO);
4491 lck_mtx_init(&fvdata->fv_lock, &fd_vn_lck_grp, &fd_vn_lck_attr);
4492 return fvdata;
4493 }
4494
4495 /*
4496 * Free the vnode data (for directories) associated with the file glob.
4497 */
4498 void
fg_vn_data_free(void * fgvndata)4499 fg_vn_data_free(void *fgvndata)
4500 {
4501 struct fd_vn_data *fvdata = (struct fd_vn_data *)fgvndata;
4502
4503 kfree_data(fvdata->fv_buf, fvdata->fv_bufallocsiz);
4504 lck_mtx_destroy(&fvdata->fv_lock, &fd_vn_lck_grp);
4505 kfree_type(struct fd_vn_data, fvdata);
4506 }
4507
4508 /*
4509 * Check permissions, allocate an open file structure,
4510 * and call the device open routine if any.
4511 *
4512 * Returns: 0 Success
4513 * EINVAL
4514 * EINTR
4515 * falloc:ENFILE
4516 * falloc:EMFILE
4517 * falloc:ENOMEM
4518 * vn_open_auth:???
4519 * dupfdopen:???
4520 * VNOP_ADVLOCK:???
4521 * vnode_setsize:???
4522 *
4523 * XXX Need to implement uid, gid
4524 */
int
open1(vfs_context_t ctx, struct nameidata *ndp, int uflags,
    struct vnode_attr *vap, fp_initfn_t fp_init, void *initarg, int32_t *retval, int authfd)
{
	proc_t p = vfs_context_proc(ctx);
	uthread_t uu = get_bsdthread_info(vfs_context_thread(ctx));
	struct fileproc *fp;
	vnode_t vp;
	int flags, oflags, amode;
	int type, indx, error;
	struct vfs_context context;
	vnode_t authvp = NULLVP;

	oflags = uflags;

	amode = oflags & O_ACCMODE;
	/*
	 * Because O_RDONLY is 0, it is not possible to distinguish between
	 * O_EXEC | O_RDONLY and O_EXEC, therefore FEXEC/FSEARCH can't be set together
	 * with FREAD/FWRITE.
	 */
	if ((amode == O_ACCMODE) || (amode && (oflags & O_EXEC))) {
		return EINVAL;
	}

	/* Convert userland O_* flags to in-kernel F* flags; raw-crypto bits are not caller-settable. */
	flags = FFLAGS(uflags);
	CLR(flags, FENCRYPTED);
	CLR(flags, FUNENCRYPTED);

	AUDIT_ARG(fflags, oflags);
	AUDIT_ARG(mode, vap->va_mode);

	/* Reserve an fd slot and fileproc before doing the actual open. */
	if ((error = falloc_withinit(p, &fp, &indx, ctx, fp_init, initarg)) != 0) {
		return error;
	}
	if (flags & O_CLOEXEC) {
		fp->fp_flags |= FP_CLOEXEC;
	}
	if (flags & O_CLOFORK) {
		fp->fp_flags |= FP_CLOFORK;
	}

	/* setup state to recognize when fdesc_open was called */
	uu->uu_dupfd = -1;

	/*
	 * Disable read/write access if file is opened with O_EVTONLY and
	 * the process has requested to deny read/write access.
	 */
	if ((flags & O_EVTONLY) && proc_disallow_rw_for_o_evtonly(p)) {
		flags &= ~(FREAD | FWRITE);
	}

	/* Resolve the optional authentication fd into a vnode for vn_open_auth(). */
	if (authfd != AUTH_OPEN_NOAUTHFD) {
		error = vnode_getfromfd(ctx, authfd, &authvp);
		if (error) {
			fp_free(p, indx, fp);
			return error;
		}
	}

	if ((error = vn_open_auth(ndp, &flags, vap, authvp))) {
		if (authvp != NULLVP) {
			vnode_put(authvp);
		}
		/*
		 * A /dev/fd open went through fdesc_open (signalled via
		 * uu_dupfd): duplicate the existing descriptor instead of
		 * failing the open.
		 */
		if ((error == ENODEV || error == ENXIO) && (uu->uu_dupfd >= 0)) {
			if ((error = dupfdopen(p, indx, uu->uu_dupfd, flags, error)) == 0) {
				*retval = indx;
				return 0;
			}
		}
		if (error == ERESTART) {
			error = EINTR;
		}
		fp_free(p, indx, fp);
		return error;
	}

	if (authvp != NULLVP) {
		vnode_put(authvp);
	}

	uu->uu_dupfd = 0;
	vp = ndp->ni_vp;

	/* Wire the freshly opened vnode into the fileglob. */
	fp->fp_glob->fg_flag = flags & (FMASK | O_EVTONLY | FENCRYPTED | FUNENCRYPTED);
	fp->fp_glob->fg_ops = &vnops;
	fp_set_data(fp, vp);

#if CONFIG_FILE_LEASES
	/*
	 * If we are creating a file or open with truncate, we need to break the
	 * lease if there is a read lease placed on the parent dir.
	 */
	if ((vnode_vtype(vp) == VREG) && (flags & (O_CREAT | O_TRUNC))) {
		vnode_breakdirlease(vp, true, oflags);
	}
	/* Now check if there is a lease placed on the file itself. */
	error = vnode_breaklease(vp, oflags, ctx);
	if (error) {
		goto bad;
	}
#endif /* CONFIG_FILE_LEASES */

	/* Apply any flock-style open lock requested via O_EXLOCK/O_SHLOCK. */
	if (flags & (O_EXLOCK | O_SHLOCK)) {
		struct flock lf = {
			.l_whence = SEEK_SET,
		};

		if (flags & O_EXLOCK) {
			lf.l_type = F_WRLCK;
		} else {
			lf.l_type = F_RDLCK;
		}
		type = F_FLOCK;
		if ((flags & FNONBLOCK) == 0) {
			type |= F_WAIT;
		}
#if CONFIG_MACF
		error = mac_file_check_lock(vfs_context_ucred(ctx), fp->fp_glob,
		    F_SETLK, &lf);
		if (error) {
			goto bad;
		}
#endif
		if ((error = VNOP_ADVLOCK(vp, (caddr_t)fp->fp_glob, F_SETLK, &lf, type, ctx, NULL))) {
			goto bad;
		}
		/* Remember the lock so the error path (and close) can undo it. */
		fp->fp_glob->fg_flag |= FWASLOCKED;
	}

	/* try to truncate by setting the size attribute */
	if ((flags & O_TRUNC) && ((error = vnode_setsize(vp, (off_t)0, 0, ctx)) != 0)) {
		goto bad;
	}

	/*
	 * For directories we hold some additional information in the fd.
	 */
	if (vnode_vtype(vp) == VDIR) {
		fp->fp_glob->fg_vn_data = fg_vn_data_alloc();
	} else {
		fp->fp_glob->fg_vn_data = NULL;
	}

#if CONFIG_SECLUDED_MEMORY
	/* Decide whether this file's cached pages are eligible for the secluded pool. */
	if (secluded_for_filecache && vnode_vtype(vp) == VREG) {
		memory_object_control_t moc;
		const char *v_name;

		moc = ubc_getobject(vp, UBC_FLAGS_NONE);

		if (moc == MEMORY_OBJECT_CONTROL_NULL) {
			/* nothing to do... */
		} else if (fp->fp_glob->fg_flag & FWRITE) {
			/* writable -> no longer eligible for secluded pages */
			memory_object_mark_eligible_for_secluded(moc,
			    FALSE);
		} else if (secluded_for_filecache == SECLUDED_FILECACHE_APPS) {
			/* Only a path prefix is needed, so a short copy suffices. */
			char pathname[32] = { 0, };
			size_t copied;
			/* XXX FBDP: better way to detect /Applications/ ? */
			if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
				(void)copyinstr(ndp->ni_dirp,
				    pathname,
				    sizeof(pathname),
				    &copied);
			} else {
				copystr(CAST_DOWN(void *, ndp->ni_dirp),
				    pathname,
				    sizeof(pathname),
				    &copied);
			}
			pathname[sizeof(pathname) - 1] = '\0';
			if (strncmp(pathname,
			    "/Applications/",
			    strlen("/Applications/")) == 0 &&
			    strncmp(pathname,
			    "/Applications/Camera.app/",
			    strlen("/Applications/Camera.app/")) != 0) {
				/*
				 * not writable
				 * AND from "/Applications/"
				 * AND not from "/Applications/Camera.app/"
				 * ==> eligible for secluded
				 */
				memory_object_mark_eligible_for_secluded(moc,
				    TRUE);
			}
		} else if (secluded_for_filecache == SECLUDED_FILECACHE_RDONLY &&
		    (v_name = vnode_getname(vp))) {
			size_t len = strlen(v_name);

			if (!strncmp(v_name, "dyld", len) ||
			    !strncmp(v_name, "launchd", len) ||
			    !strncmp(v_name, "Camera", len) ||
			    !strncmp(v_name, "SpringBoard", len) ||
			    !strncmp(v_name, "backboardd", len)) {
				/*
				 * This file matters when launching Camera:
				 * do not store its contents in the secluded
				 * pool that will be drained on Camera launch.
				 */
				memory_object_mark_eligible_for_secluded(moc,
				    FALSE);
			} else if (!strncmp(v_name, "mediaserverd", len)) {
				memory_object_mark_eligible_for_secluded(moc,
				    FALSE);
				memory_object_mark_for_realtime(moc,
				    true);
			} else if (!strncmp(v_name, "bluetoothd", len)) {
				/*
				 * bluetoothd might be needed for realtime audio
				 * playback.
				 */
				memory_object_mark_eligible_for_secluded(moc,
				    FALSE);
				memory_object_mark_for_realtime(moc,
				    true);
			} else {
				char pathname[64] = { 0, };
				size_t copied;
				if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
					(void)copyinstr(ndp->ni_dirp,
					    pathname,
					    sizeof(pathname),
					    &copied);
				} else {
					copystr(CAST_DOWN(void *, ndp->ni_dirp),
					    pathname,
					    sizeof(pathname),
					    &copied);
				}
				pathname[sizeof(pathname) - 1] = '\0';
				if (strncmp(pathname,
				    "/Library/Audio/Plug-Ins/",
				    strlen("/Library/Audio/Plug-Ins/")) == 0 ||
				    strncmp(pathname,
				    "/System/Library/Audio/Plug-Ins/",
				    strlen("/System/Library/Audio/Plug-Ins/")) == 0) {
					/*
					 * This may be an audio plugin required
					 * for realtime playback.
					 * ==> NOT eligible for secluded.
					 */
					memory_object_mark_eligible_for_secluded(moc,
					    FALSE);
					memory_object_mark_for_realtime(moc,
					    true);
				}
			}
			vnode_putname(v_name);
		}
	}
#endif /* CONFIG_SECLUDED_MEMORY */

	vnode_put(vp);

	/*
	 * The first terminal open (without a O_NOCTTY) by a session leader
	 * results in it being set as the controlling terminal.
	 *
	 * NOTE(review): vp is used after vnode_put() above; the fileglob
	 * installed via fp_set_data() still references vp, which presumably
	 * keeps the vnode valid here — confirm iocount/usecount semantics.
	 */
	if (vnode_istty(vp) && !(p->p_flag & P_CONTROLT) &&
	    !(flags & O_NOCTTY)) {
		int tmp = 0;

		(void)(*fp->fp_glob->fg_ops->fo_ioctl)(fp, (int)TIOCSCTTY,
		    (caddr_t)&tmp, ctx);
	}

	/* Publish the descriptor to the process and drop our extra fileproc ref. */
	proc_fdlock(p);
	procfdtbl_releasefd(p, indx, NULL);

	fp_drop(p, indx, fp, 1);
	proc_fdunlock(p);

	*retval = indx;

	return 0;
bad:
	/* Error unwind: release the advisory lock (if taken), close the vnode, free the fd slot. */
	context = *vfs_context_current();
	context.vc_ucred = fp->fp_glob->fg_cred;

	if ((fp->fp_glob->fg_flag & FWASLOCKED) &&
	    (FILEGLOB_DTYPE(fp->fp_glob) == DTYPE_VNODE)) {
		struct flock lf = {
			.l_whence = SEEK_SET,
			.l_type = F_UNLCK,
		};

		(void)VNOP_ADVLOCK(
			vp, (caddr_t)fp->fp_glob, F_UNLCK, &lf, F_FLOCK, ctx, NULL);
	}

	vn_close(vp, fp->fp_glob->fg_flag, &context);
	vnode_put(vp);
	fp_free(p, indx, fp);

	return error;
}
4825
4826 /*
4827 * While most of the *at syscall handlers can call nameiat() which
4828 * is a wrapper around namei, the use of namei and initialisation
4829 * of nameidata are far removed and in different functions - namei
4830 * gets called in vn_open_auth for open1. So we'll just do here what
4831 * nameiat() does.
4832 */
static int
open1at(vfs_context_t ctx, struct nameidata *ndp, int uflags,
    struct vnode_attr *vap, fp_initfn_t fp_init, void *initarg, int32_t *retval,
    int dirfd, int authfd)
{
	/*
	 * Emulate nameiat(): when dirfd is a real descriptor and the path is
	 * relative, resolve dirfd to a directory vnode and pass it to open1()
	 * through ndp->ni_dvp with the USEDVP flag set.
	 */
	if ((dirfd != AT_FDCWD) && !(ndp->ni_cnd.cn_flags & USEDVP)) {
		int error;
		char c;

		/* Peek at the first byte of the path to test for absoluteness. */
		if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
			error = copyin(ndp->ni_dirp, &c, sizeof(char));
			if (error) {
				return error;
			}
		} else {
			c = *((char *)(ndp->ni_dirp));
		}

		if (c != '/') {
			vnode_t dvp_at;

			error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
			    &dvp_at);
			if (error) {
				return error;
			}

			/* dirfd must reference a directory. */
			if (vnode_vtype(dvp_at) != VDIR) {
				vnode_put(dvp_at);
				return ENOTDIR;
			}

			ndp->ni_dvp = dvp_at;
			ndp->ni_cnd.cn_flags |= USEDVP;
			error = open1(ctx, ndp, uflags, vap, fp_init, initarg,
			    retval, authfd);
			/* Drop the iocount taken by vnode_getfromfd(). */
			vnode_put(dvp_at);
			return error;
		}
	}

	/* Absolute path, or the caller already supplied a starting dvp. */
	return open1(ctx, ndp, uflags, vap, fp_init, initarg, retval, authfd);
}
4876
4877 /*
4878 * open_extended: open a file given a path name; with extended argument list (including extended security (ACL)).
4879 *
4880 * Parameters: p Process requesting the open
4881 * uap User argument descriptor (see below)
4882 * retval Pointer to an area to receive the
 *					return value from the system call
4884 *
4885 * Indirect: uap->path Path to open (same as 'open')
 *		uap->flags			Flags to open (same as 'open')
4887 * uap->uid UID to set, if creating
4888 * uap->gid GID to set, if creating
4889 * uap->mode File mode, if creating (same as 'open')
4890 * uap->xsecurity ACL to set, if creating
4891 *
4892 * Returns: 0 Success
4893 * !0 errno value
4894 *
4895 * Notes: The kauth_filesec_t in 'va', if any, is in host byte order.
4896 *
 * XXX:	We should enumerate the possible errno values here, and where
4898 * in the code they originated.
4899 */
4900 int
open_extended(proc_t p,struct open_extended_args * uap,int32_t * retval)4901 open_extended(proc_t p, struct open_extended_args *uap, int32_t *retval)
4902 {
4903 int ciferror;
4904 kauth_filesec_t xsecdst;
4905 struct vnode_attr va;
4906 struct nameidata nd;
4907 int cmode;
4908
4909 AUDIT_ARG(owner, uap->uid, uap->gid);
4910
4911 xsecdst = NULL;
4912 if ((uap->xsecurity != USER_ADDR_NULL) &&
4913 ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)) {
4914 return ciferror;
4915 }
4916
4917 VATTR_INIT(&va);
4918 cmode = ((uap->mode & ~p->p_fd.fd_cmask) & ALLPERMS) & ~S_ISTXT;
4919 VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
4920 if (uap->uid != KAUTH_UID_NONE) {
4921 VATTR_SET(&va, va_uid, uap->uid);
4922 }
4923 if (uap->gid != KAUTH_GID_NONE) {
4924 VATTR_SET(&va, va_gid, uap->gid);
4925 }
4926 if (xsecdst != NULL) {
4927 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
4928 va.va_vaflags |= VA_FILESEC_ACL;
4929 }
4930
4931 NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
4932 uap->path, vfs_context_current());
4933
4934 ciferror = open1(vfs_context_current(), &nd, uap->flags, &va,
4935 NULL, NULL, retval, AUTH_OPEN_NOAUTHFD);
4936 if (xsecdst != NULL) {
4937 kauth_filesec_free(xsecdst);
4938 }
4939
4940 return ciferror;
4941 }
4942
4943 /*
4944 * Go through the data-protected atomically controlled open (2)
4945 *
4946 * int open_dprotected_np(user_addr_t path, int flags, int class, int dpflags, int mode)
4947 */
4948 static int
openat_dprotected_internal(vfs_context_t ctx,user_addr_t path,int flags,int mode,int class,int dpflags,int fd,int authfd,enum uio_seg segflg,int * retval)4949 openat_dprotected_internal(vfs_context_t ctx, user_addr_t path, int flags, int mode,
4950 int class, int dpflags, int fd, int authfd, enum uio_seg segflg, int *retval)
4951 {
4952 /*
4953 * Follow the same path as normal open(2)
4954 * Look up the item if it exists, and acquire the vnode.
4955 */
4956 struct vnode_attr va;
4957 struct nameidata nd;
4958 int cmode;
4959 int error;
4960 struct filedesc *fdp = &vfs_context_proc(ctx)->p_fd;
4961
4962 VATTR_INIT(&va);
4963 /* Mask off all but regular access permissions */
4964 cmode = ((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
4965 VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
4966
4967 NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, segflg,
4968 path, ctx);
4969
4970 /*
4971 * Initialize the extra fields in vnode_attr to pass down our
4972 * extra fields.
4973 * 1. target cprotect class.
4974 * 2. set a flag to mark it as requiring open-raw-encrypted semantics.
4975 */
4976 if (flags & O_CREAT) {
4977 /* lower level kernel code validates that the class is valid before applying it. */
4978 if (class != PROTECTION_CLASS_DEFAULT) {
4979 /*
4980 * PROTECTION_CLASS_DEFAULT implies that we make the class for this
4981 * file behave the same as open (2)
4982 */
4983 VATTR_SET(&va, va_dataprotect_class, class);
4984 }
4985 }
4986
4987 if (dpflags & (O_DP_GETRAWENCRYPTED | O_DP_GETRAWUNENCRYPTED | O_DP_AUTHENTICATE)) {
4988 if (flags & (O_RDWR | O_WRONLY)) {
4989 /*
4990 * Not allowed to write raw encrypted bytes or when opening authenticated.
4991 */
4992 return EINVAL;
4993 }
4994 if (dpflags & O_DP_GETRAWENCRYPTED) {
4995 VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWENCRYPTED);
4996 }
4997 if (dpflags & O_DP_GETRAWUNENCRYPTED) {
4998 VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWUNENCRYPTED);
4999 }
5000 if (dpflags & O_DP_AUTHENTICATE) {
5001 VATTR_SET(&va, va_dataprotect_flags, VA_DP_AUTHENTICATE);
5002 }
5003 }
5004
5005 error = open1at(vfs_context_current(), &nd, flags, &va,
5006 NULL, NULL, retval, fd, authfd);
5007
5008 return error;
5009 }
5010
5011 int
openat_dprotected_np(__unused proc_t p,struct openat_dprotected_np_args * uap,int32_t * retval)5012 openat_dprotected_np(__unused proc_t p, struct openat_dprotected_np_args *uap, int32_t *retval)
5013 {
5014 if ((uap->dpflags & O_DP_AUTHENTICATE) && (uap->flags & O_CREAT)) {
5015 return EINVAL;
5016 }
5017
5018 return openat_dprotected_internal(vfs_context_current(), uap->path, uap->flags, uap->mode,
5019 uap->class, uap->dpflags, uap->fd, uap->authfd, UIO_USERSPACE, retval);
5020 }
5021
5022 int
open_dprotected_np(__unused proc_t p,struct open_dprotected_np_args * uap,int32_t * retval)5023 open_dprotected_np(__unused proc_t p, struct open_dprotected_np_args *uap, int32_t *retval)
5024 {
5025 if (uap->dpflags & O_DP_AUTHENTICATE) {
5026 return EINVAL;
5027 }
5028
5029 return openat_dprotected_internal(vfs_context_current(), uap->path, uap->flags, uap->mode,
5030 uap->class, uap->dpflags, AT_FDCWD, AUTH_OPEN_NOAUTHFD, UIO_USERSPACE, retval);
5031 }
5032
5033 static int
openat_internal(vfs_context_t ctx,user_addr_t path,int flags,int mode,int fd,enum uio_seg segflg,int * retval)5034 openat_internal(vfs_context_t ctx, user_addr_t path, int flags, int mode,
5035 int fd, enum uio_seg segflg, int *retval)
5036 {
5037 struct filedesc *fdp = &vfs_context_proc(ctx)->p_fd;
5038 struct {
5039 struct vnode_attr va;
5040 struct nameidata nd;
5041 } *__open_data;
5042 struct vnode_attr *vap;
5043 struct nameidata *ndp;
5044 int cmode;
5045 int error;
5046
5047 __open_data = kalloc_type(typeof(*__open_data), Z_WAITOK);
5048 vap = &__open_data->va;
5049 ndp = &__open_data->nd;
5050
5051 VATTR_INIT(vap);
5052 /* Mask off all but regular access permissions */
5053 cmode = ((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
5054 VATTR_SET(vap, va_mode, cmode & ACCESSPERMS);
5055
5056 NDINIT(ndp, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1,
5057 segflg, path, ctx);
5058
5059 error = open1at(ctx, ndp, flags, vap, NULL, NULL, retval, fd, AUTH_OPEN_NOAUTHFD);
5060
5061 kfree_type(typeof(*__open_data), __open_data);
5062
5063 return error;
5064 }
5065
int
open(proc_t p, struct open_args *uap, int32_t *retval)
{
	/* Cancellation point: honor any pending pthread cancellation, then forward. */
	__pthread_testcancel(1);
	return open_nocancel(p, (struct open_nocancel_args *)uap, retval);
}
5072
int
open_nocancel(__unused proc_t p, struct open_nocancel_args *uap,
    int32_t *retval)
{
	/* open(2) without a cancellation check: path is resolved relative to the CWD. */
	return openat_internal(vfs_context_current(), uap->path, uap->flags,
	    uap->mode, AT_FDCWD, UIO_USERSPACE, retval);
}
5080
int
openat_nocancel(__unused proc_t p, struct openat_nocancel_args *uap,
    int32_t *retval)
{
	/* openat(2) without a cancellation check: relative paths resolve against uap->fd. */
	return openat_internal(vfs_context_current(), uap->path, uap->flags,
	    uap->mode, uap->fd, UIO_USERSPACE, retval);
}
5088
int
openat(proc_t p, struct openat_args *uap, int32_t *retval)
{
	/* Cancellation point: honor any pending pthread cancellation, then forward. */
	__pthread_testcancel(1);
	return openat_nocancel(p, (struct openat_nocancel_args *)uap, retval);
}
5095
5096 #define OPEN_BY_ID_ENTITLEMENT "com.apple.private.vfs.open-by-id"
5097
5098 static boolean_t
vfs_context_can_open_by_id(vfs_context_t ctx)5099 vfs_context_can_open_by_id(vfs_context_t ctx)
5100 {
5101 if (csproc_get_platform_binary(vfs_context_proc(ctx))) {
5102 return TRUE;
5103 }
5104
5105 return IOTaskHasEntitlement(vfs_context_task(ctx),
5106 OPEN_BY_ID_ENTITLEMENT);
5107 }
5108
5109 /*
5110 * openbyid_np: open a file given a file system id and a file system object id
5111 * the hfs file system object id is an fsobj_id_t {uint32, uint32}
5112 * file systems that don't support object ids it is a node id (uint64_t).
5113 *
5114 * Parameters: p Process requesting the open
5115 * uap User argument descriptor (see below)
5116 * retval Pointer to an area to receive the
 *					return value from the system call
5118 *
5119 * Indirect: uap->path Path to open (same as 'open')
5120 *
5121 * uap->fsid id of target file system
5122 * uap->objid id of target file system object
5123 * uap->flags Flags to open (same as 'open')
5124 *
5125 * Returns: 0 Success
5126 * !0 errno value
5127 *
5128 *
 * XXX:	We should enumerate the possible errno values here, and where
5130 * in the code they originated.
5131 */
int
openbyid_np(__unused proc_t p, struct openbyid_np_args *uap, int *retval)
{
	fsid_t fsid;
	uint64_t objid;
	int error;
	char *buf = NULL;
	int buflen = MAXPATHLEN;
	int pathlen = 0;
	vfs_context_t ctx = vfs_context_current();

	/* Restricted to platform binaries / entitled tasks. */
	if (!vfs_context_can_open_by_id(ctx)) {
		return EPERM;
	}

	if ((error = copyin(uap->fsid, (caddr_t)&fsid, sizeof(fsid)))) {
		return error;
	}

	/* uap->objid is an fsobj_id_t defined as struct {uint32_t, uint32_t} */
	if ((error = copyin(uap->objid, (caddr_t)&objid, sizeof(uint64_t)))) {
		return error;
	}

	AUDIT_ARG(value32, fsid.val[0]);
	AUDIT_ARG(value64, objid);

	/* Resolve (fsid, objid) to a path, growing the buffer by MAXPATHLEN on ENOSPC. */
	do {
		buf = kalloc_data(buflen + 1, Z_WAITOK);
		if (buf == NULL) {
			return ENOMEM;
		}

		error = fsgetpath_internal( ctx, fsid.val[0], objid, buflen,
		    buf, FSOPT_ISREALFSID, &pathlen);

		if (error) {
			kfree_data(buf, buflen + 1);
			buf = NULL;
		}
	} while (error == ENOSPC && (buflen += MAXPATHLEN));

	if (error) {
		return error;
	}

	/* Ensure NUL termination before treating the buffer as a path string. */
	buf[pathlen] = 0;

	/* Open the resolved path; the buffer lives in kernel space (UIO_SYSSPACE). */
	error = openat_internal(
		ctx, (user_addr_t)buf, uap->oflags, 0, AT_FDCWD, UIO_SYSSPACE, retval);

	kfree_data(buf, buflen + 1);

	return error;
}
5188
5189
5190 /*
5191 * Create a special file.
5192 */
5193 static int mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap,
5194 int fd);
5195
static int
mknodat_internal(proc_t p, user_addr_t upath, struct vnode_attr *vap,
    mode_t mode, int fd)
{
	vfs_context_t ctx = vfs_context_current();
	struct nameidata nd;
	vnode_t vp, dvp;
	int error;

	/* If it's a mknod() of a FIFO, call mkfifo1() instead */
	if ((mode & S_IFMT) == S_IFIFO) {
		return mkfifo1(ctx, upath, vap, fd);
	}

	AUDIT_ARG(mode, mode);
	AUDIT_ARG(value32, vap->va_rdev);

	/* Device nodes may only be created by the superuser. */
	if ((error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
		return error;
	}
	NDINIT(&nd, CREATE, OP_MKNOD, LOCKPARENT | AUDITVNPATH1,
	    UIO_USERSPACE, upath, ctx);
	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* The target must not already exist. */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}

	/* Only character and block devices remain (FIFOs were handled above). */
	switch (mode & S_IFMT) {
	case S_IFCHR:
		VATTR_SET(vap, va_type, VCHR);
		break;
	case S_IFBLK:
		VATTR_SET(vap, va_type, VBLK);
		break;
	default:
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_create(ctx,
	    nd.ni_dvp, &nd.ni_cnd, vap);
	if (error) {
		goto out;
	}
#endif

	if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
		goto out;
	}

#if CONFIG_FILE_LEASES
	/* Creating an entry writes the directory: break any read lease on it. */
	vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

	if ((error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx)) != 0) {
		goto out;
	}

	if (vp) {
		int update_flags = 0;

		// Make sure the name & parent pointers are hooked up
		if (vp->v_name == NULL) {
			update_flags |= VNODE_UPDATE_NAME;
		}
		if (vp->v_parent == NULLVP) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		if (update_flags) {
			vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
		}

#if CONFIG_FSE
		add_fsevent(FSE_CREATE_FILE, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
#endif
	}

out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	vnode_put(dvp);

	return error;
}
5298
5299 int
mknod(proc_t p,struct mknod_args * uap,__unused int32_t * retval)5300 mknod(proc_t p, struct mknod_args *uap, __unused int32_t *retval)
5301 {
5302 struct vnode_attr va;
5303
5304 VATTR_INIT(&va);
5305 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5306 VATTR_SET(&va, va_rdev, uap->dev);
5307
5308 return mknodat_internal(p, uap->path, &va, (mode_t)uap->mode, AT_FDCWD);
5309 }
5310
5311 int
mknodat(proc_t p,struct mknodat_args * uap,__unused int32_t * retval)5312 mknodat(proc_t p, struct mknodat_args *uap, __unused int32_t *retval)
5313 {
5314 struct vnode_attr va;
5315
5316 VATTR_INIT(&va);
5317 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5318 VATTR_SET(&va, va_rdev, uap->dev);
5319
5320 return mknodat_internal(p, uap->path, &va, (mode_t)uap->mode, uap->fd);
5321 }
5322
5323 /*
5324 * Create a named pipe.
5325 *
5326 * Returns: 0 Success
5327 * EEXIST
5328 * namei:???
5329 * vnode_authorize:???
5330 * vn_create:???
5331 */
static int
mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap, int fd)
{
	vnode_t vp, dvp;
	int error;
	struct nameidata nd;

	NDINIT(&nd, CREATE, OP_MKFIFO, LOCKPARENT | AUDITVNPATH1,
	    UIO_USERSPACE, upath, ctx);
	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* check that this is a new file and authorize addition */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}
	VATTR_SET(vap, va_type, VFIFO);

	if ((error = vn_authorize_create(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0) {
		goto out;
	}

	error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx);
out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	/* Drop the iocounts taken by nameiat()/vn_create(). */
	if (vp) {
		vnode_put(vp);
	}
	vnode_put(dvp);

	return error;
}
5374
5375
5376 /*
5377 * mkfifo_extended: Create a named pipe; with extended argument list (including extended security (ACL)).
5378 *
5379 * Parameters: p Process requesting the open
5380 * uap User argument descriptor (see below)
5381 * retval (Ignored)
5382 *
5383 * Indirect: uap->path Path to fifo (same as 'mkfifo')
5384 * uap->uid UID to set
5385 * uap->gid GID to set
5386 * uap->mode File mode to set (same as 'mkfifo')
5387 * uap->xsecurity ACL to set, if creating
5388 *
5389 * Returns: 0 Success
5390 * !0 errno value
5391 *
5392 * Notes: The kauth_filesec_t in 'va', if any, is in host byte order.
5393 *
 * XXX:	We should enumerate the possible errno values here, and where
5395 * in the code they originated.
5396 */
5397 int
mkfifo_extended(proc_t p,struct mkfifo_extended_args * uap,__unused int32_t * retval)5398 mkfifo_extended(proc_t p, struct mkfifo_extended_args *uap, __unused int32_t *retval)
5399 {
5400 int ciferror;
5401 kauth_filesec_t xsecdst;
5402 struct vnode_attr va;
5403
5404 AUDIT_ARG(owner, uap->uid, uap->gid);
5405
5406 xsecdst = KAUTH_FILESEC_NONE;
5407 if (uap->xsecurity != USER_ADDR_NULL) {
5408 if ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0) {
5409 return ciferror;
5410 }
5411 }
5412
5413 VATTR_INIT(&va);
5414 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5415 if (uap->uid != KAUTH_UID_NONE) {
5416 VATTR_SET(&va, va_uid, uap->uid);
5417 }
5418 if (uap->gid != KAUTH_GID_NONE) {
5419 VATTR_SET(&va, va_gid, uap->gid);
5420 }
5421 if (xsecdst != KAUTH_FILESEC_NONE) {
5422 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
5423 va.va_vaflags |= VA_FILESEC_ACL;
5424 }
5425
5426 ciferror = mkfifo1(vfs_context_current(), uap->path, &va, AT_FDCWD);
5427
5428 if (xsecdst != KAUTH_FILESEC_NONE) {
5429 kauth_filesec_free(xsecdst);
5430 }
5431 return ciferror;
5432 }
5433
5434 /* ARGSUSED */
5435 int
mkfifo(proc_t p,struct mkfifo_args * uap,__unused int32_t * retval)5436 mkfifo(proc_t p, struct mkfifo_args *uap, __unused int32_t *retval)
5437 {
5438 struct vnode_attr va;
5439
5440 VATTR_INIT(&va);
5441 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5442
5443 return mkfifo1(vfs_context_current(), uap->path, &va, AT_FDCWD);
5444 }
5445
5446 int
mkfifoat(proc_t p,struct mkfifoat_args * uap,__unused int32_t * retval)5447 mkfifoat(proc_t p, struct mkfifoat_args *uap, __unused int32_t *retval)
5448 {
5449 struct vnode_attr va;
5450
5451 VATTR_INIT(&va);
5452 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5453
5454 return mkfifo1(vfs_context_current(), uap->path, &va, uap->fd);
5455 }
5456
5457 extern int safe_getpath_new(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path, int firmlink);
5458 extern int safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path);
5459 extern int safe_getpath_no_firmlink(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path);
5460
/*
 * Best-effort path construction for dvp (optionally with leafname appended).
 * Never fails outright: on lookup failure it walks up the parent chain, and
 * as a last resort reports the mount point or "/". Sets *truncated_path when
 * the result does not fully identify the target. Returns the string length
 * including the terminating NUL (as produced by vn_getpath()).
 */
int
safe_getpath_new(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path, int firmlink)
{
	int ret, len = _len;

	*truncated_path = 0;

	/* Resolve the directory's path, following firmlinks only if requested. */
	if (firmlink) {
		ret = vn_getpath(dvp, path, &len);
	} else {
		ret = vn_getpath_no_firmlink(dvp, path, &len);
	}
	if (ret == 0 && len < (MAXPATHLEN - 1)) {
		if (leafname) {
			/* len includes the NUL: overwrite it with '/' and append the leaf. */
			path[len - 1] = '/';
			len += strlcpy(&path[len], leafname, MAXPATHLEN - len) + 1;
			if (len > MAXPATHLEN) {
				char *ptr;

				// the string got truncated!
				*truncated_path = 1;
				ptr = strrchr(path, '/');
				if (ptr) {
					*ptr = '\0'; // chop off the string at the last directory component
				}
				len = (int)strlen(path) + 1;
			}
		}
	} else if (ret == 0) {
		/* Path resolved but left no room to append a leaf: report truncation. */
		*truncated_path = 1;
	} else if (ret != 0) {
		struct vnode *mydvp = dvp;

		if (ret != ENOSPC) {
			printf("safe_getpath: failed to get the path for vp %p (%s) : err %d\n",
			    dvp, dvp->v_name ? dvp->v_name : "no-name", ret);
		}
		*truncated_path = 1;

		/* Walk up the parent chain until some ancestor's path fits the buffer. */
		do {
			if (mydvp->v_parent != NULL) {
				mydvp = mydvp->v_parent;
			} else if (mydvp->v_mount) {
				strlcpy(path, mydvp->v_mount->mnt_vfsstat.f_mntonname, _len);
				break;
			} else {
				// no parent and no mount point? only thing is to punt and say "/" changed
				strlcpy(path, "/", _len);
				len = 2;
				mydvp = NULL;
			}

			if (mydvp == NULL) {
				break;
			}

			len = _len;
			if (firmlink) {
				ret = vn_getpath(mydvp, path, &len);
			} else {
				ret = vn_getpath_no_firmlink(mydvp, path, &len);
			}
		} while (ret == ENOSPC);
	}

	return len;
}
5528
/*
 * Build the path for dvp/leafname, allowing the result to traverse
 * firmlinks.  Thin wrapper around safe_getpath_new().
 */
int
safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path)
{
	const int use_firmlink_paths = 1;

	return safe_getpath_new(dvp, leafname, path, _len, truncated_path,
	           use_firmlink_paths);
}
5534
/*
 * Build the path for dvp/leafname while avoiding firmlink traversal.
 * Thin wrapper around safe_getpath_new().
 */
int
safe_getpath_no_firmlink(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path)
{
	const int use_firmlink_paths = 0;

	return safe_getpath_new(dvp, leafname, path, _len, truncated_path,
	           use_firmlink_paths);
}
5540
5541 /*
5542 * Make a hard file link.
5543 *
5544 * Returns: 0 Success
5545 * EPERM
5546 * EEXIST
5547 * EXDEV
5548 * namei:???
5549 * vnode_authorize:???
5550 * VNOP_LINK:???
5551 */
5552 /* ARGSUSED */
static int
linkat_internal(vfs_context_t ctx, int fd1, user_addr_t path, int fd2,
    user_addr_t link, int flag, enum uio_seg segflg)
{
	vnode_t vp, pvp, dvp, lvp;
	struct nameidata nd;
	int follow;
	int error;
#if CONFIG_FSE
	fse_info finfo;
#endif
	int need_event, has_listeners, need_kpath2;
	char *target_path = NULL;
	char *no_firmlink_path = NULL;
	int truncated = 0;
	int truncated_no_firmlink_path = 0;

	vp = dvp = lvp = NULLVP;

	/* look up the object we are linking to */
	follow = (flag & AT_SYMLINK_FOLLOW) ? FOLLOW : NOFOLLOW;
	NDINIT(&nd, LOOKUP, OP_LOOKUP, AUDITVNPATH1 | follow,
	    segflg, path, ctx);

	error = nameiat(&nd, fd1);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;	/* holds an iocount until the final vnode_put() */

	nameidone(&nd);

	/*
	 * Normally, linking to directories is not supported.
	 * However, some file systems may have limited support.
	 */
	if (vp->v_type == VDIR) {
		if (!ISSET(vp->v_mount->mnt_kern_flag, MNTK_DIR_HARDLINKS)) {
			error = EPERM;   /* POSIX */
			goto out;
		}

		/* Linking to a directory requires ownership. */
		if (!kauth_cred_issuser(vfs_context_ucred(ctx))) {
			struct vnode_attr dva;

			VATTR_INIT(&dva);
			VATTR_WANTED(&dva, va_uid);
			if (vnode_getattr(vp, &dva, ctx) != 0 ||
			    !VATTR_IS_SUPPORTED(&dva, va_uid) ||
			    (dva.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)))) {
				error = EACCES;
				goto out;
			}
		}
	}

	/*
	 * lookup the target node: the same nameidata is reused for the
	 * second (CREATE) lookup of the new link name.
	 */
#if CONFIG_TRIGGERS
	nd.ni_op = OP_LINK;
#endif
	nd.ni_cnd.cn_nameiop = CREATE;
	nd.ni_cnd.cn_flags = LOCKPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK;
	nd.ni_dirp = link;
	error = nameiat(&nd, fd2);
	if (error != 0) {
		goto out;
	}
	dvp = nd.ni_dvp;	/* parent directory for the new link */
	lvp = nd.ni_vp;		/* non-NULL means the target name exists */

#if CONFIG_MACF
	if ((error = mac_vnode_check_link(ctx, dvp, vp, &nd.ni_cnd)) != 0) {
		goto out2;
	}
#endif

	/* or to anything that kauth doesn't want us to (eg. immutable items) */
	if ((error = vnode_authorize(vp, NULL, KAUTH_VNODE_LINKTARGET, ctx)) != 0) {
		goto out2;
	}

	/* target node must not exist */
	if (lvp != NULLVP) {
		error = EEXIST;
		goto out2;
	}
	/* cannot link across mountpoints */
	if (vnode_mount(vp) != vnode_mount(dvp)) {
		error = EXDEV;
		goto out2;
	}

	/* authorize creation of the target note */
	if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
		goto out2;
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

	/* and finally make the link */
	error = VNOP_LINK(vp, dvp, &nd.ni_cnd, ctx);
	if (error) {
		goto out2;
	}

#if CONFIG_MACF
	(void)mac_vnode_notify_link(ctx, vp, dvp, &nd.ni_cnd);
#endif

#if CONFIG_FSE
	need_event = need_fsevent(FSE_CREATE_FILE, dvp);
#else
	need_event = 0;
#endif
	has_listeners = kauth_authorize_fileop_has_listeners();

	need_kpath2 = 0;
#if CONFIG_AUDIT
	if (AUDIT_RECORD_EXISTS()) {
		need_kpath2 = 1;
	}
#endif

	/* path construction is only needed for fsevents / kauth / audit */
	if (need_event || has_listeners || need_kpath2) {
		char *link_to_path = NULL;
		int len, link_name_len;
		int len_no_firmlink_path = 0;

		/* build the path to the new link file */
		GET_PATH(target_path);

		len = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, target_path, MAXPATHLEN, &truncated);
		if (no_firmlink_path == NULL) {
			GET_PATH(no_firmlink_path);
		}
		len_no_firmlink_path = safe_getpath_no_firmlink(dvp, nd.ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);

		AUDIT_ARG(kpath, target_path, ARG_KPATH2);

		if (has_listeners) {
			/* build the path to file we are linking to */
			GET_PATH(link_to_path);

			link_name_len = MAXPATHLEN;
			if (vn_getpath(vp, link_to_path, &link_name_len) == 0) {
				/*
				 * Call out to allow 3rd party notification of rename.
				 * Ignore result of kauth_authorize_fileop call.
				 */
				kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_LINK,
				    (uintptr_t)link_to_path,
				    (uintptr_t)target_path);
			}
			if (link_to_path != NULL) {
				RELEASE_PATH(link_to_path);
			}
		}
#if CONFIG_FSE
		if (need_event) {
			/* construct fsevent */
			if (get_fse_info(vp, &finfo, ctx) == 0) {
				if (truncated_no_firmlink_path) {
					finfo.mode |= FSE_TRUNCATED_PATH;
				}

				// build the path to the destination of the link
				add_fsevent(FSE_CREATE_FILE, ctx,
				    FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
				    FSE_ARG_FINFO, &finfo,
				    FSE_ARG_DONE);
			}

			pvp = vp->v_parent;
			// need an iocount on parent vnode in this case
			if (pvp && pvp != dvp) {
				pvp = vnode_getparent_if_different(vp, dvp);
			}
			if (pvp) {
				add_fsevent(FSE_STAT_CHANGED, ctx,
				    FSE_ARG_VNODE, pvp, FSE_ARG_DONE);
			}
			if (pvp && pvp != dvp) {
				/* drop the iocount taken by vnode_getparent_if_different() */
				vnode_put(pvp);
			}
		}
#endif
	}
out2:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);
	if (target_path != NULL) {
		RELEASE_PATH(target_path);
	}
	if (no_firmlink_path != NULL) {
		RELEASE_PATH(no_firmlink_path);
		no_firmlink_path = NULL;
	}
out:
	if (lvp) {
		vnode_put(lvp);
	}
	if (dvp) {
		vnode_put(dvp);
	}
	vnode_put(vp);
	return error;
}
5766
5767 int
link(__unused proc_t p,struct link_args * uap,__unused int32_t * retval)5768 link(__unused proc_t p, struct link_args *uap, __unused int32_t *retval)
5769 {
5770 return linkat_internal(vfs_context_current(), AT_FDCWD, uap->path,
5771 AT_FDCWD, uap->link, AT_SYMLINK_FOLLOW, UIO_USERSPACE);
5772 }
5773
5774 int
linkat(__unused proc_t p,struct linkat_args * uap,__unused int32_t * retval)5775 linkat(__unused proc_t p, struct linkat_args *uap, __unused int32_t *retval)
5776 {
5777 if (uap->flag & ~AT_SYMLINK_FOLLOW) {
5778 return EINVAL;
5779 }
5780
5781 return linkat_internal(vfs_context_current(), uap->fd1, uap->path,
5782 uap->fd2, uap->link, uap->flag, UIO_USERSPACE);
5783 }
5784
5785 /*
5786 * Make a symbolic link.
5787 *
5788 * We could add support for ACLs here too...
5789 */
5790 /* ARGSUSED */
static int
symlinkat_internal(vfs_context_t ctx, user_addr_t path_data, int fd,
    user_addr_t link, enum uio_seg segflg)
{
	struct vnode_attr va;
	char *path;		/* the link contents (what the symlink points at) */
	int error;
	struct nameidata nd;
	vnode_t vp, dvp;
	size_t dummy = 0;
	proc_t p;

	error = 0;
	/* copy the link contents into a kernel buffer if they live in user space */
	if (UIO_SEG_IS_USER_SPACE(segflg)) {
		path = zalloc(ZV_NAMEI);
		error = copyinstr(path_data, path, MAXPATHLEN, &dummy);
	} else {
		path = (char *)path_data;
	}
	if (error) {
		goto out;
	}
	AUDIT_ARG(text, path);  /* This is the link string */

	NDINIT(&nd, CREATE, OP_SYMLINK, LOCKPARENT | AUDITVNPATH1,
	    segflg, link, ctx);

	error = nameiat(&nd, fd);
	if (error) {
		goto out;
	}
	dvp = nd.ni_dvp;	/* parent directory of the new symlink */
	vp = nd.ni_vp;		/* non-NULL means the name already exists */

	p = vfs_context_proc(ctx);
	VATTR_INIT(&va);
	VATTR_SET(&va, va_type, VLNK);
	/* symlink mode: all permission bits, masked by the process umask */
	VATTR_SET(&va, va_mode, ACCESSPERMS & ~p->p_fd.fd_cmask);

#if CONFIG_MACF
	error = mac_vnode_check_create(ctx,
	    dvp, &nd.ni_cnd, &va);
#endif
	if (error != 0) {
		goto skipit;
	}

	if (vp != NULL) {
		error = EEXIST;
		goto skipit;
	}

	/* authorize */
	if (error == 0) {
		error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx);
	}
	/* get default ownership, etc. */
	if (error == 0) {
		error = vnode_authattr_new(dvp, &va, 0, ctx);
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

	if (error == 0) {
		error = VNOP_SYMLINK(dvp, &vp, &nd.ni_cnd, &va, path, ctx);
	}

	/* do fallback attribute handling for attrs the FS didn't set itself */
	if (error == 0 && vp) {
		error = vnode_setattr_fallback(vp, &va, ctx);
	}

#if CONFIG_MACF
	if (error == 0 && vp) {
		error = vnode_label(vnode_mount(vp), dvp, vp, &nd.ni_cnd, VNODE_LABEL_CREATE, ctx);
	}
#endif

	if (error == 0) {
		int update_flags = 0;

		/*check if a new vnode was created, else try to get one*/
		if (vp == NULL) {
			/* VNOP_SYMLINK didn't return the vnode; look it up */
			nd.ni_cnd.cn_nameiop = LOOKUP;
#if CONFIG_TRIGGERS
			nd.ni_op = OP_LOOKUP;
#endif
			/*
			 * Clear all flags except HASBUF to prevent 'cn_pnbuf' buffer to be
			 * reallocated again in namei().
			 */
			nd.ni_cnd.cn_flags &= HASBUF;
			error = nameiat(&nd, fd);
			if (error) {
				goto skipit;
			}
			vp = nd.ni_vp;
		}

#if 0  /* XXX - kauth_todo - is KAUTH_FILEOP_SYMLINK needed? */
		/* call out to allow 3rd party notification of rename.
		 * Ignore result of kauth_authorize_fileop call.
		 */
		if (kauth_authorize_fileop_has_listeners() &&
		    namei(&nd) == 0) {
			char *new_link_path = NULL;
			int len;

			/* build the path to the new link file */
			new_link_path = get_pathbuff();
			len = MAXPATHLEN;
			vn_getpath(dvp, new_link_path, &len);
			if ((len + 1 + nd.ni_cnd.cn_namelen + 1) < MAXPATHLEN) {
				new_link_path[len - 1] = '/';
				strlcpy(&new_link_path[len], nd.ni_cnd.cn_nameptr, MAXPATHLEN - len);
			}

			kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_SYMLINK,
			    (uintptr_t)path, (uintptr_t)new_link_path);
			if (new_link_path != NULL) {
				release_pathbuff(new_link_path);
			}
		}
#endif
		// Make sure the name & parent pointers are hooked up
		if (vp->v_name == NULL) {
			update_flags |= VNODE_UPDATE_NAME;
		}
		if (vp->v_parent == NULLVP) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		if (update_flags) {
			vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
		}

#if CONFIG_FSE
		add_fsevent(FSE_CREATE_FILE, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
#endif
	}

skipit:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	vnode_put(dvp);
out:
	/* free the copyin buffer, but never a kernel-space caller's string */
	if (path && (path != (char *)path_data)) {
		zfree(ZV_NAMEI, path);
	}

	return error;
}
5954
5955 int
symlink(__unused proc_t p,struct symlink_args * uap,__unused int32_t * retval)5956 symlink(__unused proc_t p, struct symlink_args *uap, __unused int32_t *retval)
5957 {
5958 return symlinkat_internal(vfs_context_current(), uap->path, AT_FDCWD,
5959 uap->link, UIO_USERSPACE);
5960 }
5961
5962 int
symlinkat(__unused proc_t p,struct symlinkat_args * uap,__unused int32_t * retval)5963 symlinkat(__unused proc_t p, struct symlinkat_args *uap,
5964 __unused int32_t *retval)
5965 {
5966 return symlinkat_internal(vfs_context_current(), uap->path1, uap->fd,
5967 uap->path2, UIO_USERSPACE);
5968 }
5969
5970 /*
5971 * Delete a whiteout from the filesystem.
5972 * No longer supported.
5973 */
5974 int
undelete(__unused proc_t p,__unused struct undelete_args * uap,__unused int32_t * retval)5975 undelete(__unused proc_t p, __unused struct undelete_args *uap, __unused int32_t *retval)
5976 {
5977 return ENOTSUP;
5978 }
5979
5980 /*
5981 * Delete a name from the filesystem.
5982 */
5983 /* ARGSUSED */
static int
unlinkat_internal(vfs_context_t ctx, int fd, vnode_t start_dvp,
    user_addr_t path_arg, enum uio_seg segflg, int unlink_flags)
{
	/* heap-allocated to keep these large structures off the kernel stack */
	struct {
		struct nameidata nd;
#if CONFIG_FSE
		struct vnode_attr va;
		fse_info finfo;
#endif
	} *__unlink_data;
	struct nameidata *ndp;
	vnode_t vp, dvp;
	int error;
	struct componentname *cnp;
	char *path = NULL;
	char *no_firmlink_path = NULL;
	int len_path = 0;
	int len_no_firmlink_path = 0;
	int flags;
	int need_event;
	int has_listeners;
	int truncated_path;
	int truncated_no_firmlink_path;
	int batched;		/* non-zero when using a compound remove VNOP */
	struct vnode_attr *vap;
	int do_retry;
	int retry_count = 0;
	int cn_flags;

	cn_flags = LOCKPARENT;
	if (!(unlink_flags & VNODE_REMOVE_NO_AUDIT_PATH)) {
		cn_flags |= AUDITVNPATH1;
	}
	/* If a starting dvp is passed, it trumps any fd passed. */
	if (start_dvp) {
		cn_flags |= USEDVP;
	}

#if NAMEDRSRCFORK
	/* unlink or delete is allowed on rsrc forks and named streams */
	cn_flags |= CN_ALLOWRSRCFORK;
#endif

	__unlink_data = kalloc_type(typeof(*__unlink_data), Z_WAITOK);
	ndp = &__unlink_data->nd;
#if CONFIG_FSE
	fse_info *finfop = &__unlink_data->finfo;
#endif

retry:
	/* reset all per-attempt state; we may come back here on ENOENT races */
	do_retry = 0;
	flags = 0;
	need_event = 0;
	has_listeners = 0;
	truncated_path = 0;
	truncated_no_firmlink_path = 0;
	vap = NULL;

	NDINIT(ndp, DELETE, OP_UNLINK, cn_flags, segflg, path_arg, ctx);

	ndp->ni_dvp = start_dvp;
	ndp->ni_flag |= NAMEI_COMPOUNDREMOVE;
	cnp = &ndp->ni_cnd;

continue_lookup:
	error = nameiat(ndp, fd);
	if (error) {
		goto early_out;
	}

	dvp = ndp->ni_dvp;
	vp = ndp->ni_vp;	/* may be NULL if the FS supports compound remove */

	/* With Carbon delete semantics, busy files cannot be deleted */
	if (unlink_flags & VNODE_REMOVE_NODELETEBUSY) {
		flags |= VNODE_REMOVE_NODELETEBUSY;
	}

	/* Skip any potential upcalls if told to. */
	if (unlink_flags & VNODE_REMOVE_SKIP_NAMESPACE_EVENT) {
		flags |= VNODE_REMOVE_SKIP_NAMESPACE_EVENT;
	}

	if (vp) {
		batched = vnode_compound_remove_available(vp);
		/*
		 * The root of a mounted filesystem cannot be deleted.
		 */
		if ((vp->v_flag & VROOT) || (dvp->v_mount != vp->v_mount)) {
			error = EBUSY;
			goto out;
		}

#if DEVELOPMENT || DEBUG
		/*
		 * XXX VSWAP: Check for entitlements or special flag here
		 * so we can restrict access appropriately.
		 */
#else /* DEVELOPMENT || DEBUG */

		/* only the kernel may remove an in-use swap file */
		if (vnode_isswap(vp) && (ctx != vfs_context_kernel())) {
			error = EPERM;
			goto out;
		}
#endif /* DEVELOPMENT || DEBUG */

		if (!batched) {
			error = vn_authorize_unlink(dvp, vp, cnp, ctx, NULL);
			if (error) {
				if (error == ENOENT) {
					/* lookup raced with a concurrent remove; retry */
					if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
						do_retry = 1;
						retry_count++;
					}
				}
				goto out;
			}
		}
	} else {
		batched = 1;

		if (!vnode_compound_remove_available(dvp)) {
			panic("No vp, but no compound remove?");
		}
	}

#if CONFIG_FSE
	need_event = need_fsevent(FSE_DELETE, dvp);
	if (need_event) {
		if (!batched) {
			if ((vp->v_flag & VISHARDLINK) == 0) {
				/* XXX need to get these data in batched VNOP */
				get_fse_info(vp, finfop, ctx);
			}
		} else {
			error =
			    vfs_get_notify_attributes(&__unlink_data->va);
			if (error) {
				goto out;
			}

			vap = &__unlink_data->va;
		}
	}
#endif
	has_listeners = kauth_authorize_fileop_has_listeners();
	if (need_event || has_listeners) {
		if (path == NULL) {
			GET_PATH(path);
		}
		len_path = safe_getpath(dvp, ndp->ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path);
		if (no_firmlink_path == NULL) {
			GET_PATH(no_firmlink_path);
		}
		len_no_firmlink_path = safe_getpath_no_firmlink(dvp, ndp->ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);
	}

#if NAMEDRSRCFORK
	if (ndp->ni_cnd.cn_flags & CN_WANTSRSRCFORK) {
		error = vnode_removenamedstream(dvp, vp, XATTR_RESOURCEFORK_NAME, 0, ctx);
	} else
#endif
	{
#if CONFIG_FILE_LEASES
		vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

		error = vn_remove(dvp, &ndp->ni_vp, ndp, flags, vap, ctx);
		vp = ndp->ni_vp;
		if (error == EKEEPLOOKING) {
			/* the compound VNOP needs the lookup to be continued */
			if (!batched) {
				panic("EKEEPLOOKING, but not a filesystem that supports compound VNOPs?");
			}

			if ((ndp->ni_flag & NAMEI_CONTLOOKUP) == 0) {
				panic("EKEEPLOOKING, but continue flag not set?");
			}

			if (vnode_isdir(vp)) {
				error = EISDIR;
				goto out;
			}
			goto continue_lookup;
		} else if (error == ENOENT && batched) {
			if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
				/*
				 * For compound VNOPs, the authorization callback may
				 * return ENOENT in case of racing hardlink lookups
				 * hitting the name cache, redrive the lookup.
				 */
				do_retry = 1;
				retry_count += 1;
				goto out;
			}
		}
	}

	/*
	 * Call out to allow 3rd party notification of delete.
	 * Ignore result of kauth_authorize_fileop call.
	 */
	if (!error) {
		if (has_listeners) {
			kauth_authorize_fileop(vfs_context_ucred(ctx),
			    KAUTH_FILEOP_DELETE,
			    (uintptr_t)vp,
			    (uintptr_t)path);
		}

		if (vp->v_flag & VISHARDLINK) {
			//
			// if a hardlink gets deleted we want to blow away the
			// v_parent link because the path that got us to this
			// instance of the link is no longer valid.  this will
			// force the next call to get the path to ask the file
			// system instead of just following the v_parent link.
			//
			vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
		}

#if CONFIG_FSE
		if (need_event) {
			if (vp->v_flag & VISHARDLINK) {
				get_fse_info(vp, finfop, ctx);
			} else if (vap) {
				vnode_get_fse_info_from_vap(vp, finfop, vap);
			}
			if (truncated_path) {
				finfop->mode |= FSE_TRUNCATED_PATH;
			}
			add_fsevent(FSE_DELETE, ctx,
			    FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
			    FSE_ARG_FINFO, finfop,
			    FSE_ARG_DONE);
		}
#endif

#if CONFIG_MACF
		mac_vnode_notify_unlink(ctx, dvp, vp, cnp);
#endif
	}

out:
	if (path != NULL) {
		RELEASE_PATH(path);
		path = NULL;
	}

	if (no_firmlink_path != NULL) {
		RELEASE_PATH(no_firmlink_path);
		no_firmlink_path = NULL;
	}
#if NAMEDRSRCFORK
	/* recycle the deleted rsrc fork vnode to force a reclaim, which
	 * will cause its shadow file to go away if necessary.
	 */
	if (vp && (vnode_isnamedstream(vp)) &&
	    (vp->v_parent != NULLVP) &&
	    vnode_isshadow(vp)) {
		vnode_recycle(vp);
	}
#endif
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(ndp);
	vnode_put(dvp);
	if (vp) {
		vnode_put(vp);
	}

	if (do_retry) {
		goto retry;
	}

early_out:
	kfree_type(typeof(*__unlink_data), __unlink_data);
	return error;
}
6265
6266 int
unlink1(vfs_context_t ctx,vnode_t start_dvp,user_addr_t path_arg,enum uio_seg segflg,int unlink_flags)6267 unlink1(vfs_context_t ctx, vnode_t start_dvp, user_addr_t path_arg,
6268 enum uio_seg segflg, int unlink_flags)
6269 {
6270 return unlinkat_internal(ctx, AT_FDCWD, start_dvp, path_arg, segflg,
6271 unlink_flags);
6272 }
6273
6274 /*
6275 * Delete a name from the filesystem using Carbon semantics.
6276 */
6277 int
delete(__unused proc_t p,struct delete_args * uap,__unused int32_t * retval)6278 delete(__unused proc_t p, struct delete_args *uap, __unused int32_t *retval)
6279 {
6280 return unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
6281 uap->path, UIO_USERSPACE, VNODE_REMOVE_NODELETEBUSY);
6282 }
6283
6284 /*
6285 * Delete a name from the filesystem using POSIX semantics.
6286 */
6287 int
unlink(__unused proc_t p,struct unlink_args * uap,__unused int32_t * retval)6288 unlink(__unused proc_t p, struct unlink_args *uap, __unused int32_t *retval)
6289 {
6290 return unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
6291 uap->path, UIO_USERSPACE, 0);
6292 }
6293
6294 int
unlinkat(__unused proc_t p,struct unlinkat_args * uap,__unused int32_t * retval)6295 unlinkat(__unused proc_t p, struct unlinkat_args *uap, __unused int32_t *retval)
6296 {
6297 if (uap->flag & ~(AT_REMOVEDIR | AT_REMOVEDIR_DATALESS)) {
6298 return EINVAL;
6299 }
6300
6301 if (uap->flag & (AT_REMOVEDIR | AT_REMOVEDIR_DATALESS)) {
6302 int unlink_flags = 0;
6303
6304 if (uap->flag & AT_REMOVEDIR_DATALESS) {
6305 unlink_flags |= VNODE_REMOVE_DATALESS_DIR;
6306 }
6307 return rmdirat_internal(vfs_context_current(), uap->fd,
6308 uap->path, UIO_USERSPACE, unlink_flags);
6309 } else {
6310 return unlinkat_internal(vfs_context_current(), uap->fd,
6311 NULLVP, uap->path, UIO_USERSPACE, 0);
6312 }
6313 }
6314
6315 /*
6316 * Reposition read/write file offset.
6317 */
int
lseek(proc_t p, struct lseek_args *uap, off_t *retval)
{
	struct fileproc *fp;
	vnode_t vp;
	struct vfs_context *ctx;
	off_t offset = uap->offset, file_size;
	int error;

	if ((error = fp_getfvp(p, uap->fd, &fp, &vp))) {
		/* a non-vnode fd (e.g. a pipe) is not seekable */
		if (error == ENOTSUP) {
			return ESPIPE;
		}
		return error;
	}
	if (vnode_isfifo(vp)) {
		file_drop(uap->fd);
		return ESPIPE;
	}


	ctx = vfs_context_current();
#if CONFIG_MACF
	/* lseek(fd, 0, SEEK_CUR) only reads the offset; check accordingly */
	if (uap->whence == L_INCR && uap->offset == 0) {
		error = mac_file_check_get_offset(vfs_context_ucred(ctx),
		    fp->fp_glob);
	} else {
		error = mac_file_check_change_offset(vfs_context_ucred(ctx),
		    fp->fp_glob);
	}
	if (error) {
		file_drop(uap->fd);
		return error;
	}
#endif
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	/* compute the candidate offset according to 'whence' */
	switch (uap->whence) {
	case L_INCR:
		offset += fp->fp_glob->fg_offset;
		break;
	case L_XTND:
		if ((error = vnode_size(vp, &file_size, ctx)) != 0) {
			break;
		}
		offset += file_size;
		break;
	case L_SET:
		break;
	case SEEK_HOLE:
		/* delegated to the filesystem via ioctl */
		error = VNOP_IOCTL(vp, FSIOC_FIOSEEKHOLE, (caddr_t)&offset, 0, ctx);
		break;
	case SEEK_DATA:
		error = VNOP_IOCTL(vp, FSIOC_FIOSEEKDATA, (caddr_t)&offset, 0, ctx);
		break;
	default:
		error = EINVAL;
	}
	if (error == 0) {
		if (uap->offset > 0 && offset < 0) {
			/* Incremented/relative move past max size */
			error = EOVERFLOW;
		} else {
			/*
			 * Allow negative offsets on character devices, per
			 * POSIX 1003.1-2001.  Most likely for writing disk
			 * labels.
			 */
			if (offset < 0 && vp->v_type != VCHR) {
				/* Decremented/relative move before start */
				error = EINVAL;
			} else {
				/* Success */
				fp->fp_glob->fg_offset = offset;
				*retval = fp->fp_glob->fg_offset;
			}
		}
	}

	/*
	 * An lseek can affect whether data is "available to read."  Use
	 * hint of NOTE_NONE so no EVFILT_VNODE events fire
	 */
	post_event_if_success(vp, error, NOTE_NONE);
	(void)vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
6409
6410
6411 /*
6412 * Check access permissions.
6413 *
6414 * Returns: 0 Success
6415 * vnode_authorize:???
6416 */
6417 static int
access1(vnode_t vp,vnode_t dvp,int uflags,vfs_context_t ctx)6418 access1(vnode_t vp, vnode_t dvp, int uflags, vfs_context_t ctx)
6419 {
6420 kauth_action_t action;
6421 int error;
6422
6423 /*
6424 * If just the regular access bits, convert them to something
6425 * that vnode_authorize will understand.
6426 */
6427 if (!(uflags & _ACCESS_EXTENDED_MASK)) {
6428 action = 0;
6429 if (uflags & R_OK) {
6430 action |= KAUTH_VNODE_READ_DATA; /* aka KAUTH_VNODE_LIST_DIRECTORY */
6431 }
6432 if (uflags & W_OK) {
6433 if (vnode_isdir(vp)) {
6434 action |= KAUTH_VNODE_ADD_FILE |
6435 KAUTH_VNODE_ADD_SUBDIRECTORY;
6436 /* might want delete rights here too */
6437 } else {
6438 action |= KAUTH_VNODE_WRITE_DATA;
6439 }
6440 }
6441 if (uflags & X_OK) {
6442 if (vnode_isdir(vp)) {
6443 action |= KAUTH_VNODE_SEARCH;
6444 } else {
6445 action |= KAUTH_VNODE_EXECUTE;
6446 }
6447 }
6448 } else {
6449 /* take advantage of definition of uflags */
6450 action = uflags >> 8;
6451 }
6452
6453 #if CONFIG_MACF
6454 error = mac_vnode_check_access(ctx, vp, uflags);
6455 if (error) {
6456 return error;
6457 }
6458 #endif /* MAC */
6459
6460 /* action == 0 means only check for existence */
6461 if (action != 0) {
6462 error = vnode_authorize(vp, dvp, action | KAUTH_VNODE_ACCESS, ctx);
6463 } else {
6464 error = 0;
6465 }
6466
6467 return error;
6468 }
6469
6470
6471
6472 /*
6473 * access_extended: Check access permissions in bulk.
6474 *
6475 * Description: uap->entries Pointer to an array of accessx
6476 * descriptor structs, plus one or
6477 * more NULL terminated strings (see
6478 * "Notes" section below).
6479 * uap->size Size of the area pointed to by
6480 * uap->entries.
6481 * uap->results Pointer to the results array.
6482 *
6483 * Returns: 0 Success
6484 * ENOMEM Insufficient memory
6485 * EINVAL Invalid arguments
6486 * namei:EFAULT Bad address
6487 * namei:ENAMETOOLONG Filename too long
6488 * namei:ENOENT No such file or directory
6489 * namei:ELOOP Too many levels of symbolic links
6490 * namei:EBADF Bad file descriptor
6491 * namei:ENOTDIR Not a directory
6492 * namei:???
6493 * access1:
6494 *
6495 * Implicit returns:
6496 * uap->results Array contents modified
6497 *
6498 * Notes: The uap->entries are structured as an arbitrary length array
6499 * of accessx descriptors, followed by one or more NULL terminated
6500 * strings
6501 *
6502 * struct accessx_descriptor[0]
6503 * ...
6504 * struct accessx_descriptor[n]
6505 * char name_data[0];
6506 *
6507 * We determine the entry count by walking the buffer containing
6508 * the uap->entries argument descriptor. For each descriptor we
6509 * see, the valid values for the offset ad_name_offset will be
6510 * in the byte range:
6511 *
6512 * [ uap->entries + sizeof(struct accessx_descriptor) ]
6513 * to
6514 * [ uap->entries + uap->size - 2 ]
6515 *
6516 * since we must have at least one string, and the string must
6517 * be at least one character plus the NULL terminator in length.
6518 *
6519 * XXX: Need to support the check-as uid argument
6520 */
6521 int
access_extended(__unused proc_t p,struct access_extended_args * uap,__unused int32_t * retval)6522 access_extended(__unused proc_t p, struct access_extended_args *uap, __unused int32_t *retval)
6523 {
6524 struct accessx_descriptor *input = NULL;
6525 errno_t *result = NULL;
6526 errno_t error = 0;
6527 int wantdelete = 0;
6528 size_t desc_max, desc_actual = 0;
6529 unsigned int i, j;
6530 struct vfs_context context;
6531 struct nameidata nd;
6532 int niopts;
6533 vnode_t vp = NULL;
6534 vnode_t dvp = NULL;
6535 #define ACCESSX_MAX_DESCR_ON_STACK 10
6536 struct accessx_descriptor stack_input[ACCESSX_MAX_DESCR_ON_STACK];
6537
6538 context.vc_ucred = NULL;
6539
6540 /*
6541 * Validate parameters; if valid, copy the descriptor array and string
6542 * arguments into local memory. Before proceeding, the following
6543 * conditions must have been met:
6544 *
6545 * o The total size is not permitted to exceed ACCESSX_MAX_TABLESIZE
6546 * o There must be sufficient room in the request for at least one
6547 * descriptor and a one yte NUL terminated string.
6548 * o The allocation of local storage must not fail.
6549 */
6550 if (uap->size > ACCESSX_MAX_TABLESIZE) {
6551 return ENOMEM;
6552 }
6553 if (uap->size < (sizeof(struct accessx_descriptor) + 2)) {
6554 return EINVAL;
6555 }
6556 if (uap->size <= sizeof(stack_input)) {
6557 input = stack_input;
6558 } else {
6559 input = kalloc_data(uap->size, Z_WAITOK);
6560 if (input == NULL) {
6561 error = ENOMEM;
6562 goto out;
6563 }
6564 }
6565 error = copyin(uap->entries, input, uap->size);
6566 if (error) {
6567 goto out;
6568 }
6569
6570 AUDIT_ARG(opaque, input, uap->size);
6571
6572 /*
6573 * Force NUL termination of the copyin buffer to avoid nami() running
6574 * off the end. If the caller passes us bogus data, they may get a
6575 * bogus result.
6576 */
6577 ((char *)input)[uap->size - 1] = 0;
6578
6579 /*
6580 * Access is defined as checking against the process' real identity,
6581 * even if operations are checking the effective identity. This
6582 * requires that we use a local vfs context.
6583 */
6584 context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
6585 context.vc_thread = current_thread();
6586
6587 /*
6588 * Find out how many entries we have, so we can allocate the result
6589 * array by walking the list and adjusting the count downward by the
6590 * earliest string offset we see.
6591 */
6592 desc_max = (uap->size - 2) / sizeof(struct accessx_descriptor);
6593 desc_actual = desc_max;
6594 for (i = 0; i < desc_actual; i++) {
6595 /*
6596 * Take the offset to the name string for this entry and
6597 * convert to an input array index, which would be one off
6598 * the end of the array if this entry was the lowest-addressed
6599 * name string.
6600 */
6601 j = input[i].ad_name_offset / sizeof(struct accessx_descriptor);
6602
6603 /*
6604 * An offset greater than the max allowable offset is an error.
6605 * It is also an error for any valid entry to point
6606 * to a location prior to the end of the current entry, if
6607 * it's not a reference to the string of the previous entry.
6608 */
6609 if (j > desc_max || (j != 0 && j <= i)) {
6610 error = EINVAL;
6611 goto out;
6612 }
6613
6614 /* Also do not let ad_name_offset point to something beyond the size of the input */
6615 if (input[i].ad_name_offset >= uap->size) {
6616 error = EINVAL;
6617 goto out;
6618 }
6619
6620 /*
6621 * An offset of 0 means use the previous descriptor's offset;
6622 * this is used to chain multiple requests for the same file
6623 * to avoid multiple lookups.
6624 */
6625 if (j == 0) {
6626 /* This is not valid for the first entry */
6627 if (i == 0) {
6628 error = EINVAL;
6629 goto out;
6630 }
6631 continue;
6632 }
6633
6634 /*
6635 * If the offset of the string for this descriptor is before
6636 * what we believe is the current actual last descriptor,
6637 * then we need to adjust our estimate downward; this permits
6638 * the string table following the last descriptor to be out
6639 * of order relative to the descriptor list.
6640 */
6641 if (j < desc_actual) {
6642 desc_actual = j;
6643 }
6644 }
6645
6646 /*
6647 * We limit the actual number of descriptors we are willing to process
6648 * to a hard maximum of ACCESSX_MAX_DESCRIPTORS. If the number being
6649 * requested does not exceed this limit,
6650 */
6651 if (desc_actual > ACCESSX_MAX_DESCRIPTORS) {
6652 error = ENOMEM;
6653 goto out;
6654 }
6655 result = kalloc_data(desc_actual * sizeof(errno_t), Z_WAITOK | Z_ZERO);
6656 if (result == NULL) {
6657 error = ENOMEM;
6658 goto out;
6659 }
6660
6661 /*
6662 * Do the work by iterating over the descriptor entries we know to
6663 * at least appear to contain valid data.
6664 */
6665 error = 0;
6666 for (i = 0; i < desc_actual; i++) {
6667 /*
6668 * If the ad_name_offset is 0, then we use the previous
6669 * results to make the check; otherwise, we are looking up
6670 * a new file name.
6671 */
6672 if (input[i].ad_name_offset != 0) {
6673 /* discard old vnodes */
6674 if (vp) {
6675 vnode_put(vp);
6676 vp = NULL;
6677 }
6678 if (dvp) {
6679 vnode_put(dvp);
6680 dvp = NULL;
6681 }
6682
6683 /*
6684 * Scan forward in the descriptor list to see if we
6685 * need the parent vnode. We will need it if we are
6686 * deleting, since we must have rights to remove
6687 * entries in the parent directory, as well as the
6688 * rights to delete the object itself.
6689 */
6690 wantdelete = input[i].ad_flags & _DELETE_OK;
6691 for (j = i + 1; (j < desc_actual) && (input[j].ad_name_offset == 0); j++) {
6692 if (input[j].ad_flags & _DELETE_OK) {
6693 wantdelete = 1;
6694 }
6695 }
6696
6697 niopts = FOLLOW | AUDITVNPATH1;
6698
6699 /* need parent for vnode_authorize for deletion test */
6700 if (wantdelete) {
6701 niopts |= WANTPARENT;
6702 }
6703
6704 /* do the lookup */
6705 NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, UIO_SYSSPACE,
6706 CAST_USER_ADDR_T(((const char *)input) + input[i].ad_name_offset),
6707 &context);
6708 error = namei(&nd);
6709 if (!error) {
6710 vp = nd.ni_vp;
6711 if (wantdelete) {
6712 dvp = nd.ni_dvp;
6713 }
6714 }
6715 nameidone(&nd);
6716 }
6717
6718 /*
6719 * Handle lookup errors.
6720 */
6721 switch (error) {
6722 case ENOENT:
6723 case EACCES:
6724 case EPERM:
6725 case ENOTDIR:
6726 result[i] = error;
6727 break;
6728 case 0:
6729 /* run this access check */
6730 result[i] = access1(vp, dvp, input[i].ad_flags, &context);
6731 break;
6732 default:
6733 /* fatal lookup error */
6734
6735 goto out;
6736 }
6737 }
6738
6739 AUDIT_ARG(data, result, sizeof(errno_t), desc_actual);
6740
6741 /* copy out results */
6742 error = copyout(result, uap->results, desc_actual * sizeof(errno_t));
6743
6744 out:
6745 if (input && input != stack_input) {
6746 kfree_data(input, uap->size);
6747 }
6748 if (result) {
6749 kfree_data(result, desc_actual * sizeof(errno_t));
6750 }
6751 if (vp) {
6752 vnode_put(vp);
6753 }
6754 if (dvp) {
6755 vnode_put(dvp);
6756 }
6757 if (IS_VALID_CRED(context.vc_ucred)) {
6758 kauth_cred_unref(&context.vc_ucred);
6759 }
6760 return error;
6761 }
6762
6763
/*
 * Common implementation for access(2) and faccessat(2): resolve `path'
 * relative to `fd' (or the cwd for AT_FDCWD) and test whether `amode'
 * access would be granted.
 *
 * Returns:	0			Success
 *	namei:EFAULT		Bad address
 *	namei:ENAMETOOLONG	Filename too long
 *	namei:ENOENT		No such file or directory
 *	namei:ELOOP		Too many levels of symbolic links
 *	namei:EBADF		Bad file descriptor
 *	namei:ENOTDIR		Not a directory
 *	namei:???
 *	access1:
 */
static int
faccessat_internal(vfs_context_t ctx, int fd, user_addr_t path, int amode,
    int flag, enum uio_seg segflg)
{
	int error;
	struct nameidata nd;
	int niopts;
	struct vfs_context context;
#if NAMEDRSRCFORK
	int is_namedstream = 0;
#endif

	/*
	 * Unless the AT_EACCESS option is used, Access is defined as checking
	 * against the process' real identity, even if operations are checking
	 * the effective identity. So we need to tweak the credential
	 * in the context for that case.
	 */
	if (!(flag & AT_EACCESS)) {
		/* takes a reference; released at "out" below */
		context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
	} else {
		/* borrowed from the caller's context; must NOT be unref'd */
		context.vc_ucred = ctx->vc_ucred;
	}
	context.vc_thread = ctx->vc_thread;


	niopts = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY) ? NOFOLLOW : FOLLOW) | AUDITVNPATH1;
	/* need parent for vnode_authorize for deletion test */
	if (amode & _DELETE_OK) {
		niopts |= WANTPARENT;
	}
	NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, segflg,
	    path, &context);
	if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
		/* refuse to traverse any symlink in the path, not just the last component */
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}

#if NAMEDRSRCFORK
	/* access(F_OK) calls are allowed for resource forks. */
	if (amode == F_OK) {
		nd.ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
	}
#endif
	error = nameiat(&nd, fd);
	if (error) {
		goto out;
	}

#if NAMEDRSRCFORK
	/* Grab reference on the shadow stream file vnode to
	 * force an inactive on release which will mark it
	 * for recycle.
	 */
	if (vnode_isnamedstream(nd.ni_vp) &&
	    (nd.ni_vp->v_parent != NULLVP) &&
	    vnode_isshadow(nd.ni_vp)) {
		is_namedstream = 1;
		vnode_ref(nd.ni_vp);
	}
#endif

	error = access1(nd.ni_vp, nd.ni_dvp, amode, &context);

#if NAMEDRSRCFORK
	if (is_namedstream) {
		vnode_rele(nd.ni_vp);
	}
#endif

	vnode_put(nd.ni_vp);
	if (amode & _DELETE_OK) {
		/* the lookup also held the parent (WANTPARENT above); drop it */
		vnode_put(nd.ni_dvp);
	}
	nameidone(&nd);

out:
	if (!(flag & AT_EACCESS)) {
		/* pairs with kauth_cred_copy_real() above */
		kauth_cred_unref(&context.vc_ucred);
	}
	return error;
}
6856
6857 int
access(__unused proc_t p,struct access_args * uap,__unused int32_t * retval)6858 access(__unused proc_t p, struct access_args *uap, __unused int32_t *retval)
6859 {
6860 return faccessat_internal(vfs_context_current(), AT_FDCWD,
6861 uap->path, uap->flags, 0, UIO_USERSPACE);
6862 }
6863
6864 int
faccessat(__unused proc_t p,struct faccessat_args * uap,__unused int32_t * retval)6865 faccessat(__unused proc_t p, struct faccessat_args *uap,
6866 __unused int32_t *retval)
6867 {
6868 if (uap->flag & ~(AT_EACCESS | AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) {
6869 return EINVAL;
6870 }
6871
6872 return faccessat_internal(vfs_context_current(), uap->fd,
6873 uap->path, uap->amode, uap->flag, UIO_USERSPACE);
6874 }
6875
6876 /*
6877 * Returns: 0 Success
6878 * EFAULT
6879 * copyout:EFAULT
6880 * namei:???
6881 * vn_stat:???
6882 */
6883 static int
fstatat_internal(vfs_context_t ctx,user_addr_t path,user_addr_t ub,user_addr_t xsecurity,user_addr_t xsecurity_size,int isstat64,enum uio_seg segflg,int fd,int flag)6884 fstatat_internal(vfs_context_t ctx, user_addr_t path, user_addr_t ub,
6885 user_addr_t xsecurity, user_addr_t xsecurity_size, int isstat64,
6886 enum uio_seg segflg, int fd, int flag)
6887 {
6888 struct nameidata nd;
6889 int follow;
6890 union {
6891 struct stat sb;
6892 struct stat64 sb64;
6893 } source = {};
6894 union {
6895 struct user64_stat user64_sb;
6896 struct user32_stat user32_sb;
6897 struct user64_stat64 user64_sb64;
6898 struct user32_stat64 user32_sb64;
6899 } dest = {};
6900 caddr_t sbp;
6901 int error, my_size;
6902 kauth_filesec_t fsec;
6903 size_t xsecurity_bufsize;
6904 void * statptr;
6905 struct fileproc *fp = NULL;
6906 int needsrealdev = 0;
6907
6908 follow = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) ? NOFOLLOW : FOLLOW;
6909 NDINIT(&nd, LOOKUP, OP_GETATTR, follow | AUDITVNPATH1,
6910 segflg, path, ctx);
6911 if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
6912 nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
6913 }
6914
6915 #if NAMEDRSRCFORK
6916 int is_namedstream = 0;
6917 /* stat calls are allowed for resource forks. */
6918 nd.ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
6919 #endif
6920
6921 if (flag & AT_FDONLY) {
6922 vnode_t fvp;
6923
6924 error = fp_getfvp(vfs_context_proc(ctx), fd, &fp, &fvp);
6925 if (error) {
6926 return error;
6927 }
6928 if ((error = vnode_getwithref(fvp))) {
6929 file_drop(fd);
6930 return error;
6931 }
6932 nd.ni_vp = fvp;
6933 } else {
6934 error = nameiat(&nd, fd);
6935 if (error) {
6936 return error;
6937 }
6938 }
6939 fsec = KAUTH_FILESEC_NONE;
6940
6941 statptr = (void *)&source;
6942
6943 #if NAMEDRSRCFORK
6944 /* Grab reference on the shadow stream file vnode to
6945 * force an inactive on release which will mark it
6946 * for recycle.
6947 */
6948 if (vnode_isnamedstream(nd.ni_vp) &&
6949 (nd.ni_vp->v_parent != NULLVP) &&
6950 vnode_isshadow(nd.ni_vp)) {
6951 is_namedstream = 1;
6952 vnode_ref(nd.ni_vp);
6953 }
6954 #endif
6955
6956 needsrealdev = flag & AT_REALDEV ? 1 : 0;
6957 if (fp && (xsecurity == USER_ADDR_NULL)) {
6958 /*
6959 * If the caller has the file open, and is not
6960 * requesting extended security information, we are
6961 * going to let them get the basic stat information.
6962 */
6963 error = vn_stat_noauth(nd.ni_vp, statptr, NULL, isstat64, needsrealdev, ctx,
6964 fp->fp_glob->fg_cred);
6965 } else {
6966 error = vn_stat(nd.ni_vp, statptr, (xsecurity != USER_ADDR_NULL ? &fsec : NULL),
6967 isstat64, needsrealdev, ctx);
6968 }
6969
6970 #if NAMEDRSRCFORK
6971 if (is_namedstream) {
6972 vnode_rele(nd.ni_vp);
6973 }
6974 #endif
6975 vnode_put(nd.ni_vp);
6976 nameidone(&nd);
6977 if (fp) {
6978 file_drop(fd);
6979 fp = NULL;
6980 }
6981
6982 if (error) {
6983 return error;
6984 }
6985 /* Zap spare fields */
6986 if (isstat64 != 0) {
6987 source.sb64.st_lspare = 0;
6988 source.sb64.st_qspare[0] = 0LL;
6989 source.sb64.st_qspare[1] = 0LL;
6990 if (vfs_context_is64bit(ctx)) {
6991 munge_user64_stat64(&source.sb64, &dest.user64_sb64);
6992 my_size = sizeof(dest.user64_sb64);
6993 sbp = (caddr_t)&dest.user64_sb64;
6994 } else {
6995 munge_user32_stat64(&source.sb64, &dest.user32_sb64);
6996 my_size = sizeof(dest.user32_sb64);
6997 sbp = (caddr_t)&dest.user32_sb64;
6998 }
6999 /*
7000 * Check if we raced (post lookup) against the last unlink of a file.
7001 */
7002 if ((source.sb64.st_nlink == 0) && S_ISREG(source.sb64.st_mode)) {
7003 source.sb64.st_nlink = 1;
7004 }
7005 } else {
7006 source.sb.st_lspare = 0;
7007 source.sb.st_qspare[0] = 0LL;
7008 source.sb.st_qspare[1] = 0LL;
7009 if (vfs_context_is64bit(ctx)) {
7010 munge_user64_stat(&source.sb, &dest.user64_sb);
7011 my_size = sizeof(dest.user64_sb);
7012 sbp = (caddr_t)&dest.user64_sb;
7013 } else {
7014 munge_user32_stat(&source.sb, &dest.user32_sb);
7015 my_size = sizeof(dest.user32_sb);
7016 sbp = (caddr_t)&dest.user32_sb;
7017 }
7018
7019 /*
7020 * Check if we raced (post lookup) against the last unlink of a file.
7021 */
7022 if ((source.sb.st_nlink == 0) && S_ISREG(source.sb.st_mode)) {
7023 source.sb.st_nlink = 1;
7024 }
7025 }
7026 if ((error = copyout(sbp, ub, my_size)) != 0) {
7027 goto out;
7028 }
7029
7030 /* caller wants extended security information? */
7031 if (xsecurity != USER_ADDR_NULL) {
7032 /* did we get any? */
7033 if (fsec == KAUTH_FILESEC_NONE) {
7034 if (susize(xsecurity_size, 0) != 0) {
7035 error = EFAULT;
7036 goto out;
7037 }
7038 } else {
7039 /* find the user buffer size */
7040 xsecurity_bufsize = fusize(xsecurity_size);
7041
7042 /* copy out the actual data size */
7043 if (susize(xsecurity_size, KAUTH_FILESEC_COPYSIZE(fsec)) != 0) {
7044 error = EFAULT;
7045 goto out;
7046 }
7047
7048 /* if the caller supplied enough room, copy out to it */
7049 if (xsecurity_bufsize >= KAUTH_FILESEC_COPYSIZE(fsec)) {
7050 error = copyout(fsec, xsecurity, KAUTH_FILESEC_COPYSIZE(fsec));
7051 }
7052 }
7053 }
7054 out:
7055 if (fsec != KAUTH_FILESEC_NONE) {
7056 kauth_filesec_free(fsec);
7057 }
7058 return error;
7059 }
7060
7061 /*
7062 * stat_extended: Get file status; with extended security (ACL).
7063 *
7064 * Parameters: p (ignored)
7065 * uap User argument descriptor (see below)
7066 * retval (ignored)
7067 *
7068 * Indirect: uap->path Path of file to get status from
7069 * uap->ub User buffer (holds file status info)
7070 * uap->xsecurity ACL to get (extended security)
7071 * uap->xsecurity_size Size of ACL
7072 *
7073 * Returns: 0 Success
7074 * !0 errno value
7075 *
7076 */
7077 int
stat_extended(__unused proc_t p,struct stat_extended_args * uap,__unused int32_t * retval)7078 stat_extended(__unused proc_t p, struct stat_extended_args *uap,
7079 __unused int32_t *retval)
7080 {
7081 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7082 uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
7083 0);
7084 }
7085
7086 /*
7087 * Returns: 0 Success
7088 * fstatat_internal:??? [see fstatat_internal() in this file]
7089 */
7090 int
stat(__unused proc_t p,struct stat_args * uap,__unused int32_t * retval)7091 stat(__unused proc_t p, struct stat_args *uap, __unused int32_t *retval)
7092 {
7093 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7094 0, 0, 0, UIO_USERSPACE, AT_FDCWD, 0);
7095 }
7096
7097 int
stat64(__unused proc_t p,struct stat64_args * uap,__unused int32_t * retval)7098 stat64(__unused proc_t p, struct stat64_args *uap, __unused int32_t *retval)
7099 {
7100 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7101 0, 0, 1, UIO_USERSPACE, AT_FDCWD, 0);
7102 }
7103
7104 /*
7105 * stat64_extended: Get file status; can handle large inode numbers; with extended security (ACL).
7106 *
7107 * Parameters: p (ignored)
7108 * uap User argument descriptor (see below)
7109 * retval (ignored)
7110 *
7111 * Indirect: uap->path Path of file to get status from
7112 * uap->ub User buffer (holds file status info)
7113 * uap->xsecurity ACL to get (extended security)
7114 * uap->xsecurity_size Size of ACL
7115 *
7116 * Returns: 0 Success
7117 * !0 errno value
7118 *
7119 */
7120 int
stat64_extended(__unused proc_t p,struct stat64_extended_args * uap,__unused int32_t * retval)7121 stat64_extended(__unused proc_t p, struct stat64_extended_args *uap, __unused int32_t *retval)
7122 {
7123 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7124 uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
7125 0);
7126 }
7127
7128 /*
7129 * lstat_extended: Get file status; does not follow links; with extended security (ACL).
7130 *
7131 * Parameters: p (ignored)
7132 * uap User argument descriptor (see below)
7133 * retval (ignored)
7134 *
7135 * Indirect: uap->path Path of file to get status from
7136 * uap->ub User buffer (holds file status info)
7137 * uap->xsecurity ACL to get (extended security)
7138 * uap->xsecurity_size Size of ACL
7139 *
7140 * Returns: 0 Success
7141 * !0 errno value
7142 *
7143 */
7144 int
lstat_extended(__unused proc_t p,struct lstat_extended_args * uap,__unused int32_t * retval)7145 lstat_extended(__unused proc_t p, struct lstat_extended_args *uap, __unused int32_t *retval)
7146 {
7147 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7148 uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
7149 AT_SYMLINK_NOFOLLOW);
7150 }
7151
7152 /*
7153 * Get file status; this version does not follow links.
7154 */
7155 int
lstat(__unused proc_t p,struct lstat_args * uap,__unused int32_t * retval)7156 lstat(__unused proc_t p, struct lstat_args *uap, __unused int32_t *retval)
7157 {
7158 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7159 0, 0, 0, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW);
7160 }
7161
7162 int
lstat64(__unused proc_t p,struct lstat64_args * uap,__unused int32_t * retval)7163 lstat64(__unused proc_t p, struct lstat64_args *uap, __unused int32_t *retval)
7164 {
7165 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7166 0, 0, 1, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW);
7167 }
7168
7169 /*
7170 * lstat64_extended: Get file status; can handle large inode numbers; does not
7171 * follow links; with extended security (ACL).
7172 *
7173 * Parameters: p (ignored)
7174 * uap User argument descriptor (see below)
7175 * retval (ignored)
7176 *
7177 * Indirect: uap->path Path of file to get status from
7178 * uap->ub User buffer (holds file status info)
7179 * uap->xsecurity ACL to get (extended security)
7180 * uap->xsecurity_size Size of ACL
7181 *
7182 * Returns: 0 Success
7183 * !0 errno value
7184 *
7185 */
7186 int
lstat64_extended(__unused proc_t p,struct lstat64_extended_args * uap,__unused int32_t * retval)7187 lstat64_extended(__unused proc_t p, struct lstat64_extended_args *uap, __unused int32_t *retval)
7188 {
7189 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7190 uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
7191 AT_SYMLINK_NOFOLLOW);
7192 }
7193
7194 int
fstatat(__unused proc_t p,struct fstatat_args * uap,__unused int32_t * retval)7195 fstatat(__unused proc_t p, struct fstatat_args *uap, __unused int32_t *retval)
7196 {
7197 if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_REALDEV | AT_FDONLY | AT_SYMLINK_NOFOLLOW_ANY)) {
7198 return EINVAL;
7199 }
7200
7201 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7202 0, 0, 0, UIO_USERSPACE, uap->fd, uap->flag);
7203 }
7204
7205 int
fstatat64(__unused proc_t p,struct fstatat64_args * uap,__unused int32_t * retval)7206 fstatat64(__unused proc_t p, struct fstatat64_args *uap,
7207 __unused int32_t *retval)
7208 {
7209 if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_REALDEV | AT_FDONLY | AT_SYMLINK_NOFOLLOW_ANY)) {
7210 return EINVAL;
7211 }
7212
7213 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7214 0, 0, 1, UIO_USERSPACE, uap->fd, uap->flag);
7215 }
7216
7217 /*
7218 * Get configurable pathname variables.
7219 *
7220 * Returns: 0 Success
7221 * namei:???
7222 * vn_pathconf:???
7223 *
7224 * Notes: Global implementation constants are intended to be
7225 * implemented in this function directly; all other constants
7226 * are per-FS implementation, and therefore must be handled in
7227 * each respective FS, instead.
7228 *
7229 * XXX We implement some things globally right now that should actually be
7230 * XXX per-FS; we will need to deal with this at some point.
7231 */
7232 /* ARGSUSED */
7233 int
pathconf(__unused proc_t p,struct pathconf_args * uap,int32_t * retval)7234 pathconf(__unused proc_t p, struct pathconf_args *uap, int32_t *retval)
7235 {
7236 int error;
7237 struct nameidata nd;
7238 vfs_context_t ctx = vfs_context_current();
7239
7240 NDINIT(&nd, LOOKUP, OP_PATHCONF, FOLLOW | AUDITVNPATH1,
7241 UIO_USERSPACE, uap->path, ctx);
7242 error = namei(&nd);
7243 if (error) {
7244 return error;
7245 }
7246
7247 error = vn_pathconf(nd.ni_vp, uap->name, retval, ctx);
7248
7249 vnode_put(nd.ni_vp);
7250 nameidone(&nd);
7251 return error;
7252 }
7253
7254 /*
7255 * Return target name of a symbolic link.
7256 */
7257 /* ARGSUSED */
7258 static int
readlinkat_internal(vfs_context_t ctx,int fd,vnode_t lnk_vp,user_addr_t path,enum uio_seg seg,user_addr_t buf,size_t bufsize,enum uio_seg bufseg,int * retval)7259 readlinkat_internal(vfs_context_t ctx, int fd, vnode_t lnk_vp, user_addr_t path,
7260 enum uio_seg seg, user_addr_t buf, size_t bufsize, enum uio_seg bufseg,
7261 int *retval)
7262 {
7263 vnode_t vp;
7264 uio_t auio;
7265 int error;
7266 struct nameidata nd;
7267 uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];
7268 bool put_vnode;
7269
7270 if (bufsize > INT32_MAX) {
7271 return EINVAL;
7272 }
7273
7274 if (lnk_vp) {
7275 vp = lnk_vp;
7276 put_vnode = false;
7277 } else {
7278 NDINIT(&nd, LOOKUP, OP_READLINK, NOFOLLOW | AUDITVNPATH1,
7279 seg, path, ctx);
7280
7281 error = nameiat(&nd, fd);
7282 if (error) {
7283 return error;
7284 }
7285 vp = nd.ni_vp;
7286 put_vnode = true;
7287 nameidone(&nd);
7288 }
7289
7290 auio = uio_createwithbuffer(1, 0, bufseg, UIO_READ,
7291 &uio_buf[0], sizeof(uio_buf));
7292 uio_addiov(auio, buf, bufsize);
7293 if (vp->v_type != VLNK) {
7294 error = EINVAL;
7295 } else {
7296 #if CONFIG_MACF
7297 error = mac_vnode_check_readlink(ctx, vp);
7298 #endif
7299 if (error == 0) {
7300 error = vnode_authorize(vp, NULL, KAUTH_VNODE_READ_DATA,
7301 ctx);
7302 }
7303 if (error == 0) {
7304 error = VNOP_READLINK(vp, auio, ctx);
7305 }
7306 }
7307
7308 if (put_vnode) {
7309 vnode_put(vp);
7310 }
7311
7312 *retval = (int)(bufsize - uio_resid(auio));
7313 return error;
7314 }
7315
7316 int
freadlink(proc_t p,struct freadlink_args * uap,int32_t * retval)7317 freadlink(proc_t p, struct freadlink_args *uap, int32_t *retval)
7318 {
7319 enum uio_seg procseg;
7320 vnode_t vp;
7321 int error;
7322
7323 procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
7324
7325 AUDIT_ARG(fd, uap->fd);
7326
7327 if ((error = file_vnode(uap->fd, &vp))) {
7328 return error;
7329 }
7330 if ((error = vnode_getwithref(vp))) {
7331 file_drop(uap->fd);
7332 return error;
7333 }
7334
7335 error = readlinkat_internal(vfs_context_current(), -1,
7336 vp, 0, procseg, CAST_USER_ADDR_T(uap->buf),
7337 uap->bufsize, procseg, retval);
7338
7339 vnode_put(vp);
7340 file_drop(uap->fd);
7341 return error;
7342 }
7343
7344 int
readlink(proc_t p,struct readlink_args * uap,int32_t * retval)7345 readlink(proc_t p, struct readlink_args *uap, int32_t *retval)
7346 {
7347 enum uio_seg procseg;
7348
7349 procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
7350 return readlinkat_internal(vfs_context_current(), AT_FDCWD, NULL,
7351 CAST_USER_ADDR_T(uap->path), procseg, CAST_USER_ADDR_T(uap->buf),
7352 uap->count, procseg, retval);
7353 }
7354
7355 int
readlinkat(proc_t p,struct readlinkat_args * uap,int32_t * retval)7356 readlinkat(proc_t p, struct readlinkat_args *uap, int32_t *retval)
7357 {
7358 enum uio_seg procseg;
7359
7360 procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
7361 return readlinkat_internal(vfs_context_current(), uap->fd, NULL,
7362 CAST_USER_ADDR_T(uap->path), procseg, uap->buf, uap->bufsize, procseg,
7363 retval);
7364 }
7365
/*
 * Change file flags, the deep inner layer.
 *
 * Authorizes and applies a file-flags change on `vp' via the supplied
 * `setattr' callback.  `va' carries va_flags for the MAC/kauth checks;
 * `arg' is passed through to the callback unchanged (for chflags1() it
 * is `va' itself).  Takes and drops no vnode references.
 */
static int
chflags0(vnode_t vp, struct vnode_attr *va,
    int (*setattr)(vnode_t, void *, vfs_context_t),
    void *arg, vfs_context_t ctx)
{
	kauth_action_t action = 0;
	int error;

#if CONFIG_MACF
	/* MAC policy gets first veto, before any kauth evaluation */
	error = mac_vnode_check_setflags(ctx, vp, va->va_flags);
	if (error) {
		goto out;
	}
#endif

	/* request authorisation, disregard immutability */
	if ((error = vnode_authattr(vp, va, &action, ctx)) != 0) {
		goto out;
	}
	/*
	 * Request that the auth layer disregard those file flags it's allowed to when
	 * authorizing this operation; we need to do this in order to be able to
	 * clear immutable flags.
	 */
	if (action && ((error = vnode_authorize(vp, NULL, action | KAUTH_VNODE_NOIMMUTABLE, ctx)) != 0)) {
		goto out;
	}
	error = (*setattr)(vp, arg, ctx);

#if CONFIG_MACF
	/* notify MAC modules only after the change actually succeeded */
	if (error == 0) {
		mac_vnode_notify_setflags(ctx, vp, va->va_flags);
	}
#endif

out:
	return error;
}
7407
/*
 * Change file flags.
 *
 * NOTE: this will vnode_put() `vp' — the caller's iocount is always
 * consumed, on both the success and error paths.
 */
static int
chflags1(vnode_t vp, int flags, vfs_context_t ctx)
{
	struct vnode_attr va;
	int error;

	VATTR_INIT(&va);
	VATTR_SET(&va, va_flags, flags);

	/* chflags0 authorizes, then invokes vnode_setattr(vp, &va, ctx) */
	error = chflags0(vp, &va, (void *)vnode_setattr, &va, ctx);
	vnode_put(vp);

	/* a filesystem that silently ignored va_flags is reported as ENOTSUP */
	if ((error == 0) && !VATTR_IS_SUPPORTED(&va, va_flags)) {
		error = ENOTSUP;
	}

	return error;
}
7431
/*
 * Change flags of a file given a path name.
 */
/* ARGSUSED */
int
chflags(__unused proc_t p, struct chflags_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	vfs_context_t ctx = vfs_context_current();
	int error;
	struct nameidata nd;
	uint32_t wantparent = 0;

#if CONFIG_FILE_LEASES
	/* also look up the parent so its directory lease can be broken */
	wantparent = WANTPARENT;
#endif

	AUDIT_ARG(fflags, uap->flags);
	NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1 | wantparent,
	    UIO_USERSPACE, uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;

#if CONFIG_FILE_LEASES
	/* break any conflicting lease on the parent, then drop its iocount */
	vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
	vnode_put(nd.ni_dvp);
#endif

	nameidone(&nd);

	/* we don't vnode_put() here because chflags1 does internally */
	error = chflags1(vp, uap->flags, ctx);

	return error;
}
7470
/*
 * Change flags of a file given a file descriptor.
 */
/* ARGSUSED */
int
fchflags(__unused proc_t p, struct fchflags_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	int error;

	AUDIT_ARG(fd, uap->fd);
	AUDIT_ARG(fflags, uap->flags);
	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}

	if ((error = vnode_getwithref(vp))) {
		/* failed to get an iocount; release the fd reference */
		file_drop(uap->fd);
		return error;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

#if CONFIG_FILE_LEASES
	/*
	 * NOTE(review): unlike chflags(), we pass the target vnode itself
	 * here with the second argument true — presumably telling
	 * vnode_breakdirlease() to locate the parent directory itself;
	 * confirm against its definition.
	 */
	vnode_breakdirlease(vp, true, O_WRONLY);
#endif

	/* we don't vnode_put() here because chflags1 does internally */
	error = chflags1(vp, uap->flags, vfs_context_current());

	file_drop(uap->fd);
	return error;
}
7504
/*
 * Change security information on a filesystem object.
 *
 * Applies the mode/owner/group/ACL attributes active in `vap' to `vp',
 * running the MAC pre-checks and kauth authorization first and the MAC
 * notifications only after the change succeeded.
 *
 * Returns:	0	Success
 *		EPERM	Operation not permitted
 *		vnode_authattr:???	[anything vnode_authattr can return]
 *		vnode_authorize:???	[anything vnode_authorize can return]
 *		vnode_setattr:???	[anything vnode_setattr can return]
 *
 * Notes:	If vnode_authattr or vnode_authorize return EACCES, it will be
 *		translated to EPERM before being returned.
 */
static int
chmod_vnode(vfs_context_t ctx, vnode_t vp, struct vnode_attr *vap)
{
	kauth_action_t action;
	int error;

	AUDIT_ARG(mode, vap->va_mode);
	/* XXX audit new args */

#if NAMEDSTREAMS
	/* chmod calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		return EPERM;
	}
#endif

#if CONFIG_MACF
	/* MAC pre-checks: one policy hook per active attribute class */
	if (VATTR_IS_ACTIVE(vap, va_mode) &&
	    (error = mac_vnode_check_setmode(ctx, vp, (mode_t)vap->va_mode)) != 0) {
		return error;
	}

	if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid)) {
		if ((error = mac_vnode_check_setowner(ctx, vp,
		    VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : -1,
		    VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : -1))) {
			return error;
		}
	}

	if (VATTR_IS_ACTIVE(vap, va_acl) &&
	    (error = mac_vnode_check_setacl(ctx, vp, vap->va_acl))) {
		return error;
	}
#endif

	/* make sure that the caller is allowed to set this security information */
	if (((error = vnode_authattr(vp, vap, &action, ctx)) != 0) ||
	    ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		/* see Notes above: EACCES is reported as EPERM */
		if (error == EACCES) {
			error = EPERM;
		}
		return error;
	}

	if ((error = vnode_setattr(vp, vap, ctx)) != 0) {
		return error;
	}

#if CONFIG_MACF
	/* MAC post-notifications: fired only on success */
	if (VATTR_IS_ACTIVE(vap, va_mode)) {
		mac_vnode_notify_setmode(ctx, vp, (mode_t)vap->va_mode);
	}

	if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid)) {
		mac_vnode_notify_setowner(ctx, vp,
		    VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : -1,
		    VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : -1);
	}

	if (VATTR_IS_ACTIVE(vap, va_acl)) {
		mac_vnode_notify_setacl(ctx, vp, vap->va_acl);
	}
#endif

	return error;
}
7584
7585
/*
 * Change mode of a file given a path name.
 *
 * Looks up `path' relative to `fd' and applies the attributes in `vap'
 * via chmod_vnode().  `flag' accepts AT_SYMLINK_NOFOLLOW[_ANY].
 *
 * Returns:	0	Success
 *		namei:???	[anything namei can return]
 *		chmod_vnode:???	[anything chmod_vnode can return]
 */
static int
chmodat(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap,
    int fd, int flag, enum uio_seg segflg)
{
	struct nameidata nd;
	int follow, error;
	uint32_t wantparent = 0;

#if CONFIG_FILE_LEASES
	/* also look up the parent so its directory lease can be broken */
	wantparent = WANTPARENT;
#endif

	follow = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) ? NOFOLLOW : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1 | wantparent,
	    segflg, path, ctx);
	if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
		/* refuse to traverse any symlink in the path, not just the last component */
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}
	if ((error = nameiat(&nd, fd))) {
		return error;
	}

#if CONFIG_FILE_LEASES
	/* break any conflicting lease on the parent, then drop its iocount */
	vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
	vnode_put(nd.ni_dvp);
#endif

	error = chmod_vnode(ctx, nd.ni_vp, vap);
	vnode_put(nd.ni_vp);
	nameidone(&nd);
	return error;
}
7625
7626 static int
chmod_extended_init(struct vnode_attr * pva,kauth_filesec_t * pxsecdst,int mode,uid_t uid,gid_t gid,user_addr_t xsecurity)7627 chmod_extended_init(struct vnode_attr *pva, kauth_filesec_t *pxsecdst, int mode, uid_t uid,
7628 gid_t gid, user_addr_t xsecurity)
7629 {
7630 int error;
7631
7632 VATTR_INIT(pva);
7633
7634 if (mode != -1) {
7635 VATTR_SET(pva, va_mode, mode & ALLPERMS);
7636 } else {
7637 pva->va_mode = 0;
7638 }
7639
7640 if (uid != KAUTH_UID_NONE) {
7641 VATTR_SET(pva, va_uid, uid);
7642 }
7643
7644 if (gid != KAUTH_GID_NONE) {
7645 VATTR_SET(pva, va_gid, gid);
7646 }
7647
7648 *pxsecdst = NULL;
7649 switch (xsecurity) {
7650 case USER_ADDR_NULL:
7651 break;
7652
7653 case CAST_USER_ADDR_T((void *)1): /* _FILESEC_REMOVE_ACL */
7654 VATTR_SET(pva, va_acl, NULL);
7655 break;
7656
7657 default:
7658 if ((error = kauth_copyinfilesec(xsecurity, pxsecdst)) != 0) {
7659 return error;
7660 }
7661
7662 VATTR_SET(pva, va_acl, &(*pxsecdst)->fsec_acl);
7663 pva->va_vaflags |= VA_FILESEC_ACL;
7664 KAUTH_DEBUG("CHMOD - setting ACL with %d entries", pva->va_acl->acl_entrycount);
7665 break;
7666 }
7667
7668 return 0;
7669 }
7670
7671 /*
7672 * chmod_extended: Change the mode of a file given a path name; with extended
7673 * argument list (including extended security (ACL)).
7674 *
7675 * Parameters: p Process requesting the open
7676 * uap User argument descriptor (see below)
7677 * retval (ignored)
7678 *
7679 * Indirect: uap->path Path to object (same as 'chmod')
7680 * uap->uid UID to set
7681 * uap->gid GID to set
7682 * uap->mode File mode to set (same as 'chmod')
7683 * uap->xsecurity ACL to set (or delete)
7684 *
7685 * Returns: 0 Success
7686 * !0 errno value
7687 *
7688 * Notes: The kauth_filesec_t in 'va', if any, is in host byte order.
7689 *
7690 * XXX: We should enummerate the possible errno values here, and where
7691 * in the code they originated.
7692 */
7693 int
chmod_extended(__unused proc_t p,struct chmod_extended_args * uap,__unused int32_t * retval)7694 chmod_extended(__unused proc_t p, struct chmod_extended_args *uap, __unused int32_t *retval)
7695 {
7696 int error;
7697 struct vnode_attr va;
7698 kauth_filesec_t xsecdst = NULL;
7699
7700 AUDIT_ARG(owner, uap->uid, uap->gid);
7701
7702 error = chmod_extended_init(&va, &xsecdst, uap->mode, uap->uid,
7703 uap->gid, uap->xsecurity);
7704
7705 if (error) {
7706 return error;
7707 }
7708
7709 error = chmodat(vfs_context_current(), uap->path, &va, AT_FDCWD, 0,
7710 UIO_USERSPACE);
7711
7712 if (xsecdst != NULL) {
7713 kauth_filesec_free(xsecdst);
7714 }
7715 return error;
7716 }
7717
7718 /*
7719 * Returns: 0 Success
7720 * chmodat:??? [anything chmodat can return]
7721 */
/*
 * Shared implementation for chmod(2) and fchmodat(2): wrap the requested
 * permission bits in a vnode_attr and hand off to chmodat().
 */
static int
fchmodat_internal(vfs_context_t ctx, user_addr_t path, int mode, int fd,
    int flag, enum uio_seg segflg)
{
	struct vnode_attr va;

	VATTR_INIT(&va);
	/* Only the permission bits are honoured; other mode bits are masked off. */
	VATTR_SET(&va, va_mode, mode & ALLPERMS);

	return chmodat(ctx, path, &va, fd, flag, segflg);
}
7733
/* chmod(2): change the mode of the file named by uap->path (follows symlinks). */
int
chmod(__unused proc_t p, struct chmod_args *uap, __unused int32_t *retval)
{
	return fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
	    AT_FDCWD, 0, UIO_USERSPACE);
}
7740
7741 int
fchmodat(__unused proc_t p,struct fchmodat_args * uap,__unused int32_t * retval)7742 fchmodat(__unused proc_t p, struct fchmodat_args *uap, __unused int32_t *retval)
7743 {
7744 if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) {
7745 return EINVAL;
7746 }
7747
7748 return fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
7749 uap->fd, uap->flag, UIO_USERSPACE);
7750 }
7751
7752 /*
7753 * Change mode of a file given a file descriptor.
7754 */
static int
fchmod1(__unused proc_t p, int fd, struct vnode_attr *vap)
{
	vnode_t vp;
	int error;

	AUDIT_ARG(fd, fd);

	/* Translate the descriptor into a vnode (takes a file reference). */
	if ((error = file_vnode(fd, &vp)) != 0) {
		return error;
	}
	/* Acquire an iocount before operating on the vnode. */
	if ((error = vnode_getwithref(vp)) != 0) {
		file_drop(fd);
		return error;
	}
	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

#if CONFIG_FILE_LEASES
	/* Break any lease held on the parent directory before mutating. */
	vnode_breakdirlease(vp, true, O_WRONLY);
#endif

	error = chmod_vnode(vfs_context_current(), vp, vap);
	/* Release the iocount first, then the file reference. */
	(void)vnode_put(vp);
	file_drop(fd);

	return error;
}
7782
7783 /*
7784 * fchmod_extended: Change mode of a file given a file descriptor; with
7785 * extended argument list (including extended security (ACL)).
7786 *
7787 * Parameters: p Process requesting to change file mode
7788 * uap User argument descriptor (see below)
7789 * retval (ignored)
7790 *
7791 * Indirect: uap->mode File mode to set (same as 'chmod')
7792 * uap->uid UID to set
7793 * uap->gid GID to set
7794 * uap->xsecurity ACL to set (or delete)
7795 * uap->fd File descriptor of file to change mode
7796 *
7797 * Returns: 0 Success
7798 * !0 errno value
7799 *
7800 */
7801 int
fchmod_extended(proc_t p,struct fchmod_extended_args * uap,__unused int32_t * retval)7802 fchmod_extended(proc_t p, struct fchmod_extended_args *uap, __unused int32_t *retval)
7803 {
7804 int error;
7805 struct vnode_attr va;
7806 kauth_filesec_t xsecdst = NULL;
7807
7808 AUDIT_ARG(owner, uap->uid, uap->gid);
7809
7810 error = chmod_extended_init(&va, &xsecdst, uap->mode, uap->uid,
7811 uap->gid, uap->xsecurity);
7812
7813 if (error) {
7814 return error;
7815 }
7816
7817 error = fchmod1(p, uap->fd, &va);
7818
7819 if (xsecdst != NULL) {
7820 kauth_filesec_free(xsecdst);
7821 }
7822 return error;
7823 }
7824
/* fchmod(2): change the permission bits of the file open on uap->fd. */
int
fchmod(proc_t p, struct fchmod_args *uap, __unused int32_t *retval)
{
	struct vnode_attr va;

	VATTR_INIT(&va);
	/* Only the permission bits are honoured; other mode bits are masked off. */
	VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);

	return fchmod1(p, uap->fd, &va);
}
7835
7836
7837 /*
7838 * Set ownership given a path name.
7839 */
7840 /* ARGSUSED */
static int
fchownat_internal(vfs_context_t ctx, int fd, user_addr_t path, uid_t uid,
    gid_t gid, int flag, enum uio_seg segflg)
{
	vnode_t vp;
	struct vnode_attr va;
	int error;
	struct nameidata nd;
	int follow;
	kauth_action_t action;
	uint32_t wantparent = 0;

#if CONFIG_FILE_LEASES
	/* Also look up the parent so its directory lease can be broken below. */
	wantparent = WANTPARENT;
#endif

	AUDIT_ARG(owner, uid, gid);

	/* Honour AT_SYMLINK_NOFOLLOW{,_ANY} when resolving the path. */
	follow = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) ? NOFOLLOW : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1 | wantparent, segflg,
	    path, ctx);
	if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}
	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;

	/* VNOVAL means "leave this id unchanged". */
	VATTR_INIT(&va);
	if (uid != (uid_t)VNOVAL) {
		VATTR_SET(&va, va_uid, uid);
	}
	if (gid != (gid_t)VNOVAL) {
		VATTR_SET(&va, va_gid, gid);
	}

#if CONFIG_MACF
	error = mac_vnode_check_setowner(ctx, vp, uid, gid);
	if (error) {
		goto out;
	}
#endif

	/* preflight and authorize attribute changes */
	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
		goto out;
	}
	if (action && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		goto out;
	}

#if CONFIG_FILE_LEASES
	/* Break any lease on the parent directory before mutating. */
	vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
#endif

	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	if (error == 0) {
		mac_vnode_notify_setowner(ctx, vp, uid, gid);
	}
#endif

out:
	/*
	 * EACCES is only allowed from namei(); permissions failure should
	 * return EPERM, so we need to translate the error code.
	 */
	if (error == EACCES) {
		error = EPERM;
	}

#if CONFIG_FILE_LEASES
	/* Drop the parent iocount taken via WANTPARENT above. */
	vnode_put(nd.ni_dvp);
#endif
	nameidone(&nd);
	vnode_put(vp);
	return error;
}
7922
/* chown(2): set ownership of the file named by uap->path (follows symlinks). */
int
chown(__unused proc_t p, struct chown_args *uap, __unused int32_t *retval)
{
	return fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
	    uap->uid, uap->gid, 0, UIO_USERSPACE);
}
7929
/* lchown(2): like chown(2) but operates on a symlink itself (no follow). */
int
lchown(__unused proc_t p, struct lchown_args *uap, __unused int32_t *retval)
{
	return fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
	    uap->owner, uap->group, AT_SYMLINK_NOFOLLOW, UIO_USERSPACE);
}
7936
7937 int
fchownat(__unused proc_t p,struct fchownat_args * uap,__unused int32_t * retval)7938 fchownat(__unused proc_t p, struct fchownat_args *uap, __unused int32_t *retval)
7939 {
7940 if (uap->flag & ~AT_SYMLINK_NOFOLLOW) {
7941 return EINVAL;
7942 }
7943
7944 return fchownat_internal(vfs_context_current(), uap->fd, uap->path,
7945 uap->uid, uap->gid, uap->flag, UIO_USERSPACE);
7946 }
7947
7948 /*
7949 * Set ownership given a file descriptor.
7950 */
7951 /* ARGSUSED */
int
fchown(__unused proc_t p, struct fchown_args *uap, __unused int32_t *retval)
{
	struct vnode_attr va;
	vfs_context_t ctx = vfs_context_current();
	vnode_t vp;
	int error;
	kauth_action_t action;

	AUDIT_ARG(owner, uap->uid, uap->gid);
	AUDIT_ARG(fd, uap->fd);

	/* Translate the descriptor into a vnode (takes a file reference). */
	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}

	/* Acquire an iocount before operating on the vnode. */
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}
	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	/* VNOVAL means "leave this id unchanged". */
	VATTR_INIT(&va);
	if (uap->uid != VNOVAL) {
		VATTR_SET(&va, va_uid, uap->uid);
	}
	if (uap->gid != VNOVAL) {
		VATTR_SET(&va, va_gid, uap->gid);
	}

#if NAMEDSTREAMS
	/* chown calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		error = EPERM;
		goto out;
	}
#endif

#if CONFIG_MACF
	error = mac_vnode_check_setowner(ctx, vp, uap->uid, uap->gid);
	if (error) {
		goto out;
	}
#endif

	/* preflight and authorize attribute changes */
	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
		goto out;
	}
	if (action && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		/* Authorization failures surface as EPERM, not EACCES. */
		if (error == EACCES) {
			error = EPERM;
		}
		goto out;
	}

#if CONFIG_FILE_LEASES
	/* Break any lease held on the parent directory before mutating. */
	vnode_breakdirlease(vp, true, O_WRONLY);
#endif

	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	if (error == 0) {
		mac_vnode_notify_setowner(ctx, vp, uap->uid, uap->gid);
	}
#endif

out:
	/* Release the iocount, then the file-table reference. */
	(void)vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
8025
8026 static int
getutimes(user_addr_t usrtvp,struct timespec * tsp)8027 getutimes(user_addr_t usrtvp, struct timespec *tsp)
8028 {
8029 int error;
8030
8031 if (usrtvp == USER_ADDR_NULL) {
8032 struct timeval old_tv;
8033 /* XXX Y2038 bug because of microtime argument */
8034 microtime(&old_tv);
8035 TIMEVAL_TO_TIMESPEC(&old_tv, &tsp[0]);
8036 tsp[1] = tsp[0];
8037 } else {
8038 if (IS_64BIT_PROCESS(current_proc())) {
8039 struct user64_timeval tv[2];
8040 error = copyin(usrtvp, (void *)tv, sizeof(tv));
8041 if (error) {
8042 return error;
8043 }
8044 TIMEVAL64_TO_TIMESPEC(&tv[0], &tsp[0]);
8045 TIMEVAL64_TO_TIMESPEC(&tv[1], &tsp[1]);
8046 } else {
8047 struct user32_timeval tv[2];
8048 error = copyin(usrtvp, (void *)tv, sizeof(tv));
8049 if (error) {
8050 return error;
8051 }
8052 TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]);
8053 TIMEVAL_TO_TIMESPEC(&tv[1], &tsp[1]);
8054 }
8055 }
8056 return 0;
8057 }
8058
/*
 * Apply access/modification timestamps to a vnode.
 *
 * Parameters:	ctx		context of the caller
 *		vp		vnode to update (caller holds an iocount)
 *		ts		ts[0] = access time, ts[1] = modification time
 *		nullflag	non-zero when the caller passed no explicit
 *				times (i.e. "set to current time")
 */
static int
setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts,
    int nullflag)
{
	int error;
	struct vnode_attr va;
	kauth_action_t action;

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	VATTR_INIT(&va);
	VATTR_SET(&va, va_access_time, ts[0]);
	VATTR_SET(&va, va_modify_time, ts[1]);
	if (nullflag) {
		/* Record that the caller did not supply explicit times. */
		va.va_vaflags |= VA_UTIMES_NULL;
	}

#if NAMEDSTREAMS
	/* utimes calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		error = EPERM;
		goto out;
	}
#endif

#if CONFIG_MACF
	error = mac_vnode_check_setutimes(ctx, vp, ts[0], ts[1]);
	if (error) {
		goto out;
	}
#endif
	/* Explicit timestamps that fail permission checks report EPERM. */
	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
		if (!nullflag && error == EACCES) {
			error = EPERM;
		}
		goto out;
	}

	/* since we may not need to auth anything, check here */
	if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		if (!nullflag && error == EACCES) {
			error = EPERM;
		}
		goto out;
	}
	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	if (error == 0) {
		mac_vnode_notify_setutimes(ctx, vp, ts[0], ts[1]);
	}
#endif

out:
	return error;
}
8115
8116 /*
8117 * Set the access and modification times of a file.
8118 */
8119 /* ARGSUSED */
int
utimes(__unused proc_t p, struct utimes_args *uap, __unused int32_t *retval)
{
	struct timespec ts[2];
	user_addr_t usrtvp;
	int error;
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();
	uint32_t wantparent = 0;

#if CONFIG_FILE_LEASES
	/* Also look up the parent so its directory lease can be broken below. */
	wantparent = WANTPARENT;
#endif

	/*
	 * AUDIT: Needed to change the order of operations to do the
	 * name lookup first because auditing wants the path.
	 */
	NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1 | wantparent,
	    UIO_USERSPACE, uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}

	/*
	 * Fetch the user-supplied time. If usrtvp is USER_ADDR_NULL, we fetch
	 * the current time instead.
	 */
	usrtvp = uap->tptr;
	if ((error = getutimes(usrtvp, ts)) != 0) {
		goto out;
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
#endif

	error = setutimes(ctx, nd.ni_vp, ts, usrtvp == USER_ADDR_NULL);

out:
#if CONFIG_FILE_LEASES
	/* Drop the parent iocount taken via WANTPARENT above. */
	vnode_put(nd.ni_dvp);
#endif
	nameidone(&nd);
	vnode_put(nd.ni_vp);
	return error;
}
8168
8169 /*
8170 * Set the access and modification times of a file.
8171 */
8172 /* ARGSUSED */
int
futimes(__unused proc_t p, struct futimes_args *uap, __unused int32_t *retval)
{
	struct timespec ts[2];
	vnode_t vp;
	user_addr_t usrtvp;
	int error;

	AUDIT_ARG(fd, uap->fd);
	/* Fetch the times first; a NULL tptr means "use the current time". */
	usrtvp = uap->tptr;
	if ((error = getutimes(usrtvp, ts)) != 0) {
		return error;
	}
	if ((error = file_vnode(uap->fd, &vp)) != 0) {
		return error;
	}
	/* Acquire an iocount before operating on the vnode. */
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

#if CONFIG_FILE_LEASES
	/* Break any lease held on the parent directory before mutating. */
	vnode_breakdirlease(vp, true, O_WRONLY);
#endif

	error = setutimes(vfs_context_current(), vp, ts, usrtvp == 0);

	vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
8204
8205 static int
truncate_validate_common(proc_t p,off_t length)8206 truncate_validate_common(proc_t p, off_t length)
8207 {
8208 rlim_t fsize_limit;
8209
8210 if (length < 0) {
8211 return EINVAL;
8212 }
8213
8214 fsize_limit = proc_limitgetcur(p, RLIMIT_FSIZE);
8215 if ((rlim_t)length > fsize_limit) {
8216 psignal(p, SIGXFSZ);
8217 return EFBIG;
8218 }
8219
8220 return 0;
8221 }
8222
/*
 * Common truncate path for truncate(2) and ftruncate(2).
 *
 * Parameters:	vp		vnode to truncate (caller holds an iocount)
 *		length		new file length
 *		cred		credential for MAC checks (NOCRED from truncate)
 *		ctx		context of the caller
 *		need_auth	true when write access has not already been
 *				authorized (the path-based truncate case)
 */
static int
truncate_internal(vnode_t vp, off_t length, kauth_cred_t cred,
    vfs_context_t ctx, boolean_t need_auth)
{
	struct vnode_attr va;
	kauth_action_t action;
	int error;

	VATTR_INIT(&va);
	VATTR_SET(&va, va_data_size, length);

#if CONFIG_MACF
	error = mac_vnode_check_truncate(ctx, cred, vp);
	if (error) {
		return error;
	}
#endif

	/*
	 * If we reached here from `ftruncate` then we already did an effective
	 * `vnode_authorize` upon open. We honour the result from then.
	 */
	if (need_auth) {
		if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
			return error;
		}

		if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
			return error;
		}
	}

#if CONFIG_FILE_LEASES
	/* Check if there is a lease placed on the parent directory. */
	vnode_breakdirlease(vp, true, O_WRONLY);

	/* Now check if there is a lease placed on the file itself. */
	(void)vnode_breaklease(vp, O_WRONLY, ctx);
#endif

	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	if (error == 0) {
		mac_vnode_notify_truncate(ctx, cred, vp);
	}
#endif

	return error;
}
8273
8274 /*
8275 * Truncate a file given its path name.
8276 */
8277 /* ARGSUSED */
int
truncate(proc_t p, struct truncate_args *uap, __unused int32_t *retval)
{
	vfs_context_t ctx = vfs_context_current();
	vnode_t vp;
	int error;
	struct nameidata nd;

	/* Reject negative lengths and enforce RLIMIT_FSIZE up front. */
	if ((error = truncate_validate_common(p, uap->length))) {
		return error;
	}

	NDINIT(&nd, LOOKUP, OP_TRUNCATE, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);

	if ((error = namei(&nd))) {
		return error;
	}

	vp = nd.ni_vp;
	nameidone(&nd);

	/* need_auth is true: the path-based caller was never authorized. */
	error = truncate_internal(vp, uap->length, NOCRED, ctx, true);
	vnode_put(vp);

	return error;
}
8305
8306 /*
8307 * Truncate a file given a file descriptor.
8308 */
8309 /* ARGSUSED */
int
ftruncate(proc_t p, struct ftruncate_args *uap, int32_t *retval)
{
	vnode_t vp;
	struct fileproc *fp;
	int error;

	AUDIT_ARG(fd, uap->fd);

	/* Reject negative lengths and enforce RLIMIT_FSIZE up front. */
	if ((error = truncate_validate_common(p, uap->length))) {
		return error;
	}

	if ((error = fp_lookup(p, uap->fd, &fp, 0))) {
		return error;
	}

	switch (FILEGLOB_DTYPE(fp->fp_glob)) {
	case DTYPE_PSXSHM:
		/* POSIX shared memory objects take their own truncate path. */
		error = pshm_truncate(p, fp, uap->fd, uap->length, retval);
		goto out;
	case DTYPE_VNODE:
		break;
	default:
		error = EINVAL;
		goto out;
	}

	vp = (vnode_t)fp_get_data(fp);

	/* The descriptor must have been opened for writing. */
	if ((fp->fp_glob->fg_flag & FWRITE) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EINVAL;
		goto out;
	}

	if ((error = vnode_getwithref(vp)) != 0) {
		goto out;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	/* need_auth is false: write access was authorized at open time. */
	error = truncate_internal(vp, uap->length, fp->fp_glob->fg_cred,
	    vfs_context_current(), false);
	vnode_put(vp);

out:
	file_drop(uap->fd);
	return error;
}
8360
8361
8362 /*
8363 * Sync an open file with synchronized I/O _file_ integrity completion
8364 */
8365 /* ARGSUSED */
int
fsync(proc_t p, struct fsync_args *uap, __unused int32_t *retval)
{
	/* fsync(2) is a pthread cancellation point. */
	__pthread_testcancel(1);
	return fsync_common(p, uap, MNT_WAIT);
}
8372
8373
8374 /*
8375 * Sync an open file with synchronized I/O _file_ integrity completion
8376 *
8377 * Notes: This is a legacy support function that does not test for
8378 * thread cancellation points.
8379 */
8380 /* ARGSUSED */
int
fsync_nocancel(proc_t p, struct fsync_nocancel_args *uap, __unused int32_t *retval)
{
	/* Same as fsync(2) but without the thread-cancellation check. */
	return fsync_common(p, (struct fsync_args *)uap, MNT_WAIT);
}
8386
8387
8388 /*
8389 * Sync an open file with synchronized I/O _data_ integrity completion
8390 */
8391 /* ARGSUSED */
int
fdatasync(proc_t p, struct fdatasync_args *uap, __unused int32_t *retval)
{
	/* fdatasync(2) is a pthread cancellation point. */
	__pthread_testcancel(1);
	/* MNT_DWAIT requests data integrity only (see fsync_common). */
	return fsync_common(p, (struct fsync_args *)uap, MNT_DWAIT);
}
8398
8399
8400 /*
8401 * fsync_common
8402 *
8403 * Common fsync code to support both synchronized I/O file integrity completion
8404 * (normal fsync) and synchronized I/O data integrity completion (fdatasync).
8405 *
8406 * If 'flags' is MNT_DWAIT, the caller is requesting data integrity, which
8407 * will only guarantee that the file data contents are retrievable. If
 * 'flags' is MNT_WAIT, the caller is requesting file integrity, which also
8409 * includes additional metadata unnecessary for retrieving the file data
8410 * contents, such as atime, mtime, ctime, etc., also be committed to stable
8411 * storage.
8412 *
8413 * Parameters: p The process
8414 * uap->fd The descriptor to synchronize
8415 * flags The data integrity flags
8416 *
8417 * Returns: int Success
8418 * fp_getfvp:EBADF Bad file descriptor
8419 * fp_getfvp:ENOTSUP fd does not refer to a vnode
8420 * VNOP_FSYNC:??? unspecified
8421 *
8422 * Notes: We use struct fsync_args because it is a short name, and all
8423 * caller argument structures are otherwise identical.
8424 */
static int
fsync_common(proc_t p, struct fsync_args *uap, int flags)
{
	vnode_t vp;
	struct fileproc *fp;
	vfs_context_t ctx = vfs_context_current();
	int error;

	AUDIT_ARG(fd, uap->fd);

	if ((error = fp_getfvp(p, uap->fd, &fp, &vp))) {
		return error;
	}
	/* Acquire an iocount before issuing the fsync. */
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	/* flags is MNT_WAIT (file integrity) or MNT_DWAIT (data integrity). */
	error = VNOP_FSYNC(vp, flags, ctx);

#if NAMEDRSRCFORK
	/* Sync resource fork shadow file if necessary. */
	if ((error == 0) &&
	    (vp->v_flag & VISNAMEDSTREAM) &&
	    (vp->v_parent != NULLVP) &&
	    vnode_isshadow(vp) &&
	    (fp->fp_glob->fg_flag & FWASWRITTEN)) {
		(void) vnode_flushnamedstream(vp->v_parent, vp, ctx);
	}
#endif

	(void)vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
8462
8463 /*
8464 * Duplicate files. Source must be a file, target must be a file or
8465 * must not exist.
8466 *
8467 * XXX Copyfile authorisation checking is woefully inadequate, and will not
8468 * perform inheritance correctly.
8469 */
8470 /* ARGSUSED */
int
copyfile(__unused proc_t p, struct copyfile_args *uap, __unused int32_t *retval)
{
	vnode_t tvp, fvp, tdvp, sdvp;
	struct nameidata fromnd, tond;
	int error;
	vfs_context_t ctx = vfs_context_current();

	/* Check that the flags are valid. */
	if (uap->flags & ~CPF_MASK) {
		return EINVAL;
	}

	NDINIT(&fromnd, LOOKUP, OP_COPYFILE, AUDITVNPATH1,
	    UIO_USERSPACE, uap->from, ctx);
	if ((error = namei(&fromnd))) {
		return error;
	}
	fvp = fromnd.ni_vp;

	NDINIT(&tond, CREATE, OP_LINK,
	    LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART | AUDITVNPATH2 | CN_NBMOUNTLOOK,
	    UIO_USERSPACE, uap->to, ctx);
	if ((error = namei(&tond))) {
		goto out1;
	}
	tdvp = tond.ni_dvp;
	tvp = tond.ni_vp;

	/* An existing target is only acceptable with CPF_OVERWRITE. */
	if (tvp != NULL) {
		if (!(uap->flags & CPF_OVERWRITE)) {
			error = EEXIST;
			goto out;
		}
	}

	/* Neither source nor target may be a directory. */
	if (fvp->v_type == VDIR || (tvp && tvp->v_type == VDIR)) {
		error = EISDIR;
		goto out;
	}

	if (fvp->v_type == VSOCK && fvp->v_tag != VT_FDESC) {
		error = EOPNOTSUPP;
		goto out;
	}

#if CONFIG_MACF
	if ((error = mac_vnode_check_copyfile(ctx, tdvp, tvp, fvp, &tond.ni_cnd, (mode_t)uap->mode, uap->flags)) != 0) {
		goto out;
	}
#endif /* CONFIG_MACF */

	/* Must be able to read the source and create/replace the target. */
	if ((error = vnode_authorize(fvp, NULL, KAUTH_VNODE_READ_DATA, ctx)) != 0) {
		goto out;
	}
	if (tvp) {
		if ((error = vnode_authorize(tvp, tdvp, KAUTH_VNODE_DELETE, ctx)) != 0) {
			goto out;
		}
	}
	if ((error = vnode_authorize(tdvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
		goto out;
	}

	/* The source cannot be the target's parent directory. */
	if (fvp == tdvp) {
		error = EINVAL;
	}
	/*
	 * If source is the same as the destination (that is the
	 * same inode number) then there is nothing to do.
	 * (fixed to have POSIX semantics - CSM 3/2/98)
	 */
	if (fvp == tvp) {
		/* -1 is an internal marker meaning "success, nothing to do". */
		error = -1;
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(tdvp, false, O_WRONLY);
#endif

	if (!error) {
		error = VNOP_COPYFILE(fvp, tdvp, tvp, &tond.ni_cnd, uap->mode, uap->flags, ctx);
	}
out:
	sdvp = tond.ni_startdir;
	/*
	 * nameidone has to happen before we vnode_put(tdvp)
	 * since it may need to release the fs_nodelock on the tdvp
	 */
	nameidone(&tond);

	if (tvp) {
		vnode_put(tvp);
	}
	vnode_put(tdvp);
	/* SAVESTART retained the start directory; release it too. */
	vnode_put(sdvp);
out1:
	vnode_put(fvp);

	nameidone(&fromnd);

	if (error == -1) {
		/* Source and target were the same object: report success. */
		return 0;
	}
	return error;
}
8577
8578 #define CLONE_SNAPSHOT_FALLBACKS_ENABLED 1
8579
8580 /*
8581 * Helper function for doing clones. The caller is expected to provide an
8582 * iocounted source vnode and release it.
8583 */
static int
clonefile_internal(vnode_t fvp, boolean_t data_read_authorised, int dst_dirfd,
    user_addr_t dst, uint32_t flags, vfs_context_t ctx)
{
	vnode_t tvp, tdvp;
	struct nameidata tond;
	int error;
	int follow;
	boolean_t free_src_acl;
	boolean_t attr_cleanup;
	enum vtype v_type;
	kauth_action_t action;
	struct componentname *cnp;
	uint32_t defaulted = 0;
	struct vnode_attr va;
	struct vnode_attr nva;
	uint32_t vnop_flags;

	/* Only regular files, symlinks and non-root directories can be cloned. */
	v_type = vnode_vtype(fvp);
	switch (v_type) {
	case VLNK:
		/* FALLTHRU */
	case VREG:
		action = KAUTH_VNODE_ADD_FILE;
		break;
	case VDIR:
		if (vnode_isvroot(fvp) || vnode_ismount(fvp) ||
		    fvp->v_mountedhere) {
			return EINVAL;
		}
		action = KAUTH_VNODE_ADD_SUBDIRECTORY;
		break;
	default:
		return EINVAL;
	}

	AUDIT_ARG(fd2, dst_dirfd);
	AUDIT_ARG(value32, flags);

	/* Resolve the destination; the leaf must not exist yet. */
	follow = (flags & CLONE_NOFOLLOW) ? NOFOLLOW : FOLLOW;
	NDINIT(&tond, CREATE, OP_LINK, follow | WANTPARENT | AUDITVNPATH2,
	    UIO_USERSPACE, dst, ctx);
	if ((error = nameiat(&tond, dst_dirfd))) {
		return error;
	}
	cnp = &tond.ni_cnd;
	tdvp = tond.ni_dvp;
	tvp = tond.ni_vp;

	free_src_acl = FALSE;
	attr_cleanup = FALSE;

	if (tvp != NULL) {
		error = EEXIST;
		goto out;
	}

	/* Cloning only works within a single mount. */
	if (vnode_mount(tdvp) != vnode_mount(fvp)) {
		error = EXDEV;
		goto out;
	}

#if CONFIG_MACF
	if ((error = mac_vnode_check_clone(ctx, tdvp, fvp, cnp))) {
		goto out;
	}
#endif
	if ((error = vnode_authorize(tdvp, NULL, action, ctx))) {
		goto out;
	}

	/* The fd-based caller already established read access to the data. */
	action = KAUTH_VNODE_GENERIC_READ_BITS;
	if (data_read_authorised) {
		action &= ~KAUTH_VNODE_READ_DATA;
	}
	if ((error = vnode_authorize(fvp, NULL, action, ctx))) {
		goto out;
	}

	/*
	 * certain attributes may need to be changed from the source, we ask for
	 * those here with the exception of source file's ACLs unless the CLONE_ACL
	 * flag is specified. By default, the clone file will inherit the target
	 * directory's ACLs unless the CLONE_ACL flag is specified then it
	 * will inherit the source file's ACLs instead.
	 */
	VATTR_INIT(&va);
	VATTR_WANTED(&va, va_uid);
	VATTR_WANTED(&va, va_gid);
	VATTR_WANTED(&va, va_mode);
	VATTR_WANTED(&va, va_flags);
	if (flags & CLONE_ACL) {
		VATTR_WANTED(&va, va_acl);
	}

	if ((error = vnode_getattr(fvp, &va, ctx)) != 0) {
		goto out;
	}

	VATTR_INIT(&nva);
	VATTR_SET(&nva, va_type, v_type);
	if (VATTR_IS_SUPPORTED(&va, va_acl) && va.va_acl != NULL) {
		VATTR_SET(&nva, va_acl, va.va_acl);
		/* The returned ACL must be freed once the clone completes. */
		free_src_acl = TRUE;
	}

	/* Handle ACL inheritance, initialize vap. */
	if (v_type == VLNK) {
		error = vnode_authattr_new(tdvp, &nva, 0, ctx);
	} else {
		error = vn_attribute_prepare(tdvp, &nva, &defaulted, ctx);
		if (error) {
			goto out;
		}
		attr_cleanup = TRUE;
	}

	vnop_flags = VNODE_CLONEFILE_DEFAULT;
	/*
	 * We've got initial values for all security parameters,
	 * If we are superuser, then we can change owners to be the
	 * same as the source. Both superuser and the owner have default
	 * WRITE_SECURITY privileges so all other fields can be taken
	 * from source as well.
	 */
	if (!(flags & CLONE_NOOWNERCOPY) && vfs_context_issuser(ctx)) {
		if (VATTR_IS_SUPPORTED(&va, va_uid)) {
			VATTR_SET(&nva, va_uid, va.va_uid);
		}
		if (VATTR_IS_SUPPORTED(&va, va_gid)) {
			VATTR_SET(&nva, va_gid, va.va_gid);
		}
	} else {
		vnop_flags |= VNODE_CLONEFILE_NOOWNERCOPY;
	}

	if (VATTR_IS_SUPPORTED(&va, va_mode)) {
		VATTR_SET(&nva, va_mode, va.va_mode);
	}
	if (VATTR_IS_SUPPORTED(&va, va_flags)) {
		VATTR_SET(&nva, va_flags,
		    ((va.va_flags & ~(UF_DATAVAULT | SF_RESTRICTED)) | /* Turn off from source */
		    (nva.va_flags & (UF_DATAVAULT | SF_RESTRICTED))));
	}

#if CONFIG_FILE_LEASES
	/* Break any lease on the destination directory before creating. */
	vnode_breakdirlease(tdvp, false, O_WRONLY);
#endif

	error = VNOP_CLONEFILE(fvp, tdvp, &tvp, cnp, &nva, vnop_flags, ctx);

	if (!error && tvp) {
		int update_flags = 0;
#if CONFIG_FSE
		int fsevent;
#endif /* CONFIG_FSE */

		/*
		 * If some of the requested attributes weren't handled by the
		 * VNOP, use our fallback code.
		 */
		if (!VATTR_ALL_SUPPORTED(&nva)) {
			(void)vnode_setattr_fallback(tvp, &nva, ctx);
		}

#if CONFIG_MACF
		(void)vnode_label(vnode_mount(tvp), tdvp, tvp, cnp,
		    VNODE_LABEL_CREATE, ctx);
#endif

		// Make sure the name & parent pointers are hooked up
		if (tvp->v_name == NULL) {
			update_flags |= VNODE_UPDATE_NAME;
		}
		if (tvp->v_parent == NULLVP) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		if (update_flags) {
			(void)vnode_update_identity(tvp, tdvp, cnp->cn_nameptr,
			    cnp->cn_namelen, cnp->cn_hash, update_flags);
		}

#if CONFIG_FSE
		switch (vnode_vtype(tvp)) {
		case VLNK:
			/* FALLTHRU */
		case VREG:
			fsevent = FSE_CREATE_FILE;
			break;
		case VDIR:
			fsevent = FSE_CREATE_DIR;
			break;
		default:
			goto out;
		}

		if (need_fsevent(fsevent, tvp)) {
			/*
			 * The following is a sequence of three explicit events.
			 * A pair of FSE_CLONE events representing the source and destination
			 * followed by an FSE_CREATE_[FILE | DIR] for the destination.
			 * fseventsd may coalesce the destination clone and create events
			 * into a single event resulting in the following sequence for a client
			 * FSE_CLONE (src)
			 * FSE_CLONE | FSE_CREATE (dst)
			 */
			add_fsevent(FSE_CLONE, ctx, FSE_ARG_VNODE, fvp, FSE_ARG_VNODE, tvp,
			    FSE_ARG_DONE);
			add_fsevent(fsevent, ctx, FSE_ARG_VNODE, tvp,
			    FSE_ARG_DONE);
		}
#endif /* CONFIG_FSE */
	}

out:
	if (attr_cleanup) {
		vn_attribute_cleanup(&nva, defaulted);
	}
	if (free_src_acl && va.va_acl) {
		kauth_acl_free(va.va_acl);
	}
	nameidone(&tond);
	if (tvp) {
		vnode_put(tvp);
	}
	vnode_put(tdvp);
	return error;
}
8813
8814 /*
8815 * clone files or directories, target must not exist.
8816 */
8817 /* ARGSUSED */
int
clonefileat(__unused proc_t p, struct clonefileat_args *uap,
    __unused int32_t *retval)
{
	vnode_t fvp;
	struct nameidata fromnd;
	int follow;
	int error;
	vfs_context_t ctx = vfs_context_current();

	/* Check that the flags are valid. */
	if (uap->flags & ~(CLONE_NOFOLLOW | CLONE_NOOWNERCOPY | CLONE_ACL)) {
		return EINVAL;
	}

	AUDIT_ARG(fd, uap->src_dirfd);

	/* Resolve the source; CLONE_NOFOLLOW clones the symlink itself. */
	follow = (uap->flags & CLONE_NOFOLLOW) ? NOFOLLOW : FOLLOW;
	NDINIT(&fromnd, LOOKUP, OP_COPYFILE, follow | AUDITVNPATH1,
	    UIO_USERSPACE, uap->src, ctx);
	if ((error = nameiat(&fromnd, uap->src_dirfd))) {
		return error;
	}

	fvp = fromnd.ni_vp;
	nameidone(&fromnd);

	/* FALSE: read access on the source has not been authorized yet. */
	error = clonefile_internal(fvp, FALSE, uap->dst_dirfd, uap->dst,
	    uap->flags, ctx);

	vnode_put(fvp);
	return error;
}
8851
int
fclonefileat(__unused proc_t p, struct fclonefileat_args *uap,
    __unused int32_t *retval)
{
	vnode_t fvp;
	struct fileproc *fp;
	int error;
	vfs_context_t ctx = vfs_context_current();

	/* Check that the flags are valid. */
	if (uap->flags & ~(CLONE_NOFOLLOW | CLONE_NOOWNERCOPY | CLONE_ACL)) {
		return EINVAL;
	}

	AUDIT_ARG(fd, uap->src_fd);
	error = fp_getfvp(p, uap->src_fd, &fp, &fvp);
	if (error) {
		return error;
	}

	/* The source descriptor must have been opened for reading. */
	if ((fp->fp_glob->fg_flag & FREAD) == 0) {
		AUDIT_ARG(vnpath_withref, fvp, ARG_VNODE1);
		error = EBADF;
		goto out;
	}

	if ((error = vnode_getwithref(fvp))) {
		goto out;
	}

	AUDIT_ARG(vnpath, fvp, ARG_VNODE1);

	/* TRUE: read access was effectively authorized at open time. */
	error = clonefile_internal(fvp, TRUE, uap->dst_dirfd, uap->dst,
	    uap->flags, ctx);

	vnode_put(fvp);
out:
	file_drop(uap->src_fd);
	return error;
}
8892
8893 static int
rename_submounts_callback(mount_t mp,void * arg)8894 rename_submounts_callback(mount_t mp, void *arg)
8895 {
8896 int error = 0;
8897 mount_t pmp = (mount_t)arg;
8898 int prefix_len = (int)strlen(pmp->mnt_vfsstat.f_mntonname);
8899
8900 if (strncmp(mp->mnt_vfsstat.f_mntonname, pmp->mnt_vfsstat.f_mntonname, prefix_len) != 0) {
8901 return 0;
8902 }
8903
8904 if (mp->mnt_vfsstat.f_mntonname[prefix_len] != '/') {
8905 return 0;
8906 }
8907
8908 if ((error = vfs_busy(mp, LK_NOWAIT))) {
8909 printf("vfs_busy failed with %d for %s\n", error, mp->mnt_vfsstat.f_mntonname);
8910 return -1;
8911 }
8912
8913 size_t pathlen = MAXPATHLEN;
8914 if ((error = vn_getpath_ext(mp->mnt_vnodecovered, NULL, mp->mnt_vfsstat.f_mntonname, &pathlen, VN_GETPATH_FSENTER))) {
8915 printf("vn_getpath_ext failed with %d for mnt_vnodecovered of %s\n", error, mp->mnt_vfsstat.f_mntonname);
8916 }
8917
8918 vfs_unbusy(mp);
8919
8920 return error;
8921 }
8922
8923 /*
8924 * Rename files. Source and destination must either both be directories,
8925 * or both not be directories. If target is a directory, it must be empty.
8926 */
8927 /* ARGSUSED */
/*
 * renameat_internal: common backend for rename(2), renameat(2) and
 * renameatx_np(2).
 *
 * Parameters:	ctx	Context to authorize and perform the rename under
 *		fromfd	Directory fd the 'from' path is relative to (or AT_FDCWD)
 *		from	Source path (interpreted per 'segflg')
 *		tofd	Directory fd the 'to' path is relative to (or AT_FDCWD)
 *		to	Destination path (interpreted per 'segflg')
 *		segflg	UIO segment flag for both path arguments
 *		uflags	RENAME_* user flags; only the VFS_RENAME_FLAGS_MASK
 *			subset is forwarded to the filesystem
 *
 * Returns:	0	Success
 *		!0	errno from lookup, authorization or the rename VNOP
 *
 * The call may internally re-drive itself from the top ("retry") when a race
 * is detected after lookup (ENOENT/ERECYCLE), when a dataless object must be
 * materialized first, or when a tree-reshaping directory rename requires
 * taking the per-mount rename lock before redoing both lookups.
 */
static int
renameat_internal(vfs_context_t ctx, int fromfd, user_addr_t from,
    int tofd, user_addr_t to, int segflg, u_int uflags)
{
	vnode_t tvp, tdvp;
	vnode_t fvp, fdvp;
	vnode_t mnt_fvp;
	struct nameidata *fromnd, *tond;
	int error = 0;
	int do_retry;
	int retry_count;
	int mntrename;
	int need_event;
	int need_kpath2;
	int has_listeners;
	const char *oname = NULL;
	char *from_name = NULL, *to_name = NULL;
	char *from_name_no_firmlink = NULL, *to_name_no_firmlink = NULL;
	int from_len = 0, to_len = 0;
	int from_len_no_firmlink = 0, to_len_no_firmlink = 0;
	int holding_mntlock;
	int vn_authorize_skipped;
	mount_t locked_mp = NULL;
	vnode_t oparent = NULLVP;
#if CONFIG_FSE
	fse_info from_finfo = {}, to_finfo;
#endif
	int from_truncated = 0, to_truncated = 0;
	int from_truncated_no_firmlink = 0, to_truncated_no_firmlink = 0;
	int batched = 0;
	struct vnode_attr *fvap, *tvap;
	/* NOTE(review): 'continuing' is never set non-zero in this function;
	 * the '|| !continuing' lookup conditions appear vestigial — confirm. */
	int continuing = 0;
	vfs_rename_flags_t flags = uflags & VFS_RENAME_FLAGS_MASK;
	int32_t nofollow_any = 0;
	/* carving out a chunk for structs that are too big to be on stack. */
	struct {
		struct nameidata from_node, to_node;
		struct vnode_attr fv_attr, tv_attr;
	} * __rename_data;

	__rename_data = kalloc_type(typeof(*__rename_data), Z_WAITOK);
	fromnd = &__rename_data->from_node;
	tond = &__rename_data->to_node;

	holding_mntlock = 0;
	do_retry = 0;
	retry_count = 0;
retry:
	/* Reset all per-attempt state; everything below may run repeatedly. */
	fvp = tvp = NULL;
	fdvp = tdvp = NULL;
	fvap = tvap = NULL;
	mnt_fvp = NULLVP;
	mntrename = FALSE;
	vn_authorize_skipped = FALSE;

	if (uflags & RENAME_NOFOLLOW_ANY) {
		nofollow_any = NAMEI_NOFOLLOW_ANY;
	}
	NDINIT(fromnd, DELETE, OP_UNLINK, WANTPARENT | AUDITVNPATH1,
	    segflg, from, ctx);
	fromnd->ni_flag = NAMEI_COMPOUNDRENAME | nofollow_any;

	NDINIT(tond, RENAME, OP_RENAME, WANTPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK,
	    segflg, to, ctx);
	tond->ni_flag = NAMEI_COMPOUNDRENAME | nofollow_any;

continue_lookup:
	/* Re-entered after EKEEPLOOKING; only redo the lookup(s) that asked. */
	if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
		if ((error = nameiat(fromnd, fromfd))) {
			goto out1;
		}
		fdvp = fromnd->ni_dvp;
		fvp = fromnd->ni_vp;

		if (fvp && fvp->v_type == VDIR) {
			tond->ni_cnd.cn_flags |= WILLBEDIR;
		}
	}

	if ((tond->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
		if ((error = nameiat(tond, tofd))) {
			/*
			 * Translate error code for rename("dir1", "dir2/.").
			 */
			if (error == EISDIR && fvp->v_type == VDIR) {
				error = EINVAL;
			}
			goto out1;
		}
		tdvp = tond->ni_dvp;
		tvp = tond->ni_vp;
	}

#if DEVELOPMENT || DEBUG
	/*
	 * XXX VSWAP: Check for entitlements or special flag here
	 * so we can restrict access appropriately.
	 */
#else /* DEVELOPMENT || DEBUG */

	/* Swap-backing files may only be renamed by the kernel itself. */
	if (fromnd->ni_vp && vnode_isswap(fromnd->ni_vp) && (ctx != vfs_context_kernel())) {
		error = EPERM;
		goto out1;
	}

	if (tond->ni_vp && vnode_isswap(tond->ni_vp) && (ctx != vfs_context_kernel())) {
		error = EPERM;
		goto out1;
	}
#endif /* DEVELOPMENT || DEBUG */

	/* RENAME_SWAP requires that the destination exist. */
	if (!tvp && ISSET(flags, VFS_RENAME_SWAP)) {
		error = ENOENT;
		goto out1;
	}

	if (tvp && ISSET(flags, VFS_RENAME_EXCL)) {
		int32_t pval = 0;
		int err = 0;

		/*
		 * We allow rename with VFS_RENAME_EXCL flag for an existing file which
		 * has the same name as target iff the following conditions are met:
		 * 1. the target file system is case insensitive
		 * 2. source and target directories are the same
		 * 3. source and target files are the same
		 * 4. name only differs in case (determined by underlying filesystem)
		 */
		if (fvp != tvp || fdvp != tdvp) {
			error = EEXIST;
			goto out1;
		}

		/*
		 * Assume that the target file system is case sensitive if
		 * _PC_CASE_SENSITIVE selector isn't supported.
		 */
		err = VNOP_PATHCONF(tvp, _PC_CASE_SENSITIVE, &pval, ctx);
		if (err != 0 || pval != 0) {
			error = EEXIST;
			goto out1;
		}
	}

	batched = vnode_compound_rename_available(fdvp);

#if CONFIG_FSE
	need_event = need_fsevent(FSE_RENAME, fdvp);
	if (need_event) {
		if (fvp) {
			get_fse_info(fvp, &from_finfo, ctx);
		} else {
			error = vfs_get_notify_attributes(&__rename_data->fv_attr);
			if (error) {
				goto out1;
			}

			fvap = &__rename_data->fv_attr;
		}

		if (tvp) {
			get_fse_info(tvp, &to_finfo, ctx);
		} else if (batched) {
			error = vfs_get_notify_attributes(&__rename_data->tv_attr);
			if (error) {
				goto out1;
			}

			tvap = &__rename_data->tv_attr;
		}
	}
#else
	need_event = 0;
#endif /* CONFIG_FSE */

	has_listeners = kauth_authorize_fileop_has_listeners();

	need_kpath2 = 0;
#if CONFIG_AUDIT
	if (AUDIT_RECORD_EXISTS()) {
		need_kpath2 = 1;
	}
#endif

	/* Build the full source path(s) only if someone will consume them. */
	if (need_event || has_listeners) {
		if (from_name == NULL) {
			GET_PATH(from_name);
		}

		from_len = safe_getpath(fdvp, fromnd->ni_cnd.cn_nameptr, from_name, MAXPATHLEN, &from_truncated);

		if (from_name_no_firmlink == NULL) {
			GET_PATH(from_name_no_firmlink);
		}

		from_len_no_firmlink = safe_getpath_no_firmlink(fdvp, fromnd->ni_cnd.cn_nameptr, from_name_no_firmlink, MAXPATHLEN, &from_truncated_no_firmlink);
	}

	if (need_event || need_kpath2 || has_listeners) {
		if (to_name == NULL) {
			GET_PATH(to_name);
		}

		to_len = safe_getpath(tdvp, tond->ni_cnd.cn_nameptr, to_name, MAXPATHLEN, &to_truncated);

		if (to_name_no_firmlink == NULL) {
			GET_PATH(to_name_no_firmlink);
		}

		to_len_no_firmlink = safe_getpath_no_firmlink(tdvp, tond->ni_cnd.cn_nameptr, to_name_no_firmlink, MAXPATHLEN, &to_truncated_no_firmlink);
		if (to_name && need_kpath2) {
			AUDIT_ARG(kpath, to_name, ARG_KPATH2);
		}
	}
	if (!fvp) {
		/*
		 * Claim: this check will never reject a valid rename.
		 * For success, either fvp must be on the same mount as tdvp, or fvp must sit atop a vnode on the same mount as tdvp.
		 * Suppose fdvp and tdvp are not on the same mount.
		 * If fvp is on the same mount as tdvp, then fvp is not on the same mount as fdvp, so fvp is the root of its filesystem.  If fvp is the root,
		 * then you can't move it to within another dir on the same mountpoint.
		 * If fvp sits atop a vnode on the same mount as fdvp, then that vnode must be part of the same mount as fdvp, which is a contradiction.
		 *
		 * If this check passes, then we are safe to pass these vnodes to the same FS.
		 */
		if (fdvp->v_mount != tdvp->v_mount) {
			error = EXDEV;
			goto out1;
		}
		goto skipped_lookup;
	}

	/*
	 * If the source and destination are the same (i.e. they're
	 * links to the same vnode) and the target file system is
	 * case sensitive, then there is nothing to do.
	 *
	 * XXX Come back to this.
	 */
	if (fvp == tvp) {
		int pathconf_val;

		/*
		 * Note: if _PC_CASE_SENSITIVE selector isn't supported,
		 * then assume that this file system is case sensitive.
		 */
		if (VNOP_PATHCONF(fvp, _PC_CASE_SENSITIVE, &pathconf_val, ctx) != 0 ||
		    pathconf_val != 0) {
			vn_authorize_skipped = TRUE;
			goto out1;
		}
	}

	/*
	 * Allow the renaming of mount points.
	 * - target must not exist
	 * - target must reside in the same directory as source
	 * - union mounts cannot be renamed
	 * - the root fs, and tightly-linked system volumes, cannot be renamed
	 *
	 * XXX Handle this in VFS after a continued lookup (if we missed
	 * in the cache to start off)
	 *
	 * N.B. If RENAME_SWAP is being used, then @tvp != NULL and so
	 * we'll skip past here.  The file system is responsible for
	 * checking that @tvp is not a descendent of @fvp and vice versa
	 * so it should always return EINVAL if either @tvp or @fvp is the
	 * root of a volume.
	 */
	if ((fvp->v_flag & VROOT) &&
	    (fvp->v_type == VDIR) &&
	    (tvp == NULL) &&
	    (fvp->v_mountedhere == NULL) &&
	    (fdvp == tdvp) &&
	    ((fvp->v_mount->mnt_flag & (MNT_UNION | MNT_ROOTFS)) == 0) &&
	    ((fvp->v_mount->mnt_kern_flag & MNTK_SYSTEM) == 0) &&
	    (fvp->v_mount->mnt_vnodecovered != NULLVP)) {
		vnode_t coveredvp;

		/* switch fvp to the covered vnode */
		coveredvp = fvp->v_mount->mnt_vnodecovered;
		if ((vnode_getwithref(coveredvp))) {
			error = ENOENT;
			goto out1;
		}
		/*
		 * Save the 'fvp' as it is needed for vn_authorize_renamex_with_paths()
		 * later.
		 */
		mnt_fvp = fvp;

		fvp = coveredvp;
		mntrename = TRUE;
	}
	/*
	 * Check for cross-device rename.
	 */
	if ((fvp->v_mount != tdvp->v_mount) ||
	    (tvp && (fvp->v_mount != tvp->v_mount))) {
		error = EXDEV;
		goto out1;
	}

	/*
	 * If source is the same as the destination (that is the
	 * same inode number) then there is nothing to do...
	 * EXCEPT if the underlying file system supports case
	 * insensitivity and is case preserving.  In this case
	 * the file system needs to handle the special case of
	 * getting the same vnode as target (fvp) and source (tvp).
	 *
	 * Only file systems that support pathconf selectors _PC_CASE_SENSITIVE
	 * and _PC_CASE_PRESERVING can have this exception, and they need to
	 * handle the special case of getting the same vnode as target and
	 * source.  NOTE: Then the target is unlocked going into vnop_rename,
	 * so not to cause locking problems. There is a single reference on tvp.
	 *
	 * NOTE - that fvp == tvp also occurs if they are hard linked and
	 * that correct behaviour then is just to return success without doing
	 * anything.
	 *
	 * XXX filesystem should take care of this itself, perhaps...
	 */
	if (fvp == tvp && fdvp == tdvp) {
		if (fromnd->ni_cnd.cn_namelen == tond->ni_cnd.cn_namelen &&
		    !bcmp(fromnd->ni_cnd.cn_nameptr, tond->ni_cnd.cn_nameptr,
		    fromnd->ni_cnd.cn_namelen)) {
			vn_authorize_skipped = TRUE;
			goto out1;
		}
	}

	if (holding_mntlock && fvp->v_mount != locked_mp) {
		/*
		 * we're holding a reference and lock
		 * on locked_mp, but it no longer matches
		 * what we want to do... so drop our hold
		 */
		mount_unlock_renames(locked_mp);
		mount_drop(locked_mp, 0);
		holding_mntlock = 0;
	}
	if (tdvp != fdvp && fvp->v_type == VDIR) {
		/*
		 * serialize renames that re-shape
		 * the tree... if holding_mntlock is
		 * set, then we're ready to go...
		 * otherwise we
		 * first need to drop the iocounts
		 * we picked up, second take the
		 * lock to serialize the access,
		 * then finally start the lookup
		 * process over with the lock held
		 */
		if (!holding_mntlock) {
			/*
			 * need to grab a reference on
			 * the mount point before we
			 * drop all the iocounts... once
			 * the iocounts are gone, the mount
			 * could follow
			 */
			locked_mp = fvp->v_mount;
			mount_ref(locked_mp, 0);

			/*
			 * nameidone has to happen before we vnode_put(tvp)
			 * since it may need to release the fs_nodelock on the tvp
			 */
			nameidone(tond);

			if (tvp) {
				vnode_put(tvp);
			}
			vnode_put(tdvp);

			/*
			 * nameidone has to happen before we vnode_put(fdvp)
			 * since it may need to release the fs_nodelock on the fvp
			 */
			nameidone(fromnd);

			vnode_put(fvp);
			vnode_put(fdvp);

			if (mnt_fvp != NULLVP) {
				vnode_put(mnt_fvp);
			}

			mount_lock_renames(locked_mp);
			holding_mntlock = 1;

			/* redo both lookups with the rename lock held */
			goto retry;
		}
	} else {
		/*
		 * when we dropped the iocounts to take
		 * the lock, we allowed the identity of
		 * the various vnodes to change... if they did,
		 * we may no longer be dealing with a rename
		 * that reshapes the tree... once we're holding
		 * the iocounts, the vnodes can't change type
		 * so we're free to drop the lock at this point
		 * and continue on
		 */
		if (holding_mntlock) {
			mount_unlock_renames(locked_mp);
			mount_drop(locked_mp, 0);
			holding_mntlock = 0;
		}
	}

	if (!batched) {
		error = vn_authorize_renamex_with_paths(fdvp, mntrename ? mnt_fvp : fvp,
		    &fromnd->ni_cnd, from_name, tdvp, tvp, &tond->ni_cnd, to_name, ctx,
		    flags, NULL);
		if (error) {
			if (error == ENOENT) {
				if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
					/*
					 * We encountered a race where after doing the namei,
					 * tvp stops being valid. If so, simply re-drive the rename
					 * call from the top.
					 */
					do_retry = 1;
					retry_count += 1;
				}
			}
			goto out1;
		}
	}

	/* Release the 'mnt_fvp' now that it is no longer needed. */
	if (mnt_fvp != NULLVP) {
		vnode_put(mnt_fvp);
		mnt_fvp = NULLVP;
	}

	// save these off so we can later verify that fvp is the same
	oname   = fvp->v_name;
	oparent = fvp->v_parent;

skipped_lookup:
#if CONFIG_FILE_LEASES
	/* Lease break needed for source's parent dir? */
	vnode_breakdirlease(fdvp, false, O_WRONLY);

	/* Lease break needed for target's parent dir? */
	vnode_breakdirlease(tdvp, false, O_WRONLY);
#endif

	error = vn_rename(fdvp, &fvp, &fromnd->ni_cnd, fvap,
	    tdvp, &tvp, &tond->ni_cnd, tvap,
	    flags, ctx);

	if (holding_mntlock) {
		/*
		 * we can drop our serialization
		 * lock now
		 */
		mount_unlock_renames(locked_mp);
		mount_drop(locked_mp, 0);
		holding_mntlock = 0;
	}
	if (error) {
		if (error == EDATALESS) {
			/*
			 * If we've been here before, something has gone
			 * horribly wrong and we should just get out lest
			 * we spiral around the drain forever.
			 */
			if (flags & VFS_RENAME_DATALESS) {
				error = EIO;
				goto out1;
			}

			/*
			 * The object we're renaming is dataless (or has a
			 * dataless descendent) and requires materialization
			 * before the rename occurs.  But we're holding the
			 * mount point's rename lock, so it's not safe to
			 * make the upcall.
			 *
			 * In this case, we release the lock, perform the
			 * materialization, and start the whole thing over.
			 */
			error = vnode_materialize_dataless_file(fvp,
			    NAMESPACE_HANDLER_RENAME_OP);

			if (error == 0) {
				/*
				 * The next time around we need to tell the
				 * file system that the materializtaion has
				 * been performed.
				 */
				flags |= VFS_RENAME_DATALESS;
				do_retry = 1;
			}
			goto out1;
		}
		if (error == EKEEPLOOKING) {
			/* The FS wants VFS to finish one (or both) lookups. */
			if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) == 0) {
				if ((tond->ni_flag & NAMEI_CONTLOOKUP) == 0) {
					panic("EKEEPLOOKING without NAMEI_CONTLOOKUP on either ndp?");
				}
			}

			fromnd->ni_vp = fvp;
			tond->ni_vp = tvp;

			goto continue_lookup;
		}

		/*
		 * We may encounter a race in the VNOP where the destination didn't
		 * exist when we did the namei, but it does by the time we go and
		 * try to create the entry. In this case, we should re-drive this rename
		 * call from the top again.  Currently, only HFS bubbles out ERECYCLE,
		 * but other filesystems susceptible to this race could return it, too.
		 */
		if (error == ERECYCLE) {
			if (retry_count < MAX_RENAME_ERECYCLE_RETRIES) {
				do_retry = 1;
				retry_count += 1;
			} else {
				printf("rename retry limit due to ERECYCLE reached\n");
				error = ENOENT;
			}
		}

		/*
		 * For compound VNOPs, the authorization callback may return
		 * ENOENT in case of racing hardlink lookups hitting the name
		 * cache, redrive the lookup.
		 */
		if (batched && error == ENOENT) {
			if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
				do_retry = 1;
				retry_count += 1;
			}
		}

		goto out1;
	}

	/* call out to allow 3rd party notification of rename.
	 * Ignore result of kauth_authorize_fileop call.
	 */
	kauth_authorize_fileop(vfs_context_ucred(ctx),
	    KAUTH_FILEOP_RENAME,
	    (uintptr_t)from_name, (uintptr_t)to_name);
	if (flags & VFS_RENAME_SWAP) {
		kauth_authorize_fileop(vfs_context_ucred(ctx),
		    KAUTH_FILEOP_RENAME,
		    (uintptr_t)to_name, (uintptr_t)from_name);
	}

#if CONFIG_FSE
	if (from_name != NULL && to_name != NULL) {
		if (from_truncated || to_truncated) {
			// set it here since only the from_finfo gets reported up to user space
			from_finfo.mode |= FSE_TRUNCATED_PATH;
		}

		if (tvap && tvp) {
			vnode_get_fse_info_from_vap(tvp, &to_finfo, tvap);
		}
		if (fvap) {
			vnode_get_fse_info_from_vap(fvp, &from_finfo, fvap);
		}

		if (tvp) {
			add_fsevent(FSE_RENAME, ctx,
			    FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
			    FSE_ARG_FINFO, &from_finfo,
			    FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
			    FSE_ARG_FINFO, &to_finfo,
			    FSE_ARG_DONE);
			if (flags & VFS_RENAME_SWAP) {
				/*
				 * Strictly speaking, swap is the equivalent of
				 * *three* renames.  FSEvents clients should only take
				 * the events as a hint, so we only bother reporting
				 * two.
				 */
				add_fsevent(FSE_RENAME, ctx,
				    FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
				    FSE_ARG_FINFO, &to_finfo,
				    FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
				    FSE_ARG_FINFO, &from_finfo,
				    FSE_ARG_DONE);
			}
		} else {
			add_fsevent(FSE_RENAME, ctx,
			    FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
			    FSE_ARG_FINFO, &from_finfo,
			    FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
			    FSE_ARG_DONE);
		}
	}
#endif /* CONFIG_FSE */

	/*
	 * update filesystem's mount point data
	 */
	if (mntrename) {
		char *cp, *pathend, *mpname;
		char * tobuf;
		struct mount *mp;
		int maxlen;
		size_t len = 0;

		mp = fvp->v_mountedhere;

		if (vfs_busy(mp, LK_NOWAIT)) {
			error = EBUSY;
			goto out1;
		}
		tobuf = zalloc(ZV_NAMEI);

		if (UIO_SEG_IS_USER_SPACE(segflg)) {
			error = copyinstr(to, tobuf, MAXPATHLEN, &len);
		} else {
			error = copystr((void *)to, tobuf, MAXPATHLEN, &len);
		}
		if (!error) {
			/* find current mount point prefix */
			pathend = &mp->mnt_vfsstat.f_mntonname[0];
			for (cp = pathend; *cp != '\0'; ++cp) {
				if (*cp == '/') {
					pathend = cp + 1;
				}
			}
			/* find last component of target name */
			for (mpname = cp = tobuf; *cp != '\0'; ++cp) {
				if (*cp == '/') {
					mpname = cp + 1;
				}
			}

			/* Update f_mntonname of sub mounts */
			vfs_iterate(0, rename_submounts_callback, (void *)mp);

			/* append name to prefix */
			maxlen = MAXPATHLEN - (int)(pathend - mp->mnt_vfsstat.f_mntonname);
			bzero(pathend, maxlen);

			strlcpy(pathend, mpname, maxlen);
		}
		zfree(ZV_NAMEI, tobuf);

		vfs_unbusy(mp);

		vfs_event_signal(NULL, VQ_UPDATE, (intptr_t)NULL);
	}
	/*
	 * fix up name & parent pointers.  note that we first
	 * check that fvp has the same name/parent pointers it
	 * had before the rename call... this is a 'weak' check
	 * at best...
	 *
	 * XXX oparent and oname may not be set in the compound vnop case
	 */
	if (batched || (oname == fvp->v_name && oparent == fvp->v_parent)) {
		int update_flags;

		update_flags = VNODE_UPDATE_NAME;

		if (fdvp != tdvp) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		vnode_update_identity(fvp, tdvp, tond->ni_cnd.cn_nameptr, tond->ni_cnd.cn_namelen, tond->ni_cnd.cn_hash, update_flags);
	}
out1:
	/*
	 * There are some cases (for e.g. 'fvp == tvp') when vn_authorize was
	 * skipped earlier as no actual rename was performed.
	 */
	if (vn_authorize_skipped && error == 0) {
		error = vn_authorize_renamex_with_paths(fdvp, fvp,
		    &fromnd->ni_cnd, from_name, tdvp, tvp, &tond->ni_cnd, to_name, ctx,
		    flags, NULL);
		if (error && error == ENOENT) {
			if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
				do_retry = 1;
				retry_count += 1;
			}
		}
	}
	if (to_name != NULL) {
		RELEASE_PATH(to_name);
		to_name = NULL;
	}
	if (to_name_no_firmlink != NULL) {
		RELEASE_PATH(to_name_no_firmlink);
		to_name_no_firmlink = NULL;
	}
	if (from_name != NULL) {
		RELEASE_PATH(from_name);
		from_name = NULL;
	}
	if (from_name_no_firmlink != NULL) {
		RELEASE_PATH(from_name_no_firmlink);
		from_name_no_firmlink = NULL;
	}
	if (holding_mntlock) {
		mount_unlock_renames(locked_mp);
		mount_drop(locked_mp, 0);
		holding_mntlock = 0;
	}
	if (tdvp) {
		/*
		 * nameidone has to happen before we vnode_put(tdvp)
		 * since it may need to release the fs_nodelock on the tdvp
		 */
		nameidone(tond);

		if (tvp) {
			vnode_put(tvp);
		}
		vnode_put(tdvp);
	}
	if (fdvp) {
		/*
		 * nameidone has to happen before we vnode_put(fdvp)
		 * since it may need to release the fs_nodelock on the fdvp
		 */
		nameidone(fromnd);

		if (fvp) {
			vnode_put(fvp);
		}
		vnode_put(fdvp);
	}
	if (mnt_fvp != NULLVP) {
		vnode_put(mnt_fvp);
	}
	/*
	 * If things changed after we did the namei, then we will re-drive
	 * this rename call from the top.
	 */
	if (do_retry) {
		do_retry = 0;
		goto retry;
	}

	kfree_type(typeof(*__rename_data), __rename_data);
	return error;
}
9678
9679 int
rename(__unused proc_t p,struct rename_args * uap,__unused int32_t * retval)9680 rename(__unused proc_t p, struct rename_args *uap, __unused int32_t *retval)
9681 {
9682 return renameat_internal(vfs_context_current(), AT_FDCWD, uap->from,
9683 AT_FDCWD, uap->to, UIO_USERSPACE, 0);
9684 }
9685
9686 int
renameatx_np(__unused proc_t p,struct renameatx_np_args * uap,__unused int32_t * retval)9687 renameatx_np(__unused proc_t p, struct renameatx_np_args *uap, __unused int32_t *retval)
9688 {
9689 if (uap->flags & ~(RENAME_SECLUDE | RENAME_EXCL | RENAME_SWAP | RENAME_NOFOLLOW_ANY)) {
9690 return EINVAL;
9691 }
9692
9693 if ((uap->flags & (RENAME_EXCL | RENAME_SWAP)) == (RENAME_EXCL | RENAME_SWAP)) {
9694 return EINVAL;
9695 }
9696
9697 return renameat_internal(vfs_context_current(), uap->fromfd, uap->from,
9698 uap->tofd, uap->to, UIO_USERSPACE, uap->flags);
9699 }
9700
9701 int
renameat(__unused proc_t p,struct renameat_args * uap,__unused int32_t * retval)9702 renameat(__unused proc_t p, struct renameat_args *uap, __unused int32_t *retval)
9703 {
9704 return renameat_internal(vfs_context_current(), uap->fromfd, uap->from,
9705 uap->tofd, uap->to, UIO_USERSPACE, 0);
9706 }
9707
9708 /*
9709 * Make a directory file.
9710 *
9711 * Returns: 0 Success
9712 * EEXIST
9713 * namei:???
9714 * vnode_authorize:???
9715 * vn_create:???
9716 */
9717 /* ARGSUSED */
/*
 * mkdir1at: create a directory at 'path' (interpreted per 'segflg'),
 * relative to directory fd 'fd' (or AT_FDCWD), with attributes 'vap'.
 *
 * Uses the compound-mkdir path when the filesystem supports it; the
 * continue_lookup label re-enters the lookup if the FS returns
 * EKEEPLOOKING from vn_create.
 */
static int
mkdir1at(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap, int fd,
    enum uio_seg segflg)
{
	vnode_t vp, dvp;
	int error;
	int update_flags = 0;
	int batched;
	struct nameidata nd;

	AUDIT_ARG(mode, vap->va_mode);
	NDINIT(&nd, CREATE, OP_MKDIR, LOCKPARENT | AUDITVNPATH1, segflg,
	    path, ctx);
	nd.ni_cnd.cn_flags |= WILLBEDIR;
	nd.ni_flag = NAMEI_COMPOUNDMKDIR;

continue_lookup:
	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* A vnode for the leaf means the target already exists. */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}

	batched = vnode_compound_mkdir_available(dvp);

	VATTR_SET(vap, va_type, VDIR);

	/*
	 * XXX
	 * Don't authorize in VFS for compound VNOP.... mkdir -p today assumes that it will
	 * only get EXISTS or EISDIR for existing path components, and not that it could see
	 * EACCESS/EPERM--so if we authorize for mkdir on "/" for "mkdir -p /tmp/foo/bar/baz"
	 * it will fail in a spurious  manner.  Need to figure out if this is valid behavior.
	 */
	if ((error = vn_authorize_mkdir(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0) {
		if (error == EACCES || error == EPERM) {
			int error2;

			nameidone(&nd);
			vnode_put(dvp);
			dvp = NULLVP;

			/*
			 * Try a lookup without "NAMEI_COMPOUNDVNOP" to make sure we return EEXIST
			 * rather than EACCESS if the target exists.
			 */
			NDINIT(&nd, LOOKUP, OP_MKDIR, AUDITVNPATH1, segflg,
			    path, ctx);
			error2 = nameiat(&nd, fd);
			if (error2) {
				goto out;
			} else {
				/* Target exists: report EEXIST, not the auth error. */
				vp = nd.ni_vp;
				error = EEXIST;
				goto out;
			}
		}

		goto out;
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

	/*
	 * make the directory
	 */
	if ((error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx)) != 0) {
		if (error == EKEEPLOOKING) {
			/* FS asked VFS to finish the lookup; re-enter it. */
			nd.ni_vp = vp;
			goto continue_lookup;
		}

		goto out;
	}

	// Make sure the name & parent pointers are hooked up
	if (vp->v_name == NULL) {
		update_flags |= VNODE_UPDATE_NAME;
	}
	if (vp->v_parent == NULLVP) {
		update_flags |= VNODE_UPDATE_PARENT;
	}

	if (update_flags) {
		vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
	}

#if CONFIG_FSE
	add_fsevent(FSE_CREATE_DIR, ctx, FSE_ARG_VNODE, vp, FSE_ARG_DONE);
#endif

out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	if (dvp) {
		vnode_put(dvp);
	}

	return error;
}
9833
9834 /*
9835 * mkdir_extended: Create a directory; with extended security (ACL).
9836 *
9837 * Parameters: p Process requesting to create the directory
9838 * uap User argument descriptor (see below)
9839 * retval (ignored)
9840 *
9841 * Indirect: uap->path Path of directory to create
9842 * uap->mode Access permissions to set
9843 * uap->xsecurity ACL to set
9844 *
9845 * Returns: 0 Success
9846 * !0 Not success
9847 *
9848 */
9849 int
mkdir_extended(proc_t p,struct mkdir_extended_args * uap,__unused int32_t * retval)9850 mkdir_extended(proc_t p, struct mkdir_extended_args *uap, __unused int32_t *retval)
9851 {
9852 int ciferror;
9853 kauth_filesec_t xsecdst;
9854 struct vnode_attr va;
9855
9856 AUDIT_ARG(owner, uap->uid, uap->gid);
9857
9858 xsecdst = NULL;
9859 if ((uap->xsecurity != USER_ADDR_NULL) &&
9860 ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)) {
9861 return ciferror;
9862 }
9863
9864 VATTR_INIT(&va);
9865 VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd.fd_cmask);
9866 if (xsecdst != NULL) {
9867 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
9868 va.va_vaflags |= VA_FILESEC_ACL;
9869 }
9870
9871 ciferror = mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
9872 UIO_USERSPACE);
9873 if (xsecdst != NULL) {
9874 kauth_filesec_free(xsecdst);
9875 }
9876 return ciferror;
9877 }
9878
9879 int
mkdir(proc_t p,struct mkdir_args * uap,__unused int32_t * retval)9880 mkdir(proc_t p, struct mkdir_args *uap, __unused int32_t *retval)
9881 {
9882 struct vnode_attr va;
9883
9884 VATTR_INIT(&va);
9885 VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd.fd_cmask);
9886
9887 return mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
9888 UIO_USERSPACE);
9889 }
9890
9891 int
mkdirat(proc_t p,struct mkdirat_args * uap,__unused int32_t * retval)9892 mkdirat(proc_t p, struct mkdirat_args *uap, __unused int32_t *retval)
9893 {
9894 struct vnode_attr va;
9895
9896 VATTR_INIT(&va);
9897 VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd.fd_cmask);
9898
9899 return mkdir1at(vfs_context_current(), uap->path, &va, uap->fd,
9900 UIO_USERSPACE);
9901 }
9902
/*
 * Common implementation backing rmdir(2) / the *at variants: remove the
 * directory named by 'dirpath', interpreted relative to 'fd' (AT_FDCWD for
 * the current working directory).
 *
 * Parameters:
 *	ctx		VFS context used for lookups and authorization
 *	fd		base directory fd for relative path resolution
 *	dirpath		user or kernel pathname, per 'segflg'
 *	segflg		address space of 'dirpath' (UIO_USERSPACE/UIO_SYSSPACE)
 *	unlink_flags	VNODE_REMOVE_* flags; VNODE_REMOVE_DATALESS_DIR
 *			enables the vn_remove() fallback for dataless dirs
 *
 * Returns: 0 on success, otherwise an errno value.
 */
static int
rmdirat_internal(vfs_context_t ctx, int fd, user_addr_t dirpath,
    enum uio_seg segflg, int unlink_flags)
{
	/*
	 * nameidata (and the fsevents vnode_attr, when built in) are large;
	 * heap-allocate them together rather than burning kernel stack.
	 */
	struct {
		struct nameidata nd;
#if CONFIG_FSE
		struct vnode_attr va;
#endif /* CONFIG_FSE */
	} *__rmdir_data;
	vnode_t vp, dvp;
	int error;
	struct nameidata *ndp;
	char *path = NULL;			/* pathname for fsevents / kauth listeners */
	char *no_firmlink_path = NULL;		/* same path with firmlinks resolved away */
	int len_path = 0;
	int len_no_firmlink_path = 0;
	int has_listeners = 0;
	int need_event = 0;
	int truncated_path = 0;
	int truncated_no_firmlink_path = 0;
	struct vnode_attr *vap = NULL;
	int restart_count = 0;			/* bounds ENOENT-race retries */
	int batched;				/* nonzero => compound (batched) rmdir VNOP */

	int restart_flag;

	__rmdir_data = kalloc_type(typeof(*__rmdir_data), Z_WAITOK);
	ndp = &__rmdir_data->nd;

	/*
	 * This loop exists to restart rmdir in the unlikely case that two
	 * processes are simultaneously trying to remove the same directory
	 * containing orphaned appleDouble files.
	 */
	do {
		NDINIT(ndp, DELETE, OP_RMDIR, LOCKPARENT | AUDITVNPATH1,
		    segflg, dirpath, ctx);
		/* Ask namei to stop early if the FS can do a compound rmdir. */
		ndp->ni_flag = NAMEI_COMPOUNDRMDIR;
continue_lookup:
		restart_flag = 0;
		vap = NULL;

		error = nameiat(ndp, fd);
		if (error) {
			goto err_out;
		}

		dvp = ndp->ni_dvp;
		vp = ndp->ni_vp;

		if (vp) {
			batched = vnode_compound_rmdir_available(vp);

			if (vp->v_flag & VROOT) {
				/*
				 * The root of a mounted filesystem cannot be deleted.
				 */
				error = EBUSY;
				goto out;
			}

#if DEVELOPMENT || DEBUG
			/*
			 * XXX VSWAP: Check for entitlements or special flag here
			 * so we can restrict access appropriately.
			 */
#else /* DEVELOPMENT || DEBUG */

			/* Swap-backing files may only be removed by the kernel itself. */
			if (vnode_isswap(vp) && (ctx != vfs_context_kernel())) {
				error = EPERM;
				goto out;
			}
#endif /* DEVELOPMENT || DEBUG */

			/*
			 * Removed a check here; we used to abort if vp's vid
			 * was not the same as what we'd seen the last time around.
			 * I do not think that check was valid, because if we retry
			 * and all dirents are gone, the directory could legitimately
			 * be recycled but still be present in a situation where we would
			 * have had permission to delete. Therefore, we won't make
			 * an effort to preserve that check now that we may not have a
			 * vp here.
			 */

			if (!batched) {
				error = vn_authorize_rmdir(dvp, vp, &ndp->ni_cnd, ctx, NULL);
				if (error) {
					/*
					 * ENOENT here can be a transient race with a
					 * concurrent remove; retry a bounded number of times.
					 */
					if (error == ENOENT) {
						if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
							restart_flag = 1;
							restart_count += 1;
						}
					}
					goto out;
				}
			}
		} else {
			/* No vp: lookup stopped early, so a compound rmdir must be available. */
			batched = 1;

			if (!vnode_compound_rmdir_available(dvp)) {
				panic("No error, but no compound rmdir?");
			}
		}

#if CONFIG_FSE
		fse_info finfo = {0};

		need_event = need_fsevent(FSE_DELETE, dvp);
		if (need_event) {
			if (!batched) {
				get_fse_info(vp, &finfo, ctx);
			} else {
				/*
				 * Batched path has no vp yet; have the FS gather the
				 * notification attributes during the compound VNOP.
				 */
				error = vfs_get_notify_attributes(&__rmdir_data->va);
				if (error) {
					goto out;
				}

				vap = &__rmdir_data->va;
			}
		}
#endif
		has_listeners = kauth_authorize_fileop_has_listeners();
		if (need_event || has_listeners) {
			/* Build the paths before the remove, while the entry still exists. */
			if (path == NULL) {
				GET_PATH(path);
			}

			len_path = safe_getpath(dvp, ndp->ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path);

			if (no_firmlink_path == NULL) {
				GET_PATH(no_firmlink_path);
			}

			len_no_firmlink_path = safe_getpath_no_firmlink(dvp, ndp->ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);
#if CONFIG_FSE
			if (truncated_no_firmlink_path) {
				finfo.mode |= FSE_TRUNCATED_PATH;
			}
#endif
		}

#if CONFIG_FILE_LEASES
		/* Break any directory lease on the parent before mutating it. */
		vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

		error = vn_rmdir(dvp, &vp, ndp, vap, ctx);
		ndp->ni_vp = vp;
		if (vp == NULLVP) {
			/* Couldn't find a vnode */
			goto out;
		}

		if (error == EKEEPLOOKING) {
			/* FS wants the lookup redriven (compound VNOP protocol). */
			goto continue_lookup;
		} else if (batched && error == ENOENT) {
			if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
				/*
				 * For compound VNOPs, the authorization callback
				 * may return ENOENT in case of racing hard link lookups
				 * redrive the lookup.
				 */
				restart_flag = 1;
				restart_count += 1;
				goto out;
			}
		}

		/*
		 * XXX There's no provision for passing flags
		 * to VNOP_RMDIR().  So, if vn_rmdir() fails
		 * because it's not empty, then we try again
		 * with VNOP_REMOVE(), passing in a special
		 * flag that clever file systems will know
		 * how to handle.
		 */
		if (error == ENOTEMPTY &&
		    (unlink_flags & VNODE_REMOVE_DATALESS_DIR) != 0) {
			/*
			 * If this fails, we want to keep the original
			 * error.
			 */
			if (vn_remove(dvp, &vp, ndp,
			    VNODE_REMOVE_DATALESS_DIR, vap, ctx) == 0) {
				error = 0;
			}
		}

#if CONFIG_APPLEDOUBLE
		/*
		 * Special case to remove orphaned AppleDouble
		 * files. I don't like putting this in the kernel,
		 * but carbon does not like putting this in carbon either,
		 * so here we are.
		 */
		if (error == ENOTEMPTY) {
			int ad_error = rmdir_remove_orphaned_appleDouble(vp, ctx, &restart_flag);
			if (ad_error == EBUSY) {
				error = ad_error;
				goto out;
			}


			/*
			 * Assuming everything went well, we will try the RMDIR again
			 */
			if (!ad_error) {
				error = vn_rmdir(dvp, &vp, ndp, vap, ctx);
			}
		}
#endif /* CONFIG_APPLEDOUBLE */
		/*
		 * Call out to allow 3rd party notification of delete.
		 * Ignore result of kauth_authorize_fileop call.
		 */
		if (!error) {
			if (has_listeners) {
				kauth_authorize_fileop(vfs_context_ucred(ctx),
				    KAUTH_FILEOP_DELETE,
				    (uintptr_t)vp,
				    (uintptr_t)path);
			}

			if (vp->v_flag & VISHARDLINK) {
				// see the comment in unlink1() about why we update
				// the parent of a hard link when it is removed
				vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
			}

#if CONFIG_FSE
			if (need_event) {
				if (vap) {
					vnode_get_fse_info_from_vap(vp, &finfo, vap);
				}
				add_fsevent(FSE_DELETE, ctx,
				    FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
				    FSE_ARG_FINFO, &finfo,
				    FSE_ARG_DONE);
			}
#endif

#if CONFIG_MACF
			mac_vnode_notify_unlink(ctx, dvp, vp, &ndp->ni_cnd);
#endif
		}

out:
		if (path != NULL) {
			RELEASE_PATH(path);
			path = NULL;
		}

		if (no_firmlink_path != NULL) {
			RELEASE_PATH(no_firmlink_path);
			no_firmlink_path = NULL;
		}

		/*
		 * nameidone has to happen before we vnode_put(dvp)
		 * since it may need to release the fs_nodelock on the dvp
		 */
		nameidone(ndp);
		vnode_put(dvp);

		if (vp) {
			vnode_put(vp);
		}

		/*
		 * NOTE(review): beyond this point 'vp' is used only as a
		 * sleep/wakeup channel address for racing rmdir callers; the
		 * reference was already dropped above, the vnode is not
		 * dereferenced again.
		 */
		if (restart_flag == 0) {
			wakeup_one((caddr_t)vp);
			goto err_out;
		}
		tsleep(vp, PVFS, "rm AD", 1);
	} while (restart_flag != 0);

err_out:
	kfree_type(typeof(*__rmdir_data), __rmdir_data);

	return error;
}
10184
10185 /*
10186 * Remove a directory file.
10187 */
10188 /* ARGSUSED */
10189 int
rmdir(__unused proc_t p,struct rmdir_args * uap,__unused int32_t * retval)10190 rmdir(__unused proc_t p, struct rmdir_args *uap, __unused int32_t *retval)
10191 {
10192 return rmdirat_internal(vfs_context_current(), AT_FDCWD,
10193 CAST_USER_ADDR_T(uap->path), UIO_USERSPACE, 0);
10194 }
10195
/*
 * Get direntry length padded to 8 byte alignment.
 * struct direntry embeds a MAXPATHLEN-sized name buffer; subtract the unused
 * tail so only the actual name (namlen) is accounted for.
 */
#define DIRENT64_LEN(namlen) \
	((sizeof(struct direntry) + (namlen) - (MAXPATHLEN-1) + 7) & ~7)

/*
 * Get dirent length padded to 4 byte alignment.
 * Analogous to DIRENT64_LEN but for the legacy struct dirent, whose embedded
 * name buffer is __DARWIN_MAXNAMLEN + 1 bytes; +1 covers the NUL terminator.
 */
#define DIRENT_LEN(namelen) \
	((sizeof(struct dirent) + (namelen + 1) - (__DARWIN_MAXNAMLEN + 1) + 3) & ~3)

/* Get the end of this dirent (address of its last byte, per d_reclen). */
#define DIRENT_END(dep) \
	(((char *)(dep)) + (dep)->d_reclen - 1)
10207
10208 errno_t
vnode_readdir64(struct vnode * vp,struct uio * uio,int flags,int * eofflag,int * numdirent,vfs_context_t ctxp)10209 vnode_readdir64(struct vnode *vp, struct uio *uio, int flags, int *eofflag,
10210 int *numdirent, vfs_context_t ctxp)
10211 {
10212 /* Check if fs natively supports VNODE_READDIR_EXTENDED */
10213 if ((vp->v_mount->mnt_vtable->vfc_vfsflags & VFC_VFSREADDIR_EXTENDED) &&
10214 ((vp->v_mount->mnt_kern_flag & MNTK_DENY_READDIREXT) == 0)) {
10215 return VNOP_READDIR(vp, uio, flags, eofflag, numdirent, ctxp);
10216 } else {
10217 size_t bufsize;
10218 void * bufptr;
10219 uio_t auio;
10220 struct direntry *entry64;
10221 struct dirent *dep;
10222 size_t bytesread;
10223 int error;
10224
10225 /*
10226 * We're here because the underlying file system does not
10227 * support direnties or we mounted denying support so we must
10228 * fall back to dirents and convert them to direntries.
10229 *
10230 * Our kernel buffer needs to be smaller since re-packing will
10231 * expand each dirent. The worse case (when the name length
10232 * is 3 or less) corresponds to a struct direntry size of 32
10233 * bytes (8-byte aligned) and a struct dirent size of 12 bytes
10234 * (4-byte aligned). So having a buffer that is 3/8 the size
10235 * will prevent us from reading more than we can pack.
10236 *
10237 * Since this buffer is wired memory, we will limit the
10238 * buffer size to a maximum of 32K. We would really like to
10239 * use 32K in the MIN(), but we use magic number 87371 to
10240 * prevent uio_resid() * 3 / 8 from overflowing.
10241 */
10242 bufsize = 3 * MIN((user_size_t)uio_resid(uio), 87371u) / 8;
10243 bufptr = kalloc_data(bufsize, Z_WAITOK);
10244 if (bufptr == NULL) {
10245 return ENOMEM;
10246 }
10247
10248 auio = uio_create(1, 0, UIO_SYSSPACE, UIO_READ);
10249 uio_addiov(auio, (uintptr_t)bufptr, bufsize);
10250 auio->uio_offset = uio->uio_offset;
10251
10252 error = VNOP_READDIR(vp, auio, 0, eofflag, numdirent, ctxp);
10253
10254 dep = (struct dirent *)bufptr;
10255 bytesread = bufsize - uio_resid(auio);
10256
10257 entry64 = kalloc_type(struct direntry, Z_WAITOK);
10258 /*
10259 * Convert all the entries and copy them out to user's buffer.
10260 */
10261 while (error == 0 && (char *)dep < ((char *)bufptr + bytesread)) {
10262 /* First check that the dirent struct up to d_name is within the buffer */
10263 if ((char*)dep + offsetof(struct dirent, d_name) > ((char *)bufptr + bytesread) ||
10264 /* Check that the length of the entire dirent is within the buffer */
10265 DIRENT_END(dep) > ((char *)bufptr + bytesread) ||
10266 /* Check that the actual length including the name doesn't exceed d_reclen */
10267 DIRENT_LEN(dep->d_namlen) > dep->d_reclen) {
10268 printf("%s: %s: Bad dirent recived from directory %s\n", __func__,
10269 vp->v_mount->mnt_vfsstat.f_mntonname,
10270 vp->v_name ? vp->v_name : "<unknown>");
10271 error = EIO;
10272 break;
10273 }
10274
10275 size_t enbufsize = DIRENT64_LEN(dep->d_namlen);
10276
10277 bzero(entry64, enbufsize);
10278 /* Convert a dirent to a dirent64. */
10279 entry64->d_ino = dep->d_ino;
10280 entry64->d_seekoff = 0;
10281 entry64->d_reclen = (uint16_t)enbufsize;
10282 entry64->d_namlen = dep->d_namlen;
10283 entry64->d_type = dep->d_type;
10284 bcopy(dep->d_name, entry64->d_name, dep->d_namlen + 1);
10285
10286 /* Move to next entry. */
10287 dep = (struct dirent *)((char *)dep + dep->d_reclen);
10288
10289 /* Copy entry64 to user's buffer. */
10290 error = uiomove((caddr_t)entry64, entry64->d_reclen, uio);
10291 }
10292
10293 /* Update the real offset using the offset we got from VNOP_READDIR. */
10294 if (error == 0) {
10295 uio->uio_offset = auio->uio_offset;
10296 }
10297 uio_free(auio);
10298 kfree_data(bufptr, bufsize);
10299 kfree_type(struct direntry, entry64);
10300 return error;
10301 }
10302 }
10303
/* Upper bound on a single getdirentries transfer; larger requests are clamped. */
#define GETDIRENTRIES_MAXBUFSIZE (128 * 1024 * 1024U)
10305
10306 /*
10307 * Read a block of directory entries in a file system independent format.
10308 */
/*
 * Shared engine for getdirentries(2) and getdirentries64(2): read directory
 * entries from the vnode backing 'fd' into the user buffer 'bufp'.
 *
 * Parameters:
 *	fd		open directory file descriptor (FREAD required)
 *	bufp		user buffer for entries
 *	bufsize		user buffer size (clamped to GETDIRENTRIES_MAXBUFSIZE)
 *	bytesread	out: bytes actually transferred
 *	offset		out (optional): file offset before this read
 *	eofflag		out: nonzero at end of directory
 *	flags		VNODE_READDIR_EXTENDED selects the direntry format
 *
 * Returns: 0 on success, otherwise an errno value.
 */
static int
getdirentries_common(int fd, user_addr_t bufp, user_size_t bufsize, ssize_t *bytesread,
    off_t *offset, int *eofflag, int flags)
{
	vnode_t vp;
	struct vfs_context context = *vfs_context_current(); /* local copy */
	struct fileproc *fp;
	uio_t auio;
	int spacetype = proc_is64bit(vfs_context_proc(&context)) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	off_t loff;
	int error, numdirent;
	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];

get_from_fd:
	error = fp_getfvp(vfs_context_proc(&context), fd, &fp, &vp);
	if (error) {
		return error;
	}

	/*
	 * Take the per-file offset lock, then re-check that the fd still maps
	 * to the vnode we looked up; a union-mount traversal (below, possibly
	 * in another thread) can swap the fd's data, in which case retry.
	 */
	vn_offset_lock(fp->fp_glob);
	if (((vnode_t)fp_get_data(fp)) != vp) {
		vn_offset_unlock(fp->fp_glob);
		file_drop(fd);
		goto get_from_fd;
	}

	if ((fp->fp_glob->fg_flag & FREAD) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EBADF;
		goto out;
	}

	if (bufsize > GETDIRENTRIES_MAXBUFSIZE) {
		bufsize = GETDIRENTRIES_MAXBUFSIZE;
	}

#if CONFIG_MACF
	error = mac_file_check_change_offset(vfs_context_ucred(&context), fp->fp_glob);
	if (error) {
		goto out;
	}
#endif

	if ((error = vnode_getwithref(vp))) {
		goto out;
	}
	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

#if CONFIG_UNION_MOUNTS
unionread:
#endif /* CONFIG_UNION_MOUNTS */
	if (vp->v_type != VDIR) {
		(void)vnode_put(vp);
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_readdir(&context, vp);
	if (error != 0) {
		(void)vnode_put(vp);
		goto out;
	}
#endif /* MAC */

	/* Read starting at the file's current offset; record it for the caller. */
	loff = fp->fp_glob->fg_offset;
	auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, bufp, bufsize);

	if (flags & VNODE_READDIR_EXTENDED) {
		error = vnode_readdir64(vp, auio, flags, eofflag, &numdirent, &context);
		fp->fp_glob->fg_offset = uio_offset(auio);
	} else {
		error = VNOP_READDIR(vp, auio, 0, eofflag, &numdirent, &context);
		fp->fp_glob->fg_offset = uio_offset(auio);
	}
	if (error) {
		(void)vnode_put(vp);
		goto out;
	}

#if CONFIG_UNION_MOUNTS
	/*
	 * Nothing was read and this is a union mount: drop down to the
	 * covered directory and read from there, replacing the fd's vnode.
	 */
	if ((user_ssize_t)bufsize == uio_resid(auio) &&
	    (vp->v_mount->mnt_flag & MNT_UNION)) {
		vnode_t uvp;

		if (lookup_traverse_union(vp, &uvp, &context) == 0) {
			if (vnode_ref(uvp) == 0) {
				fp_set_data(fp, uvp);
				fp->fp_glob->fg_offset = 0;
				vnode_rele(vp);
				vnode_put(vp);
				vp = uvp;
				goto unionread;
			} else {
				/* could not get a ref, can't replace in fd */
				vnode_put(uvp);
			}
		}
	}
#endif /* CONFIG_UNION_MOUNTS */

	vnode_put(vp);
	if (offset) {
		*offset = loff;
	}

	*bytesread = bufsize - uio_resid(auio);
out:
	vn_offset_unlock(fp->fp_glob);
	file_drop(fd);
	return error;
}
10422
10423
10424 int
getdirentries(__unused struct proc * p,struct getdirentries_args * uap,int32_t * retval)10425 getdirentries(__unused struct proc *p, struct getdirentries_args *uap, int32_t *retval)
10426 {
10427 off_t offset;
10428 ssize_t bytesread;
10429 int error, eofflag;
10430
10431 AUDIT_ARG(fd, uap->fd);
10432 error = getdirentries_common(uap->fd, uap->buf, uap->count,
10433 &bytesread, &offset, &eofflag, 0);
10434
10435 if (error == 0) {
10436 if (proc_is64bit(p)) {
10437 user64_long_t base = (user64_long_t)offset;
10438 error = copyout((caddr_t)&base, uap->basep, sizeof(user64_long_t));
10439 } else {
10440 user32_long_t base = (user32_long_t)offset;
10441 error = copyout((caddr_t)&base, uap->basep, sizeof(user32_long_t));
10442 }
10443 *retval = (int)bytesread;
10444 }
10445 return error;
10446 }
10447
10448 int
getdirentries64(__unused struct proc * p,struct getdirentries64_args * uap,user_ssize_t * retval)10449 getdirentries64(__unused struct proc *p, struct getdirentries64_args *uap, user_ssize_t *retval)
10450 {
10451 off_t offset;
10452 ssize_t bytesread;
10453 int error, eofflag;
10454 user_size_t bufsize;
10455
10456 AUDIT_ARG(fd, uap->fd);
10457
10458 /*
10459 * If the buffer is at least GETDIRENTRIES64_EXTENDED_BUFSIZE large,
10460 * then the kernel carves out the last 4 bytes to return extended
10461 * information to userspace (namely whether we reached EOF with this call).
10462 */
10463 if (uap->bufsize >= GETDIRENTRIES64_EXTENDED_BUFSIZE) {
10464 bufsize = uap->bufsize - sizeof(getdirentries64_flags_t);
10465 } else {
10466 bufsize = uap->bufsize;
10467 }
10468
10469 error = getdirentries_common(uap->fd, uap->buf, bufsize,
10470 &bytesread, &offset, &eofflag, VNODE_READDIR_EXTENDED);
10471
10472 if (error == 0) {
10473 *retval = bytesread;
10474 error = copyout((caddr_t)&offset, uap->position, sizeof(off_t));
10475
10476 if (error == 0 && uap->bufsize >= GETDIRENTRIES64_EXTENDED_BUFSIZE) {
10477 getdirentries64_flags_t flags = 0;
10478 if (eofflag) {
10479 flags |= GETDIRENTRIES64_EOF;
10480 }
10481 error = copyout(&flags, (user_addr_t)uap->buf + bufsize,
10482 sizeof(flags));
10483 }
10484 }
10485 return error;
10486 }
10487
10488
10489 /*
10490 * Set the mode mask for creation of filesystem nodes.
10491 * XXX implement xsecurity
10492 */
10493 #define UMASK_NOXSECURITY (void *)1 /* leave existing xsecurity alone */
10494 static int
umask1(proc_t p,int newmask,__unused kauth_filesec_t fsec,int32_t * retval)10495 umask1(proc_t p, int newmask, __unused kauth_filesec_t fsec, int32_t *retval)
10496 {
10497 AUDIT_ARG(mask, newmask);
10498 proc_fdlock(p);
10499 *retval = p->p_fd.fd_cmask;
10500 p->p_fd.fd_cmask = newmask & ALLPERMS;
10501 proc_fdunlock(p);
10502 return 0;
10503 }
10504
10505 /*
10506 * umask_extended: Set the mode mask for creation of filesystem nodes; with extended security (ACL).
10507 *
10508 * Parameters: p Process requesting to set the umask
10509 * uap User argument descriptor (see below)
10510 * retval umask of the process (parameter p)
10511 *
10512 * Indirect: uap->newmask umask to set
10513 * uap->xsecurity ACL to set
10514 *
10515 * Returns: 0 Success
10516 * !0 Not success
10517 *
10518 */
10519 int
umask_extended(proc_t p,struct umask_extended_args * uap,int32_t * retval)10520 umask_extended(proc_t p, struct umask_extended_args *uap, int32_t *retval)
10521 {
10522 return umask1(p, uap->newmask, KAUTH_FILESEC_NONE, retval);
10523 }
10524
10525 int
umask(proc_t p,struct umask_args * uap,int32_t * retval)10526 umask(proc_t p, struct umask_args *uap, int32_t *retval)
10527 {
10528 return umask1(p, uap->newmask, UMASK_NOXSECURITY, retval);
10529 }
10530
/* Entitlement allowing revocation of a device that is currently mounted. */
#define REVOKE_MOUNTED_DEVICE_ENTITLEMENT \
	"com.apple.private.vfs.revoke-mounted-device"
10533
10534 /*
10535 * Void all references to file by ripping underlying filesystem
10536 * away from vnode.
10537 */
10538 /* ARGSUSED */
/*
 * revoke(2): void all references to a character or block device file by
 * ripping the underlying filesystem away from the vnode.  The caller must
 * own the file or be superuser; a block device that is currently mounted
 * cannot be revoked.
 */
int
revoke(proc_t p, struct revoke_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	struct vnode_attr va;
	vfs_context_t ctx = vfs_context_current();
	int error;
	struct nameidata nd;

	NDINIT(&nd, LOOKUP, OP_REVOKE, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
	    uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;

	nameidone(&nd);

	/* Only device special files may be revoked. */
	if (!(vnode_ischr(vp) || vnode_isblk(vp))) {
		error = ENOTSUP;
		goto out;
	}

	/* Refuse to revoke a block device backing a live mount. */
	if (vnode_isblk(vp) && vnode_ismountedon(vp)) {
		error = EBUSY;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_revoke(ctx, vp);
	if (error) {
		goto out;
	}
#endif

	/* Caller must be the owner, or pass the superuser check. */
	VATTR_INIT(&va);
	VATTR_WANTED(&va, va_uid);
	if ((error = vnode_getattr(vp, &va, ctx))) {
		goto out;
	}
	if (kauth_cred_getuid(vfs_context_ucred(ctx)) != va.va_uid &&
	    (error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
		goto out;
	}
	/* Only bother revoking if someone is actually using the vnode. */
	if (vp->v_usecount > 0 || (vnode_isaliased(vp))) {
		VNOP_REVOKE(vp, REVOKEALL, ctx);
	}
out:
	vnode_put(vp);
	return error;
}
10591
10592
10593 /*
 * HFS/HFS Plus SPECIFIC SYSTEM CALLS
10595 * The following system calls are designed to support features
10596 * which are specific to the HFS & HFS Plus volume formats
10597 */
10598
10599
10600 /*
10601 * Obtain attribute information on objects in a directory while enumerating
10602 * the directory.
10603 */
10604 /* ARGSUSED */
/*
 * getdirentriesattr(2): enumerate a directory while simultaneously fetching
 * the requested attributes for each entry via VNOP_READDIRATTR.  Returns the
 * EOF indication in *retval, and copies the entry count, directory state
 * cookie, and base offset back to user space.
 */
int
getdirentriesattr(proc_t p, struct getdirentriesattr_args *uap, int32_t *retval)
{
	vnode_t vp;
	struct fileproc *fp;
	uio_t auio = NULL;
	int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	uint32_t count = 0, savecount = 0;
	uint32_t newstate = 0;
	int error, eofflag = 0;
	off_t loff = 0;
	struct attrlist attributelist;
	vfs_context_t ctx = vfs_context_current();
	int fd = uap->fd;
	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];
	kauth_action_t action;

	AUDIT_ARG(fd, fd);

	/* Get the attributes into kernel space */
	if ((error = copyin(uap->alist, (caddr_t)&attributelist, sizeof(attributelist)))) {
		return error;
	}
	if ((error = copyin(uap->count, (caddr_t)&count, sizeof(count)))) {
		return error;
	}
	/* Remember the caller's requested count so it can be reset per union layer. */
	savecount = count;

get_from_fd:
	if ((error = fp_getfvp(p, fd, &fp, &vp))) {
		return error;
	}

	/*
	 * Lock the per-file offset, then confirm the fd still maps to the
	 * vnode we resolved; a concurrent union-mount traversal may have
	 * swapped it, in which case drop everything and retry.
	 */
	vn_offset_lock(fp->fp_glob);
	if (((vnode_t)fp_get_data(fp)) != vp) {
		vn_offset_unlock(fp->fp_glob);
		file_drop(fd);
		goto get_from_fd;
	}

	if ((fp->fp_glob->fg_flag & FREAD) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EBADF;
		goto out;
	}


#if CONFIG_MACF
	error = mac_file_check_change_offset(vfs_context_ucred(ctx),
	    fp->fp_glob);
	if (error) {
		goto out;
	}
#endif


	if ((error = vnode_getwithref(vp))) {
		goto out;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

#if CONFIG_UNION_MOUNTS
unionread:
#endif /* CONFIG_UNION_MOUNTS */
	if (vp->v_type != VDIR) {
		(void)vnode_put(vp);
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_readdir(ctx, vp);
	if (error != 0) {
		(void)vnode_put(vp);
		goto out;
	}
#endif /* MAC */

	/* set up the uio structure which will contain the users return buffer */
	loff = fp->fp_glob->fg_offset;
	auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, uap->buffer, uap->buffersize);

	/*
	 * If the only item requested is file names, we can let that past with
	 * just LIST_DIRECTORY. If they want any other attributes, that means
	 * they need SEARCH as well.
	 */
	action = KAUTH_VNODE_LIST_DIRECTORY;
	if ((attributelist.commonattr & ~ATTR_CMN_NAME) ||
	    attributelist.fileattr || attributelist.dirattr) {
		action |= KAUTH_VNODE_SEARCH;
	}

	if ((error = vnode_authorize(vp, NULL, action, ctx)) == 0) {
		/* Believe it or not, uap->options only has 32-bits of valid
		 * info, so truncate before extending again */

		error = VNOP_READDIRATTR(vp, &attributelist, auio, count,
		    (uint32_t)uap->options, &newstate, &eofflag, &count, ctx);
	}

	if (error) {
		(void) vnode_put(vp);
		goto out;
	}

#if CONFIG_UNION_MOUNTS
	/*
	 * If we've got the last entry of a directory in a union mount
	 * then reset the eofflag and pretend there's still more to come.
	 * The next call will again set eofflag and the buffer will be empty,
	 * so traverse to the underlying directory and do the directory
	 * read there.
	 */
	if (eofflag && vp->v_mount->mnt_flag & MNT_UNION) {
		if (uio_resid(auio) < (user_ssize_t) uap->buffersize) { // Got some entries
			eofflag = 0;
		} else { // Empty buffer
			vnode_t uvp;
			if (lookup_traverse_union(vp, &uvp, ctx) == 0) {
				if (vnode_ref_ext(uvp, fp->fp_glob->fg_flag & O_EVTONLY, 0) == 0) {
					fp_set_data(fp, uvp);
					fp->fp_glob->fg_offset = 0; // reset index for new dir
					count = savecount;
					vnode_rele_internal(vp, fp->fp_glob->fg_flag & O_EVTONLY, 0, 0);
					vnode_put(vp);
					vp = uvp;
					goto unionread;
				} else {
					/* could not get a ref, can't replace in fd */
					vnode_put(uvp);
				}
			}
		}
	}
#endif /* CONFIG_UNION_MOUNTS */

	(void)vnode_put(vp);

	if (error) {
		goto out;
	}
	fp->fp_glob->fg_offset = uio_offset(auio); /* should be multiple of dirent, not variable */

	/* Report back the entry count, directory state, and base offset. */
	if ((error = copyout((caddr_t) &count, uap->count, sizeof(count)))) {
		goto out;
	}
	if ((error = copyout((caddr_t) &newstate, uap->newstate, sizeof(newstate)))) {
		goto out;
	}
	if ((error = copyout((caddr_t) &loff, uap->basep, sizeof(loff)))) {
		goto out;
	}

	*retval = eofflag; /* similar to getdirentries */
	error = 0;
out:
	vn_offset_unlock(fp->fp_glob);
	file_drop(fd);
	return error; /* errors return earlier; retval carries 0 or 1 (EOF) here */
} /* end of getdirentriesattr system call */
10768
10769 /*
10770 * Exchange data between two files
10771 */
10772
10773 /* ARGSUSED */
/*
 * exchangedata(2): atomically swap the data of two regular files on the same
 * volume via VNOP_EXCHANGE, then swap their cached names/parents in the name
 * cache and emit third-party (kauth) and fsevents notifications.
 */
int
exchangedata(__unused proc_t p, struct exchangedata_args *uap, __unused int32_t *retval)
{
	struct nameidata fnd, snd;
	vfs_context_t ctx = vfs_context_current();
	vnode_t fvp;
	vnode_t svp;
	int error;
	u_int32_t nameiflags;
	char *fpath = NULL;
	char *spath = NULL;
	int flen = 0, slen = 0;
	int from_truncated = 0, to_truncated = 0;
#if CONFIG_FSE
	fse_info f_finfo, s_finfo;
#endif

	/* FSOPT_NOFOLLOW suppresses symlink traversal on both paths. */
	nameiflags = 0;
	if ((uap->options & FSOPT_NOFOLLOW) == 0) {
		nameiflags |= FOLLOW;
	}

	NDINIT(&fnd, LOOKUP, OP_EXCHANGEDATA, nameiflags | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path1, ctx);

	error = namei(&fnd);
	if (error) {
		goto out2;
	}

	nameidone(&fnd);
	fvp = fnd.ni_vp;

	NDINIT(&snd, LOOKUP, OP_EXCHANGEDATA, CN_NBMOUNTLOOK | nameiflags | AUDITVNPATH2,
	    UIO_USERSPACE, uap->path2, ctx);

	error = namei(&snd);
	if (error) {
		/* first lookup succeeded; drop its reference before bailing */
		vnode_put(fvp);
		goto out2;
	}
	nameidone(&snd);
	svp = snd.ni_vp;

	/*
	 * if the files are the same, return an inval error
	 */
	if (svp == fvp) {
		error = EINVAL;
		goto out;
	}

	/*
	 * if the files are on different volumes, return an error
	 */
	if (svp->v_mount != fvp->v_mount) {
		error = EXDEV;
		goto out;
	}

	/* If they're not files, return an error */
	if ((vnode_isreg(fvp) == 0) || (vnode_isreg(svp) == 0)) {
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_exchangedata(ctx,
	    fvp, svp);
	if (error) {
		goto out;
	}
#endif
	/* Caller needs read/write access to both files. */
	if (((error = vnode_authorize(fvp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0) ||
	    ((error = vnode_authorize(svp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0)) {
		goto out;
	}

	/* Gather paths/finfo up front, only if someone will consume them. */
	if (
#if CONFIG_FSE
		need_fsevent(FSE_EXCHANGE, fvp) ||
#endif
		kauth_authorize_fileop_has_listeners()) {
		GET_PATH(fpath);
		GET_PATH(spath);

		flen = safe_getpath(fvp, NULL, fpath, MAXPATHLEN, &from_truncated);
		slen = safe_getpath(svp, NULL, spath, MAXPATHLEN, &to_truncated);

#if CONFIG_FSE
		get_fse_info(fvp, &f_finfo, ctx);
		get_fse_info(svp, &s_finfo, ctx);
		if (from_truncated || to_truncated) {
			// set it here since only the f_finfo gets reported up to user space
			f_finfo.mode |= FSE_TRUNCATED_PATH;
		}
#endif
	}
	/* Ok, make the call */
	error = VNOP_EXCHANGE(fvp, svp, 0, ctx);

	if (error == 0) {
		const char *tmpname;

		if (fpath != NULL && spath != NULL) {
			/* call out to allow 3rd party notification of exchangedata.
			 * Ignore result of kauth_authorize_fileop call.
			 */
			kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_EXCHANGE,
			    (uintptr_t)fpath, (uintptr_t)spath);
		}
		/* Swap cached names (and parents, if different) under the name cache lock. */
		name_cache_lock();

		tmpname = fvp->v_name;
		fvp->v_name = svp->v_name;
		svp->v_name = tmpname;

		if (fvp->v_parent != svp->v_parent) {
			vnode_t tmp;

			tmp = fvp->v_parent;
			fvp->v_parent = svp->v_parent;
			svp->v_parent = tmp;
		}
		name_cache_unlock();

#if CONFIG_FSE
		if (fpath != NULL && spath != NULL) {
			add_fsevent(FSE_EXCHANGE, ctx,
			    FSE_ARG_STRING, flen, fpath,
			    FSE_ARG_FINFO, &f_finfo,
			    FSE_ARG_STRING, slen, spath,
			    FSE_ARG_FINFO, &s_finfo,
			    FSE_ARG_DONE);
		}
#endif
	}

out:
	if (fpath != NULL) {
		RELEASE_PATH(fpath);
	}
	if (spath != NULL) {
		RELEASE_PATH(spath);
	}
	vnode_put(svp);
	vnode_put(fvp);
out2:
	return error;
}
10924
10925 /*
10926 * Return (in MB) the amount of freespace on the given vnode's volume.
10927 */
10928 uint32_t freespace_mb(vnode_t vp);
10929
10930 uint32_t
freespace_mb(vnode_t vp)10931 freespace_mb(vnode_t vp)
10932 {
10933 vfs_update_vfsstat(vp->v_mount, vfs_context_current(), VFS_USER_EVENT);
10934 return (uint32_t)(((uint64_t)vp->v_mount->mnt_vfsstat.f_bavail *
10935 vp->v_mount->mnt_vfsstat.f_bsize) >> 20);
10936 }
10937
10938 #if CONFIG_SEARCHFS
10939
10940 /* ARGSUSED */
10941
10942 int
searchfs(proc_t p,struct searchfs_args * uap,__unused int32_t * retval)10943 searchfs(proc_t p, struct searchfs_args *uap, __unused int32_t *retval)
10944 {
10945 vnode_t vp, tvp;
10946 int i, error = 0;
10947 int fserror = 0;
10948 struct nameidata nd;
10949 struct user64_fssearchblock searchblock;
10950 struct searchstate *state;
10951 struct attrlist *returnattrs;
10952 struct timeval timelimit;
10953 void *searchparams1, *searchparams2;
10954 uio_t auio = NULL;
10955 int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
10956 uint32_t nummatches;
10957 size_t mallocsize;
10958 uint32_t nameiflags;
10959 vfs_context_t ctx = vfs_context_current();
10960 uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];
10961
10962 /* Start by copying in fsearchblock parameter list */
10963 if (IS_64BIT_PROCESS(p)) {
10964 error = copyin(uap->searchblock, (caddr_t) &searchblock, sizeof(searchblock));
10965 timelimit.tv_sec = searchblock.timelimit.tv_sec;
10966 timelimit.tv_usec = searchblock.timelimit.tv_usec;
10967 } else {
10968 struct user32_fssearchblock tmp_searchblock;
10969
10970 error = copyin(uap->searchblock, (caddr_t) &tmp_searchblock, sizeof(tmp_searchblock));
10971 // munge into 64-bit version
10972 searchblock.returnattrs = CAST_USER_ADDR_T(tmp_searchblock.returnattrs);
10973 searchblock.returnbuffer = CAST_USER_ADDR_T(tmp_searchblock.returnbuffer);
10974 searchblock.returnbuffersize = tmp_searchblock.returnbuffersize;
10975 searchblock.maxmatches = tmp_searchblock.maxmatches;
10976 /*
10977 * These casts are safe. We will promote the tv_sec into a 64 bit long if necessary
10978 * from a 32 bit long, and tv_usec is already a signed 32 bit int.
10979 */
10980 timelimit.tv_sec = (__darwin_time_t) tmp_searchblock.timelimit.tv_sec;
10981 timelimit.tv_usec = (__darwin_useconds_t) tmp_searchblock.timelimit.tv_usec;
10982 searchblock.searchparams1 = CAST_USER_ADDR_T(tmp_searchblock.searchparams1);
10983 searchblock.sizeofsearchparams1 = tmp_searchblock.sizeofsearchparams1;
10984 searchblock.searchparams2 = CAST_USER_ADDR_T(tmp_searchblock.searchparams2);
10985 searchblock.sizeofsearchparams2 = tmp_searchblock.sizeofsearchparams2;
10986 searchblock.searchattrs = tmp_searchblock.searchattrs;
10987 }
10988 if (error) {
10989 return error;
10990 }
10991
10992 /* Do a sanity check on sizeofsearchparams1 and sizeofsearchparams2.
10993 */
10994 if (searchblock.sizeofsearchparams1 > SEARCHFS_MAX_SEARCHPARMS ||
10995 searchblock.sizeofsearchparams2 > SEARCHFS_MAX_SEARCHPARMS) {
10996 return EINVAL;
10997 }
10998
10999 /* Now malloc a big bunch of space to hold the search parameters, the attrlists and the search state. */
11000 /* It all has to do into local memory and it's not that big so we might as well put it all together. */
11001 /* Searchparams1 shall be first so we might as well use that to hold the base address of the allocated*/
11002 /* block. */
11003 /* */
11004 /* NOTE: we allocate an extra 8 bytes to account for the difference in size of the searchstate */
11005 /* due to the changes in rdar://problem/12438273. That way if a 3rd party file system */
11006 /* assumes the size is still 556 bytes it will continue to work */
11007
11008 mallocsize = searchblock.sizeofsearchparams1 + searchblock.sizeofsearchparams2 +
11009 sizeof(struct attrlist) + sizeof(struct searchstate) + (2 * sizeof(uint32_t));
11010
11011 searchparams1 = kalloc_data(mallocsize, Z_WAITOK);
11012
11013 /* Now set up the various pointers to the correct place in our newly allocated memory */
11014
11015 searchparams2 = (void *) (((caddr_t) searchparams1) + searchblock.sizeofsearchparams1);
11016 returnattrs = (struct attrlist *) (((caddr_t) searchparams2) + searchblock.sizeofsearchparams2);
11017 state = (struct searchstate *) (((caddr_t) returnattrs) + sizeof(struct attrlist));
11018
11019 /* Now copy in the stuff given our local variables. */
11020
11021 if ((error = copyin(searchblock.searchparams1, searchparams1, searchblock.sizeofsearchparams1))) {
11022 goto freeandexit;
11023 }
11024
11025 if ((error = copyin(searchblock.searchparams2, searchparams2, searchblock.sizeofsearchparams2))) {
11026 goto freeandexit;
11027 }
11028
11029 if ((error = copyin(searchblock.returnattrs, (caddr_t) returnattrs, sizeof(struct attrlist)))) {
11030 goto freeandexit;
11031 }
11032
11033 if ((error = copyin(uap->state, (caddr_t) state, sizeof(struct searchstate)))) {
11034 goto freeandexit;
11035 }
11036
11037 /*
11038 * When searching a union mount, need to set the
11039 * start flag at the first call on each layer to
11040 * reset state for the new volume.
11041 */
11042 if (uap->options & SRCHFS_START) {
11043 state->ss_union_layer = 0;
11044 } else {
11045 uap->options |= state->ss_union_flags;
11046 }
11047 state->ss_union_flags = 0;
11048
11049 /*
11050 * Because searchparams1 and searchparams2 may contain an ATTR_CMN_NAME search parameter,
11051 * which is passed in with an attrreference_t, we need to inspect the buffer manually here.
11052 * The KPI does not provide us the ability to pass in the length of the buffers searchparams1
11053 * and searchparams2. To obviate the need for all searchfs-supporting filesystems to
11054 * validate the user-supplied data offset of the attrreference_t, we'll do it here.
11055 */
11056
11057 if (searchblock.searchattrs.commonattr & ATTR_CMN_NAME) {
11058 attrreference_t* string_ref;
11059 u_int32_t* start_length;
11060 user64_size_t param_length;
11061
11062 /* validate searchparams1 */
11063 param_length = searchblock.sizeofsearchparams1;
11064 /* skip the word that specifies length of the buffer */
11065 start_length = (u_int32_t*) searchparams1;
11066 start_length = start_length + 1;
11067 string_ref = (attrreference_t*) start_length;
11068
11069 /* ensure no negative offsets or too big offsets */
11070 if (string_ref->attr_dataoffset < 0) {
11071 error = EINVAL;
11072 goto freeandexit;
11073 }
11074 if (string_ref->attr_length > MAXPATHLEN) {
11075 error = EINVAL;
11076 goto freeandexit;
11077 }
11078
11079 /* Check for pointer overflow in the string ref */
11080 if (((char*) string_ref + string_ref->attr_dataoffset) < (char*) string_ref) {
11081 error = EINVAL;
11082 goto freeandexit;
11083 }
11084
11085 if (((char*) string_ref + string_ref->attr_dataoffset) > ((char*)searchparams1 + param_length)) {
11086 error = EINVAL;
11087 goto freeandexit;
11088 }
11089 if (((char*)string_ref + string_ref->attr_dataoffset + string_ref->attr_length) > ((char*)searchparams1 + param_length)) {
11090 error = EINVAL;
11091 goto freeandexit;
11092 }
11093 }
11094
11095 /* set up the uio structure which will contain the users return buffer */
11096 auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
11097 uio_addiov(auio, searchblock.returnbuffer, searchblock.returnbuffersize);
11098
11099 nameiflags = 0;
11100 if ((uap->options & FSOPT_NOFOLLOW) == 0) {
11101 nameiflags |= FOLLOW;
11102 }
11103 NDINIT(&nd, LOOKUP, OP_SEARCHFS, nameiflags | AUDITVNPATH1,
11104 UIO_USERSPACE, uap->path, ctx);
11105
11106 error = namei(&nd);
11107 if (error) {
11108 goto freeandexit;
11109 }
11110 vp = nd.ni_vp;
11111 nameidone(&nd);
11112
11113 /*
11114 * Switch to the root vnode for the volume
11115 */
11116 error = VFS_ROOT(vnode_mount(vp), &tvp, ctx);
11117 vnode_put(vp);
11118 if (error) {
11119 goto freeandexit;
11120 }
11121 vp = tvp;
11122
11123 #if CONFIG_UNION_MOUNTS
11124 /*
11125 * If it's a union mount, the path lookup takes
11126 * us to the top layer. But we may need to descend
11127 * to a lower layer. For non-union mounts the layer
11128 * is always zero.
11129 */
11130 for (i = 0; i < (int) state->ss_union_layer; i++) {
11131 if ((vp->v_mount->mnt_flag & MNT_UNION) == 0) {
11132 break;
11133 }
11134 tvp = vp;
11135 vp = vp->v_mount->mnt_vnodecovered;
11136 if (vp == NULL) {
11137 vnode_put(tvp);
11138 error = ENOENT;
11139 goto freeandexit;
11140 }
11141 error = vnode_getwithref(vp);
11142 vnode_put(tvp);
11143 if (error) {
11144 goto freeandexit;
11145 }
11146 }
11147 #endif /* CONFIG_UNION_MOUNTS */
11148
11149 #if CONFIG_MACF
11150 error = mac_vnode_check_searchfs(ctx, vp, returnattrs, &searchblock.searchattrs);
11151 if (error) {
11152 vnode_put(vp);
11153 goto freeandexit;
11154 }
11155 #endif
11156
11157
11158 /*
11159 * If searchblock.maxmatches == 0, then skip the search. This has happened
 * before and sometimes the underlying code doesn't deal with it well.
11161 */
11162 if (searchblock.maxmatches == 0) {
11163 nummatches = 0;
11164 goto saveandexit;
11165 }
11166
11167 /*
 * All right, we have everything we need, so let's make that call.
11169 *
11170 * We keep special track of the return value from the file system:
11171 * EAGAIN is an acceptable error condition that shouldn't keep us
11172 * from copying out any results...
11173 */
11174
11175 fserror = VNOP_SEARCHFS(vp,
11176 searchparams1,
11177 searchparams2,
11178 &searchblock.searchattrs,
11179 (uint32_t)searchblock.maxmatches,
11180 &timelimit,
11181 returnattrs,
11182 &nummatches,
11183 (uint32_t)uap->scriptcode,
11184 (uint32_t)uap->options,
11185 auio,
11186 (struct searchstate *) &state->ss_fsstate,
11187 ctx);
11188
11189 #if CONFIG_UNION_MOUNTS
11190 /*
11191 * If it's a union mount we need to be called again
11192 * to search the mounted-on filesystem.
11193 */
11194 if ((vp->v_mount->mnt_flag & MNT_UNION) && fserror == 0) {
11195 state->ss_union_flags = SRCHFS_START;
11196 state->ss_union_layer++; // search next layer down
11197 fserror = EAGAIN;
11198 }
11199 #endif /* CONFIG_UNION_MOUNTS */
11200
11201 saveandexit:
11202
11203 vnode_put(vp);
11204
11205 /* Now copy out the stuff that needs copying out. That means the number of matches, the
 * search state. Everything was already put into the return buffer by the vop call. */
11207
11208 if ((error = copyout((caddr_t) state, uap->state, sizeof(struct searchstate))) != 0) {
11209 goto freeandexit;
11210 }
11211
11212 if ((error = suulong(uap->nummatches, (uint64_t)nummatches)) != 0) {
11213 goto freeandexit;
11214 }
11215
11216 error = fserror;
11217
11218 freeandexit:
11219
11220 kfree_data(searchparams1, mallocsize);
11221
11222 return error;
11223 } /* end of searchfs system call */
11224
11225 #else /* CONFIG_SEARCHFS */
11226
int
searchfs(__unused proc_t p, __unused struct searchfs_args *uap, __unused int32_t *retval)
{
	/* searchfs is not compiled into this configuration. */
	return ENOTSUP;
}
11232
11233 #endif /* CONFIG_SEARCHFS */
11234
11235
11236 #if CONFIG_DATALESS_FILES
11237
11238 /*
11239 * === Namespace Resolver Up-call Mechanism ===
11240 *
11241 * When I/O is performed to a dataless file or directory (read, write,
11242 * lookup-in, etc.), the file system performs an upcall to the namespace
11243 * resolver (filecoordinationd) to materialize the object.
11244 *
11245 * We need multiple up-calls to be in flight at once, and we need these
11246 * up-calls to be interruptible, thus the following implementation:
11247 *
11248 * => The nspace_resolver_request represents the in-kernel request state.
11249 * It contains a request ID, storage space for the errno code returned
11250 * by filecoordinationd, and flags.
11251 *
11252 * => The request ID is simply a global monotonically incrementing 32-bit
11253 * number. Outstanding requests are stored in a hash table, and the
11254 * hash function is extremely simple.
11255 *
11256 * => When an upcall is to be made to filecoordinationd, a request structure
11257 * is allocated on the stack (it is small, and needs to live only during
11258 * the duration of the call to resolve_nspace_item_ext()). It is
11259 * initialized and inserted into the table. Some backpressure from
 * filecoordinationd is applied by limiting the number of entries that
11261 * can be inserted into the table (and thus limiting the number of
11262 * outstanding requests issued to filecoordinationd); waiting for an
11263 * available slot is interruptible.
11264 *
11265 * => Once the request has been inserted into the table, the up-call is made
11266 * to filecoordinationd via a MiG-generated stub. The up-call returns
11267 * immediately and filecoordinationd processes the request asynchronously.
11268 *
 * => The caller now waits for the request to complete.  This is achieved by
11270 * sleeping on the address of the request structure and waiting for
11271 * filecoordinationd to mark the request structure as complete. This
11272 * is an interruptible sleep call; if interrupted, the request structure
11273 * is removed from the table and EINTR is returned to the caller. If
11274 * this occurs, an advisory up-call is made to filecoordinationd with
11275 * the request ID to indicate that the request can be aborted or
11276 * de-prioritized at the discretion of filecoordinationd.
11277 *
11278 * => When filecoordinationd has completed the request, it signals completion
11279 * by writing to the vfs.nspace.complete sysctl node. Only a process
11280 * decorated as a namespace resolver can write to this sysctl node. The
11281 * value is a request ID / errno tuple passed as an array of 2 uint32_t's.
11282 * The request ID is looked up in the table, and if the request is found,
11283 * the error code is stored in the request structure and a wakeup()
11284 * issued on the address of the request structure. If the request is not
11285 * found, we simply drop the completion notification, assuming that the
11286 * caller was interrupted.
11287 *
11288 * => When the waiting thread wakes up, it extracts the error code from the
11289 * request structure, removes the request from the table, and returns the
11290 * error code to the calling function. Fini!
11291 */
11292
/*
 * In-kernel state for one outstanding up-call to the namespace resolver.
 * Per the block comment above, this lives on the requesting thread's stack
 * for the duration of the call and is linked into the request hash table.
 */
struct nspace_resolver_request {
	LIST_ENTRY(nspace_resolver_request) r_hashlink; /* hash-bucket linkage */
	vnode_t r_vp;           /* vnode being materialized */
	uint32_t r_req_id;      /* ID handed to filecoordinationd */
	int r_resolver_error;   /* errno reported back by the resolver */
	int r_flags;            /* RRF_* flags below */
};

#define RRF_COMPLETE 0x0001     /* resolver has completed this request */
11302
/*
 * Return the next resolver request ID.  OSAddAtomic returns the value
 * held *before* the increment, so IDs start at 0 and wrap at UINT32_MAX;
 * only uniqueness among concurrently outstanding requests matters.
 */
static uint32_t
next_nspace_req_id(void)
{
	static uint32_t next_req_id;

	return OSAddAtomic(1, &next_req_id);
}
11310
#define NSPACE_RESOLVER_REQ_HASHSIZE 32 /* XXX tune */
#define NSPACE_RESOLVER_MAX_OUTSTANDING 256 /* XXX tune */

/* Hash table of outstanding resolver requests, keyed by request ID. */
static LIST_HEAD(nspace_resolver_requesthead,
    nspace_resolver_request) * nspace_resolver_request_hashtbl;
static u_long nspace_resolver_request_hashmask;
/* Number of requests currently in the table (capped at MAX_OUTSTANDING). */
static u_int nspace_resolver_request_count;
/* True when some thread is sleeping for a free slot in the table. */
static bool nspace_resolver_request_wait_slot;
static LCK_GRP_DECLARE(nspace_resolver_request_lck_grp, "file namespace resolver");
static LCK_MTX_DECLARE(nspace_resolver_request_hash_mutex,
    &nspace_resolver_request_lck_grp);

/* Mutex protecting the request table, count, and wait_slot flag. */
#define NSPACE_REQ_LOCK() \
	lck_mtx_lock(&nspace_resolver_request_hash_mutex)
#define NSPACE_REQ_UNLOCK() \
	lck_mtx_unlock(&nspace_resolver_request_hash_mutex)

/* Extremely simple hash: low bits of the monotonically increasing ID. */
#define NSPACE_RESOLVER_HASH(req_id) \
	(&nspace_resolver_request_hashtbl[(req_id) & \
	nspace_resolver_request_hashmask])
11331
11332 static struct nspace_resolver_request *
nspace_resolver_req_lookup(uint32_t req_id)11333 nspace_resolver_req_lookup(uint32_t req_id)
11334 {
11335 struct nspace_resolver_requesthead *bucket;
11336 struct nspace_resolver_request *req;
11337
11338 bucket = NSPACE_RESOLVER_HASH(req_id);
11339 LIST_FOREACH(req, bucket, r_hashlink) {
11340 if (req->r_req_id == req_id) {
11341 return req;
11342 }
11343 }
11344
11345 return NULL;
11346 }
11347
/*
 * Insert a request into the outstanding-request table, sleeping
 * (interruptibly) for a free slot if the table is full.  The caller must
 * hold NSPACE_REQ_LOCK; msleep() is handed the hash mutex so it is dropped
 * while sleeping and re-acquired before return.  Returns 0 on success or
 * the error from an interrupted sleep.
 */
static int
nspace_resolver_req_add(struct nspace_resolver_request *req)
{
	struct nspace_resolver_requesthead *bucket;
	int error;

	/*
	 * Backpressure on filecoordinationd: bound the number of
	 * outstanding requests.  Re-check the count after every wakeup
	 * since the mutex was released during the sleep.
	 */
	while (nspace_resolver_request_count >=
	    NSPACE_RESOLVER_MAX_OUTSTANDING) {
		nspace_resolver_request_wait_slot = true;
		error = msleep(&nspace_resolver_request_count,
		    &nspace_resolver_request_hash_mutex,
		    PVFS | PCATCH, "nspacerq", NULL);
		if (error) {
			/* Interrupted (PCATCH); give up without inserting. */
			return error;
		}
	}

	bucket = NSPACE_RESOLVER_HASH(req->r_req_id);
#if DIAGNOSTIC
	assert(nspace_resolver_req_lookup(req->r_req_id) == NULL);
#endif /* DIAGNOSTIC */
	LIST_INSERT_HEAD(bucket, req, r_hashlink);
	nspace_resolver_request_count++;

	return 0;
}
11374
11375 static void
nspace_resolver_req_remove(struct nspace_resolver_request * req)11376 nspace_resolver_req_remove(struct nspace_resolver_request *req)
11377 {
11378 struct nspace_resolver_requesthead *bucket;
11379
11380 bucket = NSPACE_RESOLVER_HASH(req->r_req_id);
11381 #if DIAGNOSTIC
11382 assert(nspace_resolver_req_lookup(req->r_req_id) != NULL);
11383 #endif /* DIAGNOSTIC */
11384 LIST_REMOVE(req, r_hashlink);
11385 nspace_resolver_request_count--;
11386
11387 if (nspace_resolver_request_wait_slot) {
11388 nspace_resolver_request_wait_slot = false;
11389 wakeup(&nspace_resolver_request_count);
11390 }
11391 }
11392
11393 static void
nspace_resolver_req_cancel(uint32_t req_id)11394 nspace_resolver_req_cancel(uint32_t req_id)
11395 {
11396 kern_return_t kr;
11397 mach_port_t mp;
11398
11399 // Failures here aren't fatal -- the cancellation message
11400 // sent to the resolver is merely advisory.
11401
11402 kr = host_get_filecoordinationd_port(host_priv_self(), &mp);
11403 if (kr != KERN_SUCCESS || !IPC_PORT_VALID(mp)) {
11404 return;
11405 }
11406
11407 kr = send_nspace_resolve_cancel(mp, req_id);
11408 if (kr != KERN_SUCCESS) {
11409 os_log_error(OS_LOG_DEFAULT,
11410 "NSPACE send_nspace_resolve_cancel failure: %d", kr);
11411 }
11412
11413 ipc_port_release_send(mp);
11414 }
11415
/*
 * Wait (interruptibly) for the resolver to complete the given request.
 * If the sleep is interrupted, a local error (EINTR, or ETIMEDOUT for any
 * other terminating error) is recorded in the request and an advisory
 * cancel is sent to the resolver.  The request is always removed from the
 * table before return.  Returns the request's resolver error (0 = success).
 */
static int
nspace_resolver_req_wait(struct nspace_resolver_request *req)
{
	bool send_cancel_message = false;
	int error;

	NSPACE_REQ_LOCK();

	/* Sleep until the resolver marks the request RRF_COMPLETE. */
	while ((req->r_flags & RRF_COMPLETE) == 0) {
		error = msleep(req, &nspace_resolver_request_hash_mutex,
		    PVFS | PCATCH, "nspace", NULL);
		/* ERESTART just means "keep waiting"; anything else ends it. */
		if (error && error != ERESTART) {
			req->r_resolver_error = (error == EINTR) ? EINTR :
			    ETIMEDOUT;
			send_cancel_message = true;
			break;
		}
	}

	nspace_resolver_req_remove(req);

	NSPACE_REQ_UNLOCK();

	/* Advisory only; done outside the lock. */
	if (send_cancel_message) {
		nspace_resolver_req_cancel(req->r_req_id);
	}

	return req->r_resolver_error;
}
11445
/*
 * Record the resolver's result in the request, mark it complete, and wake
 * the thread sleeping on it in nspace_resolver_req_wait().  Callers in this
 * file invoke this with NSPACE_REQ_LOCK held.
 */
static void
nspace_resolver_req_mark_complete(
	struct nspace_resolver_request *req,
	int resolver_error)
{
	req->r_resolver_error = resolver_error;
	req->r_flags |= RRF_COMPLETE;
	wakeup(req);
}
11455
/*
 * Completion path driven by the vfs.nspace.complete sysctl.  Look up the
 * request by ID; if found, optionally verify (when a non-zero
 * orig_gencount was supplied and the resolver reported success) that the
 * vnode's recursive gencount is unchanged, then mark the request complete
 * so the waiting thread can return.
 */
static void
nspace_resolver_req_completed(uint32_t req_id, int resolver_error, uint64_t orig_gencount)
{
	struct nspace_resolver_request *req;

	NSPACE_REQ_LOCK();

	// If we don't find the request corresponding to our req_id,
	// just drop the completion signal on the floor; it's likely
	// that the requester interrupted with a signal.

	req = nspace_resolver_req_lookup(req_id);
	if (req) {
		mount_t locked_mp = NULL;

		/* Hold off renames on the volume while we examine the vnode. */
		locked_mp = req->r_vp->v_mount;
		mount_ref(locked_mp, 0);
		mount_lock_renames(locked_mp);

		//
		// if the resolver isn't already returning an error and we have an
		// orig_gencount, then get an iocount on the request vnode and check
		// that the gencount on req->r_vp has not changed.
		//
		// note: a ref was taken on req->r_vp when the request was created
		// and that ref will be dropped by that thread when it wakes up.
		//
		if (resolver_error == 0 &&
		    orig_gencount != 0 &&
		    vnode_getwithref(req->r_vp) == 0) {
			struct vnode_attr va;
			uint64_t cur_gencount;

			VATTR_INIT(&va);
			VATTR_WANTED(&va, va_recursive_gencount);

			if (vnode_getattr(req->r_vp, &va, vfs_context_kernel()) == 0) {
				cur_gencount = va.va_recursive_gencount;
			} else {
				/* Attribute unavailable; 0 disables the check below. */
				cur_gencount = 0;
			}

			if (resolver_error == 0 && cur_gencount && orig_gencount && cur_gencount != orig_gencount) {
				printf("nspace.complete: gencount changed! (orig %llu cur %llu)\n", orig_gencount, cur_gencount);

				// this error will be returned to the thread that initiated the
				// materialization of req->r_vp.
				resolver_error = EBUSY;

				// note: we explicitly do not return an error to the caller (i.e.
				// the thread that did the materialization) because they said they
				// don't want one.
			}

			vnode_put(req->r_vp);
		}

		mount_unlock_renames(locked_mp);
		mount_drop(locked_mp, 0);

		nspace_resolver_req_mark_complete(req, resolver_error);
	}

	NSPACE_REQ_UNLOCK();

	return;
}
11523
/* The process currently registered as the namespace resolver, if any. */
static struct proc *nspace_resolver_proc;
11525
11526 static int
nspace_resolver_get_proc_state(struct proc * p,int * is_resolver)11527 nspace_resolver_get_proc_state(struct proc *p, int *is_resolver)
11528 {
11529 *is_resolver = ((p->p_lflag & P_LNSPACE_RESOLVER) &&
11530 p == nspace_resolver_proc) ? 1 : 0;
11531 return 0;
11532 }
11533
/* Forward declaration; defined later in this file. */
static boolean_t vfs_context_is_dataless_resolver(vfs_context_t);
11535
/*
 * Register (is_resolver != 0) or unregister process p as the dataless
 * namespace resolver.  Requires uid 0 plus the dataless-resolver
 * entitlement; at most one resolver may be registered at a time (EBUSY
 * otherwise).  Unregistering reuses the process-exit path.
 */
static int
nspace_resolver_set_proc_state(struct proc *p, int is_resolver)
{
	vfs_context_t ctx = vfs_context_current();
	int error = 0;

	//
	// The system filecoordinationd runs as uid == 0. This also
	// has the nice side-effect of filtering out filecoordinationd
	// running in the simulator.
	//
	if (!vfs_context_issuser(ctx) ||
	    !vfs_context_is_dataless_resolver(ctx)) {
		return EPERM;
	}

	if (is_resolver) {
		NSPACE_REQ_LOCK();

		if (nspace_resolver_proc == NULL) {
			/* Decorate the proc and record it as the resolver. */
			proc_lock(p);
			p->p_lflag |= P_LNSPACE_RESOLVER;
			proc_unlock(p);
			nspace_resolver_proc = p;
		} else {
			error = EBUSY;
		}

		NSPACE_REQ_UNLOCK();
	} else {
		// This is basically just like the exit case.
		// nspace_resolver_exited() will verify that the
		// process is the resolver, and will clear the
		// global.
		nspace_resolver_exited(p);
	}

	return error;
}
11575
11576 static int
nspace_materialization_get_proc_state(struct proc * p,int * is_prevented)11577 nspace_materialization_get_proc_state(struct proc *p, int *is_prevented)
11578 {
11579 if ((p->p_lflag & P_LNSPACE_RESOLVER) != 0 ||
11580 (p->p_vfs_iopolicy &
11581 P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES) == 0) {
11582 *is_prevented = 1;
11583 } else {
11584 *is_prevented = 0;
11585 }
11586 return 0;
11587 }
11588
/*
 * Set whether process p may materialize dataless files by atomically
 * toggling P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES in its iopolicy.
 * The resolver process must stay prevented: asking to enable
 * materialization for it fails with EBUSY.
 */
static int
nspace_materialization_set_proc_state(struct proc *p, int is_prevented)
{
	if (p->p_lflag & P_LNSPACE_RESOLVER) {
		/* Resolver: "prevented" is the only acceptable state. */
		return is_prevented ? 0 : EBUSY;
	}

	if (is_prevented) {
		OSBitAndAtomic16(~((uint16_t)P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES), &p->p_vfs_iopolicy);
	} else {
		OSBitOrAtomic16((uint16_t)P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES, &p->p_vfs_iopolicy);
	}
	return 0;
}
11603
11604 static int
nspace_materialization_get_thread_state(int * is_prevented)11605 nspace_materialization_get_thread_state(int *is_prevented)
11606 {
11607 uthread_t ut = current_uthread();
11608
11609 *is_prevented = (ut->uu_flag & UT_NSPACE_NODATALESSFAULTS) ? 1 : 0;
11610 return 0;
11611 }
11612
11613 static int
nspace_materialization_set_thread_state(int is_prevented)11614 nspace_materialization_set_thread_state(int is_prevented)
11615 {
11616 uthread_t ut = current_uthread();
11617
11618 if (is_prevented) {
11619 ut->uu_flag |= UT_NSPACE_NODATALESSFAULTS;
11620 } else {
11621 ut->uu_flag &= ~UT_NSPACE_NODATALESSFAULTS;
11622 }
11623 return 0;
11624 }
11625
/* the vfs.nspace sysctl branch; parent of the resolver control nodes below */
SYSCTL_NODE(_vfs, OID_AUTO, nspace, CTLFLAG_RW | CTLFLAG_LOCKED, NULL, "vfs nspace hinge");
11628
11629 static int
sysctl_nspace_resolver(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)11630 sysctl_nspace_resolver(__unused struct sysctl_oid *oidp,
11631 __unused void *arg1, __unused int arg2, struct sysctl_req *req)
11632 {
11633 struct proc *p = req->p;
11634 int new_value, old_value, changed = 0;
11635 int error;
11636
11637 error = nspace_resolver_get_proc_state(p, &old_value);
11638 if (error) {
11639 return error;
11640 }
11641
11642 error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
11643 &changed);
11644 if (error == 0 && changed) {
11645 error = nspace_resolver_set_proc_state(p, new_value);
11646 }
11647 return error;
11648 }
11649
/* vfs.nspace.resolver: decorate this process as the dataless file resolver */
SYSCTL_PROC(_vfs_nspace, OID_AUTO, resolver,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
    0, 0, sysctl_nspace_resolver, "I", "");
11654
11655 static int
sysctl_nspace_prevent_materialization(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)11656 sysctl_nspace_prevent_materialization(__unused struct sysctl_oid *oidp,
11657 __unused void *arg1, __unused int arg2, struct sysctl_req *req)
11658 {
11659 struct proc *p = req->p;
11660 int new_value, old_value, changed = 0;
11661 int error;
11662
11663 error = nspace_materialization_get_proc_state(p, &old_value);
11664 if (error) {
11665 return error;
11666 }
11667
11668 error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
11669 &changed);
11670 if (error == 0 && changed) {
11671 error = nspace_materialization_set_proc_state(p, new_value);
11672 }
11673 return error;
11674 }
11675
/* vfs.nspace.prevent_materialization: decorate this process as not wanting to materialize dataless files */
SYSCTL_PROC(_vfs_nspace, OID_AUTO, prevent_materialization,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
    0, 0, sysctl_nspace_prevent_materialization, "I", "");
11680
11681 static int
sysctl_nspace_thread_prevent_materialization(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)11682 sysctl_nspace_thread_prevent_materialization(__unused struct sysctl_oid *oidp,
11683 __unused void *arg1, __unused int arg2, struct sysctl_req *req)
11684 {
11685 int new_value, old_value, changed = 0;
11686 int error;
11687
11688 error = nspace_materialization_get_thread_state(&old_value);
11689 if (error) {
11690 return error;
11691 }
11692
11693 error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
11694 &changed);
11695 if (error == 0 && changed) {
11696 error = nspace_materialization_set_thread_state(new_value);
11697 }
11698 return error;
11699 }
11700
/* vfs.nspace.thread_prevent_materialization: decorate this thread as not wanting to materialize dataless files */
SYSCTL_PROC(_vfs_nspace, OID_AUTO, thread_prevent_materialization,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
    0, 0, sysctl_nspace_thread_prevent_materialization, "I", "");
11705
/*
 * Handler for vfs.nspace.complete.  The resolver reports a completed
 * request as a { req_id, errno } pair of uint32_t's, optionally followed
 * by a uint64_t gencount snapshot read in a second opaque pass.  Only the
 * registered resolver process may write here (EPERM otherwise).
 */
static int
sysctl_nspace_complete(__unused struct sysctl_oid *oidp, __unused void *arg1,
    __unused int arg2, struct sysctl_req *req)
{
	struct proc *p = req->p;
	uint32_t req_status[2] = { 0, 0 };
	uint64_t gencount = 0;
	int error, is_resolver, changed = 0, gencount_changed;

	error = nspace_resolver_get_proc_state(p, &is_resolver);
	if (error) {
		return error;
	}

	if (!is_resolver) {
		return EPERM;
	}

	error = sysctl_io_opaque(req, req_status, sizeof(req_status),
	    &changed);
	if (error) {
		return error;
	}

	// get the gencount if it was passed
	error = sysctl_io_opaque(req, &gencount, sizeof(gencount),
	    &gencount_changed);
	if (error) {
		gencount = 0;
		// we ignore the error because the gencount was optional
		error = 0;
	}

	/*
	 * req_status[0] is the req_id
	 *
	 * req_status[1] is the errno
	 */
	if (error == 0 && changed) {
		nspace_resolver_req_completed(req_status[0],
		    (int)req_status[1], gencount);
	}
	return error;
}
11750
/* vfs.nspace.complete: the resolver reports completed requests here. */
SYSCTL_PROC(_vfs_nspace, OID_AUTO, complete,
    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
    0, 0, sysctl_nspace_complete, "-", "");
11755
11756 #endif /* CONFIG_DATALESS_FILES */
11757
/*
 * __no_dataless_unused marks parameters that are referenced only when
 * CONFIG_DATALESS_FILES is enabled.
 */
#if CONFIG_DATALESS_FILES
#define __no_dataless_unused /* nothing */
#else
#define __no_dataless_unused __unused
#endif
11763
/*
 * Decide whether this vfs context may materialize a dataless object.
 * Returns:
 *   0           - materialization may proceed
 *   EDEADLK     - materialization is prevented
 *   EJUSTRETURN - dataless-manipulation entitlement: proceed as if the
 *                 object were not dataless
 */
int
vfs_context_dataless_materialization_is_prevented(
	vfs_context_t const ctx __no_dataless_unused)
{
#if CONFIG_DATALESS_FILES
	proc_t const p = vfs_context_proc(ctx);
	thread_t const t = vfs_context_thread(ctx);
	uthread_t const ut = t ? get_bsdthread_info(t) : NULL;

	/*
	 * Kernel context ==> return EDEADLK, as we would with any random
	 * process decorated as no-materialize.
	 */
	if (ctx == vfs_context_kernel()) {
		return EDEADLK;
	}

	/*
	 * If the process has the dataless-manipulation entitlement,
	 * materialization is prevented, and depending on the kind
	 * of file system operation, things get to proceed as if the
	 * object is not dataless.
	 */
	if (vfs_context_is_dataless_manipulator(ctx)) {
		return EJUSTRETURN;
	}

	/*
	 * Per-thread decorations override any process-wide decorations.
	 * (Foundation uses this, and this overrides even the dataless-
	 * manipulation entitlement so as to make API contracts consistent.)
	 */
	if (ut != NULL) {
		if (ut->uu_flag & UT_NSPACE_NODATALESSFAULTS) {
			return EDEADLK;
		}
		if (ut->uu_flag & UT_NSPACE_FORCEDATALESSFAULTS) {
			return 0;
		}
	}

	/*
	 * If the process's iopolicy specifies that dataless files
	 * can be materialized, then we let it go ahead.
	 */
	if (p->p_vfs_iopolicy & P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES) {
		return 0;
	}
#endif /* CONFIG_DATALESS_FILES */

	/*
	 * The default behavior is to not materialize dataless files;
	 * return to the caller that deadlock was detected.
	 */
	return EDEADLK;
}
11820
/*
 * One-time initialization of the resolver request hash table.
 */
void
nspace_resolver_init(void)
{
#if CONFIG_DATALESS_FILES
	nspace_resolver_request_hashtbl =
	    hashinit(NSPACE_RESOLVER_REQ_HASHSIZE,
	    M_VNODE /* XXX */, &nspace_resolver_request_hashmask);
#endif /* CONFIG_DATALESS_FILES */
}
11830
/*
 * Called when a process exits (and from the vfs.nspace.resolver sysctl to
 * voluntarily unregister).  If p was the registered resolver, fail every
 * outstanding request with ETIMEDOUT and clear the global resolver pointer.
 */
void
nspace_resolver_exited(struct proc *p __no_dataless_unused)
{
#if CONFIG_DATALESS_FILES
	struct nspace_resolver_requesthead *bucket;
	struct nspace_resolver_request *req;
	u_long idx;

	NSPACE_REQ_LOCK();

	if ((p->p_lflag & P_LNSPACE_RESOLVER) &&
	    p == nspace_resolver_proc) {
		/* Wake every waiter; no resolver is left to answer them. */
		for (idx = 0; idx <= nspace_resolver_request_hashmask; idx++) {
			bucket = &nspace_resolver_request_hashtbl[idx];
			LIST_FOREACH(req, bucket, r_hashlink) {
				nspace_resolver_req_mark_complete(req,
				    ETIMEDOUT);
			}
		}
		nspace_resolver_proc = NULL;
	}

	NSPACE_REQ_UNLOCK();
#endif /* CONFIG_DATALESS_FILES */
}
11856
/*
 * Convenience wrapper: materialize vp for operation op with no extended
 * arguments.
 */
int
resolve_nspace_item(struct vnode *vp, uint64_t op)
{
	return resolve_nspace_item_ext(vp, op, NULL);
}
11862
/* Entitlements granting dataless-resolver / dataless-manipulation rights. */
#define DATALESS_RESOLVER_ENTITLEMENT \
	"com.apple.private.vfs.dataless-resolver"
#define DATALESS_MANIPULATION_ENTITLEMENT \
	"com.apple.private.vfs.dataless-manipulation"
11867
#if CONFIG_DATALESS_FILES
/*
 * Return TRUE if the vfs context is associated with the dataless
 * resolver, i.e. its task holds the dataless-resolver entitlement.
 */
static boolean_t
vfs_context_is_dataless_resolver(vfs_context_t ctx __no_dataless_unused)
{
	return IOTaskHasEntitlement(vfs_context_task(ctx),
	           DATALESS_RESOLVER_ENTITLEMENT);
}
#endif /* CONFIG_DATALESS_FILES */
11880
11881 /*
11882 * Return TRUE if the vfs context is associated with a process entitled
11883 * for dataless manipulation.
11884 *
11885 * XXX Arguably belongs in vfs_subr.c, but is here because of the
11886 * complication around CONFIG_DATALESS_FILES.
11887 */
11888 boolean_t
vfs_context_is_dataless_manipulator(vfs_context_t ctx __no_dataless_unused)11889 vfs_context_is_dataless_manipulator(vfs_context_t ctx __no_dataless_unused)
11890 {
11891 #if CONFIG_DATALESS_FILES
11892 task_t task = vfs_context_task(ctx);
11893 return IOTaskHasEntitlement(task, DATALESS_MANIPULATION_ENTITLEMENT) ||
11894 IOTaskHasEntitlement(task, DATALESS_RESOLVER_ENTITLEMENT);
11895 #else
11896 return false;
11897 #endif /* CONFIG_DATALESS_FILES */
11898 }
11899
#if CONFIG_DATALESS_FILES
/*
 * Emit a debug log noting that the current process was prevented from
 * materializing vp for operation op.  DEVELOPMENT builds also include
 * the vnode's path when one can be obtained.
 */
static void
log_materialization_prevented(vnode_t vp, uint64_t op)
{
	char p_name[MAXCOMLEN + 1];
	const char *vnode_kind;

	proc_selfname(&p_name[0], sizeof(p_name));

	switch (vp->v_type) {
	case VREG:
		vnode_kind = "File";
		break;
	case VDIR:
		vnode_kind = "Dir";
		break;
	case VLNK:
		vnode_kind = "SymLink";
		break;
	default:
		vnode_kind = "Other";
		break;
	}

#if DEVELOPMENT
	char *path = get_pathbuff();
	int len = MAXPATHLEN;

	if (path) {
		vn_getpath(vp, path, &len);
	}

	os_log_debug(OS_LOG_DEFAULT,
	    "NSPACE process %s (pid %d) is decorated as no-materialization (op %lld; %s) path: %s",
	    p_name, proc_selfpid(),
	    op, vnode_kind, path ? path : "<unknown-path>");
	if (path) {
		release_pathbuff(path);
	}
#else
	os_log_debug(OS_LOG_DEFAULT,
	    "NSPACE process %s (pid %d) is decorated as no-materialization (op %lld; %s)",
	    p_name, proc_selfpid(),
	    op, vnode_kind);
#endif
}
#endif /* CONFIG_DATALESS_FILES */
11943
11944 static int
vfs_materialize_item(struct vnode * vp __no_dataless_unused,uint64_t op __no_dataless_unused,int64_t offset __no_dataless_unused,int64_t size __no_dataless_unused,char * lookup_name __no_dataless_unused,size_t const namelen __no_dataless_unused)11945 vfs_materialize_item(
11946 struct vnode *vp __no_dataless_unused,
11947 uint64_t op __no_dataless_unused,
11948 int64_t offset __no_dataless_unused,
11949 int64_t size __no_dataless_unused,
11950 char *lookup_name __no_dataless_unused,
11951 size_t const namelen __no_dataless_unused)
11952 {
11953 #if CONFIG_DATALESS_FILES
11954 struct nspace_resolver_request req;
11955 kern_return_t kern_ret;
11956 mach_port_t mach_port;
11957 char *path = NULL;
11958 vfs_context_t context;
11959 int path_len;
11960 int error;
11961 audit_token_t atoken;
11962
11963 /*
11964 * If this is a snapshot event and the vnode is on a disk image just
11965 * pretend nothing happened since any change to the disk image will
11966 * cause the disk image itself to get backed up and this avoids multi-
11967 * way deadlocks between the snapshot handler and the ever popular
11968 * diskimages-helper process. The variable nspace_allow_virtual_devs
11969 * allows this behavior to be overridden (for use by the Mobile
11970 * TimeMachine testing infrastructure which uses disk images).
11971 */
11972 if (op & NAMESPACE_HANDLER_SNAPSHOT_EVENT) {
11973 os_log_debug(OS_LOG_DEFAULT, "NSPACE SNAPSHOT not handled");
11974 return ENOTSUP;
11975 }
11976
11977 context = vfs_context_current();
11978
11979 error = vfs_context_dataless_materialization_is_prevented(context);
11980 if (error) {
11981 log_materialization_prevented(vp, op);
11982 return error;
11983 }
11984
11985 kern_ret = host_get_filecoordinationd_port(host_priv_self(),
11986 &mach_port);
11987 if (kern_ret != KERN_SUCCESS || !IPC_PORT_VALID(mach_port)) {
11988 os_log_error(OS_LOG_DEFAULT, "NSPACE no port");
11989 /*
11990 * Treat this like being unable to access the backing store
11991 * server.
11992 */
11993 return ETIMEDOUT;
11994 }
11995
11996 int path_alloc_len = MAXPATHLEN;
11997 do {
11998 path = kalloc_data(path_alloc_len, Z_WAITOK | Z_ZERO);
11999 if (path == NULL) {
12000 return ENOMEM;
12001 }
12002
12003 path_len = path_alloc_len;
12004 error = vn_getpath(vp, path, &path_len);
12005 if (error == 0) {
12006 break;
12007 } else if (error == ENOSPC) {
12008 kfree_data(path, path_alloc_len);
12009 path = NULL;
12010 } else {
12011 goto out_release_port;
12012 }
12013 } while (error == ENOSPC && (path_alloc_len += MAXPATHLEN) && path_alloc_len <= FSGETPATH_MAXBUFLEN);
12014
12015 error = vfs_context_copy_audit_token(context, &atoken);
12016 if (error) {
12017 goto out_release_port;
12018 }
12019
12020 req.r_req_id = next_nspace_req_id();
12021 req.r_resolver_error = 0;
12022 req.r_flags = 0;
12023 req.r_vp = vp;
12024
12025 NSPACE_REQ_LOCK();
12026 error = nspace_resolver_req_add(&req);
12027 NSPACE_REQ_UNLOCK();
12028 if (error) {
12029 goto out_release_port;
12030 }
12031
12032 os_log_debug(OS_LOG_DEFAULT, "NSPACE resolve_path call");
12033 if (vp->v_type == VDIR) {
12034 char *tmpname = NULL;
12035
12036 /*
12037 * If the caller provided a lookup_name *and* a name length,
12038 * then we assume the lookup_name is not NUL-terminated.
12039 * Allocate a temporary buffer in this case to provide
12040 * a NUL-terminated path name to the IPC call.
12041 */
12042 if (lookup_name != NULL && namelen != 0) {
12043 if (namelen >= PATH_MAX) {
12044 error = EINVAL;
12045 goto out_release_port;
12046 }
12047 tmpname = zalloc(ZV_NAMEI);
12048 strlcpy(tmpname, lookup_name, namelen + 1);
12049 lookup_name = tmpname;
12050 } else if (lookup_name != NULL) {
12051 /*
12052 * If the caller provided a lookup_name with a
12053 * zero name length, then we assume it's NUL-
12054 * terminated. Verify it has a valid length.
12055 */
12056 if (strlen(lookup_name) >= PATH_MAX) {
12057 error = EINVAL;
12058 goto out_release_port;
12059 }
12060 }
12061
12062 kern_ret = send_vfs_resolve_dir_with_audit_token(mach_port,
12063 req.r_req_id, (uint32_t)(op & 0xffffffff),
12064 lookup_name == NULL ? "" : lookup_name, path, atoken);
12065
12066 if (tmpname != NULL) {
12067 zfree(ZV_NAMEI, tmpname);
12068
12069 /*
12070 * Poison lookup_name rather than reference
12071 * freed memory.
12072 */
12073 lookup_name = NULL;
12074 }
12075 } else {
12076 kern_ret = send_vfs_resolve_file_with_audit_token(mach_port,
12077 req.r_req_id, (uint32_t)(op & 0xffffffff),
12078 offset, size, path, atoken);
12079 }
12080 if (kern_ret != KERN_SUCCESS) {
12081 /*
12082 * Also treat this like being unable to access the backing
12083 * store server.
12084 */
12085 os_log_error(OS_LOG_DEFAULT, "NSPACE resolve failure: %d",
12086 kern_ret);
12087 error = ETIMEDOUT;
12088
12089 NSPACE_REQ_LOCK();
12090 nspace_resolver_req_remove(&req);
12091 NSPACE_REQ_UNLOCK();
12092 goto out_release_port;
12093 }
12094
12095 /*
12096 * Give back the memory we allocated earlier while we wait; we
12097 * no longer need it.
12098 */
12099 kfree_data(path, path_alloc_len);
12100 path = NULL;
12101
12102 /*
12103 * Request has been submitted to the resolver. Now (interruptibly)
12104 * wait for completion. Upon requrn, the request will have been
12105 * removed from the lookup table.
12106 */
12107 error = nspace_resolver_req_wait(&req);
12108
12109 out_release_port:
12110 if (path != NULL) {
12111 kfree_data(path, path_alloc_len);
12112 path = NULL;
12113 }
12114 ipc_port_release_send(mach_port);
12115
12116 return error;
12117 #else
12118 return ENOTSUP;
12119 #endif /* CONFIG_DATALESS_FILES */
12120 }
12121
12122 /*
12123 * vfs_materialize_file: Materialize a regular file.
12124 *
12125 * Inputs:
12126 * vp The dataless file to be materialized.
12127 *
12128 * op What kind of operation is being performed:
12129 * -> NAMESPACE_HANDLER_READ_OP
12130 * -> NAMESPACE_HANDLER_WRITE_OP
12131 * -> NAMESPACE_HANDLER_LINK_CREATE
12132 * -> NAMESPACE_HANDLER_DELETE_OP
12133 * -> NAMESPACE_HANDLER_TRUNCATE_OP
12134 * -> NAMESPACE_HANDLER_RENAME_OP
12135 *
12136 * offset offset of I/O for READ or WRITE. Ignored for
12137 * other ops.
12138 *
12139 * size size of I/O for READ or WRITE Ignored for
12140 * other ops.
12141 *
12142 * If offsize or size are -1 for a READ or WRITE, then the resolver should
12143 * consider the range to be unknown.
12144 *
12145 * Upon successful return, the caller may proceed with the operation.
12146 * N.B. the file may still be "dataless" in this case.
12147 */
12148 int
vfs_materialize_file(struct vnode * vp,uint64_t op,int64_t offset,int64_t size)12149 vfs_materialize_file(
12150 struct vnode *vp,
12151 uint64_t op,
12152 int64_t offset,
12153 int64_t size)
12154 {
12155 if (vp->v_type != VREG) {
12156 return EFTYPE;
12157 }
12158 return vfs_materialize_item(vp, op, offset, size, NULL, 0);
12159 }
12160
12161 /*
12162 * vfs_materialize_dir:
12163 *
12164 * Inputs:
12165 * vp The dataless directory to be materialized.
12166 *
12167 * op What kind of operation is being performed:
12168 * -> NAMESPACE_HANDLER_READ_OP
12169 * -> NAMESPACE_HANDLER_WRITE_OP
12170 * -> NAMESPACE_HANDLER_DELETE_OP
12171 * -> NAMESPACE_HANDLER_RENAME_OP
12172 * -> NAMESPACE_HANDLER_LOOKUP_OP
12173 *
12174 * lookup_name Name being looked up for a LOOKUP op. Ignored for
12175 * other ops. May or may not be NUL-terminated; see below.
12176 *
12177 * namelen If non-zero, then lookup_name is assumed to not be NUL-
12178 * terminated and namelen is the number of valid bytes in
12179 * lookup_name. If zero, then lookup_name is assumed to be
12180 * NUL-terminated.
12181 *
12182 * Upon successful return, the caller may proceed with the operation.
12183 * N.B. the directory may still be "dataless" in this case.
12184 */
12185 int
vfs_materialize_dir(struct vnode * vp,uint64_t op,char * lookup_name,size_t namelen)12186 vfs_materialize_dir(
12187 struct vnode *vp,
12188 uint64_t op,
12189 char *lookup_name,
12190 size_t namelen)
12191 {
12192 if (vp->v_type != VDIR) {
12193 return EFTYPE;
12194 }
12195 if (op == NAMESPACE_HANDLER_LOOKUP_OP && lookup_name == NULL) {
12196 return EINVAL;
12197 }
12198 return vfs_materialize_item(vp, op, 0, 0, lookup_name, namelen);
12199 }
12200
/*
 * resolve_nspace_item_ext: Legacy entry point for materializing a dataless
 * item (regular file, directory, or symlink).
 *
 * Builds the vnode's path, registers a resolver request, sends it to
 * filecoordinationd via send_nspace_resolve_path(), and then waits
 * (interruptibly) for the resolver's answer.  "arg" is unused.  Returns
 * 0 on success, ENOTSUP for snapshot events (or kernels without
 * CONFIG_DATALESS_FILES), and ETIMEDOUT when the resolver port cannot be
 * obtained or the send fails.
 */
int
resolve_nspace_item_ext(
	struct vnode *vp __no_dataless_unused,
	uint64_t op __no_dataless_unused,
	void *arg __unused)
{
#if CONFIG_DATALESS_FILES
	int error;
	mach_port_t mp;
	char *path = NULL;
	int path_len;
	kern_return_t kr;
	struct nspace_resolver_request req;

	// only allow namespace events on regular files, directories and symlinks.
	if (vp->v_type != VREG && vp->v_type != VDIR && vp->v_type != VLNK) {
		return EFTYPE;
	}

	//
	// if this is a snapshot event and the vnode is on a
	// disk image just pretend nothing happened since any
	// change to the disk image will cause the disk image
	// itself to get backed up and this avoids multi-way
	// deadlocks between the snapshot handler and the ever
	// popular diskimages-helper process. the variable
	// nspace_allow_virtual_devs allows this behavior to
	// be overridden (for use by the Mobile TimeMachine
	// testing infrastructure which uses disk images)
	//
	if (op & NAMESPACE_HANDLER_SNAPSHOT_EVENT) {
		os_log_debug(OS_LOG_DEFAULT, "NSPACE SNAPSHOT not handled");
		return ENOTSUP;
	}

	error = vfs_context_dataless_materialization_is_prevented(
		vfs_context_current());
	if (error) {
		log_materialization_prevented(vp, op);
		return error;
	}

	kr = host_get_filecoordinationd_port(host_priv_self(), &mp);
	if (kr != KERN_SUCCESS || !IPC_PORT_VALID(mp)) {
		os_log_error(OS_LOG_DEFAULT, "NSPACE no port");
		// Treat this like being unable to access the backing
		// store server.
		return ETIMEDOUT;
	}

	// Grow the path buffer in MAXPATHLEN steps (up to
	// FSGETPATH_MAXBUFLEN) while vn_getpath() keeps reporting ENOSPC.
	int path_alloc_len = MAXPATHLEN;
	do {
		path = kalloc_data(path_alloc_len, Z_WAITOK | Z_ZERO);
		if (path == NULL) {
			return ENOMEM;
		}

		path_len = path_alloc_len;
		error = vn_getpath(vp, path, &path_len);
		if (error == 0) {
			break;
		} else if (error == ENOSPC) {
			kfree_data(path, path_alloc_len);
			path = NULL;
		} else {
			goto out_release_port;
		}
	} while (error == ENOSPC && (path_alloc_len += MAXPATHLEN) && path_alloc_len <= FSGETPATH_MAXBUFLEN);

	// If the loop gave up with ENOSPC, "error" stays set, the request
	// is never submitted, and we fall through to the cleanup below.
	if (error == 0) {
		int xxx_rdar44371223; /* XXX Mig bug */
		req.r_req_id = next_nspace_req_id();
		req.r_resolver_error = 0;
		req.r_flags = 0;

		if ((error = vnode_ref(vp)) == 0) { // take a ref so that the vnode doesn't go away
			req.r_vp = vp;
		} else {
			goto out_release_port;
		}

		NSPACE_REQ_LOCK();
		error = nspace_resolver_req_add(&req);
		NSPACE_REQ_UNLOCK();
		if (error) {
			vnode_rele(req.r_vp);
			goto out_release_port;
		}

		os_log_debug(OS_LOG_DEFAULT, "NSPACE resolve_path call");
		kr = send_nspace_resolve_path(mp, req.r_req_id,
		    proc_getpid(current_proc()), (uint32_t)(op & 0xffffffff),
		    path, &xxx_rdar44371223);
		if (kr != KERN_SUCCESS) {
			// Also treat this like being unable to access
			// the backing store server.
			os_log_error(OS_LOG_DEFAULT,
			    "NSPACE resolve_path failure: %d", kr);
			error = ETIMEDOUT;

			NSPACE_REQ_LOCK();
			nspace_resolver_req_remove(&req);
			NSPACE_REQ_UNLOCK();
			vnode_rele(req.r_vp);
			goto out_release_port;
		}

		// Give back the memory we allocated earlier while
		// we wait; we no longer need it.
		kfree_data(path, path_alloc_len);
		path = NULL;

		// Request has been submitted to the resolver.
		// Now (interruptibly) wait for completion.
		// Upon return, the request will have been removed
		// from the lookup table.
		error = nspace_resolver_req_wait(&req);

		vnode_rele(req.r_vp);
	}

out_release_port:
	if (path != NULL) {
		kfree_data(path, path_alloc_len);
		path = NULL;
	}
	ipc_port_release_send(mp);

	return error;
#else
	return ENOTSUP;
#endif /* CONFIG_DATALESS_FILES */
}
12334
/*
 * nspace_snapshot_event: No-op handler for namespace snapshot events.
 * All arguments are ignored and success (0) is returned unconditionally.
 */
int
nspace_snapshot_event(__unused vnode_t vp, __unused time_t ctime,
    __unused uint64_t op_type, __unused void *arg)
{
	return 0;
}
12341
#if 0
/*
 * build_volfs_path: Construct a "/.vol/<fsid>/<fileid>" style path for vp.
 *
 * NOTE: compiled out (#if 0); retained for reference only.  Returns 0 on
 * success and -1 when vnode_getattr() fails (in which case a placeholder
 * path is emitted).  In both cases *len is set to the formatted length
 * plus one.
 */
static int
build_volfs_path(struct vnode *vp, char *path, int *len)
{
	struct vnode_attr va;
	int ret;

	VATTR_INIT(&va);
	VATTR_WANTED(&va, va_fsid);
	VATTR_WANTED(&va, va_fileid);

	if (vnode_getattr(vp, &va, vfs_context_kernel()) != 0) {
		*len = snprintf(path, *len, "/non/existent/path/because/vnode_getattr/failed") + 1;
		ret = -1;
	} else {
		*len = snprintf(path, *len, "/.vol/%d/%lld", (dev_t)va.va_fsid, va.va_fileid) + 1;
		ret = 0;
	}

	return ret;
}
#endif
12364
12365 static unsigned long
fsctl_bogus_command_compat(unsigned long cmd)12366 fsctl_bogus_command_compat(unsigned long cmd)
12367 {
12368 switch (cmd) {
12369 case IOCBASECMD(FSIOC_SYNC_VOLUME):
12370 return FSIOC_SYNC_VOLUME;
12371 case IOCBASECMD(FSIOC_ROUTEFS_SETROUTEID):
12372 return FSIOC_ROUTEFS_SETROUTEID;
12373 case IOCBASECMD(FSIOC_SET_PACKAGE_EXTS):
12374 return FSIOC_SET_PACKAGE_EXTS;
12375 case IOCBASECMD(FSIOC_SET_FSTYPENAME_OVERRIDE):
12376 return FSIOC_SET_FSTYPENAME_OVERRIDE;
12377 case IOCBASECMD(DISK_CONDITIONER_IOC_GET):
12378 return DISK_CONDITIONER_IOC_GET;
12379 case IOCBASECMD(DISK_CONDITIONER_IOC_SET):
12380 return DISK_CONDITIONER_IOC_SET;
12381 case IOCBASECMD(FSIOC_FIOSEEKHOLE):
12382 return FSIOC_FIOSEEKHOLE;
12383 case IOCBASECMD(FSIOC_FIOSEEKDATA):
12384 return FSIOC_FIOSEEKDATA;
12385 case IOCBASECMD(SPOTLIGHT_IOC_GET_MOUNT_TIME):
12386 return SPOTLIGHT_IOC_GET_MOUNT_TIME;
12387 case IOCBASECMD(SPOTLIGHT_IOC_GET_LAST_MTIME):
12388 return SPOTLIGHT_IOC_GET_LAST_MTIME;
12389 }
12390
12391 return cmd;
12392 }
12393
/*
 * cas_bsdflags_setattr: Setter callback handed to chflags0() that performs
 * the compare-and-swap of BSD flags via the filesystem's FSIOC_CAS_BSDFLAGS
 * ioctl.  "arg" is the struct fsioc_cas_bsdflags describing the swap.
 */
static int
cas_bsdflags_setattr(vnode_t vp, void *arg, vfs_context_t ctx)
{
	return VNOP_IOCTL(vp, FSIOC_CAS_BSDFLAGS, arg, FWRITE, ctx);
}
12399
/*
 * handle_sync_volume: FSIOC_SYNC_VOLUME backend.
 *
 * Syncs the volume containing vp.  "data" points at the caller's uint32_t
 * flags word (FSCTL_SYNC_WAIT / FSCTL_SYNC_FULLSYNC).  Once the sync path
 * is entered the argument vnode's iocount is released and *arg_vp is set
 * to NULL so the caller knows not to vnode_put() it again.
 */
static int __attribute__((noinline))
handle_sync_volume(vnode_t vp, vnode_t *arg_vp, caddr_t data, vfs_context_t ctx)
{
	struct vfs_attr vfa;
	mount_t mp = vp->v_mount;
	unsigned arg;
	int error;

	/* record vid of vp so we can drop it below. */
	uint32_t vvid = vp->v_id;

	/*
	 * Then grab mount_iterref so that we can release the vnode.
	 * Without this, a thread may call vnode_iterate_prepare then
	 * get into a deadlock because we've never released the root vp
	 */
	error = mount_iterref(mp, 0);
	if (error) {
		return error;
	}
	/* Keep a holdcount so the vnode memory stays valid after vnode_put. */
	vnode_hold(vp);
	vnode_put(vp);

	arg = MNT_NOWAIT;
	if (*(uint32_t*)data & FSCTL_SYNC_WAIT) {
		arg = MNT_WAIT;
	}

	/*
	 * If the filesystem supports multiple filesystems in a partition
	 * (e.g. APFS volumes in a container), it knows that the waitfor
	 * argument to VFS_SYNC is a set of flags.
	 */
	VFSATTR_INIT(&vfa);
	VFSATTR_WANTED(&vfa, f_capabilities);
	if ((vfs_getattr(mp, &vfa, vfs_context_current()) == 0) &&
	    VFSATTR_IS_SUPPORTED(&vfa, f_capabilities) &&
	    ((vfa.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_SHARED_SPACE)) &&
	    ((vfa.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_SHARED_SPACE))) {
		arg |= MNT_VOLUME;
	}

	/* issue the sync for this volume */
	(void)sync_callback(mp, &arg);

	/*
	 * Then release the mount_iterref once we're done syncing; it's not
	 * needed for the VNOP_IOCTL below
	 */
	mount_iterdrop(mp);

	/*
	 * NOTE(review): "arg" holds MNT_* wait flags at this point, yet it
	 * is tested against the FSCTL_SYNC_FULLSYNC user flag rather than
	 * against *(uint32_t*)data -- confirm this coincidence of bit
	 * values is the intended behavior before changing anything.
	 */
	if (arg & FSCTL_SYNC_FULLSYNC) {
		/* re-obtain vnode iocount on the root vp, if possible */
		error = vnode_getwithvid(vp, vvid);
		if (error == 0) {
			error = VNOP_IOCTL(vp, F_FULLFSYNC, (caddr_t)NULL, 0, ctx);
			vnode_put(vp);
		}
	}
	vnode_drop(vp);
	/* mark the argument VP as having been released */
	*arg_vp = NULL;
	return error;
}
12464
#if ROUTEFS
/*
 * handle_routes: FSIOC_ROUTEFS_SETROUTEID backend.  Requires superuser;
 * copies the mount path in from user space and mounts routefs on it.
 */
static int __attribute__((noinline))
handle_routes(user_addr_t udata)
{
	char routepath[MAXPATHLEN];
	size_t len = 0;
	int error;

	error = suser(kauth_cred_get(), &(current_proc()->p_acflag));
	if (error != 0) {
		return error;
	}

	bzero(routepath, MAXPATHLEN);
	error = copyinstr(udata, &routepath[0], MAXPATHLEN, &len);
	if (error != 0) {
		return error;
	}

	return routefs_kernel_mount(routepath);
}
#endif
12485
12486 static int __attribute__((noinline))
handle_flags(vnode_t vp,caddr_t data,vfs_context_t ctx)12487 handle_flags(vnode_t vp, caddr_t data, vfs_context_t ctx)
12488 {
12489 struct fsioc_cas_bsdflags *cas = (struct fsioc_cas_bsdflags *)data;
12490 struct vnode_attr va;
12491 int error;
12492
12493 VATTR_INIT(&va);
12494 VATTR_SET(&va, va_flags, cas->new_flags);
12495
12496 error = chflags0(vp, &va, cas_bsdflags_setattr, cas, ctx);
12497
12498 #if CONFIG_FSE
12499 if (error == 0 && cas->expected_flags == cas->actual_flags && need_fsevent(FSE_STAT_CHANGED, vp)) {
12500 add_fsevent(FSE_STAT_CHANGED, ctx, FSE_ARG_VNODE, vp, FSE_ARG_DONE);
12501 }
12502 #endif
12503
12504 return error;
12505 }
12506
12507 static int __attribute__((noinline))
handle_auth(vnode_t vp,u_long cmd,caddr_t data,u_long options,vfs_context_t ctx)12508 handle_auth(vnode_t vp, u_long cmd, caddr_t data, u_long options, vfs_context_t ctx)
12509 {
12510 struct mount *mp = NULL;
12511 errno_t rootauth = 0;
12512
12513 mp = vp->v_mount;
12514
12515 /*
12516 * query the underlying FS and see if it reports something
12517 * sane for this vnode. If volume is authenticated via
12518 * chunklist, leave that for the caller to determine.
12519 */
12520 rootauth = VNOP_IOCTL(vp, cmd, data, (int)options, ctx);
12521
12522 return rootauth;
12523 }
12524
12525 #define SET_PACKAGE_EXTENSION_ENTITLEMENT \
12526 "com.apple.private.kernel.set-package-extensions"
12527
/*
 * Make a filesystem-specific control call.
 *
 * Marshals the ioctl-style argument described by "cmd" into a kernel
 * buffer (a stack buffer for arguments up to STK_PARAMS bytes, heap
 * otherwise), dispatches generic FSIOC_* selectors to their handlers,
 * forwards everything else to the filesystem via VNOP_IOCTL(), and copies
 * any IOC_OUT result back to user space.
 *
 * *arg_vp may be set to NULL by a handler (FSIOC_SYNC_VOLUME releases the
 * vnode); callers must re-check it after return.
 */
/* ARGSUSED */
static int
fsctl_internal(proc_t p, vnode_t *arg_vp, u_long cmd, user_addr_t udata, u_long options, vfs_context_t ctx)
{
	int error = 0;
	boolean_t is64bit;
	u_int size;
#define STK_PARAMS 128
	char stkbuf[STK_PARAMS] = {0};
	caddr_t data, memp;
	vnode_t vp = *arg_vp;

	/* fsctl is not supported on character or block devices. */
	if (vp->v_type == VCHR || vp->v_type == VBLK) {
		return ENOTTY;
	}

	/* Remap legacy IOCBASECMD-form selectors to their full commands. */
	cmd = fsctl_bogus_command_compat(cmd);

	size = IOCPARM_LEN(cmd);
	if (size > IOCPARM_MAX) {
		return EINVAL;
	}

	is64bit = proc_is64bit(p);

	memp = NULL;

	/* Large arguments go to the heap; small ones use the stack buffer. */
	if (size > sizeof(stkbuf)) {
		if ((memp = (caddr_t)kalloc_data(size, Z_WAITOK)) == 0) {
			return ENOMEM;
		}
		data = memp;
	} else {
		data = &stkbuf[0];
	};

	/* Marshal the argument per the command's IOC_IN/IOC_OUT/IOC_VOID bits. */
	if (cmd & IOC_IN) {
		if (size) {
			error = copyin(udata, data, size);
			if (error) {
				if (memp) {
					kfree_data(memp, size);
				}
				return error;
			}
		} else {
			/* Zero-size IOC_IN: the "argument" is udata itself. */
			if (is64bit) {
				*(user_addr_t *)data = udata;
			} else {
				*(uint32_t *)data = (uint32_t)udata;
			}
		};
	} else if ((cmd & IOC_OUT) && size) {
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		bzero(data, size);
	} else if (cmd & IOC_VOID) {
		if (is64bit) {
			*(user_addr_t *)data = udata;
		} else {
			*(uint32_t *)data = (uint32_t)udata;
		}
	}

	/* Check to see if it's a generic command */
	switch (cmd) {
	case FSIOC_SYNC_VOLUME:
		error = handle_sync_volume(vp, arg_vp, data, ctx);
		break;

	case FSIOC_ROUTEFS_SETROUTEID:
#if ROUTEFS
		error = handle_routes(udata);
#endif
		break;

	case FSIOC_SET_PACKAGE_EXTS: {
		user_addr_t ext_strings;
		uint32_t num_entries;
		uint32_t max_width;

		if (!IOTaskHasEntitlement(vfs_context_task(ctx),
		    SET_PACKAGE_EXTENSION_ENTITLEMENT)) {
			error = EPERM;
			break;
		}

		if ((is64bit && size != sizeof(user64_package_ext_info))
		    || (is64bit == 0 && size != sizeof(user32_package_ext_info))) {
			// either you're 64-bit and passed a 64-bit struct or
			// you're 32-bit and passed a 32-bit struct. otherwise
			// it's not ok.
			error = EINVAL;
			break;
		}

		if (is64bit) {
			if (sizeof(user64_addr_t) > sizeof(user_addr_t)) {
				assert(((user64_package_ext_info *)data)->strings <= UINT32_MAX);
			}
			ext_strings = (user_addr_t)((user64_package_ext_info *)data)->strings;
			num_entries = ((user64_package_ext_info *)data)->num_entries;
			max_width = ((user64_package_ext_info *)data)->max_width;
		} else {
			ext_strings = CAST_USER_ADDR_T(((user32_package_ext_info *)data)->strings);
			num_entries = ((user32_package_ext_info *)data)->num_entries;
			max_width = ((user32_package_ext_info *)data)->max_width;
		}
		error = set_package_extensions_table(ext_strings, num_entries, max_width);
	}
	break;

	case FSIOC_SET_FSTYPENAME_OVERRIDE:
	{
		if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
			break;
		}
		if (vp->v_mount) {
			mount_lock(vp->v_mount);
			if (data[0] != 0) {
				int i;
				/* Require a NUL within the first MFSTYPENAMELEN bytes. */
				for (i = 0; i < MFSTYPENAMELEN; i++) {
					if (!data[i]) {
						goto continue_copy;
					}
				}
				/*
				 * Getting here means we have a user data string which has no
				 * NULL termination in its first MFSTYPENAMELEN bytes.
				 * This is bogus, let's avoid strlcpy-ing the read data and
				 * return an error.
				 */
				error = EINVAL;
				goto unlock;
continue_copy:
				strlcpy(&vp->v_mount->fstypename_override[0], data, MFSTYPENAMELEN);
				vp->v_mount->mnt_kern_flag |= MNTK_TYPENAME_OVERRIDE;
				if (vfs_isrdonly(vp->v_mount) && strcmp(vp->v_mount->fstypename_override, "mtmfs") == 0) {
					vp->v_mount->mnt_kern_flag |= MNTK_EXTENDED_SECURITY;
					vp->v_mount->mnt_kern_flag &= ~MNTK_AUTH_OPAQUE;
				}
			} else {
				/* Empty string clears any existing override. */
				if (strcmp(vp->v_mount->fstypename_override, "mtmfs") == 0) {
					vp->v_mount->mnt_kern_flag &= ~MNTK_EXTENDED_SECURITY;
				}
				vp->v_mount->mnt_kern_flag &= ~MNTK_TYPENAME_OVERRIDE;
				vp->v_mount->fstypename_override[0] = '\0';
			}
unlock:
			mount_unlock(vp->v_mount);
		}
	}
	break;

	case DISK_CONDITIONER_IOC_GET: {
		error = disk_conditioner_get_info(vp->v_mount, (disk_conditioner_info *)data);
	}
	break;

	case DISK_CONDITIONER_IOC_SET: {
		error = disk_conditioner_set_info(vp->v_mount, (disk_conditioner_info *)data);
	}
	break;

	case FSIOC_CAS_BSDFLAGS:
		error = handle_flags(vp, data, ctx);
		break;

	case FSIOC_FD_ONLY_OPEN_ONCE: {
		error = 0;
		if (vnode_usecount(vp) > 1) {
			vnode_lock_spin(vp);
			/* Named streams hold extra usecounts; re-check under the lock. */
			if (vp->v_lflag & VL_HASSTREAMS) {
				if (vnode_isinuse_locked(vp, 1, 1)) {
					error = EBUSY;
				}
			} else if (vnode_usecount(vp) > 1) {
				error = EBUSY;
			}
			vnode_unlock(vp);
		}
	}
	break;

	case FSIOC_EVAL_ROOTAUTH:
		error = handle_auth(vp, cmd, data, options, ctx);
		break;

	case FSIOC_TEST_FSE_ACCESS_GRANTED:
		error = test_fse_access_granted(vp, (unsigned long)udata, ctx);
		break;

	default: {
		/* other, known commands shouldn't be passed down here */
		switch (cmd) {
		case F_PUNCHHOLE:
		case F_TRIM_ACTIVE_FILE:
		case F_RDADVISE:
		case F_TRANSCODEKEY:
		case F_GETPROTECTIONLEVEL:
		case F_GETDEFAULTPROTLEVEL:
		case F_MAKECOMPRESSED:
		case F_SET_GREEDY_MODE:
		case F_SETSTATICCONTENT:
		case F_SETIOTYPE:
		case F_SETBACKINGSTORE:
		case F_GETPATH_MTMINFO:
		case APFSIOC_REVERT_TO_SNAPSHOT:
		case FSIOC_FIOSEEKHOLE:
		case FSIOC_FIOSEEKDATA:
		case HFS_GET_BOOT_INFO:
		case HFS_SET_BOOT_INFO:
		case FIOPINSWAP:
		case F_CHKCLEAN:
		case F_FULLFSYNC:
		case F_BARRIERFSYNC:
		case F_FREEZE_FS:
		case F_THAW_FS:
		case FSIOC_KERNEL_ROOTAUTH:
		case FSIOC_GRAFT_FS:
		case FSIOC_UNGRAFT_FS:
		case FSIOC_AUTH_FS:
			error = EINVAL;
			goto outdrop;
		}
		/* Invoke the filesystem-specific code */
		error = VNOP_IOCTL(vp, cmd, data, (int)options, ctx);
	}
	} /* end switch stmt */

	/*
	 * if no errors, copy any data to user. Size was
	 * already set and checked above.
	 */
	if (error == 0 && (cmd & IOC_OUT) && size) {
		error = copyout(data, udata, size);
	}

outdrop:
	if (memp) {
		kfree_data(memp, size);
	}

	return error;
}
12778
12779 /* ARGSUSED */
12780 int
fsctl(proc_t p,struct fsctl_args * uap,__unused int32_t * retval)12781 fsctl(proc_t p, struct fsctl_args *uap, __unused int32_t *retval)
12782 {
12783 int error;
12784 struct nameidata nd;
12785 uint32_t nameiflags;
12786 vnode_t vp = NULL;
12787 vfs_context_t ctx = vfs_context_current();
12788
12789 AUDIT_ARG(cmd, (int)uap->cmd);
12790 AUDIT_ARG(value32, uap->options);
12791 /* Get the vnode for the file we are getting info on: */
12792 nameiflags = 0;
12793 //
12794 // if we come through fsctl() then the file is by definition not open.
12795 // therefore for the FSIOC_FD_ONLY_OPEN_ONCE selector we return an error
12796 // lest the caller mistakenly thinks the only open is their own (but in
12797 // reality it's someone elses).
12798 //
12799 if (uap->cmd == FSIOC_FD_ONLY_OPEN_ONCE) {
12800 return EINVAL;
12801 }
12802 if ((uap->options & FSOPT_NOFOLLOW) == 0) {
12803 nameiflags |= FOLLOW;
12804 }
12805 if (uap->cmd == FSIOC_FIRMLINK_CTL) {
12806 nameiflags |= (CN_FIRMLINK_NOFOLLOW | NOCACHE);
12807 }
12808 NDINIT(&nd, LOOKUP, OP_FSCTL, nameiflags | AUDITVNPATH1,
12809 UIO_USERSPACE, uap->path, ctx);
12810 if ((error = namei(&nd))) {
12811 goto done;
12812 }
12813 vp = nd.ni_vp;
12814 nameidone(&nd);
12815
12816 #if CONFIG_MACF
12817 error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd);
12818 if (error) {
12819 goto done;
12820 }
12821 #endif
12822
12823 error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);
12824
12825 done:
12826 if (vp) {
12827 vnode_put(vp);
12828 }
12829 return error;
12830 }
12831 /* ARGSUSED */
12832 int
ffsctl(proc_t p,struct ffsctl_args * uap,__unused int32_t * retval)12833 ffsctl(proc_t p, struct ffsctl_args *uap, __unused int32_t *retval)
12834 {
12835 int error;
12836 vnode_t vp = NULL;
12837 vfs_context_t ctx = vfs_context_current();
12838 int fd = -1;
12839
12840 AUDIT_ARG(fd, uap->fd);
12841 AUDIT_ARG(cmd, (int)uap->cmd);
12842 AUDIT_ARG(value32, uap->options);
12843
12844 /* Get the vnode for the file we are getting info on: */
12845 if ((error = file_vnode(uap->fd, &vp))) {
12846 return error;
12847 }
12848 fd = uap->fd;
12849 if ((error = vnode_getwithref(vp))) {
12850 file_drop(fd);
12851 return error;
12852 }
12853
12854 #if CONFIG_MACF
12855 if ((error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd))) {
12856 file_drop(fd);
12857 vnode_put(vp);
12858 return error;
12859 }
12860 #endif
12861
12862 error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);
12863
12864 file_drop(fd);
12865
12866 /*validate vp; fsctl_internal() can drop iocount and reset vp to NULL*/
12867 if (vp) {
12868 vnode_put(vp);
12869 }
12870
12871 return error;
12872 }
12873 /* end of fsctl system call */
12874
12875 #define FILESEC_ACCESS_ENTITLEMENT \
12876 "com.apple.private.vfs.filesec-access"
12877
12878 static int
xattr_entitlement_check(const char * attrname,vfs_context_t ctx,bool setting)12879 xattr_entitlement_check(const char *attrname, vfs_context_t ctx, bool setting)
12880 {
12881 if (strcmp(attrname, KAUTH_FILESEC_XATTR) == 0) {
12882 /*
12883 * get: root and tasks with FILESEC_ACCESS_ENTITLEMENT.
12884 * set: only tasks with FILESEC_ACCESS_ENTITLEMENT.
12885 */
12886 if ((!setting && vfs_context_issuser(ctx)) ||
12887 IOTaskHasEntitlement(vfs_context_task(ctx),
12888 FILESEC_ACCESS_ENTITLEMENT)) {
12889 return 0;
12890 }
12891 }
12892
12893 return EPERM;
12894 }
12895
/*
 * Retrieve the data of an extended attribute (getxattr(2)).
 *
 * With a non-NULL uap->value, reads up to uap->size bytes of the named
 * attribute into the user buffer and returns the byte count in *retval.
 * With a NULL value buffer (or the size == -1 compatibility hack below),
 * only the attribute's size is reported.
 */
int
getxattr(proc_t p, struct getxattr_args *uap, user_ssize_t *retval)
{
	vnode_t vp;
	struct nameidata nd;
	char attrname[XATTR_MAXNAMELEN + 1];
	vfs_context_t ctx = vfs_context_current();
	uio_t auio = NULL;
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	size_t attrsize = 0;
	size_t namelen;
	u_int32_t nameiflags;
	int error;
	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];

	/* Reject options that callers of this syscall may not specify. */
	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_GETXATTR, nameiflags, spacetype, uap->path, ctx);
	if ((error = namei(&nd))) {
		return error;
	}
	vp = nd.ni_vp;
	nameidone(&nd);

	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
	if (error != 0) {
		goto out;
	}
	/* Protected attributes require root or a special entitlement. */
	if (xattr_protected(attrname) &&
	    (error = xattr_entitlement_check(attrname, ctx, false)) != 0) {
		goto out;
	}
	/*
	 * the specific check for 0xffffffff is a hack to preserve
	 * binary compatibility in K64 with applications that discovered
	 * that passing in a buf pointer and a size of -1 resulted in
	 * just the size of the indicated extended attribute being returned.
	 * this isn't part of the documented behavior, but because of the
	 * original implementation's check for "uap->size > 0", this behavior
	 * was allowed. In K32 that check turned into a signed comparison
	 * even though uap->size is unsigned... in K64, we blow by that
	 * check because uap->size is unsigned and doesn't get sign smeared
	 * in the munger for a 32 bit user app. we also need to add a
	 * check to limit the maximum size of the buffer being passed in...
	 * unfortunately, the underlying filesystems seem to just malloc
	 * the requested size even if the actual extended attribute is tiny.
	 * because that malloc is for kernel wired memory, we have to put a
	 * sane limit on it.
	 *
	 * U32 running on K64 will yield 0x00000000ffffffff for uap->size
	 * U64 running on K64 will yield -1 (64 bits wide)
	 * U32/U64 running on K32 will yield -1 (32 bits wide)
	 */
	if (uap->size == 0xffffffff || uap->size == (size_t)-1) {
		goto no_uio;
	}

	if (uap->value) {
		/* Clamp the request to the largest xattr we will read. */
		if (uap->size > (size_t)XATTR_MAXSIZE) {
			uap->size = XATTR_MAXSIZE;
		}

		auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
		    &uio_buf[0], sizeof(uio_buf));
		uio_addiov(auio, uap->value, uap->size);
	}
no_uio:
	/* With a NULL uio, vn_getxattr() only reports the attribute size. */
	error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, ctx);
out:
	vnode_put(vp);

	/* Bytes transferred, or the attribute size for size-only queries. */
	if (auio) {
		*retval = uap->size - uio_resid(auio);
	} else {
		*retval = (user_ssize_t)attrsize;
	}

	return error;
}
12981
12982 /*
12983 * Retrieve the data of an extended attribute.
12984 */
12985 int
fgetxattr(proc_t p,struct fgetxattr_args * uap,user_ssize_t * retval)12986 fgetxattr(proc_t p, struct fgetxattr_args *uap, user_ssize_t *retval)
12987 {
12988 vnode_t vp;
12989 char attrname[XATTR_MAXNAMELEN + 1];
12990 vfs_context_t ctx = vfs_context_current();
12991 uio_t auio = NULL;
12992 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
12993 size_t attrsize = 0;
12994 size_t namelen;
12995 int error;
12996 uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];
12997
12998 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
12999 return EINVAL;
13000 }
13001
13002 if ((error = file_vnode(uap->fd, &vp))) {
13003 return error;
13004 }
13005 if ((error = vnode_getwithref(vp))) {
13006 file_drop(uap->fd);
13007 return error;
13008 }
13009 error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
13010 if (error != 0) {
13011 goto out;
13012 }
13013 if (xattr_protected(attrname) &&
13014 (error = xattr_entitlement_check(attrname, ctx, false)) != 0) {
13015 goto out;
13016 }
13017 if (uap->value && uap->size > 0) {
13018 if (uap->size > (size_t)XATTR_MAXSIZE) {
13019 uap->size = XATTR_MAXSIZE;
13020 }
13021
13022 auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
13023 &uio_buf[0], sizeof(uio_buf));
13024 uio_addiov(auio, uap->value, uap->size);
13025 }
13026
13027 error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, vfs_context_current());
13028 out:
13029 (void)vnode_put(vp);
13030 file_drop(uap->fd);
13031
13032 if (auio) {
13033 *retval = uap->size - uio_resid(auio);
13034 } else {
13035 *retval = (user_ssize_t)attrsize;
13036 }
13037 return error;
13038 }
13039
/*
 * Heap-allocated scratch state for setxattr(); kept off the kernel
 * stack because the nameidata, attribute name, and uio backing buffer
 * are collectively large.
 */
struct setxattr_ctx {
	struct nameidata nd;                   /* lookup state for the target path */
	char attrname[XATTR_MAXNAMELEN + 1];   /* NUL-terminated attribute name */
	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)]; /* backing store for the value uio */
};
13046
/*
 * Set the data of an extended attribute on the file named by uap->path.
 */
int
setxattr(proc_t p, struct setxattr_args *uap, int *retval)
{
	vnode_t vp;
	vfs_context_t ctx = vfs_context_current();
	uio_t auio = NULL;
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	size_t namelen;
	u_int32_t nameiflags;
	int error;
	struct setxattr_ctx *sactx;	/* heap scratch: nameidata + name + uio buffer */

	/* Reject kernel-internal options coming from user space. */
	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	sactx = kalloc_type(struct setxattr_ctx, Z_WAITOK);
	if (sactx == NULL) {
		return ENOMEM;
	}

	error = copyinstr(uap->attrname, sactx->attrname, sizeof(sactx->attrname), &namelen);
	if (error != 0) {
		if (error == EPERM) {
			/* if the string won't fit in attrname, copyinstr emits EPERM */
			error = ENAMETOOLONG;
		}
		/* Otherwise return the default error from copyinstr to detect ERANGE, etc */
		goto out;
	}
	/* Protected names require an entitlement (true = write access). */
	if (xattr_protected(sactx->attrname) &&
	    (error = xattr_entitlement_check(sactx->attrname, ctx, true)) != 0) {
		goto out;
	}
	/* A non-zero size with no value pointer is malformed. */
	if (uap->size != 0 && uap->value == 0) {
		error = EINVAL;
		goto out;
	}
	if (uap->size > INT_MAX) {
		error = E2BIG;
		goto out;
	}

	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
#if CONFIG_FILE_LEASES
	/* Need the parent too so the directory lease can be broken below. */
	nameiflags |= WANTPARENT;
#endif
	NDINIT(&sactx->nd, LOOKUP, OP_SETXATTR, nameiflags, spacetype, uap->path, ctx);
	if ((error = namei(&sactx->nd))) {
		goto out;
	}
	vp = sactx->nd.ni_vp;
#if CONFIG_FILE_LEASES
	/* Break the parent directory's lease, then drop its iocount. */
	vnode_breakdirlease(sactx->nd.ni_dvp, false, O_WRONLY);
	vnode_put(sactx->nd.ni_dvp);
#endif
	nameidone(&sactx->nd);

	auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
	    &sactx->uio_buf[0], sizeof(sactx->uio_buf));
	uio_addiov(auio, uap->value, uap->size);

	error = vn_setxattr(vp, sactx->attrname, auio, uap->options, ctx);
#if CONFIG_FSE
	if (error == 0) {
		add_fsevent(FSE_XATTR_MODIFIED, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
	}
#endif
	vnode_put(vp);
out:
	kfree_type(struct setxattr_ctx, sactx);
	*retval = 0;
	return error;
}
13126
13127 /*
13128 * Set the data of an extended attribute.
13129 */
13130 int
fsetxattr(proc_t p,struct fsetxattr_args * uap,int * retval)13131 fsetxattr(proc_t p, struct fsetxattr_args *uap, int *retval)
13132 {
13133 vnode_t vp;
13134 char attrname[XATTR_MAXNAMELEN + 1];
13135 vfs_context_t ctx = vfs_context_current();
13136 uio_t auio = NULL;
13137 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
13138 size_t namelen;
13139 int error;
13140 uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];
13141
13142 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
13143 return EINVAL;
13144 }
13145
13146 error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
13147 if (error != 0) {
13148 if (error == EPERM) {
13149 /* if the string won't fit in attrname, copyinstr emits EPERM */
13150 return ENAMETOOLONG;
13151 }
13152 /* Otherwise return the default error from copyinstr to detect ERANGE, etc */
13153 return error;
13154 }
13155 if (xattr_protected(attrname) &&
13156 (error = xattr_entitlement_check(attrname, ctx, true)) != 0) {
13157 return error;
13158 }
13159 if (uap->size != 0 && uap->value == 0) {
13160 return EINVAL;
13161 }
13162 if (uap->size > INT_MAX) {
13163 return E2BIG;
13164 }
13165 if ((error = file_vnode(uap->fd, &vp))) {
13166 return error;
13167 }
13168 if ((error = vnode_getwithref(vp))) {
13169 file_drop(uap->fd);
13170 return error;
13171 }
13172
13173 #if CONFIG_FILE_LEASES
13174 vnode_breakdirlease(vp, true, O_WRONLY);
13175 #endif
13176
13177 auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
13178 &uio_buf[0], sizeof(uio_buf));
13179 uio_addiov(auio, uap->value, uap->size);
13180
13181 error = vn_setxattr(vp, attrname, auio, uap->options, vfs_context_current());
13182 #if CONFIG_FSE
13183 if (error == 0) {
13184 add_fsevent(FSE_XATTR_MODIFIED, ctx,
13185 FSE_ARG_VNODE, vp,
13186 FSE_ARG_DONE);
13187 }
13188 #endif
13189 vnode_put(vp);
13190 file_drop(uap->fd);
13191 *retval = 0;
13192 return error;
13193 }
13194
/*
 * Remove an extended attribute from the file named by uap->path.
 * XXX Code duplication here.
 */
int
removexattr(proc_t p, struct removexattr_args *uap, int *retval)
{
	vnode_t vp;
	struct nameidata nd;
	char attrname[XATTR_MAXNAMELEN + 1];
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	vfs_context_t ctx = vfs_context_current();
	size_t namelen;
	u_int32_t nameiflags;
	int error;

	/* Reject kernel-internal options coming from user space. */
	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
	if (error != 0) {
		return error;
	}
	/*
	 * Protected attributes are never removable; unlike get/setxattr
	 * there is no entitlement override on this path.
	 */
	if (xattr_protected(attrname)) {
		return EPERM;
	}
	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
#if CONFIG_FILE_LEASES
	/* Need the parent too so the directory lease can be broken below. */
	nameiflags |= WANTPARENT;
#endif
	NDINIT(&nd, LOOKUP, OP_REMOVEXATTR, nameiflags, spacetype, uap->path, ctx);
	if ((error = namei(&nd))) {
		return error;
	}
	vp = nd.ni_vp;
#if CONFIG_FILE_LEASES
	/* Break the parent directory's lease, then drop its iocount. */
	vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
	vnode_put(nd.ni_dvp);
#endif
	nameidone(&nd);

	error = vn_removexattr(vp, attrname, uap->options, ctx);
#if CONFIG_FSE
	if (error == 0) {
		add_fsevent(FSE_XATTR_REMOVED, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
	}
#endif
	vnode_put(vp);
	*retval = 0;
	return error;
}
13249
/*
 * Remove an extended attribute from the file referenced by a
 * file descriptor.
 * XXX Code duplication here.
 */
int
fremovexattr(__unused proc_t p, struct fremovexattr_args *uap, int *retval)
{
	vnode_t vp;
	char attrname[XATTR_MAXNAMELEN + 1];
	size_t namelen;
	int error;
#if CONFIG_FSE
	/* Only needed for the fsevent; the removal itself fetches its own ctx. */
	vfs_context_t ctx = vfs_context_current();
#endif

	/* XATTR_NOFOLLOW makes no sense with an fd; the other two are kernel-internal. */
	if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
	if (error != 0) {
		return error;
	}
	/*
	 * Protected attributes are never removable; unlike get/setxattr
	 * there is no entitlement override on this path.
	 */
	if (xattr_protected(attrname)) {
		return EPERM;
	}
	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(vp, true, O_WRONLY);
#endif

	error = vn_removexattr(vp, attrname, uap->options, vfs_context_current());
#if CONFIG_FSE
	if (error == 0) {
		add_fsevent(FSE_XATTR_REMOVED, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
	}
#endif
	vnode_put(vp);
	file_drop(uap->fd);
	*retval = 0;
	return error;
}
13301
13302 /*
13303 * Retrieve the list of extended attribute names.
13304 * XXX Code duplication here.
13305 */
13306 int
listxattr(proc_t p,struct listxattr_args * uap,user_ssize_t * retval)13307 listxattr(proc_t p, struct listxattr_args *uap, user_ssize_t *retval)
13308 {
13309 vnode_t vp;
13310 struct nameidata nd;
13311 vfs_context_t ctx = vfs_context_current();
13312 uio_t auio = NULL;
13313 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
13314 size_t attrsize = 0;
13315 u_int32_t nameiflags;
13316 int error;
13317 uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];
13318
13319 if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
13320 return EINVAL;
13321 }
13322
13323 nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
13324 NDINIT(&nd, LOOKUP, OP_LISTXATTR, nameiflags, spacetype, uap->path, ctx);
13325 if ((error = namei(&nd))) {
13326 return error;
13327 }
13328 vp = nd.ni_vp;
13329 nameidone(&nd);
13330 if (uap->namebuf != 0 && uap->bufsize > 0) {
13331 auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ,
13332 &uio_buf[0], sizeof(uio_buf));
13333 uio_addiov(auio, uap->namebuf, uap->bufsize);
13334 }
13335
13336 error = vn_listxattr(vp, auio, &attrsize, uap->options, ctx);
13337
13338 vnode_put(vp);
13339 if (auio) {
13340 *retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
13341 } else {
13342 *retval = (user_ssize_t)attrsize;
13343 }
13344 return error;
13345 }
13346
13347 /*
13348 * Retrieve the list of extended attribute names.
13349 * XXX Code duplication here.
13350 */
13351 int
flistxattr(proc_t p,struct flistxattr_args * uap,user_ssize_t * retval)13352 flistxattr(proc_t p, struct flistxattr_args *uap, user_ssize_t *retval)
13353 {
13354 vnode_t vp;
13355 uio_t auio = NULL;
13356 int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
13357 size_t attrsize = 0;
13358 int error;
13359 uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];
13360
13361 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
13362 return EINVAL;
13363 }
13364
13365 if ((error = file_vnode(uap->fd, &vp))) {
13366 return error;
13367 }
13368 if ((error = vnode_getwithref(vp))) {
13369 file_drop(uap->fd);
13370 return error;
13371 }
13372 if (uap->namebuf != 0 && uap->bufsize > 0) {
13373 auio = uio_createwithbuffer(1, 0, spacetype,
13374 UIO_READ, &uio_buf[0], sizeof(uio_buf));
13375 uio_addiov(auio, uap->namebuf, uap->bufsize);
13376 }
13377
13378 error = vn_listxattr(vp, auio, &attrsize, uap->options, vfs_context_current());
13379
13380 vnode_put(vp);
13381 file_drop(uap->fd);
13382 if (auio) {
13383 *retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
13384 } else {
13385 *retval = (user_ssize_t)attrsize;
13386 }
13387 return error;
13388 }
13389
13390 int
fsgetpath_internal(vfs_context_t ctx,int volfs_id,uint64_t objid,vm_size_t bufsize,caddr_t buf,uint32_t options,int * pathlen)13391 fsgetpath_internal(vfs_context_t ctx, int volfs_id, uint64_t objid,
13392 vm_size_t bufsize, caddr_t buf, uint32_t options, int *pathlen)
13393 {
13394 int error;
13395 struct mount *mp = NULL;
13396 vnode_t vp;
13397 int length;
13398 int bpflags;
13399 /* maximum number of times to retry build_path */
13400 unsigned int retries = 0x10;
13401
13402 if (bufsize > FSGETPATH_MAXBUFLEN) {
13403 return EINVAL;
13404 }
13405
13406 if (buf == NULL) {
13407 return ENOMEM;
13408 }
13409
13410 retry:
13411 if ((mp = mount_lookupby_volfsid(volfs_id, 1)) == NULL) {
13412 error = ENOTSUP; /* unexpected failure */
13413 return ENOTSUP;
13414 }
13415
13416 #if CONFIG_UNION_MOUNTS
13417 unionget:
13418 #endif /* CONFIG_UNION_MOUNTS */
13419 if (objid == 2) {
13420 struct vfs_attr vfsattr;
13421 int use_vfs_root = TRUE;
13422
13423 VFSATTR_INIT(&vfsattr);
13424 VFSATTR_WANTED(&vfsattr, f_capabilities);
13425 if (!(options & FSOPT_ISREALFSID) &&
13426 vfs_getattr(mp, &vfsattr, vfs_context_kernel()) == 0 &&
13427 VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
13428 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS) &&
13429 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS)) {
13430 use_vfs_root = FALSE;
13431 }
13432 }
13433
13434 if (use_vfs_root) {
13435 error = VFS_ROOT(mp, &vp, ctx);
13436 } else {
13437 error = VFS_VGET(mp, objid, &vp, ctx);
13438 }
13439 } else {
13440 error = VFS_VGET(mp, (ino64_t)objid, &vp, ctx);
13441 }
13442
13443 #if CONFIG_UNION_MOUNTS
13444 if (error == ENOENT && (mp->mnt_flag & MNT_UNION)) {
13445 /*
13446 * If the fileid isn't found and we're in a union
13447 * mount volume, then see if the fileid is in the
13448 * mounted-on volume.
13449 */
13450 struct mount *tmp = mp;
13451 mp = vnode_mount(tmp->mnt_vnodecovered);
13452 vfs_unbusy(tmp);
13453 if (vfs_busy(mp, LK_NOWAIT) == 0) {
13454 goto unionget;
13455 }
13456 } else {
13457 vfs_unbusy(mp);
13458 }
13459 #else
13460 vfs_unbusy(mp);
13461 #endif /* CONFIG_UNION_MOUNTS */
13462
13463 if (error) {
13464 return error;
13465 }
13466
13467 #if CONFIG_MACF
13468 error = mac_vnode_check_fsgetpath(ctx, vp);
13469 if (error) {
13470 vnode_put(vp);
13471 return error;
13472 }
13473 #endif
13474
13475 /* Obtain the absolute path to this vnode. */
13476 bpflags = vfs_context_suser(ctx) ? BUILDPATH_CHECKACCESS : 0;
13477 if (options & FSOPT_NOFIRMLINKPATH) {
13478 bpflags |= BUILDPATH_NO_FIRMLINK;
13479 }
13480 bpflags |= BUILDPATH_CHECK_MOVED;
13481 error = build_path(vp, buf, (int)bufsize, &length, bpflags, ctx);
13482 vnode_put(vp);
13483
13484 if (error) {
13485 /* there was a race building the path, try a few more times */
13486 if (error == EAGAIN) {
13487 --retries;
13488 if (retries > 0) {
13489 goto retry;
13490 }
13491
13492 error = ENOENT;
13493 }
13494 goto out;
13495 }
13496
13497 AUDIT_ARG(text, buf);
13498
13499 if (kdebug_debugid_enabled(VFS_LOOKUP) && length > 0) {
13500 unsigned long path_words[NUMPARMS];
13501 size_t path_len = sizeof(path_words);
13502
13503 if ((size_t)length < path_len) {
13504 memcpy((char *)path_words, buf, length);
13505 memset((char *)path_words + length, 0, path_len - length);
13506
13507 path_len = length;
13508 } else {
13509 memcpy((char *)path_words, buf + (length - path_len), path_len);
13510 }
13511
13512 kdebug_vfs_lookup(path_words, (int)path_len, vp,
13513 KDBG_VFS_LOOKUP_FLAG_LOOKUP);
13514 }
13515
13516 *pathlen = length; /* may be superseded by error */
13517
13518 out:
13519 return error;
13520 }
13521
13522 /*
13523 * Obtain the full pathname of a file system object by id.
13524 */
13525 static int
fsgetpath_extended(user_addr_t buf,user_size_t bufsize,user_addr_t user_fsid,uint64_t objid,uint32_t options,user_ssize_t * retval)13526 fsgetpath_extended(user_addr_t buf, user_size_t bufsize, user_addr_t user_fsid, uint64_t objid,
13527 uint32_t options, user_ssize_t *retval)
13528 {
13529 vfs_context_t ctx = vfs_context_current();
13530 fsid_t fsid;
13531 char *realpath;
13532 int length;
13533 int error;
13534
13535 if (options & ~(FSOPT_NOFIRMLINKPATH | FSOPT_ISREALFSID)) {
13536 return EINVAL;
13537 }
13538
13539 if ((error = copyin(user_fsid, (caddr_t)&fsid, sizeof(fsid)))) {
13540 return error;
13541 }
13542 AUDIT_ARG(value32, fsid.val[0]);
13543 AUDIT_ARG(value64, objid);
13544 /* Restrict output buffer size for now. */
13545
13546 if (bufsize > FSGETPATH_MAXBUFLEN || bufsize <= 0) {
13547 return EINVAL;
13548 }
13549 realpath = kalloc_data(bufsize, Z_WAITOK | Z_ZERO);
13550 if (realpath == NULL) {
13551 return ENOMEM;
13552 }
13553
13554 error = fsgetpath_internal(ctx, fsid.val[0], objid, bufsize, realpath,
13555 options, &length);
13556
13557 if (error) {
13558 goto out;
13559 }
13560
13561 error = copyout((caddr_t)realpath, buf, length);
13562
13563 *retval = (user_ssize_t)length; /* may be superseded by error */
13564 out:
13565 kfree_data(realpath, bufsize);
13566 return error;
13567 }
13568
/* fsgetpath(2): resolve <fsid, objid> to a path; no extended options. */
int
fsgetpath(__unused proc_t p, struct fsgetpath_args *uap, user_ssize_t *retval)
{
	return fsgetpath_extended(uap->buf, uap->bufsize, uap->fsid, uap->objid,
	           0, retval);
}
13575
/* fsgetpath_ext(2): like fsgetpath(2) but with caller-supplied options. */
int
fsgetpath_ext(__unused proc_t p, struct fsgetpath_ext_args *uap, user_ssize_t *retval)
{
	return fsgetpath_extended(uap->buf, uap->bufsize, uap->fsid, uap->objid,
	           uap->options, retval);
}
13582
/*
 * Common routine to handle various flavors of statfs data heading out
 * to user space.
 *
 * When 'partial_copy' is set, the trailing reserved fields are not
 * copied out (legacy callers expect the shorter layout).
 * If 'sizep' is non-NULL it receives the full (non-partial) struct size.
 *
 * Returns: 0 Success
 * EFAULT
 */
static int
munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
    user_addr_t bufp, int *sizep, boolean_t is_64_bit,
    boolean_t partial_copy)
{
	int error;
	int my_size, copy_size;

	if (is_64_bit) {
		struct user64_statfs sfs;
		my_size = copy_size = sizeof(sfs);
		bzero(&sfs, my_size);
		sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
		sfs.f_type = (short)mp->mnt_vtable->vfc_typenum;
		sfs.f_reserved1 = (short)sfsp->f_fssubtype;
		sfs.f_bsize = (user64_long_t)sfsp->f_bsize;
		sfs.f_iosize = (user64_long_t)sfsp->f_iosize;
		sfs.f_blocks = (user64_long_t)sfsp->f_blocks;
		sfs.f_bfree = (user64_long_t)sfsp->f_bfree;
		sfs.f_bavail = (user64_long_t)sfsp->f_bavail;
		sfs.f_files = (user64_long_t)sfsp->f_files;
		sfs.f_ffree = (user64_long_t)sfsp->f_ffree;
		sfs.f_fsid = sfsp->f_fsid;
		sfs.f_owner = sfsp->f_owner;
		/* the mount may override the reported filesystem type name */
		if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
			strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSNAMELEN);
		} else {
			strlcpy(&sfs.f_fstypename[0], &sfsp->f_fstypename[0], MFSNAMELEN);
		}
		strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
		strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);

		if (partial_copy) {
			/* omit the trailing reserved fields */
			copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
		}
		error = copyout((caddr_t)&sfs, bufp, copy_size);
	} else {
		struct user32_statfs sfs;

		my_size = copy_size = sizeof(sfs);
		bzero(&sfs, my_size);

		sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
		sfs.f_type = (short)mp->mnt_vtable->vfc_typenum;
		sfs.f_reserved1 = (short)sfsp->f_fssubtype;

		/*
		 * It's possible for there to be more than 2^^31 blocks in the filesystem, so we
		 * have to fudge the numbers here in that case. We inflate the blocksize in order
		 * to reflect the filesystem size as best we can.
		 */
		if ((sfsp->f_blocks > INT_MAX)
		    /* Hack for 4061702 . I think the real fix is for Carbon to
		     * look for some volume capability and not depend on hidden
		     * semantics agreed between a FS and carbon.
		     * f_blocks, f_bfree, and f_bavail set to -1 is the trigger
		     * for Carbon to set bNoVolumeSizes volume attribute.
		     * Without this the webdavfs files cannot be copied onto
		     * disk as they look huge. This change should not affect
		     * XSAN as they should not setting these to -1..
		     */
		    && (sfsp->f_blocks != 0xffffffffffffffffULL)
		    && (sfsp->f_bfree != 0xffffffffffffffffULL)
		    && (sfsp->f_bavail != 0xffffffffffffffffULL)) {
			int shift;

			/*
			 * Work out how far we have to shift the block count down to make it fit.
			 * Note that it's possible to have to shift so far that the resulting
			 * blocksize would be unreportably large. At that point, we will clip
			 * any values that don't fit.
			 *
			 * For safety's sake, we also ensure that f_iosize is never reported as
			 * being smaller than f_bsize.
			 */
			for (shift = 0; shift < 32; shift++) {
				if ((sfsp->f_blocks >> shift) <= INT_MAX) {
					break;
				}
				if ((sfsp->f_bsize << (shift + 1)) > INT_MAX) {
					break;
				}
			}
#define __SHIFT_OR_CLIP(x, s) ((((x) >> (s)) > INT_MAX) ? INT_MAX : ((x) >> (s)))
			sfs.f_blocks = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_blocks, shift);
			sfs.f_bfree = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bfree, shift);
			sfs.f_bavail = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bavail, shift);
#undef __SHIFT_OR_CLIP
			sfs.f_bsize = (user32_long_t)(sfsp->f_bsize << shift);
			sfs.f_iosize = (int)lmax(sfsp->f_iosize, sfsp->f_bsize);
		} else {
			/* filesystem is small enough to be reported honestly */
			sfs.f_bsize = (user32_long_t)sfsp->f_bsize;
			sfs.f_iosize = (user32_long_t)sfsp->f_iosize;
			sfs.f_blocks = (user32_long_t)sfsp->f_blocks;
			sfs.f_bfree = (user32_long_t)sfsp->f_bfree;
			sfs.f_bavail = (user32_long_t)sfsp->f_bavail;
		}
		sfs.f_files = (user32_long_t)sfsp->f_files;
		sfs.f_ffree = (user32_long_t)sfsp->f_ffree;
		sfs.f_fsid = sfsp->f_fsid;
		sfs.f_owner = sfsp->f_owner;
		/* the mount may override the reported filesystem type name */
		if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
			strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSNAMELEN);
		} else {
			strlcpy(&sfs.f_fstypename[0], &sfsp->f_fstypename[0], MFSNAMELEN);
		}
		strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
		strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);

		if (partial_copy) {
			/* omit the trailing reserved fields */
			copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
		}
		error = copyout((caddr_t)&sfs, bufp, copy_size);
	}

	if (sizep != NULL) {
		*sizep = my_size;
	}
	return error;
}
13711
/*
 * Copy a kernel 'struct stat' into the 64-bit user-space layout.
 * The destination is zeroed first so unused/padding bytes never
 * leak kernel memory to user space.
 */
void
munge_user64_stat(struct stat *sbp, struct user64_stat *usbp)
{
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
#ifndef _POSIX_C_SOURCE
	/* full timespec layout */
	usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
#else
	/* POSIX-flavored split second/nanosecond fields */
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
13751
/*
 * Copy a kernel 'struct stat' into the 32-bit user-space layout.
 * Time fields are narrowed with explicit casts; destination is
 * zeroed first so padding never leaks kernel memory.
 */
void
munge_user32_stat(struct stat *sbp, struct user32_stat *usbp)
{
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
#ifndef _POSIX_C_SOURCE
	/* full timespec layout; values truncated to 32-bit user types */
	usbp->st_atimespec.tv_sec = (user32_time_t)sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = (user32_long_t)sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = (user32_time_t)sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = (user32_long_t)sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = (user32_time_t)sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = (user32_long_t)sbp->st_ctimespec.tv_nsec;
#else
	/* POSIX-flavored split second/nanosecond fields */
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
13788
/*
 * Copy a kernel 'struct stat64' into the 64-bit user-space layout.
 * Like munge_user64_stat() but also carries the birthtime fields.
 */
void
munge_user64_stat64(struct stat64 *sbp, struct user64_stat64 *usbp)
{
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
#ifndef _POSIX_C_SOURCE
	/* full timespec layout */
	usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
	usbp->st_birthtimespec.tv_sec = sbp->st_birthtimespec.tv_sec;
	usbp->st_birthtimespec.tv_nsec = sbp->st_birthtimespec.tv_nsec;
#else
	/* POSIX-flavored split second/nanosecond fields */
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
	usbp->st_birthtime = sbp->st_birthtime;
	usbp->st_birthtimensec = sbp->st_birthtimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
13832
/*
 * Copy a kernel 'struct stat64' into the 32-bit user-space layout.
 * Time fields are narrowed with explicit casts; destination is
 * zeroed first so padding never leaks kernel memory.
 */
void
munge_user32_stat64(struct stat64 *sbp, struct user32_stat64 *usbp)
{
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
#ifndef _POSIX_C_SOURCE
	/* full timespec layout; values truncated to 32-bit user types */
	usbp->st_atimespec.tv_sec = (user32_time_t)sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = (user32_long_t)sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = (user32_time_t)sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = (user32_long_t)sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = (user32_time_t)sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = (user32_long_t)sbp->st_ctimespec.tv_nsec;
	usbp->st_birthtimespec.tv_sec = (user32_time_t)sbp->st_birthtimespec.tv_sec;
	usbp->st_birthtimespec.tv_nsec = (user32_long_t)sbp->st_birthtimespec.tv_nsec;
#else
	/* POSIX-flavored split second/nanosecond fields */
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
	usbp->st_birthtime = sbp->st_birthtime;
	usbp->st_birthtimensec = sbp->st_birthtimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
13873
/*
 * Per-vnode callback for vfs_purge(): purge buffer cache for
 * simulating cold starts.
 */
static int
vnode_purge_callback(struct vnode *vp, __unused void *cargs)
{
	/* Push all dirty pages to backing store, then invalidate cached pages. */
	ubc_msync(vp, (off_t)0, ubc_getsize(vp), NULL /* off_t *resid_off */, UBC_PUSHALL | UBC_INVALIDATE);

	return VNODE_RETURNED;
}
13884
/* Per-mount callback for vfs_purge(): purge every vnode on this mount. */
static int
vfs_purge_callback(mount_t mp, __unused void * arg)
{
	vnode_iterate(mp, VNODE_WAIT | VNODE_ITERATE_ALL, vnode_purge_callback, NULL);

	return VFS_RETURNED;
}
13892
/*
 * Tunable (boot-arg "vfs_purge_vm_pagers") and sysctl vfs.purge_vm_pagers:
 * when TRUE (the default), vfs_purge() also purges file-backed VM pagers.
 */
static TUNABLE_WRITEABLE(boolean_t, vfs_purge_vm_pagers, "vfs_purge_vm_pagers", TRUE);
SYSCTL_INT(_vfs, OID_AUTO, purge_vm_pagers, CTLFLAG_RW | CTLFLAG_LOCKED, &vfs_purge_vm_pagers, 0, "VFS purge also purges file-backed VM pagers");
13895
13896 int
vfs_purge(__unused struct proc * p,__unused struct vfs_purge_args * uap,__unused int32_t * retval)13897 vfs_purge(__unused struct proc *p, __unused struct vfs_purge_args *uap, __unused int32_t *retval)
13898 {
13899 if (!kauth_cred_issuser(kauth_cred_get())) {
13900 return EPERM;
13901 }
13902
13903 vfs_iterate(0 /* flags */, vfs_purge_callback, NULL);
13904
13905 /* also flush any VM pagers backed by files */
13906 if (vfs_purge_vm_pagers) {
13907 vm_purge_filebacked_pagers();
13908 }
13909
13910 return 0;
13911 }
13912
/*
 * Gets the vnode associated with the (unnamed) snapshot directory
 * for a filesystem, given any vnode 'rvp' on that filesystem.
 * On success the snapshot directory vnode is returned in *sdvpp
 * with an iocount on it.
 */
int
vnode_get_snapdir(vnode_t rvp, vnode_t *sdvpp, vfs_context_t ctx)
{
	return VFS_VGET_SNAPDIR(vnode_mount(rvp), sdvpp, ctx);
}
13923
13924 /*
13925 * Get the snapshot vnode.
13926 *
13927 * If successful, the call returns with an iocount on *rvpp ,*sdvpp and
13928 * needs nameidone() on ndp.
13929 *
13930 * If the snapshot vnode exists it is returned in ndp->ni_vp.
13931 *
13932 * If it returns with an error, *rvpp, *sdvpp are NULL and nameidone() is
13933 * not needed.
13934 */
13935 static int
vnode_get_snapshot(int dirfd,vnode_t * rvpp,vnode_t * sdvpp,user_addr_t name,struct nameidata * ndp,int32_t op,__unused enum path_operation pathop,vfs_context_t ctx)13936 vnode_get_snapshot(int dirfd, vnode_t *rvpp, vnode_t *sdvpp,
13937 user_addr_t name, struct nameidata *ndp, int32_t op,
13938 #if !CONFIG_TRIGGERS
13939 __unused
13940 #endif
13941 enum path_operation pathop,
13942 vfs_context_t ctx)
13943 {
13944 int error, i;
13945 caddr_t name_buf;
13946 size_t name_len;
13947 struct vfs_attr vfa;
13948
13949 *sdvpp = NULLVP;
13950 *rvpp = NULLVP;
13951
13952 error = vnode_getfromfd(ctx, dirfd, rvpp);
13953 if (error) {
13954 return error;
13955 }
13956
13957 if (!vnode_isvroot(*rvpp)) {
13958 error = EINVAL;
13959 goto out;
13960 }
13961
13962 /* Make sure the filesystem supports snapshots */
13963 VFSATTR_INIT(&vfa);
13964 VFSATTR_WANTED(&vfa, f_capabilities);
13965 if ((vfs_getattr(vnode_mount(*rvpp), &vfa, ctx) != 0) ||
13966 !VFSATTR_IS_SUPPORTED(&vfa, f_capabilities) ||
13967 !((vfa.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] &
13968 VOL_CAP_INT_SNAPSHOT)) ||
13969 !((vfa.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] &
13970 VOL_CAP_INT_SNAPSHOT))) {
13971 error = ENOTSUP;
13972 goto out;
13973 }
13974
13975 error = vnode_get_snapdir(*rvpp, sdvpp, ctx);
13976 if (error) {
13977 goto out;
13978 }
13979
13980 name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
13981 error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
13982 if (error) {
13983 goto out1;
13984 }
13985
13986 /*
13987 * Some sanity checks- name can't be empty, "." or ".." or have slashes.
13988 * (the length returned by copyinstr includes the terminating NUL)
13989 */
13990 if ((name_len == 1) || (name_len == 2 && name_buf[0] == '.') ||
13991 (name_len == 3 && name_buf[0] == '.' && name_buf[1] == '.')) {
13992 error = EINVAL;
13993 goto out1;
13994 }
13995 for (i = 0; i < (int)name_len && name_buf[i] != '/'; i++) {
13996 ;
13997 }
13998 if (i < (int)name_len) {
13999 error = EINVAL;
14000 goto out1;
14001 }
14002
14003 #if CONFIG_MACF
14004 if (op == CREATE) {
14005 error = mac_mount_check_snapshot_create(ctx, vnode_mount(*rvpp),
14006 name_buf);
14007 } else if (op == DELETE) {
14008 error = mac_mount_check_snapshot_delete(ctx, vnode_mount(*rvpp),
14009 name_buf);
14010 }
14011 if (error) {
14012 goto out1;
14013 }
14014 #endif
14015
14016 /* Check if the snapshot already exists ... */
14017 NDINIT(ndp, op, pathop, USEDVP | NOCACHE | AUDITVNPATH1,
14018 UIO_SYSSPACE, CAST_USER_ADDR_T(name_buf), ctx);
14019 ndp->ni_dvp = *sdvpp;
14020
14021 error = namei(ndp);
14022 out1:
14023 zfree(ZV_NAMEI, name_buf);
14024 out:
14025 if (error) {
14026 if (*sdvpp) {
14027 vnode_put(*sdvpp);
14028 *sdvpp = NULLVP;
14029 }
14030 if (*rvpp) {
14031 vnode_put(*rvpp);
14032 *rvpp = NULLVP;
14033 }
14034 }
14035 return error;
14036 }
14037
14038 /*
14039 * create a filesystem snapshot (for supporting filesystems)
14040 *
14041 * A much simplified version of openat(dirfd, name, O_CREAT | O_EXCL)
14042 * We get to the (unnamed) snapshot directory vnode and create the vnode
14043 * for the snapshot in it.
14044 *
14045 * Restrictions:
14046 *
14047 * a) Passed in name for snapshot cannot have slashes.
14048 * b) name can't be "." or ".."
14049 *
14050 * Since this requires superuser privileges, vnode_authorize calls are not
14051 * made.
14052 */
static int __attribute__((noinline))
snapshot_create(int dirfd, user_addr_t name, __unused uint32_t flags,
    vfs_context_t ctx)
{
	vnode_t rvp, snapdvp;
	int error;
	struct nameidata *ndp;

	/* nameidata is too large to keep on the kernel stack. */
	ndp = kalloc_type(struct nameidata, Z_WAITOK);

	/*
	 * Resolve dirfd to the volume root (rvp) and snapshot directory
	 * (snapdvp), both with iocounts, and look up 'name' in the latter.
	 * On success we owe nameidone(ndp).
	 */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, ndp, CREATE,
	    OP_LINK, ctx);
	if (error) {
		goto out;
	}

	if (ndp->ni_vp) {
		/* Lookup found an existing snapshot with this name. */
		vnode_put(ndp->ni_vp);
		error = EEXIST;
	} else {
		struct vnode_attr *vap;
		vnode_t vp = NULLVP;

		vap = kalloc_type(struct vnode_attr, Z_WAITOK);

		/* Snapshots are created as mode-0 regular files. */
		VATTR_INIT(vap);
		VATTR_SET(vap, va_type, VREG);
		VATTR_SET(vap, va_mode, 0);

		/*
		 * Skip authorization and ACL/mode inheritance: the caller
		 * was already vetted (see the block comment above).
		 */
		error = vn_create(snapdvp, &vp, ndp, vap,
		    VN_CREATE_NOAUTH | VN_CREATE_NOINHERIT, 0, NULL, ctx);
		if (!error && vp) {
			vnode_put(vp);
		}

		kfree_type(struct vnode_attr, vap);
	}

	nameidone(ndp);
	vnode_put(snapdvp);
	vnode_put(rvp);
out:
	kfree_type(struct nameidata, ndp);

	return error;
}
14099
14100 /*
14101 * Delete a Filesystem snapshot
14102 *
14103 * get the vnode for the unnamed snapshot directory and the snapshot and
14104 * delete the snapshot.
14105 */
static int __attribute__((noinline))
snapshot_delete(int dirfd, user_addr_t name, __unused uint32_t flags,
    vfs_context_t ctx)
{
	vnode_t rvp, snapdvp;
	int error;
	struct nameidata *ndp;

	/* nameidata is too large to keep on the kernel stack. */
	ndp = kalloc_type(struct nameidata, Z_WAITOK);

	/*
	 * Get the volume root, the snapshot directory and the snapshot
	 * itself (ndp->ni_vp), each with an iocount held.
	 */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, ndp, DELETE,
	    OP_UNLINK, ctx);
	if (error) {
		goto out;
	}

	/* Remove the snapshot, suppressing the namespace event. */
	error = VNOP_REMOVE(snapdvp, ndp->ni_vp, &ndp->ni_cnd,
	    VNODE_REMOVE_SKIP_NAMESPACE_EVENT, ctx);

	vnode_put(ndp->ni_vp);
	nameidone(ndp);
	vnode_put(snapdvp);
	vnode_put(rvp);
out:
	kfree_type(struct nameidata, ndp);

	return error;
}
14134
14135 /*
14136 * Revert a filesystem to a snapshot
14137 *
14138 * Marks the filesystem to revert to the given snapshot on next mount.
14139 */
14140 static int __attribute__((noinline))
snapshot_revert(int dirfd,user_addr_t name,__unused uint32_t flags,vfs_context_t ctx)14141 snapshot_revert(int dirfd, user_addr_t name, __unused uint32_t flags,
14142 vfs_context_t ctx)
14143 {
14144 int error;
14145 vnode_t rvp;
14146 mount_t mp;
14147 struct fs_snapshot_revert_args revert_data;
14148 struct componentname cnp;
14149 caddr_t name_buf;
14150 size_t name_len;
14151
14152 error = vnode_getfromfd(ctx, dirfd, &rvp);
14153 if (error) {
14154 return error;
14155 }
14156 mp = vnode_mount(rvp);
14157
14158 name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
14159 error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
14160 if (error) {
14161 zfree(ZV_NAMEI, name_buf);
14162 vnode_put(rvp);
14163 return error;
14164 }
14165
14166 #if CONFIG_MACF
14167 error = mac_mount_check_snapshot_revert(ctx, mp, name_buf);
14168 if (error) {
14169 zfree(ZV_NAMEI, name_buf);
14170 vnode_put(rvp);
14171 return error;
14172 }
14173 #endif
14174
14175 /*
14176 * Grab mount_iterref so that we can release the vnode,
14177 * since VFSIOC_REVERT_SNAPSHOT could conceivably cause a sync.
14178 */
14179 error = mount_iterref(mp, 0);
14180 vnode_put(rvp);
14181 if (error) {
14182 zfree(ZV_NAMEI, name_buf);
14183 return error;
14184 }
14185
14186 memset(&cnp, 0, sizeof(cnp));
14187 cnp.cn_pnbuf = (char *)name_buf;
14188 cnp.cn_nameiop = LOOKUP;
14189 cnp.cn_flags = ISLASTCN | HASBUF;
14190 cnp.cn_pnlen = MAXPATHLEN;
14191 cnp.cn_nameptr = cnp.cn_pnbuf;
14192 cnp.cn_namelen = (int)name_len;
14193 revert_data.sr_cnp = &cnp;
14194
14195 error = VFS_IOCTL(mp, VFSIOC_REVERT_SNAPSHOT, (caddr_t)&revert_data, 0, ctx);
14196 mount_iterdrop(mp);
14197 zfree(ZV_NAMEI, name_buf);
14198
14199 if (error) {
14200 /* If there was any error, try again using VNOP_IOCTL */
14201
14202 vnode_t snapdvp;
14203 struct nameidata namend;
14204
14205 error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, &namend, LOOKUP,
14206 OP_LOOKUP, ctx);
14207 if (error) {
14208 return error;
14209 }
14210
14211
14212 error = VNOP_IOCTL(namend.ni_vp, APFSIOC_REVERT_TO_SNAPSHOT, (caddr_t) NULL,
14213 0, ctx);
14214
14215 vnode_put(namend.ni_vp);
14216 nameidone(&namend);
14217 vnode_put(snapdvp);
14218 vnode_put(rvp);
14219 }
14220
14221 return error;
14222 }
14223
14224 /*
14225 * rename a Filesystem snapshot
14226 *
14227 * get the vnode for the unnamed snapshot directory and the snapshot and
14228 * rename the snapshot. This is a very specialised (and simple) case of
14229 * rename(2) (which has to deal with a lot more complications). It differs
14230 * slightly from rename(2) in that EEXIST is returned if the new name exists.
14231 */
static int __attribute__((noinline))
snapshot_rename(int dirfd, user_addr_t old, user_addr_t new,
    __unused uint32_t flags, vfs_context_t ctx)
{
	vnode_t rvp, snapdvp;
	int error, i;
	caddr_t newname_buf;
	size_t name_len;
	vnode_t fvp;
	struct nameidata *fromnd, *tond;
	/* carving out a chunk for structs that are too big to be on stack. */
	struct {
		struct nameidata from_node;
		struct nameidata to_node;
	} * __rename_data;

	__rename_data = kalloc_type(typeof(*__rename_data), Z_WAITOK);
	fromnd = &__rename_data->from_node;
	tond = &__rename_data->to_node;

	/*
	 * Look up the existing snapshot (DELETE op: the old name goes away).
	 * Returns rvp, snapdvp and fromnd->ni_vp with iocounts held.
	 */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, old, fromnd, DELETE,
	    OP_UNLINK, ctx);
	if (error) {
		goto out;
	}
	fvp = fromnd->ni_vp;

	/* Copy in the destination name; name_len includes the NUL. */
	newname_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	error = copyinstr(new, newname_buf, MAXPATHLEN, &name_len);
	if (error) {
		goto out1;
	}

	/*
	 * Some sanity checks- new name can't be empty, "." or ".." or have
	 * slashes.
	 * (the length returned by copyinstr includes the terminating NUL)
	 *
	 * The FS rename VNOP is suppossed to handle this but we'll pick it
	 * off here itself.
	 */
	if ((name_len == 1) || (name_len == 2 && newname_buf[0] == '.') ||
	    (name_len == 3 && newname_buf[0] == '.' && newname_buf[1] == '.')) {
		error = EINVAL;
		goto out1;
	}
	/* Scan for a '/'; reject the new name if one is present. */
	for (i = 0; i < (int)name_len && newname_buf[i] != '/'; i++) {
		;
	}
	if (i < (int)name_len) {
		error = EINVAL;
		goto out1;
	}

#if CONFIG_MACF
	/* The new name is checked with the snapshot-create policy hook. */
	error = mac_mount_check_snapshot_create(ctx, vnode_mount(rvp),
	    newname_buf);
	if (error) {
		goto out1;
	}
#endif

	/* Look up the destination name in the snapshot directory. */
	NDINIT(tond, RENAME, OP_RENAME, USEDVP | NOCACHE | AUDITVNPATH2,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(newname_buf), ctx);
	tond->ni_dvp = snapdvp;

	error = namei(tond);
	if (error) {
		goto out2;
	} else if (tond->ni_vp) {
		/*
		 * snapshot rename behaves differently than rename(2) - if the
		 * new name exists, EEXIST is returned.
		 */
		vnode_put(tond->ni_vp);
		error = EEXIST;
		goto out2;
	}

	/* Both names live in snapdvp, so source and target dvp are the same. */
	error = VNOP_RENAME(snapdvp, fvp, &fromnd->ni_cnd, snapdvp, NULLVP,
	    &tond->ni_cnd, ctx);

out2:
	nameidone(tond);
out1:
	zfree(ZV_NAMEI, newname_buf);
	vnode_put(fvp);
	vnode_put(snapdvp);
	vnode_put(rvp);
	nameidone(fromnd);
out:
	kfree_type(typeof(*__rename_data), __rename_data);
	return error;
}
14326
14327 /*
14328 * Mount a Filesystem snapshot
14329 *
14330 * get the vnode for the unnamed snapshot directory and the snapshot and
14331 * mount the snapshot.
14332 */
14333 static int __attribute__((noinline))
snapshot_mount(int dirfd,user_addr_t name,user_addr_t directory,__unused user_addr_t mnt_data,__unused uint32_t flags,vfs_context_t ctx)14334 snapshot_mount(int dirfd, user_addr_t name, user_addr_t directory,
14335 __unused user_addr_t mnt_data, __unused uint32_t flags, vfs_context_t ctx)
14336 {
14337 mount_t mp;
14338 vnode_t rvp, snapdvp, snapvp, vp, pvp;
14339 struct fs_snapshot_mount_args smnt_data;
14340 int error;
14341 struct nameidata *snapndp, *dirndp;
14342 /* carving out a chunk for structs that are too big to be on stack. */
14343 struct {
14344 struct nameidata snapnd;
14345 struct nameidata dirnd;
14346 } * __snapshot_mount_data;
14347
14348 __snapshot_mount_data = kalloc_type(typeof(*__snapshot_mount_data), Z_WAITOK);
14349 snapndp = &__snapshot_mount_data->snapnd;
14350 dirndp = &__snapshot_mount_data->dirnd;
14351
14352 error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, snapndp, LOOKUP,
14353 OP_LOOKUP, ctx);
14354 if (error) {
14355 goto out;
14356 }
14357
14358 snapvp = snapndp->ni_vp;
14359 if (!vnode_mount(rvp) || (vnode_mount(rvp) == dead_mountp)) {
14360 error = EIO;
14361 goto out1;
14362 }
14363
14364 /* Get the vnode to be covered */
14365 NDINIT(dirndp, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
14366 UIO_USERSPACE, directory, ctx);
14367 error = namei(dirndp);
14368 if (error) {
14369 goto out1;
14370 }
14371
14372 vp = dirndp->ni_vp;
14373 pvp = dirndp->ni_dvp;
14374 mp = vnode_mount(rvp);
14375
14376 if ((vp->v_flag & VROOT) && (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
14377 error = EINVAL;
14378 goto out2;
14379 }
14380
14381 #if CONFIG_MACF
14382 error = mac_mount_check_snapshot_mount(ctx, rvp, vp, &dirndp->ni_cnd, snapndp->ni_cnd.cn_nameptr,
14383 mp->mnt_vfsstat.f_fstypename);
14384 if (error) {
14385 goto out2;
14386 }
14387 #endif
14388
14389 smnt_data.sm_mp = mp;
14390 smnt_data.sm_cnp = &snapndp->ni_cnd;
14391 error = mount_common(mp->mnt_vfsstat.f_fstypename, pvp, vp,
14392 &dirndp->ni_cnd, CAST_USER_ADDR_T(&smnt_data), flags & MNT_DONTBROWSE,
14393 KERNEL_MOUNT_SNAPSHOT, NULL, ctx);
14394
14395 out2:
14396 vnode_put(vp);
14397 vnode_put(pvp);
14398 nameidone(dirndp);
14399 out1:
14400 vnode_put(snapvp);
14401 vnode_put(snapdvp);
14402 vnode_put(rvp);
14403 nameidone(snapndp);
14404 out:
14405 kfree_type(typeof(*__snapshot_mount_data), __snapshot_mount_data);
14406 return error;
14407 }
14408
14409 /*
14410 * Root from a snapshot of the filesystem
14411 *
14412 * Marks the filesystem to root from the given snapshot on next boot.
14413 */
14414 static int __attribute__((noinline))
snapshot_root(int dirfd,user_addr_t name,__unused uint32_t flags,vfs_context_t ctx)14415 snapshot_root(int dirfd, user_addr_t name, __unused uint32_t flags,
14416 vfs_context_t ctx)
14417 {
14418 int error;
14419 vnode_t rvp;
14420 mount_t mp;
14421 struct fs_snapshot_root_args root_data;
14422 struct componentname cnp;
14423 caddr_t name_buf;
14424 size_t name_len;
14425
14426 error = vnode_getfromfd(ctx, dirfd, &rvp);
14427 if (error) {
14428 return error;
14429 }
14430 mp = vnode_mount(rvp);
14431
14432 name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
14433 error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
14434 if (error) {
14435 zfree(ZV_NAMEI, name_buf);
14436 vnode_put(rvp);
14437 return error;
14438 }
14439
14440 // XXX MAC checks ?
14441
14442 /*
14443 * Grab mount_iterref so that we can release the vnode,
14444 * since VFSIOC_ROOT_SNAPSHOT could conceivably cause a sync.
14445 */
14446 error = mount_iterref(mp, 0);
14447 vnode_put(rvp);
14448 if (error) {
14449 zfree(ZV_NAMEI, name_buf);
14450 return error;
14451 }
14452
14453 memset(&cnp, 0, sizeof(cnp));
14454 cnp.cn_pnbuf = (char *)name_buf;
14455 cnp.cn_nameiop = LOOKUP;
14456 cnp.cn_flags = ISLASTCN | HASBUF;
14457 cnp.cn_pnlen = MAXPATHLEN;
14458 cnp.cn_nameptr = cnp.cn_pnbuf;
14459 cnp.cn_namelen = (int)name_len;
14460 root_data.sr_cnp = &cnp;
14461
14462 error = VFS_IOCTL(mp, VFSIOC_ROOT_SNAPSHOT, (caddr_t)&root_data, 0, ctx);
14463
14464 mount_iterdrop(mp);
14465 zfree(ZV_NAMEI, name_buf);
14466
14467 return error;
14468 }
14469
14470 static boolean_t
vfs_context_can_snapshot(vfs_context_t ctx)14471 vfs_context_can_snapshot(vfs_context_t ctx)
14472 {
14473 static const char * const snapshot_entitlements[] = {
14474 "com.apple.private.vfs.snapshot",
14475 "com.apple.developer.vfs.snapshot",
14476 "com.apple.private.apfs.arv.limited.snapshot",
14477 };
14478 static const size_t nentitlements =
14479 sizeof(snapshot_entitlements) / sizeof(snapshot_entitlements[0]);
14480 size_t i;
14481
14482 task_t task = vfs_context_task(ctx);
14483 for (i = 0; i < nentitlements; i++) {
14484 if (IOTaskHasEntitlement(task, snapshot_entitlements[i])) {
14485 return TRUE;
14486 }
14487 }
14488 return FALSE;
14489 }
14490
14491 /*
14492 * FS snapshot operations dispatcher
14493 */
14494 int
fs_snapshot(__unused proc_t p,struct fs_snapshot_args * uap,__unused int32_t * retval)14495 fs_snapshot(__unused proc_t p, struct fs_snapshot_args *uap,
14496 __unused int32_t *retval)
14497 {
14498 int error;
14499 vfs_context_t ctx = vfs_context_current();
14500
14501 AUDIT_ARG(fd, uap->dirfd);
14502 AUDIT_ARG(value32, uap->op);
14503
14504 if (!vfs_context_can_snapshot(ctx)) {
14505 return EPERM;
14506 }
14507
14508 /*
14509 * Enforce user authorization for snapshot modification operations,
14510 * or if trying to root from snapshot.
14511 */
14512 if (uap->op != SNAPSHOT_OP_MOUNT) {
14513 vnode_t dvp = NULLVP;
14514 vnode_t devvp = NULLVP;
14515 mount_t mp;
14516
14517 error = vnode_getfromfd(ctx, uap->dirfd, &dvp);
14518 if (error) {
14519 return error;
14520 }
14521 mp = vnode_mount(dvp);
14522 devvp = mp->mnt_devvp;
14523
14524 /* get an iocount on devvp */
14525 if (devvp == NULLVP) {
14526 error = vnode_lookup(mp->mnt_vfsstat.f_mntfromname, 0, &devvp, ctx);
14527 /* for mounts which arent block devices */
14528 if (error == ENOENT) {
14529 error = ENXIO;
14530 }
14531 } else {
14532 error = vnode_getwithref(devvp);
14533 }
14534
14535 if (error) {
14536 vnode_put(dvp);
14537 return error;
14538 }
14539
14540 if ((vfs_context_issuser(ctx) == 0) &&
14541 (vnode_authorize(devvp, NULL, KAUTH_VNODE_WRITE_DATA, ctx) != 0) &&
14542 (!IOTaskHasEntitlement(vfs_context_task(ctx), "com.apple.private.vfs.snapshot.user"))) {
14543 error = EPERM;
14544 }
14545 vnode_put(dvp);
14546 vnode_put(devvp);
14547
14548 if (error) {
14549 return error;
14550 }
14551 }
14552
14553 switch (uap->op) {
14554 case SNAPSHOT_OP_CREATE:
14555 error = snapshot_create(uap->dirfd, uap->name1, uap->flags, ctx);
14556 break;
14557 case SNAPSHOT_OP_DELETE:
14558 error = snapshot_delete(uap->dirfd, uap->name1, uap->flags, ctx);
14559 break;
14560 case SNAPSHOT_OP_RENAME:
14561 error = snapshot_rename(uap->dirfd, uap->name1, uap->name2,
14562 uap->flags, ctx);
14563 break;
14564 case SNAPSHOT_OP_MOUNT:
14565 error = snapshot_mount(uap->dirfd, uap->name1, uap->name2,
14566 uap->data, uap->flags, ctx);
14567 break;
14568 case SNAPSHOT_OP_REVERT:
14569 error = snapshot_revert(uap->dirfd, uap->name1, uap->flags, ctx);
14570 break;
14571 #if CONFIG_MNT_ROOTSNAP
14572 case SNAPSHOT_OP_ROOT:
14573 error = snapshot_root(uap->dirfd, uap->name1, uap->flags, ctx);
14574 break;
14575 #endif /* CONFIG_MNT_ROOTSNAP */
14576 default:
14577 error = ENOSYS;
14578 }
14579
14580 return error;
14581 }
14582