1 /*
2 * Copyright (c) 1995-2022 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * Copyright (c) 1989, 1993
30 * The Regents of the University of California. All rights reserved.
31 * (c) UNIX System Laboratories, Inc.
32 * All or some portions of this file are derived from material licensed
33 * to the University of California by American Telephone and Telegraph
34 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
35 * the permission of UNIX System Laboratories, Inc.
36 *
37 * Redistribution and use in source and binary forms, with or without
38 * modification, are permitted provided that the following conditions
39 * are met:
40 * 1. Redistributions of source code must retain the above copyright
41 * notice, this list of conditions and the following disclaimer.
42 * 2. Redistributions in binary form must reproduce the above copyright
43 * notice, this list of conditions and the following disclaimer in the
44 * documentation and/or other materials provided with the distribution.
45 * 3. All advertising materials mentioning features or use of this software
46 * must display the following acknowledgement:
47 * This product includes software developed by the University of
48 * California, Berkeley and its contributors.
49 * 4. Neither the name of the University nor the names of its contributors
50 * may be used to endorse or promote products derived from this software
51 * without specific prior written permission.
52 *
53 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63 * SUCH DAMAGE.
64 *
65 * @(#)vfs_syscalls.c 8.41 (Berkeley) 6/15/95
66 */
67 /*
68 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
69 * support for mandatory and extensible security protections. This notice
70 * is included in support of clause 2.2 (b) of the Apple Public License,
71 * Version 2.0.
72 */
73
74 #include <sys/param.h>
75 #include <sys/systm.h>
76 #include <sys/syslimits.h> /* For MAXLONGPATHLEN */
77 #include <sys/namei.h>
78 #include <sys/filedesc.h>
79 #include <sys/kernel.h>
80 #include <sys/file_internal.h>
81 #include <sys/stat.h>
82 #include <sys/vnode_internal.h>
83 #include <sys/mount_internal.h>
84 #include <sys/proc_internal.h>
85 #include <sys/kauth.h>
86 #include <sys/uio_internal.h>
87 #include <kern/kalloc.h>
88 #include <sys/mman.h>
89 #include <sys/dirent.h>
90 #include <sys/attr.h>
91 #include <sys/sysctl.h>
92 #include <sys/ubc.h>
93 #include <sys/quota.h>
94 #include <sys/kdebug.h>
95 #include <sys/fsevents.h>
96 #include <sys/imgsrc.h>
97 #include <sys/sysproto.h>
98 #include <sys/sysctl.h>
99 #include <sys/xattr.h>
100 #include <sys/fcntl.h>
101 #include <sys/stdio.h>
102 #include <sys/fsctl.h>
103 #include <sys/ubc_internal.h>
104 #include <sys/disk.h>
105 #include <sys/content_protection.h>
106 #include <sys/clonefile.h>
107 #include <sys/snapshot.h>
108 #include <sys/priv.h>
109 #include <sys/fsgetpath.h>
110 #include <machine/cons.h>
111 #include <machine/limits.h>
112 #include <miscfs/specfs/specdev.h>
113
114 #include <vfs/vfs_disk_conditioner.h>
115 #if CONFIG_EXCLAVES
116 #include <vfs/vfs_exclave_fs.h>
117 #endif
118
119 #include <security/audit/audit.h>
120 #include <bsm/audit_kevents.h>
121
122 #include <mach/mach_types.h>
123 #include <kern/exc_guard.h>
124 #include <kern/kern_types.h>
125 #include <kern/kalloc.h>
126 #include <kern/task.h>
127
128 #include <vm/vm_pageout.h>
129 #include <vm/vm_protos.h>
130 #include <vm/memory_object_xnu.h>
131
132 #include <libkern/OSAtomic.h>
133 #include <os/atomic_private.h>
134 #include <pexpert/pexpert.h>
135 #include <IOKit/IOBSD.h>
136
137 // deps for MIG call
138 #include <kern/host.h>
139 #include <kern/ipc_misc.h>
140 #include <mach/host_priv.h>
141 #include <mach/vfs_nspace.h>
142 #include <os/log.h>
143
144 #include <nfs/nfs_conf.h>
145
146 #if ROUTEFS
147 #include <miscfs/routefs/routefs.h>
148 #endif /* ROUTEFS */
149
150 #if CONFIG_MACF
151 #include <security/mac.h>
152 #include <security/mac_framework.h>
153 #endif
154
155 #if CONFIG_FSE
156 #define GET_PATH(x) \
157 ((x) = get_pathbuff())
158 #define RELEASE_PATH(x) \
159 release_pathbuff(x)
160 #else
161 #define GET_PATH(x) \
162 ((x) = zalloc(ZV_NAMEI))
163 #define RELEASE_PATH(x) \
164 zfree(ZV_NAMEI, x)
165 #endif /* CONFIG_FSE */
166
167 #ifndef HFS_GET_BOOT_INFO
168 #define HFS_GET_BOOT_INFO (FCNTL_FS_SPECIFIC_BASE + 0x00004)
169 #endif
170
171 #ifndef HFS_SET_BOOT_INFO
172 #define HFS_SET_BOOT_INFO (FCNTL_FS_SPECIFIC_BASE + 0x00005)
173 #endif
174
175 #ifndef APFSIOC_REVERT_TO_SNAPSHOT
176 #define APFSIOC_REVERT_TO_SNAPSHOT _IOW('J', 1, u_int64_t)
177 #endif
178
179 extern void disk_conditioner_unmount(mount_t mp);
180
181 /* struct for checkdirs iteration */
182 struct cdirargs {
183 vnode_t olddp;
184 vnode_t newdp;
185 };
186 /* callback for checkdirs iteration */
187 static int checkdirs_callback(proc_t p, void * arg);
188
189 static int change_dir(struct nameidata *ndp, vfs_context_t ctx);
190 static int checkdirs(vnode_t olddp, vfs_context_t ctx);
191 void enablequotas(struct mount *mp, vfs_context_t ctx);
192 static int getfsstat_callback(mount_t mp, void * arg);
193 static int getutimes(user_addr_t usrtvp, struct timespec *tsp);
194 static int setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts, int nullflag);
195 static int sync_callback(mount_t, void *);
196 static int munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
197 user_addr_t bufp, int *sizep, boolean_t is_64_bit,
198 boolean_t partial_copy);
199 static int fsync_common(proc_t p, struct fsync_args *uap, int flags);
200 static int mount_common(const char *fstypename, vnode_t pvp, vnode_t vp,
201 struct componentname *cnp, user_addr_t fsmountargs,
202 int flags, uint32_t internal_flags, char *labelstr, vfs_context_t ctx);
203 void vfs_notify_mount(vnode_t pdvp);
204
205 int prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, uint32_t internal_flags);
206
207 struct fd_vn_data * fg_vn_data_alloc(void);
208
209 /*
210 * Max retries for ENOENT returns from vn_authorize_{rmdir, unlink, rename}
211 * Concurrent lookups (or lookups by ids) on hard links can cause the
212 * vn_getpath (which does not re-enter the filesystem as vn_getpath_fsenter
213 * does) to return ENOENT as the path cannot be returned from the name cache
214 * alone. We have no option but to retry and hope to get one namei->reverse path
215 * generation done without an intervening lookup, lookup by id on the hard link
216 * item. This is only an issue for MAC hooks which cannot reenter the filesystem
217 * which currently are the MAC hooks for rename, unlink and rmdir.
218 */
219 #define MAX_AUTHORIZE_ENOENT_RETRIES 1024
220
221 /* Max retry limit for rename due to vnode recycling. */
222 #define MAX_RENAME_ERECYCLE_RETRIES 1024
223
224 #define MAX_LINK_ENOENT_RETRIES 1024
225
226 /* Max retries for concurrent mounts on the same covered vnode. */
227 #define MAX_MOUNT_RETRIES 10
228
229 static int rmdirat_internal(vfs_context_t, int, user_addr_t, enum uio_seg,
230 int unlink_flags);
231
232 #ifdef CONFIG_IMGSRC_ACCESS
233 static int authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx);
234 static int place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx);
235 static void undo_place_on_covered_vp(mount_t mp, vnode_t vp);
236 static int mount_begin_update(mount_t mp, vfs_context_t ctx, int flags);
237 static void mount_end_update(mount_t mp);
238 static int relocate_imageboot_source(vnode_t pvp, vnode_t vp, struct componentname *cnp, const char *fsname, vfs_context_t ctx, boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index);
239 #endif /* CONFIG_IMGSRC_ACCESS */
240
241 //snapshot functions
242 #if CONFIG_MNT_ROOTSNAP
243 static int __attribute__ ((noinline)) snapshot_root(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx);
244 #else
245 static int __attribute__ ((noinline)) snapshot_root(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx) __attribute__((unused));
246 #endif
247
248 __private_extern__
249 int sync_internal(void);
250
251 __private_extern__
252 int unlink1(vfs_context_t, vnode_t, user_addr_t, enum uio_seg, int);
253
254 static LCK_GRP_DECLARE(fd_vn_lck_grp, "fd_vnode_data");
255 static LCK_ATTR_DECLARE(fd_vn_lck_attr, 0, 0);
256
257 /* vars for sync mutex */
258 static LCK_GRP_DECLARE(sync_mtx_lck_grp, "sync thread");
259 static LCK_MTX_DECLARE(sync_mtx_lck, &sync_mtx_lck_grp);
260
261 extern lck_rw_t rootvnode_rw_lock;
262
263 VFS_SMR_DECLARE;
264 extern uint32_t nc_smr_enabled;
265
266 /*
267 * incremented each time a mount or unmount operation occurs
268 * used to invalidate the cached value of the rootvp in the
269 * mount structure utilized by cache_lookup_path
270 */
271 uint32_t mount_generation = 0;
272
273 /* counts number of mount and unmount operations */
274 unsigned int vfs_nummntops = 0;
275
276 /* system-wide, per-boot unique mount ID */
277 static _Atomic uint64_t mount_unique_id = 1;
278
279 extern const struct fileops vnops;
280 #if CONFIG_APPLEDOUBLE
281 extern errno_t rmdir_remove_orphaned_appleDouble(vnode_t, vfs_context_t, int *);
282 #endif /* CONFIG_APPLEDOUBLE */
283
284
285 /*
286 * Virtual File System System Calls
287 */
288
289 /*
290 * Private in-kernel mounting spi (specific use-cases only)
291 */
292 boolean_t
vfs_iskernelmount(mount_t mp)293 vfs_iskernelmount(mount_t mp)
294 {
295 return (mp->mnt_kern_flag & MNTK_KERNEL_MOUNT) ? TRUE : FALSE;
296 }
297
/*
 * kernel_mount:
 *	In-kernel mount entry point (specific use-cases only).
 *
 * Parameters:
 *	fstype		filesystem type name, handed to mount_common()
 *	pvp, vp		parent and mount-on vnodes; if vp is NULLVP, both are
 *			looked up here from `path` (a kernel-space string)
 *	path		kernel-space path of the mount-on directory
 *	data		fs-specific mount arguments (opaque, passed through)
 *	datalen		unused
 *	syscall_flags	MNT_* flags as for mount(2)
 *	kern_flags	KERNEL_MOUNT_* flags; sanitized against
 *			KERNEL_MOUNT_SANITIZE_MASK and tagged with
 *			KERNEL_MOUNT_KMOUNT before use
 *
 * Returns: 0 on success, otherwise an errno from namei() or mount_common().
 *
 * Iocounts taken by the internal namei() are dropped before return;
 * caller-supplied vnodes (vp != NULLVP case) are left untouched.
 */
__private_extern__
int
kernel_mount(const char *fstype, vnode_t pvp, vnode_t vp, const char *path,
    void *data, __unused size_t datalen, int syscall_flags, uint32_t kern_flags,
    vfs_context_t ctx)
{
	struct nameidata nd;
	boolean_t did_namei;
	int error;

	NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(path), ctx);
	if (syscall_flags & MNT_NOFOLLOW) {
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}

	/* Strip kernel-mount flags callers are not permitted to set. */
	kern_flags &= KERNEL_MOUNT_SANITIZE_MASK;

	/*
	 * Get the vnode to be covered if it's not supplied
	 */
	if (vp == NULLVP) {
		error = namei(&nd);
		if (error) {
			if (kern_flags & (KERNEL_MOUNT_SNAPSHOT | KERNEL_MOUNT_VOLBYROLE_MASK)) {
				printf("failed to locate mount-on path: %s ", path);
			}
			return error;
		}
		vp = nd.ni_vp;
		pvp = nd.ni_dvp;
		did_namei = TRUE;
	} else {
		/*
		 * Caller supplied the vnodes; fake up just enough of the
		 * componentname (pathname buffer/length) for mount_common().
		 */
		char *pnbuf = CAST_DOWN(char *, path);

		nd.ni_cnd.cn_pnbuf = pnbuf;
		nd.ni_cnd.cn_pnlen = (int)(strlen(pnbuf) + 1);
		did_namei = FALSE;
	}

	kern_flags |= KERNEL_MOUNT_KMOUNT;
	error = mount_common(fstype, pvp, vp, &nd.ni_cnd, CAST_USER_ADDR_T(data),
	    syscall_flags, kern_flags, NULL, ctx);

	/* Drop the iocounts (vp and dvp) obtained by our namei() above. */
	if (did_namei) {
		vnode_put(vp);
		vnode_put(pvp);
		nameidone(&nd);
	}

	return error;
}
350
351 int
vfs_mount_at_path(const char * fstype,const char * path,vnode_t pvp,vnode_t vp,void * data,size_t datalen,int mnt_flags,int flags)352 vfs_mount_at_path(const char *fstype, const char *path,
353 vnode_t pvp, vnode_t vp, void *data, size_t datalen,
354 int mnt_flags, int flags)
355 {
356 int syscall_flags = MNT_AUTOMOUNTED | mnt_flags;
357 int error, km_flags = 0;
358 vfs_context_t ctx = (flags & VFS_MOUNT_FLAG_CURRENT_CONTEXT) ? vfs_context_current() : vfs_context_kernel();
359
360 /*
361 * This call is currently restricted to specific use cases.
362 */
363 if ((strcmp(fstype, "lifs") != 0) && (strcmp(fstype, "nfs") != 0)) {
364 return ENOTSUP;
365 }
366
367 #if !defined(XNU_TARGET_OS_OSX)
368 if (strcmp(fstype, "lifs") == 0) {
369 syscall_flags |= MNT_NOEXEC;
370 }
371 #endif
372
373 if (flags & VFS_MOUNT_FLAG_NOAUTH) {
374 km_flags |= KERNEL_MOUNT_NOAUTH;
375 }
376 if (flags & VFS_MOUNT_FLAG_PERMIT_UNMOUNT) {
377 km_flags |= KERNEL_MOUNT_PERMIT_UNMOUNT;
378 }
379
380 error = kernel_mount(fstype, pvp, vp, path, data, datalen,
381 syscall_flags, km_flags, ctx);
382 if (error) {
383 printf("%s: mount on %s failed, error %d\n", __func__, path,
384 error);
385 }
386
387 return error;
388 }
389
390 /*
391 * Mount a file system.
392 */
393 /* ARGSUSED */
394 int
mount(proc_t p,struct mount_args * uap,__unused int32_t * retval)395 mount(proc_t p, struct mount_args *uap, __unused int32_t *retval)
396 {
397 struct __mac_mount_args muap;
398
399 muap.type = uap->type;
400 muap.path = uap->path;
401 muap.flags = uap->flags;
402 muap.data = uap->data;
403 muap.mac_p = USER_ADDR_NULL;
404 return __mac_mount(p, &muap, retval);
405 }
406
/*
 * fmount: mount a filesystem over the directory referenced by a file
 * descriptor (fmount(2)).
 *
 * The mount-on vnode comes from uap->fd; its parent vnode and its path
 * are reconstructed in-kernel (vnode_getparent()/vn_getpath()) before
 * delegating to mount_common() with KERNEL_MOUNT_FMOUNT.
 *
 * MNT_IMGSRC_BY_INDEX and MNT_ROOTFS are rejected with ENOTSUP;
 * MNT_UNION with EPERM.
 *
 * Returns: 0 on success, otherwise an errno.
 */
int
fmount(__unused proc_t p, struct fmount_args *uap, __unused int32_t *retval)
{
	struct componentname cn;
	vfs_context_t ctx = vfs_context_current();
	size_t dummy = 0;
	int error;
	int flags = uap->flags;
	char fstypename[MFSNAMELEN];
	char *labelstr = NULL; /* regular mount call always sets it to NULL for __mac_mount() */
	vnode_t pvp;
	vnode_t vp;

	AUDIT_ARG(fd, uap->fd);
	AUDIT_ARG(fflags, flags);
	/* fstypename will get audited by mount_common */

	/* Sanity check the flags */
	if (flags & (MNT_IMGSRC_BY_INDEX | MNT_ROOTFS)) {
		return ENOTSUP;
	}

	if (flags & MNT_UNION) {
		return EPERM;
	}

	error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy);
	if (error) {
		return error;
	}

	/* Translate the fd to a vnode; file_drop() must balance this on all paths. */
	if ((error = file_vnode(uap->fd, &vp)) != 0) {
		return error;
	}

	if ((error = vnode_getwithref(vp)) != 0) {
		file_drop(uap->fd);
		return error;
	}

	/*
	 * A parent vnode is required by mount_common(); a vnode with no
	 * parent is either already a mount point / root (EBUSY) or
	 * otherwise unusable as a mount-on point (EINVAL).
	 */
	pvp = vnode_getparent(vp);
	if (pvp == NULL) {
		if (vp->v_mountedhere || (vp->v_flag & VROOT) != 0) {
			error = EBUSY;
		} else {
			error = EINVAL;
		}
		vnode_put(vp);
		file_drop(uap->fd);
		return error;
	}

	/* Build a componentname holding the mount-on path for mount_common(). */
	memset(&cn, 0, sizeof(struct componentname));
	cn.cn_pnbuf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	cn.cn_pnlen = MAXPATHLEN;

	if ((error = vn_getpath(vp, cn.cn_pnbuf, &cn.cn_pnlen)) != 0) {
		zfree(ZV_NAMEI, cn.cn_pnbuf);
		vnode_put(pvp);
		vnode_put(vp);
		file_drop(uap->fd);
		return error;
	}

	error = mount_common(fstypename, pvp, vp, &cn, uap->data, flags, KERNEL_MOUNT_FMOUNT, labelstr, ctx);

	/* Release the pathname buffer, both vnode iocounts, and the fd reference. */
	zfree(ZV_NAMEI, cn.cn_pnbuf);
	vnode_put(pvp);
	vnode_put(vp);
	file_drop(uap->fd);

	return error;
}
480
481 #define MAX_GRAFT_METADATA_SIZE 16384 /* bytes */
482
483 /*
484 * Get the size of a graft file (a manifest or payload file).
485 * The vp should be an iocounted vnode.
486 */
487 static int
get_and_verify_graft_metadata_vp_size(vnode_t graft_vp,vfs_context_t vctx,size_t * size)488 get_and_verify_graft_metadata_vp_size(vnode_t graft_vp, vfs_context_t vctx, size_t *size)
489 {
490 struct stat64 sb = {};
491 int error;
492
493 *size = 0;
494
495 error = vn_stat(graft_vp, &sb, NULL, 1, 0, vctx);
496 if (error) {
497 return error;
498 }
499
500 if (sb.st_size == 0) {
501 error = ENODATA;
502 } else if ((size_t) sb.st_size > MAX_GRAFT_METADATA_SIZE) {
503 error = EFBIG;
504 } else {
505 *size = (size_t) sb.st_size;
506 }
507
508 return error;
509 }
510
511 /*
512 * Read in a graft file (a manifest or payload file) of size `size` into `buf`.
513 * `size` must already be validated.
514 */
515 static int
read_graft_metadata_vp(vnode_t graft_vp,vfs_context_t vctx,size_t size,void * buf)516 read_graft_metadata_vp(vnode_t graft_vp, vfs_context_t vctx, size_t size, void *buf)
517 {
518 return vn_rdwr(UIO_READ, graft_vp,
519 (caddr_t) buf, (int) size, /* offset */ 0,
520 UIO_SYSSPACE, IO_NOCACHE | IO_RAOFF | IO_UNIT,
521 vfs_context_ucred(vctx), /* resid */ NULL,
522 vfs_context_proc(vctx));
523 }
524
/*
 * Convert a single graft file descriptor into a vnode, get its size (saving it to `size`),
 * and read it into `buf`.
 * If `path_prefix` is non-NULL, verify that the file path has that prefix.
 *
 * `buf` must be able to hold MAX_GRAFT_METADATA_SIZE bytes; the size
 * check in get_and_verify_graft_metadata_vp_size() bounds *size to that.
 *
 * Returns: 0 on success; EINVAL if the path-prefix check fails; otherwise
 * the error from vnode_getfromfd(), vn_getpath(), the stat, or the read.
 */
static int
graft_secureboot_read_fd(int fd, vfs_context_t vctx, const char *path_prefix, size_t *size, void *buf)
{
	vnode_t metadata_vp = NULLVP;
	char *path = NULL;
	int error;

	// Convert this graft fd to a vnode (takes an iocount, dropped in `out`).
	if ((error = vnode_getfromfd(vctx, fd, &metadata_vp)) != 0) {
		goto out;
	}

	// Verify that the vnode path starts with `path_prefix` if it was passed.
	if (path_prefix) {
		int len = MAXPATHLEN;
		path = zalloc(ZV_NAMEI);
		if ((error = vn_getpath(metadata_vp, path, &len))) {
			goto out;
		}
		if (strncmp(path, path_prefix, strlen(path_prefix))) {
			// Path does not begin with the required prefix.
			error = EINVAL;
			goto out;
		}
	}

	// Get (and validate) size information.
	if ((error = get_and_verify_graft_metadata_vp_size(metadata_vp, vctx, size)) != 0) {
		goto out;
	}

	// Read each file into the provided buffer - we must get the expected amount of bytes.
	if ((error = read_graft_metadata_vp(metadata_vp, vctx, *size, buf)) != 0) {
		goto out;
	}

out:
	if (path) {
		zfree(ZV_NAMEI, path);
	}
	if (metadata_vp) {
		vnode_put(metadata_vp);
		metadata_vp = NULLVP;
	}

	return error;
}
576
577 #if XNU_TARGET_OS_OSX
578 #define BASESYSTEM_PATH "/System/Library/BaseSystem/"
579 #if defined(__arm64e__)
580 #define MOBILE_ASSET_DATA_VAULT_PATH "/System/Library/AssetsV2/manifests/"
581 #define MOBILE_ASSET_DATA_VAULT_RECOVERYOS_PATH "/System/Volumes/Data/System/Library/AssetsV2/manifests/"
582 #else /* x86_64 */
583 #define MOBILE_ASSET_DATA_VAULT_PATH "/System/Library/AssetsV2/"
584 #define MOBILE_ASSET_DATA_VAULT_RECOVERYOS_PATH "/System/Volumes/Update/MobileAsset/AssetsV2/"
585 #endif /* x86_64 */
586 #else /* !XNU_TARGET_OS_OSX */
587 #define MOBILE_ASSET_DATA_VAULT_PATH "/private/var/MobileAsset/AssetsV2/manifests/"
588 #endif /* !XNU_TARGET_OS_OSX */
589
/*
 * Read graft file descriptors into buffers of size MAX_GRAFT_METADATA_SIZE
 * provided in `gfs`, saving the size of data read in `gfs`.
 *
 * For Mobile Asset graft types, the authentic manifest is additionally
 * required to live under the platform's data-vault path (with a RecoveryOS
 * alternative on macOS).  The payload fd has no path restriction.
 *
 * Returns: 0 on success, otherwise the error from graft_secureboot_read_fd().
 */
static int
graft_secureboot_read_metadata(uint32_t graft_type, secure_boot_cryptex_args_t *sbc_args,
    vfs_context_t vctx, fsioc_graft_fs_t *gfs)
{
	const char *manifest_path_prefix = NULL;
	int error;

	// For Mobile Asset, make sure that the manifest comes from a data vault.
	if ((graft_type == GRAFTDMG_CRYPTEX_MOBILE_ASSET) ||
	    (graft_type == GRAFTDMG_CRYPTEX_MOBILE_ASSET_WITH_CODE)) {
		manifest_path_prefix = MOBILE_ASSET_DATA_VAULT_PATH;
#if XNU_TARGET_OS_OSX
		// Check if we're in RecoveryOS by checking for BaseSystem path
		// existence, and if so use the Data volume path of the data vault.
		struct nameidata nd;
		NDINIT(&nd, LOOKUP, OP_LOOKUP, NOFOLLOW, UIO_SYSSPACE,
		    CAST_USER_ADDR_T(BASESYSTEM_PATH), vctx);
		if (!namei(&nd)) {
			vnode_t vp = nd.ni_vp;
			if (vp->v_type == VDIR) {
				manifest_path_prefix = MOBILE_ASSET_DATA_VAULT_RECOVERYOS_PATH;
			}
			vnode_put(vp);
			nameidone(&nd);
		}
#endif
	}

	// Read the authentic manifest.
	if ((error = graft_secureboot_read_fd(sbc_args->sbc_authentic_manifest_fd, vctx,
	    manifest_path_prefix, &gfs->authentic_manifest_size, gfs->authentic_manifest))) {
		return error;
	}

	// The user manifest is currently unused, but set its size.
	gfs->user_manifest_size = 0;

	// Read the payload.
	if ((error = graft_secureboot_read_fd(sbc_args->sbc_payload_fd, vctx,
	    NULL, &gfs->payload_size, gfs->payload))) {
		return error;
	}

	return 0;
}
639
/*
 * Call into the filesystem to verify and graft a cryptex.
 *
 * `cryptex_vp` is the (iocounted) cryptex disk-image vnode; `mounton_vp`,
 * if non-NULL, is the (iocounted) directory the cryptex is grafted onto.
 * The manifest and payload referenced by `sbc_args` are read into
 * fixed-size kernel buffers and handed to the filesystem via
 * FSIOC_GRAFT_FS, which performs the actual validation and graft.
 *
 * Returns: 0 on success, otherwise an errno from preflight checks,
 * allocation, metadata reads, or VNOP_IOCTL().
 */
static int
graft_secureboot_cryptex(uint32_t graft_type, secure_boot_cryptex_args_t *sbc_args,
    vfs_context_t vctx, vnode_t cryptex_vp, vnode_t mounton_vp)
{
	fsioc_graft_fs_t gfs = {};
	uint64_t graft_dir_ino = 0;
	struct stat64 sb = {};
	int error;

	// Pre-flight arguments.
	if (sbc_args->sbc_version != GRAFTDMG_SECURE_BOOT_CRYPTEX_ARGS_VERSION) {
		// Make sure that this graft version matches what we support.
		return ENOTSUP;
	} else if (mounton_vp && cryptex_vp->v_mount != mounton_vp->v_mount) {
		// For this type, cryptex VP must live on same volume as the target of graft.
		return EXDEV;
	} else if (mounton_vp && mounton_vp->v_type != VDIR) {
		// We cannot graft upon non-directories.
		return ENOTDIR;
	} else if (cryptex_vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV) {
		// We do not allow grafts inside disk images.
		return ENODEV;
	} else if (sbc_args->sbc_authentic_manifest_fd < 0 ||
	    sbc_args->sbc_payload_fd < 0) {
		// We cannot graft without a manifest and payload.
		return EINVAL;
	}

	if (mounton_vp) {
		// Get the mounton's inode number.
		error = vn_stat(mounton_vp, &sb, NULL, 1, 0, vctx);
		if (error) {
			return error;
		}
		graft_dir_ino = (uint64_t) sb.st_ino;
	}

	// Create buffers (of our maximum-defined size) to store authentication info.
	gfs.authentic_manifest = kalloc_data(MAX_GRAFT_METADATA_SIZE, Z_WAITOK | Z_ZERO);
	gfs.payload = kalloc_data(MAX_GRAFT_METADATA_SIZE, Z_WAITOK | Z_ZERO);

	if (!gfs.authentic_manifest || !gfs.payload) {
		error = ENOMEM;
		goto out;
	}

	// Read our fd's into our buffers.
	// (Note that this will set the buffer size fields in `gfs`.)
	error = graft_secureboot_read_metadata(graft_type, sbc_args, vctx, &gfs);
	if (error) {
		goto out;
	}

	// Translate the caller-visible SBC_* flags into FSCTL_GRAFT_* flags.
	gfs.graft_version = FSIOC_GRAFT_VERSION;
	gfs.graft_type = graft_type;
	gfs.graft_4cc = sbc_args->sbc_4cc;
	if (sbc_args->sbc_flags & SBC_PRESERVE_MOUNT) {
		gfs.graft_flags |= FSCTL_GRAFT_PRESERVE_MOUNT;
	}
	if (sbc_args->sbc_flags & SBC_ALTERNATE_SHARED_REGION) {
		gfs.graft_flags |= FSCTL_GRAFT_ALTERNATE_SHARED_REGION;
	}
	if (sbc_args->sbc_flags & SBC_SYSTEM_CONTENT) {
		gfs.graft_flags |= FSCTL_GRAFT_SYSTEM_CONTENT;
	}
	if (sbc_args->sbc_flags & SBC_PANIC_ON_AUTHFAIL) {
		gfs.graft_flags |= FSCTL_GRAFT_PANIC_ON_AUTHFAIL;
	}
	if (sbc_args->sbc_flags & SBC_STRICT_AUTH) {
		gfs.graft_flags |= FSCTL_GRAFT_STRICT_AUTH;
	}
	if (sbc_args->sbc_flags & SBC_PRESERVE_GRAFT) {
		gfs.graft_flags |= FSCTL_GRAFT_PRESERVE_GRAFT;
	}
	gfs.dir_ino = graft_dir_ino; // ino from mounton_vp (if not provided, the parent directory)

	// Call into the FS to perform the graft (and validation).
	error = VNOP_IOCTL(cryptex_vp, FSIOC_GRAFT_FS, (caddr_t)&gfs, 0, vctx);

out:
	// Free both metadata buffers regardless of outcome.
	if (gfs.authentic_manifest) {
		kfree_data(gfs.authentic_manifest, MAX_GRAFT_METADATA_SIZE);
		gfs.authentic_manifest = NULL;
	}
	if (gfs.payload) {
		kfree_data(gfs.payload, MAX_GRAFT_METADATA_SIZE);
		gfs.payload = NULL;
	}

	return error;
}
734
735 #define GRAFTDMG_ENTITLEMENT "com.apple.private.vfs.graftdmg"
736
/*
 * Graft a cryptex disk image (via FD) onto the appropriate mount-point
 * { int graftdmg(int dmg_fd, const char *mountdir, uint32_t graft_type, graftdmg_args_un *gda); }
 *
 * Requires the com.apple.private.vfs.graftdmg entitlement.  If `mountdir`
 * is NULL, the graft targets the cryptex file's parent directory
 * (graft_on_parent); otherwise `mountdir` is looked up and used as the
 * mount-on directory.  The heavy lifting is done by
 * graft_secureboot_cryptex().
 */
int
graftdmg(__unused proc_t p, struct graftdmg_args *uap, __unused int32_t *retval)
{
	int ua_dmgfd = uap->dmg_fd;
	user_addr_t ua_mountdir = uap->mountdir;
	uint32_t ua_grafttype = uap->graft_type;
	user_addr_t ua_graftargs = uap->gda;

	graftdmg_args_un kern_gda = {};
	int error = 0;
	secure_boot_cryptex_args_t *sbc_args = NULL;
	// NULL mountdir means "graft onto the cryptex file's parent directory".
	bool graft_on_parent = (ua_mountdir == USER_ADDR_NULL);

	vnode_t cryptex_vp = NULLVP;
	struct nameidata nd = {};
	vfs_context_t ctx = vfs_context_current();
#if CONFIG_MACF
	vnode_t parent_vp = NULLVP;
#endif

	if (!IOTaskHasEntitlement(vfs_context_task(ctx), GRAFTDMG_ENTITLEMENT)) {
		return EPERM;
	}

	// Copy graftargs in, if provided.
	error = copyin(ua_graftargs, &kern_gda, sizeof(graftdmg_args_un));
	if (error) {
		return error;
	}

	// Convert fd to vnode (takes an iocount, dropped at graftout).
	error = vnode_getfromfd(ctx, ua_dmgfd, &cryptex_vp);
	if (error) {
		return error;
	}

	// The cryptex must be a regular disk-image file, not a directory.
	if (vnode_isdir(cryptex_vp)) {
		error = EISDIR;
		goto graftout;
	}

#if CONFIG_MACF
	if (graft_on_parent) {
		// Grafting on Cryptex file parent directory, need to get its vp for MAC check.
		parent_vp = vnode_getparent(cryptex_vp);
		if (parent_vp == NULLVP) {
			error = ENOENT;
			goto graftout;
		}
	}
#endif

	// Look up the explicit mount-on directory when one was supplied.
	if (!graft_on_parent) {
		NDINIT(&nd, LOOKUP, OP_MOUNT, (FOLLOW | AUDITVNPATH1),
		    UIO_USERSPACE, ua_mountdir, ctx);

		error = namei(&nd);
		if (error) {
			goto graftout;
		}
	}

#if CONFIG_MACF
	// MAC check against whichever directory the graft will land on.
	vnode_t macf_vp = graft_on_parent ? parent_vp : nd.ni_vp;
	error = mac_graft_check_graft(ctx, macf_vp);
	if (error) {
		goto graftout;
	}
#endif

	// Validate the graft type, then perform the graft.
	if (ua_grafttype == 0 || ua_grafttype > GRAFTDMG_CRYPTEX_MAX) {
		error = EINVAL;
	} else {
		sbc_args = &kern_gda.sbc_args;
		error = graft_secureboot_cryptex(ua_grafttype, sbc_args, ctx,
		    cryptex_vp, graft_on_parent ? NULLVP : nd.ni_vp);
	}

#if CONFIG_MACF
	if (!error) {
		mac_graft_notify_graft(ctx, macf_vp);
	}
#endif

graftout:
	// Release every iocount we may hold; nd is zero-initialized, so
	// nd.ni_vp is NULLVP unless the namei() above succeeded.
#if CONFIG_MACF
	if (parent_vp != NULLVP) {
		vnode_put(parent_vp);
		parent_vp = NULLVP;
	}
#endif
	if (cryptex_vp != NULLVP) {
		vnode_put(cryptex_vp);
		cryptex_vp = NULLVP;
	}
	if (nd.ni_vp != NULLVP) {
		vnode_put(nd.ni_vp);
		nameidone(&nd);
	}

	return error;
}
843
/*
 * Ungraft a cryptex disk image (via mount dir FD)
 * { int ungraftdmg(const char *mountdir, uint64_t flags); }
 *
 * Requires the com.apple.private.vfs.graftdmg entitlement.  Looks up the
 * grafted-on directory from `mountdir` and asks the filesystem to undo
 * the graft via FSIOC_UNGRAFT_FS.  UNGRAFTDMG_NOFORCE maps to
 * FSCTL_UNGRAFT_NOFORCE.
 */
int
ungraftdmg(__unused proc_t p, struct ungraftdmg_args *uap, __unused int32_t *retval)
{
	int error = 0;
	user_addr_t ua_mountdir = uap->mountdir;
	fsioc_ungraft_fs_t ugfs = {};
	struct nameidata nd = {};
	vfs_context_t ctx = vfs_context_current();

	if (!IOTaskHasEntitlement(vfs_context_task(ctx), GRAFTDMG_ENTITLEMENT)) {
		return EPERM;
	}

	// A mount-on path is mandatory for ungraft.
	if (ua_mountdir == USER_ADDR_NULL) {
		return EINVAL;
	}

	if (uap->flags & UNGRAFTDMG_NOFORCE) {
		ugfs.ungraft_flags |= FSCTL_UNGRAFT_NOFORCE;
	}

	// Acquire vnode for mount-on path
	NDINIT(&nd, LOOKUP, OP_MOUNT, (FOLLOW | AUDITVNPATH1),
	    UIO_USERSPACE, ua_mountdir, ctx);

	error = namei(&nd);
	if (error) {
		return error;
	}

	// Only directories can be (un)grafted upon.
	if (!vnode_isdir(nd.ni_vp)) {
		error = ENOTDIR;
		goto ungraftout;
	}

#if CONFIG_MACF
	error = mac_graft_check_ungraft(ctx, nd.ni_vp);
	if (error) {
		goto ungraftout;
	}
#endif

	// Call into the FS to perform the ungraft
	error = VNOP_IOCTL(nd.ni_vp, FSIOC_UNGRAFT_FS, (caddr_t)&ugfs, 0, ctx);

#if CONFIG_MACF
	if (!error) {
		mac_graft_notify_ungraft(ctx, nd.ni_vp);
	}
#endif

ungraftout:
	// Drop the iocount taken by namei() above.
	vnode_put(nd.ni_vp);
	nameidone(&nd);

	return error;
}
905
906
/*
 * Announce that a new mount has appeared: signal the VQ_MOUNT vfs event
 * and post a NOTE_WRITE knote on `pdvp` (the parent directory of the
 * covered vnode) so kqueue watchers observe the namespace change.
 */
void
vfs_notify_mount(vnode_t pdvp)
{
	vfs_event_signal(NULL, VQ_MOUNT, (intptr_t)NULL);
	lock_vnode_and_post(pdvp, NOTE_WRITE);
}
913
914 /*
915 * __mac_mount:
916 * Mount a file system taking into account MAC label behavior.
917 * See mount(2) man page for more information
918 *
919 * Parameters: p Process requesting the mount
920 * uap User argument descriptor (see below)
921 * retval (ignored)
922 *
923 * Indirect: uap->type Filesystem type
924 * uap->path Path to mount
925 * uap->data Mount arguments
926 * uap->mac_p MAC info
927 * uap->flags Mount flags
928 *
929 *
930 * Returns: 0 Success
931 * !0 Not success
932 */
933 boolean_t root_fs_upgrade_try = FALSE;
934
935 #define MAX_NESTED_UNION_MOUNTS 10
936
int
__mac_mount(struct proc *p, register struct __mac_mount_args *uap, __unused int32_t *retval)
{
	vnode_t pvp = NULLVP;           /* parent of the vnode to be covered */
	vnode_t vp = NULLVP;            /* vnode to be covered by the mount */
	int need_nameidone = 0;         /* nonzero once namei() has succeeded */
	vfs_context_t ctx = vfs_context_current();
	char fstypename[MFSNAMELEN];
	struct nameidata nd;
	size_t dummy = 0;
	char *labelstr = NULL;          /* MAC label string copied in from user space */
	size_t labelsz = 0;
	int flags = uap->flags;
	int error;
	int num_retries = 0;            /* EBUSY retry counter, capped at MAX_MOUNT_RETRIES */
#if CONFIG_IMGSRC_ACCESS || CONFIG_MACF
	boolean_t is_64bit = IS_64BIT_PROCESS(p);
#else
#pragma unused(p)
#endif
	/*
	 * Get the fs type name from user space
	 */
	error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy);
	if (error) {
		return error;
	}

retry:
	/*
	 * Get the vnode to be covered (WANTPARENT also returns the parent
	 * with its own iocount; both are dropped at 'out:').
	 */
	NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
	    UIO_USERSPACE, uap->path, ctx);
	if (flags & MNT_NOFOLLOW) {
		/* Caller asked that no symlink anywhere in the path be followed. */
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}
	error = namei(&nd);
	if (error) {
		goto out;
	}
	need_nameidone = 1;
	vp = nd.ni_vp;
	pvp = nd.ni_dvp;

#ifdef CONFIG_IMGSRC_ACCESS
	/*
	 * Mounting image source cannot be batched with other operations.
	 * NOTE(review): exact equality test, so MNT_IMGSRC_BY_INDEX may not be
	 * combined with any other mount flag; also note '#ifdef' here vs
	 * '#if CONFIG_IMGSRC_ACCESS' in the declarations above — confirm which
	 * form the config macro expects.
	 */
	if (flags == MNT_IMGSRC_BY_INDEX) {
		error = relocate_imageboot_source(pvp, vp, &nd.ni_cnd, fstypename,
		    ctx, is_64bit, uap->data, (flags == MNT_IMGSRC_BY_INDEX));
		goto out;
	}
#endif /* CONFIG_IMGSRC_ACCESS */

#if CONFIG_MACF
	/*
	 * Get the label string (if any) from user space.
	 * The user_mac structure has different layouts for 32- and 64-bit
	 * callers, so copy in the correct variant and widen by hand.
	 */
	if (uap->mac_p != USER_ADDR_NULL) {
		struct user_mac mac;
		size_t ulen = 0;

		if (is_64bit) {
			struct user64_mac mac64;
			error = copyin(uap->mac_p, &mac64, sizeof(mac64));
			mac.m_buflen = (user_size_t)mac64.m_buflen;
			mac.m_string = (user_addr_t)mac64.m_string;
		} else {
			struct user32_mac mac32;
			error = copyin(uap->mac_p, &mac32, sizeof(mac32));
			mac.m_buflen = mac32.m_buflen;
			mac.m_string = mac32.m_string;
		}
		if (error) {
			goto out;
		}
		/* Bound the label size: at most MAC_MAX_LABEL_BUF_LEN, at least
		 * 2 bytes (one character plus the NUL terminator). */
		if ((mac.m_buflen > MAC_MAX_LABEL_BUF_LEN) ||
		    (mac.m_buflen < 2)) {
			error = EINVAL;
			goto out;
		}
		labelsz = mac.m_buflen;
		labelstr = kalloc_data(labelsz, Z_WAITOK);
		error = copyinstr(mac.m_string, labelstr, mac.m_buflen, &ulen);
		if (error) {
			goto out;
		}
		AUDIT_ARG(mac_string, labelstr);
	}
#endif /* CONFIG_MACF */

	AUDIT_ARG(fflags, flags);

	if (flags & MNT_UNION) {
#if CONFIG_UNION_MOUNTS
		mount_t mp = vp->v_mount;
		int nested_union_mounts = 0;

		name_cache_lock_shared();

		/* Walk up the vnodecovered chain and check for nested union mounts. */
		mp = (mp->mnt_vnodecovered ? mp->mnt_vnodecovered->v_mount : NULL);
		while (mp) {
			if (!(mp->mnt_flag & MNT_UNION)) {
				break;
			}
			mp = (mp->mnt_vnodecovered ? mp->mnt_vnodecovered->v_mount : NULL);

			/*
			 * Limit the max nested union mounts to prevent stack exhaustion
			 * when calling lookup_traverse_union().
			 */
			if (++nested_union_mounts >= MAX_NESTED_UNION_MOUNTS) {
				error = ELOOP;
				break;
			}
		}

		name_cache_unlock();
		if (error) {
			goto out;
		}
#else
		error = EPERM;
		goto out;
#endif /* CONFIG_UNION_MOUNTS */
	}

	/* Special-case mounting over '/': the root of the root filesystem. */
	if ((vp->v_flag & VROOT) &&
	    (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
#if CONFIG_UNION_MOUNTS
		if (!(flags & MNT_UNION)) {
			flags |= MNT_UPDATE;
		} else {
			/*
			 * For a union mount on '/', treat it as fresh
			 * mount instead of update.
			 * Otherwise, union mounting on '/' used to panic the
			 * system before, since mnt_vnodecovered was found to
			 * be NULL for '/' which is required for unionlookup
			 * after it gets ENOENT on union mount.
			 */
			flags = (flags & ~(MNT_UPDATE));
		}
#else
		flags |= MNT_UPDATE;
#endif /* CONFIG_UNION_MOUNTS */

#if SECURE_KERNEL
		if ((flags & MNT_RDONLY) == 0) {
			/* Release kernels are not allowed to mount "/" as rw */
			error = EPERM;
			goto out;
		}
#endif

		/*
		 * See 7392553 for more details on why this check exists.
		 * Suffice to say: If this check is ON and something tries
		 * to mount the rootFS RW, we'll turn off the codesign
		 * bitmap optimization.
		 */
#if CHECK_CS_VALIDATION_BITMAP
		if ((flags & MNT_RDONLY) == 0) {
			root_fs_upgrade_try = TRUE;
		}
#endif
	}

	/* Hand off to the common (kernel- and user-mount) implementation. */
	error = mount_common(fstypename, pvp, vp, &nd.ni_cnd, uap->data, flags, 0,
	    labelstr, ctx);

out:

#if CONFIG_MACF
	/*
	 * NOTE(review): called unconditionally and before a possible retry —
	 * relies on kfree_data() tolerating a NULL pointer and clearing
	 * 'labelstr' so the retry path re-allocates safely; confirm the
	 * macro's semantics.
	 */
	kfree_data(labelstr, labelsz);
#endif /* CONFIG_MACF */

	/* Drop the iocounts taken by namei(), then release its pathname state. */
	if (vp) {
		vnode_put(vp);
		vp = NULLVP;
	}
	if (pvp) {
		vnode_put(pvp);
		pvp = NULLVP;
	}
	if (need_nameidone) {
		nameidone(&nd);
		need_nameidone = 0;
	}

	if (error == EBUSY) {
		/* Retry the lookup and mount again due to concurrent mounts. */
		if (++num_retries < MAX_MOUNT_RETRIES) {
			goto retry;
		}
	}

	return error;
}
1137
/*
 * common mount implementation (final stage of mounting)
 *
 * Arguments:
 *	fstypename	file system type (ie it's vfs name)
 *	pvp		parent of covered vnode
 *	vp		covered vnode
 *	cnp		component name (ie path) of covered vnode
 *	fsmountargs	file system specific data
 *	flags		generic mount flags
 *	internal_flags	KERNEL_MOUNT_* flags; KERNEL_MOUNT_KMOUNT marks
 *			mounts initiated from inside the kernel
 *	labelstr	optional MAC label
 *	ctx		caller's context
 */
static int
mount_common(const char *fstypename, vnode_t pvp, vnode_t vp,
    struct componentname *cnp, user_addr_t fsmountargs, int flags, uint32_t internal_flags,
    char *labelstr, vfs_context_t ctx)
{
#if !CONFIG_MACF
#pragma unused(labelstr)
#endif
	struct vnode *devvp = NULLVP;           /* device vnode resolved from devpath */
	struct vnode *device_vnode = NULLVP;    /* device vnode opened/used for this mount */
#if CONFIG_MACF
	struct vnode *rvp;
#endif
	struct mount *mp = NULL;
	struct vfstable *vfsp = (struct vfstable *)0;
	struct proc *p = vfs_context_proc(ctx);
	int error, flag = 0;                    /* 'flag' saves mnt_flag for restore on failure */
	bool flag_set = false;                  /* true once 'flag' holds a saved mnt_flag */
	user_addr_t devpath = USER_ADDR_NULL;
	int ronly = 0;
	int mntalloc = 0;                       /* nonzero once 'mp' was zalloc'ed here */
	boolean_t vfsp_ref = FALSE;             /* we hold a vfc_refcount on 'vfsp' */
	boolean_t is_rwlock_locked = FALSE;     /* mnt_rwlock held exclusive */
	boolean_t did_rele = FALSE;             /* devvp usecount already released in out4 */
	boolean_t have_usecount = FALSE;        /* usecount taken on covered vp */
	boolean_t did_set_lmount = FALSE;       /* MNT_LMOUNT set on mp */
	boolean_t did_set_vmount = FALSE;       /* VMOUNT set on covered vp */
	boolean_t kernelmount = !!(internal_flags & KERNEL_MOUNT_KMOUNT);

#if CONFIG_ROSV_STARTUP || CONFIG_MOUNT_VM || CONFIG_BASESYSTEMROOT
	/* Check for mutually-exclusive flag bits */
	uint32_t checkflags = (internal_flags & (KERNEL_MOUNT_VOLBYROLE_MASK | KERNEL_MOUNT_BASESYSTEMROOT));
	int bitcount = 0;
	/* Kernighan popcount: each iteration clears the lowest set bit. */
	while (checkflags != 0) {
		checkflags &= (checkflags - 1);
		bitcount++;
	}

	if (bitcount > 1) {
		//not allowed to request multiple mount-by-role flags
		error = EINVAL;
		goto out1;
	}
#endif

	/*
	 * Process an update for an existing mount
	 */
	if (flags & MNT_UPDATE) {
		/* Updates may only be applied through the filesystem's root vnode. */
		if ((vp->v_flag & VROOT) == 0) {
			error = EINVAL;
			goto out1;
		}
		mp = vp->v_mount;

		/* if unmount or mount in progress, return error */
		mount_lock_spin(mp);
		if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
			mount_unlock(mp);
			error = EBUSY;
			goto out1;
		}
		mp->mnt_lflag |= MNT_LMOUNT;
		did_set_lmount = TRUE;
		mount_unlock(mp);
		lck_rw_lock_exclusive(&mp->mnt_rwlock);
		is_rwlock_locked = TRUE;
		/*
		 * We only allow the filesystem to be reloaded if it
		 * is currently mounted read-only.
		 */
		if ((flags & MNT_RELOAD) &&
		    ((mp->mnt_flag & MNT_RDONLY) == 0)) {
			error = ENOTSUP;
			goto out1;
		}

		/*
		 * If content protection is enabled, update mounts are not
		 * allowed to turn it off.
		 */
		if ((mp->mnt_flag & MNT_CPROTECT) &&
		    ((flags & MNT_CPROTECT) == 0)) {
			error = EINVAL;
			goto out1;
		}

		/*
		 * can't turn off MNT_REMOVABLE either but it may be an unexpected
		 * failure to return an error for this so we'll just silently
		 * add it if it is not passed in.
		 */
		if ((mp->mnt_flag & MNT_REMOVABLE) &&
		    ((flags & MNT_REMOVABLE) == 0)) {
			flags |= MNT_REMOVABLE;
		}

		/* Can't downgrade the backer of the root FS */
		if ((mp->mnt_kern_flag & MNTK_BACKS_ROOT) &&
		    (!vfs_isrdonly(mp)) && (flags & MNT_RDONLY)) {
			error = ENOTSUP;
			goto out1;
		}

		/*
		 * Only root, or the user that did the original mount is
		 * permitted to update it.
		 */
		if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
		    (error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
			goto out1;
		}
#if CONFIG_MACF
		error = mac_mount_check_remount(ctx, mp, flags);
		if (error != 0) {
			goto out1;
		}
#endif
		/*
		 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV,
		 * and MNT_NOEXEC if mount point is already MNT_NOEXEC.
		 */
		if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
			flags |= MNT_NOSUID | MNT_NODEV;
			if (mp->mnt_flag & MNT_NOEXEC) {
				flags |= MNT_NOEXEC;
			}
		}
		/* Save mnt_flag so it can be restored should the update fail. */
		flag = mp->mnt_flag;
		flag_set = true;



		mp->mnt_flag |= flags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE);

		vfsp = mp->mnt_vtable;
		goto update;
	} // MNT_UPDATE

	/*
	 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV, and
	 * MNT_NOEXEC if mount point is already MNT_NOEXEC.
	 */
	if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
		flags |= MNT_NOSUID | MNT_NODEV;
		if (vp->v_mount->mnt_flag & MNT_NOEXEC) {
			flags |= MNT_NOEXEC;
		}
	}

	/* XXXAUDIT: Should we capture the type on the error path as well? */
	/* XXX cast-away const (audit_arg_text() does not modify its input) */
	AUDIT_ARG(text, (char *)(uintptr_t)fstypename);
	/*
	 * Find the vfstable entry for the requested filesystem type and take
	 * a reference on it (dropped at out1 via vfsp_ref on failure).
	 */
	mount_list_lock();
	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
		if (!strncmp(vfsp->vfc_name, fstypename, MFSNAMELEN)) {
			vfsp->vfc_refcount++;
			vfsp_ref = TRUE;
			break;
		}
	}
	mount_list_unlock();
	if (vfsp == NULL) {
		error = ENODEV;
		goto out1;
	}

	/*
	 * VFC_VFSLOCALARGS is not currently supported for kernel mounts,
	 * except in ROSV configs and for the initial BaseSystem root.
	 */
	if (kernelmount && (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) &&
	    ((internal_flags & KERNEL_MOUNT_VOLBYROLE_MASK) == 0) &&
	    ((internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) == 0)) {
		error = EINVAL; /* unsupported request */
		goto out1;
	}

	error = prepare_coveredvp(vp, ctx, cnp, fstypename, internal_flags);
	if (error != 0) {
		goto out1;
	}

	/*
	 * Upon success of prepare_coveredvp(), VMOUNT is set for the covered vp.
	 */
	did_set_vmount = TRUE;

	/*
	 * Allocate and initialize the filesystem (mount_t)
	 */
	mp = zalloc_flags(mount_zone, Z_WAITOK | Z_ZERO);
	mntalloc = 1;

	/* Initialize the default IO constraints */
	mp->mnt_maxreadcnt = mp->mnt_maxwritecnt = MAXPHYS;
	mp->mnt_segreadcnt = mp->mnt_segwritecnt = 32;
	mp->mnt_maxsegreadsize = mp->mnt_maxreadcnt;
	mp->mnt_maxsegwritesize = mp->mnt_maxwritecnt;
	mp->mnt_devblocksize = DEV_BSIZE;
	mp->mnt_alignmentmask = PAGE_MASK;
	mp->mnt_ioqueue_depth = MNT_DEFAULT_IOQUEUE_DEPTH;
	mp->mnt_ioscale = 1;
	mp->mnt_ioflags = 0;
	mp->mnt_realrootvp = NULLVP;
	mp->mnt_authcache_ttl = CACHED_LOOKUP_RIGHT_TTL;

	/* Mark the mount in progress; cleared at 'exit'/'out1'. */
	mp->mnt_lflag |= MNT_LMOUNT;
	did_set_lmount = TRUE;

	TAILQ_INIT(&mp->mnt_vnodelist);
	TAILQ_INIT(&mp->mnt_workerqueue);
	TAILQ_INIT(&mp->mnt_newvnodes);
	mount_lock_init(mp);
	lck_rw_lock_exclusive(&mp->mnt_rwlock);
	is_rwlock_locked = TRUE;
	mp->mnt_op = vfsp->vfc_vfsops;
	mp->mnt_vtable = vfsp;
	//mp->mnt_stat.f_type = vfsp->vfc_typenum;
	mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
	strlcpy(mp->mnt_vfsstat.f_fstypename, vfsp->vfc_name, MFSTYPENAMELEN);
	/*
	 * Record the mount-on path; fall back to the component-name path
	 * buffer if vn_getpath_ext() cannot produce one.
	 */
	do {
		size_t pathlen = MAXPATHLEN;

		if (vn_getpath_ext(vp, pvp, mp->mnt_vfsstat.f_mntonname, &pathlen, VN_GETPATH_FSENTER)) {
			strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);
		}
	} while (0);
	mp->mnt_vnodecovered = vp;
	mp->mnt_vfsstat.f_owner = kauth_cred_getuid(vfs_context_ucred(ctx));
	mp->mnt_throttle_mask = LOWPRI_MAX_NUM_DEV - 1;
	mp->mnt_devbsdunit = 0;
	mp->mnt_mount_id = os_atomic_inc_orig(&mount_unique_id, relaxed);

	/* XXX 3762912 hack to support HFS filesystem 'owner' - filesystem may update later */
	vfs_setowner(mp, KAUTH_UID_NONE, KAUTH_GID_NONE);

	if (kernelmount) {
		mp->mnt_kern_flag |= MNTK_KERNEL_MOUNT;
	}
	if ((internal_flags & KERNEL_MOUNT_PERMIT_UNMOUNT) != 0) {
		mp->mnt_kern_flag |= MNTK_PERMIT_UNMOUNT;
	}

	if (KERNEL_MOUNT_DEVFS & internal_flags) {
		// kernel mounted devfs
		mp->mnt_kern_flag |= MNTK_SYSTEM;
	}

update:
	/* Both fresh mounts and MNT_UPDATE mounts converge here. */

	/*
	 * Set the mount level flags.
	 */
	if (flags & MNT_RDONLY) {
		mp->mnt_flag |= MNT_RDONLY;
	} else if (mp->mnt_flag & MNT_RDONLY) {
		// disallow read/write upgrades of file systems that
		// had the TYPENAME_OVERRIDE feature set.
		if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
			error = EPERM;
			goto out1;
		}
		mp->mnt_kern_flag |= MNTK_WANTRDWR;
	}
	/* Clear the user-settable flags, then re-apply the requested set. */
	mp->mnt_flag &= ~(MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
	    MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
	    MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
	    MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME | MNT_STRICTATIME |
	    MNT_QUARANTINE | MNT_CPROTECT);

#if SECURE_KERNEL
#if !CONFIG_MNT_SUID
	/*
	 * On release builds of iOS based platforms, always enforce NOSUID on
	 * all mounts. We do this here because we can catch update mounts as well as
	 * non-update mounts in this case.
	 */
	mp->mnt_flag |= (MNT_NOSUID);
#endif
#endif

	mp->mnt_flag |= flags & (MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
	    MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
	    MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
	    MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME | MNT_STRICTATIME |
	    MNT_QUARANTINE | MNT_CPROTECT);

#if CONFIG_MACF
	if (flags & MNT_MULTILABEL) {
		if (vfsp->vfc_vfsflags & VFC_VFSNOMACLABEL) {
			error = EINVAL;
			goto out1;
		}
		mp->mnt_flag |= MNT_MULTILABEL;
	}
#endif
	/*
	 * Process device path for local file systems if requested.
	 *
	 * Snapshot and mount-by-role mounts do not use this path; they are
	 * passing other opaque data in the device path field.
	 *
	 * Basesystemroot mounts pass a device path to be resolved here,
	 * but it's just a char * already inside the kernel, which
	 * kernel_mount() shoved into a user_addr_t to call us. So for such
	 * mounts we must skip copyin (both of the address and of the string
	 * (in NDINIT).
	 */
	if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS &&
	    !(internal_flags & (KERNEL_MOUNT_SNAPSHOT | KERNEL_MOUNT_VOLBYROLE_MASK))) {
		boolean_t do_copyin_devpath = true;
#if CONFIG_BASESYSTEMROOT
		if (internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) {
			// KERNEL_MOUNT_BASESYSTEMROOT implies subtle behavior worth noting:
			// We have been passed fsmountargs, which is typed as a user_addr_t,
			// but is actually a char ** pointing to a (kernelspace) string.
			// We manually unpack it with a series of casts and dereferences
			// that reverses what was done just above us on the stack in
			// imageboot_pivot_image().
			// After retrieving the path to the dev node (which we will NDINIT
			// in a moment), we pass NULL fsmountargs on to the filesystem.
			_Static_assert(sizeof(char **) == sizeof(fsmountargs), "fsmountargs should fit a (kernel) address");
			char **devnamepp = (char **)fsmountargs;
			char *devnamep = *devnamepp;
			devpath = CAST_USER_ADDR_T(devnamep);
			do_copyin_devpath = false;
			fsmountargs = USER_ADDR_NULL;

			//Now that we have a mp, denote that this mount is for the basesystem.
			mp->mnt_supl_kern_flag |= MNTK_SUPL_BASESYSTEM;
		}
#endif // CONFIG_BASESYSTEMROOT

		if (do_copyin_devpath) {
			/* First word of fsmountargs is the user pointer to the device path. */
			if (vfs_context_is64bit(ctx)) {
				if ((error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath)))) {
					goto out1;
				}
				fsmountargs += sizeof(devpath);
			} else {
				user32_addr_t tmp;
				if ((error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp)))) {
					goto out1;
				}
				/* munge into LP64 addr */
				devpath = CAST_USER_ADDR_T(tmp);
				fsmountargs += sizeof(tmp);
			}
		}

		/* Lookup device and authorize access to it */
		if ((devpath)) {
			struct nameidata nd;

			enum uio_seg seg = UIO_USERSPACE;
#if CONFIG_BASESYSTEMROOT
			if (internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) {
				/* path is a kernel string, not a user pointer (see above) */
				seg = UIO_SYSSPACE;
			}
#endif // CONFIG_BASESYSTEMROOT

			NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW, seg, devpath, ctx);
			if (flags & MNT_NOFOLLOW) {
				nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
			}
			if ((error = namei(&nd))) {
				goto out1;
			}

			devvp = nd.ni_vp;

			/* Mountable devices must be block devices with a valid major. */
			if (devvp->v_type != VBLK) {
				error = ENOTBLK;
				nameidone(&nd);
				goto out2;
			}
			if (major(devvp->v_rdev) >= nblkdev) {
				error = ENXIO;
				nameidone(&nd);
				goto out2;
			}
			/*
			 * If mount by non-root, then verify that user has necessary
			 * permissions on the device.
			 */
			if (suser(vfs_context_ucred(ctx), NULL) != 0) {
				kauth_action_t accessmode = KAUTH_VNODE_READ_DATA;

				if ((mp->mnt_flag & MNT_RDONLY) == 0) {
					accessmode |= KAUTH_VNODE_WRITE_DATA;
				}
				if ((error = vnode_authorize(devvp, NULL, accessmode, ctx)) != 0) {
					nameidone(&nd);
					goto out2;
				}
			}

			strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);
			nameidone(&nd);
		}
		/* On first mount, preflight and open device */
		if (devpath && ((flags & MNT_UPDATE) == 0)) {
			/* Take a persistent usecount on the device vnode. */
			if ((error = vnode_ref(devvp))) {
				goto out2;
			}
			/*
			 * Disallow multiple mounts of the same device.
			 * Disallow mounting of a device that is currently in use
			 * (except for root, which might share swap device for miniroot).
			 * Flush out any old buffers remaining from a previous use.
			 */
			if ((error = vfs_setmounting(devvp))) {
				vnode_rele(devvp);
				goto out2;
			}

			if (vcount(devvp) > 1 && !(vfs_flags(mp) & MNT_ROOTFS)) {
				error = EBUSY;
				goto out3;
			}
			if ((error = VNOP_FSYNC(devvp, MNT_WAIT, ctx))) {
				error = ENOTBLK;
				goto out3;
			}
			if ((error = buf_invalidateblks(devvp, BUF_WRITE_DATA, 0, 0))) {
				goto out3;
			}

			ronly = (mp->mnt_flag & MNT_RDONLY) != 0;
#if CONFIG_MACF
			error = mac_vnode_check_open(ctx,
			    devvp,
			    ronly ? FREAD : FREAD | FWRITE);
			if (error) {
				goto out3;
			}
#endif /* MAC */
			if ((error = VNOP_OPEN(devvp, ronly ? FREAD : FREAD | FWRITE, ctx))) {
				goto out3;
			}

			mp->mnt_devvp = devvp;
			device_vnode = devvp;
		} else if ((mp->mnt_flag & MNT_RDONLY) &&
		    (mp->mnt_kern_flag & MNTK_WANTRDWR) &&
		    (device_vnode = mp->mnt_devvp)) {
			dev_t dev;
			int maj;
			/*
			 * If upgrade to read-write by non-root, then verify
			 * that user has necessary permissions on the device.
			 */
			vnode_getalways(device_vnode);

			if (suser(vfs_context_ucred(ctx), NULL) &&
			    (error = vnode_authorize(device_vnode, NULL,
			    KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA,
			    ctx)) != 0) {
				vnode_put(device_vnode);
				goto out2;
			}

			/* Tell the device that we're upgrading */
			dev = (dev_t)device_vnode->v_rdev;
			maj = major(dev);

			if ((u_int)maj >= (u_int)nblkdev) {
				panic("Volume mounted on a device with invalid major number.");
			}

			/* Re-open the device read/write via its bdevsw entry. */
			error = bdevsw[maj].d_open(dev, FREAD | FWRITE, S_IFBLK, p);
			vnode_put(device_vnode);
			device_vnode = NULLVP;
			if (error != 0) {
				goto out2;
			}
		}
	} // localargs && !(snapshot | data | vm)

#if CONFIG_MACF
	if ((flags & MNT_UPDATE) == 0) {
		mac_mount_label_init(mp);
		mac_mount_label_associate(ctx, mp);
	}
	if (labelstr) {
		if ((flags & MNT_UPDATE) != 0) {
			error = mac_mount_check_label_update(ctx, mp);
			if (error != 0) {
				goto out3;
			}
		}
	}
#endif
	/*
	 * Mount the filesystem. We already asserted that internal_flags
	 * cannot have more than one mount-by-role bit set.
	 */
	if (internal_flags & KERNEL_MOUNT_SNAPSHOT) {
		error = VFS_IOCTL(mp, VFSIOC_MOUNT_SNAPSHOT,
		    (caddr_t)fsmountargs, 0, ctx);
	} else if (internal_flags & KERNEL_MOUNT_DATAVOL) {
#if CONFIG_ROSV_STARTUP
		/* fsmountargs carries the origin mount_t for mount-by-role. */
		struct mount *origin_mp = (struct mount*)fsmountargs;
		fs_role_mount_args_t frma = {origin_mp, VFS_DATA_ROLE};
		error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
		if (error) {
			printf("MOUNT-BY-ROLE (%d) failed! (%d)", VFS_DATA_ROLE, error);
		} else {
			/* Mark volume associated with system volume */
			mp->mnt_kern_flag |= MNTK_SYSTEM;

			/* Attempt to acquire the mnt_devvp and set it up */
			struct vnode *mp_devvp = NULL;
			if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
				errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
				    0, &mp_devvp, vfs_context_kernel());
				if (!lerr) {
					mp->mnt_devvp = mp_devvp;
					//vnode_lookup took an iocount, need to drop it.
					vnode_put(mp_devvp);
					// now set `device_vnode` to the devvp that was acquired.
					// this is needed in order to ensure vfs_init_io_attributes is invoked.
					// note that though the iocount above was dropped, the mount acquires
					// an implicit reference against the device.
					device_vnode = mp_devvp;
				}
			}
		}
#else
		error = EINVAL;
#endif
	} else if (internal_flags & KERNEL_MOUNT_VMVOL) {
#if CONFIG_MOUNT_VM
		struct mount *origin_mp = (struct mount*)fsmountargs;
		fs_role_mount_args_t frma = {origin_mp, VFS_VM_ROLE};
		error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
		if (error) {
			printf("MOUNT-BY-ROLE (%d) failed! (%d)", VFS_VM_ROLE, error);
		} else {
			/* Mark volume associated with system volume and a swap mount */
			mp->mnt_kern_flag |= (MNTK_SYSTEM | MNTK_SWAP_MOUNT);
			/* Attempt to acquire the mnt_devvp and set it up */
			struct vnode *mp_devvp = NULL;
			if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
				errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
				    0, &mp_devvp, vfs_context_kernel());
				if (!lerr) {
					mp->mnt_devvp = mp_devvp;
					//vnode_lookup took an iocount, need to drop it.
					vnode_put(mp_devvp);

					// now set `device_vnode` to the devvp that was acquired.
					// note that though the iocount above was dropped, the mount acquires
					// an implicit reference against the device.
					device_vnode = mp_devvp;
				}
			}
		}
#else
		error = EINVAL;
#endif
	} else if ((internal_flags & KERNEL_MOUNT_PREBOOTVOL) || (internal_flags & KERNEL_MOUNT_RECOVERYVOL)) {
#if CONFIG_MOUNT_PREBOOTRECOVERY
		struct mount *origin_mp = (struct mount*)fsmountargs;
		uint32_t mount_role = 0;
		if (internal_flags & KERNEL_MOUNT_PREBOOTVOL) {
			mount_role = VFS_PREBOOT_ROLE;
		} else if (internal_flags & KERNEL_MOUNT_RECOVERYVOL) {
			mount_role = VFS_RECOVERY_ROLE;
		}

		if (mount_role != 0) {
			fs_role_mount_args_t frma = {origin_mp, mount_role};
			error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
			if (error) {
				printf("MOUNT-BY-ROLE (%d) failed! (%d)", mount_role, error);
			} else {
				// NOT YET - need to qualify how this interacts with shutdown, ERP/ERB, etc
				/* Mark volume associated with system volume */
				//mp->mnt_kern_flag |= MNTK_SYSTEM;
				/* Attempt to acquire the mnt_devvp and set it up */
				struct vnode *mp_devvp = NULL;
				if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
					errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
					    0, &mp_devvp, vfs_context_kernel());
					if (!lerr) {
						mp->mnt_devvp = mp_devvp;
						//vnode_lookup took an iocount, need to drop it.
						vnode_put(mp_devvp);

						// now set `device_vnode` to the devvp that was acquired.
						// note that though the iocount above was dropped, the mount acquires
						// an implicit reference against the device.
						device_vnode = mp_devvp;
					}
				}
			}
		} else {
			printf("MOUNT-BY-ROLE (%d) failed - ROLE UNRECOGNIZED! (%d)", mount_role, error);
			error = EINVAL;
		}
#else
		error = EINVAL;
#endif
	} else {
		/* The ordinary case: let the filesystem mount itself. */
		error = VFS_MOUNT(mp, device_vnode, fsmountargs, ctx);
	}

	if (flags & MNT_UPDATE) {
		/* Finish an update mount: apply or roll back the flag changes. */
		if (mp->mnt_kern_flag & MNTK_WANTRDWR) {
			mp->mnt_flag &= ~MNT_RDONLY;
		}
		mp->mnt_flag &= ~
		    (MNT_UPDATE | MNT_RELOAD | MNT_FORCE);
		mp->mnt_kern_flag &= ~MNTK_WANTRDWR;
		if (error) {
			mp->mnt_flag = flag; /* restore flag value */
		}
		vfs_event_signal(NULL, VQ_UPDATE, (intptr_t)NULL);
		lck_rw_done(&mp->mnt_rwlock);
		is_rwlock_locked = FALSE;
		if (!error) {
			enablequotas(mp, ctx);
		}
		goto exit;
	}

	/*
	 * Put the new filesystem on the mount list after root.
	 */
	if (error == 0) {
		struct vfs_attr vfsattr;
		if (device_vnode) {
			/*
			 * cache the IO attributes for the underlying physical media...
			 * an error return indicates the underlying driver doesn't
			 * support all the queries necessary... however, reasonable
			 * defaults will have been set, so no reason to bail or care
			 *
			 * Need to do this before calling the MAC hook as it needs
			 * information from this call.
			 */
			vfs_init_io_attributes(device_vnode, mp);
		}

#if CONFIG_MACF
		error = mac_mount_check_mount_late(ctx, mp);
		if (error != 0) {
			goto out4;
		}

		if (vfs_flags(mp) & MNT_MULTILABEL) {
			error = VFS_ROOT(mp, &rvp, ctx);
			if (error) {
				printf("%s() VFS_ROOT returned %d\n", __func__, error);
				goto out4;
			}
			error = vnode_label(mp, NULL, rvp, NULL, 0, ctx);
			/*
			 * drop reference provided by VFS_ROOT
			 */
			vnode_put(rvp);

			if (error) {
				goto out4;
			}
		}
#endif /* MAC */

		/* Plant the mount on the covered vnode. */
		vnode_lock_spin(vp);
		CLR(vp->v_flag, VMOUNT);
		vp->v_mountedhere = mp;
		SET(vp->v_flag, VMOUNTEDHERE);

		/*
		 * Wakeup any waiter(s) in prepare_coveredvp() that is waiting for the
		 * 'v_mountedhere' to be planted.
		 */
		wakeup(&vp->v_flag);
		vnode_unlock(vp);

		/*
		 * taking the name_cache_lock exclusively will
		 * insure that everyone is out of the fast path who
		 * might be trying to use a now stale copy of
		 * vp->v_mountedhere->mnt_realrootvp
		 * bumping mount_generation causes the cached values
		 * to be invalidated
		 */
		name_cache_lock();
		mount_generation++;
		name_cache_unlock();

		error = vnode_ref(vp);
		if (error != 0) {
			goto out4;
		}

		have_usecount = TRUE;

		error = checkdirs(vp, ctx);
		if (error != 0) {
			/* Unmount the filesystem as cdir/rdirs cannot be updated */
			goto out4;
		}
		/*
		 * there is no cleanup code here so I have made it void
		 * we need to revisit this
		 */
		(void)VFS_START(mp, 0, ctx);

		if (mount_list_add(mp) != 0) {
			/*
			 * The system is shutting down trying to umount
			 * everything, so fail with a plausible errno.
			 */
			error = EBUSY;
			goto out4;
		}
		lck_rw_done(&mp->mnt_rwlock);
		is_rwlock_locked = FALSE;

		/* Check if this mounted file system supports EAs or named streams. */
		/* Skip WebDAV file systems for now since they hang in VFS_GETATTR here. */
		VFSATTR_INIT(&vfsattr);
		VFSATTR_WANTED(&vfsattr, f_capabilities);
		if (strncmp(mp->mnt_vfsstat.f_fstypename, "webdav", sizeof("webdav")) != 0 &&
		    vfs_getattr(mp, &vfsattr, ctx) == 0 &&
		    VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR) &&
			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR)) {
				mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
			}
#if NAMEDSTREAMS
			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS) &&
			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS)) {
				mp->mnt_kern_flag |= MNTK_NAMED_STREAMS;
			}
#endif
			/* Check if this file system supports path from id lookups. */
			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID) &&
			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID)) {
				mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
			} else if (mp->mnt_flag & MNT_DOVOLFS) {
				/* Legacy MNT_DOVOLFS flag also implies path from id lookups. */
				mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
			}

			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS) &&
			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS)) {
				mp->mnt_kern_flag |= MNTK_DIR_HARDLINKS;
			}
		}
		if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSNATIVEXATTR) {
			mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
		}
		if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSPREFLIGHT) {
			mp->mnt_kern_flag |= MNTK_UNMOUNT_PREFLIGHT;
		}
		/* Get subtype if supported to cache it */
		VFSATTR_INIT(&vfsattr);
		VFSATTR_WANTED(&vfsattr, f_fssubtype);
		if (vfs_getattr(mp, &vfsattr, ctx) == 0 && VFSATTR_IS_SUPPORTED(&vfsattr, f_fssubtype)) {
			mp->mnt_vfsstat.f_fssubtype = vfsattr.f_fssubtype;
		}

		/* increment the operations count */
		OSAddAtomic(1, &vfs_nummntops);
		enablequotas(mp, ctx);

		if (device_vnode) {
			vfs_setmountedon(device_vnode);
		}

		/* Now that mount is setup, notify the listeners */
		vfs_notify_mount(pvp);
		IOBSDMountChange(mp, kIOMountChangeMount);
#if CONFIG_MACF
		mac_mount_notify_mount(ctx, mp);
#endif /* CONFIG_MACF */
	} else {
		/* If we fail a fresh mount, there should be no vnodes left hooked into the mountpoint. */
		if (mp->mnt_vnodelist.tqh_first != NULL) {
			panic("mount_common(): mount of %s filesystem failed with %d, but vnode list is not empty.",
			    mp->mnt_vtable->vfc_name, error);
		}

		/* Undo the in-progress marking and wake anyone waiting on it. */
		vnode_lock_spin(vp);
		CLR(vp->v_flag, VMOUNT);
		/* Wakeup waiter(s) waiting for in-progress mount to finish. */
		wakeup(&vp->v_flag);
		vnode_unlock(vp);
		mount_list_lock();
		mp->mnt_vtable->vfc_refcount--;
		mount_list_unlock();

		if (device_vnode) {
			vnode_rele(device_vnode);
			VNOP_CLOSE(device_vnode, ronly ? FREAD : FREAD | FWRITE, ctx);
			vfs_clearmounting(device_vnode);
		}
		lck_rw_done(&mp->mnt_rwlock);
		is_rwlock_locked = FALSE;

		if (nc_smr_enabled) {
			vfs_smr_synchronize();
		}

		/*
		 * if we get here, we have a mount structure that needs to be freed,
		 * but since the coveredvp hasn't yet been updated to point at it,
		 * no need to worry about other threads holding a crossref on this mp
		 * so it's ok to just free it
		 */
		mount_lock_destroy(mp);
#if CONFIG_MACF
		mac_mount_label_destroy(mp);
#endif
		zfree(mount_zone, mp);
		/* mp is gone; prevent the 'exit' path from touching its lflag. */
		did_set_lmount = false;
	}
exit:
	/*
	 * drop I/O count on the device vp if there was one
	 */
	if (devpath && devvp) {
		vnode_put(devvp);
	}

	if (did_set_lmount) {
		mount_lock_spin(mp);
		mp->mnt_lflag &= ~MNT_LMOUNT;
		mount_unlock(mp);
	}

	return error;

	/* Error condition exits */
out4:
	/* Mount succeeded but late setup failed: force it back out. */
	(void)VFS_UNMOUNT(mp, MNT_FORCE, ctx);

	/*
	 * If the mount has been placed on the covered vp,
	 * it may have been discovered by now, so we have
	 * to treat this just like an unmount
	 */
	mount_lock_spin(mp);
	mp->mnt_lflag |= MNT_LDEAD;
	mount_unlock(mp);

	if (device_vnode != NULLVP) {
		vnode_rele(device_vnode);
		VNOP_CLOSE(device_vnode, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD | FWRITE,
		    ctx);
		vfs_clearmounting(device_vnode);
		did_rele = TRUE;
	}

	vnode_lock_spin(vp);

	mp->mnt_crossref++;
	CLR(vp->v_flag, VMOUNTEDHERE);
	vp->v_mountedhere = (mount_t) 0;

	vnode_unlock(vp);

	if (have_usecount) {
		vnode_rele(vp);
	}
out3:
	/* Release the device usecount unless out4 already did (did_rele). */
	if (devpath && ((flags & MNT_UPDATE) == 0) && (!did_rele)) {
		vnode_rele(devvp);
		vfs_clearmounting(devvp);
	}
out2:
	/* Drop the iocount taken by the device namei(). */
	if (devpath && devvp) {
		vnode_put(devvp);
	}
out1:
	/* Release mnt_rwlock only when it was taken */
	if (is_rwlock_locked == TRUE) {
		if (flag_set) {
			mp->mnt_flag = flag; /* restore mnt_flag value */
		}
		lck_rw_done(&mp->mnt_rwlock);
	}

	if (did_set_lmount) {
		mount_lock_spin(mp);
		mp->mnt_lflag &= ~MNT_LMOUNT;
		mount_unlock(mp);
	}

	if (did_set_vmount) {
		vnode_lock_spin(vp);
		CLR(vp->v_flag, VMOUNT);
		/* Wakeup waiter(s) waiting for in-progress mount to finish. */
		wakeup(&vp->v_flag);
		vnode_unlock(vp);
	}

	if (mntalloc) {
		if (mp->mnt_crossref) {
			/* Someone may still hold a crossref; defer the free. */
			mount_dropcrossref(mp, vp, 0);
		} else {
			if (nc_smr_enabled) {
				vfs_smr_synchronize();
			}

			mount_lock_destroy(mp);
#if CONFIG_MACF
			mac_mount_label_destroy(mp);
#endif
			zfree(mount_zone, mp);
		}
	}
	if (vfsp_ref) {
		mount_list_lock();
		vfsp->vfc_refcount--;
		mount_list_unlock();
	}

	return error;
}
2077
2078 /*
2079 * Flush in-core data, check for competing mount attempts,
2080 * and set VMOUNT
2081 */
int
prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, uint32_t internal_flags)
{
#if !CONFIG_MACF
#pragma unused(cnp,fsname)
#endif
	struct vnode_attr va;
	int error;
	/* Decode the internal mount flags that modify the checks below. */
	boolean_t skip_auth = !!(internal_flags & KERNEL_MOUNT_NOAUTH);
	boolean_t is_fmount = !!(internal_flags & KERNEL_MOUNT_FMOUNT);
	boolean_t is_kmount = !!(internal_flags & KERNEL_MOUNT_KMOUNT);

	if (!skip_auth) {
		/*
		 * If the user is not root, ensure that they own the directory
		 * onto which we are attempting to mount.
		 */
		VATTR_INIT(&va);
		VATTR_WANTED(&va, va_uid);
		if ((error = vnode_getattr(vp, &va, ctx)) ||
		    (va.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
		    (!vfs_context_issuser(ctx)))) {
			error = EPERM;
			goto out;
		}
	}

	/* Push dirty data and invalidate cached buffers on the covered vnode. */
	if ((error = VNOP_FSYNC(vp, MNT_WAIT, ctx))) {
		goto out;
	}

	if ((error = buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0))) {
		goto out;
	}

	/* Can only mount on top of a directory. */
	if (vp->v_type != VDIR) {
		error = ENOTDIR;
		goto out;
	}

	vnode_lock_spin(vp);

	if (is_fmount && (ISSET(vp->v_flag, VMOUNT) || (vp->v_mountedhere != NULL))) {
		/* fmount(): never wait for a competing mount, just fail busy. */
		error = EBUSY;
	} else if (!is_kmount && (ISSET(vp->v_flag, VMOUNT) ||
	    (vp->v_mountedhere != NULL))) {
		/*
		 * For mount triggered from mount() call, we want to wait for the
		 * current in-progress mount to complete, redo lookup and retry the
		 * mount again. Similarly, we also want to retry if we lost the race
		 * due to concurrent mounts and the 'VMOUNT' flag has been cleared and
		 * 'v_mountedhere' has been planted after initial lookup.
		 */
		if (ISSET(vp->v_flag, VMOUNT)) {
			vnode_lock_convert(vp);
			msleep(&vp->v_flag, &vp->v_lock, PVFS, "vnode_waitformount", NULL);
		}
		error = EBUSY;
	} else if (ISSET(vp->v_flag, VMOUNT) && (vp->v_mountedhere != NULL)) {
		/* Kernel-initiated mount racing another in-progress mount. */
		error = EBUSY;
	}

	if (error) {
		vnode_unlock(vp);
		goto out;
	}
	/* Claim the covered vnode: VMOUNT marks a mount-in-progress on it. */
	SET(vp->v_flag, VMOUNT);
	vnode_unlock(vp);

#if CONFIG_MACF
	error = mac_mount_check_mount(ctx, vp,
	    cnp, fsname);
	if (error != 0) {
		/* MAC veto: undo the VMOUNT claim taken above and wake waiters. */
		vnode_lock_spin(vp);
		CLR(vp->v_flag, VMOUNT);
		/* Wakeup waiter(s) waiting for in-progress mount to finish. */
		wakeup(&vp->v_flag);
		vnode_unlock(vp);
	}
#endif

out:
	return error;
}
2166
2167 #if CONFIG_IMGSRC_ACCESS
2168
2169 #define DEBUG_IMGSRC 0
2170
2171 #if DEBUG_IMGSRC
2172 #define IMGSRC_DEBUG(args...) printf("imgsrc: " args)
2173 #else
2174 #define IMGSRC_DEBUG(args...) do { } while(0)
2175 #endif
2176
2177 static int
authorize_devpath_and_update_mntfromname(mount_t mp,user_addr_t devpath,vnode_t * devvpp,vfs_context_t ctx)2178 authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx)
2179 {
2180 struct nameidata nd;
2181 vnode_t vp, realdevvp;
2182 kauth_action_t accessmode;
2183 int error;
2184 enum uio_seg uio = UIO_USERSPACE;
2185
2186 if (ctx == vfs_context_kernel()) {
2187 uio = UIO_SYSSPACE;
2188 }
2189
2190 NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW, uio, devpath, ctx);
2191 if ((error = namei(&nd))) {
2192 IMGSRC_DEBUG("namei() failed with %d\n", error);
2193 return error;
2194 }
2195
2196 vp = nd.ni_vp;
2197
2198 if (!vnode_isblk(vp)) {
2199 IMGSRC_DEBUG("Not block device.\n");
2200 error = ENOTBLK;
2201 goto out;
2202 }
2203
2204 realdevvp = mp->mnt_devvp;
2205 if (realdevvp == NULLVP) {
2206 IMGSRC_DEBUG("No device backs the mount.\n");
2207 error = ENXIO;
2208 goto out;
2209 }
2210
2211 error = vnode_getwithref(realdevvp);
2212 if (error != 0) {
2213 IMGSRC_DEBUG("Coudn't get iocount on device.\n");
2214 goto out;
2215 }
2216
2217 if (vnode_specrdev(vp) != vnode_specrdev(realdevvp)) {
2218 IMGSRC_DEBUG("Wrong dev_t.\n");
2219 error = ENXIO;
2220 goto out1;
2221 }
2222
2223 strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);
2224
2225 /*
2226 * If mount by non-root, then verify that user has necessary
2227 * permissions on the device.
2228 */
2229 if (!vfs_context_issuser(ctx)) {
2230 accessmode = KAUTH_VNODE_READ_DATA;
2231 if ((mp->mnt_flag & MNT_RDONLY) == 0) {
2232 accessmode |= KAUTH_VNODE_WRITE_DATA;
2233 }
2234 if ((error = vnode_authorize(vp, NULL, accessmode, ctx)) != 0) {
2235 IMGSRC_DEBUG("Access denied.\n");
2236 goto out1;
2237 }
2238 }
2239
2240 *devvpp = vp;
2241
2242 out1:
2243 vnode_put(realdevvp);
2244
2245 out:
2246 nameidone(&nd);
2247
2248 if (error) {
2249 vnode_put(vp);
2250 }
2251
2252 return error;
2253 }
2254
2255 /*
2256 * Clear VMOUNT, set v_mountedhere, and mnt_vnodecovered, ref the vnode,
2257 * and call checkdirs()
2258 */
static int
place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx)
{
	int error;

	mp->mnt_vnodecovered = vp; /* XXX This is normally only set at init-time ... */

	IMGSRC_DEBUG("placing: fsname = %s, vp = %s\n",
	    mp->mnt_vtable->vfc_name, vnode_getname(vp));

	/* Atomically swap the in-progress marker for the real mount linkage. */
	vnode_lock_spin(vp);
	CLR(vp->v_flag, VMOUNT);
	vp->v_mountedhere = mp;
	SET(vp->v_flag, VMOUNTEDHERE);
	/* Wakeup waiter(s) waiting for in-progress mount to finish. */
	wakeup(&vp->v_flag);
	vnode_unlock(vp);

	/*
	 * taking the name_cache_lock exclusively will
	 * insure that everyone is out of the fast path who
	 * might be trying to use a now stale copy of
	 * vp->v_mountedhere->mnt_realrootvp
	 * bumping mount_generation causes the cached values
	 * to be invalidated
	 */
	name_cache_lock();
	mount_generation++;
	name_cache_unlock();

	/* Hold a usecount on the covered vnode for the lifetime of the mount. */
	error = vnode_ref(vp);
	if (error != 0) {
		goto out;
	}

	error = checkdirs(vp, ctx);
	if (error != 0) {
		/* Unmount the filesystem as cdir/rdirs cannot be updated */
		vnode_rele(vp);
		goto out;
	}

out:
	/* On failure, undo the covered-vnode linkage set at entry. */
	if (error != 0) {
		mp->mnt_vnodecovered = NULLVP;
	}
	return error;
}
2307
/*
 * Undo place_mount_and_checkdirs(): drop the usecount taken on the covered
 * vnode, clear its mount linkage and flags, and wake any mount waiters.
 */
static void
undo_place_on_covered_vp(mount_t mp, vnode_t vp)
{
	vnode_rele(vp);
	vnode_lock_spin(vp);
	CLR(vp->v_flag, (VMOUNT | VMOUNTEDHERE));
	vp->v_mountedhere = (mount_t)NULL;
	/* Wakeup waiter(s) waiting for in-progress mount to finish. */
	wakeup(&vp->v_flag);
	vnode_unlock(vp);

	mp->mnt_vnodecovered = NULLVP;
}
2321
2322 static int
mount_begin_update(mount_t mp,vfs_context_t ctx,int flags)2323 mount_begin_update(mount_t mp, vfs_context_t ctx, int flags)
2324 {
2325 int error;
2326
2327 /* unmount in progress return error */
2328 mount_lock_spin(mp);
2329 if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
2330 mount_unlock(mp);
2331 return EBUSY;
2332 }
2333 mount_unlock(mp);
2334 lck_rw_lock_exclusive(&mp->mnt_rwlock);
2335
2336 /*
2337 * We only allow the filesystem to be reloaded if it
2338 * is currently mounted read-only.
2339 */
2340 if ((flags & MNT_RELOAD) &&
2341 ((mp->mnt_flag & MNT_RDONLY) == 0)) {
2342 error = ENOTSUP;
2343 goto out;
2344 }
2345
2346 /*
2347 * Only root, or the user that did the original mount is
2348 * permitted to update it.
2349 */
2350 if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
2351 (!vfs_context_issuser(ctx))) {
2352 error = EPERM;
2353 goto out;
2354 }
2355 #if CONFIG_MACF
2356 error = mac_mount_check_remount(ctx, mp, flags);
2357 if (error != 0) {
2358 goto out;
2359 }
2360 #endif
2361
2362 out:
2363 if (error) {
2364 lck_rw_done(&mp->mnt_rwlock);
2365 }
2366
2367 return error;
2368 }
2369
/*
 * Release the mount rwlock taken by a successful mount_begin_update().
 */
static void
mount_end_update(mount_t mp)
{
	lck_rw_done(&mp->mnt_rwlock);
}
2375
2376 static int
get_imgsrc_rootvnode(uint32_t height,vnode_t * rvpp)2377 get_imgsrc_rootvnode(uint32_t height, vnode_t *rvpp)
2378 {
2379 vnode_t vp;
2380
2381 if (height >= MAX_IMAGEBOOT_NESTING) {
2382 return EINVAL;
2383 }
2384
2385 vp = imgsrc_rootvnodes[height];
2386 if ((vp != NULLVP) && (vnode_get(vp) == 0)) {
2387 *rvpp = vp;
2388 return 0;
2389 } else {
2390 return ENOENT;
2391 }
2392 }
2393
/*
 * Relocate the imageboot source filesystem (at the nesting level named in
 * the caller-supplied args) so that it is mounted on 'vp' instead of its
 * original location.  Caller must be root; a filesystem may only be moved
 * once.  Uses the goto-cleanup ladder out3..out0 to unwind partial state.
 */
static int
relocate_imageboot_source(vnode_t pvp, vnode_t vp,
    struct componentname *cnp, const char *fsname, vfs_context_t ctx,
    boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index)
{
	int error;
	mount_t mp;
	boolean_t placed = FALSE;
	struct vfstable *vfsp;
	user_addr_t devpath;
	char *old_mntonname;
	vnode_t rvp;
	vnode_t devvp;
	uint32_t height;
	uint32_t flags;

	/* If we didn't imageboot, nothing to move */
	if (imgsrc_rootvnodes[0] == NULLVP) {
		return EINVAL;
	}

	/* Only root can do this */
	if (!vfs_context_issuser(ctx)) {
		return EPERM;
	}

	IMGSRC_DEBUG("looking for root vnode.\n");

	/*
	 * Get root vnode of filesystem we're moving.
	 */
	if (by_index) {
		/* New-style args: explicit nesting height + flags + devpath. */
		if (is64bit) {
			struct user64_mnt_imgsrc_args mia64;
			error = copyin(fsmountargs, &mia64, sizeof(mia64));
			if (error != 0) {
				IMGSRC_DEBUG("Failed to copy in arguments.\n");
				return error;
			}

			height = mia64.mi_height;
			flags = mia64.mi_flags;
			devpath = (user_addr_t)mia64.mi_devpath;
		} else {
			struct user32_mnt_imgsrc_args mia32;
			error = copyin(fsmountargs, &mia32, sizeof(mia32));
			if (error != 0) {
				IMGSRC_DEBUG("Failed to copy in arguments.\n");
				return error;
			}

			height = mia32.mi_height;
			flags = mia32.mi_flags;
			devpath = mia32.mi_devpath;
		}
	} else {
		/*
		 * For binary compatibility--assumes one level of nesting.
		 */
		if (is64bit) {
			if ((error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath)))) {
				return error;
			}
		} else {
			user32_addr_t tmp;
			if ((error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp)))) {
				return error;
			}

			/* munge into LP64 addr */
			devpath = CAST_USER_ADDR_T(tmp);
		}

		height = 0;
		flags = 0;
	}

	/* No flag bits are defined yet; reject anything nonzero. */
	if (flags != 0) {
		IMGSRC_DEBUG("%s: Got nonzero flags.\n", __FUNCTION__);
		return EINVAL;
	}

	/* Takes an iocount on rvp; released at out0 / on the success path. */
	error = get_imgsrc_rootvnode(height, &rvp);
	if (error != 0) {
		IMGSRC_DEBUG("getting old root vnode failed with %d\n", error);
		return error;
	}

	IMGSRC_DEBUG("got old root vnode\n");

	old_mntonname = zalloc_flags(ZV_NAMEI, Z_WAITOK);

	/* Can only move once */
	mp = vnode_mount(rvp);
	if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
		IMGSRC_DEBUG("Already moved.\n");
		error = EBUSY;
		goto out0;
	}

	IMGSRC_DEBUG("moving rvp: fsname = %s\n", mp->mnt_vtable->vfc_name);
	IMGSRC_DEBUG("Starting updated.\n");

	/* Get exclusive rwlock on mount, authorize update on mp */
	error = mount_begin_update(mp, ctx, 0);
	if (error != 0) {
		IMGSRC_DEBUG("Starting updated failed with %d\n", error);
		goto out0;
	}

	/*
	 * It can only be moved once. Flag is set under the rwlock,
	 * so we're now safe to proceed.
	 */
	if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
		/*
		 * NOTE(review): 'error' is 0 here, so this path returns
		 * success, while the unlocked HAS_MOVED check above returns
		 * EBUSY — confirm this asymmetry is intentional.
		 */
		IMGSRC_DEBUG("Already moved [2]\n");
		goto out1;
	}

	IMGSRC_DEBUG("Preparing coveredvp.\n");

	/* Mark covered vnode as mount in progress, authorize placing mount on top */
	error = prepare_coveredvp(vp, ctx, cnp, fsname, 0);
	if (error != 0) {
		IMGSRC_DEBUG("Preparing coveredvp failed with %d.\n", error);
		goto out1;
	}

	IMGSRC_DEBUG("Covered vp OK.\n");

	/* Sanity check the name caller has provided */
	vfsp = mp->mnt_vtable;
	if (strncmp(vfsp->vfc_name, fsname, MFSNAMELEN) != 0) {
		IMGSRC_DEBUG("Wrong fs name: actual = %s, expected = %s\n",
		    vfsp->vfc_name, fsname);
		error = EINVAL;
		goto out2;
	}

	/* Check the device vnode and update mount-from name, for local filesystems */
	if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) {
		IMGSRC_DEBUG("Local, doing device validation.\n");

		if (devpath != USER_ADDR_NULL) {
			error = authorize_devpath_and_update_mntfromname(mp, devpath, &devvp, ctx);
			if (error) {
				IMGSRC_DEBUG("authorize_devpath_and_update_mntfromname() failed.\n");
				goto out2;
			}

			/* The iocount was only needed for validation; drop it. */
			vnode_put(devvp);
		}
	}

	/*
	 * Place mp on top of vnode, ref the vnode, call checkdirs(),
	 * and increment the name cache's mount generation
	 */

	IMGSRC_DEBUG("About to call place_mount_and_checkdirs().\n");
	error = place_mount_and_checkdirs(mp, vp, ctx);
	if (error != 0) {
		goto out2;
	}

	placed = TRUE;

	/* Save the old mount-on name so it can be restored at out3. */
	strlcpy(old_mntonname, mp->mnt_vfsstat.f_mntonname, MAXPATHLEN);
	strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);

	/* Forbid future moves */
	mount_lock(mp);
	mp->mnt_kern_flag |= MNTK_HAS_MOVED;
	mount_unlock(mp);

	/* Finally, add to mount list, completely ready to go */
	if (mount_list_add(mp) != 0) {
		/*
		 * The system is shutting down trying to umount
		 * everything, so fail with a plausible errno.
		 */
		error = EBUSY;
		goto out3;
	}

	mount_end_update(mp);
	vnode_put(rvp);
	zfree(ZV_NAMEI, old_mntonname);

	vfs_notify_mount(pvp);
#if CONFIG_MACF
	mac_mount_notify_mount(ctx, mp);
#endif /* CONFIG_MACF */

	return 0;
out3:
	/* Undo the rename and the HAS_MOVED flag set just above. */
	strlcpy(mp->mnt_vfsstat.f_mntonname, old_mntonname, MAXPATHLEN);

	mount_lock(mp);
	mp->mnt_kern_flag &= ~(MNTK_HAS_MOVED);
	mount_unlock(mp);

out2:
	/*
	 * Placing the mp on the vnode clears VMOUNT,
	 * so cleanup is different after that point
	 */
	if (placed) {
		/* Rele the vp, clear VMOUNT and v_mountedhere */
		undo_place_on_covered_vp(mp, vp);
	} else {
		vnode_lock_spin(vp);
		CLR(vp->v_flag, VMOUNT);
		/* Wakeup waiter(s) waiting for in-progress mount to finish. */
		wakeup(&vp->v_flag);
		vnode_unlock(vp);
	}
out1:
	mount_end_update(mp);

out0:
	vnode_put(rvp);
	zfree(ZV_NAMEI, old_mntonname);
	return error;
}
2619
2620 #endif /* CONFIG_IMGSRC_ACCESS */
2621
/*
 * Turn on disk quotas for each quota type whose "ops" trigger file exists
 * under the mount point.  Currently HFS-only; errors are ignored so a
 * quota problem never fails the mount itself.
 */
void
enablequotas(struct mount *mp, vfs_context_t ctx)
{
	struct nameidata qnd;
	int type;
	char qfpath[MAXPATHLEN];
	const char *qfname = QUOTAFILENAME;
	const char *qfopsname = QUOTAOPSNAME;
	const char *qfextension[] = INITQFNAMES;

	/* XXX Should be an MNTK_ flag, instead of strncmp()'s */
	if (strncmp(mp->mnt_vfsstat.f_fstypename, "hfs", sizeof("hfs")) != 0) {
		return;
	}
	/*
	 * Enable filesystem disk quotas if necessary.
	 * We ignore errors as this should not interfere with final mount
	 */
	for (type = 0; type < MAXQUOTAS; type++) {
		/* Probe for the per-type quota "ops" trigger file. */
		snprintf(qfpath, sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfopsname, qfextension[type]);
		NDINIT(&qnd, LOOKUP, OP_MOUNT, FOLLOW, UIO_SYSSPACE,
		    CAST_USER_ADDR_T(qfpath), ctx);
		if (namei(&qnd) != 0) {
			continue; /* option file to trigger quotas is not present */
		}
		vnode_put(qnd.ni_vp);
		nameidone(&qnd);
		/* Trigger present: enable quotas using the data-file path. */
		snprintf(qfpath, sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfname, qfextension[type]);

		(void) VFS_QUOTACTL(mp, QCMD(Q_QUOTAON, type), 0, qfpath, ctx);
	}
	return;
}
2655
2656
/*
 * Per-process callback for checkdirs(): if this process's current or root
 * directory is the just-covered vnode 'olddp', replace it with 'newdp'
 * (the root of the newly-mounted filesystem), fixing up vnode usecounts.
 */
static int
checkdirs_callback(proc_t p, void * arg)
{
	struct cdirargs *cdrp = (struct cdirargs *)arg;
	vnode_t olddp = cdrp->olddp;
	vnode_t newdp = cdrp->newdp;
	struct filedesc *fdp = &p->p_fd;
	/* new_* start non-NULL and are cleared when their ref is consumed. */
	vnode_t new_cvp = newdp;
	vnode_t new_rvp = newdp;
	vnode_t old_cvp = NULL;
	vnode_t old_rvp = NULL;

	/*
	 * XXX Also needs to iterate each thread in the process to see if it
	 * XXX is using a per-thread current working directory, and, if so,
	 * XXX update that as well.
	 */

	/*
	 * First, with the proc_fdlock held, check to see if we will need
	 * to do any work.  If not, we will get out fast.
	 */
	proc_fdlock(p);
	if (fdp->fd_cdir != olddp && fdp->fd_rdir != olddp) {
		proc_fdunlock(p);
		return PROC_RETURNED;
	}
	proc_fdunlock(p);

	/*
	 * Ok, we will have to do some work.  Always take two refs
	 * because we might need that many.  We'll dispose of whatever
	 * we ended up not using.
	 */
	if (vnode_ref(newdp) != 0) {
		return PROC_RETURNED;
	}
	if (vnode_ref(newdp) != 0) {
		vnode_rele(newdp);
		return PROC_RETURNED;
	}

	proc_dirs_lock_exclusive(p);
	/*
	 * Now do the work.  Note: we dropped the proc_fdlock, so we
	 * have to do all of the checks again.
	 */
	proc_fdlock(p);
	if (fdp->fd_cdir == olddp) {
		old_cvp = olddp;
		fdp->fd_cdir = newdp;
		new_cvp = NULL;		/* ref consumed by fd_cdir */
	}
	if (fdp->fd_rdir == olddp) {
		old_rvp = olddp;
		fdp->fd_rdir = newdp;
		new_rvp = NULL;		/* ref consumed by fd_rdir */
	}
	proc_fdunlock(p);
	proc_dirs_unlock_exclusive(p);

	/*
	 * Dispose of any references that are no longer needed.
	 */
	if (old_cvp != NULL) {
		vnode_rele(old_cvp);
	}
	if (old_rvp != NULL) {
		vnode_rele(old_rvp);
	}
	if (new_cvp != NULL) {
		vnode_rele(new_cvp);
	}
	if (new_rvp != NULL) {
		vnode_rele(new_rvp);
	}

	return PROC_RETURNED;
}
2736
2737
2738
2739 /*
2740 * Scan all active processes to see if any of them have a current
2741 * or root directory onto which the new filesystem has just been
2742 * mounted. If so, replace them with the new mount point.
2743 */
static int
checkdirs(vnode_t olddp, vfs_context_t ctx)
{
	vnode_t newdp;
	vnode_t tvp;
	int err;
	struct cdirargs cdr;

	/* If we hold the only usecount, no process can be referencing olddp. */
	if (olddp->v_usecount == 1) {
		return 0;
	}
	/* Get the root of the filesystem now mounted over olddp. */
	err = VFS_ROOT(olddp->v_mountedhere, &newdp, ctx);

	if (err != 0) {
#if DIAGNOSTIC
		panic("mount: lost mount: error %d", err);
#endif
		return err;
	}

	cdr.olddp = olddp;
	cdr.newdp = newdp;
	/* do not block for exec/fork trans as the vp in cwd & rootdir are not changing */
	proc_iterate(PROC_ALLPROCLIST | PROC_NOWAITTRANS, checkdirs_callback, (void *)&cdr, NULL, NULL);

	/* If the system root itself was covered, swap it under the rw lock. */
	if (rootvnode == olddp) {
		vnode_ref(newdp);
		lck_rw_lock_exclusive(&rootvnode_rw_lock);
		tvp = rootvnode;
		rootvnode = newdp;
		lck_rw_unlock_exclusive(&rootvnode_rw_lock);
		vnode_rele(tvp);
	}

	vnode_put(newdp);
	return 0;
}
2781
2782 #define ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT \
2783 "com.apple.private.vfs.role-account-unmount"
2784 #define SYSTEM_VOLUME_UNMOUNT_ENTITLEMENT \
2785 "com.apple.private.vfs.system-volume-unmount"
2786
2787 /*
2788 * Unmount a file system.
2789 *
2790 * Note: unmount takes a path to the vnode mounted on as argument,
2791 * not special file (as before).
2792 */
2793 /* ARGSUSED */
int
unmount(__unused proc_t p, struct unmount_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	struct mount *mp;
	int flags = uap->flags;
	int error;
	struct nameidata nd;
	vfs_context_t ctx;

	/*
	 * If the process has the entitlement, use the kernel's context when
	 * performing lookup on the mount path as the process might lack proper
	 * permission to access the directory.
	 */
	ctx = IOCurrentTaskHasEntitlement(ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT) ?
	    vfs_context_kernel() : vfs_context_current();

	NDINIT(&nd, LOOKUP, OP_UNMOUNT, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	if (flags & MNT_NOFOLLOW) {
		/* Caller asked that symlinks not be followed anywhere in the path. */
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}

	error = namei(&nd);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;
	mp = vp->v_mount;
	nameidone(&nd);

	/*
	 * Must be the root of the filesystem
	 */
	if ((vp->v_flag & VROOT) == 0) {
		vnode_put(vp);
		return EINVAL;
	}
#if CONFIG_MACF
	error = mac_mount_check_umount(ctx, mp);
	if (error != 0) {
		vnode_put(vp);
		return error;
	}
#endif
	/* Take a mount ref so 'mp' stays valid after the iocount is dropped. */
	mount_ref(mp, 0);
	vnode_put(vp);
	/* safedounmount consumes the mount ref */
	return safedounmount(mp, flags, ctx);
}
2845
/*
 * Unmount by file descriptor: like unmount(), but the filesystem root is
 * identified by an open fd instead of a path.
 */
int
funmount(__unused proc_t p, struct funmount_args *uap, __unused int32_t *retval)
{
	int error;
	vnode_t vp;
	struct mount *mp;
	vfs_context_t ctx;

	AUDIT_ARG(fd, uap->fd);
	AUDIT_ARG(fflags, uap->flags);

	/*
	 * If the process has the entitlement, use the kernel's context when
	 * performing lookup on the mount path as the process might lack proper
	 * permission to access the directory.
	 */
	ctx = IOCurrentTaskHasEntitlement(ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT) ?
	    vfs_context_kernel() : vfs_context_current();

	error = vnode_getfromfd(ctx, uap->fd, &vp);
	if (error) {
		return error;
	}

	/*
	 * Must be the root of the filesystem
	 */
	if ((vp->v_flag & VROOT) == 0) {
		vnode_put(vp);
		return EINVAL;
	}
	mp = vnode_mount(vp);

#if CONFIG_MACF
	error = mac_mount_check_umount(ctx, mp);
	if (error != 0) {
		vnode_put(vp);
		return error;
	}
#endif
	/* Take a mount ref so 'mp' stays valid after the iocount is dropped. */
	mount_ref(mp, 0);
	vnode_put(vp);

	/* safedounmount consumes the mount ref */
	return safedounmount(mp, uap->flags, ctx);
}
2892
/*
 * Unmount the filesystem identified by 'fsid'.
 * Returns ENOENT if no mounted filesystem matches.
 */
int
vfs_unmountbyfsid(fsid_t *fsid, int flags, vfs_context_t ctx)
{
	mount_t mp;

	/* Lookup returns with an iteration reference held on a match. */
	mp = mount_list_lookupby_fsid(fsid, 0, 1);
	if (mp == (mount_t)0) {
		return ENOENT;
	}
	/* Take a mount ref before dropping the iteration ref so mp stays valid. */
	mount_ref(mp, 0);
	mount_iterdrop(mp);
	/* safedounmount consumes the mount ref */
	return safedounmount(mp, flags, ctx);
}
2907
2908 /*
2909 * The mount struct comes with a mount ref which will be consumed.
2910 * Do the actual file system unmount, prevent some common foot shooting.
2911 */
int
safedounmount(struct mount *mp, int flags, vfs_context_t ctx)
{
	int error;
	proc_t p = vfs_context_proc(ctx);

	/*
	 * If the file system is not responding and MNT_NOBLOCK
	 * is set and not a forced unmount then return EBUSY.
	 */
	if ((mp->mnt_lflag & MNT_LNOTRESP) &&
	    (flags & MNT_NOBLOCK) && ((flags & MNT_FORCE) == 0)) {
		error = EBUSY;
		goto out;
	}

	/*
	 * Skip authorization in two cases:
	 * - If the process running the unmount has ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT.
	 *   This entitlement allows non-root processes unmount volumes mounted by
	 *   other processes.
	 * - If the mount is tagged as permissive and this is not a forced-unmount
	 *   attempt.
	 */
	if (!IOCurrentTaskHasEntitlement(ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT) &&
	    (!(((mp->mnt_kern_flag & MNTK_PERMIT_UNMOUNT) != 0) && ((flags & MNT_FORCE) == 0)))) {
		/*
		 * Only root, or the user that did the original mount is
		 * permitted to unmount this filesystem.
		 */
		if ((mp->mnt_vfsstat.f_owner != kauth_cred_getuid(kauth_cred_get())) &&
		    (error = suser(kauth_cred_get(), &p->p_acflag))) {
			goto out;
		}
	}

	/*
	 * Don't allow unmounting the root file system, or other volumes
	 * associated with it (for example, the associated VM or DATA mounts) .
	 */
	if (mp->mnt_flag & MNT_ROOTFS) {
		error = EBUSY; /* the root is always busy */
		goto out;
	}
	if ((mp->mnt_kern_flag & MNTK_SYSTEM) && !IOCurrentTaskHasEntitlement(SYSTEM_VOLUME_UNMOUNT_ENTITLEMENT)) {
		printf("attempt to unmount a system mount (%s), will return EBUSY\n",
		    mp->mnt_vfsstat.f_mntonname);
		error = EBUSY; /* root-associated volumes are always busy unless caller is entitled */
		goto out;
	}

	/*
	 * If the mount is providing the root filesystem's disk image
	 * (i.e. imageboot), don't allow unmounting
	 */
	if (mp->mnt_kern_flag & MNTK_BACKS_ROOT) {
		error = EBUSY;
		goto out;
	}

	/* All checks passed; dounmount() takes over the mount ref (withref=1). */
	return dounmount(mp, flags, 1, ctx);

out:
	/* On refusal, drop the mount ref the caller handed us. */
	mount_drop(mp, 0);
	return error;
}
2978
2979 /*
2980 * Do the actual file system unmount.
2981 */
2982 int
dounmount(struct mount * mp,int flags,int withref,vfs_context_t ctx)2983 dounmount(struct mount *mp, int flags, int withref, vfs_context_t ctx)
2984 {
2985 vnode_t coveredvp = (vnode_t)0;
2986 int error;
2987 int needwakeup = 0;
2988 int forcedunmount = 0;
2989 int lflags = 0;
2990 struct vnode *devvp = NULLVP;
2991 #if CONFIG_TRIGGERS
2992 proc_t p = vfs_context_proc(ctx);
2993 int did_vflush = 0;
2994 int pflags_save = 0;
2995 #endif /* CONFIG_TRIGGERS */
2996
2997 #if CONFIG_FSE
2998 if (!(flags & MNT_FORCE)) {
2999 fsevent_unmount(mp, ctx); /* has to come first! */
3000 }
3001 #endif
3002
3003 mount_lock(mp);
3004
3005 /*
3006 * If already an unmount in progress just return EBUSY.
3007 * Even a forced unmount cannot override.
3008 */
3009 if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
3010 if (withref != 0) {
3011 mount_drop(mp, 1);
3012 }
3013 mount_unlock(mp);
3014 return EBUSY;
3015 }
3016
3017 if (flags & MNT_FORCE) {
3018 forcedunmount = 1;
3019 mp->mnt_lflag |= MNT_LFORCE;
3020 }
3021
3022 #if CONFIG_TRIGGERS
3023 if (flags & MNT_NOBLOCK && p != kernproc) {
3024 pflags_save = OSBitOrAtomic(P_NOREMOTEHANG, &p->p_flag);
3025 }
3026 #endif
3027
3028 mp->mnt_kern_flag |= MNTK_UNMOUNT;
3029 mp->mnt_lflag |= MNT_LUNMOUNT;
3030 mp->mnt_flag &= ~MNT_ASYNC;
3031 /*
3032 * anyone currently in the fast path that
3033 * trips over the cached rootvp will be
3034 * dumped out and forced into the slow path
3035 * to regenerate a new cached value
3036 */
3037 mp->mnt_realrootvp = NULLVP;
3038 mount_unlock(mp);
3039
3040 if (forcedunmount && (flags & MNT_LNOSUB) == 0) {
3041 /*
3042 * Force unmount any mounts in this filesystem.
3043 * If any unmounts fail - just leave them dangling.
3044 * Avoids recursion.
3045 */
3046 (void) dounmount_submounts(mp, flags | MNT_LNOSUB, ctx);
3047 }
3048
3049 /*
3050 * taking the name_cache_lock exclusively will
3051 * insure that everyone is out of the fast path who
3052 * might be trying to use a now stale copy of
3053 * vp->v_mountedhere->mnt_realrootvp
3054 * bumping mount_generation causes the cached values
3055 * to be invalidated
3056 */
3057 name_cache_lock();
3058 mount_generation++;
3059 name_cache_unlock();
3060
3061 /*
3062 * Make sure there are no one in the mount iterations or lookup.
3063 * Drain makes 'mnt_iterref' -ve so on error exit we need to ensure that
3064 * 'mnt_iterref' is reset back to 0 by calling mount_iterreset().
3065 */
3066 mount_iterdrain(mp);
3067
3068 lck_rw_lock_exclusive(&mp->mnt_rwlock);
3069 if (withref != 0) {
3070 mount_drop(mp, 0);
3071 }
3072 error = 0;
3073 if (forcedunmount == 0) {
3074 ubc_umount(mp); /* release cached vnodes */
3075 if ((mp->mnt_flag & MNT_RDONLY) == 0) {
3076 error = VFS_SYNC(mp, MNT_WAIT, ctx);
3077 if (error) {
3078 mount_iterreset(mp);
3079 mount_lock(mp);
3080 mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
3081 mp->mnt_lflag &= ~MNT_LUNMOUNT;
3082 mp->mnt_lflag &= ~MNT_LFORCE;
3083 goto out;
3084 }
3085 }
3086 }
3087
3088 IOBSDMountChange(mp, kIOMountChangeUnmount);
3089
3090 #if CONFIG_TRIGGERS
3091 vfs_nested_trigger_unmounts(mp, flags, ctx);
3092 did_vflush = 1;
3093 #endif
3094 if (forcedunmount) {
3095 lflags |= FORCECLOSE;
3096 }
3097 error = vflush(mp, NULLVP, SKIPSWAP | SKIPSYSTEM | SKIPROOT | lflags);
3098 if ((forcedunmount == 0) && error) {
3099 mount_iterreset(mp);
3100 mount_lock(mp);
3101 mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
3102 mp->mnt_lflag &= ~MNT_LUNMOUNT;
3103 mp->mnt_lflag &= ~MNT_LFORCE;
3104 goto out;
3105 }
3106
3107 error = VFS_UNMOUNT(mp, flags, ctx);
3108 if (error) {
3109 mount_iterreset(mp);
3110 mount_lock(mp);
3111 mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
3112 mp->mnt_lflag &= ~MNT_LUNMOUNT;
3113 mp->mnt_lflag &= ~MNT_LFORCE;
3114 goto out;
3115 }
3116
3117 /* increment the operations count */
3118 if (!error) {
3119 OSAddAtomic(1, &vfs_nummntops);
3120 }
3121
3122 if (mp->mnt_devvp && mp->mnt_vtable->vfc_vfsflags & VFC_VFSLOCALARGS) {
3123 /* hold an io reference and drop the usecount before close */
3124 devvp = mp->mnt_devvp;
3125 vnode_getalways(devvp);
3126 vnode_rele(devvp);
3127 VNOP_CLOSE(devvp, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD | FWRITE,
3128 ctx);
3129 vnode_clearmountedon(devvp);
3130 vnode_put(devvp);
3131 }
3132 lck_rw_done(&mp->mnt_rwlock);
3133 mount_list_remove(mp);
3134 lck_rw_lock_exclusive(&mp->mnt_rwlock);
3135
3136 /* mark the mount point hook in the vp but not drop the ref yet */
3137 if ((coveredvp = mp->mnt_vnodecovered) != NULLVP) {
3138 /*
3139 * The covered vnode needs special handling. Trying to get an
3140 * iocount must not block here as this may lead to deadlocks
3141 * if the Filesystem to which the covered vnode belongs is
3142 * undergoing forced unmounts. Since we hold a usecount, the
3143 * vnode cannot be reused (it can, however, still be terminated)
3144 */
3145 vnode_getalways(coveredvp);
3146 vnode_lock_spin(coveredvp);
3147
3148 mp->mnt_crossref++;
3149 coveredvp->v_mountedhere = (struct mount *)0;
3150 CLR(coveredvp->v_flag, VMOUNT | VMOUNTEDHERE);
3151 /* Wakeup waiter(s) waiting for in-progress mount to finish. */
3152 wakeup(&coveredvp->v_flag);
3153 vnode_unlock(coveredvp);
3154 vnode_put(coveredvp);
3155 }
3156
3157 mount_list_lock();
3158 mp->mnt_vtable->vfc_refcount--;
3159 mount_list_unlock();
3160
3161 cache_purgevfs(mp); /* remove cache entries for this file sys */
3162 vfs_event_signal(NULL, VQ_UNMOUNT, (intptr_t)NULL);
3163 mount_lock(mp);
3164 mp->mnt_lflag |= MNT_LDEAD;
3165
3166 if (mp->mnt_lflag & MNT_LWAIT) {
3167 /*
3168 * do the wakeup here
3169 * in case we block in mount_refdrain
3170 * which will drop the mount lock
3171 * and allow anyone blocked in vfs_busy
3172 * to wakeup and see the LDEAD state
3173 */
3174 mp->mnt_lflag &= ~MNT_LWAIT;
3175 wakeup((caddr_t)mp);
3176 }
3177 mount_refdrain(mp);
3178
3179 /* free disk_conditioner_info structure for this mount */
3180 disk_conditioner_unmount(mp);
3181
3182 out:
3183 if (mp->mnt_lflag & MNT_LWAIT) {
3184 mp->mnt_lflag &= ~MNT_LWAIT;
3185 needwakeup = 1;
3186 }
3187
3188 #if CONFIG_TRIGGERS
3189 if (flags & MNT_NOBLOCK && p != kernproc) {
3190 // Restore P_NOREMOTEHANG bit to its previous value
3191 if ((pflags_save & P_NOREMOTEHANG) == 0) {
3192 OSBitAndAtomic(~((uint32_t) P_NOREMOTEHANG), &p->p_flag);
3193 }
3194 }
3195
3196 /*
3197 * Callback and context are set together under the mount lock, and
3198 * never cleared, so we're safe to examine them here, drop the lock,
3199 * and call out.
3200 */
3201 if (mp->mnt_triggercallback != NULL) {
3202 mount_unlock(mp);
3203 if (error == 0) {
3204 mp->mnt_triggercallback(mp, VTC_RELEASE, mp->mnt_triggerdata, ctx);
3205 } else if (did_vflush) {
3206 mp->mnt_triggercallback(mp, VTC_REPLACE, mp->mnt_triggerdata, ctx);
3207 }
3208 } else {
3209 mount_unlock(mp);
3210 }
3211 #else
3212 mount_unlock(mp);
3213 #endif /* CONFIG_TRIGGERS */
3214
3215 lck_rw_done(&mp->mnt_rwlock);
3216
3217 if (needwakeup) {
3218 wakeup((caddr_t)mp);
3219 }
3220
3221 if (!error) {
3222 if ((coveredvp != NULLVP)) {
3223 vnode_t pvp = NULLVP;
3224
3225 /*
3226 * The covered vnode needs special handling. Trying to
3227 * get an iocount must not block here as this may lead
3228 * to deadlocks if the Filesystem to which the covered
3229 * vnode belongs is undergoing forced unmounts. Since we
3230 * hold a usecount, the vnode cannot be reused
3231 * (it can, however, still be terminated).
3232 */
3233 vnode_getalways(coveredvp);
3234
3235 mount_dropcrossref(mp, coveredvp, 0);
3236 /*
3237 * We'll _try_ to detect if this really needs to be
3238 * done. The coveredvp can only be in termination (or
3239 * terminated) if the coveredvp's mount point is in a
3240 * forced unmount (or has been) since we still hold the
3241 * ref.
3242 */
3243 if (!vnode_isrecycled(coveredvp)) {
3244 pvp = vnode_getparent(coveredvp);
3245 #if CONFIG_TRIGGERS
3246 if (coveredvp->v_resolve) {
3247 vnode_trigger_rearm(coveredvp, ctx);
3248 }
3249 #endif
3250 }
3251
3252 vnode_rele(coveredvp);
3253 vnode_put(coveredvp);
3254 coveredvp = NULLVP;
3255
3256 if (pvp) {
3257 lock_vnode_and_post(pvp, NOTE_WRITE);
3258 vnode_put(pvp);
3259 }
3260 } else if (mp->mnt_flag & MNT_ROOTFS) {
3261 if (nc_smr_enabled) {
3262 vfs_smr_synchronize();
3263 }
3264
3265 mount_lock_destroy(mp);
3266 #if CONFIG_MACF
3267 mac_mount_label_destroy(mp);
3268 #endif
3269 zfree(mount_zone, mp);
3270 } else {
3271 panic("dounmount: no coveredvp");
3272 }
3273 }
3274 return error;
3275 }
3276
/*
 * Unmount all mounts nested beneath (submounts of) the given mount.
 *
 * Takes a snapshot of the fsids of every mount whose covered vnode lives
 * on 'mp' (or on another submount already collected), then unmounts them
 * deepest-first.  Errors from individual submount unmounts are ignored.
 */
void
dounmount_submounts(struct mount *mp, int flags, vfs_context_t ctx)
{
	mount_t smp;
	fsid_t *fsids, fsid;
	int fsids_sz;
	int count = 0, i, m = 0;
	vnode_t vp;

	mount_list_lock();

	// Get an array to hold the submounts fsids.
	/* Count every mount; the snapshot array is sized for the worst case. */
	TAILQ_FOREACH(smp, &mountlist, mnt_list)
	count++;
	fsids_sz = count * sizeof(fsid_t);
	/* Z_NOWAIT: must not block while holding the mount list lock. */
	fsids = kalloc_data(fsids_sz, Z_NOWAIT);
	if (fsids == NULL) {
		mount_list_unlock();
		goto out;
	}
	fsids[0] = mp->mnt_vfsstat.f_fsid;      // Prime the pump

	/*
	 * Fill the array with submount fsids.
	 * Since mounts are always added to the tail of the mount list, the
	 * list is always in mount order.
	 * For each mount check if the mounted-on vnode belongs to a
	 * mount that's already added to our array of mounts to be unmounted.
	 */
	for (smp = TAILQ_NEXT(mp, mnt_list); smp; smp = TAILQ_NEXT(smp, mnt_list)) {
		vp = smp->mnt_vnodecovered;
		if (vp == NULL) {
			continue;
		}
		fsid = vnode_mount(vp)->mnt_vfsstat.f_fsid; // Underlying fsid
		for (i = 0; i <= m; i++) {
			if (fsids[i].val[0] == fsid.val[0] &&
			    fsids[i].val[1] == fsid.val[1]) {
				/* Covered vnode is on a doomed mount: this is a submount too. */
				fsids[++m] = smp->mnt_vfsstat.f_fsid;
				break;
			}
		}
	}
	mount_list_unlock();

	// Unmount the submounts in reverse order. Ignore errors.
	for (i = m; i > 0; i--) {
		/* The mount may have gone away since the snapshot; lookup rechecks. */
		smp = mount_list_lookupby_fsid(&fsids[i], 0, 1);
		if (smp) {
			mount_ref(smp, 0);
			mount_iterdrop(smp);
			(void) dounmount(smp, flags, 1, ctx);
		}
	}
out:
	kfree_data(fsids, fsids_sz);
}
3337
3338 void
mount_dropcrossref(mount_t mp,vnode_t dp,int need_put)3339 mount_dropcrossref(mount_t mp, vnode_t dp, int need_put)
3340 {
3341 vnode_hold(dp);
3342 vnode_lock(dp);
3343 mp->mnt_crossref--;
3344
3345 if (mp->mnt_crossref < 0) {
3346 panic("mount cross refs -ve");
3347 }
3348
3349 if ((mp != dp->v_mountedhere) && (mp->mnt_crossref == 0)) {
3350 if (need_put) {
3351 vnode_put_locked(dp);
3352 }
3353 vnode_drop_and_unlock(dp);
3354
3355 if (nc_smr_enabled) {
3356 vfs_smr_synchronize();
3357 }
3358
3359 mount_lock_destroy(mp);
3360 #if CONFIG_MACF
3361 mac_mount_label_destroy(mp);
3362 #endif
3363 zfree(mount_zone, mp);
3364 return;
3365 }
3366 if (need_put) {
3367 vnode_put_locked(dp);
3368 }
3369 vnode_drop_and_unlock(dp);
3370 }
3371
3372
3373 /*
3374 * Sync each mounted filesystem.
3375 */
3376 #if DIAGNOSTIC
3377 int syncprt = 0;
3378 #endif
3379
3380 int print_vmpage_stat = 0;
3381
3382 /*
3383 * sync_callback: simple wrapper that calls VFS_SYNC() on volumes
3384 * mounted read-write with the passed waitfor value.
3385 *
3386 * Parameters: mp mount-point descriptor per mounted file-system instance.
3387 * arg user argument (please see below)
3388 *
3389 * User argument is a pointer to 32 bit unsigned integer which describes the
3390 * type of waitfor value to set for calling VFS_SYNC(). If user argument is
3391 * passed as NULL, VFS_SYNC() is called with MNT_NOWAIT set as the default
3392 * waitfor value.
3393 *
3394 * Returns: VFS_RETURNED
3395 */
3396 static int
sync_callback(mount_t mp,void * arg)3397 sync_callback(mount_t mp, void *arg)
3398 {
3399 if ((mp->mnt_flag & MNT_RDONLY) == 0) {
3400 int asyncflag = mp->mnt_flag & MNT_ASYNC;
3401 unsigned waitfor = MNT_NOWAIT;
3402
3403 if (arg) {
3404 waitfor = *(uint32_t*)arg;
3405 }
3406
3407 /* Sanity check for flags - these are the only valid combinations for the flag bits*/
3408 if (waitfor != MNT_WAIT &&
3409 waitfor != (MNT_WAIT | MNT_VOLUME) &&
3410 waitfor != MNT_NOWAIT &&
3411 waitfor != (MNT_NOWAIT | MNT_VOLUME) &&
3412 waitfor != MNT_DWAIT &&
3413 waitfor != (MNT_DWAIT | MNT_VOLUME)) {
3414 panic("Passed inappropriate waitfor %u to "
3415 "sync_callback()", waitfor);
3416 }
3417
3418 mp->mnt_flag &= ~MNT_ASYNC;
3419 (void)VFS_SYNC(mp, waitfor, vfs_context_kernel());
3420 if (asyncflag) {
3421 mp->mnt_flag |= MNT_ASYNC;
3422 }
3423 }
3424
3425 return VFS_RETURNED;
3426 }
3427
3428 /* ARGSUSED */
3429 int
sync(__unused proc_t p,__unused struct sync_args * uap,__unused int32_t * retval)3430 sync(__unused proc_t p, __unused struct sync_args *uap, __unused int32_t *retval)
3431 {
3432 vfs_iterate(LK_NOWAIT, sync_callback, NULL);
3433
3434 if (print_vmpage_stat) {
3435 vm_countdirtypages();
3436 }
3437
3438 #if DIAGNOSTIC
3439 if (syncprt) {
3440 vfs_bufstats();
3441 }
3442 #endif /* DIAGNOSTIC */
3443 return 0;
3444 }
3445
/*
 * Media-class selector for sync_internal_callback().  "Reliable" media
 * is a local mount that is not a virtual device (see the
 * MNTK_VIRTUALDEV / MNT_LOCAL test in sync_internal_callback()).
 */
typedef enum {
	SYNC_ALL = 0,                   /* sync every mount */
	SYNC_ONLY_RELIABLE_MEDIA = 1,   /* only local, non-virtual devices */
	SYNC_ONLY_UNRELIABLE_MEDIA = 2  /* only virtual or non-local devices */
} sync_type_t;
3451
3452 static int
sync_internal_callback(mount_t mp,void * arg)3453 sync_internal_callback(mount_t mp, void *arg)
3454 {
3455 if (arg) {
3456 int is_reliable = !(mp->mnt_kern_flag & MNTK_VIRTUALDEV) &&
3457 (mp->mnt_flag & MNT_LOCAL);
3458 sync_type_t sync_type = *((sync_type_t *)arg);
3459
3460 if ((sync_type == SYNC_ONLY_RELIABLE_MEDIA) && !is_reliable) {
3461 return VFS_RETURNED;
3462 } else if ((sync_type == SYNC_ONLY_UNRELIABLE_MEDIA) && is_reliable) {
3463 return VFS_RETURNED;
3464 }
3465 }
3466
3467 (void)sync_callback(mp, NULL);
3468
3469 return VFS_RETURNED;
3470 }
3471
3472 int sync_thread_state = 0;
3473 int sync_timeout_seconds = 5;
3474
3475 #define SYNC_THREAD_RUN 0x0001
3476 #define SYNC_THREAD_RUNNING 0x0002
3477
3478 #if CONFIG_PHYS_WRITE_ACCT
3479 thread_t pm_sync_thread;
3480 #endif /* CONFIG_PHYS_WRITE_ACCT */
3481
/*
 * Dedicated worker thread behind sync_internal().  Loops while
 * SYNC_THREAD_RUN remains set (new requests can arrive while a pass is
 * in flight), syncing reliable media first, then unreliable media.
 */
static void
sync_thread(__unused void *arg, __unused wait_result_t wr)
{
	sync_type_t sync_type;
#if CONFIG_PHYS_WRITE_ACCT
	pm_sync_thread = current_thread();
#endif /* CONFIG_PHYS_WRITE_ACCT */

	lck_mtx_lock(&sync_mtx_lck);
	while (sync_thread_state & SYNC_THREAD_RUN) {
		/* Consume the pending request; the lock is dropped while syncing. */
		sync_thread_state &= ~SYNC_THREAD_RUN;
		lck_mtx_unlock(&sync_mtx_lck);

		/* Reliable (local, non-virtual) media first, then the rest. */
		sync_type = SYNC_ONLY_RELIABLE_MEDIA;
		vfs_iterate(LK_NOWAIT, sync_internal_callback, &sync_type);
		sync_type = SYNC_ONLY_UNRELIABLE_MEDIA;
		vfs_iterate(LK_NOWAIT, sync_internal_callback, &sync_type);

		lck_mtx_lock(&sync_mtx_lck);
	}
	/*
	 * This wakeup _has_ to be issued before the lock is released otherwise
	 * we may end up waking up a thread in sync_internal which is
	 * expecting a wakeup from a thread it just created and not from this
	 * thread which is about to exit.
	 */
	wakeup(&sync_thread_state);
	sync_thread_state &= ~SYNC_THREAD_RUNNING;
#if CONFIG_PHYS_WRITE_ACCT
	pm_sync_thread = NULL;
#endif /* CONFIG_PHYS_WRITE_ACCT */
	lck_mtx_unlock(&sync_mtx_lck);

	if (print_vmpage_stat) {
		vm_countdirtypages();
	}

#if DIAGNOSTIC
	if (syncprt) {
		vfs_bufstats();
	}
#endif /* DIAGNOSTIC */
}
3525
3526 struct timeval sync_timeout_last_print = {.tv_sec = 0, .tv_usec = 0};
3527
/*
 * An in-kernel sync for power management to call.
 *
 * Hands the work to the dedicated sync_thread (starting one if none is
 * running) and waits for it, but no longer than sync_timeout_seconds,
 * so this call always returns promptly even if a filesystem is wedged.
 *
 * Always returns 0.
 */
__private_extern__ int
sync_internal(void)
{
	thread_t thd = NULL;
	int error;
	int thread_created = FALSE;
	struct timespec ts = {.tv_sec = sync_timeout_seconds, .tv_nsec = 0};

	lck_mtx_lock(&sync_mtx_lck);
	/* Request (another) sync pass; sync_thread loops while RUN is set. */
	sync_thread_state |= SYNC_THREAD_RUN;
	if (!(sync_thread_state & SYNC_THREAD_RUNNING)) {
		int kr;

		/* Claim RUNNING before dropping the lock so only one thread starts. */
		sync_thread_state |= SYNC_THREAD_RUNNING;
		kr = kernel_thread_start(sync_thread, NULL, &thd);
		if (kr != KERN_SUCCESS) {
			sync_thread_state &= ~SYNC_THREAD_RUNNING;
			lck_mtx_unlock(&sync_mtx_lck);
			printf("sync_thread failed\n");
			return 0;
		}
		thread_created = TRUE;
	}

	/* PDROP: msleep releases sync_mtx_lck before returning. */
	error = msleep((caddr_t)&sync_thread_state, &sync_mtx_lck,
	    (PVFS | PDROP | PCATCH), "sync_thread", &ts);
	if (error) {
		struct timeval now;

		/* Rate-limit the timeout message to once every 120 seconds. */
		microtime(&now);
		if (now.tv_sec - sync_timeout_last_print.tv_sec > 120) {
			printf("sync timed out: %d sec\n", sync_timeout_seconds);
			sync_timeout_last_print.tv_sec = now.tv_sec;
		}
	}

	if (thread_created) {
		thread_deallocate(thd);
	}

	return 0;
} /* end of sync_internal call */
3574
/*
 * Change filesystem quotas.
 *
 * Resolves uap->path to its mount, copies in any command-specific
 * argument, dispatches to VFS_QUOTACTL(), and copies results back to
 * user space for the commands that produce output.
 */
#if QUOTA
int
quotactl(proc_t p, struct quotactl_args *uap, __unused int32_t *retval)
{
	struct mount *mp;
	int error, quota_cmd, quota_status = 0;
	caddr_t datap;
	size_t fnamelen;
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();
	struct dqblk my_dqblk = {};

	AUDIT_ARG(uid, uap->uid);
	AUDIT_ARG(cmd, uap->cmd);
	NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
	    uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	/* Only the mount is needed from here on; drop the vnode right away. */
	mp = nd.ni_vp->v_mount;
	mount_ref(mp, 0);
	vnode_put(nd.ni_vp);
	nameidone(&nd);

#if CONFIG_MACF
	error = mac_mount_check_quotactl(ctx, mp, uap->cmd, uap->uid);
	if (error != 0) {
		goto out;
	}
#endif

	/* copyin any data we will need for downstream code */
	quota_cmd = uap->cmd >> SUBCMDSHIFT;

	switch (quota_cmd) {
	case Q_QUOTAON:
		/* uap->arg specifies a file from which to take the quotas */
		fnamelen = MAXPATHLEN;
		datap = zalloc(ZV_NAMEI);
		error = copyinstr(uap->arg, datap, MAXPATHLEN, &fnamelen);
		break;
	case Q_GETQUOTA:
		/* uap->arg is a pointer to a dqblk structure. */
		datap = (caddr_t) &my_dqblk;
		break;
	case Q_SETQUOTA:
	case Q_SETUSE:
		/* uap->arg is a pointer to a dqblk structure. */
		datap = (caddr_t) &my_dqblk;
		if (proc_is64bit(p)) {
			struct user_dqblk my_dqblk64;
			error = copyin(uap->arg, (caddr_t)&my_dqblk64, sizeof(my_dqblk64));
			if (error == 0) {
				/* Convert the 64-bit user layout to the kernel dqblk. */
				munge_dqblk(&my_dqblk, &my_dqblk64, FALSE);
			}
		} else {
			error = copyin(uap->arg, (caddr_t)&my_dqblk, sizeof(my_dqblk));
		}
		break;
	case Q_QUOTASTAT:
		/* uap->arg is a pointer to an integer */
		datap = (caddr_t) &quota_status;
		break;
	default:
		datap = NULL;
		break;
	} /* switch */

	if (error == 0) {
		error = VFS_QUOTACTL(mp, uap->cmd, uap->uid, datap, ctx);
	}

	/* Copy out results and release command-specific resources. */
	switch (quota_cmd) {
	case Q_QUOTAON:
		if (datap != NULL) {
			zfree(ZV_NAMEI, datap);
		}
		break;
	case Q_GETQUOTA:
		/* uap->arg is a pointer to a dqblk structure we need to copy out to */
		if (error == 0) {
			if (proc_is64bit(p)) {
				struct user_dqblk my_dqblk64;

				memset(&my_dqblk64, 0, sizeof(my_dqblk64));
				munge_dqblk(&my_dqblk, &my_dqblk64, TRUE);
				error = copyout((caddr_t)&my_dqblk64, uap->arg, sizeof(my_dqblk64));
			} else {
				error = copyout(datap, uap->arg, sizeof(struct dqblk));
			}
		}
		break;
	case Q_QUOTASTAT:
		/* uap->arg is a pointer to an integer */
		if (error == 0) {
			error = copyout(datap, uap->arg, sizeof(quota_status));
		}
		break;
	default:
		break;
	} /* switch */

out:
	mount_drop(mp, 0);
	return error;
}
#else
/* Stub used when quota support is compiled out of the kernel. */
int
quotactl(__unused proc_t p, __unused struct quotactl_args *uap, __unused int32_t *retval)
{
	return EOPNOTSUPP;
}
#endif /* QUOTA */
3692
3693 static int
statfs_internal(proc_t p,struct mount * mp,user_addr_t bufp)3694 statfs_internal(proc_t p, struct mount *mp, user_addr_t bufp)
3695 {
3696 int error;
3697 vfs_context_t ctx = vfs_context_current();
3698
3699 #if CONFIG_MACF
3700 error = mac_mount_check_stat(ctx, mp);
3701 if (error != 0) {
3702 return error;
3703 }
3704 #endif
3705
3706 error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT);
3707 if (error != 0) {
3708 return error;
3709 }
3710
3711 return munge_statfs(mp, &mp->mnt_vfsstat, bufp, NULL, IS_64BIT_PROCESS(p), TRUE);
3712 }
3713
3714 /*
3715 * Get filesystem statistics.
3716 *
3717 * Returns: 0 Success
3718 * namei:???
3719 * vfs_update_vfsstat:???
3720 * munge_statfs:EFAULT
3721 */
3722 /* ARGSUSED */
3723 int
statfs(proc_t p,struct statfs_args * uap,__unused int32_t * retval)3724 statfs(proc_t p, struct statfs_args *uap, __unused int32_t *retval)
3725 {
3726 int error;
3727 struct mount *mp;
3728 struct nameidata nd;
3729 vfs_context_t ctx = vfs_context_current();
3730 vnode_t vp;
3731
3732 NDINIT(&nd, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
3733 UIO_USERSPACE, uap->path, ctx);
3734 error = namei(&nd);
3735 if (error != 0) {
3736 return error;
3737 }
3738 vp = nd.ni_vp;
3739 mp = vp->v_mount;
3740 nameidone(&nd);
3741
3742 error = statfs_internal(p, mp, uap->buf);
3743 vnode_put(vp);
3744
3745 return error;
3746 }
3747
3748 /*
3749 * Get filesystem statistics.
3750 */
3751 /* ARGSUSED */
3752 int
fstatfs(proc_t p,struct fstatfs_args * uap,__unused int32_t * retval)3753 fstatfs(proc_t p, struct fstatfs_args *uap, __unused int32_t *retval)
3754 {
3755 int error;
3756 vnode_t vp = NULL;
3757 struct mount *mp;
3758
3759 AUDIT_ARG(fd, uap->fd);
3760
3761 if ((error = file_vnode(uap->fd, &vp)) ||
3762 (error = vnode_getwithref(vp))) {
3763 goto out;
3764 }
3765
3766 AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
3767
3768 mp = vp->v_mount;
3769 if (!mp) {
3770 error = EBADF;
3771 goto out_vnode;
3772 }
3773
3774 error = statfs_internal(p, mp, uap->buf);
3775
3776 out_vnode:
3777 vnode_put(vp);
3778
3779 out:
3780 if (vp != NULL) {
3781 file_drop(uap->fd);
3782 }
3783
3784 return error;
3785 }
3786
3787 void
vfs_get_statfs64(struct mount * mp,struct statfs64 * sfs)3788 vfs_get_statfs64(struct mount *mp, struct statfs64 *sfs)
3789 {
3790 struct vfsstatfs *vsfs = &mp->mnt_vfsstat;
3791
3792 bzero(sfs, sizeof(*sfs));
3793
3794 sfs->f_bsize = vsfs->f_bsize;
3795 sfs->f_iosize = (int32_t)vsfs->f_iosize;
3796 sfs->f_blocks = vsfs->f_blocks;
3797 sfs->f_bfree = vsfs->f_bfree;
3798 sfs->f_bavail = vsfs->f_bavail;
3799 sfs->f_files = vsfs->f_files;
3800 sfs->f_ffree = vsfs->f_ffree;
3801 sfs->f_fsid = vsfs->f_fsid;
3802 sfs->f_owner = vsfs->f_owner;
3803 sfs->f_type = mp->mnt_vtable->vfc_typenum;
3804 sfs->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
3805 sfs->f_fssubtype = vsfs->f_fssubtype;
3806 sfs->f_flags_ext = vfs_getextflags(mp);
3807 vfs_getfstypename(mp, sfs->f_fstypename, MFSTYPENAMELEN);
3808 strlcpy(&sfs->f_mntonname[0], &vsfs->f_mntonname[0], MAXPATHLEN);
3809 strlcpy(&sfs->f_mntfromname[0], &vsfs->f_mntfromname[0], MAXPATHLEN);
3810 }
3811
3812 /*
3813 * Get file system statistics in 64-bit mode
3814 */
3815 int
statfs64(__unused struct proc * p,struct statfs64_args * uap,__unused int32_t * retval)3816 statfs64(__unused struct proc *p, struct statfs64_args *uap, __unused int32_t *retval)
3817 {
3818 struct mount *mp;
3819 int error;
3820 struct nameidata *ndp;
3821 struct statfs64 *sfsp;
3822 vfs_context_t ctxp = vfs_context_current();
3823 vnode_t vp;
3824 struct {
3825 struct nameidata nd;
3826 struct statfs64 sfs;
3827 } *__nameidata_statfs64;
3828
3829 __nameidata_statfs64 = kalloc_type(typeof(*__nameidata_statfs64),
3830 Z_WAITOK);
3831 ndp = &__nameidata_statfs64->nd;
3832
3833 NDINIT(ndp, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
3834 UIO_USERSPACE, uap->path, ctxp);
3835 error = namei(ndp);
3836 if (error != 0) {
3837 goto out;
3838 }
3839 vp = ndp->ni_vp;
3840 mp = vp->v_mount;
3841 nameidone(ndp);
3842
3843 #if CONFIG_MACF
3844 error = mac_mount_check_stat(ctxp, mp);
3845 if (error != 0) {
3846 vnode_put(vp);
3847 goto out;
3848 }
3849 #endif
3850
3851 error = vfs_update_vfsstat(mp, ctxp, VFS_USER_EVENT);
3852 if (error != 0) {
3853 vnode_put(vp);
3854 goto out;
3855 }
3856
3857 sfsp = &__nameidata_statfs64->sfs;
3858 vfs_get_statfs64(mp, sfsp);
3859 if ((mp->mnt_kern_flag & MNTK_SYSTEMDATA) &&
3860 (p->p_vfs_iopolicy & P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME)) {
3861 /* This process does not want to see a seperate data volume mountpoint */
3862 strlcpy(&sfsp->f_mntonname[0], "/", sizeof("/"));
3863 }
3864 error = copyout(sfsp, uap->buf, sizeof(*sfsp));
3865 vnode_put(vp);
3866
3867 out:
3868 kfree_type(typeof(*__nameidata_statfs64), __nameidata_statfs64);
3869
3870 return error;
3871 }
3872
3873 /*
3874 * Get file system statistics in 64-bit mode
3875 */
3876 int
fstatfs64(__unused struct proc * p,struct fstatfs64_args * uap,__unused int32_t * retval)3877 fstatfs64(__unused struct proc *p, struct fstatfs64_args *uap, __unused int32_t *retval)
3878 {
3879 struct vnode *vp;
3880 struct mount *mp;
3881 struct statfs64 sfs;
3882 int error;
3883
3884 AUDIT_ARG(fd, uap->fd);
3885
3886 if ((error = file_vnode(uap->fd, &vp))) {
3887 return error;
3888 }
3889
3890 error = vnode_getwithref(vp);
3891 if (error) {
3892 file_drop(uap->fd);
3893 return error;
3894 }
3895
3896 AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
3897
3898 mp = vp->v_mount;
3899 if (!mp) {
3900 error = EBADF;
3901 goto out;
3902 }
3903
3904 #if CONFIG_MACF
3905 error = mac_mount_check_stat(vfs_context_current(), mp);
3906 if (error != 0) {
3907 goto out;
3908 }
3909 #endif
3910
3911 if ((error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)) != 0) {
3912 goto out;
3913 }
3914
3915 vfs_get_statfs64(mp, &sfs);
3916 if ((mp->mnt_kern_flag & MNTK_SYSTEMDATA) &&
3917 (p->p_vfs_iopolicy & P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME)) {
3918 /* This process does not want to see a seperate data volume mountpoint */
3919 strlcpy(&sfs.f_mntonname[0], "/", sizeof("/"));
3920 }
3921 error = copyout(&sfs, uap->buf, sizeof(sfs));
3922
3923 out:
3924 file_drop(uap->fd);
3925 vnode_put(vp);
3926
3927 return error;
3928 }
3929
/*
 * Shared state threaded through the getfsstat / __mac_getfsstat and
 * getfsstat64 iteration callbacks.
 */
struct getfsstat_struct {
	user_addr_t sfsp;       /* user buffer cursor for statfs output */
	user_addr_t *mp;        /* optional per-mount MAC label buffers */
	int count;              /* mounts seen so far */
	int maxcount;           /* capacity of the user buffer, in entries */
	int flags;              /* MNT_NOWAIT/MNT_WAIT/MNT_DWAIT from caller */
	int error;              /* first error encountered, if any */
};
3938
3939
3940 static int
getfsstat_callback(mount_t mp,void * arg)3941 getfsstat_callback(mount_t mp, void * arg)
3942 {
3943 struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
3944 struct vfsstatfs *sp;
3945 int error, my_size;
3946 vfs_context_t ctx = vfs_context_current();
3947
3948 if (fstp->sfsp && fstp->count < fstp->maxcount) {
3949 #if CONFIG_MACF
3950 error = mac_mount_check_stat(ctx, mp);
3951 if (error != 0) {
3952 fstp->error = error;
3953 return VFS_RETURNED_DONE;
3954 }
3955 #endif
3956 sp = &mp->mnt_vfsstat;
3957 /*
3958 * If MNT_NOWAIT is specified, do not refresh the
3959 * fsstat cache. MNT_WAIT/MNT_DWAIT overrides MNT_NOWAIT.
3960 */
3961 if ((mp->mnt_lflag & MNT_LDEAD) ||
3962 (((fstp->flags & MNT_NOWAIT) == 0 || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
3963 (!(mp->mnt_lflag & MNT_LUNMOUNT)) &&
3964 (error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT)))) {
3965 KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
3966 return VFS_RETURNED;
3967 }
3968
3969 /*
3970 * Need to handle LP64 version of struct statfs
3971 */
3972 error = munge_statfs(mp, sp, fstp->sfsp, &my_size, IS_64BIT_PROCESS(vfs_context_proc(ctx)), FALSE);
3973 if (error) {
3974 fstp->error = error;
3975 return VFS_RETURNED_DONE;
3976 }
3977 fstp->sfsp += my_size;
3978
3979 if (fstp->mp) {
3980 #if CONFIG_MACF
3981 error = mac_mount_label_get(mp, *fstp->mp);
3982 if (error) {
3983 fstp->error = error;
3984 return VFS_RETURNED_DONE;
3985 }
3986 #endif
3987 fstp->mp++;
3988 }
3989 }
3990 fstp->count++;
3991 return VFS_RETURNED;
3992 }
3993
3994 /*
3995 * Get statistics on all filesystems.
3996 */
3997 int
getfsstat(__unused proc_t p,struct getfsstat_args * uap,int * retval)3998 getfsstat(__unused proc_t p, struct getfsstat_args *uap, int *retval)
3999 {
4000 struct __mac_getfsstat_args muap;
4001
4002 muap.buf = uap->buf;
4003 muap.bufsize = uap->bufsize;
4004 muap.mac = USER_ADDR_NULL;
4005 muap.macsize = 0;
4006 muap.flags = uap->flags;
4007
4008 return __mac_getfsstat(p, &muap, retval);
4009 }
4010
/*
 * __mac_getfsstat: Get MAC-related file system statistics
 *
 * Parameters:    p        (ignored)
 *                uap      User argument descriptor (see below)
 *                retval   Count of file system statistics (N stats)
 *
 * Indirect:      uap->bufsize     Buffer size
 *                uap->macsize     MAC info size
 *                uap->buf         Buffer where information will be returned
 *                uap->mac         MAC info
 *                uap->flags       File system flags
 *
 *
 * Returns:        0               Success
 *                !0               Not success
 *
 */
int
__mac_getfsstat(__unused proc_t p, struct __mac_getfsstat_args *uap, int *retval)
{
	user_addr_t sfsp;
	user_addr_t *mp;
	size_t count, maxcount, bufsize, macsize;
	struct getfsstat_struct fst;

	/* Reject sizes that would overflow the int fields used below. */
	if ((unsigned)uap->bufsize > INT_MAX || (unsigned)uap->macsize > INT_MAX) {
		return EINVAL;
	}

	bufsize = (size_t) uap->bufsize;
	macsize = (size_t) uap->macsize;

	/* Entry size depends on the calling process's ABI. */
	if (IS_64BIT_PROCESS(p)) {
		maxcount = bufsize / sizeof(struct user64_statfs);
	} else {
		maxcount = bufsize / sizeof(struct user32_statfs);
	}
	sfsp = uap->buf;
	count = 0;

	mp = NULL;

#if CONFIG_MACF
	if (uap->mac != USER_ADDR_NULL) {
		u_int32_t *mp0;
		int error;
		unsigned int i;

		/* One MAC label pointer is required per statfs slot. */
		count = (macsize / (IS_64BIT_PROCESS(p) ? 8 : 4));
		if (count != maxcount) {
			return EINVAL;
		}

		/* Copy in the array */
		mp0 = kalloc_data(macsize, Z_WAITOK);
		if (mp0 == NULL) {
			return ENOMEM;
		}

		error = copyin(uap->mac, mp0, macsize);
		if (error) {
			kfree_data(mp0, macsize);
			return error;
		}

		/* Normalize to an array of user_addr_t */
		mp = kalloc_data(count * sizeof(user_addr_t), Z_WAITOK);
		if (mp == NULL) {
			kfree_data(mp0, macsize);
			return ENOMEM;
		}

		for (i = 0; i < count; i++) {
			if (IS_64BIT_PROCESS(p)) {
				mp[i] = ((user_addr_t *)mp0)[i];
			} else {
				mp[i] = (user_addr_t)mp0[i];
			}
		}
		kfree_data(mp0, macsize);
	}
#endif


	fst.sfsp = sfsp;
	fst.mp = mp;
	fst.flags = uap->flags;
	fst.count = 0;
	fst.error = 0;
	fst.maxcount = (int)maxcount;


	vfs_iterate(VFS_ITERATE_NOSKIP_UNMOUNT, getfsstat_callback, &fst);

	/* 'count' still holds the MAC array length allocated above. */
	if (mp) {
		kfree_data(mp, count * sizeof(user_addr_t));
	}

	if (fst.error) {
		KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
		return fst.error;
	}

	/* If the buffer overflowed, report only how many entries fit. */
	if (fst.sfsp && fst.count > fst.maxcount) {
		*retval = fst.maxcount;
	} else {
		*retval = fst.count;
	}
	return 0;
}
4122
4123 static int
getfsstat64_callback(mount_t mp,void * arg)4124 getfsstat64_callback(mount_t mp, void * arg)
4125 {
4126 struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
4127 struct vfsstatfs *sp;
4128 struct statfs64 sfs;
4129 int error;
4130
4131 if (fstp->sfsp && fstp->count < fstp->maxcount) {
4132 #if CONFIG_MACF
4133 error = mac_mount_check_stat(vfs_context_current(), mp);
4134 if (error != 0) {
4135 fstp->error = error;
4136 return VFS_RETURNED_DONE;
4137 }
4138 #endif
4139 sp = &mp->mnt_vfsstat;
4140 /*
4141 * If MNT_NOWAIT is specified, do not refresh the fsstat
4142 * cache. MNT_WAIT overrides MNT_NOWAIT.
4143 *
4144 * We treat MNT_DWAIT as MNT_WAIT for all instances of
4145 * getfsstat, since the constants are out of the same
4146 * namespace.
4147 */
4148 if ((mp->mnt_lflag & MNT_LDEAD) ||
4149 ((((fstp->flags & MNT_NOWAIT) == 0) || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
4150 (!(mp->mnt_lflag & MNT_LUNMOUNT)) &&
4151 (error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)))) {
4152 KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
4153 return VFS_RETURNED;
4154 }
4155
4156 vfs_get_statfs64(mp, &sfs);
4157 error = copyout(&sfs, fstp->sfsp, sizeof(sfs));
4158 if (error) {
4159 fstp->error = error;
4160 return VFS_RETURNED_DONE;
4161 }
4162 fstp->sfsp += sizeof(sfs);
4163 }
4164 fstp->count++;
4165 return VFS_RETURNED;
4166 }
4167
4168 /*
4169 * Get statistics on all file systems in 64 bit mode.
4170 */
4171 int
getfsstat64(__unused proc_t p,struct getfsstat64_args * uap,int * retval)4172 getfsstat64(__unused proc_t p, struct getfsstat64_args *uap, int *retval)
4173 {
4174 user_addr_t sfsp;
4175 int count, maxcount;
4176 struct getfsstat_struct fst;
4177
4178 maxcount = uap->bufsize / sizeof(struct statfs64);
4179
4180 sfsp = uap->buf;
4181 count = 0;
4182
4183 fst.sfsp = sfsp;
4184 fst.flags = uap->flags;
4185 fst.count = 0;
4186 fst.error = 0;
4187 fst.maxcount = maxcount;
4188
4189 vfs_iterate(VFS_ITERATE_NOSKIP_UNMOUNT, getfsstat64_callback, &fst);
4190
4191 if (fst.error) {
4192 KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
4193 return fst.error;
4194 }
4195
4196 if (fst.sfsp && fst.count > fst.maxcount) {
4197 *retval = fst.maxcount;
4198 } else {
4199 *retval = fst.count;
4200 }
4201
4202 return 0;
4203 }
4204
4205 /*
4206 * gets the associated vnode with the file descriptor passed.
4207 * as input
4208 *
4209 * INPUT
4210 * ctx - vfs context of caller
4211 * fd - file descriptor for which vnode is required.
4212 * vpp - Pointer to pointer to vnode to be returned.
4213 *
4214 * The vnode is returned with an iocount so any vnode obtained
4215 * by this call needs a vnode_put
4216 *
4217 */
4218 int
vnode_getfromfd(vfs_context_t ctx,int fd,vnode_t * vpp)4219 vnode_getfromfd(vfs_context_t ctx, int fd, vnode_t *vpp)
4220 {
4221 int error;
4222 vnode_t vp;
4223 struct fileproc *fp;
4224 proc_t p = vfs_context_proc(ctx);
4225
4226 *vpp = NULLVP;
4227
4228 error = fp_getfvp(p, fd, &fp, &vp);
4229 if (error) {
4230 return error;
4231 }
4232
4233 error = vnode_getwithref(vp);
4234 if (error) {
4235 (void)fp_drop(p, fd, fp, 0);
4236 return error;
4237 }
4238
4239 (void)fp_drop(p, fd, fp, 0);
4240 *vpp = vp;
4241 return error;
4242 }
4243
4244 int
vnode_getfromid(int volfs_id,uint64_t objid,vfs_context_t ctx,int realfsid,vnode_t * vpp)4245 vnode_getfromid(int volfs_id, uint64_t objid, vfs_context_t ctx, int realfsid, vnode_t *vpp)
4246 {
4247 int error = 0;
4248 vnode_t vp = NULLVP;
4249 struct mount *mp = NULL;
4250
4251 if ((mp = mount_lookupby_volfsid(volfs_id, 1)) == NULL) {
4252 error = ENOTSUP; /* unexpected failure */
4253 return ENOTSUP;
4254 }
4255
4256 #if CONFIG_UNION_MOUNTS
4257 unionget:
4258 #endif /* CONFIG_UNION_MOUNTS */
4259 if (objid == 2) {
4260 struct vfs_attr vfsattr;
4261 int use_vfs_root = TRUE;
4262
4263 VFSATTR_INIT(&vfsattr);
4264 VFSATTR_WANTED(&vfsattr, f_capabilities);
4265 if (!realfsid &&
4266 vfs_getattr(mp, &vfsattr, vfs_context_kernel()) == 0 &&
4267 VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
4268 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS) &&
4269 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS)) {
4270 use_vfs_root = FALSE;
4271 }
4272 }
4273
4274 if (use_vfs_root) {
4275 error = VFS_ROOT(mp, &vp, ctx);
4276 } else {
4277 error = VFS_VGET(mp, objid, &vp, ctx);
4278 }
4279 } else {
4280 error = VFS_VGET(mp, (ino64_t)objid, &vp, ctx);
4281 }
4282
4283 #if CONFIG_UNION_MOUNTS
4284 if (error == ENOENT && (mp->mnt_flag & MNT_UNION)) {
4285 /*
4286 * If the fileid isn't found and we're in a union
4287 * mount volume, then see if the fileid is in the
4288 * mounted-on volume.
4289 */
4290 struct mount *tmp = mp;
4291 mp = vnode_mount(tmp->mnt_vnodecovered);
4292 vfs_unbusy(tmp);
4293 if (vfs_busy(mp, LK_NOWAIT) == 0) {
4294 goto unionget;
4295 }
4296 } else {
4297 vfs_unbusy(mp);
4298 }
4299 #else
4300 vfs_unbusy(mp);
4301 #endif /* CONFIG_UNION_MOUNTS */
4302
4303 if (!error) {
4304 *vpp = vp;
4305 }
4306
4307 return error;
4308 }
4309
/*
 * Wrapper function around namei to start lookup from a directory
 * specified by a file descriptor ni_dirfd.
 *
 * In addition to all the errors returned by namei, this call can
 * return ENOTDIR if the file descriptor does not refer to a directory.
 * and EBADF if the file descriptor is not valid.
 */
int
nameiat(struct nameidata *ndp, int dirfd)
{
	/*
	 * The dirfd base only matters for a fresh lookup of a relative
	 * path; continued lookups and callers that supplied their own
	 * starting vnode (USEDVP) go straight to namei().
	 */
	if ((dirfd != AT_FDCWD) &&
	    !(ndp->ni_flag & NAMEI_CONTLOOKUP) &&
	    !(ndp->ni_cnd.cn_flags & USEDVP)) {
		int error = 0;
		char c;

		/* Peek at the first byte of the path to test for '/'. */
		if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
			error = copyin(ndp->ni_dirp, &c, sizeof(char));
			if (error) {
				return error;
			}
		} else {
			c = *((char *)(ndp->ni_dirp));
		}

		if (c != '/') {
			vnode_t dvp_at;

			error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
			    &dvp_at);
			if (error) {
				return error;
			}

			if (vnode_vtype(dvp_at) != VDIR) {
				vnode_put(dvp_at);
				return ENOTDIR;
			}

			/*
			 * Temporarily hand namei our starting directory via
			 * USEDVP; clear the flag afterwards since the caller
			 * did not set it.
			 */
			ndp->ni_dvp = dvp_at;
			ndp->ni_cnd.cn_flags |= USEDVP;
			error = namei(ndp);
			ndp->ni_cnd.cn_flags &= ~USEDVP;
			vnode_put(dvp_at);
			return error;
		}
	}

	/* Absolute path, or no special base: plain namei(). */
	return namei(ndp);
}
4361
4362 /*
4363 * Change current working directory to a given file descriptor.
4364 */
4365 /* ARGSUSED */
int
fchdir(proc_t p, vfs_context_t ctx, int fd, bool per_thread)
{
	vnode_t vp;
	vnode_t tdp;
	vnode_t tvp;	/* previous cwd, released at the end */
	struct mount *mp;
	int error, should_put = 1;

	AUDIT_ARG(fd, fd);
	if (per_thread && fd == -1) {
		/*
		 * Switching back from per-thread to per process CWD; verify we
		 * in fact have one before proceeding. The only success case
		 * for this code path is to return 0 preemptively after zapping
		 * the thread structure contents.
		 */
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = NULLVP;
			if (tvp != NULLVP) {
				/* drop the usecount held by the per-thread cwd */
				vnode_rele(tvp);
				return 0;
			}
		}
		return EBADF;
	}

	if ((error = file_vnode(fd, &vp))) {
		return error;
	}
	/* Take an iocount; dropped at "out" (or after vnode_ref below). */
	if ((error = vnode_getwithref(vp))) {
		file_drop(fd);
		return error;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	/* The new working directory must be a directory. */
	if (vp->v_type != VDIR) {
		error = ENOTDIR;
		goto out;
	}

#if CONFIG_MACF
	/* MAC policy hook for chdir. */
	error = mac_vnode_check_chdir(ctx, vp);
	if (error) {
		goto out;
	}
#endif
	/* Caller needs search permission on the directory. */
	error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
	if (error) {
		goto out;
	}

	/*
	 * If a file system is mounted on this directory, descend to the
	 * root of the mounted file system (repeatedly, for stacked mounts).
	 */
	while (!error && (mp = vp->v_mountedhere) != NULL) {
		if (vfs_busy(mp, LK_NOWAIT)) {
			error = EACCES;
			goto out;
		}
		error = VFS_ROOT(mp, &tdp, ctx);
		vfs_unbusy(mp);
		if (error) {
			break;
		}
		vnode_put(vp);
		vp = tdp;
	}
	if (error) {
		goto out;
	}
	/* Convert to a long-term usecount, then drop the iocount. */
	if ((error = vnode_ref(vp))) {
		goto out;
	}
	vnode_put(vp);
	should_put = 0;

	if (per_thread) {
		/* Install as this thread's private cwd. */
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = vp;
			OSBitOrAtomic(P_THCWD, &p->p_flag);
		} else {
			/* No thread: undo the usecount taken above. */
			vnode_rele(vp);
			error = ENOENT;
			goto out;
		}
	} else {
		/* Install as the process-wide cwd under the dirs + fd locks. */
		proc_dirs_lock_exclusive(p);
		proc_fdlock(p);
		tvp = p->p_fd.fd_cdir;
		p->p_fd.fd_cdir = vp;
		proc_fdunlock(p);
		proc_dirs_unlock_exclusive(p);
	}

	/* Release the previous cwd, if there was one. */
	if (tvp) {
		vnode_rele(tvp);
	}

out:
	if (should_put) {
		vnode_put(vp);
	}
	file_drop(fd);

	return error;
}
4477
4478 int
sys_fchdir(proc_t p,struct fchdir_args * uap,__unused int32_t * retval)4479 sys_fchdir(proc_t p, struct fchdir_args *uap, __unused int32_t *retval)
4480 {
4481 return fchdir(p, vfs_context_current(), uap->fd, false);
4482 }
4483
4484 int
__pthread_fchdir(proc_t p,struct __pthread_fchdir_args * uap,__unused int32_t * retval)4485 __pthread_fchdir(proc_t p, struct __pthread_fchdir_args *uap, __unused int32_t *retval)
4486 {
4487 return fchdir(p, vfs_context_current(), uap->fd, true);
4488 }
4489
4490
4491 /*
4492 * Change current working directory (".").
4493 *
4494 * Returns: 0 Success
4495 * change_dir:ENOTDIR
4496 * change_dir:???
4497 * vnode_ref:ENOENT No such file or directory
4498 */
4499 /* ARGSUSED */
int
chdir_internal(proc_t p, vfs_context_t ctx, struct nameidata *ndp, int per_thread)
{
	int error;
	vnode_t tvp;	/* previous cwd, released at the end */

	/* Resolve the path; on success ndp->ni_vp holds an iocount. */
	error = change_dir(ndp, ctx);
	if (error) {
		return error;
	}
	/* Convert to a long-term usecount for the cwd. */
	if ((error = vnode_ref(ndp->ni_vp))) {
		vnode_put(ndp->ni_vp);
		return error;
	}
	/*
	 * drop the iocount we picked up in change_dir
	 */
	vnode_put(ndp->ni_vp);

	if (per_thread) {
		/* Install as this thread's private cwd. */
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = ndp->ni_vp;
			OSBitOrAtomic(P_THCWD, &p->p_flag);
		} else {
			/* No thread: undo the usecount taken above. */
			vnode_rele(ndp->ni_vp);
			return ENOENT;
		}
	} else {
		/* Install as the process-wide cwd under the dirs + fd locks. */
		proc_dirs_lock_exclusive(p);
		proc_fdlock(p);
		tvp = p->p_fd.fd_cdir;
		p->p_fd.fd_cdir = ndp->ni_vp;
		proc_fdunlock(p);
		proc_dirs_unlock_exclusive(p);
	}

	/* Release the previous cwd, if there was one. */
	if (tvp) {
		vnode_rele(tvp);
	}

	return 0;
}
4545
4546
4547 /*
4548 * Change current working directory (".").
4549 *
4550 * Returns: 0 Success
4551 * chdir_internal:ENOTDIR
4552 * chdir_internal:ENOENT No such file or directory
4553 * chdir_internal:???
4554 */
4555 /* ARGSUSED */
4556 static int
common_chdir(proc_t p,struct chdir_args * uap,int per_thread)4557 common_chdir(proc_t p, struct chdir_args *uap, int per_thread)
4558 {
4559 struct nameidata nd;
4560 vfs_context_t ctx = vfs_context_current();
4561
4562 NDINIT(&nd, LOOKUP, OP_CHDIR, FOLLOW | AUDITVNPATH1,
4563 UIO_USERSPACE, uap->path, ctx);
4564
4565 return chdir_internal(p, ctx, &nd, per_thread);
4566 }
4567
4568
4569 /*
4570 * chdir
4571 *
4572 * Change current working directory (".") for the entire process
4573 *
4574 * Parameters: p Process requesting the call
4575 * uap User argument descriptor (see below)
4576 * retval (ignored)
4577 *
4578 * Indirect parameters: uap->path Directory path
4579 *
4580 * Returns: 0 Success
4581 * common_chdir: ENOTDIR
4582 * common_chdir: ENOENT No such file or directory
4583 * common_chdir: ???
4584 *
4585 */
4586 int
sys_chdir(proc_t p,struct chdir_args * uap,__unused int32_t * retval)4587 sys_chdir(proc_t p, struct chdir_args *uap, __unused int32_t *retval)
4588 {
4589 return common_chdir(p, (void *)uap, 0);
4590 }
4591
4592 /*
4593 * __pthread_chdir
4594 *
4595 * Change current working directory (".") for a single thread
4596 *
4597 * Parameters: p Process requesting the call
4598 * uap User argument descriptor (see below)
4599 * retval (ignored)
4600 *
4601 * Indirect parameters: uap->path Directory path
4602 *
4603 * Returns: 0 Success
4604 * common_chdir: ENOTDIR
4605 * common_chdir: ENOENT No such file or directory
4606 * common_chdir: ???
4607 *
4608 */
4609 int
__pthread_chdir(proc_t p,struct __pthread_chdir_args * uap,__unused int32_t * retval)4610 __pthread_chdir(proc_t p, struct __pthread_chdir_args *uap, __unused int32_t *retval)
4611 {
4612 return common_chdir(p, (void *)uap, 1);
4613 }
4614
4615 #define CHROOT_ENTITLEMENT "com.apple.private.vfs.chroot"
4616
4617 /*
4618 * Change notion of root (``/'') directory.
4619 */
4620 /* ARGSUSED */
int
chroot(proc_t p, struct chroot_args *uap, __unused int32_t *retval)
{
	struct filedesc *fdp = &p->p_fd;
	int error;
	struct nameidata nd;
	vnode_t tvp;	/* previous root, released at the end */
	vfs_context_t ctx = vfs_context_current();

	/* Superuser only. */
	if ((error = suser(kauth_cred_get(), &p->p_acflag))) {
		return error;
	}

#if XNU_TARGET_OS_IOS && (DEVELOPMENT || DEBUG)
	/* Log and report (but do not fail) unentitled chroot callers. */
	if (!IOTaskHasEntitlement(vfs_context_task(ctx), CHROOT_ENTITLEMENT)) {
		mach_exception_code_t code = 0;

		os_log_error(OS_LOG_DEFAULT,
		    "%s: proc %s[%d] calls chroot(2) without entitlement\n",
		    __func__, proc_best_name(p), proc_getpid(p));

		/*
		 * Generate a simulated EXC_GUARD crash report so we know about the
		 * violation.
		 */
		EXC_GUARD_ENCODE_TYPE(code, GUARD_TYPE_REJECTED_SC);
		task_violated_guard(code, 61 /* SYS_chroot */, NULL, true);
	}
#endif

	NDINIT(&nd, LOOKUP, OP_CHROOT, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	/* On success nd.ni_vp holds an iocount on the new root directory. */
	error = change_dir(&nd, ctx);
	if (error) {
		return error;
	}

#if CONFIG_MACF
	/* MAC policy hook for chroot. */
	error = mac_vnode_check_chroot(ctx, nd.ni_vp,
	    &nd.ni_cnd);
	if (error) {
		vnode_put(nd.ni_vp);
		return error;
	}
#endif

	/* Convert to a long-term usecount, then drop the iocount. */
	if ((error = vnode_ref(nd.ni_vp))) {
		vnode_put(nd.ni_vp);
		return error;
	}
	vnode_put(nd.ni_vp);

	/*
	 * This lock provides the guarantee that as long as you hold the lock
	 * fdp->fd_rdir has a usecount on it. This is used to take an iocount
	 * on a referenced vnode in namei when determining the rootvnode for
	 * a process.
	 */
	/* needed for synchronization with lookup */
	proc_dirs_lock_exclusive(p);
	/* needed for setting the flag and other activities on the fd itself */
	proc_fdlock(p);
	tvp = fdp->fd_rdir;
	fdp->fd_rdir = nd.ni_vp;
	fdt_flag_set(fdp, FD_CHROOT);
	proc_fdunlock(p);
	proc_dirs_unlock_exclusive(p);

	/* Release the previous root, if there was one. */
	if (tvp != NULL) {
		vnode_rele(tvp);
	}

	return 0;
}
4695
4696 #define PATHSTATICBUFLEN 256
4697 #define PIVOT_ROOT_ENTITLEMENT \
4698 "com.apple.private.vfs.pivot-root"
4699
4700 #if defined(XNU_TARGET_OS_OSX)
int
pivot_root(proc_t p, struct pivot_root_args *uap, __unused int *retval)
{
	int error;
	/* Small on-stack buffers; spill to ZV_NAMEI heap buffers if too long. */
	char new_rootfs_path_before[PATHSTATICBUFLEN] = {0};
	char old_rootfs_path_after[PATHSTATICBUFLEN] = {0};
	char *new_rootfs_path_before_buf = NULL;
	char *old_rootfs_path_after_buf = NULL;
	char *incoming = NULL;		/* path of the FS to pivot to */
	char *outgoing = NULL;		/* where the old root FS will appear */
	vnode_t incoming_rootvp = NULLVP;
	size_t bytes_copied;

	/*
	 * XXX : Additional restrictions needed
	 * - perhaps callable only once.
	 */
	if ((error = suser(kauth_cred_get(), &p->p_acflag))) {
		return error;
	}

	/*
	 * pivot_root can be executed by launchd only.
	 * Enforce entitlement.
	 */
	if ((proc_getpid(p) != 1) || !IOCurrentTaskHasEntitlement(PIVOT_ROOT_ENTITLEMENT)) {
		return EPERM;
	}

	/* Copy in the incoming-root path, retrying with a MAXPATHLEN buffer. */
	error = copyinstr(uap->new_rootfs_path_before, &new_rootfs_path_before[0], PATHSTATICBUFLEN, &bytes_copied);
	if (error == ENAMETOOLONG) {
		new_rootfs_path_before_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
		error = copyinstr(uap->new_rootfs_path_before, new_rootfs_path_before_buf, MAXPATHLEN, &bytes_copied);
	}

	if (error) {
		goto out;
	}

	/* Same for the outgoing-root path. */
	error = copyinstr(uap->old_rootfs_path_after, &old_rootfs_path_after[0], PATHSTATICBUFLEN, &bytes_copied);
	if (error == ENAMETOOLONG) {
		old_rootfs_path_after_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
		error = copyinstr(uap->old_rootfs_path_after, old_rootfs_path_after_buf, MAXPATHLEN, &bytes_copied);
	}
	if (error) {
		goto out;
	}

	/* Point at whichever buffer (stack or heap) holds each path. */
	if (new_rootfs_path_before_buf) {
		incoming = new_rootfs_path_before_buf;
	} else {
		incoming = &new_rootfs_path_before[0];
	}

	if (old_rootfs_path_after_buf) {
		outgoing = old_rootfs_path_after_buf;
	} else {
		outgoing = &old_rootfs_path_after[0];
	}

	/*
	 * The proposed incoming FS MUST be authenticated (i.e. not a chunklist DMG).
	 * Userland is not allowed to pivot to an image.
	 */
	error = vnode_lookup(incoming, 0, &incoming_rootvp, vfs_context_kernel());
	if (error) {
		goto out;
	}
	error = VNOP_IOCTL(incoming_rootvp, FSIOC_KERNEL_ROOTAUTH, NULL, 0, vfs_context_kernel());
	if (error) {
		goto out;
	}

	/* Perform the actual root switch; virtual devices are prohibited. */
	error = vfs_switch_root(incoming, outgoing, VFSSR_VIRTUALDEV_PROHIBITED);

out:
	if (incoming_rootvp != NULLVP) {
		vnode_put(incoming_rootvp);
		incoming_rootvp = NULLVP;
	}

	if (old_rootfs_path_after_buf) {
		zfree(ZV_NAMEI, old_rootfs_path_after_buf);
	}

	if (new_rootfs_path_before_buf) {
		zfree(ZV_NAMEI, new_rootfs_path_before_buf);
	}

	return error;
}
4792 #else
int
pivot_root(proc_t p, __unused struct pivot_root_args *uap, int *retval)
{
	/* pivot_root(2) is not supported on this platform. */
	return nosys(p, NULL, retval);
}
4798 #endif /* XNU_TARGET_OS_OSX */
4799
4800 /*
4801 * Common routine for chroot and chdir.
4802 *
4803 * Returns: 0 Success
4804 * ENOTDIR Not a directory
4805 * namei:??? [anything namei can return]
4806 * vnode_authorize:??? [anything vnode_authorize can return]
4807 */
static int
change_dir(struct nameidata *ndp, vfs_context_t ctx)
{
	vnode_t vp;
	int error;

	/* Resolve the path; on success ndp->ni_vp holds an iocount. */
	if ((error = namei(ndp))) {
		return error;
	}
	nameidone(ndp);
	vp = ndp->ni_vp;

	/* The target of chdir/chroot must be a directory. */
	if (vp->v_type != VDIR) {
		vnode_put(vp);
		return ENOTDIR;
	}

#if CONFIG_MACF
	/* MAC policy hook for chdir (also used by chroot). */
	error = mac_vnode_check_chdir(ctx, vp);
	if (error) {
		vnode_put(vp);
		return error;
	}
#endif

	/* Caller needs search permission on the directory. */
	error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
	if (error) {
		vnode_put(vp);
		return error;
	}

	/* Success: the iocount on ndp->ni_vp is passed to the caller. */
	return error;
}
4841
/*
 * Allocate the vnode data (for directories) associated with the file glob.
 */
struct fd_vn_data *
fg_vn_data_alloc(void)
{
	struct fd_vn_data *fvdata;

	/* Allocate per fd vnode data */
	fvdata = kalloc_type(struct fd_vn_data, Z_WAITOK | Z_ZERO);
	/* Initialize the lock protecting the (initially NULL) buffer. */
	lck_mtx_init(&fvdata->fv_lock, &fd_vn_lck_grp, &fd_vn_lck_attr);
	return fvdata;
}
4855
4856 /*
4857 * Free the vnode data (for directories) associated with the file glob.
4858 */
void
fg_vn_data_free(void *fgvndata)
{
	struct fd_vn_data *fvdata = (struct fd_vn_data *)fgvndata;

	/* Free the directory read buffer (if any), then the lock, then the struct. */
	kfree_data(fvdata->fv_buf, fvdata->fv_bufallocsiz);
	lck_mtx_destroy(&fvdata->fv_lock, &fd_vn_lck_grp);
	kfree_type(struct fd_vn_data, fvdata);
}
4868
4869 /*
4870 * Check permissions, allocate an open file structure,
4871 * and call the device open routine if any.
4872 *
4873 * Returns: 0 Success
4874 * EINVAL
4875 * EINTR
4876 * falloc:ENFILE
4877 * falloc:EMFILE
4878 * falloc:ENOMEM
4879 * vn_open_auth:???
4880 * dupfdopen:???
4881 * VNOP_ADVLOCK:???
4882 * vnode_setsize:???
4883 *
4884 * XXX Need to implement uid, gid
4885 */
int
open1(vfs_context_t ctx, struct nameidata *ndp, int uflags,
    struct vnode_attr *vap, fp_initfn_t fp_init, void *initarg, int32_t *retval, int authfd)
{
	proc_t p = vfs_context_proc(ctx);
	kauth_cred_t p_cred = current_cached_proc_cred(PROC_NULL);
	uthread_t uu = get_bsdthread_info(vfs_context_thread(ctx));
	struct fileproc *fp;
	vnode_t vp;
	int flags, oflags, amode;
	int type, indx, error;
	struct vfs_context context;
	vnode_t authvp = NULLVP;

	oflags = uflags;

	amode = oflags & O_ACCMODE;
	/*
	 * Because O_RDONLY is 0, it is not possible to distinguish between
	 * O_EXEC | O_RDONLY and O_EXEC, therefore FEXEC/FSEARCH can't be set together
	 * with FREAD/FWRITE.
	 */
	if ((amode == O_ACCMODE) || (amode && (oflags & O_EXEC))) {
		return EINVAL;
	}

	/* Convert open(2) flags to in-kernel f-flags; encryption flags are
	 * stripped here and re-established by vn_open_auth if applicable. */
	flags = FFLAGS(uflags);
	CLR(flags, FENCRYPTED);
	CLR(flags, FUNENCRYPTED);

	AUDIT_ARG(fflags, oflags);
	AUDIT_ARG(mode, vap->va_mode);

	/* Reserve a file descriptor slot (indx) and fileproc. */
	if ((error = falloc_withinit(p, p_cred, ctx, &fp, &indx, fp_init, initarg)) != 0) {
		return error;
	}
	if (flags & O_CLOEXEC) {
		fp->fp_flags |= FP_CLOEXEC;
	}
	if (flags & O_CLOFORK) {
		fp->fp_flags |= FP_CLOFORK;
	}

	/* setup state to recognize when fdesc_open was called */
	uu->uu_dupfd = -1;

	/*
	 * Disable read/write access if file is opened with O_EVTONLY and
	 * the process has requested to deny read/write access.
	 */
	if ((flags & O_EVTONLY) && proc_disallow_rw_for_o_evtonly(p)) {
		flags &= ~(FREAD | FWRITE);
	}

	/* Optional authentication vnode supplied via authfd. */
	if (authfd != AUTH_OPEN_NOAUTHFD) {
		error = vnode_getfromfd(ctx, authfd, &authvp);
		if (error) {
			fp_free(p, indx, fp);
			return error;
		}
	}

	/* Perform the actual lookup/create/open; returns ni_vp with iocount. */
	if ((error = vn_open_auth(ndp, &flags, vap, authvp))) {
		if (authvp != NULLVP) {
			vnode_put(authvp);
		}
		/* fdesc_open signalled a dup via uu_dupfd (ENODEV/ENXIO). */
		if ((error == ENODEV || error == ENXIO) && (uu->uu_dupfd >= 0)) {
			if ((error = dupfdopen(p, indx, uu->uu_dupfd, flags, error)) == 0) {
				*retval = indx;
				return 0;
			}
		}
		if (error == ERESTART) {
			error = EINTR;
		}
		fp_free(p, indx, fp);
		return error;
	}

	if (authvp != NULLVP) {
		vnode_put(authvp);
	}

	uu->uu_dupfd = 0;
	vp = ndp->ni_vp;

	/* Wire the vnode into the fileglob. */
	fp->fp_glob->fg_flag = flags & (FMASK | O_EVTONLY | FENCRYPTED | FUNENCRYPTED);
	fp->fp_glob->fg_ops = &vnops;
	fp_set_data(fp, vp);

#if CONFIG_FILE_LEASES
	/*
	 * If we are creating a file or open with truncate, we need to break the
	 * lease if there is a read lease placed on the parent dir.
	 */
	if ((vnode_vtype(vp) == VREG) && (flags & (O_CREAT | O_TRUNC))) {
		vnode_breakdirlease(vp, true, oflags);
	}
	/* Now check if there is a lease placed on the file itself. */
	error = vnode_breaklease(vp, oflags, ctx);
	if (error) {
		goto bad;
	}
#endif /* CONFIG_FILE_LEASES */

	/* O_EXLOCK/O_SHLOCK: take a flock(2)-style advisory lock at open. */
	if (flags & (O_EXLOCK | O_SHLOCK)) {
		struct flock lf = {
			.l_whence = SEEK_SET,
		};

		if (flags & O_EXLOCK) {
			lf.l_type = F_WRLCK;
		} else {
			lf.l_type = F_RDLCK;
		}
		type = F_FLOCK;
		/* Block waiting for the lock unless O_NONBLOCK was given. */
		if ((flags & FNONBLOCK) == 0) {
			type |= F_WAIT;
		}
#if CONFIG_MACF
		error = mac_file_check_lock(vfs_context_ucred(ctx), fp->fp_glob,
		    F_SETLK, &lf);
		if (error) {
			goto bad;
		}
#endif
		if ((error = VNOP_ADVLOCK(vp, (caddr_t)fp->fp_glob, F_SETLK, &lf, type, ctx, NULL))) {
			goto bad;
		}
		/* Remember we hold the lock so "bad:" (and close) can undo it. */
		fp->fp_glob->fg_flag |= FWASLOCKED;
	}

	/* try to truncate by setting the size attribute */
	if (flags & O_TRUNC) {
		if ((error = vnode_setsize(vp, (off_t)0, 0, ctx)) != 0) {
			goto bad;
		}
		fp->fp_glob->fg_flag |= FWASWRITTEN;
	}

	/*
	 * For directories we hold some additional information in the fd.
	 */
	if (vnode_vtype(vp) == VDIR) {
		fp->fp_glob->fg_vn_data = fg_vn_data_alloc();
	} else {
		fp->fp_glob->fg_vn_data = NULL;
	}

#if CONFIG_SECLUDED_MEMORY
	/*
	 * Decide whether this file's pages may live in the secluded pool.
	 * Writable files never qualify; otherwise eligibility depends on
	 * the configured policy (apps vs. read-only system binaries).
	 */
	if (secluded_for_filecache && vnode_vtype(vp) == VREG) {
		memory_object_control_t moc;
		const char *v_name;

		moc = ubc_getobject(vp, UBC_FLAGS_NONE);

		if (moc == MEMORY_OBJECT_CONTROL_NULL) {
			/* nothing to do... */
		} else if (fp->fp_glob->fg_flag & FWRITE) {
			/* writable -> no longer eligible for secluded pages */
			memory_object_mark_eligible_for_secluded(moc,
			    FALSE);
		} else if (secluded_for_filecache == SECLUDED_FILECACHE_APPS) {
			char pathname[32] = { 0, };
			size_t copied;
			/* XXX FBDP: better way to detect /Applications/ ? */
			if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
				(void)copyinstr(ndp->ni_dirp,
				    pathname,
				    sizeof(pathname),
				    &copied);
			} else {
				copystr(CAST_DOWN(void *, ndp->ni_dirp),
				    pathname,
				    sizeof(pathname),
				    &copied);
			}
			pathname[sizeof(pathname) - 1] = '\0';
			if (strncmp(pathname,
			    "/Applications/",
			    strlen("/Applications/")) == 0 &&
			    strncmp(pathname,
			    "/Applications/Camera.app/",
			    strlen("/Applications/Camera.app/")) != 0) {
				/*
				 * not writable
				 * AND from "/Applications/"
				 * AND not from "/Applications/Camera.app/"
				 * ==> eligible for secluded
				 */
				memory_object_mark_eligible_for_secluded(moc,
				    TRUE);
			}
		} else if (secluded_for_filecache == SECLUDED_FILECACHE_RDONLY &&
		    (v_name = vnode_getname(vp))) {
			size_t len = strlen(v_name);

			if (!strncmp(v_name, "dyld", len) ||
			    !strncmp(v_name, "launchd", len) ||
			    !strncmp(v_name, "Camera", len) ||
			    !strncmp(v_name, "SpringBoard", len) ||
			    !strncmp(v_name, "backboardd", len) ||
			    !strncmp(v_name, "cameracaptured", len)) {
				/*
				 * This file matters when launching Camera:
				 * do not store its contents in the secluded
				 * pool that will be drained on Camera launch.
				 */
				memory_object_mark_eligible_for_secluded(moc,
				    FALSE);
			} else if (!strncmp(v_name, "audiomxd", len) ||
			    !strncmp(v_name, "mediaplaybackd", len)) {
				memory_object_mark_eligible_for_secluded(moc,
				    FALSE);
				memory_object_mark_for_realtime(moc,
				    true);
			} else if (!strncmp(v_name, "bluetoothd", len)) {
				/*
				 * bluetoothd might be needed for realtime audio
				 * playback.
				 */
				memory_object_mark_eligible_for_secluded(moc,
				    FALSE);
				memory_object_mark_for_realtime(moc,
				    true);
			} else {
				char pathname[64] = { 0, };
				size_t copied;
				if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
					(void)copyinstr(ndp->ni_dirp,
					    pathname,
					    sizeof(pathname),
					    &copied);
				} else {
					copystr(CAST_DOWN(void *, ndp->ni_dirp),
					    pathname,
					    sizeof(pathname),
					    &copied);
				}
				pathname[sizeof(pathname) - 1] = '\0';
				if (strncmp(pathname,
				    "/Library/Audio/Plug-Ins/",
				    strlen("/Library/Audio/Plug-Ins/")) == 0 ||
				    strncmp(pathname,
				    "/System/Library/Audio/Plug-Ins/",
				    strlen("/System/Library/Audio/Plug-Ins/")) == 0) {
					/*
					 * This may be an audio plugin required
					 * for realtime playback.
					 * ==> NOT eligible for secluded.
					 */
					memory_object_mark_eligible_for_secluded(moc,
					    FALSE);
					memory_object_mark_for_realtime(moc,
					    true);
				}
			}
			vnode_putname(v_name);
		}
	}
#endif /* CONFIG_SECLUDED_MEMORY */

	/* Drop the iocount from vn_open_auth; the fileglob keeps the vnode
	 * referenced, so uses of vp below are still valid. */
	vnode_put(vp);

	/*
	 * The first terminal open (without a O_NOCTTY) by a session leader
	 * results in it being set as the controlling terminal.
	 */
	if (vnode_istty(vp) && !(p->p_flag & P_CONTROLT) &&
	    !(flags & O_NOCTTY)) {
		int tmp = 0;

		(void)(*fp->fp_glob->fg_ops->fo_ioctl)(fp, (int)TIOCSCTTY,
		    (caddr_t)&tmp, ctx);
	}

	/* Publish the fd in the process's file table. */
	proc_fdlock(p);
	procfdtbl_releasefd(p, indx, NULL);

	fp_drop(p, indx, fp, 1);
	proc_fdunlock(p);

	*retval = indx;

	return 0;
bad:
	/* Error after the open succeeded: unlock (if locked), close, free fd. */
	context = *vfs_context_current();
	context.vc_ucred = fp->fp_glob->fg_cred;

	if ((fp->fp_glob->fg_flag & FWASLOCKED) &&
	    (FILEGLOB_DTYPE(fp->fp_glob) == DTYPE_VNODE)) {
		struct flock lf = {
			.l_whence = SEEK_SET,
			.l_type = F_UNLCK,
		};

		(void)VNOP_ADVLOCK(
			vp, (caddr_t)fp->fp_glob, F_UNLCK, &lf, F_FLOCK, ctx, NULL);
	}

	/* vn_close with the fileglob's credential, then drop the iocount. */
	vn_close(vp, fp->fp_glob->fg_flag, &context);
	vnode_put(vp);
	fp_free(p, indx, fp);

	return error;
}
5192
5193 /*
5194 * While most of the *at syscall handlers can call nameiat() which
5195 * is a wrapper around namei, the use of namei and initialisation
5196 * of nameidata are far removed and in different functions - namei
5197 * gets called in vn_open_auth for open1. So we'll just do here what
5198 * nameiat() does.
5199 */
static int
open1at(vfs_context_t ctx, struct nameidata *ndp, int uflags,
    struct vnode_attr *vap, fp_initfn_t fp_init, void *initarg, int32_t *retval,
    int dirfd, int authfd)
{
	/*
	 * Mirror nameiat(): when a real dirfd is given and no starting dvp
	 * has been supplied, resolve relative paths from dirfd's directory.
	 */
	if ((dirfd != AT_FDCWD) && !(ndp->ni_cnd.cn_flags & USEDVP)) {
		int error;
		char c;

		/* Peek at the first byte of the path to detect absolute paths. */
		if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
			error = copyin(ndp->ni_dirp, &c, sizeof(char));
			if (error) {
				return error;
			}
		} else {
			c = *((char *)(ndp->ni_dirp));
		}

		/* Absolute paths ignore dirfd; fall through to plain open1(). */
		if (c != '/') {
			vnode_t dvp_at;

			/* Takes an iocount on the fd's vnode on success. */
			error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
			    &dvp_at);
			if (error) {
				return error;
			}

			if (vnode_vtype(dvp_at) != VDIR) {
				vnode_put(dvp_at);
				return ENOTDIR;
			}

			/*
			 * Unlike nameiat(), USEDVP remains set across open1()
			 * (namei happens inside vn_open_auth); the iocount is
			 * dropped once open1() returns.
			 */
			ndp->ni_dvp = dvp_at;
			ndp->ni_cnd.cn_flags |= USEDVP;
			error = open1(ctx, ndp, uflags, vap, fp_init, initarg,
			    retval, authfd);
			vnode_put(dvp_at);
			return error;
		}
	}

	return open1(ctx, ndp, uflags, vap, fp_init, initarg, retval, authfd);
}
5243
5244 /*
5245 * open_extended: open a file given a path name; with extended argument list (including extended security (ACL)).
5246 *
5247 * Parameters: p Process requesting the open
5248 * uap User argument descriptor (see below)
5249 * retval Pointer to an area to receive the
 * return value from the system call
5251 *
5252 * Indirect: uap->path Path to open (same as 'open')
5253 * uap->flags Flags to open (same as 'open'
5254 * uap->uid UID to set, if creating
5255 * uap->gid GID to set, if creating
5256 * uap->mode File mode, if creating (same as 'open')
5257 * uap->xsecurity ACL to set, if creating
5258 *
5259 * Returns: 0 Success
5260 * !0 errno value
5261 *
5262 * Notes: The kauth_filesec_t in 'va', if any, is in host byte order.
5263 *
 * XXX: We should enumerate the possible errno values here, and where
5265 * in the code they originated.
5266 */
int
open_extended(proc_t p, struct open_extended_args *uap, int32_t *retval)
{
	int ciferror;
	kauth_filesec_t xsecdst;
	struct vnode_attr va;
	struct nameidata nd;
	int cmode;

	AUDIT_ARG(owner, uap->uid, uap->gid);

	/* Copy in the optional ACL (host byte order after this call). */
	xsecdst = NULL;
	if ((uap->xsecurity != USER_ADDR_NULL) &&
	    ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)) {
		return ciferror;
	}

	/* Build creation attributes: mode (umask applied), owner, ACL. */
	VATTR_INIT(&va);
	cmode = ((uap->mode & ~p->p_fd.fd_cmask) & ALLPERMS) & ~S_ISTXT;
	VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
	if (uap->uid != KAUTH_UID_NONE) {
		VATTR_SET(&va, va_uid, uap->uid);
	}
	if (uap->gid != KAUTH_GID_NONE) {
		VATTR_SET(&va, va_gid, uap->gid);
	}
	if (xsecdst != NULL) {
		VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
		va.va_vaflags |= VA_FILESEC_ACL;
	}

	NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
	    uap->path, vfs_context_current());

	ciferror = open1(vfs_context_current(), &nd, uap->flags, &va,
	    NULL, NULL, retval, AUTH_OPEN_NOAUTHFD);
	/* open1() copies what it needs; the filesec can be freed now. */
	if (xsecdst != NULL) {
		kauth_filesec_free(xsecdst);
	}

	return ciferror;
}
5309
5310 /*
5311 * Go through the data-protected atomically controlled open (2)
5312 *
5313 * int open_dprotected_np(user_addr_t path, int flags, int class, int dpflags, int mode)
5314 */
5315 static int
openat_dprotected_internal(vfs_context_t ctx,user_addr_t path,int flags,int mode,int class,int dpflags,int fd,int authfd,enum uio_seg segflg,int * retval)5316 openat_dprotected_internal(vfs_context_t ctx, user_addr_t path, int flags, int mode,
5317 int class, int dpflags, int fd, int authfd, enum uio_seg segflg, int *retval)
5318 {
5319 /*
5320 * Follow the same path as normal open(2)
5321 * Look up the item if it exists, and acquire the vnode.
5322 */
5323 struct vnode_attr va;
5324 struct nameidata nd;
5325 int cmode;
5326 int error;
5327 struct filedesc *fdp = &vfs_context_proc(ctx)->p_fd;
5328
5329 VATTR_INIT(&va);
5330 /* Mask off all but regular access permissions */
5331 cmode = ((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
5332 VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
5333
5334 NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, segflg,
5335 path, ctx);
5336
5337 /*
5338 * Initialize the extra fields in vnode_attr to pass down our
5339 * extra fields.
5340 * 1. target cprotect class.
5341 * 2. set a flag to mark it as requiring open-raw-encrypted semantics.
5342 */
5343 if (flags & O_CREAT) {
5344 /* lower level kernel code validates that the class is valid before applying it. */
5345 if (class != PROTECTION_CLASS_DEFAULT) {
5346 /*
5347 * PROTECTION_CLASS_DEFAULT implies that we make the class for this
5348 * file behave the same as open (2)
5349 */
5350 VATTR_SET(&va, va_dataprotect_class, class);
5351 }
5352 }
5353
5354 if (dpflags & (O_DP_GETRAWENCRYPTED | O_DP_GETRAWUNENCRYPTED | O_DP_AUTHENTICATE)) {
5355 if (flags & (O_RDWR | O_WRONLY)) {
5356 /*
5357 * Not allowed to write raw encrypted bytes or when opening authenticated.
5358 */
5359 return EINVAL;
5360 }
5361 if (dpflags & O_DP_GETRAWENCRYPTED) {
5362 VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWENCRYPTED);
5363 }
5364 if (dpflags & O_DP_GETRAWUNENCRYPTED) {
5365 VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWUNENCRYPTED);
5366 }
5367 if (dpflags & O_DP_AUTHENTICATE) {
5368 VATTR_SET(&va, va_dataprotect_flags, VA_DP_AUTHENTICATE);
5369 }
5370 }
5371
5372 error = open1at(vfs_context_current(), &nd, flags, &va,
5373 NULL, NULL, retval, fd, authfd);
5374
5375 return error;
5376 }
5377
int
openat_dprotected_np(__unused proc_t p, struct openat_dprotected_np_args *uap, int32_t *retval)
{
	/* Authenticated opens may not also create the file. */
	if ((uap->dpflags & O_DP_AUTHENTICATE) && (uap->flags & O_CREAT)) {
		return EINVAL;
	}

	return openat_dprotected_internal(vfs_context_current(), uap->path, uap->flags, uap->mode,
	    uap->class, uap->dpflags, uap->fd, uap->authfd, UIO_USERSPACE, retval);
}
5388
int
open_dprotected_np(__unused proc_t p, struct open_dprotected_np_args *uap, int32_t *retval)
{
	/* O_DP_AUTHENTICATE requires the *at variant (needs an authfd). */
	if (uap->dpflags & O_DP_AUTHENTICATE) {
		return EINVAL;
	}

	return openat_dprotected_internal(vfs_context_current(), uap->path, uap->flags, uap->mode,
	    uap->class, uap->dpflags, AT_FDCWD, AUTH_OPEN_NOAUTHFD, UIO_USERSPACE, retval);
}
5399
static int
openat_internal(vfs_context_t ctx, user_addr_t path, int flags, int mode,
    int fd, enum uio_seg segflg, int *retval, uint64_t *objidp, fsid_t *fsidp)
{
	struct filedesc *fdp = &vfs_context_proc(ctx)->p_fd;
	/*
	 * vnode_attr + nameidata are large; heap-allocate them together to
	 * keep this frame off the kernel stack.
	 */
	struct {
		struct vnode_attr va;
		struct nameidata nd;
	} *__open_data;
	struct vnode_attr *vap;
	struct nameidata *ndp;
	int cmode;
	int error;

	__open_data = kalloc_type(typeof(*__open_data), Z_WAITOK);
	vap = &__open_data->va;
	ndp = &__open_data->nd;

	VATTR_INIT(vap);
	/* Mask off all but regular access permissions */
	cmode = ((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
	VATTR_SET(vap, va_mode, cmode & ACCESSPERMS);

	/* Check for fileid and fsid authentication */
	if (objidp || fsidp) {
		/* Both must be supplied together. */
		if (!objidp || !fsidp) {
			error = EINVAL;
			goto out;
		}
		VATTR_SET(vap, va_flags, VA_VAFILEID);
		VATTR_SET(vap, va_fileid, *objidp);
		VATTR_SET(vap, va_fsid64, *fsidp);
	}

	NDINIT(ndp, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1,
	    segflg, path, ctx);

	error = open1at(ctx, ndp, flags, vap, NULL, NULL, retval, fd, AUTH_OPEN_NOAUTHFD);

out:
	kfree_type(typeof(*__open_data), __open_data);

	return error;
}
5444
int
open(proc_t p, struct open_args *uap, int32_t *retval)
{
	/* open(2) is a cancellation point; check before doing any work. */
	__pthread_testcancel(1);
	return open_nocancel(p, (struct open_nocancel_args *)uap, retval);
}
5451
int
open_nocancel(__unused proc_t p, struct open_nocancel_args *uap,
    int32_t *retval)
{
	/* Plain open(2): relative paths resolve from the cwd (AT_FDCWD). */
	return openat_internal(vfs_context_current(), uap->path, uap->flags,
	           uap->mode, AT_FDCWD, UIO_USERSPACE, retval, NULL, NULL);
}
5459
int
openat_nocancel(__unused proc_t p, struct openat_nocancel_args *uap,
    int32_t *retval)
{
	/* openat(2): relative paths resolve from the directory open on uap->fd. */
	return openat_internal(vfs_context_current(), uap->path, uap->flags,
	           uap->mode, uap->fd, UIO_USERSPACE, retval, NULL, NULL);
}
5467
5468 int
openat(proc_t p,struct openat_args * uap,int32_t * retval)5469 openat(proc_t p, struct openat_args *uap, int32_t *retval)
5470 {
5471 __pthread_testcancel(1);
5472 return openat_nocancel(p, (struct openat_nocancel_args *)uap, retval);
5473 }
5474
5475 #define OPEN_BY_ID_ENTITLEMENT "com.apple.private.vfs.open-by-id"
5476
5477 static boolean_t
vfs_context_can_open_by_id(vfs_context_t ctx)5478 vfs_context_can_open_by_id(vfs_context_t ctx)
5479 {
5480 if (csproc_get_platform_binary(vfs_context_proc(ctx))) {
5481 return TRUE;
5482 }
5483
5484 return IOTaskHasEntitlement(vfs_context_task(ctx),
5485 OPEN_BY_ID_ENTITLEMENT);
5486 }
5487
5488 #define MAX_OPENBYID_NP_RETRIES 10
5489
5490 /*
5491 * openbyid_np: open a file given a file system id and a file system object id
5492 * the hfs file system object id is an fsobj_id_t {uint32, uint32}
5493 * file systems that don't support object ids it is a node id (uint64_t).
5494 *
5495 * Parameters: p Process requesting the open
5496 * uap User argument descriptor (see below)
5497 * retval Pointer to an area to receive the
 *					return value from the system call
5499 *
5500 * Indirect: uap->path Path to open (same as 'open')
5501 *
5502 * uap->fsid id of target file system
5503 * uap->objid id of target file system object
5504 * uap->flags Flags to open (same as 'open')
5505 *
5506 * Returns: 0 Success
5507 * !0 errno value
5508 *
5509 *
 * XXX:	We should enumerate the possible errno values here, and where
 *	in the code they originated.
5512 */
int
openbyid_np(__unused proc_t p, struct openbyid_np_args *uap, int *retval)
{
	fsid_t fsid;
	uint64_t objid;
	int fd;
	int error;
	int retry_count = 0;	/* bounds the ERECYCLE redrive loop */
	char *buf = NULL;	/* holds the path resolved from fsid/objid */
	int buflen = MAXPATHLEN;
	int pathlen = 0;
	vfs_context_t ctx = vfs_context_current();

	/* Restricted to platform binaries and entitled tasks. */
	if (!vfs_context_can_open_by_id(ctx)) {
		return EPERM;
	}

	if ((error = copyin(uap->fsid, (caddr_t)&fsid, sizeof(fsid)))) {
		return error;
	}

	/* uap->objid is an fsobj_id_t defined as struct {uint32_t, uint32_t} */
	if ((error = copyin(uap->objid, (caddr_t)&objid, sizeof(uint64_t)))) {
		return error;
	}

	AUDIT_ARG(value32, fsid.val[0]);
	AUDIT_ARG(value64, objid);

retry:
	/* Reset per-attempt state; we can come back here on ERECYCLE. */
	fd = -1;
	error = 0;
	buf = NULL;
	pathlen = 0;
	buflen = MAXPATHLEN;

	/* resolve path from fsid, objid */
	do {
		buf = kalloc_data(buflen + 1, Z_WAITOK);
		if (buf == NULL) {
			return ENOMEM;
		}

		error = fsgetpath_internal( ctx, fsid.val[0], objid, buflen,
		    buf, FSOPT_ISREALFSID, &pathlen);

		if (error) {
			kfree_data(buf, buflen + 1);
			buf = NULL;
		}
		/* Grow the buffer by MAXPATHLEN and retry while it doesn't fit. */
	} while (error == ENOSPC && (buflen += MAXPATHLEN));

	if (error) {
		return error;
	}

	buf[pathlen] = 0;

	/*
	 * Open by the resolved path; passing &objid/&fsid asks
	 * openat_internal() to verify the opened vnode still matches
	 * the requested ids (it reports ERECYCLE when it doesn't).
	 */
	error = openat_internal(
		ctx, (user_addr_t)buf, uap->oflags, 0, AT_FDCWD, UIO_SYSSPACE, &fd, &objid, &fsid);

	kfree_data(buf, buflen + 1);

	/* Ensure the correct file is opened */
	if (error == ERECYCLE) {
		if (retry_count < MAX_OPENBYID_NP_RETRIES) {
			retry_count += 1;
			goto retry;
		} else {
			printf("openbyid_np() retry limit due to ERECYCLE reached\n");
			error = ENOENT;
		}
	}

	if (!error) {
		*retval = fd;
	}

	return error;
}
5593
5594
5595 /*
5596 * Create a special file.
5597 */
5598 static int mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap,
5599 int fd);
5600
/*
 * Common backend for mknod(2)/mknodat(2): create a character or block
 * device node.  FIFO requests are diverted to mkfifo1(); anything other
 * than S_IFCHR/S_IFBLK/S_IFIFO is rejected.  Requires superuser.
 * 'vap' carries the mode/rdev attributes the caller prepared.
 */
static int
mknodat_internal(proc_t p, user_addr_t upath, struct vnode_attr *vap,
    mode_t mode, int fd)
{
	vfs_context_t ctx = vfs_context_current();
	struct nameidata nd;
	vnode_t vp, dvp;
	int error;

	/* If it's a mknod() of a FIFO, call mkfifo1() instead */
	if ((mode & S_IFMT) == S_IFIFO) {
		return mkfifo1(ctx, upath, vap, fd);
	}

	AUDIT_ARG(mode, mode);
	AUDIT_ARG(value32, vap->va_rdev);

	/* Creating device nodes requires superuser privilege. */
	if ((error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
		return error;
	}
	NDINIT(&nd, CREATE, OP_MKNOD, LOCKPARENT | AUDITVNPATH1,
	    UIO_USERSPACE, upath, ctx);
	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* The target name must not already exist. */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}

	/* Only character and block devices remain valid at this point. */
	switch (mode & S_IFMT) {
	case S_IFCHR:
		VATTR_SET(vap, va_type, VCHR);
		break;
	case S_IFBLK:
		VATTR_SET(vap, va_type, VBLK);
		break;
	default:
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_create(ctx,
	    nd.ni_dvp, &nd.ni_cnd, vap);
	if (error) {
		goto out;
	}
#endif

	/* Authorize adding an entry to the parent directory. */
	if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
		goto out;
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

	if ((error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx)) != 0) {
		goto out;
	}

	if (vp) {
		int update_flags = 0;

		// Make sure the name & parent pointers are hooked up
		if (vp->v_name == NULL) {
			update_flags |= VNODE_UPDATE_NAME;
		}
		if (vp->v_parent == NULLVP) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		if (update_flags) {
			vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
		}

#if CONFIG_FSE
		add_fsevent(FSE_CREATE_FILE, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
#endif
	}

out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	vnode_put(dvp);

	return error;
}
5703
5704 int
mknod(proc_t p,struct mknod_args * uap,__unused int32_t * retval)5705 mknod(proc_t p, struct mknod_args *uap, __unused int32_t *retval)
5706 {
5707 struct vnode_attr va;
5708
5709 VATTR_INIT(&va);
5710 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5711 VATTR_SET(&va, va_rdev, uap->dev);
5712
5713 return mknodat_internal(p, uap->path, &va, (mode_t)uap->mode, AT_FDCWD);
5714 }
5715
5716 int
mknodat(proc_t p,struct mknodat_args * uap,__unused int32_t * retval)5717 mknodat(proc_t p, struct mknodat_args *uap, __unused int32_t *retval)
5718 {
5719 struct vnode_attr va;
5720
5721 VATTR_INIT(&va);
5722 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5723 VATTR_SET(&va, va_rdev, uap->dev);
5724
5725 return mknodat_internal(p, uap->path, &va, (mode_t)uap->mode, uap->fd);
5726 }
5727
5728 /*
5729 * Create a named pipe.
5730 *
5731 * Returns: 0 Success
5732 * EEXIST
5733 * namei:???
5734 * vnode_authorize:???
5735 * vn_create:???
5736 */
static int
mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap, int fd)
{
	/*
	 * Create a FIFO at 'upath' (user-space path, relative to 'fd')
	 * with the attributes in 'vap'; va_type is forced to VFIFO here.
	 */
	vnode_t vp, dvp;
	int error;
	struct nameidata nd;

	NDINIT(&nd, CREATE, OP_MKFIFO, LOCKPARENT | AUDITVNPATH1,
	    UIO_USERSPACE, upath, ctx);
	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* check that this is a new file and authorize addition */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}
	VATTR_SET(vap, va_type, VFIFO);

	if ((error = vn_authorize_create(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0) {
		goto out;
	}

	error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx);
out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	vnode_put(dvp);

	return error;
}
5779
5780
5781 /*
5782 * mkfifo_extended: Create a named pipe; with extended argument list (including extended security (ACL)).
5783 *
5784 * Parameters: p Process requesting the open
5785 * uap User argument descriptor (see below)
5786 * retval (Ignored)
5787 *
5788 * Indirect: uap->path Path to fifo (same as 'mkfifo')
5789 * uap->uid UID to set
5790 * uap->gid GID to set
5791 * uap->mode File mode to set (same as 'mkfifo')
5792 * uap->xsecurity ACL to set, if creating
5793 *
5794 * Returns: 0 Success
5795 * !0 errno value
5796 *
5797 * Notes: The kauth_filesec_t in 'va', if any, is in host byte order.
5798 *
 * XXX:	We should enumerate the possible errno values here, and where
 *	in the code they originated.
5801 */
5802 int
mkfifo_extended(proc_t p,struct mkfifo_extended_args * uap,__unused int32_t * retval)5803 mkfifo_extended(proc_t p, struct mkfifo_extended_args *uap, __unused int32_t *retval)
5804 {
5805 int ciferror;
5806 kauth_filesec_t xsecdst;
5807 struct vnode_attr va;
5808
5809 AUDIT_ARG(owner, uap->uid, uap->gid);
5810
5811 xsecdst = KAUTH_FILESEC_NONE;
5812 if (uap->xsecurity != USER_ADDR_NULL) {
5813 if ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0) {
5814 return ciferror;
5815 }
5816 }
5817
5818 VATTR_INIT(&va);
5819 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5820 if (uap->uid != KAUTH_UID_NONE) {
5821 VATTR_SET(&va, va_uid, uap->uid);
5822 }
5823 if (uap->gid != KAUTH_GID_NONE) {
5824 VATTR_SET(&va, va_gid, uap->gid);
5825 }
5826 if (xsecdst != KAUTH_FILESEC_NONE) {
5827 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
5828 va.va_vaflags |= VA_FILESEC_ACL;
5829 }
5830
5831 ciferror = mkfifo1(vfs_context_current(), uap->path, &va, AT_FDCWD);
5832
5833 if (xsecdst != KAUTH_FILESEC_NONE) {
5834 kauth_filesec_free(xsecdst);
5835 }
5836 return ciferror;
5837 }
5838
5839 /* ARGSUSED */
5840 int
mkfifo(proc_t p,struct mkfifo_args * uap,__unused int32_t * retval)5841 mkfifo(proc_t p, struct mkfifo_args *uap, __unused int32_t *retval)
5842 {
5843 struct vnode_attr va;
5844
5845 VATTR_INIT(&va);
5846 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5847
5848 return mkfifo1(vfs_context_current(), uap->path, &va, AT_FDCWD);
5849 }
5850
5851 int
mkfifoat(proc_t p,struct mkfifoat_args * uap,__unused int32_t * retval)5852 mkfifoat(proc_t p, struct mkfifoat_args *uap, __unused int32_t *retval)
5853 {
5854 struct vnode_attr va;
5855
5856 VATTR_INIT(&va);
5857 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5858
5859 return mkfifo1(vfs_context_current(), uap->path, &va, uap->fd);
5860 }
5861
5862 extern int safe_getpath_new(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path, int firmlink);
5863 extern int safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path);
5864 extern int safe_getpath_no_firmlink(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path);
5865
/*
 * Build a best-effort path for 'dvp' (optionally with 'leafname'
 * appended) into 'path'.  Never fails outright: on lookup errors it
 * walks up the parent chain until some ancestor's path fits, falling
 * back to the mount point or "/".  Sets *truncated_path when the
 * result is incomplete.  Returns the length of the string INCLUDING
 * the trailing NUL.  'firmlink' selects whether firmlinks are
 * followed during path construction.
 */
int
safe_getpath_new(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path, int firmlink)
{
	int ret, len = _len;

	*truncated_path = 0;

	if (firmlink) {
		ret = vn_getpath(dvp, path, &len);
	} else {
		ret = vn_getpath_no_firmlink(dvp, path, &len);
	}
	if (ret == 0 && len < (MAXPATHLEN - 1)) {
		if (leafname) {
			/* Overwrite the NUL with '/', then append the leaf. */
			path[len - 1] = '/';
			len += strlcpy(&path[len], leafname, MAXPATHLEN - len) + 1;
			if (len > MAXPATHLEN) {
				char *ptr;

				// the string got truncated!
				*truncated_path = 1;
				ptr = strrchr(path, '/');
				if (ptr) {
					*ptr = '\0'; // chop off the string at the last directory component
				}
				len = (int)strlen(path) + 1;
			}
		}
	} else if (ret == 0) {
		/* Directory path fit, but no room left for a leaf name. */
		*truncated_path = 1;
	} else if (ret != 0) {
		struct vnode *mydvp = dvp;

		if (ret != ENOSPC) {
			printf("safe_getpath: failed to get the path for vp %p (%s) : err %d\n",
			    dvp, dvp->v_name ? dvp->v_name : "no-name", ret);
		}
		*truncated_path = 1;

		/*
		 * Couldn't resolve the full path: climb the parent chain
		 * until some ancestor resolves, else use the mount point
		 * or "/" as a last resort.
		 */
		do {
			if (mydvp->v_parent != NULL) {
				mydvp = mydvp->v_parent;
			} else if (mydvp->v_mount) {
				strlcpy(path, mydvp->v_mount->mnt_vfsstat.f_mntonname, _len);
				break;
			} else {
				// no parent and no mount point?  only thing is to punt and say "/" changed
				strlcpy(path, "/", _len);
				len = 2;
				mydvp = NULL;
			}

			if (mydvp == NULL) {
				break;
			}

			len = _len;
			if (firmlink) {
				ret = vn_getpath(mydvp, path, &len);
			} else {
				ret = vn_getpath_no_firmlink(mydvp, path, &len);
			}
		} while (ret == ENOSPC);
	}

	return len;
}
5933
int
safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path)
{
	/* Firmlink-following flavor of the safe path builder. */
	return safe_getpath_new(dvp, leafname, path, _len, truncated_path,
	    1 /* firmlink */);
}
5939
int
safe_getpath_no_firmlink(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path)
{
	/* Non-firmlink flavor of the safe path builder. */
	return safe_getpath_new(dvp, leafname, path, _len, truncated_path,
	    0 /* firmlink */);
}
5945
5946 /*
5947 * Make a hard file link.
5948 *
5949 * Returns: 0 Success
5950 * EPERM
5951 * EEXIST
5952 * EXDEV
5953 * namei:???
5954 * vnode_authorize:???
5955 * VNOP_LINK:???
5956 */
5957 /* ARGSUSED */
/*
 * Common backend for link(2)/linkat(2): create a hard link to the
 * object named by (fd1, path) at the name (fd2, link).  'flag' carries
 * AT_SYMLINK_FOLLOW / AT_SYMLINK_NOFOLLOW_ANY / AT_RESOLVE_BENEATH /
 * AT_UNIQUE.  Retries the whole operation on ERECYCLE-style races
 * (VNOP_LINK returning ENOENT), up to MAX_LINK_ENOENT_RETRIES times.
 */
static int
linkat_internal(vfs_context_t ctx, int fd1, user_addr_t path, int fd2,
    user_addr_t link, int flag, enum uio_seg segflg)
{
	vnode_t vp, pvp, dvp, lvp;
	struct nameidata nd;
	int follow;
	int error;
#if CONFIG_FSE
	fse_info finfo;
#endif
	char *target_path = NULL;
	char *no_firmlink_path = NULL;
	vnode_t locked_vp = NULLVP;	/* vp while holding its link lock */
	int truncated = 0;
	int truncated_no_firmlink_path = 0;
	int num_retries = 0;
	int need_event, has_listeners, need_kpath2;
	bool do_retry;

	/* look up the object we are linking to */
	follow = (flag & AT_SYMLINK_FOLLOW) ? FOLLOW : NOFOLLOW;

retry:
	do_retry = false;
	vp = dvp = lvp = NULLVP;
	NDINIT(&nd, LOOKUP, OP_LOOKUP, AUDITVNPATH1 | follow,
	    segflg, path, ctx);
	if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}
	if (flag & AT_RESOLVE_BENEATH) {
		nd.ni_flag |= NAMEI_RESOLVE_BENEATH;
	}
	if (flag & AT_UNIQUE) {
		nd.ni_flag |= NAMEI_UNIQUE;
	}

	error = nameiat(&nd, fd1);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;

	nameidone(&nd);

	/*
	 * Normally, linking to directories is not supported.
	 * However, some file systems may have limited support.
	 */
	if (vp->v_type == VDIR) {
		if (!ISSET(vp->v_mount->mnt_kern_flag, MNTK_DIR_HARDLINKS)) {
			error = EPERM;   /* POSIX */
			goto out;
		}

		/* Linking to a directory requires ownership. */
		if (!kauth_cred_issuser(vfs_context_ucred(ctx))) {
			struct vnode_attr dva;

			VATTR_INIT(&dva);
			VATTR_WANTED(&dva, va_uid);
			if (vnode_getattr(vp, &dva, ctx) != 0 ||
			    !VATTR_IS_SUPPORTED(&dva, va_uid) ||
			    (dva.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)))) {
				error = EACCES;
				goto out;
			}
		}
	}

	/* lookup the target node; nd is reused with CREATE semantics */
#if CONFIG_TRIGGERS
	nd.ni_op = OP_LINK;
#endif
	nd.ni_cnd.cn_nameiop = CREATE;
	nd.ni_flag &= ~NAMEI_UNIQUE;
	nd.ni_cnd.cn_flags = LOCKPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK;
	nd.ni_dirp = link;
	error = nameiat(&nd, fd2);
	if (error != 0) {
		goto out;
	}
	dvp = nd.ni_dvp;
	lvp = nd.ni_vp;

	/* Hold the link lock on vp across the checks and VNOP_LINK. */
	assert(locked_vp == NULLVP);
	vnode_link_lock(vp);
	locked_vp = vp;

#if CONFIG_MACF
	if ((error = mac_vnode_check_link(ctx, dvp, vp, &nd.ni_cnd)) != 0) {
		goto out2;
	}
#endif

	/* or to anything that kauth doesn't want us to (eg. immutable items) */
	if ((error = vnode_authorize(vp, NULL, KAUTH_VNODE_LINKTARGET, ctx)) != 0) {
		goto out2;
	}

	/* target node must not exist */
	if (lvp != NULLVP) {
		error = EEXIST;
		goto out2;
	}
	/* cannot link across mountpoints */
	if (vnode_mount(vp) != vnode_mount(dvp)) {
		error = EXDEV;
		goto out2;
	}

	/* authorize creation of the target node */
	if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
		goto out2;
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

	/* and finally make the link */
	error = VNOP_LINK(vp, dvp, &nd.ni_cnd, ctx);
	if (error) {
		/* ENOENT can be a transient race; redrive the whole lookup. */
		if (error == ENOENT && num_retries < MAX_LINK_ENOENT_RETRIES) {
			do_retry = true;
			num_retries += 1;
		}
		goto out2;
	}

#if CONFIG_MACF
	(void)mac_vnode_notify_link(ctx, vp, dvp, &nd.ni_cnd);
#endif

	/* The vnode now has another name: clear the not-a-hardlink hint. */
	os_atomic_andnot(&vp->v_ext_flag, VE_NOT_HARDLINK, relaxed);

	assert(locked_vp == vp);
	vnode_link_unlock(locked_vp);
	locked_vp = NULLVP;

#if CONFIG_FSE
	need_event = need_fsevent(FSE_CREATE_FILE, dvp);
#else
	need_event = 0;
#endif
	has_listeners = kauth_authorize_fileop_has_listeners();

	need_kpath2 = 0;
#if CONFIG_AUDIT
	if (AUDIT_RECORD_EXISTS()) {
		need_kpath2 = 1;
	}
#endif

	/* Post-link notifications: fsevents, kauth listeners, audit. */
	if (need_event || has_listeners || need_kpath2) {
		char *link_to_path = NULL;
		int len, link_name_len;
		int len_no_firmlink_path = 0;

		/* build the path to the new link file */
		GET_PATH(target_path);

		len = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, target_path, MAXPATHLEN, &truncated);
		if (no_firmlink_path == NULL) {
			GET_PATH(no_firmlink_path);
		}
		len_no_firmlink_path = safe_getpath_no_firmlink(dvp, nd.ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);

		AUDIT_ARG(kpath, target_path, ARG_KPATH2);

		if (has_listeners) {
			/* build the path to file we are linking to */
			GET_PATH(link_to_path);

			link_name_len = MAXPATHLEN;
			if (vn_getpath(vp, link_to_path, &link_name_len) == 0) {
				/*
				 * Call out to allow 3rd party notification of rename.
				 * Ignore result of kauth_authorize_fileop call.
				 */
				kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_LINK,
				    (uintptr_t)link_to_path,
				    (uintptr_t)target_path);
			}
			if (link_to_path != NULL) {
				RELEASE_PATH(link_to_path);
			}
		}
#if CONFIG_FSE
		if (need_event) {
			/* construct fsevent */
			if (get_fse_info(vp, &finfo, ctx) == 0) {
				if (truncated_no_firmlink_path) {
					finfo.mode |= FSE_TRUNCATED_PATH;
				}

				// build the path to the destination of the link
				add_fsevent(FSE_CREATE_FILE, ctx,
				    FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
				    FSE_ARG_FINFO, &finfo,
				    FSE_ARG_DONE);
			}

			pvp = vp->v_parent;
			// need an iocount on parent vnode in this case
			if (pvp && pvp != dvp) {
				pvp = vnode_getparent_if_different(vp, dvp);
			}
			if (pvp) {
				add_fsevent(FSE_STAT_CHANGED, ctx,
				    FSE_ARG_VNODE, pvp, FSE_ARG_DONE);
			}
			if (pvp && pvp != dvp) {
				vnode_put(pvp);
			}
		}
#endif
	}
out2:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);
	if (target_path != NULL) {
		RELEASE_PATH(target_path);
		target_path = NULL;
	}
	if (no_firmlink_path != NULL) {
		RELEASE_PATH(no_firmlink_path);
		no_firmlink_path = NULL;
	}
out:
	/* Drop the link lock if an error path left it held. */
	if (locked_vp) {
		assert(locked_vp == vp);
		vnode_link_unlock(locked_vp);
		locked_vp = NULLVP;
	}
	if (lvp) {
		vnode_put(lvp);
	}
	if (dvp) {
		vnode_put(dvp);
	}
	vnode_put(vp);

	if (do_retry) {
		goto retry;
	}

	return error;
}
6211
6212 int
link(__unused proc_t p,struct link_args * uap,__unused int32_t * retval)6213 link(__unused proc_t p, struct link_args *uap, __unused int32_t *retval)
6214 {
6215 return linkat_internal(vfs_context_current(), AT_FDCWD, uap->path,
6216 AT_FDCWD, uap->link, AT_SYMLINK_FOLLOW, UIO_USERSPACE);
6217 }
6218
6219 int
linkat(__unused proc_t p,struct linkat_args * uap,__unused int32_t * retval)6220 linkat(__unused proc_t p, struct linkat_args *uap, __unused int32_t *retval)
6221 {
6222 if (uap->flag & ~(AT_SYMLINK_FOLLOW | AT_SYMLINK_NOFOLLOW_ANY | AT_RESOLVE_BENEATH | AT_UNIQUE)) {
6223 return EINVAL;
6224 }
6225
6226 return linkat_internal(vfs_context_current(), uap->fd1, uap->path,
6227 uap->fd2, uap->link, uap->flag, UIO_USERSPACE);
6228 }
6229
6230 /*
6231 * Make a symbolic link.
6232 *
6233 * We could add support for ACLs here too...
6234 */
6235 /* ARGSUSED */
/*
 * Common backend for symlink(2)/symlinkat(2): create a symbolic link
 * named (fd, link) whose contents are the string at 'path_data'.
 * 'segflg' says whether both addresses are user- or kernel-space.
 *
 * Note the error-accumulation style: most steps only run while
 * 'error' is still 0, so a failure short-circuits the rest.
 */
static int
symlinkat_internal(vfs_context_t ctx, user_addr_t path_data, int fd,
    user_addr_t link, enum uio_seg segflg)
{
	struct vnode_attr va;
	char *path;	/* the link contents (target string) */
	int error;
	struct nameidata nd;
	vnode_t vp, dvp;
	size_t dummy = 0;
	proc_t p;

	error = 0;
	/* Copy the link-target string in from user space if necessary. */
	if (UIO_SEG_IS_USER_SPACE(segflg)) {
		path = zalloc(ZV_NAMEI);
		error = copyinstr(path_data, path, MAXPATHLEN, &dummy);
	} else {
		path = (char *)path_data;
	}
	if (error) {
		goto out;
	}
	AUDIT_ARG(text, path);  /* This is the link string */

	NDINIT(&nd, CREATE, OP_SYMLINK, LOCKPARENT | AUDITVNPATH1,
	    segflg, link, ctx);

	error = nameiat(&nd, fd);
	if (error) {
		goto out;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	p = vfs_context_proc(ctx);
	VATTR_INIT(&va);
	VATTR_SET(&va, va_type, VLNK);
	/* Symlink mode: access permissions filtered through the umask. */
	VATTR_SET(&va, va_mode, ACCESSPERMS & ~p->p_fd.fd_cmask);

#if CONFIG_MACF
	error = mac_vnode_check_create(ctx,
	    dvp, &nd.ni_cnd, &va);
#endif
	if (error != 0) {
		goto skipit;
	}

	/* The link name must not already exist. */
	if (vp != NULL) {
		error = EEXIST;
		goto skipit;
	}

	/* authorize */
	if (error == 0) {
		error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx);
	}
	/* get default ownership, etc. */
	if (error == 0) {
		error = vnode_authattr_new(dvp, &va, 0, ctx);
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

	if (error == 0) {
		error = VNOP_SYMLINK(dvp, &vp, &nd.ni_cnd, &va, path, ctx);
	}

	/* do fallback attribute handling */
	if (error == 0 && vp) {
		error = vnode_setattr_fallback(vp, &va, ctx);
	}

#if CONFIG_MACF
	if (error == 0 && vp) {
		error = vnode_label(vnode_mount(vp), dvp, vp, &nd.ni_cnd, VNODE_LABEL_CREATE, ctx);
	}
#endif

	if (error == 0) {
		int update_flags = 0;

		/* check if a new vnode was created, else try to get one */
		if (vp == NULL) {
			nd.ni_cnd.cn_nameiop = LOOKUP;
#if CONFIG_TRIGGERS
			nd.ni_op = OP_LOOKUP;
#endif
			/*
			 * Clear all flags except HASBUF to prevent 'cn_pnbuf' buffer to be
			 * reallocated again in namei().
			 */
			nd.ni_cnd.cn_flags &= HASBUF;
			error = nameiat(&nd, fd);
			if (error) {
				goto skipit;
			}
			vp = nd.ni_vp;
		}

#if 0 /* XXX - kauth_todo - is KAUTH_FILEOP_SYMLINK needed? */
		/* call out to allow 3rd party notification of rename.
		 * Ignore result of kauth_authorize_fileop call.
		 */
		if (kauth_authorize_fileop_has_listeners() &&
		    namei(&nd) == 0) {
			char *new_link_path = NULL;
			int len;

			/* build the path to the new link file */
			new_link_path = get_pathbuff();
			len = MAXPATHLEN;
			vn_getpath(dvp, new_link_path, &len);
			if ((len + 1 + nd.ni_cnd.cn_namelen + 1) < MAXPATHLEN) {
				new_link_path[len - 1] = '/';
				strlcpy(&new_link_path[len], nd.ni_cnd.cn_nameptr, MAXPATHLEN - len);
			}

			kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_SYMLINK,
			    (uintptr_t)path, (uintptr_t)new_link_path);
			if (new_link_path != NULL) {
				release_pathbuff(new_link_path);
			}
		}
#endif
		// Make sure the name & parent pointers are hooked up
		if (vp->v_name == NULL) {
			update_flags |= VNODE_UPDATE_NAME;
		}
		if (vp->v_parent == NULLVP) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		if (update_flags) {
			vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
		}

#if CONFIG_FSE
		add_fsevent(FSE_CREATE_FILE, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
#endif
	}

skipit:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	vnode_put(dvp);
out:
	/* Only free 'path' if we allocated it for the user-space copyin. */
	if (path && (path != (char *)path_data)) {
		zfree(ZV_NAMEI, path);
	}

	return error;
}
6399
6400 int
symlink(__unused proc_t p,struct symlink_args * uap,__unused int32_t * retval)6401 symlink(__unused proc_t p, struct symlink_args *uap, __unused int32_t *retval)
6402 {
6403 return symlinkat_internal(vfs_context_current(), uap->path, AT_FDCWD,
6404 uap->link, UIO_USERSPACE);
6405 }
6406
6407 int
symlinkat(__unused proc_t p,struct symlinkat_args * uap,__unused int32_t * retval)6408 symlinkat(__unused proc_t p, struct symlinkat_args *uap,
6409 __unused int32_t *retval)
6410 {
6411 return symlinkat_internal(vfs_context_current(), uap->path1, uap->fd,
6412 uap->path2, UIO_USERSPACE);
6413 }
6414
6415 /*
6416 * Delete a whiteout from the filesystem.
6417 * No longer supported.
6418 */
6419 int
undelete(__unused proc_t p,__unused struct undelete_args * uap,__unused int32_t * retval)6420 undelete(__unused proc_t p, __unused struct undelete_args *uap, __unused int32_t *retval)
6421 {
6422 return ENOTSUP;
6423 }
6424
6425 /*
6426 * Delete a name from the filesystem.
6427 */
6428 /* ARGSUSED */
6429 static int
unlinkat_internal(vfs_context_t ctx,int fd,vnode_t start_dvp,user_addr_t path_arg,enum uio_seg segflg,int unlink_flags)6430 unlinkat_internal(vfs_context_t ctx, int fd, vnode_t start_dvp,
6431 user_addr_t path_arg, enum uio_seg segflg, int unlink_flags)
6432 {
6433 struct {
6434 struct nameidata nd;
6435 #if CONFIG_FSE
6436 struct vnode_attr va;
6437 fse_info finfo;
6438 #endif
6439 } *__unlink_data;
6440 struct nameidata *ndp;
6441 vnode_t vp, dvp;
6442 int error;
6443 struct componentname *cnp;
6444 char *path = NULL;
6445 char *no_firmlink_path = NULL;
6446 int len_path = 0;
6447 int len_no_firmlink_path = 0;
6448 int flags;
6449 int need_event;
6450 int has_listeners;
6451 int truncated_path;
6452 int truncated_no_firmlink_path;
6453 int batched;
6454 struct vnode_attr *vap;
6455 vnode_t locked_vp = NULLVP;
6456 int do_retry;
6457 int retry_count = 0;
6458 int cn_flags;
6459 int namei_flags = 0;
6460
6461 cn_flags = LOCKPARENT;
6462 if (!(unlink_flags & VNODE_REMOVE_NO_AUDIT_PATH)) {
6463 cn_flags |= AUDITVNPATH1;
6464 }
6465 if (unlink_flags & VNODE_REMOVE_NOFOLLOW_ANY) {
6466 namei_flags |= NAMEI_NOFOLLOW_ANY;
6467 unlink_flags &= ~VNODE_REMOVE_NOFOLLOW_ANY;
6468 }
6469 if (unlink_flags & VNODE_REMOVE_RESOLVE_BENEATH) {
6470 namei_flags |= NAMEI_RESOLVE_BENEATH;
6471 unlink_flags &= ~VNODE_REMOVE_RESOLVE_BENEATH;
6472 }
6473 if (unlink_flags & VNODE_REMOVE_UNIQUE) {
6474 namei_flags |= NAMEI_UNIQUE;
6475 unlink_flags &= ~VNODE_REMOVE_UNIQUE;
6476 }
6477
6478 /* If a starting dvp is passed, it trumps any fd passed. */
6479 if (start_dvp) {
6480 cn_flags |= USEDVP;
6481 }
6482
6483 #if NAMEDRSRCFORK
6484 /* unlink or delete is allowed on rsrc forks and named streams */
6485 cn_flags |= CN_ALLOWRSRCFORK;
6486 #endif
6487
6488 __unlink_data = kalloc_type(typeof(*__unlink_data), Z_WAITOK);
6489 ndp = &__unlink_data->nd;
6490 #if CONFIG_FSE
6491 fse_info *finfop = &__unlink_data->finfo;
6492 #endif
6493
6494 retry:
6495 do_retry = 0;
6496 flags = 0;
6497 need_event = 0;
6498 has_listeners = 0;
6499 truncated_path = 0;
6500 truncated_no_firmlink_path = 0;
6501 vap = NULL;
6502
6503 NDINIT(ndp, DELETE, OP_UNLINK, cn_flags, segflg, path_arg, ctx);
6504
6505 ndp->ni_dvp = start_dvp;
6506 ndp->ni_flag |= NAMEI_COMPOUNDREMOVE | namei_flags;
6507 cnp = &ndp->ni_cnd;
6508
6509 continue_lookup:
6510 error = nameiat(ndp, fd);
6511 if (error) {
6512 goto early_out;
6513 }
6514
6515 dvp = ndp->ni_dvp;
6516 vp = ndp->ni_vp;
6517
6518 /* With Carbon delete semantics, busy files cannot be deleted */
6519 if (unlink_flags & VNODE_REMOVE_NODELETEBUSY) {
6520 flags |= VNODE_REMOVE_NODELETEBUSY;
6521 }
6522
6523 /* Skip any potential upcalls if told to. */
6524 if (unlink_flags & VNODE_REMOVE_SKIP_NAMESPACE_EVENT) {
6525 flags |= VNODE_REMOVE_SKIP_NAMESPACE_EVENT;
6526 }
6527
6528 /* Update speculative telemetry with system discarded use state */
6529 if (unlink_flags & VNODE_REMOVE_SYSTEM_DISCARDED) {
6530 flags |= VNODE_REMOVE_SYSTEM_DISCARDED;
6531 }
6532
6533 if (vp) {
6534 batched = vnode_compound_remove_available(vp);
6535 /*
6536 * The root of a mounted filesystem cannot be deleted.
6537 */
6538 if ((vp->v_flag & VROOT) || (dvp->v_mount != vp->v_mount)) {
6539 error = EBUSY;
6540 goto out;
6541 }
6542
6543 #if DEVELOPMENT || DEBUG
6544 /*
6545 * XXX VSWAP: Check for entitlements or special flag here
6546 * so we can restrict access appropriately.
6547 */
6548 #else /* DEVELOPMENT || DEBUG */
6549
6550 if (vnode_isswap(vp) && (ctx != vfs_context_kernel())) {
6551 error = EPERM;
6552 goto out;
6553 }
6554 #endif /* DEVELOPMENT || DEBUG */
6555
6556 if (!batched) {
6557 vnode_link_lock(vp);
6558 locked_vp = vp;
6559 error = vn_authorize_unlink(dvp, vp, cnp, ctx, NULL);
6560 if (error) {
6561 if (error == ENOENT) {
6562 if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
6563 do_retry = 1;
6564 retry_count++;
6565 }
6566 }
6567 vnode_link_unlock(vp);
6568 locked_vp = NULLVP;
6569 goto out;
6570 }
6571 }
6572 } else {
6573 batched = 1;
6574
6575 if (!vnode_compound_remove_available(dvp)) {
6576 panic("No vp, but no compound remove?");
6577 }
6578 }
6579
6580 #if CONFIG_FSE
6581 need_event = need_fsevent(FSE_DELETE, dvp);
6582 if (need_event) {
6583 if (!batched) {
6584 if ((vp->v_flag & VISHARDLINK) == 0) {
6585 /* XXX need to get these data in batched VNOP */
6586 get_fse_info(vp, finfop, ctx);
6587 }
6588 } else {
6589 error =
6590 vfs_get_notify_attributes(&__unlink_data->va);
6591 if (error) {
6592 goto out;
6593 }
6594
6595 vap = &__unlink_data->va;
6596 }
6597 }
6598 #endif
6599 has_listeners = kauth_authorize_fileop_has_listeners();
6600 if (need_event || has_listeners) {
6601 if (path == NULL) {
6602 GET_PATH(path);
6603 }
6604 len_path = safe_getpath(dvp, ndp->ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path);
6605 if (no_firmlink_path == NULL) {
6606 GET_PATH(no_firmlink_path);
6607 }
6608 len_no_firmlink_path = safe_getpath_no_firmlink(dvp, ndp->ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);
6609 }
6610
6611 #if NAMEDRSRCFORK
6612 if (ndp->ni_cnd.cn_flags & CN_WANTSRSRCFORK) {
6613 error = vnode_removenamedstream(dvp, vp, XATTR_RESOURCEFORK_NAME, 0, ctx);
6614 } else
6615 #endif
6616 {
6617 #if CONFIG_FILE_LEASES
6618 vnode_breakdirlease(dvp, false, O_WRONLY);
6619 #endif
6620
6621 error = vn_remove(dvp, &ndp->ni_vp, ndp, flags, vap, ctx);
6622 vp = ndp->ni_vp;
6623 if (error == EKEEPLOOKING) {
6624 if (!batched) {
6625 panic("EKEEPLOOKING, but not a filesystem that supports compound VNOPs?");
6626 }
6627
6628 if ((ndp->ni_flag & NAMEI_CONTLOOKUP) == 0) {
6629 panic("EKEEPLOOKING, but continue flag not set?");
6630 }
6631
6632 if (vnode_isdir(vp)) {
6633 error = EISDIR;
6634 goto out;
6635 }
6636 goto continue_lookup;
6637 } else if (error == ENOENT && batched) {
6638 if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
6639 /*
6640 * For compound VNOPs, the authorization callback may
6641 * return ENOENT in case of racing hardlink lookups
6642 * hitting the name cache, redrive the lookup.
6643 */
6644 do_retry = 1;
6645 retry_count += 1;
6646 goto out;
6647 }
6648 }
6649 }
6650
6651 /*
6652 * Call out to allow 3rd party notification of delete.
6653 * Ignore result of kauth_authorize_fileop call.
6654 */
6655 if (!error) {
6656 if (has_listeners) {
6657 kauth_authorize_fileop(vfs_context_ucred(ctx),
6658 KAUTH_FILEOP_DELETE,
6659 (uintptr_t)vp,
6660 (uintptr_t)path);
6661 }
6662
6663 if (vp->v_flag & VISHARDLINK) {
6664 //
6665 // if a hardlink gets deleted we want to blow away the
6666 // v_parent link because the path that got us to this
6667 // instance of the link is no longer valid. this will
6668 // force the next call to get the path to ask the file
6669 // system instead of just following the v_parent link.
6670 //
6671 vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
6672 }
6673
6674 #if CONFIG_FSE
6675 if (need_event) {
6676 if (vp->v_flag & VISHARDLINK) {
6677 get_fse_info(vp, finfop, ctx);
6678 } else if (vap) {
6679 vnode_get_fse_info_from_vap(vp, finfop, vap);
6680 }
6681 if (truncated_path) {
6682 finfop->mode |= FSE_TRUNCATED_PATH;
6683 }
6684 add_fsevent(FSE_DELETE, ctx,
6685 FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
6686 FSE_ARG_FINFO, finfop,
6687 FSE_ARG_DONE);
6688 }
6689 #endif
6690
6691 #if CONFIG_MACF
6692 mac_vnode_notify_unlink(ctx, dvp, vp, cnp);
6693 #endif
6694 }
6695
6696 out:
6697 if (locked_vp) {
6698 assert(locked_vp == vp);
6699 vnode_link_unlock(locked_vp);
6700 locked_vp = NULLVP;
6701 }
6702
6703 if (path != NULL) {
6704 RELEASE_PATH(path);
6705 path = NULL;
6706 }
6707
6708 if (no_firmlink_path != NULL) {
6709 RELEASE_PATH(no_firmlink_path);
6710 no_firmlink_path = NULL;
6711 }
6712 #if NAMEDRSRCFORK
6713 /* recycle the deleted rsrc fork vnode to force a reclaim, which
6714 * will cause its shadow file to go away if necessary.
6715 */
6716 if (vp && (vnode_isnamedstream(vp)) &&
6717 (vp->v_parent != NULLVP) &&
6718 vnode_isshadow(vp)) {
6719 vnode_recycle(vp);
6720 }
6721 #endif
6722 /*
6723 * nameidone has to happen before we vnode_put(dvp)
6724 * since it may need to release the fs_nodelock on the dvp
6725 */
6726 nameidone(ndp);
6727 vnode_put(dvp);
6728 if (vp) {
6729 vnode_put(vp);
6730 }
6731
6732 if (do_retry) {
6733 goto retry;
6734 }
6735
6736 early_out:
6737 kfree_type(typeof(*__unlink_data), __unlink_data);
6738 return error;
6739 }
6740
6741 int
unlink1(vfs_context_t ctx,vnode_t start_dvp,user_addr_t path_arg,enum uio_seg segflg,int unlink_flags)6742 unlink1(vfs_context_t ctx, vnode_t start_dvp, user_addr_t path_arg,
6743 enum uio_seg segflg, int unlink_flags)
6744 {
6745 return unlinkat_internal(ctx, AT_FDCWD, start_dvp, path_arg, segflg,
6746 unlink_flags);
6747 }
6748
6749 /*
6750 * Delete a name from the filesystem using Carbon semantics.
6751 */
6752 int
delete(__unused proc_t p,struct delete_args * uap,__unused int32_t * retval)6753 delete(__unused proc_t p, struct delete_args *uap, __unused int32_t *retval)
6754 {
6755 return unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
6756 uap->path, UIO_USERSPACE, VNODE_REMOVE_NODELETEBUSY);
6757 }
6758
6759 /*
6760 * Delete a name from the filesystem using POSIX semantics.
6761 */
6762 int
unlink(__unused proc_t p,struct unlink_args * uap,__unused int32_t * retval)6763 unlink(__unused proc_t p, struct unlink_args *uap, __unused int32_t *retval)
6764 {
6765 return unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
6766 uap->path, UIO_USERSPACE, 0);
6767 }
6768
6769 int
unlinkat(__unused proc_t p,struct unlinkat_args * uap,__unused int32_t * retval)6770 unlinkat(__unused proc_t p, struct unlinkat_args *uap, __unused int32_t *retval)
6771 {
6772 int unlink_flags = 0;
6773
6774 if (uap->flag & ~(AT_REMOVEDIR | AT_REMOVEDIR_DATALESS | AT_SYMLINK_NOFOLLOW_ANY | AT_SYSTEM_DISCARDED | AT_RESOLVE_BENEATH | AT_NODELETEBUSY | AT_UNIQUE)) {
6775 return EINVAL;
6776 }
6777
6778 if (uap->flag & AT_SYMLINK_NOFOLLOW_ANY) {
6779 unlink_flags |= VNODE_REMOVE_NOFOLLOW_ANY;
6780 }
6781 if (uap->flag & AT_RESOLVE_BENEATH) {
6782 unlink_flags |= VNODE_REMOVE_RESOLVE_BENEATH;
6783 }
6784 if (uap->flag & AT_SYSTEM_DISCARDED) {
6785 unlink_flags |= VNODE_REMOVE_SYSTEM_DISCARDED;
6786 }
6787 if (uap->flag & AT_NODELETEBUSY) {
6788 unlink_flags |= VNODE_REMOVE_NODELETEBUSY;
6789 }
6790 if (uap->flag & AT_UNIQUE) {
6791 unlink_flags |= VNODE_REMOVE_UNIQUE;
6792 }
6793
6794 if (uap->flag & (AT_REMOVEDIR | AT_REMOVEDIR_DATALESS)) {
6795 if (uap->flag & AT_REMOVEDIR_DATALESS) {
6796 unlink_flags |= VNODE_REMOVE_DATALESS_DIR;
6797 }
6798 return rmdirat_internal(vfs_context_current(), uap->fd,
6799 uap->path, UIO_USERSPACE, unlink_flags);
6800 } else {
6801 return unlinkat_internal(vfs_context_current(), uap->fd,
6802 NULLVP, uap->path, UIO_USERSPACE, unlink_flags);
6803 }
6804 }
6805
/*
 * Reposition read/write file offset.
 *
 * Implements lseek(2): computes the new offset from uap->offset and
 * uap->whence, validates it, stores it in fp->fp_glob->fg_offset, and
 * returns the resulting offset in *retval.
 */
int
lseek(proc_t p, struct lseek_args *uap, off_t *retval)
{
	struct fileproc *fp;
	vnode_t vp;
	struct vfs_context *ctx;
	off_t offset = uap->offset, file_size;
	int error;

	/* Resolve the fd to its fileproc and backing vnode. */
	if ((error = fp_getfvp(p, uap->fd, &fp, &vp))) {
		/* fd does not refer to a vnode (e.g. a socket): POSIX wants ESPIPE */
		if (error == ENOTSUP) {
			return ESPIPE;
		}
		return error;
	}
	if (
		// rdar://3837316: Seeking a pipe is disallowed by POSIX.
		vnode_isfifo(vp)
		// rdar://120750171: Seeking a TTY is undefined and should be denied.
		|| vnode_istty(vp)
		) {
		file_drop(uap->fd);
		return ESPIPE;
	}


	ctx = vfs_context_current();
#if CONFIG_MACF
	/*
	 * lseek(fd, 0, SEEK_CUR) is a pure query of the current offset;
	 * every other combination may change it, so pick the matching
	 * MAC check.
	 */
	if (uap->whence == L_INCR && uap->offset == 0) {
		error = mac_file_check_get_offset(vfs_context_ucred(ctx),
		    fp->fp_glob);
	} else {
		error = mac_file_check_change_offset(vfs_context_ucred(ctx),
		    fp->fp_glob);
	}
	if (error) {
		file_drop(uap->fd);
		return error;
	}
#endif
	/* Take an iocount on the vnode for the duration of the operation. */
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	switch (uap->whence) {
	case L_INCR:
		/* Relative to the current file offset. */
		offset += fp->fp_glob->fg_offset;
		break;
	case L_XTND:
		/* Relative to end-of-file; needs the current size. */
		if ((error = vnode_size(vp, &file_size, ctx)) != 0) {
			break;
		}
		offset += file_size;
		break;
	case L_SET:
		break;
	case SEEK_HOLE:
		/* Filesystem-specific hole/data probing via ioctl. */
		error = VNOP_IOCTL(vp, FSIOC_FIOSEEKHOLE, (caddr_t)&offset, 0, ctx);
		break;
	case SEEK_DATA:
		error = VNOP_IOCTL(vp, FSIOC_FIOSEEKDATA, (caddr_t)&offset, 0, ctx);
		break;
	default:
		error = EINVAL;
	}
	if (error == 0) {
		if (uap->offset > 0 && offset < 0) {
			/* Incremented/relative move past max size */
			error = EOVERFLOW;
		} else {
			/*
			 * Allow negative offsets on character devices, per
			 * POSIX 1003.1-2001. Most likely for writing disk
			 * labels.
			 */
			if (offset < 0 && vp->v_type != VCHR) {
				/* Decremented/relative move before start */
				error = EINVAL;
			} else {
				/* Success */
				fp->fp_glob->fg_offset = offset;
				*retval = fp->fp_glob->fg_offset;
			}
		}
	}

	/*
	 * An lseek can affect whether data is "available to read." Use
	 * hint of NOTE_NONE so no EVFILT_VNODE events fire
	 */
	post_event_if_success(vp, error, NOTE_NONE);
	(void)vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
6905
6906
6907 /*
6908 * Check access permissions.
6909 *
6910 * Returns: 0 Success
6911 * vnode_authorize:???
6912 */
6913 static int
access1(vnode_t vp,vnode_t dvp,int uflags,vfs_context_t ctx)6914 access1(vnode_t vp, vnode_t dvp, int uflags, vfs_context_t ctx)
6915 {
6916 kauth_action_t action;
6917 int error;
6918
6919 /*
6920 * If just the regular access bits, convert them to something
6921 * that vnode_authorize will understand.
6922 */
6923 if (!(uflags & _ACCESS_EXTENDED_MASK)) {
6924 action = 0;
6925 if (uflags & R_OK) {
6926 action |= KAUTH_VNODE_READ_DATA; /* aka KAUTH_VNODE_LIST_DIRECTORY */
6927 }
6928 if (uflags & W_OK) {
6929 if (vnode_isdir(vp)) {
6930 action |= KAUTH_VNODE_ADD_FILE |
6931 KAUTH_VNODE_ADD_SUBDIRECTORY;
6932 /* might want delete rights here too */
6933 } else {
6934 action |= KAUTH_VNODE_WRITE_DATA;
6935 }
6936 }
6937 if (uflags & X_OK) {
6938 if (vnode_isdir(vp)) {
6939 action |= KAUTH_VNODE_SEARCH;
6940 } else {
6941 action |= KAUTH_VNODE_EXECUTE;
6942 }
6943 }
6944 } else {
6945 /* take advantage of definition of uflags */
6946 action = uflags >> 8;
6947 }
6948
6949 #if CONFIG_MACF
6950 error = mac_vnode_check_access(ctx, vp, uflags);
6951 if (error) {
6952 return error;
6953 }
6954 #endif /* MAC */
6955
6956 /* action == 0 means only check for existence */
6957 if (action != 0) {
6958 error = vnode_authorize(vp, dvp, action | KAUTH_VNODE_ACCESS, ctx);
6959 } else {
6960 error = 0;
6961 }
6962
6963 return error;
6964 }
6965
6966
6967
/*
 * access_extended: Check access permissions in bulk.
 *
 * Description:	uap->entries		Pointer to an array of accessx
 *					descriptor structs, plus one or
 *					more NULL terminated strings (see
 *					"Notes" section below).
 *		uap->size		Size of the area pointed to by
 *					uap->entries.
 *		uap->results		Pointer to the results array.
 *
 * Returns:	0			Success
 *		ENOMEM			Insufficient memory
 *		EINVAL			Invalid arguments
 *		namei:EFAULT		Bad address
 *		namei:ENAMETOOLONG	Filename too long
 *		namei:ENOENT		No such file or directory
 *		namei:ELOOP		Too many levels of symbolic links
 *		namei:EBADF		Bad file descriptor
 *		namei:ENOTDIR		Not a directory
 *		namei:???
 *		access1:
 *
 * Implicit returns:
 *		uap->results		Array contents modified
 *
 * Notes:	The uap->entries are structured as an arbitrary length array
 *		of accessx descriptors, followed by one or more NULL terminated
 *		strings
 *
 *			struct accessx_descriptor[0]
 *			...
 *			struct accessx_descriptor[n]
 *			char name_data[0];
 *
 *		We determine the entry count by walking the buffer containing
 *		the uap->entries argument descriptor. For each descriptor we
 *		see, the valid values for the offset ad_name_offset will be
 *		in the byte range:
 *
 *			[ uap->entries + sizeof(struct accessx_descriptor) ]
 *						to
 *			[ uap->entries + uap->size - 2 ]
 *
 *		since we must have at least one string, and the string must
 *		be at least one character plus the NULL terminator in length.
 *
 * XXX:		Need to support the check-as uid argument
 */
int
access_extended(__unused proc_t p, struct access_extended_args *uap, __unused int32_t *retval)
{
	struct accessx_descriptor *input = NULL;
	errno_t *result = NULL;
	errno_t error = 0;
	int wantdelete = 0;
	size_t desc_max, desc_actual = 0;
	unsigned int i, j;
	struct vfs_context context;
	struct nameidata nd;
	int niopts;
	vnode_t vp = NULL;
	vnode_t dvp = NULL;
	/* Requests at or below this size avoid a heap allocation. */
#define ACCESSX_MAX_DESCR_ON_STACK 10
	struct accessx_descriptor stack_input[ACCESSX_MAX_DESCR_ON_STACK];

	/* Initialized so the cleanup path can safely test it. */
	context.vc_ucred = NULL;

	/*
	 * Validate parameters; if valid, copy the descriptor array and string
	 * arguments into local memory. Before proceeding, the following
	 * conditions must have been met:
	 *
	 * o The total size is not permitted to exceed ACCESSX_MAX_TABLESIZE
	 * o There must be sufficient room in the request for at least one
	 *   descriptor and a one byte NUL terminated string.
	 * o The allocation of local storage must not fail.
	 */
	if (uap->size > ACCESSX_MAX_TABLESIZE) {
		return ENOMEM;
	}
	if (uap->size < (sizeof(struct accessx_descriptor) + 2)) {
		return EINVAL;
	}
	if (uap->size <= sizeof(stack_input)) {
		input = stack_input;
	} else {
		input = kalloc_data(uap->size, Z_WAITOK);
		if (input == NULL) {
			error = ENOMEM;
			goto out;
		}
	}
	error = copyin(uap->entries, input, uap->size);
	if (error) {
		goto out;
	}

	AUDIT_ARG(opaque, input, uap->size);

	/*
	 * Force NUL termination of the copyin buffer to avoid namei() running
	 * off the end. If the caller passes us bogus data, they may get a
	 * bogus result.
	 */
	((char *)input)[uap->size - 1] = 0;

	/*
	 * Access is defined as checking against the process' real identity,
	 * even if operations are checking the effective identity. This
	 * requires that we use a local vfs context.
	 */
	context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
	context.vc_thread = current_thread();

	/*
	 * Find out how many entries we have, so we can allocate the result
	 * array by walking the list and adjusting the count downward by the
	 * earliest string offset we see.
	 */
	desc_max = (uap->size - 2) / sizeof(struct accessx_descriptor);
	desc_actual = desc_max;
	for (i = 0; i < desc_actual; i++) {
		/*
		 * Take the offset to the name string for this entry and
		 * convert to an input array index, which would be one off
		 * the end of the array if this entry was the lowest-addressed
		 * name string.
		 */
		j = input[i].ad_name_offset / sizeof(struct accessx_descriptor);

		/*
		 * An offset greater than the max allowable offset is an error.
		 * It is also an error for any valid entry to point
		 * to a location prior to the end of the current entry, if
		 * it's not a reference to the string of the previous entry.
		 */
		if (j > desc_max || (j != 0 && j <= i)) {
			error = EINVAL;
			goto out;
		}

		/* Also do not let ad_name_offset point to something beyond the size of the input */
		if (input[i].ad_name_offset >= uap->size) {
			error = EINVAL;
			goto out;
		}

		/*
		 * An offset of 0 means use the previous descriptor's offset;
		 * this is used to chain multiple requests for the same file
		 * to avoid multiple lookups.
		 */
		if (j == 0) {
			/* This is not valid for the first entry */
			if (i == 0) {
				error = EINVAL;
				goto out;
			}
			continue;
		}

		/*
		 * If the offset of the string for this descriptor is before
		 * what we believe is the current actual last descriptor,
		 * then we need to adjust our estimate downward; this permits
		 * the string table following the last descriptor to be out
		 * of order relative to the descriptor list.
		 */
		if (j < desc_actual) {
			desc_actual = j;
		}
	}

	/*
	 * We limit the actual number of descriptors we are willing to process
	 * to a hard maximum of ACCESSX_MAX_DESCRIPTORS. If the number being
	 * requested does not exceed this limit,
	 */
	if (desc_actual > ACCESSX_MAX_DESCRIPTORS) {
		error = ENOMEM;
		goto out;
	}
	result = kalloc_data(desc_actual * sizeof(errno_t), Z_WAITOK | Z_ZERO);
	if (result == NULL) {
		error = ENOMEM;
		goto out;
	}

	/*
	 * Do the work by iterating over the descriptor entries we know to
	 * at least appear to contain valid data.
	 */
	error = 0;
	for (i = 0; i < desc_actual; i++) {
		/*
		 * If the ad_name_offset is 0, then we use the previous
		 * results to make the check; otherwise, we are looking up
		 * a new file name.
		 */
		if (input[i].ad_name_offset != 0) {
			/* discard old vnodes */
			if (vp) {
				vnode_put(vp);
				vp = NULL;
			}
			if (dvp) {
				vnode_put(dvp);
				dvp = NULL;
			}

			/*
			 * Scan forward in the descriptor list to see if we
			 * need the parent vnode. We will need it if we are
			 * deleting, since we must have rights to remove
			 * entries in the parent directory, as well as the
			 * rights to delete the object itself.
			 */
			wantdelete = input[i].ad_flags & _DELETE_OK;
			for (j = i + 1; (j < desc_actual) && (input[j].ad_name_offset == 0); j++) {
				if (input[j].ad_flags & _DELETE_OK) {
					wantdelete = 1;
				}
			}

			niopts = FOLLOW | AUDITVNPATH1;

			/* need parent for vnode_authorize for deletion test */
			if (wantdelete) {
				niopts |= WANTPARENT;
			}

			/* do the lookup */
			NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, UIO_SYSSPACE,
			    CAST_USER_ADDR_T(((const char *)input) + input[i].ad_name_offset),
			    &context);
			error = namei(&nd);
			if (!error) {
				vp = nd.ni_vp;
				if (wantdelete) {
					dvp = nd.ni_dvp;
				}
			}
			nameidone(&nd);
		}

		/*
		 * Handle lookup errors.
		 */
		switch (error) {
		case ENOENT:
		case EACCES:
		case EPERM:
		case ENOTDIR:
			/* Per-entry failure: record it and keep going. */
			result[i] = error;
			break;
		case 0:
			/* run this access check */
			result[i] = access1(vp, dvp, input[i].ad_flags, &context);
			break;
		default:
			/* fatal lookup error */

			goto out;
		}
	}

	AUDIT_ARG(data, result, sizeof(errno_t), desc_actual);

	/* copy out results */
	error = copyout(result, uap->results, desc_actual * sizeof(errno_t));

out:
	if (input && input != stack_input) {
		kfree_data(input, uap->size);
	}
	if (result) {
		kfree_data(result, desc_actual * sizeof(errno_t));
	}
	if (vp) {
		vnode_put(vp);
	}
	if (dvp) {
		vnode_put(dvp);
	}
	if (IS_VALID_CRED(context.vc_ucred)) {
		kauth_cred_unref(&context.vc_ucred);
	}
	return error;
}
7258
7259
/*
 * Common implementation for access(2)/faccessat(2).
 *
 * Returns:	0			Success
 *		namei:EFAULT		Bad address
 *		namei:ENAMETOOLONG	Filename too long
 *		namei:ENOENT		No such file or directory
 *		namei:ELOOP		Too many levels of symbolic links
 *		namei:EBADF		Bad file descriptor
 *		namei:ENOTDIR		Not a directory
 *		namei:???
 *		access1:
 */
static int
faccessat_internal(vfs_context_t ctx, int fd, user_addr_t path, int amode,
    int flag, enum uio_seg segflg)
{
	int error;
	struct nameidata nd;
	int niopts;
	struct vfs_context context;
#if NAMEDRSRCFORK
	int is_namedstream = 0;
#endif

	/*
	 * Unless the AT_EACCESS option is used, Access is defined as checking
	 * against the process' real identity, even if operations are checking
	 * the effective identity. So we need to tweak the credential
	 * in the context for that case.
	 */
	if (!(flag & AT_EACCESS)) {
		/* Takes a reference; released at "out" below. */
		context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
	} else {
		/* Borrowed from the caller's context; no extra reference. */
		context.vc_ucred = ctx->vc_ucred;
	}
	context.vc_thread = ctx->vc_thread;


	niopts = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY) ? NOFOLLOW : FOLLOW) | AUDITVNPATH1;
	/* need parent for vnode_authorize for deletion test */
	if (amode & _DELETE_OK) {
		niopts |= WANTPARENT;
	}
	NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, segflg,
	    path, &context);
	if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}
	if (flag & AT_RESOLVE_BENEATH) {
		nd.ni_flag |= NAMEI_RESOLVE_BENEATH;
	}
	if (flag & AT_UNIQUE) {
		nd.ni_flag |= NAMEI_UNIQUE;
	}

#if NAMEDRSRCFORK
	/* access(F_OK) calls are allowed for resource forks. */
	if (amode == F_OK) {
		nd.ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
	}
#endif
	error = nameiat(&nd, fd);
	if (error) {
		goto out;
	}

#if NAMEDRSRCFORK
	/* Grab reference on the shadow stream file vnode to
	 * force an inactive on release which will mark it
	 * for recycle.
	 */
	if (vnode_isnamedstream(nd.ni_vp) &&
	    (nd.ni_vp->v_parent != NULLVP) &&
	    vnode_isshadow(nd.ni_vp)) {
		is_namedstream = 1;
		vnode_ref(nd.ni_vp);
	}
#endif

	error = access1(nd.ni_vp, nd.ni_dvp, amode, &context);

#if NAMEDRSRCFORK
	if (is_namedstream) {
		vnode_rele(nd.ni_vp);
	}
#endif

	vnode_put(nd.ni_vp);
	/* The parent iocount is only held when WANTPARENT was requested. */
	if (amode & _DELETE_OK) {
		vnode_put(nd.ni_dvp);
	}
	nameidone(&nd);

out:
	if (!(flag & AT_EACCESS)) {
		kauth_cred_unref(&context.vc_ucred);
	}
	return error;
}
7358
7359 int
access(__unused proc_t p,struct access_args * uap,__unused int32_t * retval)7360 access(__unused proc_t p, struct access_args *uap, __unused int32_t *retval)
7361 {
7362 return faccessat_internal(vfs_context_current(), AT_FDCWD,
7363 uap->path, uap->flags, 0, UIO_USERSPACE);
7364 }
7365
7366 int
faccessat(__unused proc_t p,struct faccessat_args * uap,__unused int32_t * retval)7367 faccessat(__unused proc_t p, struct faccessat_args *uap,
7368 __unused int32_t *retval)
7369 {
7370 if (uap->flag & ~(AT_EACCESS | AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY | AT_RESOLVE_BENEATH | AT_UNIQUE)) {
7371 return EINVAL;
7372 }
7373
7374 return faccessat_internal(vfs_context_current(), uap->fd,
7375 uap->path, uap->amode, uap->flag, UIO_USERSPACE);
7376 }
7377
/*
 * Common implementation of the stat(2)/lstat(2)/fstatat(2) family.
 *
 * Looks up the target (or uses the fd directly with AT_FDONLY), collects
 * stat or stat64 data, munges it to the caller's ABI, and copies it out,
 * optionally together with the extended security (ACL) blob.
 *
 * Returns:	0			Success
 *		EFAULT
 *		copyout:EFAULT
 *		namei:???
 *		vn_stat:???
 */
static int
fstatat_internal(vfs_context_t ctx, user_addr_t path, user_addr_t ub,
    user_addr_t xsecurity, user_addr_t xsecurity_size, int isstat64,
    enum uio_seg segflg, int fd, int flag)
{
	struct nameidata *ndp = NULL;
	int follow;
	/* Kernel-native stat data, prior to ABI munging. */
	union {
		struct stat sb;
		struct stat64 sb64;
	} source = {};
	/* User-ABI shaped copies, one per (word size, stat version) pair. */
	union {
		struct user64_stat user64_sb;
		struct user32_stat user32_sb;
		struct user64_stat64 user64_sb64;
		struct user32_stat64 user32_sb64;
	} dest = {};
	caddr_t sbp;
	int error, my_size;
	kauth_filesec_t fsec = KAUTH_FILESEC_NONE;
	size_t xsecurity_bufsize;
	void * statptr;
	struct fileproc *fp = NULL;
	int needsrealdev = 0;

	follow = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) ? NOFOLLOW : FOLLOW;
	/* nameidata is large; keep it off the kernel stack. */
	ndp = kalloc_type(struct nameidata, Z_WAITOK);
	NDINIT(ndp, LOOKUP, OP_GETATTR, follow | AUDITVNPATH1,
	    segflg, path, ctx);
	if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
		ndp->ni_flag |= NAMEI_NOFOLLOW_ANY;
	}
	if (flag & AT_RESOLVE_BENEATH) {
		ndp->ni_flag |= NAMEI_RESOLVE_BENEATH;
	}
	if (flag & AT_UNIQUE) {
		ndp->ni_flag |= NAMEI_UNIQUE;
	}

#if NAMEDRSRCFORK
	int is_namedstream = 0;
	/* stat calls are allowed for resource forks. */
	ndp->ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
#endif

	if (flag & AT_FDONLY) {
		/* fstat(2) path: use the fd's vnode, no name lookup. */
		vnode_t fvp;

		error = fp_getfvp(vfs_context_proc(ctx), fd, &fp, &fvp);
		if (error) {
			goto out;
		}
		if ((error = vnode_getwithref(fvp))) {
			file_drop(fd);
			goto out;
		}
		ndp->ni_vp = fvp;
	} else {
		error = nameiat(ndp, fd);
		if (error) {
			goto out;
		}
	}

	statptr = (void *)&source;

#if NAMEDRSRCFORK
	/* Grab reference on the shadow stream file vnode to
	 * force an inactive on release which will mark it
	 * for recycle.
	 */
	if (vnode_isnamedstream(ndp->ni_vp) &&
	    (ndp->ni_vp->v_parent != NULLVP) &&
	    vnode_isshadow(ndp->ni_vp)) {
		is_namedstream = 1;
		vnode_ref(ndp->ni_vp);
	}
#endif

	needsrealdev = flag & AT_REALDEV ? 1 : 0;
	if (fp && (xsecurity == USER_ADDR_NULL)) {
		/*
		 * If the caller has the file open, and is not
		 * requesting extended security information, we are
		 * going to let them get the basic stat information.
		 */
		error = vn_stat_noauth(ndp->ni_vp, statptr, NULL, isstat64, needsrealdev, ctx,
		    fp->fp_glob->fg_cred);
	} else {
		error = vn_stat(ndp->ni_vp, statptr, (xsecurity != USER_ADDR_NULL ? &fsec : NULL),
		    isstat64, needsrealdev, ctx);
	}

#if NAMEDRSRCFORK
	if (is_namedstream) {
		vnode_rele(ndp->ni_vp);
	}
#endif
	vnode_put(ndp->ni_vp);
	nameidone(ndp);

	if (fp) {
		file_drop(fd);
		fp = NULL;
	}

	if (error) {
		goto out;
	}
	/* Zap spare fields */
	if (isstat64 != 0) {
		source.sb64.st_lspare = 0;
		source.sb64.st_qspare[0] = 0LL;
		source.sb64.st_qspare[1] = 0LL;
		if (vfs_context_is64bit(ctx)) {
			munge_user64_stat64(&source.sb64, &dest.user64_sb64);
			my_size = sizeof(dest.user64_sb64);
			sbp = (caddr_t)&dest.user64_sb64;
		} else {
			munge_user32_stat64(&source.sb64, &dest.user32_sb64);
			my_size = sizeof(dest.user32_sb64);
			sbp = (caddr_t)&dest.user32_sb64;
		}
		/*
		 * Check if we raced (post lookup) against the last unlink of a file.
		 *
		 * NOTE(review): this fixup runs after the munge above, so it
		 * modifies `source` only and never reaches the already-munged
		 * `dest` that is copied out below — verify whether this
		 * ordering is intentional.
		 */
		if ((source.sb64.st_nlink == 0) && S_ISREG(source.sb64.st_mode)) {
			source.sb64.st_nlink = 1;
		}
	} else {
		source.sb.st_lspare = 0;
		source.sb.st_qspare[0] = 0LL;
		source.sb.st_qspare[1] = 0LL;
		if (vfs_context_is64bit(ctx)) {
			munge_user64_stat(&source.sb, &dest.user64_sb);
			my_size = sizeof(dest.user64_sb);
			sbp = (caddr_t)&dest.user64_sb;
		} else {
			munge_user32_stat(&source.sb, &dest.user32_sb);
			my_size = sizeof(dest.user32_sb);
			sbp = (caddr_t)&dest.user32_sb;
		}

		/*
		 * Check if we raced (post lookup) against the last unlink of a file.
		 *
		 * NOTE(review): same post-munge ordering as the stat64 case
		 * above — see note there.
		 */
		if ((source.sb.st_nlink == 0) && S_ISREG(source.sb.st_mode)) {
			source.sb.st_nlink = 1;
		}
	}
	if ((error = copyout(sbp, ub, my_size)) != 0) {
		goto out;
	}

	/* caller wants extended security information? */
	if (xsecurity != USER_ADDR_NULL) {
		/* did we get any? */
		if (fsec == KAUTH_FILESEC_NONE) {
			if (susize(xsecurity_size, 0) != 0) {
				error = EFAULT;
				goto out;
			}
		} else {
			/* find the user buffer size */
			xsecurity_bufsize = fusize(xsecurity_size);

			/* copy out the actual data size */
			if (susize(xsecurity_size, KAUTH_FILESEC_COPYSIZE(fsec)) != 0) {
				error = EFAULT;
				goto out;
			}

			/* if the caller supplied enough room, copy out to it */
			if (xsecurity_bufsize >= KAUTH_FILESEC_COPYSIZE(fsec)) {
				error = copyout(fsec, xsecurity, KAUTH_FILESEC_COPYSIZE(fsec));
			}
		}
	}
out:
	if (ndp) {
		kfree_type(struct nameidata, ndp);
	}
	if (fsec != KAUTH_FILESEC_NONE) {
		kauth_filesec_free(fsec);
	}
	return error;
}
7572
7573 /*
7574 * stat_extended: Get file status; with extended security (ACL).
7575 *
7576 * Parameters: p (ignored)
7577 * uap User argument descriptor (see below)
7578 * retval (ignored)
7579 *
7580 * Indirect: uap->path Path of file to get status from
7581 * uap->ub User buffer (holds file status info)
7582 * uap->xsecurity ACL to get (extended security)
7583 * uap->xsecurity_size Size of ACL
7584 *
7585 * Returns: 0 Success
7586 * !0 errno value
7587 *
7588 */
7589 int
stat_extended(__unused proc_t p,struct stat_extended_args * uap,__unused int32_t * retval)7590 stat_extended(__unused proc_t p, struct stat_extended_args *uap,
7591 __unused int32_t *retval)
7592 {
7593 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7594 uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
7595 0);
7596 }
7597
7598 /*
7599 * Returns: 0 Success
7600 * fstatat_internal:??? [see fstatat_internal() in this file]
7601 */
7602 int
stat(__unused proc_t p,struct stat_args * uap,__unused int32_t * retval)7603 stat(__unused proc_t p, struct stat_args *uap, __unused int32_t *retval)
7604 {
7605 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7606 0, 0, 0, UIO_USERSPACE, AT_FDCWD, 0);
7607 }
7608
7609 int
stat64(__unused proc_t p,struct stat64_args * uap,__unused int32_t * retval)7610 stat64(__unused proc_t p, struct stat64_args *uap, __unused int32_t *retval)
7611 {
7612 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7613 0, 0, 1, UIO_USERSPACE, AT_FDCWD, 0);
7614 }
7615
7616 /*
7617 * stat64_extended: Get file status; can handle large inode numbers; with extended security (ACL).
7618 *
7619 * Parameters: p (ignored)
7620 * uap User argument descriptor (see below)
7621 * retval (ignored)
7622 *
7623 * Indirect: uap->path Path of file to get status from
7624 * uap->ub User buffer (holds file status info)
7625 * uap->xsecurity ACL to get (extended security)
7626 * uap->xsecurity_size Size of ACL
7627 *
7628 * Returns: 0 Success
7629 * !0 errno value
7630 *
7631 */
7632 int
stat64_extended(__unused proc_t p,struct stat64_extended_args * uap,__unused int32_t * retval)7633 stat64_extended(__unused proc_t p, struct stat64_extended_args *uap, __unused int32_t *retval)
7634 {
7635 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7636 uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
7637 0);
7638 }
7639
7640 /*
7641 * lstat_extended: Get file status; does not follow links; with extended security (ACL).
7642 *
7643 * Parameters: p (ignored)
7644 * uap User argument descriptor (see below)
7645 * retval (ignored)
7646 *
7647 * Indirect: uap->path Path of file to get status from
7648 * uap->ub User buffer (holds file status info)
7649 * uap->xsecurity ACL to get (extended security)
7650 * uap->xsecurity_size Size of ACL
7651 *
7652 * Returns: 0 Success
7653 * !0 errno value
7654 *
7655 */
7656 int
lstat_extended(__unused proc_t p,struct lstat_extended_args * uap,__unused int32_t * retval)7657 lstat_extended(__unused proc_t p, struct lstat_extended_args *uap, __unused int32_t *retval)
7658 {
7659 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7660 uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
7661 AT_SYMLINK_NOFOLLOW);
7662 }
7663
7664 /*
7665 * Get file status; this version does not follow links.
7666 */
7667 int
lstat(__unused proc_t p,struct lstat_args * uap,__unused int32_t * retval)7668 lstat(__unused proc_t p, struct lstat_args *uap, __unused int32_t *retval)
7669 {
7670 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7671 0, 0, 0, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW);
7672 }
7673
7674 int
lstat64(__unused proc_t p,struct lstat64_args * uap,__unused int32_t * retval)7675 lstat64(__unused proc_t p, struct lstat64_args *uap, __unused int32_t *retval)
7676 {
7677 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7678 0, 0, 1, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW);
7679 }
7680
7681 /*
7682 * lstat64_extended: Get file status; can handle large inode numbers; does not
7683 * follow links; with extended security (ACL).
7684 *
7685 * Parameters: p (ignored)
7686 * uap User argument descriptor (see below)
7687 * retval (ignored)
7688 *
7689 * Indirect: uap->path Path of file to get status from
7690 * uap->ub User buffer (holds file status info)
7691 * uap->xsecurity ACL to get (extended security)
7692 * uap->xsecurity_size Size of ACL
7693 *
7694 * Returns: 0 Success
7695 * !0 errno value
7696 *
7697 */
7698 int
lstat64_extended(__unused proc_t p,struct lstat64_extended_args * uap,__unused int32_t * retval)7699 lstat64_extended(__unused proc_t p, struct lstat64_extended_args *uap, __unused int32_t *retval)
7700 {
7701 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7702 uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
7703 AT_SYMLINK_NOFOLLOW);
7704 }
7705
7706 int
fstatat(__unused proc_t p,struct fstatat_args * uap,__unused int32_t * retval)7707 fstatat(__unused proc_t p, struct fstatat_args *uap, __unused int32_t *retval)
7708 {
7709 if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_REALDEV | AT_FDONLY | AT_SYMLINK_NOFOLLOW_ANY | AT_RESOLVE_BENEATH | AT_UNIQUE)) {
7710 return EINVAL;
7711 }
7712
7713 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7714 0, 0, 0, UIO_USERSPACE, uap->fd, uap->flag);
7715 }
7716
7717 int
fstatat64(__unused proc_t p,struct fstatat64_args * uap,__unused int32_t * retval)7718 fstatat64(__unused proc_t p, struct fstatat64_args *uap,
7719 __unused int32_t *retval)
7720 {
7721 if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_REALDEV | AT_FDONLY | AT_SYMLINK_NOFOLLOW_ANY | AT_RESOLVE_BENEATH | AT_UNIQUE)) {
7722 return EINVAL;
7723 }
7724
7725 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7726 0, 0, 1, UIO_USERSPACE, uap->fd, uap->flag);
7727 }
7728
7729 /*
7730 * Get configurable pathname variables.
7731 *
7732 * Returns: 0 Success
7733 * namei:???
7734 * vn_pathconf:???
7735 *
7736 * Notes: Global implementation constants are intended to be
7737 * implemented in this function directly; all other constants
7738 * are per-FS implementation, and therefore must be handled in
7739 * each respective FS, instead.
7740 *
7741 * XXX We implement some things globally right now that should actually be
7742 * XXX per-FS; we will need to deal with this at some point.
7743 */
7744 /* ARGSUSED */
7745 int
pathconf(__unused proc_t p,struct pathconf_args * uap,int32_t * retval)7746 pathconf(__unused proc_t p, struct pathconf_args *uap, int32_t *retval)
7747 {
7748 int error;
7749 struct nameidata nd;
7750 vfs_context_t ctx = vfs_context_current();
7751
7752 NDINIT(&nd, LOOKUP, OP_PATHCONF, FOLLOW | AUDITVNPATH1,
7753 UIO_USERSPACE, uap->path, ctx);
7754 error = namei(&nd);
7755 if (error) {
7756 return error;
7757 }
7758
7759 error = vn_pathconf(nd.ni_vp, uap->name, retval, ctx);
7760
7761 vnode_put(nd.ni_vp);
7762 nameidone(&nd);
7763 return error;
7764 }
7765
7766 /*
7767 * Return target name of a symbolic link.
7768 */
7769 /* ARGSUSED */
7770 static int
readlinkat_internal(vfs_context_t ctx,int fd,vnode_t lnk_vp,user_addr_t path,enum uio_seg seg,user_addr_t buf,size_t bufsize,enum uio_seg bufseg,int * retval)7771 readlinkat_internal(vfs_context_t ctx, int fd, vnode_t lnk_vp, user_addr_t path,
7772 enum uio_seg seg, user_addr_t buf, size_t bufsize, enum uio_seg bufseg,
7773 int *retval)
7774 {
7775 vnode_t vp;
7776 uio_t auio;
7777 int error;
7778 struct nameidata nd;
7779 UIO_STACKBUF(uio_buf, 1);
7780 bool put_vnode;
7781
7782 if (bufsize > INT32_MAX) {
7783 return EINVAL;
7784 }
7785
7786 if (lnk_vp) {
7787 vp = lnk_vp;
7788 put_vnode = false;
7789 } else {
7790 NDINIT(&nd, LOOKUP, OP_READLINK, NOFOLLOW | AUDITVNPATH1,
7791 seg, path, ctx);
7792
7793 error = nameiat(&nd, fd);
7794 if (error) {
7795 return error;
7796 }
7797 vp = nd.ni_vp;
7798 put_vnode = true;
7799 nameidone(&nd);
7800 }
7801
7802 auio = uio_createwithbuffer(1, 0, bufseg, UIO_READ,
7803 &uio_buf[0], sizeof(uio_buf));
7804 uio_addiov(auio, buf, bufsize);
7805 if (vp->v_type != VLNK) {
7806 error = EINVAL;
7807 } else {
7808 #if CONFIG_MACF
7809 error = mac_vnode_check_readlink(ctx, vp);
7810 #endif
7811 if (error == 0) {
7812 error = vnode_authorize(vp, NULL, KAUTH_VNODE_READ_DATA,
7813 ctx);
7814 }
7815 if (error == 0) {
7816 error = VNOP_READLINK(vp, auio, ctx);
7817 }
7818 }
7819
7820 if (put_vnode) {
7821 vnode_put(vp);
7822 }
7823
7824 *retval = (int)(bufsize - uio_resid(auio));
7825 return error;
7826 }
7827
7828 int
freadlink(proc_t p,struct freadlink_args * uap,int32_t * retval)7829 freadlink(proc_t p, struct freadlink_args *uap, int32_t *retval)
7830 {
7831 enum uio_seg procseg;
7832 vnode_t vp;
7833 int error;
7834
7835 procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
7836
7837 AUDIT_ARG(fd, uap->fd);
7838
7839 if ((error = file_vnode(uap->fd, &vp))) {
7840 return error;
7841 }
7842 if ((error = vnode_getwithref(vp))) {
7843 file_drop(uap->fd);
7844 return error;
7845 }
7846
7847 error = readlinkat_internal(vfs_context_current(), -1,
7848 vp, 0, procseg, CAST_USER_ADDR_T(uap->buf),
7849 uap->bufsize, procseg, retval);
7850
7851 vnode_put(vp);
7852 file_drop(uap->fd);
7853 return error;
7854 }
7855
7856 int
readlink(proc_t p,struct readlink_args * uap,int32_t * retval)7857 readlink(proc_t p, struct readlink_args *uap, int32_t *retval)
7858 {
7859 enum uio_seg procseg;
7860
7861 procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
7862 return readlinkat_internal(vfs_context_current(), AT_FDCWD, NULL,
7863 CAST_USER_ADDR_T(uap->path), procseg, CAST_USER_ADDR_T(uap->buf),
7864 uap->count, procseg, retval);
7865 }
7866
7867 int
readlinkat(proc_t p,struct readlinkat_args * uap,int32_t * retval)7868 readlinkat(proc_t p, struct readlinkat_args *uap, int32_t *retval)
7869 {
7870 enum uio_seg procseg;
7871
7872 procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
7873 return readlinkat_internal(vfs_context_current(), uap->fd, NULL,
7874 CAST_USER_ADDR_T(uap->path), procseg, uap->buf, uap->bufsize, procseg,
7875 retval);
7876 }
7877
7878 /*
7879 * Change file flags, the deep inner layer.
7880 */
static int
chflags0(vnode_t vp, struct vnode_attr *va,
    int (*setattr)(vnode_t, void *, vfs_context_t),
    void *arg, vfs_context_t ctx)
{
	kauth_action_t action = 0;
	int error;

#if CONFIG_MACF
	/* MAC policy gets first say on the proposed flag word. */
	error = mac_vnode_check_setflags(ctx, vp, va->va_flags);
	if (error) {
		goto out;
	}
#endif

	/* request authorisation, disregard immutability */
	if ((error = vnode_authattr(vp, va, &action, ctx)) != 0) {
		goto out;
	}
	/*
	 * Request that the auth layer disregard those file flags it's allowed to when
	 * authorizing this operation; we need to do this in order to be able to
	 * clear immutable flags.
	 */
	if (action && ((error = vnode_authorize(vp, NULL, action | KAUTH_VNODE_NOIMMUTABLE, ctx)) != 0)) {
		goto out;
	}
	/* Apply the change through the caller-supplied setter. */
	error = (*setattr)(vp, arg, ctx);

	if (error == 0) {
		/* Mirror the APPEND flag into the vnode's cached v_ext_flag bit. */
		if (va->va_flags & APPEND) {
			os_atomic_or(&vp->v_ext_flag, VE_APPENDONLY, relaxed);
		} else {
			os_atomic_andnot(&vp->v_ext_flag, VE_APPENDONLY, relaxed);
		}
#if CONFIG_MACF
		mac_vnode_notify_setflags(ctx, vp, va->va_flags);
#endif
	}

out:
	return error;
}
7924
7925 /*
7926 * Change file flags.
7927 *
7928 * NOTE: this will vnode_put() `vp'
7929 */
7930 static int
chflags1(vnode_t vp,int flags,vfs_context_t ctx)7931 chflags1(vnode_t vp, int flags, vfs_context_t ctx)
7932 {
7933 struct vnode_attr va;
7934 int error;
7935
7936 VATTR_INIT(&va);
7937 VATTR_SET(&va, va_flags, flags);
7938
7939 error = chflags0(vp, &va, (void *)vnode_setattr, &va, ctx);
7940 vnode_put(vp);
7941
7942 if ((error == 0) && !VATTR_IS_SUPPORTED(&va, va_flags)) {
7943 error = ENOTSUP;
7944 }
7945
7946 return error;
7947 }
7948
7949 /*
7950 * Change flags of a file given a path name.
7951 */
7952 /* ARGSUSED */
int
chflags(__unused proc_t p, struct chflags_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	vfs_context_t ctx = vfs_context_current();
	int error;
	struct nameidata nd;
	uint32_t wantparent = 0;

#if CONFIG_FILE_LEASES
	/* Also resolve the parent so its lease (if any) can be broken. */
	wantparent = WANTPARENT;
#endif

	AUDIT_ARG(fflags, uap->flags);
	NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1 | wantparent,
	    UIO_USERSPACE, uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;

#if CONFIG_FILE_LEASES
	/* Break any lease on the parent directory, then drop its iocount. */
	vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
	vnode_put(nd.ni_dvp);
#endif

	nameidone(&nd);

	/* we don't vnode_put() here because chflags1 does internally */
	error = chflags1(vp, uap->flags, ctx);

	return error;
}
7987
7988 /*
7989 * Change flags of a file given a file descriptor.
7990 */
7991 /* ARGSUSED */
7992 int
fchflags(__unused proc_t p,struct fchflags_args * uap,__unused int32_t * retval)7993 fchflags(__unused proc_t p, struct fchflags_args *uap, __unused int32_t *retval)
7994 {
7995 vnode_t vp;
7996 int error;
7997
7998 AUDIT_ARG(fd, uap->fd);
7999 AUDIT_ARG(fflags, uap->flags);
8000 if ((error = file_vnode(uap->fd, &vp))) {
8001 return error;
8002 }
8003
8004 if ((error = vnode_getwithref(vp))) {
8005 file_drop(uap->fd);
8006 return error;
8007 }
8008
8009 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
8010
8011 #if CONFIG_FILE_LEASES
8012 vnode_breakdirlease(vp, true, O_WRONLY);
8013 #endif
8014
8015 /* we don't vnode_put() here because chflags1 does internally */
8016 error = chflags1(vp, uap->flags, vfs_context_current());
8017
8018 file_drop(uap->fd);
8019 return error;
8020 }
8021
8022 /*
8023 * Change security information on a filesystem object.
8024 *
8025 * Returns: 0 Success
8026 * EPERM Operation not permitted
8027 * vnode_authattr:??? [anything vnode_authattr can return]
8028 * vnode_authorize:??? [anything vnode_authorize can return]
8029 * vnode_setattr:??? [anything vnode_setattr can return]
8030 *
8031 * Notes: If vnode_authattr or vnode_authorize return EACCES, it will be
8032 * translated to EPERM before being returned.
8033 */
static int
chmod_vnode(vfs_context_t ctx, vnode_t vp, struct vnode_attr *vap)
{
	kauth_action_t action;
	int error;

	AUDIT_ARG(mode, vap->va_mode);
	/* XXX audit new args */

#if NAMEDSTREAMS
	/* chmod calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		return EPERM;
	}
#endif

#if CONFIG_MACF
	/* MAC checks: each attribute class (mode/owner/ACL) gated separately. */
	if (VATTR_IS_ACTIVE(vap, va_mode) &&
	    (error = mac_vnode_check_setmode(ctx, vp, (mode_t)vap->va_mode)) != 0) {
		return error;
	}

	if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid)) {
		if ((error = mac_vnode_check_setowner(ctx, vp,
		    VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : -1,
		    VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : -1))) {
			return error;
		}
	}

	if (VATTR_IS_ACTIVE(vap, va_acl) &&
	    (error = mac_vnode_check_setacl(ctx, vp, vap->va_acl))) {
		return error;
	}
#endif

	/* make sure that the caller is allowed to set this security information */
	if (((error = vnode_authattr(vp, vap, &action, ctx)) != 0) ||
	    ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		/* Permission failures surface as EPERM here, not EACCES. */
		if (error == EACCES) {
			error = EPERM;
		}
		return error;
	}

	if ((error = vnode_setattr(vp, vap, ctx)) != 0) {
		return error;
	}

#if CONFIG_MACF
	/* Success: notify MAC policies of each attribute class that changed. */
	if (VATTR_IS_ACTIVE(vap, va_mode)) {
		mac_vnode_notify_setmode(ctx, vp, (mode_t)vap->va_mode);
	}

	if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid)) {
		mac_vnode_notify_setowner(ctx, vp,
		    VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : -1,
		    VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : -1);
	}

	if (VATTR_IS_ACTIVE(vap, va_acl)) {
		mac_vnode_notify_setacl(ctx, vp, vap->va_acl);
	}
#endif

	return error;
}
8101
8102
8103 /*
8104 * Change mode of a file given a path name.
8105 *
8106 * Returns: 0 Success
8107 * namei:??? [anything namei can return]
8108 * chmod_vnode:??? [anything chmod_vnode can return]
8109 */
static int
chmodat(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap,
    int fd, int flag, enum uio_seg segflg)
{
	struct nameidata nd;
	int follow, error;
	uint32_t wantparent = 0;

#if CONFIG_FILE_LEASES
	/* Also resolve the parent so a directory lease can be broken below. */
	wantparent = WANTPARENT;
#endif

	/* Any "no follow" flavor suppresses following the final symlink. */
	follow = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) ? NOFOLLOW : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1 | wantparent,
	    segflg, path, ctx);
	/* Translate remaining AT_* flags into their namei equivalents. */
	if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}
	if (flag & AT_RESOLVE_BENEATH) {
		nd.ni_flag |= NAMEI_RESOLVE_BENEATH;
	}
	if (flag & AT_UNIQUE) {
		nd.ni_flag |= NAMEI_UNIQUE;
	}
	if ((error = nameiat(&nd, fd))) {
		return error;
	}

#if CONFIG_FILE_LEASES
	/* Break any lease on the parent directory, then drop its iocount. */
	vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
	vnode_put(nd.ni_dvp);
#endif

	error = chmod_vnode(ctx, nd.ni_vp, vap);
	vnode_put(nd.ni_vp);
	nameidone(&nd);
	return error;
}
8148
static int
chmod_extended_init(struct vnode_attr *pva, kauth_filesec_t *pxsecdst, int mode, uid_t uid,
    gid_t gid, user_addr_t xsecurity)
{
	int error;

	VATTR_INIT(pva);

	/* mode == -1 means "leave the mode alone". */
	if (mode != -1) {
		VATTR_SET(pva, va_mode, mode & ALLPERMS);
	} else {
		pva->va_mode = 0;
	}

	if (uid != KAUTH_UID_NONE) {
		VATTR_SET(pva, va_uid, uid);
	}

	if (gid != KAUTH_GID_NONE) {
		VATTR_SET(pva, va_gid, gid);
	}

	*pxsecdst = NULL;
	switch (xsecurity) {
	case USER_ADDR_NULL:
		/* No ACL change requested. */
		break;

	case CAST_USER_ADDR_T((void *)1): /* _FILESEC_REMOVE_ACL */
		VATTR_SET(pva, va_acl, NULL);
		break;

	default:
		/* Copy in the user filesec; on success caller must free *pxsecdst. */
		if ((error = kauth_copyinfilesec(xsecurity, pxsecdst)) != 0) {
			return error;
		}

		VATTR_SET(pva, va_acl, &(*pxsecdst)->fsec_acl);
		pva->va_vaflags |= VA_FILESEC_ACL;
		KAUTH_DEBUG("CHMOD - setting ACL with %d entries", pva->va_acl->acl_entrycount);
		break;
	}

	return 0;
}
8193
8194 /*
8195 * chmod_extended: Change the mode of a file given a path name; with extended
8196 * argument list (including extended security (ACL)).
8197 *
8198 * Parameters: p Process requesting the open
8199 * uap User argument descriptor (see below)
8200 * retval (ignored)
8201 *
8202 * Indirect: uap->path Path to object (same as 'chmod')
8203 * uap->uid UID to set
8204 * uap->gid GID to set
8205 * uap->mode File mode to set (same as 'chmod')
8206 * uap->xsecurity ACL to set (or delete)
8207 *
8208 * Returns: 0 Success
8209 * !0 errno value
8210 *
8211 * Notes: The kauth_filesec_t in 'va', if any, is in host byte order.
8212 *
8213 * XXX: We should enummerate the possible errno values here, and where
8214 * in the code they originated.
8215 */
8216 int
chmod_extended(__unused proc_t p,struct chmod_extended_args * uap,__unused int32_t * retval)8217 chmod_extended(__unused proc_t p, struct chmod_extended_args *uap, __unused int32_t *retval)
8218 {
8219 int error;
8220 struct vnode_attr va;
8221 kauth_filesec_t xsecdst = NULL;
8222
8223 AUDIT_ARG(owner, uap->uid, uap->gid);
8224
8225 error = chmod_extended_init(&va, &xsecdst, uap->mode, uap->uid,
8226 uap->gid, uap->xsecurity);
8227
8228 if (error) {
8229 return error;
8230 }
8231
8232 error = chmodat(vfs_context_current(), uap->path, &va, AT_FDCWD, 0,
8233 UIO_USERSPACE);
8234
8235 if (xsecdst != NULL) {
8236 kauth_filesec_free(xsecdst);
8237 }
8238 return error;
8239 }
8240
8241 /*
8242 * Returns: 0 Success
8243 * chmodat:??? [anything chmodat can return]
8244 */
8245 static int
fchmodat_internal(vfs_context_t ctx,user_addr_t path,int mode,int fd,int flag,enum uio_seg segflg)8246 fchmodat_internal(vfs_context_t ctx, user_addr_t path, int mode, int fd,
8247 int flag, enum uio_seg segflg)
8248 {
8249 struct vnode_attr va;
8250
8251 VATTR_INIT(&va);
8252 VATTR_SET(&va, va_mode, mode & ALLPERMS);
8253
8254 return chmodat(ctx, path, &va, fd, flag, segflg);
8255 }
8256
8257 int
chmod(__unused proc_t p,struct chmod_args * uap,__unused int32_t * retval)8258 chmod(__unused proc_t p, struct chmod_args *uap, __unused int32_t *retval)
8259 {
8260 return fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
8261 AT_FDCWD, 0, UIO_USERSPACE);
8262 }
8263
8264 int
fchmodat(__unused proc_t p,struct fchmodat_args * uap,__unused int32_t * retval)8265 fchmodat(__unused proc_t p, struct fchmodat_args *uap, __unused int32_t *retval)
8266 {
8267 if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY | AT_RESOLVE_BENEATH | AT_UNIQUE)) {
8268 return EINVAL;
8269 }
8270
8271 return fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
8272 uap->fd, uap->flag, UIO_USERSPACE);
8273 }
8274
8275 /*
8276 * Change mode of a file given a file descriptor.
8277 */
8278 static int
fchmod1(__unused proc_t p,int fd,struct vnode_attr * vap)8279 fchmod1(__unused proc_t p, int fd, struct vnode_attr *vap)
8280 {
8281 vnode_t vp;
8282 int error;
8283
8284 AUDIT_ARG(fd, fd);
8285
8286 if ((error = file_vnode(fd, &vp)) != 0) {
8287 return error;
8288 }
8289 if ((error = vnode_getwithref(vp)) != 0) {
8290 file_drop(fd);
8291 return error;
8292 }
8293 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
8294
8295 #if CONFIG_FILE_LEASES
8296 vnode_breakdirlease(vp, true, O_WRONLY);
8297 #endif
8298
8299 error = chmod_vnode(vfs_context_current(), vp, vap);
8300 (void)vnode_put(vp);
8301 file_drop(fd);
8302
8303 return error;
8304 }
8305
8306 /*
8307 * fchmod_extended: Change mode of a file given a file descriptor; with
8308 * extended argument list (including extended security (ACL)).
8309 *
8310 * Parameters: p Process requesting to change file mode
8311 * uap User argument descriptor (see below)
8312 * retval (ignored)
8313 *
8314 * Indirect: uap->mode File mode to set (same as 'chmod')
8315 * uap->uid UID to set
8316 * uap->gid GID to set
8317 * uap->xsecurity ACL to set (or delete)
8318 * uap->fd File descriptor of file to change mode
8319 *
8320 * Returns: 0 Success
8321 * !0 errno value
8322 *
8323 */
8324 int
fchmod_extended(proc_t p,struct fchmod_extended_args * uap,__unused int32_t * retval)8325 fchmod_extended(proc_t p, struct fchmod_extended_args *uap, __unused int32_t *retval)
8326 {
8327 int error;
8328 struct vnode_attr va;
8329 kauth_filesec_t xsecdst = NULL;
8330
8331 AUDIT_ARG(owner, uap->uid, uap->gid);
8332
8333 error = chmod_extended_init(&va, &xsecdst, uap->mode, uap->uid,
8334 uap->gid, uap->xsecurity);
8335
8336 if (error) {
8337 return error;
8338 }
8339
8340 error = fchmod1(p, uap->fd, &va);
8341
8342 if (xsecdst != NULL) {
8343 kauth_filesec_free(xsecdst);
8344 }
8345 return error;
8346 }
8347
8348 int
fchmod(proc_t p,struct fchmod_args * uap,__unused int32_t * retval)8349 fchmod(proc_t p, struct fchmod_args *uap, __unused int32_t *retval)
8350 {
8351 struct vnode_attr va;
8352
8353 VATTR_INIT(&va);
8354 VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
8355
8356 return fchmod1(p, uap->fd, &va);
8357 }
8358
static int
vn_chown_internal(__unused vfs_context_t ctx, vnode_t vp, uid_t uid, gid_t gid)
{
	struct vnode_attr va;
	kauth_action_t action;
	int error;

	/* VNOVAL for uid/gid means "leave that field unchanged". */
	VATTR_INIT(&va);
	if (uid != (uid_t)VNOVAL) {
		VATTR_SET(&va, va_uid, uid);
	}
	if (gid != (gid_t)VNOVAL) {
		VATTR_SET(&va, va_gid, gid);
	}

#if NAMEDSTREAMS
	/* chown calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		error = EPERM;
		goto out;
	}
#endif

#if CONFIG_MACF
	error = mac_vnode_check_setowner(ctx, vp, uid, gid);
	if (error) {
		goto out;
	}
#endif

	/* preflight and authorize attribute changes */
	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
		goto out;
	}
	if (action && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		/*
		 * EACCES is only allowed from namei(); permissions failure should
		 * return EPERM, so we need to translate the error code.
		 */
		if (error == EACCES) {
			error = EPERM;
		}

		goto out;
	}

#if CONFIG_FILE_LEASES
	/* Break any lease on the parent directory before mutating attributes. */
	vnode_breakdirlease(vp, true, O_WRONLY);
#endif

	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	if (error == 0) {
		mac_vnode_notify_setowner(ctx, vp, uid, gid);
	}
#endif

out:
	return error;
}
8420
8421 /*
8422 * Set ownership given a path name.
8423 */
8424 /* ARGSUSED */
static int
fchownat_internal(vfs_context_t ctx, int fd, user_addr_t path, uid_t uid,
    gid_t gid, int flag, enum uio_seg segflg)
{
	vnode_t vp;
	int error;
	struct nameidata nd;
	int follow;

	AUDIT_ARG(owner, uid, gid);

	/* Any "no follow" flavor suppresses following the final symlink. */
	follow = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) ? NOFOLLOW : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1, segflg, path, ctx);
	/* Translate remaining AT_* flags into their namei equivalents. */
	if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}
	if (flag & AT_RESOLVE_BENEATH) {
		nd.ni_flag |= NAMEI_RESOLVE_BENEATH;
	}
	if (flag & AT_UNIQUE) {
		nd.ni_flag |= NAMEI_UNIQUE;
	}

	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}

	vp = nd.ni_vp;
	/* MAC/auth checks and the actual change happen in the helper. */
	error = vn_chown_internal(ctx, vp, uid, gid);

	nameidone(&nd);
	vnode_put(vp);
	return error;
}
8460
8461 int
chown(__unused proc_t p,struct chown_args * uap,__unused int32_t * retval)8462 chown(__unused proc_t p, struct chown_args *uap, __unused int32_t *retval)
8463 {
8464 return fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
8465 uap->uid, uap->gid, 0, UIO_USERSPACE);
8466 }
8467
8468 int
lchown(__unused proc_t p,struct lchown_args * uap,__unused int32_t * retval)8469 lchown(__unused proc_t p, struct lchown_args *uap, __unused int32_t *retval)
8470 {
8471 return fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
8472 uap->owner, uap->group, AT_SYMLINK_NOFOLLOW, UIO_USERSPACE);
8473 }
8474
8475 int
fchownat(__unused proc_t p,struct fchownat_args * uap,__unused int32_t * retval)8476 fchownat(__unused proc_t p, struct fchownat_args *uap, __unused int32_t *retval)
8477 {
8478 if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY | AT_RESOLVE_BENEATH | AT_UNIQUE)) {
8479 return EINVAL;
8480 }
8481
8482 return fchownat_internal(vfs_context_current(), uap->fd, uap->path,
8483 uap->uid, uap->gid, uap->flag, UIO_USERSPACE);
8484 }
8485
8486 /*
8487 * Set ownership given a file descriptor.
8488 */
8489 /* ARGSUSED */
8490 int
fchown(__unused proc_t p,struct fchown_args * uap,__unused int32_t * retval)8491 fchown(__unused proc_t p, struct fchown_args *uap, __unused int32_t *retval)
8492 {
8493 vfs_context_t ctx = vfs_context_current();
8494 vnode_t vp;
8495 int error;
8496
8497 AUDIT_ARG(owner, uap->uid, uap->gid);
8498 AUDIT_ARG(fd, uap->fd);
8499
8500 if ((error = file_vnode(uap->fd, &vp))) {
8501 return error;
8502 }
8503
8504 if ((error = vnode_getwithref(vp))) {
8505 file_drop(uap->fd);
8506 return error;
8507 }
8508 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
8509
8510 error = vn_chown_internal(ctx, vp, uap->uid, uap->gid);
8511
8512 (void)vnode_put(vp);
8513 file_drop(uap->fd);
8514 return error;
8515 }
8516
8517 static int
getutimes(user_addr_t usrtvp,struct timespec * tsp)8518 getutimes(user_addr_t usrtvp, struct timespec *tsp)
8519 {
8520 int error;
8521
8522 if (usrtvp == USER_ADDR_NULL) {
8523 struct timeval old_tv;
8524 /* XXX Y2038 bug because of microtime argument */
8525 microtime(&old_tv);
8526 TIMEVAL_TO_TIMESPEC(&old_tv, &tsp[0]);
8527 tsp[1] = tsp[0];
8528 } else {
8529 if (IS_64BIT_PROCESS(current_proc())) {
8530 struct user64_timeval tv[2];
8531 error = copyin(usrtvp, (void *)tv, sizeof(tv));
8532 if (error) {
8533 return error;
8534 }
8535 TIMEVAL64_TO_TIMESPEC(&tv[0], &tsp[0]);
8536 TIMEVAL64_TO_TIMESPEC(&tv[1], &tsp[1]);
8537 } else {
8538 struct user32_timeval tv[2];
8539 error = copyin(usrtvp, (void *)tv, sizeof(tv));
8540 if (error) {
8541 return error;
8542 }
8543 TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]);
8544 TIMEVAL_TO_TIMESPEC(&tv[1], &tsp[1]);
8545 }
8546 }
8547 return 0;
8548 }
8549
static int
setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts,
    int nullflag)
{
	int error;
	struct vnode_attr va;
	kauth_action_t action;

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	/* ts[0] is the new access time, ts[1] the new modification time. */
	VATTR_INIT(&va);
	VATTR_SET(&va, va_access_time, ts[0]);
	VATTR_SET(&va, va_modify_time, ts[1]);
	/* nullflag: caller passed no explicit times (e.g. utimes(path, NULL)). */
	if (nullflag) {
		va.va_vaflags |= VA_UTIMES_NULL;
	}

#if NAMEDSTREAMS
	/* utimes calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		error = EPERM;
		goto out;
	}
#endif

#if CONFIG_MACF
	error = mac_vnode_check_setutimes(ctx, vp, ts[0], ts[1]);
	if (error) {
		goto out;
	}
#endif
	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
		/* With explicit times, permission failure is EPERM, not EACCES. */
		if (!nullflag && error == EACCES) {
			error = EPERM;
		}
		goto out;
	}

	/* since we may not need to auth anything, check here */
	if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		if (!nullflag && error == EACCES) {
			error = EPERM;
		}
		goto out;
	}
	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	if (error == 0) {
		mac_vnode_notify_setutimes(ctx, vp, ts[0], ts[1]);
	}
#endif

out:
	return error;
}
8606
8607 /*
8608 * Set the access and modification times of a file.
8609 */
8610 /* ARGSUSED */
int
utimes(__unused proc_t p, struct utimes_args *uap, __unused int32_t *retval)
{
	struct timespec ts[2];
	user_addr_t usrtvp;
	int error;
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();
	uint32_t wantparent = 0;

#if CONFIG_FILE_LEASES
	/* Also resolve the parent so its lease (if any) can be broken below. */
	wantparent = WANTPARENT;
#endif

	/*
	 * AUDIT: Needed to change the order of operations to do the
	 * name lookup first because auditing wants the path.
	 */
	NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1 | wantparent,
	    UIO_USERSPACE, uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}

	/*
	 * Fetch the user-supplied time. If usrtvp is USER_ADDR_NULL, we fetch
	 * the current time instead.
	 */
	usrtvp = uap->tptr;
	if ((error = getutimes(usrtvp, ts)) != 0) {
		goto out;
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
#endif

	error = setutimes(ctx, nd.ni_vp, ts, usrtvp == USER_ADDR_NULL);

out:
	/* With leases configured, namei() also held the parent vnode. */
#if CONFIG_FILE_LEASES
	vnode_put(nd.ni_dvp);
#endif
	nameidone(&nd);
	vnode_put(nd.ni_vp);
	return error;
}
8659
8660 /*
8661 * Set the access and modification times of a file.
8662 */
8663 /* ARGSUSED */
8664 int
futimes(__unused proc_t p,struct futimes_args * uap,__unused int32_t * retval)8665 futimes(__unused proc_t p, struct futimes_args *uap, __unused int32_t *retval)
8666 {
8667 struct timespec ts[2];
8668 vnode_t vp;
8669 user_addr_t usrtvp;
8670 int error;
8671
8672 AUDIT_ARG(fd, uap->fd);
8673 usrtvp = uap->tptr;
8674 if ((error = getutimes(usrtvp, ts)) != 0) {
8675 return error;
8676 }
8677 if ((error = file_vnode(uap->fd, &vp)) != 0) {
8678 return error;
8679 }
8680 if ((error = vnode_getwithref(vp))) {
8681 file_drop(uap->fd);
8682 return error;
8683 }
8684
8685 #if CONFIG_FILE_LEASES
8686 vnode_breakdirlease(vp, true, O_WRONLY);
8687 #endif
8688
8689 error = setutimes(vfs_context_current(), vp, ts, usrtvp == 0);
8690
8691 vnode_put(vp);
8692 file_drop(uap->fd);
8693 return error;
8694 }
8695
8696 static int
truncate_validate_common(proc_t p,off_t length)8697 truncate_validate_common(proc_t p, off_t length)
8698 {
8699 rlim_t fsize_limit;
8700
8701 if (length < 0) {
8702 return EINVAL;
8703 }
8704
8705 fsize_limit = proc_limitgetcur(p, RLIMIT_FSIZE);
8706 if ((rlim_t)length > fsize_limit) {
8707 psignal(p, SIGXFSZ);
8708 return EFBIG;
8709 }
8710
8711 return 0;
8712 }
8713
static int
truncate_internal(vnode_t vp, off_t length, kauth_cred_t cred,
    vfs_context_t ctx, boolean_t need_auth)
{
	struct vnode_attr va;
	kauth_action_t action;
	int error;

	/* Truncation is expressed as a data-size attribute change. */
	VATTR_INIT(&va);
	VATTR_SET(&va, va_data_size, length);

#if CONFIG_MACF
	error = mac_vnode_check_truncate(ctx, cred, vp);
	if (error) {
		return error;
	}
#endif

	/*
	 * If we reached here from `ftruncate` then we already did an effective
	 * `vnode_authorize` upon open. We honour the result from then.
	 */
	if (need_auth) {
		if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
			return error;
		}

		if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
			return error;
		}
	}

#if CONFIG_FILE_LEASES
	/* Check if there is a lease placed on the parent directory. */
	vnode_breakdirlease(vp, true, O_WRONLY);

	/* Now check if there is a lease placed on the file itself. */
	(void)vnode_breaklease(vp, O_WRONLY, ctx);
#endif

	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	if (error == 0) {
		mac_vnode_notify_truncate(ctx, cred, vp);
	}
#endif

	return error;
}
8764
8765 /*
8766 * Truncate a file given its path name.
8767 */
8768 /* ARGSUSED */
8769 int
truncate(proc_t p,struct truncate_args * uap,__unused int32_t * retval)8770 truncate(proc_t p, struct truncate_args *uap, __unused int32_t *retval)
8771 {
8772 vfs_context_t ctx = vfs_context_current();
8773 vnode_t vp;
8774 int error;
8775 struct nameidata nd;
8776
8777 if ((error = truncate_validate_common(p, uap->length))) {
8778 return error;
8779 }
8780
8781 NDINIT(&nd, LOOKUP, OP_TRUNCATE, FOLLOW | AUDITVNPATH1,
8782 UIO_USERSPACE, uap->path, ctx);
8783
8784 if ((error = namei(&nd))) {
8785 return error;
8786 }
8787
8788 vp = nd.ni_vp;
8789 nameidone(&nd);
8790
8791 error = truncate_internal(vp, uap->length, NOCRED, ctx, true);
8792 vnode_put(vp);
8793
8794 return error;
8795 }
8796
8797 /*
8798 * Truncate a file given a file descriptor.
8799 */
8800 /* ARGSUSED */
int
ftruncate(proc_t p, struct ftruncate_args *uap, int32_t *retval)
{
	vnode_t vp = NULLVP;
	struct fileproc *fp;
	bool need_vnode_put = false;
	int error;

	AUDIT_ARG(fd, uap->fd);

	/* Validate length and RLIMIT_FSIZE before touching the descriptor. */
	if ((error = truncate_validate_common(p, uap->length))) {
		return error;
	}

	if ((error = fp_lookup(p, uap->fd, &fp, 0))) {
		return error;
	}

	/* POSIX shared memory objects are truncated via their own path. */
	switch (FILEGLOB_DTYPE(fp->fp_glob)) {
	case DTYPE_PSXSHM:
		error = pshm_truncate(p, fp, uap->fd, uap->length, retval);
		goto out;
	case DTYPE_VNODE:
		break;
	default:
		error = EINVAL;
		goto out;
	}

	vp = (vnode_t)fp_get_data(fp);

	/* The descriptor must have been opened for writing. */
	if ((fp->fp_glob->fg_flag & FWRITE) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EINVAL;
		goto out;
	}

	if ((error = vnode_getwithref(vp)) != 0) {
		goto out;
	}
	need_vnode_put = true;

	/* Don't allow ftruncate if the file has append-only flag set. */
	if (vnode_isappendonly(vp)) {
		error = EPERM;
		goto out;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	/* need_auth == false: write access was authorized at open time. */
	error = truncate_internal(vp, uap->length, fp->fp_glob->fg_cred,
	    vfs_context_current(), false);
	if (!error) {
		/* Record that this open file has been written through. */
		fp->fp_glob->fg_flag |= FWASWRITTEN;
	}

out:
	if (vp && need_vnode_put) {
		vnode_put(vp);
	}

	file_drop(uap->fd);
	return error;
}
8865
8866
8867 /*
8868 * Sync an open file with synchronized I/O _file_ integrity completion
8869 */
8870 /* ARGSUSED */
8871 int
fsync(proc_t p,struct fsync_args * uap,__unused int32_t * retval)8872 fsync(proc_t p, struct fsync_args *uap, __unused int32_t *retval)
8873 {
8874 __pthread_testcancel(1);
8875 return fsync_common(p, uap, MNT_WAIT);
8876 }
8877
8878
8879 /*
8880 * Sync an open file with synchronized I/O _file_ integrity completion
8881 *
8882 * Notes: This is a legacy support function that does not test for
8883 * thread cancellation points.
8884 */
8885 /* ARGSUSED */
8886 int
fsync_nocancel(proc_t p,struct fsync_nocancel_args * uap,__unused int32_t * retval)8887 fsync_nocancel(proc_t p, struct fsync_nocancel_args *uap, __unused int32_t *retval)
8888 {
8889 return fsync_common(p, (struct fsync_args *)uap, MNT_WAIT);
8890 }
8891
8892
8893 /*
8894 * Sync an open file with synchronized I/O _data_ integrity completion
8895 */
8896 /* ARGSUSED */
8897 int
fdatasync(proc_t p,struct fdatasync_args * uap,__unused int32_t * retval)8898 fdatasync(proc_t p, struct fdatasync_args *uap, __unused int32_t *retval)
8899 {
8900 __pthread_testcancel(1);
8901 return fsync_common(p, (struct fsync_args *)uap, MNT_DWAIT);
8902 }
8903
8904
8905 /*
8906 * fsync_common
8907 *
8908 * Common fsync code to support both synchronized I/O file integrity completion
8909 * (normal fsync) and synchronized I/O data integrity completion (fdatasync).
8910 *
8911 * If 'flags' is MNT_DWAIT, the caller is requesting data integrity, which
8912 * will only guarantee that the file data contents are retrievable. If
 * 'flags' is MNT_WAIT, the caller is requesting file integrity, which also
8914 * includes additional metadata unnecessary for retrieving the file data
8915 * contents, such as atime, mtime, ctime, etc., also be committed to stable
8916 * storage.
8917 *
8918 * Parameters: p The process
8919 * uap->fd The descriptor to synchronize
8920 * flags The data integrity flags
8921 *
8922 * Returns: int Success
8923 * fp_getfvp:EBADF Bad file descriptor
8924 * fp_getfvp:ENOTSUP fd does not refer to a vnode
8925 * VNOP_FSYNC:??? unspecified
8926 *
8927 * Notes: We use struct fsync_args because it is a short name, and all
8928 * caller argument structures are otherwise identical.
8929 */
8930 static int
fsync_common(proc_t p,struct fsync_args * uap,int flags)8931 fsync_common(proc_t p, struct fsync_args *uap, int flags)
8932 {
8933 vnode_t vp;
8934 struct fileproc *fp;
8935 vfs_context_t ctx = vfs_context_current();
8936 int error;
8937
8938 AUDIT_ARG(fd, uap->fd);
8939
8940 if ((error = fp_getfvp(p, uap->fd, &fp, &vp))) {
8941 return error;
8942 }
8943 if ((error = vnode_getwithref(vp))) {
8944 file_drop(uap->fd);
8945 return error;
8946 }
8947
8948 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
8949
8950 error = VNOP_FSYNC(vp, flags, ctx);
8951
8952 #if NAMEDRSRCFORK
8953 /* Sync resource fork shadow file if necessary. */
8954 if ((error == 0) &&
8955 (vp->v_flag & VISNAMEDSTREAM) &&
8956 (vp->v_parent != NULLVP) &&
8957 vnode_isshadow(vp) &&
8958 (fp->fp_glob->fg_flag & FWASWRITTEN)) {
8959 (void) vnode_flushnamedstream(vp->v_parent, vp, ctx);
8960 }
8961 #endif
8962
8963 (void)vnode_put(vp);
8964 file_drop(uap->fd);
8965 return error;
8966 }
8967
8968 /*
8969 * Duplicate files. Source must be a file, target must be a file or
8970 * must not exist.
8971 *
8972 * XXX Copyfile authorisation checking is woefully inadequate, and will not
8973 * perform inheritance correctly.
8974 */
8975 /* ARGSUSED */
8976 int
copyfile(__unused proc_t p,struct copyfile_args * uap,__unused int32_t * retval)8977 copyfile(__unused proc_t p, struct copyfile_args *uap, __unused int32_t *retval)
8978 {
8979 vnode_t tvp, fvp, tdvp, sdvp;
8980 struct nameidata fromnd, tond;
8981 int error;
8982 vfs_context_t ctx = vfs_context_current();
8983
8984 /* Check that the flags are valid. */
8985 if (uap->flags & ~CPF_MASK) {
8986 return EINVAL;
8987 }
8988
8989 NDINIT(&fromnd, LOOKUP, OP_COPYFILE, AUDITVNPATH1,
8990 UIO_USERSPACE, uap->from, ctx);
8991 if ((error = namei(&fromnd))) {
8992 return error;
8993 }
8994 fvp = fromnd.ni_vp;
8995
8996 NDINIT(&tond, CREATE, OP_LINK,
8997 LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART | AUDITVNPATH2 | CN_NBMOUNTLOOK,
8998 UIO_USERSPACE, uap->to, ctx);
8999 if ((error = namei(&tond))) {
9000 goto out1;
9001 }
9002 tdvp = tond.ni_dvp;
9003 tvp = tond.ni_vp;
9004
9005 if (tvp != NULL) {
9006 if (!(uap->flags & CPF_OVERWRITE)) {
9007 error = EEXIST;
9008 goto out;
9009 }
9010 }
9011
9012 if (fvp->v_type == VDIR || (tvp && tvp->v_type == VDIR)) {
9013 error = EISDIR;
9014 goto out;
9015 }
9016
9017 if (fvp->v_type == VSOCK && fvp->v_tag != VT_FDESC) {
9018 error = EOPNOTSUPP;
9019 goto out;
9020 }
9021
9022 #if CONFIG_MACF
9023 if ((error = mac_vnode_check_copyfile(ctx, tdvp, tvp, fvp, &tond.ni_cnd, (mode_t)uap->mode, uap->flags)) != 0) {
9024 goto out;
9025 }
9026 #endif /* CONFIG_MACF */
9027
9028 if ((error = vnode_authorize(fvp, NULL, KAUTH_VNODE_READ_DATA, ctx)) != 0) {
9029 goto out;
9030 }
9031 if (tvp) {
9032 if ((error = vnode_authorize(tvp, tdvp, KAUTH_VNODE_DELETE, ctx)) != 0) {
9033 goto out;
9034 }
9035 }
9036 if ((error = vnode_authorize(tdvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
9037 goto out;
9038 }
9039
9040 if (fvp == tdvp) {
9041 error = EINVAL;
9042 }
9043 /*
9044 * If source is the same as the destination (that is the
9045 * same inode number) then there is nothing to do.
9046 * (fixed to have POSIX semantics - CSM 3/2/98)
9047 */
9048 if (fvp == tvp) {
9049 error = -1;
9050 }
9051
9052 #if CONFIG_FILE_LEASES
9053 vnode_breakdirlease(tdvp, false, O_WRONLY);
9054 #endif
9055
9056 if (!error) {
9057 error = VNOP_COPYFILE(fvp, tdvp, tvp, &tond.ni_cnd, uap->mode, uap->flags, ctx);
9058 }
9059 out:
9060 sdvp = tond.ni_startdir;
9061 /*
9062 * nameidone has to happen before we vnode_put(tdvp)
9063 * since it may need to release the fs_nodelock on the tdvp
9064 */
9065 nameidone(&tond);
9066
9067 if (tvp) {
9068 vnode_put(tvp);
9069 }
9070 vnode_put(tdvp);
9071 vnode_put(sdvp);
9072 out1:
9073 vnode_put(fvp);
9074
9075 nameidone(&fromnd);
9076
9077 if (error == -1) {
9078 return 0;
9079 }
9080 return error;
9081 }
9082
9083 #define CLONE_SNAPSHOT_FALLBACKS_ENABLED 1
9084
9085 /*
9086 * Helper function for doing clones. The caller is expected to provide an
9087 * iocounted source vnode and release it.
9088 */
9089 static int
clonefile_internal(vnode_t fvp,boolean_t data_read_authorised,int dst_dirfd,user_addr_t dst,uint32_t flags,vfs_context_t ctx)9090 clonefile_internal(vnode_t fvp, boolean_t data_read_authorised, int dst_dirfd,
9091 user_addr_t dst, uint32_t flags, vfs_context_t ctx)
9092 {
9093 vnode_t tvp, tdvp;
9094 struct nameidata *tondp = NULL;
9095 int error;
9096 int follow;
9097 boolean_t free_src_acl;
9098 boolean_t attr_cleanup;
9099 enum vtype v_type;
9100 kauth_action_t action;
9101 struct componentname *cnp;
9102 uint32_t defaulted = 0;
9103 struct {
9104 struct vnode_attr va[2];
9105 } *va2p = NULL;
9106 struct vnode_attr *vap = NULL;
9107 struct vnode_attr *nvap = NULL;
9108 uint32_t vnop_flags;
9109
9110 v_type = vnode_vtype(fvp);
9111 switch (v_type) {
9112 case VLNK:
9113 /* FALLTHRU */
9114 case VREG:
9115 action = KAUTH_VNODE_ADD_FILE;
9116 break;
9117 case VDIR:
9118 if (vnode_isvroot(fvp) || vnode_ismount(fvp) ||
9119 fvp->v_mountedhere) {
9120 return EINVAL;
9121 }
9122 action = KAUTH_VNODE_ADD_SUBDIRECTORY;
9123 break;
9124 default:
9125 return EINVAL;
9126 }
9127
9128 AUDIT_ARG(fd2, dst_dirfd);
9129 AUDIT_ARG(value32, flags);
9130
9131 tondp = kalloc_type(struct nameidata, Z_WAITOK | Z_ZERO | Z_NOFAIL);
9132 follow = (flags & CLONE_NOFOLLOW) ? NOFOLLOW : FOLLOW;
9133 NDINIT(tondp, CREATE, OP_LINK, follow | WANTPARENT | AUDITVNPATH2,
9134 UIO_USERSPACE, dst, ctx);
9135 if (flags & CLONE_NOFOLLOW_ANY) {
9136 tondp->ni_flag |= NAMEI_NOFOLLOW_ANY;
9137 }
9138 if (flags & CLONE_RESOLVE_BENEATH) {
9139 tondp->ni_flag |= NAMEI_RESOLVE_BENEATH;
9140 }
9141
9142 if ((error = nameiat(tondp, dst_dirfd))) {
9143 kfree_type(struct nameidata, tondp);
9144 return error;
9145 }
9146 cnp = &tondp->ni_cnd;
9147 tdvp = tondp->ni_dvp;
9148 tvp = tondp->ni_vp;
9149
9150 free_src_acl = FALSE;
9151 attr_cleanup = FALSE;
9152
9153 if (tvp != NULL) {
9154 error = EEXIST;
9155 goto out;
9156 }
9157
9158 if (vnode_mount(tdvp) != vnode_mount(fvp)) {
9159 error = EXDEV;
9160 goto out;
9161 }
9162
9163 #if CONFIG_MACF
9164 if ((error = mac_vnode_check_clone(ctx, tdvp, fvp, cnp))) {
9165 goto out;
9166 }
9167 #endif
9168 if ((error = vnode_authorize(tdvp, NULL, action, ctx))) {
9169 goto out;
9170 }
9171
9172 action = KAUTH_VNODE_GENERIC_READ_BITS;
9173 if (data_read_authorised) {
9174 action &= ~KAUTH_VNODE_READ_DATA;
9175 }
9176 if ((error = vnode_authorize(fvp, NULL, action, ctx))) {
9177 goto out;
9178 }
9179
9180 va2p = kalloc_type(typeof(*va2p), Z_WAITOK | Z_NOFAIL);
9181 vap = &va2p->va[0];
9182 nvap = &va2p->va[1];
9183
9184 /*
9185 * certain attributes may need to be changed from the source, we ask for
9186 * those here with the exception of source file's ACLs unless the CLONE_ACL
9187 * flag is specified. By default, the clone file will inherit the target
9188 * directory's ACLs unless the the CLONE_ACL flag is specified then it
9189 * will inherit the source file's ACLs instead.
9190 */
9191 VATTR_INIT(vap);
9192 VATTR_WANTED(vap, va_uid);
9193 VATTR_WANTED(vap, va_gid);
9194 VATTR_WANTED(vap, va_mode);
9195 VATTR_WANTED(vap, va_flags);
9196 if (flags & CLONE_ACL) {
9197 VATTR_WANTED(vap, va_acl);
9198 }
9199
9200 if ((error = vnode_getattr(fvp, vap, ctx)) != 0) {
9201 goto out;
9202 }
9203
9204 VATTR_INIT(nvap);
9205 VATTR_SET(nvap, va_type, v_type);
9206 if (VATTR_IS_SUPPORTED(vap, va_acl) && vap->va_acl != NULL) {
9207 VATTR_SET(nvap, va_acl, vap->va_acl);
9208 free_src_acl = TRUE;
9209 }
9210
9211 /* Handle ACL inheritance, initialize vap. */
9212 if (v_type == VLNK) {
9213 error = vnode_authattr_new(tdvp, nvap, 0, ctx);
9214 } else {
9215 error = vn_attribute_prepare(tdvp, nvap, &defaulted, ctx);
9216 if (error) {
9217 goto out;
9218 }
9219 attr_cleanup = TRUE;
9220 }
9221
9222 vnop_flags = VNODE_CLONEFILE_DEFAULT;
9223 /*
9224 * We've got initial values for all security parameters,
9225 * If we are superuser, then we can change owners to be the
9226 * same as the source. Both superuser and the owner have default
9227 * WRITE_SECURITY privileges so all other fields can be taken
9228 * from source as well.
9229 */
9230 if (!(flags & CLONE_NOOWNERCOPY) && vfs_context_issuser(ctx)) {
9231 if (VATTR_IS_SUPPORTED(vap, va_uid)) {
9232 VATTR_SET(nvap, va_uid, vap->va_uid);
9233 }
9234 if (VATTR_IS_SUPPORTED(vap, va_gid)) {
9235 VATTR_SET(nvap, va_gid, vap->va_gid);
9236 }
9237 } else {
9238 vnop_flags |= VNODE_CLONEFILE_NOOWNERCOPY;
9239 }
9240
9241 if (VATTR_IS_SUPPORTED(vap, va_mode)) {
9242 VATTR_SET(nvap, va_mode, vap->va_mode);
9243 }
9244 if (VATTR_IS_SUPPORTED(vap, va_flags)) {
9245 VATTR_SET(nvap, va_flags,
9246 ((vap->va_flags & ~(UF_DATAVAULT | SF_RESTRICTED)) | /* Turn off from source */
9247 (nvap->va_flags & (UF_DATAVAULT | SF_RESTRICTED))));
9248 }
9249
9250 #if CONFIG_FILE_LEASES
9251 vnode_breakdirlease(tdvp, false, O_WRONLY);
9252 #endif
9253
9254 error = VNOP_CLONEFILE(fvp, tdvp, &tvp, cnp, nvap, vnop_flags, ctx);
9255
9256 if (!error && tvp) {
9257 int update_flags = 0;
9258 #if CONFIG_FSE
9259 int fsevent;
9260 #endif /* CONFIG_FSE */
9261
9262 /*
9263 * If some of the requested attributes weren't handled by the
9264 * VNOP, use our fallback code.
9265 */
9266 if (!VATTR_ALL_SUPPORTED(nvap)) {
9267 (void)vnode_setattr_fallback(tvp, nvap, ctx);
9268 }
9269
9270 #if CONFIG_MACF
9271 (void)vnode_label(vnode_mount(tvp), tdvp, tvp, cnp,
9272 VNODE_LABEL_CREATE, ctx);
9273 #endif
9274
9275 // Make sure the name & parent pointers are hooked up
9276 if (tvp->v_name == NULL) {
9277 update_flags |= VNODE_UPDATE_NAME;
9278 }
9279 if (tvp->v_parent == NULLVP) {
9280 update_flags |= VNODE_UPDATE_PARENT;
9281 }
9282
9283 if (update_flags) {
9284 (void)vnode_update_identity(tvp, tdvp, cnp->cn_nameptr,
9285 cnp->cn_namelen, cnp->cn_hash, update_flags);
9286 }
9287
9288 #if CONFIG_FSE
9289 switch (vnode_vtype(tvp)) {
9290 case VLNK:
9291 /* FALLTHRU */
9292 case VREG:
9293 fsevent = FSE_CREATE_FILE;
9294 break;
9295 case VDIR:
9296 fsevent = FSE_CREATE_DIR;
9297 break;
9298 default:
9299 goto out;
9300 }
9301
9302 if (need_fsevent(fsevent, tvp)) {
9303 /*
9304 * The following is a sequence of three explicit events.
9305 * A pair of FSE_CLONE events representing the source and destination
9306 * followed by an FSE_CREATE_[FILE | DIR] for the destination.
9307 * fseventsd may coalesce the destination clone and create events
9308 * into a single event resulting in the following sequence for a client
9309 * FSE_CLONE (src)
9310 * FSE_CLONE | FSE_CREATE (dst)
9311 */
9312 add_fsevent(FSE_CLONE, ctx, FSE_ARG_VNODE, fvp, FSE_ARG_VNODE, tvp,
9313 FSE_ARG_DONE);
9314 add_fsevent(fsevent, ctx, FSE_ARG_VNODE, tvp,
9315 FSE_ARG_DONE);
9316 }
9317 #endif /* CONFIG_FSE */
9318 }
9319
9320 out:
9321 if (attr_cleanup) {
9322 vn_attribute_cleanup(nvap, defaulted);
9323 }
9324 if (free_src_acl && vap->va_acl) {
9325 kauth_acl_free(vap->va_acl);
9326 }
9327 if (va2p) {
9328 kfree_type(typeof(*va2p), va2p);
9329 }
9330 nameidone(tondp);
9331 kfree_type(struct nameidata, tondp);
9332 if (tvp) {
9333 vnode_put(tvp);
9334 }
9335 vnode_put(tdvp);
9336 return error;
9337 }
9338
9339 /*
9340 * clone files or directories, target must not exist.
9341 */
9342 /* ARGSUSED */
9343 int
clonefileat(__unused proc_t p,struct clonefileat_args * uap,__unused int32_t * retval)9344 clonefileat(__unused proc_t p, struct clonefileat_args *uap,
9345 __unused int32_t *retval)
9346 {
9347 vnode_t fvp;
9348 struct nameidata *ndp = NULL;
9349 int follow;
9350 int error;
9351 vfs_context_t ctx = vfs_context_current();
9352
9353 /* Check that the flags are valid. */
9354 if (uap->flags & ~(CLONE_NOFOLLOW | CLONE_NOOWNERCOPY | CLONE_ACL |
9355 CLONE_NOFOLLOW_ANY | CLONE_RESOLVE_BENEATH)) {
9356 return EINVAL;
9357 }
9358
9359 AUDIT_ARG(fd, uap->src_dirfd);
9360
9361 ndp = kalloc_type(struct nameidata, Z_WAITOK | Z_ZERO | Z_NOFAIL);
9362
9363 follow = (uap->flags & CLONE_NOFOLLOW) ? NOFOLLOW : FOLLOW;
9364 NDINIT(ndp, LOOKUP, OP_COPYFILE, follow | AUDITVNPATH1,
9365 UIO_USERSPACE, uap->src, ctx);
9366 if (uap->flags & CLONE_NOFOLLOW_ANY) {
9367 ndp->ni_flag |= NAMEI_NOFOLLOW_ANY;
9368 }
9369 if (uap->flags & CLONE_RESOLVE_BENEATH) {
9370 ndp->ni_flag |= NAMEI_RESOLVE_BENEATH;
9371 }
9372
9373 if ((error = nameiat(ndp, uap->src_dirfd))) {
9374 kfree_type(struct nameidata, ndp);
9375 return error;
9376 }
9377
9378 fvp = ndp->ni_vp;
9379 nameidone(ndp);
9380 kfree_type(struct nameidata, ndp);
9381
9382 error = clonefile_internal(fvp, FALSE, uap->dst_dirfd, uap->dst,
9383 uap->flags, ctx);
9384
9385 vnode_put(fvp);
9386 return error;
9387 }
9388
9389 int
fclonefileat(__unused proc_t p,struct fclonefileat_args * uap,__unused int32_t * retval)9390 fclonefileat(__unused proc_t p, struct fclonefileat_args *uap,
9391 __unused int32_t *retval)
9392 {
9393 vnode_t fvp;
9394 struct fileproc *fp;
9395 int error;
9396 vfs_context_t ctx = vfs_context_current();
9397
9398 /* Check that the flags are valid. */
9399 if (uap->flags & ~(CLONE_NOFOLLOW | CLONE_NOOWNERCOPY | CLONE_ACL |
9400 CLONE_NOFOLLOW_ANY | CLONE_RESOLVE_BENEATH)) {
9401 return EINVAL;
9402 }
9403
9404 AUDIT_ARG(fd, uap->src_fd);
9405 error = fp_getfvp(p, uap->src_fd, &fp, &fvp);
9406 if (error) {
9407 return error;
9408 }
9409
9410 if ((fp->fp_glob->fg_flag & FREAD) == 0) {
9411 AUDIT_ARG(vnpath_withref, fvp, ARG_VNODE1);
9412 error = EBADF;
9413 goto out;
9414 }
9415
9416 if ((error = vnode_getwithref(fvp))) {
9417 goto out;
9418 }
9419
9420 AUDIT_ARG(vnpath, fvp, ARG_VNODE1);
9421
9422 error = clonefile_internal(fvp, TRUE, uap->dst_dirfd, uap->dst,
9423 uap->flags, ctx);
9424
9425 vnode_put(fvp);
9426 out:
9427 file_drop(uap->src_fd);
9428 return error;
9429 }
9430
9431 static int
rename_submounts_callback(mount_t mp,void * arg)9432 rename_submounts_callback(mount_t mp, void *arg)
9433 {
9434 char *prefix = (char *)arg;
9435 int prefix_len = (int)strlen(prefix);
9436 int error = 0;
9437
9438 if (strncmp(mp->mnt_vfsstat.f_mntonname, prefix, prefix_len) != 0) {
9439 return 0;
9440 }
9441
9442 if (mp->mnt_vfsstat.f_mntonname[prefix_len] != '/') {
9443 return 0;
9444 }
9445
9446 if ((error = vfs_busy(mp, LK_NOWAIT))) {
9447 printf("vfs_busy failed with %d for %s\n", error, mp->mnt_vfsstat.f_mntonname);
9448 return -1;
9449 }
9450
9451 size_t pathlen = MAXPATHLEN;
9452 if ((error = vn_getpath_ext(mp->mnt_vnodecovered, NULL, mp->mnt_vfsstat.f_mntonname, &pathlen, VN_GETPATH_FSENTER))) {
9453 printf("vn_getpath_ext failed with %d for mnt_vnodecovered of %s\n", error, mp->mnt_vfsstat.f_mntonname);
9454 }
9455
9456 vfs_unbusy(mp);
9457
9458 return error;
9459 }
9460
9461 /*
9462 * Rename files. Source and destination must either both be directories,
9463 * or both not be directories. If target is a directory, it must be empty.
9464 */
9465 /* ARGSUSED */
9466 static int
renameat_internal(vfs_context_t ctx,int fromfd,user_addr_t from,int tofd,user_addr_t to,int segflg,u_int uflags)9467 renameat_internal(vfs_context_t ctx, int fromfd, user_addr_t from,
9468 int tofd, user_addr_t to, int segflg, u_int uflags)
9469 {
9470 vnode_t tvp, tdvp;
9471 vnode_t fvp, fdvp;
9472 vnode_t mnt_fvp;
9473 struct nameidata *fromnd, *tond;
9474 int error = 0;
9475 int do_retry;
9476 int retry_count;
9477 int mntrename;
9478 int dirrename;
9479 int need_event;
9480 int need_kpath2;
9481 int has_listeners;
9482 const char *oname = NULL;
9483 char *old_dirpath = NULL, *from_name = NULL, *to_name = NULL;
9484 char *from_name_no_firmlink = NULL, *to_name_no_firmlink = NULL;
9485 int from_len = 0, to_len = 0;
9486 int from_len_no_firmlink = 0, to_len_no_firmlink = 0;
9487 int holding_mntlock;
9488 int vn_authorize_skipped;
9489 mount_t locked_mp = NULL;
9490 vnode_t oparent = NULLVP;
9491 vnode_t locked_vp = NULLVP;
9492 #if CONFIG_FSE
9493 fse_info from_finfo = {}, to_finfo;
9494 #endif
9495 int from_truncated = 0, to_truncated = 0;
9496 int from_truncated_no_firmlink = 0, to_truncated_no_firmlink = 0;
9497 int batched = 0;
9498 struct vnode_attr *fvap, *tvap;
9499 int continuing = 0;
9500 vfs_rename_flags_t flags = uflags & VFS_RENAME_FLAGS_MASK;
9501 int32_t nofollow_any = 0;
9502 int32_t resolve_beneath = 0;
9503 /* carving out a chunk for structs that are too big to be on stack. */
9504 struct {
9505 struct nameidata from_node, to_node;
9506 struct vnode_attr fv_attr, tv_attr;
9507 } * __rename_data;
9508
9509 __rename_data = kalloc_type(typeof(*__rename_data), Z_WAITOK);
9510 fromnd = &__rename_data->from_node;
9511 tond = &__rename_data->to_node;
9512
9513 holding_mntlock = 0;
9514 do_retry = 0;
9515 retry_count = 0;
9516 retry:
9517 fvp = tvp = NULL;
9518 fdvp = tdvp = NULL;
9519 fvap = tvap = NULL;
9520 mnt_fvp = NULLVP;
9521 mntrename = dirrename = FALSE;
9522 vn_authorize_skipped = FALSE;
9523
9524 if (uflags & RENAME_NOFOLLOW_ANY) {
9525 nofollow_any = NAMEI_NOFOLLOW_ANY;
9526 }
9527 if (uflags & RENAME_RESOLVE_BENEATH) {
9528 resolve_beneath = NAMEI_RESOLVE_BENEATH;
9529 }
9530 NDINIT(fromnd, DELETE, OP_UNLINK, WANTPARENT | AUDITVNPATH1,
9531 segflg, from, ctx);
9532 fromnd->ni_flag = NAMEI_COMPOUNDRENAME | nofollow_any | resolve_beneath;
9533
9534 NDINIT(tond, RENAME, OP_RENAME, WANTPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK,
9535 segflg, to, ctx);
9536 tond->ni_flag = NAMEI_COMPOUNDRENAME | nofollow_any | resolve_beneath;
9537
9538 continue_lookup:
9539 if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
9540 if ((error = nameiat(fromnd, fromfd))) {
9541 goto out1;
9542 }
9543 fdvp = fromnd->ni_dvp;
9544 fvp = fromnd->ni_vp;
9545
9546 if (fvp && fvp->v_type == VDIR) {
9547 tond->ni_cnd.cn_flags |= WILLBEDIR;
9548 #if defined(XNU_TARGET_OS_OSX)
9549 dirrename = TRUE;
9550 #endif
9551 }
9552 }
9553
9554 if ((tond->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
9555 if ((error = nameiat(tond, tofd))) {
9556 /*
9557 * Translate error code for rename("dir1", "dir2/.").
9558 */
9559 if (error == EISDIR && fvp->v_type == VDIR) {
9560 error = EINVAL;
9561 }
9562 goto out1;
9563 }
9564 tdvp = tond->ni_dvp;
9565 tvp = tond->ni_vp;
9566 }
9567
9568 #if DEVELOPMENT || DEBUG
9569 /*
9570 * XXX VSWAP: Check for entitlements or special flag here
9571 * so we can restrict access appropriately.
9572 */
9573 #else /* DEVELOPMENT || DEBUG */
9574
9575 if (fromnd->ni_vp && vnode_isswap(fromnd->ni_vp) && (ctx != vfs_context_kernel())) {
9576 error = EPERM;
9577 goto out1;
9578 }
9579
9580 if (tond->ni_vp && vnode_isswap(tond->ni_vp) && (ctx != vfs_context_kernel())) {
9581 error = EPERM;
9582 goto out1;
9583 }
9584 #endif /* DEVELOPMENT || DEBUG */
9585
9586 if (!tvp && ISSET(flags, VFS_RENAME_SWAP)) {
9587 error = ENOENT;
9588 goto out1;
9589 }
9590
9591 if (tvp && ISSET(flags, VFS_RENAME_EXCL)) {
9592 int32_t pval = 0;
9593 int err = 0;
9594
9595 /*
9596 * We allow rename with VFS_RENAME_EXCL flag for an existing file which
9597 * has the same name as target iff the following conditions are met:
9598 * 1. the target file system is case insensitive
9599 * 2. source and target directories are the same
9600 * 3. source and target files are the same
9601 * 4. name only differs in case (determined by underlying filesystem)
9602 */
9603 if (fvp != tvp || fdvp != tdvp) {
9604 error = EEXIST;
9605 goto out1;
9606 }
9607
9608 /*
9609 * Assume that the target file system is case sensitive if
9610 * _PC_CASE_SENSITIVE selector isn't supported.
9611 */
9612 err = VNOP_PATHCONF(tvp, _PC_CASE_SENSITIVE, &pval, ctx);
9613 if (err != 0 || pval != 0) {
9614 error = EEXIST;
9615 goto out1;
9616 }
9617 }
9618
9619 batched = vnode_compound_rename_available(fdvp);
9620
9621 #if CONFIG_FSE
9622 need_event = need_fsevent(FSE_RENAME, fdvp);
9623 if (need_event) {
9624 if (fvp) {
9625 get_fse_info(fvp, &from_finfo, ctx);
9626 } else {
9627 error = vfs_get_notify_attributes(&__rename_data->fv_attr);
9628 if (error) {
9629 goto out1;
9630 }
9631
9632 fvap = &__rename_data->fv_attr;
9633 }
9634
9635 if (tvp) {
9636 get_fse_info(tvp, &to_finfo, ctx);
9637 } else if (batched) {
9638 error = vfs_get_notify_attributes(&__rename_data->tv_attr);
9639 if (error) {
9640 goto out1;
9641 }
9642
9643 tvap = &__rename_data->tv_attr;
9644 }
9645 }
9646 #else
9647 need_event = 0;
9648 #endif /* CONFIG_FSE */
9649
9650 has_listeners = kauth_authorize_fileop_has_listeners();
9651
9652 need_kpath2 = 0;
9653 #if CONFIG_AUDIT
9654 if (AUDIT_RECORD_EXISTS()) {
9655 need_kpath2 = 1;
9656 }
9657 #endif
9658
9659 if (need_event || has_listeners) {
9660 if (from_name == NULL) {
9661 GET_PATH(from_name);
9662 }
9663
9664 from_len = safe_getpath(fdvp, fromnd->ni_cnd.cn_nameptr, from_name, MAXPATHLEN, &from_truncated);
9665
9666 if (from_name_no_firmlink == NULL) {
9667 GET_PATH(from_name_no_firmlink);
9668 }
9669
9670 from_len_no_firmlink = safe_getpath_no_firmlink(fdvp, fromnd->ni_cnd.cn_nameptr, from_name_no_firmlink, MAXPATHLEN, &from_truncated_no_firmlink);
9671 }
9672
9673 if (need_event || need_kpath2 || has_listeners) {
9674 if (to_name == NULL) {
9675 GET_PATH(to_name);
9676 }
9677
9678 to_len = safe_getpath(tdvp, tond->ni_cnd.cn_nameptr, to_name, MAXPATHLEN, &to_truncated);
9679
9680 if (to_name_no_firmlink == NULL) {
9681 GET_PATH(to_name_no_firmlink);
9682 }
9683
9684 to_len_no_firmlink = safe_getpath_no_firmlink(tdvp, tond->ni_cnd.cn_nameptr, to_name_no_firmlink, MAXPATHLEN, &to_truncated_no_firmlink);
9685 if (to_name && need_kpath2) {
9686 AUDIT_ARG(kpath, to_name, ARG_KPATH2);
9687 }
9688 }
9689 if (!fvp) {
9690 /*
9691 * Claim: this check will never reject a valid rename.
9692 * For success, either fvp must be on the same mount as tdvp, or fvp must sit atop a vnode on the same mount as tdvp.
9693 * Suppose fdvp and tdvp are not on the same mount.
9694 * If fvp is on the same mount as tdvp, then fvp is not on the same mount as fdvp, so fvp is the root of its filesystem. If fvp is the root,
9695 * then you can't move it to within another dir on the same mountpoint.
9696 * If fvp sits atop a vnode on the same mount as fdvp, then that vnode must be part of the same mount as fdvp, which is a contradiction.
9697 *
9698 * If this check passes, then we are safe to pass these vnodes to the same FS.
9699 */
9700 if (fdvp->v_mount != tdvp->v_mount) {
9701 error = EXDEV;
9702 goto out1;
9703 }
9704 goto skipped_lookup;
9705 }
9706
9707 /*
9708 * If the source and destination are the same (i.e. they're
9709 * links to the same vnode) and the target file system is
9710 * case sensitive, then there is nothing to do.
9711 *
9712 * XXX Come back to this.
9713 */
9714 if (fvp == tvp) {
9715 int pathconf_val;
9716
9717 /*
9718 * Note: if _PC_CASE_SENSITIVE selector isn't supported,
9719 * then assume that this file system is case sensitive.
9720 */
9721 if (VNOP_PATHCONF(fvp, _PC_CASE_SENSITIVE, &pathconf_val, ctx) != 0 ||
9722 pathconf_val != 0) {
9723 vn_authorize_skipped = TRUE;
9724 goto out1;
9725 }
9726 }
9727
9728 /*
9729 * Allow the renaming of mount points.
9730 * - target must not exist
9731 * - target must reside in the same directory as source
9732 * - union mounts cannot be renamed
9733 * - the root fs, and tightly-linked system volumes, cannot be renamed
9734 *
9735 * XXX Handle this in VFS after a continued lookup (if we missed
9736 * in the cache to start off)
9737 *
9738 * N.B. If RENAME_SWAP is being used, then @tvp != NULL and so
9739 * we'll skip past here. The file system is responsible for
9740 * checking that @tvp is not a descendent of @fvp and vice versa
9741 * so it should always return EINVAL if either @tvp or @fvp is the
9742 * root of a volume.
9743 */
9744 if ((fvp->v_flag & VROOT) &&
9745 (fvp->v_type == VDIR) &&
9746 (tvp == NULL) &&
9747 (fvp->v_mountedhere == NULL) &&
9748 (fdvp == tdvp) &&
9749 ((fvp->v_mount->mnt_flag & (MNT_UNION | MNT_ROOTFS)) == 0) &&
9750 ((fvp->v_mount->mnt_kern_flag & MNTK_SYSTEM) == 0) &&
9751 (fvp->v_mount->mnt_vnodecovered != NULLVP)) {
9752 vnode_t coveredvp;
9753
9754 /* switch fvp to the covered vnode */
9755 coveredvp = fvp->v_mount->mnt_vnodecovered;
9756 if ((vnode_getwithref(coveredvp))) {
9757 error = ENOENT;
9758 goto out1;
9759 }
9760 /*
9761 * Save the 'fvp' as it is needed for vn_authorize_renamex_with_paths()
9762 * later.
9763 */
9764 mnt_fvp = fvp;
9765
9766 fvp = coveredvp;
9767 mntrename = TRUE;
9768 }
9769 /*
9770 * Check for cross-device rename.
9771 * For rename on mountpoint, we want to also check the source and its parent
9772 * belong to the same mountpoint.
9773 */
9774 if ((fvp->v_mount != tdvp->v_mount) ||
9775 (fvp->v_mount != fdvp->v_mount) ||
9776 (tvp && (fvp->v_mount != tvp->v_mount))) {
9777 error = EXDEV;
9778 goto out1;
9779 }
9780
9781 /*
9782 * If source is the same as the destination (that is the
9783 * same inode number) then there is nothing to do...
9784 * EXCEPT if the underlying file system supports case
9785 * insensitivity and is case preserving. In this case
9786 * the file system needs to handle the special case of
9787 * getting the same vnode as target (fvp) and source (tvp).
9788 *
9789 * Only file systems that support pathconf selectors _PC_CASE_SENSITIVE
9790 * and _PC_CASE_PRESERVING can have this exception, and they need to
9791 * handle the special case of getting the same vnode as target and
9792 * source. NOTE: Then the target is unlocked going into vnop_rename,
9793 * so not to cause locking problems. There is a single reference on tvp.
9794 *
9795 * NOTE - that fvp == tvp also occurs if they are hard linked and
9796 * that correct behaviour then is just to return success without doing
9797 * anything.
9798 *
9799 * XXX filesystem should take care of this itself, perhaps...
9800 */
9801 if (fvp == tvp && fdvp == tdvp) {
9802 if (fromnd->ni_cnd.cn_namelen == tond->ni_cnd.cn_namelen &&
9803 !bcmp(fromnd->ni_cnd.cn_nameptr, tond->ni_cnd.cn_nameptr,
9804 fromnd->ni_cnd.cn_namelen)) {
9805 vn_authorize_skipped = TRUE;
9806 goto out1;
9807 }
9808 }
9809
9810 if (holding_mntlock && fvp->v_mount != locked_mp) {
9811 /*
9812 * we're holding a reference and lock
9813 * on locked_mp, but it no longer matches
9814 * what we want to do... so drop our hold
9815 */
9816 mount_unlock_renames(locked_mp);
9817 mount_drop(locked_mp, 0);
9818 holding_mntlock = 0;
9819 }
9820 if (tdvp != fdvp && fvp->v_type == VDIR) {
9821 /*
9822 * serialize renames that re-shape
9823 * the tree... if holding_mntlock is
9824 * set, then we're ready to go...
9825 * otherwise we
9826 * first need to drop the iocounts
9827 * we picked up, second take the
9828 * lock to serialize the access,
9829 * then finally start the lookup
9830 * process over with the lock held
9831 */
9832 if (!holding_mntlock) {
9833 /*
9834 * need to grab a reference on
9835 * the mount point before we
9836 * drop all the iocounts... once
9837 * the iocounts are gone, the mount
9838 * could follow
9839 */
9840 locked_mp = fvp->v_mount;
9841 mount_ref(locked_mp, 0);
9842
9843 /*
9844 * nameidone has to happen before we vnode_put(tvp)
9845 * since it may need to release the fs_nodelock on the tvp
9846 */
9847 nameidone(tond);
9848
9849 if (tvp) {
9850 vnode_put(tvp);
9851 }
9852 vnode_put(tdvp);
9853
9854 /*
9855 * nameidone has to happen before we vnode_put(fdvp)
9856 * since it may need to release the fs_nodelock on the fvp
9857 */
9858 nameidone(fromnd);
9859
9860 vnode_put(fvp);
9861 vnode_put(fdvp);
9862
9863 if (mnt_fvp != NULLVP) {
9864 vnode_put(mnt_fvp);
9865 }
9866
9867 mount_lock_renames(locked_mp);
9868 holding_mntlock = 1;
9869
9870 goto retry;
9871 }
9872 } else {
9873 /*
9874 * when we dropped the iocounts to take
9875 * the lock, we allowed the identity of
9876 * the various vnodes to change... if they did,
9877 * we may no longer be dealing with a rename
9878 * that reshapes the tree... once we're holding
9879 * the iocounts, the vnodes can't change type
9880 * so we're free to drop the lock at this point
9881 * and continue on
9882 */
9883 if (holding_mntlock) {
9884 mount_unlock_renames(locked_mp);
9885 mount_drop(locked_mp, 0);
9886 holding_mntlock = 0;
9887 }
9888 }
9889
9890 if (!batched) {
9891 assert(locked_vp == NULLVP);
9892 vnode_link_lock(fvp);
9893 locked_vp = fvp;
9894 error = vn_authorize_renamex_with_paths(fdvp, mntrename ? mnt_fvp : fvp,
9895 &fromnd->ni_cnd, from_name, tdvp, tvp, &tond->ni_cnd, to_name, ctx,
9896 flags, NULL);
9897 if (error) {
9898 if (error == ENOENT) {
9899 if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
9900 /*
9901 * We encountered a race where after doing the namei,
9902 * tvp stops being valid. If so, simply re-drive the rename
9903 * call from the top.
9904 */
9905 do_retry = 1;
9906 retry_count += 1;
9907 }
9908 }
9909 goto out1;
9910 }
9911 }
9912
9913 /* Release the 'mnt_fvp' now that it is no longer needed. */
9914 if (mnt_fvp != NULLVP) {
9915 vnode_put(mnt_fvp);
9916 mnt_fvp = NULLVP;
9917 }
9918
9919 // save these off so we can later verify that fvp is the same
9920 oname = fvp->v_name;
9921 oparent = fvp->v_parent;
9922
9923 /*
9924 * If renaming a directory, stash its path which we need later when
9925 * updating the 'f_mntonname' of sub mounts.
9926 */
9927 if (dirrename) {
9928 int pathlen = MAXPATHLEN;
9929
9930 old_dirpath = zalloc(ZV_NAMEI);
9931 error = vn_getpath_fsenter(fvp, old_dirpath, &pathlen);
9932 if (error) {
9933 /*
9934 * Process that supports long path (opt-in to IO policy
9935 * IOPOL_TYPE_VFS_SUPPORT_LONG_PATHS) can have directory with path
9936 * length up to MAXLONGPATHLEN (8192). Since max path length in
9937 * mount's 'f_mntonname' is MAXPATHLEN (1024), this means the
9938 * directory can't be the parent of the sub mounts so we can just
9939 * silently drop the error and skip the check to update the
9940 * 'f_mntonname' of sub mounts.
9941 */
9942 if (error == ENOSPC) {
9943 dirrename = false;
9944 error = 0;
9945 if (old_dirpath) {
9946 zfree(ZV_NAMEI, old_dirpath);
9947 old_dirpath = NULL;
9948 }
9949 } else {
9950 goto out1;
9951 }
9952 }
9953 }
9954
9955 skipped_lookup:
9956 #if CONFIG_FILE_LEASES
9957 /* Lease break needed for source's parent dir? */
9958 vnode_breakdirlease(fdvp, false, O_WRONLY);
9959
9960 /* Lease break needed for target's parent dir? */
9961 vnode_breakdirlease(tdvp, false, O_WRONLY);
9962 #endif
9963
9964 error = vn_rename(fdvp, &fvp, &fromnd->ni_cnd, fvap,
9965 tdvp, &tvp, &tond->ni_cnd, tvap,
9966 flags, ctx);
9967
9968 if (locked_vp) {
9969 vnode_link_unlock(fvp);
9970 locked_vp = NULLVP;
9971 }
9972
9973 if (holding_mntlock) {
9974 /*
9975 * we can drop our serialization
9976 * lock now
9977 */
9978 mount_unlock_renames(locked_mp);
9979 mount_drop(locked_mp, 0);
9980 holding_mntlock = 0;
9981 }
9982 if (error) {
9983 if (error == EDATALESS) {
9984 /*
9985 * If we've been here before, something has gone
9986 * horribly wrong and we should just get out lest
9987 * we spiral around the drain forever.
9988 */
9989 if (flags & VFS_RENAME_DATALESS) {
9990 error = EIO;
9991 goto out1;
9992 }
9993
9994 /*
9995 * The object we're renaming is dataless (or has a
9996 * dataless descendent) and requires materialization
9997 * before the rename occurs. But we're holding the
9998 * mount point's rename lock, so it's not safe to
9999 * make the upcall.
10000 *
10001 * In this case, we release the lock (above), perform
10002 * the materialization, and start the whole thing over.
10003 */
10004 error = vfs_materialize_reparent(fvp, tdvp);
10005 if (error == 0) {
10006 /*
10007 * The next time around we need to tell the
10008 * file system that the materializtaion has
10009 * been performed.
10010 */
10011 flags |= VFS_RENAME_DATALESS;
10012 do_retry = 1;
10013 }
10014 goto out1;
10015 }
10016 if (error == EKEEPLOOKING) {
10017 if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) == 0) {
10018 if ((tond->ni_flag & NAMEI_CONTLOOKUP) == 0) {
10019 panic("EKEEPLOOKING without NAMEI_CONTLOOKUP on either ndp?");
10020 }
10021 }
10022
10023 fromnd->ni_vp = fvp;
10024 tond->ni_vp = tvp;
10025
10026 goto continue_lookup;
10027 }
10028
10029 /*
10030 * We may encounter a race in the VNOP where the destination didn't
10031 * exist when we did the namei, but it does by the time we go and
10032 * try to create the entry. In this case, we should re-drive this rename
10033 * call from the top again. Currently, only HFS bubbles out ERECYCLE,
10034 * but other filesystems susceptible to this race could return it, too.
10035 */
10036 if (error == ERECYCLE) {
10037 if (retry_count < MAX_RENAME_ERECYCLE_RETRIES) {
10038 do_retry = 1;
10039 retry_count += 1;
10040 } else {
10041 printf("rename retry limit due to ERECYCLE reached\n");
10042 error = ENOENT;
10043 }
10044 }
10045
10046 /*
10047 * For compound VNOPs, the authorization callback may return
10048 * ENOENT in case of racing hardlink lookups hitting the name
10049 * cache, redrive the lookup.
10050 */
10051 if (batched && error == ENOENT) {
10052 if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
10053 do_retry = 1;
10054 retry_count += 1;
10055 }
10056 }
10057
10058 goto out1;
10059 }
10060
10061 /* call out to allow 3rd party notification of rename.
10062 * Ignore result of kauth_authorize_fileop call.
10063 */
10064 kauth_authorize_fileop(vfs_context_ucred(ctx),
10065 KAUTH_FILEOP_RENAME,
10066 (uintptr_t)from_name, (uintptr_t)to_name);
10067 if (flags & VFS_RENAME_SWAP) {
10068 kauth_authorize_fileop(vfs_context_ucred(ctx),
10069 KAUTH_FILEOP_RENAME,
10070 (uintptr_t)to_name, (uintptr_t)from_name);
10071 }
10072
10073 #if CONFIG_FSE
10074 if (from_name != NULL && to_name != NULL) {
10075 if (from_truncated || to_truncated) {
10076 // set it here since only the from_finfo gets reported up to user space
10077 from_finfo.mode |= FSE_TRUNCATED_PATH;
10078 }
10079
10080 if (tvap && tvp) {
10081 vnode_get_fse_info_from_vap(tvp, &to_finfo, tvap);
10082 }
10083 if (fvap) {
10084 vnode_get_fse_info_from_vap(fvp, &from_finfo, fvap);
10085 }
10086
10087 if (tvp) {
10088 add_fsevent(FSE_RENAME, ctx,
10089 FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
10090 FSE_ARG_FINFO, &from_finfo,
10091 FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
10092 FSE_ARG_FINFO, &to_finfo,
10093 FSE_ARG_DONE);
10094 if (flags & VFS_RENAME_SWAP) {
10095 /*
10096 * Strictly speaking, swap is the equivalent of
10097 * *three* renames. FSEvents clients should only take
10098 * the events as a hint, so we only bother reporting
10099 * two.
10100 */
10101 add_fsevent(FSE_RENAME, ctx,
10102 FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
10103 FSE_ARG_FINFO, &to_finfo,
10104 FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
10105 FSE_ARG_FINFO, &from_finfo,
10106 FSE_ARG_DONE);
10107 }
10108 } else {
10109 add_fsevent(FSE_RENAME, ctx,
10110 FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
10111 FSE_ARG_FINFO, &from_finfo,
10112 FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
10113 FSE_ARG_DONE);
10114 }
10115 }
10116 #endif /* CONFIG_FSE */
10117
10118 /*
10119 * update filesystem's mount point data
10120 */
10121 if (mntrename) {
10122 char *cp, *pathend, *mpname;
10123 char * tobuf;
10124 struct mount *mp;
10125 int maxlen;
10126 size_t len = 0;
10127
10128 mp = fvp->v_mountedhere;
10129
10130 if (vfs_busy(mp, LK_NOWAIT)) {
10131 error = EBUSY;
10132 goto out1;
10133 }
10134 tobuf = zalloc(ZV_NAMEI);
10135
10136 if (UIO_SEG_IS_USER_SPACE(segflg)) {
10137 error = copyinstr(to, tobuf, MAXPATHLEN, &len);
10138 } else {
10139 error = copystr((void *)to, tobuf, MAXPATHLEN, &len);
10140 }
10141 if (!error) {
10142 /* find current mount point prefix */
10143 pathend = &mp->mnt_vfsstat.f_mntonname[0];
10144 for (cp = pathend; *cp != '\0'; ++cp) {
10145 if (*cp == '/') {
10146 pathend = cp + 1;
10147 }
10148 }
10149 /* find last component of target name */
10150 for (mpname = cp = tobuf; *cp != '\0'; ++cp) {
10151 if (*cp == '/') {
10152 mpname = cp + 1;
10153 }
10154 }
10155
10156 /* Update f_mntonname of sub mounts */
10157 vfs_iterate(0, rename_submounts_callback,
10158 (void *)mp->mnt_vfsstat.f_mntonname);
10159
10160 /* append name to prefix */
10161 maxlen = MAXPATHLEN - (int)(pathend - mp->mnt_vfsstat.f_mntonname);
10162 bzero(pathend, maxlen);
10163
10164 strlcpy(pathend, mpname, maxlen);
10165 }
10166 zfree(ZV_NAMEI, tobuf);
10167
10168 vfs_unbusy(mp);
10169
10170 vfs_event_signal(NULL, VQ_UPDATE, (intptr_t)NULL);
10171 } else if (dirrename) {
10172 /*
10173 * If we renamed a directory, we need to check if there is any sub
10174 * mount(s) mounted under the directory. If so, then we need to update
10175 * the sub mount's f_mntonname path.
10176 */
10177 vfs_iterate(0, rename_submounts_callback, (void *)old_dirpath);
10178 }
10179
10180 /*
10181 * fix up name & parent pointers. note that we first
10182 * check that fvp has the same name/parent pointers it
10183 * had before the rename call... this is a 'weak' check
10184 * at best...
10185 *
10186 * XXX oparent and oname may not be set in the compound vnop case
10187 */
10188 if (batched || (oname == fvp->v_name && oparent == fvp->v_parent)) {
10189 int update_flags;
10190
10191 update_flags = VNODE_UPDATE_NAME;
10192
10193 if (fdvp != tdvp) {
10194 update_flags |= VNODE_UPDATE_PARENT;
10195 }
10196
10197 vnode_update_identity(fvp, tdvp, tond->ni_cnd.cn_nameptr, tond->ni_cnd.cn_namelen, tond->ni_cnd.cn_hash, update_flags);
10198 }
10199 out1:
10200 /*
10201 * There are some cases (for e.g. 'fvp == tvp') when vn_authorize was
10202 * skipped earlier as no actual rename was performed.
10203 */
10204 if (vn_authorize_skipped && error == 0) {
10205 error = vn_authorize_renamex_with_paths(fdvp, fvp,
10206 &fromnd->ni_cnd, from_name, tdvp, tvp, &tond->ni_cnd, to_name, ctx,
10207 flags, NULL);
10208 if (error && error == ENOENT) {
10209 if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
10210 do_retry = 1;
10211 retry_count += 1;
10212 }
10213 }
10214 }
10215 if (locked_vp) {
10216 assert(locked_vp == fvp);
10217 vnode_link_unlock(locked_vp);
10218 locked_vp = NULLVP;
10219 }
10220 if (to_name != NULL) {
10221 RELEASE_PATH(to_name);
10222 to_name = NULL;
10223 }
10224 if (to_name_no_firmlink != NULL) {
10225 RELEASE_PATH(to_name_no_firmlink);
10226 to_name_no_firmlink = NULL;
10227 }
10228 if (from_name != NULL) {
10229 RELEASE_PATH(from_name);
10230 from_name = NULL;
10231 }
10232 if (from_name_no_firmlink != NULL) {
10233 RELEASE_PATH(from_name_no_firmlink);
10234 from_name_no_firmlink = NULL;
10235 }
10236 if (old_dirpath != NULL) {
10237 zfree(ZV_NAMEI, old_dirpath);
10238 old_dirpath = NULL;
10239 }
10240 if (holding_mntlock) {
10241 mount_unlock_renames(locked_mp);
10242 mount_drop(locked_mp, 0);
10243 holding_mntlock = 0;
10244 }
10245 if (tdvp) {
10246 /*
10247 * nameidone has to happen before we vnode_put(tdvp)
10248 * since it may need to release the fs_nodelock on the tdvp
10249 */
10250 nameidone(tond);
10251
10252 if (tvp) {
10253 vnode_put(tvp);
10254 }
10255 vnode_put(tdvp);
10256 }
10257 if (fdvp) {
10258 /*
10259 * nameidone has to happen before we vnode_put(fdvp)
10260 * since it may need to release the fs_nodelock on the fdvp
10261 */
10262 nameidone(fromnd);
10263
10264 if (fvp) {
10265 vnode_put(fvp);
10266 }
10267 vnode_put(fdvp);
10268 }
10269 if (mnt_fvp != NULLVP) {
10270 vnode_put(mnt_fvp);
10271 }
10272 /*
10273 * If things changed after we did the namei, then we will re-drive
10274 * this rename call from the top.
10275 */
10276 if (do_retry) {
10277 do_retry = 0;
10278 goto retry;
10279 }
10280
10281 kfree_type(typeof(*__rename_data), __rename_data);
10282 return error;
10283 }
10284
10285 int
rename(__unused proc_t p,struct rename_args * uap,__unused int32_t * retval)10286 rename(__unused proc_t p, struct rename_args *uap, __unused int32_t *retval)
10287 {
10288 return renameat_internal(vfs_context_current(), AT_FDCWD, uap->from,
10289 AT_FDCWD, uap->to, UIO_USERSPACE, 0);
10290 }
10291
10292 int
renameatx_np(__unused proc_t p,struct renameatx_np_args * uap,__unused int32_t * retval)10293 renameatx_np(__unused proc_t p, struct renameatx_np_args *uap, __unused int32_t *retval)
10294 {
10295 if (uap->flags & ~(RENAME_SECLUDE | RENAME_EXCL | RENAME_SWAP | RENAME_NOFOLLOW_ANY | RENAME_RESOLVE_BENEATH)) {
10296 return EINVAL;
10297 }
10298
10299 if ((uap->flags & (RENAME_EXCL | RENAME_SWAP)) == (RENAME_EXCL | RENAME_SWAP)) {
10300 return EINVAL;
10301 }
10302
10303 return renameat_internal(vfs_context_current(), uap->fromfd, uap->from,
10304 uap->tofd, uap->to, UIO_USERSPACE, uap->flags);
10305 }
10306
10307 int
renameat(__unused proc_t p,struct renameat_args * uap,__unused int32_t * retval)10308 renameat(__unused proc_t p, struct renameat_args *uap, __unused int32_t *retval)
10309 {
10310 return renameat_internal(vfs_context_current(), uap->fromfd, uap->from,
10311 uap->tofd, uap->to, UIO_USERSPACE, 0);
10312 }
10313
/*
 * Make a directory file.
 *
 * Parameters:	ctx	Context for authorization / credentials
 *		path	Path of the directory to create
 *		vap	Attributes (mode, ACL, ...) for the new directory
 *		fd	Base directory fd for relative path lookups
 *		segflg	Address space 'path' lives in (user or kernel)
 *
 * Returns:	0 Success
 *		EEXIST
 *	namei:???
 *	vnode_authorize:???
 *	vn_create:???
 */
/* ARGSUSED */
static int
mkdir1at(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap, int fd,
    enum uio_seg segflg)
{
	vnode_t vp, dvp;
	int error;
	int update_flags = 0;
	int batched;
	struct nameidata nd;

	AUDIT_ARG(mode, vap->va_mode);
	/* LOCKPARENT: we need the parent (dvp) held for the create below. */
	NDINIT(&nd, CREATE, OP_MKDIR, LOCKPARENT | AUDITVNPATH1, segflg,
	    path, ctx);
	nd.ni_cnd.cn_flags |= WILLBEDIR;
	/* Advertise willingness to do a compound (lookup+mkdir) VNOP. */
	nd.ni_flag = NAMEI_COMPOUNDMKDIR;

continue_lookup:
	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* A vnode for the last component means the target already exists. */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}

	batched = vnode_compound_mkdir_available(dvp);

	VATTR_SET(vap, va_type, VDIR);

	/*
	 * XXX
	 * Don't authorize in VFS for compound VNOP.... mkdir -p today assumes that it will
	 * only get EXISTS or EISDIR for existing path components, and not that it could see
	 * EACCESS/EPERM--so if we authorize for mkdir on "/" for "mkdir -p /tmp/foo/bar/baz"
	 * it will fail in a spurious manner. Need to figure out if this is valid behavior.
	 */
	if ((error = vn_authorize_mkdir(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0) {
		if (error == EACCES || error == EPERM) {
			int error2;

			/* Drop the first lookup's state before re-driving namei. */
			nameidone(&nd);
			vnode_put(dvp);
			dvp = NULLVP;

			/*
			 * Try a lookup without "NAMEI_COMPOUNDVNOP" to make sure we return EEXIST
			 * rather than EACCESS if the target exists.
			 */
			NDINIT(&nd, LOOKUP, OP_MKDIR, AUDITVNPATH1, segflg,
			    path, ctx);
			error2 = nameiat(&nd, fd);
			if (error2) {
				/* Target really doesn't exist; keep the auth error. */
				goto out;
			} else {
				vp = nd.ni_vp;
				error = EEXIST;
				goto out;
			}
		}

		goto out;
	}

#if CONFIG_FILE_LEASES
	/* Creating an entry writes the parent dir; break any dir lease on it. */
	vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

	/*
	 * make the directory
	 */
	if ((error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx)) != 0) {
		if (error == EKEEPLOOKING) {
			/* Filesystem wants the lookup continued (compound VNOP). */
			nd.ni_vp = vp;
			goto continue_lookup;
		}

		goto out;
	}

	// Make sure the name & parent pointers are hooked up
	if (vp->v_name == NULL) {
		update_flags |= VNODE_UPDATE_NAME;
	}
	if (vp->v_parent == NULLVP) {
		update_flags |= VNODE_UPDATE_PARENT;
	}

	if (update_flags) {
		vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
	}

#if CONFIG_FSE
	add_fsevent(FSE_CREATE_DIR, ctx, FSE_ARG_VNODE, vp, FSE_ARG_DONE);
#endif

out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	if (dvp) {
		vnode_put(dvp);
	}

	return error;
}
10439
10440 /*
10441 * mkdir_extended: Create a directory; with extended security (ACL).
10442 *
10443 * Parameters: p Process requesting to create the directory
10444 * uap User argument descriptor (see below)
10445 * retval (ignored)
10446 *
10447 * Indirect: uap->path Path of directory to create
10448 * uap->mode Access permissions to set
10449 * uap->xsecurity ACL to set
10450 *
10451 * Returns: 0 Success
10452 * !0 Not success
10453 *
10454 */
10455 int
mkdir_extended(proc_t p,struct mkdir_extended_args * uap,__unused int32_t * retval)10456 mkdir_extended(proc_t p, struct mkdir_extended_args *uap, __unused int32_t *retval)
10457 {
10458 int ciferror;
10459 kauth_filesec_t xsecdst;
10460 struct vnode_attr va;
10461
10462 AUDIT_ARG(owner, uap->uid, uap->gid);
10463
10464 xsecdst = NULL;
10465 if ((uap->xsecurity != USER_ADDR_NULL) &&
10466 ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)) {
10467 return ciferror;
10468 }
10469
10470 VATTR_INIT(&va);
10471 VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd.fd_cmask);
10472 if (xsecdst != NULL) {
10473 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
10474 va.va_vaflags |= VA_FILESEC_ACL;
10475 }
10476
10477 ciferror = mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
10478 UIO_USERSPACE);
10479 if (xsecdst != NULL) {
10480 kauth_filesec_free(xsecdst);
10481 }
10482 return ciferror;
10483 }
10484
10485 int
mkdir(proc_t p,struct mkdir_args * uap,__unused int32_t * retval)10486 mkdir(proc_t p, struct mkdir_args *uap, __unused int32_t *retval)
10487 {
10488 struct vnode_attr va;
10489
10490 VATTR_INIT(&va);
10491 VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd.fd_cmask);
10492
10493 return mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
10494 UIO_USERSPACE);
10495 }
10496
10497 int
mkdirat(proc_t p,struct mkdirat_args * uap,__unused int32_t * retval)10498 mkdirat(proc_t p, struct mkdirat_args *uap, __unused int32_t *retval)
10499 {
10500 struct vnode_attr va;
10501
10502 VATTR_INIT(&va);
10503 VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd.fd_cmask);
10504
10505 return mkdir1at(vfs_context_current(), uap->path, &va, uap->fd,
10506 UIO_USERSPACE);
10507 }
10508
/*
 * Remove the directory named by 'dirpath' (resolved relative to 'fd' when
 * the path is relative).
 *
 * Parameters:	ctx		Context for authorization / credentials
 *		fd		Base directory fd for relative path lookups
 *		dirpath		Path of the directory to remove
 *		segflg		Address space 'dirpath' lives in (user/kernel)
 *		unlink_flags	VNODE_REMOVE_* modifier flags
 *
 * Returns:	0		Success
 *		!0		errno from lookup, authorization, or the VNOP
 */
static int
rmdirat_internal(vfs_context_t ctx, int fd, user_addr_t dirpath,
    enum uio_seg segflg, int unlink_flags)
{
	/* Heap-allocated: nameidata (and vnode_attr) are too big for the stack. */
	struct {
		struct nameidata nd;
#if CONFIG_FSE
		struct vnode_attr va;
#endif /* CONFIG_FSE */
	} *__rmdir_data;
	vnode_t vp, dvp;
	int error;
	struct nameidata *ndp;
	char *path = NULL;
	char *no_firmlink_path = NULL;
	int len_path = 0;
	int len_no_firmlink_path = 0;
	int has_listeners = 0;
	int need_event = 0;
	int truncated_path = 0;
	int truncated_no_firmlink_path = 0;
	struct vnode_attr *vap = NULL;
	int restart_count = 0;
	int batched;

	int restart_flag;
	int namei_flags = 0;

	__rmdir_data = kalloc_type(typeof(*__rmdir_data), Z_WAITOK);
	ndp = &__rmdir_data->nd;

	/* Translate VNODE_REMOVE_* path-resolution flags into namei flags. */
	if (unlink_flags & VNODE_REMOVE_NOFOLLOW_ANY) {
		namei_flags |= NAMEI_NOFOLLOW_ANY;
		unlink_flags &= ~VNODE_REMOVE_NOFOLLOW_ANY;
	}
	if (unlink_flags & VNODE_REMOVE_RESOLVE_BENEATH) {
		namei_flags |= NAMEI_RESOLVE_BENEATH;
		unlink_flags &= ~VNODE_REMOVE_RESOLVE_BENEATH;
	}
	if (unlink_flags & VNODE_REMOVE_UNIQUE) {
		namei_flags |= NAMEI_UNIQUE;
		unlink_flags &= ~VNODE_REMOVE_UNIQUE;
	}

	/*
	 * This loop exists to restart rmdir in the unlikely case that two
	 * processes are simultaneously trying to remove the same directory
	 * containing orphaned appleDouble files.
	 */
	do {
		NDINIT(ndp, DELETE, OP_RMDIR, LOCKPARENT | AUDITVNPATH1,
		    segflg, dirpath, ctx);
		/* Advertise willingness to do a compound (lookup+rmdir) VNOP. */
		ndp->ni_flag = NAMEI_COMPOUNDRMDIR | namei_flags;
continue_lookup:
		restart_flag = 0;
		vap = NULL;

		error = nameiat(ndp, fd);
		if (error) {
			goto err_out;
		}

		dvp = ndp->ni_dvp;
		vp = ndp->ni_vp;

		if (vp) {
			batched = vnode_compound_rmdir_available(vp);

			if (vp->v_flag & VROOT) {
				/*
				 * The root of a mounted filesystem cannot be deleted.
				 */
				error = EBUSY;
				goto out;
			}

#if DEVELOPMENT || DEBUG
			/*
			 * XXX VSWAP: Check for entitlements or special flag here
			 * so we can restrict access appropriately.
			 */
#else /* DEVELOPMENT || DEBUG */

			/* Swap-backing files may only be removed by the kernel. */
			if (vnode_isswap(vp) && (ctx != vfs_context_kernel())) {
				error = EPERM;
				goto out;
			}
#endif /* DEVELOPMENT || DEBUG */

			/*
			 * Removed a check here; we used to abort if vp's vid
			 * was not the same as what we'd seen the last time around.
			 * I do not think that check was valid, because if we retry
			 * and all dirents are gone, the directory could legitimately
			 * be recycled but still be present in a situation where we would
			 * have had permission to delete. Therefore, we won't make
			 * an effort to preserve that check now that we may not have a
			 * vp here.
			 */

			if (!batched) {
				error = vn_authorize_rmdir(dvp, vp, &ndp->ni_cnd, ctx, NULL);
				if (error) {
					if (error == ENOENT) {
						/* Raced with a concurrent removal; redrive the lookup. */
						if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
							restart_flag = 1;
							restart_count += 1;
						}
					}
					goto out;
				}
			}
		} else {
			batched = 1;

			if (!vnode_compound_rmdir_available(dvp)) {
				panic("No error, but no compound rmdir?");
			}
		}

#if CONFIG_FSE
		fse_info finfo = {0};

		need_event = need_fsevent(FSE_DELETE, dvp);
		if (need_event) {
			if (!batched) {
				get_fse_info(vp, &finfo, ctx);
			} else {
				/* No vp yet; ask the FS for notify attributes instead. */
				error = vfs_get_notify_attributes(&__rmdir_data->va);
				if (error) {
					goto out;
				}

				vap = &__rmdir_data->va;
			}
		}
#endif
		has_listeners = kauth_authorize_fileop_has_listeners();
		if (need_event || has_listeners) {
			/* Capture paths up front; they're gone after the removal. */
			if (path == NULL) {
				GET_PATH(path);
			}

			len_path = safe_getpath(dvp, ndp->ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path);

			if (no_firmlink_path == NULL) {
				GET_PATH(no_firmlink_path);
			}

			len_no_firmlink_path = safe_getpath_no_firmlink(dvp, ndp->ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);
#if CONFIG_FSE
			if (truncated_no_firmlink_path) {
				finfo.mode |= FSE_TRUNCATED_PATH;
			}
#endif
		}

#if CONFIG_FILE_LEASES
		/* Removing an entry writes the parent dir; break any dir lease. */
		vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

		error = vn_rmdir(dvp, &vp, ndp, vap, ctx);
		ndp->ni_vp = vp;
		if (vp == NULLVP) {
			/* Couldn't find a vnode */
			goto out;
		}

		if (error == EKEEPLOOKING) {
			/* Filesystem wants the lookup continued (compound VNOP). */
			goto continue_lookup;
		} else if (batched && error == ENOENT) {
			if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
				/*
				 * For compound VNOPs, the authorization callback
				 * may return ENOENT in case of racing hard link lookups
				 * redrive the lookup.
				 */
				restart_flag = 1;
				restart_count += 1;
				goto out;
			}
		}

		/*
		 * XXX There's no provision for passing flags
		 * to VNOP_RMDIR().  So, if vn_rmdir() fails
		 * because it's not empty, then we try again
		 * with VNOP_REMOVE(), passing in a special
		 * flag that clever file systems will know
		 * how to handle.
		 */
		if (error == ENOTEMPTY &&
		    (unlink_flags & VNODE_REMOVE_DATALESS_DIR) != 0) {
			/*
			 * Only do this if the directory is actually
			 * marked as DATALESS.
			 */
			struct vnode_attr *lvap =
			    kalloc_type(struct vnode_attr, Z_WAITOK);

			VATTR_INIT(lvap);
			VATTR_WANTED(lvap, va_flags);
			if (vnode_getattr(vp, lvap, ctx) == 0 &&
			    VATTR_IS_SUPPORTED(lvap, va_flags) &&
			    (lvap->va_flags & SF_DATALESS) != 0) {
				/*
				 * If this fails, we want to keep the original
				 * error.
				 */
				if (vn_remove(dvp, &vp, ndp,
				    VNODE_REMOVE_DATALESS_DIR, vap, ctx) == 0) {
					error = 0;
				}
			}
			kfree_type(struct vnode_attr, lvap);
		}

#if CONFIG_APPLEDOUBLE
		/*
		 * Special case to remove orphaned AppleDouble
		 * files. I don't like putting this in the kernel,
		 * but carbon does not like putting this in carbon either,
		 * so here we are.
		 */
		if (error == ENOTEMPTY) {
			int ad_error = rmdir_remove_orphaned_appleDouble(vp, ctx, &restart_flag);
			if (ad_error == EBUSY) {
				error = ad_error;
				goto out;
			}


			/*
			 * Assuming everything went well, we will try the RMDIR again
			 */
			if (!ad_error) {
				error = vn_rmdir(dvp, &vp, ndp, vap, ctx);
			}
		}
#endif /* CONFIG_APPLEDOUBLE */
		/*
		 * Call out to allow 3rd party notification of delete.
		 * Ignore result of kauth_authorize_fileop call.
		 */
		if (!error) {
			if (has_listeners) {
				kauth_authorize_fileop(vfs_context_ucred(ctx),
				    KAUTH_FILEOP_DELETE,
				    (uintptr_t)vp,
				    (uintptr_t)path);
			}

			if (vp->v_flag & VISHARDLINK) {
				// see the comment in unlink1() about why we update
				// the parent of a hard link when it is removed
				vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
			}

#if CONFIG_FSE
			if (need_event) {
				if (vap) {
					vnode_get_fse_info_from_vap(vp, &finfo, vap);
				}
				add_fsevent(FSE_DELETE, ctx,
				    FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
				    FSE_ARG_FINFO, &finfo,
				    FSE_ARG_DONE);
			}
#endif

#if CONFIG_MACF
			mac_vnode_notify_unlink(ctx, dvp, vp, &ndp->ni_cnd);
#endif
		}

out:
		if (path != NULL) {
			RELEASE_PATH(path);
			path = NULL;
		}

		if (no_firmlink_path != NULL) {
			RELEASE_PATH(no_firmlink_path);
			no_firmlink_path = NULL;
		}

		/*
		 * nameidone has to happen before we vnode_put(dvp)
		 * since it may need to release the fs_nodelock on the dvp
		 */
		nameidone(ndp);
		vnode_put(dvp);

		if (vp) {
			vnode_put(vp);
		}

		if (restart_flag == 0) {
			/*
			 * vp is used only as a wait-channel address here; the
			 * iocount was already dropped above.
			 */
			wakeup_one((caddr_t)vp);
			goto err_out;
		}
		/* Brief sleep before retrying after the AppleDouble cleanup race. */
		tsleep(vp, PVFS, "rm AD", 1);
	} while (restart_flag != 0);

err_out:
	kfree_type(typeof(*__rmdir_data), __rmdir_data);

	return error;
}
10818
10819 /*
10820 * Remove a directory file.
10821 */
10822 /* ARGSUSED */
10823 int
rmdir(__unused proc_t p,struct rmdir_args * uap,__unused int32_t * retval)10824 rmdir(__unused proc_t p, struct rmdir_args *uap, __unused int32_t *retval)
10825 {
10826 return rmdirat_internal(vfs_context_current(), AT_FDCWD,
10827 CAST_USER_ADDR_T(uap->path), UIO_USERSPACE, 0);
10828 }
10829
/*
 * Size of a struct direntry holding a name of 'namlen' bytes, padded to
 * 8-byte alignment (d_name is declared MAXPATHLEN bytes wide).
 */
#define DIRENT64_LEN(namlen) \
	((sizeof(struct direntry) + (namlen) - (MAXPATHLEN-1) + 7) & ~7)

/*
 * Size of a struct dirent holding a name of 'namelen' bytes (plus NUL),
 * padded to 4-byte alignment (d_name is __DARWIN_MAXNAMLEN+1 bytes wide).
 */
#define DIRENT_LEN(namelen) \
	((sizeof(struct dirent) + (namelen + 1) - (__DARWIN_MAXNAMLEN + 1) + 3) & ~3)

/* Address of the last byte of this dirent, per its d_reclen. */
#define DIRENT_END(dep) \
	(((char *)(dep)) + (dep)->d_reclen - 1)
10841
10842 errno_t
vnode_readdir64(struct vnode * vp,struct uio * uio,int flags,int * eofflag,int * numdirent,vfs_context_t ctxp)10843 vnode_readdir64(struct vnode *vp, struct uio *uio, int flags, int *eofflag,
10844 int *numdirent, vfs_context_t ctxp)
10845 {
10846 /* Check if fs natively supports VNODE_READDIR_EXTENDED */
10847 if ((vp->v_mount->mnt_vtable->vfc_vfsflags & VFC_VFSREADDIR_EXTENDED) &&
10848 ((vp->v_mount->mnt_kern_flag & MNTK_DENY_READDIREXT) == 0)) {
10849 return VNOP_READDIR(vp, uio, flags, eofflag, numdirent, ctxp);
10850 } else {
10851 size_t bufsize;
10852 void * bufptr;
10853 uio_t auio;
10854 struct direntry *entry64;
10855 struct dirent *dep;
10856 size_t bytesread;
10857 int error;
10858
10859 /*
10860 * We're here because the underlying file system does not
10861 * support direnties or we mounted denying support so we must
10862 * fall back to dirents and convert them to direntries.
10863 *
10864 * Our kernel buffer needs to be smaller since re-packing will
10865 * expand each dirent. The worse case (when the name length
10866 * is 3 or less) corresponds to a struct direntry size of 32
10867 * bytes (8-byte aligned) and a struct dirent size of 12 bytes
10868 * (4-byte aligned). So having a buffer that is 3/8 the size
10869 * will prevent us from reading more than we can pack.
10870 *
10871 * Since this buffer is wired memory, we will limit the
10872 * buffer size to a maximum of 32K. We would really like to
10873 * use 32K in the MIN(), but we use magic number 87371 to
10874 * prevent uio_resid() * 3 / 8 from overflowing.
10875 */
10876 bufsize = 3 * MIN((user_size_t)uio_resid(uio), 87371u) / 8;
10877 bufptr = kalloc_data(bufsize, Z_WAITOK);
10878 if (bufptr == NULL) {
10879 return ENOMEM;
10880 }
10881
10882 auio = uio_create(1, 0, UIO_SYSSPACE, UIO_READ);
10883 uio_addiov(auio, (uintptr_t)bufptr, bufsize);
10884 auio->uio_offset = uio->uio_offset;
10885
10886 error = VNOP_READDIR(vp, auio, 0, eofflag, numdirent, ctxp);
10887
10888 dep = (struct dirent *)bufptr;
10889 bytesread = bufsize - uio_resid(auio);
10890
10891 entry64 = kalloc_type(struct direntry, Z_WAITOK);
10892 /*
10893 * Convert all the entries and copy them out to user's buffer.
10894 */
10895 while (error == 0 && (char *)dep < ((char *)bufptr + bytesread)) {
10896 /* First check that the dirent struct up to d_name is within the buffer */
10897 if ((char*)dep + offsetof(struct dirent, d_name) > ((char *)bufptr + bytesread) ||
10898 /* Check that the length of the entire dirent is within the buffer */
10899 DIRENT_END(dep) > ((char *)bufptr + bytesread) ||
10900 /* Check that the actual length including the name doesn't exceed d_reclen */
10901 DIRENT_LEN(dep->d_namlen) > dep->d_reclen) {
10902 printf("%s: %s: Bad dirent recived from directory %s\n", __func__,
10903 vp->v_mount->mnt_vfsstat.f_mntonname,
10904 vp->v_name ? vp->v_name : "<unknown>");
10905 error = EIO;
10906 break;
10907 }
10908
10909 size_t enbufsize = DIRENT64_LEN(dep->d_namlen);
10910
10911 bzero(entry64, enbufsize);
10912 /* Convert a dirent to a dirent64. */
10913 entry64->d_ino = dep->d_ino;
10914 entry64->d_seekoff = 0;
10915 entry64->d_reclen = (uint16_t)enbufsize;
10916 entry64->d_namlen = dep->d_namlen;
10917 entry64->d_type = dep->d_type;
10918 bcopy(dep->d_name, entry64->d_name, dep->d_namlen + 1);
10919
10920 /* Move to next entry. */
10921 dep = (struct dirent *)((char *)dep + dep->d_reclen);
10922
10923 /* Copy entry64 to user's buffer. */
10924 error = uiomove((caddr_t)entry64, entry64->d_reclen, uio);
10925 }
10926
10927 /* Update the real offset using the offset we got from VNOP_READDIR. */
10928 if (error == 0) {
10929 uio->uio_offset = auio->uio_offset;
10930 }
10931 uio_free(auio);
10932 kfree_data(bufptr, bufsize);
10933 kfree_type(struct direntry, entry64);
10934 return error;
10935 }
10936 }
10937
/* Upper bound on a single getdirentries() transfer (128 MiB). */
#define GETDIRENTRIES_MAXBUFSIZE (128 * 1024 * 1024U)

/*
 * Read a block of directory entries in a file system independent format.
 *
 * Parameters:	fd		Open directory file descriptor
 *		bufp		User buffer for the entries
 *		bufsize		User buffer size (clamped to the max above)
 *		bytesread	Out: bytes copied to the user buffer
 *		offset		Out (optional): file offset before the read
 *		eofflag		Out: end-of-directory indicator
 *		flags		VNODE_READDIR_EXTENDED selects direntry format
 *
 * Returns:	0		Success
 *		EBADF		fd not open for reading
 *		EINVAL		fd does not reference a directory
 *	fp_getfvp:???
 *	vnode_getwithref:???
 *	VNOP_READDIR:???
 */
static int
getdirentries_common(int fd, user_addr_t bufp, user_size_t bufsize, ssize_t *bytesread,
    off_t *offset, int *eofflag, int flags)
{
	vnode_t vp;
	struct vfs_context context = *vfs_context_current(); /* local copy */
	struct fileproc *fp;
	uio_t auio;
	int spacetype = proc_is64bit(vfs_context_proc(&context)) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	off_t loff;
	int error, numdirent;
	UIO_STACKBUF(uio_buf, 1);

get_from_fd:
	error = fp_getfvp(vfs_context_proc(&context), fd, &fp, &vp);
	if (error) {
		return error;
	}

	/*
	 * Serialize offset updates on this open file.  If the fileproc's
	 * backing vnode changed between fp_getfvp() and taking the offset
	 * lock (e.g. a union-mount switch below), drop everything and
	 * re-resolve the fd.
	 */
	vn_offset_lock(fp->fp_glob);
	if (((vnode_t)fp_get_data(fp)) != vp) {
		vn_offset_unlock(fp->fp_glob);
		file_drop(fd);
		goto get_from_fd;
	}

	if ((fp->fp_glob->fg_flag & FREAD) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EBADF;
		goto out;
	}

	if (bufsize > GETDIRENTRIES_MAXBUFSIZE) {
		bufsize = GETDIRENTRIES_MAXBUFSIZE;
	}

#if CONFIG_MACF
	error = mac_file_check_change_offset(vfs_context_ucred(&context), fp->fp_glob);
	if (error) {
		goto out;
	}
#endif

	if ((error = vnode_getwithref(vp))) {
		goto out;
	}
	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

#if CONFIG_UNION_MOUNTS
unionread:
#endif /* CONFIG_UNION_MOUNTS */
	if (vp->v_type != VDIR) {
		(void)vnode_put(vp);
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_readdir(&context, vp);
	if (error != 0) {
		(void)vnode_put(vp);
		goto out;
	}
#endif /* MAC */

	loff = fp->fp_glob->fg_offset;
	auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, bufp, bufsize);

	if (flags & VNODE_READDIR_EXTENDED) {
		error = vnode_readdir64(vp, auio, flags, eofflag, &numdirent, &context);
		fp->fp_glob->fg_offset = uio_offset(auio);
	} else {
		error = VNOP_READDIR(vp, auio, 0, eofflag, &numdirent, &context);
		fp->fp_glob->fg_offset = uio_offset(auio);
	}
	if (error) {
		(void)vnode_put(vp);
		goto out;
	}

#if CONFIG_UNION_MOUNTS
	/*
	 * Nothing was read and this is a union mount: descend to the lower
	 * layer's directory, swap it into the fileproc, and read from there.
	 */
	if ((user_ssize_t)bufsize == uio_resid(auio) &&
	    (vp->v_mount->mnt_flag & MNT_UNION)) {
		vnode_t uvp;

		if (lookup_traverse_union(vp, &uvp, &context) == 0) {
			if (vnode_ref(uvp) == 0) {
				if ((error = VNOP_OPEN(uvp, fp->fp_glob->fg_flag, &context)) == 0) {
					fp_set_data(fp, uvp);
					/* Close the old vnode to maintain proper lifecycle */
					VNOP_CLOSE(vp, fp->fp_glob->fg_flag, &context);
					fp->fp_glob->fg_offset = 0;
					vnode_rele(vp);
					vnode_put(vp);
					vp = uvp;
					goto unionread;
				} else {
					vnode_rele(uvp);
					vnode_put(uvp);
				}
			} else {
				/* could not get a ref, can't replace in fd */
				vnode_put(uvp);
			}
		}
	}
#endif /* CONFIG_UNION_MOUNTS */

	vnode_put(vp);
	if (offset) {
		*offset = loff;
	}

	*bytesread = bufsize - uio_resid(auio);
out:
	vn_offset_unlock(fp->fp_glob);
	file_drop(fd);
	return error;
}
11063
11064
11065 int
getdirentries(__unused struct proc * p,struct getdirentries_args * uap,int32_t * retval)11066 getdirentries(__unused struct proc *p, struct getdirentries_args *uap, int32_t *retval)
11067 {
11068 off_t offset;
11069 ssize_t bytesread;
11070 int error, eofflag;
11071
11072 AUDIT_ARG(fd, uap->fd);
11073 error = getdirentries_common(uap->fd, uap->buf, uap->count,
11074 &bytesread, &offset, &eofflag, 0);
11075
11076 if (error == 0) {
11077 if (proc_is64bit(p)) {
11078 user64_long_t base = (user64_long_t)offset;
11079 error = copyout((caddr_t)&base, uap->basep, sizeof(user64_long_t));
11080 } else {
11081 user32_long_t base = (user32_long_t)offset;
11082 error = copyout((caddr_t)&base, uap->basep, sizeof(user32_long_t));
11083 }
11084 *retval = (int)bytesread;
11085 }
11086 return error;
11087 }
11088
11089 int
getdirentries64(__unused struct proc * p,struct getdirentries64_args * uap,user_ssize_t * retval)11090 getdirentries64(__unused struct proc *p, struct getdirentries64_args *uap, user_ssize_t *retval)
11091 {
11092 off_t offset;
11093 ssize_t bytesread;
11094 int error, eofflag;
11095 user_size_t bufsize;
11096
11097 AUDIT_ARG(fd, uap->fd);
11098
11099 /*
11100 * If the buffer is at least GETDIRENTRIES64_EXTENDED_BUFSIZE large,
11101 * then the kernel carves out the last 4 bytes to return extended
11102 * information to userspace (namely whether we reached EOF with this call).
11103 */
11104 if (uap->bufsize >= GETDIRENTRIES64_EXTENDED_BUFSIZE) {
11105 bufsize = uap->bufsize - sizeof(getdirentries64_flags_t);
11106 } else {
11107 bufsize = uap->bufsize;
11108 }
11109
11110 error = getdirentries_common(uap->fd, uap->buf, bufsize,
11111 &bytesread, &offset, &eofflag, VNODE_READDIR_EXTENDED);
11112
11113 if (error == 0) {
11114 *retval = bytesread;
11115 error = copyout((caddr_t)&offset, uap->position, sizeof(off_t));
11116
11117 if (error == 0 && uap->bufsize >= GETDIRENTRIES64_EXTENDED_BUFSIZE) {
11118 getdirentries64_flags_t flags = 0;
11119 if (eofflag) {
11120 flags |= GETDIRENTRIES64_EOF;
11121 }
11122 error = copyout(&flags, (user_addr_t)uap->buf + bufsize,
11123 sizeof(flags));
11124 }
11125 }
11126 return error;
11127 }
11128
11129
11130 /*
11131 * Set the mode mask for creation of filesystem nodes.
11132 * XXX implement xsecurity
11133 */
11134 #define UMASK_NOXSECURITY (void *)1 /* leave existing xsecurity alone */
11135 static int
umask1(proc_t p,int newmask,__unused kauth_filesec_t fsec,int32_t * retval)11136 umask1(proc_t p, int newmask, __unused kauth_filesec_t fsec, int32_t *retval)
11137 {
11138 AUDIT_ARG(mask, newmask);
11139 proc_fdlock(p);
11140 *retval = p->p_fd.fd_cmask;
11141 p->p_fd.fd_cmask = newmask & ALLPERMS;
11142 proc_fdunlock(p);
11143 return 0;
11144 }
11145
11146 /*
11147 * umask_extended: Set the mode mask for creation of filesystem nodes; with extended security (ACL).
11148 *
11149 * Parameters: p Process requesting to set the umask
11150 * uap User argument descriptor (see below)
11151 * retval umask of the process (parameter p)
11152 *
11153 * Indirect: uap->newmask umask to set
11154 * uap->xsecurity ACL to set
11155 *
11156 * Returns: 0 Success
11157 * !0 Not success
11158 *
11159 */
int
umask_extended(proc_t p, struct umask_extended_args *uap, int32_t *retval)
{
	/*
	 * NOTE(review): uap->xsecurity (documented above as "ACL to set") is
	 * never copied in here — KAUTH_FILESEC_NONE is passed and umask1()
	 * ignores its fsec argument ("XXX implement xsecurity").  Confirm
	 * the ACL argument is intentionally dropped.
	 */
	return umask1(p, uap->newmask, KAUTH_FILESEC_NONE, retval);
}
11165
/* umask(2): set the file-creation mask, leaving any existing xsecurity alone. */
int
umask(proc_t p, struct umask_args *uap, int32_t *retval)
{
	return umask1(p, uap->newmask, UMASK_NOXSECURITY, retval);
}
11171
11172 #define REVOKE_MOUNTED_DEVICE_ENTITLEMENT \
11173 "com.apple.private.vfs.revoke-mounted-device"
11174
11175 /*
11176 * Void all references to file by ripping underlying filesystem
11177 * away from vnode.
11178 */
11179 /* ARGSUSED */
int
revoke(proc_t p, struct revoke_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	struct vnode_attr va;
	vfs_context_t ctx = vfs_context_current();
	int error;
	struct nameidata nd;

	/* Resolve the path (FOLLOW: a symlink to the device node is fine). */
	NDINIT(&nd, LOOKUP, OP_REVOKE, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
	    uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;

	nameidone(&nd);

	/* revoke(2) only applies to character and block special files. */
	if (!(vnode_ischr(vp) || vnode_isblk(vp))) {
		error = ENOTSUP;
		goto out;
	}

	/*
	 * Refuse to revoke a block device that has a file system mounted on
	 * it.  NOTE(review): REVOKE_MOUNTED_DEVICE_ENTITLEMENT is defined
	 * just above but is not consulted anywhere in this function —
	 * confirm whether entitled callers were meant to bypass this check.
	 */
	if (vnode_isblk(vp) && vnode_ismountedon(vp)) {
		error = EBUSY;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_revoke(ctx, vp);
	if (error) {
		goto out;
	}
#endif

	/* Caller must own the node or be superuser. */
	VATTR_INIT(&va);
	VATTR_WANTED(&va, va_uid);
	if ((error = vnode_getattr(vp, &va, ctx))) {
		goto out;
	}
	if (kauth_cred_getuid(vfs_context_ucred(ctx)) != va.va_uid &&
	    (error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
		goto out;
	}
	/* Only revoke if somebody actually holds the vnode open or aliased. */
	if (vp->v_usecount > 0 || (vnode_isaliased(vp))) {
		VNOP_REVOKE(vp, REVOKEALL, ctx);
	}
out:
	vnode_put(vp);
	return error;
}
11232
11233
11234 /*
 * HFS/HFS Plus SPECIFIC SYSTEM CALLS
11236 * The following system calls are designed to support features
11237 * which are specific to the HFS & HFS Plus volume formats
11238 */
11239
11240
11241 /*
11242 * Obtain attribute information on objects in a directory while enumerating
11243 * the directory.
11244 */
11245 /* ARGSUSED */
/*
 * getdirentriesattr(2): enumerate a directory, returning the requested
 * attributes for each entry via VNOP_READDIRATTR.  Copies out the number
 * of entries returned (uap->count), a directory "state" token
 * (uap->newstate), and the pre-read offset (uap->basep); *retval is the
 * EOF indicator (0 or 1).
 */
int
getdirentriesattr(proc_t p, struct getdirentriesattr_args *uap, int32_t *retval)
{
	vnode_t vp;
	struct fileproc *fp;
	uio_t auio = NULL;
	int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	uint32_t count = 0, savecount = 0;
	uint32_t newstate = 0;
	int error, eofflag = 0;
	off_t loff = 0;
	struct attrlist attributelist;
	vfs_context_t ctx = vfs_context_current();
	int fd = uap->fd;
	UIO_STACKBUF(uio_buf, 1);
	kauth_action_t action;

	AUDIT_ARG(fd, fd);

	/* Get the attributes into kernel space */
	if ((error = copyin(uap->alist, (caddr_t)&attributelist, sizeof(attributelist)))) {
		return error;
	}
	if ((error = copyin(uap->count, (caddr_t)&count, sizeof(count)))) {
		return error;
	}
	/* Remember the caller's requested count; restored on union descent. */
	savecount = count;

get_from_fd:
	if ((error = fp_getfvp(p, fd, &fp, &vp))) {
		return error;
	}

	/*
	 * Lock the fd's offset, then re-check that the backing vnode did
	 * not change underneath us (a racing union-layer swap); retry if so.
	 */
	vn_offset_lock(fp->fp_glob);
	if (((vnode_t)fp_get_data(fp)) != vp) {
		vn_offset_unlock(fp->fp_glob);
		file_drop(fd);
		goto get_from_fd;
	}

	/* Directory must have been opened for reading. */
	if ((fp->fp_glob->fg_flag & FREAD) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EBADF;
		goto out;
	}


#if CONFIG_MACF
	error = mac_file_check_change_offset(vfs_context_ucred(ctx),
	    fp->fp_glob);
	if (error) {
		goto out;
	}
#endif


	if ((error = vnode_getwithref(vp))) {
		goto out;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

#if CONFIG_UNION_MOUNTS
unionread:
#endif /* CONFIG_UNION_MOUNTS */
	if (vp->v_type != VDIR) {
		(void)vnode_put(vp);
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_readdir(ctx, vp);
	if (error != 0) {
		(void)vnode_put(vp);
		goto out;
	}
#endif /* MAC */

	/* set up the uio structure which will contain the users return buffer */
	loff = fp->fp_glob->fg_offset;
	auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, uap->buffer, uap->buffersize);

	/*
	 * If the only item requested is file names, we can let that past with
	 * just LIST_DIRECTORY.  If they want any other attributes, that means
	 * they need SEARCH as well.
	 */
	action = KAUTH_VNODE_LIST_DIRECTORY;
	if ((attributelist.commonattr & ~ATTR_CMN_NAME) ||
	    attributelist.fileattr || attributelist.dirattr) {
		action |= KAUTH_VNODE_SEARCH;
	}

	if ((error = vnode_authorize(vp, NULL, action, ctx)) == 0) {
		/* Believe it or not, uap->options only has 32-bits of valid
		 * info, so truncate before extending again */

		error = VNOP_READDIRATTR(vp, &attributelist, auio, count,
		    (uint32_t)uap->options, &newstate, &eofflag, &count, ctx);
	}

	if (error) {
		(void) vnode_put(vp);
		goto out;
	}

#if CONFIG_UNION_MOUNTS
	/*
	 * If we've got the last entry of a directory in a union mount
	 * then reset the eofflag and pretend there's still more to come.
	 * The next call will again set eofflag and the buffer will be empty,
	 * so traverse to the underlying directory and do the directory
	 * read there.
	 */
	if (eofflag && vp->v_mount->mnt_flag & MNT_UNION) {
		if (uio_resid(auio) < (user_ssize_t) uap->buffersize) { // Got some entries
			eofflag = 0;
		} else { // Empty buffer
			vnode_t uvp;
			if (lookup_traverse_union(vp, &uvp, ctx) == 0) {
				/* Take a usecount matching the fd's O_EVTONLY mode. */
				if (vnode_ref_ext(uvp, fp->fp_glob->fg_flag & O_EVTONLY, 0) == 0) {
					if ((error = VNOP_OPEN(uvp, fp->fp_glob->fg_flag, ctx)) == 0) {
						fp_set_data(fp, uvp);
						/* Close and release the old vnode to maintain proper lifecycle */
						VNOP_CLOSE(vp, fp->fp_glob->fg_flag, ctx);
						fp->fp_glob->fg_offset = 0; // reset index for new dir
						count = savecount;
						vnode_rele_internal(vp, fp->fp_glob->fg_flag & O_EVTONLY, 0, 0);
						vnode_put(vp);
						vp = uvp;
						goto unionread;
					} else {
						vnode_rele_internal(uvp, fp->fp_glob->fg_flag & O_EVTONLY, 0, 0);
						vnode_put(uvp);
					}
				} else {
					/* could not get a ref, can't replace in fd */
					vnode_put(uvp);
				}
			}
		}
	}
#endif /* CONFIG_UNION_MOUNTS */

	(void)vnode_put(vp);

	if (error) {
		goto out;
	}
	fp->fp_glob->fg_offset = uio_offset(auio); /* should be multiple of dirent, not variable */

	if ((error = copyout((caddr_t) &count, uap->count, sizeof(count)))) {
		goto out;
	}
	if ((error = copyout((caddr_t) &newstate, uap->newstate, sizeof(newstate)))) {
		goto out;
	}
	if ((error = copyout((caddr_t) &loff, uap->basep, sizeof(loff)))) {
		goto out;
	}

	*retval = eofflag; /* similar to getdirentries */
	error = 0;
out:
	vn_offset_unlock(fp->fp_glob);
	file_drop(fd);
	return error; /* return error earlier, a retval of 0 or 1 now */
} /* end of getdirentriesattr system call */
11416
11417 /*
11418 * Exchange data between two files
11419 */
11420
11421 /* ARGSUSED */
/*
 * exchangedata(2): atomically swap the data of two regular files on the
 * same volume via VNOP_EXCHANGE, then swap their cached names/parents in
 * the name cache and emit fsevents/kauth notifications.
 */
int
exchangedata(__unused proc_t p, struct exchangedata_args *uap, __unused int32_t *retval)
{
	struct nameidata fnd, snd;
	vfs_context_t ctx = vfs_context_current();
	vnode_t fvp;
	vnode_t svp;
	int error;
	u_int32_t nameiflags;
	char *fpath = NULL;
	char *spath = NULL;
	int flen = 0, slen = 0;
	int from_truncated = 0, to_truncated = 0;
#if CONFIG_FSE
	fse_info f_finfo, s_finfo;
#endif

	nameiflags = 0;
	if ((uap->options & FSOPT_NOFOLLOW) == 0) {
		nameiflags |= FOLLOW;
	}

	/* Look up the first path. */
	NDINIT(&fnd, LOOKUP, OP_EXCHANGEDATA, nameiflags | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path1, ctx);

	error = namei(&fnd);
	if (error) {
		goto out2;
	}

	nameidone(&fnd);
	fvp = fnd.ni_vp;

	/* Look up the second path. */
	NDINIT(&snd, LOOKUP, OP_EXCHANGEDATA, CN_NBMOUNTLOOK | nameiflags | AUDITVNPATH2,
	    UIO_USERSPACE, uap->path2, ctx);

	error = namei(&snd);
	if (error) {
		vnode_put(fvp);
		goto out2;
	}
	nameidone(&snd);
	svp = snd.ni_vp;

	/*
	 * if the files are the same, return an inval error
	 */
	if (svp == fvp) {
		error = EINVAL;
		goto out;
	}

	/*
	 * if the files are on different volumes, return an error
	 */
	if (svp->v_mount != fvp->v_mount) {
		error = EXDEV;
		goto out;
	}

	/* If they're not files, return an error */
	if ((vnode_isreg(fvp) == 0) || (vnode_isreg(svp) == 0)) {
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_exchangedata(ctx,
	    fvp, svp);
	if (error) {
		goto out;
	}
#endif
	/* Caller needs read and write access to both files. */
	if (((error = vnode_authorize(fvp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0) ||
	    ((error = vnode_authorize(svp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0)) {
		goto out;
	}

	/*
	 * Only compute the (possibly expensive) paths if somebody is
	 * listening: fseventsd or a kauth fileop listener.
	 */
	if (
#if CONFIG_FSE
		need_fsevent(FSE_EXCHANGE, fvp) ||
#endif
		kauth_authorize_fileop_has_listeners()) {
		GET_PATH(fpath);
		GET_PATH(spath);

		flen = safe_getpath(fvp, NULL, fpath, MAXPATHLEN, &from_truncated);
		slen = safe_getpath(svp, NULL, spath, MAXPATHLEN, &to_truncated);

#if CONFIG_FSE
		get_fse_info(fvp, &f_finfo, ctx);
		get_fse_info(svp, &s_finfo, ctx);
		if (from_truncated || to_truncated) {
			// set it here since only the f_finfo gets reported up to user space
			f_finfo.mode |= FSE_TRUNCATED_PATH;
		}
#endif
	}
	/* Ok, make the call */
	error = VNOP_EXCHANGE(fvp, svp, 0, ctx);

	if (error == 0) {
		const char *tmpname;

		if (fpath != NULL && spath != NULL) {
			/* call out to allow 3rd party notification of exchangedata.
			 * Ignore result of kauth_authorize_fileop call.
			 */
			kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_EXCHANGE,
			    (uintptr_t)fpath, (uintptr_t)spath);
		}

		/*
		 * The data moved but the names did not: swap the cached
		 * v_name/v_parent so the name cache stays consistent.
		 */
		name_cache_lock();

		tmpname = fvp->v_name;
		fvp->v_name = svp->v_name;
		svp->v_name = tmpname;

		if (fvp->v_parent != svp->v_parent) {
			vnode_t tmp;

			tmp = fvp->v_parent;
			fvp->v_parent = svp->v_parent;
			svp->v_parent = tmp;
		}
		name_cache_unlock();

#if CONFIG_FSE
		if (fpath != NULL && spath != NULL) {
			add_fsevent(FSE_EXCHANGE, ctx,
			    FSE_ARG_STRING, flen, fpath,
			    FSE_ARG_FINFO, &f_finfo,
			    FSE_ARG_STRING, slen, spath,
			    FSE_ARG_FINFO, &s_finfo,
			    FSE_ARG_DONE);
		}
#endif
	}

out:
	if (fpath != NULL) {
		RELEASE_PATH(fpath);
	}
	if (spath != NULL) {
		RELEASE_PATH(spath);
	}
	vnode_put(svp);
	vnode_put(fvp);
out2:
	return error;
}
11572
11573 /*
11574 * Return (in MB) the amount of freespace on the given vnode's volume.
11575 */
11576 uint32_t freespace_mb(vnode_t vp);
11577
11578 uint32_t
freespace_mb(vnode_t vp)11579 freespace_mb(vnode_t vp)
11580 {
11581 vfs_update_vfsstat(vp->v_mount, vfs_context_current(), VFS_USER_EVENT);
11582 return (uint32_t)(((uint64_t)vp->v_mount->mnt_vfsstat.f_bavail *
11583 vp->v_mount->mnt_vfsstat.f_bsize) >> 20);
11584 }
11585
11586 #if CONFIG_SEARCHFS
11587
11588 /* ARGSUSED */
11589
/*
 * searchfs(2): fast catalog search.  Copies in the caller's fssearchblock
 * (munging the 32-bit layout if needed), validates the search parameters,
 * switches to the volume root, and dispatches to VNOP_SEARCHFS.  Matches
 * are written directly into the user's return buffer; the match count and
 * opaque search state are copied out afterward.
 */
int
searchfs(proc_t p, struct searchfs_args *uap, __unused int32_t *retval)
{
	vnode_t vp, tvp;
	int i, error = 0;
	int fserror = 0;
	struct nameidata nd;
	struct user64_fssearchblock searchblock;
	struct searchstate *state;
	struct attrlist *returnattrs;
	struct timeval timelimit;
	void *searchparams1, *searchparams2;
	uio_t auio = NULL;
	int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	uint32_t nummatches;
	size_t mallocsize;
	uint32_t nameiflags;
	vfs_context_t ctx = vfs_context_current();
	UIO_STACKBUF(uio_buf, 1);

	/* Start by copying in fsearchblock parameter list */
	if (IS_64BIT_PROCESS(p)) {
		error = copyin(uap->searchblock, (caddr_t) &searchblock, sizeof(searchblock));
		timelimit.tv_sec = searchblock.timelimit.tv_sec;
		timelimit.tv_usec = searchblock.timelimit.tv_usec;
	} else {
		struct user32_fssearchblock tmp_searchblock;

		error = copyin(uap->searchblock, (caddr_t) &tmp_searchblock, sizeof(tmp_searchblock));
		// munge into 64-bit version
		searchblock.returnattrs = CAST_USER_ADDR_T(tmp_searchblock.returnattrs);
		searchblock.returnbuffer = CAST_USER_ADDR_T(tmp_searchblock.returnbuffer);
		searchblock.returnbuffersize = tmp_searchblock.returnbuffersize;
		searchblock.maxmatches = tmp_searchblock.maxmatches;
		/*
		 * These casts are safe. We will promote the tv_sec into a 64 bit long if necessary
		 * from a 32 bit long, and tv_usec is already a signed 32 bit int.
		 */
		timelimit.tv_sec = (__darwin_time_t) tmp_searchblock.timelimit.tv_sec;
		timelimit.tv_usec = (__darwin_useconds_t) tmp_searchblock.timelimit.tv_usec;
		searchblock.searchparams1 = CAST_USER_ADDR_T(tmp_searchblock.searchparams1);
		searchblock.sizeofsearchparams1 = tmp_searchblock.sizeofsearchparams1;
		searchblock.searchparams2 = CAST_USER_ADDR_T(tmp_searchblock.searchparams2);
		searchblock.sizeofsearchparams2 = tmp_searchblock.sizeofsearchparams2;
		searchblock.searchattrs = tmp_searchblock.searchattrs;
	}
	if (error) {
		return error;
	}

	/* Do a sanity check on sizeofsearchparams1 and sizeofsearchparams2.
	 */
	if (searchblock.sizeofsearchparams1 > SEARCHFS_MAX_SEARCHPARMS ||
	    searchblock.sizeofsearchparams2 > SEARCHFS_MAX_SEARCHPARMS) {
		return EINVAL;
	}

	/* Now malloc a big bunch of space to hold the search parameters, the attrlists and the search state. */
	/* It all has to go into local memory and it's not that big so we might as well put it all together. */
	/* Searchparams1 shall be first so we might as well use that to hold the base address of the allocated*/
	/* block. */
	/*                                                                                                    */
	/* NOTE: we allocate an extra 8 bytes to account for the difference in size of the searchstate        */
	/*       due to the changes in rdar://problem/12438273.  That way if a 3rd party file system          */
	/*       assumes the size is still 556 bytes it will continue to work                                 */

	mallocsize = searchblock.sizeofsearchparams1 + searchblock.sizeofsearchparams2 +
	    sizeof(struct attrlist) + sizeof(struct searchstate) + (2 * sizeof(uint32_t));

	searchparams1 = kalloc_data(mallocsize, Z_WAITOK);

	/* Now set up the various pointers to the correct place in our newly allocated memory */

	searchparams2 = (void *) (((caddr_t) searchparams1) + searchblock.sizeofsearchparams1);
	returnattrs = (struct attrlist *) (((caddr_t) searchparams2) + searchblock.sizeofsearchparams2);
	state = (struct searchstate *) (((caddr_t) returnattrs) + sizeof(struct attrlist));

	/* Now copy in the stuff given our local variables. */

	if ((error = copyin(searchblock.searchparams1, searchparams1, searchblock.sizeofsearchparams1))) {
		goto freeandexit;
	}

	if ((error = copyin(searchblock.searchparams2, searchparams2, searchblock.sizeofsearchparams2))) {
		goto freeandexit;
	}

	if ((error = copyin(searchblock.returnattrs, (caddr_t) returnattrs, sizeof(struct attrlist)))) {
		goto freeandexit;
	}

	if ((error = copyin(uap->state, (caddr_t) state, sizeof(struct searchstate)))) {
		goto freeandexit;
	}

	/*
	 * When searching a union mount, need to set the
	 * start flag at the first call on each layer to
	 * reset state for the new volume.
	 */
	if (uap->options & SRCHFS_START) {
		state->ss_union_layer = 0;
	} else {
		uap->options |= state->ss_union_flags;
	}
	state->ss_union_flags = 0;

	/*
	 * Because searchparams1 and searchparams2 may contain an ATTR_CMN_NAME search parameter,
	 * which is passed in with an attrreference_t, we need to inspect the buffer manually here.
	 * The KPI does not provide us the ability to pass in the length of the buffers searchparams1
	 * and searchparams2. To obviate the need for all searchfs-supporting filesystems to
	 * validate the user-supplied data offset of the attrreference_t, we'll do it here.
	 */

	if (searchblock.searchattrs.commonattr & ATTR_CMN_NAME) {
		attrreference_t* string_ref;
		u_int32_t* start_length;
		user64_size_t param_length;

		/* validate searchparams1 */
		param_length = searchblock.sizeofsearchparams1;
		/* skip the word that specifies length of the buffer */
		start_length = (u_int32_t*) searchparams1;
		start_length = start_length + 1;
		string_ref = (attrreference_t*) start_length;

		/* ensure no negative offsets or too big offsets */
		if (string_ref->attr_dataoffset < 0) {
			error = EINVAL;
			goto freeandexit;
		}
		if (string_ref->attr_length > MAXPATHLEN) {
			error = EINVAL;
			goto freeandexit;
		}

		/* Check for pointer overflow in the string ref */
		if (((char*) string_ref + string_ref->attr_dataoffset) < (char*) string_ref) {
			error = EINVAL;
			goto freeandexit;
		}

		/* The referenced string must lie entirely within searchparams1. */
		if (((char*) string_ref + string_ref->attr_dataoffset) > ((char*)searchparams1 + param_length)) {
			error = EINVAL;
			goto freeandexit;
		}
		if (((char*)string_ref + string_ref->attr_dataoffset + string_ref->attr_length) > ((char*)searchparams1 + param_length)) {
			error = EINVAL;
			goto freeandexit;
		}
	}

	/* set up the uio structure which will contain the users return buffer */
	auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, searchblock.returnbuffer, searchblock.returnbuffersize);

	nameiflags = 0;
	if ((uap->options & FSOPT_NOFOLLOW) == 0) {
		nameiflags |= FOLLOW;
	}
	NDINIT(&nd, LOOKUP, OP_SEARCHFS, nameiflags | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);

	error = namei(&nd);
	if (error) {
		goto freeandexit;
	}
	vp = nd.ni_vp;
	nameidone(&nd);

	/*
	 * Switch to the root vnode for the volume
	 */
	error = VFS_ROOT(vnode_mount(vp), &tvp, ctx);
	vnode_put(vp);
	if (error) {
		goto freeandexit;
	}
	vp = tvp;

#if CONFIG_UNION_MOUNTS
	/*
	 * If it's a union mount, the path lookup takes
	 * us to the top layer. But we may need to descend
	 * to a lower layer. For non-union mounts the layer
	 * is always zero.
	 */
	for (i = 0; i < (int) state->ss_union_layer; i++) {
		if ((vp->v_mount->mnt_flag & MNT_UNION) == 0) {
			break;
		}
		tvp = vp;
		vp = vp->v_mount->mnt_vnodecovered;
		if (vp == NULL) {
			vnode_put(tvp);
			error = ENOENT;
			goto freeandexit;
		}
		error = vnode_getwithref(vp);
		vnode_put(tvp);
		if (error) {
			goto freeandexit;
		}
	}
#endif /* CONFIG_UNION_MOUNTS */

#if CONFIG_MACF
	error = mac_vnode_check_searchfs(ctx, vp, returnattrs, &searchblock.searchattrs);
	if (error) {
		vnode_put(vp);
		goto freeandexit;
	}
#endif


	/*
	 * If searchblock.maxmatches == 0, then skip the search. This has happened
	 * before and sometimes the underlying code doesn't deal with it well.
	 */
	if (searchblock.maxmatches == 0) {
		nummatches = 0;
		goto saveandexit;
	}

	/*
	 * Alright, we have everything we need, so lets make that call.
	 *
	 * We keep special track of the return value from the file system:
	 * EAGAIN is an acceptable error condition that shouldn't keep us
	 * from copying out any results...
	 */

	fserror = VNOP_SEARCHFS(vp,
	    searchparams1,
	    searchparams2,
	    &searchblock.searchattrs,
	    (uint32_t)searchblock.maxmatches,
	    &timelimit,
	    returnattrs,
	    &nummatches,
	    (uint32_t)uap->scriptcode,
	    (uint32_t)uap->options,
	    auio,
	    (struct searchstate *) &state->ss_fsstate,
	    ctx);

#if CONFIG_UNION_MOUNTS
	/*
	 * If it's a union mount we need to be called again
	 * to search the mounted-on filesystem.
	 */
	if ((vp->v_mount->mnt_flag & MNT_UNION) && fserror == 0) {
		state->ss_union_flags = SRCHFS_START;
		state->ss_union_layer++;        // search next layer down
		fserror = EAGAIN;
	}
#endif /* CONFIG_UNION_MOUNTS */

saveandexit:

	vnode_put(vp);

	/* Now copy out the stuff that needs copying out. That means the number of matches, the
	 * search state. Everything was already put into the return buffer by the vop call. */

	if ((error = copyout((caddr_t) state, uap->state, sizeof(struct searchstate))) != 0) {
		goto freeandexit;
	}

	if ((error = suulong(uap->nummatches, (uint64_t)nummatches)) != 0) {
		goto freeandexit;
	}

	error = fserror;

freeandexit:

	kfree_data(searchparams1, mallocsize);

	return error;
} /* end of searchfs system call */
11872
11873 #else /* CONFIG_SEARCHFS */
11874
/* searchfs(2) stub: the call is unsupported when CONFIG_SEARCHFS is off. */
int
searchfs(__unused proc_t p, __unused struct searchfs_args *uap, __unused int32_t *retval)
{
	return ENOTSUP;
}
11880
11881 #endif /* CONFIG_SEARCHFS */
11882
11883
11884 #if CONFIG_DATALESS_FILES
11885
11886 /*
11887 * === Namespace Resolver Up-call Mechanism ===
11888 *
11889 * When I/O is performed to a dataless file or directory (read, write,
11890 * lookup-in, etc.), the file system performs an upcall to the namespace
11891 * resolver (filecoordinationd) to materialize the object.
11892 *
11893 * We need multiple up-calls to be in flight at once, and we need these
11894 * up-calls to be interruptible, thus the following implementation:
11895 *
11896 * => The nspace_resolver_request represents the in-kernel request state.
11897 * It contains a request ID, storage space for the errno code returned
11898 * by filecoordinationd, and flags.
11899 *
11900 * => The request ID is simply a global monotonically incrementing 32-bit
11901 * number. Outstanding requests are stored in a hash table, and the
11902 * hash function is extremely simple.
11903 *
11904 * => When an upcall is to be made to filecoordinationd, a request structure
11905 * is allocated on the stack (it is small, and needs to live only during
11906 * the duration of the call to resolve_nspace_item_ext()). It is
11907 * initialized and inserted into the table. Some backpressure from
 *    filecoordinationd is applied by limiting the number of entries that
11909 * can be inserted into the table (and thus limiting the number of
11910 * outstanding requests issued to filecoordinationd); waiting for an
11911 * available slot is interruptible.
11912 *
11913 * => Once the request has been inserted into the table, the up-call is made
11914 * to filecoordinationd via a MiG-generated stub. The up-call returns
11915 * immediately and filecoordinationd processes the request asynchronously.
11916 *
 * => The caller now waits for the request to complete.  This is achieved by
11918 * sleeping on the address of the request structure and waiting for
11919 * filecoordinationd to mark the request structure as complete. This
11920 * is an interruptible sleep call; if interrupted, the request structure
11921 * is removed from the table and EINTR is returned to the caller. If
11922 * this occurs, an advisory up-call is made to filecoordinationd with
11923 * the request ID to indicate that the request can be aborted or
11924 * de-prioritized at the discretion of filecoordinationd.
11925 *
11926 * => When filecoordinationd has completed the request, it signals completion
11927 * by writing to the vfs.nspace.complete sysctl node. Only a process
11928 * decorated as a namespace resolver can write to this sysctl node. The
11929 * value is a request ID / errno tuple passed as an array of 2 uint32_t's.
11930 * The request ID is looked up in the table, and if the request is found,
11931 * the error code is stored in the request structure and a wakeup()
11932 * issued on the address of the request structure. If the request is not
11933 * found, we simply drop the completion notification, assuming that the
11934 * caller was interrupted.
11935 *
11936 * => When the waiting thread wakes up, it extracts the error code from the
11937 * request structure, removes the request from the table, and returns the
11938 * error code to the calling function. Fini!
11939 */
11940
/*
 * In-kernel state for one outstanding materialization request to the
 * namespace resolver (see the design comment block above).
 */
struct nspace_resolver_request {
	LIST_ENTRY(nspace_resolver_request) r_hashlink; /* request-table bucket linkage */
	vnode_t r_vp;                 /* vnode the request is for */
	vnode_t r_tdvp;               /* secondary vnode — presumably a target dir; TODO confirm */
	uint32_t r_req_id;            /* ID from next_nspace_req_id() */
	int r_resolver_error;         /* errno reported back by filecoordinationd */
	int r_flags;                  /* RRF_* flags (below) */
};
11949
11950 #define RRF_COMPLETE 0x0001
11951 #define RRF_COMPLETING 0x0002
11952
/*
 * Completion record delivered by the resolver (via the vfs.nspace
 * completion sysctl) to finish an outstanding request.
 */
struct nspace_resolver_completion_data {
	uint32_t req_id;              /* ID of the request being completed */
	int32_t resolver_error;       /* errno result; 0 on success */
	uint64_t orig_gencount;       /* NOTE(review): presumably the object's gencount when the request was issued — confirm */
	uint64_t orig_syncroot;       /* NOTE(review): presumably the sync-root id when the request was issued — confirm */
};
11959
11960 static uint32_t
next_nspace_req_id(void)11961 next_nspace_req_id(void)
11962 {
11963 static uint32_t next_req_id;
11964
11965 return OSAddAtomic(1, &next_req_id);
11966 }
11967
/* Sizing knobs for the outstanding-request table. */
#define NSPACE_RESOLVER_REQ_HASHSIZE    32      /* XXX tune */
#define NSPACE_RESOLVER_MAX_OUTSTANDING 256     /* XXX tune */

/* Hash table of outstanding requests, keyed by request ID. */
static LIST_HEAD(nspace_resolver_requesthead,
    nspace_resolver_request) * nspace_resolver_request_hashtbl;
static u_long nspace_resolver_request_hashmask;
/* Number of requests currently in the table (throttled in req_add()). */
static u_int nspace_resolver_request_count;
/* True when a thread is sleeping in req_add() waiting for a free slot. */
static bool nspace_resolver_request_wait_slot;
static LCK_GRP_DECLARE(nspace_resolver_request_lck_grp, "file namespace resolver");
static LCK_MTX_DECLARE(nspace_resolver_request_hash_mutex,
    &nspace_resolver_request_lck_grp);

/* Mutex protecting the table, the count, and per-request r_flags. */
#define NSPACE_REQ_LOCK() \
	lck_mtx_lock(&nspace_resolver_request_hash_mutex)
#define NSPACE_REQ_UNLOCK() \
	lck_mtx_unlock(&nspace_resolver_request_hash_mutex)

/* Map a request ID to its hash bucket (hashmask is a power-of-2 minus 1). */
#define NSPACE_RESOLVER_HASH(req_id) \
	(&nspace_resolver_request_hashtbl[(req_id) & \
	nspace_resolver_request_hashmask])
11988
11989 static struct nspace_resolver_request *
nspace_resolver_req_lookup(uint32_t req_id,bool skip_completing)11990 nspace_resolver_req_lookup(uint32_t req_id, bool skip_completing)
11991 {
11992 struct nspace_resolver_requesthead *bucket;
11993 struct nspace_resolver_request *req;
11994
11995 bucket = NSPACE_RESOLVER_HASH(req_id);
11996 LIST_FOREACH(req, bucket, r_hashlink) {
11997 if (req->r_req_id == req_id) {
11998 /*
11999 * If this request already has a completion
12000 * pending, don't return it again.
12001 */
12002 if ((req->r_flags & RRF_COMPLETING) != 0 &&
12003 skip_completing) {
12004 req = NULL;
12005 }
12006 return req;
12007 }
12008 }
12009
12010 return NULL;
12011 }
12012
/*
 * Insert 'req' into the outstanding-request table, throttling the
 * total number of in-flight requests.  Returns 0 on success, or the
 * msleep() error (e.g. EINTR) if the wait for a slot was interrupted.
 */
static int
nspace_resolver_req_add(struct nspace_resolver_request *req)
{
	struct nspace_resolver_requesthead *bucket;
	int error;

	NSPACE_REQ_LOCK();

	/*
	 * At most NSPACE_RESOLVER_MAX_OUTSTANDING requests may be
	 * pending.  Sleep (interruptibly -- PCATCH) until the remove
	 * path frees a slot and wakes us.
	 */
	while (nspace_resolver_request_count >=
	    NSPACE_RESOLVER_MAX_OUTSTANDING) {
		nspace_resolver_request_wait_slot = true;
		error = msleep(&nspace_resolver_request_count,
		    &nspace_resolver_request_hash_mutex,
		    PVFS | PCATCH, "nspacerq", NULL);
		if (error) {
			/* Interrupted by a signal; give up. */
			NSPACE_REQ_UNLOCK();
			return error;
		}
	}

	bucket = NSPACE_RESOLVER_HASH(req->r_req_id);
#if DIAGNOSTIC
	/* Request IDs must be unique within the table. */
	assert(nspace_resolver_req_lookup(req->r_req_id, false) == NULL);
#endif /* DIAGNOSTIC */
	LIST_INSERT_HEAD(bucket, req, r_hashlink);
	nspace_resolver_request_count++;

	NSPACE_REQ_UNLOCK();

	return 0;
}
12044
/*
 * Wait until any in-flight completion handler is done with 'req'.
 * Called with the NSPACE_REQ_LOCK held; msleep() drops and re-acquires
 * it around each sleep, and we return with it held.
 */
static void
nspace_resolver_req_wait_pending_completion(struct nspace_resolver_request *req)
{
	/*
	 * If a completion is in-progress, we have to wait for the
	 * completion handler to finish because it's still using 'req',
	 * which is allocated on our stack a couple of frames up.
	 */
	while ((req->r_flags & RRF_COMPLETING) != 0) {
		/* Woken by nspace_resolver_req_mark_complete(). */
		(void) msleep(req, &nspace_resolver_request_hash_mutex,
		    PVFS, "nspacecmplt", NULL);
	}
}
12058
12059 static void
nspace_resolver_req_remove_and_unlock(struct nspace_resolver_request * req)12060 nspace_resolver_req_remove_and_unlock(struct nspace_resolver_request *req)
12061 {
12062 struct nspace_resolver_requesthead *bucket;
12063
12064 /* We're called with NSPACE_REQ_LOCK held. */
12065
12066 bucket = NSPACE_RESOLVER_HASH(req->r_req_id);
12067 #if DIAGNOSTIC
12068 assert((req->r_flags & RRF_COMPLETING) == 0);
12069 assert(nspace_resolver_req_lookup(req->r_req_id, false) != NULL);
12070 #endif /* DIAGNOSTIC */
12071 LIST_REMOVE(req, r_hashlink);
12072 nspace_resolver_request_count--;
12073
12074 if (nspace_resolver_request_wait_slot) {
12075 nspace_resolver_request_wait_slot = false;
12076 wakeup(&nspace_resolver_request_count);
12077 }
12078
12079 nspace_resolver_req_wait_pending_completion(req);
12080
12081 NSPACE_REQ_UNLOCK();
12082 }
12083
/*
 * Convenience wrapper: take the table lock and remove 'req'; the
 * helper releases the lock before returning.
 */
static void
nspace_resolver_req_remove(struct nspace_resolver_request *req)
{
	NSPACE_REQ_LOCK();
	nspace_resolver_req_remove_and_unlock(req);
}
12090
12091 static void
nspace_resolver_req_cancel(uint32_t req_id)12092 nspace_resolver_req_cancel(uint32_t req_id)
12093 {
12094 kern_return_t kr;
12095 mach_port_t mp;
12096
12097 // Failures here aren't fatal -- the cancellation message
12098 // sent to the resolver is merely advisory.
12099
12100 kr = host_get_filecoordinationd_port(host_priv_self(), &mp);
12101 if (kr != KERN_SUCCESS || !IPC_PORT_VALID(mp)) {
12102 return;
12103 }
12104
12105 kr = send_nspace_resolve_cancel(mp, req_id);
12106 if (kr != KERN_SUCCESS) {
12107 os_log_error(OS_LOG_DEFAULT,
12108 "NSPACE send_nspace_resolve_cancel failure: %d", kr);
12109 }
12110
12111 ipc_port_release_send(mp);
12112 }
12113
/*
 * Sleep until the resolver completes 'req' (RRF_COMPLETE set by the
 * completion path, which also wakeup()s on 'req').  On interruption,
 * records EINTR/ETIMEDOUT as the result and sends an advisory cancel.
 * Always removes 'req' from the table before returning.  Returns the
 * resolver's error code (0 == caller may proceed).
 */
static int
nspace_resolver_req_wait(struct nspace_resolver_request *req)
{
	bool send_cancel_message = false;
	int error;

	NSPACE_REQ_LOCK();

	while ((req->r_flags & RRF_COMPLETE) == 0) {
		/* PCATCH: interruptible by signals; ERESTART re-sleeps. */
		error = msleep(req, &nspace_resolver_request_hash_mutex,
		    PVFS | PCATCH, "nspace", NULL);
		if (error && error != ERESTART) {
			/*
			 * Interrupted: EINTR maps through as-is, any
			 * other sleep failure is reported as ETIMEDOUT.
			 */
			req->r_resolver_error = (error == EINTR) ? EINTR :
			    ETIMEDOUT;
			send_cancel_message = true;
			break;
		}
	}

	nspace_resolver_req_remove_and_unlock(req);

	/*
	 * It's safe to continue referencing 'req' here because it's
	 * allocated on our caller's stack.
	 */

	if (send_cancel_message) {
		/* Advise the resolver it may abandon this request. */
		nspace_resolver_req_cancel(req->r_req_id);
	}

	return req->r_resolver_error;
}
12146
12147 static void
nspace_resolver_req_mark_complete(struct nspace_resolver_request * req,int resolver_error)12148 nspace_resolver_req_mark_complete(
12149 struct nspace_resolver_request *req,
12150 int resolver_error)
12151 {
12152 req->r_resolver_error = resolver_error;
12153 req->r_flags = (req->r_flags & ~RRF_COMPLETING) | RRF_COMPLETE;
12154 wakeup(req);
12155 }
12156
/*
 * Flag 'req' as having a completion in flight so nobody else touches
 * it while the completion handler runs without the NSPACE_REQ_LOCK.
 * Caller holds the NSPACE_REQ_LOCK.
 */
static void
nspace_resolver_req_mark_completion_pending(struct nspace_resolver_request *req)
{
	req->r_flags |= RRF_COMPLETING;
}
12162
/*
 * Handle a completion reported by filecoordinationd (via the
 * vfs.nspace.complete sysctl).  Looks up the outstanding request,
 * optionally verifies the resolver's namespace-shape criteria
 * (gencount / sync-root) under the mount rename lock, then marks the
 * request complete and wakes its waiter.
 */
static void
nspace_resolver_req_completed(const struct nspace_resolver_completion_data *c)
{
	struct nspace_resolver_request *req;
	int error;
	struct vnode_attr va;
	vnode_t vp;

	NSPACE_REQ_LOCK();

	/* Skip requests that already have a completion in flight. */
	req = nspace_resolver_req_lookup(c->req_id, true);
	if (req == NULL) {
		/*
		 * If we don't find the request corresponding to our req_id,
		 * just drop the completion on the floor; it's likely that
		 * the requester interrupted with a signal, or it may already
		 * be completing.
		 */
		NSPACE_REQ_UNLOCK();
		return;
	}

	/*
	 * Get out now if the resolver reported an error.
	 */
	if ((error = c->resolver_error) != 0) {
		goto out;
	}

	/*
	 * If the resolver did not specify any namespace shape criteria
	 * for letting the operation proceed, then get out now.
	 */
	if (c->orig_gencount == 0 && c->orig_syncroot == 0) {
		goto out;
	}

	/*
	 * We're going to have to acquire the mount rename lock and do
	 * some I/O in order to verify the criteria.  Mark the request
	 * as pending so no one else messes with it after we drop the
	 * NSPACE_REQ_LOCK.
	 */
	nspace_resolver_req_mark_completion_pending(req);
	NSPACE_REQ_UNLOCK();

	/*
	 * Lock out renames from changing the shape of the tree while
	 * we validate the criteria.
	 */
	mount_t locked_mp = req->r_vp->v_mount;
	mount_ref(locked_mp, 0);
	mount_lock_renames(locked_mp);

	if (c->orig_gencount != 0) {
		vp = req->r_vp;
		/*
		 * NOTE(review): 'error' is always 0 when we reach this
		 * guard (checked above); it appears vestigial -- confirm
		 * before removing.
		 */
		if (error) {
			goto out_dropmount;
		}

		VATTR_INIT(&va);
		VATTR_WANTED(&va, va_recursive_gencount);
		error = vnode_getattr(vp, &va, vfs_context_kernel());
		if (error) {
			goto out_dropmount;
		}
		/* Fail with EBUSY if the directory shape has changed. */
		if (VATTR_NOT_RETURNED(&va, va_recursive_gencount) ||
		    va.va_recursive_gencount != c->orig_gencount) {
			printf("nspace.complete: gencount changed! (orig %llu cur %llu)\n",
			    c->orig_gencount, va.va_recursive_gencount);
			error = EBUSY;
			goto out_dropmount;
		}
	}

	/*
	 * Ignore orig_syncroot if a destination directory wasn't specified
	 * in the request.
	 */
	if (c->orig_syncroot != 0 && (vp = req->r_tdvp) != NULL) {
		uint64_t syncroot_id;

		/*
		 * NOTE(review): as above, 'error' is 0 on every path that
		 * reaches this guard -- confirm before removing.
		 */
		if (error) {
			goto out_dropmount;
		}

#ifndef APFSIOC_GET_SYNC_ROOT
#define APFSIOC_GET_SYNC_ROOT _IOR('J', 115, uint64_t)
#endif

		error = VNOP_IOCTL(vp, APFSIOC_GET_SYNC_ROOT,
		    (caddr_t)&syncroot_id, 0, vfs_context_kernel());
		if (error) {
			goto out_dropmount;
		}
		if (syncroot_id != c->orig_syncroot) {
			printf("nspace.complete: syncroot changed! (orig %llu cur %llu)\n",
			    c->orig_syncroot, syncroot_id);
			error = EBUSY;
			goto out_dropmount;
		}
	}

out_dropmount:
	mount_unlock_renames(locked_mp);
	mount_drop(locked_mp, 0);
	NSPACE_REQ_LOCK();

out:
	/* Record the verdict and wake the waiting requester. */
	nspace_resolver_req_mark_complete(req, error);
	NSPACE_REQ_UNLOCK();
}
12275
12276 static struct proc *nspace_resolver_proc;
12277
12278 static int
nspace_resolver_get_proc_state(struct proc * p,int * is_resolver)12279 nspace_resolver_get_proc_state(struct proc *p, int *is_resolver)
12280 {
12281 *is_resolver = ((p->p_lflag & P_LNSPACE_RESOLVER) &&
12282 p == nspace_resolver_proc) ? 1 : 0;
12283 return 0;
12284 }
12285
12286 static boolean_t vfs_context_is_dataless_resolver(vfs_context_t);
12287
/*
 * Register (is_resolver != 0) or unregister 'p' as the namespace
 * resolver.  Requires root plus the dataless-resolver entitlement.
 * Returns 0, EPERM, or EBUSY (another resolver is registered).
 */
static int
nspace_resolver_set_proc_state(struct proc *p, int is_resolver)
{
	vfs_context_t ctx = vfs_context_current();
	int error = 0;

	//
	// The system filecoordinationd runs as uid == 0.  This also
	// has the nice side-effect of filtering out filecoordinationd
	// running in the simulator.
	//
	if (!vfs_context_issuser(ctx) ||
	    !vfs_context_is_dataless_resolver(ctx)) {
		return EPERM;
	}

	if (is_resolver) {
		NSPACE_REQ_LOCK();

		// Only one resolver process may be registered at a time.
		if (nspace_resolver_proc == NULL) {
			proc_lock(p);
			p->p_lflag |= P_LNSPACE_RESOLVER;
			proc_unlock(p);
			nspace_resolver_proc = p;
		} else {
			error = EBUSY;
		}

		NSPACE_REQ_UNLOCK();
	} else {
		// This is basically just like the exit case.
		// nspace_resolver_exited() will verify that the
		// process is the resolver, and will clear the
		// global.
		nspace_resolver_exited(p);
	}

	return error;
}
12327
12328 static int
nspace_materialization_get_proc_state(struct proc * p,int * is_prevented)12329 nspace_materialization_get_proc_state(struct proc *p, int *is_prevented)
12330 {
12331 if ((p->p_lflag & P_LNSPACE_RESOLVER) != 0 ||
12332 (p->p_vfs_iopolicy &
12333 P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES) == 0) {
12334 *is_prevented = 1;
12335 } else {
12336 *is_prevented = 0;
12337 }
12338 return 0;
12339 }
12340
12341 static int
nspace_materialization_set_proc_state(struct proc * p,int is_prevented)12342 nspace_materialization_set_proc_state(struct proc *p, int is_prevented)
12343 {
12344 if (p->p_lflag & P_LNSPACE_RESOLVER) {
12345 return is_prevented ? 0 : EBUSY;
12346 }
12347
12348 if (is_prevented) {
12349 OSBitAndAtomic16(~((uint16_t)P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES), &p->p_vfs_iopolicy);
12350 } else {
12351 OSBitOrAtomic16((uint16_t)P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES, &p->p_vfs_iopolicy);
12352 }
12353 return 0;
12354 }
12355
12356 static int
nspace_materialization_get_thread_state(int * is_prevented)12357 nspace_materialization_get_thread_state(int *is_prevented)
12358 {
12359 uthread_t ut = current_uthread();
12360
12361 *is_prevented = (ut->uu_flag & UT_NSPACE_NODATALESSFAULTS) ? 1 : 0;
12362 return 0;
12363 }
12364
12365 static int
nspace_materialization_set_thread_state(int is_prevented)12366 nspace_materialization_set_thread_state(int is_prevented)
12367 {
12368 uthread_t ut = current_uthread();
12369
12370 if (is_prevented) {
12371 ut->uu_flag |= UT_NSPACE_NODATALESSFAULTS;
12372 } else {
12373 ut->uu_flag &= ~UT_NSPACE_NODATALESSFAULTS;
12374 }
12375 return 0;
12376 }
12377
12378 /* the vfs.nspace branch */
12379 SYSCTL_NODE(_vfs, OID_AUTO, nspace, CTLFLAG_RW | CTLFLAG_LOCKED, NULL, "vfs nspace hinge");
12380
12381 static int
sysctl_nspace_resolver(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)12382 sysctl_nspace_resolver(__unused struct sysctl_oid *oidp,
12383 __unused void *arg1, __unused int arg2, struct sysctl_req *req)
12384 {
12385 struct proc *p = req->p;
12386 int new_value, old_value, changed = 0;
12387 int error;
12388
12389 error = nspace_resolver_get_proc_state(p, &old_value);
12390 if (error) {
12391 return error;
12392 }
12393
12394 error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
12395 &changed);
12396 if (error == 0 && changed) {
12397 error = nspace_resolver_set_proc_state(p, new_value);
12398 }
12399 return error;
12400 }
12401
12402 /* decorate this process as the dataless file resolver */
12403 SYSCTL_PROC(_vfs_nspace, OID_AUTO, resolver,
12404 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
12405 0, 0, sysctl_nspace_resolver, "I", "");
12406
12407 static int
sysctl_nspace_prevent_materialization(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)12408 sysctl_nspace_prevent_materialization(__unused struct sysctl_oid *oidp,
12409 __unused void *arg1, __unused int arg2, struct sysctl_req *req)
12410 {
12411 struct proc *p = req->p;
12412 int new_value, old_value, changed = 0;
12413 int error;
12414
12415 error = nspace_materialization_get_proc_state(p, &old_value);
12416 if (error) {
12417 return error;
12418 }
12419
12420 error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
12421 &changed);
12422 if (error == 0 && changed) {
12423 error = nspace_materialization_set_proc_state(p, new_value);
12424 }
12425 return error;
12426 }
12427
12428 /* decorate this process as not wanting to materialize dataless files */
12429 SYSCTL_PROC(_vfs_nspace, OID_AUTO, prevent_materialization,
12430 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
12431 0, 0, sysctl_nspace_prevent_materialization, "I", "");
12432
12433 static int
sysctl_nspace_thread_prevent_materialization(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)12434 sysctl_nspace_thread_prevent_materialization(__unused struct sysctl_oid *oidp,
12435 __unused void *arg1, __unused int arg2, struct sysctl_req *req)
12436 {
12437 int new_value, old_value, changed = 0;
12438 int error;
12439
12440 error = nspace_materialization_get_thread_state(&old_value);
12441 if (error) {
12442 return error;
12443 }
12444
12445 error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
12446 &changed);
12447 if (error == 0 && changed) {
12448 error = nspace_materialization_set_thread_state(new_value);
12449 }
12450 return error;
12451 }
12452
12453 /* decorate this thread as not wanting to materialize dataless files */
12454 SYSCTL_PROC(_vfs_nspace, OID_AUTO, thread_prevent_materialization,
12455 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
12456 0, 0, sysctl_nspace_thread_prevent_materialization, "I", "");
12457
/*
 * vfs.nspace.complete handler: the resolver reports a finished
 * request here.  The write is a req_id/errno pair of uint32_t's,
 * optionally followed by a uint64_t gencount and a uint64_t syncroot
 * criterion (both best-effort; absence is treated as zero).  Only the
 * decorated resolver process may write.
 */
static int
sysctl_nspace_complete(__unused struct sysctl_oid *oidp, __unused void *arg1,
    __unused int arg2, struct sysctl_req *req)
{
	struct proc *p = req->p;
	uint32_t req_status[2] = { 0, 0 };
	uint64_t gencount = 0;
	uint64_t syncroot = 0;
	int error, is_resolver, changed = 0, other_changed;

	error = nspace_resolver_get_proc_state(p, &is_resolver);
	if (error) {
		return error;
	}

	if (!is_resolver) {
		return EPERM;
	}

	/* Mandatory part: the request ID / errno tuple. */
	error = sysctl_io_opaque(req, req_status, sizeof(req_status),
	    &changed);
	if (error) {
		return error;
	}

	/*
	 * Get the gencount if it was passed.  Ignore errors, because
	 * it's optional.
	 */
	error = sysctl_io_opaque(req, &gencount, sizeof(gencount),
	    &other_changed);
	if (error) {
		gencount = 0;
		error = 0;
	}

	/*
	 * ...and now the syncroot ID.
	 */
	error = sysctl_io_opaque(req, &syncroot, sizeof(syncroot),
	    &other_changed);
	if (error) {
		syncroot = 0;
		error = 0;
	}

	/*
	 * req_status[0] is the req_id
	 *
	 * req_status[1] is the errno
	 */
	if (error == 0 && changed) {
		/* Hand the tuple to the completion machinery. */
		const struct nspace_resolver_completion_data cd = {
			.req_id = req_status[0],
			.resolver_error = req_status[1],
			.orig_gencount = gencount,
			.orig_syncroot = syncroot,
		};
		nspace_resolver_req_completed(&cd);
	}
	return error;
}
12520
12521 /* Resolver reports completed reqs here. */
12522 SYSCTL_PROC(_vfs_nspace, OID_AUTO, complete,
12523 CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
12524 0, 0, sysctl_nspace_complete, "-", "");
12525
12526 #endif /* CONFIG_DATALESS_FILES */
12527
12528 #if CONFIG_DATALESS_FILES
12529 #define __no_dataless_unused /* nothing */
12530 #else
12531 #define __no_dataless_unused __unused
12532 #endif
12533
/*
 * Core policy decision for dataless materialization.  Returns:
 *   0           materialization may proceed,
 *   EDEADLK     materialization is prevented,
 *   EJUSTRETURN the caller is entitled to manipulate dataless
 *               objects directly (see vfs_materialize_item()).
 *
 * 'is_original_materialization' selects which iopolicy bit governs.
 */
static int
vfs_context_dataless_materialization_is_prevented_internal(
	vfs_context_t const ctx __no_dataless_unused, bool is_original_materialization __no_dataless_unused)
{
#if CONFIG_DATALESS_FILES
	proc_t const p = vfs_context_proc(ctx);

	/*
	 * Kernel context ==> return EDEADLK, as we would with any random
	 * process decorated as no-materialize.
	 */
	if (ctx == vfs_context_kernel()) {
		return EDEADLK;
	}

	/*
	 * If the process has the dataless-manipulation entitlement,
	 * materialization is prevented, and depending on the kind
	 * of file system operation, things get to proceed as if the
	 * object is not dataless.
	 */
	if (vfs_context_is_dataless_manipulator(ctx)) {
		return EJUSTRETURN;
	}

	/*
	 * If the process's iopolicy specifies that dataless files
	 * can be materialized, then we let it go ahead.
	 */
	if (is_original_materialization) {
		return (p->p_vfs_iopolicy & P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES_ORIG) ? 0 : EDEADLK;
	} else {
		thread_t const t = vfs_context_thread(ctx);
		uthread_t const ut = t ? get_bsdthread_info(t) : NULL;

		/*
		 * Per-thread decorations override any process-wide decorations.
		 * (Foundation uses this, and this overrides even the dataless-
		 * manipulation entitlement so as to make API contracts consistent.)
		 */
		if (ut != NULL) {
			if (ut->uu_flag & UT_NSPACE_NODATALESSFAULTS) {
				return EDEADLK;
			}
			if (ut->uu_flag & UT_NSPACE_FORCEDATALESSFAULTS) {
				return 0;
			}
		}

		if (p->p_vfs_iopolicy & P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES) {
			return 0;
		}
	}

#endif /* CONFIG_DATALESS_FILES */

	/*
	 * The default behavior is to not materialize dataless files;
	 * return to the caller that deadlock was detected.
	 */
	return EDEADLK;
}
12596
/*
 * Policy check for the normal (non-"original") materialization path;
 * per-thread decorations and the standard iopolicy bit apply.
 * Returns 0, EDEADLK, or EJUSTRETURN (see the internal helper).
 */
int
vfs_context_dataless_materialization_is_prevented(
	vfs_context_t const ctx __no_dataless_unused)
{
	return vfs_context_dataless_materialization_is_prevented_internal(ctx, false);
}
12603
/*
 * Policy check for the "original" materialization path; governed by
 * P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES_ORIG.  Returns 0,
 * EDEADLK, or EJUSTRETURN (see the internal helper).
 */
int
vfs_context_orig_dataless_materialization_is_prevented(
	vfs_context_t const ctx __no_dataless_unused)
{
	return vfs_context_dataless_materialization_is_prevented_internal(ctx, true);
}
12610
/*
 * One-time initialization: allocate the hash table of outstanding
 * resolver requests (hashinit() also fills in the hash mask).
 */
void
nspace_resolver_init(void)
{
#if CONFIG_DATALESS_FILES
	nspace_resolver_request_hashtbl =
	    hashinit(NSPACE_RESOLVER_REQ_HASHSIZE,
	    M_VNODE /* XXX */, &nspace_resolver_request_hashmask);
#endif /* CONFIG_DATALESS_FILES */
}
12620
/*
 * Called when a process exits (or voluntarily unregisters).  If 'p'
 * is the registered resolver, fail every outstanding request with
 * ETIMEDOUT so the waiters can make progress, then clear the global
 * registration.
 */
void
nspace_resolver_exited(struct proc *p __no_dataless_unused)
{
#if CONFIG_DATALESS_FILES
	struct nspace_resolver_requesthead *bucket;
	struct nspace_resolver_request *req;
	u_long idx;

	NSPACE_REQ_LOCK();

	if ((p->p_lflag & P_LNSPACE_RESOLVER) &&
	    p == nspace_resolver_proc) {
		/* Walk every hash bucket and complete each request. */
		for (idx = 0; idx <= nspace_resolver_request_hashmask; idx++) {
			bucket = &nspace_resolver_request_hashtbl[idx];
			LIST_FOREACH(req, bucket, r_hashlink) {
				/*
				 * NOTE(review): this wait can drop the lock
				 * (msleep) mid-iteration; requests appear to
				 * be removed only by their waiters, who need
				 * this lock -- confirm list stability across
				 * the sleep.
				 */
				nspace_resolver_req_wait_pending_completion(req);
				nspace_resolver_req_mark_complete(req,
				    ETIMEDOUT);
			}
		}
		nspace_resolver_proc = NULL;
	}

	NSPACE_REQ_UNLOCK();
#endif /* CONFIG_DATALESS_FILES */
}
12647
12648 #define DATALESS_RESOLVER_ENTITLEMENT \
12649 "com.apple.private.vfs.dataless-resolver"
12650 #define DATALESS_MANIPULATION_ENTITLEMENT \
12651 "com.apple.private.vfs.dataless-manipulation"
12652
12653 #if CONFIG_DATALESS_FILES
12654 /*
12655 * Return TRUE if the vfs context is associated with the dataless
12656 * resolver.
12657 */
static boolean_t
vfs_context_is_dataless_resolver(vfs_context_t ctx __no_dataless_unused)
{
	/* The context's task must hold the dataless-resolver entitlement. */
	return IOTaskHasEntitlement(vfs_context_task(ctx),
	    DATALESS_RESOLVER_ENTITLEMENT);
}
12664 #endif /* CONFIG_DATALESS_FILES */
12665
12666 /*
12667 * Return TRUE if the vfs context is associated with a process entitled
12668 * for dataless manipulation.
12669 *
12670 * XXX Arguably belongs in vfs_subr.c, but is here because of the
12671 * complication around CONFIG_DATALESS_FILES.
12672 */
boolean_t
vfs_context_is_dataless_manipulator(vfs_context_t ctx __no_dataless_unused)
{
#if CONFIG_DATALESS_FILES
	task_t task = vfs_context_task(ctx);
	/* The resolver entitlement implies manipulation rights. */
	return IOTaskHasEntitlement(task, DATALESS_MANIPULATION_ENTITLEMENT) ||
	       IOTaskHasEntitlement(task, DATALESS_RESOLVER_ENTITLEMENT);
#else
	/* Without dataless-file support, nobody is a manipulator. */
	return false;
#endif /* CONFIG_DATALESS_FILES */
}
12684
12685 #if CONFIG_DATALESS_FILES
12686 static void
log_materialization_prevented(vnode_t vp,uint64_t op)12687 log_materialization_prevented(vnode_t vp, uint64_t op)
12688 {
12689 char p_name[MAXCOMLEN + 1];
12690 char *vntype;
12691 proc_selfname(&p_name[0], sizeof(p_name));
12692
12693 if (vp->v_type == VREG) {
12694 vntype = "File";
12695 } else if (vp->v_type == VDIR) {
12696 vntype = "Dir";
12697 } else if (vp->v_type == VLNK) {
12698 vntype = "SymLink";
12699 } else {
12700 vntype = "Other";
12701 }
12702
12703 #if DEVELOPMENT
12704 struct vnode_attr *vap = kalloc_type(struct vnode_attr, Z_WAITOK);
12705
12706 VATTR_INIT(vap);
12707 VATTR_WANTED(vap, va_fsid);
12708 VATTR_WANTED(vap, va_fileid);
12709 if (vnode_getattr(vp, vap, vfs_context_current()) == 0) {
12710 os_log_debug(OS_LOG_DEFAULT,
12711 "NSPACE process %s (pid %d) is decorated as no-materialization (op %lld; %s) fsid 0x%08x/%u fileid=%llu",
12712 p_name, proc_selfpid(), op, vntype,
12713 vap->va_fsid, vap->va_fsid, vap->va_fileid);
12714 } else
12715 #endif
12716 {
12717 os_log_debug(OS_LOG_DEFAULT,
12718 "NSPACE process %s (pid %d) is decorated as no-materialization (op %lld; %s)",
12719 p_name, proc_selfpid(), op, vntype);
12720 }
12721 #if DEVELOPMENT
12722 kfree_type(struct vnode_attr, vap);
12723 #endif
12724 }
12725 #endif /* CONFIG_DATALESS_FILES */
12726
12727 static int
vfs_materialize_item(vnode_t vp __no_dataless_unused,uint32_t op __no_dataless_unused,int64_t offset __no_dataless_unused,int64_t size __no_dataless_unused,char * lookup_name __no_dataless_unused,size_t const namelen __no_dataless_unused,vnode_t tdvp __no_dataless_unused)12728 vfs_materialize_item(
12729 vnode_t vp __no_dataless_unused,
12730 uint32_t op __no_dataless_unused,
12731 int64_t offset __no_dataless_unused,
12732 int64_t size __no_dataless_unused,
12733 char *lookup_name __no_dataless_unused,
12734 size_t const namelen __no_dataless_unused,
12735 vnode_t tdvp __no_dataless_unused)
12736 {
12737 #if CONFIG_DATALESS_FILES
12738 kern_return_t kern_ret;
12739 mach_port_t mach_port;
12740 char *path = NULL;
12741 vfs_context_t context;
12742 int path_len;
12743 int error;
12744 audit_token_t atoken;
12745 enum vtype vp_vtype;
12746
12747 /* Swap files are special; ignore them */
12748 if (vnode_isswap(vp)) {
12749 return 0;
12750 }
12751
12752 /*
12753 * NAMESPACE_HANDLER_SNAPSHOT_EVENT and NAMESPACE_HANDLER_TRACK_EVENT
12754 * are no longer used nor supported.
12755 */
12756 if (op & NAMESPACE_HANDLER_SNAPSHOT_EVENT) {
12757 os_log_debug(OS_LOG_DEFAULT, "NSPACE SNAPSHOT not handled");
12758 return ENOTSUP;
12759 }
12760 if (op & NAMESPACE_HANDLER_TRACK_EVENT) {
12761 os_log_debug(OS_LOG_DEFAULT, "NSPACE TRACK not handled");
12762 return ENOTSUP;
12763 }
12764
12765 /* Normalize 'op'. */
12766 op &= ~NAMESPACE_HANDLER_EVENT_TYPE_MASK;
12767
12768 /*
12769 * To-directory is only meaningful for rename operations;
12770 * ignore it if someone handed one to us unexpectedly.
12771 */
12772 if (op != NAMESPACE_HANDLER_RENAME_OP) {
12773 tdvp = NULL;
12774 }
12775
12776 context = vfs_context_current();
12777
12778 /* Remember this for later. */
12779 vp_vtype = vnode_vtype(vp);
12780
12781 error = vfs_context_dataless_materialization_is_prevented(context);
12782 if (error) {
12783 log_materialization_prevented(vp, op);
12784 goto out_check_errors;
12785 }
12786
12787 kern_ret = host_get_filecoordinationd_port(host_priv_self(),
12788 &mach_port);
12789 if (kern_ret != KERN_SUCCESS || !IPC_PORT_VALID(mach_port)) {
12790 os_log_error(OS_LOG_DEFAULT, "NSPACE no port");
12791 /*
12792 * Treat this like being unable to access the backing store
12793 * server.
12794 */
12795 return ETIMEDOUT;
12796 }
12797
12798 int path_alloc_len = MAXPATHLEN;
12799 do {
12800 path = kalloc_data(path_alloc_len, Z_WAITOK | Z_ZERO);
12801 if (path == NULL) {
12802 return ENOMEM;
12803 }
12804
12805 path_len = path_alloc_len;
12806 error = vn_getpath(vp, path, &path_len);
12807 if (error == 0) {
12808 break;
12809 } else if (error == ENOSPC) {
12810 kfree_data(path, path_alloc_len);
12811 path = NULL;
12812 } else {
12813 goto out_release_port;
12814 }
12815 } while (error == ENOSPC && (path_alloc_len += MAXPATHLEN) &&
12816 path_alloc_len <= MAXLONGPATHLEN);
12817
12818 error = vfs_context_copy_audit_token(context, &atoken);
12819 if (error) {
12820 goto out_release_port;
12821 }
12822
12823 struct nspace_resolver_request req = {
12824 .r_req_id = next_nspace_req_id(),
12825 .r_vp = vp,
12826 .r_tdvp = tdvp,
12827 };
12828
12829 error = nspace_resolver_req_add(&req);
12830 if (error) {
12831 goto out_release_port;
12832 }
12833
12834 os_log_debug(OS_LOG_DEFAULT, "NSPACE resolve_path call");
12835
12836 if (op == NAMESPACE_HANDLER_RENAME_OP && tdvp != NULL) {
12837 char *dest_path = NULL;
12838 int dest_path_len;
12839
12840 dest_path = zalloc(ZV_NAMEI);
12841 dest_path_len = MAXPATHLEN;
12842
12843 error = vn_getpath(tdvp, dest_path, &dest_path_len);
12844 if (error) {
12845 zfree(ZV_NAMEI, dest_path);
12846 goto out_release_port;
12847 }
12848
12849 /*
12850 * Force setting NAMESPACE_HANDLER_NSPACE_EVENT for
12851 * compatibility with existing agents in user-space
12852 * who get passed this value.
12853 */
12854 kern_ret = send_vfs_resolve_reparent_with_audit_token(mach_port,
12855 req.r_req_id,
12856 op | NAMESPACE_HANDLER_NSPACE_EVENT,
12857 path, dest_path, atoken);
12858
12859 zfree(ZV_NAMEI, dest_path);
12860 } else if (vp_vtype == VDIR) {
12861 char *tmpname = NULL;
12862
12863 /*
12864 * If the caller provided a lookup_name *and* a name length,
12865 * then we assume the lookup_name is not NUL-terminated.
12866 * Allocate a temporary buffer in this case to provide
12867 * a NUL-terminated path name to the IPC call.
12868 */
12869 if (lookup_name != NULL && namelen != 0) {
12870 if (namelen >= PATH_MAX) {
12871 error = EINVAL;
12872 goto out_req_remove;
12873 }
12874 tmpname = zalloc(ZV_NAMEI);
12875 strlcpy(tmpname, lookup_name, namelen + 1);
12876 lookup_name = tmpname;
12877 } else if (lookup_name != NULL) {
12878 /*
12879 * If the caller provided a lookup_name with a
12880 * zero name length, then we assume it's NUL-
12881 * terminated. Verify it has a valid length.
12882 */
12883 if (strlen(lookup_name) >= PATH_MAX) {
12884 error = EINVAL;
12885 goto out_req_remove;
12886 }
12887 }
12888
12889 /* (See above.) */
12890 kern_ret = send_vfs_resolve_dir_with_audit_token(mach_port,
12891 req.r_req_id,
12892 op | NAMESPACE_HANDLER_NSPACE_EVENT,
12893 lookup_name == NULL ? "" : lookup_name, path, atoken);
12894
12895 if (tmpname != NULL) {
12896 zfree(ZV_NAMEI, tmpname);
12897
12898 /*
12899 * Poison lookup_name rather than reference
12900 * freed memory.
12901 */
12902 lookup_name = NULL;
12903 }
12904 } else {
12905 /* (See above.) */
12906 kern_ret = send_vfs_resolve_file_with_audit_token(mach_port,
12907 req.r_req_id,
12908 op | NAMESPACE_HANDLER_NSPACE_EVENT,
12909 offset, size, path, atoken);
12910 }
12911 if (kern_ret != KERN_SUCCESS) {
12912 /*
12913 * Also treat this like being unable to access the backing
12914 * store server.
12915 */
12916 os_log_error(OS_LOG_DEFAULT, "NSPACE resolve failure: %d",
12917 kern_ret);
12918 error = ETIMEDOUT;
12919 goto out_req_remove;
12920 }
12921
12922 /*
12923 * Give back the memory we allocated earlier while we wait; we
12924 * no longer need it.
12925 */
12926 kfree_data(path, path_alloc_len);
12927 path = NULL;
12928
12929 /*
12930 * Request has been submitted to the resolver. Now (interruptibly)
12931 * wait for completion. Upon requrn, the request will have been
12932 * removed from the lookup table.
12933 */
12934 error = nspace_resolver_req_wait(&req);
12935
12936 out_release_port:
12937 if (path != NULL) {
12938 kfree_data(path, path_alloc_len);
12939 path = NULL;
12940 }
12941 ipc_port_release_send(mach_port);
12942
12943 out_check_errors:
12944 /*
12945 * The file resolver owns the logic about what error to return
12946 * to the caller. We only need to handle a couple of special
12947 * cases here:
12948 */
12949 if (error == EJUSTRETURN) {
12950 /*
12951 * The requesting process is allowed to interact with
12952 * dataless objects. Make a couple of sanity-checks
12953 * here to ensure the action makes sense.
12954 */
12955 switch (op) {
12956 case NAMESPACE_HANDLER_WRITE_OP:
12957 case NAMESPACE_HANDLER_TRUNCATE_OP:
12958 case NAMESPACE_HANDLER_RENAME_OP:
12959 /*
12960 * This handles the case of the resolver itself
12961 * writing data to the file (or throwing it
12962 * away).
12963 */
12964 error = 0;
12965 break;
12966 case NAMESPACE_HANDLER_READ_OP:
12967 case NAMESPACE_HANDLER_LOOKUP_OP:
12968 /*
12969 * This handles the case of the resolver needing
12970 * to look up inside of a dataless directory while
12971 * it's in the process of materializing it (for
12972 * example, creating files or directories).
12973 */
12974 error = (vp_vtype == VDIR) ? 0 : EBADF;
12975 break;
12976 default:
12977 error = EBADF;
12978 break;
12979 }
12980 }
12981
12982 return error;
12983
12984 out_req_remove:
12985 nspace_resolver_req_remove(&req);
12986 goto out_release_port;
12987 #else
12988 return ENOTSUP;
12989 #endif /* CONFIG_DATALESS_FILES */
12990 }
12991
12992 /*
12993 * vfs_materialize_file: Materialize a regular file.
12994 *
12995 * Inputs:
12996 * vp The dataless file to be materialized.
12997 *
12998 * op What kind of operation is being performed:
12999 * -> NAMESPACE_HANDLER_READ_OP
13000 * -> NAMESPACE_HANDLER_WRITE_OP
13001 * -> NAMESPACE_HANDLER_LINK_CREATE
13002 * -> NAMESPACE_HANDLER_DELETE_OP
13003 * -> NAMESPACE_HANDLER_TRUNCATE_OP
13004 * -> NAMESPACE_HANDLER_RENAME_OP
13005 *
13006 * offset offset of I/O for READ or WRITE. Ignored for
13007 * other ops.
13008 *
 * size			size of I/O for READ or WRITE.  Ignored for
13010 * other ops.
13011 *
13012 * If offset or size are -1 for a READ or WRITE, then the resolver should
13013 * consider the range to be unknown.
13014 *
13015 * Upon successful return, the caller may proceed with the operation.
13016 * N.B. the file may still be "dataless" in this case.
13017 */
13018 int
vfs_materialize_file(struct vnode * vp,uint64_t op,int64_t offset,int64_t size)13019 vfs_materialize_file(
13020 struct vnode *vp,
13021 uint64_t op,
13022 int64_t offset,
13023 int64_t size)
13024 {
13025 if (vp->v_type != VREG) {
13026 return EFTYPE;
13027 }
13028 return vfs_materialize_item(vp, (uint32_t)op, offset, size, NULL, 0,
13029 NULL);
13030 }
13031
13032 /*
13033 * vfs_materialize_dir:
13034 *
13035 * Inputs:
13036 * vp The dataless directory to be materialized.
13037 *
13038 * op What kind of operation is being performed:
13039 * -> NAMESPACE_HANDLER_READ_OP
13040 * -> NAMESPACE_HANDLER_WRITE_OP
13041 * -> NAMESPACE_HANDLER_DELETE_OP
13042 * -> NAMESPACE_HANDLER_RENAME_OP
13043 * -> NAMESPACE_HANDLER_LOOKUP_OP
13044 *
13045 * lookup_name Name being looked up for a LOOKUP op. Ignored for
13046 * other ops. May or may not be NUL-terminated; see below.
13047 *
13048 * namelen If non-zero, then lookup_name is assumed to not be NUL-
13049 * terminated and namelen is the number of valid bytes in
13050 * lookup_name. If zero, then lookup_name is assumed to be
13051 * NUL-terminated.
13052 *
13053 * Upon successful return, the caller may proceed with the operation.
13054 * N.B. the directory may still be "dataless" in this case.
13055 */
13056 int
vfs_materialize_dir(struct vnode * vp,uint64_t op,char * lookup_name,size_t namelen)13057 vfs_materialize_dir(
13058 struct vnode *vp,
13059 uint64_t op,
13060 char *lookup_name,
13061 size_t namelen)
13062 {
13063 if (vp->v_type != VDIR) {
13064 return EFTYPE;
13065 }
13066 if (op == NAMESPACE_HANDLER_LOOKUP_OP && lookup_name == NULL) {
13067 return EINVAL;
13068 }
13069 return vfs_materialize_item(vp, (uint32_t)op, 0, 0, lookup_name,
13070 namelen, NULL);
13071 }
13072
13073 /*
13074 * vfs_materialize_reparent:
13075 *
13076 * Inputs:
13077 * vp The dataless file or directory to be materialized.
13078 *
13079 * tdvp The new parent directory for the dataless file.
13080 *
13081 * Upon successful return, the caller may proceed with the operation.
13082 * N.B. the item may still be "dataless" in this case.
13083 */
13084 int
vfs_materialize_reparent(vnode_t vp,vnode_t tdvp)13085 vfs_materialize_reparent(vnode_t vp, vnode_t tdvp)
13086 {
13087 if (vp->v_type != VDIR && vp->v_type != VREG) {
13088 return EFTYPE;
13089 }
13090 return vfs_materialize_item(vp, NAMESPACE_HANDLER_RENAME_OP,
13091 0, 0, NULL, 0, tdvp);
13092 }
13093
#if 0
/*
 * build_volfs_path: format a volfs-style path ("/.vol/<fsid>/<fileid>")
 * for vp into 'path'.  On entry *len is the buffer size; on return it
 * holds the formatted length including the NUL terminator.  Returns 0 on
 * success, or -1 if the vnode's attributes could not be fetched (in which
 * case a recognizable placeholder path is written instead).
 *
 * NOTE: compiled out (#if 0); retained for reference only.
 */
static int
build_volfs_path(struct vnode *vp, char *path, int *len)
{
	struct vnode_attr va;
	int ret;

	VATTR_INIT(&va);
	VATTR_WANTED(&va, va_fsid);
	VATTR_WANTED(&va, va_fileid);

	if (vnode_getattr(vp, &va, vfs_context_kernel()) != 0) {
		*len = snprintf(path, *len, "/non/existent/path/because/vnode_getattr/failed") + 1;
		ret = -1;
	} else {
		*len = snprintf(path, *len, "/.vol/%d/%lld", (dev_t)va.va_fsid, va.va_fileid) + 1;
		ret = 0;
	}

	return ret;
}
#endif
13116
13117 static unsigned long
fsctl_bogus_command_compat(unsigned long cmd)13118 fsctl_bogus_command_compat(unsigned long cmd)
13119 {
13120 switch (cmd) {
13121 case IOCBASECMD(FSIOC_SYNC_VOLUME):
13122 return FSIOC_SYNC_VOLUME;
13123 case IOCBASECMD(FSIOC_ROUTEFS_SETROUTEID):
13124 return FSIOC_ROUTEFS_SETROUTEID;
13125 case IOCBASECMD(FSIOC_SET_PACKAGE_EXTS):
13126 return FSIOC_SET_PACKAGE_EXTS;
13127 case IOCBASECMD(FSIOC_SET_FSTYPENAME_OVERRIDE):
13128 return FSIOC_SET_FSTYPENAME_OVERRIDE;
13129 case IOCBASECMD(DISK_CONDITIONER_IOC_GET):
13130 return DISK_CONDITIONER_IOC_GET;
13131 case IOCBASECMD(DISK_CONDITIONER_IOC_SET):
13132 return DISK_CONDITIONER_IOC_SET;
13133 case IOCBASECMD(FSIOC_FIOSEEKHOLE):
13134 return FSIOC_FIOSEEKHOLE;
13135 case IOCBASECMD(FSIOC_FIOSEEKDATA):
13136 return FSIOC_FIOSEEKDATA;
13137 case IOCBASECMD(SPOTLIGHT_IOC_GET_MOUNT_TIME):
13138 return SPOTLIGHT_IOC_GET_MOUNT_TIME;
13139 case IOCBASECMD(SPOTLIGHT_IOC_GET_LAST_MTIME):
13140 return SPOTLIGHT_IOC_GET_LAST_MTIME;
13141 }
13142
13143 return cmd;
13144 }
13145
13146 static int
cas_bsdflags_setattr(vnode_t vp,void * arg,vfs_context_t ctx)13147 cas_bsdflags_setattr(vnode_t vp, void *arg, vfs_context_t ctx)
13148 {
13149 return VNOP_IOCTL(vp, FSIOC_CAS_BSDFLAGS, arg, FWRITE, ctx);
13150 }
13151
/*
 * handle_sync_volume: implement FSIOC_SYNC_VOLUME.
 *
 * Syncs the mount containing vp.  Drops the caller's iocount on vp
 * (keeping a holdcount across the sync) and sets *arg_vp to NULL on
 * return so the caller knows not to release it again.
 */
static int __attribute__((noinline))
handle_sync_volume(vnode_t vp, vnode_t *arg_vp, caddr_t data, vfs_context_t ctx)
{
	struct vfs_attr vfa;
	mount_t mp = vp->v_mount;
	unsigned arg;
	int error;

	/* record vid of vp so we can drop it below. */
	uint32_t vvid = vp->v_id;

	/*
	 * Then grab mount_iterref so that we can release the vnode.
	 * Without this, a thread may call vnode_iterate_prepare then
	 * get into a deadlock because we've never released the root vp
	 */
	error = mount_iterref(mp, 0);
	if (error) {
		return error;
	}
	/* Keep a holdcount so the vnode survives while we hold no iocount. */
	vnode_hold(vp);
	vnode_put(vp);

	arg = MNT_NOWAIT;
	if (*(uint32_t*)data & FSCTL_SYNC_WAIT) {
		arg = MNT_WAIT;
	}

	/*
	 * If the filesystem supports multiple filesystems in a
	 * partition (e.g. APFS volumes in a container), it knows
	 * that the waitfor argument to VFS_SYNC are flags.
	 */
	VFSATTR_INIT(&vfa);
	VFSATTR_WANTED(&vfa, f_capabilities);
	if ((vfs_getattr(mp, &vfa, vfs_context_current()) == 0) &&
	    VFSATTR_IS_SUPPORTED(&vfa, f_capabilities) &&
	    ((vfa.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_SHARED_SPACE)) &&
	    ((vfa.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_SHARED_SPACE))) {
		arg |= MNT_VOLUME;
	}

	/* issue the sync for this volume */
	(void)sync_callback(mp, &arg);

	/*
	 * Then release the mount_iterref once we're done syncing; it's not
	 * needed for the VNOP_IOCTL below
	 */
	mount_iterdrop(mp);

	/*
	 * NOTE(review): this tests the MNT_* wait-mode bits left in 'arg',
	 * not the caller-supplied FSCTL_SYNC_FULLSYNC bit in 'data' --
	 * confirm this is intentional.
	 */
	if (arg & FSCTL_SYNC_FULLSYNC) {
		/* re-obtain vnode iocount on the root vp, if possible */
		error = vnode_getwithvid(vp, vvid);
		if (error == 0) {
			error = VNOP_IOCTL(vp, F_FULLFSYNC, (caddr_t)NULL, 0, ctx);
			vnode_put(vp);
		}
	}
	vnode_drop(vp);
	/* mark the argument VP as having been released */
	*arg_vp = NULL;
	return error;
}
13216
13217 #if ROUTEFS
13218 static int __attribute__((noinline))
handle_routes(user_addr_t udata)13219 handle_routes(user_addr_t udata)
13220 {
13221 char routepath[MAXPATHLEN];
13222 size_t len = 0;
13223 int error;
13224
13225 if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
13226 return error;
13227 }
13228 bzero(routepath, MAXPATHLEN);
13229 error = copyinstr(udata, &routepath[0], MAXPATHLEN, &len);
13230 if (error) {
13231 return error;
13232 }
13233 error = routefs_kernel_mount(routepath);
13234 return error;
13235 }
13236 #endif
13237
13238 static int __attribute__((noinline))
handle_flags(vnode_t vp,caddr_t data,vfs_context_t ctx)13239 handle_flags(vnode_t vp, caddr_t data, vfs_context_t ctx)
13240 {
13241 struct fsioc_cas_bsdflags *cas = (struct fsioc_cas_bsdflags *)data;
13242 struct vnode_attr va;
13243 int error;
13244
13245 VATTR_INIT(&va);
13246 VATTR_SET(&va, va_flags, cas->new_flags);
13247
13248 error = chflags0(vp, &va, cas_bsdflags_setattr, cas, ctx);
13249
13250 #if CONFIG_FSE
13251 if (error == 0 && cas->expected_flags == cas->actual_flags && need_fsevent(FSE_STAT_CHANGED, vp)) {
13252 add_fsevent(FSE_STAT_CHANGED, ctx, FSE_ARG_VNODE, vp, FSE_ARG_DONE);
13253 }
13254 #endif
13255
13256 return error;
13257 }
13258
13259 static int __attribute__((noinline))
handle_auth(vnode_t vp,u_long cmd,caddr_t data,u_long options,vfs_context_t ctx)13260 handle_auth(vnode_t vp, u_long cmd, caddr_t data, u_long options, vfs_context_t ctx)
13261 {
13262 struct mount *mp = NULL;
13263 errno_t rootauth = 0;
13264
13265 mp = vp->v_mount;
13266
13267 /*
13268 * query the underlying FS and see if it reports something
13269 * sane for this vnode. If volume is authenticated via
13270 * chunklist, leave that for the caller to determine.
13271 */
13272 rootauth = VNOP_IOCTL(vp, cmd, data, (int)options, ctx);
13273
13274 return rootauth;
13275 }
13276
13277 #define SET_PACKAGE_EXTENSION_ENTITLEMENT \
13278 "com.apple.private.kernel.set-package-extensions"
13279
13280 /*
13281 * Make a filesystem-specific control call:
13282 */
13283 /* ARGSUSED */
/*
 * fsctl_internal: common implementation for fsctl() and ffsctl().
 *
 * Marshals the ioctl-style argument (copyin for IOC_IN, zeroed output
 * buffer for IOC_OUT, immediate value for IOC_VOID), dispatches the
 * generic FSIOC_* selectors handled in the VFS layer, and forwards
 * everything else to the filesystem via VNOP_IOCTL().  On success,
 * IOC_OUT data is copied back to user space.
 *
 * NOTE: FSIOC_SYNC_VOLUME may consume the caller's iocount on *arg_vp;
 * in that case *arg_vp is set to NULL so the caller must not
 * vnode_put() it again.
 */
static int
fsctl_internal(proc_t p, vnode_t *arg_vp, u_long cmd, user_addr_t udata, u_long options, vfs_context_t ctx)
{
	int error = 0;
	boolean_t is64bit;
	u_int size;
#define STK_PARAMS 128
	char stkbuf[STK_PARAMS] = {0};
	caddr_t data, memp;
	vnode_t vp = *arg_vp;

	/* Device special files take the ioctl(2) path, not fsctl. */
	if (vp->v_type == VCHR || vp->v_type == VBLK) {
		return ENOTTY;
	}

	/* Translate legacy selectors issued without their size bits. */
	cmd = fsctl_bogus_command_compat(cmd);

	size = IOCPARM_LEN(cmd);
	if (size > IOCPARM_MAX) {
		return EINVAL;
	}

	is64bit = proc_is64bit(p);

	memp = NULL;

	/* Arguments too big for the stack buffer get a temporary heap copy. */
	if (size > sizeof(stkbuf)) {
		if ((memp = (caddr_t)kalloc_data(size, Z_WAITOK)) == 0) {
			return ENOMEM;
		}
		data = memp;
	} else {
		data = &stkbuf[0];
	};

	if (cmd & IOC_IN) {
		if (size) {
			/* Input command: fetch the argument from user space. */
			error = copyin(udata, data, size);
			if (error) {
				if (memp) {
					kfree_data(memp, size);
				}
				return error;
			}
		} else {
			/* Zero-size input: the "argument" is the pointer value itself. */
			if (is64bit) {
				*(user_addr_t *)data = udata;
			} else {
				*(uint32_t *)data = (uint32_t)udata;
			}
		};
	} else if ((cmd & IOC_OUT) && size) {
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		bzero(data, size);
	} else if (cmd & IOC_VOID) {
		if (is64bit) {
			*(user_addr_t *)data = udata;
		} else {
			*(uint32_t *)data = (uint32_t)udata;
		}
	}

	/* Check to see if it's a generic command */
	switch (cmd) {
	case FSIOC_SYNC_VOLUME:
		/* NB: may drop the iocount and NULL out *arg_vp. */
		error = handle_sync_volume(vp, arg_vp, data, ctx);
		break;

	case FSIOC_ROUTEFS_SETROUTEID:
#if ROUTEFS
		error = handle_routes(udata);
#endif
		break;

	case FSIOC_SET_PACKAGE_EXTS: {
		user_addr_t ext_strings;
		uint32_t num_entries;
		uint32_t max_width;

		/* Updating the global package-extension table is entitled. */
		if (!IOTaskHasEntitlement(vfs_context_task(ctx),
		    SET_PACKAGE_EXTENSION_ENTITLEMENT)) {
			error = EPERM;
			break;
		}

		if ((is64bit && size != sizeof(user64_package_ext_info))
		    || (is64bit == 0 && size != sizeof(user32_package_ext_info))) {
			// either you're 64-bit and passed a 64-bit struct or
			// you're 32-bit and passed a 32-bit struct.  otherwise
			// it's not ok.
			error = EINVAL;
			break;
		}

		if (is64bit) {
			if (sizeof(user64_addr_t) > sizeof(user_addr_t)) {
				assert(((user64_package_ext_info *)data)->strings <= UINT32_MAX);
			}
			ext_strings = (user_addr_t)((user64_package_ext_info *)data)->strings;
			num_entries = ((user64_package_ext_info *)data)->num_entries;
			max_width = ((user64_package_ext_info *)data)->max_width;
		} else {
			ext_strings = CAST_USER_ADDR_T(((user32_package_ext_info *)data)->strings);
			num_entries = ((user32_package_ext_info *)data)->num_entries;
			max_width = ((user32_package_ext_info *)data)->max_width;
		}
		error = set_package_extensions_table(ext_strings, num_entries, max_width);
	}
	break;

	case FSIOC_SET_FSTYPENAME_OVERRIDE:
	{
		mount_t mp;

		if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
			break;
		}
		if ((mp = vp->v_mount) != NULL) {
			mount_lock(mp);
			if (data[0] != 0) {
				/* Require a NUL within the first MFSTYPENAMELEN bytes. */
				for (int i = 0; i < MFSTYPENAMELEN; i++) {
					if (!data[i]) {
						goto continue_copy;
					}
				}
				/*
				 * Getting here means we have a user data
				 * string which has no NULL termination in
				 * its first MFSTYPENAMELEN bytes.  This is
				 * bogus, let's avoid strlcpy-ing the read
				 * data and return an error.
				 */
				error = EINVAL;
				goto unlock;
continue_copy:
				vfs_setfstypename_locked(mp, data);
				/* Read-only "mtmfs" mounts get special auth handling. */
				if (vfs_isrdonly(mp) &&
				    strcmp(data, "mtmfs") == 0) {
					mp->mnt_kern_flag |=
					    MNTK_EXTENDED_SECURITY;
					mp->mnt_kern_flag &=
					    ~MNTK_AUTH_OPAQUE;
				}
			} else if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
				/* Empty name: clear any existing override. */
				const char *name =
				    vfs_getfstypenameref_locked(mp, NULL);
				if (strcmp(name, "mtmfs") == 0) {
					mp->mnt_kern_flag &=
					    ~MNTK_EXTENDED_SECURITY;
				}
				vfs_setfstypename_locked(mp, NULL);
			}
unlock:
			mount_unlock(mp);
		}
	}
	break;

	case DISK_CONDITIONER_IOC_GET: {
		error = disk_conditioner_get_info(vp->v_mount, (disk_conditioner_info *)data);
	}
	break;

	case DISK_CONDITIONER_IOC_SET: {
		error = disk_conditioner_set_info(vp->v_mount, (disk_conditioner_info *)data);
	}
	break;

	case FSIOC_CAS_BSDFLAGS:
		error = handle_flags(vp, data, ctx);
		break;

	case FSIOC_FD_ONLY_OPEN_ONCE: {
		/*
		 * Succeed only if ours is the sole open of the file
		 * (named streams excepted via the in-use check below).
		 */
		error = 0;
		if (vnode_usecount(vp) > 1) {
			vnode_lock_spin(vp);
			if (vp->v_lflag & VL_HASSTREAMS) {
				if (vnode_isinuse_locked(vp, 1, 1)) {
					error = EBUSY;
				}
			} else if (vnode_usecount(vp) > 1) {
				error = EBUSY;
			}
			vnode_unlock(vp);
		}
	}
	break;

	case FSIOC_EVAL_ROOTAUTH:
		error = handle_auth(vp, cmd, data, options, ctx);
		break;

	case FSIOC_TEST_FSE_ACCESS_GRANTED:
		error = test_fse_access_granted(vp, (unsigned long)udata, ctx);
		break;

#if CONFIG_EXCLAVES
	case FSIOC_EXCLAVE_FS_REGISTER:
		if (IOTaskHasEntitlement(vfs_context_task(ctx), EXCLAVE_FS_REGISTER_ENTITLEMENT)) {
			error = vfs_exclave_fs_register(((fsioc_exclave_fs_register_t *)data)->fs_tag, vp);
		} else {
			error = EPERM;
		}
		break;

	case FSIOC_EXCLAVE_FS_UNREGISTER:
		if (IOTaskHasEntitlement(vfs_context_task(ctx), EXCLAVE_FS_REGISTER_ENTITLEMENT)) {
			error = vfs_exclave_fs_unregister(vp);
		} else {
			error = EPERM;
		}
		break;

	case FSIOC_EXCLAVE_FS_GET_BASE_DIRS: {
		exclave_fs_get_base_dirs_t *get_base_dirs = ((exclave_fs_get_base_dirs_t *)data);
		exclave_fs_base_dir_t *dirs = NULL;
		if (!IOTaskHasEntitlement(vfs_context_task(ctx), EXCLAVE_FS_REGISTER_ENTITLEMENT) &&
		    !IOTaskHasEntitlement(vfs_context_task(ctx), EXCLAVE_FS_LIST_ENTITLEMENT)) {
			error = EPERM;
			break;
		}
		if (get_base_dirs->base_dirs) {
			/* Bound the caller-supplied count before allocating. */
			if ((get_base_dirs->count == 0) || (get_base_dirs->count > EXCLAVE_FS_GET_BASE_DIRS_MAX_COUNT)) {
				error = EINVAL;
				break;
			}
			dirs = kalloc_type(exclave_fs_base_dir_t, get_base_dirs->count, Z_WAITOK | Z_ZERO);
			if (!dirs) {
				error = ENOSPC;
				break;
			}
		}
		error = vfs_exclave_fs_get_base_dirs(dirs, &get_base_dirs->count);
		if (!error && dirs) {
			error = copyout(dirs, (user_addr_t)get_base_dirs->base_dirs,
			    get_base_dirs->count * sizeof(exclave_fs_base_dir_t));
		}
		if (dirs) {
			kfree_type(exclave_fs_base_dir_t, get_base_dirs->count, dirs);
		}
	}
	break;
#endif

	default: {
		/*
		 * Other, known commands shouldn't be passed down here.
		 * (When adding a selector to this list, it may be prudent
		 * to consider adding it to the list in sys_fcntl_nocancel() as well.)
		 */
		switch (cmd) {
		case F_PUNCHHOLE:
		case F_TRIM_ACTIVE_FILE:
		case F_RDADVISE:
		case F_TRANSCODEKEY:
		case F_GETPROTECTIONLEVEL:
		case F_GETDEFAULTPROTLEVEL:
		case F_MAKECOMPRESSED:
		case F_SET_GREEDY_MODE:
		case F_SETSTATICCONTENT:
		case F_SETIOTYPE:
		case F_SETBACKINGSTORE:
		case F_GETPATH_MTMINFO:
		case APFSIOC_REVERT_TO_SNAPSHOT:
		case FSIOC_FIOSEEKHOLE:
		case FSIOC_FIOSEEKDATA:
		case HFS_GET_BOOT_INFO:
		case HFS_SET_BOOT_INFO:
		case FIOPINSWAP:
		case F_CHKCLEAN:
		case F_FULLFSYNC:
		case F_BARRIERFSYNC:
		case F_FREEZE_FS:
		case F_THAW_FS:
		case FSIOC_KERNEL_ROOTAUTH:
		case FSIOC_GRAFT_FS:
		case FSIOC_UNGRAFT_FS:
		case FSIOC_AUTH_FS:
		case F_SPECULATIVE_READ:
		case F_ATTRIBUTION_TAG:
		case F_TRANSFEREXTENTS:
		case F_ASSERT_BG_ACCESS:
		case F_RELEASE_BG_ACCESS:
			error = EINVAL;
			goto outdrop;
		}
		/* Invoke the filesystem-specific code */
		error = VNOP_IOCTL(vp, cmd, data, (int)options, ctx);
	}
	} /* end switch stmt */

	/*
	 * if no errors, copy any data to user. Size was
	 * already set and checked above.
	 */
	if (error == 0 && (cmd & IOC_OUT) && size) {
		error = copyout(data, udata, size);
	}

outdrop:
	if (memp) {
		kfree_data(memp, size);
	}

	return error;
}
13593
13594 /* ARGSUSED */
13595 int
fsctl(proc_t p,struct fsctl_args * uap,__unused int32_t * retval)13596 fsctl(proc_t p, struct fsctl_args *uap, __unused int32_t *retval)
13597 {
13598 int error;
13599 struct nameidata nd;
13600 uint32_t nameiflags;
13601 vnode_t vp = NULL;
13602 vfs_context_t ctx = vfs_context_current();
13603
13604 AUDIT_ARG(cmd, (int)uap->cmd);
13605 AUDIT_ARG(value32, uap->options);
13606 /* Get the vnode for the file we are getting info on: */
13607 nameiflags = 0;
13608 //
13609 // if we come through fsctl() then the file is by definition not open.
13610 // therefore for the FSIOC_FD_ONLY_OPEN_ONCE selector we return an error
13611 // lest the caller mistakenly thinks the only open is their own (but in
13612 // reality it's someone elses).
13613 //
13614 if (uap->cmd == FSIOC_FD_ONLY_OPEN_ONCE) {
13615 return EINVAL;
13616 }
13617 if ((uap->options & FSOPT_NOFOLLOW) == 0) {
13618 nameiflags |= FOLLOW;
13619 }
13620 if (uap->cmd == FSIOC_FIRMLINK_CTL) {
13621 nameiflags |= (CN_FIRMLINK_NOFOLLOW | NOCACHE);
13622 }
13623 NDINIT(&nd, LOOKUP, OP_FSCTL, nameiflags | AUDITVNPATH1,
13624 UIO_USERSPACE, uap->path, ctx);
13625 if ((error = namei(&nd))) {
13626 goto done;
13627 }
13628 vp = nd.ni_vp;
13629 nameidone(&nd);
13630
13631 #if CONFIG_MACF
13632 error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd);
13633 if (error) {
13634 goto done;
13635 }
13636 #endif
13637
13638 error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);
13639
13640 done:
13641 if (vp) {
13642 vnode_put(vp);
13643 }
13644 return error;
13645 }
13646 /* ARGSUSED */
13647 int
ffsctl(proc_t p,struct ffsctl_args * uap,__unused int32_t * retval)13648 ffsctl(proc_t p, struct ffsctl_args *uap, __unused int32_t *retval)
13649 {
13650 int error;
13651 vnode_t vp = NULL;
13652 vfs_context_t ctx = vfs_context_current();
13653 int fd = -1;
13654
13655 AUDIT_ARG(fd, uap->fd);
13656 AUDIT_ARG(cmd, (int)uap->cmd);
13657 AUDIT_ARG(value32, uap->options);
13658
13659 /* Get the vnode for the file we are getting info on: */
13660 if ((error = file_vnode(uap->fd, &vp))) {
13661 return error;
13662 }
13663 fd = uap->fd;
13664 if ((error = vnode_getwithref(vp))) {
13665 file_drop(fd);
13666 return error;
13667 }
13668
13669 #if CONFIG_MACF
13670 if ((error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd))) {
13671 file_drop(fd);
13672 vnode_put(vp);
13673 return error;
13674 }
13675 #endif
13676
13677 error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);
13678
13679 file_drop(fd);
13680
13681 /*validate vp; fsctl_internal() can drop iocount and reset vp to NULL*/
13682 if (vp) {
13683 vnode_put(vp);
13684 }
13685
13686 return error;
13687 }
13688 /* end of fsctl system call */
13689
13690 #define FILESEC_ACCESS_ENTITLEMENT \
13691 "com.apple.private.vfs.filesec-access"
13692
13693 static int
xattr_entitlement_check(const char * attrname,vfs_context_t ctx,bool setting)13694 xattr_entitlement_check(const char *attrname, vfs_context_t ctx, bool setting)
13695 {
13696 if (strcmp(attrname, KAUTH_FILESEC_XATTR) == 0) {
13697 /*
13698 * get: root and tasks with FILESEC_ACCESS_ENTITLEMENT.
13699 * set: only tasks with FILESEC_ACCESS_ENTITLEMENT.
13700 */
13701 if ((!setting && vfs_context_issuser(ctx)) ||
13702 IOTaskHasEntitlement(vfs_context_task(ctx),
13703 FILESEC_ACCESS_ENTITLEMENT)) {
13704 return 0;
13705 }
13706 }
13707
13708 return EPERM;
13709 }
13710
13711 /*
13712 * Retrieve the data of an extended attribute.
13713 */
int
getxattr(proc_t p, struct getxattr_args *uap, user_ssize_t *retval)
{
	vnode_t vp;
	struct nameidata nd;
	char attrname[XATTR_MAXNAMELEN + 1];
	vfs_context_t ctx = vfs_context_current();
	uio_t auio = NULL;
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	size_t attrsize = 0;
	size_t namelen;
	u_int32_t nameiflags;
	int error;
	UIO_STACKBUF(uio_buf, 1);

	/* These options are kernel-internal; reject them from user space. */
	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_GETXATTR, nameiflags, spacetype, uap->path, ctx);
	if (uap->options & XATTR_NOFOLLOW_ANY) {
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}
	if (uap->options & XATTR_RESOLVE_BENEATH) {
		nd.ni_flag |= NAMEI_RESOLVE_BENEATH;
	}

	if ((error = namei(&nd))) {
		return error;
	}
	vp = nd.ni_vp;
	nameidone(&nd);

	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
	if (error != 0) {
		goto out;
	}
	/* System-reserved names require root or the filesec entitlement. */
	if (xattr_protected(attrname) &&
	    (error = xattr_entitlement_check(attrname, ctx, false)) != 0) {
		goto out;
	}
	/*
	 * the specific check for 0xffffffff is a hack to preserve
	 * binary compatibility in K64 with applications that discovered
	 * that passing in a buf pointer and a size of -1 resulted in
	 * just the size of the indicated extended attribute being returned.
	 * this isn't part of the documented behavior, but because of the
	 * original implementation's check for "uap->size > 0", this behavior
	 * was allowed. In K32 that check turned into a signed comparison
	 * even though uap->size is unsigned... in K64, we blow by that
	 * check because uap->size is unsigned and doesn't get sign smeared
	 * in the munger for a 32 bit user app. we also need to add a
	 * check to limit the maximum size of the buffer being passed in...
	 * unfortunately, the underlying filesystems seem to just malloc
	 * the requested size even if the actual extended attribute is tiny.
	 * because that malloc is for kernel wired memory, we have to put a
	 * sane limit on it.
	 *
	 * U32 running on K64 will yield 0x00000000ffffffff for uap->size
	 * U64 running on K64 will yield -1 (64 bits wide)
	 * U32/U64 running on K32 will yield -1 (32 bits wide)
	 */
	if (uap->size == 0xffffffff || uap->size == (size_t)-1) {
		goto no_uio;
	}

	if (uap->value) {
		/* Clamp oversized requests; the FS wires what it allocates. */
		if (uap->size > (size_t)XATTR_MAXSIZE) {
			uap->size = XATTR_MAXSIZE;
		}

		auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
		    &uio_buf[0], sizeof(uio_buf));
		uio_addiov(auio, uap->value, uap->size);
	}
no_uio:
	/* With auio == NULL, vn_getxattr() only reports the size in attrsize. */
	error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, ctx);
out:
	vnode_put(vp);

	/* Buffer supplied: report bytes copied; otherwise the attribute size. */
	if (auio) {
		*retval = uap->size - uio_resid(auio);
	} else {
		*retval = (user_ssize_t)attrsize;
	}

	return error;
}
13803
13804 /*
13805 * Retrieve the data of an extended attribute.
13806 */
13807 int
fgetxattr(proc_t p,struct fgetxattr_args * uap,user_ssize_t * retval)13808 fgetxattr(proc_t p, struct fgetxattr_args *uap, user_ssize_t *retval)
13809 {
13810 vnode_t vp;
13811 char attrname[XATTR_MAXNAMELEN + 1];
13812 vfs_context_t ctx = vfs_context_current();
13813 uio_t auio = NULL;
13814 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
13815 size_t attrsize = 0;
13816 size_t namelen;
13817 int error;
13818 UIO_STACKBUF(uio_buf, 1);
13819
13820 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT |
13821 XATTR_NOFOLLOW_ANY | XATTR_RESOLVE_BENEATH)) {
13822 return EINVAL;
13823 }
13824
13825 if ((error = file_vnode(uap->fd, &vp))) {
13826 return error;
13827 }
13828 if ((error = vnode_getwithref(vp))) {
13829 file_drop(uap->fd);
13830 return error;
13831 }
13832 error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
13833 if (error != 0) {
13834 goto out;
13835 }
13836 if (xattr_protected(attrname) &&
13837 (error = xattr_entitlement_check(attrname, ctx, false)) != 0) {
13838 goto out;
13839 }
13840 if (uap->value && uap->size > 0) {
13841 if (uap->size > (size_t)XATTR_MAXSIZE) {
13842 uap->size = XATTR_MAXSIZE;
13843 }
13844
13845 auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
13846 &uio_buf[0], sizeof(uio_buf));
13847 uio_addiov(auio, uap->value, uap->size);
13848 }
13849
13850 error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, vfs_context_current());
13851 out:
13852 (void)vnode_put(vp);
13853 file_drop(uap->fd);
13854
13855 if (auio) {
13856 *retval = uap->size - uio_resid(auio);
13857 } else {
13858 *retval = (user_ssize_t)attrsize;
13859 }
13860 return error;
13861 }
13862
/*
 * Lookup state, attribute name, and uio backing store for setxattr().
 * Heap-allocated as one unit to keep the large nameidata off the
 * kernel stack.  (Used only by setxattr(), not by checkdirs.)
 */
struct setxattr_ctx {
	struct nameidata nd;
	char attrname[XATTR_MAXNAMELEN + 1];
	UIO_STACKBUF(uio_buf, 1);
};
13869
13870 /*
13871 * Set the data of an extended attribute.
13872 */
int
setxattr(proc_t p, struct setxattr_args *uap, int *retval)
{
	vnode_t vp;
	vfs_context_t ctx = vfs_context_current();
	uio_t auio = NULL;
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	size_t namelen;
	u_int32_t nameiflags;
	int error;
	struct setxattr_ctx *sactx;

	/* These options are kernel-internal; reject them from user space. */
	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	/* Heap-allocate the lookup state to keep nameidata off the stack. */
	sactx = kalloc_type(struct setxattr_ctx, Z_WAITOK);
	if (sactx == NULL) {
		return ENOMEM;
	}

	error = copyinstr(uap->attrname, sactx->attrname, sizeof(sactx->attrname), &namelen);
	if (error != 0) {
		if (error == EPERM) {
			/* if the string won't fit in attrname, copyinstr emits EPERM */
			error = ENAMETOOLONG;
		}
		/* Otherwise return the default error from copyinstr to detect ERANGE, etc */
		goto out;
	}
	/* System-reserved names may only be set with the entitlement. */
	if (xattr_protected(sactx->attrname) &&
	    (error = xattr_entitlement_check(sactx->attrname, ctx, true)) != 0) {
		goto out;
	}
	if (uap->size != 0 && uap->value == 0) {
		error = EINVAL;
		goto out;
	}
	if (uap->size > INT_MAX) {
		error = E2BIG;
		goto out;
	}

	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
#if CONFIG_FILE_LEASES
	/* Need the parent so we can break any directory lease on it. */
	nameiflags |= WANTPARENT;
#endif
	NDINIT(&sactx->nd, LOOKUP, OP_SETXATTR, nameiflags, spacetype, uap->path, ctx);
	if (uap->options & XATTR_NOFOLLOW_ANY) {
		sactx->nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}
	if (uap->options & XATTR_RESOLVE_BENEATH) {
		sactx->nd.ni_flag |= NAMEI_RESOLVE_BENEATH;
	}

	if ((error = namei(&sactx->nd))) {
		goto out;
	}
	vp = sactx->nd.ni_vp;
#if CONFIG_FILE_LEASES
	/* Break the parent-directory lease, then drop the parent's iocount. */
	vnode_breakdirlease(sactx->nd.ni_dvp, false, O_WRONLY);
	vnode_put(sactx->nd.ni_dvp);
#endif
	nameidone(&sactx->nd);

	auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
	    &sactx->uio_buf[0], sizeof(sactx->uio_buf));
	uio_addiov(auio, uap->value, uap->size);

	error = vn_setxattr(vp, sactx->attrname, auio, uap->options, ctx);
#if CONFIG_FSE
	if (error == 0) {
		add_fsevent(FSE_XATTR_MODIFIED, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
	}
#endif
	vnode_put(vp);
out:
	kfree_type(struct setxattr_ctx, sactx);
	*retval = 0;
	return error;
}
13956
13957 /*
13958 * Set the data of an extended attribute.
13959 */
int
fsetxattr(proc_t p, struct fsetxattr_args *uap, int *retval)
{
	vnode_t vp;
	char attrname[XATTR_MAXNAMELEN + 1];
	vfs_context_t ctx = vfs_context_current();
	uio_t auio = NULL;
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	size_t namelen;
	int error;
	UIO_STACKBUF(uio_buf, 1);

	/* Path-lookup-only options make no sense on an open descriptor. */
	if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT |
	    XATTR_NOFOLLOW_ANY | XATTR_RESOLVE_BENEATH)) {
		return EINVAL;
	}

	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
	if (error != 0) {
		if (error == EPERM) {
			/* if the string won't fit in attrname, copyinstr emits EPERM */
			return ENAMETOOLONG;
		}
		/* Otherwise return the default error from copyinstr to detect ERANGE, etc */
		return error;
	}
	/* System-reserved names may only be set with the entitlement. */
	if (xattr_protected(attrname) &&
	    (error = xattr_entitlement_check(attrname, ctx, true)) != 0) {
		return error;
	}
	if (uap->size != 0 && uap->value == 0) {
		return EINVAL;
	}
	if (uap->size > INT_MAX) {
		return E2BIG;
	}
	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

#if CONFIG_FILE_LEASES
	/* Break any relevant directory lease before modifying the xattrs. */
	vnode_breakdirlease(vp, true, O_WRONLY);
#endif

	auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
	    &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, uap->value, uap->size);

	error = vn_setxattr(vp, attrname, auio, uap->options, vfs_context_current());
#if CONFIG_FSE
	if (error == 0) {
		add_fsevent(FSE_XATTR_MODIFIED, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
	}
#endif
	vnode_put(vp);
	file_drop(uap->fd);
	*retval = 0;
	return error;
}
14025
14026 /*
14027 * Remove an extended attribute.
14028 * XXX Code duplication here.
14029 */
14030 int
removexattr(proc_t p,struct removexattr_args * uap,int * retval)14031 removexattr(proc_t p, struct removexattr_args *uap, int *retval)
14032 {
14033 vnode_t vp;
14034 struct nameidata nd;
14035 char attrname[XATTR_MAXNAMELEN + 1];
14036 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
14037 vfs_context_t ctx = vfs_context_current();
14038 size_t namelen;
14039 u_int32_t nameiflags;
14040 int error;
14041
14042 if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
14043 return EINVAL;
14044 }
14045
14046 error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
14047 if (error != 0) {
14048 return error;
14049 }
14050 if (xattr_protected(attrname)) {
14051 return EPERM;
14052 }
14053 nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
14054 #if CONFIG_FILE_LEASES
14055 nameiflags |= WANTPARENT;
14056 #endif
14057 NDINIT(&nd, LOOKUP, OP_REMOVEXATTR, nameiflags, spacetype, uap->path, ctx);
14058 if (uap->options & XATTR_NOFOLLOW_ANY) {
14059 nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
14060 }
14061 if (uap->options & XATTR_RESOLVE_BENEATH) {
14062 nd.ni_flag |= NAMEI_RESOLVE_BENEATH;
14063 }
14064
14065 if ((error = namei(&nd))) {
14066 return error;
14067 }
14068 vp = nd.ni_vp;
14069 #if CONFIG_FILE_LEASES
14070 vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
14071 vnode_put(nd.ni_dvp);
14072 #endif
14073 nameidone(&nd);
14074
14075 error = vn_removexattr(vp, attrname, uap->options, ctx);
14076 #if CONFIG_FSE
14077 if (error == 0) {
14078 add_fsevent(FSE_XATTR_REMOVED, ctx,
14079 FSE_ARG_VNODE, vp,
14080 FSE_ARG_DONE);
14081 }
14082 #endif
14083 vnode_put(vp);
14084 *retval = 0;
14085 return error;
14086 }
14087
14088 /*
14089 * Remove an extended attribute.
14090 * XXX Code duplication here.
14091 */
14092 int
fremovexattr(__unused proc_t p,struct fremovexattr_args * uap,int * retval)14093 fremovexattr(__unused proc_t p, struct fremovexattr_args *uap, int *retval)
14094 {
14095 vnode_t vp;
14096 char attrname[XATTR_MAXNAMELEN + 1];
14097 size_t namelen;
14098 int error;
14099 #if CONFIG_FSE
14100 vfs_context_t ctx = vfs_context_current();
14101 #endif
14102
14103 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT |
14104 XATTR_NOFOLLOW_ANY | XATTR_RESOLVE_BENEATH)) {
14105 return EINVAL;
14106 }
14107
14108 error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
14109 if (error != 0) {
14110 return error;
14111 }
14112 if (xattr_protected(attrname)) {
14113 return EPERM;
14114 }
14115 if ((error = file_vnode(uap->fd, &vp))) {
14116 return error;
14117 }
14118 if ((error = vnode_getwithref(vp))) {
14119 file_drop(uap->fd);
14120 return error;
14121 }
14122
14123 #if CONFIG_FILE_LEASES
14124 vnode_breakdirlease(vp, true, O_WRONLY);
14125 #endif
14126
14127 error = vn_removexattr(vp, attrname, uap->options, vfs_context_current());
14128 #if CONFIG_FSE
14129 if (error == 0) {
14130 add_fsevent(FSE_XATTR_REMOVED, ctx,
14131 FSE_ARG_VNODE, vp,
14132 FSE_ARG_DONE);
14133 }
14134 #endif
14135 vnode_put(vp);
14136 file_drop(uap->fd);
14137 *retval = 0;
14138 return error;
14139 }
14140
14141 /*
14142 * Retrieve the list of extended attribute names.
14143 * XXX Code duplication here.
14144 */
14145 int
listxattr(proc_t p,struct listxattr_args * uap,user_ssize_t * retval)14146 listxattr(proc_t p, struct listxattr_args *uap, user_ssize_t *retval)
14147 {
14148 vnode_t vp;
14149 struct nameidata nd;
14150 vfs_context_t ctx = vfs_context_current();
14151 uio_t auio = NULL;
14152 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
14153 size_t attrsize = 0;
14154 u_int32_t nameiflags;
14155 int error;
14156 UIO_STACKBUF(uio_buf, 1);
14157
14158 if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
14159 return EINVAL;
14160 }
14161
14162 nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
14163 NDINIT(&nd, LOOKUP, OP_LISTXATTR, nameiflags, spacetype, uap->path, ctx);
14164 if (uap->options & XATTR_NOFOLLOW_ANY) {
14165 nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
14166 }
14167 if (uap->options & XATTR_RESOLVE_BENEATH) {
14168 nd.ni_flag |= NAMEI_RESOLVE_BENEATH;
14169 }
14170
14171 if ((error = namei(&nd))) {
14172 return error;
14173 }
14174 vp = nd.ni_vp;
14175 nameidone(&nd);
14176 if (uap->namebuf != 0 && uap->bufsize > 0) {
14177 auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ,
14178 &uio_buf[0], sizeof(uio_buf));
14179 uio_addiov(auio, uap->namebuf, uap->bufsize);
14180 }
14181
14182 error = vn_listxattr(vp, auio, &attrsize, uap->options, ctx);
14183
14184 vnode_put(vp);
14185 if (auio) {
14186 *retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
14187 } else {
14188 *retval = (user_ssize_t)attrsize;
14189 }
14190 return error;
14191 }
14192
14193 /*
14194 * Retrieve the list of extended attribute names.
14195 * XXX Code duplication here.
14196 */
14197 int
flistxattr(proc_t p,struct flistxattr_args * uap,user_ssize_t * retval)14198 flistxattr(proc_t p, struct flistxattr_args *uap, user_ssize_t *retval)
14199 {
14200 vnode_t vp;
14201 uio_t auio = NULL;
14202 int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
14203 size_t attrsize = 0;
14204 int error;
14205 UIO_STACKBUF(uio_buf, 1);
14206
14207 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT |
14208 XATTR_NOFOLLOW_ANY | XATTR_RESOLVE_BENEATH)) {
14209 return EINVAL;
14210 }
14211
14212 if ((error = file_vnode(uap->fd, &vp))) {
14213 return error;
14214 }
14215 if ((error = vnode_getwithref(vp))) {
14216 file_drop(uap->fd);
14217 return error;
14218 }
14219 if (uap->namebuf != 0 && uap->bufsize > 0) {
14220 auio = uio_createwithbuffer(1, 0, spacetype,
14221 UIO_READ, &uio_buf[0], sizeof(uio_buf));
14222 uio_addiov(auio, uap->namebuf, uap->bufsize);
14223 }
14224
14225 error = vn_listxattr(vp, auio, &attrsize, uap->options, vfs_context_current());
14226
14227 vnode_put(vp);
14228 file_drop(uap->fd);
14229 if (auio) {
14230 *retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
14231 } else {
14232 *retval = (user_ssize_t)attrsize;
14233 }
14234 return error;
14235 }
14236
/*
 * Resolve a <volfs id, object id> pair to an absolute path.
 *
 * On success the path is written into 'buf' and its length is returned
 * through *pathlen.  build_path() races with concurrent renames are
 * retried a bounded number of times before giving up with ENOENT.
 */
int
fsgetpath_internal(vfs_context_t ctx, int volfs_id, uint64_t objid,
    vm_size_t bufsize, caddr_t buf, uint32_t options, int *pathlen)
{
	int error;
	vnode_t vp;
	int length;
	int bpflags;
	/* maximum number of times to retry build_path */
	unsigned int retries = 0x10;

	if (bufsize > MAXLONGPATHLEN) {
		return EINVAL;
	}

	if (buf == NULL) {
		return ENOMEM;
	}

retry:
	/* Resolve the id pair to a vnode; returns it with an iocount held. */
	error = vnode_getfromid(volfs_id, objid, ctx, options & FSOPT_ISREALFSID, &vp);
	if (error) {
		return error;
	}

#if CONFIG_MACF
	error = mac_vnode_check_fsgetpath(ctx, vp);
	if (error) {
		vnode_put(vp);
		return error;
	}
#endif

	/* Obtain the absolute path to this vnode. */
	bpflags = vfs_context_suser(ctx) ? BUILDPATH_CHECKACCESS : 0;
	if (options & FSOPT_NOFIRMLINKPATH) {
		bpflags |= BUILDPATH_NO_FIRMLINK;
	}
	bpflags |= BUILDPATH_CHECK_MOVED;
	error = build_path(vp, buf, (int)bufsize, &length, bpflags, ctx);
	vnode_put(vp);

	if (error) {
		/* there was a race building the path, try a few more times */
		if (error == EAGAIN) {
			--retries;
			if (retries > 0) {
				goto retry;
			}

			error = ENOENT;
		}
		goto out;
	}

	AUDIT_ARG(text, buf);

	/*
	 * NOTE(review): vp's iocount was dropped above; here it appears to
	 * serve only as an identifier for the trace record — confirm that
	 * kdebug_vfs_lookup() does not dereference it.
	 */
	if (kdebug_debugid_enabled(VFS_LOOKUP) && length > 0) {
		kdebug_vfs_lookup(buf, length, vp, KDBG_VFSLKUP_LOOKUP);
	}

	*pathlen = length; /* may be superseded by error */

out:
	return error;
}
14303
14304 /*
14305 * Obtain the full pathname of a file system object by id.
14306 */
14307 static int
fsgetpath_extended(user_addr_t buf,user_size_t bufsize,user_addr_t user_fsid,uint64_t objid,uint32_t options,user_ssize_t * retval)14308 fsgetpath_extended(user_addr_t buf, user_size_t bufsize, user_addr_t user_fsid, uint64_t objid,
14309 uint32_t options, user_ssize_t *retval)
14310 {
14311 vfs_context_t ctx = vfs_context_current();
14312 fsid_t fsid;
14313 char *realpath;
14314 int length;
14315 int error;
14316
14317 if (options & ~(FSOPT_NOFIRMLINKPATH | FSOPT_ISREALFSID)) {
14318 return EINVAL;
14319 }
14320
14321 if ((error = copyin(user_fsid, (caddr_t)&fsid, sizeof(fsid)))) {
14322 return error;
14323 }
14324 AUDIT_ARG(value32, fsid.val[0]);
14325 AUDIT_ARG(value64, objid);
14326 /* Restrict output buffer size for now. */
14327
14328 if (bufsize > MAXLONGPATHLEN || bufsize <= 0) {
14329 return EINVAL;
14330 }
14331 realpath = kalloc_data(bufsize, Z_WAITOK | Z_ZERO);
14332 if (realpath == NULL) {
14333 return ENOMEM;
14334 }
14335
14336 error = fsgetpath_internal(ctx, fsid.val[0], objid, bufsize, realpath,
14337 options, &length);
14338
14339 if (error) {
14340 goto out;
14341 }
14342
14343 error = copyout((caddr_t)realpath, buf, length);
14344
14345 *retval = (user_ssize_t)length; /* may be superseded by error */
14346 out:
14347 kfree_data(realpath, bufsize);
14348 return error;
14349 }
14350
14351 int
fsgetpath(__unused proc_t p,struct fsgetpath_args * uap,user_ssize_t * retval)14352 fsgetpath(__unused proc_t p, struct fsgetpath_args *uap, user_ssize_t *retval)
14353 {
14354 return fsgetpath_extended(uap->buf, uap->bufsize, uap->fsid, uap->objid,
14355 0, retval);
14356 }
14357
14358 int
fsgetpath_ext(__unused proc_t p,struct fsgetpath_ext_args * uap,user_ssize_t * retval)14359 fsgetpath_ext(__unused proc_t p, struct fsgetpath_ext_args *uap, user_ssize_t *retval)
14360 {
14361 return fsgetpath_extended(uap->buf, uap->bufsize, uap->fsid, uap->objid,
14362 uap->options, retval);
14363 }
14364
14365 /*
14366 * Common routine to handle various flavors of statfs data heading out
14367 * to user space.
14368 *
14369 * Returns: 0 Success
14370 * EFAULT
14371 */
14372 static int
munge_statfs(struct mount * mp,struct vfsstatfs * sfsp,user_addr_t bufp,int * sizep,boolean_t is_64_bit,boolean_t partial_copy)14373 munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
14374 user_addr_t bufp, int *sizep, boolean_t is_64_bit,
14375 boolean_t partial_copy)
14376 {
14377 int error;
14378 int my_size, copy_size;
14379
14380 if (is_64_bit) {
14381 struct user64_statfs sfs;
14382 my_size = copy_size = sizeof(sfs);
14383 bzero(&sfs, my_size);
14384 sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
14385 sfs.f_type = (short)mp->mnt_vtable->vfc_typenum;
14386 sfs.f_reserved1 = (short)sfsp->f_fssubtype;
14387 sfs.f_bsize = (user64_long_t)sfsp->f_bsize;
14388 sfs.f_iosize = (user64_long_t)sfsp->f_iosize;
14389 sfs.f_blocks = (user64_long_t)sfsp->f_blocks;
14390 sfs.f_bfree = (user64_long_t)sfsp->f_bfree;
14391 sfs.f_bavail = (user64_long_t)sfsp->f_bavail;
14392 sfs.f_files = (user64_long_t)sfsp->f_files;
14393 sfs.f_ffree = (user64_long_t)sfsp->f_ffree;
14394 sfs.f_fsid = sfsp->f_fsid;
14395 sfs.f_owner = sfsp->f_owner;
14396 vfs_getfstypename(mp, sfs.f_fstypename, MFSNAMELEN);
14397 strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
14398 strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);
14399
14400 if (partial_copy) {
14401 copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
14402 }
14403 error = copyout((caddr_t)&sfs, bufp, copy_size);
14404 } else {
14405 struct user32_statfs sfs;
14406
14407 my_size = copy_size = sizeof(sfs);
14408 bzero(&sfs, my_size);
14409
14410 sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
14411 sfs.f_type = (short)mp->mnt_vtable->vfc_typenum;
14412 sfs.f_reserved1 = (short)sfsp->f_fssubtype;
14413
14414 /*
14415 * It's possible for there to be more than 2^^31 blocks in the filesystem, so we
14416 * have to fudge the numbers here in that case. We inflate the blocksize in order
14417 * to reflect the filesystem size as best we can.
14418 */
14419 if ((sfsp->f_blocks > INT_MAX)
14420 /* Hack for 4061702 . I think the real fix is for Carbon to
14421 * look for some volume capability and not depend on hidden
14422 * semantics agreed between a FS and carbon.
14423 * f_blocks, f_bfree, and f_bavail set to -1 is the trigger
14424 * for Carbon to set bNoVolumeSizes volume attribute.
14425 * Without this the webdavfs files cannot be copied onto
14426 * disk as they look huge. This change should not affect
14427 * XSAN as they should not setting these to -1..
14428 */
14429 && (sfsp->f_blocks != 0xffffffffffffffffULL)
14430 && (sfsp->f_bfree != 0xffffffffffffffffULL)
14431 && (sfsp->f_bavail != 0xffffffffffffffffULL)) {
14432 int shift;
14433
14434 /*
14435 * Work out how far we have to shift the block count down to make it fit.
14436 * Note that it's possible to have to shift so far that the resulting
14437 * blocksize would be unreportably large. At that point, we will clip
14438 * any values that don't fit.
14439 *
14440 * For safety's sake, we also ensure that f_iosize is never reported as
14441 * being smaller than f_bsize.
14442 */
14443 for (shift = 0; shift < 32; shift++) {
14444 if ((sfsp->f_blocks >> shift) <= INT_MAX) {
14445 break;
14446 }
14447 if ((sfsp->f_bsize << (shift + 1)) > INT_MAX) {
14448 break;
14449 }
14450 }
14451 #define __SHIFT_OR_CLIP(x, s) ((((x) >> (s)) > INT_MAX) ? INT_MAX : ((x) >> (s)))
14452 sfs.f_blocks = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_blocks, shift);
14453 sfs.f_bfree = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bfree, shift);
14454 sfs.f_bavail = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bavail, shift);
14455 #undef __SHIFT_OR_CLIP
14456 sfs.f_bsize = (user32_long_t)(sfsp->f_bsize << shift);
14457 sfs.f_iosize = (int)lmax(sfsp->f_iosize, sfsp->f_bsize);
14458 } else {
14459 /* filesystem is small enough to be reported honestly */
14460 sfs.f_bsize = (user32_long_t)sfsp->f_bsize;
14461 sfs.f_iosize = (user32_long_t)sfsp->f_iosize;
14462 sfs.f_blocks = (user32_long_t)sfsp->f_blocks;
14463 sfs.f_bfree = (user32_long_t)sfsp->f_bfree;
14464 sfs.f_bavail = (user32_long_t)sfsp->f_bavail;
14465 }
14466 sfs.f_files = (user32_long_t)sfsp->f_files;
14467 sfs.f_ffree = (user32_long_t)sfsp->f_ffree;
14468 sfs.f_fsid = sfsp->f_fsid;
14469 sfs.f_owner = sfsp->f_owner;
14470 vfs_getfstypename(mp, sfs.f_fstypename, MFSNAMELEN);
14471 strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
14472 strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);
14473
14474 if (partial_copy) {
14475 copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
14476 }
14477 error = copyout((caddr_t)&sfs, bufp, copy_size);
14478 }
14479
14480 if (sizep != NULL) {
14481 *sizep = my_size;
14482 }
14483 return error;
14484 }
14485
14486 /*
14487 * copy stat structure into user_stat structure.
14488 */
14489 void
munge_user64_stat(struct stat * sbp,struct user64_stat * usbp)14490 munge_user64_stat(struct stat *sbp, struct user64_stat *usbp)
14491 {
14492 bzero(usbp, sizeof(*usbp));
14493
14494 usbp->st_dev = sbp->st_dev;
14495 usbp->st_ino = sbp->st_ino;
14496 usbp->st_mode = sbp->st_mode;
14497 usbp->st_nlink = sbp->st_nlink;
14498 usbp->st_uid = sbp->st_uid;
14499 usbp->st_gid = sbp->st_gid;
14500 usbp->st_rdev = sbp->st_rdev;
14501 #ifndef _POSIX_C_SOURCE
14502 usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
14503 usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
14504 usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
14505 usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
14506 usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
14507 usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
14508 #else
14509 usbp->st_atime = sbp->st_atime;
14510 usbp->st_atimensec = sbp->st_atimensec;
14511 usbp->st_mtime = sbp->st_mtime;
14512 usbp->st_mtimensec = sbp->st_mtimensec;
14513 usbp->st_ctime = sbp->st_ctime;
14514 usbp->st_ctimensec = sbp->st_ctimensec;
14515 #endif
14516 usbp->st_size = sbp->st_size;
14517 usbp->st_blocks = sbp->st_blocks;
14518 usbp->st_blksize = sbp->st_blksize;
14519 usbp->st_flags = sbp->st_flags;
14520 usbp->st_gen = sbp->st_gen;
14521 usbp->st_lspare = sbp->st_lspare;
14522 usbp->st_qspare[0] = sbp->st_qspare[0];
14523 usbp->st_qspare[1] = sbp->st_qspare[1];
14524 }
14525
/*
 * Copy a kernel 'struct stat' into the 32-bit user ABI layout.
 * Timestamps are narrowed with explicit casts (user32_time_t /
 * user32_long_t), so out-of-range values truncate.
 */
void
munge_user32_stat(struct stat *sbp, struct user32_stat *usbp)
{
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
	/* Timestamp field names depend on the _POSIX_C_SOURCE ABI variant. */
#ifndef _POSIX_C_SOURCE
	usbp->st_atimespec.tv_sec = (user32_time_t)sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = (user32_long_t)sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = (user32_time_t)sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = (user32_long_t)sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = (user32_time_t)sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = (user32_long_t)sbp->st_ctimespec.tv_nsec;
#else
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
14562
14563 /*
14564 * copy stat64 structure into user_stat64 structure.
14565 */
14566 void
munge_user64_stat64(struct stat64 * sbp,struct user64_stat64 * usbp)14567 munge_user64_stat64(struct stat64 *sbp, struct user64_stat64 *usbp)
14568 {
14569 bzero(usbp, sizeof(*usbp));
14570
14571 usbp->st_dev = sbp->st_dev;
14572 usbp->st_ino = sbp->st_ino;
14573 usbp->st_mode = sbp->st_mode;
14574 usbp->st_nlink = sbp->st_nlink;
14575 usbp->st_uid = sbp->st_uid;
14576 usbp->st_gid = sbp->st_gid;
14577 usbp->st_rdev = sbp->st_rdev;
14578 #ifndef _POSIX_C_SOURCE
14579 usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
14580 usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
14581 usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
14582 usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
14583 usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
14584 usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
14585 usbp->st_birthtimespec.tv_sec = sbp->st_birthtimespec.tv_sec;
14586 usbp->st_birthtimespec.tv_nsec = sbp->st_birthtimespec.tv_nsec;
14587 #else
14588 usbp->st_atime = sbp->st_atime;
14589 usbp->st_atimensec = sbp->st_atimensec;
14590 usbp->st_mtime = sbp->st_mtime;
14591 usbp->st_mtimensec = sbp->st_mtimensec;
14592 usbp->st_ctime = sbp->st_ctime;
14593 usbp->st_ctimensec = sbp->st_ctimensec;
14594 usbp->st_birthtime = sbp->st_birthtime;
14595 usbp->st_birthtimensec = sbp->st_birthtimensec;
14596 #endif
14597 usbp->st_size = sbp->st_size;
14598 usbp->st_blocks = sbp->st_blocks;
14599 usbp->st_blksize = sbp->st_blksize;
14600 usbp->st_flags = sbp->st_flags;
14601 usbp->st_gen = sbp->st_gen;
14602 usbp->st_lspare = sbp->st_lspare;
14603 usbp->st_qspare[0] = sbp->st_qspare[0];
14604 usbp->st_qspare[1] = sbp->st_qspare[1];
14605 }
14606
/*
 * Copy a kernel 'struct stat64' into the 32-bit user ABI layout.
 * Timestamps (including birthtime) are narrowed with explicit casts
 * (user32_time_t / user32_long_t), so out-of-range values truncate.
 */
void
munge_user32_stat64(struct stat64 *sbp, struct user32_stat64 *usbp)
{
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
	/* Timestamp field names depend on the _POSIX_C_SOURCE ABI variant. */
#ifndef _POSIX_C_SOURCE
	usbp->st_atimespec.tv_sec = (user32_time_t)sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = (user32_long_t)sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = (user32_time_t)sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = (user32_long_t)sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = (user32_time_t)sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = (user32_long_t)sbp->st_ctimespec.tv_nsec;
	usbp->st_birthtimespec.tv_sec = (user32_time_t)sbp->st_birthtimespec.tv_sec;
	usbp->st_birthtimespec.tv_nsec = (user32_long_t)sbp->st_birthtimespec.tv_nsec;
#else
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
	usbp->st_birthtime = sbp->st_birthtime;
	usbp->st_birthtimensec = sbp->st_birthtimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
14647
14648 /*
14649 * Purge buffer cache for simulating cold starts
14650 */
14651 static int
vnode_purge_callback(struct vnode * vp,__unused void * cargs)14652 vnode_purge_callback(struct vnode *vp, __unused void *cargs)
14653 {
14654 ubc_msync(vp, (off_t)0, ubc_getsize(vp), NULL /* off_t *resid_off */, UBC_PUSHALL | UBC_INVALIDATE);
14655
14656 return VNODE_RETURNED;
14657 }
14658
14659 static int
vfs_purge_callback(mount_t mp,__unused void * arg)14660 vfs_purge_callback(mount_t mp, __unused void * arg)
14661 {
14662 vnode_iterate(mp, VNODE_WAIT | VNODE_ITERATE_ALL, vnode_purge_callback, NULL);
14663
14664 return VFS_RETURNED;
14665 }
14666
/*
 * Boot-arg / sysctl "vfs.purge_vm_pagers" (default TRUE): when set,
 * vfs_purge() also purges file-backed VM pagers.
 */
static TUNABLE_WRITEABLE(boolean_t, vfs_purge_vm_pagers, "vfs_purge_vm_pagers", TRUE);
SYSCTL_INT(_vfs, OID_AUTO, purge_vm_pagers, CTLFLAG_RW | CTLFLAG_LOCKED, &vfs_purge_vm_pagers, 0, "VFS purge also purges file-backed VM pagers");
14669
14670 int
vfs_purge(__unused struct proc * p,__unused struct vfs_purge_args * uap,__unused int32_t * retval)14671 vfs_purge(__unused struct proc *p, __unused struct vfs_purge_args *uap, __unused int32_t *retval)
14672 {
14673 if (!kauth_cred_issuser(kauth_cred_get())) {
14674 return EPERM;
14675 }
14676
14677 vfs_iterate(0 /* flags */, vfs_purge_callback, NULL);
14678
14679 /* also flush any VM pagers backed by files */
14680 if (vfs_purge_vm_pagers) {
14681 vm_purge_filebacked_pagers();
14682 }
14683
14684 return 0;
14685 }
14686
14687 /*
14688 * gets the vnode associated with the (unnamed) snapshot directory
14689 * for a Filesystem. The snapshot directory vnode is returned with
14690 * an iocount on it.
14691 */
14692 int
vnode_get_snapdir(vnode_t rvp,vnode_t * sdvpp,vfs_context_t ctx)14693 vnode_get_snapdir(vnode_t rvp, vnode_t *sdvpp, vfs_context_t ctx)
14694 {
14695 return VFS_VGET_SNAPDIR(vnode_mount(rvp), sdvpp, ctx);
14696 }
14697
14698 /*
14699 * Get the snapshot vnode.
14700 *
14701 * If successful, the call returns with an iocount on *rvpp ,*sdvpp and
14702 * needs nameidone() on ndp.
14703 *
14704 * If the snapshot vnode exists it is returned in ndp->ni_vp.
14705 *
14706 * If it returns with an error, *rvpp, *sdvpp are NULL and nameidone() is
14707 * not needed.
14708 */
14709 static int
vnode_get_snapshot(int dirfd,vnode_t * rvpp,vnode_t * sdvpp,user_addr_t name,struct nameidata * ndp,int32_t op,__unused enum path_operation pathop,vfs_context_t ctx)14710 vnode_get_snapshot(int dirfd, vnode_t *rvpp, vnode_t *sdvpp,
14711 user_addr_t name, struct nameidata *ndp, int32_t op,
14712 #if !CONFIG_TRIGGERS
14713 __unused
14714 #endif
14715 enum path_operation pathop,
14716 vfs_context_t ctx)
14717 {
14718 int error, i;
14719 caddr_t name_buf;
14720 size_t name_len;
14721 struct vfs_attr vfa;
14722
14723 *sdvpp = NULLVP;
14724 *rvpp = NULLVP;
14725
14726 error = vnode_getfromfd(ctx, dirfd, rvpp);
14727 if (error) {
14728 return error;
14729 }
14730
14731 if (!vnode_isvroot(*rvpp)) {
14732 error = EINVAL;
14733 goto out;
14734 }
14735
14736 /* Make sure the filesystem supports snapshots */
14737 VFSATTR_INIT(&vfa);
14738 VFSATTR_WANTED(&vfa, f_capabilities);
14739 if ((vfs_getattr(vnode_mount(*rvpp), &vfa, ctx) != 0) ||
14740 !VFSATTR_IS_SUPPORTED(&vfa, f_capabilities) ||
14741 !((vfa.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] &
14742 VOL_CAP_INT_SNAPSHOT)) ||
14743 !((vfa.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] &
14744 VOL_CAP_INT_SNAPSHOT))) {
14745 error = ENOTSUP;
14746 goto out;
14747 }
14748
14749 error = vnode_get_snapdir(*rvpp, sdvpp, ctx);
14750 if (error) {
14751 goto out;
14752 }
14753
14754 name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
14755 error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
14756 if (error) {
14757 goto out1;
14758 }
14759
14760 /*
14761 * Some sanity checks- name can't be empty, "." or ".." or have slashes.
14762 * (the length returned by copyinstr includes the terminating NUL)
14763 */
14764 if ((name_len == 1) || (name_len == 2 && name_buf[0] == '.') ||
14765 (name_len == 3 && name_buf[0] == '.' && name_buf[1] == '.')) {
14766 error = EINVAL;
14767 goto out1;
14768 }
14769 for (i = 0; i < (int)name_len && name_buf[i] != '/'; i++) {
14770 ;
14771 }
14772 if (i < (int)name_len) {
14773 error = EINVAL;
14774 goto out1;
14775 }
14776
14777 #if CONFIG_MACF
14778 if (op == CREATE) {
14779 error = mac_mount_check_snapshot_create(ctx, vnode_mount(*rvpp),
14780 name_buf);
14781 } else if (op == DELETE) {
14782 error = mac_mount_check_snapshot_delete(ctx, vnode_mount(*rvpp),
14783 name_buf);
14784 }
14785 if (error) {
14786 goto out1;
14787 }
14788 #endif
14789
14790 /* Check if the snapshot already exists ... */
14791 NDINIT(ndp, op, pathop, USEDVP | NOCACHE | AUDITVNPATH1,
14792 UIO_SYSSPACE, CAST_USER_ADDR_T(name_buf), ctx);
14793 ndp->ni_dvp = *sdvpp;
14794
14795 error = namei(ndp);
14796 out1:
14797 zfree(ZV_NAMEI, name_buf);
14798 out:
14799 if (error) {
14800 if (*sdvpp) {
14801 vnode_put(*sdvpp);
14802 *sdvpp = NULLVP;
14803 }
14804 if (*rvpp) {
14805 vnode_put(*rvpp);
14806 *rvpp = NULLVP;
14807 }
14808 }
14809 return error;
14810 }
14811
14812 /*
14813 * create a filesystem snapshot (for supporting filesystems)
14814 *
14815 * A much simplified version of openat(dirfd, name, O_CREAT | O_EXCL)
14816 * We get to the (unnamed) snapshot directory vnode and create the vnode
14817 * for the snapshot in it.
14818 *
14819 * Restrictions:
14820 *
14821 * a) Passed in name for snapshot cannot have slashes.
14822 * b) name can't be "." or ".."
14823 *
14824 * Since this requires superuser privileges, vnode_authorize calls are not
14825 * made.
14826 */
14827 static int __attribute__((noinline))
snapshot_create(int dirfd,user_addr_t name,uint32_t flags,vfs_context_t ctx)14828 snapshot_create(int dirfd, user_addr_t name, uint32_t flags,
14829 vfs_context_t ctx)
14830 {
14831 vnode_t rvp, snapdvp;
14832 int error;
14833 struct nameidata *ndp;
14834
14835 /* No flags are currently defined */
14836 if (flags) {
14837 printf("snapshot_create: Invalid flags passed 0x%x\n", flags);
14838 return EINVAL;
14839 }
14840
14841 ndp = kalloc_type(struct nameidata, Z_WAITOK);
14842
14843 error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, ndp, CREATE,
14844 OP_LINK, ctx);
14845 if (error) {
14846 goto out;
14847 }
14848
14849 if (ndp->ni_vp) {
14850 vnode_put(ndp->ni_vp);
14851 error = EEXIST;
14852 } else {
14853 struct vnode_attr *vap;
14854 vnode_t vp = NULLVP;
14855
14856 vap = kalloc_type(struct vnode_attr, Z_WAITOK);
14857
14858 VATTR_INIT(vap);
14859 VATTR_SET(vap, va_type, VREG);
14860 VATTR_SET(vap, va_mode, 0);
14861
14862 error = vn_create(snapdvp, &vp, ndp, vap,
14863 VN_CREATE_NOAUTH | VN_CREATE_NOINHERIT, 0, NULL, ctx);
14864 if (!error && vp) {
14865 vnode_put(vp);
14866 }
14867
14868 kfree_type(struct vnode_attr, vap);
14869 }
14870
14871 nameidone(ndp);
14872 vnode_put(snapdvp);
14873 vnode_put(rvp);
14874 out:
14875 kfree_type(struct nameidata, ndp);
14876
14877 return error;
14878 }
14879
14880 /*
14881 * Delete a Filesystem snapshot
14882 *
14883 * get the vnode for the unnamed snapshot directory and the snapshot and
14884 * delete the snapshot.
14885 */
14886 static int __attribute__((noinline))
snapshot_delete(int dirfd,user_addr_t name,uint32_t flags,vfs_context_t ctx)14887 snapshot_delete(int dirfd, user_addr_t name, uint32_t flags,
14888 vfs_context_t ctx)
14889 {
14890 vnode_t rvp, snapdvp;
14891 int error;
14892 struct nameidata *ndp;
14893
14894 /* No flags are currently defined */
14895 if (flags) {
14896 printf("snapshot_delete: Invalid flags passed 0x%x\n", flags);
14897 return EINVAL;
14898 }
14899
14900 ndp = kalloc_type(struct nameidata, Z_WAITOK);
14901
14902 error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, ndp, DELETE,
14903 OP_UNLINK, ctx);
14904 if (error) {
14905 goto out;
14906 }
14907
14908 error = VNOP_REMOVE(snapdvp, ndp->ni_vp, &ndp->ni_cnd,
14909 VNODE_REMOVE_SKIP_NAMESPACE_EVENT, ctx);
14910
14911 vnode_put(ndp->ni_vp);
14912 nameidone(ndp);
14913 vnode_put(snapdvp);
14914 vnode_put(rvp);
14915 out:
14916 kfree_type(struct nameidata, ndp);
14917
14918 return error;
14919 }
14920
14921 /*
14922 * Revert a filesystem to a snapshot
14923 *
14924 * Marks the filesystem to revert to the given snapshot on next mount.
14925 */
14926 static int __attribute__((noinline))
snapshot_revert(int dirfd,user_addr_t name,uint32_t flags,vfs_context_t ctx)14927 snapshot_revert(int dirfd, user_addr_t name, uint32_t flags,
14928 vfs_context_t ctx)
14929 {
14930 int error;
14931 vnode_t rvp;
14932 mount_t mp;
14933 struct fs_snapshot_revert_args revert_data;
14934 struct componentname cnp;
14935 caddr_t name_buf;
14936 size_t name_len;
14937
14938 /* No flags are currently defined */
14939 if (flags) {
14940 printf("snapshot_revert: Invalid flags passed 0x%x\n", flags);
14941 return EINVAL;
14942 }
14943
14944 error = vnode_getfromfd(ctx, dirfd, &rvp);
14945 if (error) {
14946 return error;
14947 }
14948 mp = vnode_mount(rvp);
14949
14950 name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
14951 error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
14952 if (error) {
14953 zfree(ZV_NAMEI, name_buf);
14954 vnode_put(rvp);
14955 return error;
14956 }
14957
14958 #if CONFIG_MACF
14959 error = mac_mount_check_snapshot_revert(ctx, mp, name_buf);
14960 if (error) {
14961 zfree(ZV_NAMEI, name_buf);
14962 vnode_put(rvp);
14963 return error;
14964 }
14965 #endif
14966
14967 /*
14968 * Grab mount_iterref so that we can release the vnode,
14969 * since VFSIOC_REVERT_SNAPSHOT could conceivably cause a sync.
14970 */
14971 error = mount_iterref(mp, 0);
14972 vnode_put(rvp);
14973 if (error) {
14974 zfree(ZV_NAMEI, name_buf);
14975 return error;
14976 }
14977
14978 memset(&cnp, 0, sizeof(cnp));
14979 cnp.cn_pnbuf = (char *)name_buf;
14980 cnp.cn_nameiop = LOOKUP;
14981 cnp.cn_flags = ISLASTCN | HASBUF;
14982 cnp.cn_pnlen = MAXPATHLEN;
14983 cnp.cn_nameptr = cnp.cn_pnbuf;
14984 cnp.cn_namelen = (int)name_len;
14985 revert_data.sr_cnp = &cnp;
14986
14987 error = VFS_IOCTL(mp, VFSIOC_REVERT_SNAPSHOT, (caddr_t)&revert_data, 0, ctx);
14988 mount_iterdrop(mp);
14989 zfree(ZV_NAMEI, name_buf);
14990
14991 if (error) {
14992 /* If there was any error, try again using VNOP_IOCTL */
14993
14994 vnode_t snapdvp;
14995 struct nameidata namend;
14996
14997 error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, &namend, LOOKUP,
14998 OP_LOOKUP, ctx);
14999 if (error) {
15000 return error;
15001 }
15002
15003
15004 error = VNOP_IOCTL(namend.ni_vp, APFSIOC_REVERT_TO_SNAPSHOT, (caddr_t) NULL,
15005 0, ctx);
15006
15007 vnode_put(namend.ni_vp);
15008 nameidone(&namend);
15009 vnode_put(snapdvp);
15010 vnode_put(rvp);
15011 }
15012
15013 return error;
15014 }
15015
15016 /*
15017 * rename a Filesystem snapshot
15018 *
15019 * get the vnode for the unnamed snapshot directory and the snapshot and
15020 * rename the snapshot. This is a very specialised (and simple) case of
15021 * rename(2) (which has to deal with a lot more complications). It differs
15022 * slightly from rename(2) in that EEXIST is returned if the new name exists.
15023 */
static int __attribute__((noinline))
snapshot_rename(int dirfd, user_addr_t old, user_addr_t new,
    uint32_t flags, vfs_context_t ctx)
{
	vnode_t rvp, snapdvp;
	int error, i;
	caddr_t newname_buf;
	size_t name_len;
	vnode_t fvp;
	struct nameidata *fromnd, *tond;
	/* carving out a chunk for structs that are too big to be on stack. */
	struct {
		struct nameidata from_node;
		struct nameidata to_node;
	} * __rename_data;

	/* No flags are currently defined */
	if (flags) {
		printf("snapshot_rename: Invalid flags passed 0x%x\n", flags);
		return EINVAL;
	}

	__rename_data = kalloc_type(typeof(*__rename_data), Z_WAITOK);
	fromnd = &__rename_data->from_node;
	tond = &__rename_data->to_node;

	/*
	 * Look up the existing snapshot (DELETE nameiop, as for the source of
	 * a rename).  On success this returns iocounts on rvp, snapdvp and
	 * fromnd->ni_vp, all dropped at out1.
	 */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, old, fromnd, DELETE,
	    OP_UNLINK, ctx);
	if (error) {
		goto out;
	}
	fvp = fromnd->ni_vp;

	newname_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	error = copyinstr(new, newname_buf, MAXPATHLEN, &name_len);
	if (error) {
		goto out1;
	}

	/*
	 * Some sanity checks- new name can't be empty, "." or ".." or have
	 * slashes.
	 * (the length returned by copyinstr includes the terminating NUL)
	 *
	 * The FS rename VNOP is supposed to handle this but we'll pick it
	 * off here itself.
	 */
	if ((name_len == 1) || (name_len == 2 && newname_buf[0] == '.') ||
	    (name_len == 3 && newname_buf[0] == '.' && newname_buf[1] == '.')) {
		error = EINVAL;
		goto out1;
	}
	/* Scan for a '/': path separators are not allowed in a snapshot name. */
	for (i = 0; i < (int)name_len && newname_buf[i] != '/'; i++) {
		;
	}
	if (i < (int)name_len) {
		error = EINVAL;
		goto out1;
	}

#if CONFIG_MACF
	error = mac_mount_check_snapshot_create(ctx, vnode_mount(rvp),
	    newname_buf);
	if (error) {
		goto out1;
	}
#endif

	/*
	 * Look up the destination name relative to the snapshot directory.
	 * The name was already copied into kernel space, hence UIO_SYSSPACE.
	 */
	NDINIT(tond, RENAME, OP_RENAME, USEDVP | NOCACHE | AUDITVNPATH2,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(newname_buf), ctx);
	tond->ni_dvp = snapdvp;

	error = namei(tond);
	if (error) {
		goto out2;
	} else if (tond->ni_vp) {
		/*
		 * snapshot rename behaves differently than rename(2) - if the
		 * new name exists, EEXIST is returned.
		 */
		vnode_put(tond->ni_vp);
		error = EEXIST;
		goto out2;
	}

	/* Both source and target live in snapdvp; no target vnode (NULLVP). */
	error = VNOP_RENAME(snapdvp, fvp, &fromnd->ni_cnd, snapdvp, NULLVP,
	    &tond->ni_cnd, ctx);

out2:
	nameidone(tond);
out1:
	zfree(ZV_NAMEI, newname_buf);
	vnode_put(fvp);
	vnode_put(snapdvp);
	vnode_put(rvp);
	nameidone(fromnd);
out:
	kfree_type(typeof(*__rename_data), __rename_data);
	return error;
}
15124
15125 /*
15126 * Mount a Filesystem snapshot
15127 *
15128 * get the vnode for the unnamed snapshot directory and the snapshot and
15129 * mount the snapshot.
15130 */
static int __attribute__((noinline))
snapshot_mount(int dirfd, user_addr_t name, user_addr_t directory,
    __unused user_addr_t mnt_data, uint32_t flags, vfs_context_t ctx)
{
	mount_t mp;
	vnode_t rvp, snapdvp, snapvp, vp, pvp;
	struct fs_snapshot_mount_args smnt_data;
	int error, mount_flags = 0;
	struct nameidata *snapndp, *dirndp;
	/* carving out a chunk for structs that are too big to be on stack. */
	struct {
		struct nameidata snapnd;
		struct nameidata dirnd;
	} * __snapshot_mount_data;

	/* Check for invalid flags */
	if (flags & ~SNAPSHOT_MNT_VALIDMASK) {
		printf("snapshot_mount: Invalid flags passed 0x%x\n", flags);
		return EINVAL;
	}

	__snapshot_mount_data = kalloc_type(typeof(*__snapshot_mount_data), Z_WAITOK);
	snapndp = &__snapshot_mount_data->snapnd;
	dirndp = &__snapshot_mount_data->dirnd;

	/*
	 * Look up the snapshot to mount.  On success this returns iocounts on
	 * rvp, snapdvp and snapndp->ni_vp, all dropped at out1.
	 */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, snapndp, LOOKUP,
	    OP_LOOKUP, ctx);
	if (error) {
		goto out;
	}

	snapvp = snapndp->ni_vp;
	/* Refuse if the underlying mount has gone away underneath us. */
	if (!vnode_mount(rvp) || (vnode_mount(rvp) == dead_mountp)) {
		error = EIO;
		goto out1;
	}

	/* Convert snapshot_mount flags to mount flags */
	if (flags & SNAPSHOT_MNT_NOEXEC) {
		mount_flags |= MNT_NOEXEC;
	}
	if (flags & SNAPSHOT_MNT_NOSUID) {
		mount_flags |= MNT_NOSUID;
	}
	if (flags & SNAPSHOT_MNT_NODEV) {
		mount_flags |= MNT_NODEV;
	}
	if (flags & SNAPSHOT_MNT_DONTBROWSE) {
		mount_flags |= MNT_DONTBROWSE;
	}
	if (flags & SNAPSHOT_MNT_IGNORE_OWNERSHIP) {
		mount_flags |= MNT_IGNORE_OWNERSHIP;
	}
	if (flags & SNAPSHOT_MNT_NOFOLLOW) {
		mount_flags |= MNT_NOFOLLOW;
	}

	/* Get the vnode to be covered */
	NDINIT(dirndp, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
	    UIO_USERSPACE, directory, ctx);
	if (mount_flags & MNT_NOFOLLOW) {
		dirndp->ni_flag |= NAMEI_NOFOLLOW_ANY;
	}

	error = namei(dirndp);
	if (error) {
		goto out1;
	}

	vp = dirndp->ni_vp;
	pvp = dirndp->ni_dvp;
	mp = vnode_mount(rvp);

	/* Disallow covering the root of the root filesystem. */
	if ((vp->v_flag & VROOT) && (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
		error = EINVAL;
		goto out2;
	}

#if CONFIG_MACF
	error = mac_mount_check_snapshot_mount(ctx, rvp, vp, &dirndp->ni_cnd, snapndp->ni_cnd.cn_nameptr,
	    mp->mnt_vfsstat.f_fstypename);
	if (error) {
		goto out2;
	}
#endif

	/*
	 * Hand the snapshot's mount and componentname to mount_common via the
	 * KERNEL_MOUNT_SNAPSHOT path; smnt_data takes the place of user data.
	 */
	smnt_data.sm_mp = mp;
	smnt_data.sm_cnp = &snapndp->ni_cnd;
	error = mount_common(mp->mnt_vfsstat.f_fstypename, pvp, vp,
	    &dirndp->ni_cnd, CAST_USER_ADDR_T(&smnt_data), mount_flags,
	    KERNEL_MOUNT_SNAPSHOT, NULL, ctx);

out2:
	vnode_put(vp);
	vnode_put(pvp);
	nameidone(dirndp);
out1:
	vnode_put(snapvp);
	vnode_put(snapdvp);
	vnode_put(rvp);
	nameidone(snapndp);
out:
	kfree_type(typeof(*__snapshot_mount_data), __snapshot_mount_data);
	return error;
}
15236
15237 /*
15238 * Root from a snapshot of the filesystem
15239 *
15240 * Marks the filesystem to root from the given snapshot on next boot.
15241 */
static int __attribute__((noinline))
snapshot_root(int dirfd, user_addr_t name, uint32_t flags,
    vfs_context_t ctx)
{
	int error;
	vnode_t rvp;
	mount_t mp;
	struct fs_snapshot_root_args root_data;
	struct componentname cnp;
	caddr_t name_buf;
	size_t name_len;

	/* No flags are currently defined */
	if (flags) {
		printf("snapshot_root: Invalid flags passed 0x%x\n", flags);
		return EINVAL;
	}

	/* Take an iocount on the fd's vnode; its mount is the target. */
	error = vnode_getfromfd(ctx, dirfd, &rvp);
	if (error) {
		return error;
	}
	mp = vnode_mount(rvp);

	/* Copy the snapshot name in from user space. */
	name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
	if (error) {
		zfree(ZV_NAMEI, name_buf);
		vnode_put(rvp);
		return error;
	}

	// XXX MAC checks ?

	/*
	 * Grab mount_iterref so that we can release the vnode,
	 * since VFSIOC_ROOT_SNAPSHOT could conceivably cause a sync.
	 */
	error = mount_iterref(mp, 0);
	vnode_put(rvp);
	if (error) {
		zfree(ZV_NAMEI, name_buf);
		return error;
	}

	/*
	 * Build a componentname describing the snapshot for the ioctl.
	 * name_len from copyinstr includes the terminating NUL.
	 */
	memset(&cnp, 0, sizeof(cnp));
	cnp.cn_pnbuf = (char *)name_buf;
	cnp.cn_nameiop = LOOKUP;
	cnp.cn_flags = ISLASTCN | HASBUF;
	cnp.cn_pnlen = MAXPATHLEN;
	cnp.cn_nameptr = cnp.cn_pnbuf;
	cnp.cn_namelen = (int)name_len;
	root_data.sr_cnp = &cnp;

	error = VFS_IOCTL(mp, VFSIOC_ROOT_SNAPSHOT, (caddr_t)&root_data, 0, ctx);

	mount_iterdrop(mp);
	zfree(ZV_NAMEI, name_buf);

	return error;
}
15303
15304 static boolean_t
vfs_context_can_snapshot(vfs_context_t ctx)15305 vfs_context_can_snapshot(vfs_context_t ctx)
15306 {
15307 static const char * const snapshot_entitlements[] = {
15308 "com.apple.private.vfs.snapshot",
15309 "com.apple.developer.vfs.snapshot",
15310 "com.apple.private.apfs.arv.limited.snapshot",
15311 };
15312 static const size_t nentitlements =
15313 sizeof(snapshot_entitlements) / sizeof(snapshot_entitlements[0]);
15314 size_t i;
15315
15316 task_t task = vfs_context_task(ctx);
15317 for (i = 0; i < nentitlements; i++) {
15318 if (IOTaskHasEntitlement(task, snapshot_entitlements[i])) {
15319 return TRUE;
15320 }
15321 }
15322 return FALSE;
15323 }
15324
15325 /*
15326 * FS snapshot operations dispatcher
15327 */
int
fs_snapshot(__unused proc_t p, struct fs_snapshot_args *uap,
    __unused int32_t *retval)
{
	int error;
	vfs_context_t ctx = vfs_context_current();

	AUDIT_ARG(fd, uap->dirfd);
	AUDIT_ARG(value32, uap->op);

	/* All snapshot operations require one of the snapshot entitlements. */
	if (!vfs_context_can_snapshot(ctx)) {
		return EPERM;
	}

	/*
	 * Enforce user authorization for snapshot modification operations,
	 * or if trying to root from snapshot.
	 */
	if (uap->op != SNAPSHOT_OP_MOUNT) {
		vnode_t dvp = NULLVP;
		vnode_t devvp = NULLVP;
		mount_t mp;

		error = vnode_getfromfd(ctx, uap->dirfd, &dvp);
		if (error) {
			return error;
		}
		mp = vnode_mount(dvp);
		devvp = mp->mnt_devvp;

		/* get an iocount on devvp */
		if (devvp == NULLVP) {
			error = vnode_lookup(mp->mnt_vfsstat.f_mntfromname, 0, &devvp, ctx);
			/* for mounts which aren't block devices */
			if (error == ENOENT) {
				error = ENXIO;
			}
		} else {
			error = vnode_getwithref(devvp);
		}

		if (error) {
			vnode_put(dvp);
			return error;
		}

		/*
		 * Allow the operation if the caller is superuser, can write
		 * the backing device, or holds the user-snapshot entitlement.
		 */
		if ((vfs_context_issuser(ctx) == 0) &&
		    (vnode_authorize(devvp, NULL, KAUTH_VNODE_WRITE_DATA, ctx) != 0) &&
		    (!IOTaskHasEntitlement(vfs_context_task(ctx), "com.apple.private.vfs.snapshot.user"))) {
			error = EPERM;
		}
		vnode_put(dvp);
		vnode_put(devvp);

		if (error) {
			return error;
		}
	}

	/* Dispatch to the per-operation handler. */
	switch (uap->op) {
	case SNAPSHOT_OP_CREATE:
		error = snapshot_create(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
	case SNAPSHOT_OP_DELETE:
		error = snapshot_delete(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
	case SNAPSHOT_OP_RENAME:
		error = snapshot_rename(uap->dirfd, uap->name1, uap->name2,
		    uap->flags, ctx);
		break;
	case SNAPSHOT_OP_MOUNT:
		error = snapshot_mount(uap->dirfd, uap->name1, uap->name2,
		    uap->data, uap->flags, ctx);
		break;
	case SNAPSHOT_OP_REVERT:
		error = snapshot_revert(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
#if CONFIG_MNT_ROOTSNAP
	case SNAPSHOT_OP_ROOT:
		error = snapshot_root(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
#endif /* CONFIG_MNT_ROOTSNAP */
	default:
		error = ENOSYS;
	}

	return error;
}
15416