1 /*
2 * Copyright (c) 1995-2022 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * Copyright (c) 1989, 1993
30 * The Regents of the University of California. All rights reserved.
31 * (c) UNIX System Laboratories, Inc.
32 * All or some portions of this file are derived from material licensed
33 * to the University of California by American Telephone and Telegraph
34 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
35 * the permission of UNIX System Laboratories, Inc.
36 *
37 * Redistribution and use in source and binary forms, with or without
38 * modification, are permitted provided that the following conditions
39 * are met:
40 * 1. Redistributions of source code must retain the above copyright
41 * notice, this list of conditions and the following disclaimer.
42 * 2. Redistributions in binary form must reproduce the above copyright
43 * notice, this list of conditions and the following disclaimer in the
44 * documentation and/or other materials provided with the distribution.
45 * 3. All advertising materials mentioning features or use of this software
46 * must display the following acknowledgement:
47 * This product includes software developed by the University of
48 * California, Berkeley and its contributors.
49 * 4. Neither the name of the University nor the names of its contributors
50 * may be used to endorse or promote products derived from this software
51 * without specific prior written permission.
52 *
53 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63 * SUCH DAMAGE.
64 *
65 * @(#)vfs_syscalls.c 8.41 (Berkeley) 6/15/95
66 */
67 /*
68 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
69 * support for mandatory and extensible security protections. This notice
70 * is included in support of clause 2.2 (b) of the Apple Public License,
71 * Version 2.0.
72 */
73
74 #include <sys/param.h>
75 #include <sys/systm.h>
76 #include <sys/namei.h>
77 #include <sys/filedesc.h>
78 #include <sys/kernel.h>
79 #include <sys/file_internal.h>
80 #include <sys/stat.h>
81 #include <sys/vnode_internal.h>
82 #include <sys/mount_internal.h>
83 #include <sys/proc_internal.h>
84 #include <sys/kauth.h>
85 #include <sys/uio_internal.h>
86 #include <kern/kalloc.h>
87 #include <sys/mman.h>
88 #include <sys/dirent.h>
89 #include <sys/attr.h>
90 #include <sys/sysctl.h>
91 #include <sys/ubc.h>
92 #include <sys/quota.h>
93 #include <sys/kdebug.h>
94 #include <sys/fsevents.h>
95 #include <sys/imgsrc.h>
96 #include <sys/sysproto.h>
97 #include <sys/sysctl.h>
98 #include <sys/xattr.h>
99 #include <sys/fcntl.h>
100 #include <sys/stdio.h>
101 #include <sys/fsctl.h>
102 #include <sys/ubc_internal.h>
103 #include <sys/disk.h>
104 #include <sys/content_protection.h>
105 #include <sys/clonefile.h>
106 #include <sys/snapshot.h>
107 #include <sys/priv.h>
108 #include <sys/fsgetpath.h>
109 #include <machine/cons.h>
110 #include <machine/limits.h>
111 #include <miscfs/specfs/specdev.h>
112
113 #include <vfs/vfs_disk_conditioner.h>
114 #if CONFIG_EXCLAVES
115 #include <vfs/vfs_exclave_fs.h>
116 #endif
117
118 #include <security/audit/audit.h>
119 #include <bsm/audit_kevents.h>
120
121 #include <mach/mach_types.h>
122 #include <kern/kern_types.h>
123 #include <kern/kalloc.h>
124 #include <kern/task.h>
125
126 #include <vm/vm_pageout.h>
127 #include <vm/vm_protos.h>
128 #include <vm/memory_object_xnu.h>
129
130 #include <libkern/OSAtomic.h>
131 #include <os/atomic_private.h>
132 #include <pexpert/pexpert.h>
133 #include <IOKit/IOBSD.h>
134
135 // deps for MIG call
136 #include <kern/host.h>
137 #include <kern/ipc_misc.h>
138 #include <mach/host_priv.h>
139 #include <mach/vfs_nspace.h>
140 #include <os/log.h>
141
142 #include <nfs/nfs_conf.h>
143
144 #if ROUTEFS
145 #include <miscfs/routefs/routefs.h>
146 #endif /* ROUTEFS */
147
148 #if CONFIG_MACF
149 #include <security/mac.h>
150 #include <security/mac_framework.h>
151 #endif
152
153 #if CONFIG_FSE
154 #define GET_PATH(x) \
155 ((x) = get_pathbuff())
156 #define RELEASE_PATH(x) \
157 release_pathbuff(x)
158 #else
159 #define GET_PATH(x) \
160 ((x) = zalloc(ZV_NAMEI))
161 #define RELEASE_PATH(x) \
162 zfree(ZV_NAMEI, x)
163 #endif /* CONFIG_FSE */
164
165 #ifndef HFS_GET_BOOT_INFO
166 #define HFS_GET_BOOT_INFO (FCNTL_FS_SPECIFIC_BASE + 0x00004)
167 #endif
168
169 #ifndef HFS_SET_BOOT_INFO
170 #define HFS_SET_BOOT_INFO (FCNTL_FS_SPECIFIC_BASE + 0x00005)
171 #endif
172
173 #ifndef APFSIOC_REVERT_TO_SNAPSHOT
174 #define APFSIOC_REVERT_TO_SNAPSHOT _IOW('J', 1, u_int64_t)
175 #endif
176
177 extern void disk_conditioner_unmount(mount_t mp);
178
179 /* struct for checkdirs iteration */
180 struct cdirargs {
181 vnode_t olddp;
182 vnode_t newdp;
183 };
184 /* callback for checkdirs iteration */
185 static int checkdirs_callback(proc_t p, void * arg);
186
187 static int change_dir(struct nameidata *ndp, vfs_context_t ctx);
188 static int checkdirs(vnode_t olddp, vfs_context_t ctx);
189 void enablequotas(struct mount *mp, vfs_context_t ctx);
190 static int getfsstat_callback(mount_t mp, void * arg);
191 static int getutimes(user_addr_t usrtvp, struct timespec *tsp);
192 static int setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts, int nullflag);
193 static int sync_callback(mount_t, void *);
194 static int munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
195 user_addr_t bufp, int *sizep, boolean_t is_64_bit,
196 boolean_t partial_copy);
197 static int fsync_common(proc_t p, struct fsync_args *uap, int flags);
198 static int mount_common(const char *fstypename, vnode_t pvp, vnode_t vp,
199 struct componentname *cnp, user_addr_t fsmountargs,
200 int flags, uint32_t internal_flags, char *labelstr, vfs_context_t ctx);
201 void vfs_notify_mount(vnode_t pdvp);
202
203 int prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, uint32_t internal_flags);
204
205 struct fd_vn_data * fg_vn_data_alloc(void);
206
207 /*
208 * Max retries for ENOENT returns from vn_authorize_{rmdir, unlink, rename}
209 * Concurrent lookups (or lookups by ids) on hard links can cause the
210 * vn_getpath (which does not re-enter the filesystem as vn_getpath_fsenter
211 * does) to return ENOENT as the path cannot be returned from the name cache
212 * alone. We have no option but to retry and hope to get one namei->reverse path
213 * generation done without an intervening lookup, lookup by id on the hard link
214 * item. This is only an issue for MAC hooks which cannot reenter the filesystem
215 * which currently are the MAC hooks for rename, unlink and rmdir.
216 */
217 #define MAX_AUTHORIZE_ENOENT_RETRIES 1024
218
219 /* Max retry limit for rename due to vnode recycling. */
220 #define MAX_RENAME_ERECYCLE_RETRIES 1024
221
222 #define MAX_LINK_ENOENT_RETRIES 1024
223
224 /* Max retries for concurrent mounts on the same covered vnode. */
225 #define MAX_MOUNT_RETRIES 10
226
227 static int rmdirat_internal(vfs_context_t, int, user_addr_t, enum uio_seg,
228 int unlink_flags);
229
230 #ifdef CONFIG_IMGSRC_ACCESS
231 static int authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx);
232 static int place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx);
233 static void undo_place_on_covered_vp(mount_t mp, vnode_t vp);
234 static int mount_begin_update(mount_t mp, vfs_context_t ctx, int flags);
235 static void mount_end_update(mount_t mp);
236 static int relocate_imageboot_source(vnode_t pvp, vnode_t vp, struct componentname *cnp, const char *fsname, vfs_context_t ctx, boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index);
237 #endif /* CONFIG_IMGSRC_ACCESS */
238
239 //snapshot functions
240 #if CONFIG_MNT_ROOTSNAP
241 static int __attribute__ ((noinline)) snapshot_root(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx);
242 #else
243 static int __attribute__ ((noinline)) snapshot_root(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx) __attribute__((unused));
244 #endif
245
246 __private_extern__
247 int sync_internal(void);
248
249 __private_extern__
250 int unlink1(vfs_context_t, vnode_t, user_addr_t, enum uio_seg, int);
251
252 static LCK_GRP_DECLARE(fd_vn_lck_grp, "fd_vnode_data");
253 static LCK_ATTR_DECLARE(fd_vn_lck_attr, 0, 0);
254
255 /* vars for sync mutex */
256 static LCK_GRP_DECLARE(sync_mtx_lck_grp, "sync thread");
257 static LCK_MTX_DECLARE(sync_mtx_lck, &sync_mtx_lck_grp);
258
259 extern lck_rw_t rootvnode_rw_lock;
260
261 VFS_SMR_DECLARE;
262 extern uint32_t nc_smr_enabled;
263
264 /*
265 * incremented each time a mount or unmount operation occurs
266 * used to invalidate the cached value of the rootvp in the
267 * mount structure utilized by cache_lookup_path
268 */
269 uint32_t mount_generation = 0;
270
271 /* counts number of mount and unmount operations */
272 unsigned int vfs_nummntops = 0;
273
274 /* system-wide, per-boot unique mount ID */
275 static _Atomic uint64_t mount_unique_id = 1;
276
277 extern const struct fileops vnops;
278 #if CONFIG_APPLEDOUBLE
279 extern errno_t rmdir_remove_orphaned_appleDouble(vnode_t, vfs_context_t, int *);
280 #endif /* CONFIG_APPLEDOUBLE */
281
282 /* Maximum buffer length supported by fsgetpath(2) */
283 #define FSGETPATH_MAXBUFLEN 8192
284
285 /*
286 * Virtual File System System Calls
287 */
288
289 /*
290 * Private in-kernel mounting spi (specific use-cases only)
291 */
292 boolean_t
vfs_iskernelmount(mount_t mp)293 vfs_iskernelmount(mount_t mp)
294 {
295 return (mp->mnt_kern_flag & MNTK_KERNEL_MOUNT) ? TRUE : FALSE;
296 }
297
298 __private_extern__
299 int
kernel_mount(const char * fstype,vnode_t pvp,vnode_t vp,const char * path,void * data,__unused size_t datalen,int syscall_flags,uint32_t kern_flags,vfs_context_t ctx)300 kernel_mount(const char *fstype, vnode_t pvp, vnode_t vp, const char *path,
301 void *data, __unused size_t datalen, int syscall_flags, uint32_t kern_flags,
302 vfs_context_t ctx)
303 {
304 struct nameidata nd;
305 boolean_t did_namei;
306 int error;
307
308 NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
309 UIO_SYSSPACE, CAST_USER_ADDR_T(path), ctx);
310 if (syscall_flags & MNT_NOFOLLOW) {
311 nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
312 }
313
314 kern_flags &= KERNEL_MOUNT_SANITIZE_MASK;
315
316 /*
317 * Get the vnode to be covered if it's not supplied
318 */
319 if (vp == NULLVP) {
320 error = namei(&nd);
321 if (error) {
322 if (kern_flags & (KERNEL_MOUNT_SNAPSHOT | KERNEL_MOUNT_VOLBYROLE_MASK)) {
323 printf("failed to locate mount-on path: %s ", path);
324 }
325 return error;
326 }
327 vp = nd.ni_vp;
328 pvp = nd.ni_dvp;
329 did_namei = TRUE;
330 } else {
331 char *pnbuf = CAST_DOWN(char *, path);
332
333 nd.ni_cnd.cn_pnbuf = pnbuf;
334 nd.ni_cnd.cn_pnlen = (int)(strlen(pnbuf) + 1);
335 did_namei = FALSE;
336 }
337
338 kern_flags |= KERNEL_MOUNT_KMOUNT;
339 error = mount_common(fstype, pvp, vp, &nd.ni_cnd, CAST_USER_ADDR_T(data),
340 syscall_flags, kern_flags, NULL, ctx);
341
342 if (did_namei) {
343 vnode_put(vp);
344 vnode_put(pvp);
345 nameidone(&nd);
346 }
347
348 return error;
349 }
350
351 int
vfs_mount_at_path(const char * fstype,const char * path,vnode_t pvp,vnode_t vp,void * data,size_t datalen,int mnt_flags,int flags)352 vfs_mount_at_path(const char *fstype, const char *path,
353 vnode_t pvp, vnode_t vp, void *data, size_t datalen,
354 int mnt_flags, int flags)
355 {
356 int syscall_flags = MNT_AUTOMOUNTED | mnt_flags;
357 int error, km_flags = 0;
358 vfs_context_t ctx = (flags & VFS_MOUNT_FLAG_CURRENT_CONTEXT) ? vfs_context_current() : vfs_context_kernel();
359
360 /*
361 * This call is currently restricted to specific use cases.
362 */
363 if ((strcmp(fstype, "lifs") != 0) && (strcmp(fstype, "nfs") != 0)) {
364 return ENOTSUP;
365 }
366
367 #if !defined(XNU_TARGET_OS_OSX)
368 if (strcmp(fstype, "lifs") == 0) {
369 syscall_flags |= MNT_NOEXEC;
370 }
371 #endif
372
373 if (flags & VFS_MOUNT_FLAG_NOAUTH) {
374 km_flags |= KERNEL_MOUNT_NOAUTH;
375 }
376 if (flags & VFS_MOUNT_FLAG_PERMIT_UNMOUNT) {
377 km_flags |= KERNEL_MOUNT_PERMIT_UNMOUNT;
378 }
379
380 error = kernel_mount(fstype, pvp, vp, path, data, datalen,
381 syscall_flags, km_flags, ctx);
382 if (error) {
383 printf("%s: mount on %s failed, error %d\n", __func__, path,
384 error);
385 }
386
387 return error;
388 }
389
390 /*
391 * Mount a file system.
392 */
393 /* ARGSUSED */
394 int
mount(proc_t p,struct mount_args * uap,__unused int32_t * retval)395 mount(proc_t p, struct mount_args *uap, __unused int32_t *retval)
396 {
397 struct __mac_mount_args muap;
398
399 muap.type = uap->type;
400 muap.path = uap->path;
401 muap.flags = uap->flags;
402 muap.data = uap->data;
403 muap.mac_p = USER_ADDR_NULL;
404 return __mac_mount(p, &muap, retval);
405 }
406
407 int
fmount(__unused proc_t p,struct fmount_args * uap,__unused int32_t * retval)408 fmount(__unused proc_t p, struct fmount_args *uap, __unused int32_t *retval)
409 {
410 struct componentname cn;
411 vfs_context_t ctx = vfs_context_current();
412 size_t dummy = 0;
413 int error;
414 int flags = uap->flags;
415 char fstypename[MFSNAMELEN];
416 char *labelstr = NULL; /* regular mount call always sets it to NULL for __mac_mount() */
417 vnode_t pvp;
418 vnode_t vp;
419
420 AUDIT_ARG(fd, uap->fd);
421 AUDIT_ARG(fflags, flags);
422 /* fstypename will get audited by mount_common */
423
424 /* Sanity check the flags */
425 if (flags & (MNT_IMGSRC_BY_INDEX | MNT_ROOTFS)) {
426 return ENOTSUP;
427 }
428
429 if (flags & MNT_UNION) {
430 return EPERM;
431 }
432
433 error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy);
434 if (error) {
435 return error;
436 }
437
438 if ((error = file_vnode(uap->fd, &vp)) != 0) {
439 return error;
440 }
441
442 if ((error = vnode_getwithref(vp)) != 0) {
443 file_drop(uap->fd);
444 return error;
445 }
446
447 pvp = vnode_getparent(vp);
448 if (pvp == NULL) {
449 if (vp->v_mountedhere || (vp->v_flag & VROOT) != 0) {
450 error = EBUSY;
451 } else {
452 error = EINVAL;
453 }
454 vnode_put(vp);
455 file_drop(uap->fd);
456 return error;
457 }
458
459 memset(&cn, 0, sizeof(struct componentname));
460 cn.cn_pnbuf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
461 cn.cn_pnlen = MAXPATHLEN;
462
463 if ((error = vn_getpath(vp, cn.cn_pnbuf, &cn.cn_pnlen)) != 0) {
464 zfree(ZV_NAMEI, cn.cn_pnbuf);
465 vnode_put(pvp);
466 vnode_put(vp);
467 file_drop(uap->fd);
468 return error;
469 }
470
471 error = mount_common(fstypename, pvp, vp, &cn, uap->data, flags, KERNEL_MOUNT_FMOUNT, labelstr, ctx);
472
473 zfree(ZV_NAMEI, cn.cn_pnbuf);
474 vnode_put(pvp);
475 vnode_put(vp);
476 file_drop(uap->fd);
477
478 return error;
479 }
480
481 #define MAX_GRAFT_METADATA_SIZE 16384 /* bytes */
482
483 /*
484 * Get the size of a graft file (a manifest or payload file).
485 * The vp should be an iocounted vnode.
486 */
487 static int
get_and_verify_graft_metadata_vp_size(vnode_t graft_vp,vfs_context_t vctx,size_t * size)488 get_and_verify_graft_metadata_vp_size(vnode_t graft_vp, vfs_context_t vctx, size_t *size)
489 {
490 struct stat64 sb = {};
491 int error;
492
493 *size = 0;
494
495 error = vn_stat(graft_vp, &sb, NULL, 1, 0, vctx);
496 if (error) {
497 return error;
498 }
499
500 if (sb.st_size == 0) {
501 error = ENODATA;
502 } else if ((size_t) sb.st_size > MAX_GRAFT_METADATA_SIZE) {
503 error = EFBIG;
504 } else {
505 *size = (size_t) sb.st_size;
506 }
507
508 return error;
509 }
510
511 /*
512 * Read in a graft file (a manifest or payload file) of size `size` into `buf`.
513 * `size` must already be validated.
514 */
515 static int
read_graft_metadata_vp(vnode_t graft_vp,vfs_context_t vctx,size_t size,void * buf)516 read_graft_metadata_vp(vnode_t graft_vp, vfs_context_t vctx, size_t size, void *buf)
517 {
518 return vn_rdwr(UIO_READ, graft_vp,
519 (caddr_t) buf, (int) size, /* offset */ 0,
520 UIO_SYSSPACE, IO_NOCACHE | IO_RAOFF | IO_UNIT,
521 vfs_context_ucred(vctx), /* resid */ NULL,
522 vfs_context_proc(vctx));
523 }
524
525 /*
526 * Convert a single graft file descriptor into a vnode, get its size (saving it to `size`),
527 * and read it into `buf`.
528 * If `path_prefix` is non-NULL, verify that the file path has that prefix.
529 */
530 static int
graft_secureboot_read_fd(int fd,vfs_context_t vctx,const char * path_prefix,size_t * size,void * buf)531 graft_secureboot_read_fd(int fd, vfs_context_t vctx, const char *path_prefix, size_t *size, void *buf)
532 {
533 vnode_t metadata_vp = NULLVP;
534 char *path = NULL;
535 int error;
536
537 // Convert this graft fd to a vnode.
538 if ((error = vnode_getfromfd(vctx, fd, &metadata_vp)) != 0) {
539 goto out;
540 }
541
542 // Verify that the vnode path starts with `path_prefix` if it was passed.
543 if (path_prefix) {
544 int len = MAXPATHLEN;
545 path = zalloc(ZV_NAMEI);
546 if ((error = vn_getpath(metadata_vp, path, &len))) {
547 goto out;
548 }
549 if (strncmp(path, path_prefix, strlen(path_prefix))) {
550 error = EINVAL;
551 goto out;
552 }
553 }
554
555 // Get (and validate) size information.
556 if ((error = get_and_verify_graft_metadata_vp_size(metadata_vp, vctx, size)) != 0) {
557 goto out;
558 }
559
560 // Read each file into the provided buffer - we must get the expected amount of bytes.
561 if ((error = read_graft_metadata_vp(metadata_vp, vctx, *size, buf)) != 0) {
562 goto out;
563 }
564
565 out:
566 if (path) {
567 zfree(ZV_NAMEI, path);
568 }
569 if (metadata_vp) {
570 vnode_put(metadata_vp);
571 metadata_vp = NULLVP;
572 }
573
574 return error;
575 }
576
577 #if XNU_TARGET_OS_OSX
578 #if defined(__arm64e__)
579 #define MOBILE_ASSET_DATA_VAULT_PATH "/System/Library/AssetsV2/manifests/"
580 #else /* x86_64 */
581 #define MOBILE_ASSET_DATA_VAULT_PATH "/System/Library/AssetsV2/"
582 #endif /* x86_64 */
583 #else /* !XNU_TARGET_OS_OSX */
584 #define MOBILE_ASSET_DATA_VAULT_PATH "/private/var/MobileAsset/AssetsV2/manifests/"
585 #endif /* !XNU_TARGET_OS_OSX */
586
587 /*
588 * Read graft file descriptors into buffers of size MAX_GRAFT_METADATA_SIZE
589 * provided in `gfs`, saving the size of data read in `gfs`.
590 */
591 static int
graft_secureboot_read_metadata(uint32_t graft_type,secure_boot_cryptex_args_t * sbc_args,vfs_context_t vctx,fsioc_graft_fs_t * gfs)592 graft_secureboot_read_metadata(uint32_t graft_type, secure_boot_cryptex_args_t *sbc_args,
593 vfs_context_t vctx, fsioc_graft_fs_t *gfs)
594 {
595 const char *manifest_path_prefix = NULL;
596 int error;
597
598 // For Mobile Asset, make sure that the manifest comes from a data vault.
599 if (graft_type == GRAFTDMG_CRYPTEX_MOBILE_ASSET) {
600 manifest_path_prefix = MOBILE_ASSET_DATA_VAULT_PATH;
601 }
602
603 // Read the authentic manifest.
604 if ((error = graft_secureboot_read_fd(sbc_args->sbc_authentic_manifest_fd, vctx,
605 manifest_path_prefix, &gfs->authentic_manifest_size, gfs->authentic_manifest))) {
606 return error;
607 }
608
609 // The user manifest is currently unused, but set its size.
610 gfs->user_manifest_size = 0;
611
612 // Read the payload.
613 if ((error = graft_secureboot_read_fd(sbc_args->sbc_payload_fd, vctx,
614 NULL, &gfs->payload_size, gfs->payload))) {
615 return error;
616 }
617
618 return 0;
619 }
620
621 /*
622 * Call into the filesystem to verify and graft a cryptex.
623 */
624 static int
graft_secureboot_cryptex(uint32_t graft_type,secure_boot_cryptex_args_t * sbc_args,vfs_context_t vctx,vnode_t cryptex_vp,vnode_t mounton_vp)625 graft_secureboot_cryptex(uint32_t graft_type, secure_boot_cryptex_args_t *sbc_args,
626 vfs_context_t vctx, vnode_t cryptex_vp, vnode_t mounton_vp)
627 {
628 fsioc_graft_fs_t gfs = {};
629 uint64_t graft_dir_ino = 0;
630 struct stat64 sb = {};
631 int error;
632
633 // Pre-flight arguments.
634 if (sbc_args->sbc_version != GRAFTDMG_SECURE_BOOT_CRYPTEX_ARGS_VERSION) {
635 // Make sure that this graft version matches what we support.
636 return ENOTSUP;
637 } else if (mounton_vp && cryptex_vp->v_mount != mounton_vp->v_mount) {
638 // For this type, cryptex VP must live on same volume as the target of graft.
639 return EXDEV;
640 } else if (mounton_vp && mounton_vp->v_type != VDIR) {
641 // We cannot graft upon non-directories.
642 return ENOTDIR;
643 } else if (cryptex_vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV) {
644 // We do not allow grafts inside disk images.
645 return ENODEV;
646 } else if (sbc_args->sbc_authentic_manifest_fd < 0 ||
647 sbc_args->sbc_payload_fd < 0) {
648 // We cannot graft without a manifest and payload.
649 return EINVAL;
650 }
651
652 if (mounton_vp) {
653 // Get the mounton's inode number.
654 error = vn_stat(mounton_vp, &sb, NULL, 1, 0, vctx);
655 if (error) {
656 return error;
657 }
658 graft_dir_ino = (uint64_t) sb.st_ino;
659 }
660
661 // Create buffers (of our maximum-defined size) to store authentication info.
662 gfs.authentic_manifest = kalloc_data(MAX_GRAFT_METADATA_SIZE, Z_WAITOK | Z_ZERO);
663 gfs.payload = kalloc_data(MAX_GRAFT_METADATA_SIZE, Z_WAITOK | Z_ZERO);
664
665 if (!gfs.authentic_manifest || !gfs.payload) {
666 error = ENOMEM;
667 goto out;
668 }
669
670 // Read our fd's into our buffers.
671 // (Note that this will set the buffer size fields in `gfs`.)
672 error = graft_secureboot_read_metadata(graft_type, sbc_args, vctx, &gfs);
673 if (error) {
674 goto out;
675 }
676
677 gfs.graft_version = FSIOC_GRAFT_VERSION;
678 gfs.graft_type = graft_type;
679 gfs.graft_4cc = sbc_args->sbc_4cc;
680 if (sbc_args->sbc_flags & SBC_PRESERVE_MOUNT) {
681 gfs.graft_flags |= FSCTL_GRAFT_PRESERVE_MOUNT;
682 }
683 if (sbc_args->sbc_flags & SBC_ALTERNATE_SHARED_REGION) {
684 gfs.graft_flags |= FSCTL_GRAFT_ALTERNATE_SHARED_REGION;
685 }
686 if (sbc_args->sbc_flags & SBC_SYSTEM_CONTENT) {
687 gfs.graft_flags |= FSCTL_GRAFT_SYSTEM_CONTENT;
688 }
689 if (sbc_args->sbc_flags & SBC_PANIC_ON_AUTHFAIL) {
690 gfs.graft_flags |= FSCTL_GRAFT_PANIC_ON_AUTHFAIL;
691 }
692 if (sbc_args->sbc_flags & SBC_STRICT_AUTH) {
693 gfs.graft_flags |= FSCTL_GRAFT_STRICT_AUTH;
694 }
695 if (sbc_args->sbc_flags & SBC_PRESERVE_GRAFT) {
696 gfs.graft_flags |= FSCTL_GRAFT_PRESERVE_GRAFT;
697 }
698 gfs.dir_ino = graft_dir_ino; // ino from mounton_vp (if not provided, the parent directory)
699
700 // Call into the FS to perform the graft (and validation).
701 error = VNOP_IOCTL(cryptex_vp, FSIOC_GRAFT_FS, (caddr_t)&gfs, 0, vctx);
702
703 out:
704 if (gfs.authentic_manifest) {
705 kfree_data(gfs.authentic_manifest, MAX_GRAFT_METADATA_SIZE);
706 gfs.authentic_manifest = NULL;
707 }
708 if (gfs.payload) {
709 kfree_data(gfs.payload, MAX_GRAFT_METADATA_SIZE);
710 gfs.payload = NULL;
711 }
712
713 return error;
714 }
715
716 #define GRAFTDMG_ENTITLEMENT "com.apple.private.vfs.graftdmg"
717
718 /*
719 * Graft a cryptex disk image (via FD) onto the appropriate mount-point
720 * { int graftdmg(int dmg_fd, const char *mountdir, uint32_t graft_type, graftdmg_args_un *gda); }
721 */
722 int
graftdmg(__unused proc_t p,struct graftdmg_args * uap,__unused int32_t * retval)723 graftdmg(__unused proc_t p, struct graftdmg_args *uap, __unused int32_t *retval)
724 {
725 int ua_dmgfd = uap->dmg_fd;
726 user_addr_t ua_mountdir = uap->mountdir;
727 uint32_t ua_grafttype = uap->graft_type;
728 user_addr_t ua_graftargs = uap->gda;
729
730 graftdmg_args_un kern_gda = {};
731 int error = 0;
732 secure_boot_cryptex_args_t *sbc_args = NULL;
733
734 vnode_t cryptex_vp = NULLVP;
735 vnode_t mounton_vp = NULLVP;
736 struct nameidata nd = {};
737 vfs_context_t ctx = vfs_context_current();
738
739 if (!IOTaskHasEntitlement(vfs_context_task(ctx), GRAFTDMG_ENTITLEMENT)) {
740 return EPERM;
741 }
742
743 error = copyin(ua_graftargs, &kern_gda, sizeof(graftdmg_args_un));
744 if (error) {
745 return error;
746 }
747
748 // Copy mount dir in, if provided.
749 if (ua_mountdir != USER_ADDR_NULL) {
750 // Acquire vnode for mount-on path
751 NDINIT(&nd, LOOKUP, OP_MOUNT, (FOLLOW | AUDITVNPATH1),
752 UIO_USERSPACE, ua_mountdir, ctx);
753
754 error = namei(&nd);
755 if (error) {
756 return error;
757 }
758 mounton_vp = nd.ni_vp;
759 }
760
761 // Convert fd to vnode.
762 error = vnode_getfromfd(ctx, ua_dmgfd, &cryptex_vp);
763 if (error) {
764 goto graftout;
765 }
766
767 if (ua_grafttype == 0 || ua_grafttype > GRAFTDMG_CRYPTEX_MAX) {
768 error = EINVAL;
769 } else {
770 sbc_args = &kern_gda.sbc_args;
771 error = graft_secureboot_cryptex(ua_grafttype, sbc_args, ctx, cryptex_vp, mounton_vp);
772 }
773
774 graftout:
775 if (cryptex_vp) {
776 vnode_put(cryptex_vp);
777 cryptex_vp = NULLVP;
778 }
779 if (mounton_vp) {
780 vnode_put(mounton_vp);
781 mounton_vp = NULLVP;
782 }
783 if (ua_mountdir != USER_ADDR_NULL) {
784 nameidone(&nd);
785 }
786
787 return error;
788 }
789
790 /*
791 * Ungraft a cryptex disk image (via mount dir FD)
792 * { int ungraftdmg(const char *mountdir, uint64_t flags); }
793 */
794 int
ungraftdmg(__unused proc_t p,struct ungraftdmg_args * uap,__unused int32_t * retval)795 ungraftdmg(__unused proc_t p, struct ungraftdmg_args *uap, __unused int32_t *retval)
796 {
797 int error = 0;
798 user_addr_t ua_mountdir = uap->mountdir;
799 fsioc_ungraft_fs_t ugfs;
800 vnode_t mounton_vp = NULLVP;
801 struct nameidata nd = {};
802 vfs_context_t ctx = vfs_context_current();
803
804 if (!IOTaskHasEntitlement(vfs_context_task(ctx), GRAFTDMG_ENTITLEMENT)) {
805 return EPERM;
806 }
807
808 if (uap->flags != 0 || ua_mountdir == USER_ADDR_NULL) {
809 return EINVAL;
810 }
811
812 ugfs.ungraft_flags = 0;
813
814 // Acquire vnode for mount-on path
815 NDINIT(&nd, LOOKUP, OP_MOUNT, (FOLLOW | AUDITVNPATH1),
816 UIO_USERSPACE, ua_mountdir, ctx);
817
818 error = namei(&nd);
819 if (error) {
820 return error;
821 }
822 mounton_vp = nd.ni_vp;
823
824 // Call into the FS to perform the ungraft
825 error = VNOP_IOCTL(mounton_vp, FSIOC_UNGRAFT_FS, (caddr_t)&ugfs, 0, ctx);
826
827 vnode_put(mounton_vp);
828 nameidone(&nd);
829
830 return error;
831 }
832
833
834 void
vfs_notify_mount(vnode_t pdvp)835 vfs_notify_mount(vnode_t pdvp)
836 {
837 vfs_event_signal(NULL, VQ_MOUNT, (intptr_t)NULL);
838 lock_vnode_and_post(pdvp, NOTE_WRITE);
839 }
840
841 /*
842 * __mac_mount:
843 * Mount a file system taking into account MAC label behavior.
844 * See mount(2) man page for more information
845 *
846 * Parameters: p Process requesting the mount
847 * uap User argument descriptor (see below)
848 * retval (ignored)
849 *
850 * Indirect: uap->type Filesystem type
851 * uap->path Path to mount
852 * uap->data Mount arguments
853 * uap->mac_p MAC info
854 * uap->flags Mount flags
855 *
856 *
857 * Returns: 0 Success
858 * !0 Not success
859 */
860 boolean_t root_fs_upgrade_try = FALSE;
861
862 #define MAX_NESTED_UNION_MOUNTS 10
863
int
__mac_mount(struct proc *p, register struct __mac_mount_args *uap, __unused int32_t *retval)
{
	vnode_t pvp = NULLVP;		/* parent of the covered vnode */
	vnode_t vp = NULLVP;		/* vnode to be covered by the mount */
	int need_nameidone = 0;		/* nonzero once namei() state must be torn down */
	vfs_context_t ctx = vfs_context_current();
	char fstypename[MFSNAMELEN];
	struct nameidata nd;
	size_t dummy = 0;
	char *labelstr = NULL;		/* MAC label string copied in from user space */
	size_t labelsz = 0;
	int flags = uap->flags;
	int error;
	int num_retries = 0;		/* bounds the EBUSY retry loop below */
#if CONFIG_IMGSRC_ACCESS || CONFIG_MACF
	boolean_t is_64bit = IS_64BIT_PROCESS(p);
#else
#pragma unused(p)
#endif
	/*
	 * Get the fs type name from user space
	 */
	error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy);
	if (error) {
		return error;
	}

retry:
	/*
	 * Get the vnode to be covered
	 */
	NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
	    UIO_USERSPACE, uap->path, ctx);
	if (flags & MNT_NOFOLLOW) {
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}
	error = namei(&nd);
	if (error) {
		goto out;
	}
	need_nameidone = 1;
	vp = nd.ni_vp;
	pvp = nd.ni_dvp;

#ifdef CONFIG_IMGSRC_ACCESS
	/* Mounting image source cannot be batched with other operations */
	if (flags == MNT_IMGSRC_BY_INDEX) {
		error = relocate_imageboot_source(pvp, vp, &nd.ni_cnd, fstypename,
		    ctx, is_64bit, uap->data, (flags == MNT_IMGSRC_BY_INDEX));
		goto out;
	}
#endif /* CONFIG_IMGSRC_ACCESS */

#if CONFIG_MACF
	/*
	 * Get the label string (if any) from user space
	 */
	if (uap->mac_p != USER_ADDR_NULL) {
		struct user_mac mac;
		size_t ulen = 0;

		/* Copy in the mac structure using the caller's pointer width. */
		if (is_64bit) {
			struct user64_mac mac64;
			error = copyin(uap->mac_p, &mac64, sizeof(mac64));
			mac.m_buflen = (user_size_t)mac64.m_buflen;
			mac.m_string = (user_addr_t)mac64.m_string;
		} else {
			struct user32_mac mac32;
			error = copyin(uap->mac_p, &mac32, sizeof(mac32));
			mac.m_buflen = mac32.m_buflen;
			mac.m_string = mac32.m_string;
		}
		if (error) {
			goto out;
		}
		/* Label must hold at least one char plus NUL, and stay bounded. */
		if ((mac.m_buflen > MAC_MAX_LABEL_BUF_LEN) ||
		    (mac.m_buflen < 2)) {
			error = EINVAL;
			goto out;
		}
		labelsz = mac.m_buflen;
		labelstr = kalloc_data(labelsz, Z_WAITOK);
		error = copyinstr(mac.m_string, labelstr, mac.m_buflen, &ulen);
		if (error) {
			goto out;
		}
		AUDIT_ARG(mac_string, labelstr);
	}
#endif /* CONFIG_MACF */

	AUDIT_ARG(fflags, flags);

	if (flags & MNT_UNION) {
#if CONFIG_UNION_MOUNTS
		mount_t mp = vp->v_mount;
		int nested_union_mounts = 0;

		name_cache_lock_shared();

		/* Walk up the vnodecovered chain and check for nested union mounts. */
		mp = (mp->mnt_vnodecovered ? mp->mnt_vnodecovered->v_mount : NULL);
		while (mp) {
			if (!(mp->mnt_flag & MNT_UNION)) {
				break;
			}
			mp = (mp->mnt_vnodecovered ? mp->mnt_vnodecovered->v_mount : NULL);

			/*
			 * Limit the max nested union mounts to prevent stack exhaustion
			 * when calling lookup_traverse_union().
			 */
			if (++nested_union_mounts >= MAX_NESTED_UNION_MOUNTS) {
				error = ELOOP;
				break;
			}
		}

		name_cache_unlock();
		if (error) {
			goto out;
		}
#else
		error = EPERM;
		goto out;
#endif /* CONFIG_UNION_MOUNTS */
	}

	/* Mounting over the root of the root filesystem is treated specially. */
	if ((vp->v_flag & VROOT) &&
	    (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
#if CONFIG_UNION_MOUNTS
		if (!(flags & MNT_UNION)) {
			flags |= MNT_UPDATE;
		} else {
			/*
			 * For a union mount on '/', treat it as fresh
			 * mount instead of update.
			 * Otherwise, union mounting on '/' used to panic the
			 * system before, since mnt_vnodecovered was found to
			 * be NULL for '/' which is required for unionlookup
			 * after it gets ENOENT on union mount.
			 */
			flags = (flags & ~(MNT_UPDATE));
		}
#else
		flags |= MNT_UPDATE;
#endif /* CONFIG_UNION_MOUNTS */

#if SECURE_KERNEL
		if ((flags & MNT_RDONLY) == 0) {
			/* Release kernels are not allowed to mount "/" as rw */
			error = EPERM;
			goto out;
		}
#endif

		/*
		 * See 7392553 for more details on why this check exists.
		 * Suffice to say: If this check is ON and something tries
		 * to mount the rootFS RW, we'll turn off the codesign
		 * bitmap optimization.
		 */
#if CHECK_CS_VALIDATION_BITMAP
		if ((flags & MNT_RDONLY) == 0) {
			root_fs_upgrade_try = TRUE;
		}
#endif
	}

	error = mount_common(fstypename, pvp, vp, &nd.ni_cnd, uap->data, flags, 0,
	    labelstr, ctx);

out:

#if CONFIG_MACF
	/*
	 * NOTE(review): the EBUSY retry below can come back through here a
	 * second time; this relies on kfree_data() clearing `labelstr` so the
	 * repeated free is a no-op — confirm the macro's NULL-ing semantics.
	 */
	kfree_data(labelstr, labelsz);
#endif /* CONFIG_MACF */

	if (vp) {
		vnode_put(vp);
		vp = NULLVP;
	}
	if (pvp) {
		vnode_put(pvp);
		pvp = NULLVP;
	}
	if (need_nameidone) {
		nameidone(&nd);
		need_nameidone = 0;
	}

	if (error == EBUSY) {
		/* Retry the lookup and mount again due to concurrent mounts. */
		if (++num_retries < MAX_MOUNT_RETRIES) {
			goto retry;
		}
	}

	return error;
}
1064
1065 /*
1066 * common mount implementation (final stage of mounting)
1067 *
1068 * Arguments:
1069 * fstypename file system type (ie it's vfs name)
1070 * pvp parent of covered vnode
1071 * vp covered vnode
1072 * cnp component name (ie path) of covered vnode
1073 * flags generic mount flags
1074 * fsmountargs file system specific data
1075 * labelstr optional MAC label
1076 * kernelmount TRUE for mounts initiated from inside the kernel
1077 * ctx caller's context
1078 */
1079 static int
mount_common(const char * fstypename,vnode_t pvp,vnode_t vp,struct componentname * cnp,user_addr_t fsmountargs,int flags,uint32_t internal_flags,char * labelstr,vfs_context_t ctx)1080 mount_common(const char *fstypename, vnode_t pvp, vnode_t vp,
1081 struct componentname *cnp, user_addr_t fsmountargs, int flags, uint32_t internal_flags,
1082 char *labelstr, vfs_context_t ctx)
1083 {
1084 #if !CONFIG_MACF
1085 #pragma unused(labelstr)
1086 #endif
1087 struct vnode *devvp = NULLVP;
1088 struct vnode *device_vnode = NULLVP;
1089 #if CONFIG_MACF
1090 struct vnode *rvp;
1091 #endif
1092 struct mount *mp = NULL;
1093 struct vfstable *vfsp = (struct vfstable *)0;
1094 struct proc *p = vfs_context_proc(ctx);
1095 int error, flag = 0;
1096 bool flag_set = false;
1097 user_addr_t devpath = USER_ADDR_NULL;
1098 int ronly = 0;
1099 int mntalloc = 0;
1100 boolean_t vfsp_ref = FALSE;
1101 boolean_t is_rwlock_locked = FALSE;
1102 boolean_t did_rele = FALSE;
1103 boolean_t have_usecount = FALSE;
1104 boolean_t did_set_lmount = FALSE;
1105 boolean_t did_set_vmount = FALSE;
1106 boolean_t kernelmount = !!(internal_flags & KERNEL_MOUNT_KMOUNT);
1107
1108 #if CONFIG_ROSV_STARTUP || CONFIG_MOUNT_VM || CONFIG_BASESYSTEMROOT
1109 /* Check for mutually-exclusive flag bits */
1110 uint32_t checkflags = (internal_flags & (KERNEL_MOUNT_VOLBYROLE_MASK | KERNEL_MOUNT_BASESYSTEMROOT));
1111 int bitcount = 0;
1112 while (checkflags != 0) {
1113 checkflags &= (checkflags - 1);
1114 bitcount++;
1115 }
1116
1117 if (bitcount > 1) {
1118 //not allowed to request multiple mount-by-role flags
1119 error = EINVAL;
1120 goto out1;
1121 }
1122 #endif
1123
1124 /*
1125 * Process an update for an existing mount
1126 */
1127 if (flags & MNT_UPDATE) {
1128 if ((vp->v_flag & VROOT) == 0) {
1129 error = EINVAL;
1130 goto out1;
1131 }
1132 mp = vp->v_mount;
1133
1134 /* if unmount or mount in progress, return error */
1135 mount_lock_spin(mp);
1136 if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
1137 mount_unlock(mp);
1138 error = EBUSY;
1139 goto out1;
1140 }
1141 mp->mnt_lflag |= MNT_LMOUNT;
1142 did_set_lmount = TRUE;
1143 mount_unlock(mp);
1144 lck_rw_lock_exclusive(&mp->mnt_rwlock);
1145 is_rwlock_locked = TRUE;
1146 /*
1147 * We only allow the filesystem to be reloaded if it
1148 * is currently mounted read-only.
1149 */
1150 if ((flags & MNT_RELOAD) &&
1151 ((mp->mnt_flag & MNT_RDONLY) == 0)) {
1152 error = ENOTSUP;
1153 goto out1;
1154 }
1155
1156 /*
1157 * If content protection is enabled, update mounts are not
1158 * allowed to turn it off.
1159 */
1160 if ((mp->mnt_flag & MNT_CPROTECT) &&
1161 ((flags & MNT_CPROTECT) == 0)) {
1162 error = EINVAL;
1163 goto out1;
1164 }
1165
1166 /*
1167 * can't turn off MNT_REMOVABLE either but it may be an unexpected
1168 * failure to return an error for this so we'll just silently
1169 * add it if it is not passed in.
1170 */
1171 if ((mp->mnt_flag & MNT_REMOVABLE) &&
1172 ((flags & MNT_REMOVABLE) == 0)) {
1173 flags |= MNT_REMOVABLE;
1174 }
1175
1176 /* Can't downgrade the backer of the root FS */
1177 if ((mp->mnt_kern_flag & MNTK_BACKS_ROOT) &&
1178 (!vfs_isrdonly(mp)) && (flags & MNT_RDONLY)) {
1179 error = ENOTSUP;
1180 goto out1;
1181 }
1182
1183 /*
1184 * Only root, or the user that did the original mount is
1185 * permitted to update it.
1186 */
1187 if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
1188 (error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
1189 goto out1;
1190 }
1191 #if CONFIG_MACF
1192 error = mac_mount_check_remount(ctx, mp, flags);
1193 if (error != 0) {
1194 goto out1;
1195 }
1196 #endif
1197 /*
1198 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV,
1199 * and MNT_NOEXEC if mount point is already MNT_NOEXEC.
1200 */
1201 if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
1202 flags |= MNT_NOSUID | MNT_NODEV;
1203 if (mp->mnt_flag & MNT_NOEXEC) {
1204 flags |= MNT_NOEXEC;
1205 }
1206 }
1207 flag = mp->mnt_flag;
1208 flag_set = true;
1209
1210
1211
1212 mp->mnt_flag |= flags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE);
1213
1214 vfsp = mp->mnt_vtable;
1215 goto update;
1216 } // MNT_UPDATE
1217
1218 /*
1219 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV, and
1220 * MNT_NOEXEC if mount point is already MNT_NOEXEC.
1221 */
1222 if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
1223 flags |= MNT_NOSUID | MNT_NODEV;
1224 if (vp->v_mount->mnt_flag & MNT_NOEXEC) {
1225 flags |= MNT_NOEXEC;
1226 }
1227 }
1228
1229 /* XXXAUDIT: Should we capture the type on the error path as well? */
1230 /* XXX cast-away const (audit_arg_text() does not modify its input) */
1231 AUDIT_ARG(text, (char *)(uintptr_t)fstypename);
1232 mount_list_lock();
1233 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
1234 if (!strncmp(vfsp->vfc_name, fstypename, MFSNAMELEN)) {
1235 vfsp->vfc_refcount++;
1236 vfsp_ref = TRUE;
1237 break;
1238 }
1239 }
1240 mount_list_unlock();
1241 if (vfsp == NULL) {
1242 error = ENODEV;
1243 goto out1;
1244 }
1245
1246 /*
1247 * VFC_VFSLOCALARGS is not currently supported for kernel mounts,
1248 * except in ROSV configs and for the initial BaseSystem root.
1249 */
1250 if (kernelmount && (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) &&
1251 ((internal_flags & KERNEL_MOUNT_VOLBYROLE_MASK) == 0) &&
1252 ((internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) == 0)) {
1253 error = EINVAL; /* unsupported request */
1254 goto out1;
1255 }
1256
1257 error = prepare_coveredvp(vp, ctx, cnp, fstypename, internal_flags);
1258 if (error != 0) {
1259 goto out1;
1260 }
1261
1262 /*
1263 * Upon successful of prepare_coveredvp(), VMOUNT is set for the covered vp.
1264 */
1265 did_set_vmount = TRUE;
1266
1267 /*
1268 * Allocate and initialize the filesystem (mount_t)
1269 */
1270 mp = zalloc_flags(mount_zone, Z_WAITOK | Z_ZERO);
1271 mntalloc = 1;
1272
1273 /* Initialize the default IO constraints */
1274 mp->mnt_maxreadcnt = mp->mnt_maxwritecnt = MAXPHYS;
1275 mp->mnt_segreadcnt = mp->mnt_segwritecnt = 32;
1276 mp->mnt_maxsegreadsize = mp->mnt_maxreadcnt;
1277 mp->mnt_maxsegwritesize = mp->mnt_maxwritecnt;
1278 mp->mnt_devblocksize = DEV_BSIZE;
1279 mp->mnt_alignmentmask = PAGE_MASK;
1280 mp->mnt_ioqueue_depth = MNT_DEFAULT_IOQUEUE_DEPTH;
1281 mp->mnt_ioscale = 1;
1282 mp->mnt_ioflags = 0;
1283 mp->mnt_realrootvp = NULLVP;
1284 mp->mnt_authcache_ttl = CACHED_LOOKUP_RIGHT_TTL;
1285
1286 mp->mnt_lflag |= MNT_LMOUNT;
1287 did_set_lmount = TRUE;
1288
1289 TAILQ_INIT(&mp->mnt_vnodelist);
1290 TAILQ_INIT(&mp->mnt_workerqueue);
1291 TAILQ_INIT(&mp->mnt_newvnodes);
1292 mount_lock_init(mp);
1293 lck_rw_lock_exclusive(&mp->mnt_rwlock);
1294 is_rwlock_locked = TRUE;
1295 mp->mnt_op = vfsp->vfc_vfsops;
1296 mp->mnt_vtable = vfsp;
1297 //mp->mnt_stat.f_type = vfsp->vfc_typenum;
1298 mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
1299 strlcpy(mp->mnt_vfsstat.f_fstypename, vfsp->vfc_name, MFSTYPENAMELEN);
1300 do {
1301 size_t pathlen = MAXPATHLEN;
1302
1303 if (vn_getpath_ext(vp, pvp, mp->mnt_vfsstat.f_mntonname, &pathlen, VN_GETPATH_FSENTER)) {
1304 strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);
1305 }
1306 } while (0);
1307 mp->mnt_vnodecovered = vp;
1308 mp->mnt_vfsstat.f_owner = kauth_cred_getuid(vfs_context_ucred(ctx));
1309 mp->mnt_throttle_mask = LOWPRI_MAX_NUM_DEV - 1;
1310 mp->mnt_devbsdunit = 0;
1311 mp->mnt_mount_id = os_atomic_inc_orig(&mount_unique_id, relaxed);
1312
1313 /* XXX 3762912 hack to support HFS filesystem 'owner' - filesystem may update later */
1314 vfs_setowner(mp, KAUTH_UID_NONE, KAUTH_GID_NONE);
1315
1316 if (kernelmount) {
1317 mp->mnt_kern_flag |= MNTK_KERNEL_MOUNT;
1318 }
1319 if ((internal_flags & KERNEL_MOUNT_PERMIT_UNMOUNT) != 0) {
1320 mp->mnt_kern_flag |= MNTK_PERMIT_UNMOUNT;
1321 }
1322
1323 if (KERNEL_MOUNT_DEVFS & internal_flags) {
1324 // kernel mounted devfs
1325 mp->mnt_kern_flag |= MNTK_SYSTEM;
1326 }
1327
1328 update:
1329
1330 /*
1331 * Set the mount level flags.
1332 */
1333 if (flags & MNT_RDONLY) {
1334 mp->mnt_flag |= MNT_RDONLY;
1335 } else if (mp->mnt_flag & MNT_RDONLY) {
1336 // disallow read/write upgrades of file systems that
1337 // had the TYPENAME_OVERRIDE feature set.
1338 if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
1339 error = EPERM;
1340 goto out1;
1341 }
1342 mp->mnt_kern_flag |= MNTK_WANTRDWR;
1343 }
1344 mp->mnt_flag &= ~(MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
1345 MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
1346 MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
1347 MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME | MNT_STRICTATIME |
1348 MNT_QUARANTINE | MNT_CPROTECT);
1349
1350 #if SECURE_KERNEL
1351 #if !CONFIG_MNT_SUID
1352 /*
1353 * On release builds of iOS based platforms, always enforce NOSUID on
1354 * all mounts. We do this here because we can catch update mounts as well as
1355 * non-update mounts in this case.
1356 */
1357 mp->mnt_flag |= (MNT_NOSUID);
1358 #endif
1359 #endif
1360
1361 mp->mnt_flag |= flags & (MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
1362 MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
1363 MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
1364 MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME | MNT_STRICTATIME |
1365 MNT_QUARANTINE | MNT_CPROTECT);
1366
1367 #if CONFIG_MACF
1368 if (flags & MNT_MULTILABEL) {
1369 if (vfsp->vfc_vfsflags & VFC_VFSNOMACLABEL) {
1370 error = EINVAL;
1371 goto out1;
1372 }
1373 mp->mnt_flag |= MNT_MULTILABEL;
1374 }
1375 #endif
1376 /*
1377 * Process device path for local file systems if requested.
1378 *
1379 * Snapshot and mount-by-role mounts do not use this path; they are
1380 * passing other opaque data in the device path field.
1381 *
1382 * Basesystemroot mounts pass a device path to be resolved here,
1383 * but it's just a char * already inside the kernel, which
1384 * kernel_mount() shoved into a user_addr_t to call us. So for such
1385 * mounts we must skip copyin (both of the address and of the string
1386 * (in NDINIT).
1387 */
1388 if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS &&
1389 !(internal_flags & (KERNEL_MOUNT_SNAPSHOT | KERNEL_MOUNT_VOLBYROLE_MASK))) {
1390 boolean_t do_copyin_devpath = true;
1391 #if CONFIG_BASESYSTEMROOT
1392 if (internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) {
1393 // KERNEL_MOUNT_BASESYSTEMROOT implies subtle behavior worh nothing:
1394 // We have been passed fsmountargs, which is typed as a user_addr_t,
1395 // but is actually a char ** pointing to a (kernelspace) string.
1396 // We manually unpack it with a series of casts and dereferences
1397 // that reverses what was done just above us on the stack in
1398 // imageboot_pivot_image().
1399 // After retrieving the path to the dev node (which we will NDINIT
1400 // in a moment), we pass NULL fsmountargs on to the filesystem.
1401 _Static_assert(sizeof(char **) == sizeof(fsmountargs), "fsmountargs should fit a (kernel) address");
1402 char **devnamepp = (char **)fsmountargs;
1403 char *devnamep = *devnamepp;
1404 devpath = CAST_USER_ADDR_T(devnamep);
1405 do_copyin_devpath = false;
1406 fsmountargs = USER_ADDR_NULL;
1407
1408 //Now that we have a mp, denote that this mount is for the basesystem.
1409 mp->mnt_supl_kern_flag |= MNTK_SUPL_BASESYSTEM;
1410 }
1411 #endif // CONFIG_BASESYSTEMROOT
1412
1413 if (do_copyin_devpath) {
1414 if (vfs_context_is64bit(ctx)) {
1415 if ((error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath)))) {
1416 goto out1;
1417 }
1418 fsmountargs += sizeof(devpath);
1419 } else {
1420 user32_addr_t tmp;
1421 if ((error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp)))) {
1422 goto out1;
1423 }
1424 /* munge into LP64 addr */
1425 devpath = CAST_USER_ADDR_T(tmp);
1426 fsmountargs += sizeof(tmp);
1427 }
1428 }
1429
1430 /* Lookup device and authorize access to it */
1431 if ((devpath)) {
1432 struct nameidata nd;
1433
1434 enum uio_seg seg = UIO_USERSPACE;
1435 #if CONFIG_BASESYSTEMROOT
1436 if (internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) {
1437 seg = UIO_SYSSPACE;
1438 }
1439 #endif // CONFIG_BASESYSTEMROOT
1440
1441 NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW, seg, devpath, ctx);
1442 if (flags & MNT_NOFOLLOW) {
1443 nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
1444 }
1445 if ((error = namei(&nd))) {
1446 goto out1;
1447 }
1448
1449 devvp = nd.ni_vp;
1450
1451 if (devvp->v_type != VBLK) {
1452 error = ENOTBLK;
1453 nameidone(&nd);
1454 goto out2;
1455 }
1456 if (major(devvp->v_rdev) >= nblkdev) {
1457 error = ENXIO;
1458 nameidone(&nd);
1459 goto out2;
1460 }
1461 /*
1462 * If mount by non-root, then verify that user has necessary
1463 * permissions on the device.
1464 */
1465 if (suser(vfs_context_ucred(ctx), NULL) != 0) {
1466 kauth_action_t accessmode = KAUTH_VNODE_READ_DATA;
1467
1468 if ((mp->mnt_flag & MNT_RDONLY) == 0) {
1469 accessmode |= KAUTH_VNODE_WRITE_DATA;
1470 }
1471 if ((error = vnode_authorize(devvp, NULL, accessmode, ctx)) != 0) {
1472 nameidone(&nd);
1473 goto out2;
1474 }
1475 }
1476
1477 strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);
1478 nameidone(&nd);
1479 }
1480 /* On first mount, preflight and open device */
1481 if (devpath && ((flags & MNT_UPDATE) == 0)) {
1482 if ((error = vnode_ref(devvp))) {
1483 goto out2;
1484 }
1485 /*
1486 * Disallow multiple mounts of the same device.
1487 * Disallow mounting of a device that is currently in use
1488 * (except for root, which might share swap device for miniroot).
1489 * Flush out any old buffers remaining from a previous use.
1490 */
1491 if ((error = vfs_setmounting(devvp))) {
1492 vnode_rele(devvp);
1493 goto out2;
1494 }
1495
1496 if (vcount(devvp) > 1 && !(vfs_flags(mp) & MNT_ROOTFS)) {
1497 error = EBUSY;
1498 goto out3;
1499 }
1500 if ((error = VNOP_FSYNC(devvp, MNT_WAIT, ctx))) {
1501 error = ENOTBLK;
1502 goto out3;
1503 }
1504 if ((error = buf_invalidateblks(devvp, BUF_WRITE_DATA, 0, 0))) {
1505 goto out3;
1506 }
1507
1508 ronly = (mp->mnt_flag & MNT_RDONLY) != 0;
1509 #if CONFIG_MACF
1510 error = mac_vnode_check_open(ctx,
1511 devvp,
1512 ronly ? FREAD : FREAD | FWRITE);
1513 if (error) {
1514 goto out3;
1515 }
1516 #endif /* MAC */
1517 if ((error = VNOP_OPEN(devvp, ronly ? FREAD : FREAD | FWRITE, ctx))) {
1518 goto out3;
1519 }
1520
1521 mp->mnt_devvp = devvp;
1522 device_vnode = devvp;
1523 } else if ((mp->mnt_flag & MNT_RDONLY) &&
1524 (mp->mnt_kern_flag & MNTK_WANTRDWR) &&
1525 (device_vnode = mp->mnt_devvp)) {
1526 dev_t dev;
1527 int maj;
1528 /*
1529 * If upgrade to read-write by non-root, then verify
1530 * that user has necessary permissions on the device.
1531 */
1532 vnode_getalways(device_vnode);
1533
1534 if (suser(vfs_context_ucred(ctx), NULL) &&
1535 (error = vnode_authorize(device_vnode, NULL,
1536 KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA,
1537 ctx)) != 0) {
1538 vnode_put(device_vnode);
1539 goto out2;
1540 }
1541
1542 /* Tell the device that we're upgrading */
1543 dev = (dev_t)device_vnode->v_rdev;
1544 maj = major(dev);
1545
1546 if ((u_int)maj >= (u_int)nblkdev) {
1547 panic("Volume mounted on a device with invalid major number.");
1548 }
1549
1550 error = bdevsw[maj].d_open(dev, FREAD | FWRITE, S_IFBLK, p);
1551 vnode_put(device_vnode);
1552 device_vnode = NULLVP;
1553 if (error != 0) {
1554 goto out2;
1555 }
1556 }
1557 } // localargs && !(snapshot | data | vm)
1558
1559 #if CONFIG_MACF
1560 if ((flags & MNT_UPDATE) == 0) {
1561 mac_mount_label_init(mp);
1562 mac_mount_label_associate(ctx, mp);
1563 }
1564 if (labelstr) {
1565 if ((flags & MNT_UPDATE) != 0) {
1566 error = mac_mount_check_label_update(ctx, mp);
1567 if (error != 0) {
1568 goto out3;
1569 }
1570 }
1571 }
1572 #endif
1573 /*
1574 * Mount the filesystem. We already asserted that internal_flags
1575 * cannot have more than one mount-by-role bit set.
1576 */
1577 if (internal_flags & KERNEL_MOUNT_SNAPSHOT) {
1578 error = VFS_IOCTL(mp, VFSIOC_MOUNT_SNAPSHOT,
1579 (caddr_t)fsmountargs, 0, ctx);
1580 } else if (internal_flags & KERNEL_MOUNT_DATAVOL) {
1581 #if CONFIG_ROSV_STARTUP
1582 struct mount *origin_mp = (struct mount*)fsmountargs;
1583 fs_role_mount_args_t frma = {origin_mp, VFS_DATA_ROLE};
1584 error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
1585 if (error) {
1586 printf("MOUNT-BY-ROLE (%d) failed! (%d)", VFS_DATA_ROLE, error);
1587 } else {
1588 /* Mark volume associated with system volume */
1589 mp->mnt_kern_flag |= MNTK_SYSTEM;
1590
1591 /* Attempt to acquire the mnt_devvp and set it up */
1592 struct vnode *mp_devvp = NULL;
1593 if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
1594 errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
1595 0, &mp_devvp, vfs_context_kernel());
1596 if (!lerr) {
1597 mp->mnt_devvp = mp_devvp;
1598 //vnode_lookup took an iocount, need to drop it.
1599 vnode_put(mp_devvp);
1600 // now set `device_vnode` to the devvp that was acquired.
1601 // this is needed in order to ensure vfs_init_io_attributes is invoked.
1602 // note that though the iocount above was dropped, the mount acquires
1603 // an implicit reference against the device.
1604 device_vnode = mp_devvp;
1605 }
1606 }
1607 }
1608 #else
1609 error = EINVAL;
1610 #endif
1611 } else if (internal_flags & KERNEL_MOUNT_VMVOL) {
1612 #if CONFIG_MOUNT_VM
1613 struct mount *origin_mp = (struct mount*)fsmountargs;
1614 fs_role_mount_args_t frma = {origin_mp, VFS_VM_ROLE};
1615 error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
1616 if (error) {
1617 printf("MOUNT-BY-ROLE (%d) failed! (%d)", VFS_VM_ROLE, error);
1618 } else {
1619 /* Mark volume associated with system volume and a swap mount */
1620 mp->mnt_kern_flag |= (MNTK_SYSTEM | MNTK_SWAP_MOUNT);
1621 /* Attempt to acquire the mnt_devvp and set it up */
1622 struct vnode *mp_devvp = NULL;
1623 if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
1624 errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
1625 0, &mp_devvp, vfs_context_kernel());
1626 if (!lerr) {
1627 mp->mnt_devvp = mp_devvp;
1628 //vnode_lookup took an iocount, need to drop it.
1629 vnode_put(mp_devvp);
1630
1631 // now set `device_vnode` to the devvp that was acquired.
1632 // note that though the iocount above was dropped, the mount acquires
1633 // an implicit reference against the device.
1634 device_vnode = mp_devvp;
1635 }
1636 }
1637 }
1638 #else
1639 error = EINVAL;
1640 #endif
1641 } else if ((internal_flags & KERNEL_MOUNT_PREBOOTVOL) || (internal_flags & KERNEL_MOUNT_RECOVERYVOL)) {
1642 #if CONFIG_MOUNT_PREBOOTRECOVERY
1643 struct mount *origin_mp = (struct mount*)fsmountargs;
1644 uint32_t mount_role = 0;
1645 if (internal_flags & KERNEL_MOUNT_PREBOOTVOL) {
1646 mount_role = VFS_PREBOOT_ROLE;
1647 } else if (internal_flags & KERNEL_MOUNT_RECOVERYVOL) {
1648 mount_role = VFS_RECOVERY_ROLE;
1649 }
1650
1651 if (mount_role != 0) {
1652 fs_role_mount_args_t frma = {origin_mp, mount_role};
1653 error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
1654 if (error) {
1655 printf("MOUNT-BY-ROLE (%d) failed! (%d)", mount_role, error);
1656 } else {
1657 // NOT YET - need to qualify how this interacts with shutdown, ERP/ERB, etc
1658 /* Mark volume associated with system volume */
1659 //mp->mnt_kern_flag |= MNTK_SYSTEM;
1660 /* Attempt to acquire the mnt_devvp and set it up */
1661 struct vnode *mp_devvp = NULL;
1662 if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
1663 errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
1664 0, &mp_devvp, vfs_context_kernel());
1665 if (!lerr) {
1666 mp->mnt_devvp = mp_devvp;
1667 //vnode_lookup took an iocount, need to drop it.
1668 vnode_put(mp_devvp);
1669
1670 // now set `device_vnode` to the devvp that was acquired.
1671 // note that though the iocount above was dropped, the mount acquires
1672 // an implicit reference against the device.
1673 device_vnode = mp_devvp;
1674 }
1675 }
1676 }
1677 } else {
1678 printf("MOUNT-BY-ROLE (%d) failed - ROLE UNRECOGNIZED! (%d)", mount_role, error);
1679 error = EINVAL;
1680 }
1681 #else
1682 error = EINVAL;
1683 #endif
1684 } else {
1685 error = VFS_MOUNT(mp, device_vnode, fsmountargs, ctx);
1686 }
1687
1688 if (flags & MNT_UPDATE) {
1689 if (mp->mnt_kern_flag & MNTK_WANTRDWR) {
1690 mp->mnt_flag &= ~MNT_RDONLY;
1691 }
1692 mp->mnt_flag &= ~
1693 (MNT_UPDATE | MNT_RELOAD | MNT_FORCE);
1694 mp->mnt_kern_flag &= ~MNTK_WANTRDWR;
1695 if (error) {
1696 mp->mnt_flag = flag; /* restore flag value */
1697 }
1698 vfs_event_signal(NULL, VQ_UPDATE, (intptr_t)NULL);
1699 lck_rw_done(&mp->mnt_rwlock);
1700 is_rwlock_locked = FALSE;
1701 if (!error) {
1702 enablequotas(mp, ctx);
1703 }
1704 goto exit;
1705 }
1706
1707 /*
1708 * Put the new filesystem on the mount list after root.
1709 */
1710 if (error == 0) {
1711 struct vfs_attr vfsattr;
1712 if (device_vnode) {
1713 /*
1714 * cache the IO attributes for the underlying physical media...
1715 * an error return indicates the underlying driver doesn't
1716 * support all the queries necessary... however, reasonable
1717 * defaults will have been set, so no reason to bail or care
1718 *
1719 * Need to do this before calling the MAC hook as it needs
1720 * information from this call.
1721 */
1722 vfs_init_io_attributes(device_vnode, mp);
1723 }
1724
1725 #if CONFIG_MACF
1726 error = mac_mount_check_mount_late(ctx, mp);
1727 if (error != 0) {
1728 goto out4;
1729 }
1730
1731 if (vfs_flags(mp) & MNT_MULTILABEL) {
1732 error = VFS_ROOT(mp, &rvp, ctx);
1733 if (error) {
1734 printf("%s() VFS_ROOT returned %d\n", __func__, error);
1735 goto out4;
1736 }
1737 error = vnode_label(mp, NULL, rvp, NULL, 0, ctx);
1738 /*
1739 * drop reference provided by VFS_ROOT
1740 */
1741 vnode_put(rvp);
1742
1743 if (error) {
1744 goto out4;
1745 }
1746 }
1747 #endif /* MAC */
1748
1749 vnode_lock_spin(vp);
1750 CLR(vp->v_flag, VMOUNT);
1751 vp->v_mountedhere = mp;
1752 SET(vp->v_flag, VMOUNTEDHERE);
1753
1754 /*
1755 * Wakeup any waiter(s) in prepare_coveredvp() that is waiting for the
1756 * 'v_mountedhere' to be planted.
1757 */
1758 wakeup(&vp->v_flag);
1759 vnode_unlock(vp);
1760
1761 /*
1762 * taking the name_cache_lock exclusively will
1763 * insure that everyone is out of the fast path who
1764 * might be trying to use a now stale copy of
1765 * vp->v_mountedhere->mnt_realrootvp
1766 * bumping mount_generation causes the cached values
1767 * to be invalidated
1768 */
1769 name_cache_lock();
1770 mount_generation++;
1771 name_cache_unlock();
1772
1773 error = vnode_ref(vp);
1774 if (error != 0) {
1775 goto out4;
1776 }
1777
1778 have_usecount = TRUE;
1779
1780 error = checkdirs(vp, ctx);
1781 if (error != 0) {
1782 /* Unmount the filesystem as cdir/rdirs cannot be updated */
1783 goto out4;
1784 }
1785 /*
1786 * there is no cleanup code here so I have made it void
1787 * we need to revisit this
1788 */
1789 (void)VFS_START(mp, 0, ctx);
1790
1791 if (mount_list_add(mp) != 0) {
1792 /*
1793 * The system is shutting down trying to umount
1794 * everything, so fail with a plausible errno.
1795 */
1796 error = EBUSY;
1797 goto out4;
1798 }
1799 lck_rw_done(&mp->mnt_rwlock);
1800 is_rwlock_locked = FALSE;
1801
1802 /* Check if this mounted file system supports EAs or named streams. */
1803 /* Skip WebDAV file systems for now since they hang in VFS_GETATTR here. */
1804 VFSATTR_INIT(&vfsattr);
1805 VFSATTR_WANTED(&vfsattr, f_capabilities);
1806 if (strncmp(mp->mnt_vfsstat.f_fstypename, "webdav", sizeof("webdav")) != 0 &&
1807 vfs_getattr(mp, &vfsattr, ctx) == 0 &&
1808 VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
1809 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR) &&
1810 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR)) {
1811 mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
1812 }
1813 #if NAMEDSTREAMS
1814 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS) &&
1815 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS)) {
1816 mp->mnt_kern_flag |= MNTK_NAMED_STREAMS;
1817 }
1818 #endif
1819 /* Check if this file system supports path from id lookups. */
1820 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID) &&
1821 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID)) {
1822 mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
1823 } else if (mp->mnt_flag & MNT_DOVOLFS) {
1824 /* Legacy MNT_DOVOLFS flag also implies path from id lookups. */
1825 mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
1826 }
1827
1828 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS) &&
1829 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS)) {
1830 mp->mnt_kern_flag |= MNTK_DIR_HARDLINKS;
1831 }
1832 }
1833 if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSNATIVEXATTR) {
1834 mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
1835 }
1836 if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSPREFLIGHT) {
1837 mp->mnt_kern_flag |= MNTK_UNMOUNT_PREFLIGHT;
1838 }
1839 /* increment the operations count */
1840 OSAddAtomic(1, &vfs_nummntops);
1841 enablequotas(mp, ctx);
1842
1843 if (device_vnode) {
1844 vfs_setmountedon(device_vnode);
1845 }
1846
1847 /* Now that mount is setup, notify the listeners */
1848 vfs_notify_mount(pvp);
1849 IOBSDMountChange(mp, kIOMountChangeMount);
1850 } else {
1851 /* If we fail a fresh mount, there should be no vnodes left hooked into the mountpoint. */
1852 if (mp->mnt_vnodelist.tqh_first != NULL) {
1853 panic("mount_common(): mount of %s filesystem failed with %d, but vnode list is not empty.",
1854 mp->mnt_vtable->vfc_name, error);
1855 }
1856
1857 vnode_lock_spin(vp);
1858 CLR(vp->v_flag, VMOUNT);
1859 /* Wakeup waiter(s) waiting for in-progress mount to finish. */
1860 wakeup(&vp->v_flag);
1861 vnode_unlock(vp);
1862 mount_list_lock();
1863 mp->mnt_vtable->vfc_refcount--;
1864 mount_list_unlock();
1865
1866 if (device_vnode) {
1867 vnode_rele(device_vnode);
1868 VNOP_CLOSE(device_vnode, ronly ? FREAD : FREAD | FWRITE, ctx);
1869 vfs_clearmounting(device_vnode);
1870 }
1871 lck_rw_done(&mp->mnt_rwlock);
1872 is_rwlock_locked = FALSE;
1873
1874 if (nc_smr_enabled) {
1875 vfs_smr_synchronize();
1876 }
1877
1878 /*
1879 * if we get here, we have a mount structure that needs to be freed,
1880 * but since the coveredvp hasn't yet been updated to point at it,
1881 * no need to worry about other threads holding a crossref on this mp
1882 * so it's ok to just free it
1883 */
1884 mount_lock_destroy(mp);
1885 #if CONFIG_MACF
1886 mac_mount_label_destroy(mp);
1887 #endif
1888 zfree(mount_zone, mp);
1889 did_set_lmount = false;
1890 }
1891 exit:
1892 /*
1893 * drop I/O count on the device vp if there was one
1894 */
1895 if (devpath && devvp) {
1896 vnode_put(devvp);
1897 }
1898
1899 if (did_set_lmount) {
1900 mount_lock_spin(mp);
1901 mp->mnt_lflag &= ~MNT_LMOUNT;
1902 mount_unlock(mp);
1903 }
1904
1905 return error;
1906
1907 /* Error condition exits */
1908 out4:
1909 (void)VFS_UNMOUNT(mp, MNT_FORCE, ctx);
1910
1911 /*
1912 * If the mount has been placed on the covered vp,
1913 * it may have been discovered by now, so we have
1914 * to treat this just like an unmount
1915 */
1916 mount_lock_spin(mp);
1917 mp->mnt_lflag |= MNT_LDEAD;
1918 mount_unlock(mp);
1919
1920 if (device_vnode != NULLVP) {
1921 vnode_rele(device_vnode);
1922 VNOP_CLOSE(device_vnode, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD | FWRITE,
1923 ctx);
1924 vfs_clearmounting(device_vnode);
1925 did_rele = TRUE;
1926 }
1927
1928 vnode_lock_spin(vp);
1929
1930 mp->mnt_crossref++;
1931 CLR(vp->v_flag, VMOUNTEDHERE);
1932 vp->v_mountedhere = (mount_t) 0;
1933
1934 vnode_unlock(vp);
1935
1936 if (have_usecount) {
1937 vnode_rele(vp);
1938 }
1939 out3:
1940 if (devpath && ((flags & MNT_UPDATE) == 0) && (!did_rele)) {
1941 vnode_rele(devvp);
1942 vfs_clearmounting(devvp);
1943 }
1944 out2:
1945 if (devpath && devvp) {
1946 vnode_put(devvp);
1947 }
1948 out1:
1949 /* Release mnt_rwlock only when it was taken */
1950 if (is_rwlock_locked == TRUE) {
1951 if (flag_set) {
1952 mp->mnt_flag = flag; /* restore mnt_flag value */
1953 }
1954 lck_rw_done(&mp->mnt_rwlock);
1955 }
1956
1957 if (did_set_lmount) {
1958 mount_lock_spin(mp);
1959 mp->mnt_lflag &= ~MNT_LMOUNT;
1960 mount_unlock(mp);
1961 }
1962
1963 if (did_set_vmount) {
1964 vnode_lock_spin(vp);
1965 CLR(vp->v_flag, VMOUNT);
1966 /* Wakeup waiter(s) waiting for in-progress mount to finish. */
1967 wakeup(&vp->v_flag);
1968 vnode_unlock(vp);
1969 }
1970
1971 if (mntalloc) {
1972 if (mp->mnt_crossref) {
1973 mount_dropcrossref(mp, vp, 0);
1974 } else {
1975 if (nc_smr_enabled) {
1976 vfs_smr_synchronize();
1977 }
1978
1979 mount_lock_destroy(mp);
1980 #if CONFIG_MACF
1981 mac_mount_label_destroy(mp);
1982 #endif
1983 zfree(mount_zone, mp);
1984 }
1985 }
1986 if (vfsp_ref) {
1987 mount_list_lock();
1988 vfsp->vfc_refcount--;
1989 mount_list_unlock();
1990 }
1991
1992 return error;
1993 }
1994
/*
 * Flush in-core data, check for competing mount attempts,
 * and set VMOUNT on the covered vnode to mark a mount in progress.
 *
 * Parameters:
 *	vp		directory vnode to be covered by the new mount
 *	ctx		caller's VFS context (supplies credentials)
 *	cnp		mount-point component name (used by MACF only)
 *	fsname		filesystem type name (used by MACF only)
 *	internal_flags	KERNEL_MOUNT_* flags selecting auth/busy behavior
 *
 * Returns 0 with VMOUNT set on vp, or an errno; on error VMOUNT is
 * left clear.
 */
int
prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, uint32_t internal_flags)
{
#if !CONFIG_MACF
#pragma unused(cnp,fsname)
#endif
	struct vnode_attr va;
	int error;
	boolean_t skip_auth = !!(internal_flags & KERNEL_MOUNT_NOAUTH);
	boolean_t is_fmount = !!(internal_flags & KERNEL_MOUNT_FMOUNT);
	boolean_t is_kmount = !!(internal_flags & KERNEL_MOUNT_KMOUNT);

	if (!skip_auth) {
		/*
		 * If the user is not root, ensure that they own the directory
		 * onto which we are attempting to mount.
		 */
		VATTR_INIT(&va);
		VATTR_WANTED(&va, va_uid);
		if ((error = vnode_getattr(vp, &va, ctx)) ||
		    (va.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
		    (!vfs_context_issuser(ctx)))) {
			error = EPERM;
			goto out;
		}
	}

	/* Push any dirty data on the covered vnode out to disk. */
	if ((error = VNOP_FSYNC(vp, MNT_WAIT, ctx))) {
		goto out;
	}

	/* Invalidate cached buffers so nothing stale survives the cover. */
	if ((error = buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0))) {
		goto out;
	}

	/* Only directories can serve as mount points. */
	if (vp->v_type != VDIR) {
		error = ENOTDIR;
		goto out;
	}

	vnode_lock_spin(vp);

	if (is_fmount && (ISSET(vp->v_flag, VMOUNT) || (vp->v_mountedhere != NULL))) {
		/* fmount-style callers never wait; report busy immediately. */
		error = EBUSY;
	} else if (!is_kmount && (ISSET(vp->v_flag, VMOUNT) ||
	    (vp->v_mountedhere != NULL))) {
		/*
		 * For mount triggered from mount() call, we want to wait for the
		 * current in-progress mount to complete, redo lookup and retry the
		 * mount again. Similarly, we also want to retry if we lost the race
		 * due to concurrent mounts and the 'VMOUNT' flag has been cleared and
		 * 'v_mountedhere' has been planted after initial lookup.
		 */
		if (ISSET(vp->v_flag, VMOUNT)) {
			/* msleep() needs the full (non-spin) vnode lock held. */
			vnode_lock_convert(vp);
			msleep(&vp->v_flag, &vp->v_lock, PVFS, "vnode_waitformount", NULL);
		}
		error = EBUSY;
	} else if (ISSET(vp->v_flag, VMOUNT) && (vp->v_mountedhere != NULL)) {
		/*
		 * Kernel-initiated (KMOUNT) path: only treated as busy when a
		 * mount is both in progress AND already planted here.
		 */
		error = EBUSY;
	}

	if (error) {
		vnode_unlock(vp);
		goto out;
	}
	/* Claim the vnode: VMOUNT marks a mount in progress on it. */
	SET(vp->v_flag, VMOUNT);
	vnode_unlock(vp);

#if CONFIG_MACF
	error = mac_mount_check_mount(ctx, vp,
	    cnp, fsname);
	if (error != 0) {
		/* MACF denied the mount: undo the VMOUNT claim made above. */
		vnode_lock_spin(vp);
		CLR(vp->v_flag, VMOUNT);
		/* Wakeup waiter(s) waiting for in-progress mount to finish. */
		wakeup(&vp->v_flag);
		vnode_unlock(vp);
	}
#endif

out:
	return error;
}
2083
2084 #if CONFIG_IMGSRC_ACCESS
2085
2086 #define DEBUG_IMGSRC 0
2087
2088 #if DEBUG_IMGSRC
2089 #define IMGSRC_DEBUG(args...) printf("imgsrc: " args)
2090 #else
2091 #define IMGSRC_DEBUG(args...) do { } while(0)
2092 #endif
2093
2094 static int
authorize_devpath_and_update_mntfromname(mount_t mp,user_addr_t devpath,vnode_t * devvpp,vfs_context_t ctx)2095 authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx)
2096 {
2097 struct nameidata nd;
2098 vnode_t vp, realdevvp;
2099 kauth_action_t accessmode;
2100 int error;
2101 enum uio_seg uio = UIO_USERSPACE;
2102
2103 if (ctx == vfs_context_kernel()) {
2104 uio = UIO_SYSSPACE;
2105 }
2106
2107 NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW, uio, devpath, ctx);
2108 if ((error = namei(&nd))) {
2109 IMGSRC_DEBUG("namei() failed with %d\n", error);
2110 return error;
2111 }
2112
2113 vp = nd.ni_vp;
2114
2115 if (!vnode_isblk(vp)) {
2116 IMGSRC_DEBUG("Not block device.\n");
2117 error = ENOTBLK;
2118 goto out;
2119 }
2120
2121 realdevvp = mp->mnt_devvp;
2122 if (realdevvp == NULLVP) {
2123 IMGSRC_DEBUG("No device backs the mount.\n");
2124 error = ENXIO;
2125 goto out;
2126 }
2127
2128 error = vnode_getwithref(realdevvp);
2129 if (error != 0) {
2130 IMGSRC_DEBUG("Coudn't get iocount on device.\n");
2131 goto out;
2132 }
2133
2134 if (vnode_specrdev(vp) != vnode_specrdev(realdevvp)) {
2135 IMGSRC_DEBUG("Wrong dev_t.\n");
2136 error = ENXIO;
2137 goto out1;
2138 }
2139
2140 strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);
2141
2142 /*
2143 * If mount by non-root, then verify that user has necessary
2144 * permissions on the device.
2145 */
2146 if (!vfs_context_issuser(ctx)) {
2147 accessmode = KAUTH_VNODE_READ_DATA;
2148 if ((mp->mnt_flag & MNT_RDONLY) == 0) {
2149 accessmode |= KAUTH_VNODE_WRITE_DATA;
2150 }
2151 if ((error = vnode_authorize(vp, NULL, accessmode, ctx)) != 0) {
2152 IMGSRC_DEBUG("Access denied.\n");
2153 goto out1;
2154 }
2155 }
2156
2157 *devvpp = vp;
2158
2159 out1:
2160 vnode_put(realdevvp);
2161
2162 out:
2163 nameidone(&nd);
2164
2165 if (error) {
2166 vnode_put(vp);
2167 }
2168
2169 return error;
2170 }
2171
/*
 * Clear VMOUNT, set v_mountedhere, and mnt_vnodecovered, ref the vnode,
 * and call checkdirs()
 *
 * On success the mount is planted on 'vp' and a usecount is held on it.
 * On failure mnt_vnodecovered is reset to NULLVP; note that VMOUNTEDHERE
 * and v_mountedhere remain set on the error paths here and are undone by
 * the caller (see undo_place_on_covered_vp()).
 */
static int
place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx)
{
	int error;

	mp->mnt_vnodecovered = vp; /* XXX This is normally only set at init-time ... */

	IMGSRC_DEBUG("placing: fsname = %s, vp = %s\n",
	    mp->mnt_vtable->vfc_name, vnode_getname(vp));

	vnode_lock_spin(vp);
	/* Swap "mount in progress" for "mount planted" atomically under the lock. */
	CLR(vp->v_flag, VMOUNT);
	vp->v_mountedhere = mp;
	SET(vp->v_flag, VMOUNTEDHERE);
	/* Wakeup waiter(s) waiting for in-progress mount to finish. */
	wakeup(&vp->v_flag);
	vnode_unlock(vp);

	/*
	 * taking the name_cache_lock exclusively will
	 * insure that everyone is out of the fast path who
	 * might be trying to use a now stale copy of
	 * vp->v_mountedhere->mnt_realrootvp
	 * bumping mount_generation causes the cached values
	 * to be invalidated
	 */
	name_cache_lock();
	mount_generation++;
	name_cache_unlock();

	/* Hold a usecount on the covered vnode for the lifetime of the mount. */
	error = vnode_ref(vp);
	if (error != 0) {
		goto out;
	}

	error = checkdirs(vp, ctx);
	if (error != 0) {
		/* Unmount the filesystem as cdir/rdirs cannot be updated */
		vnode_rele(vp);
		goto out;
	}

out:
	if (error != 0) {
		mp->mnt_vnodecovered = NULLVP;
	}
	return error;
}
2224
2225 static void
undo_place_on_covered_vp(mount_t mp,vnode_t vp)2226 undo_place_on_covered_vp(mount_t mp, vnode_t vp)
2227 {
2228 vnode_rele(vp);
2229 vnode_lock_spin(vp);
2230 CLR(vp->v_flag, (VMOUNT | VMOUNTEDHERE));
2231 vp->v_mountedhere = (mount_t)NULL;
2232 /* Wakeup waiter(s) waiting for in-progress mount to finish. */
2233 wakeup(&vp->v_flag);
2234 vnode_unlock(vp);
2235
2236 mp->mnt_vnodecovered = NULLVP;
2237 }
2238
2239 static int
mount_begin_update(mount_t mp,vfs_context_t ctx,int flags)2240 mount_begin_update(mount_t mp, vfs_context_t ctx, int flags)
2241 {
2242 int error;
2243
2244 /* unmount in progress return error */
2245 mount_lock_spin(mp);
2246 if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
2247 mount_unlock(mp);
2248 return EBUSY;
2249 }
2250 mount_unlock(mp);
2251 lck_rw_lock_exclusive(&mp->mnt_rwlock);
2252
2253 /*
2254 * We only allow the filesystem to be reloaded if it
2255 * is currently mounted read-only.
2256 */
2257 if ((flags & MNT_RELOAD) &&
2258 ((mp->mnt_flag & MNT_RDONLY) == 0)) {
2259 error = ENOTSUP;
2260 goto out;
2261 }
2262
2263 /*
2264 * Only root, or the user that did the original mount is
2265 * permitted to update it.
2266 */
2267 if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
2268 (!vfs_context_issuser(ctx))) {
2269 error = EPERM;
2270 goto out;
2271 }
2272 #if CONFIG_MACF
2273 error = mac_mount_check_remount(ctx, mp, flags);
2274 if (error != 0) {
2275 goto out;
2276 }
2277 #endif
2278
2279 out:
2280 if (error) {
2281 lck_rw_done(&mp->mnt_rwlock);
2282 }
2283
2284 return error;
2285 }
2286
/*
 * End an update begun by mount_begin_update(): release the exclusive
 * mount rwlock.
 */
static void
mount_end_update(mount_t mp)
{
	lck_rw_done(&mp->mnt_rwlock);
}
2292
2293 static int
get_imgsrc_rootvnode(uint32_t height,vnode_t * rvpp)2294 get_imgsrc_rootvnode(uint32_t height, vnode_t *rvpp)
2295 {
2296 vnode_t vp;
2297
2298 if (height >= MAX_IMAGEBOOT_NESTING) {
2299 return EINVAL;
2300 }
2301
2302 vp = imgsrc_rootvnodes[height];
2303 if ((vp != NULLVP) && (vnode_get(vp) == 0)) {
2304 *rvpp = vp;
2305 return 0;
2306 } else {
2307 return ENOENT;
2308 }
2309 }
2310
/*
 * Relocate the imageboot source filesystem: re-plant the already-mounted
 * imageboot filesystem (at nesting level selected by the caller's
 * arguments) on top of a new covered vnode 'vp', updating mnt_on-name,
 * the mount list, and the name cache generation.
 *
 * Parameters:
 *	pvp		parent of the new covered vnode (for mount notification)
 *	vp		new vnode to cover
 *	cnp/fsname	mount-point name and expected fs type name
 *	ctx		caller's VFS context (must be superuser)
 *	is64bit		caller process ABI, controls argument copyin layout
 *	fsmountargs	user address of mnt_imgsrc_args (or bare devpath)
 *	by_index	TRUE for the struct-based (nesting-aware) interface
 *
 * Returns 0 on success, errno otherwise.  May only succeed once per
 * mount (MNTK_HAS_MOVED).
 */
static int
relocate_imageboot_source(vnode_t pvp, vnode_t vp,
    struct componentname *cnp, const char *fsname, vfs_context_t ctx,
    boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index)
{
	int error;
	mount_t mp;
	boolean_t placed = FALSE;
	struct vfstable *vfsp;
	user_addr_t devpath;
	char *old_mntonname;
	vnode_t rvp;
	vnode_t devvp;
	uint32_t height;
	uint32_t flags;

	/* If we didn't imageboot, nothing to move */
	if (imgsrc_rootvnodes[0] == NULLVP) {
		return EINVAL;
	}

	/* Only root can do this */
	if (!vfs_context_issuser(ctx)) {
		return EPERM;
	}

	IMGSRC_DEBUG("looking for root vnode.\n");

	/*
	 * Get root vnode of filesystem we're moving.
	 */
	if (by_index) {
		/* Struct interface: height selects the imageboot nesting level. */
		if (is64bit) {
			struct user64_mnt_imgsrc_args mia64;
			error = copyin(fsmountargs, &mia64, sizeof(mia64));
			if (error != 0) {
				IMGSRC_DEBUG("Failed to copy in arguments.\n");
				return error;
			}

			height = mia64.mi_height;
			flags = mia64.mi_flags;
			devpath = (user_addr_t)mia64.mi_devpath;
		} else {
			struct user32_mnt_imgsrc_args mia32;
			error = copyin(fsmountargs, &mia32, sizeof(mia32));
			if (error != 0) {
				IMGSRC_DEBUG("Failed to copy in arguments.\n");
				return error;
			}

			height = mia32.mi_height;
			flags = mia32.mi_flags;
			devpath = mia32.mi_devpath;
		}
	} else {
		/*
		 * For binary compatibility--assumes one level of nesting.
		 */
		if (is64bit) {
			if ((error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath)))) {
				return error;
			}
		} else {
			user32_addr_t tmp;
			if ((error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp)))) {
				return error;
			}

			/* munge into LP64 addr */
			devpath = CAST_USER_ADDR_T(tmp);
		}

		height = 0;
		flags = 0;
	}

	/* No flags are currently defined for this operation. */
	if (flags != 0) {
		IMGSRC_DEBUG("%s: Got nonzero flags.\n", __FUNCTION__);
		return EINVAL;
	}

	/* Takes an iocount on rvp; dropped at out0/success. */
	error = get_imgsrc_rootvnode(height, &rvp);
	if (error != 0) {
		IMGSRC_DEBUG("getting old root vnode failed with %d\n", error);
		return error;
	}

	IMGSRC_DEBUG("got old root vnode\n");

	/* Scratch buffer for saving f_mntonname so it can be restored on error. */
	old_mntonname = zalloc_flags(ZV_NAMEI, Z_WAITOK);

	/* Can only move once */
	mp = vnode_mount(rvp);
	if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
		IMGSRC_DEBUG("Already moved.\n");
		error = EBUSY;
		goto out0;
	}

	IMGSRC_DEBUG("moving rvp: fsname = %s\n", mp->mnt_vtable->vfc_name);
	IMGSRC_DEBUG("Starting updated.\n");

	/* Get exclusive rwlock on mount, authorize update on mp */
	error = mount_begin_update(mp, ctx, 0);
	if (error != 0) {
		IMGSRC_DEBUG("Starting updated failed with %d\n", error);
		goto out0;
	}

	/*
	 * It can only be moved once. Flag is set under the rwlock,
	 * so we're now safe to proceed.
	 */
	if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
		IMGSRC_DEBUG("Already moved [2]\n");
		/*
		 * NOTE(review): 'error' is 0 here (set by the successful
		 * mount_begin_update() above), so this path returns 0 while
		 * the identical pre-lock check above returns EBUSY — confirm
		 * the success return is intentional.
		 */
		goto out1;
	}

	IMGSRC_DEBUG("Preparing coveredvp.\n");

	/* Mark covered vnode as mount in progress, authorize placing mount on top */
	error = prepare_coveredvp(vp, ctx, cnp, fsname, 0);
	if (error != 0) {
		IMGSRC_DEBUG("Preparing coveredvp failed with %d.\n", error);
		goto out1;
	}

	IMGSRC_DEBUG("Covered vp OK.\n");

	/* Sanity check the name caller has provided */
	vfsp = mp->mnt_vtable;
	if (strncmp(vfsp->vfc_name, fsname, MFSNAMELEN) != 0) {
		IMGSRC_DEBUG("Wrong fs name: actual = %s, expected = %s\n",
		    vfsp->vfc_name, fsname);
		error = EINVAL;
		goto out2;
	}

	/* Check the device vnode and update mount-from name, for local filesystems */
	if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) {
		IMGSRC_DEBUG("Local, doing device validation.\n");

		if (devpath != USER_ADDR_NULL) {
			error = authorize_devpath_and_update_mntfromname(mp, devpath, &devvp, ctx);
			if (error) {
				IMGSRC_DEBUG("authorize_devpath_and_update_mntfromname() failed.\n");
				goto out2;
			}

			/* Only needed the validation; drop the iocount it returned. */
			vnode_put(devvp);
		}
	}

	/*
	 * Place mp on top of vnode, ref the vnode, call checkdirs(),
	 * and increment the name cache's mount generation
	 */

	IMGSRC_DEBUG("About to call place_mount_and_checkdirs().\n");
	error = place_mount_and_checkdirs(mp, vp, ctx);
	if (error != 0) {
		goto out2;
	}

	placed = TRUE;

	/* Save old on-name so it can be restored if mount_list_add() fails. */
	strlcpy(old_mntonname, mp->mnt_vfsstat.f_mntonname, MAXPATHLEN);
	strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);

	/* Forbid future moves */
	mount_lock(mp);
	mp->mnt_kern_flag |= MNTK_HAS_MOVED;
	mount_unlock(mp);

	/* Finally, add to mount list, completely ready to go */
	if (mount_list_add(mp) != 0) {
		/*
		 * The system is shutting down trying to umount
		 * everything, so fail with a plausible errno.
		 */
		error = EBUSY;
		goto out3;
	}

	mount_end_update(mp);
	vnode_put(rvp);
	zfree(ZV_NAMEI, old_mntonname);

	vfs_notify_mount(pvp);

	return 0;
out3:
	/* Roll back the on-name change and the moved flag. */
	strlcpy(mp->mnt_vfsstat.f_mntonname, old_mntonname, MAXPATHLEN);

	mount_lock(mp);
	mp->mnt_kern_flag &= ~(MNTK_HAS_MOVED);
	mount_unlock(mp);

out2:
	/*
	 * Placing the mp on the vnode clears VMOUNT,
	 * so cleanup is different after that point
	 */
	if (placed) {
		/* Rele the vp, clear VMOUNT and v_mountedhere */
		undo_place_on_covered_vp(mp, vp);
	} else {
		vnode_lock_spin(vp);
		CLR(vp->v_flag, VMOUNT);
		/* Wakeup waiter(s) waiting for in-progress mount to finish. */
		wakeup(&vp->v_flag);
		vnode_unlock(vp);
	}
out1:
	mount_end_update(mp);

out0:
	vnode_put(rvp);
	zfree(ZV_NAMEI, old_mntonname);
	return error;
}
2533
2534 #endif /* CONFIG_IMGSRC_ACCESS */
2535
2536 void
enablequotas(struct mount * mp,vfs_context_t ctx)2537 enablequotas(struct mount *mp, vfs_context_t ctx)
2538 {
2539 struct nameidata qnd;
2540 int type;
2541 char qfpath[MAXPATHLEN];
2542 const char *qfname = QUOTAFILENAME;
2543 const char *qfopsname = QUOTAOPSNAME;
2544 const char *qfextension[] = INITQFNAMES;
2545
2546 /* XXX Shoulkd be an MNTK_ flag, instead of strncmp()'s */
2547 if (strncmp(mp->mnt_vfsstat.f_fstypename, "hfs", sizeof("hfs")) != 0) {
2548 return;
2549 }
2550 /*
2551 * Enable filesystem disk quotas if necessary.
2552 * We ignore errors as this should not interfere with final mount
2553 */
2554 for (type = 0; type < MAXQUOTAS; type++) {
2555 snprintf(qfpath, sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfopsname, qfextension[type]);
2556 NDINIT(&qnd, LOOKUP, OP_MOUNT, FOLLOW, UIO_SYSSPACE,
2557 CAST_USER_ADDR_T(qfpath), ctx);
2558 if (namei(&qnd) != 0) {
2559 continue; /* option file to trigger quotas is not present */
2560 }
2561 vnode_put(qnd.ni_vp);
2562 nameidone(&qnd);
2563 snprintf(qfpath, sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfname, qfextension[type]);
2564
2565 (void) VFS_QUOTACTL(mp, QCMD(Q_QUOTAON, type), 0, qfpath, ctx);
2566 }
2567 return;
2568 }
2569
2570
/*
 * Per-process callback for checkdirs(): if the process's current or root
 * directory is the old covered vnode (cdrp->olddp), replace it with the
 * root of the newly mounted filesystem (cdrp->newdp), managing vnode
 * usecounts for the swap.  Always returns PROC_RETURNED so iteration
 * continues.
 */
static int
checkdirs_callback(proc_t p, void * arg)
{
	struct cdirargs *cdrp = (struct cdirargs *)arg;
	vnode_t olddp = cdrp->olddp;
	vnode_t newdp = cdrp->newdp;
	struct filedesc *fdp = &p->p_fd;
	/* new_* start as newdp and are NULLed if their ref gets installed;
	 * old_* record displaced vnodes whose ref must be released. */
	vnode_t new_cvp = newdp;
	vnode_t new_rvp = newdp;
	vnode_t old_cvp = NULL;
	vnode_t old_rvp = NULL;

	/*
	 * XXX Also needs to iterate each thread in the process to see if it
	 * XXX is using a per-thread current working directory, and, if so,
	 * XXX update that as well.
	 */

	/*
	 * First, with the proc_fdlock held, check to see if we will need
	 * to do any work. If not, we will get out fast.
	 */
	proc_fdlock(p);
	if (fdp->fd_cdir != olddp && fdp->fd_rdir != olddp) {
		proc_fdunlock(p);
		return PROC_RETURNED;
	}
	proc_fdunlock(p);

	/*
	 * Ok, we will have to do some work. Always take two refs
	 * because we might need that many. We'll dispose of whatever
	 * we ended up not using.
	 */
	if (vnode_ref(newdp) != 0) {
		return PROC_RETURNED;
	}
	if (vnode_ref(newdp) != 0) {
		/* Second ref failed: give back the first and bail. */
		vnode_rele(newdp);
		return PROC_RETURNED;
	}

	proc_dirs_lock_exclusive(p);
	/*
	 * Now do the work. Note: we dropped the proc_fdlock, so we
	 * have to do all of the checks again.
	 */
	proc_fdlock(p);
	if (fdp->fd_cdir == olddp) {
		old_cvp = olddp;
		fdp->fd_cdir = newdp;
		new_cvp = NULL;
	}
	if (fdp->fd_rdir == olddp) {
		old_rvp = olddp;
		fdp->fd_rdir = newdp;
		new_rvp = NULL;
	}
	proc_fdunlock(p);
	proc_dirs_unlock_exclusive(p);

	/*
	 * Dispose of any references that are no longer needed.
	 */
	if (old_cvp != NULL) {
		vnode_rele(old_cvp);
	}
	if (old_rvp != NULL) {
		vnode_rele(old_rvp);
	}
	if (new_cvp != NULL) {
		vnode_rele(new_cvp);
	}
	if (new_rvp != NULL) {
		vnode_rele(new_rvp);
	}

	return PROC_RETURNED;
}
2650
2651
2652
/*
 * Scan all active processes to see if any of them have a current
 * or root directory onto which the new filesystem has just been
 * mounted. If so, replace them with the new mount point.
 *
 * 'olddp' is the freshly covered vnode; the replacement is the root
 * vnode of olddp->v_mountedhere.  Also swaps the system rootvnode if it
 * was the covered vnode.  Returns 0, or the VFS_ROOT() error.
 */
static int
checkdirs(vnode_t olddp, vfs_context_t ctx)
{
	vnode_t newdp;
	vnode_t tvp;
	int err;
	struct cdirargs cdr;

	/* Sole usecount means no process can be using olddp as cdir/rdir. */
	if (olddp->v_usecount == 1) {
		return 0;
	}
	err = VFS_ROOT(olddp->v_mountedhere, &newdp, ctx);

	if (err != 0) {
#if DIAGNOSTIC
		panic("mount: lost mount: error %d", err);
#endif
		return err;
	}

	cdr.olddp = olddp;
	cdr.newdp = newdp;
	/* do not block for exec/fork trans as the vp in cwd & rootdir are not changing */
	proc_iterate(PROC_ALLPROCLIST | PROC_NOWAITTRANS, checkdirs_callback, (void *)&cdr, NULL, NULL);

	if (rootvnode == olddp) {
		/* Take the new ref before publishing the new rootvnode. */
		vnode_ref(newdp);
		lck_rw_lock_exclusive(&rootvnode_rw_lock);
		tvp = rootvnode;
		rootvnode = newdp;
		lck_rw_unlock_exclusive(&rootvnode_rw_lock);
		vnode_rele(tvp);
	}

	/* Drop the iocount VFS_ROOT() returned. */
	vnode_put(newdp);
	return 0;
}
2695
2696 #define ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT \
2697 "com.apple.private.vfs.role-account-unmount"
2698
2699 /*
2700 * Unmount a file system.
2701 *
2702 * Note: unmount takes a path to the vnode mounted on as argument,
2703 * not special file (as before).
2704 */
2705 /* ARGSUSED */
2706 int
unmount(__unused proc_t p,struct unmount_args * uap,__unused int32_t * retval)2707 unmount(__unused proc_t p, struct unmount_args *uap, __unused int32_t *retval)
2708 {
2709 vnode_t vp;
2710 struct mount *mp;
2711 int flags = uap->flags;
2712 int error;
2713 struct nameidata nd;
2714 vfs_context_t ctx;
2715
2716 /*
2717 * If the process has the entitlement, use the kernel's context when
2718 * performing lookup on the mount path as the process might lack proper
2719 * permission to access the directory.
2720 */
2721 ctx = IOCurrentTaskHasEntitlement(ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT) ?
2722 vfs_context_kernel() : vfs_context_current();
2723
2724 NDINIT(&nd, LOOKUP, OP_UNMOUNT, FOLLOW | AUDITVNPATH1,
2725 UIO_USERSPACE, uap->path, ctx);
2726 if (flags & MNT_NOFOLLOW) {
2727 nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
2728 }
2729
2730 error = namei(&nd);
2731 if (error) {
2732 return error;
2733 }
2734 vp = nd.ni_vp;
2735 mp = vp->v_mount;
2736 nameidone(&nd);
2737
2738 /*
2739 * Must be the root of the filesystem
2740 */
2741 if ((vp->v_flag & VROOT) == 0) {
2742 vnode_put(vp);
2743 return EINVAL;
2744 }
2745 #if CONFIG_MACF
2746 error = mac_mount_check_umount(ctx, mp);
2747 if (error != 0) {
2748 vnode_put(vp);
2749 return error;
2750 }
2751 #endif
2752 mount_ref(mp, 0);
2753 vnode_put(vp);
2754 /* safedounmount consumes the mount ref */
2755 return safedounmount(mp, flags, ctx);
2756 }
2757
2758 int
vfs_unmountbyfsid(fsid_t * fsid,int flags,vfs_context_t ctx)2759 vfs_unmountbyfsid(fsid_t *fsid, int flags, vfs_context_t ctx)
2760 {
2761 mount_t mp;
2762
2763 mp = mount_list_lookupby_fsid(fsid, 0, 1);
2764 if (mp == (mount_t)0) {
2765 return ENOENT;
2766 }
2767 mount_ref(mp, 0);
2768 mount_iterdrop(mp);
2769 /* safedounmount consumes the mount ref */
2770 return safedounmount(mp, flags, ctx);
2771 }
2772
/*
 * The mount struct comes with a mount ref which will be consumed.
 * Do the actual file system unmount, prevent some common foot shooting.
 *
 * Policy gate in front of dounmount(): rejects unmounts of unresponsive
 * filesystems (MNT_NOBLOCK, non-forced), unauthorized callers, the root
 * filesystem and system mounts, and the mount backing the root disk
 * image.  On any rejection the mount ref is dropped here; on success it
 * is passed to dounmount(), which consumes it.
 */
int
safedounmount(struct mount *mp, int flags, vfs_context_t ctx)
{
	int error;
	proc_t p = vfs_context_proc(ctx);

	/*
	 * If the file system is not responding and MNT_NOBLOCK
	 * is set and not a forced unmount then return EBUSY.
	 */
	if ((mp->mnt_lflag & MNT_LNOTRESP) &&
	    (flags & MNT_NOBLOCK) && ((flags & MNT_FORCE) == 0)) {
		error = EBUSY;
		goto out;
	}

	/*
	 * Skip authorization in two cases:
	 * - If the process running the unmount has ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT.
	 *   This entitlement allows non-root processes unmount volumes mounted by
	 *   other processes.
	 * - If the mount is tagged as permissive and this is not a forced-unmount
	 *   attempt.
	 */
	if (!IOCurrentTaskHasEntitlement(ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT) &&
	    (!(((mp->mnt_kern_flag & MNTK_PERMIT_UNMOUNT) != 0) && ((flags & MNT_FORCE) == 0)))) {
		/*
		 * Only root, or the user that did the original mount is
		 * permitted to unmount this filesystem.
		 */
		if ((mp->mnt_vfsstat.f_owner != kauth_cred_getuid(kauth_cred_get())) &&
		    (error = suser(kauth_cred_get(), &p->p_acflag))) {
			goto out;
		}
	}
	/*
	 * Don't allow unmounting the root file system, or other volumes
	 * associated with it (for example, the associated VM or DATA mounts) .
	 */
	if ((mp->mnt_flag & MNT_ROOTFS) || (mp->mnt_kern_flag & MNTK_SYSTEM)) {
		if (!(mp->mnt_flag & MNT_ROOTFS)) {
			printf("attempt to unmount a system mount (%s), will return EBUSY\n",
			    mp->mnt_vfsstat.f_mntonname);
		}
		error = EBUSY; /* the root (or associated volumes) is always busy */
		goto out;
	}

	/*
	 * If the mount is providing the root filesystem's disk image
	 * (i.e. imageboot), don't allow unmounting
	 */
	if (mp->mnt_kern_flag & MNTK_BACKS_ROOT) {
		error = EBUSY;
		goto out;
	}

	/* dounmount() consumes the mount ref (withref == 1). */
	return dounmount(mp, flags, 1, ctx);

out:
	/* Rejection path: consume the caller's mount ref here. */
	mount_drop(mp, 0);
	return error;
}
2840
/*
 * Do the actual file system unmount.
 *
 * Parameters:
 *	mp	mount to unmount
 *	flags	MNT_FORCE / MNT_NOBLOCK / MNT_LNOSUB etc.
 *	withref	non-zero if the caller passed in a mount ref to consume
 *	ctx	caller's VFS context
 *
 * Sequence: mark the mount as unmounting (MNT_LUNMOUNT/MNTK_UNMOUNT),
 * optionally unmount submounts (forced), flush vnodes, call the
 * filesystem's VFS_UNMOUNT, close the backing device, detach from the
 * covered vnode and mount list, and finally tear the mount down (or
 * leave teardown to mount_dropcrossref()).  On failure the unmount
 * markings are rolled back and EBUSY/errno is returned.
 */
int
dounmount(struct mount *mp, int flags, int withref, vfs_context_t ctx)
{
	vnode_t coveredvp = (vnode_t)0;
	int error;
	int needwakeup = 0;
	int forcedunmount = 0;
	int lflags = 0;
	struct vnode *devvp = NULLVP;
#if CONFIG_TRIGGERS
	proc_t p = vfs_context_proc(ctx);
	int did_vflush = 0;
	int pflags_save = 0;
#endif /* CONFIG_TRIGGERS */

#if CONFIG_FSE
	if (!(flags & MNT_FORCE)) {
		fsevent_unmount(mp, ctx); /* has to come first! */
	}
#endif

	mount_lock(mp);

	/*
	 * If already an unmount in progress just return EBUSY.
	 * Even a forced unmount cannot override.
	 */
	if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
		if (withref != 0) {
			mount_drop(mp, 1);
		}
		mount_unlock(mp);
		return EBUSY;
	}

	if (flags & MNT_FORCE) {
		forcedunmount = 1;
		mp->mnt_lflag |= MNT_LFORCE;
	}

#if CONFIG_TRIGGERS
	/* Don't let this process hang on unresponsive remote filesystems. */
	if (flags & MNT_NOBLOCK && p != kernproc) {
		pflags_save = OSBitOrAtomic(P_NOREMOTEHANG, &p->p_flag);
	}
#endif

	mp->mnt_kern_flag |= MNTK_UNMOUNT;
	mp->mnt_lflag |= MNT_LUNMOUNT;
	mp->mnt_flag &= ~MNT_ASYNC;
	/*
	 * anyone currently in the fast path that
	 * trips over the cached rootvp will be
	 * dumped out and forced into the slow path
	 * to regenerate a new cached value
	 */
	mp->mnt_realrootvp = NULLVP;
	mount_unlock(mp);

	if (forcedunmount && (flags & MNT_LNOSUB) == 0) {
		/*
		 * Force unmount any mounts in this filesystem.
		 * If any unmounts fail - just leave them dangling.
		 * Avoids recursion.
		 */
		(void) dounmount_submounts(mp, flags | MNT_LNOSUB, ctx);
	}

	/*
	 * taking the name_cache_lock exclusively will
	 * insure that everyone is out of the fast path who
	 * might be trying to use a now stale copy of
	 * vp->v_mountedhere->mnt_realrootvp
	 * bumping mount_generation causes the cached values
	 * to be invalidated
	 */
	name_cache_lock();
	mount_generation++;
	name_cache_unlock();


	lck_rw_lock_exclusive(&mp->mnt_rwlock);
	if (withref != 0) {
		mount_drop(mp, 0);
	}
	error = 0;
	if (forcedunmount == 0) {
		ubc_umount(mp); /* release cached vnodes */
		if ((mp->mnt_flag & MNT_RDONLY) == 0) {
			/* Flush dirty data; a sync failure aborts a non-forced unmount. */
			error = VFS_SYNC(mp, MNT_WAIT, ctx);
			if (error) {
				mount_lock(mp);
				/* Roll back the unmount markings set above. */
				mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
				mp->mnt_lflag &= ~MNT_LUNMOUNT;
				mp->mnt_lflag &= ~MNT_LFORCE;
				goto out;
			}
		}
	}

	IOBSDMountChange(mp, kIOMountChangeUnmount);

#if CONFIG_TRIGGERS
	vfs_nested_trigger_unmounts(mp, flags, ctx);
	did_vflush = 1;
#endif
	if (forcedunmount) {
		lflags |= FORCECLOSE;
	}
	error = vflush(mp, NULLVP, SKIPSWAP | SKIPSYSTEM | SKIPROOT | lflags);
	if ((forcedunmount == 0) && error) {
		mount_lock(mp);
		/* Roll back the unmount markings set above. */
		mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
		mp->mnt_lflag &= ~MNT_LUNMOUNT;
		mp->mnt_lflag &= ~MNT_LFORCE;
		goto out;
	}

	/* make sure there are no one in the mount iterations or lookup */
	mount_iterdrain(mp);

	error = VFS_UNMOUNT(mp, flags, ctx);
	if (error) {
		mount_iterreset(mp);
		mount_lock(mp);
		/* Roll back the unmount markings set above. */
		mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
		mp->mnt_lflag &= ~MNT_LUNMOUNT;
		mp->mnt_lflag &= ~MNT_LFORCE;
		goto out;
	}

	/* increment the operations count */
	if (!error) {
		OSAddAtomic(1, &vfs_nummntops);
	}

	if (mp->mnt_devvp && mp->mnt_vtable->vfc_vfsflags & VFC_VFSLOCALARGS) {
		/* hold an io reference and drop the usecount before close */
		devvp = mp->mnt_devvp;
		vnode_getalways(devvp);
		vnode_rele(devvp);
		VNOP_CLOSE(devvp, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD | FWRITE,
		    ctx);
		vnode_clearmountedon(devvp);
		vnode_put(devvp);
	}
	lck_rw_done(&mp->mnt_rwlock);
	mount_list_remove(mp);
	lck_rw_lock_exclusive(&mp->mnt_rwlock);

	/* mark the mount point hook in the vp but not drop the ref yet */
	if ((coveredvp = mp->mnt_vnodecovered) != NULLVP) {
		/*
		 * The covered vnode needs special handling. Trying to get an
		 * iocount must not block here as this may lead to deadlocks
		 * if the Filesystem to which the covered vnode belongs is
		 * undergoing forced unmounts. Since we hold a usecount, the
		 * vnode cannot be reused (it can, however, still be terminated)
		 */
		vnode_getalways(coveredvp);
		vnode_lock_spin(coveredvp);

		/* crossref keeps mp alive until mount_dropcrossref() below. */
		mp->mnt_crossref++;
		coveredvp->v_mountedhere = (struct mount *)0;
		CLR(coveredvp->v_flag, VMOUNT | VMOUNTEDHERE);
		/* Wakeup waiter(s) waiting for in-progress mount to finish. */
		wakeup(&coveredvp->v_flag);
		vnode_unlock(coveredvp);
		vnode_put(coveredvp);
	}

	mount_list_lock();
	mp->mnt_vtable->vfc_refcount--;
	mount_list_unlock();

	cache_purgevfs(mp); /* remove cache entries for this file sys */
	vfs_event_signal(NULL, VQ_UNMOUNT, (intptr_t)NULL);
	mount_lock(mp);
	mp->mnt_lflag |= MNT_LDEAD;

	if (mp->mnt_lflag & MNT_LWAIT) {
		/*
		 * do the wakeup here
		 * in case we block in mount_refdrain
		 * which will drop the mount lock
		 * and allow anyone blocked in vfs_busy
		 * to wakeup and see the LDEAD state
		 */
		mp->mnt_lflag &= ~MNT_LWAIT;
		wakeup((caddr_t)mp);
	}
	mount_refdrain(mp);

	/* free disk_conditioner_info structure for this mount */
	disk_conditioner_unmount(mp);

out:
	/* Common exit: mount lock is held on both success and error paths. */
	if (mp->mnt_lflag & MNT_LWAIT) {
		mp->mnt_lflag &= ~MNT_LWAIT;
		needwakeup = 1;
	}

#if CONFIG_TRIGGERS
	if (flags & MNT_NOBLOCK && p != kernproc) {
		// Restore P_NOREMOTEHANG bit to its previous value
		if ((pflags_save & P_NOREMOTEHANG) == 0) {
			OSBitAndAtomic(~((uint32_t) P_NOREMOTEHANG), &p->p_flag);
		}
	}

	/*
	 * Callback and context are set together under the mount lock, and
	 * never cleared, so we're safe to examine them here, drop the lock,
	 * and call out.
	 */
	if (mp->mnt_triggercallback != NULL) {
		mount_unlock(mp);
		if (error == 0) {
			mp->mnt_triggercallback(mp, VTC_RELEASE, mp->mnt_triggerdata, ctx);
		} else if (did_vflush) {
			mp->mnt_triggercallback(mp, VTC_REPLACE, mp->mnt_triggerdata, ctx);
		}
	} else {
		mount_unlock(mp);
	}
#else
	mount_unlock(mp);
#endif /* CONFIG_TRIGGERS */

	lck_rw_done(&mp->mnt_rwlock);

	if (needwakeup) {
		wakeup((caddr_t)mp);
	}

	if (!error) {
		if ((coveredvp != NULLVP)) {
			vnode_t pvp = NULLVP;

			/*
			 * The covered vnode needs special handling. Trying to
			 * get an iocount must not block here as this may lead
			 * to deadlocks if the Filesystem to which the covered
			 * vnode belongs is undergoing forced unmounts. Since we
			 * hold a usecount, the vnode cannot be reused
			 * (it can, however, still be terminated).
			 */
			vnode_getalways(coveredvp);

			/* Drops the crossref taken above; may destroy mp. */
			mount_dropcrossref(mp, coveredvp, 0);
			/*
			 * We'll _try_ to detect if this really needs to be
			 * done. The coveredvp can only be in termination (or
			 * terminated) if the coveredvp's mount point is in a
			 * forced unmount (or has been) since we still hold the
			 * ref.
			 */
			if (!vnode_isrecycled(coveredvp)) {
				pvp = vnode_getparent(coveredvp);
#if CONFIG_TRIGGERS
				if (coveredvp->v_resolve) {
					vnode_trigger_rearm(coveredvp, ctx);
				}
#endif
			}

			vnode_rele(coveredvp);
			vnode_put(coveredvp);
			coveredvp = NULLVP;

			/* Notify watchers of the parent directory of the unmount. */
			if (pvp) {
				lock_vnode_and_post(pvp, NOTE_WRITE);
				vnode_put(pvp);
			}
		} else if (mp->mnt_flag & MNT_ROOTFS) {
			/* Root mount has no covered vnode: destroy it directly. */
			if (nc_smr_enabled) {
				vfs_smr_synchronize();
			}

			mount_lock_destroy(mp);
#if CONFIG_MACF
			mac_mount_label_destroy(mp);
#endif
			zfree(mount_zone, mp);
		} else {
			panic("dounmount: no coveredvp");
		}
	}
	return error;
}
3133
/*
 * Unmount any mounts in this filesystem.
 *
 * Builds a list of fsids of every mount that (transitively) sits on top
 * of 'mp', then unmounts them in reverse (deepest-first) order.  Errors
 * are deliberately ignored; failed submount unmounts are left dangling.
 */
void
dounmount_submounts(struct mount *mp, int flags, vfs_context_t ctx)
{
	mount_t smp;
	fsid_t *fsids, fsid;
	int fsids_sz;
	/* m indexes the last fsid collected in fsids[] */
	int count = 0, i, m = 0;
	vnode_t vp;

	mount_list_lock();

	// Get an array to hold the submounts fsids.
	TAILQ_FOREACH(smp, &mountlist, mnt_list)
	count++;
	fsids_sz = count * sizeof(fsid_t);
	/* Z_NOWAIT: we hold the mount list lock, so we must not block. */
	fsids = kalloc_data(fsids_sz, Z_NOWAIT);
	if (fsids == NULL) {
		mount_list_unlock();
		goto out;
	}
	fsids[0] = mp->mnt_vfsstat.f_fsid; // Prime the pump

	/*
	 * Fill the array with submount fsids.
	 * Since mounts are always added to the tail of the mount list, the
	 * list is always in mount order.
	 * For each mount check if the mounted-on vnode belongs to a
	 * mount that's already added to our array of mounts to be unmounted.
	 */
	for (smp = TAILQ_NEXT(mp, mnt_list); smp; smp = TAILQ_NEXT(smp, mnt_list)) {
		vp = smp->mnt_vnodecovered;
		if (vp == NULL) {
			continue;
		}
		fsid = vnode_mount(vp)->mnt_vfsstat.f_fsid; // Underlying fsid
		for (i = 0; i <= m; i++) {
			if (fsids[i].val[0] == fsid.val[0] &&
			    fsids[i].val[1] == fsid.val[1]) {
				/* smp sits on a collected mount: append it, growing the
				 * scan bound 'm' so deeper submounts are found too. */
				fsids[++m] = smp->mnt_vfsstat.f_fsid;
				break;
			}
		}
	}
	mount_list_unlock();

	// Unmount the submounts in reverse order. Ignore errors.
	for (i = m; i > 0; i--) {
		smp = mount_list_lookupby_fsid(&fsids[i], 0, 1);
		if (smp) {
			mount_ref(smp, 0);
			mount_iterdrop(smp);
			/* dounmount consumes the ref (withref == 1). */
			(void) dounmount(smp, flags, 1, ctx);
		}
	}
out:
	kfree_data(fsids, fsids_sz);
}
3194
/*
 * Drop one mnt_crossref reference on mp, held on behalf of vnode dp.
 *
 * When the count hits zero and mp is no longer mounted at dp (the unmount
 * already cleared v_mountedhere), this performs the final teardown and
 * frees the mount structure.
 *
 * Parameters:	mp		mount whose crossref count is dropped
 *		dp		vnode associated with the crossref
 *		need_put	nonzero if the caller's iocount on dp should
 *				be released here as well
 */
void
mount_dropcrossref(mount_t mp, vnode_t dp, int need_put)
{
	/* Hold dp so it cannot be freed out from under the vnode lock. */
	vnode_hold(dp);
	vnode_lock(dp);
	mp->mnt_crossref--;

	if (mp->mnt_crossref < 0) {
		panic("mount cross refs -ve");
	}

	/*
	 * Last crossref and mp is no longer mounted at dp: destroy it.
	 * dp's lock is released before the (possibly blocking) teardown.
	 */
	if ((mp != dp->v_mountedhere) && (mp->mnt_crossref == 0)) {
		if (need_put) {
			vnode_put_locked(dp);
		}
		vnode_drop_and_unlock(dp);

		/* NOTE(review): presumably quiesces SMR readers (name cache)
		 * before freeing mp — confirm against vfs_smr_synchronize(). */
		if (nc_smr_enabled) {
			vfs_smr_synchronize();
		}

		mount_lock_destroy(mp);
#if CONFIG_MACF
		mac_mount_label_destroy(mp);
#endif
		zfree(mount_zone, mp);
		return;
	}
	if (need_put) {
		vnode_put_locked(dp);
	}
	vnode_drop_and_unlock(dp);
}
3228
3229
/*
 * Sync each mounted filesystem.
 */
#if DIAGNOSTIC
int syncprt = 0;        /* when set, sync() also dumps buffer statistics */
#endif

int print_vmpage_stat = 0;      /* when set, sync paths dump VM dirty-page counts */
3238
3239 /*
3240 * sync_callback: simple wrapper that calls VFS_SYNC() on volumes
3241 * mounted read-write with the passed waitfor value.
3242 *
3243 * Parameters: mp mount-point descriptor per mounted file-system instance.
3244 * arg user argument (please see below)
3245 *
3246 * User argument is a pointer to 32 bit unsigned integer which describes the
3247 * type of waitfor value to set for calling VFS_SYNC(). If user argument is
3248 * passed as NULL, VFS_SYNC() is called with MNT_NOWAIT set as the default
3249 * waitfor value.
3250 *
3251 * Returns: VFS_RETURNED
3252 */
3253 static int
sync_callback(mount_t mp,void * arg)3254 sync_callback(mount_t mp, void *arg)
3255 {
3256 if ((mp->mnt_flag & MNT_RDONLY) == 0) {
3257 int asyncflag = mp->mnt_flag & MNT_ASYNC;
3258 unsigned waitfor = MNT_NOWAIT;
3259
3260 if (arg) {
3261 waitfor = *(uint32_t*)arg;
3262 }
3263
3264 /* Sanity check for flags - these are the only valid combinations for the flag bits*/
3265 if (waitfor != MNT_WAIT &&
3266 waitfor != (MNT_WAIT | MNT_VOLUME) &&
3267 waitfor != MNT_NOWAIT &&
3268 waitfor != (MNT_NOWAIT | MNT_VOLUME) &&
3269 waitfor != MNT_DWAIT &&
3270 waitfor != (MNT_DWAIT | MNT_VOLUME)) {
3271 panic("Passed inappropriate waitfor %u to "
3272 "sync_callback()", waitfor);
3273 }
3274
3275 mp->mnt_flag &= ~MNT_ASYNC;
3276 (void)VFS_SYNC(mp, waitfor, vfs_context_kernel());
3277 if (asyncflag) {
3278 mp->mnt_flag |= MNT_ASYNC;
3279 }
3280 }
3281
3282 return VFS_RETURNED;
3283 }
3284
3285 /* ARGSUSED */
int
sync(__unused proc_t p, __unused struct sync_args *uap, __unused int32_t *retval)
{
	/*
	 * sync(2): flush every mounted read-write filesystem.  A NULL arg
	 * makes sync_callback() use MNT_NOWAIT (asynchronous flush).
	 */
	vfs_iterate(LK_NOWAIT, sync_callback, NULL);

	/* Optional debug accounting. */
	if (print_vmpage_stat) {
		vm_countdirtypages();
	}

#if DIAGNOSTIC
	if (syncprt) {
		vfs_bufstats();
	}
#endif /* DIAGNOSTIC */
	return 0;
}
3302
/*
 * Media classes used by sync_thread() to order its work: reliable
 * (local, non-virtual) volumes are flushed before unreliable ones.
 */
typedef enum {
	SYNC_ALL = 0,                   /* no filtering: sync every volume */
	SYNC_ONLY_RELIABLE_MEDIA = 1,   /* local volumes not on a virtual device */
	SYNC_ONLY_UNRELIABLE_MEDIA = 2  /* everything else */
} sync_type_t;
3308
3309 static int
sync_internal_callback(mount_t mp,void * arg)3310 sync_internal_callback(mount_t mp, void *arg)
3311 {
3312 if (arg) {
3313 int is_reliable = !(mp->mnt_kern_flag & MNTK_VIRTUALDEV) &&
3314 (mp->mnt_flag & MNT_LOCAL);
3315 sync_type_t sync_type = *((sync_type_t *)arg);
3316
3317 if ((sync_type == SYNC_ONLY_RELIABLE_MEDIA) && !is_reliable) {
3318 return VFS_RETURNED;
3319 } else if ((sync_type == SYNC_ONLY_UNRELIABLE_MEDIA) && is_reliable) {
3320 return VFS_RETURNED;
3321 }
3322 }
3323
3324 (void)sync_callback(mp, NULL);
3325
3326 return VFS_RETURNED;
3327 }
3328
int sync_thread_state = 0;      /* SYNC_THREAD_* bits; protected by sync_mtx_lck */
int sync_timeout_seconds = 5;   /* max time sync_internal() waits for the sync thread */

#define SYNC_THREAD_RUN         0x0001  /* more sync work has been queued */
#define SYNC_THREAD_RUNNING     0x0002  /* a sync_thread instance is alive */

#if CONFIG_PHYS_WRITE_ACCT
thread_t pm_sync_thread;        /* identity of the running sync thread */
#endif /* CONFIG_PHYS_WRITE_ACCT */
3338
/*
 * Body of the kernel thread started by sync_internal().  Drains queued
 * sync requests (SYNC_THREAD_RUN), flushing reliable media before
 * unreliable media on each pass, then clears SYNC_THREAD_RUNNING and
 * wakes any waiter before exiting.
 */
static void
sync_thread(__unused void *arg, __unused wait_result_t wr)
{
	sync_type_t sync_type;
#if CONFIG_PHYS_WRITE_ACCT
	pm_sync_thread = current_thread();
#endif /* CONFIG_PHYS_WRITE_ACCT */

	lck_mtx_lock(&sync_mtx_lck);
	/* Keep draining while new requests keep setting SYNC_THREAD_RUN. */
	while (sync_thread_state & SYNC_THREAD_RUN) {
		sync_thread_state &= ~SYNC_THREAD_RUN;
		lck_mtx_unlock(&sync_mtx_lck);

		/* Reliable (local, non-virtual) media first, then the rest. */
		sync_type = SYNC_ONLY_RELIABLE_MEDIA;
		vfs_iterate(LK_NOWAIT, sync_internal_callback, &sync_type);
		sync_type = SYNC_ONLY_UNRELIABLE_MEDIA;
		vfs_iterate(LK_NOWAIT, sync_internal_callback, &sync_type);

		lck_mtx_lock(&sync_mtx_lck);
	}
	/*
	 * This wakeup _has_ to be issued before the lock is released otherwise
	 * we may end up waking up a thread in sync_internal which is
	 * expecting a wakeup from a thread it just created and not from this
	 * thread which is about to exit.
	 */
	wakeup(&sync_thread_state);
	sync_thread_state &= ~SYNC_THREAD_RUNNING;
#if CONFIG_PHYS_WRITE_ACCT
	pm_sync_thread = NULL;
#endif /* CONFIG_PHYS_WRITE_ACCT */
	lck_mtx_unlock(&sync_mtx_lck);

	/* Optional debug accounting, mirroring sync(2). */
	if (print_vmpage_stat) {
		vm_countdirtypages();
	}

#if DIAGNOSTIC
	if (syncprt) {
		vfs_bufstats();
	}
#endif /* DIAGNOSTIC */
}
3382
/* Rate-limits the "sync timed out" message in sync_internal(). */
struct timeval sync_timeout_last_print = {.tv_sec = 0, .tv_usec = 0};
3384
3385 /*
3386 * An in-kernel sync for power management to call.
3387 * This function always returns within sync_timeout seconds.
3388 */
__private_extern__ int
sync_internal(void)
{
	thread_t thd = NULL;
	int error;
	int thread_created = FALSE;
	struct timespec ts = {.tv_sec = sync_timeout_seconds, .tv_nsec = 0};

	lck_mtx_lock(&sync_mtx_lck);
	/* Queue work for the sync thread; start one if none is running. */
	sync_thread_state |= SYNC_THREAD_RUN;
	if (!(sync_thread_state & SYNC_THREAD_RUNNING)) {
		int kr;

		sync_thread_state |= SYNC_THREAD_RUNNING;
		kr = kernel_thread_start(sync_thread, NULL, &thd);
		if (kr != KERN_SUCCESS) {
			sync_thread_state &= ~SYNC_THREAD_RUNNING;
			lck_mtx_unlock(&sync_mtx_lck);
			printf("sync_thread failed\n");
			/* Best-effort: still report success to the caller. */
			return 0;
		}
		thread_created = TRUE;
	}

	/*
	 * Wait (at most sync_timeout_seconds) for sync_thread's wakeup.
	 * PDROP releases sync_mtx_lck when msleep returns.
	 */
	error = msleep((caddr_t)&sync_thread_state, &sync_mtx_lck,
	    (PVFS | PDROP | PCATCH), "sync_thread", &ts);
	if (error) {
		struct timeval now;

		/* Rate-limit the timeout message to once per 120 seconds. */
		microtime(&now);
		if (now.tv_sec - sync_timeout_last_print.tv_sec > 120) {
			printf("sync timed out: %d sec\n", sync_timeout_seconds);
			sync_timeout_last_print.tv_sec = now.tv_sec;
		}
	}

	if (thread_created) {
		thread_deallocate(thd);
	}

	return 0;
} /* end of sync_internal call */
3431
3432 /*
3433 * Change filesystem quotas.
3434 */
3435 #if QUOTA
/*
 * quotactl(2): manipulate filesystem quotas.
 *
 * Resolves uap->path to its mount, performs the command-specific copyin,
 * dispatches to the filesystem via VFS_QUOTACTL(), then performs the
 * command-specific copyout/cleanup.
 *
 * Returns:	0		Success
 *		namei:???
 *		copyin/copyinstr/copyout:EFAULT
 *		VFS_QUOTACTL:???
 */
int
quotactl(proc_t p, struct quotactl_args *uap, __unused int32_t *retval)
{
	struct mount *mp;
	int error, quota_cmd, quota_status = 0;
	caddr_t datap;
	size_t fnamelen;
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();
	struct dqblk my_dqblk = {};

	AUDIT_ARG(uid, uap->uid);
	AUDIT_ARG(cmd, uap->cmd);
	NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
	    uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	/* Hold a mount reference; the looked-up vnode is no longer needed. */
	mp = nd.ni_vp->v_mount;
	mount_ref(mp, 0);
	vnode_put(nd.ni_vp);
	nameidone(&nd);

#if CONFIG_MACF
	error = mac_mount_check_quotactl(ctx, mp, uap->cmd, uap->uid);
	if (error != 0) {
		goto out;
	}
#endif

	/* copyin any data we will need for downstream code */
	quota_cmd = uap->cmd >> SUBCMDSHIFT;

	switch (quota_cmd) {
	case Q_QUOTAON:
		/* uap->arg specifies a file from which to take the quotas */
		fnamelen = MAXPATHLEN;
		datap = zalloc(ZV_NAMEI);
		error = copyinstr(uap->arg, datap, MAXPATHLEN, &fnamelen);
		break;
	case Q_GETQUOTA:
		/* uap->arg is a pointer to a dqblk structure. */
		datap = (caddr_t) &my_dqblk;
		break;
	case Q_SETQUOTA:
	case Q_SETUSE:
		/* uap->arg is a pointer to a dqblk structure. */
		datap = (caddr_t) &my_dqblk;
		if (proc_is64bit(p)) {
			/* 64-bit callers pass a user_dqblk; munge it down. */
			struct user_dqblk my_dqblk64;
			error = copyin(uap->arg, (caddr_t)&my_dqblk64, sizeof(my_dqblk64));
			if (error == 0) {
				munge_dqblk(&my_dqblk, &my_dqblk64, FALSE);
			}
		} else {
			error = copyin(uap->arg, (caddr_t)&my_dqblk, sizeof(my_dqblk));
		}
		break;
	case Q_QUOTASTAT:
		/* uap->arg is a pointer to an integer */
		datap = (caddr_t) &quota_status;
		break;
	default:
		datap = NULL;
		break;
	} /* switch */

	/* Only dispatch if the command-specific copyin (if any) succeeded. */
	if (error == 0) {
		error = VFS_QUOTACTL(mp, uap->cmd, uap->uid, datap, ctx);
	}

	switch (quota_cmd) {
	case Q_QUOTAON:
		/* Free the pathname buffer even when VFS_QUOTACTL failed. */
		if (datap != NULL) {
			zfree(ZV_NAMEI, datap);
		}
		break;
	case Q_GETQUOTA:
		/* uap->arg is a pointer to a dqblk structure we need to copy out to */
		if (error == 0) {
			if (proc_is64bit(p)) {
				struct user_dqblk my_dqblk64;

				memset(&my_dqblk64, 0, sizeof(my_dqblk64));
				munge_dqblk(&my_dqblk, &my_dqblk64, TRUE);
				error = copyout((caddr_t)&my_dqblk64, uap->arg, sizeof(my_dqblk64));
			} else {
				error = copyout(datap, uap->arg, sizeof(struct dqblk));
			}
		}
		break;
	case Q_QUOTASTAT:
		/* uap->arg is a pointer to an integer */
		if (error == 0) {
			error = copyout(datap, uap->arg, sizeof(quota_status));
		}
		break;
	default:
		break;
	} /* switch */

out:
	mount_drop(mp, 0);
	return error;
}
3542 #else
int
quotactl(__unused proc_t p, __unused struct quotactl_args *uap, __unused int32_t *retval)
{
	/* Quota support compiled out of this kernel. */
	return EOPNOTSUPP;
}
3548 #endif /* QUOTA */
3549
3550 static int
statfs_internal(proc_t p,struct mount * mp,user_addr_t bufp)3551 statfs_internal(proc_t p, struct mount *mp, user_addr_t bufp)
3552 {
3553 int error;
3554 vfs_context_t ctx = vfs_context_current();
3555
3556 #if CONFIG_MACF
3557 error = mac_mount_check_stat(ctx, mp);
3558 if (error != 0) {
3559 return error;
3560 }
3561 #endif
3562
3563 error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT);
3564 if (error != 0) {
3565 return error;
3566 }
3567
3568 return munge_statfs(mp, &mp->mnt_vfsstat, bufp, NULL, IS_64BIT_PROCESS(p), TRUE);
3569 }
3570
3571 /*
3572 * Get filesystem statistics.
3573 *
3574 * Returns: 0 Success
3575 * namei:???
3576 * vfs_update_vfsstat:???
3577 * munge_statfs:EFAULT
3578 */
3579 /* ARGSUSED */
3580 int
statfs(proc_t p,struct statfs_args * uap,__unused int32_t * retval)3581 statfs(proc_t p, struct statfs_args *uap, __unused int32_t *retval)
3582 {
3583 int error;
3584 struct mount *mp;
3585 struct nameidata nd;
3586 vfs_context_t ctx = vfs_context_current();
3587 vnode_t vp;
3588
3589 NDINIT(&nd, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
3590 UIO_USERSPACE, uap->path, ctx);
3591 error = namei(&nd);
3592 if (error != 0) {
3593 return error;
3594 }
3595 vp = nd.ni_vp;
3596 mp = vp->v_mount;
3597 nameidone(&nd);
3598
3599 error = statfs_internal(p, mp, uap->buf);
3600 vnode_put(vp);
3601
3602 return error;
3603 }
3604
3605 /*
3606 * Get filesystem statistics.
3607 */
3608 /* ARGSUSED */
int
fstatfs(proc_t p, struct fstatfs_args *uap, __unused int32_t *retval)
{
	int error;
	vnode_t vp = NULL;
	struct mount *mp;

	AUDIT_ARG(fd, uap->fd);

	/* Take an fd reference, then an iocount on its vnode. */
	if ((error = file_vnode(uap->fd, &vp)) ||
	    (error = vnode_getwithref(vp))) {
		goto out;
	}

	AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);

	mp = vp->v_mount;
	if (!mp) {
		error = EBADF;
		goto out_vnode;
	}

	error = statfs_internal(p, mp, uap->buf);

out_vnode:
	vnode_put(vp);

out:
	/*
	 * vp != NULL means file_vnode() succeeded, so an fd reference is
	 * held and must be dropped — even when vnode_getwithref() failed
	 * (that path skips out_vnode because no iocount was obtained).
	 */
	if (vp != NULL) {
		file_drop(uap->fd);
	}

	return error;
}
3643
3644 void
vfs_get_statfs64(struct mount * mp,struct statfs64 * sfs)3645 vfs_get_statfs64(struct mount *mp, struct statfs64 *sfs)
3646 {
3647 struct vfsstatfs *vsfs = &mp->mnt_vfsstat;
3648
3649 bzero(sfs, sizeof(*sfs));
3650
3651 sfs->f_bsize = vsfs->f_bsize;
3652 sfs->f_iosize = (int32_t)vsfs->f_iosize;
3653 sfs->f_blocks = vsfs->f_blocks;
3654 sfs->f_bfree = vsfs->f_bfree;
3655 sfs->f_bavail = vsfs->f_bavail;
3656 sfs->f_files = vsfs->f_files;
3657 sfs->f_ffree = vsfs->f_ffree;
3658 sfs->f_fsid = vsfs->f_fsid;
3659 sfs->f_owner = vsfs->f_owner;
3660 sfs->f_type = mp->mnt_vtable->vfc_typenum;
3661 sfs->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
3662 sfs->f_fssubtype = vsfs->f_fssubtype;
3663 sfs->f_flags_ext = vfs_getextflags(mp);
3664 vfs_getfstypename(mp, sfs->f_fstypename, MFSTYPENAMELEN);
3665 strlcpy(&sfs->f_mntonname[0], &vsfs->f_mntonname[0], MAXPATHLEN);
3666 strlcpy(&sfs->f_mntfromname[0], &vsfs->f_mntfromname[0], MAXPATHLEN);
3667 }
3668
3669 /*
3670 * Get file system statistics in 64-bit mode
3671 */
3672 int
statfs64(__unused struct proc * p,struct statfs64_args * uap,__unused int32_t * retval)3673 statfs64(__unused struct proc *p, struct statfs64_args *uap, __unused int32_t *retval)
3674 {
3675 struct mount *mp;
3676 int error;
3677 struct nameidata *ndp;
3678 struct statfs64 *sfsp;
3679 vfs_context_t ctxp = vfs_context_current();
3680 vnode_t vp;
3681 struct {
3682 struct nameidata nd;
3683 struct statfs64 sfs;
3684 } *__nameidata_statfs64;
3685
3686 __nameidata_statfs64 = kalloc_type(typeof(*__nameidata_statfs64),
3687 Z_WAITOK);
3688 ndp = &__nameidata_statfs64->nd;
3689
3690 NDINIT(ndp, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
3691 UIO_USERSPACE, uap->path, ctxp);
3692 error = namei(ndp);
3693 if (error != 0) {
3694 goto out;
3695 }
3696 vp = ndp->ni_vp;
3697 mp = vp->v_mount;
3698 nameidone(ndp);
3699
3700 #if CONFIG_MACF
3701 error = mac_mount_check_stat(ctxp, mp);
3702 if (error != 0) {
3703 vnode_put(vp);
3704 goto out;
3705 }
3706 #endif
3707
3708 error = vfs_update_vfsstat(mp, ctxp, VFS_USER_EVENT);
3709 if (error != 0) {
3710 vnode_put(vp);
3711 goto out;
3712 }
3713
3714 sfsp = &__nameidata_statfs64->sfs;
3715 vfs_get_statfs64(mp, sfsp);
3716 if ((mp->mnt_kern_flag & MNTK_SYSTEMDATA) &&
3717 (p->p_vfs_iopolicy & P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME)) {
3718 /* This process does not want to see a seperate data volume mountpoint */
3719 strlcpy(&sfsp->f_mntonname[0], "/", sizeof("/"));
3720 }
3721 error = copyout(sfsp, uap->buf, sizeof(*sfsp));
3722 vnode_put(vp);
3723
3724 out:
3725 kfree_type(typeof(*__nameidata_statfs64), __nameidata_statfs64);
3726
3727 return error;
3728 }
3729
3730 /*
3731 * Get file system statistics in 64-bit mode
3732 */
3733 int
fstatfs64(__unused struct proc * p,struct fstatfs64_args * uap,__unused int32_t * retval)3734 fstatfs64(__unused struct proc *p, struct fstatfs64_args *uap, __unused int32_t *retval)
3735 {
3736 struct vnode *vp;
3737 struct mount *mp;
3738 struct statfs64 sfs;
3739 int error;
3740
3741 AUDIT_ARG(fd, uap->fd);
3742
3743 if ((error = file_vnode(uap->fd, &vp))) {
3744 return error;
3745 }
3746
3747 error = vnode_getwithref(vp);
3748 if (error) {
3749 file_drop(uap->fd);
3750 return error;
3751 }
3752
3753 AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
3754
3755 mp = vp->v_mount;
3756 if (!mp) {
3757 error = EBADF;
3758 goto out;
3759 }
3760
3761 #if CONFIG_MACF
3762 error = mac_mount_check_stat(vfs_context_current(), mp);
3763 if (error != 0) {
3764 goto out;
3765 }
3766 #endif
3767
3768 if ((error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)) != 0) {
3769 goto out;
3770 }
3771
3772 vfs_get_statfs64(mp, &sfs);
3773 if ((mp->mnt_kern_flag & MNTK_SYSTEMDATA) &&
3774 (p->p_vfs_iopolicy & P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME)) {
3775 /* This process does not want to see a seperate data volume mountpoint */
3776 strlcpy(&sfs.f_mntonname[0], "/", sizeof("/"));
3777 }
3778 error = copyout(&sfs, uap->buf, sizeof(sfs));
3779
3780 out:
3781 file_drop(uap->fd);
3782 vnode_put(vp);
3783
3784 return error;
3785 }
3786
/*
 * State shared between getfsstat()/__mac_getfsstat()/getfsstat64() and
 * their vfs_iterate() callbacks.
 */
struct getfsstat_struct {
	user_addr_t sfsp;       /* user buffer cursor; advanced after each copyout */
	user_addr_t *mp;        /* optional array of user MAC-label buffers, or NULL */
	int count;              /* number of mounts visited so far */
	int maxcount;           /* capacity of the user buffer, in entries */
	int flags;              /* caller's MNT_WAIT/MNT_NOWAIT/MNT_DWAIT flags */
	int error;              /* first error hit by the callback (stops iteration) */
};
3795
3796
/*
 * vfs_iterate() callback for getfsstat()/__mac_getfsstat(): copy one
 * mount's statistics (and optionally its MAC label) out to user space.
 * Returns VFS_RETURNED_DONE to stop iterating once an error is recorded.
 */
static int
getfsstat_callback(mount_t mp, void * arg)
{
	struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
	struct vfsstatfs *sp;
	int error, my_size;
	vfs_context_t ctx = vfs_context_current();

	/* Only copy out while there is room; counting continues regardless. */
	if (fstp->sfsp && fstp->count < fstp->maxcount) {
#if CONFIG_MACF
		error = mac_mount_check_stat(ctx, mp);
		if (error != 0) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
#endif
		sp = &mp->mnt_vfsstat;
		/*
		 * If MNT_NOWAIT is specified, do not refresh the
		 * fsstat cache. MNT_WAIT/MNT_DWAIT overrides MNT_NOWAIT.
		 */
		if ((mp->mnt_lflag & MNT_LDEAD) ||
		    (((fstp->flags & MNT_NOWAIT) == 0 || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
		    (!(mp->mnt_lflag & MNT_LUNMOUNT)) &&
		    (error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT)))) {
			/* Dead mount or stat refresh failed: skip this entry. */
			KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
			return VFS_RETURNED;
		}

		/*
		 * Need to handle LP64 version of struct statfs
		 */
		error = munge_statfs(mp, sp, fstp->sfsp, &my_size, IS_64BIT_PROCESS(vfs_context_proc(ctx)), FALSE);
		if (error) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
		/* Advance by the size munge_statfs actually wrote. */
		fstp->sfsp += my_size;

		if (fstp->mp) {
#if CONFIG_MACF
			error = mac_mount_label_get(mp, *fstp->mp);
			if (error) {
				fstp->error = error;
				return VFS_RETURNED_DONE;
			}
#endif
			fstp->mp++;
		}
	}
	fstp->count++;
	return VFS_RETURNED;
}
3850
3851 /*
3852 * Get statistics on all filesystems.
3853 */
3854 int
getfsstat(__unused proc_t p,struct getfsstat_args * uap,int * retval)3855 getfsstat(__unused proc_t p, struct getfsstat_args *uap, int *retval)
3856 {
3857 struct __mac_getfsstat_args muap;
3858
3859 muap.buf = uap->buf;
3860 muap.bufsize = uap->bufsize;
3861 muap.mac = USER_ADDR_NULL;
3862 muap.macsize = 0;
3863 muap.flags = uap->flags;
3864
3865 return __mac_getfsstat(p, &muap, retval);
3866 }
3867
3868 /*
3869 * __mac_getfsstat: Get MAC-related file system statistics
3870 *
3871 * Parameters: p (ignored)
3872 * uap User argument descriptor (see below)
3873 * retval Count of file system statistics (N stats)
3874 *
3875 * Indirect: uap->bufsize Buffer size
3876 * uap->macsize MAC info size
3877 * uap->buf Buffer where information will be returned
3878 * uap->mac MAC info
3879 * uap->flags File system flags
3880 *
3881 *
3882 * Returns: 0 Success
3883 * !0 Not success
3884 *
3885 */
int
__mac_getfsstat(__unused proc_t p, struct __mac_getfsstat_args *uap, int *retval)
{
	user_addr_t sfsp;
	user_addr_t *mp;
	size_t count, maxcount, bufsize, macsize;
	struct getfsstat_struct fst;

	/*
	 * NOTE(review): the (unsigned) casts truncate to 32 bits before the
	 * INT_MAX comparison — presumably bufsize/macsize are 32-bit-sized
	 * user values here; confirm against the syscall argument types.
	 */
	if ((unsigned)uap->bufsize > INT_MAX || (unsigned)uap->macsize > INT_MAX) {
		return EINVAL;
	}

	bufsize = (size_t) uap->bufsize;
	macsize = (size_t) uap->macsize;

	/* Capacity of the user buffer in per-ABI statfs entries. */
	if (IS_64BIT_PROCESS(p)) {
		maxcount = bufsize / sizeof(struct user64_statfs);
	} else {
		maxcount = bufsize / sizeof(struct user32_statfs);
	}
	sfsp = uap->buf;
	count = 0;

	mp = NULL;

#if CONFIG_MACF
	if (uap->mac != USER_ADDR_NULL) {
		u_int32_t *mp0;
		int error;
		unsigned int i;

		/* The label array must have exactly one slot per statfs slot. */
		count = (macsize / (IS_64BIT_PROCESS(p) ? 8 : 4));
		if (count != maxcount) {
			return EINVAL;
		}

		/* Copy in the array */
		mp0 = kalloc_data(macsize, Z_WAITOK);
		if (mp0 == NULL) {
			return ENOMEM;
		}

		error = copyin(uap->mac, mp0, macsize);
		if (error) {
			kfree_data(mp0, macsize);
			return error;
		}

		/* Normalize to an array of user_addr_t */
		mp = kalloc_data(count * sizeof(user_addr_t), Z_WAITOK);
		if (mp == NULL) {
			kfree_data(mp0, macsize);
			return ENOMEM;
		}

		for (i = 0; i < count; i++) {
			if (IS_64BIT_PROCESS(p)) {
				mp[i] = ((user_addr_t *)mp0)[i];
			} else {
				mp[i] = (user_addr_t)mp0[i];
			}
		}
		kfree_data(mp0, macsize);
	}
#endif


	fst.sfsp = sfsp;
	fst.mp = mp;
	fst.flags = uap->flags;
	fst.count = 0;
	fst.error = 0;
	fst.maxcount = (int)maxcount;


	/* Walk every mount, including those mid-unmount, copying stats out. */
	vfs_iterate(VFS_ITERATE_NOSKIP_UNMOUNT, getfsstat_callback, &fst);

	if (mp) {
		kfree_data(mp, count * sizeof(user_addr_t));
	}

	if (fst.error) {
		KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
		return fst.error;
	}

	/*
	 * When the buffer filled up, report its capacity; otherwise report
	 * the total number of mounts seen.
	 */
	if (fst.sfsp && fst.count > fst.maxcount) {
		*retval = fst.maxcount;
	} else {
		*retval = fst.count;
	}
	return 0;
}
3979
/*
 * vfs_iterate() callback for getfsstat64(): copy one mount's statistics
 * out to user space as a struct statfs64.
 */
static int
getfsstat64_callback(mount_t mp, void * arg)
{
	struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
	struct vfsstatfs *sp;
	struct statfs64 sfs;
	int error;

	/* Only copy out while there is room; counting continues regardless. */
	if (fstp->sfsp && fstp->count < fstp->maxcount) {
#if CONFIG_MACF
		error = mac_mount_check_stat(vfs_context_current(), mp);
		if (error != 0) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
#endif
		sp = &mp->mnt_vfsstat;
		/*
		 * If MNT_NOWAIT is specified, do not refresh the fsstat
		 * cache. MNT_WAIT overrides MNT_NOWAIT.
		 *
		 * We treat MNT_DWAIT as MNT_WAIT for all instances of
		 * getfsstat, since the constants are out of the same
		 * namespace.
		 */
		if ((mp->mnt_lflag & MNT_LDEAD) ||
		    ((((fstp->flags & MNT_NOWAIT) == 0) || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
		    (!(mp->mnt_lflag & MNT_LUNMOUNT)) &&
		    (error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)))) {
			/* Dead mount or stat refresh failed: skip this entry. */
			KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
			return VFS_RETURNED;
		}

		vfs_get_statfs64(mp, &sfs);
		error = copyout(&sfs, fstp->sfsp, sizeof(sfs));
		if (error) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
		fstp->sfsp += sizeof(sfs);
	}
	fstp->count++;
	return VFS_RETURNED;
}
4024
4025 /*
4026 * Get statistics on all file systems in 64 bit mode.
4027 */
4028 int
getfsstat64(__unused proc_t p,struct getfsstat64_args * uap,int * retval)4029 getfsstat64(__unused proc_t p, struct getfsstat64_args *uap, int *retval)
4030 {
4031 user_addr_t sfsp;
4032 int count, maxcount;
4033 struct getfsstat_struct fst;
4034
4035 maxcount = uap->bufsize / sizeof(struct statfs64);
4036
4037 sfsp = uap->buf;
4038 count = 0;
4039
4040 fst.sfsp = sfsp;
4041 fst.flags = uap->flags;
4042 fst.count = 0;
4043 fst.error = 0;
4044 fst.maxcount = maxcount;
4045
4046 vfs_iterate(VFS_ITERATE_NOSKIP_UNMOUNT, getfsstat64_callback, &fst);
4047
4048 if (fst.error) {
4049 KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
4050 return fst.error;
4051 }
4052
4053 if (fst.sfsp && fst.count > fst.maxcount) {
4054 *retval = fst.maxcount;
4055 } else {
4056 *retval = fst.count;
4057 }
4058
4059 return 0;
4060 }
4061
4062 /*
4063 * gets the associated vnode with the file descriptor passed.
4064 * as input
4065 *
4066 * INPUT
4067 * ctx - vfs context of caller
4068 * fd - file descriptor for which vnode is required.
4069 * vpp - Pointer to pointer to vnode to be returned.
4070 *
4071 * The vnode is returned with an iocount so any vnode obtained
4072 * by this call needs a vnode_put
4073 *
4074 */
4075 int
vnode_getfromfd(vfs_context_t ctx,int fd,vnode_t * vpp)4076 vnode_getfromfd(vfs_context_t ctx, int fd, vnode_t *vpp)
4077 {
4078 int error;
4079 vnode_t vp;
4080 struct fileproc *fp;
4081 proc_t p = vfs_context_proc(ctx);
4082
4083 *vpp = NULLVP;
4084
4085 error = fp_getfvp(p, fd, &fp, &vp);
4086 if (error) {
4087 return error;
4088 }
4089
4090 error = vnode_getwithref(vp);
4091 if (error) {
4092 (void)fp_drop(p, fd, fp, 0);
4093 return error;
4094 }
4095
4096 (void)fp_drop(p, fd, fp, 0);
4097 *vpp = vp;
4098 return error;
4099 }
4100
4101 /*
4102 * Wrapper function around namei to start lookup from a directory
4103 * specified by a file descriptor ni_dirfd.
4104 *
4105 * In addition to all the errors returned by namei, this call can
4106 * return ENOTDIR if the file descriptor does not refer to a directory.
4107 * and EBADF if the file descriptor is not valid.
4108 */
int
nameiat(struct nameidata *ndp, int dirfd)
{
	/*
	 * Only honor dirfd for fresh relative lookups; continued lookups
	 * (NAMEI_CONTLOOKUP) and callers that already supplied a starting
	 * directory (USEDVP) keep their existing starting point.
	 */
	if ((dirfd != AT_FDCWD) &&
	    !(ndp->ni_flag & NAMEI_CONTLOOKUP) &&
	    !(ndp->ni_cnd.cn_flags & USEDVP)) {
		int error = 0;
		char c;

		/* Peek at the first path byte to detect an absolute path. */
		if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
			error = copyin(ndp->ni_dirp, &c, sizeof(char));
			if (error) {
				return error;
			}
		} else {
			c = *((char *)(ndp->ni_dirp));
		}

		if (c != '/') {
			vnode_t dvp_at;

			error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
			    &dvp_at);
			if (error) {
				return error;
			}

			if (vnode_vtype(dvp_at) != VDIR) {
				vnode_put(dvp_at);
				return ENOTDIR;
			}

			/* Start the lookup at dirfd's vnode instead of the CWD. */
			ndp->ni_dvp = dvp_at;
			ndp->ni_cnd.cn_flags |= USEDVP;
			error = namei(ndp);
			ndp->ni_cnd.cn_flags &= ~USEDVP;
			vnode_put(dvp_at);
			return error;
		}
	}

	/* Absolute path, AT_FDCWD, or continued lookup: plain namei(). */
	return namei(ndp);
}
4152
4153 /*
4154 * Change current working directory to a given file descriptor.
4155 */
4156 /* ARGSUSED */
/*
 * Change the current working directory (per-process, or per-thread when
 * per_thread is true) to the directory open at file descriptor fd.
 *
 * per_thread && fd == -1 reverts the calling thread to the process-wide
 * CWD by clearing its per-thread directory.
 *
 * Returns:	0		Success
 *		EBADF		fd invalid / no per-thread CWD to revert
 *		ENOTDIR		fd does not refer to a directory
 *		EACCES		covering mount is busy
 *		ENOENT		no thread (per_thread case)
 */
int
fchdir(proc_t p, vfs_context_t ctx, int fd, bool per_thread)
{
	vnode_t vp;
	vnode_t tdp;
	vnode_t tvp;
	struct mount *mp;
	int error, should_put = 1;

	AUDIT_ARG(fd, fd);
	if (per_thread && fd == -1) {
		/*
		 * Switching back from per-thread to per process CWD; verify we
		 * in fact have one before proceeding. The only success case
		 * for this code path is to return 0 preemptively after zapping
		 * the thread structure contents.
		 */
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = NULLVP;
			if (tvp != NULLVP) {
				vnode_rele(tvp);
				return 0;
			}
		}
		return EBADF;
	}

	/* fd reference + iocount on the vnode for the duration of the checks. */
	if ((error = file_vnode(fd, &vp))) {
		return error;
	}
	if ((error = vnode_getwithref(vp))) {
		file_drop(fd);
		return error;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	if (vp->v_type != VDIR) {
		error = ENOTDIR;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_chdir(ctx, vp);
	if (error) {
		goto out;
	}
#endif
	/* The caller must be able to search the new directory. */
	error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
	if (error) {
		goto out;
	}

	/*
	 * If something is mounted on vp, descend to the root of the covering
	 * mount (repeatedly, for stacked mounts), swapping iocounts as we go.
	 */
	while (!error && (mp = vp->v_mountedhere) != NULL) {
		if (vfs_busy(mp, LK_NOWAIT)) {
			error = EACCES;
			goto out;
		}
		error = VFS_ROOT(mp, &tdp, ctx);
		vfs_unbusy(mp);
		if (error) {
			break;
		}
		vnode_put(vp);
		vp = tdp;
	}
	if (error) {
		goto out;
	}
	/* Trade the iocount for a long-lived usecount on the new CWD. */
	if ((error = vnode_ref(vp))) {
		goto out;
	}
	vnode_put(vp);
	should_put = 0;

	if (per_thread) {
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = vp;
			OSBitOrAtomic(P_THCWD, &p->p_flag);
		} else {
			vnode_rele(vp);
			error = ENOENT;
			goto out;
		}
	} else {
		/* Lock order: dirs lock, then fdlock, for lookup synchronization. */
		proc_dirs_lock_exclusive(p);
		proc_fdlock(p);
		tvp = p->p_fd.fd_cdir;
		p->p_fd.fd_cdir = vp;
		proc_fdunlock(p);
		proc_dirs_unlock_exclusive(p);
	}

	/* Release the usecount held by the previous CWD, if any. */
	if (tvp) {
		vnode_rele(tvp);
	}

out:
	if (should_put) {
		vnode_put(vp);
	}
	file_drop(fd);

	return error;
}
4268
int
sys_fchdir(proc_t p, struct fchdir_args *uap, __unused int32_t *retval)
{
	/* fchdir(2): process-wide chdir to the directory open at uap->fd. */
	return fchdir(p, vfs_context_current(), uap->fd, false);
}
4274
int
__pthread_fchdir(proc_t p, struct __pthread_fchdir_args *uap, __unused int32_t *retval)
{
	/*
	 * Per-thread chdir to the directory open at uap->fd; fd == -1
	 * reverts the calling thread to the process-wide CWD.
	 */
	return fchdir(p, vfs_context_current(), uap->fd, true);
}
4280
4281
4282 /*
4283 * Change current working directory (".").
4284 *
4285 * Returns: 0 Success
4286 * change_dir:ENOTDIR
4287 * change_dir:???
4288 * vnode_ref:ENOENT No such file or directory
4289 */
4290 /* ARGSUSED */
int
chdir_internal(proc_t p, vfs_context_t ctx, struct nameidata *ndp, int per_thread)
{
	int error;
	vnode_t tvp;

	/* Resolve and validate the target directory (iocount on ni_vp). */
	error = change_dir(ndp, ctx);
	if (error) {
		return error;
	}
	/* Trade the iocount for a long-lived usecount on the new CWD. */
	if ((error = vnode_ref(ndp->ni_vp))) {
		vnode_put(ndp->ni_vp);
		return error;
	}
	/*
	 * drop the iocount we picked up in change_dir
	 */
	vnode_put(ndp->ni_vp);

	if (per_thread) {
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = ndp->ni_vp;
			OSBitOrAtomic(P_THCWD, &p->p_flag);
		} else {
			/* No thread to attach the per-thread CWD to. */
			vnode_rele(ndp->ni_vp);
			return ENOENT;
		}
	} else {
		/* Lock order: dirs lock, then fdlock, for lookup synchronization. */
		proc_dirs_lock_exclusive(p);
		proc_fdlock(p);
		tvp = p->p_fd.fd_cdir;
		p->p_fd.fd_cdir = ndp->ni_vp;
		proc_fdunlock(p);
		proc_dirs_unlock_exclusive(p);
	}

	/* Release the usecount held by the previous CWD, if any. */
	if (tvp) {
		vnode_rele(tvp);
	}

	return 0;
}
4336
4337
4338 /*
4339 * Change current working directory (".").
4340 *
4341 * Returns: 0 Success
4342 * chdir_internal:ENOTDIR
4343 * chdir_internal:ENOENT No such file or directory
4344 * chdir_internal:???
4345 */
4346 /* ARGSUSED */
static int
common_chdir(proc_t p, struct chdir_args *uap, int per_thread)
{
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();

	/*
	 * NOTE(review): __pthread_chdir passes its own args struct in here
	 * through a void * cast; this relies on struct __pthread_chdir_args
	 * laying out `path` identically to struct chdir_args — confirm.
	 */
	NDINIT(&nd, LOOKUP, OP_CHDIR, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);

	return chdir_internal(p, ctx, &nd, per_thread);
}
4358
4359
4360 /*
4361 * chdir
4362 *
4363 * Change current working directory (".") for the entire process
4364 *
4365 * Parameters: p Process requesting the call
4366 * uap User argument descriptor (see below)
4367 * retval (ignored)
4368 *
4369 * Indirect parameters: uap->path Directory path
4370 *
4371 * Returns: 0 Success
4372 * common_chdir: ENOTDIR
4373 * common_chdir: ENOENT No such file or directory
4374 * common_chdir: ???
4375 *
4376 */
int
sys_chdir(proc_t p, struct chdir_args *uap, __unused int32_t *retval)
{
	/* per_thread == 0: change the CWD for the whole process. */
	return common_chdir(p, (void *)uap, 0);
}
4382
4383 /*
4384 * __pthread_chdir
4385 *
4386 * Change current working directory (".") for a single thread
4387 *
4388 * Parameters: p Process requesting the call
4389 * uap User argument descriptor (see below)
4390 * retval (ignored)
4391 *
4392 * Indirect parameters: uap->path Directory path
4393 *
4394 * Returns: 0 Success
4395 * common_chdir: ENOTDIR
4396 * common_chdir: ENOENT No such file or directory
4397 * common_chdir: ???
4398 *
4399 */
4400 int
__pthread_chdir(proc_t p,struct __pthread_chdir_args * uap,__unused int32_t * retval)4401 __pthread_chdir(proc_t p, struct __pthread_chdir_args *uap, __unused int32_t *retval)
4402 {
4403 return common_chdir(p, (void *)uap, 1);
4404 }
4405
4406
4407 /*
4408 * Change notion of root (``/'') directory.
4409 */
4410 /* ARGSUSED */
4411 int
chroot(proc_t p,struct chroot_args * uap,__unused int32_t * retval)4412 chroot(proc_t p, struct chroot_args *uap, __unused int32_t *retval)
4413 {
4414 struct filedesc *fdp = &p->p_fd;
4415 int error;
4416 struct nameidata nd;
4417 vnode_t tvp;
4418 vfs_context_t ctx = vfs_context_current();
4419
4420 if ((error = suser(kauth_cred_get(), &p->p_acflag))) {
4421 return error;
4422 }
4423
4424 NDINIT(&nd, LOOKUP, OP_CHROOT, FOLLOW | AUDITVNPATH1,
4425 UIO_USERSPACE, uap->path, ctx);
4426 error = change_dir(&nd, ctx);
4427 if (error) {
4428 return error;
4429 }
4430
4431 #if CONFIG_MACF
4432 error = mac_vnode_check_chroot(ctx, nd.ni_vp,
4433 &nd.ni_cnd);
4434 if (error) {
4435 vnode_put(nd.ni_vp);
4436 return error;
4437 }
4438 #endif
4439
4440 if ((error = vnode_ref(nd.ni_vp))) {
4441 vnode_put(nd.ni_vp);
4442 return error;
4443 }
4444 vnode_put(nd.ni_vp);
4445
4446 /*
4447 * This lock provides the guarantee that as long as you hold the lock
4448 * fdp->fd_rdir has a usecount on it. This is used to take an iocount
4449 * on a referenced vnode in namei when determining the rootvnode for
4450 * a process.
4451 */
4452 /* needed for synchronization with lookup */
4453 proc_dirs_lock_exclusive(p);
4454 /* needed for setting the flag and other activities on the fd itself */
4455 proc_fdlock(p);
4456 tvp = fdp->fd_rdir;
4457 fdp->fd_rdir = nd.ni_vp;
4458 fdt_flag_set(fdp, FD_CHROOT);
4459 proc_fdunlock(p);
4460 proc_dirs_unlock_exclusive(p);
4461
4462 if (tvp != NULL) {
4463 vnode_rele(tvp);
4464 }
4465
4466 return 0;
4467 }
4468
#define PATHSTATICBUFLEN 256
#define PIVOT_ROOT_ENTITLEMENT \
	"com.apple.private.vfs.pivot-root"

#if defined(XNU_TARGET_OS_OSX)
/*
 * pivot_root: switch the system root filesystem.
 *
 * Mounts the filesystem found at 'new_rootfs_path_before' as the new root
 * and re-homes the old root at 'old_rootfs_path_after', via
 * vfs_switch_root().  Restricted to pid 1 (launchd) holding the
 * pivot-root entitlement; the incoming filesystem must pass kernel
 * root-authentication (FSIOC_KERNEL_ROOTAUTH), i.e. it may not be an
 * unauthenticated image.
 *
 * Returns:	0		Success
 *		EPERM		Caller is not pid 1 or lacks the entitlement
 *		suser:???
 *		copyinstr:???
 *		vnode_lookup:???
 *		VNOP_IOCTL:???
 *		vfs_switch_root:???
 */
int
pivot_root(proc_t p, struct pivot_root_args *uap, __unused int *retval)
{
	int error;
	/* Small on-stack path buffers for the common case; long paths fall
	 * back to MAXPATHLEN-sized ZV_NAMEI heap buffers below. */
	char new_rootfs_path_before[PATHSTATICBUFLEN] = {0};
	char old_rootfs_path_after[PATHSTATICBUFLEN] = {0};
	char *new_rootfs_path_before_buf = NULL;
	char *old_rootfs_path_after_buf = NULL;
	char *incoming = NULL;
	char *outgoing = NULL;
	vnode_t incoming_rootvp = NULLVP;
	size_t bytes_copied;

	/*
	 * XXX : Additional restrictions needed
	 * - perhaps callable only once.
	 */
	if ((error = suser(kauth_cred_get(), &p->p_acflag))) {
		return error;
	}

	/*
	 * pivot_root can be executed by launchd only.
	 * Enforce entitlement.
	 */
	if ((proc_getpid(p) != 1) || !IOCurrentTaskHasEntitlement(PIVOT_ROOT_ENTITLEMENT)) {
		return EPERM;
	}

	/* Copy in the new-root path; retry with a heap buffer if too long. */
	error = copyinstr(uap->new_rootfs_path_before, &new_rootfs_path_before[0], PATHSTATICBUFLEN, &bytes_copied);
	if (error == ENAMETOOLONG) {
		new_rootfs_path_before_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
		error = copyinstr(uap->new_rootfs_path_before, new_rootfs_path_before_buf, MAXPATHLEN, &bytes_copied);
	}

	if (error) {
		goto out;
	}

	/* Same two-step copyin for the path where the old root will land. */
	error = copyinstr(uap->old_rootfs_path_after, &old_rootfs_path_after[0], PATHSTATICBUFLEN, &bytes_copied);
	if (error == ENAMETOOLONG) {
		old_rootfs_path_after_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
		error = copyinstr(uap->old_rootfs_path_after, old_rootfs_path_after_buf, MAXPATHLEN, &bytes_copied);
	}
	if (error) {
		goto out;
	}

	/* Select whichever buffer (stack or heap) ended up holding each path. */
	if (new_rootfs_path_before_buf) {
		incoming = new_rootfs_path_before_buf;
	} else {
		incoming = &new_rootfs_path_before[0];
	}

	if (old_rootfs_path_after_buf) {
		outgoing = old_rootfs_path_after_buf;
	} else {
		outgoing = &old_rootfs_path_after[0];
	}

	/*
	 * The proposed incoming FS MUST be authenticated (i.e. not a chunklist DMG).
	 * Userland is not allowed to pivot to an image.
	 */
	error = vnode_lookup(incoming, 0, &incoming_rootvp, vfs_context_kernel());
	if (error) {
		goto out;
	}
	error = VNOP_IOCTL(incoming_rootvp, FSIOC_KERNEL_ROOTAUTH, NULL, 0, vfs_context_kernel());
	if (error) {
		goto out;
	}

	error = vfs_switch_root(incoming, outgoing, VFSSR_VIRTUALDEV_PROHIBITED);

out:
	/* Release the iocount taken by vnode_lookup(), if any. */
	if (incoming_rootvp != NULLVP) {
		vnode_put(incoming_rootvp);
		incoming_rootvp = NULLVP;
	}

	if (old_rootfs_path_after_buf) {
		zfree(ZV_NAMEI, old_rootfs_path_after_buf);
	}

	if (new_rootfs_path_before_buf) {
		zfree(ZV_NAMEI, new_rootfs_path_before_buf);
	}

	return error;
}
#else
/* pivot_root is macOS-only; other targets report it as unimplemented. */
int
pivot_root(proc_t p, __unused struct pivot_root_args *uap, int *retval)
{
	return nosys(p, NULL, retval);
}
#endif /* XNU_TARGET_OS_OSX */
4572
4573 /*
4574 * Common routine for chroot and chdir.
4575 *
4576 * Returns: 0 Success
4577 * ENOTDIR Not a directory
4578 * namei:??? [anything namei can return]
4579 * vnode_authorize:??? [anything vnode_authorize can return]
4580 */
4581 static int
change_dir(struct nameidata * ndp,vfs_context_t ctx)4582 change_dir(struct nameidata *ndp, vfs_context_t ctx)
4583 {
4584 vnode_t vp;
4585 int error;
4586
4587 if ((error = namei(ndp))) {
4588 return error;
4589 }
4590 nameidone(ndp);
4591 vp = ndp->ni_vp;
4592
4593 if (vp->v_type != VDIR) {
4594 vnode_put(vp);
4595 return ENOTDIR;
4596 }
4597
4598 #if CONFIG_MACF
4599 error = mac_vnode_check_chdir(ctx, vp);
4600 if (error) {
4601 vnode_put(vp);
4602 return error;
4603 }
4604 #endif
4605
4606 error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
4607 if (error) {
4608 vnode_put(vp);
4609 return error;
4610 }
4611
4612 return error;
4613 }
4614
4615 /*
4616 * Free the vnode data (for directories) associated with the file glob.
4617 */
4618 struct fd_vn_data *
fg_vn_data_alloc(void)4619 fg_vn_data_alloc(void)
4620 {
4621 struct fd_vn_data *fvdata;
4622
4623 /* Allocate per fd vnode data */
4624 fvdata = kalloc_type(struct fd_vn_data, Z_WAITOK | Z_ZERO);
4625 lck_mtx_init(&fvdata->fv_lock, &fd_vn_lck_grp, &fd_vn_lck_attr);
4626 return fvdata;
4627 }
4628
4629 /*
4630 * Free the vnode data (for directories) associated with the file glob.
4631 */
4632 void
fg_vn_data_free(void * fgvndata)4633 fg_vn_data_free(void *fgvndata)
4634 {
4635 struct fd_vn_data *fvdata = (struct fd_vn_data *)fgvndata;
4636
4637 kfree_data(fvdata->fv_buf, fvdata->fv_bufallocsiz);
4638 lck_mtx_destroy(&fvdata->fv_lock, &fd_vn_lck_grp);
4639 kfree_type(struct fd_vn_data, fvdata);
4640 }
4641
/*
 * Check permissions, allocate an open file structure,
 * and call the device open routine if any.
 *
 * Parameters:	ctx	Context the open is performed in
 *		ndp	Caller-initialized nameidata (path, segment, op)
 *		uflags	open(2) flags as supplied by the caller
 *		vap	Attributes to apply if the open creates the file
 *		fp_init	Optional fileproc init callback (may be NULL)
 *		initarg	Argument passed through to fp_init
 *		retval	Receives the new file descriptor on success
 *		authfd	fd of an authority vnode, or AUTH_OPEN_NOAUTHFD
 *
 * Returns:	0			Success
 *		EINVAL
 *		EINTR
 *		falloc:ENFILE
 *		falloc:EMFILE
 *		falloc:ENOMEM
 *		vn_open_auth:???
 *		dupfdopen:???
 *		VNOP_ADVLOCK:???
 *		vnode_setsize:???
 *
 * XXX Need to implement uid, gid
 */
int
open1(vfs_context_t ctx, struct nameidata *ndp, int uflags,
    struct vnode_attr *vap, fp_initfn_t fp_init, void *initarg, int32_t *retval, int authfd)
{
	proc_t p = vfs_context_proc(ctx);
	kauth_cred_t p_cred = current_cached_proc_cred(PROC_NULL);
	uthread_t uu = get_bsdthread_info(vfs_context_thread(ctx));
	struct fileproc *fp;
	vnode_t vp;
	int flags, oflags, amode;
	int type, indx, error;
	struct vfs_context context;
	vnode_t authvp = NULLVP;

	oflags = uflags;

	amode = oflags & O_ACCMODE;
	/*
	 * Because O_RDONLY is 0, it is not possible to distinguish between
	 * O_EXEC | O_RDONLY and O_EXEC, therefore FEXEC/FSEARCH can't be set together
	 * with FREAD/FWRITE.
	 */
	if ((amode == O_ACCMODE) || (amode && (oflags & O_EXEC))) {
		return EINVAL;
	}

	/* Convert open(2) flags to kernel fflags; the encryption bits are
	 * kernel-internal and may not be requested directly by callers. */
	flags = FFLAGS(uflags);
	CLR(flags, FENCRYPTED);
	CLR(flags, FUNENCRYPTED);

	AUDIT_ARG(fflags, oflags);
	AUDIT_ARG(mode, vap->va_mode);

	/* Reserve the descriptor slot and fileproc before the lookup/open. */
	if ((error = falloc_withinit(p, p_cred, ctx, &fp, &indx, fp_init, initarg)) != 0) {
		return error;
	}
	if (flags & O_CLOEXEC) {
		fp->fp_flags |= FP_CLOEXEC;
	}
	if (flags & O_CLOFORK) {
		fp->fp_flags |= FP_CLOFORK;
	}

	/* setup state to recognize when fdesc_open was called */
	uu->uu_dupfd = -1;

	/*
	 * Disable read/write access if file is opened with O_EVTONLY and
	 * the process has requested to deny read/write access.
	 */
	if ((flags & O_EVTONLY) && proc_disallow_rw_for_o_evtonly(p)) {
		flags &= ~(FREAD | FWRITE);
	}

	/* Resolve the caller-supplied authority fd, if any, to a vnode. */
	if (authfd != AUTH_OPEN_NOAUTHFD) {
		error = vnode_getfromfd(ctx, authfd, &authvp);
		if (error) {
			fp_free(p, indx, fp);
			return error;
		}
	}

	if ((error = vn_open_auth(ndp, &flags, vap, authvp))) {
		if (authvp != NULLVP) {
			vnode_put(authvp);
		}
		/*
		 * ENODEV/ENXIO with uu_dupfd >= 0 indicates fdesc_open()
		 * ran: satisfy the open by duplicating the existing fd.
		 */
		if ((error == ENODEV || error == ENXIO) && (uu->uu_dupfd >= 0)) {
			if ((error = dupfdopen(p, indx, uu->uu_dupfd, flags, error)) == 0) {
				*retval = indx;
				return 0;
			}
		}
		if (error == ERESTART) {
			error = EINTR;
		}
		fp_free(p, indx, fp);
		return error;
	}

	if (authvp != NULLVP) {
		vnode_put(authvp);
	}

	uu->uu_dupfd = 0;
	vp = ndp->ni_vp;

	/* Wire the opened vnode into the fileglob. */
	fp->fp_glob->fg_flag = flags & (FMASK | O_EVTONLY | FENCRYPTED | FUNENCRYPTED);
	fp->fp_glob->fg_ops = &vnops;
	fp_set_data(fp, vp);

#if CONFIG_FILE_LEASES
	/*
	 * If we are creating a file or open with truncate, we need to break the
	 * lease if there is a read lease placed on the parent dir.
	 */
	if ((vnode_vtype(vp) == VREG) && (flags & (O_CREAT | O_TRUNC))) {
		vnode_breakdirlease(vp, true, oflags);
	}
	/* Now check if there is a lease placed on the file itself. */
	error = vnode_breaklease(vp, oflags, ctx);
	if (error) {
		goto bad;
	}
#endif /* CONFIG_FILE_LEASES */

	/* Take an advisory flock-style lock if requested at open time. */
	if (flags & (O_EXLOCK | O_SHLOCK)) {
		struct flock lf = {
			.l_whence = SEEK_SET,
		};

		if (flags & O_EXLOCK) {
			lf.l_type = F_WRLCK;
		} else {
			lf.l_type = F_RDLCK;
		}
		type = F_FLOCK;
		if ((flags & FNONBLOCK) == 0) {
			type |= F_WAIT;
		}
#if CONFIG_MACF
		error = mac_file_check_lock(vfs_context_ucred(ctx), fp->fp_glob,
		    F_SETLK, &lf);
		if (error) {
			goto bad;
		}
#endif
		if ((error = VNOP_ADVLOCK(vp, (caddr_t)fp->fp_glob, F_SETLK, &lf, type, ctx, NULL))) {
			goto bad;
		}
		fp->fp_glob->fg_flag |= FWASLOCKED;
	}

	/* try to truncate by setting the size attribute */
	if ((flags & O_TRUNC) && ((error = vnode_setsize(vp, (off_t)0, 0, ctx)) != 0)) {
		goto bad;
	}

	/*
	 * For directories we hold some additional information in the fd.
	 */
	if (vnode_vtype(vp) == VDIR) {
		fp->fp_glob->fg_vn_data = fg_vn_data_alloc();
	} else {
		fp->fp_glob->fg_vn_data = NULL;
	}

#if CONFIG_SECLUDED_MEMORY
	/*
	 * Decide whether this file's cached pages are eligible for the
	 * secluded page pool, based on writability and the policy mode.
	 */
	if (secluded_for_filecache && vnode_vtype(vp) == VREG) {
		memory_object_control_t moc;
		const char *v_name;

		moc = ubc_getobject(vp, UBC_FLAGS_NONE);

		if (moc == MEMORY_OBJECT_CONTROL_NULL) {
			/* nothing to do... */
		} else if (fp->fp_glob->fg_flag & FWRITE) {
			/* writable -> no longer eligible for secluded pages */
			memory_object_mark_eligible_for_secluded(moc,
			    FALSE);
		} else if (secluded_for_filecache == SECLUDED_FILECACHE_APPS) {
			/* Only the path prefix is examined; 32 bytes suffice. */
			char pathname[32] = { 0, };
			size_t copied;
			/* XXX FBDP: better way to detect /Applications/ ? */
			if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
				(void)copyinstr(ndp->ni_dirp,
				    pathname,
				    sizeof(pathname),
				    &copied);
			} else {
				copystr(CAST_DOWN(void *, ndp->ni_dirp),
				    pathname,
				    sizeof(pathname),
				    &copied);
			}
			pathname[sizeof(pathname) - 1] = '\0';
			if (strncmp(pathname,
			    "/Applications/",
			    strlen("/Applications/")) == 0 &&
			    strncmp(pathname,
			    "/Applications/Camera.app/",
			    strlen("/Applications/Camera.app/")) != 0) {
				/*
				 * not writable
				 * AND from "/Applications/"
				 * AND not from "/Applications/Camera.app/"
				 * ==> eligible for secluded
				 */
				memory_object_mark_eligible_for_secluded(moc,
				    TRUE);
			}
		} else if (secluded_for_filecache == SECLUDED_FILECACHE_RDONLY &&
		    (v_name = vnode_getname(vp))) {
			size_t len = strlen(v_name);

			if (!strncmp(v_name, "dyld", len) ||
			    !strncmp(v_name, "launchd", len) ||
			    !strncmp(v_name, "Camera", len) ||
			    !strncmp(v_name, "SpringBoard", len) ||
			    !strncmp(v_name, "backboardd", len) ||
			    !strncmp(v_name, "cameracaptured", len)) {
				/*
				 * This file matters when launching Camera:
				 * do not store its contents in the secluded
				 * pool that will be drained on Camera launch.
				 */
				memory_object_mark_eligible_for_secluded(moc,
				    FALSE);
			} else if (!strncmp(v_name, "audiomxd", len) ||
			    !strncmp(v_name, "mediaplaybackd", len)) {
				memory_object_mark_eligible_for_secluded(moc,
				    FALSE);
				memory_object_mark_for_realtime(moc,
				    true);
			} else if (!strncmp(v_name, "bluetoothd", len)) {
				/*
				 * bluetoothd might be needed for realtime audio
				 * playback.
				 */
				memory_object_mark_eligible_for_secluded(moc,
				    FALSE);
				memory_object_mark_for_realtime(moc,
				    true);
			} else {
				char pathname[64] = { 0, };
				size_t copied;
				if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
					(void)copyinstr(ndp->ni_dirp,
					    pathname,
					    sizeof(pathname),
					    &copied);
				} else {
					copystr(CAST_DOWN(void *, ndp->ni_dirp),
					    pathname,
					    sizeof(pathname),
					    &copied);
				}
				pathname[sizeof(pathname) - 1] = '\0';
				if (strncmp(pathname,
				    "/Library/Audio/Plug-Ins/",
				    strlen("/Library/Audio/Plug-Ins/")) == 0 ||
				    strncmp(pathname,
				    "/System/Library/Audio/Plug-Ins/",
				    strlen("/System/Library/Audio/Plug-Ins/")) == 0) {
					/*
					 * This may be an audio plugin required
					 * for realtime playback.
					 * ==> NOT eligible for secluded.
					 */
					memory_object_mark_eligible_for_secluded(moc,
					    FALSE);
					memory_object_mark_for_realtime(moc,
					    true);
				}
			}
			vnode_putname(v_name);
		}
	}
#endif /* CONFIG_SECLUDED_MEMORY */

	vnode_put(vp);

	/*
	 * The first terminal open (without a O_NOCTTY) by a session leader
	 * results in it being set as the controlling terminal.
	 *
	 * NOTE(review): vp is dereferenced here after the vnode_put() above;
	 * presumably the reference held via the fileglob (fp_set_data) keeps
	 * the vnode from being reclaimed — confirm.
	 */
	if (vnode_istty(vp) && !(p->p_flag & P_CONTROLT) &&
	    !(flags & O_NOCTTY)) {
		int tmp = 0;

		(void)(*fp->fp_glob->fg_ops->fo_ioctl)(fp, (int)TIOCSCTTY,
		    (caddr_t)&tmp, ctx);
	}

	/* Publish the descriptor so the process can see and use it. */
	proc_fdlock(p);
	procfdtbl_releasefd(p, indx, NULL);

	fp_drop(p, indx, fp, 1);
	proc_fdunlock(p);

	*retval = indx;

	return 0;
bad:
	/* Error unwind: undo any advisory lock, close the vnode, and free
	 * the reserved descriptor slot. */
	context = *vfs_context_current();
	context.vc_ucred = fp->fp_glob->fg_cred;

	if ((fp->fp_glob->fg_flag & FWASLOCKED) &&
	    (FILEGLOB_DTYPE(fp->fp_glob) == DTYPE_VNODE)) {
		struct flock lf = {
			.l_whence = SEEK_SET,
			.l_type = F_UNLCK,
		};

		(void)VNOP_ADVLOCK(
			vp, (caddr_t)fp->fp_glob, F_UNLCK, &lf, F_FLOCK, ctx, NULL);
	}

	vn_close(vp, fp->fp_glob->fg_flag, &context);
	vnode_put(vp);
	fp_free(p, indx, fp);

	return error;
}
4962
4963 /*
4964 * While most of the *at syscall handlers can call nameiat() which
4965 * is a wrapper around namei, the use of namei and initialisation
4966 * of nameidata are far removed and in different functions - namei
4967 * gets called in vn_open_auth for open1. So we'll just do here what
4968 * nameiat() does.
4969 */
4970 static int
open1at(vfs_context_t ctx,struct nameidata * ndp,int uflags,struct vnode_attr * vap,fp_initfn_t fp_init,void * initarg,int32_t * retval,int dirfd,int authfd)4971 open1at(vfs_context_t ctx, struct nameidata *ndp, int uflags,
4972 struct vnode_attr *vap, fp_initfn_t fp_init, void *initarg, int32_t *retval,
4973 int dirfd, int authfd)
4974 {
4975 if ((dirfd != AT_FDCWD) && !(ndp->ni_cnd.cn_flags & USEDVP)) {
4976 int error;
4977 char c;
4978
4979 if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
4980 error = copyin(ndp->ni_dirp, &c, sizeof(char));
4981 if (error) {
4982 return error;
4983 }
4984 } else {
4985 c = *((char *)(ndp->ni_dirp));
4986 }
4987
4988 if (c != '/') {
4989 vnode_t dvp_at;
4990
4991 error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
4992 &dvp_at);
4993 if (error) {
4994 return error;
4995 }
4996
4997 if (vnode_vtype(dvp_at) != VDIR) {
4998 vnode_put(dvp_at);
4999 return ENOTDIR;
5000 }
5001
5002 ndp->ni_dvp = dvp_at;
5003 ndp->ni_cnd.cn_flags |= USEDVP;
5004 error = open1(ctx, ndp, uflags, vap, fp_init, initarg,
5005 retval, authfd);
5006 vnode_put(dvp_at);
5007 return error;
5008 }
5009 }
5010
5011 return open1(ctx, ndp, uflags, vap, fp_init, initarg, retval, authfd);
5012 }
5013
5014 /*
5015 * open_extended: open a file given a path name; with extended argument list (including extended security (ACL)).
5016 *
5017 * Parameters: p Process requesting the open
5018 * uap User argument descriptor (see below)
5019 * retval Pointer to an area to receive the
5020 * return calue from the system call
5021 *
5022 * Indirect: uap->path Path to open (same as 'open')
5023 * uap->flags Flags to open (same as 'open'
5024 * uap->uid UID to set, if creating
5025 * uap->gid GID to set, if creating
5026 * uap->mode File mode, if creating (same as 'open')
5027 * uap->xsecurity ACL to set, if creating
5028 *
5029 * Returns: 0 Success
5030 * !0 errno value
5031 *
5032 * Notes: The kauth_filesec_t in 'va', if any, is in host byte order.
5033 *
5034 * XXX: We should enummerate the possible errno values here, and where
5035 * in the code they originated.
5036 */
5037 int
open_extended(proc_t p,struct open_extended_args * uap,int32_t * retval)5038 open_extended(proc_t p, struct open_extended_args *uap, int32_t *retval)
5039 {
5040 int ciferror;
5041 kauth_filesec_t xsecdst;
5042 struct vnode_attr va;
5043 struct nameidata nd;
5044 int cmode;
5045
5046 AUDIT_ARG(owner, uap->uid, uap->gid);
5047
5048 xsecdst = NULL;
5049 if ((uap->xsecurity != USER_ADDR_NULL) &&
5050 ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)) {
5051 return ciferror;
5052 }
5053
5054 VATTR_INIT(&va);
5055 cmode = ((uap->mode & ~p->p_fd.fd_cmask) & ALLPERMS) & ~S_ISTXT;
5056 VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
5057 if (uap->uid != KAUTH_UID_NONE) {
5058 VATTR_SET(&va, va_uid, uap->uid);
5059 }
5060 if (uap->gid != KAUTH_GID_NONE) {
5061 VATTR_SET(&va, va_gid, uap->gid);
5062 }
5063 if (xsecdst != NULL) {
5064 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
5065 va.va_vaflags |= VA_FILESEC_ACL;
5066 }
5067
5068 NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
5069 uap->path, vfs_context_current());
5070
5071 ciferror = open1(vfs_context_current(), &nd, uap->flags, &va,
5072 NULL, NULL, retval, AUTH_OPEN_NOAUTHFD);
5073 if (xsecdst != NULL) {
5074 kauth_filesec_free(xsecdst);
5075 }
5076
5077 return ciferror;
5078 }
5079
5080 /*
5081 * Go through the data-protected atomically controlled open (2)
5082 *
5083 * int open_dprotected_np(user_addr_t path, int flags, int class, int dpflags, int mode)
5084 */
5085 static int
openat_dprotected_internal(vfs_context_t ctx,user_addr_t path,int flags,int mode,int class,int dpflags,int fd,int authfd,enum uio_seg segflg,int * retval)5086 openat_dprotected_internal(vfs_context_t ctx, user_addr_t path, int flags, int mode,
5087 int class, int dpflags, int fd, int authfd, enum uio_seg segflg, int *retval)
5088 {
5089 /*
5090 * Follow the same path as normal open(2)
5091 * Look up the item if it exists, and acquire the vnode.
5092 */
5093 struct vnode_attr va;
5094 struct nameidata nd;
5095 int cmode;
5096 int error;
5097 struct filedesc *fdp = &vfs_context_proc(ctx)->p_fd;
5098
5099 VATTR_INIT(&va);
5100 /* Mask off all but regular access permissions */
5101 cmode = ((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
5102 VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
5103
5104 NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, segflg,
5105 path, ctx);
5106
5107 /*
5108 * Initialize the extra fields in vnode_attr to pass down our
5109 * extra fields.
5110 * 1. target cprotect class.
5111 * 2. set a flag to mark it as requiring open-raw-encrypted semantics.
5112 */
5113 if (flags & O_CREAT) {
5114 /* lower level kernel code validates that the class is valid before applying it. */
5115 if (class != PROTECTION_CLASS_DEFAULT) {
5116 /*
5117 * PROTECTION_CLASS_DEFAULT implies that we make the class for this
5118 * file behave the same as open (2)
5119 */
5120 VATTR_SET(&va, va_dataprotect_class, class);
5121 }
5122 }
5123
5124 if (dpflags & (O_DP_GETRAWENCRYPTED | O_DP_GETRAWUNENCRYPTED | O_DP_AUTHENTICATE)) {
5125 if (flags & (O_RDWR | O_WRONLY)) {
5126 /*
5127 * Not allowed to write raw encrypted bytes or when opening authenticated.
5128 */
5129 return EINVAL;
5130 }
5131 if (dpflags & O_DP_GETRAWENCRYPTED) {
5132 VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWENCRYPTED);
5133 }
5134 if (dpflags & O_DP_GETRAWUNENCRYPTED) {
5135 VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWUNENCRYPTED);
5136 }
5137 if (dpflags & O_DP_AUTHENTICATE) {
5138 VATTR_SET(&va, va_dataprotect_flags, VA_DP_AUTHENTICATE);
5139 }
5140 }
5141
5142 error = open1at(vfs_context_current(), &nd, flags, &va,
5143 NULL, NULL, retval, fd, authfd);
5144
5145 return error;
5146 }
5147
int
openat_dprotected_np(__unused proc_t p, struct openat_dprotected_np_args *uap, int32_t *retval)
{
	/* O_DP_AUTHENTICATE opens existing files only; creation is rejected. */
	if ((uap->dpflags & O_DP_AUTHENTICATE) && (uap->flags & O_CREAT)) {
		return EINVAL;
	}

	return openat_dprotected_internal(vfs_context_current(), uap->path, uap->flags, uap->mode,
	           uap->class, uap->dpflags, uap->fd, uap->authfd, UIO_USERSPACE, retval);
}
5158
int
open_dprotected_np(__unused proc_t p, struct open_dprotected_np_args *uap, int32_t *retval)
{
	/*
	 * O_DP_AUTHENTICATE is rejected here; it is only accepted via the
	 * *at variant above, which can supply an authority fd.
	 */
	if (uap->dpflags & O_DP_AUTHENTICATE) {
		return EINVAL;
	}

	return openat_dprotected_internal(vfs_context_current(), uap->path, uap->flags, uap->mode,
	           uap->class, uap->dpflags, AT_FDCWD, AUTH_OPEN_NOAUTHFD, UIO_USERSPACE, retval);
}
5169
5170 static int
openat_internal(vfs_context_t ctx,user_addr_t path,int flags,int mode,int fd,enum uio_seg segflg,int * retval)5171 openat_internal(vfs_context_t ctx, user_addr_t path, int flags, int mode,
5172 int fd, enum uio_seg segflg, int *retval)
5173 {
5174 struct filedesc *fdp = &vfs_context_proc(ctx)->p_fd;
5175 struct {
5176 struct vnode_attr va;
5177 struct nameidata nd;
5178 } *__open_data;
5179 struct vnode_attr *vap;
5180 struct nameidata *ndp;
5181 int cmode;
5182 int error;
5183
5184 __open_data = kalloc_type(typeof(*__open_data), Z_WAITOK);
5185 vap = &__open_data->va;
5186 ndp = &__open_data->nd;
5187
5188 VATTR_INIT(vap);
5189 /* Mask off all but regular access permissions */
5190 cmode = ((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
5191 VATTR_SET(vap, va_mode, cmode & ACCESSPERMS);
5192
5193 NDINIT(ndp, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1,
5194 segflg, path, ctx);
5195
5196 error = open1at(ctx, ndp, flags, vap, NULL, NULL, retval, fd, AUTH_OPEN_NOAUTHFD);
5197
5198 kfree_type(typeof(*__open_data), __open_data);
5199
5200 return error;
5201 }
5202
int
open(proc_t p, struct open_args *uap, int32_t *retval)
{
	/* open(2) is a pthread cancellation point; note it, then defer. */
	__pthread_testcancel(1);
	return open_nocancel(p, (struct open_nocancel_args *)uap, retval);
}
5209
/* Non-cancellable open(2): relative paths resolve against the CWD. */
int
open_nocancel(__unused proc_t p, struct open_nocancel_args *uap,
    int32_t *retval)
{
	return openat_internal(vfs_context_current(), uap->path, uap->flags,
	           uap->mode, AT_FDCWD, UIO_USERSPACE, retval);
}
5217
/*
 * Non-cancellable openat(2): relative paths resolve against the
 * directory referenced by uap->fd.
 */
int
openat_nocancel(__unused proc_t p, struct openat_nocancel_args *uap,
    int32_t *retval)
{
	return openat_internal(vfs_context_current(), uap->path, uap->flags,
	           uap->mode, uap->fd, UIO_USERSPACE, retval);
}
5225
int
openat(proc_t p, struct openat_args *uap, int32_t *retval)
{
	/* openat(2) is a pthread cancellation point; note it, then defer. */
	__pthread_testcancel(1);
	return openat_nocancel(p, (struct openat_nocancel_args *)uap, retval);
}
5232
5233 #define OPEN_BY_ID_ENTITLEMENT "com.apple.private.vfs.open-by-id"
5234
5235 static boolean_t
vfs_context_can_open_by_id(vfs_context_t ctx)5236 vfs_context_can_open_by_id(vfs_context_t ctx)
5237 {
5238 if (csproc_get_platform_binary(vfs_context_proc(ctx))) {
5239 return TRUE;
5240 }
5241
5242 return IOTaskHasEntitlement(vfs_context_task(ctx),
5243 OPEN_BY_ID_ENTITLEMENT);
5244 }
5245
/*
 * openbyid_np: open a file given a file system id and a file system object id
 *	the hfs file system object id is an fsobj_id_t {uint32, uint32}
 *	file systems that don't support object ids it is a node id (uint64_t).
 *
 * Parameters:	p	Process requesting the open
 *		uap	User argument descriptor (see below)
 *		retval	Pointer to an area to receive the
 *			return value from the system call
 *
 * Indirect:	uap->path	Path to open (same as 'open')
 *
 *		uap->fsid	id of target file system
 *		uap->objid	id of target file system object
 *		uap->flags	Flags to open (same as 'open')
 *
 * Returns:	0	Success
 *		!0	errno value
 *
 *
 * XXX:	We should enumerate the possible errno values here, and where
 *	in the code they originated.
 */
int
openbyid_np(__unused proc_t p, struct openbyid_np_args *uap, int *retval)
{
	fsid_t fsid;
	uint64_t objid;
	int error;
	char *buf = NULL;
	int buflen = MAXPATHLEN;
	int pathlen = 0;
	vfs_context_t ctx = vfs_context_current();

	/* Restricted to platform binaries / entitled tasks. */
	if (!vfs_context_can_open_by_id(ctx)) {
		return EPERM;
	}

	if ((error = copyin(uap->fsid, (caddr_t)&fsid, sizeof(fsid)))) {
		return error;
	}

	/*uap->obj is an fsobj_id_t defined as struct {uint32_t, uint32_t} */
	if ((error = copyin(uap->objid, (caddr_t)&objid, sizeof(uint64_t)))) {
		return error;
	}

	AUDIT_ARG(value32, fsid.val[0]);
	AUDIT_ARG(value64, objid);

	/*
	 * Resolve a path from fsid/objid.  If the volume reports the buffer
	 * was too small (ENOSPC), grow it by MAXPATHLEN and retry.
	 */
	do {
		buf = kalloc_data(buflen + 1, Z_WAITOK);
		if (buf == NULL) {
			return ENOMEM;
		}

		error = fsgetpath_internal( ctx, fsid.val[0], objid, buflen,
		    buf, FSOPT_ISREALFSID, &pathlen);

		if (error) {
			kfree_data(buf, buflen + 1);
			buf = NULL;
		}
	} while (error == ENOSPC && (buflen += MAXPATHLEN));

	if (error) {
		return error;
	}

	/* fsgetpath_internal() returns pathlen <= buflen; terminate it. */
	buf[pathlen] = 0;

	/* Re-enter the normal open path with the resolved kernel-space path. */
	error = openat_internal(
		ctx, (user_addr_t)buf, uap->oflags, 0, AT_FDCWD, UIO_SYSSPACE, retval);

	kfree_data(buf, buflen + 1);

	return error;
}
5325
5326
/*
 * Create a special file.
 */
static int mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap,
    int fd);

/*
 * Common implementation for mknod(2)/mknodat(2): create a character or
 * block special file; FIFO requests are redirected to mkfifo1().
 * Requires superuser.  'fd' is the base directory for relative paths
 * (AT_FDCWD for mknod(2)).
 */
static int
mknodat_internal(proc_t p, user_addr_t upath, struct vnode_attr *vap,
    mode_t mode, int fd)
{
	vfs_context_t ctx = vfs_context_current();
	struct nameidata nd;
	vnode_t vp, dvp;
	int error;

	/* If it's a mknod() of a FIFO, call mkfifo1() instead */
	if ((mode & S_IFMT) == S_IFIFO) {
		return mkfifo1(ctx, upath, vap, fd);
	}

	AUDIT_ARG(mode, mode);
	AUDIT_ARG(value32, vap->va_rdev);

	/* Creating device nodes is a privileged operation. */
	if ((error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
		return error;
	}
	NDINIT(&nd, CREATE, OP_MKNOD, LOCKPARENT | AUDITVNPATH1,
	    UIO_USERSPACE, upath, ctx);
	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* The path must not already name an existing file. */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}

	/* Only character and block special files are handled here. */
	switch (mode & S_IFMT) {
	case S_IFCHR:
		VATTR_SET(vap, va_type, VCHR);
		break;
	case S_IFBLK:
		VATTR_SET(vap, va_type, VBLK);
		break;
	default:
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_create(ctx,
	    nd.ni_dvp, &nd.ni_cnd, vap);
	if (error) {
		goto out;
	}
#endif

	if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
		goto out;
	}

#if CONFIG_FILE_LEASES
	/* Adding an entry modifies the parent: break any dir read lease. */
	vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

	if ((error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx)) != 0) {
		goto out;
	}

	if (vp) {
		int update_flags = 0;

		// Make sure the name & parent pointers are hooked up
		if (vp->v_name == NULL) {
			update_flags |= VNODE_UPDATE_NAME;
		}
		if (vp->v_parent == NULLVP) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		if (update_flags) {
			vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
		}

#if CONFIG_FSE
		add_fsevent(FSE_CREATE_FILE, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
#endif
	}

out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	vnode_put(dvp);

	return error;
}
5435
/* mknod(2): create a special file; relative paths resolve against the CWD. */
int
mknod(proc_t p, struct mknod_args *uap, __unused int32_t *retval)
{
	struct vnode_attr va;

	VATTR_INIT(&va);
	/* Apply the process umask to the requested permissions. */
	VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
	VATTR_SET(&va, va_rdev, uap->dev);

	return mknodat_internal(p, uap->path, &va, (mode_t)uap->mode, AT_FDCWD);
}
5447
/* mknodat(2): like mknod(2), but relative paths resolve against uap->fd. */
int
mknodat(proc_t p, struct mknodat_args *uap, __unused int32_t *retval)
{
	struct vnode_attr va;

	VATTR_INIT(&va);
	/* Apply the process umask to the requested permissions. */
	VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
	VATTR_SET(&va, va_rdev, uap->dev);

	return mknodat_internal(p, uap->path, &va, (mode_t)uap->mode, uap->fd);
}
5459
/*
 * Create a named pipe.
 *
 * Returns:	0	Success
 *		EEXIST
 *		namei:???
 *		vnode_authorize:???
 *		vn_create:???
 */
static int
mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap, int fd)
{
	vnode_t vp, dvp;
	int error;
	struct nameidata nd;

	NDINIT(&nd, CREATE, OP_MKFIFO, LOCKPARENT | AUDITVNPATH1,
	    UIO_USERSPACE, upath, ctx);
	/* Relative paths resolve against 'fd' (AT_FDCWD for mkfifo(2)). */
	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* check that this is a new file and authorize addition */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}
	VATTR_SET(vap, va_type, VFIFO);

	if ((error = vn_authorize_create(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0) {
		goto out;
	}

	error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx);
out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	vnode_put(dvp);

	return error;
}
5511
5512
5513 /*
5514 * mkfifo_extended: Create a named pipe; with extended argument list (including extended security (ACL)).
5515 *
5516 * Parameters: p Process requesting the open
5517 * uap User argument descriptor (see below)
5518 * retval (Ignored)
5519 *
5520 * Indirect: uap->path Path to fifo (same as 'mkfifo')
5521 * uap->uid UID to set
5522 * uap->gid GID to set
5523 * uap->mode File mode to set (same as 'mkfifo')
5524 * uap->xsecurity ACL to set, if creating
5525 *
5526 * Returns: 0 Success
5527 * !0 errno value
5528 *
5529 * Notes: The kauth_filesec_t in 'va', if any, is in host byte order.
5530 *
 * XXX: We should enumerate the possible errno values here, and where
5532 * in the code they originated.
5533 */
5534 int
mkfifo_extended(proc_t p,struct mkfifo_extended_args * uap,__unused int32_t * retval)5535 mkfifo_extended(proc_t p, struct mkfifo_extended_args *uap, __unused int32_t *retval)
5536 {
5537 int ciferror;
5538 kauth_filesec_t xsecdst;
5539 struct vnode_attr va;
5540
5541 AUDIT_ARG(owner, uap->uid, uap->gid);
5542
5543 xsecdst = KAUTH_FILESEC_NONE;
5544 if (uap->xsecurity != USER_ADDR_NULL) {
5545 if ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0) {
5546 return ciferror;
5547 }
5548 }
5549
5550 VATTR_INIT(&va);
5551 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5552 if (uap->uid != KAUTH_UID_NONE) {
5553 VATTR_SET(&va, va_uid, uap->uid);
5554 }
5555 if (uap->gid != KAUTH_GID_NONE) {
5556 VATTR_SET(&va, va_gid, uap->gid);
5557 }
5558 if (xsecdst != KAUTH_FILESEC_NONE) {
5559 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
5560 va.va_vaflags |= VA_FILESEC_ACL;
5561 }
5562
5563 ciferror = mkfifo1(vfs_context_current(), uap->path, &va, AT_FDCWD);
5564
5565 if (xsecdst != KAUTH_FILESEC_NONE) {
5566 kauth_filesec_free(xsecdst);
5567 }
5568 return ciferror;
5569 }
5570
5571 /* ARGSUSED */
5572 int
mkfifo(proc_t p,struct mkfifo_args * uap,__unused int32_t * retval)5573 mkfifo(proc_t p, struct mkfifo_args *uap, __unused int32_t *retval)
5574 {
5575 struct vnode_attr va;
5576
5577 VATTR_INIT(&va);
5578 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5579
5580 return mkfifo1(vfs_context_current(), uap->path, &va, AT_FDCWD);
5581 }
5582
5583 int
mkfifoat(proc_t p,struct mkfifoat_args * uap,__unused int32_t * retval)5584 mkfifoat(proc_t p, struct mkfifoat_args *uap, __unused int32_t *retval)
5585 {
5586 struct vnode_attr va;
5587
5588 VATTR_INIT(&va);
5589 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5590
5591 return mkfifo1(vfs_context_current(), uap->path, &va, uap->fd);
5592 }
5593
5594 extern int safe_getpath_new(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path, int firmlink);
5595 extern int safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path);
5596 extern int safe_getpath_no_firmlink(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path);
5597
/*
 * Build the path of 'dvp' (optionally with 'leafname' appended) into the
 * caller's buffer 'path' of '_len' bytes (MAXPATHLEN in practice).
 *
 * Unlike vn_getpath(), this is "safe" in the sense that it always produces
 * something usable: if the full path does not fit or the lookup fails, it
 * falls back to an ancestor directory, then the mount point, then "/",
 * and reports the degradation via '*truncated_path'.  'firmlink' selects
 * whether the returned path may traverse firmlinks.
 *
 * Returns the length of the string placed in 'path' including the NUL.
 */
int
safe_getpath_new(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path, int firmlink)
{
	int ret, len = _len;

	*truncated_path = 0;

	if (firmlink) {
		ret = vn_getpath(dvp, path, &len);
	} else {
		ret = vn_getpath_no_firmlink(dvp, path, &len);
	}
	/* 'len' now counts the directory path including its NUL. */
	if (ret == 0 && len < (MAXPATHLEN - 1)) {
		if (leafname) {
			/* Overwrite the terminating NUL with a separator,
			 * then append the leaf name right after it. */
			path[len - 1] = '/';
			/* strlcpy returns strlen(leafname); the +1 re-counts
			 * the NUL so 'len' stays a NUL-inclusive length. */
			len += strlcpy(&path[len], leafname, MAXPATHLEN - len) + 1;
			if (len > MAXPATHLEN) {
				char *ptr;

				// the string got truncated!
				*truncated_path = 1;
				ptr = strrchr(path, '/');
				if (ptr) {
					*ptr = '\0'; // chop off the string at the last directory component
				}
				len = (int)strlen(path) + 1;
			}
		}
	} else if (ret == 0) {
		/* Directory path alone already (nearly) fills the buffer. */
		*truncated_path = 1;
	} else if (ret != 0) {
		struct vnode *mydvp = dvp;

		if (ret != ENOSPC) {
			printf("safe_getpath: failed to get the path for vp %p (%s) : err %d\n",
			    dvp, dvp->v_name ? dvp->v_name : "no-name", ret);
		}
		*truncated_path = 1;

		/* Walk up the parent chain until some ancestor's path fits;
		 * punt to the mount point or "/" when we run out of parents. */
		do {
			if (mydvp->v_parent != NULL) {
				mydvp = mydvp->v_parent;
			} else if (mydvp->v_mount) {
				strlcpy(path, mydvp->v_mount->mnt_vfsstat.f_mntonname, _len);
				break;
			} else {
				// no parent and no mount point? only thing is to punt and say "/" changed
				strlcpy(path, "/", _len);
				len = 2;
				mydvp = NULL;
			}

			if (mydvp == NULL) {
				break;
			}

			/* Reset the buffer size for each retry. */
			len = _len;
			if (firmlink) {
				ret = vn_getpath(mydvp, path, &len);
			} else {
				ret = vn_getpath_no_firmlink(mydvp, path, &len);
			}
		} while (ret == ENOSPC);
	}

	return len;
}
5665
/* Firmlink-following variant of safe_getpath_new(). */
int
safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path)
{
	return safe_getpath_new(dvp, leafname, path, _len, truncated_path, 1 /* firmlink */);
}
5671
/* Firmlink-bypassing variant of safe_getpath_new(). */
int
safe_getpath_no_firmlink(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path)
{
	return safe_getpath_new(dvp, leafname, path, _len, truncated_path, 0 /* no firmlink */);
}
5677
5678 /*
5679 * Make a hard file link.
5680 *
5681 * Returns: 0 Success
5682 * EPERM
5683 * EEXIST
5684 * EXDEV
5685 * namei:???
5686 * vnode_authorize:???
5687 * VNOP_LINK:???
5688 */
5689 /* ARGSUSED */
static int
linkat_internal(vfs_context_t ctx, int fd1, user_addr_t path, int fd2,
    user_addr_t link, int flag, enum uio_seg segflg)
{
	vnode_t vp, pvp, dvp, lvp;
	struct nameidata nd;
	int follow;
	int error;
#if CONFIG_FSE
	fse_info finfo;
#endif
	char *target_path = NULL;
	char *no_firmlink_path = NULL;
	vnode_t locked_vp = NULLVP;	/* vnode currently holding the link lock, if any */
	int truncated = 0;
	int truncated_no_firmlink_path = 0;
	int num_retries = 0;
	int need_event, has_listeners, need_kpath2;
	bool do_retry;

	/* look up the object we are linking to */
	follow = (flag & AT_SYMLINK_FOLLOW) ? FOLLOW : NOFOLLOW;

retry:
	do_retry = false;
	vp = dvp = lvp = NULLVP;
	NDINIT(&nd, LOOKUP, OP_LOOKUP, AUDITVNPATH1 | follow,
	    segflg, path, ctx);

	error = nameiat(&nd, fd1);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;

	nameidone(&nd);

	/*
	 * Normally, linking to directories is not supported.
	 * However, some file systems may have limited support.
	 */
	if (vp->v_type == VDIR) {
		if (!ISSET(vp->v_mount->mnt_kern_flag, MNTK_DIR_HARDLINKS)) {
			error = EPERM;   /* POSIX */
			goto out;
		}

		/* Linking to a directory requires ownership. */
		if (!kauth_cred_issuser(vfs_context_ucred(ctx))) {
			struct vnode_attr dva;

			VATTR_INIT(&dva);
			VATTR_WANTED(&dva, va_uid);
			if (vnode_getattr(vp, &dva, ctx) != 0 ||
			    !VATTR_IS_SUPPORTED(&dva, va_uid) ||
			    (dva.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)))) {
				error = EACCES;
				goto out;
			}
		}
	}

	/* lookup the target node */
	/* The nameidata is reused for the second lookup: switch it from a
	 * plain LOOKUP of 'path' to a CREATE of 'link' in its parent. */
#if CONFIG_TRIGGERS
	nd.ni_op = OP_LINK;
#endif
	nd.ni_cnd.cn_nameiop = CREATE;
	nd.ni_cnd.cn_flags = LOCKPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK;
	nd.ni_dirp = link;
	error = nameiat(&nd, fd2);
	if (error != 0) {
		goto out;
	}
	dvp = nd.ni_dvp;
	lvp = nd.ni_vp;

	/* Serialize against concurrent link/unlink of 'vp'. */
	assert(locked_vp == NULLVP);
	vnode_link_lock(vp);
	locked_vp = vp;

#if CONFIG_MACF
	if ((error = mac_vnode_check_link(ctx, dvp, vp, &nd.ni_cnd)) != 0) {
		goto out2;
	}
#endif

	/* or to anything that kauth doesn't want us to (eg. immutable items) */
	if ((error = vnode_authorize(vp, NULL, KAUTH_VNODE_LINKTARGET, ctx)) != 0) {
		goto out2;
	}

	/* target node must not exist */
	if (lvp != NULLVP) {
		error = EEXIST;
		goto out2;
	}
	/* cannot link across mountpoints */
	if (vnode_mount(vp) != vnode_mount(dvp)) {
		error = EXDEV;
		goto out2;
	}

	/* authorize creation of the target node */
	if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
		goto out2;
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

	/* and finally make the link */
	error = VNOP_LINK(vp, dvp, &nd.ni_cnd, ctx);
	if (error) {
		/* ENOENT can result from a racing lookup/unlink; redrive the
		 * whole operation a bounded number of times. */
		if (error == ENOENT && num_retries < MAX_LINK_ENOENT_RETRIES) {
			do_retry = true;
			num_retries += 1;
		}
		goto out2;
	}

#if CONFIG_MACF
	(void)mac_vnode_notify_link(ctx, vp, dvp, &nd.ni_cnd);
#endif

	/* Link made; drop the link lock before the (potentially slow)
	 * notification work below. */
	assert(locked_vp == vp);
	vnode_link_unlock(locked_vp);
	locked_vp = NULLVP;

#if CONFIG_FSE
	need_event = need_fsevent(FSE_CREATE_FILE, dvp);
#else
	need_event = 0;
#endif
	has_listeners = kauth_authorize_fileop_has_listeners();

	need_kpath2 = 0;
#if CONFIG_AUDIT
	if (AUDIT_RECORD_EXISTS()) {
		need_kpath2 = 1;
	}
#endif

	/* Paths are only materialized if someone (fsevents, kauth listeners,
	 * audit) actually needs them. */
	if (need_event || has_listeners || need_kpath2) {
		char *link_to_path = NULL;
		int len, link_name_len;
		int len_no_firmlink_path = 0;

		/* build the path to the new link file */
		GET_PATH(target_path);

		len = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, target_path, MAXPATHLEN, &truncated);
		if (no_firmlink_path == NULL) {
			GET_PATH(no_firmlink_path);
		}
		len_no_firmlink_path = safe_getpath_no_firmlink(dvp, nd.ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);

		AUDIT_ARG(kpath, target_path, ARG_KPATH2);

		if (has_listeners) {
			/* build the path to file we are linking to */
			GET_PATH(link_to_path);

			link_name_len = MAXPATHLEN;
			if (vn_getpath(vp, link_to_path, &link_name_len) == 0) {
				/*
				 * Call out to allow 3rd party notification of rename.
				 * Ignore result of kauth_authorize_fileop call.
				 */
				kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_LINK,
				    (uintptr_t)link_to_path,
				    (uintptr_t)target_path);
			}
			if (link_to_path != NULL) {
				RELEASE_PATH(link_to_path);
			}
		}
#if CONFIG_FSE
		if (need_event) {
			/* construct fsevent */
			if (get_fse_info(vp, &finfo, ctx) == 0) {
				if (truncated_no_firmlink_path) {
					finfo.mode |= FSE_TRUNCATED_PATH;
				}

				// build the path to the destination of the link
				add_fsevent(FSE_CREATE_FILE, ctx,
				    FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
				    FSE_ARG_FINFO, &finfo,
				    FSE_ARG_DONE);
			}

			/* The link target's original parent changed too (link
			 * count); emit a stat-changed event on it. */
			pvp = vp->v_parent;
			// need an iocount on parent vnode in this case
			if (pvp && pvp != dvp) {
				pvp = vnode_getparent_if_different(vp, dvp);
			}
			if (pvp) {
				add_fsevent(FSE_STAT_CHANGED, ctx,
				    FSE_ARG_VNODE, pvp, FSE_ARG_DONE);
			}
			if (pvp && pvp != dvp) {
				vnode_put(pvp);
			}
		}
#endif
	}
out2:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);
	if (target_path != NULL) {
		RELEASE_PATH(target_path);
		target_path = NULL;
	}
	if (no_firmlink_path != NULL) {
		RELEASE_PATH(no_firmlink_path);
		no_firmlink_path = NULL;
	}
out:
	/* Error paths arrive here with the link lock possibly still held. */
	if (locked_vp) {
		assert(locked_vp == vp);
		vnode_link_unlock(locked_vp);
		locked_vp = NULLVP;
	}
	if (lvp) {
		vnode_put(lvp);
	}
	if (dvp) {
		vnode_put(dvp);
	}
	vnode_put(vp);

	/* All references dropped; safe to restart the whole operation. */
	if (do_retry) {
		goto retry;
	}

	return error;
}
5931
5932 int
link(__unused proc_t p,struct link_args * uap,__unused int32_t * retval)5933 link(__unused proc_t p, struct link_args *uap, __unused int32_t *retval)
5934 {
5935 return linkat_internal(vfs_context_current(), AT_FDCWD, uap->path,
5936 AT_FDCWD, uap->link, AT_SYMLINK_FOLLOW, UIO_USERSPACE);
5937 }
5938
5939 int
linkat(__unused proc_t p,struct linkat_args * uap,__unused int32_t * retval)5940 linkat(__unused proc_t p, struct linkat_args *uap, __unused int32_t *retval)
5941 {
5942 if (uap->flag & ~AT_SYMLINK_FOLLOW) {
5943 return EINVAL;
5944 }
5945
5946 return linkat_internal(vfs_context_current(), uap->fd1, uap->path,
5947 uap->fd2, uap->link, uap->flag, UIO_USERSPACE);
5948 }
5949
5950 /*
5951 * Make a symbolic link.
5952 *
5953 * We could add support for ACLs here too...
5954 */
5955 /* ARGSUSED */
static int
symlinkat_internal(vfs_context_t ctx, user_addr_t path_data, int fd,
    user_addr_t link, enum uio_seg segflg)
{
	struct vnode_attr va;
	char *path;
	int error;
	struct nameidata nd;
	vnode_t vp, dvp;
	size_t dummy = 0;
	proc_t p;

	error = 0;
	/* 'path_data' is the link *contents*; copy it into a kernel buffer
	 * when it lives in user space, otherwise use it in place. */
	if (UIO_SEG_IS_USER_SPACE(segflg)) {
		path = zalloc(ZV_NAMEI);
		error = copyinstr(path_data, path, MAXPATHLEN, &dummy);
	} else {
		path = (char *)path_data;
	}
	if (error) {
		goto out;
	}
	AUDIT_ARG(text, path);  /* This is the link string */

	/* 'link' is the name of the symlink to create. */
	NDINIT(&nd, CREATE, OP_SYMLINK, LOCKPARENT | AUDITVNPATH1,
	    segflg, link, ctx);

	error = nameiat(&nd, fd);
	if (error) {
		goto out;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	p = vfs_context_proc(ctx);
	VATTR_INIT(&va);
	VATTR_SET(&va, va_type, VLNK);
	/* Symlinks get full access permissions, minus the umask. */
	VATTR_SET(&va, va_mode, ACCESSPERMS & ~p->p_fd.fd_cmask);

#if CONFIG_MACF
	error = mac_vnode_check_create(ctx,
	    dvp, &nd.ni_cnd, &va);
#endif
	if (error != 0) {
		goto skipit;
	}

	/* Target name must not already exist. */
	if (vp != NULL) {
		error = EEXIST;
		goto skipit;
	}

	/* authorize */
	if (error == 0) {
		error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx);
	}
	/* get default ownership, etc. */
	if (error == 0) {
		error = vnode_authattr_new(dvp, &va, 0, ctx);
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

	if (error == 0) {
		error = VNOP_SYMLINK(dvp, &vp, &nd.ni_cnd, &va, path, ctx);
	}

	/* do fallback attribute handling */
	/* Apply any attributes the filesystem did not handle in VNOP_SYMLINK. */
	if (error == 0 && vp) {
		error = vnode_setattr_fallback(vp, &va, ctx);
	}

#if CONFIG_MACF
	if (error == 0 && vp) {
		error = vnode_label(vnode_mount(vp), dvp, vp, &nd.ni_cnd, VNODE_LABEL_CREATE, ctx);
	}
#endif

	if (error == 0) {
		int update_flags = 0;

		/* check if a new vnode was created, else try to get one */
		/* Some filesystems do not return the new vnode from
		 * VNOP_SYMLINK; look it up explicitly in that case. */
		if (vp == NULL) {
			nd.ni_cnd.cn_nameiop = LOOKUP;
#if CONFIG_TRIGGERS
			nd.ni_op = OP_LOOKUP;
#endif
			/*
			 * Clear all flags except HASBUF to prevent 'cn_pnbuf' buffer to be
			 * reallocated again in namei().
			 */
			nd.ni_cnd.cn_flags &= HASBUF;
			error = nameiat(&nd, fd);
			if (error) {
				goto skipit;
			}
			vp = nd.ni_vp;
		}

#if 0 /* XXX - kauth_todo - is KAUTH_FILEOP_SYMLINK needed? */
		/* call out to allow 3rd party notification of rename.
		 * Ignore result of kauth_authorize_fileop call.
		 */
		if (kauth_authorize_fileop_has_listeners() &&
		    namei(&nd) == 0) {
			char *new_link_path = NULL;
			int len;

			/* build the path to the new link file */
			new_link_path = get_pathbuff();
			len = MAXPATHLEN;
			vn_getpath(dvp, new_link_path, &len);
			if ((len + 1 + nd.ni_cnd.cn_namelen + 1) < MAXPATHLEN) {
				new_link_path[len - 1] = '/';
				strlcpy(&new_link_path[len], nd.ni_cnd.cn_nameptr, MAXPATHLEN - len);
			}

			kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_SYMLINK,
			    (uintptr_t)path, (uintptr_t)new_link_path);
			if (new_link_path != NULL) {
				release_pathbuff(new_link_path);
			}
		}
#endif
		// Make sure the name & parent pointers are hooked up
		if (vp->v_name == NULL) {
			update_flags |= VNODE_UPDATE_NAME;
		}
		if (vp->v_parent == NULLVP) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		if (update_flags) {
			vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
		}

#if CONFIG_FSE
		add_fsevent(FSE_CREATE_FILE, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
#endif
	}

skipit:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	vnode_put(dvp);
out:
	/* Free the copied-in link contents (user-space case only). */
	if (path && (path != (char *)path_data)) {
		zfree(ZV_NAMEI, path);
	}

	return error;
}
6119
6120 int
symlink(__unused proc_t p,struct symlink_args * uap,__unused int32_t * retval)6121 symlink(__unused proc_t p, struct symlink_args *uap, __unused int32_t *retval)
6122 {
6123 return symlinkat_internal(vfs_context_current(), uap->path, AT_FDCWD,
6124 uap->link, UIO_USERSPACE);
6125 }
6126
6127 int
symlinkat(__unused proc_t p,struct symlinkat_args * uap,__unused int32_t * retval)6128 symlinkat(__unused proc_t p, struct symlinkat_args *uap,
6129 __unused int32_t *retval)
6130 {
6131 return symlinkat_internal(vfs_context_current(), uap->path1, uap->fd,
6132 uap->path2, UIO_USERSPACE);
6133 }
6134
6135 /*
6136 * Delete a whiteout from the filesystem.
6137 * No longer supported.
6138 */
6139 int
undelete(__unused proc_t p,__unused struct undelete_args * uap,__unused int32_t * retval)6140 undelete(__unused proc_t p, __unused struct undelete_args *uap, __unused int32_t *retval)
6141 {
6142 return ENOTSUP;
6143 }
6144
6145 /*
6146 * Delete a name from the filesystem.
6147 */
6148 /* ARGSUSED */
static int
unlinkat_internal(vfs_context_t ctx, int fd, vnode_t start_dvp,
    user_addr_t path_arg, enum uio_seg segflg, int unlink_flags)
{
	/* Bundle the large lookup/notification state and heap-allocate it —
	 * presumably to limit kernel stack usage (TODO confirm intent). */
	struct {
		struct nameidata nd;
#if CONFIG_FSE
		struct vnode_attr va;
		fse_info finfo;
#endif
	} *__unlink_data;
	struct nameidata *ndp;
	vnode_t vp, dvp;
	int error;
	struct componentname *cnp;
	char *path = NULL;
	char *no_firmlink_path = NULL;
	int len_path = 0;
	int len_no_firmlink_path = 0;
	int flags;
	int need_event;
	int has_listeners;
	int truncated_path;
	int truncated_no_firmlink_path;
	int batched;		/* filesystem supports compound (lookup+remove) VNOP */
	struct vnode_attr *vap;
	vnode_t locked_vp = NULLVP;	/* vnode holding the link lock, if taken */
	int do_retry;
	int retry_count = 0;
	int cn_flags;
	int nofollow_any = 0;

	cn_flags = LOCKPARENT;
	if (!(unlink_flags & VNODE_REMOVE_NO_AUDIT_PATH)) {
		cn_flags |= AUDITVNPATH1;
	}
	if (unlink_flags & VNODE_REMOVE_NOFOLLOW_ANY) {
		nofollow_any = NAMEI_NOFOLLOW_ANY;
		unlink_flags &= ~VNODE_REMOVE_NOFOLLOW_ANY;
	}
	/* If a starting dvp is passed, it trumps any fd passed. */
	if (start_dvp) {
		cn_flags |= USEDVP;
	}

#if NAMEDRSRCFORK
	/* unlink or delete is allowed on rsrc forks and named streams */
	cn_flags |= CN_ALLOWRSRCFORK;
#endif

	__unlink_data = kalloc_type(typeof(*__unlink_data), Z_WAITOK);
	ndp = &__unlink_data->nd;
#if CONFIG_FSE
	fse_info *finfop = &__unlink_data->finfo;
#endif

retry:
	/* Per-attempt state is reset here; the retry path re-enters after
	 * racing lookups return ENOENT. */
	do_retry = 0;
	flags = 0;
	need_event = 0;
	has_listeners = 0;
	truncated_path = 0;
	truncated_no_firmlink_path = 0;
	vap = NULL;

	NDINIT(ndp, DELETE, OP_UNLINK, cn_flags, segflg, path_arg, ctx);

	ndp->ni_dvp = start_dvp;
	ndp->ni_flag |= NAMEI_COMPOUNDREMOVE | nofollow_any;
	cnp = &ndp->ni_cnd;

continue_lookup:
	error = nameiat(ndp, fd);
	if (error) {
		goto early_out;
	}

	dvp = ndp->ni_dvp;
	vp = ndp->ni_vp;

	/* With Carbon delete semantics, busy files cannot be deleted */
	if (unlink_flags & VNODE_REMOVE_NODELETEBUSY) {
		flags |= VNODE_REMOVE_NODELETEBUSY;
	}

	/* Skip any potential upcalls if told to. */
	if (unlink_flags & VNODE_REMOVE_SKIP_NAMESPACE_EVENT) {
		flags |= VNODE_REMOVE_SKIP_NAMESPACE_EVENT;
	}

	/* Update speculative telemetry with system discarded use state */
	if (unlink_flags & VNODE_REMOVE_SYSTEM_DISCARDED) {
		flags |= VNODE_REMOVE_SYSTEM_DISCARDED;
	}

	if (vp) {
		batched = vnode_compound_remove_available(vp);
		/*
		 * The root of a mounted filesystem cannot be deleted.
		 */
		if ((vp->v_flag & VROOT) || (dvp->v_mount != vp->v_mount)) {
			error = EBUSY;
			goto out;
		}

#if DEVELOPMENT || DEBUG
		/*
		 * XXX VSWAP: Check for entitlements or special flag here
		 * so we can restrict access appropriately.
		 */
#else /* DEVELOPMENT || DEBUG */

		/* Swap files may only be removed by the kernel itself. */
		if (vnode_isswap(vp) && (ctx != vfs_context_kernel())) {
			error = EPERM;
			goto out;
		}
#endif /* DEVELOPMENT || DEBUG */

		if (!batched) {
			/* Non-compound path: serialize against link(2) on the
			 * same vnode and authorize the unlink up front. */
			vnode_link_lock(vp);
			locked_vp = vp;
			error = vn_authorize_unlink(dvp, vp, cnp, ctx, NULL);
			if (error) {
				if (error == ENOENT) {
					/* Raced with another remove; retry a bounded number of times. */
					if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
						do_retry = 1;
						retry_count++;
					}
				}
				vnode_link_unlock(vp);
				locked_vp = NULLVP;
				goto out;
			}
		}
	} else {
		/* No vnode from lookup: only legal when the filesystem will
		 * perform lookup+remove as one compound operation. */
		batched = 1;

		if (!vnode_compound_remove_available(dvp)) {
			panic("No vp, but no compound remove?");
		}
	}

#if CONFIG_FSE
	need_event = need_fsevent(FSE_DELETE, dvp);
	if (need_event) {
		if (!batched) {
			if ((vp->v_flag & VISHARDLINK) == 0) {
				/* XXX need to get these data in batched VNOP */
				get_fse_info(vp, finfop, ctx);
			}
		} else {
			error =
			    vfs_get_notify_attributes(&__unlink_data->va);
			if (error) {
				goto out;
			}

			vap = &__unlink_data->va;
		}
	}
#endif
	has_listeners = kauth_authorize_fileop_has_listeners();
	/* Materialize the paths only when someone will consume them. */
	if (need_event || has_listeners) {
		if (path == NULL) {
			GET_PATH(path);
		}
		len_path = safe_getpath(dvp, ndp->ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path);
		if (no_firmlink_path == NULL) {
			GET_PATH(no_firmlink_path);
		}
		len_no_firmlink_path = safe_getpath_no_firmlink(dvp, ndp->ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);
	}

#if NAMEDRSRCFORK
	if (ndp->ni_cnd.cn_flags & CN_WANTSRSRCFORK) {
		/* A "..namedfork/rsrc" path deletes the resource fork xattr,
		 * not a directory entry. */
		error = vnode_removenamedstream(dvp, vp, XATTR_RESOURCEFORK_NAME, 0, ctx);
	} else
#endif
	{
#if CONFIG_FILE_LEASES
		vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

		error = vn_remove(dvp, &ndp->ni_vp, ndp, flags, vap, ctx);
		vp = ndp->ni_vp;
		if (error == EKEEPLOOKING) {
			/* Compound VNOP asked us to continue the lookup with
			 * the state it left in ndp. */
			if (!batched) {
				panic("EKEEPLOOKING, but not a filesystem that supports compound VNOPs?");
			}

			if ((ndp->ni_flag & NAMEI_CONTLOOKUP) == 0) {
				panic("EKEEPLOOKING, but continue flag not set?");
			}

			if (vnode_isdir(vp)) {
				error = EISDIR;
				goto out;
			}
			goto continue_lookup;
		} else if (error == ENOENT && batched) {
			if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
				/*
				 * For compound VNOPs, the authorization callback may
				 * return ENOENT in case of racing hardlink lookups
				 * hitting the name cache, redrive the lookup.
				 */
				do_retry = 1;
				retry_count += 1;
				goto out;
			}
		}
	}

	/*
	 * Call out to allow 3rd party notification of delete.
	 * Ignore result of kauth_authorize_fileop call.
	 */
	if (!error) {
		if (has_listeners) {
			kauth_authorize_fileop(vfs_context_ucred(ctx),
			    KAUTH_FILEOP_DELETE,
			    (uintptr_t)vp,
			    (uintptr_t)path);
		}

		if (vp->v_flag & VISHARDLINK) {
			//
			// if a hardlink gets deleted we want to blow away the
			// v_parent link because the path that got us to this
			// instance of the link is no longer valid. this will
			// force the next call to get the path to ask the file
			// system instead of just following the v_parent link.
			//
			vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
		}

#if CONFIG_FSE
		if (need_event) {
			if (vp->v_flag & VISHARDLINK) {
				get_fse_info(vp, finfop, ctx);
			} else if (vap) {
				vnode_get_fse_info_from_vap(vp, finfop, vap);
			}
			if (truncated_path) {
				finfop->mode |= FSE_TRUNCATED_PATH;
			}
			add_fsevent(FSE_DELETE, ctx,
			    FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
			    FSE_ARG_FINFO, finfop,
			    FSE_ARG_DONE);
		}
#endif

#if CONFIG_MACF
		mac_vnode_notify_unlink(ctx, dvp, vp, cnp);
#endif
	}

out:
	if (locked_vp) {
		assert(locked_vp == vp);
		vnode_link_unlock(locked_vp);
		locked_vp = NULLVP;
	}

	if (path != NULL) {
		RELEASE_PATH(path);
		path = NULL;
	}

	if (no_firmlink_path != NULL) {
		RELEASE_PATH(no_firmlink_path);
		no_firmlink_path = NULL;
	}
#if NAMEDRSRCFORK
	/* recycle the deleted rsrc fork vnode to force a reclaim, which
	 * will cause its shadow file to go away if necessary.
	 */
	if (vp && (vnode_isnamedstream(vp)) &&
	    (vp->v_parent != NULLVP) &&
	    vnode_isshadow(vp)) {
		vnode_recycle(vp);
	}
#endif
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(ndp);
	vnode_put(dvp);
	if (vp) {
		vnode_put(vp);
	}

	/* All iocounts dropped; safe to redrive the whole lookup. */
	if (do_retry) {
		goto retry;
	}

early_out:
	kfree_type(typeof(*__unlink_data), __unlink_data);
	return error;
}
6451
6452 int
unlink1(vfs_context_t ctx,vnode_t start_dvp,user_addr_t path_arg,enum uio_seg segflg,int unlink_flags)6453 unlink1(vfs_context_t ctx, vnode_t start_dvp, user_addr_t path_arg,
6454 enum uio_seg segflg, int unlink_flags)
6455 {
6456 return unlinkat_internal(ctx, AT_FDCWD, start_dvp, path_arg, segflg,
6457 unlink_flags);
6458 }
6459
6460 /*
6461 * Delete a name from the filesystem using Carbon semantics.
6462 */
6463 int
delete(__unused proc_t p,struct delete_args * uap,__unused int32_t * retval)6464 delete(__unused proc_t p, struct delete_args *uap, __unused int32_t *retval)
6465 {
6466 return unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
6467 uap->path, UIO_USERSPACE, VNODE_REMOVE_NODELETEBUSY);
6468 }
6469
6470 /*
6471 * Delete a name from the filesystem using POSIX semantics.
6472 */
6473 int
unlink(__unused proc_t p,struct unlink_args * uap,__unused int32_t * retval)6474 unlink(__unused proc_t p, struct unlink_args *uap, __unused int32_t *retval)
6475 {
6476 return unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
6477 uap->path, UIO_USERSPACE, 0);
6478 }
6479
6480 int
unlinkat(__unused proc_t p,struct unlinkat_args * uap,__unused int32_t * retval)6481 unlinkat(__unused proc_t p, struct unlinkat_args *uap, __unused int32_t *retval)
6482 {
6483 int unlink_flags = 0;
6484
6485 if (uap->flag & ~(AT_REMOVEDIR | AT_REMOVEDIR_DATALESS | AT_SYMLINK_NOFOLLOW_ANY | AT_SYSTEM_DISCARDED)) {
6486 return EINVAL;
6487 }
6488
6489 if (uap->flag & AT_SYMLINK_NOFOLLOW_ANY) {
6490 unlink_flags |= VNODE_REMOVE_NOFOLLOW_ANY;
6491 }
6492
6493 if (uap->flag & AT_SYSTEM_DISCARDED) {
6494 unlink_flags |= VNODE_REMOVE_SYSTEM_DISCARDED;
6495 }
6496
6497 if (uap->flag & (AT_REMOVEDIR | AT_REMOVEDIR_DATALESS)) {
6498 if (uap->flag & AT_REMOVEDIR_DATALESS) {
6499 unlink_flags |= VNODE_REMOVE_DATALESS_DIR;
6500 }
6501 return rmdirat_internal(vfs_context_current(), uap->fd,
6502 uap->path, UIO_USERSPACE, unlink_flags);
6503 } else {
6504 return unlinkat_internal(vfs_context_current(), uap->fd,
6505 NULLVP, uap->path, UIO_USERSPACE, unlink_flags);
6506 }
6507 }
6508
6509 /*
6510 * Reposition read/write file offset.
6511 */
int
lseek(proc_t p, struct lseek_args *uap, off_t *retval)
{
	struct fileproc *fp;
	vnode_t vp;
	struct vfs_context *ctx;
	off_t offset = uap->offset, file_size;
	int error;

	/* Resolve the fd to a fileproc + vnode; takes a usecount we must
	 * drop with file_drop() on every exit path. */
	if ((error = fp_getfvp(p, uap->fd, &fp, &vp))) {
		if (error == ENOTSUP) {
			/* Non-vnode fd (socket, pipe, ...): not seekable. */
			return ESPIPE;
		}
		return error;
	}
	if (vnode_isfifo(vp)) {
		file_drop(uap->fd);
		return ESPIPE;
	}


	ctx = vfs_context_current();
#if CONFIG_MACF
	/* lseek(fd, 0, SEEK_CUR) only reads the offset; everything else is
	 * treated as an offset change for MAC purposes. */
	if (uap->whence == L_INCR && uap->offset == 0) {
		error = mac_file_check_get_offset(vfs_context_ucred(ctx),
		    fp->fp_glob);
	} else {
		error = mac_file_check_change_offset(vfs_context_ucred(ctx),
		    fp->fp_glob);
	}
	if (error) {
		file_drop(uap->fd);
		return error;
	}
#endif
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	/* Compute the absolute target offset per 'whence'. */
	switch (uap->whence) {
	case L_INCR:	/* SEEK_CUR */
		offset += fp->fp_glob->fg_offset;
		break;
	case L_XTND:	/* SEEK_END */
		if ((error = vnode_size(vp, &file_size, ctx)) != 0) {
			break;
		}
		offset += file_size;
		break;
	case L_SET:	/* SEEK_SET: offset used as-is */
		break;
	case SEEK_HOLE:
		/* Filesystem locates the next hole/data run at 'offset'. */
		error = VNOP_IOCTL(vp, FSIOC_FIOSEEKHOLE, (caddr_t)&offset, 0, ctx);
		break;
	case SEEK_DATA:
		error = VNOP_IOCTL(vp, FSIOC_FIOSEEKDATA, (caddr_t)&offset, 0, ctx);
		break;
	default:
		error = EINVAL;
	}
	if (error == 0) {
		if (uap->offset > 0 && offset < 0) {
			/* Incremented/relative move past max size */
			error = EOVERFLOW;
		} else {
			/*
			 * Allow negative offsets on character devices, per
			 * POSIX 1003.1-2001. Most likely for writing disk
			 * labels.
			 */
			if (offset < 0 && vp->v_type != VCHR) {
				/* Decremented/relative move before start */
				error = EINVAL;
			} else {
				/* Success */
				fp->fp_glob->fg_offset = offset;
				*retval = fp->fp_glob->fg_offset;
			}
		}
	}

	/*
	 * An lseek can affect whether data is "available to read." Use
	 * hint of NOTE_NONE so no EVFILT_VNODE events fire
	 */
	post_event_if_success(vp, error, NOTE_NONE);
	(void)vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
6603
6604
6605 /*
6606 * Check access permissions.
6607 *
6608 * Returns: 0 Success
6609 * vnode_authorize:???
6610 */
6611 static int
access1(vnode_t vp,vnode_t dvp,int uflags,vfs_context_t ctx)6612 access1(vnode_t vp, vnode_t dvp, int uflags, vfs_context_t ctx)
6613 {
6614 kauth_action_t action;
6615 int error;
6616
6617 /*
6618 * If just the regular access bits, convert them to something
6619 * that vnode_authorize will understand.
6620 */
6621 if (!(uflags & _ACCESS_EXTENDED_MASK)) {
6622 action = 0;
6623 if (uflags & R_OK) {
6624 action |= KAUTH_VNODE_READ_DATA; /* aka KAUTH_VNODE_LIST_DIRECTORY */
6625 }
6626 if (uflags & W_OK) {
6627 if (vnode_isdir(vp)) {
6628 action |= KAUTH_VNODE_ADD_FILE |
6629 KAUTH_VNODE_ADD_SUBDIRECTORY;
6630 /* might want delete rights here too */
6631 } else {
6632 action |= KAUTH_VNODE_WRITE_DATA;
6633 }
6634 }
6635 if (uflags & X_OK) {
6636 if (vnode_isdir(vp)) {
6637 action |= KAUTH_VNODE_SEARCH;
6638 } else {
6639 action |= KAUTH_VNODE_EXECUTE;
6640 }
6641 }
6642 } else {
6643 /* take advantage of definition of uflags */
6644 action = uflags >> 8;
6645 }
6646
6647 #if CONFIG_MACF
6648 error = mac_vnode_check_access(ctx, vp, uflags);
6649 if (error) {
6650 return error;
6651 }
6652 #endif /* MAC */
6653
6654 /* action == 0 means only check for existence */
6655 if (action != 0) {
6656 error = vnode_authorize(vp, dvp, action | KAUTH_VNODE_ACCESS, ctx);
6657 } else {
6658 error = 0;
6659 }
6660
6661 return error;
6662 }
6663
6664
6665
6666 /*
6667 * access_extended: Check access permissions in bulk.
6668 *
6669 * Description: uap->entries Pointer to an array of accessx
6670 * descriptor structs, plus one or
6671 * more NULL terminated strings (see
6672 * "Notes" section below).
6673 * uap->size Size of the area pointed to by
6674 * uap->entries.
6675 * uap->results Pointer to the results array.
6676 *
6677 * Returns: 0 Success
6678 * ENOMEM Insufficient memory
6679 * EINVAL Invalid arguments
6680 * namei:EFAULT Bad address
6681 * namei:ENAMETOOLONG Filename too long
6682 * namei:ENOENT No such file or directory
6683 * namei:ELOOP Too many levels of symbolic links
6684 * namei:EBADF Bad file descriptor
6685 * namei:ENOTDIR Not a directory
6686 * namei:???
6687 * access1:
6688 *
6689 * Implicit returns:
6690 * uap->results Array contents modified
6691 *
6692 * Notes: The uap->entries are structured as an arbitrary length array
6693 * of accessx descriptors, followed by one or more NULL terminated
6694 * strings
6695 *
6696 * struct accessx_descriptor[0]
6697 * ...
6698 * struct accessx_descriptor[n]
6699 * char name_data[0];
6700 *
6701 * We determine the entry count by walking the buffer containing
6702 * the uap->entries argument descriptor. For each descriptor we
6703 * see, the valid values for the offset ad_name_offset will be
6704 * in the byte range:
6705 *
6706 * [ uap->entries + sizeof(struct accessx_descriptor) ]
6707 * to
6708 * [ uap->entries + uap->size - 2 ]
6709 *
6710 * since we must have at least one string, and the string must
6711 * be at least one character plus the NULL terminator in length.
6712 *
6713 * XXX: Need to support the check-as uid argument
6714 */
int
access_extended(__unused proc_t p, struct access_extended_args *uap, __unused int32_t *retval)
{
	struct accessx_descriptor *input = NULL;
	errno_t *result = NULL;
	errno_t error = 0;
	int wantdelete = 0;
	size_t desc_max, desc_actual = 0;
	unsigned int i, j;
	struct vfs_context context;
	struct nameidata nd;
	int niopts;
	vnode_t vp = NULL;	/* vnode of the most recently looked-up name */
	vnode_t dvp = NULL;	/* its parent, held only when deletion is checked */
#define ACCESSX_MAX_DESCR_ON_STACK 10
	struct accessx_descriptor stack_input[ACCESSX_MAX_DESCR_ON_STACK];

	/* must be NULL so the cleanup path can tell if a cred was taken */
	context.vc_ucred = NULL;

	/*
	 * Validate parameters; if valid, copy the descriptor array and string
	 * arguments into local memory.  Before proceeding, the following
	 * conditions must have been met:
	 *
	 * o The total size is not permitted to exceed ACCESSX_MAX_TABLESIZE
	 * o There must be sufficient room in the request for at least one
	 *   descriptor and a one byte NUL terminated string.
	 * o The allocation of local storage must not fail.
	 */
	if (uap->size > ACCESSX_MAX_TABLESIZE) {
		return ENOMEM;
	}
	if (uap->size < (sizeof(struct accessx_descriptor) + 2)) {
		return EINVAL;
	}
	/* small requests are served from the on-stack buffer */
	if (uap->size <= sizeof(stack_input)) {
		input = stack_input;
	} else {
		input = kalloc_data(uap->size, Z_WAITOK);
		if (input == NULL) {
			error = ENOMEM;
			goto out;
		}
	}
	error = copyin(uap->entries, input, uap->size);
	if (error) {
		goto out;
	}

	AUDIT_ARG(opaque, input, uap->size);

	/*
	 * Force NUL termination of the copyin buffer to avoid namei() running
	 * off the end.  If the caller passes us bogus data, they may get a
	 * bogus result.
	 */
	((char *)input)[uap->size - 1] = 0;

	/*
	 * Access is defined as checking against the process' real identity,
	 * even if operations are checking the effective identity.  This
	 * requires that we use a local vfs context.
	 */
	context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
	context.vc_thread = current_thread();

	/*
	 * Find out how many entries we have, so we can allocate the result
	 * array by walking the list and adjusting the count downward by the
	 * earliest string offset we see.  Note that desc_actual may shrink
	 * while this loop runs, which also shortens the scan itself.
	 */
	desc_max = (uap->size - 2) / sizeof(struct accessx_descriptor);
	desc_actual = desc_max;
	for (i = 0; i < desc_actual; i++) {
		/*
		 * Take the offset to the name string for this entry and
		 * convert to an input array index, which would be one off
		 * the end of the array if this entry was the lowest-addressed
		 * name string.
		 */
		j = input[i].ad_name_offset / sizeof(struct accessx_descriptor);

		/*
		 * An offset greater than the max allowable offset is an error.
		 * It is also an error for any valid entry to point
		 * to a location prior to the end of the current entry, if
		 * it's not a reference to the string of the previous entry.
		 */
		if (j > desc_max || (j != 0 && j <= i)) {
			error = EINVAL;
			goto out;
		}

		/* Also do not let ad_name_offset point to something beyond the size of the input */
		if (input[i].ad_name_offset >= uap->size) {
			error = EINVAL;
			goto out;
		}

		/*
		 * An offset of 0 means use the previous descriptor's offset;
		 * this is used to chain multiple requests for the same file
		 * to avoid multiple lookups.
		 */
		if (j == 0) {
			/* This is not valid for the first entry */
			if (i == 0) {
				error = EINVAL;
				goto out;
			}
			continue;
		}

		/*
		 * If the offset of the string for this descriptor is before
		 * what we believe is the current actual last descriptor,
		 * then we need to adjust our estimate downward; this permits
		 * the string table following the last descriptor to be out
		 * of order relative to the descriptor list.
		 */
		if (j < desc_actual) {
			desc_actual = j;
		}
	}

	/*
	 * We limit the actual number of descriptors we are willing to process
	 * to a hard maximum of ACCESSX_MAX_DESCRIPTORS.  If the number being
	 * requested does not exceed this limit,
	 */
	if (desc_actual > ACCESSX_MAX_DESCRIPTORS) {
		error = ENOMEM;
		goto out;
	}
	/* Z_ZERO: every result defaults to 0 (success) unless set below */
	result = kalloc_data(desc_actual * sizeof(errno_t), Z_WAITOK | Z_ZERO);
	if (result == NULL) {
		error = ENOMEM;
		goto out;
	}

	/*
	 * Do the work by iterating over the descriptor entries we know to
	 * at least appear to contain valid data.
	 */
	error = 0;
	for (i = 0; i < desc_actual; i++) {
		/*
		 * If the ad_name_offset is 0, then we use the previous
		 * results to make the check; otherwise, we are looking up
		 * a new file name.
		 */
		if (input[i].ad_name_offset != 0) {
			/* discard old vnodes */
			if (vp) {
				vnode_put(vp);
				vp = NULL;
			}
			if (dvp) {
				vnode_put(dvp);
				dvp = NULL;
			}

			/*
			 * Scan forward in the descriptor list to see if we
			 * need the parent vnode.  We will need it if we are
			 * deleting, since we must have rights to remove
			 * entries in the parent directory, as well as the
			 * rights to delete the object itself.
			 */
			wantdelete = input[i].ad_flags & _DELETE_OK;
			for (j = i + 1; (j < desc_actual) && (input[j].ad_name_offset == 0); j++) {
				if (input[j].ad_flags & _DELETE_OK) {
					wantdelete = 1;
				}
			}

			niopts = FOLLOW | AUDITVNPATH1;

			/* need parent for vnode_authorize for deletion test */
			if (wantdelete) {
				niopts |= WANTPARENT;
			}

			/* do the lookup */
			NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, UIO_SYSSPACE,
			    CAST_USER_ADDR_T(((const char *)input) + input[i].ad_name_offset),
			    &context);
			error = namei(&nd);
			if (!error) {
				vp = nd.ni_vp;
				if (wantdelete) {
					dvp = nd.ni_dvp;
				}
			}
			nameidone(&nd);
		}

		/*
		 * Handle lookup errors.  Per-file failures are reported in
		 * the result array; anything else aborts the whole call.
		 */
		switch (error) {
		case ENOENT:
		case EACCES:
		case EPERM:
		case ENOTDIR:
			result[i] = error;
			break;
		case 0:
			/* run this access check */
			result[i] = access1(vp, dvp, input[i].ad_flags, &context);
			break;
		default:
			/* fatal lookup error */

			goto out;
		}
	}

	AUDIT_ARG(data, result, sizeof(errno_t), desc_actual);

	/* copy out results */
	error = copyout(result, uap->results, desc_actual * sizeof(errno_t));

out:
	if (input && input != stack_input) {
		kfree_data(input, uap->size);
	}
	if (result) {
		kfree_data(result, desc_actual * sizeof(errno_t));
	}
	if (vp) {
		vnode_put(vp);
	}
	if (dvp) {
		vnode_put(dvp);
	}
	if (IS_VALID_CRED(context.vc_ucred)) {
		kauth_cred_unref(&context.vc_ucred);
	}
	return error;
}
6956
6957
6958 /*
6959 * Returns: 0 Success
6960 * namei:EFAULT Bad address
6961 * namei:ENAMETOOLONG Filename too long
6962 * namei:ENOENT No such file or directory
6963 * namei:ELOOP Too many levels of symbolic links
6964 * namei:EBADF Bad file descriptor
6965 * namei:ENOTDIR Not a directory
6966 * namei:???
6967 * access1:
6968 */
static int
faccessat_internal(vfs_context_t ctx, int fd, user_addr_t path, int amode,
    int flag, enum uio_seg segflg)
{
	int error;
	struct nameidata nd;
	int niopts;
	/* local context: may carry a different credential than ctx */
	struct vfs_context context;
#if NAMEDRSRCFORK
	int is_namedstream = 0;
#endif

	/*
	 * Unless the AT_EACCESS option is used, Access is defined as checking
	 * against the process' real identity, even if operations are checking
	 * the effective identity.  So we need to tweak the credential
	 * in the context for that case.
	 */
	if (!(flag & AT_EACCESS)) {
		/* takes a reference; dropped at 'out' below */
		context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
	} else {
		/* borrow the caller's credential; no reference taken */
		context.vc_ucred = ctx->vc_ucred;
	}
	context.vc_thread = ctx->vc_thread;


	niopts = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY) ? NOFOLLOW : FOLLOW) | AUDITVNPATH1;
	/* need parent for vnode_authorize for deletion test */
	if (amode & _DELETE_OK) {
		niopts |= WANTPARENT;
	}
	NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, segflg,
	    path, &context);
	if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
		/* fail the lookup if any path component is a symlink */
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}

#if NAMEDRSRCFORK
	/* access(F_OK) calls are allowed for resource forks. */
	if (amode == F_OK) {
		nd.ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
	}
#endif
	error = nameiat(&nd, fd);
	if (error) {
		goto out;
	}

#if NAMEDRSRCFORK
	/* Grab reference on the shadow stream file vnode to
	 * force an inactive on release which will mark it
	 * for recycle.
	 */
	if (vnode_isnamedstream(nd.ni_vp) &&
	    (nd.ni_vp->v_parent != NULLVP) &&
	    vnode_isshadow(nd.ni_vp)) {
		is_namedstream = 1;
		vnode_ref(nd.ni_vp);
	}
#endif

	error = access1(nd.ni_vp, nd.ni_dvp, amode, &context);

#if NAMEDRSRCFORK
	if (is_namedstream) {
		vnode_rele(nd.ni_vp);
	}
#endif

	vnode_put(nd.ni_vp);
	/* nd.ni_dvp is only held when WANTPARENT was requested above */
	if (amode & _DELETE_OK) {
		vnode_put(nd.ni_dvp);
	}
	nameidone(&nd);

out:
	/* drop the real-identity credential taken above, if any */
	if (!(flag & AT_EACCESS)) {
		kauth_cred_unref(&context.vc_ucred);
	}
	return error;
}
7050
7051 int
access(__unused proc_t p,struct access_args * uap,__unused int32_t * retval)7052 access(__unused proc_t p, struct access_args *uap, __unused int32_t *retval)
7053 {
7054 return faccessat_internal(vfs_context_current(), AT_FDCWD,
7055 uap->path, uap->flags, 0, UIO_USERSPACE);
7056 }
7057
7058 int
faccessat(__unused proc_t p,struct faccessat_args * uap,__unused int32_t * retval)7059 faccessat(__unused proc_t p, struct faccessat_args *uap,
7060 __unused int32_t *retval)
7061 {
7062 if (uap->flag & ~(AT_EACCESS | AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) {
7063 return EINVAL;
7064 }
7065
7066 return faccessat_internal(vfs_context_current(), uap->fd,
7067 uap->path, uap->amode, uap->flag, UIO_USERSPACE);
7068 }
7069
7070 /*
7071 * Returns: 0 Success
7072 * EFAULT
7073 * copyout:EFAULT
7074 * namei:???
7075 * vn_stat:???
7076 */
7077 static int
fstatat_internal(vfs_context_t ctx,user_addr_t path,user_addr_t ub,user_addr_t xsecurity,user_addr_t xsecurity_size,int isstat64,enum uio_seg segflg,int fd,int flag)7078 fstatat_internal(vfs_context_t ctx, user_addr_t path, user_addr_t ub,
7079 user_addr_t xsecurity, user_addr_t xsecurity_size, int isstat64,
7080 enum uio_seg segflg, int fd, int flag)
7081 {
7082 struct nameidata *ndp = NULL;
7083 int follow;
7084 union {
7085 struct stat sb;
7086 struct stat64 sb64;
7087 } source = {};
7088 union {
7089 struct user64_stat user64_sb;
7090 struct user32_stat user32_sb;
7091 struct user64_stat64 user64_sb64;
7092 struct user32_stat64 user32_sb64;
7093 } dest = {};
7094 caddr_t sbp;
7095 int error, my_size;
7096 kauth_filesec_t fsec = KAUTH_FILESEC_NONE;
7097 size_t xsecurity_bufsize;
7098 void * statptr;
7099 struct fileproc *fp = NULL;
7100 int needsrealdev = 0;
7101
7102 follow = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) ? NOFOLLOW : FOLLOW;
7103 ndp = kalloc_type(struct nameidata, Z_WAITOK);
7104 NDINIT(ndp, LOOKUP, OP_GETATTR, follow | AUDITVNPATH1,
7105 segflg, path, ctx);
7106 if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
7107 ndp->ni_flag |= NAMEI_NOFOLLOW_ANY;
7108 }
7109
7110 #if NAMEDRSRCFORK
7111 int is_namedstream = 0;
7112 /* stat calls are allowed for resource forks. */
7113 ndp->ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
7114 #endif
7115
7116 if (flag & AT_FDONLY) {
7117 vnode_t fvp;
7118
7119 error = fp_getfvp(vfs_context_proc(ctx), fd, &fp, &fvp);
7120 if (error) {
7121 goto out;
7122 }
7123 if ((error = vnode_getwithref(fvp))) {
7124 file_drop(fd);
7125 goto out;
7126 }
7127 ndp->ni_vp = fvp;
7128 } else {
7129 error = nameiat(ndp, fd);
7130 if (error) {
7131 goto out;
7132 }
7133 }
7134
7135 statptr = (void *)&source;
7136
7137 #if NAMEDRSRCFORK
7138 /* Grab reference on the shadow stream file vnode to
7139 * force an inactive on release which will mark it
7140 * for recycle.
7141 */
7142 if (vnode_isnamedstream(ndp->ni_vp) &&
7143 (ndp->ni_vp->v_parent != NULLVP) &&
7144 vnode_isshadow(ndp->ni_vp)) {
7145 is_namedstream = 1;
7146 vnode_ref(ndp->ni_vp);
7147 }
7148 #endif
7149
7150 needsrealdev = flag & AT_REALDEV ? 1 : 0;
7151 if (fp && (xsecurity == USER_ADDR_NULL)) {
7152 /*
7153 * If the caller has the file open, and is not
7154 * requesting extended security information, we are
7155 * going to let them get the basic stat information.
7156 */
7157 error = vn_stat_noauth(ndp->ni_vp, statptr, NULL, isstat64, needsrealdev, ctx,
7158 fp->fp_glob->fg_cred);
7159 } else {
7160 error = vn_stat(ndp->ni_vp, statptr, (xsecurity != USER_ADDR_NULL ? &fsec : NULL),
7161 isstat64, needsrealdev, ctx);
7162 }
7163
7164 #if NAMEDRSRCFORK
7165 if (is_namedstream) {
7166 vnode_rele(ndp->ni_vp);
7167 }
7168 #endif
7169 vnode_put(ndp->ni_vp);
7170 nameidone(ndp);
7171
7172 if (fp) {
7173 file_drop(fd);
7174 fp = NULL;
7175 }
7176
7177 if (error) {
7178 goto out;
7179 }
7180 /* Zap spare fields */
7181 if (isstat64 != 0) {
7182 source.sb64.st_lspare = 0;
7183 source.sb64.st_qspare[0] = 0LL;
7184 source.sb64.st_qspare[1] = 0LL;
7185 if (vfs_context_is64bit(ctx)) {
7186 munge_user64_stat64(&source.sb64, &dest.user64_sb64);
7187 my_size = sizeof(dest.user64_sb64);
7188 sbp = (caddr_t)&dest.user64_sb64;
7189 } else {
7190 munge_user32_stat64(&source.sb64, &dest.user32_sb64);
7191 my_size = sizeof(dest.user32_sb64);
7192 sbp = (caddr_t)&dest.user32_sb64;
7193 }
7194 /*
7195 * Check if we raced (post lookup) against the last unlink of a file.
7196 */
7197 if ((source.sb64.st_nlink == 0) && S_ISREG(source.sb64.st_mode)) {
7198 source.sb64.st_nlink = 1;
7199 }
7200 } else {
7201 source.sb.st_lspare = 0;
7202 source.sb.st_qspare[0] = 0LL;
7203 source.sb.st_qspare[1] = 0LL;
7204 if (vfs_context_is64bit(ctx)) {
7205 munge_user64_stat(&source.sb, &dest.user64_sb);
7206 my_size = sizeof(dest.user64_sb);
7207 sbp = (caddr_t)&dest.user64_sb;
7208 } else {
7209 munge_user32_stat(&source.sb, &dest.user32_sb);
7210 my_size = sizeof(dest.user32_sb);
7211 sbp = (caddr_t)&dest.user32_sb;
7212 }
7213
7214 /*
7215 * Check if we raced (post lookup) against the last unlink of a file.
7216 */
7217 if ((source.sb.st_nlink == 0) && S_ISREG(source.sb.st_mode)) {
7218 source.sb.st_nlink = 1;
7219 }
7220 }
7221 if ((error = copyout(sbp, ub, my_size)) != 0) {
7222 goto out;
7223 }
7224
7225 /* caller wants extended security information? */
7226 if (xsecurity != USER_ADDR_NULL) {
7227 /* did we get any? */
7228 if (fsec == KAUTH_FILESEC_NONE) {
7229 if (susize(xsecurity_size, 0) != 0) {
7230 error = EFAULT;
7231 goto out;
7232 }
7233 } else {
7234 /* find the user buffer size */
7235 xsecurity_bufsize = fusize(xsecurity_size);
7236
7237 /* copy out the actual data size */
7238 if (susize(xsecurity_size, KAUTH_FILESEC_COPYSIZE(fsec)) != 0) {
7239 error = EFAULT;
7240 goto out;
7241 }
7242
7243 /* if the caller supplied enough room, copy out to it */
7244 if (xsecurity_bufsize >= KAUTH_FILESEC_COPYSIZE(fsec)) {
7245 error = copyout(fsec, xsecurity, KAUTH_FILESEC_COPYSIZE(fsec));
7246 }
7247 }
7248 }
7249 out:
7250 if (ndp) {
7251 kfree_type(struct nameidata, ndp);
7252 }
7253 if (fsec != KAUTH_FILESEC_NONE) {
7254 kauth_filesec_free(fsec);
7255 }
7256 return error;
7257 }
7258
7259 /*
7260 * stat_extended: Get file status; with extended security (ACL).
7261 *
7262 * Parameters: p (ignored)
7263 * uap User argument descriptor (see below)
7264 * retval (ignored)
7265 *
7266 * Indirect: uap->path Path of file to get status from
7267 * uap->ub User buffer (holds file status info)
7268 * uap->xsecurity ACL to get (extended security)
7269 * uap->xsecurity_size Size of ACL
7270 *
7271 * Returns: 0 Success
7272 * !0 errno value
7273 *
7274 */
7275 int
stat_extended(__unused proc_t p,struct stat_extended_args * uap,__unused int32_t * retval)7276 stat_extended(__unused proc_t p, struct stat_extended_args *uap,
7277 __unused int32_t *retval)
7278 {
7279 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7280 uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
7281 0);
7282 }
7283
7284 /*
7285 * Returns: 0 Success
7286 * fstatat_internal:??? [see fstatat_internal() in this file]
7287 */
7288 int
stat(__unused proc_t p,struct stat_args * uap,__unused int32_t * retval)7289 stat(__unused proc_t p, struct stat_args *uap, __unused int32_t *retval)
7290 {
7291 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7292 0, 0, 0, UIO_USERSPACE, AT_FDCWD, 0);
7293 }
7294
7295 int
stat64(__unused proc_t p,struct stat64_args * uap,__unused int32_t * retval)7296 stat64(__unused proc_t p, struct stat64_args *uap, __unused int32_t *retval)
7297 {
7298 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7299 0, 0, 1, UIO_USERSPACE, AT_FDCWD, 0);
7300 }
7301
7302 /*
7303 * stat64_extended: Get file status; can handle large inode numbers; with extended security (ACL).
7304 *
7305 * Parameters: p (ignored)
7306 * uap User argument descriptor (see below)
7307 * retval (ignored)
7308 *
7309 * Indirect: uap->path Path of file to get status from
7310 * uap->ub User buffer (holds file status info)
7311 * uap->xsecurity ACL to get (extended security)
7312 * uap->xsecurity_size Size of ACL
7313 *
7314 * Returns: 0 Success
7315 * !0 errno value
7316 *
7317 */
7318 int
stat64_extended(__unused proc_t p,struct stat64_extended_args * uap,__unused int32_t * retval)7319 stat64_extended(__unused proc_t p, struct stat64_extended_args *uap, __unused int32_t *retval)
7320 {
7321 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7322 uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
7323 0);
7324 }
7325
7326 /*
7327 * lstat_extended: Get file status; does not follow links; with extended security (ACL).
7328 *
7329 * Parameters: p (ignored)
7330 * uap User argument descriptor (see below)
7331 * retval (ignored)
7332 *
7333 * Indirect: uap->path Path of file to get status from
7334 * uap->ub User buffer (holds file status info)
7335 * uap->xsecurity ACL to get (extended security)
7336 * uap->xsecurity_size Size of ACL
7337 *
7338 * Returns: 0 Success
7339 * !0 errno value
7340 *
7341 */
7342 int
lstat_extended(__unused proc_t p,struct lstat_extended_args * uap,__unused int32_t * retval)7343 lstat_extended(__unused proc_t p, struct lstat_extended_args *uap, __unused int32_t *retval)
7344 {
7345 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7346 uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
7347 AT_SYMLINK_NOFOLLOW);
7348 }
7349
7350 /*
7351 * Get file status; this version does not follow links.
7352 */
7353 int
lstat(__unused proc_t p,struct lstat_args * uap,__unused int32_t * retval)7354 lstat(__unused proc_t p, struct lstat_args *uap, __unused int32_t *retval)
7355 {
7356 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7357 0, 0, 0, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW);
7358 }
7359
7360 int
lstat64(__unused proc_t p,struct lstat64_args * uap,__unused int32_t * retval)7361 lstat64(__unused proc_t p, struct lstat64_args *uap, __unused int32_t *retval)
7362 {
7363 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7364 0, 0, 1, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW);
7365 }
7366
7367 /*
7368 * lstat64_extended: Get file status; can handle large inode numbers; does not
7369 * follow links; with extended security (ACL).
7370 *
7371 * Parameters: p (ignored)
7372 * uap User argument descriptor (see below)
7373 * retval (ignored)
7374 *
7375 * Indirect: uap->path Path of file to get status from
7376 * uap->ub User buffer (holds file status info)
7377 * uap->xsecurity ACL to get (extended security)
7378 * uap->xsecurity_size Size of ACL
7379 *
7380 * Returns: 0 Success
7381 * !0 errno value
7382 *
7383 */
7384 int
lstat64_extended(__unused proc_t p,struct lstat64_extended_args * uap,__unused int32_t * retval)7385 lstat64_extended(__unused proc_t p, struct lstat64_extended_args *uap, __unused int32_t *retval)
7386 {
7387 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7388 uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
7389 AT_SYMLINK_NOFOLLOW);
7390 }
7391
7392 int
fstatat(__unused proc_t p,struct fstatat_args * uap,__unused int32_t * retval)7393 fstatat(__unused proc_t p, struct fstatat_args *uap, __unused int32_t *retval)
7394 {
7395 if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_REALDEV | AT_FDONLY | AT_SYMLINK_NOFOLLOW_ANY)) {
7396 return EINVAL;
7397 }
7398
7399 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7400 0, 0, 0, UIO_USERSPACE, uap->fd, uap->flag);
7401 }
7402
7403 int
fstatat64(__unused proc_t p,struct fstatat64_args * uap,__unused int32_t * retval)7404 fstatat64(__unused proc_t p, struct fstatat64_args *uap,
7405 __unused int32_t *retval)
7406 {
7407 if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_REALDEV | AT_FDONLY | AT_SYMLINK_NOFOLLOW_ANY)) {
7408 return EINVAL;
7409 }
7410
7411 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7412 0, 0, 1, UIO_USERSPACE, uap->fd, uap->flag);
7413 }
7414
7415 /*
7416 * Get configurable pathname variables.
7417 *
7418 * Returns: 0 Success
7419 * namei:???
7420 * vn_pathconf:???
7421 *
7422 * Notes: Global implementation constants are intended to be
7423 * implemented in this function directly; all other constants
7424 * are per-FS implementation, and therefore must be handled in
7425 * each respective FS, instead.
7426 *
7427 * XXX We implement some things globally right now that should actually be
7428 * XXX per-FS; we will need to deal with this at some point.
7429 */
7430 /* ARGSUSED */
7431 int
pathconf(__unused proc_t p,struct pathconf_args * uap,int32_t * retval)7432 pathconf(__unused proc_t p, struct pathconf_args *uap, int32_t *retval)
7433 {
7434 int error;
7435 struct nameidata nd;
7436 vfs_context_t ctx = vfs_context_current();
7437
7438 NDINIT(&nd, LOOKUP, OP_PATHCONF, FOLLOW | AUDITVNPATH1,
7439 UIO_USERSPACE, uap->path, ctx);
7440 error = namei(&nd);
7441 if (error) {
7442 return error;
7443 }
7444
7445 error = vn_pathconf(nd.ni_vp, uap->name, retval, ctx);
7446
7447 vnode_put(nd.ni_vp);
7448 nameidone(&nd);
7449 return error;
7450 }
7451
7452 /*
7453 * Return target name of a symbolic link.
7454 */
7455 /* ARGSUSED */
7456 static int
readlinkat_internal(vfs_context_t ctx,int fd,vnode_t lnk_vp,user_addr_t path,enum uio_seg seg,user_addr_t buf,size_t bufsize,enum uio_seg bufseg,int * retval)7457 readlinkat_internal(vfs_context_t ctx, int fd, vnode_t lnk_vp, user_addr_t path,
7458 enum uio_seg seg, user_addr_t buf, size_t bufsize, enum uio_seg bufseg,
7459 int *retval)
7460 {
7461 vnode_t vp;
7462 uio_t auio;
7463 int error;
7464 struct nameidata nd;
7465 UIO_STACKBUF(uio_buf, 1);
7466 bool put_vnode;
7467
7468 if (bufsize > INT32_MAX) {
7469 return EINVAL;
7470 }
7471
7472 if (lnk_vp) {
7473 vp = lnk_vp;
7474 put_vnode = false;
7475 } else {
7476 NDINIT(&nd, LOOKUP, OP_READLINK, NOFOLLOW | AUDITVNPATH1,
7477 seg, path, ctx);
7478
7479 error = nameiat(&nd, fd);
7480 if (error) {
7481 return error;
7482 }
7483 vp = nd.ni_vp;
7484 put_vnode = true;
7485 nameidone(&nd);
7486 }
7487
7488 auio = uio_createwithbuffer(1, 0, bufseg, UIO_READ,
7489 &uio_buf[0], sizeof(uio_buf));
7490 uio_addiov(auio, buf, bufsize);
7491 if (vp->v_type != VLNK) {
7492 error = EINVAL;
7493 } else {
7494 #if CONFIG_MACF
7495 error = mac_vnode_check_readlink(ctx, vp);
7496 #endif
7497 if (error == 0) {
7498 error = vnode_authorize(vp, NULL, KAUTH_VNODE_READ_DATA,
7499 ctx);
7500 }
7501 if (error == 0) {
7502 error = VNOP_READLINK(vp, auio, ctx);
7503 }
7504 }
7505
7506 if (put_vnode) {
7507 vnode_put(vp);
7508 }
7509
7510 *retval = (int)(bufsize - uio_resid(auio));
7511 return error;
7512 }
7513
7514 int
freadlink(proc_t p,struct freadlink_args * uap,int32_t * retval)7515 freadlink(proc_t p, struct freadlink_args *uap, int32_t *retval)
7516 {
7517 enum uio_seg procseg;
7518 vnode_t vp;
7519 int error;
7520
7521 procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
7522
7523 AUDIT_ARG(fd, uap->fd);
7524
7525 if ((error = file_vnode(uap->fd, &vp))) {
7526 return error;
7527 }
7528 if ((error = vnode_getwithref(vp))) {
7529 file_drop(uap->fd);
7530 return error;
7531 }
7532
7533 error = readlinkat_internal(vfs_context_current(), -1,
7534 vp, 0, procseg, CAST_USER_ADDR_T(uap->buf),
7535 uap->bufsize, procseg, retval);
7536
7537 vnode_put(vp);
7538 file_drop(uap->fd);
7539 return error;
7540 }
7541
7542 int
readlink(proc_t p,struct readlink_args * uap,int32_t * retval)7543 readlink(proc_t p, struct readlink_args *uap, int32_t *retval)
7544 {
7545 enum uio_seg procseg;
7546
7547 procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
7548 return readlinkat_internal(vfs_context_current(), AT_FDCWD, NULL,
7549 CAST_USER_ADDR_T(uap->path), procseg, CAST_USER_ADDR_T(uap->buf),
7550 uap->count, procseg, retval);
7551 }
7552
7553 int
readlinkat(proc_t p,struct readlinkat_args * uap,int32_t * retval)7554 readlinkat(proc_t p, struct readlinkat_args *uap, int32_t *retval)
7555 {
7556 enum uio_seg procseg;
7557
7558 procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
7559 return readlinkat_internal(vfs_context_current(), uap->fd, NULL,
7560 CAST_USER_ADDR_T(uap->path), procseg, uap->buf, uap->bufsize, procseg,
7561 retval);
7562 }
7563
7564 /*
7565 * Change file flags, the deep inner layer.
7566 */
7567 static int
chflags0(vnode_t vp,struct vnode_attr * va,int (* setattr)(vnode_t,void *,vfs_context_t),void * arg,vfs_context_t ctx)7568 chflags0(vnode_t vp, struct vnode_attr *va,
7569 int (*setattr)(vnode_t, void *, vfs_context_t),
7570 void *arg, vfs_context_t ctx)
7571 {
7572 kauth_action_t action = 0;
7573 int error;
7574
7575 #if CONFIG_MACF
7576 error = mac_vnode_check_setflags(ctx, vp, va->va_flags);
7577 if (error) {
7578 goto out;
7579 }
7580 #endif
7581
7582 /* request authorisation, disregard immutability */
7583 if ((error = vnode_authattr(vp, va, &action, ctx)) != 0) {
7584 goto out;
7585 }
7586 /*
7587 * Request that the auth layer disregard those file flags it's allowed to when
7588 * authorizing this operation; we need to do this in order to be able to
7589 * clear immutable flags.
7590 */
7591 if (action && ((error = vnode_authorize(vp, NULL, action | KAUTH_VNODE_NOIMMUTABLE, ctx)) != 0)) {
7592 goto out;
7593 }
7594 error = (*setattr)(vp, arg, ctx);
7595
7596 #if CONFIG_MACF
7597 if (error == 0) {
7598 mac_vnode_notify_setflags(ctx, vp, va->va_flags);
7599 }
7600 #endif
7601
7602 out:
7603 return error;
7604 }
7605
7606 /*
7607 * Change file flags.
7608 *
7609 * NOTE: this will vnode_put() `vp'
7610 */
7611 static int
chflags1(vnode_t vp,int flags,vfs_context_t ctx)7612 chflags1(vnode_t vp, int flags, vfs_context_t ctx)
7613 {
7614 struct vnode_attr va;
7615 int error;
7616
7617 VATTR_INIT(&va);
7618 VATTR_SET(&va, va_flags, flags);
7619
7620 error = chflags0(vp, &va, (void *)vnode_setattr, &va, ctx);
7621 vnode_put(vp);
7622
7623 if ((error == 0) && !VATTR_IS_SUPPORTED(&va, va_flags)) {
7624 error = ENOTSUP;
7625 }
7626
7627 return error;
7628 }
7629
7630 /*
7631 * Change flags of a file given a path name.
7632 */
7633 /* ARGSUSED */
7634 int
chflags(__unused proc_t p,struct chflags_args * uap,__unused int32_t * retval)7635 chflags(__unused proc_t p, struct chflags_args *uap, __unused int32_t *retval)
7636 {
7637 vnode_t vp;
7638 vfs_context_t ctx = vfs_context_current();
7639 int error;
7640 struct nameidata nd;
7641 uint32_t wantparent = 0;
7642
7643 #if CONFIG_FILE_LEASES
7644 wantparent = WANTPARENT;
7645 #endif
7646
7647 AUDIT_ARG(fflags, uap->flags);
7648 NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1 | wantparent,
7649 UIO_USERSPACE, uap->path, ctx);
7650 error = namei(&nd);
7651 if (error) {
7652 return error;
7653 }
7654 vp = nd.ni_vp;
7655
7656 #if CONFIG_FILE_LEASES
7657 vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
7658 vnode_put(nd.ni_dvp);
7659 #endif
7660
7661 nameidone(&nd);
7662
7663 /* we don't vnode_put() here because chflags1 does internally */
7664 error = chflags1(vp, uap->flags, ctx);
7665
7666 return error;
7667 }
7668
7669 /*
7670 * Change flags of a file given a file descriptor.
7671 */
7672 /* ARGSUSED */
7673 int
fchflags(__unused proc_t p,struct fchflags_args * uap,__unused int32_t * retval)7674 fchflags(__unused proc_t p, struct fchflags_args *uap, __unused int32_t *retval)
7675 {
7676 vnode_t vp;
7677 int error;
7678
7679 AUDIT_ARG(fd, uap->fd);
7680 AUDIT_ARG(fflags, uap->flags);
7681 if ((error = file_vnode(uap->fd, &vp))) {
7682 return error;
7683 }
7684
7685 if ((error = vnode_getwithref(vp))) {
7686 file_drop(uap->fd);
7687 return error;
7688 }
7689
7690 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
7691
7692 #if CONFIG_FILE_LEASES
7693 vnode_breakdirlease(vp, true, O_WRONLY);
7694 #endif
7695
7696 /* we don't vnode_put() here because chflags1 does internally */
7697 error = chflags1(vp, uap->flags, vfs_context_current());
7698
7699 file_drop(uap->fd);
7700 return error;
7701 }
7702
7703 /*
7704 * Change security information on a filesystem object.
7705 *
7706 * Returns: 0 Success
7707 * EPERM Operation not permitted
7708 * vnode_authattr:??? [anything vnode_authattr can return]
7709 * vnode_authorize:??? [anything vnode_authorize can return]
7710 * vnode_setattr:??? [anything vnode_setattr can return]
7711 *
7712 * Notes: If vnode_authattr or vnode_authorize return EACCES, it will be
7713 * translated to EPERM before being returned.
7714 */
static int
chmod_vnode(vfs_context_t ctx, vnode_t vp, struct vnode_attr *vap)
{
	kauth_action_t action;
	int error;

	AUDIT_ARG(mode, vap->va_mode);
	/* XXX audit new args */

#if NAMEDSTREAMS
	/* chmod calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		return EPERM;
	}
#endif

#if CONFIG_MACF
	/* MAC policy checks come first: mode, then ownership, then ACL. */
	if (VATTR_IS_ACTIVE(vap, va_mode) &&
	    (error = mac_vnode_check_setmode(ctx, vp, (mode_t)vap->va_mode)) != 0) {
		return error;
	}

	if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid)) {
		/* -1 stands in for "not being changed" on either id. */
		if ((error = mac_vnode_check_setowner(ctx, vp,
		    VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : -1,
		    VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : -1))) {
			return error;
		}
	}

	if (VATTR_IS_ACTIVE(vap, va_acl) &&
	    (error = mac_vnode_check_setacl(ctx, vp, vap->va_acl))) {
		return error;
	}
#endif

	/* make sure that the caller is allowed to set this security information */
	if (((error = vnode_authattr(vp, vap, &action, ctx)) != 0) ||
	    ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		/* permission failures report EPERM, not EACCES (see header comment) */
		if (error == EACCES) {
			error = EPERM;
		}
		return error;
	}

	if ((error = vnode_setattr(vp, vap, ctx)) != 0) {
		return error;
	}

#if CONFIG_MACF
	/* Notify MAC policies only after the attributes actually changed. */
	if (VATTR_IS_ACTIVE(vap, va_mode)) {
		mac_vnode_notify_setmode(ctx, vp, (mode_t)vap->va_mode);
	}

	if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid)) {
		mac_vnode_notify_setowner(ctx, vp,
		    VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : -1,
		    VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : -1);
	}

	if (VATTR_IS_ACTIVE(vap, va_acl)) {
		mac_vnode_notify_setacl(ctx, vp, vap->va_acl);
	}
#endif

	return error;
}
7782
7783
7784 /*
7785 * Change mode of a file given a path name.
7786 *
7787 * Returns: 0 Success
7788 * namei:??? [anything namei can return]
7789 * chmod_vnode:??? [anything chmod_vnode can return]
7790 */
static int
chmodat(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap,
    int fd, int flag, enum uio_seg segflg)
{
	struct nameidata nd;
	int follow, error;
	uint32_t wantparent = 0;

	/* The parent vnode is only needed to break a directory lease. */
#if CONFIG_FILE_LEASES
	wantparent = WANTPARENT;
#endif

	/* Either "don't follow the last symlink" flag disables following. */
	follow = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) ? NOFOLLOW : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1 | wantparent,
	    segflg, path, ctx);
	if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}
	if ((error = nameiat(&nd, fd))) {
		return error;
	}

#if CONFIG_FILE_LEASES
	/* Break the parent lease, then drop the WANTPARENT iocount. */
	vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
	vnode_put(nd.ni_dvp);
#endif

	error = chmod_vnode(ctx, nd.ni_vp, vap);
	vnode_put(nd.ni_vp);
	nameidone(&nd);
	return error;
}
7823
7824 static int
chmod_extended_init(struct vnode_attr * pva,kauth_filesec_t * pxsecdst,int mode,uid_t uid,gid_t gid,user_addr_t xsecurity)7825 chmod_extended_init(struct vnode_attr *pva, kauth_filesec_t *pxsecdst, int mode, uid_t uid,
7826 gid_t gid, user_addr_t xsecurity)
7827 {
7828 int error;
7829
7830 VATTR_INIT(pva);
7831
7832 if (mode != -1) {
7833 VATTR_SET(pva, va_mode, mode & ALLPERMS);
7834 } else {
7835 pva->va_mode = 0;
7836 }
7837
7838 if (uid != KAUTH_UID_NONE) {
7839 VATTR_SET(pva, va_uid, uid);
7840 }
7841
7842 if (gid != KAUTH_GID_NONE) {
7843 VATTR_SET(pva, va_gid, gid);
7844 }
7845
7846 *pxsecdst = NULL;
7847 switch (xsecurity) {
7848 case USER_ADDR_NULL:
7849 break;
7850
7851 case CAST_USER_ADDR_T((void *)1): /* _FILESEC_REMOVE_ACL */
7852 VATTR_SET(pva, va_acl, NULL);
7853 break;
7854
7855 default:
7856 if ((error = kauth_copyinfilesec(xsecurity, pxsecdst)) != 0) {
7857 return error;
7858 }
7859
7860 VATTR_SET(pva, va_acl, &(*pxsecdst)->fsec_acl);
7861 pva->va_vaflags |= VA_FILESEC_ACL;
7862 KAUTH_DEBUG("CHMOD - setting ACL with %d entries", pva->va_acl->acl_entrycount);
7863 break;
7864 }
7865
7866 return 0;
7867 }
7868
7869 /*
7870 * chmod_extended: Change the mode of a file given a path name; with extended
7871 * argument list (including extended security (ACL)).
7872 *
7873 * Parameters: p Process requesting the open
7874 * uap User argument descriptor (see below)
7875 * retval (ignored)
7876 *
7877 * Indirect: uap->path Path to object (same as 'chmod')
7878 * uap->uid UID to set
7879 * uap->gid GID to set
7880 * uap->mode File mode to set (same as 'chmod')
7881 * uap->xsecurity ACL to set (or delete)
7882 *
7883 * Returns: 0 Success
7884 * !0 errno value
7885 *
7886 * Notes: The kauth_filesec_t in 'va', if any, is in host byte order.
7887 *
 * XXX: We should enumerate the possible errno values here, and where
7889 * in the code they originated.
7890 */
7891 int
chmod_extended(__unused proc_t p,struct chmod_extended_args * uap,__unused int32_t * retval)7892 chmod_extended(__unused proc_t p, struct chmod_extended_args *uap, __unused int32_t *retval)
7893 {
7894 int error;
7895 struct vnode_attr va;
7896 kauth_filesec_t xsecdst = NULL;
7897
7898 AUDIT_ARG(owner, uap->uid, uap->gid);
7899
7900 error = chmod_extended_init(&va, &xsecdst, uap->mode, uap->uid,
7901 uap->gid, uap->xsecurity);
7902
7903 if (error) {
7904 return error;
7905 }
7906
7907 error = chmodat(vfs_context_current(), uap->path, &va, AT_FDCWD, 0,
7908 UIO_USERSPACE);
7909
7910 if (xsecdst != NULL) {
7911 kauth_filesec_free(xsecdst);
7912 }
7913 return error;
7914 }
7915
7916 /*
7917 * Returns: 0 Success
7918 * chmodat:??? [anything chmodat can return]
7919 */
7920 static int
fchmodat_internal(vfs_context_t ctx,user_addr_t path,int mode,int fd,int flag,enum uio_seg segflg)7921 fchmodat_internal(vfs_context_t ctx, user_addr_t path, int mode, int fd,
7922 int flag, enum uio_seg segflg)
7923 {
7924 struct vnode_attr va;
7925
7926 VATTR_INIT(&va);
7927 VATTR_SET(&va, va_mode, mode & ALLPERMS);
7928
7929 return chmodat(ctx, path, &va, fd, flag, segflg);
7930 }
7931
7932 int
chmod(__unused proc_t p,struct chmod_args * uap,__unused int32_t * retval)7933 chmod(__unused proc_t p, struct chmod_args *uap, __unused int32_t *retval)
7934 {
7935 return fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
7936 AT_FDCWD, 0, UIO_USERSPACE);
7937 }
7938
7939 int
fchmodat(__unused proc_t p,struct fchmodat_args * uap,__unused int32_t * retval)7940 fchmodat(__unused proc_t p, struct fchmodat_args *uap, __unused int32_t *retval)
7941 {
7942 if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) {
7943 return EINVAL;
7944 }
7945
7946 return fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
7947 uap->fd, uap->flag, UIO_USERSPACE);
7948 }
7949
7950 /*
7951 * Change mode of a file given a file descriptor.
7952 */
7953 static int
fchmod1(__unused proc_t p,int fd,struct vnode_attr * vap)7954 fchmod1(__unused proc_t p, int fd, struct vnode_attr *vap)
7955 {
7956 vnode_t vp;
7957 int error;
7958
7959 AUDIT_ARG(fd, fd);
7960
7961 if ((error = file_vnode(fd, &vp)) != 0) {
7962 return error;
7963 }
7964 if ((error = vnode_getwithref(vp)) != 0) {
7965 file_drop(fd);
7966 return error;
7967 }
7968 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
7969
7970 #if CONFIG_FILE_LEASES
7971 vnode_breakdirlease(vp, true, O_WRONLY);
7972 #endif
7973
7974 error = chmod_vnode(vfs_context_current(), vp, vap);
7975 (void)vnode_put(vp);
7976 file_drop(fd);
7977
7978 return error;
7979 }
7980
7981 /*
7982 * fchmod_extended: Change mode of a file given a file descriptor; with
7983 * extended argument list (including extended security (ACL)).
7984 *
7985 * Parameters: p Process requesting to change file mode
7986 * uap User argument descriptor (see below)
7987 * retval (ignored)
7988 *
7989 * Indirect: uap->mode File mode to set (same as 'chmod')
7990 * uap->uid UID to set
7991 * uap->gid GID to set
7992 * uap->xsecurity ACL to set (or delete)
7993 * uap->fd File descriptor of file to change mode
7994 *
7995 * Returns: 0 Success
7996 * !0 errno value
7997 *
7998 */
7999 int
fchmod_extended(proc_t p,struct fchmod_extended_args * uap,__unused int32_t * retval)8000 fchmod_extended(proc_t p, struct fchmod_extended_args *uap, __unused int32_t *retval)
8001 {
8002 int error;
8003 struct vnode_attr va;
8004 kauth_filesec_t xsecdst = NULL;
8005
8006 AUDIT_ARG(owner, uap->uid, uap->gid);
8007
8008 error = chmod_extended_init(&va, &xsecdst, uap->mode, uap->uid,
8009 uap->gid, uap->xsecurity);
8010
8011 if (error) {
8012 return error;
8013 }
8014
8015 error = fchmod1(p, uap->fd, &va);
8016
8017 if (xsecdst != NULL) {
8018 kauth_filesec_free(xsecdst);
8019 }
8020 return error;
8021 }
8022
8023 int
fchmod(proc_t p,struct fchmod_args * uap,__unused int32_t * retval)8024 fchmod(proc_t p, struct fchmod_args *uap, __unused int32_t *retval)
8025 {
8026 struct vnode_attr va;
8027
8028 VATTR_INIT(&va);
8029 VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
8030
8031 return fchmod1(p, uap->fd, &va);
8032 }
8033
static int
vn_chown_internal(__unused vfs_context_t ctx, vnode_t vp, uid_t uid, gid_t gid)
{
	/* NOTE(review): ctx is marked __unused but is referenced below —
	 * presumably a leftover annotation; confirm against build configs. */
	struct vnode_attr va;
	kauth_action_t action;
	int error;

	/* VNOVAL on either id means "leave that id unchanged". */
	VATTR_INIT(&va);
	if (uid != (uid_t)VNOVAL) {
		VATTR_SET(&va, va_uid, uid);
	}
	if (gid != (gid_t)VNOVAL) {
		VATTR_SET(&va, va_gid, gid);
	}

#if NAMEDSTREAMS
	/* chown calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		error = EPERM;
		goto out;
	}
#endif

#if CONFIG_MACF
	error = mac_vnode_check_setowner(ctx, vp, uid, gid);
	if (error) {
		goto out;
	}
#endif

	/* preflight and authorize attribute changes */
	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
		goto out;
	}
	if (action && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		/*
		 * EACCES is only allowed from namei(); permissions failure should
		 * return EPERM, so we need to translate the error code.
		 */
		if (error == EACCES) {
			error = EPERM;
		}

		goto out;
	}

#if CONFIG_FILE_LEASES
	/* Break any lease on the parent directory before changing owners. */
	vnode_breakdirlease(vp, true, O_WRONLY);
#endif

	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	/* Notify MAC policies only after ownership actually changed. */
	if (error == 0) {
		mac_vnode_notify_setowner(ctx, vp, uid, gid);
	}
#endif

out:
	return error;
}
8095
8096 /*
8097 * Set ownership given a path name.
8098 */
8099 /* ARGSUSED */
8100 static int
fchownat_internal(vfs_context_t ctx,int fd,user_addr_t path,uid_t uid,gid_t gid,int flag,enum uio_seg segflg)8101 fchownat_internal(vfs_context_t ctx, int fd, user_addr_t path, uid_t uid,
8102 gid_t gid, int flag, enum uio_seg segflg)
8103 {
8104 vnode_t vp;
8105 int error;
8106 struct nameidata nd;
8107 int follow;
8108
8109 AUDIT_ARG(owner, uid, gid);
8110
8111 follow = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) ? NOFOLLOW : FOLLOW;
8112 NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1, segflg, path, ctx);
8113 if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
8114 nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
8115 }
8116
8117 error = nameiat(&nd, fd);
8118 if (error) {
8119 return error;
8120 }
8121
8122 vp = nd.ni_vp;
8123 error = vn_chown_internal(ctx, vp, uid, gid);
8124
8125 nameidone(&nd);
8126 vnode_put(vp);
8127 return error;
8128 }
8129
8130 int
chown(__unused proc_t p,struct chown_args * uap,__unused int32_t * retval)8131 chown(__unused proc_t p, struct chown_args *uap, __unused int32_t *retval)
8132 {
8133 return fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
8134 uap->uid, uap->gid, 0, UIO_USERSPACE);
8135 }
8136
8137 int
lchown(__unused proc_t p,struct lchown_args * uap,__unused int32_t * retval)8138 lchown(__unused proc_t p, struct lchown_args *uap, __unused int32_t *retval)
8139 {
8140 return fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
8141 uap->owner, uap->group, AT_SYMLINK_NOFOLLOW, UIO_USERSPACE);
8142 }
8143
8144 int
fchownat(__unused proc_t p,struct fchownat_args * uap,__unused int32_t * retval)8145 fchownat(__unused proc_t p, struct fchownat_args *uap, __unused int32_t *retval)
8146 {
8147 if (uap->flag & ~AT_SYMLINK_NOFOLLOW) {
8148 return EINVAL;
8149 }
8150
8151 return fchownat_internal(vfs_context_current(), uap->fd, uap->path,
8152 uap->uid, uap->gid, uap->flag, UIO_USERSPACE);
8153 }
8154
8155 /*
8156 * Set ownership given a file descriptor.
8157 */
8158 /* ARGSUSED */
8159 int
fchown(__unused proc_t p,struct fchown_args * uap,__unused int32_t * retval)8160 fchown(__unused proc_t p, struct fchown_args *uap, __unused int32_t *retval)
8161 {
8162 vfs_context_t ctx = vfs_context_current();
8163 vnode_t vp;
8164 int error;
8165
8166 AUDIT_ARG(owner, uap->uid, uap->gid);
8167 AUDIT_ARG(fd, uap->fd);
8168
8169 if ((error = file_vnode(uap->fd, &vp))) {
8170 return error;
8171 }
8172
8173 if ((error = vnode_getwithref(vp))) {
8174 file_drop(uap->fd);
8175 return error;
8176 }
8177 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
8178
8179 error = vn_chown_internal(ctx, vp, uap->uid, uap->gid);
8180
8181 (void)vnode_put(vp);
8182 file_drop(uap->fd);
8183 return error;
8184 }
8185
8186 static int
getutimes(user_addr_t usrtvp,struct timespec * tsp)8187 getutimes(user_addr_t usrtvp, struct timespec *tsp)
8188 {
8189 int error;
8190
8191 if (usrtvp == USER_ADDR_NULL) {
8192 struct timeval old_tv;
8193 /* XXX Y2038 bug because of microtime argument */
8194 microtime(&old_tv);
8195 TIMEVAL_TO_TIMESPEC(&old_tv, &tsp[0]);
8196 tsp[1] = tsp[0];
8197 } else {
8198 if (IS_64BIT_PROCESS(current_proc())) {
8199 struct user64_timeval tv[2];
8200 error = copyin(usrtvp, (void *)tv, sizeof(tv));
8201 if (error) {
8202 return error;
8203 }
8204 TIMEVAL64_TO_TIMESPEC(&tv[0], &tsp[0]);
8205 TIMEVAL64_TO_TIMESPEC(&tv[1], &tsp[1]);
8206 } else {
8207 struct user32_timeval tv[2];
8208 error = copyin(usrtvp, (void *)tv, sizeof(tv));
8209 if (error) {
8210 return error;
8211 }
8212 TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]);
8213 TIMEVAL_TO_TIMESPEC(&tv[1], &tsp[1]);
8214 }
8215 }
8216 return 0;
8217 }
8218
static int
setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts,
    int nullflag)
{
	int error;
	struct vnode_attr va;
	kauth_action_t action;

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	/* ts[0] is the access time, ts[1] the modification time. */
	VATTR_INIT(&va);
	VATTR_SET(&va, va_access_time, ts[0]);
	VATTR_SET(&va, va_modify_time, ts[1]);
	/* nullflag: the caller passed a NULL times pointer ("set to now"). */
	if (nullflag) {
		va.va_vaflags |= VA_UTIMES_NULL;
	}

#if NAMEDSTREAMS
	/* utimes calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		error = EPERM;
		goto out;
	}
#endif

#if CONFIG_MACF
	error = mac_vnode_check_setutimes(ctx, vp, ts[0], ts[1]);
	if (error) {
		goto out;
	}
#endif
	/*
	 * Explicit times require EPERM on a permission failure; the
	 * "set to now" case keeps EACCES as-is.
	 */
	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
		if (!nullflag && error == EACCES) {
			error = EPERM;
		}
		goto out;
	}

	/* since we may not need to auth anything, check here */
	if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		if (!nullflag && error == EACCES) {
			error = EPERM;
		}
		goto out;
	}
	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	/* Notify MAC policies only after the times actually changed. */
	if (error == 0) {
		mac_vnode_notify_setutimes(ctx, vp, ts[0], ts[1]);
	}
#endif

out:
	return error;
}
8275
8276 /*
8277 * Set the access and modification times of a file.
8278 */
8279 /* ARGSUSED */
int
utimes(__unused proc_t p, struct utimes_args *uap, __unused int32_t *retval)
{
	struct timespec ts[2];
	user_addr_t usrtvp;
	int error;
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();
	uint32_t wantparent = 0;

	/* The parent vnode is only needed to break a directory lease. */
#if CONFIG_FILE_LEASES
	wantparent = WANTPARENT;
#endif

	/*
	 * AUDIT: Needed to change the order of operations to do the
	 * name lookup first because auditing wants the path.
	 */
	NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1 | wantparent,
	    UIO_USERSPACE, uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}

	/*
	 * Fetch the user-supplied time. If usrtvp is USER_ADDR_NULL, we fetch
	 * the current time instead.
	 */
	usrtvp = uap->tptr;
	if ((error = getutimes(usrtvp, ts)) != 0) {
		/* lookup succeeded, so both vnodes still need releasing below */
		goto out;
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
#endif

	error = setutimes(ctx, nd.ni_vp, ts, usrtvp == USER_ADDR_NULL);

out:
	/* Drop the parent iocount (taken only when WANTPARENT was set). */
#if CONFIG_FILE_LEASES
	vnode_put(nd.ni_dvp);
#endif
	nameidone(&nd);
	vnode_put(nd.ni_vp);
	return error;
}
8328
8329 /*
8330 * Set the access and modification times of a file.
8331 */
8332 /* ARGSUSED */
8333 int
futimes(__unused proc_t p,struct futimes_args * uap,__unused int32_t * retval)8334 futimes(__unused proc_t p, struct futimes_args *uap, __unused int32_t *retval)
8335 {
8336 struct timespec ts[2];
8337 vnode_t vp;
8338 user_addr_t usrtvp;
8339 int error;
8340
8341 AUDIT_ARG(fd, uap->fd);
8342 usrtvp = uap->tptr;
8343 if ((error = getutimes(usrtvp, ts)) != 0) {
8344 return error;
8345 }
8346 if ((error = file_vnode(uap->fd, &vp)) != 0) {
8347 return error;
8348 }
8349 if ((error = vnode_getwithref(vp))) {
8350 file_drop(uap->fd);
8351 return error;
8352 }
8353
8354 #if CONFIG_FILE_LEASES
8355 vnode_breakdirlease(vp, true, O_WRONLY);
8356 #endif
8357
8358 error = setutimes(vfs_context_current(), vp, ts, usrtvp == 0);
8359
8360 vnode_put(vp);
8361 file_drop(uap->fd);
8362 return error;
8363 }
8364
8365 static int
truncate_validate_common(proc_t p,off_t length)8366 truncate_validate_common(proc_t p, off_t length)
8367 {
8368 rlim_t fsize_limit;
8369
8370 if (length < 0) {
8371 return EINVAL;
8372 }
8373
8374 fsize_limit = proc_limitgetcur(p, RLIMIT_FSIZE);
8375 if ((rlim_t)length > fsize_limit) {
8376 psignal(p, SIGXFSZ);
8377 return EFBIG;
8378 }
8379
8380 return 0;
8381 }
8382
static int
truncate_internal(vnode_t vp, off_t length, kauth_cred_t cred,
    vfs_context_t ctx, boolean_t need_auth)
{
	struct vnode_attr va;
	kauth_action_t action;
	int error;

	/* Truncation is expressed as a data-size attribute change. */
	VATTR_INIT(&va);
	VATTR_SET(&va, va_data_size, length);

#if CONFIG_MACF
	error = mac_vnode_check_truncate(ctx, cred, vp);
	if (error) {
		return error;
	}
#endif

	/*
	 * If we reached here from `ftruncate` then we already did an effective
	 * `vnode_authorize` upon open. We honour the result from then.
	 */
	if (need_auth) {
		if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
			return error;
		}

		/* action may be 0, in which case there is nothing to authorize */
		if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
			return error;
		}
	}

#if CONFIG_FILE_LEASES
	/* Check if there is a lease placed on the parent directory. */
	vnode_breakdirlease(vp, true, O_WRONLY);

	/* Now check if there is a lease placed on the file itself. */
	(void)vnode_breaklease(vp, O_WRONLY, ctx);
#endif

	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	/* Notify MAC policies only after the truncate actually happened. */
	if (error == 0) {
		mac_vnode_notify_truncate(ctx, cred, vp);
	}
#endif

	return error;
}
8433
8434 /*
8435 * Truncate a file given its path name.
8436 */
8437 /* ARGSUSED */
8438 int
truncate(proc_t p,struct truncate_args * uap,__unused int32_t * retval)8439 truncate(proc_t p, struct truncate_args *uap, __unused int32_t *retval)
8440 {
8441 vfs_context_t ctx = vfs_context_current();
8442 vnode_t vp;
8443 int error;
8444 struct nameidata nd;
8445
8446 if ((error = truncate_validate_common(p, uap->length))) {
8447 return error;
8448 }
8449
8450 NDINIT(&nd, LOOKUP, OP_TRUNCATE, FOLLOW | AUDITVNPATH1,
8451 UIO_USERSPACE, uap->path, ctx);
8452
8453 if ((error = namei(&nd))) {
8454 return error;
8455 }
8456
8457 vp = nd.ni_vp;
8458 nameidone(&nd);
8459
8460 error = truncate_internal(vp, uap->length, NOCRED, ctx, true);
8461 vnode_put(vp);
8462
8463 return error;
8464 }
8465
8466 /*
8467 * Truncate a file given a file descriptor.
8468 */
8469 /* ARGSUSED */
int
ftruncate(proc_t p, struct ftruncate_args *uap, int32_t *retval)
{
	vnode_t vp;
	struct fileproc *fp;
	int error;

	AUDIT_ARG(fd, uap->fd);

	/* Validate the length (sign and RLIMIT_FSIZE) before fd lookup. */
	if ((error = truncate_validate_common(p, uap->length))) {
		return error;
	}

	if ((error = fp_lookup(p, uap->fd, &fp, 0))) {
		return error;
	}

	/* POSIX shared memory objects are truncatable; only vnodes otherwise. */
	switch (FILEGLOB_DTYPE(fp->fp_glob)) {
	case DTYPE_PSXSHM:
		error = pshm_truncate(p, fp, uap->fd, uap->length, retval);
		goto out;
	case DTYPE_VNODE:
		break;
	default:
		error = EINVAL;
		goto out;
	}

	vp = (vnode_t)fp_get_data(fp);

	/* The descriptor must have been opened for writing. */
	if ((fp->fp_glob->fg_flag & FWRITE) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EINVAL;
		goto out;
	}

	if ((error = vnode_getwithref(vp)) != 0) {
		goto out;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	/* need_auth == false: authorization was done at open time. */
	error = truncate_internal(vp, uap->length, fp->fp_glob->fg_cred,
	    vfs_context_current(), false);
	vnode_put(vp);

out:
	file_drop(uap->fd);
	return error;
}
8520
8521
8522 /*
8523 * Sync an open file with synchronized I/O _file_ integrity completion
8524 */
8525 /* ARGSUSED */
8526 int
fsync(proc_t p,struct fsync_args * uap,__unused int32_t * retval)8527 fsync(proc_t p, struct fsync_args *uap, __unused int32_t *retval)
8528 {
8529 __pthread_testcancel(1);
8530 return fsync_common(p, uap, MNT_WAIT);
8531 }
8532
8533
8534 /*
8535 * Sync an open file with synchronized I/O _file_ integrity completion
8536 *
8537 * Notes: This is a legacy support function that does not test for
8538 * thread cancellation points.
8539 */
8540 /* ARGSUSED */
8541 int
fsync_nocancel(proc_t p,struct fsync_nocancel_args * uap,__unused int32_t * retval)8542 fsync_nocancel(proc_t p, struct fsync_nocancel_args *uap, __unused int32_t *retval)
8543 {
8544 return fsync_common(p, (struct fsync_args *)uap, MNT_WAIT);
8545 }
8546
8547
8548 /*
8549 * Sync an open file with synchronized I/O _data_ integrity completion
8550 */
8551 /* ARGSUSED */
8552 int
fdatasync(proc_t p,struct fdatasync_args * uap,__unused int32_t * retval)8553 fdatasync(proc_t p, struct fdatasync_args *uap, __unused int32_t *retval)
8554 {
8555 __pthread_testcancel(1);
8556 return fsync_common(p, (struct fsync_args *)uap, MNT_DWAIT);
8557 }
8558
8559
8560 /*
8561 * fsync_common
8562 *
8563 * Common fsync code to support both synchronized I/O file integrity completion
8564 * (normal fsync) and synchronized I/O data integrity completion (fdatasync).
8565 *
8566 * If 'flags' is MNT_DWAIT, the caller is requesting data integrity, which
8567 * will only guarantee that the file data contents are retrievable. If
 * 'flags' is MNT_WAIT, the caller is requesting file integrity, which also
8569 * includes additional metadata unnecessary for retrieving the file data
8570 * contents, such as atime, mtime, ctime, etc., also be committed to stable
8571 * storage.
8572 *
8573 * Parameters: p The process
8574 * uap->fd The descriptor to synchronize
8575 * flags The data integrity flags
8576 *
8577 * Returns: int Success
8578 * fp_getfvp:EBADF Bad file descriptor
8579 * fp_getfvp:ENOTSUP fd does not refer to a vnode
8580 * VNOP_FSYNC:??? unspecified
8581 *
8582 * Notes: We use struct fsync_args because it is a short name, and all
8583 * caller argument structures are otherwise identical.
8584 */
static int
fsync_common(proc_t p, struct fsync_args *uap, int flags)
{
	vnode_t vp;
	struct fileproc *fp;
	vfs_context_t ctx = vfs_context_current();
	int error;

	AUDIT_ARG(fd, uap->fd);

	/* Resolve the descriptor and pin the vnode with an iocount. */
	if ((error = fp_getfvp(p, uap->fd, &fp, &vp))) {
		return error;
	}
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	/* flags is MNT_WAIT (file integrity) or MNT_DWAIT (data integrity). */
	error = VNOP_FSYNC(vp, flags, ctx);

#if NAMEDRSRCFORK
	/* Sync resource fork shadow file if necessary. */
	if ((error == 0) &&
	    (vp->v_flag & VISNAMEDSTREAM) &&
	    (vp->v_parent != NULLVP) &&
	    vnode_isshadow(vp) &&
	    (fp->fp_glob->fg_flag & FWASWRITTEN)) {
		/* best-effort: the flush result is deliberately ignored */
		(void) vnode_flushnamedstream(vp->v_parent, vp, ctx);
	}
#endif

	(void)vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
8622
8623 /*
8624 * Duplicate files. Source must be a file, target must be a file or
8625 * must not exist.
8626 *
8627 * XXX Copyfile authorisation checking is woefully inadequate, and will not
8628 * perform inheritance correctly.
8629 */
8630 /* ARGSUSED */
int
copyfile(__unused proc_t p, struct copyfile_args *uap, __unused int32_t *retval)
{
	vnode_t tvp, fvp, tdvp, sdvp;
	struct nameidata fromnd, tond;
	int error;
	vfs_context_t ctx = vfs_context_current();

	/* Check that the flags are valid. */
	if (uap->flags & ~CPF_MASK) {
		return EINVAL;
	}

	/* Look up the source first; it only needs a plain lookup. */
	NDINIT(&fromnd, LOOKUP, OP_COPYFILE, AUDITVNPATH1,
	    UIO_USERSPACE, uap->from, ctx);
	if ((error = namei(&fromnd))) {
		return error;
	}
	fvp = fromnd.ni_vp;

	/* Target lookup is a CREATE; SAVESTART keeps ni_startdir for later. */
	NDINIT(&tond, CREATE, OP_LINK,
	    LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART | AUDITVNPATH2 | CN_NBMOUNTLOOK,
	    UIO_USERSPACE, uap->to, ctx);
	if ((error = namei(&tond))) {
		goto out1;
	}
	tdvp = tond.ni_dvp;
	tvp = tond.ni_vp;

	/* An existing target is only acceptable with CPF_OVERWRITE. */
	if (tvp != NULL) {
		if (!(uap->flags & CPF_OVERWRITE)) {
			error = EEXIST;
			goto out;
		}
	}

	if (fvp->v_type == VDIR || (tvp && tvp->v_type == VDIR)) {
		error = EISDIR;
		goto out;
	}

	/* Sockets can't be copied (fdesc-backed ones are the exception). */
	if (fvp->v_type == VSOCK && fvp->v_tag != VT_FDESC) {
		error = EOPNOTSUPP;
		goto out;
	}

#if CONFIG_MACF
	if ((error = mac_vnode_check_copyfile(ctx, tdvp, tvp, fvp, &tond.ni_cnd, (mode_t)uap->mode, uap->flags)) != 0) {
		goto out;
	}
#endif /* CONFIG_MACF */

	/* Read source, delete existing target, add to target directory. */
	if ((error = vnode_authorize(fvp, NULL, KAUTH_VNODE_READ_DATA, ctx)) != 0) {
		goto out;
	}
	if (tvp) {
		if ((error = vnode_authorize(tvp, tdvp, KAUTH_VNODE_DELETE, ctx)) != 0) {
			goto out;
		}
	}
	if ((error = vnode_authorize(tdvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
		goto out;
	}

	/* Copying a directory onto itself as parent makes no sense. */
	if (fvp == tdvp) {
		error = EINVAL;
	}
	/*
	 * If source is the same as the destination (that is the
	 * same inode number) then there is nothing to do.
	 * (fixed to have POSIX semantics - CSM 3/2/98)
	 */
	if (fvp == tvp) {
		/* -1 is an internal "silent success" marker; see bottom. */
		error = -1;
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(tdvp, false, O_WRONLY);
#endif

	if (!error) {
		error = VNOP_COPYFILE(fvp, tdvp, tvp, &tond.ni_cnd, uap->mode, uap->flags, ctx);
	}
out:
	/* ni_startdir was preserved by SAVESTART and must be released too. */
	sdvp = tond.ni_startdir;
	/*
	 * nameidone has to happen before we vnode_put(tdvp)
	 * since it may need to release the fs_nodelock on the tdvp
	 */
	nameidone(&tond);

	if (tvp) {
		vnode_put(tvp);
	}
	vnode_put(tdvp);
	vnode_put(sdvp);
out1:
	vnode_put(fvp);

	nameidone(&fromnd);

	/* Translate the internal same-vnode marker into success. */
	if (error == -1) {
		return 0;
	}
	return error;
}
8737
8738 #define CLONE_SNAPSHOT_FALLBACKS_ENABLED 1
8739
8740 /*
8741 * Helper function for doing clones. The caller is expected to provide an
8742 * iocounted source vnode and release it.
8743 */
8744 static int
clonefile_internal(vnode_t fvp,boolean_t data_read_authorised,int dst_dirfd,user_addr_t dst,uint32_t flags,vfs_context_t ctx)8745 clonefile_internal(vnode_t fvp, boolean_t data_read_authorised, int dst_dirfd,
8746 user_addr_t dst, uint32_t flags, vfs_context_t ctx)
8747 {
8748 vnode_t tvp, tdvp;
8749 struct nameidata *tondp = NULL;
8750 int error;
8751 int follow;
8752 boolean_t free_src_acl;
8753 boolean_t attr_cleanup;
8754 enum vtype v_type;
8755 kauth_action_t action;
8756 struct componentname *cnp;
8757 uint32_t defaulted = 0;
8758 struct {
8759 struct vnode_attr va[2];
8760 } *va2p = NULL;
8761 struct vnode_attr *vap = NULL;
8762 struct vnode_attr *nvap = NULL;
8763 uint32_t vnop_flags;
8764
8765 v_type = vnode_vtype(fvp);
8766 switch (v_type) {
8767 case VLNK:
8768 /* FALLTHRU */
8769 case VREG:
8770 action = KAUTH_VNODE_ADD_FILE;
8771 break;
8772 case VDIR:
8773 if (vnode_isvroot(fvp) || vnode_ismount(fvp) ||
8774 fvp->v_mountedhere) {
8775 return EINVAL;
8776 }
8777 action = KAUTH_VNODE_ADD_SUBDIRECTORY;
8778 break;
8779 default:
8780 return EINVAL;
8781 }
8782
8783 AUDIT_ARG(fd2, dst_dirfd);
8784 AUDIT_ARG(value32, flags);
8785
8786 tondp = kalloc_type(struct nameidata, Z_WAITOK | Z_ZERO | Z_NOFAIL);
8787 follow = (flags & CLONE_NOFOLLOW) ? NOFOLLOW : FOLLOW;
8788 NDINIT(tondp, CREATE, OP_LINK, follow | WANTPARENT | AUDITVNPATH2,
8789 UIO_USERSPACE, dst, ctx);
8790 if (flags & CLONE_NOFOLLOW_ANY) {
8791 tondp->ni_flag |= NAMEI_NOFOLLOW_ANY;
8792 }
8793
8794 if ((error = nameiat(tondp, dst_dirfd))) {
8795 kfree_type(struct nameidata, tondp);
8796 return error;
8797 }
8798 cnp = &tondp->ni_cnd;
8799 tdvp = tondp->ni_dvp;
8800 tvp = tondp->ni_vp;
8801
8802 free_src_acl = FALSE;
8803 attr_cleanup = FALSE;
8804
8805 if (tvp != NULL) {
8806 error = EEXIST;
8807 goto out;
8808 }
8809
8810 if (vnode_mount(tdvp) != vnode_mount(fvp)) {
8811 error = EXDEV;
8812 goto out;
8813 }
8814
8815 #if CONFIG_MACF
8816 if ((error = mac_vnode_check_clone(ctx, tdvp, fvp, cnp))) {
8817 goto out;
8818 }
8819 #endif
8820 if ((error = vnode_authorize(tdvp, NULL, action, ctx))) {
8821 goto out;
8822 }
8823
8824 action = KAUTH_VNODE_GENERIC_READ_BITS;
8825 if (data_read_authorised) {
8826 action &= ~KAUTH_VNODE_READ_DATA;
8827 }
8828 if ((error = vnode_authorize(fvp, NULL, action, ctx))) {
8829 goto out;
8830 }
8831
8832 va2p = kalloc_type(typeof(*va2p), Z_WAITOK | Z_NOFAIL);
8833 vap = &va2p->va[0];
8834 nvap = &va2p->va[1];
8835
8836 /*
8837 * certain attributes may need to be changed from the source, we ask for
8838 * those here with the exception of source file's ACLs unless the CLONE_ACL
8839 * flag is specified. By default, the clone file will inherit the target
8840 * directory's ACLs unless the the CLONE_ACL flag is specified then it
8841 * will inherit the source file's ACLs instead.
8842 */
8843 VATTR_INIT(vap);
8844 VATTR_WANTED(vap, va_uid);
8845 VATTR_WANTED(vap, va_gid);
8846 VATTR_WANTED(vap, va_mode);
8847 VATTR_WANTED(vap, va_flags);
8848 if (flags & CLONE_ACL) {
8849 VATTR_WANTED(vap, va_acl);
8850 }
8851
8852 if ((error = vnode_getattr(fvp, vap, ctx)) != 0) {
8853 goto out;
8854 }
8855
8856 VATTR_INIT(nvap);
8857 VATTR_SET(nvap, va_type, v_type);
8858 if (VATTR_IS_SUPPORTED(vap, va_acl) && vap->va_acl != NULL) {
8859 VATTR_SET(nvap, va_acl, vap->va_acl);
8860 free_src_acl = TRUE;
8861 }
8862
8863 /* Handle ACL inheritance, initialize vap. */
8864 if (v_type == VLNK) {
8865 error = vnode_authattr_new(tdvp, nvap, 0, ctx);
8866 } else {
8867 error = vn_attribute_prepare(tdvp, nvap, &defaulted, ctx);
8868 if (error) {
8869 goto out;
8870 }
8871 attr_cleanup = TRUE;
8872 }
8873
8874 vnop_flags = VNODE_CLONEFILE_DEFAULT;
8875 /*
8876 * We've got initial values for all security parameters,
8877 * If we are superuser, then we can change owners to be the
8878 * same as the source. Both superuser and the owner have default
8879 * WRITE_SECURITY privileges so all other fields can be taken
8880 * from source as well.
8881 */
8882 if (!(flags & CLONE_NOOWNERCOPY) && vfs_context_issuser(ctx)) {
8883 if (VATTR_IS_SUPPORTED(vap, va_uid)) {
8884 VATTR_SET(nvap, va_uid, vap->va_uid);
8885 }
8886 if (VATTR_IS_SUPPORTED(vap, va_gid)) {
8887 VATTR_SET(nvap, va_gid, vap->va_gid);
8888 }
8889 } else {
8890 vnop_flags |= VNODE_CLONEFILE_NOOWNERCOPY;
8891 }
8892
8893 if (VATTR_IS_SUPPORTED(vap, va_mode)) {
8894 VATTR_SET(nvap, va_mode, vap->va_mode);
8895 }
8896 if (VATTR_IS_SUPPORTED(vap, va_flags)) {
8897 VATTR_SET(nvap, va_flags,
8898 ((vap->va_flags & ~(UF_DATAVAULT | SF_RESTRICTED)) | /* Turn off from source */
8899 (nvap->va_flags & (UF_DATAVAULT | SF_RESTRICTED))));
8900 }
8901
8902 #if CONFIG_FILE_LEASES
8903 vnode_breakdirlease(tdvp, false, O_WRONLY);
8904 #endif
8905
8906 error = VNOP_CLONEFILE(fvp, tdvp, &tvp, cnp, nvap, vnop_flags, ctx);
8907
8908 if (!error && tvp) {
8909 int update_flags = 0;
8910 #if CONFIG_FSE
8911 int fsevent;
8912 #endif /* CONFIG_FSE */
8913
8914 /*
8915 * If some of the requested attributes weren't handled by the
8916 * VNOP, use our fallback code.
8917 */
8918 if (!VATTR_ALL_SUPPORTED(nvap)) {
8919 (void)vnode_setattr_fallback(tvp, nvap, ctx);
8920 }
8921
8922 #if CONFIG_MACF
8923 (void)vnode_label(vnode_mount(tvp), tdvp, tvp, cnp,
8924 VNODE_LABEL_CREATE, ctx);
8925 #endif
8926
8927 // Make sure the name & parent pointers are hooked up
8928 if (tvp->v_name == NULL) {
8929 update_flags |= VNODE_UPDATE_NAME;
8930 }
8931 if (tvp->v_parent == NULLVP) {
8932 update_flags |= VNODE_UPDATE_PARENT;
8933 }
8934
8935 if (update_flags) {
8936 (void)vnode_update_identity(tvp, tdvp, cnp->cn_nameptr,
8937 cnp->cn_namelen, cnp->cn_hash, update_flags);
8938 }
8939
8940 #if CONFIG_FSE
8941 switch (vnode_vtype(tvp)) {
8942 case VLNK:
8943 /* FALLTHRU */
8944 case VREG:
8945 fsevent = FSE_CREATE_FILE;
8946 break;
8947 case VDIR:
8948 fsevent = FSE_CREATE_DIR;
8949 break;
8950 default:
8951 goto out;
8952 }
8953
8954 if (need_fsevent(fsevent, tvp)) {
8955 /*
8956 * The following is a sequence of three explicit events.
8957 * A pair of FSE_CLONE events representing the source and destination
8958 * followed by an FSE_CREATE_[FILE | DIR] for the destination.
8959 * fseventsd may coalesce the destination clone and create events
8960 * into a single event resulting in the following sequence for a client
8961 * FSE_CLONE (src)
8962 * FSE_CLONE | FSE_CREATE (dst)
8963 */
8964 add_fsevent(FSE_CLONE, ctx, FSE_ARG_VNODE, fvp, FSE_ARG_VNODE, tvp,
8965 FSE_ARG_DONE);
8966 add_fsevent(fsevent, ctx, FSE_ARG_VNODE, tvp,
8967 FSE_ARG_DONE);
8968 }
8969 #endif /* CONFIG_FSE */
8970 }
8971
8972 out:
8973 if (attr_cleanup) {
8974 vn_attribute_cleanup(nvap, defaulted);
8975 }
8976 if (free_src_acl && vap->va_acl) {
8977 kauth_acl_free(vap->va_acl);
8978 }
8979 if (va2p) {
8980 kfree_type(typeof(*va2p), va2p);
8981 }
8982 nameidone(tondp);
8983 kfree_type(struct nameidata, tondp);
8984 if (tvp) {
8985 vnode_put(tvp);
8986 }
8987 vnode_put(tdvp);
8988 return error;
8989 }
8990
8991 /*
8992 * clone files or directories, target must not exist.
8993 */
8994 /* ARGSUSED */
8995 int
clonefileat(__unused proc_t p,struct clonefileat_args * uap,__unused int32_t * retval)8996 clonefileat(__unused proc_t p, struct clonefileat_args *uap,
8997 __unused int32_t *retval)
8998 {
8999 vnode_t fvp;
9000 struct nameidata *ndp = NULL;
9001 int follow;
9002 int error;
9003 vfs_context_t ctx = vfs_context_current();
9004
9005 /* Check that the flags are valid. */
9006 if (uap->flags & ~(CLONE_NOFOLLOW | CLONE_NOOWNERCOPY | CLONE_ACL |
9007 CLONE_NOFOLLOW_ANY)) {
9008 return EINVAL;
9009 }
9010
9011 AUDIT_ARG(fd, uap->src_dirfd);
9012
9013 ndp = kalloc_type(struct nameidata, Z_WAITOK | Z_ZERO | Z_NOFAIL);
9014
9015 follow = (uap->flags & CLONE_NOFOLLOW) ? NOFOLLOW : FOLLOW;
9016 NDINIT(ndp, LOOKUP, OP_COPYFILE, follow | AUDITVNPATH1,
9017 UIO_USERSPACE, uap->src, ctx);
9018 if (uap->flags & CLONE_NOFOLLOW_ANY) {
9019 ndp->ni_flag |= NAMEI_NOFOLLOW_ANY;
9020 }
9021
9022 if ((error = nameiat(ndp, uap->src_dirfd))) {
9023 kfree_type(struct nameidata, ndp);
9024 return error;
9025 }
9026
9027 fvp = ndp->ni_vp;
9028 nameidone(ndp);
9029 kfree_type(struct nameidata, ndp);
9030
9031 error = clonefile_internal(fvp, FALSE, uap->dst_dirfd, uap->dst,
9032 uap->flags, ctx);
9033
9034 vnode_put(fvp);
9035 return error;
9036 }
9037
9038 int
fclonefileat(__unused proc_t p,struct fclonefileat_args * uap,__unused int32_t * retval)9039 fclonefileat(__unused proc_t p, struct fclonefileat_args *uap,
9040 __unused int32_t *retval)
9041 {
9042 vnode_t fvp;
9043 struct fileproc *fp;
9044 int error;
9045 vfs_context_t ctx = vfs_context_current();
9046
9047 /* Check that the flags are valid. */
9048 if (uap->flags & ~(CLONE_NOFOLLOW | CLONE_NOOWNERCOPY | CLONE_ACL |
9049 CLONE_NOFOLLOW_ANY)) {
9050 return EINVAL;
9051 }
9052
9053 AUDIT_ARG(fd, uap->src_fd);
9054 error = fp_getfvp(p, uap->src_fd, &fp, &fvp);
9055 if (error) {
9056 return error;
9057 }
9058
9059 if ((fp->fp_glob->fg_flag & FREAD) == 0) {
9060 AUDIT_ARG(vnpath_withref, fvp, ARG_VNODE1);
9061 error = EBADF;
9062 goto out;
9063 }
9064
9065 if ((error = vnode_getwithref(fvp))) {
9066 goto out;
9067 }
9068
9069 AUDIT_ARG(vnpath, fvp, ARG_VNODE1);
9070
9071 error = clonefile_internal(fvp, TRUE, uap->dst_dirfd, uap->dst,
9072 uap->flags, ctx);
9073
9074 vnode_put(fvp);
9075 out:
9076 file_drop(uap->src_fd);
9077 return error;
9078 }
9079
9080 static int
rename_submounts_callback(mount_t mp,void * arg)9081 rename_submounts_callback(mount_t mp, void *arg)
9082 {
9083 int error = 0;
9084 mount_t pmp = (mount_t)arg;
9085 int prefix_len = (int)strlen(pmp->mnt_vfsstat.f_mntonname);
9086
9087 if (strncmp(mp->mnt_vfsstat.f_mntonname, pmp->mnt_vfsstat.f_mntonname, prefix_len) != 0) {
9088 return 0;
9089 }
9090
9091 if (mp->mnt_vfsstat.f_mntonname[prefix_len] != '/') {
9092 return 0;
9093 }
9094
9095 if ((error = vfs_busy(mp, LK_NOWAIT))) {
9096 printf("vfs_busy failed with %d for %s\n", error, mp->mnt_vfsstat.f_mntonname);
9097 return -1;
9098 }
9099
9100 size_t pathlen = MAXPATHLEN;
9101 if ((error = vn_getpath_ext(mp->mnt_vnodecovered, NULL, mp->mnt_vfsstat.f_mntonname, &pathlen, VN_GETPATH_FSENTER))) {
9102 printf("vn_getpath_ext failed with %d for mnt_vnodecovered of %s\n", error, mp->mnt_vfsstat.f_mntonname);
9103 }
9104
9105 vfs_unbusy(mp);
9106
9107 return error;
9108 }
9109
9110 /*
9111 * Rename files. Source and destination must either both be directories,
9112 * or both not be directories. If target is a directory, it must be empty.
9113 */
9114 /* ARGSUSED */
9115 static int
renameat_internal(vfs_context_t ctx,int fromfd,user_addr_t from,int tofd,user_addr_t to,int segflg,u_int uflags)9116 renameat_internal(vfs_context_t ctx, int fromfd, user_addr_t from,
9117 int tofd, user_addr_t to, int segflg, u_int uflags)
9118 {
9119 vnode_t tvp, tdvp;
9120 vnode_t fvp, fdvp;
9121 vnode_t mnt_fvp;
9122 struct nameidata *fromnd, *tond;
9123 int error = 0;
9124 int do_retry;
9125 int retry_count;
9126 int mntrename;
9127 int need_event;
9128 int need_kpath2;
9129 int has_listeners;
9130 const char *oname = NULL;
9131 char *from_name = NULL, *to_name = NULL;
9132 char *from_name_no_firmlink = NULL, *to_name_no_firmlink = NULL;
9133 int from_len = 0, to_len = 0;
9134 int from_len_no_firmlink = 0, to_len_no_firmlink = 0;
9135 int holding_mntlock;
9136 int vn_authorize_skipped;
9137 mount_t locked_mp = NULL;
9138 vnode_t oparent = NULLVP;
9139 vnode_t locked_vp = NULLVP;
9140 #if CONFIG_FSE
9141 fse_info from_finfo = {}, to_finfo;
9142 #endif
9143 int from_truncated = 0, to_truncated = 0;
9144 int from_truncated_no_firmlink = 0, to_truncated_no_firmlink = 0;
9145 int batched = 0;
9146 struct vnode_attr *fvap, *tvap;
9147 int continuing = 0;
9148 vfs_rename_flags_t flags = uflags & VFS_RENAME_FLAGS_MASK;
9149 int32_t nofollow_any = 0;
9150 /* carving out a chunk for structs that are too big to be on stack. */
9151 struct {
9152 struct nameidata from_node, to_node;
9153 struct vnode_attr fv_attr, tv_attr;
9154 } * __rename_data;
9155
9156 __rename_data = kalloc_type(typeof(*__rename_data), Z_WAITOK);
9157 fromnd = &__rename_data->from_node;
9158 tond = &__rename_data->to_node;
9159
9160 holding_mntlock = 0;
9161 do_retry = 0;
9162 retry_count = 0;
9163 retry:
9164 fvp = tvp = NULL;
9165 fdvp = tdvp = NULL;
9166 fvap = tvap = NULL;
9167 mnt_fvp = NULLVP;
9168 mntrename = FALSE;
9169 vn_authorize_skipped = FALSE;
9170
9171 if (uflags & RENAME_NOFOLLOW_ANY) {
9172 nofollow_any = NAMEI_NOFOLLOW_ANY;
9173 }
9174 NDINIT(fromnd, DELETE, OP_UNLINK, WANTPARENT | AUDITVNPATH1,
9175 segflg, from, ctx);
9176 fromnd->ni_flag = NAMEI_COMPOUNDRENAME | nofollow_any;
9177
9178 NDINIT(tond, RENAME, OP_RENAME, WANTPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK,
9179 segflg, to, ctx);
9180 tond->ni_flag = NAMEI_COMPOUNDRENAME | nofollow_any;
9181
9182 continue_lookup:
9183 if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
9184 if ((error = nameiat(fromnd, fromfd))) {
9185 goto out1;
9186 }
9187 fdvp = fromnd->ni_dvp;
9188 fvp = fromnd->ni_vp;
9189
9190 if (fvp && fvp->v_type == VDIR) {
9191 tond->ni_cnd.cn_flags |= WILLBEDIR;
9192 }
9193 }
9194
9195 if ((tond->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
9196 if ((error = nameiat(tond, tofd))) {
9197 /*
9198 * Translate error code for rename("dir1", "dir2/.").
9199 */
9200 if (error == EISDIR && fvp->v_type == VDIR) {
9201 error = EINVAL;
9202 }
9203 goto out1;
9204 }
9205 tdvp = tond->ni_dvp;
9206 tvp = tond->ni_vp;
9207 }
9208
9209 #if DEVELOPMENT || DEBUG
9210 /*
9211 * XXX VSWAP: Check for entitlements or special flag here
9212 * so we can restrict access appropriately.
9213 */
9214 #else /* DEVELOPMENT || DEBUG */
9215
9216 if (fromnd->ni_vp && vnode_isswap(fromnd->ni_vp) && (ctx != vfs_context_kernel())) {
9217 error = EPERM;
9218 goto out1;
9219 }
9220
9221 if (tond->ni_vp && vnode_isswap(tond->ni_vp) && (ctx != vfs_context_kernel())) {
9222 error = EPERM;
9223 goto out1;
9224 }
9225 #endif /* DEVELOPMENT || DEBUG */
9226
9227 if (!tvp && ISSET(flags, VFS_RENAME_SWAP)) {
9228 error = ENOENT;
9229 goto out1;
9230 }
9231
9232 if (tvp && ISSET(flags, VFS_RENAME_EXCL)) {
9233 int32_t pval = 0;
9234 int err = 0;
9235
9236 /*
9237 * We allow rename with VFS_RENAME_EXCL flag for an existing file which
9238 * has the same name as target iff the following conditions are met:
9239 * 1. the target file system is case insensitive
9240 * 2. source and target directories are the same
9241 * 3. source and target files are the same
9242 * 4. name only differs in case (determined by underlying filesystem)
9243 */
9244 if (fvp != tvp || fdvp != tdvp) {
9245 error = EEXIST;
9246 goto out1;
9247 }
9248
9249 /*
9250 * Assume that the target file system is case sensitive if
9251 * _PC_CASE_SENSITIVE selector isn't supported.
9252 */
9253 err = VNOP_PATHCONF(tvp, _PC_CASE_SENSITIVE, &pval, ctx);
9254 if (err != 0 || pval != 0) {
9255 error = EEXIST;
9256 goto out1;
9257 }
9258 }
9259
9260 batched = vnode_compound_rename_available(fdvp);
9261
9262 #if CONFIG_FSE
9263 need_event = need_fsevent(FSE_RENAME, fdvp);
9264 if (need_event) {
9265 if (fvp) {
9266 get_fse_info(fvp, &from_finfo, ctx);
9267 } else {
9268 error = vfs_get_notify_attributes(&__rename_data->fv_attr);
9269 if (error) {
9270 goto out1;
9271 }
9272
9273 fvap = &__rename_data->fv_attr;
9274 }
9275
9276 if (tvp) {
9277 get_fse_info(tvp, &to_finfo, ctx);
9278 } else if (batched) {
9279 error = vfs_get_notify_attributes(&__rename_data->tv_attr);
9280 if (error) {
9281 goto out1;
9282 }
9283
9284 tvap = &__rename_data->tv_attr;
9285 }
9286 }
9287 #else
9288 need_event = 0;
9289 #endif /* CONFIG_FSE */
9290
9291 has_listeners = kauth_authorize_fileop_has_listeners();
9292
9293 need_kpath2 = 0;
9294 #if CONFIG_AUDIT
9295 if (AUDIT_RECORD_EXISTS()) {
9296 need_kpath2 = 1;
9297 }
9298 #endif
9299
9300 if (need_event || has_listeners) {
9301 if (from_name == NULL) {
9302 GET_PATH(from_name);
9303 }
9304
9305 from_len = safe_getpath(fdvp, fromnd->ni_cnd.cn_nameptr, from_name, MAXPATHLEN, &from_truncated);
9306
9307 if (from_name_no_firmlink == NULL) {
9308 GET_PATH(from_name_no_firmlink);
9309 }
9310
9311 from_len_no_firmlink = safe_getpath_no_firmlink(fdvp, fromnd->ni_cnd.cn_nameptr, from_name_no_firmlink, MAXPATHLEN, &from_truncated_no_firmlink);
9312 }
9313
9314 if (need_event || need_kpath2 || has_listeners) {
9315 if (to_name == NULL) {
9316 GET_PATH(to_name);
9317 }
9318
9319 to_len = safe_getpath(tdvp, tond->ni_cnd.cn_nameptr, to_name, MAXPATHLEN, &to_truncated);
9320
9321 if (to_name_no_firmlink == NULL) {
9322 GET_PATH(to_name_no_firmlink);
9323 }
9324
9325 to_len_no_firmlink = safe_getpath_no_firmlink(tdvp, tond->ni_cnd.cn_nameptr, to_name_no_firmlink, MAXPATHLEN, &to_truncated_no_firmlink);
9326 if (to_name && need_kpath2) {
9327 AUDIT_ARG(kpath, to_name, ARG_KPATH2);
9328 }
9329 }
9330 if (!fvp) {
9331 /*
9332 * Claim: this check will never reject a valid rename.
9333 * For success, either fvp must be on the same mount as tdvp, or fvp must sit atop a vnode on the same mount as tdvp.
9334 * Suppose fdvp and tdvp are not on the same mount.
9335 * If fvp is on the same mount as tdvp, then fvp is not on the same mount as fdvp, so fvp is the root of its filesystem. If fvp is the root,
9336 * then you can't move it to within another dir on the same mountpoint.
9337 * If fvp sits atop a vnode on the same mount as fdvp, then that vnode must be part of the same mount as fdvp, which is a contradiction.
9338 *
9339 * If this check passes, then we are safe to pass these vnodes to the same FS.
9340 */
9341 if (fdvp->v_mount != tdvp->v_mount) {
9342 error = EXDEV;
9343 goto out1;
9344 }
9345 goto skipped_lookup;
9346 }
9347
9348 /*
9349 * If the source and destination are the same (i.e. they're
9350 * links to the same vnode) and the target file system is
9351 * case sensitive, then there is nothing to do.
9352 *
9353 * XXX Come back to this.
9354 */
9355 if (fvp == tvp) {
9356 int pathconf_val;
9357
9358 /*
9359 * Note: if _PC_CASE_SENSITIVE selector isn't supported,
9360 * then assume that this file system is case sensitive.
9361 */
9362 if (VNOP_PATHCONF(fvp, _PC_CASE_SENSITIVE, &pathconf_val, ctx) != 0 ||
9363 pathconf_val != 0) {
9364 vn_authorize_skipped = TRUE;
9365 goto out1;
9366 }
9367 }
9368
9369 /*
9370 * Allow the renaming of mount points.
9371 * - target must not exist
9372 * - target must reside in the same directory as source
9373 * - union mounts cannot be renamed
9374 * - the root fs, and tightly-linked system volumes, cannot be renamed
9375 *
9376 * XXX Handle this in VFS after a continued lookup (if we missed
9377 * in the cache to start off)
9378 *
9379 * N.B. If RENAME_SWAP is being used, then @tvp != NULL and so
9380 * we'll skip past here. The file system is responsible for
9381 * checking that @tvp is not a descendent of @fvp and vice versa
9382 * so it should always return EINVAL if either @tvp or @fvp is the
9383 * root of a volume.
9384 */
9385 if ((fvp->v_flag & VROOT) &&
9386 (fvp->v_type == VDIR) &&
9387 (tvp == NULL) &&
9388 (fvp->v_mountedhere == NULL) &&
9389 (fdvp == tdvp) &&
9390 ((fvp->v_mount->mnt_flag & (MNT_UNION | MNT_ROOTFS)) == 0) &&
9391 ((fvp->v_mount->mnt_kern_flag & MNTK_SYSTEM) == 0) &&
9392 (fvp->v_mount->mnt_vnodecovered != NULLVP)) {
9393 vnode_t coveredvp;
9394
9395 /* switch fvp to the covered vnode */
9396 coveredvp = fvp->v_mount->mnt_vnodecovered;
9397 if ((vnode_getwithref(coveredvp))) {
9398 error = ENOENT;
9399 goto out1;
9400 }
9401 /*
9402 * Save the 'fvp' as it is needed for vn_authorize_renamex_with_paths()
9403 * later.
9404 */
9405 mnt_fvp = fvp;
9406
9407 fvp = coveredvp;
9408 mntrename = TRUE;
9409 }
9410 /*
9411 * Check for cross-device rename.
9412 * For rename on mountpoint, we want to also check the source and its parent
9413 * belong to the same mountpoint.
9414 */
9415 if ((fvp->v_mount != tdvp->v_mount) ||
9416 (fvp->v_mount != fdvp->v_mount) ||
9417 (tvp && (fvp->v_mount != tvp->v_mount))) {
9418 error = EXDEV;
9419 goto out1;
9420 }
9421
9422 /*
9423 * If source is the same as the destination (that is the
9424 * same inode number) then there is nothing to do...
9425 * EXCEPT if the underlying file system supports case
9426 * insensitivity and is case preserving. In this case
9427 * the file system needs to handle the special case of
9428 * getting the same vnode as target (fvp) and source (tvp).
9429 *
9430 * Only file systems that support pathconf selectors _PC_CASE_SENSITIVE
9431 * and _PC_CASE_PRESERVING can have this exception, and they need to
9432 * handle the special case of getting the same vnode as target and
9433 * source. NOTE: Then the target is unlocked going into vnop_rename,
9434 * so not to cause locking problems. There is a single reference on tvp.
9435 *
9436 * NOTE - that fvp == tvp also occurs if they are hard linked and
9437 * that correct behaviour then is just to return success without doing
9438 * anything.
9439 *
9440 * XXX filesystem should take care of this itself, perhaps...
9441 */
9442 if (fvp == tvp && fdvp == tdvp) {
9443 if (fromnd->ni_cnd.cn_namelen == tond->ni_cnd.cn_namelen &&
9444 !bcmp(fromnd->ni_cnd.cn_nameptr, tond->ni_cnd.cn_nameptr,
9445 fromnd->ni_cnd.cn_namelen)) {
9446 vn_authorize_skipped = TRUE;
9447 goto out1;
9448 }
9449 }
9450
9451 if (holding_mntlock && fvp->v_mount != locked_mp) {
9452 /*
9453 * we're holding a reference and lock
9454 * on locked_mp, but it no longer matches
9455 * what we want to do... so drop our hold
9456 */
9457 mount_unlock_renames(locked_mp);
9458 mount_drop(locked_mp, 0);
9459 holding_mntlock = 0;
9460 }
9461 if (tdvp != fdvp && fvp->v_type == VDIR) {
9462 /*
9463 * serialize renames that re-shape
9464 * the tree... if holding_mntlock is
9465 * set, then we're ready to go...
9466 * otherwise we
9467 * first need to drop the iocounts
9468 * we picked up, second take the
9469 * lock to serialize the access,
9470 * then finally start the lookup
9471 * process over with the lock held
9472 */
9473 if (!holding_mntlock) {
9474 /*
9475 * need to grab a reference on
9476 * the mount point before we
9477 * drop all the iocounts... once
9478 * the iocounts are gone, the mount
9479 * could follow
9480 */
9481 locked_mp = fvp->v_mount;
9482 mount_ref(locked_mp, 0);
9483
9484 /*
9485 * nameidone has to happen before we vnode_put(tvp)
9486 * since it may need to release the fs_nodelock on the tvp
9487 */
9488 nameidone(tond);
9489
9490 if (tvp) {
9491 vnode_put(tvp);
9492 }
9493 vnode_put(tdvp);
9494
9495 /*
9496 * nameidone has to happen before we vnode_put(fdvp)
9497 * since it may need to release the fs_nodelock on the fvp
9498 */
9499 nameidone(fromnd);
9500
9501 vnode_put(fvp);
9502 vnode_put(fdvp);
9503
9504 if (mnt_fvp != NULLVP) {
9505 vnode_put(mnt_fvp);
9506 }
9507
9508 mount_lock_renames(locked_mp);
9509 holding_mntlock = 1;
9510
9511 goto retry;
9512 }
9513 } else {
9514 /*
9515 * when we dropped the iocounts to take
9516 * the lock, we allowed the identity of
9517 * the various vnodes to change... if they did,
9518 * we may no longer be dealing with a rename
9519 * that reshapes the tree... once we're holding
9520 * the iocounts, the vnodes can't change type
9521 * so we're free to drop the lock at this point
9522 * and continue on
9523 */
9524 if (holding_mntlock) {
9525 mount_unlock_renames(locked_mp);
9526 mount_drop(locked_mp, 0);
9527 holding_mntlock = 0;
9528 }
9529 }
9530
9531 if (!batched) {
9532 assert(locked_vp == NULLVP);
9533 vnode_link_lock(fvp);
9534 locked_vp = fvp;
9535 error = vn_authorize_renamex_with_paths(fdvp, mntrename ? mnt_fvp : fvp,
9536 &fromnd->ni_cnd, from_name, tdvp, tvp, &tond->ni_cnd, to_name, ctx,
9537 flags, NULL);
9538 if (error) {
9539 if (error == ENOENT) {
9540 if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
9541 /*
9542 * We encountered a race where after doing the namei,
9543 * tvp stops being valid. If so, simply re-drive the rename
9544 * call from the top.
9545 */
9546 do_retry = 1;
9547 retry_count += 1;
9548 }
9549 }
9550 vnode_link_unlock(fvp);
9551 locked_vp = NULLVP;
9552 goto out1;
9553 }
9554 }
9555
9556 /* Release the 'mnt_fvp' now that it is no longer needed. */
9557 if (mnt_fvp != NULLVP) {
9558 vnode_put(mnt_fvp);
9559 mnt_fvp = NULLVP;
9560 }
9561
9562 // save these off so we can later verify that fvp is the same
9563 oname = fvp->v_name;
9564 oparent = fvp->v_parent;
9565
9566 skipped_lookup:
9567 #if CONFIG_FILE_LEASES
9568 /* Lease break needed for source's parent dir? */
9569 vnode_breakdirlease(fdvp, false, O_WRONLY);
9570
9571 /* Lease break needed for target's parent dir? */
9572 vnode_breakdirlease(tdvp, false, O_WRONLY);
9573 #endif
9574
9575 error = vn_rename(fdvp, &fvp, &fromnd->ni_cnd, fvap,
9576 tdvp, &tvp, &tond->ni_cnd, tvap,
9577 flags, ctx);
9578
9579 if (locked_vp) {
9580 vnode_link_unlock(fvp);
9581 locked_vp = NULLVP;
9582 }
9583
9584 if (holding_mntlock) {
9585 /*
9586 * we can drop our serialization
9587 * lock now
9588 */
9589 mount_unlock_renames(locked_mp);
9590 mount_drop(locked_mp, 0);
9591 holding_mntlock = 0;
9592 }
9593 if (error) {
9594 if (error == EDATALESS) {
9595 /*
9596 * If we've been here before, something has gone
9597 * horribly wrong and we should just get out lest
9598 * we spiral around the drain forever.
9599 */
9600 if (flags & VFS_RENAME_DATALESS) {
9601 error = EIO;
9602 goto out1;
9603 }
9604
9605 /*
9606 * The object we're renaming is dataless (or has a
9607 * dataless descendent) and requires materialization
9608 * before the rename occurs. But we're holding the
9609 * mount point's rename lock, so it's not safe to
9610 * make the upcall.
9611 *
9612 * In this case, we release the lock (above), perform
9613 * the materialization, and start the whole thing over.
9614 */
9615 error = vfs_materialize_reparent(fvp, tdvp);
9616 if (error == 0) {
9617 /*
9618 * The next time around we need to tell the
9619 * file system that the materializtaion has
9620 * been performed.
9621 */
9622 flags |= VFS_RENAME_DATALESS;
9623 do_retry = 1;
9624 }
9625 goto out1;
9626 }
9627 if (error == EKEEPLOOKING) {
9628 if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) == 0) {
9629 if ((tond->ni_flag & NAMEI_CONTLOOKUP) == 0) {
9630 panic("EKEEPLOOKING without NAMEI_CONTLOOKUP on either ndp?");
9631 }
9632 }
9633
9634 fromnd->ni_vp = fvp;
9635 tond->ni_vp = tvp;
9636
9637 goto continue_lookup;
9638 }
9639
9640 /*
9641 * We may encounter a race in the VNOP where the destination didn't
9642 * exist when we did the namei, but it does by the time we go and
9643 * try to create the entry. In this case, we should re-drive this rename
9644 * call from the top again. Currently, only HFS bubbles out ERECYCLE,
9645 * but other filesystems susceptible to this race could return it, too.
9646 */
9647 if (error == ERECYCLE) {
9648 if (retry_count < MAX_RENAME_ERECYCLE_RETRIES) {
9649 do_retry = 1;
9650 retry_count += 1;
9651 } else {
9652 printf("rename retry limit due to ERECYCLE reached\n");
9653 error = ENOENT;
9654 }
9655 }
9656
9657 /*
9658 * For compound VNOPs, the authorization callback may return
9659 * ENOENT in case of racing hardlink lookups hitting the name
9660 * cache, redrive the lookup.
9661 */
9662 if (batched && error == ENOENT) {
9663 if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
9664 do_retry = 1;
9665 retry_count += 1;
9666 }
9667 }
9668
9669 goto out1;
9670 }
9671
9672 /* call out to allow 3rd party notification of rename.
9673 * Ignore result of kauth_authorize_fileop call.
9674 */
9675 kauth_authorize_fileop(vfs_context_ucred(ctx),
9676 KAUTH_FILEOP_RENAME,
9677 (uintptr_t)from_name, (uintptr_t)to_name);
9678 if (flags & VFS_RENAME_SWAP) {
9679 kauth_authorize_fileop(vfs_context_ucred(ctx),
9680 KAUTH_FILEOP_RENAME,
9681 (uintptr_t)to_name, (uintptr_t)from_name);
9682 }
9683
9684 #if CONFIG_FSE
9685 if (from_name != NULL && to_name != NULL) {
9686 if (from_truncated || to_truncated) {
9687 // set it here since only the from_finfo gets reported up to user space
9688 from_finfo.mode |= FSE_TRUNCATED_PATH;
9689 }
9690
9691 if (tvap && tvp) {
9692 vnode_get_fse_info_from_vap(tvp, &to_finfo, tvap);
9693 }
9694 if (fvap) {
9695 vnode_get_fse_info_from_vap(fvp, &from_finfo, fvap);
9696 }
9697
9698 if (tvp) {
9699 add_fsevent(FSE_RENAME, ctx,
9700 FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
9701 FSE_ARG_FINFO, &from_finfo,
9702 FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
9703 FSE_ARG_FINFO, &to_finfo,
9704 FSE_ARG_DONE);
9705 if (flags & VFS_RENAME_SWAP) {
9706 /*
9707 * Strictly speaking, swap is the equivalent of
9708 * *three* renames. FSEvents clients should only take
9709 * the events as a hint, so we only bother reporting
9710 * two.
9711 */
9712 add_fsevent(FSE_RENAME, ctx,
9713 FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
9714 FSE_ARG_FINFO, &to_finfo,
9715 FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
9716 FSE_ARG_FINFO, &from_finfo,
9717 FSE_ARG_DONE);
9718 }
9719 } else {
9720 add_fsevent(FSE_RENAME, ctx,
9721 FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
9722 FSE_ARG_FINFO, &from_finfo,
9723 FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
9724 FSE_ARG_DONE);
9725 }
9726 }
9727 #endif /* CONFIG_FSE */
9728
9729 /*
9730 * update filesystem's mount point data
9731 */
9732 if (mntrename) {
9733 char *cp, *pathend, *mpname;
9734 char * tobuf;
9735 struct mount *mp;
9736 int maxlen;
9737 size_t len = 0;
9738
9739 mp = fvp->v_mountedhere;
9740
9741 if (vfs_busy(mp, LK_NOWAIT)) {
9742 error = EBUSY;
9743 goto out1;
9744 }
9745 tobuf = zalloc(ZV_NAMEI);
9746
9747 if (UIO_SEG_IS_USER_SPACE(segflg)) {
9748 error = copyinstr(to, tobuf, MAXPATHLEN, &len);
9749 } else {
9750 error = copystr((void *)to, tobuf, MAXPATHLEN, &len);
9751 }
9752 if (!error) {
9753 /* find current mount point prefix */
9754 pathend = &mp->mnt_vfsstat.f_mntonname[0];
9755 for (cp = pathend; *cp != '\0'; ++cp) {
9756 if (*cp == '/') {
9757 pathend = cp + 1;
9758 }
9759 }
9760 /* find last component of target name */
9761 for (mpname = cp = tobuf; *cp != '\0'; ++cp) {
9762 if (*cp == '/') {
9763 mpname = cp + 1;
9764 }
9765 }
9766
9767 /* Update f_mntonname of sub mounts */
9768 vfs_iterate(0, rename_submounts_callback, (void *)mp);
9769
9770 /* append name to prefix */
9771 maxlen = MAXPATHLEN - (int)(pathend - mp->mnt_vfsstat.f_mntonname);
9772 bzero(pathend, maxlen);
9773
9774 strlcpy(pathend, mpname, maxlen);
9775 }
9776 zfree(ZV_NAMEI, tobuf);
9777
9778 vfs_unbusy(mp);
9779
9780 vfs_event_signal(NULL, VQ_UPDATE, (intptr_t)NULL);
9781 }
9782 /*
9783 * fix up name & parent pointers. note that we first
9784 * check that fvp has the same name/parent pointers it
9785 * had before the rename call... this is a 'weak' check
9786 * at best...
9787 *
9788 * XXX oparent and oname may not be set in the compound vnop case
9789 */
9790 if (batched || (oname == fvp->v_name && oparent == fvp->v_parent)) {
9791 int update_flags;
9792
9793 update_flags = VNODE_UPDATE_NAME;
9794
9795 if (fdvp != tdvp) {
9796 update_flags |= VNODE_UPDATE_PARENT;
9797 }
9798
9799 vnode_update_identity(fvp, tdvp, tond->ni_cnd.cn_nameptr, tond->ni_cnd.cn_namelen, tond->ni_cnd.cn_hash, update_flags);
9800 }
9801 out1:
9802 /*
9803 * There are some cases (for e.g. 'fvp == tvp') when vn_authorize was
9804 * skipped earlier as no actual rename was performed.
9805 */
9806 if (vn_authorize_skipped && error == 0) {
9807 error = vn_authorize_renamex_with_paths(fdvp, fvp,
9808 &fromnd->ni_cnd, from_name, tdvp, tvp, &tond->ni_cnd, to_name, ctx,
9809 flags, NULL);
9810 if (error && error == ENOENT) {
9811 if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
9812 do_retry = 1;
9813 retry_count += 1;
9814 }
9815 }
9816 }
9817 if (to_name != NULL) {
9818 RELEASE_PATH(to_name);
9819 to_name = NULL;
9820 }
9821 if (to_name_no_firmlink != NULL) {
9822 RELEASE_PATH(to_name_no_firmlink);
9823 to_name_no_firmlink = NULL;
9824 }
9825 if (from_name != NULL) {
9826 RELEASE_PATH(from_name);
9827 from_name = NULL;
9828 }
9829 if (from_name_no_firmlink != NULL) {
9830 RELEASE_PATH(from_name_no_firmlink);
9831 from_name_no_firmlink = NULL;
9832 }
9833 if (holding_mntlock) {
9834 mount_unlock_renames(locked_mp);
9835 mount_drop(locked_mp, 0);
9836 holding_mntlock = 0;
9837 }
9838 if (tdvp) {
9839 /*
9840 * nameidone has to happen before we vnode_put(tdvp)
9841 * since it may need to release the fs_nodelock on the tdvp
9842 */
9843 nameidone(tond);
9844
9845 if (tvp) {
9846 vnode_put(tvp);
9847 }
9848 vnode_put(tdvp);
9849 }
9850 if (fdvp) {
9851 /*
9852 * nameidone has to happen before we vnode_put(fdvp)
9853 * since it may need to release the fs_nodelock on the fdvp
9854 */
9855 nameidone(fromnd);
9856
9857 if (fvp) {
9858 vnode_put(fvp);
9859 }
9860 vnode_put(fdvp);
9861 }
9862 if (mnt_fvp != NULLVP) {
9863 vnode_put(mnt_fvp);
9864 }
9865 /*
9866 * If things changed after we did the namei, then we will re-drive
9867 * this rename call from the top.
9868 */
9869 if (do_retry) {
9870 do_retry = 0;
9871 goto retry;
9872 }
9873
9874 kfree_type(typeof(*__rename_data), __rename_data);
9875 return error;
9876 }
9877
9878 int
rename(__unused proc_t p,struct rename_args * uap,__unused int32_t * retval)9879 rename(__unused proc_t p, struct rename_args *uap, __unused int32_t *retval)
9880 {
9881 return renameat_internal(vfs_context_current(), AT_FDCWD, uap->from,
9882 AT_FDCWD, uap->to, UIO_USERSPACE, 0);
9883 }
9884
9885 int
renameatx_np(__unused proc_t p,struct renameatx_np_args * uap,__unused int32_t * retval)9886 renameatx_np(__unused proc_t p, struct renameatx_np_args *uap, __unused int32_t *retval)
9887 {
9888 if (uap->flags & ~(RENAME_SECLUDE | RENAME_EXCL | RENAME_SWAP | RENAME_NOFOLLOW_ANY)) {
9889 return EINVAL;
9890 }
9891
9892 if ((uap->flags & (RENAME_EXCL | RENAME_SWAP)) == (RENAME_EXCL | RENAME_SWAP)) {
9893 return EINVAL;
9894 }
9895
9896 return renameat_internal(vfs_context_current(), uap->fromfd, uap->from,
9897 uap->tofd, uap->to, UIO_USERSPACE, uap->flags);
9898 }
9899
9900 int
renameat(__unused proc_t p,struct renameat_args * uap,__unused int32_t * retval)9901 renameat(__unused proc_t p, struct renameat_args *uap, __unused int32_t *retval)
9902 {
9903 return renameat_internal(vfs_context_current(), uap->fromfd, uap->from,
9904 uap->tofd, uap->to, UIO_USERSPACE, 0);
9905 }
9906
9907 /*
9908 * Make a directory file.
9909 *
9910 * Returns: 0 Success
9911 * EEXIST
9912 * namei:???
9913 * vnode_authorize:???
9914 * vn_create:???
9915 */
9916 /* ARGSUSED */
static int
mkdir1at(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap, int fd,
    enum uio_seg segflg)
{
	vnode_t vp, dvp;
	int error;
	int update_flags = 0;   /* VNODE_UPDATE_* bits for vnode_update_identity() */
	int batched;            /* filesystem offers compound (lookup+mkdir) VNOP */
	struct nameidata nd;

	AUDIT_ARG(mode, vap->va_mode);
	/* Look up the parent; LOCKPARENT leaves us holding an iocount on dvp. */
	NDINIT(&nd, CREATE, OP_MKDIR, LOCKPARENT | AUDITVNPATH1, segflg,
	    path, ctx);
	nd.ni_cnd.cn_flags |= WILLBEDIR;
	nd.ni_flag = NAMEI_COMPOUNDMKDIR;

continue_lookup:
	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* Non-NULL vp from the lookup means the target already exists. */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}

	batched = vnode_compound_mkdir_available(dvp);

	VATTR_SET(vap, va_type, VDIR);

	/*
	 * XXX
	 * Don't authorize in VFS for compound VNOP.... mkdir -p today assumes that it will
	 * only get EXISTS or EISDIR for existing path components, and not that it could see
	 * EACCESS/EPERM--so if we authorize for mkdir on "/" for "mkdir -p /tmp/foo/bar/baz"
	 * it will fail in a spurious  manner. Need to figure out if this is valid behavior.
	 */
	if ((error = vn_authorize_mkdir(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0) {
		if (error == EACCES || error == EPERM) {
			int error2;

			/* Drop the CREATE lookup state before re-driving as LOOKUP. */
			nameidone(&nd);
			vnode_put(dvp);
			dvp = NULLVP;

			/*
			 * Try a lookup without "NAMEI_COMPOUNDVNOP" to make sure we return EEXIST
			 * rather than EACCESS if the target exists.
			 */
			NDINIT(&nd, LOOKUP, OP_MKDIR, AUDITVNPATH1, segflg,
			    path, ctx);
			error2 = nameiat(&nd, fd);
			if (error2) {
				/* Target really doesn't exist: keep the EACCES/EPERM. */
				goto out;
			} else {
				vp = nd.ni_vp;
				error = EEXIST;
				goto out;
			}
		}

		goto out;
	}

#if CONFIG_FILE_LEASES
	/* Creating an entry modifies the parent; break any directory lease first. */
	vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

	/*
	 * make the directory
	 */
	if ((error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx)) != 0) {
		/* EKEEPLOOKING: the compound VNOP wants the lookup re-driven. */
		if (error == EKEEPLOOKING) {
			nd.ni_vp = vp;
			goto continue_lookup;
		}

		goto out;
	}

	// Make sure the name & parent pointers are hooked up
	if (vp->v_name == NULL) {
		update_flags |= VNODE_UPDATE_NAME;
	}
	if (vp->v_parent == NULLVP) {
		update_flags |= VNODE_UPDATE_PARENT;
	}

	if (update_flags) {
		vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
	}

#if CONFIG_FSE
	add_fsevent(FSE_CREATE_DIR, ctx, FSE_ARG_VNODE, vp, FSE_ARG_DONE);
#endif

out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	if (dvp) {
		vnode_put(dvp);
	}

	return error;
}
10032
10033 /*
10034 * mkdir_extended: Create a directory; with extended security (ACL).
10035 *
10036 * Parameters: p Process requesting to create the directory
10037 * uap User argument descriptor (see below)
10038 * retval (ignored)
10039 *
10040 * Indirect: uap->path Path of directory to create
10041 * uap->mode Access permissions to set
10042 * uap->xsecurity ACL to set
10043 *
10044 * Returns: 0 Success
10045 * !0 Not success
10046 *
10047 */
10048 int
mkdir_extended(proc_t p,struct mkdir_extended_args * uap,__unused int32_t * retval)10049 mkdir_extended(proc_t p, struct mkdir_extended_args *uap, __unused int32_t *retval)
10050 {
10051 int ciferror;
10052 kauth_filesec_t xsecdst;
10053 struct vnode_attr va;
10054
10055 AUDIT_ARG(owner, uap->uid, uap->gid);
10056
10057 xsecdst = NULL;
10058 if ((uap->xsecurity != USER_ADDR_NULL) &&
10059 ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)) {
10060 return ciferror;
10061 }
10062
10063 VATTR_INIT(&va);
10064 VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd.fd_cmask);
10065 if (xsecdst != NULL) {
10066 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
10067 va.va_vaflags |= VA_FILESEC_ACL;
10068 }
10069
10070 ciferror = mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
10071 UIO_USERSPACE);
10072 if (xsecdst != NULL) {
10073 kauth_filesec_free(xsecdst);
10074 }
10075 return ciferror;
10076 }
10077
10078 int
mkdir(proc_t p,struct mkdir_args * uap,__unused int32_t * retval)10079 mkdir(proc_t p, struct mkdir_args *uap, __unused int32_t *retval)
10080 {
10081 struct vnode_attr va;
10082
10083 VATTR_INIT(&va);
10084 VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd.fd_cmask);
10085
10086 return mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
10087 UIO_USERSPACE);
10088 }
10089
10090 int
mkdirat(proc_t p,struct mkdirat_args * uap,__unused int32_t * retval)10091 mkdirat(proc_t p, struct mkdirat_args *uap, __unused int32_t *retval)
10092 {
10093 struct vnode_attr va;
10094
10095 VATTR_INIT(&va);
10096 VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd.fd_cmask);
10097
10098 return mkdir1at(vfs_context_current(), uap->path, &va, uap->fd,
10099 UIO_USERSPACE);
10100 }
10101
/*
 * Remove the directory named by dirpath (relative to fd when the path is
 * relative).  Handles compound-VNOP filesystems, bounded ENOENT retries,
 * dataless-directory removal, and orphaned AppleDouble cleanup.
 */
static int
rmdirat_internal(vfs_context_t ctx, int fd, user_addr_t dirpath,
    enum uio_seg segflg, int unlink_flags)
{
	/* Heap-allocated to keep the large nameidata off the kernel stack. */
	struct {
		struct nameidata nd;
#if CONFIG_FSE
		struct vnode_attr va;
#endif /* CONFIG_FSE */
	} *__rmdir_data;
	vnode_t vp, dvp;
	int error;
	struct nameidata *ndp;
	char *path = NULL;              /* pathname for fsevents / kauth listeners */
	char *no_firmlink_path = NULL;  /* same path, with firmlinks resolved out */
	int len_path = 0;
	int len_no_firmlink_path = 0;
	int has_listeners = 0;
	int need_event = 0;
	int truncated_path = 0;
	int truncated_no_firmlink_path = 0;
	struct vnode_attr *vap = NULL;
	int restart_count = 0;          /* bounds ENOENT-driven lookup retries */
	int batched;                    /* fs offers compound (lookup+rmdir) VNOP */

	int restart_flag;
	int nofollow_any = 0;

	__rmdir_data = kalloc_type(typeof(*__rmdir_data), Z_WAITOK);
	ndp = &__rmdir_data->nd;

	if (unlink_flags & VNODE_REMOVE_NOFOLLOW_ANY) {
		/* Translate the removal flag into its namei equivalent. */
		nofollow_any = NAMEI_NOFOLLOW_ANY;
		unlink_flags &= ~VNODE_REMOVE_NOFOLLOW_ANY;
	}

	/*
	 * This loop exists to restart rmdir in the unlikely case that two
	 * processes are simultaneously trying to remove the same directory
	 * containing orphaned appleDouble files.
	 */
	do {
		NDINIT(ndp, DELETE, OP_RMDIR, LOCKPARENT | AUDITVNPATH1,
		    segflg, dirpath, ctx);
		ndp->ni_flag = NAMEI_COMPOUNDRMDIR | nofollow_any;
continue_lookup:
		restart_flag = 0;
		vap = NULL;

		error = nameiat(ndp, fd);
		if (error) {
			goto err_out;
		}

		dvp = ndp->ni_dvp;
		vp = ndp->ni_vp;

		if (vp) {
			batched = vnode_compound_rmdir_available(vp);

			if (vp->v_flag & VROOT) {
				/*
				 * The root of a mounted filesystem cannot be deleted.
				 */
				error = EBUSY;
				goto out;
			}

#if DEVELOPMENT || DEBUG
			/*
			 * XXX VSWAP: Check for entitlements or special flag here
			 * so we can restrict access appropriately.
			 */
#else /* DEVELOPMENT || DEBUG */

			/* Swap-backing files may only be removed by the kernel itself. */
			if (vnode_isswap(vp) && (ctx != vfs_context_kernel())) {
				error = EPERM;
				goto out;
			}
#endif /* DEVELOPMENT || DEBUG */

			/*
			 * Removed a check here; we used to abort if vp's vid
			 * was not the same as what we'd seen the last time around.
			 * I do not think that check was valid, because if we retry
			 * and all dirents are gone, the directory could legitimately
			 * be recycled but still be present in a situation where we would
			 * have had permission to delete. Therefore, we won't make
			 * an effort to preserve that check now that we may not have a
			 * vp here.
			 */

			if (!batched) {
				error = vn_authorize_rmdir(dvp, vp, &ndp->ni_cnd, ctx, NULL);
				if (error) {
					/* ENOENT may be a transient race; retry a bounded number of times. */
					if (error == ENOENT) {
						if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
							restart_flag = 1;
							restart_count += 1;
						}
					}
					goto out;
				}
			}
		} else {
			/* No vp: the filesystem must support the compound rmdir VNOP. */
			batched = 1;

			if (!vnode_compound_rmdir_available(dvp)) {
				panic("No error, but no compound rmdir?");
			}
		}

#if CONFIG_FSE
		fse_info finfo = {0};

		need_event = need_fsevent(FSE_DELETE, dvp);
		if (need_event) {
			if (!batched) {
				get_fse_info(vp, &finfo, ctx);
			} else {
				/* Compound path: collect notify attributes during the VNOP itself. */
				error = vfs_get_notify_attributes(&__rmdir_data->va);
				if (error) {
					goto out;
				}

				vap = &__rmdir_data->va;
			}
		}
#endif
		has_listeners = kauth_authorize_fileop_has_listeners();
		if (need_event || has_listeners) {
			if (path == NULL) {
				GET_PATH(path);
			}

			len_path = safe_getpath(dvp, ndp->ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path);

			if (no_firmlink_path == NULL) {
				GET_PATH(no_firmlink_path);
			}

			len_no_firmlink_path = safe_getpath_no_firmlink(dvp, ndp->ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);
#if CONFIG_FSE
			if (truncated_no_firmlink_path) {
				finfo.mode |= FSE_TRUNCATED_PATH;
			}
#endif
		}

#if CONFIG_FILE_LEASES
		/* Removing an entry modifies the parent; break any directory lease first. */
		vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

		error = vn_rmdir(dvp, &vp, ndp, vap, ctx);
		ndp->ni_vp = vp;
		if (vp == NULLVP) {
			/* Couldn't find a vnode */
			goto out;
		}

		if (error == EKEEPLOOKING) {
			/* Compound VNOP wants the lookup re-driven. */
			goto continue_lookup;
		} else if (batched && error == ENOENT) {
			if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
				/*
				 * For compound VNOPs, the authorization callback
				 * may return ENOENT in case of racing hard link lookups
				 * redrive the lookup.
				 */
				restart_flag = 1;
				restart_count += 1;
				goto out;
			}
		}

		/*
		 * XXX There's no provision for passing flags
		 * to VNOP_RMDIR().  So, if vn_rmdir() fails
		 * because it's not empty, then we try again
		 * with VNOP_REMOVE(), passing in a special
		 * flag that clever file systems will know
		 * how to handle.
		 */
		if (error == ENOTEMPTY &&
		    (unlink_flags & VNODE_REMOVE_DATALESS_DIR) != 0) {
			/*
			 * Only do this if the directory is actually
			 * marked as DATALESS.
			 */
			struct vnode_attr *lvap =
			    kalloc_type(struct vnode_attr, Z_WAITOK);

			VATTR_INIT(lvap);
			VATTR_WANTED(lvap, va_flags);
			if (vnode_getattr(vp, lvap, ctx) == 0 &&
			    VATTR_IS_SUPPORTED(lvap, va_flags) &&
			    (lvap->va_flags & SF_DATALESS) != 0) {
				/*
				 * If this fails, we want to keep the original
				 * error.
				 */
				if (vn_remove(dvp, &vp, ndp,
				    VNODE_REMOVE_DATALESS_DIR, vap, ctx) == 0) {
					error = 0;
				}
			}
			kfree_type(struct vnode_attr, lvap);
		}

#if CONFIG_APPLEDOUBLE
		/*
		 * Special case to remove orphaned AppleDouble
		 * files. I don't like putting this in the kernel,
		 * but carbon does not like putting this in carbon either,
		 * so here we are.
		 */
		if (error == ENOTEMPTY) {
			int ad_error = rmdir_remove_orphaned_appleDouble(vp, ctx, &restart_flag);
			if (ad_error == EBUSY) {
				error = ad_error;
				goto out;
			}


			/*
			 * Assuming everything went well, we will try the RMDIR again
			 */
			if (!ad_error) {
				error = vn_rmdir(dvp, &vp, ndp, vap, ctx);
			}
		}
#endif /* CONFIG_APPLEDOUBLE */
		/*
		 * Call out to allow 3rd party notification of delete.
		 * Ignore result of kauth_authorize_fileop call.
		 */
		if (!error) {
			if (has_listeners) {
				kauth_authorize_fileop(vfs_context_ucred(ctx),
				    KAUTH_FILEOP_DELETE,
				    (uintptr_t)vp,
				    (uintptr_t)path);
			}

			if (vp->v_flag & VISHARDLINK) {
				// see the comment in unlink1() about why we update
				// the parent of a hard link when it is removed
				vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
			}

#if CONFIG_FSE
			if (need_event) {
				if (vap) {
					vnode_get_fse_info_from_vap(vp, &finfo, vap);
				}
				add_fsevent(FSE_DELETE, ctx,
				    FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
				    FSE_ARG_FINFO, &finfo,
				    FSE_ARG_DONE);
			}
#endif

#if CONFIG_MACF
			mac_vnode_notify_unlink(ctx, dvp, vp, &ndp->ni_cnd);
#endif
		}

out:
		if (path != NULL) {
			RELEASE_PATH(path);
			path = NULL;
		}

		if (no_firmlink_path != NULL) {
			RELEASE_PATH(no_firmlink_path);
			no_firmlink_path = NULL;
		}

		/*
		 * nameidone has to happen before we vnode_put(dvp)
		 * since it may need to release the fs_nodelock on the dvp
		 */
		nameidone(ndp);
		vnode_put(dvp);

		if (vp) {
			vnode_put(vp);
		}

		if (restart_flag == 0) {
			wakeup_one((caddr_t)vp);
			goto err_out;
		}
		/* Give the racing remover a chance to make progress before retrying. */
		tsleep(vp, PVFS, "rm AD", 1);
	} while (restart_flag != 0);

err_out:
	kfree_type(typeof(*__rmdir_data), __rmdir_data);

	return error;
}
10403
10404 /*
10405 * Remove a directory file.
10406 */
10407 /* ARGSUSED */
10408 int
rmdir(__unused proc_t p,struct rmdir_args * uap,__unused int32_t * retval)10409 rmdir(__unused proc_t p, struct rmdir_args *uap, __unused int32_t *retval)
10410 {
10411 return rmdirat_internal(vfs_context_current(), AT_FDCWD,
10412 CAST_USER_ADDR_T(uap->path), UIO_USERSPACE, 0);
10413 }
10414
/*
 * Get direntry length padded to 8 byte alignment.
 * struct direntry reserves MAXPATHLEN bytes for d_name; this keeps
 * namlen+1 bytes of it (name plus NUL) and rounds up to 8.
 */
#define DIRENT64_LEN(namlen) \
	((sizeof(struct direntry) + (namlen) - (MAXPATHLEN-1) + 7) & ~7)

/*
 * Get dirent length padded to 4 byte alignment.
 * struct dirent reserves __DARWIN_MAXNAMLEN+1 bytes for the name; this
 * keeps namelen+1 bytes of it (name plus NUL) and rounds up to 4.
 */
#define DIRENT_LEN(namelen) \
	((sizeof(struct dirent) + (namelen + 1) - (__DARWIN_MAXNAMLEN + 1) + 3) & ~3)

/* Get the end of this dirent (address of its last byte, per d_reclen). */
#define DIRENT_END(dep) \
	(((char *)(dep)) + (dep)->d_reclen - 1)
10426
10427 errno_t
vnode_readdir64(struct vnode * vp,struct uio * uio,int flags,int * eofflag,int * numdirent,vfs_context_t ctxp)10428 vnode_readdir64(struct vnode *vp, struct uio *uio, int flags, int *eofflag,
10429 int *numdirent, vfs_context_t ctxp)
10430 {
10431 /* Check if fs natively supports VNODE_READDIR_EXTENDED */
10432 if ((vp->v_mount->mnt_vtable->vfc_vfsflags & VFC_VFSREADDIR_EXTENDED) &&
10433 ((vp->v_mount->mnt_kern_flag & MNTK_DENY_READDIREXT) == 0)) {
10434 return VNOP_READDIR(vp, uio, flags, eofflag, numdirent, ctxp);
10435 } else {
10436 size_t bufsize;
10437 void * bufptr;
10438 uio_t auio;
10439 struct direntry *entry64;
10440 struct dirent *dep;
10441 size_t bytesread;
10442 int error;
10443
10444 /*
10445 * We're here because the underlying file system does not
10446 * support direnties or we mounted denying support so we must
10447 * fall back to dirents and convert them to direntries.
10448 *
10449 * Our kernel buffer needs to be smaller since re-packing will
10450 * expand each dirent. The worse case (when the name length
10451 * is 3 or less) corresponds to a struct direntry size of 32
10452 * bytes (8-byte aligned) and a struct dirent size of 12 bytes
10453 * (4-byte aligned). So having a buffer that is 3/8 the size
10454 * will prevent us from reading more than we can pack.
10455 *
10456 * Since this buffer is wired memory, we will limit the
10457 * buffer size to a maximum of 32K. We would really like to
10458 * use 32K in the MIN(), but we use magic number 87371 to
10459 * prevent uio_resid() * 3 / 8 from overflowing.
10460 */
10461 bufsize = 3 * MIN((user_size_t)uio_resid(uio), 87371u) / 8;
10462 bufptr = kalloc_data(bufsize, Z_WAITOK);
10463 if (bufptr == NULL) {
10464 return ENOMEM;
10465 }
10466
10467 auio = uio_create(1, 0, UIO_SYSSPACE, UIO_READ);
10468 uio_addiov(auio, (uintptr_t)bufptr, bufsize);
10469 auio->uio_offset = uio->uio_offset;
10470
10471 error = VNOP_READDIR(vp, auio, 0, eofflag, numdirent, ctxp);
10472
10473 dep = (struct dirent *)bufptr;
10474 bytesread = bufsize - uio_resid(auio);
10475
10476 entry64 = kalloc_type(struct direntry, Z_WAITOK);
10477 /*
10478 * Convert all the entries and copy them out to user's buffer.
10479 */
10480 while (error == 0 && (char *)dep < ((char *)bufptr + bytesread)) {
10481 /* First check that the dirent struct up to d_name is within the buffer */
10482 if ((char*)dep + offsetof(struct dirent, d_name) > ((char *)bufptr + bytesread) ||
10483 /* Check that the length of the entire dirent is within the buffer */
10484 DIRENT_END(dep) > ((char *)bufptr + bytesread) ||
10485 /* Check that the actual length including the name doesn't exceed d_reclen */
10486 DIRENT_LEN(dep->d_namlen) > dep->d_reclen) {
10487 printf("%s: %s: Bad dirent recived from directory %s\n", __func__,
10488 vp->v_mount->mnt_vfsstat.f_mntonname,
10489 vp->v_name ? vp->v_name : "<unknown>");
10490 error = EIO;
10491 break;
10492 }
10493
10494 size_t enbufsize = DIRENT64_LEN(dep->d_namlen);
10495
10496 bzero(entry64, enbufsize);
10497 /* Convert a dirent to a dirent64. */
10498 entry64->d_ino = dep->d_ino;
10499 entry64->d_seekoff = 0;
10500 entry64->d_reclen = (uint16_t)enbufsize;
10501 entry64->d_namlen = dep->d_namlen;
10502 entry64->d_type = dep->d_type;
10503 bcopy(dep->d_name, entry64->d_name, dep->d_namlen + 1);
10504
10505 /* Move to next entry. */
10506 dep = (struct dirent *)((char *)dep + dep->d_reclen);
10507
10508 /* Copy entry64 to user's buffer. */
10509 error = uiomove((caddr_t)entry64, entry64->d_reclen, uio);
10510 }
10511
10512 /* Update the real offset using the offset we got from VNOP_READDIR. */
10513 if (error == 0) {
10514 uio->uio_offset = auio->uio_offset;
10515 }
10516 uio_free(auio);
10517 kfree_data(bufptr, bufsize);
10518 kfree_type(struct direntry, entry64);
10519 return error;
10520 }
10521 }
10522
10523 #define GETDIRENTRIES_MAXBUFSIZE (128 * 1024 * 1024U)
10524
10525 /*
10526 * Read a block of directory entries in a file system independent format.
10527 */
/*
 * Common implementation for getdirentries(2) and getdirentries64(2):
 * read a block of directory entries from fd into bufp, advancing the
 * file offset.  *bytesread gets the number of bytes produced, *offset
 * (if non-NULL) the offset the read started at, *eofflag whether the
 * filesystem reported end-of-directory.
 */
static int
getdirentries_common(int fd, user_addr_t bufp, user_size_t bufsize, ssize_t *bytesread,
    off_t *offset, int *eofflag, int flags)
{
	vnode_t vp;
	struct vfs_context context = *vfs_context_current();    /* local copy */
	struct fileproc *fp;
	uio_t auio;
	int spacetype = proc_is64bit(vfs_context_proc(&context)) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	off_t loff;
	int error, numdirent;
	UIO_STACKBUF(uio_buf, 1);

get_from_fd:
	error = fp_getfvp(vfs_context_proc(&context), fd, &fp, &vp);
	if (error) {
		return error;
	}

	/*
	 * Take the per-file offset lock, then re-check that the fd still
	 * refers to the same vnode (the union-mount traversal below may
	 * swap it); if not, drop everything and start over.
	 */
	vn_offset_lock(fp->fp_glob);
	if (((vnode_t)fp_get_data(fp)) != vp) {
		vn_offset_unlock(fp->fp_glob);
		file_drop(fd);
		goto get_from_fd;
	}

	/* The descriptor must have been opened for reading. */
	if ((fp->fp_glob->fg_flag & FREAD) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EBADF;
		goto out;
	}

	/* Clamp the request to the maximum single-call transfer size. */
	if (bufsize > GETDIRENTRIES_MAXBUFSIZE) {
		bufsize = GETDIRENTRIES_MAXBUFSIZE;
	}

#if CONFIG_MACF
	error = mac_file_check_change_offset(vfs_context_ucred(&context), fp->fp_glob);
	if (error) {
		goto out;
	}
#endif

	if ((error = vnode_getwithref(vp))) {
		goto out;
	}
	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

#if CONFIG_UNION_MOUNTS
unionread:
#endif /* CONFIG_UNION_MOUNTS */
	if (vp->v_type != VDIR) {
		(void)vnode_put(vp);
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_readdir(&context, vp);
	if (error != 0) {
		(void)vnode_put(vp);
		goto out;
	}
#endif /* MAC */

	/* Read starting at the file's current offset. */
	loff = fp->fp_glob->fg_offset;
	auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, bufp, bufsize);

	if (flags & VNODE_READDIR_EXTENDED) {
		error = vnode_readdir64(vp, auio, flags, eofflag, &numdirent, &context);
		fp->fp_glob->fg_offset = uio_offset(auio);
	} else {
		error = VNOP_READDIR(vp, auio, 0, eofflag, &numdirent, &context);
		fp->fp_glob->fg_offset = uio_offset(auio);
	}
	if (error) {
		(void)vnode_put(vp);
		goto out;
	}

#if CONFIG_UNION_MOUNTS
	/*
	 * Nothing was read and this is a union mount: traverse to the
	 * underlying directory, install its vnode in the fd, and read there.
	 */
	if ((user_ssize_t)bufsize == uio_resid(auio) &&
	    (vp->v_mount->mnt_flag & MNT_UNION)) {
		vnode_t uvp;

		if (lookup_traverse_union(vp, &uvp, &context) == 0) {
			if (vnode_ref(uvp) == 0) {
				fp_set_data(fp, uvp);
				fp->fp_glob->fg_offset = 0;
				vnode_rele(vp);
				vnode_put(vp);
				vp = uvp;
				goto unionread;
			} else {
				/* could not get a ref, can't replace in fd */
				vnode_put(uvp);
			}
		}
	}
#endif /* CONFIG_UNION_MOUNTS */

	vnode_put(vp);
	if (offset) {
		*offset = loff;
	}

	*bytesread = bufsize - uio_resid(auio);
out:
	vn_offset_unlock(fp->fp_glob);
	file_drop(fd);
	return error;
}
10641
10642
10643 int
getdirentries(__unused struct proc * p,struct getdirentries_args * uap,int32_t * retval)10644 getdirentries(__unused struct proc *p, struct getdirentries_args *uap, int32_t *retval)
10645 {
10646 off_t offset;
10647 ssize_t bytesread;
10648 int error, eofflag;
10649
10650 AUDIT_ARG(fd, uap->fd);
10651 error = getdirentries_common(uap->fd, uap->buf, uap->count,
10652 &bytesread, &offset, &eofflag, 0);
10653
10654 if (error == 0) {
10655 if (proc_is64bit(p)) {
10656 user64_long_t base = (user64_long_t)offset;
10657 error = copyout((caddr_t)&base, uap->basep, sizeof(user64_long_t));
10658 } else {
10659 user32_long_t base = (user32_long_t)offset;
10660 error = copyout((caddr_t)&base, uap->basep, sizeof(user32_long_t));
10661 }
10662 *retval = (int)bytesread;
10663 }
10664 return error;
10665 }
10666
10667 int
getdirentries64(__unused struct proc * p,struct getdirentries64_args * uap,user_ssize_t * retval)10668 getdirentries64(__unused struct proc *p, struct getdirentries64_args *uap, user_ssize_t *retval)
10669 {
10670 off_t offset;
10671 ssize_t bytesread;
10672 int error, eofflag;
10673 user_size_t bufsize;
10674
10675 AUDIT_ARG(fd, uap->fd);
10676
10677 /*
10678 * If the buffer is at least GETDIRENTRIES64_EXTENDED_BUFSIZE large,
10679 * then the kernel carves out the last 4 bytes to return extended
10680 * information to userspace (namely whether we reached EOF with this call).
10681 */
10682 if (uap->bufsize >= GETDIRENTRIES64_EXTENDED_BUFSIZE) {
10683 bufsize = uap->bufsize - sizeof(getdirentries64_flags_t);
10684 } else {
10685 bufsize = uap->bufsize;
10686 }
10687
10688 error = getdirentries_common(uap->fd, uap->buf, bufsize,
10689 &bytesread, &offset, &eofflag, VNODE_READDIR_EXTENDED);
10690
10691 if (error == 0) {
10692 *retval = bytesread;
10693 error = copyout((caddr_t)&offset, uap->position, sizeof(off_t));
10694
10695 if (error == 0 && uap->bufsize >= GETDIRENTRIES64_EXTENDED_BUFSIZE) {
10696 getdirentries64_flags_t flags = 0;
10697 if (eofflag) {
10698 flags |= GETDIRENTRIES64_EOF;
10699 }
10700 error = copyout(&flags, (user_addr_t)uap->buf + bufsize,
10701 sizeof(flags));
10702 }
10703 }
10704 return error;
10705 }
10706
10707
10708 /*
10709 * Set the mode mask for creation of filesystem nodes.
10710 * XXX implement xsecurity
10711 */
10712 #define UMASK_NOXSECURITY (void *)1 /* leave existing xsecurity alone */
10713 static int
umask1(proc_t p,int newmask,__unused kauth_filesec_t fsec,int32_t * retval)10714 umask1(proc_t p, int newmask, __unused kauth_filesec_t fsec, int32_t *retval)
10715 {
10716 AUDIT_ARG(mask, newmask);
10717 proc_fdlock(p);
10718 *retval = p->p_fd.fd_cmask;
10719 p->p_fd.fd_cmask = newmask & ALLPERMS;
10720 proc_fdunlock(p);
10721 return 0;
10722 }
10723
10724 /*
10725 * umask_extended: Set the mode mask for creation of filesystem nodes; with extended security (ACL).
10726 *
10727 * Parameters: p Process requesting to set the umask
10728 * uap User argument descriptor (see below)
10729 * retval umask of the process (parameter p)
10730 *
10731 * Indirect: uap->newmask umask to set
10732 * uap->xsecurity ACL to set
10733 *
10734 * Returns: 0 Success
10735 * !0 Not success
10736 *
10737 */
10738 int
umask_extended(proc_t p,struct umask_extended_args * uap,int32_t * retval)10739 umask_extended(proc_t p, struct umask_extended_args *uap, int32_t *retval)
10740 {
10741 return umask1(p, uap->newmask, KAUTH_FILESEC_NONE, retval);
10742 }
10743
10744 int
umask(proc_t p,struct umask_args * uap,int32_t * retval)10745 umask(proc_t p, struct umask_args *uap, int32_t *retval)
10746 {
10747 return umask1(p, uap->newmask, UMASK_NOXSECURITY, retval);
10748 }
10749
10750 #define REVOKE_MOUNTED_DEVICE_ENTITLEMENT \
10751 "com.apple.private.vfs.revoke-mounted-device"
10752
10753 /*
10754 * Void all references to file by ripping underlying filesystem
10755 * away from vnode.
10756 */
10757 /* ARGSUSED */
int
revoke(proc_t p, struct revoke_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	struct vnode_attr va;
	vfs_context_t ctx = vfs_context_current();
	int error;
	struct nameidata nd;

	/* Resolve the path (following symlinks) to the target vnode. */
	NDINIT(&nd, LOOKUP, OP_REVOKE, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
	    uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;

	nameidone(&nd);

	/* Only character and block special files can be revoked. */
	if (!(vnode_ischr(vp) || vnode_isblk(vp))) {
		error = ENOTSUP;
		goto out;
	}

	/* A block device with a filesystem mounted on it cannot be revoked. */
	if (vnode_isblk(vp) && vnode_ismountedon(vp)) {
		error = EBUSY;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_revoke(ctx, vp);
	if (error) {
		goto out;
	}
#endif

	/* Caller must be the owner of the device or the superuser. */
	VATTR_INIT(&va);
	VATTR_WANTED(&va, va_uid);
	if ((error = vnode_getattr(vp, &va, ctx))) {
		goto out;
	}
	if (kauth_cred_getuid(vfs_context_ucred(ctx)) != va.va_uid &&
	    (error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
		goto out;
	}
	/* Only issue the VNOP if someone actually holds references/aliases. */
	if (vp->v_usecount > 0 || (vnode_isaliased(vp))) {
		VNOP_REVOKE(vp, REVOKEALL, ctx);
	}
out:
	vnode_put(vp);
	return error;
}
10810
10811
10812 /*
10813 * HFS/HFS PlUS SPECIFIC SYSTEM CALLS
10814 * The following system calls are designed to support features
10815 * which are specific to the HFS & HFS Plus volume formats
10816 */
10817
10818
10819 /*
10820 * Obtain attribute information on objects in a directory while enumerating
10821 * the directory.
10822 */
10823 /* ARGSUSED */
int
getdirentriesattr(proc_t p, struct getdirentriesattr_args *uap, int32_t *retval)
{
	vnode_t vp;
	struct fileproc *fp;
	uio_t auio = NULL;
	int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	uint32_t count = 0, savecount = 0;
	uint32_t newstate = 0;
	int error, eofflag = 0;
	off_t loff = 0;
	struct attrlist attributelist;
	vfs_context_t ctx = vfs_context_current();
	int fd = uap->fd;
	UIO_STACKBUF(uio_buf, 1);
	kauth_action_t action;

	AUDIT_ARG(fd, fd);

	/* Get the attributes into kernel space */
	if ((error = copyin(uap->alist, (caddr_t)&attributelist, sizeof(attributelist)))) {
		return error;
	}
	if ((error = copyin(uap->count, (caddr_t)&count, sizeof(count)))) {
		return error;
	}
	/* Remember the requested count so it can be restored on union traversal. */
	savecount = count;

get_from_fd:
	if ((error = fp_getfvp(p, fd, &fp, &vp))) {
		return error;
	}

	/*
	 * Take the per-file offset lock, then re-check that the fd still
	 * refers to the same vnode (the union-mount traversal below may
	 * swap it); if not, drop everything and start over.
	 */
	vn_offset_lock(fp->fp_glob);
	if (((vnode_t)fp_get_data(fp)) != vp) {
		vn_offset_unlock(fp->fp_glob);
		file_drop(fd);
		goto get_from_fd;
	}

	/* The descriptor must have been opened for reading. */
	if ((fp->fp_glob->fg_flag & FREAD) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EBADF;
		goto out;
	}


#if CONFIG_MACF
	error = mac_file_check_change_offset(vfs_context_ucred(ctx),
	    fp->fp_glob);
	if (error) {
		goto out;
	}
#endif


	if ((error = vnode_getwithref(vp))) {
		goto out;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

#if CONFIG_UNION_MOUNTS
unionread:
#endif /* CONFIG_UNION_MOUNTS */
	if (vp->v_type != VDIR) {
		(void)vnode_put(vp);
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_readdir(ctx, vp);
	if (error != 0) {
		(void)vnode_put(vp);
		goto out;
	}
#endif /* MAC */

	/* set up the uio structure which will contain the users return buffer */
	loff = fp->fp_glob->fg_offset;
	auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, uap->buffer, uap->buffersize);

	/*
	 * If the only item requested is file names, we can let that past with
	 * just LIST_DIRECTORY.  If they want any other attributes, that means
	 * they need SEARCH as well.
	 */
	action = KAUTH_VNODE_LIST_DIRECTORY;
	if ((attributelist.commonattr & ~ATTR_CMN_NAME) ||
	    attributelist.fileattr || attributelist.dirattr) {
		action |= KAUTH_VNODE_SEARCH;
	}

	if ((error = vnode_authorize(vp, NULL, action, ctx)) == 0) {
		/* Believe it or not, uap->options only has 32-bits of valid
		 * info, so truncate before extending again */

		error = VNOP_READDIRATTR(vp, &attributelist, auio, count,
		    (uint32_t)uap->options, &newstate, &eofflag, &count, ctx);
	}

	if (error) {
		(void) vnode_put(vp);
		goto out;
	}

#if CONFIG_UNION_MOUNTS
	/*
	 * If we've got the last entry of a directory in a union mount
	 * then reset the eofflag and pretend there's still more to come.
	 * The next call will again set eofflag and the buffer will be empty,
	 * so traverse to the underlying directory and do the directory
	 * read there.
	 */
	if (eofflag && vp->v_mount->mnt_flag & MNT_UNION) {
		if (uio_resid(auio) < (user_ssize_t) uap->buffersize) { // Got some entries
			eofflag = 0;
		} else { // Empty buffer
			vnode_t uvp;
			if (lookup_traverse_union(vp, &uvp, ctx) == 0) {
				if (vnode_ref_ext(uvp, fp->fp_glob->fg_flag & O_EVTONLY, 0) == 0) {
					fp_set_data(fp, uvp);
					fp->fp_glob->fg_offset = 0; // reset index for new dir
					count = savecount;
					vnode_rele_internal(vp, fp->fp_glob->fg_flag & O_EVTONLY, 0, 0);
					vnode_put(vp);
					vp = uvp;
					goto unionread;
				} else {
					/* could not get a ref, can't replace in fd */
					vnode_put(uvp);
				}
			}
		}
	}
#endif /* CONFIG_UNION_MOUNTS */

	(void)vnode_put(vp);

	if (error) {
		goto out;
	}
	fp->fp_glob->fg_offset = uio_offset(auio); /* should be multiple of dirent, not variable */

	/* Copy results back out: entry count, directory state, and base offset. */
	if ((error = copyout((caddr_t) &count, uap->count, sizeof(count)))) {
		goto out;
	}
	if ((error = copyout((caddr_t) &newstate, uap->newstate, sizeof(newstate)))) {
		goto out;
	}
	if ((error = copyout((caddr_t) &loff, uap->basep, sizeof(loff)))) {
		goto out;
	}

	*retval = eofflag;  /* similar to getdirentries */
	error = 0;
out:
	vn_offset_unlock(fp->fp_glob);
	file_drop(fd);
	return error; /* on error return it; otherwise retval is 0 or 1 (eof) */
} /* end of getdirentriesattr system call */
10987
10988 /*
10989 * Exchange data between two files
10990 */
10991
10992 /* ARGSUSED */
10993 int
exchangedata(__unused proc_t p,struct exchangedata_args * uap,__unused int32_t * retval)10994 exchangedata(__unused proc_t p, struct exchangedata_args *uap, __unused int32_t *retval)
10995 {
10996 struct nameidata fnd, snd;
10997 vfs_context_t ctx = vfs_context_current();
10998 vnode_t fvp;
10999 vnode_t svp;
11000 int error;
11001 u_int32_t nameiflags;
11002 char *fpath = NULL;
11003 char *spath = NULL;
11004 int flen = 0, slen = 0;
11005 int from_truncated = 0, to_truncated = 0;
11006 #if CONFIG_FSE
11007 fse_info f_finfo, s_finfo;
11008 #endif
11009
11010 nameiflags = 0;
11011 if ((uap->options & FSOPT_NOFOLLOW) == 0) {
11012 nameiflags |= FOLLOW;
11013 }
11014
11015 NDINIT(&fnd, LOOKUP, OP_EXCHANGEDATA, nameiflags | AUDITVNPATH1,
11016 UIO_USERSPACE, uap->path1, ctx);
11017
11018 error = namei(&fnd);
11019 if (error) {
11020 goto out2;
11021 }
11022
11023 nameidone(&fnd);
11024 fvp = fnd.ni_vp;
11025
11026 NDINIT(&snd, LOOKUP, OP_EXCHANGEDATA, CN_NBMOUNTLOOK | nameiflags | AUDITVNPATH2,
11027 UIO_USERSPACE, uap->path2, ctx);
11028
11029 error = namei(&snd);
11030 if (error) {
11031 vnode_put(fvp);
11032 goto out2;
11033 }
11034 nameidone(&snd);
11035 svp = snd.ni_vp;
11036
11037 /*
11038 * if the files are the same, return an inval error
11039 */
11040 if (svp == fvp) {
11041 error = EINVAL;
11042 goto out;
11043 }
11044
11045 /*
11046 * if the files are on different volumes, return an error
11047 */
11048 if (svp->v_mount != fvp->v_mount) {
11049 error = EXDEV;
11050 goto out;
11051 }
11052
11053 /* If they're not files, return an error */
11054 if ((vnode_isreg(fvp) == 0) || (vnode_isreg(svp) == 0)) {
11055 error = EINVAL;
11056 goto out;
11057 }
11058
11059 #if CONFIG_MACF
11060 error = mac_vnode_check_exchangedata(ctx,
11061 fvp, svp);
11062 if (error) {
11063 goto out;
11064 }
11065 #endif
11066 if (((error = vnode_authorize(fvp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0) ||
11067 ((error = vnode_authorize(svp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0)) {
11068 goto out;
11069 }
11070
11071 if (
11072 #if CONFIG_FSE
11073 need_fsevent(FSE_EXCHANGE, fvp) ||
11074 #endif
11075 kauth_authorize_fileop_has_listeners()) {
11076 GET_PATH(fpath);
11077 GET_PATH(spath);
11078
11079 flen = safe_getpath(fvp, NULL, fpath, MAXPATHLEN, &from_truncated);
11080 slen = safe_getpath(svp, NULL, spath, MAXPATHLEN, &to_truncated);
11081
11082 #if CONFIG_FSE
11083 get_fse_info(fvp, &f_finfo, ctx);
11084 get_fse_info(svp, &s_finfo, ctx);
11085 if (from_truncated || to_truncated) {
11086 // set it here since only the f_finfo gets reported up to user space
11087 f_finfo.mode |= FSE_TRUNCATED_PATH;
11088 }
11089 #endif
11090 }
11091 /* Ok, make the call */
11092 error = VNOP_EXCHANGE(fvp, svp, 0, ctx);
11093
11094 if (error == 0) {
11095 const char *tmpname;
11096
11097 if (fpath != NULL && spath != NULL) {
11098 /* call out to allow 3rd party notification of exchangedata.
11099 * Ignore result of kauth_authorize_fileop call.
11100 */
11101 kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_EXCHANGE,
11102 (uintptr_t)fpath, (uintptr_t)spath);
11103 }
11104 name_cache_lock();
11105
11106 tmpname = fvp->v_name;
11107 fvp->v_name = svp->v_name;
11108 svp->v_name = tmpname;
11109
11110 if (fvp->v_parent != svp->v_parent) {
11111 vnode_t tmp;
11112
11113 tmp = fvp->v_parent;
11114 fvp->v_parent = svp->v_parent;
11115 svp->v_parent = tmp;
11116 }
11117 name_cache_unlock();
11118
11119 #if CONFIG_FSE
11120 if (fpath != NULL && spath != NULL) {
11121 add_fsevent(FSE_EXCHANGE, ctx,
11122 FSE_ARG_STRING, flen, fpath,
11123 FSE_ARG_FINFO, &f_finfo,
11124 FSE_ARG_STRING, slen, spath,
11125 FSE_ARG_FINFO, &s_finfo,
11126 FSE_ARG_DONE);
11127 }
11128 #endif
11129 }
11130
11131 out:
11132 if (fpath != NULL) {
11133 RELEASE_PATH(fpath);
11134 }
11135 if (spath != NULL) {
11136 RELEASE_PATH(spath);
11137 }
11138 vnode_put(svp);
11139 vnode_put(fvp);
11140 out2:
11141 return error;
11142 }
11143
11144 /*
11145 * Return (in MB) the amount of freespace on the given vnode's volume.
11146 */
11147 uint32_t freespace_mb(vnode_t vp);
11148
11149 uint32_t
freespace_mb(vnode_t vp)11150 freespace_mb(vnode_t vp)
11151 {
11152 vfs_update_vfsstat(vp->v_mount, vfs_context_current(), VFS_USER_EVENT);
11153 return (uint32_t)(((uint64_t)vp->v_mount->mnt_vfsstat.f_bavail *
11154 vp->v_mount->mnt_vfsstat.f_bsize) >> 20);
11155 }
11156
11157 #if CONFIG_SEARCHFS
11158
11159 /* ARGSUSED */
11160
11161 int
searchfs(proc_t p,struct searchfs_args * uap,__unused int32_t * retval)11162 searchfs(proc_t p, struct searchfs_args *uap, __unused int32_t *retval)
11163 {
11164 vnode_t vp, tvp;
11165 int i, error = 0;
11166 int fserror = 0;
11167 struct nameidata nd;
11168 struct user64_fssearchblock searchblock;
11169 struct searchstate *state;
11170 struct attrlist *returnattrs;
11171 struct timeval timelimit;
11172 void *searchparams1, *searchparams2;
11173 uio_t auio = NULL;
11174 int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
11175 uint32_t nummatches;
11176 size_t mallocsize;
11177 uint32_t nameiflags;
11178 vfs_context_t ctx = vfs_context_current();
11179 UIO_STACKBUF(uio_buf, 1);
11180
11181 /* Start by copying in fsearchblock parameter list */
11182 if (IS_64BIT_PROCESS(p)) {
11183 error = copyin(uap->searchblock, (caddr_t) &searchblock, sizeof(searchblock));
11184 timelimit.tv_sec = searchblock.timelimit.tv_sec;
11185 timelimit.tv_usec = searchblock.timelimit.tv_usec;
11186 } else {
11187 struct user32_fssearchblock tmp_searchblock;
11188
11189 error = copyin(uap->searchblock, (caddr_t) &tmp_searchblock, sizeof(tmp_searchblock));
11190 // munge into 64-bit version
11191 searchblock.returnattrs = CAST_USER_ADDR_T(tmp_searchblock.returnattrs);
11192 searchblock.returnbuffer = CAST_USER_ADDR_T(tmp_searchblock.returnbuffer);
11193 searchblock.returnbuffersize = tmp_searchblock.returnbuffersize;
11194 searchblock.maxmatches = tmp_searchblock.maxmatches;
11195 /*
11196 * These casts are safe. We will promote the tv_sec into a 64 bit long if necessary
11197 * from a 32 bit long, and tv_usec is already a signed 32 bit int.
11198 */
11199 timelimit.tv_sec = (__darwin_time_t) tmp_searchblock.timelimit.tv_sec;
11200 timelimit.tv_usec = (__darwin_useconds_t) tmp_searchblock.timelimit.tv_usec;
11201 searchblock.searchparams1 = CAST_USER_ADDR_T(tmp_searchblock.searchparams1);
11202 searchblock.sizeofsearchparams1 = tmp_searchblock.sizeofsearchparams1;
11203 searchblock.searchparams2 = CAST_USER_ADDR_T(tmp_searchblock.searchparams2);
11204 searchblock.sizeofsearchparams2 = tmp_searchblock.sizeofsearchparams2;
11205 searchblock.searchattrs = tmp_searchblock.searchattrs;
11206 }
11207 if (error) {
11208 return error;
11209 }
11210
11211 /* Do a sanity check on sizeofsearchparams1 and sizeofsearchparams2.
11212 */
11213 if (searchblock.sizeofsearchparams1 > SEARCHFS_MAX_SEARCHPARMS ||
11214 searchblock.sizeofsearchparams2 > SEARCHFS_MAX_SEARCHPARMS) {
11215 return EINVAL;
11216 }
11217
11218 /* Now malloc a big bunch of space to hold the search parameters, the attrlists and the search state. */
11219 /* It all has to do into local memory and it's not that big so we might as well put it all together. */
11220 /* Searchparams1 shall be first so we might as well use that to hold the base address of the allocated*/
11221 /* block. */
11222 /* */
11223 /* NOTE: we allocate an extra 8 bytes to account for the difference in size of the searchstate */
11224 /* due to the changes in rdar://problem/12438273. That way if a 3rd party file system */
11225 /* assumes the size is still 556 bytes it will continue to work */
11226
11227 mallocsize = searchblock.sizeofsearchparams1 + searchblock.sizeofsearchparams2 +
11228 sizeof(struct attrlist) + sizeof(struct searchstate) + (2 * sizeof(uint32_t));
11229
11230 searchparams1 = kalloc_data(mallocsize, Z_WAITOK);
11231
11232 /* Now set up the various pointers to the correct place in our newly allocated memory */
11233
11234 searchparams2 = (void *) (((caddr_t) searchparams1) + searchblock.sizeofsearchparams1);
11235 returnattrs = (struct attrlist *) (((caddr_t) searchparams2) + searchblock.sizeofsearchparams2);
11236 state = (struct searchstate *) (((caddr_t) returnattrs) + sizeof(struct attrlist));
11237
11238 /* Now copy in the stuff given our local variables. */
11239
11240 if ((error = copyin(searchblock.searchparams1, searchparams1, searchblock.sizeofsearchparams1))) {
11241 goto freeandexit;
11242 }
11243
11244 if ((error = copyin(searchblock.searchparams2, searchparams2, searchblock.sizeofsearchparams2))) {
11245 goto freeandexit;
11246 }
11247
11248 if ((error = copyin(searchblock.returnattrs, (caddr_t) returnattrs, sizeof(struct attrlist)))) {
11249 goto freeandexit;
11250 }
11251
11252 if ((error = copyin(uap->state, (caddr_t) state, sizeof(struct searchstate)))) {
11253 goto freeandexit;
11254 }
11255
11256 /*
11257 * When searching a union mount, need to set the
11258 * start flag at the first call on each layer to
11259 * reset state for the new volume.
11260 */
11261 if (uap->options & SRCHFS_START) {
11262 state->ss_union_layer = 0;
11263 } else {
11264 uap->options |= state->ss_union_flags;
11265 }
11266 state->ss_union_flags = 0;
11267
11268 /*
11269 * Because searchparams1 and searchparams2 may contain an ATTR_CMN_NAME search parameter,
11270 * which is passed in with an attrreference_t, we need to inspect the buffer manually here.
11271 * The KPI does not provide us the ability to pass in the length of the buffers searchparams1
11272 * and searchparams2. To obviate the need for all searchfs-supporting filesystems to
11273 * validate the user-supplied data offset of the attrreference_t, we'll do it here.
11274 */
11275
11276 if (searchblock.searchattrs.commonattr & ATTR_CMN_NAME) {
11277 attrreference_t* string_ref;
11278 u_int32_t* start_length;
11279 user64_size_t param_length;
11280
11281 /* validate searchparams1 */
11282 param_length = searchblock.sizeofsearchparams1;
11283 /* skip the word that specifies length of the buffer */
11284 start_length = (u_int32_t*) searchparams1;
11285 start_length = start_length + 1;
11286 string_ref = (attrreference_t*) start_length;
11287
11288 /* ensure no negative offsets or too big offsets */
11289 if (string_ref->attr_dataoffset < 0) {
11290 error = EINVAL;
11291 goto freeandexit;
11292 }
11293 if (string_ref->attr_length > MAXPATHLEN) {
11294 error = EINVAL;
11295 goto freeandexit;
11296 }
11297
11298 /* Check for pointer overflow in the string ref */
11299 if (((char*) string_ref + string_ref->attr_dataoffset) < (char*) string_ref) {
11300 error = EINVAL;
11301 goto freeandexit;
11302 }
11303
11304 if (((char*) string_ref + string_ref->attr_dataoffset) > ((char*)searchparams1 + param_length)) {
11305 error = EINVAL;
11306 goto freeandexit;
11307 }
11308 if (((char*)string_ref + string_ref->attr_dataoffset + string_ref->attr_length) > ((char*)searchparams1 + param_length)) {
11309 error = EINVAL;
11310 goto freeandexit;
11311 }
11312 }
11313
11314 /* set up the uio structure which will contain the users return buffer */
11315 auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
11316 uio_addiov(auio, searchblock.returnbuffer, searchblock.returnbuffersize);
11317
11318 nameiflags = 0;
11319 if ((uap->options & FSOPT_NOFOLLOW) == 0) {
11320 nameiflags |= FOLLOW;
11321 }
11322 NDINIT(&nd, LOOKUP, OP_SEARCHFS, nameiflags | AUDITVNPATH1,
11323 UIO_USERSPACE, uap->path, ctx);
11324
11325 error = namei(&nd);
11326 if (error) {
11327 goto freeandexit;
11328 }
11329 vp = nd.ni_vp;
11330 nameidone(&nd);
11331
11332 /*
11333 * Switch to the root vnode for the volume
11334 */
11335 error = VFS_ROOT(vnode_mount(vp), &tvp, ctx);
11336 vnode_put(vp);
11337 if (error) {
11338 goto freeandexit;
11339 }
11340 vp = tvp;
11341
11342 #if CONFIG_UNION_MOUNTS
11343 /*
11344 * If it's a union mount, the path lookup takes
11345 * us to the top layer. But we may need to descend
11346 * to a lower layer. For non-union mounts the layer
11347 * is always zero.
11348 */
11349 for (i = 0; i < (int) state->ss_union_layer; i++) {
11350 if ((vp->v_mount->mnt_flag & MNT_UNION) == 0) {
11351 break;
11352 }
11353 tvp = vp;
11354 vp = vp->v_mount->mnt_vnodecovered;
11355 if (vp == NULL) {
11356 vnode_put(tvp);
11357 error = ENOENT;
11358 goto freeandexit;
11359 }
11360 error = vnode_getwithref(vp);
11361 vnode_put(tvp);
11362 if (error) {
11363 goto freeandexit;
11364 }
11365 }
11366 #endif /* CONFIG_UNION_MOUNTS */
11367
11368 #if CONFIG_MACF
11369 error = mac_vnode_check_searchfs(ctx, vp, returnattrs, &searchblock.searchattrs);
11370 if (error) {
11371 vnode_put(vp);
11372 goto freeandexit;
11373 }
11374 #endif
11375
11376
11377 /*
11378 * If searchblock.maxmatches == 0, then skip the search. This has happened
11379 * before and sometimes the underlying code doesnt deal with it well.
11380 */
11381 if (searchblock.maxmatches == 0) {
11382 nummatches = 0;
11383 goto saveandexit;
11384 }
11385
11386 /*
11387 * Allright, we have everything we need, so lets make that call.
11388 *
11389 * We keep special track of the return value from the file system:
11390 * EAGAIN is an acceptable error condition that shouldn't keep us
11391 * from copying out any results...
11392 */
11393
11394 fserror = VNOP_SEARCHFS(vp,
11395 searchparams1,
11396 searchparams2,
11397 &searchblock.searchattrs,
11398 (uint32_t)searchblock.maxmatches,
11399 &timelimit,
11400 returnattrs,
11401 &nummatches,
11402 (uint32_t)uap->scriptcode,
11403 (uint32_t)uap->options,
11404 auio,
11405 (struct searchstate *) &state->ss_fsstate,
11406 ctx);
11407
11408 #if CONFIG_UNION_MOUNTS
11409 /*
11410 * If it's a union mount we need to be called again
11411 * to search the mounted-on filesystem.
11412 */
11413 if ((vp->v_mount->mnt_flag & MNT_UNION) && fserror == 0) {
11414 state->ss_union_flags = SRCHFS_START;
11415 state->ss_union_layer++; // search next layer down
11416 fserror = EAGAIN;
11417 }
11418 #endif /* CONFIG_UNION_MOUNTS */
11419
11420 saveandexit:
11421
11422 vnode_put(vp);
11423
11424 /* Now copy out the stuff that needs copying out. That means the number of matches, the
11425 * search state. Everything was already put into he return buffer by the vop call. */
11426
11427 if ((error = copyout((caddr_t) state, uap->state, sizeof(struct searchstate))) != 0) {
11428 goto freeandexit;
11429 }
11430
11431 if ((error = suulong(uap->nummatches, (uint64_t)nummatches)) != 0) {
11432 goto freeandexit;
11433 }
11434
11435 error = fserror;
11436
11437 freeandexit:
11438
11439 kfree_data(searchparams1, mallocsize);
11440
11441 return error;
11442 } /* end of searchfs system call */
11443
11444 #else /* CONFIG_SEARCHFS */
11445
/* Stub used when the kernel is built without CONFIG_SEARCHFS. */
int
searchfs(__unused proc_t p, __unused struct searchfs_args *uap, __unused int32_t *retval)
{
	return ENOTSUP;
}
11451
11452 #endif /* CONFIG_SEARCHFS */
11453
11454
11455 #if CONFIG_DATALESS_FILES
11456
11457 /*
11458 * === Namespace Resolver Up-call Mechanism ===
11459 *
11460 * When I/O is performed to a dataless file or directory (read, write,
11461 * lookup-in, etc.), the file system performs an upcall to the namespace
11462 * resolver (filecoordinationd) to materialize the object.
11463 *
11464 * We need multiple up-calls to be in flight at once, and we need these
11465 * up-calls to be interruptible, thus the following implementation:
11466 *
11467 * => The nspace_resolver_request represents the in-kernel request state.
11468 * It contains a request ID, storage space for the errno code returned
11469 * by filecoordinationd, and flags.
11470 *
11471 * => The request ID is simply a global monotonically incrementing 32-bit
11472 * number. Outstanding requests are stored in a hash table, and the
11473 * hash function is extremely simple.
11474 *
11475 * => When an upcall is to be made to filecoordinationd, a request structure
11476 * is allocated on the stack (it is small, and needs to live only during
11477 * the duration of the call to resolve_nspace_item_ext()). It is
11478 * initialized and inserted into the table. Some backpressure from
11479 * filecoordinationd is applied by limiting the numnber of entries that
11480 * can be inserted into the table (and thus limiting the number of
11481 * outstanding requests issued to filecoordinationd); waiting for an
11482 * available slot is interruptible.
11483 *
11484 * => Once the request has been inserted into the table, the up-call is made
11485 * to filecoordinationd via a MiG-generated stub. The up-call returns
11486 * immediately and filecoordinationd processes the request asynchronously.
11487 *
11488 * => The caller now waits for the request to complete. Tnis is achieved by
11489 * sleeping on the address of the request structure and waiting for
11490 * filecoordinationd to mark the request structure as complete. This
11491 * is an interruptible sleep call; if interrupted, the request structure
11492 * is removed from the table and EINTR is returned to the caller. If
11493 * this occurs, an advisory up-call is made to filecoordinationd with
11494 * the request ID to indicate that the request can be aborted or
11495 * de-prioritized at the discretion of filecoordinationd.
11496 *
11497 * => When filecoordinationd has completed the request, it signals completion
11498 * by writing to the vfs.nspace.complete sysctl node. Only a process
11499 * decorated as a namespace resolver can write to this sysctl node. The
11500 * value is a request ID / errno tuple passed as an array of 2 uint32_t's.
11501 * The request ID is looked up in the table, and if the request is found,
11502 * the error code is stored in the request structure and a wakeup()
11503 * issued on the address of the request structure. If the request is not
11504 * found, we simply drop the completion notification, assuming that the
11505 * caller was interrupted.
11506 *
11507 * => When the waiting thread wakes up, it extracts the error code from the
11508 * request structure, removes the request from the table, and returns the
11509 * error code to the calling function. Fini!
11510 */
11511
/*
 * In-kernel state for one outstanding up-call to the namespace resolver.
 * Per the block comment above, these live on the requesting thread's
 * stack and are tracked in a hash table keyed by r_req_id.
 */
struct nspace_resolver_request {
	LIST_ENTRY(nspace_resolver_request) r_hashlink; /* hash bucket linkage */
	vnode_t         r_vp;           /* object being materialized */
	vnode_t         r_tdvp;         /* destination directory, or NULL */
	uint32_t        r_req_id;       /* unique request ID (hash key) */
	int             r_resolver_error; /* errno reported by the resolver */
	int             r_flags;        /* RRF_* flags below */
};

/* r_flags values */
#define RRF_COMPLETE    0x0001  /* request done; r_resolver_error is valid */
#define RRF_COMPLETING  0x0002  /* completion handler still using this req */

/*
 * Completion tuple delivered by the resolver through the
 * vfs.nspace.complete sysctl (see nspace_resolver_req_completed()).
 */
struct nspace_resolver_completion_data {
	uint32_t        req_id;         /* ID of the request being completed */
	int32_t         resolver_error; /* resolver's errno result (0 == OK) */
	uint64_t        orig_gencount;  /* expected recursive gencount; 0 = don't check */
	uint64_t        orig_syncroot;  /* expected sync-root id; 0 = don't check */
};
11530
/*
 * Return the next resolver request ID from a global monotonically
 * incrementing 32-bit counter (OSAddAtomic returns the pre-increment
 * value; the counter wraps after 2^32 requests).
 */
static uint32_t
next_nspace_req_id(void)
{
	static uint32_t next_req_id;

	return OSAddAtomic(1, &next_req_id);
}
11538
/* Hash-table size and the cap on concurrently outstanding up-calls. */
#define NSPACE_RESOLVER_REQ_HASHSIZE 32 /* XXX tune */
#define NSPACE_RESOLVER_MAX_OUTSTANDING 256 /* XXX tune */

/* Hash table of outstanding requests, keyed by request ID. */
static LIST_HEAD(nspace_resolver_requesthead,
    nspace_resolver_request) * nspace_resolver_request_hashtbl;
static u_long nspace_resolver_request_hashmask;  /* index mask for the table */
static u_int nspace_resolver_request_count;      /* current number of entries */
static bool nspace_resolver_request_wait_slot;   /* someone waits for a free slot */
static LCK_GRP_DECLARE(nspace_resolver_request_lck_grp, "file namespace resolver");
/* Protects the table, the count/wait flag, and per-request r_flags. */
static LCK_MTX_DECLARE(nspace_resolver_request_hash_mutex,
    &nspace_resolver_request_lck_grp);

#define NSPACE_REQ_LOCK() \
	lck_mtx_lock(&nspace_resolver_request_hash_mutex)
#define NSPACE_REQ_UNLOCK() \
	lck_mtx_unlock(&nspace_resolver_request_hash_mutex)

/* Map a request ID to its hash bucket. */
#define NSPACE_RESOLVER_HASH(req_id) \
	(&nspace_resolver_request_hashtbl[(req_id) & \
	nspace_resolver_request_hashmask])
11559
11560 static struct nspace_resolver_request *
nspace_resolver_req_lookup(uint32_t req_id,bool skip_completing)11561 nspace_resolver_req_lookup(uint32_t req_id, bool skip_completing)
11562 {
11563 struct nspace_resolver_requesthead *bucket;
11564 struct nspace_resolver_request *req;
11565
11566 bucket = NSPACE_RESOLVER_HASH(req_id);
11567 LIST_FOREACH(req, bucket, r_hashlink) {
11568 if (req->r_req_id == req_id) {
11569 /*
11570 * If this request already has a completion
11571 * pending, don't return it again.
11572 */
11573 if ((req->r_flags & RRF_COMPLETING) != 0 &&
11574 skip_completing) {
11575 req = NULL;
11576 }
11577 return req;
11578 }
11579 }
11580
11581 return NULL;
11582 }
11583
/*
 * Insert 'req' into the outstanding-request table.  If the table already
 * holds NSPACE_RESOLVER_MAX_OUTSTANDING entries, sleep (interruptibly)
 * until a slot frees up.  Returns 0 on success, or the msleep errno if
 * the wait was interrupted.
 */
static int
nspace_resolver_req_add(struct nspace_resolver_request *req)
{
	struct nspace_resolver_requesthead *bucket;
	int error;

	NSPACE_REQ_LOCK();

	/* Back-pressure: cap the number of in-flight up-calls. */
	while (nspace_resolver_request_count >=
	    NSPACE_RESOLVER_MAX_OUTSTANDING) {
		nspace_resolver_request_wait_slot = true;
		error = msleep(&nspace_resolver_request_count,
		    &nspace_resolver_request_hash_mutex,
		    PVFS | PCATCH, "nspacerq", NULL);
		if (error) {
			NSPACE_REQ_UNLOCK();
			return error;
		}
	}

	bucket = NSPACE_RESOLVER_HASH(req->r_req_id);
#if DIAGNOSTIC
	assert(nspace_resolver_req_lookup(req->r_req_id, false) == NULL);
#endif /* DIAGNOSTIC */
	LIST_INSERT_HEAD(bucket, req, r_hashlink);
	nspace_resolver_request_count++;

	NSPACE_REQ_UNLOCK();

	return 0;
}
11615
/*
 * Block until any in-flight completion handler is done touching 'req'.
 * Called and returns with NSPACE_REQ_LOCK held (msleep drops and
 * re-acquires the mutex while sleeping).
 */
static void
nspace_resolver_req_wait_pending_completion(struct nspace_resolver_request *req)
{
	/*
	 * If a completion is in-progress, we have to wait for the
	 * completion handler to finish because it's still using 'req',
	 * which is allocated on our stack a couple of frames up.
	 */
	while ((req->r_flags & RRF_COMPLETING) != 0) {
		(void) msleep(req, &nspace_resolver_request_hash_mutex,
		    PVFS, "nspacecmplt", NULL);
	}
}
11629
/*
 * Remove 'req' from the request table and drop NSPACE_REQ_LOCK.  Wakes
 * any thread waiting for a free table slot, and waits out a pending
 * completion handler so 'req' (stack-allocated by the requester) is no
 * longer referenced when we return.
 */
static void
nspace_resolver_req_remove_and_unlock(struct nspace_resolver_request *req)
{
	struct nspace_resolver_requesthead *bucket;

	/* We're called with NSPACE_REQ_LOCK held. */

	bucket = NSPACE_RESOLVER_HASH(req->r_req_id);
#if DIAGNOSTIC
	assert((req->r_flags & RRF_COMPLETING) == 0);
	assert(nspace_resolver_req_lookup(req->r_req_id, false) != NULL);
#endif /* DIAGNOSTIC */
	LIST_REMOVE(req, r_hashlink);
	nspace_resolver_request_count--;

	/* Let a thread blocked in nspace_resolver_req_add() retry. */
	if (nspace_resolver_request_wait_slot) {
		nspace_resolver_request_wait_slot = false;
		wakeup(&nspace_resolver_request_count);
	}

	nspace_resolver_req_wait_pending_completion(req);

	NSPACE_REQ_UNLOCK();
}
11654
/*
 * Convenience wrapper: take NSPACE_REQ_LOCK, then remove 'req' from the
 * table (the lock is dropped by the callee).
 */
static void
nspace_resolver_req_remove(struct nspace_resolver_request *req)
{
	NSPACE_REQ_LOCK();
	nspace_resolver_req_remove_and_unlock(req);
}
11661
/*
 * Send an advisory cancellation for 'req_id' to the namespace resolver
 * (filecoordinationd) so it can abort or de-prioritize the request.
 */
static void
nspace_resolver_req_cancel(uint32_t req_id)
{
	kern_return_t kr;
	mach_port_t mp;

	// Failures here aren't fatal -- the cancellation message
	// sent to the resolver is merely advisory.

	kr = host_get_filecoordinationd_port(host_priv_self(), &mp);
	if (kr != KERN_SUCCESS || !IPC_PORT_VALID(mp)) {
		return;
	}

	kr = send_nspace_resolve_cancel(mp, req_id);
	if (kr != KERN_SUCCESS) {
		os_log_error(OS_LOG_DEFAULT,
		    "NSPACE send_nspace_resolve_cancel failure: %d", kr);
	}

	/* Drop the send right acquired above. */
	ipc_port_release_send(mp);
}
11684
/*
 * Wait (interruptibly) for the resolver to complete 'req'.  If the sleep
 * is interrupted, the request is failed locally (EINTR for a signal;
 * other msleep failures are mapped to ETIMEDOUT) and an advisory cancel
 * is sent to the resolver.  Returns the request's resolver errno.
 */
static int
nspace_resolver_req_wait(struct nspace_resolver_request *req)
{
	bool send_cancel_message = false;
	int error;

	NSPACE_REQ_LOCK();

	while ((req->r_flags & RRF_COMPLETE) == 0) {
		error = msleep(req, &nspace_resolver_request_hash_mutex,
		    PVFS | PCATCH, "nspace", NULL);
		if (error && error != ERESTART) {
			req->r_resolver_error = (error == EINTR) ? EINTR :
			    ETIMEDOUT;
			send_cancel_message = true;
			break;
		}
	}

	nspace_resolver_req_remove_and_unlock(req);

	/*
	 * It's safe to continue referencing 'req' here because it's
	 * allocated on our caller's stack.
	 */

	if (send_cancel_message) {
		nspace_resolver_req_cancel(req->r_req_id);
	}

	return req->r_resolver_error;
}
11717
11718 static void
nspace_resolver_req_mark_complete(struct nspace_resolver_request * req,int resolver_error)11719 nspace_resolver_req_mark_complete(
11720 struct nspace_resolver_request *req,
11721 int resolver_error)
11722 {
11723 req->r_resolver_error = resolver_error;
11724 req->r_flags = (req->r_flags & ~RRF_COMPLETING) | RRF_COMPLETE;
11725 wakeup(req);
11726 }
11727
/*
 * Flag 'req' as having its completion in progress so other threads
 * leave it alone while NSPACE_REQ_LOCK is dropped.  Caller holds
 * NSPACE_REQ_LOCK.
 */
static void
nspace_resolver_req_mark_completion_pending(struct nspace_resolver_request *req)
{
	req->r_flags |= RRF_COMPLETING;
}
11733
/*
 * Handle a completion notification from the namespace resolver (arriving
 * via the vfs.nspace.complete sysctl).  Looks up the outstanding request
 * by ID; if the resolver supplied namespace-shape criteria (a recursive
 * gencount and/or a sync-root id) and reported success, verifies under
 * the mount rename lock that the shape is unchanged (EBUSY otherwise).
 * Finally marks the request complete and wakes the waiting requester.
 */
static void
nspace_resolver_req_completed(const struct nspace_resolver_completion_data *c)
{
	struct nspace_resolver_request *req;
	int error;
	struct vnode_attr va;
	vnode_t vp;

	NSPACE_REQ_LOCK();

	req = nspace_resolver_req_lookup(c->req_id, true);
	if (req == NULL) {
		/*
		 * If we don't find the request corresponding to our req_id,
		 * just drop the completion on the floor; it's likely that
		 * the requester interrupted with a signal, or it may already
		 * be completing.
		 */
		NSPACE_REQ_UNLOCK();
		return;
	}

	/*
	 * Get out now if the resolver reported an error.
	 */
	if ((error = c->resolver_error) != 0) {
		goto out;
	}

	/*
	 * If the resolver did not specify any namespace shape criteria
	 * for letting the operation proceed, then get out now.
	 */
	if (c->orig_gencount == 0 && c->orig_syncroot == 0) {
		goto out;
	}

	/*
	 * We're going to have to acquire the mount rename lock and do
	 * some I/O in order to verify the criteria.  Mark the request
	 * as pending so no one else messes with it after we drop the
	 * NSPACE_REQ_LOCK.
	 */
	nspace_resolver_req_mark_completion_pending(req);
	NSPACE_REQ_UNLOCK();

	/*
	 * Lock out renames from changing the shape of the tree while
	 * we validate the criteria.
	 */
	mount_t locked_mp = req->r_vp->v_mount;
	mount_ref(locked_mp, 0);
	mount_lock_renames(locked_mp);

	if (c->orig_gencount != 0) {
		vp = req->r_vp;
		if (error) {
			/*
			 * NOTE(review): 'error' is always 0 at this point
			 * (non-zero resolver_error jumped to 'out' above),
			 * so this branch appears unreachable.
			 */
			goto out_dropmount;
		}

		VATTR_INIT(&va);
		VATTR_WANTED(&va, va_recursive_gencount);
		error = vnode_getattr(vp, &va, vfs_context_kernel());
		if (error) {
			goto out_dropmount;
		}
		/* The subtree changed underneath the resolver: refuse. */
		if (VATTR_NOT_RETURNED(&va, va_recursive_gencount) ||
		    va.va_recursive_gencount != c->orig_gencount) {
			printf("nspace.complete: gencount changed! (orig %llu cur %llu)\n",
			    c->orig_gencount, va.va_recursive_gencount);
			error = EBUSY;
			goto out_dropmount;
		}
	}

	/*
	 * Ignore orig_syncroot if a destination directory wasn't specified
	 * in the request.
	 */
	if (c->orig_syncroot != 0 && (vp = req->r_tdvp) != NULL) {
		uint64_t syncroot_id;

		if (error) {
			/* NOTE(review): as above, 'error' is 0 here unless the
			 * gencount path ran; that path goto's on failure, so
			 * this check also looks redundant. */
			goto out_dropmount;
		}

#ifndef APFSIOC_GET_SYNC_ROOT
#define APFSIOC_GET_SYNC_ROOT _IOR('J', 115, uint64_t)
#endif

		error = VNOP_IOCTL(vp, APFSIOC_GET_SYNC_ROOT,
		    (caddr_t)&syncroot_id, 0, vfs_context_kernel());
		if (error) {
			goto out_dropmount;
		}
		if (syncroot_id != c->orig_syncroot) {
			printf("nspace.complete: syncroot changed! (orig %llu cur %llu)\n",
			    c->orig_syncroot, syncroot_id);
			error = EBUSY;
			goto out_dropmount;
		}
	}

out_dropmount:
	mount_unlock_renames(locked_mp);
	mount_drop(locked_mp, 0);
	NSPACE_REQ_LOCK();

out:
	nspace_resolver_req_mark_complete(req, error);
	NSPACE_REQ_UNLOCK();
}
11846
11847 static struct proc *nspace_resolver_proc;
11848
11849 static int
nspace_resolver_get_proc_state(struct proc * p,int * is_resolver)11850 nspace_resolver_get_proc_state(struct proc *p, int *is_resolver)
11851 {
11852 *is_resolver = ((p->p_lflag & P_LNSPACE_RESOLVER) &&
11853 p == nspace_resolver_proc) ? 1 : 0;
11854 return 0;
11855 }
11856
static boolean_t vfs_context_is_dataless_resolver(vfs_context_t);

/*
 * Claim (is_resolver != 0) or relinquish the system-wide namespace
 * resolver role for process 'p'.  Only a root-owned process entitled as
 * the dataless resolver may do either.  Returns EPERM if the caller
 * doesn't qualify, EBUSY if another process already holds the role,
 * otherwise 0.
 */
static int
nspace_resolver_set_proc_state(struct proc *p, int is_resolver)
{
	vfs_context_t ctx = vfs_context_current();
	int error = 0;

	//
	// The system filecoordinationd runs as uid == 0.  This also
	// has the nice side-effect of filtering out filecoordinationd
	// running in the simulator.
	//
	if (!vfs_context_issuser(ctx) ||
	    !vfs_context_is_dataless_resolver(ctx)) {
		return EPERM;
	}

	if (is_resolver) {
		NSPACE_REQ_LOCK();

		if (nspace_resolver_proc == NULL) {
			proc_lock(p);
			p->p_lflag |= P_LNSPACE_RESOLVER;
			proc_unlock(p);
			nspace_resolver_proc = p;
		} else {
			error = EBUSY;
		}

		NSPACE_REQ_UNLOCK();
	} else {
		// This is basically just like the exit case.
		// nspace_resolver_exited() will verify that the
		// process is the resolver, and will clear the
		// global.
		nspace_resolver_exited(p);
	}

	return error;
}
11898
11899 static int
nspace_materialization_get_proc_state(struct proc * p,int * is_prevented)11900 nspace_materialization_get_proc_state(struct proc *p, int *is_prevented)
11901 {
11902 if ((p->p_lflag & P_LNSPACE_RESOLVER) != 0 ||
11903 (p->p_vfs_iopolicy &
11904 P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES) == 0) {
11905 *is_prevented = 1;
11906 } else {
11907 *is_prevented = 0;
11908 }
11909 return 0;
11910 }
11911
/*
 * Set (is_prevented != 0) or clear the process-wide "materialize
 * dataless files" iopolicy bit for 'p'.  The resolver process may not
 * opt in to materialization (EBUSY); asking it to prevent is a no-op.
 */
static int
nspace_materialization_set_proc_state(struct proc *p, int is_prevented)
{
	if (p->p_lflag & P_LNSPACE_RESOLVER) {
		return is_prevented ? 0 : EBUSY;
	}

	if (is_prevented) {
		/* Atomically clear the opt-in bit. */
		OSBitAndAtomic16(~((uint16_t)P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES), &p->p_vfs_iopolicy);
	} else {
		/* Atomically set the opt-in bit. */
		OSBitOrAtomic16((uint16_t)P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES, &p->p_vfs_iopolicy);
	}
	return 0;
}
11926
11927 static int
nspace_materialization_get_thread_state(int * is_prevented)11928 nspace_materialization_get_thread_state(int *is_prevented)
11929 {
11930 uthread_t ut = current_uthread();
11931
11932 *is_prevented = (ut->uu_flag & UT_NSPACE_NODATALESSFAULTS) ? 1 : 0;
11933 return 0;
11934 }
11935
11936 static int
nspace_materialization_set_thread_state(int is_prevented)11937 nspace_materialization_set_thread_state(int is_prevented)
11938 {
11939 uthread_t ut = current_uthread();
11940
11941 if (is_prevented) {
11942 ut->uu_flag |= UT_NSPACE_NODATALESSFAULTS;
11943 } else {
11944 ut->uu_flag &= ~UT_NSPACE_NODATALESSFAULTS;
11945 }
11946 return 0;
11947 }
11948
11949 /* the vfs.nspace branch */
11950 SYSCTL_NODE(_vfs, OID_AUTO, nspace, CTLFLAG_RW | CTLFLAG_LOCKED, NULL, "vfs nspace hinge");
11951
11952 static int
sysctl_nspace_resolver(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)11953 sysctl_nspace_resolver(__unused struct sysctl_oid *oidp,
11954 __unused void *arg1, __unused int arg2, struct sysctl_req *req)
11955 {
11956 struct proc *p = req->p;
11957 int new_value, old_value, changed = 0;
11958 int error;
11959
11960 error = nspace_resolver_get_proc_state(p, &old_value);
11961 if (error) {
11962 return error;
11963 }
11964
11965 error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
11966 &changed);
11967 if (error == 0 && changed) {
11968 error = nspace_resolver_set_proc_state(p, new_value);
11969 }
11970 return error;
11971 }
11972
/*
 * vfs.nspace.resolver: decorate the calling process as the dataless file
 * resolver (int, read/write; CTLFLAG_ANYBODY — policy enforced in handler).
 */
SYSCTL_PROC(_vfs_nspace, OID_AUTO, resolver,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
    0, 0, sysctl_nspace_resolver, "I", "");
11977
11978 static int
sysctl_nspace_prevent_materialization(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)11979 sysctl_nspace_prevent_materialization(__unused struct sysctl_oid *oidp,
11980 __unused void *arg1, __unused int arg2, struct sysctl_req *req)
11981 {
11982 struct proc *p = req->p;
11983 int new_value, old_value, changed = 0;
11984 int error;
11985
11986 error = nspace_materialization_get_proc_state(p, &old_value);
11987 if (error) {
11988 return error;
11989 }
11990
11991 error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
11992 &changed);
11993 if (error == 0 && changed) {
11994 error = nspace_materialization_set_proc_state(p, new_value);
11995 }
11996 return error;
11997 }
11998
/*
 * vfs.nspace.prevent_materialization: decorate the calling process as not
 * wanting to materialize dataless files (int, read/write).
 */
SYSCTL_PROC(_vfs_nspace, OID_AUTO, prevent_materialization,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
    0, 0, sysctl_nspace_prevent_materialization, "I", "");
12003
12004 static int
sysctl_nspace_thread_prevent_materialization(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)12005 sysctl_nspace_thread_prevent_materialization(__unused struct sysctl_oid *oidp,
12006 __unused void *arg1, __unused int arg2, struct sysctl_req *req)
12007 {
12008 int new_value, old_value, changed = 0;
12009 int error;
12010
12011 error = nspace_materialization_get_thread_state(&old_value);
12012 if (error) {
12013 return error;
12014 }
12015
12016 error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
12017 &changed);
12018 if (error == 0 && changed) {
12019 error = nspace_materialization_set_thread_state(new_value);
12020 }
12021 return error;
12022 }
12023
/*
 * vfs.nspace.thread_prevent_materialization: decorate the calling thread as
 * not wanting to materialize dataless files (int, read/write).
 */
SYSCTL_PROC(_vfs_nspace, OID_AUTO, thread_prevent_materialization,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
    0, 0, sysctl_nspace_thread_prevent_materialization, "I", "");
12028
/*
 * Handler for vfs.nspace.complete: the resolver reports completion of a
 * materialization request.  The write payload is consumed in order:
 *   uint32_t req_status[2]  { request id, resolver errno }  (required)
 *   uint64_t gencount       original generation count       (optional)
 *   uint64_t syncroot       original sync-root ID           (optional)
 * Only the registered resolver process may call this (EPERM otherwise).
 */
static int
sysctl_nspace_complete(__unused struct sysctl_oid *oidp, __unused void *arg1,
    __unused int arg2, struct sysctl_req *req)
{
	struct proc *p = req->p;
	uint32_t req_status[2] = { 0, 0 };
	uint64_t gencount = 0;
	uint64_t syncroot = 0;
	int error, is_resolver, changed = 0, other_changed;

	error = nspace_resolver_get_proc_state(p, &is_resolver);
	if (error) {
		return error;
	}

	/* Only the designated resolver may report completions. */
	if (!is_resolver) {
		return EPERM;
	}

	error = sysctl_io_opaque(req, req_status, sizeof(req_status),
	    &changed);
	if (error) {
		return error;
	}

	/*
	 * Get the gencount if it was passed.  Ignore errors, because
	 * it's optional.
	 */
	error = sysctl_io_opaque(req, &gencount, sizeof(gencount),
	    &other_changed);
	if (error) {
		gencount = 0;
		error = 0;
	}

	/*
	 * ...and now the syncroot ID.  Also optional; errors are ignored.
	 */
	error = sysctl_io_opaque(req, &syncroot, sizeof(syncroot),
	    &other_changed);
	if (error) {
		syncroot = 0;
		error = 0;
	}

	/*
	 * req_status[0] is the req_id
	 *
	 * req_status[1] is the errno
	 *
	 * 'changed' reflects only the required req_status write; the
	 * optional fields never block completion delivery.
	 */
	if (error == 0 && changed) {
		const struct nspace_resolver_completion_data cd = {
			.req_id = req_status[0],
			.resolver_error = req_status[1],
			.orig_gencount = gencount,
			.orig_syncroot = syncroot,
		};
		nspace_resolver_req_completed(&cd);
	}
	return error;
}
12091
/*
 * vfs.nspace.complete: the resolver reports completed requests here
 * (opaque payload; see sysctl_nspace_complete for the format).
 */
SYSCTL_PROC(_vfs_nspace, OID_AUTO, complete,
    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
    0, 0, sysctl_nspace_complete, "-", "");
12096
12097 #endif /* CONFIG_DATALESS_FILES */
12098
12099 #if CONFIG_DATALESS_FILES
12100 #define __no_dataless_unused /* nothing */
12101 #else
12102 #define __no_dataless_unused __unused
12103 #endif
12104
/*
 * vfs_context_dataless_materialization_is_prevented:
 *
 * Decide whether the given vfs context may materialize dataless files.
 * Returns:
 *   0            materialization may proceed
 *   EDEADLK      materialization is prevented (also the default)
 *   EJUSTRETURN  caller is a dataless manipulator; the operation should
 *                proceed as if the object were not dataless
 */
int
vfs_context_dataless_materialization_is_prevented(
	vfs_context_t const ctx __no_dataless_unused)
{
#if CONFIG_DATALESS_FILES
	proc_t const p = vfs_context_proc(ctx);
	thread_t const t = vfs_context_thread(ctx);
	uthread_t const ut = t ? get_bsdthread_info(t) : NULL;

	/*
	 * Kernel context ==> return EDEADLK, as we would with any random
	 * process decorated as no-materialize.
	 */
	if (ctx == vfs_context_kernel()) {
		return EDEADLK;
	}

	/*
	 * If the process has the dataless-manipulation entitlement,
	 * materialization is prevented, and depending on the kind
	 * of file system operation, things get to proceed as if the
	 * object is not dataless.
	 */
	if (vfs_context_is_dataless_manipulator(ctx)) {
		return EJUSTRETURN;
	}

	/*
	 * Per-thread decorations override any process-wide decorations.
	 * (Foundation uses this, and this overrides even the dataless-
	 * manipulation entitlement so as to make API contracts consistent.)
	 */
	if (ut != NULL) {
		if (ut->uu_flag & UT_NSPACE_NODATALESSFAULTS) {
			return EDEADLK;
		}
		if (ut->uu_flag & UT_NSPACE_FORCEDATALESSFAULTS) {
			return 0;
		}
	}

	/*
	 * If the process's iopolicy specifies that dataless files
	 * can be materialized, then we let it go ahead.
	 */
	if (p->p_vfs_iopolicy & P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES) {
		return 0;
	}
#endif /* CONFIG_DATALESS_FILES */

	/*
	 * The default behavior is to not materialize dataless files;
	 * return to the caller that deadlock was detected.
	 */
	return EDEADLK;
}
12161
/*
 * One-time initialization of the dataless-file resolver machinery:
 * allocate the hash table used to track in-flight resolver requests.
 */
void
nspace_resolver_init(void)
{
#if CONFIG_DATALESS_FILES
	nspace_resolver_request_hashtbl =
	    hashinit(NSPACE_RESOLVER_REQ_HASHSIZE,
	    M_VNODE /* XXX */, &nspace_resolver_request_hashmask);
#endif /* CONFIG_DATALESS_FILES */
}
12171
/*
 * Called at process exit.  If the exiting process is the registered
 * dataless-file resolver, complete every outstanding request with
 * ETIMEDOUT so waiters don't block forever, then clear the resolver
 * designation.
 */
void
nspace_resolver_exited(struct proc *p __no_dataless_unused)
{
#if CONFIG_DATALESS_FILES
	struct nspace_resolver_requesthead *bucket;
	struct nspace_resolver_request *req;
	u_long idx;

	NSPACE_REQ_LOCK();

	if ((p->p_lflag & P_LNSPACE_RESOLVER) &&
	    p == nspace_resolver_proc) {
		/* Walk every hash bucket and fail each pending request. */
		for (idx = 0; idx <= nspace_resolver_request_hashmask; idx++) {
			bucket = &nspace_resolver_request_hashtbl[idx];
			LIST_FOREACH(req, bucket, r_hashlink) {
				/* Let any in-flight completion finish first. */
				nspace_resolver_req_wait_pending_completion(req);
				nspace_resolver_req_mark_complete(req,
				    ETIMEDOUT);
			}
		}
		nspace_resolver_proc = NULL;
	}

	NSPACE_REQ_UNLOCK();
#endif /* CONFIG_DATALESS_FILES */
}
12198
12199 #define DATALESS_RESOLVER_ENTITLEMENT \
12200 "com.apple.private.vfs.dataless-resolver"
12201 #define DATALESS_MANIPULATION_ENTITLEMENT \
12202 "com.apple.private.vfs.dataless-manipulation"
12203
12204 #if CONFIG_DATALESS_FILES
12205 /*
12206 * Return TRUE if the vfs context is associated with the dataless
12207 * resolver.
12208 */
12209 static boolean_t
vfs_context_is_dataless_resolver(vfs_context_t ctx __no_dataless_unused)12210 vfs_context_is_dataless_resolver(vfs_context_t ctx __no_dataless_unused)
12211 {
12212 return IOTaskHasEntitlement(vfs_context_task(ctx),
12213 DATALESS_RESOLVER_ENTITLEMENT);
12214 }
12215 #endif /* CONFIG_DATALESS_FILES */
12216
12217 /*
12218 * Return TRUE if the vfs context is associated with a process entitled
12219 * for dataless manipulation.
12220 *
12221 * XXX Arguably belongs in vfs_subr.c, but is here because of the
12222 * complication around CONFIG_DATALESS_FILES.
12223 */
12224 boolean_t
vfs_context_is_dataless_manipulator(vfs_context_t ctx __no_dataless_unused)12225 vfs_context_is_dataless_manipulator(vfs_context_t ctx __no_dataless_unused)
12226 {
12227 #if CONFIG_DATALESS_FILES
12228 task_t task = vfs_context_task(ctx);
12229 return IOTaskHasEntitlement(task, DATALESS_MANIPULATION_ENTITLEMENT) ||
12230 IOTaskHasEntitlement(task, DATALESS_RESOLVER_ENTITLEMENT);
12231 #else
12232 return false;
12233 #endif /* CONFIG_DATALESS_FILES */
12234 }
12235
12236 #if CONFIG_DATALESS_FILES
12237 static void
log_materialization_prevented(vnode_t vp,uint64_t op)12238 log_materialization_prevented(vnode_t vp, uint64_t op)
12239 {
12240 char p_name[MAXCOMLEN + 1];
12241 char *vntype;
12242 proc_selfname(&p_name[0], sizeof(p_name));
12243
12244 if (vp->v_type == VREG) {
12245 vntype = "File";
12246 } else if (vp->v_type == VDIR) {
12247 vntype = "Dir";
12248 } else if (vp->v_type == VLNK) {
12249 vntype = "SymLink";
12250 } else {
12251 vntype = "Other";
12252 }
12253
12254 #if DEVELOPMENT
12255 struct vnode_attr *vap = kalloc_type(struct vnode_attr, Z_WAITOK);
12256
12257 VATTR_INIT(vap);
12258 VATTR_WANTED(vap, va_fsid);
12259 VATTR_WANTED(vap, va_fileid);
12260 if (vnode_getattr(vp, vap, vfs_context_current()) == 0) {
12261 os_log_debug(OS_LOG_DEFAULT,
12262 "NSPACE process %s (pid %d) is decorated as no-materialization (op %lld; %s) fsid 0x%08x/%u fileid=%llu",
12263 p_name, proc_selfpid(), op, vntype,
12264 vap->va_fsid, vap->va_fsid, vap->va_fileid);
12265 } else
12266 #endif
12267 {
12268 os_log_debug(OS_LOG_DEFAULT,
12269 "NSPACE process %s (pid %d) is decorated as no-materialization (op %lld; %s)",
12270 p_name, proc_selfpid(), op, vntype);
12271 }
12272 #if DEVELOPMENT
12273 kfree_type(struct vnode_attr, vap);
12274 #endif
12275 }
12276 #endif /* CONFIG_DATALESS_FILES */
12277
12278 static int
vfs_materialize_item(vnode_t vp __no_dataless_unused,uint32_t op __no_dataless_unused,int64_t offset __no_dataless_unused,int64_t size __no_dataless_unused,char * lookup_name __no_dataless_unused,size_t const namelen __no_dataless_unused,vnode_t tdvp __no_dataless_unused)12279 vfs_materialize_item(
12280 vnode_t vp __no_dataless_unused,
12281 uint32_t op __no_dataless_unused,
12282 int64_t offset __no_dataless_unused,
12283 int64_t size __no_dataless_unused,
12284 char *lookup_name __no_dataless_unused,
12285 size_t const namelen __no_dataless_unused,
12286 vnode_t tdvp __no_dataless_unused)
12287 {
12288 #if CONFIG_DATALESS_FILES
12289 kern_return_t kern_ret;
12290 mach_port_t mach_port;
12291 char *path = NULL;
12292 vfs_context_t context;
12293 int path_len;
12294 int error;
12295 audit_token_t atoken;
12296 enum vtype vp_vtype;
12297
12298 /* Swap files are special; ignore them */
12299 if (vnode_isswap(vp)) {
12300 return 0;
12301 }
12302
12303 /*
12304 * NAMESPACE_HANDLER_SNAPSHOT_EVENT and NAMESPACE_HANDLER_TRACK_EVENT
12305 * are no longer used nor supported.
12306 */
12307 if (op & NAMESPACE_HANDLER_SNAPSHOT_EVENT) {
12308 os_log_debug(OS_LOG_DEFAULT, "NSPACE SNAPSHOT not handled");
12309 return ENOTSUP;
12310 }
12311 if (op & NAMESPACE_HANDLER_TRACK_EVENT) {
12312 os_log_debug(OS_LOG_DEFAULT, "NSPACE TRACK not handled");
12313 return ENOTSUP;
12314 }
12315
12316 /* Normalize 'op'. */
12317 op &= ~NAMESPACE_HANDLER_EVENT_TYPE_MASK;
12318
12319 /*
12320 * To-directory is only meaningful for rename operations;
12321 * ignore it if someone handed one to us unexpectedly.
12322 */
12323 if (op != NAMESPACE_HANDLER_RENAME_OP) {
12324 tdvp = NULL;
12325 }
12326
12327 context = vfs_context_current();
12328
12329 /* Remember this for later. */
12330 vp_vtype = vnode_vtype(vp);
12331
12332 error = vfs_context_dataless_materialization_is_prevented(context);
12333 if (error) {
12334 log_materialization_prevented(vp, op);
12335 goto out_check_errors;
12336 }
12337
12338 kern_ret = host_get_filecoordinationd_port(host_priv_self(),
12339 &mach_port);
12340 if (kern_ret != KERN_SUCCESS || !IPC_PORT_VALID(mach_port)) {
12341 os_log_error(OS_LOG_DEFAULT, "NSPACE no port");
12342 /*
12343 * Treat this like being unable to access the backing store
12344 * server.
12345 */
12346 return ETIMEDOUT;
12347 }
12348
12349 int path_alloc_len = MAXPATHLEN;
12350 do {
12351 path = kalloc_data(path_alloc_len, Z_WAITOK | Z_ZERO);
12352 if (path == NULL) {
12353 return ENOMEM;
12354 }
12355
12356 path_len = path_alloc_len;
12357 error = vn_getpath(vp, path, &path_len);
12358 if (error == 0) {
12359 break;
12360 } else if (error == ENOSPC) {
12361 kfree_data(path, path_alloc_len);
12362 path = NULL;
12363 } else {
12364 goto out_release_port;
12365 }
12366 } while (error == ENOSPC && (path_alloc_len += MAXPATHLEN) && path_alloc_len <= FSGETPATH_MAXBUFLEN);
12367
12368 error = vfs_context_copy_audit_token(context, &atoken);
12369 if (error) {
12370 goto out_release_port;
12371 }
12372
12373 struct nspace_resolver_request req = {
12374 .r_req_id = next_nspace_req_id(),
12375 .r_vp = vp,
12376 .r_tdvp = tdvp,
12377 };
12378
12379 error = nspace_resolver_req_add(&req);
12380 if (error) {
12381 goto out_release_port;
12382 }
12383
12384 os_log_debug(OS_LOG_DEFAULT, "NSPACE resolve_path call");
12385
12386 if (op == NAMESPACE_HANDLER_RENAME_OP && tdvp != NULL) {
12387 char *dest_path = NULL;
12388 int dest_path_len;
12389
12390 dest_path = zalloc(ZV_NAMEI);
12391 dest_path_len = MAXPATHLEN;
12392
12393 error = vn_getpath(tdvp, dest_path, &dest_path_len);
12394 if (error) {
12395 zfree(ZV_NAMEI, dest_path);
12396 goto out_release_port;
12397 }
12398
12399 /*
12400 * Force setting NAMESPACE_HANDLER_NSPACE_EVENT for
12401 * compatibility with existing agents in user-space
12402 * who get passed this value.
12403 */
12404 kern_ret = send_vfs_resolve_reparent_with_audit_token(mach_port,
12405 req.r_req_id,
12406 op | NAMESPACE_HANDLER_NSPACE_EVENT,
12407 path, dest_path, atoken);
12408
12409 zfree(ZV_NAMEI, dest_path);
12410 } else if (vp_vtype == VDIR) {
12411 char *tmpname = NULL;
12412
12413 /*
12414 * If the caller provided a lookup_name *and* a name length,
12415 * then we assume the lookup_name is not NUL-terminated.
12416 * Allocate a temporary buffer in this case to provide
12417 * a NUL-terminated path name to the IPC call.
12418 */
12419 if (lookup_name != NULL && namelen != 0) {
12420 if (namelen >= PATH_MAX) {
12421 error = EINVAL;
12422 goto out_req_remove;
12423 }
12424 tmpname = zalloc(ZV_NAMEI);
12425 strlcpy(tmpname, lookup_name, namelen + 1);
12426 lookup_name = tmpname;
12427 } else if (lookup_name != NULL) {
12428 /*
12429 * If the caller provided a lookup_name with a
12430 * zero name length, then we assume it's NUL-
12431 * terminated. Verify it has a valid length.
12432 */
12433 if (strlen(lookup_name) >= PATH_MAX) {
12434 error = EINVAL;
12435 goto out_req_remove;
12436 }
12437 }
12438
12439 /* (See above.) */
12440 kern_ret = send_vfs_resolve_dir_with_audit_token(mach_port,
12441 req.r_req_id,
12442 op | NAMESPACE_HANDLER_NSPACE_EVENT,
12443 lookup_name == NULL ? "" : lookup_name, path, atoken);
12444
12445 if (tmpname != NULL) {
12446 zfree(ZV_NAMEI, tmpname);
12447
12448 /*
12449 * Poison lookup_name rather than reference
12450 * freed memory.
12451 */
12452 lookup_name = NULL;
12453 }
12454 } else {
12455 /* (See above.) */
12456 kern_ret = send_vfs_resolve_file_with_audit_token(mach_port,
12457 req.r_req_id,
12458 op | NAMESPACE_HANDLER_NSPACE_EVENT,
12459 offset, size, path, atoken);
12460 }
12461 if (kern_ret != KERN_SUCCESS) {
12462 /*
12463 * Also treat this like being unable to access the backing
12464 * store server.
12465 */
12466 os_log_error(OS_LOG_DEFAULT, "NSPACE resolve failure: %d",
12467 kern_ret);
12468 error = ETIMEDOUT;
12469 goto out_req_remove;
12470 }
12471
12472 /*
12473 * Give back the memory we allocated earlier while we wait; we
12474 * no longer need it.
12475 */
12476 kfree_data(path, path_alloc_len);
12477 path = NULL;
12478
12479 /*
12480 * Request has been submitted to the resolver. Now (interruptibly)
12481 * wait for completion. Upon requrn, the request will have been
12482 * removed from the lookup table.
12483 */
12484 error = nspace_resolver_req_wait(&req);
12485
12486 out_release_port:
12487 if (path != NULL) {
12488 kfree_data(path, path_alloc_len);
12489 path = NULL;
12490 }
12491 ipc_port_release_send(mach_port);
12492
12493 out_check_errors:
12494 /*
12495 * The file resolver owns the logic about what error to return
12496 * to the caller. We only need to handle a couple of special
12497 * cases here:
12498 */
12499 if (error == EJUSTRETURN) {
12500 /*
12501 * The requesting process is allowed to interact with
12502 * dataless objects. Make a couple of sanity-checks
12503 * here to ensure the action makes sense.
12504 */
12505 switch (op) {
12506 case NAMESPACE_HANDLER_WRITE_OP:
12507 case NAMESPACE_HANDLER_TRUNCATE_OP:
12508 case NAMESPACE_HANDLER_RENAME_OP:
12509 /*
12510 * This handles the case of the resolver itself
12511 * writing data to the file (or throwing it
12512 * away).
12513 */
12514 error = 0;
12515 break;
12516 case NAMESPACE_HANDLER_READ_OP:
12517 case NAMESPACE_HANDLER_LOOKUP_OP:
12518 /*
12519 * This handles the case of the resolver needing
12520 * to look up inside of a dataless directory while
12521 * it's in the process of materializing it (for
12522 * example, creating files or directories).
12523 */
12524 error = (vp_vtype == VDIR) ? 0 : EBADF;
12525 break;
12526 default:
12527 error = EBADF;
12528 break;
12529 }
12530 }
12531
12532 return error;
12533
12534 out_req_remove:
12535 nspace_resolver_req_remove(&req);
12536 goto out_release_port;
12537 #else
12538 return ENOTSUP;
12539 #endif /* CONFIG_DATALESS_FILES */
12540 }
12541
12542 /*
12543 * vfs_materialize_file: Materialize a regular file.
12544 *
12545 * Inputs:
12546 * vp The dataless file to be materialized.
12547 *
12548 * op What kind of operation is being performed:
12549 * -> NAMESPACE_HANDLER_READ_OP
12550 * -> NAMESPACE_HANDLER_WRITE_OP
12551 * -> NAMESPACE_HANDLER_LINK_CREATE
12552 * -> NAMESPACE_HANDLER_DELETE_OP
12553 * -> NAMESPACE_HANDLER_TRUNCATE_OP
12554 * -> NAMESPACE_HANDLER_RENAME_OP
12555 *
12556 * offset offset of I/O for READ or WRITE. Ignored for
12557 * other ops.
12558 *
12559 * size size of I/O for READ or WRITE Ignored for
12560 * other ops.
12561 *
12562 * If offset or size are -1 for a READ or WRITE, then the resolver should
12563 * consider the range to be unknown.
12564 *
12565 * Upon successful return, the caller may proceed with the operation.
12566 * N.B. the file may still be "dataless" in this case.
12567 */
12568 int
vfs_materialize_file(struct vnode * vp,uint64_t op,int64_t offset,int64_t size)12569 vfs_materialize_file(
12570 struct vnode *vp,
12571 uint64_t op,
12572 int64_t offset,
12573 int64_t size)
12574 {
12575 if (vp->v_type != VREG) {
12576 return EFTYPE;
12577 }
12578 return vfs_materialize_item(vp, (uint32_t)op, offset, size, NULL, 0,
12579 NULL);
12580 }
12581
12582 /*
12583 * vfs_materialize_dir:
12584 *
12585 * Inputs:
12586 * vp The dataless directory to be materialized.
12587 *
12588 * op What kind of operation is being performed:
12589 * -> NAMESPACE_HANDLER_READ_OP
12590 * -> NAMESPACE_HANDLER_WRITE_OP
12591 * -> NAMESPACE_HANDLER_DELETE_OP
12592 * -> NAMESPACE_HANDLER_RENAME_OP
12593 * -> NAMESPACE_HANDLER_LOOKUP_OP
12594 *
12595 * lookup_name Name being looked up for a LOOKUP op. Ignored for
12596 * other ops. May or may not be NUL-terminated; see below.
12597 *
12598 * namelen If non-zero, then lookup_name is assumed to not be NUL-
12599 * terminated and namelen is the number of valid bytes in
12600 * lookup_name. If zero, then lookup_name is assumed to be
12601 * NUL-terminated.
12602 *
12603 * Upon successful return, the caller may proceed with the operation.
12604 * N.B. the directory may still be "dataless" in this case.
12605 */
12606 int
vfs_materialize_dir(struct vnode * vp,uint64_t op,char * lookup_name,size_t namelen)12607 vfs_materialize_dir(
12608 struct vnode *vp,
12609 uint64_t op,
12610 char *lookup_name,
12611 size_t namelen)
12612 {
12613 if (vp->v_type != VDIR) {
12614 return EFTYPE;
12615 }
12616 if (op == NAMESPACE_HANDLER_LOOKUP_OP && lookup_name == NULL) {
12617 return EINVAL;
12618 }
12619 return vfs_materialize_item(vp, (uint32_t)op, 0, 0, lookup_name,
12620 namelen, NULL);
12621 }
12622
12623 /*
12624 * vfs_materialize_reparent:
12625 *
12626 * Inputs:
12627 * vp The dataless file or directory to be materialized.
12628 *
12629 * tdvp The new parent directory for the dataless file.
12630 *
12631 * Upon successful return, the caller may proceed with the operation.
12632 * N.B. the item may still be "dataless" in this case.
12633 */
12634 int
vfs_materialize_reparent(vnode_t vp,vnode_t tdvp)12635 vfs_materialize_reparent(vnode_t vp, vnode_t tdvp)
12636 {
12637 if (vp->v_type != VDIR && vp->v_type != VREG) {
12638 return EFTYPE;
12639 }
12640 return vfs_materialize_item(vp, NAMESPACE_HANDLER_RENAME_OP,
12641 0, 0, NULL, 0, tdvp);
12642 }
12643
#if 0
/*
 * Disabled legacy helper (not compiled): builds a "volfs"-style path of
 * the form /.vol/<fsid>/<fileid> for a vnode, or a sentinel path when
 * the attributes cannot be fetched.  Kept for reference only.
 */
static int
build_volfs_path(struct vnode *vp, char *path, int *len)
{
	struct vnode_attr va;
	int ret;

	VATTR_INIT(&va);
	VATTR_WANTED(&va, va_fsid);
	VATTR_WANTED(&va, va_fileid);

	if (vnode_getattr(vp, &va, vfs_context_kernel()) != 0) {
		*len = snprintf(path, *len, "/non/existent/path/because/vnode_getattr/failed") + 1;
		ret = -1;
	} else {
		*len = snprintf(path, *len, "/.vol/%d/%lld", (dev_t)va.va_fsid, va.va_fileid) + 1;
		ret = 0;
	}

	return ret;
}
#endif
12666
12667 static unsigned long
fsctl_bogus_command_compat(unsigned long cmd)12668 fsctl_bogus_command_compat(unsigned long cmd)
12669 {
12670 switch (cmd) {
12671 case IOCBASECMD(FSIOC_SYNC_VOLUME):
12672 return FSIOC_SYNC_VOLUME;
12673 case IOCBASECMD(FSIOC_ROUTEFS_SETROUTEID):
12674 return FSIOC_ROUTEFS_SETROUTEID;
12675 case IOCBASECMD(FSIOC_SET_PACKAGE_EXTS):
12676 return FSIOC_SET_PACKAGE_EXTS;
12677 case IOCBASECMD(FSIOC_SET_FSTYPENAME_OVERRIDE):
12678 return FSIOC_SET_FSTYPENAME_OVERRIDE;
12679 case IOCBASECMD(DISK_CONDITIONER_IOC_GET):
12680 return DISK_CONDITIONER_IOC_GET;
12681 case IOCBASECMD(DISK_CONDITIONER_IOC_SET):
12682 return DISK_CONDITIONER_IOC_SET;
12683 case IOCBASECMD(FSIOC_FIOSEEKHOLE):
12684 return FSIOC_FIOSEEKHOLE;
12685 case IOCBASECMD(FSIOC_FIOSEEKDATA):
12686 return FSIOC_FIOSEEKDATA;
12687 case IOCBASECMD(SPOTLIGHT_IOC_GET_MOUNT_TIME):
12688 return SPOTLIGHT_IOC_GET_MOUNT_TIME;
12689 case IOCBASECMD(SPOTLIGHT_IOC_GET_LAST_MTIME):
12690 return SPOTLIGHT_IOC_GET_LAST_MTIME;
12691 }
12692
12693 return cmd;
12694 }
12695
/*
 * Setattr callback used by handle_flags(): performs the actual
 * compare-and-swap of the BSD flags via the file system's
 * FSIOC_CAS_BSDFLAGS ioctl.  'arg' is the fsioc_cas_bsdflags request.
 */
static int
cas_bsdflags_setattr(vnode_t vp, void *arg, vfs_context_t ctx)
{
	return VNOP_IOCTL(vp, FSIOC_CAS_BSDFLAGS, arg, FWRITE, ctx);
}
12701
/*
 * FSIOC_SYNC_VOLUME handler: sync the volume containing 'vp'.
 * 'data' points to the caller's FSCTL_SYNC_* flag word.  On return,
 * *arg_vp is set to NULL to tell the caller that the vnode's iocount
 * has already been released here.
 */
static int __attribute__((noinline))
handle_sync_volume(vnode_t vp, vnode_t *arg_vp, caddr_t data, vfs_context_t ctx)
{
	struct vfs_attr vfa;
	mount_t mp = vp->v_mount;
	unsigned arg;
	int error;

	/* record vid of vp so we can drop it below. */
	uint32_t vvid = vp->v_id;

	/*
	 * Then grab mount_iterref so that we can release the vnode.
	 * Without this, a thread may call vnode_iterate_prepare then
	 * get into a deadlock because we've never released the root vp
	 */
	error = mount_iterref(mp, 0);
	if (error) {
		return error;
	}
	/* Keep a long-term hold on vp while dropping its iocount. */
	vnode_hold(vp);
	vnode_put(vp);

	arg = MNT_NOWAIT;
	if (*(uint32_t*)data & FSCTL_SYNC_WAIT) {
		arg = MNT_WAIT;
	}

	/*
	 * If the file system supports multiple file systems in a
	 * partition (e.g. APFS volumes in a container), it knows
	 * that the waitfor argument to VFS_SYNC are flags.
	 */
	VFSATTR_INIT(&vfa);
	VFSATTR_WANTED(&vfa, f_capabilities);
	if ((vfs_getattr(mp, &vfa, vfs_context_current()) == 0) &&
	    VFSATTR_IS_SUPPORTED(&vfa, f_capabilities) &&
	    ((vfa.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_SHARED_SPACE)) &&
	    ((vfa.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_SHARED_SPACE))) {
		arg |= MNT_VOLUME;
	}

	/* issue the sync for this volume */
	(void)sync_callback(mp, &arg);

	/*
	 * Then release the mount_iterref once we're done syncing; it's not
	 * needed for the VNOP_IOCTL below
	 */
	mount_iterdrop(mp);

	/*
	 * NOTE(review): this tests the MNT_* wait mode held in 'arg'
	 * against the user-flag bit FSCTL_SYNC_FULLSYNC; presumably it
	 * should test *(uint32_t *)data instead — confirm against the
	 * FSCTL_SYNC_* and MNT_* bit definitions.
	 */
	if (arg & FSCTL_SYNC_FULLSYNC) {
		/* re-obtain vnode iocount on the root vp, if possible */
		error = vnode_getwithvid(vp, vvid);
		if (error == 0) {
			error = VNOP_IOCTL(vp, F_FULLFSYNC, (caddr_t)NULL, 0, ctx);
			vnode_put(vp);
		}
	}
	vnode_drop(vp);
	/* mark the argument VP as having been released */
	*arg_vp = NULL;
	return error;
}
12766
12767 #if ROUTEFS
12768 static int __attribute__((noinline))
handle_routes(user_addr_t udata)12769 handle_routes(user_addr_t udata)
12770 {
12771 char routepath[MAXPATHLEN];
12772 size_t len = 0;
12773 int error;
12774
12775 if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
12776 return error;
12777 }
12778 bzero(routepath, MAXPATHLEN);
12779 error = copyinstr(udata, &routepath[0], MAXPATHLEN, &len);
12780 if (error) {
12781 return error;
12782 }
12783 error = routefs_kernel_mount(routepath);
12784 return error;
12785 }
12786 #endif
12787
/*
 * FSIOC_CAS_BSDFLAGS handler: compare-and-swap the BSD flags on a
 * vnode.  chflags0() performs permission checks and attribute
 * plumbing; the actual CAS is done by the file system through the
 * cas_bsdflags_setattr callback above.
 */
static int __attribute__((noinline))
handle_flags(vnode_t vp, caddr_t data, vfs_context_t ctx)
{
	struct fsioc_cas_bsdflags *cas = (struct fsioc_cas_bsdflags *)data;
	struct vnode_attr va;
	int error;

	VATTR_INIT(&va);
	VATTR_SET(&va, va_flags, cas->new_flags);

	error = chflags0(vp, &va, cas_bsdflags_setattr, cas, ctx);

#if CONFIG_FSE
	/* Only emit a stat-changed event if the CAS actually took effect. */
	if (error == 0 && cas->expected_flags == cas->actual_flags && need_fsevent(FSE_STAT_CHANGED, vp)) {
		add_fsevent(FSE_STAT_CHANGED, ctx, FSE_ARG_VNODE, vp, FSE_ARG_DONE);
	}
#endif

	return error;
}
12808
12809 static int __attribute__((noinline))
handle_auth(vnode_t vp,u_long cmd,caddr_t data,u_long options,vfs_context_t ctx)12810 handle_auth(vnode_t vp, u_long cmd, caddr_t data, u_long options, vfs_context_t ctx)
12811 {
12812 struct mount *mp = NULL;
12813 errno_t rootauth = 0;
12814
12815 mp = vp->v_mount;
12816
12817 /*
12818 * query the underlying FS and see if it reports something
12819 * sane for this vnode. If volume is authenticated via
12820 * chunklist, leave that for the caller to determine.
12821 */
12822 rootauth = VNOP_IOCTL(vp, cmd, data, (int)options, ctx);
12823
12824 return rootauth;
12825 }
12826
12827 #define SET_PACKAGE_EXTENSION_ENTITLEMENT \
12828 "com.apple.private.kernel.set-package-extensions"
12829
12830 /*
12831 * Make a filesystem-specific control call:
12832 */
12833 /* ARGSUSED */
12834 static int
fsctl_internal(proc_t p,vnode_t * arg_vp,u_long cmd,user_addr_t udata,u_long options,vfs_context_t ctx)12835 fsctl_internal(proc_t p, vnode_t *arg_vp, u_long cmd, user_addr_t udata, u_long options, vfs_context_t ctx)
12836 {
12837 int error = 0;
12838 boolean_t is64bit;
12839 u_int size;
12840 #define STK_PARAMS 128
12841 char stkbuf[STK_PARAMS] = {0};
12842 caddr_t data, memp;
12843 vnode_t vp = *arg_vp;
12844
12845 if (vp->v_type == VCHR || vp->v_type == VBLK) {
12846 return ENOTTY;
12847 }
12848
12849 cmd = fsctl_bogus_command_compat(cmd);
12850
12851 size = IOCPARM_LEN(cmd);
12852 if (size > IOCPARM_MAX) {
12853 return EINVAL;
12854 }
12855
12856 is64bit = proc_is64bit(p);
12857
12858 memp = NULL;
12859
12860 if (size > sizeof(stkbuf)) {
12861 if ((memp = (caddr_t)kalloc_data(size, Z_WAITOK)) == 0) {
12862 return ENOMEM;
12863 }
12864 data = memp;
12865 } else {
12866 data = &stkbuf[0];
12867 };
12868
12869 if (cmd & IOC_IN) {
12870 if (size) {
12871 error = copyin(udata, data, size);
12872 if (error) {
12873 if (memp) {
12874 kfree_data(memp, size);
12875 }
12876 return error;
12877 }
12878 } else {
12879 if (is64bit) {
12880 *(user_addr_t *)data = udata;
12881 } else {
12882 *(uint32_t *)data = (uint32_t)udata;
12883 }
12884 };
12885 } else if ((cmd & IOC_OUT) && size) {
12886 /*
12887 * Zero the buffer so the user always
12888 * gets back something deterministic.
12889 */
12890 bzero(data, size);
12891 } else if (cmd & IOC_VOID) {
12892 if (is64bit) {
12893 *(user_addr_t *)data = udata;
12894 } else {
12895 *(uint32_t *)data = (uint32_t)udata;
12896 }
12897 }
12898
12899 /* Check to see if it's a generic command */
12900 switch (cmd) {
12901 case FSIOC_SYNC_VOLUME:
12902 error = handle_sync_volume(vp, arg_vp, data, ctx);
12903 break;
12904
12905 case FSIOC_ROUTEFS_SETROUTEID:
12906 #if ROUTEFS
12907 error = handle_routes(udata);
12908 #endif
12909 break;
12910
12911 case FSIOC_SET_PACKAGE_EXTS: {
12912 user_addr_t ext_strings;
12913 uint32_t num_entries;
12914 uint32_t max_width;
12915
12916 if (!IOTaskHasEntitlement(vfs_context_task(ctx),
12917 SET_PACKAGE_EXTENSION_ENTITLEMENT)) {
12918 error = EPERM;
12919 break;
12920 }
12921
12922 if ((is64bit && size != sizeof(user64_package_ext_info))
12923 || (is64bit == 0 && size != sizeof(user32_package_ext_info))) {
12924 // either you're 64-bit and passed a 64-bit struct or
12925 // you're 32-bit and passed a 32-bit struct. otherwise
12926 // it's not ok.
12927 error = EINVAL;
12928 break;
12929 }
12930
12931 if (is64bit) {
12932 if (sizeof(user64_addr_t) > sizeof(user_addr_t)) {
12933 assert(((user64_package_ext_info *)data)->strings <= UINT32_MAX);
12934 }
12935 ext_strings = (user_addr_t)((user64_package_ext_info *)data)->strings;
12936 num_entries = ((user64_package_ext_info *)data)->num_entries;
12937 max_width = ((user64_package_ext_info *)data)->max_width;
12938 } else {
12939 ext_strings = CAST_USER_ADDR_T(((user32_package_ext_info *)data)->strings);
12940 num_entries = ((user32_package_ext_info *)data)->num_entries;
12941 max_width = ((user32_package_ext_info *)data)->max_width;
12942 }
12943 error = set_package_extensions_table(ext_strings, num_entries, max_width);
12944 }
12945 break;
12946
12947 case FSIOC_SET_FSTYPENAME_OVERRIDE:
12948 {
12949 mount_t mp;
12950
12951 if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
12952 break;
12953 }
12954 if ((mp = vp->v_mount) != NULL) {
12955 mount_lock(mp);
12956 if (data[0] != 0) {
12957 for (int i = 0; i < MFSTYPENAMELEN; i++) {
12958 if (!data[i]) {
12959 goto continue_copy;
12960 }
12961 }
12962 /*
12963 * Getting here means we have a user data
12964 * string which has no NULL termination in
12965 * its first MFSTYPENAMELEN bytes. This is
12966 * bogus, let's avoid strlcpy-ing the read
12967 * data and return an error.
12968 */
12969 error = EINVAL;
12970 goto unlock;
12971 continue_copy:
12972 vfs_setfstypename_locked(mp, data);
12973 if (vfs_isrdonly(mp) &&
12974 strcmp(data, "mtmfs") == 0) {
12975 mp->mnt_kern_flag |=
12976 MNTK_EXTENDED_SECURITY;
12977 mp->mnt_kern_flag &=
12978 ~MNTK_AUTH_OPAQUE;
12979 }
12980 } else if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
12981 const char *name =
12982 vfs_getfstypenameref_locked(mp, NULL);
12983 if (strcmp(name, "mtmfs") == 0) {
12984 mp->mnt_kern_flag &=
12985 ~MNTK_EXTENDED_SECURITY;
12986 }
12987 vfs_setfstypename_locked(mp, NULL);
12988 }
12989 unlock:
12990 mount_unlock(mp);
12991 }
12992 }
12993 break;
12994
12995 case DISK_CONDITIONER_IOC_GET: {
12996 error = disk_conditioner_get_info(vp->v_mount, (disk_conditioner_info *)data);
12997 }
12998 break;
12999
13000 case DISK_CONDITIONER_IOC_SET: {
13001 error = disk_conditioner_set_info(vp->v_mount, (disk_conditioner_info *)data);
13002 }
13003 break;
13004
13005 case FSIOC_CAS_BSDFLAGS:
13006 error = handle_flags(vp, data, ctx);
13007 break;
13008
13009 case FSIOC_FD_ONLY_OPEN_ONCE: {
13010 error = 0;
13011 if (vnode_usecount(vp) > 1) {
13012 vnode_lock_spin(vp);
13013 if (vp->v_lflag & VL_HASSTREAMS) {
13014 if (vnode_isinuse_locked(vp, 1, 1)) {
13015 error = EBUSY;
13016 }
13017 } else if (vnode_usecount(vp) > 1) {
13018 error = EBUSY;
13019 }
13020 vnode_unlock(vp);
13021 }
13022 }
13023 break;
13024
13025 case FSIOC_EVAL_ROOTAUTH:
13026 error = handle_auth(vp, cmd, data, options, ctx);
13027 break;
13028
13029 case FSIOC_TEST_FSE_ACCESS_GRANTED:
13030 error = test_fse_access_granted(vp, (unsigned long)udata, ctx);
13031 break;
13032
13033 #if CONFIG_EXCLAVES
13034 case FSIOC_EXCLAVE_FS_REGISTER:
13035 if (IOTaskHasEntitlement(vfs_context_task(ctx), EXCLAVE_FS_REGISTER_ENTITLEMENT)) {
13036 error = vfs_exclave_fs_register(((fsioc_exclave_fs_register_t *)data)->fs_tag, vp);
13037 } else {
13038 error = EPERM;
13039 }
13040 break;
13041
13042 case FSIOC_EXCLAVE_FS_UNREGISTER:
13043 if (IOTaskHasEntitlement(vfs_context_task(ctx), EXCLAVE_FS_REGISTER_ENTITLEMENT)) {
13044 error = vfs_exclave_fs_unregister(vp);
13045 } else {
13046 error = EPERM;
13047 }
13048 break;
13049
13050 case FSIOC_EXCLAVE_FS_GET_BASE_DIRS: {
13051 exclave_fs_get_base_dirs_t *get_base_dirs = ((exclave_fs_get_base_dirs_t *)data);
13052 exclave_fs_base_dir_t *dirs = NULL;
13053 if (!IOTaskHasEntitlement(vfs_context_task(ctx), EXCLAVE_FS_REGISTER_ENTITLEMENT)) {
13054 error = EPERM;
13055 break;
13056 }
13057 if (get_base_dirs->base_dirs) {
13058 if ((get_base_dirs->count == 0) || (get_base_dirs->count > EXCLAVE_FS_GET_BASE_DIRS_MAX_COUNT)) {
13059 error = EINVAL;
13060 break;
13061 }
13062 dirs = kalloc_type(exclave_fs_base_dir_t, get_base_dirs->count, Z_WAITOK | Z_ZERO);
13063 if (!dirs) {
13064 error = ENOSPC;
13065 break;
13066 }
13067 }
13068 error = vfs_exclave_fs_get_base_dirs(dirs, &get_base_dirs->count);
13069 if (!error && dirs) {
13070 error = copyout(dirs, (user_addr_t)get_base_dirs->base_dirs,
13071 get_base_dirs->count * sizeof(exclave_fs_base_dir_t));
13072 }
13073 if (dirs) {
13074 kfree_type(exclave_fs_base_dir_t, get_base_dirs->count, dirs);
13075 }
13076 }
13077 break;
13078 #endif
13079
13080 default: {
13081 /*
13082 * Other, known commands shouldn't be passed down here.
13083 * (When adding a selector to this list, it may be prudent
13084 * to consider adding it to the list in sys_fcntl_nocancel() as well.)
13085 */
13086 switch (cmd) {
13087 case F_PUNCHHOLE:
13088 case F_TRIM_ACTIVE_FILE:
13089 case F_RDADVISE:
13090 case F_TRANSCODEKEY:
13091 case F_GETPROTECTIONLEVEL:
13092 case F_GETDEFAULTPROTLEVEL:
13093 case F_MAKECOMPRESSED:
13094 case F_SET_GREEDY_MODE:
13095 case F_SETSTATICCONTENT:
13096 case F_SETIOTYPE:
13097 case F_SETBACKINGSTORE:
13098 case F_GETPATH_MTMINFO:
13099 case APFSIOC_REVERT_TO_SNAPSHOT:
13100 case FSIOC_FIOSEEKHOLE:
13101 case FSIOC_FIOSEEKDATA:
13102 case HFS_GET_BOOT_INFO:
13103 case HFS_SET_BOOT_INFO:
13104 case FIOPINSWAP:
13105 case F_CHKCLEAN:
13106 case F_FULLFSYNC:
13107 case F_BARRIERFSYNC:
13108 case F_FREEZE_FS:
13109 case F_THAW_FS:
13110 case FSIOC_KERNEL_ROOTAUTH:
13111 case FSIOC_GRAFT_FS:
13112 case FSIOC_UNGRAFT_FS:
13113 case FSIOC_AUTH_FS:
13114 case F_SPECULATIVE_READ:
13115 case F_ATTRIBUTION_TAG:
13116 case F_TRANSFEREXTENTS:
13117 case F_ASSERT_BG_ACCESS:
13118 case F_RELEASE_BG_ACCESS:
13119 error = EINVAL;
13120 goto outdrop;
13121 }
13122 /* Invoke the filesystem-specific code */
13123 error = VNOP_IOCTL(vp, cmd, data, (int)options, ctx);
13124 }
13125 } /* end switch stmt */
13126
13127 /*
13128 * if no errors, copy any data to user. Size was
13129 * already set and checked above.
13130 */
13131 if (error == 0 && (cmd & IOC_OUT) && size) {
13132 error = copyout(data, udata, size);
13133 }
13134
13135 outdrop:
13136 if (memp) {
13137 kfree_data(memp, size);
13138 }
13139
13140 return error;
13141 }
13142
/* ARGSUSED */
/*
 * fsctl: path-based filesystem control system call.
 *
 * Looks up the vnode named by uap->path (honoring FSOPT_NOFOLLOW and a
 * firmlink-control special case), runs the MAC fsctl check, and hands
 * the command to fsctl_internal().
 *
 * Returns 0 on success or an errno value.
 */
int
fsctl(proc_t p, struct fsctl_args *uap, __unused int32_t *retval)
{
	int error;
	struct nameidata nd;
	uint32_t nameiflags;
	vnode_t vp = NULL;
	vfs_context_t ctx = vfs_context_current();

	AUDIT_ARG(cmd, (int)uap->cmd);
	AUDIT_ARG(value32, uap->options);
	/* Get the vnode for the file we are getting info on: */
	nameiflags = 0;
	//
	// if we come through fsctl() then the file is by definition not open.
	// therefore for the FSIOC_FD_ONLY_OPEN_ONCE selector we return an error
	// lest the caller mistakenly thinks the only open is their own (but in
	// reality it's someone elses).
	//
	if (uap->cmd == FSIOC_FD_ONLY_OPEN_ONCE) {
		return EINVAL;
	}
	if ((uap->options & FSOPT_NOFOLLOW) == 0) {
		nameiflags |= FOLLOW;
	}
	if (uap->cmd == FSIOC_FIRMLINK_CTL) {
		/* operate on the firmlink itself, not on what it points at */
		nameiflags |= (CN_FIRMLINK_NOFOLLOW | NOCACHE);
	}
	NDINIT(&nd, LOOKUP, OP_FSCTL, nameiflags | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	if ((error = namei(&nd))) {
		goto done;
	}
	vp = nd.ni_vp;
	nameidone(&nd);

#if CONFIG_MACF
	error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd);
	if (error) {
		goto done;
	}
#endif

	error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);

done:
	/* fsctl_internal() may have dropped the iocount and cleared vp */
	if (vp) {
		vnode_put(vp);
	}
	return error;
}
/* ARGSUSED */
/*
 * ffsctl: fd-based filesystem control system call.
 *
 * Resolves uap->fd to a vnode, takes an iocount, runs the MAC fsctl
 * check, and forwards the command to fsctl_internal().  fsctl_internal()
 * may itself drop the iocount and NULL out vp, so vp is re-checked
 * before the final vnode_put().
 *
 * Returns 0 on success or an errno value.
 */
int
ffsctl(proc_t p, struct ffsctl_args *uap, __unused int32_t *retval)
{
	int error;
	vnode_t vp = NULL;
	vfs_context_t ctx = vfs_context_current();
	int fd = -1;

	AUDIT_ARG(fd, uap->fd);
	AUDIT_ARG(cmd, (int)uap->cmd);
	AUDIT_ARG(value32, uap->options);

	/* Get the vnode for the file we are getting info on: */
	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}
	fd = uap->fd;
	if ((error = vnode_getwithref(vp))) {
		/* drop the fileglob reference taken by file_vnode() */
		file_drop(fd);
		return error;
	}

#if CONFIG_MACF
	if ((error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd))) {
		file_drop(fd);
		vnode_put(vp);
		return error;
	}
#endif

	error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);

	file_drop(fd);

	/*validate vp; fsctl_internal() can drop iocount and reset vp to NULL*/
	if (vp) {
		vnode_put(vp);
	}

	return error;
}
13237 /* end of fsctl system call */
13238
13239 #define FILESEC_ACCESS_ENTITLEMENT \
13240 "com.apple.private.vfs.filesec-access"
13241
13242 static int
xattr_entitlement_check(const char * attrname,vfs_context_t ctx,bool setting)13243 xattr_entitlement_check(const char *attrname, vfs_context_t ctx, bool setting)
13244 {
13245 if (strcmp(attrname, KAUTH_FILESEC_XATTR) == 0) {
13246 /*
13247 * get: root and tasks with FILESEC_ACCESS_ENTITLEMENT.
13248 * set: only tasks with FILESEC_ACCESS_ENTITLEMENT.
13249 */
13250 if ((!setting && vfs_context_issuser(ctx)) ||
13251 IOTaskHasEntitlement(vfs_context_task(ctx),
13252 FILESEC_ACCESS_ENTITLEMENT)) {
13253 return 0;
13254 }
13255 }
13256
13257 return EPERM;
13258 }
13259
/*
 * Retrieve the data of an extended attribute.
 *
 * Path-based getxattr(2): looks up uap->path, validates the attribute
 * name, and either reads the attribute value into uap->value or — when
 * no buffer is supplied — reports only the attribute's size via *retval.
 */
int
getxattr(proc_t p, struct getxattr_args *uap, user_ssize_t *retval)
{
	vnode_t vp;
	struct nameidata nd;
	char attrname[XATTR_MAXNAMELEN + 1];
	vfs_context_t ctx = vfs_context_current();
	uio_t auio = NULL;
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	size_t attrsize = 0;
	size_t namelen;
	u_int32_t nameiflags;
	int error;
	UIO_STACKBUF(uio_buf, 1);

	/* XATTR_NOSECURITY / XATTR_NODEFAULT are kernel-internal options */
	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_GETXATTR, nameiflags, spacetype, uap->path, ctx);
	if (uap->options & XATTR_NOFOLLOW_ANY) {
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}

	if ((error = namei(&nd))) {
		return error;
	}
	vp = nd.ni_vp;
	nameidone(&nd);

	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
	if (error != 0) {
		goto out;
	}
	/* protected attributes require root or an entitlement to read */
	if (xattr_protected(attrname) &&
	    (error = xattr_entitlement_check(attrname, ctx, false)) != 0) {
		goto out;
	}
	/*
	 * the specific check for 0xffffffff is a hack to preserve
	 * binaray compatibilty in K64 with applications that discovered
	 * that passing in a buf pointer and a size of -1 resulted in
	 * just the size of the indicated extended attribute being returned.
	 * this isn't part of the documented behavior, but because of the
	 * original implemtation's check for "uap->size > 0", this behavior
	 * was allowed. In K32 that check turned into a signed comparison
	 * even though uap->size is unsigned... in K64, we blow by that
	 * check because uap->size is unsigned and doesn't get sign smeared
	 * in the munger for a 32 bit user app. we also need to add a
	 * check to limit the maximum size of the buffer being passed in...
	 * unfortunately, the underlying fileystems seem to just malloc
	 * the requested size even if the actual extended attribute is tiny.
	 * because that malloc is for kernel wired memory, we have to put a
	 * sane limit on it.
	 *
	 * U32 running on K64 will yield 0x00000000ffffffff for uap->size
	 * U64 running on K64 will yield -1 (64 bits wide)
	 * U32/U64 running on K32 will yield -1 (32 bits wide)
	 */
	if (uap->size == 0xffffffff || uap->size == (size_t)-1) {
		/* size-only query: leave auio NULL */
		goto no_uio;
	}

	if (uap->value) {
		/* clamp to keep the wired-memory allocation below sane */
		if (uap->size > (size_t)XATTR_MAXSIZE) {
			uap->size = XATTR_MAXSIZE;
		}

		auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
		    &uio_buf[0], sizeof(uio_buf));
		uio_addiov(auio, uap->value, uap->size);
	}
no_uio:
	error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, ctx);
out:
	vnode_put(vp);

	/* with a buffer: bytes copied; without: the attribute's size */
	if (auio) {
		*retval = uap->size - uio_resid(auio);
	} else {
		*retval = (user_ssize_t)attrsize;
	}

	return error;
}
13349
13350 /*
13351 * Retrieve the data of an extended attribute.
13352 */
13353 int
fgetxattr(proc_t p,struct fgetxattr_args * uap,user_ssize_t * retval)13354 fgetxattr(proc_t p, struct fgetxattr_args *uap, user_ssize_t *retval)
13355 {
13356 vnode_t vp;
13357 char attrname[XATTR_MAXNAMELEN + 1];
13358 vfs_context_t ctx = vfs_context_current();
13359 uio_t auio = NULL;
13360 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
13361 size_t attrsize = 0;
13362 size_t namelen;
13363 int error;
13364 UIO_STACKBUF(uio_buf, 1);
13365
13366 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT |
13367 XATTR_NOFOLLOW_ANY)) {
13368 return EINVAL;
13369 }
13370
13371 if ((error = file_vnode(uap->fd, &vp))) {
13372 return error;
13373 }
13374 if ((error = vnode_getwithref(vp))) {
13375 file_drop(uap->fd);
13376 return error;
13377 }
13378 error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
13379 if (error != 0) {
13380 goto out;
13381 }
13382 if (xattr_protected(attrname) &&
13383 (error = xattr_entitlement_check(attrname, ctx, false)) != 0) {
13384 goto out;
13385 }
13386 if (uap->value && uap->size > 0) {
13387 if (uap->size > (size_t)XATTR_MAXSIZE) {
13388 uap->size = XATTR_MAXSIZE;
13389 }
13390
13391 auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
13392 &uio_buf[0], sizeof(uio_buf));
13393 uio_addiov(auio, uap->value, uap->size);
13394 }
13395
13396 error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, vfs_context_current());
13397 out:
13398 (void)vnode_put(vp);
13399 file_drop(uap->fd);
13400
13401 if (auio) {
13402 *retval = uap->size - uio_resid(auio);
13403 } else {
13404 *retval = (user_ssize_t)attrsize;
13405 }
13406 return error;
13407 }
13408
/*
 * Heap-allocated scratch context for setxattr(): keeps the large
 * nameidata, the attribute-name buffer, and the uio backing store off
 * the kernel stack.  (The old "checkdirs iteration" comment here was
 * stale — this struct is used only by setxattr().)
 */
struct setxattr_ctx {
	struct nameidata nd;
	char attrname[XATTR_MAXNAMELEN + 1];
	UIO_STACKBUF(uio_buf, 1);
};
13415
/*
 * Set the data of an extended attribute.
 *
 * Path-based setxattr(2): copies in and validates the attribute name,
 * looks up uap->path (breaking any parent-directory lease when file
 * leases are configured), writes the value via vn_setxattr(), and posts
 * an FSE_XATTR_MODIFIED fsevent on success.
 */
int
setxattr(proc_t p, struct setxattr_args *uap, int *retval)
{
	vnode_t vp;
	vfs_context_t ctx = vfs_context_current();
	uio_t auio = NULL;
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	size_t namelen;
	u_int32_t nameiflags;
	int error;
	struct setxattr_ctx *sactx;

	/* XATTR_NOSECURITY / XATTR_NODEFAULT are kernel-internal options */
	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	/* heap-allocate the large nameidata/uio scratch to save stack */
	sactx = kalloc_type(struct setxattr_ctx, Z_WAITOK);
	if (sactx == NULL) {
		return ENOMEM;
	}

	error = copyinstr(uap->attrname, sactx->attrname, sizeof(sactx->attrname), &namelen);
	if (error != 0) {
		if (error == EPERM) {
			/* if the string won't fit in attrname, copyinstr emits EPERM */
			error = ENAMETOOLONG;
		}
		/* Otherwise return the default error from copyinstr to detect ERANGE, etc */
		goto out;
	}
	if (xattr_protected(sactx->attrname) &&
	    (error = xattr_entitlement_check(sactx->attrname, ctx, true)) != 0) {
		goto out;
	}
	/* a non-zero size demands a value buffer */
	if (uap->size != 0 && uap->value == 0) {
		error = EINVAL;
		goto out;
	}
	if (uap->size > INT_MAX) {
		error = E2BIG;
		goto out;
	}

	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
#if CONFIG_FILE_LEASES
	/* also grab the parent so its directory lease can be broken */
	nameiflags |= WANTPARENT;
#endif
	NDINIT(&sactx->nd, LOOKUP, OP_SETXATTR, nameiflags, spacetype, uap->path, ctx);
	if (uap->options & XATTR_NOFOLLOW_ANY) {
		sactx->nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}

	if ((error = namei(&sactx->nd))) {
		goto out;
	}
	vp = sactx->nd.ni_vp;
#if CONFIG_FILE_LEASES
	vnode_breakdirlease(sactx->nd.ni_dvp, false, O_WRONLY);
	vnode_put(sactx->nd.ni_dvp);
#endif
	nameidone(&sactx->nd);

	auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
	    &sactx->uio_buf[0], sizeof(sactx->uio_buf));
	uio_addiov(auio, uap->value, uap->size);

	error = vn_setxattr(vp, sactx->attrname, auio, uap->options, ctx);
#if CONFIG_FSE
	if (error == 0) {
		add_fsevent(FSE_XATTR_MODIFIED, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
	}
#endif
	vnode_put(vp);
out:
	kfree_type(struct setxattr_ctx, sactx);
	*retval = 0;
	return error;
}
13499
13500 /*
13501 * Set the data of an extended attribute.
13502 */
13503 int
fsetxattr(proc_t p,struct fsetxattr_args * uap,int * retval)13504 fsetxattr(proc_t p, struct fsetxattr_args *uap, int *retval)
13505 {
13506 vnode_t vp;
13507 char attrname[XATTR_MAXNAMELEN + 1];
13508 vfs_context_t ctx = vfs_context_current();
13509 uio_t auio = NULL;
13510 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
13511 size_t namelen;
13512 int error;
13513 UIO_STACKBUF(uio_buf, 1);
13514
13515 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT |
13516 XATTR_NOFOLLOW_ANY)) {
13517 return EINVAL;
13518 }
13519
13520 error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
13521 if (error != 0) {
13522 if (error == EPERM) {
13523 /* if the string won't fit in attrname, copyinstr emits EPERM */
13524 return ENAMETOOLONG;
13525 }
13526 /* Otherwise return the default error from copyinstr to detect ERANGE, etc */
13527 return error;
13528 }
13529 if (xattr_protected(attrname) &&
13530 (error = xattr_entitlement_check(attrname, ctx, true)) != 0) {
13531 return error;
13532 }
13533 if (uap->size != 0 && uap->value == 0) {
13534 return EINVAL;
13535 }
13536 if (uap->size > INT_MAX) {
13537 return E2BIG;
13538 }
13539 if ((error = file_vnode(uap->fd, &vp))) {
13540 return error;
13541 }
13542 if ((error = vnode_getwithref(vp))) {
13543 file_drop(uap->fd);
13544 return error;
13545 }
13546
13547 #if CONFIG_FILE_LEASES
13548 vnode_breakdirlease(vp, true, O_WRONLY);
13549 #endif
13550
13551 auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
13552 &uio_buf[0], sizeof(uio_buf));
13553 uio_addiov(auio, uap->value, uap->size);
13554
13555 error = vn_setxattr(vp, attrname, auio, uap->options, vfs_context_current());
13556 #if CONFIG_FSE
13557 if (error == 0) {
13558 add_fsevent(FSE_XATTR_MODIFIED, ctx,
13559 FSE_ARG_VNODE, vp,
13560 FSE_ARG_DONE);
13561 }
13562 #endif
13563 vnode_put(vp);
13564 file_drop(uap->fd);
13565 *retval = 0;
13566 return error;
13567 }
13568
/*
 * Remove an extended attribute.
 * XXX Code duplication here.
 *
 * Path-based removexattr(2): validates the attribute name, looks up
 * uap->path (breaking any parent-directory lease when file leases are
 * configured), removes the attribute via vn_removexattr(), and posts an
 * FSE_XATTR_REMOVED fsevent on success.
 */
int
removexattr(proc_t p, struct removexattr_args *uap, int *retval)
{
	vnode_t vp;
	struct nameidata nd;
	char attrname[XATTR_MAXNAMELEN + 1];
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	vfs_context_t ctx = vfs_context_current();
	size_t namelen;
	u_int32_t nameiflags;
	int error;

	/* XATTR_NOSECURITY / XATTR_NODEFAULT are kernel-internal options */
	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
	if (error != 0) {
		return error;
	}
	/* protected attributes can never be removed from user space */
	if (xattr_protected(attrname)) {
		return EPERM;
	}
	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
#if CONFIG_FILE_LEASES
	/* also grab the parent so its directory lease can be broken */
	nameiflags |= WANTPARENT;
#endif
	NDINIT(&nd, LOOKUP, OP_REMOVEXATTR, nameiflags, spacetype, uap->path, ctx);
	if (uap->options & XATTR_NOFOLLOW_ANY) {
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}

	if ((error = namei(&nd))) {
		return error;
	}
	vp = nd.ni_vp;
#if CONFIG_FILE_LEASES
	vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
	vnode_put(nd.ni_dvp);
#endif
	nameidone(&nd);

	error = vn_removexattr(vp, attrname, uap->options, ctx);
#if CONFIG_FSE
	if (error == 0) {
		add_fsevent(FSE_XATTR_REMOVED, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
	}
#endif
	vnode_put(vp);
	*retval = 0;
	return error;
}
13627
/*
 * Remove an extended attribute.
 * XXX Code duplication here.
 *
 * fd-based variant of removexattr(2); link-following options are
 * meaningless for an open fd and are rejected.  Posts an
 * FSE_XATTR_REMOVED fsevent on success.
 */
int
fremovexattr(__unused proc_t p, struct fremovexattr_args *uap, int *retval)
{
	vnode_t vp;
	char attrname[XATTR_MAXNAMELEN + 1];
	size_t namelen;
	int error;
#if CONFIG_FSE
	/* only needed for the fsevent below */
	vfs_context_t ctx = vfs_context_current();
#endif

	/* path-resolution options make no sense for an open fd */
	if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT |
	    XATTR_NOFOLLOW_ANY)) {
		return EINVAL;
	}

	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
	if (error != 0) {
		return error;
	}
	/* protected attributes can never be removed from user space */
	if (xattr_protected(attrname)) {
		return EPERM;
	}
	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(vp, true, O_WRONLY);
#endif

	error = vn_removexattr(vp, attrname, uap->options, vfs_context_current());
#if CONFIG_FSE
	if (error == 0) {
		add_fsevent(FSE_XATTR_REMOVED, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
	}
#endif
	vnode_put(vp);
	file_drop(uap->fd);
	*retval = 0;
	return error;
}
13680
/*
 * Retrieve the list of extended attribute names.
 * XXX Code duplication here.
 *
 * Path-based listxattr(2): with a user buffer, fills it with the
 * NUL-separated attribute names and returns the bytes copied via
 * *retval; without one, returns the total size needed.
 */
int
listxattr(proc_t p, struct listxattr_args *uap, user_ssize_t *retval)
{
	vnode_t vp;
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();
	uio_t auio = NULL;
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	size_t attrsize = 0;
	u_int32_t nameiflags;
	int error;
	UIO_STACKBUF(uio_buf, 1);

	/* XATTR_NOSECURITY / XATTR_NODEFAULT are kernel-internal options */
	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_LISTXATTR, nameiflags, spacetype, uap->path, ctx);
	if (uap->options & XATTR_NOFOLLOW_ANY) {
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}

	if ((error = namei(&nd))) {
		return error;
	}
	vp = nd.ni_vp;
	nameidone(&nd);
	if (uap->namebuf != 0 && uap->bufsize > 0) {
		auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ,
		    &uio_buf[0], sizeof(uio_buf));
		uio_addiov(auio, uap->namebuf, uap->bufsize);
	}

	error = vn_listxattr(vp, auio, &attrsize, uap->options, ctx);

	vnode_put(vp);
	/* with a buffer: bytes copied; without: total size required */
	if (auio) {
		*retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
	} else {
		*retval = (user_ssize_t)attrsize;
	}
	return error;
}
13729
13730 /*
13731 * Retrieve the list of extended attribute names.
13732 * XXX Code duplication here.
13733 */
13734 int
flistxattr(proc_t p,struct flistxattr_args * uap,user_ssize_t * retval)13735 flistxattr(proc_t p, struct flistxattr_args *uap, user_ssize_t *retval)
13736 {
13737 vnode_t vp;
13738 uio_t auio = NULL;
13739 int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
13740 size_t attrsize = 0;
13741 int error;
13742 UIO_STACKBUF(uio_buf, 1);
13743
13744 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT |
13745 XATTR_NOFOLLOW_ANY)) {
13746 return EINVAL;
13747 }
13748
13749 if ((error = file_vnode(uap->fd, &vp))) {
13750 return error;
13751 }
13752 if ((error = vnode_getwithref(vp))) {
13753 file_drop(uap->fd);
13754 return error;
13755 }
13756 if (uap->namebuf != 0 && uap->bufsize > 0) {
13757 auio = uio_createwithbuffer(1, 0, spacetype,
13758 UIO_READ, &uio_buf[0], sizeof(uio_buf));
13759 uio_addiov(auio, uap->namebuf, uap->bufsize);
13760 }
13761
13762 error = vn_listxattr(vp, auio, &attrsize, uap->options, vfs_context_current());
13763
13764 vnode_put(vp);
13765 file_drop(uap->fd);
13766 if (auio) {
13767 *retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
13768 } else {
13769 *retval = (user_ssize_t)attrsize;
13770 }
13771 return error;
13772 }
13773
13774 int
fsgetpath_internal(vfs_context_t ctx,int volfs_id,uint64_t objid,vm_size_t bufsize,caddr_t buf,uint32_t options,int * pathlen)13775 fsgetpath_internal(vfs_context_t ctx, int volfs_id, uint64_t objid,
13776 vm_size_t bufsize, caddr_t buf, uint32_t options, int *pathlen)
13777 {
13778 int error;
13779 struct mount *mp = NULL;
13780 vnode_t vp;
13781 int length;
13782 int bpflags;
13783 /* maximum number of times to retry build_path */
13784 unsigned int retries = 0x10;
13785
13786 if (bufsize > FSGETPATH_MAXBUFLEN) {
13787 return EINVAL;
13788 }
13789
13790 if (buf == NULL) {
13791 return ENOMEM;
13792 }
13793
13794 retry:
13795 if ((mp = mount_lookupby_volfsid(volfs_id, 1)) == NULL) {
13796 error = ENOTSUP; /* unexpected failure */
13797 return ENOTSUP;
13798 }
13799
13800 #if CONFIG_UNION_MOUNTS
13801 unionget:
13802 #endif /* CONFIG_UNION_MOUNTS */
13803 if (objid == 2) {
13804 struct vfs_attr vfsattr;
13805 int use_vfs_root = TRUE;
13806
13807 VFSATTR_INIT(&vfsattr);
13808 VFSATTR_WANTED(&vfsattr, f_capabilities);
13809 if (!(options & FSOPT_ISREALFSID) &&
13810 vfs_getattr(mp, &vfsattr, vfs_context_kernel()) == 0 &&
13811 VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
13812 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS) &&
13813 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS)) {
13814 use_vfs_root = FALSE;
13815 }
13816 }
13817
13818 if (use_vfs_root) {
13819 error = VFS_ROOT(mp, &vp, ctx);
13820 } else {
13821 error = VFS_VGET(mp, objid, &vp, ctx);
13822 }
13823 } else {
13824 error = VFS_VGET(mp, (ino64_t)objid, &vp, ctx);
13825 }
13826
13827 #if CONFIG_UNION_MOUNTS
13828 if (error == ENOENT && (mp->mnt_flag & MNT_UNION)) {
13829 /*
13830 * If the fileid isn't found and we're in a union
13831 * mount volume, then see if the fileid is in the
13832 * mounted-on volume.
13833 */
13834 struct mount *tmp = mp;
13835 mp = vnode_mount(tmp->mnt_vnodecovered);
13836 vfs_unbusy(tmp);
13837 if (vfs_busy(mp, LK_NOWAIT) == 0) {
13838 goto unionget;
13839 }
13840 } else {
13841 vfs_unbusy(mp);
13842 }
13843 #else
13844 vfs_unbusy(mp);
13845 #endif /* CONFIG_UNION_MOUNTS */
13846
13847 if (error) {
13848 return error;
13849 }
13850
13851 #if CONFIG_MACF
13852 error = mac_vnode_check_fsgetpath(ctx, vp);
13853 if (error) {
13854 vnode_put(vp);
13855 return error;
13856 }
13857 #endif
13858
13859 /* Obtain the absolute path to this vnode. */
13860 bpflags = vfs_context_suser(ctx) ? BUILDPATH_CHECKACCESS : 0;
13861 if (options & FSOPT_NOFIRMLINKPATH) {
13862 bpflags |= BUILDPATH_NO_FIRMLINK;
13863 }
13864 bpflags |= BUILDPATH_CHECK_MOVED;
13865 error = build_path(vp, buf, (int)bufsize, &length, bpflags, ctx);
13866 vnode_put(vp);
13867
13868 if (error) {
13869 /* there was a race building the path, try a few more times */
13870 if (error == EAGAIN) {
13871 --retries;
13872 if (retries > 0) {
13873 goto retry;
13874 }
13875
13876 error = ENOENT;
13877 }
13878 goto out;
13879 }
13880
13881 AUDIT_ARG(text, buf);
13882
13883 if (kdebug_debugid_enabled(VFS_LOOKUP) && length > 0) {
13884 unsigned long path_words[NUMPARMS];
13885 size_t path_len = sizeof(path_words);
13886
13887 if ((size_t)length < path_len) {
13888 memcpy((char *)path_words, buf, length);
13889 memset((char *)path_words + length, 0, path_len - length);
13890
13891 path_len = length;
13892 } else {
13893 memcpy((char *)path_words, buf + (length - path_len), path_len);
13894 }
13895
13896 kdebug_vfs_lookup(path_words, (int)path_len, vp,
13897 KDBG_VFS_LOOKUP_FLAG_LOOKUP);
13898 }
13899
13900 *pathlen = length; /* may be superseded by error */
13901
13902 out:
13903 return error;
13904 }
13905
/*
 * Obtain the full pathname of a file system object by id.
 *
 * Common backend for fsgetpath(2) and fsgetpath_ext(2): copies in the
 * fsid, allocates a kernel path buffer, resolves the path via
 * fsgetpath_internal(), and copies the result out to 'buf'.  On
 * success *retval holds the path length (including the NUL).
 */
static int
fsgetpath_extended(user_addr_t buf, user_size_t bufsize, user_addr_t user_fsid, uint64_t objid,
    uint32_t options, user_ssize_t *retval)
{
	vfs_context_t ctx = vfs_context_current();
	fsid_t fsid;
	char *realpath;
	int length;
	int error;

	/* only these two option bits are defined for fsgetpath_ext() */
	if (options & ~(FSOPT_NOFIRMLINKPATH | FSOPT_ISREALFSID)) {
		return EINVAL;
	}

	if ((error = copyin(user_fsid, (caddr_t)&fsid, sizeof(fsid)))) {
		return error;
	}
	AUDIT_ARG(value32, fsid.val[0]);
	AUDIT_ARG(value64, objid);
	/* Restrict output buffer size for now. */

	if (bufsize > FSGETPATH_MAXBUFLEN || bufsize <= 0) {
		return EINVAL;
	}
	realpath = kalloc_data(bufsize, Z_WAITOK | Z_ZERO);
	if (realpath == NULL) {
		return ENOMEM;
	}

	error = fsgetpath_internal(ctx, fsid.val[0], objid, bufsize, realpath,
	    options, &length);

	if (error) {
		goto out;
	}

	error = copyout((caddr_t)realpath, buf, length);

	*retval = (user_ssize_t)length; /* may be superseded by error */
out:
	kfree_data(realpath, bufsize);
	return error;
}
13952
13953 int
fsgetpath(__unused proc_t p,struct fsgetpath_args * uap,user_ssize_t * retval)13954 fsgetpath(__unused proc_t p, struct fsgetpath_args *uap, user_ssize_t *retval)
13955 {
13956 return fsgetpath_extended(uap->buf, uap->bufsize, uap->fsid, uap->objid,
13957 0, retval);
13958 }
13959
/*
 * fsgetpath_ext(2): extended entry point — forwards the caller's
 * option flags (FSOPT_NOFIRMLINKPATH / FSOPT_ISREALFSID) to
 * fsgetpath_extended().
 */
int
fsgetpath_ext(__unused proc_t p, struct fsgetpath_ext_args *uap, user_ssize_t *retval)
{
	return fsgetpath_extended(uap->buf, uap->bufsize, uap->fsid, uap->objid,
	           uap->options, retval);
}
13966
13967 /*
13968 * Common routine to handle various flavors of statfs data heading out
13969 * to user space.
13970 *
13971 * Returns: 0 Success
13972 * EFAULT
13973 */
13974 static int
munge_statfs(struct mount * mp,struct vfsstatfs * sfsp,user_addr_t bufp,int * sizep,boolean_t is_64_bit,boolean_t partial_copy)13975 munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
13976 user_addr_t bufp, int *sizep, boolean_t is_64_bit,
13977 boolean_t partial_copy)
13978 {
13979 int error;
13980 int my_size, copy_size;
13981
13982 if (is_64_bit) {
13983 struct user64_statfs sfs;
13984 my_size = copy_size = sizeof(sfs);
13985 bzero(&sfs, my_size);
13986 sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
13987 sfs.f_type = (short)mp->mnt_vtable->vfc_typenum;
13988 sfs.f_reserved1 = (short)sfsp->f_fssubtype;
13989 sfs.f_bsize = (user64_long_t)sfsp->f_bsize;
13990 sfs.f_iosize = (user64_long_t)sfsp->f_iosize;
13991 sfs.f_blocks = (user64_long_t)sfsp->f_blocks;
13992 sfs.f_bfree = (user64_long_t)sfsp->f_bfree;
13993 sfs.f_bavail = (user64_long_t)sfsp->f_bavail;
13994 sfs.f_files = (user64_long_t)sfsp->f_files;
13995 sfs.f_ffree = (user64_long_t)sfsp->f_ffree;
13996 sfs.f_fsid = sfsp->f_fsid;
13997 sfs.f_owner = sfsp->f_owner;
13998 vfs_getfstypename(mp, sfs.f_fstypename, MFSNAMELEN);
13999 strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
14000 strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);
14001
14002 if (partial_copy) {
14003 copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
14004 }
14005 error = copyout((caddr_t)&sfs, bufp, copy_size);
14006 } else {
14007 struct user32_statfs sfs;
14008
14009 my_size = copy_size = sizeof(sfs);
14010 bzero(&sfs, my_size);
14011
14012 sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
14013 sfs.f_type = (short)mp->mnt_vtable->vfc_typenum;
14014 sfs.f_reserved1 = (short)sfsp->f_fssubtype;
14015
14016 /*
14017 * It's possible for there to be more than 2^^31 blocks in the filesystem, so we
14018 * have to fudge the numbers here in that case. We inflate the blocksize in order
14019 * to reflect the filesystem size as best we can.
14020 */
14021 if ((sfsp->f_blocks > INT_MAX)
14022 /* Hack for 4061702 . I think the real fix is for Carbon to
14023 * look for some volume capability and not depend on hidden
14024 * semantics agreed between a FS and carbon.
14025 * f_blocks, f_bfree, and f_bavail set to -1 is the trigger
14026 * for Carbon to set bNoVolumeSizes volume attribute.
14027 * Without this the webdavfs files cannot be copied onto
14028 * disk as they look huge. This change should not affect
14029 * XSAN as they should not setting these to -1..
14030 */
14031 && (sfsp->f_blocks != 0xffffffffffffffffULL)
14032 && (sfsp->f_bfree != 0xffffffffffffffffULL)
14033 && (sfsp->f_bavail != 0xffffffffffffffffULL)) {
14034 int shift;
14035
14036 /*
14037 * Work out how far we have to shift the block count down to make it fit.
14038 * Note that it's possible to have to shift so far that the resulting
14039 * blocksize would be unreportably large. At that point, we will clip
14040 * any values that don't fit.
14041 *
14042 * For safety's sake, we also ensure that f_iosize is never reported as
14043 * being smaller than f_bsize.
14044 */
14045 for (shift = 0; shift < 32; shift++) {
14046 if ((sfsp->f_blocks >> shift) <= INT_MAX) {
14047 break;
14048 }
14049 if ((sfsp->f_bsize << (shift + 1)) > INT_MAX) {
14050 break;
14051 }
14052 }
14053 #define __SHIFT_OR_CLIP(x, s) ((((x) >> (s)) > INT_MAX) ? INT_MAX : ((x) >> (s)))
14054 sfs.f_blocks = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_blocks, shift);
14055 sfs.f_bfree = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bfree, shift);
14056 sfs.f_bavail = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bavail, shift);
14057 #undef __SHIFT_OR_CLIP
14058 sfs.f_bsize = (user32_long_t)(sfsp->f_bsize << shift);
14059 sfs.f_iosize = (int)lmax(sfsp->f_iosize, sfsp->f_bsize);
14060 } else {
14061 /* filesystem is small enough to be reported honestly */
14062 sfs.f_bsize = (user32_long_t)sfsp->f_bsize;
14063 sfs.f_iosize = (user32_long_t)sfsp->f_iosize;
14064 sfs.f_blocks = (user32_long_t)sfsp->f_blocks;
14065 sfs.f_bfree = (user32_long_t)sfsp->f_bfree;
14066 sfs.f_bavail = (user32_long_t)sfsp->f_bavail;
14067 }
14068 sfs.f_files = (user32_long_t)sfsp->f_files;
14069 sfs.f_ffree = (user32_long_t)sfsp->f_ffree;
14070 sfs.f_fsid = sfsp->f_fsid;
14071 sfs.f_owner = sfsp->f_owner;
14072 vfs_getfstypename(mp, sfs.f_fstypename, MFSNAMELEN);
14073 strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
14074 strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);
14075
14076 if (partial_copy) {
14077 copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
14078 }
14079 error = copyout((caddr_t)&sfs, bufp, copy_size);
14080 }
14081
14082 if (sizep != NULL) {
14083 *sizep = my_size;
14084 }
14085 return error;
14086 }
14087
14088 /*
14089 * copy stat structure into user_stat structure.
14090 */
14091 void
munge_user64_stat(struct stat * sbp,struct user64_stat * usbp)14092 munge_user64_stat(struct stat *sbp, struct user64_stat *usbp)
14093 {
14094 bzero(usbp, sizeof(*usbp));
14095
14096 usbp->st_dev = sbp->st_dev;
14097 usbp->st_ino = sbp->st_ino;
14098 usbp->st_mode = sbp->st_mode;
14099 usbp->st_nlink = sbp->st_nlink;
14100 usbp->st_uid = sbp->st_uid;
14101 usbp->st_gid = sbp->st_gid;
14102 usbp->st_rdev = sbp->st_rdev;
14103 #ifndef _POSIX_C_SOURCE
14104 usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
14105 usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
14106 usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
14107 usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
14108 usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
14109 usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
14110 #else
14111 usbp->st_atime = sbp->st_atime;
14112 usbp->st_atimensec = sbp->st_atimensec;
14113 usbp->st_mtime = sbp->st_mtime;
14114 usbp->st_mtimensec = sbp->st_mtimensec;
14115 usbp->st_ctime = sbp->st_ctime;
14116 usbp->st_ctimensec = sbp->st_ctimensec;
14117 #endif
14118 usbp->st_size = sbp->st_size;
14119 usbp->st_blocks = sbp->st_blocks;
14120 usbp->st_blksize = sbp->st_blksize;
14121 usbp->st_flags = sbp->st_flags;
14122 usbp->st_gen = sbp->st_gen;
14123 usbp->st_lspare = sbp->st_lspare;
14124 usbp->st_qspare[0] = sbp->st_qspare[0];
14125 usbp->st_qspare[1] = sbp->st_qspare[1];
14126 }
14127
14128 void
munge_user32_stat(struct stat * sbp,struct user32_stat * usbp)14129 munge_user32_stat(struct stat *sbp, struct user32_stat *usbp)
14130 {
14131 bzero(usbp, sizeof(*usbp));
14132
14133 usbp->st_dev = sbp->st_dev;
14134 usbp->st_ino = sbp->st_ino;
14135 usbp->st_mode = sbp->st_mode;
14136 usbp->st_nlink = sbp->st_nlink;
14137 usbp->st_uid = sbp->st_uid;
14138 usbp->st_gid = sbp->st_gid;
14139 usbp->st_rdev = sbp->st_rdev;
14140 #ifndef _POSIX_C_SOURCE
14141 usbp->st_atimespec.tv_sec = (user32_time_t)sbp->st_atimespec.tv_sec;
14142 usbp->st_atimespec.tv_nsec = (user32_long_t)sbp->st_atimespec.tv_nsec;
14143 usbp->st_mtimespec.tv_sec = (user32_time_t)sbp->st_mtimespec.tv_sec;
14144 usbp->st_mtimespec.tv_nsec = (user32_long_t)sbp->st_mtimespec.tv_nsec;
14145 usbp->st_ctimespec.tv_sec = (user32_time_t)sbp->st_ctimespec.tv_sec;
14146 usbp->st_ctimespec.tv_nsec = (user32_long_t)sbp->st_ctimespec.tv_nsec;
14147 #else
14148 usbp->st_atime = sbp->st_atime;
14149 usbp->st_atimensec = sbp->st_atimensec;
14150 usbp->st_mtime = sbp->st_mtime;
14151 usbp->st_mtimensec = sbp->st_mtimensec;
14152 usbp->st_ctime = sbp->st_ctime;
14153 usbp->st_ctimensec = sbp->st_ctimensec;
14154 #endif
14155 usbp->st_size = sbp->st_size;
14156 usbp->st_blocks = sbp->st_blocks;
14157 usbp->st_blksize = sbp->st_blksize;
14158 usbp->st_flags = sbp->st_flags;
14159 usbp->st_gen = sbp->st_gen;
14160 usbp->st_lspare = sbp->st_lspare;
14161 usbp->st_qspare[0] = sbp->st_qspare[0];
14162 usbp->st_qspare[1] = sbp->st_qspare[1];
14163 }
14164
14165 /*
14166 * copy stat64 structure into user_stat64 structure.
14167 */
14168 void
munge_user64_stat64(struct stat64 * sbp,struct user64_stat64 * usbp)14169 munge_user64_stat64(struct stat64 *sbp, struct user64_stat64 *usbp)
14170 {
14171 bzero(usbp, sizeof(*usbp));
14172
14173 usbp->st_dev = sbp->st_dev;
14174 usbp->st_ino = sbp->st_ino;
14175 usbp->st_mode = sbp->st_mode;
14176 usbp->st_nlink = sbp->st_nlink;
14177 usbp->st_uid = sbp->st_uid;
14178 usbp->st_gid = sbp->st_gid;
14179 usbp->st_rdev = sbp->st_rdev;
14180 #ifndef _POSIX_C_SOURCE
14181 usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
14182 usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
14183 usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
14184 usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
14185 usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
14186 usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
14187 usbp->st_birthtimespec.tv_sec = sbp->st_birthtimespec.tv_sec;
14188 usbp->st_birthtimespec.tv_nsec = sbp->st_birthtimespec.tv_nsec;
14189 #else
14190 usbp->st_atime = sbp->st_atime;
14191 usbp->st_atimensec = sbp->st_atimensec;
14192 usbp->st_mtime = sbp->st_mtime;
14193 usbp->st_mtimensec = sbp->st_mtimensec;
14194 usbp->st_ctime = sbp->st_ctime;
14195 usbp->st_ctimensec = sbp->st_ctimensec;
14196 usbp->st_birthtime = sbp->st_birthtime;
14197 usbp->st_birthtimensec = sbp->st_birthtimensec;
14198 #endif
14199 usbp->st_size = sbp->st_size;
14200 usbp->st_blocks = sbp->st_blocks;
14201 usbp->st_blksize = sbp->st_blksize;
14202 usbp->st_flags = sbp->st_flags;
14203 usbp->st_gen = sbp->st_gen;
14204 usbp->st_lspare = sbp->st_lspare;
14205 usbp->st_qspare[0] = sbp->st_qspare[0];
14206 usbp->st_qspare[1] = sbp->st_qspare[1];
14207 }
14208
14209 void
munge_user32_stat64(struct stat64 * sbp,struct user32_stat64 * usbp)14210 munge_user32_stat64(struct stat64 *sbp, struct user32_stat64 *usbp)
14211 {
14212 bzero(usbp, sizeof(*usbp));
14213
14214 usbp->st_dev = sbp->st_dev;
14215 usbp->st_ino = sbp->st_ino;
14216 usbp->st_mode = sbp->st_mode;
14217 usbp->st_nlink = sbp->st_nlink;
14218 usbp->st_uid = sbp->st_uid;
14219 usbp->st_gid = sbp->st_gid;
14220 usbp->st_rdev = sbp->st_rdev;
14221 #ifndef _POSIX_C_SOURCE
14222 usbp->st_atimespec.tv_sec = (user32_time_t)sbp->st_atimespec.tv_sec;
14223 usbp->st_atimespec.tv_nsec = (user32_long_t)sbp->st_atimespec.tv_nsec;
14224 usbp->st_mtimespec.tv_sec = (user32_time_t)sbp->st_mtimespec.tv_sec;
14225 usbp->st_mtimespec.tv_nsec = (user32_long_t)sbp->st_mtimespec.tv_nsec;
14226 usbp->st_ctimespec.tv_sec = (user32_time_t)sbp->st_ctimespec.tv_sec;
14227 usbp->st_ctimespec.tv_nsec = (user32_long_t)sbp->st_ctimespec.tv_nsec;
14228 usbp->st_birthtimespec.tv_sec = (user32_time_t)sbp->st_birthtimespec.tv_sec;
14229 usbp->st_birthtimespec.tv_nsec = (user32_long_t)sbp->st_birthtimespec.tv_nsec;
14230 #else
14231 usbp->st_atime = sbp->st_atime;
14232 usbp->st_atimensec = sbp->st_atimensec;
14233 usbp->st_mtime = sbp->st_mtime;
14234 usbp->st_mtimensec = sbp->st_mtimensec;
14235 usbp->st_ctime = sbp->st_ctime;
14236 usbp->st_ctimensec = sbp->st_ctimensec;
14237 usbp->st_birthtime = sbp->st_birthtime;
14238 usbp->st_birthtimensec = sbp->st_birthtimensec;
14239 #endif
14240 usbp->st_size = sbp->st_size;
14241 usbp->st_blocks = sbp->st_blocks;
14242 usbp->st_blksize = sbp->st_blksize;
14243 usbp->st_flags = sbp->st_flags;
14244 usbp->st_gen = sbp->st_gen;
14245 usbp->st_lspare = sbp->st_lspare;
14246 usbp->st_qspare[0] = sbp->st_qspare[0];
14247 usbp->st_qspare[1] = sbp->st_qspare[1];
14248 }
14249
14250 /*
14251 * Purge buffer cache for simulating cold starts
14252 */
14253 static int
vnode_purge_callback(struct vnode * vp,__unused void * cargs)14254 vnode_purge_callback(struct vnode *vp, __unused void *cargs)
14255 {
14256 ubc_msync(vp, (off_t)0, ubc_getsize(vp), NULL /* off_t *resid_off */, UBC_PUSHALL | UBC_INVALIDATE);
14257
14258 return VNODE_RETURNED;
14259 }
14260
/*
 * Per-mount callback used by vfs_purge(): walk every vnode on this mount
 * (waiting for busy ones, VNODE_WAIT | VNODE_ITERATE_ALL) and purge its
 * cached pages via vnode_purge_callback().
 */
static int
vfs_purge_callback(mount_t mp, __unused void * arg)
{
	vnode_iterate(mp, VNODE_WAIT | VNODE_ITERATE_ALL, vnode_purge_callback, NULL);

	return VFS_RETURNED;
}
14268
/*
 * Boot-arg tunable / vfs.purge_vm_pagers sysctl (default TRUE): when set,
 * vfs_purge() additionally purges file-backed VM pagers.
 */
static TUNABLE_WRITEABLE(boolean_t, vfs_purge_vm_pagers, "vfs_purge_vm_pagers", TRUE);
SYSCTL_INT(_vfs, OID_AUTO, purge_vm_pagers, CTLFLAG_RW | CTLFLAG_LOCKED, &vfs_purge_vm_pagers, 0, "VFS purge also purges file-backed VM pagers");
14271
14272 int
vfs_purge(__unused struct proc * p,__unused struct vfs_purge_args * uap,__unused int32_t * retval)14273 vfs_purge(__unused struct proc *p, __unused struct vfs_purge_args *uap, __unused int32_t *retval)
14274 {
14275 if (!kauth_cred_issuser(kauth_cred_get())) {
14276 return EPERM;
14277 }
14278
14279 vfs_iterate(0 /* flags */, vfs_purge_callback, NULL);
14280
14281 /* also flush any VM pagers backed by files */
14282 if (vfs_purge_vm_pagers) {
14283 vm_purge_filebacked_pagers();
14284 }
14285
14286 return 0;
14287 }
14288
14289 /*
14290 * gets the vnode associated with the (unnamed) snapshot directory
14291 * for a Filesystem. The snapshot directory vnode is returned with
14292 * an iocount on it.
14293 */
14294 int
vnode_get_snapdir(vnode_t rvp,vnode_t * sdvpp,vfs_context_t ctx)14295 vnode_get_snapdir(vnode_t rvp, vnode_t *sdvpp, vfs_context_t ctx)
14296 {
14297 return VFS_VGET_SNAPDIR(vnode_mount(rvp), sdvpp, ctx);
14298 }
14299
14300 /*
14301 * Get the snapshot vnode.
14302 *
14303 * If successful, the call returns with an iocount on *rvpp ,*sdvpp and
14304 * needs nameidone() on ndp.
14305 *
14306 * If the snapshot vnode exists it is returned in ndp->ni_vp.
14307 *
14308 * If it returns with an error, *rvpp, *sdvpp are NULL and nameidone() is
14309 * not needed.
14310 */
14311 static int
vnode_get_snapshot(int dirfd,vnode_t * rvpp,vnode_t * sdvpp,user_addr_t name,struct nameidata * ndp,int32_t op,__unused enum path_operation pathop,vfs_context_t ctx)14312 vnode_get_snapshot(int dirfd, vnode_t *rvpp, vnode_t *sdvpp,
14313 user_addr_t name, struct nameidata *ndp, int32_t op,
14314 #if !CONFIG_TRIGGERS
14315 __unused
14316 #endif
14317 enum path_operation pathop,
14318 vfs_context_t ctx)
14319 {
14320 int error, i;
14321 caddr_t name_buf;
14322 size_t name_len;
14323 struct vfs_attr vfa;
14324
14325 *sdvpp = NULLVP;
14326 *rvpp = NULLVP;
14327
14328 error = vnode_getfromfd(ctx, dirfd, rvpp);
14329 if (error) {
14330 return error;
14331 }
14332
14333 if (!vnode_isvroot(*rvpp)) {
14334 error = EINVAL;
14335 goto out;
14336 }
14337
14338 /* Make sure the filesystem supports snapshots */
14339 VFSATTR_INIT(&vfa);
14340 VFSATTR_WANTED(&vfa, f_capabilities);
14341 if ((vfs_getattr(vnode_mount(*rvpp), &vfa, ctx) != 0) ||
14342 !VFSATTR_IS_SUPPORTED(&vfa, f_capabilities) ||
14343 !((vfa.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] &
14344 VOL_CAP_INT_SNAPSHOT)) ||
14345 !((vfa.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] &
14346 VOL_CAP_INT_SNAPSHOT))) {
14347 error = ENOTSUP;
14348 goto out;
14349 }
14350
14351 error = vnode_get_snapdir(*rvpp, sdvpp, ctx);
14352 if (error) {
14353 goto out;
14354 }
14355
14356 name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
14357 error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
14358 if (error) {
14359 goto out1;
14360 }
14361
14362 /*
14363 * Some sanity checks- name can't be empty, "." or ".." or have slashes.
14364 * (the length returned by copyinstr includes the terminating NUL)
14365 */
14366 if ((name_len == 1) || (name_len == 2 && name_buf[0] == '.') ||
14367 (name_len == 3 && name_buf[0] == '.' && name_buf[1] == '.')) {
14368 error = EINVAL;
14369 goto out1;
14370 }
14371 for (i = 0; i < (int)name_len && name_buf[i] != '/'; i++) {
14372 ;
14373 }
14374 if (i < (int)name_len) {
14375 error = EINVAL;
14376 goto out1;
14377 }
14378
14379 #if CONFIG_MACF
14380 if (op == CREATE) {
14381 error = mac_mount_check_snapshot_create(ctx, vnode_mount(*rvpp),
14382 name_buf);
14383 } else if (op == DELETE) {
14384 error = mac_mount_check_snapshot_delete(ctx, vnode_mount(*rvpp),
14385 name_buf);
14386 }
14387 if (error) {
14388 goto out1;
14389 }
14390 #endif
14391
14392 /* Check if the snapshot already exists ... */
14393 NDINIT(ndp, op, pathop, USEDVP | NOCACHE | AUDITVNPATH1,
14394 UIO_SYSSPACE, CAST_USER_ADDR_T(name_buf), ctx);
14395 ndp->ni_dvp = *sdvpp;
14396
14397 error = namei(ndp);
14398 out1:
14399 zfree(ZV_NAMEI, name_buf);
14400 out:
14401 if (error) {
14402 if (*sdvpp) {
14403 vnode_put(*sdvpp);
14404 *sdvpp = NULLVP;
14405 }
14406 if (*rvpp) {
14407 vnode_put(*rvpp);
14408 *rvpp = NULLVP;
14409 }
14410 }
14411 return error;
14412 }
14413
14414 /*
14415 * create a filesystem snapshot (for supporting filesystems)
14416 *
14417 * A much simplified version of openat(dirfd, name, O_CREAT | O_EXCL)
14418 * We get to the (unnamed) snapshot directory vnode and create the vnode
14419 * for the snapshot in it.
14420 *
14421 * Restrictions:
14422 *
14423 * a) Passed in name for snapshot cannot have slashes.
14424 * b) name can't be "." or ".."
14425 *
14426 * Since this requires superuser privileges, vnode_authorize calls are not
14427 * made.
14428 */
14429 static int __attribute__((noinline))
snapshot_create(int dirfd,user_addr_t name,__unused uint32_t flags,vfs_context_t ctx)14430 snapshot_create(int dirfd, user_addr_t name, __unused uint32_t flags,
14431 vfs_context_t ctx)
14432 {
14433 vnode_t rvp, snapdvp;
14434 int error;
14435 struct nameidata *ndp;
14436
14437 ndp = kalloc_type(struct nameidata, Z_WAITOK);
14438
14439 error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, ndp, CREATE,
14440 OP_LINK, ctx);
14441 if (error) {
14442 goto out;
14443 }
14444
14445 if (ndp->ni_vp) {
14446 vnode_put(ndp->ni_vp);
14447 error = EEXIST;
14448 } else {
14449 struct vnode_attr *vap;
14450 vnode_t vp = NULLVP;
14451
14452 vap = kalloc_type(struct vnode_attr, Z_WAITOK);
14453
14454 VATTR_INIT(vap);
14455 VATTR_SET(vap, va_type, VREG);
14456 VATTR_SET(vap, va_mode, 0);
14457
14458 error = vn_create(snapdvp, &vp, ndp, vap,
14459 VN_CREATE_NOAUTH | VN_CREATE_NOINHERIT, 0, NULL, ctx);
14460 if (!error && vp) {
14461 vnode_put(vp);
14462 }
14463
14464 kfree_type(struct vnode_attr, vap);
14465 }
14466
14467 nameidone(ndp);
14468 vnode_put(snapdvp);
14469 vnode_put(rvp);
14470 out:
14471 kfree_type(struct nameidata, ndp);
14472
14473 return error;
14474 }
14475
14476 /*
14477 * Delete a Filesystem snapshot
14478 *
14479 * get the vnode for the unnamed snapshot directory and the snapshot and
14480 * delete the snapshot.
14481 */
14482 static int __attribute__((noinline))
snapshot_delete(int dirfd,user_addr_t name,__unused uint32_t flags,vfs_context_t ctx)14483 snapshot_delete(int dirfd, user_addr_t name, __unused uint32_t flags,
14484 vfs_context_t ctx)
14485 {
14486 vnode_t rvp, snapdvp;
14487 int error;
14488 struct nameidata *ndp;
14489
14490 ndp = kalloc_type(struct nameidata, Z_WAITOK);
14491
14492 error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, ndp, DELETE,
14493 OP_UNLINK, ctx);
14494 if (error) {
14495 goto out;
14496 }
14497
14498 error = VNOP_REMOVE(snapdvp, ndp->ni_vp, &ndp->ni_cnd,
14499 VNODE_REMOVE_SKIP_NAMESPACE_EVENT, ctx);
14500
14501 vnode_put(ndp->ni_vp);
14502 nameidone(ndp);
14503 vnode_put(snapdvp);
14504 vnode_put(rvp);
14505 out:
14506 kfree_type(struct nameidata, ndp);
14507
14508 return error;
14509 }
14510
14511 /*
14512 * Revert a filesystem to a snapshot
14513 *
14514 * Marks the filesystem to revert to the given snapshot on next mount.
14515 */
14516 static int __attribute__((noinline))
snapshot_revert(int dirfd,user_addr_t name,__unused uint32_t flags,vfs_context_t ctx)14517 snapshot_revert(int dirfd, user_addr_t name, __unused uint32_t flags,
14518 vfs_context_t ctx)
14519 {
14520 int error;
14521 vnode_t rvp;
14522 mount_t mp;
14523 struct fs_snapshot_revert_args revert_data;
14524 struct componentname cnp;
14525 caddr_t name_buf;
14526 size_t name_len;
14527
14528 error = vnode_getfromfd(ctx, dirfd, &rvp);
14529 if (error) {
14530 return error;
14531 }
14532 mp = vnode_mount(rvp);
14533
14534 name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
14535 error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
14536 if (error) {
14537 zfree(ZV_NAMEI, name_buf);
14538 vnode_put(rvp);
14539 return error;
14540 }
14541
14542 #if CONFIG_MACF
14543 error = mac_mount_check_snapshot_revert(ctx, mp, name_buf);
14544 if (error) {
14545 zfree(ZV_NAMEI, name_buf);
14546 vnode_put(rvp);
14547 return error;
14548 }
14549 #endif
14550
14551 /*
14552 * Grab mount_iterref so that we can release the vnode,
14553 * since VFSIOC_REVERT_SNAPSHOT could conceivably cause a sync.
14554 */
14555 error = mount_iterref(mp, 0);
14556 vnode_put(rvp);
14557 if (error) {
14558 zfree(ZV_NAMEI, name_buf);
14559 return error;
14560 }
14561
14562 memset(&cnp, 0, sizeof(cnp));
14563 cnp.cn_pnbuf = (char *)name_buf;
14564 cnp.cn_nameiop = LOOKUP;
14565 cnp.cn_flags = ISLASTCN | HASBUF;
14566 cnp.cn_pnlen = MAXPATHLEN;
14567 cnp.cn_nameptr = cnp.cn_pnbuf;
14568 cnp.cn_namelen = (int)name_len;
14569 revert_data.sr_cnp = &cnp;
14570
14571 error = VFS_IOCTL(mp, VFSIOC_REVERT_SNAPSHOT, (caddr_t)&revert_data, 0, ctx);
14572 mount_iterdrop(mp);
14573 zfree(ZV_NAMEI, name_buf);
14574
14575 if (error) {
14576 /* If there was any error, try again using VNOP_IOCTL */
14577
14578 vnode_t snapdvp;
14579 struct nameidata namend;
14580
14581 error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, &namend, LOOKUP,
14582 OP_LOOKUP, ctx);
14583 if (error) {
14584 return error;
14585 }
14586
14587
14588 error = VNOP_IOCTL(namend.ni_vp, APFSIOC_REVERT_TO_SNAPSHOT, (caddr_t) NULL,
14589 0, ctx);
14590
14591 vnode_put(namend.ni_vp);
14592 nameidone(&namend);
14593 vnode_put(snapdvp);
14594 vnode_put(rvp);
14595 }
14596
14597 return error;
14598 }
14599
14600 /*
14601 * rename a Filesystem snapshot
14602 *
14603 * get the vnode for the unnamed snapshot directory and the snapshot and
14604 * rename the snapshot. This is a very specialised (and simple) case of
14605 * rename(2) (which has to deal with a lot more complications). It differs
14606 * slightly from rename(2) in that EEXIST is returned if the new name exists.
14607 */
14608 static int __attribute__((noinline))
snapshot_rename(int dirfd,user_addr_t old,user_addr_t new,__unused uint32_t flags,vfs_context_t ctx)14609 snapshot_rename(int dirfd, user_addr_t old, user_addr_t new,
14610 __unused uint32_t flags, vfs_context_t ctx)
14611 {
14612 vnode_t rvp, snapdvp;
14613 int error, i;
14614 caddr_t newname_buf;
14615 size_t name_len;
14616 vnode_t fvp;
14617 struct nameidata *fromnd, *tond;
14618 /* carving out a chunk for structs that are too big to be on stack. */
14619 struct {
14620 struct nameidata from_node;
14621 struct nameidata to_node;
14622 } * __rename_data;
14623
14624 __rename_data = kalloc_type(typeof(*__rename_data), Z_WAITOK);
14625 fromnd = &__rename_data->from_node;
14626 tond = &__rename_data->to_node;
14627
14628 error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, old, fromnd, DELETE,
14629 OP_UNLINK, ctx);
14630 if (error) {
14631 goto out;
14632 }
14633 fvp = fromnd->ni_vp;
14634
14635 newname_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
14636 error = copyinstr(new, newname_buf, MAXPATHLEN, &name_len);
14637 if (error) {
14638 goto out1;
14639 }
14640
14641 /*
14642 * Some sanity checks- new name can't be empty, "." or ".." or have
14643 * slashes.
14644 * (the length returned by copyinstr includes the terminating NUL)
14645 *
14646 * The FS rename VNOP is suppossed to handle this but we'll pick it
14647 * off here itself.
14648 */
14649 if ((name_len == 1) || (name_len == 2 && newname_buf[0] == '.') ||
14650 (name_len == 3 && newname_buf[0] == '.' && newname_buf[1] == '.')) {
14651 error = EINVAL;
14652 goto out1;
14653 }
14654 for (i = 0; i < (int)name_len && newname_buf[i] != '/'; i++) {
14655 ;
14656 }
14657 if (i < (int)name_len) {
14658 error = EINVAL;
14659 goto out1;
14660 }
14661
14662 #if CONFIG_MACF
14663 error = mac_mount_check_snapshot_create(ctx, vnode_mount(rvp),
14664 newname_buf);
14665 if (error) {
14666 goto out1;
14667 }
14668 #endif
14669
14670 NDINIT(tond, RENAME, OP_RENAME, USEDVP | NOCACHE | AUDITVNPATH2,
14671 UIO_SYSSPACE, CAST_USER_ADDR_T(newname_buf), ctx);
14672 tond->ni_dvp = snapdvp;
14673
14674 error = namei(tond);
14675 if (error) {
14676 goto out2;
14677 } else if (tond->ni_vp) {
14678 /*
14679 * snapshot rename behaves differently than rename(2) - if the
14680 * new name exists, EEXIST is returned.
14681 */
14682 vnode_put(tond->ni_vp);
14683 error = EEXIST;
14684 goto out2;
14685 }
14686
14687 error = VNOP_RENAME(snapdvp, fvp, &fromnd->ni_cnd, snapdvp, NULLVP,
14688 &tond->ni_cnd, ctx);
14689
14690 out2:
14691 nameidone(tond);
14692 out1:
14693 zfree(ZV_NAMEI, newname_buf);
14694 vnode_put(fvp);
14695 vnode_put(snapdvp);
14696 vnode_put(rvp);
14697 nameidone(fromnd);
14698 out:
14699 kfree_type(typeof(*__rename_data), __rename_data);
14700 return error;
14701 }
14702
14703 /*
14704 * Mount a Filesystem snapshot
14705 *
14706 * get the vnode for the unnamed snapshot directory and the snapshot and
14707 * mount the snapshot.
14708 */
14709 static int __attribute__((noinline))
snapshot_mount(int dirfd,user_addr_t name,user_addr_t directory,__unused user_addr_t mnt_data,__unused uint32_t flags,vfs_context_t ctx)14710 snapshot_mount(int dirfd, user_addr_t name, user_addr_t directory,
14711 __unused user_addr_t mnt_data, __unused uint32_t flags, vfs_context_t ctx)
14712 {
14713 mount_t mp;
14714 vnode_t rvp, snapdvp, snapvp, vp, pvp;
14715 struct fs_snapshot_mount_args smnt_data;
14716 int error, mount_flags = 0;
14717 struct nameidata *snapndp, *dirndp;
14718 /* carving out a chunk for structs that are too big to be on stack. */
14719 struct {
14720 struct nameidata snapnd;
14721 struct nameidata dirnd;
14722 } * __snapshot_mount_data;
14723
14724 __snapshot_mount_data = kalloc_type(typeof(*__snapshot_mount_data), Z_WAITOK);
14725 snapndp = &__snapshot_mount_data->snapnd;
14726 dirndp = &__snapshot_mount_data->dirnd;
14727
14728 error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, snapndp, LOOKUP,
14729 OP_LOOKUP, ctx);
14730 if (error) {
14731 goto out;
14732 }
14733
14734 snapvp = snapndp->ni_vp;
14735 if (!vnode_mount(rvp) || (vnode_mount(rvp) == dead_mountp)) {
14736 error = EIO;
14737 goto out1;
14738 }
14739
14740 /* Convert snapshot_mount flags to mount flags */
14741 if (flags & SNAPSHOT_MNT_NOSUID) {
14742 mount_flags |= MNT_NOSUID;
14743 }
14744 if (flags & SNAPSHOT_MNT_NODEV) {
14745 mount_flags |= MNT_NODEV;
14746 }
14747 if (flags & SNAPSHOT_MNT_DONTBROWSE) {
14748 mount_flags |= MNT_DONTBROWSE;
14749 }
14750 if (flags & SNAPSHOT_MNT_IGNORE_OWNERSHIP) {
14751 mount_flags |= MNT_IGNORE_OWNERSHIP;
14752 }
14753 if (flags & SNAPSHOT_MNT_NOFOLLOW) {
14754 mount_flags |= MNT_NOFOLLOW;
14755 }
14756
14757 /* Get the vnode to be covered */
14758 NDINIT(dirndp, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
14759 UIO_USERSPACE, directory, ctx);
14760 if (mount_flags & MNT_NOFOLLOW) {
14761 dirndp->ni_flag |= NAMEI_NOFOLLOW_ANY;
14762 }
14763
14764 error = namei(dirndp);
14765 if (error) {
14766 goto out1;
14767 }
14768
14769 vp = dirndp->ni_vp;
14770 pvp = dirndp->ni_dvp;
14771 mp = vnode_mount(rvp);
14772
14773 if ((vp->v_flag & VROOT) && (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
14774 error = EINVAL;
14775 goto out2;
14776 }
14777
14778 #if CONFIG_MACF
14779 error = mac_mount_check_snapshot_mount(ctx, rvp, vp, &dirndp->ni_cnd, snapndp->ni_cnd.cn_nameptr,
14780 mp->mnt_vfsstat.f_fstypename);
14781 if (error) {
14782 goto out2;
14783 }
14784 #endif
14785
14786 smnt_data.sm_mp = mp;
14787 smnt_data.sm_cnp = &snapndp->ni_cnd;
14788 error = mount_common(mp->mnt_vfsstat.f_fstypename, pvp, vp,
14789 &dirndp->ni_cnd, CAST_USER_ADDR_T(&smnt_data), mount_flags,
14790 KERNEL_MOUNT_SNAPSHOT, NULL, ctx);
14791
14792 out2:
14793 vnode_put(vp);
14794 vnode_put(pvp);
14795 nameidone(dirndp);
14796 out1:
14797 vnode_put(snapvp);
14798 vnode_put(snapdvp);
14799 vnode_put(rvp);
14800 nameidone(snapndp);
14801 out:
14802 kfree_type(typeof(*__snapshot_mount_data), __snapshot_mount_data);
14803 return error;
14804 }
14805
14806 /*
14807 * Root from a snapshot of the filesystem
14808 *
14809 * Marks the filesystem to root from the given snapshot on next boot.
14810 */
14811 static int __attribute__((noinline))
snapshot_root(int dirfd,user_addr_t name,__unused uint32_t flags,vfs_context_t ctx)14812 snapshot_root(int dirfd, user_addr_t name, __unused uint32_t flags,
14813 vfs_context_t ctx)
14814 {
14815 int error;
14816 vnode_t rvp;
14817 mount_t mp;
14818 struct fs_snapshot_root_args root_data;
14819 struct componentname cnp;
14820 caddr_t name_buf;
14821 size_t name_len;
14822
14823 error = vnode_getfromfd(ctx, dirfd, &rvp);
14824 if (error) {
14825 return error;
14826 }
14827 mp = vnode_mount(rvp);
14828
14829 name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
14830 error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
14831 if (error) {
14832 zfree(ZV_NAMEI, name_buf);
14833 vnode_put(rvp);
14834 return error;
14835 }
14836
14837 // XXX MAC checks ?
14838
14839 /*
14840 * Grab mount_iterref so that we can release the vnode,
14841 * since VFSIOC_ROOT_SNAPSHOT could conceivably cause a sync.
14842 */
14843 error = mount_iterref(mp, 0);
14844 vnode_put(rvp);
14845 if (error) {
14846 zfree(ZV_NAMEI, name_buf);
14847 return error;
14848 }
14849
14850 memset(&cnp, 0, sizeof(cnp));
14851 cnp.cn_pnbuf = (char *)name_buf;
14852 cnp.cn_nameiop = LOOKUP;
14853 cnp.cn_flags = ISLASTCN | HASBUF;
14854 cnp.cn_pnlen = MAXPATHLEN;
14855 cnp.cn_nameptr = cnp.cn_pnbuf;
14856 cnp.cn_namelen = (int)name_len;
14857 root_data.sr_cnp = &cnp;
14858
14859 error = VFS_IOCTL(mp, VFSIOC_ROOT_SNAPSHOT, (caddr_t)&root_data, 0, ctx);
14860
14861 mount_iterdrop(mp);
14862 zfree(ZV_NAMEI, name_buf);
14863
14864 return error;
14865 }
14866
14867 static boolean_t
vfs_context_can_snapshot(vfs_context_t ctx)14868 vfs_context_can_snapshot(vfs_context_t ctx)
14869 {
14870 static const char * const snapshot_entitlements[] = {
14871 "com.apple.private.vfs.snapshot",
14872 "com.apple.developer.vfs.snapshot",
14873 "com.apple.private.apfs.arv.limited.snapshot",
14874 };
14875 static const size_t nentitlements =
14876 sizeof(snapshot_entitlements) / sizeof(snapshot_entitlements[0]);
14877 size_t i;
14878
14879 task_t task = vfs_context_task(ctx);
14880 for (i = 0; i < nentitlements; i++) {
14881 if (IOTaskHasEntitlement(task, snapshot_entitlements[i])) {
14882 return TRUE;
14883 }
14884 }
14885 return FALSE;
14886 }
14887
14888 /*
14889 * FS snapshot operations dispatcher
14890 */
14891 int
fs_snapshot(__unused proc_t p,struct fs_snapshot_args * uap,__unused int32_t * retval)14892 fs_snapshot(__unused proc_t p, struct fs_snapshot_args *uap,
14893 __unused int32_t *retval)
14894 {
14895 int error;
14896 vfs_context_t ctx = vfs_context_current();
14897
14898 AUDIT_ARG(fd, uap->dirfd);
14899 AUDIT_ARG(value32, uap->op);
14900
14901 if (!vfs_context_can_snapshot(ctx)) {
14902 return EPERM;
14903 }
14904
14905 /*
14906 * Enforce user authorization for snapshot modification operations,
14907 * or if trying to root from snapshot.
14908 */
14909 if (uap->op != SNAPSHOT_OP_MOUNT) {
14910 vnode_t dvp = NULLVP;
14911 vnode_t devvp = NULLVP;
14912 mount_t mp;
14913
14914 error = vnode_getfromfd(ctx, uap->dirfd, &dvp);
14915 if (error) {
14916 return error;
14917 }
14918 mp = vnode_mount(dvp);
14919 devvp = mp->mnt_devvp;
14920
14921 /* get an iocount on devvp */
14922 if (devvp == NULLVP) {
14923 error = vnode_lookup(mp->mnt_vfsstat.f_mntfromname, 0, &devvp, ctx);
14924 /* for mounts which arent block devices */
14925 if (error == ENOENT) {
14926 error = ENXIO;
14927 }
14928 } else {
14929 error = vnode_getwithref(devvp);
14930 }
14931
14932 if (error) {
14933 vnode_put(dvp);
14934 return error;
14935 }
14936
14937 if ((vfs_context_issuser(ctx) == 0) &&
14938 (vnode_authorize(devvp, NULL, KAUTH_VNODE_WRITE_DATA, ctx) != 0) &&
14939 (!IOTaskHasEntitlement(vfs_context_task(ctx), "com.apple.private.vfs.snapshot.user"))) {
14940 error = EPERM;
14941 }
14942 vnode_put(dvp);
14943 vnode_put(devvp);
14944
14945 if (error) {
14946 return error;
14947 }
14948 }
14949
14950 switch (uap->op) {
14951 case SNAPSHOT_OP_CREATE:
14952 error = snapshot_create(uap->dirfd, uap->name1, uap->flags, ctx);
14953 break;
14954 case SNAPSHOT_OP_DELETE:
14955 error = snapshot_delete(uap->dirfd, uap->name1, uap->flags, ctx);
14956 break;
14957 case SNAPSHOT_OP_RENAME:
14958 error = snapshot_rename(uap->dirfd, uap->name1, uap->name2,
14959 uap->flags, ctx);
14960 break;
14961 case SNAPSHOT_OP_MOUNT:
14962 error = snapshot_mount(uap->dirfd, uap->name1, uap->name2,
14963 uap->data, uap->flags, ctx);
14964 break;
14965 case SNAPSHOT_OP_REVERT:
14966 error = snapshot_revert(uap->dirfd, uap->name1, uap->flags, ctx);
14967 break;
14968 #if CONFIG_MNT_ROOTSNAP
14969 case SNAPSHOT_OP_ROOT:
14970 error = snapshot_root(uap->dirfd, uap->name1, uap->flags, ctx);
14971 break;
14972 #endif /* CONFIG_MNT_ROOTSNAP */
14973 default:
14974 error = ENOSYS;
14975 }
14976
14977 return error;
14978 }
14979