1 /*
2 * Copyright (c) 1995-2022 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * Copyright (c) 1989, 1993
30 * The Regents of the University of California. All rights reserved.
31 * (c) UNIX System Laboratories, Inc.
32 * All or some portions of this file are derived from material licensed
33 * to the University of California by American Telephone and Telegraph
34 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
35 * the permission of UNIX System Laboratories, Inc.
36 *
37 * Redistribution and use in source and binary forms, with or without
38 * modification, are permitted provided that the following conditions
39 * are met:
40 * 1. Redistributions of source code must retain the above copyright
41 * notice, this list of conditions and the following disclaimer.
42 * 2. Redistributions in binary form must reproduce the above copyright
43 * notice, this list of conditions and the following disclaimer in the
44 * documentation and/or other materials provided with the distribution.
45 * 3. All advertising materials mentioning features or use of this software
46 * must display the following acknowledgement:
47 * This product includes software developed by the University of
48 * California, Berkeley and its contributors.
49 * 4. Neither the name of the University nor the names of its contributors
50 * may be used to endorse or promote products derived from this software
51 * without specific prior written permission.
52 *
53 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63 * SUCH DAMAGE.
64 *
65 * @(#)vfs_syscalls.c 8.41 (Berkeley) 6/15/95
66 */
67 /*
68 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
69 * support for mandatory and extensible security protections. This notice
70 * is included in support of clause 2.2 (b) of the Apple Public License,
71 * Version 2.0.
72 */
73
74 #include <sys/param.h>
75 #include <sys/systm.h>
76 #include <sys/namei.h>
77 #include <sys/filedesc.h>
78 #include <sys/kernel.h>
79 #include <sys/file_internal.h>
80 #include <sys/stat.h>
81 #include <sys/vnode_internal.h>
82 #include <sys/mount_internal.h>
83 #include <sys/proc_internal.h>
84 #include <sys/kauth.h>
85 #include <sys/uio_internal.h>
86 #include <kern/kalloc.h>
87 #include <sys/mman.h>
88 #include <sys/dirent.h>
89 #include <sys/attr.h>
90 #include <sys/sysctl.h>
91 #include <sys/ubc.h>
92 #include <sys/quota.h>
93 #include <sys/kdebug.h>
94 #include <sys/fsevents.h>
95 #include <sys/imgsrc.h>
96 #include <sys/sysproto.h>
97 #include <sys/sysctl.h>
98 #include <sys/xattr.h>
99 #include <sys/fcntl.h>
100 #include <sys/stdio.h>
101 #include <sys/fsctl.h>
102 #include <sys/ubc_internal.h>
103 #include <sys/disk.h>
104 #include <sys/content_protection.h>
105 #include <sys/clonefile.h>
106 #include <sys/snapshot.h>
107 #include <sys/priv.h>
108 #include <sys/fsgetpath.h>
109 #include <machine/cons.h>
110 #include <machine/limits.h>
111 #include <miscfs/specfs/specdev.h>
112
113 #include <vfs/vfs_disk_conditioner.h>
114 #if CONFIG_EXCLAVES
115 #include <vfs/vfs_exclave_fs.h>
116 #endif
117
118 #include <security/audit/audit.h>
119 #include <bsm/audit_kevents.h>
120
121 #include <mach/mach_types.h>
122 #include <kern/kern_types.h>
123 #include <kern/kalloc.h>
124 #include <kern/task.h>
125
126 #include <vm/vm_pageout.h>
127 #include <vm/vm_protos.h>
128 #include <vm/memory_object_xnu.h>
129
130 #include <libkern/OSAtomic.h>
131 #include <os/atomic_private.h>
132 #include <pexpert/pexpert.h>
133 #include <IOKit/IOBSD.h>
134
135 // deps for MIG call
136 #include <kern/host.h>
137 #include <kern/ipc_misc.h>
138 #include <mach/host_priv.h>
139 #include <mach/vfs_nspace.h>
140 #include <os/log.h>
141
142 #include <nfs/nfs_conf.h>
143
144 #if ROUTEFS
145 #include <miscfs/routefs/routefs.h>
146 #endif /* ROUTEFS */
147
148 #if CONFIG_MACF
149 #include <security/mac.h>
150 #include <security/mac_framework.h>
151 #endif
152
153 #if CONFIG_FSE
154 #define GET_PATH(x) \
155 ((x) = get_pathbuff())
156 #define RELEASE_PATH(x) \
157 release_pathbuff(x)
158 #else
159 #define GET_PATH(x) \
160 ((x) = zalloc(ZV_NAMEI))
161 #define RELEASE_PATH(x) \
162 zfree(ZV_NAMEI, x)
163 #endif /* CONFIG_FSE */
164
165 #ifndef HFS_GET_BOOT_INFO
166 #define HFS_GET_BOOT_INFO (FCNTL_FS_SPECIFIC_BASE + 0x00004)
167 #endif
168
169 #ifndef HFS_SET_BOOT_INFO
170 #define HFS_SET_BOOT_INFO (FCNTL_FS_SPECIFIC_BASE + 0x00005)
171 #endif
172
173 #ifndef APFSIOC_REVERT_TO_SNAPSHOT
174 #define APFSIOC_REVERT_TO_SNAPSHOT _IOW('J', 1, u_int64_t)
175 #endif
176
177 extern void disk_conditioner_unmount(mount_t mp);
178
179 /* struct for checkdirs iteration */
180 struct cdirargs {
181 vnode_t olddp;
182 vnode_t newdp;
183 };
184 /* callback for checkdirs iteration */
185 static int checkdirs_callback(proc_t p, void * arg);
186
187 static int change_dir(struct nameidata *ndp, vfs_context_t ctx);
188 static int checkdirs(vnode_t olddp, vfs_context_t ctx);
189 void enablequotas(struct mount *mp, vfs_context_t ctx);
190 static int getfsstat_callback(mount_t mp, void * arg);
191 static int getutimes(user_addr_t usrtvp, struct timespec *tsp);
192 static int setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts, int nullflag);
193 static int sync_callback(mount_t, void *);
194 static int munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
195 user_addr_t bufp, int *sizep, boolean_t is_64_bit,
196 boolean_t partial_copy);
197 static int fsync_common(proc_t p, struct fsync_args *uap, int flags);
198 static int mount_common(const char *fstypename, vnode_t pvp, vnode_t vp,
199 struct componentname *cnp, user_addr_t fsmountargs,
200 int flags, uint32_t internal_flags, char *labelstr, vfs_context_t ctx);
201 void vfs_notify_mount(vnode_t pdvp);
202
203 int prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, uint32_t internal_flags);
204
205 struct fd_vn_data * fg_vn_data_alloc(void);
206
207 /*
208 * Max retries for ENOENT returns from vn_authorize_{rmdir, unlink, rename}
209 * Concurrent lookups (or lookups by ids) on hard links can cause the
210 * vn_getpath (which does not re-enter the filesystem as vn_getpath_fsenter
211 * does) to return ENOENT as the path cannot be returned from the name cache
212 * alone. We have no option but to retry and hope to get one namei->reverse path
213 * generation done without an intervening lookup, lookup by id on the hard link
214 * item. This is only an issue for MAC hooks which cannot reenter the filesystem
215 * which currently are the MAC hooks for rename, unlink and rmdir.
216 */
217 #define MAX_AUTHORIZE_ENOENT_RETRIES 1024
218
219 /* Max retry limit for rename due to vnode recycling. */
220 #define MAX_RENAME_ERECYCLE_RETRIES 1024
221
222 #define MAX_LINK_ENOENT_RETRIES 1024
223
224 /* Max retries for concurrent mounts on the same covered vnode. */
225 #define MAX_MOUNT_RETRIES 10
226
227 static int rmdirat_internal(vfs_context_t, int, user_addr_t, enum uio_seg,
228 int unlink_flags);
229
230 #ifdef CONFIG_IMGSRC_ACCESS
231 static int authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx);
232 static int place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx);
233 static void undo_place_on_covered_vp(mount_t mp, vnode_t vp);
234 static int mount_begin_update(mount_t mp, vfs_context_t ctx, int flags);
235 static void mount_end_update(mount_t mp);
236 static int relocate_imageboot_source(vnode_t pvp, vnode_t vp, struct componentname *cnp, const char *fsname, vfs_context_t ctx, boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index);
237 #endif /* CONFIG_IMGSRC_ACCESS */
238
239 //snapshot functions
240 #if CONFIG_MNT_ROOTSNAP
241 static int __attribute__ ((noinline)) snapshot_root(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx);
242 #else
243 static int __attribute__ ((noinline)) snapshot_root(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx) __attribute__((unused));
244 #endif
245
246 __private_extern__
247 int sync_internal(void);
248
249 __private_extern__
250 int unlink1(vfs_context_t, vnode_t, user_addr_t, enum uio_seg, int);
251
252 static LCK_GRP_DECLARE(fd_vn_lck_grp, "fd_vnode_data");
253 static LCK_ATTR_DECLARE(fd_vn_lck_attr, 0, 0);
254
255 /* vars for sync mutex */
256 static LCK_GRP_DECLARE(sync_mtx_lck_grp, "sync thread");
257 static LCK_MTX_DECLARE(sync_mtx_lck, &sync_mtx_lck_grp);
258
259 extern lck_rw_t rootvnode_rw_lock;
260
261 VFS_SMR_DECLARE;
262 extern uint32_t nc_smr_enabled;
263
264 /*
265 * incremented each time a mount or unmount operation occurs
266 * used to invalidate the cached value of the rootvp in the
267 * mount structure utilized by cache_lookup_path
268 */
269 uint32_t mount_generation = 0;
270
271 /* counts number of mount and unmount operations */
272 unsigned int vfs_nummntops = 0;
273
274 /* system-wide, per-boot unique mount ID */
275 static _Atomic uint64_t mount_unique_id = 1;
276
277 extern const struct fileops vnops;
278 #if CONFIG_APPLEDOUBLE
279 extern errno_t rmdir_remove_orphaned_appleDouble(vnode_t, vfs_context_t, int *);
280 #endif /* CONFIG_APPLEDOUBLE */
281
282 /* Maximum buffer length supported by fsgetpath(2) */
283 #define FSGETPATH_MAXBUFLEN 8192
284
285 /*
286 * Virtual File System System Calls
287 */
288
289 /*
290 * Private in-kernel mounting spi (specific use-cases only)
291 */
292 boolean_t
vfs_iskernelmount(mount_t mp)293 vfs_iskernelmount(mount_t mp)
294 {
295 return (mp->mnt_kern_flag & MNTK_KERNEL_MOUNT) ? TRUE : FALSE;
296 }
297
/*
 * kernel_mount:
 *	Mount a file system from within the kernel (private SPI; reached via
 *	vfs_mount_at_path() and other in-kernel callers).
 *
 * Parameters:	fstype		file system type name
 *		pvp		parent of the covered vnode, or NULLVP to
 *				have it looked up from 'path'
 *		vp		vnode to cover, or NULLVP to have it looked
 *				up from 'path'
 *		path		mount-on path (kernel address space)
 *		data		fs-specific mount arguments, passed through
 *				to mount_common()
 *		syscall_flags	generic MNT_* mount flags
 *		kern_flags	KERNEL_MOUNT_* flags (sanitized here)
 *		ctx		context of the mount
 *
 * Returns:	0		Success
 *		!0		errno from namei() or mount_common()
 *
 * Note: if the caller supplies 'vp', it (and 'pvp') must already be
 * iocounted and remain owned by the caller; when the lookup is done here,
 * the iocounts taken by namei() are dropped before returning.
 */
__private_extern__
int
kernel_mount(const char *fstype, vnode_t pvp, vnode_t vp, const char *path,
    void *data, __unused size_t datalen, int syscall_flags, uint32_t kern_flags,
    vfs_context_t ctx)
{
	struct nameidata nd;
	boolean_t did_namei;
	int error;

	/* Path is in system space; audit it as the primary vnode path. */
	NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(path), ctx);
	if (syscall_flags & MNT_NOFOLLOW) {
		/* Caller asked that no symlink anywhere in 'path' be followed. */
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}

	/* Strip any kernel-mount flags callers are not allowed to pass in. */
	kern_flags &= KERNEL_MOUNT_SANITIZE_MASK;

	/*
	 * Get the vnode to be covered if it's not supplied
	 */
	if (vp == NULLVP) {
		error = namei(&nd);
		if (error) {
			/* Log lookup failures for snapshot / volume-by-role mounts. */
			if (kern_flags & (KERNEL_MOUNT_SNAPSHOT | KERNEL_MOUNT_VOLBYROLE_MASK)) {
				printf("failed to locate mount-on path: %s ", path);
			}
			return error;
		}
		vp = nd.ni_vp;
		pvp = nd.ni_dvp;
		did_namei = TRUE;
	} else {
		/*
		 * Caller supplied the vnodes; just seed the component name
		 * with the provided path for mount_common()'s benefit.
		 */
		char *pnbuf = CAST_DOWN(char *, path);

		nd.ni_cnd.cn_pnbuf = pnbuf;
		nd.ni_cnd.cn_pnlen = (int)(strlen(pnbuf) + 1);
		did_namei = FALSE;
	}

	/* Mark this as an in-kernel mount for mount_common(). */
	kern_flags |= KERNEL_MOUNT_KMOUNT;
	error = mount_common(fstype, pvp, vp, &nd.ni_cnd, CAST_USER_ADDR_T(data),
	    syscall_flags, kern_flags, NULL, ctx);

	/* Drop the iocounts taken by namei() (caller owns them otherwise). */
	if (did_namei) {
		vnode_put(vp);
		vnode_put(pvp);
		nameidone(&nd);
	}

	return error;
}
350
351 int
vfs_mount_at_path(const char * fstype,const char * path,vnode_t pvp,vnode_t vp,void * data,size_t datalen,int mnt_flags,int flags)352 vfs_mount_at_path(const char *fstype, const char *path,
353 vnode_t pvp, vnode_t vp, void *data, size_t datalen,
354 int mnt_flags, int flags)
355 {
356 int syscall_flags = MNT_AUTOMOUNTED | mnt_flags;
357 int error, km_flags = 0;
358 vfs_context_t ctx = (flags & VFS_MOUNT_FLAG_CURRENT_CONTEXT) ? vfs_context_current() : vfs_context_kernel();
359
360 /*
361 * This call is currently restricted to specific use cases.
362 */
363 if ((strcmp(fstype, "lifs") != 0) && (strcmp(fstype, "nfs") != 0)) {
364 return ENOTSUP;
365 }
366
367 #if !defined(XNU_TARGET_OS_OSX)
368 if (strcmp(fstype, "lifs") == 0) {
369 syscall_flags |= MNT_NOEXEC;
370 }
371 #endif
372
373 if (flags & VFS_MOUNT_FLAG_NOAUTH) {
374 km_flags |= KERNEL_MOUNT_NOAUTH;
375 }
376 if (flags & VFS_MOUNT_FLAG_PERMIT_UNMOUNT) {
377 km_flags |= KERNEL_MOUNT_PERMIT_UNMOUNT;
378 }
379
380 error = kernel_mount(fstype, pvp, vp, path, data, datalen,
381 syscall_flags, km_flags, ctx);
382 if (error) {
383 printf("%s: mount on %s failed, error %d\n", __func__, path,
384 error);
385 }
386
387 return error;
388 }
389
390 /*
391 * Mount a file system.
392 */
393 /* ARGSUSED */
394 int
mount(proc_t p,struct mount_args * uap,__unused int32_t * retval)395 mount(proc_t p, struct mount_args *uap, __unused int32_t *retval)
396 {
397 struct __mac_mount_args muap;
398
399 muap.type = uap->type;
400 muap.path = uap->path;
401 muap.flags = uap->flags;
402 muap.data = uap->data;
403 muap.mac_p = USER_ADDR_NULL;
404 return __mac_mount(p, &muap, retval);
405 }
406
/*
 * fmount:
 *	Mount a file system on the directory referenced by a file descriptor
 *	(the fd-based variant of mount(2)).
 *
 * Parameters:	uap->fd		descriptor for the vnode to be covered
 *		uap->type	file system type name (user address)
 *		uap->flags	generic MNT_* mount flags
 *		uap->data	fs-specific mount arguments (user address)
 *
 * Returns:	0		Success
 *		ENOTSUP		image-source / rootfs flags not supported here
 *		EPERM		union mounts not permitted via fmount
 *		EBUSY		fd's vnode has no parent and is already a
 *				mount point or a fs root
 *		EINVAL		fd's vnode has no parent otherwise
 *		!0		errno from copyinstr/file_vnode/
 *				vnode_getwithref/vn_getpath/mount_common
 */
int
fmount(__unused proc_t p, struct fmount_args *uap, __unused int32_t *retval)
{
	struct componentname cn;
	vfs_context_t ctx = vfs_context_current();
	size_t dummy = 0;
	int error;
	int flags = uap->flags;
	char fstypename[MFSNAMELEN];
	char *labelstr = NULL; /* regular mount call always sets it to NULL for __mac_mount() */
	vnode_t pvp;
	vnode_t vp;

	AUDIT_ARG(fd, uap->fd);
	AUDIT_ARG(fflags, flags);
	/* fstypename will get audited by mount_common */

	/* Sanity check the flags */
	if (flags & (MNT_IMGSRC_BY_INDEX | MNT_ROOTFS)) {
		return ENOTSUP;
	}

	if (flags & MNT_UNION) {
		return EPERM;
	}

	error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy);
	if (error) {
		return error;
	}

	/* Resolve the fd to its vnode; file_drop() must pair with this on all paths. */
	if ((error = file_vnode(uap->fd, &vp)) != 0) {
		return error;
	}

	/* Take an iocount on the vnode for the duration of the mount. */
	if ((error = vnode_getwithref(vp)) != 0) {
		file_drop(uap->fd);
		return error;
	}

	/* mount_common() needs the parent of the covered vnode. */
	pvp = vnode_getparent(vp);
	if (pvp == NULL) {
		/* Distinguish "already covered / fs root" from "no parent". */
		if (vp->v_mountedhere || (vp->v_flag & VROOT) != 0) {
			error = EBUSY;
		} else {
			error = EINVAL;
		}
		vnode_put(vp);
		file_drop(uap->fd);
		return error;
	}

	/* Build a component name from the vnode's path for mount_common(). */
	memset(&cn, 0, sizeof(struct componentname));
	cn.cn_pnbuf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	cn.cn_pnlen = MAXPATHLEN;

	if ((error = vn_getpath(vp, cn.cn_pnbuf, &cn.cn_pnlen)) != 0) {
		zfree(ZV_NAMEI, cn.cn_pnbuf);
		vnode_put(pvp);
		vnode_put(vp);
		file_drop(uap->fd);
		return error;
	}

	error = mount_common(fstypename, pvp, vp, &cn, uap->data, flags, KERNEL_MOUNT_FMOUNT, labelstr, ctx);

	/* Release path buffer, both iocounts, and the fd reference. */
	zfree(ZV_NAMEI, cn.cn_pnbuf);
	vnode_put(pvp);
	vnode_put(vp);
	file_drop(uap->fd);

	return error;
}
480
481 #define MAX_GRAFT_METADATA_SIZE 16384 /* bytes */
482
483 /*
484 * Get the size of a graft file (a manifest or payload file).
485 * The vp should be an iocounted vnode.
486 */
487 static int
get_and_verify_graft_metadata_vp_size(vnode_t graft_vp,vfs_context_t vctx,size_t * size)488 get_and_verify_graft_metadata_vp_size(vnode_t graft_vp, vfs_context_t vctx, size_t *size)
489 {
490 struct stat64 sb = {};
491 int error;
492
493 *size = 0;
494
495 error = vn_stat(graft_vp, &sb, NULL, 1, 0, vctx);
496 if (error) {
497 return error;
498 }
499
500 if (sb.st_size == 0) {
501 error = ENODATA;
502 } else if ((size_t) sb.st_size > MAX_GRAFT_METADATA_SIZE) {
503 error = EFBIG;
504 } else {
505 *size = (size_t) sb.st_size;
506 }
507
508 return error;
509 }
510
511 /*
512 * Read in a graft file (a manifest or payload file) of size `size` into `buf`.
513 * `size` must already be validated.
514 */
515 static int
read_graft_metadata_vp(vnode_t graft_vp,vfs_context_t vctx,size_t size,void * buf)516 read_graft_metadata_vp(vnode_t graft_vp, vfs_context_t vctx, size_t size, void *buf)
517 {
518 return vn_rdwr(UIO_READ, graft_vp,
519 (caddr_t) buf, (int) size, /* offset */ 0,
520 UIO_SYSSPACE, IO_NOCACHE | IO_RAOFF | IO_UNIT,
521 vfs_context_ucred(vctx), /* resid */ NULL,
522 vfs_context_proc(vctx));
523 }
524
525 /*
526 * Convert a single graft file descriptor into a vnode, get its size (saving it to `size`),
527 * and read it into `buf`.
528 * If `path_prefix` is non-NULL, verify that the file path has that prefix.
529 */
/*
 * graft_secureboot_read_fd:
 *	Resolve one graft file descriptor (manifest or payload) to a vnode,
 *	optionally verify its path begins with `path_prefix`, validate and
 *	record its size in `*size`, and read its contents into `buf`.
 *
 * Returns:	0		Success
 *		EINVAL		path does not start with `path_prefix`
 *		!0		errno from vnode_getfromfd/vn_getpath/
 *				size validation/read
 *
 * `buf` must be at least MAX_GRAFT_METADATA_SIZE bytes.
 */
static int
graft_secureboot_read_fd(int fd, vfs_context_t vctx, const char *path_prefix, size_t *size, void *buf)
{
	vnode_t metadata_vp = NULLVP;
	char *path = NULL;
	int error;

	// Convert this graft fd to a vnode.
	if ((error = vnode_getfromfd(vctx, fd, &metadata_vp)) != 0) {
		goto out;
	}

	// Verify that the vnode path starts with `path_prefix` if it was passed.
	if (path_prefix) {
		int len = MAXPATHLEN;
		path = zalloc(ZV_NAMEI);
		if ((error = vn_getpath(metadata_vp, path, &len))) {
			goto out;
		}
		if (strncmp(path, path_prefix, strlen(path_prefix))) {
			error = EINVAL;
			goto out;
		}
	}

	// Get (and validate) size information.
	if ((error = get_and_verify_graft_metadata_vp_size(metadata_vp, vctx, size)) != 0) {
		goto out;
	}

	// Read each file into the provided buffer - we must get the expected amount of bytes.
	if ((error = read_graft_metadata_vp(metadata_vp, vctx, *size, buf)) != 0) {
		goto out;
	}

out:
	/* Release the path buffer (only allocated when path_prefix was given). */
	if (path) {
		zfree(ZV_NAMEI, path);
	}
	/* Drop the iocount taken by vnode_getfromfd(). */
	if (metadata_vp) {
		vnode_put(metadata_vp);
		metadata_vp = NULLVP;
	}

	return error;
}
576
577 #if XNU_TARGET_OS_OSX
578 #if defined(__arm64e__)
579 #define MOBILE_ASSET_DATA_VAULT_PATH "/System/Library/AssetsV2/manifests/"
580 #else /* x86_64 */
581 #define MOBILE_ASSET_DATA_VAULT_PATH "/System/Library/AssetsV2/"
582 #endif /* x86_64 */
583 #else /* !XNU_TARGET_OS_OSX */
584 #define MOBILE_ASSET_DATA_VAULT_PATH "/private/var/MobileAsset/AssetsV2/manifests/"
585 #endif /* !XNU_TARGET_OS_OSX */
586
587 /*
588 * Read graft file descriptors into buffers of size MAX_GRAFT_METADATA_SIZE
589 * provided in `gfs`, saving the size of data read in `gfs`.
590 */
/*
 * graft_secureboot_read_metadata:
 *	Read the authentic manifest and payload file descriptors from
 *	`sbc_args` into the caller-provided buffers in `gfs`, recording the
 *	bytes read in the corresponding *_size fields.  For Mobile Asset
 *	grafts the manifest must come from the data-vault path prefix.
 *
 * Returns:	0		Success
 *		!0		errno from graft_secureboot_read_fd()
 */
static int
graft_secureboot_read_metadata(uint32_t graft_type, secure_boot_cryptex_args_t *sbc_args,
    vfs_context_t vctx, fsioc_graft_fs_t *gfs)
{
	const char *manifest_path_prefix = NULL;
	int error;

	// For Mobile Asset, make sure that the manifest comes from a data vault.
	if (graft_type == GRAFTDMG_CRYPTEX_MOBILE_ASSET) {
		manifest_path_prefix = MOBILE_ASSET_DATA_VAULT_PATH;
	}

	// Read the authentic manifest.
	if ((error = graft_secureboot_read_fd(sbc_args->sbc_authentic_manifest_fd, vctx,
	    manifest_path_prefix, &gfs->authentic_manifest_size, gfs->authentic_manifest))) {
		return error;
	}

	// The user manifest is currently unused, but set its size.
	gfs->user_manifest_size = 0;

	// Read the payload.
	if ((error = graft_secureboot_read_fd(sbc_args->sbc_payload_fd, vctx,
	    NULL, &gfs->payload_size, gfs->payload))) {
		return error;
	}

	return 0;
}
620
621 /*
622 * Call into the filesystem to verify and graft a cryptex.
623 */
/*
 * graft_secureboot_cryptex:
 *	Pre-flight the secure-boot graft arguments, read the manifest and
 *	payload fds into bounded kernel buffers, translate the SBC_* flags
 *	into FSCTL_GRAFT_* flags, and ask the file system (FSIOC_GRAFT_FS)
 *	to verify and perform the graft.
 *
 * Parameters:	graft_type	GRAFTDMG_CRYPTEX_* type (validated by caller)
 *		sbc_args	user-supplied secure-boot cryptex arguments
 *		vctx		context of the caller
 *		cryptex_vp	iocounted vnode of the cryptex disk image
 *		mounton_vp	iocounted vnode of the graft target dir, or NULL
 *
 * Returns:	0		Success
 *		ENOTSUP		argument-struct version mismatch
 *		EXDEV		cryptex and target are on different volumes
 *		ENOTDIR		target is not a directory
 *		ENODEV		cryptex lives on a virtual (disk-image) device
 *		EINVAL		missing manifest or payload fd
 *		ENOMEM		staging buffer allocation failed
 *		!0		errno from vn_stat/read/VNOP_IOCTL
 */
static int
graft_secureboot_cryptex(uint32_t graft_type, secure_boot_cryptex_args_t *sbc_args,
    vfs_context_t vctx, vnode_t cryptex_vp, vnode_t mounton_vp)
{
	fsioc_graft_fs_t gfs = {};
	uint64_t graft_dir_ino = 0;
	struct stat64 sb = {};
	int error;

	// Pre-flight arguments.
	if (sbc_args->sbc_version != GRAFTDMG_SECURE_BOOT_CRYPTEX_ARGS_VERSION) {
		// Make sure that this graft version matches what we support.
		return ENOTSUP;
	} else if (mounton_vp && cryptex_vp->v_mount != mounton_vp->v_mount) {
		// For this type, cryptex VP must live on same volume as the target of graft.
		return EXDEV;
	} else if (mounton_vp && mounton_vp->v_type != VDIR) {
		// We cannot graft upon non-directories.
		return ENOTDIR;
	} else if (cryptex_vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV) {
		// We do not allow grafts inside disk images.
		return ENODEV;
	} else if (sbc_args->sbc_authentic_manifest_fd < 0 ||
	    sbc_args->sbc_payload_fd < 0) {
		// We cannot graft without a manifest and payload.
		return EINVAL;
	}

	if (mounton_vp) {
		// Get the mounton's inode number.
		error = vn_stat(mounton_vp, &sb, NULL, 1, 0, vctx);
		if (error) {
			return error;
		}
		graft_dir_ino = (uint64_t) sb.st_ino;
	}

	// Create buffers (of our maximum-defined size) to store authentication info.
	gfs.authentic_manifest = kalloc_data(MAX_GRAFT_METADATA_SIZE, Z_WAITOK | Z_ZERO);
	gfs.payload = kalloc_data(MAX_GRAFT_METADATA_SIZE, Z_WAITOK | Z_ZERO);

	if (!gfs.authentic_manifest || !gfs.payload) {
		error = ENOMEM;
		goto out;
	}

	// Read our fd's into our buffers.
	// (Note that this will set the buffer size fields in `gfs`.)
	error = graft_secureboot_read_metadata(graft_type, sbc_args, vctx, &gfs);
	if (error) {
		goto out;
	}

	// Translate caller-visible SBC_* flags into the FSCTL_GRAFT_* flags
	// understood by the file system.
	gfs.graft_version = FSIOC_GRAFT_VERSION;
	gfs.graft_type = graft_type;
	gfs.graft_4cc = sbc_args->sbc_4cc;
	if (sbc_args->sbc_flags & SBC_PRESERVE_MOUNT) {
		gfs.graft_flags |= FSCTL_GRAFT_PRESERVE_MOUNT;
	}
	if (sbc_args->sbc_flags & SBC_ALTERNATE_SHARED_REGION) {
		gfs.graft_flags |= FSCTL_GRAFT_ALTERNATE_SHARED_REGION;
	}
	if (sbc_args->sbc_flags & SBC_SYSTEM_CONTENT) {
		gfs.graft_flags |= FSCTL_GRAFT_SYSTEM_CONTENT;
	}
	if (sbc_args->sbc_flags & SBC_PANIC_ON_AUTHFAIL) {
		gfs.graft_flags |= FSCTL_GRAFT_PANIC_ON_AUTHFAIL;
	}
	if (sbc_args->sbc_flags & SBC_STRICT_AUTH) {
		gfs.graft_flags |= FSCTL_GRAFT_STRICT_AUTH;
	}
	if (sbc_args->sbc_flags & SBC_PRESERVE_GRAFT) {
		gfs.graft_flags |= FSCTL_GRAFT_PRESERVE_GRAFT;
	}
	gfs.dir_ino = graft_dir_ino; // ino from mounton_vp (if not provided, the parent directory)

	// Call into the FS to perform the graft (and validation).
	error = VNOP_IOCTL(cryptex_vp, FSIOC_GRAFT_FS, (caddr_t)&gfs, 0, vctx);

out:
	// Free the staging buffers on all paths.
	if (gfs.authentic_manifest) {
		kfree_data(gfs.authentic_manifest, MAX_GRAFT_METADATA_SIZE);
		gfs.authentic_manifest = NULL;
	}
	if (gfs.payload) {
		kfree_data(gfs.payload, MAX_GRAFT_METADATA_SIZE);
		gfs.payload = NULL;
	}

	return error;
}
715
716 #define GRAFTDMG_ENTITLEMENT "com.apple.private.vfs.graftdmg"
717
718 /*
719 * Graft a cryptex disk image (via FD) onto the appropriate mount-point
720 * { int graftdmg(int dmg_fd, const char *mountdir, uint32_t graft_type, graftdmg_args_un *gda); }
721 */
/*
 * graftdmg:
 *	System call entry point: graft a cryptex disk image (by fd) onto
 *	either the directory named by uap->mountdir or, when no directory is
 *	given, a target chosen by the file system.  Requires the private
 *	graftdmg entitlement.
 *
 * Returns:	0		Success
 *		EPERM		caller lacks GRAFTDMG_ENTITLEMENT
 *		EINVAL		graft_type is 0 or out of range
 *		!0		errno from copyin/namei/vnode_getfromfd/
 *				graft_secureboot_cryptex
 */
int
graftdmg(__unused proc_t p, struct graftdmg_args *uap, __unused int32_t *retval)
{
	int ua_dmgfd = uap->dmg_fd;
	user_addr_t ua_mountdir = uap->mountdir;
	uint32_t ua_grafttype = uap->graft_type;
	user_addr_t ua_graftargs = uap->gda;

	graftdmg_args_un kern_gda = {};
	int error = 0;
	secure_boot_cryptex_args_t *sbc_args = NULL;

	vnode_t cryptex_vp = NULLVP;
	vnode_t mounton_vp = NULLVP;
	struct nameidata nd = {};
	vfs_context_t ctx = vfs_context_current();

	/* Entitlement gate: private VFS interface. */
	if (!IOTaskHasEntitlement(vfs_context_task(ctx), GRAFTDMG_ENTITLEMENT)) {
		return EPERM;
	}

	error = copyin(ua_graftargs, &kern_gda, sizeof(graftdmg_args_un));
	if (error) {
		return error;
	}

	// Copy mount dir in, if provided.
	if (ua_mountdir != USER_ADDR_NULL) {
		// Acquire vnode for mount-on path
		NDINIT(&nd, LOOKUP, OP_MOUNT, (FOLLOW | AUDITVNPATH1),
		    UIO_USERSPACE, ua_mountdir, ctx);

		error = namei(&nd);
		if (error) {
			return error;
		}
		mounton_vp = nd.ni_vp;
	}

	// Convert fd to vnode.
	error = vnode_getfromfd(ctx, ua_dmgfd, &cryptex_vp);
	if (error) {
		goto graftout;
	}

	/* Validate the graft type, then hand off to the secure-boot path. */
	if (ua_grafttype == 0 || ua_grafttype > GRAFTDMG_CRYPTEX_MAX) {
		error = EINVAL;
	} else {
		sbc_args = &kern_gda.sbc_args;
		error = graft_secureboot_cryptex(ua_grafttype, sbc_args, ctx, cryptex_vp, mounton_vp);
	}

graftout:
	/* Drop iocounts; nameidone() pairs with the namei() above. */
	if (cryptex_vp) {
		vnode_put(cryptex_vp);
		cryptex_vp = NULLVP;
	}
	if (mounton_vp) {
		vnode_put(mounton_vp);
		mounton_vp = NULLVP;
	}
	if (ua_mountdir != USER_ADDR_NULL) {
		nameidone(&nd);
	}

	return error;
}
789
790 /*
791 * Ungraft a cryptex disk image (via mount dir FD)
792 * { int ungraftdmg(const char *mountdir, uint64_t flags); }
793 */
794 int
ungraftdmg(__unused proc_t p,struct ungraftdmg_args * uap,__unused int32_t * retval)795 ungraftdmg(__unused proc_t p, struct ungraftdmg_args *uap, __unused int32_t *retval)
796 {
797 int error = 0;
798 user_addr_t ua_mountdir = uap->mountdir;
799 fsioc_ungraft_fs_t ugfs;
800 vnode_t mounton_vp = NULLVP;
801 struct nameidata nd = {};
802 vfs_context_t ctx = vfs_context_current();
803
804 if (!IOTaskHasEntitlement(vfs_context_task(ctx), GRAFTDMG_ENTITLEMENT)) {
805 return EPERM;
806 }
807
808 if (uap->flags != 0 || ua_mountdir == USER_ADDR_NULL) {
809 return EINVAL;
810 }
811
812 ugfs.ungraft_flags = 0;
813
814 // Acquire vnode for mount-on path
815 NDINIT(&nd, LOOKUP, OP_MOUNT, (FOLLOW | AUDITVNPATH1),
816 UIO_USERSPACE, ua_mountdir, ctx);
817
818 error = namei(&nd);
819 if (error) {
820 return error;
821 }
822 mounton_vp = nd.ni_vp;
823
824 // Call into the FS to perform the ungraft
825 error = VNOP_IOCTL(mounton_vp, FSIOC_UNGRAFT_FS, (caddr_t)&ugfs, 0, ctx);
826
827 vnode_put(mounton_vp);
828 nameidone(&nd);
829
830 return error;
831 }
832
833
/*
 * vfs_notify_mount:
 *	Post notifications after a mount: signal a VQ_MOUNT vfs event to all
 *	watchers and deliver a NOTE_WRITE knote on the parent of the newly
 *	covered vnode.
 */
void
vfs_notify_mount(vnode_t pdvp)
{
	vfs_event_signal(NULL, VQ_MOUNT, (intptr_t)NULL);
	lock_vnode_and_post(pdvp, NOTE_WRITE);
}
840
841 /*
842 * __mac_mount:
843 * Mount a file system taking into account MAC label behavior.
844 * See mount(2) man page for more information
845 *
846 * Parameters: p Process requesting the mount
847 * uap User argument descriptor (see below)
848 * retval (ignored)
849 *
850 * Indirect: uap->type Filesystem type
851 * uap->path Path to mount
852 * uap->data Mount arguments
853 * uap->mac_p MAC info
854 * uap->flags Mount flags
855 *
856 *
857 * Returns: 0 Success
858 * !0 Not success
859 */
860 boolean_t root_fs_upgrade_try = FALSE;
861
862 #define MAX_NESTED_UNION_MOUNTS 10
863
864 int
__mac_mount(struct proc * p,register struct __mac_mount_args * uap,__unused int32_t * retval)865 __mac_mount(struct proc *p, register struct __mac_mount_args *uap, __unused int32_t *retval)
866 {
867 vnode_t pvp = NULLVP;
868 vnode_t vp = NULLVP;
869 int need_nameidone = 0;
870 vfs_context_t ctx = vfs_context_current();
871 char fstypename[MFSNAMELEN];
872 struct nameidata nd;
873 size_t dummy = 0;
874 char *labelstr = NULL;
875 size_t labelsz = 0;
876 int flags = uap->flags;
877 int error;
878 int num_retries = 0;
879 #if CONFIG_IMGSRC_ACCESS || CONFIG_MACF
880 boolean_t is_64bit = IS_64BIT_PROCESS(p);
881 #else
882 #pragma unused(p)
883 #endif
884 /*
885 * Get the fs type name from user space
886 */
887 error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy);
888 if (error) {
889 return error;
890 }
891
892 retry:
893 /*
894 * Get the vnode to be covered
895 */
896 NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
897 UIO_USERSPACE, uap->path, ctx);
898 if (flags & MNT_NOFOLLOW) {
899 nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
900 }
901 error = namei(&nd);
902 if (error) {
903 goto out;
904 }
905 need_nameidone = 1;
906 vp = nd.ni_vp;
907 pvp = nd.ni_dvp;
908
909 #ifdef CONFIG_IMGSRC_ACCESS
910 /* Mounting image source cannot be batched with other operations */
911 if (flags == MNT_IMGSRC_BY_INDEX) {
912 error = relocate_imageboot_source(pvp, vp, &nd.ni_cnd, fstypename,
913 ctx, is_64bit, uap->data, (flags == MNT_IMGSRC_BY_INDEX));
914 goto out;
915 }
916 #endif /* CONFIG_IMGSRC_ACCESS */
917
918 #if CONFIG_MACF
919 /*
920 * Get the label string (if any) from user space
921 */
922 if (uap->mac_p != USER_ADDR_NULL) {
923 struct user_mac mac;
924 size_t ulen = 0;
925
926 if (is_64bit) {
927 struct user64_mac mac64;
928 error = copyin(uap->mac_p, &mac64, sizeof(mac64));
929 mac.m_buflen = (user_size_t)mac64.m_buflen;
930 mac.m_string = (user_addr_t)mac64.m_string;
931 } else {
932 struct user32_mac mac32;
933 error = copyin(uap->mac_p, &mac32, sizeof(mac32));
934 mac.m_buflen = mac32.m_buflen;
935 mac.m_string = mac32.m_string;
936 }
937 if (error) {
938 goto out;
939 }
940 if ((mac.m_buflen > MAC_MAX_LABEL_BUF_LEN) ||
941 (mac.m_buflen < 2)) {
942 error = EINVAL;
943 goto out;
944 }
945 labelsz = mac.m_buflen;
946 labelstr = kalloc_data(labelsz, Z_WAITOK);
947 error = copyinstr(mac.m_string, labelstr, mac.m_buflen, &ulen);
948 if (error) {
949 goto out;
950 }
951 AUDIT_ARG(mac_string, labelstr);
952 }
953 #endif /* CONFIG_MACF */
954
955 AUDIT_ARG(fflags, flags);
956
957 if (flags & MNT_UNION) {
958 #if CONFIG_UNION_MOUNTS
959 mount_t mp = vp->v_mount;
960 int nested_union_mounts = 0;
961
962 name_cache_lock_shared();
963
964 /* Walk up the vnodecovered chain and check for nested union mounts. */
965 mp = (mp->mnt_vnodecovered ? mp->mnt_vnodecovered->v_mount : NULL);
966 while (mp) {
967 if (!(mp->mnt_flag & MNT_UNION)) {
968 break;
969 }
970 mp = (mp->mnt_vnodecovered ? mp->mnt_vnodecovered->v_mount : NULL);
971
972 /*
973 * Limit the max nested unon mounts to prevent stack exhaustion
974 * when calling lookup_traverse_union().
975 */
976 if (++nested_union_mounts >= MAX_NESTED_UNION_MOUNTS) {
977 error = ELOOP;
978 break;
979 }
980 }
981
982 name_cache_unlock();
983 if (error) {
984 goto out;
985 }
986 #else
987 error = EPERM;
988 goto out;
989 #endif /* CONFIG_UNION_MOUNTS */
990 }
991
992 if ((vp->v_flag & VROOT) &&
993 (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
994 #if CONFIG_UNION_MOUNTS
995 if (!(flags & MNT_UNION)) {
996 flags |= MNT_UPDATE;
997 } else {
998 /*
999 * For a union mount on '/', treat it as fresh
1000 * mount instead of update.
1001 * Otherwise, union mouting on '/' used to panic the
1002 * system before, since mnt_vnodecovered was found to
1003 * be NULL for '/' which is required for unionlookup
1004 * after it gets ENOENT on union mount.
1005 */
1006 flags = (flags & ~(MNT_UPDATE));
1007 }
1008 #else
1009 flags |= MNT_UPDATE;
1010 #endif /* CONFIG_UNION_MOUNTS */
1011
1012 #if SECURE_KERNEL
1013 if ((flags & MNT_RDONLY) == 0) {
1014 /* Release kernels are not allowed to mount "/" as rw */
1015 error = EPERM;
1016 goto out;
1017 }
1018 #endif
1019
1020 /*
1021 * See 7392553 for more details on why this check exists.
1022 * Suffice to say: If this check is ON and something tries
1023 * to mount the rootFS RW, we'll turn off the codesign
1024 * bitmap optimization.
1025 */
1026 #if CHECK_CS_VALIDATION_BITMAP
1027 if ((flags & MNT_RDONLY) == 0) {
1028 root_fs_upgrade_try = TRUE;
1029 }
1030 #endif
1031 }
1032
1033 error = mount_common(fstypename, pvp, vp, &nd.ni_cnd, uap->data, flags, 0,
1034 labelstr, ctx);
1035
1036 out:
1037
1038 #if CONFIG_MACF
1039 kfree_data(labelstr, labelsz);
1040 #endif /* CONFIG_MACF */
1041
1042 if (vp) {
1043 vnode_put(vp);
1044 vp = NULLVP;
1045 }
1046 if (pvp) {
1047 vnode_put(pvp);
1048 pvp = NULLVP;
1049 }
1050 if (need_nameidone) {
1051 nameidone(&nd);
1052 need_nameidone = 0;
1053 }
1054
1055 if (error == EBUSY) {
1056 /* Retry the lookup and mount again due to concurrent mounts. */
1057 if (++num_retries < MAX_MOUNT_RETRIES) {
1058 goto retry;
1059 }
1060 }
1061
1062 return error;
1063 }
1064
1065 /*
1066 * common mount implementation (final stage of mounting)
1067 *
1068 * Arguments:
1069 * fstypename file system type (ie it's vfs name)
1070 * pvp parent of covered vnode
1071 * vp covered vnode
1072 * cnp component name (ie path) of covered vnode
1073 * flags generic mount flags
1074 * fsmountargs file system specific data
1075 * labelstr optional MAC label
1076 * kernelmount TRUE for mounts initiated from inside the kernel
1077 * ctx caller's context
1078 */
1079 static int
mount_common(const char * fstypename,vnode_t pvp,vnode_t vp,struct componentname * cnp,user_addr_t fsmountargs,int flags,uint32_t internal_flags,char * labelstr,vfs_context_t ctx)1080 mount_common(const char *fstypename, vnode_t pvp, vnode_t vp,
1081 struct componentname *cnp, user_addr_t fsmountargs, int flags, uint32_t internal_flags,
1082 char *labelstr, vfs_context_t ctx)
1083 {
1084 #if !CONFIG_MACF
1085 #pragma unused(labelstr)
1086 #endif
1087 struct vnode *devvp = NULLVP;
1088 struct vnode *device_vnode = NULLVP;
1089 #if CONFIG_MACF
1090 struct vnode *rvp;
1091 #endif
1092 struct mount *mp = NULL;
1093 struct vfstable *vfsp = (struct vfstable *)0;
1094 struct proc *p = vfs_context_proc(ctx);
1095 int error, flag = 0;
1096 bool flag_set = false;
1097 user_addr_t devpath = USER_ADDR_NULL;
1098 int ronly = 0;
1099 int mntalloc = 0;
1100 boolean_t vfsp_ref = FALSE;
1101 boolean_t is_rwlock_locked = FALSE;
1102 boolean_t did_rele = FALSE;
1103 boolean_t have_usecount = FALSE;
1104 boolean_t did_set_lmount = FALSE;
1105 boolean_t did_set_vmount = FALSE;
1106 boolean_t kernelmount = !!(internal_flags & KERNEL_MOUNT_KMOUNT);
1107
1108 #if CONFIG_ROSV_STARTUP || CONFIG_MOUNT_VM || CONFIG_BASESYSTEMROOT
1109 /* Check for mutually-exclusive flag bits */
1110 uint32_t checkflags = (internal_flags & (KERNEL_MOUNT_VOLBYROLE_MASK | KERNEL_MOUNT_BASESYSTEMROOT));
1111 int bitcount = 0;
1112 while (checkflags != 0) {
1113 checkflags &= (checkflags - 1);
1114 bitcount++;
1115 }
1116
1117 if (bitcount > 1) {
1118 //not allowed to request multiple mount-by-role flags
1119 error = EINVAL;
1120 goto out1;
1121 }
1122 #endif
1123
1124 /*
1125 * Process an update for an existing mount
1126 */
1127 if (flags & MNT_UPDATE) {
1128 if ((vp->v_flag & VROOT) == 0) {
1129 error = EINVAL;
1130 goto out1;
1131 }
1132 mp = vp->v_mount;
1133
1134 /* if unmount or mount in progress, return error */
1135 mount_lock_spin(mp);
1136 if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
1137 mount_unlock(mp);
1138 error = EBUSY;
1139 goto out1;
1140 }
1141 mp->mnt_lflag |= MNT_LMOUNT;
1142 did_set_lmount = TRUE;
1143 mount_unlock(mp);
1144 lck_rw_lock_exclusive(&mp->mnt_rwlock);
1145 is_rwlock_locked = TRUE;
1146 /*
1147 * We only allow the filesystem to be reloaded if it
1148 * is currently mounted read-only.
1149 */
1150 if ((flags & MNT_RELOAD) &&
1151 ((mp->mnt_flag & MNT_RDONLY) == 0)) {
1152 error = ENOTSUP;
1153 goto out1;
1154 }
1155
1156 /*
1157 * If content protection is enabled, update mounts are not
1158 * allowed to turn it off.
1159 */
1160 if ((mp->mnt_flag & MNT_CPROTECT) &&
1161 ((flags & MNT_CPROTECT) == 0)) {
1162 error = EINVAL;
1163 goto out1;
1164 }
1165
1166 /*
1167 * can't turn off MNT_REMOVABLE either but it may be an unexpected
1168 * failure to return an error for this so we'll just silently
1169 * add it if it is not passed in.
1170 */
1171 if ((mp->mnt_flag & MNT_REMOVABLE) &&
1172 ((flags & MNT_REMOVABLE) == 0)) {
1173 flags |= MNT_REMOVABLE;
1174 }
1175
1176 /* Can't downgrade the backer of the root FS */
1177 if ((mp->mnt_kern_flag & MNTK_BACKS_ROOT) &&
1178 (!vfs_isrdonly(mp)) && (flags & MNT_RDONLY)) {
1179 error = ENOTSUP;
1180 goto out1;
1181 }
1182
1183 /*
1184 * Only root, or the user that did the original mount is
1185 * permitted to update it.
1186 */
1187 if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
1188 (error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
1189 goto out1;
1190 }
1191 #if CONFIG_MACF
1192 error = mac_mount_check_remount(ctx, mp, flags);
1193 if (error != 0) {
1194 goto out1;
1195 }
1196 #endif
1197 /*
1198 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV,
1199 * and MNT_NOEXEC if mount point is already MNT_NOEXEC.
1200 */
1201 if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
1202 flags |= MNT_NOSUID | MNT_NODEV;
1203 if (mp->mnt_flag & MNT_NOEXEC) {
1204 flags |= MNT_NOEXEC;
1205 }
1206 }
1207 flag = mp->mnt_flag;
1208 flag_set = true;
1209
1210
1211
1212 mp->mnt_flag |= flags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE);
1213
1214 vfsp = mp->mnt_vtable;
1215 goto update;
1216 } // MNT_UPDATE
1217
1218 /*
1219 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV, and
1220 * MNT_NOEXEC if mount point is already MNT_NOEXEC.
1221 */
1222 if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
1223 flags |= MNT_NOSUID | MNT_NODEV;
1224 if (vp->v_mount->mnt_flag & MNT_NOEXEC) {
1225 flags |= MNT_NOEXEC;
1226 }
1227 }
1228
1229 /* XXXAUDIT: Should we capture the type on the error path as well? */
1230 /* XXX cast-away const (audit_arg_text() does not modify its input) */
1231 AUDIT_ARG(text, (char *)(uintptr_t)fstypename);
1232 mount_list_lock();
1233 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
1234 if (!strncmp(vfsp->vfc_name, fstypename, MFSNAMELEN)) {
1235 vfsp->vfc_refcount++;
1236 vfsp_ref = TRUE;
1237 break;
1238 }
1239 }
1240 mount_list_unlock();
1241 if (vfsp == NULL) {
1242 error = ENODEV;
1243 goto out1;
1244 }
1245
1246 /*
1247 * VFC_VFSLOCALARGS is not currently supported for kernel mounts,
1248 * except in ROSV configs and for the initial BaseSystem root.
1249 */
1250 if (kernelmount && (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) &&
1251 ((internal_flags & KERNEL_MOUNT_VOLBYROLE_MASK) == 0) &&
1252 ((internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) == 0)) {
1253 error = EINVAL; /* unsupported request */
1254 goto out1;
1255 }
1256
1257 error = prepare_coveredvp(vp, ctx, cnp, fstypename, internal_flags);
1258 if (error != 0) {
1259 goto out1;
1260 }
1261
1262 /*
1263 * Upon successful of prepare_coveredvp(), VMOUNT is set for the covered vp.
1264 */
1265 did_set_vmount = TRUE;
1266
1267 /*
1268 * Allocate and initialize the filesystem (mount_t)
1269 */
1270 mp = zalloc_flags(mount_zone, Z_WAITOK | Z_ZERO);
1271 mntalloc = 1;
1272
1273 /* Initialize the default IO constraints */
1274 mp->mnt_maxreadcnt = mp->mnt_maxwritecnt = MAXPHYS;
1275 mp->mnt_segreadcnt = mp->mnt_segwritecnt = 32;
1276 mp->mnt_maxsegreadsize = mp->mnt_maxreadcnt;
1277 mp->mnt_maxsegwritesize = mp->mnt_maxwritecnt;
1278 mp->mnt_devblocksize = DEV_BSIZE;
1279 mp->mnt_alignmentmask = PAGE_MASK;
1280 mp->mnt_ioqueue_depth = MNT_DEFAULT_IOQUEUE_DEPTH;
1281 mp->mnt_ioscale = 1;
1282 mp->mnt_ioflags = 0;
1283 mp->mnt_realrootvp = NULLVP;
1284 mp->mnt_authcache_ttl = CACHED_LOOKUP_RIGHT_TTL;
1285
1286 mp->mnt_lflag |= MNT_LMOUNT;
1287 did_set_lmount = TRUE;
1288
1289 TAILQ_INIT(&mp->mnt_vnodelist);
1290 TAILQ_INIT(&mp->mnt_workerqueue);
1291 TAILQ_INIT(&mp->mnt_newvnodes);
1292 mount_lock_init(mp);
1293 lck_rw_lock_exclusive(&mp->mnt_rwlock);
1294 is_rwlock_locked = TRUE;
1295 mp->mnt_op = vfsp->vfc_vfsops;
1296 mp->mnt_vtable = vfsp;
1297 //mp->mnt_stat.f_type = vfsp->vfc_typenum;
1298 mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
1299 strlcpy(mp->mnt_vfsstat.f_fstypename, vfsp->vfc_name, MFSTYPENAMELEN);
1300 do {
1301 size_t pathlen = MAXPATHLEN;
1302
1303 if (vn_getpath_ext(vp, pvp, mp->mnt_vfsstat.f_mntonname, &pathlen, VN_GETPATH_FSENTER)) {
1304 strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);
1305 }
1306 } while (0);
1307 mp->mnt_vnodecovered = vp;
1308 mp->mnt_vfsstat.f_owner = kauth_cred_getuid(vfs_context_ucred(ctx));
1309 mp->mnt_throttle_mask = LOWPRI_MAX_NUM_DEV - 1;
1310 mp->mnt_devbsdunit = 0;
1311 mp->mnt_mount_id = os_atomic_inc_orig(&mount_unique_id, relaxed);
1312
1313 /* XXX 3762912 hack to support HFS filesystem 'owner' - filesystem may update later */
1314 vfs_setowner(mp, KAUTH_UID_NONE, KAUTH_GID_NONE);
1315
1316 if (kernelmount) {
1317 mp->mnt_kern_flag |= MNTK_KERNEL_MOUNT;
1318 }
1319 if ((internal_flags & KERNEL_MOUNT_PERMIT_UNMOUNT) != 0) {
1320 mp->mnt_kern_flag |= MNTK_PERMIT_UNMOUNT;
1321 }
1322
1323 if (KERNEL_MOUNT_DEVFS & internal_flags) {
1324 // kernel mounted devfs
1325 mp->mnt_kern_flag |= MNTK_SYSTEM;
1326 }
1327
1328 update:
1329
1330 /*
1331 * Set the mount level flags.
1332 */
1333 if (flags & MNT_RDONLY) {
1334 mp->mnt_flag |= MNT_RDONLY;
1335 } else if (mp->mnt_flag & MNT_RDONLY) {
1336 // disallow read/write upgrades of file systems that
1337 // had the TYPENAME_OVERRIDE feature set.
1338 if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
1339 error = EPERM;
1340 goto out1;
1341 }
1342 mp->mnt_kern_flag |= MNTK_WANTRDWR;
1343 }
1344 mp->mnt_flag &= ~(MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
1345 MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
1346 MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
1347 MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME | MNT_STRICTATIME |
1348 MNT_QUARANTINE | MNT_CPROTECT);
1349
1350 #if SECURE_KERNEL
1351 #if !CONFIG_MNT_SUID
1352 /*
1353 * On release builds of iOS based platforms, always enforce NOSUID on
1354 * all mounts. We do this here because we can catch update mounts as well as
1355 * non-update mounts in this case.
1356 */
1357 mp->mnt_flag |= (MNT_NOSUID);
1358 #endif
1359 #endif
1360
1361 mp->mnt_flag |= flags & (MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
1362 MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
1363 MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
1364 MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME | MNT_STRICTATIME |
1365 MNT_QUARANTINE | MNT_CPROTECT);
1366
1367 #if CONFIG_MACF
1368 if (flags & MNT_MULTILABEL) {
1369 if (vfsp->vfc_vfsflags & VFC_VFSNOMACLABEL) {
1370 error = EINVAL;
1371 goto out1;
1372 }
1373 mp->mnt_flag |= MNT_MULTILABEL;
1374 }
1375 #endif
1376 /*
1377 * Process device path for local file systems if requested.
1378 *
1379 * Snapshot and mount-by-role mounts do not use this path; they are
1380 * passing other opaque data in the device path field.
1381 *
1382 * Basesystemroot mounts pass a device path to be resolved here,
1383 * but it's just a char * already inside the kernel, which
1384 * kernel_mount() shoved into a user_addr_t to call us. So for such
1385 * mounts we must skip copyin (both of the address and of the string
1386 * (in NDINIT).
1387 */
1388 if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS &&
1389 !(internal_flags & (KERNEL_MOUNT_SNAPSHOT | KERNEL_MOUNT_VOLBYROLE_MASK))) {
1390 boolean_t do_copyin_devpath = true;
1391 #if CONFIG_BASESYSTEMROOT
1392 if (internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) {
1393 // KERNEL_MOUNT_BASESYSTEMROOT implies subtle behavior worh nothing:
1394 // We have been passed fsmountargs, which is typed as a user_addr_t,
1395 // but is actually a char ** pointing to a (kernelspace) string.
1396 // We manually unpack it with a series of casts and dereferences
1397 // that reverses what was done just above us on the stack in
1398 // imageboot_pivot_image().
1399 // After retrieving the path to the dev node (which we will NDINIT
1400 // in a moment), we pass NULL fsmountargs on to the filesystem.
1401 _Static_assert(sizeof(char **) == sizeof(fsmountargs), "fsmountargs should fit a (kernel) address");
1402 char **devnamepp = (char **)fsmountargs;
1403 char *devnamep = *devnamepp;
1404 devpath = CAST_USER_ADDR_T(devnamep);
1405 do_copyin_devpath = false;
1406 fsmountargs = USER_ADDR_NULL;
1407
1408 //Now that we have a mp, denote that this mount is for the basesystem.
1409 mp->mnt_supl_kern_flag |= MNTK_SUPL_BASESYSTEM;
1410 }
1411 #endif // CONFIG_BASESYSTEMROOT
1412
1413 if (do_copyin_devpath) {
1414 if (vfs_context_is64bit(ctx)) {
1415 if ((error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath)))) {
1416 goto out1;
1417 }
1418 fsmountargs += sizeof(devpath);
1419 } else {
1420 user32_addr_t tmp;
1421 if ((error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp)))) {
1422 goto out1;
1423 }
1424 /* munge into LP64 addr */
1425 devpath = CAST_USER_ADDR_T(tmp);
1426 fsmountargs += sizeof(tmp);
1427 }
1428 }
1429
1430 /* Lookup device and authorize access to it */
1431 if ((devpath)) {
1432 struct nameidata nd;
1433
1434 enum uio_seg seg = UIO_USERSPACE;
1435 #if CONFIG_BASESYSTEMROOT
1436 if (internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) {
1437 seg = UIO_SYSSPACE;
1438 }
1439 #endif // CONFIG_BASESYSTEMROOT
1440
1441 NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW, seg, devpath, ctx);
1442 if (flags & MNT_NOFOLLOW) {
1443 nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
1444 }
1445 if ((error = namei(&nd))) {
1446 goto out1;
1447 }
1448
1449 devvp = nd.ni_vp;
1450
1451 if (devvp->v_type != VBLK) {
1452 error = ENOTBLK;
1453 nameidone(&nd);
1454 goto out2;
1455 }
1456 if (major(devvp->v_rdev) >= nblkdev) {
1457 error = ENXIO;
1458 nameidone(&nd);
1459 goto out2;
1460 }
1461 /*
1462 * If mount by non-root, then verify that user has necessary
1463 * permissions on the device.
1464 */
1465 if (suser(vfs_context_ucred(ctx), NULL) != 0) {
1466 kauth_action_t accessmode = KAUTH_VNODE_READ_DATA;
1467
1468 if ((mp->mnt_flag & MNT_RDONLY) == 0) {
1469 accessmode |= KAUTH_VNODE_WRITE_DATA;
1470 }
1471 if ((error = vnode_authorize(devvp, NULL, accessmode, ctx)) != 0) {
1472 nameidone(&nd);
1473 goto out2;
1474 }
1475 }
1476
1477 strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);
1478 nameidone(&nd);
1479 }
1480 /* On first mount, preflight and open device */
1481 if (devpath && ((flags & MNT_UPDATE) == 0)) {
1482 if ((error = vnode_ref(devvp))) {
1483 goto out2;
1484 }
1485 /*
1486 * Disallow multiple mounts of the same device.
1487 * Disallow mounting of a device that is currently in use
1488 * (except for root, which might share swap device for miniroot).
1489 * Flush out any old buffers remaining from a previous use.
1490 */
1491 if ((error = vfs_setmounting(devvp))) {
1492 vnode_rele(devvp);
1493 goto out2;
1494 }
1495
1496 if (vcount(devvp) > 1 && !(vfs_flags(mp) & MNT_ROOTFS)) {
1497 error = EBUSY;
1498 goto out3;
1499 }
1500 if ((error = VNOP_FSYNC(devvp, MNT_WAIT, ctx))) {
1501 error = ENOTBLK;
1502 goto out3;
1503 }
1504 if ((error = buf_invalidateblks(devvp, BUF_WRITE_DATA, 0, 0))) {
1505 goto out3;
1506 }
1507
1508 ronly = (mp->mnt_flag & MNT_RDONLY) != 0;
1509 #if CONFIG_MACF
1510 error = mac_vnode_check_open(ctx,
1511 devvp,
1512 ronly ? FREAD : FREAD | FWRITE);
1513 if (error) {
1514 goto out3;
1515 }
1516 #endif /* MAC */
1517 if ((error = VNOP_OPEN(devvp, ronly ? FREAD : FREAD | FWRITE, ctx))) {
1518 goto out3;
1519 }
1520
1521 mp->mnt_devvp = devvp;
1522 device_vnode = devvp;
1523 } else if ((mp->mnt_flag & MNT_RDONLY) &&
1524 (mp->mnt_kern_flag & MNTK_WANTRDWR) &&
1525 (device_vnode = mp->mnt_devvp)) {
1526 dev_t dev;
1527 int maj;
1528 /*
1529 * If upgrade to read-write by non-root, then verify
1530 * that user has necessary permissions on the device.
1531 */
1532 vnode_getalways(device_vnode);
1533
1534 if (suser(vfs_context_ucred(ctx), NULL) &&
1535 (error = vnode_authorize(device_vnode, NULL,
1536 KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA,
1537 ctx)) != 0) {
1538 vnode_put(device_vnode);
1539 goto out2;
1540 }
1541
1542 /* Tell the device that we're upgrading */
1543 dev = (dev_t)device_vnode->v_rdev;
1544 maj = major(dev);
1545
1546 if ((u_int)maj >= (u_int)nblkdev) {
1547 panic("Volume mounted on a device with invalid major number.");
1548 }
1549
1550 error = bdevsw[maj].d_open(dev, FREAD | FWRITE, S_IFBLK, p);
1551 vnode_put(device_vnode);
1552 device_vnode = NULLVP;
1553 if (error != 0) {
1554 goto out2;
1555 }
1556 }
1557 } // localargs && !(snapshot | data | vm)
1558
1559 #if CONFIG_MACF
1560 if ((flags & MNT_UPDATE) == 0) {
1561 mac_mount_label_init(mp);
1562 mac_mount_label_associate(ctx, mp);
1563 }
1564 if (labelstr) {
1565 if ((flags & MNT_UPDATE) != 0) {
1566 error = mac_mount_check_label_update(ctx, mp);
1567 if (error != 0) {
1568 goto out3;
1569 }
1570 }
1571 }
1572 #endif
1573 /*
1574 * Mount the filesystem. We already asserted that internal_flags
1575 * cannot have more than one mount-by-role bit set.
1576 */
1577 if (internal_flags & KERNEL_MOUNT_SNAPSHOT) {
1578 error = VFS_IOCTL(mp, VFSIOC_MOUNT_SNAPSHOT,
1579 (caddr_t)fsmountargs, 0, ctx);
1580 } else if (internal_flags & KERNEL_MOUNT_DATAVOL) {
1581 #if CONFIG_ROSV_STARTUP
1582 struct mount *origin_mp = (struct mount*)fsmountargs;
1583 fs_role_mount_args_t frma = {origin_mp, VFS_DATA_ROLE};
1584 error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
1585 if (error) {
1586 printf("MOUNT-BY-ROLE (%d) failed! (%d)", VFS_DATA_ROLE, error);
1587 } else {
1588 /* Mark volume associated with system volume */
1589 mp->mnt_kern_flag |= MNTK_SYSTEM;
1590
1591 /* Attempt to acquire the mnt_devvp and set it up */
1592 struct vnode *mp_devvp = NULL;
1593 if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
1594 errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
1595 0, &mp_devvp, vfs_context_kernel());
1596 if (!lerr) {
1597 mp->mnt_devvp = mp_devvp;
1598 //vnode_lookup took an iocount, need to drop it.
1599 vnode_put(mp_devvp);
1600 // now set `device_vnode` to the devvp that was acquired.
1601 // this is needed in order to ensure vfs_init_io_attributes is invoked.
1602 // note that though the iocount above was dropped, the mount acquires
1603 // an implicit reference against the device.
1604 device_vnode = mp_devvp;
1605 }
1606 }
1607 }
1608 #else
1609 error = EINVAL;
1610 #endif
1611 } else if (internal_flags & KERNEL_MOUNT_VMVOL) {
1612 #if CONFIG_MOUNT_VM
1613 struct mount *origin_mp = (struct mount*)fsmountargs;
1614 fs_role_mount_args_t frma = {origin_mp, VFS_VM_ROLE};
1615 error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
1616 if (error) {
1617 printf("MOUNT-BY-ROLE (%d) failed! (%d)", VFS_VM_ROLE, error);
1618 } else {
1619 /* Mark volume associated with system volume and a swap mount */
1620 mp->mnt_kern_flag |= (MNTK_SYSTEM | MNTK_SWAP_MOUNT);
1621 /* Attempt to acquire the mnt_devvp and set it up */
1622 struct vnode *mp_devvp = NULL;
1623 if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
1624 errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
1625 0, &mp_devvp, vfs_context_kernel());
1626 if (!lerr) {
1627 mp->mnt_devvp = mp_devvp;
1628 //vnode_lookup took an iocount, need to drop it.
1629 vnode_put(mp_devvp);
1630
1631 // now set `device_vnode` to the devvp that was acquired.
1632 // note that though the iocount above was dropped, the mount acquires
1633 // an implicit reference against the device.
1634 device_vnode = mp_devvp;
1635 }
1636 }
1637 }
1638 #else
1639 error = EINVAL;
1640 #endif
1641 } else if ((internal_flags & KERNEL_MOUNT_PREBOOTVOL) || (internal_flags & KERNEL_MOUNT_RECOVERYVOL)) {
1642 #if CONFIG_MOUNT_PREBOOTRECOVERY
1643 struct mount *origin_mp = (struct mount*)fsmountargs;
1644 uint32_t mount_role = 0;
1645 if (internal_flags & KERNEL_MOUNT_PREBOOTVOL) {
1646 mount_role = VFS_PREBOOT_ROLE;
1647 } else if (internal_flags & KERNEL_MOUNT_RECOVERYVOL) {
1648 mount_role = VFS_RECOVERY_ROLE;
1649 }
1650
1651 if (mount_role != 0) {
1652 fs_role_mount_args_t frma = {origin_mp, mount_role};
1653 error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
1654 if (error) {
1655 printf("MOUNT-BY-ROLE (%d) failed! (%d)", mount_role, error);
1656 } else {
1657 // NOT YET - need to qualify how this interacts with shutdown, ERP/ERB, etc
1658 /* Mark volume associated with system volume */
1659 //mp->mnt_kern_flag |= MNTK_SYSTEM;
1660 /* Attempt to acquire the mnt_devvp and set it up */
1661 struct vnode *mp_devvp = NULL;
1662 if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
1663 errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
1664 0, &mp_devvp, vfs_context_kernel());
1665 if (!lerr) {
1666 mp->mnt_devvp = mp_devvp;
1667 //vnode_lookup took an iocount, need to drop it.
1668 vnode_put(mp_devvp);
1669
1670 // now set `device_vnode` to the devvp that was acquired.
1671 // note that though the iocount above was dropped, the mount acquires
1672 // an implicit reference against the device.
1673 device_vnode = mp_devvp;
1674 }
1675 }
1676 }
1677 } else {
1678 printf("MOUNT-BY-ROLE (%d) failed - ROLE UNRECOGNIZED! (%d)", mount_role, error);
1679 error = EINVAL;
1680 }
1681 #else
1682 error = EINVAL;
1683 #endif
1684 } else {
1685 error = VFS_MOUNT(mp, device_vnode, fsmountargs, ctx);
1686 }
1687
1688 if (flags & MNT_UPDATE) {
1689 if (mp->mnt_kern_flag & MNTK_WANTRDWR) {
1690 mp->mnt_flag &= ~MNT_RDONLY;
1691 }
1692 mp->mnt_flag &= ~
1693 (MNT_UPDATE | MNT_RELOAD | MNT_FORCE);
1694 mp->mnt_kern_flag &= ~MNTK_WANTRDWR;
1695 if (error) {
1696 mp->mnt_flag = flag; /* restore flag value */
1697 }
1698 vfs_event_signal(NULL, VQ_UPDATE, (intptr_t)NULL);
1699 lck_rw_done(&mp->mnt_rwlock);
1700 is_rwlock_locked = FALSE;
1701 if (!error) {
1702 enablequotas(mp, ctx);
1703 }
1704 goto exit;
1705 }
1706
1707 /*
1708 * Put the new filesystem on the mount list after root.
1709 */
1710 if (error == 0) {
1711 struct vfs_attr vfsattr;
1712 if (device_vnode) {
1713 /*
1714 * cache the IO attributes for the underlying physical media...
1715 * an error return indicates the underlying driver doesn't
1716 * support all the queries necessary... however, reasonable
1717 * defaults will have been set, so no reason to bail or care
1718 *
1719 * Need to do this before calling the MAC hook as it needs
1720 * information from this call.
1721 */
1722 vfs_init_io_attributes(device_vnode, mp);
1723 }
1724
1725 #if CONFIG_MACF
1726 error = mac_mount_check_mount_late(ctx, mp);
1727 if (error != 0) {
1728 goto out4;
1729 }
1730
1731 if (vfs_flags(mp) & MNT_MULTILABEL) {
1732 error = VFS_ROOT(mp, &rvp, ctx);
1733 if (error) {
1734 printf("%s() VFS_ROOT returned %d\n", __func__, error);
1735 goto out4;
1736 }
1737 error = vnode_label(mp, NULL, rvp, NULL, 0, ctx);
1738 /*
1739 * drop reference provided by VFS_ROOT
1740 */
1741 vnode_put(rvp);
1742
1743 if (error) {
1744 goto out4;
1745 }
1746 }
1747 #endif /* MAC */
1748
1749 vnode_lock_spin(vp);
1750 CLR(vp->v_flag, VMOUNT);
1751 vp->v_mountedhere = mp;
1752 SET(vp->v_flag, VMOUNTEDHERE);
1753
1754 /*
1755 * Wakeup any waiter(s) in prepare_coveredvp() that is waiting for the
1756 * 'v_mountedhere' to be planted.
1757 */
1758 wakeup(&vp->v_flag);
1759 vnode_unlock(vp);
1760
1761 /*
1762 * taking the name_cache_lock exclusively will
1763 * insure that everyone is out of the fast path who
1764 * might be trying to use a now stale copy of
1765 * vp->v_mountedhere->mnt_realrootvp
1766 * bumping mount_generation causes the cached values
1767 * to be invalidated
1768 */
1769 name_cache_lock();
1770 mount_generation++;
1771 name_cache_unlock();
1772
1773 error = vnode_ref(vp);
1774 if (error != 0) {
1775 goto out4;
1776 }
1777
1778 have_usecount = TRUE;
1779
1780 error = checkdirs(vp, ctx);
1781 if (error != 0) {
1782 /* Unmount the filesystem as cdir/rdirs cannot be updated */
1783 goto out4;
1784 }
1785 /*
1786 * there is no cleanup code here so I have made it void
1787 * we need to revisit this
1788 */
1789 (void)VFS_START(mp, 0, ctx);
1790
1791 if (mount_list_add(mp) != 0) {
1792 /*
1793 * The system is shutting down trying to umount
1794 * everything, so fail with a plausible errno.
1795 */
1796 error = EBUSY;
1797 goto out4;
1798 }
1799 lck_rw_done(&mp->mnt_rwlock);
1800 is_rwlock_locked = FALSE;
1801
1802 /* Check if this mounted file system supports EAs or named streams. */
1803 /* Skip WebDAV file systems for now since they hang in VFS_GETATTR here. */
1804 VFSATTR_INIT(&vfsattr);
1805 VFSATTR_WANTED(&vfsattr, f_capabilities);
1806 if (strncmp(mp->mnt_vfsstat.f_fstypename, "webdav", sizeof("webdav")) != 0 &&
1807 vfs_getattr(mp, &vfsattr, ctx) == 0 &&
1808 VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
1809 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR) &&
1810 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR)) {
1811 mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
1812 }
1813 #if NAMEDSTREAMS
1814 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS) &&
1815 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS)) {
1816 mp->mnt_kern_flag |= MNTK_NAMED_STREAMS;
1817 }
1818 #endif
1819 /* Check if this file system supports path from id lookups. */
1820 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID) &&
1821 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID)) {
1822 mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
1823 } else if (mp->mnt_flag & MNT_DOVOLFS) {
1824 /* Legacy MNT_DOVOLFS flag also implies path from id lookups. */
1825 mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
1826 }
1827
1828 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS) &&
1829 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS)) {
1830 mp->mnt_kern_flag |= MNTK_DIR_HARDLINKS;
1831 }
1832 }
1833 if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSNATIVEXATTR) {
1834 mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
1835 }
1836 if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSPREFLIGHT) {
1837 mp->mnt_kern_flag |= MNTK_UNMOUNT_PREFLIGHT;
1838 }
1839 /* increment the operations count */
1840 OSAddAtomic(1, &vfs_nummntops);
1841 enablequotas(mp, ctx);
1842
1843 if (device_vnode) {
1844 vfs_setmountedon(device_vnode);
1845 }
1846
1847 /* Now that mount is setup, notify the listeners */
1848 vfs_notify_mount(pvp);
1849 IOBSDMountChange(mp, kIOMountChangeMount);
1850 } else {
1851 /* If we fail a fresh mount, there should be no vnodes left hooked into the mountpoint. */
1852 if (mp->mnt_vnodelist.tqh_first != NULL) {
1853 panic("mount_common(): mount of %s filesystem failed with %d, but vnode list is not empty.",
1854 mp->mnt_vtable->vfc_name, error);
1855 }
1856
1857 vnode_lock_spin(vp);
1858 CLR(vp->v_flag, VMOUNT);
1859 /* Wakeup waiter(s) waiting for in-progress mount to finish. */
1860 wakeup(&vp->v_flag);
1861 vnode_unlock(vp);
1862 mount_list_lock();
1863 mp->mnt_vtable->vfc_refcount--;
1864 mount_list_unlock();
1865
1866 if (device_vnode) {
1867 vnode_rele(device_vnode);
1868 VNOP_CLOSE(device_vnode, ronly ? FREAD : FREAD | FWRITE, ctx);
1869 vfs_clearmounting(device_vnode);
1870 }
1871 lck_rw_done(&mp->mnt_rwlock);
1872 is_rwlock_locked = FALSE;
1873
1874 if (nc_smr_enabled) {
1875 vfs_smr_synchronize();
1876 }
1877
1878 /*
1879 * if we get here, we have a mount structure that needs to be freed,
1880 * but since the coveredvp hasn't yet been updated to point at it,
1881 * no need to worry about other threads holding a crossref on this mp
1882 * so it's ok to just free it
1883 */
1884 mount_lock_destroy(mp);
1885 #if CONFIG_MACF
1886 mac_mount_label_destroy(mp);
1887 #endif
1888 zfree(mount_zone, mp);
1889 did_set_lmount = false;
1890 }
1891 exit:
1892 /*
1893 * drop I/O count on the device vp if there was one
1894 */
1895 if (devpath && devvp) {
1896 vnode_put(devvp);
1897 }
1898
1899 if (did_set_lmount) {
1900 mount_lock_spin(mp);
1901 mp->mnt_lflag &= ~MNT_LMOUNT;
1902 mount_unlock(mp);
1903 }
1904
1905 return error;
1906
1907 /* Error condition exits */
1908 out4:
1909 (void)VFS_UNMOUNT(mp, MNT_FORCE, ctx);
1910
1911 /*
1912 * If the mount has been placed on the covered vp,
1913 * it may have been discovered by now, so we have
1914 * to treat this just like an unmount
1915 */
1916 mount_lock_spin(mp);
1917 mp->mnt_lflag |= MNT_LDEAD;
1918 mount_unlock(mp);
1919
1920 if (device_vnode != NULLVP) {
1921 vnode_rele(device_vnode);
1922 VNOP_CLOSE(device_vnode, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD | FWRITE,
1923 ctx);
1924 vfs_clearmounting(device_vnode);
1925 did_rele = TRUE;
1926 }
1927
1928 vnode_lock_spin(vp);
1929
1930 mp->mnt_crossref++;
1931 CLR(vp->v_flag, VMOUNTEDHERE);
1932 vp->v_mountedhere = (mount_t) 0;
1933
1934 vnode_unlock(vp);
1935
1936 if (have_usecount) {
1937 vnode_rele(vp);
1938 }
1939 out3:
1940 if (devpath && ((flags & MNT_UPDATE) == 0) && (!did_rele)) {
1941 vnode_rele(devvp);
1942 vfs_clearmounting(devvp);
1943 }
1944 out2:
1945 if (devpath && devvp) {
1946 vnode_put(devvp);
1947 }
1948 out1:
1949 /* Release mnt_rwlock only when it was taken */
1950 if (is_rwlock_locked == TRUE) {
1951 if (flag_set) {
1952 mp->mnt_flag = flag; /* restore mnt_flag value */
1953 }
1954 lck_rw_done(&mp->mnt_rwlock);
1955 }
1956
1957 if (did_set_lmount) {
1958 mount_lock_spin(mp);
1959 mp->mnt_lflag &= ~MNT_LMOUNT;
1960 mount_unlock(mp);
1961 }
1962
1963 if (did_set_vmount) {
1964 vnode_lock_spin(vp);
1965 CLR(vp->v_flag, VMOUNT);
1966 /* Wakeup waiter(s) waiting for in-progress mount to finish. */
1967 wakeup(&vp->v_flag);
1968 vnode_unlock(vp);
1969 }
1970
1971 if (mntalloc) {
1972 if (mp->mnt_crossref) {
1973 mount_dropcrossref(mp, vp, 0);
1974 } else {
1975 if (nc_smr_enabled) {
1976 vfs_smr_synchronize();
1977 }
1978
1979 mount_lock_destroy(mp);
1980 #if CONFIG_MACF
1981 mac_mount_label_destroy(mp);
1982 #endif
1983 zfree(mount_zone, mp);
1984 }
1985 }
1986 if (vfsp_ref) {
1987 mount_list_lock();
1988 vfsp->vfc_refcount--;
1989 mount_list_unlock();
1990 }
1991
1992 return error;
1993 }
1994
/*
 * Flush in-core data, check for competing mount attempts,
 * and set VMOUNT
 *
 * Marks the covered vnode 'vp' as having a mount in progress (VMOUNT),
 * after authorizing the caller and ensuring no competing mount is racing
 * us.  mount_common() later clears VMOUNT and plants v_mountedhere.
 *
 * Parameters:
 *  vp		candidate covered vnode (must be a VDIR)
 *  ctx		caller's context, used for ownership/MAC checks
 *  cnp		component name of vp (MAC check only)
 *  fsname	filesystem type name (MAC check only)
 *  internal_flags	KERNEL_MOUNT_* bits controlling auth and retry policy
 *
 * Returns:	0 with VMOUNT set on success; EPERM, ENOTDIR, EBUSY, or an
 *		error from fsync/buf invalidation otherwise.  On EBUSY the
 *		caller (mount path) may redo the lookup and retry.
 */
int
prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, uint32_t internal_flags)
{
#if !CONFIG_MACF
#pragma unused(cnp,fsname)
#endif
	struct vnode_attr va;
	int error;
	boolean_t skip_auth = !!(internal_flags & KERNEL_MOUNT_NOAUTH);
	boolean_t is_fmount = !!(internal_flags & KERNEL_MOUNT_FMOUNT);
	boolean_t is_kmount = !!(internal_flags & KERNEL_MOUNT_KMOUNT);

	if (!skip_auth) {
		/*
		 * If the user is not root, ensure that they own the directory
		 * onto which we are attempting to mount.
		 */
		VATTR_INIT(&va);
		VATTR_WANTED(&va, va_uid);
		if ((error = vnode_getattr(vp, &va, ctx)) ||
		    (va.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
		    (!vfs_context_issuser(ctx)))) {
			error = EPERM;
			goto out;
		}
	}

	/* Flush dirty data so the covered vnode is quiescent before mounting. */
	if ((error = VNOP_FSYNC(vp, MNT_WAIT, ctx))) {
		goto out;
	}

	if ((error = buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0))) {
		goto out;
	}

	if (vp->v_type != VDIR) {
		error = ENOTDIR;
		goto out;
	}

	vnode_lock_spin(vp);

	if (is_fmount && (ISSET(vp->v_flag, VMOUNT) || (vp->v_mountedhere != NULL))) {
		/* fmount callers fail immediately rather than wait/retry. */
		error = EBUSY;
	} else if (!is_kmount && (ISSET(vp->v_flag, VMOUNT) ||
	    (vp->v_mountedhere != NULL))) {
		/*
		 * For mount triggered from mount() call, we want to wait for the
		 * current in-progress mount to complete, redo lookup and retry the
		 * mount again. Similarly, we also want to retry if we lost the race
		 * due to concurrent mounts and the 'VMOUNT' flag has been cleared and
		 * 'v_mountedhere' has been planted after initial lookup.
		 */
		if (ISSET(vp->v_flag, VMOUNT)) {
			vnode_lock_convert(vp);
			/* sleep until mount_common() wakes &vp->v_flag; then EBUSY forces a retry */
			msleep(&vp->v_flag, &vp->v_lock, PVFS, "vnode_waitformount", NULL);
		}
		error = EBUSY;
	} else if (ISSET(vp->v_flag, VMOUNT) && (vp->v_mountedhere != NULL)) {
		/* kernel-initiated (non-fmount) mounts only fail when both are set */
		error = EBUSY;
	}

	if (error) {
		vnode_unlock(vp);
		goto out;
	}
	/* Claim the vnode: concurrent mounters now see VMOUNT and back off. */
	SET(vp->v_flag, VMOUNT);
	vnode_unlock(vp);

#if CONFIG_MACF
	error = mac_mount_check_mount(ctx, vp,
	    cnp, fsname);
	if (error != 0) {
		/* MAC veto: undo the VMOUNT claim and release any waiters. */
		vnode_lock_spin(vp);
		CLR(vp->v_flag, VMOUNT);
		/* Wakeup waiter(s) waiting for in-progress mount to finish. */
		wakeup(&vp->v_flag);
		vnode_unlock(vp);
	}
#endif

out:
	return error;
}
2083
2084 #if CONFIG_IMGSRC_ACCESS
2085
2086 #define DEBUG_IMGSRC 0
2087
2088 #if DEBUG_IMGSRC
2089 #define IMGSRC_DEBUG(args...) printf("imgsrc: " args)
2090 #else
2091 #define IMGSRC_DEBUG(args...) do { } while(0)
2092 #endif
2093
/*
 * Resolve 'devpath', verify it names the same block device that backs
 * 'mp', check the caller's access rights to the device, and update the
 * mount's f_mntfromname to the new path.
 *
 * On success *devvpp holds the device vnode with an iocount the caller
 * must drop via vnode_put().  On failure all references taken here are
 * released.
 */
static int
authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx)
{
	struct nameidata nd;
	vnode_t vp, realdevvp;
	kauth_action_t accessmode;
	int error;
	enum uio_seg uio = UIO_USERSPACE;

	/* A kernel context means 'devpath' is a kernel-space address. */
	if (ctx == vfs_context_kernel()) {
		uio = UIO_SYSSPACE;
	}

	NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW, uio, devpath, ctx);
	if ((error = namei(&nd))) {
		IMGSRC_DEBUG("namei() failed with %d\n", error);
		return error;
	}

	vp = nd.ni_vp;

	if (!vnode_isblk(vp)) {
		IMGSRC_DEBUG("Not block device.\n");
		error = ENOTBLK;
		goto out;
	}

	realdevvp = mp->mnt_devvp;
	if (realdevvp == NULLVP) {
		IMGSRC_DEBUG("No device backs the mount.\n");
		error = ENXIO;
		goto out;
	}

	error = vnode_getwithref(realdevvp);
	if (error != 0) {
		IMGSRC_DEBUG("Coudn't get iocount on device.\n");
		goto out;
	}

	/* The supplied path must name the very device backing the mount. */
	if (vnode_specrdev(vp) != vnode_specrdev(realdevvp)) {
		IMGSRC_DEBUG("Wrong dev_t.\n");
		error = ENXIO;
		goto out1;
	}

	strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);

	/*
	 * If mount by non-root, then verify that user has necessary
	 * permissions on the device.
	 */
	if (!vfs_context_issuser(ctx)) {
		accessmode = KAUTH_VNODE_READ_DATA;
		if ((mp->mnt_flag & MNT_RDONLY) == 0) {
			accessmode |= KAUTH_VNODE_WRITE_DATA;
		}
		if ((error = vnode_authorize(vp, NULL, accessmode, ctx)) != 0) {
			IMGSRC_DEBUG("Access denied.\n");
			goto out1;
		}
	}

	/* Success: transfer the namei() iocount on vp to the caller. */
	*devvpp = vp;

out1:
	vnode_put(realdevvp);

out:
	nameidone(&nd);

	if (error) {
		/* Drop the iocount namei() gave us; nothing is returned. */
		vnode_put(vp);
	}

	return error;
}
2171
2172 /*
2173 * Clear VMOUNT, set v_mountedhere, and mnt_vnodecovered, ref the vnode,
2174 * and call checkdirs()
2175 */
2176 static int
place_mount_and_checkdirs(mount_t mp,vnode_t vp,vfs_context_t ctx)2177 place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx)
2178 {
2179 int error;
2180
2181 mp->mnt_vnodecovered = vp; /* XXX This is normally only set at init-time ... */
2182
2183 IMGSRC_DEBUG("placing: fsname = %s, vp = %s\n",
2184 mp->mnt_vtable->vfc_name, vnode_getname(vp));
2185
2186 vnode_lock_spin(vp);
2187 CLR(vp->v_flag, VMOUNT);
2188 vp->v_mountedhere = mp;
2189 SET(vp->v_flag, VMOUNTEDHERE);
2190 /* Wakeup waiter(s) waiting for in-progress mount to finish. */
2191 wakeup(&vp->v_flag);
2192 vnode_unlock(vp);
2193
2194 /*
2195 * taking the name_cache_lock exclusively will
2196 * insure that everyone is out of the fast path who
2197 * might be trying to use a now stale copy of
2198 * vp->v_mountedhere->mnt_realrootvp
2199 * bumping mount_generation causes the cached values
2200 * to be invalidated
2201 */
2202 name_cache_lock();
2203 mount_generation++;
2204 name_cache_unlock();
2205
2206 error = vnode_ref(vp);
2207 if (error != 0) {
2208 goto out;
2209 }
2210
2211 error = checkdirs(vp, ctx);
2212 if (error != 0) {
2213 /* Unmount the filesystem as cdir/rdirs cannot be updated */
2214 vnode_rele(vp);
2215 goto out;
2216 }
2217
2218 out:
2219 if (error != 0) {
2220 mp->mnt_vnodecovered = NULLVP;
2221 }
2222 return error;
2223 }
2224
/*
 * Reverse place_mount_and_checkdirs(): drop the covered vnode's
 * persistent reference, clear the mount linkage and the in-progress
 * flags, and wake anyone sleeping on the mount completing.
 */
static void
undo_place_on_covered_vp(mount_t mp, vnode_t vp)
{
	vnode_rele(vp);
	vnode_lock_spin(vp);
	CLR(vp->v_flag, (VMOUNT | VMOUNTEDHERE));
	vp->v_mountedhere = (mount_t)NULL;
	/* Wakeup waiter(s) waiting for in-progress mount to finish. */
	wakeup(&vp->v_flag);
	vnode_unlock(vp);

	mp->mnt_vnodecovered = NULLVP;
}
2238
2239 static int
mount_begin_update(mount_t mp,vfs_context_t ctx,int flags)2240 mount_begin_update(mount_t mp, vfs_context_t ctx, int flags)
2241 {
2242 int error;
2243
2244 /* unmount in progress return error */
2245 mount_lock_spin(mp);
2246 if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
2247 mount_unlock(mp);
2248 return EBUSY;
2249 }
2250 mount_unlock(mp);
2251 lck_rw_lock_exclusive(&mp->mnt_rwlock);
2252
2253 /*
2254 * We only allow the filesystem to be reloaded if it
2255 * is currently mounted read-only.
2256 */
2257 if ((flags & MNT_RELOAD) &&
2258 ((mp->mnt_flag & MNT_RDONLY) == 0)) {
2259 error = ENOTSUP;
2260 goto out;
2261 }
2262
2263 /*
2264 * Only root, or the user that did the original mount is
2265 * permitted to update it.
2266 */
2267 if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
2268 (!vfs_context_issuser(ctx))) {
2269 error = EPERM;
2270 goto out;
2271 }
2272 #if CONFIG_MACF
2273 error = mac_mount_check_remount(ctx, mp, flags);
2274 if (error != 0) {
2275 goto out;
2276 }
2277 #endif
2278
2279 out:
2280 if (error) {
2281 lck_rw_done(&mp->mnt_rwlock);
2282 }
2283
2284 return error;
2285 }
2286
/*
 * End an update begun with mount_begin_update() by releasing the
 * mount's rwlock.
 */
static void
mount_end_update(mount_t mp)
{
	lck_rw_done(&mp->mnt_rwlock);
}
2292
2293 static int
get_imgsrc_rootvnode(uint32_t height,vnode_t * rvpp)2294 get_imgsrc_rootvnode(uint32_t height, vnode_t *rvpp)
2295 {
2296 vnode_t vp;
2297
2298 if (height >= MAX_IMAGEBOOT_NESTING) {
2299 return EINVAL;
2300 }
2301
2302 vp = imgsrc_rootvnodes[height];
2303 if ((vp != NULLVP) && (vnode_get(vp) == 0)) {
2304 *rvpp = vp;
2305 return 0;
2306 } else {
2307 return ENOENT;
2308 }
2309 }
2310
/*
 * Relocate the imageboot source mount (the file system backing the
 * boot disk image at nesting level 'height') so that it covers the
 * directory 'vp'.  Arguments arrive either as a mnt_imgsrc_args
 * structure ('by_index') or, for binary compatibility, as a bare
 * device path (one level of nesting assumed).  A mount may only be
 * moved once (MNTK_HAS_MOVED); only root may do this.
 */
static int
relocate_imageboot_source(vnode_t pvp, vnode_t vp,
    struct componentname *cnp, const char *fsname, vfs_context_t ctx,
    boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index)
{
	int error;
	mount_t mp;
	boolean_t placed = FALSE;
	struct vfstable *vfsp;
	user_addr_t devpath;
	char *old_mntonname;
	vnode_t rvp;
	vnode_t devvp;
	uint32_t height;
	uint32_t flags;

	/* If we didn't imageboot, nothing to move */
	if (imgsrc_rootvnodes[0] == NULLVP) {
		return EINVAL;
	}

	/* Only root can do this */
	if (!vfs_context_issuser(ctx)) {
		return EPERM;
	}

	IMGSRC_DEBUG("looking for root vnode.\n");

	/*
	 * Get root vnode of filesystem we're moving.
	 */
	if (by_index) {
		if (is64bit) {
			struct user64_mnt_imgsrc_args mia64;
			error = copyin(fsmountargs, &mia64, sizeof(mia64));
			if (error != 0) {
				IMGSRC_DEBUG("Failed to copy in arguments.\n");
				return error;
			}

			height = mia64.mi_height;
			flags = mia64.mi_flags;
			devpath = (user_addr_t)mia64.mi_devpath;
		} else {
			struct user32_mnt_imgsrc_args mia32;
			error = copyin(fsmountargs, &mia32, sizeof(mia32));
			if (error != 0) {
				IMGSRC_DEBUG("Failed to copy in arguments.\n");
				return error;
			}

			height = mia32.mi_height;
			flags = mia32.mi_flags;
			devpath = mia32.mi_devpath;
		}
	} else {
		/*
		 * For binary compatibility--assumes one level of nesting.
		 */
		if (is64bit) {
			if ((error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath)))) {
				return error;
			}
		} else {
			user32_addr_t tmp;
			if ((error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp)))) {
				return error;
			}

			/* munge into LP64 addr */
			devpath = CAST_USER_ADDR_T(tmp);
		}

		height = 0;
		flags = 0;
	}

	/* No flag bits are defined for this operation; reject any. */
	if (flags != 0) {
		IMGSRC_DEBUG("%s: Got nonzero flags.\n", __FUNCTION__);
		return EINVAL;
	}

	/* On success this holds an iocount on rvp (dropped below/at out0). */
	error = get_imgsrc_rootvnode(height, &rvp);
	if (error != 0) {
		IMGSRC_DEBUG("getting old root vnode failed with %d\n", error);
		return error;
	}

	IMGSRC_DEBUG("got old root vnode\n");

	/* Scratch buffer to restore f_mntonname if we have to back out. */
	old_mntonname = zalloc_flags(ZV_NAMEI, Z_WAITOK);

	/* Can only move once */
	mp = vnode_mount(rvp);
	if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
		IMGSRC_DEBUG("Already moved.\n");
		error = EBUSY;
		goto out0;
	}

	IMGSRC_DEBUG("moving rvp: fsname = %s\n", mp->mnt_vtable->vfc_name);
	IMGSRC_DEBUG("Starting updated.\n");

	/* Get exclusive rwlock on mount, authorize update on mp */
	error = mount_begin_update(mp, ctx, 0);
	if (error != 0) {
		IMGSRC_DEBUG("Starting updated failed with %d\n", error);
		goto out0;
	}

	/*
	 * It can only be moved once. Flag is set under the rwlock,
	 * so we're now safe to proceed.
	 */
	if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
		IMGSRC_DEBUG("Already moved [2]\n");
		goto out1;
	}

	IMGSRC_DEBUG("Preparing coveredvp.\n");

	/* Mark covered vnode as mount in progress, authorize placing mount on top */
	error = prepare_coveredvp(vp, ctx, cnp, fsname, 0);
	if (error != 0) {
		IMGSRC_DEBUG("Preparing coveredvp failed with %d.\n", error);
		goto out1;
	}

	IMGSRC_DEBUG("Covered vp OK.\n");

	/* Sanity check the name caller has provided */
	vfsp = mp->mnt_vtable;
	if (strncmp(vfsp->vfc_name, fsname, MFSNAMELEN) != 0) {
		IMGSRC_DEBUG("Wrong fs name: actual = %s, expected = %s\n",
		    vfsp->vfc_name, fsname);
		error = EINVAL;
		goto out2;
	}

	/* Check the device vnode and update mount-from name, for local filesystems */
	if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) {
		IMGSRC_DEBUG("Local, doing device validation.\n");

		if (devpath != USER_ADDR_NULL) {
			error = authorize_devpath_and_update_mntfromname(mp, devpath, &devvp, ctx);
			if (error) {
				IMGSRC_DEBUG("authorize_devpath_and_update_mntfromname() failed.\n");
				goto out2;
			}

			/* Only needed the validation; drop the device iocount. */
			vnode_put(devvp);
		}
	}

	/*
	 * Place mp on top of vnode, ref the vnode, call checkdirs(),
	 * and increment the name cache's mount generation
	 */

	IMGSRC_DEBUG("About to call place_mount_and_checkdirs().\n");
	error = place_mount_and_checkdirs(mp, vp, ctx);
	if (error != 0) {
		goto out2;
	}

	placed = TRUE;

	/* Save the old mount-on path so out3 can restore it on failure. */
	strlcpy(old_mntonname, mp->mnt_vfsstat.f_mntonname, MAXPATHLEN);
	strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);

	/* Forbid future moves */
	mount_lock(mp);
	mp->mnt_kern_flag |= MNTK_HAS_MOVED;
	mount_unlock(mp);

	/* Finally, add to mount list, completely ready to go */
	if (mount_list_add(mp) != 0) {
		/*
		 * The system is shutting down trying to umount
		 * everything, so fail with a plausible errno.
		 */
		error = EBUSY;
		goto out3;
	}

	mount_end_update(mp);
	vnode_put(rvp);
	zfree(ZV_NAMEI, old_mntonname);

	vfs_notify_mount(pvp);

	return 0;
out3:
	strlcpy(mp->mnt_vfsstat.f_mntonname, old_mntonname, MAXPATHLEN);

	mount_lock(mp);
	mp->mnt_kern_flag &= ~(MNTK_HAS_MOVED);
	mount_unlock(mp);

out2:
	/*
	 * Placing the mp on the vnode clears VMOUNT,
	 * so cleanup is different after that point
	 */
	if (placed) {
		/* Rele the vp, clear VMOUNT and v_mountedhere */
		undo_place_on_covered_vp(mp, vp);
	} else {
		vnode_lock_spin(vp);
		CLR(vp->v_flag, VMOUNT);
		/* Wakeup waiter(s) waiting for in-progress mount to finish. */
		wakeup(&vp->v_flag);
		vnode_unlock(vp);
	}
out1:
	mount_end_update(mp);

out0:
	vnode_put(rvp);
	zfree(ZV_NAMEI, old_mntonname);
	return error;
}
2533
2534 #endif /* CONFIG_IMGSRC_ACCESS */
2535
/*
 * Enable disk quotas on a freshly mounted HFS file system when the
 * per-quota-type trigger files are present.  Errors are deliberately
 * ignored so quota setup never interferes with the final mount.
 */
void
enablequotas(struct mount *mp, vfs_context_t ctx)
{
	struct nameidata qnd;
	int type;
	char qfpath[MAXPATHLEN];
	const char *qfname = QUOTAFILENAME;
	const char *qfopsname = QUOTAOPSNAME;
	const char *qfextension[] = INITQFNAMES;

	/* XXX Should be an MNTK_ flag, instead of strncmp()'s */
	if (strncmp(mp->mnt_vfsstat.f_fstypename, "hfs", sizeof("hfs")) != 0) {
		return;
	}
	/*
	 * Enable filesystem disk quotas if necessary.
	 * We ignore errors as this should not interfere with final mount
	 */
	for (type = 0; type < MAXQUOTAS; type++) {
		/* Look for the "<mnton>/<opsname>.<ext>" trigger file. */
		snprintf(qfpath, sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfopsname, qfextension[type]);
		NDINIT(&qnd, LOOKUP, OP_MOUNT, FOLLOW, UIO_SYSSPACE,
		    CAST_USER_ADDR_T(qfpath), ctx);
		if (namei(&qnd) != 0) {
			continue; /* option file to trigger quotas is not present */
		}
		vnode_put(qnd.ni_vp);
		nameidone(&qnd);
		/* Trigger exists: turn quotas on using the real quota file path. */
		snprintf(qfpath, sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfname, qfextension[type]);

		(void) VFS_QUOTACTL(mp, QCMD(Q_QUOTAON, type), 0, qfpath, ctx);
	}
	return;
}
2569
2570
/*
 * Per-process callback for checkdirs(): if the process's current or
 * root directory is 'olddp' (the vnode just covered by a new mount),
 * swing it to 'newdp' (the new mount's root) and fix up the vnode
 * usecounts.  Always returns PROC_RETURNED so iteration continues.
 */
static int
checkdirs_callback(proc_t p, void * arg)
{
	struct cdirargs *cdrp = (struct cdirargs *)arg;
	vnode_t olddp = cdrp->olddp;
	vnode_t newdp = cdrp->newdp;
	struct filedesc *fdp = &p->p_fd;
	vnode_t new_cvp = newdp;
	vnode_t new_rvp = newdp;
	vnode_t old_cvp = NULL;
	vnode_t old_rvp = NULL;

	/*
	 * XXX Also needs to iterate each thread in the process to see if it
	 * XXX is using a per-thread current working directory, and, if so,
	 * XXX update that as well.
	 */

	/*
	 * First, with the proc_fdlock held, check to see if we will need
	 * to do any work. If not, we will get out fast.
	 */
	proc_fdlock(p);
	if (fdp->fd_cdir != olddp && fdp->fd_rdir != olddp) {
		proc_fdunlock(p);
		return PROC_RETURNED;
	}
	proc_fdunlock(p);

	/*
	 * Ok, we will have to do some work. Always take two refs
	 * because we might need that many. We'll dispose of whatever
	 * we ended up not using.
	 */
	if (vnode_ref(newdp) != 0) {
		return PROC_RETURNED;
	}
	if (vnode_ref(newdp) != 0) {
		vnode_rele(newdp);
		return PROC_RETURNED;
	}

	proc_dirs_lock_exclusive(p);
	/*
	 * Now do the work. Note: we dropped the proc_fdlock, so we
	 * have to do all of the checks again.
	 */
	proc_fdlock(p);
	if (fdp->fd_cdir == olddp) {
		old_cvp = olddp;
		fdp->fd_cdir = newdp;
		new_cvp = NULL; /* this ref is now owned by fd_cdir */
	}
	if (fdp->fd_rdir == olddp) {
		old_rvp = olddp;
		fdp->fd_rdir = newdp;
		new_rvp = NULL; /* this ref is now owned by fd_rdir */
	}
	proc_fdunlock(p);
	proc_dirs_unlock_exclusive(p);

	/*
	 * Dispose of any references that are no longer needed.
	 */
	if (old_cvp != NULL) {
		vnode_rele(old_cvp);
	}
	if (old_rvp != NULL) {
		vnode_rele(old_rvp);
	}
	if (new_cvp != NULL) {
		vnode_rele(new_cvp);
	}
	if (new_rvp != NULL) {
		vnode_rele(new_rvp);
	}

	return PROC_RETURNED;
}
2650
2651
2652
2653 /*
2654 * Scan all active processes to see if any of them have a current
2655 * or root directory onto which the new filesystem has just been
2656 * mounted. If so, replace them with the new mount point.
2657 */
2658 static int
checkdirs(vnode_t olddp,vfs_context_t ctx)2659 checkdirs(vnode_t olddp, vfs_context_t ctx)
2660 {
2661 vnode_t newdp;
2662 vnode_t tvp;
2663 int err;
2664 struct cdirargs cdr;
2665
2666 if (olddp->v_usecount == 1) {
2667 return 0;
2668 }
2669 err = VFS_ROOT(olddp->v_mountedhere, &newdp, ctx);
2670
2671 if (err != 0) {
2672 #if DIAGNOSTIC
2673 panic("mount: lost mount: error %d", err);
2674 #endif
2675 return err;
2676 }
2677
2678 cdr.olddp = olddp;
2679 cdr.newdp = newdp;
2680 /* do not block for exec/fork trans as the vp in cwd & rootdir are not changing */
2681 proc_iterate(PROC_ALLPROCLIST | PROC_NOWAITTRANS, checkdirs_callback, (void *)&cdr, NULL, NULL);
2682
2683 if (rootvnode == olddp) {
2684 vnode_ref(newdp);
2685 lck_rw_lock_exclusive(&rootvnode_rw_lock);
2686 tvp = rootvnode;
2687 rootvnode = newdp;
2688 lck_rw_unlock_exclusive(&rootvnode_rw_lock);
2689 vnode_rele(tvp);
2690 }
2691
2692 vnode_put(newdp);
2693 return 0;
2694 }
2695
2696 #define ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT \
2697 "com.apple.private.vfs.role-account-unmount"
2698
2699 /*
2700 * Unmount a file system.
2701 *
2702 * Note: unmount takes a path to the vnode mounted on as argument,
2703 * not special file (as before).
2704 */
2705 /* ARGSUSED */
2706 int
unmount(__unused proc_t p,struct unmount_args * uap,__unused int32_t * retval)2707 unmount(__unused proc_t p, struct unmount_args *uap, __unused int32_t *retval)
2708 {
2709 vnode_t vp;
2710 struct mount *mp;
2711 int flags = uap->flags;
2712 int error;
2713 struct nameidata nd;
2714 vfs_context_t ctx;
2715
2716 /*
2717 * If the process has the entitlement, use the kernel's context when
2718 * performing lookup on the mount path as the process might lack proper
2719 * permission to access the directory.
2720 */
2721 ctx = IOCurrentTaskHasEntitlement(ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT) ?
2722 vfs_context_kernel() : vfs_context_current();
2723
2724 NDINIT(&nd, LOOKUP, OP_UNMOUNT, FOLLOW | AUDITVNPATH1,
2725 UIO_USERSPACE, uap->path, ctx);
2726 if (flags & MNT_NOFOLLOW) {
2727 nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
2728 }
2729
2730 error = namei(&nd);
2731 if (error) {
2732 return error;
2733 }
2734 vp = nd.ni_vp;
2735 mp = vp->v_mount;
2736 nameidone(&nd);
2737
2738 /*
2739 * Must be the root of the filesystem
2740 */
2741 if ((vp->v_flag & VROOT) == 0) {
2742 vnode_put(vp);
2743 return EINVAL;
2744 }
2745 #if CONFIG_MACF
2746 error = mac_mount_check_umount(ctx, mp);
2747 if (error != 0) {
2748 vnode_put(vp);
2749 return error;
2750 }
2751 #endif
2752 mount_ref(mp, 0);
2753 vnode_put(vp);
2754 /* safedounmount consumes the mount ref */
2755 return safedounmount(mp, flags, ctx);
2756 }
2757
2758 int
vfs_unmountbyfsid(fsid_t * fsid,int flags,vfs_context_t ctx)2759 vfs_unmountbyfsid(fsid_t *fsid, int flags, vfs_context_t ctx)
2760 {
2761 mount_t mp;
2762
2763 mp = mount_list_lookupby_fsid(fsid, 0, 1);
2764 if (mp == (mount_t)0) {
2765 return ENOENT;
2766 }
2767 mount_ref(mp, 0);
2768 mount_iterdrop(mp);
2769 /* safedounmount consumes the mount ref */
2770 return safedounmount(mp, flags, ctx);
2771 }
2772
2773 /*
2774 * The mount struct comes with a mount ref which will be consumed.
2775 * Do the actual file system unmount, prevent some common foot shooting.
2776 */
2777 int
safedounmount(struct mount * mp,int flags,vfs_context_t ctx)2778 safedounmount(struct mount *mp, int flags, vfs_context_t ctx)
2779 {
2780 int error;
2781 proc_t p = vfs_context_proc(ctx);
2782
2783 /*
2784 * If the file system is not responding and MNT_NOBLOCK
2785 * is set and not a forced unmount then return EBUSY.
2786 */
2787 if ((mp->mnt_lflag & MNT_LNOTRESP) &&
2788 (flags & MNT_NOBLOCK) && ((flags & MNT_FORCE) == 0)) {
2789 error = EBUSY;
2790 goto out;
2791 }
2792
2793 /*
2794 * Skip authorization in two cases:
2795 * - If the process running the unmount has ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT.
2796 * This entitlement allows non-root processes unmount volumes mounted by
2797 * other processes.
2798 * - If the mount is tagged as permissive and this is not a forced-unmount
2799 * attempt.
2800 */
2801 if (!IOCurrentTaskHasEntitlement(ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT) &&
2802 (!(((mp->mnt_kern_flag & MNTK_PERMIT_UNMOUNT) != 0) && ((flags & MNT_FORCE) == 0)))) {
2803 /*
2804 * Only root, or the user that did the original mount is
2805 * permitted to unmount this filesystem.
2806 */
2807 if ((mp->mnt_vfsstat.f_owner != kauth_cred_getuid(kauth_cred_get())) &&
2808 (error = suser(kauth_cred_get(), &p->p_acflag))) {
2809 goto out;
2810 }
2811 }
2812 /*
2813 * Don't allow unmounting the root file system, or other volumes
2814 * associated with it (for example, the associated VM or DATA mounts) .
2815 */
2816 if ((mp->mnt_flag & MNT_ROOTFS) || (mp->mnt_kern_flag & MNTK_SYSTEM)) {
2817 if (!(mp->mnt_flag & MNT_ROOTFS)) {
2818 printf("attempt to unmount a system mount (%s), will return EBUSY\n",
2819 mp->mnt_vfsstat.f_mntonname);
2820 }
2821 error = EBUSY; /* the root (or associated volumes) is always busy */
2822 goto out;
2823 }
2824
2825 /*
2826 * If the mount is providing the root filesystem's disk image
2827 * (i.e. imageboot), don't allow unmounting
2828 */
2829 if (mp->mnt_kern_flag & MNTK_BACKS_ROOT) {
2830 error = EBUSY;
2831 goto out;
2832 }
2833
2834 return dounmount(mp, flags, 1, ctx);
2835
2836 out:
2837 mount_drop(mp, 0);
2838 return error;
2839 }
2840
2841 /*
2842 * Do the actual file system unmount.
2843 */
2844 int
dounmount(struct mount * mp,int flags,int withref,vfs_context_t ctx)2845 dounmount(struct mount *mp, int flags, int withref, vfs_context_t ctx)
2846 {
2847 vnode_t coveredvp = (vnode_t)0;
2848 int error;
2849 int needwakeup = 0;
2850 int forcedunmount = 0;
2851 int lflags = 0;
2852 struct vnode *devvp = NULLVP;
2853 #if CONFIG_TRIGGERS
2854 proc_t p = vfs_context_proc(ctx);
2855 int did_vflush = 0;
2856 int pflags_save = 0;
2857 #endif /* CONFIG_TRIGGERS */
2858
2859 #if CONFIG_FSE
2860 if (!(flags & MNT_FORCE)) {
2861 fsevent_unmount(mp, ctx); /* has to come first! */
2862 }
2863 #endif
2864
2865 mount_lock(mp);
2866
2867 /*
2868 * If already an unmount in progress just return EBUSY.
2869 * Even a forced unmount cannot override.
2870 */
2871 if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
2872 if (withref != 0) {
2873 mount_drop(mp, 1);
2874 }
2875 mount_unlock(mp);
2876 return EBUSY;
2877 }
2878
2879 if (flags & MNT_FORCE) {
2880 forcedunmount = 1;
2881 mp->mnt_lflag |= MNT_LFORCE;
2882 }
2883
2884 #if CONFIG_TRIGGERS
2885 if (flags & MNT_NOBLOCK && p != kernproc) {
2886 pflags_save = OSBitOrAtomic(P_NOREMOTEHANG, &p->p_flag);
2887 }
2888 #endif
2889
2890 mp->mnt_kern_flag |= MNTK_UNMOUNT;
2891 mp->mnt_lflag |= MNT_LUNMOUNT;
2892 mp->mnt_flag &= ~MNT_ASYNC;
2893 /*
2894 * anyone currently in the fast path that
2895 * trips over the cached rootvp will be
2896 * dumped out and forced into the slow path
2897 * to regenerate a new cached value
2898 */
2899 mp->mnt_realrootvp = NULLVP;
2900 mount_unlock(mp);
2901
2902 if (forcedunmount && (flags & MNT_LNOSUB) == 0) {
2903 /*
2904 * Force unmount any mounts in this filesystem.
2905 * If any unmounts fail - just leave them dangling.
2906 * Avoids recursion.
2907 */
2908 (void) dounmount_submounts(mp, flags | MNT_LNOSUB, ctx);
2909 }
2910
2911 /*
2912 * taking the name_cache_lock exclusively will
2913 * insure that everyone is out of the fast path who
2914 * might be trying to use a now stale copy of
2915 * vp->v_mountedhere->mnt_realrootvp
2916 * bumping mount_generation causes the cached values
2917 * to be invalidated
2918 */
2919 name_cache_lock();
2920 mount_generation++;
2921 name_cache_unlock();
2922
2923
2924 lck_rw_lock_exclusive(&mp->mnt_rwlock);
2925 if (withref != 0) {
2926 mount_drop(mp, 0);
2927 }
2928 error = 0;
2929 if (forcedunmount == 0) {
2930 ubc_umount(mp); /* release cached vnodes */
2931 if ((mp->mnt_flag & MNT_RDONLY) == 0) {
2932 error = VFS_SYNC(mp, MNT_WAIT, ctx);
2933 if (error) {
2934 mount_lock(mp);
2935 mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
2936 mp->mnt_lflag &= ~MNT_LUNMOUNT;
2937 mp->mnt_lflag &= ~MNT_LFORCE;
2938 goto out;
2939 }
2940 }
2941 }
2942
2943 IOBSDMountChange(mp, kIOMountChangeUnmount);
2944
2945 #if CONFIG_TRIGGERS
2946 vfs_nested_trigger_unmounts(mp, flags, ctx);
2947 did_vflush = 1;
2948 #endif
2949 if (forcedunmount) {
2950 lflags |= FORCECLOSE;
2951 }
2952 error = vflush(mp, NULLVP, SKIPSWAP | SKIPSYSTEM | SKIPROOT | lflags);
2953 if ((forcedunmount == 0) && error) {
2954 mount_lock(mp);
2955 mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
2956 mp->mnt_lflag &= ~MNT_LUNMOUNT;
2957 mp->mnt_lflag &= ~MNT_LFORCE;
2958 goto out;
2959 }
2960
2961 /* make sure there are no one in the mount iterations or lookup */
2962 mount_iterdrain(mp);
2963
2964 error = VFS_UNMOUNT(mp, flags, ctx);
2965 if (error) {
2966 mount_iterreset(mp);
2967 mount_lock(mp);
2968 mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
2969 mp->mnt_lflag &= ~MNT_LUNMOUNT;
2970 mp->mnt_lflag &= ~MNT_LFORCE;
2971 goto out;
2972 }
2973
2974 /* increment the operations count */
2975 if (!error) {
2976 OSAddAtomic(1, &vfs_nummntops);
2977 }
2978
2979 if (mp->mnt_devvp && mp->mnt_vtable->vfc_vfsflags & VFC_VFSLOCALARGS) {
2980 /* hold an io reference and drop the usecount before close */
2981 devvp = mp->mnt_devvp;
2982 vnode_getalways(devvp);
2983 vnode_rele(devvp);
2984 VNOP_CLOSE(devvp, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD | FWRITE,
2985 ctx);
2986 vnode_clearmountedon(devvp);
2987 vnode_put(devvp);
2988 }
2989 lck_rw_done(&mp->mnt_rwlock);
2990 mount_list_remove(mp);
2991 lck_rw_lock_exclusive(&mp->mnt_rwlock);
2992
2993 /* mark the mount point hook in the vp but not drop the ref yet */
2994 if ((coveredvp = mp->mnt_vnodecovered) != NULLVP) {
2995 /*
2996 * The covered vnode needs special handling. Trying to get an
2997 * iocount must not block here as this may lead to deadlocks
2998 * if the Filesystem to which the covered vnode belongs is
2999 * undergoing forced unmounts. Since we hold a usecount, the
3000 * vnode cannot be reused (it can, however, still be terminated)
3001 */
3002 vnode_getalways(coveredvp);
3003 vnode_lock_spin(coveredvp);
3004
3005 mp->mnt_crossref++;
3006 coveredvp->v_mountedhere = (struct mount *)0;
3007 CLR(coveredvp->v_flag, VMOUNT | VMOUNTEDHERE);
3008 /* Wakeup waiter(s) waiting for in-progress mount to finish. */
3009 wakeup(&coveredvp->v_flag);
3010 vnode_unlock(coveredvp);
3011 vnode_put(coveredvp);
3012 }
3013
3014 mount_list_lock();
3015 mp->mnt_vtable->vfc_refcount--;
3016 mount_list_unlock();
3017
3018 cache_purgevfs(mp); /* remove cache entries for this file sys */
3019 vfs_event_signal(NULL, VQ_UNMOUNT, (intptr_t)NULL);
3020 mount_lock(mp);
3021 mp->mnt_lflag |= MNT_LDEAD;
3022
3023 if (mp->mnt_lflag & MNT_LWAIT) {
3024 /*
3025 * do the wakeup here
3026 * in case we block in mount_refdrain
3027 * which will drop the mount lock
3028 * and allow anyone blocked in vfs_busy
3029 * to wakeup and see the LDEAD state
3030 */
3031 mp->mnt_lflag &= ~MNT_LWAIT;
3032 wakeup((caddr_t)mp);
3033 }
3034 mount_refdrain(mp);
3035
3036 /* free disk_conditioner_info structure for this mount */
3037 disk_conditioner_unmount(mp);
3038
3039 out:
3040 if (mp->mnt_lflag & MNT_LWAIT) {
3041 mp->mnt_lflag &= ~MNT_LWAIT;
3042 needwakeup = 1;
3043 }
3044
3045 #if CONFIG_TRIGGERS
3046 if (flags & MNT_NOBLOCK && p != kernproc) {
3047 // Restore P_NOREMOTEHANG bit to its previous value
3048 if ((pflags_save & P_NOREMOTEHANG) == 0) {
3049 OSBitAndAtomic(~((uint32_t) P_NOREMOTEHANG), &p->p_flag);
3050 }
3051 }
3052
3053 /*
3054 * Callback and context are set together under the mount lock, and
3055 * never cleared, so we're safe to examine them here, drop the lock,
3056 * and call out.
3057 */
3058 if (mp->mnt_triggercallback != NULL) {
3059 mount_unlock(mp);
3060 if (error == 0) {
3061 mp->mnt_triggercallback(mp, VTC_RELEASE, mp->mnt_triggerdata, ctx);
3062 } else if (did_vflush) {
3063 mp->mnt_triggercallback(mp, VTC_REPLACE, mp->mnt_triggerdata, ctx);
3064 }
3065 } else {
3066 mount_unlock(mp);
3067 }
3068 #else
3069 mount_unlock(mp);
3070 #endif /* CONFIG_TRIGGERS */
3071
3072 lck_rw_done(&mp->mnt_rwlock);
3073
3074 if (needwakeup) {
3075 wakeup((caddr_t)mp);
3076 }
3077
3078 if (!error) {
3079 if ((coveredvp != NULLVP)) {
3080 vnode_t pvp = NULLVP;
3081
3082 /*
3083 * The covered vnode needs special handling. Trying to
3084 * get an iocount must not block here as this may lead
3085 * to deadlocks if the Filesystem to which the covered
3086 * vnode belongs is undergoing forced unmounts. Since we
3087 * hold a usecount, the vnode cannot be reused
3088 * (it can, however, still be terminated).
3089 */
3090 vnode_getalways(coveredvp);
3091
3092 mount_dropcrossref(mp, coveredvp, 0);
3093 /*
3094 * We'll _try_ to detect if this really needs to be
3095 * done. The coveredvp can only be in termination (or
3096 * terminated) if the coveredvp's mount point is in a
3097 * forced unmount (or has been) since we still hold the
3098 * ref.
3099 */
3100 if (!vnode_isrecycled(coveredvp)) {
3101 pvp = vnode_getparent(coveredvp);
3102 #if CONFIG_TRIGGERS
3103 if (coveredvp->v_resolve) {
3104 vnode_trigger_rearm(coveredvp, ctx);
3105 }
3106 #endif
3107 }
3108
3109 vnode_rele(coveredvp);
3110 vnode_put(coveredvp);
3111 coveredvp = NULLVP;
3112
3113 if (pvp) {
3114 lock_vnode_and_post(pvp, NOTE_WRITE);
3115 vnode_put(pvp);
3116 }
3117 } else if (mp->mnt_flag & MNT_ROOTFS) {
3118 if (nc_smr_enabled) {
3119 vfs_smr_synchronize();
3120 }
3121
3122 mount_lock_destroy(mp);
3123 #if CONFIG_MACF
3124 mac_mount_label_destroy(mp);
3125 #endif
3126 zfree(mount_zone, mp);
3127 } else {
3128 panic("dounmount: no coveredvp");
3129 }
3130 }
3131 return error;
3132 }
3133
3134 /*
3135 * Unmount any mounts in this filesystem.
3136 */
3137 void
dounmount_submounts(struct mount * mp,int flags,vfs_context_t ctx)3138 dounmount_submounts(struct mount *mp, int flags, vfs_context_t ctx)
3139 {
3140 mount_t smp;
3141 fsid_t *fsids, fsid;
3142 int fsids_sz;
3143 int count = 0, i, m = 0;
3144 vnode_t vp;
3145
3146 mount_list_lock();
3147
3148 // Get an array to hold the submounts fsids.
3149 TAILQ_FOREACH(smp, &mountlist, mnt_list)
3150 count++;
3151 fsids_sz = count * sizeof(fsid_t);
3152 fsids = kalloc_data(fsids_sz, Z_NOWAIT);
3153 if (fsids == NULL) {
3154 mount_list_unlock();
3155 goto out;
3156 }
3157 fsids[0] = mp->mnt_vfsstat.f_fsid; // Prime the pump
3158
3159 /*
3160 * Fill the array with submount fsids.
3161 * Since mounts are always added to the tail of the mount list, the
3162 * list is always in mount order.
3163 * For each mount check if the mounted-on vnode belongs to a
3164 * mount that's already added to our array of mounts to be unmounted.
3165 */
3166 for (smp = TAILQ_NEXT(mp, mnt_list); smp; smp = TAILQ_NEXT(smp, mnt_list)) {
3167 vp = smp->mnt_vnodecovered;
3168 if (vp == NULL) {
3169 continue;
3170 }
3171 fsid = vnode_mount(vp)->mnt_vfsstat.f_fsid; // Underlying fsid
3172 for (i = 0; i <= m; i++) {
3173 if (fsids[i].val[0] == fsid.val[0] &&
3174 fsids[i].val[1] == fsid.val[1]) {
3175 fsids[++m] = smp->mnt_vfsstat.f_fsid;
3176 break;
3177 }
3178 }
3179 }
3180 mount_list_unlock();
3181
3182 // Unmount the submounts in reverse order. Ignore errors.
3183 for (i = m; i > 0; i--) {
3184 smp = mount_list_lookupby_fsid(&fsids[i], 0, 1);
3185 if (smp) {
3186 mount_ref(smp, 0);
3187 mount_iterdrop(smp);
3188 (void) dounmount(smp, flags, 1, ctx);
3189 }
3190 }
3191 out:
3192 kfree_data(fsids, fsids_sz);
3193 }
3194
/*
 * Drop one cross reference on 'mp' taken while crossing the mount point
 * at vnode 'dp'.  If this was the last crossref and 'mp' no longer
 * covers 'dp', the mount structure itself is destroyed here.
 */
void
mount_dropcrossref(mount_t mp, vnode_t dp, int need_put)
{
	/* Hold dp so it cannot be freed while we manipulate it locked. */
	vnode_hold(dp);
	vnode_lock(dp);
	mp->mnt_crossref--;

	if (mp->mnt_crossref < 0) {
		panic("mount cross refs -ve");
	}

	/*
	 * Last crossref gone and the mount has been detached from dp:
	 * this path owns the final teardown of the mount structure.
	 */
	if ((mp != dp->v_mountedhere) && (mp->mnt_crossref == 0)) {
		if (need_put) {
			vnode_put_locked(dp);
		}
		vnode_drop_and_unlock(dp);

		/* Let SMR readers of the name cache drain before freeing. */
		if (nc_smr_enabled) {
			vfs_smr_synchronize();
		}

		mount_lock_destroy(mp);
#if CONFIG_MACF
		mac_mount_label_destroy(mp);
#endif
		zfree(mount_zone, mp);
		return;
	}
	if (need_put) {
		vnode_put_locked(dp);
	}
	vnode_drop_and_unlock(dp);
}
3228
3229
3230 /*
3231 * Sync each mounted filesystem.
3232 */
#if DIAGNOSTIC
int syncprt = 0;        /* when set, dump buffer stats after a sync (see sync()) */
#endif

int print_vmpage_stat = 0;      /* when set, dump dirty-page counts after a sync */
3238
3239 /*
3240 * sync_callback: simple wrapper that calls VFS_SYNC() on volumes
3241 * mounted read-write with the passed waitfor value.
3242 *
3243 * Parameters: mp mount-point descriptor per mounted file-system instance.
3244 * arg user argument (please see below)
3245 *
3246 * User argument is a pointer to 32 bit unsigned integer which describes the
3247 * type of waitfor value to set for calling VFS_SYNC(). If user argument is
3248 * passed as NULL, VFS_SYNC() is called with MNT_NOWAIT set as the default
3249 * waitfor value.
3250 *
3251 * Returns: VFS_RETURNED
3252 */
3253 static int
sync_callback(mount_t mp,void * arg)3254 sync_callback(mount_t mp, void *arg)
3255 {
3256 if ((mp->mnt_flag & MNT_RDONLY) == 0) {
3257 int asyncflag = mp->mnt_flag & MNT_ASYNC;
3258 unsigned waitfor = MNT_NOWAIT;
3259
3260 if (arg) {
3261 waitfor = *(uint32_t*)arg;
3262 }
3263
3264 /* Sanity check for flags - these are the only valid combinations for the flag bits*/
3265 if (waitfor != MNT_WAIT &&
3266 waitfor != (MNT_WAIT | MNT_VOLUME) &&
3267 waitfor != MNT_NOWAIT &&
3268 waitfor != (MNT_NOWAIT | MNT_VOLUME) &&
3269 waitfor != MNT_DWAIT &&
3270 waitfor != (MNT_DWAIT | MNT_VOLUME)) {
3271 panic("Passed inappropriate waitfor %u to "
3272 "sync_callback()", waitfor);
3273 }
3274
3275 mp->mnt_flag &= ~MNT_ASYNC;
3276 (void)VFS_SYNC(mp, waitfor, vfs_context_kernel());
3277 if (asyncflag) {
3278 mp->mnt_flag |= MNT_ASYNC;
3279 }
3280 }
3281
3282 return VFS_RETURNED;
3283 }
3284
3285 /* ARGSUSED */
3286 int
sync(__unused proc_t p,__unused struct sync_args * uap,__unused int32_t * retval)3287 sync(__unused proc_t p, __unused struct sync_args *uap, __unused int32_t *retval)
3288 {
3289 vfs_iterate(LK_NOWAIT, sync_callback, NULL);
3290
3291 if (print_vmpage_stat) {
3292 vm_countdirtypages();
3293 }
3294
3295 #if DIAGNOSTIC
3296 if (syncprt) {
3297 vfs_bufstats();
3298 }
3299 #endif /* DIAGNOSTIC */
3300 return 0;
3301 }
3302
/* Media-reliability filter used by sync_internal_callback(). */
typedef enum {
	SYNC_ALL = 0,                   /* no filtering: sync every mount */
	SYNC_ONLY_RELIABLE_MEDIA = 1,   /* local, non-virtual-device mounts only */
	SYNC_ONLY_UNRELIABLE_MEDIA = 2  /* virtual-device or non-local mounts only */
} sync_type_t;
3308
3309 static int
sync_internal_callback(mount_t mp,void * arg)3310 sync_internal_callback(mount_t mp, void *arg)
3311 {
3312 if (arg) {
3313 int is_reliable = !(mp->mnt_kern_flag & MNTK_VIRTUALDEV) &&
3314 (mp->mnt_flag & MNT_LOCAL);
3315 sync_type_t sync_type = *((sync_type_t *)arg);
3316
3317 if ((sync_type == SYNC_ONLY_RELIABLE_MEDIA) && !is_reliable) {
3318 return VFS_RETURNED;
3319 } else if ((sync_type == SYNC_ONLY_UNRELIABLE_MEDIA) && is_reliable) {
3320 return VFS_RETURNED;
3321 }
3322 }
3323
3324 (void)sync_callback(mp, NULL);
3325
3326 return VFS_RETURNED;
3327 }
3328
/* Handshake state between sync_internal() and sync_thread(); protected by sync_mtx_lck. */
int sync_thread_state = 0;
/* Upper bound on how long sync_internal() waits for the sync thread. */
int sync_timeout_seconds = 5;

#define SYNC_THREAD_RUN 0x0001          /* a sync request is pending */
#define SYNC_THREAD_RUNNING 0x0002      /* the worker thread is alive */

#if CONFIG_PHYS_WRITE_ACCT
thread_t pm_sync_thread;        /* identity of the running sync thread, for write accounting */
#endif /* CONFIG_PHYS_WRITE_ACCT */
3338
/*
 * Body of the worker thread spawned by sync_internal().  Loops as long
 * as requests keep arriving (SYNC_THREAD_RUN), syncing reliable media
 * first and unreliable media second, then signals waiters and exits.
 */
static void
sync_thread(__unused void *arg, __unused wait_result_t wr)
{
	sync_type_t sync_type;
#if CONFIG_PHYS_WRITE_ACCT
	pm_sync_thread = current_thread();
#endif /* CONFIG_PHYS_WRITE_ACCT */

	lck_mtx_lock(&sync_mtx_lck);
	while (sync_thread_state & SYNC_THREAD_RUN) {
		/* Consume the request; drop the lock while doing the work. */
		sync_thread_state &= ~SYNC_THREAD_RUN;
		lck_mtx_unlock(&sync_mtx_lck);

		/* Reliable media first, then unreliable media. */
		sync_type = SYNC_ONLY_RELIABLE_MEDIA;
		vfs_iterate(LK_NOWAIT, sync_internal_callback, &sync_type);
		sync_type = SYNC_ONLY_UNRELIABLE_MEDIA;
		vfs_iterate(LK_NOWAIT, sync_internal_callback, &sync_type);

		lck_mtx_lock(&sync_mtx_lck);
	}
	/*
	 * This wakeup _has_ to be issued before the lock is released otherwise
	 * we may end up waking up a thread in sync_internal which is
	 * expecting a wakeup from a thread it just created and not from this
	 * thread which is about to exit.
	 */
	wakeup(&sync_thread_state);
	sync_thread_state &= ~SYNC_THREAD_RUNNING;
#if CONFIG_PHYS_WRITE_ACCT
	pm_sync_thread = NULL;
#endif /* CONFIG_PHYS_WRITE_ACCT */
	lck_mtx_unlock(&sync_mtx_lck);

	if (print_vmpage_stat) {
		vm_countdirtypages();
	}

#if DIAGNOSTIC
	if (syncprt) {
		vfs_bufstats();
	}
#endif /* DIAGNOSTIC */
}
3382
/* Last time a sync timeout was logged; rate-limits the message in sync_internal(). */
struct timeval sync_timeout_last_print = {.tv_sec = 0, .tv_usec = 0};
3384
/*
 * An in-kernel sync for power management to call.
 * This function always returns within sync_timeout seconds.
 *
 * The actual syncing is delegated to sync_thread(); this function posts
 * a request, starts the thread if needed, and waits (bounded) for it.
 */
__private_extern__ int
sync_internal(void)
{
	thread_t thd = NULL;
	int error;
	int thread_created = FALSE;
	struct timespec ts = {.tv_sec = sync_timeout_seconds, .tv_nsec = 0};

	lck_mtx_lock(&sync_mtx_lck);
	/* Post a request; spawn the worker if it is not already running. */
	sync_thread_state |= SYNC_THREAD_RUN;
	if (!(sync_thread_state & SYNC_THREAD_RUNNING)) {
		int kr;

		sync_thread_state |= SYNC_THREAD_RUNNING;
		kr = kernel_thread_start(sync_thread, NULL, &thd);
		if (kr != KERN_SUCCESS) {
			sync_thread_state &= ~SYNC_THREAD_RUNNING;
			lck_mtx_unlock(&sync_mtx_lck);
			printf("sync_thread failed\n");
			return 0;
		}
		thread_created = TRUE;
	}

	/*
	 * Bounded wait for sync_thread()'s wakeup; PDROP releases
	 * sync_mtx_lck on return, PCATCH allows signal interruption.
	 */
	error = msleep((caddr_t)&sync_thread_state, &sync_mtx_lck,
	    (PVFS | PDROP | PCATCH), "sync_thread", &ts);
	if (error) {
		struct timeval now;

		/* Rate-limit the timeout complaint to once every 120s. */
		microtime(&now);
		if (now.tv_sec - sync_timeout_last_print.tv_sec > 120) {
			printf("sync timed out: %d sec\n", sync_timeout_seconds);
			sync_timeout_last_print.tv_sec = now.tv_sec;
		}
	}

	if (thread_created) {
		thread_deallocate(thd);
	}

	/* Best-effort: always report success to the PM caller. */
	return 0;
} /* end of sync_internal call */
3431
3432 /*
3433 * Change filesystem quotas.
3434 */
3435 #if QUOTA
int
quotactl(proc_t p, struct quotactl_args *uap, __unused int32_t *retval)
{
	struct mount *mp;
	int error, quota_cmd, quota_status = 0;
	caddr_t datap;
	size_t fnamelen;
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();
	struct dqblk my_dqblk = {};

	AUDIT_ARG(uid, uap->uid);
	AUDIT_ARG(cmd, uap->cmd);
	/* Resolve uap->path just to find its mount; drop the vnode at once. */
	NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
	    uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	mp = nd.ni_vp->v_mount;
	mount_ref(mp, 0);
	vnode_put(nd.ni_vp);
	nameidone(&nd);

#if CONFIG_MACF
	error = mac_mount_check_quotactl(ctx, mp, uap->cmd, uap->uid);
	if (error != 0) {
		goto out;
	}
#endif

	/* copyin any data we will need for downstream code */
	quota_cmd = uap->cmd >> SUBCMDSHIFT;

	switch (quota_cmd) {
	case Q_QUOTAON:
		/* uap->arg specifies a file from which to take the quotas */
		fnamelen = MAXPATHLEN;
		datap = zalloc(ZV_NAMEI);
		error = copyinstr(uap->arg, datap, MAXPATHLEN, &fnamelen);
		break;
	case Q_GETQUOTA:
		/* uap->arg is a pointer to a dqblk structure. */
		datap = (caddr_t) &my_dqblk;
		break;
	case Q_SETQUOTA:
	case Q_SETUSE:
		/* uap->arg is a pointer to a dqblk structure. */
		datap = (caddr_t) &my_dqblk;
		if (proc_is64bit(p)) {
			struct user_dqblk my_dqblk64;
			error = copyin(uap->arg, (caddr_t)&my_dqblk64, sizeof(my_dqblk64));
			if (error == 0) {
				/* Convert the user 64-bit layout into the kernel dqblk. */
				munge_dqblk(&my_dqblk, &my_dqblk64, FALSE);
			}
		} else {
			error = copyin(uap->arg, (caddr_t)&my_dqblk, sizeof(my_dqblk));
		}
		break;
	case Q_QUOTASTAT:
		/* uap->arg is a pointer to an integer */
		datap = (caddr_t) &quota_status;
		break;
	default:
		datap = NULL;
		break;
	} /* switch */

	if (error == 0) {
		error = VFS_QUOTACTL(mp, uap->cmd, uap->uid, datap, ctx);
	}

	/* Post-pass: copy results out / release temporaries per sub-command. */
	switch (quota_cmd) {
	case Q_QUOTAON:
		if (datap != NULL) {
			zfree(ZV_NAMEI, datap);
		}
		break;
	case Q_GETQUOTA:
		/* uap->arg is a pointer to a dqblk structure we need to copy out to */
		if (error == 0) {
			if (proc_is64bit(p)) {
				struct user_dqblk my_dqblk64;

				memset(&my_dqblk64, 0, sizeof(my_dqblk64));
				munge_dqblk(&my_dqblk, &my_dqblk64, TRUE);
				error = copyout((caddr_t)&my_dqblk64, uap->arg, sizeof(my_dqblk64));
			} else {
				error = copyout(datap, uap->arg, sizeof(struct dqblk));
			}
		}
		break;
	case Q_QUOTASTAT:
		/* uap->arg is a pointer to an integer */
		if (error == 0) {
			error = copyout(datap, uap->arg, sizeof(quota_status));
		}
		break;
	default:
		break;
	} /* switch */

out:
	mount_drop(mp, 0);
	return error;
}
3542 #else
int
quotactl(__unused proc_t p, __unused struct quotactl_args *uap, __unused int32_t *retval)
{
	/* Quota support compiled out: reject every quota operation. */
	return EOPNOTSUPP;
}
3548 #endif /* QUOTA */
3549
3550 static int
statfs_internal(proc_t p,struct mount * mp,user_addr_t bufp)3551 statfs_internal(proc_t p, struct mount *mp, user_addr_t bufp)
3552 {
3553 int error;
3554 vfs_context_t ctx = vfs_context_current();
3555
3556 #if CONFIG_MACF
3557 error = mac_mount_check_stat(ctx, mp);
3558 if (error != 0) {
3559 return error;
3560 }
3561 #endif
3562
3563 error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT);
3564 if (error != 0) {
3565 return error;
3566 }
3567
3568 return munge_statfs(mp, &mp->mnt_vfsstat, bufp, NULL, IS_64BIT_PROCESS(p), TRUE);
3569 }
3570
3571 /*
3572 * Get filesystem statistics.
3573 *
3574 * Returns: 0 Success
3575 * namei:???
3576 * vfs_update_vfsstat:???
3577 * munge_statfs:EFAULT
3578 */
3579 /* ARGSUSED */
3580 int
statfs(proc_t p,struct statfs_args * uap,__unused int32_t * retval)3581 statfs(proc_t p, struct statfs_args *uap, __unused int32_t *retval)
3582 {
3583 int error;
3584 struct mount *mp;
3585 struct nameidata nd;
3586 vfs_context_t ctx = vfs_context_current();
3587 vnode_t vp;
3588
3589 NDINIT(&nd, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
3590 UIO_USERSPACE, uap->path, ctx);
3591 error = namei(&nd);
3592 if (error != 0) {
3593 return error;
3594 }
3595 vp = nd.ni_vp;
3596 mp = vp->v_mount;
3597 nameidone(&nd);
3598
3599 error = statfs_internal(p, mp, uap->buf);
3600 vnode_put(vp);
3601
3602 return error;
3603 }
3604
3605 /*
3606 * Get filesystem statistics.
3607 */
3608 /* ARGSUSED */
int
fstatfs(proc_t p, struct fstatfs_args *uap, __unused int32_t *retval)
{
	int error;
	vnode_t vp = NULL;
	struct mount *mp;

	AUDIT_ARG(fd, uap->fd);

	/*
	 * vp stays NULL if file_vnode() fails; the 'out' path uses that
	 * to decide whether a file_drop() is still owed.
	 */
	if ((error = file_vnode(uap->fd, &vp)) ||
	    (error = vnode_getwithref(vp))) {
		goto out;
	}

	AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);

	mp = vp->v_mount;
	if (!mp) {
		error = EBADF;
		goto out_vnode;
	}

	error = statfs_internal(p, mp, uap->buf);

out_vnode:
	/* Drop the iocount taken by vnode_getwithref(). */
	vnode_put(vp);

out:
	if (vp != NULL) {
		file_drop(uap->fd);
	}

	return error;
}
3643
3644 void
vfs_get_statfs64(struct mount * mp,struct statfs64 * sfs)3645 vfs_get_statfs64(struct mount *mp, struct statfs64 *sfs)
3646 {
3647 struct vfsstatfs *vsfs = &mp->mnt_vfsstat;
3648
3649 bzero(sfs, sizeof(*sfs));
3650
3651 sfs->f_bsize = vsfs->f_bsize;
3652 sfs->f_iosize = (int32_t)vsfs->f_iosize;
3653 sfs->f_blocks = vsfs->f_blocks;
3654 sfs->f_bfree = vsfs->f_bfree;
3655 sfs->f_bavail = vsfs->f_bavail;
3656 sfs->f_files = vsfs->f_files;
3657 sfs->f_ffree = vsfs->f_ffree;
3658 sfs->f_fsid = vsfs->f_fsid;
3659 sfs->f_owner = vsfs->f_owner;
3660 sfs->f_type = mp->mnt_vtable->vfc_typenum;
3661 sfs->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
3662 sfs->f_fssubtype = vsfs->f_fssubtype;
3663 sfs->f_flags_ext = vfs_getextflags(mp);
3664 vfs_getfstypename(mp, sfs->f_fstypename, MFSTYPENAMELEN);
3665 strlcpy(&sfs->f_mntonname[0], &vsfs->f_mntonname[0], MAXPATHLEN);
3666 strlcpy(&sfs->f_mntfromname[0], &vsfs->f_mntfromname[0], MAXPATHLEN);
3667 }
3668
/*
 * Get file system statistics in 64-bit mode
 */
int
statfs64(__unused struct proc *p, struct statfs64_args *uap, __unused int32_t *retval)
{
	struct mount *mp;
	int error;
	struct nameidata *ndp;
	struct statfs64 *sfsp;
	vfs_context_t ctxp = vfs_context_current();
	vnode_t vp;
	struct {
		struct nameidata nd;
		struct statfs64 sfs;
	} *__nameidata_statfs64;

	/*
	 * NOTE(review): 'p' is annotated __unused but is read below for the
	 * P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME check.
	 *
	 * nameidata + statfs64 are large; they are heap-allocated together,
	 * presumably to keep this frame off the limited kernel stack — confirm.
	 */
	__nameidata_statfs64 = kalloc_type(typeof(*__nameidata_statfs64),
	    Z_WAITOK);
	ndp = &__nameidata_statfs64->nd;

	NDINIT(ndp, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctxp);
	error = namei(ndp);
	if (error != 0) {
		goto out;
	}
	vp = ndp->ni_vp;
	mp = vp->v_mount;
	nameidone(ndp);

#if CONFIG_MACF
	error = mac_mount_check_stat(ctxp, mp);
	if (error != 0) {
		vnode_put(vp);
		goto out;
	}
#endif

	/* Refresh the cached statistics before reporting them. */
	error = vfs_update_vfsstat(mp, ctxp, VFS_USER_EVENT);
	if (error != 0) {
		vnode_put(vp);
		goto out;
	}

	sfsp = &__nameidata_statfs64->sfs;
	vfs_get_statfs64(mp, sfsp);
	if ((mp->mnt_kern_flag & MNTK_SYSTEMDATA) &&
	    (p->p_vfs_iopolicy & P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME)) {
		/* This process does not want to see a seperate data volume mountpoint */
		strlcpy(&sfsp->f_mntonname[0], "/", sizeof("/"));
	}
	error = copyout(sfsp, uap->buf, sizeof(*sfsp));
	vnode_put(vp);

out:
	kfree_type(typeof(*__nameidata_statfs64), __nameidata_statfs64);

	return error;
}
3729
3730 /*
3731 * Get file system statistics in 64-bit mode
3732 */
3733 int
fstatfs64(__unused struct proc * p,struct fstatfs64_args * uap,__unused int32_t * retval)3734 fstatfs64(__unused struct proc *p, struct fstatfs64_args *uap, __unused int32_t *retval)
3735 {
3736 struct vnode *vp;
3737 struct mount *mp;
3738 struct statfs64 sfs;
3739 int error;
3740
3741 AUDIT_ARG(fd, uap->fd);
3742
3743 if ((error = file_vnode(uap->fd, &vp))) {
3744 return error;
3745 }
3746
3747 error = vnode_getwithref(vp);
3748 if (error) {
3749 file_drop(uap->fd);
3750 return error;
3751 }
3752
3753 AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
3754
3755 mp = vp->v_mount;
3756 if (!mp) {
3757 error = EBADF;
3758 goto out;
3759 }
3760
3761 #if CONFIG_MACF
3762 error = mac_mount_check_stat(vfs_context_current(), mp);
3763 if (error != 0) {
3764 goto out;
3765 }
3766 #endif
3767
3768 if ((error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)) != 0) {
3769 goto out;
3770 }
3771
3772 vfs_get_statfs64(mp, &sfs);
3773 if ((mp->mnt_kern_flag & MNTK_SYSTEMDATA) &&
3774 (p->p_vfs_iopolicy & P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME)) {
3775 /* This process does not want to see a seperate data volume mountpoint */
3776 strlcpy(&sfs.f_mntonname[0], "/", sizeof("/"));
3777 }
3778 error = copyout(&sfs, uap->buf, sizeof(sfs));
3779
3780 out:
3781 file_drop(uap->fd);
3782 vnode_put(vp);
3783
3784 return error;
3785 }
3786
/* Accumulator threaded through the getfsstat*_callback() iterators. */
struct getfsstat_struct {
	user_addr_t sfsp;       /* user-buffer cursor for statfs output; 0 = count only */
	user_addr_t *mp;        /* optional per-mount MAC label buffers (may be NULL) */
	int count;              /* mounts visited so far (may exceed maxcount) */
	int maxcount;           /* capacity of the user buffer, in entries */
	int flags;              /* caller's MNT_WAIT/MNT_NOWAIT/MNT_DWAIT flags */
	int error;              /* first error encountered, if any */
};
3795
3796
/*
 * vfs_iterate() callback for __mac_getfsstat(): copy one mount's statfs
 * (and optionally its MAC label) to the user buffer, counting every
 * mount visited so the caller can detect truncation.
 */
static int
getfsstat_callback(mount_t mp, void * arg)
{
	struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
	struct vfsstatfs *sp;
	int error, my_size;
	vfs_context_t ctx = vfs_context_current();

	if (fstp->sfsp && fstp->count < fstp->maxcount) {
#if CONFIG_MACF
		error = mac_mount_check_stat(ctx, mp);
		if (error != 0) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
#endif
		sp = &mp->mnt_vfsstat;
		/*
		 * If MNT_NOWAIT is specified, do not refresh the
		 * fsstat cache. MNT_WAIT/MNT_DWAIT overrides MNT_NOWAIT.
		 */
		if ((mp->mnt_lflag & MNT_LDEAD) ||
		    (((fstp->flags & MNT_NOWAIT) == 0 || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
		    (!(mp->mnt_lflag & MNT_LUNMOUNT)) &&
		    (error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT)))) {
			/* Skip mounts that are dead or failed to refresh. */
			KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
			return VFS_RETURNED;
		}

		/*
		 * Need to handle LP64 version of struct statfs
		 */
		error = munge_statfs(mp, sp, fstp->sfsp, &my_size, IS_64BIT_PROCESS(vfs_context_proc(ctx)), FALSE);
		if (error) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
		/* Advance by the ABI-dependent size munge_statfs() reported. */
		fstp->sfsp += my_size;

		if (fstp->mp) {
#if CONFIG_MACF
			error = mac_mount_label_get(mp, *fstp->mp);
			if (error) {
				fstp->error = error;
				return VFS_RETURNED_DONE;
			}
#endif
			fstp->mp++;
		}
	}
	/* Counted even past maxcount, so the caller learns the true total. */
	fstp->count++;
	return VFS_RETURNED;
}
3850
3851 /*
3852 * Get statistics on all filesystems.
3853 */
3854 int
getfsstat(__unused proc_t p,struct getfsstat_args * uap,int * retval)3855 getfsstat(__unused proc_t p, struct getfsstat_args *uap, int *retval)
3856 {
3857 struct __mac_getfsstat_args muap;
3858
3859 muap.buf = uap->buf;
3860 muap.bufsize = uap->bufsize;
3861 muap.mac = USER_ADDR_NULL;
3862 muap.macsize = 0;
3863 muap.flags = uap->flags;
3864
3865 return __mac_getfsstat(p, &muap, retval);
3866 }
3867
3868 /*
3869 * __mac_getfsstat: Get MAC-related file system statistics
3870 *
3871 * Parameters: p (ignored)
3872 * uap User argument descriptor (see below)
3873 * retval Count of file system statistics (N stats)
3874 *
3875 * Indirect: uap->bufsize Buffer size
3876 * uap->macsize MAC info size
3877 * uap->buf Buffer where information will be returned
3878 * uap->mac MAC info
3879 * uap->flags File system flags
3880 *
3881 *
3882 * Returns: 0 Success
3883 * !0 Not success
3884 *
3885 */
int
__mac_getfsstat(__unused proc_t p, struct __mac_getfsstat_args *uap, int *retval)
{
	user_addr_t sfsp;
	user_addr_t *mp;
	size_t count, maxcount, bufsize, macsize;
	struct getfsstat_struct fst;

	/* NOTE(review): 'p' is annotated __unused but is in fact used below. */

	if ((unsigned)uap->bufsize > INT_MAX || (unsigned)uap->macsize > INT_MAX) {
		return EINVAL;
	}

	bufsize = (size_t) uap->bufsize;
	macsize = (size_t) uap->macsize;

	/* Per-entry size depends on the caller's ABI. */
	if (IS_64BIT_PROCESS(p)) {
		maxcount = bufsize / sizeof(struct user64_statfs);
	} else {
		maxcount = bufsize / sizeof(struct user32_statfs);
	}
	sfsp = uap->buf;
	count = 0;

	mp = NULL;

#if CONFIG_MACF
	if (uap->mac != USER_ADDR_NULL) {
		u_int32_t *mp0;
		int error;
		unsigned int i;

		/* One label pointer per statfs slot, sized per caller ABI. */
		count = (macsize / (IS_64BIT_PROCESS(p) ? 8 : 4));
		if (count != maxcount) {
			return EINVAL;
		}

		/* Copy in the array */
		mp0 = kalloc_data(macsize, Z_WAITOK);
		if (mp0 == NULL) {
			return ENOMEM;
		}

		error = copyin(uap->mac, mp0, macsize);
		if (error) {
			kfree_data(mp0, macsize);
			return error;
		}

		/* Normalize to an array of user_addr_t */
		mp = kalloc_data(count * sizeof(user_addr_t), Z_WAITOK);
		if (mp == NULL) {
			kfree_data(mp0, macsize);
			return ENOMEM;
		}

		for (i = 0; i < count; i++) {
			if (IS_64BIT_PROCESS(p)) {
				mp[i] = ((user_addr_t *)mp0)[i];
			} else {
				mp[i] = (user_addr_t)mp0[i];
			}
		}
		kfree_data(mp0, macsize);
	}
#endif


	fst.sfsp = sfsp;
	fst.mp = mp;
	fst.flags = uap->flags;
	fst.count = 0;
	fst.error = 0;
	fst.maxcount = (int)maxcount;


	/* NOSKIP_UNMOUNT: include mounts that are being unmounted. */
	vfs_iterate(VFS_ITERATE_NOSKIP_UNMOUNT, getfsstat_callback, &fst);

	if (mp) {
		kfree_data(mp, count * sizeof(user_addr_t));
	}

	if (fst.error) {
		KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
		return fst.error;
	}

	/* Report entries written, or the total mount count when counting. */
	if (fst.sfsp && fst.count > fst.maxcount) {
		*retval = fst.maxcount;
	} else {
		*retval = fst.count;
	}
	return 0;
}
3979
3980 static int
getfsstat64_callback(mount_t mp,void * arg)3981 getfsstat64_callback(mount_t mp, void * arg)
3982 {
3983 struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
3984 struct vfsstatfs *sp;
3985 struct statfs64 sfs;
3986 int error;
3987
3988 if (fstp->sfsp && fstp->count < fstp->maxcount) {
3989 #if CONFIG_MACF
3990 error = mac_mount_check_stat(vfs_context_current(), mp);
3991 if (error != 0) {
3992 fstp->error = error;
3993 return VFS_RETURNED_DONE;
3994 }
3995 #endif
3996 sp = &mp->mnt_vfsstat;
3997 /*
3998 * If MNT_NOWAIT is specified, do not refresh the fsstat
3999 * cache. MNT_WAIT overrides MNT_NOWAIT.
4000 *
4001 * We treat MNT_DWAIT as MNT_WAIT for all instances of
4002 * getfsstat, since the constants are out of the same
4003 * namespace.
4004 */
4005 if ((mp->mnt_lflag & MNT_LDEAD) ||
4006 ((((fstp->flags & MNT_NOWAIT) == 0) || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
4007 (!(mp->mnt_lflag & MNT_LUNMOUNT)) &&
4008 (error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)))) {
4009 KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
4010 return VFS_RETURNED;
4011 }
4012
4013 vfs_get_statfs64(mp, &sfs);
4014 error = copyout(&sfs, fstp->sfsp, sizeof(sfs));
4015 if (error) {
4016 fstp->error = error;
4017 return VFS_RETURNED_DONE;
4018 }
4019 fstp->sfsp += sizeof(sfs);
4020 }
4021 fstp->count++;
4022 return VFS_RETURNED;
4023 }
4024
/*
 * Get statistics on all file systems in 64 bit mode.
 */
int
getfsstat64(__unused proc_t p, struct getfsstat64_args *uap, int *retval)
{
	user_addr_t sfsp;
	int count, maxcount;
	struct getfsstat_struct fst;

	/* Entries are fixed-size struct statfs64, regardless of caller ABI. */
	maxcount = uap->bufsize / sizeof(struct statfs64);

	sfsp = uap->buf;
	count = 0;

	/* Note: fst.mp is left unset; getfsstat64_callback() never reads it. */
	fst.sfsp = sfsp;
	fst.flags = uap->flags;
	fst.count = 0;
	fst.error = 0;
	fst.maxcount = maxcount;

	vfs_iterate(VFS_ITERATE_NOSKIP_UNMOUNT, getfsstat64_callback, &fst);

	if (fst.error) {
		KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
		return fst.error;
	}

	/* Report entries written, or the total mount count when counting. */
	if (fst.sfsp && fst.count > fst.maxcount) {
		*retval = fst.maxcount;
	} else {
		*retval = fst.count;
	}

	return 0;
}
4061
4062 /*
4063 * gets the associated vnode with the file descriptor passed.
4064 * as input
4065 *
4066 * INPUT
4067 * ctx - vfs context of caller
4068 * fd - file descriptor for which vnode is required.
4069 * vpp - Pointer to pointer to vnode to be returned.
4070 *
4071 * The vnode is returned with an iocount so any vnode obtained
4072 * by this call needs a vnode_put
4073 *
4074 */
4075 int
vnode_getfromfd(vfs_context_t ctx,int fd,vnode_t * vpp)4076 vnode_getfromfd(vfs_context_t ctx, int fd, vnode_t *vpp)
4077 {
4078 int error;
4079 vnode_t vp;
4080 struct fileproc *fp;
4081 proc_t p = vfs_context_proc(ctx);
4082
4083 *vpp = NULLVP;
4084
4085 error = fp_getfvp(p, fd, &fp, &vp);
4086 if (error) {
4087 return error;
4088 }
4089
4090 error = vnode_getwithref(vp);
4091 if (error) {
4092 (void)fp_drop(p, fd, fp, 0);
4093 return error;
4094 }
4095
4096 (void)fp_drop(p, fd, fp, 0);
4097 *vpp = vp;
4098 return error;
4099 }
4100
4101 /*
4102 * Wrapper function around namei to start lookup from a directory
4103 * specified by a file descriptor ni_dirfd.
4104 *
4105 * In addition to all the errors returned by namei, this call can
4106 * return ENOTDIR if the file descriptor does not refer to a directory.
4107 * and EBADF if the file descriptor is not valid.
4108 */
int
nameiat(struct nameidata *ndp, int dirfd)
{
	/*
	 * dirfd only matters for a relative path on a fresh lookup that
	 * has not already been given a starting directory (USEDVP).
	 */
	if ((dirfd != AT_FDCWD) &&
	    !(ndp->ni_flag & NAMEI_CONTLOOKUP) &&
	    !(ndp->ni_cnd.cn_flags & USEDVP)) {
		int error = 0;
		char c;

		/* Peek at the first byte of the path to detect '/'. */
		if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
			error = copyin(ndp->ni_dirp, &c, sizeof(char));
			if (error) {
				return error;
			}
		} else {
			c = *((char *)(ndp->ni_dirp));
		}

		if (c != '/') {
			vnode_t dvp_at;

			error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
			    &dvp_at);
			if (error) {
				return error;
			}

			if (vnode_vtype(dvp_at) != VDIR) {
				vnode_put(dvp_at);
				return ENOTDIR;
			}

			/* Anchor the lookup at dirfd's vnode. */
			ndp->ni_dvp = dvp_at;
			ndp->ni_cnd.cn_flags |= USEDVP;
			error = namei(ndp);
			ndp->ni_cnd.cn_flags &= ~USEDVP;
			vnode_put(dvp_at);
			return error;
		}
	}

	/* Absolute path or AT_FDCWD: plain namei() semantics. */
	return namei(ndp);
}
4152
4153 /*
4154 * Change current working directory to a given file descriptor.
4155 */
4156 /* ARGSUSED */
int
fchdir(proc_t p, vfs_context_t ctx, int fd, bool per_thread)
{
	vnode_t vp;
	vnode_t tdp;
	vnode_t tvp;
	struct mount *mp;
	int error, should_put = 1;

	AUDIT_ARG(fd, fd);
	if (per_thread && fd == -1) {
		/*
		 * Switching back from per-thread to per process CWD; verify we
		 * in fact have one before proceeding. The only success case
		 * for this code path is to return 0 preemptively after zapping
		 * the thread structure contents.
		 */
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = NULLVP;
			if (tvp != NULLVP) {
				vnode_rele(tvp);
				return 0;
			}
		}
		return EBADF;
	}

	if ((error = file_vnode(fd, &vp))) {
		return error;
	}
	if ((error = vnode_getwithref(vp))) {
		file_drop(fd);
		return error;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	if (vp->v_type != VDIR) {
		error = ENOTDIR;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_chdir(ctx, vp);
	if (error) {
		goto out;
	}
#endif
	/* The caller must have search permission on the new directory. */
	error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
	if (error) {
		goto out;
	}

	/*
	 * If the fd names a mount point, descend to the root of the
	 * (possibly stacked) filesystem(s) mounted on it.
	 */
	while (!error && (mp = vp->v_mountedhere) != NULL) {
		if (vfs_busy(mp, LK_NOWAIT)) {
			error = EACCES;
			goto out;
		}
		error = VFS_ROOT(mp, &tdp, ctx);
		vfs_unbusy(mp);
		if (error) {
			break;
		}
		vnode_put(vp);
		vp = tdp;
	}
	if (error) {
		goto out;
	}
	/* Trade the iocount for a long-lived usecount on the new cwd. */
	if ((error = vnode_ref(vp))) {
		goto out;
	}
	vnode_put(vp);
	should_put = 0;

	if (per_thread) {
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = vp;
			/* Mark the process as having thread-local cwds. */
			OSBitOrAtomic(P_THCWD, &p->p_flag);
		} else {
			vnode_rele(vp);
			error = ENOENT;
			goto out;
		}
	} else {
		/*
		 * proc_dirs lock synchronizes with lookups resolving the
		 * process cwd; proc_fdlock protects the filedesc fields.
		 */
		proc_dirs_lock_exclusive(p);
		proc_fdlock(p);
		tvp = p->p_fd.fd_cdir;
		p->p_fd.fd_cdir = vp;
		proc_fdunlock(p);
		proc_dirs_unlock_exclusive(p);
	}

	/* Release the usecount on the previous cwd, if any. */
	if (tvp) {
		vnode_rele(tvp);
	}

out:
	if (should_put) {
		vnode_put(vp);
	}
	file_drop(fd);

	return error;
}
4268
int
sys_fchdir(proc_t p, struct fchdir_args *uap, __unused int32_t *retval)
{
	/* Process-wide chdir to the directory open on uap->fd. */
	return fchdir(p, vfs_context_current(), uap->fd, false);
}
4274
int
__pthread_fchdir(proc_t p, struct __pthread_fchdir_args *uap, __unused int32_t *retval)
{
	/* Per-thread chdir to the directory open on uap->fd (fd == -1 reverts). */
	return fchdir(p, vfs_context_current(), uap->fd, true);
}
4280
4281
4282 /*
4283 * Change current working directory (".").
4284 *
4285 * Returns: 0 Success
4286 * change_dir:ENOTDIR
4287 * change_dir:???
4288 * vnode_ref:ENOENT No such file or directory
4289 */
4290 /* ARGSUSED */
int
chdir_internal(proc_t p, vfs_context_t ctx, struct nameidata *ndp, int per_thread)
{
	int error;
	vnode_t tvp;

	/* change_dir() returns ndp->ni_vp with an iocount on success. */
	error = change_dir(ndp, ctx);
	if (error) {
		return error;
	}
	/* Take a usecount for the long-lived cwd reference. */
	if ((error = vnode_ref(ndp->ni_vp))) {
		vnode_put(ndp->ni_vp);
		return error;
	}
	/*
	 * drop the iocount we picked up in change_dir
	 */
	vnode_put(ndp->ni_vp);

	if (per_thread) {
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = ndp->ni_vp;
			/* Mark the process as having thread-local cwds. */
			OSBitOrAtomic(P_THCWD, &p->p_flag);
		} else {
			vnode_rele(ndp->ni_vp);
			return ENOENT;
		}
	} else {
		/* See fchdir(): dirs lock synchronizes with lookup. */
		proc_dirs_lock_exclusive(p);
		proc_fdlock(p);
		tvp = p->p_fd.fd_cdir;
		p->p_fd.fd_cdir = ndp->ni_vp;
		proc_fdunlock(p);
		proc_dirs_unlock_exclusive(p);
	}

	/* Release the usecount on the previous cwd, if any. */
	if (tvp) {
		vnode_rele(tvp);
	}

	return 0;
}
4336
4337
4338 /*
4339 * Change current working directory (".").
4340 *
4341 * Returns: 0 Success
4342 * chdir_internal:ENOTDIR
4343 * chdir_internal:ENOENT No such file or directory
4344 * chdir_internal:???
4345 */
4346 /* ARGSUSED */
static int
common_chdir(proc_t p, struct chdir_args *uap, int per_thread)
{
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();

	/* Look up uap->path and switch the cwd (process- or thread-wide). */
	NDINIT(&nd, LOOKUP, OP_CHDIR, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);

	return chdir_internal(p, ctx, &nd, per_thread);
}
4358
4359
4360 /*
4361 * chdir
4362 *
4363 * Change current working directory (".") for the entire process
4364 *
4365 * Parameters: p Process requesting the call
4366 * uap User argument descriptor (see below)
4367 * retval (ignored)
4368 *
4369 * Indirect parameters: uap->path Directory path
4370 *
4371 * Returns: 0 Success
4372 * common_chdir: ENOTDIR
4373 * common_chdir: ENOENT No such file or directory
4374 * common_chdir: ???
4375 *
4376 */
int
sys_chdir(proc_t p, struct chdir_args *uap, __unused int32_t *retval)
{
	/* Process-wide chdir (per_thread == 0). */
	return common_chdir(p, (void *)uap, 0);
}
4382
4383 /*
4384 * __pthread_chdir
4385 *
4386 * Change current working directory (".") for a single thread
4387 *
4388 * Parameters: p Process requesting the call
4389 * uap User argument descriptor (see below)
4390 * retval (ignored)
4391 *
4392 * Indirect parameters: uap->path Directory path
4393 *
4394 * Returns: 0 Success
4395 * common_chdir: ENOTDIR
4396 * common_chdir: ENOENT No such file or directory
4397 * common_chdir: ???
4398 *
4399 */
4400 int
__pthread_chdir(proc_t p,struct __pthread_chdir_args * uap,__unused int32_t * retval)4401 __pthread_chdir(proc_t p, struct __pthread_chdir_args *uap, __unused int32_t *retval)
4402 {
4403 return common_chdir(p, (void *)uap, 1);
4404 }
4405
4406
4407 /*
4408 * Change notion of root (``/'') directory.
4409 */
4410 /* ARGSUSED */
4411 int
chroot(proc_t p,struct chroot_args * uap,__unused int32_t * retval)4412 chroot(proc_t p, struct chroot_args *uap, __unused int32_t *retval)
4413 {
4414 struct filedesc *fdp = &p->p_fd;
4415 int error;
4416 struct nameidata nd;
4417 vnode_t tvp;
4418 vfs_context_t ctx = vfs_context_current();
4419
4420 if ((error = suser(kauth_cred_get(), &p->p_acflag))) {
4421 return error;
4422 }
4423
4424 NDINIT(&nd, LOOKUP, OP_CHROOT, FOLLOW | AUDITVNPATH1,
4425 UIO_USERSPACE, uap->path, ctx);
4426 error = change_dir(&nd, ctx);
4427 if (error) {
4428 return error;
4429 }
4430
4431 #if CONFIG_MACF
4432 error = mac_vnode_check_chroot(ctx, nd.ni_vp,
4433 &nd.ni_cnd);
4434 if (error) {
4435 vnode_put(nd.ni_vp);
4436 return error;
4437 }
4438 #endif
4439
4440 if ((error = vnode_ref(nd.ni_vp))) {
4441 vnode_put(nd.ni_vp);
4442 return error;
4443 }
4444 vnode_put(nd.ni_vp);
4445
4446 /*
4447 * This lock provides the guarantee that as long as you hold the lock
4448 * fdp->fd_rdir has a usecount on it. This is used to take an iocount
4449 * on a referenced vnode in namei when determining the rootvnode for
4450 * a process.
4451 */
4452 /* needed for synchronization with lookup */
4453 proc_dirs_lock_exclusive(p);
4454 /* needed for setting the flag and other activities on the fd itself */
4455 proc_fdlock(p);
4456 tvp = fdp->fd_rdir;
4457 fdp->fd_rdir = nd.ni_vp;
4458 fdt_flag_set(fdp, FD_CHROOT);
4459 proc_fdunlock(p);
4460 proc_dirs_unlock_exclusive(p);
4461
4462 if (tvp != NULL) {
4463 vnode_rele(tvp);
4464 }
4465
4466 return 0;
4467 }
4468
4469 #define PATHSTATICBUFLEN 256
4470 #define PIVOT_ROOT_ENTITLEMENT \
4471 "com.apple.private.vfs.pivot-root"
4472
#if defined(XNU_TARGET_OS_OSX)
/*
 * pivot_root: make the filesystem at new_rootfs_path_before the new root
 * and remount the old root at old_rootfs_path_after.
 *
 * Restricted to the superuser AND to launchd (pid 1) holding the
 * com.apple.private.vfs.pivot-root entitlement.  The incoming filesystem
 * must pass kernel root authentication (not a chunklist DMG image).
 */
int
pivot_root(proc_t p, struct pivot_root_args *uap, __unused int *retval)
{
	int error;
	char new_rootfs_path_before[PATHSTATICBUFLEN] = {0};
	char old_rootfs_path_after[PATHSTATICBUFLEN] = {0};
	char *new_rootfs_path_before_buf = NULL;
	char *old_rootfs_path_after_buf = NULL;
	char *incoming = NULL;
	char *outgoing = NULL;
	vnode_t incoming_rootvp = NULLVP;
	size_t bytes_copied;

	/*
	 * XXX : Additional restrictions needed
	 * - perhaps callable only once.
	 */
	if ((error = suser(kauth_cred_get(), &p->p_acflag))) {
		return error;
	}

	/*
	 * pivot_root can be executed by launchd only.
	 * Enforce entitlement.
	 */
	if ((proc_getpid(p) != 1) || !IOCurrentTaskHasEntitlement(PIVOT_ROOT_ENTITLEMENT)) {
		return EPERM;
	}

	/*
	 * Copy in the two paths.  Try the small on-stack buffer first and
	 * fall back to a MAXPATHLEN heap buffer only on ENAMETOOLONG.
	 */
	error = copyinstr(uap->new_rootfs_path_before, &new_rootfs_path_before[0], PATHSTATICBUFLEN, &bytes_copied);
	if (error == ENAMETOOLONG) {
		new_rootfs_path_before_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
		error = copyinstr(uap->new_rootfs_path_before, new_rootfs_path_before_buf, MAXPATHLEN, &bytes_copied);
	}

	if (error) {
		goto out;
	}

	error = copyinstr(uap->old_rootfs_path_after, &old_rootfs_path_after[0], PATHSTATICBUFLEN, &bytes_copied);
	if (error == ENAMETOOLONG) {
		old_rootfs_path_after_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
		error = copyinstr(uap->old_rootfs_path_after, old_rootfs_path_after_buf, MAXPATHLEN, &bytes_copied);
	}
	if (error) {
		goto out;
	}

	/* Point incoming/outgoing at whichever buffer actually holds the path. */
	if (new_rootfs_path_before_buf) {
		incoming = new_rootfs_path_before_buf;
	} else {
		incoming = &new_rootfs_path_before[0];
	}

	if (old_rootfs_path_after_buf) {
		outgoing = old_rootfs_path_after_buf;
	} else {
		outgoing = &old_rootfs_path_after[0];
	}

	/*
	 * The proposed incoming FS MUST be authenticated (i.e. not a chunklist DMG).
	 * Userland is not allowed to pivot to an image.
	 */
	error = vnode_lookup(incoming, 0, &incoming_rootvp, vfs_context_kernel());
	if (error) {
		goto out;
	}
	error = VNOP_IOCTL(incoming_rootvp, FSIOC_KERNEL_ROOTAUTH, NULL, 0, vfs_context_kernel());
	if (error) {
		goto out;
	}

	error = vfs_switch_root(incoming, outgoing, VFSSR_VIRTUALDEV_PROHIBITED);

out:
	/* Common cleanup: drop the iocount and free any heap path buffers. */
	if (incoming_rootvp != NULLVP) {
		vnode_put(incoming_rootvp);
		incoming_rootvp = NULLVP;
	}

	if (old_rootfs_path_after_buf) {
		zfree(ZV_NAMEI, old_rootfs_path_after_buf);
	}

	if (new_rootfs_path_before_buf) {
		zfree(ZV_NAMEI, new_rootfs_path_before_buf);
	}

	return error;
}
#else
/* pivot_root is macOS-only; elsewhere it fails like an unimplemented syscall. */
int
pivot_root(proc_t p, __unused struct pivot_root_args *uap, int *retval)
{
	return nosys(p, NULL, retval);
}
#endif /* XNU_TARGET_OS_OSX */
4572
4573 /*
4574 * Common routine for chroot and chdir.
4575 *
4576 * Returns: 0 Success
4577 * ENOTDIR Not a directory
4578 * namei:??? [anything namei can return]
4579 * vnode_authorize:??? [anything vnode_authorize can return]
4580 */
4581 static int
change_dir(struct nameidata * ndp,vfs_context_t ctx)4582 change_dir(struct nameidata *ndp, vfs_context_t ctx)
4583 {
4584 vnode_t vp;
4585 int error;
4586
4587 if ((error = namei(ndp))) {
4588 return error;
4589 }
4590 nameidone(ndp);
4591 vp = ndp->ni_vp;
4592
4593 if (vp->v_type != VDIR) {
4594 vnode_put(vp);
4595 return ENOTDIR;
4596 }
4597
4598 #if CONFIG_MACF
4599 error = mac_vnode_check_chdir(ctx, vp);
4600 if (error) {
4601 vnode_put(vp);
4602 return error;
4603 }
4604 #endif
4605
4606 error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
4607 if (error) {
4608 vnode_put(vp);
4609 return error;
4610 }
4611
4612 return error;
4613 }
4614
4615 /*
4616 * Free the vnode data (for directories) associated with the file glob.
4617 */
4618 struct fd_vn_data *
fg_vn_data_alloc(void)4619 fg_vn_data_alloc(void)
4620 {
4621 struct fd_vn_data *fvdata;
4622
4623 /* Allocate per fd vnode data */
4624 fvdata = kalloc_type(struct fd_vn_data, Z_WAITOK | Z_ZERO);
4625 lck_mtx_init(&fvdata->fv_lock, &fd_vn_lck_grp, &fd_vn_lck_attr);
4626 return fvdata;
4627 }
4628
4629 /*
4630 * Free the vnode data (for directories) associated with the file glob.
4631 */
4632 void
fg_vn_data_free(void * fgvndata)4633 fg_vn_data_free(void *fgvndata)
4634 {
4635 struct fd_vn_data *fvdata = (struct fd_vn_data *)fgvndata;
4636
4637 kfree_data(fvdata->fv_buf, fvdata->fv_bufallocsiz);
4638 lck_mtx_destroy(&fvdata->fv_lock, &fd_vn_lck_grp);
4639 kfree_type(struct fd_vn_data, fvdata);
4640 }
4641
4642 /*
4643 * Check permissions, allocate an open file structure,
4644 * and call the device open routine if any.
4645 *
4646 * Returns: 0 Success
4647 * EINVAL
4648 * EINTR
4649 * falloc:ENFILE
4650 * falloc:EMFILE
4651 * falloc:ENOMEM
4652 * vn_open_auth:???
4653 * dupfdopen:???
4654 * VNOP_ADVLOCK:???
4655 * vnode_setsize:???
4656 *
4657 * XXX Need to implement uid, gid
4658 */
4659 int
open1(vfs_context_t ctx,struct nameidata * ndp,int uflags,struct vnode_attr * vap,fp_initfn_t fp_init,void * initarg,int32_t * retval,int authfd)4660 open1(vfs_context_t ctx, struct nameidata *ndp, int uflags,
4661 struct vnode_attr *vap, fp_initfn_t fp_init, void *initarg, int32_t *retval, int authfd)
4662 {
4663 proc_t p = vfs_context_proc(ctx);
4664 kauth_cred_t p_cred = current_cached_proc_cred(PROC_NULL);
4665 uthread_t uu = get_bsdthread_info(vfs_context_thread(ctx));
4666 struct fileproc *fp;
4667 vnode_t vp;
4668 int flags, oflags, amode;
4669 int type, indx, error;
4670 struct vfs_context context;
4671 vnode_t authvp = NULLVP;
4672
4673 oflags = uflags;
4674
4675 amode = oflags & O_ACCMODE;
4676 /*
4677 * Because O_RDONLY is 0, it is not possible to distinguish between
4678 * O_EXEC | O_RDONLY and O_EXEC, therefore FEXEC/FSEARCH can't be set together
4679 * with FREAD/FWRITE.
4680 */
4681 if ((amode == O_ACCMODE) || (amode && (oflags & O_EXEC))) {
4682 return EINVAL;
4683 }
4684
4685 flags = FFLAGS(uflags);
4686 CLR(flags, FENCRYPTED);
4687 CLR(flags, FUNENCRYPTED);
4688
4689 AUDIT_ARG(fflags, oflags);
4690 AUDIT_ARG(mode, vap->va_mode);
4691
4692 if ((error = falloc_withinit(p, p_cred, ctx, &fp, &indx, fp_init, initarg)) != 0) {
4693 return error;
4694 }
4695 if (flags & O_CLOEXEC) {
4696 fp->fp_flags |= FP_CLOEXEC;
4697 }
4698 if (flags & O_CLOFORK) {
4699 fp->fp_flags |= FP_CLOFORK;
4700 }
4701
4702 /* setup state to recognize when fdesc_open was called */
4703 uu->uu_dupfd = -1;
4704
4705 /*
4706 * Disable read/write access if file is opened with O_EVTONLY and
4707 * the process has requested to deny read/write access.
4708 */
4709 if ((flags & O_EVTONLY) && proc_disallow_rw_for_o_evtonly(p)) {
4710 flags &= ~(FREAD | FWRITE);
4711 }
4712
4713 if (authfd != AUTH_OPEN_NOAUTHFD) {
4714 error = vnode_getfromfd(ctx, authfd, &authvp);
4715 if (error) {
4716 fp_free(p, indx, fp);
4717 return error;
4718 }
4719 }
4720
4721 if ((error = vn_open_auth(ndp, &flags, vap, authvp))) {
4722 if (authvp != NULLVP) {
4723 vnode_put(authvp);
4724 }
4725 if ((error == ENODEV || error == ENXIO) && (uu->uu_dupfd >= 0)) {
4726 if ((error = dupfdopen(p, indx, uu->uu_dupfd, flags, error)) == 0) {
4727 *retval = indx;
4728 return 0;
4729 }
4730 }
4731 if (error == ERESTART) {
4732 error = EINTR;
4733 }
4734 fp_free(p, indx, fp);
4735 return error;
4736 }
4737
4738 if (authvp != NULLVP) {
4739 vnode_put(authvp);
4740 }
4741
4742 uu->uu_dupfd = 0;
4743 vp = ndp->ni_vp;
4744
4745 fp->fp_glob->fg_flag = flags & (FMASK | O_EVTONLY | FENCRYPTED | FUNENCRYPTED);
4746 fp->fp_glob->fg_ops = &vnops;
4747 fp_set_data(fp, vp);
4748
4749 #if CONFIG_FILE_LEASES
4750 /*
4751 * If we are creating a file or open with truncate, we need to break the
4752 * lease if there is a read lease placed on the parent dir.
4753 */
4754 if ((vnode_vtype(vp) == VREG) && (flags & (O_CREAT | O_TRUNC))) {
4755 vnode_breakdirlease(vp, true, oflags);
4756 }
4757 /* Now check if there is a lease placed on the file itself. */
4758 error = vnode_breaklease(vp, oflags, ctx);
4759 if (error) {
4760 goto bad;
4761 }
4762 #endif /* CONFIG_FILE_LEASES */
4763
4764 if (flags & (O_EXLOCK | O_SHLOCK)) {
4765 struct flock lf = {
4766 .l_whence = SEEK_SET,
4767 };
4768
4769 if (flags & O_EXLOCK) {
4770 lf.l_type = F_WRLCK;
4771 } else {
4772 lf.l_type = F_RDLCK;
4773 }
4774 type = F_FLOCK;
4775 if ((flags & FNONBLOCK) == 0) {
4776 type |= F_WAIT;
4777 }
4778 #if CONFIG_MACF
4779 error = mac_file_check_lock(vfs_context_ucred(ctx), fp->fp_glob,
4780 F_SETLK, &lf);
4781 if (error) {
4782 goto bad;
4783 }
4784 #endif
4785 if ((error = VNOP_ADVLOCK(vp, (caddr_t)fp->fp_glob, F_SETLK, &lf, type, ctx, NULL))) {
4786 goto bad;
4787 }
4788 fp->fp_glob->fg_flag |= FWASLOCKED;
4789 }
4790
4791 /* try to truncate by setting the size attribute */
4792 if ((flags & O_TRUNC) && ((error = vnode_setsize(vp, (off_t)0, 0, ctx)) != 0)) {
4793 goto bad;
4794 }
4795
4796 /*
4797 * For directories we hold some additional information in the fd.
4798 */
4799 if (vnode_vtype(vp) == VDIR) {
4800 fp->fp_glob->fg_vn_data = fg_vn_data_alloc();
4801 } else {
4802 fp->fp_glob->fg_vn_data = NULL;
4803 }
4804
4805 #if CONFIG_SECLUDED_MEMORY
4806 if (secluded_for_filecache && vnode_vtype(vp) == VREG) {
4807 memory_object_control_t moc;
4808 const char *v_name;
4809
4810 moc = ubc_getobject(vp, UBC_FLAGS_NONE);
4811
4812 if (moc == MEMORY_OBJECT_CONTROL_NULL) {
4813 /* nothing to do... */
4814 } else if (fp->fp_glob->fg_flag & FWRITE) {
4815 /* writable -> no longer eligible for secluded pages */
4816 memory_object_mark_eligible_for_secluded(moc,
4817 FALSE);
4818 } else if (secluded_for_filecache == SECLUDED_FILECACHE_APPS) {
4819 char pathname[32] = { 0, };
4820 size_t copied;
4821 /* XXX FBDP: better way to detect /Applications/ ? */
4822 if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
4823 (void)copyinstr(ndp->ni_dirp,
4824 pathname,
4825 sizeof(pathname),
4826 &copied);
4827 } else {
4828 copystr(CAST_DOWN(void *, ndp->ni_dirp),
4829 pathname,
4830 sizeof(pathname),
4831 &copied);
4832 }
4833 pathname[sizeof(pathname) - 1] = '\0';
4834 if (strncmp(pathname,
4835 "/Applications/",
4836 strlen("/Applications/")) == 0 &&
4837 strncmp(pathname,
4838 "/Applications/Camera.app/",
4839 strlen("/Applications/Camera.app/")) != 0) {
4840 /*
4841 * not writable
4842 * AND from "/Applications/"
4843 * AND not from "/Applications/Camera.app/"
4844 * ==> eligible for secluded
4845 */
4846 memory_object_mark_eligible_for_secluded(moc,
4847 TRUE);
4848 }
4849 } else if (secluded_for_filecache == SECLUDED_FILECACHE_RDONLY &&
4850 (v_name = vnode_getname(vp))) {
4851 size_t len = strlen(v_name);
4852
4853 if (!strncmp(v_name, "dyld", len) ||
4854 !strncmp(v_name, "launchd", len) ||
4855 !strncmp(v_name, "Camera", len) ||
4856 !strncmp(v_name, "SpringBoard", len) ||
4857 !strncmp(v_name, "backboardd", len) ||
4858 !strncmp(v_name, "cameracaptured", len)) {
4859 /*
4860 * This file matters when launching Camera:
4861 * do not store its contents in the secluded
4862 * pool that will be drained on Camera launch.
4863 */
4864 memory_object_mark_eligible_for_secluded(moc,
4865 FALSE);
4866 } else if (!strncmp(v_name, "audiomxd", len) ||
4867 !strncmp(v_name, "mediaplaybackd", len)) {
4868 memory_object_mark_eligible_for_secluded(moc,
4869 FALSE);
4870 memory_object_mark_for_realtime(moc,
4871 true);
4872 } else if (!strncmp(v_name, "bluetoothd", len)) {
4873 /*
4874 * bluetoothd might be needed for realtime audio
4875 * playback.
4876 */
4877 memory_object_mark_eligible_for_secluded(moc,
4878 FALSE);
4879 memory_object_mark_for_realtime(moc,
4880 true);
4881 } else {
4882 char pathname[64] = { 0, };
4883 size_t copied;
4884 if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
4885 (void)copyinstr(ndp->ni_dirp,
4886 pathname,
4887 sizeof(pathname),
4888 &copied);
4889 } else {
4890 copystr(CAST_DOWN(void *, ndp->ni_dirp),
4891 pathname,
4892 sizeof(pathname),
4893 &copied);
4894 }
4895 pathname[sizeof(pathname) - 1] = '\0';
4896 if (strncmp(pathname,
4897 "/Library/Audio/Plug-Ins/",
4898 strlen("/Library/Audio/Plug-Ins/")) == 0 ||
4899 strncmp(pathname,
4900 "/System/Library/Audio/Plug-Ins/",
4901 strlen("/System/Library/Audio/Plug-Ins/")) == 0) {
4902 /*
4903 * This may be an audio plugin required
4904 * for realtime playback.
4905 * ==> NOT eligible for secluded.
4906 */
4907 memory_object_mark_eligible_for_secluded(moc,
4908 FALSE);
4909 memory_object_mark_for_realtime(moc,
4910 true);
4911 }
4912 }
4913 vnode_putname(v_name);
4914 }
4915 }
4916 #endif /* CONFIG_SECLUDED_MEMORY */
4917
4918 vnode_put(vp);
4919
4920 /*
4921 * The first terminal open (without a O_NOCTTY) by a session leader
4922 * results in it being set as the controlling terminal.
4923 */
4924 if (vnode_istty(vp) && !(p->p_flag & P_CONTROLT) &&
4925 !(flags & O_NOCTTY)) {
4926 int tmp = 0;
4927
4928 (void)(*fp->fp_glob->fg_ops->fo_ioctl)(fp, (int)TIOCSCTTY,
4929 (caddr_t)&tmp, ctx);
4930 }
4931
4932 proc_fdlock(p);
4933 procfdtbl_releasefd(p, indx, NULL);
4934
4935 fp_drop(p, indx, fp, 1);
4936 proc_fdunlock(p);
4937
4938 *retval = indx;
4939
4940 return 0;
4941 bad:
4942 context = *vfs_context_current();
4943 context.vc_ucred = fp->fp_glob->fg_cred;
4944
4945 if ((fp->fp_glob->fg_flag & FWASLOCKED) &&
4946 (FILEGLOB_DTYPE(fp->fp_glob) == DTYPE_VNODE)) {
4947 struct flock lf = {
4948 .l_whence = SEEK_SET,
4949 .l_type = F_UNLCK,
4950 };
4951
4952 (void)VNOP_ADVLOCK(
4953 vp, (caddr_t)fp->fp_glob, F_UNLCK, &lf, F_FLOCK, ctx, NULL);
4954 }
4955
4956 vn_close(vp, fp->fp_glob->fg_flag, &context);
4957 vnode_put(vp);
4958 fp_free(p, indx, fp);
4959
4960 return error;
4961 }
4962
4963 /*
4964 * While most of the *at syscall handlers can call nameiat() which
4965 * is a wrapper around namei, the use of namei and initialisation
4966 * of nameidata are far removed and in different functions - namei
4967 * gets called in vn_open_auth for open1. So we'll just do here what
4968 * nameiat() does.
4969 */
4970 static int
open1at(vfs_context_t ctx,struct nameidata * ndp,int uflags,struct vnode_attr * vap,fp_initfn_t fp_init,void * initarg,int32_t * retval,int dirfd,int authfd)4971 open1at(vfs_context_t ctx, struct nameidata *ndp, int uflags,
4972 struct vnode_attr *vap, fp_initfn_t fp_init, void *initarg, int32_t *retval,
4973 int dirfd, int authfd)
4974 {
4975 if ((dirfd != AT_FDCWD) && !(ndp->ni_cnd.cn_flags & USEDVP)) {
4976 int error;
4977 char c;
4978
4979 if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
4980 error = copyin(ndp->ni_dirp, &c, sizeof(char));
4981 if (error) {
4982 return error;
4983 }
4984 } else {
4985 c = *((char *)(ndp->ni_dirp));
4986 }
4987
4988 if (c != '/') {
4989 vnode_t dvp_at;
4990
4991 error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
4992 &dvp_at);
4993 if (error) {
4994 return error;
4995 }
4996
4997 if (vnode_vtype(dvp_at) != VDIR) {
4998 vnode_put(dvp_at);
4999 return ENOTDIR;
5000 }
5001
5002 ndp->ni_dvp = dvp_at;
5003 ndp->ni_cnd.cn_flags |= USEDVP;
5004 error = open1(ctx, ndp, uflags, vap, fp_init, initarg,
5005 retval, authfd);
5006 vnode_put(dvp_at);
5007 return error;
5008 }
5009 }
5010
5011 return open1(ctx, ndp, uflags, vap, fp_init, initarg, retval, authfd);
5012 }
5013
5014 /*
5015 * open_extended: open a file given a path name; with extended argument list (including extended security (ACL)).
5016 *
5017 * Parameters: p Process requesting the open
5018 * uap User argument descriptor (see below)
5019 * retval Pointer to an area to receive the
5020 * return calue from the system call
5021 *
5022 * Indirect: uap->path Path to open (same as 'open')
5023 * uap->flags Flags to open (same as 'open'
5024 * uap->uid UID to set, if creating
5025 * uap->gid GID to set, if creating
5026 * uap->mode File mode, if creating (same as 'open')
5027 * uap->xsecurity ACL to set, if creating
5028 *
5029 * Returns: 0 Success
5030 * !0 errno value
5031 *
5032 * Notes: The kauth_filesec_t in 'va', if any, is in host byte order.
5033 *
5034 * XXX: We should enummerate the possible errno values here, and where
5035 * in the code they originated.
5036 */
5037 int
open_extended(proc_t p,struct open_extended_args * uap,int32_t * retval)5038 open_extended(proc_t p, struct open_extended_args *uap, int32_t *retval)
5039 {
5040 int ciferror;
5041 kauth_filesec_t xsecdst;
5042 struct vnode_attr va;
5043 struct nameidata nd;
5044 int cmode;
5045
5046 AUDIT_ARG(owner, uap->uid, uap->gid);
5047
5048 xsecdst = NULL;
5049 if ((uap->xsecurity != USER_ADDR_NULL) &&
5050 ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)) {
5051 return ciferror;
5052 }
5053
5054 VATTR_INIT(&va);
5055 cmode = ((uap->mode & ~p->p_fd.fd_cmask) & ALLPERMS) & ~S_ISTXT;
5056 VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
5057 if (uap->uid != KAUTH_UID_NONE) {
5058 VATTR_SET(&va, va_uid, uap->uid);
5059 }
5060 if (uap->gid != KAUTH_GID_NONE) {
5061 VATTR_SET(&va, va_gid, uap->gid);
5062 }
5063 if (xsecdst != NULL) {
5064 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
5065 va.va_vaflags |= VA_FILESEC_ACL;
5066 }
5067
5068 NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
5069 uap->path, vfs_context_current());
5070
5071 ciferror = open1(vfs_context_current(), &nd, uap->flags, &va,
5072 NULL, NULL, retval, AUTH_OPEN_NOAUTHFD);
5073 if (xsecdst != NULL) {
5074 kauth_filesec_free(xsecdst);
5075 }
5076
5077 return ciferror;
5078 }
5079
5080 /*
5081 * Go through the data-protected atomically controlled open (2)
5082 *
5083 * int open_dprotected_np(user_addr_t path, int flags, int class, int dpflags, int mode)
5084 */
5085 static int
openat_dprotected_internal(vfs_context_t ctx,user_addr_t path,int flags,int mode,int class,int dpflags,int fd,int authfd,enum uio_seg segflg,int * retval)5086 openat_dprotected_internal(vfs_context_t ctx, user_addr_t path, int flags, int mode,
5087 int class, int dpflags, int fd, int authfd, enum uio_seg segflg, int *retval)
5088 {
5089 /*
5090 * Follow the same path as normal open(2)
5091 * Look up the item if it exists, and acquire the vnode.
5092 */
5093 struct vnode_attr va;
5094 struct nameidata nd;
5095 int cmode;
5096 int error;
5097 struct filedesc *fdp = &vfs_context_proc(ctx)->p_fd;
5098
5099 VATTR_INIT(&va);
5100 /* Mask off all but regular access permissions */
5101 cmode = ((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
5102 VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
5103
5104 NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, segflg,
5105 path, ctx);
5106
5107 /*
5108 * Initialize the extra fields in vnode_attr to pass down our
5109 * extra fields.
5110 * 1. target cprotect class.
5111 * 2. set a flag to mark it as requiring open-raw-encrypted semantics.
5112 */
5113 if (flags & O_CREAT) {
5114 /* lower level kernel code validates that the class is valid before applying it. */
5115 if (class != PROTECTION_CLASS_DEFAULT) {
5116 /*
5117 * PROTECTION_CLASS_DEFAULT implies that we make the class for this
5118 * file behave the same as open (2)
5119 */
5120 VATTR_SET(&va, va_dataprotect_class, class);
5121 }
5122 }
5123
5124 if (dpflags & (O_DP_GETRAWENCRYPTED | O_DP_GETRAWUNENCRYPTED | O_DP_AUTHENTICATE)) {
5125 if (flags & (O_RDWR | O_WRONLY)) {
5126 /*
5127 * Not allowed to write raw encrypted bytes or when opening authenticated.
5128 */
5129 return EINVAL;
5130 }
5131 if (dpflags & O_DP_GETRAWENCRYPTED) {
5132 VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWENCRYPTED);
5133 }
5134 if (dpflags & O_DP_GETRAWUNENCRYPTED) {
5135 VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWUNENCRYPTED);
5136 }
5137 if (dpflags & O_DP_AUTHENTICATE) {
5138 VATTR_SET(&va, va_dataprotect_flags, VA_DP_AUTHENTICATE);
5139 }
5140 }
5141
5142 error = open1at(vfs_context_current(), &nd, flags, &va,
5143 NULL, NULL, retval, fd, authfd);
5144
5145 return error;
5146 }
5147
5148 int
openat_dprotected_np(__unused proc_t p,struct openat_dprotected_np_args * uap,int32_t * retval)5149 openat_dprotected_np(__unused proc_t p, struct openat_dprotected_np_args *uap, int32_t *retval)
5150 {
5151 if ((uap->dpflags & O_DP_AUTHENTICATE) && (uap->flags & O_CREAT)) {
5152 return EINVAL;
5153 }
5154
5155 return openat_dprotected_internal(vfs_context_current(), uap->path, uap->flags, uap->mode,
5156 uap->class, uap->dpflags, uap->fd, uap->authfd, UIO_USERSPACE, retval);
5157 }
5158
5159 int
open_dprotected_np(__unused proc_t p,struct open_dprotected_np_args * uap,int32_t * retval)5160 open_dprotected_np(__unused proc_t p, struct open_dprotected_np_args *uap, int32_t *retval)
5161 {
5162 if (uap->dpflags & O_DP_AUTHENTICATE) {
5163 return EINVAL;
5164 }
5165
5166 return openat_dprotected_internal(vfs_context_current(), uap->path, uap->flags, uap->mode,
5167 uap->class, uap->dpflags, AT_FDCWD, AUTH_OPEN_NOAUTHFD, UIO_USERSPACE, retval);
5168 }
5169
5170 static int
openat_internal(vfs_context_t ctx,user_addr_t path,int flags,int mode,int fd,enum uio_seg segflg,int * retval)5171 openat_internal(vfs_context_t ctx, user_addr_t path, int flags, int mode,
5172 int fd, enum uio_seg segflg, int *retval)
5173 {
5174 struct filedesc *fdp = &vfs_context_proc(ctx)->p_fd;
5175 struct {
5176 struct vnode_attr va;
5177 struct nameidata nd;
5178 } *__open_data;
5179 struct vnode_attr *vap;
5180 struct nameidata *ndp;
5181 int cmode;
5182 int error;
5183
5184 __open_data = kalloc_type(typeof(*__open_data), Z_WAITOK);
5185 vap = &__open_data->va;
5186 ndp = &__open_data->nd;
5187
5188 VATTR_INIT(vap);
5189 /* Mask off all but regular access permissions */
5190 cmode = ((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
5191 VATTR_SET(vap, va_mode, cmode & ACCESSPERMS);
5192
5193 NDINIT(ndp, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1,
5194 segflg, path, ctx);
5195
5196 error = open1at(ctx, ndp, flags, vap, NULL, NULL, retval, fd, AUTH_OPEN_NOAUTHFD);
5197
5198 kfree_type(typeof(*__open_data), __open_data);
5199
5200 return error;
5201 }
5202
5203 int
open(proc_t p,struct open_args * uap,int32_t * retval)5204 open(proc_t p, struct open_args *uap, int32_t *retval)
5205 {
5206 __pthread_testcancel(1);
5207 return open_nocancel(p, (struct open_nocancel_args *)uap, retval);
5208 }
5209
5210 int
open_nocancel(__unused proc_t p,struct open_nocancel_args * uap,int32_t * retval)5211 open_nocancel(__unused proc_t p, struct open_nocancel_args *uap,
5212 int32_t *retval)
5213 {
5214 return openat_internal(vfs_context_current(), uap->path, uap->flags,
5215 uap->mode, AT_FDCWD, UIO_USERSPACE, retval);
5216 }
5217
5218 int
openat_nocancel(__unused proc_t p,struct openat_nocancel_args * uap,int32_t * retval)5219 openat_nocancel(__unused proc_t p, struct openat_nocancel_args *uap,
5220 int32_t *retval)
5221 {
5222 return openat_internal(vfs_context_current(), uap->path, uap->flags,
5223 uap->mode, uap->fd, UIO_USERSPACE, retval);
5224 }
5225
5226 int
openat(proc_t p,struct openat_args * uap,int32_t * retval)5227 openat(proc_t p, struct openat_args *uap, int32_t *retval)
5228 {
5229 __pthread_testcancel(1);
5230 return openat_nocancel(p, (struct openat_nocancel_args *)uap, retval);
5231 }
5232
5233 #define OPEN_BY_ID_ENTITLEMENT "com.apple.private.vfs.open-by-id"
5234
5235 static boolean_t
vfs_context_can_open_by_id(vfs_context_t ctx)5236 vfs_context_can_open_by_id(vfs_context_t ctx)
5237 {
5238 if (csproc_get_platform_binary(vfs_context_proc(ctx))) {
5239 return TRUE;
5240 }
5241
5242 return IOTaskHasEntitlement(vfs_context_task(ctx),
5243 OPEN_BY_ID_ENTITLEMENT);
5244 }
5245
5246 /*
5247 * openbyid_np: open a file given a file system id and a file system object id
5248 * the hfs file system object id is an fsobj_id_t {uint32, uint32}
5249 * file systems that don't support object ids it is a node id (uint64_t).
5250 *
5251 * Parameters: p Process requesting the open
5252 * uap User argument descriptor (see below)
5253 * retval Pointer to an area to receive the
5254 * return calue from the system call
5255 *
5256 * Indirect: uap->path Path to open (same as 'open')
5257 *
5258 * uap->fsid id of target file system
5259 * uap->objid id of target file system object
5260 * uap->flags Flags to open (same as 'open')
5261 *
5262 * Returns: 0 Success
5263 * !0 errno value
5264 *
5265 *
5266 * XXX: We should enummerate the possible errno values here, and where
5267 * in the code they originated.
5268 */
5269 int
openbyid_np(__unused proc_t p,struct openbyid_np_args * uap,int * retval)5270 openbyid_np(__unused proc_t p, struct openbyid_np_args *uap, int *retval)
5271 {
5272 fsid_t fsid;
5273 uint64_t objid;
5274 int error;
5275 char *buf = NULL;
5276 int buflen = MAXPATHLEN;
5277 int pathlen = 0;
5278 vfs_context_t ctx = vfs_context_current();
5279
5280 if (!vfs_context_can_open_by_id(ctx)) {
5281 return EPERM;
5282 }
5283
5284 if ((error = copyin(uap->fsid, (caddr_t)&fsid, sizeof(fsid)))) {
5285 return error;
5286 }
5287
5288 /*uap->obj is an fsobj_id_t defined as struct {uint32_t, uint32_t} */
5289 if ((error = copyin(uap->objid, (caddr_t)&objid, sizeof(uint64_t)))) {
5290 return error;
5291 }
5292
5293 AUDIT_ARG(value32, fsid.val[0]);
5294 AUDIT_ARG(value64, objid);
5295
5296 /*resolve path from fsis, objid*/
5297 do {
5298 buf = kalloc_data(buflen + 1, Z_WAITOK);
5299 if (buf == NULL) {
5300 return ENOMEM;
5301 }
5302
5303 error = fsgetpath_internal( ctx, fsid.val[0], objid, buflen,
5304 buf, FSOPT_ISREALFSID, &pathlen);
5305
5306 if (error) {
5307 kfree_data(buf, buflen + 1);
5308 buf = NULL;
5309 }
5310 } while (error == ENOSPC && (buflen += MAXPATHLEN));
5311
5312 if (error) {
5313 return error;
5314 }
5315
5316 buf[pathlen] = 0;
5317
5318 error = openat_internal(
5319 ctx, (user_addr_t)buf, uap->oflags, 0, AT_FDCWD, UIO_SYSSPACE, retval);
5320
5321 kfree_data(buf, buflen + 1);
5322
5323 return error;
5324 }
5325
5326
5327 /*
5328 * Create a special file.
5329 */
5330 static int mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap,
5331 int fd);
5332
/*
 * mknodat_internal: shared implementation for mknod(2)/mknodat(2).
 *
 * Creates a character or block special file at upath (relative to fd for
 * relative paths).  FIFO requests are diverted to mkfifo1().  Requires
 * superuser privileges.
 */
static int
mknodat_internal(proc_t p, user_addr_t upath, struct vnode_attr *vap,
    mode_t mode, int fd)
{
	vfs_context_t ctx = vfs_context_current();
	struct nameidata nd;
	vnode_t vp, dvp;
	int error;

	/* If it's a mknod() of a FIFO, call mkfifo1() instead */
	if ((mode & S_IFMT) == S_IFIFO) {
		return mkfifo1(ctx, upath, vap, fd);
	}

	AUDIT_ARG(mode, mode);
	AUDIT_ARG(value32, vap->va_rdev);

	/* Creating device nodes requires superuser privileges. */
	if ((error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
		return error;
	}
	NDINIT(&nd, CREATE, OP_MKNOD, LOCKPARENT | AUDITVNPATH1,
	    UIO_USERSPACE, upath, ctx);
	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* A non-NULL vp means the target name already exists. */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}

	/* Only character and block specials are valid here (FIFOs handled above). */
	switch (mode & S_IFMT) {
	case S_IFCHR:
		VATTR_SET(vap, va_type, VCHR);
		break;
	case S_IFBLK:
		VATTR_SET(vap, va_type, VBLK);
		break;
	default:
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_create(ctx,
	    nd.ni_dvp, &nd.ni_cnd, vap);
	if (error) {
		goto out;
	}
#endif

	if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
		goto out;
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

	if ((error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx)) != 0) {
		goto out;
	}

	if (vp) {
		int update_flags = 0;

		// Make sure the name & parent pointers are hooked up
		if (vp->v_name == NULL) {
			update_flags |= VNODE_UPDATE_NAME;
		}
		if (vp->v_parent == NULLVP) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		if (update_flags) {
			vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
		}

#if CONFIG_FSE
		add_fsevent(FSE_CREATE_FILE, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
#endif
	}

out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	vnode_put(dvp);

	return error;
}
5435
5436 int
mknod(proc_t p,struct mknod_args * uap,__unused int32_t * retval)5437 mknod(proc_t p, struct mknod_args *uap, __unused int32_t *retval)
5438 {
5439 struct vnode_attr va;
5440
5441 VATTR_INIT(&va);
5442 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5443 VATTR_SET(&va, va_rdev, uap->dev);
5444
5445 return mknodat_internal(p, uap->path, &va, (mode_t)uap->mode, AT_FDCWD);
5446 }
5447
5448 int
mknodat(proc_t p,struct mknodat_args * uap,__unused int32_t * retval)5449 mknodat(proc_t p, struct mknodat_args *uap, __unused int32_t *retval)
5450 {
5451 struct vnode_attr va;
5452
5453 VATTR_INIT(&va);
5454 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5455 VATTR_SET(&va, va_rdev, uap->dev);
5456
5457 return mknodat_internal(p, uap->path, &va, (mode_t)uap->mode, uap->fd);
5458 }
5459
5460 /*
5461 * Create a named pipe.
5462 *
5463 * Returns: 0 Success
5464 * EEXIST
5465 * namei:???
5466 * vnode_authorize:???
5467 * vn_create:???
5468 */
5469 static int
mkfifo1(vfs_context_t ctx,user_addr_t upath,struct vnode_attr * vap,int fd)5470 mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap, int fd)
5471 {
5472 vnode_t vp, dvp;
5473 int error;
5474 struct nameidata nd;
5475
5476 NDINIT(&nd, CREATE, OP_MKFIFO, LOCKPARENT | AUDITVNPATH1,
5477 UIO_USERSPACE, upath, ctx);
5478 error = nameiat(&nd, fd);
5479 if (error) {
5480 return error;
5481 }
5482 dvp = nd.ni_dvp;
5483 vp = nd.ni_vp;
5484
5485 /* check that this is a new file and authorize addition */
5486 if (vp != NULL) {
5487 error = EEXIST;
5488 goto out;
5489 }
5490 VATTR_SET(vap, va_type, VFIFO);
5491
5492 if ((error = vn_authorize_create(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0) {
5493 goto out;
5494 }
5495
5496 error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx);
5497 out:
5498 /*
5499 * nameidone has to happen before we vnode_put(dvp)
5500 * since it may need to release the fs_nodelock on the dvp
5501 */
5502 nameidone(&nd);
5503
5504 if (vp) {
5505 vnode_put(vp);
5506 }
5507 vnode_put(dvp);
5508
5509 return error;
5510 }
5511
5512
5513 /*
5514 * mkfifo_extended: Create a named pipe; with extended argument list (including extended security (ACL)).
5515 *
5516 * Parameters: p Process requesting the open
5517 * uap User argument descriptor (see below)
5518 * retval (Ignored)
5519 *
5520 * Indirect: uap->path Path to fifo (same as 'mkfifo')
5521 * uap->uid UID to set
5522 * uap->gid GID to set
5523 * uap->mode File mode to set (same as 'mkfifo')
5524 * uap->xsecurity ACL to set, if creating
5525 *
5526 * Returns: 0 Success
5527 * !0 errno value
5528 *
5529 * Notes: The kauth_filesec_t in 'va', if any, is in host byte order.
5530 *
5531 * XXX: We should enummerate the possible errno values here, and where
5532 * in the code they originated.
5533 */
5534 int
mkfifo_extended(proc_t p,struct mkfifo_extended_args * uap,__unused int32_t * retval)5535 mkfifo_extended(proc_t p, struct mkfifo_extended_args *uap, __unused int32_t *retval)
5536 {
5537 int ciferror;
5538 kauth_filesec_t xsecdst;
5539 struct vnode_attr va;
5540
5541 AUDIT_ARG(owner, uap->uid, uap->gid);
5542
5543 xsecdst = KAUTH_FILESEC_NONE;
5544 if (uap->xsecurity != USER_ADDR_NULL) {
5545 if ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0) {
5546 return ciferror;
5547 }
5548 }
5549
5550 VATTR_INIT(&va);
5551 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5552 if (uap->uid != KAUTH_UID_NONE) {
5553 VATTR_SET(&va, va_uid, uap->uid);
5554 }
5555 if (uap->gid != KAUTH_GID_NONE) {
5556 VATTR_SET(&va, va_gid, uap->gid);
5557 }
5558 if (xsecdst != KAUTH_FILESEC_NONE) {
5559 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
5560 va.va_vaflags |= VA_FILESEC_ACL;
5561 }
5562
5563 ciferror = mkfifo1(vfs_context_current(), uap->path, &va, AT_FDCWD);
5564
5565 if (xsecdst != KAUTH_FILESEC_NONE) {
5566 kauth_filesec_free(xsecdst);
5567 }
5568 return ciferror;
5569 }
5570
5571 /* ARGSUSED */
5572 int
mkfifo(proc_t p,struct mkfifo_args * uap,__unused int32_t * retval)5573 mkfifo(proc_t p, struct mkfifo_args *uap, __unused int32_t *retval)
5574 {
5575 struct vnode_attr va;
5576
5577 VATTR_INIT(&va);
5578 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5579
5580 return mkfifo1(vfs_context_current(), uap->path, &va, AT_FDCWD);
5581 }
5582
5583 int
mkfifoat(proc_t p,struct mkfifoat_args * uap,__unused int32_t * retval)5584 mkfifoat(proc_t p, struct mkfifoat_args *uap, __unused int32_t *retval)
5585 {
5586 struct vnode_attr va;
5587
5588 VATTR_INIT(&va);
5589 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5590
5591 return mkfifo1(vfs_context_current(), uap->path, &va, uap->fd);
5592 }
5593
5594 extern int safe_getpath_new(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path, int firmlink);
5595 extern int safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path);
5596 extern int safe_getpath_no_firmlink(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path);
5597
/*
 * safe_getpath_new: build a path string for 'dvp' (optionally with
 * 'leafname' appended) into 'path' (capacity '_len'), never failing
 * outright: on lookup failure it walks toward the root and falls back
 * to the mount point or "/".
 *
 * 'firmlink' selects vn_getpath() (firmlinks resolved) vs
 * vn_getpath_no_firmlink().  '*truncated_path' is set to 1 whenever the
 * returned string is not the complete intended path.
 *
 * Returns the length of the string placed in 'path' INCLUDING the
 * terminating NUL (note: callers appear to rely on this convention).
 */
int
safe_getpath_new(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path, int firmlink)
{
	int ret, len = _len;

	*truncated_path = 0;

	if (firmlink) {
		ret = vn_getpath(dvp, path, &len);
	} else {
		ret = vn_getpath_no_firmlink(dvp, path, &len);
	}
	if (ret == 0 && len < (MAXPATHLEN - 1)) {
		if (leafname) {
			/* overwrite the NUL with '/' and append the leaf */
			path[len - 1] = '/';
			len += strlcpy(&path[len], leafname, MAXPATHLEN - len) + 1;
			if (len > MAXPATHLEN) {
				char *ptr;

				// the string got truncated!
				*truncated_path = 1;
				ptr = strrchr(path, '/');
				if (ptr) {
					*ptr = '\0'; // chop off the string at the last directory component
				}
				len = (int)strlen(path) + 1;
			}
		}
	} else if (ret == 0) {
		/* path fit but there is no room to append a leaf: report truncation */
		*truncated_path = 1;
	} else if (ret != 0) {
		struct vnode *mydvp = dvp;

		if (ret != ENOSPC) {
			printf("safe_getpath: failed to get the path for vp %p (%s) : err %d\n",
			    dvp, dvp->v_name ? dvp->v_name : "no-name", ret);
		}
		*truncated_path = 1;

		/*
		 * Climb v_parent links retrying the lookup (presumably ENOSPC
		 * means the buffer was too small for the full path — shorter
		 * ancestor paths may fit; TODO confirm vn_getpath semantics).
		 */
		do {
			if (mydvp->v_parent != NULL) {
				mydvp = mydvp->v_parent;
			} else if (mydvp->v_mount) {
				strlcpy(path, mydvp->v_mount->mnt_vfsstat.f_mntonname, _len);
				break;
			} else {
				// no parent and no mount point?  only thing is to punt and say "/" changed
				strlcpy(path, "/", _len);
				len = 2;
				mydvp = NULL;
			}

			if (mydvp == NULL) {
				break;
			}

			len = _len;
			if (firmlink) {
				ret = vn_getpath(mydvp, path, &len);
			} else {
				ret = vn_getpath_no_firmlink(mydvp, path, &len);
			}
		} while (ret == ENOSPC);
	}

	return len;
}
5665
/*
 * safe_getpath: firmlink-resolving flavor of safe_getpath_new().
 */
int
safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path)
{
	const int resolve_firmlinks = 1;

	return safe_getpath_new(dvp, leafname, path, _len, truncated_path,
	           resolve_firmlinks);
}
5671
/*
 * safe_getpath_no_firmlink: flavor of safe_getpath_new() that does not
 * resolve firmlinks when building the path.
 */
int
safe_getpath_no_firmlink(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path)
{
	const int resolve_firmlinks = 0;

	return safe_getpath_new(dvp, leafname, path, _len, truncated_path,
	           resolve_firmlinks);
}
5677
5678 /*
5679 * Make a hard file link.
5680 *
5681 * Returns: 0 Success
5682 * EPERM
5683 * EEXIST
5684 * EXDEV
5685 * namei:???
5686 * vnode_authorize:???
5687 * VNOP_LINK:???
5688 */
5689 /* ARGSUSED */
5690 static int
linkat_internal(vfs_context_t ctx,int fd1,user_addr_t path,int fd2,user_addr_t link,int flag,enum uio_seg segflg)5691 linkat_internal(vfs_context_t ctx, int fd1, user_addr_t path, int fd2,
5692 user_addr_t link, int flag, enum uio_seg segflg)
5693 {
5694 vnode_t vp, pvp, dvp, lvp;
5695 struct nameidata nd;
5696 int follow;
5697 int error;
5698 #if CONFIG_FSE
5699 fse_info finfo;
5700 #endif
5701 char *target_path = NULL;
5702 char *no_firmlink_path = NULL;
5703 vnode_t locked_vp = NULLVP;
5704 int truncated = 0;
5705 int truncated_no_firmlink_path = 0;
5706 int num_retries = 0;
5707 int need_event, has_listeners, need_kpath2;
5708 bool do_retry;
5709
5710 /* look up the object we are linking to */
5711 follow = (flag & AT_SYMLINK_FOLLOW) ? FOLLOW : NOFOLLOW;
5712
5713 retry:
5714 do_retry = false;
5715 vp = dvp = lvp = NULLVP;
5716 NDINIT(&nd, LOOKUP, OP_LOOKUP, AUDITVNPATH1 | follow,
5717 segflg, path, ctx);
5718
5719 error = nameiat(&nd, fd1);
5720 if (error) {
5721 return error;
5722 }
5723 vp = nd.ni_vp;
5724
5725 nameidone(&nd);
5726
5727 /*
5728 * Normally, linking to directories is not supported.
5729 * However, some file systems may have limited support.
5730 */
5731 if (vp->v_type == VDIR) {
5732 if (!ISSET(vp->v_mount->mnt_kern_flag, MNTK_DIR_HARDLINKS)) {
5733 error = EPERM; /* POSIX */
5734 goto out;
5735 }
5736
5737 /* Linking to a directory requires ownership. */
5738 if (!kauth_cred_issuser(vfs_context_ucred(ctx))) {
5739 struct vnode_attr dva;
5740
5741 VATTR_INIT(&dva);
5742 VATTR_WANTED(&dva, va_uid);
5743 if (vnode_getattr(vp, &dva, ctx) != 0 ||
5744 !VATTR_IS_SUPPORTED(&dva, va_uid) ||
5745 (dva.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)))) {
5746 error = EACCES;
5747 goto out;
5748 }
5749 }
5750 }
5751
5752 /* lookup the target node */
5753 #if CONFIG_TRIGGERS
5754 nd.ni_op = OP_LINK;
5755 #endif
5756 nd.ni_cnd.cn_nameiop = CREATE;
5757 nd.ni_cnd.cn_flags = LOCKPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK;
5758 nd.ni_dirp = link;
5759 error = nameiat(&nd, fd2);
5760 if (error != 0) {
5761 goto out;
5762 }
5763 dvp = nd.ni_dvp;
5764 lvp = nd.ni_vp;
5765
5766 assert(locked_vp == NULLVP);
5767 vnode_link_lock(vp);
5768 locked_vp = vp;
5769
5770 #if CONFIG_MACF
5771 if ((error = mac_vnode_check_link(ctx, dvp, vp, &nd.ni_cnd)) != 0) {
5772 goto out2;
5773 }
5774 #endif
5775
5776 /* or to anything that kauth doesn't want us to (eg. immutable items) */
5777 if ((error = vnode_authorize(vp, NULL, KAUTH_VNODE_LINKTARGET, ctx)) != 0) {
5778 goto out2;
5779 }
5780
5781 /* target node must not exist */
5782 if (lvp != NULLVP) {
5783 error = EEXIST;
5784 goto out2;
5785 }
5786 /* cannot link across mountpoints */
5787 if (vnode_mount(vp) != vnode_mount(dvp)) {
5788 error = EXDEV;
5789 goto out2;
5790 }
5791
5792 /* authorize creation of the target note */
5793 if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
5794 goto out2;
5795 }
5796
5797 #if CONFIG_FILE_LEASES
5798 vnode_breakdirlease(dvp, false, O_WRONLY);
5799 #endif
5800
5801 /* and finally make the link */
5802 error = VNOP_LINK(vp, dvp, &nd.ni_cnd, ctx);
5803 if (error) {
5804 if (error == ENOENT && num_retries < MAX_LINK_ENOENT_RETRIES) {
5805 do_retry = true;
5806 }
5807 goto out2;
5808 }
5809
5810 #if CONFIG_MACF
5811 (void)mac_vnode_notify_link(ctx, vp, dvp, &nd.ni_cnd);
5812 #endif
5813
5814 assert(locked_vp == vp);
5815 vnode_link_unlock(locked_vp);
5816 locked_vp = NULLVP;
5817
5818 #if CONFIG_FSE
5819 need_event = need_fsevent(FSE_CREATE_FILE, dvp);
5820 #else
5821 need_event = 0;
5822 #endif
5823 has_listeners = kauth_authorize_fileop_has_listeners();
5824
5825 need_kpath2 = 0;
5826 #if CONFIG_AUDIT
5827 if (AUDIT_RECORD_EXISTS()) {
5828 need_kpath2 = 1;
5829 }
5830 #endif
5831
5832 if (need_event || has_listeners || need_kpath2) {
5833 char *link_to_path = NULL;
5834 int len, link_name_len;
5835 int len_no_firmlink_path = 0;
5836
5837 /* build the path to the new link file */
5838 GET_PATH(target_path);
5839
5840 len = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, target_path, MAXPATHLEN, &truncated);
5841 if (no_firmlink_path == NULL) {
5842 GET_PATH(no_firmlink_path);
5843 }
5844 len_no_firmlink_path = safe_getpath_no_firmlink(dvp, nd.ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);
5845
5846 AUDIT_ARG(kpath, target_path, ARG_KPATH2);
5847
5848 if (has_listeners) {
5849 /* build the path to file we are linking to */
5850 GET_PATH(link_to_path);
5851
5852 link_name_len = MAXPATHLEN;
5853 if (vn_getpath(vp, link_to_path, &link_name_len) == 0) {
5854 /*
5855 * Call out to allow 3rd party notification of rename.
5856 * Ignore result of kauth_authorize_fileop call.
5857 */
5858 kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_LINK,
5859 (uintptr_t)link_to_path,
5860 (uintptr_t)target_path);
5861 }
5862 if (link_to_path != NULL) {
5863 RELEASE_PATH(link_to_path);
5864 }
5865 }
5866 #if CONFIG_FSE
5867 if (need_event) {
5868 /* construct fsevent */
5869 if (get_fse_info(vp, &finfo, ctx) == 0) {
5870 if (truncated_no_firmlink_path) {
5871 finfo.mode |= FSE_TRUNCATED_PATH;
5872 }
5873
5874 // build the path to the destination of the link
5875 add_fsevent(FSE_CREATE_FILE, ctx,
5876 FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
5877 FSE_ARG_FINFO, &finfo,
5878 FSE_ARG_DONE);
5879 }
5880
5881 pvp = vp->v_parent;
5882 // need an iocount on parent vnode in this case
5883 if (pvp && pvp != dvp) {
5884 pvp = vnode_getparent_if_different(vp, dvp);
5885 }
5886 if (pvp) {
5887 add_fsevent(FSE_STAT_CHANGED, ctx,
5888 FSE_ARG_VNODE, pvp, FSE_ARG_DONE);
5889 }
5890 if (pvp && pvp != dvp) {
5891 vnode_put(pvp);
5892 }
5893 }
5894 #endif
5895 }
5896 out2:
5897 /*
5898 * nameidone has to happen before we vnode_put(dvp)
5899 * since it may need to release the fs_nodelock on the dvp
5900 */
5901 nameidone(&nd);
5902 if (target_path != NULL) {
5903 RELEASE_PATH(target_path);
5904 target_path = NULL;
5905 }
5906 if (no_firmlink_path != NULL) {
5907 RELEASE_PATH(no_firmlink_path);
5908 no_firmlink_path = NULL;
5909 }
5910 out:
5911 if (locked_vp) {
5912 assert(locked_vp == vp);
5913 vnode_link_unlock(locked_vp);
5914 locked_vp = NULLVP;
5915 }
5916 if (lvp) {
5917 vnode_put(lvp);
5918 }
5919 if (dvp) {
5920 vnode_put(dvp);
5921 }
5922 vnode_put(vp);
5923
5924 if (do_retry) {
5925 goto retry;
5926 }
5927
5928 return error;
5929 }
5930
5931 int
link(__unused proc_t p,struct link_args * uap,__unused int32_t * retval)5932 link(__unused proc_t p, struct link_args *uap, __unused int32_t *retval)
5933 {
5934 return linkat_internal(vfs_context_current(), AT_FDCWD, uap->path,
5935 AT_FDCWD, uap->link, AT_SYMLINK_FOLLOW, UIO_USERSPACE);
5936 }
5937
5938 int
linkat(__unused proc_t p,struct linkat_args * uap,__unused int32_t * retval)5939 linkat(__unused proc_t p, struct linkat_args *uap, __unused int32_t *retval)
5940 {
5941 if (uap->flag & ~AT_SYMLINK_FOLLOW) {
5942 return EINVAL;
5943 }
5944
5945 return linkat_internal(vfs_context_current(), uap->fd1, uap->path,
5946 uap->fd2, uap->link, uap->flag, UIO_USERSPACE);
5947 }
5948
5949 /*
5950 * Make a symbolic link.
5951 *
5952 * We could add support for ACLs here too...
5953 */
5954 /* ARGSUSED */
5955 static int
symlinkat_internal(vfs_context_t ctx,user_addr_t path_data,int fd,user_addr_t link,enum uio_seg segflg)5956 symlinkat_internal(vfs_context_t ctx, user_addr_t path_data, int fd,
5957 user_addr_t link, enum uio_seg segflg)
5958 {
5959 struct vnode_attr va;
5960 char *path;
5961 int error;
5962 struct nameidata nd;
5963 vnode_t vp, dvp;
5964 size_t dummy = 0;
5965 proc_t p;
5966
5967 error = 0;
5968 if (UIO_SEG_IS_USER_SPACE(segflg)) {
5969 path = zalloc(ZV_NAMEI);
5970 error = copyinstr(path_data, path, MAXPATHLEN, &dummy);
5971 } else {
5972 path = (char *)path_data;
5973 }
5974 if (error) {
5975 goto out;
5976 }
5977 AUDIT_ARG(text, path); /* This is the link string */
5978
5979 NDINIT(&nd, CREATE, OP_SYMLINK, LOCKPARENT | AUDITVNPATH1,
5980 segflg, link, ctx);
5981
5982 error = nameiat(&nd, fd);
5983 if (error) {
5984 goto out;
5985 }
5986 dvp = nd.ni_dvp;
5987 vp = nd.ni_vp;
5988
5989 p = vfs_context_proc(ctx);
5990 VATTR_INIT(&va);
5991 VATTR_SET(&va, va_type, VLNK);
5992 VATTR_SET(&va, va_mode, ACCESSPERMS & ~p->p_fd.fd_cmask);
5993
5994 #if CONFIG_MACF
5995 error = mac_vnode_check_create(ctx,
5996 dvp, &nd.ni_cnd, &va);
5997 #endif
5998 if (error != 0) {
5999 goto skipit;
6000 }
6001
6002 if (vp != NULL) {
6003 error = EEXIST;
6004 goto skipit;
6005 }
6006
6007 /* authorize */
6008 if (error == 0) {
6009 error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx);
6010 }
6011 /* get default ownership, etc. */
6012 if (error == 0) {
6013 error = vnode_authattr_new(dvp, &va, 0, ctx);
6014 }
6015
6016 #if CONFIG_FILE_LEASES
6017 vnode_breakdirlease(dvp, false, O_WRONLY);
6018 #endif
6019
6020 if (error == 0) {
6021 error = VNOP_SYMLINK(dvp, &vp, &nd.ni_cnd, &va, path, ctx);
6022 }
6023
6024 /* do fallback attribute handling */
6025 if (error == 0 && vp) {
6026 error = vnode_setattr_fallback(vp, &va, ctx);
6027 }
6028
6029 #if CONFIG_MACF
6030 if (error == 0 && vp) {
6031 error = vnode_label(vnode_mount(vp), dvp, vp, &nd.ni_cnd, VNODE_LABEL_CREATE, ctx);
6032 }
6033 #endif
6034
6035 if (error == 0) {
6036 int update_flags = 0;
6037
6038 /*check if a new vnode was created, else try to get one*/
6039 if (vp == NULL) {
6040 nd.ni_cnd.cn_nameiop = LOOKUP;
6041 #if CONFIG_TRIGGERS
6042 nd.ni_op = OP_LOOKUP;
6043 #endif
6044 /*
6045 * Clear all flags except HASBUF to prevent 'cn_pnbuf' buffer to be
6046 * reallocated again in namei().
6047 */
6048 nd.ni_cnd.cn_flags &= HASBUF;
6049 error = nameiat(&nd, fd);
6050 if (error) {
6051 goto skipit;
6052 }
6053 vp = nd.ni_vp;
6054 }
6055
6056 #if 0 /* XXX - kauth_todo - is KAUTH_FILEOP_SYMLINK needed? */
6057 /* call out to allow 3rd party notification of rename.
6058 * Ignore result of kauth_authorize_fileop call.
6059 */
6060 if (kauth_authorize_fileop_has_listeners() &&
6061 namei(&nd) == 0) {
6062 char *new_link_path = NULL;
6063 int len;
6064
6065 /* build the path to the new link file */
6066 new_link_path = get_pathbuff();
6067 len = MAXPATHLEN;
6068 vn_getpath(dvp, new_link_path, &len);
6069 if ((len + 1 + nd.ni_cnd.cn_namelen + 1) < MAXPATHLEN) {
6070 new_link_path[len - 1] = '/';
6071 strlcpy(&new_link_path[len], nd.ni_cnd.cn_nameptr, MAXPATHLEN - len);
6072 }
6073
6074 kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_SYMLINK,
6075 (uintptr_t)path, (uintptr_t)new_link_path);
6076 if (new_link_path != NULL) {
6077 release_pathbuff(new_link_path);
6078 }
6079 }
6080 #endif
6081 // Make sure the name & parent pointers are hooked up
6082 if (vp->v_name == NULL) {
6083 update_flags |= VNODE_UPDATE_NAME;
6084 }
6085 if (vp->v_parent == NULLVP) {
6086 update_flags |= VNODE_UPDATE_PARENT;
6087 }
6088
6089 if (update_flags) {
6090 vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
6091 }
6092
6093 #if CONFIG_FSE
6094 add_fsevent(FSE_CREATE_FILE, ctx,
6095 FSE_ARG_VNODE, vp,
6096 FSE_ARG_DONE);
6097 #endif
6098 }
6099
6100 skipit:
6101 /*
6102 * nameidone has to happen before we vnode_put(dvp)
6103 * since it may need to release the fs_nodelock on the dvp
6104 */
6105 nameidone(&nd);
6106
6107 if (vp) {
6108 vnode_put(vp);
6109 }
6110 vnode_put(dvp);
6111 out:
6112 if (path && (path != (char *)path_data)) {
6113 zfree(ZV_NAMEI, path);
6114 }
6115
6116 return error;
6117 }
6118
6119 int
symlink(__unused proc_t p,struct symlink_args * uap,__unused int32_t * retval)6120 symlink(__unused proc_t p, struct symlink_args *uap, __unused int32_t *retval)
6121 {
6122 return symlinkat_internal(vfs_context_current(), uap->path, AT_FDCWD,
6123 uap->link, UIO_USERSPACE);
6124 }
6125
6126 int
symlinkat(__unused proc_t p,struct symlinkat_args * uap,__unused int32_t * retval)6127 symlinkat(__unused proc_t p, struct symlinkat_args *uap,
6128 __unused int32_t *retval)
6129 {
6130 return symlinkat_internal(vfs_context_current(), uap->path1, uap->fd,
6131 uap->path2, UIO_USERSPACE);
6132 }
6133
6134 /*
6135 * Delete a whiteout from the filesystem.
6136 * No longer supported.
6137 */
6138 int
undelete(__unused proc_t p,__unused struct undelete_args * uap,__unused int32_t * retval)6139 undelete(__unused proc_t p, __unused struct undelete_args *uap, __unused int32_t *retval)
6140 {
6141 return ENOTSUP;
6142 }
6143
6144 /*
6145 * Delete a name from the filesystem.
6146 */
6147 /* ARGSUSED */
6148 static int
unlinkat_internal(vfs_context_t ctx,int fd,vnode_t start_dvp,user_addr_t path_arg,enum uio_seg segflg,int unlink_flags)6149 unlinkat_internal(vfs_context_t ctx, int fd, vnode_t start_dvp,
6150 user_addr_t path_arg, enum uio_seg segflg, int unlink_flags)
6151 {
6152 struct {
6153 struct nameidata nd;
6154 #if CONFIG_FSE
6155 struct vnode_attr va;
6156 fse_info finfo;
6157 #endif
6158 } *__unlink_data;
6159 struct nameidata *ndp;
6160 vnode_t vp, dvp;
6161 int error;
6162 struct componentname *cnp;
6163 char *path = NULL;
6164 char *no_firmlink_path = NULL;
6165 int len_path = 0;
6166 int len_no_firmlink_path = 0;
6167 int flags;
6168 int need_event;
6169 int has_listeners;
6170 int truncated_path;
6171 int truncated_no_firmlink_path;
6172 int batched;
6173 struct vnode_attr *vap;
6174 vnode_t locked_vp = NULLVP;
6175 int do_retry;
6176 int retry_count = 0;
6177 int cn_flags;
6178 int nofollow_any = 0;
6179
6180 cn_flags = LOCKPARENT;
6181 if (!(unlink_flags & VNODE_REMOVE_NO_AUDIT_PATH)) {
6182 cn_flags |= AUDITVNPATH1;
6183 }
6184 if (unlink_flags & VNODE_REMOVE_NOFOLLOW_ANY) {
6185 nofollow_any = NAMEI_NOFOLLOW_ANY;
6186 unlink_flags &= ~VNODE_REMOVE_NOFOLLOW_ANY;
6187 }
6188 /* If a starting dvp is passed, it trumps any fd passed. */
6189 if (start_dvp) {
6190 cn_flags |= USEDVP;
6191 }
6192
6193 #if NAMEDRSRCFORK
6194 /* unlink or delete is allowed on rsrc forks and named streams */
6195 cn_flags |= CN_ALLOWRSRCFORK;
6196 #endif
6197
6198 __unlink_data = kalloc_type(typeof(*__unlink_data), Z_WAITOK);
6199 ndp = &__unlink_data->nd;
6200 #if CONFIG_FSE
6201 fse_info *finfop = &__unlink_data->finfo;
6202 #endif
6203
6204 retry:
6205 do_retry = 0;
6206 flags = 0;
6207 need_event = 0;
6208 has_listeners = 0;
6209 truncated_path = 0;
6210 truncated_no_firmlink_path = 0;
6211 vap = NULL;
6212
6213 NDINIT(ndp, DELETE, OP_UNLINK, cn_flags, segflg, path_arg, ctx);
6214
6215 ndp->ni_dvp = start_dvp;
6216 ndp->ni_flag |= NAMEI_COMPOUNDREMOVE | nofollow_any;
6217 cnp = &ndp->ni_cnd;
6218
6219 continue_lookup:
6220 error = nameiat(ndp, fd);
6221 if (error) {
6222 goto early_out;
6223 }
6224
6225 dvp = ndp->ni_dvp;
6226 vp = ndp->ni_vp;
6227
6228 /* With Carbon delete semantics, busy files cannot be deleted */
6229 if (unlink_flags & VNODE_REMOVE_NODELETEBUSY) {
6230 flags |= VNODE_REMOVE_NODELETEBUSY;
6231 }
6232
6233 /* Skip any potential upcalls if told to. */
6234 if (unlink_flags & VNODE_REMOVE_SKIP_NAMESPACE_EVENT) {
6235 flags |= VNODE_REMOVE_SKIP_NAMESPACE_EVENT;
6236 }
6237
6238 /* Update speculative telemetry with system discarded use state */
6239 if (unlink_flags & VNODE_REMOVE_SYSTEM_DISCARDED) {
6240 flags |= VNODE_REMOVE_SYSTEM_DISCARDED;
6241 }
6242
6243 if (vp) {
6244 batched = vnode_compound_remove_available(vp);
6245 /*
6246 * The root of a mounted filesystem cannot be deleted.
6247 */
6248 if ((vp->v_flag & VROOT) || (dvp->v_mount != vp->v_mount)) {
6249 error = EBUSY;
6250 goto out;
6251 }
6252
6253 #if DEVELOPMENT || DEBUG
6254 /*
6255 * XXX VSWAP: Check for entitlements or special flag here
6256 * so we can restrict access appropriately.
6257 */
6258 #else /* DEVELOPMENT || DEBUG */
6259
6260 if (vnode_isswap(vp) && (ctx != vfs_context_kernel())) {
6261 error = EPERM;
6262 goto out;
6263 }
6264 #endif /* DEVELOPMENT || DEBUG */
6265
6266 if (!batched) {
6267 vnode_link_lock(vp);
6268 locked_vp = vp;
6269 error = vn_authorize_unlink(dvp, vp, cnp, ctx, NULL);
6270 if (error) {
6271 if (error == ENOENT) {
6272 if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
6273 do_retry = 1;
6274 retry_count++;
6275 }
6276 }
6277 vnode_link_unlock(vp);
6278 locked_vp = NULLVP;
6279 goto out;
6280 }
6281 }
6282 } else {
6283 batched = 1;
6284
6285 if (!vnode_compound_remove_available(dvp)) {
6286 panic("No vp, but no compound remove?");
6287 }
6288 }
6289
6290 #if CONFIG_FSE
6291 need_event = need_fsevent(FSE_DELETE, dvp);
6292 if (need_event) {
6293 if (!batched) {
6294 if ((vp->v_flag & VISHARDLINK) == 0) {
6295 /* XXX need to get these data in batched VNOP */
6296 get_fse_info(vp, finfop, ctx);
6297 }
6298 } else {
6299 error =
6300 vfs_get_notify_attributes(&__unlink_data->va);
6301 if (error) {
6302 goto out;
6303 }
6304
6305 vap = &__unlink_data->va;
6306 }
6307 }
6308 #endif
6309 has_listeners = kauth_authorize_fileop_has_listeners();
6310 if (need_event || has_listeners) {
6311 if (path == NULL) {
6312 GET_PATH(path);
6313 }
6314 len_path = safe_getpath(dvp, ndp->ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path);
6315 if (no_firmlink_path == NULL) {
6316 GET_PATH(no_firmlink_path);
6317 }
6318 len_no_firmlink_path = safe_getpath_no_firmlink(dvp, ndp->ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);
6319 }
6320
6321 #if NAMEDRSRCFORK
6322 if (ndp->ni_cnd.cn_flags & CN_WANTSRSRCFORK) {
6323 error = vnode_removenamedstream(dvp, vp, XATTR_RESOURCEFORK_NAME, 0, ctx);
6324 } else
6325 #endif
6326 {
6327 #if CONFIG_FILE_LEASES
6328 vnode_breakdirlease(dvp, false, O_WRONLY);
6329 #endif
6330
6331 error = vn_remove(dvp, &ndp->ni_vp, ndp, flags, vap, ctx);
6332 vp = ndp->ni_vp;
6333 if (error == EKEEPLOOKING) {
6334 if (!batched) {
6335 panic("EKEEPLOOKING, but not a filesystem that supports compound VNOPs?");
6336 }
6337
6338 if ((ndp->ni_flag & NAMEI_CONTLOOKUP) == 0) {
6339 panic("EKEEPLOOKING, but continue flag not set?");
6340 }
6341
6342 if (vnode_isdir(vp)) {
6343 error = EISDIR;
6344 goto out;
6345 }
6346 goto continue_lookup;
6347 } else if (error == ENOENT && batched) {
6348 if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
6349 /*
6350 * For compound VNOPs, the authorization callback may
6351 * return ENOENT in case of racing hardlink lookups
6352 * hitting the name cache, redrive the lookup.
6353 */
6354 do_retry = 1;
6355 retry_count += 1;
6356 goto out;
6357 }
6358 }
6359 }
6360
6361 /*
6362 * Call out to allow 3rd party notification of delete.
6363 * Ignore result of kauth_authorize_fileop call.
6364 */
6365 if (!error) {
6366 if (has_listeners) {
6367 kauth_authorize_fileop(vfs_context_ucred(ctx),
6368 KAUTH_FILEOP_DELETE,
6369 (uintptr_t)vp,
6370 (uintptr_t)path);
6371 }
6372
6373 if (vp->v_flag & VISHARDLINK) {
6374 //
6375 // if a hardlink gets deleted we want to blow away the
6376 // v_parent link because the path that got us to this
6377 // instance of the link is no longer valid. this will
6378 // force the next call to get the path to ask the file
6379 // system instead of just following the v_parent link.
6380 //
6381 vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
6382 }
6383
6384 #if CONFIG_FSE
6385 if (need_event) {
6386 if (vp->v_flag & VISHARDLINK) {
6387 get_fse_info(vp, finfop, ctx);
6388 } else if (vap) {
6389 vnode_get_fse_info_from_vap(vp, finfop, vap);
6390 }
6391 if (truncated_path) {
6392 finfop->mode |= FSE_TRUNCATED_PATH;
6393 }
6394 add_fsevent(FSE_DELETE, ctx,
6395 FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
6396 FSE_ARG_FINFO, finfop,
6397 FSE_ARG_DONE);
6398 }
6399 #endif
6400
6401 #if CONFIG_MACF
6402 mac_vnode_notify_unlink(ctx, dvp, vp, cnp);
6403 #endif
6404 }
6405
6406 out:
6407 if (locked_vp) {
6408 assert(locked_vp == vp);
6409 vnode_link_unlock(locked_vp);
6410 locked_vp = NULLVP;
6411 }
6412
6413 if (path != NULL) {
6414 RELEASE_PATH(path);
6415 path = NULL;
6416 }
6417
6418 if (no_firmlink_path != NULL) {
6419 RELEASE_PATH(no_firmlink_path);
6420 no_firmlink_path = NULL;
6421 }
6422 #if NAMEDRSRCFORK
6423 /* recycle the deleted rsrc fork vnode to force a reclaim, which
6424 * will cause its shadow file to go away if necessary.
6425 */
6426 if (vp && (vnode_isnamedstream(vp)) &&
6427 (vp->v_parent != NULLVP) &&
6428 vnode_isshadow(vp)) {
6429 vnode_recycle(vp);
6430 }
6431 #endif
6432 /*
6433 * nameidone has to happen before we vnode_put(dvp)
6434 * since it may need to release the fs_nodelock on the dvp
6435 */
6436 nameidone(ndp);
6437 vnode_put(dvp);
6438 if (vp) {
6439 vnode_put(vp);
6440 }
6441
6442 if (do_retry) {
6443 goto retry;
6444 }
6445
6446 early_out:
6447 kfree_type(typeof(*__unlink_data), __unlink_data);
6448 return error;
6449 }
6450
6451 int
unlink1(vfs_context_t ctx,vnode_t start_dvp,user_addr_t path_arg,enum uio_seg segflg,int unlink_flags)6452 unlink1(vfs_context_t ctx, vnode_t start_dvp, user_addr_t path_arg,
6453 enum uio_seg segflg, int unlink_flags)
6454 {
6455 return unlinkat_internal(ctx, AT_FDCWD, start_dvp, path_arg, segflg,
6456 unlink_flags);
6457 }
6458
6459 /*
6460 * Delete a name from the filesystem using Carbon semantics.
6461 */
6462 int
delete(__unused proc_t p,struct delete_args * uap,__unused int32_t * retval)6463 delete(__unused proc_t p, struct delete_args *uap, __unused int32_t *retval)
6464 {
6465 return unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
6466 uap->path, UIO_USERSPACE, VNODE_REMOVE_NODELETEBUSY);
6467 }
6468
6469 /*
6470 * Delete a name from the filesystem using POSIX semantics.
6471 */
6472 int
unlink(__unused proc_t p,struct unlink_args * uap,__unused int32_t * retval)6473 unlink(__unused proc_t p, struct unlink_args *uap, __unused int32_t *retval)
6474 {
6475 return unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
6476 uap->path, UIO_USERSPACE, 0);
6477 }
6478
6479 int
unlinkat(__unused proc_t p,struct unlinkat_args * uap,__unused int32_t * retval)6480 unlinkat(__unused proc_t p, struct unlinkat_args *uap, __unused int32_t *retval)
6481 {
6482 int unlink_flags = 0;
6483
6484 if (uap->flag & ~(AT_REMOVEDIR | AT_REMOVEDIR_DATALESS | AT_SYMLINK_NOFOLLOW_ANY | AT_SYSTEM_DISCARDED)) {
6485 return EINVAL;
6486 }
6487
6488 if (uap->flag & AT_SYMLINK_NOFOLLOW_ANY) {
6489 unlink_flags |= VNODE_REMOVE_NOFOLLOW_ANY;
6490 }
6491
6492 if (uap->flag & AT_SYSTEM_DISCARDED) {
6493 unlink_flags |= VNODE_REMOVE_SYSTEM_DISCARDED;
6494 }
6495
6496 if (uap->flag & (AT_REMOVEDIR | AT_REMOVEDIR_DATALESS)) {
6497 if (uap->flag & AT_REMOVEDIR_DATALESS) {
6498 unlink_flags |= VNODE_REMOVE_DATALESS_DIR;
6499 }
6500 return rmdirat_internal(vfs_context_current(), uap->fd,
6501 uap->path, UIO_USERSPACE, unlink_flags);
6502 } else {
6503 return unlinkat_internal(vfs_context_current(), uap->fd,
6504 NULLVP, uap->path, UIO_USERSPACE, unlink_flags);
6505 }
6506 }
6507
6508 /*
6509 * Reposition read/write file offset.
6510 */
6511 int
lseek(proc_t p,struct lseek_args * uap,off_t * retval)6512 lseek(proc_t p, struct lseek_args *uap, off_t *retval)
6513 {
6514 struct fileproc *fp;
6515 vnode_t vp;
6516 struct vfs_context *ctx;
6517 off_t offset = uap->offset, file_size;
6518 int error;
6519
6520 if ((error = fp_getfvp(p, uap->fd, &fp, &vp))) {
6521 if (error == ENOTSUP) {
6522 return ESPIPE;
6523 }
6524 return error;
6525 }
6526 if (vnode_isfifo(vp)) {
6527 file_drop(uap->fd);
6528 return ESPIPE;
6529 }
6530
6531
6532 ctx = vfs_context_current();
6533 #if CONFIG_MACF
6534 if (uap->whence == L_INCR && uap->offset == 0) {
6535 error = mac_file_check_get_offset(vfs_context_ucred(ctx),
6536 fp->fp_glob);
6537 } else {
6538 error = mac_file_check_change_offset(vfs_context_ucred(ctx),
6539 fp->fp_glob);
6540 }
6541 if (error) {
6542 file_drop(uap->fd);
6543 return error;
6544 }
6545 #endif
6546 if ((error = vnode_getwithref(vp))) {
6547 file_drop(uap->fd);
6548 return error;
6549 }
6550
6551 switch (uap->whence) {
6552 case L_INCR:
6553 offset += fp->fp_glob->fg_offset;
6554 break;
6555 case L_XTND:
6556 if ((error = vnode_size(vp, &file_size, ctx)) != 0) {
6557 break;
6558 }
6559 offset += file_size;
6560 break;
6561 case L_SET:
6562 break;
6563 case SEEK_HOLE:
6564 error = VNOP_IOCTL(vp, FSIOC_FIOSEEKHOLE, (caddr_t)&offset, 0, ctx);
6565 break;
6566 case SEEK_DATA:
6567 error = VNOP_IOCTL(vp, FSIOC_FIOSEEKDATA, (caddr_t)&offset, 0, ctx);
6568 break;
6569 default:
6570 error = EINVAL;
6571 }
6572 if (error == 0) {
6573 if (uap->offset > 0 && offset < 0) {
6574 /* Incremented/relative move past max size */
6575 error = EOVERFLOW;
6576 } else {
6577 /*
6578 * Allow negative offsets on character devices, per
6579 * POSIX 1003.1-2001. Most likely for writing disk
6580 * labels.
6581 */
6582 if (offset < 0 && vp->v_type != VCHR) {
6583 /* Decremented/relative move before start */
6584 error = EINVAL;
6585 } else {
6586 /* Success */
6587 fp->fp_glob->fg_offset = offset;
6588 *retval = fp->fp_glob->fg_offset;
6589 }
6590 }
6591 }
6592
6593 /*
6594 * An lseek can affect whether data is "available to read." Use
6595 * hint of NOTE_NONE so no EVFILT_VNODE events fire
6596 */
6597 post_event_if_success(vp, error, NOTE_NONE);
6598 (void)vnode_put(vp);
6599 file_drop(uap->fd);
6600 return error;
6601 }
6602
6603
6604 /*
6605 * Check access permissions.
6606 *
6607 * Returns: 0 Success
6608 * vnode_authorize:???
6609 */
6610 static int
access1(vnode_t vp,vnode_t dvp,int uflags,vfs_context_t ctx)6611 access1(vnode_t vp, vnode_t dvp, int uflags, vfs_context_t ctx)
6612 {
6613 kauth_action_t action;
6614 int error;
6615
6616 /*
6617 * If just the regular access bits, convert them to something
6618 * that vnode_authorize will understand.
6619 */
6620 if (!(uflags & _ACCESS_EXTENDED_MASK)) {
6621 action = 0;
6622 if (uflags & R_OK) {
6623 action |= KAUTH_VNODE_READ_DATA; /* aka KAUTH_VNODE_LIST_DIRECTORY */
6624 }
6625 if (uflags & W_OK) {
6626 if (vnode_isdir(vp)) {
6627 action |= KAUTH_VNODE_ADD_FILE |
6628 KAUTH_VNODE_ADD_SUBDIRECTORY;
6629 /* might want delete rights here too */
6630 } else {
6631 action |= KAUTH_VNODE_WRITE_DATA;
6632 }
6633 }
6634 if (uflags & X_OK) {
6635 if (vnode_isdir(vp)) {
6636 action |= KAUTH_VNODE_SEARCH;
6637 } else {
6638 action |= KAUTH_VNODE_EXECUTE;
6639 }
6640 }
6641 } else {
6642 /* take advantage of definition of uflags */
6643 action = uflags >> 8;
6644 }
6645
6646 #if CONFIG_MACF
6647 error = mac_vnode_check_access(ctx, vp, uflags);
6648 if (error) {
6649 return error;
6650 }
6651 #endif /* MAC */
6652
6653 /* action == 0 means only check for existence */
6654 if (action != 0) {
6655 error = vnode_authorize(vp, dvp, action | KAUTH_VNODE_ACCESS, ctx);
6656 } else {
6657 error = 0;
6658 }
6659
6660 return error;
6661 }
6662
6663
6664
6665 /*
6666 * access_extended: Check access permissions in bulk.
6667 *
6668 * Description: uap->entries Pointer to an array of accessx
6669 * descriptor structs, plus one or
6670 * more NULL terminated strings (see
6671 * "Notes" section below).
6672 * uap->size Size of the area pointed to by
6673 * uap->entries.
6674 * uap->results Pointer to the results array.
6675 *
6676 * Returns: 0 Success
6677 * ENOMEM Insufficient memory
6678 * EINVAL Invalid arguments
6679 * namei:EFAULT Bad address
6680 * namei:ENAMETOOLONG Filename too long
6681 * namei:ENOENT No such file or directory
6682 * namei:ELOOP Too many levels of symbolic links
6683 * namei:EBADF Bad file descriptor
6684 * namei:ENOTDIR Not a directory
6685 * namei:???
6686 * access1:
6687 *
6688 * Implicit returns:
6689 * uap->results Array contents modified
6690 *
6691 * Notes: The uap->entries are structured as an arbitrary length array
6692 * of accessx descriptors, followed by one or more NULL terminated
6693 * strings
6694 *
6695 * struct accessx_descriptor[0]
6696 * ...
6697 * struct accessx_descriptor[n]
6698 * char name_data[0];
6699 *
6700 * We determine the entry count by walking the buffer containing
6701 * the uap->entries argument descriptor. For each descriptor we
6702 * see, the valid values for the offset ad_name_offset will be
6703 * in the byte range:
6704 *
6705 * [ uap->entries + sizeof(struct accessx_descriptor) ]
6706 * to
6707 * [ uap->entries + uap->size - 2 ]
6708 *
6709 * since we must have at least one string, and the string must
6710 * be at least one character plus the NULL terminator in length.
6711 *
6712 * XXX: Need to support the check-as uid argument
6713 */
int
access_extended(__unused proc_t p, struct access_extended_args *uap, __unused int32_t *retval)
{
	struct accessx_descriptor *input = NULL;
	errno_t *result = NULL;
	errno_t error = 0;
	int wantdelete = 0;
	size_t desc_max, desc_actual = 0;
	unsigned int i, j;
	struct vfs_context context;
	struct nameidata nd;
	int niopts;
	vnode_t vp = NULL;
	vnode_t dvp = NULL;
#define ACCESSX_MAX_DESCR_ON_STACK 10
	struct accessx_descriptor stack_input[ACCESSX_MAX_DESCR_ON_STACK];

	/* ensure the cleanup path can tell whether a cred ref was taken */
	context.vc_ucred = NULL;

	/*
	 * Validate parameters; if valid, copy the descriptor array and string
	 * arguments into local memory.  Before proceeding, the following
	 * conditions must have been met:
	 *
	 * o The total size is not permitted to exceed ACCESSX_MAX_TABLESIZE
	 * o There must be sufficient room in the request for at least one
	 *   descriptor and a one byte NUL terminated string.
	 * o The allocation of local storage must not fail.
	 */
	if (uap->size > ACCESSX_MAX_TABLESIZE) {
		return ENOMEM;
	}
	if (uap->size < (sizeof(struct accessx_descriptor) + 2)) {
		return EINVAL;
	}
	/* small requests are served from the stack; larger ones from the heap */
	if (uap->size <= sizeof(stack_input)) {
		input = stack_input;
	} else {
		input = kalloc_data(uap->size, Z_WAITOK);
		if (input == NULL) {
			error = ENOMEM;
			goto out;
		}
	}
	error = copyin(uap->entries, input, uap->size);
	if (error) {
		goto out;
	}

	AUDIT_ARG(opaque, input, uap->size);

	/*
	 * Force NUL termination of the copyin buffer to avoid namei() running
	 * off the end.  If the caller passes us bogus data, they may get a
	 * bogus result.
	 */
	((char *)input)[uap->size - 1] = 0;

	/*
	 * Access is defined as checking against the process' real identity,
	 * even if operations are checking the effective identity.  This
	 * requires that we use a local vfs context.
	 */
	context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
	context.vc_thread = current_thread();

	/*
	 * Find out how many entries we have, so we can allocate the result
	 * array by walking the list and adjusting the count downward by the
	 * earliest string offset we see.
	 */
	desc_max = (uap->size - 2) / sizeof(struct accessx_descriptor);
	desc_actual = desc_max;
	for (i = 0; i < desc_actual; i++) {
		/*
		 * Take the offset to the name string for this entry and
		 * convert to an input array index, which would be one off
		 * the end of the array if this entry was the lowest-addressed
		 * name string.
		 */
		j = input[i].ad_name_offset / sizeof(struct accessx_descriptor);

		/*
		 * An offset greater than the max allowable offset is an error.
		 * It is also an error for any valid entry to point
		 * to a location prior to the end of the current entry, if
		 * it's not a reference to the string of the previous entry.
		 */
		if (j > desc_max || (j != 0 && j <= i)) {
			error = EINVAL;
			goto out;
		}

		/* Also do not let ad_name_offset point to something beyond the size of the input */
		if (input[i].ad_name_offset >= uap->size) {
			error = EINVAL;
			goto out;
		}

		/*
		 * An offset of 0 means use the previous descriptor's offset;
		 * this is used to chain multiple requests for the same file
		 * to avoid multiple lookups.
		 */
		if (j == 0) {
			/* This is not valid for the first entry */
			if (i == 0) {
				error = EINVAL;
				goto out;
			}
			continue;
		}

		/*
		 * If the offset of the string for this descriptor is before
		 * what we believe is the current actual last descriptor,
		 * then we need to adjust our estimate downward; this permits
		 * the string table following the last descriptor to be out
		 * of order relative to the descriptor list.
		 */
		if (j < desc_actual) {
			desc_actual = j;
		}
	}

	/*
	 * We limit the actual number of descriptors we are willing to process
	 * to a hard maximum of ACCESSX_MAX_DESCRIPTORS.  If the number being
	 * requested does not exceed this limit,
	 */
	if (desc_actual > ACCESSX_MAX_DESCRIPTORS) {
		error = ENOMEM;
		goto out;
	}
	result = kalloc_data(desc_actual * sizeof(errno_t), Z_WAITOK | Z_ZERO);
	if (result == NULL) {
		error = ENOMEM;
		goto out;
	}

	/*
	 * Do the work by iterating over the descriptor entries we know to
	 * at least appear to contain valid data.
	 */
	error = 0;
	for (i = 0; i < desc_actual; i++) {
		/*
		 * If the ad_name_offset is 0, then we use the previous
		 * results to make the check; otherwise, we are looking up
		 * a new file name.
		 */
		if (input[i].ad_name_offset != 0) {
			/* discard old vnodes */
			if (vp) {
				vnode_put(vp);
				vp = NULL;
			}
			if (dvp) {
				vnode_put(dvp);
				dvp = NULL;
			}

			/*
			 * Scan forward in the descriptor list to see if we
			 * need the parent vnode.  We will need it if we are
			 * deleting, since we must have rights to remove
			 * entries in the parent directory, as well as the
			 * rights to delete the object itself.
			 */
			wantdelete = input[i].ad_flags & _DELETE_OK;
			for (j = i + 1; (j < desc_actual) && (input[j].ad_name_offset == 0); j++) {
				if (input[j].ad_flags & _DELETE_OK) {
					wantdelete = 1;
				}
			}

			niopts = FOLLOW | AUDITVNPATH1;

			/* need parent for vnode_authorize for deletion test */
			if (wantdelete) {
				niopts |= WANTPARENT;
			}

			/* do the lookup */
			NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, UIO_SYSSPACE,
			    CAST_USER_ADDR_T(((const char *)input) + input[i].ad_name_offset),
			    &context);
			error = namei(&nd);
			if (!error) {
				vp = nd.ni_vp;
				if (wantdelete) {
					dvp = nd.ni_dvp;
				}
			}
			nameidone(&nd);
		}

		/*
		 * Handle lookup errors.  A small set of per-entry failures is
		 * reported in the result slot; anything else aborts the call.
		 */
		switch (error) {
		case ENOENT:
		case EACCES:
		case EPERM:
		case ENOTDIR:
			result[i] = error;
			break;
		case 0:
			/* run this access check */
			result[i] = access1(vp, dvp, input[i].ad_flags, &context);
			break;
		default:
			/* fatal lookup error */

			goto out;
		}
	}

	AUDIT_ARG(data, result, sizeof(errno_t), desc_actual);

	/* copy out results */
	error = copyout(result, uap->results, desc_actual * sizeof(errno_t));

out:
	if (input && input != stack_input) {
		kfree_data(input, uap->size);
	}
	if (result) {
		kfree_data(result, desc_actual * sizeof(errno_t));
	}
	if (vp) {
		vnode_put(vp);
	}
	if (dvp) {
		vnode_put(dvp);
	}
	if (IS_VALID_CRED(context.vc_ucred)) {
		kauth_cred_unref(&context.vc_ucred);
	}
	return error;
}
6955
6956
6957 /*
6958 * Returns: 0 Success
6959 * namei:EFAULT Bad address
6960 * namei:ENAMETOOLONG Filename too long
6961 * namei:ENOENT No such file or directory
6962 * namei:ELOOP Too many levels of symbolic links
6963 * namei:EBADF Bad file descriptor
6964 * namei:ENOTDIR Not a directory
6965 * namei:???
6966 * access1:
6967 */
static int
faccessat_internal(vfs_context_t ctx, int fd, user_addr_t path, int amode,
    int flag, enum uio_seg segflg)
{
	int error;
	struct nameidata nd;
	int niopts;
	struct vfs_context context;
#if NAMEDRSRCFORK
	int is_namedstream = 0;
#endif

	/*
	 * Unless the AT_EACCESS option is used, Access is defined as checking
	 * against the process' real identity, even if operations are checking
	 * the effective identity.  So we need to tweak the credential
	 * in the context for that case.
	 */
	if (!(flag & AT_EACCESS)) {
		/* takes a cred reference; released at "out" below */
		context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
	} else {
		/* borrowed reference from the caller's context; not unref'ed */
		context.vc_ucred = ctx->vc_ucred;
	}
	context.vc_thread = ctx->vc_thread;


	niopts = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY) ? NOFOLLOW : FOLLOW) | AUDITVNPATH1;
	/* need parent for vnode_authorize for deletion test */
	if (amode & _DELETE_OK) {
		niopts |= WANTPARENT;
	}
	NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, segflg,
	    path, &context);
	if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}

#if NAMEDRSRCFORK
	/* access(F_OK) calls are allowed for resource forks. */
	if (amode == F_OK) {
		nd.ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
	}
#endif
	error = nameiat(&nd, fd);
	if (error) {
		goto out;
	}

#if NAMEDRSRCFORK
	/* Grab reference on the shadow stream file vnode to
	 * force an inactive on release which will mark it
	 * for recycle.
	 */
	if (vnode_isnamedstream(nd.ni_vp) &&
	    (nd.ni_vp->v_parent != NULLVP) &&
	    vnode_isshadow(nd.ni_vp)) {
		is_namedstream = 1;
		vnode_ref(nd.ni_vp);
	}
#endif

	error = access1(nd.ni_vp, nd.ni_dvp, amode, &context);

#if NAMEDRSRCFORK
	if (is_namedstream) {
		vnode_rele(nd.ni_vp);
	}
#endif

	vnode_put(nd.ni_vp);
	/* a parent iocount is held only when WANTPARENT was requested above */
	if (amode & _DELETE_OK) {
		vnode_put(nd.ni_dvp);
	}
	nameidone(&nd);

out:
	if (!(flag & AT_EACCESS)) {
		kauth_cred_unref(&context.vc_ucred);
	}
	return error;
}
7049
7050 int
access(__unused proc_t p,struct access_args * uap,__unused int32_t * retval)7051 access(__unused proc_t p, struct access_args *uap, __unused int32_t *retval)
7052 {
7053 return faccessat_internal(vfs_context_current(), AT_FDCWD,
7054 uap->path, uap->flags, 0, UIO_USERSPACE);
7055 }
7056
7057 int
faccessat(__unused proc_t p,struct faccessat_args * uap,__unused int32_t * retval)7058 faccessat(__unused proc_t p, struct faccessat_args *uap,
7059 __unused int32_t *retval)
7060 {
7061 if (uap->flag & ~(AT_EACCESS | AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) {
7062 return EINVAL;
7063 }
7064
7065 return faccessat_internal(vfs_context_current(), uap->fd,
7066 uap->path, uap->amode, uap->flag, UIO_USERSPACE);
7067 }
7068
7069 /*
7070 * Returns: 0 Success
7071 * EFAULT
7072 * copyout:EFAULT
7073 * namei:???
7074 * vn_stat:???
7075 */
7076 static int
fstatat_internal(vfs_context_t ctx,user_addr_t path,user_addr_t ub,user_addr_t xsecurity,user_addr_t xsecurity_size,int isstat64,enum uio_seg segflg,int fd,int flag)7077 fstatat_internal(vfs_context_t ctx, user_addr_t path, user_addr_t ub,
7078 user_addr_t xsecurity, user_addr_t xsecurity_size, int isstat64,
7079 enum uio_seg segflg, int fd, int flag)
7080 {
7081 struct nameidata *ndp = NULL;
7082 int follow;
7083 union {
7084 struct stat sb;
7085 struct stat64 sb64;
7086 } source = {};
7087 union {
7088 struct user64_stat user64_sb;
7089 struct user32_stat user32_sb;
7090 struct user64_stat64 user64_sb64;
7091 struct user32_stat64 user32_sb64;
7092 } dest = {};
7093 caddr_t sbp;
7094 int error, my_size;
7095 kauth_filesec_t fsec = KAUTH_FILESEC_NONE;
7096 size_t xsecurity_bufsize;
7097 void * statptr;
7098 struct fileproc *fp = NULL;
7099 int needsrealdev = 0;
7100
7101 follow = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) ? NOFOLLOW : FOLLOW;
7102 ndp = kalloc_type(struct nameidata, Z_WAITOK);
7103 NDINIT(ndp, LOOKUP, OP_GETATTR, follow | AUDITVNPATH1,
7104 segflg, path, ctx);
7105 if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
7106 ndp->ni_flag |= NAMEI_NOFOLLOW_ANY;
7107 }
7108
7109 #if NAMEDRSRCFORK
7110 int is_namedstream = 0;
7111 /* stat calls are allowed for resource forks. */
7112 ndp->ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
7113 #endif
7114
7115 if (flag & AT_FDONLY) {
7116 vnode_t fvp;
7117
7118 error = fp_getfvp(vfs_context_proc(ctx), fd, &fp, &fvp);
7119 if (error) {
7120 goto out;
7121 }
7122 if ((error = vnode_getwithref(fvp))) {
7123 file_drop(fd);
7124 goto out;
7125 }
7126 ndp->ni_vp = fvp;
7127 } else {
7128 error = nameiat(ndp, fd);
7129 if (error) {
7130 goto out;
7131 }
7132 }
7133
7134 statptr = (void *)&source;
7135
7136 #if NAMEDRSRCFORK
7137 /* Grab reference on the shadow stream file vnode to
7138 * force an inactive on release which will mark it
7139 * for recycle.
7140 */
7141 if (vnode_isnamedstream(ndp->ni_vp) &&
7142 (ndp->ni_vp->v_parent != NULLVP) &&
7143 vnode_isshadow(ndp->ni_vp)) {
7144 is_namedstream = 1;
7145 vnode_ref(ndp->ni_vp);
7146 }
7147 #endif
7148
7149 needsrealdev = flag & AT_REALDEV ? 1 : 0;
7150 if (fp && (xsecurity == USER_ADDR_NULL)) {
7151 /*
7152 * If the caller has the file open, and is not
7153 * requesting extended security information, we are
7154 * going to let them get the basic stat information.
7155 */
7156 error = vn_stat_noauth(ndp->ni_vp, statptr, NULL, isstat64, needsrealdev, ctx,
7157 fp->fp_glob->fg_cred);
7158 } else {
7159 error = vn_stat(ndp->ni_vp, statptr, (xsecurity != USER_ADDR_NULL ? &fsec : NULL),
7160 isstat64, needsrealdev, ctx);
7161 }
7162
7163 #if NAMEDRSRCFORK
7164 if (is_namedstream) {
7165 vnode_rele(ndp->ni_vp);
7166 }
7167 #endif
7168 vnode_put(ndp->ni_vp);
7169 nameidone(ndp);
7170
7171 if (fp) {
7172 file_drop(fd);
7173 fp = NULL;
7174 }
7175
7176 if (error) {
7177 goto out;
7178 }
7179 /* Zap spare fields */
7180 if (isstat64 != 0) {
7181 source.sb64.st_lspare = 0;
7182 source.sb64.st_qspare[0] = 0LL;
7183 source.sb64.st_qspare[1] = 0LL;
7184 if (vfs_context_is64bit(ctx)) {
7185 munge_user64_stat64(&source.sb64, &dest.user64_sb64);
7186 my_size = sizeof(dest.user64_sb64);
7187 sbp = (caddr_t)&dest.user64_sb64;
7188 } else {
7189 munge_user32_stat64(&source.sb64, &dest.user32_sb64);
7190 my_size = sizeof(dest.user32_sb64);
7191 sbp = (caddr_t)&dest.user32_sb64;
7192 }
7193 /*
7194 * Check if we raced (post lookup) against the last unlink of a file.
7195 */
7196 if ((source.sb64.st_nlink == 0) && S_ISREG(source.sb64.st_mode)) {
7197 source.sb64.st_nlink = 1;
7198 }
7199 } else {
7200 source.sb.st_lspare = 0;
7201 source.sb.st_qspare[0] = 0LL;
7202 source.sb.st_qspare[1] = 0LL;
7203 if (vfs_context_is64bit(ctx)) {
7204 munge_user64_stat(&source.sb, &dest.user64_sb);
7205 my_size = sizeof(dest.user64_sb);
7206 sbp = (caddr_t)&dest.user64_sb;
7207 } else {
7208 munge_user32_stat(&source.sb, &dest.user32_sb);
7209 my_size = sizeof(dest.user32_sb);
7210 sbp = (caddr_t)&dest.user32_sb;
7211 }
7212
7213 /*
7214 * Check if we raced (post lookup) against the last unlink of a file.
7215 */
7216 if ((source.sb.st_nlink == 0) && S_ISREG(source.sb.st_mode)) {
7217 source.sb.st_nlink = 1;
7218 }
7219 }
7220 if ((error = copyout(sbp, ub, my_size)) != 0) {
7221 goto out;
7222 }
7223
7224 /* caller wants extended security information? */
7225 if (xsecurity != USER_ADDR_NULL) {
7226 /* did we get any? */
7227 if (fsec == KAUTH_FILESEC_NONE) {
7228 if (susize(xsecurity_size, 0) != 0) {
7229 error = EFAULT;
7230 goto out;
7231 }
7232 } else {
7233 /* find the user buffer size */
7234 xsecurity_bufsize = fusize(xsecurity_size);
7235
7236 /* copy out the actual data size */
7237 if (susize(xsecurity_size, KAUTH_FILESEC_COPYSIZE(fsec)) != 0) {
7238 error = EFAULT;
7239 goto out;
7240 }
7241
7242 /* if the caller supplied enough room, copy out to it */
7243 if (xsecurity_bufsize >= KAUTH_FILESEC_COPYSIZE(fsec)) {
7244 error = copyout(fsec, xsecurity, KAUTH_FILESEC_COPYSIZE(fsec));
7245 }
7246 }
7247 }
7248 out:
7249 if (ndp) {
7250 kfree_type(struct nameidata, ndp);
7251 }
7252 if (fsec != KAUTH_FILESEC_NONE) {
7253 kauth_filesec_free(fsec);
7254 }
7255 return error;
7256 }
7257
7258 /*
7259 * stat_extended: Get file status; with extended security (ACL).
7260 *
7261 * Parameters: p (ignored)
7262 * uap User argument descriptor (see below)
7263 * retval (ignored)
7264 *
7265 * Indirect: uap->path Path of file to get status from
7266 * uap->ub User buffer (holds file status info)
7267 * uap->xsecurity ACL to get (extended security)
7268 * uap->xsecurity_size Size of ACL
7269 *
7270 * Returns: 0 Success
7271 * !0 errno value
7272 *
7273 */
7274 int
stat_extended(__unused proc_t p,struct stat_extended_args * uap,__unused int32_t * retval)7275 stat_extended(__unused proc_t p, struct stat_extended_args *uap,
7276 __unused int32_t *retval)
7277 {
7278 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7279 uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
7280 0);
7281 }
7282
7283 /*
7284 * Returns: 0 Success
7285 * fstatat_internal:??? [see fstatat_internal() in this file]
7286 */
7287 int
stat(__unused proc_t p,struct stat_args * uap,__unused int32_t * retval)7288 stat(__unused proc_t p, struct stat_args *uap, __unused int32_t *retval)
7289 {
7290 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7291 0, 0, 0, UIO_USERSPACE, AT_FDCWD, 0);
7292 }
7293
7294 int
stat64(__unused proc_t p,struct stat64_args * uap,__unused int32_t * retval)7295 stat64(__unused proc_t p, struct stat64_args *uap, __unused int32_t *retval)
7296 {
7297 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7298 0, 0, 1, UIO_USERSPACE, AT_FDCWD, 0);
7299 }
7300
7301 /*
7302 * stat64_extended: Get file status; can handle large inode numbers; with extended security (ACL).
7303 *
7304 * Parameters: p (ignored)
7305 * uap User argument descriptor (see below)
7306 * retval (ignored)
7307 *
7308 * Indirect: uap->path Path of file to get status from
7309 * uap->ub User buffer (holds file status info)
7310 * uap->xsecurity ACL to get (extended security)
7311 * uap->xsecurity_size Size of ACL
7312 *
7313 * Returns: 0 Success
7314 * !0 errno value
7315 *
7316 */
7317 int
stat64_extended(__unused proc_t p,struct stat64_extended_args * uap,__unused int32_t * retval)7318 stat64_extended(__unused proc_t p, struct stat64_extended_args *uap, __unused int32_t *retval)
7319 {
7320 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7321 uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
7322 0);
7323 }
7324
7325 /*
7326 * lstat_extended: Get file status; does not follow links; with extended security (ACL).
7327 *
7328 * Parameters: p (ignored)
7329 * uap User argument descriptor (see below)
7330 * retval (ignored)
7331 *
7332 * Indirect: uap->path Path of file to get status from
7333 * uap->ub User buffer (holds file status info)
7334 * uap->xsecurity ACL to get (extended security)
7335 * uap->xsecurity_size Size of ACL
7336 *
7337 * Returns: 0 Success
7338 * !0 errno value
7339 *
7340 */
7341 int
lstat_extended(__unused proc_t p,struct lstat_extended_args * uap,__unused int32_t * retval)7342 lstat_extended(__unused proc_t p, struct lstat_extended_args *uap, __unused int32_t *retval)
7343 {
7344 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7345 uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
7346 AT_SYMLINK_NOFOLLOW);
7347 }
7348
7349 /*
7350 * Get file status; this version does not follow links.
7351 */
7352 int
lstat(__unused proc_t p,struct lstat_args * uap,__unused int32_t * retval)7353 lstat(__unused proc_t p, struct lstat_args *uap, __unused int32_t *retval)
7354 {
7355 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7356 0, 0, 0, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW);
7357 }
7358
7359 int
lstat64(__unused proc_t p,struct lstat64_args * uap,__unused int32_t * retval)7360 lstat64(__unused proc_t p, struct lstat64_args *uap, __unused int32_t *retval)
7361 {
7362 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7363 0, 0, 1, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW);
7364 }
7365
7366 /*
7367 * lstat64_extended: Get file status; can handle large inode numbers; does not
7368 * follow links; with extended security (ACL).
7369 *
7370 * Parameters: p (ignored)
7371 * uap User argument descriptor (see below)
7372 * retval (ignored)
7373 *
7374 * Indirect: uap->path Path of file to get status from
7375 * uap->ub User buffer (holds file status info)
7376 * uap->xsecurity ACL to get (extended security)
7377 * uap->xsecurity_size Size of ACL
7378 *
7379 * Returns: 0 Success
7380 * !0 errno value
7381 *
7382 */
7383 int
lstat64_extended(__unused proc_t p,struct lstat64_extended_args * uap,__unused int32_t * retval)7384 lstat64_extended(__unused proc_t p, struct lstat64_extended_args *uap, __unused int32_t *retval)
7385 {
7386 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7387 uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
7388 AT_SYMLINK_NOFOLLOW);
7389 }
7390
7391 int
fstatat(__unused proc_t p,struct fstatat_args * uap,__unused int32_t * retval)7392 fstatat(__unused proc_t p, struct fstatat_args *uap, __unused int32_t *retval)
7393 {
7394 if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_REALDEV | AT_FDONLY | AT_SYMLINK_NOFOLLOW_ANY)) {
7395 return EINVAL;
7396 }
7397
7398 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7399 0, 0, 0, UIO_USERSPACE, uap->fd, uap->flag);
7400 }
7401
7402 int
fstatat64(__unused proc_t p,struct fstatat64_args * uap,__unused int32_t * retval)7403 fstatat64(__unused proc_t p, struct fstatat64_args *uap,
7404 __unused int32_t *retval)
7405 {
7406 if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_REALDEV | AT_FDONLY | AT_SYMLINK_NOFOLLOW_ANY)) {
7407 return EINVAL;
7408 }
7409
7410 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7411 0, 0, 1, UIO_USERSPACE, uap->fd, uap->flag);
7412 }
7413
7414 /*
7415 * Get configurable pathname variables.
7416 *
7417 * Returns: 0 Success
7418 * namei:???
7419 * vn_pathconf:???
7420 *
7421 * Notes: Global implementation constants are intended to be
7422 * implemented in this function directly; all other constants
7423 * are per-FS implementation, and therefore must be handled in
7424 * each respective FS, instead.
7425 *
7426 * XXX We implement some things globally right now that should actually be
7427 * XXX per-FS; we will need to deal with this at some point.
7428 */
7429 /* ARGSUSED */
7430 int
pathconf(__unused proc_t p,struct pathconf_args * uap,int32_t * retval)7431 pathconf(__unused proc_t p, struct pathconf_args *uap, int32_t *retval)
7432 {
7433 int error;
7434 struct nameidata nd;
7435 vfs_context_t ctx = vfs_context_current();
7436
7437 NDINIT(&nd, LOOKUP, OP_PATHCONF, FOLLOW | AUDITVNPATH1,
7438 UIO_USERSPACE, uap->path, ctx);
7439 error = namei(&nd);
7440 if (error) {
7441 return error;
7442 }
7443
7444 error = vn_pathconf(nd.ni_vp, uap->name, retval, ctx);
7445
7446 vnode_put(nd.ni_vp);
7447 nameidone(&nd);
7448 return error;
7449 }
7450
7451 /*
7452 * Return target name of a symbolic link.
7453 */
7454 /* ARGSUSED */
static int
readlinkat_internal(vfs_context_t ctx, int fd, vnode_t lnk_vp, user_addr_t path,
    enum uio_seg seg, user_addr_t buf, size_t bufsize, enum uio_seg bufseg,
    int *retval)
{
	vnode_t vp;
	uio_t auio;
	int error;
	struct nameidata nd;
	UIO_STACKBUF(uio_buf, 1);
	bool put_vnode;

	/* result length is reported via an int; refuse oversized buffers */
	if (bufsize > INT32_MAX) {
		return EINVAL;
	}

	/*
	 * Either operate on a vnode the caller already holds an iocount on
	 * (freadlink path), or look the path up without following the final
	 * symlink.
	 */
	if (lnk_vp) {
		vp = lnk_vp;
		put_vnode = false;	/* borrowed: caller releases its iocount */
	} else {
		NDINIT(&nd, LOOKUP, OP_READLINK, NOFOLLOW | AUDITVNPATH1,
		    seg, path, ctx);

		error = nameiat(&nd, fd);
		if (error) {
			return error;
		}
		vp = nd.ni_vp;
		put_vnode = true;
		nameidone(&nd);
	}

	auio = uio_createwithbuffer(1, 0, bufseg, UIO_READ,
	    &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, buf, bufsize);
	if (vp->v_type != VLNK) {
		error = EINVAL;
	} else {
#if CONFIG_MACF
		error = mac_vnode_check_readlink(ctx, vp);
#endif
		if (error == 0) {
			error = vnode_authorize(vp, NULL, KAUTH_VNODE_READ_DATA,
			    ctx);
		}
		if (error == 0) {
			error = VNOP_READLINK(vp, auio, ctx);
		}
	}

	if (put_vnode) {
		vnode_put(vp);
	}

	/* report bytes transferred even when an error is being returned */
	*retval = (int)(bufsize - uio_resid(auio));
	return error;
}
7512
7513 int
freadlink(proc_t p,struct freadlink_args * uap,int32_t * retval)7514 freadlink(proc_t p, struct freadlink_args *uap, int32_t *retval)
7515 {
7516 enum uio_seg procseg;
7517 vnode_t vp;
7518 int error;
7519
7520 procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
7521
7522 AUDIT_ARG(fd, uap->fd);
7523
7524 if ((error = file_vnode(uap->fd, &vp))) {
7525 return error;
7526 }
7527 if ((error = vnode_getwithref(vp))) {
7528 file_drop(uap->fd);
7529 return error;
7530 }
7531
7532 error = readlinkat_internal(vfs_context_current(), -1,
7533 vp, 0, procseg, CAST_USER_ADDR_T(uap->buf),
7534 uap->bufsize, procseg, retval);
7535
7536 vnode_put(vp);
7537 file_drop(uap->fd);
7538 return error;
7539 }
7540
7541 int
readlink(proc_t p,struct readlink_args * uap,int32_t * retval)7542 readlink(proc_t p, struct readlink_args *uap, int32_t *retval)
7543 {
7544 enum uio_seg procseg;
7545
7546 procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
7547 return readlinkat_internal(vfs_context_current(), AT_FDCWD, NULL,
7548 CAST_USER_ADDR_T(uap->path), procseg, CAST_USER_ADDR_T(uap->buf),
7549 uap->count, procseg, retval);
7550 }
7551
7552 int
readlinkat(proc_t p,struct readlinkat_args * uap,int32_t * retval)7553 readlinkat(proc_t p, struct readlinkat_args *uap, int32_t *retval)
7554 {
7555 enum uio_seg procseg;
7556
7557 procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
7558 return readlinkat_internal(vfs_context_current(), uap->fd, NULL,
7559 CAST_USER_ADDR_T(uap->path), procseg, uap->buf, uap->bufsize, procseg,
7560 retval);
7561 }
7562
7563 /*
7564 * Change file flags, the deep inner layer.
7565 */
7566 static int
chflags0(vnode_t vp,struct vnode_attr * va,int (* setattr)(vnode_t,void *,vfs_context_t),void * arg,vfs_context_t ctx)7567 chflags0(vnode_t vp, struct vnode_attr *va,
7568 int (*setattr)(vnode_t, void *, vfs_context_t),
7569 void *arg, vfs_context_t ctx)
7570 {
7571 kauth_action_t action = 0;
7572 int error;
7573
7574 #if CONFIG_MACF
7575 error = mac_vnode_check_setflags(ctx, vp, va->va_flags);
7576 if (error) {
7577 goto out;
7578 }
7579 #endif
7580
7581 /* request authorisation, disregard immutability */
7582 if ((error = vnode_authattr(vp, va, &action, ctx)) != 0) {
7583 goto out;
7584 }
7585 /*
7586 * Request that the auth layer disregard those file flags it's allowed to when
7587 * authorizing this operation; we need to do this in order to be able to
7588 * clear immutable flags.
7589 */
7590 if (action && ((error = vnode_authorize(vp, NULL, action | KAUTH_VNODE_NOIMMUTABLE, ctx)) != 0)) {
7591 goto out;
7592 }
7593 error = (*setattr)(vp, arg, ctx);
7594
7595 #if CONFIG_MACF
7596 if (error == 0) {
7597 mac_vnode_notify_setflags(ctx, vp, va->va_flags);
7598 }
7599 #endif
7600
7601 out:
7602 return error;
7603 }
7604
7605 /*
7606 * Change file flags.
7607 *
7608 * NOTE: this will vnode_put() `vp'
7609 */
7610 static int
chflags1(vnode_t vp,int flags,vfs_context_t ctx)7611 chflags1(vnode_t vp, int flags, vfs_context_t ctx)
7612 {
7613 struct vnode_attr va;
7614 int error;
7615
7616 VATTR_INIT(&va);
7617 VATTR_SET(&va, va_flags, flags);
7618
7619 error = chflags0(vp, &va, (void *)vnode_setattr, &va, ctx);
7620 vnode_put(vp);
7621
7622 if ((error == 0) && !VATTR_IS_SUPPORTED(&va, va_flags)) {
7623 error = ENOTSUP;
7624 }
7625
7626 return error;
7627 }
7628
7629 /*
7630 * Change flags of a file given a path name.
7631 */
7632 /* ARGSUSED */
int
chflags(__unused proc_t p, struct chflags_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	vfs_context_t ctx = vfs_context_current();
	int error;
	struct nameidata nd;
	uint32_t wantparent = 0;

#if CONFIG_FILE_LEASES
	/* the parent is only needed to break a directory lease below */
	wantparent = WANTPARENT;
#endif

	AUDIT_ARG(fflags, uap->flags);
	NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1 | wantparent,
	    UIO_USERSPACE, uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
	/* release the parent iocount acquired via WANTPARENT */
	vnode_put(nd.ni_dvp);
#endif

	nameidone(&nd);

	/* we don't vnode_put() here because chflags1 does internally */
	error = chflags1(vp, uap->flags, ctx);

	return error;
}
7667
7668 /*
7669 * Change flags of a file given a file descriptor.
7670 */
7671 /* ARGSUSED */
int
fchflags(__unused proc_t p, struct fchflags_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	int error;

	AUDIT_ARG(fd, uap->fd);
	AUDIT_ARG(fflags, uap->flags);
	/* Resolve the descriptor to its vnode; pairs with file_drop() below. */
	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}

	/* Take an iocount on the vnode; chflags1() drops it for us. */
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

#if CONFIG_FILE_LEASES
	/* Break any lease on the parent directory before changing flags. */
	vnode_breakdirlease(vp, true, O_WRONLY);
#endif

	/* we don't vnode_put() here because chflags1 does internally */
	error = chflags1(vp, uap->flags, vfs_context_current());

	file_drop(uap->fd);
	return error;
}
7701
7702 /*
7703 * Change security information on a filesystem object.
7704 *
7705 * Returns: 0 Success
7706 * EPERM Operation not permitted
7707 * vnode_authattr:??? [anything vnode_authattr can return]
7708 * vnode_authorize:??? [anything vnode_authorize can return]
7709 * vnode_setattr:??? [anything vnode_setattr can return]
7710 *
7711 * Notes: If vnode_authattr or vnode_authorize return EACCES, it will be
7712 * translated to EPERM before being returned.
7713 */
static int
chmod_vnode(vfs_context_t ctx, vnode_t vp, struct vnode_attr *vap)
{
	kauth_action_t action;
	int error;

	AUDIT_ARG(mode, vap->va_mode);
	/* XXX audit new args */

#if NAMEDSTREAMS
	/* chmod calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		return EPERM;
	}
#endif

#if CONFIG_MACF
	/* MAC pre-checks, one per kind of change requested: mode, owner, ACL. */
	if (VATTR_IS_ACTIVE(vap, va_mode) &&
	    (error = mac_vnode_check_setmode(ctx, vp, (mode_t)vap->va_mode)) != 0) {
		return error;
	}

	if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid)) {
		if ((error = mac_vnode_check_setowner(ctx, vp,
		    VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : -1,
		    VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : -1))) {
			return error;
		}
	}

	if (VATTR_IS_ACTIVE(vap, va_acl) &&
	    (error = mac_vnode_check_setacl(ctx, vp, vap->va_acl))) {
		return error;
	}
#endif

	/* make sure that the caller is allowed to set this security information */
	if (((error = vnode_authattr(vp, vap, &action, ctx)) != 0) ||
	    ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		/* permissions failure must surface as EPERM, not EACCES */
		if (error == EACCES) {
			error = EPERM;
		}
		return error;
	}

	if ((error = vnode_setattr(vp, vap, ctx)) != 0) {
		return error;
	}

#if CONFIG_MACF
	/* Notify MAC modules only after the changes were actually applied. */
	if (VATTR_IS_ACTIVE(vap, va_mode)) {
		mac_vnode_notify_setmode(ctx, vp, (mode_t)vap->va_mode);
	}

	if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid)) {
		mac_vnode_notify_setowner(ctx, vp,
		    VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : -1,
		    VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : -1);
	}

	if (VATTR_IS_ACTIVE(vap, va_acl)) {
		mac_vnode_notify_setacl(ctx, vp, vap->va_acl);
	}
#endif

	return error;
}
7781
7782
7783 /*
7784 * Change mode of a file given a path name.
7785 *
7786 * Returns: 0 Success
7787 * namei:??? [anything namei can return]
7788 * chmod_vnode:??? [anything chmod_vnode can return]
7789 */
static int
chmodat(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap,
    int fd, int flag, enum uio_seg segflg)
{
	struct nameidata nd;
	int follow, error;
	uint32_t wantparent = 0;

#if CONFIG_FILE_LEASES
	/* Ask namei for the parent too so any directory lease can be broken. */
	wantparent = WANTPARENT;
#endif

	/* Either NOFOLLOW flag suppresses following the final symlink. */
	follow = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) ? NOFOLLOW : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1 | wantparent,
	    segflg, path, ctx);
	if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
		/* Refuse to traverse symlinks anywhere in the path. */
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}
	if ((error = nameiat(&nd, fd))) {
		return error;
	}

#if CONFIG_FILE_LEASES
	/* Break any lease on the parent directory, then drop its iocount. */
	vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
	vnode_put(nd.ni_dvp);
#endif

	error = chmod_vnode(ctx, nd.ni_vp, vap);
	vnode_put(nd.ni_vp);
	nameidone(&nd);
	return error;
}
7822
/*
 * Translate chmod_extended/fchmod_extended arguments into a vnode_attr.
 * On success, *pxsecdst may hold a copied-in filesec which the caller is
 * responsible for releasing with kauth_filesec_free().
 */
static int
chmod_extended_init(struct vnode_attr *pva, kauth_filesec_t *pxsecdst, int mode, uid_t uid,
    gid_t gid, user_addr_t xsecurity)
{
	int error;

	VATTR_INIT(pva);

	if (mode != -1) {
		VATTR_SET(pva, va_mode, mode & ALLPERMS);
	} else {
		/* mode of -1 means "leave the mode alone": clear without activating */
		pva->va_mode = 0;
	}

	if (uid != KAUTH_UID_NONE) {
		VATTR_SET(pva, va_uid, uid);
	}

	if (gid != KAUTH_GID_NONE) {
		VATTR_SET(pva, va_gid, gid);
	}

	*pxsecdst = NULL;
	switch (xsecurity) {
	case USER_ADDR_NULL:
		/* no ACL change requested */
		break;

	case CAST_USER_ADDR_T((void *)1):       /* _FILESEC_REMOVE_ACL */
		/* sentinel address: delete any existing ACL */
		VATTR_SET(pva, va_acl, NULL);
		break;

	default:
		/* copy in the caller-supplied filesec; caller frees *pxsecdst */
		if ((error = kauth_copyinfilesec(xsecurity, pxsecdst)) != 0) {
			return error;
		}

		VATTR_SET(pva, va_acl, &(*pxsecdst)->fsec_acl);
		pva->va_vaflags |= VA_FILESEC_ACL;
		KAUTH_DEBUG("CHMOD - setting ACL with %d entries", pva->va_acl->acl_entrycount);
		break;
	}

	return 0;
}
7867
7868 /*
7869 * chmod_extended: Change the mode of a file given a path name; with extended
7870 * argument list (including extended security (ACL)).
7871 *
7872 * Parameters: p Process requesting the open
7873 * uap User argument descriptor (see below)
7874 * retval (ignored)
7875 *
7876 * Indirect: uap->path Path to object (same as 'chmod')
7877 * uap->uid UID to set
7878 * uap->gid GID to set
7879 * uap->mode File mode to set (same as 'chmod')
7880 * uap->xsecurity ACL to set (or delete)
7881 *
7882 * Returns: 0 Success
7883 * !0 errno value
7884 *
7885 * Notes: The kauth_filesec_t in 'va', if any, is in host byte order.
7886 *
 * XXX: We should enumerate the possible errno values here, and where
7888 * in the code they originated.
7889 */
int
chmod_extended(__unused proc_t p, struct chmod_extended_args *uap, __unused int32_t *retval)
{
	int error;
	struct vnode_attr va;
	kauth_filesec_t xsecdst = NULL;

	AUDIT_ARG(owner, uap->uid, uap->gid);

	/* Build the vnode_attr (and possibly copy in an ACL) from the args. */
	error = chmod_extended_init(&va, &xsecdst, uap->mode, uap->uid,
	    uap->gid, uap->xsecurity);

	if (error) {
		return error;
	}

	error = chmodat(vfs_context_current(), uap->path, &va, AT_FDCWD, 0,
	    UIO_USERSPACE);

	/* Release the filesec copied in by chmod_extended_init(), if any. */
	if (xsecdst != NULL) {
		kauth_filesec_free(xsecdst);
	}
	return error;
}
7914
7915 /*
7916 * Returns: 0 Success
7917 * chmodat:??? [anything chmodat can return]
7918 */
7919 static int
fchmodat_internal(vfs_context_t ctx,user_addr_t path,int mode,int fd,int flag,enum uio_seg segflg)7920 fchmodat_internal(vfs_context_t ctx, user_addr_t path, int mode, int fd,
7921 int flag, enum uio_seg segflg)
7922 {
7923 struct vnode_attr va;
7924
7925 VATTR_INIT(&va);
7926 VATTR_SET(&va, va_mode, mode & ALLPERMS);
7927
7928 return chmodat(ctx, path, &va, fd, flag, segflg);
7929 }
7930
7931 int
chmod(__unused proc_t p,struct chmod_args * uap,__unused int32_t * retval)7932 chmod(__unused proc_t p, struct chmod_args *uap, __unused int32_t *retval)
7933 {
7934 return fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
7935 AT_FDCWD, 0, UIO_USERSPACE);
7936 }
7937
7938 int
fchmodat(__unused proc_t p,struct fchmodat_args * uap,__unused int32_t * retval)7939 fchmodat(__unused proc_t p, struct fchmodat_args *uap, __unused int32_t *retval)
7940 {
7941 if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) {
7942 return EINVAL;
7943 }
7944
7945 return fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
7946 uap->fd, uap->flag, UIO_USERSPACE);
7947 }
7948
7949 /*
7950 * Change mode of a file given a file descriptor.
7951 */
static int
fchmod1(__unused proc_t p, int fd, struct vnode_attr *vap)
{
	vnode_t vp;
	int error;

	AUDIT_ARG(fd, fd);

	/* Resolve the descriptor to its vnode; pairs with file_drop() below. */
	if ((error = file_vnode(fd, &vp)) != 0) {
		return error;
	}
	/* Take an iocount; released by vnode_put() below. */
	if ((error = vnode_getwithref(vp)) != 0) {
		file_drop(fd);
		return error;
	}
	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

#if CONFIG_FILE_LEASES
	/* Break any lease on the parent directory before the mode change. */
	vnode_breakdirlease(vp, true, O_WRONLY);
#endif

	error = chmod_vnode(vfs_context_current(), vp, vap);
	(void)vnode_put(vp);
	file_drop(fd);

	return error;
}
7979
7980 /*
7981 * fchmod_extended: Change mode of a file given a file descriptor; with
7982 * extended argument list (including extended security (ACL)).
7983 *
7984 * Parameters: p Process requesting to change file mode
7985 * uap User argument descriptor (see below)
7986 * retval (ignored)
7987 *
7988 * Indirect: uap->mode File mode to set (same as 'chmod')
7989 * uap->uid UID to set
7990 * uap->gid GID to set
7991 * uap->xsecurity ACL to set (or delete)
7992 * uap->fd File descriptor of file to change mode
7993 *
7994 * Returns: 0 Success
7995 * !0 errno value
7996 *
7997 */
int
fchmod_extended(proc_t p, struct fchmod_extended_args *uap, __unused int32_t *retval)
{
	int error;
	struct vnode_attr va;
	kauth_filesec_t xsecdst = NULL;

	AUDIT_ARG(owner, uap->uid, uap->gid);

	/* Build the vnode_attr (and possibly copy in an ACL) from the args. */
	error = chmod_extended_init(&va, &xsecdst, uap->mode, uap->uid,
	    uap->gid, uap->xsecurity);

	if (error) {
		return error;
	}

	error = fchmod1(p, uap->fd, &va);

	/* Release the filesec copied in by chmod_extended_init(), if any. */
	if (xsecdst != NULL) {
		kauth_filesec_free(xsecdst);
	}
	return error;
}
8021
8022 int
fchmod(proc_t p,struct fchmod_args * uap,__unused int32_t * retval)8023 fchmod(proc_t p, struct fchmod_args *uap, __unused int32_t *retval)
8024 {
8025 struct vnode_attr va;
8026
8027 VATTR_INIT(&va);
8028 VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
8029
8030 return fchmod1(p, uap->fd, &va);
8031 }
8032
/*
 * Change ownership of a vnode; a uid/gid of VNOVAL leaves that id unchanged.
 */
static int
vn_chown_internal(__unused vfs_context_t ctx, vnode_t vp, uid_t uid, gid_t gid)
{
	struct vnode_attr va;
	kauth_action_t action;
	int error;

	VATTR_INIT(&va);
	if (uid != (uid_t)VNOVAL) {
		VATTR_SET(&va, va_uid, uid);
	}
	if (gid != (gid_t)VNOVAL) {
		VATTR_SET(&va, va_gid, gid);
	}

#if NAMEDSTREAMS
	/* chown calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		error = EPERM;
		goto out;
	}
#endif

#if CONFIG_MACF
	error = mac_vnode_check_setowner(ctx, vp, uid, gid);
	if (error) {
		goto out;
	}
#endif

	/* preflight and authorize attribute changes */
	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
		goto out;
	}
	if (action && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		/*
		 * EACCES is only allowed from namei(); permissions failure should
		 * return EPERM, so we need to translate the error code.
		 */
		if (error == EACCES) {
			error = EPERM;
		}

		goto out;
	}

#if CONFIG_FILE_LEASES
	/* Break any lease on the parent directory before the ownership change. */
	vnode_breakdirlease(vp, true, O_WRONLY);
#endif

	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	/* Only notify MAC modules once the change actually succeeded. */
	if (error == 0) {
		mac_vnode_notify_setowner(ctx, vp, uid, gid);
	}
#endif

out:
	return error;
}
8094
8095 /*
8096 * Set ownership given a path name.
8097 */
8098 /* ARGSUSED */
static int
fchownat_internal(vfs_context_t ctx, int fd, user_addr_t path, uid_t uid,
    gid_t gid, int flag, enum uio_seg segflg)
{
	vnode_t vp;
	int error;
	struct nameidata nd;
	int follow;

	AUDIT_ARG(owner, uid, gid);

	/* Either NOFOLLOW flag suppresses following the final symlink. */
	follow = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) ? NOFOLLOW : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1, segflg, path, ctx);
	if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
		/* Refuse to traverse symlinks anywhere in the path. */
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}

	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}

	vp = nd.ni_vp;
	error = vn_chown_internal(ctx, vp, uid, gid);

	nameidone(&nd);
	vnode_put(vp);
	return error;
}
8128
8129 int
chown(__unused proc_t p,struct chown_args * uap,__unused int32_t * retval)8130 chown(__unused proc_t p, struct chown_args *uap, __unused int32_t *retval)
8131 {
8132 return fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
8133 uap->uid, uap->gid, 0, UIO_USERSPACE);
8134 }
8135
8136 int
lchown(__unused proc_t p,struct lchown_args * uap,__unused int32_t * retval)8137 lchown(__unused proc_t p, struct lchown_args *uap, __unused int32_t *retval)
8138 {
8139 return fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
8140 uap->owner, uap->group, AT_SYMLINK_NOFOLLOW, UIO_USERSPACE);
8141 }
8142
8143 int
fchownat(__unused proc_t p,struct fchownat_args * uap,__unused int32_t * retval)8144 fchownat(__unused proc_t p, struct fchownat_args *uap, __unused int32_t *retval)
8145 {
8146 if (uap->flag & ~AT_SYMLINK_NOFOLLOW) {
8147 return EINVAL;
8148 }
8149
8150 return fchownat_internal(vfs_context_current(), uap->fd, uap->path,
8151 uap->uid, uap->gid, uap->flag, UIO_USERSPACE);
8152 }
8153
8154 /*
8155 * Set ownership given a file descriptor.
8156 */
8157 /* ARGSUSED */
int
fchown(__unused proc_t p, struct fchown_args *uap, __unused int32_t *retval)
{
	vfs_context_t ctx = vfs_context_current();
	vnode_t vp;
	int error;

	AUDIT_ARG(owner, uap->uid, uap->gid);
	AUDIT_ARG(fd, uap->fd);

	/* Resolve the descriptor to its vnode; pairs with file_drop() below. */
	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}

	/* Take an iocount; released by vnode_put() below. */
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}
	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	error = vn_chown_internal(ctx, vp, uap->uid, uap->gid);

	(void)vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
8184
/*
 * Fetch utimes(2)-style timevals from user space into tsp[0] (access) and
 * tsp[1] (modification). A NULL user pointer yields the current time.
 */
static int
getutimes(user_addr_t usrtvp, struct timespec *tsp)
{
	int error;

	if (usrtvp == USER_ADDR_NULL) {
		struct timeval old_tv;
		/* XXX Y2038 bug because of microtime argument */
		microtime(&old_tv);
		TIMEVAL_TO_TIMESPEC(&old_tv, &tsp[0]);
		tsp[1] = tsp[0];
	} else {
		/* Copy in the two timevals using the caller's ABI layout. */
		if (IS_64BIT_PROCESS(current_proc())) {
			struct user64_timeval tv[2];
			error = copyin(usrtvp, (void *)tv, sizeof(tv));
			if (error) {
				return error;
			}
			TIMEVAL64_TO_TIMESPEC(&tv[0], &tsp[0]);
			TIMEVAL64_TO_TIMESPEC(&tv[1], &tsp[1]);
		} else {
			struct user32_timeval tv[2];
			error = copyin(usrtvp, (void *)tv, sizeof(tv));
			if (error) {
				return error;
			}
			TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]);
			TIMEVAL_TO_TIMESPEC(&tv[1], &tsp[1]);
		}
	}
	return 0;
}
8217
/*
 * Apply access/modification times to a vnode; nullflag is set when the
 * caller passed a NULL timeval pointer (i.e. "set both times to now").
 */
static int
setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts,
    int nullflag)
{
	int error;
	struct vnode_attr va;
	kauth_action_t action;

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	VATTR_INIT(&va);
	VATTR_SET(&va, va_access_time, ts[0]);
	VATTR_SET(&va, va_modify_time, ts[1]);
	if (nullflag) {
		/* tell the FS the caller passed NULL timevals */
		va.va_vaflags |= VA_UTIMES_NULL;
	}

#if NAMEDSTREAMS
	/* utimes calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		error = EPERM;
		goto out;
	}
#endif

#if CONFIG_MACF
	error = mac_vnode_check_setutimes(ctx, vp, ts[0], ts[1]);
	if (error) {
		goto out;
	}
#endif
	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
		/* when explicit times were supplied, map EACCES to EPERM */
		if (!nullflag && error == EACCES) {
			error = EPERM;
		}
		goto out;
	}

	/* since we may not need to auth anything, check here */
	if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		if (!nullflag && error == EACCES) {
			error = EPERM;
		}
		goto out;
	}
	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	/* Only notify MAC modules once the change actually succeeded. */
	if (error == 0) {
		mac_vnode_notify_setutimes(ctx, vp, ts[0], ts[1]);
	}
#endif

out:
	return error;
}
8274
8275 /*
8276 * Set the access and modification times of a file.
8277 */
8278 /* ARGSUSED */
int
utimes(__unused proc_t p, struct utimes_args *uap, __unused int32_t *retval)
{
	struct timespec ts[2];
	user_addr_t usrtvp;
	int error;
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();
	uint32_t wantparent = 0;

#if CONFIG_FILE_LEASES
	/* Ask namei for the parent too so any directory lease can be broken. */
	wantparent = WANTPARENT;
#endif

	/*
	 * AUDIT: Needed to change the order of operations to do the
	 * name lookup first because auditing wants the path.
	 */
	NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1 | wantparent,
	    UIO_USERSPACE, uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}

	/*
	 * Fetch the user-supplied time. If usrtvp is USER_ADDR_NULL, we fetch
	 * the current time instead.
	 */
	usrtvp = uap->tptr;
	if ((error = getutimes(usrtvp, ts)) != 0) {
		goto out;
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
#endif

	error = setutimes(ctx, nd.ni_vp, ts, usrtvp == USER_ADDR_NULL);

out:
#if CONFIG_FILE_LEASES
	/* Drop the parent iocount obtained via WANTPARENT. */
	vnode_put(nd.ni_dvp);
#endif
	nameidone(&nd);
	vnode_put(nd.ni_vp);
	return error;
}
8327
8328 /*
8329 * Set the access and modification times of a file.
8330 */
8331 /* ARGSUSED */
int
futimes(__unused proc_t p, struct futimes_args *uap, __unused int32_t *retval)
{
	struct timespec ts[2];
	vnode_t vp;
	user_addr_t usrtvp;
	int error;

	AUDIT_ARG(fd, uap->fd);
	/* Copy in (or synthesize) the new timestamps before touching the fd. */
	usrtvp = uap->tptr;
	if ((error = getutimes(usrtvp, ts)) != 0) {
		return error;
	}
	if ((error = file_vnode(uap->fd, &vp)) != 0) {
		return error;
	}
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

#if CONFIG_FILE_LEASES
	/* Break any lease on the parent directory before the update. */
	vnode_breakdirlease(vp, true, O_WRONLY);
#endif

	error = setutimes(vfs_context_current(), vp, ts, usrtvp == 0);

	vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
8363
8364 static int
truncate_validate_common(proc_t p,off_t length)8365 truncate_validate_common(proc_t p, off_t length)
8366 {
8367 rlim_t fsize_limit;
8368
8369 if (length < 0) {
8370 return EINVAL;
8371 }
8372
8373 fsize_limit = proc_limitgetcur(p, RLIMIT_FSIZE);
8374 if ((rlim_t)length > fsize_limit) {
8375 psignal(p, SIGXFSZ);
8376 return EFBIG;
8377 }
8378
8379 return 0;
8380 }
8381
/*
 * Set the data size of a vnode to `length`. need_auth is false when the
 * caller (ftruncate) already authorized write access at open time.
 */
static int
truncate_internal(vnode_t vp, off_t length, kauth_cred_t cred,
    vfs_context_t ctx, boolean_t need_auth)
{
	struct vnode_attr va;
	kauth_action_t action;
	int error;

	VATTR_INIT(&va);
	VATTR_SET(&va, va_data_size, length);

#if CONFIG_MACF
	error = mac_vnode_check_truncate(ctx, cred, vp);
	if (error) {
		return error;
	}
#endif

	/*
	 * If we reached here from `ftruncate` then we already did an effective
	 * `vnode_authorize` upon open. We honour the result from then.
	 */
	if (need_auth) {
		if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
			return error;
		}

		if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
			return error;
		}
	}

#if CONFIG_FILE_LEASES
	/* Check if there is a lease placed on the parent directory. */
	vnode_breakdirlease(vp, true, O_WRONLY);

	/* Now check if there is a lease placed on the file itself. */
	(void)vnode_breaklease(vp, O_WRONLY, ctx);
#endif

	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	/* Only notify MAC modules if the truncate actually succeeded. */
	if (error == 0) {
		mac_vnode_notify_truncate(ctx, cred, vp);
	}
#endif

	return error;
}
8432
8433 /*
8434 * Truncate a file given its path name.
8435 */
8436 /* ARGSUSED */
int
truncate(proc_t p, struct truncate_args *uap, __unused int32_t *retval)
{
	vfs_context_t ctx = vfs_context_current();
	vnode_t vp;
	int error;
	struct nameidata nd;

	/* Reject negative lengths and enforce RLIMIT_FSIZE up front. */
	if ((error = truncate_validate_common(p, uap->length))) {
		return error;
	}

	NDINIT(&nd, LOOKUP, OP_TRUNCATE, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);

	if ((error = namei(&nd))) {
		return error;
	}

	vp = nd.ni_vp;
	nameidone(&nd);

	/* need_auth == true: a path-based truncate must authorize here. */
	error = truncate_internal(vp, uap->length, NOCRED, ctx, true);
	vnode_put(vp);

	return error;
}
8464
8465 /*
8466 * Truncate a file given a file descriptor.
8467 */
8468 /* ARGSUSED */
int
ftruncate(proc_t p, struct ftruncate_args *uap, int32_t *retval)
{
	vnode_t vp;
	struct fileproc *fp;
	int error;

	AUDIT_ARG(fd, uap->fd);

	/* Reject negative lengths and enforce RLIMIT_FSIZE up front. */
	if ((error = truncate_validate_common(p, uap->length))) {
		return error;
	}

	if ((error = fp_lookup(p, uap->fd, &fp, 0))) {
		return error;
	}

	switch (FILEGLOB_DTYPE(fp->fp_glob)) {
	case DTYPE_PSXSHM:
		/* POSIX shared memory objects have their own truncate path. */
		error = pshm_truncate(p, fp, uap->fd, uap->length, retval);
		goto out;
	case DTYPE_VNODE:
		break;
	default:
		/* anything else (sockets, pipes, ...) cannot be truncated */
		error = EINVAL;
		goto out;
	}

	vp = (vnode_t)fp_get_data(fp);

	/* The descriptor must have been opened for writing. */
	if ((fp->fp_glob->fg_flag & FWRITE) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EINVAL;
		goto out;
	}

	if ((error = vnode_getwithref(vp)) != 0) {
		goto out;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	/* need_auth == false: write access was authorized at open time. */
	error = truncate_internal(vp, uap->length, fp->fp_glob->fg_cred,
	    vfs_context_current(), false);
	vnode_put(vp);

out:
	file_drop(uap->fd);
	return error;
}
8519
8520
8521 /*
8522 * Sync an open file with synchronized I/O _file_ integrity completion
8523 */
8524 /* ARGSUSED */
int
fsync(proc_t p, struct fsync_args *uap, __unused int32_t *retval)
{
	/* fsync(2) is a pthread cancellation point. */
	__pthread_testcancel(1);
	/* MNT_WAIT requests full file integrity (data and metadata). */
	return fsync_common(p, uap, MNT_WAIT);
}
8531
8532
8533 /*
8534 * Sync an open file with synchronized I/O _file_ integrity completion
8535 *
8536 * Notes: This is a legacy support function that does not test for
8537 * thread cancellation points.
8538 */
8539 /* ARGSUSED */
int
fsync_nocancel(proc_t p, struct fsync_nocancel_args *uap, __unused int32_t *retval)
{
	/* Legacy variant: same as fsync(2) without the cancellation-point test. */
	return fsync_common(p, (struct fsync_args *)uap, MNT_WAIT);
}
8545
8546
8547 /*
8548 * Sync an open file with synchronized I/O _data_ integrity completion
8549 */
8550 /* ARGSUSED */
int
fdatasync(proc_t p, struct fdatasync_args *uap, __unused int32_t *retval)
{
	/* fdatasync(2) is a pthread cancellation point. */
	__pthread_testcancel(1);
	/* MNT_DWAIT requests data integrity only (no unrelated metadata). */
	return fsync_common(p, (struct fsync_args *)uap, MNT_DWAIT);
}
8557
8558
8559 /*
8560 * fsync_common
8561 *
8562 * Common fsync code to support both synchronized I/O file integrity completion
8563 * (normal fsync) and synchronized I/O data integrity completion (fdatasync).
8564 *
8565 * If 'flags' is MNT_DWAIT, the caller is requesting data integrity, which
8566 * will only guarantee that the file data contents are retrievable. If
 * 'flags' is MNT_WAIT, the caller is requesting file integrity, which also
8568 * includes additional metadata unnecessary for retrieving the file data
8569 * contents, such as atime, mtime, ctime, etc., also be committed to stable
8570 * storage.
8571 *
8572 * Parameters: p The process
8573 * uap->fd The descriptor to synchronize
8574 * flags The data integrity flags
8575 *
8576 * Returns: int Success
8577 * fp_getfvp:EBADF Bad file descriptor
8578 * fp_getfvp:ENOTSUP fd does not refer to a vnode
8579 * VNOP_FSYNC:??? unspecified
8580 *
8581 * Notes: We use struct fsync_args because it is a short name, and all
8582 * caller argument structures are otherwise identical.
8583 */
static int
fsync_common(proc_t p, struct fsync_args *uap, int flags)
{
	vnode_t vp;
	struct fileproc *fp;
	vfs_context_t ctx = vfs_context_current();
	int error;

	AUDIT_ARG(fd, uap->fd);

	/* Resolve the descriptor to fileproc + vnode; pairs with file_drop(). */
	if ((error = fp_getfvp(p, uap->fd, &fp, &vp))) {
		return error;
	}
	/* Take an iocount; released by vnode_put() below. */
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	error = VNOP_FSYNC(vp, flags, ctx);

#if NAMEDRSRCFORK
	/* Sync resource fork shadow file if necessary. */
	if ((error == 0) &&
	    (vp->v_flag & VISNAMEDSTREAM) &&
	    (vp->v_parent != NULLVP) &&
	    vnode_isshadow(vp) &&
	    (fp->fp_glob->fg_flag & FWASWRITTEN)) {
		(void) vnode_flushnamedstream(vp->v_parent, vp, ctx);
	}
#endif

	(void)vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
8621
8622 /*
8623 * Duplicate files. Source must be a file, target must be a file or
8624 * must not exist.
8625 *
8626 * XXX Copyfile authorisation checking is woefully inadequate, and will not
8627 * perform inheritance correctly.
8628 */
8629 /* ARGSUSED */
int
copyfile(__unused proc_t p, struct copyfile_args *uap, __unused int32_t *retval)
{
	vnode_t tvp, fvp, tdvp, sdvp;
	struct nameidata fromnd, tond;
	int error;
	vfs_context_t ctx = vfs_context_current();

	/* Check that the flags are valid. */
	if (uap->flags & ~CPF_MASK) {
		return EINVAL;
	}

	/* Look up the source. */
	NDINIT(&fromnd, LOOKUP, OP_COPYFILE, AUDITVNPATH1,
	    UIO_USERSPACE, uap->from, ctx);
	if ((error = namei(&fromnd))) {
		return error;
	}
	fvp = fromnd.ni_vp;

	/* Look up the target with CREATE intent (it may not exist yet). */
	NDINIT(&tond, CREATE, OP_LINK,
	    LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART | AUDITVNPATH2 | CN_NBMOUNTLOOK,
	    UIO_USERSPACE, uap->to, ctx);
	if ((error = namei(&tond))) {
		goto out1;
	}
	tdvp = tond.ni_dvp;
	tvp = tond.ni_vp;

	/* An existing target is only allowed with CPF_OVERWRITE. */
	if (tvp != NULL) {
		if (!(uap->flags & CPF_OVERWRITE)) {
			error = EEXIST;
			goto out;
		}
	}

	/* Directories can be neither source nor target. */
	if (fvp->v_type == VDIR || (tvp && tvp->v_type == VDIR)) {
		error = EISDIR;
		goto out;
	}

	if (fvp->v_type == VSOCK && fvp->v_tag != VT_FDESC) {
		error = EOPNOTSUPP;
		goto out;
	}

#if CONFIG_MACF
	if ((error = mac_vnode_check_copyfile(ctx, tdvp, tvp, fvp, &tond.ni_cnd, (mode_t)uap->mode, uap->flags)) != 0) {
		goto out;
	}
#endif /* CONFIG_MACF */

	/* Need read on source, delete on an existing target, add on its dir. */
	if ((error = vnode_authorize(fvp, NULL, KAUTH_VNODE_READ_DATA, ctx)) != 0) {
		goto out;
	}
	if (tvp) {
		if ((error = vnode_authorize(tvp, tdvp, KAUTH_VNODE_DELETE, ctx)) != 0) {
			goto out;
		}
	}
	if ((error = vnode_authorize(tdvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
		goto out;
	}

	if (fvp == tdvp) {
		error = EINVAL;
	}
	/*
	 * If source is the same as the destination (that is the
	 * same inode number) then there is nothing to do.
	 * (fixed to have POSIX semantics - CSM 3/2/98)
	 */
	if (fvp == tvp) {
		/* -1 is a sentinel, translated to success at the bottom */
		error = -1;
	}

#if CONFIG_FILE_LEASES
	/* Break any lease on the target's parent before creating into it. */
	vnode_breakdirlease(tdvp, false, O_WRONLY);
#endif

	if (!error) {
		error = VNOP_COPYFILE(fvp, tdvp, tvp, &tond.ni_cnd, uap->mode, uap->flags, ctx);
	}
out:
	sdvp = tond.ni_startdir;
	/*
	 * nameidone has to happen before we vnode_put(tdvp)
	 * since it may need to release the fs_nodelock on the tdvp
	 */
	nameidone(&tond);

	if (tvp) {
		vnode_put(tvp);
	}
	vnode_put(tdvp);
	vnode_put(sdvp);
out1:
	vnode_put(fvp);

	nameidone(&fromnd);

	/* "source == target, nothing to do" reports success */
	if (error == -1) {
		return 0;
	}
	return error;
}
8736
8737 #define CLONE_SNAPSHOT_FALLBACKS_ENABLED 1
8738
8739 /*
8740 * Helper function for doing clones. The caller is expected to provide an
8741 * iocounted source vnode and release it.
8742 */
8743 static int
clonefile_internal(vnode_t fvp,boolean_t data_read_authorised,int dst_dirfd,user_addr_t dst,uint32_t flags,vfs_context_t ctx)8744 clonefile_internal(vnode_t fvp, boolean_t data_read_authorised, int dst_dirfd,
8745 user_addr_t dst, uint32_t flags, vfs_context_t ctx)
8746 {
8747 vnode_t tvp, tdvp;
8748 struct nameidata *tondp = NULL;
8749 int error;
8750 int follow;
8751 boolean_t free_src_acl;
8752 boolean_t attr_cleanup;
8753 enum vtype v_type;
8754 kauth_action_t action;
8755 struct componentname *cnp;
8756 uint32_t defaulted = 0;
8757 struct {
8758 struct vnode_attr va[2];
8759 } *va2p = NULL;
8760 struct vnode_attr *vap = NULL;
8761 struct vnode_attr *nvap = NULL;
8762 uint32_t vnop_flags;
8763
8764 v_type = vnode_vtype(fvp);
8765 switch (v_type) {
8766 case VLNK:
8767 /* FALLTHRU */
8768 case VREG:
8769 action = KAUTH_VNODE_ADD_FILE;
8770 break;
8771 case VDIR:
8772 if (vnode_isvroot(fvp) || vnode_ismount(fvp) ||
8773 fvp->v_mountedhere) {
8774 return EINVAL;
8775 }
8776 action = KAUTH_VNODE_ADD_SUBDIRECTORY;
8777 break;
8778 default:
8779 return EINVAL;
8780 }
8781
8782 AUDIT_ARG(fd2, dst_dirfd);
8783 AUDIT_ARG(value32, flags);
8784
8785 tondp = kalloc_type(struct nameidata, Z_WAITOK | Z_ZERO | Z_NOFAIL);
8786 follow = (flags & CLONE_NOFOLLOW) ? NOFOLLOW : FOLLOW;
8787 NDINIT(tondp, CREATE, OP_LINK, follow | WANTPARENT | AUDITVNPATH2,
8788 UIO_USERSPACE, dst, ctx);
8789 if (flags & CLONE_NOFOLLOW_ANY) {
8790 tondp->ni_flag |= NAMEI_NOFOLLOW_ANY;
8791 }
8792
8793 if ((error = nameiat(tondp, dst_dirfd))) {
8794 kfree_type(struct nameidata, tondp);
8795 return error;
8796 }
8797 cnp = &tondp->ni_cnd;
8798 tdvp = tondp->ni_dvp;
8799 tvp = tondp->ni_vp;
8800
8801 free_src_acl = FALSE;
8802 attr_cleanup = FALSE;
8803
8804 if (tvp != NULL) {
8805 error = EEXIST;
8806 goto out;
8807 }
8808
8809 if (vnode_mount(tdvp) != vnode_mount(fvp)) {
8810 error = EXDEV;
8811 goto out;
8812 }
8813
8814 #if CONFIG_MACF
8815 if ((error = mac_vnode_check_clone(ctx, tdvp, fvp, cnp))) {
8816 goto out;
8817 }
8818 #endif
8819 if ((error = vnode_authorize(tdvp, NULL, action, ctx))) {
8820 goto out;
8821 }
8822
8823 action = KAUTH_VNODE_GENERIC_READ_BITS;
8824 if (data_read_authorised) {
8825 action &= ~KAUTH_VNODE_READ_DATA;
8826 }
8827 if ((error = vnode_authorize(fvp, NULL, action, ctx))) {
8828 goto out;
8829 }
8830
8831 va2p = kalloc_type(typeof(*va2p), Z_WAITOK | Z_NOFAIL);
8832 vap = &va2p->va[0];
8833 nvap = &va2p->va[1];
8834
8835 /*
8836 * certain attributes may need to be changed from the source, we ask for
8837 * those here with the exception of source file's ACLs unless the CLONE_ACL
8838 * flag is specified. By default, the clone file will inherit the target
 * directory's ACLs unless the CLONE_ACL flag is specified, then it
8840 * will inherit the source file's ACLs instead.
8841 */
8842 VATTR_INIT(vap);
8843 VATTR_WANTED(vap, va_uid);
8844 VATTR_WANTED(vap, va_gid);
8845 VATTR_WANTED(vap, va_mode);
8846 VATTR_WANTED(vap, va_flags);
8847 if (flags & CLONE_ACL) {
8848 VATTR_WANTED(vap, va_acl);
8849 }
8850
8851 if ((error = vnode_getattr(fvp, vap, ctx)) != 0) {
8852 goto out;
8853 }
8854
8855 VATTR_INIT(nvap);
8856 VATTR_SET(nvap, va_type, v_type);
8857 if (VATTR_IS_SUPPORTED(vap, va_acl) && vap->va_acl != NULL) {
8858 VATTR_SET(nvap, va_acl, vap->va_acl);
8859 free_src_acl = TRUE;
8860 }
8861
8862 /* Handle ACL inheritance, initialize vap. */
8863 if (v_type == VLNK) {
8864 error = vnode_authattr_new(tdvp, nvap, 0, ctx);
8865 } else {
8866 error = vn_attribute_prepare(tdvp, nvap, &defaulted, ctx);
8867 if (error) {
8868 goto out;
8869 }
8870 attr_cleanup = TRUE;
8871 }
8872
8873 vnop_flags = VNODE_CLONEFILE_DEFAULT;
8874 /*
8875 * We've got initial values for all security parameters,
8876 * If we are superuser, then we can change owners to be the
8877 * same as the source. Both superuser and the owner have default
8878 * WRITE_SECURITY privileges so all other fields can be taken
8879 * from source as well.
8880 */
8881 if (!(flags & CLONE_NOOWNERCOPY) && vfs_context_issuser(ctx)) {
8882 if (VATTR_IS_SUPPORTED(vap, va_uid)) {
8883 VATTR_SET(nvap, va_uid, vap->va_uid);
8884 }
8885 if (VATTR_IS_SUPPORTED(vap, va_gid)) {
8886 VATTR_SET(nvap, va_gid, vap->va_gid);
8887 }
8888 } else {
8889 vnop_flags |= VNODE_CLONEFILE_NOOWNERCOPY;
8890 }
8891
8892 if (VATTR_IS_SUPPORTED(vap, va_mode)) {
8893 VATTR_SET(nvap, va_mode, vap->va_mode);
8894 }
8895 if (VATTR_IS_SUPPORTED(vap, va_flags)) {
8896 VATTR_SET(nvap, va_flags,
8897 ((vap->va_flags & ~(UF_DATAVAULT | SF_RESTRICTED)) | /* Turn off from source */
8898 (nvap->va_flags & (UF_DATAVAULT | SF_RESTRICTED))));
8899 }
8900
8901 #if CONFIG_FILE_LEASES
8902 vnode_breakdirlease(tdvp, false, O_WRONLY);
8903 #endif
8904
8905 error = VNOP_CLONEFILE(fvp, tdvp, &tvp, cnp, nvap, vnop_flags, ctx);
8906
8907 if (!error && tvp) {
8908 int update_flags = 0;
8909 #if CONFIG_FSE
8910 int fsevent;
8911 #endif /* CONFIG_FSE */
8912
8913 /*
8914 * If some of the requested attributes weren't handled by the
8915 * VNOP, use our fallback code.
8916 */
8917 if (!VATTR_ALL_SUPPORTED(nvap)) {
8918 (void)vnode_setattr_fallback(tvp, nvap, ctx);
8919 }
8920
8921 #if CONFIG_MACF
8922 (void)vnode_label(vnode_mount(tvp), tdvp, tvp, cnp,
8923 VNODE_LABEL_CREATE, ctx);
8924 #endif
8925
8926 // Make sure the name & parent pointers are hooked up
8927 if (tvp->v_name == NULL) {
8928 update_flags |= VNODE_UPDATE_NAME;
8929 }
8930 if (tvp->v_parent == NULLVP) {
8931 update_flags |= VNODE_UPDATE_PARENT;
8932 }
8933
8934 if (update_flags) {
8935 (void)vnode_update_identity(tvp, tdvp, cnp->cn_nameptr,
8936 cnp->cn_namelen, cnp->cn_hash, update_flags);
8937 }
8938
8939 #if CONFIG_FSE
8940 switch (vnode_vtype(tvp)) {
8941 case VLNK:
8942 /* FALLTHRU */
8943 case VREG:
8944 fsevent = FSE_CREATE_FILE;
8945 break;
8946 case VDIR:
8947 fsevent = FSE_CREATE_DIR;
8948 break;
8949 default:
8950 goto out;
8951 }
8952
8953 if (need_fsevent(fsevent, tvp)) {
8954 /*
8955 * The following is a sequence of three explicit events.
8956 * A pair of FSE_CLONE events representing the source and destination
8957 * followed by an FSE_CREATE_[FILE | DIR] for the destination.
8958 * fseventsd may coalesce the destination clone and create events
8959 * into a single event resulting in the following sequence for a client
8960 * FSE_CLONE (src)
8961 * FSE_CLONE | FSE_CREATE (dst)
8962 */
8963 add_fsevent(FSE_CLONE, ctx, FSE_ARG_VNODE, fvp, FSE_ARG_VNODE, tvp,
8964 FSE_ARG_DONE);
8965 add_fsevent(fsevent, ctx, FSE_ARG_VNODE, tvp,
8966 FSE_ARG_DONE);
8967 }
8968 #endif /* CONFIG_FSE */
8969 }
8970
8971 out:
8972 if (attr_cleanup) {
8973 vn_attribute_cleanup(nvap, defaulted);
8974 }
8975 if (free_src_acl && vap->va_acl) {
8976 kauth_acl_free(vap->va_acl);
8977 }
8978 if (va2p) {
8979 kfree_type(typeof(*va2p), va2p);
8980 }
8981 nameidone(tondp);
8982 kfree_type(struct nameidata, tondp);
8983 if (tvp) {
8984 vnode_put(tvp);
8985 }
8986 vnode_put(tdvp);
8987 return error;
8988 }
8989
8990 /*
8991 * clone files or directories, target must not exist.
8992 */
8993 /* ARGSUSED */
8994 int
clonefileat(__unused proc_t p,struct clonefileat_args * uap,__unused int32_t * retval)8995 clonefileat(__unused proc_t p, struct clonefileat_args *uap,
8996 __unused int32_t *retval)
8997 {
8998 vnode_t fvp;
8999 struct nameidata *ndp = NULL;
9000 int follow;
9001 int error;
9002 vfs_context_t ctx = vfs_context_current();
9003
9004 /* Check that the flags are valid. */
9005 if (uap->flags & ~(CLONE_NOFOLLOW | CLONE_NOOWNERCOPY | CLONE_ACL |
9006 CLONE_NOFOLLOW_ANY)) {
9007 return EINVAL;
9008 }
9009
9010 AUDIT_ARG(fd, uap->src_dirfd);
9011
9012 ndp = kalloc_type(struct nameidata, Z_WAITOK | Z_ZERO | Z_NOFAIL);
9013
9014 follow = (uap->flags & CLONE_NOFOLLOW) ? NOFOLLOW : FOLLOW;
9015 NDINIT(ndp, LOOKUP, OP_COPYFILE, follow | AUDITVNPATH1,
9016 UIO_USERSPACE, uap->src, ctx);
9017 if (uap->flags & CLONE_NOFOLLOW_ANY) {
9018 ndp->ni_flag |= NAMEI_NOFOLLOW_ANY;
9019 }
9020
9021 if ((error = nameiat(ndp, uap->src_dirfd))) {
9022 kfree_type(struct nameidata, ndp);
9023 return error;
9024 }
9025
9026 fvp = ndp->ni_vp;
9027 nameidone(ndp);
9028 kfree_type(struct nameidata, ndp);
9029
9030 error = clonefile_internal(fvp, FALSE, uap->dst_dirfd, uap->dst,
9031 uap->flags, ctx);
9032
9033 vnode_put(fvp);
9034 return error;
9035 }
9036
9037 int
fclonefileat(__unused proc_t p,struct fclonefileat_args * uap,__unused int32_t * retval)9038 fclonefileat(__unused proc_t p, struct fclonefileat_args *uap,
9039 __unused int32_t *retval)
9040 {
9041 vnode_t fvp;
9042 struct fileproc *fp;
9043 int error;
9044 vfs_context_t ctx = vfs_context_current();
9045
9046 /* Check that the flags are valid. */
9047 if (uap->flags & ~(CLONE_NOFOLLOW | CLONE_NOOWNERCOPY | CLONE_ACL |
9048 CLONE_NOFOLLOW_ANY)) {
9049 return EINVAL;
9050 }
9051
9052 AUDIT_ARG(fd, uap->src_fd);
9053 error = fp_getfvp(p, uap->src_fd, &fp, &fvp);
9054 if (error) {
9055 return error;
9056 }
9057
9058 if ((fp->fp_glob->fg_flag & FREAD) == 0) {
9059 AUDIT_ARG(vnpath_withref, fvp, ARG_VNODE1);
9060 error = EBADF;
9061 goto out;
9062 }
9063
9064 if ((error = vnode_getwithref(fvp))) {
9065 goto out;
9066 }
9067
9068 AUDIT_ARG(vnpath, fvp, ARG_VNODE1);
9069
9070 error = clonefile_internal(fvp, TRUE, uap->dst_dirfd, uap->dst,
9071 uap->flags, ctx);
9072
9073 vnode_put(fvp);
9074 out:
9075 file_drop(uap->src_fd);
9076 return error;
9077 }
9078
9079 static int
rename_submounts_callback(mount_t mp,void * arg)9080 rename_submounts_callback(mount_t mp, void *arg)
9081 {
9082 int error = 0;
9083 mount_t pmp = (mount_t)arg;
9084 int prefix_len = (int)strlen(pmp->mnt_vfsstat.f_mntonname);
9085
9086 if (strncmp(mp->mnt_vfsstat.f_mntonname, pmp->mnt_vfsstat.f_mntonname, prefix_len) != 0) {
9087 return 0;
9088 }
9089
9090 if (mp->mnt_vfsstat.f_mntonname[prefix_len] != '/') {
9091 return 0;
9092 }
9093
9094 if ((error = vfs_busy(mp, LK_NOWAIT))) {
9095 printf("vfs_busy failed with %d for %s\n", error, mp->mnt_vfsstat.f_mntonname);
9096 return -1;
9097 }
9098
9099 size_t pathlen = MAXPATHLEN;
9100 if ((error = vn_getpath_ext(mp->mnt_vnodecovered, NULL, mp->mnt_vfsstat.f_mntonname, &pathlen, VN_GETPATH_FSENTER))) {
9101 printf("vn_getpath_ext failed with %d for mnt_vnodecovered of %s\n", error, mp->mnt_vfsstat.f_mntonname);
9102 }
9103
9104 vfs_unbusy(mp);
9105
9106 return error;
9107 }
9108
9109 /*
9110 * Rename files. Source and destination must either both be directories,
9111 * or both not be directories. If target is a directory, it must be empty.
9112 */
9113 /* ARGSUSED */
/*
 * Common implementation for rename(2), renameat(2) and renameatx_np(2).
 *
 * Parameters:
 *	ctx	caller's VFS context (credentials)
 *	fromfd	directory fd the 'from' path is resolved against (AT_FDCWD ok)
 *	from	source pathname (address space given by segflg)
 *	tofd	directory fd the 'to' path is resolved against (AT_FDCWD ok)
 *	to	destination pathname
 *	segflg	UIO segment the pathnames live in
 *	uflags	RENAME_* flags; the VFS_RENAME_FLAGS_MASK subset is passed
 *		down to the filesystem, RENAME_NOFOLLOW_ANY controls lookup.
 *
 * Returns: 0 on success, an errno value otherwise.
 */
static int
renameat_internal(vfs_context_t ctx, int fromfd, user_addr_t from,
    int tofd, user_addr_t to, int segflg, u_int uflags)
{
	vnode_t tvp, tdvp;
	vnode_t fvp, fdvp;
	vnode_t mnt_fvp;
	struct nameidata *fromnd, *tond;
	int error = 0;
	int do_retry;
	int retry_count;
	int mntrename;
	int need_event;
	int need_kpath2;
	int has_listeners;
	const char *oname = NULL;
	char *from_name = NULL, *to_name = NULL;
	char *from_name_no_firmlink = NULL, *to_name_no_firmlink = NULL;
	int from_len = 0, to_len = 0;
	int from_len_no_firmlink = 0, to_len_no_firmlink = 0;
	int holding_mntlock;
	int vn_authorize_skipped;
	mount_t locked_mp = NULL;
	vnode_t oparent = NULLVP;
	vnode_t locked_vp = NULLVP;
#if CONFIG_FSE
	fse_info from_finfo = {}, to_finfo;
#endif
	int from_truncated = 0, to_truncated = 0;
	int from_truncated_no_firmlink = 0, to_truncated_no_firmlink = 0;
	int batched = 0;
	struct vnode_attr *fvap, *tvap;
	int continuing = 0;
	vfs_rename_flags_t flags = uflags & VFS_RENAME_FLAGS_MASK;
	int32_t nofollow_any = 0;
	/* carving out a chunk for structs that are too big to be on stack. */
	struct {
		struct nameidata from_node, to_node;
		struct vnode_attr fv_attr, tv_attr;
	} * __rename_data;

	__rename_data = kalloc_type(typeof(*__rename_data), Z_WAITOK);
	fromnd = &__rename_data->from_node;
	tond = &__rename_data->to_node;

	holding_mntlock = 0;
	do_retry = 0;
	retry_count = 0;
retry:
	/* Reset per-attempt state; we come back here to re-drive the rename. */
	fvp = tvp = NULL;
	fdvp = tdvp = NULL;
	fvap = tvap = NULL;
	mnt_fvp = NULLVP;
	mntrename = FALSE;
	vn_authorize_skipped = FALSE;

	if (uflags & RENAME_NOFOLLOW_ANY) {
		nofollow_any = NAMEI_NOFOLLOW_ANY;
	}
	NDINIT(fromnd, DELETE, OP_UNLINK, WANTPARENT | AUDITVNPATH1,
	    segflg, from, ctx);
	fromnd->ni_flag = NAMEI_COMPOUNDRENAME | nofollow_any;

	NDINIT(tond, RENAME, OP_RENAME, WANTPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK,
	    segflg, to, ctx);
	tond->ni_flag = NAMEI_COMPOUNDRENAME | nofollow_any;

continue_lookup:
	/* Re-entered after EKEEPLOOKING from a compound VNOP. */
	if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
		if ((error = nameiat(fromnd, fromfd))) {
			goto out1;
		}
		fdvp = fromnd->ni_dvp;
		fvp = fromnd->ni_vp;

		if (fvp && fvp->v_type == VDIR) {
			tond->ni_cnd.cn_flags |= WILLBEDIR;
		}
	}

	if ((tond->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
		if ((error = nameiat(tond, tofd))) {
			/*
			 * Translate error code for rename("dir1", "dir2/.").
			 */
			if (error == EISDIR && fvp->v_type == VDIR) {
				error = EINVAL;
			}
			goto out1;
		}
		tdvp = tond->ni_dvp;
		tvp = tond->ni_vp;
	}

#if DEVELOPMENT || DEBUG
	/*
	 * XXX VSWAP: Check for entitlements or special flag here
	 * so we can restrict access appropriately.
	 */
#else /* DEVELOPMENT || DEBUG */

	/* Swap files may not be renamed except by the kernel itself. */
	if (fromnd->ni_vp && vnode_isswap(fromnd->ni_vp) && (ctx != vfs_context_kernel())) {
		error = EPERM;
		goto out1;
	}

	if (tond->ni_vp && vnode_isswap(tond->ni_vp) && (ctx != vfs_context_kernel())) {
		error = EPERM;
		goto out1;
	}
#endif /* DEVELOPMENT || DEBUG */

	/* RENAME_SWAP requires both endpoints to exist. */
	if (!tvp && ISSET(flags, VFS_RENAME_SWAP)) {
		error = ENOENT;
		goto out1;
	}

	if (tvp && ISSET(flags, VFS_RENAME_EXCL)) {
		int32_t pval = 0;
		int err = 0;

		/*
		 * We allow rename with VFS_RENAME_EXCL flag for an existing file which
		 * has the same name as target iff the following conditions are met:
		 * 1. the target file system is case insensitive
		 * 2. source and target directories are the same
		 * 3. source and target files are the same
		 * 4. name only differs in case (determined by underlying filesystem)
		 */
		if (fvp != tvp || fdvp != tdvp) {
			error = EEXIST;
			goto out1;
		}

		/*
		 * Assume that the target file system is case sensitive if
		 * _PC_CASE_SENSITIVE selector isn't supported.
		 */
		err = VNOP_PATHCONF(tvp, _PC_CASE_SENSITIVE, &pval, ctx);
		if (err != 0 || pval != 0) {
			error = EEXIST;
			goto out1;
		}
	}

	batched = vnode_compound_rename_available(fdvp);

#if CONFIG_FSE
	need_event = need_fsevent(FSE_RENAME, fdvp);
	if (need_event) {
		if (fvp) {
			get_fse_info(fvp, &from_finfo, ctx);
		} else {
			error = vfs_get_notify_attributes(&__rename_data->fv_attr);
			if (error) {
				goto out1;
			}

			fvap = &__rename_data->fv_attr;
		}

		if (tvp) {
			get_fse_info(tvp, &to_finfo, ctx);
		} else if (batched) {
			error = vfs_get_notify_attributes(&__rename_data->tv_attr);
			if (error) {
				goto out1;
			}

			tvap = &__rename_data->tv_attr;
		}
	}
#else
	need_event = 0;
#endif /* CONFIG_FSE */

	has_listeners = kauth_authorize_fileop_has_listeners();

	need_kpath2 = 0;
#if CONFIG_AUDIT
	if (AUDIT_RECORD_EXISTS()) {
		need_kpath2 = 1;
	}
#endif

	/* Build full pathnames only if someone (fsevents/kauth/audit) needs them. */
	if (need_event || has_listeners) {
		if (from_name == NULL) {
			GET_PATH(from_name);
		}

		from_len = safe_getpath(fdvp, fromnd->ni_cnd.cn_nameptr, from_name, MAXPATHLEN, &from_truncated);

		if (from_name_no_firmlink == NULL) {
			GET_PATH(from_name_no_firmlink);
		}

		from_len_no_firmlink = safe_getpath_no_firmlink(fdvp, fromnd->ni_cnd.cn_nameptr, from_name_no_firmlink, MAXPATHLEN, &from_truncated_no_firmlink);
	}

	if (need_event || need_kpath2 || has_listeners) {
		if (to_name == NULL) {
			GET_PATH(to_name);
		}

		to_len = safe_getpath(tdvp, tond->ni_cnd.cn_nameptr, to_name, MAXPATHLEN, &to_truncated);

		if (to_name_no_firmlink == NULL) {
			GET_PATH(to_name_no_firmlink);
		}

		to_len_no_firmlink = safe_getpath_no_firmlink(tdvp, tond->ni_cnd.cn_nameptr, to_name_no_firmlink, MAXPATHLEN, &to_truncated_no_firmlink);
		if (to_name && need_kpath2) {
			AUDIT_ARG(kpath, to_name, ARG_KPATH2);
		}
	}
	if (!fvp) {
		/*
		 * Claim: this check will never reject a valid rename.
		 * For success, either fvp must be on the same mount as tdvp, or fvp must sit atop a vnode on the same mount as tdvp.
		 * Suppose fdvp and tdvp are not on the same mount.
		 * If fvp is on the same mount as tdvp, then fvp is not on the same mount as fdvp, so fvp is the root of its filesystem. If fvp is the root,
		 * then you can't move it to within another dir on the same mountpoint.
		 * If fvp sits atop a vnode on the same mount as fdvp, then that vnode must be part of the same mount as fdvp, which is a contradiction.
		 *
		 * If this check passes, then we are safe to pass these vnodes to the same FS.
		 */
		if (fdvp->v_mount != tdvp->v_mount) {
			error = EXDEV;
			goto out1;
		}
		goto skipped_lookup;
	}

	/*
	 * If the source and destination are the same (i.e. they're
	 * links to the same vnode) and the target file system is
	 * case sensitive, then there is nothing to do.
	 *
	 * XXX Come back to this.
	 */
	if (fvp == tvp) {
		int pathconf_val;

		/*
		 * Note: if _PC_CASE_SENSITIVE selector isn't supported,
		 * then assume that this file system is case sensitive.
		 */
		if (VNOP_PATHCONF(fvp, _PC_CASE_SENSITIVE, &pathconf_val, ctx) != 0 ||
		    pathconf_val != 0) {
			vn_authorize_skipped = TRUE;
			goto out1;
		}
	}

	/*
	 * Allow the renaming of mount points.
	 * - target must not exist
	 * - target must reside in the same directory as source
	 * - union mounts cannot be renamed
	 * - the root fs, and tightly-linked system volumes, cannot be renamed
	 *
	 * XXX Handle this in VFS after a continued lookup (if we missed
	 * in the cache to start off)
	 *
	 * N.B. If RENAME_SWAP is being used, then @tvp != NULL and so
	 * we'll skip past here. The file system is responsible for
	 * checking that @tvp is not a descendent of @fvp and vice versa
	 * so it should always return EINVAL if either @tvp or @fvp is the
	 * root of a volume.
	 */
	if ((fvp->v_flag & VROOT) &&
	    (fvp->v_type == VDIR) &&
	    (tvp == NULL) &&
	    (fvp->v_mountedhere == NULL) &&
	    (fdvp == tdvp) &&
	    ((fvp->v_mount->mnt_flag & (MNT_UNION | MNT_ROOTFS)) == 0) &&
	    ((fvp->v_mount->mnt_kern_flag & MNTK_SYSTEM) == 0) &&
	    (fvp->v_mount->mnt_vnodecovered != NULLVP)) {
		vnode_t coveredvp;

		/* switch fvp to the covered vnode */
		coveredvp = fvp->v_mount->mnt_vnodecovered;
		if ((vnode_getwithref(coveredvp))) {
			error = ENOENT;
			goto out1;
		}
		/*
		 * Save the 'fvp' as it is needed for vn_authorize_renamex_with_paths()
		 * later.
		 */
		mnt_fvp = fvp;

		fvp = coveredvp;
		mntrename = TRUE;
	}
	/*
	 * Check for cross-device rename.
	 * For rename on mountpoint, we want to also check the source and its parent
	 * belong to the same mountpoint.
	 */
	if ((fvp->v_mount != tdvp->v_mount) ||
	    (fvp->v_mount != fdvp->v_mount) ||
	    (tvp && (fvp->v_mount != tvp->v_mount))) {
		error = EXDEV;
		goto out1;
	}

	/*
	 * If source is the same as the destination (that is the
	 * same inode number) then there is nothing to do...
	 * EXCEPT if the underlying file system supports case
	 * insensitivity and is case preserving. In this case
	 * the file system needs to handle the special case of
	 * getting the same vnode as target (fvp) and source (tvp).
	 *
	 * Only file systems that support pathconf selectors _PC_CASE_SENSITIVE
	 * and _PC_CASE_PRESERVING can have this exception, and they need to
	 * handle the special case of getting the same vnode as target and
	 * source. NOTE: Then the target is unlocked going into vnop_rename,
	 * so not to cause locking problems. There is a single reference on tvp.
	 *
	 * NOTE - that fvp == tvp also occurs if they are hard linked and
	 * that correct behaviour then is just to return success without doing
	 * anything.
	 *
	 * XXX filesystem should take care of this itself, perhaps...
	 */
	if (fvp == tvp && fdvp == tdvp) {
		if (fromnd->ni_cnd.cn_namelen == tond->ni_cnd.cn_namelen &&
		    !bcmp(fromnd->ni_cnd.cn_nameptr, tond->ni_cnd.cn_nameptr,
		    fromnd->ni_cnd.cn_namelen)) {
			vn_authorize_skipped = TRUE;
			goto out1;
		}
	}

	if (holding_mntlock && fvp->v_mount != locked_mp) {
		/*
		 * we're holding a reference and lock
		 * on locked_mp, but it no longer matches
		 * what we want to do... so drop our hold
		 */
		mount_unlock_renames(locked_mp);
		mount_drop(locked_mp, 0);
		holding_mntlock = 0;
	}
	if (tdvp != fdvp && fvp->v_type == VDIR) {
		/*
		 * serialize renames that re-shape
		 * the tree... if holding_mntlock is
		 * set, then we're ready to go...
		 * otherwise we
		 * first need to drop the iocounts
		 * we picked up, second take the
		 * lock to serialize the access,
		 * then finally start the lookup
		 * process over with the lock held
		 */
		if (!holding_mntlock) {
			/*
			 * need to grab a reference on
			 * the mount point before we
			 * drop all the iocounts... once
			 * the iocounts are gone, the mount
			 * could follow
			 */
			locked_mp = fvp->v_mount;
			mount_ref(locked_mp, 0);

			/*
			 * nameidone has to happen before we vnode_put(tvp)
			 * since it may need to release the fs_nodelock on the tvp
			 */
			nameidone(tond);

			if (tvp) {
				vnode_put(tvp);
			}
			vnode_put(tdvp);

			/*
			 * nameidone has to happen before we vnode_put(fdvp)
			 * since it may need to release the fs_nodelock on the fvp
			 */
			nameidone(fromnd);

			vnode_put(fvp);
			vnode_put(fdvp);

			if (mnt_fvp != NULLVP) {
				vnode_put(mnt_fvp);
			}

			mount_lock_renames(locked_mp);
			holding_mntlock = 1;

			goto retry;
		}
	} else {
		/*
		 * when we dropped the iocounts to take
		 * the lock, we allowed the identity of
		 * the various vnodes to change... if they did,
		 * we may no longer be dealing with a rename
		 * that reshapes the tree... once we're holding
		 * the iocounts, the vnodes can't change type
		 * so we're free to drop the lock at this point
		 * and continue on
		 */
		if (holding_mntlock) {
			mount_unlock_renames(locked_mp);
			mount_drop(locked_mp, 0);
			holding_mntlock = 0;
		}
	}

	if (!batched) {
		/* Hold the link lock across authorization and the VNOP. */
		assert(locked_vp == NULLVP);
		vnode_link_lock(fvp);
		locked_vp = fvp;
		error = vn_authorize_renamex_with_paths(fdvp, mntrename ? mnt_fvp : fvp,
		    &fromnd->ni_cnd, from_name, tdvp, tvp, &tond->ni_cnd, to_name, ctx,
		    flags, NULL);
		if (error) {
			if (error == ENOENT) {
				if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
					/*
					 * We encountered a race where after doing the namei,
					 * tvp stops being valid. If so, simply re-drive the rename
					 * call from the top.
					 */
					do_retry = 1;
					retry_count += 1;
				}
			}
			vnode_link_unlock(fvp);
			locked_vp = NULLVP;
			goto out1;
		}
	}

	/* Release the 'mnt_fvp' now that it is no longer needed. */
	if (mnt_fvp != NULLVP) {
		vnode_put(mnt_fvp);
		mnt_fvp = NULLVP;
	}

	// save these off so we can later verify that fvp is the same
	oname = fvp->v_name;
	oparent = fvp->v_parent;

skipped_lookup:
#if CONFIG_FILE_LEASES
	/* Lease break needed for source's parent dir? */
	vnode_breakdirlease(fdvp, false, O_WRONLY);

	/* Lease break needed for target's parent dir? */
	vnode_breakdirlease(tdvp, false, O_WRONLY);
#endif

	error = vn_rename(fdvp, &fvp, &fromnd->ni_cnd, fvap,
	    tdvp, &tvp, &tond->ni_cnd, tvap,
	    flags, ctx);

	if (locked_vp) {
		vnode_link_unlock(fvp);
		locked_vp = NULLVP;
	}

	if (holding_mntlock) {
		/*
		 * we can drop our serialization
		 * lock now
		 */
		mount_unlock_renames(locked_mp);
		mount_drop(locked_mp, 0);
		holding_mntlock = 0;
	}
	if (error) {
		if (error == EDATALESS) {
			/*
			 * If we've been here before, something has gone
			 * horribly wrong and we should just get out lest
			 * we spiral around the drain forever.
			 */
			if (flags & VFS_RENAME_DATALESS) {
				error = EIO;
				goto out1;
			}

			/*
			 * The object we're renaming is dataless (or has a
			 * dataless descendent) and requires materialization
			 * before the rename occurs. But we're holding the
			 * mount point's rename lock, so it's not safe to
			 * make the upcall.
			 *
			 * In this case, we release the lock (above), perform
			 * the materialization, and start the whole thing over.
			 */
			error = vfs_materialize_reparent(fvp, tdvp);
			if (error == 0) {
				/*
				 * The next time around we need to tell the
				 * file system that the materialization has
				 * been performed.
				 */
				flags |= VFS_RENAME_DATALESS;
				do_retry = 1;
			}
			goto out1;
		}
		if (error == EKEEPLOOKING) {
			/* Compound VNOP wants the lookup continued in VFS. */
			if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) == 0) {
				if ((tond->ni_flag & NAMEI_CONTLOOKUP) == 0) {
					panic("EKEEPLOOKING without NAMEI_CONTLOOKUP on either ndp?");
				}
			}

			fromnd->ni_vp = fvp;
			tond->ni_vp = tvp;

			goto continue_lookup;
		}

		/*
		 * We may encounter a race in the VNOP where the destination didn't
		 * exist when we did the namei, but it does by the time we go and
		 * try to create the entry. In this case, we should re-drive this rename
		 * call from the top again. Currently, only HFS bubbles out ERECYCLE,
		 * but other filesystems susceptible to this race could return it, too.
		 */
		if (error == ERECYCLE) {
			if (retry_count < MAX_RENAME_ERECYCLE_RETRIES) {
				do_retry = 1;
				retry_count += 1;
			} else {
				printf("rename retry limit due to ERECYCLE reached\n");
				error = ENOENT;
			}
		}

		/*
		 * For compound VNOPs, the authorization callback may return
		 * ENOENT in case of racing hardlink lookups hitting the name
		 * cache, redrive the lookup.
		 */
		if (batched && error == ENOENT) {
			if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
				do_retry = 1;
				retry_count += 1;
			}
		}

		goto out1;
	}

	/* call out to allow 3rd party notification of rename.
	 * Ignore result of kauth_authorize_fileop call.
	 */
	kauth_authorize_fileop(vfs_context_ucred(ctx),
	    KAUTH_FILEOP_RENAME,
	    (uintptr_t)from_name, (uintptr_t)to_name);
	if (flags & VFS_RENAME_SWAP) {
		/* A swap is two renames; notify for the reverse direction too. */
		kauth_authorize_fileop(vfs_context_ucred(ctx),
		    KAUTH_FILEOP_RENAME,
		    (uintptr_t)to_name, (uintptr_t)from_name);
	}

#if CONFIG_FSE
	if (from_name != NULL && to_name != NULL) {
		if (from_truncated || to_truncated) {
			// set it here since only the from_finfo gets reported up to user space
			from_finfo.mode |= FSE_TRUNCATED_PATH;
		}

		if (tvap && tvp) {
			vnode_get_fse_info_from_vap(tvp, &to_finfo, tvap);
		}
		if (fvap) {
			vnode_get_fse_info_from_vap(fvp, &from_finfo, fvap);
		}

		if (tvp) {
			add_fsevent(FSE_RENAME, ctx,
			    FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
			    FSE_ARG_FINFO, &from_finfo,
			    FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
			    FSE_ARG_FINFO, &to_finfo,
			    FSE_ARG_DONE);
			if (flags & VFS_RENAME_SWAP) {
				/*
				 * Strictly speaking, swap is the equivalent of
				 * *three* renames. FSEvents clients should only take
				 * the events as a hint, so we only bother reporting
				 * two.
				 */
				add_fsevent(FSE_RENAME, ctx,
				    FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
				    FSE_ARG_FINFO, &to_finfo,
				    FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
				    FSE_ARG_FINFO, &from_finfo,
				    FSE_ARG_DONE);
			}
		} else {
			add_fsevent(FSE_RENAME, ctx,
			    FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
			    FSE_ARG_FINFO, &from_finfo,
			    FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
			    FSE_ARG_DONE);
		}
	}
#endif /* CONFIG_FSE */

	/*
	 * update filesystem's mount point data
	 */
	if (mntrename) {
		char *cp, *pathend, *mpname;
		char * tobuf;
		struct mount *mp;
		int maxlen;
		size_t len = 0;

		mp = fvp->v_mountedhere;

		if (vfs_busy(mp, LK_NOWAIT)) {
			error = EBUSY;
			goto out1;
		}
		tobuf = zalloc(ZV_NAMEI);

		if (UIO_SEG_IS_USER_SPACE(segflg)) {
			error = copyinstr(to, tobuf, MAXPATHLEN, &len);
		} else {
			error = copystr((void *)to, tobuf, MAXPATHLEN, &len);
		}
		if (!error) {
			/* find current mount point prefix */
			pathend = &mp->mnt_vfsstat.f_mntonname[0];
			for (cp = pathend; *cp != '\0'; ++cp) {
				if (*cp == '/') {
					pathend = cp + 1;
				}
			}
			/* find last component of target name */
			for (mpname = cp = tobuf; *cp != '\0'; ++cp) {
				if (*cp == '/') {
					mpname = cp + 1;
				}
			}

			/* Update f_mntonname of sub mounts */
			vfs_iterate(0, rename_submounts_callback, (void *)mp);

			/* append name to prefix */
			maxlen = MAXPATHLEN - (int)(pathend - mp->mnt_vfsstat.f_mntonname);
			bzero(pathend, maxlen);

			strlcpy(pathend, mpname, maxlen);
		}
		zfree(ZV_NAMEI, tobuf);

		vfs_unbusy(mp);

		vfs_event_signal(NULL, VQ_UPDATE, (intptr_t)NULL);
	}
	/*
	 * fix up name & parent pointers. note that we first
	 * check that fvp has the same name/parent pointers it
	 * had before the rename call... this is a 'weak' check
	 * at best...
	 *
	 * XXX oparent and oname may not be set in the compound vnop case
	 */
	if (batched || (oname == fvp->v_name && oparent == fvp->v_parent)) {
		int update_flags;

		update_flags = VNODE_UPDATE_NAME;

		if (fdvp != tdvp) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		vnode_update_identity(fvp, tdvp, tond->ni_cnd.cn_nameptr, tond->ni_cnd.cn_namelen, tond->ni_cnd.cn_hash, update_flags);
	}
out1:
	/*
	 * There are some cases (for e.g. 'fvp == tvp') when vn_authorize was
	 * skipped earlier as no actual rename was performed.
	 */
	if (vn_authorize_skipped && error == 0) {
		error = vn_authorize_renamex_with_paths(fdvp, fvp,
		    &fromnd->ni_cnd, from_name, tdvp, tvp, &tond->ni_cnd, to_name, ctx,
		    flags, NULL);
		if (error && error == ENOENT) {
			if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
				do_retry = 1;
				retry_count += 1;
			}
		}
	}
	if (to_name != NULL) {
		RELEASE_PATH(to_name);
		to_name = NULL;
	}
	if (to_name_no_firmlink != NULL) {
		RELEASE_PATH(to_name_no_firmlink);
		to_name_no_firmlink = NULL;
	}
	if (from_name != NULL) {
		RELEASE_PATH(from_name);
		from_name = NULL;
	}
	if (from_name_no_firmlink != NULL) {
		RELEASE_PATH(from_name_no_firmlink);
		from_name_no_firmlink = NULL;
	}
	if (holding_mntlock) {
		mount_unlock_renames(locked_mp);
		mount_drop(locked_mp, 0);
		holding_mntlock = 0;
	}
	if (tdvp) {
		/*
		 * nameidone has to happen before we vnode_put(tdvp)
		 * since it may need to release the fs_nodelock on the tdvp
		 */
		nameidone(tond);

		if (tvp) {
			vnode_put(tvp);
		}
		vnode_put(tdvp);
	}
	if (fdvp) {
		/*
		 * nameidone has to happen before we vnode_put(fdvp)
		 * since it may need to release the fs_nodelock on the fdvp
		 */
		nameidone(fromnd);

		if (fvp) {
			vnode_put(fvp);
		}
		vnode_put(fdvp);
	}
	if (mnt_fvp != NULLVP) {
		vnode_put(mnt_fvp);
	}
	/*
	 * If things changed after we did the namei, then we will re-drive
	 * this rename call from the top.
	 */
	if (do_retry) {
		do_retry = 0;
		goto retry;
	}

	kfree_type(typeof(*__rename_data), __rename_data);
	return error;
}
9876
9877 int
rename(__unused proc_t p,struct rename_args * uap,__unused int32_t * retval)9878 rename(__unused proc_t p, struct rename_args *uap, __unused int32_t *retval)
9879 {
9880 return renameat_internal(vfs_context_current(), AT_FDCWD, uap->from,
9881 AT_FDCWD, uap->to, UIO_USERSPACE, 0);
9882 }
9883
9884 int
renameatx_np(__unused proc_t p,struct renameatx_np_args * uap,__unused int32_t * retval)9885 renameatx_np(__unused proc_t p, struct renameatx_np_args *uap, __unused int32_t *retval)
9886 {
9887 if (uap->flags & ~(RENAME_SECLUDE | RENAME_EXCL | RENAME_SWAP | RENAME_NOFOLLOW_ANY)) {
9888 return EINVAL;
9889 }
9890
9891 if ((uap->flags & (RENAME_EXCL | RENAME_SWAP)) == (RENAME_EXCL | RENAME_SWAP)) {
9892 return EINVAL;
9893 }
9894
9895 return renameat_internal(vfs_context_current(), uap->fromfd, uap->from,
9896 uap->tofd, uap->to, UIO_USERSPACE, uap->flags);
9897 }
9898
9899 int
renameat(__unused proc_t p,struct renameat_args * uap,__unused int32_t * retval)9900 renameat(__unused proc_t p, struct renameat_args *uap, __unused int32_t *retval)
9901 {
9902 return renameat_internal(vfs_context_current(), uap->fromfd, uap->from,
9903 uap->tofd, uap->to, UIO_USERSPACE, 0);
9904 }
9905
/*
 * Make a directory file.
 *
 * Common backend for mkdir(2)/mkdirat(2)/mkdir_extended(2): create a
 * directory at "path" (resolved relative to "fd" unless absolute, in the
 * address space indicated by "segflg") with the attributes in "vap".
 * A compound mkdir VNOP is attempted when the filesystem supports it.
 *
 * Returns:	0			Success
 *		EEXIST
 *		namei:???
 *		vnode_authorize:???
 *		vn_create:???
 */
/* ARGSUSED */
static int
mkdir1at(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap, int fd,
    enum uio_seg segflg)
{
	vnode_t vp, dvp;
	int error;
	int update_flags = 0;
	int batched;
	struct nameidata nd;

	AUDIT_ARG(mode, vap->va_mode);
	/* Hold the parent locked across the create; request compound mkdir. */
	NDINIT(&nd, CREATE, OP_MKDIR, LOCKPARENT | AUDITVNPATH1, segflg,
	    path, ctx);
	nd.ni_cnd.cn_flags |= WILLBEDIR;
	nd.ni_flag = NAMEI_COMPOUNDMKDIR;

continue_lookup:
	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* An entry already exists at the target name. */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}

	batched = vnode_compound_mkdir_available(dvp);

	VATTR_SET(vap, va_type, VDIR);

	/*
	 * XXX
	 * Don't authorize in VFS for compound VNOP.... mkdir -p today assumes that it will
	 * only get EXISTS or EISDIR for existing path components, and not that it could see
	 * EACCESS/EPERM--so if we authorize for mkdir on "/" for "mkdir -p /tmp/foo/bar/baz"
	 * it will fail in a spurious manner.  Need to figure out if this is valid behavior.
	 */
	if ((error = vn_authorize_mkdir(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0) {
		if (error == EACCES || error == EPERM) {
			int error2;

			/* Drop the current lookup state before re-resolving. */
			nameidone(&nd);
			vnode_put(dvp);
			dvp = NULLVP;

			/*
			 * Try a lookup without "NAMEI_COMPOUNDVNOP" to make sure we return EEXIST
			 * rather than EACCESS if the target exists.
			 */
			NDINIT(&nd, LOOKUP, OP_MKDIR, AUDITVNPATH1, segflg,
			    path, ctx);
			error2 = nameiat(&nd, fd);
			if (error2) {
				goto out;
			} else {
				vp = nd.ni_vp;
				error = EEXIST;
				goto out;
			}
		}

		goto out;
	}

#if CONFIG_FILE_LEASES
	/* Creating an entry modifies the parent; break any directory lease. */
	vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

	/*
	 * make the directory
	 */
	if ((error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx)) != 0) {
		/* Compound mkdir needs more lookup; resume with the same nd. */
		if (error == EKEEPLOOKING) {
			nd.ni_vp = vp;
			goto continue_lookup;
		}

		goto out;
	}

	// Make sure the name & parent pointers are hooked up
	if (vp->v_name == NULL) {
		update_flags |= VNODE_UPDATE_NAME;
	}
	if (vp->v_parent == NULLVP) {
		update_flags |= VNODE_UPDATE_PARENT;
	}

	if (update_flags) {
		vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
	}

#if CONFIG_FSE
	add_fsevent(FSE_CREATE_DIR, ctx, FSE_ARG_VNODE, vp, FSE_ARG_DONE);
#endif

out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	if (dvp) {
		vnode_put(dvp);
	}

	return error;
}
10031
10032 /*
10033 * mkdir_extended: Create a directory; with extended security (ACL).
10034 *
10035 * Parameters: p Process requesting to create the directory
10036 * uap User argument descriptor (see below)
10037 * retval (ignored)
10038 *
10039 * Indirect: uap->path Path of directory to create
10040 * uap->mode Access permissions to set
10041 * uap->xsecurity ACL to set
10042 *
10043 * Returns: 0 Success
10044 * !0 Not success
10045 *
10046 */
10047 int
mkdir_extended(proc_t p,struct mkdir_extended_args * uap,__unused int32_t * retval)10048 mkdir_extended(proc_t p, struct mkdir_extended_args *uap, __unused int32_t *retval)
10049 {
10050 int ciferror;
10051 kauth_filesec_t xsecdst;
10052 struct vnode_attr va;
10053
10054 AUDIT_ARG(owner, uap->uid, uap->gid);
10055
10056 xsecdst = NULL;
10057 if ((uap->xsecurity != USER_ADDR_NULL) &&
10058 ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)) {
10059 return ciferror;
10060 }
10061
10062 VATTR_INIT(&va);
10063 VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd.fd_cmask);
10064 if (xsecdst != NULL) {
10065 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
10066 va.va_vaflags |= VA_FILESEC_ACL;
10067 }
10068
10069 ciferror = mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
10070 UIO_USERSPACE);
10071 if (xsecdst != NULL) {
10072 kauth_filesec_free(xsecdst);
10073 }
10074 return ciferror;
10075 }
10076
10077 int
mkdir(proc_t p,struct mkdir_args * uap,__unused int32_t * retval)10078 mkdir(proc_t p, struct mkdir_args *uap, __unused int32_t *retval)
10079 {
10080 struct vnode_attr va;
10081
10082 VATTR_INIT(&va);
10083 VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd.fd_cmask);
10084
10085 return mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
10086 UIO_USERSPACE);
10087 }
10088
10089 int
mkdirat(proc_t p,struct mkdirat_args * uap,__unused int32_t * retval)10090 mkdirat(proc_t p, struct mkdirat_args *uap, __unused int32_t *retval)
10091 {
10092 struct vnode_attr va;
10093
10094 VATTR_INIT(&va);
10095 VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd.fd_cmask);
10096
10097 return mkdir1at(vfs_context_current(), uap->path, &va, uap->fd,
10098 UIO_USERSPACE);
10099 }
10100
/*
 * Common backend for rmdir(2) and the directory-removal variants: remove
 * the directory named by "dirpath" (resolved relative to "fd" unless
 * absolute, in the address space indicated by "segflg").
 *
 * "unlink_flags" may contain VNODE_REMOVE_NOFOLLOW_ANY (refuse to follow
 * symlinks anywhere in the path) and VNODE_REMOVE_DATALESS_DIR (allow a
 * non-empty DATALESS directory to be removed via VNOP_REMOVE()).
 *
 * Returns: 0 on success, or an errno from lookup, authorization or the
 * filesystem (EBUSY for a mount root, ENOTEMPTY, etc.).
 */
static int
rmdirat_internal(vfs_context_t ctx, int fd, user_addr_t dirpath,
    enum uio_seg segflg, int unlink_flags)
{
	/* nameidata (+ optional vnode_attr) are too big to keep on the stack. */
	struct {
		struct nameidata nd;
#if CONFIG_FSE
		struct vnode_attr va;
#endif /* CONFIG_FSE */
	} *__rmdir_data;
	vnode_t vp, dvp;
	int error;
	struct nameidata *ndp;
	char *path = NULL;
	char *no_firmlink_path = NULL;
	int len_path = 0;
	int len_no_firmlink_path = 0;
	int has_listeners = 0;
	int need_event = 0;
	int truncated_path = 0;
	int truncated_no_firmlink_path = 0;
	struct vnode_attr *vap = NULL;
	int restart_count = 0;
	int batched;

	int restart_flag;
	int nofollow_any = 0;

	__rmdir_data = kalloc_type(typeof(*__rmdir_data), Z_WAITOK);
	ndp = &__rmdir_data->nd;

	/* Translate the removal flag into its namei equivalent. */
	if (unlink_flags & VNODE_REMOVE_NOFOLLOW_ANY) {
		nofollow_any = NAMEI_NOFOLLOW_ANY;
		unlink_flags &= ~VNODE_REMOVE_NOFOLLOW_ANY;
	}

	/*
	 * This loop exists to restart rmdir in the unlikely case that two
	 * processes are simultaneously trying to remove the same directory
	 * containing orphaned appleDouble files.
	 */
	do {
		NDINIT(ndp, DELETE, OP_RMDIR, LOCKPARENT | AUDITVNPATH1,
		    segflg, dirpath, ctx);
		ndp->ni_flag = NAMEI_COMPOUNDRMDIR | nofollow_any;
continue_lookup:
		restart_flag = 0;
		vap = NULL;

		error = nameiat(ndp, fd);
		if (error) {
			goto err_out;
		}

		dvp = ndp->ni_dvp;
		vp = ndp->ni_vp;

		if (vp) {
			batched = vnode_compound_rmdir_available(vp);

			if (vp->v_flag & VROOT) {
				/*
				 * The root of a mounted filesystem cannot be deleted.
				 */
				error = EBUSY;
				goto out;
			}

#if DEVELOPMENT || DEBUG
			/*
			 * XXX VSWAP: Check for entitlements or special flag here
			 * so we can restrict access appropriately.
			 */
#else /* DEVELOPMENT || DEBUG */

			/* Only the kernel context may remove a swap-backing vnode. */
			if (vnode_isswap(vp) && (ctx != vfs_context_kernel())) {
				error = EPERM;
				goto out;
			}
#endif /* DEVELOPMENT || DEBUG */

			/*
			 * Removed a check here; we used to abort if vp's vid
			 * was not the same as what we'd seen the last time around.
			 * I do not think that check was valid, because if we retry
			 * and all dirents are gone, the directory could legitimately
			 * be recycled but still be present in a situation where we would
			 * have had permission to delete.  Therefore, we won't make
			 * an effort to preserve that check now that we may not have a
			 * vp here.
			 */

			if (!batched) {
				error = vn_authorize_rmdir(dvp, vp, &ndp->ni_cnd, ctx, NULL);
				if (error) {
					if (error == ENOENT) {
						/* Racing lookup; redrive a bounded number of times. */
						if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
							restart_flag = 1;
							restart_count += 1;
						}
					}
					goto out;
				}
			}
		} else {
			batched = 1;

			if (!vnode_compound_rmdir_available(dvp)) {
				panic("No error, but no compound rmdir?");
			}
		}

#if CONFIG_FSE
		fse_info finfo = {0};

		need_event = need_fsevent(FSE_DELETE, dvp);
		if (need_event) {
			if (!batched) {
				get_fse_info(vp, &finfo, ctx);
			} else {
				/* Compound path: collect attrs for the event via vap. */
				error = vfs_get_notify_attributes(&__rmdir_data->va);
				if (error) {
					goto out;
				}

				vap = &__rmdir_data->va;
			}
		}
#endif
		has_listeners = kauth_authorize_fileop_has_listeners();
		if (need_event || has_listeners) {
			if (path == NULL) {
				GET_PATH(path);
			}

			len_path = safe_getpath(dvp, ndp->ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path);

			if (no_firmlink_path == NULL) {
				GET_PATH(no_firmlink_path);
			}

			len_no_firmlink_path = safe_getpath_no_firmlink(dvp, ndp->ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);
#if CONFIG_FSE
			if (truncated_no_firmlink_path) {
				finfo.mode |= FSE_TRUNCATED_PATH;
			}
#endif
		}

#if CONFIG_FILE_LEASES
		/* Removing an entry modifies the parent; break any directory lease. */
		vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

		error = vn_rmdir(dvp, &vp, ndp, vap, ctx);
		ndp->ni_vp = vp;
		if (vp == NULLVP) {
			/* Couldn't find a vnode */
			goto out;
		}

		if (error == EKEEPLOOKING) {
			/* Compound rmdir needs more lookup; resume with same ndp. */
			goto continue_lookup;
		} else if (batched && error == ENOENT) {
			if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
				/*
				 * For compound VNOPs, the authorization callback
				 * may return ENOENT in case of racing hard link lookups
				 * redrive the lookup.
				 */
				restart_flag = 1;
				restart_count += 1;
				goto out;
			}
		}

		/*
		 * XXX There's no provision for passing flags
		 * to VNOP_RMDIR().  So, if vn_rmdir() fails
		 * because it's not empty, then we try again
		 * with VNOP_REMOVE(), passing in a special
		 * flag that clever file systems will know
		 * how to handle.
		 */
		if (error == ENOTEMPTY &&
		    (unlink_flags & VNODE_REMOVE_DATALESS_DIR) != 0) {
			/*
			 * Only do this if the directory is actually
			 * marked as DATALESS.
			 */
			struct vnode_attr *lvap =
			    kalloc_type(struct vnode_attr, Z_WAITOK);

			VATTR_INIT(lvap);
			VATTR_WANTED(lvap, va_flags);
			if (vnode_getattr(vp, lvap, ctx) == 0 &&
			    VATTR_IS_SUPPORTED(lvap, va_flags) &&
			    (lvap->va_flags & SF_DATALESS) != 0) {
				/*
				 * If this fails, we want to keep the original
				 * error.
				 */
				if (vn_remove(dvp, &vp, ndp,
				    VNODE_REMOVE_DATALESS_DIR, vap, ctx) == 0) {
					error = 0;
				}
			}
			kfree_type(struct vnode_attr, lvap);
		}

#if CONFIG_APPLEDOUBLE
		/*
		 * Special case to remove orphaned AppleDouble
		 * files. I don't like putting this in the kernel,
		 * but carbon does not like putting this in carbon either,
		 * so here we are.
		 */
		if (error == ENOTEMPTY) {
			int ad_error = rmdir_remove_orphaned_appleDouble(vp, ctx, &restart_flag);
			if (ad_error == EBUSY) {
				error = ad_error;
				goto out;
			}


			/*
			 * Assuming everything went well, we will try the RMDIR again
			 */
			if (!ad_error) {
				error = vn_rmdir(dvp, &vp, ndp, vap, ctx);
			}
		}
#endif /* CONFIG_APPLEDOUBLE */
		/*
		 * Call out to allow 3rd party notification of delete.
		 * Ignore result of kauth_authorize_fileop call.
		 */
		if (!error) {
			if (has_listeners) {
				kauth_authorize_fileop(vfs_context_ucred(ctx),
				    KAUTH_FILEOP_DELETE,
				    (uintptr_t)vp,
				    (uintptr_t)path);
			}

			if (vp->v_flag & VISHARDLINK) {
				// see the comment in unlink1() about why we update
				// the parent of a hard link when it is removed
				vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
			}

#if CONFIG_FSE
			if (need_event) {
				if (vap) {
					vnode_get_fse_info_from_vap(vp, &finfo, vap);
				}
				add_fsevent(FSE_DELETE, ctx,
				    FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
				    FSE_ARG_FINFO, &finfo,
				    FSE_ARG_DONE);
			}
#endif

#if CONFIG_MACF
			mac_vnode_notify_unlink(ctx, dvp, vp, &ndp->ni_cnd);
#endif
		}

out:
		if (path != NULL) {
			RELEASE_PATH(path);
			path = NULL;
		}

		if (no_firmlink_path != NULL) {
			RELEASE_PATH(no_firmlink_path);
			no_firmlink_path = NULL;
		}

		/*
		 * nameidone has to happen before we vnode_put(dvp)
		 * since it may need to release the fs_nodelock on the dvp
		 */
		nameidone(ndp);
		vnode_put(dvp);

		if (vp) {
			vnode_put(vp);
		}

		/*
		 * Wake/sleep protocol for the appleDouble restart race:
		 * without a restart, nudge any waiter; otherwise pause
		 * briefly before redriving the whole lookup.
		 */
		if (restart_flag == 0) {
			wakeup_one((caddr_t)vp);
			goto err_out;
		}
		tsleep(vp, PVFS, "rm AD", 1);
	} while (restart_flag != 0);

err_out:
	kfree_type(typeof(*__rmdir_data), __rmdir_data);

	return error;
}
10402
10403 /*
10404 * Remove a directory file.
10405 */
10406 /* ARGSUSED */
10407 int
rmdir(__unused proc_t p,struct rmdir_args * uap,__unused int32_t * retval)10408 rmdir(__unused proc_t p, struct rmdir_args *uap, __unused int32_t *retval)
10409 {
10410 return rmdirat_internal(vfs_context_current(), AT_FDCWD,
10411 CAST_USER_ADDR_T(uap->path), UIO_USERSPACE, 0);
10412 }
10413
/*
 * Get direntry length padded to 8 byte alignment.
 * NOTE(review): assumes struct direntry's d_name field is MAXPATHLEN
 * bytes, so sizeof(struct direntry) - (MAXPATHLEN-1) + namlen is the
 * used size — confirm against the struct direntry definition.
 */
#define DIRENT64_LEN(namlen) \
	((sizeof(struct direntry) + (namlen) - (MAXPATHLEN-1) + 7) & ~7)

/*
 * Get dirent length padded to 4 byte alignment.
 * Same idea as above for the legacy struct dirent, whose d_name is
 * __DARWIN_MAXNAMLEN + 1 bytes; the name's NUL terminator is counted.
 */
#define DIRENT_LEN(namelen) \
	((sizeof(struct dirent) + (namelen + 1) - (__DARWIN_MAXNAMLEN + 1) + 3) & ~3)

/* Get the end of this dirent (address of its last valid byte, per d_reclen) */
#define DIRENT_END(dep) \
	(((char *)(dep)) + (dep)->d_reclen - 1)
10425
10426 errno_t
vnode_readdir64(struct vnode * vp,struct uio * uio,int flags,int * eofflag,int * numdirent,vfs_context_t ctxp)10427 vnode_readdir64(struct vnode *vp, struct uio *uio, int flags, int *eofflag,
10428 int *numdirent, vfs_context_t ctxp)
10429 {
10430 /* Check if fs natively supports VNODE_READDIR_EXTENDED */
10431 if ((vp->v_mount->mnt_vtable->vfc_vfsflags & VFC_VFSREADDIR_EXTENDED) &&
10432 ((vp->v_mount->mnt_kern_flag & MNTK_DENY_READDIREXT) == 0)) {
10433 return VNOP_READDIR(vp, uio, flags, eofflag, numdirent, ctxp);
10434 } else {
10435 size_t bufsize;
10436 void * bufptr;
10437 uio_t auio;
10438 struct direntry *entry64;
10439 struct dirent *dep;
10440 size_t bytesread;
10441 int error;
10442
10443 /*
10444 * We're here because the underlying file system does not
10445 * support direnties or we mounted denying support so we must
10446 * fall back to dirents and convert them to direntries.
10447 *
10448 * Our kernel buffer needs to be smaller since re-packing will
10449 * expand each dirent. The worse case (when the name length
10450 * is 3 or less) corresponds to a struct direntry size of 32
10451 * bytes (8-byte aligned) and a struct dirent size of 12 bytes
10452 * (4-byte aligned). So having a buffer that is 3/8 the size
10453 * will prevent us from reading more than we can pack.
10454 *
10455 * Since this buffer is wired memory, we will limit the
10456 * buffer size to a maximum of 32K. We would really like to
10457 * use 32K in the MIN(), but we use magic number 87371 to
10458 * prevent uio_resid() * 3 / 8 from overflowing.
10459 */
10460 bufsize = 3 * MIN((user_size_t)uio_resid(uio), 87371u) / 8;
10461 bufptr = kalloc_data(bufsize, Z_WAITOK);
10462 if (bufptr == NULL) {
10463 return ENOMEM;
10464 }
10465
10466 auio = uio_create(1, 0, UIO_SYSSPACE, UIO_READ);
10467 uio_addiov(auio, (uintptr_t)bufptr, bufsize);
10468 auio->uio_offset = uio->uio_offset;
10469
10470 error = VNOP_READDIR(vp, auio, 0, eofflag, numdirent, ctxp);
10471
10472 dep = (struct dirent *)bufptr;
10473 bytesread = bufsize - uio_resid(auio);
10474
10475 entry64 = kalloc_type(struct direntry, Z_WAITOK);
10476 /*
10477 * Convert all the entries and copy them out to user's buffer.
10478 */
10479 while (error == 0 && (char *)dep < ((char *)bufptr + bytesread)) {
10480 /* First check that the dirent struct up to d_name is within the buffer */
10481 if ((char*)dep + offsetof(struct dirent, d_name) > ((char *)bufptr + bytesread) ||
10482 /* Check that the length of the entire dirent is within the buffer */
10483 DIRENT_END(dep) > ((char *)bufptr + bytesread) ||
10484 /* Check that the actual length including the name doesn't exceed d_reclen */
10485 DIRENT_LEN(dep->d_namlen) > dep->d_reclen) {
10486 printf("%s: %s: Bad dirent recived from directory %s\n", __func__,
10487 vp->v_mount->mnt_vfsstat.f_mntonname,
10488 vp->v_name ? vp->v_name : "<unknown>");
10489 error = EIO;
10490 break;
10491 }
10492
10493 size_t enbufsize = DIRENT64_LEN(dep->d_namlen);
10494
10495 bzero(entry64, enbufsize);
10496 /* Convert a dirent to a dirent64. */
10497 entry64->d_ino = dep->d_ino;
10498 entry64->d_seekoff = 0;
10499 entry64->d_reclen = (uint16_t)enbufsize;
10500 entry64->d_namlen = dep->d_namlen;
10501 entry64->d_type = dep->d_type;
10502 bcopy(dep->d_name, entry64->d_name, dep->d_namlen + 1);
10503
10504 /* Move to next entry. */
10505 dep = (struct dirent *)((char *)dep + dep->d_reclen);
10506
10507 /* Copy entry64 to user's buffer. */
10508 error = uiomove((caddr_t)entry64, entry64->d_reclen, uio);
10509 }
10510
10511 /* Update the real offset using the offset we got from VNOP_READDIR. */
10512 if (error == 0) {
10513 uio->uio_offset = auio->uio_offset;
10514 }
10515 uio_free(auio);
10516 kfree_data(bufptr, bufsize);
10517 kfree_type(struct direntry, entry64);
10518 return error;
10519 }
10520 }
10521
/* Upper bound applied to the caller-supplied buffer size in getdirentries_common(). */
#define GETDIRENTRIES_MAXBUFSIZE (128 * 1024 * 1024U)
10523
/*
 * Read a block of directory entries in a file system independent format.
 *
 * Common backend for getdirentries(2) and getdirentries64(2): reads from
 * the directory open on "fd" into the user buffer "bufp"/"bufsize",
 * advancing the file offset.  On success returns 0 and sets *bytesread,
 * *offset (the offset *before* the read) and *eofflag.  "flags" may
 * contain VNODE_READDIR_EXTENDED to request struct direntry records.
 */
static int
getdirentries_common(int fd, user_addr_t bufp, user_size_t bufsize, ssize_t *bytesread,
    off_t *offset, int *eofflag, int flags)
{
	vnode_t vp;
	struct vfs_context context = *vfs_context_current();    /* local copy */
	struct fileproc *fp;
	uio_t auio;
	int spacetype = proc_is64bit(vfs_context_proc(&context)) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	off_t loff;
	int error, numdirent;
	UIO_STACKBUF(uio_buf, 1);

get_from_fd:
	error = fp_getfvp(vfs_context_proc(&context), fd, &fp, &vp);
	if (error) {
		return error;
	}

	/*
	 * Re-check under the per-fd offset lock that the fd still refers to
	 * the vnode we resolved; the union-mount path below can swap the
	 * fd's data vnode, so retry the lookup if it changed.
	 */
	vn_offset_lock(fp->fp_glob);
	if (((vnode_t)fp_get_data(fp)) != vp) {
		vn_offset_unlock(fp->fp_glob);
		file_drop(fd);
		goto get_from_fd;
	}

	if ((fp->fp_glob->fg_flag & FREAD) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EBADF;
		goto out;
	}

	/* Silently clamp oversized requests rather than failing them. */
	if (bufsize > GETDIRENTRIES_MAXBUFSIZE) {
		bufsize = GETDIRENTRIES_MAXBUFSIZE;
	}

#if CONFIG_MACF
	error = mac_file_check_change_offset(vfs_context_ucred(&context), fp->fp_glob);
	if (error) {
		goto out;
	}
#endif

	if ((error = vnode_getwithref(vp))) {
		goto out;
	}
	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

#if CONFIG_UNION_MOUNTS
unionread:
#endif /* CONFIG_UNION_MOUNTS */
	if (vp->v_type != VDIR) {
		(void)vnode_put(vp);
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_readdir(&context, vp);
	if (error != 0) {
		(void)vnode_put(vp);
		goto out;
	}
#endif /* MAC */

	/* Remember the pre-read offset; it is what *offset reports below. */
	loff = fp->fp_glob->fg_offset;
	auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, bufp, bufsize);

	if (flags & VNODE_READDIR_EXTENDED) {
		error = vnode_readdir64(vp, auio, flags, eofflag, &numdirent, &context);
		fp->fp_glob->fg_offset = uio_offset(auio);
	} else {
		error = VNOP_READDIR(vp, auio, 0, eofflag, &numdirent, &context);
		fp->fp_glob->fg_offset = uio_offset(auio);
	}
	if (error) {
		(void)vnode_put(vp);
		goto out;
	}

#if CONFIG_UNION_MOUNTS
	/*
	 * Nothing was read from the upper layer of a union mount: swap the
	 * fd over to the lower directory and continue reading there.
	 */
	if ((user_ssize_t)bufsize == uio_resid(auio) &&
	    (vp->v_mount->mnt_flag & MNT_UNION)) {
		vnode_t uvp;

		if (lookup_traverse_union(vp, &uvp, &context) == 0) {
			if (vnode_ref(uvp) == 0) {
				fp_set_data(fp, uvp);
				fp->fp_glob->fg_offset = 0;
				vnode_rele(vp);
				vnode_put(vp);
				vp = uvp;
				goto unionread;
			} else {
				/* could not get a ref, can't replace in fd */
				vnode_put(uvp);
			}
		}
	}
#endif /* CONFIG_UNION_MOUNTS */

	vnode_put(vp);
	if (offset) {
		*offset = loff;
	}

	*bytesread = bufsize - uio_resid(auio);
out:
	vn_offset_unlock(fp->fp_glob);
	file_drop(fd);
	return error;
}
10640
10641
10642 int
getdirentries(__unused struct proc * p,struct getdirentries_args * uap,int32_t * retval)10643 getdirentries(__unused struct proc *p, struct getdirentries_args *uap, int32_t *retval)
10644 {
10645 off_t offset;
10646 ssize_t bytesread;
10647 int error, eofflag;
10648
10649 AUDIT_ARG(fd, uap->fd);
10650 error = getdirentries_common(uap->fd, uap->buf, uap->count,
10651 &bytesread, &offset, &eofflag, 0);
10652
10653 if (error == 0) {
10654 if (proc_is64bit(p)) {
10655 user64_long_t base = (user64_long_t)offset;
10656 error = copyout((caddr_t)&base, uap->basep, sizeof(user64_long_t));
10657 } else {
10658 user32_long_t base = (user32_long_t)offset;
10659 error = copyout((caddr_t)&base, uap->basep, sizeof(user32_long_t));
10660 }
10661 *retval = (int)bytesread;
10662 }
10663 return error;
10664 }
10665
10666 int
getdirentries64(__unused struct proc * p,struct getdirentries64_args * uap,user_ssize_t * retval)10667 getdirentries64(__unused struct proc *p, struct getdirentries64_args *uap, user_ssize_t *retval)
10668 {
10669 off_t offset;
10670 ssize_t bytesread;
10671 int error, eofflag;
10672 user_size_t bufsize;
10673
10674 AUDIT_ARG(fd, uap->fd);
10675
10676 /*
10677 * If the buffer is at least GETDIRENTRIES64_EXTENDED_BUFSIZE large,
10678 * then the kernel carves out the last 4 bytes to return extended
10679 * information to userspace (namely whether we reached EOF with this call).
10680 */
10681 if (uap->bufsize >= GETDIRENTRIES64_EXTENDED_BUFSIZE) {
10682 bufsize = uap->bufsize - sizeof(getdirentries64_flags_t);
10683 } else {
10684 bufsize = uap->bufsize;
10685 }
10686
10687 error = getdirentries_common(uap->fd, uap->buf, bufsize,
10688 &bytesread, &offset, &eofflag, VNODE_READDIR_EXTENDED);
10689
10690 if (error == 0) {
10691 *retval = bytesread;
10692 error = copyout((caddr_t)&offset, uap->position, sizeof(off_t));
10693
10694 if (error == 0 && uap->bufsize >= GETDIRENTRIES64_EXTENDED_BUFSIZE) {
10695 getdirentries64_flags_t flags = 0;
10696 if (eofflag) {
10697 flags |= GETDIRENTRIES64_EOF;
10698 }
10699 error = copyout(&flags, (user_addr_t)uap->buf + bufsize,
10700 sizeof(flags));
10701 }
10702 }
10703 return error;
10704 }
10705
10706
10707 /*
10708 * Set the mode mask for creation of filesystem nodes.
10709 * XXX implement xsecurity
10710 */
10711 #define UMASK_NOXSECURITY (void *)1 /* leave existing xsecurity alone */
10712 static int
umask1(proc_t p,int newmask,__unused kauth_filesec_t fsec,int32_t * retval)10713 umask1(proc_t p, int newmask, __unused kauth_filesec_t fsec, int32_t *retval)
10714 {
10715 AUDIT_ARG(mask, newmask);
10716 proc_fdlock(p);
10717 *retval = p->p_fd.fd_cmask;
10718 p->p_fd.fd_cmask = newmask & ALLPERMS;
10719 proc_fdunlock(p);
10720 return 0;
10721 }
10722
10723 /*
10724 * umask_extended: Set the mode mask for creation of filesystem nodes; with extended security (ACL).
10725 *
10726 * Parameters: p Process requesting to set the umask
10727 * uap User argument descriptor (see below)
10728 * retval umask of the process (parameter p)
10729 *
10730 * Indirect: uap->newmask umask to set
10731 * uap->xsecurity ACL to set
10732 *
10733 * Returns: 0 Success
10734 * !0 Not success
10735 *
10736 */
10737 int
umask_extended(proc_t p,struct umask_extended_args * uap,int32_t * retval)10738 umask_extended(proc_t p, struct umask_extended_args *uap, int32_t *retval)
10739 {
10740 return umask1(p, uap->newmask, KAUTH_FILESEC_NONE, retval);
10741 }
10742
10743 int
umask(proc_t p,struct umask_args * uap,int32_t * retval)10744 umask(proc_t p, struct umask_args *uap, int32_t *retval)
10745 {
10746 return umask1(p, uap->newmask, UMASK_NOXSECURITY, retval);
10747 }
10748
/*
 * NOTE(review): presumably the entitlement that permits revoking a device
 * vnode that is currently mounted — the check site is not in this chunk;
 * verify against the caller.
 */
#define REVOKE_MOUNTED_DEVICE_ENTITLEMENT \
	"com.apple.private.vfs.revoke-mounted-device"
10751
/*
 * Void all references to file by ripping underlying filesystem
 * away from vnode.
 *
 * Only character/block special files are supported (ENOTSUP otherwise);
 * a block device with something mounted on it returns EBUSY.  The caller
 * must own the node or pass the superuser check.
 */
/* ARGSUSED */
int
revoke(proc_t p, struct revoke_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	struct vnode_attr va;
	vfs_context_t ctx = vfs_context_current();
	int error;
	struct nameidata nd;

	NDINIT(&nd, LOOKUP, OP_REVOKE, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
	    uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;

	nameidone(&nd);

	/* revoke(2) only applies to device special files. */
	if (!(vnode_ischr(vp) || vnode_isblk(vp))) {
		error = ENOTSUP;
		goto out;
	}

	if (vnode_isblk(vp) && vnode_ismountedon(vp)) {
		error = EBUSY;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_revoke(ctx, vp);
	if (error) {
		goto out;
	}
#endif

	/* Owner or superuser only. */
	VATTR_INIT(&va);
	VATTR_WANTED(&va, va_uid);
	if ((error = vnode_getattr(vp, &va, ctx))) {
		goto out;
	}
	if (kauth_cred_getuid(vfs_context_ucred(ctx)) != va.va_uid &&
	    (error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
		goto out;
	}
	/* Only revoke when somebody actually holds uses/aliases of the node. */
	if (vp->v_usecount > 0 || (vnode_isaliased(vp))) {
		VNOP_REVOKE(vp, REVOKEALL, ctx);
	}
out:
	vnode_put(vp);
	return error;
}
10809
10810
10811 /*
10812 * HFS/HFS PlUS SPECIFIC SYSTEM CALLS
10813 * The following system calls are designed to support features
10814 * which are specific to the HFS & HFS Plus volume formats
10815 */
10816
10817
/*
 * Obtain attribute information on objects in a directory while enumerating
 * the directory.
 *
 * Reads records described by the caller's attrlist from the directory
 * open on uap->fd into uap->buffer, copying back the entry count
 * (uap->count), a directory-change cookie (uap->newstate) and the
 * pre-read offset (uap->basep).  *retval is the EOF flag.
 */
/* ARGSUSED */
int
getdirentriesattr(proc_t p, struct getdirentriesattr_args *uap, int32_t *retval)
{
	vnode_t vp;
	struct fileproc *fp;
	uio_t auio = NULL;
	int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	uint32_t count = 0, savecount = 0;
	uint32_t newstate = 0;
	int error, eofflag = 0;
	off_t loff = 0;
	struct attrlist attributelist;
	vfs_context_t ctx = vfs_context_current();
	int fd = uap->fd;
	UIO_STACKBUF(uio_buf, 1);
	kauth_action_t action;

	AUDIT_ARG(fd, fd);

	/* Get the attributes into kernel space */
	if ((error = copyin(uap->alist, (caddr_t)&attributelist, sizeof(attributelist)))) {
		return error;
	}
	if ((error = copyin(uap->count, (caddr_t)&count, sizeof(count)))) {
		return error;
	}
	/* Remember the caller's count so it can be restored for a union fallback. */
	savecount = count;

get_from_fd:
	if ((error = fp_getfvp(p, fd, &fp, &vp))) {
		return error;
	}

	/*
	 * Re-check under the per-fd offset lock that the fd still maps to
	 * the vnode we resolved; the union-mount path below can swap the
	 * fd's data vnode, so retry the lookup if it changed.
	 */
	vn_offset_lock(fp->fp_glob);
	if (((vnode_t)fp_get_data(fp)) != vp) {
		vn_offset_unlock(fp->fp_glob);
		file_drop(fd);
		goto get_from_fd;
	}

	if ((fp->fp_glob->fg_flag & FREAD) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EBADF;
		goto out;
	}


#if CONFIG_MACF
	error = mac_file_check_change_offset(vfs_context_ucred(ctx),
	    fp->fp_glob);
	if (error) {
		goto out;
	}
#endif


	if ((error = vnode_getwithref(vp))) {
		goto out;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

#if CONFIG_UNION_MOUNTS
unionread:
#endif /* CONFIG_UNION_MOUNTS */
	if (vp->v_type != VDIR) {
		(void)vnode_put(vp);
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_readdir(ctx, vp);
	if (error != 0) {
		(void)vnode_put(vp);
		goto out;
	}
#endif /* MAC */

	/* set up the uio structure which will contain the users return buffer */
	loff = fp->fp_glob->fg_offset;
	auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, uap->buffer, uap->buffersize);

	/*
	 * If the only item requested is file names, we can let that past with
	 * just LIST_DIRECTORY.  If they want any other attributes, that means
	 * they need SEARCH as well.
	 */
	action = KAUTH_VNODE_LIST_DIRECTORY;
	if ((attributelist.commonattr & ~ATTR_CMN_NAME) ||
	    attributelist.fileattr || attributelist.dirattr) {
		action |= KAUTH_VNODE_SEARCH;
	}

	if ((error = vnode_authorize(vp, NULL, action, ctx)) == 0) {
		/* Believe it or not, uap->options only has 32-bits of valid
		 * info, so truncate before extending again */

		error = VNOP_READDIRATTR(vp, &attributelist, auio, count,
		    (uint32_t)uap->options, &newstate, &eofflag, &count, ctx);
	}

	if (error) {
		(void) vnode_put(vp);
		goto out;
	}

#if CONFIG_UNION_MOUNTS
	/*
	 * If we've got the last entry of a directory in a union mount
	 * then reset the eofflag and pretend there's still more to come.
	 * The next call will again set eofflag and the buffer will be empty,
	 * so traverse to the underlying directory and do the directory
	 * read there.
	 */
	if (eofflag && vp->v_mount->mnt_flag & MNT_UNION) {
		if (uio_resid(auio) < (user_ssize_t) uap->buffersize) { // Got some entries
			eofflag = 0;
		} else { // Empty buffer
			vnode_t uvp;
			if (lookup_traverse_union(vp, &uvp, ctx) == 0) {
				if (vnode_ref_ext(uvp, fp->fp_glob->fg_flag & O_EVTONLY, 0) == 0) {
					fp_set_data(fp, uvp);
					fp->fp_glob->fg_offset = 0; // reset index for new dir
					count = savecount;
					vnode_rele_internal(vp, fp->fp_glob->fg_flag & O_EVTONLY, 0, 0);
					vnode_put(vp);
					vp = uvp;
					goto unionread;
				} else {
					/* could not get a ref, can't replace in fd */
					vnode_put(uvp);
				}
			}
		}
	}
#endif /* CONFIG_UNION_MOUNTS */

	(void)vnode_put(vp);

	if (error) {
		goto out;
	}
	fp->fp_glob->fg_offset = uio_offset(auio); /* should be multiple of dirent, not variable */

	/* Copy the results back to the caller. */
	if ((error = copyout((caddr_t) &count, uap->count, sizeof(count)))) {
		goto out;
	}
	if ((error = copyout((caddr_t) &newstate, uap->newstate, sizeof(newstate)))) {
		goto out;
	}
	if ((error = copyout((caddr_t) &loff, uap->basep, sizeof(loff)))) {
		goto out;
	}

	*retval = eofflag;  /* similar to getdirentries */
	error = 0;
out:
	vn_offset_unlock(fp->fp_glob);
	file_drop(fd);
	return error; /* return error earlier, an retval of 0 or 1 now */
}  /* end of getdirentriesattr system call */
10986
10987 /*
10988 * Exchange data between two files
10989 */
10990
10991 /* ARGSUSED */
/*
 * exchangedata(2): atomically exchange the data of two regular files
 * on the same volume via VNOP_EXCHANGE, then swap the cached vnode
 * names/parents and emit fsevents / fileop notifications.
 */
int
exchangedata(__unused proc_t p, struct exchangedata_args *uap, __unused int32_t *retval)
{
	struct nameidata fnd, snd;
	vfs_context_t ctx = vfs_context_current();
	vnode_t fvp;
	vnode_t svp;
	int error;
	u_int32_t nameiflags;
	char *fpath = NULL;
	char *spath = NULL;
	int flen = 0, slen = 0;
	int from_truncated = 0, to_truncated = 0;
#if CONFIG_FSE
	fse_info f_finfo, s_finfo;
#endif

	/* FSOPT_NOFOLLOW means operate on a symlink itself, not its target */
	nameiflags = 0;
	if ((uap->options & FSOPT_NOFOLLOW) == 0) {
		nameiflags |= FOLLOW;
	}

	NDINIT(&fnd, LOOKUP, OP_EXCHANGEDATA, nameiflags | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path1, ctx);

	error = namei(&fnd);
	if (error) {
		goto out2;
	}

	nameidone(&fnd);
	fvp = fnd.ni_vp;

	NDINIT(&snd, LOOKUP, OP_EXCHANGEDATA, CN_NBMOUNTLOOK | nameiflags | AUDITVNPATH2,
	    UIO_USERSPACE, uap->path2, ctx);

	error = namei(&snd);
	if (error) {
		/* first lookup succeeded; drop its iocount before bailing */
		vnode_put(fvp);
		goto out2;
	}
	nameidone(&snd);
	svp = snd.ni_vp;

	/*
	 * if the files are the same, return an inval error
	 */
	if (svp == fvp) {
		error = EINVAL;
		goto out;
	}

	/*
	 * if the files are on different volumes, return an error
	 */
	if (svp->v_mount != fvp->v_mount) {
		error = EXDEV;
		goto out;
	}

	/* If they're not files, return an error */
	if ((vnode_isreg(fvp) == 0) || (vnode_isreg(svp) == 0)) {
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_exchangedata(ctx,
	    fvp, svp);
	if (error) {
		goto out;
	}
#endif
	/* caller must be able to both read and write each file */
	if (((error = vnode_authorize(fvp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0) ||
	    ((error = vnode_authorize(svp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0)) {
		goto out;
	}

	/*
	 * Only compute the (potentially truncated) paths when someone is
	 * actually listening: an fsevents watcher or a fileop listener.
	 */
	if (
#if CONFIG_FSE
		need_fsevent(FSE_EXCHANGE, fvp) ||
#endif
		kauth_authorize_fileop_has_listeners()) {
		GET_PATH(fpath);
		GET_PATH(spath);

		flen = safe_getpath(fvp, NULL, fpath, MAXPATHLEN, &from_truncated);
		slen = safe_getpath(svp, NULL, spath, MAXPATHLEN, &to_truncated);

#if CONFIG_FSE
		get_fse_info(fvp, &f_finfo, ctx);
		get_fse_info(svp, &s_finfo, ctx);
		if (from_truncated || to_truncated) {
			// set it here since only the f_finfo gets reported up to user space
			f_finfo.mode |= FSE_TRUNCATED_PATH;
		}
#endif
	}
	/* Ok, make the call */
	error = VNOP_EXCHANGE(fvp, svp, 0, ctx);

	if (error == 0) {
		const char *tmpname;

		if (fpath != NULL && spath != NULL) {
			/* call out to allow 3rd party notification of exchangedata.
			 * Ignore result of kauth_authorize_fileop call.
			 */
			kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_EXCHANGE,
			    (uintptr_t)fpath, (uintptr_t)spath);
		}
		/*
		 * The on-disk contents were swapped, so swap the cached
		 * names (and parents, if they differ) to keep the name
		 * cache consistent with the new contents.
		 */
		name_cache_lock();

		tmpname = fvp->v_name;
		fvp->v_name = svp->v_name;
		svp->v_name = tmpname;

		if (fvp->v_parent != svp->v_parent) {
			vnode_t tmp;

			tmp = fvp->v_parent;
			fvp->v_parent = svp->v_parent;
			svp->v_parent = tmp;
		}
		name_cache_unlock();

#if CONFIG_FSE
		if (fpath != NULL && spath != NULL) {
			add_fsevent(FSE_EXCHANGE, ctx,
			    FSE_ARG_STRING, flen, fpath,
			    FSE_ARG_FINFO, &f_finfo,
			    FSE_ARG_STRING, slen, spath,
			    FSE_ARG_FINFO, &s_finfo,
			    FSE_ARG_DONE);
		}
#endif
	}

out:
	if (fpath != NULL) {
		RELEASE_PATH(fpath);
	}
	if (spath != NULL) {
		RELEASE_PATH(spath);
	}
	vnode_put(svp);
	vnode_put(fvp);
out2:
	return error;
}
11142
11143 /*
11144 * Return (in MB) the amount of freespace on the given vnode's volume.
11145 */
11146 uint32_t freespace_mb(vnode_t vp);
11147
11148 uint32_t
freespace_mb(vnode_t vp)11149 freespace_mb(vnode_t vp)
11150 {
11151 vfs_update_vfsstat(vp->v_mount, vfs_context_current(), VFS_USER_EVENT);
11152 return (uint32_t)(((uint64_t)vp->v_mount->mnt_vfsstat.f_bavail *
11153 vp->v_mount->mnt_vfsstat.f_bsize) >> 20);
11154 }
11155
11156 #if CONFIG_SEARCHFS
11157
11158 /* ARGSUSED */
11159
/*
 * searchfs(2): fast catalog search on a volume.  Copies in and
 * validates the user's fssearchblock (munging the 32-bit layout into
 * the 64-bit one if needed), allocates one buffer for both search
 * parameter blobs + returned attrlist + search state, then dispatches
 * VNOP_SEARCHFS on the volume root, descending through union-mount
 * layers across successive calls.
 */
int
searchfs(proc_t p, struct searchfs_args *uap, __unused int32_t *retval)
{
	vnode_t vp, tvp;
	int i, error = 0;
	int fserror = 0;
	struct nameidata nd;
	struct user64_fssearchblock searchblock;
	struct searchstate *state;
	struct attrlist *returnattrs;
	struct timeval timelimit;
	void *searchparams1, *searchparams2;
	uio_t auio = NULL;
	int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	uint32_t nummatches;
	size_t mallocsize;
	uint32_t nameiflags;
	vfs_context_t ctx = vfs_context_current();
	UIO_STACKBUF(uio_buf, 1);

	/* Start by copying in fsearchblock parameter list */
	if (IS_64BIT_PROCESS(p)) {
		error = copyin(uap->searchblock, (caddr_t) &searchblock, sizeof(searchblock));
		timelimit.tv_sec = searchblock.timelimit.tv_sec;
		timelimit.tv_usec = searchblock.timelimit.tv_usec;
	} else {
		struct user32_fssearchblock tmp_searchblock;

		error = copyin(uap->searchblock, (caddr_t) &tmp_searchblock, sizeof(tmp_searchblock));
		// munge into 64-bit version
		searchblock.returnattrs = CAST_USER_ADDR_T(tmp_searchblock.returnattrs);
		searchblock.returnbuffer = CAST_USER_ADDR_T(tmp_searchblock.returnbuffer);
		searchblock.returnbuffersize = tmp_searchblock.returnbuffersize;
		searchblock.maxmatches = tmp_searchblock.maxmatches;
		/*
		 * These casts are safe. We will promote the tv_sec into a 64 bit long if necessary
		 * from a 32 bit long, and tv_usec is already a signed 32 bit int.
		 */
		timelimit.tv_sec = (__darwin_time_t) tmp_searchblock.timelimit.tv_sec;
		timelimit.tv_usec = (__darwin_useconds_t) tmp_searchblock.timelimit.tv_usec;
		searchblock.searchparams1 = CAST_USER_ADDR_T(tmp_searchblock.searchparams1);
		searchblock.sizeofsearchparams1 = tmp_searchblock.sizeofsearchparams1;
		searchblock.searchparams2 = CAST_USER_ADDR_T(tmp_searchblock.searchparams2);
		searchblock.sizeofsearchparams2 = tmp_searchblock.sizeofsearchparams2;
		searchblock.searchattrs = tmp_searchblock.searchattrs;
	}
	if (error) {
		return error;
	}

	/* Do a sanity check on sizeofsearchparams1 and sizeofsearchparams2.
	 * These are user-controlled and bound the allocation below.
	 */
	if (searchblock.sizeofsearchparams1 > SEARCHFS_MAX_SEARCHPARMS ||
	    searchblock.sizeofsearchparams2 > SEARCHFS_MAX_SEARCHPARMS) {
		return EINVAL;
	}

	/* Now malloc a big bunch of space to hold the search parameters, the attrlists and the search state. */
	/* It all has to do into local memory and it's not that big so we might as well put it all together. */
	/* Searchparams1 shall be first so we might as well use that to hold the base address of the allocated*/
	/* block. */
	/*                                                                                                    */
	/* NOTE: we allocate an extra 8 bytes to account for the difference in size of the searchstate        */
	/*       due to the changes in rdar://problem/12438273.  That way if a 3rd party file system          */
	/*       assumes the size is still 556 bytes it will continue to work                                 */

	mallocsize = searchblock.sizeofsearchparams1 + searchblock.sizeofsearchparams2 +
	    sizeof(struct attrlist) + sizeof(struct searchstate) + (2 * sizeof(uint32_t));

	searchparams1 = kalloc_data(mallocsize, Z_WAITOK);

	/* Now set up the various pointers to the correct place in our newly allocated memory */

	searchparams2 = (void *) (((caddr_t) searchparams1) + searchblock.sizeofsearchparams1);
	returnattrs = (struct attrlist *) (((caddr_t) searchparams2) + searchblock.sizeofsearchparams2);
	state = (struct searchstate *) (((caddr_t) returnattrs) + sizeof(struct attrlist));

	/* Now copy in the stuff given our local variables. */

	if ((error = copyin(searchblock.searchparams1, searchparams1, searchblock.sizeofsearchparams1))) {
		goto freeandexit;
	}

	if ((error = copyin(searchblock.searchparams2, searchparams2, searchblock.sizeofsearchparams2))) {
		goto freeandexit;
	}

	if ((error = copyin(searchblock.returnattrs, (caddr_t) returnattrs, sizeof(struct attrlist)))) {
		goto freeandexit;
	}

	if ((error = copyin(uap->state, (caddr_t) state, sizeof(struct searchstate)))) {
		goto freeandexit;
	}

	/*
	 * When searching a union mount, need to set the
	 * start flag at the first call on each layer to
	 * reset state for the new volume.
	 */
	if (uap->options & SRCHFS_START) {
		state->ss_union_layer = 0;
	} else {
		uap->options |= state->ss_union_flags;
	}
	state->ss_union_flags = 0;

	/*
	 * Because searchparams1 and searchparams2 may contain an ATTR_CMN_NAME search parameter,
	 * which is passed in with an attrreference_t, we need to inspect the buffer manually here.
	 * The KPI does not provide us the ability to pass in the length of the buffers searchparams1
	 * and searchparams2. To obviate the need for all searchfs-supporting filesystems to
	 * validate the user-supplied data offset of the attrreference_t, we'll do it here.
	 */

	if (searchblock.searchattrs.commonattr & ATTR_CMN_NAME) {
		attrreference_t* string_ref;
		u_int32_t* start_length;
		user64_size_t param_length;

		/* validate searchparams1 */
		param_length = searchblock.sizeofsearchparams1;
		/* skip the word that specifies length of the buffer */
		start_length = (u_int32_t*) searchparams1;
		start_length = start_length + 1;
		string_ref = (attrreference_t*) start_length;

		/* ensure no negative offsets or too big offsets */
		if (string_ref->attr_dataoffset < 0) {
			error = EINVAL;
			goto freeandexit;
		}
		if (string_ref->attr_length > MAXPATHLEN) {
			error = EINVAL;
			goto freeandexit;
		}

		/* Check for pointer overflow in the string ref */
		if (((char*) string_ref + string_ref->attr_dataoffset) < (char*) string_ref) {
			error = EINVAL;
			goto freeandexit;
		}

		/* the name data must lie entirely within searchparams1 */
		if (((char*) string_ref + string_ref->attr_dataoffset) > ((char*)searchparams1 + param_length)) {
			error = EINVAL;
			goto freeandexit;
		}
		if (((char*)string_ref + string_ref->attr_dataoffset + string_ref->attr_length) > ((char*)searchparams1 + param_length)) {
			error = EINVAL;
			goto freeandexit;
		}
	}

	/* set up the uio structure which will contain the users return buffer */
	auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, searchblock.returnbuffer, searchblock.returnbuffersize);

	nameiflags = 0;
	if ((uap->options & FSOPT_NOFOLLOW) == 0) {
		nameiflags |= FOLLOW;
	}
	NDINIT(&nd, LOOKUP, OP_SEARCHFS, nameiflags | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);

	error = namei(&nd);
	if (error) {
		goto freeandexit;
	}
	vp = nd.ni_vp;
	nameidone(&nd);

	/*
	 * Switch to the root vnode for the volume
	 */
	error = VFS_ROOT(vnode_mount(vp), &tvp, ctx);
	vnode_put(vp);
	if (error) {
		goto freeandexit;
	}
	vp = tvp;

#if CONFIG_UNION_MOUNTS
	/*
	 * If it's a union mount, the path lookup takes
	 * us to the top layer. But we may need to descend
	 * to a lower layer. For non-union mounts the layer
	 * is always zero.
	 */
	for (i = 0; i < (int) state->ss_union_layer; i++) {
		if ((vp->v_mount->mnt_flag & MNT_UNION) == 0) {
			break;
		}
		tvp = vp;
		vp = vp->v_mount->mnt_vnodecovered;
		if (vp == NULL) {
			vnode_put(tvp);
			error = ENOENT;
			goto freeandexit;
		}
		error = vnode_getwithref(vp);
		vnode_put(tvp);
		if (error) {
			goto freeandexit;
		}
	}
#endif /* CONFIG_UNION_MOUNTS */

#if CONFIG_MACF
	error = mac_vnode_check_searchfs(ctx, vp, returnattrs, &searchblock.searchattrs);
	if (error) {
		vnode_put(vp);
		goto freeandexit;
	}
#endif


	/*
	 * If searchblock.maxmatches == 0, then skip the search. This has happened
	 * before and sometimes the underlying code doesnt deal with it well.
	 */
	if (searchblock.maxmatches == 0) {
		nummatches = 0;
		goto saveandexit;
	}

	/*
	 * Allright, we have everything we need, so lets make that call.
	 *
	 * We keep special track of the return value from the file system:
	 * EAGAIN is an acceptable error condition that shouldn't keep us
	 * from copying out any results...
	 */

	fserror = VNOP_SEARCHFS(vp,
	    searchparams1,
	    searchparams2,
	    &searchblock.searchattrs,
	    (uint32_t)searchblock.maxmatches,
	    &timelimit,
	    returnattrs,
	    &nummatches,
	    (uint32_t)uap->scriptcode,
	    (uint32_t)uap->options,
	    auio,
	    (struct searchstate *) &state->ss_fsstate,
	    ctx);

#if CONFIG_UNION_MOUNTS
	/*
	 * If it's a union mount we need to be called again
	 * to search the mounted-on filesystem.
	 */
	if ((vp->v_mount->mnt_flag & MNT_UNION) && fserror == 0) {
		state->ss_union_flags = SRCHFS_START;
		state->ss_union_layer++;        // search next layer down
		fserror = EAGAIN;
	}
#endif /* CONFIG_UNION_MOUNTS */

saveandexit:

	vnode_put(vp);

	/* Now copy out the stuff that needs copying out. That means the number of matches, the
	 * search state.  Everything was already put into he return buffer by the vop call. */

	if ((error = copyout((caddr_t) state, uap->state, sizeof(struct searchstate))) != 0) {
		goto freeandexit;
	}

	if ((error = suulong(uap->nummatches, (uint64_t)nummatches)) != 0) {
		goto freeandexit;
	}

	error = fserror;

freeandexit:

	kfree_data(searchparams1, mallocsize);

	return error;
} /* end of searchfs system call */
11442
11443 #else /* CONFIG_SEARCHFS */
11444
/* searchfs(2) stub when CONFIG_SEARCHFS is disabled: always unsupported. */
int
searchfs(__unused proc_t p, __unused struct searchfs_args *uap, __unused int32_t *retval)
{
	return ENOTSUP;
}
11450
11451 #endif /* CONFIG_SEARCHFS */
11452
11453
11454 #if CONFIG_DATALESS_FILES
11455
11456 /*
11457 * === Namespace Resolver Up-call Mechanism ===
11458 *
11459 * When I/O is performed to a dataless file or directory (read, write,
11460 * lookup-in, etc.), the file system performs an upcall to the namespace
11461 * resolver (filecoordinationd) to materialize the object.
11462 *
11463 * We need multiple up-calls to be in flight at once, and we need these
11464 * up-calls to be interruptible, thus the following implementation:
11465 *
11466 * => The nspace_resolver_request represents the in-kernel request state.
11467 * It contains a request ID, storage space for the errno code returned
11468 * by filecoordinationd, and flags.
11469 *
11470 * => The request ID is simply a global monotonically incrementing 32-bit
11471 * number. Outstanding requests are stored in a hash table, and the
11472 * hash function is extremely simple.
11473 *
11474 * => When an upcall is to be made to filecoordinationd, a request structure
11475 * is allocated on the stack (it is small, and needs to live only during
11476 * the duration of the call to resolve_nspace_item_ext()). It is
11477 * initialized and inserted into the table. Some backpressure from
11478 * filecoordinationd is applied by limiting the numnber of entries that
11479 * can be inserted into the table (and thus limiting the number of
11480 * outstanding requests issued to filecoordinationd); waiting for an
11481 * available slot is interruptible.
11482 *
11483 * => Once the request has been inserted into the table, the up-call is made
11484 * to filecoordinationd via a MiG-generated stub. The up-call returns
11485 * immediately and filecoordinationd processes the request asynchronously.
11486 *
11487 * => The caller now waits for the request to complete. Tnis is achieved by
11488 * sleeping on the address of the request structure and waiting for
11489 * filecoordinationd to mark the request structure as complete. This
11490 * is an interruptible sleep call; if interrupted, the request structure
11491 * is removed from the table and EINTR is returned to the caller. If
11492 * this occurs, an advisory up-call is made to filecoordinationd with
11493 * the request ID to indicate that the request can be aborted or
11494 * de-prioritized at the discretion of filecoordinationd.
11495 *
11496 * => When filecoordinationd has completed the request, it signals completion
11497 * by writing to the vfs.nspace.complete sysctl node. Only a process
11498 * decorated as a namespace resolver can write to this sysctl node. The
11499 * value is a request ID / errno tuple passed as an array of 2 uint32_t's.
11500 * The request ID is looked up in the table, and if the request is found,
11501 * the error code is stored in the request structure and a wakeup()
11502 * issued on the address of the request structure. If the request is not
11503 * found, we simply drop the completion notification, assuming that the
11504 * caller was interrupted.
11505 *
11506 * => When the waiting thread wakes up, it extracts the error code from the
11507 * request structure, removes the request from the table, and returns the
11508 * error code to the calling function. Fini!
11509 */
11510
/*
 * In-kernel state for one namespace-resolver up-call.  Allocated on the
 * requesting thread's stack and linked into the request hash table for
 * the duration of the call (see the big comment block above).
 */
struct nspace_resolver_request {
	LIST_ENTRY(nspace_resolver_request) r_hashlink; /* hash-chain linkage */
	vnode_t r_vp;           /* vnode the request is for */
	vnode_t r_tdvp;         /* destination dir vnode, if the op has one */
	uint32_t r_req_id;      /* request ID; key into the hash table */
	int r_resolver_error;   /* errno reported back by filecoordinationd */
	int r_flags;            /* RRF_* flags below */
};

#define RRF_COMPLETE    0x0001  /* request has completed */
#define RRF_COMPLETING  0x0002  /* completion handler still using this req */

/*
 * Completion tuple delivered by filecoordinationd (via the
 * vfs.nspace.complete sysctl).  A zero gencount/syncroot means
 * "no namespace-shape criterion to verify".
 */
struct nspace_resolver_completion_data {
	uint32_t req_id;        /* ID of the request being completed */
	int32_t resolver_error; /* errno result; 0 on success */
	uint64_t orig_gencount; /* expected recursive gencount, or 0 */
	uint64_t orig_syncroot; /* expected sync-root ID, or 0 */
};
11529
/*
 * Return the next resolver request ID from a global, atomically
 * incremented 32-bit counter (wraps eventually; IDs are only used as
 * hash-table keys for outstanding requests).
 */
static uint32_t
next_nspace_req_id(void)
{
	static uint32_t next_req_id;

	return OSAddAtomic(1, &next_req_id);
}
11537
#define NSPACE_RESOLVER_REQ_HASHSIZE    32      /* XXX tune */
#define NSPACE_RESOLVER_MAX_OUTSTANDING 256     /* XXX tune */

/* Hash table of outstanding resolver requests, keyed by request ID. */
static LIST_HEAD(nspace_resolver_requesthead,
    nspace_resolver_request) * nspace_resolver_request_hashtbl;
static u_long nspace_resolver_request_hashmask;
/* Number of requests currently in the table (backpressure accounting). */
static u_int nspace_resolver_request_count;
/* True when a thread is sleeping for a free request slot. */
static bool nspace_resolver_request_wait_slot;
static LCK_GRP_DECLARE(nspace_resolver_request_lck_grp, "file namespace resolver");
static LCK_MTX_DECLARE(nspace_resolver_request_hash_mutex,
    &nspace_resolver_request_lck_grp);

/* Mutex protecting the request table, count, and wait-slot flag. */
#define NSPACE_REQ_LOCK() \
	lck_mtx_lock(&nspace_resolver_request_hash_mutex)
#define NSPACE_REQ_UNLOCK() \
	lck_mtx_unlock(&nspace_resolver_request_hash_mutex)

/* Extremely simple hash: low bits of the monotonic request ID. */
#define NSPACE_RESOLVER_HASH(req_id) \
	(&nspace_resolver_request_hashtbl[(req_id) & \
	nspace_resolver_request_hashmask])
11558
11559 static struct nspace_resolver_request *
nspace_resolver_req_lookup(uint32_t req_id,bool skip_completing)11560 nspace_resolver_req_lookup(uint32_t req_id, bool skip_completing)
11561 {
11562 struct nspace_resolver_requesthead *bucket;
11563 struct nspace_resolver_request *req;
11564
11565 bucket = NSPACE_RESOLVER_HASH(req_id);
11566 LIST_FOREACH(req, bucket, r_hashlink) {
11567 if (req->r_req_id == req_id) {
11568 /*
11569 * If this request already has a completion
11570 * pending, don't return it again.
11571 */
11572 if ((req->r_flags & RRF_COMPLETING) != 0 &&
11573 skip_completing) {
11574 req = NULL;
11575 }
11576 return req;
11577 }
11578 }
11579
11580 return NULL;
11581 }
11582
11583 static int
nspace_resolver_req_add(struct nspace_resolver_request * req)11584 nspace_resolver_req_add(struct nspace_resolver_request *req)
11585 {
11586 struct nspace_resolver_requesthead *bucket;
11587 int error;
11588
11589 NSPACE_REQ_LOCK();
11590
11591 while (nspace_resolver_request_count >=
11592 NSPACE_RESOLVER_MAX_OUTSTANDING) {
11593 nspace_resolver_request_wait_slot = true;
11594 error = msleep(&nspace_resolver_request_count,
11595 &nspace_resolver_request_hash_mutex,
11596 PVFS | PCATCH, "nspacerq", NULL);
11597 if (error) {
11598 NSPACE_REQ_UNLOCK();
11599 return error;
11600 }
11601 }
11602
11603 bucket = NSPACE_RESOLVER_HASH(req->r_req_id);
11604 #if DIAGNOSTIC
11605 assert(nspace_resolver_req_lookup(req->r_req_id, false) == NULL);
11606 #endif /* DIAGNOSTIC */
11607 LIST_INSERT_HEAD(bucket, req, r_hashlink);
11608 nspace_resolver_request_count++;
11609
11610 NSPACE_REQ_UNLOCK();
11611
11612 return 0;
11613 }
11614
11615 static void
nspace_resolver_req_wait_pending_completion(struct nspace_resolver_request * req)11616 nspace_resolver_req_wait_pending_completion(struct nspace_resolver_request *req)
11617 {
11618 /*
11619 * If a completion is in-progress, we have to wait for the
11620 * completion handler to finish because it's still using 'req',
11621 * which is allocated on our stack a couple of frames up.
11622 */
11623 while ((req->r_flags & RRF_COMPLETING) != 0) {
11624 (void) msleep(req, &nspace_resolver_request_hash_mutex,
11625 PVFS, "nspacecmplt", NULL);
11626 }
11627 }
11628
11629 static void
nspace_resolver_req_remove_and_unlock(struct nspace_resolver_request * req)11630 nspace_resolver_req_remove_and_unlock(struct nspace_resolver_request *req)
11631 {
11632 struct nspace_resolver_requesthead *bucket;
11633
11634 /* We're called with NSPACE_REQ_LOCK held. */
11635
11636 bucket = NSPACE_RESOLVER_HASH(req->r_req_id);
11637 #if DIAGNOSTIC
11638 assert((req->r_flags & RRF_COMPLETING) == 0);
11639 assert(nspace_resolver_req_lookup(req->r_req_id, false) != NULL);
11640 #endif /* DIAGNOSTIC */
11641 LIST_REMOVE(req, r_hashlink);
11642 nspace_resolver_request_count--;
11643
11644 if (nspace_resolver_request_wait_slot) {
11645 nspace_resolver_request_wait_slot = false;
11646 wakeup(&nspace_resolver_request_count);
11647 }
11648
11649 nspace_resolver_req_wait_pending_completion(req);
11650
11651 NSPACE_REQ_UNLOCK();
11652 }
11653
/* Remove 'req' from the table; takes and drops the table lock itself. */
static void
nspace_resolver_req_remove(struct nspace_resolver_request *req)
{
	NSPACE_REQ_LOCK();
	nspace_resolver_req_remove_and_unlock(req);
}
11660
11661 static void
nspace_resolver_req_cancel(uint32_t req_id)11662 nspace_resolver_req_cancel(uint32_t req_id)
11663 {
11664 kern_return_t kr;
11665 mach_port_t mp;
11666
11667 // Failures here aren't fatal -- the cancellation message
11668 // sent to the resolver is merely advisory.
11669
11670 kr = host_get_filecoordinationd_port(host_priv_self(), &mp);
11671 if (kr != KERN_SUCCESS || !IPC_PORT_VALID(mp)) {
11672 return;
11673 }
11674
11675 kr = send_nspace_resolve_cancel(mp, req_id);
11676 if (kr != KERN_SUCCESS) {
11677 os_log_error(OS_LOG_DEFAULT,
11678 "NSPACE send_nspace_resolve_cancel failure: %d", kr);
11679 }
11680
11681 ipc_port_release_send(mp);
11682 }
11683
/*
 * Sleep until the resolver marks 'req' complete, then remove it from
 * the table and return the resolver's errno result.  If the sleep is
 * interrupted, record EINTR (or ETIMEDOUT) as the result ourselves and
 * send an advisory cancel to filecoordinationd.
 */
static int
nspace_resolver_req_wait(struct nspace_resolver_request *req)
{
	bool send_cancel_message = false;
	int error;

	NSPACE_REQ_LOCK();

	while ((req->r_flags & RRF_COMPLETE) == 0) {
		error = msleep(req, &nspace_resolver_request_hash_mutex,
		    PVFS | PCATCH, "nspace", NULL);
		/* ERESTART just means "keep waiting"; anything else aborts. */
		if (error && error != ERESTART) {
			req->r_resolver_error = (error == EINTR) ? EINTR :
			    ETIMEDOUT;
			send_cancel_message = true;
			break;
		}
	}

	nspace_resolver_req_remove_and_unlock(req);

	/*
	 * It's safe to continue referencing 'req' here because it's
	 * allocated on our caller's stack.
	 */

	if (send_cancel_message) {
		nspace_resolver_req_cancel(req->r_req_id);
	}

	return req->r_resolver_error;
}
11716
11717 static void
nspace_resolver_req_mark_complete(struct nspace_resolver_request * req,int resolver_error)11718 nspace_resolver_req_mark_complete(
11719 struct nspace_resolver_request *req,
11720 int resolver_error)
11721 {
11722 req->r_resolver_error = resolver_error;
11723 req->r_flags = (req->r_flags & ~RRF_COMPLETING) | RRF_COMPLETE;
11724 wakeup(req);
11725 }
11726
/*
 * Flag 'req' as having a completion in progress so that lookups with
 * skip_completing set won't return it and teardown waits for the
 * completion handler to finish.
 */
static void
nspace_resolver_req_mark_completion_pending(struct nspace_resolver_request *req)
{
	req->r_flags |= RRF_COMPLETING;
}
11732
/*
 * Process a completion tuple from filecoordinationd.  Looks up the
 * outstanding request by ID; if the resolver supplied namespace-shape
 * criteria (a recursive gencount and/or a sync-root ID), verifies that
 * the tree has not changed shape since the request was issued (under
 * the mount rename lock), then marks the request complete and wakes
 * the waiting thread.
 */
static void
nspace_resolver_req_completed(const struct nspace_resolver_completion_data *c)
{
	struct nspace_resolver_request *req;
	int error;
	struct vnode_attr va;
	vnode_t vp;

	NSPACE_REQ_LOCK();

	req = nspace_resolver_req_lookup(c->req_id, true);
	if (req == NULL) {
		/*
		 * If we don't find the request corresponding to our req_id,
		 * just drop the completion on the floor; it's likely that
		 * the requester interrupted with a signal, or it may already
		 * be completing.
		 */
		NSPACE_REQ_UNLOCK();
		return;
	}

	/*
	 * Get out now if the resolver reported an error.
	 */
	if ((error = c->resolver_error) != 0) {
		goto out;
	}

	/*
	 * If the resolver did not specify any namespace shape criteria
	 * for letting the operation proceed, then get out now.
	 */
	if (c->orig_gencount == 0 && c->orig_syncroot == 0) {
		goto out;
	}

	/*
	 * We're going to have to acquire the mount rename lock and do
	 * some I/O in order to verify the criteria.  Mark the request
	 * as pending so no one else messes with it after we drop the
	 * NSPACE_REQ_LOCK.
	 */
	nspace_resolver_req_mark_completion_pending(req);
	NSPACE_REQ_UNLOCK();

	/*
	 * Lock out renames from changing the shape of the tree while
	 * validate the criteria.
	 */
	mount_t locked_mp = req->r_vp->v_mount;
	mount_ref(locked_mp, 0);
	mount_lock_renames(locked_mp);

	if (c->orig_gencount != 0) {
		vp = req->r_vp;
		/* NOTE(review): error is always 0 at this point (checked
		 * above); this test looks vestigial — confirm. */
		if (error) {
			goto out_dropmount;
		}

		VATTR_INIT(&va);
		VATTR_WANTED(&va, va_recursive_gencount);
		error = vnode_getattr(vp, &va, vfs_context_kernel());
		if (error) {
			goto out_dropmount;
		}
		/* Any change in the recursive gencount fails the request. */
		if (VATTR_NOT_RETURNED(&va, va_recursive_gencount) ||
		    va.va_recursive_gencount != c->orig_gencount) {
			printf("nspace.complete: gencount changed! (orig %llu cur %llu)\n",
			    c->orig_gencount, va.va_recursive_gencount);
			error = EBUSY;
			goto out_dropmount;
		}
	}

	/*
	 * Ignore orig_syncroot if a destination directory wasn't specified
	 * in the request.
	 */
	if (c->orig_syncroot != 0 && (vp = req->r_tdvp) != NULL) {
		uint64_t syncroot_id;

		if (error) {
			goto out_dropmount;
		}

#ifndef APFSIOC_GET_SYNC_ROOT
#define APFSIOC_GET_SYNC_ROOT _IOR('J', 115, uint64_t)
#endif

		error = VNOP_IOCTL(vp, APFSIOC_GET_SYNC_ROOT,
		    (caddr_t)&syncroot_id, 0, vfs_context_kernel());
		if (error) {
			goto out_dropmount;
		}
		if (syncroot_id != c->orig_syncroot) {
			printf("nspace.complete: syncroot changed! (orig %llu cur %llu)\n",
			    c->orig_syncroot, syncroot_id);
			error = EBUSY;
			goto out_dropmount;
		}
	}

out_dropmount:
	mount_unlock_renames(locked_mp);
	mount_drop(locked_mp, 0);
	NSPACE_REQ_LOCK();

out:
	/* Record the final result (possibly EBUSY from the checks above). */
	nspace_resolver_req_mark_complete(req, error);
	NSPACE_REQ_UNLOCK();
}
11845
/* The process currently decorated as the namespace resolver, if any. */
static struct proc *nspace_resolver_proc;
11847
11848 static int
nspace_resolver_get_proc_state(struct proc * p,int * is_resolver)11849 nspace_resolver_get_proc_state(struct proc *p, int *is_resolver)
11850 {
11851 *is_resolver = ((p->p_lflag & P_LNSPACE_RESOLVER) &&
11852 p == nspace_resolver_proc) ? 1 : 0;
11853 return 0;
11854 }
11855
/* Forward declaration (used by nspace_resolver_set_proc_state below). */
static boolean_t vfs_context_is_dataless_resolver(vfs_context_t);
11857
/*
 * Register or unregister 'p' as the namespace resolver.  The caller
 * must be root and pass the dataless-resolver context check; only one
 * resolver may be registered at a time (EBUSY otherwise).
 */
static int
nspace_resolver_set_proc_state(struct proc *p, int is_resolver)
{
	vfs_context_t ctx = vfs_context_current();
	int error = 0;

	//
	// The system filecoordinationd runs as uid == 0. This also
	// has the nice side-effect of filtering out filecoordinationd
	// running in the simulator.
	//
	if (!vfs_context_issuser(ctx) ||
	    !vfs_context_is_dataless_resolver(ctx)) {
		return EPERM;
	}

	if (is_resolver) {
		NSPACE_REQ_LOCK();

		if (nspace_resolver_proc == NULL) {
			/* decorate the proc and record it as the resolver */
			proc_lock(p);
			p->p_lflag |= P_LNSPACE_RESOLVER;
			proc_unlock(p);
			nspace_resolver_proc = p;
		} else {
			error = EBUSY;
		}

		NSPACE_REQ_UNLOCK();
	} else {
		// This is basically just like the exit case.
		// nspace_resolver_exited() will verify that the
		// process is the resolver, and will clear the
		// global.
		nspace_resolver_exited(p);
	}

	return error;
}
11897
11898 static int
nspace_materialization_get_proc_state(struct proc * p,int * is_prevented)11899 nspace_materialization_get_proc_state(struct proc *p, int *is_prevented)
11900 {
11901 if ((p->p_lflag & P_LNSPACE_RESOLVER) != 0 ||
11902 (p->p_vfs_iopolicy &
11903 P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES) == 0) {
11904 *is_prevented = 1;
11905 } else {
11906 *is_prevented = 0;
11907 }
11908 return 0;
11909 }
11910
11911 static int
nspace_materialization_set_proc_state(struct proc * p,int is_prevented)11912 nspace_materialization_set_proc_state(struct proc *p, int is_prevented)
11913 {
11914 if (p->p_lflag & P_LNSPACE_RESOLVER) {
11915 return is_prevented ? 0 : EBUSY;
11916 }
11917
11918 if (is_prevented) {
11919 OSBitAndAtomic16(~((uint16_t)P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES), &p->p_vfs_iopolicy);
11920 } else {
11921 OSBitOrAtomic16((uint16_t)P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES, &p->p_vfs_iopolicy);
11922 }
11923 return 0;
11924 }
11925
11926 static int
nspace_materialization_get_thread_state(int * is_prevented)11927 nspace_materialization_get_thread_state(int *is_prevented)
11928 {
11929 uthread_t ut = current_uthread();
11930
11931 *is_prevented = (ut->uu_flag & UT_NSPACE_NODATALESSFAULTS) ? 1 : 0;
11932 return 0;
11933 }
11934
11935 static int
nspace_materialization_set_thread_state(int is_prevented)11936 nspace_materialization_set_thread_state(int is_prevented)
11937 {
11938 uthread_t ut = current_uthread();
11939
11940 if (is_prevented) {
11941 ut->uu_flag |= UT_NSPACE_NODATALESSFAULTS;
11942 } else {
11943 ut->uu_flag &= ~UT_NSPACE_NODATALESSFAULTS;
11944 }
11945 return 0;
11946 }
11947
/* the vfs.nspace branch: parent node for the resolver control sysctls below */
SYSCTL_NODE(_vfs, OID_AUTO, nspace, CTLFLAG_RW | CTLFLAG_LOCKED, NULL, "vfs nspace hinge");
11950
11951 static int
sysctl_nspace_resolver(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)11952 sysctl_nspace_resolver(__unused struct sysctl_oid *oidp,
11953 __unused void *arg1, __unused int arg2, struct sysctl_req *req)
11954 {
11955 struct proc *p = req->p;
11956 int new_value, old_value, changed = 0;
11957 int error;
11958
11959 error = nspace_resolver_get_proc_state(p, &old_value);
11960 if (error) {
11961 return error;
11962 }
11963
11964 error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
11965 &changed);
11966 if (error == 0 && changed) {
11967 error = nspace_resolver_set_proc_state(p, new_value);
11968 }
11969 return error;
11970 }
11971
/* vfs.nspace.resolver (int, rw, any user): decorate this process as the dataless file resolver */
SYSCTL_PROC(_vfs_nspace, OID_AUTO, resolver,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
    0, 0, sysctl_nspace_resolver, "I", "");
11976
11977 static int
sysctl_nspace_prevent_materialization(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)11978 sysctl_nspace_prevent_materialization(__unused struct sysctl_oid *oidp,
11979 __unused void *arg1, __unused int arg2, struct sysctl_req *req)
11980 {
11981 struct proc *p = req->p;
11982 int new_value, old_value, changed = 0;
11983 int error;
11984
11985 error = nspace_materialization_get_proc_state(p, &old_value);
11986 if (error) {
11987 return error;
11988 }
11989
11990 error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
11991 &changed);
11992 if (error == 0 && changed) {
11993 error = nspace_materialization_set_proc_state(p, new_value);
11994 }
11995 return error;
11996 }
11997
/* vfs.nspace.prevent_materialization (int, rw, any user): decorate this process as not wanting to materialize dataless files */
SYSCTL_PROC(_vfs_nspace, OID_AUTO, prevent_materialization,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
    0, 0, sysctl_nspace_prevent_materialization, "I", "");
12002
12003 static int
sysctl_nspace_thread_prevent_materialization(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)12004 sysctl_nspace_thread_prevent_materialization(__unused struct sysctl_oid *oidp,
12005 __unused void *arg1, __unused int arg2, struct sysctl_req *req)
12006 {
12007 int new_value, old_value, changed = 0;
12008 int error;
12009
12010 error = nspace_materialization_get_thread_state(&old_value);
12011 if (error) {
12012 return error;
12013 }
12014
12015 error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
12016 &changed);
12017 if (error == 0 && changed) {
12018 error = nspace_materialization_set_thread_state(new_value);
12019 }
12020 return error;
12021 }
12022
/* vfs.nspace.thread_prevent_materialization (int, rw, any user): decorate this thread as not wanting to materialize dataless files */
SYSCTL_PROC(_vfs_nspace, OID_AUTO, thread_prevent_materialization,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
    0, 0, sysctl_nspace_thread_prevent_materialization, "I", "");
12027
/*
 * sysctl handler for vfs.nspace.complete: the resolver reports the
 * completion of a materialization request here.
 *
 * The write payload is consumed in order:
 *   - uint32_t req_status[2]: { request id, resolver errno } (required)
 *   - uint64_t gencount: original generation count (optional)
 *   - uint64_t syncroot: original syncroot ID (optional)
 *
 * Only a process decorated as the resolver may call this; everyone
 * else gets EPERM.
 */
static int
sysctl_nspace_complete(__unused struct sysctl_oid *oidp, __unused void *arg1,
    __unused int arg2, struct sysctl_req *req)
{
	struct proc *p = req->p;
	uint32_t req_status[2] = { 0, 0 };
	uint64_t gencount = 0;
	uint64_t syncroot = 0;
	int error, is_resolver, changed = 0, other_changed;

	error = nspace_resolver_get_proc_state(p, &is_resolver);
	if (error) {
		return error;
	}

	if (!is_resolver) {
		return EPERM;
	}

	/* The { req_id, errno } pair is mandatory. */
	error = sysctl_io_opaque(req, req_status, sizeof(req_status),
	    &changed);
	if (error) {
		return error;
	}

	/*
	 * Get the gencount if it was passed.  Ignore errors, because
	 * it's optional.
	 */
	error = sysctl_io_opaque(req, &gencount, sizeof(gencount),
	    &other_changed);
	if (error) {
		gencount = 0;
		error = 0;
	}

	/*
	 * ...and now the syncroot ID (also optional).
	 */
	error = sysctl_io_opaque(req, &syncroot, sizeof(syncroot),
	    &other_changed);
	if (error) {
		syncroot = 0;
		error = 0;
	}

	/*
	 * req_status[0] is the req_id
	 *
	 * req_status[1] is the errno
	 */
	if (error == 0 && changed) {
		const struct nspace_resolver_completion_data cd = {
			.req_id = req_status[0],
			.resolver_error = req_status[1],
			.orig_gencount = gencount,
			.orig_syncroot = syncroot,
		};
		nspace_resolver_req_completed(&cd);
	}
	return error;
}
12090
/* vfs.nspace.complete (opaque, rw, any user): the resolver reports completed requests here */
SYSCTL_PROC(_vfs_nspace, OID_AUTO, complete,
    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
    0, 0, sysctl_nspace_complete, "-", "");
12095
12096 #endif /* CONFIG_DATALESS_FILES */
12097
12098 #if CONFIG_DATALESS_FILES
12099 #define __no_dataless_unused /* nothing */
12100 #else
12101 #define __no_dataless_unused __unused
12102 #endif
12103
/*
 * Decide whether the given vfs context is allowed to materialize
 * dataless files.
 *
 * Returns:
 *   0            materialization may proceed
 *   EDEADLK      materialization is prevented (the default); callers
 *                report this as if a deadlock had been detected
 *   EJUSTRETURN  the context holds the dataless-manipulation (or
 *                resolver) entitlement; the operation proceeds as if
 *                the object were not dataless, subject to per-op
 *                sanity checks by the caller
 *
 * Note the precedence order below: kernel context, then entitlement,
 * then per-thread decorations, then the process iopolicy.
 */
int
vfs_context_dataless_materialization_is_prevented(
	vfs_context_t const ctx __no_dataless_unused)
{
#if CONFIG_DATALESS_FILES
	proc_t const p = vfs_context_proc(ctx);
	thread_t const t = vfs_context_thread(ctx);
	uthread_t const ut = t ? get_bsdthread_info(t) : NULL;

	/*
	 * Kernel context ==> return EDEADLK, as we would with any random
	 * process decorated as no-materialize.
	 */
	if (ctx == vfs_context_kernel()) {
		return EDEADLK;
	}

	/*
	 * If the process has the dataless-manipulation entitlement,
	 * materialization is prevented, and depending on the kind
	 * of file system operation, things get to proceed as if the
	 * object is not dataless.
	 */
	if (vfs_context_is_dataless_manipulator(ctx)) {
		return EJUSTRETURN;
	}

	/*
	 * Per-thread decorations override any process-wide decorations.
	 * (Foundation uses this, and this overrides even the dataless-
	 * manipulation entitlement so as to make API contracts consistent.)
	 */
	if (ut != NULL) {
		if (ut->uu_flag & UT_NSPACE_NODATALESSFAULTS) {
			return EDEADLK;
		}
		if (ut->uu_flag & UT_NSPACE_FORCEDATALESSFAULTS) {
			return 0;
		}
	}

	/*
	 * If the process's iopolicy specifies that dataless files
	 * can be materialized, then we let it go ahead.
	 */
	if (p->p_vfs_iopolicy & P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES) {
		return 0;
	}
#endif /* CONFIG_DATALESS_FILES */

	/*
	 * The default behavior is to not materialize dataless files;
	 * return to the caller that deadlock was detected.
	 */
	return EDEADLK;
}
12160
/*
 * One-time initialization for the dataless-file resolver machinery:
 * allocates the hash table used to track in-flight materialization
 * requests.  No-op when CONFIG_DATALESS_FILES is disabled.
 */
void
nspace_resolver_init(void)
{
#if CONFIG_DATALESS_FILES
	nspace_resolver_request_hashtbl =
	    hashinit(NSPACE_RESOLVER_REQ_HASHSIZE,
	    M_VNODE /* XXX */, &nspace_resolver_request_hashmask);
#endif /* CONFIG_DATALESS_FILES */
}
12170
/*
 * Process-exit hook for the dataless-file resolver.
 *
 * If the exiting process is the currently-registered resolver, every
 * in-flight materialization request is failed with ETIMEDOUT (after
 * waiting out any completion already in progress) and the resolver
 * registration is cleared, so blocked requesters do not hang forever.
 */
void
nspace_resolver_exited(struct proc *p __no_dataless_unused)
{
#if CONFIG_DATALESS_FILES
	struct nspace_resolver_requesthead *bucket;
	struct nspace_resolver_request *req;
	u_long idx;

	NSPACE_REQ_LOCK();

	if ((p->p_lflag & P_LNSPACE_RESOLVER) &&
	    p == nspace_resolver_proc) {
		/* Walk every hash bucket and fail each pending request. */
		for (idx = 0; idx <= nspace_resolver_request_hashmask; idx++) {
			bucket = &nspace_resolver_request_hashtbl[idx];
			LIST_FOREACH(req, bucket, r_hashlink) {
				nspace_resolver_req_wait_pending_completion(req);
				nspace_resolver_req_mark_complete(req,
				    ETIMEDOUT);
			}
		}
		nspace_resolver_proc = NULL;
	}

	NSPACE_REQ_UNLOCK();
#endif /* CONFIG_DATALESS_FILES */
}
12197
12198 #define DATALESS_RESOLVER_ENTITLEMENT \
12199 "com.apple.private.vfs.dataless-resolver"
12200 #define DATALESS_MANIPULATION_ENTITLEMENT \
12201 "com.apple.private.vfs.dataless-manipulation"
12202
#if CONFIG_DATALESS_FILES
/*
 * Return TRUE if the vfs context is associated with the dataless
 * resolver, i.e. its task holds the dataless-resolver entitlement.
 */
static boolean_t
vfs_context_is_dataless_resolver(vfs_context_t ctx __no_dataless_unused)
{
	return IOTaskHasEntitlement(vfs_context_task(ctx),
	           DATALESS_RESOLVER_ENTITLEMENT);
}
#endif /* CONFIG_DATALESS_FILES */
12215
12216 /*
12217 * Return TRUE if the vfs context is associated with a process entitled
12218 * for dataless manipulation.
12219 *
12220 * XXX Arguably belongs in vfs_subr.c, but is here because of the
12221 * complication around CONFIG_DATALESS_FILES.
12222 */
12223 boolean_t
vfs_context_is_dataless_manipulator(vfs_context_t ctx __no_dataless_unused)12224 vfs_context_is_dataless_manipulator(vfs_context_t ctx __no_dataless_unused)
12225 {
12226 #if CONFIG_DATALESS_FILES
12227 task_t task = vfs_context_task(ctx);
12228 return IOTaskHasEntitlement(task, DATALESS_MANIPULATION_ENTITLEMENT) ||
12229 IOTaskHasEntitlement(task, DATALESS_RESOLVER_ENTITLEMENT);
12230 #else
12231 return false;
12232 #endif /* CONFIG_DATALESS_FILES */
12233 }
12234
#if CONFIG_DATALESS_FILES
/*
 * Log (at debug level) that the current process was denied
 * materialization of the given dataless vnode.  On DEVELOPMENT
 * kernels, additionally try to include the object's fsid/fileid.
 *
 * Fixes vs. previous version:
 *  - 'vntype' is now const-qualified (it only ever points at string
 *    literals);
 *  - 'op' is uint64_t, so it is formatted with %llu rather than the
 *    signed %lld;
 *  - the DEVELOPMENT-only kalloc_type() result is NULL-checked before
 *    use (we fall back to the attribute-less log line on failure).
 */
static void
log_materialization_prevented(vnode_t vp, uint64_t op)
{
	char p_name[MAXCOMLEN + 1];
	const char *vntype;

	proc_selfname(&p_name[0], sizeof(p_name));

	switch (vp->v_type) {
	case VREG:
		vntype = "File";
		break;
	case VDIR:
		vntype = "Dir";
		break;
	case VLNK:
		vntype = "SymLink";
		break;
	default:
		vntype = "Other";
		break;
	}

#if DEVELOPMENT
	struct vnode_attr *vap = kalloc_type(struct vnode_attr, Z_WAITOK);

	if (vap != NULL) {
		VATTR_INIT(vap);
		VATTR_WANTED(vap, va_fsid);
		VATTR_WANTED(vap, va_fileid);
	}
	if (vap != NULL &&
	    vnode_getattr(vp, vap, vfs_context_current()) == 0) {
		/* fsid is logged twice on purpose: once hex, once decimal. */
		os_log_debug(OS_LOG_DEFAULT,
		    "NSPACE process %s (pid %d) is decorated as no-materialization (op %llu; %s) fsid 0x%08x/%u fileid=%llu",
		    p_name, proc_selfpid(), op, vntype,
		    vap->va_fsid, vap->va_fsid, vap->va_fileid);
	} else
#endif
	{
		os_log_debug(OS_LOG_DEFAULT,
		    "NSPACE process %s (pid %d) is decorated as no-materialization (op %llu; %s)",
		    p_name, proc_selfpid(), op, vntype);
	}
#if DEVELOPMENT
	kfree_type(struct vnode_attr, vap);
#endif
}
#endif /* CONFIG_DATALESS_FILES */
12276
/*
 * vfs_materialize_item: common backend for materializing a dataless
 * item (regular file, directory, or rename source).  Sends a
 * resolution request to the user-space dataless-file resolver via the
 * filecoordinationd Mach port and (interruptibly) waits for the
 * resolver to report completion via vfs.nspace.complete.
 *
 * Returns 0 when the caller may proceed with the operation, an errno
 * otherwise.  ETIMEDOUT is used to mean "cannot reach the resolver".
 */
static int
vfs_materialize_item(
	vnode_t vp __no_dataless_unused,
	uint32_t op __no_dataless_unused,
	int64_t offset __no_dataless_unused,
	int64_t size __no_dataless_unused,
	char *lookup_name __no_dataless_unused,
	size_t const namelen __no_dataless_unused,
	vnode_t tdvp __no_dataless_unused)
{
#if CONFIG_DATALESS_FILES
	kern_return_t kern_ret;
	mach_port_t mach_port;
	char *path = NULL;
	vfs_context_t context;
	int path_len;
	int error;
	audit_token_t atoken;
	enum vtype vp_vtype;

	/* Swap files are special; ignore them */
	if (vnode_isswap(vp)) {
		return 0;
	}

	/*
	 * NAMESPACE_HANDLER_SNAPSHOT_EVENT and NAMESPACE_HANDLER_TRACK_EVENT
	 * are no longer used nor supported.
	 */
	if (op & NAMESPACE_HANDLER_SNAPSHOT_EVENT) {
		os_log_debug(OS_LOG_DEFAULT, "NSPACE SNAPSHOT not handled");
		return ENOTSUP;
	}
	if (op & NAMESPACE_HANDLER_TRACK_EVENT) {
		os_log_debug(OS_LOG_DEFAULT, "NSPACE TRACK not handled");
		return ENOTSUP;
	}

	/* Normalize 'op': strip the event-type bits, keeping the op code. */
	op &= ~NAMESPACE_HANDLER_EVENT_TYPE_MASK;

	/*
	 * To-directory is only meaningful for rename operations;
	 * ignore it if someone handed one to us unexpectedly.
	 */
	if (op != NAMESPACE_HANDLER_RENAME_OP) {
		tdvp = NULL;
	}

	context = vfs_context_current();

	/*
	 * Remember this for later: the vnode type is consulted again in
	 * the EJUSTRETURN sanity checks once the request completes.
	 */
	vp_vtype = vnode_vtype(vp);

	error = vfs_context_dataless_materialization_is_prevented(context);
	if (error) {
		log_materialization_prevented(vp, op);
		goto out_check_errors;
	}

	kern_ret = host_get_filecoordinationd_port(host_priv_self(),
	    &mach_port);
	if (kern_ret != KERN_SUCCESS || !IPC_PORT_VALID(mach_port)) {
		os_log_error(OS_LOG_DEFAULT, "NSPACE no port");
		/*
		 * Treat this like being unable to access the backing store
		 * server.
		 */
		return ETIMEDOUT;
	}

	/*
	 * Build the vnode's path, growing the buffer by MAXPATHLEN on
	 * ENOSPC, up to FSGETPATH_MAXBUFLEN.
	 *
	 * NOTE(review): if vn_getpath() still reports ENOSPC at the
	 * maximum buffer size, the loop exits with path == NULL and
	 * execution falls through to the IPC below — confirm whether
	 * that case should instead fail out.
	 * NOTE(review): the ENOMEM return below does not release the
	 * send right obtained above (cf. out_release_port) — verify.
	 */
	int path_alloc_len = MAXPATHLEN;
	do {
		path = kalloc_data(path_alloc_len, Z_WAITOK | Z_ZERO);
		if (path == NULL) {
			return ENOMEM;
		}

		path_len = path_alloc_len;
		error = vn_getpath(vp, path, &path_len);
		if (error == 0) {
			break;
		} else if (error == ENOSPC) {
			kfree_data(path, path_alloc_len);
			path = NULL;
		} else {
			goto out_release_port;
		}
	} while (error == ENOSPC && (path_alloc_len += MAXPATHLEN) && path_alloc_len <= FSGETPATH_MAXBUFLEN);

	/* Identify the requester to the resolver via its audit token. */
	error = vfs_context_copy_audit_token(context, &atoken);
	if (error) {
		goto out_release_port;
	}

	struct nspace_resolver_request req = {
		.r_req_id = next_nspace_req_id(),
		.r_vp = vp,
		.r_tdvp = tdvp,
	};

	/* Publish the request so the completion path can find it. */
	error = nspace_resolver_req_add(&req);
	if (error) {
		goto out_release_port;
	}

	os_log_debug(OS_LOG_DEFAULT, "NSPACE resolve_path call");

	if (op == NAMESPACE_HANDLER_RENAME_OP && tdvp != NULL) {
		/* Rename: also send the destination directory's path. */
		char *dest_path = NULL;
		int dest_path_len;

		dest_path = zalloc(ZV_NAMEI);
		dest_path_len = MAXPATHLEN;

		error = vn_getpath(tdvp, dest_path, &dest_path_len);
		if (error) {
			zfree(ZV_NAMEI, dest_path);
			goto out_release_port;
		}

		/*
		 * Force setting NAMESPACE_HANDLER_NSPACE_EVENT for
		 * compatibility with existing agents in user-space
		 * who get passed this value.
		 */
		kern_ret = send_vfs_resolve_reparent_with_audit_token(mach_port,
		    req.r_req_id,
		    op | NAMESPACE_HANDLER_NSPACE_EVENT,
		    path, dest_path, atoken);

		zfree(ZV_NAMEI, dest_path);
	} else if (vp_vtype == VDIR) {
		char *tmpname = NULL;

		/*
		 * If the caller provided a lookup_name *and* a name length,
		 * then we assume the lookup_name is not NUL-terminated.
		 * Allocate a temporary buffer in this case to provide
		 * a NUL-terminated path name to the IPC call.
		 */
		if (lookup_name != NULL && namelen != 0) {
			if (namelen >= PATH_MAX) {
				error = EINVAL;
				goto out_req_remove;
			}
			tmpname = zalloc(ZV_NAMEI);
			strlcpy(tmpname, lookup_name, namelen + 1);
			lookup_name = tmpname;
		} else if (lookup_name != NULL) {
			/*
			 * If the caller provided a lookup_name with a
			 * zero name length, then we assume it's NUL-
			 * terminated.  Verify it has a valid length.
			 */
			if (strlen(lookup_name) >= PATH_MAX) {
				error = EINVAL;
				goto out_req_remove;
			}
		}

		/* (See the NSPACE_EVENT compatibility note above.) */
		kern_ret = send_vfs_resolve_dir_with_audit_token(mach_port,
		    req.r_req_id,
		    op | NAMESPACE_HANDLER_NSPACE_EVENT,
		    lookup_name == NULL ? "" : lookup_name, path, atoken);

		if (tmpname != NULL) {
			zfree(ZV_NAMEI, tmpname);

			/*
			 * Poison lookup_name rather than reference
			 * freed memory.
			 */
			lookup_name = NULL;
		}
	} else {
		/* Regular file. (See the NSPACE_EVENT note above.) */
		kern_ret = send_vfs_resolve_file_with_audit_token(mach_port,
		    req.r_req_id,
		    op | NAMESPACE_HANDLER_NSPACE_EVENT,
		    offset, size, path, atoken);
	}
	if (kern_ret != KERN_SUCCESS) {
		/*
		 * Also treat this like being unable to access the backing
		 * store server.
		 */
		os_log_error(OS_LOG_DEFAULT, "NSPACE resolve failure: %d",
		    kern_ret);
		error = ETIMEDOUT;
		goto out_req_remove;
	}

	/*
	 * Give back the memory we allocated earlier while we wait; we
	 * no longer need it.
	 */
	kfree_data(path, path_alloc_len);
	path = NULL;

	/*
	 * Request has been submitted to the resolver.  Now (interruptibly)
	 * wait for completion.  Upon return, the request will have been
	 * removed from the lookup table.
	 */
	error = nspace_resolver_req_wait(&req);

out_release_port:
	if (path != NULL) {
		kfree_data(path, path_alloc_len);
		path = NULL;
	}
	ipc_port_release_send(mach_port);

out_check_errors:
	/*
	 * The file resolver owns the logic about what error to return
	 * to the caller.  We only need to handle a couple of special
	 * cases here:
	 */
	if (error == EJUSTRETURN) {
		/*
		 * The requesting process is allowed to interact with
		 * dataless objects.  Make a couple of sanity-checks
		 * here to ensure the action makes sense.
		 */
		switch (op) {
		case NAMESPACE_HANDLER_WRITE_OP:
		case NAMESPACE_HANDLER_TRUNCATE_OP:
		case NAMESPACE_HANDLER_RENAME_OP:
			/*
			 * This handles the case of the resolver itself
			 * writing data to the file (or throwing it
			 * away).
			 */
			error = 0;
			break;
		case NAMESPACE_HANDLER_READ_OP:
		case NAMESPACE_HANDLER_LOOKUP_OP:
			/*
			 * This handles the case of the resolver needing
			 * to look up inside of a dataless directory while
			 * it's in the process of materializing it (for
			 * example, creating files or directories).
			 */
			error = (vp_vtype == VDIR) ? 0 : EBADF;
			break;
		default:
			error = EBADF;
			break;
		}
	}

	return error;

out_req_remove:
	nspace_resolver_req_remove(&req);
	goto out_release_port;
#else
	return ENOTSUP;
#endif /* CONFIG_DATALESS_FILES */
}
12540
12541 /*
12542 * vfs_materialize_file: Materialize a regular file.
12543 *
12544 * Inputs:
12545 * vp The dataless file to be materialized.
12546 *
12547 * op What kind of operation is being performed:
12548 * -> NAMESPACE_HANDLER_READ_OP
12549 * -> NAMESPACE_HANDLER_WRITE_OP
12550 * -> NAMESPACE_HANDLER_LINK_CREATE
12551 * -> NAMESPACE_HANDLER_DELETE_OP
12552 * -> NAMESPACE_HANDLER_TRUNCATE_OP
12553 * -> NAMESPACE_HANDLER_RENAME_OP
12554 *
12555 * offset offset of I/O for READ or WRITE. Ignored for
12556 * other ops.
12557 *
12558 * size size of I/O for READ or WRITE Ignored for
12559 * other ops.
12560 *
12561 * If offset or size are -1 for a READ or WRITE, then the resolver should
12562 * consider the range to be unknown.
12563 *
12564 * Upon successful return, the caller may proceed with the operation.
12565 * N.B. the file may still be "dataless" in this case.
12566 */
12567 int
vfs_materialize_file(struct vnode * vp,uint64_t op,int64_t offset,int64_t size)12568 vfs_materialize_file(
12569 struct vnode *vp,
12570 uint64_t op,
12571 int64_t offset,
12572 int64_t size)
12573 {
12574 if (vp->v_type != VREG) {
12575 return EFTYPE;
12576 }
12577 return vfs_materialize_item(vp, (uint32_t)op, offset, size, NULL, 0,
12578 NULL);
12579 }
12580
12581 /*
12582 * vfs_materialize_dir:
12583 *
12584 * Inputs:
12585 * vp The dataless directory to be materialized.
12586 *
12587 * op What kind of operation is being performed:
12588 * -> NAMESPACE_HANDLER_READ_OP
12589 * -> NAMESPACE_HANDLER_WRITE_OP
12590 * -> NAMESPACE_HANDLER_DELETE_OP
12591 * -> NAMESPACE_HANDLER_RENAME_OP
12592 * -> NAMESPACE_HANDLER_LOOKUP_OP
12593 *
12594 * lookup_name Name being looked up for a LOOKUP op. Ignored for
12595 * other ops. May or may not be NUL-terminated; see below.
12596 *
12597 * namelen If non-zero, then lookup_name is assumed to not be NUL-
12598 * terminated and namelen is the number of valid bytes in
12599 * lookup_name. If zero, then lookup_name is assumed to be
12600 * NUL-terminated.
12601 *
12602 * Upon successful return, the caller may proceed with the operation.
12603 * N.B. the directory may still be "dataless" in this case.
12604 */
12605 int
vfs_materialize_dir(struct vnode * vp,uint64_t op,char * lookup_name,size_t namelen)12606 vfs_materialize_dir(
12607 struct vnode *vp,
12608 uint64_t op,
12609 char *lookup_name,
12610 size_t namelen)
12611 {
12612 if (vp->v_type != VDIR) {
12613 return EFTYPE;
12614 }
12615 if (op == NAMESPACE_HANDLER_LOOKUP_OP && lookup_name == NULL) {
12616 return EINVAL;
12617 }
12618 return vfs_materialize_item(vp, (uint32_t)op, 0, 0, lookup_name,
12619 namelen, NULL);
12620 }
12621
12622 /*
12623 * vfs_materialize_reparent:
12624 *
12625 * Inputs:
12626 * vp The dataless file or directory to be materialized.
12627 *
12628 * tdvp The new parent directory for the dataless file.
12629 *
12630 * Upon successful return, the caller may proceed with the operation.
12631 * N.B. the item may still be "dataless" in this case.
12632 */
12633 int
vfs_materialize_reparent(vnode_t vp,vnode_t tdvp)12634 vfs_materialize_reparent(vnode_t vp, vnode_t tdvp)
12635 {
12636 if (vp->v_type != VDIR && vp->v_type != VREG) {
12637 return EFTYPE;
12638 }
12639 return vfs_materialize_item(vp, NAMESPACE_HANDLER_RENAME_OP,
12640 0, 0, NULL, 0, tdvp);
12641 }
12642
#if 0
/*
 * Disabled: build a "/.vol/<fsid>/<fileid>" (volfs-style) path for the
 * given vnode.  On getattr failure, writes a sentinel path and returns
 * -1; otherwise returns 0.  In both cases *len is set to the formatted
 * length including the NUL.  Retained for reference only; candidate
 * for deletion.
 */
static int
build_volfs_path(struct vnode *vp, char *path, int *len)
{
	struct vnode_attr va;
	int ret;

	VATTR_INIT(&va);
	VATTR_WANTED(&va, va_fsid);
	VATTR_WANTED(&va, va_fileid);

	if (vnode_getattr(vp, &va, vfs_context_kernel()) != 0) {
		*len = snprintf(path, *len, "/non/existent/path/because/vnode_getattr/failed") + 1;
		ret = -1;
	} else {
		*len = snprintf(path, *len, "/.vol/%d/%lld", (dev_t)va.va_fsid, va.va_fileid) + 1;
		ret = 0;
	}

	return ret;
}
#endif
12665
12666 static unsigned long
fsctl_bogus_command_compat(unsigned long cmd)12667 fsctl_bogus_command_compat(unsigned long cmd)
12668 {
12669 switch (cmd) {
12670 case IOCBASECMD(FSIOC_SYNC_VOLUME):
12671 return FSIOC_SYNC_VOLUME;
12672 case IOCBASECMD(FSIOC_ROUTEFS_SETROUTEID):
12673 return FSIOC_ROUTEFS_SETROUTEID;
12674 case IOCBASECMD(FSIOC_SET_PACKAGE_EXTS):
12675 return FSIOC_SET_PACKAGE_EXTS;
12676 case IOCBASECMD(FSIOC_SET_FSTYPENAME_OVERRIDE):
12677 return FSIOC_SET_FSTYPENAME_OVERRIDE;
12678 case IOCBASECMD(DISK_CONDITIONER_IOC_GET):
12679 return DISK_CONDITIONER_IOC_GET;
12680 case IOCBASECMD(DISK_CONDITIONER_IOC_SET):
12681 return DISK_CONDITIONER_IOC_SET;
12682 case IOCBASECMD(FSIOC_FIOSEEKHOLE):
12683 return FSIOC_FIOSEEKHOLE;
12684 case IOCBASECMD(FSIOC_FIOSEEKDATA):
12685 return FSIOC_FIOSEEKDATA;
12686 case IOCBASECMD(SPOTLIGHT_IOC_GET_MOUNT_TIME):
12687 return SPOTLIGHT_IOC_GET_MOUNT_TIME;
12688 case IOCBASECMD(SPOTLIGHT_IOC_GET_LAST_MTIME):
12689 return SPOTLIGHT_IOC_GET_LAST_MTIME;
12690 }
12691
12692 return cmd;
12693 }
12694
/*
 * Setattr callback used by handle_flags()/chflags0(): performs the
 * compare-and-swap of BSD flags by forwarding the FSIOC_CAS_BSDFLAGS
 * ioctl to the filesystem.  'arg' is the struct fsioc_cas_bsdflags
 * being operated on.
 */
static int
cas_bsdflags_setattr(vnode_t vp, void *arg, vfs_context_t ctx)
{
	return VNOP_IOCTL(vp, FSIOC_CAS_BSDFLAGS, arg, FWRITE, ctx);
}
12700
/*
 * FSIOC_SYNC_VOLUME: sync the volume containing 'vp'.
 *
 * Drops the caller's iocount on vp (keeping a holdcount) so the sync
 * cannot deadlock against vnode iteration; on return, *arg_vp is set
 * to NULL to tell the caller that vp has already been released.
 */
static int __attribute__((noinline))
handle_sync_volume(vnode_t vp, vnode_t *arg_vp, caddr_t data, vfs_context_t ctx)
{
	struct vfs_attr vfa;
	mount_t mp = vp->v_mount;
	unsigned arg;
	int error;

	/* record vid of vp so we can drop it below. */
	uint32_t vvid = vp->v_id;

	/*
	 * Then grab mount_iterref so that we can release the vnode.
	 * Without this, a thread may call vnode_iterate_prepare then
	 * get into a deadlock because we've never released the root vp
	 */
	error = mount_iterref(mp, 0);
	if (error) {
		return error;
	}
	/* Keep a holdcount so vp can't be reclaimed while we work. */
	vnode_hold(vp);
	vnode_put(vp);

	arg = MNT_NOWAIT;
	if (*(uint32_t*)data & FSCTL_SYNC_WAIT) {
		arg = MNT_WAIT;
	}

	/*
	 * If the filesystem supports multiple filesystems in a
	 * partition (e.g. APFS volumes in a container), it knows
	 * that the waitfor argument to VFS_SYNC are flags.
	 */
	VFSATTR_INIT(&vfa);
	VFSATTR_WANTED(&vfa, f_capabilities);
	if ((vfs_getattr(mp, &vfa, vfs_context_current()) == 0) &&
	    VFSATTR_IS_SUPPORTED(&vfa, f_capabilities) &&
	    ((vfa.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_SHARED_SPACE)) &&
	    ((vfa.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_SHARED_SPACE))) {
		arg |= MNT_VOLUME;
	}

	/* issue the sync for this volume */
	(void)sync_callback(mp, &arg);

	/*
	 * Then release the mount_iterref once we're done syncing; it's not
	 * needed for the VNOP_IOCTL below
	 */
	mount_iterdrop(mp);

	/*
	 * NOTE(review): at this point 'arg' holds MNT_* flags, yet it is
	 * tested against the user-interface flag FSCTL_SYNC_FULLSYNC;
	 * presumably *(uint32_t*)data was intended here — confirm against
	 * the fsctl(2) contract before changing shipped behavior.
	 */
	if (arg & FSCTL_SYNC_FULLSYNC) {
		/* re-obtain vnode iocount on the root vp, if possible */
		error = vnode_getwithvid(vp, vvid);
		if (error == 0) {
			error = VNOP_IOCTL(vp, F_FULLFSYNC, (caddr_t)NULL, 0, ctx);
			vnode_put(vp);
		}
	}
	vnode_drop(vp);
	/* mark the argument VP as having been released */
	*arg_vp = NULL;
	return error;
}
12765
#if ROUTEFS
/*
 * FSIOC_ROUTEFS_SETROUTEID: mount routefs at a superuser-supplied
 * path copied in from user space.
 */
static int __attribute__((noinline))
handle_routes(user_addr_t udata)
{
	char routepath[MAXPATHLEN];
	size_t len = 0;
	int error;

	/* Superuser only. */
	error = suser(kauth_cred_get(), &(current_proc()->p_acflag));
	if (error != 0) {
		return error;
	}

	bzero(routepath, MAXPATHLEN);
	error = copyinstr(udata, &routepath[0], MAXPATHLEN, &len);
	if (error != 0) {
		return error;
	}

	return routefs_kernel_mount(routepath);
}
#endif
12786
12787 static int __attribute__((noinline))
handle_flags(vnode_t vp,caddr_t data,vfs_context_t ctx)12788 handle_flags(vnode_t vp, caddr_t data, vfs_context_t ctx)
12789 {
12790 struct fsioc_cas_bsdflags *cas = (struct fsioc_cas_bsdflags *)data;
12791 struct vnode_attr va;
12792 int error;
12793
12794 VATTR_INIT(&va);
12795 VATTR_SET(&va, va_flags, cas->new_flags);
12796
12797 error = chflags0(vp, &va, cas_bsdflags_setattr, cas, ctx);
12798
12799 #if CONFIG_FSE
12800 if (error == 0 && cas->expected_flags == cas->actual_flags && need_fsevent(FSE_STAT_CHANGED, vp)) {
12801 add_fsevent(FSE_STAT_CHANGED, ctx, FSE_ARG_VNODE, vp, FSE_ARG_DONE);
12802 }
12803 #endif
12804
12805 return error;
12806 }
12807
12808 static int __attribute__((noinline))
handle_auth(vnode_t vp,u_long cmd,caddr_t data,u_long options,vfs_context_t ctx)12809 handle_auth(vnode_t vp, u_long cmd, caddr_t data, u_long options, vfs_context_t ctx)
12810 {
12811 struct mount *mp = NULL;
12812 errno_t rootauth = 0;
12813
12814 mp = vp->v_mount;
12815
12816 /*
12817 * query the underlying FS and see if it reports something
12818 * sane for this vnode. If volume is authenticated via
12819 * chunklist, leave that for the caller to determine.
12820 */
12821 rootauth = VNOP_IOCTL(vp, cmd, data, (int)options, ctx);
12822
12823 return rootauth;
12824 }
12825
12826 #define SET_PACKAGE_EXTENSION_ENTITLEMENT \
12827 "com.apple.private.kernel.set-package-extensions"
12828
12829 /*
12830 * Make a filesystem-specific control call:
12831 */
12832 /* ARGSUSED */
12833 static int
fsctl_internal(proc_t p,vnode_t * arg_vp,u_long cmd,user_addr_t udata,u_long options,vfs_context_t ctx)12834 fsctl_internal(proc_t p, vnode_t *arg_vp, u_long cmd, user_addr_t udata, u_long options, vfs_context_t ctx)
12835 {
12836 int error = 0;
12837 boolean_t is64bit;
12838 u_int size;
12839 #define STK_PARAMS 128
12840 char stkbuf[STK_PARAMS] = {0};
12841 caddr_t data, memp;
12842 vnode_t vp = *arg_vp;
12843
12844 if (vp->v_type == VCHR || vp->v_type == VBLK) {
12845 return ENOTTY;
12846 }
12847
12848 cmd = fsctl_bogus_command_compat(cmd);
12849
12850 size = IOCPARM_LEN(cmd);
12851 if (size > IOCPARM_MAX) {
12852 return EINVAL;
12853 }
12854
12855 is64bit = proc_is64bit(p);
12856
12857 memp = NULL;
12858
12859 if (size > sizeof(stkbuf)) {
12860 if ((memp = (caddr_t)kalloc_data(size, Z_WAITOK)) == 0) {
12861 return ENOMEM;
12862 }
12863 data = memp;
12864 } else {
12865 data = &stkbuf[0];
12866 };
12867
12868 if (cmd & IOC_IN) {
12869 if (size) {
12870 error = copyin(udata, data, size);
12871 if (error) {
12872 if (memp) {
12873 kfree_data(memp, size);
12874 }
12875 return error;
12876 }
12877 } else {
12878 if (is64bit) {
12879 *(user_addr_t *)data = udata;
12880 } else {
12881 *(uint32_t *)data = (uint32_t)udata;
12882 }
12883 };
12884 } else if ((cmd & IOC_OUT) && size) {
12885 /*
12886 * Zero the buffer so the user always
12887 * gets back something deterministic.
12888 */
12889 bzero(data, size);
12890 } else if (cmd & IOC_VOID) {
12891 if (is64bit) {
12892 *(user_addr_t *)data = udata;
12893 } else {
12894 *(uint32_t *)data = (uint32_t)udata;
12895 }
12896 }
12897
12898 /* Check to see if it's a generic command */
12899 switch (cmd) {
12900 case FSIOC_SYNC_VOLUME:
12901 error = handle_sync_volume(vp, arg_vp, data, ctx);
12902 break;
12903
12904 case FSIOC_ROUTEFS_SETROUTEID:
12905 #if ROUTEFS
12906 error = handle_routes(udata);
12907 #endif
12908 break;
12909
12910 case FSIOC_SET_PACKAGE_EXTS: {
12911 user_addr_t ext_strings;
12912 uint32_t num_entries;
12913 uint32_t max_width;
12914
12915 if (!IOTaskHasEntitlement(vfs_context_task(ctx),
12916 SET_PACKAGE_EXTENSION_ENTITLEMENT)) {
12917 error = EPERM;
12918 break;
12919 }
12920
12921 if ((is64bit && size != sizeof(user64_package_ext_info))
12922 || (is64bit == 0 && size != sizeof(user32_package_ext_info))) {
12923 // either you're 64-bit and passed a 64-bit struct or
12924 // you're 32-bit and passed a 32-bit struct. otherwise
12925 // it's not ok.
12926 error = EINVAL;
12927 break;
12928 }
12929
12930 if (is64bit) {
12931 if (sizeof(user64_addr_t) > sizeof(user_addr_t)) {
12932 assert(((user64_package_ext_info *)data)->strings <= UINT32_MAX);
12933 }
12934 ext_strings = (user_addr_t)((user64_package_ext_info *)data)->strings;
12935 num_entries = ((user64_package_ext_info *)data)->num_entries;
12936 max_width = ((user64_package_ext_info *)data)->max_width;
12937 } else {
12938 ext_strings = CAST_USER_ADDR_T(((user32_package_ext_info *)data)->strings);
12939 num_entries = ((user32_package_ext_info *)data)->num_entries;
12940 max_width = ((user32_package_ext_info *)data)->max_width;
12941 }
12942 error = set_package_extensions_table(ext_strings, num_entries, max_width);
12943 }
12944 break;
12945
12946 case FSIOC_SET_FSTYPENAME_OVERRIDE:
12947 {
12948 mount_t mp;
12949
12950 if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
12951 break;
12952 }
12953 if ((mp = vp->v_mount) != NULL) {
12954 mount_lock(mp);
12955 if (data[0] != 0) {
12956 for (int i = 0; i < MFSTYPENAMELEN; i++) {
12957 if (!data[i]) {
12958 goto continue_copy;
12959 }
12960 }
12961 /*
12962 * Getting here means we have a user data
12963 * string which has no NULL termination in
12964 * its first MFSTYPENAMELEN bytes. This is
12965 * bogus, let's avoid strlcpy-ing the read
12966 * data and return an error.
12967 */
12968 error = EINVAL;
12969 goto unlock;
12970 continue_copy:
12971 vfs_setfstypename_locked(mp, data);
12972 if (vfs_isrdonly(mp) &&
12973 strcmp(data, "mtmfs") == 0) {
12974 mp->mnt_kern_flag |=
12975 MNTK_EXTENDED_SECURITY;
12976 mp->mnt_kern_flag &=
12977 ~MNTK_AUTH_OPAQUE;
12978 }
12979 } else if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
12980 const char *name =
12981 vfs_getfstypenameref_locked(mp, NULL);
12982 if (strcmp(name, "mtmfs") == 0) {
12983 mp->mnt_kern_flag &=
12984 ~MNTK_EXTENDED_SECURITY;
12985 }
12986 vfs_setfstypename_locked(mp, NULL);
12987 }
12988 unlock:
12989 mount_unlock(mp);
12990 }
12991 }
12992 break;
12993
12994 case DISK_CONDITIONER_IOC_GET: {
12995 error = disk_conditioner_get_info(vp->v_mount, (disk_conditioner_info *)data);
12996 }
12997 break;
12998
12999 case DISK_CONDITIONER_IOC_SET: {
13000 error = disk_conditioner_set_info(vp->v_mount, (disk_conditioner_info *)data);
13001 }
13002 break;
13003
13004 case FSIOC_CAS_BSDFLAGS:
13005 error = handle_flags(vp, data, ctx);
13006 break;
13007
13008 case FSIOC_FD_ONLY_OPEN_ONCE: {
13009 error = 0;
13010 if (vnode_usecount(vp) > 1) {
13011 vnode_lock_spin(vp);
13012 if (vp->v_lflag & VL_HASSTREAMS) {
13013 if (vnode_isinuse_locked(vp, 1, 1)) {
13014 error = EBUSY;
13015 }
13016 } else if (vnode_usecount(vp) > 1) {
13017 error = EBUSY;
13018 }
13019 vnode_unlock(vp);
13020 }
13021 }
13022 break;
13023
13024 case FSIOC_EVAL_ROOTAUTH:
13025 error = handle_auth(vp, cmd, data, options, ctx);
13026 break;
13027
13028 case FSIOC_TEST_FSE_ACCESS_GRANTED:
13029 error = test_fse_access_granted(vp, (unsigned long)udata, ctx);
13030 break;
13031
13032 #if CONFIG_EXCLAVES
13033 case FSIOC_EXCLAVE_FS_REGISTER:
13034 if (IOTaskHasEntitlement(vfs_context_task(ctx), EXCLAVE_FS_REGISTER_ENTITLEMENT)) {
13035 error = vfs_exclave_fs_register(((fsioc_exclave_fs_register_t *)data)->fs_tag, vp);
13036 } else {
13037 error = EPERM;
13038 }
13039 break;
13040
13041 case FSIOC_EXCLAVE_FS_UNREGISTER:
13042 if (IOTaskHasEntitlement(vfs_context_task(ctx), EXCLAVE_FS_REGISTER_ENTITLEMENT)) {
13043 error = vfs_exclave_fs_unregister(vp);
13044 } else {
13045 error = EPERM;
13046 }
13047 break;
13048
13049 case FSIOC_EXCLAVE_FS_GET_BASE_DIRS: {
13050 exclave_fs_get_base_dirs_t *get_base_dirs = ((exclave_fs_get_base_dirs_t *)data);
13051 exclave_fs_base_dir_t *dirs = NULL;
13052 if (!IOTaskHasEntitlement(vfs_context_task(ctx), EXCLAVE_FS_REGISTER_ENTITLEMENT)) {
13053 error = EPERM;
13054 break;
13055 }
13056 if (get_base_dirs->base_dirs) {
13057 if ((get_base_dirs->count == 0) || (get_base_dirs->count > EXCLAVE_FS_GET_BASE_DIRS_MAX_COUNT)) {
13058 error = EINVAL;
13059 break;
13060 }
13061 dirs = kalloc_type(exclave_fs_base_dir_t, get_base_dirs->count, Z_WAITOK | Z_ZERO);
13062 if (!dirs) {
13063 error = ENOSPC;
13064 break;
13065 }
13066 }
13067 error = vfs_exclave_fs_get_base_dirs(dirs, &get_base_dirs->count);
13068 if (!error && dirs) {
13069 error = copyout(dirs, (user_addr_t)get_base_dirs->base_dirs,
13070 get_base_dirs->count * sizeof(exclave_fs_base_dir_t));
13071 }
13072 if (dirs) {
13073 kfree_type(exclave_fs_base_dir_t, get_base_dirs->count, dirs);
13074 }
13075 }
13076 break;
13077 #endif
13078
13079 default: {
13080 /*
13081 * Other, known commands shouldn't be passed down here.
13082 * (When adding a selector to this list, it may be prudent
13083 * to consider adding it to the list in sys_fcntl_nocancel() as well.)
13084 */
13085 switch (cmd) {
13086 case F_PUNCHHOLE:
13087 case F_TRIM_ACTIVE_FILE:
13088 case F_RDADVISE:
13089 case F_TRANSCODEKEY:
13090 case F_GETPROTECTIONLEVEL:
13091 case F_GETDEFAULTPROTLEVEL:
13092 case F_MAKECOMPRESSED:
13093 case F_SET_GREEDY_MODE:
13094 case F_SETSTATICCONTENT:
13095 case F_SETIOTYPE:
13096 case F_SETBACKINGSTORE:
13097 case F_GETPATH_MTMINFO:
13098 case APFSIOC_REVERT_TO_SNAPSHOT:
13099 case FSIOC_FIOSEEKHOLE:
13100 case FSIOC_FIOSEEKDATA:
13101 case HFS_GET_BOOT_INFO:
13102 case HFS_SET_BOOT_INFO:
13103 case FIOPINSWAP:
13104 case F_CHKCLEAN:
13105 case F_FULLFSYNC:
13106 case F_BARRIERFSYNC:
13107 case F_FREEZE_FS:
13108 case F_THAW_FS:
13109 case FSIOC_KERNEL_ROOTAUTH:
13110 case FSIOC_GRAFT_FS:
13111 case FSIOC_UNGRAFT_FS:
13112 case FSIOC_AUTH_FS:
13113 case F_SPECULATIVE_READ:
13114 case F_ATTRIBUTION_TAG:
13115 case F_TRANSFEREXTENTS:
13116 case F_ASSERT_BG_ACCESS:
13117 case F_RELEASE_BG_ACCESS:
13118 error = EINVAL;
13119 goto outdrop;
13120 }
13121 /* Invoke the filesystem-specific code */
13122 error = VNOP_IOCTL(vp, cmd, data, (int)options, ctx);
13123 }
13124 } /* end switch stmt */
13125
13126 /*
13127 * if no errors, copy any data to user. Size was
13128 * already set and checked above.
13129 */
13130 if (error == 0 && (cmd & IOC_OUT) && size) {
13131 error = copyout(data, udata, size);
13132 }
13133
13134 outdrop:
13135 if (memp) {
13136 kfree_data(memp, size);
13137 }
13138
13139 return error;
13140 }
13141
/* ARGSUSED */
/*
 * fsctl: path-based filesystem control syscall.
 *
 * Looks up the vnode named by uap->path and forwards the request to
 * fsctl_internal().  Returns 0 on success or an errno value.
 */
int
fsctl(proc_t p, struct fsctl_args *uap, __unused int32_t *retval)
{
	int error;
	struct nameidata nd;
	uint32_t nameiflags;
	vnode_t vp = NULL;
	vfs_context_t ctx = vfs_context_current();

	AUDIT_ARG(cmd, (int)uap->cmd);
	AUDIT_ARG(value32, uap->options);
	/* Get the vnode for the file we are getting info on: */
	nameiflags = 0;
	//
	// if we come through fsctl() then the file is by definition not open.
	// therefore for the FSIOC_FD_ONLY_OPEN_ONCE selector we return an error
	// lest the caller mistakenly thinks the only open is their own (but in
	// reality it's someone elses).
	//
	if (uap->cmd == FSIOC_FD_ONLY_OPEN_ONCE) {
		return EINVAL;
	}
	if ((uap->options & FSOPT_NOFOLLOW) == 0) {
		nameiflags |= FOLLOW;
	}
	if (uap->cmd == FSIOC_FIRMLINK_CTL) {
		/* firmlink control must see the firmlink itself, bypassing the cache */
		nameiflags |= (CN_FIRMLINK_NOFOLLOW | NOCACHE);
	}
	NDINIT(&nd, LOOKUP, OP_FSCTL, nameiflags | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	if ((error = namei(&nd))) {
		goto done;
	}
	vp = nd.ni_vp;
	nameidone(&nd);

#if CONFIG_MACF
	error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd);
	if (error) {
		goto done;
	}
#endif

	/* fsctl_internal() may drop the iocount and reset vp to NULL */
	error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);

done:
	if (vp) {
		vnode_put(vp);
	}
	return error;
}
/* ARGSUSED */
/*
 * ffsctl: fd-based filesystem control syscall.
 *
 * Like fsctl(), but operates on an already-open file descriptor
 * instead of a path.  Returns 0 on success or an errno value.
 */
int
ffsctl(proc_t p, struct ffsctl_args *uap, __unused int32_t *retval)
{
	int error;
	vnode_t vp = NULL;
	vfs_context_t ctx = vfs_context_current();
	int fd = -1;

	AUDIT_ARG(fd, uap->fd);
	AUDIT_ARG(cmd, (int)uap->cmd);
	AUDIT_ARG(value32, uap->options);

	/* Get the vnode for the file we are getting info on: */
	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}
	fd = uap->fd;
	/* take an iocount so the vnode cannot be reclaimed underneath us */
	if ((error = vnode_getwithref(vp))) {
		file_drop(fd);
		return error;
	}

#if CONFIG_MACF
	if ((error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd))) {
		file_drop(fd);
		vnode_put(vp);
		return error;
	}
#endif

	error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);

	file_drop(fd);

	/*validate vp; fsctl_internal() can drop iocount and reset vp to NULL*/
	if (vp) {
		vnode_put(vp);
	}

	return error;
}
13236 /* end of fsctl system call */
13237
13238 #define FILESEC_ACCESS_ENTITLEMENT \
13239 "com.apple.private.vfs.filesec-access"
13240
13241 static int
xattr_entitlement_check(const char * attrname,vfs_context_t ctx,bool setting)13242 xattr_entitlement_check(const char *attrname, vfs_context_t ctx, bool setting)
13243 {
13244 if (strcmp(attrname, KAUTH_FILESEC_XATTR) == 0) {
13245 /*
13246 * get: root and tasks with FILESEC_ACCESS_ENTITLEMENT.
13247 * set: only tasks with FILESEC_ACCESS_ENTITLEMENT.
13248 */
13249 if ((!setting && vfs_context_issuser(ctx)) ||
13250 IOTaskHasEntitlement(vfs_context_task(ctx),
13251 FILESEC_ACCESS_ENTITLEMENT)) {
13252 return 0;
13253 }
13254 }
13255
13256 return EPERM;
13257 }
13258
/*
 * Retrieve the data of an extended attribute.
 *
 * Path-based getxattr(2).  If uap->value is NULL, or the legacy
 * "size == -1" convention is used (see the long comment below), only
 * the attribute's size is reported via *retval; otherwise up to
 * XATTR_MAXSIZE bytes are read into the user buffer and *retval is the
 * number of bytes transferred.
 */
int
getxattr(proc_t p, struct getxattr_args *uap, user_ssize_t *retval)
{
	vnode_t vp;
	struct nameidata nd;
	char attrname[XATTR_MAXNAMELEN + 1];
	vfs_context_t ctx = vfs_context_current();
	uio_t auio = NULL;
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	size_t attrsize = 0;
	size_t namelen;
	u_int32_t nameiflags;
	int error;
	UIO_STACKBUF(uio_buf, 1);

	/* XATTR_NOSECURITY and XATTR_NODEFAULT are kernel-internal options. */
	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_GETXATTR, nameiflags, spacetype, uap->path, ctx);
	if (uap->options & XATTR_NOFOLLOW_ANY) {
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}

	if ((error = namei(&nd))) {
		return error;
	}
	vp = nd.ni_vp;
	nameidone(&nd);

	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
	if (error != 0) {
		goto out;
	}
	/* Protected attributes require root or the filesec entitlement. */
	if (xattr_protected(attrname) &&
	    (error = xattr_entitlement_check(attrname, ctx, false)) != 0) {
		goto out;
	}
	/*
	 * the specific check for 0xffffffff is a hack to preserve
	 * binary compatibility in K64 with applications that discovered
	 * that passing in a buf pointer and a size of -1 resulted in
	 * just the size of the indicated extended attribute being returned.
	 * this isn't part of the documented behavior, but because of the
	 * original implementation's check for "uap->size > 0", this behavior
	 * was allowed. In K32 that check turned into a signed comparison
	 * even though uap->size is unsigned... in K64, we blow by that
	 * check because uap->size is unsigned and doesn't get sign smeared
	 * in the munger for a 32 bit user app. we also need to add a
	 * check to limit the maximum size of the buffer being passed in...
	 * unfortunately, the underlying filesystems seem to just malloc
	 * the requested size even if the actual extended attribute is tiny.
	 * because that malloc is for kernel wired memory, we have to put a
	 * sane limit on it.
	 *
	 * U32 running on K64 will yield 0x00000000ffffffff for uap->size
	 * U64 running on K64 will yield -1 (64 bits wide)
	 * U32/U64 running on K32 will yield -1 (32 bits wide)
	 */
	if (uap->size == 0xffffffff || uap->size == (size_t)-1) {
		goto no_uio;
	}

	if (uap->value) {
		/* clamp to a sane wired-memory limit (see comment above) */
		if (uap->size > (size_t)XATTR_MAXSIZE) {
			uap->size = XATTR_MAXSIZE;
		}

		auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
		    &uio_buf[0], sizeof(uio_buf));
		uio_addiov(auio, uap->value, uap->size);
	}
no_uio:
	error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, ctx);
out:
	vnode_put(vp);

	if (auio) {
		/* bytes actually transferred into the user buffer */
		*retval = uap->size - uio_resid(auio);
	} else {
		*retval = (user_ssize_t)attrsize;
	}

	return error;
}
13348
13349 /*
13350 * Retrieve the data of an extended attribute.
13351 */
13352 int
fgetxattr(proc_t p,struct fgetxattr_args * uap,user_ssize_t * retval)13353 fgetxattr(proc_t p, struct fgetxattr_args *uap, user_ssize_t *retval)
13354 {
13355 vnode_t vp;
13356 char attrname[XATTR_MAXNAMELEN + 1];
13357 vfs_context_t ctx = vfs_context_current();
13358 uio_t auio = NULL;
13359 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
13360 size_t attrsize = 0;
13361 size_t namelen;
13362 int error;
13363 UIO_STACKBUF(uio_buf, 1);
13364
13365 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT |
13366 XATTR_NOFOLLOW_ANY)) {
13367 return EINVAL;
13368 }
13369
13370 if ((error = file_vnode(uap->fd, &vp))) {
13371 return error;
13372 }
13373 if ((error = vnode_getwithref(vp))) {
13374 file_drop(uap->fd);
13375 return error;
13376 }
13377 error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
13378 if (error != 0) {
13379 goto out;
13380 }
13381 if (xattr_protected(attrname) &&
13382 (error = xattr_entitlement_check(attrname, ctx, false)) != 0) {
13383 goto out;
13384 }
13385 if (uap->value && uap->size > 0) {
13386 if (uap->size > (size_t)XATTR_MAXSIZE) {
13387 uap->size = XATTR_MAXSIZE;
13388 }
13389
13390 auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
13391 &uio_buf[0], sizeof(uio_buf));
13392 uio_addiov(auio, uap->value, uap->size);
13393 }
13394
13395 error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, vfs_context_current());
13396 out:
13397 (void)vnode_put(vp);
13398 file_drop(uap->fd);
13399
13400 if (auio) {
13401 *retval = uap->size - uio_resid(auio);
13402 } else {
13403 *retval = (user_ssize_t)attrsize;
13404 }
13405 return error;
13406 }
13407
/*
 * Heap-allocated per-call state for setxattr() — see the kalloc_type()
 * there.  Bundles the nameidata, attribute name, and uio buffer.
 */
struct setxattr_ctx {
	struct nameidata nd;
	char attrname[XATTR_MAXNAMELEN + 1];
	UIO_STACKBUF(uio_buf, 1);
};
13414
/*
 * Set the data of an extended attribute.
 *
 * Path-based setxattr(2).  Bulky per-call state (nameidata, attribute
 * name, uio buffer) lives in a heap-allocated struct setxattr_ctx —
 * presumably to keep this stack frame small.
 */
int
setxattr(proc_t p, struct setxattr_args *uap, int *retval)
{
	vnode_t vp;
	vfs_context_t ctx = vfs_context_current();
	uio_t auio = NULL;
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	size_t namelen;
	u_int32_t nameiflags;
	int error;
	struct setxattr_ctx *sactx;

	/* XATTR_NOSECURITY and XATTR_NODEFAULT are kernel-internal options. */
	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	sactx = kalloc_type(struct setxattr_ctx, Z_WAITOK);
	if (sactx == NULL) {
		return ENOMEM;
	}

	error = copyinstr(uap->attrname, sactx->attrname, sizeof(sactx->attrname), &namelen);
	if (error != 0) {
		if (error == EPERM) {
			/* if the string won't fit in attrname, copyinstr emits EPERM */
			error = ENAMETOOLONG;
		}
		/* Otherwise return the default error from copyinstr to detect ERANGE, etc */
		goto out;
	}
	/* Writing protected attributes requires the filesec entitlement. */
	if (xattr_protected(sactx->attrname) &&
	    (error = xattr_entitlement_check(sactx->attrname, ctx, true)) != 0) {
		goto out;
	}
	if (uap->size != 0 && uap->value == 0) {
		error = EINVAL;
		goto out;
	}
	if (uap->size > INT_MAX) {
		error = E2BIG;
		goto out;
	}

	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
#if CONFIG_FILE_LEASES
	/* need the parent vnode so the directory lease can be broken below */
	nameiflags |= WANTPARENT;
#endif
	NDINIT(&sactx->nd, LOOKUP, OP_SETXATTR, nameiflags, spacetype, uap->path, ctx);
	if (uap->options & XATTR_NOFOLLOW_ANY) {
		sactx->nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}

	if ((error = namei(&sactx->nd))) {
		goto out;
	}
	vp = sactx->nd.ni_vp;
#if CONFIG_FILE_LEASES
	vnode_breakdirlease(sactx->nd.ni_dvp, false, O_WRONLY);
	vnode_put(sactx->nd.ni_dvp);
#endif
	nameidone(&sactx->nd);

	auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
	    &sactx->uio_buf[0], sizeof(sactx->uio_buf));
	uio_addiov(auio, uap->value, uap->size);

	error = vn_setxattr(vp, sactx->attrname, auio, uap->options, ctx);
#if CONFIG_FSE
	if (error == 0) {
		/* notify fsevents listeners of the modification */
		add_fsevent(FSE_XATTR_MODIFIED, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
	}
#endif
	vnode_put(vp);
out:
	kfree_type(struct setxattr_ctx, sactx);
	*retval = 0;
	return error;
}
13498
13499 /*
13500 * Set the data of an extended attribute.
13501 */
13502 int
fsetxattr(proc_t p,struct fsetxattr_args * uap,int * retval)13503 fsetxattr(proc_t p, struct fsetxattr_args *uap, int *retval)
13504 {
13505 vnode_t vp;
13506 char attrname[XATTR_MAXNAMELEN + 1];
13507 vfs_context_t ctx = vfs_context_current();
13508 uio_t auio = NULL;
13509 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
13510 size_t namelen;
13511 int error;
13512 UIO_STACKBUF(uio_buf, 1);
13513
13514 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT |
13515 XATTR_NOFOLLOW_ANY)) {
13516 return EINVAL;
13517 }
13518
13519 error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
13520 if (error != 0) {
13521 if (error == EPERM) {
13522 /* if the string won't fit in attrname, copyinstr emits EPERM */
13523 return ENAMETOOLONG;
13524 }
13525 /* Otherwise return the default error from copyinstr to detect ERANGE, etc */
13526 return error;
13527 }
13528 if (xattr_protected(attrname) &&
13529 (error = xattr_entitlement_check(attrname, ctx, true)) != 0) {
13530 return error;
13531 }
13532 if (uap->size != 0 && uap->value == 0) {
13533 return EINVAL;
13534 }
13535 if (uap->size > INT_MAX) {
13536 return E2BIG;
13537 }
13538 if ((error = file_vnode(uap->fd, &vp))) {
13539 return error;
13540 }
13541 if ((error = vnode_getwithref(vp))) {
13542 file_drop(uap->fd);
13543 return error;
13544 }
13545
13546 #if CONFIG_FILE_LEASES
13547 vnode_breakdirlease(vp, true, O_WRONLY);
13548 #endif
13549
13550 auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
13551 &uio_buf[0], sizeof(uio_buf));
13552 uio_addiov(auio, uap->value, uap->size);
13553
13554 error = vn_setxattr(vp, attrname, auio, uap->options, vfs_context_current());
13555 #if CONFIG_FSE
13556 if (error == 0) {
13557 add_fsevent(FSE_XATTR_MODIFIED, ctx,
13558 FSE_ARG_VNODE, vp,
13559 FSE_ARG_DONE);
13560 }
13561 #endif
13562 vnode_put(vp);
13563 file_drop(uap->fd);
13564 *retval = 0;
13565 return error;
13566 }
13567
/*
 * Remove an extended attribute.
 * XXX Code duplication here.
 *
 * Path-based removexattr(2).  Protected attributes can never be
 * removed from user space (flat EPERM, no entitlement escape hatch).
 */
int
removexattr(proc_t p, struct removexattr_args *uap, int *retval)
{
	vnode_t vp;
	struct nameidata nd;
	char attrname[XATTR_MAXNAMELEN + 1];
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	vfs_context_t ctx = vfs_context_current();
	size_t namelen;
	u_int32_t nameiflags;
	int error;

	/* XATTR_NOSECURITY and XATTR_NODEFAULT are kernel-internal options. */
	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
	if (error != 0) {
		return error;
	}
	if (xattr_protected(attrname)) {
		return EPERM;
	}
	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
#if CONFIG_FILE_LEASES
	/* need the parent vnode so the directory lease can be broken below */
	nameiflags |= WANTPARENT;
#endif
	NDINIT(&nd, LOOKUP, OP_REMOVEXATTR, nameiflags, spacetype, uap->path, ctx);
	if (uap->options & XATTR_NOFOLLOW_ANY) {
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}

	if ((error = namei(&nd))) {
		return error;
	}
	vp = nd.ni_vp;
#if CONFIG_FILE_LEASES
	vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
	vnode_put(nd.ni_dvp);
#endif
	nameidone(&nd);

	error = vn_removexattr(vp, attrname, uap->options, ctx);
#if CONFIG_FSE
	if (error == 0) {
		/* notify fsevents listeners of the removal */
		add_fsevent(FSE_XATTR_REMOVED, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
	}
#endif
	vnode_put(vp);
	*retval = 0;
	return error;
}
13626
/*
 * Remove an extended attribute.
 * XXX Code duplication here.
 *
 * fd-based variant of removexattr().  Note that 'ctx' is only declared
 * under CONFIG_FSE, so vn_removexattr() fetches the current context
 * directly.
 */
int
fremovexattr(__unused proc_t p, struct fremovexattr_args *uap, int *retval)
{
	vnode_t vp;
	char attrname[XATTR_MAXNAMELEN + 1];
	size_t namelen;
	int error;
#if CONFIG_FSE
	vfs_context_t ctx = vfs_context_current();
#endif

	/* Follow/no-follow options are meaningless on an open fd. */
	if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT |
	    XATTR_NOFOLLOW_ANY)) {
		return EINVAL;
	}

	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
	if (error != 0) {
		return error;
	}
	if (xattr_protected(attrname)) {
		return EPERM;
	}
	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(vp, true, O_WRONLY);
#endif

	error = vn_removexattr(vp, attrname, uap->options, vfs_context_current());
#if CONFIG_FSE
	if (error == 0) {
		/* notify fsevents listeners of the removal */
		add_fsevent(FSE_XATTR_REMOVED, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
	}
#endif
	vnode_put(vp);
	file_drop(uap->fd);
	*retval = 0;
	return error;
}
13679
13680 /*
13681 * Retrieve the list of extended attribute names.
13682 * XXX Code duplication here.
13683 */
13684 int
listxattr(proc_t p,struct listxattr_args * uap,user_ssize_t * retval)13685 listxattr(proc_t p, struct listxattr_args *uap, user_ssize_t *retval)
13686 {
13687 vnode_t vp;
13688 struct nameidata nd;
13689 vfs_context_t ctx = vfs_context_current();
13690 uio_t auio = NULL;
13691 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
13692 size_t attrsize = 0;
13693 u_int32_t nameiflags;
13694 int error;
13695 UIO_STACKBUF(uio_buf, 1);
13696
13697 if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
13698 return EINVAL;
13699 }
13700
13701 nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
13702 NDINIT(&nd, LOOKUP, OP_LISTXATTR, nameiflags, spacetype, uap->path, ctx);
13703 if (uap->options & XATTR_NOFOLLOW_ANY) {
13704 nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
13705 }
13706
13707 if ((error = namei(&nd))) {
13708 return error;
13709 }
13710 vp = nd.ni_vp;
13711 nameidone(&nd);
13712 if (uap->namebuf != 0 && uap->bufsize > 0) {
13713 auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ,
13714 &uio_buf[0], sizeof(uio_buf));
13715 uio_addiov(auio, uap->namebuf, uap->bufsize);
13716 }
13717
13718 error = vn_listxattr(vp, auio, &attrsize, uap->options, ctx);
13719
13720 vnode_put(vp);
13721 if (auio) {
13722 *retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
13723 } else {
13724 *retval = (user_ssize_t)attrsize;
13725 }
13726 return error;
13727 }
13728
13729 /*
13730 * Retrieve the list of extended attribute names.
13731 * XXX Code duplication here.
13732 */
13733 int
flistxattr(proc_t p,struct flistxattr_args * uap,user_ssize_t * retval)13734 flistxattr(proc_t p, struct flistxattr_args *uap, user_ssize_t *retval)
13735 {
13736 vnode_t vp;
13737 uio_t auio = NULL;
13738 int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
13739 size_t attrsize = 0;
13740 int error;
13741 UIO_STACKBUF(uio_buf, 1);
13742
13743 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT |
13744 XATTR_NOFOLLOW_ANY)) {
13745 return EINVAL;
13746 }
13747
13748 if ((error = file_vnode(uap->fd, &vp))) {
13749 return error;
13750 }
13751 if ((error = vnode_getwithref(vp))) {
13752 file_drop(uap->fd);
13753 return error;
13754 }
13755 if (uap->namebuf != 0 && uap->bufsize > 0) {
13756 auio = uio_createwithbuffer(1, 0, spacetype,
13757 UIO_READ, &uio_buf[0], sizeof(uio_buf));
13758 uio_addiov(auio, uap->namebuf, uap->bufsize);
13759 }
13760
13761 error = vn_listxattr(vp, auio, &attrsize, uap->options, vfs_context_current());
13762
13763 vnode_put(vp);
13764 file_drop(uap->fd);
13765 if (auio) {
13766 *retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
13767 } else {
13768 *retval = (user_ssize_t)attrsize;
13769 }
13770 return error;
13771 }
13772
13773 int
fsgetpath_internal(vfs_context_t ctx,int volfs_id,uint64_t objid,vm_size_t bufsize,caddr_t buf,uint32_t options,int * pathlen)13774 fsgetpath_internal(vfs_context_t ctx, int volfs_id, uint64_t objid,
13775 vm_size_t bufsize, caddr_t buf, uint32_t options, int *pathlen)
13776 {
13777 int error;
13778 struct mount *mp = NULL;
13779 vnode_t vp;
13780 int length;
13781 int bpflags;
13782 /* maximum number of times to retry build_path */
13783 unsigned int retries = 0x10;
13784
13785 if (bufsize > FSGETPATH_MAXBUFLEN) {
13786 return EINVAL;
13787 }
13788
13789 if (buf == NULL) {
13790 return ENOMEM;
13791 }
13792
13793 retry:
13794 if ((mp = mount_lookupby_volfsid(volfs_id, 1)) == NULL) {
13795 error = ENOTSUP; /* unexpected failure */
13796 return ENOTSUP;
13797 }
13798
13799 #if CONFIG_UNION_MOUNTS
13800 unionget:
13801 #endif /* CONFIG_UNION_MOUNTS */
13802 if (objid == 2) {
13803 struct vfs_attr vfsattr;
13804 int use_vfs_root = TRUE;
13805
13806 VFSATTR_INIT(&vfsattr);
13807 VFSATTR_WANTED(&vfsattr, f_capabilities);
13808 if (!(options & FSOPT_ISREALFSID) &&
13809 vfs_getattr(mp, &vfsattr, vfs_context_kernel()) == 0 &&
13810 VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
13811 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS) &&
13812 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS)) {
13813 use_vfs_root = FALSE;
13814 }
13815 }
13816
13817 if (use_vfs_root) {
13818 error = VFS_ROOT(mp, &vp, ctx);
13819 } else {
13820 error = VFS_VGET(mp, objid, &vp, ctx);
13821 }
13822 } else {
13823 error = VFS_VGET(mp, (ino64_t)objid, &vp, ctx);
13824 }
13825
13826 #if CONFIG_UNION_MOUNTS
13827 if (error == ENOENT && (mp->mnt_flag & MNT_UNION)) {
13828 /*
13829 * If the fileid isn't found and we're in a union
13830 * mount volume, then see if the fileid is in the
13831 * mounted-on volume.
13832 */
13833 struct mount *tmp = mp;
13834 mp = vnode_mount(tmp->mnt_vnodecovered);
13835 vfs_unbusy(tmp);
13836 if (vfs_busy(mp, LK_NOWAIT) == 0) {
13837 goto unionget;
13838 }
13839 } else {
13840 vfs_unbusy(mp);
13841 }
13842 #else
13843 vfs_unbusy(mp);
13844 #endif /* CONFIG_UNION_MOUNTS */
13845
13846 if (error) {
13847 return error;
13848 }
13849
13850 #if CONFIG_MACF
13851 error = mac_vnode_check_fsgetpath(ctx, vp);
13852 if (error) {
13853 vnode_put(vp);
13854 return error;
13855 }
13856 #endif
13857
13858 /* Obtain the absolute path to this vnode. */
13859 bpflags = vfs_context_suser(ctx) ? BUILDPATH_CHECKACCESS : 0;
13860 if (options & FSOPT_NOFIRMLINKPATH) {
13861 bpflags |= BUILDPATH_NO_FIRMLINK;
13862 }
13863 bpflags |= BUILDPATH_CHECK_MOVED;
13864 error = build_path(vp, buf, (int)bufsize, &length, bpflags, ctx);
13865 vnode_put(vp);
13866
13867 if (error) {
13868 /* there was a race building the path, try a few more times */
13869 if (error == EAGAIN) {
13870 --retries;
13871 if (retries > 0) {
13872 goto retry;
13873 }
13874
13875 error = ENOENT;
13876 }
13877 goto out;
13878 }
13879
13880 AUDIT_ARG(text, buf);
13881
13882 if (kdebug_debugid_enabled(VFS_LOOKUP) && length > 0) {
13883 unsigned long path_words[NUMPARMS];
13884 size_t path_len = sizeof(path_words);
13885
13886 if ((size_t)length < path_len) {
13887 memcpy((char *)path_words, buf, length);
13888 memset((char *)path_words + length, 0, path_len - length);
13889
13890 path_len = length;
13891 } else {
13892 memcpy((char *)path_words, buf + (length - path_len), path_len);
13893 }
13894
13895 kdebug_vfs_lookup(path_words, (int)path_len, vp,
13896 KDBG_VFS_LOOKUP_FLAG_LOOKUP);
13897 }
13898
13899 *pathlen = length; /* may be superseded by error */
13900
13901 out:
13902 return error;
13903 }
13904
/*
 * Obtain the full pathname of a file system object by id.
 *
 * Copies the fsid in from user space, resolves <fsid, objid> via
 * fsgetpath_internal(), and copies the resulting path out to 'buf'.
 * *retval receives the path length on success.
 */
static int
fsgetpath_extended(user_addr_t buf, user_size_t bufsize, user_addr_t user_fsid, uint64_t objid,
    uint32_t options, user_ssize_t *retval)
{
	vfs_context_t ctx = vfs_context_current();
	fsid_t fsid;
	char *realpath;
	int length;
	int error;

	/* only the publicly supported option bits are accepted */
	if (options & ~(FSOPT_NOFIRMLINKPATH | FSOPT_ISREALFSID)) {
		return EINVAL;
	}

	if ((error = copyin(user_fsid, (caddr_t)&fsid, sizeof(fsid)))) {
		return error;
	}
	AUDIT_ARG(value32, fsid.val[0]);
	AUDIT_ARG(value64, objid);
	/* Restrict output buffer size for now. */

	/* NOTE(review): bufsize is unsigned, so "<= 0" here means "== 0". */
	if (bufsize > FSGETPATH_MAXBUFLEN || bufsize <= 0) {
		return EINVAL;
	}
	realpath = kalloc_data(bufsize, Z_WAITOK | Z_ZERO);
	if (realpath == NULL) {
		return ENOMEM;
	}

	error = fsgetpath_internal(ctx, fsid.val[0], objid, bufsize, realpath,
	    options, &length);

	if (error) {
		goto out;
	}

	error = copyout((caddr_t)realpath, buf, length);

	*retval = (user_ssize_t)length; /* may be superseded by error */
out:
	kfree_data(realpath, bufsize);
	return error;
}
13951
/*
 * fsgetpath(2): resolve <fsid, objid> to a path.  Legacy entry point
 * that accepts no option flags.
 */
int
fsgetpath(__unused proc_t p, struct fsgetpath_args *uap, user_ssize_t *retval)
{
	return fsgetpath_extended(uap->buf, uap->bufsize, uap->fsid, uap->objid,
	           0, retval);
}
13958
/*
 * fsgetpath_ext(2): like fsgetpath(2), but forwards the caller-supplied
 * option flags (FSOPT_NOFIRMLINKPATH / FSOPT_ISREALFSID).
 */
int
fsgetpath_ext(__unused proc_t p, struct fsgetpath_ext_args *uap, user_ssize_t *retval)
{
	return fsgetpath_extended(uap->buf, uap->bufsize, uap->fsid, uap->objid,
	           uap->options, retval);
}
13965
13966 /*
13967 * Common routine to handle various flavors of statfs data heading out
13968 * to user space.
13969 *
13970 * Returns: 0 Success
13971 * EFAULT
13972 */
13973 static int
munge_statfs(struct mount * mp,struct vfsstatfs * sfsp,user_addr_t bufp,int * sizep,boolean_t is_64_bit,boolean_t partial_copy)13974 munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
13975 user_addr_t bufp, int *sizep, boolean_t is_64_bit,
13976 boolean_t partial_copy)
13977 {
13978 int error;
13979 int my_size, copy_size;
13980
13981 if (is_64_bit) {
13982 struct user64_statfs sfs;
13983 my_size = copy_size = sizeof(sfs);
13984 bzero(&sfs, my_size);
13985 sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
13986 sfs.f_type = (short)mp->mnt_vtable->vfc_typenum;
13987 sfs.f_reserved1 = (short)sfsp->f_fssubtype;
13988 sfs.f_bsize = (user64_long_t)sfsp->f_bsize;
13989 sfs.f_iosize = (user64_long_t)sfsp->f_iosize;
13990 sfs.f_blocks = (user64_long_t)sfsp->f_blocks;
13991 sfs.f_bfree = (user64_long_t)sfsp->f_bfree;
13992 sfs.f_bavail = (user64_long_t)sfsp->f_bavail;
13993 sfs.f_files = (user64_long_t)sfsp->f_files;
13994 sfs.f_ffree = (user64_long_t)sfsp->f_ffree;
13995 sfs.f_fsid = sfsp->f_fsid;
13996 sfs.f_owner = sfsp->f_owner;
13997 vfs_getfstypename(mp, sfs.f_fstypename, MFSNAMELEN);
13998 strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
13999 strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);
14000
14001 if (partial_copy) {
14002 copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
14003 }
14004 error = copyout((caddr_t)&sfs, bufp, copy_size);
14005 } else {
14006 struct user32_statfs sfs;
14007
14008 my_size = copy_size = sizeof(sfs);
14009 bzero(&sfs, my_size);
14010
14011 sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
14012 sfs.f_type = (short)mp->mnt_vtable->vfc_typenum;
14013 sfs.f_reserved1 = (short)sfsp->f_fssubtype;
14014
14015 /*
14016 * It's possible for there to be more than 2^^31 blocks in the filesystem, so we
14017 * have to fudge the numbers here in that case. We inflate the blocksize in order
14018 * to reflect the filesystem size as best we can.
14019 */
14020 if ((sfsp->f_blocks > INT_MAX)
14021 /* Hack for 4061702 . I think the real fix is for Carbon to
14022 * look for some volume capability and not depend on hidden
14023 * semantics agreed between a FS and carbon.
14024 * f_blocks, f_bfree, and f_bavail set to -1 is the trigger
14025 * for Carbon to set bNoVolumeSizes volume attribute.
14026 * Without this the webdavfs files cannot be copied onto
14027 * disk as they look huge. This change should not affect
14028 * XSAN as they should not setting these to -1..
14029 */
14030 && (sfsp->f_blocks != 0xffffffffffffffffULL)
14031 && (sfsp->f_bfree != 0xffffffffffffffffULL)
14032 && (sfsp->f_bavail != 0xffffffffffffffffULL)) {
14033 int shift;
14034
14035 /*
14036 * Work out how far we have to shift the block count down to make it fit.
14037 * Note that it's possible to have to shift so far that the resulting
14038 * blocksize would be unreportably large. At that point, we will clip
14039 * any values that don't fit.
14040 *
14041 * For safety's sake, we also ensure that f_iosize is never reported as
14042 * being smaller than f_bsize.
14043 */
14044 for (shift = 0; shift < 32; shift++) {
14045 if ((sfsp->f_blocks >> shift) <= INT_MAX) {
14046 break;
14047 }
14048 if ((sfsp->f_bsize << (shift + 1)) > INT_MAX) {
14049 break;
14050 }
14051 }
14052 #define __SHIFT_OR_CLIP(x, s) ((((x) >> (s)) > INT_MAX) ? INT_MAX : ((x) >> (s)))
14053 sfs.f_blocks = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_blocks, shift);
14054 sfs.f_bfree = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bfree, shift);
14055 sfs.f_bavail = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bavail, shift);
14056 #undef __SHIFT_OR_CLIP
14057 sfs.f_bsize = (user32_long_t)(sfsp->f_bsize << shift);
14058 sfs.f_iosize = (int)lmax(sfsp->f_iosize, sfsp->f_bsize);
14059 } else {
14060 /* filesystem is small enough to be reported honestly */
14061 sfs.f_bsize = (user32_long_t)sfsp->f_bsize;
14062 sfs.f_iosize = (user32_long_t)sfsp->f_iosize;
14063 sfs.f_blocks = (user32_long_t)sfsp->f_blocks;
14064 sfs.f_bfree = (user32_long_t)sfsp->f_bfree;
14065 sfs.f_bavail = (user32_long_t)sfsp->f_bavail;
14066 }
14067 sfs.f_files = (user32_long_t)sfsp->f_files;
14068 sfs.f_ffree = (user32_long_t)sfsp->f_ffree;
14069 sfs.f_fsid = sfsp->f_fsid;
14070 sfs.f_owner = sfsp->f_owner;
14071 vfs_getfstypename(mp, sfs.f_fstypename, MFSNAMELEN);
14072 strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
14073 strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);
14074
14075 if (partial_copy) {
14076 copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
14077 }
14078 error = copyout((caddr_t)&sfs, bufp, copy_size);
14079 }
14080
14081 if (sizep != NULL) {
14082 *sizep = my_size;
14083 }
14084 return error;
14085 }
14086
14087 /*
14088 * copy stat structure into user_stat structure.
14089 */
void
munge_user64_stat(struct stat *sbp, struct user64_stat *usbp)
{
	/* Zero the whole destination first so struct padding and any unset
	 * fields never leak kernel memory to userspace. */
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
#ifndef _POSIX_C_SOURCE
	/* Timespec-style timestamp fields (seconds + nanoseconds structs). */
	usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
#else
	/* Strict-POSIX builds expose discrete st_*time / st_*timensec fields. */
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
14126
/*
 * Copy an in-kernel struct stat into the 32-bit user-process layout.
 * Unlike the 64-bit variant, the time fields are explicitly narrowed
 * to the user32 widths.
 */
void
munge_user32_stat(struct stat *sbp, struct user32_stat *usbp)
{
	/* Zero the whole destination first so struct padding and any unset
	 * fields never leak kernel memory to userspace. */
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
#ifndef _POSIX_C_SOURCE
	/* Narrowing casts: user32 time fields are 32-bit (see user32_time_t). */
	usbp->st_atimespec.tv_sec = (user32_time_t)sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = (user32_long_t)sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = (user32_time_t)sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = (user32_long_t)sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = (user32_time_t)sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = (user32_long_t)sbp->st_ctimespec.tv_nsec;
#else
	/* Strict-POSIX builds expose discrete st_*time / st_*timensec fields. */
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
14163
14164 /*
14165 * copy stat64 structure into user_stat64 structure.
14166 */
void
munge_user64_stat64(struct stat64 *sbp, struct user64_stat64 *usbp)
{
	/* Zero the whole destination first so struct padding and any unset
	 * fields never leak kernel memory to userspace. */
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
#ifndef _POSIX_C_SOURCE
	/* Timespec-style timestamps; stat64 additionally carries birthtime. */
	usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
	usbp->st_birthtimespec.tv_sec = sbp->st_birthtimespec.tv_sec;
	usbp->st_birthtimespec.tv_nsec = sbp->st_birthtimespec.tv_nsec;
#else
	/* Strict-POSIX builds expose discrete st_*time / st_*timensec fields. */
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
	usbp->st_birthtime = sbp->st_birthtime;
	usbp->st_birthtimensec = sbp->st_birthtimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
14207
/*
 * Copy an in-kernel struct stat64 into the 32-bit user-process layout.
 * Time fields are explicitly narrowed to the user32 widths.
 */
void
munge_user32_stat64(struct stat64 *sbp, struct user32_stat64 *usbp)
{
	/* Zero the whole destination first so struct padding and any unset
	 * fields never leak kernel memory to userspace. */
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
#ifndef _POSIX_C_SOURCE
	/* Narrowing casts: user32 time fields are 32-bit (see user32_time_t).
	 * stat64 additionally carries birthtime. */
	usbp->st_atimespec.tv_sec = (user32_time_t)sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = (user32_long_t)sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = (user32_time_t)sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = (user32_long_t)sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = (user32_time_t)sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = (user32_long_t)sbp->st_ctimespec.tv_nsec;
	usbp->st_birthtimespec.tv_sec = (user32_time_t)sbp->st_birthtimespec.tv_sec;
	usbp->st_birthtimespec.tv_nsec = (user32_long_t)sbp->st_birthtimespec.tv_nsec;
#else
	/* Strict-POSIX builds expose discrete st_*time / st_*timensec fields. */
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
	usbp->st_birthtime = sbp->st_birthtime;
	usbp->st_birthtimensec = sbp->st_birthtimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
14248
14249 /*
14250 * Purge buffer cache for simulating cold starts
14251 */
14252 static int
vnode_purge_callback(struct vnode * vp,__unused void * cargs)14253 vnode_purge_callback(struct vnode *vp, __unused void *cargs)
14254 {
14255 ubc_msync(vp, (off_t)0, ubc_getsize(vp), NULL /* off_t *resid_off */, UBC_PUSHALL | UBC_INVALIDATE);
14256
14257 return VNODE_RETURNED;
14258 }
14259
14260 static int
vfs_purge_callback(mount_t mp,__unused void * arg)14261 vfs_purge_callback(mount_t mp, __unused void * arg)
14262 {
14263 vnode_iterate(mp, VNODE_WAIT | VNODE_ITERATE_ALL, vnode_purge_callback, NULL);
14264
14265 return VFS_RETURNED;
14266 }
14267
14268 static TUNABLE_WRITEABLE(boolean_t, vfs_purge_vm_pagers, "vfs_purge_vm_pagers", TRUE);
14269 SYSCTL_INT(_vfs, OID_AUTO, purge_vm_pagers, CTLFLAG_RW | CTLFLAG_LOCKED, &vfs_purge_vm_pagers, 0, "VFS purge also purges file-backed VM pagers");
14270
14271 int
vfs_purge(__unused struct proc * p,__unused struct vfs_purge_args * uap,__unused int32_t * retval)14272 vfs_purge(__unused struct proc *p, __unused struct vfs_purge_args *uap, __unused int32_t *retval)
14273 {
14274 if (!kauth_cred_issuser(kauth_cred_get())) {
14275 return EPERM;
14276 }
14277
14278 vfs_iterate(0 /* flags */, vfs_purge_callback, NULL);
14279
14280 /* also flush any VM pagers backed by files */
14281 if (vfs_purge_vm_pagers) {
14282 vm_purge_filebacked_pagers();
14283 }
14284
14285 return 0;
14286 }
14287
14288 /*
14289 * gets the vnode associated with the (unnamed) snapshot directory
14290 * for a Filesystem. The snapshot directory vnode is returned with
14291 * an iocount on it.
14292 */
14293 int
vnode_get_snapdir(vnode_t rvp,vnode_t * sdvpp,vfs_context_t ctx)14294 vnode_get_snapdir(vnode_t rvp, vnode_t *sdvpp, vfs_context_t ctx)
14295 {
14296 return VFS_VGET_SNAPDIR(vnode_mount(rvp), sdvpp, ctx);
14297 }
14298
14299 /*
14300 * Get the snapshot vnode.
14301 *
14302 * If successful, the call returns with an iocount on *rvpp ,*sdvpp and
14303 * needs nameidone() on ndp.
14304 *
14305 * If the snapshot vnode exists it is returned in ndp->ni_vp.
14306 *
14307 * If it returns with an error, *rvpp, *sdvpp are NULL and nameidone() is
14308 * not needed.
14309 */
static int
vnode_get_snapshot(int dirfd, vnode_t *rvpp, vnode_t *sdvpp,
    user_addr_t name, struct nameidata *ndp, int32_t op,
#if !CONFIG_TRIGGERS
    __unused
#endif
    enum path_operation pathop,
    vfs_context_t ctx)
{
	int error, i;
	caddr_t name_buf;
	size_t name_len;
	struct vfs_attr vfa;

	/* Pre-NULL the outputs so the error-unwind below can test them. */
	*sdvpp = NULLVP;
	*rvpp = NULLVP;

	/* Resolve dirfd to a vnode; on success we hold an iocount on *rvpp. */
	error = vnode_getfromfd(ctx, dirfd, rvpp);
	if (error) {
		return error;
	}

	/* Snapshot operations only make sense on a filesystem root vnode. */
	if (!vnode_isvroot(*rvpp)) {
		error = EINVAL;
		goto out;
	}

	/* Make sure the filesystem supports snapshots */
	VFSATTR_INIT(&vfa);
	VFSATTR_WANTED(&vfa, f_capabilities);
	if ((vfs_getattr(vnode_mount(*rvpp), &vfa, ctx) != 0) ||
	    !VFSATTR_IS_SUPPORTED(&vfa, f_capabilities) ||
	    !((vfa.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] &
	    VOL_CAP_INT_SNAPSHOT)) ||
	    !((vfa.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] &
	    VOL_CAP_INT_SNAPSHOT))) {
		error = ENOTSUP;
		goto out;
	}

	/* Get the snapshot directory; adds an iocount on *sdvpp on success. */
	error = vnode_get_snapdir(*rvpp, sdvpp, ctx);
	if (error) {
		goto out;
	}

	name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
	if (error) {
		goto out1;
	}

	/*
	 * Some sanity checks- name can't be empty, "." or ".." or have slashes.
	 * (the length returned by copyinstr includes the terminating NUL)
	 */
	if ((name_len == 1) || (name_len == 2 && name_buf[0] == '.') ||
	    (name_len == 3 && name_buf[0] == '.' && name_buf[1] == '.')) {
		error = EINVAL;
		goto out1;
	}
	/* Scan for '/'; terminating before name_len means one was found. */
	for (i = 0; i < (int)name_len && name_buf[i] != '/'; i++) {
		;
	}
	if (i < (int)name_len) {
		error = EINVAL;
		goto out1;
	}

#if CONFIG_MACF
	/* MAC policy hooks for create/delete intents (LOOKUP has none here). */
	if (op == CREATE) {
		error = mac_mount_check_snapshot_create(ctx, vnode_mount(*rvpp),
		    name_buf);
	} else if (op == DELETE) {
		error = mac_mount_check_snapshot_delete(ctx, vnode_mount(*rvpp),
		    name_buf);
	}
	if (error) {
		goto out1;
	}
#endif

	/* Check if the snapshot already exists ... */
	NDINIT(ndp, op, pathop, USEDVP | NOCACHE | AUDITVNPATH1,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(name_buf), ctx);
	ndp->ni_dvp = *sdvpp;

	error = namei(ndp);
out1:
	zfree(ZV_NAMEI, name_buf);
out:
	if (error) {
		/* On any failure, release the iocounts taken above and NULL
		 * the outputs so the caller need not clean up. */
		if (*sdvpp) {
			vnode_put(*sdvpp);
			*sdvpp = NULLVP;
		}
		if (*rvpp) {
			vnode_put(*rvpp);
			*rvpp = NULLVP;
		}
	}
	return error;
}
14412
14413 /*
14414 * create a filesystem snapshot (for supporting filesystems)
14415 *
14416 * A much simplified version of openat(dirfd, name, O_CREAT | O_EXCL)
14417 * We get to the (unnamed) snapshot directory vnode and create the vnode
14418 * for the snapshot in it.
14419 *
14420 * Restrictions:
14421 *
14422 * a) Passed in name for snapshot cannot have slashes.
14423 * b) name can't be "." or ".."
14424 *
14425 * Since this requires superuser privileges, vnode_authorize calls are not
14426 * made.
14427 */
static int __attribute__((noinline))
snapshot_create(int dirfd, user_addr_t name, __unused uint32_t flags,
    vfs_context_t ctx)
{
	vnode_t rvp, snapdvp;
	int error;
	struct nameidata *ndp;

	/* nameidata is too large to live on the kernel stack. */
	ndp = kalloc_type(struct nameidata, Z_WAITOK);

	/* On success we hold iocounts on rvp and snapdvp, and ndp needs
	 * nameidone(); ni_vp is non-NULL iff the name already exists. */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, ndp, CREATE,
	    OP_LINK, ctx);
	if (error) {
		goto out;
	}

	if (ndp->ni_vp) {
		/* Snapshot with this name already exists. */
		vnode_put(ndp->ni_vp);
		error = EEXIST;
	} else {
		struct vnode_attr *vap;
		vnode_t vp = NULLVP;

		vap = kalloc_type(struct vnode_attr, Z_WAITOK);

		VATTR_INIT(vap);
		VATTR_SET(vap, va_type, VREG);
		VATTR_SET(vap, va_mode, 0);

		/* Caller is privileged (checked in fs_snapshot), so skip
		 * authorization and ACL inheritance. */
		error = vn_create(snapdvp, &vp, ndp, vap,
		    VN_CREATE_NOAUTH | VN_CREATE_NOINHERIT, 0, NULL, ctx);
		if (!error && vp) {
			vnode_put(vp);
		}

		kfree_type(struct vnode_attr, vap);
	}

	nameidone(ndp);
	vnode_put(snapdvp);
	vnode_put(rvp);
out:
	kfree_type(struct nameidata, ndp);

	return error;
}
14474
14475 /*
14476 * Delete a Filesystem snapshot
14477 *
14478 * get the vnode for the unnamed snapshot directory and the snapshot and
14479 * delete the snapshot.
14480 */
static int __attribute__((noinline))
snapshot_delete(int dirfd, user_addr_t name, __unused uint32_t flags,
    vfs_context_t ctx)
{
	vnode_t rvp, snapdvp;
	int error;
	struct nameidata *ndp;

	/* nameidata is too large to live on the kernel stack. */
	ndp = kalloc_type(struct nameidata, Z_WAITOK);

	/* DELETE intent: on success ni_vp holds the snapshot vnode with an
	 * iocount, and we also hold iocounts on rvp and snapdvp. */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, ndp, DELETE,
	    OP_UNLINK, ctx);
	if (error) {
		goto out;
	}

	error = VNOP_REMOVE(snapdvp, ndp->ni_vp, &ndp->ni_cnd,
	    VNODE_REMOVE_SKIP_NAMESPACE_EVENT, ctx);

	vnode_put(ndp->ni_vp);
	nameidone(ndp);
	vnode_put(snapdvp);
	vnode_put(rvp);
out:
	kfree_type(struct nameidata, ndp);

	return error;
}
14509
14510 /*
14511 * Revert a filesystem to a snapshot
14512 *
14513 * Marks the filesystem to revert to the given snapshot on next mount.
14514 */
static int __attribute__((noinline))
snapshot_revert(int dirfd, user_addr_t name, __unused uint32_t flags,
    vfs_context_t ctx)
{
	int error;
	vnode_t rvp;
	mount_t mp;
	struct fs_snapshot_revert_args revert_data;
	struct componentname cnp;
	caddr_t name_buf;
	size_t name_len;

	error = vnode_getfromfd(ctx, dirfd, &rvp);
	if (error) {
		return error;
	}
	mp = vnode_mount(rvp);

	name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
	if (error) {
		zfree(ZV_NAMEI, name_buf);
		vnode_put(rvp);
		return error;
	}

#if CONFIG_MACF
	error = mac_mount_check_snapshot_revert(ctx, mp, name_buf);
	if (error) {
		zfree(ZV_NAMEI, name_buf);
		vnode_put(rvp);
		return error;
	}
#endif

	/*
	 * Grab mount_iterref so that we can release the vnode,
	 * since VFSIOC_REVERT_SNAPSHOT could conceivably cause a sync.
	 */
	error = mount_iterref(mp, 0);
	vnode_put(rvp);
	if (error) {
		zfree(ZV_NAMEI, name_buf);
		return error;
	}

	/* Hand the snapshot name to the FS as a pre-built componentname. */
	memset(&cnp, 0, sizeof(cnp));
	cnp.cn_pnbuf = (char *)name_buf;
	cnp.cn_nameiop = LOOKUP;
	cnp.cn_flags = ISLASTCN | HASBUF;
	cnp.cn_pnlen = MAXPATHLEN;
	cnp.cn_nameptr = cnp.cn_pnbuf;
	cnp.cn_namelen = (int)name_len;
	revert_data.sr_cnp = &cnp;

	error = VFS_IOCTL(mp, VFSIOC_REVERT_SNAPSHOT, (caddr_t)&revert_data, 0, ctx);
	mount_iterdrop(mp);
	zfree(ZV_NAMEI, name_buf);

	if (error) {
		/* If there was any error, try again using VNOP_IOCTL */

		vnode_t snapdvp;
		struct nameidata namend;

		/* Fallback: resolve the snapshot vnode and ask it directly. */
		error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, &namend, LOOKUP,
		    OP_LOOKUP, ctx);
		if (error) {
			return error;
		}


		error = VNOP_IOCTL(namend.ni_vp, APFSIOC_REVERT_TO_SNAPSHOT, (caddr_t) NULL,
		    0, ctx);

		vnode_put(namend.ni_vp);
		nameidone(&namend);
		vnode_put(snapdvp);
		vnode_put(rvp);
	}

	return error;
}
14598
14599 /*
14600 * rename a Filesystem snapshot
14601 *
14602 * get the vnode for the unnamed snapshot directory and the snapshot and
14603 * rename the snapshot. This is a very specialised (and simple) case of
14604 * rename(2) (which has to deal with a lot more complications). It differs
14605 * slightly from rename(2) in that EEXIST is returned if the new name exists.
14606 */
static int __attribute__((noinline))
snapshot_rename(int dirfd, user_addr_t old, user_addr_t new,
    __unused uint32_t flags, vfs_context_t ctx)
{
	vnode_t rvp, snapdvp;
	int error, i;
	caddr_t newname_buf;
	size_t name_len;
	vnode_t fvp;
	struct nameidata *fromnd, *tond;
	/* carving out a chunk for structs that are too big to be on stack. */
	struct {
		struct nameidata from_node;
		struct nameidata to_node;
	} * __rename_data;

	__rename_data = kalloc_type(typeof(*__rename_data), Z_WAITOK);
	fromnd = &__rename_data->from_node;
	tond = &__rename_data->to_node;

	/* Source lookup with DELETE intent: validates old name, holds
	 * iocounts on rvp/snapdvp and the source snapshot (ni_vp). */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, old, fromnd, DELETE,
	    OP_UNLINK, ctx);
	if (error) {
		goto out;
	}
	fvp = fromnd->ni_vp;

	newname_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	error = copyinstr(new, newname_buf, MAXPATHLEN, &name_len);
	if (error) {
		goto out1;
	}

	/*
	 * Some sanity checks- new name can't be empty, "." or ".." or have
	 * slashes.
	 * (the length returned by copyinstr includes the terminating NUL)
	 *
	 * The FS rename VNOP is suppossed to handle this but we'll pick it
	 * off here itself.
	 */
	if ((name_len == 1) || (name_len == 2 && newname_buf[0] == '.') ||
	    (name_len == 3 && newname_buf[0] == '.' && newname_buf[1] == '.')) {
		error = EINVAL;
		goto out1;
	}
	/* Scan for '/'; terminating before name_len means one was found. */
	for (i = 0; i < (int)name_len && newname_buf[i] != '/'; i++) {
		;
	}
	if (i < (int)name_len) {
		error = EINVAL;
		goto out1;
	}

#if CONFIG_MACF
	/* Renaming to a new name is treated as creating that name. */
	error = mac_mount_check_snapshot_create(ctx, vnode_mount(rvp),
	    newname_buf);
	if (error) {
		goto out1;
	}
#endif

	/* Destination lookup inside the snapshot directory. */
	NDINIT(tond, RENAME, OP_RENAME, USEDVP | NOCACHE | AUDITVNPATH2,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(newname_buf), ctx);
	tond->ni_dvp = snapdvp;

	error = namei(tond);
	if (error) {
		goto out2;
	} else if (tond->ni_vp) {
		/*
		 * snapshot rename behaves differently than rename(2) - if the
		 * new name exists, EEXIST is returned.
		 */
		vnode_put(tond->ni_vp);
		error = EEXIST;
		goto out2;
	}

	/* Both ends live in snapdvp; target vnode is NULL (no overwrite). */
	error = VNOP_RENAME(snapdvp, fvp, &fromnd->ni_cnd, snapdvp, NULLVP,
	    &tond->ni_cnd, ctx);

out2:
	nameidone(tond);
out1:
	zfree(ZV_NAMEI, newname_buf);
	vnode_put(fvp);
	vnode_put(snapdvp);
	vnode_put(rvp);
	nameidone(fromnd);
out:
	kfree_type(typeof(*__rename_data), __rename_data);
	return error;
}
14701
14702 /*
14703 * Mount a Filesystem snapshot
14704 *
14705 * get the vnode for the unnamed snapshot directory and the snapshot and
14706 * mount the snapshot.
14707 */
static int __attribute__((noinline))
snapshot_mount(int dirfd, user_addr_t name, user_addr_t directory,
    __unused user_addr_t mnt_data, __unused uint32_t flags, vfs_context_t ctx)
{
	mount_t mp;
	vnode_t rvp, snapdvp, snapvp, vp, pvp;
	struct fs_snapshot_mount_args smnt_data;
	int error, mount_flags = 0;
	struct nameidata *snapndp, *dirndp;
	/* carving out a chunk for structs that are too big to be on stack. */
	struct {
		struct nameidata snapnd;
		struct nameidata dirnd;
	} * __snapshot_mount_data;

	__snapshot_mount_data = kalloc_type(typeof(*__snapshot_mount_data), Z_WAITOK);
	snapndp = &__snapshot_mount_data->snapnd;
	dirndp = &__snapshot_mount_data->dirnd;

	/* Resolve the snapshot by name; holds iocounts on rvp, snapdvp and
	 * the snapshot vnode (snapndp->ni_vp) on success. */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, snapndp, LOOKUP,
	    OP_LOOKUP, ctx);
	if (error) {
		goto out;
	}

	snapvp = snapndp->ni_vp;
	/* Bail if the source filesystem has been force-unmounted under us. */
	if (!vnode_mount(rvp) || (vnode_mount(rvp) == dead_mountp)) {
		error = EIO;
		goto out1;
	}

	/* Convert snapshot_mount flags to mount flags */
	if (flags & SNAPSHOT_MNT_DONTBROWSE) {
		mount_flags |= MNT_DONTBROWSE;
	}
	if (flags & SNAPSHOT_MNT_IGNORE_OWNERSHIP) {
		mount_flags |= MNT_IGNORE_OWNERSHIP;
	}
	if (flags & SNAPSHOT_MNT_NOFOLLOW) {
		mount_flags |= MNT_NOFOLLOW;
	}

	/* Get the vnode to be covered */
	NDINIT(dirndp, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
	    UIO_USERSPACE, directory, ctx);
	if (mount_flags & MNT_NOFOLLOW) {
		dirndp->ni_flag |= NAMEI_NOFOLLOW_ANY;
	}

	error = namei(dirndp);
	if (error) {
		goto out1;
	}

	vp = dirndp->ni_vp;
	pvp = dirndp->ni_dvp;
	mp = vnode_mount(rvp);

	/* Refuse to cover the root of the root filesystem. */
	if ((vp->v_flag & VROOT) && (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
		error = EINVAL;
		goto out2;
	}

#if CONFIG_MACF
	error = mac_mount_check_snapshot_mount(ctx, rvp, vp, &dirndp->ni_cnd, snapndp->ni_cnd.cn_nameptr,
	    mp->mnt_vfsstat.f_fstypename);
	if (error) {
		goto out2;
	}
#endif

	/* Hand the source mount and snapshot name to mount_common(), which
	 * performs the actual KERNEL_MOUNT_SNAPSHOT mount. */
	smnt_data.sm_mp = mp;
	smnt_data.sm_cnp = &snapndp->ni_cnd;
	error = mount_common(mp->mnt_vfsstat.f_fstypename, pvp, vp,
	    &dirndp->ni_cnd, CAST_USER_ADDR_T(&smnt_data), mount_flags,
	    KERNEL_MOUNT_SNAPSHOT, NULL, ctx);

out2:
	vnode_put(vp);
	vnode_put(pvp);
	nameidone(dirndp);
out1:
	vnode_put(snapvp);
	vnode_put(snapdvp);
	vnode_put(rvp);
	nameidone(snapndp);
out:
	kfree_type(typeof(*__snapshot_mount_data), __snapshot_mount_data);
	return error;
}
14798
14799 /*
14800 * Root from a snapshot of the filesystem
14801 *
14802 * Marks the filesystem to root from the given snapshot on next boot.
14803 */
static int __attribute__((noinline))
snapshot_root(int dirfd, user_addr_t name, __unused uint32_t flags,
    vfs_context_t ctx)
{
	int error;
	vnode_t rvp;
	mount_t mp;
	struct fs_snapshot_root_args root_data;
	struct componentname cnp;
	caddr_t name_buf;
	size_t name_len;

	error = vnode_getfromfd(ctx, dirfd, &rvp);
	if (error) {
		return error;
	}
	mp = vnode_mount(rvp);

	name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
	if (error) {
		zfree(ZV_NAMEI, name_buf);
		vnode_put(rvp);
		return error;
	}

	// XXX MAC checks ?

	/*
	 * Grab mount_iterref so that we can release the vnode,
	 * since VFSIOC_ROOT_SNAPSHOT could conceivably cause a sync.
	 */
	error = mount_iterref(mp, 0);
	vnode_put(rvp);
	if (error) {
		zfree(ZV_NAMEI, name_buf);
		return error;
	}

	/* Hand the snapshot name to the FS as a pre-built componentname. */
	memset(&cnp, 0, sizeof(cnp));
	cnp.cn_pnbuf = (char *)name_buf;
	cnp.cn_nameiop = LOOKUP;
	cnp.cn_flags = ISLASTCN | HASBUF;
	cnp.cn_pnlen = MAXPATHLEN;
	cnp.cn_nameptr = cnp.cn_pnbuf;
	cnp.cn_namelen = (int)name_len;
	root_data.sr_cnp = &cnp;

	error = VFS_IOCTL(mp, VFSIOC_ROOT_SNAPSHOT, (caddr_t)&root_data, 0, ctx);

	mount_iterdrop(mp);
	zfree(ZV_NAMEI, name_buf);

	return error;
}
14859
14860 static boolean_t
vfs_context_can_snapshot(vfs_context_t ctx)14861 vfs_context_can_snapshot(vfs_context_t ctx)
14862 {
14863 static const char * const snapshot_entitlements[] = {
14864 "com.apple.private.vfs.snapshot",
14865 "com.apple.developer.vfs.snapshot",
14866 "com.apple.private.apfs.arv.limited.snapshot",
14867 };
14868 static const size_t nentitlements =
14869 sizeof(snapshot_entitlements) / sizeof(snapshot_entitlements[0]);
14870 size_t i;
14871
14872 task_t task = vfs_context_task(ctx);
14873 for (i = 0; i < nentitlements; i++) {
14874 if (IOTaskHasEntitlement(task, snapshot_entitlements[i])) {
14875 return TRUE;
14876 }
14877 }
14878 return FALSE;
14879 }
14880
14881 /*
14882 * FS snapshot operations dispatcher
14883 */
int
fs_snapshot(__unused proc_t p, struct fs_snapshot_args *uap,
    __unused int32_t *retval)
{
	int error;
	vfs_context_t ctx = vfs_context_current();

	AUDIT_ARG(fd, uap->dirfd);
	AUDIT_ARG(value32, uap->op);

	/* Gate the whole syscall on the snapshot entitlements. */
	if (!vfs_context_can_snapshot(ctx)) {
		return EPERM;
	}

	/*
	 * Enforce user authorization for snapshot modification operations,
	 * or if trying to root from snapshot.
	 */
	if (uap->op != SNAPSHOT_OP_MOUNT) {
		vnode_t dvp = NULLVP;
		vnode_t devvp = NULLVP;
		mount_t mp;

		error = vnode_getfromfd(ctx, uap->dirfd, &dvp);
		if (error) {
			return error;
		}
		mp = vnode_mount(dvp);
		devvp = mp->mnt_devvp;

		/* get an iocount on devvp */
		if (devvp == NULLVP) {
			error = vnode_lookup(mp->mnt_vfsstat.f_mntfromname, 0, &devvp, ctx);
			/* for mounts which arent block devices */
			if (error == ENOENT) {
				error = ENXIO;
			}
		} else {
			error = vnode_getwithref(devvp);
		}

		if (error) {
			vnode_put(dvp);
			return error;
		}

		/* Allow if: superuser, OR write access to the backing device,
		 * OR the dedicated user-snapshot entitlement. */
		if ((vfs_context_issuser(ctx) == 0) &&
		    (vnode_authorize(devvp, NULL, KAUTH_VNODE_WRITE_DATA, ctx) != 0) &&
		    (!IOTaskHasEntitlement(vfs_context_task(ctx), "com.apple.private.vfs.snapshot.user"))) {
			error = EPERM;
		}
		vnode_put(dvp);
		vnode_put(devvp);

		if (error) {
			return error;
		}
	}

	/* Dispatch to the per-operation helper. */
	switch (uap->op) {
	case SNAPSHOT_OP_CREATE:
		error = snapshot_create(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
	case SNAPSHOT_OP_DELETE:
		error = snapshot_delete(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
	case SNAPSHOT_OP_RENAME:
		error = snapshot_rename(uap->dirfd, uap->name1, uap->name2,
		    uap->flags, ctx);
		break;
	case SNAPSHOT_OP_MOUNT:
		error = snapshot_mount(uap->dirfd, uap->name1, uap->name2,
		    uap->data, uap->flags, ctx);
		break;
	case SNAPSHOT_OP_REVERT:
		error = snapshot_revert(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
#if CONFIG_MNT_ROOTSNAP
	case SNAPSHOT_OP_ROOT:
		error = snapshot_root(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
#endif /* CONFIG_MNT_ROOTSNAP */
	default:
		error = ENOSYS;
	}

	return error;
}
14972