1 /*
2 * Copyright (c) 1995-2022 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * Copyright (c) 1989, 1993
30 * The Regents of the University of California. All rights reserved.
31 * (c) UNIX System Laboratories, Inc.
32 * All or some portions of this file are derived from material licensed
33 * to the University of California by American Telephone and Telegraph
34 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
35 * the permission of UNIX System Laboratories, Inc.
36 *
37 * Redistribution and use in source and binary forms, with or without
38 * modification, are permitted provided that the following conditions
39 * are met:
40 * 1. Redistributions of source code must retain the above copyright
41 * notice, this list of conditions and the following disclaimer.
42 * 2. Redistributions in binary form must reproduce the above copyright
43 * notice, this list of conditions and the following disclaimer in the
44 * documentation and/or other materials provided with the distribution.
45 * 3. All advertising materials mentioning features or use of this software
46 * must display the following acknowledgement:
47 * This product includes software developed by the University of
48 * California, Berkeley and its contributors.
49 * 4. Neither the name of the University nor the names of its contributors
50 * may be used to endorse or promote products derived from this software
51 * without specific prior written permission.
52 *
53 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63 * SUCH DAMAGE.
64 *
65 * @(#)vfs_syscalls.c 8.41 (Berkeley) 6/15/95
66 */
67 /*
68 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
69 * support for mandatory and extensible security protections. This notice
70 * is included in support of clause 2.2 (b) of the Apple Public License,
71 * Version 2.0.
72 */
73
74 #include <sys/param.h>
75 #include <sys/systm.h>
76 #include <sys/syslimits.h> /* For MAXLONGPATHLEN */
77 #include <sys/namei.h>
78 #include <sys/filedesc.h>
79 #include <sys/kernel.h>
80 #include <sys/file_internal.h>
81 #include <sys/stat.h>
82 #include <sys/vnode_internal.h>
83 #include <sys/mount_internal.h>
84 #include <sys/proc_internal.h>
85 #include <sys/kauth.h>
86 #include <sys/uio_internal.h>
87 #include <kern/kalloc.h>
88 #include <sys/mman.h>
89 #include <sys/dirent.h>
90 #include <sys/attr.h>
91 #include <sys/sysctl.h>
92 #include <sys/ubc.h>
93 #include <sys/quota.h>
94 #include <sys/kdebug.h>
95 #include <sys/fsevents.h>
96 #include <sys/imgsrc.h>
97 #include <sys/sysproto.h>
98 #include <sys/sysctl.h>
99 #include <sys/xattr.h>
100 #include <sys/fcntl.h>
101 #include <sys/stdio.h>
102 #include <sys/fsctl.h>
103 #include <sys/ubc_internal.h>
104 #include <sys/disk.h>
105 #include <sys/content_protection.h>
106 #include <sys/clonefile.h>
107 #include <sys/snapshot.h>
108 #include <sys/priv.h>
109 #include <sys/fsgetpath.h>
110 #include <machine/cons.h>
111 #include <machine/limits.h>
112 #include <miscfs/specfs/specdev.h>
113
114 #include <vfs/vfs_disk_conditioner.h>
115 #if CONFIG_EXCLAVES
116 #include <vfs/vfs_exclave_fs.h>
117 #endif
118
119 #include <security/audit/audit.h>
120 #include <bsm/audit_kevents.h>
121
122 #include <mach/mach_types.h>
123 #include <kern/kern_types.h>
124 #include <kern/kalloc.h>
125 #include <kern/task.h>
126
127 #include <vm/vm_pageout.h>
128 #include <vm/vm_protos.h>
129 #include <vm/memory_object_xnu.h>
130
131 #include <libkern/OSAtomic.h>
132 #include <os/atomic_private.h>
133 #include <pexpert/pexpert.h>
134 #include <IOKit/IOBSD.h>
135
136 // deps for MIG call
137 #include <kern/host.h>
138 #include <kern/ipc_misc.h>
139 #include <mach/host_priv.h>
140 #include <mach/vfs_nspace.h>
141 #include <os/log.h>
142
143 #include <nfs/nfs_conf.h>
144
145 #if ROUTEFS
146 #include <miscfs/routefs/routefs.h>
147 #endif /* ROUTEFS */
148
149 #if CONFIG_MACF
150 #include <security/mac.h>
151 #include <security/mac_framework.h>
152 #endif
153
154 #if CONFIG_FSE
155 #define GET_PATH(x) \
156 ((x) = get_pathbuff())
157 #define RELEASE_PATH(x) \
158 release_pathbuff(x)
159 #else
160 #define GET_PATH(x) \
161 ((x) = zalloc(ZV_NAMEI))
162 #define RELEASE_PATH(x) \
163 zfree(ZV_NAMEI, x)
164 #endif /* CONFIG_FSE */
165
166 #ifndef HFS_GET_BOOT_INFO
167 #define HFS_GET_BOOT_INFO (FCNTL_FS_SPECIFIC_BASE + 0x00004)
168 #endif
169
170 #ifndef HFS_SET_BOOT_INFO
171 #define HFS_SET_BOOT_INFO (FCNTL_FS_SPECIFIC_BASE + 0x00005)
172 #endif
173
174 #ifndef APFSIOC_REVERT_TO_SNAPSHOT
175 #define APFSIOC_REVERT_TO_SNAPSHOT _IOW('J', 1, u_int64_t)
176 #endif
177
178 extern void disk_conditioner_unmount(mount_t mp);
179
180 /* struct for checkdirs iteration */
181 struct cdirargs {
182 vnode_t olddp;
183 vnode_t newdp;
184 };
185 /* callback for checkdirs iteration */
186 static int checkdirs_callback(proc_t p, void * arg);
187
188 static int change_dir(struct nameidata *ndp, vfs_context_t ctx);
189 static int checkdirs(vnode_t olddp, vfs_context_t ctx);
190 void enablequotas(struct mount *mp, vfs_context_t ctx);
191 static int getfsstat_callback(mount_t mp, void * arg);
192 static int getutimes(user_addr_t usrtvp, struct timespec *tsp);
193 static int setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts, int nullflag);
194 static int sync_callback(mount_t, void *);
195 static int munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
196 user_addr_t bufp, int *sizep, boolean_t is_64_bit,
197 boolean_t partial_copy);
198 static int fsync_common(proc_t p, struct fsync_args *uap, int flags);
199 static int mount_common(const char *fstypename, vnode_t pvp, vnode_t vp,
200 struct componentname *cnp, user_addr_t fsmountargs,
201 int flags, uint32_t internal_flags, char *labelstr, vfs_context_t ctx);
202 void vfs_notify_mount(vnode_t pdvp);
203
204 int prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, uint32_t internal_flags);
205
206 struct fd_vn_data * fg_vn_data_alloc(void);
207
208 /*
209 * Max retries for ENOENT returns from vn_authorize_{rmdir, unlink, rename}
210 * Concurrent lookups (or lookups by ids) on hard links can cause the
211 * vn_getpath (which does not re-enter the filesystem as vn_getpath_fsenter
212 * does) to return ENOENT as the path cannot be returned from the name cache
213 * alone. We have no option but to retry and hope to get one namei->reverse path
214 * generation done without an intervening lookup, lookup by id on the hard link
215 * item. This is only an issue for MAC hooks which cannot reenter the filesystem
216 * which currently are the MAC hooks for rename, unlink and rmdir.
217 */
218 #define MAX_AUTHORIZE_ENOENT_RETRIES 1024
219
220 /* Max retry limit for rename due to vnode recycling. */
221 #define MAX_RENAME_ERECYCLE_RETRIES 1024
222
223 #define MAX_LINK_ENOENT_RETRIES 1024
224
225 /* Max retries for concurrent mounts on the same covered vnode. */
226 #define MAX_MOUNT_RETRIES 10
227
228 static int rmdirat_internal(vfs_context_t, int, user_addr_t, enum uio_seg,
229 int unlink_flags);
230
231 #ifdef CONFIG_IMGSRC_ACCESS
232 static int authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx);
233 static int place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx);
234 static void undo_place_on_covered_vp(mount_t mp, vnode_t vp);
235 static int mount_begin_update(mount_t mp, vfs_context_t ctx, int flags);
236 static void mount_end_update(mount_t mp);
237 static int relocate_imageboot_source(vnode_t pvp, vnode_t vp, struct componentname *cnp, const char *fsname, vfs_context_t ctx, boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index);
238 #endif /* CONFIG_IMGSRC_ACCESS */
239
240 //snapshot functions
241 #if CONFIG_MNT_ROOTSNAP
242 static int __attribute__ ((noinline)) snapshot_root(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx);
243 #else
244 static int __attribute__ ((noinline)) snapshot_root(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx) __attribute__((unused));
245 #endif
246
247 __private_extern__
248 int sync_internal(void);
249
250 __private_extern__
251 int unlink1(vfs_context_t, vnode_t, user_addr_t, enum uio_seg, int);
252
253 static LCK_GRP_DECLARE(fd_vn_lck_grp, "fd_vnode_data");
254 static LCK_ATTR_DECLARE(fd_vn_lck_attr, 0, 0);
255
256 /* vars for sync mutex */
257 static LCK_GRP_DECLARE(sync_mtx_lck_grp, "sync thread");
258 static LCK_MTX_DECLARE(sync_mtx_lck, &sync_mtx_lck_grp);
259
260 extern lck_rw_t rootvnode_rw_lock;
261
262 VFS_SMR_DECLARE;
263 extern uint32_t nc_smr_enabled;
264
265 /*
266 * incremented each time a mount or unmount operation occurs
267 * used to invalidate the cached value of the rootvp in the
268 * mount structure utilized by cache_lookup_path
269 */
270 uint32_t mount_generation = 0;
271
272 /* counts number of mount and unmount operations */
273 unsigned int vfs_nummntops = 0;
274
275 /* system-wide, per-boot unique mount ID */
276 static _Atomic uint64_t mount_unique_id = 1;
277
278 extern const struct fileops vnops;
279 #if CONFIG_APPLEDOUBLE
280 extern errno_t rmdir_remove_orphaned_appleDouble(vnode_t, vfs_context_t, int *);
281 #endif /* CONFIG_APPLEDOUBLE */
282
283
284 /*
285 * Virtual File System System Calls
286 */
287
288 /*
289 * Private in-kernel mounting spi (specific use-cases only)
290 */
291 boolean_t
vfs_iskernelmount(mount_t mp)292 vfs_iskernelmount(mount_t mp)
293 {
294 return (mp->mnt_kern_flag & MNTK_KERNEL_MOUNT) ? TRUE : FALSE;
295 }
296
/*
 * kernel_mount():
 *	Mount filesystem `fstype` on behalf of the kernel.
 *
 *	The vnode to be covered is either supplied by the caller (`vp`,
 *	with `pvp` as its parent) or, when vp == NULLVP, obtained by a
 *	namei() lookup of `path` (a kernel-space string).  Vnodes obtained
 *	via the lookup are released here; caller-supplied vnodes are NOT
 *	released by this function.
 *
 *	`kern_flags` is masked with KERNEL_MOUNT_SANITIZE_MASK and tagged
 *	with KERNEL_MOUNT_KMOUNT before being handed to mount_common().
 *
 * Returns: 0 on success, an errno value on failure.
 */
__private_extern__
int
kernel_mount(const char *fstype, vnode_t pvp, vnode_t vp, const char *path,
    void *data, __unused size_t datalen, int syscall_flags, uint32_t kern_flags,
    vfs_context_t ctx)
{
	struct nameidata nd;
	boolean_t did_namei;	/* TRUE iff we own the iocounts on vp/pvp */
	int error;

	NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(path), ctx);
	if (syscall_flags & MNT_NOFOLLOW) {
		/* honor MNT_NOFOLLOW: refuse to traverse any symlink in the path */
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}

	/* strip kernel-mount flags callers are not allowed to pass through */
	kern_flags &= KERNEL_MOUNT_SANITIZE_MASK;

	/*
	 * Get the vnode to be covered if it's not supplied
	 */
	if (vp == NULLVP) {
		error = namei(&nd);
		if (error) {
			if (kern_flags & (KERNEL_MOUNT_SNAPSHOT | KERNEL_MOUNT_VOLBYROLE_MASK)) {
				printf("failed to locate mount-on path: %s ", path);
			}
			return error;
		}
		vp = nd.ni_vp;
		pvp = nd.ni_dvp;
		did_namei = TRUE;
	} else {
		/*
		 * Caller supplied the covered vnode; fill in just enough of
		 * the componentname for mount_common().  `path` is kernel
		 * memory, so the const cast is safe here.
		 */
		char *pnbuf = CAST_DOWN(char *, path);

		nd.ni_cnd.cn_pnbuf = pnbuf;
		nd.ni_cnd.cn_pnlen = (int)(strlen(pnbuf) + 1);
		did_namei = FALSE;
	}

	kern_flags |= KERNEL_MOUNT_KMOUNT;
	error = mount_common(fstype, pvp, vp, &nd.ni_cnd, CAST_USER_ADDR_T(data),
	    syscall_flags, kern_flags, NULL, ctx);

	/* only release iocounts we acquired ourselves via namei() */
	if (did_namei) {
		vnode_put(vp);
		vnode_put(pvp);
		nameidone(&nd);
	}

	return error;
}
349
350 int
vfs_mount_at_path(const char * fstype,const char * path,vnode_t pvp,vnode_t vp,void * data,size_t datalen,int mnt_flags,int flags)351 vfs_mount_at_path(const char *fstype, const char *path,
352 vnode_t pvp, vnode_t vp, void *data, size_t datalen,
353 int mnt_flags, int flags)
354 {
355 int syscall_flags = MNT_AUTOMOUNTED | mnt_flags;
356 int error, km_flags = 0;
357 vfs_context_t ctx = (flags & VFS_MOUNT_FLAG_CURRENT_CONTEXT) ? vfs_context_current() : vfs_context_kernel();
358
359 /*
360 * This call is currently restricted to specific use cases.
361 */
362 if ((strcmp(fstype, "lifs") != 0) && (strcmp(fstype, "nfs") != 0)) {
363 return ENOTSUP;
364 }
365
366 #if !defined(XNU_TARGET_OS_OSX)
367 if (strcmp(fstype, "lifs") == 0) {
368 syscall_flags |= MNT_NOEXEC;
369 }
370 #endif
371
372 if (flags & VFS_MOUNT_FLAG_NOAUTH) {
373 km_flags |= KERNEL_MOUNT_NOAUTH;
374 }
375 if (flags & VFS_MOUNT_FLAG_PERMIT_UNMOUNT) {
376 km_flags |= KERNEL_MOUNT_PERMIT_UNMOUNT;
377 }
378
379 error = kernel_mount(fstype, pvp, vp, path, data, datalen,
380 syscall_flags, km_flags, ctx);
381 if (error) {
382 printf("%s: mount on %s failed, error %d\n", __func__, path,
383 error);
384 }
385
386 return error;
387 }
388
389 /*
390 * Mount a file system.
391 */
392 /* ARGSUSED */
393 int
mount(proc_t p,struct mount_args * uap,__unused int32_t * retval)394 mount(proc_t p, struct mount_args *uap, __unused int32_t *retval)
395 {
396 struct __mac_mount_args muap;
397
398 muap.type = uap->type;
399 muap.path = uap->path;
400 muap.flags = uap->flags;
401 muap.data = uap->data;
402 muap.mac_p = USER_ADDR_NULL;
403 return __mac_mount(p, &muap, retval);
404 }
405
406 int
fmount(__unused proc_t p,struct fmount_args * uap,__unused int32_t * retval)407 fmount(__unused proc_t p, struct fmount_args *uap, __unused int32_t *retval)
408 {
409 struct componentname cn;
410 vfs_context_t ctx = vfs_context_current();
411 size_t dummy = 0;
412 int error;
413 int flags = uap->flags;
414 char fstypename[MFSNAMELEN];
415 char *labelstr = NULL; /* regular mount call always sets it to NULL for __mac_mount() */
416 vnode_t pvp;
417 vnode_t vp;
418
419 AUDIT_ARG(fd, uap->fd);
420 AUDIT_ARG(fflags, flags);
421 /* fstypename will get audited by mount_common */
422
423 /* Sanity check the flags */
424 if (flags & (MNT_IMGSRC_BY_INDEX | MNT_ROOTFS)) {
425 return ENOTSUP;
426 }
427
428 if (flags & MNT_UNION) {
429 return EPERM;
430 }
431
432 error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy);
433 if (error) {
434 return error;
435 }
436
437 if ((error = file_vnode(uap->fd, &vp)) != 0) {
438 return error;
439 }
440
441 if ((error = vnode_getwithref(vp)) != 0) {
442 file_drop(uap->fd);
443 return error;
444 }
445
446 pvp = vnode_getparent(vp);
447 if (pvp == NULL) {
448 if (vp->v_mountedhere || (vp->v_flag & VROOT) != 0) {
449 error = EBUSY;
450 } else {
451 error = EINVAL;
452 }
453 vnode_put(vp);
454 file_drop(uap->fd);
455 return error;
456 }
457
458 memset(&cn, 0, sizeof(struct componentname));
459 cn.cn_pnbuf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
460 cn.cn_pnlen = MAXPATHLEN;
461
462 if ((error = vn_getpath(vp, cn.cn_pnbuf, &cn.cn_pnlen)) != 0) {
463 zfree(ZV_NAMEI, cn.cn_pnbuf);
464 vnode_put(pvp);
465 vnode_put(vp);
466 file_drop(uap->fd);
467 return error;
468 }
469
470 error = mount_common(fstypename, pvp, vp, &cn, uap->data, flags, KERNEL_MOUNT_FMOUNT, labelstr, ctx);
471
472 zfree(ZV_NAMEI, cn.cn_pnbuf);
473 vnode_put(pvp);
474 vnode_put(vp);
475 file_drop(uap->fd);
476
477 return error;
478 }
479
480 #define MAX_GRAFT_METADATA_SIZE 16384 /* bytes */
481
482 /*
483 * Get the size of a graft file (a manifest or payload file).
484 * The vp should be an iocounted vnode.
485 */
486 static int
get_and_verify_graft_metadata_vp_size(vnode_t graft_vp,vfs_context_t vctx,size_t * size)487 get_and_verify_graft_metadata_vp_size(vnode_t graft_vp, vfs_context_t vctx, size_t *size)
488 {
489 struct stat64 sb = {};
490 int error;
491
492 *size = 0;
493
494 error = vn_stat(graft_vp, &sb, NULL, 1, 0, vctx);
495 if (error) {
496 return error;
497 }
498
499 if (sb.st_size == 0) {
500 error = ENODATA;
501 } else if ((size_t) sb.st_size > MAX_GRAFT_METADATA_SIZE) {
502 error = EFBIG;
503 } else {
504 *size = (size_t) sb.st_size;
505 }
506
507 return error;
508 }
509
510 /*
511 * Read in a graft file (a manifest or payload file) of size `size` into `buf`.
512 * `size` must already be validated.
513 */
514 static int
read_graft_metadata_vp(vnode_t graft_vp,vfs_context_t vctx,size_t size,void * buf)515 read_graft_metadata_vp(vnode_t graft_vp, vfs_context_t vctx, size_t size, void *buf)
516 {
517 return vn_rdwr(UIO_READ, graft_vp,
518 (caddr_t) buf, (int) size, /* offset */ 0,
519 UIO_SYSSPACE, IO_NOCACHE | IO_RAOFF | IO_UNIT,
520 vfs_context_ucred(vctx), /* resid */ NULL,
521 vfs_context_proc(vctx));
522 }
523
524 /*
525 * Convert a single graft file descriptor into a vnode, get its size (saving it to `size`),
526 * and read it into `buf`.
527 * If `path_prefix` is non-NULL, verify that the file path has that prefix.
528 */
529 static int
graft_secureboot_read_fd(int fd,vfs_context_t vctx,const char * path_prefix,size_t * size,void * buf)530 graft_secureboot_read_fd(int fd, vfs_context_t vctx, const char *path_prefix, size_t *size, void *buf)
531 {
532 vnode_t metadata_vp = NULLVP;
533 char *path = NULL;
534 int error;
535
536 // Convert this graft fd to a vnode.
537 if ((error = vnode_getfromfd(vctx, fd, &metadata_vp)) != 0) {
538 goto out;
539 }
540
541 // Verify that the vnode path starts with `path_prefix` if it was passed.
542 if (path_prefix) {
543 int len = MAXPATHLEN;
544 path = zalloc(ZV_NAMEI);
545 if ((error = vn_getpath(metadata_vp, path, &len))) {
546 goto out;
547 }
548 if (strncmp(path, path_prefix, strlen(path_prefix))) {
549 error = EINVAL;
550 goto out;
551 }
552 }
553
554 // Get (and validate) size information.
555 if ((error = get_and_verify_graft_metadata_vp_size(metadata_vp, vctx, size)) != 0) {
556 goto out;
557 }
558
559 // Read each file into the provided buffer - we must get the expected amount of bytes.
560 if ((error = read_graft_metadata_vp(metadata_vp, vctx, *size, buf)) != 0) {
561 goto out;
562 }
563
564 out:
565 if (path) {
566 zfree(ZV_NAMEI, path);
567 }
568 if (metadata_vp) {
569 vnode_put(metadata_vp);
570 metadata_vp = NULLVP;
571 }
572
573 return error;
574 }
575
576 #if XNU_TARGET_OS_OSX
577 #if defined(__arm64e__)
578 #define MOBILE_ASSET_DATA_VAULT_PATH "/System/Library/AssetsV2/manifests/"
579 #else /* x86_64 */
580 #define MOBILE_ASSET_DATA_VAULT_PATH "/System/Library/AssetsV2/"
581 #endif /* x86_64 */
582 #else /* !XNU_TARGET_OS_OSX */
583 #define MOBILE_ASSET_DATA_VAULT_PATH "/private/var/MobileAsset/AssetsV2/manifests/"
584 #endif /* !XNU_TARGET_OS_OSX */
585
586 /*
587 * Read graft file descriptors into buffers of size MAX_GRAFT_METADATA_SIZE
588 * provided in `gfs`, saving the size of data read in `gfs`.
589 */
590 static int
graft_secureboot_read_metadata(uint32_t graft_type,secure_boot_cryptex_args_t * sbc_args,vfs_context_t vctx,fsioc_graft_fs_t * gfs)591 graft_secureboot_read_metadata(uint32_t graft_type, secure_boot_cryptex_args_t *sbc_args,
592 vfs_context_t vctx, fsioc_graft_fs_t *gfs)
593 {
594 const char *manifest_path_prefix = NULL;
595 int error;
596
597 // For Mobile Asset, make sure that the manifest comes from a data vault.
598 if (graft_type == GRAFTDMG_CRYPTEX_MOBILE_ASSET) {
599 manifest_path_prefix = MOBILE_ASSET_DATA_VAULT_PATH;
600 }
601
602 // Read the authentic manifest.
603 if ((error = graft_secureboot_read_fd(sbc_args->sbc_authentic_manifest_fd, vctx,
604 manifest_path_prefix, &gfs->authentic_manifest_size, gfs->authentic_manifest))) {
605 return error;
606 }
607
608 // The user manifest is currently unused, but set its size.
609 gfs->user_manifest_size = 0;
610
611 // Read the payload.
612 if ((error = graft_secureboot_read_fd(sbc_args->sbc_payload_fd, vctx,
613 NULL, &gfs->payload_size, gfs->payload))) {
614 return error;
615 }
616
617 return 0;
618 }
619
620 /*
621 * Call into the filesystem to verify and graft a cryptex.
622 */
623 static int
graft_secureboot_cryptex(uint32_t graft_type,secure_boot_cryptex_args_t * sbc_args,vfs_context_t vctx,vnode_t cryptex_vp,vnode_t mounton_vp)624 graft_secureboot_cryptex(uint32_t graft_type, secure_boot_cryptex_args_t *sbc_args,
625 vfs_context_t vctx, vnode_t cryptex_vp, vnode_t mounton_vp)
626 {
627 fsioc_graft_fs_t gfs = {};
628 uint64_t graft_dir_ino = 0;
629 struct stat64 sb = {};
630 int error;
631
632 // Pre-flight arguments.
633 if (sbc_args->sbc_version != GRAFTDMG_SECURE_BOOT_CRYPTEX_ARGS_VERSION) {
634 // Make sure that this graft version matches what we support.
635 return ENOTSUP;
636 } else if (mounton_vp && cryptex_vp->v_mount != mounton_vp->v_mount) {
637 // For this type, cryptex VP must live on same volume as the target of graft.
638 return EXDEV;
639 } else if (mounton_vp && mounton_vp->v_type != VDIR) {
640 // We cannot graft upon non-directories.
641 return ENOTDIR;
642 } else if (cryptex_vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV) {
643 // We do not allow grafts inside disk images.
644 return ENODEV;
645 } else if (sbc_args->sbc_authentic_manifest_fd < 0 ||
646 sbc_args->sbc_payload_fd < 0) {
647 // We cannot graft without a manifest and payload.
648 return EINVAL;
649 }
650
651 if (mounton_vp) {
652 // Get the mounton's inode number.
653 error = vn_stat(mounton_vp, &sb, NULL, 1, 0, vctx);
654 if (error) {
655 return error;
656 }
657 graft_dir_ino = (uint64_t) sb.st_ino;
658 }
659
660 // Create buffers (of our maximum-defined size) to store authentication info.
661 gfs.authentic_manifest = kalloc_data(MAX_GRAFT_METADATA_SIZE, Z_WAITOK | Z_ZERO);
662 gfs.payload = kalloc_data(MAX_GRAFT_METADATA_SIZE, Z_WAITOK | Z_ZERO);
663
664 if (!gfs.authentic_manifest || !gfs.payload) {
665 error = ENOMEM;
666 goto out;
667 }
668
669 // Read our fd's into our buffers.
670 // (Note that this will set the buffer size fields in `gfs`.)
671 error = graft_secureboot_read_metadata(graft_type, sbc_args, vctx, &gfs);
672 if (error) {
673 goto out;
674 }
675
676 gfs.graft_version = FSIOC_GRAFT_VERSION;
677 gfs.graft_type = graft_type;
678 gfs.graft_4cc = sbc_args->sbc_4cc;
679 if (sbc_args->sbc_flags & SBC_PRESERVE_MOUNT) {
680 gfs.graft_flags |= FSCTL_GRAFT_PRESERVE_MOUNT;
681 }
682 if (sbc_args->sbc_flags & SBC_ALTERNATE_SHARED_REGION) {
683 gfs.graft_flags |= FSCTL_GRAFT_ALTERNATE_SHARED_REGION;
684 }
685 if (sbc_args->sbc_flags & SBC_SYSTEM_CONTENT) {
686 gfs.graft_flags |= FSCTL_GRAFT_SYSTEM_CONTENT;
687 }
688 if (sbc_args->sbc_flags & SBC_PANIC_ON_AUTHFAIL) {
689 gfs.graft_flags |= FSCTL_GRAFT_PANIC_ON_AUTHFAIL;
690 }
691 if (sbc_args->sbc_flags & SBC_STRICT_AUTH) {
692 gfs.graft_flags |= FSCTL_GRAFT_STRICT_AUTH;
693 }
694 if (sbc_args->sbc_flags & SBC_PRESERVE_GRAFT) {
695 gfs.graft_flags |= FSCTL_GRAFT_PRESERVE_GRAFT;
696 }
697 gfs.dir_ino = graft_dir_ino; // ino from mounton_vp (if not provided, the parent directory)
698
699 // Call into the FS to perform the graft (and validation).
700 error = VNOP_IOCTL(cryptex_vp, FSIOC_GRAFT_FS, (caddr_t)&gfs, 0, vctx);
701
702 out:
703 if (gfs.authentic_manifest) {
704 kfree_data(gfs.authentic_manifest, MAX_GRAFT_METADATA_SIZE);
705 gfs.authentic_manifest = NULL;
706 }
707 if (gfs.payload) {
708 kfree_data(gfs.payload, MAX_GRAFT_METADATA_SIZE);
709 gfs.payload = NULL;
710 }
711
712 return error;
713 }
714
715 #define GRAFTDMG_ENTITLEMENT "com.apple.private.vfs.graftdmg"
716
717 /*
718 * Graft a cryptex disk image (via FD) onto the appropriate mount-point
719 * { int graftdmg(int dmg_fd, const char *mountdir, uint32_t graft_type, graftdmg_args_un *gda); }
720 */
721 int
graftdmg(__unused proc_t p,struct graftdmg_args * uap,__unused int32_t * retval)722 graftdmg(__unused proc_t p, struct graftdmg_args *uap, __unused int32_t *retval)
723 {
724 int ua_dmgfd = uap->dmg_fd;
725 user_addr_t ua_mountdir = uap->mountdir;
726 uint32_t ua_grafttype = uap->graft_type;
727 user_addr_t ua_graftargs = uap->gda;
728
729 graftdmg_args_un kern_gda = {};
730 int error = 0;
731 secure_boot_cryptex_args_t *sbc_args = NULL;
732 bool graft_on_parent = (ua_mountdir == USER_ADDR_NULL);
733
734 vnode_t cryptex_vp = NULLVP;
735 struct nameidata nd = {};
736 vfs_context_t ctx = vfs_context_current();
737 #if CONFIG_MACF
738 vnode_t parent_vp = NULLVP;
739 #endif
740
741 if (!IOTaskHasEntitlement(vfs_context_task(ctx), GRAFTDMG_ENTITLEMENT)) {
742 return EPERM;
743 }
744
745 // Copy graftargs in, if provided.
746 error = copyin(ua_graftargs, &kern_gda, sizeof(graftdmg_args_un));
747 if (error) {
748 return error;
749 }
750
751 // Convert fd to vnode.
752 error = vnode_getfromfd(ctx, ua_dmgfd, &cryptex_vp);
753 if (error) {
754 return error;
755 }
756
757 if (vnode_isdir(cryptex_vp)) {
758 error = EISDIR;
759 goto graftout;
760 }
761
762 #if CONFIG_MACF
763 if (graft_on_parent) {
764 // Grafting on Cryptex file parent directory, need to get its vp for MAC check.
765 parent_vp = vnode_getparent(cryptex_vp);
766 if (parent_vp == NULLVP) {
767 error = ENOENT;
768 goto graftout;
769 }
770 }
771 #endif
772
773 if (!graft_on_parent) {
774 NDINIT(&nd, LOOKUP, OP_MOUNT, (FOLLOW | AUDITVNPATH1),
775 UIO_USERSPACE, ua_mountdir, ctx);
776
777 error = namei(&nd);
778 if (error) {
779 goto graftout;
780 }
781 }
782
783 #if CONFIG_MACF
784 vnode_t macf_vp = graft_on_parent ? parent_vp : nd.ni_vp;
785 error = mac_graft_check_graft(ctx, macf_vp);
786 if (error) {
787 goto graftout;
788 }
789 #endif
790
791 if (ua_grafttype == 0 || ua_grafttype > GRAFTDMG_CRYPTEX_MAX) {
792 error = EINVAL;
793 } else {
794 sbc_args = &kern_gda.sbc_args;
795 error = graft_secureboot_cryptex(ua_grafttype, sbc_args, ctx,
796 cryptex_vp, graft_on_parent ? NULLVP : nd.ni_vp);
797 }
798
799 #if CONFIG_MACF
800 if (!error) {
801 mac_graft_notify_graft(ctx, macf_vp);
802 }
803 #endif
804
805 graftout:
806 #if CONFIG_MACF
807 if (parent_vp != NULLVP) {
808 vnode_put(parent_vp);
809 parent_vp = NULLVP;
810 }
811 #endif
812 if (cryptex_vp != NULLVP) {
813 vnode_put(cryptex_vp);
814 cryptex_vp = NULLVP;
815 }
816 if (nd.ni_vp != NULLVP) {
817 vnode_put(nd.ni_vp);
818 nameidone(&nd);
819 }
820
821 return error;
822 }
823
824 /*
825 * Ungraft a cryptex disk image (via mount dir FD)
826 * { int ungraftdmg(const char *mountdir, uint64_t flags); }
827 */
828 int
ungraftdmg(__unused proc_t p,struct ungraftdmg_args * uap,__unused int32_t * retval)829 ungraftdmg(__unused proc_t p, struct ungraftdmg_args *uap, __unused int32_t *retval)
830 {
831 int error = 0;
832 user_addr_t ua_mountdir = uap->mountdir;
833 fsioc_ungraft_fs_t ugfs = {};
834 struct nameidata nd = {};
835 vfs_context_t ctx = vfs_context_current();
836
837 if (!IOTaskHasEntitlement(vfs_context_task(ctx), GRAFTDMG_ENTITLEMENT)) {
838 return EPERM;
839 }
840
841 if (ua_mountdir == USER_ADDR_NULL) {
842 return EINVAL;
843 }
844
845 if (uap->flags & UNGRAFTDMG_NOFORCE) {
846 ugfs.ungraft_flags |= FSCTL_UNGRAFT_NOFORCE;
847 }
848
849 // Acquire vnode for mount-on path
850 NDINIT(&nd, LOOKUP, OP_MOUNT, (FOLLOW | AUDITVNPATH1),
851 UIO_USERSPACE, ua_mountdir, ctx);
852
853 error = namei(&nd);
854 if (error) {
855 return error;
856 }
857
858 if (!vnode_isdir(nd.ni_vp)) {
859 error = ENOTDIR;
860 goto ungraftout;
861 }
862
863 #if CONFIG_MACF
864 error = mac_graft_check_ungraft(ctx, nd.ni_vp);
865 if (error) {
866 goto ungraftout;
867 }
868 #endif
869
870 // Call into the FS to perform the ungraft
871 error = VNOP_IOCTL(nd.ni_vp, FSIOC_UNGRAFT_FS, (caddr_t)&ugfs, 0, ctx);
872
873 #if CONFIG_MACF
874 if (!error) {
875 mac_graft_notify_ungraft(ctx, nd.ni_vp);
876 }
877 #endif
878
879 ungraftout:
880 vnode_put(nd.ni_vp);
881 nameidone(&nd);
882
883 return error;
884 }
885
886
887 void
vfs_notify_mount(vnode_t pdvp)888 vfs_notify_mount(vnode_t pdvp)
889 {
890 vfs_event_signal(NULL, VQ_MOUNT, (intptr_t)NULL);
891 lock_vnode_and_post(pdvp, NOTE_WRITE);
892 }
893
894 /*
895 * __mac_mount:
896 * Mount a file system taking into account MAC label behavior.
897 * See mount(2) man page for more information
898 *
899 * Parameters: p Process requesting the mount
900 * uap User argument descriptor (see below)
901 * retval (ignored)
902 *
903 * Indirect: uap->type Filesystem type
904 * uap->path Path to mount
905 * uap->data Mount arguments
906 * uap->mac_p MAC info
907 * uap->flags Mount flags
908 *
909 *
910 * Returns: 0 Success
911 * !0 Not success
912 */
913 boolean_t root_fs_upgrade_try = FALSE;
914
915 #define MAX_NESTED_UNION_MOUNTS 10
916
917 int
__mac_mount(struct proc * p,register struct __mac_mount_args * uap,__unused int32_t * retval)918 __mac_mount(struct proc *p, register struct __mac_mount_args *uap, __unused int32_t *retval)
919 {
920 vnode_t pvp = NULLVP;
921 vnode_t vp = NULLVP;
922 int need_nameidone = 0;
923 vfs_context_t ctx = vfs_context_current();
924 char fstypename[MFSNAMELEN];
925 struct nameidata nd;
926 size_t dummy = 0;
927 char *labelstr = NULL;
928 size_t labelsz = 0;
929 int flags = uap->flags;
930 int error;
931 int num_retries = 0;
932 #if CONFIG_IMGSRC_ACCESS || CONFIG_MACF
933 boolean_t is_64bit = IS_64BIT_PROCESS(p);
934 #else
935 #pragma unused(p)
936 #endif
937 /*
938 * Get the fs type name from user space
939 */
940 error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy);
941 if (error) {
942 return error;
943 }
944
945 retry:
946 /*
947 * Get the vnode to be covered
948 */
949 NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
950 UIO_USERSPACE, uap->path, ctx);
951 if (flags & MNT_NOFOLLOW) {
952 nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
953 }
954 error = namei(&nd);
955 if (error) {
956 goto out;
957 }
958 need_nameidone = 1;
959 vp = nd.ni_vp;
960 pvp = nd.ni_dvp;
961
962 #ifdef CONFIG_IMGSRC_ACCESS
963 /* Mounting image source cannot be batched with other operations */
964 if (flags == MNT_IMGSRC_BY_INDEX) {
965 error = relocate_imageboot_source(pvp, vp, &nd.ni_cnd, fstypename,
966 ctx, is_64bit, uap->data, (flags == MNT_IMGSRC_BY_INDEX));
967 goto out;
968 }
969 #endif /* CONFIG_IMGSRC_ACCESS */
970
971 #if CONFIG_MACF
972 /*
973 * Get the label string (if any) from user space
974 */
975 if (uap->mac_p != USER_ADDR_NULL) {
976 struct user_mac mac;
977 size_t ulen = 0;
978
979 if (is_64bit) {
980 struct user64_mac mac64;
981 error = copyin(uap->mac_p, &mac64, sizeof(mac64));
982 mac.m_buflen = (user_size_t)mac64.m_buflen;
983 mac.m_string = (user_addr_t)mac64.m_string;
984 } else {
985 struct user32_mac mac32;
986 error = copyin(uap->mac_p, &mac32, sizeof(mac32));
987 mac.m_buflen = mac32.m_buflen;
988 mac.m_string = mac32.m_string;
989 }
990 if (error) {
991 goto out;
992 }
993 if ((mac.m_buflen > MAC_MAX_LABEL_BUF_LEN) ||
994 (mac.m_buflen < 2)) {
995 error = EINVAL;
996 goto out;
997 }
998 labelsz = mac.m_buflen;
999 labelstr = kalloc_data(labelsz, Z_WAITOK);
1000 error = copyinstr(mac.m_string, labelstr, mac.m_buflen, &ulen);
1001 if (error) {
1002 goto out;
1003 }
1004 AUDIT_ARG(mac_string, labelstr);
1005 }
1006 #endif /* CONFIG_MACF */
1007
1008 AUDIT_ARG(fflags, flags);
1009
1010 if (flags & MNT_UNION) {
1011 #if CONFIG_UNION_MOUNTS
1012 mount_t mp = vp->v_mount;
1013 int nested_union_mounts = 0;
1014
1015 name_cache_lock_shared();
1016
1017 /* Walk up the vnodecovered chain and check for nested union mounts. */
1018 mp = (mp->mnt_vnodecovered ? mp->mnt_vnodecovered->v_mount : NULL);
1019 while (mp) {
1020 if (!(mp->mnt_flag & MNT_UNION)) {
1021 break;
1022 }
1023 mp = (mp->mnt_vnodecovered ? mp->mnt_vnodecovered->v_mount : NULL);
1024
1025 /*
1026 * Limit the max nested unon mounts to prevent stack exhaustion
1027 * when calling lookup_traverse_union().
1028 */
1029 if (++nested_union_mounts >= MAX_NESTED_UNION_MOUNTS) {
1030 error = ELOOP;
1031 break;
1032 }
1033 }
1034
1035 name_cache_unlock();
1036 if (error) {
1037 goto out;
1038 }
1039 #else
1040 error = EPERM;
1041 goto out;
1042 #endif /* CONFIG_UNION_MOUNTS */
1043 }
1044
1045 if ((vp->v_flag & VROOT) &&
1046 (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
1047 #if CONFIG_UNION_MOUNTS
1048 if (!(flags & MNT_UNION)) {
1049 flags |= MNT_UPDATE;
1050 } else {
1051 /*
1052 * For a union mount on '/', treat it as fresh
1053 * mount instead of update.
1054 * Otherwise, union mouting on '/' used to panic the
1055 * system before, since mnt_vnodecovered was found to
1056 * be NULL for '/' which is required for unionlookup
1057 * after it gets ENOENT on union mount.
1058 */
1059 flags = (flags & ~(MNT_UPDATE));
1060 }
1061 #else
1062 flags |= MNT_UPDATE;
1063 #endif /* CONFIG_UNION_MOUNTS */
1064
1065 #if SECURE_KERNEL
1066 if ((flags & MNT_RDONLY) == 0) {
1067 /* Release kernels are not allowed to mount "/" as rw */
1068 error = EPERM;
1069 goto out;
1070 }
1071 #endif
1072
1073 /*
1074 * See 7392553 for more details on why this check exists.
1075 * Suffice to say: If this check is ON and something tries
1076 * to mount the rootFS RW, we'll turn off the codesign
1077 * bitmap optimization.
1078 */
1079 #if CHECK_CS_VALIDATION_BITMAP
1080 if ((flags & MNT_RDONLY) == 0) {
1081 root_fs_upgrade_try = TRUE;
1082 }
1083 #endif
1084 }
1085
1086 error = mount_common(fstypename, pvp, vp, &nd.ni_cnd, uap->data, flags, 0,
1087 labelstr, ctx);
1088
1089 out:
1090
1091 #if CONFIG_MACF
1092 kfree_data(labelstr, labelsz);
1093 #endif /* CONFIG_MACF */
1094
1095 if (vp) {
1096 vnode_put(vp);
1097 vp = NULLVP;
1098 }
1099 if (pvp) {
1100 vnode_put(pvp);
1101 pvp = NULLVP;
1102 }
1103 if (need_nameidone) {
1104 nameidone(&nd);
1105 need_nameidone = 0;
1106 }
1107
1108 if (error == EBUSY) {
1109 /* Retry the lookup and mount again due to concurrent mounts. */
1110 if (++num_retries < MAX_MOUNT_RETRIES) {
1111 goto retry;
1112 }
1113 }
1114
1115 return error;
1116 }
1117
1118 /*
1119 * common mount implementation (final stage of mounting)
1120 *
1121 * Arguments:
 * fstypename	file system type (i.e. its vfs name)
1123 * pvp parent of covered vnode
1124 * vp covered vnode
1125 * cnp component name (ie path) of covered vnode
1126 * flags generic mount flags
1127 * fsmountargs file system specific data
1128 * labelstr optional MAC label
1129 * kernelmount TRUE for mounts initiated from inside the kernel
1130 * ctx caller's context
1131 */
static int
mount_common(const char *fstypename, vnode_t pvp, vnode_t vp,
    struct componentname *cnp, user_addr_t fsmountargs, int flags, uint32_t internal_flags,
    char *labelstr, vfs_context_t ctx)
{
#if !CONFIG_MACF
#pragma unused(labelstr)
#endif
	struct vnode *devvp = NULLVP;        /* device vnode from the devpath lookup (holds an iocount) */
	struct vnode *device_vnode = NULLVP; /* device vnode handed to VFS_MOUNT / opened via VNOP_OPEN */
#if CONFIG_MACF
	struct vnode *rvp;
#endif
	struct mount *mp = NULL;
	struct vfstable *vfsp = (struct vfstable *)0;
	struct proc *p = vfs_context_proc(ctx);
	int error, flag = 0;
	bool flag_set = false;               /* 'flag' holds saved mnt_flag to restore on a failed update */
	user_addr_t devpath = USER_ADDR_NULL;
	int ronly = 0;
	int mntalloc = 0;                    /* nonzero once 'mp' was allocated here (fresh mount path) */
	boolean_t vfsp_ref = FALSE;          /* vfc_refcount was bumped; must be dropped on error */
	boolean_t is_rwlock_locked = FALSE;  /* mnt_rwlock is held exclusive */
	boolean_t did_rele = FALSE;          /* device usecount already dropped on the out4 path */
	boolean_t have_usecount = FALSE;     /* covered vp holds an extra usecount from vnode_ref() */
	boolean_t did_set_lmount = FALSE;    /* MNT_LMOUNT was set on mp; cleared before returning */
	boolean_t did_set_vmount = FALSE;    /* VMOUNT was set on covered vp by prepare_coveredvp() */
	boolean_t kernelmount = !!(internal_flags & KERNEL_MOUNT_KMOUNT);

#if CONFIG_ROSV_STARTUP || CONFIG_MOUNT_VM || CONFIG_BASESYSTEMROOT
	/* Check for mutually-exclusive flag bits */
	uint32_t checkflags = (internal_flags & (KERNEL_MOUNT_VOLBYROLE_MASK | KERNEL_MOUNT_BASESYSTEMROOT));
	int bitcount = 0;
	/* Kernighan popcount: each pass clears the lowest set bit. */
	while (checkflags != 0) {
		checkflags &= (checkflags - 1);
		bitcount++;
	}

	if (bitcount > 1) {
		//not allowed to request multiple mount-by-role flags
		error = EINVAL;
		goto out1;
	}
#endif

	/*
	 * Process an update for an existing mount
	 */
	if (flags & MNT_UPDATE) {
		/* Updates may only be applied through the root vnode of the mounted FS. */
		if ((vp->v_flag & VROOT) == 0) {
			error = EINVAL;
			goto out1;
		}
		mp = vp->v_mount;

		/* if unmount or mount in progress, return error */
		mount_lock_spin(mp);
		if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
			mount_unlock(mp);
			error = EBUSY;
			goto out1;
		}
		mp->mnt_lflag |= MNT_LMOUNT;
		did_set_lmount = TRUE;
		mount_unlock(mp);
		lck_rw_lock_exclusive(&mp->mnt_rwlock);
		is_rwlock_locked = TRUE;
		/*
		 * We only allow the filesystem to be reloaded if it
		 * is currently mounted read-only.
		 */
		if ((flags & MNT_RELOAD) &&
		    ((mp->mnt_flag & MNT_RDONLY) == 0)) {
			error = ENOTSUP;
			goto out1;
		}

		/*
		 * If content protection is enabled, update mounts are not
		 * allowed to turn it off.
		 */
		if ((mp->mnt_flag & MNT_CPROTECT) &&
		    ((flags & MNT_CPROTECT) == 0)) {
			error = EINVAL;
			goto out1;
		}

		/*
		 * can't turn off MNT_REMOVABLE either but it may be an unexpected
		 * failure to return an error for this so we'll just silently
		 * add it if it is not passed in.
		 */
		if ((mp->mnt_flag & MNT_REMOVABLE) &&
		    ((flags & MNT_REMOVABLE) == 0)) {
			flags |= MNT_REMOVABLE;
		}

		/* Can't downgrade the backer of the root FS */
		if ((mp->mnt_kern_flag & MNTK_BACKS_ROOT) &&
		    (!vfs_isrdonly(mp)) && (flags & MNT_RDONLY)) {
			error = ENOTSUP;
			goto out1;
		}

		/*
		 * Only root, or the user that did the original mount is
		 * permitted to update it.
		 */
		if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
		    (error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
			goto out1;
		}
#if CONFIG_MACF
		error = mac_mount_check_remount(ctx, mp, flags);
		if (error != 0) {
			goto out1;
		}
#endif
		/*
		 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV,
		 * and MNT_NOEXEC if mount point is already MNT_NOEXEC.
		 */
		if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
			flags |= MNT_NOSUID | MNT_NODEV;
			if (mp->mnt_flag & MNT_NOEXEC) {
				flags |= MNT_NOEXEC;
			}
		}
		/* Save the current per-mount flags so a failed update can restore them. */
		flag = mp->mnt_flag;
		flag_set = true;

		mp->mnt_flag |= flags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE);

		vfsp = mp->mnt_vtable;
		goto update;
	} // MNT_UPDATE

	/*
	 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV, and
	 * MNT_NOEXEC if mount point is already MNT_NOEXEC.
	 */
	if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
		flags |= MNT_NOSUID | MNT_NODEV;
		if (vp->v_mount->mnt_flag & MNT_NOEXEC) {
			flags |= MNT_NOEXEC;
		}
	}

	/* XXXAUDIT: Should we capture the type on the error path as well? */
	/* XXX cast-away const (audit_arg_text() does not modify its input) */
	AUDIT_ARG(text, (char *)(uintptr_t)fstypename);
	/* Look up the registered filesystem type and take a reference on it. */
	mount_list_lock();
	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
		if (!strncmp(vfsp->vfc_name, fstypename, MFSNAMELEN)) {
			vfsp->vfc_refcount++;
			vfsp_ref = TRUE;
			break;
		}
	}
	mount_list_unlock();
	if (vfsp == NULL) {
		error = ENODEV;
		goto out1;
	}

	/*
	 * VFC_VFSLOCALARGS is not currently supported for kernel mounts,
	 * except in ROSV configs and for the initial BaseSystem root.
	 */
	if (kernelmount && (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) &&
	    ((internal_flags & KERNEL_MOUNT_VOLBYROLE_MASK) == 0) &&
	    ((internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) == 0)) {
		error = EINVAL; /* unsupported request */
		goto out1;
	}

	error = prepare_coveredvp(vp, ctx, cnp, fstypename, internal_flags);
	if (error != 0) {
		goto out1;
	}

	/*
	 * Upon successful return of prepare_coveredvp(), VMOUNT is set for the
	 * covered vp.
	 */
	did_set_vmount = TRUE;

	/*
	 * Allocate and initialize the filesystem (mount_t)
	 */
	mp = zalloc_flags(mount_zone, Z_WAITOK | Z_ZERO);
	mntalloc = 1;

	/* Initialize the default IO constraints */
	mp->mnt_maxreadcnt = mp->mnt_maxwritecnt = MAXPHYS;
	mp->mnt_segreadcnt = mp->mnt_segwritecnt = 32;
	mp->mnt_maxsegreadsize = mp->mnt_maxreadcnt;
	mp->mnt_maxsegwritesize = mp->mnt_maxwritecnt;
	mp->mnt_devblocksize = DEV_BSIZE;
	mp->mnt_alignmentmask = PAGE_MASK;
	mp->mnt_ioqueue_depth = MNT_DEFAULT_IOQUEUE_DEPTH;
	mp->mnt_ioscale = 1;
	mp->mnt_ioflags = 0;
	mp->mnt_realrootvp = NULLVP;
	mp->mnt_authcache_ttl = CACHED_LOOKUP_RIGHT_TTL;

	mp->mnt_lflag |= MNT_LMOUNT;
	did_set_lmount = TRUE;

	TAILQ_INIT(&mp->mnt_vnodelist);
	TAILQ_INIT(&mp->mnt_workerqueue);
	TAILQ_INIT(&mp->mnt_newvnodes);
	mount_lock_init(mp);
	lck_rw_lock_exclusive(&mp->mnt_rwlock);
	is_rwlock_locked = TRUE;
	mp->mnt_op = vfsp->vfc_vfsops;
	mp->mnt_vtable = vfsp;
	//mp->mnt_stat.f_type = vfsp->vfc_typenum;
	mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
	strlcpy(mp->mnt_vfsstat.f_fstypename, vfsp->vfc_name, MFSTYPENAMELEN);
	do {
		size_t pathlen = MAXPATHLEN;

		/* Fall back to the lookup's pathname buffer if the path can't be derived. */
		if (vn_getpath_ext(vp, pvp, mp->mnt_vfsstat.f_mntonname, &pathlen, VN_GETPATH_FSENTER)) {
			strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);
		}
	} while (0);
	mp->mnt_vnodecovered = vp;
	mp->mnt_vfsstat.f_owner = kauth_cred_getuid(vfs_context_ucred(ctx));
	mp->mnt_throttle_mask = LOWPRI_MAX_NUM_DEV - 1;
	mp->mnt_devbsdunit = 0;
	mp->mnt_mount_id = os_atomic_inc_orig(&mount_unique_id, relaxed);

	/* XXX 3762912 hack to support HFS filesystem 'owner' - filesystem may update later */
	vfs_setowner(mp, KAUTH_UID_NONE, KAUTH_GID_NONE);

	if (kernelmount) {
		mp->mnt_kern_flag |= MNTK_KERNEL_MOUNT;
	}
	if ((internal_flags & KERNEL_MOUNT_PERMIT_UNMOUNT) != 0) {
		mp->mnt_kern_flag |= MNTK_PERMIT_UNMOUNT;
	}

	if (KERNEL_MOUNT_DEVFS & internal_flags) {
		// kernel mounted devfs
		mp->mnt_kern_flag |= MNTK_SYSTEM;
	}

	/* Fresh mounts and MNT_UPDATE share everything from here on. */
update:

	/*
	 * Set the mount level flags.
	 */
	if (flags & MNT_RDONLY) {
		mp->mnt_flag |= MNT_RDONLY;
	} else if (mp->mnt_flag & MNT_RDONLY) {
		// disallow read/write upgrades of file systems that
		// had the TYPENAME_OVERRIDE feature set.
		if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
			error = EPERM;
			goto out1;
		}
		mp->mnt_kern_flag |= MNTK_WANTRDWR;
	}
	mp->mnt_flag &= ~(MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
	    MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
	    MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
	    MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME | MNT_STRICTATIME |
	    MNT_QUARANTINE | MNT_CPROTECT);

#if SECURE_KERNEL
#if !CONFIG_MNT_SUID
	/*
	 * On release builds of iOS based platforms, always enforce NOSUID on
	 * all mounts. We do this here because we can catch update mounts as well as
	 * non-update mounts in this case.
	 */
	mp->mnt_flag |= (MNT_NOSUID);
#endif
#endif

	mp->mnt_flag |= flags & (MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
	    MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
	    MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
	    MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME | MNT_STRICTATIME |
	    MNT_QUARANTINE | MNT_CPROTECT);

#if CONFIG_MACF
	if (flags & MNT_MULTILABEL) {
		if (vfsp->vfc_vfsflags & VFC_VFSNOMACLABEL) {
			error = EINVAL;
			goto out1;
		}
		mp->mnt_flag |= MNT_MULTILABEL;
	}
#endif
	/*
	 * Process device path for local file systems if requested.
	 *
	 * Snapshot and mount-by-role mounts do not use this path; they are
	 * passing other opaque data in the device path field.
	 *
	 * Basesystemroot mounts pass a device path to be resolved here,
	 * but it's just a char * already inside the kernel, which
	 * kernel_mount() shoved into a user_addr_t to call us. So for such
	 * mounts we must skip copyin (both of the address and of the string
	 * (in NDINIT).
	 */
	if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS &&
	    !(internal_flags & (KERNEL_MOUNT_SNAPSHOT | KERNEL_MOUNT_VOLBYROLE_MASK))) {
		boolean_t do_copyin_devpath = true;
#if CONFIG_BASESYSTEMROOT
		if (internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) {
			// KERNEL_MOUNT_BASESYSTEMROOT implies subtle behavior worth noting:
			// We have been passed fsmountargs, which is typed as a user_addr_t,
			// but is actually a char ** pointing to a (kernelspace) string.
			// We manually unpack it with a series of casts and dereferences
			// that reverses what was done just above us on the stack in
			// imageboot_pivot_image().
			// After retrieving the path to the dev node (which we will NDINIT
			// in a moment), we pass NULL fsmountargs on to the filesystem.
			_Static_assert(sizeof(char **) == sizeof(fsmountargs), "fsmountargs should fit a (kernel) address");
			char **devnamepp = (char **)fsmountargs;
			char *devnamep = *devnamepp;
			devpath = CAST_USER_ADDR_T(devnamep);
			do_copyin_devpath = false;
			fsmountargs = USER_ADDR_NULL;

			//Now that we have a mp, denote that this mount is for the basesystem.
			mp->mnt_supl_kern_flag |= MNTK_SUPL_BASESYSTEM;
		}
#endif // CONFIG_BASESYSTEMROOT

		if (do_copyin_devpath) {
			if (vfs_context_is64bit(ctx)) {
				if ((error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath)))) {
					goto out1;
				}
				fsmountargs += sizeof(devpath);
			} else {
				user32_addr_t tmp;
				if ((error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp)))) {
					goto out1;
				}
				/* munge into LP64 addr */
				devpath = CAST_USER_ADDR_T(tmp);
				fsmountargs += sizeof(tmp);
			}
		}

		/* Lookup device and authorize access to it */
		if ((devpath)) {
			struct nameidata nd;

			enum uio_seg seg = UIO_USERSPACE;
#if CONFIG_BASESYSTEMROOT
			if (internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) {
				/* devpath points at a kernel string here, not a user address. */
				seg = UIO_SYSSPACE;
			}
#endif // CONFIG_BASESYSTEMROOT

			NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW, seg, devpath, ctx);
			if (flags & MNT_NOFOLLOW) {
				nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
			}
			if ((error = namei(&nd))) {
				goto out1;
			}

			devvp = nd.ni_vp;

			if (devvp->v_type != VBLK) {
				error = ENOTBLK;
				nameidone(&nd);
				goto out2;
			}
			if (major(devvp->v_rdev) >= nblkdev) {
				error = ENXIO;
				nameidone(&nd);
				goto out2;
			}
			/*
			 * If mount by non-root, then verify that user has necessary
			 * permissions on the device.
			 */
			if (suser(vfs_context_ucred(ctx), NULL) != 0) {
				kauth_action_t accessmode = KAUTH_VNODE_READ_DATA;

				if ((mp->mnt_flag & MNT_RDONLY) == 0) {
					accessmode |= KAUTH_VNODE_WRITE_DATA;
				}
				if ((error = vnode_authorize(devvp, NULL, accessmode, ctx)) != 0) {
					nameidone(&nd);
					goto out2;
				}
			}

			strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);
			nameidone(&nd);
		}
		/* On first mount, preflight and open device */
		if (devpath && ((flags & MNT_UPDATE) == 0)) {
			if ((error = vnode_ref(devvp))) {
				goto out2;
			}
			/*
			 * Disallow multiple mounts of the same device.
			 * Disallow mounting of a device that is currently in use
			 * (except for root, which might share swap device for miniroot).
			 * Flush out any old buffers remaining from a previous use.
			 */
			if ((error = vfs_setmounting(devvp))) {
				vnode_rele(devvp);
				goto out2;
			}

			if (vcount(devvp) > 1 && !(vfs_flags(mp) & MNT_ROOTFS)) {
				error = EBUSY;
				goto out3;
			}
			/* NOTE(review): a failed device fsync is reported as ENOTBLK here. */
			if ((error = VNOP_FSYNC(devvp, MNT_WAIT, ctx))) {
				error = ENOTBLK;
				goto out3;
			}
			if ((error = buf_invalidateblks(devvp, BUF_WRITE_DATA, 0, 0))) {
				goto out3;
			}

			ronly = (mp->mnt_flag & MNT_RDONLY) != 0;
#if CONFIG_MACF
			error = mac_vnode_check_open(ctx,
			    devvp,
			    ronly ? FREAD : FREAD | FWRITE);
			if (error) {
				goto out3;
			}
#endif /* MAC */
			if ((error = VNOP_OPEN(devvp, ronly ? FREAD : FREAD | FWRITE, ctx))) {
				goto out3;
			}

			mp->mnt_devvp = devvp;
			device_vnode = devvp;
		} else if ((mp->mnt_flag & MNT_RDONLY) &&
		    (mp->mnt_kern_flag & MNTK_WANTRDWR) &&
		    (device_vnode = mp->mnt_devvp)) {
			dev_t dev;
			int maj;
			/*
			 * If upgrade to read-write by non-root, then verify
			 * that user has necessary permissions on the device.
			 */
			vnode_getalways(device_vnode);

			if (suser(vfs_context_ucred(ctx), NULL) &&
			    (error = vnode_authorize(device_vnode, NULL,
			    KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA,
			    ctx)) != 0) {
				vnode_put(device_vnode);
				goto out2;
			}

			/* Tell the device that we're upgrading */
			dev = (dev_t)device_vnode->v_rdev;
			maj = major(dev);

			if ((u_int)maj >= (u_int)nblkdev) {
				panic("Volume mounted on a device with invalid major number.");
			}

			error = bdevsw[maj].d_open(dev, FREAD | FWRITE, S_IFBLK, p);
			vnode_put(device_vnode);
			device_vnode = NULLVP;
			if (error != 0) {
				goto out2;
			}
		}
	} // localargs && !(snapshot | data | vm)

#if CONFIG_MACF
	if ((flags & MNT_UPDATE) == 0) {
		mac_mount_label_init(mp);
		mac_mount_label_associate(ctx, mp);
	}
	if (labelstr) {
		if ((flags & MNT_UPDATE) != 0) {
			error = mac_mount_check_label_update(ctx, mp);
			if (error != 0) {
				goto out3;
			}
		}
	}
#endif
	/*
	 * Mount the filesystem. We already asserted that internal_flags
	 * cannot have more than one mount-by-role bit set.
	 */
	if (internal_flags & KERNEL_MOUNT_SNAPSHOT) {
		error = VFS_IOCTL(mp, VFSIOC_MOUNT_SNAPSHOT,
		    (caddr_t)fsmountargs, 0, ctx);
	} else if (internal_flags & KERNEL_MOUNT_DATAVOL) {
#if CONFIG_ROSV_STARTUP
		/* For mount-by-role, fsmountargs carries a mount_t, not a user address. */
		struct mount *origin_mp = (struct mount*)fsmountargs;
		fs_role_mount_args_t frma = {origin_mp, VFS_DATA_ROLE};
		error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
		if (error) {
			printf("MOUNT-BY-ROLE (%d) failed! (%d)", VFS_DATA_ROLE, error);
		} else {
			/* Mark volume associated with system volume */
			mp->mnt_kern_flag |= MNTK_SYSTEM;

			/* Attempt to acquire the mnt_devvp and set it up */
			struct vnode *mp_devvp = NULL;
			if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
				errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
				    0, &mp_devvp, vfs_context_kernel());
				if (!lerr) {
					mp->mnt_devvp = mp_devvp;
					//vnode_lookup took an iocount, need to drop it.
					vnode_put(mp_devvp);
					// now set `device_vnode` to the devvp that was acquired.
					// this is needed in order to ensure vfs_init_io_attributes is invoked.
					// note that though the iocount above was dropped, the mount acquires
					// an implicit reference against the device.
					device_vnode = mp_devvp;
				}
			}
		}
#else
		error = EINVAL;
#endif
	} else if (internal_flags & KERNEL_MOUNT_VMVOL) {
#if CONFIG_MOUNT_VM
		struct mount *origin_mp = (struct mount*)fsmountargs;
		fs_role_mount_args_t frma = {origin_mp, VFS_VM_ROLE};
		error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
		if (error) {
			printf("MOUNT-BY-ROLE (%d) failed! (%d)", VFS_VM_ROLE, error);
		} else {
			/* Mark volume associated with system volume and a swap mount */
			mp->mnt_kern_flag |= (MNTK_SYSTEM | MNTK_SWAP_MOUNT);
			/* Attempt to acquire the mnt_devvp and set it up */
			struct vnode *mp_devvp = NULL;
			if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
				errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
				    0, &mp_devvp, vfs_context_kernel());
				if (!lerr) {
					mp->mnt_devvp = mp_devvp;
					//vnode_lookup took an iocount, need to drop it.
					vnode_put(mp_devvp);

					// now set `device_vnode` to the devvp that was acquired.
					// note that though the iocount above was dropped, the mount acquires
					// an implicit reference against the device.
					device_vnode = mp_devvp;
				}
			}
		}
#else
		error = EINVAL;
#endif
	} else if ((internal_flags & KERNEL_MOUNT_PREBOOTVOL) || (internal_flags & KERNEL_MOUNT_RECOVERYVOL)) {
#if CONFIG_MOUNT_PREBOOTRECOVERY
		struct mount *origin_mp = (struct mount*)fsmountargs;
		uint32_t mount_role = 0;
		if (internal_flags & KERNEL_MOUNT_PREBOOTVOL) {
			mount_role = VFS_PREBOOT_ROLE;
		} else if (internal_flags & KERNEL_MOUNT_RECOVERYVOL) {
			mount_role = VFS_RECOVERY_ROLE;
		}

		if (mount_role != 0) {
			fs_role_mount_args_t frma = {origin_mp, mount_role};
			error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
			if (error) {
				printf("MOUNT-BY-ROLE (%d) failed! (%d)", mount_role, error);
			} else {
				// NOT YET - need to qualify how this interacts with shutdown, ERP/ERB, etc
				/* Mark volume associated with system volume */
				//mp->mnt_kern_flag |= MNTK_SYSTEM;
				/* Attempt to acquire the mnt_devvp and set it up */
				struct vnode *mp_devvp = NULL;
				if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
					errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
					    0, &mp_devvp, vfs_context_kernel());
					if (!lerr) {
						mp->mnt_devvp = mp_devvp;
						//vnode_lookup took an iocount, need to drop it.
						vnode_put(mp_devvp);

						// now set `device_vnode` to the devvp that was acquired.
						// note that though the iocount above was dropped, the mount acquires
						// an implicit reference against the device.
						device_vnode = mp_devvp;
					}
				}
			}
		} else {
			printf("MOUNT-BY-ROLE (%d) failed - ROLE UNRECOGNIZED! (%d)", mount_role, error);
			error = EINVAL;
		}
#else
		error = EINVAL;
#endif
	} else {
		/* Ordinary mount: hand the (possibly opened) device and args to the FS. */
		error = VFS_MOUNT(mp, device_vnode, fsmountargs, ctx);
	}

	if (flags & MNT_UPDATE) {
		if (mp->mnt_kern_flag & MNTK_WANTRDWR) {
			mp->mnt_flag &= ~MNT_RDONLY;
		}
		/* Update-only flags must not persist past this call. */
		mp->mnt_flag &= ~
		    (MNT_UPDATE | MNT_RELOAD | MNT_FORCE);
		mp->mnt_kern_flag &= ~MNTK_WANTRDWR;
		if (error) {
			mp->mnt_flag = flag; /* restore flag value */
		}
		vfs_event_signal(NULL, VQ_UPDATE, (intptr_t)NULL);
		lck_rw_done(&mp->mnt_rwlock);
		is_rwlock_locked = FALSE;
		if (!error) {
			enablequotas(mp, ctx);
		}
		goto exit;
	}

	/*
	 * Put the new filesystem on the mount list after root.
	 */
	if (error == 0) {
		struct vfs_attr vfsattr;
		if (device_vnode) {
			/*
			 * cache the IO attributes for the underlying physical media...
			 * an error return indicates the underlying driver doesn't
			 * support all the queries necessary... however, reasonable
			 * defaults will have been set, so no reason to bail or care
			 *
			 * Need to do this before calling the MAC hook as it needs
			 * information from this call.
			 */
			vfs_init_io_attributes(device_vnode, mp);
		}

#if CONFIG_MACF
		error = mac_mount_check_mount_late(ctx, mp);
		if (error != 0) {
			goto out4;
		}

		if (vfs_flags(mp) & MNT_MULTILABEL) {
			error = VFS_ROOT(mp, &rvp, ctx);
			if (error) {
				printf("%s() VFS_ROOT returned %d\n", __func__, error);
				goto out4;
			}
			error = vnode_label(mp, NULL, rvp, NULL, 0, ctx);
			/*
			 * drop reference provided by VFS_ROOT
			 */
			vnode_put(rvp);

			if (error) {
				goto out4;
			}
		}
#endif /* MAC */

		/* Plant the mount on the covered vnode; this makes it discoverable. */
		vnode_lock_spin(vp);
		CLR(vp->v_flag, VMOUNT);
		vp->v_mountedhere = mp;
		SET(vp->v_flag, VMOUNTEDHERE);

		/*
		 * Wakeup any waiter(s) in prepare_coveredvp() that is waiting for the
		 * 'v_mountedhere' to be planted.
		 */
		wakeup(&vp->v_flag);
		vnode_unlock(vp);

		/*
		 * taking the name_cache_lock exclusively will
		 * insure that everyone is out of the fast path who
		 * might be trying to use a now stale copy of
		 * vp->v_mountedhere->mnt_realrootvp
		 * bumping mount_generation causes the cached values
		 * to be invalidated
		 */
		name_cache_lock();
		mount_generation++;
		name_cache_unlock();

		error = vnode_ref(vp);
		if (error != 0) {
			goto out4;
		}

		have_usecount = TRUE;

		error = checkdirs(vp, ctx);
		if (error != 0) {
			/* Unmount the filesystem as cdir/rdirs cannot be updated */
			goto out4;
		}
		/*
		 * there is no cleanup code here so I have made it void
		 * we need to revisit this
		 */
		(void)VFS_START(mp, 0, ctx);

		if (mount_list_add(mp) != 0) {
			/*
			 * The system is shutting down trying to umount
			 * everything, so fail with a plausible errno.
			 */
			error = EBUSY;
			goto out4;
		}
		lck_rw_done(&mp->mnt_rwlock);
		is_rwlock_locked = FALSE;

		/* Check if this mounted file system supports EAs or named streams. */
		/* Skip WebDAV file systems for now since they hang in VFS_GETATTR here. */
		VFSATTR_INIT(&vfsattr);
		VFSATTR_WANTED(&vfsattr, f_capabilities);
		if (strncmp(mp->mnt_vfsstat.f_fstypename, "webdav", sizeof("webdav")) != 0 &&
		    vfs_getattr(mp, &vfsattr, ctx) == 0 &&
		    VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR) &&
			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR)) {
				mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
			}
#if NAMEDSTREAMS
			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS) &&
			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS)) {
				mp->mnt_kern_flag |= MNTK_NAMED_STREAMS;
			}
#endif
			/* Check if this file system supports path from id lookups. */
			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID) &&
			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID)) {
				mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
			} else if (mp->mnt_flag & MNT_DOVOLFS) {
				/* Legacy MNT_DOVOLFS flag also implies path from id lookups. */
				mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
			}

			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS) &&
			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS)) {
				mp->mnt_kern_flag |= MNTK_DIR_HARDLINKS;
			}
		}
		if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSNATIVEXATTR) {
			mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
		}
		if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSPREFLIGHT) {
			mp->mnt_kern_flag |= MNTK_UNMOUNT_PREFLIGHT;
		}
		/* Get subtype if supported to cache it */
		VFSATTR_INIT(&vfsattr);
		VFSATTR_WANTED(&vfsattr, f_fssubtype);
		if (vfs_getattr(mp, &vfsattr, ctx) == 0 && VFSATTR_IS_SUPPORTED(&vfsattr, f_fssubtype)) {
			mp->mnt_vfsstat.f_fssubtype = vfsattr.f_fssubtype;
		}

		/* increment the operations count */
		OSAddAtomic(1, &vfs_nummntops);
		enablequotas(mp, ctx);

		if (device_vnode) {
			vfs_setmountedon(device_vnode);
		}

		/* Now that mount is setup, notify the listeners */
		vfs_notify_mount(pvp);
		IOBSDMountChange(mp, kIOMountChangeMount);
#if CONFIG_MACF
		mac_mount_notify_mount(ctx, mp);
#endif /* CONFIG_MACF */
	} else {
		/* If we fail a fresh mount, there should be no vnodes left hooked into the mountpoint. */
		if (mp->mnt_vnodelist.tqh_first != NULL) {
			panic("mount_common(): mount of %s filesystem failed with %d, but vnode list is not empty.",
			    mp->mnt_vtable->vfc_name, error);
		}

		vnode_lock_spin(vp);
		CLR(vp->v_flag, VMOUNT);
		/* Wakeup waiter(s) waiting for in-progress mount to finish. */
		wakeup(&vp->v_flag);
		vnode_unlock(vp);
		mount_list_lock();
		mp->mnt_vtable->vfc_refcount--;
		mount_list_unlock();

		if (device_vnode) {
			vnode_rele(device_vnode);
			VNOP_CLOSE(device_vnode, ronly ? FREAD : FREAD | FWRITE, ctx);
			vfs_clearmounting(device_vnode);
		}
		lck_rw_done(&mp->mnt_rwlock);
		is_rwlock_locked = FALSE;

		if (nc_smr_enabled) {
			vfs_smr_synchronize();
		}

		/*
		 * if we get here, we have a mount structure that needs to be freed,
		 * but since the coveredvp hasn't yet been updated to point at it,
		 * no need to worry about other threads holding a crossref on this mp
		 * so it's ok to just free it
		 */
		mount_lock_destroy(mp);
#if CONFIG_MACF
		mac_mount_label_destroy(mp);
#endif
		zfree(mount_zone, mp);
		/* mp is gone; suppress the MNT_LMOUNT clear below. */
		did_set_lmount = false;
	}
exit:
	/*
	 * drop I/O count on the device vp if there was one
	 */
	if (devpath && devvp) {
		vnode_put(devvp);
	}

	if (did_set_lmount) {
		mount_lock_spin(mp);
		mp->mnt_lflag &= ~MNT_LMOUNT;
		mount_unlock(mp);
	}

	return error;

	/* Error condition exits */
out4:
	/* Back out the filesystem's own mount work with a forced unmount. */
	(void)VFS_UNMOUNT(mp, MNT_FORCE, ctx);

	/*
	 * If the mount has been placed on the covered vp,
	 * it may have been discovered by now, so we have
	 * to treat this just like an unmount
	 */
	mount_lock_spin(mp);
	mp->mnt_lflag |= MNT_LDEAD;
	mount_unlock(mp);

	if (device_vnode != NULLVP) {
		vnode_rele(device_vnode);
		VNOP_CLOSE(device_vnode, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD | FWRITE,
		    ctx);
		vfs_clearmounting(device_vnode);
		did_rele = TRUE;
	}

	/* Unplant the mount from the covered vnode. */
	vnode_lock_spin(vp);

	mp->mnt_crossref++;
	CLR(vp->v_flag, VMOUNTEDHERE);
	vp->v_mountedhere = (mount_t) 0;

	vnode_unlock(vp);

	if (have_usecount) {
		vnode_rele(vp);
	}
out3:
	/* Undo the first-mount vnode_ref()/vfs_setmounting() unless out4 already did. */
	if (devpath && ((flags & MNT_UPDATE) == 0) && (!did_rele)) {
		vnode_rele(devvp);
		vfs_clearmounting(devvp);
	}
out2:
	/* Drop the iocount taken by the devpath namei() lookup. */
	if (devpath && devvp) {
		vnode_put(devvp);
	}
out1:
	/* Release mnt_rwlock only when it was taken */
	if (is_rwlock_locked == TRUE) {
		if (flag_set) {
			mp->mnt_flag = flag; /* restore mnt_flag value */
		}
		lck_rw_done(&mp->mnt_rwlock);
	}

	if (did_set_lmount) {
		mount_lock_spin(mp);
		mp->mnt_lflag &= ~MNT_LMOUNT;
		mount_unlock(mp);
	}

	if (did_set_vmount) {
		vnode_lock_spin(vp);
		CLR(vp->v_flag, VMOUNT);
		/* Wakeup waiter(s) waiting for in-progress mount to finish. */
		wakeup(&vp->v_flag);
		vnode_unlock(vp);
	}

	if (mntalloc) {
		if (mp->mnt_crossref) {
			/* Other threads may hold a crossref; let the last one free mp. */
			mount_dropcrossref(mp, vp, 0);
		} else {
			if (nc_smr_enabled) {
				vfs_smr_synchronize();
			}

			mount_lock_destroy(mp);
#if CONFIG_MACF
			mac_mount_label_destroy(mp);
#endif
			zfree(mount_zone, mp);
		}
	}
	if (vfsp_ref) {
		mount_list_lock();
		vfsp->vfc_refcount--;
		mount_list_unlock();
	}

	return error;
}
2057
2058 /*
2059 * Flush in-core data, check for competing mount attempts,
2060 * and set VMOUNT
2061 */
/*
 * Validate and claim 'vp' as the covered vnode for an upcoming mount:
 * authorize ownership (unless KERNEL_MOUNT_NOAUTH), flush its cached
 * data, verify it is a directory, and set VMOUNT to mark a mount in
 * progress.  On success, VMOUNT is set on vp and the caller is
 * responsible for clearing it (and waking waiters on &vp->v_flag) if
 * the mount is later abandoned.
 */
int
prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, uint32_t internal_flags)
{
#if !CONFIG_MACF
#pragma unused(cnp,fsname)
#endif
	struct vnode_attr va;
	int error;
	/* Flags describing who initiated the mount and how to treat races. */
	boolean_t skip_auth = !!(internal_flags & KERNEL_MOUNT_NOAUTH);
	boolean_t is_fmount = !!(internal_flags & KERNEL_MOUNT_FMOUNT);
	boolean_t is_kmount = !!(internal_flags & KERNEL_MOUNT_KMOUNT);

	if (!skip_auth) {
		/*
		 * If the user is not root, ensure that they own the directory
		 * onto which we are attempting to mount.
		 */
		VATTR_INIT(&va);
		VATTR_WANTED(&va, va_uid);
		if ((error = vnode_getattr(vp, &va, ctx)) ||
		    (va.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
		    (!vfs_context_issuser(ctx)))) {
			error = EPERM;
			goto out;
		}
	}

	/* Push dirty data and drop cached buffers for the covered vnode. */
	if ((error = VNOP_FSYNC(vp, MNT_WAIT, ctx))) {
		goto out;
	}

	if ((error = buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0))) {
		goto out;
	}

	/* Mounts may only cover directories. */
	if (vp->v_type != VDIR) {
		error = ENOTDIR;
		goto out;
	}

	vnode_lock_spin(vp);

	/* fmount(): any competing mount, in progress or completed, is fatal. */
	if (is_fmount && (ISSET(vp->v_flag, VMOUNT) || (vp->v_mountedhere != NULL))) {
		error = EBUSY;
	} else if (!is_kmount && (ISSET(vp->v_flag, VMOUNT) ||
	    (vp->v_mountedhere != NULL))) {
		/*
		 * For mount triggered from mount() call, we want to wait for the
		 * current in-progress mount to complete, redo lookup and retry the
		 * mount again. Similarly, we also want to retry if we lost the race
		 * due to concurrent mounts and the 'VMOUNT' flag has been cleared and
		 * 'v_mountedhere' has been planted after initial lookup.
		 */
		if (ISSET(vp->v_flag, VMOUNT)) {
			vnode_lock_convert(vp);
			msleep(&vp->v_flag, &vp->v_lock, PVFS, "vnode_waitformount", NULL);
		}
		error = EBUSY;
	} else if (ISSET(vp->v_flag, VMOUNT) && (vp->v_mountedhere != NULL)) {
		/*
		 * Remaining case (kernel-initiated mounts): only fail when
		 * both a mount is in progress AND one is already planted.
		 */
		error = EBUSY;
	}

	if (error) {
		vnode_unlock(vp);
		goto out;
	}
	/* Claim the vnode: a mount is now officially in progress here. */
	SET(vp->v_flag, VMOUNT);
	vnode_unlock(vp);

#if CONFIG_MACF
	error = mac_mount_check_mount(ctx, vp,
	    cnp, fsname);
	if (error != 0) {
		/* MAC veto: release the claim taken above and wake waiters. */
		vnode_lock_spin(vp);
		CLR(vp->v_flag, VMOUNT);
		/* Wakeup waiter(s) waiting for in-progress mount to finish. */
		wakeup(&vp->v_flag);
		vnode_unlock(vp);
	}
#endif

out:
	return error;
}
2146
2147 #if CONFIG_IMGSRC_ACCESS
2148
2149 #define DEBUG_IMGSRC 0
2150
2151 #if DEBUG_IMGSRC
2152 #define IMGSRC_DEBUG(args...) printf("imgsrc: " args)
2153 #else
2154 #define IMGSRC_DEBUG(args...) do { } while(0)
2155 #endif
2156
2157 static int
authorize_devpath_and_update_mntfromname(mount_t mp,user_addr_t devpath,vnode_t * devvpp,vfs_context_t ctx)2158 authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx)
2159 {
2160 struct nameidata nd;
2161 vnode_t vp, realdevvp;
2162 kauth_action_t accessmode;
2163 int error;
2164 enum uio_seg uio = UIO_USERSPACE;
2165
2166 if (ctx == vfs_context_kernel()) {
2167 uio = UIO_SYSSPACE;
2168 }
2169
2170 NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW, uio, devpath, ctx);
2171 if ((error = namei(&nd))) {
2172 IMGSRC_DEBUG("namei() failed with %d\n", error);
2173 return error;
2174 }
2175
2176 vp = nd.ni_vp;
2177
2178 if (!vnode_isblk(vp)) {
2179 IMGSRC_DEBUG("Not block device.\n");
2180 error = ENOTBLK;
2181 goto out;
2182 }
2183
2184 realdevvp = mp->mnt_devvp;
2185 if (realdevvp == NULLVP) {
2186 IMGSRC_DEBUG("No device backs the mount.\n");
2187 error = ENXIO;
2188 goto out;
2189 }
2190
2191 error = vnode_getwithref(realdevvp);
2192 if (error != 0) {
2193 IMGSRC_DEBUG("Coudn't get iocount on device.\n");
2194 goto out;
2195 }
2196
2197 if (vnode_specrdev(vp) != vnode_specrdev(realdevvp)) {
2198 IMGSRC_DEBUG("Wrong dev_t.\n");
2199 error = ENXIO;
2200 goto out1;
2201 }
2202
2203 strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);
2204
2205 /*
2206 * If mount by non-root, then verify that user has necessary
2207 * permissions on the device.
2208 */
2209 if (!vfs_context_issuser(ctx)) {
2210 accessmode = KAUTH_VNODE_READ_DATA;
2211 if ((mp->mnt_flag & MNT_RDONLY) == 0) {
2212 accessmode |= KAUTH_VNODE_WRITE_DATA;
2213 }
2214 if ((error = vnode_authorize(vp, NULL, accessmode, ctx)) != 0) {
2215 IMGSRC_DEBUG("Access denied.\n");
2216 goto out1;
2217 }
2218 }
2219
2220 *devvpp = vp;
2221
2222 out1:
2223 vnode_put(realdevvp);
2224
2225 out:
2226 nameidone(&nd);
2227
2228 if (error) {
2229 vnode_put(vp);
2230 }
2231
2232 return error;
2233 }
2234
2235 /*
2236 * Clear VMOUNT, set v_mountedhere, and mnt_vnodecovered, ref the vnode,
2237 * and call checkdirs()
2238 */
/*
 * Attach 'mp' to covered vnode 'vp': clear VMOUNT, plant v_mountedhere,
 * invalidate the name cache's cached mount state, take a usecount on vp,
 * and repoint any process cwd/root directories via checkdirs().
 *
 * On error, mnt_vnodecovered is reset here, but v_mountedhere and
 * VMOUNTEDHERE remain set on vp — NOTE(review): callers appear to rely
 * on later cleanup for that; confirm against the error paths.
 */
static int
place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx)
{
	int error;

	mp->mnt_vnodecovered = vp; /* XXX This is normally only set at init-time ... */

	IMGSRC_DEBUG("placing: fsname = %s, vp = %s\n",
	    mp->mnt_vtable->vfc_name, vnode_getname(vp));

	/* Publish the mount on the covered vnode and release waiters. */
	vnode_lock_spin(vp);
	CLR(vp->v_flag, VMOUNT);
	vp->v_mountedhere = mp;
	SET(vp->v_flag, VMOUNTEDHERE);
	/* Wakeup waiter(s) waiting for in-progress mount to finish. */
	wakeup(&vp->v_flag);
	vnode_unlock(vp);

	/*
	 * taking the name_cache_lock exclusively will
	 * insure that everyone is out of the fast path who
	 * might be trying to use a now stale copy of
	 * vp->v_mountedhere->mnt_realrootvp
	 * bumping mount_generation causes the cached values
	 * to be invalidated
	 */
	name_cache_lock();
	mount_generation++;
	name_cache_unlock();

	/* Hold a usecount on the covered vnode for the life of the mount. */
	error = vnode_ref(vp);
	if (error != 0) {
		goto out;
	}

	error = checkdirs(vp, ctx);
	if (error != 0) {
		/* Unmount the filesystem as cdir/rdirs cannot be updated */
		vnode_rele(vp);
		goto out;
	}

out:
	if (error != 0) {
		mp->mnt_vnodecovered = NULLVP;
	}
	return error;
}
2287
2288 static void
undo_place_on_covered_vp(mount_t mp,vnode_t vp)2289 undo_place_on_covered_vp(mount_t mp, vnode_t vp)
2290 {
2291 vnode_rele(vp);
2292 vnode_lock_spin(vp);
2293 CLR(vp->v_flag, (VMOUNT | VMOUNTEDHERE));
2294 vp->v_mountedhere = (mount_t)NULL;
2295 /* Wakeup waiter(s) waiting for in-progress mount to finish. */
2296 wakeup(&vp->v_flag);
2297 vnode_unlock(vp);
2298
2299 mp->mnt_vnodecovered = NULLVP;
2300 }
2301
2302 static int
mount_begin_update(mount_t mp,vfs_context_t ctx,int flags)2303 mount_begin_update(mount_t mp, vfs_context_t ctx, int flags)
2304 {
2305 int error;
2306
2307 /* unmount in progress return error */
2308 mount_lock_spin(mp);
2309 if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
2310 mount_unlock(mp);
2311 return EBUSY;
2312 }
2313 mount_unlock(mp);
2314 lck_rw_lock_exclusive(&mp->mnt_rwlock);
2315
2316 /*
2317 * We only allow the filesystem to be reloaded if it
2318 * is currently mounted read-only.
2319 */
2320 if ((flags & MNT_RELOAD) &&
2321 ((mp->mnt_flag & MNT_RDONLY) == 0)) {
2322 error = ENOTSUP;
2323 goto out;
2324 }
2325
2326 /*
2327 * Only root, or the user that did the original mount is
2328 * permitted to update it.
2329 */
2330 if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
2331 (!vfs_context_issuser(ctx))) {
2332 error = EPERM;
2333 goto out;
2334 }
2335 #if CONFIG_MACF
2336 error = mac_mount_check_remount(ctx, mp, flags);
2337 if (error != 0) {
2338 goto out;
2339 }
2340 #endif
2341
2342 out:
2343 if (error) {
2344 lck_rw_done(&mp->mnt_rwlock);
2345 }
2346
2347 return error;
2348 }
2349
/*
 * End a mount-update transaction begun by mount_begin_update() by
 * releasing the exclusive mnt_rwlock it acquired.
 */
static void
mount_end_update(mount_t mp)
{
	lck_rw_done(&mp->mnt_rwlock);
}
2355
2356 static int
get_imgsrc_rootvnode(uint32_t height,vnode_t * rvpp)2357 get_imgsrc_rootvnode(uint32_t height, vnode_t *rvpp)
2358 {
2359 vnode_t vp;
2360
2361 if (height >= MAX_IMAGEBOOT_NESTING) {
2362 return EINVAL;
2363 }
2364
2365 vp = imgsrc_rootvnodes[height];
2366 if ((vp != NULLVP) && (vnode_get(vp) == 0)) {
2367 *rvpp = vp;
2368 return 0;
2369 } else {
2370 return ENOENT;
2371 }
2372 }
2373
/*
 * Relocate the imageboot source filesystem (the mount that was created
 * at boot from a disk image) so that it is covered by 'vp' instead,
 * updating mnt_vnodecovered/f_mntonname and adding it to the mount
 * list.  A given imageboot mount can only be moved once (MNTK_HAS_MOVED).
 *
 * 'by_index' selects the newer argument format carrying an explicit
 * nesting height; otherwise a bare device-path pointer is assumed
 * (one level of nesting, for binary compatibility).
 * Root-only operation.
 */
static int
relocate_imageboot_source(vnode_t pvp, vnode_t vp,
    struct componentname *cnp, const char *fsname, vfs_context_t ctx,
    boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index)
{
	int error;
	mount_t mp;
	boolean_t placed = FALSE;
	struct vfstable *vfsp;
	user_addr_t devpath;
	char *old_mntonname;
	vnode_t rvp;
	vnode_t devvp;
	uint32_t height;
	uint32_t flags;

	/* If we didn't imageboot, nothing to move */
	if (imgsrc_rootvnodes[0] == NULLVP) {
		return EINVAL;
	}

	/* Only root can do this */
	if (!vfs_context_issuser(ctx)) {
		return EPERM;
	}

	IMGSRC_DEBUG("looking for root vnode.\n");

	/*
	 * Get root vnode of filesystem we're moving.
	 */
	if (by_index) {
		/* Newer format: explicit height/flags/devpath struct. */
		if (is64bit) {
			struct user64_mnt_imgsrc_args mia64;
			error = copyin(fsmountargs, &mia64, sizeof(mia64));
			if (error != 0) {
				IMGSRC_DEBUG("Failed to copy in arguments.\n");
				return error;
			}

			height = mia64.mi_height;
			flags = mia64.mi_flags;
			devpath = (user_addr_t)mia64.mi_devpath;
		} else {
			struct user32_mnt_imgsrc_args mia32;
			error = copyin(fsmountargs, &mia32, sizeof(mia32));
			if (error != 0) {
				IMGSRC_DEBUG("Failed to copy in arguments.\n");
				return error;
			}

			height = mia32.mi_height;
			flags = mia32.mi_flags;
			devpath = mia32.mi_devpath;
		}
	} else {
		/*
		 * For binary compatibility--assumes one level of nesting.
		 */
		if (is64bit) {
			if ((error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath)))) {
				return error;
			}
		} else {
			user32_addr_t tmp;
			if ((error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp)))) {
				return error;
			}

			/* munge into LP64 addr */
			devpath = CAST_USER_ADDR_T(tmp);
		}

		height = 0;
		flags = 0;
	}

	/* No flag bits are currently defined for this operation. */
	if (flags != 0) {
		IMGSRC_DEBUG("%s: Got nonzero flags.\n", __FUNCTION__);
		return EINVAL;
	}

	/* Takes an iocount on rvp; released on all exits below. */
	error = get_imgsrc_rootvnode(height, &rvp);
	if (error != 0) {
		IMGSRC_DEBUG("getting old root vnode failed with %d\n", error);
		return error;
	}

	IMGSRC_DEBUG("got old root vnode\n");

	/* Scratch buffer to restore f_mntonname if mount_list_add fails. */
	old_mntonname = zalloc_flags(ZV_NAMEI, Z_WAITOK);

	/* Can only move once */
	mp = vnode_mount(rvp);
	if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
		IMGSRC_DEBUG("Already moved.\n");
		error = EBUSY;
		goto out0;
	}

	IMGSRC_DEBUG("moving rvp: fsname = %s\n", mp->mnt_vtable->vfc_name);
	IMGSRC_DEBUG("Starting updated.\n");

	/* Get exclusive rwlock on mount, authorize update on mp */
	error = mount_begin_update(mp, ctx, 0);
	if (error != 0) {
		IMGSRC_DEBUG("Starting updated failed with %d\n", error);
		goto out0;
	}

	/*
	 * It can only be moved once. Flag is set under the rwlock,
	 * so we're now safe to proceed.
	 */
	if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
		IMGSRC_DEBUG("Already moved [2]\n");
		goto out1;
	}

	IMGSRC_DEBUG("Preparing coveredvp.\n");

	/* Mark covered vnode as mount in progress, authorize placing mount on top */
	error = prepare_coveredvp(vp, ctx, cnp, fsname, 0);
	if (error != 0) {
		IMGSRC_DEBUG("Preparing coveredvp failed with %d.\n", error);
		goto out1;
	}

	IMGSRC_DEBUG("Covered vp OK.\n");

	/* Sanity check the name caller has provided */
	vfsp = mp->mnt_vtable;
	if (strncmp(vfsp->vfc_name, fsname, MFSNAMELEN) != 0) {
		IMGSRC_DEBUG("Wrong fs name: actual = %s, expected = %s\n",
		    vfsp->vfc_name, fsname);
		error = EINVAL;
		goto out2;
	}

	/* Check the device vnode and update mount-from name, for local filesystems */
	if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) {
		IMGSRC_DEBUG("Local, doing device validation.\n");

		if (devpath != USER_ADDR_NULL) {
			error = authorize_devpath_and_update_mntfromname(mp, devpath, &devvp, ctx);
			if (error) {
				IMGSRC_DEBUG("authorize_devpath_and_update_mntfromname() failed.\n");
				goto out2;
			}

			/* Only needed the authorization; drop the iocount. */
			vnode_put(devvp);
		}
	}

	/*
	 * Place mp on top of vnode, ref the vnode, call checkdirs(),
	 * and increment the name cache's mount generation
	 */

	IMGSRC_DEBUG("About to call place_mount_and_checkdirs().\n");
	error = place_mount_and_checkdirs(mp, vp, ctx);
	if (error != 0) {
		goto out2;
	}

	placed = TRUE;

	/* Save the old mount-on name so it can be restored on failure. */
	strlcpy(old_mntonname, mp->mnt_vfsstat.f_mntonname, MAXPATHLEN);
	strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);

	/* Forbid future moves */
	mount_lock(mp);
	mp->mnt_kern_flag |= MNTK_HAS_MOVED;
	mount_unlock(mp);

	/* Finally, add to mount list, completely ready to go */
	if (mount_list_add(mp) != 0) {
		/*
		 * The system is shutting down trying to umount
		 * everything, so fail with a plausible errno.
		 */
		error = EBUSY;
		goto out3;
	}

	mount_end_update(mp);
	vnode_put(rvp);
	zfree(ZV_NAMEI, old_mntonname);

	vfs_notify_mount(pvp);
#if CONFIG_MACF
	mac_mount_notify_mount(ctx, mp);
#endif /* CONFIG_MACF */

	return 0;

	/* Error exits unwind in reverse order of the steps above. */
out3:
	strlcpy(mp->mnt_vfsstat.f_mntonname, old_mntonname, MAXPATHLEN);

	mount_lock(mp);
	mp->mnt_kern_flag &= ~(MNTK_HAS_MOVED);
	mount_unlock(mp);

out2:
	/*
	 * Placing the mp on the vnode clears VMOUNT,
	 * so cleanup is different after that point
	 */
	if (placed) {
		/* Rele the vp, clear VMOUNT and v_mountedhere */
		undo_place_on_covered_vp(mp, vp);
	} else {
		vnode_lock_spin(vp);
		CLR(vp->v_flag, VMOUNT);
		/* Wakeup waiter(s) waiting for in-progress mount to finish. */
		wakeup(&vp->v_flag);
		vnode_unlock(vp);
	}
out1:
	mount_end_update(mp);

out0:
	vnode_put(rvp);
	zfree(ZV_NAMEI, old_mntonname);
	return error;
}
2599
2600 #endif /* CONFIG_IMGSRC_ACCESS */
2601
2602 void
enablequotas(struct mount * mp,vfs_context_t ctx)2603 enablequotas(struct mount *mp, vfs_context_t ctx)
2604 {
2605 struct nameidata qnd;
2606 int type;
2607 char qfpath[MAXPATHLEN];
2608 const char *qfname = QUOTAFILENAME;
2609 const char *qfopsname = QUOTAOPSNAME;
2610 const char *qfextension[] = INITQFNAMES;
2611
2612 /* XXX Shoulkd be an MNTK_ flag, instead of strncmp()'s */
2613 if (strncmp(mp->mnt_vfsstat.f_fstypename, "hfs", sizeof("hfs")) != 0) {
2614 return;
2615 }
2616 /*
2617 * Enable filesystem disk quotas if necessary.
2618 * We ignore errors as this should not interfere with final mount
2619 */
2620 for (type = 0; type < MAXQUOTAS; type++) {
2621 snprintf(qfpath, sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfopsname, qfextension[type]);
2622 NDINIT(&qnd, LOOKUP, OP_MOUNT, FOLLOW, UIO_SYSSPACE,
2623 CAST_USER_ADDR_T(qfpath), ctx);
2624 if (namei(&qnd) != 0) {
2625 continue; /* option file to trigger quotas is not present */
2626 }
2627 vnode_put(qnd.ni_vp);
2628 nameidone(&qnd);
2629 snprintf(qfpath, sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfname, qfextension[type]);
2630
2631 (void) VFS_QUOTACTL(mp, QCMD(Q_QUOTAON, type), 0, qfpath, ctx);
2632 }
2633 return;
2634 }
2635
2636
/*
 * Per-process callback for checkdirs(): if the process's current or
 * root directory is the newly covered vnode (olddp), repoint it at the
 * root of the new mount (newdp), transferring usecounts accordingly.
 * Always returns PROC_RETURNED so the iteration continues.
 */
static int
checkdirs_callback(proc_t p, void * arg)
{
	struct cdirargs *cdrp = (struct cdirargs *)arg;
	vnode_t olddp = cdrp->olddp;
	vnode_t newdp = cdrp->newdp;
	struct filedesc *fdp = &p->p_fd;
	/* new_* start as newdp and are NULLed when their ref is consumed. */
	vnode_t new_cvp = newdp;
	vnode_t new_rvp = newdp;
	vnode_t old_cvp = NULL;
	vnode_t old_rvp = NULL;

	/*
	 * XXX Also needs to iterate each thread in the process to see if it
	 * XXX is using a per-thread current working directory, and, if so,
	 * XXX update that as well.
	 */

	/*
	 * First, with the proc_fdlock held, check to see if we will need
	 * to do any work. If not, we will get out fast.
	 */
	proc_fdlock(p);
	if (fdp->fd_cdir != olddp && fdp->fd_rdir != olddp) {
		proc_fdunlock(p);
		return PROC_RETURNED;
	}
	proc_fdunlock(p);

	/*
	 * Ok, we will have to do some work. Always take two refs
	 * because we might need that many. We'll dispose of whatever
	 * we ended up not using.
	 */
	if (vnode_ref(newdp) != 0) {
		return PROC_RETURNED;
	}
	if (vnode_ref(newdp) != 0) {
		/* Second ref failed: release the first and bail. */
		vnode_rele(newdp);
		return PROC_RETURNED;
	}

	proc_dirs_lock_exclusive(p);
	/*
	 * Now do the work. Note: we dropped the proc_fdlock, so we
	 * have to do all of the checks again.
	 */
	proc_fdlock(p);
	if (fdp->fd_cdir == olddp) {
		old_cvp = olddp;
		fdp->fd_cdir = newdp;
		new_cvp = NULL;     /* this ref is now owned by fd_cdir */
	}
	if (fdp->fd_rdir == olddp) {
		old_rvp = olddp;
		fdp->fd_rdir = newdp;
		new_rvp = NULL;     /* this ref is now owned by fd_rdir */
	}
	proc_fdunlock(p);
	proc_dirs_unlock_exclusive(p);

	/*
	 * Dispose of any references that are no longer needed.
	 */
	if (old_cvp != NULL) {
		vnode_rele(old_cvp);
	}
	if (old_rvp != NULL) {
		vnode_rele(old_rvp);
	}
	if (new_cvp != NULL) {
		vnode_rele(new_cvp);
	}
	if (new_rvp != NULL) {
		vnode_rele(new_rvp);
	}

	return PROC_RETURNED;
}
2716
2717
2718
2719 /*
2720 * Scan all active processes to see if any of them have a current
2721 * or root directory onto which the new filesystem has just been
2722 * mounted. If so, replace them with the new mount point.
2723 */
static int
checkdirs(vnode_t olddp, vfs_context_t ctx)
{
	vnode_t newdp;
	vnode_t tvp;
	int err;
	struct cdirargs cdr;

	/* Sole reference is the mount's own: no process can have it as cwd/root. */
	if (olddp->v_usecount == 1) {
		return 0;
	}
	/* Root of the filesystem just mounted over olddp (returned with an iocount). */
	err = VFS_ROOT(olddp->v_mountedhere, &newdp, ctx);

	if (err != 0) {
#if DIAGNOSTIC
		panic("mount: lost mount: error %d", err);
#endif
		return err;
	}

	cdr.olddp = olddp;
	cdr.newdp = newdp;
	/* do not block for exec/fork trans as the vp in cwd & rootdir are not changing */
	proc_iterate(PROC_ALLPROCLIST | PROC_NOWAITTRANS, checkdirs_callback, (void *)&cdr, NULL, NULL);

	/* If the system root itself was covered, swap it under the rw lock. */
	if (rootvnode == olddp) {
		vnode_ref(newdp);
		lck_rw_lock_exclusive(&rootvnode_rw_lock);
		tvp = rootvnode;
		rootvnode = newdp;
		lck_rw_unlock_exclusive(&rootvnode_rw_lock);
		vnode_rele(tvp);
	}

	vnode_put(newdp);
	return 0;
}
2761
2762 #define ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT \
2763 "com.apple.private.vfs.role-account-unmount"
2764 #define SYSTEM_VOLUME_UNMOUNT_ENTITLEMENT \
2765 "com.apple.private.vfs.system-volume-unmount"
2766
2767 /*
2768 * Unmount a file system.
2769 *
2770 * Note: unmount takes a path to the vnode mounted on as argument,
2771 * not special file (as before).
2772 */
2773 /* ARGSUSED */
2774 int
unmount(__unused proc_t p,struct unmount_args * uap,__unused int32_t * retval)2775 unmount(__unused proc_t p, struct unmount_args *uap, __unused int32_t *retval)
2776 {
2777 vnode_t vp;
2778 struct mount *mp;
2779 int flags = uap->flags;
2780 int error;
2781 struct nameidata nd;
2782 vfs_context_t ctx;
2783
2784 /*
2785 * If the process has the entitlement, use the kernel's context when
2786 * performing lookup on the mount path as the process might lack proper
2787 * permission to access the directory.
2788 */
2789 ctx = IOCurrentTaskHasEntitlement(ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT) ?
2790 vfs_context_kernel() : vfs_context_current();
2791
2792 NDINIT(&nd, LOOKUP, OP_UNMOUNT, FOLLOW | AUDITVNPATH1,
2793 UIO_USERSPACE, uap->path, ctx);
2794 if (flags & MNT_NOFOLLOW) {
2795 nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
2796 }
2797
2798 error = namei(&nd);
2799 if (error) {
2800 return error;
2801 }
2802 vp = nd.ni_vp;
2803 mp = vp->v_mount;
2804 nameidone(&nd);
2805
2806 /*
2807 * Must be the root of the filesystem
2808 */
2809 if ((vp->v_flag & VROOT) == 0) {
2810 vnode_put(vp);
2811 return EINVAL;
2812 }
2813 #if CONFIG_MACF
2814 error = mac_mount_check_umount(ctx, mp);
2815 if (error != 0) {
2816 vnode_put(vp);
2817 return error;
2818 }
2819 #endif
2820 mount_ref(mp, 0);
2821 vnode_put(vp);
2822 /* safedounmount consumes the mount ref */
2823 return safedounmount(mp, flags, ctx);
2824 }
2825
2826 int
funmount(__unused proc_t p,struct funmount_args * uap,__unused int32_t * retval)2827 funmount(__unused proc_t p, struct funmount_args *uap, __unused int32_t *retval)
2828 {
2829 int error;
2830 vnode_t vp;
2831 struct mount *mp;
2832 vfs_context_t ctx;
2833
2834 AUDIT_ARG(fd, uap->fd);
2835 AUDIT_ARG(fflags, uap->flags);
2836
2837 /*
2838 * If the process has the entitlement, use the kernel's context when
2839 * performing lookup on the mount path as the process might lack proper
2840 * permission to access the directory.
2841 */
2842 ctx = IOCurrentTaskHasEntitlement(ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT) ?
2843 vfs_context_kernel() : vfs_context_current();
2844
2845 error = vnode_getfromfd(ctx, uap->fd, &vp);
2846 if (error) {
2847 return error;
2848 }
2849
2850 /*
2851 * Must be the root of the filesystem
2852 */
2853 if ((vp->v_flag & VROOT) == 0) {
2854 vnode_put(vp);
2855 return EINVAL;
2856 }
2857 mp = vnode_mount(vp);
2858
2859 #if CONFIG_MACF
2860 error = mac_mount_check_umount(ctx, mp);
2861 if (error != 0) {
2862 vnode_put(vp);
2863 return error;
2864 }
2865 #endif
2866 mount_ref(mp, 0);
2867 vnode_put(vp);
2868
2869 /* safedounmount consumes the mount ref */
2870 return safedounmount(mp, uap->flags, ctx);
2871 }
2872
2873 int
vfs_unmountbyfsid(fsid_t * fsid,int flags,vfs_context_t ctx)2874 vfs_unmountbyfsid(fsid_t *fsid, int flags, vfs_context_t ctx)
2875 {
2876 mount_t mp;
2877
2878 mp = mount_list_lookupby_fsid(fsid, 0, 1);
2879 if (mp == (mount_t)0) {
2880 return ENOENT;
2881 }
2882 mount_ref(mp, 0);
2883 mount_iterdrop(mp);
2884 /* safedounmount consumes the mount ref */
2885 return safedounmount(mp, flags, ctx);
2886 }
2887
2888 /*
2889 * The mount struct comes with a mount ref which will be consumed.
2890 * Do the actual file system unmount, prevent some common foot shooting.
2891 */
int
safedounmount(struct mount *mp, int flags, vfs_context_t ctx)
{
	int error;
	proc_t p = vfs_context_proc(ctx);

	/*
	 * If the file system is not responding and MNT_NOBLOCK
	 * is set and not a forced unmount then return EBUSY.
	 */
	if ((mp->mnt_lflag & MNT_LNOTRESP) &&
	    (flags & MNT_NOBLOCK) && ((flags & MNT_FORCE) == 0)) {
		error = EBUSY;
		goto out;
	}

	/*
	 * Skip authorization in two cases:
	 * - If the process running the unmount has ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT.
	 *   This entitlement allows non-root processes unmount volumes mounted by
	 *   other processes.
	 * - If the mount is tagged as permissive and this is not a forced-unmount
	 *   attempt.
	 */
	if (!IOCurrentTaskHasEntitlement(ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT) &&
	    (!(((mp->mnt_kern_flag & MNTK_PERMIT_UNMOUNT) != 0) && ((flags & MNT_FORCE) == 0)))) {
		/*
		 * Only root, or the user that did the original mount is
		 * permitted to unmount this filesystem.
		 */
		if ((mp->mnt_vfsstat.f_owner != kauth_cred_getuid(kauth_cred_get())) &&
		    (error = suser(kauth_cred_get(), &p->p_acflag))) {
			goto out;
		}
	}

	/*
	 * Don't allow unmounting the root file system, or other volumes
	 * associated with it (for example, the associated VM or DATA mounts) .
	 */
	if (mp->mnt_flag & MNT_ROOTFS) {
		error = EBUSY; /* the root is always busy */
		goto out;
	}
	/* System (root-associated) mounts need a dedicated entitlement to unmount. */
	if ((mp->mnt_kern_flag & MNTK_SYSTEM) && !IOCurrentTaskHasEntitlement(SYSTEM_VOLUME_UNMOUNT_ENTITLEMENT)) {
		printf("attempt to unmount a system mount (%s), will return EBUSY\n",
		    mp->mnt_vfsstat.f_mntonname);
		error = EBUSY; /* root-associated volumes are always busy unless caller is entitled */
		goto out;
	}

	/*
	 * If the mount is providing the root filesystem's disk image
	 * (i.e. imageboot), don't allow unmounting
	 */
	if (mp->mnt_kern_flag & MNTK_BACKS_ROOT) {
		error = EBUSY;
		goto out;
	}

	/* dounmount() consumes the mount ref on success and failure alike. */
	return dounmount(mp, flags, 1, ctx);

out:
	/* Denied before dounmount(): drop the mount ref we were handed. */
	mount_drop(mp, 0);
	return error;
}
2958
2959 /*
2960 * Do the actual file system unmount.
2961 */
2962 int
dounmount(struct mount * mp,int flags,int withref,vfs_context_t ctx)2963 dounmount(struct mount *mp, int flags, int withref, vfs_context_t ctx)
2964 {
2965 vnode_t coveredvp = (vnode_t)0;
2966 int error;
2967 int needwakeup = 0;
2968 int forcedunmount = 0;
2969 int lflags = 0;
2970 struct vnode *devvp = NULLVP;
2971 #if CONFIG_TRIGGERS
2972 proc_t p = vfs_context_proc(ctx);
2973 int did_vflush = 0;
2974 int pflags_save = 0;
2975 #endif /* CONFIG_TRIGGERS */
2976
2977 #if CONFIG_FSE
2978 if (!(flags & MNT_FORCE)) {
2979 fsevent_unmount(mp, ctx); /* has to come first! */
2980 }
2981 #endif
2982
2983 mount_lock(mp);
2984
2985 /*
2986 * If already an unmount in progress just return EBUSY.
2987 * Even a forced unmount cannot override.
2988 */
2989 if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
2990 if (withref != 0) {
2991 mount_drop(mp, 1);
2992 }
2993 mount_unlock(mp);
2994 return EBUSY;
2995 }
2996
2997 if (flags & MNT_FORCE) {
2998 forcedunmount = 1;
2999 mp->mnt_lflag |= MNT_LFORCE;
3000 }
3001
3002 #if CONFIG_TRIGGERS
3003 if (flags & MNT_NOBLOCK && p != kernproc) {
3004 pflags_save = OSBitOrAtomic(P_NOREMOTEHANG, &p->p_flag);
3005 }
3006 #endif
3007
3008 mp->mnt_kern_flag |= MNTK_UNMOUNT;
3009 mp->mnt_lflag |= MNT_LUNMOUNT;
3010 mp->mnt_flag &= ~MNT_ASYNC;
3011 /*
3012 * anyone currently in the fast path that
3013 * trips over the cached rootvp will be
3014 * dumped out and forced into the slow path
3015 * to regenerate a new cached value
3016 */
3017 mp->mnt_realrootvp = NULLVP;
3018 mount_unlock(mp);
3019
3020 if (forcedunmount && (flags & MNT_LNOSUB) == 0) {
3021 /*
3022 * Force unmount any mounts in this filesystem.
3023 * If any unmounts fail - just leave them dangling.
3024 * Avoids recursion.
3025 */
3026 (void) dounmount_submounts(mp, flags | MNT_LNOSUB, ctx);
3027 }
3028
3029 /*
3030 * taking the name_cache_lock exclusively will
3031 * insure that everyone is out of the fast path who
3032 * might be trying to use a now stale copy of
3033 * vp->v_mountedhere->mnt_realrootvp
3034 * bumping mount_generation causes the cached values
3035 * to be invalidated
3036 */
3037 name_cache_lock();
3038 mount_generation++;
3039 name_cache_unlock();
3040
3041 /*
3042 * Make sure there are no one in the mount iterations or lookup.
3043 * Drain makes 'mnt_iterref' -ve so on error exit we need to ensure that
3044 * 'mnt_iterref' is reset back to 0 by calling mount_iterreset().
3045 */
3046 mount_iterdrain(mp);
3047
3048 lck_rw_lock_exclusive(&mp->mnt_rwlock);
3049 if (withref != 0) {
3050 mount_drop(mp, 0);
3051 }
3052 error = 0;
3053 if (forcedunmount == 0) {
3054 ubc_umount(mp); /* release cached vnodes */
3055 if ((mp->mnt_flag & MNT_RDONLY) == 0) {
3056 error = VFS_SYNC(mp, MNT_WAIT, ctx);
3057 if (error) {
3058 mount_iterreset(mp);
3059 mount_lock(mp);
3060 mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
3061 mp->mnt_lflag &= ~MNT_LUNMOUNT;
3062 mp->mnt_lflag &= ~MNT_LFORCE;
3063 goto out;
3064 }
3065 }
3066 }
3067
3068 IOBSDMountChange(mp, kIOMountChangeUnmount);
3069
3070 #if CONFIG_TRIGGERS
3071 vfs_nested_trigger_unmounts(mp, flags, ctx);
3072 did_vflush = 1;
3073 #endif
3074 if (forcedunmount) {
3075 lflags |= FORCECLOSE;
3076 }
3077 error = vflush(mp, NULLVP, SKIPSWAP | SKIPSYSTEM | SKIPROOT | lflags);
3078 if ((forcedunmount == 0) && error) {
3079 mount_iterreset(mp);
3080 mount_lock(mp);
3081 mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
3082 mp->mnt_lflag &= ~MNT_LUNMOUNT;
3083 mp->mnt_lflag &= ~MNT_LFORCE;
3084 goto out;
3085 }
3086
3087 error = VFS_UNMOUNT(mp, flags, ctx);
3088 if (error) {
3089 mount_iterreset(mp);
3090 mount_lock(mp);
3091 mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
3092 mp->mnt_lflag &= ~MNT_LUNMOUNT;
3093 mp->mnt_lflag &= ~MNT_LFORCE;
3094 goto out;
3095 }
3096
3097 /* increment the operations count */
3098 if (!error) {
3099 OSAddAtomic(1, &vfs_nummntops);
3100 }
3101
3102 if (mp->mnt_devvp && mp->mnt_vtable->vfc_vfsflags & VFC_VFSLOCALARGS) {
3103 /* hold an io reference and drop the usecount before close */
3104 devvp = mp->mnt_devvp;
3105 vnode_getalways(devvp);
3106 vnode_rele(devvp);
3107 VNOP_CLOSE(devvp, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD | FWRITE,
3108 ctx);
3109 vnode_clearmountedon(devvp);
3110 vnode_put(devvp);
3111 }
3112 lck_rw_done(&mp->mnt_rwlock);
3113 mount_list_remove(mp);
3114 lck_rw_lock_exclusive(&mp->mnt_rwlock);
3115
3116 /* mark the mount point hook in the vp but not drop the ref yet */
3117 if ((coveredvp = mp->mnt_vnodecovered) != NULLVP) {
3118 /*
3119 * The covered vnode needs special handling. Trying to get an
3120 * iocount must not block here as this may lead to deadlocks
3121 * if the Filesystem to which the covered vnode belongs is
3122 * undergoing forced unmounts. Since we hold a usecount, the
3123 * vnode cannot be reused (it can, however, still be terminated)
3124 */
3125 vnode_getalways(coveredvp);
3126 vnode_lock_spin(coveredvp);
3127
3128 mp->mnt_crossref++;
3129 coveredvp->v_mountedhere = (struct mount *)0;
3130 CLR(coveredvp->v_flag, VMOUNT | VMOUNTEDHERE);
3131 /* Wakeup waiter(s) waiting for in-progress mount to finish. */
3132 wakeup(&coveredvp->v_flag);
3133 vnode_unlock(coveredvp);
3134 vnode_put(coveredvp);
3135 }
3136
3137 mount_list_lock();
3138 mp->mnt_vtable->vfc_refcount--;
3139 mount_list_unlock();
3140
3141 cache_purgevfs(mp); /* remove cache entries for this file sys */
3142 vfs_event_signal(NULL, VQ_UNMOUNT, (intptr_t)NULL);
3143 mount_lock(mp);
3144 mp->mnt_lflag |= MNT_LDEAD;
3145
3146 if (mp->mnt_lflag & MNT_LWAIT) {
3147 /*
3148 * do the wakeup here
3149 * in case we block in mount_refdrain
3150 * which will drop the mount lock
3151 * and allow anyone blocked in vfs_busy
3152 * to wakeup and see the LDEAD state
3153 */
3154 mp->mnt_lflag &= ~MNT_LWAIT;
3155 wakeup((caddr_t)mp);
3156 }
3157 mount_refdrain(mp);
3158
3159 /* free disk_conditioner_info structure for this mount */
3160 disk_conditioner_unmount(mp);
3161
3162 out:
3163 if (mp->mnt_lflag & MNT_LWAIT) {
3164 mp->mnt_lflag &= ~MNT_LWAIT;
3165 needwakeup = 1;
3166 }
3167
3168 #if CONFIG_TRIGGERS
3169 if (flags & MNT_NOBLOCK && p != kernproc) {
3170 // Restore P_NOREMOTEHANG bit to its previous value
3171 if ((pflags_save & P_NOREMOTEHANG) == 0) {
3172 OSBitAndAtomic(~((uint32_t) P_NOREMOTEHANG), &p->p_flag);
3173 }
3174 }
3175
3176 /*
3177 * Callback and context are set together under the mount lock, and
3178 * never cleared, so we're safe to examine them here, drop the lock,
3179 * and call out.
3180 */
3181 if (mp->mnt_triggercallback != NULL) {
3182 mount_unlock(mp);
3183 if (error == 0) {
3184 mp->mnt_triggercallback(mp, VTC_RELEASE, mp->mnt_triggerdata, ctx);
3185 } else if (did_vflush) {
3186 mp->mnt_triggercallback(mp, VTC_REPLACE, mp->mnt_triggerdata, ctx);
3187 }
3188 } else {
3189 mount_unlock(mp);
3190 }
3191 #else
3192 mount_unlock(mp);
3193 #endif /* CONFIG_TRIGGERS */
3194
3195 lck_rw_done(&mp->mnt_rwlock);
3196
3197 if (needwakeup) {
3198 wakeup((caddr_t)mp);
3199 }
3200
3201 if (!error) {
3202 if ((coveredvp != NULLVP)) {
3203 vnode_t pvp = NULLVP;
3204
3205 /*
3206 * The covered vnode needs special handling. Trying to
3207 * get an iocount must not block here as this may lead
3208 * to deadlocks if the Filesystem to which the covered
3209 * vnode belongs is undergoing forced unmounts. Since we
3210 * hold a usecount, the vnode cannot be reused
3211 * (it can, however, still be terminated).
3212 */
3213 vnode_getalways(coveredvp);
3214
3215 mount_dropcrossref(mp, coveredvp, 0);
3216 /*
3217 * We'll _try_ to detect if this really needs to be
3218 * done. The coveredvp can only be in termination (or
3219 * terminated) if the coveredvp's mount point is in a
3220 * forced unmount (or has been) since we still hold the
3221 * ref.
3222 */
3223 if (!vnode_isrecycled(coveredvp)) {
3224 pvp = vnode_getparent(coveredvp);
3225 #if CONFIG_TRIGGERS
3226 if (coveredvp->v_resolve) {
3227 vnode_trigger_rearm(coveredvp, ctx);
3228 }
3229 #endif
3230 }
3231
3232 vnode_rele(coveredvp);
3233 vnode_put(coveredvp);
3234 coveredvp = NULLVP;
3235
3236 if (pvp) {
3237 lock_vnode_and_post(pvp, NOTE_WRITE);
3238 vnode_put(pvp);
3239 }
3240 } else if (mp->mnt_flag & MNT_ROOTFS) {
3241 if (nc_smr_enabled) {
3242 vfs_smr_synchronize();
3243 }
3244
3245 mount_lock_destroy(mp);
3246 #if CONFIG_MACF
3247 mac_mount_label_destroy(mp);
3248 #endif
3249 zfree(mount_zone, mp);
3250 } else {
3251 panic("dounmount: no coveredvp");
3252 }
3253 }
3254 return error;
3255 }
3256
/*
 * Unmount any mounts in this filesystem.
 *
 * dounmount_submounts: best-effort unmount of every mount nested beneath
 * 'mp' (directly or transitively), in reverse mount order, so that 'mp'
 * itself can subsequently be unmounted.  Errors from the individual
 * dounmount() calls are ignored.  'flags' and 'ctx' are passed through
 * to dounmount().
 */
void
dounmount_submounts(struct mount *mp, int flags, vfs_context_t ctx)
{
	mount_t smp;
	fsid_t *fsids, fsid;
	int fsids_sz;
	int count = 0, i, m = 0;
	vnode_t vp;

	mount_list_lock();

	// Get an array to hold the submounts fsids.
	TAILQ_FOREACH(smp, &mountlist, mnt_list)
	count++;
	fsids_sz = count * sizeof(fsid_t);
	/* Z_NOWAIT: cannot block for memory while mount_list_lock is held. */
	fsids = kalloc_data(fsids_sz, Z_NOWAIT);
	if (fsids == NULL) {
		mount_list_unlock();
		goto out;
	}
	fsids[0] = mp->mnt_vfsstat.f_fsid; // Prime the pump

	/*
	 * Fill the array with submount fsids.
	 * Since mounts are always added to the tail of the mount list, the
	 * list is always in mount order.
	 * For each mount check if the mounted-on vnode belongs to a
	 * mount that's already added to our array of mounts to be unmounted.
	 */
	for (smp = TAILQ_NEXT(mp, mnt_list); smp; smp = TAILQ_NEXT(smp, mnt_list)) {
		vp = smp->mnt_vnodecovered;
		if (vp == NULL) {
			continue;
		}
		fsid = vnode_mount(vp)->mnt_vfsstat.f_fsid; // Underlying fsid
		for (i = 0; i <= m; i++) {
			if (fsids[i].val[0] == fsid.val[0] &&
			    fsids[i].val[1] == fsid.val[1]) {
				/* Covered by a mount already in the set: it is a submount. */
				fsids[++m] = smp->mnt_vfsstat.f_fsid;
				break;
			}
		}
	}
	mount_list_unlock();

	// Unmount the submounts in reverse order. Ignore errors.
	for (i = m; i > 0; i--) {
		smp = mount_list_lookupby_fsid(&fsids[i], 0, 1);
		if (smp) {
			/* Convert the iteration reference into a mount ref for dounmount. */
			mount_ref(smp, 0);
			mount_iterdrop(smp);
			(void) dounmount(smp, flags, 1, ctx);
		}
	}
out:
	/* kfree_data(NULL, ...) is a no-op on the allocation-failure path. */
	kfree_data(fsids, fsids_sz);
}
3317
/*
 * mount_dropcrossref: drop one crossref taken on 'mp' while its covered
 * vnode 'dp' was being manipulated (see dounmount).
 *
 * If this was the last crossref and 'dp' no longer points back at 'mp'
 * (dp->v_mountedhere != mp), this call is responsible for freeing the
 * mount structure.  'need_put' indicates the caller also holds an
 * iocount on 'dp' to be released under the vnode lock.
 */
void
mount_dropcrossref(mount_t mp, vnode_t dp, int need_put)
{
	/* Hold dp so it stays valid across the unlock below. */
	vnode_hold(dp);
	vnode_lock(dp);
	mp->mnt_crossref--;

	if (mp->mnt_crossref < 0) {
		panic("mount cross refs -ve");
	}

	/*
	 * Last crossref gone and the vnode is no longer covered by this
	 * mount: tear the mount structure down here.
	 */
	if ((mp != dp->v_mountedhere) && (mp->mnt_crossref == 0)) {
		if (need_put) {
			vnode_put_locked(dp);
		}
		vnode_drop_and_unlock(dp);

		/* Let SMR readers drain before the mount memory is recycled. */
		if (nc_smr_enabled) {
			vfs_smr_synchronize();
		}

		mount_lock_destroy(mp);
#if CONFIG_MACF
		mac_mount_label_destroy(mp);
#endif
		zfree(mount_zone, mp);
		return;
	}
	if (need_put) {
		vnode_put_locked(dp);
	}
	vnode_drop_and_unlock(dp);
}
3351
3352
3353 /*
3354 * Sync each mounted filesystem.
3355 */
#if DIAGNOSTIC
int syncprt = 0;                /* when set, dump buffer statistics after each sync */
#endif

int print_vmpage_stat = 0;      /* when set, log dirty-page counts after each sync */
3361
3362 /*
3363 * sync_callback: simple wrapper that calls VFS_SYNC() on volumes
3364 * mounted read-write with the passed waitfor value.
3365 *
3366 * Parameters: mp mount-point descriptor per mounted file-system instance.
3367 * arg user argument (please see below)
3368 *
3369 * User argument is a pointer to 32 bit unsigned integer which describes the
3370 * type of waitfor value to set for calling VFS_SYNC(). If user argument is
3371 * passed as NULL, VFS_SYNC() is called with MNT_NOWAIT set as the default
3372 * waitfor value.
3373 *
3374 * Returns: VFS_RETURNED
3375 */
3376 static int
sync_callback(mount_t mp,void * arg)3377 sync_callback(mount_t mp, void *arg)
3378 {
3379 if ((mp->mnt_flag & MNT_RDONLY) == 0) {
3380 int asyncflag = mp->mnt_flag & MNT_ASYNC;
3381 unsigned waitfor = MNT_NOWAIT;
3382
3383 if (arg) {
3384 waitfor = *(uint32_t*)arg;
3385 }
3386
3387 /* Sanity check for flags - these are the only valid combinations for the flag bits*/
3388 if (waitfor != MNT_WAIT &&
3389 waitfor != (MNT_WAIT | MNT_VOLUME) &&
3390 waitfor != MNT_NOWAIT &&
3391 waitfor != (MNT_NOWAIT | MNT_VOLUME) &&
3392 waitfor != MNT_DWAIT &&
3393 waitfor != (MNT_DWAIT | MNT_VOLUME)) {
3394 panic("Passed inappropriate waitfor %u to "
3395 "sync_callback()", waitfor);
3396 }
3397
3398 mp->mnt_flag &= ~MNT_ASYNC;
3399 (void)VFS_SYNC(mp, waitfor, vfs_context_kernel());
3400 if (asyncflag) {
3401 mp->mnt_flag |= MNT_ASYNC;
3402 }
3403 }
3404
3405 return VFS_RETURNED;
3406 }
3407
3408 /* ARGSUSED */
3409 int
sync(__unused proc_t p,__unused struct sync_args * uap,__unused int32_t * retval)3410 sync(__unused proc_t p, __unused struct sync_args *uap, __unused int32_t *retval)
3411 {
3412 vfs_iterate(LK_NOWAIT, sync_callback, NULL);
3413
3414 if (print_vmpage_stat) {
3415 vm_countdirtypages();
3416 }
3417
3418 #if DIAGNOSTIC
3419 if (syncprt) {
3420 vfs_bufstats();
3421 }
3422 #endif /* DIAGNOSTIC */
3423 return 0;
3424 }
3425
/*
 * Media-selection argument for sync_internal_callback(): restrict a sync
 * pass to reliable media (local, non-virtual devices), to unreliable
 * media, or apply it to all mounts.
 */
typedef enum {
	SYNC_ALL = 0,
	SYNC_ONLY_RELIABLE_MEDIA = 1,
	SYNC_ONLY_UNRELIABLE_MEDIA = 2
} sync_type_t;
3431
3432 static int
sync_internal_callback(mount_t mp,void * arg)3433 sync_internal_callback(mount_t mp, void *arg)
3434 {
3435 if (arg) {
3436 int is_reliable = !(mp->mnt_kern_flag & MNTK_VIRTUALDEV) &&
3437 (mp->mnt_flag & MNT_LOCAL);
3438 sync_type_t sync_type = *((sync_type_t *)arg);
3439
3440 if ((sync_type == SYNC_ONLY_RELIABLE_MEDIA) && !is_reliable) {
3441 return VFS_RETURNED;
3442 } else if ((sync_type == SYNC_ONLY_UNRELIABLE_MEDIA) && is_reliable) {
3443 return VFS_RETURNED;
3444 }
3445 }
3446
3447 (void)sync_callback(mp, NULL);
3448
3449 return VFS_RETURNED;
3450 }
3451
/* Coordination state between sync_internal() and sync_thread(). */
int sync_thread_state = 0;
int sync_timeout_seconds = 5;   /* max time sync_internal() waits for a pass */

#define SYNC_THREAD_RUN         0x0001  /* a sync pass has been requested */
#define SYNC_THREAD_RUNNING     0x0002  /* sync_thread is currently alive */

#if CONFIG_PHYS_WRITE_ACCT
thread_t pm_sync_thread;
#endif /* CONFIG_PHYS_WRITE_ACCT */
3461
/*
 * sync_thread: worker started by sync_internal().  Repeats full sync
 * passes for as long as new requests (SYNC_THREAD_RUN) keep arriving,
 * then clears SYNC_THREAD_RUNNING and exits.
 */
static void
sync_thread(__unused void *arg, __unused wait_result_t wr)
{
	sync_type_t sync_type;
#if CONFIG_PHYS_WRITE_ACCT
	pm_sync_thread = current_thread();
#endif /* CONFIG_PHYS_WRITE_ACCT */

	lck_mtx_lock(&sync_mtx_lck);
	while (sync_thread_state & SYNC_THREAD_RUN) {
		sync_thread_state &= ~SYNC_THREAD_RUN;
		lck_mtx_unlock(&sync_mtx_lck);

		/* Sync reliable media first, then unreliable media. */
		sync_type = SYNC_ONLY_RELIABLE_MEDIA;
		vfs_iterate(LK_NOWAIT, sync_internal_callback, &sync_type);
		sync_type = SYNC_ONLY_UNRELIABLE_MEDIA;
		vfs_iterate(LK_NOWAIT, sync_internal_callback, &sync_type);

		lck_mtx_lock(&sync_mtx_lck);
	}
	/*
	 * This wakeup _has_ to be issued before the lock is released otherwise
	 * we may end up waking up a thread in sync_internal which is
	 * expecting a wakeup from a thread it just created and not from this
	 * thread which is about to exit.
	 */
	wakeup(&sync_thread_state);
	sync_thread_state &= ~SYNC_THREAD_RUNNING;
#if CONFIG_PHYS_WRITE_ACCT
	pm_sync_thread = NULL;
#endif /* CONFIG_PHYS_WRITE_ACCT */
	lck_mtx_unlock(&sync_mtx_lck);

	if (print_vmpage_stat) {
		vm_countdirtypages();
	}

#if DIAGNOSTIC
	if (syncprt) {
		vfs_bufstats();
	}
#endif /* DIAGNOSTIC */
}
3505
/* Rate-limits the "sync timed out" console message to once per ~120s. */
struct timeval sync_timeout_last_print = {.tv_sec = 0, .tv_usec = 0};

/*
 * An in-kernel sync for power management to call.
 * This function always returns within sync_timeout seconds.
 *
 * Requests a pass from sync_thread (starting it if necessary) and waits
 * at most sync_timeout_seconds for its wakeup.  Always returns 0; a
 * timeout is only logged, not reported.
 */
__private_extern__ int
sync_internal(void)
{
	thread_t thd = NULL;
	int error;
	int thread_created = FALSE;
	struct timespec ts = {.tv_sec = sync_timeout_seconds, .tv_nsec = 0};

	lck_mtx_lock(&sync_mtx_lck);
	/* Request a (new) pass; start the worker if none is running. */
	sync_thread_state |= SYNC_THREAD_RUN;
	if (!(sync_thread_state & SYNC_THREAD_RUNNING)) {
		int kr;

		sync_thread_state |= SYNC_THREAD_RUNNING;
		kr = kernel_thread_start(sync_thread, NULL, &thd);
		if (kr != KERN_SUCCESS) {
			sync_thread_state &= ~SYNC_THREAD_RUNNING;
			lck_mtx_unlock(&sync_mtx_lck);
			printf("sync_thread failed\n");
			return 0;
		}
		thread_created = TRUE;
	}

	/* PDROP: the mutex is released while (and after) we sleep. */
	error = msleep((caddr_t)&sync_thread_state, &sync_mtx_lck,
	    (PVFS | PDROP | PCATCH), "sync_thread", &ts);
	if (error) {
		struct timeval now;

		microtime(&now);
		if (now.tv_sec - sync_timeout_last_print.tv_sec > 120) {
			printf("sync timed out: %d sec\n", sync_timeout_seconds);
			sync_timeout_last_print.tv_sec = now.tv_sec;
		}
	}

	/* Drop the extra thread reference from kernel_thread_start(). */
	if (thread_created) {
		thread_deallocate(thd);
	}

	return 0;
} /* end of sync_internal call */
3554
3555 /*
3556 * Change filesystem quotas.
3557 */
3558 #if QUOTA
/*
 * quotactl(2): manipulate filesystem quotas on the mount containing
 * uap->path.  The sub-command (uap->cmd >> SUBCMDSHIFT) determines what
 * 'datap' points at for VFS_QUOTACTL() and what is copied in/out.
 */
int
quotactl(proc_t p, struct quotactl_args *uap, __unused int32_t *retval)
{
	struct mount *mp;
	int error, quota_cmd, quota_status = 0;
	caddr_t datap;
	size_t fnamelen;
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();
	struct dqblk my_dqblk = {};

	AUDIT_ARG(uid, uap->uid);
	AUDIT_ARG(cmd, uap->cmd);
	/* Resolve uap->path to find the mount the quota op applies to. */
	NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
	    uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	mp = nd.ni_vp->v_mount;
	/* Keep the mount alive after dropping the vnode iocount. */
	mount_ref(mp, 0);
	vnode_put(nd.ni_vp);
	nameidone(&nd);

#if CONFIG_MACF
	error = mac_mount_check_quotactl(ctx, mp, uap->cmd, uap->uid);
	if (error != 0) {
		goto out;
	}
#endif

	/* copyin any data we will need for downstream code */
	quota_cmd = uap->cmd >> SUBCMDSHIFT;

	switch (quota_cmd) {
	case Q_QUOTAON:
		/* uap->arg specifies a file from which to take the quotas */
		fnamelen = MAXPATHLEN;
		datap = zalloc(ZV_NAMEI);
		error = copyinstr(uap->arg, datap, MAXPATHLEN, &fnamelen);
		break;
	case Q_GETQUOTA:
		/* uap->arg is a pointer to a dqblk structure. */
		datap = (caddr_t) &my_dqblk;
		break;
	case Q_SETQUOTA:
	case Q_SETUSE:
		/* uap->arg is a pointer to a dqblk structure. */
		datap = (caddr_t) &my_dqblk;
		if (proc_is64bit(p)) {
			/* 64-bit callers pass a user_dqblk; munge it down. */
			struct user_dqblk my_dqblk64;
			error = copyin(uap->arg, (caddr_t)&my_dqblk64, sizeof(my_dqblk64));
			if (error == 0) {
				munge_dqblk(&my_dqblk, &my_dqblk64, FALSE);
			}
		} else {
			error = copyin(uap->arg, (caddr_t)&my_dqblk, sizeof(my_dqblk));
		}
		break;
	case Q_QUOTASTAT:
		/* uap->arg is a pointer to an integer */
		datap = (caddr_t) &quota_status;
		break;
	default:
		/* e.g. Q_QUOTAOFF, Q_SYNC: no argument data needed. */
		datap = NULL;
		break;
	} /* switch */

	if (error == 0) {
		error = VFS_QUOTACTL(mp, uap->cmd, uap->uid, datap, ctx);
	}

	/* Copy results out / release per-sub-command resources. */
	switch (quota_cmd) {
	case Q_QUOTAON:
		if (datap != NULL) {
			zfree(ZV_NAMEI, datap);
		}
		break;
	case Q_GETQUOTA:
		/* uap->arg is a pointer to a dqblk structure we need to copy out to */
		if (error == 0) {
			if (proc_is64bit(p)) {
				struct user_dqblk my_dqblk64;

				memset(&my_dqblk64, 0, sizeof(my_dqblk64));
				munge_dqblk(&my_dqblk, &my_dqblk64, TRUE);
				error = copyout((caddr_t)&my_dqblk64, uap->arg, sizeof(my_dqblk64));
			} else {
				error = copyout(datap, uap->arg, sizeof(struct dqblk));
			}
		}
		break;
	case Q_QUOTASTAT:
		/* uap->arg is a pointer to an integer */
		if (error == 0) {
			error = copyout(datap, uap->arg, sizeof(quota_status));
		}
		break;
	default:
		break;
	} /* switch */

out:
	mount_drop(mp, 0);
	return error;
}
3665 #else
/* Quota support compiled out: reject all quotactl(2) requests. */
int
quotactl(__unused proc_t p, __unused struct quotactl_args *uap, __unused int32_t *retval)
{
	return EOPNOTSUPP;
}
3671 #endif /* QUOTA */
3672
3673 static int
statfs_internal(proc_t p,struct mount * mp,user_addr_t bufp)3674 statfs_internal(proc_t p, struct mount *mp, user_addr_t bufp)
3675 {
3676 int error;
3677 vfs_context_t ctx = vfs_context_current();
3678
3679 #if CONFIG_MACF
3680 error = mac_mount_check_stat(ctx, mp);
3681 if (error != 0) {
3682 return error;
3683 }
3684 #endif
3685
3686 error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT);
3687 if (error != 0) {
3688 return error;
3689 }
3690
3691 return munge_statfs(mp, &mp->mnt_vfsstat, bufp, NULL, IS_64BIT_PROCESS(p), TRUE);
3692 }
3693
3694 /*
3695 * Get filesystem statistics.
3696 *
3697 * Returns: 0 Success
3698 * namei:???
3699 * vfs_update_vfsstat:???
3700 * munge_statfs:EFAULT
3701 */
3702 /* ARGSUSED */
3703 int
statfs(proc_t p,struct statfs_args * uap,__unused int32_t * retval)3704 statfs(proc_t p, struct statfs_args *uap, __unused int32_t *retval)
3705 {
3706 int error;
3707 struct mount *mp;
3708 struct nameidata nd;
3709 vfs_context_t ctx = vfs_context_current();
3710 vnode_t vp;
3711
3712 NDINIT(&nd, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
3713 UIO_USERSPACE, uap->path, ctx);
3714 error = namei(&nd);
3715 if (error != 0) {
3716 return error;
3717 }
3718 vp = nd.ni_vp;
3719 mp = vp->v_mount;
3720 nameidone(&nd);
3721
3722 error = statfs_internal(p, mp, uap->buf);
3723 vnode_put(vp);
3724
3725 return error;
3726 }
3727
3728 /*
3729 * Get filesystem statistics.
3730 */
3731 /* ARGSUSED */
3732 int
fstatfs(proc_t p,struct fstatfs_args * uap,__unused int32_t * retval)3733 fstatfs(proc_t p, struct fstatfs_args *uap, __unused int32_t *retval)
3734 {
3735 int error;
3736 vnode_t vp = NULL;
3737 struct mount *mp;
3738
3739 AUDIT_ARG(fd, uap->fd);
3740
3741 if ((error = file_vnode(uap->fd, &vp)) ||
3742 (error = vnode_getwithref(vp))) {
3743 goto out;
3744 }
3745
3746 AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
3747
3748 mp = vp->v_mount;
3749 if (!mp) {
3750 error = EBADF;
3751 goto out_vnode;
3752 }
3753
3754 error = statfs_internal(p, mp, uap->buf);
3755
3756 out_vnode:
3757 vnode_put(vp);
3758
3759 out:
3760 if (vp != NULL) {
3761 file_drop(uap->fd);
3762 }
3763
3764 return error;
3765 }
3766
3767 void
vfs_get_statfs64(struct mount * mp,struct statfs64 * sfs)3768 vfs_get_statfs64(struct mount *mp, struct statfs64 *sfs)
3769 {
3770 struct vfsstatfs *vsfs = &mp->mnt_vfsstat;
3771
3772 bzero(sfs, sizeof(*sfs));
3773
3774 sfs->f_bsize = vsfs->f_bsize;
3775 sfs->f_iosize = (int32_t)vsfs->f_iosize;
3776 sfs->f_blocks = vsfs->f_blocks;
3777 sfs->f_bfree = vsfs->f_bfree;
3778 sfs->f_bavail = vsfs->f_bavail;
3779 sfs->f_files = vsfs->f_files;
3780 sfs->f_ffree = vsfs->f_ffree;
3781 sfs->f_fsid = vsfs->f_fsid;
3782 sfs->f_owner = vsfs->f_owner;
3783 sfs->f_type = mp->mnt_vtable->vfc_typenum;
3784 sfs->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
3785 sfs->f_fssubtype = vsfs->f_fssubtype;
3786 sfs->f_flags_ext = vfs_getextflags(mp);
3787 vfs_getfstypename(mp, sfs->f_fstypename, MFSTYPENAMELEN);
3788 strlcpy(&sfs->f_mntonname[0], &vsfs->f_mntonname[0], MAXPATHLEN);
3789 strlcpy(&sfs->f_mntfromname[0], &vsfs->f_mntfromname[0], MAXPATHLEN);
3790 }
3791
3792 /*
3793 * Get file system statistics in 64-bit mode
3794 */
3795 int
statfs64(__unused struct proc * p,struct statfs64_args * uap,__unused int32_t * retval)3796 statfs64(__unused struct proc *p, struct statfs64_args *uap, __unused int32_t *retval)
3797 {
3798 struct mount *mp;
3799 int error;
3800 struct nameidata *ndp;
3801 struct statfs64 *sfsp;
3802 vfs_context_t ctxp = vfs_context_current();
3803 vnode_t vp;
3804 struct {
3805 struct nameidata nd;
3806 struct statfs64 sfs;
3807 } *__nameidata_statfs64;
3808
3809 __nameidata_statfs64 = kalloc_type(typeof(*__nameidata_statfs64),
3810 Z_WAITOK);
3811 ndp = &__nameidata_statfs64->nd;
3812
3813 NDINIT(ndp, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
3814 UIO_USERSPACE, uap->path, ctxp);
3815 error = namei(ndp);
3816 if (error != 0) {
3817 goto out;
3818 }
3819 vp = ndp->ni_vp;
3820 mp = vp->v_mount;
3821 nameidone(ndp);
3822
3823 #if CONFIG_MACF
3824 error = mac_mount_check_stat(ctxp, mp);
3825 if (error != 0) {
3826 vnode_put(vp);
3827 goto out;
3828 }
3829 #endif
3830
3831 error = vfs_update_vfsstat(mp, ctxp, VFS_USER_EVENT);
3832 if (error != 0) {
3833 vnode_put(vp);
3834 goto out;
3835 }
3836
3837 sfsp = &__nameidata_statfs64->sfs;
3838 vfs_get_statfs64(mp, sfsp);
3839 if ((mp->mnt_kern_flag & MNTK_SYSTEMDATA) &&
3840 (p->p_vfs_iopolicy & P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME)) {
3841 /* This process does not want to see a seperate data volume mountpoint */
3842 strlcpy(&sfsp->f_mntonname[0], "/", sizeof("/"));
3843 }
3844 error = copyout(sfsp, uap->buf, sizeof(*sfsp));
3845 vnode_put(vp);
3846
3847 out:
3848 kfree_type(typeof(*__nameidata_statfs64), __nameidata_statfs64);
3849
3850 return error;
3851 }
3852
3853 /*
3854 * Get file system statistics in 64-bit mode
3855 */
3856 int
fstatfs64(__unused struct proc * p,struct fstatfs64_args * uap,__unused int32_t * retval)3857 fstatfs64(__unused struct proc *p, struct fstatfs64_args *uap, __unused int32_t *retval)
3858 {
3859 struct vnode *vp;
3860 struct mount *mp;
3861 struct statfs64 sfs;
3862 int error;
3863
3864 AUDIT_ARG(fd, uap->fd);
3865
3866 if ((error = file_vnode(uap->fd, &vp))) {
3867 return error;
3868 }
3869
3870 error = vnode_getwithref(vp);
3871 if (error) {
3872 file_drop(uap->fd);
3873 return error;
3874 }
3875
3876 AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
3877
3878 mp = vp->v_mount;
3879 if (!mp) {
3880 error = EBADF;
3881 goto out;
3882 }
3883
3884 #if CONFIG_MACF
3885 error = mac_mount_check_stat(vfs_context_current(), mp);
3886 if (error != 0) {
3887 goto out;
3888 }
3889 #endif
3890
3891 if ((error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)) != 0) {
3892 goto out;
3893 }
3894
3895 vfs_get_statfs64(mp, &sfs);
3896 if ((mp->mnt_kern_flag & MNTK_SYSTEMDATA) &&
3897 (p->p_vfs_iopolicy & P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME)) {
3898 /* This process does not want to see a seperate data volume mountpoint */
3899 strlcpy(&sfs.f_mntonname[0], "/", sizeof("/"));
3900 }
3901 error = copyout(&sfs, uap->buf, sizeof(sfs));
3902
3903 out:
3904 file_drop(uap->fd);
3905 vnode_put(vp);
3906
3907 return error;
3908 }
3909
/* Shared iteration state for the getfsstat*() vfs_iterate callbacks. */
struct getfsstat_struct {
	user_addr_t sfsp;       /* user-buffer cursor for the next statfs record */
	user_addr_t *mp;        /* optional array of user MAC-label buffers */
	int count;              /* number of mounts visited so far */
	int maxcount;           /* records the user buffer can hold */
	int flags;              /* MNT_WAIT / MNT_NOWAIT / MNT_DWAIT from caller */
	int error;              /* first error encountered, if any */
};
3918
3919
/*
 * getfsstat_callback: vfs_iterate() callback for __mac_getfsstat().
 * Copies one munged (32/64-bit) statfs record per mount into the user
 * buffer while space remains; always increments fstp->count so the
 * caller learns the total number of mounts.
 */
static int
getfsstat_callback(mount_t mp, void * arg)
{
	struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
	struct vfsstatfs *sp;
	int error, my_size;
	vfs_context_t ctx = vfs_context_current();

	if (fstp->sfsp && fstp->count < fstp->maxcount) {
#if CONFIG_MACF
		error = mac_mount_check_stat(ctx, mp);
		if (error != 0) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
#endif
		sp = &mp->mnt_vfsstat;
		/*
		 * If MNT_NOWAIT is specified, do not refresh the
		 * fsstat cache. MNT_WAIT/MNT_DWAIT overrides MNT_NOWAIT.
		 */
		if ((mp->mnt_lflag & MNT_LDEAD) ||
		    (((fstp->flags & MNT_NOWAIT) == 0 || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
		    (!(mp->mnt_lflag & MNT_LUNMOUNT)) &&
		    (error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT)))) {
			/* Dead mount or refresh failure: skip this mount. */
			KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
			return VFS_RETURNED;
		}

		/*
		 * Need to handle LP64 version of struct statfs
		 */
		error = munge_statfs(mp, sp, fstp->sfsp, &my_size, IS_64BIT_PROCESS(vfs_context_proc(ctx)), FALSE);
		if (error) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
		/* my_size is the per-record size for this process's ABI. */
		fstp->sfsp += my_size;

		if (fstp->mp) {
#if CONFIG_MACF
			error = mac_mount_label_get(mp, *fstp->mp);
			if (error) {
				fstp->error = error;
				return VFS_RETURNED_DONE;
			}
#endif
			fstp->mp++;
		}
	}
	fstp->count++;
	return VFS_RETURNED;
}
3973
3974 /*
3975 * Get statistics on all filesystems.
3976 */
3977 int
getfsstat(__unused proc_t p,struct getfsstat_args * uap,int * retval)3978 getfsstat(__unused proc_t p, struct getfsstat_args *uap, int *retval)
3979 {
3980 struct __mac_getfsstat_args muap;
3981
3982 muap.buf = uap->buf;
3983 muap.bufsize = uap->bufsize;
3984 muap.mac = USER_ADDR_NULL;
3985 muap.macsize = 0;
3986 muap.flags = uap->flags;
3987
3988 return __mac_getfsstat(p, &muap, retval);
3989 }
3990
3991 /*
3992 * __mac_getfsstat: Get MAC-related file system statistics
3993 *
3994 * Parameters: p (ignored)
3995 * uap User argument descriptor (see below)
3996 * retval Count of file system statistics (N stats)
3997 *
3998 * Indirect: uap->bufsize Buffer size
3999 * uap->macsize MAC info size
4000 * uap->buf Buffer where information will be returned
4001 * uap->mac MAC info
4002 * uap->flags File system flags
4003 *
4004 *
4005 * Returns: 0 Success
4006 * !0 Not success
4007 *
4008 */
/*
 * __mac_getfsstat: Get MAC-related file system statistics
 *
 * Parameters:    p                        (ignored)
 *                uap                      User argument descriptor (see below)
 *                retval                   Count of file system statistics (N stats)
 *
 * Indirect:      uap->bufsize             Buffer size
 *                uap->macsize             MAC info size
 *                uap->buf                 Buffer where information will be returned
 *                uap->mac                 MAC info
 *                uap->flags               File system flags
 *
 * Returns:       0                        Success
 *               !0                        Not success
 */
int
__mac_getfsstat(__unused proc_t p, struct __mac_getfsstat_args *uap, int *retval)
{
	user_addr_t sfsp;
	user_addr_t *mp;
	size_t count, maxcount, bufsize, macsize;
	struct getfsstat_struct fst;

	/* Reject absurd buffer sizes before doing any arithmetic on them. */
	if ((unsigned)uap->bufsize > INT_MAX || (unsigned)uap->macsize > INT_MAX) {
		return EINVAL;
	}

	bufsize = (size_t) uap->bufsize;
	macsize = (size_t) uap->macsize;

	/* How many ABI-appropriate statfs records fit in the user buffer. */
	if (IS_64BIT_PROCESS(p)) {
		maxcount = bufsize / sizeof(struct user64_statfs);
	} else {
		maxcount = bufsize / sizeof(struct user32_statfs);
	}
	sfsp = uap->buf;
	count = 0;

	mp = NULL;

#if CONFIG_MACF
	if (uap->mac != USER_ADDR_NULL) {
		u_int32_t *mp0;
		int error;
		unsigned int i;

		/* The MAC pointer array must have one entry per statfs slot. */
		count = (macsize / (IS_64BIT_PROCESS(p) ? 8 : 4));
		if (count != maxcount) {
			return EINVAL;
		}

		/* Copy in the array */
		mp0 = kalloc_data(macsize, Z_WAITOK);
		if (mp0 == NULL) {
			return ENOMEM;
		}

		error = copyin(uap->mac, mp0, macsize);
		if (error) {
			kfree_data(mp0, macsize);
			return error;
		}

		/* Normalize to an array of user_addr_t */
		mp = kalloc_data(count * sizeof(user_addr_t), Z_WAITOK);
		if (mp == NULL) {
			kfree_data(mp0, macsize);
			return ENOMEM;
		}

		/* Widen 32-bit user pointers; 64-bit ones copy straight across. */
		for (i = 0; i < count; i++) {
			if (IS_64BIT_PROCESS(p)) {
				mp[i] = ((user_addr_t *)mp0)[i];
			} else {
				mp[i] = (user_addr_t)mp0[i];
			}
		}
		kfree_data(mp0, macsize);
	}
#endif


	/* Walk every mount (including ones mid-unmount) gathering stats. */
	fst.sfsp = sfsp;
	fst.mp = mp;
	fst.flags = uap->flags;
	fst.count = 0;
	fst.error = 0;
	fst.maxcount = (int)maxcount;


	vfs_iterate(VFS_ITERATE_NOSKIP_UNMOUNT, getfsstat_callback, &fst);

	if (mp) {
		kfree_data(mp, count * sizeof(user_addr_t));
	}

	if (fst.error) {
		KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
		return fst.error;
	}

	/* If the buffer filled, report its capacity rather than the total. */
	if (fst.sfsp && fst.count > fst.maxcount) {
		*retval = fst.maxcount;
	} else {
		*retval = fst.count;
	}
	return 0;
}
4102
/*
 * getfsstat64_callback: vfs_iterate() callback for getfsstat64().
 * Copies one struct statfs64 per mount into the user buffer while space
 * remains; always increments fstp->count so the caller learns the total
 * number of mounts.
 */
static int
getfsstat64_callback(mount_t mp, void * arg)
{
	struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
	struct vfsstatfs *sp;
	struct statfs64 sfs;
	int error;

	if (fstp->sfsp && fstp->count < fstp->maxcount) {
#if CONFIG_MACF
		error = mac_mount_check_stat(vfs_context_current(), mp);
		if (error != 0) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
#endif
		sp = &mp->mnt_vfsstat;
		/*
		 * If MNT_NOWAIT is specified, do not refresh the fsstat
		 * cache. MNT_WAIT overrides MNT_NOWAIT.
		 *
		 * We treat MNT_DWAIT as MNT_WAIT for all instances of
		 * getfsstat, since the constants are out of the same
		 * namespace.
		 */
		if ((mp->mnt_lflag & MNT_LDEAD) ||
		    ((((fstp->flags & MNT_NOWAIT) == 0) || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
		    (!(mp->mnt_lflag & MNT_LUNMOUNT)) &&
		    (error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)))) {
			/* Dead mount or refresh failure: skip this mount. */
			KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
			return VFS_RETURNED;
		}

		vfs_get_statfs64(mp, &sfs);
		error = copyout(&sfs, fstp->sfsp, sizeof(sfs));
		if (error) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
		fstp->sfsp += sizeof(sfs);
	}
	fstp->count++;
	return VFS_RETURNED;
}
4147
4148 /*
4149 * Get statistics on all file systems in 64 bit mode.
4150 */
4151 int
getfsstat64(__unused proc_t p,struct getfsstat64_args * uap,int * retval)4152 getfsstat64(__unused proc_t p, struct getfsstat64_args *uap, int *retval)
4153 {
4154 user_addr_t sfsp;
4155 int count, maxcount;
4156 struct getfsstat_struct fst;
4157
4158 maxcount = uap->bufsize / sizeof(struct statfs64);
4159
4160 sfsp = uap->buf;
4161 count = 0;
4162
4163 fst.sfsp = sfsp;
4164 fst.flags = uap->flags;
4165 fst.count = 0;
4166 fst.error = 0;
4167 fst.maxcount = maxcount;
4168
4169 vfs_iterate(VFS_ITERATE_NOSKIP_UNMOUNT, getfsstat64_callback, &fst);
4170
4171 if (fst.error) {
4172 KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
4173 return fst.error;
4174 }
4175
4176 if (fst.sfsp && fst.count > fst.maxcount) {
4177 *retval = fst.maxcount;
4178 } else {
4179 *retval = fst.count;
4180 }
4181
4182 return 0;
4183 }
4184
4185 /*
4186 * gets the associated vnode with the file descriptor passed.
4187 * as input
4188 *
4189 * INPUT
4190 * ctx - vfs context of caller
4191 * fd - file descriptor for which vnode is required.
4192 * vpp - Pointer to pointer to vnode to be returned.
4193 *
4194 * The vnode is returned with an iocount so any vnode obtained
4195 * by this call needs a vnode_put
4196 *
4197 */
4198 int
vnode_getfromfd(vfs_context_t ctx,int fd,vnode_t * vpp)4199 vnode_getfromfd(vfs_context_t ctx, int fd, vnode_t *vpp)
4200 {
4201 int error;
4202 vnode_t vp;
4203 struct fileproc *fp;
4204 proc_t p = vfs_context_proc(ctx);
4205
4206 *vpp = NULLVP;
4207
4208 error = fp_getfvp(p, fd, &fp, &vp);
4209 if (error) {
4210 return error;
4211 }
4212
4213 error = vnode_getwithref(vp);
4214 if (error) {
4215 (void)fp_drop(p, fd, fp, 0);
4216 return error;
4217 }
4218
4219 (void)fp_drop(p, fd, fp, 0);
4220 *vpp = vp;
4221 return error;
4222 }
4223
4224 int
vnode_getfromid(int volfs_id,uint64_t objid,vfs_context_t ctx,int realfsid,vnode_t * vpp)4225 vnode_getfromid(int volfs_id, uint64_t objid, vfs_context_t ctx, int realfsid, vnode_t *vpp)
4226 {
4227 int error = 0;
4228 vnode_t vp = NULLVP;
4229 struct mount *mp = NULL;
4230
4231 if ((mp = mount_lookupby_volfsid(volfs_id, 1)) == NULL) {
4232 error = ENOTSUP; /* unexpected failure */
4233 return ENOTSUP;
4234 }
4235
4236 #if CONFIG_UNION_MOUNTS
4237 unionget:
4238 #endif /* CONFIG_UNION_MOUNTS */
4239 if (objid == 2) {
4240 struct vfs_attr vfsattr;
4241 int use_vfs_root = TRUE;
4242
4243 VFSATTR_INIT(&vfsattr);
4244 VFSATTR_WANTED(&vfsattr, f_capabilities);
4245 if (!realfsid &&
4246 vfs_getattr(mp, &vfsattr, vfs_context_kernel()) == 0 &&
4247 VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
4248 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS) &&
4249 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS)) {
4250 use_vfs_root = FALSE;
4251 }
4252 }
4253
4254 if (use_vfs_root) {
4255 error = VFS_ROOT(mp, &vp, ctx);
4256 } else {
4257 error = VFS_VGET(mp, objid, &vp, ctx);
4258 }
4259 } else {
4260 error = VFS_VGET(mp, (ino64_t)objid, &vp, ctx);
4261 }
4262
4263 #if CONFIG_UNION_MOUNTS
4264 if (error == ENOENT && (mp->mnt_flag & MNT_UNION)) {
4265 /*
4266 * If the fileid isn't found and we're in a union
4267 * mount volume, then see if the fileid is in the
4268 * mounted-on volume.
4269 */
4270 struct mount *tmp = mp;
4271 mp = vnode_mount(tmp->mnt_vnodecovered);
4272 vfs_unbusy(tmp);
4273 if (vfs_busy(mp, LK_NOWAIT) == 0) {
4274 goto unionget;
4275 }
4276 } else {
4277 vfs_unbusy(mp);
4278 }
4279 #else
4280 vfs_unbusy(mp);
4281 #endif /* CONFIG_UNION_MOUNTS */
4282
4283 if (!error) {
4284 *vpp = vp;
4285 }
4286
4287 return error;
4288 }
4289
/*
 * Wrapper function around namei to start lookup from a directory
 * specified by a file descriptor ni_dirfd.
 *
 * In addition to all the errors returned by namei, this call can
 * return ENOTDIR if the file descriptor does not refer to a directory.
 * and EBADF if the file descriptor is not valid.
 */
int
nameiat(struct nameidata *ndp, int dirfd)
{
	/*
	 * Only honor dirfd for a fresh, relative lookup: skip it when the
	 * caller passed AT_FDCWD, when this is the continuation of an
	 * earlier lookup, or when a starting vnode was already supplied
	 * (USEDVP).
	 */
	if ((dirfd != AT_FDCWD) &&
	    !(ndp->ni_flag & NAMEI_CONTLOOKUP) &&
	    !(ndp->ni_cnd.cn_flags & USEDVP)) {
		int error = 0;
		char c;

		/* Peek at the first path byte to distinguish absolute paths. */
		if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
			error = copyin(ndp->ni_dirp, &c, sizeof(char));
			if (error) {
				return error;
			}
		} else {
			c = *((char *)(ndp->ni_dirp));
		}

		if (c != '/') {
			vnode_t dvp_at;

			/* Relative path: resolve dirfd to the starting directory. */
			error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
			    &dvp_at);
			if (error) {
				return error;
			}

			if (vnode_vtype(dvp_at) != VDIR) {
				vnode_put(dvp_at);
				return ENOTDIR;
			}

			/* Run namei() rooted at dirfd's vnode via USEDVP. */
			ndp->ni_dvp = dvp_at;
			ndp->ni_cnd.cn_flags |= USEDVP;
			error = namei(ndp);
			ndp->ni_cnd.cn_flags &= ~USEDVP;
			vnode_put(dvp_at);
			return error;
		}
	}

	/* Absolute path (or AT_FDCWD): plain namei() lookup. */
	return namei(ndp);
}
4341
/*
 * Change current working directory to a given file descriptor.
 *
 * Shared worker for fchdir(2) and the per-thread CWD facility.  When
 * 'per_thread' is set, the new directory is installed in the calling
 * uthread (uu_cdir) rather than the process; fd == -1 then means
 * "revert to the per-process CWD".
 */
/* ARGSUSED */
int
fchdir(proc_t p, vfs_context_t ctx, int fd, bool per_thread)
{
	vnode_t vp;
	vnode_t tdp;
	vnode_t tvp;
	struct mount *mp;
	int error, should_put = 1;

	AUDIT_ARG(fd, fd);
	if (per_thread && fd == -1) {
		/*
		 * Switching back from per-thread to per process CWD; verify we
		 * in fact have one before proceeding.  The only success case
		 * for this code path is to return 0 preemptively after zapping
		 * the thread structure contents.
		 */
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = NULLVP;
			if (tvp != NULLVP) {
				vnode_rele(tvp);
				return 0;
			}
		}
		return EBADF;
	}

	if ((error = file_vnode(fd, &vp))) {
		return error;
	}
	if ((error = vnode_getwithref(vp))) {
		file_drop(fd);
		return error;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	if (vp->v_type != VDIR) {
		error = ENOTDIR;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_chdir(ctx, vp);
	if (error) {
		goto out;
	}
#endif
	/* The caller must be able to search the new directory. */
	error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
	if (error) {
		goto out;
	}

	/*
	 * If the fd's directory is a mount point, descend to the root of
	 * whatever is mounted there (repeatedly, for stacked mounts).
	 */
	while (!error && (mp = vp->v_mountedhere) != NULL) {
		if (vfs_busy(mp, LK_NOWAIT)) {
			error = EACCES;
			goto out;
		}
		error = VFS_ROOT(mp, &tdp, ctx);
		vfs_unbusy(mp);
		if (error) {
			break;
		}
		vnode_put(vp);
		vp = tdp;
	}
	if (error) {
		goto out;
	}
	/* Trade the iocount for a long-term usecount on the new CWD. */
	if ((error = vnode_ref(vp))) {
		goto out;
	}
	vnode_put(vp);
	should_put = 0;

	if (per_thread) {
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = vp;
			OSBitOrAtomic(P_THCWD, &p->p_flag);
		} else {
			vnode_rele(vp);
			error = ENOENT;
			goto out;
		}
	} else {
		/* Swap the process CWD under the dirs + fd locks. */
		proc_dirs_lock_exclusive(p);
		proc_fdlock(p);
		tvp = p->p_fd.fd_cdir;
		p->p_fd.fd_cdir = vp;
		proc_fdunlock(p);
		proc_dirs_unlock_exclusive(p);
	}

	/* Release the usecount on the previous CWD, if any. */
	if (tvp) {
		vnode_rele(tvp);
	}

out:
	if (should_put) {
		vnode_put(vp);
	}
	file_drop(fd);

	return error;
}
4457
4458 int
sys_fchdir(proc_t p,struct fchdir_args * uap,__unused int32_t * retval)4459 sys_fchdir(proc_t p, struct fchdir_args *uap, __unused int32_t *retval)
4460 {
4461 return fchdir(p, vfs_context_current(), uap->fd, false);
4462 }
4463
4464 int
__pthread_fchdir(proc_t p,struct __pthread_fchdir_args * uap,__unused int32_t * retval)4465 __pthread_fchdir(proc_t p, struct __pthread_fchdir_args *uap, __unused int32_t *retval)
4466 {
4467 return fchdir(p, vfs_context_current(), uap->fd, true);
4468 }
4469
4470
/*
 * Change current working directory (".").
 *
 * Performs the lookup described by *ndp (already initialized by the
 * caller), then installs the resulting directory as the CWD —
 * per-thread (uu_cdir) when per_thread is non-zero, otherwise
 * process-wide (fd_cdir).
 *
 * Returns: 0 Success
 * change_dir:ENOTDIR
 * change_dir:???
 * vnode_ref:ENOENT No such file or directory
 */
/* ARGSUSED */
int
chdir_internal(proc_t p, vfs_context_t ctx, struct nameidata *ndp, int per_thread)
{
	int error;
	vnode_t tvp;

	error = change_dir(ndp, ctx);
	if (error) {
		return error;
	}
	/* Hold the new CWD with a persistent usecount before dropping the iocount. */
	if ((error = vnode_ref(ndp->ni_vp))) {
		vnode_put(ndp->ni_vp);
		return error;
	}
	/*
	 * drop the iocount we picked up in change_dir
	 */
	vnode_put(ndp->ni_vp);

	if (per_thread) {
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = ndp->ni_vp;
			OSBitOrAtomic(P_THCWD, &p->p_flag);
		} else {
			/* No thread context: undo the usecount and fail. */
			vnode_rele(ndp->ni_vp);
			return ENOENT;
		}
	} else {
		/* Swap in the new CWD under the proc dirs/fd locks. */
		proc_dirs_lock_exclusive(p);
		proc_fdlock(p);
		tvp = p->p_fd.fd_cdir;
		p->p_fd.fd_cdir = ndp->ni_vp;
		proc_fdunlock(p);
		proc_dirs_unlock_exclusive(p);
	}

	/* Release the usecount held on the previous CWD, if any. */
	if (tvp) {
		vnode_rele(tvp);
	}

	return 0;
}
4525
4526
4527 /*
4528 * Change current working directory (".").
4529 *
4530 * Returns: 0 Success
4531 * chdir_internal:ENOTDIR
4532 * chdir_internal:ENOENT No such file or directory
4533 * chdir_internal:???
4534 */
4535 /* ARGSUSED */
4536 static int
common_chdir(proc_t p,struct chdir_args * uap,int per_thread)4537 common_chdir(proc_t p, struct chdir_args *uap, int per_thread)
4538 {
4539 struct nameidata nd;
4540 vfs_context_t ctx = vfs_context_current();
4541
4542 NDINIT(&nd, LOOKUP, OP_CHDIR, FOLLOW | AUDITVNPATH1,
4543 UIO_USERSPACE, uap->path, ctx);
4544
4545 return chdir_internal(p, ctx, &nd, per_thread);
4546 }
4547
4548
/*
 * chdir
 *
 * Change current working directory (".") for the entire process
 *
 * Parameters: p Process requesting the call
 * uap User argument descriptor (see below)
 * retval (ignored)
 *
 * Indirect parameters: uap->path Directory path
 *
 * Returns: 0 Success
 * common_chdir: ENOTDIR
 * common_chdir: ENOENT No such file or directory
 * common_chdir: ???
 *
 */
int
sys_chdir(proc_t p, struct chdir_args *uap, __unused int32_t *retval)
{
	/* per_thread == 0 selects the process-wide CWD. */
	return common_chdir(p, (void *)uap, 0);
}
4571
/*
 * __pthread_chdir
 *
 * Change current working directory (".") for a single thread
 *
 * Parameters: p Process requesting the call
 * uap User argument descriptor (see below)
 * retval (ignored)
 *
 * Indirect parameters: uap->path Directory path
 *
 * Returns: 0 Success
 * common_chdir: ENOTDIR
 * common_chdir: ENOENT No such file or directory
 * common_chdir: ???
 *
 */
int
__pthread_chdir(proc_t p, struct __pthread_chdir_args *uap, __unused int32_t *retval)
{
	/*
	 * NOTE(review): the cast assumes struct __pthread_chdir_args is
	 * layout-compatible with struct chdir_args — confirm if either changes.
	 */
	return common_chdir(p, (void *)uap, 1);
}
4594
4595
/*
 * Change notion of root (``/'') directory.
 *
 * Requires superuser.  On success the new root holds a persistent
 * usecount in fdp->fd_rdir and the previous root's usecount (if any)
 * is released.
 */
/* ARGSUSED */
int
chroot(proc_t p, struct chroot_args *uap, __unused int32_t *retval)
{
	struct filedesc *fdp = &p->p_fd;
	int error;
	struct nameidata nd;
	vnode_t tvp;
	vfs_context_t ctx = vfs_context_current();

	/* chroot(2) is restricted to the superuser. */
	if ((error = suser(kauth_cred_get(), &p->p_acflag))) {
		return error;
	}

	NDINIT(&nd, LOOKUP, OP_CHROOT, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	/* change_dir returns with an iocount on nd.ni_vp on success. */
	error = change_dir(&nd, ctx);
	if (error) {
		return error;
	}

#if CONFIG_MACF
	error = mac_vnode_check_chroot(ctx, nd.ni_vp,
	    &nd.ni_cnd);
	if (error) {
		vnode_put(nd.ni_vp);
		return error;
	}
#endif

	/* Trade the transient iocount for a persistent usecount. */
	if ((error = vnode_ref(nd.ni_vp))) {
		vnode_put(nd.ni_vp);
		return error;
	}
	vnode_put(nd.ni_vp);

	/*
	 * This lock provides the guarantee that as long as you hold the lock
	 * fdp->fd_rdir has a usecount on it. This is used to take an iocount
	 * on a referenced vnode in namei when determining the rootvnode for
	 * a process.
	 */
	/* needed for synchronization with lookup */
	proc_dirs_lock_exclusive(p);
	/* needed for setting the flag and other activities on the fd itself */
	proc_fdlock(p);
	tvp = fdp->fd_rdir;
	fdp->fd_rdir = nd.ni_vp;
	fdt_flag_set(fdp, FD_CHROOT);
	proc_fdunlock(p);
	proc_dirs_unlock_exclusive(p);

	/* Drop the usecount on the previous root, if there was one. */
	if (tvp != NULL) {
		vnode_rele(tvp);
	}

	return 0;
}
4657
4658 #define PATHSTATICBUFLEN 256
4659 #define PIVOT_ROOT_ENTITLEMENT \
4660 "com.apple.private.vfs.pivot-root"
4661
4662 #if defined(XNU_TARGET_OS_OSX)
/*
 * pivot_root(2): switch the system root file system to the volume at
 * new_rootfs_path_before (presumably relocating the old root to
 * old_rootfs_path_after — see vfs_switch_root() for the exact
 * semantics).  Restricted to launchd (pid 1) holding the pivot-root
 * entitlement; the incoming root must pass kernel authentication.
 */
int
pivot_root(proc_t p, struct pivot_root_args *uap, __unused int *retval)
{
	int error;
	/* Small on-stack path buffers; spilled to ZV_NAMEI heap buffers on ENAMETOOLONG. */
	char new_rootfs_path_before[PATHSTATICBUFLEN] = {0};
	char old_rootfs_path_after[PATHSTATICBUFLEN] = {0};
	char *new_rootfs_path_before_buf = NULL;
	char *old_rootfs_path_after_buf = NULL;
	char *incoming = NULL;
	char *outgoing = NULL;
	vnode_t incoming_rootvp = NULLVP;
	size_t bytes_copied;

	/*
	 * XXX : Additional restrictions needed
	 * - perhaps callable only once.
	 */
	if ((error = suser(kauth_cred_get(), &p->p_acflag))) {
		return error;
	}

	/*
	 * pivot_root can be executed by launchd only.
	 * Enforce entitlement.
	 */
	if ((proc_getpid(p) != 1) || !IOCurrentTaskHasEntitlement(PIVOT_ROOT_ENTITLEMENT)) {
		return EPERM;
	}

	/* Copy in the new-root path, retrying with a MAXPATHLEN heap buffer. */
	error = copyinstr(uap->new_rootfs_path_before, &new_rootfs_path_before[0], PATHSTATICBUFLEN, &bytes_copied);
	if (error == ENAMETOOLONG) {
		new_rootfs_path_before_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
		error = copyinstr(uap->new_rootfs_path_before, new_rootfs_path_before_buf, MAXPATHLEN, &bytes_copied);
	}

	if (error) {
		goto out;
	}

	/* Same two-step copy for the path where the old root will live. */
	error = copyinstr(uap->old_rootfs_path_after, &old_rootfs_path_after[0], PATHSTATICBUFLEN, &bytes_copied);
	if (error == ENAMETOOLONG) {
		old_rootfs_path_after_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
		error = copyinstr(uap->old_rootfs_path_after, old_rootfs_path_after_buf, MAXPATHLEN, &bytes_copied);
	}
	if (error) {
		goto out;
	}

	if (new_rootfs_path_before_buf) {
		incoming = new_rootfs_path_before_buf;
	} else {
		incoming = &new_rootfs_path_before[0];
	}

	if (old_rootfs_path_after_buf) {
		outgoing = old_rootfs_path_after_buf;
	} else {
		outgoing = &old_rootfs_path_after[0];
	}

	/*
	 * The proposed incoming FS MUST be authenticated (i.e. not a chunklist DMG).
	 * Userland is not allowed to pivot to an image.
	 */
	error = vnode_lookup(incoming, 0, &incoming_rootvp, vfs_context_kernel());
	if (error) {
		goto out;
	}
	error = VNOP_IOCTL(incoming_rootvp, FSIOC_KERNEL_ROOTAUTH, NULL, 0, vfs_context_kernel());
	if (error) {
		goto out;
	}

	error = vfs_switch_root(incoming, outgoing, VFSSR_VIRTUALDEV_PROHIBITED);

out:
	/* Common cleanup: drop the lookup iocount and free any heap paths. */
	if (incoming_rootvp != NULLVP) {
		vnode_put(incoming_rootvp);
		incoming_rootvp = NULLVP;
	}

	if (old_rootfs_path_after_buf) {
		zfree(ZV_NAMEI, old_rootfs_path_after_buf);
	}

	if (new_rootfs_path_before_buf) {
		zfree(ZV_NAMEI, new_rootfs_path_before_buf);
	}

	return error;
}
4754 #else
/* pivot_root(2) is only implemented on macOS targets; reject elsewhere. */
int
pivot_root(proc_t p, __unused struct pivot_root_args *uap, int *retval)
{
	return nosys(p, NULL, retval);
}
4760 #endif /* XNU_TARGET_OS_OSX */
4761
4762 /*
4763 * Common routine for chroot and chdir.
4764 *
4765 * Returns: 0 Success
4766 * ENOTDIR Not a directory
4767 * namei:??? [anything namei can return]
4768 * vnode_authorize:??? [anything vnode_authorize can return]
4769 */
4770 static int
change_dir(struct nameidata * ndp,vfs_context_t ctx)4771 change_dir(struct nameidata *ndp, vfs_context_t ctx)
4772 {
4773 vnode_t vp;
4774 int error;
4775
4776 if ((error = namei(ndp))) {
4777 return error;
4778 }
4779 nameidone(ndp);
4780 vp = ndp->ni_vp;
4781
4782 if (vp->v_type != VDIR) {
4783 vnode_put(vp);
4784 return ENOTDIR;
4785 }
4786
4787 #if CONFIG_MACF
4788 error = mac_vnode_check_chdir(ctx, vp);
4789 if (error) {
4790 vnode_put(vp);
4791 return error;
4792 }
4793 #endif
4794
4795 error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
4796 if (error) {
4797 vnode_put(vp);
4798 return error;
4799 }
4800
4801 return error;
4802 }
4803
/*
 * Allocate and initialize the vnode data (for directories) associated
 * with a file glob; released by fg_vn_data_free().
 */
struct fd_vn_data *
fg_vn_data_alloc(void)
{
	struct fd_vn_data *fvdata;

	/* Allocate per fd vnode data */
	fvdata = kalloc_type(struct fd_vn_data, Z_WAITOK | Z_ZERO);
	lck_mtx_init(&fvdata->fv_lock, &fd_vn_lck_grp, &fd_vn_lck_attr);
	return fvdata;
}
4817
/*
 * Free the vnode data (for directories) associated with the file glob.
 */
void
fg_vn_data_free(void *fgvndata)
{
	struct fd_vn_data *fvdata = (struct fd_vn_data *)fgvndata;

	/*
	 * Release the lazily-allocated buffer first (NOTE(review): assumes
	 * kfree_data tolerates a NULL fv_buf with size 0 — confirm).
	 */
	kfree_data(fvdata->fv_buf, fvdata->fv_bufallocsiz);
	lck_mtx_destroy(&fvdata->fv_lock, &fd_vn_lck_grp);
	kfree_type(struct fd_vn_data, fvdata);
}
4830
/*
 * Check permissions, allocate an open file structure,
 * and call the device open routine if any.
 *
 * Core of the open(2) family: the caller supplies an initialized
 * nameidata (path + lookup flags), open flags, and creation attributes.
 * On success a new descriptor index is stored in *retval; authfd, when
 * not AUTH_OPEN_NOAUTHFD, names an already-open file used to authorize
 * the open (see vn_open_auth).
 *
 * Returns: 0 Success
 * EINVAL
 * EINTR
 * falloc:ENFILE
 * falloc:EMFILE
 * falloc:ENOMEM
 * vn_open_auth:???
 * dupfdopen:???
 * VNOP_ADVLOCK:???
 * vnode_setsize:???
 *
 * XXX Need to implement uid, gid
 */
int
open1(vfs_context_t ctx, struct nameidata *ndp, int uflags,
    struct vnode_attr *vap, fp_initfn_t fp_init, void *initarg, int32_t *retval, int authfd)
{
	proc_t p = vfs_context_proc(ctx);
	kauth_cred_t p_cred = current_cached_proc_cred(PROC_NULL);
	uthread_t uu = get_bsdthread_info(vfs_context_thread(ctx));
	struct fileproc *fp;
	vnode_t vp;
	int flags, oflags, amode;
	int type, indx, error;
	struct vfs_context context;
	vnode_t authvp = NULLVP;

	oflags = uflags;

	amode = oflags & O_ACCMODE;
	/*
	 * Because O_RDONLY is 0, it is not possible to distinguish between
	 * O_EXEC | O_RDONLY and O_EXEC, therefore FEXEC/FSEARCH can't be set together
	 * with FREAD/FWRITE.
	 */
	if ((amode == O_ACCMODE) || (amode && (oflags & O_EXEC))) {
		return EINVAL;
	}

	flags = FFLAGS(uflags);
	/* These flags are kernel-internal; never honor them from userspace. */
	CLR(flags, FENCRYPTED);
	CLR(flags, FUNENCRYPTED);

	AUDIT_ARG(fflags, oflags);
	AUDIT_ARG(mode, vap->va_mode);

	/* Reserve the descriptor slot and fileproc before doing the open. */
	if ((error = falloc_withinit(p, p_cred, ctx, &fp, &indx, fp_init, initarg)) != 0) {
		return error;
	}
	if (flags & O_CLOEXEC) {
		fp->fp_flags |= FP_CLOEXEC;
	}
	if (flags & O_CLOFORK) {
		fp->fp_flags |= FP_CLOFORK;
	}

	/* setup state to recognize when fdesc_open was called */
	uu->uu_dupfd = -1;

	/*
	 * Disable read/write access if file is opened with O_EVTONLY and
	 * the process has requested to deny read/write access.
	 */
	if ((flags & O_EVTONLY) && proc_disallow_rw_for_o_evtonly(p)) {
		flags &= ~(FREAD | FWRITE);
	}

	/* Resolve the authorization fd (if any) to a vnode with an iocount. */
	if (authfd != AUTH_OPEN_NOAUTHFD) {
		error = vnode_getfromfd(ctx, authfd, &authvp);
		if (error) {
			fp_free(p, indx, fp);
			return error;
		}
	}

	if ((error = vn_open_auth(ndp, &flags, vap, authvp))) {
		if (authvp != NULLVP) {
			vnode_put(authvp);
		}
		/* uu_dupfd >= 0 means fdesc_open redirected us to an existing fd. */
		if ((error == ENODEV || error == ENXIO) && (uu->uu_dupfd >= 0)) {
			if ((error = dupfdopen(p, indx, uu->uu_dupfd, flags, error)) == 0) {
				*retval = indx;
				return 0;
			}
		}
		if (error == ERESTART) {
			error = EINTR;
		}
		fp_free(p, indx, fp);
		return error;
	}

	if (authvp != NULLVP) {
		vnode_put(authvp);
	}

	uu->uu_dupfd = 0;
	/* vn_open_auth succeeded: ndp->ni_vp holds the opened vnode. */
	vp = ndp->ni_vp;

	fp->fp_glob->fg_flag = flags & (FMASK | O_EVTONLY | FENCRYPTED | FUNENCRYPTED);
	fp->fp_glob->fg_ops = &vnops;
	fp_set_data(fp, vp);

#if CONFIG_FILE_LEASES
	/*
	 * If we are creating a file or open with truncate, we need to break the
	 * lease if there is a read lease placed on the parent dir.
	 */
	if ((vnode_vtype(vp) == VREG) && (flags & (O_CREAT | O_TRUNC))) {
		vnode_breakdirlease(vp, true, oflags);
	}
	/* Now check if there is a lease placed on the file itself. */
	error = vnode_breaklease(vp, oflags, ctx);
	if (error) {
		goto bad;
	}
#endif /* CONFIG_FILE_LEASES */

	/* Apply flock(2)-style locking requested at open time. */
	if (flags & (O_EXLOCK | O_SHLOCK)) {
		struct flock lf = {
			.l_whence = SEEK_SET,
		};

		if (flags & O_EXLOCK) {
			lf.l_type = F_WRLCK;
		} else {
			lf.l_type = F_RDLCK;
		}
		type = F_FLOCK;
		if ((flags & FNONBLOCK) == 0) {
			type |= F_WAIT;
		}
#if CONFIG_MACF
		error = mac_file_check_lock(vfs_context_ucred(ctx), fp->fp_glob,
		    F_SETLK, &lf);
		if (error) {
			goto bad;
		}
#endif
		if ((error = VNOP_ADVLOCK(vp, (caddr_t)fp->fp_glob, F_SETLK, &lf, type, ctx, NULL))) {
			goto bad;
		}
		fp->fp_glob->fg_flag |= FWASLOCKED;
	}

	/* try to truncate by setting the size attribute */
	if (flags & O_TRUNC) {
		if ((error = vnode_setsize(vp, (off_t)0, 0, ctx)) != 0) {
			goto bad;
		}
		fp->fp_glob->fg_flag |= FWASWRITTEN;
	}

	/*
	 * For directories we hold some additional information in the fd.
	 */
	if (vnode_vtype(vp) == VDIR) {
		fp->fp_glob->fg_vn_data = fg_vn_data_alloc();
	} else {
		fp->fp_glob->fg_vn_data = NULL;
	}

#if CONFIG_SECLUDED_MEMORY
	/*
	 * Decide whether this file's pages are eligible for the secluded
	 * pool, based on writability and on hard-coded path/name heuristics.
	 */
	if (secluded_for_filecache && vnode_vtype(vp) == VREG) {
		memory_object_control_t moc;
		const char *v_name;

		moc = ubc_getobject(vp, UBC_FLAGS_NONE);

		if (moc == MEMORY_OBJECT_CONTROL_NULL) {
			/* nothing to do... */
		} else if (fp->fp_glob->fg_flag & FWRITE) {
			/* writable -> no longer eligible for secluded pages */
			memory_object_mark_eligible_for_secluded(moc,
			    FALSE);
		} else if (secluded_for_filecache == SECLUDED_FILECACHE_APPS) {
			char pathname[32] = { 0, };
			size_t copied;
			/* XXX FBDP: better way to detect /Applications/ ? */
			if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
				(void)copyinstr(ndp->ni_dirp,
				    pathname,
				    sizeof(pathname),
				    &copied);
			} else {
				copystr(CAST_DOWN(void *, ndp->ni_dirp),
				    pathname,
				    sizeof(pathname),
				    &copied);
			}
			pathname[sizeof(pathname) - 1] = '\0';
			if (strncmp(pathname,
			    "/Applications/",
			    strlen("/Applications/")) == 0 &&
			    strncmp(pathname,
			    "/Applications/Camera.app/",
			    strlen("/Applications/Camera.app/")) != 0) {
				/*
				 * not writable
				 * AND from "/Applications/"
				 * AND not from "/Applications/Camera.app/"
				 * ==> eligible for secluded
				 */
				memory_object_mark_eligible_for_secluded(moc,
				    TRUE);
			}
		} else if (secluded_for_filecache == SECLUDED_FILECACHE_RDONLY &&
		    (v_name = vnode_getname(vp))) {
			size_t len = strlen(v_name);

			if (!strncmp(v_name, "dyld", len) ||
			    !strncmp(v_name, "launchd", len) ||
			    !strncmp(v_name, "Camera", len) ||
			    !strncmp(v_name, "SpringBoard", len) ||
			    !strncmp(v_name, "backboardd", len) ||
			    !strncmp(v_name, "cameracaptured", len)) {
				/*
				 * This file matters when launching Camera:
				 * do not store its contents in the secluded
				 * pool that will be drained on Camera launch.
				 */
				memory_object_mark_eligible_for_secluded(moc,
				    FALSE);
			} else if (!strncmp(v_name, "audiomxd", len) ||
			    !strncmp(v_name, "mediaplaybackd", len)) {
				memory_object_mark_eligible_for_secluded(moc,
				    FALSE);
				memory_object_mark_for_realtime(moc,
				    true);
			} else if (!strncmp(v_name, "bluetoothd", len)) {
				/*
				 * bluetoothd might be needed for realtime audio
				 * playback.
				 */
				memory_object_mark_eligible_for_secluded(moc,
				    FALSE);
				memory_object_mark_for_realtime(moc,
				    true);
			} else {
				char pathname[64] = { 0, };
				size_t copied;
				if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
					(void)copyinstr(ndp->ni_dirp,
					    pathname,
					    sizeof(pathname),
					    &copied);
				} else {
					copystr(CAST_DOWN(void *, ndp->ni_dirp),
					    pathname,
					    sizeof(pathname),
					    &copied);
				}
				pathname[sizeof(pathname) - 1] = '\0';
				if (strncmp(pathname,
				    "/Library/Audio/Plug-Ins/",
				    strlen("/Library/Audio/Plug-Ins/")) == 0 ||
				    strncmp(pathname,
				    "/System/Library/Audio/Plug-Ins/",
				    strlen("/System/Library/Audio/Plug-Ins/")) == 0) {
					/*
					 * This may be an audio plugin required
					 * for realtime playback.
					 * ==> NOT eligible for secluded.
					 */
					memory_object_mark_eligible_for_secluded(moc,
					    FALSE);
					memory_object_mark_for_realtime(moc,
					    true);
				}
			}
			vnode_putname(v_name);
		}
	}
#endif /* CONFIG_SECLUDED_MEMORY */

	vnode_put(vp);

	/*
	 * The first terminal open (without a O_NOCTTY) by a session leader
	 * results in it being set as the controlling terminal.
	 *
	 * NOTE(review): vp's iocount was dropped just above; the fileglob
	 * still references vp (fp_set_data), which appears to keep it valid
	 * for this check — confirm against vn_open_auth's reference rules.
	 */
	if (vnode_istty(vp) && !(p->p_flag & P_CONTROLT) &&
	    !(flags & O_NOCTTY)) {
		int tmp = 0;

		(void)(*fp->fp_glob->fg_ops->fo_ioctl)(fp, (int)TIOCSCTTY,
		    (caddr_t)&tmp, ctx);
	}

	/* Publish the descriptor: make the reserved slot visible. */
	proc_fdlock(p);
	procfdtbl_releasefd(p, indx, NULL);

	fp_drop(p, indx, fp, 1);
	proc_fdunlock(p);

	*retval = indx;

	return 0;
bad:
	/* Error after the vnode was opened: unlock, close, and free the fd. */
	context = *vfs_context_current();
	context.vc_ucred = fp->fp_glob->fg_cred;

	if ((fp->fp_glob->fg_flag & FWASLOCKED) &&
	    (FILEGLOB_DTYPE(fp->fp_glob) == DTYPE_VNODE)) {
		struct flock lf = {
			.l_whence = SEEK_SET,
			.l_type = F_UNLCK,
		};

		(void)VNOP_ADVLOCK(
			vp, (caddr_t)fp->fp_glob, F_UNLCK, &lf, F_FLOCK, ctx, NULL);
	}

	vn_close(vp, fp->fp_glob->fg_flag, &context);
	vnode_put(vp);
	fp_free(p, indx, fp);

	return error;
}
5154
5155 /*
5156 * While most of the *at syscall handlers can call nameiat() which
5157 * is a wrapper around namei, the use of namei and initialisation
5158 * of nameidata are far removed and in different functions - namei
5159 * gets called in vn_open_auth for open1. So we'll just do here what
5160 * nameiat() does.
5161 */
5162 static int
open1at(vfs_context_t ctx,struct nameidata * ndp,int uflags,struct vnode_attr * vap,fp_initfn_t fp_init,void * initarg,int32_t * retval,int dirfd,int authfd)5163 open1at(vfs_context_t ctx, struct nameidata *ndp, int uflags,
5164 struct vnode_attr *vap, fp_initfn_t fp_init, void *initarg, int32_t *retval,
5165 int dirfd, int authfd)
5166 {
5167 if ((dirfd != AT_FDCWD) && !(ndp->ni_cnd.cn_flags & USEDVP)) {
5168 int error;
5169 char c;
5170
5171 if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
5172 error = copyin(ndp->ni_dirp, &c, sizeof(char));
5173 if (error) {
5174 return error;
5175 }
5176 } else {
5177 c = *((char *)(ndp->ni_dirp));
5178 }
5179
5180 if (c != '/') {
5181 vnode_t dvp_at;
5182
5183 error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
5184 &dvp_at);
5185 if (error) {
5186 return error;
5187 }
5188
5189 if (vnode_vtype(dvp_at) != VDIR) {
5190 vnode_put(dvp_at);
5191 return ENOTDIR;
5192 }
5193
5194 ndp->ni_dvp = dvp_at;
5195 ndp->ni_cnd.cn_flags |= USEDVP;
5196 error = open1(ctx, ndp, uflags, vap, fp_init, initarg,
5197 retval, authfd);
5198 vnode_put(dvp_at);
5199 return error;
5200 }
5201 }
5202
5203 return open1(ctx, ndp, uflags, vap, fp_init, initarg, retval, authfd);
5204 }
5205
/*
 * open_extended: open a file given a path name; with extended argument list (including extended security (ACL)).
 *
 * Parameters: p Process requesting the open
 * uap User argument descriptor (see below)
 * retval Pointer to an area to receive the
 * return value from the system call
 *
 * Indirect: uap->path Path to open (same as 'open')
 * uap->flags Flags to open (same as 'open')
 * uap->uid UID to set, if creating
 * uap->gid GID to set, if creating
 * uap->mode File mode, if creating (same as 'open')
 * uap->xsecurity ACL to set, if creating
 *
 * Returns: 0 Success
 * !0 errno value
 *
 * Notes: The kauth_filesec_t in 'va', if any, is in host byte order.
 *
 * XXX: We should enumerate the possible errno values here, and where
 * in the code they originated.
 */
int
open_extended(proc_t p, struct open_extended_args *uap, int32_t *retval)
{
	int ciferror;
	kauth_filesec_t xsecdst;
	struct vnode_attr va;
	struct nameidata nd;
	int cmode;

	AUDIT_ARG(owner, uap->uid, uap->gid);

	/* Copy in the caller-supplied ACL, if any. */
	xsecdst = NULL;
	if ((uap->xsecurity != USER_ADDR_NULL) &&
	    ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)) {
		return ciferror;
	}

	VATTR_INIT(&va);
	/* Apply the process umask; the sticky bit may not be set at create. */
	cmode = ((uap->mode & ~p->p_fd.fd_cmask) & ALLPERMS) & ~S_ISTXT;
	VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
	if (uap->uid != KAUTH_UID_NONE) {
		VATTR_SET(&va, va_uid, uap->uid);
	}
	if (uap->gid != KAUTH_GID_NONE) {
		VATTR_SET(&va, va_gid, uap->gid);
	}
	if (xsecdst != NULL) {
		VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
		va.va_vaflags |= VA_FILESEC_ACL;
	}

	NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
	    uap->path, vfs_context_current());

	ciferror = open1(vfs_context_current(), &nd, uap->flags, &va,
	    NULL, NULL, retval, AUTH_OPEN_NOAUTHFD);
	if (xsecdst != NULL) {
		kauth_filesec_free(xsecdst);
	}

	return ciferror;
}
5271
5272 /*
5273 * Go through the data-protected atomically controlled open (2)
5274 *
5275 * int open_dprotected_np(user_addr_t path, int flags, int class, int dpflags, int mode)
5276 */
5277 static int
openat_dprotected_internal(vfs_context_t ctx,user_addr_t path,int flags,int mode,int class,int dpflags,int fd,int authfd,enum uio_seg segflg,int * retval)5278 openat_dprotected_internal(vfs_context_t ctx, user_addr_t path, int flags, int mode,
5279 int class, int dpflags, int fd, int authfd, enum uio_seg segflg, int *retval)
5280 {
5281 /*
5282 * Follow the same path as normal open(2)
5283 * Look up the item if it exists, and acquire the vnode.
5284 */
5285 struct vnode_attr va;
5286 struct nameidata nd;
5287 int cmode;
5288 int error;
5289 struct filedesc *fdp = &vfs_context_proc(ctx)->p_fd;
5290
5291 VATTR_INIT(&va);
5292 /* Mask off all but regular access permissions */
5293 cmode = ((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
5294 VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
5295
5296 NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, segflg,
5297 path, ctx);
5298
5299 /*
5300 * Initialize the extra fields in vnode_attr to pass down our
5301 * extra fields.
5302 * 1. target cprotect class.
5303 * 2. set a flag to mark it as requiring open-raw-encrypted semantics.
5304 */
5305 if (flags & O_CREAT) {
5306 /* lower level kernel code validates that the class is valid before applying it. */
5307 if (class != PROTECTION_CLASS_DEFAULT) {
5308 /*
5309 * PROTECTION_CLASS_DEFAULT implies that we make the class for this
5310 * file behave the same as open (2)
5311 */
5312 VATTR_SET(&va, va_dataprotect_class, class);
5313 }
5314 }
5315
5316 if (dpflags & (O_DP_GETRAWENCRYPTED | O_DP_GETRAWUNENCRYPTED | O_DP_AUTHENTICATE)) {
5317 if (flags & (O_RDWR | O_WRONLY)) {
5318 /*
5319 * Not allowed to write raw encrypted bytes or when opening authenticated.
5320 */
5321 return EINVAL;
5322 }
5323 if (dpflags & O_DP_GETRAWENCRYPTED) {
5324 VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWENCRYPTED);
5325 }
5326 if (dpflags & O_DP_GETRAWUNENCRYPTED) {
5327 VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWUNENCRYPTED);
5328 }
5329 if (dpflags & O_DP_AUTHENTICATE) {
5330 VATTR_SET(&va, va_dataprotect_flags, VA_DP_AUTHENTICATE);
5331 }
5332 }
5333
5334 error = open1at(vfs_context_current(), &nd, flags, &va,
5335 NULL, NULL, retval, fd, authfd);
5336
5337 return error;
5338 }
5339
5340 int
openat_dprotected_np(__unused proc_t p,struct openat_dprotected_np_args * uap,int32_t * retval)5341 openat_dprotected_np(__unused proc_t p, struct openat_dprotected_np_args *uap, int32_t *retval)
5342 {
5343 if ((uap->dpflags & O_DP_AUTHENTICATE) && (uap->flags & O_CREAT)) {
5344 return EINVAL;
5345 }
5346
5347 return openat_dprotected_internal(vfs_context_current(), uap->path, uap->flags, uap->mode,
5348 uap->class, uap->dpflags, uap->fd, uap->authfd, UIO_USERSPACE, retval);
5349 }
5350
5351 int
open_dprotected_np(__unused proc_t p,struct open_dprotected_np_args * uap,int32_t * retval)5352 open_dprotected_np(__unused proc_t p, struct open_dprotected_np_args *uap, int32_t *retval)
5353 {
5354 if (uap->dpflags & O_DP_AUTHENTICATE) {
5355 return EINVAL;
5356 }
5357
5358 return openat_dprotected_internal(vfs_context_current(), uap->path, uap->flags, uap->mode,
5359 uap->class, uap->dpflags, AT_FDCWD, AUTH_OPEN_NOAUTHFD, UIO_USERSPACE, retval);
5360 }
5361
/*
 * Common implementation for the open()/openat() family: builds the
 * vnode_attr and nameidata (heap-allocated together to keep this stack
 * frame small) and calls open1at().
 *
 * When objidp/fsidp are both non-NULL, the target's fileid/fsid are
 * passed down in the vnode_attr (presumably so the open can verify the
 * looked-up file's identity — see openbyid_np); supplying only one of
 * the two is EINVAL.
 */
static int
openat_internal(vfs_context_t ctx, user_addr_t path, int flags, int mode,
    int fd, enum uio_seg segflg, int *retval, uint64_t *objidp, fsid_t *fsidp)
{
	struct filedesc *fdp = &vfs_context_proc(ctx)->p_fd;
	struct {
		struct vnode_attr va;
		struct nameidata nd;
	} *__open_data;
	struct vnode_attr *vap;
	struct nameidata *ndp;
	int cmode;
	int error;

	__open_data = kalloc_type(typeof(*__open_data), Z_WAITOK);
	vap = &__open_data->va;
	ndp = &__open_data->nd;

	VATTR_INIT(vap);
	/* Mask off all but regular access permissions */
	cmode = ((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
	VATTR_SET(vap, va_mode, cmode & ACCESSPERMS);

	/* Check for fileid and fsid authentication */
	if (objidp || fsidp) {
		if (!objidp || !fsidp) {
			error = EINVAL;
			goto out;
		}
		VATTR_SET(vap, va_flags, VA_VAFILEID);
		VATTR_SET(vap, va_fileid, *objidp);
		VATTR_SET(vap, va_fsid64, *fsidp);
	}

	NDINIT(ndp, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1,
	    segflg, path, ctx);

	error = open1at(ctx, ndp, flags, vap, NULL, NULL, retval, fd, AUTH_OPEN_NOAUTHFD);

out:
	kfree_type(typeof(*__open_data), __open_data);

	return error;
}
5406
/* open(2): cancellation-point wrapper around open_nocancel(). */
int
open(proc_t p, struct open_args *uap, int32_t *retval)
{
	/* Mark this syscall as a pthread cancellation point before the work. */
	__pthread_testcancel(1);
	return open_nocancel(p, (struct open_nocancel_args *)uap, retval);
}
5413
5414 int
open_nocancel(__unused proc_t p,struct open_nocancel_args * uap,int32_t * retval)5415 open_nocancel(__unused proc_t p, struct open_nocancel_args *uap,
5416 int32_t *retval)
5417 {
5418 return openat_internal(vfs_context_current(), uap->path, uap->flags,
5419 uap->mode, AT_FDCWD, UIO_USERSPACE, retval, NULL, NULL);
5420 }
5421
5422 int
openat_nocancel(__unused proc_t p,struct openat_nocancel_args * uap,int32_t * retval)5423 openat_nocancel(__unused proc_t p, struct openat_nocancel_args *uap,
5424 int32_t *retval)
5425 {
5426 return openat_internal(vfs_context_current(), uap->path, uap->flags,
5427 uap->mode, uap->fd, UIO_USERSPACE, retval, NULL, NULL);
5428 }
5429
/* openat(2): cancellation-point wrapper around openat_nocancel(). */
int
openat(proc_t p, struct openat_args *uap, int32_t *retval)
{
	/* Mark this syscall as a pthread cancellation point before the work. */
	__pthread_testcancel(1);
	return openat_nocancel(p, (struct openat_nocancel_args *)uap, retval);
}
5436
5437 #define OPEN_BY_ID_ENTITLEMENT "com.apple.private.vfs.open-by-id"
5438
5439 static boolean_t
vfs_context_can_open_by_id(vfs_context_t ctx)5440 vfs_context_can_open_by_id(vfs_context_t ctx)
5441 {
5442 if (csproc_get_platform_binary(vfs_context_proc(ctx))) {
5443 return TRUE;
5444 }
5445
5446 return IOTaskHasEntitlement(vfs_context_task(ctx),
5447 OPEN_BY_ID_ENTITLEMENT);
5448 }
5449
5450 #define MAX_OPENBYID_NP_RETRIES 10
5451
5452 /*
5453 * openbyid_np: open a file given a file system id and a file system object id
5454 * the hfs file system object id is an fsobj_id_t {uint32, uint32}
5455 * file systems that don't support object ids it is a node id (uint64_t).
5456 *
5457 * Parameters: p Process requesting the open
5458 * uap User argument descriptor (see below)
5459 * retval Pointer to an area to receive the
 * return value from the system call
5461 *
5462 * Indirect: uap->path Path to open (same as 'open')
5463 *
5464 * uap->fsid id of target file system
5465 * uap->objid id of target file system object
5466 * uap->flags Flags to open (same as 'open')
5467 *
5468 * Returns: 0 Success
5469 * !0 errno value
5470 *
5471 *
 * XXX: We should enumerate the possible errno values here, and where
5473 * in the code they originated.
5474 */
/*
 * openbyid_np: open a file system object given its fsid and object id
 * (see the block comment above for the indirect argument layout).
 * Access is gated by vfs_context_can_open_by_id().  The id pair is
 * resolved back to a path with fsgetpath_internal() and opened via
 * openat_internal(), which is handed the expected objid/fsid so it can
 * report ERECYCLE if the vnode was recycled underneath us; in that
 * case the whole resolve+open is redriven, bounded by
 * MAX_OPENBYID_NP_RETRIES.
 */
int
openbyid_np(__unused proc_t p, struct openbyid_np_args *uap, int *retval)
{
	fsid_t fsid;
	uint64_t objid;
	int fd;
	int error;
	int retry_count = 0;
	char *buf = NULL;
	int buflen = MAXPATHLEN;
	int pathlen = 0;
	vfs_context_t ctx = vfs_context_current();

	if (!vfs_context_can_open_by_id(ctx)) {
		return EPERM;
	}

	if ((error = copyin(uap->fsid, (caddr_t)&fsid, sizeof(fsid)))) {
		return error;
	}

	/*
	 * uap->objid is an fsobj_id_t defined as struct {uint32_t, uint32_t};
	 * both halves are copied in as a single uint64_t.
	 */
	if ((error = copyin(uap->objid, (caddr_t)&objid, sizeof(uint64_t)))) {
		return error;
	}

	AUDIT_ARG(value32, fsid.val[0]);
	AUDIT_ARG(value64, objid);

retry:
	/* Reset all per-attempt state before each (re)try. */
	fd = -1;
	error = 0;
	buf = NULL;
	pathlen = 0;
	buflen = MAXPATHLEN;

	/* Resolve a path from fsid/objid, growing the buffer on ENOSPC. */
	do {
		buf = kalloc_data(buflen + 1, Z_WAITOK);
		if (buf == NULL) {
			return ENOMEM;
		}

		error = fsgetpath_internal( ctx, fsid.val[0], objid, buflen,
		    buf, FSOPT_ISREALFSID, &pathlen);

		if (error) {
			kfree_data(buf, buflen + 1);
			buf = NULL;
		}
	} while (error == ENOSPC && (buflen += MAXPATHLEN));

	if (error) {
		return error;
	}

	/* NUL-terminate the resolved path. */
	buf[pathlen] = 0;

	/*
	 * Open from the kernel copy of the path; passing &objid/&fsid lets
	 * openat_internal() verify it opened the object we asked for.
	 */
	error = openat_internal(
		ctx, (user_addr_t)buf, uap->oflags, 0, AT_FDCWD, UIO_SYSSPACE, &fd, &objid, &fsid);

	kfree_data(buf, buflen + 1);

	/* Ensure the correct file is opened */
	if (error == ERECYCLE) {
		if (retry_count < MAX_OPENBYID_NP_RETRIES) {
			retry_count += 1;
			goto retry;
		} else {
			printf("openbyid_np() retry limit due to ERECYCLE reached\n");
			error = ENOENT;
		}
	}

	if (!error) {
		*retval = fd;
	}

	return error;
}
5555
5556
5557 /*
5558 * Create a special file.
5559 */
5560 static int mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap,
5561 int fd);
5562
/*
 * Common implementation for mknod(2)/mknodat(2): create a character or
 * block special file at 'upath', resolved relative to 'fd' (AT_FDCWD
 * for plain mknod).  A mode of S_IFIFO is diverted to mkfifo1().
 * Creating device nodes requires superuser.  'vap' carries the
 * va_mode/va_rdev the callers set up; va_type is filled in here.
 *
 * Returns 0 on success or an errno value.
 */
static int
mknodat_internal(proc_t p, user_addr_t upath, struct vnode_attr *vap,
    mode_t mode, int fd)
{
	vfs_context_t ctx = vfs_context_current();
	struct nameidata nd;
	vnode_t vp, dvp;
	int error;

	/* If it's a mknod() of a FIFO, call mkfifo1() instead */
	if ((mode & S_IFMT) == S_IFIFO) {
		return mkfifo1(ctx, upath, vap, fd);
	}

	AUDIT_ARG(mode, mode);
	AUDIT_ARG(value32, vap->va_rdev);

	/* Device node creation is a privileged operation. */
	if ((error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
		return error;
	}
	NDINIT(&nd, CREATE, OP_MKNOD, LOCKPARENT | AUDITVNPATH1,
	    UIO_USERSPACE, upath, ctx);
	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* The target name must not already exist. */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}

	/* Map the requested type; only char/block specials remain here. */
	switch (mode & S_IFMT) {
	case S_IFCHR:
		VATTR_SET(vap, va_type, VCHR);
		break;
	case S_IFBLK:
		VATTR_SET(vap, va_type, VBLK);
		break;
	default:
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_create(ctx,
	    nd.ni_dvp, &nd.ni_cnd, vap);
	if (error) {
		goto out;
	}
#endif

	if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
		goto out;
	}

#if CONFIG_FILE_LEASES
	/* Adding an entry modifies the directory; break any dir lease. */
	vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

	if ((error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx)) != 0) {
		goto out;
	}

	if (vp) {
		int update_flags = 0;

		// Make sure the name & parent pointers are hooked up
		if (vp->v_name == NULL) {
			update_flags |= VNODE_UPDATE_NAME;
		}
		if (vp->v_parent == NULLVP) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		if (update_flags) {
			vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
		}

#if CONFIG_FSE
		add_fsevent(FSE_CREATE_FILE, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
#endif
	}

out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	vnode_put(dvp);

	return error;
}
5665
5666 int
mknod(proc_t p,struct mknod_args * uap,__unused int32_t * retval)5667 mknod(proc_t p, struct mknod_args *uap, __unused int32_t *retval)
5668 {
5669 struct vnode_attr va;
5670
5671 VATTR_INIT(&va);
5672 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5673 VATTR_SET(&va, va_rdev, uap->dev);
5674
5675 return mknodat_internal(p, uap->path, &va, (mode_t)uap->mode, AT_FDCWD);
5676 }
5677
5678 int
mknodat(proc_t p,struct mknodat_args * uap,__unused int32_t * retval)5679 mknodat(proc_t p, struct mknodat_args *uap, __unused int32_t *retval)
5680 {
5681 struct vnode_attr va;
5682
5683 VATTR_INIT(&va);
5684 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5685 VATTR_SET(&va, va_rdev, uap->dev);
5686
5687 return mknodat_internal(p, uap->path, &va, (mode_t)uap->mode, uap->fd);
5688 }
5689
5690 /*
5691 * Create a named pipe.
5692 *
5693 * Returns: 0 Success
5694 * EEXIST
5695 * namei:???
5696 * vnode_authorize:???
5697 * vn_create:???
5698 */
/*
 * Common FIFO-creation path for mkfifo(2)/mkfifoat(2) and mknod(2)
 * with S_IFIFO.  Resolves 'upath' relative to 'fd' and creates a VFIFO
 * node with the attributes in 'vap' (va_type is set here).
 */
static int
mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap, int fd)
{
	vnode_t vp, dvp;
	int error;
	struct nameidata nd;

	NDINIT(&nd, CREATE, OP_MKFIFO, LOCKPARENT | AUDITVNPATH1,
	    UIO_USERSPACE, upath, ctx);
	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* check that this is a new file and authorize addition */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}
	VATTR_SET(vap, va_type, VFIFO);

	if ((error = vn_authorize_create(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0) {
		goto out;
	}

	error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx);
out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	vnode_put(dvp);

	return error;
}
5741
5742
5743 /*
5744 * mkfifo_extended: Create a named pipe; with extended argument list (including extended security (ACL)).
5745 *
5746 * Parameters: p Process requesting the open
5747 * uap User argument descriptor (see below)
5748 * retval (Ignored)
5749 *
5750 * Indirect: uap->path Path to fifo (same as 'mkfifo')
5751 * uap->uid UID to set
5752 * uap->gid GID to set
5753 * uap->mode File mode to set (same as 'mkfifo')
5754 * uap->xsecurity ACL to set, if creating
5755 *
5756 * Returns: 0 Success
5757 * !0 errno value
5758 *
5759 * Notes: The kauth_filesec_t in 'va', if any, is in host byte order.
5760 *
 * XXX: We should enumerate the possible errno values here, and where
5762 * in the code they originated.
5763 */
/*
 * mkfifo_extended: create a named pipe with explicit owner, group and
 * optional ACL (see the block comment above for the indirect argument
 * layout).  The copied-in filesec is always freed before returning.
 */
int
mkfifo_extended(proc_t p, struct mkfifo_extended_args *uap, __unused int32_t *retval)
{
	int ciferror;
	kauth_filesec_t xsecdst;
	struct vnode_attr va;

	AUDIT_ARG(owner, uap->uid, uap->gid);

	/* Copy in the caller-supplied ACL, if any. */
	xsecdst = KAUTH_FILESEC_NONE;
	if (uap->xsecurity != USER_ADDR_NULL) {
		if ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0) {
			return ciferror;
		}
	}

	VATTR_INIT(&va);
	VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
	/* KAUTH_UID_NONE / KAUTH_GID_NONE mean "leave unset". */
	if (uap->uid != KAUTH_UID_NONE) {
		VATTR_SET(&va, va_uid, uap->uid);
	}
	if (uap->gid != KAUTH_GID_NONE) {
		VATTR_SET(&va, va_gid, uap->gid);
	}
	if (xsecdst != KAUTH_FILESEC_NONE) {
		VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
		va.va_vaflags |= VA_FILESEC_ACL;
	}

	ciferror = mkfifo1(vfs_context_current(), uap->path, &va, AT_FDCWD);

	if (xsecdst != KAUTH_FILESEC_NONE) {
		kauth_filesec_free(xsecdst);
	}
	return ciferror;
}
5800
5801 /* ARGSUSED */
5802 int
mkfifo(proc_t p,struct mkfifo_args * uap,__unused int32_t * retval)5803 mkfifo(proc_t p, struct mkfifo_args *uap, __unused int32_t *retval)
5804 {
5805 struct vnode_attr va;
5806
5807 VATTR_INIT(&va);
5808 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5809
5810 return mkfifo1(vfs_context_current(), uap->path, &va, AT_FDCWD);
5811 }
5812
5813 int
mkfifoat(proc_t p,struct mkfifoat_args * uap,__unused int32_t * retval)5814 mkfifoat(proc_t p, struct mkfifoat_args *uap, __unused int32_t *retval)
5815 {
5816 struct vnode_attr va;
5817
5818 VATTR_INIT(&va);
5819 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5820
5821 return mkfifo1(vfs_context_current(), uap->path, &va, uap->fd);
5822 }
5823
5824 extern int safe_getpath_new(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path, int firmlink);
5825 extern int safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path);
5826 extern int safe_getpath_no_firmlink(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path);
5827
/*
 * Build the path for 'dvp' (optionally with 'leafname' appended) into
 * 'path' of size '_len' bytes.  This never fails outright: on any
 * truncation or lookup failure, *truncated_path is set to 1 and a
 * best-effort path (possibly an ancestor's or the mount point's) is
 * returned instead.  'firmlink' selects vn_getpath() vs the
 * no-firmlink variant.
 *
 * Returns the length of the string placed in 'path', counting the NUL
 * (as reported by vn_getpath; see the strlen()+1 / "/"+len=2 cases).
 */
int
safe_getpath_new(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path, int firmlink)
{
	int ret, len = _len;

	*truncated_path = 0;

	if (firmlink) {
		ret = vn_getpath(dvp, path, &len);
	} else {
		ret = vn_getpath_no_firmlink(dvp, path, &len);
	}
	if (ret == 0 && len < (MAXPATHLEN - 1)) {
		if (leafname) {
			/* Overwrite the terminating NUL with '/' and append the leaf. */
			path[len - 1] = '/';
			len += strlcpy(&path[len], leafname, MAXPATHLEN - len) + 1;
			if (len > MAXPATHLEN) {
				char *ptr;

				// the string got truncated!
				*truncated_path = 1;
				ptr = strrchr(path, '/');
				if (ptr) {
					*ptr = '\0'; // chop off the string at the last directory component
				}
				len = (int)strlen(path) + 1;
			}
		}
	} else if (ret == 0) {
		/* Directory path alone (nearly) filled the buffer — call it truncated. */
		*truncated_path = 1;
	} else if (ret != 0) {
		struct vnode *mydvp = dvp;

		if (ret != ENOSPC) {
			printf("safe_getpath: failed to get the path for vp %p (%s) : err %d\n",
			    dvp, dvp->v_name ? dvp->v_name : "no-name", ret);
		}
		*truncated_path = 1;

		/*
		 * Walk up the parent chain (falling back to the mount point,
		 * then "/") until some ancestor's path fits the buffer.
		 */
		do {
			if (mydvp->v_parent != NULL) {
				mydvp = mydvp->v_parent;
			} else if (mydvp->v_mount) {
				strlcpy(path, mydvp->v_mount->mnt_vfsstat.f_mntonname, _len);
				break;
			} else {
				// no parent and no mount point? only thing is to punt and say "/" changed
				strlcpy(path, "/", _len);
				len = 2;
				mydvp = NULL;
			}

			if (mydvp == NULL) {
				break;
			}

			len = _len;
			if (firmlink) {
				ret = vn_getpath(mydvp, path, &len);
			} else {
				ret = vn_getpath_no_firmlink(mydvp, path, &len);
			}
		} while (ret == ENOSPC);
	}

	return len;
}
5895
/* safe_getpath_new() with firmlink-crossing paths (firmlink = 1). */
int
safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path)
{
	return safe_getpath_new(dvp, leafname, path, _len, truncated_path, 1);
}
5901
/* safe_getpath_new() without firmlink-crossing paths (firmlink = 0). */
int
safe_getpath_no_firmlink(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path)
{
	return safe_getpath_new(dvp, leafname, path, _len, truncated_path, 0);
}
5907
5908 /*
5909 * Make a hard file link.
5910 *
5911 * Returns: 0 Success
5912 * EPERM
5913 * EEXIST
5914 * EXDEV
5915 * namei:???
5916 * vnode_authorize:???
5917 * VNOP_LINK:???
5918 */
5919 /* ARGSUSED */
/*
 * Common implementation for link(2)/linkat(2): create the name 'link'
 * (relative to fd2) as a hard link to the object named by 'path'
 * (relative to fd1).  'flag' accepts AT_SYMLINK_FOLLOW,
 * AT_SYMLINK_NOFOLLOW_ANY and AT_RESOLVE_BENEATH; 'segflg' says which
 * address space the paths live in.  A racing unlink can make
 * VNOP_LINK return ENOENT, in which case the whole operation is
 * redriven up to MAX_LINK_ENOENT_RETRIES times.
 */
static int
linkat_internal(vfs_context_t ctx, int fd1, user_addr_t path, int fd2,
    user_addr_t link, int flag, enum uio_seg segflg)
{
	vnode_t vp, pvp, dvp, lvp;
	struct nameidata nd;
	int follow;
	int error;
#if CONFIG_FSE
	fse_info finfo;
#endif
	char *target_path = NULL;
	char *no_firmlink_path = NULL;
	vnode_t locked_vp = NULLVP;
	int truncated = 0;
	int truncated_no_firmlink_path = 0;
	int num_retries = 0;
	int need_event, has_listeners, need_kpath2;
	bool do_retry;

	/* look up the object we are linking to */
	follow = (flag & AT_SYMLINK_FOLLOW) ? FOLLOW : NOFOLLOW;

retry:
	do_retry = false;
	vp = dvp = lvp = NULLVP;
	NDINIT(&nd, LOOKUP, OP_LOOKUP, AUDITVNPATH1 | follow,
	    segflg, path, ctx);
	if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}
	if (flag & AT_RESOLVE_BENEATH) {
		nd.ni_flag |= NAMEI_RESOLVE_BENEATH;
	}

	error = nameiat(&nd, fd1);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;

	nameidone(&nd);

	/*
	 * Normally, linking to directories is not supported.
	 * However, some file systems may have limited support.
	 */
	if (vp->v_type == VDIR) {
		if (!ISSET(vp->v_mount->mnt_kern_flag, MNTK_DIR_HARDLINKS)) {
			error = EPERM;   /* POSIX */
			goto out;
		}

		/* Linking to a directory requires ownership. */
		if (!kauth_cred_issuser(vfs_context_ucred(ctx))) {
			struct vnode_attr dva;

			VATTR_INIT(&dva);
			VATTR_WANTED(&dva, va_uid);
			if (vnode_getattr(vp, &dva, ctx) != 0 ||
			    !VATTR_IS_SUPPORTED(&dva, va_uid) ||
			    (dva.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)))) {
				error = EACCES;
				goto out;
			}
		}
	}

	/* lookup the target node */
#if CONFIG_TRIGGERS
	nd.ni_op = OP_LINK;
#endif
	nd.ni_cnd.cn_nameiop = CREATE;
	nd.ni_cnd.cn_flags = LOCKPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK;
	nd.ni_dirp = link;
	error = nameiat(&nd, fd2);
	if (error != 0) {
		goto out;
	}
	dvp = nd.ni_dvp;
	lvp = nd.ni_vp;

	/* Serialize against concurrent link/unlink of 'vp'. */
	assert(locked_vp == NULLVP);
	vnode_link_lock(vp);
	locked_vp = vp;

#if CONFIG_MACF
	if ((error = mac_vnode_check_link(ctx, dvp, vp, &nd.ni_cnd)) != 0) {
		goto out2;
	}
#endif

	/* or to anything that kauth doesn't want us to (eg. immutable items) */
	if ((error = vnode_authorize(vp, NULL, KAUTH_VNODE_LINKTARGET, ctx)) != 0) {
		goto out2;
	}

	/* target node must not exist */
	if (lvp != NULLVP) {
		error = EEXIST;
		goto out2;
	}
	/* cannot link across mountpoints */
	if (vnode_mount(vp) != vnode_mount(dvp)) {
		error = EXDEV;
		goto out2;
	}

	/* authorize creation of the target node */
	if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
		goto out2;
	}

#if CONFIG_FILE_LEASES
	/* Adding an entry modifies the directory; break any dir lease. */
	vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

	/* and finally make the link */
	error = VNOP_LINK(vp, dvp, &nd.ni_cnd, ctx);
	if (error) {
		/* A racing unlink may surface as ENOENT; redrive (bounded). */
		if (error == ENOENT && num_retries < MAX_LINK_ENOENT_RETRIES) {
			do_retry = true;
			num_retries += 1;
		}
		goto out2;
	}

#if CONFIG_MACF
	(void)mac_vnode_notify_link(ctx, vp, dvp, &nd.ni_cnd);
#endif

	/* The vnode now has more than one name; clear the not-hardlink hint. */
	vnode_lock_spin(vp);
	vp->v_ext_flag &= ~VE_NOT_HARDLINK;
	vnode_unlock(vp);

	assert(locked_vp == vp);
	vnode_link_unlock(locked_vp);
	locked_vp = NULLVP;

#if CONFIG_FSE
	need_event = need_fsevent(FSE_CREATE_FILE, dvp);
#else
	need_event = 0;
#endif
	has_listeners = kauth_authorize_fileop_has_listeners();

	need_kpath2 = 0;
#if CONFIG_AUDIT
	if (AUDIT_RECORD_EXISTS()) {
		need_kpath2 = 1;
	}
#endif

	/* Path strings are only built if someone will consume them. */
	if (need_event || has_listeners || need_kpath2) {
		char *link_to_path = NULL;
		int len, link_name_len;
		int len_no_firmlink_path = 0;

		/* build the path to the new link file */
		GET_PATH(target_path);

		len = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, target_path, MAXPATHLEN, &truncated);
		if (no_firmlink_path == NULL) {
			GET_PATH(no_firmlink_path);
		}
		len_no_firmlink_path = safe_getpath_no_firmlink(dvp, nd.ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);

		AUDIT_ARG(kpath, target_path, ARG_KPATH2);

		if (has_listeners) {
			/* build the path to file we are linking to */
			GET_PATH(link_to_path);

			link_name_len = MAXPATHLEN;
			if (vn_getpath(vp, link_to_path, &link_name_len) == 0) {
				/*
				 * Call out to allow 3rd party notification of rename.
				 * Ignore result of kauth_authorize_fileop call.
				 */
				kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_LINK,
				    (uintptr_t)link_to_path,
				    (uintptr_t)target_path);
			}
			if (link_to_path != NULL) {
				RELEASE_PATH(link_to_path);
			}
		}
#if CONFIG_FSE
		if (need_event) {
			/* construct fsevent */
			if (get_fse_info(vp, &finfo, ctx) == 0) {
				if (truncated_no_firmlink_path) {
					finfo.mode |= FSE_TRUNCATED_PATH;
				}

				// build the path to the destination of the link
				add_fsevent(FSE_CREATE_FILE, ctx,
				    FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
				    FSE_ARG_FINFO, &finfo,
				    FSE_ARG_DONE);
			}

			pvp = vp->v_parent;
			// need an iocount on parent vnode in this case
			if (pvp && pvp != dvp) {
				pvp = vnode_getparent_if_different(vp, dvp);
			}
			if (pvp) {
				add_fsevent(FSE_STAT_CHANGED, ctx,
				    FSE_ARG_VNODE, pvp, FSE_ARG_DONE);
			}
			if (pvp && pvp != dvp) {
				vnode_put(pvp);
			}
		}
#endif
	}
out2:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);
	if (target_path != NULL) {
		RELEASE_PATH(target_path);
		target_path = NULL;
	}
	if (no_firmlink_path != NULL) {
		RELEASE_PATH(no_firmlink_path);
		no_firmlink_path = NULL;
	}
out:
	/* Drop the link lock if an error path left it held. */
	if (locked_vp) {
		assert(locked_vp == vp);
		vnode_link_unlock(locked_vp);
		locked_vp = NULLVP;
	}
	if (lvp) {
		vnode_put(lvp);
	}
	if (dvp) {
		vnode_put(dvp);
	}
	vnode_put(vp);

	if (do_retry) {
		goto retry;
	}

	return error;
}
6171
/*
 * link() system call: create a hard link at uap->link to the object
 * named by uap->path, both relative to the current working directory,
 * following symlinks in the source path (AT_SYMLINK_FOLLOW).
 */
int
link(__unused proc_t p, struct link_args *uap, __unused int32_t *retval)
{
	return linkat_internal(vfs_context_current(), AT_FDCWD, uap->path,
	    AT_FDCWD, uap->link, AT_SYMLINK_FOLLOW, UIO_USERSPACE);
}
6178
6179 int
linkat(__unused proc_t p,struct linkat_args * uap,__unused int32_t * retval)6180 linkat(__unused proc_t p, struct linkat_args *uap, __unused int32_t *retval)
6181 {
6182 if (uap->flag & ~(AT_SYMLINK_FOLLOW | AT_SYMLINK_NOFOLLOW_ANY | AT_RESOLVE_BENEATH)) {
6183 return EINVAL;
6184 }
6185
6186 return linkat_internal(vfs_context_current(), uap->fd1, uap->path,
6187 uap->fd2, uap->link, uap->flag, UIO_USERSPACE);
6188 }
6189
6190 /*
6191 * Make a symbolic link.
6192 *
6193 * We could add support for ACLs here too...
6194 */
6195 /* ARGSUSED */
/*
 * Common implementation for symlink(2)/symlinkat(2): create the name
 * 'link' (relative to 'fd') as a symbolic link whose contents are the
 * string at 'path_data'.  'segflg' describes where both strings live;
 * a user-space link string is copied into a ZV_NAMEI buffer first and
 * freed on exit.
 */
static int
symlinkat_internal(vfs_context_t ctx, user_addr_t path_data, int fd,
    user_addr_t link, enum uio_seg segflg)
{
	struct vnode_attr va;
	char *path;
	int error;
	struct nameidata nd;
	vnode_t vp, dvp;
	size_t dummy = 0;
	proc_t p;

	error = 0;
	if (UIO_SEG_IS_USER_SPACE(segflg)) {
		path = zalloc(ZV_NAMEI);
		error = copyinstr(path_data, path, MAXPATHLEN, &dummy);
	} else {
		path = (char *)path_data;
	}
	if (error) {
		goto out;
	}
	AUDIT_ARG(text, path);  /* This is the link string */

	NDINIT(&nd, CREATE, OP_SYMLINK, LOCKPARENT | AUDITVNPATH1,
	    segflg, link, ctx);

	error = nameiat(&nd, fd);
	if (error) {
		goto out;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	p = vfs_context_proc(ctx);
	VATTR_INIT(&va);
	VATTR_SET(&va, va_type, VLNK);
	VATTR_SET(&va, va_mode, ACCESSPERMS & ~p->p_fd.fd_cmask);

#if CONFIG_MACF
	error = mac_vnode_check_create(ctx,
	    dvp, &nd.ni_cnd, &va);
#endif
	if (error != 0) {
		goto skipit;
	}

	/* The link name must not already exist. */
	if (vp != NULL) {
		error = EEXIST;
		goto skipit;
	}

	/* authorize */
	if (error == 0) {
		error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx);
	}
	/* get default ownership, etc. */
	if (error == 0) {
		error = vnode_authattr_new(dvp, &va, 0, ctx);
	}

#if CONFIG_FILE_LEASES
	/* Adding an entry modifies the directory; break any dir lease. */
	vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

	if (error == 0) {
		error = VNOP_SYMLINK(dvp, &vp, &nd.ni_cnd, &va, path, ctx);
	}

	/* do fallback attribute handling */
	if (error == 0 && vp) {
		error = vnode_setattr_fallback(vp, &va, ctx);
	}

#if CONFIG_MACF
	if (error == 0 && vp) {
		error = vnode_label(vnode_mount(vp), dvp, vp, &nd.ni_cnd, VNODE_LABEL_CREATE, ctx);
	}
#endif

	if (error == 0) {
		int update_flags = 0;

		/*
		 * Check if a new vnode was created by VNOP_SYMLINK; if not,
		 * look the new link up to get one.
		 */
		if (vp == NULL) {
			nd.ni_cnd.cn_nameiop = LOOKUP;
#if CONFIG_TRIGGERS
			nd.ni_op = OP_LOOKUP;
#endif
			/*
			 * Clear all flags except HASBUF to prevent 'cn_pnbuf' buffer to be
			 * reallocated again in namei().
			 */
			nd.ni_cnd.cn_flags &= HASBUF;
			error = nameiat(&nd, fd);
			if (error) {
				goto skipit;
			}
			vp = nd.ni_vp;
		}

#if 0 /* XXX - kauth_todo - is KAUTH_FILEOP_SYMLINK needed? */
		/* call out to allow 3rd party notification of rename.
		 * Ignore result of kauth_authorize_fileop call.
		 */
		if (kauth_authorize_fileop_has_listeners() &&
		    namei(&nd) == 0) {
			char *new_link_path = NULL;
			int len;

			/* build the path to the new link file */
			new_link_path = get_pathbuff();
			len = MAXPATHLEN;
			vn_getpath(dvp, new_link_path, &len);
			if ((len + 1 + nd.ni_cnd.cn_namelen + 1) < MAXPATHLEN) {
				new_link_path[len - 1] = '/';
				strlcpy(&new_link_path[len], nd.ni_cnd.cn_nameptr, MAXPATHLEN - len);
			}

			kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_SYMLINK,
			    (uintptr_t)path, (uintptr_t)new_link_path);
			if (new_link_path != NULL) {
				release_pathbuff(new_link_path);
			}
		}
#endif
		// Make sure the name & parent pointers are hooked up
		if (vp->v_name == NULL) {
			update_flags |= VNODE_UPDATE_NAME;
		}
		if (vp->v_parent == NULLVP) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		if (update_flags) {
			vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
		}

#if CONFIG_FSE
		add_fsevent(FSE_CREATE_FILE, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
#endif
	}

skipit:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	vnode_put(dvp);
out:
	/* Only free 'path' if it is our own ZV_NAMEI copy. */
	if (path && (path != (char *)path_data)) {
		zfree(ZV_NAMEI, path);
	}

	return error;
}
6359
/*
 * symlink() system call: create uap->link as a symbolic link with
 * contents uap->path, relative to the current working directory.
 */
int
symlink(__unused proc_t p, struct symlink_args *uap, __unused int32_t *retval)
{
	return symlinkat_internal(vfs_context_current(), uap->path, AT_FDCWD,
	    uap->link, UIO_USERSPACE);
}
6366
/*
 * symlinkat() system call: like symlink(), but the new link's path
 * (uap->path2) is resolved relative to the directory fd uap->fd.
 */
int
symlinkat(__unused proc_t p, struct symlinkat_args *uap,
    __unused int32_t *retval)
{
	return symlinkat_internal(vfs_context_current(), uap->path1, uap->fd,
	    uap->path2, UIO_USERSPACE);
}
6374
6375 /*
6376 * Delete a whiteout from the filesystem.
6377 * No longer supported.
6378 */
/*
 * undelete() system call: whiteout removal is no longer supported, so
 * this unconditionally fails with ENOTSUP.
 */
int
undelete(__unused proc_t p, __unused struct undelete_args *uap, __unused int32_t *retval)
{
	return ENOTSUP;
}
6384
6385 /*
6386 * Delete a name from the filesystem.
6387 */
6388 /* ARGSUSED */
/*
 * Common implementation for unlink(2)/unlinkat(2)/unlink1(): delete
 * the name at 'path_arg', resolved relative to 'start_dvp' (which, if
 * non-NULL, trumps 'fd').  'unlink_flags' carries VNODE_REMOVE_*
 * modifiers.  Supports both the classic lookup-then-VNOP_REMOVE path
 * and compound-remove file systems ("batched"), including the
 * EKEEPLOOKING continuation protocol and bounded ENOENT redrives for
 * racing hardlink lookups.  The nameidata/vnode_attr/fse_info state is
 * heap-allocated to keep kernel stack usage down.
 */
static int
unlinkat_internal(vfs_context_t ctx, int fd, vnode_t start_dvp,
    user_addr_t path_arg, enum uio_seg segflg, int unlink_flags)
{
	struct {
		struct nameidata nd;
#if CONFIG_FSE
		struct vnode_attr va;
		fse_info finfo;
#endif
	} *__unlink_data;
	struct nameidata *ndp;
	vnode_t vp, dvp;
	int error;
	struct componentname *cnp;
	char *path = NULL;
	char *no_firmlink_path = NULL;
	int len_path = 0;
	int len_no_firmlink_path = 0;
	int flags;
	int need_event;
	int has_listeners;
	int truncated_path;
	int truncated_no_firmlink_path;
	int batched;
	struct vnode_attr *vap;
	vnode_t locked_vp = NULLVP;
	int do_retry;
	int retry_count = 0;
	int cn_flags;
	int nofollow_any = 0;
	int resolve_beneath = 0;

	cn_flags = LOCKPARENT;
	if (!(unlink_flags & VNODE_REMOVE_NO_AUDIT_PATH)) {
		cn_flags |= AUDITVNPATH1;
	}
	/* Translate resolution modifiers into namei flags and strip them. */
	if (unlink_flags & VNODE_REMOVE_NOFOLLOW_ANY) {
		nofollow_any = NAMEI_NOFOLLOW_ANY;
		unlink_flags &= ~VNODE_REMOVE_NOFOLLOW_ANY;
	}
	if (unlink_flags & VNODE_REMOVE_RESOLVE_BENEATH) {
		resolve_beneath = NAMEI_RESOLVE_BENEATH;
		unlink_flags &= ~VNODE_REMOVE_RESOLVE_BENEATH;
	}
	/* If a starting dvp is passed, it trumps any fd passed. */
	if (start_dvp) {
		cn_flags |= USEDVP;
	}

#if NAMEDRSRCFORK
	/* unlink or delete is allowed on rsrc forks and named streams */
	cn_flags |= CN_ALLOWRSRCFORK;
#endif

	__unlink_data = kalloc_type(typeof(*__unlink_data), Z_WAITOK);
	ndp = &__unlink_data->nd;
#if CONFIG_FSE
	fse_info *finfop = &__unlink_data->finfo;
#endif

retry:
	/* Reset all per-attempt state before each (re)try. */
	do_retry = 0;
	flags = 0;
	need_event = 0;
	has_listeners = 0;
	truncated_path = 0;
	truncated_no_firmlink_path = 0;
	vap = NULL;

	NDINIT(ndp, DELETE, OP_UNLINK, cn_flags, segflg, path_arg, ctx);

	ndp->ni_dvp = start_dvp;
	ndp->ni_flag |= NAMEI_COMPOUNDREMOVE | nofollow_any | resolve_beneath;
	cnp = &ndp->ni_cnd;

continue_lookup:
	error = nameiat(ndp, fd);
	if (error) {
		goto early_out;
	}

	dvp = ndp->ni_dvp;
	vp = ndp->ni_vp;

	/* With Carbon delete semantics, busy files cannot be deleted */
	if (unlink_flags & VNODE_REMOVE_NODELETEBUSY) {
		flags |= VNODE_REMOVE_NODELETEBUSY;
	}

	/* Skip any potential upcalls if told to. */
	if (unlink_flags & VNODE_REMOVE_SKIP_NAMESPACE_EVENT) {
		flags |= VNODE_REMOVE_SKIP_NAMESPACE_EVENT;
	}

	/* Update speculative telemetry with system discarded use state */
	if (unlink_flags & VNODE_REMOVE_SYSTEM_DISCARDED) {
		flags |= VNODE_REMOVE_SYSTEM_DISCARDED;
	}

	if (vp) {
		batched = vnode_compound_remove_available(vp);
		/*
		 * The root of a mounted filesystem cannot be deleted.
		 */
		if ((vp->v_flag & VROOT) || (dvp->v_mount != vp->v_mount)) {
			error = EBUSY;
			goto out;
		}

#if DEVELOPMENT || DEBUG
		/*
		 * XXX VSWAP: Check for entitlements or special flag here
		 * so we can restrict access appropriately.
		 */
#else /* DEVELOPMENT || DEBUG */

		/* Swap-backing files may only be removed by the kernel. */
		if (vnode_isswap(vp) && (ctx != vfs_context_kernel())) {
			error = EPERM;
			goto out;
		}
#endif /* DEVELOPMENT || DEBUG */

		if (!batched) {
			/* Serialize against concurrent link/unlink of 'vp'. */
			vnode_link_lock(vp);
			locked_vp = vp;
			error = vn_authorize_unlink(dvp, vp, cnp, ctx, NULL);
			if (error) {
				if (error == ENOENT) {
					/* Racing hardlink lookup; redrive (bounded). */
					if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
						do_retry = 1;
						retry_count++;
					}
				}
				vnode_link_unlock(vp);
				locked_vp = NULLVP;
				goto out;
			}
		}
	} else {
		batched = 1;

		if (!vnode_compound_remove_available(dvp)) {
			panic("No vp, but no compound remove?");
		}
	}

#if CONFIG_FSE
	need_event = need_fsevent(FSE_DELETE, dvp);
	if (need_event) {
		if (!batched) {
			if ((vp->v_flag & VISHARDLINK) == 0) {
				/* XXX need to get these data in batched VNOP */
				get_fse_info(vp, finfop, ctx);
			}
		} else {
			error =
			    vfs_get_notify_attributes(&__unlink_data->va);
			if (error) {
				goto out;
			}

			vap = &__unlink_data->va;
		}
	}
#endif
	has_listeners = kauth_authorize_fileop_has_listeners();
	/* Path strings are only built if someone will consume them. */
	if (need_event || has_listeners) {
		if (path == NULL) {
			GET_PATH(path);
		}
		len_path = safe_getpath(dvp, ndp->ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path);
		if (no_firmlink_path == NULL) {
			GET_PATH(no_firmlink_path);
		}
		len_no_firmlink_path = safe_getpath_no_firmlink(dvp, ndp->ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);
	}

#if NAMEDRSRCFORK
	if (ndp->ni_cnd.cn_flags & CN_WANTSRSRCFORK) {
		error = vnode_removenamedstream(dvp, vp, XATTR_RESOURCEFORK_NAME, 0, ctx);
	} else
#endif
	{
#if CONFIG_FILE_LEASES
		/* Removing an entry modifies the directory; break any dir lease. */
		vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

		error = vn_remove(dvp, &ndp->ni_vp, ndp, flags, vap, ctx);
		vp = ndp->ni_vp;
		if (error == EKEEPLOOKING) {
			if (!batched) {
				panic("EKEEPLOOKING, but not a filesystem that supports compound VNOPs?");
			}

			if ((ndp->ni_flag & NAMEI_CONTLOOKUP) == 0) {
				panic("EKEEPLOOKING, but continue flag not set?");
			}

			if (vnode_isdir(vp)) {
				error = EISDIR;
				goto out;
			}
			goto continue_lookup;
		} else if (error == ENOENT && batched) {
			if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
				/*
				 * For compound VNOPs, the authorization callback may
				 * return ENOENT in case of racing hardlink lookups
				 * hitting the name cache, redrive the lookup.
				 */
				do_retry = 1;
				retry_count += 1;
				goto out;
			}
		}
	}

	/*
	 * Call out to allow 3rd party notification of delete.
	 * Ignore result of kauth_authorize_fileop call.
	 */
	if (!error) {
		if (has_listeners) {
			kauth_authorize_fileop(vfs_context_ucred(ctx),
			    KAUTH_FILEOP_DELETE,
			    (uintptr_t)vp,
			    (uintptr_t)path);
		}

		if (vp->v_flag & VISHARDLINK) {
			//
			// if a hardlink gets deleted we want to blow away the
			// v_parent link because the path that got us to this
			// instance of the link is no longer valid. this will
			// force the next call to get the path to ask the file
			// system instead of just following the v_parent link.
			//
			vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
		}

#if CONFIG_FSE
		if (need_event) {
			if (vp->v_flag & VISHARDLINK) {
				get_fse_info(vp, finfop, ctx);
			} else if (vap) {
				vnode_get_fse_info_from_vap(vp, finfop, vap);
			}
			if (truncated_path) {
				finfop->mode |= FSE_TRUNCATED_PATH;
			}
			add_fsevent(FSE_DELETE, ctx,
			    FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
			    FSE_ARG_FINFO, finfop,
			    FSE_ARG_DONE);
		}
#endif

#if CONFIG_MACF
		mac_vnode_notify_unlink(ctx, dvp, vp, cnp);
#endif
	}

out:
	/* Drop the link lock if an error path left it held. */
	if (locked_vp) {
		assert(locked_vp == vp);
		vnode_link_unlock(locked_vp);
		locked_vp = NULLVP;
	}

	if (path != NULL) {
		RELEASE_PATH(path);
		path = NULL;
	}

	if (no_firmlink_path != NULL) {
		RELEASE_PATH(no_firmlink_path);
		no_firmlink_path = NULL;
	}
#if NAMEDRSRCFORK
	/* recycle the deleted rsrc fork vnode to force a reclaim, which
	 * will cause its shadow file to go away if necessary.
	 */
	if (vp && (vnode_isnamedstream(vp)) &&
	    (vp->v_parent != NULLVP) &&
	    vnode_isshadow(vp)) {
		vnode_recycle(vp);
	}
#endif
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(ndp);
	vnode_put(dvp);
	if (vp) {
		vnode_put(vp);
	}

	if (do_retry) {
		goto retry;
	}

early_out:
	kfree_type(typeof(*__unlink_data), __unlink_data);
	return error;
}
6696
6697 int
unlink1(vfs_context_t ctx,vnode_t start_dvp,user_addr_t path_arg,enum uio_seg segflg,int unlink_flags)6698 unlink1(vfs_context_t ctx, vnode_t start_dvp, user_addr_t path_arg,
6699 enum uio_seg segflg, int unlink_flags)
6700 {
6701 return unlinkat_internal(ctx, AT_FDCWD, start_dvp, path_arg, segflg,
6702 unlink_flags);
6703 }
6704
6705 /*
6706 * Delete a name from the filesystem using Carbon semantics.
6707 */
6708 int
delete(__unused proc_t p,struct delete_args * uap,__unused int32_t * retval)6709 delete(__unused proc_t p, struct delete_args *uap, __unused int32_t *retval)
6710 {
6711 return unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
6712 uap->path, UIO_USERSPACE, VNODE_REMOVE_NODELETEBUSY);
6713 }
6714
6715 /*
6716 * Delete a name from the filesystem using POSIX semantics.
6717 */
6718 int
unlink(__unused proc_t p,struct unlink_args * uap,__unused int32_t * retval)6719 unlink(__unused proc_t p, struct unlink_args *uap, __unused int32_t *retval)
6720 {
6721 return unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
6722 uap->path, UIO_USERSPACE, 0);
6723 }
6724
6725 int
unlinkat(__unused proc_t p,struct unlinkat_args * uap,__unused int32_t * retval)6726 unlinkat(__unused proc_t p, struct unlinkat_args *uap, __unused int32_t *retval)
6727 {
6728 int unlink_flags = 0;
6729
6730 if (uap->flag & ~(AT_REMOVEDIR | AT_REMOVEDIR_DATALESS | AT_SYMLINK_NOFOLLOW_ANY | AT_SYSTEM_DISCARDED | AT_RESOLVE_BENEATH | AT_NODELETEBUSY)) {
6731 return EINVAL;
6732 }
6733
6734 if (uap->flag & AT_SYMLINK_NOFOLLOW_ANY) {
6735 unlink_flags |= VNODE_REMOVE_NOFOLLOW_ANY;
6736 }
6737 if (uap->flag & AT_RESOLVE_BENEATH) {
6738 unlink_flags |= VNODE_REMOVE_RESOLVE_BENEATH;
6739 }
6740
6741 if (uap->flag & AT_SYSTEM_DISCARDED) {
6742 unlink_flags |= VNODE_REMOVE_SYSTEM_DISCARDED;
6743 }
6744
6745 if (uap->flag & AT_NODELETEBUSY) {
6746 unlink_flags |= VNODE_REMOVE_NODELETEBUSY;
6747 }
6748
6749 if (uap->flag & (AT_REMOVEDIR | AT_REMOVEDIR_DATALESS)) {
6750 if (uap->flag & AT_REMOVEDIR_DATALESS) {
6751 unlink_flags |= VNODE_REMOVE_DATALESS_DIR;
6752 }
6753 return rmdirat_internal(vfs_context_current(), uap->fd,
6754 uap->path, UIO_USERSPACE, unlink_flags);
6755 } else {
6756 return unlinkat_internal(vfs_context_current(), uap->fd,
6757 NULLVP, uap->path, UIO_USERSPACE, unlink_flags);
6758 }
6759 }
6760
6761 /*
6762 * Reposition read/write file offset.
6763 */
int
lseek(proc_t p, struct lseek_args *uap, off_t *retval)
{
	struct fileproc *fp;
	vnode_t vp;
	struct vfs_context *ctx;
	off_t offset = uap->offset, file_size;
	int error;

	if ((error = fp_getfvp(p, uap->fd, &fp, &vp))) {
		/* fd has no backing vnode; lseek(2) reports ESPIPE for that */
		if (error == ENOTSUP) {
			return ESPIPE;
		}
		return error;
	}
	if (
		// rdar://3837316: Seeking a pipe is disallowed by POSIX.
		vnode_isfifo(vp)
		// rdar://120750171: Seeking a TTY is undefined and should be denied.
		|| vnode_istty(vp)
		) {
		file_drop(uap->fd);
		return ESPIPE;
	}


	ctx = vfs_context_current();
#if CONFIG_MACF
	/*
	 * lseek(fd, 0, SEEK_CUR) merely reads the current offset; any other
	 * combination may change it, so a different MAC check is applied.
	 */
	if (uap->whence == L_INCR && uap->offset == 0) {
		error = mac_file_check_get_offset(vfs_context_ucred(ctx),
		    fp->fp_glob);
	} else {
		error = mac_file_check_change_offset(vfs_context_ucred(ctx),
		    fp->fp_glob);
	}
	if (error) {
		file_drop(uap->fd);
		return error;
	}
#endif
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	/* Translate "whence" into an absolute file offset. */
	switch (uap->whence) {
	case L_INCR:
		offset += fp->fp_glob->fg_offset;
		break;
	case L_XTND:
		if ((error = vnode_size(vp, &file_size, ctx)) != 0) {
			break;
		}
		offset += file_size;
		break;
	case L_SET:
		break;
	case SEEK_HOLE:
		/* the filesystem resolves the next hole/data region in place */
		error = VNOP_IOCTL(vp, FSIOC_FIOSEEKHOLE, (caddr_t)&offset, 0, ctx);
		break;
	case SEEK_DATA:
		error = VNOP_IOCTL(vp, FSIOC_FIOSEEKDATA, (caddr_t)&offset, 0, ctx);
		break;
	default:
		error = EINVAL;
	}
	if (error == 0) {
		/* a positive delta that produced a negative sum wrapped around */
		if (uap->offset > 0 && offset < 0) {
			/* Incremented/relative move past max size */
			error = EOVERFLOW;
		} else {
			/*
			 * Allow negative offsets on character devices, per
			 * POSIX 1003.1-2001. Most likely for writing disk
			 * labels.
			 */
			if (offset < 0 && vp->v_type != VCHR) {
				/* Decremented/relative move before start */
				error = EINVAL;
			} else {
				/* Success */
				fp->fp_glob->fg_offset = offset;
				*retval = fp->fp_glob->fg_offset;
			}
		}
	}

	/*
	 * An lseek can affect whether data is "available to read."  Use
	 * hint of NOTE_NONE so no EVFILT_VNODE events fire
	 */
	post_event_if_success(vp, error, NOTE_NONE);
	(void)vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
6860
6861
6862 /*
6863 * Check access permissions.
6864 *
6865 * Returns: 0 Success
6866 * vnode_authorize:???
6867 */
6868 static int
access1(vnode_t vp,vnode_t dvp,int uflags,vfs_context_t ctx)6869 access1(vnode_t vp, vnode_t dvp, int uflags, vfs_context_t ctx)
6870 {
6871 kauth_action_t action;
6872 int error;
6873
6874 /*
6875 * If just the regular access bits, convert them to something
6876 * that vnode_authorize will understand.
6877 */
6878 if (!(uflags & _ACCESS_EXTENDED_MASK)) {
6879 action = 0;
6880 if (uflags & R_OK) {
6881 action |= KAUTH_VNODE_READ_DATA; /* aka KAUTH_VNODE_LIST_DIRECTORY */
6882 }
6883 if (uflags & W_OK) {
6884 if (vnode_isdir(vp)) {
6885 action |= KAUTH_VNODE_ADD_FILE |
6886 KAUTH_VNODE_ADD_SUBDIRECTORY;
6887 /* might want delete rights here too */
6888 } else {
6889 action |= KAUTH_VNODE_WRITE_DATA;
6890 }
6891 }
6892 if (uflags & X_OK) {
6893 if (vnode_isdir(vp)) {
6894 action |= KAUTH_VNODE_SEARCH;
6895 } else {
6896 action |= KAUTH_VNODE_EXECUTE;
6897 }
6898 }
6899 } else {
6900 /* take advantage of definition of uflags */
6901 action = uflags >> 8;
6902 }
6903
6904 #if CONFIG_MACF
6905 error = mac_vnode_check_access(ctx, vp, uflags);
6906 if (error) {
6907 return error;
6908 }
6909 #endif /* MAC */
6910
6911 /* action == 0 means only check for existence */
6912 if (action != 0) {
6913 error = vnode_authorize(vp, dvp, action | KAUTH_VNODE_ACCESS, ctx);
6914 } else {
6915 error = 0;
6916 }
6917
6918 return error;
6919 }
6920
6921
6922
6923 /*
6924 * access_extended: Check access permissions in bulk.
6925 *
6926 * Description: uap->entries Pointer to an array of accessx
6927 * descriptor structs, plus one or
6928 * more NULL terminated strings (see
6929 * "Notes" section below).
6930 * uap->size Size of the area pointed to by
6931 * uap->entries.
6932 * uap->results Pointer to the results array.
6933 *
6934 * Returns: 0 Success
6935 * ENOMEM Insufficient memory
6936 * EINVAL Invalid arguments
6937 * namei:EFAULT Bad address
6938 * namei:ENAMETOOLONG Filename too long
6939 * namei:ENOENT No such file or directory
6940 * namei:ELOOP Too many levels of symbolic links
6941 * namei:EBADF Bad file descriptor
6942 * namei:ENOTDIR Not a directory
6943 * namei:???
6944 * access1:
6945 *
6946 * Implicit returns:
6947 * uap->results Array contents modified
6948 *
6949 * Notes: The uap->entries are structured as an arbitrary length array
6950 * of accessx descriptors, followed by one or more NULL terminated
6951 * strings
6952 *
6953 * struct accessx_descriptor[0]
6954 * ...
6955 * struct accessx_descriptor[n]
6956 * char name_data[0];
6957 *
6958 * We determine the entry count by walking the buffer containing
6959 * the uap->entries argument descriptor. For each descriptor we
6960 * see, the valid values for the offset ad_name_offset will be
6961 * in the byte range:
6962 *
6963 * [ uap->entries + sizeof(struct accessx_descriptor) ]
6964 * to
6965 * [ uap->entries + uap->size - 2 ]
6966 *
6967 * since we must have at least one string, and the string must
6968 * be at least one character plus the NULL terminator in length.
6969 *
6970 * XXX: Need to support the check-as uid argument
6971 */
int
access_extended(__unused proc_t p, struct access_extended_args *uap, __unused int32_t *retval)
{
	struct accessx_descriptor *input = NULL;
	errno_t *result = NULL;
	errno_t error = 0;
	int wantdelete = 0;
	size_t desc_max, desc_actual = 0;
	unsigned int i, j;
	struct vfs_context context;
	struct nameidata nd;
	int niopts;
	vnode_t vp = NULL;
	vnode_t dvp = NULL;
#define ACCESSX_MAX_DESCR_ON_STACK 10
	struct accessx_descriptor stack_input[ACCESSX_MAX_DESCR_ON_STACK];

	/* NULL so the "out:" path can tell whether a cred was ever taken */
	context.vc_ucred = NULL;

	/*
	 * Validate parameters; if valid, copy the descriptor array and string
	 * arguments into local memory. Before proceeding, the following
	 * conditions must have been met:
	 *
	 * o The total size is not permitted to exceed ACCESSX_MAX_TABLESIZE
	 * o There must be sufficient room in the request for at least one
	 *   descriptor and a one byte NUL terminated string.
	 * o The allocation of local storage must not fail.
	 */
	if (uap->size > ACCESSX_MAX_TABLESIZE) {
		return ENOMEM;
	}
	if (uap->size < (sizeof(struct accessx_descriptor) + 2)) {
		return EINVAL;
	}
	/* small requests are served from the stack buffer, large ones heap */
	if (uap->size <= sizeof(stack_input)) {
		input = stack_input;
	} else {
		input = kalloc_data(uap->size, Z_WAITOK);
		if (input == NULL) {
			error = ENOMEM;
			goto out;
		}
	}
	error = copyin(uap->entries, input, uap->size);
	if (error) {
		goto out;
	}

	AUDIT_ARG(opaque, input, uap->size);

	/*
	 * Force NUL termination of the copyin buffer to avoid namei() running
	 * off the end.  If the caller passes us bogus data, they may get a
	 * bogus result.
	 */
	((char *)input)[uap->size - 1] = 0;

	/*
	 * Access is defined as checking against the process' real identity,
	 * even if operations are checking the effective identity.  This
	 * requires that we use a local vfs context.
	 */
	context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
	context.vc_thread = current_thread();

	/*
	 * Find out how many entries we have, so we can allocate the result
	 * array by walking the list and adjusting the count downward by the
	 * earliest string offset we see.
	 */
	desc_max = (uap->size - 2) / sizeof(struct accessx_descriptor);
	desc_actual = desc_max;
	for (i = 0; i < desc_actual; i++) {
		/*
		 * Take the offset to the name string for this entry and
		 * convert to an input array index, which would be one off
		 * the end of the array if this entry was the lowest-addressed
		 * name string.
		 */
		j = input[i].ad_name_offset / sizeof(struct accessx_descriptor);

		/*
		 * An offset greater than the max allowable offset is an error.
		 * It is also an error for any valid entry to point
		 * to a location prior to the end of the current entry, if
		 * it's not a reference to the string of the previous entry.
		 */
		if (j > desc_max || (j != 0 && j <= i)) {
			error = EINVAL;
			goto out;
		}

		/* Also do not let ad_name_offset point to something beyond the size of the input */
		if (input[i].ad_name_offset >= uap->size) {
			error = EINVAL;
			goto out;
		}

		/*
		 * An offset of 0 means use the previous descriptor's offset;
		 * this is used to chain multiple requests for the same file
		 * to avoid multiple lookups.
		 */
		if (j == 0) {
			/* This is not valid for the first entry */
			if (i == 0) {
				error = EINVAL;
				goto out;
			}
			continue;
		}

		/*
		 * If the offset of the string for this descriptor is before
		 * what we believe is the current actual last descriptor,
		 * then we need to adjust our estimate downward; this permits
		 * the string table following the last descriptor to be out
		 * of order relative to the descriptor list.
		 */
		if (j < desc_actual) {
			desc_actual = j;
		}
	}

	/*
	 * We limit the actual number of descriptors we are willing to process
	 * to a hard maximum of ACCESSX_MAX_DESCRIPTORS.  If the number being
	 * requested does not exceed this limit, allocate the (zeroed) result
	 * array; otherwise fail with ENOMEM.
	 */
	if (desc_actual > ACCESSX_MAX_DESCRIPTORS) {
		error = ENOMEM;
		goto out;
	}
	result = kalloc_data(desc_actual * sizeof(errno_t), Z_WAITOK | Z_ZERO);
	if (result == NULL) {
		error = ENOMEM;
		goto out;
	}

	/*
	 * Do the work by iterating over the descriptor entries we know to
	 * at least appear to contain valid data.
	 */
	error = 0;
	for (i = 0; i < desc_actual; i++) {
		/*
		 * If the ad_name_offset is 0, then we use the previous
		 * results to make the check; otherwise, we are looking up
		 * a new file name.
		 */
		if (input[i].ad_name_offset != 0) {
			/* discard old vnodes */
			if (vp) {
				vnode_put(vp);
				vp = NULL;
			}
			if (dvp) {
				vnode_put(dvp);
				dvp = NULL;
			}

			/*
			 * Scan forward in the descriptor list to see if we
			 * need the parent vnode.  We will need it if we are
			 * deleting, since we must have rights  to remove
			 * entries in the parent directory, as well as the
			 * rights to delete the object itself.
			 */
			wantdelete = input[i].ad_flags & _DELETE_OK;
			for (j = i + 1; (j < desc_actual) && (input[j].ad_name_offset == 0); j++) {
				if (input[j].ad_flags & _DELETE_OK) {
					wantdelete = 1;
				}
			}

			niopts = FOLLOW | AUDITVNPATH1;

			/* need parent for vnode_authorize for deletion test */
			if (wantdelete) {
				niopts |= WANTPARENT;
			}

			/* do the lookup */
			NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, UIO_SYSSPACE,
			    CAST_USER_ADDR_T(((const char *)input) + input[i].ad_name_offset),
			    &context);
			error = namei(&nd);
			if (!error) {
				vp = nd.ni_vp;
				if (wantdelete) {
					dvp = nd.ni_dvp;
				}
			}
			nameidone(&nd);
		}

		/*
		 * Handle lookup errors.
		 */
		switch (error) {
		case ENOENT:
		case EACCES:
		case EPERM:
		case ENOTDIR:
			/* per-entry failure: record it and keep going */
			result[i] = error;
			break;
		case 0:
			/* run this access check */
			result[i] = access1(vp, dvp, input[i].ad_flags, &context);
			break;
		default:
			/* fatal lookup error */

			goto out;
		}
	}

	AUDIT_ARG(data, result, sizeof(errno_t), desc_actual);

	/* copy out results */
	error = copyout(result, uap->results, desc_actual * sizeof(errno_t));

out:
	if (input && input != stack_input) {
		kfree_data(input, uap->size);
	}
	if (result) {
		kfree_data(result, desc_actual * sizeof(errno_t));
	}
	if (vp) {
		vnode_put(vp);
	}
	if (dvp) {
		vnode_put(dvp);
	}
	/* only unref if the real-identity cred was actually taken above */
	if (IS_VALID_CRED(context.vc_ucred)) {
		kauth_cred_unref(&context.vc_ucred);
	}
	return error;
}
7213
7214
7215 /*
7216 * Returns: 0 Success
7217 * namei:EFAULT Bad address
7218 * namei:ENAMETOOLONG Filename too long
7219 * namei:ENOENT No such file or directory
7220 * namei:ELOOP Too many levels of symbolic links
7221 * namei:EBADF Bad file descriptor
7222 * namei:ENOTDIR Not a directory
7223 * namei:???
7224 * access1:
7225 */
static int
faccessat_internal(vfs_context_t ctx, int fd, user_addr_t path, int amode,
    int flag, enum uio_seg segflg)
{
	int error;
	struct nameidata nd;
	int niopts;
	struct vfs_context context;
#if NAMEDRSRCFORK
	int is_namedstream = 0;
#endif

	/*
	 * Unless the AT_EACCESS option is used, Access is defined as checking
	 * against the process' real identity, even if operations are checking
	 * the effective identity.  So we need to tweak the credential
	 * in the context for that case.
	 */
	if (!(flag & AT_EACCESS)) {
		/* takes a cred reference; released at "out" below */
		context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
	} else {
		/* borrow the caller's cred; no reference taken, none released */
		context.vc_ucred = ctx->vc_ucred;
	}
	context.vc_thread = ctx->vc_thread;


	niopts = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY) ? NOFOLLOW : FOLLOW) | AUDITVNPATH1;
	/* need parent for vnode_authorize for deletion test */
	if (amode & _DELETE_OK) {
		niopts |= WANTPARENT;
	}
	NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, segflg,
	    path, &context);
	if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}
	if (flag & AT_RESOLVE_BENEATH) {
		nd.ni_flag |= NAMEI_RESOLVE_BENEATH;
	}

#if NAMEDRSRCFORK
	/* access(F_OK) calls are allowed for resource forks. */
	if (amode == F_OK) {
		nd.ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
	}
#endif
	error = nameiat(&nd, fd);
	if (error) {
		goto out;
	}

#if NAMEDRSRCFORK
	/* Grab reference on the shadow stream file vnode to
	 * force an inactive on release which will mark it
	 * for recycle.
	 */
	if (vnode_isnamedstream(nd.ni_vp) &&
	    (nd.ni_vp->v_parent != NULLVP) &&
	    vnode_isshadow(nd.ni_vp)) {
		is_namedstream = 1;
		vnode_ref(nd.ni_vp);
	}
#endif

	error = access1(nd.ni_vp, nd.ni_dvp, amode, &context);

#if NAMEDRSRCFORK
	if (is_namedstream) {
		vnode_rele(nd.ni_vp);
	}
#endif

	vnode_put(nd.ni_vp);
	/* ni_dvp is only held when WANTPARENT was requested above */
	if (amode & _DELETE_OK) {
		vnode_put(nd.ni_dvp);
	}
	nameidone(&nd);

out:
	/* release the cred only if we copied one (the !AT_EACCESS case) */
	if (!(flag & AT_EACCESS)) {
		kauth_cred_unref(&context.vc_ucred);
	}
	return error;
}
7310
7311 int
access(__unused proc_t p,struct access_args * uap,__unused int32_t * retval)7312 access(__unused proc_t p, struct access_args *uap, __unused int32_t *retval)
7313 {
7314 return faccessat_internal(vfs_context_current(), AT_FDCWD,
7315 uap->path, uap->flags, 0, UIO_USERSPACE);
7316 }
7317
7318 int
faccessat(__unused proc_t p,struct faccessat_args * uap,__unused int32_t * retval)7319 faccessat(__unused proc_t p, struct faccessat_args *uap,
7320 __unused int32_t *retval)
7321 {
7322 if (uap->flag & ~(AT_EACCESS | AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY | AT_RESOLVE_BENEATH)) {
7323 return EINVAL;
7324 }
7325
7326 return faccessat_internal(vfs_context_current(), uap->fd,
7327 uap->path, uap->amode, uap->flag, UIO_USERSPACE);
7328 }
7329
7330 /*
7331 * Returns: 0 Success
7332 * EFAULT
7333 * copyout:EFAULT
7334 * namei:???
7335 * vn_stat:???
7336 */
7337 static int
fstatat_internal(vfs_context_t ctx,user_addr_t path,user_addr_t ub,user_addr_t xsecurity,user_addr_t xsecurity_size,int isstat64,enum uio_seg segflg,int fd,int flag)7338 fstatat_internal(vfs_context_t ctx, user_addr_t path, user_addr_t ub,
7339 user_addr_t xsecurity, user_addr_t xsecurity_size, int isstat64,
7340 enum uio_seg segflg, int fd, int flag)
7341 {
7342 struct nameidata *ndp = NULL;
7343 int follow;
7344 union {
7345 struct stat sb;
7346 struct stat64 sb64;
7347 } source = {};
7348 union {
7349 struct user64_stat user64_sb;
7350 struct user32_stat user32_sb;
7351 struct user64_stat64 user64_sb64;
7352 struct user32_stat64 user32_sb64;
7353 } dest = {};
7354 caddr_t sbp;
7355 int error, my_size;
7356 kauth_filesec_t fsec = KAUTH_FILESEC_NONE;
7357 size_t xsecurity_bufsize;
7358 void * statptr;
7359 struct fileproc *fp = NULL;
7360 int needsrealdev = 0;
7361
7362 follow = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) ? NOFOLLOW : FOLLOW;
7363 ndp = kalloc_type(struct nameidata, Z_WAITOK);
7364 NDINIT(ndp, LOOKUP, OP_GETATTR, follow | AUDITVNPATH1,
7365 segflg, path, ctx);
7366 if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
7367 ndp->ni_flag |= NAMEI_NOFOLLOW_ANY;
7368 }
7369 if (flag & AT_RESOLVE_BENEATH) {
7370 ndp->ni_flag |= NAMEI_RESOLVE_BENEATH;
7371 }
7372
7373 #if NAMEDRSRCFORK
7374 int is_namedstream = 0;
7375 /* stat calls are allowed for resource forks. */
7376 ndp->ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
7377 #endif
7378
7379 if (flag & AT_FDONLY) {
7380 vnode_t fvp;
7381
7382 error = fp_getfvp(vfs_context_proc(ctx), fd, &fp, &fvp);
7383 if (error) {
7384 goto out;
7385 }
7386 if ((error = vnode_getwithref(fvp))) {
7387 file_drop(fd);
7388 goto out;
7389 }
7390 ndp->ni_vp = fvp;
7391 } else {
7392 error = nameiat(ndp, fd);
7393 if (error) {
7394 goto out;
7395 }
7396 }
7397
7398 statptr = (void *)&source;
7399
7400 #if NAMEDRSRCFORK
7401 /* Grab reference on the shadow stream file vnode to
7402 * force an inactive on release which will mark it
7403 * for recycle.
7404 */
7405 if (vnode_isnamedstream(ndp->ni_vp) &&
7406 (ndp->ni_vp->v_parent != NULLVP) &&
7407 vnode_isshadow(ndp->ni_vp)) {
7408 is_namedstream = 1;
7409 vnode_ref(ndp->ni_vp);
7410 }
7411 #endif
7412
7413 needsrealdev = flag & AT_REALDEV ? 1 : 0;
7414 if (fp && (xsecurity == USER_ADDR_NULL)) {
7415 /*
7416 * If the caller has the file open, and is not
7417 * requesting extended security information, we are
7418 * going to let them get the basic stat information.
7419 */
7420 error = vn_stat_noauth(ndp->ni_vp, statptr, NULL, isstat64, needsrealdev, ctx,
7421 fp->fp_glob->fg_cred);
7422 } else {
7423 error = vn_stat(ndp->ni_vp, statptr, (xsecurity != USER_ADDR_NULL ? &fsec : NULL),
7424 isstat64, needsrealdev, ctx);
7425 }
7426
7427 #if NAMEDRSRCFORK
7428 if (is_namedstream) {
7429 vnode_rele(ndp->ni_vp);
7430 }
7431 #endif
7432 vnode_put(ndp->ni_vp);
7433 nameidone(ndp);
7434
7435 if (fp) {
7436 file_drop(fd);
7437 fp = NULL;
7438 }
7439
7440 if (error) {
7441 goto out;
7442 }
7443 /* Zap spare fields */
7444 if (isstat64 != 0) {
7445 source.sb64.st_lspare = 0;
7446 source.sb64.st_qspare[0] = 0LL;
7447 source.sb64.st_qspare[1] = 0LL;
7448 if (vfs_context_is64bit(ctx)) {
7449 munge_user64_stat64(&source.sb64, &dest.user64_sb64);
7450 my_size = sizeof(dest.user64_sb64);
7451 sbp = (caddr_t)&dest.user64_sb64;
7452 } else {
7453 munge_user32_stat64(&source.sb64, &dest.user32_sb64);
7454 my_size = sizeof(dest.user32_sb64);
7455 sbp = (caddr_t)&dest.user32_sb64;
7456 }
7457 /*
7458 * Check if we raced (post lookup) against the last unlink of a file.
7459 */
7460 if ((source.sb64.st_nlink == 0) && S_ISREG(source.sb64.st_mode)) {
7461 source.sb64.st_nlink = 1;
7462 }
7463 } else {
7464 source.sb.st_lspare = 0;
7465 source.sb.st_qspare[0] = 0LL;
7466 source.sb.st_qspare[1] = 0LL;
7467 if (vfs_context_is64bit(ctx)) {
7468 munge_user64_stat(&source.sb, &dest.user64_sb);
7469 my_size = sizeof(dest.user64_sb);
7470 sbp = (caddr_t)&dest.user64_sb;
7471 } else {
7472 munge_user32_stat(&source.sb, &dest.user32_sb);
7473 my_size = sizeof(dest.user32_sb);
7474 sbp = (caddr_t)&dest.user32_sb;
7475 }
7476
7477 /*
7478 * Check if we raced (post lookup) against the last unlink of a file.
7479 */
7480 if ((source.sb.st_nlink == 0) && S_ISREG(source.sb.st_mode)) {
7481 source.sb.st_nlink = 1;
7482 }
7483 }
7484 if ((error = copyout(sbp, ub, my_size)) != 0) {
7485 goto out;
7486 }
7487
7488 /* caller wants extended security information? */
7489 if (xsecurity != USER_ADDR_NULL) {
7490 /* did we get any? */
7491 if (fsec == KAUTH_FILESEC_NONE) {
7492 if (susize(xsecurity_size, 0) != 0) {
7493 error = EFAULT;
7494 goto out;
7495 }
7496 } else {
7497 /* find the user buffer size */
7498 xsecurity_bufsize = fusize(xsecurity_size);
7499
7500 /* copy out the actual data size */
7501 if (susize(xsecurity_size, KAUTH_FILESEC_COPYSIZE(fsec)) != 0) {
7502 error = EFAULT;
7503 goto out;
7504 }
7505
7506 /* if the caller supplied enough room, copy out to it */
7507 if (xsecurity_bufsize >= KAUTH_FILESEC_COPYSIZE(fsec)) {
7508 error = copyout(fsec, xsecurity, KAUTH_FILESEC_COPYSIZE(fsec));
7509 }
7510 }
7511 }
7512 out:
7513 if (ndp) {
7514 kfree_type(struct nameidata, ndp);
7515 }
7516 if (fsec != KAUTH_FILESEC_NONE) {
7517 kauth_filesec_free(fsec);
7518 }
7519 return error;
7520 }
7521
7522 /*
7523 * stat_extended: Get file status; with extended security (ACL).
7524 *
7525 * Parameters: p (ignored)
7526 * uap User argument descriptor (see below)
7527 * retval (ignored)
7528 *
7529 * Indirect: uap->path Path of file to get status from
7530 * uap->ub User buffer (holds file status info)
7531 * uap->xsecurity ACL to get (extended security)
7532 * uap->xsecurity_size Size of ACL
7533 *
7534 * Returns: 0 Success
7535 * !0 errno value
7536 *
7537 */
7538 int
stat_extended(__unused proc_t p,struct stat_extended_args * uap,__unused int32_t * retval)7539 stat_extended(__unused proc_t p, struct stat_extended_args *uap,
7540 __unused int32_t *retval)
7541 {
7542 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7543 uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
7544 0);
7545 }
7546
7547 /*
7548 * Returns: 0 Success
7549 * fstatat_internal:??? [see fstatat_internal() in this file]
7550 */
7551 int
stat(__unused proc_t p,struct stat_args * uap,__unused int32_t * retval)7552 stat(__unused proc_t p, struct stat_args *uap, __unused int32_t *retval)
7553 {
7554 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7555 0, 0, 0, UIO_USERSPACE, AT_FDCWD, 0);
7556 }
7557
7558 int
stat64(__unused proc_t p,struct stat64_args * uap,__unused int32_t * retval)7559 stat64(__unused proc_t p, struct stat64_args *uap, __unused int32_t *retval)
7560 {
7561 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7562 0, 0, 1, UIO_USERSPACE, AT_FDCWD, 0);
7563 }
7564
7565 /*
7566 * stat64_extended: Get file status; can handle large inode numbers; with extended security (ACL).
7567 *
7568 * Parameters: p (ignored)
7569 * uap User argument descriptor (see below)
7570 * retval (ignored)
7571 *
7572 * Indirect: uap->path Path of file to get status from
7573 * uap->ub User buffer (holds file status info)
7574 * uap->xsecurity ACL to get (extended security)
7575 * uap->xsecurity_size Size of ACL
7576 *
7577 * Returns: 0 Success
7578 * !0 errno value
7579 *
7580 */
7581 int
stat64_extended(__unused proc_t p,struct stat64_extended_args * uap,__unused int32_t * retval)7582 stat64_extended(__unused proc_t p, struct stat64_extended_args *uap, __unused int32_t *retval)
7583 {
7584 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7585 uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
7586 0);
7587 }
7588
7589 /*
7590 * lstat_extended: Get file status; does not follow links; with extended security (ACL).
7591 *
7592 * Parameters: p (ignored)
7593 * uap User argument descriptor (see below)
7594 * retval (ignored)
7595 *
7596 * Indirect: uap->path Path of file to get status from
7597 * uap->ub User buffer (holds file status info)
7598 * uap->xsecurity ACL to get (extended security)
7599 * uap->xsecurity_size Size of ACL
7600 *
7601 * Returns: 0 Success
7602 * !0 errno value
7603 *
7604 */
7605 int
lstat_extended(__unused proc_t p,struct lstat_extended_args * uap,__unused int32_t * retval)7606 lstat_extended(__unused proc_t p, struct lstat_extended_args *uap, __unused int32_t *retval)
7607 {
7608 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7609 uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
7610 AT_SYMLINK_NOFOLLOW);
7611 }
7612
7613 /*
7614 * Get file status; this version does not follow links.
7615 */
7616 int
lstat(__unused proc_t p,struct lstat_args * uap,__unused int32_t * retval)7617 lstat(__unused proc_t p, struct lstat_args *uap, __unused int32_t *retval)
7618 {
7619 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7620 0, 0, 0, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW);
7621 }
7622
7623 int
lstat64(__unused proc_t p,struct lstat64_args * uap,__unused int32_t * retval)7624 lstat64(__unused proc_t p, struct lstat64_args *uap, __unused int32_t *retval)
7625 {
7626 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7627 0, 0, 1, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW);
7628 }
7629
7630 /*
7631 * lstat64_extended: Get file status; can handle large inode numbers; does not
7632 * follow links; with extended security (ACL).
7633 *
7634 * Parameters: p (ignored)
7635 * uap User argument descriptor (see below)
7636 * retval (ignored)
7637 *
7638 * Indirect: uap->path Path of file to get status from
7639 * uap->ub User buffer (holds file status info)
7640 * uap->xsecurity ACL to get (extended security)
7641 * uap->xsecurity_size Size of ACL
7642 *
7643 * Returns: 0 Success
7644 * !0 errno value
7645 *
7646 */
7647 int
lstat64_extended(__unused proc_t p,struct lstat64_extended_args * uap,__unused int32_t * retval)7648 lstat64_extended(__unused proc_t p, struct lstat64_extended_args *uap, __unused int32_t *retval)
7649 {
7650 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7651 uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
7652 AT_SYMLINK_NOFOLLOW);
7653 }
7654
7655 int
fstatat(__unused proc_t p,struct fstatat_args * uap,__unused int32_t * retval)7656 fstatat(__unused proc_t p, struct fstatat_args *uap, __unused int32_t *retval)
7657 {
7658 if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_REALDEV | AT_FDONLY | AT_SYMLINK_NOFOLLOW_ANY | AT_RESOLVE_BENEATH)) {
7659 return EINVAL;
7660 }
7661
7662 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7663 0, 0, 0, UIO_USERSPACE, uap->fd, uap->flag);
7664 }
7665
7666 int
fstatat64(__unused proc_t p,struct fstatat64_args * uap,__unused int32_t * retval)7667 fstatat64(__unused proc_t p, struct fstatat64_args *uap,
7668 __unused int32_t *retval)
7669 {
7670 if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_REALDEV | AT_FDONLY | AT_SYMLINK_NOFOLLOW_ANY | AT_RESOLVE_BENEATH)) {
7671 return EINVAL;
7672 }
7673
7674 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7675 0, 0, 1, UIO_USERSPACE, uap->fd, uap->flag);
7676 }
7677
7678 /*
7679 * Get configurable pathname variables.
7680 *
7681 * Returns: 0 Success
7682 * namei:???
7683 * vn_pathconf:???
7684 *
7685 * Notes: Global implementation constants are intended to be
7686 * implemented in this function directly; all other constants
7687 * are per-FS implementation, and therefore must be handled in
7688 * each respective FS, instead.
7689 *
7690 * XXX We implement some things globally right now that should actually be
7691 * XXX per-FS; we will need to deal with this at some point.
7692 */
7693 /* ARGSUSED */
7694 int
pathconf(__unused proc_t p,struct pathconf_args * uap,int32_t * retval)7695 pathconf(__unused proc_t p, struct pathconf_args *uap, int32_t *retval)
7696 {
7697 int error;
7698 struct nameidata nd;
7699 vfs_context_t ctx = vfs_context_current();
7700
7701 NDINIT(&nd, LOOKUP, OP_PATHCONF, FOLLOW | AUDITVNPATH1,
7702 UIO_USERSPACE, uap->path, ctx);
7703 error = namei(&nd);
7704 if (error) {
7705 return error;
7706 }
7707
7708 error = vn_pathconf(nd.ni_vp, uap->name, retval, ctx);
7709
7710 vnode_put(nd.ni_vp);
7711 nameidone(&nd);
7712 return error;
7713 }
7714
7715 /*
7716 * Return target name of a symbolic link.
7717 */
7718 /* ARGSUSED */
7719 static int
readlinkat_internal(vfs_context_t ctx,int fd,vnode_t lnk_vp,user_addr_t path,enum uio_seg seg,user_addr_t buf,size_t bufsize,enum uio_seg bufseg,int * retval)7720 readlinkat_internal(vfs_context_t ctx, int fd, vnode_t lnk_vp, user_addr_t path,
7721 enum uio_seg seg, user_addr_t buf, size_t bufsize, enum uio_seg bufseg,
7722 int *retval)
7723 {
7724 vnode_t vp;
7725 uio_t auio;
7726 int error;
7727 struct nameidata nd;
7728 UIO_STACKBUF(uio_buf, 1);
7729 bool put_vnode;
7730
7731 if (bufsize > INT32_MAX) {
7732 return EINVAL;
7733 }
7734
7735 if (lnk_vp) {
7736 vp = lnk_vp;
7737 put_vnode = false;
7738 } else {
7739 NDINIT(&nd, LOOKUP, OP_READLINK, NOFOLLOW | AUDITVNPATH1,
7740 seg, path, ctx);
7741
7742 error = nameiat(&nd, fd);
7743 if (error) {
7744 return error;
7745 }
7746 vp = nd.ni_vp;
7747 put_vnode = true;
7748 nameidone(&nd);
7749 }
7750
7751 auio = uio_createwithbuffer(1, 0, bufseg, UIO_READ,
7752 &uio_buf[0], sizeof(uio_buf));
7753 uio_addiov(auio, buf, bufsize);
7754 if (vp->v_type != VLNK) {
7755 error = EINVAL;
7756 } else {
7757 #if CONFIG_MACF
7758 error = mac_vnode_check_readlink(ctx, vp);
7759 #endif
7760 if (error == 0) {
7761 error = vnode_authorize(vp, NULL, KAUTH_VNODE_READ_DATA,
7762 ctx);
7763 }
7764 if (error == 0) {
7765 error = VNOP_READLINK(vp, auio, ctx);
7766 }
7767 }
7768
7769 if (put_vnode) {
7770 vnode_put(vp);
7771 }
7772
7773 *retval = (int)(bufsize - uio_resid(auio));
7774 return error;
7775 }
7776
7777 int
freadlink(proc_t p,struct freadlink_args * uap,int32_t * retval)7778 freadlink(proc_t p, struct freadlink_args *uap, int32_t *retval)
7779 {
7780 enum uio_seg procseg;
7781 vnode_t vp;
7782 int error;
7783
7784 procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
7785
7786 AUDIT_ARG(fd, uap->fd);
7787
7788 if ((error = file_vnode(uap->fd, &vp))) {
7789 return error;
7790 }
7791 if ((error = vnode_getwithref(vp))) {
7792 file_drop(uap->fd);
7793 return error;
7794 }
7795
7796 error = readlinkat_internal(vfs_context_current(), -1,
7797 vp, 0, procseg, CAST_USER_ADDR_T(uap->buf),
7798 uap->bufsize, procseg, retval);
7799
7800 vnode_put(vp);
7801 file_drop(uap->fd);
7802 return error;
7803 }
7804
7805 int
readlink(proc_t p,struct readlink_args * uap,int32_t * retval)7806 readlink(proc_t p, struct readlink_args *uap, int32_t *retval)
7807 {
7808 enum uio_seg procseg;
7809
7810 procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
7811 return readlinkat_internal(vfs_context_current(), AT_FDCWD, NULL,
7812 CAST_USER_ADDR_T(uap->path), procseg, CAST_USER_ADDR_T(uap->buf),
7813 uap->count, procseg, retval);
7814 }
7815
7816 int
readlinkat(proc_t p,struct readlinkat_args * uap,int32_t * retval)7817 readlinkat(proc_t p, struct readlinkat_args *uap, int32_t *retval)
7818 {
7819 enum uio_seg procseg;
7820
7821 procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
7822 return readlinkat_internal(vfs_context_current(), uap->fd, NULL,
7823 CAST_USER_ADDR_T(uap->path), procseg, uap->buf, uap->bufsize, procseg,
7824 retval);
7825 }
7826
7827 /*
7828 * Change file flags, the deep inner layer.
7829 */
7830 static int
chflags0(vnode_t vp,struct vnode_attr * va,int (* setattr)(vnode_t,void *,vfs_context_t),void * arg,vfs_context_t ctx)7831 chflags0(vnode_t vp, struct vnode_attr *va,
7832 int (*setattr)(vnode_t, void *, vfs_context_t),
7833 void *arg, vfs_context_t ctx)
7834 {
7835 kauth_action_t action = 0;
7836 int error;
7837
7838 #if CONFIG_MACF
7839 error = mac_vnode_check_setflags(ctx, vp, va->va_flags);
7840 if (error) {
7841 goto out;
7842 }
7843 #endif
7844
7845 /* request authorisation, disregard immutability */
7846 if ((error = vnode_authattr(vp, va, &action, ctx)) != 0) {
7847 goto out;
7848 }
7849 /*
7850 * Request that the auth layer disregard those file flags it's allowed to when
7851 * authorizing this operation; we need to do this in order to be able to
7852 * clear immutable flags.
7853 */
7854 if (action && ((error = vnode_authorize(vp, NULL, action | KAUTH_VNODE_NOIMMUTABLE, ctx)) != 0)) {
7855 goto out;
7856 }
7857 error = (*setattr)(vp, arg, ctx);
7858
7859 #if CONFIG_MACF
7860 if (error == 0) {
7861 mac_vnode_notify_setflags(ctx, vp, va->va_flags);
7862 }
7863 #endif
7864
7865 out:
7866 return error;
7867 }
7868
7869 /*
7870 * Change file flags.
7871 *
7872 * NOTE: this will vnode_put() `vp'
7873 */
7874 static int
chflags1(vnode_t vp,int flags,vfs_context_t ctx)7875 chflags1(vnode_t vp, int flags, vfs_context_t ctx)
7876 {
7877 struct vnode_attr va;
7878 int error;
7879
7880 VATTR_INIT(&va);
7881 VATTR_SET(&va, va_flags, flags);
7882
7883 error = chflags0(vp, &va, (void *)vnode_setattr, &va, ctx);
7884 vnode_put(vp);
7885
7886 if ((error == 0) && !VATTR_IS_SUPPORTED(&va, va_flags)) {
7887 error = ENOTSUP;
7888 }
7889
7890 return error;
7891 }
7892
7893 /*
7894 * Change flags of a file given a path name.
7895 */
7896 /* ARGSUSED */
7897 int
chflags(__unused proc_t p,struct chflags_args * uap,__unused int32_t * retval)7898 chflags(__unused proc_t p, struct chflags_args *uap, __unused int32_t *retval)
7899 {
7900 vnode_t vp;
7901 vfs_context_t ctx = vfs_context_current();
7902 int error;
7903 struct nameidata nd;
7904 uint32_t wantparent = 0;
7905
7906 #if CONFIG_FILE_LEASES
7907 wantparent = WANTPARENT;
7908 #endif
7909
7910 AUDIT_ARG(fflags, uap->flags);
7911 NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1 | wantparent,
7912 UIO_USERSPACE, uap->path, ctx);
7913 error = namei(&nd);
7914 if (error) {
7915 return error;
7916 }
7917 vp = nd.ni_vp;
7918
7919 #if CONFIG_FILE_LEASES
7920 vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
7921 vnode_put(nd.ni_dvp);
7922 #endif
7923
7924 nameidone(&nd);
7925
7926 /* we don't vnode_put() here because chflags1 does internally */
7927 error = chflags1(vp, uap->flags, ctx);
7928
7929 return error;
7930 }
7931
7932 /*
7933 * Change flags of a file given a file descriptor.
7934 */
7935 /* ARGSUSED */
7936 int
fchflags(__unused proc_t p,struct fchflags_args * uap,__unused int32_t * retval)7937 fchflags(__unused proc_t p, struct fchflags_args *uap, __unused int32_t *retval)
7938 {
7939 vnode_t vp;
7940 int error;
7941
7942 AUDIT_ARG(fd, uap->fd);
7943 AUDIT_ARG(fflags, uap->flags);
7944 if ((error = file_vnode(uap->fd, &vp))) {
7945 return error;
7946 }
7947
7948 if ((error = vnode_getwithref(vp))) {
7949 file_drop(uap->fd);
7950 return error;
7951 }
7952
7953 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
7954
7955 #if CONFIG_FILE_LEASES
7956 vnode_breakdirlease(vp, true, O_WRONLY);
7957 #endif
7958
7959 /* we don't vnode_put() here because chflags1 does internally */
7960 error = chflags1(vp, uap->flags, vfs_context_current());
7961
7962 file_drop(uap->fd);
7963 return error;
7964 }
7965
7966 /*
7967 * Change security information on a filesystem object.
7968 *
7969 * Returns: 0 Success
7970 * EPERM Operation not permitted
7971 * vnode_authattr:??? [anything vnode_authattr can return]
7972 * vnode_authorize:??? [anything vnode_authorize can return]
7973 * vnode_setattr:??? [anything vnode_setattr can return]
7974 *
7975 * Notes: If vnode_authattr or vnode_authorize return EACCES, it will be
7976 * translated to EPERM before being returned.
7977 */
static int
chmod_vnode(vfs_context_t ctx, vnode_t vp, struct vnode_attr *vap)
{
	kauth_action_t action;
	int error;

	AUDIT_ARG(mode, vap->va_mode);
	/* XXX audit new args */

#if NAMEDSTREAMS
	/* chmod calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		return EPERM;
	}
#endif

#if CONFIG_MACF
	/* MAC pre-checks: mode, ownership and ACL changes are vetted separately. */
	if (VATTR_IS_ACTIVE(vap, va_mode) &&
	    (error = mac_vnode_check_setmode(ctx, vp, (mode_t)vap->va_mode)) != 0) {
		return error;
	}

	if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid)) {
		/* -1 means the corresponding id is not being changed. */
		if ((error = mac_vnode_check_setowner(ctx, vp,
		    VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : -1,
		    VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : -1))) {
			return error;
		}
	}

	if (VATTR_IS_ACTIVE(vap, va_acl) &&
	    (error = mac_vnode_check_setacl(ctx, vp, vap->va_acl))) {
		return error;
	}
#endif

	/* make sure that the caller is allowed to set this security information */
	if (((error = vnode_authattr(vp, vap, &action, ctx)) != 0) ||
	    ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		/* see the function header: EACCES is reserved for namei() */
		if (error == EACCES) {
			error = EPERM;
		}
		return error;
	}

	if ((error = vnode_setattr(vp, vap, ctx)) != 0) {
		return error;
	}

#if CONFIG_MACF
	/* Post-change MAC notifications mirror the pre-checks above. */
	if (VATTR_IS_ACTIVE(vap, va_mode)) {
		mac_vnode_notify_setmode(ctx, vp, (mode_t)vap->va_mode);
	}

	if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid)) {
		mac_vnode_notify_setowner(ctx, vp,
		    VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : -1,
		    VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : -1);
	}

	if (VATTR_IS_ACTIVE(vap, va_acl)) {
		mac_vnode_notify_setacl(ctx, vp, vap->va_acl);
	}
#endif

	/* error is necessarily 0 here */
	return error;
}
8045
8046
8047 /*
8048 * Change mode of a file given a path name.
8049 *
8050 * Returns: 0 Success
8051 * namei:??? [anything namei can return]
8052 * chmod_vnode:??? [anything chmod_vnode can return]
8053 */
8054 static int
chmodat(vfs_context_t ctx,user_addr_t path,struct vnode_attr * vap,int fd,int flag,enum uio_seg segflg)8055 chmodat(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap,
8056 int fd, int flag, enum uio_seg segflg)
8057 {
8058 struct nameidata nd;
8059 int follow, error;
8060 uint32_t wantparent = 0;
8061
8062 #if CONFIG_FILE_LEASES
8063 wantparent = WANTPARENT;
8064 #endif
8065
8066 follow = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) ? NOFOLLOW : FOLLOW;
8067 NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1 | wantparent,
8068 segflg, path, ctx);
8069 if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
8070 nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
8071 }
8072 if (flag & AT_RESOLVE_BENEATH) {
8073 nd.ni_flag |= NAMEI_RESOLVE_BENEATH;
8074 }
8075 if ((error = nameiat(&nd, fd))) {
8076 return error;
8077 }
8078
8079 #if CONFIG_FILE_LEASES
8080 vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
8081 vnode_put(nd.ni_dvp);
8082 #endif
8083
8084 error = chmod_vnode(ctx, nd.ni_vp, vap);
8085 vnode_put(nd.ni_vp);
8086 nameidone(&nd);
8087 return error;
8088 }
8089
8090 static int
chmod_extended_init(struct vnode_attr * pva,kauth_filesec_t * pxsecdst,int mode,uid_t uid,gid_t gid,user_addr_t xsecurity)8091 chmod_extended_init(struct vnode_attr *pva, kauth_filesec_t *pxsecdst, int mode, uid_t uid,
8092 gid_t gid, user_addr_t xsecurity)
8093 {
8094 int error;
8095
8096 VATTR_INIT(pva);
8097
8098 if (mode != -1) {
8099 VATTR_SET(pva, va_mode, mode & ALLPERMS);
8100 } else {
8101 pva->va_mode = 0;
8102 }
8103
8104 if (uid != KAUTH_UID_NONE) {
8105 VATTR_SET(pva, va_uid, uid);
8106 }
8107
8108 if (gid != KAUTH_GID_NONE) {
8109 VATTR_SET(pva, va_gid, gid);
8110 }
8111
8112 *pxsecdst = NULL;
8113 switch (xsecurity) {
8114 case USER_ADDR_NULL:
8115 break;
8116
8117 case CAST_USER_ADDR_T((void *)1): /* _FILESEC_REMOVE_ACL */
8118 VATTR_SET(pva, va_acl, NULL);
8119 break;
8120
8121 default:
8122 if ((error = kauth_copyinfilesec(xsecurity, pxsecdst)) != 0) {
8123 return error;
8124 }
8125
8126 VATTR_SET(pva, va_acl, &(*pxsecdst)->fsec_acl);
8127 pva->va_vaflags |= VA_FILESEC_ACL;
8128 KAUTH_DEBUG("CHMOD - setting ACL with %d entries", pva->va_acl->acl_entrycount);
8129 break;
8130 }
8131
8132 return 0;
8133 }
8134
8135 /*
8136 * chmod_extended: Change the mode of a file given a path name; with extended
8137 * argument list (including extended security (ACL)).
8138 *
8139 * Parameters: p Process requesting the open
8140 * uap User argument descriptor (see below)
8141 * retval (ignored)
8142 *
8143 * Indirect: uap->path Path to object (same as 'chmod')
8144 * uap->uid UID to set
8145 * uap->gid GID to set
8146 * uap->mode File mode to set (same as 'chmod')
8147 * uap->xsecurity ACL to set (or delete)
8148 *
8149 * Returns: 0 Success
8150 * !0 errno value
8151 *
8152 * Notes: The kauth_filesec_t in 'va', if any, is in host byte order.
8153 *
8154 * XXX: We should enummerate the possible errno values here, and where
8155 * in the code they originated.
8156 */
8157 int
chmod_extended(__unused proc_t p,struct chmod_extended_args * uap,__unused int32_t * retval)8158 chmod_extended(__unused proc_t p, struct chmod_extended_args *uap, __unused int32_t *retval)
8159 {
8160 int error;
8161 struct vnode_attr va;
8162 kauth_filesec_t xsecdst = NULL;
8163
8164 AUDIT_ARG(owner, uap->uid, uap->gid);
8165
8166 error = chmod_extended_init(&va, &xsecdst, uap->mode, uap->uid,
8167 uap->gid, uap->xsecurity);
8168
8169 if (error) {
8170 return error;
8171 }
8172
8173 error = chmodat(vfs_context_current(), uap->path, &va, AT_FDCWD, 0,
8174 UIO_USERSPACE);
8175
8176 if (xsecdst != NULL) {
8177 kauth_filesec_free(xsecdst);
8178 }
8179 return error;
8180 }
8181
8182 /*
8183 * Returns: 0 Success
8184 * chmodat:??? [anything chmodat can return]
8185 */
8186 static int
fchmodat_internal(vfs_context_t ctx,user_addr_t path,int mode,int fd,int flag,enum uio_seg segflg)8187 fchmodat_internal(vfs_context_t ctx, user_addr_t path, int mode, int fd,
8188 int flag, enum uio_seg segflg)
8189 {
8190 struct vnode_attr va;
8191
8192 VATTR_INIT(&va);
8193 VATTR_SET(&va, va_mode, mode & ALLPERMS);
8194
8195 return chmodat(ctx, path, &va, fd, flag, segflg);
8196 }
8197
8198 int
chmod(__unused proc_t p,struct chmod_args * uap,__unused int32_t * retval)8199 chmod(__unused proc_t p, struct chmod_args *uap, __unused int32_t *retval)
8200 {
8201 return fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
8202 AT_FDCWD, 0, UIO_USERSPACE);
8203 }
8204
8205 int
fchmodat(__unused proc_t p,struct fchmodat_args * uap,__unused int32_t * retval)8206 fchmodat(__unused proc_t p, struct fchmodat_args *uap, __unused int32_t *retval)
8207 {
8208 if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY | AT_RESOLVE_BENEATH)) {
8209 return EINVAL;
8210 }
8211
8212 return fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
8213 uap->fd, uap->flag, UIO_USERSPACE);
8214 }
8215
8216 /*
8217 * Change mode of a file given a file descriptor.
8218 */
8219 static int
fchmod1(__unused proc_t p,int fd,struct vnode_attr * vap)8220 fchmod1(__unused proc_t p, int fd, struct vnode_attr *vap)
8221 {
8222 vnode_t vp;
8223 int error;
8224
8225 AUDIT_ARG(fd, fd);
8226
8227 if ((error = file_vnode(fd, &vp)) != 0) {
8228 return error;
8229 }
8230 if ((error = vnode_getwithref(vp)) != 0) {
8231 file_drop(fd);
8232 return error;
8233 }
8234 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
8235
8236 #if CONFIG_FILE_LEASES
8237 vnode_breakdirlease(vp, true, O_WRONLY);
8238 #endif
8239
8240 error = chmod_vnode(vfs_context_current(), vp, vap);
8241 (void)vnode_put(vp);
8242 file_drop(fd);
8243
8244 return error;
8245 }
8246
8247 /*
8248 * fchmod_extended: Change mode of a file given a file descriptor; with
8249 * extended argument list (including extended security (ACL)).
8250 *
8251 * Parameters: p Process requesting to change file mode
8252 * uap User argument descriptor (see below)
8253 * retval (ignored)
8254 *
8255 * Indirect: uap->mode File mode to set (same as 'chmod')
8256 * uap->uid UID to set
8257 * uap->gid GID to set
8258 * uap->xsecurity ACL to set (or delete)
8259 * uap->fd File descriptor of file to change mode
8260 *
8261 * Returns: 0 Success
8262 * !0 errno value
8263 *
8264 */
8265 int
fchmod_extended(proc_t p,struct fchmod_extended_args * uap,__unused int32_t * retval)8266 fchmod_extended(proc_t p, struct fchmod_extended_args *uap, __unused int32_t *retval)
8267 {
8268 int error;
8269 struct vnode_attr va;
8270 kauth_filesec_t xsecdst = NULL;
8271
8272 AUDIT_ARG(owner, uap->uid, uap->gid);
8273
8274 error = chmod_extended_init(&va, &xsecdst, uap->mode, uap->uid,
8275 uap->gid, uap->xsecurity);
8276
8277 if (error) {
8278 return error;
8279 }
8280
8281 error = fchmod1(p, uap->fd, &va);
8282
8283 if (xsecdst != NULL) {
8284 kauth_filesec_free(xsecdst);
8285 }
8286 return error;
8287 }
8288
8289 int
fchmod(proc_t p,struct fchmod_args * uap,__unused int32_t * retval)8290 fchmod(proc_t p, struct fchmod_args *uap, __unused int32_t *retval)
8291 {
8292 struct vnode_attr va;
8293
8294 VATTR_INIT(&va);
8295 VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
8296
8297 return fchmod1(p, uap->fd, &va);
8298 }
8299
static int
vn_chown_internal(__unused vfs_context_t ctx, vnode_t vp, uid_t uid, gid_t gid)
{
	/*
	 * Common chown implementation: build a vnode_attr from uid/gid
	 * ((uid_t)/(gid_t)VNOVAL means "leave unchanged"), authorize the
	 * change and apply it.  Caller supplies an iocounted vnode.
	 *
	 * NOTE(review): ctx is marked __unused but is in fact used below
	 * on every build configuration (vnode_authattr/vnode_setattr).
	 */
	struct vnode_attr va;
	kauth_action_t action;
	int error;

	VATTR_INIT(&va);
	if (uid != (uid_t)VNOVAL) {
		VATTR_SET(&va, va_uid, uid);
	}
	if (gid != (gid_t)VNOVAL) {
		VATTR_SET(&va, va_gid, gid);
	}

#if NAMEDSTREAMS
	/* chown calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		error = EPERM;
		goto out;
	}
#endif

#if CONFIG_MACF
	error = mac_vnode_check_setowner(ctx, vp, uid, gid);
	if (error) {
		goto out;
	}
#endif

	/* preflight and authorize attribute changes */
	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
		goto out;
	}
	if (action && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		/*
		 * EACCES is only allowed from namei(); permissions failure should
		 * return EPERM, so we need to translate the error code.
		 */
		if (error == EACCES) {
			error = EPERM;
		}

		goto out;
	}

#if CONFIG_FILE_LEASES
	/* Break any lease on the parent directory before mutating attributes. */
	vnode_breakdirlease(vp, true, O_WRONLY);
#endif

	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	if (error == 0) {
		mac_vnode_notify_setowner(ctx, vp, uid, gid);
	}
#endif

out:
	return error;
}
8361
8362 /*
8363 * Set ownership given a path name.
8364 */
8365 /* ARGSUSED */
8366 static int
fchownat_internal(vfs_context_t ctx,int fd,user_addr_t path,uid_t uid,gid_t gid,int flag,enum uio_seg segflg)8367 fchownat_internal(vfs_context_t ctx, int fd, user_addr_t path, uid_t uid,
8368 gid_t gid, int flag, enum uio_seg segflg)
8369 {
8370 vnode_t vp;
8371 int error;
8372 struct nameidata nd;
8373 int follow;
8374
8375 AUDIT_ARG(owner, uid, gid);
8376
8377 follow = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) ? NOFOLLOW : FOLLOW;
8378 NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1, segflg, path, ctx);
8379 if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
8380 nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
8381 }
8382 if (flag & AT_RESOLVE_BENEATH) {
8383 nd.ni_flag |= NAMEI_RESOLVE_BENEATH;
8384 }
8385
8386 error = nameiat(&nd, fd);
8387 if (error) {
8388 return error;
8389 }
8390
8391 vp = nd.ni_vp;
8392 error = vn_chown_internal(ctx, vp, uid, gid);
8393
8394 nameidone(&nd);
8395 vnode_put(vp);
8396 return error;
8397 }
8398
8399 int
chown(__unused proc_t p,struct chown_args * uap,__unused int32_t * retval)8400 chown(__unused proc_t p, struct chown_args *uap, __unused int32_t *retval)
8401 {
8402 return fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
8403 uap->uid, uap->gid, 0, UIO_USERSPACE);
8404 }
8405
8406 int
lchown(__unused proc_t p,struct lchown_args * uap,__unused int32_t * retval)8407 lchown(__unused proc_t p, struct lchown_args *uap, __unused int32_t *retval)
8408 {
8409 return fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
8410 uap->owner, uap->group, AT_SYMLINK_NOFOLLOW, UIO_USERSPACE);
8411 }
8412
8413 int
fchownat(__unused proc_t p,struct fchownat_args * uap,__unused int32_t * retval)8414 fchownat(__unused proc_t p, struct fchownat_args *uap, __unused int32_t *retval)
8415 {
8416 if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY | AT_RESOLVE_BENEATH)) {
8417 return EINVAL;
8418 }
8419
8420 return fchownat_internal(vfs_context_current(), uap->fd, uap->path,
8421 uap->uid, uap->gid, uap->flag, UIO_USERSPACE);
8422 }
8423
8424 /*
8425 * Set ownership given a file descriptor.
8426 */
8427 /* ARGSUSED */
8428 int
fchown(__unused proc_t p,struct fchown_args * uap,__unused int32_t * retval)8429 fchown(__unused proc_t p, struct fchown_args *uap, __unused int32_t *retval)
8430 {
8431 vfs_context_t ctx = vfs_context_current();
8432 vnode_t vp;
8433 int error;
8434
8435 AUDIT_ARG(owner, uap->uid, uap->gid);
8436 AUDIT_ARG(fd, uap->fd);
8437
8438 if ((error = file_vnode(uap->fd, &vp))) {
8439 return error;
8440 }
8441
8442 if ((error = vnode_getwithref(vp))) {
8443 file_drop(uap->fd);
8444 return error;
8445 }
8446 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
8447
8448 error = vn_chown_internal(ctx, vp, uap->uid, uap->gid);
8449
8450 (void)vnode_put(vp);
8451 file_drop(uap->fd);
8452 return error;
8453 }
8454
8455 static int
getutimes(user_addr_t usrtvp,struct timespec * tsp)8456 getutimes(user_addr_t usrtvp, struct timespec *tsp)
8457 {
8458 int error;
8459
8460 if (usrtvp == USER_ADDR_NULL) {
8461 struct timeval old_tv;
8462 /* XXX Y2038 bug because of microtime argument */
8463 microtime(&old_tv);
8464 TIMEVAL_TO_TIMESPEC(&old_tv, &tsp[0]);
8465 tsp[1] = tsp[0];
8466 } else {
8467 if (IS_64BIT_PROCESS(current_proc())) {
8468 struct user64_timeval tv[2];
8469 error = copyin(usrtvp, (void *)tv, sizeof(tv));
8470 if (error) {
8471 return error;
8472 }
8473 TIMEVAL64_TO_TIMESPEC(&tv[0], &tsp[0]);
8474 TIMEVAL64_TO_TIMESPEC(&tv[1], &tsp[1]);
8475 } else {
8476 struct user32_timeval tv[2];
8477 error = copyin(usrtvp, (void *)tv, sizeof(tv));
8478 if (error) {
8479 return error;
8480 }
8481 TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]);
8482 TIMEVAL_TO_TIMESPEC(&tv[1], &tsp[1]);
8483 }
8484 }
8485 return 0;
8486 }
8487
static int
setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts,
    int nullflag)
{
	/*
	 * Apply access (ts[0]) and modification (ts[1]) times to vp.
	 * nullflag is set when the caller passed a NULL times pointer,
	 * i.e. "set both to now"; that relaxes authorization via
	 * VA_UTIMES_NULL and skips the EACCES->EPERM translation below.
	 */
	int error;
	struct vnode_attr va;
	kauth_action_t action;

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	VATTR_INIT(&va);
	VATTR_SET(&va, va_access_time, ts[0]);
	VATTR_SET(&va, va_modify_time, ts[1]);
	if (nullflag) {
		va.va_vaflags |= VA_UTIMES_NULL;
	}

#if NAMEDSTREAMS
	/* utimes calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		error = EPERM;
		goto out;
	}
#endif

#if CONFIG_MACF
	error = mac_vnode_check_setutimes(ctx, vp, ts[0], ts[1]);
	if (error) {
		goto out;
	}
#endif
	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
		/* explicit times + permission failure => EPERM, not EACCES */
		if (!nullflag && error == EACCES) {
			error = EPERM;
		}
		goto out;
	}

	/* since we may not need to auth anything, check here */
	if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		if (!nullflag && error == EACCES) {
			error = EPERM;
		}
		goto out;
	}
	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	if (error == 0) {
		mac_vnode_notify_setutimes(ctx, vp, ts[0], ts[1]);
	}
#endif

out:
	return error;
}
8544
8545 /*
8546 * Set the access and modification times of a file.
8547 */
8548 /* ARGSUSED */
int
utimes(__unused proc_t p, struct utimes_args *uap, __unused int32_t *retval)
{
	struct timespec ts[2];
	user_addr_t usrtvp;
	int error;
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();
	uint32_t wantparent = 0;

#if CONFIG_FILE_LEASES
	/* Need the parent vnode so its lease can be broken before setattr. */
	wantparent = WANTPARENT;
#endif

	/*
	 * AUDIT: Needed to change the order of operations to do the
	 * name lookup first because auditing wants the path.
	 */
	NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1 | wantparent,
	    UIO_USERSPACE, uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}

	/*
	 * Fetch the user-supplied time. If usrtvp is USER_ADDR_NULL, we fetch
	 * the current time instead.
	 */
	usrtvp = uap->tptr;
	if ((error = getutimes(usrtvp, ts)) != 0) {
		goto out;
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
#endif

	error = setutimes(ctx, nd.ni_vp, ts, usrtvp == USER_ADDR_NULL);

out:
	/* ni_dvp was requested (and must be released) only for the lease case. */
#if CONFIG_FILE_LEASES
	vnode_put(nd.ni_dvp);
#endif
	nameidone(&nd);
	vnode_put(nd.ni_vp);
	return error;
}
8597
8598 /*
8599 * Set the access and modification times of a file.
8600 */
8601 /* ARGSUSED */
8602 int
futimes(__unused proc_t p,struct futimes_args * uap,__unused int32_t * retval)8603 futimes(__unused proc_t p, struct futimes_args *uap, __unused int32_t *retval)
8604 {
8605 struct timespec ts[2];
8606 vnode_t vp;
8607 user_addr_t usrtvp;
8608 int error;
8609
8610 AUDIT_ARG(fd, uap->fd);
8611 usrtvp = uap->tptr;
8612 if ((error = getutimes(usrtvp, ts)) != 0) {
8613 return error;
8614 }
8615 if ((error = file_vnode(uap->fd, &vp)) != 0) {
8616 return error;
8617 }
8618 if ((error = vnode_getwithref(vp))) {
8619 file_drop(uap->fd);
8620 return error;
8621 }
8622
8623 #if CONFIG_FILE_LEASES
8624 vnode_breakdirlease(vp, true, O_WRONLY);
8625 #endif
8626
8627 error = setutimes(vfs_context_current(), vp, ts, usrtvp == 0);
8628
8629 vnode_put(vp);
8630 file_drop(uap->fd);
8631 return error;
8632 }
8633
8634 static int
truncate_validate_common(proc_t p,off_t length)8635 truncate_validate_common(proc_t p, off_t length)
8636 {
8637 rlim_t fsize_limit;
8638
8639 if (length < 0) {
8640 return EINVAL;
8641 }
8642
8643 fsize_limit = proc_limitgetcur(p, RLIMIT_FSIZE);
8644 if ((rlim_t)length > fsize_limit) {
8645 psignal(p, SIGXFSZ);
8646 return EFBIG;
8647 }
8648
8649 return 0;
8650 }
8651
static int
truncate_internal(vnode_t vp, off_t length, kauth_cred_t cred,
    vfs_context_t ctx, boolean_t need_auth)
{
	/*
	 * Set vp's data size to length.  `cred` is consumed only by the MAC
	 * hooks; `need_auth` is false on the ftruncate path, which relies on
	 * the authorization already performed at open time.
	 */
	struct vnode_attr va;
	kauth_action_t action;
	int error;

	VATTR_INIT(&va);
	VATTR_SET(&va, va_data_size, length);

#if CONFIG_MACF
	error = mac_vnode_check_truncate(ctx, cred, vp);
	if (error) {
		return error;
	}
#endif

	/*
	 * If we reached here from `ftruncate` then we already did an effective
	 * `vnode_authorize` upon open. We honour the result from then.
	 */
	if (need_auth) {
		if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
			return error;
		}

		if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
			return error;
		}
	}

#if CONFIG_FILE_LEASES
	/* Check if there is a lease placed on the parent directory. */
	vnode_breakdirlease(vp, true, O_WRONLY);

	/* Now check if there is a lease placed on the file itself. */
	(void)vnode_breaklease(vp, O_WRONLY, ctx);
#endif

	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	if (error == 0) {
		mac_vnode_notify_truncate(ctx, cred, vp);
	}
#endif

	return error;
}
8702
8703 /*
8704 * Truncate a file given its path name.
8705 */
8706 /* ARGSUSED */
8707 int
truncate(proc_t p,struct truncate_args * uap,__unused int32_t * retval)8708 truncate(proc_t p, struct truncate_args *uap, __unused int32_t *retval)
8709 {
8710 vfs_context_t ctx = vfs_context_current();
8711 vnode_t vp;
8712 int error;
8713 struct nameidata nd;
8714
8715 if ((error = truncate_validate_common(p, uap->length))) {
8716 return error;
8717 }
8718
8719 NDINIT(&nd, LOOKUP, OP_TRUNCATE, FOLLOW | AUDITVNPATH1,
8720 UIO_USERSPACE, uap->path, ctx);
8721
8722 if ((error = namei(&nd))) {
8723 return error;
8724 }
8725
8726 vp = nd.ni_vp;
8727 nameidone(&nd);
8728
8729 error = truncate_internal(vp, uap->length, NOCRED, ctx, true);
8730 vnode_put(vp);
8731
8732 return error;
8733 }
8734
8735 /*
8736 * Truncate a file given a file descriptor.
8737 */
8738 /* ARGSUSED */
int
ftruncate(proc_t p, struct ftruncate_args *uap, int32_t *retval)
{
	struct vnode_attr va;
	vnode_t vp = NULLVP;
	struct fileproc *fp;
	bool need_vnode_put = false;
	int error;

	AUDIT_ARG(fd, uap->fd);

	/* Validate length >= 0 and against RLIMIT_FSIZE (raises SIGXFSZ). */
	if ((error = truncate_validate_common(p, uap->length))) {
		return error;
	}

	if ((error = fp_lookup(p, uap->fd, &fp, 0))) {
		return error;
	}

	/* Only vnodes and POSIX shared-memory objects may be truncated. */
	switch (FILEGLOB_DTYPE(fp->fp_glob)) {
	case DTYPE_PSXSHM:
		error = pshm_truncate(p, fp, uap->fd, uap->length, retval);
		goto out;
	case DTYPE_VNODE:
		break;
	default:
		error = EINVAL;
		goto out;
	}

	vp = (vnode_t)fp_get_data(fp);

	/* The descriptor must have been opened for writing. */
	if ((fp->fp_glob->fg_flag & FWRITE) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EINVAL;
		goto out;
	}

	if ((error = vnode_getwithref(vp)) != 0) {
		goto out;
	}
	need_vnode_put = true;

	VATTR_INIT(&va);
	VATTR_WANTED(&va, va_flags);

	error = vnode_getattr(vp, &va, vfs_context_current());
	if (error) {
		goto out;
	}

	/* Don't allow ftruncate if the file has append-only flag set. */
	if (va.va_flags & APPEND) {
		error = EPERM;
		goto out;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	/* need_auth == false: write access was authorized at open time. */
	error = truncate_internal(vp, uap->length, fp->fp_glob->fg_cred,
	    vfs_context_current(), false);
	if (!error) {
		fp->fp_glob->fg_flag |= FWASWRITTEN;
	}

out:
	if (vp && need_vnode_put) {
		vnode_put(vp);
	}

	file_drop(uap->fd);
	return error;
}
8812
8813
8814 /*
8815 * Sync an open file with synchronized I/O _file_ integrity completion
8816 */
8817 /* ARGSUSED */
int
fsync(proc_t p, struct fsync_args *uap, __unused int32_t *retval)
{
	/* fsync(2) is a pthread cancellation point; test before doing work. */
	__pthread_testcancel(1);
	/* MNT_WAIT: full file-integrity sync (data + metadata). */
	return fsync_common(p, uap, MNT_WAIT);
}
8824
8825
8826 /*
8827 * Sync an open file with synchronized I/O _file_ integrity completion
8828 *
8829 * Notes: This is a legacy support function that does not test for
8830 * thread cancellation points.
8831 */
8832 /* ARGSUSED */
int
fsync_nocancel(proc_t p, struct fsync_nocancel_args *uap, __unused int32_t *retval)
{
	/* Same as fsync(2) but without the pthread cancellation test. */
	return fsync_common(p, (struct fsync_args *)uap, MNT_WAIT);
}
8838
8839
8840 /*
8841 * Sync an open file with synchronized I/O _data_ integrity completion
8842 */
8843 /* ARGSUSED */
int
fdatasync(proc_t p, struct fdatasync_args *uap, __unused int32_t *retval)
{
	/* fdatasync(2) is a pthread cancellation point, like fsync(2). */
	__pthread_testcancel(1);
	/* MNT_DWAIT: data-integrity sync only (fdatasync semantics). */
	return fsync_common(p, (struct fsync_args *)uap, MNT_DWAIT);
}
8850
8851
8852 /*
8853 * fsync_common
8854 *
8855 * Common fsync code to support both synchronized I/O file integrity completion
8856 * (normal fsync) and synchronized I/O data integrity completion (fdatasync).
8857 *
8858 * If 'flags' is MNT_DWAIT, the caller is requesting data integrity, which
8859 * will only guarantee that the file data contents are retrievable. If
 * 'flags' is MNT_WAIT, the caller is requesting file integrity, which also
8861 * includes additional metadata unnecessary for retrieving the file data
8862 * contents, such as atime, mtime, ctime, etc., also be committed to stable
8863 * storage.
8864 *
8865 * Parameters: p The process
8866 * uap->fd The descriptor to synchronize
8867 * flags The data integrity flags
8868 *
8869 * Returns: int Success
8870 * fp_getfvp:EBADF Bad file descriptor
8871 * fp_getfvp:ENOTSUP fd does not refer to a vnode
8872 * VNOP_FSYNC:??? unspecified
8873 *
8874 * Notes: We use struct fsync_args because it is a short name, and all
8875 * caller argument structures are otherwise identical.
8876 */
8877 static int
fsync_common(proc_t p,struct fsync_args * uap,int flags)8878 fsync_common(proc_t p, struct fsync_args *uap, int flags)
8879 {
8880 vnode_t vp;
8881 struct fileproc *fp;
8882 vfs_context_t ctx = vfs_context_current();
8883 int error;
8884
8885 AUDIT_ARG(fd, uap->fd);
8886
8887 if ((error = fp_getfvp(p, uap->fd, &fp, &vp))) {
8888 return error;
8889 }
8890 if ((error = vnode_getwithref(vp))) {
8891 file_drop(uap->fd);
8892 return error;
8893 }
8894
8895 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
8896
8897 error = VNOP_FSYNC(vp, flags, ctx);
8898
8899 #if NAMEDRSRCFORK
8900 /* Sync resource fork shadow file if necessary. */
8901 if ((error == 0) &&
8902 (vp->v_flag & VISNAMEDSTREAM) &&
8903 (vp->v_parent != NULLVP) &&
8904 vnode_isshadow(vp) &&
8905 (fp->fp_glob->fg_flag & FWASWRITTEN)) {
8906 (void) vnode_flushnamedstream(vp->v_parent, vp, ctx);
8907 }
8908 #endif
8909
8910 (void)vnode_put(vp);
8911 file_drop(uap->fd);
8912 return error;
8913 }
8914
8915 /*
8916 * Duplicate files. Source must be a file, target must be a file or
8917 * must not exist.
8918 *
8919 * XXX Copyfile authorisation checking is woefully inadequate, and will not
8920 * perform inheritance correctly.
8921 */
8922 /* ARGSUSED */
int
copyfile(__unused proc_t p, struct copyfile_args *uap, __unused int32_t *retval)
{
	vnode_t tvp, fvp, tdvp, sdvp;
	struct nameidata fromnd, tond;
	int error;
	vfs_context_t ctx = vfs_context_current();

	/* Check that the flags are valid. */
	if (uap->flags & ~CPF_MASK) {
		return EINVAL;
	}

	/* Look up the source; on success we hold an iocount on fvp. */
	NDINIT(&fromnd, LOOKUP, OP_COPYFILE, AUDITVNPATH1,
	    UIO_USERSPACE, uap->from, ctx);
	if ((error = namei(&fromnd))) {
		return error;
	}
	fvp = fromnd.ni_vp;

	/*
	 * Look up the target for creation; SAVESTART keeps ni_startdir
	 * referenced so it can be released (as sdvp) after the VNOP below.
	 */
	NDINIT(&tond, CREATE, OP_LINK,
	    LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART | AUDITVNPATH2 | CN_NBMOUNTLOOK,
	    UIO_USERSPACE, uap->to, ctx);
	if ((error = namei(&tond))) {
		goto out1;
	}
	tdvp = tond.ni_dvp;
	tvp = tond.ni_vp;

	/* An existing target is only permitted with CPF_OVERWRITE. */
	if (tvp != NULL) {
		if (!(uap->flags & CPF_OVERWRITE)) {
			error = EEXIST;
			goto out;
		}
	}

	/* Directories cannot be copied with copyfile(2). */
	if (fvp->v_type == VDIR || (tvp && tvp->v_type == VDIR)) {
		error = EISDIR;
		goto out;
	}

	/* Sockets are unsupported, except fdesc-backed ones. */
	if (fvp->v_type == VSOCK && fvp->v_tag != VT_FDESC) {
		error = EOPNOTSUPP;
		goto out;
	}

#if CONFIG_MACF
	if ((error = mac_vnode_check_copyfile(ctx, tdvp, tvp, fvp, &tond.ni_cnd, (mode_t)uap->mode, uap->flags)) != 0) {
		goto out;
	}
#endif /* CONFIG_MACF */

	/* Caller must be able to read the source... */
	if ((error = vnode_authorize(fvp, NULL, KAUTH_VNODE_READ_DATA, ctx)) != 0) {
		goto out;
	}
	/* ...delete an existing target... */
	if (tvp) {
		if ((error = vnode_authorize(tvp, tdvp, KAUTH_VNODE_DELETE, ctx)) != 0) {
			goto out;
		}
	}
	/* ...and add an entry to the target directory. */
	if ((error = vnode_authorize(tdvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
		goto out;
	}

	/* Copying a file onto its own target parent directory is nonsense. */
	if (fvp == tdvp) {
		error = EINVAL;
	}
	/*
	 * If source is the same as the destination (that is the
	 * same inode number) then there is nothing to do.
	 * (fixed to have POSIX semantics - CSM 3/2/98)
	 */
	if (fvp == tvp) {
		error = -1; /* sentinel: converted to success at the bottom */
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(tdvp, false, O_WRONLY);
#endif

	if (!error) {
		error = VNOP_COPYFILE(fvp, tdvp, tvp, &tond.ni_cnd, uap->mode, uap->flags, ctx);
	}
out:
	sdvp = tond.ni_startdir;
	/*
	 * nameidone has to happen before we vnode_put(tdvp)
	 * since it may need to release the fs_nodelock on the tdvp
	 */
	nameidone(&tond);

	if (tvp) {
		vnode_put(tvp);
	}
	vnode_put(tdvp);
	vnode_put(sdvp);
out1:
	vnode_put(fvp);

	nameidone(&fromnd);

	/* fvp == tvp: POSIX requires success with no work done. */
	if (error == -1) {
		return 0;
	}
	return error;
}
9029
9030 #define CLONE_SNAPSHOT_FALLBACKS_ENABLED 1
9031
9032 /*
9033 * Helper function for doing clones. The caller is expected to provide an
9034 * iocounted source vnode and release it.
9035 */
static int
clonefile_internal(vnode_t fvp, boolean_t data_read_authorised, int dst_dirfd,
    user_addr_t dst, uint32_t flags, vfs_context_t ctx)
{
	vnode_t tvp, tdvp;
	struct nameidata *tondp = NULL;
	int error;
	int follow;
	boolean_t free_src_acl;
	boolean_t attr_cleanup;
	enum vtype v_type;
	kauth_action_t action;
	struct componentname *cnp;
	uint32_t defaulted = 0;
	/* Two vnode_attr structs are too large for the stack; heap-allocate. */
	struct {
		struct vnode_attr va[2];
	} *va2p = NULL;
	struct vnode_attr *vap = NULL;
	struct vnode_attr *nvap = NULL;
	uint32_t vnop_flags;

	/*
	 * Only regular files, symlinks and directories may be cloned;
	 * directories that are a volume root or are mounted on are rejected.
	 */
	v_type = vnode_vtype(fvp);
	switch (v_type) {
	case VLNK:
	/* FALLTHRU */
	case VREG:
		action = KAUTH_VNODE_ADD_FILE;
		break;
	case VDIR:
		if (vnode_isvroot(fvp) || vnode_ismount(fvp) ||
		    fvp->v_mountedhere) {
			return EINVAL;
		}
		action = KAUTH_VNODE_ADD_SUBDIRECTORY;
		break;
	default:
		return EINVAL;
	}

	AUDIT_ARG(fd2, dst_dirfd);
	AUDIT_ARG(value32, flags);

	/* Look up the destination (must not exist); hold the parent dir. */
	tondp = kalloc_type(struct nameidata, Z_WAITOK | Z_ZERO | Z_NOFAIL);
	follow = (flags & CLONE_NOFOLLOW) ? NOFOLLOW : FOLLOW;
	NDINIT(tondp, CREATE, OP_LINK, follow | WANTPARENT | AUDITVNPATH2,
	    UIO_USERSPACE, dst, ctx);
	if (flags & CLONE_NOFOLLOW_ANY) {
		tondp->ni_flag |= NAMEI_NOFOLLOW_ANY;
	}
	if (flags & CLONE_RESOLVE_BENEATH) {
		tondp->ni_flag |= NAMEI_RESOLVE_BENEATH;
	}

	if ((error = nameiat(tondp, dst_dirfd))) {
		kfree_type(struct nameidata, tondp);
		return error;
	}
	cnp = &tondp->ni_cnd;
	tdvp = tondp->ni_dvp;
	tvp = tondp->ni_vp;

	free_src_acl = FALSE;
	attr_cleanup = FALSE;

	/* Unlike rename, the clone target must not already exist. */
	if (tvp != NULL) {
		error = EEXIST;
		goto out;
	}

	/* Clones cannot cross mount points. */
	if (vnode_mount(tdvp) != vnode_mount(fvp)) {
		error = EXDEV;
		goto out;
	}

#if CONFIG_MACF
	if ((error = mac_vnode_check_clone(ctx, tdvp, fvp, cnp))) {
		goto out;
	}
#endif
	/* May we add an entry of this type to the destination directory? */
	if ((error = vnode_authorize(tdvp, NULL, action, ctx))) {
		goto out;
	}

	/*
	 * May we read the source? fclonefileat() has already authorized
	 * data read via the open file descriptor, so skip that bit there.
	 */
	action = KAUTH_VNODE_GENERIC_READ_BITS;
	if (data_read_authorised) {
		action &= ~KAUTH_VNODE_READ_DATA;
	}
	if ((error = vnode_authorize(fvp, NULL, action, ctx))) {
		goto out;
	}

	va2p = kalloc_type(typeof(*va2p), Z_WAITOK | Z_NOFAIL);
	vap = &va2p->va[0];   /* attributes read from the source */
	nvap = &va2p->va[1];  /* attributes to apply to the new clone */

	/*
	 * certain attributes may need to be changed from the source, we ask for
	 * those here with the exception of source file's ACLs unless the CLONE_ACL
	 * flag is specified. By default, the clone file will inherit the target
	 * directory's ACLs unless the CLONE_ACL flag is specified then it
	 * will inherit the source file's ACLs instead.
	 */
	VATTR_INIT(vap);
	VATTR_WANTED(vap, va_uid);
	VATTR_WANTED(vap, va_gid);
	VATTR_WANTED(vap, va_mode);
	VATTR_WANTED(vap, va_flags);
	if (flags & CLONE_ACL) {
		VATTR_WANTED(vap, va_acl);
	}

	if ((error = vnode_getattr(fvp, vap, ctx)) != 0) {
		goto out;
	}

	VATTR_INIT(nvap);
	VATTR_SET(nvap, va_type, v_type);
	if (VATTR_IS_SUPPORTED(vap, va_acl) && vap->va_acl != NULL) {
		/* ACL returned by getattr is ours to free once applied. */
		VATTR_SET(nvap, va_acl, vap->va_acl);
		free_src_acl = TRUE;
	}

	/* Handle ACL inheritance, initialize vap. */
	if (v_type == VLNK) {
		error = vnode_authattr_new(tdvp, nvap, 0, ctx);
	} else {
		error = vn_attribute_prepare(tdvp, nvap, &defaulted, ctx);
		if (error) {
			goto out;
		}
		/* vn_attribute_prepare() succeeded; must be undone on exit. */
		attr_cleanup = TRUE;
	}

	vnop_flags = VNODE_CLONEFILE_DEFAULT;
	/*
	 * We've got initial values for all security parameters,
	 * If we are superuser, then we can change owners to be the
	 * same as the source. Both superuser and the owner have default
	 * WRITE_SECURITY privileges so all other fields can be taken
	 * from source as well.
	 */
	if (!(flags & CLONE_NOOWNERCOPY) && vfs_context_issuser(ctx)) {
		if (VATTR_IS_SUPPORTED(vap, va_uid)) {
			VATTR_SET(nvap, va_uid, vap->va_uid);
		}
		if (VATTR_IS_SUPPORTED(vap, va_gid)) {
			VATTR_SET(nvap, va_gid, vap->va_gid);
		}
	} else {
		vnop_flags |= VNODE_CLONEFILE_NOOWNERCOPY;
	}

	if (VATTR_IS_SUPPORTED(vap, va_mode)) {
		VATTR_SET(nvap, va_mode, vap->va_mode);
	}
	if (VATTR_IS_SUPPORTED(vap, va_flags)) {
		/* UF_DATAVAULT/SF_RESTRICTED come from the destination, not the source. */
		VATTR_SET(nvap, va_flags,
		    ((vap->va_flags & ~(UF_DATAVAULT | SF_RESTRICTED)) | /* Turn off from source */
		    (nvap->va_flags & (UF_DATAVAULT | SF_RESTRICTED))));
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(tdvp, false, O_WRONLY);
#endif

	error = VNOP_CLONEFILE(fvp, tdvp, &tvp, cnp, nvap, vnop_flags, ctx);

	if (!error && tvp) {
		int update_flags = 0;
#if CONFIG_FSE
		int fsevent;
#endif /* CONFIG_FSE */

		/*
		 * If some of the requested attributes weren't handled by the
		 * VNOP, use our fallback code.
		 */
		if (!VATTR_ALL_SUPPORTED(nvap)) {
			(void)vnode_setattr_fallback(tvp, nvap, ctx);
		}

#if CONFIG_MACF
		(void)vnode_label(vnode_mount(tvp), tdvp, tvp, cnp,
		    VNODE_LABEL_CREATE, ctx);
#endif

		// Make sure the name & parent pointers are hooked up
		if (tvp->v_name == NULL) {
			update_flags |= VNODE_UPDATE_NAME;
		}
		if (tvp->v_parent == NULLVP) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		if (update_flags) {
			(void)vnode_update_identity(tvp, tdvp, cnp->cn_nameptr,
			    cnp->cn_namelen, cnp->cn_hash, update_flags);
		}

#if CONFIG_FSE
		switch (vnode_vtype(tvp)) {
		case VLNK:
		/* FALLTHRU */
		case VREG:
			fsevent = FSE_CREATE_FILE;
			break;
		case VDIR:
			fsevent = FSE_CREATE_DIR;
			break;
		default:
			goto out;
		}

		if (need_fsevent(fsevent, tvp)) {
			/*
			 * The following is a sequence of three explicit events.
			 * A pair of FSE_CLONE events representing the source and destination
			 * followed by an FSE_CREATE_[FILE | DIR] for the destination.
			 * fseventsd may coalesce the destination clone and create events
			 * into a single event resulting in the following sequence for a client
			 * FSE_CLONE (src)
			 * FSE_CLONE | FSE_CREATE (dst)
			 */
			add_fsevent(FSE_CLONE, ctx, FSE_ARG_VNODE, fvp, FSE_ARG_VNODE, tvp,
			    FSE_ARG_DONE);
			add_fsevent(fsevent, ctx, FSE_ARG_VNODE, tvp,
			    FSE_ARG_DONE);
		}
#endif /* CONFIG_FSE */
	}

out:
	if (attr_cleanup) {
		vn_attribute_cleanup(nvap, defaulted);
	}
	if (free_src_acl && vap->va_acl) {
		kauth_acl_free(vap->va_acl);
	}
	if (va2p) {
		kfree_type(typeof(*va2p), va2p);
	}
	/* nameidone before vnode_put(tdvp): it may release fs_nodelock. */
	nameidone(tondp);
	kfree_type(struct nameidata, tondp);
	if (tvp) {
		vnode_put(tvp);
	}
	vnode_put(tdvp);
	return error;
}
9285
9286 /*
9287 * clone files or directories, target must not exist.
9288 */
9289 /* ARGSUSED */
9290 int
clonefileat(__unused proc_t p,struct clonefileat_args * uap,__unused int32_t * retval)9291 clonefileat(__unused proc_t p, struct clonefileat_args *uap,
9292 __unused int32_t *retval)
9293 {
9294 vnode_t fvp;
9295 struct nameidata *ndp = NULL;
9296 int follow;
9297 int error;
9298 vfs_context_t ctx = vfs_context_current();
9299
9300 /* Check that the flags are valid. */
9301 if (uap->flags & ~(CLONE_NOFOLLOW | CLONE_NOOWNERCOPY | CLONE_ACL |
9302 CLONE_NOFOLLOW_ANY | CLONE_RESOLVE_BENEATH)) {
9303 return EINVAL;
9304 }
9305
9306 AUDIT_ARG(fd, uap->src_dirfd);
9307
9308 ndp = kalloc_type(struct nameidata, Z_WAITOK | Z_ZERO | Z_NOFAIL);
9309
9310 follow = (uap->flags & CLONE_NOFOLLOW) ? NOFOLLOW : FOLLOW;
9311 NDINIT(ndp, LOOKUP, OP_COPYFILE, follow | AUDITVNPATH1,
9312 UIO_USERSPACE, uap->src, ctx);
9313 if (uap->flags & CLONE_NOFOLLOW_ANY) {
9314 ndp->ni_flag |= NAMEI_NOFOLLOW_ANY;
9315 }
9316 if (uap->flags & CLONE_RESOLVE_BENEATH) {
9317 ndp->ni_flag |= NAMEI_RESOLVE_BENEATH;
9318 }
9319
9320 if ((error = nameiat(ndp, uap->src_dirfd))) {
9321 kfree_type(struct nameidata, ndp);
9322 return error;
9323 }
9324
9325 fvp = ndp->ni_vp;
9326 nameidone(ndp);
9327 kfree_type(struct nameidata, ndp);
9328
9329 error = clonefile_internal(fvp, FALSE, uap->dst_dirfd, uap->dst,
9330 uap->flags, ctx);
9331
9332 vnode_put(fvp);
9333 return error;
9334 }
9335
9336 int
fclonefileat(__unused proc_t p,struct fclonefileat_args * uap,__unused int32_t * retval)9337 fclonefileat(__unused proc_t p, struct fclonefileat_args *uap,
9338 __unused int32_t *retval)
9339 {
9340 vnode_t fvp;
9341 struct fileproc *fp;
9342 int error;
9343 vfs_context_t ctx = vfs_context_current();
9344
9345 /* Check that the flags are valid. */
9346 if (uap->flags & ~(CLONE_NOFOLLOW | CLONE_NOOWNERCOPY | CLONE_ACL |
9347 CLONE_NOFOLLOW_ANY | CLONE_RESOLVE_BENEATH)) {
9348 return EINVAL;
9349 }
9350
9351 AUDIT_ARG(fd, uap->src_fd);
9352 error = fp_getfvp(p, uap->src_fd, &fp, &fvp);
9353 if (error) {
9354 return error;
9355 }
9356
9357 if ((fp->fp_glob->fg_flag & FREAD) == 0) {
9358 AUDIT_ARG(vnpath_withref, fvp, ARG_VNODE1);
9359 error = EBADF;
9360 goto out;
9361 }
9362
9363 if ((error = vnode_getwithref(fvp))) {
9364 goto out;
9365 }
9366
9367 AUDIT_ARG(vnpath, fvp, ARG_VNODE1);
9368
9369 error = clonefile_internal(fvp, TRUE, uap->dst_dirfd, uap->dst,
9370 uap->flags, ctx);
9371
9372 vnode_put(fvp);
9373 out:
9374 file_drop(uap->src_fd);
9375 return error;
9376 }
9377
9378 static int
rename_submounts_callback(mount_t mp,void * arg)9379 rename_submounts_callback(mount_t mp, void *arg)
9380 {
9381 char *prefix = (char *)arg;
9382 int prefix_len = (int)strlen(prefix);
9383 int error = 0;
9384
9385 if (strncmp(mp->mnt_vfsstat.f_mntonname, prefix, prefix_len) != 0) {
9386 return 0;
9387 }
9388
9389 if (mp->mnt_vfsstat.f_mntonname[prefix_len] != '/') {
9390 return 0;
9391 }
9392
9393 if ((error = vfs_busy(mp, LK_NOWAIT))) {
9394 printf("vfs_busy failed with %d for %s\n", error, mp->mnt_vfsstat.f_mntonname);
9395 return -1;
9396 }
9397
9398 size_t pathlen = MAXPATHLEN;
9399 if ((error = vn_getpath_ext(mp->mnt_vnodecovered, NULL, mp->mnt_vfsstat.f_mntonname, &pathlen, VN_GETPATH_FSENTER))) {
9400 printf("vn_getpath_ext failed with %d for mnt_vnodecovered of %s\n", error, mp->mnt_vfsstat.f_mntonname);
9401 }
9402
9403 vfs_unbusy(mp);
9404
9405 return error;
9406 }
9407
9408 /*
9409 * Rename files. Source and destination must either both be directories,
9410 * or both not be directories. If target is a directory, it must be empty.
9411 */
9412 /* ARGSUSED */
9413 static int
renameat_internal(vfs_context_t ctx,int fromfd,user_addr_t from,int tofd,user_addr_t to,int segflg,u_int uflags)9414 renameat_internal(vfs_context_t ctx, int fromfd, user_addr_t from,
9415 int tofd, user_addr_t to, int segflg, u_int uflags)
9416 {
9417 vnode_t tvp, tdvp;
9418 vnode_t fvp, fdvp;
9419 vnode_t mnt_fvp;
9420 struct nameidata *fromnd, *tond;
9421 int error = 0;
9422 int do_retry;
9423 int retry_count;
9424 int mntrename;
9425 int dirrename;
9426 int need_event;
9427 int need_kpath2;
9428 int has_listeners;
9429 const char *oname = NULL;
9430 char *old_dirpath = NULL, *from_name = NULL, *to_name = NULL;
9431 char *from_name_no_firmlink = NULL, *to_name_no_firmlink = NULL;
9432 int from_len = 0, to_len = 0;
9433 int from_len_no_firmlink = 0, to_len_no_firmlink = 0;
9434 int holding_mntlock;
9435 int vn_authorize_skipped;
9436 mount_t locked_mp = NULL;
9437 vnode_t oparent = NULLVP;
9438 vnode_t locked_vp = NULLVP;
9439 #if CONFIG_FSE
9440 fse_info from_finfo = {}, to_finfo;
9441 #endif
9442 int from_truncated = 0, to_truncated = 0;
9443 int from_truncated_no_firmlink = 0, to_truncated_no_firmlink = 0;
9444 int batched = 0;
9445 struct vnode_attr *fvap, *tvap;
9446 int continuing = 0;
9447 vfs_rename_flags_t flags = uflags & VFS_RENAME_FLAGS_MASK;
9448 int32_t nofollow_any = 0;
9449 int32_t resolve_beneath = 0;
9450 /* carving out a chunk for structs that are too big to be on stack. */
9451 struct {
9452 struct nameidata from_node, to_node;
9453 struct vnode_attr fv_attr, tv_attr;
9454 } * __rename_data;
9455
9456 __rename_data = kalloc_type(typeof(*__rename_data), Z_WAITOK);
9457 fromnd = &__rename_data->from_node;
9458 tond = &__rename_data->to_node;
9459
9460 holding_mntlock = 0;
9461 do_retry = 0;
9462 retry_count = 0;
9463 retry:
9464 fvp = tvp = NULL;
9465 fdvp = tdvp = NULL;
9466 fvap = tvap = NULL;
9467 mnt_fvp = NULLVP;
9468 mntrename = dirrename = FALSE;
9469 vn_authorize_skipped = FALSE;
9470
9471 if (uflags & RENAME_NOFOLLOW_ANY) {
9472 nofollow_any = NAMEI_NOFOLLOW_ANY;
9473 }
9474 if (uflags & RENAME_RESOLVE_BENEATH) {
9475 resolve_beneath = NAMEI_RESOLVE_BENEATH;
9476 }
9477 NDINIT(fromnd, DELETE, OP_UNLINK, WANTPARENT | AUDITVNPATH1,
9478 segflg, from, ctx);
9479 fromnd->ni_flag = NAMEI_COMPOUNDRENAME | nofollow_any | resolve_beneath;
9480
9481 NDINIT(tond, RENAME, OP_RENAME, WANTPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK,
9482 segflg, to, ctx);
9483 tond->ni_flag = NAMEI_COMPOUNDRENAME | nofollow_any | resolve_beneath;
9484
9485 continue_lookup:
9486 if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
9487 if ((error = nameiat(fromnd, fromfd))) {
9488 goto out1;
9489 }
9490 fdvp = fromnd->ni_dvp;
9491 fvp = fromnd->ni_vp;
9492
9493 if (fvp && fvp->v_type == VDIR) {
9494 tond->ni_cnd.cn_flags |= WILLBEDIR;
9495 #if defined(XNU_TARGET_OS_OSX)
9496 dirrename = TRUE;
9497 #endif
9498 }
9499 }
9500
9501 if ((tond->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
9502 if ((error = nameiat(tond, tofd))) {
9503 /*
9504 * Translate error code for rename("dir1", "dir2/.").
9505 */
9506 if (error == EISDIR && fvp->v_type == VDIR) {
9507 error = EINVAL;
9508 }
9509 goto out1;
9510 }
9511 tdvp = tond->ni_dvp;
9512 tvp = tond->ni_vp;
9513 }
9514
9515 #if DEVELOPMENT || DEBUG
9516 /*
9517 * XXX VSWAP: Check for entitlements or special flag here
9518 * so we can restrict access appropriately.
9519 */
9520 #else /* DEVELOPMENT || DEBUG */
9521
9522 if (fromnd->ni_vp && vnode_isswap(fromnd->ni_vp) && (ctx != vfs_context_kernel())) {
9523 error = EPERM;
9524 goto out1;
9525 }
9526
9527 if (tond->ni_vp && vnode_isswap(tond->ni_vp) && (ctx != vfs_context_kernel())) {
9528 error = EPERM;
9529 goto out1;
9530 }
9531 #endif /* DEVELOPMENT || DEBUG */
9532
9533 if (!tvp && ISSET(flags, VFS_RENAME_SWAP)) {
9534 error = ENOENT;
9535 goto out1;
9536 }
9537
9538 if (tvp && ISSET(flags, VFS_RENAME_EXCL)) {
9539 int32_t pval = 0;
9540 int err = 0;
9541
9542 /*
9543 * We allow rename with VFS_RENAME_EXCL flag for an existing file which
9544 * has the same name as target iff the following conditions are met:
9545 * 1. the target file system is case insensitive
9546 * 2. source and target directories are the same
9547 * 3. source and target files are the same
9548 * 4. name only differs in case (determined by underlying filesystem)
9549 */
9550 if (fvp != tvp || fdvp != tdvp) {
9551 error = EEXIST;
9552 goto out1;
9553 }
9554
9555 /*
9556 * Assume that the target file system is case sensitive if
9557 * _PC_CASE_SENSITIVE selector isn't supported.
9558 */
9559 err = VNOP_PATHCONF(tvp, _PC_CASE_SENSITIVE, &pval, ctx);
9560 if (err != 0 || pval != 0) {
9561 error = EEXIST;
9562 goto out1;
9563 }
9564 }
9565
9566 batched = vnode_compound_rename_available(fdvp);
9567
9568 #if CONFIG_FSE
9569 need_event = need_fsevent(FSE_RENAME, fdvp);
9570 if (need_event) {
9571 if (fvp) {
9572 get_fse_info(fvp, &from_finfo, ctx);
9573 } else {
9574 error = vfs_get_notify_attributes(&__rename_data->fv_attr);
9575 if (error) {
9576 goto out1;
9577 }
9578
9579 fvap = &__rename_data->fv_attr;
9580 }
9581
9582 if (tvp) {
9583 get_fse_info(tvp, &to_finfo, ctx);
9584 } else if (batched) {
9585 error = vfs_get_notify_attributes(&__rename_data->tv_attr);
9586 if (error) {
9587 goto out1;
9588 }
9589
9590 tvap = &__rename_data->tv_attr;
9591 }
9592 }
9593 #else
9594 need_event = 0;
9595 #endif /* CONFIG_FSE */
9596
9597 has_listeners = kauth_authorize_fileop_has_listeners();
9598
9599 need_kpath2 = 0;
9600 #if CONFIG_AUDIT
9601 if (AUDIT_RECORD_EXISTS()) {
9602 need_kpath2 = 1;
9603 }
9604 #endif
9605
9606 if (need_event || has_listeners) {
9607 if (from_name == NULL) {
9608 GET_PATH(from_name);
9609 }
9610
9611 from_len = safe_getpath(fdvp, fromnd->ni_cnd.cn_nameptr, from_name, MAXPATHLEN, &from_truncated);
9612
9613 if (from_name_no_firmlink == NULL) {
9614 GET_PATH(from_name_no_firmlink);
9615 }
9616
9617 from_len_no_firmlink = safe_getpath_no_firmlink(fdvp, fromnd->ni_cnd.cn_nameptr, from_name_no_firmlink, MAXPATHLEN, &from_truncated_no_firmlink);
9618 }
9619
9620 if (need_event || need_kpath2 || has_listeners) {
9621 if (to_name == NULL) {
9622 GET_PATH(to_name);
9623 }
9624
9625 to_len = safe_getpath(tdvp, tond->ni_cnd.cn_nameptr, to_name, MAXPATHLEN, &to_truncated);
9626
9627 if (to_name_no_firmlink == NULL) {
9628 GET_PATH(to_name_no_firmlink);
9629 }
9630
9631 to_len_no_firmlink = safe_getpath_no_firmlink(tdvp, tond->ni_cnd.cn_nameptr, to_name_no_firmlink, MAXPATHLEN, &to_truncated_no_firmlink);
9632 if (to_name && need_kpath2) {
9633 AUDIT_ARG(kpath, to_name, ARG_KPATH2);
9634 }
9635 }
9636 if (!fvp) {
9637 /*
9638 * Claim: this check will never reject a valid rename.
9639 * For success, either fvp must be on the same mount as tdvp, or fvp must sit atop a vnode on the same mount as tdvp.
9640 * Suppose fdvp and tdvp are not on the same mount.
9641 * If fvp is on the same mount as tdvp, then fvp is not on the same mount as fdvp, so fvp is the root of its filesystem. If fvp is the root,
9642 * then you can't move it to within another dir on the same mountpoint.
9643 * If fvp sits atop a vnode on the same mount as fdvp, then that vnode must be part of the same mount as fdvp, which is a contradiction.
9644 *
9645 * If this check passes, then we are safe to pass these vnodes to the same FS.
9646 */
9647 if (fdvp->v_mount != tdvp->v_mount) {
9648 error = EXDEV;
9649 goto out1;
9650 }
9651 goto skipped_lookup;
9652 }
9653
9654 /*
9655 * If the source and destination are the same (i.e. they're
9656 * links to the same vnode) and the target file system is
9657 * case sensitive, then there is nothing to do.
9658 *
9659 * XXX Come back to this.
9660 */
9661 if (fvp == tvp) {
9662 int pathconf_val;
9663
9664 /*
9665 * Note: if _PC_CASE_SENSITIVE selector isn't supported,
9666 * then assume that this file system is case sensitive.
9667 */
9668 if (VNOP_PATHCONF(fvp, _PC_CASE_SENSITIVE, &pathconf_val, ctx) != 0 ||
9669 pathconf_val != 0) {
9670 vn_authorize_skipped = TRUE;
9671 goto out1;
9672 }
9673 }
9674
9675 /*
9676 * Allow the renaming of mount points.
9677 * - target must not exist
9678 * - target must reside in the same directory as source
9679 * - union mounts cannot be renamed
9680 * - the root fs, and tightly-linked system volumes, cannot be renamed
9681 *
9682 * XXX Handle this in VFS after a continued lookup (if we missed
9683 * in the cache to start off)
9684 *
9685 * N.B. If RENAME_SWAP is being used, then @tvp != NULL and so
9686 * we'll skip past here. The file system is responsible for
9687 * checking that @tvp is not a descendent of @fvp and vice versa
9688 * so it should always return EINVAL if either @tvp or @fvp is the
9689 * root of a volume.
9690 */
9691 if ((fvp->v_flag & VROOT) &&
9692 (fvp->v_type == VDIR) &&
9693 (tvp == NULL) &&
9694 (fvp->v_mountedhere == NULL) &&
9695 (fdvp == tdvp) &&
9696 ((fvp->v_mount->mnt_flag & (MNT_UNION | MNT_ROOTFS)) == 0) &&
9697 ((fvp->v_mount->mnt_kern_flag & MNTK_SYSTEM) == 0) &&
9698 (fvp->v_mount->mnt_vnodecovered != NULLVP)) {
9699 vnode_t coveredvp;
9700
9701 /* switch fvp to the covered vnode */
9702 coveredvp = fvp->v_mount->mnt_vnodecovered;
9703 if ((vnode_getwithref(coveredvp))) {
9704 error = ENOENT;
9705 goto out1;
9706 }
9707 /*
9708 * Save the 'fvp' as it is needed for vn_authorize_renamex_with_paths()
9709 * later.
9710 */
9711 mnt_fvp = fvp;
9712
9713 fvp = coveredvp;
9714 mntrename = TRUE;
9715 }
9716 /*
9717 * Check for cross-device rename.
9718 * For rename on mountpoint, we want to also check the source and its parent
9719 * belong to the same mountpoint.
9720 */
9721 if ((fvp->v_mount != tdvp->v_mount) ||
9722 (fvp->v_mount != fdvp->v_mount) ||
9723 (tvp && (fvp->v_mount != tvp->v_mount))) {
9724 error = EXDEV;
9725 goto out1;
9726 }
9727
9728 /*
9729 * If source is the same as the destination (that is the
9730 * same inode number) then there is nothing to do...
9731 * EXCEPT if the underlying file system supports case
9732 * insensitivity and is case preserving. In this case
9733 * the file system needs to handle the special case of
9734 * getting the same vnode as target (fvp) and source (tvp).
9735 *
9736 * Only file systems that support pathconf selectors _PC_CASE_SENSITIVE
9737 * and _PC_CASE_PRESERVING can have this exception, and they need to
9738 * handle the special case of getting the same vnode as target and
9739 * source. NOTE: Then the target is unlocked going into vnop_rename,
9740 * so not to cause locking problems. There is a single reference on tvp.
9741 *
9742 * NOTE - that fvp == tvp also occurs if they are hard linked and
9743 * that correct behaviour then is just to return success without doing
9744 * anything.
9745 *
9746 * XXX filesystem should take care of this itself, perhaps...
9747 */
9748 if (fvp == tvp && fdvp == tdvp) {
9749 if (fromnd->ni_cnd.cn_namelen == tond->ni_cnd.cn_namelen &&
9750 !bcmp(fromnd->ni_cnd.cn_nameptr, tond->ni_cnd.cn_nameptr,
9751 fromnd->ni_cnd.cn_namelen)) {
9752 vn_authorize_skipped = TRUE;
9753 goto out1;
9754 }
9755 }
9756
9757 if (holding_mntlock && fvp->v_mount != locked_mp) {
9758 /*
9759 * we're holding a reference and lock
9760 * on locked_mp, but it no longer matches
9761 * what we want to do... so drop our hold
9762 */
9763 mount_unlock_renames(locked_mp);
9764 mount_drop(locked_mp, 0);
9765 holding_mntlock = 0;
9766 }
9767 if (tdvp != fdvp && fvp->v_type == VDIR) {
9768 /*
9769 * serialize renames that re-shape
9770 * the tree... if holding_mntlock is
9771 * set, then we're ready to go...
9772 * otherwise we
9773 * first need to drop the iocounts
9774 * we picked up, second take the
9775 * lock to serialize the access,
9776 * then finally start the lookup
9777 * process over with the lock held
9778 */
9779 if (!holding_mntlock) {
9780 /*
9781 * need to grab a reference on
9782 * the mount point before we
9783 * drop all the iocounts... once
9784 * the iocounts are gone, the mount
9785 * could follow
9786 */
9787 locked_mp = fvp->v_mount;
9788 mount_ref(locked_mp, 0);
9789
9790 /*
9791 * nameidone has to happen before we vnode_put(tvp)
9792 * since it may need to release the fs_nodelock on the tvp
9793 */
9794 nameidone(tond);
9795
9796 if (tvp) {
9797 vnode_put(tvp);
9798 }
9799 vnode_put(tdvp);
9800
9801 /*
9802 * nameidone has to happen before we vnode_put(fdvp)
9803 * since it may need to release the fs_nodelock on the fvp
9804 */
9805 nameidone(fromnd);
9806
9807 vnode_put(fvp);
9808 vnode_put(fdvp);
9809
9810 if (mnt_fvp != NULLVP) {
9811 vnode_put(mnt_fvp);
9812 }
9813
9814 mount_lock_renames(locked_mp);
9815 holding_mntlock = 1;
9816
9817 goto retry;
9818 }
9819 } else {
9820 /*
9821 * when we dropped the iocounts to take
9822 * the lock, we allowed the identity of
9823 * the various vnodes to change... if they did,
9824 * we may no longer be dealing with a rename
9825 * that reshapes the tree... once we're holding
9826 * the iocounts, the vnodes can't change type
9827 * so we're free to drop the lock at this point
9828 * and continue on
9829 */
9830 if (holding_mntlock) {
9831 mount_unlock_renames(locked_mp);
9832 mount_drop(locked_mp, 0);
9833 holding_mntlock = 0;
9834 }
9835 }
9836
9837 if (!batched) {
9838 assert(locked_vp == NULLVP);
9839 vnode_link_lock(fvp);
9840 locked_vp = fvp;
9841 error = vn_authorize_renamex_with_paths(fdvp, mntrename ? mnt_fvp : fvp,
9842 &fromnd->ni_cnd, from_name, tdvp, tvp, &tond->ni_cnd, to_name, ctx,
9843 flags, NULL);
9844 if (error) {
9845 if (error == ENOENT) {
9846 if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
9847 /*
9848 * We encountered a race where after doing the namei,
9849 * tvp stops being valid. If so, simply re-drive the rename
9850 * call from the top.
9851 */
9852 do_retry = 1;
9853 retry_count += 1;
9854 }
9855 }
9856 goto out1;
9857 }
9858 }
9859
9860 /* Release the 'mnt_fvp' now that it is no longer needed. */
9861 if (mnt_fvp != NULLVP) {
9862 vnode_put(mnt_fvp);
9863 mnt_fvp = NULLVP;
9864 }
9865
9866 // save these off so we can later verify that fvp is the same
9867 oname = fvp->v_name;
9868 oparent = fvp->v_parent;
9869
9870 /*
9871 * If renaming a directory, stash its path which we need later when
9872 * updating the 'f_mntonname' of sub mounts.
9873 */
9874 if (dirrename) {
9875 int pathlen = MAXPATHLEN;
9876
9877 old_dirpath = zalloc(ZV_NAMEI);
9878 error = vn_getpath_fsenter(fvp, old_dirpath, &pathlen);
9879 if (error) {
9880 /*
9881 * Process that supports long path (opt-in to IO policy
9882 * IOPOL_TYPE_VFS_SUPPORT_LONG_PATHS) can have directory with path
9883 * length up to MAXLONGPATHLEN (8192). Since max path length in
9884 * mount's 'f_mntonname' is MAXPATHLEN (1024), this means the
9885 * directory can't be the parent of the sub mounts so we can just
9886 * silently drop the error and skip the check to update the
9887 * 'f_mntonname' of sub mounts.
9888 */
9889 if (error == ENOSPC) {
9890 dirrename = false;
9891 error = 0;
9892 if (old_dirpath) {
9893 zfree(ZV_NAMEI, old_dirpath);
9894 old_dirpath = NULL;
9895 }
9896 } else {
9897 goto out1;
9898 }
9899 }
9900 }
9901
9902 skipped_lookup:
9903 #if CONFIG_FILE_LEASES
9904 /* Lease break needed for source's parent dir? */
9905 vnode_breakdirlease(fdvp, false, O_WRONLY);
9906
9907 /* Lease break needed for target's parent dir? */
9908 vnode_breakdirlease(tdvp, false, O_WRONLY);
9909 #endif
9910
9911 error = vn_rename(fdvp, &fvp, &fromnd->ni_cnd, fvap,
9912 tdvp, &tvp, &tond->ni_cnd, tvap,
9913 flags, ctx);
9914
9915 if (locked_vp) {
9916 vnode_link_unlock(fvp);
9917 locked_vp = NULLVP;
9918 }
9919
9920 if (holding_mntlock) {
9921 /*
9922 * we can drop our serialization
9923 * lock now
9924 */
9925 mount_unlock_renames(locked_mp);
9926 mount_drop(locked_mp, 0);
9927 holding_mntlock = 0;
9928 }
9929 if (error) {
9930 if (error == EDATALESS) {
9931 /*
9932 * If we've been here before, something has gone
9933 * horribly wrong and we should just get out lest
9934 * we spiral around the drain forever.
9935 */
9936 if (flags & VFS_RENAME_DATALESS) {
9937 error = EIO;
9938 goto out1;
9939 }
9940
9941 /*
9942 * The object we're renaming is dataless (or has a
9943 * dataless descendent) and requires materialization
9944 * before the rename occurs. But we're holding the
9945 * mount point's rename lock, so it's not safe to
9946 * make the upcall.
9947 *
9948 * In this case, we release the lock (above), perform
9949 * the materialization, and start the whole thing over.
9950 */
9951 error = vfs_materialize_reparent(fvp, tdvp);
9952 if (error == 0) {
9953 /*
9954 * The next time around we need to tell the
9955 * file system that the materializtaion has
9956 * been performed.
9957 */
9958 flags |= VFS_RENAME_DATALESS;
9959 do_retry = 1;
9960 }
9961 goto out1;
9962 }
9963 if (error == EKEEPLOOKING) {
9964 if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) == 0) {
9965 if ((tond->ni_flag & NAMEI_CONTLOOKUP) == 0) {
9966 panic("EKEEPLOOKING without NAMEI_CONTLOOKUP on either ndp?");
9967 }
9968 }
9969
9970 fromnd->ni_vp = fvp;
9971 tond->ni_vp = tvp;
9972
9973 goto continue_lookup;
9974 }
9975
9976 /*
9977 * We may encounter a race in the VNOP where the destination didn't
9978 * exist when we did the namei, but it does by the time we go and
9979 * try to create the entry. In this case, we should re-drive this rename
9980 * call from the top again. Currently, only HFS bubbles out ERECYCLE,
9981 * but other filesystems susceptible to this race could return it, too.
9982 */
9983 if (error == ERECYCLE) {
9984 if (retry_count < MAX_RENAME_ERECYCLE_RETRIES) {
9985 do_retry = 1;
9986 retry_count += 1;
9987 } else {
9988 printf("rename retry limit due to ERECYCLE reached\n");
9989 error = ENOENT;
9990 }
9991 }
9992
9993 /*
9994 * For compound VNOPs, the authorization callback may return
9995 * ENOENT in case of racing hardlink lookups hitting the name
9996 * cache, redrive the lookup.
9997 */
9998 if (batched && error == ENOENT) {
9999 if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
10000 do_retry = 1;
10001 retry_count += 1;
10002 }
10003 }
10004
10005 goto out1;
10006 }
10007
10008 /* call out to allow 3rd party notification of rename.
10009 * Ignore result of kauth_authorize_fileop call.
10010 */
10011 kauth_authorize_fileop(vfs_context_ucred(ctx),
10012 KAUTH_FILEOP_RENAME,
10013 (uintptr_t)from_name, (uintptr_t)to_name);
10014 if (flags & VFS_RENAME_SWAP) {
10015 kauth_authorize_fileop(vfs_context_ucred(ctx),
10016 KAUTH_FILEOP_RENAME,
10017 (uintptr_t)to_name, (uintptr_t)from_name);
10018 }
10019
10020 #if CONFIG_FSE
10021 if (from_name != NULL && to_name != NULL) {
10022 if (from_truncated || to_truncated) {
10023 // set it here since only the from_finfo gets reported up to user space
10024 from_finfo.mode |= FSE_TRUNCATED_PATH;
10025 }
10026
10027 if (tvap && tvp) {
10028 vnode_get_fse_info_from_vap(tvp, &to_finfo, tvap);
10029 }
10030 if (fvap) {
10031 vnode_get_fse_info_from_vap(fvp, &from_finfo, fvap);
10032 }
10033
10034 if (tvp) {
10035 add_fsevent(FSE_RENAME, ctx,
10036 FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
10037 FSE_ARG_FINFO, &from_finfo,
10038 FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
10039 FSE_ARG_FINFO, &to_finfo,
10040 FSE_ARG_DONE);
10041 if (flags & VFS_RENAME_SWAP) {
10042 /*
10043 * Strictly speaking, swap is the equivalent of
10044 * *three* renames. FSEvents clients should only take
10045 * the events as a hint, so we only bother reporting
10046 * two.
10047 */
10048 add_fsevent(FSE_RENAME, ctx,
10049 FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
10050 FSE_ARG_FINFO, &to_finfo,
10051 FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
10052 FSE_ARG_FINFO, &from_finfo,
10053 FSE_ARG_DONE);
10054 }
10055 } else {
10056 add_fsevent(FSE_RENAME, ctx,
10057 FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
10058 FSE_ARG_FINFO, &from_finfo,
10059 FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
10060 FSE_ARG_DONE);
10061 }
10062 }
10063 #endif /* CONFIG_FSE */
10064
10065 /*
10066 * update filesystem's mount point data
10067 */
10068 if (mntrename) {
10069 char *cp, *pathend, *mpname;
10070 char * tobuf;
10071 struct mount *mp;
10072 int maxlen;
10073 size_t len = 0;
10074
10075 mp = fvp->v_mountedhere;
10076
10077 if (vfs_busy(mp, LK_NOWAIT)) {
10078 error = EBUSY;
10079 goto out1;
10080 }
10081 tobuf = zalloc(ZV_NAMEI);
10082
10083 if (UIO_SEG_IS_USER_SPACE(segflg)) {
10084 error = copyinstr(to, tobuf, MAXPATHLEN, &len);
10085 } else {
10086 error = copystr((void *)to, tobuf, MAXPATHLEN, &len);
10087 }
10088 if (!error) {
10089 /* find current mount point prefix */
10090 pathend = &mp->mnt_vfsstat.f_mntonname[0];
10091 for (cp = pathend; *cp != '\0'; ++cp) {
10092 if (*cp == '/') {
10093 pathend = cp + 1;
10094 }
10095 }
10096 /* find last component of target name */
10097 for (mpname = cp = tobuf; *cp != '\0'; ++cp) {
10098 if (*cp == '/') {
10099 mpname = cp + 1;
10100 }
10101 }
10102
10103 /* Update f_mntonname of sub mounts */
10104 vfs_iterate(0, rename_submounts_callback,
10105 (void *)mp->mnt_vfsstat.f_mntonname);
10106
10107 /* append name to prefix */
10108 maxlen = MAXPATHLEN - (int)(pathend - mp->mnt_vfsstat.f_mntonname);
10109 bzero(pathend, maxlen);
10110
10111 strlcpy(pathend, mpname, maxlen);
10112 }
10113 zfree(ZV_NAMEI, tobuf);
10114
10115 vfs_unbusy(mp);
10116
10117 vfs_event_signal(NULL, VQ_UPDATE, (intptr_t)NULL);
10118 } else if (dirrename) {
10119 /*
10120 * If we renamed a directory, we need to check if there is any sub
10121 * mount(s) mounted under the directory. If so, then we need to update
10122 * the sub mount's f_mntonname path.
10123 */
10124 vfs_iterate(0, rename_submounts_callback, (void *)old_dirpath);
10125 }
10126
10127 /*
10128 * fix up name & parent pointers. note that we first
10129 * check that fvp has the same name/parent pointers it
10130 * had before the rename call... this is a 'weak' check
10131 * at best...
10132 *
10133 * XXX oparent and oname may not be set in the compound vnop case
10134 */
10135 if (batched || (oname == fvp->v_name && oparent == fvp->v_parent)) {
10136 int update_flags;
10137
10138 update_flags = VNODE_UPDATE_NAME;
10139
10140 if (fdvp != tdvp) {
10141 update_flags |= VNODE_UPDATE_PARENT;
10142 }
10143
10144 vnode_update_identity(fvp, tdvp, tond->ni_cnd.cn_nameptr, tond->ni_cnd.cn_namelen, tond->ni_cnd.cn_hash, update_flags);
10145 }
10146 out1:
10147 /*
10148 * There are some cases (for e.g. 'fvp == tvp') when vn_authorize was
10149 * skipped earlier as no actual rename was performed.
10150 */
10151 if (vn_authorize_skipped && error == 0) {
10152 error = vn_authorize_renamex_with_paths(fdvp, fvp,
10153 &fromnd->ni_cnd, from_name, tdvp, tvp, &tond->ni_cnd, to_name, ctx,
10154 flags, NULL);
10155 if (error && error == ENOENT) {
10156 if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
10157 do_retry = 1;
10158 retry_count += 1;
10159 }
10160 }
10161 }
10162 if (locked_vp) {
10163 assert(locked_vp == fvp);
10164 vnode_link_unlock(locked_vp);
10165 locked_vp = NULLVP;
10166 }
10167 if (to_name != NULL) {
10168 RELEASE_PATH(to_name);
10169 to_name = NULL;
10170 }
10171 if (to_name_no_firmlink != NULL) {
10172 RELEASE_PATH(to_name_no_firmlink);
10173 to_name_no_firmlink = NULL;
10174 }
10175 if (from_name != NULL) {
10176 RELEASE_PATH(from_name);
10177 from_name = NULL;
10178 }
10179 if (from_name_no_firmlink != NULL) {
10180 RELEASE_PATH(from_name_no_firmlink);
10181 from_name_no_firmlink = NULL;
10182 }
10183 if (old_dirpath != NULL) {
10184 zfree(ZV_NAMEI, old_dirpath);
10185 old_dirpath = NULL;
10186 }
10187 if (holding_mntlock) {
10188 mount_unlock_renames(locked_mp);
10189 mount_drop(locked_mp, 0);
10190 holding_mntlock = 0;
10191 }
10192 if (tdvp) {
10193 /*
10194 * nameidone has to happen before we vnode_put(tdvp)
10195 * since it may need to release the fs_nodelock on the tdvp
10196 */
10197 nameidone(tond);
10198
10199 if (tvp) {
10200 vnode_put(tvp);
10201 }
10202 vnode_put(tdvp);
10203 }
10204 if (fdvp) {
10205 /*
10206 * nameidone has to happen before we vnode_put(fdvp)
10207 * since it may need to release the fs_nodelock on the fdvp
10208 */
10209 nameidone(fromnd);
10210
10211 if (fvp) {
10212 vnode_put(fvp);
10213 }
10214 vnode_put(fdvp);
10215 }
10216 if (mnt_fvp != NULLVP) {
10217 vnode_put(mnt_fvp);
10218 }
10219 /*
10220 * If things changed after we did the namei, then we will re-drive
10221 * this rename call from the top.
10222 */
10223 if (do_retry) {
10224 do_retry = 0;
10225 goto retry;
10226 }
10227
10228 kfree_type(typeof(*__rename_data), __rename_data);
10229 return error;
10230 }
10231
10232 int
rename(__unused proc_t p,struct rename_args * uap,__unused int32_t * retval)10233 rename(__unused proc_t p, struct rename_args *uap, __unused int32_t *retval)
10234 {
10235 return renameat_internal(vfs_context_current(), AT_FDCWD, uap->from,
10236 AT_FDCWD, uap->to, UIO_USERSPACE, 0);
10237 }
10238
10239 int
renameatx_np(__unused proc_t p,struct renameatx_np_args * uap,__unused int32_t * retval)10240 renameatx_np(__unused proc_t p, struct renameatx_np_args *uap, __unused int32_t *retval)
10241 {
10242 if (uap->flags & ~(RENAME_SECLUDE | RENAME_EXCL | RENAME_SWAP | RENAME_NOFOLLOW_ANY | RENAME_RESOLVE_BENEATH)) {
10243 return EINVAL;
10244 }
10245
10246 if ((uap->flags & (RENAME_EXCL | RENAME_SWAP)) == (RENAME_EXCL | RENAME_SWAP)) {
10247 return EINVAL;
10248 }
10249
10250 return renameat_internal(vfs_context_current(), uap->fromfd, uap->from,
10251 uap->tofd, uap->to, UIO_USERSPACE, uap->flags);
10252 }
10253
10254 int
renameat(__unused proc_t p,struct renameat_args * uap,__unused int32_t * retval)10255 renameat(__unused proc_t p, struct renameat_args *uap, __unused int32_t *retval)
10256 {
10257 return renameat_internal(vfs_context_current(), uap->fromfd, uap->from,
10258 uap->tofd, uap->to, UIO_USERSPACE, 0);
10259 }
10260
10261 /*
10262 * Make a directory file.
10263 *
10264 * Returns: 0 Success
10265 * EEXIST
10266 * namei:???
10267 * vnode_authorize:???
10268 * vn_create:???
10269 */
10270 /* ARGSUSED */
static int
mkdir1at(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap, int fd,
    enum uio_seg segflg)
{
	vnode_t vp, dvp;
	int error;
	int update_flags = 0;
	int batched;
	struct nameidata nd;

	AUDIT_ARG(mode, vap->va_mode);
	/*
	 * CREATE-intent lookup; LOCKPARENT keeps the parent (dvp) around so
	 * the new entry can be created in it.  NAMEI_COMPOUNDMKDIR allows
	 * filesystems that support compound VNOPs to do lookup+mkdir in a
	 * single call.
	 */
	NDINIT(&nd, CREATE, OP_MKDIR, LOCKPARENT | AUDITVNPATH1, segflg,
	    path, ctx);
	nd.ni_cnd.cn_flags |= WILLBEDIR;
	nd.ni_flag = NAMEI_COMPOUNDMKDIR;

continue_lookup:
	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* A vnode for the last component means the target already exists. */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}

	/* NOTE(review): 'batched' is assigned but not consulted below. */
	batched = vnode_compound_mkdir_available(dvp);

	VATTR_SET(vap, va_type, VDIR);

	/*
	 * XXX
	 * Don't authorize in VFS for compound VNOP.... mkdir -p today assumes that it will
	 * only get EXISTS or EISDIR for existing path components, and not that it could see
	 * EACCESS/EPERM--so if we authorize for mkdir on "/" for "mkdir -p /tmp/foo/bar/baz"
	 * it will fail in a spurious manner.  Need to figure out if this is valid behavior.
	 */
	if ((error = vn_authorize_mkdir(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0) {
		if (error == EACCES || error == EPERM) {
			int error2;

			nameidone(&nd);
			vnode_put(dvp);
			dvp = NULLVP;

			/*
			 * Try a lookup without "NAMEI_COMPOUNDVNOP" to make sure we return EEXIST
			 * rather than EACCESS if the target exists.
			 */
			NDINIT(&nd, LOOKUP, OP_MKDIR, AUDITVNPATH1, segflg,
			    path, ctx);
			error2 = nameiat(&nd, fd);
			if (error2) {
				goto out;
			} else {
				/* Target exists: report EEXIST, not the auth error. */
				vp = nd.ni_vp;
				error = EEXIST;
				goto out;
			}
		}

		goto out;
	}

#if CONFIG_FILE_LEASES
	/* Creating an entry modifies the parent; break any directory lease. */
	vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

	/*
	 * make the directory
	 */
	if ((error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx)) != 0) {
		if (error == EKEEPLOOKING) {
			/* Compound VNOP wants the lookup continued; loop back. */
			nd.ni_vp = vp;
			goto continue_lookup;
		}

		goto out;
	}

	// Make sure the name & parent pointers are hooked up
	if (vp->v_name == NULL) {
		update_flags |= VNODE_UPDATE_NAME;
	}
	if (vp->v_parent == NULLVP) {
		update_flags |= VNODE_UPDATE_PARENT;
	}

	if (update_flags) {
		vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
	}

#if CONFIG_FSE
	add_fsevent(FSE_CREATE_DIR, ctx, FSE_ARG_VNODE, vp, FSE_ARG_DONE);
#endif

out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	if (dvp) {
		vnode_put(dvp);
	}

	return error;
}
10386
10387 /*
10388 * mkdir_extended: Create a directory; with extended security (ACL).
10389 *
10390 * Parameters: p Process requesting to create the directory
10391 * uap User argument descriptor (see below)
10392 * retval (ignored)
10393 *
10394 * Indirect: uap->path Path of directory to create
10395 * uap->mode Access permissions to set
10396 * uap->xsecurity ACL to set
10397 *
10398 * Returns: 0 Success
10399 * !0 Not success
10400 *
10401 */
10402 int
mkdir_extended(proc_t p,struct mkdir_extended_args * uap,__unused int32_t * retval)10403 mkdir_extended(proc_t p, struct mkdir_extended_args *uap, __unused int32_t *retval)
10404 {
10405 int ciferror;
10406 kauth_filesec_t xsecdst;
10407 struct vnode_attr va;
10408
10409 AUDIT_ARG(owner, uap->uid, uap->gid);
10410
10411 xsecdst = NULL;
10412 if ((uap->xsecurity != USER_ADDR_NULL) &&
10413 ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)) {
10414 return ciferror;
10415 }
10416
10417 VATTR_INIT(&va);
10418 VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd.fd_cmask);
10419 if (xsecdst != NULL) {
10420 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
10421 va.va_vaflags |= VA_FILESEC_ACL;
10422 }
10423
10424 ciferror = mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
10425 UIO_USERSPACE);
10426 if (xsecdst != NULL) {
10427 kauth_filesec_free(xsecdst);
10428 }
10429 return ciferror;
10430 }
10431
10432 int
mkdir(proc_t p,struct mkdir_args * uap,__unused int32_t * retval)10433 mkdir(proc_t p, struct mkdir_args *uap, __unused int32_t *retval)
10434 {
10435 struct vnode_attr va;
10436
10437 VATTR_INIT(&va);
10438 VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd.fd_cmask);
10439
10440 return mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
10441 UIO_USERSPACE);
10442 }
10443
10444 int
mkdirat(proc_t p,struct mkdirat_args * uap,__unused int32_t * retval)10445 mkdirat(proc_t p, struct mkdirat_args *uap, __unused int32_t *retval)
10446 {
10447 struct vnode_attr va;
10448
10449 VATTR_INIT(&va);
10450 VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd.fd_cmask);
10451
10452 return mkdir1at(vfs_context_current(), uap->path, &va, uap->fd,
10453 UIO_USERSPACE);
10454 }
10455
/*
 * rmdirat_internal: core implementation of directory removal.
 * 'dirpath' (in segment 'segflg') is resolved relative to 'fd';
 * 'unlink_flags' carries VNODE_REMOVE_* modifiers.  Returns 0 or errno.
 */
static int
rmdirat_internal(vfs_context_t ctx, int fd, user_addr_t dirpath,
    enum uio_seg segflg, int unlink_flags)
{
	/*
	 * Heap-allocate the large nameidata (plus the fsevents vnode_attr)
	 * instead of placing them on the kernel stack.
	 */
	struct {
		struct nameidata nd;
#if CONFIG_FSE
		struct vnode_attr va;
#endif /* CONFIG_FSE */
	} *__rmdir_data;
	vnode_t vp, dvp;
	int error;
	struct nameidata *ndp;
	char *path = NULL;
	char *no_firmlink_path = NULL;
	int len_path = 0;
	int len_no_firmlink_path = 0;
	int has_listeners = 0;
	int need_event = 0;
	int truncated_path = 0;
	int truncated_no_firmlink_path = 0;
	struct vnode_attr *vap = NULL;
	int restart_count = 0;
	int batched;

	int restart_flag;
	int nofollow_any = 0;
	int resolve_beneath = 0;

	__rmdir_data = kalloc_type(typeof(*__rmdir_data), Z_WAITOK);
	ndp = &__rmdir_data->nd;

	/* Translate VNODE_REMOVE_* lookup modifiers into namei flags. */
	if (unlink_flags & VNODE_REMOVE_NOFOLLOW_ANY) {
		nofollow_any = NAMEI_NOFOLLOW_ANY;
		unlink_flags &= ~VNODE_REMOVE_NOFOLLOW_ANY;
	}
	if (unlink_flags & VNODE_REMOVE_RESOLVE_BENEATH) {
		resolve_beneath = NAMEI_RESOLVE_BENEATH;
		unlink_flags &= ~VNODE_REMOVE_RESOLVE_BENEATH;
	}

	/*
	 * This loop exists to restart rmdir in the unlikely case that two
	 * processes are simultaneously trying to remove the same directory
	 * containing orphaned appleDouble files.
	 */
	do {
		NDINIT(ndp, DELETE, OP_RMDIR, LOCKPARENT | AUDITVNPATH1,
		    segflg, dirpath, ctx);
		ndp->ni_flag = NAMEI_COMPOUNDRMDIR | nofollow_any | resolve_beneath;
continue_lookup:
		restart_flag = 0;
		vap = NULL;

		error = nameiat(ndp, fd);
		if (error) {
			goto err_out;
		}

		dvp = ndp->ni_dvp;
		vp = ndp->ni_vp;

		if (vp) {
			batched = vnode_compound_rmdir_available(vp);

			if (vp->v_flag & VROOT) {
				/*
				 * The root of a mounted filesystem cannot be deleted.
				 */
				error = EBUSY;
				goto out;
			}

#if DEVELOPMENT || DEBUG
			/*
			 * XXX VSWAP: Check for entitlements or special flag here
			 * so we can restrict access appropriately.
			 */
#else /* DEVELOPMENT || DEBUG */

			/* Swap-backing files may only be removed by the kernel itself. */
			if (vnode_isswap(vp) && (ctx != vfs_context_kernel())) {
				error = EPERM;
				goto out;
			}
#endif /* DEVELOPMENT || DEBUG */

			/*
			 * Removed a check here; we used to abort if vp's vid
			 * was not the same as what we'd seen the last time around.
			 * I do not think that check was valid, because if we retry
			 * and all dirents are gone, the directory could legitimately
			 * be recycled but still be present in a situation where we would
			 * have had permission to delete. Therefore, we won't make
			 * an effort to preserve that check now that we may not have a
			 * vp here.
			 */

			if (!batched) {
				error = vn_authorize_rmdir(dvp, vp, &ndp->ni_cnd, ctx, NULL);
				if (error) {
					/* Bounded redrive on racy ENOENT from authorization. */
					if (error == ENOENT) {
						if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
							restart_flag = 1;
							restart_count += 1;
						}
					}
					goto out;
				}
			}
		} else {
			/* No vp from namei: only legal for compound-rmdir filesystems. */
			batched = 1;

			if (!vnode_compound_rmdir_available(dvp)) {
				panic("No error, but no compound rmdir?");
			}
		}

#if CONFIG_FSE
		fse_info finfo = {0};

		need_event = need_fsevent(FSE_DELETE, dvp);
		if (need_event) {
			if (!batched) {
				get_fse_info(vp, &finfo, ctx);
			} else {
				/* Batched case: attrs are gathered by the VNOP via 'vap'. */
				error = vfs_get_notify_attributes(&__rmdir_data->va);
				if (error) {
					goto out;
				}

				vap = &__rmdir_data->va;
			}
		}
#endif
		has_listeners = kauth_authorize_fileop_has_listeners();
		if (need_event || has_listeners) {
			if (path == NULL) {
				GET_PATH(path);
			}

			/* NOTE(review): len_path is computed but not consumed below. */
			len_path = safe_getpath(dvp, ndp->ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path);

			if (no_firmlink_path == NULL) {
				GET_PATH(no_firmlink_path);
			}

			len_no_firmlink_path = safe_getpath_no_firmlink(dvp, ndp->ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);
#if CONFIG_FSE
			if (truncated_no_firmlink_path) {
				finfo.mode |= FSE_TRUNCATED_PATH;
			}
#endif
		}

#if CONFIG_FILE_LEASES
		/* Removing an entry modifies the parent; break any directory lease. */
		vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

		error = vn_rmdir(dvp, &vp, ndp, vap, ctx);
		ndp->ni_vp = vp;
		if (vp == NULLVP) {
			/* Couldn't find a vnode */
			goto out;
		}

		if (error == EKEEPLOOKING) {
			/* Compound VNOP wants the lookup continued; loop back. */
			goto continue_lookup;
		} else if (batched && error == ENOENT) {
			if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
				/*
				 * For compound VNOPs, the authorization callback
				 * may return ENOENT in case of racing hard link lookups
				 * redrive the lookup.
				 */
				restart_flag = 1;
				restart_count += 1;
				goto out;
			}
		}

		/*
		 * XXX There's no provision for passing flags
		 * to VNOP_RMDIR().  So, if vn_rmdir() fails
		 * because it's not empty, then we try again
		 * with VNOP_REMOVE(), passing in a special
		 * flag that clever file systems will know
		 * how to handle.
		 */
		if (error == ENOTEMPTY &&
		    (unlink_flags & VNODE_REMOVE_DATALESS_DIR) != 0) {
			/*
			 * Only do this if the directory is actually
			 * marked as DATALESS.
			 */
			struct vnode_attr *lvap =
			    kalloc_type(struct vnode_attr, Z_WAITOK);

			VATTR_INIT(lvap);
			VATTR_WANTED(lvap, va_flags);
			if (vnode_getattr(vp, lvap, ctx) == 0 &&
			    VATTR_IS_SUPPORTED(lvap, va_flags) &&
			    (lvap->va_flags & SF_DATALESS) != 0) {
				/*
				 * If this fails, we want to keep the original
				 * error.
				 */
				if (vn_remove(dvp, &vp, ndp,
				    VNODE_REMOVE_DATALESS_DIR, vap, ctx) == 0) {
					error = 0;
				}
			}
			kfree_type(struct vnode_attr, lvap);
		}

#if CONFIG_APPLEDOUBLE
		/*
		 * Special case to remove orphaned AppleDouble
		 * files. I don't like putting this in the kernel,
		 * but carbon does not like putting this in carbon either,
		 * so here we are.
		 */
		if (error == ENOTEMPTY) {
			int ad_error = rmdir_remove_orphaned_appleDouble(vp, ctx, &restart_flag);
			if (ad_error == EBUSY) {
				error = ad_error;
				goto out;
			}


			/*
			 * Assuming everything went well, we will try the RMDIR again
			 */
			if (!ad_error) {
				error = vn_rmdir(dvp, &vp, ndp, vap, ctx);
			}
		}
#endif /* CONFIG_APPLEDOUBLE */
		/*
		 * Call out to allow 3rd party notification of delete.
		 * Ignore result of kauth_authorize_fileop call.
		 */
		if (!error) {
			if (has_listeners) {
				kauth_authorize_fileop(vfs_context_ucred(ctx),
				    KAUTH_FILEOP_DELETE,
				    (uintptr_t)vp,
				    (uintptr_t)path);
			}

			if (vp->v_flag & VISHARDLINK) {
				// see the comment in unlink1() about why we update
				// the parent of a hard link when it is removed
				vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
			}

#if CONFIG_FSE
			if (need_event) {
				if (vap) {
					vnode_get_fse_info_from_vap(vp, &finfo, vap);
				}
				add_fsevent(FSE_DELETE, ctx,
				    FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
				    FSE_ARG_FINFO, &finfo,
				    FSE_ARG_DONE);
			}
#endif

#if CONFIG_MACF
			mac_vnode_notify_unlink(ctx, dvp, vp, &ndp->ni_cnd);
#endif
		}

out:
		if (path != NULL) {
			RELEASE_PATH(path);
			path = NULL;
		}

		if (no_firmlink_path != NULL) {
			RELEASE_PATH(no_firmlink_path);
			no_firmlink_path = NULL;
		}

		/*
		 * nameidone has to happen before we vnode_put(dvp)
		 * since it may need to release the fs_nodelock on the dvp
		 */
		nameidone(ndp);
		vnode_put(dvp);

		if (vp) {
			vnode_put(vp);
		}

		/*
		 * wakeup/tsleep pair coordinates concurrent removers during
		 * the AppleDouble-orphan restart handshake — NOTE(review):
		 * vp is used only as a wait channel here.
		 */
		if (restart_flag == 0) {
			wakeup_one((caddr_t)vp);
			goto err_out;
		}
		tsleep(vp, PVFS, "rm AD", 1);
	} while (restart_flag != 0);

err_out:
	kfree_type(typeof(*__rmdir_data), __rmdir_data);

	return error;
}
10762
10763 /*
10764 * Remove a directory file.
10765 */
10766 /* ARGSUSED */
10767 int
rmdir(__unused proc_t p,struct rmdir_args * uap,__unused int32_t * retval)10768 rmdir(__unused proc_t p, struct rmdir_args *uap, __unused int32_t *retval)
10769 {
10770 return rmdirat_internal(vfs_context_current(), AT_FDCWD,
10771 CAST_USER_ADDR_T(uap->path), UIO_USERSPACE, 0);
10772 }
10773
/*
 * Get direntry length for a name of 'namlen' bytes, padded to 8 byte
 * alignment.  The (MAXPATHLEN-1) term removes the unused portion of the
 * fixed-size name field from sizeof(struct direntry).
 */
#define DIRENT64_LEN(namlen) \
	((sizeof(struct direntry) + (namlen) - (MAXPATHLEN-1) + 7) & ~7)

/*
 * Get dirent length for a name of 'namelen' bytes, padded to 4 byte
 * alignment; same substitution as above with dirent's
 * (__DARWIN_MAXNAMLEN + 1)-byte name field.
 */
#define DIRENT_LEN(namelen) \
	((sizeof(struct dirent) + (namelen + 1) - (__DARWIN_MAXNAMLEN + 1) + 3) & ~3)

/* Get the address of the last byte of this dirent, per its d_reclen. */
#define DIRENT_END(dep) \
	(((char *)(dep)) + (dep)->d_reclen - 1)
10785
10786 errno_t
vnode_readdir64(struct vnode * vp,struct uio * uio,int flags,int * eofflag,int * numdirent,vfs_context_t ctxp)10787 vnode_readdir64(struct vnode *vp, struct uio *uio, int flags, int *eofflag,
10788 int *numdirent, vfs_context_t ctxp)
10789 {
10790 /* Check if fs natively supports VNODE_READDIR_EXTENDED */
10791 if ((vp->v_mount->mnt_vtable->vfc_vfsflags & VFC_VFSREADDIR_EXTENDED) &&
10792 ((vp->v_mount->mnt_kern_flag & MNTK_DENY_READDIREXT) == 0)) {
10793 return VNOP_READDIR(vp, uio, flags, eofflag, numdirent, ctxp);
10794 } else {
10795 size_t bufsize;
10796 void * bufptr;
10797 uio_t auio;
10798 struct direntry *entry64;
10799 struct dirent *dep;
10800 size_t bytesread;
10801 int error;
10802
10803 /*
10804 * We're here because the underlying file system does not
10805 * support direnties or we mounted denying support so we must
10806 * fall back to dirents and convert them to direntries.
10807 *
10808 * Our kernel buffer needs to be smaller since re-packing will
10809 * expand each dirent. The worse case (when the name length
10810 * is 3 or less) corresponds to a struct direntry size of 32
10811 * bytes (8-byte aligned) and a struct dirent size of 12 bytes
10812 * (4-byte aligned). So having a buffer that is 3/8 the size
10813 * will prevent us from reading more than we can pack.
10814 *
10815 * Since this buffer is wired memory, we will limit the
10816 * buffer size to a maximum of 32K. We would really like to
10817 * use 32K in the MIN(), but we use magic number 87371 to
10818 * prevent uio_resid() * 3 / 8 from overflowing.
10819 */
10820 bufsize = 3 * MIN((user_size_t)uio_resid(uio), 87371u) / 8;
10821 bufptr = kalloc_data(bufsize, Z_WAITOK);
10822 if (bufptr == NULL) {
10823 return ENOMEM;
10824 }
10825
10826 auio = uio_create(1, 0, UIO_SYSSPACE, UIO_READ);
10827 uio_addiov(auio, (uintptr_t)bufptr, bufsize);
10828 auio->uio_offset = uio->uio_offset;
10829
10830 error = VNOP_READDIR(vp, auio, 0, eofflag, numdirent, ctxp);
10831
10832 dep = (struct dirent *)bufptr;
10833 bytesread = bufsize - uio_resid(auio);
10834
10835 entry64 = kalloc_type(struct direntry, Z_WAITOK);
10836 /*
10837 * Convert all the entries and copy them out to user's buffer.
10838 */
10839 while (error == 0 && (char *)dep < ((char *)bufptr + bytesread)) {
10840 /* First check that the dirent struct up to d_name is within the buffer */
10841 if ((char*)dep + offsetof(struct dirent, d_name) > ((char *)bufptr + bytesread) ||
10842 /* Check that the length of the entire dirent is within the buffer */
10843 DIRENT_END(dep) > ((char *)bufptr + bytesread) ||
10844 /* Check that the actual length including the name doesn't exceed d_reclen */
10845 DIRENT_LEN(dep->d_namlen) > dep->d_reclen) {
10846 printf("%s: %s: Bad dirent recived from directory %s\n", __func__,
10847 vp->v_mount->mnt_vfsstat.f_mntonname,
10848 vp->v_name ? vp->v_name : "<unknown>");
10849 error = EIO;
10850 break;
10851 }
10852
10853 size_t enbufsize = DIRENT64_LEN(dep->d_namlen);
10854
10855 bzero(entry64, enbufsize);
10856 /* Convert a dirent to a dirent64. */
10857 entry64->d_ino = dep->d_ino;
10858 entry64->d_seekoff = 0;
10859 entry64->d_reclen = (uint16_t)enbufsize;
10860 entry64->d_namlen = dep->d_namlen;
10861 entry64->d_type = dep->d_type;
10862 bcopy(dep->d_name, entry64->d_name, dep->d_namlen + 1);
10863
10864 /* Move to next entry. */
10865 dep = (struct dirent *)((char *)dep + dep->d_reclen);
10866
10867 /* Copy entry64 to user's buffer. */
10868 error = uiomove((caddr_t)entry64, entry64->d_reclen, uio);
10869 }
10870
10871 /* Update the real offset using the offset we got from VNOP_READDIR. */
10872 if (error == 0) {
10873 uio->uio_offset = auio->uio_offset;
10874 }
10875 uio_free(auio);
10876 kfree_data(bufptr, bufsize);
10877 kfree_type(struct direntry, entry64);
10878 return error;
10879 }
10880 }
10881
10882 #define GETDIRENTRIES_MAXBUFSIZE (128 * 1024 * 1024U)
10883
10884 /*
10885 * Read a block of directory entries in a file system independent format.
10886 */
/*
 * Common implementation for getdirentries(2)/getdirentries64(2): read
 * directory entries from descriptor 'fd' into the user buffer
 * 'bufp'/'bufsize'.  On success, '*bytesread' is the number of bytes
 * produced, '*offset' (if non-NULL) is the directory offset before the
 * read, and '*eofflag' is what the underlying VNOP reported.
 */
static int
getdirentries_common(int fd, user_addr_t bufp, user_size_t bufsize, ssize_t *bytesread,
    off_t *offset, int *eofflag, int flags)
{
	vnode_t vp;
	struct vfs_context context = *vfs_context_current();    /* local copy */
	struct fileproc *fp;
	uio_t auio;
	int spacetype = proc_is64bit(vfs_context_proc(&context)) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	off_t loff;
	int error, numdirent;
	UIO_STACKBUF(uio_buf, 1);

get_from_fd:
	error = fp_getfvp(vfs_context_proc(&context), fd, &fp, &vp);
	if (error) {
		return error;
	}

	/*
	 * Serialize fd-offset updates.  If the vnode backing the fd changed
	 * between fp_getfvp() and taking the offset lock (e.g. the union
	 * mount replacement below raced us), drop everything and retry.
	 */
	vn_offset_lock(fp->fp_glob);
	if (((vnode_t)fp_get_data(fp)) != vp) {
		vn_offset_unlock(fp->fp_glob);
		file_drop(fd);
		goto get_from_fd;
	}

	/* The descriptor must have been opened for reading. */
	if ((fp->fp_glob->fg_flag & FREAD) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EBADF;
		goto out;
	}

	/* Clamp oversized requests to the global maximum. */
	if (bufsize > GETDIRENTRIES_MAXBUFSIZE) {
		bufsize = GETDIRENTRIES_MAXBUFSIZE;
	}

#if CONFIG_MACF
	error = mac_file_check_change_offset(vfs_context_ucred(&context), fp->fp_glob);
	if (error) {
		goto out;
	}
#endif

	if ((error = vnode_getwithref(vp))) {
		goto out;
	}
	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

#if CONFIG_UNION_MOUNTS
unionread:
#endif /* CONFIG_UNION_MOUNTS */
	if (vp->v_type != VDIR) {
		(void)vnode_put(vp);
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_readdir(&context, vp);
	if (error != 0) {
		(void)vnode_put(vp);
		goto out;
	}
#endif /* MAC */

	/* Read from the fd's current offset; advance it with the uio after. */
	loff = fp->fp_glob->fg_offset;
	auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, bufp, bufsize);

	if (flags & VNODE_READDIR_EXTENDED) {
		error = vnode_readdir64(vp, auio, flags, eofflag, &numdirent, &context);
		fp->fp_glob->fg_offset = uio_offset(auio);
	} else {
		error = VNOP_READDIR(vp, auio, 0, eofflag, &numdirent, &context);
		fp->fp_glob->fg_offset = uio_offset(auio);
	}
	if (error) {
		(void)vnode_put(vp);
		goto out;
	}

#if CONFIG_UNION_MOUNTS
	/*
	 * An empty read on the upper layer of a union mount: descend to the
	 * lower vnode, reset the offset, swap the fd's backing vnode, and
	 * continue reading there.
	 */
	if ((user_ssize_t)bufsize == uio_resid(auio) &&
	    (vp->v_mount->mnt_flag & MNT_UNION)) {
		vnode_t uvp;

		if (lookup_traverse_union(vp, &uvp, &context) == 0) {
			if (vnode_ref(uvp) == 0) {
				fp_set_data(fp, uvp);
				fp->fp_glob->fg_offset = 0;
				vnode_rele(vp);
				vnode_put(vp);
				vp = uvp;
				goto unionread;
			} else {
				/* could not get a ref, can't replace in fd */
				vnode_put(uvp);
			}
		}
	}
#endif /* CONFIG_UNION_MOUNTS */

	vnode_put(vp);
	if (offset) {
		*offset = loff;
	}

	*bytesread = bufsize - uio_resid(auio);
out:
	vn_offset_unlock(fp->fp_glob);
	file_drop(fd);
	return error;
}
11000
11001
11002 int
getdirentries(__unused struct proc * p,struct getdirentries_args * uap,int32_t * retval)11003 getdirentries(__unused struct proc *p, struct getdirentries_args *uap, int32_t *retval)
11004 {
11005 off_t offset;
11006 ssize_t bytesread;
11007 int error, eofflag;
11008
11009 AUDIT_ARG(fd, uap->fd);
11010 error = getdirentries_common(uap->fd, uap->buf, uap->count,
11011 &bytesread, &offset, &eofflag, 0);
11012
11013 if (error == 0) {
11014 if (proc_is64bit(p)) {
11015 user64_long_t base = (user64_long_t)offset;
11016 error = copyout((caddr_t)&base, uap->basep, sizeof(user64_long_t));
11017 } else {
11018 user32_long_t base = (user32_long_t)offset;
11019 error = copyout((caddr_t)&base, uap->basep, sizeof(user32_long_t));
11020 }
11021 *retval = (int)bytesread;
11022 }
11023 return error;
11024 }
11025
11026 int
getdirentries64(__unused struct proc * p,struct getdirentries64_args * uap,user_ssize_t * retval)11027 getdirentries64(__unused struct proc *p, struct getdirentries64_args *uap, user_ssize_t *retval)
11028 {
11029 off_t offset;
11030 ssize_t bytesread;
11031 int error, eofflag;
11032 user_size_t bufsize;
11033
11034 AUDIT_ARG(fd, uap->fd);
11035
11036 /*
11037 * If the buffer is at least GETDIRENTRIES64_EXTENDED_BUFSIZE large,
11038 * then the kernel carves out the last 4 bytes to return extended
11039 * information to userspace (namely whether we reached EOF with this call).
11040 */
11041 if (uap->bufsize >= GETDIRENTRIES64_EXTENDED_BUFSIZE) {
11042 bufsize = uap->bufsize - sizeof(getdirentries64_flags_t);
11043 } else {
11044 bufsize = uap->bufsize;
11045 }
11046
11047 error = getdirentries_common(uap->fd, uap->buf, bufsize,
11048 &bytesread, &offset, &eofflag, VNODE_READDIR_EXTENDED);
11049
11050 if (error == 0) {
11051 *retval = bytesread;
11052 error = copyout((caddr_t)&offset, uap->position, sizeof(off_t));
11053
11054 if (error == 0 && uap->bufsize >= GETDIRENTRIES64_EXTENDED_BUFSIZE) {
11055 getdirentries64_flags_t flags = 0;
11056 if (eofflag) {
11057 flags |= GETDIRENTRIES64_EOF;
11058 }
11059 error = copyout(&flags, (user_addr_t)uap->buf + bufsize,
11060 sizeof(flags));
11061 }
11062 }
11063 return error;
11064 }
11065
11066
11067 /*
11068 * Set the mode mask for creation of filesystem nodes.
11069 * XXX implement xsecurity
11070 */
11071 #define UMASK_NOXSECURITY (void *)1 /* leave existing xsecurity alone */
11072 static int
umask1(proc_t p,int newmask,__unused kauth_filesec_t fsec,int32_t * retval)11073 umask1(proc_t p, int newmask, __unused kauth_filesec_t fsec, int32_t *retval)
11074 {
11075 AUDIT_ARG(mask, newmask);
11076 proc_fdlock(p);
11077 *retval = p->p_fd.fd_cmask;
11078 p->p_fd.fd_cmask = newmask & ALLPERMS;
11079 proc_fdunlock(p);
11080 return 0;
11081 }
11082
11083 /*
11084 * umask_extended: Set the mode mask for creation of filesystem nodes; with extended security (ACL).
11085 *
11086 * Parameters: p Process requesting to set the umask
11087 * uap User argument descriptor (see below)
11088 * retval umask of the process (parameter p)
11089 *
11090 * Indirect: uap->newmask umask to set
11091 * uap->xsecurity ACL to set
11092 *
11093 * Returns: 0 Success
11094 * !0 Not success
11095 *
11096 */
int
umask_extended(proc_t p, struct umask_extended_args *uap, int32_t *retval)
{
	/*
	 * xsecurity is not yet honored (see the XXX above umask1);
	 * KAUTH_FILESEC_NONE tells umask1() to ignore the ACL argument.
	 */
	return umask1(p, uap->newmask, KAUTH_FILESEC_NONE, retval);
}
11102
int
umask(proc_t p, struct umask_args *uap, int32_t *retval)
{
	/* Plain umask(2): leave any existing extended security (ACL) alone. */
	return umask1(p, uap->newmask, UMASK_NOXSECURITY, retval);
}
11108
11109 #define REVOKE_MOUNTED_DEVICE_ENTITLEMENT \
11110 "com.apple.private.vfs.revoke-mounted-device"
11111
/*
 * Void all references to file by ripping underlying filesystem
 * away from vnode.
 *
 * Only supported on character/block device nodes; the caller must own
 * the node or be superuser, and MACF gets a veto.
 */
/* ARGSUSED */
int
revoke(proc_t p, struct revoke_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	struct vnode_attr va;
	vfs_context_t ctx = vfs_context_current();
	int error;
	struct nameidata nd;

	/* Translate the user path into a vnode (takes an iocount). */
	NDINIT(&nd, LOOKUP, OP_REVOKE, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
	    uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;

	nameidone(&nd);

	/* revoke(2) is only supported on character and block devices here. */
	if (!(vnode_ischr(vp) || vnode_isblk(vp))) {
		error = ENOTSUP;
		goto out;
	}

	/* Refuse to revoke a block device that currently backs a mount. */
	if (vnode_isblk(vp) && vnode_ismountedon(vp)) {
		error = EBUSY;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_revoke(ctx, vp);
	if (error) {
		goto out;
	}
#endif

	/* Only the owner of the node or the superuser may revoke access. */
	VATTR_INIT(&va);
	VATTR_WANTED(&va, va_uid);
	if ((error = vnode_getattr(vp, &va, ctx))) {
		goto out;
	}
	if (kauth_cred_getuid(vfs_context_ucred(ctx)) != va.va_uid &&
	    (error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
		goto out;
	}
	/* Only bother revoking if there are users or aliases of the node. */
	if (vp->v_usecount > 0 || (vnode_isaliased(vp))) {
		VNOP_REVOKE(vp, REVOKEALL, ctx);
	}
out:
	/* Release the iocount taken by namei(). */
	vnode_put(vp);
	return error;
}
11169
11170
/*
 * HFS/HFS Plus SPECIFIC SYSTEM CALLS
 * The following system calls are designed to support features
 * which are specific to the HFS & HFS Plus volume formats
 */
11176
11177
11178 /*
11179 * Obtain attribute information on objects in a directory while enumerating
11180 * the directory.
11181 */
/* ARGSUSED */
/*
 * getdirentriesattr: enumerate a directory (by fd) while collecting the
 * requested attributes for each entry via VNOP_READDIRATTR.  Copies the
 * entry count, a directory "state" cookie, and the pre-read offset back
 * out to userspace; *retval is the EOF indicator (0 or 1).
 */
int
getdirentriesattr(proc_t p, struct getdirentriesattr_args *uap, int32_t *retval)
{
	vnode_t vp;
	struct fileproc *fp;
	uio_t auio = NULL;
	int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	uint32_t count = 0, savecount = 0;
	uint32_t newstate = 0;
	int error, eofflag = 0;
	off_t loff = 0;
	struct attrlist attributelist;
	vfs_context_t ctx = vfs_context_current();
	int fd = uap->fd;
	UIO_STACKBUF(uio_buf, 1);
	kauth_action_t action;

	AUDIT_ARG(fd, fd);

	/* Get the attributes into kernel space */
	if ((error = copyin(uap->alist, (caddr_t)&attributelist, sizeof(attributelist)))) {
		return error;
	}
	if ((error = copyin(uap->count, (caddr_t)&count, sizeof(count)))) {
		return error;
	}
	/* Remember the caller's count so it can be reset on union traversal. */
	savecount = count;

get_from_fd:
	if ((error = fp_getfvp(p, fd, &fp, &vp))) {
		return error;
	}

	/*
	 * The fd's backing vnode can change between fp_getfvp() and taking
	 * the offset lock (e.g. when another thread swaps it during a
	 * union-mount traversal below); retry until the two agree.
	 */
	vn_offset_lock(fp->fp_glob);
	if (((vnode_t)fp_get_data(fp)) != vp) {
		vn_offset_unlock(fp->fp_glob);
		file_drop(fd);
		goto get_from_fd;
	}

	/* The fd must have been opened for reading. */
	if ((fp->fp_glob->fg_flag & FREAD) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EBADF;
		goto out;
	}


#if CONFIG_MACF
	error = mac_file_check_change_offset(vfs_context_ucred(ctx),
	    fp->fp_glob);
	if (error) {
		goto out;
	}
#endif


	if ((error = vnode_getwithref(vp))) {
		goto out;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

#if CONFIG_UNION_MOUNTS
unionread:
#endif /* CONFIG_UNION_MOUNTS */
	if (vp->v_type != VDIR) {
		(void)vnode_put(vp);
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_readdir(ctx, vp);
	if (error != 0) {
		(void)vnode_put(vp);
		goto out;
	}
#endif /* MAC */

	/* set up the uio structure which will contain the users return buffer */
	loff = fp->fp_glob->fg_offset;
	auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, uap->buffer, uap->buffersize);

	/*
	 * If the only item requested is file names, we can let that past with
	 * just LIST_DIRECTORY.  If they want any other attributes, that means
	 * they need SEARCH as well.
	 */
	action = KAUTH_VNODE_LIST_DIRECTORY;
	if ((attributelist.commonattr & ~ATTR_CMN_NAME) ||
	    attributelist.fileattr || attributelist.dirattr) {
		action |= KAUTH_VNODE_SEARCH;
	}

	if ((error = vnode_authorize(vp, NULL, action, ctx)) == 0) {
		/* Believe it or not, uap->options only has 32-bits of valid
		 * info, so truncate before extending again */

		error = VNOP_READDIRATTR(vp, &attributelist, auio, count,
		    (uint32_t)uap->options, &newstate, &eofflag, &count, ctx);
	}

	if (error) {
		(void) vnode_put(vp);
		goto out;
	}

#if CONFIG_UNION_MOUNTS
	/*
	 * If we've got the last entry of a directory in a union mount
	 * then reset the eofflag and pretend there's still more to come.
	 * The next call will again set eofflag and the buffer will be empty,
	 * so traverse to the underlying directory and do the directory
	 * read there.
	 */
	if (eofflag && vp->v_mount->mnt_flag & MNT_UNION) {
		if (uio_resid(auio) < (user_ssize_t) uap->buffersize) { // Got some entries
			eofflag = 0;
		} else { // Empty buffer
			vnode_t uvp;
			if (lookup_traverse_union(vp, &uvp, ctx) == 0) {
				if (vnode_ref_ext(uvp, fp->fp_glob->fg_flag & O_EVTONLY, 0) == 0) {
					/* Swap the fd over to the lower-layer directory. */
					fp_set_data(fp, uvp);
					fp->fp_glob->fg_offset = 0; // reset index for new dir
					count = savecount;
					vnode_rele_internal(vp, fp->fp_glob->fg_flag & O_EVTONLY, 0, 0);
					vnode_put(vp);
					vp = uvp;
					goto unionread;
				} else {
					/* could not get a ref, can't replace in fd */
					vnode_put(uvp);
				}
			}
		}
	}
#endif /* CONFIG_UNION_MOUNTS */

	(void)vnode_put(vp);

	if (error) {
		goto out;
	}
	fp->fp_glob->fg_offset = uio_offset(auio); /* should be multiple of dirent, not variable */

	/* Copy the results (count, state cookie, base offset) to userspace. */
	if ((error = copyout((caddr_t) &count, uap->count, sizeof(count)))) {
		goto out;
	}
	if ((error = copyout((caddr_t) &newstate, uap->newstate, sizeof(newstate)))) {
		goto out;
	}
	if ((error = copyout((caddr_t) &loff, uap->basep, sizeof(loff)))) {
		goto out;
	}

	*retval = eofflag;  /* similar to getdirentries */
	error = 0;
out:
	vn_offset_unlock(fp->fp_glob);
	file_drop(fd);
	return error;        /* on success, a retval of 0 or 1 (EOF indicator) */
} /* end of getdirentriesattr system call */
11346
11347 /*
11348 * Exchange data between two files
11349 */
11350
/* ARGSUSED */
/*
 * exchangedata: swap the data of two regular files on the same volume
 * (VNOP_EXCHANGE), then swap their cached names/parents so the name
 * cache stays coherent, and emit fsevents/fileop notifications.
 */
int
exchangedata(__unused proc_t p, struct exchangedata_args *uap, __unused int32_t *retval)
{
	struct nameidata fnd, snd;
	vfs_context_t ctx = vfs_context_current();
	vnode_t fvp;
	vnode_t svp;
	int error;
	u_int32_t nameiflags;
	char *fpath = NULL;
	char *spath = NULL;
	int flen = 0, slen = 0;
	int from_truncated = 0, to_truncated = 0;
#if CONFIG_FSE
	fse_info f_finfo, s_finfo;
#endif

	nameiflags = 0;
	if ((uap->options & FSOPT_NOFOLLOW) == 0) {
		nameiflags |= FOLLOW;
	}

	/* Look up the first path (takes an iocount on fvp). */
	NDINIT(&fnd, LOOKUP, OP_EXCHANGEDATA, nameiflags | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path1, ctx);

	error = namei(&fnd);
	if (error) {
		goto out2;
	}

	nameidone(&fnd);
	fvp = fnd.ni_vp;

	/* Look up the second path (takes an iocount on svp). */
	NDINIT(&snd, LOOKUP, OP_EXCHANGEDATA, CN_NBMOUNTLOOK | nameiflags | AUDITVNPATH2,
	    UIO_USERSPACE, uap->path2, ctx);

	error = namei(&snd);
	if (error) {
		vnode_put(fvp);
		goto out2;
	}
	nameidone(&snd);
	svp = snd.ni_vp;

	/*
	 * if the files are the same, return an inval error
	 */
	if (svp == fvp) {
		error = EINVAL;
		goto out;
	}

	/*
	 * if the files are on different volumes, return an error
	 */
	if (svp->v_mount != fvp->v_mount) {
		error = EXDEV;
		goto out;
	}

	/* If they're not files, return an error */
	if ((vnode_isreg(fvp) == 0) || (vnode_isreg(svp) == 0)) {
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_exchangedata(ctx,
	    fvp, svp);
	if (error) {
		goto out;
	}
#endif
	/* The caller must have read and write access to both files. */
	if (((error = vnode_authorize(fvp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0) ||
	    ((error = vnode_authorize(svp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0)) {
		goto out;
	}

	/*
	 * Only gather paths/finfo up front if someone (fsevents or a
	 * fileop listener) will consume them after the exchange.
	 */
	if (
#if CONFIG_FSE
		need_fsevent(FSE_EXCHANGE, fvp) ||
#endif
		kauth_authorize_fileop_has_listeners()) {
		GET_PATH(fpath);
		GET_PATH(spath);

		flen = safe_getpath(fvp, NULL, fpath, MAXPATHLEN, &from_truncated);
		slen = safe_getpath(svp, NULL, spath, MAXPATHLEN, &to_truncated);

#if CONFIG_FSE
		get_fse_info(fvp, &f_finfo, ctx);
		get_fse_info(svp, &s_finfo, ctx);
		if (from_truncated || to_truncated) {
			// set it here since only the f_finfo gets reported up to user space
			f_finfo.mode |= FSE_TRUNCATED_PATH;
		}
#endif
	}
	/* Ok, make the call */
	error = VNOP_EXCHANGE(fvp, svp, 0, ctx);

	if (error == 0) {
		const char *tmpname;

		if (fpath != NULL && spath != NULL) {
			/* call out to allow 3rd party notification of exchangedata.
			 * Ignore result of kauth_authorize_fileop call.
			 */
			kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_EXCHANGE,
			    (uintptr_t)fpath, (uintptr_t)spath);
		}
		/*
		 * The files' contents were swapped, so swap the cached
		 * names and parent pointers too.
		 */
		name_cache_lock();

		tmpname = fvp->v_name;
		fvp->v_name = svp->v_name;
		svp->v_name = tmpname;

		if (fvp->v_parent != svp->v_parent) {
			vnode_t tmp;

			tmp = fvp->v_parent;
			fvp->v_parent = svp->v_parent;
			svp->v_parent = tmp;
		}
		name_cache_unlock();

#if CONFIG_FSE
		if (fpath != NULL && spath != NULL) {
			add_fsevent(FSE_EXCHANGE, ctx,
			    FSE_ARG_STRING, flen, fpath,
			    FSE_ARG_FINFO, &f_finfo,
			    FSE_ARG_STRING, slen, spath,
			    FSE_ARG_FINFO, &s_finfo,
			    FSE_ARG_DONE);
		}
#endif
	}

out:
	if (fpath != NULL) {
		RELEASE_PATH(fpath);
	}
	if (spath != NULL) {
		RELEASE_PATH(spath);
	}
	vnode_put(svp);
	vnode_put(fvp);
out2:
	return error;
}
11502
11503 /*
11504 * Return (in MB) the amount of freespace on the given vnode's volume.
11505 */
11506 uint32_t freespace_mb(vnode_t vp);
11507
11508 uint32_t
freespace_mb(vnode_t vp)11509 freespace_mb(vnode_t vp)
11510 {
11511 vfs_update_vfsstat(vp->v_mount, vfs_context_current(), VFS_USER_EVENT);
11512 return (uint32_t)(((uint64_t)vp->v_mount->mnt_vfsstat.f_bavail *
11513 vp->v_mount->mnt_vfsstat.f_bsize) >> 20);
11514 }
11515
11516 #if CONFIG_SEARCHFS
11517
/* ARGSUSED */
/*
 * searchfs: search a volume for objects matching caller-supplied
 * attribute criteria (VNOP_SEARCHFS), copying matching entries into the
 * caller's return buffer.  Search parameters, return-attribute list and
 * opaque per-search state all arrive via a user fssearchblock; the state
 * is copied back out so the search can be resumed across calls.  On
 * union mounts the search is restarted once per layer (EAGAIN between
 * layers).
 */
int
searchfs(proc_t p, struct searchfs_args *uap, __unused int32_t *retval)
{
	vnode_t vp, tvp;
	int i, error = 0;
	int fserror = 0;
	struct nameidata nd;
	struct user64_fssearchblock searchblock;
	struct searchstate *state;
	struct attrlist *returnattrs;
	struct timeval timelimit;
	void *searchparams1, *searchparams2;
	uio_t auio = NULL;
	int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	uint32_t nummatches;
	size_t mallocsize;
	uint32_t nameiflags;
	vfs_context_t ctx = vfs_context_current();
	UIO_STACKBUF(uio_buf, 1);

	/* Start by copying in fsearchblock parameter list */
	if (IS_64BIT_PROCESS(p)) {
		error = copyin(uap->searchblock, (caddr_t) &searchblock, sizeof(searchblock));
		timelimit.tv_sec = searchblock.timelimit.tv_sec;
		timelimit.tv_usec = searchblock.timelimit.tv_usec;
	} else {
		struct user32_fssearchblock tmp_searchblock;

		error = copyin(uap->searchblock, (caddr_t) &tmp_searchblock, sizeof(tmp_searchblock));
		// munge into 64-bit version
		searchblock.returnattrs = CAST_USER_ADDR_T(tmp_searchblock.returnattrs);
		searchblock.returnbuffer = CAST_USER_ADDR_T(tmp_searchblock.returnbuffer);
		searchblock.returnbuffersize = tmp_searchblock.returnbuffersize;
		searchblock.maxmatches = tmp_searchblock.maxmatches;
		/*
		 * These casts are safe. We will promote the tv_sec into a 64 bit long if necessary
		 * from a 32 bit long, and tv_usec is already a signed 32 bit int.
		 */
		timelimit.tv_sec = (__darwin_time_t) tmp_searchblock.timelimit.tv_sec;
		timelimit.tv_usec = (__darwin_useconds_t) tmp_searchblock.timelimit.tv_usec;
		searchblock.searchparams1 = CAST_USER_ADDR_T(tmp_searchblock.searchparams1);
		searchblock.sizeofsearchparams1 = tmp_searchblock.sizeofsearchparams1;
		searchblock.searchparams2 = CAST_USER_ADDR_T(tmp_searchblock.searchparams2);
		searchblock.sizeofsearchparams2 = tmp_searchblock.sizeofsearchparams2;
		searchblock.searchattrs = tmp_searchblock.searchattrs;
	}
	if (error) {
		return error;
	}

	/* Do a sanity check on sizeofsearchparams1 and sizeofsearchparams2.
	 */
	if (searchblock.sizeofsearchparams1 > SEARCHFS_MAX_SEARCHPARMS ||
	    searchblock.sizeofsearchparams2 > SEARCHFS_MAX_SEARCHPARMS) {
		return EINVAL;
	}

	/* Now malloc a big bunch of space to hold the search parameters, the attrlists and the search state. */
	/* It all has to do into local memory and it's not that big so we might as well put it all together. */
	/* Searchparams1 shall be first so we might as well use that to hold the base address of the allocated*/
	/* block.                                                                                             */
	/*                                                                                                    */
	/* NOTE: we allocate an extra 8 bytes to account for the difference in size of the searchstate        */
	/*       due to the changes in rdar://problem/12438273.  That way if a 3rd party file system          */
	/*       assumes the size is still 556 bytes it will continue to work                                 */

	mallocsize = searchblock.sizeofsearchparams1 + searchblock.sizeofsearchparams2 +
	    sizeof(struct attrlist) + sizeof(struct searchstate) + (2 * sizeof(uint32_t));

	searchparams1 = kalloc_data(mallocsize, Z_WAITOK);

	/* Now set up the various pointers to the correct place in our newly allocated memory */

	searchparams2 = (void *) (((caddr_t) searchparams1) + searchblock.sizeofsearchparams1);
	returnattrs = (struct attrlist *) (((caddr_t) searchparams2) + searchblock.sizeofsearchparams2);
	state = (struct searchstate *) (((caddr_t) returnattrs) + sizeof(struct attrlist));

	/* Now copy in the stuff given our local variables. */

	if ((error = copyin(searchblock.searchparams1, searchparams1, searchblock.sizeofsearchparams1))) {
		goto freeandexit;
	}

	if ((error = copyin(searchblock.searchparams2, searchparams2, searchblock.sizeofsearchparams2))) {
		goto freeandexit;
	}

	if ((error = copyin(searchblock.returnattrs, (caddr_t) returnattrs, sizeof(struct attrlist)))) {
		goto freeandexit;
	}

	if ((error = copyin(uap->state, (caddr_t) state, sizeof(struct searchstate)))) {
		goto freeandexit;
	}

	/*
	 * When searching a union mount, need to set the
	 * start flag at the first call on each layer to
	 * reset state for the new volume.
	 */
	if (uap->options & SRCHFS_START) {
		state->ss_union_layer = 0;
	} else {
		uap->options |= state->ss_union_flags;
	}
	state->ss_union_flags = 0;

	/*
	 * Because searchparams1 and searchparams2 may contain an ATTR_CMN_NAME search parameter,
	 * which is passed in with an attrreference_t, we need to inspect the buffer manually here.
	 * The KPI does not provide us the ability to pass in the length of the buffers searchparams1
	 * and searchparams2. To obviate the need for all searchfs-supporting filesystems to
	 * validate the user-supplied data offset of the attrreference_t, we'll do it here.
	 *
	 * NOTE(review): only searchparams1 is validated below; confirm that
	 * searchparams2 can never carry an ATTR_CMN_NAME attrreference.
	 */

	if (searchblock.searchattrs.commonattr & ATTR_CMN_NAME) {
		attrreference_t* string_ref;
		u_int32_t* start_length;
		user64_size_t param_length;

		/* validate searchparams1 */
		param_length = searchblock.sizeofsearchparams1;
		/* skip the word that specifies length of the buffer */
		start_length = (u_int32_t*) searchparams1;
		start_length = start_length + 1;
		string_ref = (attrreference_t*) start_length;

		/* ensure no negative offsets or too big offsets */
		if (string_ref->attr_dataoffset < 0) {
			error = EINVAL;
			goto freeandexit;
		}
		if (string_ref->attr_length > MAXPATHLEN) {
			error = EINVAL;
			goto freeandexit;
		}

		/* Check for pointer overflow in the string ref */
		if (((char*) string_ref + string_ref->attr_dataoffset) < (char*) string_ref) {
			error = EINVAL;
			goto freeandexit;
		}

		/* The referenced string must lie entirely within searchparams1. */
		if (((char*) string_ref + string_ref->attr_dataoffset) > ((char*)searchparams1 + param_length)) {
			error = EINVAL;
			goto freeandexit;
		}
		if (((char*)string_ref + string_ref->attr_dataoffset + string_ref->attr_length) > ((char*)searchparams1 + param_length)) {
			error = EINVAL;
			goto freeandexit;
		}
	}

	/* set up the uio structure which will contain the users return buffer */
	auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, searchblock.returnbuffer, searchblock.returnbuffersize);

	nameiflags = 0;
	if ((uap->options & FSOPT_NOFOLLOW) == 0) {
		nameiflags |= FOLLOW;
	}
	NDINIT(&nd, LOOKUP, OP_SEARCHFS, nameiflags | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);

	error = namei(&nd);
	if (error) {
		goto freeandexit;
	}
	vp = nd.ni_vp;
	nameidone(&nd);

	/*
	 * Switch to the root vnode for the volume
	 */
	error = VFS_ROOT(vnode_mount(vp), &tvp, ctx);
	vnode_put(vp);
	if (error) {
		goto freeandexit;
	}
	vp = tvp;

#if CONFIG_UNION_MOUNTS
	/*
	 * If it's a union mount, the path lookup takes
	 * us to the top layer. But we may need to descend
	 * to a lower layer. For non-union mounts the layer
	 * is always zero.
	 */
	for (i = 0; i < (int) state->ss_union_layer; i++) {
		if ((vp->v_mount->mnt_flag & MNT_UNION) == 0) {
			break;
		}
		tvp = vp;
		vp = vp->v_mount->mnt_vnodecovered;
		if (vp == NULL) {
			vnode_put(tvp);
			error = ENOENT;
			goto freeandexit;
		}
		error = vnode_getwithref(vp);
		vnode_put(tvp);
		if (error) {
			goto freeandexit;
		}
	}
#endif /* CONFIG_UNION_MOUNTS */

#if CONFIG_MACF
	error = mac_vnode_check_searchfs(ctx, vp, returnattrs, &searchblock.searchattrs);
	if (error) {
		vnode_put(vp);
		goto freeandexit;
	}
#endif


	/*
	 * If searchblock.maxmatches == 0, then skip the search. This has happened
	 * before and sometimes the underlying code doesn't deal with it well.
	 */
	if (searchblock.maxmatches == 0) {
		nummatches = 0;
		goto saveandexit;
	}

	/*
	 * Allright, we have everything we need, so lets make that call.
	 *
	 * We keep special track of the return value from the file system:
	 * EAGAIN is an acceptable error condition that shouldn't keep us
	 * from copying out any results...
	 */

	fserror = VNOP_SEARCHFS(vp,
	    searchparams1,
	    searchparams2,
	    &searchblock.searchattrs,
	    (uint32_t)searchblock.maxmatches,
	    &timelimit,
	    returnattrs,
	    &nummatches,
	    (uint32_t)uap->scriptcode,
	    (uint32_t)uap->options,
	    auio,
	    (struct searchstate *) &state->ss_fsstate,
	    ctx);

#if CONFIG_UNION_MOUNTS
	/*
	 * If it's a union mount we need to be called again
	 * to search the mounted-on filesystem.
	 */
	if ((vp->v_mount->mnt_flag & MNT_UNION) && fserror == 0) {
		state->ss_union_flags = SRCHFS_START;
		state->ss_union_layer++;        // search next layer down
		fserror = EAGAIN;
	}
#endif /* CONFIG_UNION_MOUNTS */

saveandexit:

	vnode_put(vp);

	/* Now copy out the stuff that needs copying out. That means the number of matches, the
	 * search state.  Everything was already put into the return buffer by the vop call. */

	if ((error = copyout((caddr_t) state, uap->state, sizeof(struct searchstate))) != 0) {
		goto freeandexit;
	}

	if ((error = suulong(uap->nummatches, (uint64_t)nummatches)) != 0) {
		goto freeandexit;
	}

	/* Report the (possibly EAGAIN) filesystem status as the final error. */
	error = fserror;

freeandexit:

	kfree_data(searchparams1, mallocsize);

	return error;
} /* end of searchfs system call */
11802
11803 #else /* CONFIG_SEARCHFS */
11804
int
searchfs(__unused proc_t p, __unused struct searchfs_args *uap, __unused int32_t *retval)
{
	/* searchfs(2) stub for kernels built without CONFIG_SEARCHFS. */
	return ENOTSUP;
}
11810
11811 #endif /* CONFIG_SEARCHFS */
11812
11813
11814 #if CONFIG_DATALESS_FILES
11815
11816 /*
11817 * === Namespace Resolver Up-call Mechanism ===
11818 *
11819 * When I/O is performed to a dataless file or directory (read, write,
11820 * lookup-in, etc.), the file system performs an upcall to the namespace
11821 * resolver (filecoordinationd) to materialize the object.
11822 *
11823 * We need multiple up-calls to be in flight at once, and we need these
11824 * up-calls to be interruptible, thus the following implementation:
11825 *
11826 * => The nspace_resolver_request represents the in-kernel request state.
11827 * It contains a request ID, storage space for the errno code returned
11828 * by filecoordinationd, and flags.
11829 *
11830 * => The request ID is simply a global monotonically incrementing 32-bit
11831 * number. Outstanding requests are stored in a hash table, and the
11832 * hash function is extremely simple.
11833 *
11834 * => When an upcall is to be made to filecoordinationd, a request structure
11835 * is allocated on the stack (it is small, and needs to live only during
11836 * the duration of the call to resolve_nspace_item_ext()). It is
11837 * initialized and inserted into the table. Some backpressure from
 *    filecoordinationd is applied by limiting the number of entries that
11839 * can be inserted into the table (and thus limiting the number of
11840 * outstanding requests issued to filecoordinationd); waiting for an
11841 * available slot is interruptible.
11842 *
11843 * => Once the request has been inserted into the table, the up-call is made
11844 * to filecoordinationd via a MiG-generated stub. The up-call returns
11845 * immediately and filecoordinationd processes the request asynchronously.
11846 *
 * => The caller now waits for the request to complete. This is achieved by
11848 * sleeping on the address of the request structure and waiting for
11849 * filecoordinationd to mark the request structure as complete. This
11850 * is an interruptible sleep call; if interrupted, the request structure
11851 * is removed from the table and EINTR is returned to the caller. If
11852 * this occurs, an advisory up-call is made to filecoordinationd with
11853 * the request ID to indicate that the request can be aborted or
11854 * de-prioritized at the discretion of filecoordinationd.
11855 *
11856 * => When filecoordinationd has completed the request, it signals completion
11857 * by writing to the vfs.nspace.complete sysctl node. Only a process
11858 * decorated as a namespace resolver can write to this sysctl node. The
11859 * value is a request ID / errno tuple passed as an array of 2 uint32_t's.
11860 * The request ID is looked up in the table, and if the request is found,
11861 * the error code is stored in the request structure and a wakeup()
11862 * issued on the address of the request structure. If the request is not
11863 * found, we simply drop the completion notification, assuming that the
11864 * caller was interrupted.
11865 *
11866 * => When the waiting thread wakes up, it extracts the error code from the
11867 * request structure, removes the request from the table, and returns the
11868 * error code to the calling function. Fini!
11869 */
11870
/*
 * In-kernel state for one outstanding materialization request to the
 * namespace resolver (see the block comment above).  Lives on the
 * requesting thread's stack and is linked into the request hash table.
 */
struct nspace_resolver_request {
	LIST_ENTRY(nspace_resolver_request) r_hashlink; /* request-table linkage */
	vnode_t r_vp;           /* object being materialized */
	vnode_t r_tdvp;         /* NOTE(review): presumably a target directory vnode for some ops — confirm with callers */
	uint32_t r_req_id;      /* unique request ID (next_nspace_req_id) */
	int r_resolver_error;   /* errno reported back by filecoordinationd */
	int r_flags;            /* RRF_* flags below */
};
11879
#define RRF_COMPLETE 0x0001     /* request completed; r_resolver_error is valid */
#define RRF_COMPLETING 0x0002   /* completion handler currently using the request */

/* Payload written to the vfs.nspace.complete sysctl by the resolver. */
struct nspace_resolver_completion_data {
	uint32_t req_id;          /* ID of the request being completed */
	int32_t resolver_error;   /* errno result from filecoordinationd */
	uint64_t orig_gencount;   /* NOTE(review): presumably the vnode gencount captured at request time — confirm */
	uint64_t orig_syncroot;   /* NOTE(review): presumably the sync-root id captured at request time — confirm */
};
11889
/*
 * Hand out the next resolver request ID from a global, atomically
 * incremented 32-bit counter.  IDs eventually wrap.
 *
 * NOTE(review): OSAddAtomic returns the value *before* the addition, so
 * the first ID issued is 0 — confirm callers tolerate a zero request ID.
 */
static uint32_t
next_nspace_req_id(void)
{
	static uint32_t next_req_id;

	return OSAddAtomic(1, &next_req_id);
}
11897
#define NSPACE_RESOLVER_REQ_HASHSIZE 32 /* XXX tune */
#define NSPACE_RESOLVER_MAX_OUTSTANDING 256 /* XXX tune */

/* Hash table of outstanding resolver requests, keyed by request ID. */
static LIST_HEAD(nspace_resolver_requesthead,
    nspace_resolver_request) * nspace_resolver_request_hashtbl;
static u_long nspace_resolver_request_hashmask;
/* Number of requests currently in the table (capped at MAX_OUTSTANDING). */
static u_int nspace_resolver_request_count;
/* True while some thread sleeps in req_add() waiting for a free slot. */
static bool nspace_resolver_request_wait_slot;
static LCK_GRP_DECLARE(nspace_resolver_request_lck_grp, "file namespace resolver");
static LCK_MTX_DECLARE(nspace_resolver_request_hash_mutex,
    &nspace_resolver_request_lck_grp);

/* Mutex protecting the table, count, and wait flag above. */
#define NSPACE_REQ_LOCK() \
	lck_mtx_lock(&nspace_resolver_request_hash_mutex)
#define NSPACE_REQ_UNLOCK() \
	lck_mtx_unlock(&nspace_resolver_request_hash_mutex)

/* Trivial hash: low bits of the monotonically assigned request ID. */
#define NSPACE_RESOLVER_HASH(req_id) \
	(&nspace_resolver_request_hashtbl[(req_id) & \
	nspace_resolver_request_hashmask])
11918
11919 static struct nspace_resolver_request *
nspace_resolver_req_lookup(uint32_t req_id,bool skip_completing)11920 nspace_resolver_req_lookup(uint32_t req_id, bool skip_completing)
11921 {
11922 struct nspace_resolver_requesthead *bucket;
11923 struct nspace_resolver_request *req;
11924
11925 bucket = NSPACE_RESOLVER_HASH(req_id);
11926 LIST_FOREACH(req, bucket, r_hashlink) {
11927 if (req->r_req_id == req_id) {
11928 /*
11929 * If this request already has a completion
11930 * pending, don't return it again.
11931 */
11932 if ((req->r_flags & RRF_COMPLETING) != 0 &&
11933 skip_completing) {
11934 req = NULL;
11935 }
11936 return req;
11937 }
11938 }
11939
11940 return NULL;
11941 }
11942
/*
 * Insert a resolver request into the outstanding-request table.
 * Returns 0 on success, or the errno from an interrupted sleep while
 * waiting for a free slot (table admission is bounded to apply
 * backpressure on filecoordinationd).
 */
static int
nspace_resolver_req_add(struct nspace_resolver_request *req)
{
	struct nspace_resolver_requesthead *bucket;
	int error;

	NSPACE_REQ_LOCK();

	/*
	 * Table full: sleep (interruptibly) until req_remove wakes us.
	 * PCATCH lets a signal abort the wait; the error is propagated.
	 */
	while (nspace_resolver_request_count >=
	    NSPACE_RESOLVER_MAX_OUTSTANDING) {
		nspace_resolver_request_wait_slot = true;
		error = msleep(&nspace_resolver_request_count,
		    &nspace_resolver_request_hash_mutex,
		    PVFS | PCATCH, "nspacerq", NULL);
		if (error) {
			NSPACE_REQ_UNLOCK();
			return error;
		}
	}

	bucket = NSPACE_RESOLVER_HASH(req->r_req_id);
#if DIAGNOSTIC
	assert(nspace_resolver_req_lookup(req->r_req_id, false) == NULL);
#endif /* DIAGNOSTIC */
	LIST_INSERT_HEAD(bucket, req, r_hashlink);
	nspace_resolver_request_count++;

	NSPACE_REQ_UNLOCK();

	return 0;
}
11974
/*
 * Block until no completion handler is using 'req'.
 * Called with NSPACE_REQ_LOCK held (msleep drops/reacquires it).
 */
static void
nspace_resolver_req_wait_pending_completion(struct nspace_resolver_request *req)
{
	/*
	 * If a completion is in-progress, we have to wait for the
	 * completion handler to finish because it's still using 'req',
	 * which is allocated on our stack a couple of frames up.
	 */
	while ((req->r_flags & RRF_COMPLETING) != 0) {
		(void) msleep(req, &nspace_resolver_request_hash_mutex,
		    PVFS, "nspacecmplt", NULL);
	}
}
11988
/*
 * Unhash 'req', release its table slot (waking any thread blocked in
 * nspace_resolver_req_add()), wait out any in-progress completion
 * handler that is still using 'req', and finally drop NSPACE_REQ_LOCK.
 * Entered with NSPACE_REQ_LOCK held; returns with it released.
 */
static void
nspace_resolver_req_remove_and_unlock(struct nspace_resolver_request *req)
{
	struct nspace_resolver_requesthead *bucket;

	/* We're called with NSPACE_REQ_LOCK held. */

	bucket = NSPACE_RESOLVER_HASH(req->r_req_id);
#if DIAGNOSTIC
	assert((req->r_flags & RRF_COMPLETING) == 0);
	assert(nspace_resolver_req_lookup(req->r_req_id, false) != NULL);
#endif /* DIAGNOSTIC */
	LIST_REMOVE(req, r_hashlink);
	nspace_resolver_request_count--;

	/* A slot just opened; wake any would-be adder waiting for one. */
	if (nspace_resolver_request_wait_slot) {
		nspace_resolver_request_wait_slot = false;
		wakeup(&nspace_resolver_request_count);
	}

	/*
	 * 'req' lives on a caller's stack; do not return until the
	 * completion handler (if any) has stopped touching it.
	 */
	nspace_resolver_req_wait_pending_completion(req);

	NSPACE_REQ_UNLOCK();
}
12013
/*
 * Convenience wrapper: take NSPACE_REQ_LOCK and remove 'req' from the
 * table.  The lock is released by the callee.
 */
static void
nspace_resolver_req_remove(struct nspace_resolver_request *req)
{
	NSPACE_REQ_LOCK();
	nspace_resolver_req_remove_and_unlock(req);
}
12020
12021 static void
nspace_resolver_req_cancel(uint32_t req_id)12022 nspace_resolver_req_cancel(uint32_t req_id)
12023 {
12024 kern_return_t kr;
12025 mach_port_t mp;
12026
12027 // Failures here aren't fatal -- the cancellation message
12028 // sent to the resolver is merely advisory.
12029
12030 kr = host_get_filecoordinationd_port(host_priv_self(), &mp);
12031 if (kr != KERN_SUCCESS || !IPC_PORT_VALID(mp)) {
12032 return;
12033 }
12034
12035 kr = send_nspace_resolve_cancel(mp, req_id);
12036 if (kr != KERN_SUCCESS) {
12037 os_log_error(OS_LOG_DEFAULT,
12038 "NSPACE send_nspace_resolve_cancel failure: %d", kr);
12039 }
12040
12041 ipc_port_release_send(mp);
12042 }
12043
/*
 * Wait (interruptibly) for 'req' to be completed by the resolver.
 * If the sleep is broken by a signal or timeout, the request is
 * failed locally (EINTR or ETIMEDOUT) and an advisory cancel is sent
 * to the resolver.  In every case the request is removed from the
 * table before returning.  Returns the request's resolver error
 * (0 on success).
 */
static int
nspace_resolver_req_wait(struct nspace_resolver_request *req)
{
	bool send_cancel_message = false;
	int error;

	NSPACE_REQ_LOCK();

	while ((req->r_flags & RRF_COMPLETE) == 0) {
		error = msleep(req, &nspace_resolver_request_hash_mutex,
		    PVFS | PCATCH, "nspace", NULL);
		/* ERESTART just means "retry the sleep". */
		if (error && error != ERESTART) {
			req->r_resolver_error = (error == EINTR) ? EINTR :
			    ETIMEDOUT;
			send_cancel_message = true;
			break;
		}
	}

	nspace_resolver_req_remove_and_unlock(req);

	/*
	 * It's safe to continue referencing 'req' here because it's
	 * allocated on our caller's stack.
	 */

	if (send_cancel_message) {
		nspace_resolver_req_cancel(req->r_req_id);
	}

	return req->r_resolver_error;
}
12076
/*
 * Record the resolver's verdict in 'req', clear RRF_COMPLETING, set
 * RRF_COMPLETE, and wake anyone sleeping on the request (the waiter
 * in nspace_resolver_req_wait() and, via the same channel, threads in
 * nspace_resolver_req_wait_pending_completion()).  Called with
 * NSPACE_REQ_LOCK held.
 */
static void
nspace_resolver_req_mark_complete(
	struct nspace_resolver_request *req,
	int resolver_error)
{
	req->r_resolver_error = resolver_error;
	req->r_flags = (req->r_flags & ~RRF_COMPLETING) | RRF_COMPLETE;
	wakeup(req);
}
12086
/*
 * Flag 'req' as having a completion in progress, so that lookups with
 * skip_completing == true ignore it and removers know the completion
 * handler is still using it.  Called with NSPACE_REQ_LOCK held.
 */
static void
nspace_resolver_req_mark_completion_pending(struct nspace_resolver_request *req)
{
	req->r_flags |= RRF_COMPLETING;
}
12092
12093 static void
nspace_resolver_req_completed(const struct nspace_resolver_completion_data * c)12094 nspace_resolver_req_completed(const struct nspace_resolver_completion_data *c)
12095 {
12096 struct nspace_resolver_request *req;
12097 int error;
12098 struct vnode_attr va;
12099 vnode_t vp;
12100
12101 NSPACE_REQ_LOCK();
12102
12103 req = nspace_resolver_req_lookup(c->req_id, true);
12104 if (req == NULL) {
12105 /*
12106 * If we don't find the request corresponding to our req_id,
12107 * just drop the completion on the floor; it's likely that
12108 * the requester interrupted with a signal, or it may already
12109 * be completing.
12110 */
12111 NSPACE_REQ_UNLOCK();
12112 return;
12113 }
12114
12115 /*
12116 * Get out now if the resolver reported an error.
12117 */
12118 if ((error = c->resolver_error) != 0) {
12119 goto out;
12120 }
12121
12122 /*
12123 * If the resolver did not specify any namespace shape criteria
12124 * for letting the operation proceed, then get out now.
12125 */
12126 if (c->orig_gencount == 0 && c->orig_syncroot == 0) {
12127 goto out;
12128 }
12129
12130 /*
12131 * We're going to have to acquire the mount rename lock and do
12132 * some I/O in order to verify the criteria. Mark the request
12133 * as pending so no one else messes with it after we drop the
12134 * NSPACE_REQ_LOCK.
12135 */
12136 nspace_resolver_req_mark_completion_pending(req);
12137 NSPACE_REQ_UNLOCK();
12138
12139 /*
12140 * Lock out renames from changing the shape of the tree while
12141 * validate the criteria.
12142 */
12143 mount_t locked_mp = req->r_vp->v_mount;
12144 mount_ref(locked_mp, 0);
12145 mount_lock_renames(locked_mp);
12146
12147 if (c->orig_gencount != 0) {
12148 vp = req->r_vp;
12149 if (error) {
12150 goto out_dropmount;
12151 }
12152
12153 VATTR_INIT(&va);
12154 VATTR_WANTED(&va, va_recursive_gencount);
12155 error = vnode_getattr(vp, &va, vfs_context_kernel());
12156 if (error) {
12157 goto out_dropmount;
12158 }
12159 if (VATTR_NOT_RETURNED(&va, va_recursive_gencount) ||
12160 va.va_recursive_gencount != c->orig_gencount) {
12161 printf("nspace.complete: gencount changed! (orig %llu cur %llu)\n",
12162 c->orig_gencount, va.va_recursive_gencount);
12163 error = EBUSY;
12164 goto out_dropmount;
12165 }
12166 }
12167
12168 /*
12169 * Ignore orig_syncroot if a destination directory wasn't specified
12170 * in the request.
12171 */
12172 if (c->orig_syncroot != 0 && (vp = req->r_tdvp) != NULL) {
12173 uint64_t syncroot_id;
12174
12175 if (error) {
12176 goto out_dropmount;
12177 }
12178
12179 #ifndef APFSIOC_GET_SYNC_ROOT
12180 #define APFSIOC_GET_SYNC_ROOT _IOR('J', 115, uint64_t)
12181 #endif
12182
12183 error = VNOP_IOCTL(vp, APFSIOC_GET_SYNC_ROOT,
12184 (caddr_t)&syncroot_id, 0, vfs_context_kernel());
12185 if (error) {
12186 goto out_dropmount;
12187 }
12188 if (syncroot_id != c->orig_syncroot) {
12189 printf("nspace.complete: syncroot changed! (orig %llu cur %llu)\n",
12190 c->orig_syncroot, syncroot_id);
12191 error = EBUSY;
12192 goto out_dropmount;
12193 }
12194 }
12195
12196 out_dropmount:
12197 mount_unlock_renames(locked_mp);
12198 mount_drop(locked_mp, 0);
12199 NSPACE_REQ_LOCK();
12200
12201 out:
12202 nspace_resolver_req_mark_complete(req, error);
12203 NSPACE_REQ_UNLOCK();
12204 }
12205
12206 static struct proc *nspace_resolver_proc;
12207
12208 static int
nspace_resolver_get_proc_state(struct proc * p,int * is_resolver)12209 nspace_resolver_get_proc_state(struct proc *p, int *is_resolver)
12210 {
12211 *is_resolver = ((p->p_lflag & P_LNSPACE_RESOLVER) &&
12212 p == nspace_resolver_proc) ? 1 : 0;
12213 return 0;
12214 }
12215
12216 static boolean_t vfs_context_is_dataless_resolver(vfs_context_t);
12217
12218 static int
nspace_resolver_set_proc_state(struct proc * p,int is_resolver)12219 nspace_resolver_set_proc_state(struct proc *p, int is_resolver)
12220 {
12221 vfs_context_t ctx = vfs_context_current();
12222 int error = 0;
12223
12224 //
12225 // The system filecoordinationd runs as uid == 0. This also
12226 // has the nice side-effect of filtering out filecoordinationd
12227 // running in the simulator.
12228 //
12229 if (!vfs_context_issuser(ctx) ||
12230 !vfs_context_is_dataless_resolver(ctx)) {
12231 return EPERM;
12232 }
12233
12234 if (is_resolver) {
12235 NSPACE_REQ_LOCK();
12236
12237 if (nspace_resolver_proc == NULL) {
12238 proc_lock(p);
12239 p->p_lflag |= P_LNSPACE_RESOLVER;
12240 proc_unlock(p);
12241 nspace_resolver_proc = p;
12242 } else {
12243 error = EBUSY;
12244 }
12245
12246 NSPACE_REQ_UNLOCK();
12247 } else {
12248 // This is basically just like the exit case.
12249 // nspace_resolver_exited() will verify that the
12250 // process is the resolver, and will clear the
12251 // global.
12252 nspace_resolver_exited(p);
12253 }
12254
12255 return error;
12256 }
12257
12258 static int
nspace_materialization_get_proc_state(struct proc * p,int * is_prevented)12259 nspace_materialization_get_proc_state(struct proc *p, int *is_prevented)
12260 {
12261 if ((p->p_lflag & P_LNSPACE_RESOLVER) != 0 ||
12262 (p->p_vfs_iopolicy &
12263 P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES) == 0) {
12264 *is_prevented = 1;
12265 } else {
12266 *is_prevented = 0;
12267 }
12268 return 0;
12269 }
12270
12271 static int
nspace_materialization_set_proc_state(struct proc * p,int is_prevented)12272 nspace_materialization_set_proc_state(struct proc *p, int is_prevented)
12273 {
12274 if (p->p_lflag & P_LNSPACE_RESOLVER) {
12275 return is_prevented ? 0 : EBUSY;
12276 }
12277
12278 if (is_prevented) {
12279 OSBitAndAtomic16(~((uint16_t)P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES), &p->p_vfs_iopolicy);
12280 } else {
12281 OSBitOrAtomic16((uint16_t)P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES, &p->p_vfs_iopolicy);
12282 }
12283 return 0;
12284 }
12285
12286 static int
nspace_materialization_get_thread_state(int * is_prevented)12287 nspace_materialization_get_thread_state(int *is_prevented)
12288 {
12289 uthread_t ut = current_uthread();
12290
12291 *is_prevented = (ut->uu_flag & UT_NSPACE_NODATALESSFAULTS) ? 1 : 0;
12292 return 0;
12293 }
12294
12295 static int
nspace_materialization_set_thread_state(int is_prevented)12296 nspace_materialization_set_thread_state(int is_prevented)
12297 {
12298 uthread_t ut = current_uthread();
12299
12300 if (is_prevented) {
12301 ut->uu_flag |= UT_NSPACE_NODATALESSFAULTS;
12302 } else {
12303 ut->uu_flag &= ~UT_NSPACE_NODATALESSFAULTS;
12304 }
12305 return 0;
12306 }
12307
12308 /* the vfs.nspace branch */
12309 SYSCTL_NODE(_vfs, OID_AUTO, nspace, CTLFLAG_RW | CTLFLAG_LOCKED, NULL, "vfs nspace hinge");
12310
12311 static int
sysctl_nspace_resolver(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)12312 sysctl_nspace_resolver(__unused struct sysctl_oid *oidp,
12313 __unused void *arg1, __unused int arg2, struct sysctl_req *req)
12314 {
12315 struct proc *p = req->p;
12316 int new_value, old_value, changed = 0;
12317 int error;
12318
12319 error = nspace_resolver_get_proc_state(p, &old_value);
12320 if (error) {
12321 return error;
12322 }
12323
12324 error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
12325 &changed);
12326 if (error == 0 && changed) {
12327 error = nspace_resolver_set_proc_state(p, new_value);
12328 }
12329 return error;
12330 }
12331
12332 /* decorate this process as the dataless file resolver */
12333 SYSCTL_PROC(_vfs_nspace, OID_AUTO, resolver,
12334 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
12335 0, 0, sysctl_nspace_resolver, "I", "");
12336
12337 static int
sysctl_nspace_prevent_materialization(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)12338 sysctl_nspace_prevent_materialization(__unused struct sysctl_oid *oidp,
12339 __unused void *arg1, __unused int arg2, struct sysctl_req *req)
12340 {
12341 struct proc *p = req->p;
12342 int new_value, old_value, changed = 0;
12343 int error;
12344
12345 error = nspace_materialization_get_proc_state(p, &old_value);
12346 if (error) {
12347 return error;
12348 }
12349
12350 error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
12351 &changed);
12352 if (error == 0 && changed) {
12353 error = nspace_materialization_set_proc_state(p, new_value);
12354 }
12355 return error;
12356 }
12357
12358 /* decorate this process as not wanting to materialize dataless files */
12359 SYSCTL_PROC(_vfs_nspace, OID_AUTO, prevent_materialization,
12360 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
12361 0, 0, sysctl_nspace_prevent_materialization, "I", "");
12362
12363 static int
sysctl_nspace_thread_prevent_materialization(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)12364 sysctl_nspace_thread_prevent_materialization(__unused struct sysctl_oid *oidp,
12365 __unused void *arg1, __unused int arg2, struct sysctl_req *req)
12366 {
12367 int new_value, old_value, changed = 0;
12368 int error;
12369
12370 error = nspace_materialization_get_thread_state(&old_value);
12371 if (error) {
12372 return error;
12373 }
12374
12375 error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
12376 &changed);
12377 if (error == 0 && changed) {
12378 error = nspace_materialization_set_thread_state(new_value);
12379 }
12380 return error;
12381 }
12382
12383 /* decorate this thread as not wanting to materialize dataless files */
12384 SYSCTL_PROC(_vfs_nspace, OID_AUTO, thread_prevent_materialization,
12385 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
12386 0, 0, sysctl_nspace_thread_prevent_materialization, "I", "");
12387
/*
 * vfs.nspace.complete sysctl handler: the registered resolver posts a
 * request completion here.  The write payload is read as up to three
 * consecutive opaque records: the mandatory { req_id, errno } pair,
 * then an optional gencount, then an optional syncroot ID.  The two
 * optional reads are best-effort; a short payload simply leaves them 0.
 */
static int
sysctl_nspace_complete(__unused struct sysctl_oid *oidp, __unused void *arg1,
    __unused int arg2, struct sysctl_req *req)
{
	struct proc *p = req->p;
	uint32_t req_status[2] = { 0, 0 };
	uint64_t gencount = 0;
	uint64_t syncroot = 0;
	int error, is_resolver, changed = 0, other_changed;

	/* Only the registered resolver process may post completions. */
	error = nspace_resolver_get_proc_state(p, &is_resolver);
	if (error) {
		return error;
	}

	if (!is_resolver) {
		return EPERM;
	}

	error = sysctl_io_opaque(req, req_status, sizeof(req_status),
	    &changed);
	if (error) {
		return error;
	}

	/*
	 * Get the gencount if it was passed.  Ignore errors, because
	 * it's optional.
	 */
	error = sysctl_io_opaque(req, &gencount, sizeof(gencount),
	    &other_changed);
	if (error) {
		gencount = 0;
		error = 0;
	}

	/*
	 * ...and now the syncroot ID.
	 */
	error = sysctl_io_opaque(req, &syncroot, sizeof(syncroot),
	    &other_changed);
	if (error) {
		syncroot = 0;
		error = 0;
	}

	/*
	 * req_status[0] is the req_id
	 *
	 * req_status[1] is the errno
	 */
	if (error == 0 && changed) {
		const struct nspace_resolver_completion_data cd = {
			.req_id = req_status[0],
			.resolver_error = req_status[1],
			.orig_gencount = gencount,
			.orig_syncroot = syncroot,
		};
		nspace_resolver_req_completed(&cd);
	}
	return error;
}
12450
12451 /* Resolver reports completed reqs here. */
12452 SYSCTL_PROC(_vfs_nspace, OID_AUTO, complete,
12453 CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
12454 0, 0, sysctl_nspace_complete, "-", "");
12455
12456 #endif /* CONFIG_DATALESS_FILES */
12457
12458 #if CONFIG_DATALESS_FILES
12459 #define __no_dataless_unused /* nothing */
12460 #else
12461 #define __no_dataless_unused __unused
12462 #endif
12463
/*
 * Decide whether materialization of dataless objects is prevented for
 * the given context.  Returns:
 *   0           - materialization may proceed;
 *   EDEADLK     - materialization is prevented;
 *   EJUSTRETURN - the caller is entitled to manipulate dataless
 *                 objects directly (see vfs_materialize_item() for how
 *                 this is interpreted per operation).
 * The checks below are ordered by precedence; do not reorder them.
 */
int
vfs_context_dataless_materialization_is_prevented(
	vfs_context_t const ctx __no_dataless_unused)
{
#if CONFIG_DATALESS_FILES
	proc_t const p = vfs_context_proc(ctx);
	thread_t const t = vfs_context_thread(ctx);
	uthread_t const ut = t ? get_bsdthread_info(t) : NULL;

	/*
	 * Kernel context ==> return EDEADLK, as we would with any random
	 * process decorated as no-materialize.
	 */
	if (ctx == vfs_context_kernel()) {
		return EDEADLK;
	}

	/*
	 * If the process has the dataless-manipulation entitlement,
	 * materialization is prevented, and depending on the kind
	 * of file system operation, things get to proceed as if the
	 * object is not dataless.
	 */
	if (vfs_context_is_dataless_manipulator(ctx)) {
		return EJUSTRETURN;
	}

	/*
	 * Per-thread decorations override any process-wide decorations.
	 * (Foundation uses this, and this overrides even the dataless-
	 * manipulation entitlement so as to make API contracts consistent.)
	 */
	if (ut != NULL) {
		if (ut->uu_flag & UT_NSPACE_NODATALESSFAULTS) {
			return EDEADLK;
		}
		if (ut->uu_flag & UT_NSPACE_FORCEDATALESSFAULTS) {
			return 0;
		}
	}

	/*
	 * If the process's iopolicy specifies that dataless files
	 * can be materialized, then we let it go ahead.
	 */
	if (p->p_vfs_iopolicy & P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES) {
		return 0;
	}
#endif /* CONFIG_DATALESS_FILES */

	/*
	 * The default behavior is to not materialize dataless files;
	 * return to the caller that deadlock was detected.
	 */
	return EDEADLK;
}
12520
/*
 * Allocate the hash table used to track outstanding resolver requests.
 * NOTE(review): hashinit() is tagged M_VNODE (marked XXX in the
 * original) -- presumably for lack of a dedicated malloc tag; confirm
 * before changing.
 */
void
nspace_resolver_init(void)
{
#if CONFIG_DATALESS_FILES
	nspace_resolver_request_hashtbl =
	    hashinit(NSPACE_RESOLVER_REQ_HASHSIZE,
	    M_VNODE /* XXX */, &nspace_resolver_request_hashmask);
#endif /* CONFIG_DATALESS_FILES */
}
12530
/*
 * Called when a process exits (and from nspace_resolver_set_proc_state()
 * when the resolver resigns).  If 'p' is the registered resolver, fail
 * every outstanding request with ETIMEDOUT and clear the registration
 * so a new resolver may take over.
 */
void
nspace_resolver_exited(struct proc *p __no_dataless_unused)
{
#if CONFIG_DATALESS_FILES
	struct nspace_resolver_requesthead *bucket;
	struct nspace_resolver_request *req;
	u_long idx;

	NSPACE_REQ_LOCK();

	if ((p->p_lflag & P_LNSPACE_RESOLVER) &&
	    p == nspace_resolver_proc) {
		/* Walk every hash chain, completing each request. */
		for (idx = 0; idx <= nspace_resolver_request_hashmask; idx++) {
			bucket = &nspace_resolver_request_hashtbl[idx];
			LIST_FOREACH(req, bucket, r_hashlink) {
				/*
				 * NOTE(review): the wait below may msleep()
				 * and thus drop the hash mutex mid-iteration;
				 * confirm that requests cannot be unhashed
				 * while RRF_COMPLETING is set.
				 */
				nspace_resolver_req_wait_pending_completion(req);
				nspace_resolver_req_mark_complete(req,
				    ETIMEDOUT);
			}
		}
		nspace_resolver_proc = NULL;
	}

	NSPACE_REQ_UNLOCK();
#endif /* CONFIG_DATALESS_FILES */
}
12557
12558 #define DATALESS_RESOLVER_ENTITLEMENT \
12559 "com.apple.private.vfs.dataless-resolver"
12560 #define DATALESS_MANIPULATION_ENTITLEMENT \
12561 "com.apple.private.vfs.dataless-manipulation"
12562
12563 #if CONFIG_DATALESS_FILES
12564 /*
12565 * Return TRUE if the vfs context is associated with the dataless
12566 * resolver.
12567 */
static boolean_t
vfs_context_is_dataless_resolver(vfs_context_t ctx __no_dataless_unused)
{
	/* The resolver is identified solely by its private entitlement. */
	return IOTaskHasEntitlement(vfs_context_task(ctx),
	    DATALESS_RESOLVER_ENTITLEMENT);
}
12574 #endif /* CONFIG_DATALESS_FILES */
12575
12576 /*
12577 * Return TRUE if the vfs context is associated with a process entitled
12578 * for dataless manipulation.
12579 *
12580 * XXX Arguably belongs in vfs_subr.c, but is here because of the
12581 * complication around CONFIG_DATALESS_FILES.
12582 */
12583 boolean_t
vfs_context_is_dataless_manipulator(vfs_context_t ctx __no_dataless_unused)12584 vfs_context_is_dataless_manipulator(vfs_context_t ctx __no_dataless_unused)
12585 {
12586 #if CONFIG_DATALESS_FILES
12587 task_t task = vfs_context_task(ctx);
12588 return IOTaskHasEntitlement(task, DATALESS_MANIPULATION_ENTITLEMENT) ||
12589 IOTaskHasEntitlement(task, DATALESS_RESOLVER_ENTITLEMENT);
12590 #else
12591 return false;
12592 #endif /* CONFIG_DATALESS_FILES */
12593 }
12594
12595 #if CONFIG_DATALESS_FILES
12596 static void
log_materialization_prevented(vnode_t vp,uint64_t op)12597 log_materialization_prevented(vnode_t vp, uint64_t op)
12598 {
12599 char p_name[MAXCOMLEN + 1];
12600 char *vntype;
12601 proc_selfname(&p_name[0], sizeof(p_name));
12602
12603 if (vp->v_type == VREG) {
12604 vntype = "File";
12605 } else if (vp->v_type == VDIR) {
12606 vntype = "Dir";
12607 } else if (vp->v_type == VLNK) {
12608 vntype = "SymLink";
12609 } else {
12610 vntype = "Other";
12611 }
12612
12613 #if DEVELOPMENT
12614 struct vnode_attr *vap = kalloc_type(struct vnode_attr, Z_WAITOK);
12615
12616 VATTR_INIT(vap);
12617 VATTR_WANTED(vap, va_fsid);
12618 VATTR_WANTED(vap, va_fileid);
12619 if (vnode_getattr(vp, vap, vfs_context_current()) == 0) {
12620 os_log_debug(OS_LOG_DEFAULT,
12621 "NSPACE process %s (pid %d) is decorated as no-materialization (op %lld; %s) fsid 0x%08x/%u fileid=%llu",
12622 p_name, proc_selfpid(), op, vntype,
12623 vap->va_fsid, vap->va_fsid, vap->va_fileid);
12624 } else
12625 #endif
12626 {
12627 os_log_debug(OS_LOG_DEFAULT,
12628 "NSPACE process %s (pid %d) is decorated as no-materialization (op %lld; %s)",
12629 p_name, proc_selfpid(), op, vntype);
12630 }
12631 #if DEVELOPMENT
12632 kfree_type(struct vnode_attr, vap);
12633 #endif
12634 }
12635 #endif /* CONFIG_DATALESS_FILES */
12636
12637 static int
vfs_materialize_item(vnode_t vp __no_dataless_unused,uint32_t op __no_dataless_unused,int64_t offset __no_dataless_unused,int64_t size __no_dataless_unused,char * lookup_name __no_dataless_unused,size_t const namelen __no_dataless_unused,vnode_t tdvp __no_dataless_unused)12638 vfs_materialize_item(
12639 vnode_t vp __no_dataless_unused,
12640 uint32_t op __no_dataless_unused,
12641 int64_t offset __no_dataless_unused,
12642 int64_t size __no_dataless_unused,
12643 char *lookup_name __no_dataless_unused,
12644 size_t const namelen __no_dataless_unused,
12645 vnode_t tdvp __no_dataless_unused)
12646 {
12647 #if CONFIG_DATALESS_FILES
12648 kern_return_t kern_ret;
12649 mach_port_t mach_port;
12650 char *path = NULL;
12651 vfs_context_t context;
12652 int path_len;
12653 int error;
12654 audit_token_t atoken;
12655 enum vtype vp_vtype;
12656
12657 /* Swap files are special; ignore them */
12658 if (vnode_isswap(vp)) {
12659 return 0;
12660 }
12661
12662 /*
12663 * NAMESPACE_HANDLER_SNAPSHOT_EVENT and NAMESPACE_HANDLER_TRACK_EVENT
12664 * are no longer used nor supported.
12665 */
12666 if (op & NAMESPACE_HANDLER_SNAPSHOT_EVENT) {
12667 os_log_debug(OS_LOG_DEFAULT, "NSPACE SNAPSHOT not handled");
12668 return ENOTSUP;
12669 }
12670 if (op & NAMESPACE_HANDLER_TRACK_EVENT) {
12671 os_log_debug(OS_LOG_DEFAULT, "NSPACE TRACK not handled");
12672 return ENOTSUP;
12673 }
12674
12675 /* Normalize 'op'. */
12676 op &= ~NAMESPACE_HANDLER_EVENT_TYPE_MASK;
12677
12678 /*
12679 * To-directory is only meaningful for rename operations;
12680 * ignore it if someone handed one to us unexpectedly.
12681 */
12682 if (op != NAMESPACE_HANDLER_RENAME_OP) {
12683 tdvp = NULL;
12684 }
12685
12686 context = vfs_context_current();
12687
12688 /* Remember this for later. */
12689 vp_vtype = vnode_vtype(vp);
12690
12691 error = vfs_context_dataless_materialization_is_prevented(context);
12692 if (error) {
12693 log_materialization_prevented(vp, op);
12694 goto out_check_errors;
12695 }
12696
12697 kern_ret = host_get_filecoordinationd_port(host_priv_self(),
12698 &mach_port);
12699 if (kern_ret != KERN_SUCCESS || !IPC_PORT_VALID(mach_port)) {
12700 os_log_error(OS_LOG_DEFAULT, "NSPACE no port");
12701 /*
12702 * Treat this like being unable to access the backing store
12703 * server.
12704 */
12705 return ETIMEDOUT;
12706 }
12707
12708 int path_alloc_len = MAXPATHLEN;
12709 do {
12710 path = kalloc_data(path_alloc_len, Z_WAITOK | Z_ZERO);
12711 if (path == NULL) {
12712 return ENOMEM;
12713 }
12714
12715 path_len = path_alloc_len;
12716 error = vn_getpath(vp, path, &path_len);
12717 if (error == 0) {
12718 break;
12719 } else if (error == ENOSPC) {
12720 kfree_data(path, path_alloc_len);
12721 path = NULL;
12722 } else {
12723 goto out_release_port;
12724 }
12725 } while (error == ENOSPC && (path_alloc_len += MAXPATHLEN) &&
12726 path_alloc_len <= MAXLONGPATHLEN);
12727
12728 error = vfs_context_copy_audit_token(context, &atoken);
12729 if (error) {
12730 goto out_release_port;
12731 }
12732
12733 struct nspace_resolver_request req = {
12734 .r_req_id = next_nspace_req_id(),
12735 .r_vp = vp,
12736 .r_tdvp = tdvp,
12737 };
12738
12739 error = nspace_resolver_req_add(&req);
12740 if (error) {
12741 goto out_release_port;
12742 }
12743
12744 os_log_debug(OS_LOG_DEFAULT, "NSPACE resolve_path call");
12745
12746 if (op == NAMESPACE_HANDLER_RENAME_OP && tdvp != NULL) {
12747 char *dest_path = NULL;
12748 int dest_path_len;
12749
12750 dest_path = zalloc(ZV_NAMEI);
12751 dest_path_len = MAXPATHLEN;
12752
12753 error = vn_getpath(tdvp, dest_path, &dest_path_len);
12754 if (error) {
12755 zfree(ZV_NAMEI, dest_path);
12756 goto out_release_port;
12757 }
12758
12759 /*
12760 * Force setting NAMESPACE_HANDLER_NSPACE_EVENT for
12761 * compatibility with existing agents in user-space
12762 * who get passed this value.
12763 */
12764 kern_ret = send_vfs_resolve_reparent_with_audit_token(mach_port,
12765 req.r_req_id,
12766 op | NAMESPACE_HANDLER_NSPACE_EVENT,
12767 path, dest_path, atoken);
12768
12769 zfree(ZV_NAMEI, dest_path);
12770 } else if (vp_vtype == VDIR) {
12771 char *tmpname = NULL;
12772
12773 /*
12774 * If the caller provided a lookup_name *and* a name length,
12775 * then we assume the lookup_name is not NUL-terminated.
12776 * Allocate a temporary buffer in this case to provide
12777 * a NUL-terminated path name to the IPC call.
12778 */
12779 if (lookup_name != NULL && namelen != 0) {
12780 if (namelen >= PATH_MAX) {
12781 error = EINVAL;
12782 goto out_req_remove;
12783 }
12784 tmpname = zalloc(ZV_NAMEI);
12785 strlcpy(tmpname, lookup_name, namelen + 1);
12786 lookup_name = tmpname;
12787 } else if (lookup_name != NULL) {
12788 /*
12789 * If the caller provided a lookup_name with a
12790 * zero name length, then we assume it's NUL-
12791 * terminated. Verify it has a valid length.
12792 */
12793 if (strlen(lookup_name) >= PATH_MAX) {
12794 error = EINVAL;
12795 goto out_req_remove;
12796 }
12797 }
12798
12799 /* (See above.) */
12800 kern_ret = send_vfs_resolve_dir_with_audit_token(mach_port,
12801 req.r_req_id,
12802 op | NAMESPACE_HANDLER_NSPACE_EVENT,
12803 lookup_name == NULL ? "" : lookup_name, path, atoken);
12804
12805 if (tmpname != NULL) {
12806 zfree(ZV_NAMEI, tmpname);
12807
12808 /*
12809 * Poison lookup_name rather than reference
12810 * freed memory.
12811 */
12812 lookup_name = NULL;
12813 }
12814 } else {
12815 /* (See above.) */
12816 kern_ret = send_vfs_resolve_file_with_audit_token(mach_port,
12817 req.r_req_id,
12818 op | NAMESPACE_HANDLER_NSPACE_EVENT,
12819 offset, size, path, atoken);
12820 }
12821 if (kern_ret != KERN_SUCCESS) {
12822 /*
12823 * Also treat this like being unable to access the backing
12824 * store server.
12825 */
12826 os_log_error(OS_LOG_DEFAULT, "NSPACE resolve failure: %d",
12827 kern_ret);
12828 error = ETIMEDOUT;
12829 goto out_req_remove;
12830 }
12831
12832 /*
12833 * Give back the memory we allocated earlier while we wait; we
12834 * no longer need it.
12835 */
12836 kfree_data(path, path_alloc_len);
12837 path = NULL;
12838
12839 /*
12840 * Request has been submitted to the resolver. Now (interruptibly)
12841 * wait for completion. Upon requrn, the request will have been
12842 * removed from the lookup table.
12843 */
12844 error = nspace_resolver_req_wait(&req);
12845
12846 out_release_port:
12847 if (path != NULL) {
12848 kfree_data(path, path_alloc_len);
12849 path = NULL;
12850 }
12851 ipc_port_release_send(mach_port);
12852
12853 out_check_errors:
12854 /*
12855 * The file resolver owns the logic about what error to return
12856 * to the caller. We only need to handle a couple of special
12857 * cases here:
12858 */
12859 if (error == EJUSTRETURN) {
12860 /*
12861 * The requesting process is allowed to interact with
12862 * dataless objects. Make a couple of sanity-checks
12863 * here to ensure the action makes sense.
12864 */
12865 switch (op) {
12866 case NAMESPACE_HANDLER_WRITE_OP:
12867 case NAMESPACE_HANDLER_TRUNCATE_OP:
12868 case NAMESPACE_HANDLER_RENAME_OP:
12869 /*
12870 * This handles the case of the resolver itself
12871 * writing data to the file (or throwing it
12872 * away).
12873 */
12874 error = 0;
12875 break;
12876 case NAMESPACE_HANDLER_READ_OP:
12877 case NAMESPACE_HANDLER_LOOKUP_OP:
12878 /*
12879 * This handles the case of the resolver needing
12880 * to look up inside of a dataless directory while
12881 * it's in the process of materializing it (for
12882 * example, creating files or directories).
12883 */
12884 error = (vp_vtype == VDIR) ? 0 : EBADF;
12885 break;
12886 default:
12887 error = EBADF;
12888 break;
12889 }
12890 }
12891
12892 return error;
12893
12894 out_req_remove:
12895 nspace_resolver_req_remove(&req);
12896 goto out_release_port;
12897 #else
12898 return ENOTSUP;
12899 #endif /* CONFIG_DATALESS_FILES */
12900 }
12901
12902 /*
12903 * vfs_materialize_file: Materialize a regular file.
12904 *
12905 * Inputs:
12906 * vp The dataless file to be materialized.
12907 *
12908 * op What kind of operation is being performed:
12909 * -> NAMESPACE_HANDLER_READ_OP
12910 * -> NAMESPACE_HANDLER_WRITE_OP
12911 * -> NAMESPACE_HANDLER_LINK_CREATE
12912 * -> NAMESPACE_HANDLER_DELETE_OP
12913 * -> NAMESPACE_HANDLER_TRUNCATE_OP
12914 * -> NAMESPACE_HANDLER_RENAME_OP
12915 *
12916 * offset offset of I/O for READ or WRITE. Ignored for
12917 * other ops.
12918 *
12919 * size size of I/O for READ or WRITE Ignored for
12920 * other ops.
12921 *
12922 * If offset or size are -1 for a READ or WRITE, then the resolver should
12923 * consider the range to be unknown.
12924 *
12925 * Upon successful return, the caller may proceed with the operation.
12926 * N.B. the file may still be "dataless" in this case.
12927 */
12928 int
vfs_materialize_file(struct vnode * vp,uint64_t op,int64_t offset,int64_t size)12929 vfs_materialize_file(
12930 struct vnode *vp,
12931 uint64_t op,
12932 int64_t offset,
12933 int64_t size)
12934 {
12935 if (vp->v_type != VREG) {
12936 return EFTYPE;
12937 }
12938 return vfs_materialize_item(vp, (uint32_t)op, offset, size, NULL, 0,
12939 NULL);
12940 }
12941
12942 /*
12943 * vfs_materialize_dir:
12944 *
12945 * Inputs:
12946 * vp The dataless directory to be materialized.
12947 *
12948 * op What kind of operation is being performed:
12949 * -> NAMESPACE_HANDLER_READ_OP
12950 * -> NAMESPACE_HANDLER_WRITE_OP
12951 * -> NAMESPACE_HANDLER_DELETE_OP
12952 * -> NAMESPACE_HANDLER_RENAME_OP
12953 * -> NAMESPACE_HANDLER_LOOKUP_OP
12954 *
12955 * lookup_name Name being looked up for a LOOKUP op. Ignored for
12956 * other ops. May or may not be NUL-terminated; see below.
12957 *
12958 * namelen If non-zero, then lookup_name is assumed to not be NUL-
12959 * terminated and namelen is the number of valid bytes in
12960 * lookup_name. If zero, then lookup_name is assumed to be
12961 * NUL-terminated.
12962 *
12963 * Upon successful return, the caller may proceed with the operation.
12964 * N.B. the directory may still be "dataless" in this case.
12965 */
12966 int
vfs_materialize_dir(struct vnode * vp,uint64_t op,char * lookup_name,size_t namelen)12967 vfs_materialize_dir(
12968 struct vnode *vp,
12969 uint64_t op,
12970 char *lookup_name,
12971 size_t namelen)
12972 {
12973 if (vp->v_type != VDIR) {
12974 return EFTYPE;
12975 }
12976 if (op == NAMESPACE_HANDLER_LOOKUP_OP && lookup_name == NULL) {
12977 return EINVAL;
12978 }
12979 return vfs_materialize_item(vp, (uint32_t)op, 0, 0, lookup_name,
12980 namelen, NULL);
12981 }
12982
12983 /*
12984 * vfs_materialize_reparent:
12985 *
12986 * Inputs:
12987 * vp The dataless file or directory to be materialized.
12988 *
12989 * tdvp The new parent directory for the dataless file.
12990 *
12991 * Upon successful return, the caller may proceed with the operation.
12992 * N.B. the item may still be "dataless" in this case.
12993 */
12994 int
vfs_materialize_reparent(vnode_t vp,vnode_t tdvp)12995 vfs_materialize_reparent(vnode_t vp, vnode_t tdvp)
12996 {
12997 if (vp->v_type != VDIR && vp->v_type != VREG) {
12998 return EFTYPE;
12999 }
13000 return vfs_materialize_item(vp, NAMESPACE_HANDLER_RENAME_OP,
13001 0, 0, NULL, 0, tdvp);
13002 }
13003
#if 0
/*
 * Compiled out: build a /.vol/<fsid>/<fileid> volfs-style path for vp.
 *
 * On entry *len is the capacity of path; on return it holds the number
 * of bytes written including the NUL terminator.  Returns 0 on success,
 * -1 if vnode_getattr() failed (in which case a placeholder path is
 * written instead).
 */
static int
build_volfs_path(struct vnode *vp, char *path, int *len)
{
	struct vnode_attr va;
	int ret;

	/* Ask the filesystem for the fsid/fileid pair the path encodes. */
	VATTR_INIT(&va);
	VATTR_WANTED(&va, va_fsid);
	VATTR_WANTED(&va, va_fileid);

	if (vnode_getattr(vp, &va, vfs_context_kernel()) != 0) {
		*len = snprintf(path, *len, "/non/existent/path/because/vnode_getattr/failed") + 1;
		ret = -1;
	} else {
		*len = snprintf(path, *len, "/.vol/%d/%lld", (dev_t)va.va_fsid, va.va_fileid) + 1;
		ret = 0;
	}

	return ret;
}
#endif
13026
13027 static unsigned long
fsctl_bogus_command_compat(unsigned long cmd)13028 fsctl_bogus_command_compat(unsigned long cmd)
13029 {
13030 switch (cmd) {
13031 case IOCBASECMD(FSIOC_SYNC_VOLUME):
13032 return FSIOC_SYNC_VOLUME;
13033 case IOCBASECMD(FSIOC_ROUTEFS_SETROUTEID):
13034 return FSIOC_ROUTEFS_SETROUTEID;
13035 case IOCBASECMD(FSIOC_SET_PACKAGE_EXTS):
13036 return FSIOC_SET_PACKAGE_EXTS;
13037 case IOCBASECMD(FSIOC_SET_FSTYPENAME_OVERRIDE):
13038 return FSIOC_SET_FSTYPENAME_OVERRIDE;
13039 case IOCBASECMD(DISK_CONDITIONER_IOC_GET):
13040 return DISK_CONDITIONER_IOC_GET;
13041 case IOCBASECMD(DISK_CONDITIONER_IOC_SET):
13042 return DISK_CONDITIONER_IOC_SET;
13043 case IOCBASECMD(FSIOC_FIOSEEKHOLE):
13044 return FSIOC_FIOSEEKHOLE;
13045 case IOCBASECMD(FSIOC_FIOSEEKDATA):
13046 return FSIOC_FIOSEEKDATA;
13047 case IOCBASECMD(SPOTLIGHT_IOC_GET_MOUNT_TIME):
13048 return SPOTLIGHT_IOC_GET_MOUNT_TIME;
13049 case IOCBASECMD(SPOTLIGHT_IOC_GET_LAST_MTIME):
13050 return SPOTLIGHT_IOC_GET_LAST_MTIME;
13051 }
13052
13053 return cmd;
13054 }
13055
/*
 * chflags0() callback: hand the fsioc_cas_bsdflags argument (arg) down to
 * the filesystem through its ioctl entry point so it can perform the
 * compare-and-swap of the BSD flag word.
 */
static int
cas_bsdflags_setattr(vnode_t vp, void *arg, vfs_context_t ctx)
{
	return VNOP_IOCTL(vp, FSIOC_CAS_BSDFLAGS, arg, FWRITE, ctx);
}
13061
13062 static int __attribute__((noinline))
handle_sync_volume(vnode_t vp,vnode_t * arg_vp,caddr_t data,vfs_context_t ctx)13063 handle_sync_volume(vnode_t vp, vnode_t *arg_vp, caddr_t data, vfs_context_t ctx)
13064 {
13065 struct vfs_attr vfa;
13066 mount_t mp = vp->v_mount;
13067 unsigned arg;
13068 int error;
13069
13070 /* record vid of vp so we can drop it below. */
13071 uint32_t vvid = vp->v_id;
13072
13073 /*
13074 * Then grab mount_iterref so that we can release the vnode.
13075 * Without this, a thread may call vnode_iterate_prepare then
13076 * get into a deadlock because we've never released the root vp
13077 */
13078 error = mount_iterref(mp, 0);
13079 if (error) {
13080 return error;
13081 }
13082 vnode_hold(vp);
13083 vnode_put(vp);
13084
13085 arg = MNT_NOWAIT;
13086 if (*(uint32_t*)data & FSCTL_SYNC_WAIT) {
13087 arg = MNT_WAIT;
13088 }
13089
13090 /*
13091 * If the filessytem supports multiple filesytems in a
13092 * partition (For eg APFS volumes in a container, it knows
13093 * that the waitfor argument to VFS_SYNC are flags.
13094 */
13095 VFSATTR_INIT(&vfa);
13096 VFSATTR_WANTED(&vfa, f_capabilities);
13097 if ((vfs_getattr(mp, &vfa, vfs_context_current()) == 0) &&
13098 VFSATTR_IS_SUPPORTED(&vfa, f_capabilities) &&
13099 ((vfa.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_SHARED_SPACE)) &&
13100 ((vfa.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_SHARED_SPACE))) {
13101 arg |= MNT_VOLUME;
13102 }
13103
13104 /* issue the sync for this volume */
13105 (void)sync_callback(mp, &arg);
13106
13107 /*
13108 * Then release the mount_iterref once we're done syncing; it's not
13109 * needed for the VNOP_IOCTL below
13110 */
13111 mount_iterdrop(mp);
13112
13113 if (arg & FSCTL_SYNC_FULLSYNC) {
13114 /* re-obtain vnode iocount on the root vp, if possible */
13115 error = vnode_getwithvid(vp, vvid);
13116 if (error == 0) {
13117 error = VNOP_IOCTL(vp, F_FULLFSYNC, (caddr_t)NULL, 0, ctx);
13118 vnode_put(vp);
13119 }
13120 }
13121 vnode_drop(vp);
13122 /* mark the argument VP as having been released */
13123 *arg_vp = NULL;
13124 return error;
13125 }
13126
#if ROUTEFS
/*
 * FSIOC_ROUTEFS_SETROUTEID handler: copy the route path in from user
 * space and mount routefs there.  Superuser only.
 */
static int __attribute__((noinline))
handle_routes(user_addr_t udata)
{
	char routepath[MAXPATHLEN];
	size_t pathlen = 0;
	int error;

	error = suser(kauth_cred_get(), &(current_proc()->p_acflag));
	if (error != 0) {
		return error;
	}

	bzero(routepath, MAXPATHLEN);
	error = copyinstr(udata, &routepath[0], MAXPATHLEN, &pathlen);
	if (error != 0) {
		return error;
	}

	return routefs_kernel_mount(routepath);
}
#endif
13147
13148 static int __attribute__((noinline))
handle_flags(vnode_t vp,caddr_t data,vfs_context_t ctx)13149 handle_flags(vnode_t vp, caddr_t data, vfs_context_t ctx)
13150 {
13151 struct fsioc_cas_bsdflags *cas = (struct fsioc_cas_bsdflags *)data;
13152 struct vnode_attr va;
13153 int error;
13154
13155 VATTR_INIT(&va);
13156 VATTR_SET(&va, va_flags, cas->new_flags);
13157
13158 error = chflags0(vp, &va, cas_bsdflags_setattr, cas, ctx);
13159
13160 #if CONFIG_FSE
13161 if (error == 0 && cas->expected_flags == cas->actual_flags && need_fsevent(FSE_STAT_CHANGED, vp)) {
13162 add_fsevent(FSE_STAT_CHANGED, ctx, FSE_ARG_VNODE, vp, FSE_ARG_DONE);
13163 }
13164 #endif
13165
13166 return error;
13167 }
13168
13169 static int __attribute__((noinline))
handle_auth(vnode_t vp,u_long cmd,caddr_t data,u_long options,vfs_context_t ctx)13170 handle_auth(vnode_t vp, u_long cmd, caddr_t data, u_long options, vfs_context_t ctx)
13171 {
13172 struct mount *mp = NULL;
13173 errno_t rootauth = 0;
13174
13175 mp = vp->v_mount;
13176
13177 /*
13178 * query the underlying FS and see if it reports something
13179 * sane for this vnode. If volume is authenticated via
13180 * chunklist, leave that for the caller to determine.
13181 */
13182 rootauth = VNOP_IOCTL(vp, cmd, data, (int)options, ctx);
13183
13184 return rootauth;
13185 }
13186
13187 #define SET_PACKAGE_EXTENSION_ENTITLEMENT \
13188 "com.apple.private.kernel.set-package-extensions"
13189
13190 /*
13191 * Make a filesystem-specific control call:
13192 */
13193 /* ARGSUSED */
/*
 * Common implementation for fsctl() and ffsctl(): marshal the ioctl-style
 * argument in or out of the kernel, dispatch generic FSIOC_* selectors to
 * their handlers, and fall through to the filesystem's VNOP_IOCTL for
 * everything else.
 *
 * On entry *arg_vp holds a vnode with an iocount held by the caller.  A
 * handler may consume that iocount (see handle_sync_volume()); it then
 * sets *arg_vp to NULL so the caller knows not to vnode_put() again.
 */
static int
fsctl_internal(proc_t p, vnode_t *arg_vp, u_long cmd, user_addr_t udata, u_long options, vfs_context_t ctx)
{
	int error = 0;
	boolean_t is64bit;
	u_int size;
#define STK_PARAMS 128
	char stkbuf[STK_PARAMS] = {0};
	caddr_t data, memp;
	vnode_t vp = *arg_vp;

	/* Device special files take the ioctl path, not fsctl. */
	if (vp->v_type == VCHR || vp->v_type == VBLK) {
		return ENOTTY;
	}

	/* Rewrite legacy base-command selectors to their full spellings. */
	cmd = fsctl_bogus_command_compat(cmd);

	size = IOCPARM_LEN(cmd);
	if (size > IOCPARM_MAX) {
		return EINVAL;
	}

	is64bit = proc_is64bit(p);

	memp = NULL;

	/* Large arguments go to the heap; small ones use the stack buffer. */
	if (size > sizeof(stkbuf)) {
		if ((memp = (caddr_t)kalloc_data(size, Z_WAITOK)) == 0) {
			return ENOMEM;
		}
		data = memp;
	} else {
		data = &stkbuf[0];
	};

	/*
	 * Marshal the argument: copy in for IOC_IN with a payload, otherwise
	 * stash the raw udata word; zero the buffer for IOC_OUT so the user
	 * always gets back something deterministic.
	 */
	if (cmd & IOC_IN) {
		if (size) {
			error = copyin(udata, data, size);
			if (error) {
				if (memp) {
					kfree_data(memp, size);
				}
				return error;
			}
		} else {
			if (is64bit) {
				*(user_addr_t *)data = udata;
			} else {
				*(uint32_t *)data = (uint32_t)udata;
			}
		};
	} else if ((cmd & IOC_OUT) && size) {
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		bzero(data, size);
	} else if (cmd & IOC_VOID) {
		if (is64bit) {
			*(user_addr_t *)data = udata;
		} else {
			*(uint32_t *)data = (uint32_t)udata;
		}
	}

	/* Check to see if it's a generic command */
	switch (cmd) {
	case FSIOC_SYNC_VOLUME:
		/* NOTE: may consume the iocount and NULL out *arg_vp. */
		error = handle_sync_volume(vp, arg_vp, data, ctx);
		break;

	case FSIOC_ROUTEFS_SETROUTEID:
#if ROUTEFS
		error = handle_routes(udata);
#endif
		break;

	case FSIOC_SET_PACKAGE_EXTS: {
		user_addr_t ext_strings;
		uint32_t num_entries;
		uint32_t max_width;

		if (!IOTaskHasEntitlement(vfs_context_task(ctx),
		    SET_PACKAGE_EXTENSION_ENTITLEMENT)) {
			error = EPERM;
			break;
		}

		if ((is64bit && size != sizeof(user64_package_ext_info))
		    || (is64bit == 0 && size != sizeof(user32_package_ext_info))) {
			// either you're 64-bit and passed a 64-bit struct or
			// you're 32-bit and passed a 32-bit struct. otherwise
			// it's not ok.
			error = EINVAL;
			break;
		}

		if (is64bit) {
			if (sizeof(user64_addr_t) > sizeof(user_addr_t)) {
				assert(((user64_package_ext_info *)data)->strings <= UINT32_MAX);
			}
			ext_strings = (user_addr_t)((user64_package_ext_info *)data)->strings;
			num_entries = ((user64_package_ext_info *)data)->num_entries;
			max_width = ((user64_package_ext_info *)data)->max_width;
		} else {
			ext_strings = CAST_USER_ADDR_T(((user32_package_ext_info *)data)->strings);
			num_entries = ((user32_package_ext_info *)data)->num_entries;
			max_width = ((user32_package_ext_info *)data)->max_width;
		}
		error = set_package_extensions_table(ext_strings, num_entries, max_width);
	}
	break;

	case FSIOC_SET_FSTYPENAME_OVERRIDE:
	{
		mount_t mp;

		if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
			break;
		}
		if ((mp = vp->v_mount) != NULL) {
			mount_lock(mp);
			if (data[0] != 0) {
				/* Reject a name with no NUL in its first MFSTYPENAMELEN bytes. */
				for (int i = 0; i < MFSTYPENAMELEN; i++) {
					if (!data[i]) {
						goto continue_copy;
					}
				}
				/*
				 * Getting here means we have a user data
				 * string which has no NULL termination in
				 * its first MFSTYPENAMELEN bytes. This is
				 * bogus, let's avoid strlcpy-ing the read
				 * data and return an error.
				 */
				error = EINVAL;
				goto unlock;
continue_copy:
				vfs_setfstypename_locked(mp, data);
				/* "mtmfs" overrides additionally toggle security kern flags. */
				if (vfs_isrdonly(mp) &&
				    strcmp(data, "mtmfs") == 0) {
					mp->mnt_kern_flag |=
					    MNTK_EXTENDED_SECURITY;
					mp->mnt_kern_flag &=
					    ~MNTK_AUTH_OPAQUE;
				}
			} else if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
				/* Empty name: clear any existing override. */
				const char *name =
				    vfs_getfstypenameref_locked(mp, NULL);
				if (strcmp(name, "mtmfs") == 0) {
					mp->mnt_kern_flag &=
					    ~MNTK_EXTENDED_SECURITY;
				}
				vfs_setfstypename_locked(mp, NULL);
			}
unlock:
			mount_unlock(mp);
		}
	}
	break;

	case DISK_CONDITIONER_IOC_GET: {
		error = disk_conditioner_get_info(vp->v_mount, (disk_conditioner_info *)data);
	}
	break;

	case DISK_CONDITIONER_IOC_SET: {
		error = disk_conditioner_set_info(vp->v_mount, (disk_conditioner_info *)data);
	}
	break;

	case FSIOC_CAS_BSDFLAGS:
		error = handle_flags(vp, data, ctx);
		break;

	case FSIOC_FD_ONLY_OPEN_ONCE: {
		/* Succeed only if this vnode has exactly one use reference. */
		error = 0;
		if (vnode_usecount(vp) > 1) {
			vnode_lock_spin(vp);
			if (vp->v_lflag & VL_HASSTREAMS) {
				if (vnode_isinuse_locked(vp, 1, 1)) {
					error = EBUSY;
				}
			} else if (vnode_usecount(vp) > 1) {
				error = EBUSY;
			}
			vnode_unlock(vp);
		}
	}
	break;

	case FSIOC_EVAL_ROOTAUTH:
		error = handle_auth(vp, cmd, data, options, ctx);
		break;

	case FSIOC_TEST_FSE_ACCESS_GRANTED:
		error = test_fse_access_granted(vp, (unsigned long)udata, ctx);
		break;

#if CONFIG_EXCLAVES
	case FSIOC_EXCLAVE_FS_REGISTER:
		if (IOTaskHasEntitlement(vfs_context_task(ctx), EXCLAVE_FS_REGISTER_ENTITLEMENT)) {
			error = vfs_exclave_fs_register(((fsioc_exclave_fs_register_t *)data)->fs_tag, vp);
		} else {
			error = EPERM;
		}
		break;

	case FSIOC_EXCLAVE_FS_UNREGISTER:
		if (IOTaskHasEntitlement(vfs_context_task(ctx), EXCLAVE_FS_REGISTER_ENTITLEMENT)) {
			error = vfs_exclave_fs_unregister(vp);
		} else {
			error = EPERM;
		}
		break;

	case FSIOC_EXCLAVE_FS_GET_BASE_DIRS: {
		exclave_fs_get_base_dirs_t *get_base_dirs = ((exclave_fs_get_base_dirs_t *)data);
		exclave_fs_base_dir_t *dirs = NULL;
		if (!IOTaskHasEntitlement(vfs_context_task(ctx), EXCLAVE_FS_REGISTER_ENTITLEMENT) &&
		    !IOTaskHasEntitlement(vfs_context_task(ctx), EXCLAVE_FS_LIST_ENTITLEMENT)) {
			error = EPERM;
			break;
		}
		/* A NULL base_dirs pointer means "just report the count". */
		if (get_base_dirs->base_dirs) {
			if ((get_base_dirs->count == 0) || (get_base_dirs->count > EXCLAVE_FS_GET_BASE_DIRS_MAX_COUNT)) {
				error = EINVAL;
				break;
			}
			dirs = kalloc_type(exclave_fs_base_dir_t, get_base_dirs->count, Z_WAITOK | Z_ZERO);
			if (!dirs) {
				error = ENOSPC;
				break;
			}
		}
		error = vfs_exclave_fs_get_base_dirs(dirs, &get_base_dirs->count);
		if (!error && dirs) {
			error = copyout(dirs, (user_addr_t)get_base_dirs->base_dirs,
			    get_base_dirs->count * sizeof(exclave_fs_base_dir_t));
		}
		if (dirs) {
			kfree_type(exclave_fs_base_dir_t, get_base_dirs->count, dirs);
		}
	}
	break;
#endif

	default: {
		/*
		 * Other, known commands shouldn't be passed down here.
		 * (When adding a selector to this list, it may be prudent
		 * to consider adding it to the list in sys_fcntl_nocancel() as well.)
		 */
		switch (cmd) {
		case F_PUNCHHOLE:
		case F_TRIM_ACTIVE_FILE:
		case F_RDADVISE:
		case F_TRANSCODEKEY:
		case F_GETPROTECTIONLEVEL:
		case F_GETDEFAULTPROTLEVEL:
		case F_MAKECOMPRESSED:
		case F_SET_GREEDY_MODE:
		case F_SETSTATICCONTENT:
		case F_SETIOTYPE:
		case F_SETBACKINGSTORE:
		case F_GETPATH_MTMINFO:
		case APFSIOC_REVERT_TO_SNAPSHOT:
		case FSIOC_FIOSEEKHOLE:
		case FSIOC_FIOSEEKDATA:
		case HFS_GET_BOOT_INFO:
		case HFS_SET_BOOT_INFO:
		case FIOPINSWAP:
		case F_CHKCLEAN:
		case F_FULLFSYNC:
		case F_BARRIERFSYNC:
		case F_FREEZE_FS:
		case F_THAW_FS:
		case FSIOC_KERNEL_ROOTAUTH:
		case FSIOC_GRAFT_FS:
		case FSIOC_UNGRAFT_FS:
		case FSIOC_AUTH_FS:
		case F_SPECULATIVE_READ:
		case F_ATTRIBUTION_TAG:
		case F_TRANSFEREXTENTS:
		case F_ASSERT_BG_ACCESS:
		case F_RELEASE_BG_ACCESS:
			error = EINVAL;
			goto outdrop;
		}
		/* Invoke the filesystem-specific code */
		error = VNOP_IOCTL(vp, cmd, data, (int)options, ctx);
	}
	} /* end switch stmt */

	/*
	 * if no errors, copy any data to user. Size was
	 * already set and checked above.
	 */
	if (error == 0 && (cmd & IOC_OUT) && size) {
		error = copyout(data, udata, size);
	}

outdrop:
	if (memp) {
		kfree_data(memp, size);
	}

	return error;
}
13503
13504 /* ARGSUSED */
13505 int
fsctl(proc_t p,struct fsctl_args * uap,__unused int32_t * retval)13506 fsctl(proc_t p, struct fsctl_args *uap, __unused int32_t *retval)
13507 {
13508 int error;
13509 struct nameidata nd;
13510 uint32_t nameiflags;
13511 vnode_t vp = NULL;
13512 vfs_context_t ctx = vfs_context_current();
13513
13514 AUDIT_ARG(cmd, (int)uap->cmd);
13515 AUDIT_ARG(value32, uap->options);
13516 /* Get the vnode for the file we are getting info on: */
13517 nameiflags = 0;
13518 //
13519 // if we come through fsctl() then the file is by definition not open.
13520 // therefore for the FSIOC_FD_ONLY_OPEN_ONCE selector we return an error
13521 // lest the caller mistakenly thinks the only open is their own (but in
13522 // reality it's someone elses).
13523 //
13524 if (uap->cmd == FSIOC_FD_ONLY_OPEN_ONCE) {
13525 return EINVAL;
13526 }
13527 if ((uap->options & FSOPT_NOFOLLOW) == 0) {
13528 nameiflags |= FOLLOW;
13529 }
13530 if (uap->cmd == FSIOC_FIRMLINK_CTL) {
13531 nameiflags |= (CN_FIRMLINK_NOFOLLOW | NOCACHE);
13532 }
13533 NDINIT(&nd, LOOKUP, OP_FSCTL, nameiflags | AUDITVNPATH1,
13534 UIO_USERSPACE, uap->path, ctx);
13535 if ((error = namei(&nd))) {
13536 goto done;
13537 }
13538 vp = nd.ni_vp;
13539 nameidone(&nd);
13540
13541 #if CONFIG_MACF
13542 error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd);
13543 if (error) {
13544 goto done;
13545 }
13546 #endif
13547
13548 error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);
13549
13550 done:
13551 if (vp) {
13552 vnode_put(vp);
13553 }
13554 return error;
13555 }
13556 /* ARGSUSED */
13557 int
ffsctl(proc_t p,struct ffsctl_args * uap,__unused int32_t * retval)13558 ffsctl(proc_t p, struct ffsctl_args *uap, __unused int32_t *retval)
13559 {
13560 int error;
13561 vnode_t vp = NULL;
13562 vfs_context_t ctx = vfs_context_current();
13563 int fd = -1;
13564
13565 AUDIT_ARG(fd, uap->fd);
13566 AUDIT_ARG(cmd, (int)uap->cmd);
13567 AUDIT_ARG(value32, uap->options);
13568
13569 /* Get the vnode for the file we are getting info on: */
13570 if ((error = file_vnode(uap->fd, &vp))) {
13571 return error;
13572 }
13573 fd = uap->fd;
13574 if ((error = vnode_getwithref(vp))) {
13575 file_drop(fd);
13576 return error;
13577 }
13578
13579 #if CONFIG_MACF
13580 if ((error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd))) {
13581 file_drop(fd);
13582 vnode_put(vp);
13583 return error;
13584 }
13585 #endif
13586
13587 error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);
13588
13589 file_drop(fd);
13590
13591 /*validate vp; fsctl_internal() can drop iocount and reset vp to NULL*/
13592 if (vp) {
13593 vnode_put(vp);
13594 }
13595
13596 return error;
13597 }
13598 /* end of fsctl system call */
13599
13600 #define FILESEC_ACCESS_ENTITLEMENT \
13601 "com.apple.private.vfs.filesec-access"
13602
13603 static int
xattr_entitlement_check(const char * attrname,vfs_context_t ctx,bool setting)13604 xattr_entitlement_check(const char *attrname, vfs_context_t ctx, bool setting)
13605 {
13606 if (strcmp(attrname, KAUTH_FILESEC_XATTR) == 0) {
13607 /*
13608 * get: root and tasks with FILESEC_ACCESS_ENTITLEMENT.
13609 * set: only tasks with FILESEC_ACCESS_ENTITLEMENT.
13610 */
13611 if ((!setting && vfs_context_issuser(ctx)) ||
13612 IOTaskHasEntitlement(vfs_context_task(ctx),
13613 FILESEC_ACCESS_ENTITLEMENT)) {
13614 return 0;
13615 }
13616 }
13617
13618 return EPERM;
13619 }
13620
13621 /*
13622 * Retrieve the data of an extended attribute.
13623 */
int
getxattr(proc_t p, struct getxattr_args *uap, user_ssize_t *retval)
{
	vnode_t vp;
	struct nameidata nd;
	char attrname[XATTR_MAXNAMELEN + 1];
	vfs_context_t ctx = vfs_context_current();
	uio_t auio = NULL;
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	size_t attrsize = 0;
	size_t namelen;
	u_int32_t nameiflags;
	int error;
	UIO_STACKBUF(uio_buf, 1);

	/* These options are internal-only; reject them from user space. */
	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	/* Resolve uap->path, honoring the caller's symlink-handling options. */
	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_GETXATTR, nameiflags, spacetype, uap->path, ctx);
	if (uap->options & XATTR_NOFOLLOW_ANY) {
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}
	if (uap->options & XATTR_RESOLVE_BENEATH) {
		nd.ni_flag |= NAMEI_RESOLVE_BENEATH;
	}

	if ((error = namei(&nd))) {
		return error;
	}
	vp = nd.ni_vp;
	nameidone(&nd);

	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
	if (error != 0) {
		goto out;
	}
	/* Protected attributes require the filesec entitlement (or root). */
	if (xattr_protected(attrname) &&
	    (error = xattr_entitlement_check(attrname, ctx, false)) != 0) {
		goto out;
	}
	/*
	 * the specific check for 0xffffffff is a hack to preserve
	 * binary compatibility in K64 with applications that discovered
	 * that passing in a buf pointer and a size of -1 resulted in
	 * just the size of the indicated extended attribute being returned.
	 * this isn't part of the documented behavior, but because of the
	 * original implementation's check for "uap->size > 0", this behavior
	 * was allowed. In K32 that check turned into a signed comparison
	 * even though uap->size is unsigned... in K64, we blow by that
	 * check because uap->size is unsigned and doesn't get sign smeared
	 * in the munger for a 32 bit user app. we also need to add a
	 * check to limit the maximum size of the buffer being passed in...
	 * unfortunately, the underlying filesystems seem to just malloc
	 * the requested size even if the actual extended attribute is tiny.
	 * because that malloc is for kernel wired memory, we have to put a
	 * sane limit on it.
	 *
	 * U32 running on K64 will yield 0x00000000ffffffff for uap->size
	 * U64 running on K64 will yield -1 (64 bits wide)
	 * U32/U64 running on K32 will yield -1 (32 bits wide)
	 */
	if (uap->size == 0xffffffff || uap->size == (size_t)-1) {
		goto no_uio;
	}

	/* Build a uio for the caller's buffer (clamped to XATTR_MAXSIZE). */
	if (uap->value) {
		if (uap->size > (size_t)XATTR_MAXSIZE) {
			uap->size = XATTR_MAXSIZE;
		}

		auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
		    &uio_buf[0], sizeof(uio_buf));
		uio_addiov(auio, uap->value, uap->size);
	}
no_uio:
	/* With auio == NULL, vn_getxattr() only reports the attribute size. */
	error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, ctx);
out:
	vnode_put(vp);

	/* With a buffer: bytes copied; without: the attribute's size. */
	if (auio) {
		*retval = uap->size - uio_resid(auio);
	} else {
		*retval = (user_ssize_t)attrsize;
	}

	return error;
}
13713
13714 /*
13715 * Retrieve the data of an extended attribute.
13716 */
13717 int
fgetxattr(proc_t p,struct fgetxattr_args * uap,user_ssize_t * retval)13718 fgetxattr(proc_t p, struct fgetxattr_args *uap, user_ssize_t *retval)
13719 {
13720 vnode_t vp;
13721 char attrname[XATTR_MAXNAMELEN + 1];
13722 vfs_context_t ctx = vfs_context_current();
13723 uio_t auio = NULL;
13724 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
13725 size_t attrsize = 0;
13726 size_t namelen;
13727 int error;
13728 UIO_STACKBUF(uio_buf, 1);
13729
13730 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT |
13731 XATTR_NOFOLLOW_ANY | XATTR_RESOLVE_BENEATH)) {
13732 return EINVAL;
13733 }
13734
13735 if ((error = file_vnode(uap->fd, &vp))) {
13736 return error;
13737 }
13738 if ((error = vnode_getwithref(vp))) {
13739 file_drop(uap->fd);
13740 return error;
13741 }
13742 error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
13743 if (error != 0) {
13744 goto out;
13745 }
13746 if (xattr_protected(attrname) &&
13747 (error = xattr_entitlement_check(attrname, ctx, false)) != 0) {
13748 goto out;
13749 }
13750 if (uap->value && uap->size > 0) {
13751 if (uap->size > (size_t)XATTR_MAXSIZE) {
13752 uap->size = XATTR_MAXSIZE;
13753 }
13754
13755 auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
13756 &uio_buf[0], sizeof(uio_buf));
13757 uio_addiov(auio, uap->value, uap->size);
13758 }
13759
13760 error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, vfs_context_current());
13761 out:
13762 (void)vnode_put(vp);
13763 file_drop(uap->fd);
13764
13765 if (auio) {
13766 *retval = uap->size - uio_resid(auio);
13767 } else {
13768 *retval = (user_ssize_t)attrsize;
13769 }
13770 return error;
13771 }
13772
/*
 * Lookup state bundle used by setxattr(): the nameidata, attribute name,
 * and uio backing buffer together are large, so setxattr() allocates this
 * with kalloc_type rather than placing it on the kernel stack.
 * (The previous comment said "checkdirs iteration" — a copy-paste error.)
 */
struct setxattr_ctx {
	struct nameidata nd;
	char attrname[XATTR_MAXNAMELEN + 1];
	UIO_STACKBUF(uio_buf, 1);
};
13779
13780 /*
13781 * Set the data of an extended attribute.
13782 */
13783 int
setxattr(proc_t p,struct setxattr_args * uap,int * retval)13784 setxattr(proc_t p, struct setxattr_args *uap, int *retval)
13785 {
13786 vnode_t vp;
13787 vfs_context_t ctx = vfs_context_current();
13788 uio_t auio = NULL;
13789 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
13790 size_t namelen;
13791 u_int32_t nameiflags;
13792 int error;
13793 struct setxattr_ctx *sactx;
13794
13795 if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
13796 return EINVAL;
13797 }
13798
13799 sactx = kalloc_type(struct setxattr_ctx, Z_WAITOK);
13800 if (sactx == NULL) {
13801 return ENOMEM;
13802 }
13803
13804 error = copyinstr(uap->attrname, sactx->attrname, sizeof(sactx->attrname), &namelen);
13805 if (error != 0) {
13806 if (error == EPERM) {
13807 /* if the string won't fit in attrname, copyinstr emits EPERM */
13808 error = ENAMETOOLONG;
13809 }
13810 /* Otherwise return the default error from copyinstr to detect ERANGE, etc */
13811 goto out;
13812 }
13813 if (xattr_protected(sactx->attrname) &&
13814 (error = xattr_entitlement_check(sactx->attrname, ctx, true)) != 0) {
13815 goto out;
13816 }
13817 if (uap->size != 0 && uap->value == 0) {
13818 error = EINVAL;
13819 goto out;
13820 }
13821 if (uap->size > INT_MAX) {
13822 error = E2BIG;
13823 goto out;
13824 }
13825
13826 nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
13827 #if CONFIG_FILE_LEASES
13828 nameiflags |= WANTPARENT;
13829 #endif
13830 NDINIT(&sactx->nd, LOOKUP, OP_SETXATTR, nameiflags, spacetype, uap->path, ctx);
13831 if (uap->options & XATTR_NOFOLLOW_ANY) {
13832 sactx->nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
13833 }
13834 if (uap->options & XATTR_RESOLVE_BENEATH) {
13835 sactx->nd.ni_flag |= NAMEI_RESOLVE_BENEATH;
13836 }
13837
13838 if ((error = namei(&sactx->nd))) {
13839 goto out;
13840 }
13841 vp = sactx->nd.ni_vp;
13842 #if CONFIG_FILE_LEASES
13843 vnode_breakdirlease(sactx->nd.ni_dvp, false, O_WRONLY);
13844 vnode_put(sactx->nd.ni_dvp);
13845 #endif
13846 nameidone(&sactx->nd);
13847
13848 auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
13849 &sactx->uio_buf[0], sizeof(sactx->uio_buf));
13850 uio_addiov(auio, uap->value, uap->size);
13851
13852 error = vn_setxattr(vp, sactx->attrname, auio, uap->options, ctx);
13853 #if CONFIG_FSE
13854 if (error == 0) {
13855 add_fsevent(FSE_XATTR_MODIFIED, ctx,
13856 FSE_ARG_VNODE, vp,
13857 FSE_ARG_DONE);
13858 }
13859 #endif
13860 vnode_put(vp);
13861 out:
13862 kfree_type(struct setxattr_ctx, sactx);
13863 *retval = 0;
13864 return error;
13865 }
13866
13867 /*
13868 * Set the data of an extended attribute.
13869 */
13870 int
fsetxattr(proc_t p,struct fsetxattr_args * uap,int * retval)13871 fsetxattr(proc_t p, struct fsetxattr_args *uap, int *retval)
13872 {
13873 vnode_t vp;
13874 char attrname[XATTR_MAXNAMELEN + 1];
13875 vfs_context_t ctx = vfs_context_current();
13876 uio_t auio = NULL;
13877 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
13878 size_t namelen;
13879 int error;
13880 UIO_STACKBUF(uio_buf, 1);
13881
13882 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT |
13883 XATTR_NOFOLLOW_ANY | XATTR_RESOLVE_BENEATH)) {
13884 return EINVAL;
13885 }
13886
13887 error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
13888 if (error != 0) {
13889 if (error == EPERM) {
13890 /* if the string won't fit in attrname, copyinstr emits EPERM */
13891 return ENAMETOOLONG;
13892 }
13893 /* Otherwise return the default error from copyinstr to detect ERANGE, etc */
13894 return error;
13895 }
13896 if (xattr_protected(attrname) &&
13897 (error = xattr_entitlement_check(attrname, ctx, true)) != 0) {
13898 return error;
13899 }
13900 if (uap->size != 0 && uap->value == 0) {
13901 return EINVAL;
13902 }
13903 if (uap->size > INT_MAX) {
13904 return E2BIG;
13905 }
13906 if ((error = file_vnode(uap->fd, &vp))) {
13907 return error;
13908 }
13909 if ((error = vnode_getwithref(vp))) {
13910 file_drop(uap->fd);
13911 return error;
13912 }
13913
13914 #if CONFIG_FILE_LEASES
13915 vnode_breakdirlease(vp, true, O_WRONLY);
13916 #endif
13917
13918 auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
13919 &uio_buf[0], sizeof(uio_buf));
13920 uio_addiov(auio, uap->value, uap->size);
13921
13922 error = vn_setxattr(vp, attrname, auio, uap->options, vfs_context_current());
13923 #if CONFIG_FSE
13924 if (error == 0) {
13925 add_fsevent(FSE_XATTR_MODIFIED, ctx,
13926 FSE_ARG_VNODE, vp,
13927 FSE_ARG_DONE);
13928 }
13929 #endif
13930 vnode_put(vp);
13931 file_drop(uap->fd);
13932 *retval = 0;
13933 return error;
13934 }
13935
13936 /*
13937 * Remove an extended attribute.
13938 * XXX Code duplication here.
13939 */
13940 int
removexattr(proc_t p,struct removexattr_args * uap,int * retval)13941 removexattr(proc_t p, struct removexattr_args *uap, int *retval)
13942 {
13943 vnode_t vp;
13944 struct nameidata nd;
13945 char attrname[XATTR_MAXNAMELEN + 1];
13946 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
13947 vfs_context_t ctx = vfs_context_current();
13948 size_t namelen;
13949 u_int32_t nameiflags;
13950 int error;
13951
13952 if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
13953 return EINVAL;
13954 }
13955
13956 error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
13957 if (error != 0) {
13958 return error;
13959 }
13960 if (xattr_protected(attrname)) {
13961 return EPERM;
13962 }
13963 nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
13964 #if CONFIG_FILE_LEASES
13965 nameiflags |= WANTPARENT;
13966 #endif
13967 NDINIT(&nd, LOOKUP, OP_REMOVEXATTR, nameiflags, spacetype, uap->path, ctx);
13968 if (uap->options & XATTR_NOFOLLOW_ANY) {
13969 nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
13970 }
13971 if (uap->options & XATTR_RESOLVE_BENEATH) {
13972 nd.ni_flag |= NAMEI_RESOLVE_BENEATH;
13973 }
13974
13975 if ((error = namei(&nd))) {
13976 return error;
13977 }
13978 vp = nd.ni_vp;
13979 #if CONFIG_FILE_LEASES
13980 vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
13981 vnode_put(nd.ni_dvp);
13982 #endif
13983 nameidone(&nd);
13984
13985 error = vn_removexattr(vp, attrname, uap->options, ctx);
13986 #if CONFIG_FSE
13987 if (error == 0) {
13988 add_fsevent(FSE_XATTR_REMOVED, ctx,
13989 FSE_ARG_VNODE, vp,
13990 FSE_ARG_DONE);
13991 }
13992 #endif
13993 vnode_put(vp);
13994 *retval = 0;
13995 return error;
13996 }
13997
13998 /*
13999 * Remove an extended attribute.
14000 * XXX Code duplication here.
14001 */
14002 int
fremovexattr(__unused proc_t p,struct fremovexattr_args * uap,int * retval)14003 fremovexattr(__unused proc_t p, struct fremovexattr_args *uap, int *retval)
14004 {
14005 vnode_t vp;
14006 char attrname[XATTR_MAXNAMELEN + 1];
14007 size_t namelen;
14008 int error;
14009 #if CONFIG_FSE
14010 vfs_context_t ctx = vfs_context_current();
14011 #endif
14012
14013 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT |
14014 XATTR_NOFOLLOW_ANY | XATTR_RESOLVE_BENEATH)) {
14015 return EINVAL;
14016 }
14017
14018 error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
14019 if (error != 0) {
14020 return error;
14021 }
14022 if (xattr_protected(attrname)) {
14023 return EPERM;
14024 }
14025 if ((error = file_vnode(uap->fd, &vp))) {
14026 return error;
14027 }
14028 if ((error = vnode_getwithref(vp))) {
14029 file_drop(uap->fd);
14030 return error;
14031 }
14032
14033 #if CONFIG_FILE_LEASES
14034 vnode_breakdirlease(vp, true, O_WRONLY);
14035 #endif
14036
14037 error = vn_removexattr(vp, attrname, uap->options, vfs_context_current());
14038 #if CONFIG_FSE
14039 if (error == 0) {
14040 add_fsevent(FSE_XATTR_REMOVED, ctx,
14041 FSE_ARG_VNODE, vp,
14042 FSE_ARG_DONE);
14043 }
14044 #endif
14045 vnode_put(vp);
14046 file_drop(uap->fd);
14047 *retval = 0;
14048 return error;
14049 }
14050
14051 /*
14052 * Retrieve the list of extended attribute names.
14053 * XXX Code duplication here.
14054 */
int
listxattr(proc_t p, struct listxattr_args *uap, user_ssize_t *retval)
{
	vnode_t vp;
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();
	uio_t auio = NULL;
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	size_t attrsize = 0;
	u_int32_t nameiflags;
	int error;
	UIO_STACKBUF(uio_buf, 1);

	/* These options have no meaning when listing attribute names. */
	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	/* XATTR_NOFOLLOW: operate on a symlink itself rather than its target. */
	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_LISTXATTR, nameiflags, spacetype, uap->path, ctx);
	if (uap->options & XATTR_NOFOLLOW_ANY) {
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}
	if (uap->options & XATTR_RESOLVE_BENEATH) {
		nd.ni_flag |= NAMEI_RESOLVE_BENEATH;
	}

	if ((error = namei(&nd))) {
		return error;
	}
	vp = nd.ni_vp; /* iocount held; released by vnode_put() below */
	nameidone(&nd);
	/* A NULL/zero-sized buffer means the caller is probing for the size. */
	if (uap->namebuf != 0 && uap->bufsize > 0) {
		auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ,
		    &uio_buf[0], sizeof(uio_buf));
		uio_addiov(auio, uap->namebuf, uap->bufsize);
	}

	error = vn_listxattr(vp, auio, &attrsize, uap->options, ctx);

	vnode_put(vp);
	if (auio) {
		/* Report the number of bytes actually copied out. */
		*retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
	} else {
		/* Size probe: report the space needed for the full name list. */
		*retval = (user_ssize_t)attrsize;
	}
	return error;
}
14102
14103 /*
14104 * Retrieve the list of extended attribute names.
14105 * XXX Code duplication here.
14106 */
14107 int
flistxattr(proc_t p,struct flistxattr_args * uap,user_ssize_t * retval)14108 flistxattr(proc_t p, struct flistxattr_args *uap, user_ssize_t *retval)
14109 {
14110 vnode_t vp;
14111 uio_t auio = NULL;
14112 int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
14113 size_t attrsize = 0;
14114 int error;
14115 UIO_STACKBUF(uio_buf, 1);
14116
14117 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT |
14118 XATTR_NOFOLLOW_ANY | XATTR_RESOLVE_BENEATH)) {
14119 return EINVAL;
14120 }
14121
14122 if ((error = file_vnode(uap->fd, &vp))) {
14123 return error;
14124 }
14125 if ((error = vnode_getwithref(vp))) {
14126 file_drop(uap->fd);
14127 return error;
14128 }
14129 if (uap->namebuf != 0 && uap->bufsize > 0) {
14130 auio = uio_createwithbuffer(1, 0, spacetype,
14131 UIO_READ, &uio_buf[0], sizeof(uio_buf));
14132 uio_addiov(auio, uap->namebuf, uap->bufsize);
14133 }
14134
14135 error = vn_listxattr(vp, auio, &attrsize, uap->options, vfs_context_current());
14136
14137 vnode_put(vp);
14138 file_drop(uap->fd);
14139 if (auio) {
14140 *retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
14141 } else {
14142 *retval = (user_ssize_t)attrsize;
14143 }
14144 return error;
14145 }
14146
/*
 * Resolve (volfs_id, objid) to an absolute path.
 *
 * On success the path is left in `buf' and its length is returned through
 * `pathlen'.  build_path() is retried a bounded number of times if the
 * vnode is concurrently moved (EAGAIN); after that the lookup fails with
 * ENOENT.
 */
int
fsgetpath_internal(vfs_context_t ctx, int volfs_id, uint64_t objid,
    vm_size_t bufsize, caddr_t buf, uint32_t options, int *pathlen)
{
	int error;
	vnode_t vp;
	int length;
	int bpflags;
	/* maximum number of times to retry build_path */
	unsigned int retries = 0x10;

	if (bufsize > MAXLONGPATHLEN) {
		return EINVAL;
	}

	if (buf == NULL) {
		return ENOMEM;
	}

retry:
	/* Take an iocount on the vnode named by (volfs_id, objid). */
	error = vnode_getfromid(volfs_id, objid, ctx, options & FSOPT_ISREALFSID, &vp);
	if (error) {
		return error;
	}

#if CONFIG_MACF
	error = mac_vnode_check_fsgetpath(ctx, vp);
	if (error) {
		vnode_put(vp);
		return error;
	}
#endif

	/* Obtain the absolute path to this vnode. */
	bpflags = vfs_context_suser(ctx) ? BUILDPATH_CHECKACCESS : 0;
	if (options & FSOPT_NOFIRMLINKPATH) {
		bpflags |= BUILDPATH_NO_FIRMLINK;
	}
	bpflags |= BUILDPATH_CHECK_MOVED;
	error = build_path(vp, buf, (int)bufsize, &length, bpflags, ctx);
	vnode_put(vp);

	if (error) {
		/* there was a race building the path, try a few more times */
		if (error == EAGAIN) {
			--retries;
			if (retries > 0) {
				goto retry;
			}

			error = ENOENT;
		}
		goto out;
	}

	AUDIT_ARG(text, buf);

	/*
	 * NOTE(review): vp's iocount was already dropped above; it appears
	 * here only as an identifier in the trace record, presumably never
	 * dereferenced by kdebug_vfs_lookup() — confirm against its
	 * implementation.
	 */
	if (kdebug_debugid_enabled(VFS_LOOKUP) && length > 0) {
		kdebug_vfs_lookup(buf, length, vp, KDBG_VFSLKUP_LOOKUP);
	}

	*pathlen = length; /* may be superseded by error */

out:
	return error;
}
14213
14214 /*
14215 * Obtain the full pathname of a file system object by id.
14216 */
14217 static int
fsgetpath_extended(user_addr_t buf,user_size_t bufsize,user_addr_t user_fsid,uint64_t objid,uint32_t options,user_ssize_t * retval)14218 fsgetpath_extended(user_addr_t buf, user_size_t bufsize, user_addr_t user_fsid, uint64_t objid,
14219 uint32_t options, user_ssize_t *retval)
14220 {
14221 vfs_context_t ctx = vfs_context_current();
14222 fsid_t fsid;
14223 char *realpath;
14224 int length;
14225 int error;
14226
14227 if (options & ~(FSOPT_NOFIRMLINKPATH | FSOPT_ISREALFSID)) {
14228 return EINVAL;
14229 }
14230
14231 if ((error = copyin(user_fsid, (caddr_t)&fsid, sizeof(fsid)))) {
14232 return error;
14233 }
14234 AUDIT_ARG(value32, fsid.val[0]);
14235 AUDIT_ARG(value64, objid);
14236 /* Restrict output buffer size for now. */
14237
14238 if (bufsize > MAXLONGPATHLEN || bufsize <= 0) {
14239 return EINVAL;
14240 }
14241 realpath = kalloc_data(bufsize, Z_WAITOK | Z_ZERO);
14242 if (realpath == NULL) {
14243 return ENOMEM;
14244 }
14245
14246 error = fsgetpath_internal(ctx, fsid.val[0], objid, bufsize, realpath,
14247 options, &length);
14248
14249 if (error) {
14250 goto out;
14251 }
14252
14253 error = copyout((caddr_t)realpath, buf, length);
14254
14255 *retval = (user_ssize_t)length; /* may be superseded by error */
14256 out:
14257 kfree_data(realpath, bufsize);
14258 return error;
14259 }
14260
/* Legacy fsgetpath(2) entry point: identical to fsgetpath_ext with no options. */
int
fsgetpath(__unused proc_t p, struct fsgetpath_args *uap, user_ssize_t *retval)
{
	return fsgetpath_extended(uap->buf, uap->bufsize, uap->fsid, uap->objid,
	           0, retval);
}
14267
/* fsgetpath_ext(2): fsgetpath with caller-supplied options (e.g. FSOPT_NOFIRMLINKPATH). */
int
fsgetpath_ext(__unused proc_t p, struct fsgetpath_ext_args *uap, user_ssize_t *retval)
{
	return fsgetpath_extended(uap->buf, uap->bufsize, uap->fsid, uap->objid,
	           uap->options, retval);
}
14274
14275 /*
14276 * Common routine to handle various flavors of statfs data heading out
14277 * to user space.
14278 *
14279 * Returns: 0 Success
14280 * EFAULT
14281 */
static int
munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
    user_addr_t bufp, int *sizep, boolean_t is_64_bit,
    boolean_t partial_copy)
{
	int error;
	int my_size, copy_size;

	if (is_64_bit) {
		struct user64_statfs sfs;
		my_size = copy_size = sizeof(sfs);
		bzero(&sfs, my_size);
		sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
		sfs.f_type = (short)mp->mnt_vtable->vfc_typenum;
		sfs.f_reserved1 = (short)sfsp->f_fssubtype;
		sfs.f_bsize = (user64_long_t)sfsp->f_bsize;
		sfs.f_iosize = (user64_long_t)sfsp->f_iosize;
		sfs.f_blocks = (user64_long_t)sfsp->f_blocks;
		sfs.f_bfree = (user64_long_t)sfsp->f_bfree;
		sfs.f_bavail = (user64_long_t)sfsp->f_bavail;
		sfs.f_files = (user64_long_t)sfsp->f_files;
		sfs.f_ffree = (user64_long_t)sfsp->f_ffree;
		sfs.f_fsid = sfsp->f_fsid;
		sfs.f_owner = sfsp->f_owner;
		vfs_getfstypename(mp, sfs.f_fstypename, MFSNAMELEN);
		strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
		strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);

		/* Partial copy omits the trailing reserved fields. */
		if (partial_copy) {
			copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
		}
		error = copyout((caddr_t)&sfs, bufp, copy_size);
	} else {
		struct user32_statfs sfs;

		my_size = copy_size = sizeof(sfs);
		bzero(&sfs, my_size);

		sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
		sfs.f_type = (short)mp->mnt_vtable->vfc_typenum;
		sfs.f_reserved1 = (short)sfsp->f_fssubtype;

		/*
		 * It's possible for there to be more than 2^^31 blocks in the filesystem, so we
		 * have to fudge the numbers here in that case. We inflate the blocksize in order
		 * to reflect the filesystem size as best we can.
		 */
		if ((sfsp->f_blocks > INT_MAX)
		    /* Hack for 4061702. I think the real fix is for Carbon to
		     * look for some volume capability and not depend on hidden
		     * semantics agreed between a FS and Carbon.
		     * f_blocks, f_bfree, and f_bavail set to -1 is the trigger
		     * for Carbon to set bNoVolumeSizes volume attribute.
		     * Without this the webdavfs files cannot be copied onto
		     * disk as they look huge. This change should not affect
		     * XSAN as it should not be setting these to -1.
		     */
		    && (sfsp->f_blocks != 0xffffffffffffffffULL)
		    && (sfsp->f_bfree != 0xffffffffffffffffULL)
		    && (sfsp->f_bavail != 0xffffffffffffffffULL)) {
			int shift;

			/*
			 * Work out how far we have to shift the block count down to make it fit.
			 * Note that it's possible to have to shift so far that the resulting
			 * blocksize would be unreportably large. At that point, we will clip
			 * any values that don't fit.
			 *
			 * For safety's sake, we also ensure that f_iosize is never reported as
			 * being smaller than f_bsize.
			 */
			for (shift = 0; shift < 32; shift++) {
				if ((sfsp->f_blocks >> shift) <= INT_MAX) {
					break;
				}
				if ((sfsp->f_bsize << (shift + 1)) > INT_MAX) {
					break;
				}
			}
#define __SHIFT_OR_CLIP(x, s) ((((x) >> (s)) > INT_MAX) ? INT_MAX : ((x) >> (s)))
			sfs.f_blocks = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_blocks, shift);
			sfs.f_bfree = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bfree, shift);
			sfs.f_bavail = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bavail, shift);
#undef __SHIFT_OR_CLIP
			/* The inflated blocksize compensates for the shifted counts. */
			sfs.f_bsize = (user32_long_t)(sfsp->f_bsize << shift);
			sfs.f_iosize = (int)lmax(sfsp->f_iosize, sfsp->f_bsize);
		} else {
			/* filesystem is small enough to be reported honestly */
			sfs.f_bsize = (user32_long_t)sfsp->f_bsize;
			sfs.f_iosize = (user32_long_t)sfsp->f_iosize;
			sfs.f_blocks = (user32_long_t)sfsp->f_blocks;
			sfs.f_bfree = (user32_long_t)sfsp->f_bfree;
			sfs.f_bavail = (user32_long_t)sfsp->f_bavail;
		}
		sfs.f_files = (user32_long_t)sfsp->f_files;
		sfs.f_ffree = (user32_long_t)sfsp->f_ffree;
		sfs.f_fsid = sfsp->f_fsid;
		sfs.f_owner = sfsp->f_owner;
		vfs_getfstypename(mp, sfs.f_fstypename, MFSNAMELEN);
		strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
		strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);

		/* Partial copy omits the trailing reserved fields. */
		if (partial_copy) {
			copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
		}
		error = copyout((caddr_t)&sfs, bufp, copy_size);
	}

	/* Report the full (non-partial) structure size to the caller. */
	if (sizep != NULL) {
		*sizep = my_size;
	}
	return error;
}
14395
14396 /*
14397 * copy stat structure into user_stat structure.
14398 */
/*
 * Copy a kernel struct stat into the 64-bit user-space layout,
 * field by field.  The destination is zeroed first so padding and
 * any unset fields never leak kernel stack contents to user space.
 */
void
munge_user64_stat(struct stat *sbp, struct user64_stat *usbp)
{
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
	/* Timestamp field names differ between the POSIX and extended ABIs. */
#ifndef _POSIX_C_SOURCE
	usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
#else
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
14435
14436 void
munge_user32_stat(struct stat * sbp,struct user32_stat * usbp)14437 munge_user32_stat(struct stat *sbp, struct user32_stat *usbp)
14438 {
14439 bzero(usbp, sizeof(*usbp));
14440
14441 usbp->st_dev = sbp->st_dev;
14442 usbp->st_ino = sbp->st_ino;
14443 usbp->st_mode = sbp->st_mode;
14444 usbp->st_nlink = sbp->st_nlink;
14445 usbp->st_uid = sbp->st_uid;
14446 usbp->st_gid = sbp->st_gid;
14447 usbp->st_rdev = sbp->st_rdev;
14448 #ifndef _POSIX_C_SOURCE
14449 usbp->st_atimespec.tv_sec = (user32_time_t)sbp->st_atimespec.tv_sec;
14450 usbp->st_atimespec.tv_nsec = (user32_long_t)sbp->st_atimespec.tv_nsec;
14451 usbp->st_mtimespec.tv_sec = (user32_time_t)sbp->st_mtimespec.tv_sec;
14452 usbp->st_mtimespec.tv_nsec = (user32_long_t)sbp->st_mtimespec.tv_nsec;
14453 usbp->st_ctimespec.tv_sec = (user32_time_t)sbp->st_ctimespec.tv_sec;
14454 usbp->st_ctimespec.tv_nsec = (user32_long_t)sbp->st_ctimespec.tv_nsec;
14455 #else
14456 usbp->st_atime = sbp->st_atime;
14457 usbp->st_atimensec = sbp->st_atimensec;
14458 usbp->st_mtime = sbp->st_mtime;
14459 usbp->st_mtimensec = sbp->st_mtimensec;
14460 usbp->st_ctime = sbp->st_ctime;
14461 usbp->st_ctimensec = sbp->st_ctimensec;
14462 #endif
14463 usbp->st_size = sbp->st_size;
14464 usbp->st_blocks = sbp->st_blocks;
14465 usbp->st_blksize = sbp->st_blksize;
14466 usbp->st_flags = sbp->st_flags;
14467 usbp->st_gen = sbp->st_gen;
14468 usbp->st_lspare = sbp->st_lspare;
14469 usbp->st_qspare[0] = sbp->st_qspare[0];
14470 usbp->st_qspare[1] = sbp->st_qspare[1];
14471 }
14472
14473 /*
14474 * copy stat64 structure into user_stat64 structure.
14475 */
14476 void
munge_user64_stat64(struct stat64 * sbp,struct user64_stat64 * usbp)14477 munge_user64_stat64(struct stat64 *sbp, struct user64_stat64 *usbp)
14478 {
14479 bzero(usbp, sizeof(*usbp));
14480
14481 usbp->st_dev = sbp->st_dev;
14482 usbp->st_ino = sbp->st_ino;
14483 usbp->st_mode = sbp->st_mode;
14484 usbp->st_nlink = sbp->st_nlink;
14485 usbp->st_uid = sbp->st_uid;
14486 usbp->st_gid = sbp->st_gid;
14487 usbp->st_rdev = sbp->st_rdev;
14488 #ifndef _POSIX_C_SOURCE
14489 usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
14490 usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
14491 usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
14492 usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
14493 usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
14494 usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
14495 usbp->st_birthtimespec.tv_sec = sbp->st_birthtimespec.tv_sec;
14496 usbp->st_birthtimespec.tv_nsec = sbp->st_birthtimespec.tv_nsec;
14497 #else
14498 usbp->st_atime = sbp->st_atime;
14499 usbp->st_atimensec = sbp->st_atimensec;
14500 usbp->st_mtime = sbp->st_mtime;
14501 usbp->st_mtimensec = sbp->st_mtimensec;
14502 usbp->st_ctime = sbp->st_ctime;
14503 usbp->st_ctimensec = sbp->st_ctimensec;
14504 usbp->st_birthtime = sbp->st_birthtime;
14505 usbp->st_birthtimensec = sbp->st_birthtimensec;
14506 #endif
14507 usbp->st_size = sbp->st_size;
14508 usbp->st_blocks = sbp->st_blocks;
14509 usbp->st_blksize = sbp->st_blksize;
14510 usbp->st_flags = sbp->st_flags;
14511 usbp->st_gen = sbp->st_gen;
14512 usbp->st_lspare = sbp->st_lspare;
14513 usbp->st_qspare[0] = sbp->st_qspare[0];
14514 usbp->st_qspare[1] = sbp->st_qspare[1];
14515 }
14516
14517 void
munge_user32_stat64(struct stat64 * sbp,struct user32_stat64 * usbp)14518 munge_user32_stat64(struct stat64 *sbp, struct user32_stat64 *usbp)
14519 {
14520 bzero(usbp, sizeof(*usbp));
14521
14522 usbp->st_dev = sbp->st_dev;
14523 usbp->st_ino = sbp->st_ino;
14524 usbp->st_mode = sbp->st_mode;
14525 usbp->st_nlink = sbp->st_nlink;
14526 usbp->st_uid = sbp->st_uid;
14527 usbp->st_gid = sbp->st_gid;
14528 usbp->st_rdev = sbp->st_rdev;
14529 #ifndef _POSIX_C_SOURCE
14530 usbp->st_atimespec.tv_sec = (user32_time_t)sbp->st_atimespec.tv_sec;
14531 usbp->st_atimespec.tv_nsec = (user32_long_t)sbp->st_atimespec.tv_nsec;
14532 usbp->st_mtimespec.tv_sec = (user32_time_t)sbp->st_mtimespec.tv_sec;
14533 usbp->st_mtimespec.tv_nsec = (user32_long_t)sbp->st_mtimespec.tv_nsec;
14534 usbp->st_ctimespec.tv_sec = (user32_time_t)sbp->st_ctimespec.tv_sec;
14535 usbp->st_ctimespec.tv_nsec = (user32_long_t)sbp->st_ctimespec.tv_nsec;
14536 usbp->st_birthtimespec.tv_sec = (user32_time_t)sbp->st_birthtimespec.tv_sec;
14537 usbp->st_birthtimespec.tv_nsec = (user32_long_t)sbp->st_birthtimespec.tv_nsec;
14538 #else
14539 usbp->st_atime = sbp->st_atime;
14540 usbp->st_atimensec = sbp->st_atimensec;
14541 usbp->st_mtime = sbp->st_mtime;
14542 usbp->st_mtimensec = sbp->st_mtimensec;
14543 usbp->st_ctime = sbp->st_ctime;
14544 usbp->st_ctimensec = sbp->st_ctimensec;
14545 usbp->st_birthtime = sbp->st_birthtime;
14546 usbp->st_birthtimensec = sbp->st_birthtimensec;
14547 #endif
14548 usbp->st_size = sbp->st_size;
14549 usbp->st_blocks = sbp->st_blocks;
14550 usbp->st_blksize = sbp->st_blksize;
14551 usbp->st_flags = sbp->st_flags;
14552 usbp->st_gen = sbp->st_gen;
14553 usbp->st_lspare = sbp->st_lspare;
14554 usbp->st_qspare[0] = sbp->st_qspare[0];
14555 usbp->st_qspare[1] = sbp->st_qspare[1];
14556 }
14557
14558 /*
14559 * Purge buffer cache for simulating cold starts
14560 */
/*
 * Per-vnode iterator callback: push any dirty pages and invalidate
 * the vnode's cached pages in the unified buffer cache.
 */
static int
vnode_purge_callback(struct vnode *vp, __unused void *cargs)
{
	ubc_msync(vp, (off_t)0, ubc_getsize(vp), NULL /* off_t *resid_off */, UBC_PUSHALL | UBC_INVALIDATE);

	return VNODE_RETURNED;
}
14568
/* Per-mount iterator callback: purge every vnode on this mount. */
static int
vfs_purge_callback(mount_t mp, __unused void * arg)
{
	vnode_iterate(mp, VNODE_WAIT | VNODE_ITERATE_ALL, vnode_purge_callback, NULL);

	return VFS_RETURNED;
}
14576
/*
 * Boot-arg/sysctl (vfs.purge_vm_pagers, default TRUE): when set,
 * vfs_purge() additionally purges file-backed VM pagers.
 */
static TUNABLE_WRITEABLE(boolean_t, vfs_purge_vm_pagers, "vfs_purge_vm_pagers", TRUE);
SYSCTL_INT(_vfs, OID_AUTO, purge_vm_pagers, CTLFLAG_RW | CTLFLAG_LOCKED, &vfs_purge_vm_pagers, 0, "VFS purge also purges file-backed VM pagers");
14579
int
vfs_purge(__unused struct proc *p, __unused struct vfs_purge_args *uap, __unused int32_t *retval)
{
	/* Root only: flushing every mount's cached pages is disruptive. */
	if (!kauth_cred_issuser(kauth_cred_get())) {
		return EPERM;
	}

	/* Push and invalidate cached pages for every vnode on every mount. */
	vfs_iterate(0 /* flags */, vfs_purge_callback, NULL);

	/* also flush any VM pagers backed by files */
	if (vfs_purge_vm_pagers) {
		vm_purge_filebacked_pagers();
	}

	return 0;
}
14596
14597 /*
14598 * gets the vnode associated with the (unnamed) snapshot directory
14599 * for a Filesystem. The snapshot directory vnode is returned with
14600 * an iocount on it.
14601 */
int
vnode_get_snapdir(vnode_t rvp, vnode_t *sdvpp, vfs_context_t ctx)
{
	/* Delegate to the filesystem; *sdvpp comes back with an iocount. */
	return VFS_VGET_SNAPDIR(vnode_mount(rvp), sdvpp, ctx);
}
14607
14608 /*
14609 * Get the snapshot vnode.
14610 *
 * If successful, the call returns with an iocount on *rvpp and *sdvpp, and
 * the caller must call nameidone() on ndp.
14613 *
14614 * If the snapshot vnode exists it is returned in ndp->ni_vp.
14615 *
14616 * If it returns with an error, *rvpp, *sdvpp are NULL and nameidone() is
14617 * not needed.
14618 */
static int
vnode_get_snapshot(int dirfd, vnode_t *rvpp, vnode_t *sdvpp,
    user_addr_t name, struct nameidata *ndp, int32_t op,
#if !CONFIG_TRIGGERS
    __unused
#endif
    enum path_operation pathop,
    vfs_context_t ctx)
{
	int error, i;
	caddr_t name_buf;
	size_t name_len;
	struct vfs_attr vfa;

	/* Pre-clear outputs so the error path can release unconditionally. */
	*sdvpp = NULLVP;
	*rvpp = NULLVP;

	/* dirfd must name the root vnode of the filesystem. */
	error = vnode_getfromfd(ctx, dirfd, rvpp);
	if (error) {
		return error;
	}

	if (!vnode_isvroot(*rvpp)) {
		error = EINVAL;
		goto out;
	}

	/* Make sure the filesystem supports snapshots */
	VFSATTR_INIT(&vfa);
	VFSATTR_WANTED(&vfa, f_capabilities);
	if ((vfs_getattr(vnode_mount(*rvpp), &vfa, ctx) != 0) ||
	    !VFSATTR_IS_SUPPORTED(&vfa, f_capabilities) ||
	    !((vfa.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] &
	    VOL_CAP_INT_SNAPSHOT)) ||
	    !((vfa.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] &
	    VOL_CAP_INT_SNAPSHOT))) {
		error = ENOTSUP;
		goto out;
	}

	/* Get the (unnamed) snapshot directory; returned with an iocount. */
	error = vnode_get_snapdir(*rvpp, sdvpp, ctx);
	if (error) {
		goto out;
	}

	name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
	if (error) {
		goto out1;
	}

	/*
	 * Some sanity checks- name can't be empty, "." or ".." or have slashes.
	 * (the length returned by copyinstr includes the terminating NUL)
	 */
	if ((name_len == 1) || (name_len == 2 && name_buf[0] == '.') ||
	    (name_len == 3 && name_buf[0] == '.' && name_buf[1] == '.')) {
		error = EINVAL;
		goto out1;
	}
	/* Scan for a '/'; finding one before the end means a multi-component name. */
	for (i = 0; i < (int)name_len && name_buf[i] != '/'; i++) {
		;
	}
	if (i < (int)name_len) {
		error = EINVAL;
		goto out1;
	}

#if CONFIG_MACF
	if (op == CREATE) {
		error = mac_mount_check_snapshot_create(ctx, vnode_mount(*rvpp),
		    name_buf);
	} else if (op == DELETE) {
		error = mac_mount_check_snapshot_delete(ctx, vnode_mount(*rvpp),
		    name_buf);
	}
	if (error) {
		goto out1;
	}
#endif

	/* Check if the snapshot already exists ... */
	NDINIT(ndp, op, pathop, USEDVP | NOCACHE | AUDITVNPATH1,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(name_buf), ctx);
	ndp->ni_dvp = *sdvpp;

	/* On success the caller owns nameidone(ndp) and any ndp->ni_vp iocount. */
	error = namei(ndp);
out1:
	zfree(ZV_NAMEI, name_buf);
out:
	/* On error, release both vnodes so the caller sees NULLVP outputs. */
	if (error) {
		if (*sdvpp) {
			vnode_put(*sdvpp);
			*sdvpp = NULLVP;
		}
		if (*rvpp) {
			vnode_put(*rvpp);
			*rvpp = NULLVP;
		}
	}
	return error;
}
14721
14722 /*
14723 * create a filesystem snapshot (for supporting filesystems)
14724 *
14725 * A much simplified version of openat(dirfd, name, O_CREAT | O_EXCL)
14726 * We get to the (unnamed) snapshot directory vnode and create the vnode
14727 * for the snapshot in it.
14728 *
14729 * Restrictions:
14730 *
14731 * a) Passed in name for snapshot cannot have slashes.
14732 * b) name can't be "." or ".."
14733 *
14734 * Since this requires superuser privileges, vnode_authorize calls are not
14735 * made.
14736 */
static int __attribute__((noinline))
snapshot_create(int dirfd, user_addr_t name, uint32_t flags,
    vfs_context_t ctx)
{
	vnode_t rvp, snapdvp;
	int error;
	/* Heap-allocated: struct nameidata is too large for the kernel stack. */
	struct nameidata *ndp;

	/* No flags are currently defined */
	if (flags) {
		printf("snapshot_create: Invalid flags passed 0x%x\n", flags);
		return EINVAL;
	}

	ndp = kalloc_type(struct nameidata, Z_WAITOK);

	/* Look up the snapshot name under the snapshot directory. */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, ndp, CREATE,
	    OP_LINK, ctx);
	if (error) {
		goto out;
	}

	if (ndp->ni_vp) {
		/* Lookup succeeded: a snapshot by that name already exists. */
		vnode_put(ndp->ni_vp);
		error = EEXIST;
	} else {
		struct vnode_attr *vap;
		vnode_t vp = NULLVP;

		vap = kalloc_type(struct vnode_attr, Z_WAITOK);

		VATTR_INIT(vap);
		VATTR_SET(vap, va_type, VREG);
		VATTR_SET(vap, va_mode, 0);

		/*
		 * Superuser-only operation, so skip authorization and
		 * inheritance (VN_CREATE_NOAUTH | VN_CREATE_NOINHERIT).
		 */
		error = vn_create(snapdvp, &vp, ndp, vap,
		    VN_CREATE_NOAUTH | VN_CREATE_NOINHERIT, 0, NULL, ctx);
		if (!error && vp) {
			vnode_put(vp);
		}

		kfree_type(struct vnode_attr, vap);
	}

	nameidone(ndp);
	vnode_put(snapdvp);
	vnode_put(rvp);
out:
	kfree_type(struct nameidata, ndp);

	return error;
}
14789
14790 /*
14791 * Delete a Filesystem snapshot
14792 *
14793 * get the vnode for the unnamed snapshot directory and the snapshot and
14794 * delete the snapshot.
14795 */
static int __attribute__((noinline))
snapshot_delete(int dirfd, user_addr_t name, uint32_t flags,
    vfs_context_t ctx)
{
	vnode_t rvp, snapdvp;
	int error;
	/* Heap-allocated: struct nameidata is too large for the kernel stack. */
	struct nameidata *ndp;

	/* No flags are currently defined */
	if (flags) {
		printf("snapshot_delete: Invalid flags passed 0x%x\n", flags);
		return EINVAL;
	}

	ndp = kalloc_type(struct nameidata, Z_WAITOK);

	/* Resolve the snapshot vnode (returned in ndp->ni_vp) for deletion. */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, ndp, DELETE,
	    OP_UNLINK, ctx);
	if (error) {
		goto out;
	}

	error = VNOP_REMOVE(snapdvp, ndp->ni_vp, &ndp->ni_cnd,
	    VNODE_REMOVE_SKIP_NAMESPACE_EVENT, ctx);

	/* Release all references taken by vnode_get_snapshot(). */
	vnode_put(ndp->ni_vp);
	nameidone(ndp);
	vnode_put(snapdvp);
	vnode_put(rvp);
out:
	kfree_type(struct nameidata, ndp);

	return error;
}
14830
14831 /*
14832 * Revert a filesystem to a snapshot
14833 *
14834 * Marks the filesystem to revert to the given snapshot on next mount.
14835 */
static int __attribute__((noinline))
snapshot_revert(int dirfd, user_addr_t name, uint32_t flags,
    vfs_context_t ctx)
{
	int error;
	vnode_t rvp;
	mount_t mp;
	struct fs_snapshot_revert_args revert_data;
	struct componentname cnp;
	caddr_t name_buf;
	size_t name_len;

	/* No flags are currently defined */
	if (flags) {
		printf("snapshot_revert: Invalid flags passed 0x%x\n", flags);
		return EINVAL;
	}

	error = vnode_getfromfd(ctx, dirfd, &rvp);
	if (error) {
		return error;
	}
	mp = vnode_mount(rvp);

	name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
	if (error) {
		zfree(ZV_NAMEI, name_buf);
		vnode_put(rvp);
		return error;
	}

#if CONFIG_MACF
	error = mac_mount_check_snapshot_revert(ctx, mp, name_buf);
	if (error) {
		zfree(ZV_NAMEI, name_buf);
		vnode_put(rvp);
		return error;
	}
#endif

	/*
	 * Grab mount_iterref so that we can release the vnode,
	 * since VFSIOC_REVERT_SNAPSHOT could conceivably cause a sync.
	 */
	error = mount_iterref(mp, 0);
	vnode_put(rvp);
	if (error) {
		zfree(ZV_NAMEI, name_buf);
		return error;
	}

	/* Hand the snapshot name to the FS as a pre-built componentname. */
	memset(&cnp, 0, sizeof(cnp));
	cnp.cn_pnbuf = (char *)name_buf;
	cnp.cn_nameiop = LOOKUP;
	cnp.cn_flags = ISLASTCN | HASBUF;
	cnp.cn_pnlen = MAXPATHLEN;
	cnp.cn_nameptr = cnp.cn_pnbuf;
	cnp.cn_namelen = (int)name_len;
	revert_data.sr_cnp = &cnp;

	error = VFS_IOCTL(mp, VFSIOC_REVERT_SNAPSHOT, (caddr_t)&revert_data, 0, ctx);
	mount_iterdrop(mp);
	zfree(ZV_NAMEI, name_buf);

	if (error) {
		/* If there was any error, try again using VNOP_IOCTL */

		vnode_t snapdvp;
		struct nameidata namend;

		/* Re-resolve the snapshot vnode and issue the ioctl on it. */
		error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, &namend, LOOKUP,
		    OP_LOOKUP, ctx);
		if (error) {
			return error;
		}


		error = VNOP_IOCTL(namend.ni_vp, APFSIOC_REVERT_TO_SNAPSHOT, (caddr_t) NULL,
		    0, ctx);

		vnode_put(namend.ni_vp);
		nameidone(&namend);
		vnode_put(snapdvp);
		vnode_put(rvp);
	}

	return error;
}
14925
14926 /*
14927 * rename a Filesystem snapshot
14928 *
14929 * get the vnode for the unnamed snapshot directory and the snapshot and
14930 * rename the snapshot. This is a very specialised (and simple) case of
14931 * rename(2) (which has to deal with a lot more complications). It differs
14932 * slightly from rename(2) in that EEXIST is returned if the new name exists.
14933 */
static int __attribute__((noinline))
snapshot_rename(int dirfd, user_addr_t old, user_addr_t new,
    uint32_t flags, vfs_context_t ctx)
{
	vnode_t rvp, snapdvp;
	int error, i;
	caddr_t newname_buf;
	size_t name_len;
	vnode_t fvp;
	struct nameidata *fromnd, *tond;
	/* carving out a chunk for structs that are too big to be on stack. */
	struct {
		struct nameidata from_node;
		struct nameidata to_node;
	} * __rename_data;

	/* No flags are currently defined */
	if (flags) {
		printf("snapshot_rename: Invalid flags passed 0x%x\n", flags);
		return EINVAL;
	}

	__rename_data = kalloc_type(typeof(*__rename_data), Z_WAITOK);
	fromnd = &__rename_data->from_node;
	tond = &__rename_data->to_node;

	/* Resolve the existing snapshot (DELETE: it is leaving its old name). */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, old, fromnd, DELETE,
	    OP_UNLINK, ctx);
	if (error) {
		goto out;
	}
	fvp = fromnd->ni_vp;

	newname_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	error = copyinstr(new, newname_buf, MAXPATHLEN, &name_len);
	if (error) {
		goto out1;
	}

	/*
	 * Some sanity checks- new name can't be empty, "." or ".." or have
	 * slashes.
	 * (the length returned by copyinstr includes the terminating NUL)
	 *
	 * The FS rename VNOP is supposed to handle this but we'll pick it
	 * off here itself.
	 */
	if ((name_len == 1) || (name_len == 2 && newname_buf[0] == '.') ||
	    (name_len == 3 && newname_buf[0] == '.' && newname_buf[1] == '.')) {
		error = EINVAL;
		goto out1;
	}
	/* Scan for a '/'; finding one before the end means a multi-component name. */
	for (i = 0; i < (int)name_len && newname_buf[i] != '/'; i++) {
		;
	}
	if (i < (int)name_len) {
		error = EINVAL;
		goto out1;
	}

#if CONFIG_MACF
	/* Renaming to a new name is, from MAC's view, creating that name. */
	error = mac_mount_check_snapshot_create(ctx, vnode_mount(rvp),
	    newname_buf);
	if (error) {
		goto out1;
	}
#endif

	/* Look up the destination name under the snapshot directory. */
	NDINIT(tond, RENAME, OP_RENAME, USEDVP | NOCACHE | AUDITVNPATH2,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(newname_buf), ctx);
	tond->ni_dvp = snapdvp;

	error = namei(tond);
	if (error) {
		goto out2;
	} else if (tond->ni_vp) {
		/*
		 * snapshot rename behaves differently than rename(2) - if the
		 * new name exists, EEXIST is returned.
		 */
		vnode_put(tond->ni_vp);
		error = EEXIST;
		goto out2;
	}

	error = VNOP_RENAME(snapdvp, fvp, &fromnd->ni_cnd, snapdvp, NULLVP,
	    &tond->ni_cnd, ctx);

out2:
	nameidone(tond);
out1:
	zfree(ZV_NAMEI, newname_buf);
	vnode_put(fvp);
	vnode_put(snapdvp);
	vnode_put(rvp);
	nameidone(fromnd);
out:
	kfree_type(typeof(*__rename_data), __rename_data);
	return error;
}
15034
/*
 * Mount a Filesystem snapshot
 *
 * get the vnode for the unnamed snapshot directory and the snapshot and
 * mount the snapshot.
 */
static int __attribute__((noinline))
snapshot_mount(int dirfd, user_addr_t name, user_addr_t directory,
    __unused user_addr_t mnt_data, uint32_t flags, vfs_context_t ctx)
{
	mount_t mp;
	vnode_t rvp, snapdvp, snapvp, vp, pvp;
	struct fs_snapshot_mount_args smnt_data;
	int error, mount_flags = 0;
	struct nameidata *snapndp, *dirndp;
	/* carving out a chunk for structs that are too big to be on stack. */
	struct {
		struct nameidata snapnd;
		struct nameidata dirnd;
	} * __snapshot_mount_data;

	/* Check for invalid flags */
	if (flags & ~SNAPSHOT_MNT_VALIDMASK) {
		printf("snapshot_mount: Invalid flags passed 0x%x\n", flags);
		return EINVAL;
	}

	__snapshot_mount_data = kalloc_type(typeof(*__snapshot_mount_data), Z_WAITOK);
	snapndp = &__snapshot_mount_data->snapnd;
	dirndp = &__snapshot_mount_data->dirnd;

	/*
	 * On success this returns with iocounts held on rvp (the filesystem
	 * root), snapdvp (the snapshot directory) and snapndp->ni_vp (the
	 * snapshot itself); all three are dropped in the cleanup path below.
	 */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, snapndp, LOOKUP,
	    OP_LOOKUP, ctx);
	if (error) {
		goto out;
	}

	snapvp = snapndp->ni_vp;
	/* The filesystem hosting the snapshot must still be mounted and alive. */
	if (!vnode_mount(rvp) || (vnode_mount(rvp) == dead_mountp)) {
		error = EIO;
		goto out1;
	}

	/* Convert snapshot_mount flags to mount flags */
	if (flags & SNAPSHOT_MNT_NOEXEC) {
		mount_flags |= MNT_NOEXEC;
	}
	if (flags & SNAPSHOT_MNT_NOSUID) {
		mount_flags |= MNT_NOSUID;
	}
	if (flags & SNAPSHOT_MNT_NODEV) {
		mount_flags |= MNT_NODEV;
	}
	if (flags & SNAPSHOT_MNT_DONTBROWSE) {
		mount_flags |= MNT_DONTBROWSE;
	}
	if (flags & SNAPSHOT_MNT_IGNORE_OWNERSHIP) {
		mount_flags |= MNT_IGNORE_OWNERSHIP;
	}
	if (flags & SNAPSHOT_MNT_NOFOLLOW) {
		mount_flags |= MNT_NOFOLLOW;
	}

	/* Get the vnode to be covered */
	NDINIT(dirndp, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
	    UIO_USERSPACE, directory, ctx);
	if (mount_flags & MNT_NOFOLLOW) {
		/* caller asked that no symlinks be traversed anywhere in the path */
		dirndp->ni_flag |= NAMEI_NOFOLLOW_ANY;
	}

	error = namei(dirndp);
	if (error) {
		goto out1;
	}

	vp = dirndp->ni_vp;
	pvp = dirndp->ni_dvp;
	mp = vnode_mount(rvp);

	/* Refuse to cover the root directory of the root filesystem. */
	if ((vp->v_flag & VROOT) && (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
		error = EINVAL;
		goto out2;
	}

#if CONFIG_MACF
	error = mac_mount_check_snapshot_mount(ctx, rvp, vp, &dirndp->ni_cnd, snapndp->ni_cnd.cn_nameptr,
	    mp->mnt_vfsstat.f_fstypename);
	if (error) {
		goto out2;
	}
#endif

	/*
	 * Hand the source mount and the snapshot's component name to the FS
	 * via mount_common() with KERNEL_MOUNT_SNAPSHOT; smnt_data is a
	 * kernel-space structure, hence the CAST_USER_ADDR_T.
	 */
	smnt_data.sm_mp = mp;
	smnt_data.sm_cnp = &snapndp->ni_cnd;
	error = mount_common(mp->mnt_vfsstat.f_fstypename, pvp, vp,
	    &dirndp->ni_cnd, CAST_USER_ADDR_T(&smnt_data), mount_flags,
	    KERNEL_MOUNT_SNAPSHOT, NULL, ctx);

	/* Unwind in strict reverse order of acquisition. */
out2:
	vnode_put(vp);
	vnode_put(pvp);
	nameidone(dirndp);
out1:
	vnode_put(snapvp);
	vnode_put(snapdvp);
	vnode_put(rvp);
	nameidone(snapndp);
out:
	kfree_type(typeof(*__snapshot_mount_data), __snapshot_mount_data);
	return error;
}
15146
/*
 * Root from a snapshot of the filesystem
 *
 * Marks the filesystem to root from the given snapshot on next boot.
 */
static int __attribute__((noinline))
snapshot_root(int dirfd, user_addr_t name, uint32_t flags,
    vfs_context_t ctx)
{
	int error;
	vnode_t rvp;
	mount_t mp;
	struct fs_snapshot_root_args root_data;
	struct componentname cnp;
	caddr_t name_buf;
	size_t name_len;

	/* No flags are currently defined */
	if (flags) {
		printf("snapshot_root: Invalid flags passed 0x%x\n", flags);
		return EINVAL;
	}

	error = vnode_getfromfd(ctx, dirfd, &rvp);
	if (error) {
		return error;
	}
	mp = vnode_mount(rvp);

	/* Copy the snapshot name in from userspace into a ZV_NAMEI buffer. */
	name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
	if (error) {
		zfree(ZV_NAMEI, name_buf);
		vnode_put(rvp);
		return error;
	}

	// XXX MAC checks ?

	/*
	 * Grab mount_iterref so that we can release the vnode,
	 * since VFSIOC_ROOT_SNAPSHOT could conceivably cause a sync.
	 */
	error = mount_iterref(mp, 0);
	vnode_put(rvp);
	if (error) {
		zfree(ZV_NAMEI, name_buf);
		return error;
	}

	/*
	 * Build a componentname by hand (no namei pass): HASBUF marks the
	 * pathname buffer as caller-supplied, ISLASTCN since the snapshot
	 * name is a single final component.
	 * NOTE(review): name_len from copyinstr includes the terminating
	 * NUL, so cn_namelen here counts the NUL as well — presumably the
	 * FS ioctl handler tolerates this; confirm against consumers.
	 */
	memset(&cnp, 0, sizeof(cnp));
	cnp.cn_pnbuf = (char *)name_buf;
	cnp.cn_nameiop = LOOKUP;
	cnp.cn_flags = ISLASTCN | HASBUF;
	cnp.cn_pnlen = MAXPATHLEN;
	cnp.cn_nameptr = cnp.cn_pnbuf;
	cnp.cn_namelen = (int)name_len;
	root_data.sr_cnp = &cnp;

	/* Ask the filesystem to arrange rooting from the snapshot on next boot. */
	error = VFS_IOCTL(mp, VFSIOC_ROOT_SNAPSHOT, (caddr_t)&root_data, 0, ctx);

	mount_iterdrop(mp);
	zfree(ZV_NAMEI, name_buf);

	return error;
}
15213
15214 static boolean_t
vfs_context_can_snapshot(vfs_context_t ctx)15215 vfs_context_can_snapshot(vfs_context_t ctx)
15216 {
15217 static const char * const snapshot_entitlements[] = {
15218 "com.apple.private.vfs.snapshot",
15219 "com.apple.developer.vfs.snapshot",
15220 "com.apple.private.apfs.arv.limited.snapshot",
15221 };
15222 static const size_t nentitlements =
15223 sizeof(snapshot_entitlements) / sizeof(snapshot_entitlements[0]);
15224 size_t i;
15225
15226 task_t task = vfs_context_task(ctx);
15227 for (i = 0; i < nentitlements; i++) {
15228 if (IOTaskHasEntitlement(task, snapshot_entitlements[i])) {
15229 return TRUE;
15230 }
15231 }
15232 return FALSE;
15233 }
15234
/*
 * FS snapshot operations dispatcher
 *
 * Gates all operations on a snapshot entitlement, performs additional
 * user authorization for operations that modify state (everything except
 * SNAPSHOT_OP_MOUNT), then dispatches to the per-op helper.
 */
int
fs_snapshot(__unused proc_t p, struct fs_snapshot_args *uap,
    __unused int32_t *retval)
{
	int error;
	vfs_context_t ctx = vfs_context_current();

	AUDIT_ARG(fd, uap->dirfd);
	AUDIT_ARG(value32, uap->op);

	/* Entirely entitlement-gated: no entitlement, no snapshot syscall. */
	if (!vfs_context_can_snapshot(ctx)) {
		return EPERM;
	}

	/*
	 * Enforce user authorization for snapshot modification operations,
	 * or if trying to root from snapshot.
	 */
	if (uap->op != SNAPSHOT_OP_MOUNT) {
		vnode_t dvp = NULLVP;
		vnode_t devvp = NULLVP;
		mount_t mp;

		error = vnode_getfromfd(ctx, uap->dirfd, &dvp);
		if (error) {
			return error;
		}
		mp = vnode_mount(dvp);
		devvp = mp->mnt_devvp;

		/* get an iocount on devvp */
		if (devvp == NULLVP) {
			/* no cached device vnode; look it up by the mount's from-name */
			error = vnode_lookup(mp->mnt_vfsstat.f_mntfromname, 0, &devvp, ctx);
			/* for mounts which arent block devices */
			if (error == ENOENT) {
				error = ENXIO;
			}
		} else {
			error = vnode_getwithref(devvp);
		}

		if (error) {
			vnode_put(dvp);
			return error;
		}

		/*
		 * Permit the operation if ANY of: caller is superuser, caller
		 * may write the underlying device, or caller holds the
		 * user-snapshot entitlement.  All three must fail for EPERM.
		 */
		if ((vfs_context_issuser(ctx) == 0) &&
		    (vnode_authorize(devvp, NULL, KAUTH_VNODE_WRITE_DATA, ctx) != 0) &&
		    (!IOTaskHasEntitlement(vfs_context_task(ctx), "com.apple.private.vfs.snapshot.user"))) {
			error = EPERM;
		}
		vnode_put(dvp);
		vnode_put(devvp);

		if (error) {
			return error;
		}
	}

	/* Dispatch to the per-operation helper. */
	switch (uap->op) {
	case SNAPSHOT_OP_CREATE:
		error = snapshot_create(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
	case SNAPSHOT_OP_DELETE:
		error = snapshot_delete(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
	case SNAPSHOT_OP_RENAME:
		/* name1 = old snapshot name, name2 = new snapshot name */
		error = snapshot_rename(uap->dirfd, uap->name1, uap->name2,
		    uap->flags, ctx);
		break;
	case SNAPSHOT_OP_MOUNT:
		/* name1 = snapshot name, name2 = directory to cover */
		error = snapshot_mount(uap->dirfd, uap->name1, uap->name2,
		    uap->data, uap->flags, ctx);
		break;
	case SNAPSHOT_OP_REVERT:
		error = snapshot_revert(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
#if CONFIG_MNT_ROOTSNAP
	case SNAPSHOT_OP_ROOT:
		error = snapshot_root(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
#endif /* CONFIG_MNT_ROOTSNAP */
	default:
		error = ENOSYS;
	}

	return error;
}
15326