1 /*
2 * Copyright (c) 1995-2022 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * Copyright (c) 1989, 1993
30 * The Regents of the University of California. All rights reserved.
31 * (c) UNIX System Laboratories, Inc.
32 * All or some portions of this file are derived from material licensed
33 * to the University of California by American Telephone and Telegraph
34 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
35 * the permission of UNIX System Laboratories, Inc.
36 *
37 * Redistribution and use in source and binary forms, with or without
38 * modification, are permitted provided that the following conditions
39 * are met:
40 * 1. Redistributions of source code must retain the above copyright
41 * notice, this list of conditions and the following disclaimer.
42 * 2. Redistributions in binary form must reproduce the above copyright
43 * notice, this list of conditions and the following disclaimer in the
44 * documentation and/or other materials provided with the distribution.
45 * 3. All advertising materials mentioning features or use of this software
46 * must display the following acknowledgement:
47 * This product includes software developed by the University of
48 * California, Berkeley and its contributors.
49 * 4. Neither the name of the University nor the names of its contributors
50 * may be used to endorse or promote products derived from this software
51 * without specific prior written permission.
52 *
53 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63 * SUCH DAMAGE.
64 *
65 * @(#)vfs_syscalls.c 8.41 (Berkeley) 6/15/95
66 */
67 /*
68 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
69 * support for mandatory and extensible security protections. This notice
70 * is included in support of clause 2.2 (b) of the Apple Public License,
71 * Version 2.0.
72 */
73
74 #include <sys/param.h>
75 #include <sys/systm.h>
76 #include <sys/syslimits.h> /* For MAXLONGPATHLEN */
77 #include <sys/namei.h>
78 #include <sys/filedesc.h>
79 #include <sys/kernel.h>
80 #include <sys/file_internal.h>
81 #include <sys/stat.h>
82 #include <sys/vnode_internal.h>
83 #include <sys/mount_internal.h>
84 #include <sys/proc_internal.h>
85 #include <sys/kauth.h>
86 #include <sys/uio_internal.h>
87 #include <kern/kalloc.h>
88 #include <sys/mman.h>
89 #include <sys/dirent.h>
90 #include <sys/attr.h>
91 #include <sys/sysctl.h>
92 #include <sys/ubc.h>
93 #include <sys/quota.h>
94 #include <sys/kdebug.h>
95 #include <sys/fsevents.h>
96 #include <sys/imgsrc.h>
97 #include <sys/sysproto.h>
98 #include <sys/sysctl.h>
99 #include <sys/xattr.h>
100 #include <sys/fcntl.h>
101 #include <sys/stdio.h>
102 #include <sys/fsctl.h>
103 #include <sys/ubc_internal.h>
104 #include <sys/disk.h>
105 #include <sys/content_protection.h>
106 #include <sys/clonefile.h>
107 #include <sys/snapshot.h>
108 #include <sys/priv.h>
109 #include <sys/fsgetpath.h>
110 #include <machine/cons.h>
111 #include <machine/limits.h>
112 #include <miscfs/specfs/specdev.h>
113
114 #include <vfs/vfs_disk_conditioner.h>
115 #if CONFIG_EXCLAVES
116 #include <vfs/vfs_exclave_fs.h>
117 #endif
118
119 #include <security/audit/audit.h>
120 #include <bsm/audit_kevents.h>
121
122 #include <mach/mach_types.h>
123 #include <kern/kern_types.h>
124 #include <kern/kalloc.h>
125 #include <kern/task.h>
126
127 #include <vm/vm_pageout.h>
128 #include <vm/vm_protos.h>
129 #include <vm/memory_object_xnu.h>
130
131 #include <libkern/OSAtomic.h>
132 #include <os/atomic_private.h>
133 #include <pexpert/pexpert.h>
134 #include <IOKit/IOBSD.h>
135
136 // deps for MIG call
137 #include <kern/host.h>
138 #include <kern/ipc_misc.h>
139 #include <mach/host_priv.h>
140 #include <mach/vfs_nspace.h>
141 #include <os/log.h>
142
143 #include <nfs/nfs_conf.h>
144
145 #if ROUTEFS
146 #include <miscfs/routefs/routefs.h>
147 #endif /* ROUTEFS */
148
149 #if CONFIG_MACF
150 #include <security/mac.h>
151 #include <security/mac_framework.h>
152 #endif
153
154 #if CONFIG_FSE
155 #define GET_PATH(x) \
156 ((x) = get_pathbuff())
157 #define RELEASE_PATH(x) \
158 release_pathbuff(x)
159 #else
160 #define GET_PATH(x) \
161 ((x) = zalloc(ZV_NAMEI))
162 #define RELEASE_PATH(x) \
163 zfree(ZV_NAMEI, x)
164 #endif /* CONFIG_FSE */
165
166 #ifndef HFS_GET_BOOT_INFO
167 #define HFS_GET_BOOT_INFO (FCNTL_FS_SPECIFIC_BASE + 0x00004)
168 #endif
169
170 #ifndef HFS_SET_BOOT_INFO
171 #define HFS_SET_BOOT_INFO (FCNTL_FS_SPECIFIC_BASE + 0x00005)
172 #endif
173
174 #ifndef APFSIOC_REVERT_TO_SNAPSHOT
175 #define APFSIOC_REVERT_TO_SNAPSHOT _IOW('J', 1, u_int64_t)
176 #endif
177
178 extern void disk_conditioner_unmount(mount_t mp);
179
180 /* struct for checkdirs iteration */
181 struct cdirargs {
182 vnode_t olddp;
183 vnode_t newdp;
184 };
185 /* callback for checkdirs iteration */
186 static int checkdirs_callback(proc_t p, void * arg);
187
188 static int change_dir(struct nameidata *ndp, vfs_context_t ctx);
189 static int checkdirs(vnode_t olddp, vfs_context_t ctx);
190 void enablequotas(struct mount *mp, vfs_context_t ctx);
191 static int getfsstat_callback(mount_t mp, void * arg);
192 static int getutimes(user_addr_t usrtvp, struct timespec *tsp);
193 static int setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts, int nullflag);
194 static int sync_callback(mount_t, void *);
195 static int munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
196 user_addr_t bufp, int *sizep, boolean_t is_64_bit,
197 boolean_t partial_copy);
198 static int fsync_common(proc_t p, struct fsync_args *uap, int flags);
199 static int mount_common(const char *fstypename, vnode_t pvp, vnode_t vp,
200 struct componentname *cnp, user_addr_t fsmountargs,
201 int flags, uint32_t internal_flags, char *labelstr, vfs_context_t ctx);
202 void vfs_notify_mount(vnode_t pdvp);
203
204 int prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, uint32_t internal_flags);
205
206 struct fd_vn_data * fg_vn_data_alloc(void);
207
208 /*
209 * Max retries for ENOENT returns from vn_authorize_{rmdir, unlink, rename}
210 * Concurrent lookups (or lookups by ids) on hard links can cause the
211 * vn_getpath (which does not re-enter the filesystem as vn_getpath_fsenter
212 * does) to return ENOENT as the path cannot be returned from the name cache
213 * alone. We have no option but to retry and hope to get one namei->reverse path
214 * generation done without an intervening lookup, lookup by id on the hard link
215 * item. This is only an issue for MAC hooks which cannot reenter the filesystem
216 * which currently are the MAC hooks for rename, unlink and rmdir.
217 */
218 #define MAX_AUTHORIZE_ENOENT_RETRIES 1024
219
220 /* Max retry limit for rename due to vnode recycling. */
221 #define MAX_RENAME_ERECYCLE_RETRIES 1024
222
223 #define MAX_LINK_ENOENT_RETRIES 1024
224
225 /* Max retries for concurrent mounts on the same covered vnode. */
226 #define MAX_MOUNT_RETRIES 10
227
228 static int rmdirat_internal(vfs_context_t, int, user_addr_t, enum uio_seg,
229 int unlink_flags);
230
231 #ifdef CONFIG_IMGSRC_ACCESS
232 static int authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx);
233 static int place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx);
234 static void undo_place_on_covered_vp(mount_t mp, vnode_t vp);
235 static int mount_begin_update(mount_t mp, vfs_context_t ctx, int flags);
236 static void mount_end_update(mount_t mp);
237 static int relocate_imageboot_source(vnode_t pvp, vnode_t vp, struct componentname *cnp, const char *fsname, vfs_context_t ctx, boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index);
238 #endif /* CONFIG_IMGSRC_ACCESS */
239
240 //snapshot functions
241 #if CONFIG_MNT_ROOTSNAP
242 static int __attribute__ ((noinline)) snapshot_root(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx);
243 #else
244 static int __attribute__ ((noinline)) snapshot_root(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx) __attribute__((unused));
245 #endif
246
247 __private_extern__
248 int sync_internal(void);
249
250 __private_extern__
251 int unlink1(vfs_context_t, vnode_t, user_addr_t, enum uio_seg, int);
252
253 static LCK_GRP_DECLARE(fd_vn_lck_grp, "fd_vnode_data");
254 static LCK_ATTR_DECLARE(fd_vn_lck_attr, 0, 0);
255
256 /* vars for sync mutex */
257 static LCK_GRP_DECLARE(sync_mtx_lck_grp, "sync thread");
258 static LCK_MTX_DECLARE(sync_mtx_lck, &sync_mtx_lck_grp);
259
260 extern lck_rw_t rootvnode_rw_lock;
261
262 VFS_SMR_DECLARE;
263 extern uint32_t nc_smr_enabled;
264
265 /*
266 * incremented each time a mount or unmount operation occurs
267 * used to invalidate the cached value of the rootvp in the
268 * mount structure utilized by cache_lookup_path
269 */
270 uint32_t mount_generation = 0;
271
272 /* counts number of mount and unmount operations */
273 unsigned int vfs_nummntops = 0;
274
275 /* system-wide, per-boot unique mount ID */
276 static _Atomic uint64_t mount_unique_id = 1;
277
278 extern const struct fileops vnops;
279 #if CONFIG_APPLEDOUBLE
280 extern errno_t rmdir_remove_orphaned_appleDouble(vnode_t, vfs_context_t, int *);
281 #endif /* CONFIG_APPLEDOUBLE */
282
283
284 /*
285 * Virtual File System System Calls
286 */
287
288 /*
289 * Private in-kernel mounting spi (specific use-cases only)
290 */
291 boolean_t
vfs_iskernelmount(mount_t mp)292 vfs_iskernelmount(mount_t mp)
293 {
294 return (mp->mnt_kern_flag & MNTK_KERNEL_MOUNT) ? TRUE : FALSE;
295 }
296
/*
 * kernel_mount: private in-kernel mount SPI (specific use-cases only).
 *
 * Mounts filesystem `fstype` on the vnode identified either by the supplied
 * `vp`/`pvp` pair or, when vp == NULLVP, by a namei() lookup of `path`.
 *
 * Parameters:
 *	fstype		filesystem type name (its vfs name)
 *	pvp		parent of the vnode to be covered (ignored/overwritten
 *			when vp == NULLVP and the lookup supplies it)
 *	vp		vnode to be covered, or NULLVP to look up `path`
 *	path		kernel-space path to the mount-on point
 *	data		filesystem-specific mount arguments
 *	datalen		unused
 *	syscall_flags	generic MNT_* mount flags
 *	kern_flags	KERNEL_MOUNT_* flags; sanitized against
 *			KERNEL_MOUNT_SANITIZE_MASK before use
 *	ctx		context of the mount
 *
 * Returns: 0 on success, errno on failure.
 */
__private_extern__
int
kernel_mount(const char *fstype, vnode_t pvp, vnode_t vp, const char *path,
    void *data, __unused size_t datalen, int syscall_flags, uint32_t kern_flags,
    vfs_context_t ctx)
{
	struct nameidata nd;
	boolean_t did_namei;
	int error;

	NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(path), ctx);
	if (syscall_flags & MNT_NOFOLLOW) {
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}

	/* Restrict callers to the kernel-mount flags we know how to handle. */
	kern_flags &= KERNEL_MOUNT_SANITIZE_MASK;

	/*
	 * Get the vnode to be covered if it's not supplied
	 */
	if (vp == NULLVP) {
		error = namei(&nd);
		if (error) {
			if (kern_flags & (KERNEL_MOUNT_SNAPSHOT | KERNEL_MOUNT_VOLBYROLE_MASK)) {
				printf("failed to locate mount-on path: %s ", path);
			}
			return error;
		}
		/* namei() returned iocounted vnodes; released below. */
		vp = nd.ni_vp;
		pvp = nd.ni_dvp;
		did_namei = TRUE;
	} else {
		char *pnbuf = CAST_DOWN(char *, path);

		/*
		 * Caller supplied the covered vnode; fabricate just enough of
		 * the componentname from `path` for mount_common() to use.
		 */
		nd.ni_cnd.cn_pnbuf = pnbuf;
		nd.ni_cnd.cn_pnlen = (int)(strlen(pnbuf) + 1);
		did_namei = FALSE;
	}

	/* Mark this as an in-kernel mount for mount_common(). */
	kern_flags |= KERNEL_MOUNT_KMOUNT;
	error = mount_common(fstype, pvp, vp, &nd.ni_cnd, CAST_USER_ADDR_T(data),
	    syscall_flags, kern_flags, NULL, ctx);

	if (did_namei) {
		/* Drop the iocounts obtained by namei() above. */
		vnode_put(vp);
		vnode_put(pvp);
		nameidone(&nd);
	}

	return error;
}
349
350 int
vfs_mount_at_path(const char * fstype,const char * path,vnode_t pvp,vnode_t vp,void * data,size_t datalen,int mnt_flags,int flags)351 vfs_mount_at_path(const char *fstype, const char *path,
352 vnode_t pvp, vnode_t vp, void *data, size_t datalen,
353 int mnt_flags, int flags)
354 {
355 int syscall_flags = MNT_AUTOMOUNTED | mnt_flags;
356 int error, km_flags = 0;
357 vfs_context_t ctx = (flags & VFS_MOUNT_FLAG_CURRENT_CONTEXT) ? vfs_context_current() : vfs_context_kernel();
358
359 /*
360 * This call is currently restricted to specific use cases.
361 */
362 if ((strcmp(fstype, "lifs") != 0) && (strcmp(fstype, "nfs") != 0)) {
363 return ENOTSUP;
364 }
365
366 #if !defined(XNU_TARGET_OS_OSX)
367 if (strcmp(fstype, "lifs") == 0) {
368 syscall_flags |= MNT_NOEXEC;
369 }
370 #endif
371
372 if (flags & VFS_MOUNT_FLAG_NOAUTH) {
373 km_flags |= KERNEL_MOUNT_NOAUTH;
374 }
375 if (flags & VFS_MOUNT_FLAG_PERMIT_UNMOUNT) {
376 km_flags |= KERNEL_MOUNT_PERMIT_UNMOUNT;
377 }
378
379 error = kernel_mount(fstype, pvp, vp, path, data, datalen,
380 syscall_flags, km_flags, ctx);
381 if (error) {
382 printf("%s: mount on %s failed, error %d\n", __func__, path,
383 error);
384 }
385
386 return error;
387 }
388
389 /*
390 * Mount a file system.
391 */
392 /* ARGSUSED */
393 int
mount(proc_t p,struct mount_args * uap,__unused int32_t * retval)394 mount(proc_t p, struct mount_args *uap, __unused int32_t *retval)
395 {
396 struct __mac_mount_args muap;
397
398 muap.type = uap->type;
399 muap.path = uap->path;
400 muap.flags = uap->flags;
401 muap.data = uap->data;
402 muap.mac_p = USER_ADDR_NULL;
403 return __mac_mount(p, &muap, retval);
404 }
405
/*
 * fmount(2): mount a file system over an open file descriptor.
 *
 * Like mount(2), but the mount-on point is identified by `uap->fd` rather
 * than a path.  The vnode's parent and current path are recovered from the
 * fd so mount_common() can be used unchanged.
 *
 * Parameters:
 *	p	(unused) calling process
 *	uap	fd, fstype name, flags, and fs-specific mount data
 *	retval	(ignored)
 *
 * Returns: 0 on success, errno on failure.  MNT_IMGSRC_BY_INDEX,
 * MNT_ROOTFS and MNT_UNION are not supported through this entry point.
 */
int
fmount(__unused proc_t p, struct fmount_args *uap, __unused int32_t *retval)
{
	struct componentname cn;
	vfs_context_t ctx = vfs_context_current();
	size_t dummy = 0;
	int error;
	int flags = uap->flags;
	char fstypename[MFSNAMELEN];
	char *labelstr = NULL; /* regular mount call always sets it to NULL for __mac_mount() */
	vnode_t pvp;
	vnode_t vp;

	AUDIT_ARG(fd, uap->fd);
	AUDIT_ARG(fflags, flags);
	/* fstypename will get audited by mount_common */

	/* Sanity check the flags */
	if (flags & (MNT_IMGSRC_BY_INDEX | MNT_ROOTFS)) {
		return ENOTSUP;
	}

	if (flags & MNT_UNION) {
		return EPERM;
	}

	error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy);
	if (error) {
		return error;
	}

	/* Resolve the descriptor to its vnode; file_drop() must follow. */
	if ((error = file_vnode(uap->fd, &vp)) != 0) {
		return error;
	}

	/* Take an iocount on the vnode for the duration of the mount. */
	if ((error = vnode_getwithref(vp)) != 0) {
		file_drop(uap->fd);
		return error;
	}

	/* The mount-on vnode must have a parent (itself iocounted here). */
	pvp = vnode_getparent(vp);
	if (pvp == NULL) {
		/* A covered vnode or a filesystem root cannot be mounted on. */
		if (vp->v_mountedhere || (vp->v_flag & VROOT) != 0) {
			error = EBUSY;
		} else {
			error = EINVAL;
		}
		vnode_put(vp);
		file_drop(uap->fd);
		return error;
	}

	/* Build a componentname holding the vnode's current path. */
	memset(&cn, 0, sizeof(struct componentname));
	cn.cn_pnbuf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	cn.cn_pnlen = MAXPATHLEN;

	if ((error = vn_getpath(vp, cn.cn_pnbuf, &cn.cn_pnlen)) != 0) {
		zfree(ZV_NAMEI, cn.cn_pnbuf);
		vnode_put(pvp);
		vnode_put(vp);
		file_drop(uap->fd);
		return error;
	}

	error = mount_common(fstypename, pvp, vp, &cn, uap->data, flags, KERNEL_MOUNT_FMOUNT, labelstr, ctx);

	/* Release the pathbuf, both iocounts, and the fd reference. */
	zfree(ZV_NAMEI, cn.cn_pnbuf);
	vnode_put(pvp);
	vnode_put(vp);
	file_drop(uap->fd);

	return error;
}
479
480 #define MAX_GRAFT_METADATA_SIZE 16384 /* bytes */
481
482 /*
483 * Get the size of a graft file (a manifest or payload file).
484 * The vp should be an iocounted vnode.
485 */
486 static int
get_and_verify_graft_metadata_vp_size(vnode_t graft_vp,vfs_context_t vctx,size_t * size)487 get_and_verify_graft_metadata_vp_size(vnode_t graft_vp, vfs_context_t vctx, size_t *size)
488 {
489 struct stat64 sb = {};
490 int error;
491
492 *size = 0;
493
494 error = vn_stat(graft_vp, &sb, NULL, 1, 0, vctx);
495 if (error) {
496 return error;
497 }
498
499 if (sb.st_size == 0) {
500 error = ENODATA;
501 } else if ((size_t) sb.st_size > MAX_GRAFT_METADATA_SIZE) {
502 error = EFBIG;
503 } else {
504 *size = (size_t) sb.st_size;
505 }
506
507 return error;
508 }
509
510 /*
511 * Read in a graft file (a manifest or payload file) of size `size` into `buf`.
512 * `size` must already be validated.
513 */
514 static int
read_graft_metadata_vp(vnode_t graft_vp,vfs_context_t vctx,size_t size,void * buf)515 read_graft_metadata_vp(vnode_t graft_vp, vfs_context_t vctx, size_t size, void *buf)
516 {
517 return vn_rdwr(UIO_READ, graft_vp,
518 (caddr_t) buf, (int) size, /* offset */ 0,
519 UIO_SYSSPACE, IO_NOCACHE | IO_RAOFF | IO_UNIT,
520 vfs_context_ucred(vctx), /* resid */ NULL,
521 vfs_context_proc(vctx));
522 }
523
524 /*
525 * Convert a single graft file descriptor into a vnode, get its size (saving it to `size`),
526 * and read it into `buf`.
527 * If `path_prefix` is non-NULL, verify that the file path has that prefix.
528 */
529 static int
graft_secureboot_read_fd(int fd,vfs_context_t vctx,const char * path_prefix,size_t * size,void * buf)530 graft_secureboot_read_fd(int fd, vfs_context_t vctx, const char *path_prefix, size_t *size, void *buf)
531 {
532 vnode_t metadata_vp = NULLVP;
533 char *path = NULL;
534 int error;
535
536 // Convert this graft fd to a vnode.
537 if ((error = vnode_getfromfd(vctx, fd, &metadata_vp)) != 0) {
538 goto out;
539 }
540
541 // Verify that the vnode path starts with `path_prefix` if it was passed.
542 if (path_prefix) {
543 int len = MAXPATHLEN;
544 path = zalloc(ZV_NAMEI);
545 if ((error = vn_getpath(metadata_vp, path, &len))) {
546 goto out;
547 }
548 if (strncmp(path, path_prefix, strlen(path_prefix))) {
549 error = EINVAL;
550 goto out;
551 }
552 }
553
554 // Get (and validate) size information.
555 if ((error = get_and_verify_graft_metadata_vp_size(metadata_vp, vctx, size)) != 0) {
556 goto out;
557 }
558
559 // Read each file into the provided buffer - we must get the expected amount of bytes.
560 if ((error = read_graft_metadata_vp(metadata_vp, vctx, *size, buf)) != 0) {
561 goto out;
562 }
563
564 out:
565 if (path) {
566 zfree(ZV_NAMEI, path);
567 }
568 if (metadata_vp) {
569 vnode_put(metadata_vp);
570 metadata_vp = NULLVP;
571 }
572
573 return error;
574 }
575
576 #if XNU_TARGET_OS_OSX
577 #if defined(__arm64e__)
578 #define MOBILE_ASSET_DATA_VAULT_PATH "/System/Library/AssetsV2/manifests/"
579 #else /* x86_64 */
580 #define MOBILE_ASSET_DATA_VAULT_PATH "/System/Library/AssetsV2/"
581 #endif /* x86_64 */
582 #else /* !XNU_TARGET_OS_OSX */
583 #define MOBILE_ASSET_DATA_VAULT_PATH "/private/var/MobileAsset/AssetsV2/manifests/"
584 #endif /* !XNU_TARGET_OS_OSX */
585
586 /*
587 * Read graft file descriptors into buffers of size MAX_GRAFT_METADATA_SIZE
588 * provided in `gfs`, saving the size of data read in `gfs`.
589 */
590 static int
graft_secureboot_read_metadata(uint32_t graft_type,secure_boot_cryptex_args_t * sbc_args,vfs_context_t vctx,fsioc_graft_fs_t * gfs)591 graft_secureboot_read_metadata(uint32_t graft_type, secure_boot_cryptex_args_t *sbc_args,
592 vfs_context_t vctx, fsioc_graft_fs_t *gfs)
593 {
594 const char *manifest_path_prefix = NULL;
595 int error;
596
597 // For Mobile Asset, make sure that the manifest comes from a data vault.
598 if (graft_type == GRAFTDMG_CRYPTEX_MOBILE_ASSET) {
599 manifest_path_prefix = MOBILE_ASSET_DATA_VAULT_PATH;
600 }
601
602 // Read the authentic manifest.
603 if ((error = graft_secureboot_read_fd(sbc_args->sbc_authentic_manifest_fd, vctx,
604 manifest_path_prefix, &gfs->authentic_manifest_size, gfs->authentic_manifest))) {
605 return error;
606 }
607
608 // The user manifest is currently unused, but set its size.
609 gfs->user_manifest_size = 0;
610
611 // Read the payload.
612 if ((error = graft_secureboot_read_fd(sbc_args->sbc_payload_fd, vctx,
613 NULL, &gfs->payload_size, gfs->payload))) {
614 return error;
615 }
616
617 return 0;
618 }
619
620 /*
621 * Call into the filesystem to verify and graft a cryptex.
622 */
623 static int
graft_secureboot_cryptex(uint32_t graft_type,secure_boot_cryptex_args_t * sbc_args,vfs_context_t vctx,vnode_t cryptex_vp,vnode_t mounton_vp)624 graft_secureboot_cryptex(uint32_t graft_type, secure_boot_cryptex_args_t *sbc_args,
625 vfs_context_t vctx, vnode_t cryptex_vp, vnode_t mounton_vp)
626 {
627 fsioc_graft_fs_t gfs = {};
628 uint64_t graft_dir_ino = 0;
629 struct stat64 sb = {};
630 int error;
631
632 // Pre-flight arguments.
633 if (sbc_args->sbc_version != GRAFTDMG_SECURE_BOOT_CRYPTEX_ARGS_VERSION) {
634 // Make sure that this graft version matches what we support.
635 return ENOTSUP;
636 } else if (mounton_vp && cryptex_vp->v_mount != mounton_vp->v_mount) {
637 // For this type, cryptex VP must live on same volume as the target of graft.
638 return EXDEV;
639 } else if (mounton_vp && mounton_vp->v_type != VDIR) {
640 // We cannot graft upon non-directories.
641 return ENOTDIR;
642 } else if (cryptex_vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV) {
643 // We do not allow grafts inside disk images.
644 return ENODEV;
645 } else if (sbc_args->sbc_authentic_manifest_fd < 0 ||
646 sbc_args->sbc_payload_fd < 0) {
647 // We cannot graft without a manifest and payload.
648 return EINVAL;
649 }
650
651 if (mounton_vp) {
652 // Get the mounton's inode number.
653 error = vn_stat(mounton_vp, &sb, NULL, 1, 0, vctx);
654 if (error) {
655 return error;
656 }
657 graft_dir_ino = (uint64_t) sb.st_ino;
658 }
659
660 // Create buffers (of our maximum-defined size) to store authentication info.
661 gfs.authentic_manifest = kalloc_data(MAX_GRAFT_METADATA_SIZE, Z_WAITOK | Z_ZERO);
662 gfs.payload = kalloc_data(MAX_GRAFT_METADATA_SIZE, Z_WAITOK | Z_ZERO);
663
664 if (!gfs.authentic_manifest || !gfs.payload) {
665 error = ENOMEM;
666 goto out;
667 }
668
669 // Read our fd's into our buffers.
670 // (Note that this will set the buffer size fields in `gfs`.)
671 error = graft_secureboot_read_metadata(graft_type, sbc_args, vctx, &gfs);
672 if (error) {
673 goto out;
674 }
675
676 gfs.graft_version = FSIOC_GRAFT_VERSION;
677 gfs.graft_type = graft_type;
678 gfs.graft_4cc = sbc_args->sbc_4cc;
679 if (sbc_args->sbc_flags & SBC_PRESERVE_MOUNT) {
680 gfs.graft_flags |= FSCTL_GRAFT_PRESERVE_MOUNT;
681 }
682 if (sbc_args->sbc_flags & SBC_ALTERNATE_SHARED_REGION) {
683 gfs.graft_flags |= FSCTL_GRAFT_ALTERNATE_SHARED_REGION;
684 }
685 if (sbc_args->sbc_flags & SBC_SYSTEM_CONTENT) {
686 gfs.graft_flags |= FSCTL_GRAFT_SYSTEM_CONTENT;
687 }
688 if (sbc_args->sbc_flags & SBC_PANIC_ON_AUTHFAIL) {
689 gfs.graft_flags |= FSCTL_GRAFT_PANIC_ON_AUTHFAIL;
690 }
691 if (sbc_args->sbc_flags & SBC_STRICT_AUTH) {
692 gfs.graft_flags |= FSCTL_GRAFT_STRICT_AUTH;
693 }
694 if (sbc_args->sbc_flags & SBC_PRESERVE_GRAFT) {
695 gfs.graft_flags |= FSCTL_GRAFT_PRESERVE_GRAFT;
696 }
697 gfs.dir_ino = graft_dir_ino; // ino from mounton_vp (if not provided, the parent directory)
698
699 // Call into the FS to perform the graft (and validation).
700 error = VNOP_IOCTL(cryptex_vp, FSIOC_GRAFT_FS, (caddr_t)&gfs, 0, vctx);
701
702 out:
703 if (gfs.authentic_manifest) {
704 kfree_data(gfs.authentic_manifest, MAX_GRAFT_METADATA_SIZE);
705 gfs.authentic_manifest = NULL;
706 }
707 if (gfs.payload) {
708 kfree_data(gfs.payload, MAX_GRAFT_METADATA_SIZE);
709 gfs.payload = NULL;
710 }
711
712 return error;
713 }
714
715 #define GRAFTDMG_ENTITLEMENT "com.apple.private.vfs.graftdmg"
716
717 /*
718 * Graft a cryptex disk image (via FD) onto the appropriate mount-point
719 * { int graftdmg(int dmg_fd, const char *mountdir, uint32_t graft_type, graftdmg_args_un *gda); }
720 */
721 int
graftdmg(__unused proc_t p,struct graftdmg_args * uap,__unused int32_t * retval)722 graftdmg(__unused proc_t p, struct graftdmg_args *uap, __unused int32_t *retval)
723 {
724 int ua_dmgfd = uap->dmg_fd;
725 user_addr_t ua_mountdir = uap->mountdir;
726 uint32_t ua_grafttype = uap->graft_type;
727 user_addr_t ua_graftargs = uap->gda;
728
729 graftdmg_args_un kern_gda = {};
730 int error = 0;
731 secure_boot_cryptex_args_t *sbc_args = NULL;
732
733 vnode_t cryptex_vp = NULLVP;
734 vnode_t mounton_vp = NULLVP;
735 struct nameidata nd = {};
736 vfs_context_t ctx = vfs_context_current();
737
738 if (!IOTaskHasEntitlement(vfs_context_task(ctx), GRAFTDMG_ENTITLEMENT)) {
739 return EPERM;
740 }
741
742 error = copyin(ua_graftargs, &kern_gda, sizeof(graftdmg_args_un));
743 if (error) {
744 return error;
745 }
746
747 // Copy mount dir in, if provided.
748 if (ua_mountdir != USER_ADDR_NULL) {
749 // Acquire vnode for mount-on path
750 NDINIT(&nd, LOOKUP, OP_MOUNT, (FOLLOW | AUDITVNPATH1),
751 UIO_USERSPACE, ua_mountdir, ctx);
752
753 error = namei(&nd);
754 if (error) {
755 return error;
756 }
757 mounton_vp = nd.ni_vp;
758 }
759
760 // Convert fd to vnode.
761 error = vnode_getfromfd(ctx, ua_dmgfd, &cryptex_vp);
762 if (error) {
763 goto graftout;
764 }
765
766 if (ua_grafttype == 0 || ua_grafttype > GRAFTDMG_CRYPTEX_MAX) {
767 error = EINVAL;
768 } else {
769 sbc_args = &kern_gda.sbc_args;
770 error = graft_secureboot_cryptex(ua_grafttype, sbc_args, ctx, cryptex_vp, mounton_vp);
771 }
772
773 graftout:
774 if (cryptex_vp) {
775 vnode_put(cryptex_vp);
776 cryptex_vp = NULLVP;
777 }
778 if (mounton_vp) {
779 vnode_put(mounton_vp);
780 mounton_vp = NULLVP;
781 }
782 if (ua_mountdir != USER_ADDR_NULL) {
783 nameidone(&nd);
784 }
785
786 return error;
787 }
788
789 /*
790 * Ungraft a cryptex disk image (via mount dir FD)
791 * { int ungraftdmg(const char *mountdir, uint64_t flags); }
792 */
793 int
ungraftdmg(__unused proc_t p,struct ungraftdmg_args * uap,__unused int32_t * retval)794 ungraftdmg(__unused proc_t p, struct ungraftdmg_args *uap, __unused int32_t *retval)
795 {
796 int error = 0;
797 user_addr_t ua_mountdir = uap->mountdir;
798 fsioc_ungraft_fs_t ugfs;
799 vnode_t mounton_vp = NULLVP;
800 struct nameidata nd = {};
801 vfs_context_t ctx = vfs_context_current();
802
803 if (!IOTaskHasEntitlement(vfs_context_task(ctx), GRAFTDMG_ENTITLEMENT)) {
804 return EPERM;
805 }
806
807 if (uap->flags != 0 || ua_mountdir == USER_ADDR_NULL) {
808 return EINVAL;
809 }
810
811 ugfs.ungraft_flags = 0;
812
813 // Acquire vnode for mount-on path
814 NDINIT(&nd, LOOKUP, OP_MOUNT, (FOLLOW | AUDITVNPATH1),
815 UIO_USERSPACE, ua_mountdir, ctx);
816
817 error = namei(&nd);
818 if (error) {
819 return error;
820 }
821 mounton_vp = nd.ni_vp;
822
823 // Call into the FS to perform the ungraft
824 error = VNOP_IOCTL(mounton_vp, FSIOC_UNGRAFT_FS, (caddr_t)&ugfs, 0, ctx);
825
826 vnode_put(mounton_vp);
827 nameidone(&nd);
828
829 return error;
830 }
831
832
833 void
vfs_notify_mount(vnode_t pdvp)834 vfs_notify_mount(vnode_t pdvp)
835 {
836 vfs_event_signal(NULL, VQ_MOUNT, (intptr_t)NULL);
837 lock_vnode_and_post(pdvp, NOTE_WRITE);
838 }
839
840 /*
841 * __mac_mount:
842 * Mount a file system taking into account MAC label behavior.
843 * See mount(2) man page for more information
844 *
845 * Parameters: p Process requesting the mount
846 * uap User argument descriptor (see below)
847 * retval (ignored)
848 *
849 * Indirect: uap->type Filesystem type
850 * uap->path Path to mount
851 * uap->data Mount arguments
852 * uap->mac_p MAC info
853 * uap->flags Mount flags
854 *
855 *
856 * Returns: 0 Success
857 * !0 Not success
858 */
859 boolean_t root_fs_upgrade_try = FALSE;
860
861 #define MAX_NESTED_UNION_MOUNTS 10
862
863 int
__mac_mount(struct proc * p,register struct __mac_mount_args * uap,__unused int32_t * retval)864 __mac_mount(struct proc *p, register struct __mac_mount_args *uap, __unused int32_t *retval)
865 {
866 vnode_t pvp = NULLVP;
867 vnode_t vp = NULLVP;
868 int need_nameidone = 0;
869 vfs_context_t ctx = vfs_context_current();
870 char fstypename[MFSNAMELEN];
871 struct nameidata nd;
872 size_t dummy = 0;
873 char *labelstr = NULL;
874 size_t labelsz = 0;
875 int flags = uap->flags;
876 int error;
877 int num_retries = 0;
878 #if CONFIG_IMGSRC_ACCESS || CONFIG_MACF
879 boolean_t is_64bit = IS_64BIT_PROCESS(p);
880 #else
881 #pragma unused(p)
882 #endif
883 /*
884 * Get the fs type name from user space
885 */
886 error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy);
887 if (error) {
888 return error;
889 }
890
891 retry:
892 /*
893 * Get the vnode to be covered
894 */
895 NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
896 UIO_USERSPACE, uap->path, ctx);
897 if (flags & MNT_NOFOLLOW) {
898 nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
899 }
900 error = namei(&nd);
901 if (error) {
902 goto out;
903 }
904 need_nameidone = 1;
905 vp = nd.ni_vp;
906 pvp = nd.ni_dvp;
907
908 #ifdef CONFIG_IMGSRC_ACCESS
909 /* Mounting image source cannot be batched with other operations */
910 if (flags == MNT_IMGSRC_BY_INDEX) {
911 error = relocate_imageboot_source(pvp, vp, &nd.ni_cnd, fstypename,
912 ctx, is_64bit, uap->data, (flags == MNT_IMGSRC_BY_INDEX));
913 goto out;
914 }
915 #endif /* CONFIG_IMGSRC_ACCESS */
916
917 #if CONFIG_MACF
918 /*
919 * Get the label string (if any) from user space
920 */
921 if (uap->mac_p != USER_ADDR_NULL) {
922 struct user_mac mac;
923 size_t ulen = 0;
924
925 if (is_64bit) {
926 struct user64_mac mac64;
927 error = copyin(uap->mac_p, &mac64, sizeof(mac64));
928 mac.m_buflen = (user_size_t)mac64.m_buflen;
929 mac.m_string = (user_addr_t)mac64.m_string;
930 } else {
931 struct user32_mac mac32;
932 error = copyin(uap->mac_p, &mac32, sizeof(mac32));
933 mac.m_buflen = mac32.m_buflen;
934 mac.m_string = mac32.m_string;
935 }
936 if (error) {
937 goto out;
938 }
939 if ((mac.m_buflen > MAC_MAX_LABEL_BUF_LEN) ||
940 (mac.m_buflen < 2)) {
941 error = EINVAL;
942 goto out;
943 }
944 labelsz = mac.m_buflen;
945 labelstr = kalloc_data(labelsz, Z_WAITOK);
946 error = copyinstr(mac.m_string, labelstr, mac.m_buflen, &ulen);
947 if (error) {
948 goto out;
949 }
950 AUDIT_ARG(mac_string, labelstr);
951 }
952 #endif /* CONFIG_MACF */
953
954 AUDIT_ARG(fflags, flags);
955
956 if (flags & MNT_UNION) {
957 #if CONFIG_UNION_MOUNTS
958 mount_t mp = vp->v_mount;
959 int nested_union_mounts = 0;
960
961 name_cache_lock_shared();
962
963 /* Walk up the vnodecovered chain and check for nested union mounts. */
964 mp = (mp->mnt_vnodecovered ? mp->mnt_vnodecovered->v_mount : NULL);
965 while (mp) {
966 if (!(mp->mnt_flag & MNT_UNION)) {
967 break;
968 }
969 mp = (mp->mnt_vnodecovered ? mp->mnt_vnodecovered->v_mount : NULL);
970
971 /*
972 * Limit the max nested unon mounts to prevent stack exhaustion
973 * when calling lookup_traverse_union().
974 */
975 if (++nested_union_mounts >= MAX_NESTED_UNION_MOUNTS) {
976 error = ELOOP;
977 break;
978 }
979 }
980
981 name_cache_unlock();
982 if (error) {
983 goto out;
984 }
985 #else
986 error = EPERM;
987 goto out;
988 #endif /* CONFIG_UNION_MOUNTS */
989 }
990
991 if ((vp->v_flag & VROOT) &&
992 (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
993 #if CONFIG_UNION_MOUNTS
994 if (!(flags & MNT_UNION)) {
995 flags |= MNT_UPDATE;
996 } else {
997 /*
998 * For a union mount on '/', treat it as fresh
999 * mount instead of update.
 * Otherwise, union mounting on '/' used to panic the
1001 * system before, since mnt_vnodecovered was found to
1002 * be NULL for '/' which is required for unionlookup
1003 * after it gets ENOENT on union mount.
1004 */
1005 flags = (flags & ~(MNT_UPDATE));
1006 }
1007 #else
1008 flags |= MNT_UPDATE;
1009 #endif /* CONFIG_UNION_MOUNTS */
1010
1011 #if SECURE_KERNEL
1012 if ((flags & MNT_RDONLY) == 0) {
1013 /* Release kernels are not allowed to mount "/" as rw */
1014 error = EPERM;
1015 goto out;
1016 }
1017 #endif
1018
1019 /*
1020 * See 7392553 for more details on why this check exists.
1021 * Suffice to say: If this check is ON and something tries
1022 * to mount the rootFS RW, we'll turn off the codesign
1023 * bitmap optimization.
1024 */
1025 #if CHECK_CS_VALIDATION_BITMAP
1026 if ((flags & MNT_RDONLY) == 0) {
1027 root_fs_upgrade_try = TRUE;
1028 }
1029 #endif
1030 }
1031
1032 error = mount_common(fstypename, pvp, vp, &nd.ni_cnd, uap->data, flags, 0,
1033 labelstr, ctx);
1034
1035 out:
1036
1037 #if CONFIG_MACF
1038 kfree_data(labelstr, labelsz);
1039 #endif /* CONFIG_MACF */
1040
1041 if (vp) {
1042 vnode_put(vp);
1043 vp = NULLVP;
1044 }
1045 if (pvp) {
1046 vnode_put(pvp);
1047 pvp = NULLVP;
1048 }
1049 if (need_nameidone) {
1050 nameidone(&nd);
1051 need_nameidone = 0;
1052 }
1053
1054 if (error == EBUSY) {
1055 /* Retry the lookup and mount again due to concurrent mounts. */
1056 if (++num_retries < MAX_MOUNT_RETRIES) {
1057 goto retry;
1058 }
1059 }
1060
1061 return error;
1062 }
1063
1064 /*
1065 * common mount implementation (final stage of mounting)
1066 *
1067 * Arguments:
 * fstypename	file system type (i.e. its vfs name)
1069 * pvp parent of covered vnode
1070 * vp covered vnode
1071 * cnp component name (ie path) of covered vnode
1072 * flags generic mount flags
1073 * fsmountargs file system specific data
1074 * labelstr optional MAC label
1075 * kernelmount TRUE for mounts initiated from inside the kernel
1076 * ctx caller's context
1077 */
static int
mount_common(const char *fstypename, vnode_t pvp, vnode_t vp,
    struct componentname *cnp, user_addr_t fsmountargs, int flags, uint32_t internal_flags,
    char *labelstr, vfs_context_t ctx)
{
#if !CONFIG_MACF
#pragma unused(labelstr)
#endif
	struct vnode *devvp = NULLVP;           /* device vnode found via namei() on devpath */
	struct vnode *device_vnode = NULLVP;    /* device vnode actually handed to VFS_MOUNT() */
#if CONFIG_MACF
	struct vnode *rvp;
#endif
	struct mount *mp = NULL;
	struct vfstable *vfsp = (struct vfstable *)0;
	struct proc *p = vfs_context_proc(ctx);
	int error, flag = 0;
	bool flag_set = false;                  /* true once 'flag' holds the pre-update mnt_flag, for rollback */
	user_addr_t devpath = USER_ADDR_NULL;
	int ronly = 0;
	int mntalloc = 0;                       /* nonzero once 'mp' was allocated here and must be freed on error */
	boolean_t vfsp_ref = FALSE;             /* took a reference on vfsp->vfc_refcount */
	boolean_t is_rwlock_locked = FALSE;     /* holding mp->mnt_rwlock exclusive */
	boolean_t did_rele = FALSE;             /* devvp usecount already dropped on the error path */
	boolean_t have_usecount = FALSE;        /* holding a usecount on the covered vnode 'vp' */
	boolean_t did_set_lmount = FALSE;       /* set MNT_LMOUNT on mp; must be cleared before returning */
	boolean_t did_set_vmount = FALSE;       /* VMOUNT set on covered vp via prepare_coveredvp() */
	boolean_t kernelmount = !!(internal_flags & KERNEL_MOUNT_KMOUNT);

#if CONFIG_ROSV_STARTUP || CONFIG_MOUNT_VM || CONFIG_BASESYSTEMROOT
	/* Check for mutually-exclusive flag bits */
	uint32_t checkflags = (internal_flags & (KERNEL_MOUNT_VOLBYROLE_MASK | KERNEL_MOUNT_BASESYSTEMROOT));
	int bitcount = 0;
	/* Kernighan popcount: each iteration clears the lowest set bit. */
	while (checkflags != 0) {
		checkflags &= (checkflags - 1);
		bitcount++;
	}

	if (bitcount > 1) {
		//not allowed to request multiple mount-by-role flags
		error = EINVAL;
		goto out1;
	}
#endif

	/*
	 * Process an update for an existing mount
	 */
	if (flags & MNT_UPDATE) {
		/* Updates may only target a mount's root vnode. */
		if ((vp->v_flag & VROOT) == 0) {
			error = EINVAL;
			goto out1;
		}
		mp = vp->v_mount;

		/* if unmount or mount in progress, return error */
		mount_lock_spin(mp);
		if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
			mount_unlock(mp);
			error = EBUSY;
			goto out1;
		}
		mp->mnt_lflag |= MNT_LMOUNT;
		did_set_lmount = TRUE;
		mount_unlock(mp);
		lck_rw_lock_exclusive(&mp->mnt_rwlock);
		is_rwlock_locked = TRUE;
		/*
		 * We only allow the filesystem to be reloaded if it
		 * is currently mounted read-only.
		 */
		if ((flags & MNT_RELOAD) &&
		    ((mp->mnt_flag & MNT_RDONLY) == 0)) {
			error = ENOTSUP;
			goto out1;
		}

		/*
		 * If content protection is enabled, update mounts are not
		 * allowed to turn it off.
		 */
		if ((mp->mnt_flag & MNT_CPROTECT) &&
		    ((flags & MNT_CPROTECT) == 0)) {
			error = EINVAL;
			goto out1;
		}

		/*
		 * can't turn off MNT_REMOVABLE either but it may be an unexpected
		 * failure to return an error for this so we'll just silently
		 * add it if it is not passed in.
		 */
		if ((mp->mnt_flag & MNT_REMOVABLE) &&
		    ((flags & MNT_REMOVABLE) == 0)) {
			flags |= MNT_REMOVABLE;
		}

		/* Can't downgrade the backer of the root FS */
		if ((mp->mnt_kern_flag & MNTK_BACKS_ROOT) &&
		    (!vfs_isrdonly(mp)) && (flags & MNT_RDONLY)) {
			error = ENOTSUP;
			goto out1;
		}

		/*
		 * Only root, or the user that did the original mount is
		 * permitted to update it.
		 */
		if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
		    (error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
			goto out1;
		}
#if CONFIG_MACF
		error = mac_mount_check_remount(ctx, mp, flags);
		if (error != 0) {
			goto out1;
		}
#endif
		/*
		 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV,
		 * and MNT_NOEXEC if mount point is already MNT_NOEXEC.
		 */
		if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
			flags |= MNT_NOSUID | MNT_NODEV;
			if (mp->mnt_flag & MNT_NOEXEC) {
				flags |= MNT_NOEXEC;
			}
		}
		/* Snapshot current flags so a failed update can restore them (see out1/exit paths). */
		flag = mp->mnt_flag;
		flag_set = true;

		mp->mnt_flag |= flags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE);

		vfsp = mp->mnt_vtable;
		goto update;
	} // MNT_UPDATE

	/*
	 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV, and
	 * MNT_NOEXEC if mount point is already MNT_NOEXEC.
	 */
	if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
		flags |= MNT_NOSUID | MNT_NODEV;
		if (vp->v_mount->mnt_flag & MNT_NOEXEC) {
			flags |= MNT_NOEXEC;
		}
	}

	/* XXXAUDIT: Should we capture the type on the error path as well? */
	/* XXX cast-away const (audit_arg_text() does not modify its input) */
	AUDIT_ARG(text, (char *)(uintptr_t)fstypename);
	/* Look up the registered vfstable entry for the requested fs type. */
	mount_list_lock();
	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
		if (!strncmp(vfsp->vfc_name, fstypename, MFSNAMELEN)) {
			vfsp->vfc_refcount++;
			vfsp_ref = TRUE;
			break;
		}
	}
	mount_list_unlock();
	if (vfsp == NULL) {
		error = ENODEV;
		goto out1;
	}

	/*
	 * VFC_VFSLOCALARGS is not currently supported for kernel mounts,
	 * except in ROSV configs and for the initial BaseSystem root.
	 */
	if (kernelmount && (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) &&
	    ((internal_flags & KERNEL_MOUNT_VOLBYROLE_MASK) == 0) &&
	    ((internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) == 0)) {
		error = EINVAL; /* unsupported request */
		goto out1;
	}

	error = prepare_coveredvp(vp, ctx, cnp, fstypename, internal_flags);
	if (error != 0) {
		goto out1;
	}

	/*
	 * Upon successful return of prepare_coveredvp(), VMOUNT is set for the covered vp.
	 */
	did_set_vmount = TRUE;

	/*
	 * Allocate and initialize the filesystem (mount_t)
	 */
	mp = zalloc_flags(mount_zone, Z_WAITOK | Z_ZERO);
	mntalloc = 1;

	/* Initialize the default IO constraints */
	mp->mnt_maxreadcnt = mp->mnt_maxwritecnt = MAXPHYS;
	mp->mnt_segreadcnt = mp->mnt_segwritecnt = 32;
	mp->mnt_maxsegreadsize = mp->mnt_maxreadcnt;
	mp->mnt_maxsegwritesize = mp->mnt_maxwritecnt;
	mp->mnt_devblocksize = DEV_BSIZE;
	mp->mnt_alignmentmask = PAGE_MASK;
	mp->mnt_ioqueue_depth = MNT_DEFAULT_IOQUEUE_DEPTH;
	mp->mnt_ioscale = 1;
	mp->mnt_ioflags = 0;
	mp->mnt_realrootvp = NULLVP;
	mp->mnt_authcache_ttl = CACHED_LOOKUP_RIGHT_TTL;

	mp->mnt_lflag |= MNT_LMOUNT;
	did_set_lmount = TRUE;

	TAILQ_INIT(&mp->mnt_vnodelist);
	TAILQ_INIT(&mp->mnt_workerqueue);
	TAILQ_INIT(&mp->mnt_newvnodes);
	mount_lock_init(mp);
	lck_rw_lock_exclusive(&mp->mnt_rwlock);
	is_rwlock_locked = TRUE;
	mp->mnt_op = vfsp->vfc_vfsops;
	mp->mnt_vtable = vfsp;
	//mp->mnt_stat.f_type = vfsp->vfc_typenum;
	mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
	strlcpy(mp->mnt_vfsstat.f_fstypename, vfsp->vfc_name, MFSTYPENAMELEN);
	/* Prefer the resolved path of the covered vnode; fall back to the caller's pathname buffer. */
	do {
		size_t pathlen = MAXPATHLEN;

		if (vn_getpath_ext(vp, pvp, mp->mnt_vfsstat.f_mntonname, &pathlen, VN_GETPATH_FSENTER)) {
			strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);
		}
	} while (0);
	mp->mnt_vnodecovered = vp;
	mp->mnt_vfsstat.f_owner = kauth_cred_getuid(vfs_context_ucred(ctx));
	mp->mnt_throttle_mask = LOWPRI_MAX_NUM_DEV - 1;
	mp->mnt_devbsdunit = 0;
	mp->mnt_mount_id = os_atomic_inc_orig(&mount_unique_id, relaxed);

	/* XXX 3762912 hack to support HFS filesystem 'owner' - filesystem may update later */
	vfs_setowner(mp, KAUTH_UID_NONE, KAUTH_GID_NONE);

	if (kernelmount) {
		mp->mnt_kern_flag |= MNTK_KERNEL_MOUNT;
	}
	if ((internal_flags & KERNEL_MOUNT_PERMIT_UNMOUNT) != 0) {
		mp->mnt_kern_flag |= MNTK_PERMIT_UNMOUNT;
	}

	if (KERNEL_MOUNT_DEVFS & internal_flags) {
		// kernel mounted devfs
		mp->mnt_kern_flag |= MNTK_SYSTEM;
	}

	/*
	 * Fresh mounts fall through to here; MNT_UPDATE requests jump here
	 * with 'mp' being the existing mount and mnt_rwlock held exclusive.
	 */
update:

	/*
	 * Set the mount level flags.
	 */
	if (flags & MNT_RDONLY) {
		mp->mnt_flag |= MNT_RDONLY;
	} else if (mp->mnt_flag & MNT_RDONLY) {
		// disallow read/write upgrades of file systems that
		// had the TYPENAME_OVERRIDE feature set.
		if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
			error = EPERM;
			goto out1;
		}
		mp->mnt_kern_flag |= MNTK_WANTRDWR;
	}
	mp->mnt_flag &= ~(MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
	    MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
	    MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
	    MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME | MNT_STRICTATIME |
	    MNT_QUARANTINE | MNT_CPROTECT);

#if SECURE_KERNEL
#if !CONFIG_MNT_SUID
	/*
	 * On release builds of iOS based platforms, always enforce NOSUID on
	 * all mounts. We do this here because we can catch update mounts as well as
	 * non-update mounts in this case.
	 */
	mp->mnt_flag |= (MNT_NOSUID);
#endif
#endif

	mp->mnt_flag |= flags & (MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
	    MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
	    MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
	    MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME | MNT_STRICTATIME |
	    MNT_QUARANTINE | MNT_CPROTECT);

#if CONFIG_MACF
	if (flags & MNT_MULTILABEL) {
		if (vfsp->vfc_vfsflags & VFC_VFSNOMACLABEL) {
			error = EINVAL;
			goto out1;
		}
		mp->mnt_flag |= MNT_MULTILABEL;
	}
#endif
	/*
	 * Process device path for local file systems if requested.
	 *
	 * Snapshot and mount-by-role mounts do not use this path; they are
	 * passing other opaque data in the device path field.
	 *
	 * Basesystemroot mounts pass a device path to be resolved here,
	 * but it's just a char * already inside the kernel, which
	 * kernel_mount() shoved into a user_addr_t to call us. So for such
	 * mounts we must skip copyin (both of the address and of the string
	 * (in NDINIT).
	 */
	if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS &&
	    !(internal_flags & (KERNEL_MOUNT_SNAPSHOT | KERNEL_MOUNT_VOLBYROLE_MASK))) {
		boolean_t do_copyin_devpath = true;
#if CONFIG_BASESYSTEMROOT
		if (internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) {
			// KERNEL_MOUNT_BASESYSTEMROOT implies subtle behavior worth noting:
			// We have been passed fsmountargs, which is typed as a user_addr_t,
			// but is actually a char ** pointing to a (kernelspace) string.
			// We manually unpack it with a series of casts and dereferences
			// that reverses what was done just above us on the stack in
			// imageboot_pivot_image().
			// After retrieving the path to the dev node (which we will NDINIT
			// in a moment), we pass NULL fsmountargs on to the filesystem.
			_Static_assert(sizeof(char **) == sizeof(fsmountargs), "fsmountargs should fit a (kernel) address");
			char **devnamepp = (char **)fsmountargs;
			char *devnamep = *devnamepp;
			devpath = CAST_USER_ADDR_T(devnamep);
			do_copyin_devpath = false;
			fsmountargs = USER_ADDR_NULL;

			//Now that we have a mp, denote that this mount is for the basesystem.
			mp->mnt_supl_kern_flag |= MNTK_SUPL_BASESYSTEM;
		}
#endif // CONFIG_BASESYSTEMROOT

		if (do_copyin_devpath) {
			if (vfs_context_is64bit(ctx)) {
				if ((error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath)))) {
					goto out1;
				}
				fsmountargs += sizeof(devpath);
			} else {
				user32_addr_t tmp;
				if ((error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp)))) {
					goto out1;
				}
				/* munge into LP64 addr */
				devpath = CAST_USER_ADDR_T(tmp);
				fsmountargs += sizeof(tmp);
			}
		}

		/* Lookup device and authorize access to it */
		if ((devpath)) {
			struct nameidata nd;

			enum uio_seg seg = UIO_USERSPACE;
#if CONFIG_BASESYSTEMROOT
			if (internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) {
				/* devpath is a kernel string for basesystem mounts; skip user copyin. */
				seg = UIO_SYSSPACE;
			}
#endif // CONFIG_BASESYSTEMROOT

			NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW, seg, devpath, ctx);
			if (flags & MNT_NOFOLLOW) {
				nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
			}
			if ((error = namei(&nd))) {
				goto out1;
			}

			devvp = nd.ni_vp;

			/* The mount device must be a block special file with a valid major number. */
			if (devvp->v_type != VBLK) {
				error = ENOTBLK;
				nameidone(&nd);
				goto out2;
			}
			if (major(devvp->v_rdev) >= nblkdev) {
				error = ENXIO;
				nameidone(&nd);
				goto out2;
			}
			/*
			 * If mount by non-root, then verify that user has necessary
			 * permissions on the device.
			 */
			if (suser(vfs_context_ucred(ctx), NULL) != 0) {
				kauth_action_t accessmode = KAUTH_VNODE_READ_DATA;

				if ((mp->mnt_flag & MNT_RDONLY) == 0) {
					accessmode |= KAUTH_VNODE_WRITE_DATA;
				}
				if ((error = vnode_authorize(devvp, NULL, accessmode, ctx)) != 0) {
					nameidone(&nd);
					goto out2;
				}
			}

			strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);
			nameidone(&nd);
		}
		/* On first mount, preflight and open device */
		if (devpath && ((flags & MNT_UPDATE) == 0)) {
			if ((error = vnode_ref(devvp))) {
				goto out2;
			}
			/*
			 * Disallow multiple mounts of the same device.
			 * Disallow mounting of a device that is currently in use
			 * (except for root, which might share swap device for miniroot).
			 * Flush out any old buffers remaining from a previous use.
			 */
			if ((error = vfs_setmounting(devvp))) {
				vnode_rele(devvp);
				goto out2;
			}

			if (vcount(devvp) > 1 && !(vfs_flags(mp) & MNT_ROOTFS)) {
				error = EBUSY;
				goto out3;
			}
			if ((error = VNOP_FSYNC(devvp, MNT_WAIT, ctx))) {
				error = ENOTBLK;
				goto out3;
			}
			if ((error = buf_invalidateblks(devvp, BUF_WRITE_DATA, 0, 0))) {
				goto out3;
			}

			ronly = (mp->mnt_flag & MNT_RDONLY) != 0;
#if CONFIG_MACF
			error = mac_vnode_check_open(ctx,
			    devvp,
			    ronly ? FREAD : FREAD | FWRITE);
			if (error) {
				goto out3;
			}
#endif /* MAC */
			if ((error = VNOP_OPEN(devvp, ronly ? FREAD : FREAD | FWRITE, ctx))) {
				goto out3;
			}

			mp->mnt_devvp = devvp;
			device_vnode = devvp;
		} else if ((mp->mnt_flag & MNT_RDONLY) &&
		    (mp->mnt_kern_flag & MNTK_WANTRDWR) &&
		    (device_vnode = mp->mnt_devvp)) {
			dev_t dev;
			int maj;
			/*
			 * If upgrade to read-write by non-root, then verify
			 * that user has necessary permissions on the device.
			 */
			vnode_getalways(device_vnode);

			if (suser(vfs_context_ucred(ctx), NULL) &&
			    (error = vnode_authorize(device_vnode, NULL,
			    KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA,
			    ctx)) != 0) {
				vnode_put(device_vnode);
				goto out2;
			}

			/* Tell the device that we're upgrading */
			dev = (dev_t)device_vnode->v_rdev;
			maj = major(dev);

			if ((u_int)maj >= (u_int)nblkdev) {
				panic("Volume mounted on a device with invalid major number.");
			}

			error = bdevsw[maj].d_open(dev, FREAD | FWRITE, S_IFBLK, p);
			vnode_put(device_vnode);
			/* NULL so the exit paths don't treat this as a freshly-opened device. */
			device_vnode = NULLVP;
			if (error != 0) {
				goto out2;
			}
		}
	} // localargs && !(snapshot | data | vm)

#if CONFIG_MACF
	if ((flags & MNT_UPDATE) == 0) {
		mac_mount_label_init(mp);
		mac_mount_label_associate(ctx, mp);
	}
	if (labelstr) {
		if ((flags & MNT_UPDATE) != 0) {
			error = mac_mount_check_label_update(ctx, mp);
			if (error != 0) {
				goto out3;
			}
		}
	}
#endif
	/*
	 * Mount the filesystem. We already asserted that internal_flags
	 * cannot have more than one mount-by-role bit set.
	 */
	if (internal_flags & KERNEL_MOUNT_SNAPSHOT) {
		error = VFS_IOCTL(mp, VFSIOC_MOUNT_SNAPSHOT,
		    (caddr_t)fsmountargs, 0, ctx);
	} else if (internal_flags & KERNEL_MOUNT_DATAVOL) {
#if CONFIG_ROSV_STARTUP
		/* For role mounts, fsmountargs is an opaque pointer to the origin mount. */
		struct mount *origin_mp = (struct mount*)fsmountargs;
		fs_role_mount_args_t frma = {origin_mp, VFS_DATA_ROLE};
		error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
		if (error) {
			printf("MOUNT-BY-ROLE (%d) failed! (%d)", VFS_DATA_ROLE, error);
		} else {
			/* Mark volume associated with system volume */
			mp->mnt_kern_flag |= MNTK_SYSTEM;

			/* Attempt to acquire the mnt_devvp and set it up */
			struct vnode *mp_devvp = NULL;
			if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
				errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
				    0, &mp_devvp, vfs_context_kernel());
				if (!lerr) {
					mp->mnt_devvp = mp_devvp;
					//vnode_lookup took an iocount, need to drop it.
					vnode_put(mp_devvp);
					// now set `device_vnode` to the devvp that was acquired.
					// this is needed in order to ensure vfs_init_io_attributes is invoked.
					// note that though the iocount above was dropped, the mount acquires
					// an implicit reference against the device.
					device_vnode = mp_devvp;
				}
			}
		}
#else
		error = EINVAL;
#endif
	} else if (internal_flags & KERNEL_MOUNT_VMVOL) {
#if CONFIG_MOUNT_VM
		struct mount *origin_mp = (struct mount*)fsmountargs;
		fs_role_mount_args_t frma = {origin_mp, VFS_VM_ROLE};
		error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
		if (error) {
			printf("MOUNT-BY-ROLE (%d) failed! (%d)", VFS_VM_ROLE, error);
		} else {
			/* Mark volume associated with system volume and a swap mount */
			mp->mnt_kern_flag |= (MNTK_SYSTEM | MNTK_SWAP_MOUNT);
			/* Attempt to acquire the mnt_devvp and set it up */
			struct vnode *mp_devvp = NULL;
			if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
				errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
				    0, &mp_devvp, vfs_context_kernel());
				if (!lerr) {
					mp->mnt_devvp = mp_devvp;
					//vnode_lookup took an iocount, need to drop it.
					vnode_put(mp_devvp);

					// now set `device_vnode` to the devvp that was acquired.
					// note that though the iocount above was dropped, the mount acquires
					// an implicit reference against the device.
					device_vnode = mp_devvp;
				}
			}
		}
#else
		error = EINVAL;
#endif
	} else if ((internal_flags & KERNEL_MOUNT_PREBOOTVOL) || (internal_flags & KERNEL_MOUNT_RECOVERYVOL)) {
#if CONFIG_MOUNT_PREBOOTRECOVERY
		struct mount *origin_mp = (struct mount*)fsmountargs;
		uint32_t mount_role = 0;
		if (internal_flags & KERNEL_MOUNT_PREBOOTVOL) {
			mount_role = VFS_PREBOOT_ROLE;
		} else if (internal_flags & KERNEL_MOUNT_RECOVERYVOL) {
			mount_role = VFS_RECOVERY_ROLE;
		}

		if (mount_role != 0) {
			fs_role_mount_args_t frma = {origin_mp, mount_role};
			error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
			if (error) {
				printf("MOUNT-BY-ROLE (%d) failed! (%d)", mount_role, error);
			} else {
				// NOT YET - need to qualify how this interacts with shutdown, ERP/ERB, etc
				/* Mark volume associated with system volume */
				//mp->mnt_kern_flag |= MNTK_SYSTEM;
				/* Attempt to acquire the mnt_devvp and set it up */
				struct vnode *mp_devvp = NULL;
				if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
					errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
					    0, &mp_devvp, vfs_context_kernel());
					if (!lerr) {
						mp->mnt_devvp = mp_devvp;
						//vnode_lookup took an iocount, need to drop it.
						vnode_put(mp_devvp);

						// now set `device_vnode` to the devvp that was acquired.
						// note that though the iocount above was dropped, the mount acquires
						// an implicit reference against the device.
						device_vnode = mp_devvp;
					}
				}
			}
		} else {
			printf("MOUNT-BY-ROLE (%d) failed - ROLE UNRECOGNIZED! (%d)", mount_role, error);
			error = EINVAL;
		}
#else
		error = EINVAL;
#endif
	} else {
		/* Ordinary mount: hand control to the filesystem implementation. */
		error = VFS_MOUNT(mp, device_vnode, fsmountargs, ctx);
	}

	if (flags & MNT_UPDATE) {
		if (mp->mnt_kern_flag & MNTK_WANTRDWR) {
			mp->mnt_flag &= ~MNT_RDONLY;
		}
		mp->mnt_flag &= ~
		    (MNT_UPDATE | MNT_RELOAD | MNT_FORCE);
		mp->mnt_kern_flag &= ~MNTK_WANTRDWR;
		if (error) {
			mp->mnt_flag = flag; /* restore flag value */
		}
		vfs_event_signal(NULL, VQ_UPDATE, (intptr_t)NULL);
		lck_rw_done(&mp->mnt_rwlock);
		is_rwlock_locked = FALSE;
		if (!error) {
			enablequotas(mp, ctx);
		}
		goto exit;
	}

	/*
	 * Put the new filesystem on the mount list after root.
	 */
	if (error == 0) {
		struct vfs_attr vfsattr;
		if (device_vnode) {
			/*
			 * cache the IO attributes for the underlying physical media...
			 * an error return indicates the underlying driver doesn't
			 * support all the queries necessary... however, reasonable
			 * defaults will have been set, so no reason to bail or care
			 *
			 * Need to do this before calling the MAC hook as it needs
			 * information from this call.
			 */
			vfs_init_io_attributes(device_vnode, mp);
		}

#if CONFIG_MACF
		error = mac_mount_check_mount_late(ctx, mp);
		if (error != 0) {
			goto out4;
		}

		if (vfs_flags(mp) & MNT_MULTILABEL) {
			error = VFS_ROOT(mp, &rvp, ctx);
			if (error) {
				printf("%s() VFS_ROOT returned %d\n", __func__, error);
				goto out4;
			}
			error = vnode_label(mp, NULL, rvp, NULL, 0, ctx);
			/*
			 * drop reference provided by VFS_ROOT
			 */
			vnode_put(rvp);

			if (error) {
				goto out4;
			}
		}
#endif /* MAC */

		/* Plant the mount on the covered vnode; VMOUNT (in-progress) becomes VMOUNTEDHERE. */
		vnode_lock_spin(vp);
		CLR(vp->v_flag, VMOUNT);
		vp->v_mountedhere = mp;
		SET(vp->v_flag, VMOUNTEDHERE);

		/*
		 * Wakeup any waiter(s) in prepare_coveredvp() that is waiting for the
		 * 'v_mountedhere' to be planted.
		 */
		wakeup(&vp->v_flag);
		vnode_unlock(vp);

		/*
		 * taking the name_cache_lock exclusively will
		 * insure that everyone is out of the fast path who
		 * might be trying to use a now stale copy of
		 * vp->v_mountedhere->mnt_realrootvp
		 * bumping mount_generation causes the cached values
		 * to be invalidated
		 */
		name_cache_lock();
		mount_generation++;
		name_cache_unlock();

		error = vnode_ref(vp);
		if (error != 0) {
			goto out4;
		}

		have_usecount = TRUE;

		error = checkdirs(vp, ctx);
		if (error != 0) {
			/* Unmount the filesystem as cdir/rdirs cannot be updated */
			goto out4;
		}
		/*
		 * there is no cleanup code here so I have made it void
		 * we need to revisit this
		 */
		(void)VFS_START(mp, 0, ctx);

		if (mount_list_add(mp) != 0) {
			/*
			 * The system is shutting down trying to umount
			 * everything, so fail with a plausible errno.
			 */
			error = EBUSY;
			goto out4;
		}
		lck_rw_done(&mp->mnt_rwlock);
		is_rwlock_locked = FALSE;

		/* Check if this mounted file system supports EAs or named streams. */
		/* Skip WebDAV file systems for now since they hang in VFS_GETATTR here. */
		VFSATTR_INIT(&vfsattr);
		VFSATTR_WANTED(&vfsattr, f_capabilities);
		if (strncmp(mp->mnt_vfsstat.f_fstypename, "webdav", sizeof("webdav")) != 0 &&
		    vfs_getattr(mp, &vfsattr, ctx) == 0 &&
		    VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR) &&
			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR)) {
				mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
			}
#if NAMEDSTREAMS
			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS) &&
			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS)) {
				mp->mnt_kern_flag |= MNTK_NAMED_STREAMS;
			}
#endif
			/* Check if this file system supports path from id lookups. */
			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID) &&
			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID)) {
				mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
			} else if (mp->mnt_flag & MNT_DOVOLFS) {
				/* Legacy MNT_DOVOLFS flag also implies path from id lookups. */
				mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
			}

			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS) &&
			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS)) {
				mp->mnt_kern_flag |= MNTK_DIR_HARDLINKS;
			}
		}
		if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSNATIVEXATTR) {
			mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
		}
		if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSPREFLIGHT) {
			mp->mnt_kern_flag |= MNTK_UNMOUNT_PREFLIGHT;
		}
		/* Get subtype if supported to cache it */
		VFSATTR_INIT(&vfsattr);
		VFSATTR_WANTED(&vfsattr, f_fssubtype);
		if (vfs_getattr(mp, &vfsattr, ctx) == 0 && VFSATTR_IS_SUPPORTED(&vfsattr, f_fssubtype)) {
			mp->mnt_vfsstat.f_fssubtype = vfsattr.f_fssubtype;
		}

		/* increment the operations count */
		OSAddAtomic(1, &vfs_nummntops);
		enablequotas(mp, ctx);

		if (device_vnode) {
			vfs_setmountedon(device_vnode);
		}

		/* Now that mount is setup, notify the listeners */
		vfs_notify_mount(pvp);
		IOBSDMountChange(mp, kIOMountChangeMount);
#if CONFIG_MACF
		mac_mount_notify_mount(ctx, mp);
#endif /* CONFIG_MACF */
	} else {
		/* If we fail a fresh mount, there should be no vnodes left hooked into the mountpoint. */
		if (mp->mnt_vnodelist.tqh_first != NULL) {
			panic("mount_common(): mount of %s filesystem failed with %d, but vnode list is not empty.",
			    mp->mnt_vtable->vfc_name, error);
		}

		vnode_lock_spin(vp);
		CLR(vp->v_flag, VMOUNT);
		/* Wakeup waiter(s) waiting for in-progress mount to finish. */
		wakeup(&vp->v_flag);
		vnode_unlock(vp);
		mount_list_lock();
		mp->mnt_vtable->vfc_refcount--;
		mount_list_unlock();

		if (device_vnode) {
			vnode_rele(device_vnode);
			VNOP_CLOSE(device_vnode, ronly ? FREAD : FREAD | FWRITE, ctx);
			vfs_clearmounting(device_vnode);
		}
		lck_rw_done(&mp->mnt_rwlock);
		is_rwlock_locked = FALSE;

		if (nc_smr_enabled) {
			vfs_smr_synchronize();
		}

		/*
		 * if we get here, we have a mount structure that needs to be freed,
		 * but since the coveredvp hasn't yet been updated to point at it,
		 * no need to worry about other threads holding a crossref on this mp
		 * so it's ok to just free it
		 */
		mount_lock_destroy(mp);
#if CONFIG_MACF
		mac_mount_label_destroy(mp);
#endif
		zfree(mount_zone, mp);
		/* mp is gone; the exit path must not touch mnt_lflag. */
		did_set_lmount = false;
	}
exit:
	/*
	 * drop I/O count on the device vp if there was one
	 */
	if (devpath && devvp) {
		vnode_put(devvp);
	}

	if (did_set_lmount) {
		mount_lock_spin(mp);
		mp->mnt_lflag &= ~MNT_LMOUNT;
		mount_unlock(mp);
	}

	return error;

	/* Error condition exits */
out4:
	(void)VFS_UNMOUNT(mp, MNT_FORCE, ctx);

	/*
	 * If the mount has been placed on the covered vp,
	 * it may have been discovered by now, so we have
	 * to treat this just like an unmount
	 */
	mount_lock_spin(mp);
	mp->mnt_lflag |= MNT_LDEAD;
	mount_unlock(mp);

	if (device_vnode != NULLVP) {
		vnode_rele(device_vnode);
		VNOP_CLOSE(device_vnode, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD | FWRITE,
		    ctx);
		vfs_clearmounting(device_vnode);
		did_rele = TRUE;
	}

	vnode_lock_spin(vp);

	mp->mnt_crossref++;
	CLR(vp->v_flag, VMOUNTEDHERE);
	vp->v_mountedhere = (mount_t) 0;

	vnode_unlock(vp);

	if (have_usecount) {
		vnode_rele(vp);
	}
out3:
	/* Undo the vnode_ref()/vfs_setmounting() done at first-mount preflight. */
	if (devpath && ((flags & MNT_UPDATE) == 0) && (!did_rele)) {
		vnode_rele(devvp);
		vfs_clearmounting(devvp);
	}
out2:
	if (devpath && devvp) {
		vnode_put(devvp);
	}
out1:
	/* Release mnt_rwlock only when it was taken */
	if (is_rwlock_locked == TRUE) {
		if (flag_set) {
			mp->mnt_flag = flag; /* restore mnt_flag value */
		}
		lck_rw_done(&mp->mnt_rwlock);
	}

	if (did_set_lmount) {
		mount_lock_spin(mp);
		mp->mnt_lflag &= ~MNT_LMOUNT;
		mount_unlock(mp);
	}

	if (did_set_vmount) {
		vnode_lock_spin(vp);
		CLR(vp->v_flag, VMOUNT);
		/* Wakeup waiter(s) waiting for in-progress mount to finish. */
		wakeup(&vp->v_flag);
		vnode_unlock(vp);
	}

	if (mntalloc) {
		if (mp->mnt_crossref) {
			mount_dropcrossref(mp, vp, 0);
		} else {
			if (nc_smr_enabled) {
				vfs_smr_synchronize();
			}

			mount_lock_destroy(mp);
#if CONFIG_MACF
			mac_mount_label_destroy(mp);
#endif
			zfree(mount_zone, mp);
		}
	}
	if (vfsp_ref) {
		mount_list_lock();
		vfsp->vfc_refcount--;
		mount_list_unlock();
	}

	return error;
}
2003
2004 /*
2005 * Flush in-core data, check for competing mount attempts,
2006 * and set VMOUNT
2007 */
int
prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, uint32_t internal_flags)
{
#if !CONFIG_MACF
#pragma unused(cnp,fsname)
#endif
	struct vnode_attr va;
	int error;
	boolean_t skip_auth = !!(internal_flags & KERNEL_MOUNT_NOAUTH);
	boolean_t is_fmount = !!(internal_flags & KERNEL_MOUNT_FMOUNT);
	boolean_t is_kmount = !!(internal_flags & KERNEL_MOUNT_KMOUNT);

	if (!skip_auth) {
		/*
		 * If the user is not root, ensure that they own the directory
		 * onto which we are attempting to mount.
		 */
		VATTR_INIT(&va);
		VATTR_WANTED(&va, va_uid);
		if ((error = vnode_getattr(vp, &va, ctx)) ||
		    (va.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
		    (!vfs_context_issuser(ctx)))) {
			error = EPERM;
			goto out;
		}
	}

	/* Flush the covered vnode's dirty data before the mount hides it. */
	if ((error = VNOP_FSYNC(vp, MNT_WAIT, ctx))) {
		goto out;
	}

	if ((error = buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0))) {
		goto out;
	}

	/* Only directories can serve as mount points. */
	if (vp->v_type != VDIR) {
		error = ENOTDIR;
		goto out;
	}

	vnode_lock_spin(vp);

	if (is_fmount && (ISSET(vp->v_flag, VMOUNT) || (vp->v_mountedhere != NULL))) {
		/* fmount: fail immediately, without waiting for the in-progress mount. */
		error = EBUSY;
	} else if (!is_kmount && (ISSET(vp->v_flag, VMOUNT) ||
	    (vp->v_mountedhere != NULL))) {
		/*
		 * For mount triggered from mount() call, we want to wait for the
		 * current in-progress mount to complete, redo lookup and retry the
		 * mount again. Similarly, we also want to retry if we lost the race
		 * due to concurrent mounts and the 'VMOUNT' flag has been cleared and
		 * 'v_mountedhere' has been planted after initial lookup.
		 */
		if (ISSET(vp->v_flag, VMOUNT)) {
			/* Spinlock must be converted before sleeping on the vnode. */
			vnode_lock_convert(vp);
			msleep(&vp->v_flag, &vp->v_lock, PVFS, "vnode_waitformount", NULL);
		}
		error = EBUSY;
	} else if (ISSET(vp->v_flag, VMOUNT) && (vp->v_mountedhere != NULL)) {
		/* Kernel mounts: busy only if a mount is both in progress and already planted. */
		error = EBUSY;
	}

	if (error) {
		vnode_unlock(vp);
		goto out;
	}
	/* Claim the vnode: VMOUNT marks a mount-in-progress on this covered vp. */
	SET(vp->v_flag, VMOUNT);
	vnode_unlock(vp);

#if CONFIG_MACF
	error = mac_mount_check_mount(ctx, vp,
	    cnp, fsname);
	if (error != 0) {
		/* MAC veto: release the claim taken above. */
		vnode_lock_spin(vp);
		CLR(vp->v_flag, VMOUNT);
		/* Wakeup waiter(s) waiting for in-progress mount to finish. */
		wakeup(&vp->v_flag);
		vnode_unlock(vp);
	}
#endif

out:
	return error;
}
2092
2093 #if CONFIG_IMGSRC_ACCESS
2094
2095 #define DEBUG_IMGSRC 0
2096
2097 #if DEBUG_IMGSRC
2098 #define IMGSRC_DEBUG(args...) printf("imgsrc: " args)
2099 #else
2100 #define IMGSRC_DEBUG(args...) do { } while(0)
2101 #endif
2102
2103 static int
authorize_devpath_and_update_mntfromname(mount_t mp,user_addr_t devpath,vnode_t * devvpp,vfs_context_t ctx)2104 authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx)
2105 {
2106 struct nameidata nd;
2107 vnode_t vp, realdevvp;
2108 kauth_action_t accessmode;
2109 int error;
2110 enum uio_seg uio = UIO_USERSPACE;
2111
2112 if (ctx == vfs_context_kernel()) {
2113 uio = UIO_SYSSPACE;
2114 }
2115
2116 NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW, uio, devpath, ctx);
2117 if ((error = namei(&nd))) {
2118 IMGSRC_DEBUG("namei() failed with %d\n", error);
2119 return error;
2120 }
2121
2122 vp = nd.ni_vp;
2123
2124 if (!vnode_isblk(vp)) {
2125 IMGSRC_DEBUG("Not block device.\n");
2126 error = ENOTBLK;
2127 goto out;
2128 }
2129
2130 realdevvp = mp->mnt_devvp;
2131 if (realdevvp == NULLVP) {
2132 IMGSRC_DEBUG("No device backs the mount.\n");
2133 error = ENXIO;
2134 goto out;
2135 }
2136
2137 error = vnode_getwithref(realdevvp);
2138 if (error != 0) {
2139 IMGSRC_DEBUG("Coudn't get iocount on device.\n");
2140 goto out;
2141 }
2142
2143 if (vnode_specrdev(vp) != vnode_specrdev(realdevvp)) {
2144 IMGSRC_DEBUG("Wrong dev_t.\n");
2145 error = ENXIO;
2146 goto out1;
2147 }
2148
2149 strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);
2150
2151 /*
2152 * If mount by non-root, then verify that user has necessary
2153 * permissions on the device.
2154 */
2155 if (!vfs_context_issuser(ctx)) {
2156 accessmode = KAUTH_VNODE_READ_DATA;
2157 if ((mp->mnt_flag & MNT_RDONLY) == 0) {
2158 accessmode |= KAUTH_VNODE_WRITE_DATA;
2159 }
2160 if ((error = vnode_authorize(vp, NULL, accessmode, ctx)) != 0) {
2161 IMGSRC_DEBUG("Access denied.\n");
2162 goto out1;
2163 }
2164 }
2165
2166 *devvpp = vp;
2167
2168 out1:
2169 vnode_put(realdevvp);
2170
2171 out:
2172 nameidone(&nd);
2173
2174 if (error) {
2175 vnode_put(vp);
2176 }
2177
2178 return error;
2179 }
2180
2181 /*
2182 * Clear VMOUNT, set v_mountedhere, and mnt_vnodecovered, ref the vnode,
2183 * and call checkdirs()
2184 */
/*
 * place_mount_and_checkdirs:
 *   Plant 'mp' on covered vnode 'vp': clears VMOUNT, sets v_mountedhere
 *   and VMOUNTEDHERE, records vp as mnt_vnodecovered, takes a usecount
 *   on vp, and runs checkdirs() to migrate processes whose cwd/root sat
 *   on vp.  On failure, mnt_vnodecovered is reset to NULLVP (but VMOUNT
 *   remains cleared; see undo_place_on_covered_vp() for full unwind).
 */
static int
place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx)
{
	int error;

	mp->mnt_vnodecovered = vp; /* XXX This is normally only set at init-time ... */

	IMGSRC_DEBUG("placing: fsname = %s, vp = %s\n",
	    mp->mnt_vtable->vfc_name, vnode_getname(vp));

	vnode_lock_spin(vp);
	CLR(vp->v_flag, VMOUNT);
	vp->v_mountedhere = mp;
	SET(vp->v_flag, VMOUNTEDHERE);
	/* Wakeup waiter(s) waiting for in-progress mount to finish. */
	wakeup(&vp->v_flag);
	vnode_unlock(vp);

	/*
	 * taking the name_cache_lock exclusively will
	 * insure that everyone is out of the fast path who
	 * might be trying to use a now stale copy of
	 * vp->v_mountedhere->mnt_realrootvp
	 * bumping mount_generation causes the cached values
	 * to be invalidated
	 */
	name_cache_lock();
	mount_generation++;
	name_cache_unlock();

	/* hold a usecount on the covered vnode for the lifetime of the mount */
	error = vnode_ref(vp);
	if (error != 0) {
		goto out;
	}

	error = checkdirs(vp, ctx);
	if (error != 0) {
		/* Unmount the filesystem as cdir/rdirs cannot be updated */
		vnode_rele(vp);
		goto out;
	}

out:
	if (error != 0) {
		mp->mnt_vnodecovered = NULLVP;
	}
	return error;
}
2233
/*
 * undo_place_on_covered_vp:
 *   Reverse place_mount_and_checkdirs(): drop the usecount taken on 'vp',
 *   clear the mounted-here state, wake any threads waiting on the mount,
 *   and detach 'mp' from its covered vnode.
 */
static void
undo_place_on_covered_vp(mount_t mp, vnode_t vp)
{
	vnode_rele(vp);
	vnode_lock_spin(vp);
	CLR(vp->v_flag, (VMOUNT | VMOUNTEDHERE));
	vp->v_mountedhere = (mount_t)NULL;
	/* Wakeup waiter(s) waiting for in-progress mount to finish. */
	wakeup(&vp->v_flag);
	vnode_unlock(vp);

	mp->mnt_vnodecovered = NULLVP;
}
2247
2248 static int
mount_begin_update(mount_t mp,vfs_context_t ctx,int flags)2249 mount_begin_update(mount_t mp, vfs_context_t ctx, int flags)
2250 {
2251 int error;
2252
2253 /* unmount in progress return error */
2254 mount_lock_spin(mp);
2255 if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
2256 mount_unlock(mp);
2257 return EBUSY;
2258 }
2259 mount_unlock(mp);
2260 lck_rw_lock_exclusive(&mp->mnt_rwlock);
2261
2262 /*
2263 * We only allow the filesystem to be reloaded if it
2264 * is currently mounted read-only.
2265 */
2266 if ((flags & MNT_RELOAD) &&
2267 ((mp->mnt_flag & MNT_RDONLY) == 0)) {
2268 error = ENOTSUP;
2269 goto out;
2270 }
2271
2272 /*
2273 * Only root, or the user that did the original mount is
2274 * permitted to update it.
2275 */
2276 if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
2277 (!vfs_context_issuser(ctx))) {
2278 error = EPERM;
2279 goto out;
2280 }
2281 #if CONFIG_MACF
2282 error = mac_mount_check_remount(ctx, mp, flags);
2283 if (error != 0) {
2284 goto out;
2285 }
2286 #endif
2287
2288 out:
2289 if (error) {
2290 lck_rw_done(&mp->mnt_rwlock);
2291 }
2292
2293 return error;
2294 }
2295
/*
 * mount_end_update:
 *   Release the exclusive mnt_rwlock taken by a successful
 *   mount_begin_update().
 */
static void
mount_end_update(mount_t mp)
{
	lck_rw_done(&mp->mnt_rwlock);
}
2301
2302 static int
get_imgsrc_rootvnode(uint32_t height,vnode_t * rvpp)2303 get_imgsrc_rootvnode(uint32_t height, vnode_t *rvpp)
2304 {
2305 vnode_t vp;
2306
2307 if (height >= MAX_IMAGEBOOT_NESTING) {
2308 return EINVAL;
2309 }
2310
2311 vp = imgsrc_rootvnodes[height];
2312 if ((vp != NULLVP) && (vnode_get(vp) == 0)) {
2313 *rvpp = vp;
2314 return 0;
2315 } else {
2316 return ENOENT;
2317 }
2318 }
2319
2320 static int
relocate_imageboot_source(vnode_t pvp,vnode_t vp,struct componentname * cnp,const char * fsname,vfs_context_t ctx,boolean_t is64bit,user_addr_t fsmountargs,boolean_t by_index)2321 relocate_imageboot_source(vnode_t pvp, vnode_t vp,
2322 struct componentname *cnp, const char *fsname, vfs_context_t ctx,
2323 boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index)
2324 {
2325 int error;
2326 mount_t mp;
2327 boolean_t placed = FALSE;
2328 struct vfstable *vfsp;
2329 user_addr_t devpath;
2330 char *old_mntonname;
2331 vnode_t rvp;
2332 vnode_t devvp;
2333 uint32_t height;
2334 uint32_t flags;
2335
2336 /* If we didn't imageboot, nothing to move */
2337 if (imgsrc_rootvnodes[0] == NULLVP) {
2338 return EINVAL;
2339 }
2340
2341 /* Only root can do this */
2342 if (!vfs_context_issuser(ctx)) {
2343 return EPERM;
2344 }
2345
2346 IMGSRC_DEBUG("looking for root vnode.\n");
2347
2348 /*
2349 * Get root vnode of filesystem we're moving.
2350 */
2351 if (by_index) {
2352 if (is64bit) {
2353 struct user64_mnt_imgsrc_args mia64;
2354 error = copyin(fsmountargs, &mia64, sizeof(mia64));
2355 if (error != 0) {
2356 IMGSRC_DEBUG("Failed to copy in arguments.\n");
2357 return error;
2358 }
2359
2360 height = mia64.mi_height;
2361 flags = mia64.mi_flags;
2362 devpath = (user_addr_t)mia64.mi_devpath;
2363 } else {
2364 struct user32_mnt_imgsrc_args mia32;
2365 error = copyin(fsmountargs, &mia32, sizeof(mia32));
2366 if (error != 0) {
2367 IMGSRC_DEBUG("Failed to copy in arguments.\n");
2368 return error;
2369 }
2370
2371 height = mia32.mi_height;
2372 flags = mia32.mi_flags;
2373 devpath = mia32.mi_devpath;
2374 }
2375 } else {
2376 /*
2377 * For binary compatibility--assumes one level of nesting.
2378 */
2379 if (is64bit) {
2380 if ((error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath)))) {
2381 return error;
2382 }
2383 } else {
2384 user32_addr_t tmp;
2385 if ((error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp)))) {
2386 return error;
2387 }
2388
2389 /* munge into LP64 addr */
2390 devpath = CAST_USER_ADDR_T(tmp);
2391 }
2392
2393 height = 0;
2394 flags = 0;
2395 }
2396
2397 if (flags != 0) {
2398 IMGSRC_DEBUG("%s: Got nonzero flags.\n", __FUNCTION__);
2399 return EINVAL;
2400 }
2401
2402 error = get_imgsrc_rootvnode(height, &rvp);
2403 if (error != 0) {
2404 IMGSRC_DEBUG("getting old root vnode failed with %d\n", error);
2405 return error;
2406 }
2407
2408 IMGSRC_DEBUG("got old root vnode\n");
2409
2410 old_mntonname = zalloc_flags(ZV_NAMEI, Z_WAITOK);
2411
2412 /* Can only move once */
2413 mp = vnode_mount(rvp);
2414 if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
2415 IMGSRC_DEBUG("Already moved.\n");
2416 error = EBUSY;
2417 goto out0;
2418 }
2419
2420 IMGSRC_DEBUG("moving rvp: fsname = %s\n", mp->mnt_vtable->vfc_name);
2421 IMGSRC_DEBUG("Starting updated.\n");
2422
2423 /* Get exclusive rwlock on mount, authorize update on mp */
2424 error = mount_begin_update(mp, ctx, 0);
2425 if (error != 0) {
2426 IMGSRC_DEBUG("Starting updated failed with %d\n", error);
2427 goto out0;
2428 }
2429
2430 /*
2431 * It can only be moved once. Flag is set under the rwlock,
2432 * so we're now safe to proceed.
2433 */
2434 if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
2435 IMGSRC_DEBUG("Already moved [2]\n");
2436 goto out1;
2437 }
2438
2439 IMGSRC_DEBUG("Preparing coveredvp.\n");
2440
2441 /* Mark covered vnode as mount in progress, authorize placing mount on top */
2442 error = prepare_coveredvp(vp, ctx, cnp, fsname, 0);
2443 if (error != 0) {
2444 IMGSRC_DEBUG("Preparing coveredvp failed with %d.\n", error);
2445 goto out1;
2446 }
2447
2448 IMGSRC_DEBUG("Covered vp OK.\n");
2449
2450 /* Sanity check the name caller has provided */
2451 vfsp = mp->mnt_vtable;
2452 if (strncmp(vfsp->vfc_name, fsname, MFSNAMELEN) != 0) {
2453 IMGSRC_DEBUG("Wrong fs name: actual = %s, expected = %s\n",
2454 vfsp->vfc_name, fsname);
2455 error = EINVAL;
2456 goto out2;
2457 }
2458
2459 /* Check the device vnode and update mount-from name, for local filesystems */
2460 if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) {
2461 IMGSRC_DEBUG("Local, doing device validation.\n");
2462
2463 if (devpath != USER_ADDR_NULL) {
2464 error = authorize_devpath_and_update_mntfromname(mp, devpath, &devvp, ctx);
2465 if (error) {
2466 IMGSRC_DEBUG("authorize_devpath_and_update_mntfromname() failed.\n");
2467 goto out2;
2468 }
2469
2470 vnode_put(devvp);
2471 }
2472 }
2473
2474 /*
2475 * Place mp on top of vnode, ref the vnode, call checkdirs(),
2476 * and increment the name cache's mount generation
2477 */
2478
2479 IMGSRC_DEBUG("About to call place_mount_and_checkdirs().\n");
2480 error = place_mount_and_checkdirs(mp, vp, ctx);
2481 if (error != 0) {
2482 goto out2;
2483 }
2484
2485 placed = TRUE;
2486
2487 strlcpy(old_mntonname, mp->mnt_vfsstat.f_mntonname, MAXPATHLEN);
2488 strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);
2489
2490 /* Forbid future moves */
2491 mount_lock(mp);
2492 mp->mnt_kern_flag |= MNTK_HAS_MOVED;
2493 mount_unlock(mp);
2494
2495 /* Finally, add to mount list, completely ready to go */
2496 if (mount_list_add(mp) != 0) {
2497 /*
2498 * The system is shutting down trying to umount
2499 * everything, so fail with a plausible errno.
2500 */
2501 error = EBUSY;
2502 goto out3;
2503 }
2504
2505 mount_end_update(mp);
2506 vnode_put(rvp);
2507 zfree(ZV_NAMEI, old_mntonname);
2508
2509 vfs_notify_mount(pvp);
2510 #if CONFIG_MACF
2511 mac_mount_notify_mount(ctx, mp);
2512 #endif /* CONFIG_MACF */
2513
2514 return 0;
2515 out3:
2516 strlcpy(mp->mnt_vfsstat.f_mntonname, old_mntonname, MAXPATHLEN);
2517
2518 mount_lock(mp);
2519 mp->mnt_kern_flag &= ~(MNTK_HAS_MOVED);
2520 mount_unlock(mp);
2521
2522 out2:
2523 /*
2524 * Placing the mp on the vnode clears VMOUNT,
2525 * so cleanup is different after that point
2526 */
2527 if (placed) {
2528 /* Rele the vp, clear VMOUNT and v_mountedhere */
2529 undo_place_on_covered_vp(mp, vp);
2530 } else {
2531 vnode_lock_spin(vp);
2532 CLR(vp->v_flag, VMOUNT);
2533 /* Wakeup waiter(s) waiting for in-progress mount to finish. */
2534 wakeup(&vp->v_flag);
2535 vnode_unlock(vp);
2536 }
2537 out1:
2538 mount_end_update(mp);
2539
2540 out0:
2541 vnode_put(rvp);
2542 zfree(ZV_NAMEI, old_mntonname);
2543 return error;
2544 }
2545
2546 #endif /* CONFIG_IMGSRC_ACCESS */
2547
/*
 * enablequotas:
 *   For an HFS mount, probe for the quota "ops" trigger file of each
 *   quota type and, if present, turn quotas on via VFS_QUOTACTL with the
 *   corresponding quota data file.  All failures are silently ignored so
 *   quota setup never interferes with completing the mount.
 */
void
enablequotas(struct mount *mp, vfs_context_t ctx)
{
	struct nameidata qnd;
	int type;
	char qfpath[MAXPATHLEN];
	const char *qfname = QUOTAFILENAME;
	const char *qfopsname = QUOTAOPSNAME;
	const char *qfextension[] = INITQFNAMES;

	/* XXX Should be an MNTK_ flag, instead of strncmp()'s */
	if (strncmp(mp->mnt_vfsstat.f_fstypename, "hfs", sizeof("hfs")) != 0) {
		return;
	}
	/*
	 * Enable filesystem disk quotas if necessary.
	 * We ignore errors as this should not interfere with final mount
	 */
	for (type = 0; type < MAXQUOTAS; type++) {
		/* look for the per-type ops trigger file under the mount root */
		snprintf(qfpath, sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfopsname, qfextension[type]);
		NDINIT(&qnd, LOOKUP, OP_MOUNT, FOLLOW, UIO_SYSSPACE,
		    CAST_USER_ADDR_T(qfpath), ctx);
		if (namei(&qnd) != 0) {
			continue; /* option file to trigger quotas is not present */
		}
		vnode_put(qnd.ni_vp);
		nameidone(&qnd);
		/* trigger exists: hand the actual quota data file to the filesystem */
		snprintf(qfpath, sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfname, qfextension[type]);

		(void) VFS_QUOTACTL(mp, QCMD(Q_QUOTAON, type), 0, qfpath, ctx);
	}
	return;
}
2581
2582
/*
 * checkdirs_callback:
 *   Per-process callback for checkdirs(): if process 'p' has its current
 *   or root directory on 'olddp', repoint it at 'newdp'.  Takes usecounts
 *   on newdp up front (two, worst case), then swaps the pointers under
 *   proc_fdlock and releases whichever old/new references ended up unused.
 *   Always returns PROC_RETURNED so proc_iterate() continues.
 */
static int
checkdirs_callback(proc_t p, void * arg)
{
	struct cdirargs *cdrp = (struct cdirargs *)arg;
	vnode_t olddp = cdrp->olddp;
	vnode_t newdp = cdrp->newdp;
	struct filedesc *fdp = &p->p_fd;
	/* new_* start as "unused new ref"; old_* collect refs to drop */
	vnode_t new_cvp = newdp;
	vnode_t new_rvp = newdp;
	vnode_t old_cvp = NULL;
	vnode_t old_rvp = NULL;

	/*
	 * XXX Also needs to iterate each thread in the process to see if it
	 * XXX is using a per-thread current working directory, and, if so,
	 * XXX update that as well.
	 */

	/*
	 * First, with the proc_fdlock held, check to see if we will need
	 * to do any work. If not, we will get out fast.
	 */
	proc_fdlock(p);
	if (fdp->fd_cdir != olddp && fdp->fd_rdir != olddp) {
		proc_fdunlock(p);
		return PROC_RETURNED;
	}
	proc_fdunlock(p);

	/*
	 * Ok, we will have to do some work. Always take two refs
	 * because we might need that many. We'll dispose of whatever
	 * we ended up not using.
	 */
	if (vnode_ref(newdp) != 0) {
		return PROC_RETURNED;
	}
	if (vnode_ref(newdp) != 0) {
		/* second ref failed: undo the first and bail */
		vnode_rele(newdp);
		return PROC_RETURNED;
	}

	proc_dirs_lock_exclusive(p);
	/*
	 * Now do the work. Note: we dropped the proc_fdlock, so we
	 * have to do all of the checks again.
	 */
	proc_fdlock(p);
	if (fdp->fd_cdir == olddp) {
		old_cvp = olddp;
		fdp->fd_cdir = newdp;
		new_cvp = NULL;	/* this new ref was consumed */
	}
	if (fdp->fd_rdir == olddp) {
		old_rvp = olddp;
		fdp->fd_rdir = newdp;
		new_rvp = NULL;	/* this new ref was consumed */
	}
	proc_fdunlock(p);
	proc_dirs_unlock_exclusive(p);

	/*
	 * Dispose of any references that are no longer needed.
	 */
	if (old_cvp != NULL) {
		vnode_rele(old_cvp);
	}
	if (old_rvp != NULL) {
		vnode_rele(old_rvp);
	}
	if (new_cvp != NULL) {
		vnode_rele(new_cvp);
	}
	if (new_rvp != NULL) {
		vnode_rele(new_rvp);
	}

	return PROC_RETURNED;
}
2662
2663
2664
2665 /*
2666 * Scan all active processes to see if any of them have a current
2667 * or root directory onto which the new filesystem has just been
2668 * mounted. If so, replace them with the new mount point.
2669 */
/*
 * Scan all active processes to see if any of them have a current
 * or root directory onto which the new filesystem has just been
 * mounted. If so, replace them with the new mount point.
 */
static int
checkdirs(vnode_t olddp, vfs_context_t ctx)
{
	vnode_t newdp;
	vnode_t tvp;
	int err;
	struct cdirargs cdr;

	/* sole usecount means nobody's cwd/root can be here; nothing to do */
	if (olddp->v_usecount == 1) {
		return 0;
	}
	/* get the root of the filesystem that now covers olddp */
	err = VFS_ROOT(olddp->v_mountedhere, &newdp, ctx);

	if (err != 0) {
#if DIAGNOSTIC
		panic("mount: lost mount: error %d", err);
#endif
		return err;
	}

	cdr.olddp = olddp;
	cdr.newdp = newdp;
	/* do not block for exec/fork trans as the vp in cwd & rootdir are not changing */
	proc_iterate(PROC_ALLPROCLIST | PROC_NOWAITTRANS, checkdirs_callback, (void *)&cdr, NULL, NULL);

	/* if the system root itself was covered, swap the global rootvnode */
	if (rootvnode == olddp) {
		vnode_ref(newdp);
		lck_rw_lock_exclusive(&rootvnode_rw_lock);
		tvp = rootvnode;
		rootvnode = newdp;
		lck_rw_unlock_exclusive(&rootvnode_rw_lock);
		vnode_rele(tvp);
	}

	/* drop the iocount VFS_ROOT returned */
	vnode_put(newdp);
	return 0;
}
2707
2708 #define ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT \
2709 "com.apple.private.vfs.role-account-unmount"
2710 #define SYSTEM_VOLUME_UNMOUNT_ENTITLEMENT \
2711 "com.apple.private.vfs.system-volume-unmount"
2712
2713 /*
2714 * Unmount a file system.
2715 *
2716 * Note: unmount takes a path to the vnode mounted on as argument,
2717 * not special file (as before).
2718 */
2719 /* ARGSUSED */
/*
 * unmount(2) system call handler: look up the path to the root of a
 * mounted filesystem, authorize, take a mount ref, and hand off to
 * safedounmount() (which consumes the ref).
 */
/* ARGSUSED */
int
unmount(__unused proc_t p, struct unmount_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	struct mount *mp;
	int flags = uap->flags;
	int error;
	struct nameidata nd;
	vfs_context_t ctx;

	/*
	 * If the process has the entitlement, use the kernel's context when
	 * performing lookup on the mount path as the process might lack proper
	 * permission to access the directory.
	 */
	ctx = IOCurrentTaskHasEntitlement(ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT) ?
	    vfs_context_kernel() : vfs_context_current();

	NDINIT(&nd, LOOKUP, OP_UNMOUNT, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	if (flags & MNT_NOFOLLOW) {
		/* caller asked that no symlink anywhere in the path be followed */
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}

	error = namei(&nd);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;
	mp = vp->v_mount;
	nameidone(&nd);

	/*
	 * Must be the root of the filesystem
	 */
	if ((vp->v_flag & VROOT) == 0) {
		vnode_put(vp);
		return EINVAL;
	}
#if CONFIG_MACF
	error = mac_mount_check_umount(ctx, mp);
	if (error != 0) {
		vnode_put(vp);
		return error;
	}
#endif
	/* hold the mount across the vnode_put; namei's iocount is no longer needed */
	mount_ref(mp, 0);
	vnode_put(vp);
	/* safedounmount consumes the mount ref */
	return safedounmount(mp, flags, ctx);
}
2771
2772 int
vfs_unmountbyfsid(fsid_t * fsid,int flags,vfs_context_t ctx)2773 vfs_unmountbyfsid(fsid_t *fsid, int flags, vfs_context_t ctx)
2774 {
2775 mount_t mp;
2776
2777 mp = mount_list_lookupby_fsid(fsid, 0, 1);
2778 if (mp == (mount_t)0) {
2779 return ENOENT;
2780 }
2781 mount_ref(mp, 0);
2782 mount_iterdrop(mp);
2783 /* safedounmount consumes the mount ref */
2784 return safedounmount(mp, flags, ctx);
2785 }
2786
2787 /*
2788 * The mount struct comes with a mount ref which will be consumed.
2789 * Do the actual file system unmount, prevent some common foot shooting.
2790 */
/*
 * The mount struct comes with a mount ref which will be consumed.
 * Do the actual file system unmount, prevent some common foot shooting.
 *
 * Policy checks applied here (before dounmount() does the real work):
 *   - EBUSY for unresponsive filesystems when MNT_NOBLOCK and !MNT_FORCE
 *   - ownership/root check unless entitled or the mount is tagged
 *     MNTK_PERMIT_UNMOUNT (non-forced only)
 *   - never the root fs; system mounts only with the system-volume
 *     entitlement; never the imageboot backing mount
 */
int
safedounmount(struct mount *mp, int flags, vfs_context_t ctx)
{
	int error;
	proc_t p = vfs_context_proc(ctx);

	/*
	 * If the file system is not responding and MNT_NOBLOCK
	 * is set and not a forced unmount then return EBUSY.
	 */
	if ((mp->mnt_lflag & MNT_LNOTRESP) &&
	    (flags & MNT_NOBLOCK) && ((flags & MNT_FORCE) == 0)) {
		error = EBUSY;
		goto out;
	}

	/*
	 * Skip authorization in two cases:
	 * - If the process running the unmount has ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT.
	 *   This entitlement allows non-root processes unmount volumes mounted by
	 *   other processes.
	 * - If the mount is tagged as permissive and this is not a forced-unmount
	 *   attempt.
	 */
	if (!IOCurrentTaskHasEntitlement(ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT) &&
	    (!(((mp->mnt_kern_flag & MNTK_PERMIT_UNMOUNT) != 0) && ((flags & MNT_FORCE) == 0)))) {
		/*
		 * Only root, or the user that did the original mount is
		 * permitted to unmount this filesystem.
		 */
		if ((mp->mnt_vfsstat.f_owner != kauth_cred_getuid(kauth_cred_get())) &&
		    (error = suser(kauth_cred_get(), &p->p_acflag))) {
			goto out;
		}
	}

	/*
	 * Don't allow unmounting the root file system, or other volumes
	 * associated with it (for example, the associated VM or DATA mounts) .
	 */
	if (mp->mnt_flag & MNT_ROOTFS) {
		error = EBUSY; /* the root is always busy */
		goto out;
	}
	if ((mp->mnt_kern_flag & MNTK_SYSTEM) && !IOCurrentTaskHasEntitlement(SYSTEM_VOLUME_UNMOUNT_ENTITLEMENT)) {
		printf("attempt to unmount a system mount (%s), will return EBUSY\n",
		    mp->mnt_vfsstat.f_mntonname);
		error = EBUSY; /* root-associated volumes are always busy unless caller is entitled */
		goto out;
	}

	/*
	 * If the mount is providing the root filesystem's disk image
	 * (i.e. imageboot), don't allow unmounting
	 */
	if (mp->mnt_kern_flag & MNTK_BACKS_ROOT) {
		error = EBUSY;
		goto out;
	}

	/* dounmount() consumes the mount ref (withref == 1) */
	return dounmount(mp, flags, 1, ctx);

out:
	/* failure paths must still consume the caller's mount ref */
	mount_drop(mp, 0);
	return error;
}
2857
2858 /*
2859 * Do the actual file system unmount.
2860 */
/*
 * dounmount:
 *   Do the actual file system unmount.
 *
 *   'withref' != 0 means the caller passed in a mount ref that must be
 *   consumed here.  Marks the mount with MNTK_UNMOUNT/MNT_LUNMOUNT to
 *   fence off concurrent mount/unmount activity, flushes vnodes, calls
 *   VFS_UNMOUNT, closes the backing device, detaches the covered vnode,
 *   and finally destroys the mount structure (via mount_dropcrossref()
 *   or directly for the root fs).  On failure the in-progress flags are
 *   cleared and the mount is left usable.
 */
int
dounmount(struct mount *mp, int flags, int withref, vfs_context_t ctx)
{
	vnode_t coveredvp = (vnode_t)0;
	int error;
	int needwakeup = 0;
	int forcedunmount = 0;
	int lflags = 0;
	struct vnode *devvp = NULLVP;
#if CONFIG_TRIGGERS
	proc_t p = vfs_context_proc(ctx);
	int did_vflush = 0;
	int pflags_save = 0;
#endif /* CONFIG_TRIGGERS */

#if CONFIG_FSE
	if (!(flags & MNT_FORCE)) {
		fsevent_unmount(mp, ctx); /* has to come first! */
	}
#endif

	mount_lock(mp);

	/*
	 * If already an unmount in progress just return EBUSY.
	 * Even a forced unmount cannot override.
	 */
	if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
		if (withref != 0) {
			mount_drop(mp, 1);
		}
		mount_unlock(mp);
		return EBUSY;
	}

	if (flags & MNT_FORCE) {
		forcedunmount = 1;
		mp->mnt_lflag |= MNT_LFORCE;
	}

#if CONFIG_TRIGGERS
	/* don't let a hung remote fs hang this process while unmounting */
	if (flags & MNT_NOBLOCK && p != kernproc) {
		pflags_save = OSBitOrAtomic(P_NOREMOTEHANG, &p->p_flag);
	}
#endif

	mp->mnt_kern_flag |= MNTK_UNMOUNT;
	mp->mnt_lflag |= MNT_LUNMOUNT;
	mp->mnt_flag &= ~MNT_ASYNC;
	/*
	 * anyone currently in the fast path that
	 * trips over the cached rootvp will be
	 * dumped out and forced into the slow path
	 * to regenerate a new cached value
	 */
	mp->mnt_realrootvp = NULLVP;
	mount_unlock(mp);

	if (forcedunmount && (flags & MNT_LNOSUB) == 0) {
		/*
		 * Force unmount any mounts in this filesystem.
		 * If any unmounts fail - just leave them dangling.
		 * Avoids recursion.
		 */
		(void) dounmount_submounts(mp, flags | MNT_LNOSUB, ctx);
	}

	/*
	 * taking the name_cache_lock exclusively will
	 * insure that everyone is out of the fast path who
	 * might be trying to use a now stale copy of
	 * vp->v_mountedhere->mnt_realrootvp
	 * bumping mount_generation causes the cached values
	 * to be invalidated
	 */
	name_cache_lock();
	mount_generation++;
	name_cache_unlock();


	lck_rw_lock_exclusive(&mp->mnt_rwlock);
	if (withref != 0) {
		mount_drop(mp, 0);
	}
	error = 0;
	if (forcedunmount == 0) {
		ubc_umount(mp); /* release cached vnodes */
		if ((mp->mnt_flag & MNT_RDONLY) == 0) {
			error = VFS_SYNC(mp, MNT_WAIT, ctx);
			if (error) {
				/* sync failed: abort, clearing the in-progress flags */
				mount_lock(mp);
				mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
				mp->mnt_lflag &= ~MNT_LUNMOUNT;
				mp->mnt_lflag &= ~MNT_LFORCE;
				goto out;
			}
		}
	}

	IOBSDMountChange(mp, kIOMountChangeUnmount);

#if CONFIG_TRIGGERS
	vfs_nested_trigger_unmounts(mp, flags, ctx);
	did_vflush = 1;
#endif
	if (forcedunmount) {
		lflags |= FORCECLOSE;
	}
	error = vflush(mp, NULLVP, SKIPSWAP | SKIPSYSTEM | SKIPROOT | lflags);
	if ((forcedunmount == 0) && error) {
		/* busy vnodes remain: abort, clearing the in-progress flags */
		mount_lock(mp);
		mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
		mp->mnt_lflag &= ~MNT_LUNMOUNT;
		mp->mnt_lflag &= ~MNT_LFORCE;
		goto out;
	}

	/* make sure there are no one in the mount iterations or lookup */
	mount_iterdrain(mp);

	error = VFS_UNMOUNT(mp, flags, ctx);
	if (error) {
		/* filesystem refused: re-enable iteration and abort */
		mount_iterreset(mp);
		mount_lock(mp);
		mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
		mp->mnt_lflag &= ~MNT_LUNMOUNT;
		mp->mnt_lflag &= ~MNT_LFORCE;
		goto out;
	}

	/* increment the operations count */
	if (!error) {
		OSAddAtomic(1, &vfs_nummntops);
	}

	if (mp->mnt_devvp && mp->mnt_vtable->vfc_vfsflags & VFC_VFSLOCALARGS) {
		/* hold an io reference and drop the usecount before close */
		devvp = mp->mnt_devvp;
		vnode_getalways(devvp);
		vnode_rele(devvp);
		VNOP_CLOSE(devvp, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD | FWRITE,
		    ctx);
		vnode_clearmountedon(devvp);
		vnode_put(devvp);
	}
	lck_rw_done(&mp->mnt_rwlock);
	mount_list_remove(mp);
	lck_rw_lock_exclusive(&mp->mnt_rwlock);

	/* mark the mount point hook in the vp but not drop the ref yet */
	if ((coveredvp = mp->mnt_vnodecovered) != NULLVP) {
		/*
		 * The covered vnode needs special handling. Trying to get an
		 * iocount must not block here as this may lead to deadlocks
		 * if the Filesystem to which the covered vnode belongs is
		 * undergoing forced unmounts. Since we hold a usecount, the
		 * vnode cannot be reused (it can, however, still be terminated)
		 */
		vnode_getalways(coveredvp);
		vnode_lock_spin(coveredvp);

		/* crossref keeps mp alive until mount_dropcrossref() below */
		mp->mnt_crossref++;
		coveredvp->v_mountedhere = (struct mount *)0;
		CLR(coveredvp->v_flag, VMOUNT | VMOUNTEDHERE);
		/* Wakeup waiter(s) waiting for in-progress mount to finish. */
		wakeup(&coveredvp->v_flag);
		vnode_unlock(coveredvp);
		vnode_put(coveredvp);
	}

	mount_list_lock();
	mp->mnt_vtable->vfc_refcount--;
	mount_list_unlock();

	cache_purgevfs(mp); /* remove cache entries for this file sys */
	vfs_event_signal(NULL, VQ_UNMOUNT, (intptr_t)NULL);
	mount_lock(mp);
	mp->mnt_lflag |= MNT_LDEAD;

	if (mp->mnt_lflag & MNT_LWAIT) {
		/*
		 * do the wakeup here
		 * in case we block in mount_refdrain
		 * which will drop the mount lock
		 * and allow anyone blocked in vfs_busy
		 * to wakeup and see the LDEAD state
		 */
		mp->mnt_lflag &= ~MNT_LWAIT;
		wakeup((caddr_t)mp);
	}
	mount_refdrain(mp);

	/* free disk_conditioner_info structure for this mount */
	disk_conditioner_unmount(mp);

out:
	/* NB: reached with the mount lock held on every path */
	if (mp->mnt_lflag & MNT_LWAIT) {
		mp->mnt_lflag &= ~MNT_LWAIT;
		needwakeup = 1;
	}

#if CONFIG_TRIGGERS
	if (flags & MNT_NOBLOCK && p != kernproc) {
		// Restore P_NOREMOTEHANG bit to its previous value
		if ((pflags_save & P_NOREMOTEHANG) == 0) {
			OSBitAndAtomic(~((uint32_t) P_NOREMOTEHANG), &p->p_flag);
		}
	}

	/*
	 * Callback and context are set together under the mount lock, and
	 * never cleared, so we're safe to examine them here, drop the lock,
	 * and call out.
	 */
	if (mp->mnt_triggercallback != NULL) {
		mount_unlock(mp);
		if (error == 0) {
			mp->mnt_triggercallback(mp, VTC_RELEASE, mp->mnt_triggerdata, ctx);
		} else if (did_vflush) {
			mp->mnt_triggercallback(mp, VTC_REPLACE, mp->mnt_triggerdata, ctx);
		}
	} else {
		mount_unlock(mp);
	}
#else
	mount_unlock(mp);
#endif /* CONFIG_TRIGGERS */

	lck_rw_done(&mp->mnt_rwlock);

	if (needwakeup) {
		wakeup((caddr_t)mp);
	}

	if (!error) {
		if ((coveredvp != NULLVP)) {
			vnode_t pvp = NULLVP;

			/*
			 * The covered vnode needs special handling. Trying to
			 * get an iocount must not block here as this may lead
			 * to deadlocks if the Filesystem to which the covered
			 * vnode belongs is undergoing forced unmounts. Since we
			 * hold a usecount, the vnode cannot be reused
			 * (it can, however, still be terminated).
			 */
			vnode_getalways(coveredvp);

			mount_dropcrossref(mp, coveredvp, 0);
			/*
			 * We'll _try_ to detect if this really needs to be
			 * done. The coveredvp can only be in termination (or
			 * terminated) if the coveredvp's mount point is in a
			 * forced unmount (or has been) since we still hold the
			 * ref.
			 */
			if (!vnode_isrecycled(coveredvp)) {
				pvp = vnode_getparent(coveredvp);
#if CONFIG_TRIGGERS
				if (coveredvp->v_resolve) {
					vnode_trigger_rearm(coveredvp, ctx);
				}
#endif
			}

			vnode_rele(coveredvp);
			vnode_put(coveredvp);
			coveredvp = NULLVP;

			/* notify watchers of the parent directory, if any */
			if (pvp) {
				lock_vnode_and_post(pvp, NOTE_WRITE);
				vnode_put(pvp);
			}
		} else if (mp->mnt_flag & MNT_ROOTFS) {
			/* root fs has no covered vnode: tear the mount down directly */
			if (nc_smr_enabled) {
				vfs_smr_synchronize();
			}

			mount_lock_destroy(mp);
#if CONFIG_MACF
			mac_mount_label_destroy(mp);
#endif
			zfree(mount_zone, mp);
		} else {
			panic("dounmount: no coveredvp");
		}
	}
	return error;
}
3150
3151 /*
3152 * Unmount any mounts in this filesystem.
3153 */
/*
 * Unmount any mounts in this filesystem.
 *
 * Builds the transitive closure of submounts of 'mp' into an fsid array
 * (the mount list is in mount order, so one forward pass suffices: a
 * mount is a submount iff its covered vnode lives on an fsid already in
 * the array), then unmounts them in reverse (deepest-first) order.
 * Errors, including allocation failure, are silently ignored.
 */
void
dounmount_submounts(struct mount *mp, int flags, vfs_context_t ctx)
{
	mount_t smp;
	fsid_t *fsids, fsid;
	int fsids_sz;
	int count = 0, i, m = 0;
	vnode_t vp;

	mount_list_lock();

	// Get an array to hold the submounts fsids.
	TAILQ_FOREACH(smp, &mountlist, mnt_list)
	count++;
	fsids_sz = count * sizeof(fsid_t);
	/* Z_NOWAIT: cannot block for memory while holding mount_list_lock */
	fsids = kalloc_data(fsids_sz, Z_NOWAIT);
	if (fsids == NULL) {
		mount_list_unlock();
		goto out;
	}
	fsids[0] = mp->mnt_vfsstat.f_fsid; // Prime the pump

	/*
	 * Fill the array with submount fsids.
	 * Since mounts are always added to the tail of the mount list, the
	 * list is always in mount order.
	 * For each mount check if the mounted-on vnode belongs to a
	 * mount that's already added to our array of mounts to be unmounted.
	 */
	for (smp = TAILQ_NEXT(mp, mnt_list); smp; smp = TAILQ_NEXT(smp, mnt_list)) {
		vp = smp->mnt_vnodecovered;
		if (vp == NULL) {
			continue;
		}
		fsid = vnode_mount(vp)->mnt_vfsstat.f_fsid; // Underlying fsid
		for (i = 0; i <= m; i++) {
			if (fsids[i].val[0] == fsid.val[0] &&
			    fsids[i].val[1] == fsid.val[1]) {
				fsids[++m] = smp->mnt_vfsstat.f_fsid;
				break;
			}
		}
	}
	mount_list_unlock();

	// Unmount the submounts in reverse order. Ignore errors.
	for (i = m; i > 0; i--) {
		smp = mount_list_lookupby_fsid(&fsids[i], 0, 1);
		if (smp) {
			/* dounmount() consumes this mount ref */
			mount_ref(smp, 0);
			mount_iterdrop(smp);
			(void) dounmount(smp, flags, 1, ctx);
		}
	}
out:
	kfree_data(fsids, fsids_sz);
}
3211
/*
 * Drop one mnt_crossref reference on mp (held by code that crossed the
 * dp -> mp mount point).  If the count reaches zero and mp is no longer
 * mounted on dp, the crossref was the last thing keeping the mount
 * structure alive, so it is destroyed here.
 */
void
mount_dropcrossref(mount_t mp, vnode_t dp, int need_put)
{
	/* Hold dp so it cannot be freed while we operate under its lock. */
	vnode_hold(dp);
	vnode_lock(dp);
	mp->mnt_crossref--;

	if (mp->mnt_crossref < 0) {
		panic("mount cross refs -ve");
	}

	if ((mp != dp->v_mountedhere) && (mp->mnt_crossref == 0)) {
		if (need_put) {
			vnode_put_locked(dp);
		}
		vnode_drop_and_unlock(dp);

		/* Let SMR readers of the name cache drain before freeing mp. */
		if (nc_smr_enabled) {
			vfs_smr_synchronize();
		}

		mount_lock_destroy(mp);
#if CONFIG_MACF
		mac_mount_label_destroy(mp);
#endif
		zfree(mount_zone, mp);
		return;
	}
	/* mp survives; just undo our vnode bookkeeping. */
	if (need_put) {
		vnode_put_locked(dp);
	}
	vnode_drop_and_unlock(dp);
}
3245
3246
3247 /*
3248 * Sync each mounted filesystem.
3249 */
#if DIAGNOSTIC
int syncprt = 0;	/* when set, sync paths dump buffer stats via vfs_bufstats() */
#endif

int print_vmpage_stat = 0;	/* when set, sync paths call vm_countdirtypages() */
3255
3256 /*
3257 * sync_callback: simple wrapper that calls VFS_SYNC() on volumes
3258 * mounted read-write with the passed waitfor value.
3259 *
3260 * Parameters: mp mount-point descriptor per mounted file-system instance.
3261 * arg user argument (please see below)
3262 *
3263 * User argument is a pointer to 32 bit unsigned integer which describes the
3264 * type of waitfor value to set for calling VFS_SYNC(). If user argument is
3265 * passed as NULL, VFS_SYNC() is called with MNT_NOWAIT set as the default
3266 * waitfor value.
3267 *
3268 * Returns: VFS_RETURNED
3269 */
3270 static int
sync_callback(mount_t mp,void * arg)3271 sync_callback(mount_t mp, void *arg)
3272 {
3273 if ((mp->mnt_flag & MNT_RDONLY) == 0) {
3274 int asyncflag = mp->mnt_flag & MNT_ASYNC;
3275 unsigned waitfor = MNT_NOWAIT;
3276
3277 if (arg) {
3278 waitfor = *(uint32_t*)arg;
3279 }
3280
3281 /* Sanity check for flags - these are the only valid combinations for the flag bits*/
3282 if (waitfor != MNT_WAIT &&
3283 waitfor != (MNT_WAIT | MNT_VOLUME) &&
3284 waitfor != MNT_NOWAIT &&
3285 waitfor != (MNT_NOWAIT | MNT_VOLUME) &&
3286 waitfor != MNT_DWAIT &&
3287 waitfor != (MNT_DWAIT | MNT_VOLUME)) {
3288 panic("Passed inappropriate waitfor %u to "
3289 "sync_callback()", waitfor);
3290 }
3291
3292 mp->mnt_flag &= ~MNT_ASYNC;
3293 (void)VFS_SYNC(mp, waitfor, vfs_context_kernel());
3294 if (asyncflag) {
3295 mp->mnt_flag |= MNT_ASYNC;
3296 }
3297 }
3298
3299 return VFS_RETURNED;
3300 }
3301
3302 /* ARGSUSED */
3303 int
sync(__unused proc_t p,__unused struct sync_args * uap,__unused int32_t * retval)3304 sync(__unused proc_t p, __unused struct sync_args *uap, __unused int32_t *retval)
3305 {
3306 vfs_iterate(LK_NOWAIT, sync_callback, NULL);
3307
3308 if (print_vmpage_stat) {
3309 vm_countdirtypages();
3310 }
3311
3312 #if DIAGNOSTIC
3313 if (syncprt) {
3314 vfs_bufstats();
3315 }
3316 #endif /* DIAGNOSTIC */
3317 return 0;
3318 }
3319
/*
 * Media classes used by sync_thread()'s two-pass sync:
 * "reliable" here means local and not a virtual device
 * (see sync_internal_callback()).
 */
typedef enum {
	SYNC_ALL = 0,                   /* no filtering */
	SYNC_ONLY_RELIABLE_MEDIA = 1,   /* local, non-virtual devices */
	SYNC_ONLY_UNRELIABLE_MEDIA = 2  /* everything else */
} sync_type_t;
3325
3326 static int
sync_internal_callback(mount_t mp,void * arg)3327 sync_internal_callback(mount_t mp, void *arg)
3328 {
3329 if (arg) {
3330 int is_reliable = !(mp->mnt_kern_flag & MNTK_VIRTUALDEV) &&
3331 (mp->mnt_flag & MNT_LOCAL);
3332 sync_type_t sync_type = *((sync_type_t *)arg);
3333
3334 if ((sync_type == SYNC_ONLY_RELIABLE_MEDIA) && !is_reliable) {
3335 return VFS_RETURNED;
3336 } else if ((sync_type == SYNC_ONLY_UNRELIABLE_MEDIA) && is_reliable) {
3337 return VFS_RETURNED;
3338 }
3339 }
3340
3341 (void)sync_callback(mp, NULL);
3342
3343 return VFS_RETURNED;
3344 }
3345
/* State shared with sync_thread(); protected by sync_mtx_lck. */
int sync_thread_state = 0;
/* Upper bound (seconds) that sync_internal() waits for the sync thread. */
int sync_timeout_seconds = 5;

#define SYNC_THREAD_RUN 0x0001          /* a sync pass has been requested */
#define SYNC_THREAD_RUNNING 0x0002      /* sync_thread is currently alive */

#if CONFIG_PHYS_WRITE_ACCT
thread_t pm_sync_thread;        /* identity of the running sync thread */
#endif /* CONFIG_PHYS_WRITE_ACCT */
3355
/*
 * Body of the kernel thread started by sync_internal().  Keeps syncing
 * as long as SYNC_THREAD_RUN requests keep arriving, doing two passes
 * per request: reliable (local, non-virtual) media first, then the rest.
 */
static void
sync_thread(__unused void *arg, __unused wait_result_t wr)
{
	sync_type_t sync_type;
#if CONFIG_PHYS_WRITE_ACCT
	pm_sync_thread = current_thread();
#endif /* CONFIG_PHYS_WRITE_ACCT */

	lck_mtx_lock(&sync_mtx_lck);
	while (sync_thread_state & SYNC_THREAD_RUN) {
		sync_thread_state &= ~SYNC_THREAD_RUN;
		/* Drop the lock while iterating; new requests can arrive meanwhile. */
		lck_mtx_unlock(&sync_mtx_lck);

		sync_type = SYNC_ONLY_RELIABLE_MEDIA;
		vfs_iterate(LK_NOWAIT, sync_internal_callback, &sync_type);
		sync_type = SYNC_ONLY_UNRELIABLE_MEDIA;
		vfs_iterate(LK_NOWAIT, sync_internal_callback, &sync_type);

		lck_mtx_lock(&sync_mtx_lck);
	}
	/*
	 * This wakeup _has_ to be issued before the lock is released otherwise
	 * we may end up waking up a thread in sync_internal which is
	 * expecting a wakeup from a thread it just created and not from this
	 * thread which is about to exit.
	 */
	wakeup(&sync_thread_state);
	sync_thread_state &= ~SYNC_THREAD_RUNNING;
#if CONFIG_PHYS_WRITE_ACCT
	pm_sync_thread = NULL;
#endif /* CONFIG_PHYS_WRITE_ACCT */
	lck_mtx_unlock(&sync_mtx_lck);

	if (print_vmpage_stat) {
		vm_countdirtypages();
	}

#if DIAGNOSTIC
	if (syncprt) {
		vfs_bufstats();
	}
#endif /* DIAGNOSTIC */
}
3399
/* Last time a "sync timed out" message was printed (for rate limiting). */
struct timeval sync_timeout_last_print = {.tv_sec = 0, .tv_usec = 0};
3401
3402 /*
3403 * An in-kernel sync for power management to call.
3404 * This function always returns within sync_timeout seconds.
3405 */
__private_extern__ int
sync_internal(void)
{
	thread_t thd = NULL;
	int error;
	int thread_created = FALSE;
	struct timespec ts = {.tv_sec = sync_timeout_seconds, .tv_nsec = 0};

	lck_mtx_lock(&sync_mtx_lck);
	/* Post a request; spin up the worker thread if it isn't running. */
	sync_thread_state |= SYNC_THREAD_RUN;
	if (!(sync_thread_state & SYNC_THREAD_RUNNING)) {
		int kr;

		sync_thread_state |= SYNC_THREAD_RUNNING;
		kr = kernel_thread_start(sync_thread, NULL, &thd);
		if (kr != KERN_SUCCESS) {
			sync_thread_state &= ~SYNC_THREAD_RUNNING;
			lck_mtx_unlock(&sync_mtx_lck);
			printf("sync_thread failed\n");
			return 0;
		}
		thread_created = TRUE;
	}

	/*
	 * Wait (bounded by ts) for sync_thread's wakeup on
	 * &sync_thread_state; PDROP releases sync_mtx_lck on return.
	 */
	error = msleep((caddr_t)&sync_thread_state, &sync_mtx_lck,
	    (PVFS | PDROP | PCATCH), "sync_thread", &ts);
	if (error) {
		struct timeval now;

		/* Rate-limit the timeout message to one every 120 seconds. */
		microtime(&now);
		if (now.tv_sec - sync_timeout_last_print.tv_sec > 120) {
			printf("sync timed out: %d sec\n", sync_timeout_seconds);
			sync_timeout_last_print.tv_sec = now.tv_sec;
		}
	}

	if (thread_created) {
		thread_deallocate(thd);
	}

	return 0;
} /* end of sync_internal call */
3448
3449 /*
3450 * Change filesystem quotas.
3451 */
3452 #if QUOTA
int
quotactl(proc_t p, struct quotactl_args *uap, __unused int32_t *retval)
{
	struct mount *mp;
	int error, quota_cmd, quota_status = 0;
	caddr_t datap;
	size_t fnamelen;
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();
	struct dqblk my_dqblk = {};

	AUDIT_ARG(uid, uap->uid);
	AUDIT_ARG(cmd, uap->cmd);
	/* Resolve uap->path to find the mount the quota operation targets. */
	NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
	    uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	/* Keep only a mount reference; the vnode itself is not needed further. */
	mp = nd.ni_vp->v_mount;
	mount_ref(mp, 0);
	vnode_put(nd.ni_vp);
	nameidone(&nd);

#if CONFIG_MACF
	error = mac_mount_check_quotactl(ctx, mp, uap->cmd, uap->uid);
	if (error != 0) {
		goto out;
	}
#endif

	/* copyin any data we will need for downstream code */
	quota_cmd = uap->cmd >> SUBCMDSHIFT;

	switch (quota_cmd) {
	case Q_QUOTAON:
		/* uap->arg specifies a file from which to take the quotas */
		fnamelen = MAXPATHLEN;
		datap = zalloc(ZV_NAMEI);
		error = copyinstr(uap->arg, datap, MAXPATHLEN, &fnamelen);
		break;
	case Q_GETQUOTA:
		/* uap->arg is a pointer to a dqblk structure. */
		datap = (caddr_t) &my_dqblk;
		break;
	case Q_SETQUOTA:
	case Q_SETUSE:
		/* uap->arg is a pointer to a dqblk structure. */
		datap = (caddr_t) &my_dqblk;
		if (proc_is64bit(p)) {
			struct user_dqblk my_dqblk64;
			error = copyin(uap->arg, (caddr_t)&my_dqblk64, sizeof(my_dqblk64));
			if (error == 0) {
				/* Translate the 64-bit user layout into the kernel dqblk. */
				munge_dqblk(&my_dqblk, &my_dqblk64, FALSE);
			}
		} else {
			error = copyin(uap->arg, (caddr_t)&my_dqblk, sizeof(my_dqblk));
		}
		break;
	case Q_QUOTASTAT:
		/* uap->arg is a pointer to an integer */
		datap = (caddr_t) &quota_status;
		break;
	default:
		datap = NULL;
		break;
	} /* switch */

	if (error == 0) {
		error = VFS_QUOTACTL(mp, uap->cmd, uap->uid, datap, ctx);
	}

	/* Second switch: copy results out / release temporaries per sub-command. */
	switch (quota_cmd) {
	case Q_QUOTAON:
		if (datap != NULL) {
			zfree(ZV_NAMEI, datap);
		}
		break;
	case Q_GETQUOTA:
		/* uap->arg is a pointer to a dqblk structure we need to copy out to */
		if (error == 0) {
			if (proc_is64bit(p)) {
				struct user_dqblk my_dqblk64;

				memset(&my_dqblk64, 0, sizeof(my_dqblk64));
				munge_dqblk(&my_dqblk, &my_dqblk64, TRUE);
				error = copyout((caddr_t)&my_dqblk64, uap->arg, sizeof(my_dqblk64));
			} else {
				error = copyout(datap, uap->arg, sizeof(struct dqblk));
			}
		}
		break;
	case Q_QUOTASTAT:
		/* uap->arg is a pointer to an integer */
		if (error == 0) {
			error = copyout(datap, uap->arg, sizeof(quota_status));
		}
		break;
	default:
		break;
	} /* switch */

out:
	mount_drop(mp, 0);
	return error;
}
3559 #else
int
quotactl(__unused proc_t p, __unused struct quotactl_args *uap, __unused int32_t *retval)
{
	/* Quota support compiled out: the syscall is not supported. */
	return EOPNOTSUPP;
}
3565 #endif /* QUOTA */
3566
3567 static int
statfs_internal(proc_t p,struct mount * mp,user_addr_t bufp)3568 statfs_internal(proc_t p, struct mount *mp, user_addr_t bufp)
3569 {
3570 int error;
3571 vfs_context_t ctx = vfs_context_current();
3572
3573 #if CONFIG_MACF
3574 error = mac_mount_check_stat(ctx, mp);
3575 if (error != 0) {
3576 return error;
3577 }
3578 #endif
3579
3580 error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT);
3581 if (error != 0) {
3582 return error;
3583 }
3584
3585 return munge_statfs(mp, &mp->mnt_vfsstat, bufp, NULL, IS_64BIT_PROCESS(p), TRUE);
3586 }
3587
3588 /*
3589 * Get filesystem statistics.
3590 *
3591 * Returns: 0 Success
3592 * namei:???
3593 * vfs_update_vfsstat:???
3594 * munge_statfs:EFAULT
3595 */
3596 /* ARGSUSED */
3597 int
statfs(proc_t p,struct statfs_args * uap,__unused int32_t * retval)3598 statfs(proc_t p, struct statfs_args *uap, __unused int32_t *retval)
3599 {
3600 int error;
3601 struct mount *mp;
3602 struct nameidata nd;
3603 vfs_context_t ctx = vfs_context_current();
3604 vnode_t vp;
3605
3606 NDINIT(&nd, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
3607 UIO_USERSPACE, uap->path, ctx);
3608 error = namei(&nd);
3609 if (error != 0) {
3610 return error;
3611 }
3612 vp = nd.ni_vp;
3613 mp = vp->v_mount;
3614 nameidone(&nd);
3615
3616 error = statfs_internal(p, mp, uap->buf);
3617 vnode_put(vp);
3618
3619 return error;
3620 }
3621
3622 /*
3623 * Get filesystem statistics.
3624 */
3625 /* ARGSUSED */
int
fstatfs(proc_t p, struct fstatfs_args *uap, __unused int32_t *retval)
{
	int error;
	vnode_t vp = NULL;
	struct mount *mp;

	AUDIT_ARG(fd, uap->fd);

	/* vp stays NULL when file_vnode() fails, so "out" knows not to file_drop. */
	if ((error = file_vnode(uap->fd, &vp)) ||
	    (error = vnode_getwithref(vp))) {
		goto out;
	}

	AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);

	mp = vp->v_mount;
	if (!mp) {
		error = EBADF;
		goto out_vnode;
	}

	error = statfs_internal(p, mp, uap->buf);

out_vnode:
	/* Release the iocount taken by vnode_getwithref(). */
	vnode_put(vp);

out:
	if (vp != NULL) {
		file_drop(uap->fd);
	}

	return error;
}
3660
3661 void
vfs_get_statfs64(struct mount * mp,struct statfs64 * sfs)3662 vfs_get_statfs64(struct mount *mp, struct statfs64 *sfs)
3663 {
3664 struct vfsstatfs *vsfs = &mp->mnt_vfsstat;
3665
3666 bzero(sfs, sizeof(*sfs));
3667
3668 sfs->f_bsize = vsfs->f_bsize;
3669 sfs->f_iosize = (int32_t)vsfs->f_iosize;
3670 sfs->f_blocks = vsfs->f_blocks;
3671 sfs->f_bfree = vsfs->f_bfree;
3672 sfs->f_bavail = vsfs->f_bavail;
3673 sfs->f_files = vsfs->f_files;
3674 sfs->f_ffree = vsfs->f_ffree;
3675 sfs->f_fsid = vsfs->f_fsid;
3676 sfs->f_owner = vsfs->f_owner;
3677 sfs->f_type = mp->mnt_vtable->vfc_typenum;
3678 sfs->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
3679 sfs->f_fssubtype = vsfs->f_fssubtype;
3680 sfs->f_flags_ext = vfs_getextflags(mp);
3681 vfs_getfstypename(mp, sfs->f_fstypename, MFSTYPENAMELEN);
3682 strlcpy(&sfs->f_mntonname[0], &vsfs->f_mntonname[0], MAXPATHLEN);
3683 strlcpy(&sfs->f_mntfromname[0], &vsfs->f_mntfromname[0], MAXPATHLEN);
3684 }
3685
3686 /*
3687 * Get file system statistics in 64-bit mode
3688 */
3689 int
statfs64(__unused struct proc * p,struct statfs64_args * uap,__unused int32_t * retval)3690 statfs64(__unused struct proc *p, struct statfs64_args *uap, __unused int32_t *retval)
3691 {
3692 struct mount *mp;
3693 int error;
3694 struct nameidata *ndp;
3695 struct statfs64 *sfsp;
3696 vfs_context_t ctxp = vfs_context_current();
3697 vnode_t vp;
3698 struct {
3699 struct nameidata nd;
3700 struct statfs64 sfs;
3701 } *__nameidata_statfs64;
3702
3703 __nameidata_statfs64 = kalloc_type(typeof(*__nameidata_statfs64),
3704 Z_WAITOK);
3705 ndp = &__nameidata_statfs64->nd;
3706
3707 NDINIT(ndp, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
3708 UIO_USERSPACE, uap->path, ctxp);
3709 error = namei(ndp);
3710 if (error != 0) {
3711 goto out;
3712 }
3713 vp = ndp->ni_vp;
3714 mp = vp->v_mount;
3715 nameidone(ndp);
3716
3717 #if CONFIG_MACF
3718 error = mac_mount_check_stat(ctxp, mp);
3719 if (error != 0) {
3720 vnode_put(vp);
3721 goto out;
3722 }
3723 #endif
3724
3725 error = vfs_update_vfsstat(mp, ctxp, VFS_USER_EVENT);
3726 if (error != 0) {
3727 vnode_put(vp);
3728 goto out;
3729 }
3730
3731 sfsp = &__nameidata_statfs64->sfs;
3732 vfs_get_statfs64(mp, sfsp);
3733 if ((mp->mnt_kern_flag & MNTK_SYSTEMDATA) &&
3734 (p->p_vfs_iopolicy & P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME)) {
3735 /* This process does not want to see a seperate data volume mountpoint */
3736 strlcpy(&sfsp->f_mntonname[0], "/", sizeof("/"));
3737 }
3738 error = copyout(sfsp, uap->buf, sizeof(*sfsp));
3739 vnode_put(vp);
3740
3741 out:
3742 kfree_type(typeof(*__nameidata_statfs64), __nameidata_statfs64);
3743
3744 return error;
3745 }
3746
3747 /*
3748 * Get file system statistics in 64-bit mode
3749 */
3750 int
fstatfs64(__unused struct proc * p,struct fstatfs64_args * uap,__unused int32_t * retval)3751 fstatfs64(__unused struct proc *p, struct fstatfs64_args *uap, __unused int32_t *retval)
3752 {
3753 struct vnode *vp;
3754 struct mount *mp;
3755 struct statfs64 sfs;
3756 int error;
3757
3758 AUDIT_ARG(fd, uap->fd);
3759
3760 if ((error = file_vnode(uap->fd, &vp))) {
3761 return error;
3762 }
3763
3764 error = vnode_getwithref(vp);
3765 if (error) {
3766 file_drop(uap->fd);
3767 return error;
3768 }
3769
3770 AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
3771
3772 mp = vp->v_mount;
3773 if (!mp) {
3774 error = EBADF;
3775 goto out;
3776 }
3777
3778 #if CONFIG_MACF
3779 error = mac_mount_check_stat(vfs_context_current(), mp);
3780 if (error != 0) {
3781 goto out;
3782 }
3783 #endif
3784
3785 if ((error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)) != 0) {
3786 goto out;
3787 }
3788
3789 vfs_get_statfs64(mp, &sfs);
3790 if ((mp->mnt_kern_flag & MNTK_SYSTEMDATA) &&
3791 (p->p_vfs_iopolicy & P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME)) {
3792 /* This process does not want to see a seperate data volume mountpoint */
3793 strlcpy(&sfs.f_mntonname[0], "/", sizeof("/"));
3794 }
3795 error = copyout(&sfs, uap->buf, sizeof(sfs));
3796
3797 out:
3798 file_drop(uap->fd);
3799 vnode_put(vp);
3800
3801 return error;
3802 }
3803
/*
 * Iteration state shared between getfsstat()/getfsstat64() and their
 * vfs_iterate() callbacks.
 */
struct getfsstat_struct {
	user_addr_t sfsp;       /* user buffer cursor for statfs output */
	user_addr_t *mp;        /* optional per-mount MAC label buffers, or NULL */
	int count;              /* mounts visited so far */
	int maxcount;           /* capacity of the user buffer, in entries */
	int flags;              /* caller's MNT_WAIT/MNT_NOWAIT/... flags */
	int error;              /* first error encountered, if any */
};
3812
3813
/*
 * vfs_iterate() callback for __mac_getfsstat(): copies one statfs entry
 * (and optionally its MAC label) out to user space per mount visited.
 */
static int
getfsstat_callback(mount_t mp, void * arg)
{
	struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
	struct vfsstatfs *sp;
	int error, my_size;
	vfs_context_t ctx = vfs_context_current();

	/* Only copy data out while room remains in the user buffer. */
	if (fstp->sfsp && fstp->count < fstp->maxcount) {
#if CONFIG_MACF
		error = mac_mount_check_stat(ctx, mp);
		if (error != 0) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
#endif
		sp = &mp->mnt_vfsstat;
		/*
		 * If MNT_NOWAIT is specified, do not refresh the
		 * fsstat cache. MNT_WAIT/MNT_DWAIT overrides MNT_NOWAIT.
		 */
		if ((mp->mnt_lflag & MNT_LDEAD) ||
		    (((fstp->flags & MNT_NOWAIT) == 0 || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
		    (!(mp->mnt_lflag & MNT_LUNMOUNT)) &&
		    (error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT)))) {
			KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
			return VFS_RETURNED;
		}

		/*
		 * Need to handle LP64 version of struct statfs
		 */
		error = munge_statfs(mp, sp, fstp->sfsp, &my_size, IS_64BIT_PROCESS(vfs_context_proc(ctx)), FALSE);
		if (error) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
		/* Advance the user cursor by the size munge_statfs() wrote. */
		fstp->sfsp += my_size;

		if (fstp->mp) {
#if CONFIG_MACF
			error = mac_mount_label_get(mp, *fstp->mp);
			if (error) {
				fstp->error = error;
				return VFS_RETURNED_DONE;
			}
#endif
			fstp->mp++;
		}
	}
	/* Count every mount, including those we had no room to report. */
	fstp->count++;
	return VFS_RETURNED;
}
3867
3868 /*
3869 * Get statistics on all filesystems.
3870 */
3871 int
getfsstat(__unused proc_t p,struct getfsstat_args * uap,int * retval)3872 getfsstat(__unused proc_t p, struct getfsstat_args *uap, int *retval)
3873 {
3874 struct __mac_getfsstat_args muap;
3875
3876 muap.buf = uap->buf;
3877 muap.bufsize = uap->bufsize;
3878 muap.mac = USER_ADDR_NULL;
3879 muap.macsize = 0;
3880 muap.flags = uap->flags;
3881
3882 return __mac_getfsstat(p, &muap, retval);
3883 }
3884
3885 /*
3886 * __mac_getfsstat: Get MAC-related file system statistics
3887 *
3888 * Parameters: p (ignored)
3889 * uap User argument descriptor (see below)
3890 * retval Count of file system statistics (N stats)
3891 *
3892 * Indirect: uap->bufsize Buffer size
3893 * uap->macsize MAC info size
3894 * uap->buf Buffer where information will be returned
3895 * uap->mac MAC info
3896 * uap->flags File system flags
3897 *
3898 *
3899 * Returns: 0 Success
3900 * !0 Not success
3901 *
3902 */
int
__mac_getfsstat(__unused proc_t p, struct __mac_getfsstat_args *uap, int *retval)
{
	user_addr_t sfsp;
	user_addr_t *mp;
	size_t count, maxcount, bufsize, macsize;
	struct getfsstat_struct fst;

	/* Reject sizes that don't fit in an int before using them below. */
	if ((unsigned)uap->bufsize > INT_MAX || (unsigned)uap->macsize > INT_MAX) {
		return EINVAL;
	}

	bufsize = (size_t) uap->bufsize;
	macsize = (size_t) uap->macsize;

	/* Entry size in the user buffer depends on the caller's ABI. */
	if (IS_64BIT_PROCESS(p)) {
		maxcount = bufsize / sizeof(struct user64_statfs);
	} else {
		maxcount = bufsize / sizeof(struct user32_statfs);
	}
	sfsp = uap->buf;
	count = 0;

	mp = NULL;

#if CONFIG_MACF
	if (uap->mac != USER_ADDR_NULL) {
		u_int32_t *mp0;
		int error;
		unsigned int i;

		/* One MAC label pointer is required per statfs slot. */
		count = (macsize / (IS_64BIT_PROCESS(p) ? 8 : 4));
		if (count != maxcount) {
			return EINVAL;
		}

		/* Copy in the array */
		mp0 = kalloc_data(macsize, Z_WAITOK);
		if (mp0 == NULL) {
			return ENOMEM;
		}

		error = copyin(uap->mac, mp0, macsize);
		if (error) {
			kfree_data(mp0, macsize);
			return error;
		}

		/* Normalize to an array of user_addr_t */
		mp = kalloc_data(count * sizeof(user_addr_t), Z_WAITOK);
		if (mp == NULL) {
			kfree_data(mp0, macsize);
			return ENOMEM;
		}

		/* Widen 32-bit user pointers; 64-bit ones are used as-is. */
		for (i = 0; i < count; i++) {
			if (IS_64BIT_PROCESS(p)) {
				mp[i] = ((user_addr_t *)mp0)[i];
			} else {
				mp[i] = (user_addr_t)mp0[i];
			}
		}
		kfree_data(mp0, macsize);
	}
#endif


	fst.sfsp = sfsp;
	fst.mp = mp;
	fst.flags = uap->flags;
	fst.count = 0;
	fst.error = 0;
	fst.maxcount = (int)maxcount;


	/* Visit every mount, including those in the middle of an unmount. */
	vfs_iterate(VFS_ITERATE_NOSKIP_UNMOUNT, getfsstat_callback, &fst);

	if (mp) {
		kfree_data(mp, count * sizeof(user_addr_t));
	}

	if (fst.error) {
		KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
		return fst.error;
	}

	/* Report how many entries were written (capped at buffer capacity). */
	if (fst.sfsp && fst.count > fst.maxcount) {
		*retval = fst.maxcount;
	} else {
		*retval = fst.count;
	}
	return 0;
}
3996
3997 static int
getfsstat64_callback(mount_t mp,void * arg)3998 getfsstat64_callback(mount_t mp, void * arg)
3999 {
4000 struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
4001 struct vfsstatfs *sp;
4002 struct statfs64 sfs;
4003 int error;
4004
4005 if (fstp->sfsp && fstp->count < fstp->maxcount) {
4006 #if CONFIG_MACF
4007 error = mac_mount_check_stat(vfs_context_current(), mp);
4008 if (error != 0) {
4009 fstp->error = error;
4010 return VFS_RETURNED_DONE;
4011 }
4012 #endif
4013 sp = &mp->mnt_vfsstat;
4014 /*
4015 * If MNT_NOWAIT is specified, do not refresh the fsstat
4016 * cache. MNT_WAIT overrides MNT_NOWAIT.
4017 *
4018 * We treat MNT_DWAIT as MNT_WAIT for all instances of
4019 * getfsstat, since the constants are out of the same
4020 * namespace.
4021 */
4022 if ((mp->mnt_lflag & MNT_LDEAD) ||
4023 ((((fstp->flags & MNT_NOWAIT) == 0) || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
4024 (!(mp->mnt_lflag & MNT_LUNMOUNT)) &&
4025 (error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)))) {
4026 KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
4027 return VFS_RETURNED;
4028 }
4029
4030 vfs_get_statfs64(mp, &sfs);
4031 error = copyout(&sfs, fstp->sfsp, sizeof(sfs));
4032 if (error) {
4033 fstp->error = error;
4034 return VFS_RETURNED_DONE;
4035 }
4036 fstp->sfsp += sizeof(sfs);
4037 }
4038 fstp->count++;
4039 return VFS_RETURNED;
4040 }
4041
4042 /*
4043 * Get statistics on all file systems in 64 bit mode.
4044 */
4045 int
getfsstat64(__unused proc_t p,struct getfsstat64_args * uap,int * retval)4046 getfsstat64(__unused proc_t p, struct getfsstat64_args *uap, int *retval)
4047 {
4048 user_addr_t sfsp;
4049 int count, maxcount;
4050 struct getfsstat_struct fst;
4051
4052 maxcount = uap->bufsize / sizeof(struct statfs64);
4053
4054 sfsp = uap->buf;
4055 count = 0;
4056
4057 fst.sfsp = sfsp;
4058 fst.flags = uap->flags;
4059 fst.count = 0;
4060 fst.error = 0;
4061 fst.maxcount = maxcount;
4062
4063 vfs_iterate(VFS_ITERATE_NOSKIP_UNMOUNT, getfsstat64_callback, &fst);
4064
4065 if (fst.error) {
4066 KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
4067 return fst.error;
4068 }
4069
4070 if (fst.sfsp && fst.count > fst.maxcount) {
4071 *retval = fst.maxcount;
4072 } else {
4073 *retval = fst.count;
4074 }
4075
4076 return 0;
4077 }
4078
4079 /*
4080 * gets the associated vnode with the file descriptor passed.
4081 * as input
4082 *
4083 * INPUT
4084 * ctx - vfs context of caller
4085 * fd - file descriptor for which vnode is required.
4086 * vpp - Pointer to pointer to vnode to be returned.
4087 *
4088 * The vnode is returned with an iocount so any vnode obtained
4089 * by this call needs a vnode_put
4090 *
4091 */
4092 int
vnode_getfromfd(vfs_context_t ctx,int fd,vnode_t * vpp)4093 vnode_getfromfd(vfs_context_t ctx, int fd, vnode_t *vpp)
4094 {
4095 int error;
4096 vnode_t vp;
4097 struct fileproc *fp;
4098 proc_t p = vfs_context_proc(ctx);
4099
4100 *vpp = NULLVP;
4101
4102 error = fp_getfvp(p, fd, &fp, &vp);
4103 if (error) {
4104 return error;
4105 }
4106
4107 error = vnode_getwithref(vp);
4108 if (error) {
4109 (void)fp_drop(p, fd, fp, 0);
4110 return error;
4111 }
4112
4113 (void)fp_drop(p, fd, fp, 0);
4114 *vpp = vp;
4115 return error;
4116 }
4117
4118 /*
4119 * Wrapper function around namei to start lookup from a directory
4120 * specified by a file descriptor ni_dirfd.
4121 *
4122 * In addition to all the errors returned by namei, this call can
4123 * return ENOTDIR if the file descriptor does not refer to a directory.
4124 * and EBADF if the file descriptor is not valid.
4125 */
int
nameiat(struct nameidata *ndp, int dirfd)
{
	/*
	 * dirfd matters only for a fresh lookup of a relative path; a
	 * continued lookup or a caller-supplied dvp (USEDVP) takes priority.
	 */
	if ((dirfd != AT_FDCWD) &&
	    !(ndp->ni_flag & NAMEI_CONTLOOKUP) &&
	    !(ndp->ni_cnd.cn_flags & USEDVP)) {
		int error = 0;
		char c;

		/* Peek at the first byte of the path to detect absolute paths. */
		if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
			error = copyin(ndp->ni_dirp, &c, sizeof(char));
			if (error) {
				return error;
			}
		} else {
			c = *((char *)(ndp->ni_dirp));
		}

		if (c != '/') {
			vnode_t dvp_at;

			error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
			    &dvp_at);
			if (error) {
				return error;
			}

			if (vnode_vtype(dvp_at) != VDIR) {
				vnode_put(dvp_at);
				return ENOTDIR;
			}

			/* Start the lookup at dirfd's directory instead of the CWD. */
			ndp->ni_dvp = dvp_at;
			ndp->ni_cnd.cn_flags |= USEDVP;
			error = namei(ndp);
			ndp->ni_cnd.cn_flags &= ~USEDVP;
			vnode_put(dvp_at);
			return error;
		}
	}

	return namei(ndp);
}
4169
4170 /*
4171 * Change current working directory to a given file descriptor.
4172 */
4173 /* ARGSUSED */
int
fchdir(proc_t p, vfs_context_t ctx, int fd, bool per_thread)
{
	vnode_t vp;
	vnode_t tdp;
	vnode_t tvp;
	struct mount *mp;
	int error, should_put = 1;

	AUDIT_ARG(fd, fd);
	if (per_thread && fd == -1) {
		/*
		 * Switching back from per-thread to per process CWD; verify we
		 * in fact have one before proceeding. The only success case
		 * for this code path is to return 0 preemptively after zapping
		 * the thread structure contents.
		 */
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = NULLVP;
			if (tvp != NULLVP) {
				vnode_rele(tvp);
				return 0;
			}
		}
		return EBADF;
	}

	if ((error = file_vnode(fd, &vp))) {
		return error;
	}
	if ((error = vnode_getwithref(vp))) {
		file_drop(fd);
		return error;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	/* A working directory must be a directory. */
	if (vp->v_type != VDIR) {
		error = ENOTDIR;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_chdir(ctx, vp);
	if (error) {
		goto out;
	}
#endif
	error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
	if (error) {
		goto out;
	}

	/*
	 * If something is mounted on this directory, descend to the root
	 * of the topmost mounted filesystem instead.
	 */
	while (!error && (mp = vp->v_mountedhere) != NULL) {
		if (vfs_busy(mp, LK_NOWAIT)) {
			error = EACCES;
			goto out;
		}
		error = VFS_ROOT(mp, &tdp, ctx);
		vfs_unbusy(mp);
		if (error) {
			break;
		}
		vnode_put(vp);
		vp = tdp;
	}
	if (error) {
		goto out;
	}
	/* Trade the iocount for a long-lived usecount before publishing vp. */
	if ((error = vnode_ref(vp))) {
		goto out;
	}
	vnode_put(vp);
	should_put = 0;

	if (per_thread) {
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = vp;
			OSBitOrAtomic(P_THCWD, &p->p_flag);
		} else {
			vnode_rele(vp);
			error = ENOENT;
			goto out;
		}
	} else {
		/* Publish the new CWD under the proc's dirs and fd locks. */
		proc_dirs_lock_exclusive(p);
		proc_fdlock(p);
		tvp = p->p_fd.fd_cdir;
		p->p_fd.fd_cdir = vp;
		proc_fdunlock(p);
		proc_dirs_unlock_exclusive(p);
	}

	/* Release the previous working directory, if there was one. */
	if (tvp) {
		vnode_rele(tvp);
	}

out:
	if (should_put) {
		vnode_put(vp);
	}
	file_drop(fd);

	return error;
}
4285
4286 int
sys_fchdir(proc_t p,struct fchdir_args * uap,__unused int32_t * retval)4287 sys_fchdir(proc_t p, struct fchdir_args *uap, __unused int32_t *retval)
4288 {
4289 return fchdir(p, vfs_context_current(), uap->fd, false);
4290 }
4291
4292 int
__pthread_fchdir(proc_t p,struct __pthread_fchdir_args * uap,__unused int32_t * retval)4293 __pthread_fchdir(proc_t p, struct __pthread_fchdir_args *uap, __unused int32_t *retval)
4294 {
4295 return fchdir(p, vfs_context_current(), uap->fd, true);
4296 }
4297
4298
4299 /*
4300 * Change current working directory (".").
4301 *
4302 * Returns: 0 Success
4303 * change_dir:ENOTDIR
4304 * change_dir:???
4305 * vnode_ref:ENOENT No such file or directory
4306 */
4307 /* ARGSUSED */
int
chdir_internal(proc_t p, vfs_context_t ctx, struct nameidata *ndp, int per_thread)
{
	int error;
	vnode_t tvp;

	/* Look up and authorize the new directory (returns with an iocount). */
	error = change_dir(ndp, ctx);
	if (error) {
		return error;
	}
	/* Take a usecount so the vnode survives once the iocount is dropped. */
	if ((error = vnode_ref(ndp->ni_vp))) {
		vnode_put(ndp->ni_vp);
		return error;
	}
	/*
	 * drop the iocount we picked up in change_dir
	 */
	vnode_put(ndp->ni_vp);

	if (per_thread) {
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = ndp->ni_vp;
			OSBitOrAtomic(P_THCWD, &p->p_flag);
		} else {
			vnode_rele(ndp->ni_vp);
			return ENOENT;
		}
	} else {
		/* Publish the new CWD under the proc's dirs and fd locks. */
		proc_dirs_lock_exclusive(p);
		proc_fdlock(p);
		tvp = p->p_fd.fd_cdir;
		p->p_fd.fd_cdir = ndp->ni_vp;
		proc_fdunlock(p);
		proc_dirs_unlock_exclusive(p);
	}

	/* Release the previous working directory, if any. */
	if (tvp) {
		vnode_rele(tvp);
	}

	return 0;
}
4353
4354
4355 /*
4356 * Change current working directory (".").
4357 *
4358 * Returns: 0 Success
4359 * chdir_internal:ENOTDIR
4360 * chdir_internal:ENOENT No such file or directory
4361 * chdir_internal:???
4362 */
4363 /* ARGSUSED */
4364 static int
common_chdir(proc_t p,struct chdir_args * uap,int per_thread)4365 common_chdir(proc_t p, struct chdir_args *uap, int per_thread)
4366 {
4367 struct nameidata nd;
4368 vfs_context_t ctx = vfs_context_current();
4369
4370 NDINIT(&nd, LOOKUP, OP_CHDIR, FOLLOW | AUDITVNPATH1,
4371 UIO_USERSPACE, uap->path, ctx);
4372
4373 return chdir_internal(p, ctx, &nd, per_thread);
4374 }
4375
4376
4377 /*
4378 * chdir
4379 *
4380 * Change current working directory (".") for the entire process
4381 *
4382 * Parameters: p Process requesting the call
4383 * uap User argument descriptor (see below)
4384 * retval (ignored)
4385 *
4386 * Indirect parameters: uap->path Directory path
4387 *
4388 * Returns: 0 Success
4389 * common_chdir: ENOTDIR
4390 * common_chdir: ENOENT No such file or directory
4391 * common_chdir: ???
4392 *
4393 */
4394 int
sys_chdir(proc_t p,struct chdir_args * uap,__unused int32_t * retval)4395 sys_chdir(proc_t p, struct chdir_args *uap, __unused int32_t *retval)
4396 {
4397 return common_chdir(p, (void *)uap, 0);
4398 }
4399
4400 /*
4401 * __pthread_chdir
4402 *
4403 * Change current working directory (".") for a single thread
4404 *
4405 * Parameters: p Process requesting the call
4406 * uap User argument descriptor (see below)
4407 * retval (ignored)
4408 *
4409 * Indirect parameters: uap->path Directory path
4410 *
4411 * Returns: 0 Success
4412 * common_chdir: ENOTDIR
4413 * common_chdir: ENOENT No such file or directory
4414 * common_chdir: ???
4415 *
4416 */
4417 int
__pthread_chdir(proc_t p,struct __pthread_chdir_args * uap,__unused int32_t * retval)4418 __pthread_chdir(proc_t p, struct __pthread_chdir_args *uap, __unused int32_t *retval)
4419 {
4420 return common_chdir(p, (void *)uap, 1);
4421 }
4422
4423
4424 /*
4425 * Change notion of root (``/'') directory.
4426 */
4427 /* ARGSUSED */
int
chroot(proc_t p, struct chroot_args *uap, __unused int32_t *retval)
{
	struct filedesc *fdp = &p->p_fd;
	int error;
	struct nameidata nd;
	vnode_t tvp;
	vfs_context_t ctx = vfs_context_current();

	/* Changing the root directory requires superuser. */
	if ((error = suser(kauth_cred_get(), &p->p_acflag))) {
		return error;
	}

	/* Resolve the path and verify it is a searchable directory. */
	NDINIT(&nd, LOOKUP, OP_CHROOT, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	error = change_dir(&nd, ctx);
	if (error) {
		return error;
	}

#if CONFIG_MACF
	error = mac_vnode_check_chroot(ctx, nd.ni_vp,
	    &nd.ni_cnd);
	if (error) {
		vnode_put(nd.ni_vp);
		return error;
	}
#endif

	/* Trade the iocount from change_dir for a long-term usecount. */
	if ((error = vnode_ref(nd.ni_vp))) {
		vnode_put(nd.ni_vp);
		return error;
	}
	vnode_put(nd.ni_vp);

	/*
	 * This lock provides the guarantee that as long as you hold the lock
	 * fdp->fd_rdir has a usecount on it. This is used to take an iocount
	 * on a referenced vnode in namei when determining the rootvnode for
	 * a process.
	 */
	/* needed for synchronization with lookup */
	proc_dirs_lock_exclusive(p);
	/* needed for setting the flag and other activities on the fd itself */
	proc_fdlock(p);
	tvp = fdp->fd_rdir;
	fdp->fd_rdir = nd.ni_vp;
	fdt_flag_set(fdp, FD_CHROOT);
	proc_fdunlock(p);
	proc_dirs_unlock_exclusive(p);

	/* Drop the usecount on the previous root, if there was one. */
	if (tvp != NULL) {
		vnode_rele(tvp);
	}

	return 0;
}
4485
4486 #define PATHSTATICBUFLEN 256
4487 #define PIVOT_ROOT_ENTITLEMENT \
4488 "com.apple.private.vfs.pivot-root"
4489
4490 #if defined(XNU_TARGET_OS_OSX)
int
pivot_root(proc_t p, struct pivot_root_args *uap, __unused int *retval)
{
	int error;
	/* Stack buffers cover the common case; heap buffers handle long paths. */
	char new_rootfs_path_before[PATHSTATICBUFLEN] = {0};
	char old_rootfs_path_after[PATHSTATICBUFLEN] = {0};
	char *new_rootfs_path_before_buf = NULL;
	char *old_rootfs_path_after_buf = NULL;
	char *incoming = NULL;
	char *outgoing = NULL;
	vnode_t incoming_rootvp = NULLVP;
	size_t bytes_copied;

	/*
	 * XXX : Additional restrictions needed
	 * - perhaps callable only once.
	 */
	if ((error = suser(kauth_cred_get(), &p->p_acflag))) {
		return error;
	}

	/*
	 * pivot_root can be executed by launchd only.
	 * Enforce entitlement.
	 */
	if ((proc_getpid(p) != 1) || !IOCurrentTaskHasEntitlement(PIVOT_ROOT_ENTITLEMENT)) {
		return EPERM;
	}

	/* Copy in the new root path; retry with a MAXPATHLEN heap buffer if long. */
	error = copyinstr(uap->new_rootfs_path_before, &new_rootfs_path_before[0], PATHSTATICBUFLEN, &bytes_copied);
	if (error == ENAMETOOLONG) {
		new_rootfs_path_before_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
		error = copyinstr(uap->new_rootfs_path_before, new_rootfs_path_before_buf, MAXPATHLEN, &bytes_copied);
	}

	if (error) {
		goto out;
	}

	/* Same for the path where the old root will live after the switch. */
	error = copyinstr(uap->old_rootfs_path_after, &old_rootfs_path_after[0], PATHSTATICBUFLEN, &bytes_copied);
	if (error == ENAMETOOLONG) {
		old_rootfs_path_after_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
		error = copyinstr(uap->old_rootfs_path_after, old_rootfs_path_after_buf, MAXPATHLEN, &bytes_copied);
	}
	if (error) {
		goto out;
	}

	/* Pick whichever buffer actually holds each path. */
	if (new_rootfs_path_before_buf) {
		incoming = new_rootfs_path_before_buf;
	} else {
		incoming = &new_rootfs_path_before[0];
	}

	if (old_rootfs_path_after_buf) {
		outgoing = old_rootfs_path_after_buf;
	} else {
		outgoing = &old_rootfs_path_after[0];
	}

	/*
	 * The proposed incoming FS MUST be authenticated (i.e. not a chunklist DMG).
	 * Userland is not allowed to pivot to an image.
	 */
	error = vnode_lookup(incoming, 0, &incoming_rootvp, vfs_context_kernel());
	if (error) {
		goto out;
	}
	error = VNOP_IOCTL(incoming_rootvp, FSIOC_KERNEL_ROOTAUTH, NULL, 0, vfs_context_kernel());
	if (error) {
		goto out;
	}

	error = vfs_switch_root(incoming, outgoing, VFSSR_VIRTUALDEV_PROHIBITED);

out:
	/* Drop the iocount from vnode_lookup, if we got that far. */
	if (incoming_rootvp != NULLVP) {
		vnode_put(incoming_rootvp);
		incoming_rootvp = NULLVP;
	}

	if (old_rootfs_path_after_buf) {
		zfree(ZV_NAMEI, old_rootfs_path_after_buf);
	}

	if (new_rootfs_path_before_buf) {
		zfree(ZV_NAMEI, new_rootfs_path_before_buf);
	}

	return error;
}
4582 #else
int
pivot_root(proc_t p, __unused struct pivot_root_args *uap, int *retval)
{
	/* pivot_root is implemented only when XNU_TARGET_OS_OSX is set. */
	return nosys(p, NULL, retval);
}
4588 #endif /* XNU_TARGET_OS_OSX */
4589
4590 /*
4591 * Common routine for chroot and chdir.
4592 *
4593 * Returns: 0 Success
4594 * ENOTDIR Not a directory
4595 * namei:??? [anything namei can return]
4596 * vnode_authorize:??? [anything vnode_authorize can return]
4597 */
static int
change_dir(struct nameidata *ndp, vfs_context_t ctx)
{
	vnode_t vp;
	int error;

	if ((error = namei(ndp))) {
		return error;
	}
	nameidone(ndp);
	vp = ndp->ni_vp;

	/* The target must be a directory. */
	if (vp->v_type != VDIR) {
		vnode_put(vp);
		return ENOTDIR;
	}

#if CONFIG_MACF
	error = mac_vnode_check_chdir(ctx, vp);
	if (error) {
		vnode_put(vp);
		return error;
	}
#endif

	/* The caller must have search permission on the directory. */
	error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
	if (error) {
		vnode_put(vp);
		return error;
	}

	/* Success: ndp->ni_vp is returned with an iocount the caller must drop. */
	return error;
}
4631
4632 /*
4633 * Free the vnode data (for directories) associated with the file glob.
4634 */
4635 struct fd_vn_data *
fg_vn_data_alloc(void)4636 fg_vn_data_alloc(void)
4637 {
4638 struct fd_vn_data *fvdata;
4639
4640 /* Allocate per fd vnode data */
4641 fvdata = kalloc_type(struct fd_vn_data, Z_WAITOK | Z_ZERO);
4642 lck_mtx_init(&fvdata->fv_lock, &fd_vn_lck_grp, &fd_vn_lck_attr);
4643 return fvdata;
4644 }
4645
4646 /*
4647 * Free the vnode data (for directories) associated with the file glob.
4648 */
4649 void
fg_vn_data_free(void * fgvndata)4650 fg_vn_data_free(void *fgvndata)
4651 {
4652 struct fd_vn_data *fvdata = (struct fd_vn_data *)fgvndata;
4653
4654 kfree_data(fvdata->fv_buf, fvdata->fv_bufallocsiz);
4655 lck_mtx_destroy(&fvdata->fv_lock, &fd_vn_lck_grp);
4656 kfree_type(struct fd_vn_data, fvdata);
4657 }
4658
4659 /*
4660 * Check permissions, allocate an open file structure,
4661 * and call the device open routine if any.
4662 *
4663 * Returns: 0 Success
4664 * EINVAL
4665 * EINTR
4666 * falloc:ENFILE
4667 * falloc:EMFILE
4668 * falloc:ENOMEM
4669 * vn_open_auth:???
4670 * dupfdopen:???
4671 * VNOP_ADVLOCK:???
4672 * vnode_setsize:???
4673 *
4674 * XXX Need to implement uid, gid
4675 */
int
open1(vfs_context_t ctx, struct nameidata *ndp, int uflags,
    struct vnode_attr *vap, fp_initfn_t fp_init, void *initarg, int32_t *retval, int authfd)
{
	proc_t p = vfs_context_proc(ctx);
	kauth_cred_t p_cred = current_cached_proc_cred(PROC_NULL);
	uthread_t uu = get_bsdthread_info(vfs_context_thread(ctx));
	struct fileproc *fp;
	vnode_t vp;
	int flags, oflags, amode;
	int type, indx, error;
	struct vfs_context context;
	vnode_t authvp = NULLVP;

	oflags = uflags;

	amode = oflags & O_ACCMODE;
	/*
	 * Because O_RDONLY is 0, it is not possible to distinguish between
	 * O_EXEC | O_RDONLY and O_EXEC, therefore FEXEC/FSEARCH can't be set together
	 * with FREAD/FWRITE.
	 */
	if ((amode == O_ACCMODE) || (amode && (oflags & O_EXEC))) {
		return EINVAL;
	}

	/*
	 * Convert open(2) flags to in-kernel F-flags.  Clear the encryption
	 * bits here; vn_open_auth() takes &flags and can set them before we
	 * record fg_flag below.
	 */
	flags = FFLAGS(uflags);
	CLR(flags, FENCRYPTED);
	CLR(flags, FUNENCRYPTED);

	AUDIT_ARG(fflags, oflags);
	AUDIT_ARG(mode, vap->va_mode);

	/* Allocate the fileproc and reserve a descriptor slot (indx). */
	if ((error = falloc_withinit(p, p_cred, ctx, &fp, &indx, fp_init, initarg)) != 0) {
		return error;
	}
	if (flags & O_CLOEXEC) {
		fp->fp_flags |= FP_CLOEXEC;
	}
	if (flags & O_CLOFORK) {
		fp->fp_flags |= FP_CLOFORK;
	}

	/* setup state to recognize when fdesc_open was called */
	uu->uu_dupfd = -1;

	/*
	 * Disable read/write access if file is opened with O_EVTONLY and
	 * the process has requested to deny read/write access.
	 */
	if ((flags & O_EVTONLY) && proc_disallow_rw_for_o_evtonly(p)) {
		flags &= ~(FREAD | FWRITE);
	}

	/* Resolve the optional authorization fd to a vnode for vn_open_auth. */
	if (authfd != AUTH_OPEN_NOAUTHFD) {
		error = vnode_getfromfd(ctx, authfd, &authvp);
		if (error) {
			fp_free(p, indx, fp);
			return error;
		}
	}

	if ((error = vn_open_auth(ndp, &flags, vap, authvp))) {
		if (authvp != NULLVP) {
			vnode_put(authvp);
		}
		/*
		 * ENODEV/ENXIO with uu_dupfd >= 0 means fdesc_open was called
		 * (see the uu_dupfd sentinel above): duplicate that
		 * descriptor into the reserved slot instead of failing.
		 */
		if ((error == ENODEV || error == ENXIO) && (uu->uu_dupfd >= 0)) {
			if ((error = dupfdopen(p, indx, uu->uu_dupfd, flags, error)) == 0) {
				*retval = indx;
				return 0;
			}
		}
		if (error == ERESTART) {
			error = EINTR;
		}
		fp_free(p, indx, fp);
		return error;
	}

	if (authvp != NULLVP) {
		vnode_put(authvp);
	}

	uu->uu_dupfd = 0;
	vp = ndp->ni_vp;

	/* Wire the vnode into the fileglob. */
	fp->fp_glob->fg_flag = flags & (FMASK | O_EVTONLY | FENCRYPTED | FUNENCRYPTED);
	fp->fp_glob->fg_ops = &vnops;
	fp_set_data(fp, vp);

#if CONFIG_FILE_LEASES
	/*
	 * If we are creating a file or open with truncate, we need to break the
	 * lease if there is a read lease placed on the parent dir.
	 */
	if ((vnode_vtype(vp) == VREG) && (flags & (O_CREAT | O_TRUNC))) {
		vnode_breakdirlease(vp, true, oflags);
	}
	/* Now check if there is a lease placed on the file itself. */
	error = vnode_breaklease(vp, oflags, ctx);
	if (error) {
		goto bad;
	}
#endif /* CONFIG_FILE_LEASES */

	/* Apply a whole-file advisory lock when O_EXLOCK/O_SHLOCK was requested. */
	if (flags & (O_EXLOCK | O_SHLOCK)) {
		struct flock lf = {
			.l_whence = SEEK_SET,
		};

		if (flags & O_EXLOCK) {
			lf.l_type = F_WRLCK;
		} else {
			lf.l_type = F_RDLCK;
		}
		type = F_FLOCK;
		if ((flags & FNONBLOCK) == 0) {
			type |= F_WAIT;
		}
#if CONFIG_MACF
		error = mac_file_check_lock(vfs_context_ucred(ctx), fp->fp_glob,
		    F_SETLK, &lf);
		if (error) {
			goto bad;
		}
#endif
		if ((error = VNOP_ADVLOCK(vp, (caddr_t)fp->fp_glob, F_SETLK, &lf, type, ctx, NULL))) {
			goto bad;
		}
		fp->fp_glob->fg_flag |= FWASLOCKED;
	}

	/* try to truncate by setting the size attribute */
	if (flags & O_TRUNC) {
		if ((error = vnode_setsize(vp, (off_t)0, 0, ctx)) != 0) {
			goto bad;
		}
		fp->fp_glob->fg_flag |= FWASWRITTEN;
	}

	/*
	 * For directories we hold some additional information in the fd.
	 */
	if (vnode_vtype(vp) == VDIR) {
		fp->fp_glob->fg_vn_data = fg_vn_data_alloc();
	} else {
		fp->fp_glob->fg_vn_data = NULL;
	}

#if CONFIG_SECLUDED_MEMORY
	/* Decide whether this file's cached pages may use the secluded pool. */
	if (secluded_for_filecache && vnode_vtype(vp) == VREG) {
		memory_object_control_t moc;
		const char *v_name;

		moc = ubc_getobject(vp, UBC_FLAGS_NONE);

		if (moc == MEMORY_OBJECT_CONTROL_NULL) {
			/* nothing to do... */
		} else if (fp->fp_glob->fg_flag & FWRITE) {
			/* writable -> no longer eligible for secluded pages */
			memory_object_mark_eligible_for_secluded(moc,
			    FALSE);
		} else if (secluded_for_filecache == SECLUDED_FILECACHE_APPS) {
			/* Only the path prefix is needed, so a small buffer suffices. */
			char pathname[32] = { 0, };
			size_t copied;
			/* XXX FBDP: better way to detect /Applications/ ? */
			if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
				(void)copyinstr(ndp->ni_dirp,
				    pathname,
				    sizeof(pathname),
				    &copied);
			} else {
				copystr(CAST_DOWN(void *, ndp->ni_dirp),
				    pathname,
				    sizeof(pathname),
				    &copied);
			}
			pathname[sizeof(pathname) - 1] = '\0';
			if (strncmp(pathname,
			    "/Applications/",
			    strlen("/Applications/")) == 0 &&
			    strncmp(pathname,
			    "/Applications/Camera.app/",
			    strlen("/Applications/Camera.app/")) != 0) {
				/*
				 * not writable
				 * AND from "/Applications/"
				 * AND not from "/Applications/Camera.app/"
				 * ==> eligible for secluded
				 */
				memory_object_mark_eligible_for_secluded(moc,
				    TRUE);
			}
		} else if (secluded_for_filecache == SECLUDED_FILECACHE_RDONLY &&
		    (v_name = vnode_getname(vp))) {
			size_t len = strlen(v_name);

			if (!strncmp(v_name, "dyld", len) ||
			    !strncmp(v_name, "launchd", len) ||
			    !strncmp(v_name, "Camera", len) ||
			    !strncmp(v_name, "SpringBoard", len) ||
			    !strncmp(v_name, "backboardd", len) ||
			    !strncmp(v_name, "cameracaptured", len)) {
				/*
				 * This file matters when launching Camera:
				 * do not store its contents in the secluded
				 * pool that will be drained on Camera launch.
				 */
				memory_object_mark_eligible_for_secluded(moc,
				    FALSE);
			} else if (!strncmp(v_name, "audiomxd", len) ||
			    !strncmp(v_name, "mediaplaybackd", len)) {
				memory_object_mark_eligible_for_secluded(moc,
				    FALSE);
				memory_object_mark_for_realtime(moc,
				    true);
			} else if (!strncmp(v_name, "bluetoothd", len)) {
				/*
				 * bluetoothd might be needed for realtime audio
				 * playback.
				 */
				memory_object_mark_eligible_for_secluded(moc,
				    FALSE);
				memory_object_mark_for_realtime(moc,
				    true);
			} else {
				char pathname[64] = { 0, };
				size_t copied;
				if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
					(void)copyinstr(ndp->ni_dirp,
					    pathname,
					    sizeof(pathname),
					    &copied);
				} else {
					copystr(CAST_DOWN(void *, ndp->ni_dirp),
					    pathname,
					    sizeof(pathname),
					    &copied);
				}
				pathname[sizeof(pathname) - 1] = '\0';
				if (strncmp(pathname,
				    "/Library/Audio/Plug-Ins/",
				    strlen("/Library/Audio/Plug-Ins/")) == 0 ||
				    strncmp(pathname,
				    "/System/Library/Audio/Plug-Ins/",
				    strlen("/System/Library/Audio/Plug-Ins/")) == 0) {
					/*
					 * This may be an audio plugin required
					 * for realtime playback.
					 * ==> NOT eligible for secluded.
					 */
					memory_object_mark_eligible_for_secluded(moc,
					    FALSE);
					memory_object_mark_for_realtime(moc,
					    true);
				}
			}
			vnode_putname(v_name);
		}
	}
#endif /* CONFIG_SECLUDED_MEMORY */

	/*
	 * Drop the iocount acquired via vn_open_auth.  NOTE(review): vp is
	 * still read below for the tty check; presumably the fileglob's
	 * reference keeps it valid past this point -- confirm.
	 */
	vnode_put(vp);

	/*
	 * The first terminal open (without a O_NOCTTY) by a session leader
	 * results in it being set as the controlling terminal.
	 */
	if (vnode_istty(vp) && !(p->p_flag & P_CONTROLT) &&
	    !(flags & O_NOCTTY)) {
		int tmp = 0;

		(void)(*fp->fp_glob->fg_ops->fo_ioctl)(fp, (int)TIOCSCTTY,
		    (caddr_t)&tmp, ctx);
	}

	/* Publish the descriptor slot and drop our hold on the fileproc. */
	proc_fdlock(p);
	procfdtbl_releasefd(p, indx, NULL);

	fp_drop(p, indx, fp, 1);
	proc_fdunlock(p);

	*retval = indx;

	return 0;
bad:
	/* Unwind a partially-constructed open: unlock, close, free the fd. */
	context = *vfs_context_current();
	context.vc_ucred = fp->fp_glob->fg_cred;

	if ((fp->fp_glob->fg_flag & FWASLOCKED) &&
	    (FILEGLOB_DTYPE(fp->fp_glob) == DTYPE_VNODE)) {
		struct flock lf = {
			.l_whence = SEEK_SET,
			.l_type = F_UNLCK,
		};

		(void)VNOP_ADVLOCK(
			vp, (caddr_t)fp->fp_glob, F_UNLCK, &lf, F_FLOCK, ctx, NULL);
	}

	vn_close(vp, fp->fp_glob->fg_flag, &context);
	vnode_put(vp);
	fp_free(p, indx, fp);

	return error;
}
4982
4983 /*
4984 * While most of the *at syscall handlers can call nameiat() which
4985 * is a wrapper around namei, the use of namei and initialisation
4986 * of nameidata are far removed and in different functions - namei
4987 * gets called in vn_open_auth for open1. So we'll just do here what
4988 * nameiat() does.
4989 */
4990 static int
open1at(vfs_context_t ctx,struct nameidata * ndp,int uflags,struct vnode_attr * vap,fp_initfn_t fp_init,void * initarg,int32_t * retval,int dirfd,int authfd)4991 open1at(vfs_context_t ctx, struct nameidata *ndp, int uflags,
4992 struct vnode_attr *vap, fp_initfn_t fp_init, void *initarg, int32_t *retval,
4993 int dirfd, int authfd)
4994 {
4995 if ((dirfd != AT_FDCWD) && !(ndp->ni_cnd.cn_flags & USEDVP)) {
4996 int error;
4997 char c;
4998
4999 if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
5000 error = copyin(ndp->ni_dirp, &c, sizeof(char));
5001 if (error) {
5002 return error;
5003 }
5004 } else {
5005 c = *((char *)(ndp->ni_dirp));
5006 }
5007
5008 if (c != '/') {
5009 vnode_t dvp_at;
5010
5011 error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
5012 &dvp_at);
5013 if (error) {
5014 return error;
5015 }
5016
5017 if (vnode_vtype(dvp_at) != VDIR) {
5018 vnode_put(dvp_at);
5019 return ENOTDIR;
5020 }
5021
5022 ndp->ni_dvp = dvp_at;
5023 ndp->ni_cnd.cn_flags |= USEDVP;
5024 error = open1(ctx, ndp, uflags, vap, fp_init, initarg,
5025 retval, authfd);
5026 vnode_put(dvp_at);
5027 return error;
5028 }
5029 }
5030
5031 return open1(ctx, ndp, uflags, vap, fp_init, initarg, retval, authfd);
5032 }
5033
5034 /*
5035 * open_extended: open a file given a path name; with extended argument list (including extended security (ACL)).
5036 *
5037 * Parameters: p Process requesting the open
5038 * uap User argument descriptor (see below)
5039 * retval Pointer to an area to receive the
 *					return value from the system call
5041 *
5042 * Indirect: uap->path Path to open (same as 'open')
 * uap->flags				Flags to open (same as 'open')
5044 * uap->uid UID to set, if creating
5045 * uap->gid GID to set, if creating
5046 * uap->mode File mode, if creating (same as 'open')
5047 * uap->xsecurity ACL to set, if creating
5048 *
5049 * Returns: 0 Success
5050 * !0 errno value
5051 *
5052 * Notes: The kauth_filesec_t in 'va', if any, is in host byte order.
5053 *
 * XXX: We should enumerate the possible errno values here, and where
5055 * in the code they originated.
5056 */
int
open_extended(proc_t p, struct open_extended_args *uap, int32_t *retval)
{
	int ciferror;
	kauth_filesec_t xsecdst;
	struct vnode_attr va;
	struct nameidata nd;
	int cmode;

	AUDIT_ARG(owner, uap->uid, uap->gid);

	/* Copy in the caller-supplied ACL, if one was provided. */
	xsecdst = NULL;
	if ((uap->xsecurity != USER_ADDR_NULL) &&
	    ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)) {
		return ciferror;
	}

	VATTR_INIT(&va);
	/* Apply the process umask; the sticky bit cannot be set this way. */
	cmode = ((uap->mode & ~p->p_fd.fd_cmask) & ALLPERMS) & ~S_ISTXT;
	VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
	if (uap->uid != KAUTH_UID_NONE) {
		VATTR_SET(&va, va_uid, uap->uid);
	}
	if (uap->gid != KAUTH_GID_NONE) {
		VATTR_SET(&va, va_gid, uap->gid);
	}
	if (xsecdst != NULL) {
		/* va_acl points into xsecdst; it must stay alive through open1. */
		VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
		va.va_vaflags |= VA_FILESEC_ACL;
	}

	NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
	    uap->path, vfs_context_current());

	ciferror = open1(vfs_context_current(), &nd, uap->flags, &va,
	    NULL, NULL, retval, AUTH_OPEN_NOAUTHFD);
	if (xsecdst != NULL) {
		kauth_filesec_free(xsecdst);
	}

	return ciferror;
}
5099
5100 /*
5101 * Go through the data-protected atomically controlled open (2)
5102 *
5103 * int open_dprotected_np(user_addr_t path, int flags, int class, int dpflags, int mode)
5104 */
5105 static int
openat_dprotected_internal(vfs_context_t ctx,user_addr_t path,int flags,int mode,int class,int dpflags,int fd,int authfd,enum uio_seg segflg,int * retval)5106 openat_dprotected_internal(vfs_context_t ctx, user_addr_t path, int flags, int mode,
5107 int class, int dpflags, int fd, int authfd, enum uio_seg segflg, int *retval)
5108 {
5109 /*
5110 * Follow the same path as normal open(2)
5111 * Look up the item if it exists, and acquire the vnode.
5112 */
5113 struct vnode_attr va;
5114 struct nameidata nd;
5115 int cmode;
5116 int error;
5117 struct filedesc *fdp = &vfs_context_proc(ctx)->p_fd;
5118
5119 VATTR_INIT(&va);
5120 /* Mask off all but regular access permissions */
5121 cmode = ((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
5122 VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
5123
5124 NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, segflg,
5125 path, ctx);
5126
5127 /*
5128 * Initialize the extra fields in vnode_attr to pass down our
5129 * extra fields.
5130 * 1. target cprotect class.
5131 * 2. set a flag to mark it as requiring open-raw-encrypted semantics.
5132 */
5133 if (flags & O_CREAT) {
5134 /* lower level kernel code validates that the class is valid before applying it. */
5135 if (class != PROTECTION_CLASS_DEFAULT) {
5136 /*
5137 * PROTECTION_CLASS_DEFAULT implies that we make the class for this
5138 * file behave the same as open (2)
5139 */
5140 VATTR_SET(&va, va_dataprotect_class, class);
5141 }
5142 }
5143
5144 if (dpflags & (O_DP_GETRAWENCRYPTED | O_DP_GETRAWUNENCRYPTED | O_DP_AUTHENTICATE)) {
5145 if (flags & (O_RDWR | O_WRONLY)) {
5146 /*
5147 * Not allowed to write raw encrypted bytes or when opening authenticated.
5148 */
5149 return EINVAL;
5150 }
5151 if (dpflags & O_DP_GETRAWENCRYPTED) {
5152 VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWENCRYPTED);
5153 }
5154 if (dpflags & O_DP_GETRAWUNENCRYPTED) {
5155 VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWUNENCRYPTED);
5156 }
5157 if (dpflags & O_DP_AUTHENTICATE) {
5158 VATTR_SET(&va, va_dataprotect_flags, VA_DP_AUTHENTICATE);
5159 }
5160 }
5161
5162 error = open1at(vfs_context_current(), &nd, flags, &va,
5163 NULL, NULL, retval, fd, authfd);
5164
5165 return error;
5166 }
5167
5168 int
openat_dprotected_np(__unused proc_t p,struct openat_dprotected_np_args * uap,int32_t * retval)5169 openat_dprotected_np(__unused proc_t p, struct openat_dprotected_np_args *uap, int32_t *retval)
5170 {
5171 if ((uap->dpflags & O_DP_AUTHENTICATE) && (uap->flags & O_CREAT)) {
5172 return EINVAL;
5173 }
5174
5175 return openat_dprotected_internal(vfs_context_current(), uap->path, uap->flags, uap->mode,
5176 uap->class, uap->dpflags, uap->fd, uap->authfd, UIO_USERSPACE, retval);
5177 }
5178
5179 int
open_dprotected_np(__unused proc_t p,struct open_dprotected_np_args * uap,int32_t * retval)5180 open_dprotected_np(__unused proc_t p, struct open_dprotected_np_args *uap, int32_t *retval)
5181 {
5182 if (uap->dpflags & O_DP_AUTHENTICATE) {
5183 return EINVAL;
5184 }
5185
5186 return openat_dprotected_internal(vfs_context_current(), uap->path, uap->flags, uap->mode,
5187 uap->class, uap->dpflags, AT_FDCWD, AUTH_OPEN_NOAUTHFD, UIO_USERSPACE, retval);
5188 }
5189
static int
openat_internal(vfs_context_t ctx, user_addr_t path, int flags, int mode,
    int fd, enum uio_seg segflg, int *retval)
{
	struct filedesc *fdp = &vfs_context_proc(ctx)->p_fd;
	/*
	 * va and nd are heap-allocated together (they are large structures;
	 * presumably to keep kernel stack usage down -- NOTE(review): confirm).
	 */
	struct {
		struct vnode_attr va;
		struct nameidata nd;
	} *__open_data;
	struct vnode_attr *vap;
	struct nameidata *ndp;
	int cmode;
	int error;

	__open_data = kalloc_type(typeof(*__open_data), Z_WAITOK);
	vap = &__open_data->va;
	ndp = &__open_data->nd;

	VATTR_INIT(vap);
	/* Mask off all but regular access permissions */
	cmode = ((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
	VATTR_SET(vap, va_mode, cmode & ACCESSPERMS);

	NDINIT(ndp, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1,
	    segflg, path, ctx);

	error = open1at(ctx, ndp, flags, vap, NULL, NULL, retval, fd, AUTH_OPEN_NOAUTHFD);

	kfree_type(typeof(*__open_data), __open_data);

	return error;
}
5222
5223 int
open(proc_t p,struct open_args * uap,int32_t * retval)5224 open(proc_t p, struct open_args *uap, int32_t *retval)
5225 {
5226 __pthread_testcancel(1);
5227 return open_nocancel(p, (struct open_nocancel_args *)uap, retval);
5228 }
5229
5230 int
open_nocancel(__unused proc_t p,struct open_nocancel_args * uap,int32_t * retval)5231 open_nocancel(__unused proc_t p, struct open_nocancel_args *uap,
5232 int32_t *retval)
5233 {
5234 return openat_internal(vfs_context_current(), uap->path, uap->flags,
5235 uap->mode, AT_FDCWD, UIO_USERSPACE, retval);
5236 }
5237
5238 int
openat_nocancel(__unused proc_t p,struct openat_nocancel_args * uap,int32_t * retval)5239 openat_nocancel(__unused proc_t p, struct openat_nocancel_args *uap,
5240 int32_t *retval)
5241 {
5242 return openat_internal(vfs_context_current(), uap->path, uap->flags,
5243 uap->mode, uap->fd, UIO_USERSPACE, retval);
5244 }
5245
5246 int
openat(proc_t p,struct openat_args * uap,int32_t * retval)5247 openat(proc_t p, struct openat_args *uap, int32_t *retval)
5248 {
5249 __pthread_testcancel(1);
5250 return openat_nocancel(p, (struct openat_nocancel_args *)uap, retval);
5251 }
5252
5253 #define OPEN_BY_ID_ENTITLEMENT "com.apple.private.vfs.open-by-id"
5254
5255 static boolean_t
vfs_context_can_open_by_id(vfs_context_t ctx)5256 vfs_context_can_open_by_id(vfs_context_t ctx)
5257 {
5258 if (csproc_get_platform_binary(vfs_context_proc(ctx))) {
5259 return TRUE;
5260 }
5261
5262 return IOTaskHasEntitlement(vfs_context_task(ctx),
5263 OPEN_BY_ID_ENTITLEMENT);
5264 }
5265
5266 /*
5267 * openbyid_np: open a file given a file system id and a file system object id
5268 * the hfs file system object id is an fsobj_id_t {uint32, uint32}
 * for file systems that don't support object ids, it is a node id (uint64_t).
5270 *
5271 * Parameters: p Process requesting the open
5272 * uap User argument descriptor (see below)
5273 * retval Pointer to an area to receive the
 *					return value from the system call
5275 *
5276 * Indirect: uap->path Path to open (same as 'open')
5277 *
5278 * uap->fsid id of target file system
5279 * uap->objid id of target file system object
5280 * uap->flags Flags to open (same as 'open')
5281 *
5282 * Returns: 0 Success
5283 * !0 errno value
5284 *
5285 *
 * XXX: We should enumerate the possible errno values here, and where
5287 * in the code they originated.
5288 */
int
openbyid_np(__unused proc_t p, struct openbyid_np_args *uap, int *retval)
{
	fsid_t fsid;
	uint64_t objid;
	int error;
	char *buf = NULL;
	int buflen = MAXPATHLEN;
	int pathlen = 0;
	vfs_context_t ctx = vfs_context_current();

	/* Requires platform-binary status or the open-by-id entitlement. */
	if (!vfs_context_can_open_by_id(ctx)) {
		return EPERM;
	}

	if ((error = copyin(uap->fsid, (caddr_t)&fsid, sizeof(fsid)))) {
		return error;
	}

	/* uap->objid is an fsobj_id_t defined as struct {uint32_t, uint32_t} */
	if ((error = copyin(uap->objid, (caddr_t)&objid, sizeof(uint64_t)))) {
		return error;
	}

	AUDIT_ARG(value32, fsid.val[0]);
	AUDIT_ARG(value64, objid);

	/* resolve path from fsid, objid */
	do {
		buf = kalloc_data(buflen + 1, Z_WAITOK);
		if (buf == NULL) {
			return ENOMEM;
		}

		error = fsgetpath_internal( ctx, fsid.val[0], objid, buflen,
		    buf, FSOPT_ISREALFSID, &pathlen);

		/* On ENOSPC, free and retry with a buffer one MAXPATHLEN larger. */
		if (error) {
			kfree_data(buf, buflen + 1);
			buf = NULL;
		}
	} while (error == ENOSPC && (buflen += MAXPATHLEN));

	if (error) {
		return error;
	}

	buf[pathlen] = 0;

	/* Open the resolved path; note it lives in kernel space (UIO_SYSSPACE). */
	error = openat_internal(
		ctx, (user_addr_t)buf, uap->oflags, 0, AT_FDCWD, UIO_SYSSPACE, retval);

	kfree_data(buf, buflen + 1);

	return error;
}
5345
5346
5347 /*
5348 * Create a special file.
5349 */
5350 static int mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap,
5351 int fd);
5352
static int
mknodat_internal(proc_t p, user_addr_t upath, struct vnode_attr *vap,
    mode_t mode, int fd)
{
	vfs_context_t ctx = vfs_context_current();
	struct nameidata nd;
	vnode_t vp, dvp;
	int error;

	/* If it's a mknod() of a FIFO, call mkfifo1() instead */
	if ((mode & S_IFMT) == S_IFIFO) {
		return mkfifo1(ctx, upath, vap, fd);
	}

	AUDIT_ARG(mode, mode);
	AUDIT_ARG(value32, vap->va_rdev);

	/* Creating device nodes requires superuser. */
	if ((error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
		return error;
	}
	NDINIT(&nd, CREATE, OP_MKNOD, LOCKPARENT | AUDITVNPATH1,
	    UIO_USERSPACE, upath, ctx);
	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* A non-NULL vp means the target name already exists. */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}

	/* Only character/block devices remain (FIFOs were handled above). */
	switch (mode & S_IFMT) {
	case S_IFCHR:
		VATTR_SET(vap, va_type, VCHR);
		break;
	case S_IFBLK:
		VATTR_SET(vap, va_type, VBLK);
		break;
	default:
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_create(ctx,
	    nd.ni_dvp, &nd.ni_cnd, vap);
	if (error) {
		goto out;
	}
#endif

	if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
		goto out;
	}

#if CONFIG_FILE_LEASES
	/* Creating an entry modifies the directory: break any dir lease. */
	vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

	if ((error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx)) != 0) {
		goto out;
	}

	if (vp) {
		int update_flags = 0;

		// Make sure the name & parent pointers are hooked up
		if (vp->v_name == NULL) {
			update_flags |= VNODE_UPDATE_NAME;
		}
		if (vp->v_parent == NULLVP) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		if (update_flags) {
			vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
		}

#if CONFIG_FSE
		add_fsevent(FSE_CREATE_FILE, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
#endif
	}

out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	/* Drop the iocounts on the child (if created) and the parent. */
	if (vp) {
		vnode_put(vp);
	}
	vnode_put(dvp);

	return error;
}
5455
5456 int
mknod(proc_t p,struct mknod_args * uap,__unused int32_t * retval)5457 mknod(proc_t p, struct mknod_args *uap, __unused int32_t *retval)
5458 {
5459 struct vnode_attr va;
5460
5461 VATTR_INIT(&va);
5462 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5463 VATTR_SET(&va, va_rdev, uap->dev);
5464
5465 return mknodat_internal(p, uap->path, &va, (mode_t)uap->mode, AT_FDCWD);
5466 }
5467
5468 int
mknodat(proc_t p,struct mknodat_args * uap,__unused int32_t * retval)5469 mknodat(proc_t p, struct mknodat_args *uap, __unused int32_t *retval)
5470 {
5471 struct vnode_attr va;
5472
5473 VATTR_INIT(&va);
5474 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5475 VATTR_SET(&va, va_rdev, uap->dev);
5476
5477 return mknodat_internal(p, uap->path, &va, (mode_t)uap->mode, uap->fd);
5478 }
5479
5480 /*
5481 * Create a named pipe.
5482 *
5483 * Returns: 0 Success
5484 * EEXIST
5485 * namei:???
5486 * vnode_authorize:???
5487 * vn_create:???
5488 */
/*
 * Common backend for mkfifo(2)/mkfifoat(2)/mkfifo_extended(2): look up the
 * parent directory of 'upath' (relative lookups anchored at 'fd'), verify the
 * leaf does not already exist, authorize the creation, and create a VFIFO
 * node with the caller-prepared attributes in '*vap'.
 */
static int
mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap, int fd)
{
	vnode_t vp, dvp;
	int error;
	struct nameidata nd;

	/* LOCKPARENT: namei returns the parent dvp with an iocount held. */
	NDINIT(&nd, CREATE, OP_MKFIFO, LOCKPARENT | AUDITVNPATH1,
	    UIO_USERSPACE, upath, ctx);
	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* check that this is a new file and authorize addition */
	if (vp != NULL) {
		/* leaf already exists; nothing to create */
		error = EEXIST;
		goto out;
	}
	VATTR_SET(vap, va_type, VFIFO);

	/* MAC + kauth checks for creating an entry in dvp */
	if ((error = vn_authorize_create(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0) {
		goto out;
	}

	/* on success vp is returned with an iocount, released below */
	error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx);
out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	vnode_put(dvp);

	return error;
}
5531
5532
5533 /*
5534 * mkfifo_extended: Create a named pipe; with extended argument list (including extended security (ACL)).
5535 *
5536 * Parameters: p Process requesting the open
5537 * uap User argument descriptor (see below)
5538 * retval (Ignored)
5539 *
5540 * Indirect: uap->path Path to fifo (same as 'mkfifo')
5541 * uap->uid UID to set
5542 * uap->gid GID to set
5543 * uap->mode File mode to set (same as 'mkfifo')
5544 * uap->xsecurity ACL to set, if creating
5545 *
5546 * Returns: 0 Success
5547 * !0 errno value
5548 *
5549 * Notes: The kauth_filesec_t in 'va', if any, is in host byte order.
5550 *
5551 * XXX: We should enummerate the possible errno values here, and where
5552 * in the code they originated.
5553 */
5554 int
mkfifo_extended(proc_t p,struct mkfifo_extended_args * uap,__unused int32_t * retval)5555 mkfifo_extended(proc_t p, struct mkfifo_extended_args *uap, __unused int32_t *retval)
5556 {
5557 int ciferror;
5558 kauth_filesec_t xsecdst;
5559 struct vnode_attr va;
5560
5561 AUDIT_ARG(owner, uap->uid, uap->gid);
5562
5563 xsecdst = KAUTH_FILESEC_NONE;
5564 if (uap->xsecurity != USER_ADDR_NULL) {
5565 if ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0) {
5566 return ciferror;
5567 }
5568 }
5569
5570 VATTR_INIT(&va);
5571 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5572 if (uap->uid != KAUTH_UID_NONE) {
5573 VATTR_SET(&va, va_uid, uap->uid);
5574 }
5575 if (uap->gid != KAUTH_GID_NONE) {
5576 VATTR_SET(&va, va_gid, uap->gid);
5577 }
5578 if (xsecdst != KAUTH_FILESEC_NONE) {
5579 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
5580 va.va_vaflags |= VA_FILESEC_ACL;
5581 }
5582
5583 ciferror = mkfifo1(vfs_context_current(), uap->path, &va, AT_FDCWD);
5584
5585 if (xsecdst != KAUTH_FILESEC_NONE) {
5586 kauth_filesec_free(xsecdst);
5587 }
5588 return ciferror;
5589 }
5590
5591 /* ARGSUSED */
5592 int
mkfifo(proc_t p,struct mkfifo_args * uap,__unused int32_t * retval)5593 mkfifo(proc_t p, struct mkfifo_args *uap, __unused int32_t *retval)
5594 {
5595 struct vnode_attr va;
5596
5597 VATTR_INIT(&va);
5598 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5599
5600 return mkfifo1(vfs_context_current(), uap->path, &va, AT_FDCWD);
5601 }
5602
5603 int
mkfifoat(proc_t p,struct mkfifoat_args * uap,__unused int32_t * retval)5604 mkfifoat(proc_t p, struct mkfifoat_args *uap, __unused int32_t *retval)
5605 {
5606 struct vnode_attr va;
5607
5608 VATTR_INIT(&va);
5609 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5610
5611 return mkfifo1(vfs_context_current(), uap->path, &va, uap->fd);
5612 }
5613
5614 extern int safe_getpath_new(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path, int firmlink);
5615 extern int safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path);
5616 extern int safe_getpath_no_firmlink(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path);
5617
/*
 * Build the full path for 'dvp', optionally appending '/leafname', into the
 * caller-supplied buffer 'path' of size '_len'.  Never fails outright: on
 * truncation or lookup failure it sets *truncated_path and falls back to an
 * ancestor path (or the mount point, or "/").  'firmlink' selects whether
 * firmlinks are followed (vn_getpath) or not (vn_getpath_no_firmlink).
 * Returns the length of the string in 'path' including the terminating NUL.
 */
int
safe_getpath_new(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path, int firmlink)
{
	int ret, len = _len;

	*truncated_path = 0;

	if (firmlink) {
		ret = vn_getpath(dvp, path, &len);
	} else {
		ret = vn_getpath_no_firmlink(dvp, path, &len);
	}
	if (ret == 0 && len < (MAXPATHLEN - 1)) {
		if (leafname) {
			/* overwrite the NUL with '/' and append the leaf name */
			path[len - 1] = '/';
			len += strlcpy(&path[len], leafname, MAXPATHLEN - len) + 1;
			if (len > MAXPATHLEN) {
				char *ptr;

				// the string got truncated!
				*truncated_path = 1;
				ptr = strrchr(path, '/');
				if (ptr) {
					*ptr = '\0'; // chop off the string at the last directory component
				}
				len = (int)strlen(path) + 1;
			}
		}
	} else if (ret == 0) {
		/* dir path fit, but too little room left to append a leaf */
		*truncated_path = 1;
	} else if (ret != 0) {
		/* lookup failed; walk up the ancestry for a usable prefix */
		struct vnode *mydvp = dvp;

		if (ret != ENOSPC) {
			printf("safe_getpath: failed to get the path for vp %p (%s) : err %d\n",
			    dvp, dvp->v_name ? dvp->v_name : "no-name", ret);
		}
		*truncated_path = 1;

		do {
			if (mydvp->v_parent != NULL) {
				mydvp = mydvp->v_parent;
			} else if (mydvp->v_mount) {
				strlcpy(path, mydvp->v_mount->mnt_vfsstat.f_mntonname, _len);
				break;
			} else {
				// no parent and no mount point?  only thing is to punt and say "/" changed
				strlcpy(path, "/", _len);
				len = 2;
				mydvp = NULL;
			}

			if (mydvp == NULL) {
				break;
			}

			/* retry the lookup from the parent with the full buffer */
			len = _len;
			if (firmlink) {
				ret = vn_getpath(mydvp, path, &len);
			} else {
				ret = vn_getpath_no_firmlink(mydvp, path, &len);
			}
		} while (ret == ENOSPC);
	}

	return len;
}
5685
int
safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path)
{
	/* Firmlink-following variant: delegate with firmlink == 1. */
	int len;

	len = safe_getpath_new(dvp, leafname, path, _len, truncated_path, 1);
	return len;
}
5691
int
safe_getpath_no_firmlink(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path)
{
	/* Non-firmlink variant: delegate with firmlink == 0. */
	int len;

	len = safe_getpath_new(dvp, leafname, path, _len, truncated_path, 0);
	return len;
}
5697
5698 /*
5699 * Make a hard file link.
5700 *
5701 * Returns: 0 Success
5702 * EPERM
5703 * EEXIST
5704 * EXDEV
5705 * namei:???
5706 * vnode_authorize:???
5707 * VNOP_LINK:???
5708 */
5709 /* ARGSUSED */
/*
 * Common backend for link(2)/linkat(2).  Resolves the link source ('path',
 * relative to fd1) and the new link name ('link', relative to fd2), performs
 * MAC/kauth authorization, and issues VNOP_LINK.  On ENOENT from VNOP_LINK
 * the whole operation is redriven (bounded by MAX_LINK_ENOENT_RETRIES) to
 * cope with lookup races.  Fires fsevents/kauth-fileop/audit notifications
 * after a successful link.
 */
static int
linkat_internal(vfs_context_t ctx, int fd1, user_addr_t path, int fd2,
    user_addr_t link, int flag, enum uio_seg segflg)
{
	vnode_t vp, pvp, dvp, lvp;
	struct nameidata nd;
	int follow;
	int error;
#if CONFIG_FSE
	fse_info finfo;
#endif
	char *target_path = NULL;
	char *no_firmlink_path = NULL;
	vnode_t locked_vp = NULLVP;
	int truncated = 0;
	int truncated_no_firmlink_path = 0;
	int num_retries = 0;
	int need_event, has_listeners, need_kpath2;
	bool do_retry;

	/* look up the object we are linking to */
	follow = (flag & AT_SYMLINK_FOLLOW) ? FOLLOW : NOFOLLOW;

retry:
	do_retry = false;
	vp = dvp = lvp = NULLVP;
	NDINIT(&nd, LOOKUP, OP_LOOKUP, AUDITVNPATH1 | follow,
	    segflg, path, ctx);

	error = nameiat(&nd, fd1);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;

	nameidone(&nd);

	/*
	 * Normally, linking to directories is not supported.
	 * However, some file systems may have limited support.
	 */
	if (vp->v_type == VDIR) {
		if (!ISSET(vp->v_mount->mnt_kern_flag, MNTK_DIR_HARDLINKS)) {
			error = EPERM;   /* POSIX */
			goto out;
		}

		/* Linking to a directory requires ownership. */
		if (!kauth_cred_issuser(vfs_context_ucred(ctx))) {
			struct vnode_attr dva;

			VATTR_INIT(&dva);
			VATTR_WANTED(&dva, va_uid);
			if (vnode_getattr(vp, &dva, ctx) != 0 ||
			    !VATTR_IS_SUPPORTED(&dva, va_uid) ||
			    (dva.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)))) {
				error = EACCES;
				goto out;
			}
		}
	}

	/* lookup the target node; nd is reused for the second lookup */
#if CONFIG_TRIGGERS
	nd.ni_op = OP_LINK;
#endif
	nd.ni_cnd.cn_nameiop = CREATE;
	nd.ni_cnd.cn_flags = LOCKPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK;
	nd.ni_dirp = link;
	error = nameiat(&nd, fd2);
	if (error != 0) {
		goto out;
	}
	dvp = nd.ni_dvp;
	lvp = nd.ni_vp;

	/* hold vp's link lock across the authorization + VNOP_LINK */
	assert(locked_vp == NULLVP);
	vnode_link_lock(vp);
	locked_vp = vp;

#if CONFIG_MACF
	if ((error = mac_vnode_check_link(ctx, dvp, vp, &nd.ni_cnd)) != 0) {
		goto out2;
	}
#endif

	/* or to anything that kauth doesn't want us to (eg. immutable items) */
	if ((error = vnode_authorize(vp, NULL, KAUTH_VNODE_LINKTARGET, ctx)) != 0) {
		goto out2;
	}

	/* target node must not exist */
	if (lvp != NULLVP) {
		error = EEXIST;
		goto out2;
	}
	/* cannot link across mountpoints */
	if (vnode_mount(vp) != vnode_mount(dvp)) {
		error = EXDEV;
		goto out2;
	}

	/* authorize creation of the target note */
	if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
		goto out2;
	}

#if CONFIG_FILE_LEASES
	/* writing a new directory entry breaks any lease on dvp */
	vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

	/* and finally make the link */
	error = VNOP_LINK(vp, dvp, &nd.ni_cnd, ctx);
	if (error) {
		/* ENOENT may indicate a lookup race; redrive a bounded number of times */
		if (error == ENOENT && num_retries < MAX_LINK_ENOENT_RETRIES) {
			do_retry = true;
			num_retries += 1;
		}
		goto out2;
	}

#if CONFIG_MACF
	(void)mac_vnode_notify_link(ctx, vp, dvp, &nd.ni_cnd);
#endif

	/* link made; drop the link lock before the notification work below */
	assert(locked_vp == vp);
	vnode_link_unlock(locked_vp);
	locked_vp = NULLVP;

#if CONFIG_FSE
	need_event = need_fsevent(FSE_CREATE_FILE, dvp);
#else
	need_event = 0;
#endif
	has_listeners = kauth_authorize_fileop_has_listeners();

	need_kpath2 = 0;
#if CONFIG_AUDIT
	if (AUDIT_RECORD_EXISTS()) {
		need_kpath2 = 1;
	}
#endif

	/* only pay for path reconstruction if someone is listening */
	if (need_event || has_listeners || need_kpath2) {
		char *link_to_path = NULL;
		int len, link_name_len;
		int len_no_firmlink_path = 0;

		/* build the path to the new link file */
		GET_PATH(target_path);

		len = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, target_path, MAXPATHLEN, &truncated);
		if (no_firmlink_path == NULL) {
			GET_PATH(no_firmlink_path);
		}
		len_no_firmlink_path = safe_getpath_no_firmlink(dvp, nd.ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);

		AUDIT_ARG(kpath, target_path, ARG_KPATH2);

		if (has_listeners) {
			/* build the path to file we are linking to */
			GET_PATH(link_to_path);

			link_name_len = MAXPATHLEN;
			if (vn_getpath(vp, link_to_path, &link_name_len) == 0) {
				/*
				 * Call out to allow 3rd party notification of rename.
				 * Ignore result of kauth_authorize_fileop call.
				 */
				kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_LINK,
				    (uintptr_t)link_to_path,
				    (uintptr_t)target_path);
			}
			if (link_to_path != NULL) {
				RELEASE_PATH(link_to_path);
			}
		}
#if CONFIG_FSE
		if (need_event) {
			/* construct fsevent */
			if (get_fse_info(vp, &finfo, ctx) == 0) {
				if (truncated_no_firmlink_path) {
					finfo.mode |= FSE_TRUNCATED_PATH;
				}

				// build the path to the destination of the link
				add_fsevent(FSE_CREATE_FILE, ctx,
				    FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
				    FSE_ARG_FINFO, &finfo,
				    FSE_ARG_DONE);
			}

			pvp = vp->v_parent;
			// need an iocount on parent vnode in this case
			if (pvp && pvp != dvp) {
				pvp = vnode_getparent_if_different(vp, dvp);
			}
			if (pvp) {
				add_fsevent(FSE_STAT_CHANGED, ctx,
				    FSE_ARG_VNODE, pvp, FSE_ARG_DONE);
			}
			/* only put the iocount we took via vnode_getparent_if_different */
			if (pvp && pvp != dvp) {
				vnode_put(pvp);
			}
		}
#endif
	}
out2:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);
	if (target_path != NULL) {
		RELEASE_PATH(target_path);
		target_path = NULL;
	}
	if (no_firmlink_path != NULL) {
		RELEASE_PATH(no_firmlink_path);
		no_firmlink_path = NULL;
	}
out:
	/* drop the link lock if an error path skipped the unlock above */
	if (locked_vp) {
		assert(locked_vp == vp);
		vnode_link_unlock(locked_vp);
		locked_vp = NULLVP;
	}
	if (lvp) {
		vnode_put(lvp);
	}
	if (dvp) {
		vnode_put(dvp);
	}
	vnode_put(vp);

	if (do_retry) {
		goto retry;
	}

	return error;
}
5951
5952 int
link(__unused proc_t p,struct link_args * uap,__unused int32_t * retval)5953 link(__unused proc_t p, struct link_args *uap, __unused int32_t *retval)
5954 {
5955 return linkat_internal(vfs_context_current(), AT_FDCWD, uap->path,
5956 AT_FDCWD, uap->link, AT_SYMLINK_FOLLOW, UIO_USERSPACE);
5957 }
5958
5959 int
linkat(__unused proc_t p,struct linkat_args * uap,__unused int32_t * retval)5960 linkat(__unused proc_t p, struct linkat_args *uap, __unused int32_t *retval)
5961 {
5962 if (uap->flag & ~AT_SYMLINK_FOLLOW) {
5963 return EINVAL;
5964 }
5965
5966 return linkat_internal(vfs_context_current(), uap->fd1, uap->path,
5967 uap->fd2, uap->link, uap->flag, UIO_USERSPACE);
5968 }
5969
5970 /*
5971 * Make a symbolic link.
5972 *
5973 * We could add support for ACLs here too...
5974 */
5975 /* ARGSUSED */
/*
 * Common backend for symlink(2)/symlinkat(2).  'path_data' is the link
 * contents (copied in if it lives in user space); 'link' is the name of the
 * symlink to create, resolved relative to 'fd'.  Performs MAC/kauth checks,
 * issues VNOP_SYMLINK, applies fallback attributes/labels, and posts a
 * create fsevent.
 */
static int
symlinkat_internal(vfs_context_t ctx, user_addr_t path_data, int fd,
    user_addr_t link, enum uio_seg segflg)
{
	struct vnode_attr va;
	char *path;
	int error;
	struct nameidata nd;
	vnode_t vp, dvp;
	size_t dummy = 0;
	proc_t p;

	error = 0;
	/* copy the link contents into a kernel buffer if needed */
	if (UIO_SEG_IS_USER_SPACE(segflg)) {
		path = zalloc(ZV_NAMEI);
		error = copyinstr(path_data, path, MAXPATHLEN, &dummy);
	} else {
		path = (char *)path_data;
	}
	if (error) {
		goto out;
	}
	AUDIT_ARG(text, path);  /* This is the link string */

	NDINIT(&nd, CREATE, OP_SYMLINK, LOCKPARENT | AUDITVNPATH1,
	    segflg, link, ctx);

	error = nameiat(&nd, fd);
	if (error) {
		goto out;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* symlinks are created with ACCESSPERMS modulo the process umask */
	p = vfs_context_proc(ctx);
	VATTR_INIT(&va);
	VATTR_SET(&va, va_type, VLNK);
	VATTR_SET(&va, va_mode, ACCESSPERMS & ~p->p_fd.fd_cmask);

#if CONFIG_MACF
	error = mac_vnode_check_create(ctx,
	    dvp, &nd.ni_cnd, &va);
#endif
	if (error != 0) {
		goto skipit;
	}

	if (vp != NULL) {
		/* the target name already exists */
		error = EEXIST;
		goto skipit;
	}

	/* authorize */
	if (error == 0) {
		error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx);
	}
	/* get default ownership, etc. */
	if (error == 0) {
		error = vnode_authattr_new(dvp, &va, 0, ctx);
	}

#if CONFIG_FILE_LEASES
	/* writing a new directory entry breaks any lease on dvp */
	vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

	if (error == 0) {
		error = VNOP_SYMLINK(dvp, &vp, &nd.ni_cnd, &va, path, ctx);
	}

	/* do fallback attribute handling */
	if (error == 0 && vp) {
		error = vnode_setattr_fallback(vp, &va, ctx);
	}

#if CONFIG_MACF
	if (error == 0 && vp) {
		error = vnode_label(vnode_mount(vp), dvp, vp, &nd.ni_cnd, VNODE_LABEL_CREATE, ctx);
	}
#endif

	if (error == 0) {
		int update_flags = 0;

		/*
		 * Check if a new vnode was created by VNOP_SYMLINK; some
		 * filesystems don't return one, so re-lookup to get it.
		 */
		if (vp == NULL) {
			nd.ni_cnd.cn_nameiop = LOOKUP;
#if CONFIG_TRIGGERS
			nd.ni_op = OP_LOOKUP;
#endif
			/*
			 * Clear all flags except HASBUF to prevent 'cn_pnbuf' buffer to be
			 * reallocated again in namei().
			 */
			nd.ni_cnd.cn_flags &= HASBUF;
			error = nameiat(&nd, fd);
			if (error) {
				goto skipit;
			}
			vp = nd.ni_vp;
		}

#if 0 /* XXX - kauth_todo - is KAUTH_FILEOP_SYMLINK needed? */
		/* call out to allow 3rd party notification of rename.
		 * Ignore result of kauth_authorize_fileop call.
		 */
		if (kauth_authorize_fileop_has_listeners() &&
		    namei(&nd) == 0) {
			char *new_link_path = NULL;
			int len;

			/* build the path to the new link file */
			new_link_path = get_pathbuff();
			len = MAXPATHLEN;
			vn_getpath(dvp, new_link_path, &len);
			if ((len + 1 + nd.ni_cnd.cn_namelen + 1) < MAXPATHLEN) {
				new_link_path[len - 1] = '/';
				strlcpy(&new_link_path[len], nd.ni_cnd.cn_nameptr, MAXPATHLEN - len);
			}

			kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_SYMLINK,
			    (uintptr_t)path, (uintptr_t)new_link_path);
			if (new_link_path != NULL) {
				release_pathbuff(new_link_path);
			}
		}
#endif
		// Make sure the name & parent pointers are hooked up
		if (vp->v_name == NULL) {
			update_flags |= VNODE_UPDATE_NAME;
		}
		if (vp->v_parent == NULLVP) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		if (update_flags) {
			vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
		}

#if CONFIG_FSE
		add_fsevent(FSE_CREATE_FILE, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
#endif
	}

skipit:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	vnode_put(dvp);
out:
	/* free the kernel copy of the link contents, if we made one */
	if (path && (path != (char *)path_data)) {
		zfree(ZV_NAMEI, path);
	}

	return error;
}
6139
6140 int
symlink(__unused proc_t p,struct symlink_args * uap,__unused int32_t * retval)6141 symlink(__unused proc_t p, struct symlink_args *uap, __unused int32_t *retval)
6142 {
6143 return symlinkat_internal(vfs_context_current(), uap->path, AT_FDCWD,
6144 uap->link, UIO_USERSPACE);
6145 }
6146
6147 int
symlinkat(__unused proc_t p,struct symlinkat_args * uap,__unused int32_t * retval)6148 symlinkat(__unused proc_t p, struct symlinkat_args *uap,
6149 __unused int32_t *retval)
6150 {
6151 return symlinkat_internal(vfs_context_current(), uap->path1, uap->fd,
6152 uap->path2, UIO_USERSPACE);
6153 }
6154
6155 /*
6156 * Delete a whiteout from the filesystem.
6157 * No longer supported.
6158 */
int
undelete(__unused proc_t p, __unused struct undelete_args *uap, __unused int32_t *retval)
{
	/* Whiteout removal is no longer supported; always fail. */
	return ENOTSUP;
}
6164
6165 /*
6166 * Delete a name from the filesystem.
6167 */
6168 /* ARGSUSED */
/*
 * Common backend for unlink(2)/unlinkat(2)/delete(2) and the in-kernel
 * unlink1().  Looks up 'path_arg' (relative to 'start_dvp' if given, else
 * 'fd'), authorizes and performs the removal — either via the compound
 * remove VNOP, when available, or via the classic authorize-then-vn_remove
 * path — and posts fsevents/kauth-fileop/MAC notifications on success.
 * ENOENT races from the authorization callback are redriven up to
 * MAX_AUTHORIZE_ENOENT_RETRIES times.
 */
static int
unlinkat_internal(vfs_context_t ctx, int fd, vnode_t start_dvp,
    user_addr_t path_arg, enum uio_seg segflg, int unlink_flags)
{
	/* heap-allocated scratch: nameidata (and fsevent state) are too
	 * large to keep on the kernel stack */
	struct {
		struct nameidata nd;
#if CONFIG_FSE
		struct vnode_attr va;
		fse_info finfo;
#endif
	} *__unlink_data;
	struct nameidata *ndp;
	vnode_t vp, dvp;
	int error;
	struct componentname *cnp;
	char *path = NULL;
	char *no_firmlink_path = NULL;
	int len_path = 0;
	int len_no_firmlink_path = 0;
	int flags;
	int need_event;
	int has_listeners;
	int truncated_path;
	int truncated_no_firmlink_path;
	int batched;
	struct vnode_attr *vap;
	vnode_t locked_vp = NULLVP;
	int do_retry;
	int retry_count = 0;
	int cn_flags;
	int nofollow_any = 0;

	cn_flags = LOCKPARENT;
	if (!(unlink_flags & VNODE_REMOVE_NO_AUDIT_PATH)) {
		cn_flags |= AUDITVNPATH1;
	}
	if (unlink_flags & VNODE_REMOVE_NOFOLLOW_ANY) {
		nofollow_any = NAMEI_NOFOLLOW_ANY;
		unlink_flags &= ~VNODE_REMOVE_NOFOLLOW_ANY;
	}
	/* If a starting dvp is passed, it trumps any fd passed. */
	if (start_dvp) {
		cn_flags |= USEDVP;
	}

#if NAMEDRSRCFORK
	/* unlink or delete is allowed on rsrc forks and named streams */
	cn_flags |= CN_ALLOWRSRCFORK;
#endif

	__unlink_data = kalloc_type(typeof(*__unlink_data), Z_WAITOK);
	ndp = &__unlink_data->nd;
#if CONFIG_FSE
	fse_info *finfop = &__unlink_data->finfo;
#endif

retry:
	/* per-attempt state is reset here so redrives start clean */
	do_retry = 0;
	flags = 0;
	need_event = 0;
	has_listeners = 0;
	truncated_path = 0;
	truncated_no_firmlink_path = 0;
	vap = NULL;

	NDINIT(ndp, DELETE, OP_UNLINK, cn_flags, segflg, path_arg, ctx);

	ndp->ni_dvp = start_dvp;
	ndp->ni_flag |= NAMEI_COMPOUNDREMOVE | nofollow_any;
	cnp = &ndp->ni_cnd;

continue_lookup:
	error = nameiat(ndp, fd);
	if (error) {
		goto early_out;
	}

	dvp = ndp->ni_dvp;
	vp = ndp->ni_vp;

	/* With Carbon delete semantics, busy files cannot be deleted */
	if (unlink_flags & VNODE_REMOVE_NODELETEBUSY) {
		flags |= VNODE_REMOVE_NODELETEBUSY;
	}

	/* Skip any potential upcalls if told to. */
	if (unlink_flags & VNODE_REMOVE_SKIP_NAMESPACE_EVENT) {
		flags |= VNODE_REMOVE_SKIP_NAMESPACE_EVENT;
	}

	/* Update speculative telemetry with system discarded use state */
	if (unlink_flags & VNODE_REMOVE_SYSTEM_DISCARDED) {
		flags |= VNODE_REMOVE_SYSTEM_DISCARDED;
	}

	if (vp) {
		batched = vnode_compound_remove_available(vp);
		/*
		 * The root of a mounted filesystem cannot be deleted.
		 */
		if ((vp->v_flag & VROOT) || (dvp->v_mount != vp->v_mount)) {
			error = EBUSY;
			goto out;
		}

#if DEVELOPMENT || DEBUG
		/*
		 * XXX VSWAP: Check for entitlements or special flag here
		 * so we can restrict access appropriately.
		 */
#else /* DEVELOPMENT || DEBUG */

		if (vnode_isswap(vp) && (ctx != vfs_context_kernel())) {
			error = EPERM;
			goto out;
		}
#endif /* DEVELOPMENT || DEBUG */

		if (!batched) {
			/* non-compound path: authorize now under the link lock */
			vnode_link_lock(vp);
			locked_vp = vp;
			error = vn_authorize_unlink(dvp, vp, cnp, ctx, NULL);
			if (error) {
				if (error == ENOENT) {
					/* racing lookup may have handed us a stale vp; redrive */
					if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
						do_retry = 1;
						retry_count++;
					}
				}
				vnode_link_unlock(vp);
				locked_vp = NULLVP;
				goto out;
			}
		}
	} else {
		batched = 1;

		if (!vnode_compound_remove_available(dvp)) {
			panic("No vp, but no compound remove?");
		}
	}

#if CONFIG_FSE
	need_event = need_fsevent(FSE_DELETE, dvp);
	if (need_event) {
		if (!batched) {
			if ((vp->v_flag & VISHARDLINK) == 0) {
				/* XXX need to get these data in batched VNOP */
				get_fse_info(vp, finfop, ctx);
			}
		} else {
			error =
			    vfs_get_notify_attributes(&__unlink_data->va);
			if (error) {
				goto out;
			}

			vap = &__unlink_data->va;
		}
	}
#endif
	has_listeners = kauth_authorize_fileop_has_listeners();
	/* only pay for path reconstruction if someone is listening */
	if (need_event || has_listeners) {
		if (path == NULL) {
			GET_PATH(path);
		}
		len_path = safe_getpath(dvp, ndp->ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path);
		if (no_firmlink_path == NULL) {
			GET_PATH(no_firmlink_path);
		}
		len_no_firmlink_path = safe_getpath_no_firmlink(dvp, ndp->ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);
	}

#if NAMEDRSRCFORK
	if (ndp->ni_cnd.cn_flags & CN_WANTSRSRCFORK) {
		error = vnode_removenamedstream(dvp, vp, XATTR_RESOURCEFORK_NAME, 0, ctx);
	} else
#endif
	{
#if CONFIG_FILE_LEASES
		/* removing a directory entry breaks any lease on dvp */
		vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

		error = vn_remove(dvp, &ndp->ni_vp, ndp, flags, vap, ctx);
		vp = ndp->ni_vp;
		if (error == EKEEPLOOKING) {
			/* compound VNOP asked us to continue the lookup */
			if (!batched) {
				panic("EKEEPLOOKING, but not a filesystem that supports compound VNOPs?");
			}

			if ((ndp->ni_flag & NAMEI_CONTLOOKUP) == 0) {
				panic("EKEEPLOOKING, but continue flag not set?");
			}

			if (vnode_isdir(vp)) {
				error = EISDIR;
				goto out;
			}
			goto continue_lookup;
		} else if (error == ENOENT && batched) {
			if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
				/*
				 * For compound VNOPs, the authorization callback may
				 * return ENOENT in case of racing hardlink lookups
				 * hitting the name cache, redrive the lookup.
				 */
				do_retry = 1;
				retry_count += 1;
				goto out;
			}
		}
	}

	/*
	 * Call out to allow 3rd party notification of delete.
	 * Ignore result of kauth_authorize_fileop call.
	 */
	if (!error) {
		if (has_listeners) {
			kauth_authorize_fileop(vfs_context_ucred(ctx),
			    KAUTH_FILEOP_DELETE,
			    (uintptr_t)vp,
			    (uintptr_t)path);
		}

		if (vp->v_flag & VISHARDLINK) {
			//
			// if a hardlink gets deleted we want to blow away the
			// v_parent link because the path that got us to this
			// instance of the link is no longer valid. this will
			// force the next call to get the path to ask the file
			// system instead of just following the v_parent link.
			//
			vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
		}

#if CONFIG_FSE
		if (need_event) {
			if (vp->v_flag & VISHARDLINK) {
				get_fse_info(vp, finfop, ctx);
			} else if (vap) {
				vnode_get_fse_info_from_vap(vp, finfop, vap);
			}
			if (truncated_path) {
				finfop->mode |= FSE_TRUNCATED_PATH;
			}
			add_fsevent(FSE_DELETE, ctx,
			    FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
			    FSE_ARG_FINFO, finfop,
			    FSE_ARG_DONE);
		}
#endif

#if CONFIG_MACF
		mac_vnode_notify_unlink(ctx, dvp, vp, cnp);
#endif
	}

out:
	if (locked_vp) {
		assert(locked_vp == vp);
		vnode_link_unlock(locked_vp);
		locked_vp = NULLVP;
	}

	if (path != NULL) {
		RELEASE_PATH(path);
		path = NULL;
	}

	if (no_firmlink_path != NULL) {
		RELEASE_PATH(no_firmlink_path);
		no_firmlink_path = NULL;
	}
#if NAMEDRSRCFORK
	/* recycle the deleted rsrc fork vnode to force a reclaim, which
	 * will cause its shadow file to go away if necessary.
	 */
	if (vp && (vnode_isnamedstream(vp)) &&
	    (vp->v_parent != NULLVP) &&
	    vnode_isshadow(vp)) {
		vnode_recycle(vp);
	}
#endif
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(ndp);
	vnode_put(dvp);
	if (vp) {
		vnode_put(vp);
	}

	if (do_retry) {
		goto retry;
	}

early_out:
	kfree_type(typeof(*__unlink_data), __unlink_data);
	return error;
}
6471
6472 int
unlink1(vfs_context_t ctx,vnode_t start_dvp,user_addr_t path_arg,enum uio_seg segflg,int unlink_flags)6473 unlink1(vfs_context_t ctx, vnode_t start_dvp, user_addr_t path_arg,
6474 enum uio_seg segflg, int unlink_flags)
6475 {
6476 return unlinkat_internal(ctx, AT_FDCWD, start_dvp, path_arg, segflg,
6477 unlink_flags);
6478 }
6479
6480 /*
6481 * Delete a name from the filesystem using Carbon semantics.
6482 */
6483 int
delete(__unused proc_t p,struct delete_args * uap,__unused int32_t * retval)6484 delete(__unused proc_t p, struct delete_args *uap, __unused int32_t *retval)
6485 {
6486 return unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
6487 uap->path, UIO_USERSPACE, VNODE_REMOVE_NODELETEBUSY);
6488 }
6489
6490 /*
6491 * Delete a name from the filesystem using POSIX semantics.
6492 */
6493 int
unlink(__unused proc_t p,struct unlink_args * uap,__unused int32_t * retval)6494 unlink(__unused proc_t p, struct unlink_args *uap, __unused int32_t *retval)
6495 {
6496 return unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
6497 uap->path, UIO_USERSPACE, 0);
6498 }
6499
6500 int
unlinkat(__unused proc_t p,struct unlinkat_args * uap,__unused int32_t * retval)6501 unlinkat(__unused proc_t p, struct unlinkat_args *uap, __unused int32_t *retval)
6502 {
6503 int unlink_flags = 0;
6504
6505 if (uap->flag & ~(AT_REMOVEDIR | AT_REMOVEDIR_DATALESS | AT_SYMLINK_NOFOLLOW_ANY | AT_SYSTEM_DISCARDED)) {
6506 return EINVAL;
6507 }
6508
6509 if (uap->flag & AT_SYMLINK_NOFOLLOW_ANY) {
6510 unlink_flags |= VNODE_REMOVE_NOFOLLOW_ANY;
6511 }
6512
6513 if (uap->flag & AT_SYSTEM_DISCARDED) {
6514 unlink_flags |= VNODE_REMOVE_SYSTEM_DISCARDED;
6515 }
6516
6517 if (uap->flag & (AT_REMOVEDIR | AT_REMOVEDIR_DATALESS)) {
6518 if (uap->flag & AT_REMOVEDIR_DATALESS) {
6519 unlink_flags |= VNODE_REMOVE_DATALESS_DIR;
6520 }
6521 return rmdirat_internal(vfs_context_current(), uap->fd,
6522 uap->path, UIO_USERSPACE, unlink_flags);
6523 } else {
6524 return unlinkat_internal(vfs_context_current(), uap->fd,
6525 NULLVP, uap->path, UIO_USERSPACE, unlink_flags);
6526 }
6527 }
6528
6529 /*
6530 * Reposition read/write file offset.
6531 */
/*
 * lseek(2): reposition the file offset of the open file 'uap->fd' according
 * to 'uap->whence' (L_SET/L_INCR/L_XTND plus SEEK_HOLE/SEEK_DATA, which are
 * delegated to the filesystem via ioctl).  FIFOs and TTYs are refused with
 * ESPIPE.  The resulting offset is returned in *retval.
 */
int
lseek(proc_t p, struct lseek_args *uap, off_t *retval)
{
	struct fileproc *fp;
	vnode_t vp;
	struct vfs_context *ctx;
	off_t offset = uap->offset, file_size;
	int error;

	if ((error = fp_getfvp(p, uap->fd, &fp, &vp))) {
		/* fd refers to something that is not a vnode (e.g. a socket) */
		if (error == ENOTSUP) {
			return ESPIPE;
		}
		return error;
	}
	if (
		// rdar://3837316: Seeking a pipe is disallowed by POSIX.
		vnode_isfifo(vp)
		// rdar://120750171: Seeking a TTY is undefined and should be denied.
		|| vnode_istty(vp)
		) {
		file_drop(uap->fd);
		return ESPIPE;
	}


	ctx = vfs_context_current();
#if CONFIG_MACF
	/* lseek(fd, 0, SEEK_CUR) only reads the offset; check accordingly */
	if (uap->whence == L_INCR && uap->offset == 0) {
		error = mac_file_check_get_offset(vfs_context_ucred(ctx),
		    fp->fp_glob);
	} else {
		error = mac_file_check_change_offset(vfs_context_ucred(ctx),
		    fp->fp_glob);
	}
	if (error) {
		file_drop(uap->fd);
		return error;
	}
#endif
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	/* compute the candidate offset for the requested whence */
	switch (uap->whence) {
	case L_INCR:
		offset += fp->fp_glob->fg_offset;
		break;
	case L_XTND:
		if ((error = vnode_size(vp, &file_size, ctx)) != 0) {
			break;
		}
		offset += file_size;
		break;
	case L_SET:
		break;
	case SEEK_HOLE:
		error = VNOP_IOCTL(vp, FSIOC_FIOSEEKHOLE, (caddr_t)&offset, 0, ctx);
		break;
	case SEEK_DATA:
		error = VNOP_IOCTL(vp, FSIOC_FIOSEEKDATA, (caddr_t)&offset, 0, ctx);
		break;
	default:
		error = EINVAL;
	}
	if (error == 0) {
		if (uap->offset > 0 && offset < 0) {
			/* Incremented/relative move past max size */
			error = EOVERFLOW;
		} else {
			/*
			 * Allow negative offsets on character devices, per
			 * POSIX 1003.1-2001. Most likely for writing disk
			 * labels.
			 */
			if (offset < 0 && vp->v_type != VCHR) {
				/* Decremented/relative move before start */
				error = EINVAL;
			} else {
				/* Success */
				fp->fp_glob->fg_offset = offset;
				*retval = fp->fp_glob->fg_offset;
			}
		}
	}

	/*
	 * An lseek can affect whether data is "available to read."  Use
	 * hint of NOTE_NONE so no EVFILT_VNODE events fire
	 */
	post_event_if_success(vp, error, NOTE_NONE);
	(void)vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
6628
6629
6630 /*
6631 * Check access permissions.
6632 *
6633 * Returns: 0 Success
6634 * vnode_authorize:???
6635 */
6636 static int
access1(vnode_t vp,vnode_t dvp,int uflags,vfs_context_t ctx)6637 access1(vnode_t vp, vnode_t dvp, int uflags, vfs_context_t ctx)
6638 {
6639 kauth_action_t action;
6640 int error;
6641
6642 /*
6643 * If just the regular access bits, convert them to something
6644 * that vnode_authorize will understand.
6645 */
6646 if (!(uflags & _ACCESS_EXTENDED_MASK)) {
6647 action = 0;
6648 if (uflags & R_OK) {
6649 action |= KAUTH_VNODE_READ_DATA; /* aka KAUTH_VNODE_LIST_DIRECTORY */
6650 }
6651 if (uflags & W_OK) {
6652 if (vnode_isdir(vp)) {
6653 action |= KAUTH_VNODE_ADD_FILE |
6654 KAUTH_VNODE_ADD_SUBDIRECTORY;
6655 /* might want delete rights here too */
6656 } else {
6657 action |= KAUTH_VNODE_WRITE_DATA;
6658 }
6659 }
6660 if (uflags & X_OK) {
6661 if (vnode_isdir(vp)) {
6662 action |= KAUTH_VNODE_SEARCH;
6663 } else {
6664 action |= KAUTH_VNODE_EXECUTE;
6665 }
6666 }
6667 } else {
6668 /* take advantage of definition of uflags */
6669 action = uflags >> 8;
6670 }
6671
6672 #if CONFIG_MACF
6673 error = mac_vnode_check_access(ctx, vp, uflags);
6674 if (error) {
6675 return error;
6676 }
6677 #endif /* MAC */
6678
6679 /* action == 0 means only check for existence */
6680 if (action != 0) {
6681 error = vnode_authorize(vp, dvp, action | KAUTH_VNODE_ACCESS, ctx);
6682 } else {
6683 error = 0;
6684 }
6685
6686 return error;
6687 }
6688
6689
6690
/*
 * access_extended: Check access permissions in bulk.
 *
 * Description:	uap->entries		Pointer to an array of accessx
 *					descriptor structs, plus one or
 *					more NULL terminated strings (see
 *					"Notes" section below).
 *		uap->size		Size of the area pointed to by
 *					uap->entries.
 *		uap->results		Pointer to the results array.
 *
 * Returns:	0			Success
 *		ENOMEM			Insufficient memory
 *		EINVAL			Invalid arguments
 *		namei:EFAULT		Bad address
 *		namei:ENAMETOOLONG	Filename too long
 *		namei:ENOENT		No such file or directory
 *		namei:ELOOP		Too many levels of symbolic links
 *		namei:EBADF		Bad file descriptor
 *		namei:ENOTDIR		Not a directory
 *		namei:???
 *		access1:
 *
 * Implicit returns:
 *		uap->results		Array contents modified
 *
 * Notes:	The uap->entries are structured as an arbitrary length array
 *		of accessx descriptors, followed by one or more NULL terminated
 *		strings
 *
 *			struct accessx_descriptor[0]
 *			...
 *			struct accessx_descriptor[n]
 *			char name_data[0];
 *
 *		We determine the entry count by walking the buffer containing
 *		the uap->entries argument descriptor.  For each descriptor we
 *		see, the valid values for the offset ad_name_offset will be
 *		in the byte range:
 *
 *			[ uap->entries + sizeof(struct accessx_descriptor) ]
 *						to
 *				[ uap->entries + uap->size - 2 ]
 *
 *		since we must have at least one string, and the string must
 *		be at least one character plus the NULL terminator in length.
 *
 * XXX:		Need to support the check-as uid argument
 */
int
access_extended(__unused proc_t p, struct access_extended_args *uap, __unused int32_t *retval)
{
	struct accessx_descriptor *input = NULL;
	errno_t *result = NULL;
	errno_t error = 0;
	int wantdelete = 0;
	size_t desc_max, desc_actual = 0;
	unsigned int i, j;
	struct vfs_context context;
	struct nameidata nd;
	int niopts;
	vnode_t vp = NULL;
	vnode_t dvp = NULL;
#define ACCESSX_MAX_DESCR_ON_STACK 10
	struct accessx_descriptor stack_input[ACCESSX_MAX_DESCR_ON_STACK];

	/* NULL until we take a credential reference; checked in `out'. */
	context.vc_ucred = NULL;

	/*
	 * Validate parameters; if valid, copy the descriptor array and string
	 * arguments into local memory.  Before proceeding, the following
	 * conditions must have been met:
	 *
	 * o	The total size is not permitted to exceed ACCESSX_MAX_TABLESIZE
	 * o	There must be sufficient room in the request for at least one
	 *	descriptor and a one byte NUL terminated string.
	 * o	The allocation of local storage must not fail.
	 */
	if (uap->size > ACCESSX_MAX_TABLESIZE) {
		return ENOMEM;
	}
	if (uap->size < (sizeof(struct accessx_descriptor) + 2)) {
		return EINVAL;
	}
	/* Serve small requests from the stack to avoid a heap allocation. */
	if (uap->size <= sizeof(stack_input)) {
		input = stack_input;
	} else {
		input = kalloc_data(uap->size, Z_WAITOK);
		if (input == NULL) {
			error = ENOMEM;
			goto out;
		}
	}
	error = copyin(uap->entries, input, uap->size);
	if (error) {
		goto out;
	}

	AUDIT_ARG(opaque, input, uap->size);

	/*
	 * Force NUL termination of the copyin buffer to avoid namei() running
	 * off the end.  If the caller passes us bogus data, they may get a
	 * bogus result.
	 */
	((char *)input)[uap->size - 1] = 0;

	/*
	 * Access is defined as checking against the process' real identity,
	 * even if operations are checking the effective identity.  This
	 * requires that we use a local vfs context.
	 */
	context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
	context.vc_thread = current_thread();

	/*
	 * Find out how many entries we have, so we can allocate the result
	 * array by walking the list and adjusting the count downward by the
	 * earliest string offset we see.
	 */
	desc_max = (uap->size - 2) / sizeof(struct accessx_descriptor);
	desc_actual = desc_max;
	for (i = 0; i < desc_actual; i++) {
		/*
		 * Take the offset to the name string for this entry and
		 * convert to an input array index, which would be one off
		 * the end of the array if this entry was the lowest-addressed
		 * name string.
		 */
		j = input[i].ad_name_offset / sizeof(struct accessx_descriptor);

		/*
		 * An offset greater than the max allowable offset is an error.
		 * It is also an error for any valid entry to point
		 * to a location prior to the end of the current entry, if
		 * it's not a reference to the string of the previous entry.
		 */
		if (j > desc_max || (j != 0 && j <= i)) {
			error = EINVAL;
			goto out;
		}

		/* Also do not let ad_name_offset point to something beyond the size of the input */
		if (input[i].ad_name_offset >= uap->size) {
			error = EINVAL;
			goto out;
		}

		/*
		 * An offset of 0 means use the previous descriptor's offset;
		 * this is used to chain multiple requests for the same file
		 * to avoid multiple lookups.
		 */
		if (j == 0) {
			/* This is not valid for the first entry */
			if (i == 0) {
				error = EINVAL;
				goto out;
			}
			continue;
		}

		/*
		 * If the offset of the string for this descriptor is before
		 * what we believe is the current actual last descriptor,
		 * then we need to adjust our estimate downward; this permits
		 * the string table following the last descriptor to be out
		 * of order relative to the descriptor list.
		 */
		if (j < desc_actual) {
			desc_actual = j;
		}
	}

	/*
	 * We limit the actual number of descriptors we are willing to process
	 * to a hard maximum of ACCESSX_MAX_DESCRIPTORS.  If the number being
	 * requested does not exceed this limit,
	 */
	if (desc_actual > ACCESSX_MAX_DESCRIPTORS) {
		error = ENOMEM;
		goto out;
	}
	result = kalloc_data(desc_actual * sizeof(errno_t), Z_WAITOK | Z_ZERO);
	if (result == NULL) {
		error = ENOMEM;
		goto out;
	}

	/*
	 * Do the work by iterating over the descriptor entries we know to
	 * at least appear to contain valid data.
	 */
	error = 0;
	for (i = 0; i < desc_actual; i++) {
		/*
		 * If the ad_name_offset is 0, then we use the previous
		 * results to make the check; otherwise, we are looking up
		 * a new file name.
		 */
		if (input[i].ad_name_offset != 0) {
			/* discard old vnodes */
			if (vp) {
				vnode_put(vp);
				vp = NULL;
			}
			if (dvp) {
				vnode_put(dvp);
				dvp = NULL;
			}

			/*
			 * Scan forward in the descriptor list to see if we
			 * need the parent vnode.  We will need it if we are
			 * deleting, since we must have rights to remove
			 * entries in the parent directory, as well as the
			 * rights to delete the object itself.
			 */
			wantdelete = input[i].ad_flags & _DELETE_OK;
			for (j = i + 1; (j < desc_actual) && (input[j].ad_name_offset == 0); j++) {
				if (input[j].ad_flags & _DELETE_OK) {
					wantdelete = 1;
				}
			}

			niopts = FOLLOW | AUDITVNPATH1;

			/* need parent for vnode_authorize for deletion test */
			if (wantdelete) {
				niopts |= WANTPARENT;
			}

			/* do the lookup */
			NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, UIO_SYSSPACE,
			    CAST_USER_ADDR_T(((const char *)input) + input[i].ad_name_offset),
			    &context);
			error = namei(&nd);
			if (!error) {
				vp = nd.ni_vp;
				if (wantdelete) {
					dvp = nd.ni_dvp;
				}
			}
			nameidone(&nd);
		}

		/*
		 * Handle lookup errors.  Per-file errors are recorded in the
		 * results array and do not abort the remaining checks.
		 */
		switch (error) {
		case ENOENT:
		case EACCES:
		case EPERM:
		case ENOTDIR:
			result[i] = error;
			break;
		case 0:
			/* run this access check */
			result[i] = access1(vp, dvp, input[i].ad_flags, &context);
			break;
		default:
			/* fatal lookup error */

			goto out;
		}
	}

	AUDIT_ARG(data, result, sizeof(errno_t), desc_actual);

	/* copy out results */
	error = copyout(result, uap->results, desc_actual * sizeof(errno_t));

out:
	if (input && input != stack_input) {
		kfree_data(input, uap->size);
	}
	if (result) {
		kfree_data(result, desc_actual * sizeof(errno_t));
	}
	if (vp) {
		vnode_put(vp);
	}
	if (dvp) {
		vnode_put(dvp);
	}
	if (IS_VALID_CRED(context.vc_ucred)) {
		kauth_cred_unref(&context.vc_ucred);
	}
	return error;
}
6981
6982
/*
 * Common implementation of access(2) and faccessat(2).
 *
 * Returns:	0			Success
 *	namei:EFAULT		Bad address
 *	namei:ENAMETOOLONG	Filename too long
 *	namei:ENOENT		No such file or directory
 *	namei:ELOOP		Too many levels of symbolic links
 *	namei:EBADF		Bad file descriptor
 *	namei:ENOTDIR		Not a directory
 *	namei:???
 *	access1:
 */
static int
faccessat_internal(vfs_context_t ctx, int fd, user_addr_t path, int amode,
    int flag, enum uio_seg segflg)
{
	int error;
	struct nameidata nd;
	int niopts;
	struct vfs_context context;
#if NAMEDRSRCFORK
	int is_namedstream = 0;
#endif

	/*
	 * Unless the AT_EACCESS option is used, Access is defined as checking
	 * against the process' real identity, even if operations are checking
	 * the effective identity.  So we need to tweak the credential
	 * in the context for that case.
	 */
	if (!(flag & AT_EACCESS)) {
		/* Takes a reference; released at `out'. */
		context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
	} else {
		context.vc_ucred = ctx->vc_ucred;
	}
	context.vc_thread = ctx->vc_thread;


	niopts = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY) ? NOFOLLOW : FOLLOW) | AUDITVNPATH1;
	/* need parent for vnode_authorize for deletion test */
	if (amode & _DELETE_OK) {
		niopts |= WANTPARENT;
	}
	NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, segflg,
	    path, &context);
	if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
		/* Refuse to traverse symlinks anywhere in the path. */
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}

#if NAMEDRSRCFORK
	/* access(F_OK) calls are allowed for resource forks. */
	if (amode == F_OK) {
		nd.ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
	}
#endif
	error = nameiat(&nd, fd);
	if (error) {
		goto out;
	}

#if NAMEDRSRCFORK
	/* Grab reference on the shadow stream file vnode to
	 * force an inactive on release which will mark it
	 * for recycle.
	 */
	if (vnode_isnamedstream(nd.ni_vp) &&
	    (nd.ni_vp->v_parent != NULLVP) &&
	    vnode_isshadow(nd.ni_vp)) {
		is_namedstream = 1;
		vnode_ref(nd.ni_vp);
	}
#endif

	error = access1(nd.ni_vp, nd.ni_dvp, amode, &context);

#if NAMEDRSRCFORK
	if (is_namedstream) {
		vnode_rele(nd.ni_vp);
	}
#endif

	vnode_put(nd.ni_vp);
	/* WANTPARENT was set above for _DELETE_OK; drop the parent iocount too. */
	if (amode & _DELETE_OK) {
		vnode_put(nd.ni_dvp);
	}
	nameidone(&nd);

out:
	if (!(flag & AT_EACCESS)) {
		/* Release the real-identity credential taken above. */
		kauth_cred_unref(&context.vc_ucred);
	}
	return error;
}
7075
7076 int
access(__unused proc_t p,struct access_args * uap,__unused int32_t * retval)7077 access(__unused proc_t p, struct access_args *uap, __unused int32_t *retval)
7078 {
7079 return faccessat_internal(vfs_context_current(), AT_FDCWD,
7080 uap->path, uap->flags, 0, UIO_USERSPACE);
7081 }
7082
7083 int
faccessat(__unused proc_t p,struct faccessat_args * uap,__unused int32_t * retval)7084 faccessat(__unused proc_t p, struct faccessat_args *uap,
7085 __unused int32_t *retval)
7086 {
7087 if (uap->flag & ~(AT_EACCESS | AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) {
7088 return EINVAL;
7089 }
7090
7091 return faccessat_internal(vfs_context_current(), uap->fd,
7092 uap->path, uap->amode, uap->flag, UIO_USERSPACE);
7093 }
7094
7095 /*
7096 * Returns: 0 Success
7097 * EFAULT
7098 * copyout:EFAULT
7099 * namei:???
7100 * vn_stat:???
7101 */
7102 static int
fstatat_internal(vfs_context_t ctx,user_addr_t path,user_addr_t ub,user_addr_t xsecurity,user_addr_t xsecurity_size,int isstat64,enum uio_seg segflg,int fd,int flag)7103 fstatat_internal(vfs_context_t ctx, user_addr_t path, user_addr_t ub,
7104 user_addr_t xsecurity, user_addr_t xsecurity_size, int isstat64,
7105 enum uio_seg segflg, int fd, int flag)
7106 {
7107 struct nameidata *ndp = NULL;
7108 int follow;
7109 union {
7110 struct stat sb;
7111 struct stat64 sb64;
7112 } source = {};
7113 union {
7114 struct user64_stat user64_sb;
7115 struct user32_stat user32_sb;
7116 struct user64_stat64 user64_sb64;
7117 struct user32_stat64 user32_sb64;
7118 } dest = {};
7119 caddr_t sbp;
7120 int error, my_size;
7121 kauth_filesec_t fsec = KAUTH_FILESEC_NONE;
7122 size_t xsecurity_bufsize;
7123 void * statptr;
7124 struct fileproc *fp = NULL;
7125 int needsrealdev = 0;
7126
7127 follow = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) ? NOFOLLOW : FOLLOW;
7128 ndp = kalloc_type(struct nameidata, Z_WAITOK);
7129 NDINIT(ndp, LOOKUP, OP_GETATTR, follow | AUDITVNPATH1,
7130 segflg, path, ctx);
7131 if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
7132 ndp->ni_flag |= NAMEI_NOFOLLOW_ANY;
7133 }
7134
7135 #if NAMEDRSRCFORK
7136 int is_namedstream = 0;
7137 /* stat calls are allowed for resource forks. */
7138 ndp->ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
7139 #endif
7140
7141 if (flag & AT_FDONLY) {
7142 vnode_t fvp;
7143
7144 error = fp_getfvp(vfs_context_proc(ctx), fd, &fp, &fvp);
7145 if (error) {
7146 goto out;
7147 }
7148 if ((error = vnode_getwithref(fvp))) {
7149 file_drop(fd);
7150 goto out;
7151 }
7152 ndp->ni_vp = fvp;
7153 } else {
7154 error = nameiat(ndp, fd);
7155 if (error) {
7156 goto out;
7157 }
7158 }
7159
7160 statptr = (void *)&source;
7161
7162 #if NAMEDRSRCFORK
7163 /* Grab reference on the shadow stream file vnode to
7164 * force an inactive on release which will mark it
7165 * for recycle.
7166 */
7167 if (vnode_isnamedstream(ndp->ni_vp) &&
7168 (ndp->ni_vp->v_parent != NULLVP) &&
7169 vnode_isshadow(ndp->ni_vp)) {
7170 is_namedstream = 1;
7171 vnode_ref(ndp->ni_vp);
7172 }
7173 #endif
7174
7175 needsrealdev = flag & AT_REALDEV ? 1 : 0;
7176 if (fp && (xsecurity == USER_ADDR_NULL)) {
7177 /*
7178 * If the caller has the file open, and is not
7179 * requesting extended security information, we are
7180 * going to let them get the basic stat information.
7181 */
7182 error = vn_stat_noauth(ndp->ni_vp, statptr, NULL, isstat64, needsrealdev, ctx,
7183 fp->fp_glob->fg_cred);
7184 } else {
7185 error = vn_stat(ndp->ni_vp, statptr, (xsecurity != USER_ADDR_NULL ? &fsec : NULL),
7186 isstat64, needsrealdev, ctx);
7187 }
7188
7189 #if NAMEDRSRCFORK
7190 if (is_namedstream) {
7191 vnode_rele(ndp->ni_vp);
7192 }
7193 #endif
7194 vnode_put(ndp->ni_vp);
7195 nameidone(ndp);
7196
7197 if (fp) {
7198 file_drop(fd);
7199 fp = NULL;
7200 }
7201
7202 if (error) {
7203 goto out;
7204 }
7205 /* Zap spare fields */
7206 if (isstat64 != 0) {
7207 source.sb64.st_lspare = 0;
7208 source.sb64.st_qspare[0] = 0LL;
7209 source.sb64.st_qspare[1] = 0LL;
7210 if (vfs_context_is64bit(ctx)) {
7211 munge_user64_stat64(&source.sb64, &dest.user64_sb64);
7212 my_size = sizeof(dest.user64_sb64);
7213 sbp = (caddr_t)&dest.user64_sb64;
7214 } else {
7215 munge_user32_stat64(&source.sb64, &dest.user32_sb64);
7216 my_size = sizeof(dest.user32_sb64);
7217 sbp = (caddr_t)&dest.user32_sb64;
7218 }
7219 /*
7220 * Check if we raced (post lookup) against the last unlink of a file.
7221 */
7222 if ((source.sb64.st_nlink == 0) && S_ISREG(source.sb64.st_mode)) {
7223 source.sb64.st_nlink = 1;
7224 }
7225 } else {
7226 source.sb.st_lspare = 0;
7227 source.sb.st_qspare[0] = 0LL;
7228 source.sb.st_qspare[1] = 0LL;
7229 if (vfs_context_is64bit(ctx)) {
7230 munge_user64_stat(&source.sb, &dest.user64_sb);
7231 my_size = sizeof(dest.user64_sb);
7232 sbp = (caddr_t)&dest.user64_sb;
7233 } else {
7234 munge_user32_stat(&source.sb, &dest.user32_sb);
7235 my_size = sizeof(dest.user32_sb);
7236 sbp = (caddr_t)&dest.user32_sb;
7237 }
7238
7239 /*
7240 * Check if we raced (post lookup) against the last unlink of a file.
7241 */
7242 if ((source.sb.st_nlink == 0) && S_ISREG(source.sb.st_mode)) {
7243 source.sb.st_nlink = 1;
7244 }
7245 }
7246 if ((error = copyout(sbp, ub, my_size)) != 0) {
7247 goto out;
7248 }
7249
7250 /* caller wants extended security information? */
7251 if (xsecurity != USER_ADDR_NULL) {
7252 /* did we get any? */
7253 if (fsec == KAUTH_FILESEC_NONE) {
7254 if (susize(xsecurity_size, 0) != 0) {
7255 error = EFAULT;
7256 goto out;
7257 }
7258 } else {
7259 /* find the user buffer size */
7260 xsecurity_bufsize = fusize(xsecurity_size);
7261
7262 /* copy out the actual data size */
7263 if (susize(xsecurity_size, KAUTH_FILESEC_COPYSIZE(fsec)) != 0) {
7264 error = EFAULT;
7265 goto out;
7266 }
7267
7268 /* if the caller supplied enough room, copy out to it */
7269 if (xsecurity_bufsize >= KAUTH_FILESEC_COPYSIZE(fsec)) {
7270 error = copyout(fsec, xsecurity, KAUTH_FILESEC_COPYSIZE(fsec));
7271 }
7272 }
7273 }
7274 out:
7275 if (ndp) {
7276 kfree_type(struct nameidata, ndp);
7277 }
7278 if (fsec != KAUTH_FILESEC_NONE) {
7279 kauth_filesec_free(fsec);
7280 }
7281 return error;
7282 }
7283
7284 /*
7285 * stat_extended: Get file status; with extended security (ACL).
7286 *
7287 * Parameters: p (ignored)
7288 * uap User argument descriptor (see below)
7289 * retval (ignored)
7290 *
7291 * Indirect: uap->path Path of file to get status from
7292 * uap->ub User buffer (holds file status info)
7293 * uap->xsecurity ACL to get (extended security)
7294 * uap->xsecurity_size Size of ACL
7295 *
7296 * Returns: 0 Success
7297 * !0 errno value
7298 *
7299 */
7300 int
stat_extended(__unused proc_t p,struct stat_extended_args * uap,__unused int32_t * retval)7301 stat_extended(__unused proc_t p, struct stat_extended_args *uap,
7302 __unused int32_t *retval)
7303 {
7304 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7305 uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
7306 0);
7307 }
7308
7309 /*
7310 * Returns: 0 Success
7311 * fstatat_internal:??? [see fstatat_internal() in this file]
7312 */
7313 int
stat(__unused proc_t p,struct stat_args * uap,__unused int32_t * retval)7314 stat(__unused proc_t p, struct stat_args *uap, __unused int32_t *retval)
7315 {
7316 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7317 0, 0, 0, UIO_USERSPACE, AT_FDCWD, 0);
7318 }
7319
7320 int
stat64(__unused proc_t p,struct stat64_args * uap,__unused int32_t * retval)7321 stat64(__unused proc_t p, struct stat64_args *uap, __unused int32_t *retval)
7322 {
7323 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7324 0, 0, 1, UIO_USERSPACE, AT_FDCWD, 0);
7325 }
7326
7327 /*
7328 * stat64_extended: Get file status; can handle large inode numbers; with extended security (ACL).
7329 *
7330 * Parameters: p (ignored)
7331 * uap User argument descriptor (see below)
7332 * retval (ignored)
7333 *
7334 * Indirect: uap->path Path of file to get status from
7335 * uap->ub User buffer (holds file status info)
7336 * uap->xsecurity ACL to get (extended security)
7337 * uap->xsecurity_size Size of ACL
7338 *
7339 * Returns: 0 Success
7340 * !0 errno value
7341 *
7342 */
7343 int
stat64_extended(__unused proc_t p,struct stat64_extended_args * uap,__unused int32_t * retval)7344 stat64_extended(__unused proc_t p, struct stat64_extended_args *uap, __unused int32_t *retval)
7345 {
7346 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7347 uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
7348 0);
7349 }
7350
7351 /*
7352 * lstat_extended: Get file status; does not follow links; with extended security (ACL).
7353 *
7354 * Parameters: p (ignored)
7355 * uap User argument descriptor (see below)
7356 * retval (ignored)
7357 *
7358 * Indirect: uap->path Path of file to get status from
7359 * uap->ub User buffer (holds file status info)
7360 * uap->xsecurity ACL to get (extended security)
7361 * uap->xsecurity_size Size of ACL
7362 *
7363 * Returns: 0 Success
7364 * !0 errno value
7365 *
7366 */
7367 int
lstat_extended(__unused proc_t p,struct lstat_extended_args * uap,__unused int32_t * retval)7368 lstat_extended(__unused proc_t p, struct lstat_extended_args *uap, __unused int32_t *retval)
7369 {
7370 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7371 uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
7372 AT_SYMLINK_NOFOLLOW);
7373 }
7374
7375 /*
7376 * Get file status; this version does not follow links.
7377 */
7378 int
lstat(__unused proc_t p,struct lstat_args * uap,__unused int32_t * retval)7379 lstat(__unused proc_t p, struct lstat_args *uap, __unused int32_t *retval)
7380 {
7381 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7382 0, 0, 0, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW);
7383 }
7384
7385 int
lstat64(__unused proc_t p,struct lstat64_args * uap,__unused int32_t * retval)7386 lstat64(__unused proc_t p, struct lstat64_args *uap, __unused int32_t *retval)
7387 {
7388 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7389 0, 0, 1, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW);
7390 }
7391
7392 /*
7393 * lstat64_extended: Get file status; can handle large inode numbers; does not
7394 * follow links; with extended security (ACL).
7395 *
7396 * Parameters: p (ignored)
7397 * uap User argument descriptor (see below)
7398 * retval (ignored)
7399 *
7400 * Indirect: uap->path Path of file to get status from
7401 * uap->ub User buffer (holds file status info)
7402 * uap->xsecurity ACL to get (extended security)
7403 * uap->xsecurity_size Size of ACL
7404 *
7405 * Returns: 0 Success
7406 * !0 errno value
7407 *
7408 */
7409 int
lstat64_extended(__unused proc_t p,struct lstat64_extended_args * uap,__unused int32_t * retval)7410 lstat64_extended(__unused proc_t p, struct lstat64_extended_args *uap, __unused int32_t *retval)
7411 {
7412 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7413 uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
7414 AT_SYMLINK_NOFOLLOW);
7415 }
7416
7417 int
fstatat(__unused proc_t p,struct fstatat_args * uap,__unused int32_t * retval)7418 fstatat(__unused proc_t p, struct fstatat_args *uap, __unused int32_t *retval)
7419 {
7420 if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_REALDEV | AT_FDONLY | AT_SYMLINK_NOFOLLOW_ANY)) {
7421 return EINVAL;
7422 }
7423
7424 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7425 0, 0, 0, UIO_USERSPACE, uap->fd, uap->flag);
7426 }
7427
7428 int
fstatat64(__unused proc_t p,struct fstatat64_args * uap,__unused int32_t * retval)7429 fstatat64(__unused proc_t p, struct fstatat64_args *uap,
7430 __unused int32_t *retval)
7431 {
7432 if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_REALDEV | AT_FDONLY | AT_SYMLINK_NOFOLLOW_ANY)) {
7433 return EINVAL;
7434 }
7435
7436 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7437 0, 0, 1, UIO_USERSPACE, uap->fd, uap->flag);
7438 }
7439
7440 /*
7441 * Get configurable pathname variables.
7442 *
7443 * Returns: 0 Success
7444 * namei:???
7445 * vn_pathconf:???
7446 *
7447 * Notes: Global implementation constants are intended to be
7448 * implemented in this function directly; all other constants
7449 * are per-FS implementation, and therefore must be handled in
7450 * each respective FS, instead.
7451 *
7452 * XXX We implement some things globally right now that should actually be
7453 * XXX per-FS; we will need to deal with this at some point.
7454 */
7455 /* ARGSUSED */
7456 int
pathconf(__unused proc_t p,struct pathconf_args * uap,int32_t * retval)7457 pathconf(__unused proc_t p, struct pathconf_args *uap, int32_t *retval)
7458 {
7459 int error;
7460 struct nameidata nd;
7461 vfs_context_t ctx = vfs_context_current();
7462
7463 NDINIT(&nd, LOOKUP, OP_PATHCONF, FOLLOW | AUDITVNPATH1,
7464 UIO_USERSPACE, uap->path, ctx);
7465 error = namei(&nd);
7466 if (error) {
7467 return error;
7468 }
7469
7470 error = vn_pathconf(nd.ni_vp, uap->name, retval, ctx);
7471
7472 vnode_put(nd.ni_vp);
7473 nameidone(&nd);
7474 return error;
7475 }
7476
7477 /*
7478 * Return target name of a symbolic link.
7479 */
7480 /* ARGSUSED */
7481 static int
readlinkat_internal(vfs_context_t ctx,int fd,vnode_t lnk_vp,user_addr_t path,enum uio_seg seg,user_addr_t buf,size_t bufsize,enum uio_seg bufseg,int * retval)7482 readlinkat_internal(vfs_context_t ctx, int fd, vnode_t lnk_vp, user_addr_t path,
7483 enum uio_seg seg, user_addr_t buf, size_t bufsize, enum uio_seg bufseg,
7484 int *retval)
7485 {
7486 vnode_t vp;
7487 uio_t auio;
7488 int error;
7489 struct nameidata nd;
7490 UIO_STACKBUF(uio_buf, 1);
7491 bool put_vnode;
7492
7493 if (bufsize > INT32_MAX) {
7494 return EINVAL;
7495 }
7496
7497 if (lnk_vp) {
7498 vp = lnk_vp;
7499 put_vnode = false;
7500 } else {
7501 NDINIT(&nd, LOOKUP, OP_READLINK, NOFOLLOW | AUDITVNPATH1,
7502 seg, path, ctx);
7503
7504 error = nameiat(&nd, fd);
7505 if (error) {
7506 return error;
7507 }
7508 vp = nd.ni_vp;
7509 put_vnode = true;
7510 nameidone(&nd);
7511 }
7512
7513 auio = uio_createwithbuffer(1, 0, bufseg, UIO_READ,
7514 &uio_buf[0], sizeof(uio_buf));
7515 uio_addiov(auio, buf, bufsize);
7516 if (vp->v_type != VLNK) {
7517 error = EINVAL;
7518 } else {
7519 #if CONFIG_MACF
7520 error = mac_vnode_check_readlink(ctx, vp);
7521 #endif
7522 if (error == 0) {
7523 error = vnode_authorize(vp, NULL, KAUTH_VNODE_READ_DATA,
7524 ctx);
7525 }
7526 if (error == 0) {
7527 error = VNOP_READLINK(vp, auio, ctx);
7528 }
7529 }
7530
7531 if (put_vnode) {
7532 vnode_put(vp);
7533 }
7534
7535 *retval = (int)(bufsize - uio_resid(auio));
7536 return error;
7537 }
7538
7539 int
freadlink(proc_t p,struct freadlink_args * uap,int32_t * retval)7540 freadlink(proc_t p, struct freadlink_args *uap, int32_t *retval)
7541 {
7542 enum uio_seg procseg;
7543 vnode_t vp;
7544 int error;
7545
7546 procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
7547
7548 AUDIT_ARG(fd, uap->fd);
7549
7550 if ((error = file_vnode(uap->fd, &vp))) {
7551 return error;
7552 }
7553 if ((error = vnode_getwithref(vp))) {
7554 file_drop(uap->fd);
7555 return error;
7556 }
7557
7558 error = readlinkat_internal(vfs_context_current(), -1,
7559 vp, 0, procseg, CAST_USER_ADDR_T(uap->buf),
7560 uap->bufsize, procseg, retval);
7561
7562 vnode_put(vp);
7563 file_drop(uap->fd);
7564 return error;
7565 }
7566
7567 int
readlink(proc_t p,struct readlink_args * uap,int32_t * retval)7568 readlink(proc_t p, struct readlink_args *uap, int32_t *retval)
7569 {
7570 enum uio_seg procseg;
7571
7572 procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
7573 return readlinkat_internal(vfs_context_current(), AT_FDCWD, NULL,
7574 CAST_USER_ADDR_T(uap->path), procseg, CAST_USER_ADDR_T(uap->buf),
7575 uap->count, procseg, retval);
7576 }
7577
7578 int
readlinkat(proc_t p,struct readlinkat_args * uap,int32_t * retval)7579 readlinkat(proc_t p, struct readlinkat_args *uap, int32_t *retval)
7580 {
7581 enum uio_seg procseg;
7582
7583 procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
7584 return readlinkat_internal(vfs_context_current(), uap->fd, NULL,
7585 CAST_USER_ADDR_T(uap->path), procseg, uap->buf, uap->bufsize, procseg,
7586 retval);
7587 }
7588
7589 /*
7590 * Change file flags, the deep inner layer.
7591 */
7592 static int
chflags0(vnode_t vp,struct vnode_attr * va,int (* setattr)(vnode_t,void *,vfs_context_t),void * arg,vfs_context_t ctx)7593 chflags0(vnode_t vp, struct vnode_attr *va,
7594 int (*setattr)(vnode_t, void *, vfs_context_t),
7595 void *arg, vfs_context_t ctx)
7596 {
7597 kauth_action_t action = 0;
7598 int error;
7599
7600 #if CONFIG_MACF
7601 error = mac_vnode_check_setflags(ctx, vp, va->va_flags);
7602 if (error) {
7603 goto out;
7604 }
7605 #endif
7606
7607 /* request authorisation, disregard immutability */
7608 if ((error = vnode_authattr(vp, va, &action, ctx)) != 0) {
7609 goto out;
7610 }
7611 /*
7612 * Request that the auth layer disregard those file flags it's allowed to when
7613 * authorizing this operation; we need to do this in order to be able to
7614 * clear immutable flags.
7615 */
7616 if (action && ((error = vnode_authorize(vp, NULL, action | KAUTH_VNODE_NOIMMUTABLE, ctx)) != 0)) {
7617 goto out;
7618 }
7619 error = (*setattr)(vp, arg, ctx);
7620
7621 #if CONFIG_MACF
7622 if (error == 0) {
7623 mac_vnode_notify_setflags(ctx, vp, va->va_flags);
7624 }
7625 #endif
7626
7627 out:
7628 return error;
7629 }
7630
7631 /*
7632 * Change file flags.
7633 *
7634 * NOTE: this will vnode_put() `vp'
7635 */
7636 static int
chflags1(vnode_t vp,int flags,vfs_context_t ctx)7637 chflags1(vnode_t vp, int flags, vfs_context_t ctx)
7638 {
7639 struct vnode_attr va;
7640 int error;
7641
7642 VATTR_INIT(&va);
7643 VATTR_SET(&va, va_flags, flags);
7644
7645 error = chflags0(vp, &va, (void *)vnode_setattr, &va, ctx);
7646 vnode_put(vp);
7647
7648 if ((error == 0) && !VATTR_IS_SUPPORTED(&va, va_flags)) {
7649 error = ENOTSUP;
7650 }
7651
7652 return error;
7653 }
7654
/*
 * Change flags of a file given a path name.
 *
 * Looks up the path, breaks any lease on the parent directory (when file
 * leases are configured), then applies the flags via chflags1(), which
 * consumes the vnode iocount for us.
 */
/* ARGSUSED */
int
chflags(__unused proc_t p, struct chflags_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	vfs_context_t ctx = vfs_context_current();
	int error;
	struct nameidata nd;
	uint32_t wantparent = 0;

#if CONFIG_FILE_LEASES
	/* need the parent directory vnode so its lease can be broken below */
	wantparent = WANTPARENT;
#endif

	AUDIT_ARG(fflags, uap->flags);
	NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1 | wantparent,
	    UIO_USERSPACE, uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;

#if CONFIG_FILE_LEASES
	/* an attribute change invalidates leases held on the parent directory */
	vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
	vnode_put(nd.ni_dvp);
#endif

	nameidone(&nd);

	/* we don't vnode_put() here because chflags1 does internally */
	error = chflags1(vp, uap->flags, ctx);

	return error;
}
7693
/*
 * Change flags of a file given a file descriptor.
 */
/* ARGSUSED */
int
fchflags(__unused proc_t p, struct fchflags_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	int error;

	AUDIT_ARG(fd, uap->fd);
	AUDIT_ARG(fflags, uap->flags);
	/* translate the descriptor to its vnode; takes a fileproc reference */
	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}

	/* take an iocount on the vnode; chflags1() will drop it */
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

#if CONFIG_FILE_LEASES
	/* an attribute change invalidates leases held on the parent directory */
	vnode_breakdirlease(vp, true, O_WRONLY);
#endif

	/* we don't vnode_put() here because chflags1 does internally */
	error = chflags1(vp, uap->flags, vfs_context_current());

	file_drop(uap->fd);
	return error;
}
7727
/*
 * Change security information on a filesystem object.
 *
 * Applies whatever attributes are active in *vap (mode, owner, group,
 * ACL) to the vnode, after MAC and kauth authorization.
 *
 * Returns:	0			Success
 *		EPERM			Operation not permitted
 *	vnode_authattr:???		[anything vnode_authattr can return]
 *	vnode_authorize:???		[anything vnode_authorize can return]
 *	vnode_setattr:???		[anything vnode_setattr can return]
 *
 * Notes:	If vnode_authattr or vnode_authorize return EACCES, it will be
 *		translated to EPERM before being returned.
 */
static int
chmod_vnode(vfs_context_t ctx, vnode_t vp, struct vnode_attr *vap)
{
	kauth_action_t action;
	int error;

	AUDIT_ARG(mode, vap->va_mode);
	/* XXX audit new args */

#if NAMEDSTREAMS
	/* chmod calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		return EPERM;
	}
#endif

#if CONFIG_MACF
	/* MAC checks: mode, ownership and ACL changes are each vetted separately */
	if (VATTR_IS_ACTIVE(vap, va_mode) &&
	    (error = mac_vnode_check_setmode(ctx, vp, (mode_t)vap->va_mode)) != 0) {
		return error;
	}

	if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid)) {
		if ((error = mac_vnode_check_setowner(ctx, vp,
		    VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : -1,
		    VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : -1))) {
			return error;
		}
	}

	if (VATTR_IS_ACTIVE(vap, va_acl) &&
	    (error = mac_vnode_check_setacl(ctx, vp, vap->va_acl))) {
		return error;
	}
#endif

	/* make sure that the caller is allowed to set this security information */
	if (((error = vnode_authattr(vp, vap, &action, ctx)) != 0) ||
	    ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		/* translate permission failures to EPERM, per the contract above */
		if (error == EACCES) {
			error = EPERM;
		}
		return error;
	}

	if ((error = vnode_setattr(vp, vap, ctx)) != 0) {
		return error;
	}

#if CONFIG_MACF
	/* notify MAC modules of the attribute changes that took effect */
	if (VATTR_IS_ACTIVE(vap, va_mode)) {
		mac_vnode_notify_setmode(ctx, vp, (mode_t)vap->va_mode);
	}

	if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid)) {
		mac_vnode_notify_setowner(ctx, vp,
		    VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : -1,
		    VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : -1);
	}

	if (VATTR_IS_ACTIVE(vap, va_acl)) {
		mac_vnode_notify_setacl(ctx, vp, vap->va_acl);
	}
#endif

	return error;
}
7807
7808
/*
 * Change mode of a file given a path name.
 *
 * Returns:	0			Success
 *	namei:???			[anything namei can return]
 *	chmod_vnode:???			[anything chmod_vnode can return]
 */
static int
chmodat(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap,
    int fd, int flag, enum uio_seg segflg)
{
	struct nameidata nd;
	int follow, error;
	uint32_t wantparent = 0;

#if CONFIG_FILE_LEASES
	/* need the parent vnode so its directory lease can be broken below */
	wantparent = WANTPARENT;
#endif

	/* honour both "don't follow final symlink" and "don't follow any symlink" */
	follow = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) ? NOFOLLOW : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1 | wantparent,
	    segflg, path, ctx);
	if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}
	if ((error = nameiat(&nd, fd))) {
		return error;
	}

#if CONFIG_FILE_LEASES
	/* an attribute change invalidates leases held on the parent directory */
	vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
	vnode_put(nd.ni_dvp);
#endif

	error = chmod_vnode(ctx, nd.ni_vp, vap);
	vnode_put(nd.ni_vp);
	nameidone(&nd);
	return error;
}
7848
7849 static int
chmod_extended_init(struct vnode_attr * pva,kauth_filesec_t * pxsecdst,int mode,uid_t uid,gid_t gid,user_addr_t xsecurity)7850 chmod_extended_init(struct vnode_attr *pva, kauth_filesec_t *pxsecdst, int mode, uid_t uid,
7851 gid_t gid, user_addr_t xsecurity)
7852 {
7853 int error;
7854
7855 VATTR_INIT(pva);
7856
7857 if (mode != -1) {
7858 VATTR_SET(pva, va_mode, mode & ALLPERMS);
7859 } else {
7860 pva->va_mode = 0;
7861 }
7862
7863 if (uid != KAUTH_UID_NONE) {
7864 VATTR_SET(pva, va_uid, uid);
7865 }
7866
7867 if (gid != KAUTH_GID_NONE) {
7868 VATTR_SET(pva, va_gid, gid);
7869 }
7870
7871 *pxsecdst = NULL;
7872 switch (xsecurity) {
7873 case USER_ADDR_NULL:
7874 break;
7875
7876 case CAST_USER_ADDR_T((void *)1): /* _FILESEC_REMOVE_ACL */
7877 VATTR_SET(pva, va_acl, NULL);
7878 break;
7879
7880 default:
7881 if ((error = kauth_copyinfilesec(xsecurity, pxsecdst)) != 0) {
7882 return error;
7883 }
7884
7885 VATTR_SET(pva, va_acl, &(*pxsecdst)->fsec_acl);
7886 pva->va_vaflags |= VA_FILESEC_ACL;
7887 KAUTH_DEBUG("CHMOD - setting ACL with %d entries", pva->va_acl->acl_entrycount);
7888 break;
7889 }
7890
7891 return 0;
7892 }
7893
7894 /*
7895 * chmod_extended: Change the mode of a file given a path name; with extended
7896 * argument list (including extended security (ACL)).
7897 *
7898 * Parameters: p Process requesting the open
7899 * uap User argument descriptor (see below)
7900 * retval (ignored)
7901 *
7902 * Indirect: uap->path Path to object (same as 'chmod')
7903 * uap->uid UID to set
7904 * uap->gid GID to set
7905 * uap->mode File mode to set (same as 'chmod')
7906 * uap->xsecurity ACL to set (or delete)
7907 *
7908 * Returns: 0 Success
7909 * !0 errno value
7910 *
7911 * Notes: The kauth_filesec_t in 'va', if any, is in host byte order.
7912 *
 * XXX: We should enumerate the possible errno values here, and where
7914 * in the code they originated.
7915 */
7916 int
chmod_extended(__unused proc_t p,struct chmod_extended_args * uap,__unused int32_t * retval)7917 chmod_extended(__unused proc_t p, struct chmod_extended_args *uap, __unused int32_t *retval)
7918 {
7919 int error;
7920 struct vnode_attr va;
7921 kauth_filesec_t xsecdst = NULL;
7922
7923 AUDIT_ARG(owner, uap->uid, uap->gid);
7924
7925 error = chmod_extended_init(&va, &xsecdst, uap->mode, uap->uid,
7926 uap->gid, uap->xsecurity);
7927
7928 if (error) {
7929 return error;
7930 }
7931
7932 error = chmodat(vfs_context_current(), uap->path, &va, AT_FDCWD, 0,
7933 UIO_USERSPACE);
7934
7935 if (xsecdst != NULL) {
7936 kauth_filesec_free(xsecdst);
7937 }
7938 return error;
7939 }
7940
7941 /*
7942 * Returns: 0 Success
7943 * chmodat:??? [anything chmodat can return]
7944 */
7945 static int
fchmodat_internal(vfs_context_t ctx,user_addr_t path,int mode,int fd,int flag,enum uio_seg segflg)7946 fchmodat_internal(vfs_context_t ctx, user_addr_t path, int mode, int fd,
7947 int flag, enum uio_seg segflg)
7948 {
7949 struct vnode_attr va;
7950
7951 VATTR_INIT(&va);
7952 VATTR_SET(&va, va_mode, mode & ALLPERMS);
7953
7954 return chmodat(ctx, path, &va, fd, flag, segflg);
7955 }
7956
7957 int
chmod(__unused proc_t p,struct chmod_args * uap,__unused int32_t * retval)7958 chmod(__unused proc_t p, struct chmod_args *uap, __unused int32_t *retval)
7959 {
7960 return fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
7961 AT_FDCWD, 0, UIO_USERSPACE);
7962 }
7963
7964 int
fchmodat(__unused proc_t p,struct fchmodat_args * uap,__unused int32_t * retval)7965 fchmodat(__unused proc_t p, struct fchmodat_args *uap, __unused int32_t *retval)
7966 {
7967 if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) {
7968 return EINVAL;
7969 }
7970
7971 return fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
7972 uap->fd, uap->flag, UIO_USERSPACE);
7973 }
7974
/*
 * Change mode of a file given a file descriptor.
 *
 * Applies the attribute changes in *vap (mode/owner/ACL, as prepared by
 * the caller) to the vnode backing fd via chmod_vnode().
 */
static int
fchmod1(__unused proc_t p, int fd, struct vnode_attr *vap)
{
	vnode_t vp;
	int error;

	AUDIT_ARG(fd, fd);

	/* translate the descriptor to its vnode; takes a fileproc reference */
	if ((error = file_vnode(fd, &vp)) != 0) {
		return error;
	}
	if ((error = vnode_getwithref(vp)) != 0) {
		file_drop(fd);
		return error;
	}
	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

#if CONFIG_FILE_LEASES
	/* an attribute change invalidates leases held on the parent directory */
	vnode_breakdirlease(vp, true, O_WRONLY);
#endif

	error = chmod_vnode(vfs_context_current(), vp, vap);
	(void)vnode_put(vp);
	file_drop(fd);

	return error;
}
8005
8006 /*
8007 * fchmod_extended: Change mode of a file given a file descriptor; with
8008 * extended argument list (including extended security (ACL)).
8009 *
8010 * Parameters: p Process requesting to change file mode
8011 * uap User argument descriptor (see below)
8012 * retval (ignored)
8013 *
8014 * Indirect: uap->mode File mode to set (same as 'chmod')
8015 * uap->uid UID to set
8016 * uap->gid GID to set
8017 * uap->xsecurity ACL to set (or delete)
8018 * uap->fd File descriptor of file to change mode
8019 *
8020 * Returns: 0 Success
8021 * !0 errno value
8022 *
8023 */
8024 int
fchmod_extended(proc_t p,struct fchmod_extended_args * uap,__unused int32_t * retval)8025 fchmod_extended(proc_t p, struct fchmod_extended_args *uap, __unused int32_t *retval)
8026 {
8027 int error;
8028 struct vnode_attr va;
8029 kauth_filesec_t xsecdst = NULL;
8030
8031 AUDIT_ARG(owner, uap->uid, uap->gid);
8032
8033 error = chmod_extended_init(&va, &xsecdst, uap->mode, uap->uid,
8034 uap->gid, uap->xsecurity);
8035
8036 if (error) {
8037 return error;
8038 }
8039
8040 error = fchmod1(p, uap->fd, &va);
8041
8042 if (xsecdst != NULL) {
8043 kauth_filesec_free(xsecdst);
8044 }
8045 return error;
8046 }
8047
8048 int
fchmod(proc_t p,struct fchmod_args * uap,__unused int32_t * retval)8049 fchmod(proc_t p, struct fchmod_args *uap, __unused int32_t *retval)
8050 {
8051 struct vnode_attr va;
8052
8053 VATTR_INIT(&va);
8054 VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
8055
8056 return fchmod1(p, uap->fd, &va);
8057 }
8058
/*
 * Common chown implementation: change owner and/or group of a vnode.
 *
 * A uid/gid of VNOVAL means "leave that field unchanged".  Authorization
 * failures are reported as EPERM rather than EACCES (see below).
 */
static int
vn_chown_internal(__unused vfs_context_t ctx, vnode_t vp, uid_t uid, gid_t gid)
{
	struct vnode_attr va;
	kauth_action_t action;
	int error;

	VATTR_INIT(&va);
	if (uid != (uid_t)VNOVAL) {
		VATTR_SET(&va, va_uid, uid);
	}
	if (gid != (gid_t)VNOVAL) {
		VATTR_SET(&va, va_gid, gid);
	}

#if NAMEDSTREAMS
	/* chown calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		error = EPERM;
		goto out;
	}
#endif

#if CONFIG_MACF
	error = mac_vnode_check_setowner(ctx, vp, uid, gid);
	if (error) {
		goto out;
	}
#endif

	/* preflight and authorize attribute changes */
	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
		goto out;
	}
	if (action && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		/*
		 * EACCES is only allowed from namei(); permissions failure should
		 * return EPERM, so we need to translate the error code.
		 */
		if (error == EACCES) {
			error = EPERM;
		}

		goto out;
	}

#if CONFIG_FILE_LEASES
	/* an attribute change invalidates leases held on the parent directory */
	vnode_breakdirlease(vp, true, O_WRONLY);
#endif

	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	/* only notify MAC modules when the change actually took effect */
	if (error == 0) {
		mac_vnode_notify_setowner(ctx, vp, uid, gid);
	}
#endif

out:
	return error;
}
8120
/*
 * Set ownership given a path name.
 */
/* ARGSUSED */
static int
fchownat_internal(vfs_context_t ctx, int fd, user_addr_t path, uid_t uid,
    gid_t gid, int flag, enum uio_seg segflg)
{
	vnode_t vp;
	int error;
	struct nameidata nd;
	int follow;

	AUDIT_ARG(owner, uid, gid);

	/* honour both "don't follow final symlink" and "don't follow any symlink" */
	follow = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) ? NOFOLLOW : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1, segflg, path, ctx);
	if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}

	/* fd is the lookup base directory for relative paths */
	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}

	vp = nd.ni_vp;
	/* vn_chown_internal() handles MACF, authorization and lease breaking */
	error = vn_chown_internal(ctx, vp, uid, gid);

	nameidone(&nd);
	vnode_put(vp);
	return error;
}
8154
8155 int
chown(__unused proc_t p,struct chown_args * uap,__unused int32_t * retval)8156 chown(__unused proc_t p, struct chown_args *uap, __unused int32_t *retval)
8157 {
8158 return fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
8159 uap->uid, uap->gid, 0, UIO_USERSPACE);
8160 }
8161
8162 int
lchown(__unused proc_t p,struct lchown_args * uap,__unused int32_t * retval)8163 lchown(__unused proc_t p, struct lchown_args *uap, __unused int32_t *retval)
8164 {
8165 return fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
8166 uap->owner, uap->group, AT_SYMLINK_NOFOLLOW, UIO_USERSPACE);
8167 }
8168
8169 int
fchownat(__unused proc_t p,struct fchownat_args * uap,__unused int32_t * retval)8170 fchownat(__unused proc_t p, struct fchownat_args *uap, __unused int32_t *retval)
8171 {
8172 if (uap->flag & ~AT_SYMLINK_NOFOLLOW) {
8173 return EINVAL;
8174 }
8175
8176 return fchownat_internal(vfs_context_current(), uap->fd, uap->path,
8177 uap->uid, uap->gid, uap->flag, UIO_USERSPACE);
8178 }
8179
/*
 * Set ownership given a file descriptor.
 */
/* ARGSUSED */
int
fchown(__unused proc_t p, struct fchown_args *uap, __unused int32_t *retval)
{
	vfs_context_t ctx = vfs_context_current();
	vnode_t vp;
	int error;

	AUDIT_ARG(owner, uap->uid, uap->gid);
	AUDIT_ARG(fd, uap->fd);

	/* translate the descriptor to its vnode; takes a fileproc reference */
	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}

	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}
	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	/* vn_chown_internal() handles MACF, authorization and lease breaking */
	error = vn_chown_internal(ctx, vp, uap->uid, uap->gid);

	(void)vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
8210
8211 static int
getutimes(user_addr_t usrtvp,struct timespec * tsp)8212 getutimes(user_addr_t usrtvp, struct timespec *tsp)
8213 {
8214 int error;
8215
8216 if (usrtvp == USER_ADDR_NULL) {
8217 struct timeval old_tv;
8218 /* XXX Y2038 bug because of microtime argument */
8219 microtime(&old_tv);
8220 TIMEVAL_TO_TIMESPEC(&old_tv, &tsp[0]);
8221 tsp[1] = tsp[0];
8222 } else {
8223 if (IS_64BIT_PROCESS(current_proc())) {
8224 struct user64_timeval tv[2];
8225 error = copyin(usrtvp, (void *)tv, sizeof(tv));
8226 if (error) {
8227 return error;
8228 }
8229 TIMEVAL64_TO_TIMESPEC(&tv[0], &tsp[0]);
8230 TIMEVAL64_TO_TIMESPEC(&tv[1], &tsp[1]);
8231 } else {
8232 struct user32_timeval tv[2];
8233 error = copyin(usrtvp, (void *)tv, sizeof(tv));
8234 if (error) {
8235 return error;
8236 }
8237 TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]);
8238 TIMEVAL_TO_TIMESPEC(&tv[1], &tsp[1]);
8239 }
8240 }
8241 return 0;
8242 }
8243
/*
 * Apply access/modification times ts[0]/ts[1] to a vnode.
 *
 * nullflag is non-zero when the caller passed a NULL times pointer
 * ("set to now"); it sets VA_UTIMES_NULL on the attr and suppresses the
 * EACCES-to-EPERM translation applied when explicit times were given.
 */
static int
setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts,
    int nullflag)
{
	int error;
	struct vnode_attr va;
	kauth_action_t action;

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	VATTR_INIT(&va);
	VATTR_SET(&va, va_access_time, ts[0]);
	VATTR_SET(&va, va_modify_time, ts[1]);
	if (nullflag) {
		va.va_vaflags |= VA_UTIMES_NULL;
	}

#if NAMEDSTREAMS
	/* utimes calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		error = EPERM;
		goto out;
	}
#endif

#if CONFIG_MACF
	error = mac_vnode_check_setutimes(ctx, vp, ts[0], ts[1]);
	if (error) {
		goto out;
	}
#endif
	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
		/* explicit times: report permission failures as EPERM */
		if (!nullflag && error == EACCES) {
			error = EPERM;
		}
		goto out;
	}

	/* since we may not need to auth anything, check here */
	if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		if (!nullflag && error == EACCES) {
			error = EPERM;
		}
		goto out;
	}
	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	/* only notify MAC modules when the change actually took effect */
	if (error == 0) {
		mac_vnode_notify_setutimes(ctx, vp, ts[0], ts[1]);
	}
#endif

out:
	return error;
}
8300
/*
 * Set the access and modification times of a file.
 */
/* ARGSUSED */
int
utimes(__unused proc_t p, struct utimes_args *uap, __unused int32_t *retval)
{
	struct timespec ts[2];
	user_addr_t usrtvp;
	int error;
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();
	uint32_t wantparent = 0;

#if CONFIG_FILE_LEASES
	/* need the parent vnode so its directory lease can be broken below */
	wantparent = WANTPARENT;
#endif

	/*
	 * AUDIT: Needed to change the order of operations to do the
	 * name lookup first because auditing wants the path.
	 */
	NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1 | wantparent,
	    UIO_USERSPACE, uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}

	/*
	 * Fetch the user-supplied time. If usrtvp is USER_ADDR_NULL, we fetch
	 * the current time instead.
	 */
	usrtvp = uap->tptr;
	if ((error = getutimes(usrtvp, ts)) != 0) {
		goto out;
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
#endif

	/* last arg: nullflag — the user asked for "now" (tptr == NULL) */
	error = setutimes(ctx, nd.ni_vp, ts, usrtvp == USER_ADDR_NULL);

out:
#if CONFIG_FILE_LEASES
	/* the parent was held for the lease break; release it on all paths */
	vnode_put(nd.ni_dvp);
#endif
	nameidone(&nd);
	vnode_put(nd.ni_vp);
	return error;
}
8353
/*
 * Set the access and modification times of a file.
 */
/* ARGSUSED */
int
futimes(__unused proc_t p, struct futimes_args *uap, __unused int32_t *retval)
{
	struct timespec ts[2];
	vnode_t vp;
	user_addr_t usrtvp;
	int error;

	AUDIT_ARG(fd, uap->fd);
	/* copy in (or synthesize) the timestamps before touching the fd */
	usrtvp = uap->tptr;
	if ((error = getutimes(usrtvp, ts)) != 0) {
		return error;
	}
	if ((error = file_vnode(uap->fd, &vp)) != 0) {
		return error;
	}
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

#if CONFIG_FILE_LEASES
	/* an attribute change invalidates leases held on the parent directory */
	vnode_breakdirlease(vp, true, O_WRONLY);
#endif

	/* last arg: nullflag — the user asked for "now" (tptr == NULL) */
	error = setutimes(vfs_context_current(), vp, ts, usrtvp == 0);

	vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
8389
8390 static int
truncate_validate_common(proc_t p,off_t length)8391 truncate_validate_common(proc_t p, off_t length)
8392 {
8393 rlim_t fsize_limit;
8394
8395 if (length < 0) {
8396 return EINVAL;
8397 }
8398
8399 fsize_limit = proc_limitgetcur(p, RLIMIT_FSIZE);
8400 if ((rlim_t)length > fsize_limit) {
8401 psignal(p, SIGXFSZ);
8402 return EFBIG;
8403 }
8404
8405 return 0;
8406 }
8407
/*
 * Common truncate implementation: set the vnode's data size to `length'.
 *
 * need_auth is true for the path-based truncate(); it is false when
 * called from ftruncate(), whose write access was effectively authorized
 * at open time (see comment below).  `cred' is used only for the MAC
 * truncate check/notify.
 */
static int
truncate_internal(vnode_t vp, off_t length, kauth_cred_t cred,
    vfs_context_t ctx, boolean_t need_auth)
{
	struct vnode_attr va;
	kauth_action_t action;
	int error;

	VATTR_INIT(&va);
	VATTR_SET(&va, va_data_size, length);

#if CONFIG_MACF
	error = mac_vnode_check_truncate(ctx, cred, vp);
	if (error) {
		return error;
	}
#endif

	/*
	 * If we reached here from `ftruncate` then we already did an effective
	 * `vnode_authorize` upon open. We honour the result from then.
	 */
	if (need_auth) {
		if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
			return error;
		}

		if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
			return error;
		}
	}

#if CONFIG_FILE_LEASES
	/* Check if there is a lease placed on the parent directory. */
	vnode_breakdirlease(vp, true, O_WRONLY);

	/* Now check if there is a lease placed on the file itself. */
	(void)vnode_breaklease(vp, O_WRONLY, ctx);
#endif

	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	/* only notify MAC modules when the truncate actually took effect */
	if (error == 0) {
		mac_vnode_notify_truncate(ctx, cred, vp);
	}
#endif

	return error;
}
8458
8459 /*
8460 * Truncate a file given its path name.
8461 */
8462 /* ARGSUSED */
8463 int
truncate(proc_t p,struct truncate_args * uap,__unused int32_t * retval)8464 truncate(proc_t p, struct truncate_args *uap, __unused int32_t *retval)
8465 {
8466 vfs_context_t ctx = vfs_context_current();
8467 vnode_t vp;
8468 int error;
8469 struct nameidata nd;
8470
8471 if ((error = truncate_validate_common(p, uap->length))) {
8472 return error;
8473 }
8474
8475 NDINIT(&nd, LOOKUP, OP_TRUNCATE, FOLLOW | AUDITVNPATH1,
8476 UIO_USERSPACE, uap->path, ctx);
8477
8478 if ((error = namei(&nd))) {
8479 return error;
8480 }
8481
8482 vp = nd.ni_vp;
8483 nameidone(&nd);
8484
8485 error = truncate_internal(vp, uap->length, NOCRED, ctx, true);
8486 vnode_put(vp);
8487
8488 return error;
8489 }
8490
/*
 * Truncate a file given a file descriptor.
 */
/* ARGSUSED */
int
ftruncate(proc_t p, struct ftruncate_args *uap, int32_t *retval)
{
	struct vnode_attr va;
	vnode_t vp = NULLVP;
	struct fileproc *fp;
	bool need_vnode_put = false;
	int error;

	AUDIT_ARG(fd, uap->fd);

	/* reject negative lengths and enforce RLIMIT_FSIZE up front */
	if ((error = truncate_validate_common(p, uap->length))) {
		return error;
	}

	if ((error = fp_lookup(p, uap->fd, &fp, 0))) {
		return error;
	}

	/* dispatch on the descriptor type: only POSIX shm and vnodes qualify */
	switch (FILEGLOB_DTYPE(fp->fp_glob)) {
	case DTYPE_PSXSHM:
		error = pshm_truncate(p, fp, uap->fd, uap->length, retval);
		goto out;
	case DTYPE_VNODE:
		break;
	default:
		error = EINVAL;
		goto out;
	}

	vp = (vnode_t)fp_get_data(fp);

	/* the descriptor must have been opened for writing */
	if ((fp->fp_glob->fg_flag & FWRITE) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EINVAL;
		goto out;
	}

	if ((error = vnode_getwithref(vp)) != 0) {
		goto out;
	}
	need_vnode_put = true;

	VATTR_INIT(&va);
	VATTR_WANTED(&va, va_flags);

	error = vnode_getattr(vp, &va, vfs_context_current());
	if (error) {
		goto out;
	}

	/* Don't allow ftruncate if the file has append-only flag set. */
	if (va.va_flags & APPEND) {
		error = EPERM;
		goto out;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	/* need_auth is false: write access was effectively authorized at open */
	error = truncate_internal(vp, uap->length, fp->fp_glob->fg_cred,
	    vfs_context_current(), false);
	if (!error) {
		/* record the write; fsync_common() checks this for named streams */
		fp->fp_glob->fg_flag |= FWASWRITTEN;
	}

out:
	if (vp && need_vnode_put) {
		vnode_put(vp);
	}

	file_drop(uap->fd);
	return error;
}
8568
8569
8570 /*
8571 * Sync an open file with synchronized I/O _file_ integrity completion
8572 */
8573 /* ARGSUSED */
8574 int
fsync(proc_t p,struct fsync_args * uap,__unused int32_t * retval)8575 fsync(proc_t p, struct fsync_args *uap, __unused int32_t *retval)
8576 {
8577 __pthread_testcancel(1);
8578 return fsync_common(p, uap, MNT_WAIT);
8579 }
8580
8581
8582 /*
8583 * Sync an open file with synchronized I/O _file_ integrity completion
8584 *
8585 * Notes: This is a legacy support function that does not test for
8586 * thread cancellation points.
8587 */
8588 /* ARGSUSED */
8589 int
fsync_nocancel(proc_t p,struct fsync_nocancel_args * uap,__unused int32_t * retval)8590 fsync_nocancel(proc_t p, struct fsync_nocancel_args *uap, __unused int32_t *retval)
8591 {
8592 return fsync_common(p, (struct fsync_args *)uap, MNT_WAIT);
8593 }
8594
8595
8596 /*
8597 * Sync an open file with synchronized I/O _data_ integrity completion
8598 */
8599 /* ARGSUSED */
8600 int
fdatasync(proc_t p,struct fdatasync_args * uap,__unused int32_t * retval)8601 fdatasync(proc_t p, struct fdatasync_args *uap, __unused int32_t *retval)
8602 {
8603 __pthread_testcancel(1);
8604 return fsync_common(p, (struct fsync_args *)uap, MNT_DWAIT);
8605 }
8606
8607
8608 /*
8609 * fsync_common
8610 *
8611 * Common fsync code to support both synchronized I/O file integrity completion
8612 * (normal fsync) and synchronized I/O data integrity completion (fdatasync).
8613 *
8614 * If 'flags' is MNT_DWAIT, the caller is requesting data integrity, which
8615 * will only guarantee that the file data contents are retrievable. If
 * 'flags' is MNT_WAIT, the caller is requesting file integrity, which also
8617 * includes additional metadata unnecessary for retrieving the file data
8618 * contents, such as atime, mtime, ctime, etc., also be committed to stable
8619 * storage.
8620 *
8621 * Parameters: p The process
8622 * uap->fd The descriptor to synchronize
8623 * flags The data integrity flags
8624 *
8625 * Returns: int Success
8626 * fp_getfvp:EBADF Bad file descriptor
8627 * fp_getfvp:ENOTSUP fd does not refer to a vnode
8628 * VNOP_FSYNC:??? unspecified
8629 *
8630 * Notes: We use struct fsync_args because it is a short name, and all
8631 * caller argument structures are otherwise identical.
8632 */
static int
fsync_common(proc_t p, struct fsync_args *uap, int flags)
{
	vnode_t vp;
	struct fileproc *fp;
	vfs_context_t ctx = vfs_context_current();
	int error;

	AUDIT_ARG(fd, uap->fd);

	/* translate the descriptor to fileproc + vnode in one step */
	if ((error = fp_getfvp(p, uap->fd, &fp, &vp))) {
		return error;
	}
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	/* flags: MNT_WAIT = file integrity, MNT_DWAIT = data integrity only */
	error = VNOP_FSYNC(vp, flags, ctx);

#if NAMEDRSRCFORK
	/* Sync resource fork shadow file if necessary. */
	if ((error == 0) &&
	    (vp->v_flag & VISNAMEDSTREAM) &&
	    (vp->v_parent != NULLVP) &&
	    vnode_isshadow(vp) &&
	    (fp->fp_glob->fg_flag & FWASWRITTEN)) {
		(void) vnode_flushnamedstream(vp->v_parent, vp, ctx);
	}
#endif

	(void)vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
8670
/*
 * Duplicate files. Source must be a file, target must be a file or
 * must not exist.
 *
 * XXX Copyfile authorisation checking is woefully inadequate, and will not
 * perform inheritance correctly.
 */
/* ARGSUSED */
int
copyfile(__unused proc_t p, struct copyfile_args *uap, __unused int32_t *retval)
{
	vnode_t tvp, fvp, tdvp, sdvp;
	struct nameidata fromnd, tond;
	int error;
	vfs_context_t ctx = vfs_context_current();

	/* Check that the flags are valid. */
	if (uap->flags & ~CPF_MASK) {
		return EINVAL;
	}

	/* look up the source */
	NDINIT(&fromnd, LOOKUP, OP_COPYFILE, AUDITVNPATH1,
	    UIO_USERSPACE, uap->from, ctx);
	if ((error = namei(&fromnd))) {
		return error;
	}
	fvp = fromnd.ni_vp;

	/* look up the target; SAVESTART keeps ni_startdir referenced for us */
	NDINIT(&tond, CREATE, OP_LINK,
	    LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART | AUDITVNPATH2 | CN_NBMOUNTLOOK,
	    UIO_USERSPACE, uap->to, ctx);
	if ((error = namei(&tond))) {
		goto out1;
	}
	tdvp = tond.ni_dvp;
	tvp = tond.ni_vp;

	/* an existing target may only be replaced with CPF_OVERWRITE */
	if (tvp != NULL) {
		if (!(uap->flags & CPF_OVERWRITE)) {
			error = EEXIST;
			goto out;
		}
	}

	if (fvp->v_type == VDIR || (tvp && tvp->v_type == VDIR)) {
		error = EISDIR;
		goto out;
	}

	/* sockets are not copyable, except for fdesc nodes */
	if (fvp->v_type == VSOCK && fvp->v_tag != VT_FDESC) {
		error = EOPNOTSUPP;
		goto out;
	}

#if CONFIG_MACF
	if ((error = mac_vnode_check_copyfile(ctx, tdvp, tvp, fvp, &tond.ni_cnd, (mode_t)uap->mode, uap->flags)) != 0) {
		goto out;
	}
#endif /* CONFIG_MACF */

	/* need read access on the source ... */
	if ((error = vnode_authorize(fvp, NULL, KAUTH_VNODE_READ_DATA, ctx)) != 0) {
		goto out;
	}
	/* ... delete access on an existing target ... */
	if (tvp) {
		if ((error = vnode_authorize(tvp, tdvp, KAUTH_VNODE_DELETE, ctx)) != 0) {
			goto out;
		}
	}
	/* ... and the right to add a file to the target directory */
	if ((error = vnode_authorize(tdvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
		goto out;
	}

	/* copying a file onto its own parent directory makes no sense */
	if (fvp == tdvp) {
		error = EINVAL;
	}
	/*
	 * If source is the same as the destination (that is the
	 * same inode number) then there is nothing to do.
	 * (fixed to have POSIX semantics - CSM 3/2/98)
	 */
	if (fvp == tvp) {
		error = -1;     /* sentinel: converted to success below */
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(tdvp, false, O_WRONLY);
#endif

	if (!error) {
		error = VNOP_COPYFILE(fvp, tdvp, tvp, &tond.ni_cnd, uap->mode, uap->flags, ctx);
	}
out:
	sdvp = tond.ni_startdir;
	/*
	 * nameidone has to happen before we vnode_put(tdvp)
	 * since it may need to release the fs_nodelock on the tdvp
	 */
	nameidone(&tond);

	if (tvp) {
		vnode_put(tvp);
	}
	vnode_put(tdvp);
	vnode_put(sdvp);
out1:
	vnode_put(fvp);

	nameidone(&fromnd);

	/* same-file case is reported as success, per POSIX */
	if (error == -1) {
		return 0;
	}
	return error;
}
8785
8786 #define CLONE_SNAPSHOT_FALLBACKS_ENABLED 1
8787
8788 /*
8789 * Helper function for doing clones. The caller is expected to provide an
8790 * iocounted source vnode and release it.
8791 */
8792 static int
clonefile_internal(vnode_t fvp,boolean_t data_read_authorised,int dst_dirfd,user_addr_t dst,uint32_t flags,vfs_context_t ctx)8793 clonefile_internal(vnode_t fvp, boolean_t data_read_authorised, int dst_dirfd,
8794 user_addr_t dst, uint32_t flags, vfs_context_t ctx)
8795 {
8796 vnode_t tvp, tdvp;
8797 struct nameidata *tondp = NULL;
8798 int error;
8799 int follow;
8800 boolean_t free_src_acl;
8801 boolean_t attr_cleanup;
8802 enum vtype v_type;
8803 kauth_action_t action;
8804 struct componentname *cnp;
8805 uint32_t defaulted = 0;
8806 struct {
8807 struct vnode_attr va[2];
8808 } *va2p = NULL;
8809 struct vnode_attr *vap = NULL;
8810 struct vnode_attr *nvap = NULL;
8811 uint32_t vnop_flags;
8812
8813 v_type = vnode_vtype(fvp);
8814 switch (v_type) {
8815 case VLNK:
8816 /* FALLTHRU */
8817 case VREG:
8818 action = KAUTH_VNODE_ADD_FILE;
8819 break;
8820 case VDIR:
8821 if (vnode_isvroot(fvp) || vnode_ismount(fvp) ||
8822 fvp->v_mountedhere) {
8823 return EINVAL;
8824 }
8825 action = KAUTH_VNODE_ADD_SUBDIRECTORY;
8826 break;
8827 default:
8828 return EINVAL;
8829 }
8830
8831 AUDIT_ARG(fd2, dst_dirfd);
8832 AUDIT_ARG(value32, flags);
8833
8834 tondp = kalloc_type(struct nameidata, Z_WAITOK | Z_ZERO | Z_NOFAIL);
8835 follow = (flags & CLONE_NOFOLLOW) ? NOFOLLOW : FOLLOW;
8836 NDINIT(tondp, CREATE, OP_LINK, follow | WANTPARENT | AUDITVNPATH2,
8837 UIO_USERSPACE, dst, ctx);
8838 if (flags & CLONE_NOFOLLOW_ANY) {
8839 tondp->ni_flag |= NAMEI_NOFOLLOW_ANY;
8840 }
8841
8842 if ((error = nameiat(tondp, dst_dirfd))) {
8843 kfree_type(struct nameidata, tondp);
8844 return error;
8845 }
8846 cnp = &tondp->ni_cnd;
8847 tdvp = tondp->ni_dvp;
8848 tvp = tondp->ni_vp;
8849
8850 free_src_acl = FALSE;
8851 attr_cleanup = FALSE;
8852
8853 if (tvp != NULL) {
8854 error = EEXIST;
8855 goto out;
8856 }
8857
8858 if (vnode_mount(tdvp) != vnode_mount(fvp)) {
8859 error = EXDEV;
8860 goto out;
8861 }
8862
8863 #if CONFIG_MACF
8864 if ((error = mac_vnode_check_clone(ctx, tdvp, fvp, cnp))) {
8865 goto out;
8866 }
8867 #endif
8868 if ((error = vnode_authorize(tdvp, NULL, action, ctx))) {
8869 goto out;
8870 }
8871
8872 action = KAUTH_VNODE_GENERIC_READ_BITS;
8873 if (data_read_authorised) {
8874 action &= ~KAUTH_VNODE_READ_DATA;
8875 }
8876 if ((error = vnode_authorize(fvp, NULL, action, ctx))) {
8877 goto out;
8878 }
8879
8880 va2p = kalloc_type(typeof(*va2p), Z_WAITOK | Z_NOFAIL);
8881 vap = &va2p->va[0];
8882 nvap = &va2p->va[1];
8883
8884 /*
8885 * certain attributes may need to be changed from the source, we ask for
8886 * those here with the exception of source file's ACLs unless the CLONE_ACL
8887 * flag is specified. By default, the clone file will inherit the target
8888 * directory's ACLs unless the the CLONE_ACL flag is specified then it
8889 * will inherit the source file's ACLs instead.
8890 */
8891 VATTR_INIT(vap);
8892 VATTR_WANTED(vap, va_uid);
8893 VATTR_WANTED(vap, va_gid);
8894 VATTR_WANTED(vap, va_mode);
8895 VATTR_WANTED(vap, va_flags);
8896 if (flags & CLONE_ACL) {
8897 VATTR_WANTED(vap, va_acl);
8898 }
8899
8900 if ((error = vnode_getattr(fvp, vap, ctx)) != 0) {
8901 goto out;
8902 }
8903
8904 VATTR_INIT(nvap);
8905 VATTR_SET(nvap, va_type, v_type);
8906 if (VATTR_IS_SUPPORTED(vap, va_acl) && vap->va_acl != NULL) {
8907 VATTR_SET(nvap, va_acl, vap->va_acl);
8908 free_src_acl = TRUE;
8909 }
8910
8911 /* Handle ACL inheritance, initialize vap. */
8912 if (v_type == VLNK) {
8913 error = vnode_authattr_new(tdvp, nvap, 0, ctx);
8914 } else {
8915 error = vn_attribute_prepare(tdvp, nvap, &defaulted, ctx);
8916 if (error) {
8917 goto out;
8918 }
8919 attr_cleanup = TRUE;
8920 }
8921
8922 vnop_flags = VNODE_CLONEFILE_DEFAULT;
8923 /*
8924 * We've got initial values for all security parameters,
8925 * If we are superuser, then we can change owners to be the
8926 * same as the source. Both superuser and the owner have default
8927 * WRITE_SECURITY privileges so all other fields can be taken
8928 * from source as well.
8929 */
8930 if (!(flags & CLONE_NOOWNERCOPY) && vfs_context_issuser(ctx)) {
8931 if (VATTR_IS_SUPPORTED(vap, va_uid)) {
8932 VATTR_SET(nvap, va_uid, vap->va_uid);
8933 }
8934 if (VATTR_IS_SUPPORTED(vap, va_gid)) {
8935 VATTR_SET(nvap, va_gid, vap->va_gid);
8936 }
8937 } else {
8938 vnop_flags |= VNODE_CLONEFILE_NOOWNERCOPY;
8939 }
8940
8941 if (VATTR_IS_SUPPORTED(vap, va_mode)) {
8942 VATTR_SET(nvap, va_mode, vap->va_mode);
8943 }
8944 if (VATTR_IS_SUPPORTED(vap, va_flags)) {
8945 VATTR_SET(nvap, va_flags,
8946 ((vap->va_flags & ~(UF_DATAVAULT | SF_RESTRICTED)) | /* Turn off from source */
8947 (nvap->va_flags & (UF_DATAVAULT | SF_RESTRICTED))));
8948 }
8949
8950 #if CONFIG_FILE_LEASES
8951 vnode_breakdirlease(tdvp, false, O_WRONLY);
8952 #endif
8953
8954 error = VNOP_CLONEFILE(fvp, tdvp, &tvp, cnp, nvap, vnop_flags, ctx);
8955
8956 if (!error && tvp) {
8957 int update_flags = 0;
8958 #if CONFIG_FSE
8959 int fsevent;
8960 #endif /* CONFIG_FSE */
8961
8962 /*
8963 * If some of the requested attributes weren't handled by the
8964 * VNOP, use our fallback code.
8965 */
8966 if (!VATTR_ALL_SUPPORTED(nvap)) {
8967 (void)vnode_setattr_fallback(tvp, nvap, ctx);
8968 }
8969
8970 #if CONFIG_MACF
8971 (void)vnode_label(vnode_mount(tvp), tdvp, tvp, cnp,
8972 VNODE_LABEL_CREATE, ctx);
8973 #endif
8974
8975 // Make sure the name & parent pointers are hooked up
8976 if (tvp->v_name == NULL) {
8977 update_flags |= VNODE_UPDATE_NAME;
8978 }
8979 if (tvp->v_parent == NULLVP) {
8980 update_flags |= VNODE_UPDATE_PARENT;
8981 }
8982
8983 if (update_flags) {
8984 (void)vnode_update_identity(tvp, tdvp, cnp->cn_nameptr,
8985 cnp->cn_namelen, cnp->cn_hash, update_flags);
8986 }
8987
8988 #if CONFIG_FSE
8989 switch (vnode_vtype(tvp)) {
8990 case VLNK:
8991 /* FALLTHRU */
8992 case VREG:
8993 fsevent = FSE_CREATE_FILE;
8994 break;
8995 case VDIR:
8996 fsevent = FSE_CREATE_DIR;
8997 break;
8998 default:
8999 goto out;
9000 }
9001
9002 if (need_fsevent(fsevent, tvp)) {
9003 /*
9004 * The following is a sequence of three explicit events.
9005 * A pair of FSE_CLONE events representing the source and destination
9006 * followed by an FSE_CREATE_[FILE | DIR] for the destination.
9007 * fseventsd may coalesce the destination clone and create events
9008 * into a single event resulting in the following sequence for a client
9009 * FSE_CLONE (src)
9010 * FSE_CLONE | FSE_CREATE (dst)
9011 */
9012 add_fsevent(FSE_CLONE, ctx, FSE_ARG_VNODE, fvp, FSE_ARG_VNODE, tvp,
9013 FSE_ARG_DONE);
9014 add_fsevent(fsevent, ctx, FSE_ARG_VNODE, tvp,
9015 FSE_ARG_DONE);
9016 }
9017 #endif /* CONFIG_FSE */
9018 }
9019
9020 out:
9021 if (attr_cleanup) {
9022 vn_attribute_cleanup(nvap, defaulted);
9023 }
9024 if (free_src_acl && vap->va_acl) {
9025 kauth_acl_free(vap->va_acl);
9026 }
9027 if (va2p) {
9028 kfree_type(typeof(*va2p), va2p);
9029 }
9030 nameidone(tondp);
9031 kfree_type(struct nameidata, tondp);
9032 if (tvp) {
9033 vnode_put(tvp);
9034 }
9035 vnode_put(tdvp);
9036 return error;
9037 }
9038
9039 /*
9040 * clone files or directories, target must not exist.
9041 */
9042 /* ARGSUSED */
9043 int
clonefileat(__unused proc_t p,struct clonefileat_args * uap,__unused int32_t * retval)9044 clonefileat(__unused proc_t p, struct clonefileat_args *uap,
9045 __unused int32_t *retval)
9046 {
9047 vnode_t fvp;
9048 struct nameidata *ndp = NULL;
9049 int follow;
9050 int error;
9051 vfs_context_t ctx = vfs_context_current();
9052
9053 /* Check that the flags are valid. */
9054 if (uap->flags & ~(CLONE_NOFOLLOW | CLONE_NOOWNERCOPY | CLONE_ACL |
9055 CLONE_NOFOLLOW_ANY)) {
9056 return EINVAL;
9057 }
9058
9059 AUDIT_ARG(fd, uap->src_dirfd);
9060
9061 ndp = kalloc_type(struct nameidata, Z_WAITOK | Z_ZERO | Z_NOFAIL);
9062
9063 follow = (uap->flags & CLONE_NOFOLLOW) ? NOFOLLOW : FOLLOW;
9064 NDINIT(ndp, LOOKUP, OP_COPYFILE, follow | AUDITVNPATH1,
9065 UIO_USERSPACE, uap->src, ctx);
9066 if (uap->flags & CLONE_NOFOLLOW_ANY) {
9067 ndp->ni_flag |= NAMEI_NOFOLLOW_ANY;
9068 }
9069
9070 if ((error = nameiat(ndp, uap->src_dirfd))) {
9071 kfree_type(struct nameidata, ndp);
9072 return error;
9073 }
9074
9075 fvp = ndp->ni_vp;
9076 nameidone(ndp);
9077 kfree_type(struct nameidata, ndp);
9078
9079 error = clonefile_internal(fvp, FALSE, uap->dst_dirfd, uap->dst,
9080 uap->flags, ctx);
9081
9082 vnode_put(fvp);
9083 return error;
9084 }
9085
9086 int
fclonefileat(__unused proc_t p,struct fclonefileat_args * uap,__unused int32_t * retval)9087 fclonefileat(__unused proc_t p, struct fclonefileat_args *uap,
9088 __unused int32_t *retval)
9089 {
9090 vnode_t fvp;
9091 struct fileproc *fp;
9092 int error;
9093 vfs_context_t ctx = vfs_context_current();
9094
9095 /* Check that the flags are valid. */
9096 if (uap->flags & ~(CLONE_NOFOLLOW | CLONE_NOOWNERCOPY | CLONE_ACL |
9097 CLONE_NOFOLLOW_ANY)) {
9098 return EINVAL;
9099 }
9100
9101 AUDIT_ARG(fd, uap->src_fd);
9102 error = fp_getfvp(p, uap->src_fd, &fp, &fvp);
9103 if (error) {
9104 return error;
9105 }
9106
9107 if ((fp->fp_glob->fg_flag & FREAD) == 0) {
9108 AUDIT_ARG(vnpath_withref, fvp, ARG_VNODE1);
9109 error = EBADF;
9110 goto out;
9111 }
9112
9113 if ((error = vnode_getwithref(fvp))) {
9114 goto out;
9115 }
9116
9117 AUDIT_ARG(vnpath, fvp, ARG_VNODE1);
9118
9119 error = clonefile_internal(fvp, TRUE, uap->dst_dirfd, uap->dst,
9120 uap->flags, ctx);
9121
9122 vnode_put(fvp);
9123 out:
9124 file_drop(uap->src_fd);
9125 return error;
9126 }
9127
9128 static int
rename_submounts_callback(mount_t mp,void * arg)9129 rename_submounts_callback(mount_t mp, void *arg)
9130 {
9131 int error = 0;
9132 mount_t pmp = (mount_t)arg;
9133 int prefix_len = (int)strlen(pmp->mnt_vfsstat.f_mntonname);
9134
9135 if (strncmp(mp->mnt_vfsstat.f_mntonname, pmp->mnt_vfsstat.f_mntonname, prefix_len) != 0) {
9136 return 0;
9137 }
9138
9139 if (mp->mnt_vfsstat.f_mntonname[prefix_len] != '/') {
9140 return 0;
9141 }
9142
9143 if ((error = vfs_busy(mp, LK_NOWAIT))) {
9144 printf("vfs_busy failed with %d for %s\n", error, mp->mnt_vfsstat.f_mntonname);
9145 return -1;
9146 }
9147
9148 size_t pathlen = MAXPATHLEN;
9149 if ((error = vn_getpath_ext(mp->mnt_vnodecovered, NULL, mp->mnt_vfsstat.f_mntonname, &pathlen, VN_GETPATH_FSENTER))) {
9150 printf("vn_getpath_ext failed with %d for mnt_vnodecovered of %s\n", error, mp->mnt_vfsstat.f_mntonname);
9151 }
9152
9153 vfs_unbusy(mp);
9154
9155 return error;
9156 }
9157
9158 /*
9159 * Rename files. Source and destination must either both be directories,
9160 * or both not be directories. If target is a directory, it must be empty.
9161 */
9162 /* ARGSUSED */
/*
 * Common implementation for rename(2)/renameat(2)/renameatx_np(2).
 *
 * ctx:          caller's VFS context
 * fromfd/from:  directory fd + path of the source entry
 * tofd/to:      directory fd + path of the destination entry
 * segflg:       address space the path strings live in
 * uflags:       RENAME_* flags from the caller (mapped to VFS_RENAME_*)
 *
 * Returns 0 on success or an errno value.
 */
static int
renameat_internal(vfs_context_t ctx, int fromfd, user_addr_t from,
    int tofd, user_addr_t to, int segflg, u_int uflags)
{
	vnode_t tvp, tdvp;
	vnode_t fvp, fdvp;
	vnode_t mnt_fvp;
	struct nameidata *fromnd, *tond;
	int error = 0;
	int do_retry;
	int retry_count;
	int mntrename;
	int need_event;
	int need_kpath2;
	int has_listeners;
	const char *oname = NULL;
	char *from_name = NULL, *to_name = NULL;
	char *from_name_no_firmlink = NULL, *to_name_no_firmlink = NULL;
	int from_len = 0, to_len = 0;
	int from_len_no_firmlink = 0, to_len_no_firmlink = 0;
	int holding_mntlock;
	int vn_authorize_skipped;
	mount_t locked_mp = NULL;
	vnode_t oparent = NULLVP;
	vnode_t locked_vp = NULLVP;
#if CONFIG_FSE
	/*
	 * NOTE(review): to_finfo is only initialized on the paths that later
	 * read it (tvp != NULL, or batched via tvap) — confirm before
	 * touching the FSE code below.
	 */
	fse_info from_finfo = {}, to_finfo;
#endif
	int from_truncated = 0, to_truncated = 0;
	int from_truncated_no_firmlink = 0, to_truncated_no_firmlink = 0;
	int batched = 0;
	struct vnode_attr *fvap, *tvap;
	int continuing = 0;
	vfs_rename_flags_t flags = uflags & VFS_RENAME_FLAGS_MASK;
	int32_t nofollow_any = 0;
	/* carving out a chunk for structs that are too big to be on stack. */
	struct {
		struct nameidata from_node, to_node;
		struct vnode_attr fv_attr, tv_attr;
	} * __rename_data;

	__rename_data = kalloc_type(typeof(*__rename_data), Z_WAITOK);
	fromnd = &__rename_data->from_node;
	tond = &__rename_data->to_node;

	holding_mntlock = 0;
	do_retry = 0;
	retry_count = 0;
retry:
	/* Re-driven from the bottom after races, ERECYCLE, or lock takeover. */
	fvp = tvp = NULL;
	fdvp = tdvp = NULL;
	fvap = tvap = NULL;
	mnt_fvp = NULLVP;
	mntrename = FALSE;
	vn_authorize_skipped = FALSE;

	if (uflags & RENAME_NOFOLLOW_ANY) {
		nofollow_any = NAMEI_NOFOLLOW_ANY;
	}
	/* Both lookups are marked as candidates for a compound rename VNOP. */
	NDINIT(fromnd, DELETE, OP_UNLINK, WANTPARENT | AUDITVNPATH1,
	    segflg, from, ctx);
	fromnd->ni_flag = NAMEI_COMPOUNDRENAME | nofollow_any;

	NDINIT(tond, RENAME, OP_RENAME, WANTPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK,
	    segflg, to, ctx);
	tond->ni_flag = NAMEI_COMPOUNDRENAME | nofollow_any;

continue_lookup:
	/* Re-entered when the VNOP returns EKEEPLOOKING to resume a lookup. */
	if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
		if ((error = nameiat(fromnd, fromfd))) {
			goto out1;
		}
		fdvp = fromnd->ni_dvp;
		fvp = fromnd->ni_vp;

		if (fvp && fvp->v_type == VDIR) {
			tond->ni_cnd.cn_flags |= WILLBEDIR;
		}
	}

	if ((tond->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
		if ((error = nameiat(tond, tofd))) {
			/*
			 * Translate error code for rename("dir1", "dir2/.").
			 */
			if (error == EISDIR && fvp->v_type == VDIR) {
				error = EINVAL;
			}
			goto out1;
		}
		tdvp = tond->ni_dvp;
		tvp = tond->ni_vp;
	}

#if DEVELOPMENT || DEBUG
	/*
	 * XXX VSWAP: Check for entitlements or special flag here
	 * so we can restrict access appropriately.
	 */
#else /* DEVELOPMENT || DEBUG */

	/* Only the kernel may rename swap files. */
	if (fromnd->ni_vp && vnode_isswap(fromnd->ni_vp) && (ctx != vfs_context_kernel())) {
		error = EPERM;
		goto out1;
	}

	if (tond->ni_vp && vnode_isswap(tond->ni_vp) && (ctx != vfs_context_kernel())) {
		error = EPERM;
		goto out1;
	}
#endif /* DEVELOPMENT || DEBUG */

	/* RENAME_SWAP requires the destination to exist. */
	if (!tvp && ISSET(flags, VFS_RENAME_SWAP)) {
		error = ENOENT;
		goto out1;
	}

	if (tvp && ISSET(flags, VFS_RENAME_EXCL)) {
		int32_t pval = 0;
		int err = 0;

		/*
		 * We allow rename with VFS_RENAME_EXCL flag for an existing file which
		 * has the same name as target iff the following conditions are met:
		 * 1. the target file system is case insensitive
		 * 2. source and target directories are the same
		 * 3. source and target files are the same
		 * 4. name only differs in case (determined by underlying filesystem)
		 */
		if (fvp != tvp || fdvp != tdvp) {
			error = EEXIST;
			goto out1;
		}

		/*
		 * Assume that the target file system is case sensitive if
		 * _PC_CASE_SENSITIVE selector isn't supported.
		 */
		err = VNOP_PATHCONF(tvp, _PC_CASE_SENSITIVE, &pval, ctx);
		if (err != 0 || pval != 0) {
			error = EEXIST;
			goto out1;
		}
	}

	/* True when the FS implements lookup + rename as one compound VNOP. */
	batched = vnode_compound_rename_available(fdvp);

#if CONFIG_FSE
	need_event = need_fsevent(FSE_RENAME, fdvp);
	if (need_event) {
		if (fvp) {
			get_fse_info(fvp, &from_finfo, ctx);
		} else {
			error = vfs_get_notify_attributes(&__rename_data->fv_attr);
			if (error) {
				goto out1;
			}

			fvap = &__rename_data->fv_attr;
		}

		if (tvp) {
			get_fse_info(tvp, &to_finfo, ctx);
		} else if (batched) {
			error = vfs_get_notify_attributes(&__rename_data->tv_attr);
			if (error) {
				goto out1;
			}

			tvap = &__rename_data->tv_attr;
		}
	}
#else
	need_event = 0;
#endif /* CONFIG_FSE */

	has_listeners = kauth_authorize_fileop_has_listeners();

	need_kpath2 = 0;
#if CONFIG_AUDIT
	if (AUDIT_RECORD_EXISTS()) {
		need_kpath2 = 1;
	}
#endif

	/* Build full paths for fsevents, kauth listeners and audit. */
	if (need_event || has_listeners) {
		if (from_name == NULL) {
			GET_PATH(from_name);
		}

		from_len = safe_getpath(fdvp, fromnd->ni_cnd.cn_nameptr, from_name, MAXPATHLEN, &from_truncated);

		if (from_name_no_firmlink == NULL) {
			GET_PATH(from_name_no_firmlink);
		}

		from_len_no_firmlink = safe_getpath_no_firmlink(fdvp, fromnd->ni_cnd.cn_nameptr, from_name_no_firmlink, MAXPATHLEN, &from_truncated_no_firmlink);
	}

	if (need_event || need_kpath2 || has_listeners) {
		if (to_name == NULL) {
			GET_PATH(to_name);
		}

		to_len = safe_getpath(tdvp, tond->ni_cnd.cn_nameptr, to_name, MAXPATHLEN, &to_truncated);

		if (to_name_no_firmlink == NULL) {
			GET_PATH(to_name_no_firmlink);
		}

		to_len_no_firmlink = safe_getpath_no_firmlink(tdvp, tond->ni_cnd.cn_nameptr, to_name_no_firmlink, MAXPATHLEN, &to_truncated_no_firmlink);
		if (to_name && need_kpath2) {
			AUDIT_ARG(kpath, to_name, ARG_KPATH2);
		}
	}
	if (!fvp) {
		/*
		 * Claim: this check will never reject a valid rename.
		 * For success, either fvp must be on the same mount as tdvp, or fvp must sit atop a vnode on the same mount as tdvp.
		 * Suppose fdvp and tdvp are not on the same mount.
		 * If fvp is on the same mount as tdvp, then fvp is not on the same mount as fdvp, so fvp is the root of its filesystem. If fvp is the root,
		 * then you can't move it to within another dir on the same mountpoint.
		 * If fvp sits atop a vnode on the same mount as fdvp, then that vnode must be part of the same mount as fdvp, which is a contradiction.
		 *
		 * If this check passes, then we are safe to pass these vnodes to the same FS.
		 */
		if (fdvp->v_mount != tdvp->v_mount) {
			error = EXDEV;
			goto out1;
		}
		goto skipped_lookup;
	}

	/*
	 * If the source and destination are the same (i.e. they're
	 * links to the same vnode) and the target file system is
	 * case sensitive, then there is nothing to do.
	 *
	 * XXX Come back to this.
	 */
	if (fvp == tvp) {
		int pathconf_val;

		/*
		 * Note: if _PC_CASE_SENSITIVE selector isn't supported,
		 * then assume that this file system is case sensitive.
		 */
		if (VNOP_PATHCONF(fvp, _PC_CASE_SENSITIVE, &pathconf_val, ctx) != 0 ||
		    pathconf_val != 0) {
			vn_authorize_skipped = TRUE;
			goto out1;
		}
	}

	/*
	 * Allow the renaming of mount points.
	 * - target must not exist
	 * - target must reside in the same directory as source
	 * - union mounts cannot be renamed
	 * - the root fs, and tightly-linked system volumes, cannot be renamed
	 *
	 * XXX Handle this in VFS after a continued lookup (if we missed
	 * in the cache to start off)
	 *
	 * N.B. If RENAME_SWAP is being used, then @tvp != NULL and so
	 * we'll skip past here.  The file system is responsible for
	 * checking that @tvp is not a descendent of @fvp and vice versa
	 * so it should always return EINVAL if either @tvp or @fvp is the
	 * root of a volume.
	 */
	if ((fvp->v_flag & VROOT) &&
	    (fvp->v_type == VDIR) &&
	    (tvp == NULL) &&
	    (fvp->v_mountedhere == NULL) &&
	    (fdvp == tdvp) &&
	    ((fvp->v_mount->mnt_flag & (MNT_UNION | MNT_ROOTFS)) == 0) &&
	    ((fvp->v_mount->mnt_kern_flag & MNTK_SYSTEM) == 0) &&
	    (fvp->v_mount->mnt_vnodecovered != NULLVP)) {
		vnode_t coveredvp;

		/* switch fvp to the covered vnode */
		coveredvp = fvp->v_mount->mnt_vnodecovered;
		if ((vnode_getwithref(coveredvp))) {
			error = ENOENT;
			goto out1;
		}
		/*
		 * Save the 'fvp' as it is needed for vn_authorize_renamex_with_paths()
		 * later.
		 */
		mnt_fvp = fvp;

		fvp = coveredvp;
		mntrename = TRUE;
	}
	/*
	 * Check for cross-device rename.
	 * For rename on mountpoint, we want to also check the source and its parent
	 * belong to the same mountpoint.
	 */
	if ((fvp->v_mount != tdvp->v_mount) ||
	    (fvp->v_mount != fdvp->v_mount) ||
	    (tvp && (fvp->v_mount != tvp->v_mount))) {
		error = EXDEV;
		goto out1;
	}

	/*
	 * If source is the same as the destination (that is the
	 * same inode number) then there is nothing to do...
	 * EXCEPT if the underlying file system supports case
	 * insensitivity and is case preserving.  In this case
	 * the file system needs to handle the special case of
	 * getting the same vnode as target (fvp) and source (tvp).
	 *
	 * Only file systems that support pathconf selectors _PC_CASE_SENSITIVE
	 * and _PC_CASE_PRESERVING can have this exception, and they need to
	 * handle the special case of getting the same vnode as target and
	 * source.  NOTE: Then the target is unlocked going into vnop_rename,
	 * so not to cause locking problems. There is a single reference on tvp.
	 *
	 * NOTE - that fvp == tvp also occurs if they are hard linked and
	 * that correct behaviour then is just to return success without doing
	 * anything.
	 *
	 * XXX filesystem should take care of this itself, perhaps...
	 */
	if (fvp == tvp && fdvp == tdvp) {
		if (fromnd->ni_cnd.cn_namelen == tond->ni_cnd.cn_namelen &&
		    !bcmp(fromnd->ni_cnd.cn_nameptr, tond->ni_cnd.cn_nameptr,
		    fromnd->ni_cnd.cn_namelen)) {
			vn_authorize_skipped = TRUE;
			goto out1;
		}
	}

	if (holding_mntlock && fvp->v_mount != locked_mp) {
		/*
		 * we're holding a reference and lock
		 * on locked_mp, but it no longer matches
		 * what we want to do... so drop our hold
		 */
		mount_unlock_renames(locked_mp);
		mount_drop(locked_mp, 0);
		holding_mntlock = 0;
	}
	if (tdvp != fdvp && fvp->v_type == VDIR) {
		/*
		 * serialize renames that re-shape
		 * the tree... if holding_mntlock is
		 * set, then we're ready to go...
		 * otherwise we
		 * first need to drop the iocounts
		 * we picked up, second take the
		 * lock to serialize the access,
		 * then finally start the lookup
		 * process over with the lock held
		 */
		if (!holding_mntlock) {
			/*
			 * need to grab a reference on
			 * the mount point before we
			 * drop all the iocounts... once
			 * the iocounts are gone, the mount
			 * could follow
			 */
			locked_mp = fvp->v_mount;
			mount_ref(locked_mp, 0);

			/*
			 * nameidone has to happen before we vnode_put(tvp)
			 * since it may need to release the fs_nodelock on the tvp
			 */
			nameidone(tond);

			if (tvp) {
				vnode_put(tvp);
			}
			vnode_put(tdvp);

			/*
			 * nameidone has to happen before we vnode_put(fdvp)
			 * since it may need to release the fs_nodelock on the fvp
			 */
			nameidone(fromnd);

			vnode_put(fvp);
			vnode_put(fdvp);

			if (mnt_fvp != NULLVP) {
				vnode_put(mnt_fvp);
			}

			mount_lock_renames(locked_mp);
			holding_mntlock = 1;

			/* Restart the whole lookup with the rename lock held. */
			goto retry;
		}
	} else {
		/*
		 * when we dropped the iocounts to take
		 * the lock, we allowed the identity of
		 * the various vnodes to change... if they did,
		 * we may no longer be dealing with a rename
		 * that reshapes the tree... once we're holding
		 * the iocounts, the vnodes can't change type
		 * so we're free to drop the lock at this point
		 * and continue on
		 */
		if (holding_mntlock) {
			mount_unlock_renames(locked_mp);
			mount_drop(locked_mp, 0);
			holding_mntlock = 0;
		}
	}

	if (!batched) {
		/* Non-compound path: authorize here (FS does it for batched). */
		assert(locked_vp == NULLVP);
		vnode_link_lock(fvp);
		locked_vp = fvp;
		error = vn_authorize_renamex_with_paths(fdvp, mntrename ? mnt_fvp : fvp,
		    &fromnd->ni_cnd, from_name, tdvp, tvp, &tond->ni_cnd, to_name, ctx,
		    flags, NULL);
		if (error) {
			if (error == ENOENT) {
				if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
					/*
					 * We encountered a race where after doing the namei,
					 * tvp stops being valid. If so, simply re-drive the rename
					 * call from the top.
					 */
					do_retry = 1;
					retry_count += 1;
				}
			}
			vnode_link_unlock(fvp);
			locked_vp = NULLVP;
			goto out1;
		}
	}

	/* Release the 'mnt_fvp' now that it is no longer needed. */
	if (mnt_fvp != NULLVP) {
		vnode_put(mnt_fvp);
		mnt_fvp = NULLVP;
	}

	// save these off so we can later verify that fvp is the same
	oname = fvp->v_name;
	oparent = fvp->v_parent;

skipped_lookup:
#if CONFIG_FILE_LEASES
	/* Lease break needed for source's parent dir? */
	vnode_breakdirlease(fdvp, false, O_WRONLY);

	/* Lease break needed for target's parent dir? */
	vnode_breakdirlease(tdvp, false, O_WRONLY);
#endif

	error = vn_rename(fdvp, &fvp, &fromnd->ni_cnd, fvap,
	    tdvp, &tvp, &tond->ni_cnd, tvap,
	    flags, ctx);

	if (locked_vp) {
		vnode_link_unlock(fvp);
		locked_vp = NULLVP;
	}

	if (holding_mntlock) {
		/*
		 * we can drop our serialization
		 * lock now
		 */
		mount_unlock_renames(locked_mp);
		mount_drop(locked_mp, 0);
		holding_mntlock = 0;
	}
	if (error) {
		if (error == EDATALESS) {
			/*
			 * If we've been here before, something has gone
			 * horribly wrong and we should just get out lest
			 * we spiral around the drain forever.
			 */
			if (flags & VFS_RENAME_DATALESS) {
				error = EIO;
				goto out1;
			}

			/*
			 * The object we're renaming is dataless (or has a
			 * dataless descendent) and requires materialization
			 * before the rename occurs.  But we're holding the
			 * mount point's rename lock, so it's not safe to
			 * make the upcall.
			 *
			 * In this case, we release the lock (above), perform
			 * the materialization, and start the whole thing over.
			 */
			error = vfs_materialize_reparent(fvp, tdvp);
			if (error == 0) {
				/*
				 * The next time around we need to tell the
				 * file system that the materializtaion has
				 * been performed.
				 */
				flags |= VFS_RENAME_DATALESS;
				do_retry = 1;
			}
			goto out1;
		}
		if (error == EKEEPLOOKING) {
			/* Compound VNOP needs more lookup work; resume it. */
			if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) == 0) {
				if ((tond->ni_flag & NAMEI_CONTLOOKUP) == 0) {
					panic("EKEEPLOOKING without NAMEI_CONTLOOKUP on either ndp?");
				}
			}

			fromnd->ni_vp = fvp;
			tond->ni_vp = tvp;

			goto continue_lookup;
		}

		/*
		 * We may encounter a race in the VNOP where the destination didn't
		 * exist when we did the namei, but it does by the time we go and
		 * try to create the entry. In this case, we should re-drive this rename
		 * call from the top again.  Currently, only HFS bubbles out ERECYCLE,
		 * but other filesystems susceptible to this race could return it, too.
		 */
		if (error == ERECYCLE) {
			if (retry_count < MAX_RENAME_ERECYCLE_RETRIES) {
				do_retry = 1;
				retry_count += 1;
			} else {
				printf("rename retry limit due to ERECYCLE reached\n");
				error = ENOENT;
			}
		}

		/*
		 * For compound VNOPs, the authorization callback may return
		 * ENOENT in case of racing hardlink lookups hitting the name
		 * cache, redrive the lookup.
		 */
		if (batched && error == ENOENT) {
			if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
				do_retry = 1;
				retry_count += 1;
			}
		}

		goto out1;
	}

	/* call out to allow 3rd party notification of rename.
	 * Ignore result of kauth_authorize_fileop call.
	 */
	kauth_authorize_fileop(vfs_context_ucred(ctx),
	    KAUTH_FILEOP_RENAME,
	    (uintptr_t)from_name, (uintptr_t)to_name);
	if (flags & VFS_RENAME_SWAP) {
		/* A swap is two renames; notify listeners of the second one. */
		kauth_authorize_fileop(vfs_context_ucred(ctx),
		    KAUTH_FILEOP_RENAME,
		    (uintptr_t)to_name, (uintptr_t)from_name);
	}

#if CONFIG_FSE
	if (from_name != NULL && to_name != NULL) {
		if (from_truncated || to_truncated) {
			// set it here since only the from_finfo gets reported up to user space
			from_finfo.mode |= FSE_TRUNCATED_PATH;
		}

		if (tvap && tvp) {
			vnode_get_fse_info_from_vap(tvp, &to_finfo, tvap);
		}
		if (fvap) {
			vnode_get_fse_info_from_vap(fvp, &from_finfo, fvap);
		}

		if (tvp) {
			add_fsevent(FSE_RENAME, ctx,
			    FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
			    FSE_ARG_FINFO, &from_finfo,
			    FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
			    FSE_ARG_FINFO, &to_finfo,
			    FSE_ARG_DONE);
			if (flags & VFS_RENAME_SWAP) {
				/*
				 * Strictly speaking, swap is the equivalent of
				 * *three* renames.  FSEvents clients should only take
				 * the events as a hint, so we only bother reporting
				 * two.
				 */
				add_fsevent(FSE_RENAME, ctx,
				    FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
				    FSE_ARG_FINFO, &to_finfo,
				    FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
				    FSE_ARG_FINFO, &from_finfo,
				    FSE_ARG_DONE);
			}
		} else {
			add_fsevent(FSE_RENAME, ctx,
			    FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
			    FSE_ARG_FINFO, &from_finfo,
			    FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
			    FSE_ARG_DONE);
		}
	}
#endif /* CONFIG_FSE */

	/*
	 * update filesystem's mount point data
	 */
	if (mntrename) {
		char *cp, *pathend, *mpname;
		char * tobuf;
		struct mount *mp;
		int maxlen;
		size_t len = 0;

		mp = fvp->v_mountedhere;

		if (vfs_busy(mp, LK_NOWAIT)) {
			error = EBUSY;
			goto out1;
		}
		tobuf = zalloc(ZV_NAMEI);

		if (UIO_SEG_IS_USER_SPACE(segflg)) {
			error = copyinstr(to, tobuf, MAXPATHLEN, &len);
		} else {
			error = copystr((void *)to, tobuf, MAXPATHLEN, &len);
		}
		if (!error) {
			/* find current mount point prefix */
			pathend = &mp->mnt_vfsstat.f_mntonname[0];
			for (cp = pathend; *cp != '\0'; ++cp) {
				if (*cp == '/') {
					pathend = cp + 1;
				}
			}
			/* find last component of target name */
			for (mpname = cp = tobuf; *cp != '\0'; ++cp) {
				if (*cp == '/') {
					mpname = cp + 1;
				}
			}

			/* Update f_mntonname of sub mounts */
			vfs_iterate(0, rename_submounts_callback, (void *)mp);

			/* append name to prefix */
			maxlen = MAXPATHLEN - (int)(pathend - mp->mnt_vfsstat.f_mntonname);
			bzero(pathend, maxlen);

			strlcpy(pathend, mpname, maxlen);
		}
		zfree(ZV_NAMEI, tobuf);

		vfs_unbusy(mp);

		vfs_event_signal(NULL, VQ_UPDATE, (intptr_t)NULL);
	}
	/*
	 * fix up name & parent pointers.  note that we first
	 * check that fvp has the same name/parent pointers it
	 * had before the rename call... this is a 'weak' check
	 * at best...
	 *
	 * XXX oparent and oname may not be set in the compound vnop case
	 */
	if (batched || (oname == fvp->v_name && oparent == fvp->v_parent)) {
		int update_flags;

		update_flags = VNODE_UPDATE_NAME;

		if (fdvp != tdvp) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		vnode_update_identity(fvp, tdvp, tond->ni_cnd.cn_nameptr, tond->ni_cnd.cn_namelen, tond->ni_cnd.cn_hash, update_flags);
	}
out1:
	/*
	 * There are some cases (for e.g. 'fvp == tvp') when vn_authorize was
	 * skipped earlier as no actual rename was performed.
	 */
	if (vn_authorize_skipped && error == 0) {
		error = vn_authorize_renamex_with_paths(fdvp, fvp,
		    &fromnd->ni_cnd, from_name, tdvp, tvp, &tond->ni_cnd, to_name, ctx,
		    flags, NULL);
		if (error && error == ENOENT) {
			if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
				do_retry = 1;
				retry_count += 1;
			}
		}
	}
	if (to_name != NULL) {
		RELEASE_PATH(to_name);
		to_name = NULL;
	}
	if (to_name_no_firmlink != NULL) {
		RELEASE_PATH(to_name_no_firmlink);
		to_name_no_firmlink = NULL;
	}
	if (from_name != NULL) {
		RELEASE_PATH(from_name);
		from_name = NULL;
	}
	if (from_name_no_firmlink != NULL) {
		RELEASE_PATH(from_name_no_firmlink);
		from_name_no_firmlink = NULL;
	}
	if (holding_mntlock) {
		mount_unlock_renames(locked_mp);
		mount_drop(locked_mp, 0);
		holding_mntlock = 0;
	}
	if (tdvp) {
		/*
		 * nameidone has to happen before we vnode_put(tdvp)
		 * since it may need to release the fs_nodelock on the tdvp
		 */
		nameidone(tond);

		if (tvp) {
			vnode_put(tvp);
		}
		vnode_put(tdvp);
	}
	if (fdvp) {
		/*
		 * nameidone has to happen before we vnode_put(fdvp)
		 * since it may need to release the fs_nodelock on the fdvp
		 */
		nameidone(fromnd);

		if (fvp) {
			vnode_put(fvp);
		}
		vnode_put(fdvp);
	}
	if (mnt_fvp != NULLVP) {
		vnode_put(mnt_fvp);
	}
	/*
	 * If things changed after we did the namei, then we will re-drive
	 * this rename call from the top.
	 */
	if (do_retry) {
		do_retry = 0;
		goto retry;
	}

	kfree_type(typeof(*__rename_data), __rename_data);
	return error;
}
9925
9926 int
rename(__unused proc_t p,struct rename_args * uap,__unused int32_t * retval)9927 rename(__unused proc_t p, struct rename_args *uap, __unused int32_t *retval)
9928 {
9929 return renameat_internal(vfs_context_current(), AT_FDCWD, uap->from,
9930 AT_FDCWD, uap->to, UIO_USERSPACE, 0);
9931 }
9932
9933 int
renameatx_np(__unused proc_t p,struct renameatx_np_args * uap,__unused int32_t * retval)9934 renameatx_np(__unused proc_t p, struct renameatx_np_args *uap, __unused int32_t *retval)
9935 {
9936 if (uap->flags & ~(RENAME_SECLUDE | RENAME_EXCL | RENAME_SWAP | RENAME_NOFOLLOW_ANY)) {
9937 return EINVAL;
9938 }
9939
9940 if ((uap->flags & (RENAME_EXCL | RENAME_SWAP)) == (RENAME_EXCL | RENAME_SWAP)) {
9941 return EINVAL;
9942 }
9943
9944 return renameat_internal(vfs_context_current(), uap->fromfd, uap->from,
9945 uap->tofd, uap->to, UIO_USERSPACE, uap->flags);
9946 }
9947
9948 int
renameat(__unused proc_t p,struct renameat_args * uap,__unused int32_t * retval)9949 renameat(__unused proc_t p, struct renameat_args *uap, __unused int32_t *retval)
9950 {
9951 return renameat_internal(vfs_context_current(), uap->fromfd, uap->from,
9952 uap->tofd, uap->to, UIO_USERSPACE, 0);
9953 }
9954
9955 /*
9956 * Make a directory file.
9957 *
9958 * Returns: 0 Success
9959 * EEXIST
9960 * namei:???
9961 * vnode_authorize:???
9962 * vn_create:???
9963 */
9964 /* ARGSUSED */
static int
mkdir1at(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap, int fd,
    enum uio_seg segflg)
{
	vnode_t vp, dvp;
	int error;
	int update_flags = 0;
	int batched;
	struct nameidata nd;

	AUDIT_ARG(mode, vap->va_mode);
	/*
	 * Look up the parent with LOCKPARENT, advertising that a compound
	 * (lookup + mkdir) VNOP may be used if the file system supports one.
	 */
	NDINIT(&nd, CREATE, OP_MKDIR, LOCKPARENT | AUDITVNPATH1, segflg,
	    path, ctx);
	nd.ni_cnd.cn_flags |= WILLBEDIR;
	nd.ni_flag = NAMEI_COMPOUNDMKDIR;

continue_lookup:
	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* A non-NULL vp means the lookup found an existing target. */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}

	batched = vnode_compound_mkdir_available(dvp);

	VATTR_SET(vap, va_type, VDIR);

	/*
	 * XXX
	 * Don't authorize in VFS for compound VNOP.... mkdir -p today assumes that it will
	 * only get EXISTS or EISDIR for existing path components, and not that it could see
	 * EACCESS/EPERM--so if we authorize for mkdir on "/" for "mkdir -p /tmp/foo/bar/baz"
	 * it will fail in a spurious manner.  Need to figure out if this is valid behavior.
	 */
	if ((error = vn_authorize_mkdir(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0) {
		if (error == EACCES || error == EPERM) {
			int error2;

			/* Drop the CREATE lookup state before re-driving a plain LOOKUP. */
			nameidone(&nd);
			vnode_put(dvp);
			dvp = NULLVP;

			/*
			 * Try a lookup without "NAMEI_COMPOUNDVNOP" to make sure we return EEXIST
			 * rather than EACCESS if the target exists.
			 */
			NDINIT(&nd, LOOKUP, OP_MKDIR, AUDITVNPATH1, segflg,
			    path, ctx);
			error2 = nameiat(&nd, fd);
			if (error2) {
				/* Target truly absent: report the original EACCES/EPERM. */
				goto out;
			} else {
				vp = nd.ni_vp;
				error = EEXIST;
				goto out;
			}
		}

		goto out;
	}

#if CONFIG_FILE_LEASES
	/* Creating an entry modifies the parent: break any directory lease. */
	vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

	/*
	 * make the directory
	 */
	if ((error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx)) != 0) {
		/*
		 * EKEEPLOOKING: a compound VNOP consumed part of the path and
		 * asks us to continue the lookup from where it left off.
		 */
		if (error == EKEEPLOOKING) {
			nd.ni_vp = vp;
			goto continue_lookup;
		}

		goto out;
	}

	// Make sure the name & parent pointers are hooked up
	if (vp->v_name == NULL) {
		update_flags |= VNODE_UPDATE_NAME;
	}
	if (vp->v_parent == NULLVP) {
		update_flags |= VNODE_UPDATE_PARENT;
	}

	if (update_flags) {
		vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
	}

#if CONFIG_FSE
	add_fsevent(FSE_CREATE_DIR, ctx, FSE_ARG_VNODE, vp, FSE_ARG_DONE);
#endif

out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	if (dvp) {
		vnode_put(dvp);
	}

	return error;
}
10080
10081 /*
10082 * mkdir_extended: Create a directory; with extended security (ACL).
10083 *
10084 * Parameters: p Process requesting to create the directory
10085 * uap User argument descriptor (see below)
10086 * retval (ignored)
10087 *
10088 * Indirect: uap->path Path of directory to create
10089 * uap->mode Access permissions to set
10090 * uap->xsecurity ACL to set
10091 *
10092 * Returns: 0 Success
10093 * !0 Not success
10094 *
10095 */
10096 int
mkdir_extended(proc_t p,struct mkdir_extended_args * uap,__unused int32_t * retval)10097 mkdir_extended(proc_t p, struct mkdir_extended_args *uap, __unused int32_t *retval)
10098 {
10099 int ciferror;
10100 kauth_filesec_t xsecdst;
10101 struct vnode_attr va;
10102
10103 AUDIT_ARG(owner, uap->uid, uap->gid);
10104
10105 xsecdst = NULL;
10106 if ((uap->xsecurity != USER_ADDR_NULL) &&
10107 ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)) {
10108 return ciferror;
10109 }
10110
10111 VATTR_INIT(&va);
10112 VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd.fd_cmask);
10113 if (xsecdst != NULL) {
10114 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
10115 va.va_vaflags |= VA_FILESEC_ACL;
10116 }
10117
10118 ciferror = mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
10119 UIO_USERSPACE);
10120 if (xsecdst != NULL) {
10121 kauth_filesec_free(xsecdst);
10122 }
10123 return ciferror;
10124 }
10125
10126 int
mkdir(proc_t p,struct mkdir_args * uap,__unused int32_t * retval)10127 mkdir(proc_t p, struct mkdir_args *uap, __unused int32_t *retval)
10128 {
10129 struct vnode_attr va;
10130
10131 VATTR_INIT(&va);
10132 VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd.fd_cmask);
10133
10134 return mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
10135 UIO_USERSPACE);
10136 }
10137
10138 int
mkdirat(proc_t p,struct mkdirat_args * uap,__unused int32_t * retval)10139 mkdirat(proc_t p, struct mkdirat_args *uap, __unused int32_t *retval)
10140 {
10141 struct vnode_attr va;
10142
10143 VATTR_INIT(&va);
10144 VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd.fd_cmask);
10145
10146 return mkdir1at(vfs_context_current(), uap->path, &va, uap->fd,
10147 UIO_USERSPACE);
10148 }
10149
static int
rmdirat_internal(vfs_context_t ctx, int fd, user_addr_t dirpath,
    enum uio_seg segflg, int unlink_flags)
{
	/*
	 * nameidata and the fsevents vnode_attr are large; heap-allocate them
	 * together to keep this (deep) kernel stack frame small.
	 */
	struct {
		struct nameidata nd;
#if CONFIG_FSE
		struct vnode_attr va;
#endif /* CONFIG_FSE */
	} *__rmdir_data;
	vnode_t vp, dvp;
	int error;
	struct nameidata *ndp;
	char *path = NULL;
	char *no_firmlink_path = NULL;
	int len_path = 0;
	int len_no_firmlink_path = 0;
	int has_listeners = 0;
	int need_event = 0;
	int truncated_path = 0;
	int truncated_no_firmlink_path = 0;
	struct vnode_attr *vap = NULL;
	int restart_count = 0;
	int batched;

	int restart_flag;
	int nofollow_any = 0;

	__rmdir_data = kalloc_type(typeof(*__rmdir_data), Z_WAITOK);
	ndp = &__rmdir_data->nd;

	/* Translate the VFS flag into the equivalent namei flag. */
	if (unlink_flags & VNODE_REMOVE_NOFOLLOW_ANY) {
		nofollow_any = NAMEI_NOFOLLOW_ANY;
		unlink_flags &= ~VNODE_REMOVE_NOFOLLOW_ANY;
	}

	/*
	 * This loop exists to restart rmdir in the unlikely case that two
	 * processes are simultaneously trying to remove the same directory
	 * containing orphaned appleDouble files.
	 */
	do {
		NDINIT(ndp, DELETE, OP_RMDIR, LOCKPARENT | AUDITVNPATH1,
		    segflg, dirpath, ctx);
		ndp->ni_flag = NAMEI_COMPOUNDRMDIR | nofollow_any;
continue_lookup:
		restart_flag = 0;
		vap = NULL;

		error = nameiat(ndp, fd);
		if (error) {
			goto err_out;
		}

		dvp = ndp->ni_dvp;
		vp = ndp->ni_vp;

		if (vp) {
			batched = vnode_compound_rmdir_available(vp);

			if (vp->v_flag & VROOT) {
				/*
				 * The root of a mounted filesystem cannot be deleted.
				 */
				error = EBUSY;
				goto out;
			}

#if DEVELOPMENT || DEBUG
			/*
			 * XXX VSWAP: Check for entitlements or special flag here
			 * so we can restrict access appropriately.
			 */
#else /* DEVELOPMENT || DEBUG */

			/* Never allow removal of an in-use swap file's vnode. */
			if (vnode_isswap(vp) && (ctx != vfs_context_kernel())) {
				error = EPERM;
				goto out;
			}
#endif /* DEVELOPMENT || DEBUG */

			/*
			 * Removed a check here; we used to abort if vp's vid
			 * was not the same as what we'd seen the last time around.
			 * I do not think that check was valid, because if we retry
			 * and all dirents are gone, the directory could legitimately
			 * be recycled but still be present in a situation where we would
			 * have had permission to delete.  Therefore, we won't make
			 * an effort to preserve that check now that we may not have a
			 * vp here.
			 */

			if (!batched) {
				error = vn_authorize_rmdir(dvp, vp, &ndp->ni_cnd, ctx, NULL);
				if (error) {
					/*
					 * ENOENT here can come from a racing lookup;
					 * re-drive a bounded number of times.
					 */
					if (error == ENOENT) {
						if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
							restart_flag = 1;
							restart_count += 1;
						}
					}
					goto out;
				}
			}
		} else {
			/* No vp: the FS must resolve it via a compound rmdir. */
			batched = 1;

			if (!vnode_compound_rmdir_available(dvp)) {
				panic("No error, but no compound rmdir?");
			}
		}

#if CONFIG_FSE
		fse_info finfo = {0};

		need_event = need_fsevent(FSE_DELETE, dvp);
		if (need_event) {
			if (!batched) {
				get_fse_info(vp, &finfo, ctx);
			} else {
				/* Batched: attributes are gathered during the compound VNOP. */
				error = vfs_get_notify_attributes(&__rmdir_data->va);
				if (error) {
					goto out;
				}

				vap = &__rmdir_data->va;
			}
		}
#endif
		has_listeners = kauth_authorize_fileop_has_listeners();
		if (need_event || has_listeners) {
			/* Build the paths needed by fsevents/kauth listeners. */
			if (path == NULL) {
				GET_PATH(path);
			}

			len_path = safe_getpath(dvp, ndp->ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path);

			if (no_firmlink_path == NULL) {
				GET_PATH(no_firmlink_path);
			}

			len_no_firmlink_path = safe_getpath_no_firmlink(dvp, ndp->ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);
#if CONFIG_FSE
			if (truncated_no_firmlink_path) {
				finfo.mode |= FSE_TRUNCATED_PATH;
			}
#endif
		}

#if CONFIG_FILE_LEASES
		/* Removing an entry modifies the parent: break any directory lease. */
		vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

		error = vn_rmdir(dvp, &vp, ndp, vap, ctx);
		ndp->ni_vp = vp;
		if (vp == NULLVP) {
			/* Couldn't find a vnode */
			goto out;
		}

		if (error == EKEEPLOOKING) {
			/* Compound VNOP wants us to continue the lookup. */
			goto continue_lookup;
		} else if (batched && error == ENOENT) {
			if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
				/*
				 * For compound VNOPs, the authorization callback
				 * may return ENOENT in case of racing hard link lookups
				 * redrive the lookup.
				 */
				restart_flag = 1;
				restart_count += 1;
				goto out;
			}
		}

		/*
		 * XXX There's no provision for passing flags
		 * to VNOP_RMDIR().  So, if vn_rmdir() fails
		 * because it's not empty, then we try again
		 * with VNOP_REMOVE(), passing in a special
		 * flag that clever file systems will know
		 * how to handle.
		 */
		if (error == ENOTEMPTY &&
		    (unlink_flags & VNODE_REMOVE_DATALESS_DIR) != 0) {
			/*
			 * Only do this if the directory is actually
			 * marked as DATALESS.
			 */
			struct vnode_attr *lvap =
			    kalloc_type(struct vnode_attr, Z_WAITOK);

			VATTR_INIT(lvap);
			VATTR_WANTED(lvap, va_flags);
			if (vnode_getattr(vp, lvap, ctx) == 0 &&
			    VATTR_IS_SUPPORTED(lvap, va_flags) &&
			    (lvap->va_flags & SF_DATALESS) != 0) {
				/*
				 * If this fails, we want to keep the original
				 * error.
				 */
				if (vn_remove(dvp, &vp, ndp,
				    VNODE_REMOVE_DATALESS_DIR, vap, ctx) == 0) {
					error = 0;
				}
			}
			kfree_type(struct vnode_attr, lvap);
		}

#if CONFIG_APPLEDOUBLE
		/*
		 * Special case to remove orphaned AppleDouble
		 * files. I don't like putting this in the kernel,
		 * but carbon does not like putting this in carbon either,
		 * so here we are.
		 */
		if (error == ENOTEMPTY) {
			int ad_error = rmdir_remove_orphaned_appleDouble(vp, ctx, &restart_flag);
			if (ad_error == EBUSY) {
				error = ad_error;
				goto out;
			}


			/*
			 * Assuming everything went well, we will try the RMDIR again
			 */
			if (!ad_error) {
				error = vn_rmdir(dvp, &vp, ndp, vap, ctx);
			}
		}
#endif /* CONFIG_APPLEDOUBLE */
		/*
		 * Call out to allow 3rd party notification of delete.
		 * Ignore result of kauth_authorize_fileop call.
		 */
		if (!error) {
			if (has_listeners) {
				kauth_authorize_fileop(vfs_context_ucred(ctx),
				    KAUTH_FILEOP_DELETE,
				    (uintptr_t)vp,
				    (uintptr_t)path);
			}

			if (vp->v_flag & VISHARDLINK) {
				// see the comment in unlink1() about why we update
				// the parent of a hard link when it is removed
				vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
			}

#if CONFIG_FSE
			if (need_event) {
				if (vap) {
					vnode_get_fse_info_from_vap(vp, &finfo, vap);
				}
				add_fsevent(FSE_DELETE, ctx,
				    FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
				    FSE_ARG_FINFO, &finfo,
				    FSE_ARG_DONE);
			}
#endif

#if CONFIG_MACF
			mac_vnode_notify_unlink(ctx, dvp, vp, &ndp->ni_cnd);
#endif
		}

out:
		if (path != NULL) {
			RELEASE_PATH(path);
			path = NULL;
		}

		if (no_firmlink_path != NULL) {
			RELEASE_PATH(no_firmlink_path);
			no_firmlink_path = NULL;
		}

		/*
		 * nameidone has to happen before we vnode_put(dvp)
		 * since it may need to release the fs_nodelock on the dvp
		 */
		nameidone(ndp);
		vnode_put(dvp);

		if (vp) {
			vnode_put(vp);
		}

		/*
		 * Not restarting: wake any peer sleeping in the tsleep()
		 * below (AppleDouble restart rendezvous) and finish up.
		 */
		if (restart_flag == 0) {
			wakeup_one((caddr_t)vp);
			goto err_out;
		}
		tsleep(vp, PVFS, "rm AD", 1);
	} while (restart_flag != 0);

err_out:
	kfree_type(typeof(*__rmdir_data), __rmdir_data);

	return error;
}
10451
10452 /*
10453 * Remove a directory file.
10454 */
10455 /* ARGSUSED */
10456 int
rmdir(__unused proc_t p,struct rmdir_args * uap,__unused int32_t * retval)10457 rmdir(__unused proc_t p, struct rmdir_args *uap, __unused int32_t *retval)
10458 {
10459 return rmdirat_internal(vfs_context_current(), AT_FDCWD,
10460 CAST_USER_ADDR_T(uap->path), UIO_USERSPACE, 0);
10461 }
10462
/*
 * Record-length helpers: struct direntry/struct dirent embed a maximal
 * name buffer, so the effective on-wire record length is computed from
 * the actual name length and then padded to the required alignment.
 */
/* Get direntry length padded to 8 byte alignment */
#define DIRENT64_LEN(namlen) \
	((sizeof(struct direntry) + (namlen) - (MAXPATHLEN-1) + 7) & ~7)

/* Get dirent length padded to 4 byte alignment */
#define DIRENT_LEN(namelen) \
	((sizeof(struct dirent) + (namelen + 1) - (__DARWIN_MAXNAMLEN + 1) + 3) & ~3)

/* Get the end of this dirent (address of its last byte, per d_reclen) */
#define DIRENT_END(dep) \
	(((char *)(dep)) + (dep)->d_reclen - 1)
10474
10475 errno_t
vnode_readdir64(struct vnode * vp,struct uio * uio,int flags,int * eofflag,int * numdirent,vfs_context_t ctxp)10476 vnode_readdir64(struct vnode *vp, struct uio *uio, int flags, int *eofflag,
10477 int *numdirent, vfs_context_t ctxp)
10478 {
10479 /* Check if fs natively supports VNODE_READDIR_EXTENDED */
10480 if ((vp->v_mount->mnt_vtable->vfc_vfsflags & VFC_VFSREADDIR_EXTENDED) &&
10481 ((vp->v_mount->mnt_kern_flag & MNTK_DENY_READDIREXT) == 0)) {
10482 return VNOP_READDIR(vp, uio, flags, eofflag, numdirent, ctxp);
10483 } else {
10484 size_t bufsize;
10485 void * bufptr;
10486 uio_t auio;
10487 struct direntry *entry64;
10488 struct dirent *dep;
10489 size_t bytesread;
10490 int error;
10491
10492 /*
10493 * We're here because the underlying file system does not
10494 * support direnties or we mounted denying support so we must
10495 * fall back to dirents and convert them to direntries.
10496 *
10497 * Our kernel buffer needs to be smaller since re-packing will
10498 * expand each dirent. The worse case (when the name length
10499 * is 3 or less) corresponds to a struct direntry size of 32
10500 * bytes (8-byte aligned) and a struct dirent size of 12 bytes
10501 * (4-byte aligned). So having a buffer that is 3/8 the size
10502 * will prevent us from reading more than we can pack.
10503 *
10504 * Since this buffer is wired memory, we will limit the
10505 * buffer size to a maximum of 32K. We would really like to
10506 * use 32K in the MIN(), but we use magic number 87371 to
10507 * prevent uio_resid() * 3 / 8 from overflowing.
10508 */
10509 bufsize = 3 * MIN((user_size_t)uio_resid(uio), 87371u) / 8;
10510 bufptr = kalloc_data(bufsize, Z_WAITOK);
10511 if (bufptr == NULL) {
10512 return ENOMEM;
10513 }
10514
10515 auio = uio_create(1, 0, UIO_SYSSPACE, UIO_READ);
10516 uio_addiov(auio, (uintptr_t)bufptr, bufsize);
10517 auio->uio_offset = uio->uio_offset;
10518
10519 error = VNOP_READDIR(vp, auio, 0, eofflag, numdirent, ctxp);
10520
10521 dep = (struct dirent *)bufptr;
10522 bytesread = bufsize - uio_resid(auio);
10523
10524 entry64 = kalloc_type(struct direntry, Z_WAITOK);
10525 /*
10526 * Convert all the entries and copy them out to user's buffer.
10527 */
10528 while (error == 0 && (char *)dep < ((char *)bufptr + bytesread)) {
10529 /* First check that the dirent struct up to d_name is within the buffer */
10530 if ((char*)dep + offsetof(struct dirent, d_name) > ((char *)bufptr + bytesread) ||
10531 /* Check that the length of the entire dirent is within the buffer */
10532 DIRENT_END(dep) > ((char *)bufptr + bytesread) ||
10533 /* Check that the actual length including the name doesn't exceed d_reclen */
10534 DIRENT_LEN(dep->d_namlen) > dep->d_reclen) {
10535 printf("%s: %s: Bad dirent recived from directory %s\n", __func__,
10536 vp->v_mount->mnt_vfsstat.f_mntonname,
10537 vp->v_name ? vp->v_name : "<unknown>");
10538 error = EIO;
10539 break;
10540 }
10541
10542 size_t enbufsize = DIRENT64_LEN(dep->d_namlen);
10543
10544 bzero(entry64, enbufsize);
10545 /* Convert a dirent to a dirent64. */
10546 entry64->d_ino = dep->d_ino;
10547 entry64->d_seekoff = 0;
10548 entry64->d_reclen = (uint16_t)enbufsize;
10549 entry64->d_namlen = dep->d_namlen;
10550 entry64->d_type = dep->d_type;
10551 bcopy(dep->d_name, entry64->d_name, dep->d_namlen + 1);
10552
10553 /* Move to next entry. */
10554 dep = (struct dirent *)((char *)dep + dep->d_reclen);
10555
10556 /* Copy entry64 to user's buffer. */
10557 error = uiomove((caddr_t)entry64, entry64->d_reclen, uio);
10558 }
10559
10560 /* Update the real offset using the offset we got from VNOP_READDIR. */
10561 if (error == 0) {
10562 uio->uio_offset = auio->uio_offset;
10563 }
10564 uio_free(auio);
10565 kfree_data(bufptr, bufsize);
10566 kfree_type(struct direntry, entry64);
10567 return error;
10568 }
10569 }
10570
10571 #define GETDIRENTRIES_MAXBUFSIZE (128 * 1024 * 1024U)
10572
10573 /*
10574 * Read a block of directory entries in a file system independent format.
10575 */
static int
getdirentries_common(int fd, user_addr_t bufp, user_size_t bufsize, ssize_t *bytesread,
    off_t *offset, int *eofflag, int flags)
{
	vnode_t vp;
	struct vfs_context context = *vfs_context_current();    /* local copy */
	struct fileproc *fp;
	uio_t auio;
	int spacetype = proc_is64bit(vfs_context_proc(&context)) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	off_t loff;
	int error, numdirent;
	UIO_STACKBUF(uio_buf, 1);

get_from_fd:
	error = fp_getfvp(vfs_context_proc(&context), fd, &fp, &vp);
	if (error) {
		return error;
	}

	/*
	 * Serialize offset updates on this open file.  If the fd's backing
	 * vnode changed between fp_getfvp() and taking the offset lock (a
	 * union-mount traversal can swap it), drop and re-fetch everything.
	 */
	vn_offset_lock(fp->fp_glob);
	if (((vnode_t)fp_get_data(fp)) != vp) {
		vn_offset_unlock(fp->fp_glob);
		file_drop(fd);
		goto get_from_fd;
	}

	/* The fd must have been opened for reading. */
	if ((fp->fp_glob->fg_flag & FREAD) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EBADF;
		goto out;
	}

	/* Silently clamp oversized requests. */
	if (bufsize > GETDIRENTRIES_MAXBUFSIZE) {
		bufsize = GETDIRENTRIES_MAXBUFSIZE;
	}

#if CONFIG_MACF
	error = mac_file_check_change_offset(vfs_context_ucred(&context), fp->fp_glob);
	if (error) {
		goto out;
	}
#endif

	if ((error = vnode_getwithref(vp))) {
		goto out;
	}
	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

#if CONFIG_UNION_MOUNTS
unionread:
#endif /* CONFIG_UNION_MOUNTS */
	if (vp->v_type != VDIR) {
		(void)vnode_put(vp);
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_readdir(&context, vp);
	if (error != 0) {
		(void)vnode_put(vp);
		goto out;
	}
#endif /* MAC */

	/* Start the read at the file's current offset. */
	loff = fp->fp_glob->fg_offset;
	auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, bufp, bufsize);

	if (flags & VNODE_READDIR_EXTENDED) {
		/* Extended format: may need conversion for non-supporting FSes. */
		error = vnode_readdir64(vp, auio, flags, eofflag, &numdirent, &context);
		fp->fp_glob->fg_offset = uio_offset(auio);
	} else {
		error = VNOP_READDIR(vp, auio, 0, eofflag, &numdirent, &context);
		fp->fp_glob->fg_offset = uio_offset(auio);
	}
	if (error) {
		(void)vnode_put(vp);
		goto out;
	}

#if CONFIG_UNION_MOUNTS
	/*
	 * Nothing was read and this is a union mount: descend to the lower
	 * layer, retarget the fd at it, and read from there instead.
	 */
	if ((user_ssize_t)bufsize == uio_resid(auio) &&
	    (vp->v_mount->mnt_flag & MNT_UNION)) {
		vnode_t uvp;

		if (lookup_traverse_union(vp, &uvp, &context) == 0) {
			if (vnode_ref(uvp) == 0) {
				fp_set_data(fp, uvp);
				fp->fp_glob->fg_offset = 0;
				vnode_rele(vp);
				vnode_put(vp);
				vp = uvp;
				goto unionread;
			} else {
				/* could not get a ref, can't replace in fd */
				vnode_put(uvp);
			}
		}
	}
#endif /* CONFIG_UNION_MOUNTS */

	vnode_put(vp);
	if (offset) {
		*offset = loff;
	}

	*bytesread = bufsize - uio_resid(auio);
out:
	vn_offset_unlock(fp->fp_glob);
	file_drop(fd);
	return error;
}
10689
10690
10691 int
getdirentries(__unused struct proc * p,struct getdirentries_args * uap,int32_t * retval)10692 getdirentries(__unused struct proc *p, struct getdirentries_args *uap, int32_t *retval)
10693 {
10694 off_t offset;
10695 ssize_t bytesread;
10696 int error, eofflag;
10697
10698 AUDIT_ARG(fd, uap->fd);
10699 error = getdirentries_common(uap->fd, uap->buf, uap->count,
10700 &bytesread, &offset, &eofflag, 0);
10701
10702 if (error == 0) {
10703 if (proc_is64bit(p)) {
10704 user64_long_t base = (user64_long_t)offset;
10705 error = copyout((caddr_t)&base, uap->basep, sizeof(user64_long_t));
10706 } else {
10707 user32_long_t base = (user32_long_t)offset;
10708 error = copyout((caddr_t)&base, uap->basep, sizeof(user32_long_t));
10709 }
10710 *retval = (int)bytesread;
10711 }
10712 return error;
10713 }
10714
10715 int
getdirentries64(__unused struct proc * p,struct getdirentries64_args * uap,user_ssize_t * retval)10716 getdirentries64(__unused struct proc *p, struct getdirentries64_args *uap, user_ssize_t *retval)
10717 {
10718 off_t offset;
10719 ssize_t bytesread;
10720 int error, eofflag;
10721 user_size_t bufsize;
10722
10723 AUDIT_ARG(fd, uap->fd);
10724
10725 /*
10726 * If the buffer is at least GETDIRENTRIES64_EXTENDED_BUFSIZE large,
10727 * then the kernel carves out the last 4 bytes to return extended
10728 * information to userspace (namely whether we reached EOF with this call).
10729 */
10730 if (uap->bufsize >= GETDIRENTRIES64_EXTENDED_BUFSIZE) {
10731 bufsize = uap->bufsize - sizeof(getdirentries64_flags_t);
10732 } else {
10733 bufsize = uap->bufsize;
10734 }
10735
10736 error = getdirentries_common(uap->fd, uap->buf, bufsize,
10737 &bytesread, &offset, &eofflag, VNODE_READDIR_EXTENDED);
10738
10739 if (error == 0) {
10740 *retval = bytesread;
10741 error = copyout((caddr_t)&offset, uap->position, sizeof(off_t));
10742
10743 if (error == 0 && uap->bufsize >= GETDIRENTRIES64_EXTENDED_BUFSIZE) {
10744 getdirentries64_flags_t flags = 0;
10745 if (eofflag) {
10746 flags |= GETDIRENTRIES64_EOF;
10747 }
10748 error = copyout(&flags, (user_addr_t)uap->buf + bufsize,
10749 sizeof(flags));
10750 }
10751 }
10752 return error;
10753 }
10754
10755
10756 /*
10757 * Set the mode mask for creation of filesystem nodes.
10758 * XXX implement xsecurity
10759 */
10760 #define UMASK_NOXSECURITY (void *)1 /* leave existing xsecurity alone */
10761 static int
umask1(proc_t p,int newmask,__unused kauth_filesec_t fsec,int32_t * retval)10762 umask1(proc_t p, int newmask, __unused kauth_filesec_t fsec, int32_t *retval)
10763 {
10764 AUDIT_ARG(mask, newmask);
10765 proc_fdlock(p);
10766 *retval = p->p_fd.fd_cmask;
10767 p->p_fd.fd_cmask = newmask & ALLPERMS;
10768 proc_fdunlock(p);
10769 return 0;
10770 }
10771
10772 /*
10773 * umask_extended: Set the mode mask for creation of filesystem nodes; with extended security (ACL).
10774 *
10775 * Parameters: p Process requesting to set the umask
10776 * uap User argument descriptor (see below)
10777 * retval umask of the process (parameter p)
10778 *
10779 * Indirect: uap->newmask umask to set
10780 * uap->xsecurity ACL to set
10781 *
10782 * Returns: 0 Success
10783 * !0 Not success
10784 *
10785 */
10786 int
umask_extended(proc_t p,struct umask_extended_args * uap,int32_t * retval)10787 umask_extended(proc_t p, struct umask_extended_args *uap, int32_t *retval)
10788 {
10789 return umask1(p, uap->newmask, KAUTH_FILESEC_NONE, retval);
10790 }
10791
10792 int
umask(proc_t p,struct umask_args * uap,int32_t * retval)10793 umask(proc_t p, struct umask_args *uap, int32_t *retval)
10794 {
10795 return umask1(p, uap->newmask, UMASK_NOXSECURITY, retval);
10796 }
10797
10798 #define REVOKE_MOUNTED_DEVICE_ENTITLEMENT \
10799 "com.apple.private.vfs.revoke-mounted-device"
10800
10801 /*
10802 * Void all references to file by ripping underlying filesystem
10803 * away from vnode.
10804 */
10805 /* ARGSUSED */
int
revoke(proc_t p, struct revoke_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	struct vnode_attr va;
	vfs_context_t ctx = vfs_context_current();
	int error;
	struct nameidata nd;

	NDINIT(&nd, LOOKUP, OP_REVOKE, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
	    uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;

	nameidone(&nd);

	/* revoke(2) is only supported on character and block special files. */
	if (!(vnode_ischr(vp) || vnode_isblk(vp))) {
		error = ENOTSUP;
		goto out;
	}

	/* Refuse to revoke a block device that is currently mounted on. */
	if (vnode_isblk(vp) && vnode_ismountedon(vp)) {
		error = EBUSY;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_revoke(ctx, vp);
	if (error) {
		goto out;
	}
#endif

	/* Caller must own the node, or else be superuser. */
	VATTR_INIT(&va);
	VATTR_WANTED(&va, va_uid);
	if ((error = vnode_getattr(vp, &va, ctx))) {
		goto out;
	}
	if (kauth_cred_getuid(vfs_context_ucred(ctx)) != va.va_uid &&
	    (error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
		goto out;
	}
	/* Only issue the revoke if somebody actually holds references. */
	if (vp->v_usecount > 0 || (vnode_isaliased(vp))) {
		VNOP_REVOKE(vp, REVOKEALL, ctx);
	}
out:
	vnode_put(vp);
	return error;
}
10858
10859
10860 /*
 * HFS/HFS PLUS SPECIFIC SYSTEM CALLS
10862 * The following system calls are designed to support features
10863 * which are specific to the HFS & HFS Plus volume formats
10864 */
10865
10866
10867 /*
10868 * Obtain attribute information on objects in a directory while enumerating
10869 * the directory.
10870 */
10871 /* ARGSUSED */
int
getdirentriesattr(proc_t p, struct getdirentriesattr_args *uap, int32_t *retval)
{
	vnode_t vp;
	struct fileproc *fp;
	uio_t auio = NULL;
	int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	uint32_t count = 0, savecount = 0;
	uint32_t newstate = 0;
	int error, eofflag = 0;
	off_t loff = 0;
	struct attrlist attributelist;
	vfs_context_t ctx = vfs_context_current();
	int fd = uap->fd;
	UIO_STACKBUF(uio_buf, 1);
	kauth_action_t action;

	AUDIT_ARG(fd, fd);

	/* Get the attributes into kernel space */
	if ((error = copyin(uap->alist, (caddr_t)&attributelist, sizeof(attributelist)))) {
		return error;
	}
	if ((error = copyin(uap->count, (caddr_t)&count, sizeof(count)))) {
		return error;
	}
	/* Remember the requested count: restored if we restart on a lower layer. */
	savecount = count;

get_from_fd:
	if ((error = fp_getfvp(p, fd, &fp, &vp))) {
		return error;
	}

	/*
	 * Serialize offset updates on this open file.  If the fd's backing
	 * vnode changed before we got the offset lock (a union-mount
	 * traversal can swap it), drop everything and re-fetch.
	 */
	vn_offset_lock(fp->fp_glob);
	if (((vnode_t)fp_get_data(fp)) != vp) {
		vn_offset_unlock(fp->fp_glob);
		file_drop(fd);
		goto get_from_fd;
	}

	/* The fd must have been opened for reading. */
	if ((fp->fp_glob->fg_flag & FREAD) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EBADF;
		goto out;
	}


#if CONFIG_MACF
	error = mac_file_check_change_offset(vfs_context_ucred(ctx),
	    fp->fp_glob);
	if (error) {
		goto out;
	}
#endif


	if ((error = vnode_getwithref(vp))) {
		goto out;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

#if CONFIG_UNION_MOUNTS
unionread:
#endif /* CONFIG_UNION_MOUNTS */
	if (vp->v_type != VDIR) {
		(void)vnode_put(vp);
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_readdir(ctx, vp);
	if (error != 0) {
		(void)vnode_put(vp);
		goto out;
	}
#endif /* MAC */

	/* set up the uio structure which will contain the users return buffer */
	loff = fp->fp_glob->fg_offset;
	auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, uap->buffer, uap->buffersize);

	/*
	 * If the only item requested is file names, we can let that past with
	 * just LIST_DIRECTORY.  If they want any other attributes, that means
	 * they need SEARCH as well.
	 */
	action = KAUTH_VNODE_LIST_DIRECTORY;
	if ((attributelist.commonattr & ~ATTR_CMN_NAME) ||
	    attributelist.fileattr || attributelist.dirattr) {
		action |= KAUTH_VNODE_SEARCH;
	}

	if ((error = vnode_authorize(vp, NULL, action, ctx)) == 0) {
		/* Believe it or not, uap->options only has 32-bits of valid
		 * info, so truncate before extending again */

		error = VNOP_READDIRATTR(vp, &attributelist, auio, count,
		    (uint32_t)uap->options, &newstate, &eofflag, &count, ctx);
	}

	if (error) {
		(void) vnode_put(vp);
		goto out;
	}

#if CONFIG_UNION_MOUNTS
	/*
	 * If we've got the last entry of a directory in a union mount
	 * then reset the eofflag and pretend there's still more to come.
	 * The next call will again set eofflag and the buffer will be empty,
	 * so traverse to the underlying directory and do the directory
	 * read there.
	 */
	if (eofflag && vp->v_mount->mnt_flag & MNT_UNION) {
		if (uio_resid(auio) < (user_ssize_t) uap->buffersize) { // Got some entries
			eofflag = 0;
		} else { // Empty buffer
			vnode_t uvp;
			if (lookup_traverse_union(vp, &uvp, ctx) == 0) {
				if (vnode_ref_ext(uvp, fp->fp_glob->fg_flag & O_EVTONLY, 0) == 0) {
					fp_set_data(fp, uvp);
					fp->fp_glob->fg_offset = 0; // reset index for new dir
					count = savecount;
					vnode_rele_internal(vp, fp->fp_glob->fg_flag & O_EVTONLY, 0, 0);
					vnode_put(vp);
					vp = uvp;
					goto unionread;
				} else {
					/* could not get a ref, can't replace in fd */
					vnode_put(uvp);
				}
			}
		}
	}
#endif /* CONFIG_UNION_MOUNTS */

	(void)vnode_put(vp);

	if (error) {
		goto out;
	}
	fp->fp_glob->fg_offset = uio_offset(auio); /* should be multiple of dirent, not variable */

	/* Copy the output values (count read, dir state, start offset) to userspace. */
	if ((error = copyout((caddr_t) &count, uap->count, sizeof(count)))) {
		goto out;
	}
	if ((error = copyout((caddr_t) &newstate, uap->newstate, sizeof(newstate)))) {
		goto out;
	}
	if ((error = copyout((caddr_t) &loff, uap->basep, sizeof(loff)))) {
		goto out;
	}

	*retval = eofflag;  /* similar to getdirentries */
	error = 0;
out:
	vn_offset_unlock(fp->fp_glob);
	file_drop(fd);
	return error; /* return error earlier, a retval of 0 or 1 now */
} /* end of getdirentriesattr system call */
11035
/*
 * Exchange data between two files
 *
 * Implements exchangedata(2): swaps the contents of the two regular
 * files named by uap->path1 and uap->path2 via VNOP_EXCHANGE().  Both
 * must be regular files on the same volume.  On success, the in-kernel
 * identity (v_name / v_parent) of the two vnodes is swapped to keep the
 * name cache consistent with the new on-disk state, registered kauth
 * fileop listeners are notified, and an FSE_EXCHANGE fsevent is posted.
 *
 * Returns: 0 on success; EINVAL if the paths name the same object or a
 * non-regular file; EXDEV for a cross-volume pair; otherwise an errno
 * from namei()/authorization/VNOP_EXCHANGE().
 */

/* ARGSUSED */
int
exchangedata(__unused proc_t p, struct exchangedata_args *uap, __unused int32_t *retval)
{
	struct nameidata fnd, snd;
	vfs_context_t ctx = vfs_context_current();
	vnode_t fvp;
	vnode_t svp;
	int error;
	u_int32_t nameiflags;
	char *fpath = NULL;
	char *spath = NULL;
	int flen = 0, slen = 0;
	int from_truncated = 0, to_truncated = 0;
#if CONFIG_FSE
	fse_info f_finfo, s_finfo;
#endif

	nameiflags = 0;
	/* FSOPT_NOFOLLOW: operate on a symlink itself rather than its target */
	if ((uap->options & FSOPT_NOFOLLOW) == 0) {
		nameiflags |= FOLLOW;
	}

	/* look up the first path */
	NDINIT(&fnd, LOOKUP, OP_EXCHANGEDATA, nameiflags | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path1, ctx);

	error = namei(&fnd);
	if (error) {
		goto out2;
	}

	nameidone(&fnd);
	fvp = fnd.ni_vp;

	/* look up the second path */
	NDINIT(&snd, LOOKUP, OP_EXCHANGEDATA, CN_NBMOUNTLOOK | nameiflags | AUDITVNPATH2,
	    UIO_USERSPACE, uap->path2, ctx);

	error = namei(&snd);
	if (error) {
		/* drop the iocount namei() gave us on the first vnode */
		vnode_put(fvp);
		goto out2;
	}
	nameidone(&snd);
	svp = snd.ni_vp;

	/*
	 * if the files are the same, return an inval error
	 */
	if (svp == fvp) {
		error = EINVAL;
		goto out;
	}

	/*
	 * if the files are on different volumes, return an error
	 */
	if (svp->v_mount != fvp->v_mount) {
		error = EXDEV;
		goto out;
	}

	/* If they're not files, return an error */
	if ((vnode_isreg(fvp) == 0) || (vnode_isreg(svp) == 0)) {
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_exchangedata(ctx,
	    fvp, svp);
	if (error) {
		goto out;
	}
#endif
	/* the caller needs both read and write access to both files */
	if (((error = vnode_authorize(fvp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0) ||
	    ((error = vnode_authorize(svp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0)) {
		goto out;
	}

	/*
	 * Capture both paths up front, but only if someone (fsevents or a
	 * kauth fileop listener) will actually consume them afterwards.
	 */
	if (
#if CONFIG_FSE
		need_fsevent(FSE_EXCHANGE, fvp) ||
#endif
		kauth_authorize_fileop_has_listeners()) {
		GET_PATH(fpath);
		GET_PATH(spath);

		flen = safe_getpath(fvp, NULL, fpath, MAXPATHLEN, &from_truncated);
		slen = safe_getpath(svp, NULL, spath, MAXPATHLEN, &to_truncated);

#if CONFIG_FSE
		get_fse_info(fvp, &f_finfo, ctx);
		get_fse_info(svp, &s_finfo, ctx);
		if (from_truncated || to_truncated) {
			// set it here since only the f_finfo gets reported up to user space
			f_finfo.mode |= FSE_TRUNCATED_PATH;
		}
#endif
	}
	/* Ok, make the call */
	error = VNOP_EXCHANGE(fvp, svp, 0, ctx);

	if (error == 0) {
		const char *tmpname;

		if (fpath != NULL && spath != NULL) {
			/* call out to allow 3rd party notification of exchangedata.
			 * Ignore result of kauth_authorize_fileop call.
			 */
			kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_EXCHANGE,
			    (uintptr_t)fpath, (uintptr_t)spath);
		}

		/*
		 * The on-disk data was swapped, so swap the cached names and
		 * parents too, keeping the name cache in sync with reality.
		 */
		name_cache_lock();

		tmpname = fvp->v_name;
		fvp->v_name = svp->v_name;
		svp->v_name = tmpname;

		if (fvp->v_parent != svp->v_parent) {
			vnode_t tmp;

			tmp = fvp->v_parent;
			fvp->v_parent = svp->v_parent;
			svp->v_parent = tmp;
		}
		name_cache_unlock();

#if CONFIG_FSE
		if (fpath != NULL && spath != NULL) {
			add_fsevent(FSE_EXCHANGE, ctx,
			    FSE_ARG_STRING, flen, fpath,
			    FSE_ARG_FINFO, &f_finfo,
			    FSE_ARG_STRING, slen, spath,
			    FSE_ARG_FINFO, &s_finfo,
			    FSE_ARG_DONE);
		}
#endif
	}

out:
	if (fpath != NULL) {
		RELEASE_PATH(fpath);
	}
	if (spath != NULL) {
		RELEASE_PATH(spath);
	}
	vnode_put(svp);
	vnode_put(fvp);
out2:
	return error;
}
11191
11192 /*
11193 * Return (in MB) the amount of freespace on the given vnode's volume.
11194 */
11195 uint32_t freespace_mb(vnode_t vp);
11196
11197 uint32_t
freespace_mb(vnode_t vp)11198 freespace_mb(vnode_t vp)
11199 {
11200 vfs_update_vfsstat(vp->v_mount, vfs_context_current(), VFS_USER_EVENT);
11201 return (uint32_t)(((uint64_t)vp->v_mount->mnt_vfsstat.f_bavail *
11202 vp->v_mount->mnt_vfsstat.f_bsize) >> 20);
11203 }
11204
11205 #if CONFIG_SEARCHFS
11206
11207 /* ARGSUSED */
11208
11209 int
searchfs(proc_t p,struct searchfs_args * uap,__unused int32_t * retval)11210 searchfs(proc_t p, struct searchfs_args *uap, __unused int32_t *retval)
11211 {
11212 vnode_t vp, tvp;
11213 int i, error = 0;
11214 int fserror = 0;
11215 struct nameidata nd;
11216 struct user64_fssearchblock searchblock;
11217 struct searchstate *state;
11218 struct attrlist *returnattrs;
11219 struct timeval timelimit;
11220 void *searchparams1, *searchparams2;
11221 uio_t auio = NULL;
11222 int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
11223 uint32_t nummatches;
11224 size_t mallocsize;
11225 uint32_t nameiflags;
11226 vfs_context_t ctx = vfs_context_current();
11227 UIO_STACKBUF(uio_buf, 1);
11228
11229 /* Start by copying in fsearchblock parameter list */
11230 if (IS_64BIT_PROCESS(p)) {
11231 error = copyin(uap->searchblock, (caddr_t) &searchblock, sizeof(searchblock));
11232 timelimit.tv_sec = searchblock.timelimit.tv_sec;
11233 timelimit.tv_usec = searchblock.timelimit.tv_usec;
11234 } else {
11235 struct user32_fssearchblock tmp_searchblock;
11236
11237 error = copyin(uap->searchblock, (caddr_t) &tmp_searchblock, sizeof(tmp_searchblock));
11238 // munge into 64-bit version
11239 searchblock.returnattrs = CAST_USER_ADDR_T(tmp_searchblock.returnattrs);
11240 searchblock.returnbuffer = CAST_USER_ADDR_T(tmp_searchblock.returnbuffer);
11241 searchblock.returnbuffersize = tmp_searchblock.returnbuffersize;
11242 searchblock.maxmatches = tmp_searchblock.maxmatches;
11243 /*
11244 * These casts are safe. We will promote the tv_sec into a 64 bit long if necessary
11245 * from a 32 bit long, and tv_usec is already a signed 32 bit int.
11246 */
11247 timelimit.tv_sec = (__darwin_time_t) tmp_searchblock.timelimit.tv_sec;
11248 timelimit.tv_usec = (__darwin_useconds_t) tmp_searchblock.timelimit.tv_usec;
11249 searchblock.searchparams1 = CAST_USER_ADDR_T(tmp_searchblock.searchparams1);
11250 searchblock.sizeofsearchparams1 = tmp_searchblock.sizeofsearchparams1;
11251 searchblock.searchparams2 = CAST_USER_ADDR_T(tmp_searchblock.searchparams2);
11252 searchblock.sizeofsearchparams2 = tmp_searchblock.sizeofsearchparams2;
11253 searchblock.searchattrs = tmp_searchblock.searchattrs;
11254 }
11255 if (error) {
11256 return error;
11257 }
11258
11259 /* Do a sanity check on sizeofsearchparams1 and sizeofsearchparams2.
11260 */
11261 if (searchblock.sizeofsearchparams1 > SEARCHFS_MAX_SEARCHPARMS ||
11262 searchblock.sizeofsearchparams2 > SEARCHFS_MAX_SEARCHPARMS) {
11263 return EINVAL;
11264 }
11265
11266 /* Now malloc a big bunch of space to hold the search parameters, the attrlists and the search state. */
11267 /* It all has to do into local memory and it's not that big so we might as well put it all together. */
11268 /* Searchparams1 shall be first so we might as well use that to hold the base address of the allocated*/
11269 /* block. */
11270 /* */
11271 /* NOTE: we allocate an extra 8 bytes to account for the difference in size of the searchstate */
11272 /* due to the changes in rdar://problem/12438273. That way if a 3rd party file system */
11273 /* assumes the size is still 556 bytes it will continue to work */
11274
11275 mallocsize = searchblock.sizeofsearchparams1 + searchblock.sizeofsearchparams2 +
11276 sizeof(struct attrlist) + sizeof(struct searchstate) + (2 * sizeof(uint32_t));
11277
11278 searchparams1 = kalloc_data(mallocsize, Z_WAITOK);
11279
11280 /* Now set up the various pointers to the correct place in our newly allocated memory */
11281
11282 searchparams2 = (void *) (((caddr_t) searchparams1) + searchblock.sizeofsearchparams1);
11283 returnattrs = (struct attrlist *) (((caddr_t) searchparams2) + searchblock.sizeofsearchparams2);
11284 state = (struct searchstate *) (((caddr_t) returnattrs) + sizeof(struct attrlist));
11285
11286 /* Now copy in the stuff given our local variables. */
11287
11288 if ((error = copyin(searchblock.searchparams1, searchparams1, searchblock.sizeofsearchparams1))) {
11289 goto freeandexit;
11290 }
11291
11292 if ((error = copyin(searchblock.searchparams2, searchparams2, searchblock.sizeofsearchparams2))) {
11293 goto freeandexit;
11294 }
11295
11296 if ((error = copyin(searchblock.returnattrs, (caddr_t) returnattrs, sizeof(struct attrlist)))) {
11297 goto freeandexit;
11298 }
11299
11300 if ((error = copyin(uap->state, (caddr_t) state, sizeof(struct searchstate)))) {
11301 goto freeandexit;
11302 }
11303
11304 /*
11305 * When searching a union mount, need to set the
11306 * start flag at the first call on each layer to
11307 * reset state for the new volume.
11308 */
11309 if (uap->options & SRCHFS_START) {
11310 state->ss_union_layer = 0;
11311 } else {
11312 uap->options |= state->ss_union_flags;
11313 }
11314 state->ss_union_flags = 0;
11315
11316 /*
11317 * Because searchparams1 and searchparams2 may contain an ATTR_CMN_NAME search parameter,
11318 * which is passed in with an attrreference_t, we need to inspect the buffer manually here.
11319 * The KPI does not provide us the ability to pass in the length of the buffers searchparams1
11320 * and searchparams2. To obviate the need for all searchfs-supporting filesystems to
11321 * validate the user-supplied data offset of the attrreference_t, we'll do it here.
11322 */
11323
11324 if (searchblock.searchattrs.commonattr & ATTR_CMN_NAME) {
11325 attrreference_t* string_ref;
11326 u_int32_t* start_length;
11327 user64_size_t param_length;
11328
11329 /* validate searchparams1 */
11330 param_length = searchblock.sizeofsearchparams1;
11331 /* skip the word that specifies length of the buffer */
11332 start_length = (u_int32_t*) searchparams1;
11333 start_length = start_length + 1;
11334 string_ref = (attrreference_t*) start_length;
11335
11336 /* ensure no negative offsets or too big offsets */
11337 if (string_ref->attr_dataoffset < 0) {
11338 error = EINVAL;
11339 goto freeandexit;
11340 }
11341 if (string_ref->attr_length > MAXPATHLEN) {
11342 error = EINVAL;
11343 goto freeandexit;
11344 }
11345
11346 /* Check for pointer overflow in the string ref */
11347 if (((char*) string_ref + string_ref->attr_dataoffset) < (char*) string_ref) {
11348 error = EINVAL;
11349 goto freeandexit;
11350 }
11351
11352 if (((char*) string_ref + string_ref->attr_dataoffset) > ((char*)searchparams1 + param_length)) {
11353 error = EINVAL;
11354 goto freeandexit;
11355 }
11356 if (((char*)string_ref + string_ref->attr_dataoffset + string_ref->attr_length) > ((char*)searchparams1 + param_length)) {
11357 error = EINVAL;
11358 goto freeandexit;
11359 }
11360 }
11361
11362 /* set up the uio structure which will contain the users return buffer */
11363 auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
11364 uio_addiov(auio, searchblock.returnbuffer, searchblock.returnbuffersize);
11365
11366 nameiflags = 0;
11367 if ((uap->options & FSOPT_NOFOLLOW) == 0) {
11368 nameiflags |= FOLLOW;
11369 }
11370 NDINIT(&nd, LOOKUP, OP_SEARCHFS, nameiflags | AUDITVNPATH1,
11371 UIO_USERSPACE, uap->path, ctx);
11372
11373 error = namei(&nd);
11374 if (error) {
11375 goto freeandexit;
11376 }
11377 vp = nd.ni_vp;
11378 nameidone(&nd);
11379
11380 /*
11381 * Switch to the root vnode for the volume
11382 */
11383 error = VFS_ROOT(vnode_mount(vp), &tvp, ctx);
11384 vnode_put(vp);
11385 if (error) {
11386 goto freeandexit;
11387 }
11388 vp = tvp;
11389
11390 #if CONFIG_UNION_MOUNTS
11391 /*
11392 * If it's a union mount, the path lookup takes
11393 * us to the top layer. But we may need to descend
11394 * to a lower layer. For non-union mounts the layer
11395 * is always zero.
11396 */
11397 for (i = 0; i < (int) state->ss_union_layer; i++) {
11398 if ((vp->v_mount->mnt_flag & MNT_UNION) == 0) {
11399 break;
11400 }
11401 tvp = vp;
11402 vp = vp->v_mount->mnt_vnodecovered;
11403 if (vp == NULL) {
11404 vnode_put(tvp);
11405 error = ENOENT;
11406 goto freeandexit;
11407 }
11408 error = vnode_getwithref(vp);
11409 vnode_put(tvp);
11410 if (error) {
11411 goto freeandexit;
11412 }
11413 }
11414 #endif /* CONFIG_UNION_MOUNTS */
11415
11416 #if CONFIG_MACF
11417 error = mac_vnode_check_searchfs(ctx, vp, returnattrs, &searchblock.searchattrs);
11418 if (error) {
11419 vnode_put(vp);
11420 goto freeandexit;
11421 }
11422 #endif
11423
11424
11425 /*
11426 * If searchblock.maxmatches == 0, then skip the search. This has happened
11427 * before and sometimes the underlying code doesnt deal with it well.
11428 */
11429 if (searchblock.maxmatches == 0) {
11430 nummatches = 0;
11431 goto saveandexit;
11432 }
11433
11434 /*
11435 * Allright, we have everything we need, so lets make that call.
11436 *
11437 * We keep special track of the return value from the file system:
11438 * EAGAIN is an acceptable error condition that shouldn't keep us
11439 * from copying out any results...
11440 */
11441
11442 fserror = VNOP_SEARCHFS(vp,
11443 searchparams1,
11444 searchparams2,
11445 &searchblock.searchattrs,
11446 (uint32_t)searchblock.maxmatches,
11447 &timelimit,
11448 returnattrs,
11449 &nummatches,
11450 (uint32_t)uap->scriptcode,
11451 (uint32_t)uap->options,
11452 auio,
11453 (struct searchstate *) &state->ss_fsstate,
11454 ctx);
11455
11456 #if CONFIG_UNION_MOUNTS
11457 /*
11458 * If it's a union mount we need to be called again
11459 * to search the mounted-on filesystem.
11460 */
11461 if ((vp->v_mount->mnt_flag & MNT_UNION) && fserror == 0) {
11462 state->ss_union_flags = SRCHFS_START;
11463 state->ss_union_layer++; // search next layer down
11464 fserror = EAGAIN;
11465 }
11466 #endif /* CONFIG_UNION_MOUNTS */
11467
11468 saveandexit:
11469
11470 vnode_put(vp);
11471
11472 /* Now copy out the stuff that needs copying out. That means the number of matches, the
11473 * search state. Everything was already put into he return buffer by the vop call. */
11474
11475 if ((error = copyout((caddr_t) state, uap->state, sizeof(struct searchstate))) != 0) {
11476 goto freeandexit;
11477 }
11478
11479 if ((error = suulong(uap->nummatches, (uint64_t)nummatches)) != 0) {
11480 goto freeandexit;
11481 }
11482
11483 error = fserror;
11484
11485 freeandexit:
11486
11487 kfree_data(searchparams1, mallocsize);
11488
11489 return error;
11490 } /* end of searchfs system call */
11491
11492 #else /* CONFIG_SEARCHFS */
11493
/* searchfs(2) is not supported in this kernel configuration. */
int
searchfs(__unused proc_t p, __unused struct searchfs_args *uap, __unused int32_t *retval)
{
	return ENOTSUP;
}
11499
11500 #endif /* CONFIG_SEARCHFS */
11501
11502
11503 #if CONFIG_DATALESS_FILES
11504
11505 /*
11506 * === Namespace Resolver Up-call Mechanism ===
11507 *
11508 * When I/O is performed to a dataless file or directory (read, write,
11509 * lookup-in, etc.), the file system performs an upcall to the namespace
11510 * resolver (filecoordinationd) to materialize the object.
11511 *
11512 * We need multiple up-calls to be in flight at once, and we need these
11513 * up-calls to be interruptible, thus the following implementation:
11514 *
11515 * => The nspace_resolver_request represents the in-kernel request state.
11516 * It contains a request ID, storage space for the errno code returned
11517 * by filecoordinationd, and flags.
11518 *
11519 * => The request ID is simply a global monotonically incrementing 32-bit
11520 * number. Outstanding requests are stored in a hash table, and the
11521 * hash function is extremely simple.
11522 *
11523 * => When an upcall is to be made to filecoordinationd, a request structure
11524 * is allocated on the stack (it is small, and needs to live only during
11525 * the duration of the call to resolve_nspace_item_ext()). It is
11526 * initialized and inserted into the table. Some backpressure from
 * filecoordinationd is applied by limiting the number of entries that
11528 * can be inserted into the table (and thus limiting the number of
11529 * outstanding requests issued to filecoordinationd); waiting for an
11530 * available slot is interruptible.
11531 *
11532 * => Once the request has been inserted into the table, the up-call is made
11533 * to filecoordinationd via a MiG-generated stub. The up-call returns
11534 * immediately and filecoordinationd processes the request asynchronously.
11535 *
 * => The caller now waits for the request to complete. This is achieved by
11537 * sleeping on the address of the request structure and waiting for
11538 * filecoordinationd to mark the request structure as complete. This
11539 * is an interruptible sleep call; if interrupted, the request structure
11540 * is removed from the table and EINTR is returned to the caller. If
11541 * this occurs, an advisory up-call is made to filecoordinationd with
11542 * the request ID to indicate that the request can be aborted or
11543 * de-prioritized at the discretion of filecoordinationd.
11544 *
11545 * => When filecoordinationd has completed the request, it signals completion
11546 * by writing to the vfs.nspace.complete sysctl node. Only a process
11547 * decorated as a namespace resolver can write to this sysctl node. The
11548 * value is a request ID / errno tuple passed as an array of 2 uint32_t's.
11549 * The request ID is looked up in the table, and if the request is found,
11550 * the error code is stored in the request structure and a wakeup()
11551 * issued on the address of the request structure. If the request is not
11552 * found, we simply drop the completion notification, assuming that the
11553 * caller was interrupted.
11554 *
11555 * => When the waiting thread wakes up, it extracts the error code from the
11556 * request structure, removes the request from the table, and returns the
11557 * error code to the calling function. Fini!
11558 */
11559
/*
 * In-kernel state for one materialization up-call to the namespace
 * resolver.  Instances are allocated on the requesting thread's stack
 * (see the overview above) and linked into the request hash table for
 * the duration of the call.
 */
struct nspace_resolver_request {
	LIST_ENTRY(nspace_resolver_request) r_hashlink; /* request-table linkage */
	vnode_t r_vp;           /* object being materialized */
	vnode_t r_tdvp;         /* destination directory, if any (may be NULL) */
	uint32_t r_req_id;      /* ID correlating request and resolver reply */
	int r_resolver_error;   /* errno reported back by filecoordinationd */
	int r_flags;            /* RRF_* flags below */
};

/* r_flags values */
#define RRF_COMPLETE 0x0001     /* resolver has completed this request */
#define RRF_COMPLETING 0x0002   /* completion handler still using the request */

/*
 * Completion tuple delivered by the resolver via the
 * vfs.nspace.complete sysctl.
 */
struct nspace_resolver_completion_data {
	uint32_t req_id;        /* which request is being completed */
	int32_t resolver_error; /* errno result (0 == success) */
	uint64_t orig_gencount; /* expected recursive gencount, or 0 to skip the check */
	uint64_t orig_syncroot; /* expected sync-root ID, or 0 to skip the check */
};
11578
/*
 * Hand out the next resolver request ID.
 *
 * OSAddAtomic() returns the pre-increment value, so the first ID issued
 * is 0 and the 32-bit counter simply wraps; uniqueness among the small
 * number of concurrently outstanding requests is all that matters.
 */
static uint32_t
next_nspace_req_id(void)
{
	static uint32_t next_req_id;

	return OSAddAtomic(1, &next_req_id);
}
11586
#define NSPACE_RESOLVER_REQ_HASHSIZE 32 /* XXX tune */
#define NSPACE_RESOLVER_MAX_OUTSTANDING 256 /* XXX tune */

/* Hash table of outstanding resolver requests, keyed by request ID. */
static LIST_HEAD(nspace_resolver_requesthead,
    nspace_resolver_request) * nspace_resolver_request_hashtbl;
static u_long nspace_resolver_request_hashmask;
/* Number of requests currently in the table (<= MAX_OUTSTANDING). */
static u_int nspace_resolver_request_count;
/* True when some thread is sleeping for a free request slot. */
static bool nspace_resolver_request_wait_slot;
static LCK_GRP_DECLARE(nspace_resolver_request_lck_grp, "file namespace resolver");
/* Protects the table, the count, and the wait-slot flag above. */
static LCK_MTX_DECLARE(nspace_resolver_request_hash_mutex,
    &nspace_resolver_request_lck_grp);

#define NSPACE_REQ_LOCK() \
	lck_mtx_lock(&nspace_resolver_request_hash_mutex)
#define NSPACE_REQ_UNLOCK() \
	lck_mtx_unlock(&nspace_resolver_request_hash_mutex)

/* Map a request ID to its bucket by masking with the table mask. */
#define NSPACE_RESOLVER_HASH(req_id) \
	(&nspace_resolver_request_hashtbl[(req_id) & \
	nspace_resolver_request_hashmask])
11607
11608 static struct nspace_resolver_request *
nspace_resolver_req_lookup(uint32_t req_id,bool skip_completing)11609 nspace_resolver_req_lookup(uint32_t req_id, bool skip_completing)
11610 {
11611 struct nspace_resolver_requesthead *bucket;
11612 struct nspace_resolver_request *req;
11613
11614 bucket = NSPACE_RESOLVER_HASH(req_id);
11615 LIST_FOREACH(req, bucket, r_hashlink) {
11616 if (req->r_req_id == req_id) {
11617 /*
11618 * If this request already has a completion
11619 * pending, don't return it again.
11620 */
11621 if ((req->r_flags & RRF_COMPLETING) != 0 &&
11622 skip_completing) {
11623 req = NULL;
11624 }
11625 return req;
11626 }
11627 }
11628
11629 return NULL;
11630 }
11631
/*
 * Insert 'req' into the request hash table.
 *
 * Applies backpressure on filecoordinationd: if the table already holds
 * NSPACE_RESOLVER_MAX_OUTSTANDING requests, sleep interruptibly until a
 * slot is released by nspace_resolver_req_remove_and_unlock().
 *
 * Returns 0 on success, or the errno from an interrupted msleep().
 */
static int
nspace_resolver_req_add(struct nspace_resolver_request *req)
{
	struct nspace_resolver_requesthead *bucket;
	int error;

	NSPACE_REQ_LOCK();

	while (nspace_resolver_request_count >=
	    NSPACE_RESOLVER_MAX_OUTSTANDING) {
		/* Let the removal path know someone wants a slot. */
		nspace_resolver_request_wait_slot = true;
		error = msleep(&nspace_resolver_request_count,
		    &nspace_resolver_request_hash_mutex,
		    PVFS | PCATCH, "nspacerq", NULL);
		if (error) {
			NSPACE_REQ_UNLOCK();
			return error;
		}
	}

	bucket = NSPACE_RESOLVER_HASH(req->r_req_id);
#if DIAGNOSTIC
	assert(nspace_resolver_req_lookup(req->r_req_id, false) == NULL);
#endif /* DIAGNOSTIC */
	LIST_INSERT_HEAD(bucket, req, r_hashlink);
	nspace_resolver_request_count++;

	NSPACE_REQ_UNLOCK();

	return 0;
}
11663
/*
 * Wait until any in-progress completion for 'req' has finished.
 *
 * Called with NSPACE_REQ_LOCK held; msleep() drops and re-takes it.
 * The wait is deliberately NOT interruptible (no PCATCH): the
 * completion handler is still using 'req', which is allocated on our
 * stack a couple of frames up, so we must not unwind until it is done.
 */
static void
nspace_resolver_req_wait_pending_completion(struct nspace_resolver_request *req)
{
	while ((req->r_flags & RRF_COMPLETING) != 0) {
		(void) msleep(req, &nspace_resolver_request_hash_mutex,
		    PVFS, "nspacecmplt", NULL);
	}
}
11677
11678 static void
nspace_resolver_req_remove_and_unlock(struct nspace_resolver_request * req)11679 nspace_resolver_req_remove_and_unlock(struct nspace_resolver_request *req)
11680 {
11681 struct nspace_resolver_requesthead *bucket;
11682
11683 /* We're called with NSPACE_REQ_LOCK held. */
11684
11685 bucket = NSPACE_RESOLVER_HASH(req->r_req_id);
11686 #if DIAGNOSTIC
11687 assert((req->r_flags & RRF_COMPLETING) == 0);
11688 assert(nspace_resolver_req_lookup(req->r_req_id, false) != NULL);
11689 #endif /* DIAGNOSTIC */
11690 LIST_REMOVE(req, r_hashlink);
11691 nspace_resolver_request_count--;
11692
11693 if (nspace_resolver_request_wait_slot) {
11694 nspace_resolver_request_wait_slot = false;
11695 wakeup(&nspace_resolver_request_count);
11696 }
11697
11698 nspace_resolver_req_wait_pending_completion(req);
11699
11700 NSPACE_REQ_UNLOCK();
11701 }
11702
/*
 * Convenience wrapper: take NSPACE_REQ_LOCK and remove 'req' from the
 * table; the lock is dropped by
 * nspace_resolver_req_remove_and_unlock().
 */
static void
nspace_resolver_req_remove(struct nspace_resolver_request *req)
{
	NSPACE_REQ_LOCK();
	nspace_resolver_req_remove_and_unlock(req);
}
11709
11710 static void
nspace_resolver_req_cancel(uint32_t req_id)11711 nspace_resolver_req_cancel(uint32_t req_id)
11712 {
11713 kern_return_t kr;
11714 mach_port_t mp;
11715
11716 // Failures here aren't fatal -- the cancellation message
11717 // sent to the resolver is merely advisory.
11718
11719 kr = host_get_filecoordinationd_port(host_priv_self(), &mp);
11720 if (kr != KERN_SUCCESS || !IPC_PORT_VALID(mp)) {
11721 return;
11722 }
11723
11724 kr = send_nspace_resolve_cancel(mp, req_id);
11725 if (kr != KERN_SUCCESS) {
11726 os_log_error(OS_LOG_DEFAULT,
11727 "NSPACE send_nspace_resolve_cancel failure: %d", kr);
11728 }
11729
11730 ipc_port_release_send(mp);
11731 }
11732
/*
 * Wait for the resolver to complete 'req'.
 *
 * Sleeps interruptibly until the completion path sets RRF_COMPLETE.
 * ERESTART from msleep() is ignored and the wait resumes; any other
 * msleep() error forces r_resolver_error to EINTR (for EINTR) or
 * ETIMEDOUT (anything else) and triggers an advisory cancel message to
 * filecoordinationd after the request is torn down.
 *
 * Returns the resolver-reported errno (0 on success).
 */
static int
nspace_resolver_req_wait(struct nspace_resolver_request *req)
{
	bool send_cancel_message = false;
	int error;

	NSPACE_REQ_LOCK();

	while ((req->r_flags & RRF_COMPLETE) == 0) {
		error = msleep(req, &nspace_resolver_request_hash_mutex,
		    PVFS | PCATCH, "nspace", NULL);
		if (error && error != ERESTART) {
			/* Interrupted: record why and stop waiting. */
			req->r_resolver_error = (error == EINTR) ? EINTR :
			    ETIMEDOUT;
			send_cancel_message = true;
			break;
		}
	}

	nspace_resolver_req_remove_and_unlock(req);

	/*
	 * It's safe to continue referencing 'req' here because it's
	 * allocated on our caller's stack.
	 */

	if (send_cancel_message) {
		nspace_resolver_req_cancel(req->r_req_id);
	}

	return req->r_resolver_error;
}
11765
/*
 * Record the resolver's result in 'req', clear any pending-completion
 * state, set RRF_COMPLETE, and wake sleepers on 'req' (the requester in
 * nspace_resolver_req_wait() and/or a teardown path in
 * nspace_resolver_req_wait_pending_completion()).
 * Called with NSPACE_REQ_LOCK held.
 */
static void
nspace_resolver_req_mark_complete(
	struct nspace_resolver_request *req,
	int resolver_error)
{
	req->r_resolver_error = resolver_error;
	req->r_flags = (req->r_flags & ~RRF_COMPLETING) | RRF_COMPLETE;
	wakeup(req);
}
11775
/*
 * Mark 'req' as having a completion in progress (RRF_COMPLETING) so
 * that lookups with skip_completing set won't return it again and
 * teardown waits for the completion handler to finish with it.
 * Called with NSPACE_REQ_LOCK held.
 */
static void
nspace_resolver_req_mark_completion_pending(struct nspace_resolver_request *req)
{
	req->r_flags |= RRF_COMPLETING;
}
11781
/*
 * Handle a completion notification from the namespace resolver
 * (delivered via the vfs.nspace.complete sysctl).
 *
 * Looks up the outstanding request by ID; if found, optionally verifies
 * that the namespace shape has not changed since the request was issued
 * (recursive gencount of r_vp and/or sync-root of r_tdvp, as requested
 * by the resolver via orig_gencount/orig_syncroot), then marks the
 * request complete, waking the thread blocked in
 * nspace_resolver_req_wait().
 */
static void
nspace_resolver_req_completed(const struct nspace_resolver_completion_data *c)
{
	struct nspace_resolver_request *req;
	int error;
	struct vnode_attr va;
	vnode_t vp;

	NSPACE_REQ_LOCK();

	req = nspace_resolver_req_lookup(c->req_id, true);
	if (req == NULL) {
		/*
		 * If we don't find the request corresponding to our req_id,
		 * just drop the completion on the floor; it's likely that
		 * the requester interrupted with a signal, or it may already
		 * be completing.
		 */
		NSPACE_REQ_UNLOCK();
		return;
	}

	/*
	 * Get out now if the resolver reported an error.
	 */
	if ((error = c->resolver_error) != 0) {
		goto out;
	}

	/*
	 * If the resolver did not specify any namespace shape criteria
	 * for letting the operation proceed, then get out now.
	 */
	if (c->orig_gencount == 0 && c->orig_syncroot == 0) {
		goto out;
	}

	/*
	 * We're going to have to acquire the mount rename lock and do
	 * some I/O in order to verify the criteria.  Mark the request
	 * as pending so no one else messes with it after we drop the
	 * NSPACE_REQ_LOCK.
	 */
	nspace_resolver_req_mark_completion_pending(req);
	NSPACE_REQ_UNLOCK();

	/*
	 * Lock out renames from changing the shape of the tree while
	 * validate the criteria.
	 */
	mount_t locked_mp = req->r_vp->v_mount;
	mount_ref(locked_mp, 0);
	mount_lock_renames(locked_mp);

	if (c->orig_gencount != 0) {
		vp = req->r_vp;
		if (error) {    /* defensive only: error is known 0 here */
			goto out_dropmount;
		}

		/* Re-fetch the recursive gencount and compare it with the
		 * value captured when the request was issued. */
		VATTR_INIT(&va);
		VATTR_WANTED(&va, va_recursive_gencount);
		error = vnode_getattr(vp, &va, vfs_context_kernel());
		if (error) {
			goto out_dropmount;
		}
		if (VATTR_NOT_RETURNED(&va, va_recursive_gencount) ||
		    va.va_recursive_gencount != c->orig_gencount) {
			printf("nspace.complete: gencount changed! (orig %llu cur %llu)\n",
			    c->orig_gencount, va.va_recursive_gencount);
			error = EBUSY;
			goto out_dropmount;
		}
	}

	/*
	 * Ignore orig_syncroot if a destination directory wasn't specified
	 * in the request.
	 */
	if (c->orig_syncroot != 0 && (vp = req->r_tdvp) != NULL) {
		uint64_t syncroot_id;

		if (error) {    /* defensive only: error cannot be set here */
			goto out_dropmount;
		}

#ifndef APFSIOC_GET_SYNC_ROOT
#define APFSIOC_GET_SYNC_ROOT _IOR('J', 115, uint64_t)
#endif

		/* Ask the file system for the destination's current sync root. */
		error = VNOP_IOCTL(vp, APFSIOC_GET_SYNC_ROOT,
		    (caddr_t)&syncroot_id, 0, vfs_context_kernel());
		if (error) {
			goto out_dropmount;
		}
		if (syncroot_id != c->orig_syncroot) {
			printf("nspace.complete: syncroot changed! (orig %llu cur %llu)\n",
			    c->orig_syncroot, syncroot_id);
			error = EBUSY;
			goto out_dropmount;
		}
	}

out_dropmount:
	mount_unlock_renames(locked_mp);
	mount_drop(locked_mp, 0);
	NSPACE_REQ_LOCK();

out:
	nspace_resolver_req_mark_complete(req, error);
	NSPACE_REQ_UNLOCK();
}
11894
/* The single process currently decorated as the namespace resolver, or NULL. */
static struct proc *nspace_resolver_proc;
11896
11897 static int
nspace_resolver_get_proc_state(struct proc * p,int * is_resolver)11898 nspace_resolver_get_proc_state(struct proc *p, int *is_resolver)
11899 {
11900 *is_resolver = ((p->p_lflag & P_LNSPACE_RESOLVER) &&
11901 p == nspace_resolver_proc) ? 1 : 0;
11902 return 0;
11903 }
11904
/* Forward declaration; presumably a policy check that 'ctx' belongs to the dataless resolver -- defined later in this file (not visible in this chunk). */
static boolean_t vfs_context_is_dataless_resolver(vfs_context_t);
11906
/*
 * Decorate (is_resolver != 0) or undecorate (== 0) the calling process
 * as the system-wide dataless-file namespace resolver.
 *
 * The caller must be root and must pass the
 * vfs_context_is_dataless_resolver() policy check (EPERM otherwise).
 * Only one resolver may be registered at a time (EBUSY otherwise).
 */
static int
nspace_resolver_set_proc_state(struct proc *p, int is_resolver)
{
	vfs_context_t ctx = vfs_context_current();
	int error = 0;

	//
	// The system filecoordinationd runs as uid == 0. This also
	// has the nice side-effect of filtering out filecoordinationd
	// running in the simulator.
	//
	if (!vfs_context_issuser(ctx) ||
	    !vfs_context_is_dataless_resolver(ctx)) {
		return EPERM;
	}

	if (is_resolver) {
		NSPACE_REQ_LOCK();

		if (nspace_resolver_proc == NULL) {
			/* flag the process and record it as the resolver */
			proc_lock(p);
			p->p_lflag |= P_LNSPACE_RESOLVER;
			proc_unlock(p);
			nspace_resolver_proc = p;
		} else {
			/* another process already holds the role */
			error = EBUSY;
		}

		NSPACE_REQ_UNLOCK();
	} else {
		// This is basically just like the exit case.
		// nspace_resolver_exited() will verify that the
		// process is the resolver, and will clear the
		// global.
		nspace_resolver_exited(p);
	}

	return error;
}
11946
11947 static int
nspace_materialization_get_proc_state(struct proc * p,int * is_prevented)11948 nspace_materialization_get_proc_state(struct proc *p, int *is_prevented)
11949 {
11950 if ((p->p_lflag & P_LNSPACE_RESOLVER) != 0 ||
11951 (p->p_vfs_iopolicy &
11952 P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES) == 0) {
11953 *is_prevented = 1;
11954 } else {
11955 *is_prevented = 0;
11956 }
11957 return 0;
11958 }
11959
11960 static int
nspace_materialization_set_proc_state(struct proc * p,int is_prevented)11961 nspace_materialization_set_proc_state(struct proc *p, int is_prevented)
11962 {
11963 if (p->p_lflag & P_LNSPACE_RESOLVER) {
11964 return is_prevented ? 0 : EBUSY;
11965 }
11966
11967 if (is_prevented) {
11968 OSBitAndAtomic16(~((uint16_t)P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES), &p->p_vfs_iopolicy);
11969 } else {
11970 OSBitOrAtomic16((uint16_t)P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES, &p->p_vfs_iopolicy);
11971 }
11972 return 0;
11973 }
11974
11975 static int
nspace_materialization_get_thread_state(int * is_prevented)11976 nspace_materialization_get_thread_state(int *is_prevented)
11977 {
11978 uthread_t ut = current_uthread();
11979
11980 *is_prevented = (ut->uu_flag & UT_NSPACE_NODATALESSFAULTS) ? 1 : 0;
11981 return 0;
11982 }
11983
11984 static int
nspace_materialization_set_thread_state(int is_prevented)11985 nspace_materialization_set_thread_state(int is_prevented)
11986 {
11987 uthread_t ut = current_uthread();
11988
11989 if (is_prevented) {
11990 ut->uu_flag |= UT_NSPACE_NODATALESSFAULTS;
11991 } else {
11992 ut->uu_flag &= ~UT_NSPACE_NODATALESSFAULTS;
11993 }
11994 return 0;
11995 }
11996
/* the vfs.nspace branch -- parent node of the dataless-file sysctl knobs below */
SYSCTL_NODE(_vfs, OID_AUTO, nspace, CTLFLAG_RW | CTLFLAG_LOCKED, NULL, "vfs nspace hinge");
11999
12000 static int
sysctl_nspace_resolver(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)12001 sysctl_nspace_resolver(__unused struct sysctl_oid *oidp,
12002 __unused void *arg1, __unused int arg2, struct sysctl_req *req)
12003 {
12004 struct proc *p = req->p;
12005 int new_value, old_value, changed = 0;
12006 int error;
12007
12008 error = nspace_resolver_get_proc_state(p, &old_value);
12009 if (error) {
12010 return error;
12011 }
12012
12013 error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
12014 &changed);
12015 if (error == 0 && changed) {
12016 error = nspace_resolver_set_proc_state(p, new_value);
12017 }
12018 return error;
12019 }
12020
/* vfs.nspace.resolver: decorate this process as the dataless file resolver */
SYSCTL_PROC(_vfs_nspace, OID_AUTO, resolver,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
    0, 0, sysctl_nspace_resolver, "I", "");
12025
12026 static int
sysctl_nspace_prevent_materialization(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)12027 sysctl_nspace_prevent_materialization(__unused struct sysctl_oid *oidp,
12028 __unused void *arg1, __unused int arg2, struct sysctl_req *req)
12029 {
12030 struct proc *p = req->p;
12031 int new_value, old_value, changed = 0;
12032 int error;
12033
12034 error = nspace_materialization_get_proc_state(p, &old_value);
12035 if (error) {
12036 return error;
12037 }
12038
12039 error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
12040 &changed);
12041 if (error == 0 && changed) {
12042 error = nspace_materialization_set_proc_state(p, new_value);
12043 }
12044 return error;
12045 }
12046
/* vfs.nspace.prevent_materialization: decorate this process as not wanting to materialize dataless files */
SYSCTL_PROC(_vfs_nspace, OID_AUTO, prevent_materialization,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
    0, 0, sysctl_nspace_prevent_materialization, "I", "");
12051
12052 static int
sysctl_nspace_thread_prevent_materialization(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)12053 sysctl_nspace_thread_prevent_materialization(__unused struct sysctl_oid *oidp,
12054 __unused void *arg1, __unused int arg2, struct sysctl_req *req)
12055 {
12056 int new_value, old_value, changed = 0;
12057 int error;
12058
12059 error = nspace_materialization_get_thread_state(&old_value);
12060 if (error) {
12061 return error;
12062 }
12063
12064 error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
12065 &changed);
12066 if (error == 0 && changed) {
12067 error = nspace_materialization_set_thread_state(new_value);
12068 }
12069 return error;
12070 }
12071
/* vfs.nspace.thread_prevent_materialization: decorate this thread as not wanting to materialize dataless files */
SYSCTL_PROC(_vfs_nspace, OID_AUTO, thread_prevent_materialization,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
    0, 0, sysctl_nspace_thread_prevent_materialization, "I", "");
12076
/*
 * vfs.nspace.complete handler: the registered resolver reports the
 * completion status of a materialization request here.
 *
 * Wire format written by the resolver (parsed in this order):
 *   1. uint32_t req_status[2] -- { request id, resolver errno } (required)
 *   2. uint64_t gencount      -- original generation count (optional)
 *   3. uint64_t syncroot      -- original syncroot ID (optional)
 *
 * Returns EPERM if the caller is not the registered resolver.
 */
static int
sysctl_nspace_complete(__unused struct sysctl_oid *oidp, __unused void *arg1,
    __unused int arg2, struct sysctl_req *req)
{
	struct proc *p = req->p;
	uint32_t req_status[2] = { 0, 0 };
	uint64_t gencount = 0;
	uint64_t syncroot = 0;
	int error, is_resolver, changed = 0, other_changed;

	/* Only the registered resolver may complete requests. */
	error = nspace_resolver_get_proc_state(p, &is_resolver);
	if (error) {
		return error;
	}

	if (!is_resolver) {
		return EPERM;
	}

	error = sysctl_io_opaque(req, req_status, sizeof(req_status),
	    &changed);
	if (error) {
		return error;
	}

	/*
	 * Get the gencount if it was passed.  Ignore errors, because
	 * it's optional.
	 */
	error = sysctl_io_opaque(req, &gencount, sizeof(gencount),
	    &other_changed);
	if (error) {
		gencount = 0;
		error = 0;
	}

	/*
	 * ...and now the syncroot ID.  Also optional.
	 */
	error = sysctl_io_opaque(req, &syncroot, sizeof(syncroot),
	    &other_changed);
	if (error) {
		syncroot = 0;
		error = 0;
	}

	/*
	 * req_status[0] is the req_id
	 *
	 * req_status[1] is the errno
	 */
	if (error == 0 && changed) {
		const struct nspace_resolver_completion_data cd = {
			.req_id = req_status[0],
			.resolver_error = req_status[1],
			.orig_gencount = gencount,
			.orig_syncroot = syncroot,
		};
		nspace_resolver_req_completed(&cd);
	}
	return error;
}
12139
/* vfs.nspace.complete: resolver reports completed requests here (opaque payload, see handler) */
SYSCTL_PROC(_vfs_nspace, OID_AUTO, complete,
    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
    0, 0, sysctl_nspace_complete, "-", "");
12144
12145 #endif /* CONFIG_DATALESS_FILES */
12146
12147 #if CONFIG_DATALESS_FILES
12148 #define __no_dataless_unused /* nothing */
12149 #else
12150 #define __no_dataless_unused __unused
12151 #endif
12152
/*
 * Decide whether dataless-file materialization is prevented for the
 * given vfs context.
 *
 * Returns:
 *	0           materialization may proceed
 *	EDEADLK     materialization is prevented (the error historically
 *	            returned for no-materialize contexts)
 *	EJUSTRETURN the caller is entitled to manipulate dataless objects
 *	            directly; vfs_materialize_item() translates this
 *	            per-operation
 */
int
vfs_context_dataless_materialization_is_prevented(
	vfs_context_t const ctx __no_dataless_unused)
{
#if CONFIG_DATALESS_FILES
	proc_t const p = vfs_context_proc(ctx);
	thread_t const t = vfs_context_thread(ctx);
	uthread_t const ut = t ? get_bsdthread_info(t) : NULL;

	/*
	 * Kernel context ==> return EDEADLK, as we would with any random
	 * process decorated as no-materialize.
	 */
	if (ctx == vfs_context_kernel()) {
		return EDEADLK;
	}

	/*
	 * If the process has the dataless-manipulation entitlement,
	 * materialization is prevented, and depending on the kind
	 * of file system operation, things get to proceed as if the
	 * object is not dataless.
	 */
	if (vfs_context_is_dataless_manipulator(ctx)) {
		return EJUSTRETURN;
	}

	/*
	 * Per-thread decorations override any process-wide decorations.
	 * (Foundation uses this, and this overrides even the dataless-
	 * manipulation entitlement so as to make API contracts consistent.)
	 *
	 * NOTE(review): the manipulator check above returns before these
	 * per-thread checks run, so as written the entitlement is NOT
	 * overridden by the thread decorations -- confirm whether the
	 * comment or the check order is the intended behavior.
	 */
	if (ut != NULL) {
		if (ut->uu_flag & UT_NSPACE_NODATALESSFAULTS) {
			return EDEADLK;
		}
		if (ut->uu_flag & UT_NSPACE_FORCEDATALESSFAULTS) {
			return 0;
		}
	}

	/*
	 * If the process's iopolicy specifies that dataless files
	 * can be materialized, then we let it go ahead.
	 */
	if (p->p_vfs_iopolicy & P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES) {
		return 0;
	}
#endif /* CONFIG_DATALESS_FILES */

	/*
	 * The default behavior is to not materialize dataless files;
	 * return to the caller that deadlock was detected.
	 */
	return EDEADLK;
}
12209
/*
 * One-time initialization of the dataless-file resolver machinery:
 * allocates the hash table used to track in-flight resolver requests.
 */
void
nspace_resolver_init(void)
{
#if CONFIG_DATALESS_FILES
	nspace_resolver_request_hashtbl =
	    hashinit(NSPACE_RESOLVER_REQ_HASHSIZE,
	    M_VNODE /* XXX */, &nspace_resolver_request_hashmask);
#endif /* CONFIG_DATALESS_FILES */
}
12219
/*
 * Called when a process exits (and when the resolver voluntarily
 * unregisters via nspace_resolver_set_proc_state()).  If 'p' is the
 * registered resolver, fail every pending request with ETIMEDOUT and
 * clear the global resolver pointer.
 */
void
nspace_resolver_exited(struct proc *p __no_dataless_unused)
{
#if CONFIG_DATALESS_FILES
	struct nspace_resolver_requesthead *bucket;
	struct nspace_resolver_request *req;
	u_long idx;

	NSPACE_REQ_LOCK();

	if ((p->p_lflag & P_LNSPACE_RESOLVER) &&
	    p == nspace_resolver_proc) {
		/* Walk every hash bucket, completing outstanding requests. */
		for (idx = 0; idx <= nspace_resolver_request_hashmask; idx++) {
			bucket = &nspace_resolver_request_hashtbl[idx];
			LIST_FOREACH(req, bucket, r_hashlink) {
				nspace_resolver_req_wait_pending_completion(req);
				nspace_resolver_req_mark_complete(req,
				    ETIMEDOUT);
			}
		}
		nspace_resolver_proc = NULL;
	}

	NSPACE_REQ_UNLOCK();
#endif /* CONFIG_DATALESS_FILES */
}
12246
12247 #define DATALESS_RESOLVER_ENTITLEMENT \
12248 "com.apple.private.vfs.dataless-resolver"
12249 #define DATALESS_MANIPULATION_ENTITLEMENT \
12250 "com.apple.private.vfs.dataless-manipulation"
12251
12252 #if CONFIG_DATALESS_FILES
12253 /*
12254 * Return TRUE if the vfs context is associated with the dataless
12255 * resolver.
12256 */
12257 static boolean_t
vfs_context_is_dataless_resolver(vfs_context_t ctx __no_dataless_unused)12258 vfs_context_is_dataless_resolver(vfs_context_t ctx __no_dataless_unused)
12259 {
12260 return IOTaskHasEntitlement(vfs_context_task(ctx),
12261 DATALESS_RESOLVER_ENTITLEMENT);
12262 }
12263 #endif /* CONFIG_DATALESS_FILES */
12264
12265 /*
12266 * Return TRUE if the vfs context is associated with a process entitled
12267 * for dataless manipulation.
12268 *
12269 * XXX Arguably belongs in vfs_subr.c, but is here because of the
12270 * complication around CONFIG_DATALESS_FILES.
12271 */
12272 boolean_t
vfs_context_is_dataless_manipulator(vfs_context_t ctx __no_dataless_unused)12273 vfs_context_is_dataless_manipulator(vfs_context_t ctx __no_dataless_unused)
12274 {
12275 #if CONFIG_DATALESS_FILES
12276 task_t task = vfs_context_task(ctx);
12277 return IOTaskHasEntitlement(task, DATALESS_MANIPULATION_ENTITLEMENT) ||
12278 IOTaskHasEntitlement(task, DATALESS_RESOLVER_ENTITLEMENT);
12279 #else
12280 return false;
12281 #endif /* CONFIG_DATALESS_FILES */
12282 }
12283
12284 #if CONFIG_DATALESS_FILES
12285 static void
log_materialization_prevented(vnode_t vp,uint64_t op)12286 log_materialization_prevented(vnode_t vp, uint64_t op)
12287 {
12288 char p_name[MAXCOMLEN + 1];
12289 char *vntype;
12290 proc_selfname(&p_name[0], sizeof(p_name));
12291
12292 if (vp->v_type == VREG) {
12293 vntype = "File";
12294 } else if (vp->v_type == VDIR) {
12295 vntype = "Dir";
12296 } else if (vp->v_type == VLNK) {
12297 vntype = "SymLink";
12298 } else {
12299 vntype = "Other";
12300 }
12301
12302 #if DEVELOPMENT
12303 struct vnode_attr *vap = kalloc_type(struct vnode_attr, Z_WAITOK);
12304
12305 VATTR_INIT(vap);
12306 VATTR_WANTED(vap, va_fsid);
12307 VATTR_WANTED(vap, va_fileid);
12308 if (vnode_getattr(vp, vap, vfs_context_current()) == 0) {
12309 os_log_debug(OS_LOG_DEFAULT,
12310 "NSPACE process %s (pid %d) is decorated as no-materialization (op %lld; %s) fsid 0x%08x/%u fileid=%llu",
12311 p_name, proc_selfpid(), op, vntype,
12312 vap->va_fsid, vap->va_fsid, vap->va_fileid);
12313 } else
12314 #endif
12315 {
12316 os_log_debug(OS_LOG_DEFAULT,
12317 "NSPACE process %s (pid %d) is decorated as no-materialization (op %lld; %s)",
12318 p_name, proc_selfpid(), op, vntype);
12319 }
12320 #if DEVELOPMENT
12321 kfree_type(struct vnode_attr, vap);
12322 #endif
12323 }
12324 #endif /* CONFIG_DATALESS_FILES */
12325
12326 static int
vfs_materialize_item(vnode_t vp __no_dataless_unused,uint32_t op __no_dataless_unused,int64_t offset __no_dataless_unused,int64_t size __no_dataless_unused,char * lookup_name __no_dataless_unused,size_t const namelen __no_dataless_unused,vnode_t tdvp __no_dataless_unused)12327 vfs_materialize_item(
12328 vnode_t vp __no_dataless_unused,
12329 uint32_t op __no_dataless_unused,
12330 int64_t offset __no_dataless_unused,
12331 int64_t size __no_dataless_unused,
12332 char *lookup_name __no_dataless_unused,
12333 size_t const namelen __no_dataless_unused,
12334 vnode_t tdvp __no_dataless_unused)
12335 {
12336 #if CONFIG_DATALESS_FILES
12337 kern_return_t kern_ret;
12338 mach_port_t mach_port;
12339 char *path = NULL;
12340 vfs_context_t context;
12341 int path_len;
12342 int error;
12343 audit_token_t atoken;
12344 enum vtype vp_vtype;
12345
12346 /* Swap files are special; ignore them */
12347 if (vnode_isswap(vp)) {
12348 return 0;
12349 }
12350
12351 /*
12352 * NAMESPACE_HANDLER_SNAPSHOT_EVENT and NAMESPACE_HANDLER_TRACK_EVENT
12353 * are no longer used nor supported.
12354 */
12355 if (op & NAMESPACE_HANDLER_SNAPSHOT_EVENT) {
12356 os_log_debug(OS_LOG_DEFAULT, "NSPACE SNAPSHOT not handled");
12357 return ENOTSUP;
12358 }
12359 if (op & NAMESPACE_HANDLER_TRACK_EVENT) {
12360 os_log_debug(OS_LOG_DEFAULT, "NSPACE TRACK not handled");
12361 return ENOTSUP;
12362 }
12363
12364 /* Normalize 'op'. */
12365 op &= ~NAMESPACE_HANDLER_EVENT_TYPE_MASK;
12366
12367 /*
12368 * To-directory is only meaningful for rename operations;
12369 * ignore it if someone handed one to us unexpectedly.
12370 */
12371 if (op != NAMESPACE_HANDLER_RENAME_OP) {
12372 tdvp = NULL;
12373 }
12374
12375 context = vfs_context_current();
12376
12377 /* Remember this for later. */
12378 vp_vtype = vnode_vtype(vp);
12379
12380 error = vfs_context_dataless_materialization_is_prevented(context);
12381 if (error) {
12382 log_materialization_prevented(vp, op);
12383 goto out_check_errors;
12384 }
12385
12386 kern_ret = host_get_filecoordinationd_port(host_priv_self(),
12387 &mach_port);
12388 if (kern_ret != KERN_SUCCESS || !IPC_PORT_VALID(mach_port)) {
12389 os_log_error(OS_LOG_DEFAULT, "NSPACE no port");
12390 /*
12391 * Treat this like being unable to access the backing store
12392 * server.
12393 */
12394 return ETIMEDOUT;
12395 }
12396
12397 int path_alloc_len = MAXPATHLEN;
12398 do {
12399 path = kalloc_data(path_alloc_len, Z_WAITOK | Z_ZERO);
12400 if (path == NULL) {
12401 return ENOMEM;
12402 }
12403
12404 path_len = path_alloc_len;
12405 error = vn_getpath(vp, path, &path_len);
12406 if (error == 0) {
12407 break;
12408 } else if (error == ENOSPC) {
12409 kfree_data(path, path_alloc_len);
12410 path = NULL;
12411 } else {
12412 goto out_release_port;
12413 }
12414 } while (error == ENOSPC && (path_alloc_len += MAXPATHLEN) &&
12415 path_alloc_len <= MAXLONGPATHLEN);
12416
12417 error = vfs_context_copy_audit_token(context, &atoken);
12418 if (error) {
12419 goto out_release_port;
12420 }
12421
12422 struct nspace_resolver_request req = {
12423 .r_req_id = next_nspace_req_id(),
12424 .r_vp = vp,
12425 .r_tdvp = tdvp,
12426 };
12427
12428 error = nspace_resolver_req_add(&req);
12429 if (error) {
12430 goto out_release_port;
12431 }
12432
12433 os_log_debug(OS_LOG_DEFAULT, "NSPACE resolve_path call");
12434
12435 if (op == NAMESPACE_HANDLER_RENAME_OP && tdvp != NULL) {
12436 char *dest_path = NULL;
12437 int dest_path_len;
12438
12439 dest_path = zalloc(ZV_NAMEI);
12440 dest_path_len = MAXPATHLEN;
12441
12442 error = vn_getpath(tdvp, dest_path, &dest_path_len);
12443 if (error) {
12444 zfree(ZV_NAMEI, dest_path);
12445 goto out_release_port;
12446 }
12447
12448 /*
12449 * Force setting NAMESPACE_HANDLER_NSPACE_EVENT for
12450 * compatibility with existing agents in user-space
12451 * who get passed this value.
12452 */
12453 kern_ret = send_vfs_resolve_reparent_with_audit_token(mach_port,
12454 req.r_req_id,
12455 op | NAMESPACE_HANDLER_NSPACE_EVENT,
12456 path, dest_path, atoken);
12457
12458 zfree(ZV_NAMEI, dest_path);
12459 } else if (vp_vtype == VDIR) {
12460 char *tmpname = NULL;
12461
12462 /*
12463 * If the caller provided a lookup_name *and* a name length,
12464 * then we assume the lookup_name is not NUL-terminated.
12465 * Allocate a temporary buffer in this case to provide
12466 * a NUL-terminated path name to the IPC call.
12467 */
12468 if (lookup_name != NULL && namelen != 0) {
12469 if (namelen >= PATH_MAX) {
12470 error = EINVAL;
12471 goto out_req_remove;
12472 }
12473 tmpname = zalloc(ZV_NAMEI);
12474 strlcpy(tmpname, lookup_name, namelen + 1);
12475 lookup_name = tmpname;
12476 } else if (lookup_name != NULL) {
12477 /*
12478 * If the caller provided a lookup_name with a
12479 * zero name length, then we assume it's NUL-
12480 * terminated. Verify it has a valid length.
12481 */
12482 if (strlen(lookup_name) >= PATH_MAX) {
12483 error = EINVAL;
12484 goto out_req_remove;
12485 }
12486 }
12487
12488 /* (See above.) */
12489 kern_ret = send_vfs_resolve_dir_with_audit_token(mach_port,
12490 req.r_req_id,
12491 op | NAMESPACE_HANDLER_NSPACE_EVENT,
12492 lookup_name == NULL ? "" : lookup_name, path, atoken);
12493
12494 if (tmpname != NULL) {
12495 zfree(ZV_NAMEI, tmpname);
12496
12497 /*
12498 * Poison lookup_name rather than reference
12499 * freed memory.
12500 */
12501 lookup_name = NULL;
12502 }
12503 } else {
12504 /* (See above.) */
12505 kern_ret = send_vfs_resolve_file_with_audit_token(mach_port,
12506 req.r_req_id,
12507 op | NAMESPACE_HANDLER_NSPACE_EVENT,
12508 offset, size, path, atoken);
12509 }
12510 if (kern_ret != KERN_SUCCESS) {
12511 /*
12512 * Also treat this like being unable to access the backing
12513 * store server.
12514 */
12515 os_log_error(OS_LOG_DEFAULT, "NSPACE resolve failure: %d",
12516 kern_ret);
12517 error = ETIMEDOUT;
12518 goto out_req_remove;
12519 }
12520
12521 /*
12522 * Give back the memory we allocated earlier while we wait; we
12523 * no longer need it.
12524 */
12525 kfree_data(path, path_alloc_len);
12526 path = NULL;
12527
12528 /*
12529 * Request has been submitted to the resolver. Now (interruptibly)
12530 * wait for completion. Upon requrn, the request will have been
12531 * removed from the lookup table.
12532 */
12533 error = nspace_resolver_req_wait(&req);
12534
12535 out_release_port:
12536 if (path != NULL) {
12537 kfree_data(path, path_alloc_len);
12538 path = NULL;
12539 }
12540 ipc_port_release_send(mach_port);
12541
12542 out_check_errors:
12543 /*
12544 * The file resolver owns the logic about what error to return
12545 * to the caller. We only need to handle a couple of special
12546 * cases here:
12547 */
12548 if (error == EJUSTRETURN) {
12549 /*
12550 * The requesting process is allowed to interact with
12551 * dataless objects. Make a couple of sanity-checks
12552 * here to ensure the action makes sense.
12553 */
12554 switch (op) {
12555 case NAMESPACE_HANDLER_WRITE_OP:
12556 case NAMESPACE_HANDLER_TRUNCATE_OP:
12557 case NAMESPACE_HANDLER_RENAME_OP:
12558 /*
12559 * This handles the case of the resolver itself
12560 * writing data to the file (or throwing it
12561 * away).
12562 */
12563 error = 0;
12564 break;
12565 case NAMESPACE_HANDLER_READ_OP:
12566 case NAMESPACE_HANDLER_LOOKUP_OP:
12567 /*
12568 * This handles the case of the resolver needing
12569 * to look up inside of a dataless directory while
12570 * it's in the process of materializing it (for
12571 * example, creating files or directories).
12572 */
12573 error = (vp_vtype == VDIR) ? 0 : EBADF;
12574 break;
12575 default:
12576 error = EBADF;
12577 break;
12578 }
12579 }
12580
12581 return error;
12582
12583 out_req_remove:
12584 nspace_resolver_req_remove(&req);
12585 goto out_release_port;
12586 #else
12587 return ENOTSUP;
12588 #endif /* CONFIG_DATALESS_FILES */
12589 }
12590
12591 /*
12592 * vfs_materialize_file: Materialize a regular file.
12593 *
12594 * Inputs:
12595 * vp The dataless file to be materialized.
12596 *
12597 * op What kind of operation is being performed:
12598 * -> NAMESPACE_HANDLER_READ_OP
12599 * -> NAMESPACE_HANDLER_WRITE_OP
12600 * -> NAMESPACE_HANDLER_LINK_CREATE
12601 * -> NAMESPACE_HANDLER_DELETE_OP
12602 * -> NAMESPACE_HANDLER_TRUNCATE_OP
12603 * -> NAMESPACE_HANDLER_RENAME_OP
12604 *
12605 * offset offset of I/O for READ or WRITE. Ignored for
12606 * other ops.
12607 *
 * size		size of I/O for READ or WRITE.  Ignored for
12609 * other ops.
12610 *
12611 * If offset or size are -1 for a READ or WRITE, then the resolver should
12612 * consider the range to be unknown.
12613 *
12614 * Upon successful return, the caller may proceed with the operation.
12615 * N.B. the file may still be "dataless" in this case.
12616 */
12617 int
vfs_materialize_file(struct vnode * vp,uint64_t op,int64_t offset,int64_t size)12618 vfs_materialize_file(
12619 struct vnode *vp,
12620 uint64_t op,
12621 int64_t offset,
12622 int64_t size)
12623 {
12624 if (vp->v_type != VREG) {
12625 return EFTYPE;
12626 }
12627 return vfs_materialize_item(vp, (uint32_t)op, offset, size, NULL, 0,
12628 NULL);
12629 }
12630
12631 /*
12632 * vfs_materialize_dir:
12633 *
12634 * Inputs:
12635 * vp The dataless directory to be materialized.
12636 *
12637 * op What kind of operation is being performed:
12638 * -> NAMESPACE_HANDLER_READ_OP
12639 * -> NAMESPACE_HANDLER_WRITE_OP
12640 * -> NAMESPACE_HANDLER_DELETE_OP
12641 * -> NAMESPACE_HANDLER_RENAME_OP
12642 * -> NAMESPACE_HANDLER_LOOKUP_OP
12643 *
12644 * lookup_name Name being looked up for a LOOKUP op. Ignored for
12645 * other ops. May or may not be NUL-terminated; see below.
12646 *
12647 * namelen If non-zero, then lookup_name is assumed to not be NUL-
12648 * terminated and namelen is the number of valid bytes in
12649 * lookup_name. If zero, then lookup_name is assumed to be
12650 * NUL-terminated.
12651 *
12652 * Upon successful return, the caller may proceed with the operation.
12653 * N.B. the directory may still be "dataless" in this case.
12654 */
12655 int
vfs_materialize_dir(struct vnode * vp,uint64_t op,char * lookup_name,size_t namelen)12656 vfs_materialize_dir(
12657 struct vnode *vp,
12658 uint64_t op,
12659 char *lookup_name,
12660 size_t namelen)
12661 {
12662 if (vp->v_type != VDIR) {
12663 return EFTYPE;
12664 }
12665 if (op == NAMESPACE_HANDLER_LOOKUP_OP && lookup_name == NULL) {
12666 return EINVAL;
12667 }
12668 return vfs_materialize_item(vp, (uint32_t)op, 0, 0, lookup_name,
12669 namelen, NULL);
12670 }
12671
12672 /*
12673 * vfs_materialize_reparent:
12674 *
12675 * Inputs:
12676 * vp The dataless file or directory to be materialized.
12677 *
12678 * tdvp The new parent directory for the dataless file.
12679 *
12680 * Upon successful return, the caller may proceed with the operation.
12681 * N.B. the item may still be "dataless" in this case.
12682 */
12683 int
vfs_materialize_reparent(vnode_t vp,vnode_t tdvp)12684 vfs_materialize_reparent(vnode_t vp, vnode_t tdvp)
12685 {
12686 if (vp->v_type != VDIR && vp->v_type != VREG) {
12687 return EFTYPE;
12688 }
12689 return vfs_materialize_item(vp, NAMESPACE_HANDLER_RENAME_OP,
12690 0, 0, NULL, 0, tdvp);
12691 }
12692
/*
 * NOTE(review): dead code, compiled out with "#if 0"; kept verbatim.
 * Builds a "/.vol/<fsid>/<fileid>" style path for a vnode.
 */
#if 0
static int
build_volfs_path(struct vnode *vp, char *path, int *len)
{
	struct vnode_attr va;
	int ret;

	VATTR_INIT(&va);
	VATTR_WANTED(&va, va_fsid);
	VATTR_WANTED(&va, va_fileid);

	if (vnode_getattr(vp, &va, vfs_context_kernel()) != 0) {
		*len = snprintf(path, *len, "/non/existent/path/because/vnode_getattr/failed") + 1;
		ret = -1;
	} else {
		*len = snprintf(path, *len, "/.vol/%d/%lld", (dev_t)va.va_fsid, va.va_fileid) + 1;
		ret = 0;
	}

	return ret;
}
#endif
12715
12716 static unsigned long
fsctl_bogus_command_compat(unsigned long cmd)12717 fsctl_bogus_command_compat(unsigned long cmd)
12718 {
12719 switch (cmd) {
12720 case IOCBASECMD(FSIOC_SYNC_VOLUME):
12721 return FSIOC_SYNC_VOLUME;
12722 case IOCBASECMD(FSIOC_ROUTEFS_SETROUTEID):
12723 return FSIOC_ROUTEFS_SETROUTEID;
12724 case IOCBASECMD(FSIOC_SET_PACKAGE_EXTS):
12725 return FSIOC_SET_PACKAGE_EXTS;
12726 case IOCBASECMD(FSIOC_SET_FSTYPENAME_OVERRIDE):
12727 return FSIOC_SET_FSTYPENAME_OVERRIDE;
12728 case IOCBASECMD(DISK_CONDITIONER_IOC_GET):
12729 return DISK_CONDITIONER_IOC_GET;
12730 case IOCBASECMD(DISK_CONDITIONER_IOC_SET):
12731 return DISK_CONDITIONER_IOC_SET;
12732 case IOCBASECMD(FSIOC_FIOSEEKHOLE):
12733 return FSIOC_FIOSEEKHOLE;
12734 case IOCBASECMD(FSIOC_FIOSEEKDATA):
12735 return FSIOC_FIOSEEKDATA;
12736 case IOCBASECMD(SPOTLIGHT_IOC_GET_MOUNT_TIME):
12737 return SPOTLIGHT_IOC_GET_MOUNT_TIME;
12738 case IOCBASECMD(SPOTLIGHT_IOC_GET_LAST_MTIME):
12739 return SPOTLIGHT_IOC_GET_LAST_MTIME;
12740 }
12741
12742 return cmd;
12743 }
12744
/*
 * chflags0() callback: ask the file system to compare-and-swap the
 * BSD flags via FSIOC_CAS_BSDFLAGS; 'arg' is a struct fsioc_cas_bsdflags.
 */
static int
cas_bsdflags_setattr(vnode_t vp, void *arg, vfs_context_t ctx)
{
	return VNOP_IOCTL(vp, FSIOC_CAS_BSDFLAGS, arg, FWRITE, ctx);
}
12750
/*
 * FSIOC_SYNC_VOLUME: sync the volume containing 'vp'.  'data' points
 * to a uint32_t of FSCTL_SYNC_* flags.  On return, *arg_vp is set to
 * NULL to tell the caller the vnode's iocount has been released.
 */
static int __attribute__((noinline))
handle_sync_volume(vnode_t vp, vnode_t *arg_vp, caddr_t data, vfs_context_t ctx)
{
	struct vfs_attr vfa;
	mount_t mp = vp->v_mount;
	unsigned arg;
	int error;

	/* record vid of vp so we can drop it below. */
	uint32_t vvid = vp->v_id;

	/*
	 * Then grab mount_iterref so that we can release the vnode.
	 * Without this, a thread may call vnode_iterate_prepare then
	 * get into a deadlock because we've never released the root vp
	 */
	error = mount_iterref(mp, 0);
	if (error) {
		return error;
	}
	vnode_hold(vp);
	vnode_put(vp);

	arg = MNT_NOWAIT;
	if (*(uint32_t*)data & FSCTL_SYNC_WAIT) {
		arg = MNT_WAIT;
	}

	/*
	 * If the filesystem supports multiple file systems in a
	 * partition (for example APFS volumes in a container), it knows
	 * that the waitfor argument to VFS_SYNC are flags.
	 */
	VFSATTR_INIT(&vfa);
	VFSATTR_WANTED(&vfa, f_capabilities);
	if ((vfs_getattr(mp, &vfa, vfs_context_current()) == 0) &&
	    VFSATTR_IS_SUPPORTED(&vfa, f_capabilities) &&
	    ((vfa.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_SHARED_SPACE)) &&
	    ((vfa.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_SHARED_SPACE))) {
		arg |= MNT_VOLUME;
	}

	/* issue the sync for this volume */
	(void)sync_callback(mp, &arg);

	/*
	 * Then release the mount_iterref once we're done syncing; it's not
	 * needed for the VNOP_IOCTL below
	 */
	mount_iterdrop(mp);

	/*
	 * NOTE(review): this tests 'arg' (which holds MNT_* wait flags)
	 * against FSCTL_SYNC_FULLSYNC (a userspace FSCTL_SYNC_* flag)
	 * rather than the user-supplied *(uint32_t *)data -- confirm
	 * whether that is intentional.
	 */
	if (arg & FSCTL_SYNC_FULLSYNC) {
		/* re-obtain vnode iocount on the root vp, if possible */
		error = vnode_getwithvid(vp, vvid);
		if (error == 0) {
			error = VNOP_IOCTL(vp, F_FULLFSYNC, (caddr_t)NULL, 0, ctx);
			vnode_put(vp);
		}
	}
	vnode_drop(vp);
	/* mark the argument VP as having been released */
	*arg_vp = NULL;
	return error;
}
12815
12816 #if ROUTEFS
12817 static int __attribute__((noinline))
handle_routes(user_addr_t udata)12818 handle_routes(user_addr_t udata)
12819 {
12820 char routepath[MAXPATHLEN];
12821 size_t len = 0;
12822 int error;
12823
12824 if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
12825 return error;
12826 }
12827 bzero(routepath, MAXPATHLEN);
12828 error = copyinstr(udata, &routepath[0], MAXPATHLEN, &len);
12829 if (error) {
12830 return error;
12831 }
12832 error = routefs_kernel_mount(routepath);
12833 return error;
12834 }
12835 #endif
12836
12837 static int __attribute__((noinline))
handle_flags(vnode_t vp,caddr_t data,vfs_context_t ctx)12838 handle_flags(vnode_t vp, caddr_t data, vfs_context_t ctx)
12839 {
12840 struct fsioc_cas_bsdflags *cas = (struct fsioc_cas_bsdflags *)data;
12841 struct vnode_attr va;
12842 int error;
12843
12844 VATTR_INIT(&va);
12845 VATTR_SET(&va, va_flags, cas->new_flags);
12846
12847 error = chflags0(vp, &va, cas_bsdflags_setattr, cas, ctx);
12848
12849 #if CONFIG_FSE
12850 if (error == 0 && cas->expected_flags == cas->actual_flags && need_fsevent(FSE_STAT_CHANGED, vp)) {
12851 add_fsevent(FSE_STAT_CHANGED, ctx, FSE_ARG_VNODE, vp, FSE_ARG_DONE);
12852 }
12853 #endif
12854
12855 return error;
12856 }
12857
12858 static int __attribute__((noinline))
handle_auth(vnode_t vp,u_long cmd,caddr_t data,u_long options,vfs_context_t ctx)12859 handle_auth(vnode_t vp, u_long cmd, caddr_t data, u_long options, vfs_context_t ctx)
12860 {
12861 struct mount *mp = NULL;
12862 errno_t rootauth = 0;
12863
12864 mp = vp->v_mount;
12865
12866 /*
12867 * query the underlying FS and see if it reports something
12868 * sane for this vnode. If volume is authenticated via
12869 * chunklist, leave that for the caller to determine.
12870 */
12871 rootauth = VNOP_IOCTL(vp, cmd, data, (int)options, ctx);
12872
12873 return rootauth;
12874 }
12875
12876 #define SET_PACKAGE_EXTENSION_ENTITLEMENT \
12877 "com.apple.private.kernel.set-package-extensions"
12878
12879 /*
12880 * Make a filesystem-specific control call:
12881 */
12882 /* ARGSUSED */
12883 static int
fsctl_internal(proc_t p,vnode_t * arg_vp,u_long cmd,user_addr_t udata,u_long options,vfs_context_t ctx)12884 fsctl_internal(proc_t p, vnode_t *arg_vp, u_long cmd, user_addr_t udata, u_long options, vfs_context_t ctx)
12885 {
12886 int error = 0;
12887 boolean_t is64bit;
12888 u_int size;
12889 #define STK_PARAMS 128
12890 char stkbuf[STK_PARAMS] = {0};
12891 caddr_t data, memp;
12892 vnode_t vp = *arg_vp;
12893
12894 if (vp->v_type == VCHR || vp->v_type == VBLK) {
12895 return ENOTTY;
12896 }
12897
12898 cmd = fsctl_bogus_command_compat(cmd);
12899
12900 size = IOCPARM_LEN(cmd);
12901 if (size > IOCPARM_MAX) {
12902 return EINVAL;
12903 }
12904
12905 is64bit = proc_is64bit(p);
12906
12907 memp = NULL;
12908
12909 if (size > sizeof(stkbuf)) {
12910 if ((memp = (caddr_t)kalloc_data(size, Z_WAITOK)) == 0) {
12911 return ENOMEM;
12912 }
12913 data = memp;
12914 } else {
12915 data = &stkbuf[0];
12916 };
12917
12918 if (cmd & IOC_IN) {
12919 if (size) {
12920 error = copyin(udata, data, size);
12921 if (error) {
12922 if (memp) {
12923 kfree_data(memp, size);
12924 }
12925 return error;
12926 }
12927 } else {
12928 if (is64bit) {
12929 *(user_addr_t *)data = udata;
12930 } else {
12931 *(uint32_t *)data = (uint32_t)udata;
12932 }
12933 };
12934 } else if ((cmd & IOC_OUT) && size) {
12935 /*
12936 * Zero the buffer so the user always
12937 * gets back something deterministic.
12938 */
12939 bzero(data, size);
12940 } else if (cmd & IOC_VOID) {
12941 if (is64bit) {
12942 *(user_addr_t *)data = udata;
12943 } else {
12944 *(uint32_t *)data = (uint32_t)udata;
12945 }
12946 }
12947
12948 /* Check to see if it's a generic command */
12949 switch (cmd) {
12950 case FSIOC_SYNC_VOLUME:
12951 error = handle_sync_volume(vp, arg_vp, data, ctx);
12952 break;
12953
12954 case FSIOC_ROUTEFS_SETROUTEID:
12955 #if ROUTEFS
12956 error = handle_routes(udata);
12957 #endif
12958 break;
12959
12960 case FSIOC_SET_PACKAGE_EXTS: {
12961 user_addr_t ext_strings;
12962 uint32_t num_entries;
12963 uint32_t max_width;
12964
12965 if (!IOTaskHasEntitlement(vfs_context_task(ctx),
12966 SET_PACKAGE_EXTENSION_ENTITLEMENT)) {
12967 error = EPERM;
12968 break;
12969 }
12970
12971 if ((is64bit && size != sizeof(user64_package_ext_info))
12972 || (is64bit == 0 && size != sizeof(user32_package_ext_info))) {
12973 // either you're 64-bit and passed a 64-bit struct or
12974 // you're 32-bit and passed a 32-bit struct. otherwise
12975 // it's not ok.
12976 error = EINVAL;
12977 break;
12978 }
12979
12980 if (is64bit) {
12981 if (sizeof(user64_addr_t) > sizeof(user_addr_t)) {
12982 assert(((user64_package_ext_info *)data)->strings <= UINT32_MAX);
12983 }
12984 ext_strings = (user_addr_t)((user64_package_ext_info *)data)->strings;
12985 num_entries = ((user64_package_ext_info *)data)->num_entries;
12986 max_width = ((user64_package_ext_info *)data)->max_width;
12987 } else {
12988 ext_strings = CAST_USER_ADDR_T(((user32_package_ext_info *)data)->strings);
12989 num_entries = ((user32_package_ext_info *)data)->num_entries;
12990 max_width = ((user32_package_ext_info *)data)->max_width;
12991 }
12992 error = set_package_extensions_table(ext_strings, num_entries, max_width);
12993 }
12994 break;
12995
12996 case FSIOC_SET_FSTYPENAME_OVERRIDE:
12997 {
12998 mount_t mp;
12999
13000 if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
13001 break;
13002 }
13003 if ((mp = vp->v_mount) != NULL) {
13004 mount_lock(mp);
13005 if (data[0] != 0) {
13006 for (int i = 0; i < MFSTYPENAMELEN; i++) {
13007 if (!data[i]) {
13008 goto continue_copy;
13009 }
13010 }
13011 /*
13012 * Getting here means we have a user data
13013 * string which has no NULL termination in
13014 * its first MFSTYPENAMELEN bytes. This is
13015 * bogus, let's avoid strlcpy-ing the read
13016 * data and return an error.
13017 */
13018 error = EINVAL;
13019 goto unlock;
13020 continue_copy:
13021 vfs_setfstypename_locked(mp, data);
13022 if (vfs_isrdonly(mp) &&
13023 strcmp(data, "mtmfs") == 0) {
13024 mp->mnt_kern_flag |=
13025 MNTK_EXTENDED_SECURITY;
13026 mp->mnt_kern_flag &=
13027 ~MNTK_AUTH_OPAQUE;
13028 }
13029 } else if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
13030 const char *name =
13031 vfs_getfstypenameref_locked(mp, NULL);
13032 if (strcmp(name, "mtmfs") == 0) {
13033 mp->mnt_kern_flag &=
13034 ~MNTK_EXTENDED_SECURITY;
13035 }
13036 vfs_setfstypename_locked(mp, NULL);
13037 }
13038 unlock:
13039 mount_unlock(mp);
13040 }
13041 }
13042 break;
13043
13044 case DISK_CONDITIONER_IOC_GET: {
13045 error = disk_conditioner_get_info(vp->v_mount, (disk_conditioner_info *)data);
13046 }
13047 break;
13048
13049 case DISK_CONDITIONER_IOC_SET: {
13050 error = disk_conditioner_set_info(vp->v_mount, (disk_conditioner_info *)data);
13051 }
13052 break;
13053
13054 case FSIOC_CAS_BSDFLAGS:
13055 error = handle_flags(vp, data, ctx);
13056 break;
13057
13058 case FSIOC_FD_ONLY_OPEN_ONCE: {
13059 error = 0;
13060 if (vnode_usecount(vp) > 1) {
13061 vnode_lock_spin(vp);
13062 if (vp->v_lflag & VL_HASSTREAMS) {
13063 if (vnode_isinuse_locked(vp, 1, 1)) {
13064 error = EBUSY;
13065 }
13066 } else if (vnode_usecount(vp) > 1) {
13067 error = EBUSY;
13068 }
13069 vnode_unlock(vp);
13070 }
13071 }
13072 break;
13073
13074 case FSIOC_EVAL_ROOTAUTH:
13075 error = handle_auth(vp, cmd, data, options, ctx);
13076 break;
13077
13078 case FSIOC_TEST_FSE_ACCESS_GRANTED:
13079 error = test_fse_access_granted(vp, (unsigned long)udata, ctx);
13080 break;
13081
13082 #if CONFIG_EXCLAVES
13083 case FSIOC_EXCLAVE_FS_REGISTER:
13084 if (IOTaskHasEntitlement(vfs_context_task(ctx), EXCLAVE_FS_REGISTER_ENTITLEMENT)) {
13085 error = vfs_exclave_fs_register(((fsioc_exclave_fs_register_t *)data)->fs_tag, vp);
13086 } else {
13087 error = EPERM;
13088 }
13089 break;
13090
13091 case FSIOC_EXCLAVE_FS_UNREGISTER:
13092 if (IOTaskHasEntitlement(vfs_context_task(ctx), EXCLAVE_FS_REGISTER_ENTITLEMENT)) {
13093 error = vfs_exclave_fs_unregister(vp);
13094 } else {
13095 error = EPERM;
13096 }
13097 break;
13098
13099 case FSIOC_EXCLAVE_FS_GET_BASE_DIRS: {
13100 exclave_fs_get_base_dirs_t *get_base_dirs = ((exclave_fs_get_base_dirs_t *)data);
13101 exclave_fs_base_dir_t *dirs = NULL;
13102 if (!IOTaskHasEntitlement(vfs_context_task(ctx), EXCLAVE_FS_REGISTER_ENTITLEMENT)) {
13103 error = EPERM;
13104 break;
13105 }
13106 if (get_base_dirs->base_dirs) {
13107 if ((get_base_dirs->count == 0) || (get_base_dirs->count > EXCLAVE_FS_GET_BASE_DIRS_MAX_COUNT)) {
13108 error = EINVAL;
13109 break;
13110 }
13111 dirs = kalloc_type(exclave_fs_base_dir_t, get_base_dirs->count, Z_WAITOK | Z_ZERO);
13112 if (!dirs) {
13113 error = ENOSPC;
13114 break;
13115 }
13116 }
13117 error = vfs_exclave_fs_get_base_dirs(dirs, &get_base_dirs->count);
13118 if (!error && dirs) {
13119 error = copyout(dirs, (user_addr_t)get_base_dirs->base_dirs,
13120 get_base_dirs->count * sizeof(exclave_fs_base_dir_t));
13121 }
13122 if (dirs) {
13123 kfree_type(exclave_fs_base_dir_t, get_base_dirs->count, dirs);
13124 }
13125 }
13126 break;
13127 #endif
13128
13129 default: {
13130 /*
13131 * Other, known commands shouldn't be passed down here.
13132 * (When adding a selector to this list, it may be prudent
13133 * to consider adding it to the list in sys_fcntl_nocancel() as well.)
13134 */
13135 switch (cmd) {
13136 case F_PUNCHHOLE:
13137 case F_TRIM_ACTIVE_FILE:
13138 case F_RDADVISE:
13139 case F_TRANSCODEKEY:
13140 case F_GETPROTECTIONLEVEL:
13141 case F_GETDEFAULTPROTLEVEL:
13142 case F_MAKECOMPRESSED:
13143 case F_SET_GREEDY_MODE:
13144 case F_SETSTATICCONTENT:
13145 case F_SETIOTYPE:
13146 case F_SETBACKINGSTORE:
13147 case F_GETPATH_MTMINFO:
13148 case APFSIOC_REVERT_TO_SNAPSHOT:
13149 case FSIOC_FIOSEEKHOLE:
13150 case FSIOC_FIOSEEKDATA:
13151 case HFS_GET_BOOT_INFO:
13152 case HFS_SET_BOOT_INFO:
13153 case FIOPINSWAP:
13154 case F_CHKCLEAN:
13155 case F_FULLFSYNC:
13156 case F_BARRIERFSYNC:
13157 case F_FREEZE_FS:
13158 case F_THAW_FS:
13159 case FSIOC_KERNEL_ROOTAUTH:
13160 case FSIOC_GRAFT_FS:
13161 case FSIOC_UNGRAFT_FS:
13162 case FSIOC_AUTH_FS:
13163 case F_SPECULATIVE_READ:
13164 case F_ATTRIBUTION_TAG:
13165 case F_TRANSFEREXTENTS:
13166 case F_ASSERT_BG_ACCESS:
13167 case F_RELEASE_BG_ACCESS:
13168 error = EINVAL;
13169 goto outdrop;
13170 }
13171 /* Invoke the filesystem-specific code */
13172 error = VNOP_IOCTL(vp, cmd, data, (int)options, ctx);
13173 }
13174 } /* end switch stmt */
13175
13176 /*
13177 * if no errors, copy any data to user. Size was
13178 * already set and checked above.
13179 */
13180 if (error == 0 && (cmd & IOC_OUT) && size) {
13181 error = copyout(data, udata, size);
13182 }
13183
13184 outdrop:
13185 if (memp) {
13186 kfree_data(memp, size);
13187 }
13188
13189 return error;
13190 }
13191
13192 /* ARGSUSED */
13193 int
fsctl(proc_t p,struct fsctl_args * uap,__unused int32_t * retval)13194 fsctl(proc_t p, struct fsctl_args *uap, __unused int32_t *retval)
13195 {
13196 int error;
13197 struct nameidata nd;
13198 uint32_t nameiflags;
13199 vnode_t vp = NULL;
13200 vfs_context_t ctx = vfs_context_current();
13201
13202 AUDIT_ARG(cmd, (int)uap->cmd);
13203 AUDIT_ARG(value32, uap->options);
13204 /* Get the vnode for the file we are getting info on: */
13205 nameiflags = 0;
13206 //
13207 // if we come through fsctl() then the file is by definition not open.
13208 // therefore for the FSIOC_FD_ONLY_OPEN_ONCE selector we return an error
13209 // lest the caller mistakenly thinks the only open is their own (but in
13210 // reality it's someone elses).
13211 //
13212 if (uap->cmd == FSIOC_FD_ONLY_OPEN_ONCE) {
13213 return EINVAL;
13214 }
13215 if ((uap->options & FSOPT_NOFOLLOW) == 0) {
13216 nameiflags |= FOLLOW;
13217 }
13218 if (uap->cmd == FSIOC_FIRMLINK_CTL) {
13219 nameiflags |= (CN_FIRMLINK_NOFOLLOW | NOCACHE);
13220 }
13221 NDINIT(&nd, LOOKUP, OP_FSCTL, nameiflags | AUDITVNPATH1,
13222 UIO_USERSPACE, uap->path, ctx);
13223 if ((error = namei(&nd))) {
13224 goto done;
13225 }
13226 vp = nd.ni_vp;
13227 nameidone(&nd);
13228
13229 #if CONFIG_MACF
13230 error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd);
13231 if (error) {
13232 goto done;
13233 }
13234 #endif
13235
13236 error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);
13237
13238 done:
13239 if (vp) {
13240 vnode_put(vp);
13241 }
13242 return error;
13243 }
13244 /* ARGSUSED */
13245 int
ffsctl(proc_t p,struct ffsctl_args * uap,__unused int32_t * retval)13246 ffsctl(proc_t p, struct ffsctl_args *uap, __unused int32_t *retval)
13247 {
13248 int error;
13249 vnode_t vp = NULL;
13250 vfs_context_t ctx = vfs_context_current();
13251 int fd = -1;
13252
13253 AUDIT_ARG(fd, uap->fd);
13254 AUDIT_ARG(cmd, (int)uap->cmd);
13255 AUDIT_ARG(value32, uap->options);
13256
13257 /* Get the vnode for the file we are getting info on: */
13258 if ((error = file_vnode(uap->fd, &vp))) {
13259 return error;
13260 }
13261 fd = uap->fd;
13262 if ((error = vnode_getwithref(vp))) {
13263 file_drop(fd);
13264 return error;
13265 }
13266
13267 #if CONFIG_MACF
13268 if ((error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd))) {
13269 file_drop(fd);
13270 vnode_put(vp);
13271 return error;
13272 }
13273 #endif
13274
13275 error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);
13276
13277 file_drop(fd);
13278
13279 /*validate vp; fsctl_internal() can drop iocount and reset vp to NULL*/
13280 if (vp) {
13281 vnode_put(vp);
13282 }
13283
13284 return error;
13285 }
13286 /* end of fsctl system call */
13287
13288 #define FILESEC_ACCESS_ENTITLEMENT \
13289 "com.apple.private.vfs.filesec-access"
13290
13291 static int
xattr_entitlement_check(const char * attrname,vfs_context_t ctx,bool setting)13292 xattr_entitlement_check(const char *attrname, vfs_context_t ctx, bool setting)
13293 {
13294 if (strcmp(attrname, KAUTH_FILESEC_XATTR) == 0) {
13295 /*
13296 * get: root and tasks with FILESEC_ACCESS_ENTITLEMENT.
13297 * set: only tasks with FILESEC_ACCESS_ENTITLEMENT.
13298 */
13299 if ((!setting && vfs_context_issuser(ctx)) ||
13300 IOTaskHasEntitlement(vfs_context_task(ctx),
13301 FILESEC_ACCESS_ENTITLEMENT)) {
13302 return 0;
13303 }
13304 }
13305
13306 return EPERM;
13307 }
13308
13309 /*
13310 * Retrieve the data of an extended attribute.
13311 */
int
getxattr(proc_t p, struct getxattr_args *uap, user_ssize_t *retval)
{
	vnode_t vp;
	struct nameidata nd;
	char attrname[XATTR_MAXNAMELEN + 1];
	vfs_context_t ctx = vfs_context_current();
	uio_t auio = NULL;
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	size_t attrsize = 0;
	size_t namelen;
	u_int32_t nameiflags;
	int error;
	UIO_STACKBUF(uio_buf, 1);

	/* XATTR_NOSECURITY and XATTR_NODEFAULT are kernel-internal options. */
	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	/* Resolve the path to a vnode, honoring the caller's follow options. */
	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_GETXATTR, nameiflags, spacetype, uap->path, ctx);
	if (uap->options & XATTR_NOFOLLOW_ANY) {
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}

	if ((error = namei(&nd))) {
		return error;
	}
	vp = nd.ni_vp;
	nameidone(&nd);

	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
	if (error != 0) {
		goto out;
	}
	/* Protected xattrs are readable only by root or entitled tasks. */
	if (xattr_protected(attrname) &&
	    (error = xattr_entitlement_check(attrname, ctx, false)) != 0) {
		goto out;
	}
	/*
	 * the specific check for 0xffffffff is a hack to preserve
	 * binary compatibility in K64 with applications that discovered
	 * that passing in a buf pointer and a size of -1 resulted in
	 * just the size of the indicated extended attribute being returned.
	 * this isn't part of the documented behavior, but because of the
	 * original implementation's check for "uap->size > 0", this behavior
	 * was allowed. In K32 that check turned into a signed comparison
	 * even though uap->size is unsigned... in K64, we blow by that
	 * check because uap->size is unsigned and doesn't get sign smeared
	 * in the munger for a 32 bit user app. we also need to add a
	 * check to limit the maximum size of the buffer being passed in...
	 * unfortunately, the underlying filesystems seem to just malloc
	 * the requested size even if the actual extended attribute is tiny.
	 * because that malloc is for kernel wired memory, we have to put a
	 * sane limit on it.
	 *
	 * U32 running on K64 will yield 0x00000000ffffffff for uap->size
	 * U64 running on K64 will yield -1 (64 bits wide)
	 * U32/U64 running on K32 will yield -1 (32 bits wide)
	 */
	if (uap->size == 0xffffffff || uap->size == (size_t)-1) {
		goto no_uio;
	}

	if (uap->value) {
		/* Clamp the request to the maximum xattr size we will wire. */
		if (uap->size > (size_t)XATTR_MAXSIZE) {
			uap->size = XATTR_MAXSIZE;
		}

		auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
		    &uio_buf[0], sizeof(uio_buf));
		uio_addiov(auio, uap->value, uap->size);
	}
no_uio:
	error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, ctx);
out:
	vnode_put(vp);

	/* With a uio, report bytes copied out; otherwise report the attr size. */
	if (auio) {
		*retval = uap->size - uio_resid(auio);
	} else {
		*retval = (user_ssize_t)attrsize;
	}

	return error;
}
13398
13399 /*
13400 * Retrieve the data of an extended attribute.
13401 */
13402 int
fgetxattr(proc_t p,struct fgetxattr_args * uap,user_ssize_t * retval)13403 fgetxattr(proc_t p, struct fgetxattr_args *uap, user_ssize_t *retval)
13404 {
13405 vnode_t vp;
13406 char attrname[XATTR_MAXNAMELEN + 1];
13407 vfs_context_t ctx = vfs_context_current();
13408 uio_t auio = NULL;
13409 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
13410 size_t attrsize = 0;
13411 size_t namelen;
13412 int error;
13413 UIO_STACKBUF(uio_buf, 1);
13414
13415 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT |
13416 XATTR_NOFOLLOW_ANY)) {
13417 return EINVAL;
13418 }
13419
13420 if ((error = file_vnode(uap->fd, &vp))) {
13421 return error;
13422 }
13423 if ((error = vnode_getwithref(vp))) {
13424 file_drop(uap->fd);
13425 return error;
13426 }
13427 error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
13428 if (error != 0) {
13429 goto out;
13430 }
13431 if (xattr_protected(attrname) &&
13432 (error = xattr_entitlement_check(attrname, ctx, false)) != 0) {
13433 goto out;
13434 }
13435 if (uap->value && uap->size > 0) {
13436 if (uap->size > (size_t)XATTR_MAXSIZE) {
13437 uap->size = XATTR_MAXSIZE;
13438 }
13439
13440 auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
13441 &uio_buf[0], sizeof(uio_buf));
13442 uio_addiov(auio, uap->value, uap->size);
13443 }
13444
13445 error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, vfs_context_current());
13446 out:
13447 (void)vnode_put(vp);
13448 file_drop(uap->fd);
13449
13450 if (auio) {
13451 *retval = uap->size - uio_resid(auio);
13452 } else {
13453 *retval = (user_ssize_t)attrsize;
13454 }
13455 return error;
13456 }
13457
/*
 * Scratch state for setxattr(); heap-allocated there, presumably to keep
 * the large nameidata and attribute-name buffers off the kernel stack —
 * confirm against stack-usage limits.
 */
struct setxattr_ctx {
	struct nameidata nd;
	char attrname[XATTR_MAXNAMELEN + 1];
	UIO_STACKBUF(uio_buf, 1);
};
13464
13465 /*
13466 * Set the data of an extended attribute.
13467 */
int
setxattr(proc_t p, struct setxattr_args *uap, int *retval)
{
	vnode_t vp;
	vfs_context_t ctx = vfs_context_current();
	uio_t auio = NULL;
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	size_t namelen;
	u_int32_t nameiflags;
	int error;
	struct setxattr_ctx *sactx;

	/* XATTR_NOSECURITY and XATTR_NODEFAULT are kernel-internal options. */
	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	/* Heap-allocate the scratch state (nameidata + attrname + uio buf). */
	sactx = kalloc_type(struct setxattr_ctx, Z_WAITOK);
	if (sactx == NULL) {
		return ENOMEM;
	}

	error = copyinstr(uap->attrname, sactx->attrname, sizeof(sactx->attrname), &namelen);
	if (error != 0) {
		if (error == EPERM) {
			/* if the string won't fit in attrname, copyinstr emits EPERM */
			error = ENAMETOOLONG;
		}
		/* Otherwise return the default error from copyinstr to detect ERANGE, etc */
		goto out;
	}
	/* Protected xattrs may only be set by suitably entitled tasks. */
	if (xattr_protected(sactx->attrname) &&
	    (error = xattr_entitlement_check(sactx->attrname, ctx, true)) != 0) {
		goto out;
	}
	/* A non-zero size requires a source buffer. */
	if (uap->size != 0 && uap->value == 0) {
		error = EINVAL;
		goto out;
	}
	if (uap->size > INT_MAX) {
		error = E2BIG;
		goto out;
	}

	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
#if CONFIG_FILE_LEASES
	/* Also resolve the parent so its directory lease can be broken below. */
	nameiflags |= WANTPARENT;
#endif
	NDINIT(&sactx->nd, LOOKUP, OP_SETXATTR, nameiflags, spacetype, uap->path, ctx);
	if (uap->options & XATTR_NOFOLLOW_ANY) {
		sactx->nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}

	if ((error = namei(&sactx->nd))) {
		goto out;
	}
	vp = sactx->nd.ni_vp;
#if CONFIG_FILE_LEASES
	vnode_breakdirlease(sactx->nd.ni_dvp, false, O_WRONLY);
	vnode_put(sactx->nd.ni_dvp);
#endif
	nameidone(&sactx->nd);

	/* Wrap the user buffer in a uio for the write into the filesystem. */
	auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
	    &sactx->uio_buf[0], sizeof(sactx->uio_buf));
	uio_addiov(auio, uap->value, uap->size);

	error = vn_setxattr(vp, sactx->attrname, auio, uap->options, ctx);
#if CONFIG_FSE
	if (error == 0) {
		add_fsevent(FSE_XATTR_MODIFIED, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
	}
#endif
	vnode_put(vp);
out:
	kfree_type(struct setxattr_ctx, sactx);
	*retval = 0;
	return error;
}
13548
13549 /*
13550 * Set the data of an extended attribute.
13551 */
13552 int
fsetxattr(proc_t p,struct fsetxattr_args * uap,int * retval)13553 fsetxattr(proc_t p, struct fsetxattr_args *uap, int *retval)
13554 {
13555 vnode_t vp;
13556 char attrname[XATTR_MAXNAMELEN + 1];
13557 vfs_context_t ctx = vfs_context_current();
13558 uio_t auio = NULL;
13559 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
13560 size_t namelen;
13561 int error;
13562 UIO_STACKBUF(uio_buf, 1);
13563
13564 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT |
13565 XATTR_NOFOLLOW_ANY)) {
13566 return EINVAL;
13567 }
13568
13569 error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
13570 if (error != 0) {
13571 if (error == EPERM) {
13572 /* if the string won't fit in attrname, copyinstr emits EPERM */
13573 return ENAMETOOLONG;
13574 }
13575 /* Otherwise return the default error from copyinstr to detect ERANGE, etc */
13576 return error;
13577 }
13578 if (xattr_protected(attrname) &&
13579 (error = xattr_entitlement_check(attrname, ctx, true)) != 0) {
13580 return error;
13581 }
13582 if (uap->size != 0 && uap->value == 0) {
13583 return EINVAL;
13584 }
13585 if (uap->size > INT_MAX) {
13586 return E2BIG;
13587 }
13588 if ((error = file_vnode(uap->fd, &vp))) {
13589 return error;
13590 }
13591 if ((error = vnode_getwithref(vp))) {
13592 file_drop(uap->fd);
13593 return error;
13594 }
13595
13596 #if CONFIG_FILE_LEASES
13597 vnode_breakdirlease(vp, true, O_WRONLY);
13598 #endif
13599
13600 auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
13601 &uio_buf[0], sizeof(uio_buf));
13602 uio_addiov(auio, uap->value, uap->size);
13603
13604 error = vn_setxattr(vp, attrname, auio, uap->options, vfs_context_current());
13605 #if CONFIG_FSE
13606 if (error == 0) {
13607 add_fsevent(FSE_XATTR_MODIFIED, ctx,
13608 FSE_ARG_VNODE, vp,
13609 FSE_ARG_DONE);
13610 }
13611 #endif
13612 vnode_put(vp);
13613 file_drop(uap->fd);
13614 *retval = 0;
13615 return error;
13616 }
13617
13618 /*
13619 * Remove an extended attribute.
13620 * XXX Code duplication here.
13621 */
13622 int
removexattr(proc_t p,struct removexattr_args * uap,int * retval)13623 removexattr(proc_t p, struct removexattr_args *uap, int *retval)
13624 {
13625 vnode_t vp;
13626 struct nameidata nd;
13627 char attrname[XATTR_MAXNAMELEN + 1];
13628 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
13629 vfs_context_t ctx = vfs_context_current();
13630 size_t namelen;
13631 u_int32_t nameiflags;
13632 int error;
13633
13634 if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
13635 return EINVAL;
13636 }
13637
13638 error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
13639 if (error != 0) {
13640 return error;
13641 }
13642 if (xattr_protected(attrname)) {
13643 return EPERM;
13644 }
13645 nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
13646 #if CONFIG_FILE_LEASES
13647 nameiflags |= WANTPARENT;
13648 #endif
13649 NDINIT(&nd, LOOKUP, OP_REMOVEXATTR, nameiflags, spacetype, uap->path, ctx);
13650 if (uap->options & XATTR_NOFOLLOW_ANY) {
13651 nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
13652 }
13653
13654 if ((error = namei(&nd))) {
13655 return error;
13656 }
13657 vp = nd.ni_vp;
13658 #if CONFIG_FILE_LEASES
13659 vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
13660 vnode_put(nd.ni_dvp);
13661 #endif
13662 nameidone(&nd);
13663
13664 error = vn_removexattr(vp, attrname, uap->options, ctx);
13665 #if CONFIG_FSE
13666 if (error == 0) {
13667 add_fsevent(FSE_XATTR_REMOVED, ctx,
13668 FSE_ARG_VNODE, vp,
13669 FSE_ARG_DONE);
13670 }
13671 #endif
13672 vnode_put(vp);
13673 *retval = 0;
13674 return error;
13675 }
13676
13677 /*
13678 * Remove an extended attribute.
13679 * XXX Code duplication here.
13680 */
int
fremovexattr(__unused proc_t p, struct fremovexattr_args *uap, int *retval)
{
	vnode_t vp;
	char attrname[XATTR_MAXNAMELEN + 1];
	size_t namelen;
	int error;
#if CONFIG_FSE
	/* The context is only needed here for posting the fsevent below. */
	vfs_context_t ctx = vfs_context_current();
#endif

	/*
	 * Follow/no-follow options are meaningless on an already-open fd,
	 * and XATTR_NOSECURITY / XATTR_NODEFAULT are kernel-internal.
	 */
	if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT |
	    XATTR_NOFOLLOW_ANY)) {
		return EINVAL;
	}

	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
	if (error != 0) {
		return error;
	}
	/* Protected xattrs may never be removed through this interface. */
	if (xattr_protected(attrname)) {
		return EPERM;
	}
	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

#if CONFIG_FILE_LEASES
	/* Break any relevant directory lease before the metadata write. */
	vnode_breakdirlease(vp, true, O_WRONLY);
#endif

	error = vn_removexattr(vp, attrname, uap->options, vfs_context_current());
#if CONFIG_FSE
	if (error == 0) {
		add_fsevent(FSE_XATTR_REMOVED, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
	}
#endif
	vnode_put(vp);
	file_drop(uap->fd);
	*retval = 0;
	return error;
}
13729
13730 /*
13731 * Retrieve the list of extended attribute names.
13732 * XXX Code duplication here.
13733 */
13734 int
listxattr(proc_t p,struct listxattr_args * uap,user_ssize_t * retval)13735 listxattr(proc_t p, struct listxattr_args *uap, user_ssize_t *retval)
13736 {
13737 vnode_t vp;
13738 struct nameidata nd;
13739 vfs_context_t ctx = vfs_context_current();
13740 uio_t auio = NULL;
13741 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
13742 size_t attrsize = 0;
13743 u_int32_t nameiflags;
13744 int error;
13745 UIO_STACKBUF(uio_buf, 1);
13746
13747 if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
13748 return EINVAL;
13749 }
13750
13751 nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
13752 NDINIT(&nd, LOOKUP, OP_LISTXATTR, nameiflags, spacetype, uap->path, ctx);
13753 if (uap->options & XATTR_NOFOLLOW_ANY) {
13754 nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
13755 }
13756
13757 if ((error = namei(&nd))) {
13758 return error;
13759 }
13760 vp = nd.ni_vp;
13761 nameidone(&nd);
13762 if (uap->namebuf != 0 && uap->bufsize > 0) {
13763 auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ,
13764 &uio_buf[0], sizeof(uio_buf));
13765 uio_addiov(auio, uap->namebuf, uap->bufsize);
13766 }
13767
13768 error = vn_listxattr(vp, auio, &attrsize, uap->options, ctx);
13769
13770 vnode_put(vp);
13771 if (auio) {
13772 *retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
13773 } else {
13774 *retval = (user_ssize_t)attrsize;
13775 }
13776 return error;
13777 }
13778
13779 /*
13780 * Retrieve the list of extended attribute names.
13781 * XXX Code duplication here.
13782 */
13783 int
flistxattr(proc_t p,struct flistxattr_args * uap,user_ssize_t * retval)13784 flistxattr(proc_t p, struct flistxattr_args *uap, user_ssize_t *retval)
13785 {
13786 vnode_t vp;
13787 uio_t auio = NULL;
13788 int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
13789 size_t attrsize = 0;
13790 int error;
13791 UIO_STACKBUF(uio_buf, 1);
13792
13793 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT |
13794 XATTR_NOFOLLOW_ANY)) {
13795 return EINVAL;
13796 }
13797
13798 if ((error = file_vnode(uap->fd, &vp))) {
13799 return error;
13800 }
13801 if ((error = vnode_getwithref(vp))) {
13802 file_drop(uap->fd);
13803 return error;
13804 }
13805 if (uap->namebuf != 0 && uap->bufsize > 0) {
13806 auio = uio_createwithbuffer(1, 0, spacetype,
13807 UIO_READ, &uio_buf[0], sizeof(uio_buf));
13808 uio_addiov(auio, uap->namebuf, uap->bufsize);
13809 }
13810
13811 error = vn_listxattr(vp, auio, &attrsize, uap->options, vfs_context_current());
13812
13813 vnode_put(vp);
13814 file_drop(uap->fd);
13815 if (auio) {
13816 *retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
13817 } else {
13818 *retval = (user_ssize_t)attrsize;
13819 }
13820 return error;
13821 }
13822
13823 int
fsgetpath_internal(vfs_context_t ctx,int volfs_id,uint64_t objid,vm_size_t bufsize,caddr_t buf,uint32_t options,int * pathlen)13824 fsgetpath_internal(vfs_context_t ctx, int volfs_id, uint64_t objid,
13825 vm_size_t bufsize, caddr_t buf, uint32_t options, int *pathlen)
13826 {
13827 int error;
13828 struct mount *mp = NULL;
13829 vnode_t vp;
13830 int length;
13831 int bpflags;
13832 /* maximum number of times to retry build_path */
13833 unsigned int retries = 0x10;
13834
13835 if (bufsize > MAXLONGPATHLEN) {
13836 return EINVAL;
13837 }
13838
13839 if (buf == NULL) {
13840 return ENOMEM;
13841 }
13842
13843 retry:
13844 if ((mp = mount_lookupby_volfsid(volfs_id, 1)) == NULL) {
13845 error = ENOTSUP; /* unexpected failure */
13846 return ENOTSUP;
13847 }
13848
13849 #if CONFIG_UNION_MOUNTS
13850 unionget:
13851 #endif /* CONFIG_UNION_MOUNTS */
13852 if (objid == 2) {
13853 struct vfs_attr vfsattr;
13854 int use_vfs_root = TRUE;
13855
13856 VFSATTR_INIT(&vfsattr);
13857 VFSATTR_WANTED(&vfsattr, f_capabilities);
13858 if (!(options & FSOPT_ISREALFSID) &&
13859 vfs_getattr(mp, &vfsattr, vfs_context_kernel()) == 0 &&
13860 VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
13861 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS) &&
13862 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS)) {
13863 use_vfs_root = FALSE;
13864 }
13865 }
13866
13867 if (use_vfs_root) {
13868 error = VFS_ROOT(mp, &vp, ctx);
13869 } else {
13870 error = VFS_VGET(mp, objid, &vp, ctx);
13871 }
13872 } else {
13873 error = VFS_VGET(mp, (ino64_t)objid, &vp, ctx);
13874 }
13875
13876 #if CONFIG_UNION_MOUNTS
13877 if (error == ENOENT && (mp->mnt_flag & MNT_UNION)) {
13878 /*
13879 * If the fileid isn't found and we're in a union
13880 * mount volume, then see if the fileid is in the
13881 * mounted-on volume.
13882 */
13883 struct mount *tmp = mp;
13884 mp = vnode_mount(tmp->mnt_vnodecovered);
13885 vfs_unbusy(tmp);
13886 if (vfs_busy(mp, LK_NOWAIT) == 0) {
13887 goto unionget;
13888 }
13889 } else {
13890 vfs_unbusy(mp);
13891 }
13892 #else
13893 vfs_unbusy(mp);
13894 #endif /* CONFIG_UNION_MOUNTS */
13895
13896 if (error) {
13897 return error;
13898 }
13899
13900 #if CONFIG_MACF
13901 error = mac_vnode_check_fsgetpath(ctx, vp);
13902 if (error) {
13903 vnode_put(vp);
13904 return error;
13905 }
13906 #endif
13907
13908 /* Obtain the absolute path to this vnode. */
13909 bpflags = vfs_context_suser(ctx) ? BUILDPATH_CHECKACCESS : 0;
13910 if (options & FSOPT_NOFIRMLINKPATH) {
13911 bpflags |= BUILDPATH_NO_FIRMLINK;
13912 }
13913 bpflags |= BUILDPATH_CHECK_MOVED;
13914 error = build_path(vp, buf, (int)bufsize, &length, bpflags, ctx);
13915 vnode_put(vp);
13916
13917 if (error) {
13918 /* there was a race building the path, try a few more times */
13919 if (error == EAGAIN) {
13920 --retries;
13921 if (retries > 0) {
13922 goto retry;
13923 }
13924
13925 error = ENOENT;
13926 }
13927 goto out;
13928 }
13929
13930 AUDIT_ARG(text, buf);
13931
13932 if (kdebug_debugid_enabled(VFS_LOOKUP) && length > 0) {
13933 kdebug_vfs_lookup(buf, length, vp, KDBG_VFSLKUP_LOOKUP);
13934 }
13935
13936 *pathlen = length; /* may be superseded by error */
13937
13938 out:
13939 return error;
13940 }
13941
13942 /*
13943 * Obtain the full pathname of a file system object by id.
13944 */
static int
fsgetpath_extended(user_addr_t buf, user_size_t bufsize, user_addr_t user_fsid, uint64_t objid,
    uint32_t options, user_ssize_t *retval)
{
	vfs_context_t ctx = vfs_context_current();
	fsid_t fsid;
	char *realpath;
	int length;
	int error;

	/* Only these two options are understood on this path. */
	if (options & ~(FSOPT_NOFIRMLINKPATH | FSOPT_ISREALFSID)) {
		return EINVAL;
	}

	/* Fetch the fsid the caller is asking about from user space. */
	if ((error = copyin(user_fsid, (caddr_t)&fsid, sizeof(fsid)))) {
		return error;
	}
	AUDIT_ARG(value32, fsid.val[0]);
	AUDIT_ARG(value64, objid);
	/* Restrict output buffer size for now. */

	if (bufsize > MAXLONGPATHLEN || bufsize <= 0) {
		return EINVAL;
	}
	/* Build the path in a kernel buffer, then copy it out on success. */
	realpath = kalloc_data(bufsize, Z_WAITOK | Z_ZERO);
	if (realpath == NULL) {
		return ENOMEM;
	}

	error = fsgetpath_internal(ctx, fsid.val[0], objid, bufsize, realpath,
	    options, &length);

	if (error) {
		goto out;
	}

	error = copyout((caddr_t)realpath, buf, length);

	*retval = (user_ssize_t)length; /* may be superseded by error */
out:
	kfree_data(realpath, bufsize);
	return error;
}
13988
int
fsgetpath(__unused proc_t p, struct fsgetpath_args *uap, user_ssize_t *retval)
{
	/* Legacy entry point: same as fsgetpath_ext() with no options. */
	return fsgetpath_extended(uap->buf, uap->bufsize, uap->fsid, uap->objid,
	    0, retval);
}
13995
int
fsgetpath_ext(__unused proc_t p, struct fsgetpath_ext_args *uap, user_ssize_t *retval)
{
	/* Extended entry point: forwards the caller-supplied options word. */
	return fsgetpath_extended(uap->buf, uap->bufsize, uap->fsid, uap->objid,
	    uap->options, retval);
}
14002
14003 /*
14004 * Common routine to handle various flavors of statfs data heading out
14005 * to user space.
14006 *
14007 * Returns: 0 Success
14008 * EFAULT
14009 */
14010 static int
munge_statfs(struct mount * mp,struct vfsstatfs * sfsp,user_addr_t bufp,int * sizep,boolean_t is_64_bit,boolean_t partial_copy)14011 munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
14012 user_addr_t bufp, int *sizep, boolean_t is_64_bit,
14013 boolean_t partial_copy)
14014 {
14015 int error;
14016 int my_size, copy_size;
14017
14018 if (is_64_bit) {
14019 struct user64_statfs sfs;
14020 my_size = copy_size = sizeof(sfs);
14021 bzero(&sfs, my_size);
14022 sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
14023 sfs.f_type = (short)mp->mnt_vtable->vfc_typenum;
14024 sfs.f_reserved1 = (short)sfsp->f_fssubtype;
14025 sfs.f_bsize = (user64_long_t)sfsp->f_bsize;
14026 sfs.f_iosize = (user64_long_t)sfsp->f_iosize;
14027 sfs.f_blocks = (user64_long_t)sfsp->f_blocks;
14028 sfs.f_bfree = (user64_long_t)sfsp->f_bfree;
14029 sfs.f_bavail = (user64_long_t)sfsp->f_bavail;
14030 sfs.f_files = (user64_long_t)sfsp->f_files;
14031 sfs.f_ffree = (user64_long_t)sfsp->f_ffree;
14032 sfs.f_fsid = sfsp->f_fsid;
14033 sfs.f_owner = sfsp->f_owner;
14034 vfs_getfstypename(mp, sfs.f_fstypename, MFSNAMELEN);
14035 strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
14036 strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);
14037
14038 if (partial_copy) {
14039 copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
14040 }
14041 error = copyout((caddr_t)&sfs, bufp, copy_size);
14042 } else {
14043 struct user32_statfs sfs;
14044
14045 my_size = copy_size = sizeof(sfs);
14046 bzero(&sfs, my_size);
14047
14048 sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
14049 sfs.f_type = (short)mp->mnt_vtable->vfc_typenum;
14050 sfs.f_reserved1 = (short)sfsp->f_fssubtype;
14051
14052 /*
14053 * It's possible for there to be more than 2^^31 blocks in the filesystem, so we
14054 * have to fudge the numbers here in that case. We inflate the blocksize in order
14055 * to reflect the filesystem size as best we can.
14056 */
14057 if ((sfsp->f_blocks > INT_MAX)
14058 /* Hack for 4061702 . I think the real fix is for Carbon to
14059 * look for some volume capability and not depend on hidden
14060 * semantics agreed between a FS and carbon.
14061 * f_blocks, f_bfree, and f_bavail set to -1 is the trigger
14062 * for Carbon to set bNoVolumeSizes volume attribute.
14063 * Without this the webdavfs files cannot be copied onto
14064 * disk as they look huge. This change should not affect
14065 * XSAN as they should not setting these to -1..
14066 */
14067 && (sfsp->f_blocks != 0xffffffffffffffffULL)
14068 && (sfsp->f_bfree != 0xffffffffffffffffULL)
14069 && (sfsp->f_bavail != 0xffffffffffffffffULL)) {
14070 int shift;
14071
14072 /*
14073 * Work out how far we have to shift the block count down to make it fit.
14074 * Note that it's possible to have to shift so far that the resulting
14075 * blocksize would be unreportably large. At that point, we will clip
14076 * any values that don't fit.
14077 *
14078 * For safety's sake, we also ensure that f_iosize is never reported as
14079 * being smaller than f_bsize.
14080 */
14081 for (shift = 0; shift < 32; shift++) {
14082 if ((sfsp->f_blocks >> shift) <= INT_MAX) {
14083 break;
14084 }
14085 if ((sfsp->f_bsize << (shift + 1)) > INT_MAX) {
14086 break;
14087 }
14088 }
14089 #define __SHIFT_OR_CLIP(x, s) ((((x) >> (s)) > INT_MAX) ? INT_MAX : ((x) >> (s)))
14090 sfs.f_blocks = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_blocks, shift);
14091 sfs.f_bfree = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bfree, shift);
14092 sfs.f_bavail = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bavail, shift);
14093 #undef __SHIFT_OR_CLIP
14094 sfs.f_bsize = (user32_long_t)(sfsp->f_bsize << shift);
14095 sfs.f_iosize = (int)lmax(sfsp->f_iosize, sfsp->f_bsize);
14096 } else {
14097 /* filesystem is small enough to be reported honestly */
14098 sfs.f_bsize = (user32_long_t)sfsp->f_bsize;
14099 sfs.f_iosize = (user32_long_t)sfsp->f_iosize;
14100 sfs.f_blocks = (user32_long_t)sfsp->f_blocks;
14101 sfs.f_bfree = (user32_long_t)sfsp->f_bfree;
14102 sfs.f_bavail = (user32_long_t)sfsp->f_bavail;
14103 }
14104 sfs.f_files = (user32_long_t)sfsp->f_files;
14105 sfs.f_ffree = (user32_long_t)sfsp->f_ffree;
14106 sfs.f_fsid = sfsp->f_fsid;
14107 sfs.f_owner = sfsp->f_owner;
14108 vfs_getfstypename(mp, sfs.f_fstypename, MFSNAMELEN);
14109 strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
14110 strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);
14111
14112 if (partial_copy) {
14113 copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
14114 }
14115 error = copyout((caddr_t)&sfs, bufp, copy_size);
14116 }
14117
14118 if (sizep != NULL) {
14119 *sizep = my_size;
14120 }
14121 return error;
14122 }
14123
14124 /*
14125 * copy stat structure into user_stat structure.
14126 */
14127 void
munge_user64_stat(struct stat * sbp,struct user64_stat * usbp)14128 munge_user64_stat(struct stat *sbp, struct user64_stat *usbp)
14129 {
14130 bzero(usbp, sizeof(*usbp));
14131
14132 usbp->st_dev = sbp->st_dev;
14133 usbp->st_ino = sbp->st_ino;
14134 usbp->st_mode = sbp->st_mode;
14135 usbp->st_nlink = sbp->st_nlink;
14136 usbp->st_uid = sbp->st_uid;
14137 usbp->st_gid = sbp->st_gid;
14138 usbp->st_rdev = sbp->st_rdev;
14139 #ifndef _POSIX_C_SOURCE
14140 usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
14141 usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
14142 usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
14143 usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
14144 usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
14145 usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
14146 #else
14147 usbp->st_atime = sbp->st_atime;
14148 usbp->st_atimensec = sbp->st_atimensec;
14149 usbp->st_mtime = sbp->st_mtime;
14150 usbp->st_mtimensec = sbp->st_mtimensec;
14151 usbp->st_ctime = sbp->st_ctime;
14152 usbp->st_ctimensec = sbp->st_ctimensec;
14153 #endif
14154 usbp->st_size = sbp->st_size;
14155 usbp->st_blocks = sbp->st_blocks;
14156 usbp->st_blksize = sbp->st_blksize;
14157 usbp->st_flags = sbp->st_flags;
14158 usbp->st_gen = sbp->st_gen;
14159 usbp->st_lspare = sbp->st_lspare;
14160 usbp->st_qspare[0] = sbp->st_qspare[0];
14161 usbp->st_qspare[1] = sbp->st_qspare[1];
14162 }
14163
/*
 * Copy the kernel stat structure into the 32-bit user ABI layout.
 *
 * Timestamp seconds/nanoseconds are explicitly narrowed to the 32-bit
 * user types (user32_time_t / user32_long_t); out-of-range values are
 * truncated by those casts. The target is zeroed first so structure
 * padding never leaks kernel stack contents to user space.
 */
void
munge_user32_stat(struct stat *sbp, struct user32_stat *usbp)
{
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
	/* Timestamp field names differ depending on the POSIX namespace mode. */
#ifndef _POSIX_C_SOURCE
	usbp->st_atimespec.tv_sec = (user32_time_t)sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = (user32_long_t)sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = (user32_time_t)sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = (user32_long_t)sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = (user32_time_t)sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = (user32_long_t)sbp->st_ctimespec.tv_nsec;
#else
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
14200
14201 /*
14202 * copy stat64 structure into user_stat64 structure.
14203 */
14204 void
munge_user64_stat64(struct stat64 * sbp,struct user64_stat64 * usbp)14205 munge_user64_stat64(struct stat64 *sbp, struct user64_stat64 *usbp)
14206 {
14207 bzero(usbp, sizeof(*usbp));
14208
14209 usbp->st_dev = sbp->st_dev;
14210 usbp->st_ino = sbp->st_ino;
14211 usbp->st_mode = sbp->st_mode;
14212 usbp->st_nlink = sbp->st_nlink;
14213 usbp->st_uid = sbp->st_uid;
14214 usbp->st_gid = sbp->st_gid;
14215 usbp->st_rdev = sbp->st_rdev;
14216 #ifndef _POSIX_C_SOURCE
14217 usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
14218 usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
14219 usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
14220 usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
14221 usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
14222 usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
14223 usbp->st_birthtimespec.tv_sec = sbp->st_birthtimespec.tv_sec;
14224 usbp->st_birthtimespec.tv_nsec = sbp->st_birthtimespec.tv_nsec;
14225 #else
14226 usbp->st_atime = sbp->st_atime;
14227 usbp->st_atimensec = sbp->st_atimensec;
14228 usbp->st_mtime = sbp->st_mtime;
14229 usbp->st_mtimensec = sbp->st_mtimensec;
14230 usbp->st_ctime = sbp->st_ctime;
14231 usbp->st_ctimensec = sbp->st_ctimensec;
14232 usbp->st_birthtime = sbp->st_birthtime;
14233 usbp->st_birthtimensec = sbp->st_birthtimensec;
14234 #endif
14235 usbp->st_size = sbp->st_size;
14236 usbp->st_blocks = sbp->st_blocks;
14237 usbp->st_blksize = sbp->st_blksize;
14238 usbp->st_flags = sbp->st_flags;
14239 usbp->st_gen = sbp->st_gen;
14240 usbp->st_lspare = sbp->st_lspare;
14241 usbp->st_qspare[0] = sbp->st_qspare[0];
14242 usbp->st_qspare[1] = sbp->st_qspare[1];
14243 }
14244
/*
 * Copy the kernel stat64 structure into the 32-bit user ABI layout,
 * including the birth (creation) timestamp.
 *
 * Timestamp seconds/nanoseconds are explicitly narrowed to the 32-bit
 * user types (user32_time_t / user32_long_t); out-of-range values are
 * truncated by those casts. The target is zeroed first so structure
 * padding never leaks kernel stack contents to user space.
 */
void
munge_user32_stat64(struct stat64 *sbp, struct user32_stat64 *usbp)
{
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
	/* Timestamp field names differ depending on the POSIX namespace mode. */
#ifndef _POSIX_C_SOURCE
	usbp->st_atimespec.tv_sec = (user32_time_t)sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = (user32_long_t)sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = (user32_time_t)sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = (user32_long_t)sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = (user32_time_t)sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = (user32_long_t)sbp->st_ctimespec.tv_nsec;
	usbp->st_birthtimespec.tv_sec = (user32_time_t)sbp->st_birthtimespec.tv_sec;
	usbp->st_birthtimespec.tv_nsec = (user32_long_t)sbp->st_birthtimespec.tv_nsec;
#else
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
	usbp->st_birthtime = sbp->st_birthtime;
	usbp->st_birthtimensec = sbp->st_birthtimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
14285
14286 /*
14287 * Purge buffer cache for simulating cold starts
14288 */
14289 static int
vnode_purge_callback(struct vnode * vp,__unused void * cargs)14290 vnode_purge_callback(struct vnode *vp, __unused void *cargs)
14291 {
14292 ubc_msync(vp, (off_t)0, ubc_getsize(vp), NULL /* off_t *resid_off */, UBC_PUSHALL | UBC_INVALIDATE);
14293
14294 return VNODE_RETURNED;
14295 }
14296
14297 static int
vfs_purge_callback(mount_t mp,__unused void * arg)14298 vfs_purge_callback(mount_t mp, __unused void * arg)
14299 {
14300 vnode_iterate(mp, VNODE_WAIT | VNODE_ITERATE_ALL, vnode_purge_callback, NULL);
14301
14302 return VFS_RETURNED;
14303 }
14304
/* Boot-arg tunable and sysctl (vfs.purge_vm_pagers): when non-zero,
 * vfs_purge() additionally purges file-backed VM pagers. */
static TUNABLE_WRITEABLE(boolean_t, vfs_purge_vm_pagers, "vfs_purge_vm_pagers", TRUE);
SYSCTL_INT(_vfs, OID_AUTO, purge_vm_pagers, CTLFLAG_RW | CTLFLAG_LOCKED, &vfs_purge_vm_pagers, 0, "VFS purge also purges file-backed VM pagers");
14307
14308 int
vfs_purge(__unused struct proc * p,__unused struct vfs_purge_args * uap,__unused int32_t * retval)14309 vfs_purge(__unused struct proc *p, __unused struct vfs_purge_args *uap, __unused int32_t *retval)
14310 {
14311 if (!kauth_cred_issuser(kauth_cred_get())) {
14312 return EPERM;
14313 }
14314
14315 vfs_iterate(0 /* flags */, vfs_purge_callback, NULL);
14316
14317 /* also flush any VM pagers backed by files */
14318 if (vfs_purge_vm_pagers) {
14319 vm_purge_filebacked_pagers();
14320 }
14321
14322 return 0;
14323 }
14324
14325 /*
14326 * gets the vnode associated with the (unnamed) snapshot directory
14327 * for a Filesystem. The snapshot directory vnode is returned with
14328 * an iocount on it.
14329 */
14330 int
vnode_get_snapdir(vnode_t rvp,vnode_t * sdvpp,vfs_context_t ctx)14331 vnode_get_snapdir(vnode_t rvp, vnode_t *sdvpp, vfs_context_t ctx)
14332 {
14333 return VFS_VGET_SNAPDIR(vnode_mount(rvp), sdvpp, ctx);
14334 }
14335
14336 /*
14337 * Get the snapshot vnode.
14338 *
14339 * If successful, the call returns with an iocount on *rvpp ,*sdvpp and
14340 * needs nameidone() on ndp.
14341 *
14342 * If the snapshot vnode exists it is returned in ndp->ni_vp.
14343 *
14344 * If it returns with an error, *rvpp, *sdvpp are NULL and nameidone() is
14345 * not needed.
14346 */
14347 static int
vnode_get_snapshot(int dirfd,vnode_t * rvpp,vnode_t * sdvpp,user_addr_t name,struct nameidata * ndp,int32_t op,__unused enum path_operation pathop,vfs_context_t ctx)14348 vnode_get_snapshot(int dirfd, vnode_t *rvpp, vnode_t *sdvpp,
14349 user_addr_t name, struct nameidata *ndp, int32_t op,
14350 #if !CONFIG_TRIGGERS
14351 __unused
14352 #endif
14353 enum path_operation pathop,
14354 vfs_context_t ctx)
14355 {
14356 int error, i;
14357 caddr_t name_buf;
14358 size_t name_len;
14359 struct vfs_attr vfa;
14360
14361 *sdvpp = NULLVP;
14362 *rvpp = NULLVP;
14363
14364 error = vnode_getfromfd(ctx, dirfd, rvpp);
14365 if (error) {
14366 return error;
14367 }
14368
14369 if (!vnode_isvroot(*rvpp)) {
14370 error = EINVAL;
14371 goto out;
14372 }
14373
14374 /* Make sure the filesystem supports snapshots */
14375 VFSATTR_INIT(&vfa);
14376 VFSATTR_WANTED(&vfa, f_capabilities);
14377 if ((vfs_getattr(vnode_mount(*rvpp), &vfa, ctx) != 0) ||
14378 !VFSATTR_IS_SUPPORTED(&vfa, f_capabilities) ||
14379 !((vfa.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] &
14380 VOL_CAP_INT_SNAPSHOT)) ||
14381 !((vfa.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] &
14382 VOL_CAP_INT_SNAPSHOT))) {
14383 error = ENOTSUP;
14384 goto out;
14385 }
14386
14387 error = vnode_get_snapdir(*rvpp, sdvpp, ctx);
14388 if (error) {
14389 goto out;
14390 }
14391
14392 name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
14393 error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
14394 if (error) {
14395 goto out1;
14396 }
14397
14398 /*
14399 * Some sanity checks- name can't be empty, "." or ".." or have slashes.
14400 * (the length returned by copyinstr includes the terminating NUL)
14401 */
14402 if ((name_len == 1) || (name_len == 2 && name_buf[0] == '.') ||
14403 (name_len == 3 && name_buf[0] == '.' && name_buf[1] == '.')) {
14404 error = EINVAL;
14405 goto out1;
14406 }
14407 for (i = 0; i < (int)name_len && name_buf[i] != '/'; i++) {
14408 ;
14409 }
14410 if (i < (int)name_len) {
14411 error = EINVAL;
14412 goto out1;
14413 }
14414
14415 #if CONFIG_MACF
14416 if (op == CREATE) {
14417 error = mac_mount_check_snapshot_create(ctx, vnode_mount(*rvpp),
14418 name_buf);
14419 } else if (op == DELETE) {
14420 error = mac_mount_check_snapshot_delete(ctx, vnode_mount(*rvpp),
14421 name_buf);
14422 }
14423 if (error) {
14424 goto out1;
14425 }
14426 #endif
14427
14428 /* Check if the snapshot already exists ... */
14429 NDINIT(ndp, op, pathop, USEDVP | NOCACHE | AUDITVNPATH1,
14430 UIO_SYSSPACE, CAST_USER_ADDR_T(name_buf), ctx);
14431 ndp->ni_dvp = *sdvpp;
14432
14433 error = namei(ndp);
14434 out1:
14435 zfree(ZV_NAMEI, name_buf);
14436 out:
14437 if (error) {
14438 if (*sdvpp) {
14439 vnode_put(*sdvpp);
14440 *sdvpp = NULLVP;
14441 }
14442 if (*rvpp) {
14443 vnode_put(*rvpp);
14444 *rvpp = NULLVP;
14445 }
14446 }
14447 return error;
14448 }
14449
14450 /*
14451 * create a filesystem snapshot (for supporting filesystems)
14452 *
14453 * A much simplified version of openat(dirfd, name, O_CREAT | O_EXCL)
14454 * We get to the (unnamed) snapshot directory vnode and create the vnode
14455 * for the snapshot in it.
14456 *
14457 * Restrictions:
14458 *
14459 * a) Passed in name for snapshot cannot have slashes.
14460 * b) name can't be "." or ".."
14461 *
14462 * Since this requires superuser privileges, vnode_authorize calls are not
14463 * made.
14464 */
14465 static int __attribute__((noinline))
snapshot_create(int dirfd,user_addr_t name,__unused uint32_t flags,vfs_context_t ctx)14466 snapshot_create(int dirfd, user_addr_t name, __unused uint32_t flags,
14467 vfs_context_t ctx)
14468 {
14469 vnode_t rvp, snapdvp;
14470 int error;
14471 struct nameidata *ndp;
14472
14473 ndp = kalloc_type(struct nameidata, Z_WAITOK);
14474
14475 error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, ndp, CREATE,
14476 OP_LINK, ctx);
14477 if (error) {
14478 goto out;
14479 }
14480
14481 if (ndp->ni_vp) {
14482 vnode_put(ndp->ni_vp);
14483 error = EEXIST;
14484 } else {
14485 struct vnode_attr *vap;
14486 vnode_t vp = NULLVP;
14487
14488 vap = kalloc_type(struct vnode_attr, Z_WAITOK);
14489
14490 VATTR_INIT(vap);
14491 VATTR_SET(vap, va_type, VREG);
14492 VATTR_SET(vap, va_mode, 0);
14493
14494 error = vn_create(snapdvp, &vp, ndp, vap,
14495 VN_CREATE_NOAUTH | VN_CREATE_NOINHERIT, 0, NULL, ctx);
14496 if (!error && vp) {
14497 vnode_put(vp);
14498 }
14499
14500 kfree_type(struct vnode_attr, vap);
14501 }
14502
14503 nameidone(ndp);
14504 vnode_put(snapdvp);
14505 vnode_put(rvp);
14506 out:
14507 kfree_type(struct nameidata, ndp);
14508
14509 return error;
14510 }
14511
14512 /*
14513 * Delete a Filesystem snapshot
14514 *
14515 * get the vnode for the unnamed snapshot directory and the snapshot and
14516 * delete the snapshot.
14517 */
14518 static int __attribute__((noinline))
snapshot_delete(int dirfd,user_addr_t name,__unused uint32_t flags,vfs_context_t ctx)14519 snapshot_delete(int dirfd, user_addr_t name, __unused uint32_t flags,
14520 vfs_context_t ctx)
14521 {
14522 vnode_t rvp, snapdvp;
14523 int error;
14524 struct nameidata *ndp;
14525
14526 ndp = kalloc_type(struct nameidata, Z_WAITOK);
14527
14528 error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, ndp, DELETE,
14529 OP_UNLINK, ctx);
14530 if (error) {
14531 goto out;
14532 }
14533
14534 error = VNOP_REMOVE(snapdvp, ndp->ni_vp, &ndp->ni_cnd,
14535 VNODE_REMOVE_SKIP_NAMESPACE_EVENT, ctx);
14536
14537 vnode_put(ndp->ni_vp);
14538 nameidone(ndp);
14539 vnode_put(snapdvp);
14540 vnode_put(rvp);
14541 out:
14542 kfree_type(struct nameidata, ndp);
14543
14544 return error;
14545 }
14546
14547 /*
14548 * Revert a filesystem to a snapshot
14549 *
14550 * Marks the filesystem to revert to the given snapshot on next mount.
14551 */
14552 static int __attribute__((noinline))
snapshot_revert(int dirfd,user_addr_t name,__unused uint32_t flags,vfs_context_t ctx)14553 snapshot_revert(int dirfd, user_addr_t name, __unused uint32_t flags,
14554 vfs_context_t ctx)
14555 {
14556 int error;
14557 vnode_t rvp;
14558 mount_t mp;
14559 struct fs_snapshot_revert_args revert_data;
14560 struct componentname cnp;
14561 caddr_t name_buf;
14562 size_t name_len;
14563
14564 error = vnode_getfromfd(ctx, dirfd, &rvp);
14565 if (error) {
14566 return error;
14567 }
14568 mp = vnode_mount(rvp);
14569
14570 name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
14571 error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
14572 if (error) {
14573 zfree(ZV_NAMEI, name_buf);
14574 vnode_put(rvp);
14575 return error;
14576 }
14577
14578 #if CONFIG_MACF
14579 error = mac_mount_check_snapshot_revert(ctx, mp, name_buf);
14580 if (error) {
14581 zfree(ZV_NAMEI, name_buf);
14582 vnode_put(rvp);
14583 return error;
14584 }
14585 #endif
14586
14587 /*
14588 * Grab mount_iterref so that we can release the vnode,
14589 * since VFSIOC_REVERT_SNAPSHOT could conceivably cause a sync.
14590 */
14591 error = mount_iterref(mp, 0);
14592 vnode_put(rvp);
14593 if (error) {
14594 zfree(ZV_NAMEI, name_buf);
14595 return error;
14596 }
14597
14598 memset(&cnp, 0, sizeof(cnp));
14599 cnp.cn_pnbuf = (char *)name_buf;
14600 cnp.cn_nameiop = LOOKUP;
14601 cnp.cn_flags = ISLASTCN | HASBUF;
14602 cnp.cn_pnlen = MAXPATHLEN;
14603 cnp.cn_nameptr = cnp.cn_pnbuf;
14604 cnp.cn_namelen = (int)name_len;
14605 revert_data.sr_cnp = &cnp;
14606
14607 error = VFS_IOCTL(mp, VFSIOC_REVERT_SNAPSHOT, (caddr_t)&revert_data, 0, ctx);
14608 mount_iterdrop(mp);
14609 zfree(ZV_NAMEI, name_buf);
14610
14611 if (error) {
14612 /* If there was any error, try again using VNOP_IOCTL */
14613
14614 vnode_t snapdvp;
14615 struct nameidata namend;
14616
14617 error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, &namend, LOOKUP,
14618 OP_LOOKUP, ctx);
14619 if (error) {
14620 return error;
14621 }
14622
14623
14624 error = VNOP_IOCTL(namend.ni_vp, APFSIOC_REVERT_TO_SNAPSHOT, (caddr_t) NULL,
14625 0, ctx);
14626
14627 vnode_put(namend.ni_vp);
14628 nameidone(&namend);
14629 vnode_put(snapdvp);
14630 vnode_put(rvp);
14631 }
14632
14633 return error;
14634 }
14635
14636 /*
14637 * rename a Filesystem snapshot
14638 *
14639 * get the vnode for the unnamed snapshot directory and the snapshot and
14640 * rename the snapshot. This is a very specialised (and simple) case of
14641 * rename(2) (which has to deal with a lot more complications). It differs
14642 * slightly from rename(2) in that EEXIST is returned if the new name exists.
14643 */
14644 static int __attribute__((noinline))
snapshot_rename(int dirfd,user_addr_t old,user_addr_t new,__unused uint32_t flags,vfs_context_t ctx)14645 snapshot_rename(int dirfd, user_addr_t old, user_addr_t new,
14646 __unused uint32_t flags, vfs_context_t ctx)
14647 {
14648 vnode_t rvp, snapdvp;
14649 int error, i;
14650 caddr_t newname_buf;
14651 size_t name_len;
14652 vnode_t fvp;
14653 struct nameidata *fromnd, *tond;
14654 /* carving out a chunk for structs that are too big to be on stack. */
14655 struct {
14656 struct nameidata from_node;
14657 struct nameidata to_node;
14658 } * __rename_data;
14659
14660 __rename_data = kalloc_type(typeof(*__rename_data), Z_WAITOK);
14661 fromnd = &__rename_data->from_node;
14662 tond = &__rename_data->to_node;
14663
14664 error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, old, fromnd, DELETE,
14665 OP_UNLINK, ctx);
14666 if (error) {
14667 goto out;
14668 }
14669 fvp = fromnd->ni_vp;
14670
14671 newname_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
14672 error = copyinstr(new, newname_buf, MAXPATHLEN, &name_len);
14673 if (error) {
14674 goto out1;
14675 }
14676
14677 /*
14678 * Some sanity checks- new name can't be empty, "." or ".." or have
14679 * slashes.
14680 * (the length returned by copyinstr includes the terminating NUL)
14681 *
14682 * The FS rename VNOP is suppossed to handle this but we'll pick it
14683 * off here itself.
14684 */
14685 if ((name_len == 1) || (name_len == 2 && newname_buf[0] == '.') ||
14686 (name_len == 3 && newname_buf[0] == '.' && newname_buf[1] == '.')) {
14687 error = EINVAL;
14688 goto out1;
14689 }
14690 for (i = 0; i < (int)name_len && newname_buf[i] != '/'; i++) {
14691 ;
14692 }
14693 if (i < (int)name_len) {
14694 error = EINVAL;
14695 goto out1;
14696 }
14697
14698 #if CONFIG_MACF
14699 error = mac_mount_check_snapshot_create(ctx, vnode_mount(rvp),
14700 newname_buf);
14701 if (error) {
14702 goto out1;
14703 }
14704 #endif
14705
14706 NDINIT(tond, RENAME, OP_RENAME, USEDVP | NOCACHE | AUDITVNPATH2,
14707 UIO_SYSSPACE, CAST_USER_ADDR_T(newname_buf), ctx);
14708 tond->ni_dvp = snapdvp;
14709
14710 error = namei(tond);
14711 if (error) {
14712 goto out2;
14713 } else if (tond->ni_vp) {
14714 /*
14715 * snapshot rename behaves differently than rename(2) - if the
14716 * new name exists, EEXIST is returned.
14717 */
14718 vnode_put(tond->ni_vp);
14719 error = EEXIST;
14720 goto out2;
14721 }
14722
14723 error = VNOP_RENAME(snapdvp, fvp, &fromnd->ni_cnd, snapdvp, NULLVP,
14724 &tond->ni_cnd, ctx);
14725
14726 out2:
14727 nameidone(tond);
14728 out1:
14729 zfree(ZV_NAMEI, newname_buf);
14730 vnode_put(fvp);
14731 vnode_put(snapdvp);
14732 vnode_put(rvp);
14733 nameidone(fromnd);
14734 out:
14735 kfree_type(typeof(*__rename_data), __rename_data);
14736 return error;
14737 }
14738
14739 /*
14740 * Mount a Filesystem snapshot
14741 *
14742 * get the vnode for the unnamed snapshot directory and the snapshot and
14743 * mount the snapshot.
14744 */
14745 static int __attribute__((noinline))
snapshot_mount(int dirfd,user_addr_t name,user_addr_t directory,__unused user_addr_t mnt_data,__unused uint32_t flags,vfs_context_t ctx)14746 snapshot_mount(int dirfd, user_addr_t name, user_addr_t directory,
14747 __unused user_addr_t mnt_data, __unused uint32_t flags, vfs_context_t ctx)
14748 {
14749 mount_t mp;
14750 vnode_t rvp, snapdvp, snapvp, vp, pvp;
14751 struct fs_snapshot_mount_args smnt_data;
14752 int error, mount_flags = 0;
14753 struct nameidata *snapndp, *dirndp;
14754 /* carving out a chunk for structs that are too big to be on stack. */
14755 struct {
14756 struct nameidata snapnd;
14757 struct nameidata dirnd;
14758 } * __snapshot_mount_data;
14759
14760 __snapshot_mount_data = kalloc_type(typeof(*__snapshot_mount_data), Z_WAITOK);
14761 snapndp = &__snapshot_mount_data->snapnd;
14762 dirndp = &__snapshot_mount_data->dirnd;
14763
14764 error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, snapndp, LOOKUP,
14765 OP_LOOKUP, ctx);
14766 if (error) {
14767 goto out;
14768 }
14769
14770 snapvp = snapndp->ni_vp;
14771 if (!vnode_mount(rvp) || (vnode_mount(rvp) == dead_mountp)) {
14772 error = EIO;
14773 goto out1;
14774 }
14775
14776 /* Convert snapshot_mount flags to mount flags */
14777 if (flags & SNAPSHOT_MNT_NOSUID) {
14778 mount_flags |= MNT_NOSUID;
14779 }
14780 if (flags & SNAPSHOT_MNT_NODEV) {
14781 mount_flags |= MNT_NODEV;
14782 }
14783 if (flags & SNAPSHOT_MNT_DONTBROWSE) {
14784 mount_flags |= MNT_DONTBROWSE;
14785 }
14786 if (flags & SNAPSHOT_MNT_IGNORE_OWNERSHIP) {
14787 mount_flags |= MNT_IGNORE_OWNERSHIP;
14788 }
14789 if (flags & SNAPSHOT_MNT_NOFOLLOW) {
14790 mount_flags |= MNT_NOFOLLOW;
14791 }
14792
14793 /* Get the vnode to be covered */
14794 NDINIT(dirndp, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
14795 UIO_USERSPACE, directory, ctx);
14796 if (mount_flags & MNT_NOFOLLOW) {
14797 dirndp->ni_flag |= NAMEI_NOFOLLOW_ANY;
14798 }
14799
14800 error = namei(dirndp);
14801 if (error) {
14802 goto out1;
14803 }
14804
14805 vp = dirndp->ni_vp;
14806 pvp = dirndp->ni_dvp;
14807 mp = vnode_mount(rvp);
14808
14809 if ((vp->v_flag & VROOT) && (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
14810 error = EINVAL;
14811 goto out2;
14812 }
14813
14814 #if CONFIG_MACF
14815 error = mac_mount_check_snapshot_mount(ctx, rvp, vp, &dirndp->ni_cnd, snapndp->ni_cnd.cn_nameptr,
14816 mp->mnt_vfsstat.f_fstypename);
14817 if (error) {
14818 goto out2;
14819 }
14820 #endif
14821
14822 smnt_data.sm_mp = mp;
14823 smnt_data.sm_cnp = &snapndp->ni_cnd;
14824 error = mount_common(mp->mnt_vfsstat.f_fstypename, pvp, vp,
14825 &dirndp->ni_cnd, CAST_USER_ADDR_T(&smnt_data), mount_flags,
14826 KERNEL_MOUNT_SNAPSHOT, NULL, ctx);
14827
14828 out2:
14829 vnode_put(vp);
14830 vnode_put(pvp);
14831 nameidone(dirndp);
14832 out1:
14833 vnode_put(snapvp);
14834 vnode_put(snapdvp);
14835 vnode_put(rvp);
14836 nameidone(snapndp);
14837 out:
14838 kfree_type(typeof(*__snapshot_mount_data), __snapshot_mount_data);
14839 return error;
14840 }
14841
14842 /*
14843 * Root from a snapshot of the filesystem
14844 *
14845 * Marks the filesystem to root from the given snapshot on next boot.
14846 */
14847 static int __attribute__((noinline))
snapshot_root(int dirfd,user_addr_t name,__unused uint32_t flags,vfs_context_t ctx)14848 snapshot_root(int dirfd, user_addr_t name, __unused uint32_t flags,
14849 vfs_context_t ctx)
14850 {
14851 int error;
14852 vnode_t rvp;
14853 mount_t mp;
14854 struct fs_snapshot_root_args root_data;
14855 struct componentname cnp;
14856 caddr_t name_buf;
14857 size_t name_len;
14858
14859 error = vnode_getfromfd(ctx, dirfd, &rvp);
14860 if (error) {
14861 return error;
14862 }
14863 mp = vnode_mount(rvp);
14864
14865 name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
14866 error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
14867 if (error) {
14868 zfree(ZV_NAMEI, name_buf);
14869 vnode_put(rvp);
14870 return error;
14871 }
14872
14873 // XXX MAC checks ?
14874
14875 /*
14876 * Grab mount_iterref so that we can release the vnode,
14877 * since VFSIOC_ROOT_SNAPSHOT could conceivably cause a sync.
14878 */
14879 error = mount_iterref(mp, 0);
14880 vnode_put(rvp);
14881 if (error) {
14882 zfree(ZV_NAMEI, name_buf);
14883 return error;
14884 }
14885
14886 memset(&cnp, 0, sizeof(cnp));
14887 cnp.cn_pnbuf = (char *)name_buf;
14888 cnp.cn_nameiop = LOOKUP;
14889 cnp.cn_flags = ISLASTCN | HASBUF;
14890 cnp.cn_pnlen = MAXPATHLEN;
14891 cnp.cn_nameptr = cnp.cn_pnbuf;
14892 cnp.cn_namelen = (int)name_len;
14893 root_data.sr_cnp = &cnp;
14894
14895 error = VFS_IOCTL(mp, VFSIOC_ROOT_SNAPSHOT, (caddr_t)&root_data, 0, ctx);
14896
14897 mount_iterdrop(mp);
14898 zfree(ZV_NAMEI, name_buf);
14899
14900 return error;
14901 }
14902
14903 static boolean_t
vfs_context_can_snapshot(vfs_context_t ctx)14904 vfs_context_can_snapshot(vfs_context_t ctx)
14905 {
14906 static const char * const snapshot_entitlements[] = {
14907 "com.apple.private.vfs.snapshot",
14908 "com.apple.developer.vfs.snapshot",
14909 "com.apple.private.apfs.arv.limited.snapshot",
14910 };
14911 static const size_t nentitlements =
14912 sizeof(snapshot_entitlements) / sizeof(snapshot_entitlements[0]);
14913 size_t i;
14914
14915 task_t task = vfs_context_task(ctx);
14916 for (i = 0; i < nentitlements; i++) {
14917 if (IOTaskHasEntitlement(task, snapshot_entitlements[i])) {
14918 return TRUE;
14919 }
14920 }
14921 return FALSE;
14922 }
14923
14924 /*
14925 * FS snapshot operations dispatcher
14926 */
14927 int
fs_snapshot(__unused proc_t p,struct fs_snapshot_args * uap,__unused int32_t * retval)14928 fs_snapshot(__unused proc_t p, struct fs_snapshot_args *uap,
14929 __unused int32_t *retval)
14930 {
14931 int error;
14932 vfs_context_t ctx = vfs_context_current();
14933
14934 AUDIT_ARG(fd, uap->dirfd);
14935 AUDIT_ARG(value32, uap->op);
14936
14937 if (!vfs_context_can_snapshot(ctx)) {
14938 return EPERM;
14939 }
14940
14941 /*
14942 * Enforce user authorization for snapshot modification operations,
14943 * or if trying to root from snapshot.
14944 */
14945 if (uap->op != SNAPSHOT_OP_MOUNT) {
14946 vnode_t dvp = NULLVP;
14947 vnode_t devvp = NULLVP;
14948 mount_t mp;
14949
14950 error = vnode_getfromfd(ctx, uap->dirfd, &dvp);
14951 if (error) {
14952 return error;
14953 }
14954 mp = vnode_mount(dvp);
14955 devvp = mp->mnt_devvp;
14956
14957 /* get an iocount on devvp */
14958 if (devvp == NULLVP) {
14959 error = vnode_lookup(mp->mnt_vfsstat.f_mntfromname, 0, &devvp, ctx);
14960 /* for mounts which arent block devices */
14961 if (error == ENOENT) {
14962 error = ENXIO;
14963 }
14964 } else {
14965 error = vnode_getwithref(devvp);
14966 }
14967
14968 if (error) {
14969 vnode_put(dvp);
14970 return error;
14971 }
14972
14973 if ((vfs_context_issuser(ctx) == 0) &&
14974 (vnode_authorize(devvp, NULL, KAUTH_VNODE_WRITE_DATA, ctx) != 0) &&
14975 (!IOTaskHasEntitlement(vfs_context_task(ctx), "com.apple.private.vfs.snapshot.user"))) {
14976 error = EPERM;
14977 }
14978 vnode_put(dvp);
14979 vnode_put(devvp);
14980
14981 if (error) {
14982 return error;
14983 }
14984 }
14985
14986 switch (uap->op) {
14987 case SNAPSHOT_OP_CREATE:
14988 error = snapshot_create(uap->dirfd, uap->name1, uap->flags, ctx);
14989 break;
14990 case SNAPSHOT_OP_DELETE:
14991 error = snapshot_delete(uap->dirfd, uap->name1, uap->flags, ctx);
14992 break;
14993 case SNAPSHOT_OP_RENAME:
14994 error = snapshot_rename(uap->dirfd, uap->name1, uap->name2,
14995 uap->flags, ctx);
14996 break;
14997 case SNAPSHOT_OP_MOUNT:
14998 error = snapshot_mount(uap->dirfd, uap->name1, uap->name2,
14999 uap->data, uap->flags, ctx);
15000 break;
15001 case SNAPSHOT_OP_REVERT:
15002 error = snapshot_revert(uap->dirfd, uap->name1, uap->flags, ctx);
15003 break;
15004 #if CONFIG_MNT_ROOTSNAP
15005 case SNAPSHOT_OP_ROOT:
15006 error = snapshot_root(uap->dirfd, uap->name1, uap->flags, ctx);
15007 break;
15008 #endif /* CONFIG_MNT_ROOTSNAP */
15009 default:
15010 error = ENOSYS;
15011 }
15012
15013 return error;
15014 }
15015