1 /*
2 * Copyright (c) 1995-2022 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * Copyright (c) 1989, 1993
30 * The Regents of the University of California. All rights reserved.
31 * (c) UNIX System Laboratories, Inc.
32 * All or some portions of this file are derived from material licensed
33 * to the University of California by American Telephone and Telegraph
34 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
35 * the permission of UNIX System Laboratories, Inc.
36 *
37 * Redistribution and use in source and binary forms, with or without
38 * modification, are permitted provided that the following conditions
39 * are met:
40 * 1. Redistributions of source code must retain the above copyright
41 * notice, this list of conditions and the following disclaimer.
42 * 2. Redistributions in binary form must reproduce the above copyright
43 * notice, this list of conditions and the following disclaimer in the
44 * documentation and/or other materials provided with the distribution.
45 * 3. All advertising materials mentioning features or use of this software
46 * must display the following acknowledgement:
47 * This product includes software developed by the University of
48 * California, Berkeley and its contributors.
49 * 4. Neither the name of the University nor the names of its contributors
50 * may be used to endorse or promote products derived from this software
51 * without specific prior written permission.
52 *
53 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63 * SUCH DAMAGE.
64 *
65 * @(#)vfs_syscalls.c 8.41 (Berkeley) 6/15/95
66 */
67 /*
68 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
69 * support for mandatory and extensible security protections. This notice
70 * is included in support of clause 2.2 (b) of the Apple Public License,
71 * Version 2.0.
72 */
73
74 #include <sys/param.h>
75 #include <sys/systm.h>
76 #include <sys/namei.h>
77 #include <sys/filedesc.h>
78 #include <sys/kernel.h>
79 #include <sys/file_internal.h>
80 #include <sys/stat.h>
81 #include <sys/vnode_internal.h>
82 #include <sys/mount_internal.h>
83 #include <sys/proc_internal.h>
84 #include <sys/kauth.h>
85 #include <sys/uio_internal.h>
86 #include <kern/kalloc.h>
87 #include <sys/mman.h>
88 #include <sys/dirent.h>
89 #include <sys/attr.h>
90 #include <sys/sysctl.h>
91 #include <sys/ubc.h>
92 #include <sys/quota.h>
93 #include <sys/kdebug.h>
94 #include <sys/fsevents.h>
95 #include <sys/imgsrc.h>
96 #include <sys/sysproto.h>
97 #include <sys/sysctl.h>
98 #include <sys/xattr.h>
99 #include <sys/fcntl.h>
100 #include <sys/stdio.h>
101 #include <sys/fsctl.h>
102 #include <sys/ubc_internal.h>
103 #include <sys/disk.h>
104 #include <sys/content_protection.h>
105 #include <sys/clonefile.h>
106 #include <sys/snapshot.h>
107 #include <sys/priv.h>
108 #include <sys/fsgetpath.h>
109 #include <machine/cons.h>
110 #include <machine/limits.h>
111 #include <miscfs/specfs/specdev.h>
112
113 #include <vfs/vfs_disk_conditioner.h>
114 #if CONFIG_EXCLAVES
115 #include <vfs/vfs_exclave_fs.h>
116 #endif
117
118 #include <security/audit/audit.h>
119 #include <bsm/audit_kevents.h>
120
121 #include <mach/mach_types.h>
122 #include <kern/kern_types.h>
123 #include <kern/kalloc.h>
124 #include <kern/task.h>
125
126 #include <vm/vm_pageout.h>
127 #include <vm/vm_protos.h>
128 #include <vm/memory_object_xnu.h>
129
130 #include <libkern/OSAtomic.h>
131 #include <os/atomic_private.h>
132 #include <pexpert/pexpert.h>
133 #include <IOKit/IOBSD.h>
134
135 // deps for MIG call
136 #include <kern/host.h>
137 #include <kern/ipc_misc.h>
138 #include <mach/host_priv.h>
139 #include <mach/vfs_nspace.h>
140 #include <os/log.h>
141
142 #include <nfs/nfs_conf.h>
143
144 #if ROUTEFS
145 #include <miscfs/routefs/routefs.h>
146 #endif /* ROUTEFS */
147
148 #if CONFIG_MACF
149 #include <security/mac.h>
150 #include <security/mac_framework.h>
151 #endif
152
153 #if CONFIG_FSE
154 #define GET_PATH(x) \
155 ((x) = get_pathbuff())
156 #define RELEASE_PATH(x) \
157 release_pathbuff(x)
158 #else
159 #define GET_PATH(x) \
160 ((x) = zalloc(ZV_NAMEI))
161 #define RELEASE_PATH(x) \
162 zfree(ZV_NAMEI, x)
163 #endif /* CONFIG_FSE */
164
165 #ifndef HFS_GET_BOOT_INFO
166 #define HFS_GET_BOOT_INFO (FCNTL_FS_SPECIFIC_BASE + 0x00004)
167 #endif
168
169 #ifndef HFS_SET_BOOT_INFO
170 #define HFS_SET_BOOT_INFO (FCNTL_FS_SPECIFIC_BASE + 0x00005)
171 #endif
172
173 #ifndef APFSIOC_REVERT_TO_SNAPSHOT
174 #define APFSIOC_REVERT_TO_SNAPSHOT _IOW('J', 1, u_int64_t)
175 #endif
176
177 extern void disk_conditioner_unmount(mount_t mp);
178
179 /* struct for checkdirs iteration */
180 struct cdirargs {
181 vnode_t olddp;
182 vnode_t newdp;
183 };
184 /* callback for checkdirs iteration */
185 static int checkdirs_callback(proc_t p, void * arg);
186
187 static int change_dir(struct nameidata *ndp, vfs_context_t ctx);
188 static int checkdirs(vnode_t olddp, vfs_context_t ctx);
189 void enablequotas(struct mount *mp, vfs_context_t ctx);
190 static int getfsstat_callback(mount_t mp, void * arg);
191 static int getutimes(user_addr_t usrtvp, struct timespec *tsp);
192 static int setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts, int nullflag);
193 static int sync_callback(mount_t, void *);
194 static int munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
195 user_addr_t bufp, int *sizep, boolean_t is_64_bit,
196 boolean_t partial_copy);
197 static int fsync_common(proc_t p, struct fsync_args *uap, int flags);
198 static int mount_common(const char *fstypename, vnode_t pvp, vnode_t vp,
199 struct componentname *cnp, user_addr_t fsmountargs,
200 int flags, uint32_t internal_flags, char *labelstr, vfs_context_t ctx);
201 void vfs_notify_mount(vnode_t pdvp);
202
203 int prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, uint32_t internal_flags);
204
205 struct fd_vn_data * fg_vn_data_alloc(void);
206
207 /*
208 * Max retries for ENOENT returns from vn_authorize_{rmdir, unlink, rename}
209 * Concurrent lookups (or lookups by ids) on hard links can cause the
210 * vn_getpath (which does not re-enter the filesystem as vn_getpath_fsenter
211 * does) to return ENOENT as the path cannot be returned from the name cache
212 * alone. We have no option but to retry and hope to get one namei->reverse path
213 * generation done without an intervening lookup, lookup by id on the hard link
214 * item. This is only an issue for MAC hooks which cannot reenter the filesystem
215 * which currently are the MAC hooks for rename, unlink and rmdir.
216 */
217 #define MAX_AUTHORIZE_ENOENT_RETRIES 1024
218
219 /* Max retry limit for rename due to vnode recycling. */
220 #define MAX_RENAME_ERECYCLE_RETRIES 1024
221
222 #define MAX_LINK_ENOENT_RETRIES 1024
223
224 static int rmdirat_internal(vfs_context_t, int, user_addr_t, enum uio_seg,
225 int unlink_flags);
226
227 #ifdef CONFIG_IMGSRC_ACCESS
228 static int authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx);
229 static int place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx);
230 static void undo_place_on_covered_vp(mount_t mp, vnode_t vp);
231 static int mount_begin_update(mount_t mp, vfs_context_t ctx, int flags);
232 static void mount_end_update(mount_t mp);
233 static int relocate_imageboot_source(vnode_t pvp, vnode_t vp, struct componentname *cnp, const char *fsname, vfs_context_t ctx, boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index);
234 #endif /* CONFIG_IMGSRC_ACCESS */
235
236 //snapshot functions
237 #if CONFIG_MNT_ROOTSNAP
238 static int __attribute__ ((noinline)) snapshot_root(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx);
239 #else
240 static int __attribute__ ((noinline)) snapshot_root(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx) __attribute__((unused));
241 #endif
242
243 __private_extern__
244 int sync_internal(void);
245
246 __private_extern__
247 int unlink1(vfs_context_t, vnode_t, user_addr_t, enum uio_seg, int);
248
249 static LCK_GRP_DECLARE(fd_vn_lck_grp, "fd_vnode_data");
250 static LCK_ATTR_DECLARE(fd_vn_lck_attr, 0, 0);
251
252 /* vars for sync mutex */
253 static LCK_GRP_DECLARE(sync_mtx_lck_grp, "sync thread");
254 static LCK_MTX_DECLARE(sync_mtx_lck, &sync_mtx_lck_grp);
255
256 extern lck_rw_t rootvnode_rw_lock;
257
258 VFS_SMR_DECLARE;
259 extern uint32_t nc_smr_enabled;
260
261 /*
262 * incremented each time a mount or unmount operation occurs
263 * used to invalidate the cached value of the rootvp in the
264 * mount structure utilized by cache_lookup_path
265 */
266 uint32_t mount_generation = 0;
267
268 /* counts number of mount and unmount operations */
269 unsigned int vfs_nummntops = 0;
270
271 /* system-wide, per-boot unique mount ID */
272 static _Atomic uint64_t mount_unique_id = 1;
273
274 extern const struct fileops vnops;
275 #if CONFIG_APPLEDOUBLE
276 extern errno_t rmdir_remove_orphaned_appleDouble(vnode_t, vfs_context_t, int *);
277 #endif /* CONFIG_APPLEDOUBLE */
278
279 /* Maximum buffer length supported by fsgetpath(2) */
280 #define FSGETPATH_MAXBUFLEN 8192
281
282 /*
283 * Virtual File System System Calls
284 */
285
286 /*
287 * Private in-kernel mounting spi (specific use-cases only)
288 */
289 boolean_t
vfs_iskernelmount(mount_t mp)290 vfs_iskernelmount(mount_t mp)
291 {
292 return (mp->mnt_kern_flag & MNTK_KERNEL_MOUNT) ? TRUE : FALSE;
293 }
294
/*
 * kernel_mount:
 * In-kernel mount entry point (private SPI).
 *
 * If `vp` is NULLVP, the mount-on vnode and its parent are looked up via
 * namei() from `path` (a kernel-space, NUL-terminated string).  Otherwise
 * the caller supplies both `vp` and `pvp` already iocounted, and `path`
 * is only used to populate the component-name buffer handed to
 * mount_common().
 *
 * `kern_flags` is sanitized against KERNEL_MOUNT_SANITIZE_MASK and then
 * marked with KERNEL_MOUNT_KMOUNT so mount_common() knows this request
 * originated inside the kernel.  `datalen` is currently unused.
 *
 * Returns 0 on success or an errno.  Any vnode references taken by the
 * internal namei() are released before returning.
 */
__private_extern__
int
kernel_mount(const char *fstype, vnode_t pvp, vnode_t vp, const char *path,
    void *data, __unused size_t datalen, int syscall_flags, uint32_t kern_flags,
    vfs_context_t ctx)
{
	struct nameidata nd;
	boolean_t did_namei;
	int error;

	NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(path), ctx);

	/* Drop any caller-supplied bits outside the sanctioned kernel-mount set. */
	kern_flags &= KERNEL_MOUNT_SANITIZE_MASK;

	/*
	 * Get the vnode to be covered if it's not supplied
	 */
	if (vp == NULLVP) {
		error = namei(&nd);
		if (error) {
			/* Diagnostic only for snapshot/volume-by-role mounts. */
			if (kern_flags & (KERNEL_MOUNT_SNAPSHOT | KERNEL_MOUNT_VOLBYROLE_MASK)) {
				printf("failed to locate mount-on path: %s ", path);
			}
			return error;
		}
		vp = nd.ni_vp;
		pvp = nd.ni_dvp;
		did_namei = TRUE;
	} else {
		/*
		 * Caller provided the vnodes; just hand the path through as the
		 * component-name buffer.  NOTE(review): CAST_DOWN discards const
		 * here — assumes mount_common() does not modify cn_pnbuf; confirm.
		 */
		char *pnbuf = CAST_DOWN(char *, path);

		nd.ni_cnd.cn_pnbuf = pnbuf;
		nd.ni_cnd.cn_pnlen = (int)(strlen(pnbuf) + 1);
		did_namei = FALSE;
	}

	/* Tag the request as a kernel-initiated mount. */
	kern_flags |= KERNEL_MOUNT_KMOUNT;
	error = mount_common(fstype, pvp, vp, &nd.ni_cnd, CAST_USER_ADDR_T(data),
	    syscall_flags, kern_flags, NULL, ctx);

	/* Only release references we took ourselves via namei(). */
	if (did_namei) {
		vnode_put(vp);
		vnode_put(pvp);
		nameidone(&nd);
	}

	return error;
}
344
345 int
vfs_mount_at_path(const char * fstype,const char * path,vnode_t pvp,vnode_t vp,void * data,size_t datalen,int mnt_flags,int flags)346 vfs_mount_at_path(const char *fstype, const char *path,
347 vnode_t pvp, vnode_t vp, void *data, size_t datalen,
348 int mnt_flags, int flags)
349 {
350 int syscall_flags = MNT_AUTOMOUNTED | mnt_flags;
351 int error, km_flags = 0;
352 vfs_context_t ctx = (flags & VFS_MOUNT_FLAG_CURRENT_CONTEXT) ? vfs_context_current() : vfs_context_kernel();
353
354 /*
355 * This call is currently restricted to specific use cases.
356 */
357 if ((strcmp(fstype, "lifs") != 0) && (strcmp(fstype, "nfs") != 0)) {
358 return ENOTSUP;
359 }
360
361 #if !defined(XNU_TARGET_OS_OSX)
362 if (strcmp(fstype, "lifs") == 0) {
363 syscall_flags |= MNT_NOEXEC;
364 }
365 #endif
366
367 if (flags & VFS_MOUNT_FLAG_NOAUTH) {
368 km_flags |= KERNEL_MOUNT_NOAUTH;
369 }
370 if (flags & VFS_MOUNT_FLAG_PERMIT_UNMOUNT) {
371 km_flags |= KERNEL_MOUNT_PERMIT_UNMOUNT;
372 }
373
374 error = kernel_mount(fstype, pvp, vp, path, data, datalen,
375 syscall_flags, km_flags, ctx);
376 if (error) {
377 printf("%s: mount on %s failed, error %d\n", __func__, path,
378 error);
379 }
380
381 return error;
382 }
383
384 /*
385 * Mount a file system.
386 */
387 /* ARGSUSED */
388 int
mount(proc_t p,struct mount_args * uap,__unused int32_t * retval)389 mount(proc_t p, struct mount_args *uap, __unused int32_t *retval)
390 {
391 struct __mac_mount_args muap;
392
393 muap.type = uap->type;
394 muap.path = uap->path;
395 muap.flags = uap->flags;
396 muap.data = uap->data;
397 muap.mac_p = USER_ADDR_NULL;
398 return __mac_mount(p, &muap, retval);
399 }
400
/*
 * fmount: mount a filesystem on the directory referenced by an open
 * file descriptor (uap->fd) instead of a path.
 *
 * The mount-on vnode comes from the fd; its parent is recovered with
 * vnode_getparent() and its path is reconstructed with vn_getpath() to
 * build the componentname that mount_common() expects.  Image-source,
 * rootfs, and union mounts are rejected up front.
 *
 * Returns 0 on success or an errno; all vnode iocounts and the fd
 * reference are dropped on every path.
 */
int
fmount(__unused proc_t p, struct fmount_args *uap, __unused int32_t *retval)
{
	struct componentname cn;
	vfs_context_t ctx = vfs_context_current();
	size_t dummy = 0;
	int error;
	int flags = uap->flags;
	char fstypename[MFSNAMELEN];
	char *labelstr = NULL; /* regular mount call always sets it to NULL for __mac_mount() */
	vnode_t pvp;
	vnode_t vp;

	AUDIT_ARG(fd, uap->fd);
	AUDIT_ARG(fflags, flags);
	/* fstypename will get audited by mount_common */

	/* Sanity check the flags */
	if (flags & (MNT_IMGSRC_BY_INDEX | MNT_ROOTFS)) {
		return ENOTSUP;
	}

	if (flags & MNT_UNION) {
		return EPERM;
	}

	/* Filesystem type name comes from user space. */
	error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy);
	if (error) {
		return error;
	}

	/* Resolve the fd to its vnode; file_drop() must balance this on all paths. */
	if ((error = file_vnode(uap->fd, &vp)) != 0) {
		return error;
	}

	if ((error = vnode_getwithref(vp)) != 0) {
		file_drop(uap->fd);
		return error;
	}

	pvp = vnode_getparent(vp);
	if (pvp == NULL) {
		/*
		 * No parent available: distinguish "already a mount point or
		 * filesystem root" (EBUSY) from other failures (EINVAL).
		 */
		if (vp->v_mountedhere || (vp->v_flag & VROOT) != 0) {
			error = EBUSY;
		} else {
			error = EINVAL;
		}
		vnode_put(vp);
		file_drop(uap->fd);
		return error;
	}

	/* Build a componentname from the vnode's reconstructed path. */
	memset(&cn, 0, sizeof(struct componentname));
	cn.cn_pnbuf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	cn.cn_pnlen = MAXPATHLEN;

	if ((error = vn_getpath(vp, cn.cn_pnbuf, &cn.cn_pnlen)) != 0) {
		zfree(ZV_NAMEI, cn.cn_pnbuf);
		vnode_put(pvp);
		vnode_put(vp);
		file_drop(uap->fd);
		return error;
	}

	error = mount_common(fstypename, pvp, vp, &cn, uap->data, flags, KERNEL_MOUNT_FMOUNT, labelstr, ctx);

	zfree(ZV_NAMEI, cn.cn_pnbuf);
	vnode_put(pvp);
	vnode_put(vp);
	file_drop(uap->fd);

	return error;
}
474
475 #define MAX_GRAFT_METADATA_SIZE 16384 /* bytes */
476
477 /*
478 * Get the size of a graft file (a manifest or payload file).
479 * The vp should be an iocounted vnode.
480 */
481 static int
get_and_verify_graft_metadata_vp_size(vnode_t graft_vp,vfs_context_t vctx,size_t * size)482 get_and_verify_graft_metadata_vp_size(vnode_t graft_vp, vfs_context_t vctx, size_t *size)
483 {
484 struct stat64 sb = {};
485 int error;
486
487 *size = 0;
488
489 error = vn_stat(graft_vp, &sb, NULL, 1, 0, vctx);
490 if (error) {
491 return error;
492 }
493
494 if (sb.st_size == 0) {
495 error = ENODATA;
496 } else if ((size_t) sb.st_size > MAX_GRAFT_METADATA_SIZE) {
497 error = EFBIG;
498 } else {
499 *size = (size_t) sb.st_size;
500 }
501
502 return error;
503 }
504
505 /*
506 * Read in a graft file (a manifest or payload file) of size `size` into `buf`.
507 * `size` must already be validated.
508 */
509 static int
read_graft_metadata_vp(vnode_t graft_vp,vfs_context_t vctx,size_t size,void * buf)510 read_graft_metadata_vp(vnode_t graft_vp, vfs_context_t vctx, size_t size, void *buf)
511 {
512 return vn_rdwr(UIO_READ, graft_vp,
513 (caddr_t) buf, (int) size, /* offset */ 0,
514 UIO_SYSSPACE, IO_NOCACHE | IO_RAOFF | IO_UNIT,
515 vfs_context_ucred(vctx), /* resid */ NULL,
516 vfs_context_proc(vctx));
517 }
518
519 /*
520 * Convert a single graft file descriptor into a vnode, get its size (saving it to `size`),
521 * and read it into `buf`.
522 * If `path_prefix` is non-NULL, verify that the file path has that prefix.
523 */
static int
graft_secureboot_read_fd(int fd, vfs_context_t vctx, const char *path_prefix, size_t *size, void *buf)
{
	vnode_t metadata_vp = NULLVP;
	char *path = NULL;
	int error;

	// Convert this graft fd to a vnode.
	if ((error = vnode_getfromfd(vctx, fd, &metadata_vp)) != 0) {
		goto out;
	}

	// Verify that the vnode path starts with `path_prefix` if it was passed.
	if (path_prefix) {
		int len = MAXPATHLEN;
		path = zalloc(ZV_NAMEI);
		if ((error = vn_getpath(metadata_vp, path, &len))) {
			goto out;
		}
		// Prefix check only: the file must live under `path_prefix`.
		if (strncmp(path, path_prefix, strlen(path_prefix))) {
			error = EINVAL;
			goto out;
		}
	}

	// Get (and validate) size information.
	// (Bounds the read below to at most MAX_GRAFT_METADATA_SIZE bytes.)
	if ((error = get_and_verify_graft_metadata_vp_size(metadata_vp, vctx, size)) != 0) {
		goto out;
	}

	// Read each file into the provided buffer - we must get the expected amount of bytes.
	if ((error = read_graft_metadata_vp(metadata_vp, vctx, *size, buf)) != 0) {
		goto out;
	}

out:
	// Common cleanup: release the path buffer and the vnode iocount.
	if (path) {
		zfree(ZV_NAMEI, path);
	}
	if (metadata_vp) {
		vnode_put(metadata_vp);
		metadata_vp = NULLVP;
	}

	return error;
}
570
571 #if XNU_TARGET_OS_OSX
572 #if defined(__arm64e__)
573 #define MOBILE_ASSET_DATA_VAULT_PATH "/System/Library/AssetsV2/manifests/"
574 #else /* x86_64 */
575 #define MOBILE_ASSET_DATA_VAULT_PATH "/System/Library/AssetsV2/"
576 #endif /* x86_64 */
577 #else /* !XNU_TARGET_OS_OSX */
578 #define MOBILE_ASSET_DATA_VAULT_PATH "/private/var/MobileAsset/AssetsV2/manifests/"
579 #endif /* !XNU_TARGET_OS_OSX */
580
581 /*
582 * Read graft file descriptors into buffers of size MAX_GRAFT_METADATA_SIZE
583 * provided in `gfs`, saving the size of data read in `gfs`.
584 */
585 static int
graft_secureboot_read_metadata(uint32_t graft_type,secure_boot_cryptex_args_t * sbc_args,vfs_context_t vctx,fsioc_graft_fs_t * gfs)586 graft_secureboot_read_metadata(uint32_t graft_type, secure_boot_cryptex_args_t *sbc_args,
587 vfs_context_t vctx, fsioc_graft_fs_t *gfs)
588 {
589 const char *manifest_path_prefix = NULL;
590 int error;
591
592 // For Mobile Asset, make sure that the manifest comes from a data vault.
593 if (graft_type == GRAFTDMG_CRYPTEX_MOBILE_ASSET) {
594 manifest_path_prefix = MOBILE_ASSET_DATA_VAULT_PATH;
595 }
596
597 // Read the authentic manifest.
598 if ((error = graft_secureboot_read_fd(sbc_args->sbc_authentic_manifest_fd, vctx,
599 manifest_path_prefix, &gfs->authentic_manifest_size, gfs->authentic_manifest))) {
600 return error;
601 }
602
603 // The user manifest is currently unused, but set its size.
604 gfs->user_manifest_size = 0;
605
606 // Read the payload.
607 if ((error = graft_secureboot_read_fd(sbc_args->sbc_payload_fd, vctx,
608 NULL, &gfs->payload_size, gfs->payload))) {
609 return error;
610 }
611
612 return 0;
613 }
614
615 /*
616 * Call into the filesystem to verify and graft a cryptex.
617 */
static int
graft_secureboot_cryptex(uint32_t graft_type, secure_boot_cryptex_args_t *sbc_args,
    vfs_context_t vctx, vnode_t cryptex_vp, vnode_t mounton_vp)
{
	fsioc_graft_fs_t gfs = {};
	uint64_t graft_dir_ino = 0;
	struct stat64 sb = {};
	int error;

	// Pre-flight arguments.
	if (sbc_args->sbc_version != GRAFTDMG_SECURE_BOOT_CRYPTEX_ARGS_VERSION) {
		// Make sure that this graft version matches what we support.
		return ENOTSUP;
	} else if (mounton_vp && cryptex_vp->v_mount != mounton_vp->v_mount) {
		// For this type, cryptex VP must live on same volume as the target of graft.
		return EXDEV;
	} else if (mounton_vp && mounton_vp->v_type != VDIR) {
		// We cannot graft upon non-directories.
		return ENOTDIR;
	} else if (cryptex_vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV) {
		// We do not allow grafts inside disk images.
		return ENODEV;
	} else if (sbc_args->sbc_authentic_manifest_fd < 0 ||
	    sbc_args->sbc_payload_fd < 0) {
		// We cannot graft without a manifest and payload.
		return EINVAL;
	}

	if (mounton_vp) {
		// Get the mounton's inode number.
		error = vn_stat(mounton_vp, &sb, NULL, 1, 0, vctx);
		if (error) {
			return error;
		}
		graft_dir_ino = (uint64_t) sb.st_ino;
	}

	// Create buffers (of our maximum-defined size) to store authentication info.
	gfs.authentic_manifest = kalloc_data(MAX_GRAFT_METADATA_SIZE, Z_WAITOK | Z_ZERO);
	gfs.payload = kalloc_data(MAX_GRAFT_METADATA_SIZE, Z_WAITOK | Z_ZERO);

	if (!gfs.authentic_manifest || !gfs.payload) {
		// Either allocation may have failed; `out` frees whichever succeeded.
		error = ENOMEM;
		goto out;
	}

	// Read our fd's into our buffers.
	// (Note that this will set the buffer size fields in `gfs`.)
	error = graft_secureboot_read_metadata(graft_type, sbc_args, vctx, &gfs);
	if (error) {
		goto out;
	}

	// Translate caller-visible SBC_* flags into FSCTL_GRAFT_* request flags.
	gfs.graft_version = FSIOC_GRAFT_VERSION;
	gfs.graft_type = graft_type;
	gfs.graft_4cc = sbc_args->sbc_4cc;
	if (sbc_args->sbc_flags & SBC_PRESERVE_MOUNT) {
		gfs.graft_flags |= FSCTL_GRAFT_PRESERVE_MOUNT;
	}
	if (sbc_args->sbc_flags & SBC_ALTERNATE_SHARED_REGION) {
		gfs.graft_flags |= FSCTL_GRAFT_ALTERNATE_SHARED_REGION;
	}
	if (sbc_args->sbc_flags & SBC_SYSTEM_CONTENT) {
		gfs.graft_flags |= FSCTL_GRAFT_SYSTEM_CONTENT;
	}
	if (sbc_args->sbc_flags & SBC_PANIC_ON_AUTHFAIL) {
		gfs.graft_flags |= FSCTL_GRAFT_PANIC_ON_AUTHFAIL;
	}
	if (sbc_args->sbc_flags & SBC_STRICT_AUTH) {
		gfs.graft_flags |= FSCTL_GRAFT_STRICT_AUTH;
	}
	if (sbc_args->sbc_flags & SBC_PRESERVE_GRAFT) {
		gfs.graft_flags |= FSCTL_GRAFT_PRESERVE_GRAFT;
	}
	gfs.dir_ino = graft_dir_ino; // ino from mounton_vp (if not provided, the parent directory)

	// Call into the FS to perform the graft (and validation).
	error = VNOP_IOCTL(cryptex_vp, FSIOC_GRAFT_FS, (caddr_t)&gfs, 0, vctx);

out:
	if (gfs.authentic_manifest) {
		kfree_data(gfs.authentic_manifest, MAX_GRAFT_METADATA_SIZE);
		gfs.authentic_manifest = NULL;
	}
	if (gfs.payload) {
		kfree_data(gfs.payload, MAX_GRAFT_METADATA_SIZE);
		gfs.payload = NULL;
	}

	return error;
}
709
710 #define GRAFTDMG_ENTITLEMENT "com.apple.private.vfs.graftdmg"
711
712 /*
713 * Graft a cryptex disk image (via FD) onto the appropriate mount-point
714 * { int graftdmg(int dmg_fd, const char *mountdir, uint32_t graft_type, graftdmg_args_un *gda); }
715 */
int
graftdmg(__unused proc_t p, struct graftdmg_args *uap, __unused int32_t *retval)
{
	int ua_dmgfd = uap->dmg_fd;
	user_addr_t ua_mountdir = uap->mountdir;
	uint32_t ua_grafttype = uap->graft_type;
	user_addr_t ua_graftargs = uap->gda;

	graftdmg_args_un kern_gda = {};
	int error = 0;
	secure_boot_cryptex_args_t *sbc_args = NULL;

	vnode_t cryptex_vp = NULLVP;
	vnode_t mounton_vp = NULLVP;
	struct nameidata nd = {};
	vfs_context_t ctx = vfs_context_current();

	// Entitlement gate: callers must hold GRAFTDMG_ENTITLEMENT.
	if (!IOTaskHasEntitlement(vfs_context_task(ctx), GRAFTDMG_ENTITLEMENT)) {
		return EPERM;
	}

	// Copy the graft arguments union in from user space.
	error = copyin(ua_graftargs, &kern_gda, sizeof(graftdmg_args_un));
	if (error) {
		return error;
	}

	// Copy mount dir in, if provided.
	// (mounton_vp is optional; graft_secureboot_cryptex handles NULLVP.)
	if (ua_mountdir != USER_ADDR_NULL) {
		// Acquire vnode for mount-on path
		NDINIT(&nd, LOOKUP, OP_MOUNT, (FOLLOW | AUDITVNPATH1),
		    UIO_USERSPACE, ua_mountdir, ctx);

		error = namei(&nd);
		if (error) {
			return error;
		}
		mounton_vp = nd.ni_vp;
	}

	// Convert fd to vnode.
	error = vnode_getfromfd(ctx, ua_dmgfd, &cryptex_vp);
	if (error) {
		goto graftout;
	}

	// Validate the graft type before dispatching to the secure-boot path.
	if (ua_grafttype == 0 || ua_grafttype > GRAFTDMG_CRYPTEX_MAX) {
		error = EINVAL;
	} else {
		sbc_args = &kern_gda.sbc_args;
		error = graft_secureboot_cryptex(ua_grafttype, sbc_args, ctx, cryptex_vp, mounton_vp);
	}

graftout:
	// Release vnode iocounts; nameidone pairs with the namei above.
	if (cryptex_vp) {
		vnode_put(cryptex_vp);
		cryptex_vp = NULLVP;
	}
	if (mounton_vp) {
		vnode_put(mounton_vp);
		mounton_vp = NULLVP;
	}
	if (ua_mountdir != USER_ADDR_NULL) {
		nameidone(&nd);
	}

	return error;
}
783
784 /*
785 * Ungraft a cryptex disk image (via mount dir FD)
786 * { int ungraftdmg(const char *mountdir, uint64_t flags); }
787 */
788 int
ungraftdmg(__unused proc_t p,struct ungraftdmg_args * uap,__unused int32_t * retval)789 ungraftdmg(__unused proc_t p, struct ungraftdmg_args *uap, __unused int32_t *retval)
790 {
791 int error = 0;
792 user_addr_t ua_mountdir = uap->mountdir;
793 fsioc_ungraft_fs_t ugfs;
794 vnode_t mounton_vp = NULLVP;
795 struct nameidata nd = {};
796 vfs_context_t ctx = vfs_context_current();
797
798 if (!IOTaskHasEntitlement(vfs_context_task(ctx), GRAFTDMG_ENTITLEMENT)) {
799 return EPERM;
800 }
801
802 if (uap->flags != 0 || ua_mountdir == USER_ADDR_NULL) {
803 return EINVAL;
804 }
805
806 ugfs.ungraft_flags = 0;
807
808 // Acquire vnode for mount-on path
809 NDINIT(&nd, LOOKUP, OP_MOUNT, (FOLLOW | AUDITVNPATH1),
810 UIO_USERSPACE, ua_mountdir, ctx);
811
812 error = namei(&nd);
813 if (error) {
814 return error;
815 }
816 mounton_vp = nd.ni_vp;
817
818 // Call into the FS to perform the ungraft
819 error = VNOP_IOCTL(mounton_vp, FSIOC_UNGRAFT_FS, (caddr_t)&ugfs, 0, ctx);
820
821 vnode_put(mounton_vp);
822 nameidone(&nd);
823
824 return error;
825 }
826
827
/*
 * Notify observers that a mount occurred beneath parent directory `pdvp`:
 * broadcast a VQ_MOUNT vfs event, then post a NOTE_WRITE knote on the
 * parent so kqueue watchers re-scan the directory.
 */
void
vfs_notify_mount(vnode_t pdvp)
{
	vfs_event_signal(NULL, VQ_MOUNT, (intptr_t)NULL);
	lock_vnode_and_post(pdvp, NOTE_WRITE);
}
834
835 /*
836 * __mac_mount:
837 * Mount a file system taking into account MAC label behavior.
838 * See mount(2) man page for more information
839 *
840 * Parameters: p Process requesting the mount
841 * uap User argument descriptor (see below)
842 * retval (ignored)
843 *
844 * Indirect: uap->type Filesystem type
845 * uap->path Path to mount
846 * uap->data Mount arguments
847 * uap->mac_p MAC info
848 * uap->flags Mount flags
849 *
850 *
851 * Returns: 0 Success
852 * !0 Not success
853 */
854 boolean_t root_fs_upgrade_try = FALSE;
855
856 #define MAX_NESTED_UNION_MOUNTS 10
857
int
__mac_mount(struct proc *p, register struct __mac_mount_args *uap, __unused int32_t *retval)
{
	vnode_t pvp = NULL;
	vnode_t vp = NULL;
	int need_nameidone = 0;
	vfs_context_t ctx = vfs_context_current();
	char fstypename[MFSNAMELEN];
	struct nameidata nd;
	size_t dummy = 0;
	char *labelstr = NULL;   /* MAC label string copied in from user space */
	size_t labelsz = 0;
	int flags = uap->flags;
	int error;
#if CONFIG_IMGSRC_ACCESS || CONFIG_MACF
	boolean_t is_64bit = IS_64BIT_PROCESS(p);
#else
#pragma unused(p)
#endif
	/*
	 * Get the fs type name from user space
	 */
	error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy);
	if (error) {
		return error;
	}

	/*
	 * Get the vnode to be covered
	 */
	NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
	    UIO_USERSPACE, uap->path, ctx);
	if (flags & MNT_NOFOLLOW) {
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}
	error = namei(&nd);
	if (error) {
		goto out;
	}
	need_nameidone = 1;
	vp = nd.ni_vp;
	pvp = nd.ni_dvp;

#ifdef CONFIG_IMGSRC_ACCESS
	/*
	 * Mounting image source cannot be batched with other operations.
	 * NOTE(review): the exact-equality test means MNT_IMGSRC_BY_INDEX
	 * must be the ONLY flag set; the final argument below is therefore
	 * always true on this path — confirm that is intentional.
	 */
	if (flags == MNT_IMGSRC_BY_INDEX) {
		error = relocate_imageboot_source(pvp, vp, &nd.ni_cnd, fstypename,
		    ctx, is_64bit, uap->data, (flags == MNT_IMGSRC_BY_INDEX));
		goto out;
	}
#endif /* CONFIG_IMGSRC_ACCESS */

#if CONFIG_MACF
	/*
	 * Get the label string (if any) from user space
	 */
	if (uap->mac_p != USER_ADDR_NULL) {
		struct user_mac mac;
		size_t ulen = 0;

		/* The struct mac layout differs between 32- and 64-bit callers. */
		if (is_64bit) {
			struct user64_mac mac64;
			error = copyin(uap->mac_p, &mac64, sizeof(mac64));
			mac.m_buflen = (user_size_t)mac64.m_buflen;
			mac.m_string = (user_addr_t)mac64.m_string;
		} else {
			struct user32_mac mac32;
			error = copyin(uap->mac_p, &mac32, sizeof(mac32));
			mac.m_buflen = mac32.m_buflen;
			mac.m_string = mac32.m_string;
		}
		if (error) {
			goto out;
		}
		/* Bound the label buffer: at least a char + NUL, at most the MAC max. */
		if ((mac.m_buflen > MAC_MAX_LABEL_BUF_LEN) ||
		    (mac.m_buflen < 2)) {
			error = EINVAL;
			goto out;
		}
		labelsz = mac.m_buflen;
		labelstr = kalloc_data(labelsz, Z_WAITOK);
		error = copyinstr(mac.m_string, labelstr, mac.m_buflen, &ulen);
		if (error) {
			goto out;
		}
		AUDIT_ARG(mac_string, labelstr);
	}
#endif /* CONFIG_MACF */

	AUDIT_ARG(fflags, flags);

	if (flags & MNT_UNION) {
#if CONFIG_UNION_MOUNTS
		mount_t mp = vp->v_mount;
		int nested_union_mounts = 0;

		name_cache_lock_shared();

		/* Walk up the vnodecovered chain and check for nested union mounts. */
		mp = (mp->mnt_vnodecovered ? mp->mnt_vnodecovered->v_mount : NULL);
		while (mp) {
			if (!(mp->mnt_flag & MNT_UNION)) {
				break;
			}
			mp = (mp->mnt_vnodecovered ? mp->mnt_vnodecovered->v_mount : NULL);

			/*
			 * Limit the max nested union mounts to prevent stack exhaustion
			 * when calling lookup_traverse_union().
			 */
			if (++nested_union_mounts >= MAX_NESTED_UNION_MOUNTS) {
				error = ELOOP;
				break;
			}
		}

		name_cache_unlock();
		if (error) {
			goto out;
		}
#else
		error = EPERM;
		goto out;
#endif /* CONFIG_UNION_MOUNTS */
	}

	/* Mounting over the root vnode of the root filesystem. */
	if ((vp->v_flag & VROOT) &&
	    (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
#if CONFIG_UNION_MOUNTS
		if (!(flags & MNT_UNION)) {
			flags |= MNT_UPDATE;
		} else {
			/*
			 * For a union mount on '/', treat it as fresh
			 * mount instead of update.
			 * Otherwise, union mounting on '/' used to panic the
			 * system before, since mnt_vnodecovered was found to
			 * be NULL for '/' which is required for unionlookup
			 * after it gets ENOENT on union mount.
			 */
			flags = (flags & ~(MNT_UPDATE));
		}
#else
		flags |= MNT_UPDATE;
#endif /* CONFIG_UNION_MOUNTS */

#if SECURE_KERNEL
		if ((flags & MNT_RDONLY) == 0) {
			/* Release kernels are not allowed to mount "/" as rw */
			error = EPERM;
			goto out;
		}
#endif

		/*
		 * See 7392553 for more details on why this check exists.
		 * Suffice to say: If this check is ON and something tries
		 * to mount the rootFS RW, we'll turn off the codesign
		 * bitmap optimization.
		 */
#if CHECK_CS_VALIDATION_BITMAP
		if ((flags & MNT_RDONLY) == 0) {
			root_fs_upgrade_try = TRUE;
		}
#endif
	}

	error = mount_common(fstypename, pvp, vp, &nd.ni_cnd, uap->data, flags, 0,
	    labelstr, ctx);

out:
	/* Common exit: free the label (if any) and release namei state/iocounts. */

#if CONFIG_MACF
	kfree_data(labelstr, labelsz);
#endif /* CONFIG_MACF */

	if (vp) {
		vnode_put(vp);
	}
	if (pvp) {
		vnode_put(pvp);
	}
	if (need_nameidone) {
		nameidone(&nd);
	}

	return error;
}
1046
1047 /*
1048 * common mount implementation (final stage of mounting)
1049 *
1050 * Arguments:
 * fstypename	file system type (i.e. its VFS name)
1052 * pvp parent of covered vnode
1053 * vp covered vnode
1054 * cnp component name (ie path) of covered vnode
1055 * flags generic mount flags
1056 * fsmountargs file system specific data
1057 * labelstr optional MAC label
1058 * kernelmount TRUE for mounts initiated from inside the kernel
1059 * ctx caller's context
1060 */
1061 static int
mount_common(const char * fstypename,vnode_t pvp,vnode_t vp,struct componentname * cnp,user_addr_t fsmountargs,int flags,uint32_t internal_flags,char * labelstr,vfs_context_t ctx)1062 mount_common(const char *fstypename, vnode_t pvp, vnode_t vp,
1063 struct componentname *cnp, user_addr_t fsmountargs, int flags, uint32_t internal_flags,
1064 char *labelstr, vfs_context_t ctx)
1065 {
1066 #if !CONFIG_MACF
1067 #pragma unused(labelstr)
1068 #endif
1069 struct vnode *devvp = NULLVP;
1070 struct vnode *device_vnode = NULLVP;
1071 #if CONFIG_MACF
1072 struct vnode *rvp;
1073 #endif
1074 struct mount *mp = NULL;
1075 struct vfstable *vfsp = (struct vfstable *)0;
1076 struct proc *p = vfs_context_proc(ctx);
1077 int error, flag = 0;
1078 bool flag_set = false;
1079 user_addr_t devpath = USER_ADDR_NULL;
1080 int ronly = 0;
1081 int mntalloc = 0;
1082 boolean_t vfsp_ref = FALSE;
1083 boolean_t is_rwlock_locked = FALSE;
1084 boolean_t did_rele = FALSE;
1085 boolean_t have_usecount = FALSE;
1086 boolean_t did_set_lmount = FALSE;
1087 boolean_t kernelmount = !!(internal_flags & KERNEL_MOUNT_KMOUNT);
1088
1089 #if CONFIG_ROSV_STARTUP || CONFIG_MOUNT_VM || CONFIG_BASESYSTEMROOT
1090 /* Check for mutually-exclusive flag bits */
1091 uint32_t checkflags = (internal_flags & (KERNEL_MOUNT_VOLBYROLE_MASK | KERNEL_MOUNT_BASESYSTEMROOT));
1092 int bitcount = 0;
1093 while (checkflags != 0) {
1094 checkflags &= (checkflags - 1);
1095 bitcount++;
1096 }
1097
1098 if (bitcount > 1) {
1099 //not allowed to request multiple mount-by-role flags
1100 error = EINVAL;
1101 goto out1;
1102 }
1103 #endif
1104
1105 /*
1106 * Process an update for an existing mount
1107 */
1108 if (flags & MNT_UPDATE) {
1109 if ((vp->v_flag & VROOT) == 0) {
1110 error = EINVAL;
1111 goto out1;
1112 }
1113 mp = vp->v_mount;
1114
1115 /* if unmount or mount in progress, return error */
1116 mount_lock_spin(mp);
1117 if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
1118 mount_unlock(mp);
1119 error = EBUSY;
1120 goto out1;
1121 }
1122 mp->mnt_lflag |= MNT_LMOUNT;
1123 did_set_lmount = TRUE;
1124 mount_unlock(mp);
1125 lck_rw_lock_exclusive(&mp->mnt_rwlock);
1126 is_rwlock_locked = TRUE;
1127 /*
1128 * We only allow the filesystem to be reloaded if it
1129 * is currently mounted read-only.
1130 */
1131 if ((flags & MNT_RELOAD) &&
1132 ((mp->mnt_flag & MNT_RDONLY) == 0)) {
1133 error = ENOTSUP;
1134 goto out1;
1135 }
1136
1137 /*
1138 * If content protection is enabled, update mounts are not
1139 * allowed to turn it off.
1140 */
1141 if ((mp->mnt_flag & MNT_CPROTECT) &&
1142 ((flags & MNT_CPROTECT) == 0)) {
1143 error = EINVAL;
1144 goto out1;
1145 }
1146
1147 /*
1148 * can't turn off MNT_REMOVABLE either but it may be an unexpected
1149 * failure to return an error for this so we'll just silently
1150 * add it if it is not passed in.
1151 */
1152 if ((mp->mnt_flag & MNT_REMOVABLE) &&
1153 ((flags & MNT_REMOVABLE) == 0)) {
1154 flags |= MNT_REMOVABLE;
1155 }
1156
1157 /* Can't downgrade the backer of the root FS */
1158 if ((mp->mnt_kern_flag & MNTK_BACKS_ROOT) &&
1159 (!vfs_isrdonly(mp)) && (flags & MNT_RDONLY)) {
1160 error = ENOTSUP;
1161 goto out1;
1162 }
1163
1164 /*
1165 * Only root, or the user that did the original mount is
1166 * permitted to update it.
1167 */
1168 if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
1169 (error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
1170 goto out1;
1171 }
1172 #if CONFIG_MACF
1173 error = mac_mount_check_remount(ctx, mp, flags);
1174 if (error != 0) {
1175 goto out1;
1176 }
1177 #endif
1178 /*
1179 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV,
1180 * and MNT_NOEXEC if mount point is already MNT_NOEXEC.
1181 */
1182 if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
1183 flags |= MNT_NOSUID | MNT_NODEV;
1184 if (mp->mnt_flag & MNT_NOEXEC) {
1185 flags |= MNT_NOEXEC;
1186 }
1187 }
1188 flag = mp->mnt_flag;
1189 flag_set = true;
1190
1191
1192
1193 mp->mnt_flag |= flags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE);
1194
1195 vfsp = mp->mnt_vtable;
1196 goto update;
1197 } // MNT_UPDATE
1198
1199 /*
1200 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV, and
1201 * MNT_NOEXEC if mount point is already MNT_NOEXEC.
1202 */
1203 if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
1204 flags |= MNT_NOSUID | MNT_NODEV;
1205 if (vp->v_mount->mnt_flag & MNT_NOEXEC) {
1206 flags |= MNT_NOEXEC;
1207 }
1208 }
1209
1210 /* XXXAUDIT: Should we capture the type on the error path as well? */
1211 /* XXX cast-away const (audit_arg_text() does not modify its input) */
1212 AUDIT_ARG(text, (char *)(uintptr_t)fstypename);
1213 mount_list_lock();
1214 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
1215 if (!strncmp(vfsp->vfc_name, fstypename, MFSNAMELEN)) {
1216 vfsp->vfc_refcount++;
1217 vfsp_ref = TRUE;
1218 break;
1219 }
1220 }
1221 mount_list_unlock();
1222 if (vfsp == NULL) {
1223 error = ENODEV;
1224 goto out1;
1225 }
1226
1227 /*
1228 * VFC_VFSLOCALARGS is not currently supported for kernel mounts,
1229 * except in ROSV configs and for the initial BaseSystem root.
1230 */
1231 if (kernelmount && (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) &&
1232 ((internal_flags & KERNEL_MOUNT_VOLBYROLE_MASK) == 0) &&
1233 ((internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) == 0)) {
1234 error = EINVAL; /* unsupported request */
1235 goto out1;
1236 }
1237
1238 error = prepare_coveredvp(vp, ctx, cnp, fstypename, internal_flags);
1239 if (error != 0) {
1240 goto out1;
1241 }
1242
1243 /*
1244 * Allocate and initialize the filesystem (mount_t)
1245 */
1246 mp = zalloc_flags(mount_zone, Z_WAITOK | Z_ZERO);
1247 mntalloc = 1;
1248
1249 /* Initialize the default IO constraints */
1250 mp->mnt_maxreadcnt = mp->mnt_maxwritecnt = MAXPHYS;
1251 mp->mnt_segreadcnt = mp->mnt_segwritecnt = 32;
1252 mp->mnt_maxsegreadsize = mp->mnt_maxreadcnt;
1253 mp->mnt_maxsegwritesize = mp->mnt_maxwritecnt;
1254 mp->mnt_devblocksize = DEV_BSIZE;
1255 mp->mnt_alignmentmask = PAGE_MASK;
1256 mp->mnt_ioqueue_depth = MNT_DEFAULT_IOQUEUE_DEPTH;
1257 mp->mnt_ioscale = 1;
1258 mp->mnt_ioflags = 0;
1259 mp->mnt_realrootvp = NULLVP;
1260 mp->mnt_authcache_ttl = CACHED_LOOKUP_RIGHT_TTL;
1261
1262 mp->mnt_lflag |= MNT_LMOUNT;
1263 did_set_lmount = TRUE;
1264
1265 TAILQ_INIT(&mp->mnt_vnodelist);
1266 TAILQ_INIT(&mp->mnt_workerqueue);
1267 TAILQ_INIT(&mp->mnt_newvnodes);
1268 mount_lock_init(mp);
1269 lck_rw_lock_exclusive(&mp->mnt_rwlock);
1270 is_rwlock_locked = TRUE;
1271 mp->mnt_op = vfsp->vfc_vfsops;
1272 mp->mnt_vtable = vfsp;
1273 //mp->mnt_stat.f_type = vfsp->vfc_typenum;
1274 mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
1275 strlcpy(mp->mnt_vfsstat.f_fstypename, vfsp->vfc_name, MFSTYPENAMELEN);
1276 do {
1277 size_t pathlen = MAXPATHLEN;
1278
1279 if (vn_getpath_ext(vp, pvp, mp->mnt_vfsstat.f_mntonname, &pathlen, VN_GETPATH_FSENTER)) {
1280 strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);
1281 }
1282 } while (0);
1283 mp->mnt_vnodecovered = vp;
1284 mp->mnt_vfsstat.f_owner = kauth_cred_getuid(vfs_context_ucred(ctx));
1285 mp->mnt_throttle_mask = LOWPRI_MAX_NUM_DEV - 1;
1286 mp->mnt_devbsdunit = 0;
1287 mp->mnt_mount_id = os_atomic_inc_orig(&mount_unique_id, relaxed);
1288
1289 /* XXX 3762912 hack to support HFS filesystem 'owner' - filesystem may update later */
1290 vfs_setowner(mp, KAUTH_UID_NONE, KAUTH_GID_NONE);
1291
1292 if (kernelmount) {
1293 mp->mnt_kern_flag |= MNTK_KERNEL_MOUNT;
1294 }
1295 if ((internal_flags & KERNEL_MOUNT_PERMIT_UNMOUNT) != 0) {
1296 mp->mnt_kern_flag |= MNTK_PERMIT_UNMOUNT;
1297 }
1298
1299 if (KERNEL_MOUNT_DEVFS & internal_flags) {
1300 // kernel mounted devfs
1301 mp->mnt_kern_flag |= MNTK_SYSTEM;
1302 }
1303
1304 update:
1305
1306 /*
1307 * Set the mount level flags.
1308 */
1309 if (flags & MNT_RDONLY) {
1310 mp->mnt_flag |= MNT_RDONLY;
1311 } else if (mp->mnt_flag & MNT_RDONLY) {
1312 // disallow read/write upgrades of file systems that
1313 // had the TYPENAME_OVERRIDE feature set.
1314 if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
1315 error = EPERM;
1316 goto out1;
1317 }
1318 mp->mnt_kern_flag |= MNTK_WANTRDWR;
1319 }
1320 mp->mnt_flag &= ~(MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
1321 MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
1322 MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
1323 MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME | MNT_STRICTATIME |
1324 MNT_QUARANTINE | MNT_CPROTECT);
1325
1326 #if SECURE_KERNEL
1327 #if !CONFIG_MNT_SUID
1328 /*
1329 * On release builds of iOS based platforms, always enforce NOSUID on
1330 * all mounts. We do this here because we can catch update mounts as well as
1331 * non-update mounts in this case.
1332 */
1333 mp->mnt_flag |= (MNT_NOSUID);
1334 #endif
1335 #endif
1336
1337 mp->mnt_flag |= flags & (MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
1338 MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
1339 MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
1340 MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME | MNT_STRICTATIME |
1341 MNT_QUARANTINE | MNT_CPROTECT);
1342
1343 #if CONFIG_MACF
1344 if (flags & MNT_MULTILABEL) {
1345 if (vfsp->vfc_vfsflags & VFC_VFSNOMACLABEL) {
1346 error = EINVAL;
1347 goto out1;
1348 }
1349 mp->mnt_flag |= MNT_MULTILABEL;
1350 }
1351 #endif
1352 /*
1353 * Process device path for local file systems if requested.
1354 *
1355 * Snapshot and mount-by-role mounts do not use this path; they are
1356 * passing other opaque data in the device path field.
1357 *
1358 * Basesystemroot mounts pass a device path to be resolved here,
1359 * but it's just a char * already inside the kernel, which
1360 * kernel_mount() shoved into a user_addr_t to call us. So for such
1361 * mounts we must skip copyin (both of the address and of the string
1362 * (in NDINIT).
1363 */
1364 if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS &&
1365 !(internal_flags & (KERNEL_MOUNT_SNAPSHOT | KERNEL_MOUNT_VOLBYROLE_MASK))) {
1366 boolean_t do_copyin_devpath = true;
1367 #if CONFIG_BASESYSTEMROOT
1368 if (internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) {
1369 // KERNEL_MOUNT_BASESYSTEMROOT implies subtle behavior worh nothing:
1370 // We have been passed fsmountargs, which is typed as a user_addr_t,
1371 // but is actually a char ** pointing to a (kernelspace) string.
1372 // We manually unpack it with a series of casts and dereferences
1373 // that reverses what was done just above us on the stack in
1374 // imageboot_pivot_image().
1375 // After retrieving the path to the dev node (which we will NDINIT
1376 // in a moment), we pass NULL fsmountargs on to the filesystem.
1377 _Static_assert(sizeof(char **) == sizeof(fsmountargs), "fsmountargs should fit a (kernel) address");
1378 char **devnamepp = (char **)fsmountargs;
1379 char *devnamep = *devnamepp;
1380 devpath = CAST_USER_ADDR_T(devnamep);
1381 do_copyin_devpath = false;
1382 fsmountargs = USER_ADDR_NULL;
1383
1384 //Now that we have a mp, denote that this mount is for the basesystem.
1385 mp->mnt_supl_kern_flag |= MNTK_SUPL_BASESYSTEM;
1386 }
1387 #endif // CONFIG_BASESYSTEMROOT
1388
1389 if (do_copyin_devpath) {
1390 if (vfs_context_is64bit(ctx)) {
1391 if ((error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath)))) {
1392 goto out1;
1393 }
1394 fsmountargs += sizeof(devpath);
1395 } else {
1396 user32_addr_t tmp;
1397 if ((error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp)))) {
1398 goto out1;
1399 }
1400 /* munge into LP64 addr */
1401 devpath = CAST_USER_ADDR_T(tmp);
1402 fsmountargs += sizeof(tmp);
1403 }
1404 }
1405
1406 /* Lookup device and authorize access to it */
1407 if ((devpath)) {
1408 struct nameidata nd;
1409
1410 enum uio_seg seg = UIO_USERSPACE;
1411 #if CONFIG_BASESYSTEMROOT
1412 if (internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) {
1413 seg = UIO_SYSSPACE;
1414 }
1415 #endif // CONFIG_BASESYSTEMROOT
1416
1417 NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW, seg, devpath, ctx);
1418 if ((error = namei(&nd))) {
1419 goto out1;
1420 }
1421
1422 devvp = nd.ni_vp;
1423
1424 if (devvp->v_type != VBLK) {
1425 error = ENOTBLK;
1426 nameidone(&nd);
1427 goto out2;
1428 }
1429 if (major(devvp->v_rdev) >= nblkdev) {
1430 error = ENXIO;
1431 nameidone(&nd);
1432 goto out2;
1433 }
1434 /*
1435 * If mount by non-root, then verify that user has necessary
1436 * permissions on the device.
1437 */
1438 if (suser(vfs_context_ucred(ctx), NULL) != 0) {
1439 kauth_action_t accessmode = KAUTH_VNODE_READ_DATA;
1440
1441 if ((mp->mnt_flag & MNT_RDONLY) == 0) {
1442 accessmode |= KAUTH_VNODE_WRITE_DATA;
1443 }
1444 if ((error = vnode_authorize(devvp, NULL, accessmode, ctx)) != 0) {
1445 nameidone(&nd);
1446 goto out2;
1447 }
1448 }
1449
1450 strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);
1451 nameidone(&nd);
1452 }
1453 /* On first mount, preflight and open device */
1454 if (devpath && ((flags & MNT_UPDATE) == 0)) {
1455 if ((error = vnode_ref(devvp))) {
1456 goto out2;
1457 }
1458 /*
1459 * Disallow multiple mounts of the same device.
1460 * Disallow mounting of a device that is currently in use
1461 * (except for root, which might share swap device for miniroot).
1462 * Flush out any old buffers remaining from a previous use.
1463 */
1464 if ((error = vfs_setmounting(devvp))) {
1465 vnode_rele(devvp);
1466 goto out2;
1467 }
1468
1469 if (vcount(devvp) > 1 && !(vfs_flags(mp) & MNT_ROOTFS)) {
1470 error = EBUSY;
1471 goto out3;
1472 }
1473 if ((error = VNOP_FSYNC(devvp, MNT_WAIT, ctx))) {
1474 error = ENOTBLK;
1475 goto out3;
1476 }
1477 if ((error = buf_invalidateblks(devvp, BUF_WRITE_DATA, 0, 0))) {
1478 goto out3;
1479 }
1480
1481 ronly = (mp->mnt_flag & MNT_RDONLY) != 0;
1482 #if CONFIG_MACF
1483 error = mac_vnode_check_open(ctx,
1484 devvp,
1485 ronly ? FREAD : FREAD | FWRITE);
1486 if (error) {
1487 goto out3;
1488 }
1489 #endif /* MAC */
1490 if ((error = VNOP_OPEN(devvp, ronly ? FREAD : FREAD | FWRITE, ctx))) {
1491 goto out3;
1492 }
1493
1494 mp->mnt_devvp = devvp;
1495 device_vnode = devvp;
1496 } else if ((mp->mnt_flag & MNT_RDONLY) &&
1497 (mp->mnt_kern_flag & MNTK_WANTRDWR) &&
1498 (device_vnode = mp->mnt_devvp)) {
1499 dev_t dev;
1500 int maj;
1501 /*
1502 * If upgrade to read-write by non-root, then verify
1503 * that user has necessary permissions on the device.
1504 */
1505 vnode_getalways(device_vnode);
1506
1507 if (suser(vfs_context_ucred(ctx), NULL) &&
1508 (error = vnode_authorize(device_vnode, NULL,
1509 KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA,
1510 ctx)) != 0) {
1511 vnode_put(device_vnode);
1512 goto out2;
1513 }
1514
1515 /* Tell the device that we're upgrading */
1516 dev = (dev_t)device_vnode->v_rdev;
1517 maj = major(dev);
1518
1519 if ((u_int)maj >= (u_int)nblkdev) {
1520 panic("Volume mounted on a device with invalid major number.");
1521 }
1522
1523 error = bdevsw[maj].d_open(dev, FREAD | FWRITE, S_IFBLK, p);
1524 vnode_put(device_vnode);
1525 device_vnode = NULLVP;
1526 if (error != 0) {
1527 goto out2;
1528 }
1529 }
1530 } // localargs && !(snapshot | data | vm)
1531
1532 #if CONFIG_MACF
1533 if ((flags & MNT_UPDATE) == 0) {
1534 mac_mount_label_init(mp);
1535 mac_mount_label_associate(ctx, mp);
1536 }
1537 if (labelstr) {
1538 if ((flags & MNT_UPDATE) != 0) {
1539 error = mac_mount_check_label_update(ctx, mp);
1540 if (error != 0) {
1541 goto out3;
1542 }
1543 }
1544 }
1545 #endif
1546 /*
1547 * Mount the filesystem. We already asserted that internal_flags
1548 * cannot have more than one mount-by-role bit set.
1549 */
1550 if (internal_flags & KERNEL_MOUNT_SNAPSHOT) {
1551 error = VFS_IOCTL(mp, VFSIOC_MOUNT_SNAPSHOT,
1552 (caddr_t)fsmountargs, 0, ctx);
1553 } else if (internal_flags & KERNEL_MOUNT_DATAVOL) {
1554 #if CONFIG_ROSV_STARTUP
1555 struct mount *origin_mp = (struct mount*)fsmountargs;
1556 fs_role_mount_args_t frma = {origin_mp, VFS_DATA_ROLE};
1557 error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
1558 if (error) {
1559 printf("MOUNT-BY-ROLE (%d) failed! (%d)", VFS_DATA_ROLE, error);
1560 } else {
1561 /* Mark volume associated with system volume */
1562 mp->mnt_kern_flag |= MNTK_SYSTEM;
1563
1564 /* Attempt to acquire the mnt_devvp and set it up */
1565 struct vnode *mp_devvp = NULL;
1566 if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
1567 errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
1568 0, &mp_devvp, vfs_context_kernel());
1569 if (!lerr) {
1570 mp->mnt_devvp = mp_devvp;
1571 //vnode_lookup took an iocount, need to drop it.
1572 vnode_put(mp_devvp);
1573 // now set `device_vnode` to the devvp that was acquired.
1574 // this is needed in order to ensure vfs_init_io_attributes is invoked.
1575 // note that though the iocount above was dropped, the mount acquires
1576 // an implicit reference against the device.
1577 device_vnode = mp_devvp;
1578 }
1579 }
1580 }
1581 #else
1582 error = EINVAL;
1583 #endif
1584 } else if (internal_flags & KERNEL_MOUNT_VMVOL) {
1585 #if CONFIG_MOUNT_VM
1586 struct mount *origin_mp = (struct mount*)fsmountargs;
1587 fs_role_mount_args_t frma = {origin_mp, VFS_VM_ROLE};
1588 error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
1589 if (error) {
1590 printf("MOUNT-BY-ROLE (%d) failed! (%d)", VFS_VM_ROLE, error);
1591 } else {
1592 /* Mark volume associated with system volume and a swap mount */
1593 mp->mnt_kern_flag |= (MNTK_SYSTEM | MNTK_SWAP_MOUNT);
1594 /* Attempt to acquire the mnt_devvp and set it up */
1595 struct vnode *mp_devvp = NULL;
1596 if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
1597 errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
1598 0, &mp_devvp, vfs_context_kernel());
1599 if (!lerr) {
1600 mp->mnt_devvp = mp_devvp;
1601 //vnode_lookup took an iocount, need to drop it.
1602 vnode_put(mp_devvp);
1603
1604 // now set `device_vnode` to the devvp that was acquired.
1605 // note that though the iocount above was dropped, the mount acquires
1606 // an implicit reference against the device.
1607 device_vnode = mp_devvp;
1608 }
1609 }
1610 }
1611 #else
1612 error = EINVAL;
1613 #endif
1614 } else if ((internal_flags & KERNEL_MOUNT_PREBOOTVOL) || (internal_flags & KERNEL_MOUNT_RECOVERYVOL)) {
1615 #if CONFIG_MOUNT_PREBOOTRECOVERY
1616 struct mount *origin_mp = (struct mount*)fsmountargs;
1617 uint32_t mount_role = 0;
1618 if (internal_flags & KERNEL_MOUNT_PREBOOTVOL) {
1619 mount_role = VFS_PREBOOT_ROLE;
1620 } else if (internal_flags & KERNEL_MOUNT_RECOVERYVOL) {
1621 mount_role = VFS_RECOVERY_ROLE;
1622 }
1623
1624 if (mount_role != 0) {
1625 fs_role_mount_args_t frma = {origin_mp, mount_role};
1626 error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
1627 if (error) {
1628 printf("MOUNT-BY-ROLE (%d) failed! (%d)", mount_role, error);
1629 } else {
1630 // NOT YET - need to qualify how this interacts with shutdown, ERP/ERB, etc
1631 /* Mark volume associated with system volume */
1632 //mp->mnt_kern_flag |= MNTK_SYSTEM;
1633 /* Attempt to acquire the mnt_devvp and set it up */
1634 struct vnode *mp_devvp = NULL;
1635 if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
1636 errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
1637 0, &mp_devvp, vfs_context_kernel());
1638 if (!lerr) {
1639 mp->mnt_devvp = mp_devvp;
1640 //vnode_lookup took an iocount, need to drop it.
1641 vnode_put(mp_devvp);
1642
1643 // now set `device_vnode` to the devvp that was acquired.
1644 // note that though the iocount above was dropped, the mount acquires
1645 // an implicit reference against the device.
1646 device_vnode = mp_devvp;
1647 }
1648 }
1649 }
1650 } else {
1651 printf("MOUNT-BY-ROLE (%d) failed - ROLE UNRECOGNIZED! (%d)", mount_role, error);
1652 error = EINVAL;
1653 }
1654 #else
1655 error = EINVAL;
1656 #endif
1657 } else {
1658 error = VFS_MOUNT(mp, device_vnode, fsmountargs, ctx);
1659 }
1660
1661 if (flags & MNT_UPDATE) {
1662 if (mp->mnt_kern_flag & MNTK_WANTRDWR) {
1663 mp->mnt_flag &= ~MNT_RDONLY;
1664 }
1665 mp->mnt_flag &= ~
1666 (MNT_UPDATE | MNT_RELOAD | MNT_FORCE);
1667 mp->mnt_kern_flag &= ~MNTK_WANTRDWR;
1668 if (error) {
1669 mp->mnt_flag = flag; /* restore flag value */
1670 }
1671 vfs_event_signal(NULL, VQ_UPDATE, (intptr_t)NULL);
1672 lck_rw_done(&mp->mnt_rwlock);
1673 is_rwlock_locked = FALSE;
1674 if (!error) {
1675 enablequotas(mp, ctx);
1676 }
1677 goto exit;
1678 }
1679
1680 /*
1681 * Put the new filesystem on the mount list after root.
1682 */
1683 if (error == 0) {
1684 struct vfs_attr vfsattr;
1685 if (device_vnode) {
1686 /*
1687 * cache the IO attributes for the underlying physical media...
1688 * an error return indicates the underlying driver doesn't
1689 * support all the queries necessary... however, reasonable
1690 * defaults will have been set, so no reason to bail or care
1691 *
1692 * Need to do this before calling the MAC hook as it needs
1693 * information from this call.
1694 */
1695 vfs_init_io_attributes(device_vnode, mp);
1696 }
1697
1698 #if CONFIG_MACF
1699 error = mac_mount_check_mount_late(ctx, mp);
1700 if (error != 0) {
1701 goto out4;
1702 }
1703
1704 if (vfs_flags(mp) & MNT_MULTILABEL) {
1705 error = VFS_ROOT(mp, &rvp, ctx);
1706 if (error) {
1707 printf("%s() VFS_ROOT returned %d\n", __func__, error);
1708 goto out4;
1709 }
1710 error = vnode_label(mp, NULL, rvp, NULL, 0, ctx);
1711 /*
1712 * drop reference provided by VFS_ROOT
1713 */
1714 vnode_put(rvp);
1715
1716 if (error) {
1717 goto out4;
1718 }
1719 }
1720 #endif /* MAC */
1721
1722 vnode_lock_spin(vp);
1723 CLR(vp->v_flag, VMOUNT);
1724 vp->v_mountedhere = mp;
1725 SET(vp->v_flag, VMOUNTEDHERE);
1726 vnode_unlock(vp);
1727
1728 /*
1729 * taking the name_cache_lock exclusively will
1730 * insure that everyone is out of the fast path who
1731 * might be trying to use a now stale copy of
1732 * vp->v_mountedhere->mnt_realrootvp
1733 * bumping mount_generation causes the cached values
1734 * to be invalidated
1735 */
1736 name_cache_lock();
1737 mount_generation++;
1738 name_cache_unlock();
1739
1740 error = vnode_ref(vp);
1741 if (error != 0) {
1742 goto out4;
1743 }
1744
1745 have_usecount = TRUE;
1746
1747 error = checkdirs(vp, ctx);
1748 if (error != 0) {
1749 /* Unmount the filesystem as cdir/rdirs cannot be updated */
1750 goto out4;
1751 }
1752 /*
1753 * there is no cleanup code here so I have made it void
1754 * we need to revisit this
1755 */
1756 (void)VFS_START(mp, 0, ctx);
1757
1758 if (mount_list_add(mp) != 0) {
1759 /*
1760 * The system is shutting down trying to umount
1761 * everything, so fail with a plausible errno.
1762 */
1763 error = EBUSY;
1764 goto out4;
1765 }
1766 lck_rw_done(&mp->mnt_rwlock);
1767 is_rwlock_locked = FALSE;
1768
1769 /* Check if this mounted file system supports EAs or named streams. */
1770 /* Skip WebDAV file systems for now since they hang in VFS_GETATTR here. */
1771 VFSATTR_INIT(&vfsattr);
1772 VFSATTR_WANTED(&vfsattr, f_capabilities);
1773 if (strncmp(mp->mnt_vfsstat.f_fstypename, "webdav", sizeof("webdav")) != 0 &&
1774 vfs_getattr(mp, &vfsattr, ctx) == 0 &&
1775 VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
1776 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR) &&
1777 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR)) {
1778 mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
1779 }
1780 #if NAMEDSTREAMS
1781 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS) &&
1782 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS)) {
1783 mp->mnt_kern_flag |= MNTK_NAMED_STREAMS;
1784 }
1785 #endif
1786 /* Check if this file system supports path from id lookups. */
1787 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID) &&
1788 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID)) {
1789 mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
1790 } else if (mp->mnt_flag & MNT_DOVOLFS) {
1791 /* Legacy MNT_DOVOLFS flag also implies path from id lookups. */
1792 mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
1793 }
1794
1795 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS) &&
1796 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS)) {
1797 mp->mnt_kern_flag |= MNTK_DIR_HARDLINKS;
1798 }
1799 }
1800 if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSNATIVEXATTR) {
1801 mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
1802 }
1803 if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSPREFLIGHT) {
1804 mp->mnt_kern_flag |= MNTK_UNMOUNT_PREFLIGHT;
1805 }
1806 /* increment the operations count */
1807 OSAddAtomic(1, &vfs_nummntops);
1808 enablequotas(mp, ctx);
1809
1810 if (device_vnode) {
1811 vfs_setmountedon(device_vnode);
1812 }
1813
1814 /* Now that mount is setup, notify the listeners */
1815 vfs_notify_mount(pvp);
1816 IOBSDMountChange(mp, kIOMountChangeMount);
1817 } else {
1818 /* If we fail a fresh mount, there should be no vnodes left hooked into the mountpoint. */
1819 if (mp->mnt_vnodelist.tqh_first != NULL) {
1820 panic("mount_common(): mount of %s filesystem failed with %d, but vnode list is not empty.",
1821 mp->mnt_vtable->vfc_name, error);
1822 }
1823
1824 vnode_lock_spin(vp);
1825 CLR(vp->v_flag, VMOUNT);
1826 vnode_unlock(vp);
1827 mount_list_lock();
1828 mp->mnt_vtable->vfc_refcount--;
1829 mount_list_unlock();
1830
1831 if (device_vnode) {
1832 vnode_rele(device_vnode);
1833 VNOP_CLOSE(device_vnode, ronly ? FREAD : FREAD | FWRITE, ctx);
1834 vfs_clearmounting(device_vnode);
1835 }
1836 lck_rw_done(&mp->mnt_rwlock);
1837 is_rwlock_locked = FALSE;
1838
1839 if (nc_smr_enabled) {
1840 vfs_smr_synchronize();
1841 }
1842
1843 /*
1844 * if we get here, we have a mount structure that needs to be freed,
1845 * but since the coveredvp hasn't yet been updated to point at it,
1846 * no need to worry about other threads holding a crossref on this mp
1847 * so it's ok to just free it
1848 */
1849 mount_lock_destroy(mp);
1850 #if CONFIG_MACF
1851 mac_mount_label_destroy(mp);
1852 #endif
1853 zfree(mount_zone, mp);
1854 did_set_lmount = false;
1855 }
1856 exit:
1857 /*
1858 * drop I/O count on the device vp if there was one
1859 */
1860 if (devpath && devvp) {
1861 vnode_put(devvp);
1862 }
1863
1864 if (did_set_lmount) {
1865 mount_lock_spin(mp);
1866 mp->mnt_lflag &= ~MNT_LMOUNT;
1867 mount_unlock(mp);
1868 }
1869
1870 return error;
1871
1872 /* Error condition exits */
1873 out4:
1874 (void)VFS_UNMOUNT(mp, MNT_FORCE, ctx);
1875
1876 /*
1877 * If the mount has been placed on the covered vp,
1878 * it may have been discovered by now, so we have
1879 * to treat this just like an unmount
1880 */
1881 mount_lock_spin(mp);
1882 mp->mnt_lflag |= MNT_LDEAD;
1883 mount_unlock(mp);
1884
1885 if (device_vnode != NULLVP) {
1886 vnode_rele(device_vnode);
1887 VNOP_CLOSE(device_vnode, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD | FWRITE,
1888 ctx);
1889 vfs_clearmounting(device_vnode);
1890 did_rele = TRUE;
1891 }
1892
1893 vnode_lock_spin(vp);
1894
1895 mp->mnt_crossref++;
1896 CLR(vp->v_flag, VMOUNTEDHERE);
1897 vp->v_mountedhere = (mount_t) 0;
1898
1899 vnode_unlock(vp);
1900
1901 if (have_usecount) {
1902 vnode_rele(vp);
1903 }
1904 out3:
1905 if (devpath && ((flags & MNT_UPDATE) == 0) && (!did_rele)) {
1906 vnode_rele(devvp);
1907 vfs_clearmounting(devvp);
1908 }
1909 out2:
1910 if (devpath && devvp) {
1911 vnode_put(devvp);
1912 }
1913 out1:
1914 /* Release mnt_rwlock only when it was taken */
1915 if (is_rwlock_locked == TRUE) {
1916 if (flag_set) {
1917 mp->mnt_flag = flag; /* restore mnt_flag value */
1918 }
1919 lck_rw_done(&mp->mnt_rwlock);
1920 }
1921
1922 if (did_set_lmount) {
1923 mount_lock_spin(mp);
1924 mp->mnt_lflag &= ~MNT_LMOUNT;
1925 mount_unlock(mp);
1926 }
1927
1928 if (mntalloc) {
1929 if (mp->mnt_crossref) {
1930 mount_dropcrossref(mp, vp, 0);
1931 } else {
1932 if (nc_smr_enabled) {
1933 vfs_smr_synchronize();
1934 }
1935
1936 mount_lock_destroy(mp);
1937 #if CONFIG_MACF
1938 mac_mount_label_destroy(mp);
1939 #endif
1940 zfree(mount_zone, mp);
1941 }
1942 }
1943 if (vfsp_ref) {
1944 mount_list_lock();
1945 vfsp->vfc_refcount--;
1946 mount_list_unlock();
1947 }
1948
1949 return error;
1950 }
1951
1952 /*
1953 * Flush in-core data, check for competing mount attempts,
1954 * and set VMOUNT
1955 */
1956 int
prepare_coveredvp(vnode_t vp,vfs_context_t ctx,struct componentname * cnp,const char * fsname,uint32_t internal_flags)1957 prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, uint32_t internal_flags)
1958 {
1959 #if !CONFIG_MACF
1960 #pragma unused(cnp,fsname)
1961 #endif
1962 struct vnode_attr va;
1963 int error;
1964 boolean_t skip_auth = !!(internal_flags & KERNEL_MOUNT_NOAUTH);
1965 boolean_t is_fmount = !!(internal_flags & KERNEL_MOUNT_FMOUNT);
1966 boolean_t is_busy;
1967
1968 if (!skip_auth) {
1969 /*
1970 * If the user is not root, ensure that they own the directory
1971 * onto which we are attempting to mount.
1972 */
1973 VATTR_INIT(&va);
1974 VATTR_WANTED(&va, va_uid);
1975 if ((error = vnode_getattr(vp, &va, ctx)) ||
1976 (va.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
1977 (!vfs_context_issuser(ctx)))) {
1978 error = EPERM;
1979 goto out;
1980 }
1981 }
1982
1983 if ((error = VNOP_FSYNC(vp, MNT_WAIT, ctx))) {
1984 goto out;
1985 }
1986
1987 if ((error = buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0))) {
1988 goto out;
1989 }
1990
1991 if (vp->v_type != VDIR) {
1992 error = ENOTDIR;
1993 goto out;
1994 }
1995
1996 vnode_lock_spin(vp);
1997 is_busy = is_fmount ?
1998 (ISSET(vp->v_flag, VMOUNT) || (vp->v_mountedhere != NULL)) :
1999 (ISSET(vp->v_flag, VMOUNT) && (vp->v_mountedhere != NULL));
2000 if (is_busy) {
2001 vnode_unlock(vp);
2002 error = EBUSY;
2003 goto out;
2004 }
2005 SET(vp->v_flag, VMOUNT);
2006 vnode_unlock(vp);
2007
2008 #if CONFIG_MACF
2009 error = mac_mount_check_mount(ctx, vp,
2010 cnp, fsname);
2011 if (error != 0) {
2012 vnode_lock_spin(vp);
2013 CLR(vp->v_flag, VMOUNT);
2014 vnode_unlock(vp);
2015 }
2016 #endif
2017
2018 out:
2019 return error;
2020 }
2021
2022 #if CONFIG_IMGSRC_ACCESS
2023
2024 #define DEBUG_IMGSRC 0
2025
2026 #if DEBUG_IMGSRC
2027 #define IMGSRC_DEBUG(args...) printf("imgsrc: " args)
2028 #else
2029 #define IMGSRC_DEBUG(args...) do { } while(0)
2030 #endif
2031
static int
authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx)
{
	/*
	 * Look up 'devpath', verify it names the same block device that
	 * actually backs 'mp', check the caller's access to it, and update
	 * the mount's f_mntfromname to the supplied path.  On success
	 * *devvpp holds the device vnode with an iocount (caller must
	 * vnode_put() it); on failure all references are dropped here.
	 */
	struct nameidata nd;
	vnode_t vp, realdevvp;
	kauth_action_t accessmode;
	int error;
	enum uio_seg uio = UIO_USERSPACE;

	/* Kernel contexts pass kernel-space path strings */
	if (ctx == vfs_context_kernel()) {
		uio = UIO_SYSSPACE;
	}

	NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW, uio, devpath, ctx);
	if ((error = namei(&nd))) {
		IMGSRC_DEBUG("namei() failed with %d\n", error);
		return error;
	}

	vp = nd.ni_vp;

	/* The path must name a block device */
	if (!vnode_isblk(vp)) {
		IMGSRC_DEBUG("Not block device.\n");
		error = ENOTBLK;
		goto out;
	}

	realdevvp = mp->mnt_devvp;
	if (realdevvp == NULLVP) {
		IMGSRC_DEBUG("No device backs the mount.\n");
		error = ENXIO;
		goto out;
	}

	error = vnode_getwithref(realdevvp);
	if (error != 0) {
		IMGSRC_DEBUG("Coudn't get iocount on device.\n");
		goto out;
	}

	/* Must be the very same dev_t that backs the mount */
	if (vnode_specrdev(vp) != vnode_specrdev(realdevvp)) {
		IMGSRC_DEBUG("Wrong dev_t.\n");
		error = ENXIO;
		goto out1;
	}

	/* Record the caller-visible device path in the mount's stats */
	strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);

	/*
	 * If mount by non-root, then verify that user has necessary
	 * permissions on the device.
	 */
	if (!vfs_context_issuser(ctx)) {
		accessmode = KAUTH_VNODE_READ_DATA;
		if ((mp->mnt_flag & MNT_RDONLY) == 0) {
			accessmode |= KAUTH_VNODE_WRITE_DATA;
		}
		if ((error = vnode_authorize(vp, NULL, accessmode, ctx)) != 0) {
			IMGSRC_DEBUG("Access denied.\n");
			goto out1;
		}
	}

	/* Success: hand the device vnode (with namei's iocount) to the caller */
	*devvpp = vp;

out1:
	vnode_put(realdevvp);

out:
	nameidone(&nd);

	/* On failure, drop the iocount namei() gave us on vp */
	if (error) {
		vnode_put(vp);
	}

	return error;
}
2109
2110 /*
2111 * Clear VMOUNT, set v_mountedhere, and mnt_vnodecovered, ref the vnode,
2112 * and call checkdirs()
2113 */
2114 static int
place_mount_and_checkdirs(mount_t mp,vnode_t vp,vfs_context_t ctx)2115 place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx)
2116 {
2117 int error;
2118
2119 mp->mnt_vnodecovered = vp; /* XXX This is normally only set at init-time ... */
2120
2121 IMGSRC_DEBUG("placing: fsname = %s, vp = %s\n",
2122 mp->mnt_vtable->vfc_name, vnode_getname(vp));
2123
2124 vnode_lock_spin(vp);
2125 CLR(vp->v_flag, VMOUNT);
2126 vp->v_mountedhere = mp;
2127 SET(vp->v_flag, VMOUNTEDHERE);
2128 vnode_unlock(vp);
2129
2130 /*
2131 * taking the name_cache_lock exclusively will
2132 * insure that everyone is out of the fast path who
2133 * might be trying to use a now stale copy of
2134 * vp->v_mountedhere->mnt_realrootvp
2135 * bumping mount_generation causes the cached values
2136 * to be invalidated
2137 */
2138 name_cache_lock();
2139 mount_generation++;
2140 name_cache_unlock();
2141
2142 error = vnode_ref(vp);
2143 if (error != 0) {
2144 goto out;
2145 }
2146
2147 error = checkdirs(vp, ctx);
2148 if (error != 0) {
2149 /* Unmount the filesystem as cdir/rdirs cannot be updated */
2150 vnode_rele(vp);
2151 goto out;
2152 }
2153
2154 out:
2155 if (error != 0) {
2156 mp->mnt_vnodecovered = NULLVP;
2157 }
2158 return error;
2159 }
2160
static void
undo_place_on_covered_vp(mount_t mp, vnode_t vp)
{
	/*
	 * Reverse place_mount_and_checkdirs(): drop the usecount it took,
	 * clear the covered-vnode state, and detach mp from vp.
	 */
	vnode_rele(vp);
	vnode_lock_spin(vp);
	CLR(vp->v_flag, (VMOUNT | VMOUNTEDHERE));
	vp->v_mountedhere = (mount_t)NULL;
	vnode_unlock(vp);

	mp->mnt_vnodecovered = NULLVP;
}
2172
2173 static int
mount_begin_update(mount_t mp,vfs_context_t ctx,int flags)2174 mount_begin_update(mount_t mp, vfs_context_t ctx, int flags)
2175 {
2176 int error;
2177
2178 /* unmount in progress return error */
2179 mount_lock_spin(mp);
2180 if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
2181 mount_unlock(mp);
2182 return EBUSY;
2183 }
2184 mount_unlock(mp);
2185 lck_rw_lock_exclusive(&mp->mnt_rwlock);
2186
2187 /*
2188 * We only allow the filesystem to be reloaded if it
2189 * is currently mounted read-only.
2190 */
2191 if ((flags & MNT_RELOAD) &&
2192 ((mp->mnt_flag & MNT_RDONLY) == 0)) {
2193 error = ENOTSUP;
2194 goto out;
2195 }
2196
2197 /*
2198 * Only root, or the user that did the original mount is
2199 * permitted to update it.
2200 */
2201 if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
2202 (!vfs_context_issuser(ctx))) {
2203 error = EPERM;
2204 goto out;
2205 }
2206 #if CONFIG_MACF
2207 error = mac_mount_check_remount(ctx, mp, flags);
2208 if (error != 0) {
2209 goto out;
2210 }
2211 #endif
2212
2213 out:
2214 if (error) {
2215 lck_rw_done(&mp->mnt_rwlock);
2216 }
2217
2218 return error;
2219 }
2220
static void
mount_end_update(mount_t mp)
{
	/* Release the exclusive rwlock taken by mount_begin_update() */
	lck_rw_done(&mp->mnt_rwlock);
}
2226
2227 static int
get_imgsrc_rootvnode(uint32_t height,vnode_t * rvpp)2228 get_imgsrc_rootvnode(uint32_t height, vnode_t *rvpp)
2229 {
2230 vnode_t vp;
2231
2232 if (height >= MAX_IMAGEBOOT_NESTING) {
2233 return EINVAL;
2234 }
2235
2236 vp = imgsrc_rootvnodes[height];
2237 if ((vp != NULLVP) && (vnode_get(vp) == 0)) {
2238 *rvpp = vp;
2239 return 0;
2240 } else {
2241 return ENOENT;
2242 }
2243 }
2244
2245 static int
relocate_imageboot_source(vnode_t pvp,vnode_t vp,struct componentname * cnp,const char * fsname,vfs_context_t ctx,boolean_t is64bit,user_addr_t fsmountargs,boolean_t by_index)2246 relocate_imageboot_source(vnode_t pvp, vnode_t vp,
2247 struct componentname *cnp, const char *fsname, vfs_context_t ctx,
2248 boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index)
2249 {
2250 int error;
2251 mount_t mp;
2252 boolean_t placed = FALSE;
2253 struct vfstable *vfsp;
2254 user_addr_t devpath;
2255 char *old_mntonname;
2256 vnode_t rvp;
2257 vnode_t devvp;
2258 uint32_t height;
2259 uint32_t flags;
2260
2261 /* If we didn't imageboot, nothing to move */
2262 if (imgsrc_rootvnodes[0] == NULLVP) {
2263 return EINVAL;
2264 }
2265
2266 /* Only root can do this */
2267 if (!vfs_context_issuser(ctx)) {
2268 return EPERM;
2269 }
2270
2271 IMGSRC_DEBUG("looking for root vnode.\n");
2272
2273 /*
2274 * Get root vnode of filesystem we're moving.
2275 */
2276 if (by_index) {
2277 if (is64bit) {
2278 struct user64_mnt_imgsrc_args mia64;
2279 error = copyin(fsmountargs, &mia64, sizeof(mia64));
2280 if (error != 0) {
2281 IMGSRC_DEBUG("Failed to copy in arguments.\n");
2282 return error;
2283 }
2284
2285 height = mia64.mi_height;
2286 flags = mia64.mi_flags;
2287 devpath = (user_addr_t)mia64.mi_devpath;
2288 } else {
2289 struct user32_mnt_imgsrc_args mia32;
2290 error = copyin(fsmountargs, &mia32, sizeof(mia32));
2291 if (error != 0) {
2292 IMGSRC_DEBUG("Failed to copy in arguments.\n");
2293 return error;
2294 }
2295
2296 height = mia32.mi_height;
2297 flags = mia32.mi_flags;
2298 devpath = mia32.mi_devpath;
2299 }
2300 } else {
2301 /*
2302 * For binary compatibility--assumes one level of nesting.
2303 */
2304 if (is64bit) {
2305 if ((error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath)))) {
2306 return error;
2307 }
2308 } else {
2309 user32_addr_t tmp;
2310 if ((error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp)))) {
2311 return error;
2312 }
2313
2314 /* munge into LP64 addr */
2315 devpath = CAST_USER_ADDR_T(tmp);
2316 }
2317
2318 height = 0;
2319 flags = 0;
2320 }
2321
2322 if (flags != 0) {
2323 IMGSRC_DEBUG("%s: Got nonzero flags.\n", __FUNCTION__);
2324 return EINVAL;
2325 }
2326
2327 error = get_imgsrc_rootvnode(height, &rvp);
2328 if (error != 0) {
2329 IMGSRC_DEBUG("getting old root vnode failed with %d\n", error);
2330 return error;
2331 }
2332
2333 IMGSRC_DEBUG("got old root vnode\n");
2334
2335 old_mntonname = zalloc_flags(ZV_NAMEI, Z_WAITOK);
2336
2337 /* Can only move once */
2338 mp = vnode_mount(rvp);
2339 if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
2340 IMGSRC_DEBUG("Already moved.\n");
2341 error = EBUSY;
2342 goto out0;
2343 }
2344
2345 IMGSRC_DEBUG("moving rvp: fsname = %s\n", mp->mnt_vtable->vfc_name);
2346 IMGSRC_DEBUG("Starting updated.\n");
2347
2348 /* Get exclusive rwlock on mount, authorize update on mp */
2349 error = mount_begin_update(mp, ctx, 0);
2350 if (error != 0) {
2351 IMGSRC_DEBUG("Starting updated failed with %d\n", error);
2352 goto out0;
2353 }
2354
2355 /*
2356 * It can only be moved once. Flag is set under the rwlock,
2357 * so we're now safe to proceed.
2358 */
2359 if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
2360 IMGSRC_DEBUG("Already moved [2]\n");
2361 goto out1;
2362 }
2363
2364 IMGSRC_DEBUG("Preparing coveredvp.\n");
2365
2366 /* Mark covered vnode as mount in progress, authorize placing mount on top */
2367 error = prepare_coveredvp(vp, ctx, cnp, fsname, 0);
2368 if (error != 0) {
2369 IMGSRC_DEBUG("Preparing coveredvp failed with %d.\n", error);
2370 goto out1;
2371 }
2372
2373 IMGSRC_DEBUG("Covered vp OK.\n");
2374
2375 /* Sanity check the name caller has provided */
2376 vfsp = mp->mnt_vtable;
2377 if (strncmp(vfsp->vfc_name, fsname, MFSNAMELEN) != 0) {
2378 IMGSRC_DEBUG("Wrong fs name: actual = %s, expected = %s\n",
2379 vfsp->vfc_name, fsname);
2380 error = EINVAL;
2381 goto out2;
2382 }
2383
2384 /* Check the device vnode and update mount-from name, for local filesystems */
2385 if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) {
2386 IMGSRC_DEBUG("Local, doing device validation.\n");
2387
2388 if (devpath != USER_ADDR_NULL) {
2389 error = authorize_devpath_and_update_mntfromname(mp, devpath, &devvp, ctx);
2390 if (error) {
2391 IMGSRC_DEBUG("authorize_devpath_and_update_mntfromname() failed.\n");
2392 goto out2;
2393 }
2394
2395 vnode_put(devvp);
2396 }
2397 }
2398
2399 /*
2400 * Place mp on top of vnode, ref the vnode, call checkdirs(),
2401 * and increment the name cache's mount generation
2402 */
2403
2404 IMGSRC_DEBUG("About to call place_mount_and_checkdirs().\n");
2405 error = place_mount_and_checkdirs(mp, vp, ctx);
2406 if (error != 0) {
2407 goto out2;
2408 }
2409
2410 placed = TRUE;
2411
2412 strlcpy(old_mntonname, mp->mnt_vfsstat.f_mntonname, MAXPATHLEN);
2413 strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);
2414
2415 /* Forbid future moves */
2416 mount_lock(mp);
2417 mp->mnt_kern_flag |= MNTK_HAS_MOVED;
2418 mount_unlock(mp);
2419
2420 /* Finally, add to mount list, completely ready to go */
2421 if (mount_list_add(mp) != 0) {
2422 /*
2423 * The system is shutting down trying to umount
2424 * everything, so fail with a plausible errno.
2425 */
2426 error = EBUSY;
2427 goto out3;
2428 }
2429
2430 mount_end_update(mp);
2431 vnode_put(rvp);
2432 zfree(ZV_NAMEI, old_mntonname);
2433
2434 vfs_notify_mount(pvp);
2435
2436 return 0;
2437 out3:
2438 strlcpy(mp->mnt_vfsstat.f_mntonname, old_mntonname, MAXPATHLEN);
2439
2440 mount_lock(mp);
2441 mp->mnt_kern_flag &= ~(MNTK_HAS_MOVED);
2442 mount_unlock(mp);
2443
2444 out2:
2445 /*
2446 * Placing the mp on the vnode clears VMOUNT,
2447 * so cleanup is different after that point
2448 */
2449 if (placed) {
2450 /* Rele the vp, clear VMOUNT and v_mountedhere */
2451 undo_place_on_covered_vp(mp, vp);
2452 } else {
2453 vnode_lock_spin(vp);
2454 CLR(vp->v_flag, VMOUNT);
2455 vnode_unlock(vp);
2456 }
2457 out1:
2458 mount_end_update(mp);
2459
2460 out0:
2461 vnode_put(rvp);
2462 zfree(ZV_NAMEI, old_mntonname);
2463 return error;
2464 }
2465
2466 #endif /* CONFIG_IMGSRC_ACCESS */
2467
void
enablequotas(struct mount *mp, vfs_context_t ctx)
{
	/*
	 * Turn on disk quotas for an HFS mount if the per-type quota
	 * trigger files exist in the mount's root.  Errors are deliberately
	 * ignored so quota setup never blocks a successful mount.
	 */
	struct nameidata qnd;
	int type;
	char qfpath[MAXPATHLEN];
	const char *qfname = QUOTAFILENAME;
	const char *qfopsname = QUOTAOPSNAME;
	const char *qfextension[] = INITQFNAMES;

	/* XXX Should be an MNTK_ flag, instead of strncmp()'s */
	if (strncmp(mp->mnt_vfsstat.f_fstypename, "hfs", sizeof("hfs")) != 0) {
		return;
	}
	/*
	 * Enable filesystem disk quotas if necessary.
	 * We ignore errors as this should not interfere with final mount
	 */
	for (type = 0; type < MAXQUOTAS; type++) {
		/* First probe for the ".quota.ops.<type>" trigger file */
		snprintf(qfpath, sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfopsname, qfextension[type]);
		NDINIT(&qnd, LOOKUP, OP_MOUNT, FOLLOW, UIO_SYSSPACE,
		    CAST_USER_ADDR_T(qfpath), ctx);
		if (namei(&qnd) != 0) {
			continue; /* option file to trigger quotas is not present */
		}
		vnode_put(qnd.ni_vp);
		nameidone(&qnd);
		/* Then enable quotas using the actual quota data file */
		snprintf(qfpath, sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfname, qfextension[type]);

		(void) VFS_QUOTACTL(mp, QCMD(Q_QUOTAON, type), 0, qfpath, ctx);
	}
	return;
}
2501
2502
static int
checkdirs_callback(proc_t p, void * arg)
{
	/*
	 * Per-process worker for checkdirs(): if this process's current or
	 * root directory is the newly-covered vnode (olddp), repoint it at
	 * the root of the covering mount (newdp), transferring usecounts
	 * accordingly.  Always returns PROC_RETURNED so iteration continues.
	 */
	struct cdirargs *cdrp = (struct cdirargs *)arg;
	vnode_t olddp = cdrp->olddp;
	vnode_t newdp = cdrp->newdp;
	struct filedesc *fdp = &p->p_fd;
	/* new_* track refs still to be given away; old_* track refs to drop */
	vnode_t new_cvp = newdp;
	vnode_t new_rvp = newdp;
	vnode_t old_cvp = NULL;
	vnode_t old_rvp = NULL;

	/*
	 * XXX Also needs to iterate each thread in the process to see if it
	 * XXX is using a per-thread current working directory, and, if so,
	 * XXX update that as well.
	 */

	/*
	 * First, with the proc_fdlock held, check to see if we will need
	 * to do any work. If not, we will get out fast.
	 */
	proc_fdlock(p);
	if (fdp->fd_cdir != olddp && fdp->fd_rdir != olddp) {
		proc_fdunlock(p);
		return PROC_RETURNED;
	}
	proc_fdunlock(p);

	/*
	 * Ok, we will have to do some work. Always take two refs
	 * because we might need that many. We'll dispose of whatever
	 * we ended up not using.
	 */
	if (vnode_ref(newdp) != 0) {
		return PROC_RETURNED;
	}
	if (vnode_ref(newdp) != 0) {
		vnode_rele(newdp);
		return PROC_RETURNED;
	}

	proc_dirs_lock_exclusive(p);
	/*
	 * Now do the work. Note: we dropped the proc_fdlock, so we
	 * have to do all of the checks again.
	 */
	proc_fdlock(p);
	if (fdp->fd_cdir == olddp) {
		old_cvp = olddp;
		fdp->fd_cdir = newdp;
		new_cvp = NULL;		/* this ref on newdp is now consumed */
	}
	if (fdp->fd_rdir == olddp) {
		old_rvp = olddp;
		fdp->fd_rdir = newdp;
		new_rvp = NULL;		/* this ref on newdp is now consumed */
	}
	proc_fdunlock(p);
	proc_dirs_unlock_exclusive(p);

	/*
	 * Dispose of any references that are no longer needed.
	 */
	if (old_cvp != NULL) {
		vnode_rele(old_cvp);
	}
	if (old_rvp != NULL) {
		vnode_rele(old_rvp);
	}
	if (new_cvp != NULL) {
		vnode_rele(new_cvp);
	}
	if (new_rvp != NULL) {
		vnode_rele(new_rvp);
	}

	return PROC_RETURNED;
}
2582
2583
2584
2585 /*
2586 * Scan all active processes to see if any of them have a current
2587 * or root directory onto which the new filesystem has just been
2588 * mounted. If so, replace them with the new mount point.
2589 */
2590 static int
checkdirs(vnode_t olddp,vfs_context_t ctx)2591 checkdirs(vnode_t olddp, vfs_context_t ctx)
2592 {
2593 vnode_t newdp;
2594 vnode_t tvp;
2595 int err;
2596 struct cdirargs cdr;
2597
2598 if (olddp->v_usecount == 1) {
2599 return 0;
2600 }
2601 err = VFS_ROOT(olddp->v_mountedhere, &newdp, ctx);
2602
2603 if (err != 0) {
2604 #if DIAGNOSTIC
2605 panic("mount: lost mount: error %d", err);
2606 #endif
2607 return err;
2608 }
2609
2610 cdr.olddp = olddp;
2611 cdr.newdp = newdp;
2612 /* do not block for exec/fork trans as the vp in cwd & rootdir are not changing */
2613 proc_iterate(PROC_ALLPROCLIST | PROC_NOWAITTRANS, checkdirs_callback, (void *)&cdr, NULL, NULL);
2614
2615 if (rootvnode == olddp) {
2616 vnode_ref(newdp);
2617 lck_rw_lock_exclusive(&rootvnode_rw_lock);
2618 tvp = rootvnode;
2619 rootvnode = newdp;
2620 lck_rw_unlock_exclusive(&rootvnode_rw_lock);
2621 vnode_rele(tvp);
2622 }
2623
2624 vnode_put(newdp);
2625 return 0;
2626 }
2627
2628 #define ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT \
2629 "com.apple.private.vfs.role-account-unmount"
2630
2631 /*
2632 * Unmount a file system.
2633 *
2634 * Note: unmount takes a path to the vnode mounted on as argument,
2635 * not special file (as before).
2636 */
2637 /* ARGSUSED */
2638 int
unmount(__unused proc_t p,struct unmount_args * uap,__unused int32_t * retval)2639 unmount(__unused proc_t p, struct unmount_args *uap, __unused int32_t *retval)
2640 {
2641 vnode_t vp;
2642 struct mount *mp;
2643 int error;
2644 struct nameidata nd;
2645 vfs_context_t ctx;
2646
2647 /*
2648 * If the process has the entitlement, use the kernel's context when
2649 * performing lookup on the mount path as the process might lack proper
2650 * permission to access the directory.
2651 */
2652 ctx = IOCurrentTaskHasEntitlement(ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT) ?
2653 vfs_context_kernel() : vfs_context_current();
2654
2655 NDINIT(&nd, LOOKUP, OP_UNMOUNT, FOLLOW | AUDITVNPATH1,
2656 UIO_USERSPACE, uap->path, ctx);
2657 error = namei(&nd);
2658 if (error) {
2659 return error;
2660 }
2661 vp = nd.ni_vp;
2662 mp = vp->v_mount;
2663 nameidone(&nd);
2664
2665 /*
2666 * Must be the root of the filesystem
2667 */
2668 if ((vp->v_flag & VROOT) == 0) {
2669 vnode_put(vp);
2670 return EINVAL;
2671 }
2672 #if CONFIG_MACF
2673 error = mac_mount_check_umount(ctx, mp);
2674 if (error != 0) {
2675 vnode_put(vp);
2676 return error;
2677 }
2678 #endif
2679 mount_ref(mp, 0);
2680 vnode_put(vp);
2681 /* safedounmount consumes the mount ref */
2682 return safedounmount(mp, uap->flags, ctx);
2683 }
2684
2685 int
vfs_unmountbyfsid(fsid_t * fsid,int flags,vfs_context_t ctx)2686 vfs_unmountbyfsid(fsid_t *fsid, int flags, vfs_context_t ctx)
2687 {
2688 mount_t mp;
2689
2690 mp = mount_list_lookupby_fsid(fsid, 0, 1);
2691 if (mp == (mount_t)0) {
2692 return ENOENT;
2693 }
2694 mount_ref(mp, 0);
2695 mount_iterdrop(mp);
2696 /* safedounmount consumes the mount ref */
2697 return safedounmount(mp, flags, ctx);
2698 }
2699
2700 /*
2701 * The mount struct comes with a mount ref which will be consumed.
2702 * Do the actual file system unmount, prevent some common foot shooting.
2703 */
2704 int
safedounmount(struct mount * mp,int flags,vfs_context_t ctx)2705 safedounmount(struct mount *mp, int flags, vfs_context_t ctx)
2706 {
2707 int error;
2708 proc_t p = vfs_context_proc(ctx);
2709
2710 /*
2711 * If the file system is not responding and MNT_NOBLOCK
2712 * is set and not a forced unmount then return EBUSY.
2713 */
2714 if ((mp->mnt_lflag & MNT_LNOTRESP) &&
2715 (flags & MNT_NOBLOCK) && ((flags & MNT_FORCE) == 0)) {
2716 error = EBUSY;
2717 goto out;
2718 }
2719
2720 /*
2721 * Skip authorization in two cases:
2722 * - If the process running the unmount has ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT.
2723 * This entitlement allows non-root processes unmount volumes mounted by
2724 * other processes.
2725 * - If the mount is tagged as permissive and this is not a forced-unmount
2726 * attempt.
2727 */
2728 if (!IOCurrentTaskHasEntitlement(ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT) &&
2729 (!(((mp->mnt_kern_flag & MNTK_PERMIT_UNMOUNT) != 0) && ((flags & MNT_FORCE) == 0)))) {
2730 /*
2731 * Only root, or the user that did the original mount is
2732 * permitted to unmount this filesystem.
2733 */
2734 if ((mp->mnt_vfsstat.f_owner != kauth_cred_getuid(kauth_cred_get())) &&
2735 (error = suser(kauth_cred_get(), &p->p_acflag))) {
2736 goto out;
2737 }
2738 }
2739 /*
2740 * Don't allow unmounting the root file system, or other volumes
2741 * associated with it (for example, the associated VM or DATA mounts) .
2742 */
2743 if ((mp->mnt_flag & MNT_ROOTFS) || (mp->mnt_kern_flag & MNTK_SYSTEM)) {
2744 if (!(mp->mnt_flag & MNT_ROOTFS)) {
2745 printf("attempt to unmount a system mount (%s), will return EBUSY\n",
2746 mp->mnt_vfsstat.f_mntonname);
2747 }
2748 error = EBUSY; /* the root (or associated volumes) is always busy */
2749 goto out;
2750 }
2751
2752 /*
2753 * If the mount is providing the root filesystem's disk image
2754 * (i.e. imageboot), don't allow unmounting
2755 */
2756 if (mp->mnt_kern_flag & MNTK_BACKS_ROOT) {
2757 error = EBUSY;
2758 goto out;
2759 }
2760
2761 return dounmount(mp, flags, 1, ctx);
2762
2763 out:
2764 mount_drop(mp, 0);
2765 return error;
2766 }
2767
2768 /*
2769 * Do the actual file system unmount.
2770 */
2771 int
dounmount(struct mount * mp,int flags,int withref,vfs_context_t ctx)2772 dounmount(struct mount *mp, int flags, int withref, vfs_context_t ctx)
2773 {
2774 vnode_t coveredvp = (vnode_t)0;
2775 int error;
2776 int needwakeup = 0;
2777 int forcedunmount = 0;
2778 int lflags = 0;
2779 struct vnode *devvp = NULLVP;
2780 #if CONFIG_TRIGGERS
2781 proc_t p = vfs_context_proc(ctx);
2782 int did_vflush = 0;
2783 int pflags_save = 0;
2784 #endif /* CONFIG_TRIGGERS */
2785
2786 #if CONFIG_FSE
2787 if (!(flags & MNT_FORCE)) {
2788 fsevent_unmount(mp, ctx); /* has to come first! */
2789 }
2790 #endif
2791
2792 mount_lock(mp);
2793
2794 /*
2795 * If already an unmount in progress just return EBUSY.
2796 * Even a forced unmount cannot override.
2797 */
2798 if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
2799 if (withref != 0) {
2800 mount_drop(mp, 1);
2801 }
2802 mount_unlock(mp);
2803 return EBUSY;
2804 }
2805
2806 if (flags & MNT_FORCE) {
2807 forcedunmount = 1;
2808 mp->mnt_lflag |= MNT_LFORCE;
2809 }
2810
2811 #if CONFIG_TRIGGERS
2812 if (flags & MNT_NOBLOCK && p != kernproc) {
2813 pflags_save = OSBitOrAtomic(P_NOREMOTEHANG, &p->p_flag);
2814 }
2815 #endif
2816
2817 mp->mnt_kern_flag |= MNTK_UNMOUNT;
2818 mp->mnt_lflag |= MNT_LUNMOUNT;
2819 mp->mnt_flag &= ~MNT_ASYNC;
2820 /*
2821 * anyone currently in the fast path that
2822 * trips over the cached rootvp will be
2823 * dumped out and forced into the slow path
2824 * to regenerate a new cached value
2825 */
2826 mp->mnt_realrootvp = NULLVP;
2827 mount_unlock(mp);
2828
2829 if (forcedunmount && (flags & MNT_LNOSUB) == 0) {
2830 /*
2831 * Force unmount any mounts in this filesystem.
2832 * If any unmounts fail - just leave them dangling.
2833 * Avoids recursion.
2834 */
2835 (void) dounmount_submounts(mp, flags | MNT_LNOSUB, ctx);
2836 }
2837
2838 /*
2839 * taking the name_cache_lock exclusively will
2840 * insure that everyone is out of the fast path who
2841 * might be trying to use a now stale copy of
2842 * vp->v_mountedhere->mnt_realrootvp
2843 * bumping mount_generation causes the cached values
2844 * to be invalidated
2845 */
2846 name_cache_lock();
2847 mount_generation++;
2848 name_cache_unlock();
2849
2850
2851 lck_rw_lock_exclusive(&mp->mnt_rwlock);
2852 if (withref != 0) {
2853 mount_drop(mp, 0);
2854 }
2855 error = 0;
2856 if (forcedunmount == 0) {
2857 ubc_umount(mp); /* release cached vnodes */
2858 if ((mp->mnt_flag & MNT_RDONLY) == 0) {
2859 error = VFS_SYNC(mp, MNT_WAIT, ctx);
2860 if (error) {
2861 mount_lock(mp);
2862 mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
2863 mp->mnt_lflag &= ~MNT_LUNMOUNT;
2864 mp->mnt_lflag &= ~MNT_LFORCE;
2865 goto out;
2866 }
2867 }
2868 }
2869
2870 IOBSDMountChange(mp, kIOMountChangeUnmount);
2871
2872 #if CONFIG_TRIGGERS
2873 vfs_nested_trigger_unmounts(mp, flags, ctx);
2874 did_vflush = 1;
2875 #endif
2876 if (forcedunmount) {
2877 lflags |= FORCECLOSE;
2878 }
2879 error = vflush(mp, NULLVP, SKIPSWAP | SKIPSYSTEM | SKIPROOT | lflags);
2880 if ((forcedunmount == 0) && error) {
2881 mount_lock(mp);
2882 mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
2883 mp->mnt_lflag &= ~MNT_LUNMOUNT;
2884 mp->mnt_lflag &= ~MNT_LFORCE;
2885 goto out;
2886 }
2887
2888 /* make sure there are no one in the mount iterations or lookup */
2889 mount_iterdrain(mp);
2890
2891 error = VFS_UNMOUNT(mp, flags, ctx);
2892 if (error) {
2893 mount_iterreset(mp);
2894 mount_lock(mp);
2895 mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
2896 mp->mnt_lflag &= ~MNT_LUNMOUNT;
2897 mp->mnt_lflag &= ~MNT_LFORCE;
2898 goto out;
2899 }
2900
2901 /* increment the operations count */
2902 if (!error) {
2903 OSAddAtomic(1, &vfs_nummntops);
2904 }
2905
2906 if (mp->mnt_devvp && mp->mnt_vtable->vfc_vfsflags & VFC_VFSLOCALARGS) {
2907 /* hold an io reference and drop the usecount before close */
2908 devvp = mp->mnt_devvp;
2909 vnode_getalways(devvp);
2910 vnode_rele(devvp);
2911 VNOP_CLOSE(devvp, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD | FWRITE,
2912 ctx);
2913 vnode_clearmountedon(devvp);
2914 vnode_put(devvp);
2915 }
2916 lck_rw_done(&mp->mnt_rwlock);
2917 mount_list_remove(mp);
2918 lck_rw_lock_exclusive(&mp->mnt_rwlock);
2919
2920 /* mark the mount point hook in the vp but not drop the ref yet */
2921 if ((coveredvp = mp->mnt_vnodecovered) != NULLVP) {
2922 /*
2923 * The covered vnode needs special handling. Trying to get an
2924 * iocount must not block here as this may lead to deadlocks
2925 * if the Filesystem to which the covered vnode belongs is
2926 * undergoing forced unmounts. Since we hold a usecount, the
2927 * vnode cannot be reused (it can, however, still be terminated)
2928 */
2929 vnode_getalways(coveredvp);
2930 vnode_lock_spin(coveredvp);
2931
2932 mp->mnt_crossref++;
2933 coveredvp->v_mountedhere = (struct mount *)0;
2934 CLR(coveredvp->v_flag, VMOUNT | VMOUNTEDHERE);
2935 vnode_unlock(coveredvp);
2936 vnode_put(coveredvp);
2937 }
2938
2939 mount_list_lock();
2940 mp->mnt_vtable->vfc_refcount--;
2941 mount_list_unlock();
2942
2943 cache_purgevfs(mp); /* remove cache entries for this file sys */
2944 vfs_event_signal(NULL, VQ_UNMOUNT, (intptr_t)NULL);
2945 mount_lock(mp);
2946 mp->mnt_lflag |= MNT_LDEAD;
2947
2948 if (mp->mnt_lflag & MNT_LWAIT) {
2949 /*
2950 * do the wakeup here
2951 * in case we block in mount_refdrain
2952 * which will drop the mount lock
2953 * and allow anyone blocked in vfs_busy
2954 * to wakeup and see the LDEAD state
2955 */
2956 mp->mnt_lflag &= ~MNT_LWAIT;
2957 wakeup((caddr_t)mp);
2958 }
2959 mount_refdrain(mp);
2960
2961 /* free disk_conditioner_info structure for this mount */
2962 disk_conditioner_unmount(mp);
2963
2964 out:
2965 if (mp->mnt_lflag & MNT_LWAIT) {
2966 mp->mnt_lflag &= ~MNT_LWAIT;
2967 needwakeup = 1;
2968 }
2969
2970 #if CONFIG_TRIGGERS
2971 if (flags & MNT_NOBLOCK && p != kernproc) {
2972 // Restore P_NOREMOTEHANG bit to its previous value
2973 if ((pflags_save & P_NOREMOTEHANG) == 0) {
2974 OSBitAndAtomic(~((uint32_t) P_NOREMOTEHANG), &p->p_flag);
2975 }
2976 }
2977
2978 /*
2979 * Callback and context are set together under the mount lock, and
2980 * never cleared, so we're safe to examine them here, drop the lock,
2981 * and call out.
2982 */
2983 if (mp->mnt_triggercallback != NULL) {
2984 mount_unlock(mp);
2985 if (error == 0) {
2986 mp->mnt_triggercallback(mp, VTC_RELEASE, mp->mnt_triggerdata, ctx);
2987 } else if (did_vflush) {
2988 mp->mnt_triggercallback(mp, VTC_REPLACE, mp->mnt_triggerdata, ctx);
2989 }
2990 } else {
2991 mount_unlock(mp);
2992 }
2993 #else
2994 mount_unlock(mp);
2995 #endif /* CONFIG_TRIGGERS */
2996
2997 lck_rw_done(&mp->mnt_rwlock);
2998
2999 if (needwakeup) {
3000 wakeup((caddr_t)mp);
3001 }
3002
3003 if (!error) {
3004 if ((coveredvp != NULLVP)) {
3005 vnode_t pvp = NULLVP;
3006
3007 /*
3008 * The covered vnode needs special handling. Trying to
3009 * get an iocount must not block here as this may lead
3010 * to deadlocks if the Filesystem to which the covered
3011 * vnode belongs is undergoing forced unmounts. Since we
3012 * hold a usecount, the vnode cannot be reused
3013 * (it can, however, still be terminated).
3014 */
3015 vnode_getalways(coveredvp);
3016
3017 mount_dropcrossref(mp, coveredvp, 0);
3018 /*
3019 * We'll _try_ to detect if this really needs to be
3020 * done. The coveredvp can only be in termination (or
3021 * terminated) if the coveredvp's mount point is in a
3022 * forced unmount (or has been) since we still hold the
3023 * ref.
3024 */
3025 if (!vnode_isrecycled(coveredvp)) {
3026 pvp = vnode_getparent(coveredvp);
3027 #if CONFIG_TRIGGERS
3028 if (coveredvp->v_resolve) {
3029 vnode_trigger_rearm(coveredvp, ctx);
3030 }
3031 #endif
3032 }
3033
3034 vnode_rele(coveredvp);
3035 vnode_put(coveredvp);
3036 coveredvp = NULLVP;
3037
3038 if (pvp) {
3039 lock_vnode_and_post(pvp, NOTE_WRITE);
3040 vnode_put(pvp);
3041 }
3042 } else if (mp->mnt_flag & MNT_ROOTFS) {
3043 if (nc_smr_enabled) {
3044 vfs_smr_synchronize();
3045 }
3046
3047 mount_lock_destroy(mp);
3048 #if CONFIG_MACF
3049 mac_mount_label_destroy(mp);
3050 #endif
3051 zfree(mount_zone, mp);
3052 } else {
3053 panic("dounmount: no coveredvp");
3054 }
3055 }
3056 return error;
3057 }
3058
3059 /*
3060 * Unmount any mounts in this filesystem.
3061 */
3062 void
dounmount_submounts(struct mount * mp,int flags,vfs_context_t ctx)3063 dounmount_submounts(struct mount *mp, int flags, vfs_context_t ctx)
3064 {
3065 mount_t smp;
3066 fsid_t *fsids, fsid;
3067 int fsids_sz;
3068 int count = 0, i, m = 0;
3069 vnode_t vp;
3070
3071 mount_list_lock();
3072
3073 // Get an array to hold the submounts fsids.
3074 TAILQ_FOREACH(smp, &mountlist, mnt_list)
3075 count++;
3076 fsids_sz = count * sizeof(fsid_t);
3077 fsids = kalloc_data(fsids_sz, Z_NOWAIT);
3078 if (fsids == NULL) {
3079 mount_list_unlock();
3080 goto out;
3081 }
3082 fsids[0] = mp->mnt_vfsstat.f_fsid; // Prime the pump
3083
3084 /*
3085 * Fill the array with submount fsids.
3086 * Since mounts are always added to the tail of the mount list, the
3087 * list is always in mount order.
3088 * For each mount check if the mounted-on vnode belongs to a
3089 * mount that's already added to our array of mounts to be unmounted.
3090 */
3091 for (smp = TAILQ_NEXT(mp, mnt_list); smp; smp = TAILQ_NEXT(smp, mnt_list)) {
3092 vp = smp->mnt_vnodecovered;
3093 if (vp == NULL) {
3094 continue;
3095 }
3096 fsid = vnode_mount(vp)->mnt_vfsstat.f_fsid; // Underlying fsid
3097 for (i = 0; i <= m; i++) {
3098 if (fsids[i].val[0] == fsid.val[0] &&
3099 fsids[i].val[1] == fsid.val[1]) {
3100 fsids[++m] = smp->mnt_vfsstat.f_fsid;
3101 break;
3102 }
3103 }
3104 }
3105 mount_list_unlock();
3106
3107 // Unmount the submounts in reverse order. Ignore errors.
3108 for (i = m; i > 0; i--) {
3109 smp = mount_list_lookupby_fsid(&fsids[i], 0, 1);
3110 if (smp) {
3111 mount_ref(smp, 0);
3112 mount_iterdrop(smp);
3113 (void) dounmount(smp, flags, 1, ctx);
3114 }
3115 }
3116 out:
3117 kfree_data(fsids, fsids_sz);
3118 }
3119
void
mount_dropcrossref(mount_t mp, vnode_t dp, int need_put)
{
	/*
	 * Drop one crossref on 'mp' held via its covered vnode 'dp'.
	 * When the last crossref goes away and dp no longer points at mp,
	 * the mount structure is finally freed.  'need_put' additionally
	 * drops the caller's iocount on dp while the vnode lock is held.
	 */
	vnode_hold(dp);
	vnode_lock(dp);
	mp->mnt_crossref--;

	if (mp->mnt_crossref < 0) {
		panic("mount cross refs -ve");
	}

	/* Last crossref gone and dp no longer covers mp: free the mount */
	if ((mp != dp->v_mountedhere) && (mp->mnt_crossref == 0)) {
		if (need_put) {
			vnode_put_locked(dp);
		}
		vnode_drop_and_unlock(dp);

		if (nc_smr_enabled) {
			vfs_smr_synchronize();
		}

		mount_lock_destroy(mp);
#if CONFIG_MACF
		mac_mount_label_destroy(mp);
#endif
		zfree(mount_zone, mp);
		return;
	}
	if (need_put) {
		vnode_put_locked(dp);
	}
	vnode_drop_and_unlock(dp);
}
3153
3154
3155 /*
3156 * Sync each mounted filesystem.
3157 */
3158 #if DIAGNOSTIC
3159 int syncprt = 0;
3160 #endif
3161
3162 int print_vmpage_stat = 0;
3163
3164 /*
3165 * sync_callback: simple wrapper that calls VFS_SYNC() on volumes
3166 * mounted read-write with the passed waitfor value.
3167 *
3168 * Parameters: mp mount-point descriptor per mounted file-system instance.
3169 * arg user argument (please see below)
3170 *
3171 * User argument is a pointer to 32 bit unsigned integer which describes the
3172 * type of waitfor value to set for calling VFS_SYNC(). If user argument is
3173 * passed as NULL, VFS_SYNC() is called with MNT_NOWAIT set as the default
3174 * waitfor value.
3175 *
3176 * Returns: VFS_RETURNED
3177 */
3178 static int
sync_callback(mount_t mp,void * arg)3179 sync_callback(mount_t mp, void *arg)
3180 {
3181 if ((mp->mnt_flag & MNT_RDONLY) == 0) {
3182 int asyncflag = mp->mnt_flag & MNT_ASYNC;
3183 unsigned waitfor = MNT_NOWAIT;
3184
3185 if (arg) {
3186 waitfor = *(uint32_t*)arg;
3187 }
3188
3189 /* Sanity check for flags - these are the only valid combinations for the flag bits*/
3190 if (waitfor != MNT_WAIT &&
3191 waitfor != (MNT_WAIT | MNT_VOLUME) &&
3192 waitfor != MNT_NOWAIT &&
3193 waitfor != (MNT_NOWAIT | MNT_VOLUME) &&
3194 waitfor != MNT_DWAIT &&
3195 waitfor != (MNT_DWAIT | MNT_VOLUME)) {
3196 panic("Passed inappropriate waitfor %u to "
3197 "sync_callback()", waitfor);
3198 }
3199
3200 mp->mnt_flag &= ~MNT_ASYNC;
3201 (void)VFS_SYNC(mp, waitfor, vfs_context_kernel());
3202 if (asyncflag) {
3203 mp->mnt_flag |= MNT_ASYNC;
3204 }
3205 }
3206
3207 return VFS_RETURNED;
3208 }
3209
3210 /* ARGSUSED */
int
sync(__unused proc_t p, __unused struct sync_args *uap, __unused int32_t *retval)
{
	/*
	 * Flush every mounted, writable file system.  Passing NULL makes
	 * sync_callback() use its default MNT_NOWAIT waitfor value, so this
	 * does not wait for the writes to complete.  Always returns 0.
	 */
	vfs_iterate(LK_NOWAIT, sync_callback, NULL);

	if (print_vmpage_stat) {
		vm_countdirtypages();
	}

#if DIAGNOSTIC
	if (syncprt) {
		vfs_bufstats();
	}
#endif /* DIAGNOSTIC */
	return 0;
}
3227
/* Media-reliability filter used by sync_internal_callback(). */
typedef enum {
	SYNC_ALL = 0,                   /* no filtering: sync every volume */
	SYNC_ONLY_RELIABLE_MEDIA = 1,   /* only local, non-virtual devices */
	SYNC_ONLY_UNRELIABLE_MEDIA = 2  /* only virtual and/or non-local devices */
} sync_type_t;
3233
3234 static int
sync_internal_callback(mount_t mp,void * arg)3235 sync_internal_callback(mount_t mp, void *arg)
3236 {
3237 if (arg) {
3238 int is_reliable = !(mp->mnt_kern_flag & MNTK_VIRTUALDEV) &&
3239 (mp->mnt_flag & MNT_LOCAL);
3240 sync_type_t sync_type = *((sync_type_t *)arg);
3241
3242 if ((sync_type == SYNC_ONLY_RELIABLE_MEDIA) && !is_reliable) {
3243 return VFS_RETURNED;
3244 } else if ((sync_type == SYNC_ONLY_UNRELIABLE_MEDIA) && is_reliable) {
3245 return VFS_RETURNED;
3246 }
3247 }
3248
3249 (void)sync_callback(mp, NULL);
3250
3251 return VFS_RETURNED;
3252 }
3253
int sync_thread_state = 0;      /* SYNC_THREAD_* bits; modified under sync_mtx_lck */
int sync_timeout_seconds = 5;   /* how long sync_internal() waits for the sync thread */

#define SYNC_THREAD_RUN 0x0001          /* work is pending for the sync thread */
#define SYNC_THREAD_RUNNING 0x0002      /* a sync thread currently exists */

#if CONFIG_PHYS_WRITE_ACCT
thread_t pm_sync_thread;        /* identity of the running sync thread */
#endif /* CONFIG_PHYS_WRITE_ACCT */
3263
/*
 * Body of the kernel thread spawned by sync_internal().  Loops as long as
 * new work is posted (SYNC_THREAD_RUN), syncing reliable media first and
 * then unreliable media, and wakes any waiters before exiting.
 */
static void
sync_thread(__unused void *arg, __unused wait_result_t wr)
{
	sync_type_t sync_type;
#if CONFIG_PHYS_WRITE_ACCT
	pm_sync_thread = current_thread();
#endif /* CONFIG_PHYS_WRITE_ACCT */

	lck_mtx_lock(&sync_mtx_lck);
	while (sync_thread_state & SYNC_THREAD_RUN) {
		/* Consume the run request; drop the lock while doing the I/O. */
		sync_thread_state &= ~SYNC_THREAD_RUN;
		lck_mtx_unlock(&sync_mtx_lck);

		/* Sync reliable (local, non-virtual) media first, then the rest. */
		sync_type = SYNC_ONLY_RELIABLE_MEDIA;
		vfs_iterate(LK_NOWAIT, sync_internal_callback, &sync_type);
		sync_type = SYNC_ONLY_UNRELIABLE_MEDIA;
		vfs_iterate(LK_NOWAIT, sync_internal_callback, &sync_type);

		lck_mtx_lock(&sync_mtx_lck);
	}
	/*
	 * This wakeup _has_ to be issued before the lock is released otherwise
	 * we may end up waking up a thread in sync_internal which is
	 * expecting a wakeup from a thread it just created and not from this
	 * thread which is about to exit.
	 */
	wakeup(&sync_thread_state);
	sync_thread_state &= ~SYNC_THREAD_RUNNING;
#if CONFIG_PHYS_WRITE_ACCT
	pm_sync_thread = NULL;
#endif /* CONFIG_PHYS_WRITE_ACCT */
	lck_mtx_unlock(&sync_mtx_lck);

	if (print_vmpage_stat) {
		vm_countdirtypages();
	}

#if DIAGNOSTIC
	if (syncprt) {
		vfs_bufstats();
	}
#endif /* DIAGNOSTIC */
}
3307
/* Last time a sync timeout was logged; throttles the message (see sync_internal). */
struct timeval sync_timeout_last_print = {.tv_sec = 0, .tv_usec = 0};
3309
3310 /*
3311 * An in-kernel sync for power management to call.
3312 * This function always returns within sync_timeout seconds.
3313 */
__private_extern__ int
sync_internal(void)
{
	thread_t thd = NULL;
	int error;
	int thread_created = FALSE;
	/* Bound the wait so power management is never stalled indefinitely. */
	struct timespec ts = {.tv_sec = sync_timeout_seconds, .tv_nsec = 0};

	lck_mtx_lock(&sync_mtx_lck);
	/* Post work for the sync thread; create one if none is running. */
	sync_thread_state |= SYNC_THREAD_RUN;
	if (!(sync_thread_state & SYNC_THREAD_RUNNING)) {
		int kr;

		sync_thread_state |= SYNC_THREAD_RUNNING;
		kr = kernel_thread_start(sync_thread, NULL, &thd);
		if (kr != KERN_SUCCESS) {
			/* Could not create the thread: undo the flag and bail (still returns 0). */
			sync_thread_state &= ~SYNC_THREAD_RUNNING;
			lck_mtx_unlock(&sync_mtx_lck);
			printf("sync_thread failed\n");
			return 0;
		}
		thread_created = TRUE;
	}

	/*
	 * Wait (with timeout ts) for the wakeup sync_thread issues on
	 * &sync_thread_state when it finishes; PDROP releases the mutex.
	 */
	error = msleep((caddr_t)&sync_thread_state, &sync_mtx_lck,
	    (PVFS | PDROP | PCATCH), "sync_thread", &ts);
	if (error) {
		struct timeval now;

		/* Timed out (or interrupted); log at most once every 120 seconds. */
		microtime(&now);
		if (now.tv_sec - sync_timeout_last_print.tv_sec > 120) {
			printf("sync timed out: %d sec\n", sync_timeout_seconds);
			sync_timeout_last_print.tv_sec = now.tv_sec;
		}
	}

	/* Drop the reference kernel_thread_start() gave us on the new thread. */
	if (thread_created) {
		thread_deallocate(thd);
	}

	/* Always reports success, even on timeout. */
	return 0;
} /* end of sync_internal call */
3356
3357 /*
3358 * Change filesystem quotas.
3359 */
3360 #if QUOTA
int
quotactl(proc_t p, struct quotactl_args *uap, __unused int32_t *retval)
{
	struct mount *mp;
	int error, quota_cmd, quota_status = 0;
	caddr_t datap;
	size_t fnamelen;
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();
	struct dqblk my_dqblk = {};

	AUDIT_ARG(uid, uap->uid);
	AUDIT_ARG(cmd, uap->cmd);
	/* Resolve uap->path to find the mount whose quotas are being changed. */
	NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
	    uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	mp = nd.ni_vp->v_mount;
	/* Keep the mount alive after we drop the vnode's iocount. */
	mount_ref(mp, 0);
	vnode_put(nd.ni_vp);
	nameidone(&nd);

#if CONFIG_MACF
	error = mac_mount_check_quotactl(ctx, mp, uap->cmd, uap->uid);
	if (error != 0) {
		goto out;
	}
#endif

	/* copyin any data we will need for downstream code */
	quota_cmd = uap->cmd >> SUBCMDSHIFT;

	switch (quota_cmd) {
	case Q_QUOTAON:
		/* uap->arg specifies a file from which to take the quotas */
		fnamelen = MAXPATHLEN;
		datap = zalloc(ZV_NAMEI);
		error = copyinstr(uap->arg, datap, MAXPATHLEN, &fnamelen);
		break;
	case Q_GETQUOTA:
		/* uap->arg is a pointer to a dqblk structure. */
		datap = (caddr_t) &my_dqblk;
		break;
	case Q_SETQUOTA:
	case Q_SETUSE:
		/* uap->arg is a pointer to a dqblk structure. */
		datap = (caddr_t) &my_dqblk;
		if (proc_is64bit(p)) {
			/* 64-bit userland uses a wider dqblk; munge it down. */
			struct user_dqblk my_dqblk64;
			error = copyin(uap->arg, (caddr_t)&my_dqblk64, sizeof(my_dqblk64));
			if (error == 0) {
				munge_dqblk(&my_dqblk, &my_dqblk64, FALSE);
			}
		} else {
			error = copyin(uap->arg, (caddr_t)&my_dqblk, sizeof(my_dqblk));
		}
		break;
	case Q_QUOTASTAT:
		/* uap->arg is a pointer to an integer */
		datap = (caddr_t) &quota_status;
		break;
	default:
		/* Other subcommands need no argument data; let the FS decide. */
		datap = NULL;
		break;
	} /* switch */

	/* Only issue the VFS call if argument preparation succeeded. */
	if (error == 0) {
		error = VFS_QUOTACTL(mp, uap->cmd, uap->uid, datap, ctx);
	}

	/* Post-processing: free Q_QUOTAON's path buffer, copy out results. */
	switch (quota_cmd) {
	case Q_QUOTAON:
		if (datap != NULL) {
			zfree(ZV_NAMEI, datap);
		}
		break;
	case Q_GETQUOTA:
		/* uap->arg is a pointer to a dqblk structure we need to copy out to */
		if (error == 0) {
			if (proc_is64bit(p)) {
				struct user_dqblk my_dqblk64;

				memset(&my_dqblk64, 0, sizeof(my_dqblk64));
				munge_dqblk(&my_dqblk, &my_dqblk64, TRUE);
				error = copyout((caddr_t)&my_dqblk64, uap->arg, sizeof(my_dqblk64));
			} else {
				error = copyout(datap, uap->arg, sizeof(struct dqblk));
			}
		}
		break;
	case Q_QUOTASTAT:
		/* uap->arg is a pointer to an integer */
		if (error == 0) {
			error = copyout(datap, uap->arg, sizeof(quota_status));
		}
		break;
	default:
		break;
	} /* switch */

out:
	/* Balance the mount_ref() taken above. */
	mount_drop(mp, 0);
	return error;
}
3467 #else
/* Quota support compiled out: the syscall exists but always fails. */
int
quotactl(__unused proc_t p, __unused struct quotactl_args *uap, __unused int32_t *retval)
{
	return EOPNOTSUPP;
}
3473 #endif /* QUOTA */
3474
3475 static int
statfs_internal(proc_t p,struct mount * mp,user_addr_t bufp)3476 statfs_internal(proc_t p, struct mount *mp, user_addr_t bufp)
3477 {
3478 int error;
3479 vfs_context_t ctx = vfs_context_current();
3480
3481 #if CONFIG_MACF
3482 error = mac_mount_check_stat(ctx, mp);
3483 if (error != 0) {
3484 return error;
3485 }
3486 #endif
3487
3488 error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT);
3489 if (error != 0) {
3490 return error;
3491 }
3492
3493 return munge_statfs(mp, &mp->mnt_vfsstat, bufp, NULL, IS_64BIT_PROCESS(p), TRUE);
3494 }
3495
3496 /*
3497 * Get filesystem statistics.
3498 *
3499 * Returns: 0 Success
3500 * namei:???
3501 * vfs_update_vfsstat:???
3502 * munge_statfs:EFAULT
3503 */
3504 /* ARGSUSED */
3505 int
statfs(proc_t p,struct statfs_args * uap,__unused int32_t * retval)3506 statfs(proc_t p, struct statfs_args *uap, __unused int32_t *retval)
3507 {
3508 int error;
3509 struct mount *mp;
3510 struct nameidata nd;
3511 vfs_context_t ctx = vfs_context_current();
3512 vnode_t vp;
3513
3514 NDINIT(&nd, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
3515 UIO_USERSPACE, uap->path, ctx);
3516 error = namei(&nd);
3517 if (error != 0) {
3518 return error;
3519 }
3520 vp = nd.ni_vp;
3521 mp = vp->v_mount;
3522 nameidone(&nd);
3523
3524 error = statfs_internal(p, mp, uap->buf);
3525 vnode_put(vp);
3526
3527 return error;
3528 }
3529
3530 /*
3531 * Get filesystem statistics.
3532 */
3533 /* ARGSUSED */
/* ARGSUSED */
int
fstatfs(proc_t p, struct fstatfs_args *uap, __unused int32_t *retval)
{
	int error;
	vnode_t vp = NULL;
	struct mount *mp;

	AUDIT_ARG(fd, uap->fd);

	/*
	 * If file_vnode() fails, vp stays NULL and the 'out' path skips
	 * file_drop(); if vnode_getwithref() fails, vp is set so the fd
	 * reference is still dropped (no iocount to release in that case).
	 */
	if ((error = file_vnode(uap->fd, &vp)) ||
	    (error = vnode_getwithref(vp))) {
		goto out;
	}

	AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);

	mp = vp->v_mount;
	if (!mp) {
		/* vnode not associated with a mount (e.g. being recycled) */
		error = EBADF;
		goto out_vnode;
	}

	error = statfs_internal(p, mp, uap->buf);

out_vnode:
	vnode_put(vp);

out:
	if (vp != NULL) {
		file_drop(uap->fd);
	}

	return error;
}
3568
3569 void
vfs_get_statfs64(struct mount * mp,struct statfs64 * sfs)3570 vfs_get_statfs64(struct mount *mp, struct statfs64 *sfs)
3571 {
3572 struct vfsstatfs *vsfs = &mp->mnt_vfsstat;
3573
3574 bzero(sfs, sizeof(*sfs));
3575
3576 sfs->f_bsize = vsfs->f_bsize;
3577 sfs->f_iosize = (int32_t)vsfs->f_iosize;
3578 sfs->f_blocks = vsfs->f_blocks;
3579 sfs->f_bfree = vsfs->f_bfree;
3580 sfs->f_bavail = vsfs->f_bavail;
3581 sfs->f_files = vsfs->f_files;
3582 sfs->f_ffree = vsfs->f_ffree;
3583 sfs->f_fsid = vsfs->f_fsid;
3584 sfs->f_owner = vsfs->f_owner;
3585 sfs->f_type = mp->mnt_vtable->vfc_typenum;
3586 sfs->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
3587 sfs->f_fssubtype = vsfs->f_fssubtype;
3588 sfs->f_flags_ext = 0;
3589 if (mp->mnt_kern_flag & MNTK_SYSTEMDATA) {
3590 sfs->f_flags_ext |= MNT_EXT_ROOT_DATA_VOL;
3591 }
3592 if (mp->mnt_kern_flag & MNTK_FSKIT) {
3593 sfs->f_flags_ext |= MNT_EXT_FSKIT;
3594 }
3595 vfs_getfstypename(mp, sfs->f_fstypename, MFSTYPENAMELEN);
3596 strlcpy(&sfs->f_mntonname[0], &vsfs->f_mntonname[0], MAXPATHLEN);
3597 strlcpy(&sfs->f_mntfromname[0], &vsfs->f_mntfromname[0], MAXPATHLEN);
3598 }
3599
3600 /*
3601 * Get file system statistics in 64-bit mode
3602 */
int
statfs64(__unused struct proc *p, struct statfs64_args *uap, __unused int32_t *retval)
{
	struct mount *mp;
	int error;
	struct nameidata *ndp;
	struct statfs64 *sfsp;
	vfs_context_t ctxp = vfs_context_current();
	vnode_t vp;
	/*
	 * nameidata and statfs64 are both large; heap-allocate them together
	 * rather than placing them on the kernel stack.
	 */
	struct {
		struct nameidata nd;
		struct statfs64 sfs;
	} *__nameidata_statfs64;

	__nameidata_statfs64 = kalloc_type(typeof(*__nameidata_statfs64),
	    Z_WAITOK);
	ndp = &__nameidata_statfs64->nd;

	NDINIT(ndp, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctxp);
	error = namei(ndp);
	if (error != 0) {
		goto out;
	}
	vp = ndp->ni_vp;
	mp = vp->v_mount;
	nameidone(ndp);

#if CONFIG_MACF
	error = mac_mount_check_stat(ctxp, mp);
	if (error != 0) {
		vnode_put(vp);
		goto out;
	}
#endif

	/* Refresh the cached statistics before reporting them. */
	error = vfs_update_vfsstat(mp, ctxp, VFS_USER_EVENT);
	if (error != 0) {
		vnode_put(vp);
		goto out;
	}

	sfsp = &__nameidata_statfs64->sfs;
	vfs_get_statfs64(mp, sfsp);
	if ((mp->mnt_kern_flag & MNTK_SYSTEMDATA) &&
	    (p->p_vfs_iopolicy & P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME)) {
		/* This process does not want to see a seperate data volume mountpoint */
		strlcpy(&sfsp->f_mntonname[0], "/", sizeof("/"));
	}
	error = copyout(sfsp, uap->buf, sizeof(*sfsp));
	vnode_put(vp);

out:
	kfree_type(typeof(*__nameidata_statfs64), __nameidata_statfs64);

	return error;
}
3660
3661 /*
3662 * Get file system statistics in 64-bit mode
3663 */
3664 int
fstatfs64(__unused struct proc * p,struct fstatfs64_args * uap,__unused int32_t * retval)3665 fstatfs64(__unused struct proc *p, struct fstatfs64_args *uap, __unused int32_t *retval)
3666 {
3667 struct vnode *vp;
3668 struct mount *mp;
3669 struct statfs64 sfs;
3670 int error;
3671
3672 AUDIT_ARG(fd, uap->fd);
3673
3674 if ((error = file_vnode(uap->fd, &vp))) {
3675 return error;
3676 }
3677
3678 error = vnode_getwithref(vp);
3679 if (error) {
3680 file_drop(uap->fd);
3681 return error;
3682 }
3683
3684 AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
3685
3686 mp = vp->v_mount;
3687 if (!mp) {
3688 error = EBADF;
3689 goto out;
3690 }
3691
3692 #if CONFIG_MACF
3693 error = mac_mount_check_stat(vfs_context_current(), mp);
3694 if (error != 0) {
3695 goto out;
3696 }
3697 #endif
3698
3699 if ((error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)) != 0) {
3700 goto out;
3701 }
3702
3703 vfs_get_statfs64(mp, &sfs);
3704 if ((mp->mnt_kern_flag & MNTK_SYSTEMDATA) &&
3705 (p->p_vfs_iopolicy & P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME)) {
3706 /* This process does not want to see a seperate data volume mountpoint */
3707 strlcpy(&sfs.f_mntonname[0], "/", sizeof("/"));
3708 }
3709 error = copyout(&sfs, uap->buf, sizeof(sfs));
3710
3711 out:
3712 file_drop(uap->fd);
3713 vnode_put(vp);
3714
3715 return error;
3716 }
3717
/* Shared state threaded through the getfsstat/getfsstat64 vfs_iterate callbacks. */
struct getfsstat_struct {
	user_addr_t sfsp;       /* next user-space slot to fill; 0 => just count mounts */
	user_addr_t *mp;        /* optional array of user MAC-label buffers (may be NULL) */
	int count;              /* number of mounts seen so far */
	int maxcount;           /* capacity of the user buffer, in entries */
	int flags;              /* caller's MNT_WAIT/MNT_NOWAIT/MNT_DWAIT flags */
	int error;              /* first error encountered by the callback, if any */
};
3726
3727
/*
 * Per-mount callback for __mac_getfsstat(): copies one mount's statistics
 * (and optionally its MAC label) out to user space.  Mounts are still
 * counted once the user buffer is full, so the caller can report the
 * total number of mounts.
 */
static int
getfsstat_callback(mount_t mp, void * arg)
{
	struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
	struct vfsstatfs *sp;
	int error, my_size;
	vfs_context_t ctx = vfs_context_current();

	if (fstp->sfsp && fstp->count < fstp->maxcount) {
#if CONFIG_MACF
		error = mac_mount_check_stat(ctx, mp);
		if (error != 0) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
#endif
		sp = &mp->mnt_vfsstat;
		/*
		 * If MNT_NOWAIT is specified, do not refresh the
		 * fsstat cache. MNT_WAIT/MNT_DWAIT overrides MNT_NOWAIT.
		 */
		if ((mp->mnt_lflag & MNT_LDEAD) ||
		    (((fstp->flags & MNT_NOWAIT) == 0 || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
		    (!(mp->mnt_lflag & MNT_LUNMOUNT)) &&
		    (error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT)))) {
			/* Dead mounts, or mounts whose refresh failed, are skipped (not counted). */
			KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
			return VFS_RETURNED;
		}

		/*
		 * Need to handle LP64 version of struct statfs
		 */
		error = munge_statfs(mp, sp, fstp->sfsp, &my_size, IS_64BIT_PROCESS(vfs_context_proc(ctx)), FALSE);
		if (error) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
		/* munge_statfs reports how many bytes it wrote; advance past them. */
		fstp->sfsp += my_size;

		if (fstp->mp) {
#if CONFIG_MACF
			error = mac_mount_label_get(mp, *fstp->mp);
			if (error) {
				fstp->error = error;
				return VFS_RETURNED_DONE;
			}
#endif
			fstp->mp++;
		}
	}
	/* Counted even when the buffer is full, so the total is reported. */
	fstp->count++;
	return VFS_RETURNED;
}
3781
3782 /*
3783 * Get statistics on all filesystems.
3784 */
3785 int
getfsstat(__unused proc_t p,struct getfsstat_args * uap,int * retval)3786 getfsstat(__unused proc_t p, struct getfsstat_args *uap, int *retval)
3787 {
3788 struct __mac_getfsstat_args muap;
3789
3790 muap.buf = uap->buf;
3791 muap.bufsize = uap->bufsize;
3792 muap.mac = USER_ADDR_NULL;
3793 muap.macsize = 0;
3794 muap.flags = uap->flags;
3795
3796 return __mac_getfsstat(p, &muap, retval);
3797 }
3798
3799 /*
3800 * __mac_getfsstat: Get MAC-related file system statistics
3801 *
3802 * Parameters: p (ignored)
3803 * uap User argument descriptor (see below)
3804 * retval Count of file system statistics (N stats)
3805 *
3806 * Indirect: uap->bufsize Buffer size
3807 * uap->macsize MAC info size
3808 * uap->buf Buffer where information will be returned
3809 * uap->mac MAC info
3810 * uap->flags File system flags
3811 *
3812 *
3813 * Returns: 0 Success
3814 * !0 Not success
3815 *
3816 */
int
__mac_getfsstat(__unused proc_t p, struct __mac_getfsstat_args *uap, int *retval)
{
	user_addr_t sfsp;
	user_addr_t *mp;
	size_t count, maxcount, bufsize, macsize;
	struct getfsstat_struct fst;

	/* Reject sizes that would overflow the int fields used below. */
	if ((unsigned)uap->bufsize > INT_MAX || (unsigned)uap->macsize > INT_MAX) {
		return EINVAL;
	}

	bufsize = (size_t) uap->bufsize;
	macsize = (size_t) uap->macsize;

	/* Entry size differs between 32- and 64-bit callers. */
	if (IS_64BIT_PROCESS(p)) {
		maxcount = bufsize / sizeof(struct user64_statfs);
	} else {
		maxcount = bufsize / sizeof(struct user32_statfs);
	}
	sfsp = uap->buf;
	count = 0;

	mp = NULL;

#if CONFIG_MACF
	if (uap->mac != USER_ADDR_NULL) {
		u_int32_t *mp0;
		int error;
		unsigned int i;

		/* The MAC pointer array must have exactly one entry per statfs slot. */
		count = (macsize / (IS_64BIT_PROCESS(p) ? 8 : 4));
		if (count != maxcount) {
			return EINVAL;
		}

		/* Copy in the array */
		mp0 = kalloc_data(macsize, Z_WAITOK);
		if (mp0 == NULL) {
			return ENOMEM;
		}

		error = copyin(uap->mac, mp0, macsize);
		if (error) {
			kfree_data(mp0, macsize);
			return error;
		}

		/* Normalize to an array of user_addr_t */
		mp = kalloc_data(count * sizeof(user_addr_t), Z_WAITOK);
		if (mp == NULL) {
			kfree_data(mp0, macsize);
			return ENOMEM;
		}

		for (i = 0; i < count; i++) {
			if (IS_64BIT_PROCESS(p)) {
				mp[i] = ((user_addr_t *)mp0)[i];
			} else {
				mp[i] = (user_addr_t)mp0[i];
			}
		}
		kfree_data(mp0, macsize);
	}
#endif


	/* Walk every mount (including ones mid-unmount) and fill the buffer. */
	fst.sfsp = sfsp;
	fst.mp = mp;
	fst.flags = uap->flags;
	fst.count = 0;
	fst.error = 0;
	fst.maxcount = (int)maxcount;


	vfs_iterate(VFS_ITERATE_NOSKIP_UNMOUNT, getfsstat_callback, &fst);

	if (mp) {
		kfree_data(mp, count * sizeof(user_addr_t));
	}

	if (fst.error) {
		KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
		return fst.error;
	}

	/* When a buffer was supplied, report at most its capacity. */
	if (fst.sfsp && fst.count > fst.maxcount) {
		*retval = fst.maxcount;
	} else {
		*retval = fst.count;
	}
	return 0;
}
3910
/*
 * Per-mount callback for getfsstat64(): copies one mount's statistics,
 * in the fixed 64-bit statfs64 layout, out to user space.
 */
static int
getfsstat64_callback(mount_t mp, void * arg)
{
	struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
	struct vfsstatfs *sp;
	struct statfs64 sfs;
	int error;

	if (fstp->sfsp && fstp->count < fstp->maxcount) {
#if CONFIG_MACF
		error = mac_mount_check_stat(vfs_context_current(), mp);
		if (error != 0) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
#endif
		sp = &mp->mnt_vfsstat;
		/*
		 * If MNT_NOWAIT is specified, do not refresh the fsstat
		 * cache. MNT_WAIT overrides MNT_NOWAIT.
		 *
		 * We treat MNT_DWAIT as MNT_WAIT for all instances of
		 * getfsstat, since the constants are out of the same
		 * namespace.
		 */
		if ((mp->mnt_lflag & MNT_LDEAD) ||
		    ((((fstp->flags & MNT_NOWAIT) == 0) || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
		    (!(mp->mnt_lflag & MNT_LUNMOUNT)) &&
		    (error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)))) {
			/* Dead mounts, or mounts whose refresh failed, are skipped (not counted). */
			KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
			return VFS_RETURNED;
		}

		vfs_get_statfs64(mp, &sfs);
		error = copyout(&sfs, fstp->sfsp, sizeof(sfs));
		if (error) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
		fstp->sfsp += sizeof(sfs);
	}
	/* Counted even when the buffer is full, so the total is reported. */
	fstp->count++;
	return VFS_RETURNED;
}
3955
3956 /*
3957 * Get statistics on all file systems in 64 bit mode.
3958 */
3959 int
getfsstat64(__unused proc_t p,struct getfsstat64_args * uap,int * retval)3960 getfsstat64(__unused proc_t p, struct getfsstat64_args *uap, int *retval)
3961 {
3962 user_addr_t sfsp;
3963 int count, maxcount;
3964 struct getfsstat_struct fst;
3965
3966 maxcount = uap->bufsize / sizeof(struct statfs64);
3967
3968 sfsp = uap->buf;
3969 count = 0;
3970
3971 fst.sfsp = sfsp;
3972 fst.flags = uap->flags;
3973 fst.count = 0;
3974 fst.error = 0;
3975 fst.maxcount = maxcount;
3976
3977 vfs_iterate(VFS_ITERATE_NOSKIP_UNMOUNT, getfsstat64_callback, &fst);
3978
3979 if (fst.error) {
3980 KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
3981 return fst.error;
3982 }
3983
3984 if (fst.sfsp && fst.count > fst.maxcount) {
3985 *retval = fst.maxcount;
3986 } else {
3987 *retval = fst.count;
3988 }
3989
3990 return 0;
3991 }
3992
3993 /*
3994 * gets the associated vnode with the file descriptor passed.
3995 * as input
3996 *
3997 * INPUT
3998 * ctx - vfs context of caller
3999 * fd - file descriptor for which vnode is required.
4000 * vpp - Pointer to pointer to vnode to be returned.
4001 *
4002 * The vnode is returned with an iocount so any vnode obtained
4003 * by this call needs a vnode_put
4004 *
4005 */
4006 int
vnode_getfromfd(vfs_context_t ctx,int fd,vnode_t * vpp)4007 vnode_getfromfd(vfs_context_t ctx, int fd, vnode_t *vpp)
4008 {
4009 int error;
4010 vnode_t vp;
4011 struct fileproc *fp;
4012 proc_t p = vfs_context_proc(ctx);
4013
4014 *vpp = NULLVP;
4015
4016 error = fp_getfvp(p, fd, &fp, &vp);
4017 if (error) {
4018 return error;
4019 }
4020
4021 error = vnode_getwithref(vp);
4022 if (error) {
4023 (void)fp_drop(p, fd, fp, 0);
4024 return error;
4025 }
4026
4027 (void)fp_drop(p, fd, fp, 0);
4028 *vpp = vp;
4029 return error;
4030 }
4031
4032 /*
4033 * Wrapper function around namei to start lookup from a directory
4034 * specified by a file descriptor ni_dirfd.
4035 *
4036 * In addition to all the errors returned by namei, this call can
4037 * return ENOTDIR if the file descriptor does not refer to a directory.
4038 * and EBADF if the file descriptor is not valid.
4039 */
int
nameiat(struct nameidata *ndp, int dirfd)
{
	/*
	 * Only redirect the lookup when: a real dirfd was given, this is not
	 * a continued lookup, and the caller hasn't already supplied a
	 * starting directory via USEDVP.
	 */
	if ((dirfd != AT_FDCWD) &&
	    !(ndp->ni_flag & NAMEI_CONTLOOKUP) &&
	    !(ndp->ni_cnd.cn_flags & USEDVP)) {
		int error = 0;
		char c;

		/* Peek at the first byte of the path to detect absolute paths. */
		if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
			error = copyin(ndp->ni_dirp, &c, sizeof(char));
			if (error) {
				return error;
			}
		} else {
			c = *((char *)(ndp->ni_dirp));
		}

		/* Absolute paths ignore dirfd and fall through to plain namei(). */
		if (c != '/') {
			vnode_t dvp_at;

			error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
			    &dvp_at);
			if (error) {
				return error;
			}

			if (vnode_vtype(dvp_at) != VDIR) {
				vnode_put(dvp_at);
				return ENOTDIR;
			}

			/* USEDVP tells namei to start the lookup at ni_dvp. */
			ndp->ni_dvp = dvp_at;
			ndp->ni_cnd.cn_flags |= USEDVP;
			error = namei(ndp);
			ndp->ni_cnd.cn_flags &= ~USEDVP;
			vnode_put(dvp_at);
			return error;
		}
	}

	return namei(ndp);
}
4083
4084 /*
4085 * Change current working directory to a given file descriptor.
4086 */
4087 /* ARGSUSED */
int
fchdir(proc_t p, vfs_context_t ctx, int fd, bool per_thread)
{
	vnode_t vp;
	vnode_t tdp;
	vnode_t tvp;
	struct mount *mp;
	int error, should_put = 1;

	AUDIT_ARG(fd, fd);
	if (per_thread && fd == -1) {
		/*
		 * Switching back from per-thread to per process CWD; verify we
		 * in fact have one before proceeding. The only success case
		 * for this code path is to return 0 preemptively after zapping
		 * the thread structure contents.
		 */
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = NULLVP;
			if (tvp != NULLVP) {
				vnode_rele(tvp);
				return 0;
			}
		}
		return EBADF;
	}

	if ((error = file_vnode(fd, &vp))) {
		return error;
	}
	if ((error = vnode_getwithref(vp))) {
		file_drop(fd);
		return error;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	if (vp->v_type != VDIR) {
		error = ENOTDIR;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_chdir(ctx, vp);
	if (error) {
		goto out;
	}
#endif
	/* The caller must have search permission on the directory. */
	error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
	if (error) {
		goto out;
	}

	/*
	 * If the directory is a mount point, descend into the root of the
	 * mounted file system (repeatedly, in case of stacked mounts).
	 * VFS_ROOT returns tdp with its own iocount, so the old vp can be
	 * released before switching to it.
	 */
	while (!error && (mp = vp->v_mountedhere) != NULL) {
		if (vfs_busy(mp, LK_NOWAIT)) {
			error = EACCES;
			goto out;
		}
		error = VFS_ROOT(mp, &tdp, ctx);
		vfs_unbusy(mp);
		if (error) {
			break;
		}
		vnode_put(vp);
		vp = tdp;
	}
	if (error) {
		goto out;
	}
	/* Trade the transient iocount for a long-lived usecount on the new cwd. */
	if ((error = vnode_ref(vp))) {
		goto out;
	}
	vnode_put(vp);
	should_put = 0;

	if (per_thread) {
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = vp;
			/* Mark the process as having at least one per-thread cwd. */
			OSBitOrAtomic(P_THCWD, &p->p_flag);
		} else {
			vnode_rele(vp);
			error = ENOENT;
			goto out;
		}
	} else {
		/* Swap the process cwd under the dirs + fd locks (see lookup). */
		proc_dirs_lock_exclusive(p);
		proc_fdlock(p);
		tvp = p->p_fd.fd_cdir;
		p->p_fd.fd_cdir = vp;
		proc_fdunlock(p);
		proc_dirs_unlock_exclusive(p);
	}

	/* Release the usecount held by the previous cwd, if any. */
	if (tvp) {
		vnode_rele(tvp);
	}

out:
	if (should_put) {
		vnode_put(vp);
	}
	file_drop(fd);

	return error;
}
4199
/* fchdir(2): change the process-wide working directory to fd. */
int
sys_fchdir(proc_t p, struct fchdir_args *uap, __unused int32_t *retval)
{
	return fchdir(p, vfs_context_current(), uap->fd, false);
}
4205
/* Per-thread fchdir variant; fd == -1 reverts the thread to the process cwd. */
int
__pthread_fchdir(proc_t p, struct __pthread_fchdir_args *uap, __unused int32_t *retval)
{
	return fchdir(p, vfs_context_current(), uap->fd, true);
}
4211
4212
4213 /*
4214 * Change current working directory (".").
4215 *
4216 * Returns: 0 Success
4217 * change_dir:ENOTDIR
4218 * change_dir:???
4219 * vnode_ref:ENOENT No such file or directory
4220 */
4221 /* ARGSUSED */
/* ARGSUSED */
int
chdir_internal(proc_t p, vfs_context_t ctx, struct nameidata *ndp, int per_thread)
{
	int error;
	vnode_t tvp;

	/* Resolve the path and verify it is a searchable directory. */
	error = change_dir(ndp, ctx);
	if (error) {
		return error;
	}
	/* Trade the transient iocount for a long-lived usecount on the new cwd. */
	if ((error = vnode_ref(ndp->ni_vp))) {
		vnode_put(ndp->ni_vp);
		return error;
	}
	/*
	 * drop the iocount we picked up in change_dir
	 */
	vnode_put(ndp->ni_vp);

	if (per_thread) {
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = ndp->ni_vp;
			/* Mark the process as having at least one per-thread cwd. */
			OSBitOrAtomic(P_THCWD, &p->p_flag);
		} else {
			vnode_rele(ndp->ni_vp);
			return ENOENT;
		}
	} else {
		/* Swap the process cwd under the dirs + fd locks (see lookup). */
		proc_dirs_lock_exclusive(p);
		proc_fdlock(p);
		tvp = p->p_fd.fd_cdir;
		p->p_fd.fd_cdir = ndp->ni_vp;
		proc_fdunlock(p);
		proc_dirs_unlock_exclusive(p);
	}

	/* Release the usecount held by the previous cwd, if any. */
	if (tvp) {
		vnode_rele(tvp);
	}

	return 0;
}
4267
4268
4269 /*
4270 * Change current working directory (".").
4271 *
4272 * Returns: 0 Success
4273 * chdir_internal:ENOTDIR
4274 * chdir_internal:ENOENT No such file or directory
4275 * chdir_internal:???
4276 */
4277 /* ARGSUSED */
4278 static int
common_chdir(proc_t p,struct chdir_args * uap,int per_thread)4279 common_chdir(proc_t p, struct chdir_args *uap, int per_thread)
4280 {
4281 struct nameidata nd;
4282 vfs_context_t ctx = vfs_context_current();
4283
4284 NDINIT(&nd, LOOKUP, OP_CHDIR, FOLLOW | AUDITVNPATH1,
4285 UIO_USERSPACE, uap->path, ctx);
4286
4287 return chdir_internal(p, ctx, &nd, per_thread);
4288 }
4289
4290
4291 /*
4292 * chdir
4293 *
4294 * Change current working directory (".") for the entire process
4295 *
4296 * Parameters: p Process requesting the call
4297 * uap User argument descriptor (see below)
4298 * retval (ignored)
4299 *
4300 * Indirect parameters: uap->path Directory path
4301 *
4302 * Returns: 0 Success
4303 * common_chdir: ENOTDIR
4304 * common_chdir: ENOENT No such file or directory
4305 * common_chdir: ???
4306 *
4307 */
/* chdir(2): process-wide change of working directory. */
int
sys_chdir(proc_t p, struct chdir_args *uap, __unused int32_t *retval)
{
	/* chdir_args and __pthread_chdir_args share the same layout. */
	return common_chdir(p, (void *)uap, 0);
}
4313
4314 /*
4315 * __pthread_chdir
4316 *
4317 * Change current working directory (".") for a single thread
4318 *
4319 * Parameters: p Process requesting the call
4320 * uap User argument descriptor (see below)
4321 * retval (ignored)
4322 *
4323 * Indirect parameters: uap->path Directory path
4324 *
4325 * Returns: 0 Success
4326 * common_chdir: ENOTDIR
4327 * common_chdir: ENOENT No such file or directory
4328 * common_chdir: ???
4329 *
4330 */
/* Per-thread chdir variant. */
int
__pthread_chdir(proc_t p, struct __pthread_chdir_args *uap, __unused int32_t *retval)
{
	/* chdir_args and __pthread_chdir_args share the same layout. */
	return common_chdir(p, (void *)uap, 1);
}
4336
4337
4338 /*
4339 * Change notion of root (``/'') directory.
4340 */
4341 /* ARGSUSED */
/* ARGSUSED */
int
chroot(proc_t p, struct chroot_args *uap, __unused int32_t *retval)
{
	struct filedesc *fdp = &p->p_fd;
	int error;
	struct nameidata nd;
	vnode_t tvp;
	vfs_context_t ctx = vfs_context_current();

	/* chroot requires superuser privileges. */
	if ((error = suser(kauth_cred_get(), &p->p_acflag))) {
		return error;
	}

	NDINIT(&nd, LOOKUP, OP_CHROOT, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	/* Resolve the path and verify it is a searchable directory. */
	error = change_dir(&nd, ctx);
	if (error) {
		return error;
	}

#if CONFIG_MACF
	error = mac_vnode_check_chroot(ctx, nd.ni_vp,
	    &nd.ni_cnd);
	if (error) {
		vnode_put(nd.ni_vp);
		return error;
	}
#endif

	/* Trade the transient iocount for a long-lived usecount on the new root. */
	if ((error = vnode_ref(nd.ni_vp))) {
		vnode_put(nd.ni_vp);
		return error;
	}
	vnode_put(nd.ni_vp);

	/*
	 * This lock provides the guarantee that as long as you hold the lock
	 * fdp->fd_rdir has a usecount on it. This is used to take an iocount
	 * on a referenced vnode in namei when determining the rootvnode for
	 * a process.
	 */
	/* needed for synchronization with lookup */
	proc_dirs_lock_exclusive(p);
	/* needed for setting the flag and other activities on the fd itself */
	proc_fdlock(p);
	tvp = fdp->fd_rdir;
	fdp->fd_rdir = nd.ni_vp;
	fdt_flag_set(fdp, FD_CHROOT);
	proc_fdunlock(p);
	proc_dirs_unlock_exclusive(p);

	/* Release the usecount held by the previous root, if any. */
	if (tvp != NULL) {
		vnode_rele(tvp);
	}

	return 0;
}
4399
/* Size of pivot_root()'s on-stack path buffers before falling back to ZV_NAMEI. */
#define PATHSTATICBUFLEN 256
#define PIVOT_ROOT_ENTITLEMENT \
	"com.apple.private.vfs.pivot-root"
4403
4404 #if defined(XNU_TARGET_OS_OSX)
int
pivot_root(proc_t p, struct pivot_root_args *uap, __unused int *retval)
{
	int error;
	char new_rootfs_path_before[PATHSTATICBUFLEN] = {0};
	char old_rootfs_path_after[PATHSTATICBUFLEN] = {0};
	char *new_rootfs_path_before_buf = NULL;
	char *old_rootfs_path_after_buf = NULL;
	char *incoming = NULL;
	char *outgoing = NULL;
	vnode_t incoming_rootvp = NULLVP;
	size_t bytes_copied;

	/*
	 * XXX : Additional restrictions needed
	 * - perhaps callable only once.
	 */
	if ((error = suser(kauth_cred_get(), &p->p_acflag))) {
		return error;
	}

	/*
	 * pivot_root can be executed by launchd only.
	 * Enforce entitlement.
	 */
	if ((proc_getpid(p) != 1) || !IOCurrentTaskHasEntitlement(PIVOT_ROOT_ENTITLEMENT)) {
		return EPERM;
	}

	/*
	 * Copy in the incoming root path; if it overflows the small stack
	 * buffer, retry into a MAXPATHLEN heap buffer from the namei zone.
	 */
	error = copyinstr(uap->new_rootfs_path_before, &new_rootfs_path_before[0], PATHSTATICBUFLEN, &bytes_copied);
	if (error == ENAMETOOLONG) {
		new_rootfs_path_before_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
		error = copyinstr(uap->new_rootfs_path_before, new_rootfs_path_before_buf, MAXPATHLEN, &bytes_copied);
	}

	if (error) {
		goto out;
	}

	/* Same two-step copyin for the path where the old root gets remounted. */
	error = copyinstr(uap->old_rootfs_path_after, &old_rootfs_path_after[0], PATHSTATICBUFLEN, &bytes_copied);
	if (error == ENAMETOOLONG) {
		old_rootfs_path_after_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
		error = copyinstr(uap->old_rootfs_path_after, old_rootfs_path_after_buf, MAXPATHLEN, &bytes_copied);
	}
	if (error) {
		goto out;
	}

	/* Point at whichever copy (stack or heap) actually holds each path. */
	if (new_rootfs_path_before_buf) {
		incoming = new_rootfs_path_before_buf;
	} else {
		incoming = &new_rootfs_path_before[0];
	}

	if (old_rootfs_path_after_buf) {
		outgoing = old_rootfs_path_after_buf;
	} else {
		outgoing = &old_rootfs_path_after[0];
	}

	/*
	 * The proposed incoming FS MUST be authenticated (i.e. not a chunklist DMG).
	 * Userland is not allowed to pivot to an image.
	 */
	error = vnode_lookup(incoming, 0, &incoming_rootvp, vfs_context_kernel());
	if (error) {
		goto out;
	}
	error = VNOP_IOCTL(incoming_rootvp, FSIOC_KERNEL_ROOTAUTH, NULL, 0, vfs_context_kernel());
	if (error) {
		goto out;
	}

	error = vfs_switch_root(incoming, outgoing, VFSSR_VIRTUALDEV_PROHIBITED);

out:
	/* Unified cleanup: drop the iocount and free any heap path buffers. */
	if (incoming_rootvp != NULLVP) {
		vnode_put(incoming_rootvp);
		incoming_rootvp = NULLVP;
	}

	if (old_rootfs_path_after_buf) {
		zfree(ZV_NAMEI, old_rootfs_path_after_buf);
	}

	if (new_rootfs_path_before_buf) {
		zfree(ZV_NAMEI, new_rootfs_path_before_buf);
	}

	return error;
}
4496 #else
int
pivot_root(proc_t p, __unused struct pivot_root_args *uap, int *retval)
{
	/* pivot_root(2) is only implemented for macOS targets. */
	return nosys(p, NULL, retval);
}
4502 #endif /* XNU_TARGET_OS_OSX */
4503
4504 /*
4505 * Common routine for chroot and chdir.
4506 *
4507 * Returns: 0 Success
4508 * ENOTDIR Not a directory
4509 * namei:??? [anything namei can return]
4510 * vnode_authorize:??? [anything vnode_authorize can return]
4511 */
4512 static int
change_dir(struct nameidata * ndp,vfs_context_t ctx)4513 change_dir(struct nameidata *ndp, vfs_context_t ctx)
4514 {
4515 vnode_t vp;
4516 int error;
4517
4518 if ((error = namei(ndp))) {
4519 return error;
4520 }
4521 nameidone(ndp);
4522 vp = ndp->ni_vp;
4523
4524 if (vp->v_type != VDIR) {
4525 vnode_put(vp);
4526 return ENOTDIR;
4527 }
4528
4529 #if CONFIG_MACF
4530 error = mac_vnode_check_chdir(ctx, vp);
4531 if (error) {
4532 vnode_put(vp);
4533 return error;
4534 }
4535 #endif
4536
4537 error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
4538 if (error) {
4539 vnode_put(vp);
4540 return error;
4541 }
4542
4543 return error;
4544 }
4545
/*
 * Allocate the vnode data (for directories) associated with the file glob.
 */
4549 struct fd_vn_data *
fg_vn_data_alloc(void)4550 fg_vn_data_alloc(void)
4551 {
4552 struct fd_vn_data *fvdata;
4553
4554 /* Allocate per fd vnode data */
4555 fvdata = kalloc_type(struct fd_vn_data, Z_WAITOK | Z_ZERO);
4556 lck_mtx_init(&fvdata->fv_lock, &fd_vn_lck_grp, &fd_vn_lck_attr);
4557 return fvdata;
4558 }
4559
4560 /*
4561 * Free the vnode data (for directories) associated with the file glob.
4562 */
4563 void
fg_vn_data_free(void * fgvndata)4564 fg_vn_data_free(void *fgvndata)
4565 {
4566 struct fd_vn_data *fvdata = (struct fd_vn_data *)fgvndata;
4567
4568 kfree_data(fvdata->fv_buf, fvdata->fv_bufallocsiz);
4569 lck_mtx_destroy(&fvdata->fv_lock, &fd_vn_lck_grp);
4570 kfree_type(struct fd_vn_data, fvdata);
4571 }
4572
4573 /*
4574 * Check permissions, allocate an open file structure,
4575 * and call the device open routine if any.
4576 *
4577 * Returns: 0 Success
4578 * EINVAL
4579 * EINTR
4580 * falloc:ENFILE
4581 * falloc:EMFILE
4582 * falloc:ENOMEM
4583 * vn_open_auth:???
4584 * dupfdopen:???
4585 * VNOP_ADVLOCK:???
4586 * vnode_setsize:???
4587 *
4588 * XXX Need to implement uid, gid
4589 */
int
open1(vfs_context_t ctx, struct nameidata *ndp, int uflags,
    struct vnode_attr *vap, fp_initfn_t fp_init, void *initarg, int32_t *retval, int authfd)
{
	proc_t p = vfs_context_proc(ctx);
	kauth_cred_t p_cred = current_cached_proc_cred(PROC_NULL);
	uthread_t uu = get_bsdthread_info(vfs_context_thread(ctx));
	struct fileproc *fp;
	vnode_t vp;
	int flags, oflags, amode;
	int type, indx, error;
	struct vfs_context context;
	vnode_t authvp = NULLVP;

	oflags = uflags;

	amode = oflags & O_ACCMODE;
	/*
	 * Because O_RDONLY is 0, it is not possible to distinguish between
	 * O_EXEC | O_RDONLY and O_EXEC, therefore FEXEC/FSEARCH can't be set together
	 * with FREAD/FWRITE.
	 */
	if ((amode == O_ACCMODE) || (amode && (oflags & O_EXEC))) {
		return EINVAL;
	}

	/* Convert open(2) flag encoding to in-kernel FREAD/FWRITE form. */
	flags = FFLAGS(uflags);
	CLR(flags, FENCRYPTED);
	CLR(flags, FUNENCRYPTED);

	AUDIT_ARG(fflags, oflags);
	AUDIT_ARG(mode, vap->va_mode);

	/* Reserve the descriptor slot and fileproc before attempting the open. */
	if ((error = falloc_withinit(p, p_cred, ctx, &fp, &indx, fp_init, initarg)) != 0) {
		return error;
	}
	if (flags & O_CLOEXEC) {
		fp->fp_flags |= FP_CLOEXEC;
	}
	if (flags & O_CLOFORK) {
		fp->fp_flags |= FP_CLOFORK;
	}

	/* setup state to recognize when fdesc_open was called */
	uu->uu_dupfd = -1;

	/*
	 * Disable read/write access if file is opened with O_EVTONLY and
	 * the process has requested to deny read/write access.
	 */
	if ((flags & O_EVTONLY) && proc_disallow_rw_for_o_evtonly(p)) {
		flags &= ~(FREAD | FWRITE);
	}

	/* Resolve the optional authentication fd to a vnode for vn_open_auth. */
	if (authfd != AUTH_OPEN_NOAUTHFD) {
		error = vnode_getfromfd(ctx, authfd, &authvp);
		if (error) {
			fp_free(p, indx, fp);
			return error;
		}
	}

	if ((error = vn_open_auth(ndp, &flags, vap, authvp))) {
		if (authvp != NULLVP) {
			vnode_put(authvp);
		}
		/*
		 * ENODEV/ENXIO with uu_dupfd set indicates fdesc_open ran:
		 * duplicate the referenced descriptor instead of failing.
		 */
		if ((error == ENODEV || error == ENXIO) && (uu->uu_dupfd >= 0)) {
			if ((error = dupfdopen(p, indx, uu->uu_dupfd, flags, error)) == 0) {
				*retval = indx;
				return 0;
			}
		}
		if (error == ERESTART) {
			error = EINTR;
		}
		fp_free(p, indx, fp);
		return error;
	}

	if (authvp != NULLVP) {
		vnode_put(authvp);
	}

	uu->uu_dupfd = 0;
	vp = ndp->ni_vp;

	/* Wire the vnode into the fileglob with the negotiated open flags. */
	fp->fp_glob->fg_flag = flags & (FMASK | O_EVTONLY | FENCRYPTED | FUNENCRYPTED);
	fp->fp_glob->fg_ops = &vnops;
	fp_set_data(fp, vp);

#if CONFIG_FILE_LEASES
	/*
	 * If we are creating a file or open with truncate, we need to break the
	 * lease if there is a read lease placed on the parent dir.
	 */
	if ((vnode_vtype(vp) == VREG) && (flags & (O_CREAT | O_TRUNC))) {
		vnode_breakdirlease(vp, true, oflags);
	}
	/* Now check if there is a lease placed on the file itself. */
	error = vnode_breaklease(vp, oflags, ctx);
	if (error) {
		goto bad;
	}
#endif /* CONFIG_FILE_LEASES */

	/* Apply flock(2)-style locking requested at open time (O_EXLOCK/O_SHLOCK). */
	if (flags & (O_EXLOCK | O_SHLOCK)) {
		struct flock lf = {
			.l_whence = SEEK_SET,
		};

		if (flags & O_EXLOCK) {
			lf.l_type = F_WRLCK;
		} else {
			lf.l_type = F_RDLCK;
		}
		type = F_FLOCK;
		if ((flags & FNONBLOCK) == 0) {
			type |= F_WAIT;
		}
#if CONFIG_MACF
		error = mac_file_check_lock(vfs_context_ucred(ctx), fp->fp_glob,
		    F_SETLK, &lf);
		if (error) {
			goto bad;
		}
#endif
		if ((error = VNOP_ADVLOCK(vp, (caddr_t)fp->fp_glob, F_SETLK, &lf, type, ctx, NULL))) {
			goto bad;
		}
		fp->fp_glob->fg_flag |= FWASLOCKED;
	}

	/* try to truncate by setting the size attribute */
	if ((flags & O_TRUNC) && ((error = vnode_setsize(vp, (off_t)0, 0, ctx)) != 0)) {
		goto bad;
	}

	/*
	 * For directories we hold some additional information in the fd.
	 */
	if (vnode_vtype(vp) == VDIR) {
		fp->fp_glob->fg_vn_data = fg_vn_data_alloc();
	} else {
		fp->fp_glob->fg_vn_data = NULL;
	}

#if CONFIG_SECLUDED_MEMORY
	/*
	 * Decide whether this file's pages may live in the secluded pool.
	 * Writable files never qualify; read-only files qualify based on
	 * policy (apps from /Applications/, or a deny-list of names/paths
	 * that matter for Camera launch or realtime audio).
	 */
	if (secluded_for_filecache && vnode_vtype(vp) == VREG) {
		memory_object_control_t moc;
		const char *v_name;

		moc = ubc_getobject(vp, UBC_FLAGS_NONE);

		if (moc == MEMORY_OBJECT_CONTROL_NULL) {
			/* nothing to do... */
		} else if (fp->fp_glob->fg_flag & FWRITE) {
			/* writable -> no longer eligible for secluded pages */
			memory_object_mark_eligible_for_secluded(moc,
			    FALSE);
		} else if (secluded_for_filecache == SECLUDED_FILECACHE_APPS) {
			char pathname[32] = { 0, };
			size_t copied;
			/* XXX FBDP: better way to detect /Applications/ ? */
			if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
				(void)copyinstr(ndp->ni_dirp,
				    pathname,
				    sizeof(pathname),
				    &copied);
			} else {
				copystr(CAST_DOWN(void *, ndp->ni_dirp),
				    pathname,
				    sizeof(pathname),
				    &copied);
			}
			pathname[sizeof(pathname) - 1] = '\0';
			if (strncmp(pathname,
			    "/Applications/",
			    strlen("/Applications/")) == 0 &&
			    strncmp(pathname,
			    "/Applications/Camera.app/",
			    strlen("/Applications/Camera.app/")) != 0) {
				/*
				 * not writable
				 * AND from "/Applications/"
				 * AND not from "/Applications/Camera.app/"
				 * ==> eligible for secluded
				 */
				memory_object_mark_eligible_for_secluded(moc,
				    TRUE);
			}
		} else if (secluded_for_filecache == SECLUDED_FILECACHE_RDONLY &&
		    (v_name = vnode_getname(vp))) {
			size_t len = strlen(v_name);

			if (!strncmp(v_name, "dyld", len) ||
			    !strncmp(v_name, "launchd", len) ||
			    !strncmp(v_name, "Camera", len) ||
			    !strncmp(v_name, "SpringBoard", len) ||
			    !strncmp(v_name, "backboardd", len) ||
			    !strncmp(v_name, "cameracaptured", len)) {
				/*
				 * This file matters when launching Camera:
				 * do not store its contents in the secluded
				 * pool that will be drained on Camera launch.
				 */
				memory_object_mark_eligible_for_secluded(moc,
				    FALSE);
			} else if (!strncmp(v_name, "audiomxd", len) ||
			    !strncmp(v_name, "mediaplaybackd", len)) {
				memory_object_mark_eligible_for_secluded(moc,
				    FALSE);
				memory_object_mark_for_realtime(moc,
				    true);
			} else if (!strncmp(v_name, "bluetoothd", len)) {
				/*
				 * bluetoothd might be needed for realtime audio
				 * playback.
				 */
				memory_object_mark_eligible_for_secluded(moc,
				    FALSE);
				memory_object_mark_for_realtime(moc,
				    true);
			} else {
				char pathname[64] = { 0, };
				size_t copied;
				if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
					(void)copyinstr(ndp->ni_dirp,
					    pathname,
					    sizeof(pathname),
					    &copied);
				} else {
					copystr(CAST_DOWN(void *, ndp->ni_dirp),
					    pathname,
					    sizeof(pathname),
					    &copied);
				}
				pathname[sizeof(pathname) - 1] = '\0';
				if (strncmp(pathname,
				    "/Library/Audio/Plug-Ins/",
				    strlen("/Library/Audio/Plug-Ins/")) == 0 ||
				    strncmp(pathname,
				    "/System/Library/Audio/Plug-Ins/",
				    strlen("/System/Library/Audio/Plug-Ins/")) == 0) {
					/*
					 * This may be an audio plugin required
					 * for realtime playback.
					 * ==> NOT eligible for secluded.
					 */
					memory_object_mark_eligible_for_secluded(moc,
					    FALSE);
					memory_object_mark_for_realtime(moc,
					    true);
				}
			}
			vnode_putname(v_name);
		}
	}
#endif /* CONFIG_SECLUDED_MEMORY */

	/*
	 * NOTE(review): vp is still read by vnode_istty() below after this
	 * put; presumably the fileglob's own reference keeps it valid — confirm.
	 */
	vnode_put(vp);

	/*
	 * The first terminal open (without a O_NOCTTY) by a session leader
	 * results in it being set as the controlling terminal.
	 */
	if (vnode_istty(vp) && !(p->p_flag & P_CONTROLT) &&
	    !(flags & O_NOCTTY)) {
		int tmp = 0;

		(void)(*fp->fp_glob->fg_ops->fo_ioctl)(fp, (int)TIOCSCTTY,
		    (caddr_t)&tmp, ctx);
	}

	/* Publish the descriptor: make the reserved slot visible to the process. */
	proc_fdlock(p);
	procfdtbl_releasefd(p, indx, NULL);

	fp_drop(p, indx, fp, 1);
	proc_fdunlock(p);

	*retval = indx;

	return 0;
bad:
	/* Error unwind: undo the advisory lock (if taken), close, free the fd. */
	context = *vfs_context_current();
	context.vc_ucred = fp->fp_glob->fg_cred;

	if ((fp->fp_glob->fg_flag & FWASLOCKED) &&
	    (FILEGLOB_DTYPE(fp->fp_glob) == DTYPE_VNODE)) {
		struct flock lf = {
			.l_whence = SEEK_SET,
			.l_type = F_UNLCK,
		};

		(void)VNOP_ADVLOCK(
			vp, (caddr_t)fp->fp_glob, F_UNLCK, &lf, F_FLOCK, ctx, NULL);
	}

	vn_close(vp, fp->fp_glob->fg_flag, &context);
	vnode_put(vp);
	fp_free(p, indx, fp);

	return error;
}
4893
4894 /*
4895 * While most of the *at syscall handlers can call nameiat() which
4896 * is a wrapper around namei, the use of namei and initialisation
4897 * of nameidata are far removed and in different functions - namei
4898 * gets called in vn_open_auth for open1. So we'll just do here what
4899 * nameiat() does.
4900 */
4901 static int
open1at(vfs_context_t ctx,struct nameidata * ndp,int uflags,struct vnode_attr * vap,fp_initfn_t fp_init,void * initarg,int32_t * retval,int dirfd,int authfd)4902 open1at(vfs_context_t ctx, struct nameidata *ndp, int uflags,
4903 struct vnode_attr *vap, fp_initfn_t fp_init, void *initarg, int32_t *retval,
4904 int dirfd, int authfd)
4905 {
4906 if ((dirfd != AT_FDCWD) && !(ndp->ni_cnd.cn_flags & USEDVP)) {
4907 int error;
4908 char c;
4909
4910 if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
4911 error = copyin(ndp->ni_dirp, &c, sizeof(char));
4912 if (error) {
4913 return error;
4914 }
4915 } else {
4916 c = *((char *)(ndp->ni_dirp));
4917 }
4918
4919 if (c != '/') {
4920 vnode_t dvp_at;
4921
4922 error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
4923 &dvp_at);
4924 if (error) {
4925 return error;
4926 }
4927
4928 if (vnode_vtype(dvp_at) != VDIR) {
4929 vnode_put(dvp_at);
4930 return ENOTDIR;
4931 }
4932
4933 ndp->ni_dvp = dvp_at;
4934 ndp->ni_cnd.cn_flags |= USEDVP;
4935 error = open1(ctx, ndp, uflags, vap, fp_init, initarg,
4936 retval, authfd);
4937 vnode_put(dvp_at);
4938 return error;
4939 }
4940 }
4941
4942 return open1(ctx, ndp, uflags, vap, fp_init, initarg, retval, authfd);
4943 }
4944
4945 /*
4946 * open_extended: open a file given a path name; with extended argument list (including extended security (ACL)).
4947 *
4948 * Parameters: p Process requesting the open
4949 * uap User argument descriptor (see below)
4950 * retval Pointer to an area to receive the
4951 * return calue from the system call
4952 *
4953 * Indirect: uap->path Path to open (same as 'open')
4954 * uap->flags Flags to open (same as 'open'
4955 * uap->uid UID to set, if creating
4956 * uap->gid GID to set, if creating
4957 * uap->mode File mode, if creating (same as 'open')
4958 * uap->xsecurity ACL to set, if creating
4959 *
4960 * Returns: 0 Success
4961 * !0 errno value
4962 *
4963 * Notes: The kauth_filesec_t in 'va', if any, is in host byte order.
4964 *
4965 * XXX: We should enummerate the possible errno values here, and where
4966 * in the code they originated.
4967 */
4968 int
open_extended(proc_t p,struct open_extended_args * uap,int32_t * retval)4969 open_extended(proc_t p, struct open_extended_args *uap, int32_t *retval)
4970 {
4971 int ciferror;
4972 kauth_filesec_t xsecdst;
4973 struct vnode_attr va;
4974 struct nameidata nd;
4975 int cmode;
4976
4977 AUDIT_ARG(owner, uap->uid, uap->gid);
4978
4979 xsecdst = NULL;
4980 if ((uap->xsecurity != USER_ADDR_NULL) &&
4981 ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)) {
4982 return ciferror;
4983 }
4984
4985 VATTR_INIT(&va);
4986 cmode = ((uap->mode & ~p->p_fd.fd_cmask) & ALLPERMS) & ~S_ISTXT;
4987 VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
4988 if (uap->uid != KAUTH_UID_NONE) {
4989 VATTR_SET(&va, va_uid, uap->uid);
4990 }
4991 if (uap->gid != KAUTH_GID_NONE) {
4992 VATTR_SET(&va, va_gid, uap->gid);
4993 }
4994 if (xsecdst != NULL) {
4995 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
4996 va.va_vaflags |= VA_FILESEC_ACL;
4997 }
4998
4999 NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
5000 uap->path, vfs_context_current());
5001
5002 ciferror = open1(vfs_context_current(), &nd, uap->flags, &va,
5003 NULL, NULL, retval, AUTH_OPEN_NOAUTHFD);
5004 if (xsecdst != NULL) {
5005 kauth_filesec_free(xsecdst);
5006 }
5007
5008 return ciferror;
5009 }
5010
5011 /*
5012 * Go through the data-protected atomically controlled open (2)
5013 *
5014 * int open_dprotected_np(user_addr_t path, int flags, int class, int dpflags, int mode)
5015 */
5016 static int
openat_dprotected_internal(vfs_context_t ctx,user_addr_t path,int flags,int mode,int class,int dpflags,int fd,int authfd,enum uio_seg segflg,int * retval)5017 openat_dprotected_internal(vfs_context_t ctx, user_addr_t path, int flags, int mode,
5018 int class, int dpflags, int fd, int authfd, enum uio_seg segflg, int *retval)
5019 {
5020 /*
5021 * Follow the same path as normal open(2)
5022 * Look up the item if it exists, and acquire the vnode.
5023 */
5024 struct vnode_attr va;
5025 struct nameidata nd;
5026 int cmode;
5027 int error;
5028 struct filedesc *fdp = &vfs_context_proc(ctx)->p_fd;
5029
5030 VATTR_INIT(&va);
5031 /* Mask off all but regular access permissions */
5032 cmode = ((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
5033 VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
5034
5035 NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, segflg,
5036 path, ctx);
5037
5038 /*
5039 * Initialize the extra fields in vnode_attr to pass down our
5040 * extra fields.
5041 * 1. target cprotect class.
5042 * 2. set a flag to mark it as requiring open-raw-encrypted semantics.
5043 */
5044 if (flags & O_CREAT) {
5045 /* lower level kernel code validates that the class is valid before applying it. */
5046 if (class != PROTECTION_CLASS_DEFAULT) {
5047 /*
5048 * PROTECTION_CLASS_DEFAULT implies that we make the class for this
5049 * file behave the same as open (2)
5050 */
5051 VATTR_SET(&va, va_dataprotect_class, class);
5052 }
5053 }
5054
5055 if (dpflags & (O_DP_GETRAWENCRYPTED | O_DP_GETRAWUNENCRYPTED | O_DP_AUTHENTICATE)) {
5056 if (flags & (O_RDWR | O_WRONLY)) {
5057 /*
5058 * Not allowed to write raw encrypted bytes or when opening authenticated.
5059 */
5060 return EINVAL;
5061 }
5062 if (dpflags & O_DP_GETRAWENCRYPTED) {
5063 VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWENCRYPTED);
5064 }
5065 if (dpflags & O_DP_GETRAWUNENCRYPTED) {
5066 VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWUNENCRYPTED);
5067 }
5068 if (dpflags & O_DP_AUTHENTICATE) {
5069 VATTR_SET(&va, va_dataprotect_flags, VA_DP_AUTHENTICATE);
5070 }
5071 }
5072
5073 error = open1at(vfs_context_current(), &nd, flags, &va,
5074 NULL, NULL, retval, fd, authfd);
5075
5076 return error;
5077 }
5078
5079 int
openat_dprotected_np(__unused proc_t p,struct openat_dprotected_np_args * uap,int32_t * retval)5080 openat_dprotected_np(__unused proc_t p, struct openat_dprotected_np_args *uap, int32_t *retval)
5081 {
5082 if ((uap->dpflags & O_DP_AUTHENTICATE) && (uap->flags & O_CREAT)) {
5083 return EINVAL;
5084 }
5085
5086 return openat_dprotected_internal(vfs_context_current(), uap->path, uap->flags, uap->mode,
5087 uap->class, uap->dpflags, uap->fd, uap->authfd, UIO_USERSPACE, retval);
5088 }
5089
5090 int
open_dprotected_np(__unused proc_t p,struct open_dprotected_np_args * uap,int32_t * retval)5091 open_dprotected_np(__unused proc_t p, struct open_dprotected_np_args *uap, int32_t *retval)
5092 {
5093 if (uap->dpflags & O_DP_AUTHENTICATE) {
5094 return EINVAL;
5095 }
5096
5097 return openat_dprotected_internal(vfs_context_current(), uap->path, uap->flags, uap->mode,
5098 uap->class, uap->dpflags, AT_FDCWD, AUTH_OPEN_NOAUTHFD, UIO_USERSPACE, retval);
5099 }
5100
5101 static int
openat_internal(vfs_context_t ctx,user_addr_t path,int flags,int mode,int fd,enum uio_seg segflg,int * retval)5102 openat_internal(vfs_context_t ctx, user_addr_t path, int flags, int mode,
5103 int fd, enum uio_seg segflg, int *retval)
5104 {
5105 struct filedesc *fdp = &vfs_context_proc(ctx)->p_fd;
5106 struct {
5107 struct vnode_attr va;
5108 struct nameidata nd;
5109 } *__open_data;
5110 struct vnode_attr *vap;
5111 struct nameidata *ndp;
5112 int cmode;
5113 int error;
5114
5115 __open_data = kalloc_type(typeof(*__open_data), Z_WAITOK);
5116 vap = &__open_data->va;
5117 ndp = &__open_data->nd;
5118
5119 VATTR_INIT(vap);
5120 /* Mask off all but regular access permissions */
5121 cmode = ((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
5122 VATTR_SET(vap, va_mode, cmode & ACCESSPERMS);
5123
5124 NDINIT(ndp, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1,
5125 segflg, path, ctx);
5126
5127 error = open1at(ctx, ndp, flags, vap, NULL, NULL, retval, fd, AUTH_OPEN_NOAUTHFD);
5128
5129 kfree_type(typeof(*__open_data), __open_data);
5130
5131 return error;
5132 }
5133
int
open(proc_t p, struct open_args *uap, int32_t *retval)
{
	/* open(2) is a pthread cancellation point; test before doing the work. */
	__pthread_testcancel(1);
	return open_nocancel(p, (struct open_nocancel_args *)uap, retval);
}
5140
int
open_nocancel(__unused proc_t p, struct open_nocancel_args *uap,
    int32_t *retval)
{
	/* Plain open(2): user-space path resolved relative to the cwd. */
	return openat_internal(vfs_context_current(), uap->path, uap->flags,
	           uap->mode, AT_FDCWD, UIO_USERSPACE, retval);
}
5148
int
openat_nocancel(__unused proc_t p, struct openat_nocancel_args *uap,
    int32_t *retval)
{
	/* openat(2): relative paths are resolved against uap->fd. */
	return openat_internal(vfs_context_current(), uap->path, uap->flags,
	           uap->mode, uap->fd, UIO_USERSPACE, retval);
}
5156
int
openat(proc_t p, struct openat_args *uap, int32_t *retval)
{
	/* openat(2) is a pthread cancellation point; test before doing the work. */
	__pthread_testcancel(1);
	return openat_nocancel(p, (struct openat_nocancel_args *)uap, retval);
}
5163
5164 #define OPEN_BY_ID_ENTITLEMENT "com.apple.private.vfs.open-by-id"
5165
5166 static boolean_t
vfs_context_can_open_by_id(vfs_context_t ctx)5167 vfs_context_can_open_by_id(vfs_context_t ctx)
5168 {
5169 if (csproc_get_platform_binary(vfs_context_proc(ctx))) {
5170 return TRUE;
5171 }
5172
5173 return IOTaskHasEntitlement(vfs_context_task(ctx),
5174 OPEN_BY_ID_ENTITLEMENT);
5175 }
5176
5177 /*
5178 * openbyid_np: open a file given a file system id and a file system object id
5179 * the hfs file system object id is an fsobj_id_t {uint32, uint32}
5180 * file systems that don't support object ids it is a node id (uint64_t).
5181 *
5182 * Parameters: p Process requesting the open
5183 * uap User argument descriptor (see below)
5184 * retval Pointer to an area to receive the
5185 * return calue from the system call
5186 *
5187 * Indirect: uap->path Path to open (same as 'open')
5188 *
5189 * uap->fsid id of target file system
5190 * uap->objid id of target file system object
5191 * uap->flags Flags to open (same as 'open')
5192 *
5193 * Returns: 0 Success
5194 * !0 errno value
5195 *
5196 *
5197 * XXX: We should enummerate the possible errno values here, and where
5198 * in the code they originated.
5199 */
int
openbyid_np(__unused proc_t p, struct openbyid_np_args *uap, int *retval)
{
	fsid_t fsid;
	uint64_t objid;
	int error;
	char *buf = NULL;
	int buflen = MAXPATHLEN;
	int pathlen = 0;
	vfs_context_t ctx = vfs_context_current();

	/* Restricted to platform binaries or holders of the entitlement. */
	if (!vfs_context_can_open_by_id(ctx)) {
		return EPERM;
	}

	if ((error = copyin(uap->fsid, (caddr_t)&fsid, sizeof(fsid)))) {
		return error;
	}

	/*uap->obj is an fsobj_id_t defined as struct {uint32_t, uint32_t} */
	if ((error = copyin(uap->objid, (caddr_t)&objid, sizeof(uint64_t)))) {
		return error;
	}

	AUDIT_ARG(value32, fsid.val[0]);
	AUDIT_ARG(value64, objid);

	/*resolve path from fsis, objid*/
	/* Grow the buffer by MAXPATHLEN each time fsgetpath reports ENOSPC. */
	do {
		buf = kalloc_data(buflen + 1, Z_WAITOK);
		if (buf == NULL) {
			return ENOMEM;
		}

		error = fsgetpath_internal( ctx, fsid.val[0], objid, buflen,
		    buf, FSOPT_ISREALFSID, &pathlen);

		if (error) {
			kfree_data(buf, buflen + 1);
			buf = NULL;
		}
	} while (error == ENOSPC && (buflen += MAXPATHLEN));

	if (error) {
		return error;
	}

	buf[pathlen] = 0;

	/* Open the resolved kernel-space path like a regular open(2). */
	error = openat_internal(
		ctx, (user_addr_t)buf, uap->oflags, 0, AT_FDCWD, UIO_SYSSPACE, retval);

	kfree_data(buf, buflen + 1);

	return error;
}
5256
5257
5258 /*
5259 * Create a special file.
5260 */
5261 static int mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap,
5262 int fd);
5263
static int
mknodat_internal(proc_t p, user_addr_t upath, struct vnode_attr *vap,
    mode_t mode, int fd)
{
	vfs_context_t ctx = vfs_context_current();
	struct nameidata nd;
	vnode_t vp, dvp;
	int error;

	/* If it's a mknod() of a FIFO, call mkfifo1() instead */
	if ((mode & S_IFMT) == S_IFIFO) {
		return mkfifo1(ctx, upath, vap, fd);
	}

	AUDIT_ARG(mode, mode);
	AUDIT_ARG(value32, vap->va_rdev);

	/* Creating device nodes requires superuser privileges. */
	if ((error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
		return error;
	}
	NDINIT(&nd, CREATE, OP_MKNOD, LOCKPARENT | AUDITVNPATH1,
	    UIO_USERSPACE, upath, ctx);
	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* A non-NULL ni_vp means the target path already exists. */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}

	/* Only character and block devices are valid here (FIFOs handled above). */
	switch (mode & S_IFMT) {
	case S_IFCHR:
		VATTR_SET(vap, va_type, VCHR);
		break;
	case S_IFBLK:
		VATTR_SET(vap, va_type, VBLK);
		break;
	default:
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_create(ctx,
	    nd.ni_dvp, &nd.ni_cnd, vap);
	if (error) {
		goto out;
	}
#endif

	if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
		goto out;
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

	if ((error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx)) != 0) {
		goto out;
	}

	if (vp) {
		int update_flags = 0;

		// Make sure the name & parent pointers are hooked up
		if (vp->v_name == NULL) {
			update_flags |= VNODE_UPDATE_NAME;
		}
		if (vp->v_parent == NULLVP) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		if (update_flags) {
			vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
		}

#if CONFIG_FSE
		add_fsevent(FSE_CREATE_FILE, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
#endif
	}

out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	vnode_put(dvp);

	return error;
}
5366
5367 int
mknod(proc_t p,struct mknod_args * uap,__unused int32_t * retval)5368 mknod(proc_t p, struct mknod_args *uap, __unused int32_t *retval)
5369 {
5370 struct vnode_attr va;
5371
5372 VATTR_INIT(&va);
5373 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5374 VATTR_SET(&va, va_rdev, uap->dev);
5375
5376 return mknodat_internal(p, uap->path, &va, (mode_t)uap->mode, AT_FDCWD);
5377 }
5378
5379 int
mknodat(proc_t p,struct mknodat_args * uap,__unused int32_t * retval)5380 mknodat(proc_t p, struct mknodat_args *uap, __unused int32_t *retval)
5381 {
5382 struct vnode_attr va;
5383
5384 VATTR_INIT(&va);
5385 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5386 VATTR_SET(&va, va_rdev, uap->dev);
5387
5388 return mknodat_internal(p, uap->path, &va, (mode_t)uap->mode, uap->fd);
5389 }
5390
5391 /*
5392 * Create a named pipe.
5393 *
5394 * Returns: 0 Success
5395 * EEXIST
5396 * namei:???
5397 * vnode_authorize:???
5398 * vn_create:???
5399 */
static int
mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap, int fd)
{
	vnode_t vp, dvp;
	int error;
	struct nameidata nd;

	NDINIT(&nd, CREATE, OP_MKFIFO, LOCKPARENT | AUDITVNPATH1,
	    UIO_USERSPACE, upath, ctx);
	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* check that this is a new file and authorize addition */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}
	VATTR_SET(vap, va_type, VFIFO);

	if ((error = vn_authorize_create(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0) {
		goto out;
	}

	error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx);
out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	/* Drop the iocounts from the lookup/create. */
	if (vp) {
		vnode_put(vp);
	}
	vnode_put(dvp);

	return error;
}
5442
5443
5444 /*
5445 * mkfifo_extended: Create a named pipe; with extended argument list (including extended security (ACL)).
5446 *
5447 * Parameters: p Process requesting the open
5448 * uap User argument descriptor (see below)
5449 * retval (Ignored)
5450 *
5451 * Indirect: uap->path Path to fifo (same as 'mkfifo')
5452 * uap->uid UID to set
5453 * uap->gid GID to set
5454 * uap->mode File mode to set (same as 'mkfifo')
5455 * uap->xsecurity ACL to set, if creating
5456 *
5457 * Returns: 0 Success
5458 * !0 errno value
5459 *
5460 * Notes: The kauth_filesec_t in 'va', if any, is in host byte order.
5461 *
5462 * XXX: We should enummerate the possible errno values here, and where
5463 * in the code they originated.
5464 */
5465 int
mkfifo_extended(proc_t p,struct mkfifo_extended_args * uap,__unused int32_t * retval)5466 mkfifo_extended(proc_t p, struct mkfifo_extended_args *uap, __unused int32_t *retval)
5467 {
5468 int ciferror;
5469 kauth_filesec_t xsecdst;
5470 struct vnode_attr va;
5471
5472 AUDIT_ARG(owner, uap->uid, uap->gid);
5473
5474 xsecdst = KAUTH_FILESEC_NONE;
5475 if (uap->xsecurity != USER_ADDR_NULL) {
5476 if ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0) {
5477 return ciferror;
5478 }
5479 }
5480
5481 VATTR_INIT(&va);
5482 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5483 if (uap->uid != KAUTH_UID_NONE) {
5484 VATTR_SET(&va, va_uid, uap->uid);
5485 }
5486 if (uap->gid != KAUTH_GID_NONE) {
5487 VATTR_SET(&va, va_gid, uap->gid);
5488 }
5489 if (xsecdst != KAUTH_FILESEC_NONE) {
5490 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
5491 va.va_vaflags |= VA_FILESEC_ACL;
5492 }
5493
5494 ciferror = mkfifo1(vfs_context_current(), uap->path, &va, AT_FDCWD);
5495
5496 if (xsecdst != KAUTH_FILESEC_NONE) {
5497 kauth_filesec_free(xsecdst);
5498 }
5499 return ciferror;
5500 }
5501
5502 /* ARGSUSED */
5503 int
mkfifo(proc_t p,struct mkfifo_args * uap,__unused int32_t * retval)5504 mkfifo(proc_t p, struct mkfifo_args *uap, __unused int32_t *retval)
5505 {
5506 struct vnode_attr va;
5507
5508 VATTR_INIT(&va);
5509 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5510
5511 return mkfifo1(vfs_context_current(), uap->path, &va, AT_FDCWD);
5512 }
5513
5514 int
mkfifoat(proc_t p,struct mkfifoat_args * uap,__unused int32_t * retval)5515 mkfifoat(proc_t p, struct mkfifoat_args *uap, __unused int32_t *retval)
5516 {
5517 struct vnode_attr va;
5518
5519 VATTR_INIT(&va);
5520 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5521
5522 return mkfifo1(vfs_context_current(), uap->path, &va, uap->fd);
5523 }
5524
5525 extern int safe_getpath_new(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path, int firmlink);
5526 extern int safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path);
5527 extern int safe_getpath_no_firmlink(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path);
5528
/*
 * Build a best-effort path for 'dvp' (optionally with 'leafname' appended)
 * into 'path' (capacity '_len').  Never fails outright: on lookup errors it
 * walks up the parent chain (or falls back to the mountpoint / "/") and sets
 * *truncated_path = 1.  Returns the length of the string INCLUDING the NUL.
 * 'firmlink' selects whether firmlinks are followed during path resolution.
 */
int
safe_getpath_new(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path, int firmlink)
{
	int ret, len = _len;

	*truncated_path = 0;

	if (firmlink) {
		ret = vn_getpath(dvp, path, &len);
	} else {
		ret = vn_getpath_no_firmlink(dvp, path, &len);
	}
	if (ret == 0 && len < (MAXPATHLEN - 1)) {
		if (leafname) {
			/* len counts the NUL: overwrite it with '/' and append leafname */
			path[len - 1] = '/';
			len += strlcpy(&path[len], leafname, MAXPATHLEN - len) + 1;
			if (len > MAXPATHLEN) {
				char *ptr;

				// the string got truncated!
				*truncated_path = 1;
				ptr = strrchr(path, '/');
				if (ptr) {
					*ptr = '\0'; // chop off the string at the last directory component
				}
				len = (int)strlen(path) + 1;
			}
		}
	} else if (ret == 0) {
		/* path resolved but leaves no room to append anything: flag truncation */
		*truncated_path = 1;
	} else if (ret != 0) {
		struct vnode *mydvp = dvp;

		if (ret != ENOSPC) {
			printf("safe_getpath: failed to get the path for vp %p (%s) : err %d\n",
			    dvp, dvp->v_name ? dvp->v_name : "no-name", ret);
		}
		*truncated_path = 1;

		/* Walk toward the root until some ancestor's path fits in the buffer. */
		do {
			if (mydvp->v_parent != NULL) {
				mydvp = mydvp->v_parent;
			} else if (mydvp->v_mount) {
				strlcpy(path, mydvp->v_mount->mnt_vfsstat.f_mntonname, _len);
				break;
			} else {
				// no parent and no mount point?  only thing is to punt and say "/" changed
				strlcpy(path, "/", _len);
				len = 2;
				mydvp = NULL;
			}

			if (mydvp == NULL) {
				break;
			}

			len = _len;
			if (firmlink) {
				ret = vn_getpath(mydvp, path, &len);
			} else {
				ret = vn_getpath_no_firmlink(mydvp, path, &len);
			}
		} while (ret == ENOSPC);
	}

	return len;
}
5596
int
safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path)
{
	/* Firmlink-following flavor of the best-effort path builder. */
	const int follow_firmlinks = 1;

	return safe_getpath_new(dvp, leafname, path, _len, truncated_path, follow_firmlinks);
}
5602
int
safe_getpath_no_firmlink(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path)
{
	/* Non-firmlink flavor of the best-effort path builder. */
	const int follow_firmlinks = 0;

	return safe_getpath_new(dvp, leafname, path, _len, truncated_path, follow_firmlinks);
}
5608
5609 /*
5610 * Make a hard file link.
5611 *
5612 * Returns: 0 Success
5613 * EPERM
5614 * EEXIST
5615 * EXDEV
5616 * namei:???
5617 * vnode_authorize:???
5618 * VNOP_LINK:???
5619 */
5620 /* ARGSUSED */
5621 static int
linkat_internal(vfs_context_t ctx,int fd1,user_addr_t path,int fd2,user_addr_t link,int flag,enum uio_seg segflg)5622 linkat_internal(vfs_context_t ctx, int fd1, user_addr_t path, int fd2,
5623 user_addr_t link, int flag, enum uio_seg segflg)
5624 {
5625 vnode_t vp, pvp, dvp, lvp;
5626 struct nameidata nd;
5627 int follow;
5628 int error;
5629 #if CONFIG_FSE
5630 fse_info finfo;
5631 #endif
5632 int need_event, has_listeners, need_kpath2;
5633 char *target_path = NULL;
5634 char *no_firmlink_path = NULL;
5635 int truncated = 0;
5636 int truncated_no_firmlink_path = 0;
5637 bool do_retry;
5638 int num_retries = 0;
5639
5640 /* look up the object we are linking to */
5641 follow = (flag & AT_SYMLINK_FOLLOW) ? FOLLOW : NOFOLLOW;
5642
5643 retry:
5644 do_retry = false;
5645 vp = dvp = lvp = NULLVP;
5646 NDINIT(&nd, LOOKUP, OP_LOOKUP, AUDITVNPATH1 | follow,
5647 segflg, path, ctx);
5648
5649 error = nameiat(&nd, fd1);
5650 if (error) {
5651 return error;
5652 }
5653 vp = nd.ni_vp;
5654
5655 nameidone(&nd);
5656
5657 /*
5658 * Normally, linking to directories is not supported.
5659 * However, some file systems may have limited support.
5660 */
5661 if (vp->v_type == VDIR) {
5662 if (!ISSET(vp->v_mount->mnt_kern_flag, MNTK_DIR_HARDLINKS)) {
5663 error = EPERM; /* POSIX */
5664 goto out;
5665 }
5666
5667 /* Linking to a directory requires ownership. */
5668 if (!kauth_cred_issuser(vfs_context_ucred(ctx))) {
5669 struct vnode_attr dva;
5670
5671 VATTR_INIT(&dva);
5672 VATTR_WANTED(&dva, va_uid);
5673 if (vnode_getattr(vp, &dva, ctx) != 0 ||
5674 !VATTR_IS_SUPPORTED(&dva, va_uid) ||
5675 (dva.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)))) {
5676 error = EACCES;
5677 goto out;
5678 }
5679 }
5680 }
5681
5682 /* lookup the target node */
5683 #if CONFIG_TRIGGERS
5684 nd.ni_op = OP_LINK;
5685 #endif
5686 nd.ni_cnd.cn_nameiop = CREATE;
5687 nd.ni_cnd.cn_flags = LOCKPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK;
5688 nd.ni_dirp = link;
5689 error = nameiat(&nd, fd2);
5690 if (error != 0) {
5691 goto out;
5692 }
5693 dvp = nd.ni_dvp;
5694 lvp = nd.ni_vp;
5695
5696 #if CONFIG_MACF
5697 if ((error = mac_vnode_check_link(ctx, dvp, vp, &nd.ni_cnd)) != 0) {
5698 goto out2;
5699 }
5700 #endif
5701
5702 /* or to anything that kauth doesn't want us to (eg. immutable items) */
5703 if ((error = vnode_authorize(vp, NULL, KAUTH_VNODE_LINKTARGET, ctx)) != 0) {
5704 goto out2;
5705 }
5706
5707 /* target node must not exist */
5708 if (lvp != NULLVP) {
5709 error = EEXIST;
5710 goto out2;
5711 }
5712 /* cannot link across mountpoints */
5713 if (vnode_mount(vp) != vnode_mount(dvp)) {
5714 error = EXDEV;
5715 goto out2;
5716 }
5717
5718 /* authorize creation of the target note */
5719 if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
5720 goto out2;
5721 }
5722
5723 #if CONFIG_FILE_LEASES
5724 vnode_breakdirlease(dvp, false, O_WRONLY);
5725 #endif
5726
5727 /* and finally make the link */
5728 error = VNOP_LINK(vp, dvp, &nd.ni_cnd, ctx);
5729 if (error) {
5730 if (error == ENOENT && num_retries < MAX_LINK_ENOENT_RETRIES) {
5731 do_retry = true;
5732 }
5733 goto out2;
5734 }
5735
5736 #if CONFIG_MACF
5737 (void)mac_vnode_notify_link(ctx, vp, dvp, &nd.ni_cnd);
5738 #endif
5739
5740 #if CONFIG_FSE
5741 need_event = need_fsevent(FSE_CREATE_FILE, dvp);
5742 #else
5743 need_event = 0;
5744 #endif
5745 has_listeners = kauth_authorize_fileop_has_listeners();
5746
5747 need_kpath2 = 0;
5748 #if CONFIG_AUDIT
5749 if (AUDIT_RECORD_EXISTS()) {
5750 need_kpath2 = 1;
5751 }
5752 #endif
5753
5754 if (need_event || has_listeners || need_kpath2) {
5755 char *link_to_path = NULL;
5756 int len, link_name_len;
5757 int len_no_firmlink_path = 0;
5758
5759 /* build the path to the new link file */
5760 GET_PATH(target_path);
5761
5762 len = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, target_path, MAXPATHLEN, &truncated);
5763 if (no_firmlink_path == NULL) {
5764 GET_PATH(no_firmlink_path);
5765 }
5766 len_no_firmlink_path = safe_getpath_no_firmlink(dvp, nd.ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);
5767
5768 AUDIT_ARG(kpath, target_path, ARG_KPATH2);
5769
5770 if (has_listeners) {
5771 /* build the path to file we are linking to */
5772 GET_PATH(link_to_path);
5773
5774 link_name_len = MAXPATHLEN;
5775 if (vn_getpath(vp, link_to_path, &link_name_len) == 0) {
5776 /*
5777 * Call out to allow 3rd party notification of rename.
5778 * Ignore result of kauth_authorize_fileop call.
5779 */
5780 kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_LINK,
5781 (uintptr_t)link_to_path,
5782 (uintptr_t)target_path);
5783 }
5784 if (link_to_path != NULL) {
5785 RELEASE_PATH(link_to_path);
5786 }
5787 }
5788 #if CONFIG_FSE
5789 if (need_event) {
5790 /* construct fsevent */
5791 if (get_fse_info(vp, &finfo, ctx) == 0) {
5792 if (truncated_no_firmlink_path) {
5793 finfo.mode |= FSE_TRUNCATED_PATH;
5794 }
5795
5796 // build the path to the destination of the link
5797 add_fsevent(FSE_CREATE_FILE, ctx,
5798 FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
5799 FSE_ARG_FINFO, &finfo,
5800 FSE_ARG_DONE);
5801 }
5802
5803 pvp = vp->v_parent;
5804 // need an iocount on parent vnode in this case
5805 if (pvp && pvp != dvp) {
5806 pvp = vnode_getparent_if_different(vp, dvp);
5807 }
5808 if (pvp) {
5809 add_fsevent(FSE_STAT_CHANGED, ctx,
5810 FSE_ARG_VNODE, pvp, FSE_ARG_DONE);
5811 }
5812 if (pvp && pvp != dvp) {
5813 vnode_put(pvp);
5814 }
5815 }
5816 #endif
5817 }
5818 out2:
5819 /*
5820 * nameidone has to happen before we vnode_put(dvp)
5821 * since it may need to release the fs_nodelock on the dvp
5822 */
5823 nameidone(&nd);
5824 if (target_path != NULL) {
5825 RELEASE_PATH(target_path);
5826 target_path = NULL;
5827 }
5828 if (no_firmlink_path != NULL) {
5829 RELEASE_PATH(no_firmlink_path);
5830 no_firmlink_path = NULL;
5831 }
5832 out:
5833 if (lvp) {
5834 vnode_put(lvp);
5835 }
5836 if (dvp) {
5837 vnode_put(dvp);
5838 }
5839 vnode_put(vp);
5840
5841 if (do_retry) {
5842 goto retry;
5843 }
5844
5845 return error;
5846 }
5847
5848 int
link(__unused proc_t p,struct link_args * uap,__unused int32_t * retval)5849 link(__unused proc_t p, struct link_args *uap, __unused int32_t *retval)
5850 {
5851 return linkat_internal(vfs_context_current(), AT_FDCWD, uap->path,
5852 AT_FDCWD, uap->link, AT_SYMLINK_FOLLOW, UIO_USERSPACE);
5853 }
5854
5855 int
linkat(__unused proc_t p,struct linkat_args * uap,__unused int32_t * retval)5856 linkat(__unused proc_t p, struct linkat_args *uap, __unused int32_t *retval)
5857 {
5858 if (uap->flag & ~AT_SYMLINK_FOLLOW) {
5859 return EINVAL;
5860 }
5861
5862 return linkat_internal(vfs_context_current(), uap->fd1, uap->path,
5863 uap->fd2, uap->link, uap->flag, UIO_USERSPACE);
5864 }
5865
5866 /*
5867 * Make a symbolic link.
5868 *
5869 * We could add support for ACLs here too...
5870 */
5871 /* ARGSUSED */
/*
 * Common implementation for symlink(2)/symlinkat(2).
 *
 * Copies in the link-content string (when it lives in user space), looks up
 * the new name via fd/link, authorizes, and calls VNOP_SYMLINK.  If the
 * filesystem does not hand back a vnode, the name is re-looked-up so the
 * identity/fsevent bookkeeping below can run against the new link.
 */
static int
symlinkat_internal(vfs_context_t ctx, user_addr_t path_data, int fd,
    user_addr_t link, enum uio_seg segflg)
{
	struct vnode_attr va;
	char *path;
	int error;
	struct nameidata nd;
	vnode_t vp, dvp;
	size_t dummy = 0;
	proc_t p;

	error = 0;
	/* The link *contents* may come from user or kernel space. */
	if (UIO_SEG_IS_USER_SPACE(segflg)) {
		path = zalloc(ZV_NAMEI);
		error = copyinstr(path_data, path, MAXPATHLEN, &dummy);
	} else {
		path = (char *)path_data;
	}
	if (error) {
		goto out;
	}
	AUDIT_ARG(text, path);  /* This is the link string */

	NDINIT(&nd, CREATE, OP_SYMLINK, LOCKPARENT | AUDITVNPATH1,
	    segflg, link, ctx);

	error = nameiat(&nd, fd);
	if (error) {
		goto out;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	p = vfs_context_proc(ctx);
	VATTR_INIT(&va);
	VATTR_SET(&va, va_type, VLNK);
	VATTR_SET(&va, va_mode, ACCESSPERMS & ~p->p_fd.fd_cmask);

#if CONFIG_MACF
	error = mac_vnode_check_create(ctx,
	    dvp, &nd.ni_cnd, &va);
#endif
	if (error != 0) {
		goto skipit;
	}

	/* The new name must not already exist. */
	if (vp != NULL) {
		error = EEXIST;
		goto skipit;
	}

	/* authorize */
	if (error == 0) {
		error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx);
	}
	/* get default ownership, etc. */
	if (error == 0) {
		error = vnode_authattr_new(dvp, &va, 0, ctx);
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

	if (error == 0) {
		error = VNOP_SYMLINK(dvp, &vp, &nd.ni_cnd, &va, path, ctx);
	}

	/* do fallback attribute handling */
	if (error == 0 && vp) {
		error = vnode_setattr_fallback(vp, &va, ctx);
	}

#if CONFIG_MACF
	if (error == 0 && vp) {
		error = vnode_label(vnode_mount(vp), dvp, vp, &nd.ni_cnd, VNODE_LABEL_CREATE, ctx);
	}
#endif

	if (error == 0) {
		int update_flags = 0;

		/*check if a new vnode was created, else try to get one*/
		if (vp == NULL) {
			nd.ni_cnd.cn_nameiop = LOOKUP;
#if CONFIG_TRIGGERS
			nd.ni_op = OP_LOOKUP;
#endif
			/*
			 * Clear all flags except HASBUF to prevent 'cn_pnbuf' buffer to be
			 * reallocated again in namei().
			 */
			nd.ni_cnd.cn_flags &= HASBUF;
			error = nameiat(&nd, fd);
			if (error) {
				goto skipit;
			}
			vp = nd.ni_vp;
		}

#if 0  /* XXX - kauth_todo - is KAUTH_FILEOP_SYMLINK needed? */
		/* call out to allow 3rd party notification of rename.
		 * Ignore result of kauth_authorize_fileop call.
		 */
		if (kauth_authorize_fileop_has_listeners() &&
		    namei(&nd) == 0) {
			char *new_link_path = NULL;
			int len;

			/* build the path to the new link file */
			new_link_path = get_pathbuff();
			len = MAXPATHLEN;
			vn_getpath(dvp, new_link_path, &len);
			if ((len + 1 + nd.ni_cnd.cn_namelen + 1) < MAXPATHLEN) {
				new_link_path[len - 1] = '/';
				strlcpy(&new_link_path[len], nd.ni_cnd.cn_nameptr, MAXPATHLEN - len);
			}

			kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_SYMLINK,
			    (uintptr_t)path, (uintptr_t)new_link_path);
			if (new_link_path != NULL) {
				release_pathbuff(new_link_path);
			}
		}
#endif
		// Make sure the name & parent pointers are hooked up
		if (vp->v_name == NULL) {
			update_flags |= VNODE_UPDATE_NAME;
		}
		if (vp->v_parent == NULLVP) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		if (update_flags) {
			vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
		}

#if CONFIG_FSE
		add_fsevent(FSE_CREATE_FILE, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
#endif
	}

skipit:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	vnode_put(dvp);
out:
	/* Only free 'path' if we allocated it (user-space case). */
	if (path && (path != (char *)path_data)) {
		zfree(ZV_NAMEI, path);
	}

	return error;
}
6035
6036 int
symlink(__unused proc_t p,struct symlink_args * uap,__unused int32_t * retval)6037 symlink(__unused proc_t p, struct symlink_args *uap, __unused int32_t *retval)
6038 {
6039 return symlinkat_internal(vfs_context_current(), uap->path, AT_FDCWD,
6040 uap->link, UIO_USERSPACE);
6041 }
6042
6043 int
symlinkat(__unused proc_t p,struct symlinkat_args * uap,__unused int32_t * retval)6044 symlinkat(__unused proc_t p, struct symlinkat_args *uap,
6045 __unused int32_t *retval)
6046 {
6047 return symlinkat_internal(vfs_context_current(), uap->path1, uap->fd,
6048 uap->path2, UIO_USERSPACE);
6049 }
6050
6051 /*
6052 * Delete a whiteout from the filesystem.
6053 * No longer supported.
6054 */
6055 int
undelete(__unused proc_t p,__unused struct undelete_args * uap,__unused int32_t * retval)6056 undelete(__unused proc_t p, __unused struct undelete_args *uap, __unused int32_t *retval)
6057 {
6058 return ENOTSUP;
6059 }
6060
6061 /*
6062 * Delete a name from the filesystem.
6063 */
6064 /* ARGSUSED */
/*
 * Common implementation for unlink(2)/unlinkat(2)/delete(2).
 *
 * Looks up the name via fd (or 'start_dvp' when supplied, which overrides
 * the fd), authorizes, and removes it via vn_remove().  Supports compound
 * remove VNOPs (lookup may return no vp; EKEEPLOOKING redrives the lookup),
 * bounded ENOENT retries for racing hardlink lookups, fsevents/kauth
 * listener notification, and named-stream (rsrc fork) deletion.
 */
static int
unlinkat_internal(vfs_context_t ctx, int fd, vnode_t start_dvp,
    user_addr_t path_arg, enum uio_seg segflg, int unlink_flags)
{
	/* nameidata (and FSE scratch) is large; heap-allocate to spare kernel stack. */
	struct {
		struct nameidata nd;
#if CONFIG_FSE
		struct vnode_attr va;
		fse_info finfo;
#endif
	} *__unlink_data;
	struct nameidata *ndp;
	vnode_t vp, dvp;
	int error;
	struct componentname *cnp;
	char *path = NULL;
	char *no_firmlink_path = NULL;
	int len_path = 0;
	int len_no_firmlink_path = 0;
	int flags;
	int need_event;
	int has_listeners;
	int truncated_path;
	int truncated_no_firmlink_path;
	int batched;
	struct vnode_attr *vap;
	int do_retry;
	int retry_count = 0;
	int cn_flags;
	int nofollow_any = 0;

	cn_flags = LOCKPARENT;
	if (!(unlink_flags & VNODE_REMOVE_NO_AUDIT_PATH)) {
		cn_flags |= AUDITVNPATH1;
	}
	if (unlink_flags & VNODE_REMOVE_NOFOLLOW_ANY) {
		nofollow_any = NAMEI_NOFOLLOW_ANY;
		unlink_flags &= ~VNODE_REMOVE_NOFOLLOW_ANY;
	}
	/* If a starting dvp is passed, it trumps any fd passed. */
	if (start_dvp) {
		cn_flags |= USEDVP;
	}

#if NAMEDRSRCFORK
	/* unlink or delete is allowed on rsrc forks and named streams */
	cn_flags |= CN_ALLOWRSRCFORK;
#endif

	__unlink_data = kalloc_type(typeof(*__unlink_data), Z_WAITOK);
	ndp = &__unlink_data->nd;
#if CONFIG_FSE
	fse_info *finfop = &__unlink_data->finfo;
#endif

retry:
	/* Per-attempt state must be reset on every redrive. */
	do_retry = 0;
	flags = 0;
	need_event = 0;
	has_listeners = 0;
	truncated_path = 0;
	truncated_no_firmlink_path = 0;
	vap = NULL;

	NDINIT(ndp, DELETE, OP_UNLINK, cn_flags, segflg, path_arg, ctx);

	ndp->ni_dvp = start_dvp;
	ndp->ni_flag |= NAMEI_COMPOUNDREMOVE | nofollow_any;
	cnp = &ndp->ni_cnd;

continue_lookup:
	error = nameiat(ndp, fd);
	if (error) {
		goto early_out;
	}

	dvp = ndp->ni_dvp;
	vp = ndp->ni_vp;

	/* With Carbon delete semantics, busy files cannot be deleted */
	if (unlink_flags & VNODE_REMOVE_NODELETEBUSY) {
		flags |= VNODE_REMOVE_NODELETEBUSY;
	}

	/* Skip any potential upcalls if told to. */
	if (unlink_flags & VNODE_REMOVE_SKIP_NAMESPACE_EVENT) {
		flags |= VNODE_REMOVE_SKIP_NAMESPACE_EVENT;
	}

	if (vp) {
		batched = vnode_compound_remove_available(vp);
		/*
		 * The root of a mounted filesystem cannot be deleted.
		 */
		if ((vp->v_flag & VROOT) || (dvp->v_mount != vp->v_mount)) {
			error = EBUSY;
			goto out;
		}

#if DEVELOPMENT || DEBUG
		/*
		 * XXX VSWAP: Check for entitlements or special flag here
		 * so we can restrict access appropriately.
		 */
#else /* DEVELOPMENT || DEBUG */

		if (vnode_isswap(vp) && (ctx != vfs_context_kernel())) {
			error = EPERM;
			goto out;
		}
#endif /* DEVELOPMENT || DEBUG */

		if (!batched) {
			error = vn_authorize_unlink(dvp, vp, cnp, ctx, NULL);
			if (error) {
				/* Bounded redrive for racy ENOENT from authorization. */
				if (error == ENOENT) {
					if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
						do_retry = 1;
						retry_count++;
					}
				}
				goto out;
			}
		}
	} else {
		/* No vp from lookup: the filesystem must do a compound remove. */
		batched = 1;

		if (!vnode_compound_remove_available(dvp)) {
			panic("No vp, but no compound remove?");
		}
	}

#if CONFIG_FSE
	need_event = need_fsevent(FSE_DELETE, dvp);
	if (need_event) {
		if (!batched) {
			if ((vp->v_flag & VISHARDLINK) == 0) {
				/* XXX need to get these data in batched VNOP */
				get_fse_info(vp, finfop, ctx);
			}
		} else {
			error =
			    vfs_get_notify_attributes(&__unlink_data->va);
			if (error) {
				goto out;
			}

			vap = &__unlink_data->va;
		}
	}
#endif
	has_listeners = kauth_authorize_fileop_has_listeners();
	if (need_event || has_listeners) {
		if (path == NULL) {
			GET_PATH(path);
		}
		len_path = safe_getpath(dvp, ndp->ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path);
		if (no_firmlink_path == NULL) {
			GET_PATH(no_firmlink_path);
		}
		len_no_firmlink_path = safe_getpath_no_firmlink(dvp, ndp->ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);
	}

#if NAMEDRSRCFORK
	if (ndp->ni_cnd.cn_flags & CN_WANTSRSRCFORK) {
		error = vnode_removenamedstream(dvp, vp, XATTR_RESOURCEFORK_NAME, 0, ctx);
	} else
#endif
	{
#if CONFIG_FILE_LEASES
		vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

		error = vn_remove(dvp, &ndp->ni_vp, ndp, flags, vap, ctx);
		vp = ndp->ni_vp;
		if (error == EKEEPLOOKING) {
			/* Compound VNOP asked us to continue the lookup in-place. */
			if (!batched) {
				panic("EKEEPLOOKING, but not a filesystem that supports compound VNOPs?");
			}

			if ((ndp->ni_flag & NAMEI_CONTLOOKUP) == 0) {
				panic("EKEEPLOOKING, but continue flag not set?");
			}

			if (vnode_isdir(vp)) {
				error = EISDIR;
				goto out;
			}
			goto continue_lookup;
		} else if (error == ENOENT && batched) {
			if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
				/*
				 * For compound VNOPs, the authorization callback may
				 * return ENOENT in case of racing hardlink lookups
				 * hitting the name cache, redrive the lookup.
				 */
				do_retry = 1;
				retry_count += 1;
				goto out;
			}
		}
	}

	/*
	 * Call out to allow 3rd party notification of delete.
	 * Ignore result of kauth_authorize_fileop call.
	 */
	if (!error) {
		if (has_listeners) {
			kauth_authorize_fileop(vfs_context_ucred(ctx),
			    KAUTH_FILEOP_DELETE,
			    (uintptr_t)vp,
			    (uintptr_t)path);
		}

		if (vp->v_flag & VISHARDLINK) {
			//
			// if a hardlink gets deleted we want to blow away the
			// v_parent link because the path that got us to this
			// instance of the link is no longer valid.  this will
			// force the next call to get the path to ask the file
			// system instead of just following the v_parent link.
			//
			vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
		}

#if CONFIG_FSE
		if (need_event) {
			if (vp->v_flag & VISHARDLINK) {
				get_fse_info(vp, finfop, ctx);
			} else if (vap) {
				vnode_get_fse_info_from_vap(vp, finfop, vap);
			}
			if (truncated_path) {
				finfop->mode |= FSE_TRUNCATED_PATH;
			}
			add_fsevent(FSE_DELETE, ctx,
			    FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
			    FSE_ARG_FINFO, finfop,
			    FSE_ARG_DONE);
		}
#endif

#if CONFIG_MACF
		mac_vnode_notify_unlink(ctx, dvp, vp, cnp);
#endif
	}

out:
	if (path != NULL) {
		RELEASE_PATH(path);
		path = NULL;
	}

	if (no_firmlink_path != NULL) {
		RELEASE_PATH(no_firmlink_path);
		no_firmlink_path = NULL;
	}
#if NAMEDRSRCFORK
	/* recycle the deleted rsrc fork vnode to force a reclaim, which
	 * will cause its shadow file to go away if necessary.
	 */
	if (vp && (vnode_isnamedstream(vp)) &&
	    (vp->v_parent != NULLVP) &&
	    vnode_isshadow(vp)) {
		vnode_recycle(vp);
	}
#endif
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(ndp);
	vnode_put(dvp);
	if (vp) {
		vnode_put(vp);
	}

	if (do_retry) {
		goto retry;
	}

early_out:
	kfree_type(typeof(*__unlink_data), __unlink_data);
	return error;
}
6351
6352 int
unlink1(vfs_context_t ctx,vnode_t start_dvp,user_addr_t path_arg,enum uio_seg segflg,int unlink_flags)6353 unlink1(vfs_context_t ctx, vnode_t start_dvp, user_addr_t path_arg,
6354 enum uio_seg segflg, int unlink_flags)
6355 {
6356 return unlinkat_internal(ctx, AT_FDCWD, start_dvp, path_arg, segflg,
6357 unlink_flags);
6358 }
6359
6360 /*
6361 * Delete a name from the filesystem using Carbon semantics.
6362 */
6363 int
delete(__unused proc_t p,struct delete_args * uap,__unused int32_t * retval)6364 delete(__unused proc_t p, struct delete_args *uap, __unused int32_t *retval)
6365 {
6366 return unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
6367 uap->path, UIO_USERSPACE, VNODE_REMOVE_NODELETEBUSY);
6368 }
6369
6370 /*
6371 * Delete a name from the filesystem using POSIX semantics.
6372 */
6373 int
unlink(__unused proc_t p,struct unlink_args * uap,__unused int32_t * retval)6374 unlink(__unused proc_t p, struct unlink_args *uap, __unused int32_t *retval)
6375 {
6376 return unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
6377 uap->path, UIO_USERSPACE, 0);
6378 }
6379
6380 int
unlinkat(__unused proc_t p,struct unlinkat_args * uap,__unused int32_t * retval)6381 unlinkat(__unused proc_t p, struct unlinkat_args *uap, __unused int32_t *retval)
6382 {
6383 int unlink_flags = 0;
6384
6385 if (uap->flag & ~(AT_REMOVEDIR | AT_REMOVEDIR_DATALESS | AT_SYMLINK_NOFOLLOW_ANY)) {
6386 return EINVAL;
6387 }
6388
6389 if (uap->flag & AT_SYMLINK_NOFOLLOW_ANY) {
6390 unlink_flags |= VNODE_REMOVE_NOFOLLOW_ANY;
6391 }
6392
6393 if (uap->flag & (AT_REMOVEDIR | AT_REMOVEDIR_DATALESS)) {
6394 if (uap->flag & AT_REMOVEDIR_DATALESS) {
6395 unlink_flags |= VNODE_REMOVE_DATALESS_DIR;
6396 }
6397 return rmdirat_internal(vfs_context_current(), uap->fd,
6398 uap->path, UIO_USERSPACE, unlink_flags);
6399 } else {
6400 return unlinkat_internal(vfs_context_current(), uap->fd,
6401 NULLVP, uap->path, UIO_USERSPACE, unlink_flags);
6402 }
6403 }
6404
6405 /*
6406 * Reposition read/write file offset.
6407 */
/*
 * lseek(2): reposition the file offset of fp_glob for uap->fd.
 *
 * FIFOs and sockets return ESPIPE.  SEEK_HOLE/SEEK_DATA are delegated to
 * the filesystem via FSIOC ioctls.  On success *retval carries the new
 * offset.  Note the drop ordering on every exit path: vnode_put() (when an
 * iocount was taken) then file_drop().
 */
int
lseek(proc_t p, struct lseek_args *uap, off_t *retval)
{
	struct fileproc *fp;
	vnode_t vp;
	struct vfs_context *ctx;
	off_t offset = uap->offset, file_size;
	int error;

	if ((error = fp_getfvp(p, uap->fd, &fp, &vp))) {
		/* fp_getfvp says ENOTSUP for non-vnode fds (e.g. sockets): map to ESPIPE */
		if (error == ENOTSUP) {
			return ESPIPE;
		}
		return error;
	}
	if (vnode_isfifo(vp)) {
		file_drop(uap->fd);
		return ESPIPE;
	}


	ctx = vfs_context_current();
#if CONFIG_MACF
	/* lseek(fd, 0, SEEK_CUR) only reads the offset; check accordingly */
	if (uap->whence == L_INCR && uap->offset == 0) {
		error = mac_file_check_get_offset(vfs_context_ucred(ctx),
		    fp->fp_glob);
	} else {
		error = mac_file_check_change_offset(vfs_context_ucred(ctx),
		    fp->fp_glob);
	}
	if (error) {
		file_drop(uap->fd);
		return error;
	}
#endif
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	switch (uap->whence) {
	case L_INCR:
		offset += fp->fp_glob->fg_offset;
		break;
	case L_XTND:
		if ((error = vnode_size(vp, &file_size, ctx)) != 0) {
			break;
		}
		offset += file_size;
		break;
	case L_SET:
		break;
	case SEEK_HOLE:
		error = VNOP_IOCTL(vp, FSIOC_FIOSEEKHOLE, (caddr_t)&offset, 0, ctx);
		break;
	case SEEK_DATA:
		error = VNOP_IOCTL(vp, FSIOC_FIOSEEKDATA, (caddr_t)&offset, 0, ctx);
		break;
	default:
		error = EINVAL;
	}
	if (error == 0) {
		/* positive input that wrapped negative means off_t overflow */
		if (uap->offset > 0 && offset < 0) {
			/* Incremented/relative move past max size */
			error = EOVERFLOW;
		} else {
			/*
			 * Allow negative offsets on character devices, per
			 * POSIX 1003.1-2001.  Most likely for writing disk
			 * labels.
			 */
			if (offset < 0 && vp->v_type != VCHR) {
				/* Decremented/relative move before start */
				error = EINVAL;
			} else {
				/* Success */
				fp->fp_glob->fg_offset = offset;
				*retval = fp->fp_glob->fg_offset;
			}
		}
	}

	/*
	 * An lseek can affect whether data is "available to read."  Use
	 * hint of NOTE_NONE so no EVFILT_VNODE events fire
	 */
	post_event_if_success(vp, error, NOTE_NONE);
	(void)vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
6499
6500
6501 /*
6502 * Check access permissions.
6503 *
6504 * Returns: 0 Success
6505 * vnode_authorize:???
6506 */
6507 static int
access1(vnode_t vp,vnode_t dvp,int uflags,vfs_context_t ctx)6508 access1(vnode_t vp, vnode_t dvp, int uflags, vfs_context_t ctx)
6509 {
6510 kauth_action_t action;
6511 int error;
6512
6513 /*
6514 * If just the regular access bits, convert them to something
6515 * that vnode_authorize will understand.
6516 */
6517 if (!(uflags & _ACCESS_EXTENDED_MASK)) {
6518 action = 0;
6519 if (uflags & R_OK) {
6520 action |= KAUTH_VNODE_READ_DATA; /* aka KAUTH_VNODE_LIST_DIRECTORY */
6521 }
6522 if (uflags & W_OK) {
6523 if (vnode_isdir(vp)) {
6524 action |= KAUTH_VNODE_ADD_FILE |
6525 KAUTH_VNODE_ADD_SUBDIRECTORY;
6526 /* might want delete rights here too */
6527 } else {
6528 action |= KAUTH_VNODE_WRITE_DATA;
6529 }
6530 }
6531 if (uflags & X_OK) {
6532 if (vnode_isdir(vp)) {
6533 action |= KAUTH_VNODE_SEARCH;
6534 } else {
6535 action |= KAUTH_VNODE_EXECUTE;
6536 }
6537 }
6538 } else {
6539 /* take advantage of definition of uflags */
6540 action = uflags >> 8;
6541 }
6542
6543 #if CONFIG_MACF
6544 error = mac_vnode_check_access(ctx, vp, uflags);
6545 if (error) {
6546 return error;
6547 }
6548 #endif /* MAC */
6549
6550 /* action == 0 means only check for existence */
6551 if (action != 0) {
6552 error = vnode_authorize(vp, dvp, action | KAUTH_VNODE_ACCESS, ctx);
6553 } else {
6554 error = 0;
6555 }
6556
6557 return error;
6558 }
6559
6560
6561
6562 /*
6563 * access_extended: Check access permissions in bulk.
6564 *
6565 * Description: uap->entries Pointer to an array of accessx
6566 * descriptor structs, plus one or
6567 * more NULL terminated strings (see
6568 * "Notes" section below).
6569 * uap->size Size of the area pointed to by
6570 * uap->entries.
6571 * uap->results Pointer to the results array.
6572 *
6573 * Returns: 0 Success
6574 * ENOMEM Insufficient memory
6575 * EINVAL Invalid arguments
6576 * namei:EFAULT Bad address
6577 * namei:ENAMETOOLONG Filename too long
6578 * namei:ENOENT No such file or directory
6579 * namei:ELOOP Too many levels of symbolic links
6580 * namei:EBADF Bad file descriptor
6581 * namei:ENOTDIR Not a directory
6582 * namei:???
6583 * access1:
6584 *
6585 * Implicit returns:
6586 * uap->results Array contents modified
6587 *
6588 * Notes: The uap->entries are structured as an arbitrary length array
6589 * of accessx descriptors, followed by one or more NULL terminated
6590 * strings
6591 *
6592 * struct accessx_descriptor[0]
6593 * ...
6594 * struct accessx_descriptor[n]
6595 * char name_data[0];
6596 *
6597 * We determine the entry count by walking the buffer containing
6598 * the uap->entries argument descriptor. For each descriptor we
6599 * see, the valid values for the offset ad_name_offset will be
6600 * in the byte range:
6601 *
6602 * [ uap->entries + sizeof(struct accessx_descriptor) ]
6603 * to
6604 * [ uap->entries + uap->size - 2 ]
6605 *
6606 * since we must have at least one string, and the string must
6607 * be at least one character plus the NULL terminator in length.
6608 *
6609 * XXX: Need to support the check-as uid argument
6610 */
int
access_extended(__unused proc_t p, struct access_extended_args *uap, __unused int32_t *retval)
{
	struct accessx_descriptor *input = NULL;	/* copyin buffer: descriptors + name strings */
	errno_t *result = NULL;				/* per-descriptor result array, copied out */
	errno_t error = 0;
	int wantdelete = 0;
	size_t desc_max, desc_actual = 0;
	unsigned int i, j;
	struct vfs_context context;			/* local context carrying the REAL credential */
	struct nameidata nd;
	int niopts;
	vnode_t vp = NULL;				/* carried across iterations for chained (offset 0) entries */
	vnode_t dvp = NULL;				/* parent vnode, only held when a delete check needs it */
#define ACCESSX_MAX_DESCR_ON_STACK 10
	struct accessx_descriptor stack_input[ACCESSX_MAX_DESCR_ON_STACK];

	context.vc_ucred = NULL;

	/*
	 * Validate parameters; if valid, copy the descriptor array and string
	 * arguments into local memory. Before proceeding, the following
	 * conditions must have been met:
	 *
	 * o The total size is not permitted to exceed ACCESSX_MAX_TABLESIZE
	 * o There must be sufficient room in the request for at least one
	 * descriptor and a one byte NUL terminated string.
	 * o The allocation of local storage must not fail.
	 */
	if (uap->size > ACCESSX_MAX_TABLESIZE) {
		return ENOMEM;
	}
	if (uap->size < (sizeof(struct accessx_descriptor) + 2)) {
		return EINVAL;
	}
	/* Small requests use the on-stack buffer; larger ones heap-allocate. */
	if (uap->size <= sizeof(stack_input)) {
		input = stack_input;
	} else {
		input = kalloc_data(uap->size, Z_WAITOK);
		if (input == NULL) {
			error = ENOMEM;
			goto out;
		}
	}
	error = copyin(uap->entries, input, uap->size);
	if (error) {
		goto out;
	}

	AUDIT_ARG(opaque, input, uap->size);

	/*
	 * Force NUL termination of the copyin buffer to avoid namei() running
	 * off the end. If the caller passes us bogus data, they may get a
	 * bogus result.
	 */
	((char *)input)[uap->size - 1] = 0;

	/*
	 * Access is defined as checking against the process' real identity,
	 * even if operations are checking the effective identity. This
	 * requires that we use a local vfs context.
	 */
	context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
	context.vc_thread = current_thread();

	/*
	 * Find out how many entries we have, so we can allocate the result
	 * array by walking the list and adjusting the count downward by the
	 * earliest string offset we see.
	 */
	desc_max = (uap->size - 2) / sizeof(struct accessx_descriptor);
	desc_actual = desc_max;
	for (i = 0; i < desc_actual; i++) {
		/*
		 * Take the offset to the name string for this entry and
		 * convert to an input array index, which would be one off
		 * the end of the array if this entry was the lowest-addressed
		 * name string.
		 */
		j = input[i].ad_name_offset / sizeof(struct accessx_descriptor);

		/*
		 * An offset greater than the max allowable offset is an error.
		 * It is also an error for any valid entry to point
		 * to a location prior to the end of the current entry, if
		 * it's not a reference to the string of the previous entry.
		 */
		if (j > desc_max || (j != 0 && j <= i)) {
			error = EINVAL;
			goto out;
		}

		/* Also do not let ad_name_offset point to something beyond the size of the input */
		if (input[i].ad_name_offset >= uap->size) {
			error = EINVAL;
			goto out;
		}

		/*
		 * An offset of 0 means use the previous descriptor's offset;
		 * this is used to chain multiple requests for the same file
		 * to avoid multiple lookups.
		 */
		if (j == 0) {
			/* This is not valid for the first entry */
			if (i == 0) {
				error = EINVAL;
				goto out;
			}
			continue;
		}

		/*
		 * If the offset of the string for this descriptor is before
		 * what we believe is the current actual last descriptor,
		 * then we need to adjust our estimate downward; this permits
		 * the string table following the last descriptor to be out
		 * of order relative to the descriptor list.
		 */
		if (j < desc_actual) {
			desc_actual = j;
		}
	}

	/*
	 * We limit the actual number of descriptors we are willing to process
	 * to a hard maximum of ACCESSX_MAX_DESCRIPTORS. If the number being
	 * requested does not exceed this limit,
	 */
	if (desc_actual > ACCESSX_MAX_DESCRIPTORS) {
		error = ENOMEM;
		goto out;
	}
	result = kalloc_data(desc_actual * sizeof(errno_t), Z_WAITOK | Z_ZERO);
	if (result == NULL) {
		error = ENOMEM;
		goto out;
	}

	/*
	 * Do the work by iterating over the descriptor entries we know to
	 * at least appear to contain valid data.
	 */
	error = 0;
	for (i = 0; i < desc_actual; i++) {
		/*
		 * If the ad_name_offset is 0, then we use the previous
		 * results to make the check; otherwise, we are looking up
		 * a new file name.
		 */
		if (input[i].ad_name_offset != 0) {
			/* discard old vnodes */
			if (vp) {
				vnode_put(vp);
				vp = NULL;
			}
			if (dvp) {
				vnode_put(dvp);
				dvp = NULL;
			}

			/*
			 * Scan forward in the descriptor list to see if we
			 * need the parent vnode. We will need it if we are
			 * deleting, since we must have rights to remove
			 * entries in the parent directory, as well as the
			 * rights to delete the object itself.
			 */
			wantdelete = input[i].ad_flags & _DELETE_OK;
			for (j = i + 1; (j < desc_actual) && (input[j].ad_name_offset == 0); j++) {
				if (input[j].ad_flags & _DELETE_OK) {
					wantdelete = 1;
				}
			}

			niopts = FOLLOW | AUDITVNPATH1;

			/* need parent for vnode_authorize for deletion test */
			if (wantdelete) {
				niopts |= WANTPARENT;
			}

			/* do the lookup */
			NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, UIO_SYSSPACE,
			    CAST_USER_ADDR_T(((const char *)input) + input[i].ad_name_offset),
			    &context);
			error = namei(&nd);
			if (!error) {
				vp = nd.ni_vp;
				if (wantdelete) {
					dvp = nd.ni_dvp;
				}
			}
			nameidone(&nd);
		}

		/*
		 * Handle lookup errors. "Expected" lookup failures are
		 * recorded per-descriptor; anything else aborts the call.
		 */
		switch (error) {
		case ENOENT:
		case EACCES:
		case EPERM:
		case ENOTDIR:
			result[i] = error;
			break;
		case 0:
			/* run this access check */
			result[i] = access1(vp, dvp, input[i].ad_flags, &context);
			break;
		default:
			/* fatal lookup error */

			goto out;
		}
	}

	AUDIT_ARG(data, result, sizeof(errno_t), desc_actual);

	/* copy out results */
	error = copyout(result, uap->results, desc_actual * sizeof(errno_t));

out:
	if (input && input != stack_input) {
		kfree_data(input, uap->size);
	}
	if (result) {
		kfree_data(result, desc_actual * sizeof(errno_t));
	}
	if (vp) {
		vnode_put(vp);
	}
	if (dvp) {
		vnode_put(dvp);
	}
	if (IS_VALID_CRED(context.vc_ucred)) {
		kauth_cred_unref(&context.vc_ucred);
	}
	return error;
}
6852
6853
6854 /*
6855 * Returns: 0 Success
6856 * namei:EFAULT Bad address
6857 * namei:ENAMETOOLONG Filename too long
6858 * namei:ENOENT No such file or directory
6859 * namei:ELOOP Too many levels of symbolic links
6860 * namei:EBADF Bad file descriptor
6861 * namei:ENOTDIR Not a directory
6862 * namei:???
6863 * access1:
6864 */
/*
 * Common implementation for access(2) and faccessat(2): look up `path'
 * relative to `fd' and run the access1() permission check for `amode'.
 */
static int
faccessat_internal(vfs_context_t ctx, int fd, user_addr_t path, int amode,
    int flag, enum uio_seg segflg)
{
	int error;
	struct nameidata nd;
	int niopts;
	struct vfs_context context;	/* local context; credential may be a fresh copy (see below) */
#if NAMEDRSRCFORK
	int is_namedstream = 0;
#endif

	/*
	 * Unless the AT_EACCESS option is used, Access is defined as checking
	 * against the process' real identity, even if operations are checking
	 * the effective identity. So we need to tweak the credential
	 * in the context for that case.
	 */
	if (!(flag & AT_EACCESS)) {
		/* copied credential: must be released at "out" below */
		context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
	} else {
		/* borrowed from the caller's context: not released here */
		context.vc_ucred = ctx->vc_ucred;
	}
	context.vc_thread = ctx->vc_thread;


	niopts = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY) ? NOFOLLOW : FOLLOW) | AUDITVNPATH1;
	/* need parent for vnode_authorize for deletion test */
	if (amode & _DELETE_OK) {
		niopts |= WANTPARENT;
	}
	NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, segflg,
	    path, &context);
	if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}

#if NAMEDRSRCFORK
	/* access(F_OK) calls are allowed for resource forks. */
	if (amode == F_OK) {
		nd.ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
	}
#endif
	error = nameiat(&nd, fd);
	if (error) {
		goto out;
	}

#if NAMEDRSRCFORK
	/* Grab reference on the shadow stream file vnode to
	 * force an inactive on release which will mark it
	 * for recycle.
	 */
	if (vnode_isnamedstream(nd.ni_vp) &&
	    (nd.ni_vp->v_parent != NULLVP) &&
	    vnode_isshadow(nd.ni_vp)) {
		is_namedstream = 1;
		vnode_ref(nd.ni_vp);
	}
#endif

	error = access1(nd.ni_vp, nd.ni_dvp, amode, &context);

#if NAMEDRSRCFORK
	if (is_namedstream) {
		vnode_rele(nd.ni_vp);
	}
#endif

	vnode_put(nd.ni_vp);
	/* nd.ni_dvp only holds an iocount when WANTPARENT was requested above */
	if (amode & _DELETE_OK) {
		vnode_put(nd.ni_dvp);
	}
	nameidone(&nd);

out:
	/* only the copied (real-identity) credential is ours to release */
	if (!(flag & AT_EACCESS)) {
		kauth_cred_unref(&context.vc_ucred);
	}
	return error;
}
6946
6947 int
access(__unused proc_t p,struct access_args * uap,__unused int32_t * retval)6948 access(__unused proc_t p, struct access_args *uap, __unused int32_t *retval)
6949 {
6950 return faccessat_internal(vfs_context_current(), AT_FDCWD,
6951 uap->path, uap->flags, 0, UIO_USERSPACE);
6952 }
6953
6954 int
faccessat(__unused proc_t p,struct faccessat_args * uap,__unused int32_t * retval)6955 faccessat(__unused proc_t p, struct faccessat_args *uap,
6956 __unused int32_t *retval)
6957 {
6958 if (uap->flag & ~(AT_EACCESS | AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) {
6959 return EINVAL;
6960 }
6961
6962 return faccessat_internal(vfs_context_current(), uap->fd,
6963 uap->path, uap->amode, uap->flag, UIO_USERSPACE);
6964 }
6965
6966 /*
6967 * Returns: 0 Success
6968 * EFAULT
6969 * copyout:EFAULT
6970 * namei:???
6971 * vn_stat:???
6972 */
6973 static int
fstatat_internal(vfs_context_t ctx,user_addr_t path,user_addr_t ub,user_addr_t xsecurity,user_addr_t xsecurity_size,int isstat64,enum uio_seg segflg,int fd,int flag)6974 fstatat_internal(vfs_context_t ctx, user_addr_t path, user_addr_t ub,
6975 user_addr_t xsecurity, user_addr_t xsecurity_size, int isstat64,
6976 enum uio_seg segflg, int fd, int flag)
6977 {
6978 struct nameidata *ndp = NULL;
6979 int follow;
6980 union {
6981 struct stat sb;
6982 struct stat64 sb64;
6983 } source = {};
6984 union {
6985 struct user64_stat user64_sb;
6986 struct user32_stat user32_sb;
6987 struct user64_stat64 user64_sb64;
6988 struct user32_stat64 user32_sb64;
6989 } dest = {};
6990 caddr_t sbp;
6991 int error, my_size;
6992 kauth_filesec_t fsec = KAUTH_FILESEC_NONE;
6993 size_t xsecurity_bufsize;
6994 void * statptr;
6995 struct fileproc *fp = NULL;
6996 int needsrealdev = 0;
6997
6998 follow = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) ? NOFOLLOW : FOLLOW;
6999 ndp = kalloc_type(struct nameidata, Z_WAITOK);
7000 NDINIT(ndp, LOOKUP, OP_GETATTR, follow | AUDITVNPATH1,
7001 segflg, path, ctx);
7002 if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
7003 ndp->ni_flag |= NAMEI_NOFOLLOW_ANY;
7004 }
7005
7006 #if NAMEDRSRCFORK
7007 int is_namedstream = 0;
7008 /* stat calls are allowed for resource forks. */
7009 ndp->ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
7010 #endif
7011
7012 if (flag & AT_FDONLY) {
7013 vnode_t fvp;
7014
7015 error = fp_getfvp(vfs_context_proc(ctx), fd, &fp, &fvp);
7016 if (error) {
7017 goto out;
7018 }
7019 if ((error = vnode_getwithref(fvp))) {
7020 file_drop(fd);
7021 goto out;
7022 }
7023 ndp->ni_vp = fvp;
7024 } else {
7025 error = nameiat(ndp, fd);
7026 if (error) {
7027 goto out;
7028 }
7029 }
7030
7031 statptr = (void *)&source;
7032
7033 #if NAMEDRSRCFORK
7034 /* Grab reference on the shadow stream file vnode to
7035 * force an inactive on release which will mark it
7036 * for recycle.
7037 */
7038 if (vnode_isnamedstream(ndp->ni_vp) &&
7039 (ndp->ni_vp->v_parent != NULLVP) &&
7040 vnode_isshadow(ndp->ni_vp)) {
7041 is_namedstream = 1;
7042 vnode_ref(ndp->ni_vp);
7043 }
7044 #endif
7045
7046 needsrealdev = flag & AT_REALDEV ? 1 : 0;
7047 if (fp && (xsecurity == USER_ADDR_NULL)) {
7048 /*
7049 * If the caller has the file open, and is not
7050 * requesting extended security information, we are
7051 * going to let them get the basic stat information.
7052 */
7053 error = vn_stat_noauth(ndp->ni_vp, statptr, NULL, isstat64, needsrealdev, ctx,
7054 fp->fp_glob->fg_cred);
7055 } else {
7056 error = vn_stat(ndp->ni_vp, statptr, (xsecurity != USER_ADDR_NULL ? &fsec : NULL),
7057 isstat64, needsrealdev, ctx);
7058 }
7059
7060 #if NAMEDRSRCFORK
7061 if (is_namedstream) {
7062 vnode_rele(ndp->ni_vp);
7063 }
7064 #endif
7065 vnode_put(ndp->ni_vp);
7066 nameidone(ndp);
7067
7068 if (fp) {
7069 file_drop(fd);
7070 fp = NULL;
7071 }
7072
7073 if (error) {
7074 goto out;
7075 }
7076 /* Zap spare fields */
7077 if (isstat64 != 0) {
7078 source.sb64.st_lspare = 0;
7079 source.sb64.st_qspare[0] = 0LL;
7080 source.sb64.st_qspare[1] = 0LL;
7081 if (vfs_context_is64bit(ctx)) {
7082 munge_user64_stat64(&source.sb64, &dest.user64_sb64);
7083 my_size = sizeof(dest.user64_sb64);
7084 sbp = (caddr_t)&dest.user64_sb64;
7085 } else {
7086 munge_user32_stat64(&source.sb64, &dest.user32_sb64);
7087 my_size = sizeof(dest.user32_sb64);
7088 sbp = (caddr_t)&dest.user32_sb64;
7089 }
7090 /*
7091 * Check if we raced (post lookup) against the last unlink of a file.
7092 */
7093 if ((source.sb64.st_nlink == 0) && S_ISREG(source.sb64.st_mode)) {
7094 source.sb64.st_nlink = 1;
7095 }
7096 } else {
7097 source.sb.st_lspare = 0;
7098 source.sb.st_qspare[0] = 0LL;
7099 source.sb.st_qspare[1] = 0LL;
7100 if (vfs_context_is64bit(ctx)) {
7101 munge_user64_stat(&source.sb, &dest.user64_sb);
7102 my_size = sizeof(dest.user64_sb);
7103 sbp = (caddr_t)&dest.user64_sb;
7104 } else {
7105 munge_user32_stat(&source.sb, &dest.user32_sb);
7106 my_size = sizeof(dest.user32_sb);
7107 sbp = (caddr_t)&dest.user32_sb;
7108 }
7109
7110 /*
7111 * Check if we raced (post lookup) against the last unlink of a file.
7112 */
7113 if ((source.sb.st_nlink == 0) && S_ISREG(source.sb.st_mode)) {
7114 source.sb.st_nlink = 1;
7115 }
7116 }
7117 if ((error = copyout(sbp, ub, my_size)) != 0) {
7118 goto out;
7119 }
7120
7121 /* caller wants extended security information? */
7122 if (xsecurity != USER_ADDR_NULL) {
7123 /* did we get any? */
7124 if (fsec == KAUTH_FILESEC_NONE) {
7125 if (susize(xsecurity_size, 0) != 0) {
7126 error = EFAULT;
7127 goto out;
7128 }
7129 } else {
7130 /* find the user buffer size */
7131 xsecurity_bufsize = fusize(xsecurity_size);
7132
7133 /* copy out the actual data size */
7134 if (susize(xsecurity_size, KAUTH_FILESEC_COPYSIZE(fsec)) != 0) {
7135 error = EFAULT;
7136 goto out;
7137 }
7138
7139 /* if the caller supplied enough room, copy out to it */
7140 if (xsecurity_bufsize >= KAUTH_FILESEC_COPYSIZE(fsec)) {
7141 error = copyout(fsec, xsecurity, KAUTH_FILESEC_COPYSIZE(fsec));
7142 }
7143 }
7144 }
7145 out:
7146 if (ndp) {
7147 kfree_type(struct nameidata, ndp);
7148 }
7149 if (fsec != KAUTH_FILESEC_NONE) {
7150 kauth_filesec_free(fsec);
7151 }
7152 return error;
7153 }
7154
7155 /*
7156 * stat_extended: Get file status; with extended security (ACL).
7157 *
7158 * Parameters: p (ignored)
7159 * uap User argument descriptor (see below)
7160 * retval (ignored)
7161 *
7162 * Indirect: uap->path Path of file to get status from
7163 * uap->ub User buffer (holds file status info)
7164 * uap->xsecurity ACL to get (extended security)
7165 * uap->xsecurity_size Size of ACL
7166 *
7167 * Returns: 0 Success
7168 * !0 errno value
7169 *
7170 */
7171 int
stat_extended(__unused proc_t p,struct stat_extended_args * uap,__unused int32_t * retval)7172 stat_extended(__unused proc_t p, struct stat_extended_args *uap,
7173 __unused int32_t *retval)
7174 {
7175 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7176 uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
7177 0);
7178 }
7179
7180 /*
7181 * Returns: 0 Success
7182 * fstatat_internal:??? [see fstatat_internal() in this file]
7183 */
7184 int
stat(__unused proc_t p,struct stat_args * uap,__unused int32_t * retval)7185 stat(__unused proc_t p, struct stat_args *uap, __unused int32_t *retval)
7186 {
7187 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7188 0, 0, 0, UIO_USERSPACE, AT_FDCWD, 0);
7189 }
7190
7191 int
stat64(__unused proc_t p,struct stat64_args * uap,__unused int32_t * retval)7192 stat64(__unused proc_t p, struct stat64_args *uap, __unused int32_t *retval)
7193 {
7194 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7195 0, 0, 1, UIO_USERSPACE, AT_FDCWD, 0);
7196 }
7197
7198 /*
7199 * stat64_extended: Get file status; can handle large inode numbers; with extended security (ACL).
7200 *
7201 * Parameters: p (ignored)
7202 * uap User argument descriptor (see below)
7203 * retval (ignored)
7204 *
7205 * Indirect: uap->path Path of file to get status from
7206 * uap->ub User buffer (holds file status info)
7207 * uap->xsecurity ACL to get (extended security)
7208 * uap->xsecurity_size Size of ACL
7209 *
7210 * Returns: 0 Success
7211 * !0 errno value
7212 *
7213 */
7214 int
stat64_extended(__unused proc_t p,struct stat64_extended_args * uap,__unused int32_t * retval)7215 stat64_extended(__unused proc_t p, struct stat64_extended_args *uap, __unused int32_t *retval)
7216 {
7217 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7218 uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
7219 0);
7220 }
7221
7222 /*
7223 * lstat_extended: Get file status; does not follow links; with extended security (ACL).
7224 *
7225 * Parameters: p (ignored)
7226 * uap User argument descriptor (see below)
7227 * retval (ignored)
7228 *
7229 * Indirect: uap->path Path of file to get status from
7230 * uap->ub User buffer (holds file status info)
7231 * uap->xsecurity ACL to get (extended security)
7232 * uap->xsecurity_size Size of ACL
7233 *
7234 * Returns: 0 Success
7235 * !0 errno value
7236 *
7237 */
7238 int
lstat_extended(__unused proc_t p,struct lstat_extended_args * uap,__unused int32_t * retval)7239 lstat_extended(__unused proc_t p, struct lstat_extended_args *uap, __unused int32_t *retval)
7240 {
7241 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7242 uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
7243 AT_SYMLINK_NOFOLLOW);
7244 }
7245
7246 /*
7247 * Get file status; this version does not follow links.
7248 */
7249 int
lstat(__unused proc_t p,struct lstat_args * uap,__unused int32_t * retval)7250 lstat(__unused proc_t p, struct lstat_args *uap, __unused int32_t *retval)
7251 {
7252 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7253 0, 0, 0, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW);
7254 }
7255
7256 int
lstat64(__unused proc_t p,struct lstat64_args * uap,__unused int32_t * retval)7257 lstat64(__unused proc_t p, struct lstat64_args *uap, __unused int32_t *retval)
7258 {
7259 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7260 0, 0, 1, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW);
7261 }
7262
7263 /*
7264 * lstat64_extended: Get file status; can handle large inode numbers; does not
7265 * follow links; with extended security (ACL).
7266 *
7267 * Parameters: p (ignored)
7268 * uap User argument descriptor (see below)
7269 * retval (ignored)
7270 *
7271 * Indirect: uap->path Path of file to get status from
7272 * uap->ub User buffer (holds file status info)
7273 * uap->xsecurity ACL to get (extended security)
7274 * uap->xsecurity_size Size of ACL
7275 *
7276 * Returns: 0 Success
7277 * !0 errno value
7278 *
7279 */
7280 int
lstat64_extended(__unused proc_t p,struct lstat64_extended_args * uap,__unused int32_t * retval)7281 lstat64_extended(__unused proc_t p, struct lstat64_extended_args *uap, __unused int32_t *retval)
7282 {
7283 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7284 uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
7285 AT_SYMLINK_NOFOLLOW);
7286 }
7287
7288 int
fstatat(__unused proc_t p,struct fstatat_args * uap,__unused int32_t * retval)7289 fstatat(__unused proc_t p, struct fstatat_args *uap, __unused int32_t *retval)
7290 {
7291 if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_REALDEV | AT_FDONLY | AT_SYMLINK_NOFOLLOW_ANY)) {
7292 return EINVAL;
7293 }
7294
7295 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7296 0, 0, 0, UIO_USERSPACE, uap->fd, uap->flag);
7297 }
7298
7299 int
fstatat64(__unused proc_t p,struct fstatat64_args * uap,__unused int32_t * retval)7300 fstatat64(__unused proc_t p, struct fstatat64_args *uap,
7301 __unused int32_t *retval)
7302 {
7303 if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_REALDEV | AT_FDONLY | AT_SYMLINK_NOFOLLOW_ANY)) {
7304 return EINVAL;
7305 }
7306
7307 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7308 0, 0, 1, UIO_USERSPACE, uap->fd, uap->flag);
7309 }
7310
7311 /*
7312 * Get configurable pathname variables.
7313 *
7314 * Returns: 0 Success
7315 * namei:???
7316 * vn_pathconf:???
7317 *
7318 * Notes: Global implementation constants are intended to be
7319 * implemented in this function directly; all other constants
7320 * are per-FS implementation, and therefore must be handled in
7321 * each respective FS, instead.
7322 *
7323 * XXX We implement some things globally right now that should actually be
7324 * XXX per-FS; we will need to deal with this at some point.
7325 */
7326 /* ARGSUSED */
7327 int
pathconf(__unused proc_t p,struct pathconf_args * uap,int32_t * retval)7328 pathconf(__unused proc_t p, struct pathconf_args *uap, int32_t *retval)
7329 {
7330 int error;
7331 struct nameidata nd;
7332 vfs_context_t ctx = vfs_context_current();
7333
7334 NDINIT(&nd, LOOKUP, OP_PATHCONF, FOLLOW | AUDITVNPATH1,
7335 UIO_USERSPACE, uap->path, ctx);
7336 error = namei(&nd);
7337 if (error) {
7338 return error;
7339 }
7340
7341 error = vn_pathconf(nd.ni_vp, uap->name, retval, ctx);
7342
7343 vnode_put(nd.ni_vp);
7344 nameidone(&nd);
7345 return error;
7346 }
7347
7348 /*
7349 * Return target name of a symbolic link.
7350 */
7351 /* ARGSUSED */
7352 static int
readlinkat_internal(vfs_context_t ctx,int fd,vnode_t lnk_vp,user_addr_t path,enum uio_seg seg,user_addr_t buf,size_t bufsize,enum uio_seg bufseg,int * retval)7353 readlinkat_internal(vfs_context_t ctx, int fd, vnode_t lnk_vp, user_addr_t path,
7354 enum uio_seg seg, user_addr_t buf, size_t bufsize, enum uio_seg bufseg,
7355 int *retval)
7356 {
7357 vnode_t vp;
7358 uio_t auio;
7359 int error;
7360 struct nameidata nd;
7361 UIO_STACKBUF(uio_buf, 1);
7362 bool put_vnode;
7363
7364 if (bufsize > INT32_MAX) {
7365 return EINVAL;
7366 }
7367
7368 if (lnk_vp) {
7369 vp = lnk_vp;
7370 put_vnode = false;
7371 } else {
7372 NDINIT(&nd, LOOKUP, OP_READLINK, NOFOLLOW | AUDITVNPATH1,
7373 seg, path, ctx);
7374
7375 error = nameiat(&nd, fd);
7376 if (error) {
7377 return error;
7378 }
7379 vp = nd.ni_vp;
7380 put_vnode = true;
7381 nameidone(&nd);
7382 }
7383
7384 auio = uio_createwithbuffer(1, 0, bufseg, UIO_READ,
7385 &uio_buf[0], sizeof(uio_buf));
7386 uio_addiov(auio, buf, bufsize);
7387 if (vp->v_type != VLNK) {
7388 error = EINVAL;
7389 } else {
7390 #if CONFIG_MACF
7391 error = mac_vnode_check_readlink(ctx, vp);
7392 #endif
7393 if (error == 0) {
7394 error = vnode_authorize(vp, NULL, KAUTH_VNODE_READ_DATA,
7395 ctx);
7396 }
7397 if (error == 0) {
7398 error = VNOP_READLINK(vp, auio, ctx);
7399 }
7400 }
7401
7402 if (put_vnode) {
7403 vnode_put(vp);
7404 }
7405
7406 *retval = (int)(bufsize - uio_resid(auio));
7407 return error;
7408 }
7409
7410 int
freadlink(proc_t p,struct freadlink_args * uap,int32_t * retval)7411 freadlink(proc_t p, struct freadlink_args *uap, int32_t *retval)
7412 {
7413 enum uio_seg procseg;
7414 vnode_t vp;
7415 int error;
7416
7417 procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
7418
7419 AUDIT_ARG(fd, uap->fd);
7420
7421 if ((error = file_vnode(uap->fd, &vp))) {
7422 return error;
7423 }
7424 if ((error = vnode_getwithref(vp))) {
7425 file_drop(uap->fd);
7426 return error;
7427 }
7428
7429 error = readlinkat_internal(vfs_context_current(), -1,
7430 vp, 0, procseg, CAST_USER_ADDR_T(uap->buf),
7431 uap->bufsize, procseg, retval);
7432
7433 vnode_put(vp);
7434 file_drop(uap->fd);
7435 return error;
7436 }
7437
7438 int
readlink(proc_t p,struct readlink_args * uap,int32_t * retval)7439 readlink(proc_t p, struct readlink_args *uap, int32_t *retval)
7440 {
7441 enum uio_seg procseg;
7442
7443 procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
7444 return readlinkat_internal(vfs_context_current(), AT_FDCWD, NULL,
7445 CAST_USER_ADDR_T(uap->path), procseg, CAST_USER_ADDR_T(uap->buf),
7446 uap->count, procseg, retval);
7447 }
7448
7449 int
readlinkat(proc_t p,struct readlinkat_args * uap,int32_t * retval)7450 readlinkat(proc_t p, struct readlinkat_args *uap, int32_t *retval)
7451 {
7452 enum uio_seg procseg;
7453
7454 procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
7455 return readlinkat_internal(vfs_context_current(), uap->fd, NULL,
7456 CAST_USER_ADDR_T(uap->path), procseg, uap->buf, uap->bufsize, procseg,
7457 retval);
7458 }
7459
7460 /*
7461 * Change file flags, the deep inner layer.
7462 */
/*
 * Change file flags, the deep inner layer.
 *
 * Runs the MACF check, authorizes the attribute change (ignoring
 * immutability so immutable flags can be cleared), then invokes the
 * caller-supplied setattr callback with `arg'.  Notifies MACF on success.
 * Does not take or drop any vnode reference; that is the caller's job.
 */
static int
chflags0(vnode_t vp, struct vnode_attr *va,
    int (*setattr)(vnode_t, void *, vfs_context_t),
    void *arg, vfs_context_t ctx)
{
	kauth_action_t action = 0;
	int error;

#if CONFIG_MACF
	error = mac_vnode_check_setflags(ctx, vp, va->va_flags);
	if (error) {
		goto out;
	}
#endif

	/* request authorisation, disregard immutability */
	if ((error = vnode_authattr(vp, va, &action, ctx)) != 0) {
		goto out;
	}
	/*
	 * Request that the auth layer disregard those file flags it's allowed to when
	 * authorizing this operation; we need to do this in order to be able to
	 * clear immutable flags.
	 */
	if (action && ((error = vnode_authorize(vp, NULL, action | KAUTH_VNODE_NOIMMUTABLE, ctx)) != 0)) {
		goto out;
	}
	error = (*setattr)(vp, arg, ctx);

#if CONFIG_MACF
	if (error == 0) {
		mac_vnode_notify_setflags(ctx, vp, va->va_flags);
	}
#endif

out:
	return error;
}
7501
7502 /*
7503 * Change file flags.
7504 *
7505 * NOTE: this will vnode_put() `vp'
7506 */
7507 static int
chflags1(vnode_t vp,int flags,vfs_context_t ctx)7508 chflags1(vnode_t vp, int flags, vfs_context_t ctx)
7509 {
7510 struct vnode_attr va;
7511 int error;
7512
7513 VATTR_INIT(&va);
7514 VATTR_SET(&va, va_flags, flags);
7515
7516 error = chflags0(vp, &va, (void *)vnode_setattr, &va, ctx);
7517 vnode_put(vp);
7518
7519 if ((error == 0) && !VATTR_IS_SUPPORTED(&va, va_flags)) {
7520 error = ENOTSUP;
7521 }
7522
7523 return error;
7524 }
7525
7526 /*
7527 * Change flags of a file given a path name.
7528 */
7529 /* ARGSUSED */
/*
 * Change flags of a file given a path name.
 */
/* ARGSUSED */
int
chflags(__unused proc_t p, struct chflags_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	vfs_context_t ctx = vfs_context_current();
	int error;
	struct nameidata nd;
	uint32_t wantparent = 0;	/* parent needed only to break directory leases */

#if CONFIG_FILE_LEASES
	wantparent = WANTPARENT;
#endif

	AUDIT_ARG(fflags, uap->flags);
	NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1 | wantparent,
	    UIO_USERSPACE, uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;

#if CONFIG_FILE_LEASES
	/* break any directory lease on the parent, then drop its iocount */
	vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
	vnode_put(nd.ni_dvp);
#endif

	nameidone(&nd);

	/* we don't vnode_put() here because chflags1 does internally */
	error = chflags1(vp, uap->flags, ctx);

	return error;
}
7564
7565 /*
7566 * Change flags of a file given a file descriptor.
7567 */
7568 /* ARGSUSED */
/*
 * Change flags of a file given a file descriptor.
 */
/* ARGSUSED */
int
fchflags(__unused proc_t p, struct fchflags_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	int error;

	AUDIT_ARG(fd, uap->fd);
	AUDIT_ARG(fflags, uap->flags);
	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}

	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

#if CONFIG_FILE_LEASES
	/* no dvp in hand here, so break the parent's lease via the vnode itself */
	vnode_breakdirlease(vp, true, O_WRONLY);
#endif

	/* we don't vnode_put() here because chflags1 does internally */
	error = chflags1(vp, uap->flags, vfs_context_current());

	file_drop(uap->fd);
	return error;
}
7598
7599 /*
7600 * Change security information on a filesystem object.
7601 *
7602 * Returns: 0 Success
7603 * EPERM Operation not permitted
7604 * vnode_authattr:??? [anything vnode_authattr can return]
7605 * vnode_authorize:??? [anything vnode_authorize can return]
7606 * vnode_setattr:??? [anything vnode_setattr can return]
7607 *
7608 * Notes: If vnode_authattr or vnode_authorize return EACCES, it will be
7609 * translated to EPERM before being returned.
7610 */
/*
 * Apply the security-attribute changes in `vap' (mode, owner, group,
 * ACL) to `vp': run the MACF checks, authorize, set the attributes,
 * then send the MACF notifications.  EACCES from the authorization
 * step is translated to EPERM (see block comment above).
 */
static int
chmod_vnode(vfs_context_t ctx, vnode_t vp, struct vnode_attr *vap)
{
	kauth_action_t action;
	int error;

	AUDIT_ARG(mode, vap->va_mode);
	/* XXX audit new args */

#if NAMEDSTREAMS
	/* chmod calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		return EPERM;
	}
#endif

#if CONFIG_MACF
	if (VATTR_IS_ACTIVE(vap, va_mode) &&
	    (error = mac_vnode_check_setmode(ctx, vp, (mode_t)vap->va_mode)) != 0) {
		return error;
	}

	if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid)) {
		if ((error = mac_vnode_check_setowner(ctx, vp,
		    VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : -1,
		    VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : -1))) {
			return error;
		}
	}

	if (VATTR_IS_ACTIVE(vap, va_acl) &&
	    (error = mac_vnode_check_setacl(ctx, vp, vap->va_acl))) {
		return error;
	}
#endif

	/* make sure that the caller is allowed to set this security information */
	if (((error = vnode_authattr(vp, vap, &action, ctx)) != 0) ||
	    ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		if (error == EACCES) {
			error = EPERM;
		}
		return error;
	}

	if ((error = vnode_setattr(vp, vap, ctx)) != 0) {
		return error;
	}

#if CONFIG_MACF
	/* notifications fire only after the attributes were actually set */
	if (VATTR_IS_ACTIVE(vap, va_mode)) {
		mac_vnode_notify_setmode(ctx, vp, (mode_t)vap->va_mode);
	}

	if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid)) {
		mac_vnode_notify_setowner(ctx, vp,
		    VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : -1,
		    VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : -1);
	}

	if (VATTR_IS_ACTIVE(vap, va_acl)) {
		mac_vnode_notify_setacl(ctx, vp, vap->va_acl);
	}
#endif

	return error;
}
7678
7679
7680 /*
7681 * Change mode of a file given a path name.
7682 *
7683 * Returns: 0 Success
7684 * namei:??? [anything namei can return]
7685 * chmod_vnode:??? [anything chmod_vnode can return]
7686 */
/*
 * Look up `path' relative to `fd' and apply the attribute changes in
 * `vap' via chmod_vnode().  `flag' carries AT_SYMLINK_NOFOLLOW[_ANY].
 */
static int
chmodat(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap,
    int fd, int flag, enum uio_seg segflg)
{
	struct nameidata nd;
	int follow, error;
	uint32_t wantparent = 0;	/* parent needed only to break directory leases */

#if CONFIG_FILE_LEASES
	wantparent = WANTPARENT;
#endif

	follow = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) ? NOFOLLOW : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1 | wantparent,
	    segflg, path, ctx);
	if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}
	if ((error = nameiat(&nd, fd))) {
		return error;
	}

#if CONFIG_FILE_LEASES
	/* break any directory lease on the parent, then drop its iocount */
	vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
	vnode_put(nd.ni_dvp);
#endif

	error = chmod_vnode(ctx, nd.ni_vp, vap);
	vnode_put(nd.ni_vp);
	nameidone(&nd);
	return error;
}
7719
/*
 * Common setup for chmod_extended()/fchmod_extended(): populate *pva from
 * the mode/uid/gid arguments and, when xsecurity is supplied, copy in the
 * caller's filesec and attach its ACL.  On success, any filesec allocated
 * here is returned via *pxsecdst and must be freed by the caller with
 * kauth_filesec_free().
 */
static int
chmod_extended_init(struct vnode_attr *pva, kauth_filesec_t *pxsecdst, int mode, uid_t uid,
    gid_t gid, user_addr_t xsecurity)
{
	int error;

	VATTR_INIT(pva);

	if (mode != -1) {
		VATTR_SET(pva, va_mode, mode & ALLPERMS);
	} else {
		/* mode == -1 means "leave mode alone"; va_mode stays inactive. */
		pva->va_mode = 0;
	}

	if (uid != KAUTH_UID_NONE) {
		VATTR_SET(pva, va_uid, uid);
	}

	if (gid != KAUTH_GID_NONE) {
		VATTR_SET(pva, va_gid, gid);
	}

	*pxsecdst = NULL;
	switch (xsecurity) {
	case USER_ADDR_NULL:
		/* No ACL change requested. */
		break;

	case CAST_USER_ADDR_T((void *)1):       /* _FILESEC_REMOVE_ACL */
		/* Sentinel address 1: delete the existing ACL. */
		VATTR_SET(pva, va_acl, NULL);
		break;

	default:
		/* Copy the caller-supplied filesec/ACL in from user space. */
		if ((error = kauth_copyinfilesec(xsecurity, pxsecdst)) != 0) {
			return error;
		}

		VATTR_SET(pva, va_acl, &(*pxsecdst)->fsec_acl);
		pva->va_vaflags |= VA_FILESEC_ACL;
		KAUTH_DEBUG("CHMOD - setting ACL with %d entries", pva->va_acl->acl_entrycount);
		break;
	}

	return 0;
}
7764
7765 /*
7766 * chmod_extended: Change the mode of a file given a path name; with extended
7767 * argument list (including extended security (ACL)).
7768 *
7769 * Parameters: p Process requesting the open
7770 * uap User argument descriptor (see below)
7771 * retval (ignored)
7772 *
7773 * Indirect: uap->path Path to object (same as 'chmod')
7774 * uap->uid UID to set
7775 * uap->gid GID to set
7776 * uap->mode File mode to set (same as 'chmod')
7777 * uap->xsecurity ACL to set (or delete)
7778 *
7779 * Returns: 0 Success
7780 * !0 errno value
7781 *
7782 * Notes: The kauth_filesec_t in 'va', if any, is in host byte order.
7783 *
 * XXX: We should enumerate the possible errno values here, and where
7785 * in the code they originated.
7786 */
7787 int
chmod_extended(__unused proc_t p,struct chmod_extended_args * uap,__unused int32_t * retval)7788 chmod_extended(__unused proc_t p, struct chmod_extended_args *uap, __unused int32_t *retval)
7789 {
7790 int error;
7791 struct vnode_attr va;
7792 kauth_filesec_t xsecdst = NULL;
7793
7794 AUDIT_ARG(owner, uap->uid, uap->gid);
7795
7796 error = chmod_extended_init(&va, &xsecdst, uap->mode, uap->uid,
7797 uap->gid, uap->xsecurity);
7798
7799 if (error) {
7800 return error;
7801 }
7802
7803 error = chmodat(vfs_context_current(), uap->path, &va, AT_FDCWD, 0,
7804 UIO_USERSPACE);
7805
7806 if (xsecdst != NULL) {
7807 kauth_filesec_free(xsecdst);
7808 }
7809 return error;
7810 }
7811
7812 /*
7813 * Returns: 0 Success
7814 * chmodat:??? [anything chmodat can return]
7815 */
7816 static int
fchmodat_internal(vfs_context_t ctx,user_addr_t path,int mode,int fd,int flag,enum uio_seg segflg)7817 fchmodat_internal(vfs_context_t ctx, user_addr_t path, int mode, int fd,
7818 int flag, enum uio_seg segflg)
7819 {
7820 struct vnode_attr va;
7821
7822 VATTR_INIT(&va);
7823 VATTR_SET(&va, va_mode, mode & ALLPERMS);
7824
7825 return chmodat(ctx, path, &va, fd, flag, segflg);
7826 }
7827
7828 int
chmod(__unused proc_t p,struct chmod_args * uap,__unused int32_t * retval)7829 chmod(__unused proc_t p, struct chmod_args *uap, __unused int32_t *retval)
7830 {
7831 return fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
7832 AT_FDCWD, 0, UIO_USERSPACE);
7833 }
7834
7835 int
fchmodat(__unused proc_t p,struct fchmodat_args * uap,__unused int32_t * retval)7836 fchmodat(__unused proc_t p, struct fchmodat_args *uap, __unused int32_t *retval)
7837 {
7838 if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) {
7839 return EINVAL;
7840 }
7841
7842 return fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
7843 uap->fd, uap->flag, UIO_USERSPACE);
7844 }
7845
7846 /*
7847 * Change mode of a file given a file descriptor.
7848 */
static int
fchmod1(__unused proc_t p, int fd, struct vnode_attr *vap)
{
	vnode_t vp;
	int error;

	AUDIT_ARG(fd, fd);

	/* Map the descriptor to its vnode; file_vnode() takes an fd reference. */
	if ((error = file_vnode(fd, &vp)) != 0) {
		return error;
	}
	/* Take an iocount on the vnode before operating on it. */
	if ((error = vnode_getwithref(vp)) != 0) {
		file_drop(fd);
		return error;
	}
	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

#if CONFIG_FILE_LEASES
	/* Break any lease held on the parent directory before the change. */
	vnode_breakdirlease(vp, true, O_WRONLY);
#endif

	error = chmod_vnode(vfs_context_current(), vp, vap);
	(void)vnode_put(vp);
	file_drop(fd);

	return error;
}
7876
7877 /*
7878 * fchmod_extended: Change mode of a file given a file descriptor; with
7879 * extended argument list (including extended security (ACL)).
7880 *
7881 * Parameters: p Process requesting to change file mode
7882 * uap User argument descriptor (see below)
7883 * retval (ignored)
7884 *
7885 * Indirect: uap->mode File mode to set (same as 'chmod')
7886 * uap->uid UID to set
7887 * uap->gid GID to set
7888 * uap->xsecurity ACL to set (or delete)
7889 * uap->fd File descriptor of file to change mode
7890 *
7891 * Returns: 0 Success
7892 * !0 errno value
7893 *
7894 */
7895 int
fchmod_extended(proc_t p,struct fchmod_extended_args * uap,__unused int32_t * retval)7896 fchmod_extended(proc_t p, struct fchmod_extended_args *uap, __unused int32_t *retval)
7897 {
7898 int error;
7899 struct vnode_attr va;
7900 kauth_filesec_t xsecdst = NULL;
7901
7902 AUDIT_ARG(owner, uap->uid, uap->gid);
7903
7904 error = chmod_extended_init(&va, &xsecdst, uap->mode, uap->uid,
7905 uap->gid, uap->xsecurity);
7906
7907 if (error) {
7908 return error;
7909 }
7910
7911 error = fchmod1(p, uap->fd, &va);
7912
7913 if (xsecdst != NULL) {
7914 kauth_filesec_free(xsecdst);
7915 }
7916 return error;
7917 }
7918
7919 int
fchmod(proc_t p,struct fchmod_args * uap,__unused int32_t * retval)7920 fchmod(proc_t p, struct fchmod_args *uap, __unused int32_t *retval)
7921 {
7922 struct vnode_attr va;
7923
7924 VATTR_INIT(&va);
7925 VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
7926
7927 return fchmod1(p, uap->fd, &va);
7928 }
7929
/*
 * Core of chown()/lchown()/fchown()/fchownat(): change the owner and/or
 * group of an iocounted vnode.  A uid/gid of VNOVAL means "leave that id
 * unchanged".
 */
static int
vn_chown_internal(__unused vfs_context_t ctx, vnode_t vp, uid_t uid, gid_t gid)
{
	struct vnode_attr va;
	kauth_action_t action;
	int error;

	VATTR_INIT(&va);
	if (uid != (uid_t)VNOVAL) {
		VATTR_SET(&va, va_uid, uid);
	}
	if (gid != (gid_t)VNOVAL) {
		VATTR_SET(&va, va_gid, gid);
	}

#if NAMEDSTREAMS
	/* chown calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		error = EPERM;
		goto out;
	}
#endif

#if CONFIG_MACF
	error = mac_vnode_check_setowner(ctx, vp, uid, gid);
	if (error) {
		goto out;
	}
#endif

	/* preflight and authorize attribute changes */
	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
		goto out;
	}
	if (action && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		/*
		 * EACCES is only allowed from namei(); permissions failure should
		 * return EPERM, so we need to translate the error code.
		 */
		if (error == EACCES) {
			error = EPERM;
		}

		goto out;
	}

#if CONFIG_FILE_LEASES
	/* Break any lease on the parent directory before mutating the vnode. */
	vnode_breakdirlease(vp, true, O_WRONLY);
#endif

	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	/* Let MAC modules observe the successful ownership change. */
	if (error == 0) {
		mac_vnode_notify_setowner(ctx, vp, uid, gid);
	}
#endif

out:
	return error;
}
7991
7992 /*
7993 * Set ownership given a path name.
7994 */
7995 /* ARGSUSED */
static int
fchownat_internal(vfs_context_t ctx, int fd, user_addr_t path, uid_t uid,
    gid_t gid, int flag, enum uio_seg segflg)
{
	vnode_t vp;
	int error;
	struct nameidata nd;
	int follow;

	AUDIT_ARG(owner, uid, gid);

	/* Either "no follow" flag suppresses following a trailing symlink. */
	follow = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) ? NOFOLLOW : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1, segflg, path, ctx);
	if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
		/* Refuse to traverse symlinks anywhere in the path. */
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}

	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}

	vp = nd.ni_vp;
	error = vn_chown_internal(ctx, vp, uid, gid);

	nameidone(&nd);
	vnode_put(vp);
	return error;
}
8025
8026 int
chown(__unused proc_t p,struct chown_args * uap,__unused int32_t * retval)8027 chown(__unused proc_t p, struct chown_args *uap, __unused int32_t *retval)
8028 {
8029 return fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
8030 uap->uid, uap->gid, 0, UIO_USERSPACE);
8031 }
8032
8033 int
lchown(__unused proc_t p,struct lchown_args * uap,__unused int32_t * retval)8034 lchown(__unused proc_t p, struct lchown_args *uap, __unused int32_t *retval)
8035 {
8036 return fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
8037 uap->owner, uap->group, AT_SYMLINK_NOFOLLOW, UIO_USERSPACE);
8038 }
8039
8040 int
fchownat(__unused proc_t p,struct fchownat_args * uap,__unused int32_t * retval)8041 fchownat(__unused proc_t p, struct fchownat_args *uap, __unused int32_t *retval)
8042 {
8043 if (uap->flag & ~AT_SYMLINK_NOFOLLOW) {
8044 return EINVAL;
8045 }
8046
8047 return fchownat_internal(vfs_context_current(), uap->fd, uap->path,
8048 uap->uid, uap->gid, uap->flag, UIO_USERSPACE);
8049 }
8050
8051 /*
8052 * Set ownership given a file descriptor.
8053 */
8054 /* ARGSUSED */
int
fchown(__unused proc_t p, struct fchown_args *uap, __unused int32_t *retval)
{
	vfs_context_t ctx = vfs_context_current();
	vnode_t vp;
	int error;

	AUDIT_ARG(owner, uap->uid, uap->gid);
	AUDIT_ARG(fd, uap->fd);

	/* Map the descriptor to its vnode; file_vnode() takes an fd reference. */
	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}

	/* Take an iocount on the vnode before operating on it. */
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}
	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	error = vn_chown_internal(ctx, vp, uap->uid, uap->gid);

	(void)vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
8081
8082 static int
getutimes(user_addr_t usrtvp,struct timespec * tsp)8083 getutimes(user_addr_t usrtvp, struct timespec *tsp)
8084 {
8085 int error;
8086
8087 if (usrtvp == USER_ADDR_NULL) {
8088 struct timeval old_tv;
8089 /* XXX Y2038 bug because of microtime argument */
8090 microtime(&old_tv);
8091 TIMEVAL_TO_TIMESPEC(&old_tv, &tsp[0]);
8092 tsp[1] = tsp[0];
8093 } else {
8094 if (IS_64BIT_PROCESS(current_proc())) {
8095 struct user64_timeval tv[2];
8096 error = copyin(usrtvp, (void *)tv, sizeof(tv));
8097 if (error) {
8098 return error;
8099 }
8100 TIMEVAL64_TO_TIMESPEC(&tv[0], &tsp[0]);
8101 TIMEVAL64_TO_TIMESPEC(&tv[1], &tsp[1]);
8102 } else {
8103 struct user32_timeval tv[2];
8104 error = copyin(usrtvp, (void *)tv, sizeof(tv));
8105 if (error) {
8106 return error;
8107 }
8108 TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]);
8109 TIMEVAL_TO_TIMESPEC(&tv[1], &tsp[1]);
8110 }
8111 }
8112 return 0;
8113 }
8114
/*
 * Apply access time ts[0] and modification time ts[1] to an iocounted
 * vnode after MAC and kauth checks.  nullflag is set when the caller
 * supplied no explicit times (utimes(..., NULL)); it sets VA_UTIMES_NULL
 * and keeps EACCES from being rewritten to EPERM.
 */
static int
setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts,
    int nullflag)
{
	int error;
	struct vnode_attr va;
	kauth_action_t action;

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	VATTR_INIT(&va);
	VATTR_SET(&va, va_access_time, ts[0]);
	VATTR_SET(&va, va_modify_time, ts[1]);
	if (nullflag) {
		va.va_vaflags |= VA_UTIMES_NULL;
	}

#if NAMEDSTREAMS
	/* utimes calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		error = EPERM;
		goto out;
	}
#endif

#if CONFIG_MACF
	error = mac_vnode_check_setutimes(ctx, vp, ts[0], ts[1]);
	if (error) {
		goto out;
	}
#endif
	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
		/* Setting explicit times needs ownership: report EPERM, not EACCES. */
		if (!nullflag && error == EACCES) {
			error = EPERM;
		}
		goto out;
	}

	/* since we may not need to auth anything, check here */
	if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		if (!nullflag && error == EACCES) {
			error = EPERM;
		}
		goto out;
	}
	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	/* Let MAC modules observe the successful timestamp change. */
	if (error == 0) {
		mac_vnode_notify_setutimes(ctx, vp, ts[0], ts[1]);
	}
#endif

out:
	return error;
}
8171
8172 /*
8173 * Set the access and modification times of a file.
8174 */
8175 /* ARGSUSED */
int
utimes(__unused proc_t p, struct utimes_args *uap, __unused int32_t *retval)
{
	struct timespec ts[2];
	user_addr_t usrtvp;
	int error;
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();
	uint32_t wantparent = 0;

#if CONFIG_FILE_LEASES
	/* Ask namei for the parent vnode so its lease can be broken below. */
	wantparent = WANTPARENT;
#endif

	/*
	 * AUDIT: Needed to change the order of operations to do the
	 * name lookup first because auditing wants the path.
	 */
	NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1 | wantparent,
	    UIO_USERSPACE, uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}

	/*
	 * Fetch the user-supplied time. If usrtvp is USER_ADDR_NULL, we fetch
	 * the current time instead.
	 */
	usrtvp = uap->tptr;
	if ((error = getutimes(usrtvp, ts)) != 0) {
		goto out;
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
#endif

	error = setutimes(ctx, nd.ni_vp, ts, usrtvp == USER_ADDR_NULL);

out:
	/* The parent iocount (taken via WANTPARENT) is dropped on every path. */
#if CONFIG_FILE_LEASES
	vnode_put(nd.ni_dvp);
#endif
	nameidone(&nd);
	vnode_put(nd.ni_vp);
	return error;
}
8224
8225 /*
8226 * Set the access and modification times of a file.
8227 */
8228 /* ARGSUSED */
int
futimes(__unused proc_t p, struct futimes_args *uap, __unused int32_t *retval)
{
	struct timespec ts[2];
	vnode_t vp;
	user_addr_t usrtvp;
	int error;

	AUDIT_ARG(fd, uap->fd);
	usrtvp = uap->tptr;
	/* Resolve the requested times (or "now") before touching the file. */
	if ((error = getutimes(usrtvp, ts)) != 0) {
		return error;
	}
	/* Map the descriptor to its vnode and take an iocount on it. */
	if ((error = file_vnode(uap->fd, &vp)) != 0) {
		return error;
	}
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

#if CONFIG_FILE_LEASES
	/* Break any lease held on the parent directory before the change. */
	vnode_breakdirlease(vp, true, O_WRONLY);
#endif

	error = setutimes(vfs_context_current(), vp, ts, usrtvp == 0);

	vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
8260
8261 static int
truncate_validate_common(proc_t p,off_t length)8262 truncate_validate_common(proc_t p, off_t length)
8263 {
8264 rlim_t fsize_limit;
8265
8266 if (length < 0) {
8267 return EINVAL;
8268 }
8269
8270 fsize_limit = proc_limitgetcur(p, RLIMIT_FSIZE);
8271 if ((rlim_t)length > fsize_limit) {
8272 psignal(p, SIGXFSZ);
8273 return EFBIG;
8274 }
8275
8276 return 0;
8277 }
8278
/*
 * Truncate an iocounted vnode to `length`.  `need_auth` is false when the
 * caller holds a descriptor that was already authorized for writing at
 * open time (the ftruncate path); otherwise a full kauth check is done.
 */
static int
truncate_internal(vnode_t vp, off_t length, kauth_cred_t cred,
    vfs_context_t ctx, boolean_t need_auth)
{
	struct vnode_attr va;
	kauth_action_t action;
	int error;

	VATTR_INIT(&va);
	VATTR_SET(&va, va_data_size, length);

#if CONFIG_MACF
	error = mac_vnode_check_truncate(ctx, cred, vp);
	if (error) {
		return error;
	}
#endif

	/*
	 * If we reached here from `ftruncate` then we already did an effective
	 * `vnode_authorize` upon open. We honour the result from then.
	 */
	if (need_auth) {
		if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
			return error;
		}

		if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
			return error;
		}
	}

#if CONFIG_FILE_LEASES
	/* Check if there is a lease placed on the parent directory. */
	vnode_breakdirlease(vp, true, O_WRONLY);

	/* Now check if there is a lease placed on the file itself. */
	(void)vnode_breaklease(vp, O_WRONLY, ctx);
#endif

	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	/* Let MAC modules observe the successful truncate. */
	if (error == 0) {
		mac_vnode_notify_truncate(ctx, cred, vp);
	}
#endif

	return error;
}
8329
8330 /*
8331 * Truncate a file given its path name.
8332 */
8333 /* ARGSUSED */
int
truncate(proc_t p, struct truncate_args *uap, __unused int32_t *retval)
{
	vfs_context_t ctx = vfs_context_current();
	vnode_t vp;
	int error;
	struct nameidata nd;

	/* Validate length against RLIMIT_FSIZE before doing the lookup. */
	if ((error = truncate_validate_common(p, uap->length))) {
		return error;
	}

	NDINIT(&nd, LOOKUP, OP_TRUNCATE, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);

	if ((error = namei(&nd))) {
		return error;
	}

	vp = nd.ni_vp;
	nameidone(&nd);

	/* Path-based truncate: full authorization required (need_auth = true). */
	error = truncate_internal(vp, uap->length, NOCRED, ctx, true);
	vnode_put(vp);

	return error;
}
8361
8362 /*
8363 * Truncate a file given a file descriptor.
8364 */
8365 /* ARGSUSED */
int
ftruncate(proc_t p, struct ftruncate_args *uap, int32_t *retval)
{
	vnode_t vp;
	struct fileproc *fp;
	int error;

	AUDIT_ARG(fd, uap->fd);

	/* Validate length against RLIMIT_FSIZE before touching the file. */
	if ((error = truncate_validate_common(p, uap->length))) {
		return error;
	}

	if ((error = fp_lookup(p, uap->fd, &fp, 0))) {
		return error;
	}

	/* Dispatch on the descriptor type: vnodes and POSIX shm only. */
	switch (FILEGLOB_DTYPE(fp->fp_glob)) {
	case DTYPE_PSXSHM:
		/* POSIX shared memory objects have their own truncate path. */
		error = pshm_truncate(p, fp, uap->fd, uap->length, retval);
		goto out;
	case DTYPE_VNODE:
		break;
	default:
		error = EINVAL;
		goto out;
	}

	vp = (vnode_t)fp_get_data(fp);

	/* The descriptor must have been opened for writing. */
	if ((fp->fp_glob->fg_flag & FWRITE) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EINVAL;
		goto out;
	}

	if ((error = vnode_getwithref(vp)) != 0) {
		goto out;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	/* need_auth == false: write access was authorized at open time. */
	error = truncate_internal(vp, uap->length, fp->fp_glob->fg_cred,
	    vfs_context_current(), false);
	vnode_put(vp);

out:
	file_drop(uap->fd);
	return error;
}
8416
8417
8418 /*
8419 * Sync an open file with synchronized I/O _file_ integrity completion
8420 */
8421 /* ARGSUSED */
int
fsync(proc_t p, struct fsync_args *uap, __unused int32_t *retval)
{
	/* fsync(2) is a pthread cancellation point; MNT_WAIT = file integrity. */
	__pthread_testcancel(1);
	return fsync_common(p, uap, MNT_WAIT);
}
8428
8429
8430 /*
8431 * Sync an open file with synchronized I/O _file_ integrity completion
8432 *
8433 * Notes: This is a legacy support function that does not test for
8434 * thread cancellation points.
8435 */
8436 /* ARGSUSED */
int
fsync_nocancel(proc_t p, struct fsync_nocancel_args *uap, __unused int32_t *retval)
{
	/* Same as fsync(2) but without the cancellation-point test. */
	return fsync_common(p, (struct fsync_args *)uap, MNT_WAIT);
}
8442
8443
8444 /*
8445 * Sync an open file with synchronized I/O _data_ integrity completion
8446 */
8447 /* ARGSUSED */
int
fdatasync(proc_t p, struct fdatasync_args *uap, __unused int32_t *retval)
{
	/* Cancellation point; MNT_DWAIT requests data-integrity-only sync. */
	__pthread_testcancel(1);
	return fsync_common(p, (struct fsync_args *)uap, MNT_DWAIT);
}
8454
8455
8456 /*
8457 * fsync_common
8458 *
8459 * Common fsync code to support both synchronized I/O file integrity completion
8460 * (normal fsync) and synchronized I/O data integrity completion (fdatasync).
8461 *
8462 * If 'flags' is MNT_DWAIT, the caller is requesting data integrity, which
8463 * will only guarantee that the file data contents are retrievable. If
 * 'flags' is MNT_WAIT, the caller is requesting file integrity, which also
8465 * includes additional metadata unnecessary for retrieving the file data
8466 * contents, such as atime, mtime, ctime, etc., also be committed to stable
8467 * storage.
8468 *
8469 * Parameters: p The process
8470 * uap->fd The descriptor to synchronize
8471 * flags The data integrity flags
8472 *
8473 * Returns: int Success
8474 * fp_getfvp:EBADF Bad file descriptor
8475 * fp_getfvp:ENOTSUP fd does not refer to a vnode
8476 * VNOP_FSYNC:??? unspecified
8477 *
8478 * Notes: We use struct fsync_args because it is a short name, and all
8479 * caller argument structures are otherwise identical.
8480 */
static int
fsync_common(proc_t p, struct fsync_args *uap, int flags)
{
	vnode_t vp;
	struct fileproc *fp;
	vfs_context_t ctx = vfs_context_current();
	int error;

	AUDIT_ARG(fd, uap->fd);

	/* Resolve the descriptor to its fileproc/vnode pair. */
	if ((error = fp_getfvp(p, uap->fd, &fp, &vp))) {
		return error;
	}
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	/* flags is MNT_WAIT (file integrity) or MNT_DWAIT (data integrity). */
	error = VNOP_FSYNC(vp, flags, ctx);

#if NAMEDRSRCFORK
	/* Sync resource fork shadow file if necessary. */
	if ((error == 0) &&
	    (vp->v_flag & VISNAMEDSTREAM) &&
	    (vp->v_parent != NULLVP) &&
	    vnode_isshadow(vp) &&
	    (fp->fp_glob->fg_flag & FWASWRITTEN)) {
		(void) vnode_flushnamedstream(vp->v_parent, vp, ctx);
	}
#endif

	(void)vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
8518
8519 /*
8520 * Duplicate files. Source must be a file, target must be a file or
8521 * must not exist.
8522 *
8523 * XXX Copyfile authorisation checking is woefully inadequate, and will not
8524 * perform inheritance correctly.
8525 */
8526 /* ARGSUSED */
int
copyfile(__unused proc_t p, struct copyfile_args *uap, __unused int32_t *retval)
{
	vnode_t tvp, fvp, tdvp, sdvp;
	struct nameidata fromnd, tond;
	int error;
	vfs_context_t ctx = vfs_context_current();

	/* Check that the flags are valid. */
	if (uap->flags & ~CPF_MASK) {
		return EINVAL;
	}

	/* Look up the source. */
	NDINIT(&fromnd, LOOKUP, OP_COPYFILE, AUDITVNPATH1,
	    UIO_USERSPACE, uap->from, ctx);
	if ((error = namei(&fromnd))) {
		return error;
	}
	fvp = fromnd.ni_vp;

	/* Look up the target for creation; SAVESTART keeps ni_startdir. */
	NDINIT(&tond, CREATE, OP_LINK,
	    LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART | AUDITVNPATH2 | CN_NBMOUNTLOOK,
	    UIO_USERSPACE, uap->to, ctx);
	if ((error = namei(&tond))) {
		goto out1;
	}
	tdvp = tond.ni_dvp;
	tvp = tond.ni_vp;

	/* An existing target may only be replaced with CPF_OVERWRITE. */
	if (tvp != NULL) {
		if (!(uap->flags & CPF_OVERWRITE)) {
			error = EEXIST;
			goto out;
		}
	}

	/* Directories cannot be copied with this interface. */
	if (fvp->v_type == VDIR || (tvp && tvp->v_type == VDIR)) {
		error = EISDIR;
		goto out;
	}

	/* Sockets are unsupported unless backed by fdesc. */
	if (fvp->v_type == VSOCK && fvp->v_tag != VT_FDESC) {
		error = EOPNOTSUPP;
		goto out;
	}

#if CONFIG_MACF
	if ((error = mac_vnode_check_copyfile(ctx, tdvp, tvp, fvp, &tond.ni_cnd, (mode_t)uap->mode, uap->flags)) != 0) {
		goto out;
	}
#endif /* CONFIG_MACF */

	/* Need read on the source, delete on an existing target, add on the dir. */
	if ((error = vnode_authorize(fvp, NULL, KAUTH_VNODE_READ_DATA, ctx)) != 0) {
		goto out;
	}
	if (tvp) {
		if ((error = vnode_authorize(tvp, tdvp, KAUTH_VNODE_DELETE, ctx)) != 0) {
			goto out;
		}
	}
	if ((error = vnode_authorize(tdvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
		goto out;
	}

	/* Source must not be the target's parent directory. */
	if (fvp == tdvp) {
		error = EINVAL;
	}
	/*
	 * If source is the same as the destination (that is the
	 * same inode number) then there is nothing to do.
	 * (fixed to have POSIX semantics - CSM 3/2/98)
	 */
	if (fvp == tvp) {
		error = -1;     /* sentinel, converted to success below */
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(tdvp, false, O_WRONLY);
#endif

	if (!error) {
		error = VNOP_COPYFILE(fvp, tdvp, tvp, &tond.ni_cnd, uap->mode, uap->flags, ctx);
	}
out:
	sdvp = tond.ni_startdir;
	/*
	 * nameidone has to happen before we vnode_put(tdvp)
	 * since it may need to release the fs_nodelock on the tdvp
	 */
	nameidone(&tond);

	if (tvp) {
		vnode_put(tvp);
	}
	vnode_put(tdvp);
	vnode_put(sdvp);
out1:
	vnode_put(fvp);

	nameidone(&fromnd);

	/* -1 means "source == target": nothing to do, report success. */
	if (error == -1) {
		return 0;
	}
	return error;
}
8633
8634 #define CLONE_SNAPSHOT_FALLBACKS_ENABLED 1
8635
8636 /*
8637 * Helper function for doing clones. The caller is expected to provide an
8638 * iocounted source vnode and release it.
8639 */
static int
clonefile_internal(vnode_t fvp, boolean_t data_read_authorised, int dst_dirfd,
    user_addr_t dst, uint32_t flags, vfs_context_t ctx)
{
	vnode_t tvp, tdvp;
	struct nameidata *tondp = NULL;
	int error;
	int follow;
	boolean_t free_src_acl;
	boolean_t attr_cleanup;
	enum vtype v_type;
	kauth_action_t action;
	struct componentname *cnp;
	uint32_t defaulted = 0;
	/* Heap-allocated pair of attr lists: va[0] = source, va[1] = clone. */
	struct {
		struct vnode_attr va[2];
	} *va2p = NULL;
	struct vnode_attr *vap = NULL;
	struct vnode_attr *nvap = NULL;
	uint32_t vnop_flags;

	/* Only regular files, symlinks, and non-root/non-mount dirs clone. */
	v_type = vnode_vtype(fvp);
	switch (v_type) {
	case VLNK:
	/* FALLTHRU */
	case VREG:
		action = KAUTH_VNODE_ADD_FILE;
		break;
	case VDIR:
		if (vnode_isvroot(fvp) || vnode_ismount(fvp) ||
		    fvp->v_mountedhere) {
			return EINVAL;
		}
		action = KAUTH_VNODE_ADD_SUBDIRECTORY;
		break;
	default:
		return EINVAL;
	}

	AUDIT_ARG(fd2, dst_dirfd);
	AUDIT_ARG(value32, flags);

	/* Look up the destination name for creation; keep the parent vnode. */
	tondp = kalloc_type(struct nameidata, Z_WAITOK | Z_ZERO | Z_NOFAIL);
	follow = (flags & CLONE_NOFOLLOW) ? NOFOLLOW : FOLLOW;
	NDINIT(tondp, CREATE, OP_LINK, follow | WANTPARENT | AUDITVNPATH2,
	    UIO_USERSPACE, dst, ctx);
	if (flags & CLONE_NOFOLLOW_ANY) {
		tondp->ni_flag |= NAMEI_NOFOLLOW_ANY;
	}

	if ((error = nameiat(tondp, dst_dirfd))) {
		kfree_type(struct nameidata, tondp);
		return error;
	}
	cnp = &tondp->ni_cnd;
	tdvp = tondp->ni_dvp;
	tvp = tondp->ni_vp;

	free_src_acl = FALSE;
	attr_cleanup = FALSE;

	/* The destination must not already exist. */
	if (tvp != NULL) {
		error = EEXIST;
		goto out;
	}

	/* Clones never cross mounts. */
	if (vnode_mount(tdvp) != vnode_mount(fvp)) {
		error = EXDEV;
		goto out;
	}

#if CONFIG_MACF
	if ((error = mac_vnode_check_clone(ctx, tdvp, fvp, cnp))) {
		goto out;
	}
#endif
	/* Authorize adding the new entry to the destination directory. */
	if ((error = vnode_authorize(tdvp, NULL, action, ctx))) {
		goto out;
	}

	action = KAUTH_VNODE_GENERIC_READ_BITS;
	if (data_read_authorised) {
		/* Caller already authorized reading the source's data. */
		action &= ~KAUTH_VNODE_READ_DATA;
	}
	if ((error = vnode_authorize(fvp, NULL, action, ctx))) {
		goto out;
	}

	va2p = kalloc_type(typeof(*va2p), Z_WAITOK | Z_NOFAIL);
	vap = &va2p->va[0];
	nvap = &va2p->va[1];

	/*
	 * certain attributes may need to be changed from the source, we ask for
	 * those here with the exception of source file's ACLs unless the CLONE_ACL
	 * flag is specified. By default, the clone file will inherit the target
	 * directory's ACLs unless the CLONE_ACL flag is specified then it
	 * will inherit the source file's ACLs instead.
	 */
	VATTR_INIT(vap);
	VATTR_WANTED(vap, va_uid);
	VATTR_WANTED(vap, va_gid);
	VATTR_WANTED(vap, va_mode);
	VATTR_WANTED(vap, va_flags);
	if (flags & CLONE_ACL) {
		VATTR_WANTED(vap, va_acl);
	}

	if ((error = vnode_getattr(fvp, vap, ctx)) != 0) {
		goto out;
	}

	VATTR_INIT(nvap);
	VATTR_SET(nvap, va_type, v_type);
	if (VATTR_IS_SUPPORTED(vap, va_acl) && vap->va_acl != NULL) {
		/* Source ACL was returned by getattr; it must be freed later. */
		VATTR_SET(nvap, va_acl, vap->va_acl);
		free_src_acl = TRUE;
	}

	/* Handle ACL inheritance, initialize vap. */
	if (v_type == VLNK) {
		/*
		 * NOTE(review): a vnode_authattr_new() failure is not checked
		 * here before proceeding; `error` is overwritten by
		 * VNOP_CLONEFILE below — confirm this is intentional.
		 */
		error = vnode_authattr_new(tdvp, nvap, 0, ctx);
	} else {
		error = vn_attribute_prepare(tdvp, nvap, &defaulted, ctx);
		if (error) {
			goto out;
		}
		attr_cleanup = TRUE;
	}

	vnop_flags = VNODE_CLONEFILE_DEFAULT;
	/*
	 * We've got initial values for all security parameters,
	 * If we are superuser, then we can change owners to be the
	 * same as the source. Both superuser and the owner have default
	 * WRITE_SECURITY privileges so all other fields can be taken
	 * from source as well.
	 */
	if (!(flags & CLONE_NOOWNERCOPY) && vfs_context_issuser(ctx)) {
		if (VATTR_IS_SUPPORTED(vap, va_uid)) {
			VATTR_SET(nvap, va_uid, vap->va_uid);
		}
		if (VATTR_IS_SUPPORTED(vap, va_gid)) {
			VATTR_SET(nvap, va_gid, vap->va_gid);
		}
	} else {
		vnop_flags |= VNODE_CLONEFILE_NOOWNERCOPY;
	}

	if (VATTR_IS_SUPPORTED(vap, va_mode)) {
		VATTR_SET(nvap, va_mode, vap->va_mode);
	}
	if (VATTR_IS_SUPPORTED(vap, va_flags)) {
		/* Keep inherited DATAVAULT/RESTRICTED bits, not the source's. */
		VATTR_SET(nvap, va_flags,
		    ((vap->va_flags & ~(UF_DATAVAULT | SF_RESTRICTED)) | /* Turn off from source */
		    (nvap->va_flags & (UF_DATAVAULT | SF_RESTRICTED))));
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(tdvp, false, O_WRONLY);
#endif

	error = VNOP_CLONEFILE(fvp, tdvp, &tvp, cnp, nvap, vnop_flags, ctx);

	if (!error && tvp) {
		int update_flags = 0;
#if CONFIG_FSE
		int fsevent;
#endif /* CONFIG_FSE */

		/*
		 * If some of the requested attributes weren't handled by the
		 * VNOP, use our fallback code.
		 */
		if (!VATTR_ALL_SUPPORTED(nvap)) {
			(void)vnode_setattr_fallback(tvp, nvap, ctx);
		}

#if CONFIG_MACF
		(void)vnode_label(vnode_mount(tvp), tdvp, tvp, cnp,
		    VNODE_LABEL_CREATE, ctx);
#endif

		// Make sure the name & parent pointers are hooked up
		if (tvp->v_name == NULL) {
			update_flags |= VNODE_UPDATE_NAME;
		}
		if (tvp->v_parent == NULLVP) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		if (update_flags) {
			(void)vnode_update_identity(tvp, tdvp, cnp->cn_nameptr,
			    cnp->cn_namelen, cnp->cn_hash, update_flags);
		}

#if CONFIG_FSE
		switch (vnode_vtype(tvp)) {
		case VLNK:
		/* FALLTHRU */
		case VREG:
			fsevent = FSE_CREATE_FILE;
			break;
		case VDIR:
			fsevent = FSE_CREATE_DIR;
			break;
		default:
			goto out;
		}

		if (need_fsevent(fsevent, tvp)) {
			/*
			 * The following is a sequence of three explicit events.
			 * A pair of FSE_CLONE events representing the source and destination
			 * followed by an FSE_CREATE_[FILE | DIR] for the destination.
			 * fseventsd may coalesce the destination clone and create events
			 * into a single event resulting in the following sequence for a client
			 * FSE_CLONE (src)
			 * FSE_CLONE | FSE_CREATE (dst)
			 */
			add_fsevent(FSE_CLONE, ctx, FSE_ARG_VNODE, fvp, FSE_ARG_VNODE, tvp,
			    FSE_ARG_DONE);
			add_fsevent(fsevent, ctx, FSE_ARG_VNODE, tvp,
			    FSE_ARG_DONE);
		}
#endif /* CONFIG_FSE */
	}

out:
	/* Common cleanup: attrs, source ACL, attr pair, namei state, iocounts. */
	if (attr_cleanup) {
		vn_attribute_cleanup(nvap, defaulted);
	}
	if (free_src_acl && vap->va_acl) {
		kauth_acl_free(vap->va_acl);
	}
	if (va2p) {
		kfree_type(typeof(*va2p), va2p);
	}
	nameidone(tondp);
	kfree_type(struct nameidata, tondp);
	if (tvp) {
		vnode_put(tvp);
	}
	vnode_put(tdvp);
	return error;
}
8886
8887 /*
8888 * clone files or directories, target must not exist.
8889 */
8890 /* ARGSUSED */
8891 int
clonefileat(__unused proc_t p,struct clonefileat_args * uap,__unused int32_t * retval)8892 clonefileat(__unused proc_t p, struct clonefileat_args *uap,
8893 __unused int32_t *retval)
8894 {
8895 vnode_t fvp;
8896 struct nameidata *ndp = NULL;
8897 int follow;
8898 int error;
8899 vfs_context_t ctx = vfs_context_current();
8900
8901 /* Check that the flags are valid. */
8902 if (uap->flags & ~(CLONE_NOFOLLOW | CLONE_NOOWNERCOPY | CLONE_ACL |
8903 CLONE_NOFOLLOW_ANY)) {
8904 return EINVAL;
8905 }
8906
8907 AUDIT_ARG(fd, uap->src_dirfd);
8908
8909 ndp = kalloc_type(struct nameidata, Z_WAITOK | Z_ZERO | Z_NOFAIL);
8910
8911 follow = (uap->flags & CLONE_NOFOLLOW) ? NOFOLLOW : FOLLOW;
8912 NDINIT(ndp, LOOKUP, OP_COPYFILE, follow | AUDITVNPATH1,
8913 UIO_USERSPACE, uap->src, ctx);
8914 if (uap->flags & CLONE_NOFOLLOW_ANY) {
8915 ndp->ni_flag |= NAMEI_NOFOLLOW_ANY;
8916 }
8917
8918 if ((error = nameiat(ndp, uap->src_dirfd))) {
8919 kfree_type(struct nameidata, ndp);
8920 return error;
8921 }
8922
8923 fvp = ndp->ni_vp;
8924 nameidone(ndp);
8925 kfree_type(struct nameidata, ndp);
8926
8927 error = clonefile_internal(fvp, FALSE, uap->dst_dirfd, uap->dst,
8928 uap->flags, ctx);
8929
8930 vnode_put(fvp);
8931 return error;
8932 }
8933
8934 int
fclonefileat(__unused proc_t p,struct fclonefileat_args * uap,__unused int32_t * retval)8935 fclonefileat(__unused proc_t p, struct fclonefileat_args *uap,
8936 __unused int32_t *retval)
8937 {
8938 vnode_t fvp;
8939 struct fileproc *fp;
8940 int error;
8941 vfs_context_t ctx = vfs_context_current();
8942
8943 /* Check that the flags are valid. */
8944 if (uap->flags & ~(CLONE_NOFOLLOW | CLONE_NOOWNERCOPY | CLONE_ACL |
8945 CLONE_NOFOLLOW_ANY)) {
8946 return EINVAL;
8947 }
8948
8949 AUDIT_ARG(fd, uap->src_fd);
8950 error = fp_getfvp(p, uap->src_fd, &fp, &fvp);
8951 if (error) {
8952 return error;
8953 }
8954
8955 if ((fp->fp_glob->fg_flag & FREAD) == 0) {
8956 AUDIT_ARG(vnpath_withref, fvp, ARG_VNODE1);
8957 error = EBADF;
8958 goto out;
8959 }
8960
8961 if ((error = vnode_getwithref(fvp))) {
8962 goto out;
8963 }
8964
8965 AUDIT_ARG(vnpath, fvp, ARG_VNODE1);
8966
8967 error = clonefile_internal(fvp, TRUE, uap->dst_dirfd, uap->dst,
8968 uap->flags, ctx);
8969
8970 vnode_put(fvp);
8971 out:
8972 file_drop(uap->src_fd);
8973 return error;
8974 }
8975
8976 static int
rename_submounts_callback(mount_t mp,void * arg)8977 rename_submounts_callback(mount_t mp, void *arg)
8978 {
8979 int error = 0;
8980 mount_t pmp = (mount_t)arg;
8981 int prefix_len = (int)strlen(pmp->mnt_vfsstat.f_mntonname);
8982
8983 if (strncmp(mp->mnt_vfsstat.f_mntonname, pmp->mnt_vfsstat.f_mntonname, prefix_len) != 0) {
8984 return 0;
8985 }
8986
8987 if (mp->mnt_vfsstat.f_mntonname[prefix_len] != '/') {
8988 return 0;
8989 }
8990
8991 if ((error = vfs_busy(mp, LK_NOWAIT))) {
8992 printf("vfs_busy failed with %d for %s\n", error, mp->mnt_vfsstat.f_mntonname);
8993 return -1;
8994 }
8995
8996 size_t pathlen = MAXPATHLEN;
8997 if ((error = vn_getpath_ext(mp->mnt_vnodecovered, NULL, mp->mnt_vfsstat.f_mntonname, &pathlen, VN_GETPATH_FSENTER))) {
8998 printf("vn_getpath_ext failed with %d for mnt_vnodecovered of %s\n", error, mp->mnt_vfsstat.f_mntonname);
8999 }
9000
9001 vfs_unbusy(mp);
9002
9003 return error;
9004 }
9005
9006 /*
9007 * Rename files. Source and destination must either both be directories,
9008 * or both not be directories. If target is a directory, it must be empty.
9009 */
9010 /* ARGSUSED */
/*
 * Guts of rename(2) / renameat(2) / renameatx_np(2).
 *
 * Looks up both source and destination, authorizes the operation,
 * serializes tree-reshaping renames against other renames on the same
 * mount, issues the rename VNOP, and emits fsevents / kauth fileop
 * notifications.  The whole operation may be re-driven from the top
 * (see "retry:") when a race, ERECYCLE, or dataless-file
 * materialization is detected, and individual lookups may be continued
 * (see "continue_lookup:") when a compound VNOP returns EKEEPLOOKING.
 *
 * 'uflags' carries the RENAME_* flags from userspace; only the bits in
 * VFS_RENAME_FLAGS_MASK are forwarded to the file system.
 */
static int
renameat_internal(vfs_context_t ctx, int fromfd, user_addr_t from,
    int tofd, user_addr_t to, int segflg, u_int uflags)
{
	vnode_t tvp, tdvp;
	vnode_t fvp, fdvp;
	vnode_t mnt_fvp;
	struct nameidata *fromnd, *tond;
	int error = 0;
	int do_retry;
	int retry_count;
	int mntrename;
	int need_event;
	int need_kpath2;
	int has_listeners;
	const char *oname = NULL;
	char *from_name = NULL, *to_name = NULL;
	char *from_name_no_firmlink = NULL, *to_name_no_firmlink = NULL;
	int from_len = 0, to_len = 0;
	int from_len_no_firmlink = 0, to_len_no_firmlink = 0;
	int holding_mntlock;
	int vn_authorize_skipped;
	mount_t locked_mp = NULL;
	vnode_t oparent = NULLVP;
#if CONFIG_FSE
	fse_info from_finfo = {}, to_finfo;
#endif
	int from_truncated = 0, to_truncated = 0;
	int from_truncated_no_firmlink = 0, to_truncated_no_firmlink = 0;
	int batched = 0;
	struct vnode_attr *fvap, *tvap;
	int continuing = 0;
	vfs_rename_flags_t flags = uflags & VFS_RENAME_FLAGS_MASK;
	int32_t nofollow_any = 0;
	/* carving out a chunk for structs that are too big to be on stack. */
	struct {
		struct nameidata from_node, to_node;
		struct vnode_attr fv_attr, tv_attr;
	} * __rename_data;

	__rename_data = kalloc_type(typeof(*__rename_data), Z_WAITOK);
	fromnd = &__rename_data->from_node;
	tond = &__rename_data->to_node;

	holding_mntlock = 0;
	do_retry = 0;
	retry_count = 0;
	/*
	 * Re-entry point: all per-attempt state is reset here so the whole
	 * operation can be re-driven after a detected race.
	 */
retry:
	fvp = tvp = NULL;
	fdvp = tdvp = NULL;
	fvap = tvap = NULL;
	mnt_fvp = NULLVP;
	mntrename = FALSE;
	vn_authorize_skipped = FALSE;

	if (uflags & RENAME_NOFOLLOW_ANY) {
		nofollow_any = NAMEI_NOFOLLOW_ANY;
	}
	NDINIT(fromnd, DELETE, OP_UNLINK, WANTPARENT | AUDITVNPATH1,
	    segflg, from, ctx);
	fromnd->ni_flag = NAMEI_COMPOUNDRENAME | nofollow_any;

	NDINIT(tond, RENAME, OP_RENAME, WANTPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK,
	    segflg, to, ctx);
	tond->ni_flag = NAMEI_COMPOUNDRENAME | nofollow_any;

	/*
	 * Re-entry point when a compound VNOP returned EKEEPLOOKING; only the
	 * nameidata marked NAMEI_CONTLOOKUP is re-looked-up.
	 */
continue_lookup:
	if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
		if ((error = nameiat(fromnd, fromfd))) {
			goto out1;
		}
		fdvp = fromnd->ni_dvp;
		fvp = fromnd->ni_vp;

		if (fvp && fvp->v_type == VDIR) {
			tond->ni_cnd.cn_flags |= WILLBEDIR;
		}
	}

	if ((tond->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
		if ((error = nameiat(tond, tofd))) {
			/*
			 * Translate error code for rename("dir1", "dir2/.").
			 */
			if (error == EISDIR && fvp->v_type == VDIR) {
				error = EINVAL;
			}
			goto out1;
		}
		tdvp = tond->ni_dvp;
		tvp = tond->ni_vp;
	}

#if DEVELOPMENT || DEBUG
	/*
	 * XXX VSWAP: Check for entitlements or special flag here
	 * so we can restrict access appropriately.
	 */
#else /* DEVELOPMENT || DEBUG */

	/* Only the kernel context may rename swap files. */
	if (fromnd->ni_vp && vnode_isswap(fromnd->ni_vp) && (ctx != vfs_context_kernel())) {
		error = EPERM;
		goto out1;
	}

	if (tond->ni_vp && vnode_isswap(tond->ni_vp) && (ctx != vfs_context_kernel())) {
		error = EPERM;
		goto out1;
	}
#endif /* DEVELOPMENT || DEBUG */

	/* RENAME_SWAP requires an existing destination to swap with. */
	if (!tvp && ISSET(flags, VFS_RENAME_SWAP)) {
		error = ENOENT;
		goto out1;
	}

	if (tvp && ISSET(flags, VFS_RENAME_EXCL)) {
		int32_t pval = 0;
		int err = 0;

		/*
		 * We allow rename with VFS_RENAME_EXCL flag for an existing file which
		 * has the same name as target iff the following conditions are met:
		 * 1. the target file system is case insensitive
		 * 2. source and target directories are the same
		 * 3. source and target files are the same
		 * 4. name only differs in case (determined by underlying filesystem)
		 */
		if (fvp != tvp || fdvp != tdvp) {
			error = EEXIST;
			goto out1;
		}

		/*
		 * Assume that the target file system is case sensitive if
		 * _PC_CASE_SENSITIVE selector isn't supported.
		 */
		err = VNOP_PATHCONF(tvp, _PC_CASE_SENSITIVE, &pval, ctx);
		if (err != 0 || pval != 0) {
			error = EEXIST;
			goto out1;
		}
	}

	/* True when the FS can perform lookup+rename as one compound VNOP. */
	batched = vnode_compound_rename_available(fdvp);

#if CONFIG_FSE
	need_event = need_fsevent(FSE_RENAME, fdvp);
	if (need_event) {
		if (fvp) {
			get_fse_info(fvp, &from_finfo, ctx);
		} else {
			error = vfs_get_notify_attributes(&__rename_data->fv_attr);
			if (error) {
				goto out1;
			}

			fvap = &__rename_data->fv_attr;
		}

		if (tvp) {
			get_fse_info(tvp, &to_finfo, ctx);
		} else if (batched) {
			error = vfs_get_notify_attributes(&__rename_data->tv_attr);
			if (error) {
				goto out1;
			}

			tvap = &__rename_data->tv_attr;
		}
	}
#else
	need_event = 0;
#endif /* CONFIG_FSE */

	has_listeners = kauth_authorize_fileop_has_listeners();

	need_kpath2 = 0;
#if CONFIG_AUDIT
	if (AUDIT_RECORD_EXISTS()) {
		need_kpath2 = 1;
	}
#endif

	/* Build full source paths only if someone will consume them. */
	if (need_event || has_listeners) {
		if (from_name == NULL) {
			GET_PATH(from_name);
		}

		from_len = safe_getpath(fdvp, fromnd->ni_cnd.cn_nameptr, from_name, MAXPATHLEN, &from_truncated);

		if (from_name_no_firmlink == NULL) {
			GET_PATH(from_name_no_firmlink);
		}

		from_len_no_firmlink = safe_getpath_no_firmlink(fdvp, fromnd->ni_cnd.cn_nameptr, from_name_no_firmlink, MAXPATHLEN, &from_truncated_no_firmlink);
	}

	if (need_event || need_kpath2 || has_listeners) {
		if (to_name == NULL) {
			GET_PATH(to_name);
		}

		to_len = safe_getpath(tdvp, tond->ni_cnd.cn_nameptr, to_name, MAXPATHLEN, &to_truncated);

		if (to_name_no_firmlink == NULL) {
			GET_PATH(to_name_no_firmlink);
		}

		to_len_no_firmlink = safe_getpath_no_firmlink(tdvp, tond->ni_cnd.cn_nameptr, to_name_no_firmlink, MAXPATHLEN, &to_truncated_no_firmlink);
		if (to_name && need_kpath2) {
			AUDIT_ARG(kpath, to_name, ARG_KPATH2);
		}
	}
	if (!fvp) {
		/*
		 * Claim: this check will never reject a valid rename.
		 * For success, either fvp must be on the same mount as tdvp, or fvp must sit atop a vnode on the same mount as tdvp.
		 * Suppose fdvp and tdvp are not on the same mount.
		 * If fvp is on the same mount as tdvp, then fvp is not on the same mount as fdvp, so fvp is the root of its filesystem. If fvp is the root,
		 * then you can't move it to within another dir on the same mountpoint.
		 * If fvp sits atop a vnode on the same mount as fdvp, then that vnode must be part of the same mount as fdvp, which is a contradiction.
		 *
		 * If this check passes, then we are safe to pass these vnodes to the same FS.
		 */
		if (fdvp->v_mount != tdvp->v_mount) {
			error = EXDEV;
			goto out1;
		}
		goto skipped_lookup;
	}

	/*
	 * If the source and destination are the same (i.e. they're
	 * links to the same vnode) and the target file system is
	 * case sensitive, then there is nothing to do.
	 *
	 * XXX Come back to this.
	 */
	if (fvp == tvp) {
		int pathconf_val;

		/*
		 * Note: if _PC_CASE_SENSITIVE selector isn't supported,
		 * then assume that this file system is case sensitive.
		 */
		if (VNOP_PATHCONF(fvp, _PC_CASE_SENSITIVE, &pathconf_val, ctx) != 0 ||
		    pathconf_val != 0) {
			vn_authorize_skipped = TRUE;
			goto out1;
		}
	}

	/*
	 * Allow the renaming of mount points.
	 * - target must not exist
	 * - target must reside in the same directory as source
	 * - union mounts cannot be renamed
	 * - the root fs, and tightly-linked system volumes, cannot be renamed
	 *
	 * XXX Handle this in VFS after a continued lookup (if we missed
	 * in the cache to start off)
	 *
	 * N.B. If RENAME_SWAP is being used, then @tvp != NULL and so
	 * we'll skip past here. The file system is responsible for
	 * checking that @tvp is not a descendent of @fvp and vice versa
	 * so it should always return EINVAL if either @tvp or @fvp is the
	 * root of a volume.
	 */
	if ((fvp->v_flag & VROOT) &&
	    (fvp->v_type == VDIR) &&
	    (tvp == NULL) &&
	    (fvp->v_mountedhere == NULL) &&
	    (fdvp == tdvp) &&
	    ((fvp->v_mount->mnt_flag & (MNT_UNION | MNT_ROOTFS)) == 0) &&
	    ((fvp->v_mount->mnt_kern_flag & MNTK_SYSTEM) == 0) &&
	    (fvp->v_mount->mnt_vnodecovered != NULLVP)) {
		vnode_t coveredvp;

		/* switch fvp to the covered vnode */
		coveredvp = fvp->v_mount->mnt_vnodecovered;
		if ((vnode_getwithref(coveredvp))) {
			error = ENOENT;
			goto out1;
		}
		/*
		 * Save the 'fvp' as it is needed for vn_authorize_renamex_with_paths()
		 * later.
		 */
		mnt_fvp = fvp;

		fvp = coveredvp;
		mntrename = TRUE;
	}
	/*
	 * Check for cross-device rename.
	 * For rename on mountpoint, we want to also check the source and its parent
	 * belong to the same mountpoint.
	 */
	if ((fvp->v_mount != tdvp->v_mount) ||
	    (fvp->v_mount != fdvp->v_mount) ||
	    (tvp && (fvp->v_mount != tvp->v_mount))) {
		error = EXDEV;
		goto out1;
	}

	/*
	 * If source is the same as the destination (that is the
	 * same inode number) then there is nothing to do...
	 * EXCEPT if the underlying file system supports case
	 * insensitivity and is case preserving. In this case
	 * the file system needs to handle the special case of
	 * getting the same vnode as target (fvp) and source (tvp).
	 *
	 * Only file systems that support pathconf selectors _PC_CASE_SENSITIVE
	 * and _PC_CASE_PRESERVING can have this exception, and they need to
	 * handle the special case of getting the same vnode as target and
	 * source. NOTE: Then the target is unlocked going into vnop_rename,
	 * so not to cause locking problems. There is a single reference on tvp.
	 *
	 * NOTE - that fvp == tvp also occurs if they are hard linked and
	 * that correct behaviour then is just to return success without doing
	 * anything.
	 *
	 * XXX filesystem should take care of this itself, perhaps...
	 */
	if (fvp == tvp && fdvp == tdvp) {
		if (fromnd->ni_cnd.cn_namelen == tond->ni_cnd.cn_namelen &&
		    !bcmp(fromnd->ni_cnd.cn_nameptr, tond->ni_cnd.cn_nameptr,
		    fromnd->ni_cnd.cn_namelen)) {
			vn_authorize_skipped = TRUE;
			goto out1;
		}
	}

	if (holding_mntlock && fvp->v_mount != locked_mp) {
		/*
		 * we're holding a reference and lock
		 * on locked_mp, but it no longer matches
		 * what we want to do... so drop our hold
		 */
		mount_unlock_renames(locked_mp);
		mount_drop(locked_mp, 0);
		holding_mntlock = 0;
	}
	if (tdvp != fdvp && fvp->v_type == VDIR) {
		/*
		 * serialize renames that re-shape
		 * the tree... if holding_mntlock is
		 * set, then we're ready to go...
		 * otherwise we
		 * first need to drop the iocounts
		 * we picked up, second take the
		 * lock to serialize the access,
		 * then finally start the lookup
		 * process over with the lock held
		 */
		if (!holding_mntlock) {
			/*
			 * need to grab a reference on
			 * the mount point before we
			 * drop all the iocounts... once
			 * the iocounts are gone, the mount
			 * could follow
			 */
			locked_mp = fvp->v_mount;
			mount_ref(locked_mp, 0);

			/*
			 * nameidone has to happen before we vnode_put(tvp)
			 * since it may need to release the fs_nodelock on the tvp
			 */
			nameidone(tond);

			if (tvp) {
				vnode_put(tvp);
			}
			vnode_put(tdvp);

			/*
			 * nameidone has to happen before we vnode_put(fdvp)
			 * since it may need to release the fs_nodelock on the fvp
			 */
			nameidone(fromnd);

			vnode_put(fvp);
			vnode_put(fdvp);

			if (mnt_fvp != NULLVP) {
				vnode_put(mnt_fvp);
			}

			mount_lock_renames(locked_mp);
			holding_mntlock = 1;

			goto retry;
		}
	} else {
		/*
		 * when we dropped the iocounts to take
		 * the lock, we allowed the identity of
		 * the various vnodes to change... if they did,
		 * we may no longer be dealing with a rename
		 * that reshapes the tree... once we're holding
		 * the iocounts, the vnodes can't change type
		 * so we're free to drop the lock at this point
		 * and continue on
		 */
		if (holding_mntlock) {
			mount_unlock_renames(locked_mp);
			mount_drop(locked_mp, 0);
			holding_mntlock = 0;
		}
	}

	/* For compound VNOPs, authorization happens inside the VNOP instead. */
	if (!batched) {
		error = vn_authorize_renamex_with_paths(fdvp, mntrename ? mnt_fvp : fvp,
		    &fromnd->ni_cnd, from_name, tdvp, tvp, &tond->ni_cnd, to_name, ctx,
		    flags, NULL);
		if (error) {
			if (error == ENOENT) {
				if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
					/*
					 * We encountered a race where after doing the namei,
					 * tvp stops being valid. If so, simply re-drive the rename
					 * call from the top.
					 */
					do_retry = 1;
					retry_count += 1;
				}
			}
			goto out1;
		}
	}

	/* Release the 'mnt_fvp' now that it is no longer needed. */
	if (mnt_fvp != NULLVP) {
		vnode_put(mnt_fvp);
		mnt_fvp = NULLVP;
	}

	// save these off so we can later verify that fvp is the same
	oname = fvp->v_name;
	oparent = fvp->v_parent;

skipped_lookup:
#if CONFIG_FILE_LEASES
	/* Lease break needed for source's parent dir? */
	vnode_breakdirlease(fdvp, false, O_WRONLY);

	/* Lease break needed for target's parent dir? */
	vnode_breakdirlease(tdvp, false, O_WRONLY);
#endif

	error = vn_rename(fdvp, &fvp, &fromnd->ni_cnd, fvap,
	    tdvp, &tvp, &tond->ni_cnd, tvap,
	    flags, ctx);

	if (holding_mntlock) {
		/*
		 * we can drop our serialization
		 * lock now
		 */
		mount_unlock_renames(locked_mp);
		mount_drop(locked_mp, 0);
		holding_mntlock = 0;
	}
	if (error) {
		if (error == EDATALESS) {
			/*
			 * If we've been here before, something has gone
			 * horribly wrong and we should just get out lest
			 * we spiral around the drain forever.
			 */
			if (flags & VFS_RENAME_DATALESS) {
				error = EIO;
				goto out1;
			}

			/*
			 * The object we're renaming is dataless (or has a
			 * dataless descendent) and requires materialization
			 * before the rename occurs. But we're holding the
			 * mount point's rename lock, so it's not safe to
			 * make the upcall.
			 *
			 * In this case, we release the lock (above), perform
			 * the materialization, and start the whole thing over.
			 */
			error = vfs_materialize_reparent(fvp, tdvp);
			if (error == 0) {
				/*
				 * The next time around we need to tell the
				 * file system that the materializtaion has
				 * been performed.
				 */
				flags |= VFS_RENAME_DATALESS;
				do_retry = 1;
			}
			goto out1;
		}
		if (error == EKEEPLOOKING) {
			if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) == 0) {
				if ((tond->ni_flag & NAMEI_CONTLOOKUP) == 0) {
					panic("EKEEPLOOKING without NAMEI_CONTLOOKUP on either ndp?");
				}
			}

			fromnd->ni_vp = fvp;
			tond->ni_vp = tvp;

			goto continue_lookup;
		}

		/*
		 * We may encounter a race in the VNOP where the destination didn't
		 * exist when we did the namei, but it does by the time we go and
		 * try to create the entry. In this case, we should re-drive this rename
		 * call from the top again. Currently, only HFS bubbles out ERECYCLE,
		 * but other filesystems susceptible to this race could return it, too.
		 */
		if (error == ERECYCLE) {
			if (retry_count < MAX_RENAME_ERECYCLE_RETRIES) {
				do_retry = 1;
				retry_count += 1;
			} else {
				printf("rename retry limit due to ERECYCLE reached\n");
				error = ENOENT;
			}
		}

		/*
		 * For compound VNOPs, the authorization callback may return
		 * ENOENT in case of racing hardlink lookups hitting the name
		 * cache, redrive the lookup.
		 */
		if (batched && error == ENOENT) {
			if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
				do_retry = 1;
				retry_count += 1;
			}
		}

		goto out1;
	}

	/* call out to allow 3rd party notification of rename.
	 * Ignore result of kauth_authorize_fileop call.
	 */
	kauth_authorize_fileop(vfs_context_ucred(ctx),
	    KAUTH_FILEOP_RENAME,
	    (uintptr_t)from_name, (uintptr_t)to_name);
	if (flags & VFS_RENAME_SWAP) {
		/* A swap is two renames; notify for the reverse direction too. */
		kauth_authorize_fileop(vfs_context_ucred(ctx),
		    KAUTH_FILEOP_RENAME,
		    (uintptr_t)to_name, (uintptr_t)from_name);
	}

#if CONFIG_FSE
	if (from_name != NULL && to_name != NULL) {
		if (from_truncated || to_truncated) {
			// set it here since only the from_finfo gets reported up to user space
			from_finfo.mode |= FSE_TRUNCATED_PATH;
		}

		if (tvap && tvp) {
			vnode_get_fse_info_from_vap(tvp, &to_finfo, tvap);
		}
		if (fvap) {
			vnode_get_fse_info_from_vap(fvp, &from_finfo, fvap);
		}

		if (tvp) {
			add_fsevent(FSE_RENAME, ctx,
			    FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
			    FSE_ARG_FINFO, &from_finfo,
			    FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
			    FSE_ARG_FINFO, &to_finfo,
			    FSE_ARG_DONE);
			if (flags & VFS_RENAME_SWAP) {
				/*
				 * Strictly speaking, swap is the equivalent of
				 * *three* renames. FSEvents clients should only take
				 * the events as a hint, so we only bother reporting
				 * two.
				 */
				add_fsevent(FSE_RENAME, ctx,
				    FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
				    FSE_ARG_FINFO, &to_finfo,
				    FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
				    FSE_ARG_FINFO, &from_finfo,
				    FSE_ARG_DONE);
			}
		} else {
			add_fsevent(FSE_RENAME, ctx,
			    FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
			    FSE_ARG_FINFO, &from_finfo,
			    FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
			    FSE_ARG_DONE);
		}
	}
#endif /* CONFIG_FSE */

	/*
	 * update filesystem's mount point data
	 */
	if (mntrename) {
		char *cp, *pathend, *mpname;
		char * tobuf;
		struct mount *mp;
		int maxlen;
		size_t len = 0;

		mp = fvp->v_mountedhere;

		if (vfs_busy(mp, LK_NOWAIT)) {
			error = EBUSY;
			goto out1;
		}
		tobuf = zalloc(ZV_NAMEI);

		/* Copy the destination path from user or kernel space. */
		if (UIO_SEG_IS_USER_SPACE(segflg)) {
			error = copyinstr(to, tobuf, MAXPATHLEN, &len);
		} else {
			error = copystr((void *)to, tobuf, MAXPATHLEN, &len);
		}
		if (!error) {
			/* find current mount point prefix */
			pathend = &mp->mnt_vfsstat.f_mntonname[0];
			for (cp = pathend; *cp != '\0'; ++cp) {
				if (*cp == '/') {
					pathend = cp + 1;
				}
			}
			/* find last component of target name */
			for (mpname = cp = tobuf; *cp != '\0'; ++cp) {
				if (*cp == '/') {
					mpname = cp + 1;
				}
			}

			/* Update f_mntonname of sub mounts */
			vfs_iterate(0, rename_submounts_callback, (void *)mp);

			/* append name to prefix */
			maxlen = MAXPATHLEN - (int)(pathend - mp->mnt_vfsstat.f_mntonname);
			bzero(pathend, maxlen);

			strlcpy(pathend, mpname, maxlen);
		}
		zfree(ZV_NAMEI, tobuf);

		vfs_unbusy(mp);

		vfs_event_signal(NULL, VQ_UPDATE, (intptr_t)NULL);
	}
	/*
	 * fix up name & parent pointers. note that we first
	 * check that fvp has the same name/parent pointers it
	 * had before the rename call... this is a 'weak' check
	 * at best...
	 *
	 * XXX oparent and oname may not be set in the compound vnop case
	 */
	if (batched || (oname == fvp->v_name && oparent == fvp->v_parent)) {
		int update_flags;

		update_flags = VNODE_UPDATE_NAME;

		if (fdvp != tdvp) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		vnode_update_identity(fvp, tdvp, tond->ni_cnd.cn_nameptr, tond->ni_cnd.cn_namelen, tond->ni_cnd.cn_hash, update_flags);
	}
out1:
	/*
	 * There are some cases (for e.g. 'fvp == tvp') when vn_authorize was
	 * skipped earlier as no actual rename was performed.
	 */
	if (vn_authorize_skipped && error == 0) {
		error = vn_authorize_renamex_with_paths(fdvp, fvp,
		    &fromnd->ni_cnd, from_name, tdvp, tvp, &tond->ni_cnd, to_name, ctx,
		    flags, NULL);
		if (error && error == ENOENT) {
			if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
				do_retry = 1;
				retry_count += 1;
			}
		}
	}
	if (to_name != NULL) {
		RELEASE_PATH(to_name);
		to_name = NULL;
	}
	if (to_name_no_firmlink != NULL) {
		RELEASE_PATH(to_name_no_firmlink);
		to_name_no_firmlink = NULL;
	}
	if (from_name != NULL) {
		RELEASE_PATH(from_name);
		from_name = NULL;
	}
	if (from_name_no_firmlink != NULL) {
		RELEASE_PATH(from_name_no_firmlink);
		from_name_no_firmlink = NULL;
	}
	if (holding_mntlock) {
		mount_unlock_renames(locked_mp);
		mount_drop(locked_mp, 0);
		holding_mntlock = 0;
	}
	if (tdvp) {
		/*
		 * nameidone has to happen before we vnode_put(tdvp)
		 * since it may need to release the fs_nodelock on the tdvp
		 */
		nameidone(tond);

		if (tvp) {
			vnode_put(tvp);
		}
		vnode_put(tdvp);
	}
	if (fdvp) {
		/*
		 * nameidone has to happen before we vnode_put(fdvp)
		 * since it may need to release the fs_nodelock on the fdvp
		 */
		nameidone(fromnd);

		if (fvp) {
			vnode_put(fvp);
		}
		vnode_put(fdvp);
	}
	if (mnt_fvp != NULLVP) {
		vnode_put(mnt_fvp);
	}
	/*
	 * If things changed after we did the namei, then we will re-drive
	 * this rename call from the top.
	 */
	if (do_retry) {
		do_retry = 0;
		goto retry;
	}

	kfree_type(typeof(*__rename_data), __rename_data);
	return error;
}
9762
9763 int
rename(__unused proc_t p,struct rename_args * uap,__unused int32_t * retval)9764 rename(__unused proc_t p, struct rename_args *uap, __unused int32_t *retval)
9765 {
9766 return renameat_internal(vfs_context_current(), AT_FDCWD, uap->from,
9767 AT_FDCWD, uap->to, UIO_USERSPACE, 0);
9768 }
9769
9770 int
renameatx_np(__unused proc_t p,struct renameatx_np_args * uap,__unused int32_t * retval)9771 renameatx_np(__unused proc_t p, struct renameatx_np_args *uap, __unused int32_t *retval)
9772 {
9773 if (uap->flags & ~(RENAME_SECLUDE | RENAME_EXCL | RENAME_SWAP | RENAME_NOFOLLOW_ANY)) {
9774 return EINVAL;
9775 }
9776
9777 if ((uap->flags & (RENAME_EXCL | RENAME_SWAP)) == (RENAME_EXCL | RENAME_SWAP)) {
9778 return EINVAL;
9779 }
9780
9781 return renameat_internal(vfs_context_current(), uap->fromfd, uap->from,
9782 uap->tofd, uap->to, UIO_USERSPACE, uap->flags);
9783 }
9784
9785 int
renameat(__unused proc_t p,struct renameat_args * uap,__unused int32_t * retval)9786 renameat(__unused proc_t p, struct renameat_args *uap, __unused int32_t *retval)
9787 {
9788 return renameat_internal(vfs_context_current(), uap->fromfd, uap->from,
9789 uap->tofd, uap->to, UIO_USERSPACE, 0);
9790 }
9791
9792 /*
9793 * Make a directory file.
9794 *
9795 * Returns: 0 Success
9796 * EEXIST
9797 * namei:???
9798 * vnode_authorize:???
9799 * vn_create:???
9800 */
9801 /* ARGSUSED */
/*
 * Common implementation for mkdir(2) / mkdirat(2) / mkdir_extended(2).
 *
 * Looks up the target path relative to 'fd', authorizes and creates the
 * directory with the attributes in 'vap'.  Supports compound-mkdir file
 * systems: vn_create() may return EKEEPLOOKING, in which case the lookup
 * is continued (see "continue_lookup:").  Returns 0 on success or an
 * errno (notably EEXIST when the target already exists).
 */
static int
mkdir1at(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap, int fd,
    enum uio_seg segflg)
{
	vnode_t vp, dvp;
	int error;
	int update_flags = 0;
	int batched;
	struct nameidata nd;

	AUDIT_ARG(mode, vap->va_mode);
	NDINIT(&nd, CREATE, OP_MKDIR, LOCKPARENT | AUDITVNPATH1, segflg,
	    path, ctx);
	nd.ni_cnd.cn_flags |= WILLBEDIR;
	nd.ni_flag = NAMEI_COMPOUNDMKDIR;

continue_lookup:
	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* An existing entry at the target path is always EEXIST. */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}

	/* True when the FS can do lookup+mkdir as one compound VNOP. */
	batched = vnode_compound_mkdir_available(dvp);

	VATTR_SET(vap, va_type, VDIR);

	/*
	 * XXX
	 * Don't authorize in VFS for compound VNOP.... mkdir -p today assumes that it will
	 * only get EXISTS or EISDIR for existing path components, and not that it could see
	 * EACCESS/EPERM--so if we authorize for mkdir on "/" for "mkdir -p /tmp/foo/bar/baz"
	 * it will fail in a spurious manner. Need to figure out if this is valid behavior.
	 */
	if ((error = vn_authorize_mkdir(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0) {
		if (error == EACCES || error == EPERM) {
			int error2;

			/* Drop the first lookup's state before re-looking up. */
			nameidone(&nd);
			vnode_put(dvp);
			dvp = NULLVP;

			/*
			 * Try a lookup without "NAMEI_COMPOUNDVNOP" to make sure we return EEXIST
			 * rather than EACCESS if the target exists.
			 */
			NDINIT(&nd, LOOKUP, OP_MKDIR, AUDITVNPATH1, segflg,
			    path, ctx);
			error2 = nameiat(&nd, fd);
			if (error2) {
				goto out;
			} else {
				/* Target exists: report EEXIST instead of the auth error. */
				vp = nd.ni_vp;
				error = EEXIST;
				goto out;
			}
		}

		goto out;
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

	/*
	 * make the directory
	 */
	if ((error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx)) != 0) {
		/* Compound VNOP needs more lookup work; resume it. */
		if (error == EKEEPLOOKING) {
			nd.ni_vp = vp;
			goto continue_lookup;
		}

		goto out;
	}

	// Make sure the name & parent pointers are hooked up
	if (vp->v_name == NULL) {
		update_flags |= VNODE_UPDATE_NAME;
	}
	if (vp->v_parent == NULLVP) {
		update_flags |= VNODE_UPDATE_PARENT;
	}

	if (update_flags) {
		vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
	}

#if CONFIG_FSE
	add_fsevent(FSE_CREATE_DIR, ctx, FSE_ARG_VNODE, vp, FSE_ARG_DONE);
#endif

out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	if (dvp) {
		vnode_put(dvp);
	}

	return error;
}
9917
9918 /*
9919 * mkdir_extended: Create a directory; with extended security (ACL).
9920 *
9921 * Parameters: p Process requesting to create the directory
9922 * uap User argument descriptor (see below)
9923 * retval (ignored)
9924 *
9925 * Indirect: uap->path Path of directory to create
9926 * uap->mode Access permissions to set
9927 * uap->xsecurity ACL to set
9928 *
9929 * Returns: 0 Success
9930 * !0 Not success
9931 *
9932 */
9933 int
mkdir_extended(proc_t p,struct mkdir_extended_args * uap,__unused int32_t * retval)9934 mkdir_extended(proc_t p, struct mkdir_extended_args *uap, __unused int32_t *retval)
9935 {
9936 int ciferror;
9937 kauth_filesec_t xsecdst;
9938 struct vnode_attr va;
9939
9940 AUDIT_ARG(owner, uap->uid, uap->gid);
9941
9942 xsecdst = NULL;
9943 if ((uap->xsecurity != USER_ADDR_NULL) &&
9944 ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)) {
9945 return ciferror;
9946 }
9947
9948 VATTR_INIT(&va);
9949 VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd.fd_cmask);
9950 if (xsecdst != NULL) {
9951 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
9952 va.va_vaflags |= VA_FILESEC_ACL;
9953 }
9954
9955 ciferror = mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
9956 UIO_USERSPACE);
9957 if (xsecdst != NULL) {
9958 kauth_filesec_free(xsecdst);
9959 }
9960 return ciferror;
9961 }
9962
9963 int
mkdir(proc_t p,struct mkdir_args * uap,__unused int32_t * retval)9964 mkdir(proc_t p, struct mkdir_args *uap, __unused int32_t *retval)
9965 {
9966 struct vnode_attr va;
9967
9968 VATTR_INIT(&va);
9969 VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd.fd_cmask);
9970
9971 return mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
9972 UIO_USERSPACE);
9973 }
9974
9975 int
mkdirat(proc_t p,struct mkdirat_args * uap,__unused int32_t * retval)9976 mkdirat(proc_t p, struct mkdirat_args *uap, __unused int32_t *retval)
9977 {
9978 struct vnode_attr va;
9979
9980 VATTR_INIT(&va);
9981 VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd.fd_cmask);
9982
9983 return mkdir1at(vfs_context_current(), uap->path, &va, uap->fd,
9984 UIO_USERSPACE);
9985 }
9986
/*
 * rmdirat_internal: Remove the directory named by 'dirpath', resolved
 * relative to descriptor 'fd' (or the CWD when fd == AT_FDCWD).
 *
 * Shared implementation behind rmdir(2) and unlinkat(2)+AT_REMOVEDIR.
 * Handles compound-rmdir capable filesystems, fsevent and kauth listener
 * notification, dataless directories (VNODE_REMOVE_DATALESS_DIR), and
 * orphaned AppleDouble cleanup.
 *
 * Returns: 0 on success, otherwise an errno value.
 */
static int
rmdirat_internal(vfs_context_t ctx, int fd, user_addr_t dirpath,
    enum uio_seg segflg, int unlink_flags)
{
	/* Heap-allocated scratch: nameidata (and fsevent attrs) are too big for the stack. */
	struct {
		struct nameidata nd;
#if CONFIG_FSE
		struct vnode_attr va;
#endif /* CONFIG_FSE */
	} *__rmdir_data;
	vnode_t vp, dvp;
	int error;
	struct nameidata *ndp;
	char *path = NULL;
	char *no_firmlink_path = NULL;
	int len_path = 0;
	int len_no_firmlink_path = 0;
	int has_listeners = 0;
	int need_event = 0;
	int truncated_path = 0;
	int truncated_no_firmlink_path = 0;
	struct vnode_attr *vap = NULL;
	int restart_count = 0;          /* bounds ENOENT-race retries */
	int batched;                    /* nonzero when the fs does compound rmdir */

	int restart_flag;
	int nofollow_any = 0;

	__rmdir_data = kalloc_type(typeof(*__rmdir_data), Z_WAITOK);
	ndp = &__rmdir_data->nd;

	/* Translate the removal flag into its namei equivalent; consume it. */
	if (unlink_flags & VNODE_REMOVE_NOFOLLOW_ANY) {
		nofollow_any = NAMEI_NOFOLLOW_ANY;
		unlink_flags &= ~VNODE_REMOVE_NOFOLLOW_ANY;
	}

	/*
	 * This loop exists to restart rmdir in the unlikely case that two
	 * processes are simultaneously trying to remove the same directory
	 * containing orphaned appleDouble files.
	 */
	do {
		NDINIT(ndp, DELETE, OP_RMDIR, LOCKPARENT | AUDITVNPATH1,
		    segflg, dirpath, ctx);
		ndp->ni_flag = NAMEI_COMPOUNDRMDIR | nofollow_any;
continue_lookup:
		restart_flag = 0;
		vap = NULL;

		error = nameiat(ndp, fd);
		if (error) {
			goto err_out;
		}

		dvp = ndp->ni_dvp;
		vp = ndp->ni_vp;

		if (vp) {
			batched = vnode_compound_rmdir_available(vp);

			if (vp->v_flag & VROOT) {
				/*
				 * The root of a mounted filesystem cannot be deleted.
				 */
				error = EBUSY;
				goto out;
			}

#if DEVELOPMENT || DEBUG
			/*
			 * XXX VSWAP: Check for entitlements or special flag here
			 * so we can restrict access appropriately.
			 */
#else /* DEVELOPMENT || DEBUG */

			/* Release builds: only the kernel may remove a swap-backing vnode. */
			if (vnode_isswap(vp) && (ctx != vfs_context_kernel())) {
				error = EPERM;
				goto out;
			}
#endif /* DEVELOPMENT || DEBUG */

			/*
			 * Removed a check here; we used to abort if vp's vid
			 * was not the same as what we'd seen the last time around.
			 * I do not think that check was valid, because if we retry
			 * and all dirents are gone, the directory could legitimately
			 * be recycled but still be present in a situation where we would
			 * have had permission to delete. Therefore, we won't make
			 * an effort to preserve that check now that we may not have a
			 * vp here.
			 */

			if (!batched) {
				error = vn_authorize_rmdir(dvp, vp, &ndp->ni_cnd, ctx, NULL);
				if (error) {
					/* ENOENT from authorization may indicate a lookup race; retry. */
					if (error == ENOENT) {
						if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
							restart_flag = 1;
							restart_count += 1;
						}
					}
					goto out;
				}
			}
		} else {
			/* No vp: lookup deferred to the filesystem's compound rmdir. */
			batched = 1;

			if (!vnode_compound_rmdir_available(dvp)) {
				panic("No error, but no compound rmdir?");
			}
		}

#if CONFIG_FSE
		fse_info finfo = {0};

		need_event = need_fsevent(FSE_DELETE, dvp);
		if (need_event) {
			if (!batched) {
				get_fse_info(vp, &finfo, ctx);
			} else {
				/* Compound case: ask the fs which attrs the event needs. */
				error = vfs_get_notify_attributes(&__rmdir_data->va);
				if (error) {
					goto out;
				}

				vap = &__rmdir_data->va;
			}
		}
#endif
		has_listeners = kauth_authorize_fileop_has_listeners();
		if (need_event || has_listeners) {
			/* Build the pathnames before removal, while the entry still exists. */
			if (path == NULL) {
				GET_PATH(path);
			}

			len_path = safe_getpath(dvp, ndp->ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path);

			if (no_firmlink_path == NULL) {
				GET_PATH(no_firmlink_path);
			}

			len_no_firmlink_path = safe_getpath_no_firmlink(dvp, ndp->ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);
#if CONFIG_FSE
			if (truncated_no_firmlink_path) {
				finfo.mode |= FSE_TRUNCATED_PATH;
			}
#endif
		}

#if CONFIG_FILE_LEASES
		/* Break any directory lease on the parent before mutating it. */
		vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

		error = vn_rmdir(dvp, &vp, ndp, vap, ctx);
		ndp->ni_vp = vp;
		if (vp == NULLVP) {
			/* Couldn't find a vnode */
			goto out;
		}

		if (error == EKEEPLOOKING) {
			/* Filesystem wants the lookup redriven with current state. */
			goto continue_lookup;
		} else if (batched && error == ENOENT) {
			if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
				/*
				 * For compound VNOPs, the authorization callback
				 * may return ENOENT in case of racing hard link lookups
				 * redrive the lookup.
				 */
				restart_flag = 1;
				restart_count += 1;
				goto out;
			}
		}

		/*
		 * XXX There's no provision for passing flags
		 * to VNOP_RMDIR(). So, if vn_rmdir() fails
		 * because it's not empty, then we try again
		 * with VNOP_REMOVE(), passing in a special
		 * flag that clever file systems will know
		 * how to handle.
		 */
		if (error == ENOTEMPTY &&
		    (unlink_flags & VNODE_REMOVE_DATALESS_DIR) != 0) {
			/*
			 * Only do this if the directory is actually
			 * marked as DATALESS.
			 */
			struct vnode_attr *lvap =
			    kalloc_type(struct vnode_attr, Z_WAITOK);

			VATTR_INIT(lvap);
			VATTR_WANTED(lvap, va_flags);
			if (vnode_getattr(vp, lvap, ctx) == 0 &&
			    VATTR_IS_SUPPORTED(lvap, va_flags) &&
			    (lvap->va_flags & SF_DATALESS) != 0) {
				/*
				 * If this fails, we want to keep the original
				 * error.
				 */
				if (vn_remove(dvp, &vp, ndp,
				    VNODE_REMOVE_DATALESS_DIR, vap, ctx) == 0) {
					error = 0;
				}
			}
			kfree_type(struct vnode_attr, lvap);
		}

#if CONFIG_APPLEDOUBLE
		/*
		 * Special case to remove orphaned AppleDouble
		 * files. I don't like putting this in the kernel,
		 * but carbon does not like putting this in carbon either,
		 * so here we are.
		 */
		if (error == ENOTEMPTY) {
			int ad_error = rmdir_remove_orphaned_appleDouble(vp, ctx, &restart_flag);
			if (ad_error == EBUSY) {
				error = ad_error;
				goto out;
			}


			/*
			 * Assuming everything went well, we will try the RMDIR again
			 */
			if (!ad_error) {
				error = vn_rmdir(dvp, &vp, ndp, vap, ctx);
			}
		}
#endif /* CONFIG_APPLEDOUBLE */
		/*
		 * Call out to allow 3rd party notification of delete.
		 * Ignore result of kauth_authorize_fileop call.
		 */
		if (!error) {
			if (has_listeners) {
				kauth_authorize_fileop(vfs_context_ucred(ctx),
				    KAUTH_FILEOP_DELETE,
				    (uintptr_t)vp,
				    (uintptr_t)path);
			}

			if (vp->v_flag & VISHARDLINK) {
				// see the comment in unlink1() about why we update
				// the parent of a hard link when it is removed
				vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
			}

#if CONFIG_FSE
			if (need_event) {
				if (vap) {
					vnode_get_fse_info_from_vap(vp, &finfo, vap);
				}
				add_fsevent(FSE_DELETE, ctx,
				    FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
				    FSE_ARG_FINFO, &finfo,
				    FSE_ARG_DONE);
			}
#endif

#if CONFIG_MACF
			mac_vnode_notify_unlink(ctx, dvp, vp, &ndp->ni_cnd);
#endif
		}

out:
		if (path != NULL) {
			RELEASE_PATH(path);
			path = NULL;
		}

		if (no_firmlink_path != NULL) {
			RELEASE_PATH(no_firmlink_path);
			no_firmlink_path = NULL;
		}

		/*
		 * nameidone has to happen before we vnode_put(dvp)
		 * since it may need to release the fs_nodelock on the dvp
		 */
		nameidone(ndp);
		vnode_put(dvp);

		if (vp) {
			vnode_put(vp);
		}

		/*
		 * vp is used only as a sleep/wakeup channel address below;
		 * the references were already dropped above.
		 */
		if (restart_flag == 0) {
			wakeup_one((caddr_t)vp);
			goto err_out;
		}
		tsleep(vp, PVFS, "rm AD", 1);
	} while (restart_flag != 0);

err_out:
	kfree_type(typeof(*__rmdir_data), __rmdir_data);

	return error;
}
10288
10289 /*
10290 * Remove a directory file.
10291 */
10292 /* ARGSUSED */
10293 int
rmdir(__unused proc_t p,struct rmdir_args * uap,__unused int32_t * retval)10294 rmdir(__unused proc_t p, struct rmdir_args *uap, __unused int32_t *retval)
10295 {
10296 return rmdirat_internal(vfs_context_current(), AT_FDCWD,
10297 CAST_USER_ADDR_T(uap->path), UIO_USERSPACE, 0);
10298 }
10299
/*
 * Get direntry length padded to 8 byte alignment.
 * struct direntry embeds a MAXPATHLEN-byte d_name, so drop the unused
 * tail (keeping namlen + NUL) before rounding up to a multiple of 8.
 */
#define DIRENT64_LEN(namlen) \
	((sizeof(struct direntry) + (namlen) - (MAXPATHLEN-1) + 7) & ~7)

/*
 * Get dirent length padded to 4 byte alignment.
 * struct dirent embeds a (__DARWIN_MAXNAMLEN + 1)-byte d_name; swap that
 * for the actual name length plus NUL, then round up to a multiple of 4.
 */
#define DIRENT_LEN(namelen) \
	((sizeof(struct dirent) + (namelen + 1) - (__DARWIN_MAXNAMLEN + 1) + 3) & ~3)

/* Get the end of this dirent: the address of its last valid byte. */
#define DIRENT_END(dep) \
	(((char *)(dep)) + (dep)->d_reclen - 1)
10311
10312 errno_t
vnode_readdir64(struct vnode * vp,struct uio * uio,int flags,int * eofflag,int * numdirent,vfs_context_t ctxp)10313 vnode_readdir64(struct vnode *vp, struct uio *uio, int flags, int *eofflag,
10314 int *numdirent, vfs_context_t ctxp)
10315 {
10316 /* Check if fs natively supports VNODE_READDIR_EXTENDED */
10317 if ((vp->v_mount->mnt_vtable->vfc_vfsflags & VFC_VFSREADDIR_EXTENDED) &&
10318 ((vp->v_mount->mnt_kern_flag & MNTK_DENY_READDIREXT) == 0)) {
10319 return VNOP_READDIR(vp, uio, flags, eofflag, numdirent, ctxp);
10320 } else {
10321 size_t bufsize;
10322 void * bufptr;
10323 uio_t auio;
10324 struct direntry *entry64;
10325 struct dirent *dep;
10326 size_t bytesread;
10327 int error;
10328
10329 /*
10330 * We're here because the underlying file system does not
10331 * support direnties or we mounted denying support so we must
10332 * fall back to dirents and convert them to direntries.
10333 *
10334 * Our kernel buffer needs to be smaller since re-packing will
10335 * expand each dirent. The worse case (when the name length
10336 * is 3 or less) corresponds to a struct direntry size of 32
10337 * bytes (8-byte aligned) and a struct dirent size of 12 bytes
10338 * (4-byte aligned). So having a buffer that is 3/8 the size
10339 * will prevent us from reading more than we can pack.
10340 *
10341 * Since this buffer is wired memory, we will limit the
10342 * buffer size to a maximum of 32K. We would really like to
10343 * use 32K in the MIN(), but we use magic number 87371 to
10344 * prevent uio_resid() * 3 / 8 from overflowing.
10345 */
10346 bufsize = 3 * MIN((user_size_t)uio_resid(uio), 87371u) / 8;
10347 bufptr = kalloc_data(bufsize, Z_WAITOK);
10348 if (bufptr == NULL) {
10349 return ENOMEM;
10350 }
10351
10352 auio = uio_create(1, 0, UIO_SYSSPACE, UIO_READ);
10353 uio_addiov(auio, (uintptr_t)bufptr, bufsize);
10354 auio->uio_offset = uio->uio_offset;
10355
10356 error = VNOP_READDIR(vp, auio, 0, eofflag, numdirent, ctxp);
10357
10358 dep = (struct dirent *)bufptr;
10359 bytesread = bufsize - uio_resid(auio);
10360
10361 entry64 = kalloc_type(struct direntry, Z_WAITOK);
10362 /*
10363 * Convert all the entries and copy them out to user's buffer.
10364 */
10365 while (error == 0 && (char *)dep < ((char *)bufptr + bytesread)) {
10366 /* First check that the dirent struct up to d_name is within the buffer */
10367 if ((char*)dep + offsetof(struct dirent, d_name) > ((char *)bufptr + bytesread) ||
10368 /* Check that the length of the entire dirent is within the buffer */
10369 DIRENT_END(dep) > ((char *)bufptr + bytesread) ||
10370 /* Check that the actual length including the name doesn't exceed d_reclen */
10371 DIRENT_LEN(dep->d_namlen) > dep->d_reclen) {
10372 printf("%s: %s: Bad dirent recived from directory %s\n", __func__,
10373 vp->v_mount->mnt_vfsstat.f_mntonname,
10374 vp->v_name ? vp->v_name : "<unknown>");
10375 error = EIO;
10376 break;
10377 }
10378
10379 size_t enbufsize = DIRENT64_LEN(dep->d_namlen);
10380
10381 bzero(entry64, enbufsize);
10382 /* Convert a dirent to a dirent64. */
10383 entry64->d_ino = dep->d_ino;
10384 entry64->d_seekoff = 0;
10385 entry64->d_reclen = (uint16_t)enbufsize;
10386 entry64->d_namlen = dep->d_namlen;
10387 entry64->d_type = dep->d_type;
10388 bcopy(dep->d_name, entry64->d_name, dep->d_namlen + 1);
10389
10390 /* Move to next entry. */
10391 dep = (struct dirent *)((char *)dep + dep->d_reclen);
10392
10393 /* Copy entry64 to user's buffer. */
10394 error = uiomove((caddr_t)entry64, entry64->d_reclen, uio);
10395 }
10396
10397 /* Update the real offset using the offset we got from VNOP_READDIR. */
10398 if (error == 0) {
10399 uio->uio_offset = auio->uio_offset;
10400 }
10401 uio_free(auio);
10402 kfree_data(bufptr, bufsize);
10403 kfree_type(struct direntry, entry64);
10404 return error;
10405 }
10406 }
10407
10408 #define GETDIRENTRIES_MAXBUFSIZE (128 * 1024 * 1024U)
10409
10410 /*
10411 * Read a block of directory entries in a file system independent format.
10412 */
/*
 * getdirentries_common: Shared core of getdirentries(2) and
 * getdirentries64(2).  Reads up to 'bufsize' bytes of directory
 * entries from descriptor 'fd' into user buffer 'bufp'.
 *
 * On success: *bytesread is the byte count delivered, *offset (if
 * non-NULL) the directory offset *before* the read, and *eofflag is
 * set by the filesystem.  'flags' selects the extended (struct
 * direntry) format via VNODE_READDIR_EXTENDED.
 *
 * The per-fileglob offset lock is held across the read so concurrent
 * readers of the same open file see a consistent fg_offset.
 */
static int
getdirentries_common(int fd, user_addr_t bufp, user_size_t bufsize, ssize_t *bytesread,
    off_t *offset, int *eofflag, int flags)
{
	vnode_t vp;
	struct vfs_context context = *vfs_context_current(); /* local copy */
	struct fileproc *fp;
	uio_t auio;
	int spacetype = proc_is64bit(vfs_context_proc(&context)) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	off_t loff;
	int error, numdirent;
	UIO_STACKBUF(uio_buf, 1);

get_from_fd:
	error = fp_getfvp(vfs_context_proc(&context), fd, &fp, &vp);
	if (error) {
		return error;
	}

	/*
	 * Take the offset lock, then re-check that the fd still refers to
	 * the same vnode (a union-mount traversal below may swap it);
	 * retry the fd lookup if it changed underneath us.
	 */
	vn_offset_lock(fp->fp_glob);
	if (((vnode_t)fp_get_data(fp)) != vp) {
		vn_offset_unlock(fp->fp_glob);
		file_drop(fd);
		goto get_from_fd;
	}

	/* The descriptor must be open for reading. */
	if ((fp->fp_glob->fg_flag & FREAD) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EBADF;
		goto out;
	}

	/* Clamp pathological request sizes. */
	if (bufsize > GETDIRENTRIES_MAXBUFSIZE) {
		bufsize = GETDIRENTRIES_MAXBUFSIZE;
	}

#if CONFIG_MACF
	error = mac_file_check_change_offset(vfs_context_ucred(&context), fp->fp_glob);
	if (error) {
		goto out;
	}
#endif

	if ((error = vnode_getwithref(vp))) {
		goto out;
	}
	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

#if CONFIG_UNION_MOUNTS
unionread:
#endif /* CONFIG_UNION_MOUNTS */
	if (vp->v_type != VDIR) {
		(void)vnode_put(vp);
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_readdir(&context, vp);
	if (error != 0) {
		(void)vnode_put(vp);
		goto out;
	}
#endif /* MAC */

	/* Remember the pre-read offset; it is what the caller reports as "base". */
	loff = fp->fp_glob->fg_offset;
	auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, bufp, bufsize);

	if (flags & VNODE_READDIR_EXTENDED) {
		error = vnode_readdir64(vp, auio, flags, eofflag, &numdirent, &context);
		fp->fp_glob->fg_offset = uio_offset(auio);
	} else {
		error = VNOP_READDIR(vp, auio, 0, eofflag, &numdirent, &context);
		fp->fp_glob->fg_offset = uio_offset(auio);
	}
	if (error) {
		(void)vnode_put(vp);
		goto out;
	}

#if CONFIG_UNION_MOUNTS
	/*
	 * Nothing was read and this is a union mount: descend to the
	 * covered directory and read from there instead, swapping the
	 * fd's backing vnode to the lower layer.
	 */
	if ((user_ssize_t)bufsize == uio_resid(auio) &&
	    (vp->v_mount->mnt_flag & MNT_UNION)) {
		vnode_t uvp;

		if (lookup_traverse_union(vp, &uvp, &context) == 0) {
			if (vnode_ref(uvp) == 0) {
				fp_set_data(fp, uvp);
				fp->fp_glob->fg_offset = 0;
				vnode_rele(vp);
				vnode_put(vp);
				vp = uvp;
				goto unionread;
			} else {
				/* could not get a ref, can't replace in fd */
				vnode_put(uvp);
			}
		}
	}
#endif /* CONFIG_UNION_MOUNTS */

	vnode_put(vp);
	if (offset) {
		*offset = loff;
	}

	*bytesread = bufsize - uio_resid(auio);
out:
	vn_offset_unlock(fp->fp_glob);
	file_drop(fd);
	return error;
}
10526
10527
10528 int
getdirentries(__unused struct proc * p,struct getdirentries_args * uap,int32_t * retval)10529 getdirentries(__unused struct proc *p, struct getdirentries_args *uap, int32_t *retval)
10530 {
10531 off_t offset;
10532 ssize_t bytesread;
10533 int error, eofflag;
10534
10535 AUDIT_ARG(fd, uap->fd);
10536 error = getdirentries_common(uap->fd, uap->buf, uap->count,
10537 &bytesread, &offset, &eofflag, 0);
10538
10539 if (error == 0) {
10540 if (proc_is64bit(p)) {
10541 user64_long_t base = (user64_long_t)offset;
10542 error = copyout((caddr_t)&base, uap->basep, sizeof(user64_long_t));
10543 } else {
10544 user32_long_t base = (user32_long_t)offset;
10545 error = copyout((caddr_t)&base, uap->basep, sizeof(user32_long_t));
10546 }
10547 *retval = (int)bytesread;
10548 }
10549 return error;
10550 }
10551
10552 int
getdirentries64(__unused struct proc * p,struct getdirentries64_args * uap,user_ssize_t * retval)10553 getdirentries64(__unused struct proc *p, struct getdirentries64_args *uap, user_ssize_t *retval)
10554 {
10555 off_t offset;
10556 ssize_t bytesread;
10557 int error, eofflag;
10558 user_size_t bufsize;
10559
10560 AUDIT_ARG(fd, uap->fd);
10561
10562 /*
10563 * If the buffer is at least GETDIRENTRIES64_EXTENDED_BUFSIZE large,
10564 * then the kernel carves out the last 4 bytes to return extended
10565 * information to userspace (namely whether we reached EOF with this call).
10566 */
10567 if (uap->bufsize >= GETDIRENTRIES64_EXTENDED_BUFSIZE) {
10568 bufsize = uap->bufsize - sizeof(getdirentries64_flags_t);
10569 } else {
10570 bufsize = uap->bufsize;
10571 }
10572
10573 error = getdirentries_common(uap->fd, uap->buf, bufsize,
10574 &bytesread, &offset, &eofflag, VNODE_READDIR_EXTENDED);
10575
10576 if (error == 0) {
10577 *retval = bytesread;
10578 error = copyout((caddr_t)&offset, uap->position, sizeof(off_t));
10579
10580 if (error == 0 && uap->bufsize >= GETDIRENTRIES64_EXTENDED_BUFSIZE) {
10581 getdirentries64_flags_t flags = 0;
10582 if (eofflag) {
10583 flags |= GETDIRENTRIES64_EOF;
10584 }
10585 error = copyout(&flags, (user_addr_t)uap->buf + bufsize,
10586 sizeof(flags));
10587 }
10588 }
10589 return error;
10590 }
10591
10592
10593 /*
10594 * Set the mode mask for creation of filesystem nodes.
10595 * XXX implement xsecurity
10596 */
10597 #define UMASK_NOXSECURITY (void *)1 /* leave existing xsecurity alone */
10598 static int
umask1(proc_t p,int newmask,__unused kauth_filesec_t fsec,int32_t * retval)10599 umask1(proc_t p, int newmask, __unused kauth_filesec_t fsec, int32_t *retval)
10600 {
10601 AUDIT_ARG(mask, newmask);
10602 proc_fdlock(p);
10603 *retval = p->p_fd.fd_cmask;
10604 p->p_fd.fd_cmask = newmask & ALLPERMS;
10605 proc_fdunlock(p);
10606 return 0;
10607 }
10608
10609 /*
10610 * umask_extended: Set the mode mask for creation of filesystem nodes; with extended security (ACL).
10611 *
10612 * Parameters: p Process requesting to set the umask
10613 * uap User argument descriptor (see below)
10614 * retval umask of the process (parameter p)
10615 *
10616 * Indirect: uap->newmask umask to set
10617 * uap->xsecurity ACL to set
10618 *
10619 * Returns: 0 Success
10620 * !0 Not success
10621 *
10622 */
10623 int
umask_extended(proc_t p,struct umask_extended_args * uap,int32_t * retval)10624 umask_extended(proc_t p, struct umask_extended_args *uap, int32_t *retval)
10625 {
10626 return umask1(p, uap->newmask, KAUTH_FILESEC_NONE, retval);
10627 }
10628
10629 int
umask(proc_t p,struct umask_args * uap,int32_t * retval)10630 umask(proc_t p, struct umask_args *uap, int32_t *retval)
10631 {
10632 return umask1(p, uap->newmask, UMASK_NOXSECURITY, retval);
10633 }
10634
10635 #define REVOKE_MOUNTED_DEVICE_ENTITLEMENT \
10636 "com.apple.private.vfs.revoke-mounted-device"
10637
10638 /*
10639 * Void all references to file by ripping underlying filesystem
10640 * away from vnode.
10641 */
10642 /* ARGSUSED */
/*
 * revoke: Revoke access to the device special file named by uap->path
 * by invoking VNOP_REVOKE on it.  Only character and block devices are
 * eligible; the caller must own the node or be superuser.
 */
int
revoke(proc_t p, struct revoke_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	struct vnode_attr va;
	vfs_context_t ctx = vfs_context_current();
	int error;
	struct nameidata nd;

	/* Resolve the path (following symlinks) to the target vnode. */
	NDINIT(&nd, LOOKUP, OP_REVOKE, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
	    uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;

	nameidone(&nd);

	/* Only device special files can be revoked. */
	if (!(vnode_ischr(vp) || vnode_isblk(vp))) {
		error = ENOTSUP;
		goto out;
	}

	/* A block device backing a mounted filesystem is busy. */
	if (vnode_isblk(vp) && vnode_ismountedon(vp)) {
		error = EBUSY;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_revoke(ctx, vp);
	if (error) {
		goto out;
	}
#endif

	/* Permission: caller must own the node, or be superuser. */
	VATTR_INIT(&va);
	VATTR_WANTED(&va, va_uid);
	if ((error = vnode_getattr(vp, &va, ctx))) {
		goto out;
	}
	if (kauth_cred_getuid(vfs_context_ucred(ctx)) != va.va_uid &&
	    (error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
		goto out;
	}
	/* Only revoke if someone actually holds the node open (or it is aliased). */
	if (vp->v_usecount > 0 || (vnode_isaliased(vp))) {
		VNOP_REVOKE(vp, REVOKEALL, ctx);
	}
out:
	vnode_put(vp);
	return error;
}
10695
10696
10697 /*
 * HFS/HFS Plus SPECIFIC SYSTEM CALLS
10699 * The following system calls are designed to support features
10700 * which are specific to the HFS & HFS Plus volume formats
10701 */
10702
10703
10704 /*
10705 * Obtain attribute information on objects in a directory while enumerating
10706 * the directory.
10707 */
10708 /* ARGSUSED */
/*
 * getdirentriesattr: Enumerate a directory, returning the requested
 * attributes for each entry via VNOP_READDIRATTR.
 *
 * Copies the attribute list and entry count in from userspace, reads
 * attributed entries into uap->buffer under the fileglob offset lock,
 * and copies back the entry count, a directory state token, and the
 * pre-read offset ("base").  *retval is the EOF indicator.
 */
int
getdirentriesattr(proc_t p, struct getdirentriesattr_args *uap, int32_t *retval)
{
	vnode_t vp;
	struct fileproc *fp;
	uio_t auio = NULL;
	int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	uint32_t count = 0, savecount = 0;
	uint32_t newstate = 0;
	int error, eofflag = 0;
	off_t loff = 0;
	struct attrlist attributelist;
	vfs_context_t ctx = vfs_context_current();
	int fd = uap->fd;
	UIO_STACKBUF(uio_buf, 1);
	kauth_action_t action;

	AUDIT_ARG(fd, fd);

	/* Get the attributes into kernel space */
	if ((error = copyin(uap->alist, (caddr_t)&attributelist, sizeof(attributelist)))) {
		return error;
	}
	if ((error = copyin(uap->count, (caddr_t)&count, sizeof(count)))) {
		return error;
	}
	/* Remember the requested count so it can be restored on union-mount retry. */
	savecount = count;

get_from_fd:
	if ((error = fp_getfvp(p, fd, &fp, &vp))) {
		return error;
	}

	/*
	 * Take the offset lock, then re-check that the fd still refers to
	 * the same vnode (union traversal below may swap it); retry the
	 * fd lookup if it changed underneath us.
	 */
	vn_offset_lock(fp->fp_glob);
	if (((vnode_t)fp_get_data(fp)) != vp) {
		vn_offset_unlock(fp->fp_glob);
		file_drop(fd);
		goto get_from_fd;
	}

	/* The descriptor must be open for reading. */
	if ((fp->fp_glob->fg_flag & FREAD) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EBADF;
		goto out;
	}


#if CONFIG_MACF
	error = mac_file_check_change_offset(vfs_context_ucred(ctx),
	    fp->fp_glob);
	if (error) {
		goto out;
	}
#endif


	if ((error = vnode_getwithref(vp))) {
		goto out;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

#if CONFIG_UNION_MOUNTS
unionread:
#endif /* CONFIG_UNION_MOUNTS */
	if (vp->v_type != VDIR) {
		(void)vnode_put(vp);
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_readdir(ctx, vp);
	if (error != 0) {
		(void)vnode_put(vp);
		goto out;
	}
#endif /* MAC */

	/* set up the uio structure which will contain the users return buffer */
	loff = fp->fp_glob->fg_offset;
	auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, uap->buffer, uap->buffersize);

	/*
	 * If the only item requested is file names, we can let that past with
	 * just LIST_DIRECTORY. If they want any other attributes, that means
	 * they need SEARCH as well.
	 */
	action = KAUTH_VNODE_LIST_DIRECTORY;
	if ((attributelist.commonattr & ~ATTR_CMN_NAME) ||
	    attributelist.fileattr || attributelist.dirattr) {
		action |= KAUTH_VNODE_SEARCH;
	}

	if ((error = vnode_authorize(vp, NULL, action, ctx)) == 0) {
		/* Believe it or not, uap->options only has 32-bits of valid
		 * info, so truncate before extending again */

		error = VNOP_READDIRATTR(vp, &attributelist, auio, count,
		    (uint32_t)uap->options, &newstate, &eofflag, &count, ctx);
	}

	if (error) {
		(void) vnode_put(vp);
		goto out;
	}

#if CONFIG_UNION_MOUNTS
	/*
	 * If we've got the last entry of a directory in a union mount
	 * then reset the eofflag and pretend there's still more to come.
	 * The next call will again set eofflag and the buffer will be empty,
	 * so traverse to the underlying directory and do the directory
	 * read there.
	 */
	if (eofflag && vp->v_mount->mnt_flag & MNT_UNION) {
		if (uio_resid(auio) < (user_ssize_t) uap->buffersize) { // Got some entries
			eofflag = 0;
		} else { // Empty buffer
			vnode_t uvp;
			if (lookup_traverse_union(vp, &uvp, ctx) == 0) {
				if (vnode_ref_ext(uvp, fp->fp_glob->fg_flag & O_EVTONLY, 0) == 0) {
					fp_set_data(fp, uvp);
					fp->fp_glob->fg_offset = 0; // reset index for new dir
					count = savecount;
					vnode_rele_internal(vp, fp->fp_glob->fg_flag & O_EVTONLY, 0, 0);
					vnode_put(vp);
					vp = uvp;
					goto unionread;
				} else {
					/* could not get a ref, can't replace in fd */
					vnode_put(uvp);
				}
			}
		}
	}
#endif /* CONFIG_UNION_MOUNTS */

	(void)vnode_put(vp);

	if (error) {
		goto out;
	}
	fp->fp_glob->fg_offset = uio_offset(auio); /* should be multiple of dirent, not variable */

	/* Copy results back: entry count, directory state token, pre-read offset. */
	if ((error = copyout((caddr_t) &count, uap->count, sizeof(count)))) {
		goto out;
	}
	if ((error = copyout((caddr_t) &newstate, uap->newstate, sizeof(newstate)))) {
		goto out;
	}
	if ((error = copyout((caddr_t) &loff, uap->basep, sizeof(loff)))) {
		goto out;
	}

	*retval = eofflag; /* similar to getdirentries */
	error = 0;
out:
	vn_offset_unlock(fp->fp_glob);
	file_drop(fd);
	return error; /* return error earlier, a retval of 0 or 1 now */
} /* end of getdirentriesattr system call */
10872
10873 /*
10874 * Exchange data between two files
10875 */
10876
10877 /* ARGSUSED */
/*
 * exchangedata: Atomically swap the data forks of two regular files on
 * the same volume via VNOP_EXCHANGE, then swap their cached name/parent
 * identities and emit fsevent / kauth listener notifications.
 */
int
exchangedata(__unused proc_t p, struct exchangedata_args *uap, __unused int32_t *retval)
{
	struct nameidata fnd, snd;
	vfs_context_t ctx = vfs_context_current();
	vnode_t fvp;
	vnode_t svp;
	int error;
	u_int32_t nameiflags;
	char *fpath = NULL;
	char *spath = NULL;
	int flen = 0, slen = 0;
	int from_truncated = 0, to_truncated = 0;
#if CONFIG_FSE
	fse_info f_finfo, s_finfo;
#endif

	/* Follow symlinks unless the caller asked us not to. */
	nameiflags = 0;
	if ((uap->options & FSOPT_NOFOLLOW) == 0) {
		nameiflags |= FOLLOW;
	}

	/* Resolve the first path. */
	NDINIT(&fnd, LOOKUP, OP_EXCHANGEDATA, nameiflags | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path1, ctx);

	error = namei(&fnd);
	if (error) {
		goto out2;
	}

	nameidone(&fnd);
	fvp = fnd.ni_vp;

	/* Resolve the second path. */
	NDINIT(&snd, LOOKUP, OP_EXCHANGEDATA, CN_NBMOUNTLOOK | nameiflags | AUDITVNPATH2,
	    UIO_USERSPACE, uap->path2, ctx);

	error = namei(&snd);
	if (error) {
		vnode_put(fvp);
		goto out2;
	}
	nameidone(&snd);
	svp = snd.ni_vp;

	/*
	 * if the files are the same, return an inval error
	 */
	if (svp == fvp) {
		error = EINVAL;
		goto out;
	}

	/*
	 * if the files are on different volumes, return an error
	 */
	if (svp->v_mount != fvp->v_mount) {
		error = EXDEV;
		goto out;
	}

	/* If they're not files, return an error */
	if ((vnode_isreg(fvp) == 0) || (vnode_isreg(svp) == 0)) {
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_exchangedata(ctx,
	    fvp, svp);
	if (error) {
		goto out;
	}
#endif
	/* Caller needs read and write access to both files. */
	if (((error = vnode_authorize(fvp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0) ||
	    ((error = vnode_authorize(svp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0)) {
		goto out;
	}

	/* Build both pathnames up front if anyone is listening. */
	if (
#if CONFIG_FSE
		need_fsevent(FSE_EXCHANGE, fvp) ||
#endif
		kauth_authorize_fileop_has_listeners()) {
		GET_PATH(fpath);
		GET_PATH(spath);

		flen = safe_getpath(fvp, NULL, fpath, MAXPATHLEN, &from_truncated);
		slen = safe_getpath(svp, NULL, spath, MAXPATHLEN, &to_truncated);

#if CONFIG_FSE
		get_fse_info(fvp, &f_finfo, ctx);
		get_fse_info(svp, &s_finfo, ctx);
		if (from_truncated || to_truncated) {
			// set it here since only the f_finfo gets reported up to user space
			f_finfo.mode |= FSE_TRUNCATED_PATH;
		}
#endif
	}
	/* Ok, make the call */
	error = VNOP_EXCHANGE(fvp, svp, 0, ctx);

	if (error == 0) {
		const char *tmpname;

		if (fpath != NULL && spath != NULL) {
			/* call out to allow 3rd party notification of exchangedata.
			 * Ignore result of kauth_authorize_fileop call.
			 */
			kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_EXCHANGE,
			    (uintptr_t)fpath, (uintptr_t)spath);
		}

		/*
		 * The on-disk data was swapped, so swap the cached vnode
		 * identities (name and parent) to match, under the name
		 * cache lock.
		 */
		name_cache_lock();

		tmpname = fvp->v_name;
		fvp->v_name = svp->v_name;
		svp->v_name = tmpname;

		if (fvp->v_parent != svp->v_parent) {
			vnode_t tmp;

			tmp = fvp->v_parent;
			fvp->v_parent = svp->v_parent;
			svp->v_parent = tmp;
		}
		name_cache_unlock();

#if CONFIG_FSE
		if (fpath != NULL && spath != NULL) {
			add_fsevent(FSE_EXCHANGE, ctx,
			    FSE_ARG_STRING, flen, fpath,
			    FSE_ARG_FINFO, &f_finfo,
			    FSE_ARG_STRING, slen, spath,
			    FSE_ARG_FINFO, &s_finfo,
			    FSE_ARG_DONE);
		}
#endif
	}

out:
	if (fpath != NULL) {
		RELEASE_PATH(fpath);
	}
	if (spath != NULL) {
		RELEASE_PATH(spath);
	}
	vnode_put(svp);
	vnode_put(fvp);
out2:
	return error;
}
11028
11029 /*
11030 * Return (in MB) the amount of freespace on the given vnode's volume.
11031 */
11032 uint32_t freespace_mb(vnode_t vp);
11033
11034 uint32_t
freespace_mb(vnode_t vp)11035 freespace_mb(vnode_t vp)
11036 {
11037 vfs_update_vfsstat(vp->v_mount, vfs_context_current(), VFS_USER_EVENT);
11038 return (uint32_t)(((uint64_t)vp->v_mount->mnt_vfsstat.f_bavail *
11039 vp->v_mount->mnt_vfsstat.f_bsize) >> 20);
11040 }
11041
11042 #if CONFIG_SEARCHFS
11043
11044 /* ARGSUSED */
11045
/*
 * searchfs(2): ask the file system backing uap->path to search for objects
 * matching the caller-supplied attribute criteria.  The search parameters,
 * return-attribute list, and opaque per-search state are copied in from
 * user space, validated, and handed to the file system via VNOP_SEARCHFS().
 * Matches are written directly into the caller's return buffer via a uio.
 * EAGAIN from the file system means "call again to continue the search"
 * and is reported to the caller alongside any matches copied out so far.
 */
int
searchfs(proc_t p, struct searchfs_args *uap, __unused int32_t *retval)
{
	vnode_t vp, tvp;
	int i, error = 0;
	int fserror = 0;
	struct nameidata nd;
	struct user64_fssearchblock searchblock;
	struct searchstate *state;
	struct attrlist *returnattrs;
	struct timeval timelimit;
	void *searchparams1, *searchparams2;
	uio_t auio = NULL;
	int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	uint32_t nummatches;
	size_t mallocsize;
	uint32_t nameiflags;
	vfs_context_t ctx = vfs_context_current();
	UIO_STACKBUF(uio_buf, 1);

	/* Start by copying in the fssearchblock parameter list. */
	if (IS_64BIT_PROCESS(p)) {
		error = copyin(uap->searchblock, (caddr_t) &searchblock, sizeof(searchblock));
		timelimit.tv_sec = searchblock.timelimit.tv_sec;
		timelimit.tv_usec = searchblock.timelimit.tv_usec;
	} else {
		struct user32_fssearchblock tmp_searchblock;

		error = copyin(uap->searchblock, (caddr_t) &tmp_searchblock, sizeof(tmp_searchblock));
		// munge the 32-bit layout into the 64-bit version used internally
		searchblock.returnattrs = CAST_USER_ADDR_T(tmp_searchblock.returnattrs);
		searchblock.returnbuffer = CAST_USER_ADDR_T(tmp_searchblock.returnbuffer);
		searchblock.returnbuffersize = tmp_searchblock.returnbuffersize;
		searchblock.maxmatches = tmp_searchblock.maxmatches;
		/*
		 * These casts are safe. We will promote the tv_sec into a 64 bit long if necessary
		 * from a 32 bit long, and tv_usec is already a signed 32 bit int.
		 */
		timelimit.tv_sec = (__darwin_time_t) tmp_searchblock.timelimit.tv_sec;
		timelimit.tv_usec = (__darwin_useconds_t) tmp_searchblock.timelimit.tv_usec;
		searchblock.searchparams1 = CAST_USER_ADDR_T(tmp_searchblock.searchparams1);
		searchblock.sizeofsearchparams1 = tmp_searchblock.sizeofsearchparams1;
		searchblock.searchparams2 = CAST_USER_ADDR_T(tmp_searchblock.searchparams2);
		searchblock.sizeofsearchparams2 = tmp_searchblock.sizeofsearchparams2;
		searchblock.searchattrs = tmp_searchblock.searchattrs;
	}
	if (error) {
		return error;
	}

	/*
	 * Sanity-check the user-supplied parameter buffer sizes before they
	 * are used to size the kernel allocation below.
	 */
	if (searchblock.sizeofsearchparams1 > SEARCHFS_MAX_SEARCHPARMS ||
	    searchblock.sizeofsearchparams2 > SEARCHFS_MAX_SEARCHPARMS) {
		return EINVAL;
	}

	/* Now malloc a big bunch of space to hold the search parameters, the attrlists and the search state. */
	/* It all has to go into local memory and it's not that big so we might as well put it all together. */
	/* Searchparams1 shall be first so we might as well use that to hold the base address of the allocated*/
	/* block. */
	/*                                                                                                    */
	/* NOTE: we allocate an extra 8 bytes to account for the difference in size of the searchstate        */
	/*       due to the changes in rdar://problem/12438273.  That way if a 3rd party file system          */
	/*       assumes the size is still 556 bytes it will continue to work                                 */

	mallocsize = searchblock.sizeofsearchparams1 + searchblock.sizeofsearchparams2 +
	    sizeof(struct attrlist) + sizeof(struct searchstate) + (2 * sizeof(uint32_t));

	searchparams1 = kalloc_data(mallocsize, Z_WAITOK);

	/* Now set up the various pointers to the correct place in our newly allocated memory */

	searchparams2 = (void *) (((caddr_t) searchparams1) + searchblock.sizeofsearchparams1);
	returnattrs = (struct attrlist *) (((caddr_t) searchparams2) + searchblock.sizeofsearchparams2);
	state = (struct searchstate *) (((caddr_t) returnattrs) + sizeof(struct attrlist));

	/* Now copy in the stuff given our local variables. */

	if ((error = copyin(searchblock.searchparams1, searchparams1, searchblock.sizeofsearchparams1))) {
		goto freeandexit;
	}

	if ((error = copyin(searchblock.searchparams2, searchparams2, searchblock.sizeofsearchparams2))) {
		goto freeandexit;
	}

	if ((error = copyin(searchblock.returnattrs, (caddr_t) returnattrs, sizeof(struct attrlist)))) {
		goto freeandexit;
	}

	if ((error = copyin(uap->state, (caddr_t) state, sizeof(struct searchstate)))) {
		goto freeandexit;
	}

	/*
	 * When searching a union mount, need to set the
	 * start flag at the first call on each layer to
	 * reset state for the new volume.
	 */
	if (uap->options & SRCHFS_START) {
		state->ss_union_layer = 0;
	} else {
		uap->options |= state->ss_union_flags;
	}
	state->ss_union_flags = 0;

	/*
	 * Because searchparams1 and searchparams2 may contain an ATTR_CMN_NAME search parameter,
	 * which is passed in with an attrreference_t, we need to inspect the buffer manually here.
	 * The KPI does not provide us the ability to pass in the length of the buffers searchparams1
	 * and searchparams2. To obviate the need for all searchfs-supporting filesystems to
	 * validate the user-supplied data offset of the attrreference_t, we'll do it here.
	 */

	if (searchblock.searchattrs.commonattr & ATTR_CMN_NAME) {
		attrreference_t* string_ref;
		u_int32_t* start_length;
		user64_size_t param_length;

		/* validate searchparams1 */
		param_length = searchblock.sizeofsearchparams1;
		/* skip the word that specifies length of the buffer */
		start_length = (u_int32_t*) searchparams1;
		start_length = start_length + 1;
		string_ref = (attrreference_t*) start_length;

		/* ensure no negative offsets or too big offsets */
		if (string_ref->attr_dataoffset < 0) {
			error = EINVAL;
			goto freeandexit;
		}
		if (string_ref->attr_length > MAXPATHLEN) {
			error = EINVAL;
			goto freeandexit;
		}

		/* Check for pointer overflow in the string ref */
		if (((char*) string_ref + string_ref->attr_dataoffset) < (char*) string_ref) {
			error = EINVAL;
			goto freeandexit;
		}

		/* The name data must start and end inside the searchparams1 buffer. */
		if (((char*) string_ref + string_ref->attr_dataoffset) > ((char*)searchparams1 + param_length)) {
			error = EINVAL;
			goto freeandexit;
		}
		if (((char*)string_ref + string_ref->attr_dataoffset + string_ref->attr_length) > ((char*)searchparams1 + param_length)) {
			error = EINVAL;
			goto freeandexit;
		}
	}

	/* set up the uio structure which will contain the users return buffer */
	auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, searchblock.returnbuffer, searchblock.returnbuffersize);

	nameiflags = 0;
	if ((uap->options & FSOPT_NOFOLLOW) == 0) {
		nameiflags |= FOLLOW;
	}
	NDINIT(&nd, LOOKUP, OP_SEARCHFS, nameiflags | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);

	error = namei(&nd);
	if (error) {
		goto freeandexit;
	}
	vp = nd.ni_vp;
	nameidone(&nd);

	/*
	 * Switch to the root vnode for the volume
	 */
	error = VFS_ROOT(vnode_mount(vp), &tvp, ctx);
	vnode_put(vp);
	if (error) {
		goto freeandexit;
	}
	vp = tvp;

#if CONFIG_UNION_MOUNTS
	/*
	 * If it's a union mount, the path lookup takes
	 * us to the top layer. But we may need to descend
	 * to a lower layer. For non-union mounts the layer
	 * is always zero.
	 */
	for (i = 0; i < (int) state->ss_union_layer; i++) {
		if ((vp->v_mount->mnt_flag & MNT_UNION) == 0) {
			break;
		}
		tvp = vp;
		vp = vp->v_mount->mnt_vnodecovered;
		if (vp == NULL) {
			vnode_put(tvp);
			error = ENOENT;
			goto freeandexit;
		}
		error = vnode_getwithref(vp);
		vnode_put(tvp);
		if (error) {
			goto freeandexit;
		}
	}
#endif /* CONFIG_UNION_MOUNTS */

#if CONFIG_MACF
	error = mac_vnode_check_searchfs(ctx, vp, returnattrs, &searchblock.searchattrs);
	if (error) {
		vnode_put(vp);
		goto freeandexit;
	}
#endif


	/*
	 * If searchblock.maxmatches == 0, then skip the search. This has happened
	 * before and sometimes the underlying code doesn't deal with it well.
	 */
	if (searchblock.maxmatches == 0) {
		nummatches = 0;
		goto saveandexit;
	}

	/*
	 * Allright, we have everything we need, so lets make that call.
	 *
	 * We keep special track of the return value from the file system:
	 * EAGAIN is an acceptable error condition that shouldn't keep us
	 * from copying out any results...
	 */

	fserror = VNOP_SEARCHFS(vp,
	    searchparams1,
	    searchparams2,
	    &searchblock.searchattrs,
	    (uint32_t)searchblock.maxmatches,
	    &timelimit,
	    returnattrs,
	    &nummatches,
	    (uint32_t)uap->scriptcode,
	    (uint32_t)uap->options,
	    auio,
	    (struct searchstate *) &state->ss_fsstate,
	    ctx);

#if CONFIG_UNION_MOUNTS
	/*
	 * If it's a union mount we need to be called again
	 * to search the mounted-on filesystem.
	 */
	if ((vp->v_mount->mnt_flag & MNT_UNION) && fserror == 0) {
		state->ss_union_flags = SRCHFS_START;
		state->ss_union_layer++;        // search next layer down
		fserror = EAGAIN;
	}
#endif /* CONFIG_UNION_MOUNTS */

saveandexit:

	vnode_put(vp);

	/* Now copy out the stuff that needs copying out. That means the number of matches, the
	 * search state.  Everything was already put into the return buffer by the vop call. */

	if ((error = copyout((caddr_t) state, uap->state, sizeof(struct searchstate))) != 0) {
		goto freeandexit;
	}

	if ((error = suulong(uap->nummatches, (uint64_t)nummatches)) != 0) {
		goto freeandexit;
	}

	/* Report the file system's result (0, EAGAIN, or a real error). */
	error = fserror;

freeandexit:

	kfree_data(searchparams1, mallocsize);

	return error;
} /* end of searchfs system call */
11328
11329 #else /* CONFIG_SEARCHFS */
11330
/* Stub used when searchfs support is compiled out. */
int
searchfs(__unused proc_t p, __unused struct searchfs_args *uap, __unused int32_t *retval)
{
	return ENOTSUP;
}
11336
11337 #endif /* CONFIG_SEARCHFS */
11338
11339
11340 #if CONFIG_DATALESS_FILES
11341
11342 /*
11343 * === Namespace Resolver Up-call Mechanism ===
11344 *
11345 * When I/O is performed to a dataless file or directory (read, write,
11346 * lookup-in, etc.), the file system performs an upcall to the namespace
11347 * resolver (filecoordinationd) to materialize the object.
11348 *
11349 * We need multiple up-calls to be in flight at once, and we need these
11350 * up-calls to be interruptible, thus the following implementation:
11351 *
11352 * => The nspace_resolver_request represents the in-kernel request state.
11353 * It contains a request ID, storage space for the errno code returned
11354 * by filecoordinationd, and flags.
11355 *
11356 * => The request ID is simply a global monotonically incrementing 32-bit
11357 * number. Outstanding requests are stored in a hash table, and the
11358 * hash function is extremely simple.
11359 *
11360 * => When an upcall is to be made to filecoordinationd, a request structure
11361 * is allocated on the stack (it is small, and needs to live only during
11362 * the duration of the call to resolve_nspace_item_ext()). It is
11363 * initialized and inserted into the table. Some backpressure from
 *    filecoordinationd is applied by limiting the number of entries that
11365 * can be inserted into the table (and thus limiting the number of
11366 * outstanding requests issued to filecoordinationd); waiting for an
11367 * available slot is interruptible.
11368 *
11369 * => Once the request has been inserted into the table, the up-call is made
11370 * to filecoordinationd via a MiG-generated stub. The up-call returns
11371 * immediately and filecoordinationd processes the request asynchronously.
11372 *
 * => The caller now waits for the request to complete.  This is achieved by
11374 * sleeping on the address of the request structure and waiting for
11375 * filecoordinationd to mark the request structure as complete. This
11376 * is an interruptible sleep call; if interrupted, the request structure
11377 * is removed from the table and EINTR is returned to the caller. If
11378 * this occurs, an advisory up-call is made to filecoordinationd with
11379 * the request ID to indicate that the request can be aborted or
11380 * de-prioritized at the discretion of filecoordinationd.
11381 *
11382 * => When filecoordinationd has completed the request, it signals completion
11383 * by writing to the vfs.nspace.complete sysctl node. Only a process
11384 * decorated as a namespace resolver can write to this sysctl node. The
11385 * value is a request ID / errno tuple passed as an array of 2 uint32_t's.
11386 * The request ID is looked up in the table, and if the request is found,
11387 * the error code is stored in the request structure and a wakeup()
11388 * issued on the address of the request structure. If the request is not
11389 * found, we simply drop the completion notification, assuming that the
11390 * caller was interrupted.
11391 *
11392 * => When the waiting thread wakes up, it extracts the error code from the
11393 * request structure, removes the request from the table, and returns the
11394 * error code to the calling function. Fini!
11395 */
11396
/*
 * In-kernel state for one outstanding materialization request.  These are
 * stack-allocated by the requesting thread and linked into the request
 * hash table for the duration of the up-call.
 */
struct nspace_resolver_request {
	LIST_ENTRY(nspace_resolver_request) r_hashlink; /* hash bucket linkage */
	vnode_t         r_vp;           /* vnode being materialized */
	vnode_t         r_tdvp;         /* destination dir, if any (may be NULL) */
	uint32_t        r_req_id;       /* ID used to match up the completion */
	int             r_resolver_error; /* errno reported by filecoordinationd */
	int             r_flags;        /* RRF_* flags below */
};

#define RRF_COMPLETE    0x0001  /* completion recorded; waiter may proceed */
#define RRF_COMPLETING  0x0002  /* completion handler still using this req */
11408
/*
 * Completion tuple written by filecoordinationd via the vfs.nspace.complete
 * sysctl.  The gencount/syncroot fields are optional namespace-shape
 * criteria; zero means "no criterion supplied".
 */
struct nspace_resolver_completion_data {
	uint32_t        req_id;         /* ID of the request being completed */
	int32_t         resolver_error; /* errno result from the resolver */
	uint64_t        orig_gencount;  /* expected recursive gencount, or 0 */
	uint64_t        orig_syncroot;  /* expected sync-root ID, or 0 */
};
11415
/*
 * Return the next request ID: a global, monotonically incrementing 32-bit
 * counter (wrap-around is harmless given the small number of requests
 * outstanding at once).  NOTE(review): OSAddAtomic appears to return the
 * value prior to the addition, so IDs start at 0 — confirm against the
 * OSAddAtomic contract if the initial value matters.
 */
static uint32_t
next_nspace_req_id(void)
{
	static uint32_t next_req_id;

	return OSAddAtomic(1, &next_req_id);
}
11423
#define NSPACE_RESOLVER_REQ_HASHSIZE    32      /* XXX tune */
#define NSPACE_RESOLVER_MAX_OUTSTANDING 256     /* XXX tune */

/* Hash table of outstanding requests, keyed by request ID. */
static LIST_HEAD(nspace_resolver_requesthead,
    nspace_resolver_request) * nspace_resolver_request_hashtbl;
static u_long nspace_resolver_request_hashmask;
/* Number of requests currently in the table (bounded by MAX_OUTSTANDING). */
static u_int nspace_resolver_request_count;
/* True when some thread is sleeping for a free slot in the table. */
static bool nspace_resolver_request_wait_slot;
static LCK_GRP_DECLARE(nspace_resolver_request_lck_grp, "file namespace resolver");
/* Single mutex protecting the table, count, and per-request flags. */
static LCK_MTX_DECLARE(nspace_resolver_request_hash_mutex,
    &nspace_resolver_request_lck_grp);

#define NSPACE_REQ_LOCK() \
	lck_mtx_lock(&nspace_resolver_request_hash_mutex)
#define NSPACE_REQ_UNLOCK() \
	lck_mtx_unlock(&nspace_resolver_request_hash_mutex)

/* Trivial hash: low bits of the (monotonic) request ID pick the bucket. */
#define NSPACE_RESOLVER_HASH(req_id)    \
	(&nspace_resolver_request_hashtbl[(req_id) & \
	 nspace_resolver_request_hashmask])
11444
11445 static struct nspace_resolver_request *
nspace_resolver_req_lookup(uint32_t req_id,bool skip_completing)11446 nspace_resolver_req_lookup(uint32_t req_id, bool skip_completing)
11447 {
11448 struct nspace_resolver_requesthead *bucket;
11449 struct nspace_resolver_request *req;
11450
11451 bucket = NSPACE_RESOLVER_HASH(req_id);
11452 LIST_FOREACH(req, bucket, r_hashlink) {
11453 if (req->r_req_id == req_id) {
11454 /*
11455 * If this request already has a completion
11456 * pending, don't return it again.
11457 */
11458 if ((req->r_flags & RRF_COMPLETING) != 0 &&
11459 skip_completing) {
11460 req = NULL;
11461 }
11462 return req;
11463 }
11464 }
11465
11466 return NULL;
11467 }
11468
/*
 * Insert a request into the hash table, blocking (interruptibly) while the
 * table is full to apply backpressure on callers.  Returns 0 on success or
 * the errno from an interrupted sleep.  Takes and releases NSPACE_REQ_LOCK.
 */
static int
nspace_resolver_req_add(struct nspace_resolver_request *req)
{
	struct nspace_resolver_requesthead *bucket;
	int error;

	NSPACE_REQ_LOCK();

	/* Wait for a free slot; the sleep is interruptible (PCATCH). */
	while (nspace_resolver_request_count >=
	    NSPACE_RESOLVER_MAX_OUTSTANDING) {
		nspace_resolver_request_wait_slot = true;
		error = msleep(&nspace_resolver_request_count,
		    &nspace_resolver_request_hash_mutex,
		    PVFS | PCATCH, "nspacerq", NULL);
		if (error) {
			NSPACE_REQ_UNLOCK();
			return error;
		}
	}

	bucket = NSPACE_RESOLVER_HASH(req->r_req_id);
#if DIAGNOSTIC
	assert(nspace_resolver_req_lookup(req->r_req_id, false) == NULL);
#endif /* DIAGNOSTIC */
	LIST_INSERT_HEAD(bucket, req, r_hashlink);
	nspace_resolver_request_count++;

	NSPACE_REQ_UNLOCK();

	return 0;
}
11500
/*
 * Block (uninterruptibly) until any in-flight completion handler is done
 * with 'req'.  Caller must hold NSPACE_REQ_LOCK; msleep drops and reacquires
 * it around each wait.
 */
static void
nspace_resolver_req_wait_pending_completion(struct nspace_resolver_request *req)
{
	/*
	 * If a completion is in-progress, we have to wait for the
	 * completion handler to finish because it's still using 'req',
	 * which is allocated on our stack a couple of frames up.
	 */
	while ((req->r_flags & RRF_COMPLETING) != 0) {
		(void) msleep(req, &nspace_resolver_request_hash_mutex,
		    PVFS, "nspacecmplt", NULL);
	}
}
11514
/*
 * Unlink a request from the hash table, wake any thread waiting for a table
 * slot, wait out any pending completion handler, and drop NSPACE_REQ_LOCK.
 * After this returns the request structure may be safely destroyed.
 */
static void
nspace_resolver_req_remove_and_unlock(struct nspace_resolver_request *req)
{
	struct nspace_resolver_requesthead *bucket;

	/* We're called with NSPACE_REQ_LOCK held. */

	bucket = NSPACE_RESOLVER_HASH(req->r_req_id);
#if DIAGNOSTIC
	assert((req->r_flags & RRF_COMPLETING) == 0);
	assert(nspace_resolver_req_lookup(req->r_req_id, false) != NULL);
#endif /* DIAGNOSTIC */
	LIST_REMOVE(req, r_hashlink);
	nspace_resolver_request_count--;

	/* A slot just opened up; wake anyone blocked in req_add(). */
	if (nspace_resolver_request_wait_slot) {
		nspace_resolver_request_wait_slot = false;
		wakeup(&nspace_resolver_request_count);
	}

	/*
	 * NOTE(review): the DIAGNOSTIC assert above expects RRF_COMPLETING
	 * to be clear already; this wait is the belt-and-suspenders path
	 * for the non-DIAGNOSTIC build.
	 */
	nspace_resolver_req_wait_pending_completion(req);

	NSPACE_REQ_UNLOCK();
}
11539
/* Convenience wrapper: take NSPACE_REQ_LOCK, then remove and unlock. */
static void
nspace_resolver_req_remove(struct nspace_resolver_request *req)
{
	NSPACE_REQ_LOCK();
	nspace_resolver_req_remove_and_unlock(req);
}
11546
/*
 * Send an advisory cancellation for the given request ID to
 * filecoordinationd (e.g. after the waiting thread was interrupted).
 * Failures are logged but otherwise ignored.
 */
static void
nspace_resolver_req_cancel(uint32_t req_id)
{
	kern_return_t kr;
	mach_port_t mp;

	// Failures here aren't fatal -- the cancellation message
	// sent to the resolver is merely advisory.

	kr = host_get_filecoordinationd_port(host_priv_self(), &mp);
	if (kr != KERN_SUCCESS || !IPC_PORT_VALID(mp)) {
		return;
	}

	kr = send_nspace_resolve_cancel(mp, req_id);
	if (kr != KERN_SUCCESS) {
		os_log_error(OS_LOG_DEFAULT,
		    "NSPACE send_nspace_resolve_cancel failure: %d", kr);
	}

	/* Drop the send right obtained above. */
	ipc_port_release_send(mp);
}
11569
/*
 * Wait (interruptibly) for filecoordinationd to complete the request, then
 * remove it from the table and return the resolver's errno.  If the sleep
 * is interrupted, record EINTR/ETIMEDOUT as the result and send an advisory
 * cancel to the resolver.
 */
static int
nspace_resolver_req_wait(struct nspace_resolver_request *req)
{
	bool send_cancel_message = false;
	int error;

	NSPACE_REQ_LOCK();

	while ((req->r_flags & RRF_COMPLETE) == 0) {
		error = msleep(req, &nspace_resolver_request_hash_mutex,
		    PVFS | PCATCH, "nspace", NULL);
		/* ERESTART just means "keep waiting"; anything else bails. */
		if (error && error != ERESTART) {
			req->r_resolver_error = (error == EINTR) ? EINTR :
			    ETIMEDOUT;
			send_cancel_message = true;
			break;
		}
	}

	/* Drops NSPACE_REQ_LOCK and waits out any pending completion. */
	nspace_resolver_req_remove_and_unlock(req);

	/*
	 * It's safe to continue referencing 'req' here because it's
	 * allocated on our caller's stack.
	 */

	if (send_cancel_message) {
		nspace_resolver_req_cancel(req->r_req_id);
	}

	return req->r_resolver_error;
}
11602
/*
 * Record the resolver's result, clear the in-progress flag, set the
 * complete flag, and wake the waiting thread.  Caller must hold
 * NSPACE_REQ_LOCK.
 */
static void
nspace_resolver_req_mark_complete(
	struct nspace_resolver_request *req,
	int resolver_error)
{
	req->r_resolver_error = resolver_error;
	req->r_flags = (req->r_flags & ~RRF_COMPLETING) | RRF_COMPLETE;
	wakeup(req);
}
11612
/*
 * Flag the request as having a completion in progress, so lookups skip it
 * and removal waits for the completion handler.  Caller must hold
 * NSPACE_REQ_LOCK.
 */
static void
nspace_resolver_req_mark_completion_pending(struct nspace_resolver_request *req)
{
	req->r_flags |= RRF_COMPLETING;
}
11618
/*
 * Handle a completion reported by filecoordinationd.  Looks up the request
 * by ID, optionally validates the resolver-supplied namespace-shape
 * criteria (recursive gencount and/or sync-root ID) under the mount rename
 * lock, and then marks the request complete with the final error code.
 * Unmatched completions are silently dropped.
 */
static void
nspace_resolver_req_completed(const struct nspace_resolver_completion_data *c)
{
	struct nspace_resolver_request *req;
	int error;
	struct vnode_attr va;
	vnode_t vp;

	NSPACE_REQ_LOCK();

	req = nspace_resolver_req_lookup(c->req_id, true);
	if (req == NULL) {
		/*
		 * If we don't find the request corresponding to our req_id,
		 * just drop the completion on the floor; it's likely that
		 * the requester interrupted with a signal, or it may already
		 * be completing.
		 */
		NSPACE_REQ_UNLOCK();
		return;
	}

	/*
	 * Get out now if the resolver reported an error.
	 */
	if ((error = c->resolver_error) != 0) {
		goto out;
	}

	/*
	 * If the resolver did not specify any namespace shape criteria
	 * for letting the operation proceed, then get out now.
	 */
	if (c->orig_gencount == 0 && c->orig_syncroot == 0) {
		goto out;
	}

	/*
	 * We're going to have to acquire the mount rename lock and do
	 * some I/O in order to verify the criteria.  Mark the request
	 * as pending so no one else messes with it after we drop the
	 * NSPACE_REQ_LOCK.
	 */
	nspace_resolver_req_mark_completion_pending(req);
	NSPACE_REQ_UNLOCK();

	/*
	 * Lock out renames from changing the shape of the tree while
	 * we validate the criteria.
	 */
	mount_t locked_mp = req->r_vp->v_mount;
	mount_ref(locked_mp, 0);
	mount_lock_renames(locked_mp);

	if (c->orig_gencount != 0) {
		vp = req->r_vp;
		/* NOTE(review): error is 0 here; this check looks defensive. */
		if (error) {
			goto out_dropmount;
		}

		VATTR_INIT(&va);
		VATTR_WANTED(&va, va_recursive_gencount);
		error = vnode_getattr(vp, &va, vfs_context_kernel());
		if (error) {
			goto out_dropmount;
		}
		/* Fail with EBUSY if the tree changed under the resolver. */
		if (VATTR_NOT_RETURNED(&va, va_recursive_gencount) ||
		    va.va_recursive_gencount != c->orig_gencount) {
			printf("nspace.complete: gencount changed! (orig %llu cur %llu)\n",
			    c->orig_gencount, va.va_recursive_gencount);
			error = EBUSY;
			goto out_dropmount;
		}
	}

	/*
	 * Ignore orig_syncroot if a destination directory wasn't specified
	 * in the request.
	 */
	if (c->orig_syncroot != 0 && (vp = req->r_tdvp) != NULL) {
		uint64_t syncroot_id;

		/* NOTE(review): error is 0 here; this check looks defensive. */
		if (error) {
			goto out_dropmount;
		}

#ifndef APFSIOC_GET_SYNC_ROOT
#define APFSIOC_GET_SYNC_ROOT _IOR('J', 115, uint64_t)
#endif

		error = VNOP_IOCTL(vp, APFSIOC_GET_SYNC_ROOT,
		    (caddr_t)&syncroot_id, 0, vfs_context_kernel());
		if (error) {
			goto out_dropmount;
		}
		if (syncroot_id != c->orig_syncroot) {
			printf("nspace.complete: syncroot changed! (orig %llu cur %llu)\n",
			    c->orig_syncroot, syncroot_id);
			error = EBUSY;
			goto out_dropmount;
		}
	}

out_dropmount:
	mount_unlock_renames(locked_mp);
	mount_drop(locked_mp, 0);
	NSPACE_REQ_LOCK();

out:
	nspace_resolver_req_mark_complete(req, error);
	NSPACE_REQ_UNLOCK();
}
11731
/* The single process currently decorated as the namespace resolver. */
static struct proc *nspace_resolver_proc;
11733
11734 static int
nspace_resolver_get_proc_state(struct proc * p,int * is_resolver)11735 nspace_resolver_get_proc_state(struct proc *p, int *is_resolver)
11736 {
11737 *is_resolver = ((p->p_lflag & P_LNSPACE_RESOLVER) &&
11738 p == nspace_resolver_proc) ? 1 : 0;
11739 return 0;
11740 }
11741
11742 static boolean_t vfs_context_is_dataless_resolver(vfs_context_t);
11743
/*
 * Register or unregister 'p' as the namespace resolver.  Only a root
 * process with the dataless-resolver entitlement may do either.  Returns
 * EPERM on privilege failure, EBUSY if another resolver is registered.
 */
static int
nspace_resolver_set_proc_state(struct proc *p, int is_resolver)
{
	vfs_context_t ctx = vfs_context_current();
	int error = 0;

	//
	// The system filecoordinationd runs as uid == 0.  This also
	// has the nice side-effect of filtering out filecoordinationd
	// running in the simulator.
	//
	if (!vfs_context_issuser(ctx) ||
	    !vfs_context_is_dataless_resolver(ctx)) {
		return EPERM;
	}

	if (is_resolver) {
		NSPACE_REQ_LOCK();

		if (nspace_resolver_proc == NULL) {
			/* Decorate the process and record it as resolver. */
			proc_lock(p);
			p->p_lflag |= P_LNSPACE_RESOLVER;
			proc_unlock(p);
			nspace_resolver_proc = p;
		} else {
			error = EBUSY;
		}

		NSPACE_REQ_UNLOCK();
	} else {
		// This is basically just like the exit case.
		// nspace_resolver_exited() will verify that the
		// process is the resolver, and will clear the
		// global.
		nspace_resolver_exited(p);
	}

	return error;
}
11783
11784 static int
nspace_materialization_get_proc_state(struct proc * p,int * is_prevented)11785 nspace_materialization_get_proc_state(struct proc *p, int *is_prevented)
11786 {
11787 if ((p->p_lflag & P_LNSPACE_RESOLVER) != 0 ||
11788 (p->p_vfs_iopolicy &
11789 P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES) == 0) {
11790 *is_prevented = 1;
11791 } else {
11792 *is_prevented = 0;
11793 }
11794 return 0;
11795 }
11796
/*
 * Set or clear the materialize-dataless-files bit in the process's VFS I/O
 * policy.  A resolver process may not enable materialization for itself
 * (EBUSY); asking it to prevent materialization is a no-op success.
 */
static int
nspace_materialization_set_proc_state(struct proc *p, int is_prevented)
{
	if (p->p_lflag & P_LNSPACE_RESOLVER) {
		return is_prevented ? 0 : EBUSY;
	}

	if (is_prevented) {
		OSBitAndAtomic16(~((uint16_t)P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES), &p->p_vfs_iopolicy);
	} else {
		OSBitOrAtomic16((uint16_t)P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES, &p->p_vfs_iopolicy);
	}
	return 0;
}
11811
11812 static int
nspace_materialization_get_thread_state(int * is_prevented)11813 nspace_materialization_get_thread_state(int *is_prevented)
11814 {
11815 uthread_t ut = current_uthread();
11816
11817 *is_prevented = (ut->uu_flag & UT_NSPACE_NODATALESSFAULTS) ? 1 : 0;
11818 return 0;
11819 }
11820
/*
 * Set or clear the current thread's opt-out of dataless faults by toggling
 * UT_NSPACE_NODATALESSFAULTS in its uthread flags.  Always returns 0.
 */
static int
nspace_materialization_set_thread_state(int is_prevented)
{
	uthread_t ut = current_uthread();

	if (is_prevented) {
		ut->uu_flag |= UT_NSPACE_NODATALESSFAULTS;
	} else {
		ut->uu_flag &= ~UT_NSPACE_NODATALESSFAULTS;
	}
	return 0;
}
11833
11834 /* the vfs.nspace branch */
11835 SYSCTL_NODE(_vfs, OID_AUTO, nspace, CTLFLAG_RW | CTLFLAG_LOCKED, NULL, "vfs nspace hinge");
11836
11837 static int
sysctl_nspace_resolver(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)11838 sysctl_nspace_resolver(__unused struct sysctl_oid *oidp,
11839 __unused void *arg1, __unused int arg2, struct sysctl_req *req)
11840 {
11841 struct proc *p = req->p;
11842 int new_value, old_value, changed = 0;
11843 int error;
11844
11845 error = nspace_resolver_get_proc_state(p, &old_value);
11846 if (error) {
11847 return error;
11848 }
11849
11850 error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
11851 &changed);
11852 if (error == 0 && changed) {
11853 error = nspace_resolver_set_proc_state(p, new_value);
11854 }
11855 return error;
11856 }
11857
11858 /* decorate this process as the dataless file resolver */
11859 SYSCTL_PROC(_vfs_nspace, OID_AUTO, resolver,
11860 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
11861 0, 0, sysctl_nspace_resolver, "I", "");
11862
11863 static int
sysctl_nspace_prevent_materialization(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)11864 sysctl_nspace_prevent_materialization(__unused struct sysctl_oid *oidp,
11865 __unused void *arg1, __unused int arg2, struct sysctl_req *req)
11866 {
11867 struct proc *p = req->p;
11868 int new_value, old_value, changed = 0;
11869 int error;
11870
11871 error = nspace_materialization_get_proc_state(p, &old_value);
11872 if (error) {
11873 return error;
11874 }
11875
11876 error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
11877 &changed);
11878 if (error == 0 && changed) {
11879 error = nspace_materialization_set_proc_state(p, new_value);
11880 }
11881 return error;
11882 }
11883
11884 /* decorate this process as not wanting to materialize dataless files */
11885 SYSCTL_PROC(_vfs_nspace, OID_AUTO, prevent_materialization,
11886 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
11887 0, 0, sysctl_nspace_prevent_materialization, "I", "");
11888
11889 static int
sysctl_nspace_thread_prevent_materialization(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)11890 sysctl_nspace_thread_prevent_materialization(__unused struct sysctl_oid *oidp,
11891 __unused void *arg1, __unused int arg2, struct sysctl_req *req)
11892 {
11893 int new_value, old_value, changed = 0;
11894 int error;
11895
11896 error = nspace_materialization_get_thread_state(&old_value);
11897 if (error) {
11898 return error;
11899 }
11900
11901 error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
11902 &changed);
11903 if (error == 0 && changed) {
11904 error = nspace_materialization_set_thread_state(new_value);
11905 }
11906 return error;
11907 }
11908
11909 /* decorate this thread as not wanting to materialize dataless files */
11910 SYSCTL_PROC(_vfs_nspace, OID_AUTO, thread_prevent_materialization,
11911 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
11912 0, 0, sysctl_nspace_thread_prevent_materialization, "I", "");
11913
/*
 * Handler for vfs.nspace.complete: the resolver writes a request-ID/errno
 * tuple (two uint32_t's), optionally followed by a gencount and a syncroot
 * ID (each uint64_t), to report a request as completed.  Only the
 * registered resolver process may write here.  NOTE(review): the three
 * sysctl_io_opaque calls appear to consume consecutive spans of the new
 * value, making the trailing fields optional — confirm against the
 * sysctl_io_opaque contract.
 */
static int
sysctl_nspace_complete(__unused struct sysctl_oid *oidp, __unused void *arg1,
    __unused int arg2, struct sysctl_req *req)
{
	struct proc *p = req->p;
	uint32_t req_status[2] = { 0, 0 };
	uint64_t gencount = 0;
	uint64_t syncroot = 0;
	int error, is_resolver, changed = 0, other_changed;

	error = nspace_resolver_get_proc_state(p, &is_resolver);
	if (error) {
		return error;
	}

	/* Only the registered resolver may report completions. */
	if (!is_resolver) {
		return EPERM;
	}

	error = sysctl_io_opaque(req, req_status, sizeof(req_status),
	    &changed);
	if (error) {
		return error;
	}

	/*
	 * Get the gencount if it was passed.  Ignore errors, because
	 * it's optional.
	 */
	error = sysctl_io_opaque(req, &gencount, sizeof(gencount),
	    &other_changed);
	if (error) {
		gencount = 0;
		error = 0;
	}

	/*
	 * ...and now the syncroot ID, which is optional as well.
	 */
	error = sysctl_io_opaque(req, &syncroot, sizeof(syncroot),
	    &other_changed);
	if (error) {
		syncroot = 0;
		error = 0;
	}

	/*
	 * req_status[0] is the req_id
	 *
	 * req_status[1] is the errno
	 */
	if (error == 0 && changed) {
		const struct nspace_resolver_completion_data cd = {
			.req_id = req_status[0],
			.resolver_error = req_status[1],
			.orig_gencount = gencount,
			.orig_syncroot = syncroot,
		};
		nspace_resolver_req_completed(&cd);
	}
	return error;
}
11976
11977 /* Resolver reports completed reqs here. */
11978 SYSCTL_PROC(_vfs_nspace, OID_AUTO, complete,
11979 CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
11980 0, 0, sysctl_nspace_complete, "-", "");
11981
11982 #endif /* CONFIG_DATALESS_FILES */
11983
11984 #if CONFIG_DATALESS_FILES
11985 #define __no_dataless_unused /* nothing */
11986 #else
11987 #define __no_dataless_unused __unused
11988 #endif
11989
/*
 * Decide whether this vfs context may materialize dataless files.
 *
 * Returns:
 *   0            materialization may proceed
 *   EDEADLK      materialization is prevented; fail the operation
 *   EJUSTRETURN  materialization is prevented, but the caller is an
 *                entitled dataless manipulator and (subject to caller-side
 *                checks) may operate on the object as if it were not
 *                dataless
 *
 * The order of the checks below is the contract: kernel context first,
 * then the manipulation entitlement, then per-thread decorations, then
 * the process iopolicy.
 */
int
vfs_context_dataless_materialization_is_prevented(
	vfs_context_t const ctx __no_dataless_unused)
{
#if CONFIG_DATALESS_FILES
	proc_t const p = vfs_context_proc(ctx);
	thread_t const t = vfs_context_thread(ctx);
	uthread_t const ut = t ? get_bsdthread_info(t) : NULL;

	/*
	 * Kernel context ==> return EDEADLK, as we would with any random
	 * process decorated as no-materialize.
	 */
	if (ctx == vfs_context_kernel()) {
		return EDEADLK;
	}

	/*
	 * If the process has the dataless-manipulation entitlement,
	 * materialization is prevented, and depending on the kind
	 * of file system operation, things get to proceed as if the
	 * object is not dataless.
	 */
	if (vfs_context_is_dataless_manipulator(ctx)) {
		return EJUSTRETURN;
	}

	/*
	 * Per-thread decorations override any process-wide decorations.
	 * (Foundation uses this, and this overrides even the dataless-
	 * manipulation entitlement so as to make API contracts consistent.)
	 */
	if (ut != NULL) {
		if (ut->uu_flag & UT_NSPACE_NODATALESSFAULTS) {
			return EDEADLK;
		}
		if (ut->uu_flag & UT_NSPACE_FORCEDATALESSFAULTS) {
			return 0;
		}
	}

	/*
	 * If the process's iopolicy specifies that dataless files
	 * can be materialized, then we let it go ahead.
	 */
	if (p->p_vfs_iopolicy & P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES) {
		return 0;
	}
#endif /* CONFIG_DATALESS_FILES */

	/*
	 * The default behavior is to not materialize dataless files;
	 * return to the caller that deadlock was detected.
	 */
	return EDEADLK;
}
12046
/*
 * One-time initialization of the dataless-file resolver machinery:
 * allocates the hash table used to track in-flight materialization
 * requests (hashinit also fills in the hash mask).
 */
void
nspace_resolver_init(void)
{
#if CONFIG_DATALESS_FILES
	nspace_resolver_request_hashtbl =
	    hashinit(NSPACE_RESOLVER_REQ_HASHSIZE,
	    M_VNODE /* XXX */, &nspace_resolver_request_hashmask);
#endif /* CONFIG_DATALESS_FILES */
}
12056
/*
 * Called on process exit.  If the exiting process is the registered
 * dataless-file resolver, complete every outstanding materialization
 * request with ETIMEDOUT (waiters see the same error as an unresponsive
 * resolver) and clear the resolver registration.
 */
void
nspace_resolver_exited(struct proc *p __no_dataless_unused)
{
#if CONFIG_DATALESS_FILES
	struct nspace_resolver_requesthead *bucket;
	struct nspace_resolver_request *req;
	u_long idx;

	NSPACE_REQ_LOCK();

	if ((p->p_lflag & P_LNSPACE_RESOLVER) &&
	    p == nspace_resolver_proc) {
		/* Walk every hash bucket and fail each pending request. */
		for (idx = 0; idx <= nspace_resolver_request_hashmask; idx++) {
			bucket = &nspace_resolver_request_hashtbl[idx];
			LIST_FOREACH(req, bucket, r_hashlink) {
				/*
				 * Let any in-flight completion finish first
				 * before marking the request complete.
				 */
				nspace_resolver_req_wait_pending_completion(req);
				nspace_resolver_req_mark_complete(req,
				    ETIMEDOUT);
			}
		}
		nspace_resolver_proc = NULL;
	}

	NSPACE_REQ_UNLOCK();
#endif /* CONFIG_DATALESS_FILES */
}
12083
12084 #define DATALESS_RESOLVER_ENTITLEMENT \
12085 "com.apple.private.vfs.dataless-resolver"
12086 #define DATALESS_MANIPULATION_ENTITLEMENT \
12087 "com.apple.private.vfs.dataless-manipulation"
12088
12089 #if CONFIG_DATALESS_FILES
12090 /*
12091 * Return TRUE if the vfs context is associated with the dataless
12092 * resolver.
12093 */
12094 static boolean_t
vfs_context_is_dataless_resolver(vfs_context_t ctx __no_dataless_unused)12095 vfs_context_is_dataless_resolver(vfs_context_t ctx __no_dataless_unused)
12096 {
12097 return IOTaskHasEntitlement(vfs_context_task(ctx),
12098 DATALESS_RESOLVER_ENTITLEMENT);
12099 }
12100 #endif /* CONFIG_DATALESS_FILES */
12101
12102 /*
12103 * Return TRUE if the vfs context is associated with a process entitled
12104 * for dataless manipulation.
12105 *
12106 * XXX Arguably belongs in vfs_subr.c, but is here because of the
12107 * complication around CONFIG_DATALESS_FILES.
12108 */
12109 boolean_t
vfs_context_is_dataless_manipulator(vfs_context_t ctx __no_dataless_unused)12110 vfs_context_is_dataless_manipulator(vfs_context_t ctx __no_dataless_unused)
12111 {
12112 #if CONFIG_DATALESS_FILES
12113 task_t task = vfs_context_task(ctx);
12114 return IOTaskHasEntitlement(task, DATALESS_MANIPULATION_ENTITLEMENT) ||
12115 IOTaskHasEntitlement(task, DATALESS_RESOLVER_ENTITLEMENT);
12116 #else
12117 return false;
12118 #endif /* CONFIG_DATALESS_FILES */
12119 }
12120
12121 #if CONFIG_DATALESS_FILES
12122 static void
log_materialization_prevented(vnode_t vp,uint64_t op)12123 log_materialization_prevented(vnode_t vp, uint64_t op)
12124 {
12125 char p_name[MAXCOMLEN + 1];
12126 char *vntype;
12127 proc_selfname(&p_name[0], sizeof(p_name));
12128
12129 if (vp->v_type == VREG) {
12130 vntype = "File";
12131 } else if (vp->v_type == VDIR) {
12132 vntype = "Dir";
12133 } else if (vp->v_type == VLNK) {
12134 vntype = "SymLink";
12135 } else {
12136 vntype = "Other";
12137 }
12138
12139 #if DEVELOPMENT
12140 struct vnode_attr *vap = kalloc_type(struct vnode_attr, Z_WAITOK);
12141
12142 VATTR_INIT(vap);
12143 VATTR_WANTED(vap, va_fsid);
12144 VATTR_WANTED(vap, va_fileid);
12145 if (vnode_getattr(vp, vap, vfs_context_current()) == 0) {
12146 os_log_debug(OS_LOG_DEFAULT,
12147 "NSPACE process %s (pid %d) is decorated as no-materialization (op %lld; %s) fsid 0x%08x/%u fileid=%llu",
12148 p_name, proc_selfpid(), op, vntype,
12149 vap->va_fsid, vap->va_fsid, vap->va_fileid);
12150 } else
12151 #endif
12152 {
12153 os_log_debug(OS_LOG_DEFAULT,
12154 "NSPACE process %s (pid %d) is decorated as no-materialization (op %lld; %s)",
12155 p_name, proc_selfpid(), op, vntype);
12156 }
12157 #if DEVELOPMENT
12158 kfree_type(struct vnode_attr, vap);
12159 #endif
12160 }
12161 #endif /* CONFIG_DATALESS_FILES */
12162
/*
 * vfs_materialize_item: common worker for vfs_materialize_file(),
 * vfs_materialize_dir() and vfs_materialize_reparent().
 *
 * Builds the vnode's path, sends a resolve request for 'op' to the
 * user-space file resolver via the filecoordinationd Mach port, and
 * (interruptibly) waits for the resolver to report completion through
 * the vfs.nspace.complete sysctl.
 *
 * Returns 0 when the caller may proceed with the operation, ETIMEDOUT
 * when the resolver is unreachable, or whatever error policy/resolver
 * logic produces (EJUSTRETURN from the policy check is translated into
 * 0 or EBADF below, depending on 'op' and the vnode type).
 */
static int
vfs_materialize_item(
	vnode_t vp __no_dataless_unused,
	uint32_t op __no_dataless_unused,
	int64_t offset __no_dataless_unused,
	int64_t size __no_dataless_unused,
	char *lookup_name __no_dataless_unused,
	size_t const namelen __no_dataless_unused,
	vnode_t tdvp __no_dataless_unused)
{
#if CONFIG_DATALESS_FILES
	kern_return_t kern_ret;
	mach_port_t mach_port;
	char *path = NULL;
	vfs_context_t context;
	int path_len;
	int error;
	audit_token_t atoken;
	enum vtype vp_vtype;

	/* Swap files are special; ignore them */
	if (vnode_isswap(vp)) {
		return 0;
	}

	/*
	 * NAMESPACE_HANDLER_SNAPSHOT_EVENT and NAMESPACE_HANDLER_TRACK_EVENT
	 * are no longer used nor supported.
	 */
	if (op & NAMESPACE_HANDLER_SNAPSHOT_EVENT) {
		os_log_debug(OS_LOG_DEFAULT, "NSPACE SNAPSHOT not handled");
		return ENOTSUP;
	}
	if (op & NAMESPACE_HANDLER_TRACK_EVENT) {
		os_log_debug(OS_LOG_DEFAULT, "NSPACE TRACK not handled");
		return ENOTSUP;
	}

	/* Normalize 'op': strip the event-type bits, keeping the op code. */
	op &= ~NAMESPACE_HANDLER_EVENT_TYPE_MASK;

	/*
	 * To-directory is only meaningful for rename operations;
	 * ignore it if someone handed one to us unexpectedly.
	 */
	if (op != NAMESPACE_HANDLER_RENAME_OP) {
		tdvp = NULL;
	}

	context = vfs_context_current();

	/* Remember this for later. */
	vp_vtype = vnode_vtype(vp);

	/* Policy check: may this context materialize dataless files? */
	error = vfs_context_dataless_materialization_is_prevented(context);
	if (error) {
		log_materialization_prevented(vp, op);
		goto out_check_errors;
	}

	kern_ret = host_get_filecoordinationd_port(host_priv_self(),
	    &mach_port);
	if (kern_ret != KERN_SUCCESS || !IPC_PORT_VALID(mach_port)) {
		os_log_error(OS_LOG_DEFAULT, "NSPACE no port");
		/*
		 * Treat this like being unable to access the backing store
		 * server.
		 */
		return ETIMEDOUT;
	}

	/* Grow the path buffer until vn_getpath fits or the cap is hit. */
	int path_alloc_len = MAXPATHLEN;
	do {
		/*
		 * NOTE(review): returning ENOMEM here leaks the send right
		 * on mach_port acquired above — confirm whether this should
		 * go through out_release_port instead.
		 */
		path = kalloc_data(path_alloc_len, Z_WAITOK | Z_ZERO);
		if (path == NULL) {
			return ENOMEM;
		}

		path_len = path_alloc_len;
		error = vn_getpath(vp, path, &path_len);
		if (error == 0) {
			break;
		} else if (error == ENOSPC) {
			kfree_data(path, path_alloc_len);
			path = NULL;
		} else {
			goto out_release_port;
		}
	} while (error == ENOSPC && (path_alloc_len += MAXPATHLEN) && path_alloc_len <= FSGETPATH_MAXBUFLEN);

	/*
	 * NOTE(review): if the loop above exhausts FSGETPATH_MAXBUFLEN,
	 * 'path' is NULL and error is still ENOSPC here, but error is
	 * overwritten by the next call and a NULL path could reach the
	 * resolve IPC below — verify whether this case is reachable.
	 */
	error = vfs_context_copy_audit_token(context, &atoken);
	if (error) {
		goto out_release_port;
	}

	/* Stack-allocated request; lives in the hash table while pending. */
	struct nspace_resolver_request req = {
		.r_req_id = next_nspace_req_id(),
		.r_vp = vp,
		.r_tdvp = tdvp,
	};

	error = nspace_resolver_req_add(&req);
	if (error) {
		goto out_release_port;
	}

	os_log_debug(OS_LOG_DEFAULT, "NSPACE resolve_path call");

	if (op == NAMESPACE_HANDLER_RENAME_OP && tdvp != NULL) {
		char *dest_path = NULL;
		int dest_path_len;

		dest_path = zalloc(ZV_NAMEI);
		dest_path_len = MAXPATHLEN;

		/*
		 * NOTE(review): this error path jumps to out_release_port
		 * without removing 'req' (a stack object) from the hash
		 * table — other error paths use out_req_remove; confirm
		 * this is not a stale-entry bug.
		 */
		error = vn_getpath(tdvp, dest_path, &dest_path_len);
		if (error) {
			zfree(ZV_NAMEI, dest_path);
			goto out_release_port;
		}

		/*
		 * Force setting NAMESPACE_HANDLER_NSPACE_EVENT for
		 * compatibility with existing agents in user-space
		 * who get passed this value.
		 */
		kern_ret = send_vfs_resolve_reparent_with_audit_token(mach_port,
		    req.r_req_id,
		    op | NAMESPACE_HANDLER_NSPACE_EVENT,
		    path, dest_path, atoken);

		zfree(ZV_NAMEI, dest_path);
	} else if (vp_vtype == VDIR) {
		char *tmpname = NULL;

		/*
		 * If the caller provided a lookup_name *and* a name length,
		 * then we assume the lookup_name is not NUL-terminated.
		 * Allocate a temporary buffer in this case to provide
		 * a NUL-terminated path name to the IPC call.
		 */
		if (lookup_name != NULL && namelen != 0) {
			if (namelen >= PATH_MAX) {
				error = EINVAL;
				goto out_req_remove;
			}
			tmpname = zalloc(ZV_NAMEI);
			strlcpy(tmpname, lookup_name, namelen + 1);
			lookup_name = tmpname;
		} else if (lookup_name != NULL) {
			/*
			 * If the caller provided a lookup_name with a
			 * zero name length, then we assume it's NUL-
			 * terminated. Verify it has a valid length.
			 */
			if (strlen(lookup_name) >= PATH_MAX) {
				error = EINVAL;
				goto out_req_remove;
			}
		}

		/* (See above.) */
		kern_ret = send_vfs_resolve_dir_with_audit_token(mach_port,
		    req.r_req_id,
		    op | NAMESPACE_HANDLER_NSPACE_EVENT,
		    lookup_name == NULL ? "" : lookup_name, path, atoken);

		if (tmpname != NULL) {
			zfree(ZV_NAMEI, tmpname);

			/*
			 * Poison lookup_name rather than reference
			 * freed memory.
			 */
			lookup_name = NULL;
		}
	} else {
		/* (See above.) */
		kern_ret = send_vfs_resolve_file_with_audit_token(mach_port,
		    req.r_req_id,
		    op | NAMESPACE_HANDLER_NSPACE_EVENT,
		    offset, size, path, atoken);
	}
	if (kern_ret != KERN_SUCCESS) {
		/*
		 * Also treat this like being unable to access the backing
		 * store server.
		 */
		os_log_error(OS_LOG_DEFAULT, "NSPACE resolve failure: %d",
		    kern_ret);
		error = ETIMEDOUT;
		goto out_req_remove;
	}

	/*
	 * Give back the memory we allocated earlier while we wait; we
	 * no longer need it.
	 */
	kfree_data(path, path_alloc_len);
	path = NULL;

	/*
	 * Request has been submitted to the resolver. Now (interruptibly)
	 * wait for completion. Upon return, the request will have been
	 * removed from the lookup table.
	 */
	error = nspace_resolver_req_wait(&req);

out_release_port:
	if (path != NULL) {
		kfree_data(path, path_alloc_len);
		path = NULL;
	}
	ipc_port_release_send(mach_port);

out_check_errors:
	/*
	 * The file resolver owns the logic about what error to return
	 * to the caller. We only need to handle a couple of special
	 * cases here:
	 */
	if (error == EJUSTRETURN) {
		/*
		 * The requesting process is allowed to interact with
		 * dataless objects. Make a couple of sanity-checks
		 * here to ensure the action makes sense.
		 */
		switch (op) {
		case NAMESPACE_HANDLER_WRITE_OP:
		case NAMESPACE_HANDLER_TRUNCATE_OP:
		case NAMESPACE_HANDLER_RENAME_OP:
			/*
			 * This handles the case of the resolver itself
			 * writing data to the file (or throwing it
			 * away).
			 */
			error = 0;
			break;
		case NAMESPACE_HANDLER_READ_OP:
		case NAMESPACE_HANDLER_LOOKUP_OP:
			/*
			 * This handles the case of the resolver needing
			 * to look up inside of a dataless directory while
			 * it's in the process of materializing it (for
			 * example, creating files or directories).
			 */
			error = (vp_vtype == VDIR) ? 0 : EBADF;
			break;
		default:
			error = EBADF;
			break;
		}
	}

	return error;

out_req_remove:
	nspace_resolver_req_remove(&req);
	goto out_release_port;
#else
	return ENOTSUP;
#endif /* CONFIG_DATALESS_FILES */
}
12426
12427 /*
12428 * vfs_materialize_file: Materialize a regular file.
12429 *
12430 * Inputs:
12431 * vp The dataless file to be materialized.
12432 *
12433 * op What kind of operation is being performed:
12434 * -> NAMESPACE_HANDLER_READ_OP
12435 * -> NAMESPACE_HANDLER_WRITE_OP
12436 * -> NAMESPACE_HANDLER_LINK_CREATE
12437 * -> NAMESPACE_HANDLER_DELETE_OP
12438 * -> NAMESPACE_HANDLER_TRUNCATE_OP
12439 * -> NAMESPACE_HANDLER_RENAME_OP
12440 *
12441 * offset offset of I/O for READ or WRITE. Ignored for
12442 * other ops.
12443 *
 * size		size of I/O for READ or WRITE.  Ignored for
 *			other ops.
12446 *
12447 * If offset or size are -1 for a READ or WRITE, then the resolver should
12448 * consider the range to be unknown.
12449 *
12450 * Upon successful return, the caller may proceed with the operation.
12451 * N.B. the file may still be "dataless" in this case.
12452 */
12453 int
vfs_materialize_file(struct vnode * vp,uint64_t op,int64_t offset,int64_t size)12454 vfs_materialize_file(
12455 struct vnode *vp,
12456 uint64_t op,
12457 int64_t offset,
12458 int64_t size)
12459 {
12460 if (vp->v_type != VREG) {
12461 return EFTYPE;
12462 }
12463 return vfs_materialize_item(vp, (uint32_t)op, offset, size, NULL, 0,
12464 NULL);
12465 }
12466
12467 /*
12468 * vfs_materialize_dir:
12469 *
12470 * Inputs:
12471 * vp The dataless directory to be materialized.
12472 *
12473 * op What kind of operation is being performed:
12474 * -> NAMESPACE_HANDLER_READ_OP
12475 * -> NAMESPACE_HANDLER_WRITE_OP
12476 * -> NAMESPACE_HANDLER_DELETE_OP
12477 * -> NAMESPACE_HANDLER_RENAME_OP
12478 * -> NAMESPACE_HANDLER_LOOKUP_OP
12479 *
12480 * lookup_name Name being looked up for a LOOKUP op. Ignored for
12481 * other ops. May or may not be NUL-terminated; see below.
12482 *
12483 * namelen If non-zero, then lookup_name is assumed to not be NUL-
12484 * terminated and namelen is the number of valid bytes in
12485 * lookup_name. If zero, then lookup_name is assumed to be
12486 * NUL-terminated.
12487 *
12488 * Upon successful return, the caller may proceed with the operation.
12489 * N.B. the directory may still be "dataless" in this case.
12490 */
12491 int
vfs_materialize_dir(struct vnode * vp,uint64_t op,char * lookup_name,size_t namelen)12492 vfs_materialize_dir(
12493 struct vnode *vp,
12494 uint64_t op,
12495 char *lookup_name,
12496 size_t namelen)
12497 {
12498 if (vp->v_type != VDIR) {
12499 return EFTYPE;
12500 }
12501 if (op == NAMESPACE_HANDLER_LOOKUP_OP && lookup_name == NULL) {
12502 return EINVAL;
12503 }
12504 return vfs_materialize_item(vp, (uint32_t)op, 0, 0, lookup_name,
12505 namelen, NULL);
12506 }
12507
12508 /*
12509 * vfs_materialize_reparent:
12510 *
12511 * Inputs:
12512 * vp The dataless file or directory to be materialized.
12513 *
12514 * tdvp The new parent directory for the dataless file.
12515 *
12516 * Upon successful return, the caller may proceed with the operation.
12517 * N.B. the item may still be "dataless" in this case.
12518 */
12519 int
vfs_materialize_reparent(vnode_t vp,vnode_t tdvp)12520 vfs_materialize_reparent(vnode_t vp, vnode_t tdvp)
12521 {
12522 if (vp->v_type != VDIR && vp->v_type != VREG) {
12523 return EFTYPE;
12524 }
12525 return vfs_materialize_item(vp, NAMESPACE_HANDLER_RENAME_OP,
12526 0, 0, NULL, 0, tdvp);
12527 }
12528
12529 #if 0
12530 static int
12531 build_volfs_path(struct vnode *vp, char *path, int *len)
12532 {
12533 struct vnode_attr va;
12534 int ret;
12535
12536 VATTR_INIT(&va);
12537 VATTR_WANTED(&va, va_fsid);
12538 VATTR_WANTED(&va, va_fileid);
12539
12540 if (vnode_getattr(vp, &va, vfs_context_kernel()) != 0) {
12541 *len = snprintf(path, *len, "/non/existent/path/because/vnode_getattr/failed") + 1;
12542 ret = -1;
12543 } else {
12544 *len = snprintf(path, *len, "/.vol/%d/%lld", (dev_t)va.va_fsid, va.va_fileid) + 1;
12545 ret = 0;
12546 }
12547
12548 return ret;
12549 }
12550 #endif
12551
12552 static unsigned long
fsctl_bogus_command_compat(unsigned long cmd)12553 fsctl_bogus_command_compat(unsigned long cmd)
12554 {
12555 switch (cmd) {
12556 case IOCBASECMD(FSIOC_SYNC_VOLUME):
12557 return FSIOC_SYNC_VOLUME;
12558 case IOCBASECMD(FSIOC_ROUTEFS_SETROUTEID):
12559 return FSIOC_ROUTEFS_SETROUTEID;
12560 case IOCBASECMD(FSIOC_SET_PACKAGE_EXTS):
12561 return FSIOC_SET_PACKAGE_EXTS;
12562 case IOCBASECMD(FSIOC_SET_FSTYPENAME_OVERRIDE):
12563 return FSIOC_SET_FSTYPENAME_OVERRIDE;
12564 case IOCBASECMD(DISK_CONDITIONER_IOC_GET):
12565 return DISK_CONDITIONER_IOC_GET;
12566 case IOCBASECMD(DISK_CONDITIONER_IOC_SET):
12567 return DISK_CONDITIONER_IOC_SET;
12568 case IOCBASECMD(FSIOC_FIOSEEKHOLE):
12569 return FSIOC_FIOSEEKHOLE;
12570 case IOCBASECMD(FSIOC_FIOSEEKDATA):
12571 return FSIOC_FIOSEEKDATA;
12572 case IOCBASECMD(SPOTLIGHT_IOC_GET_MOUNT_TIME):
12573 return SPOTLIGHT_IOC_GET_MOUNT_TIME;
12574 case IOCBASECMD(SPOTLIGHT_IOC_GET_LAST_MTIME):
12575 return SPOTLIGHT_IOC_GET_LAST_MTIME;
12576 }
12577
12578 return cmd;
12579 }
12580
/*
 * chflags0() setattr callback used by handle_flags(): forwards the
 * compare-and-swap request ('arg' is a struct fsioc_cas_bsdflags *) to
 * the filesystem via the FSIOC_CAS_BSDFLAGS ioctl.
 */
static int
cas_bsdflags_setattr(vnode_t vp, void *arg, vfs_context_t ctx)
{
	return VNOP_IOCTL(vp, FSIOC_CAS_BSDFLAGS, arg, FWRITE, ctx);
}
12586
/*
 * FSIOC_SYNC_VOLUME: sync the volume containing 'vp'.
 *
 * 'data' points to a uint32_t of FSCTL_SYNC_* flags.  On return, *arg_vp
 * is set to NULL to tell the caller that the vnode's iocount has already
 * been dropped here.
 */
static int __attribute__((noinline))
handle_sync_volume(vnode_t vp, vnode_t *arg_vp, caddr_t data, vfs_context_t ctx)
{
	struct vfs_attr vfa;
	mount_t mp = vp->v_mount;
	unsigned arg;
	int error;

	/* record vid of vp so we can drop it below. */
	uint32_t vvid = vp->v_id;

	/*
	 * Then grab mount_iterref so that we can release the vnode.
	 * Without this, a thread may call vnode_iterate_prepare then
	 * get into a deadlock because we've never released the root vp
	 */
	error = mount_iterref(mp, 0);
	if (error) {
		return error;
	}
	/* Keep a long-term reference while the iocount is dropped. */
	vnode_hold(vp);
	vnode_put(vp);

	arg = MNT_NOWAIT;
	if (*(uint32_t*)data & FSCTL_SYNC_WAIT) {
		arg = MNT_WAIT;
	}

	/*
	 * If the filesystem supports multiple filesystems in a
	 * partition (e.g. APFS volumes in a container), it knows
	 * that the waitfor argument to VFS_SYNC are flags.
	 */
	VFSATTR_INIT(&vfa);
	VFSATTR_WANTED(&vfa, f_capabilities);
	if ((vfs_getattr(mp, &vfa, vfs_context_current()) == 0) &&
	    VFSATTR_IS_SUPPORTED(&vfa, f_capabilities) &&
	    ((vfa.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_SHARED_SPACE)) &&
	    ((vfa.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_SHARED_SPACE))) {
		arg |= MNT_VOLUME;
	}

	/* issue the sync for this volume */
	(void)sync_callback(mp, &arg);

	/*
	 * Then release the mount_iterref once we're done syncing; it's not
	 * needed for the VNOP_IOCTL below
	 */
	mount_iterdrop(mp);

	/*
	 * NOTE(review): this tests 'arg' (which now holds MNT_* flags)
	 * against FSCTL_SYNC_FULLSYNC rather than the caller's request
	 * word in *data — confirm the flag values make this intentional.
	 */
	if (arg & FSCTL_SYNC_FULLSYNC) {
		/* re-obtain vnode iocount on the root vp, if possible */
		error = vnode_getwithvid(vp, vvid);
		if (error == 0) {
			error = VNOP_IOCTL(vp, F_FULLFSYNC, (caddr_t)NULL, 0, ctx);
			vnode_put(vp);
		}
	}
	vnode_drop(vp);
	/* mark the argument VP as having been released */
	*arg_vp = NULL;
	return error;
}
12651
12652 #if ROUTEFS
12653 static int __attribute__((noinline))
handle_routes(user_addr_t udata)12654 handle_routes(user_addr_t udata)
12655 {
12656 char routepath[MAXPATHLEN];
12657 size_t len = 0;
12658 int error;
12659
12660 if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
12661 return error;
12662 }
12663 bzero(routepath, MAXPATHLEN);
12664 error = copyinstr(udata, &routepath[0], MAXPATHLEN, &len);
12665 if (error) {
12666 return error;
12667 }
12668 error = routefs_kernel_mount(routepath);
12669 return error;
12670 }
12671 #endif
12672
12673 static int __attribute__((noinline))
handle_flags(vnode_t vp,caddr_t data,vfs_context_t ctx)12674 handle_flags(vnode_t vp, caddr_t data, vfs_context_t ctx)
12675 {
12676 struct fsioc_cas_bsdflags *cas = (struct fsioc_cas_bsdflags *)data;
12677 struct vnode_attr va;
12678 int error;
12679
12680 VATTR_INIT(&va);
12681 VATTR_SET(&va, va_flags, cas->new_flags);
12682
12683 error = chflags0(vp, &va, cas_bsdflags_setattr, cas, ctx);
12684
12685 #if CONFIG_FSE
12686 if (error == 0 && cas->expected_flags == cas->actual_flags && need_fsevent(FSE_STAT_CHANGED, vp)) {
12687 add_fsevent(FSE_STAT_CHANGED, ctx, FSE_ARG_VNODE, vp, FSE_ARG_DONE);
12688 }
12689 #endif
12690
12691 return error;
12692 }
12693
12694 static int __attribute__((noinline))
handle_auth(vnode_t vp,u_long cmd,caddr_t data,u_long options,vfs_context_t ctx)12695 handle_auth(vnode_t vp, u_long cmd, caddr_t data, u_long options, vfs_context_t ctx)
12696 {
12697 struct mount *mp = NULL;
12698 errno_t rootauth = 0;
12699
12700 mp = vp->v_mount;
12701
12702 /*
12703 * query the underlying FS and see if it reports something
12704 * sane for this vnode. If volume is authenticated via
12705 * chunklist, leave that for the caller to determine.
12706 */
12707 rootauth = VNOP_IOCTL(vp, cmd, data, (int)options, ctx);
12708
12709 return rootauth;
12710 }
12711
12712 #define SET_PACKAGE_EXTENSION_ENTITLEMENT \
12713 "com.apple.private.kernel.set-package-extensions"
12714
/*
 * Make a filesystem-specific control call.
 *
 * Common implementation behind fsctl()/ffsctl(): marshal the ioctl-style
 * argument into a kernel buffer, handle the generic (filesystem-
 * independent) selectors inline, and pass anything else down to the
 * filesystem via VNOP_IOCTL().  On success, any IOC_OUT payload is
 * copied back out to 'udata'.  A handler may set *arg_vp to NULL to
 * indicate it has already released the vnode.
 */
/* ARGSUSED */
static int
fsctl_internal(proc_t p, vnode_t *arg_vp, u_long cmd, user_addr_t udata, u_long options, vfs_context_t ctx)
{
	int error = 0;
	boolean_t is64bit;
	u_int size;
#define STK_PARAMS 128
	char stkbuf[STK_PARAMS] = {0};
	caddr_t data, memp;
	vnode_t vp = *arg_vp;

	/* fsctl does not apply to device special files. */
	if (vp->v_type == VCHR || vp->v_type == VBLK) {
		return ENOTTY;
	}

	/* Map legacy base-command values onto the modern selectors. */
	cmd = fsctl_bogus_command_compat(cmd);

	size = IOCPARM_LEN(cmd);
	if (size > IOCPARM_MAX) {
		return EINVAL;
	}

	is64bit = proc_is64bit(p);

	memp = NULL;

	/* Use the stack buffer when the payload fits; else heap-allocate. */
	if (size > sizeof(stkbuf)) {
		if ((memp = (caddr_t)kalloc_data(size, Z_WAITOK)) == 0) {
			return ENOMEM;
		}
		data = memp;
	} else {
		data = &stkbuf[0];
	};

	if (cmd & IOC_IN) {
		if (size) {
			/* Copy the input payload in from user space. */
			error = copyin(udata, data, size);
			if (error) {
				if (memp) {
					kfree_data(memp, size);
				}
				return error;
			}
		} else {
			/* No payload: pass the user pointer itself by value. */
			if (is64bit) {
				*(user_addr_t *)data = udata;
			} else {
				*(uint32_t *)data = (uint32_t)udata;
			}
		};
	} else if ((cmd & IOC_OUT) && size) {
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		bzero(data, size);
	} else if (cmd & IOC_VOID) {
		if (is64bit) {
			*(user_addr_t *)data = udata;
		} else {
			*(uint32_t *)data = (uint32_t)udata;
		}
	}

	/* Check to see if it's a generic command */
	switch (cmd) {
	case FSIOC_SYNC_VOLUME:
		error = handle_sync_volume(vp, arg_vp, data, ctx);
		break;

	case FSIOC_ROUTEFS_SETROUTEID:
#if ROUTEFS
		error = handle_routes(udata);
#endif
		break;

	case FSIOC_SET_PACKAGE_EXTS: {
		user_addr_t ext_strings;
		uint32_t num_entries;
		uint32_t max_width;

		/* Setting package extensions is entitlement-gated. */
		if (!IOTaskHasEntitlement(vfs_context_task(ctx),
		    SET_PACKAGE_EXTENSION_ENTITLEMENT)) {
			error = EPERM;
			break;
		}

		if ((is64bit && size != sizeof(user64_package_ext_info))
		    || (is64bit == 0 && size != sizeof(user32_package_ext_info))) {
			// either you're 64-bit and passed a 64-bit struct or
			// you're 32-bit and passed a 32-bit struct. otherwise
			// it's not ok.
			error = EINVAL;
			break;
		}

		if (is64bit) {
			if (sizeof(user64_addr_t) > sizeof(user_addr_t)) {
				assert(((user64_package_ext_info *)data)->strings <= UINT32_MAX);
			}
			ext_strings = (user_addr_t)((user64_package_ext_info *)data)->strings;
			num_entries = ((user64_package_ext_info *)data)->num_entries;
			max_width = ((user64_package_ext_info *)data)->max_width;
		} else {
			ext_strings = CAST_USER_ADDR_T(((user32_package_ext_info *)data)->strings);
			num_entries = ((user32_package_ext_info *)data)->num_entries;
			max_width = ((user32_package_ext_info *)data)->max_width;
		}
		error = set_package_extensions_table(ext_strings, num_entries, max_width);
	}
	break;

	case FSIOC_SET_FSTYPENAME_OVERRIDE:
	{
		mount_t mp;

		/* Overriding the fstype name is superuser-only. */
		if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
			break;
		}
		if ((mp = vp->v_mount) != NULL) {
			mount_lock(mp);
			if (data[0] != 0) {
				/* Require NUL termination within MFSTYPENAMELEN. */
				for (int i = 0; i < MFSTYPENAMELEN; i++) {
					if (!data[i]) {
						goto continue_copy;
					}
				}
				/*
				 * Getting here means we have a user data
				 * string which has no NULL termination in
				 * its first MFSTYPENAMELEN bytes. This is
				 * bogus, let's avoid strlcpy-ing the read
				 * data and return an error.
				 */
				error = EINVAL;
				goto unlock;
continue_copy:
				vfs_setfstypename_locked(mp, data);
				/* mtmfs gets special security treatment. */
				if (vfs_isrdonly(mp) &&
				    strcmp(data, "mtmfs") == 0) {
					mp->mnt_kern_flag |=
					    MNTK_EXTENDED_SECURITY;
					mp->mnt_kern_flag &=
					    ~MNTK_AUTH_OPAQUE;
				}
			} else if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
				/* Empty name: clear a previous override. */
				const char *name =
				    vfs_getfstypenameref_locked(mp, NULL);
				if (strcmp(name, "mtmfs") == 0) {
					mp->mnt_kern_flag &=
					    ~MNTK_EXTENDED_SECURITY;
				}
				vfs_setfstypename_locked(mp, NULL);
			}
unlock:
			mount_unlock(mp);
		}
	}
	break;

	case DISK_CONDITIONER_IOC_GET: {
		error = disk_conditioner_get_info(vp->v_mount, (disk_conditioner_info *)data);
	}
	break;

	case DISK_CONDITIONER_IOC_SET: {
		error = disk_conditioner_set_info(vp->v_mount, (disk_conditioner_info *)data);
	}
	break;

	case FSIOC_CAS_BSDFLAGS:
		error = handle_flags(vp, data, ctx);
		break;

	case FSIOC_FD_ONLY_OPEN_ONCE: {
		/* Report EBUSY if anyone besides the caller has the vnode open. */
		error = 0;
		if (vnode_usecount(vp) > 1) {
			vnode_lock_spin(vp);
			if (vp->v_lflag & VL_HASSTREAMS) {
				/* Named streams hold extra usecounts; recheck. */
				if (vnode_isinuse_locked(vp, 1, 1)) {
					error = EBUSY;
				}
			} else if (vnode_usecount(vp) > 1) {
				error = EBUSY;
			}
			vnode_unlock(vp);
		}
	}
	break;

	case FSIOC_EVAL_ROOTAUTH:
		error = handle_auth(vp, cmd, data, options, ctx);
		break;

	case FSIOC_TEST_FSE_ACCESS_GRANTED:
		error = test_fse_access_granted(vp, (unsigned long)udata, ctx);
		break;

#if CONFIG_EXCLAVES
	case FSIOC_EXCLAVE_FS_REGISTER:
		if (IOTaskHasEntitlement(vfs_context_task(ctx), EXCLAVE_FS_REGISTER_ENTITLEMENT)) {
			error = vfs_exclave_fs_register(((fsioc_exclave_fs_register_t *)data)->fs_tag, vp);
		} else {
			error = EPERM;
		}
		break;

	case FSIOC_EXCLAVE_FS_UNREGISTER:
		if (IOTaskHasEntitlement(vfs_context_task(ctx), EXCLAVE_FS_REGISTER_ENTITLEMENT)) {
			error = vfs_exclave_fs_unregister(vp);
		} else {
			error = EPERM;
		}
		break;

	case FSIOC_EXCLAVE_FS_GET_BASE_DIRS: {
		exclave_fs_get_base_dirs_t *get_base_dirs = ((exclave_fs_get_base_dirs_t *)data);
		exclave_fs_base_dir_t *dirs = NULL;
		if (!IOTaskHasEntitlement(vfs_context_task(ctx), EXCLAVE_FS_REGISTER_ENTITLEMENT)) {
			error = EPERM;
			break;
		}
		if (get_base_dirs->base_dirs) {
			/* Caller supplied a buffer; validate the count first. */
			if ((get_base_dirs->count == 0) || (get_base_dirs->count > EXCLAVE_FS_GET_BASE_DIRS_MAX_COUNT)) {
				error = EINVAL;
				break;
			}
			dirs = kalloc_type(exclave_fs_base_dir_t, get_base_dirs->count, Z_WAITOK | Z_ZERO);
			if (!dirs) {
				error = ENOSPC;
				break;
			}
		}
		error = vfs_exclave_fs_get_base_dirs(dirs, &get_base_dirs->count);
		if (!error && dirs) {
			error = copyout(dirs, (user_addr_t)get_base_dirs->base_dirs,
			    get_base_dirs->count * sizeof(exclave_fs_base_dir_t));
		}
		if (dirs) {
			kfree_type(exclave_fs_base_dir_t, get_base_dirs->count, dirs);
		}
	}
	break;
#endif

	default: {
		/*
		 * Other, known commands shouldn't be passed down here.
		 * (When adding a selector to this list, it may be prudent
		 * to consider adding it to the list in sys_fcntl_nocancel() as well.)
		 */
		switch (cmd) {
		case F_PUNCHHOLE:
		case F_TRIM_ACTIVE_FILE:
		case F_RDADVISE:
		case F_TRANSCODEKEY:
		case F_GETPROTECTIONLEVEL:
		case F_GETDEFAULTPROTLEVEL:
		case F_MAKECOMPRESSED:
		case F_SET_GREEDY_MODE:
		case F_SETSTATICCONTENT:
		case F_SETIOTYPE:
		case F_SETBACKINGSTORE:
		case F_GETPATH_MTMINFO:
		case APFSIOC_REVERT_TO_SNAPSHOT:
		case FSIOC_FIOSEEKHOLE:
		case FSIOC_FIOSEEKDATA:
		case HFS_GET_BOOT_INFO:
		case HFS_SET_BOOT_INFO:
		case FIOPINSWAP:
		case F_CHKCLEAN:
		case F_FULLFSYNC:
		case F_BARRIERFSYNC:
		case F_FREEZE_FS:
		case F_THAW_FS:
		case FSIOC_KERNEL_ROOTAUTH:
		case FSIOC_GRAFT_FS:
		case FSIOC_UNGRAFT_FS:
		case FSIOC_AUTH_FS:
		case F_SPECULATIVE_READ:
		case F_ATTRIBUTION_TAG:
		case F_TRANSFEREXTENTS:
		case F_ASSERT_BG_ACCESS:
		case F_RELEASE_BG_ACCESS:
			error = EINVAL;
			goto outdrop;
		}
		/* Invoke the filesystem-specific code */
		error = VNOP_IOCTL(vp, cmd, data, (int)options, ctx);
	}
	} /* end switch stmt */

	/*
	 * if no errors, copy any data to user. Size was
	 * already set and checked above.
	 */
	if (error == 0 && (cmd & IOC_OUT) && size) {
		error = copyout(data, udata, size);
	}

outdrop:
	if (memp) {
		kfree_data(memp, size);
	}

	return error;
}
13027
/* ARGSUSED */
/*
 * fsctl system call: perform a filesystem-specific control operation on
 * the object named by uap->path.
 *
 * Resolves the path (honoring FSOPT_NOFOLLOW, and forcing an uncached,
 * firmlink-preserving lookup for FSIOC_FIRMLINK_CTL), runs the MAC
 * fsctl check, then hands the vnode to fsctl_internal(), which
 * interprets uap->cmd / uap->data / uap->options.
 *
 * Returns: 0 on success, otherwise an errno value.
 */
int
fsctl(proc_t p, struct fsctl_args *uap, __unused int32_t *retval)
{
	int error;
	struct nameidata nd;
	uint32_t nameiflags;
	vnode_t vp = NULL;
	vfs_context_t ctx = vfs_context_current();

	AUDIT_ARG(cmd, (int)uap->cmd);
	AUDIT_ARG(value32, uap->options);
	/* Get the vnode for the file we are getting info on: */
	nameiflags = 0;
	//
	// if we come through fsctl() then the file is by definition not open.
	// therefore for the FSIOC_FD_ONLY_OPEN_ONCE selector we return an error
	// lest the caller mistakenly thinks the only open is their own (but in
	// reality it's someone elses).
	//
	if (uap->cmd == FSIOC_FD_ONLY_OPEN_ONCE) {
		return EINVAL;
	}
	if ((uap->options & FSOPT_NOFOLLOW) == 0) {
		nameiflags |= FOLLOW;
	}
	if (uap->cmd == FSIOC_FIRMLINK_CTL) {
		/* operate on the firmlink itself, bypassing the name cache */
		nameiflags |= (CN_FIRMLINK_NOFOLLOW | NOCACHE);
	}
	NDINIT(&nd, LOOKUP, OP_FSCTL, nameiflags | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	if ((error = namei(&nd))) {
		goto done;
	}
	vp = nd.ni_vp;          /* carries the iocount taken by namei() */
	nameidone(&nd);

#if CONFIG_MACF
	error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd);
	if (error) {
		goto done;
	}
#endif

	error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);

done:
	/* fsctl_internal() can drop the iocount and reset vp to NULL */
	if (vp) {
		vnode_put(vp);
	}
	return error;
}
/* ARGSUSED */
/*
 * ffsctl system call: like fsctl(), but the target object is identified
 * by an open file descriptor (uap->fd) rather than a path.
 *
 * Takes an iocount on the fd's vnode, runs the MAC fsctl check, then
 * dispatches uap->cmd through fsctl_internal().  The fd reference and
 * the iocount (if fsctl_internal() did not already consume it) are
 * dropped before returning.
 *
 * Returns: 0 on success, otherwise an errno value.
 */
int
ffsctl(proc_t p, struct ffsctl_args *uap, __unused int32_t *retval)
{
	int error;
	vnode_t vp = NULL;
	vfs_context_t ctx = vfs_context_current();
	int fd = -1;

	AUDIT_ARG(fd, uap->fd);
	AUDIT_ARG(cmd, (int)uap->cmd);
	AUDIT_ARG(value32, uap->options);

	/* Get the vnode for the file we are getting info on: */
	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}
	fd = uap->fd;
	if ((error = vnode_getwithref(vp))) {
		file_drop(fd);
		return error;
	}

#if CONFIG_MACF
	if ((error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd))) {
		file_drop(fd);
		vnode_put(vp);
		return error;
	}
#endif

	error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);

	file_drop(fd);

	/*validate vp; fsctl_internal() can drop iocount and reset vp to NULL*/
	if (vp) {
		vnode_put(vp);
	}

	return error;
}
13122 /* end of fsctl system call */
13123
13124 #define FILESEC_ACCESS_ENTITLEMENT \
13125 "com.apple.private.vfs.filesec-access"
13126
13127 static int
xattr_entitlement_check(const char * attrname,vfs_context_t ctx,bool setting)13128 xattr_entitlement_check(const char *attrname, vfs_context_t ctx, bool setting)
13129 {
13130 if (strcmp(attrname, KAUTH_FILESEC_XATTR) == 0) {
13131 /*
13132 * get: root and tasks with FILESEC_ACCESS_ENTITLEMENT.
13133 * set: only tasks with FILESEC_ACCESS_ENTITLEMENT.
13134 */
13135 if ((!setting && vfs_context_issuser(ctx)) ||
13136 IOTaskHasEntitlement(vfs_context_task(ctx),
13137 FILESEC_ACCESS_ENTITLEMENT)) {
13138 return 0;
13139 }
13140 }
13141
13142 return EPERM;
13143 }
13144
13145 /*
13146 * Retrieve the data of an extended attribute.
13147 */
/*
 * getxattr system call: copy the value of extended attribute
 * uap->attrname on the file named by uap->path into the user buffer
 * uap->value (at most uap->size bytes, starting at offset
 * uap->position).
 *
 * If no buffer is supplied (uap->value == 0, or the legacy
 * size == -1 compatibility hack below is taken), no data is copied and
 * *retval is the attribute's full size; otherwise *retval is the number
 * of bytes actually transferred.
 *
 * Returns: 0 on success, otherwise an errno value.
 */
int
getxattr(proc_t p, struct getxattr_args *uap, user_ssize_t *retval)
{
	vnode_t vp;
	struct nameidata nd;
	char attrname[XATTR_MAXNAMELEN + 1];
	vfs_context_t ctx = vfs_context_current();
	uio_t auio = NULL;
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	size_t attrsize = 0;
	size_t namelen;
	u_int32_t nameiflags;
	int error;
	UIO_STACKBUF(uio_buf, 1);

	/* these two option bits are not accepted from this syscall */
	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_GETXATTR, nameiflags, spacetype, uap->path, ctx);
	if (uap->options & XATTR_NOFOLLOW_ANY) {
		/* refuse to traverse symlinks anywhere in the path */
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}

	if ((error = namei(&nd))) {
		return error;
	}
	vp = nd.ni_vp;          /* iocount held until the vnode_put() at "out" */
	nameidone(&nd);

	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
	if (error != 0) {
		goto out;
	}
	/* protected attributes need root or an entitlement on the get path */
	if (xattr_protected(attrname) &&
	    (error = xattr_entitlement_check(attrname, ctx, false)) != 0) {
		goto out;
	}
	/*
	 * the specific check for 0xffffffff is a hack to preserve
	 * binary compatibility in K64 with applications that discovered
	 * that passing in a buf pointer and a size of -1 resulted in
	 * just the size of the indicated extended attribute being returned.
	 * this isn't part of the documented behavior, but because of the
	 * original implementation's check for "uap->size > 0", this behavior
	 * was allowed. In K32 that check turned into a signed comparison
	 * even though uap->size is unsigned... in K64, we blow by that
	 * check because uap->size is unsigned and doesn't get sign smeared
	 * in the munger for a 32 bit user app. we also need to add a
	 * check to limit the maximum size of the buffer being passed in...
	 * unfortunately, the underlying filesystems seem to just malloc
	 * the requested size even if the actual extended attribute is tiny.
	 * because that malloc is for kernel wired memory, we have to put a
	 * sane limit on it.
	 *
	 * U32 running on K64 will yield 0x00000000ffffffff for uap->size
	 * U64 running on K64 will yield -1 (64 bits wide)
	 * U32/U64 running on K32 will yield -1 (32 bits wide)
	 */
	if (uap->size == 0xffffffff || uap->size == (size_t)-1) {
		goto no_uio;
	}

	if (uap->value) {
		/* clamp to keep the filesystem's wired allocation bounded */
		if (uap->size > (size_t)XATTR_MAXSIZE) {
			uap->size = XATTR_MAXSIZE;
		}

		auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
		    &uio_buf[0], sizeof(uio_buf));
		uio_addiov(auio, uap->value, uap->size);
	}
no_uio:
	error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, ctx);
out:
	vnode_put(vp);

	if (auio) {
		*retval = uap->size - uio_resid(auio);
	} else {
		*retval = (user_ssize_t)attrsize;
	}

	return error;
}
13234
13235 /*
13236 * Retrieve the data of an extended attribute.
13237 */
13238 int
fgetxattr(proc_t p,struct fgetxattr_args * uap,user_ssize_t * retval)13239 fgetxattr(proc_t p, struct fgetxattr_args *uap, user_ssize_t *retval)
13240 {
13241 vnode_t vp;
13242 char attrname[XATTR_MAXNAMELEN + 1];
13243 vfs_context_t ctx = vfs_context_current();
13244 uio_t auio = NULL;
13245 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
13246 size_t attrsize = 0;
13247 size_t namelen;
13248 int error;
13249 UIO_STACKBUF(uio_buf, 1);
13250
13251 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT |
13252 XATTR_NOFOLLOW_ANY)) {
13253 return EINVAL;
13254 }
13255
13256 if ((error = file_vnode(uap->fd, &vp))) {
13257 return error;
13258 }
13259 if ((error = vnode_getwithref(vp))) {
13260 file_drop(uap->fd);
13261 return error;
13262 }
13263 error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
13264 if (error != 0) {
13265 goto out;
13266 }
13267 if (xattr_protected(attrname) &&
13268 (error = xattr_entitlement_check(attrname, ctx, false)) != 0) {
13269 goto out;
13270 }
13271 if (uap->value && uap->size > 0) {
13272 if (uap->size > (size_t)XATTR_MAXSIZE) {
13273 uap->size = XATTR_MAXSIZE;
13274 }
13275
13276 auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
13277 &uio_buf[0], sizeof(uio_buf));
13278 uio_addiov(auio, uap->value, uap->size);
13279 }
13280
13281 error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, vfs_context_current());
13282 out:
13283 (void)vnode_put(vp);
13284 file_drop(uap->fd);
13285
13286 if (auio) {
13287 *retval = uap->size - uio_resid(auio);
13288 } else {
13289 *retval = (user_ssize_t)attrsize;
13290 }
13291 return error;
13292 }
13293
/*
 * Per-call scratch state for setxattr(): the nameidata, the attribute
 * name buffer, and the uio backing store, bundled into one kalloc'ed
 * allocation (presumably to keep these large buffers off the kernel
 * stack — allocated and freed in setxattr()).
 * (Previous comment, "struct for checkdirs iteration", was a copy-paste
 * error; this struct is only used by setxattr().)
 */
struct setxattr_ctx {
	struct nameidata nd;
	char attrname[XATTR_MAXNAMELEN + 1];
	UIO_STACKBUF(uio_buf, 1);
};
13300
13301 /*
13302 * Set the data of an extended attribute.
13303 */
/*
 * setxattr system call: set extended attribute uap->attrname on the
 * file named by uap->path to the uap->size bytes at uap->value
 * (written starting at offset uap->position).
 *
 * Scratch state lives in a kalloc'ed setxattr_ctx.  On success a
 * FSE_XATTR_MODIFIED fsevent is posted (when CONFIG_FSE).  *retval is
 * always set to 0; the result is the returned errno value.
 */
int
setxattr(proc_t p, struct setxattr_args *uap, int *retval)
{
	vnode_t vp;
	vfs_context_t ctx = vfs_context_current();
	uio_t auio = NULL;
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	size_t namelen;
	u_int32_t nameiflags;
	int error;
	struct setxattr_ctx *sactx;

	/* these two option bits are not accepted from this syscall */
	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	sactx = kalloc_type(struct setxattr_ctx, Z_WAITOK);
	if (sactx == NULL) {
		return ENOMEM;
	}

	error = copyinstr(uap->attrname, sactx->attrname, sizeof(sactx->attrname), &namelen);
	if (error != 0) {
		if (error == EPERM) {
			/* if the string won't fit in attrname, copyinstr emits EPERM */
			error = ENAMETOOLONG;
		}
		/* Otherwise return the default error from copyinstr to detect ERANGE, etc */
		goto out;
	}
	/* protected attributes need the entitlement on the set path */
	if (xattr_protected(sactx->attrname) &&
	    (error = xattr_entitlement_check(sactx->attrname, ctx, true)) != 0) {
		goto out;
	}
	/* a NULL value pointer is only legal for a zero-length attribute */
	if (uap->size != 0 && uap->value == 0) {
		error = EINVAL;
		goto out;
	}
	if (uap->size > INT_MAX) {
		error = E2BIG;
		goto out;
	}

	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
#if CONFIG_FILE_LEASES
	/* need the parent so its directory lease can be broken below */
	nameiflags |= WANTPARENT;
#endif
	NDINIT(&sactx->nd, LOOKUP, OP_SETXATTR, nameiflags, spacetype, uap->path, ctx);
	if (uap->options & XATTR_NOFOLLOW_ANY) {
		/* refuse to traverse symlinks anywhere in the path */
		sactx->nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}

	if ((error = namei(&sactx->nd))) {
		goto out;
	}
	vp = sactx->nd.ni_vp;
#if CONFIG_FILE_LEASES
	vnode_breakdirlease(sactx->nd.ni_dvp, false, O_WRONLY);
	vnode_put(sactx->nd.ni_dvp);
#endif
	nameidone(&sactx->nd);

	auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
	    &sactx->uio_buf[0], sizeof(sactx->uio_buf));
	uio_addiov(auio, uap->value, uap->size);

	error = vn_setxattr(vp, sactx->attrname, auio, uap->options, ctx);
#if CONFIG_FSE
	if (error == 0) {
		add_fsevent(FSE_XATTR_MODIFIED, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
	}
#endif
	vnode_put(vp);
out:
	kfree_type(struct setxattr_ctx, sactx);
	*retval = 0;
	return error;
}
13384
13385 /*
13386 * Set the data of an extended attribute.
13387 */
13388 int
fsetxattr(proc_t p,struct fsetxattr_args * uap,int * retval)13389 fsetxattr(proc_t p, struct fsetxattr_args *uap, int *retval)
13390 {
13391 vnode_t vp;
13392 char attrname[XATTR_MAXNAMELEN + 1];
13393 vfs_context_t ctx = vfs_context_current();
13394 uio_t auio = NULL;
13395 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
13396 size_t namelen;
13397 int error;
13398 UIO_STACKBUF(uio_buf, 1);
13399
13400 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT |
13401 XATTR_NOFOLLOW_ANY)) {
13402 return EINVAL;
13403 }
13404
13405 error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
13406 if (error != 0) {
13407 if (error == EPERM) {
13408 /* if the string won't fit in attrname, copyinstr emits EPERM */
13409 return ENAMETOOLONG;
13410 }
13411 /* Otherwise return the default error from copyinstr to detect ERANGE, etc */
13412 return error;
13413 }
13414 if (xattr_protected(attrname) &&
13415 (error = xattr_entitlement_check(attrname, ctx, true)) != 0) {
13416 return error;
13417 }
13418 if (uap->size != 0 && uap->value == 0) {
13419 return EINVAL;
13420 }
13421 if (uap->size > INT_MAX) {
13422 return E2BIG;
13423 }
13424 if ((error = file_vnode(uap->fd, &vp))) {
13425 return error;
13426 }
13427 if ((error = vnode_getwithref(vp))) {
13428 file_drop(uap->fd);
13429 return error;
13430 }
13431
13432 #if CONFIG_FILE_LEASES
13433 vnode_breakdirlease(vp, true, O_WRONLY);
13434 #endif
13435
13436 auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
13437 &uio_buf[0], sizeof(uio_buf));
13438 uio_addiov(auio, uap->value, uap->size);
13439
13440 error = vn_setxattr(vp, attrname, auio, uap->options, vfs_context_current());
13441 #if CONFIG_FSE
13442 if (error == 0) {
13443 add_fsevent(FSE_XATTR_MODIFIED, ctx,
13444 FSE_ARG_VNODE, vp,
13445 FSE_ARG_DONE);
13446 }
13447 #endif
13448 vnode_put(vp);
13449 file_drop(uap->fd);
13450 *retval = 0;
13451 return error;
13452 }
13453
13454 /*
13455 * Remove an extended attribute.
13456 * XXX Code duplication here.
13457 */
/*
 * removexattr system call: remove extended attribute uap->attrname from
 * the file named by uap->path.
 *
 * Protected attributes can never be removed from user space (EPERM).
 * On success a FSE_XATTR_REMOVED fsevent is posted (when CONFIG_FSE).
 * *retval is always set to 0; the result is the returned errno value.
 *
 * XXX Code duplication here (see fremovexattr).
 */
int
removexattr(proc_t p, struct removexattr_args *uap, int *retval)
{
	vnode_t vp;
	struct nameidata nd;
	char attrname[XATTR_MAXNAMELEN + 1];
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	vfs_context_t ctx = vfs_context_current();
	size_t namelen;
	u_int32_t nameiflags;
	int error;

	/* these two option bits are not accepted from this syscall */
	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
	if (error != 0) {
		return error;
	}
	/* no entitlement path for removal: protected attributes stay */
	if (xattr_protected(attrname)) {
		return EPERM;
	}
	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
#if CONFIG_FILE_LEASES
	/* need the parent so its directory lease can be broken below */
	nameiflags |= WANTPARENT;
#endif
	NDINIT(&nd, LOOKUP, OP_REMOVEXATTR, nameiflags, spacetype, uap->path, ctx);
	if (uap->options & XATTR_NOFOLLOW_ANY) {
		/* refuse to traverse symlinks anywhere in the path */
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}

	if ((error = namei(&nd))) {
		return error;
	}
	vp = nd.ni_vp;
#if CONFIG_FILE_LEASES
	vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
	vnode_put(nd.ni_dvp);
#endif
	nameidone(&nd);

	error = vn_removexattr(vp, attrname, uap->options, ctx);
#if CONFIG_FSE
	if (error == 0) {
		add_fsevent(FSE_XATTR_REMOVED, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
	}
#endif
	vnode_put(vp);
	*retval = 0;
	return error;
}
13512
13513 /*
13514 * Remove an extended attribute.
13515 * XXX Code duplication here.
13516 */
/*
 * fremovexattr system call: like removexattr(), but the target file is
 * identified by an open descriptor (uap->fd) rather than a path.
 *
 * Protected attributes can never be removed from user space (EPERM).
 * On success a FSE_XATTR_REMOVED fsevent is posted (when CONFIG_FSE).
 * *retval is always set to 0; the result is the returned errno value.
 *
 * XXX Code duplication here (see removexattr).
 */
int
fremovexattr(__unused proc_t p, struct fremovexattr_args *uap, int *retval)
{
	vnode_t vp;
	char attrname[XATTR_MAXNAMELEN + 1];
	size_t namelen;
	int error;
#if CONFIG_FSE
	/* ctx only exists in FSE builds; other callers fetch it inline below */
	vfs_context_t ctx = vfs_context_current();
#endif

	if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT |
	    XATTR_NOFOLLOW_ANY)) {
		return EINVAL;
	}

	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
	if (error != 0) {
		return error;
	}
	/* no entitlement path for removal: protected attributes stay */
	if (xattr_protected(attrname)) {
		return EPERM;
	}
	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(vp, true, O_WRONLY);
#endif

	/* vfs_context_current() fetched inline: ctx is CONFIG_FSE-only */
	error = vn_removexattr(vp, attrname, uap->options, vfs_context_current());
#if CONFIG_FSE
	if (error == 0) {
		add_fsevent(FSE_XATTR_REMOVED, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
	}
#endif
	vnode_put(vp);
	file_drop(uap->fd);
	*retval = 0;
	return error;
}
13565
13566 /*
13567 * Retrieve the list of extended attribute names.
13568 * XXX Code duplication here.
13569 */
/*
 * listxattr system call: copy the list of extended attribute names on
 * the file named by uap->path into the user buffer uap->namebuf (at
 * most uap->bufsize bytes).
 *
 * If no buffer is supplied, nothing is copied and *retval is the total
 * size of the name list; otherwise *retval is the number of bytes
 * transferred.
 *
 * XXX Code duplication here (see flistxattr).
 */
int
listxattr(proc_t p, struct listxattr_args *uap, user_ssize_t *retval)
{
	vnode_t vp;
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();
	uio_t auio = NULL;
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	size_t attrsize = 0;
	u_int32_t nameiflags;
	int error;
	UIO_STACKBUF(uio_buf, 1);

	/* these two option bits are not accepted from this syscall */
	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_LISTXATTR, nameiflags, spacetype, uap->path, ctx);
	if (uap->options & XATTR_NOFOLLOW_ANY) {
		/* refuse to traverse symlinks anywhere in the path */
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}

	if ((error = namei(&nd))) {
		return error;
	}
	vp = nd.ni_vp;          /* iocount held until the vnode_put() below */
	nameidone(&nd);
	if (uap->namebuf != 0 && uap->bufsize > 0) {
		auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ,
		    &uio_buf[0], sizeof(uio_buf));
		uio_addiov(auio, uap->namebuf, uap->bufsize);
	}

	error = vn_listxattr(vp, auio, &attrsize, uap->options, ctx);

	vnode_put(vp);
	if (auio) {
		*retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
	} else {
		*retval = (user_ssize_t)attrsize;
	}
	return error;
}
13614
13615 /*
13616 * Retrieve the list of extended attribute names.
13617 * XXX Code duplication here.
13618 */
/*
 * flistxattr system call: like listxattr(), but the target file is
 * identified by an open descriptor (uap->fd) rather than a path.
 *
 * If no buffer is supplied, nothing is copied and *retval is the total
 * size of the name list; otherwise *retval is the number of bytes
 * transferred.
 *
 * XXX Code duplication here (see listxattr).
 */
int
flistxattr(proc_t p, struct flistxattr_args *uap, user_ssize_t *retval)
{
	vnode_t vp;
	uio_t auio = NULL;
	/*
	 * NOTE(review): sibling xattr syscalls use IS_64BIT_PROCESS(p) here;
	 * presumably equivalent to proc_is64bit(p) — confirm before unifying.
	 */
	int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	size_t attrsize = 0;
	int error;
	UIO_STACKBUF(uio_buf, 1);

	if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT |
	    XATTR_NOFOLLOW_ANY)) {
		return EINVAL;
	}

	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}
	if (uap->namebuf != 0 && uap->bufsize > 0) {
		auio = uio_createwithbuffer(1, 0, spacetype,
		    UIO_READ, &uio_buf[0], sizeof(uio_buf));
		uio_addiov(auio, uap->namebuf, uap->bufsize);
	}

	error = vn_listxattr(vp, auio, &attrsize, uap->options, vfs_context_current());

	vnode_put(vp);
	file_drop(uap->fd);
	if (auio) {
		*retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
	} else {
		*retval = (user_ssize_t)attrsize;
	}
	return error;
}
13658
13659 int
fsgetpath_internal(vfs_context_t ctx,int volfs_id,uint64_t objid,vm_size_t bufsize,caddr_t buf,uint32_t options,int * pathlen)13660 fsgetpath_internal(vfs_context_t ctx, int volfs_id, uint64_t objid,
13661 vm_size_t bufsize, caddr_t buf, uint32_t options, int *pathlen)
13662 {
13663 int error;
13664 struct mount *mp = NULL;
13665 vnode_t vp;
13666 int length;
13667 int bpflags;
13668 /* maximum number of times to retry build_path */
13669 unsigned int retries = 0x10;
13670
13671 if (bufsize > FSGETPATH_MAXBUFLEN) {
13672 return EINVAL;
13673 }
13674
13675 if (buf == NULL) {
13676 return ENOMEM;
13677 }
13678
13679 retry:
13680 if ((mp = mount_lookupby_volfsid(volfs_id, 1)) == NULL) {
13681 error = ENOTSUP; /* unexpected failure */
13682 return ENOTSUP;
13683 }
13684
13685 #if CONFIG_UNION_MOUNTS
13686 unionget:
13687 #endif /* CONFIG_UNION_MOUNTS */
13688 if (objid == 2) {
13689 struct vfs_attr vfsattr;
13690 int use_vfs_root = TRUE;
13691
13692 VFSATTR_INIT(&vfsattr);
13693 VFSATTR_WANTED(&vfsattr, f_capabilities);
13694 if (!(options & FSOPT_ISREALFSID) &&
13695 vfs_getattr(mp, &vfsattr, vfs_context_kernel()) == 0 &&
13696 VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
13697 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS) &&
13698 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS)) {
13699 use_vfs_root = FALSE;
13700 }
13701 }
13702
13703 if (use_vfs_root) {
13704 error = VFS_ROOT(mp, &vp, ctx);
13705 } else {
13706 error = VFS_VGET(mp, objid, &vp, ctx);
13707 }
13708 } else {
13709 error = VFS_VGET(mp, (ino64_t)objid, &vp, ctx);
13710 }
13711
13712 #if CONFIG_UNION_MOUNTS
13713 if (error == ENOENT && (mp->mnt_flag & MNT_UNION)) {
13714 /*
13715 * If the fileid isn't found and we're in a union
13716 * mount volume, then see if the fileid is in the
13717 * mounted-on volume.
13718 */
13719 struct mount *tmp = mp;
13720 mp = vnode_mount(tmp->mnt_vnodecovered);
13721 vfs_unbusy(tmp);
13722 if (vfs_busy(mp, LK_NOWAIT) == 0) {
13723 goto unionget;
13724 }
13725 } else {
13726 vfs_unbusy(mp);
13727 }
13728 #else
13729 vfs_unbusy(mp);
13730 #endif /* CONFIG_UNION_MOUNTS */
13731
13732 if (error) {
13733 return error;
13734 }
13735
13736 #if CONFIG_MACF
13737 error = mac_vnode_check_fsgetpath(ctx, vp);
13738 if (error) {
13739 vnode_put(vp);
13740 return error;
13741 }
13742 #endif
13743
13744 /* Obtain the absolute path to this vnode. */
13745 bpflags = vfs_context_suser(ctx) ? BUILDPATH_CHECKACCESS : 0;
13746 if (options & FSOPT_NOFIRMLINKPATH) {
13747 bpflags |= BUILDPATH_NO_FIRMLINK;
13748 }
13749 bpflags |= BUILDPATH_CHECK_MOVED;
13750 error = build_path(vp, buf, (int)bufsize, &length, bpflags, ctx);
13751 vnode_put(vp);
13752
13753 if (error) {
13754 /* there was a race building the path, try a few more times */
13755 if (error == EAGAIN) {
13756 --retries;
13757 if (retries > 0) {
13758 goto retry;
13759 }
13760
13761 error = ENOENT;
13762 }
13763 goto out;
13764 }
13765
13766 AUDIT_ARG(text, buf);
13767
13768 if (kdebug_debugid_enabled(VFS_LOOKUP) && length > 0) {
13769 unsigned long path_words[NUMPARMS];
13770 size_t path_len = sizeof(path_words);
13771
13772 if ((size_t)length < path_len) {
13773 memcpy((char *)path_words, buf, length);
13774 memset((char *)path_words + length, 0, path_len - length);
13775
13776 path_len = length;
13777 } else {
13778 memcpy((char *)path_words, buf + (length - path_len), path_len);
13779 }
13780
13781 kdebug_vfs_lookup(path_words, (int)path_len, vp,
13782 KDBG_VFS_LOOKUP_FLAG_LOOKUP);
13783 }
13784
13785 *pathlen = length; /* may be superseded by error */
13786
13787 out:
13788 return error;
13789 }
13790
13791 /*
13792 * Obtain the full pathname of a file system object by id.
13793 */
13794 static int
fsgetpath_extended(user_addr_t buf,user_size_t bufsize,user_addr_t user_fsid,uint64_t objid,uint32_t options,user_ssize_t * retval)13795 fsgetpath_extended(user_addr_t buf, user_size_t bufsize, user_addr_t user_fsid, uint64_t objid,
13796 uint32_t options, user_ssize_t *retval)
13797 {
13798 vfs_context_t ctx = vfs_context_current();
13799 fsid_t fsid;
13800 char *realpath;
13801 int length;
13802 int error;
13803
13804 if (options & ~(FSOPT_NOFIRMLINKPATH | FSOPT_ISREALFSID)) {
13805 return EINVAL;
13806 }
13807
13808 if ((error = copyin(user_fsid, (caddr_t)&fsid, sizeof(fsid)))) {
13809 return error;
13810 }
13811 AUDIT_ARG(value32, fsid.val[0]);
13812 AUDIT_ARG(value64, objid);
13813 /* Restrict output buffer size for now. */
13814
13815 if (bufsize > FSGETPATH_MAXBUFLEN || bufsize <= 0) {
13816 return EINVAL;
13817 }
13818 realpath = kalloc_data(bufsize, Z_WAITOK | Z_ZERO);
13819 if (realpath == NULL) {
13820 return ENOMEM;
13821 }
13822
13823 error = fsgetpath_internal(ctx, fsid.val[0], objid, bufsize, realpath,
13824 options, &length);
13825
13826 if (error) {
13827 goto out;
13828 }
13829
13830 error = copyout((caddr_t)realpath, buf, length);
13831
13832 *retval = (user_ssize_t)length; /* may be superseded by error */
13833 out:
13834 kfree_data(realpath, bufsize);
13835 return error;
13836 }
13837
13838 int
fsgetpath(__unused proc_t p,struct fsgetpath_args * uap,user_ssize_t * retval)13839 fsgetpath(__unused proc_t p, struct fsgetpath_args *uap, user_ssize_t *retval)
13840 {
13841 return fsgetpath_extended(uap->buf, uap->bufsize, uap->fsid, uap->objid,
13842 0, retval);
13843 }
13844
13845 int
fsgetpath_ext(__unused proc_t p,struct fsgetpath_ext_args * uap,user_ssize_t * retval)13846 fsgetpath_ext(__unused proc_t p, struct fsgetpath_ext_args *uap, user_ssize_t *retval)
13847 {
13848 return fsgetpath_extended(uap->buf, uap->bufsize, uap->fsid, uap->objid,
13849 uap->options, retval);
13850 }
13851
13852 /*
13853 * Common routine to handle various flavors of statfs data heading out
13854 * to user space.
13855 *
13856 * Returns: 0 Success
13857 * EFAULT
13858 */
static int
munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
    user_addr_t bufp, int *sizep, boolean_t is_64_bit,
    boolean_t partial_copy)
{
	int error;
	int my_size, copy_size;

	if (is_64_bit) {
		struct user64_statfs sfs;
		my_size = copy_size = sizeof(sfs);
		bzero(&sfs, my_size);
		sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
		sfs.f_type = (short)mp->mnt_vtable->vfc_typenum;
		sfs.f_reserved1 = (short)sfsp->f_fssubtype;
		sfs.f_bsize = (user64_long_t)sfsp->f_bsize;
		sfs.f_iosize = (user64_long_t)sfsp->f_iosize;
		sfs.f_blocks = (user64_long_t)sfsp->f_blocks;
		sfs.f_bfree = (user64_long_t)sfsp->f_bfree;
		sfs.f_bavail = (user64_long_t)sfsp->f_bavail;
		sfs.f_files = (user64_long_t)sfsp->f_files;
		sfs.f_ffree = (user64_long_t)sfsp->f_ffree;
		sfs.f_fsid = sfsp->f_fsid;
		sfs.f_owner = sfsp->f_owner;
		vfs_getfstypename(mp, sfs.f_fstypename, MFSNAMELEN);
		strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
		strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);

		if (partial_copy) {
			/* trailing reserved fields are omitted from a partial copy */
			copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
		}
		error = copyout((caddr_t)&sfs, bufp, copy_size);
	} else {
		struct user32_statfs sfs;

		my_size = copy_size = sizeof(sfs);
		bzero(&sfs, my_size);

		sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
		sfs.f_type = (short)mp->mnt_vtable->vfc_typenum;
		sfs.f_reserved1 = (short)sfsp->f_fssubtype;

		/*
		 * It's possible for there to be more than 2^^31 blocks in the filesystem, so we
		 * have to fudge the numbers here in that case. We inflate the blocksize in order
		 * to reflect the filesystem size as best we can.
		 */
		if ((sfsp->f_blocks > INT_MAX)
		    /* Hack for 4061702 . I think the real fix is for Carbon to
		     * look for some volume capability and not depend on hidden
		     * semantics agreed between a FS and carbon.
		     * f_blocks, f_bfree, and f_bavail set to -1 is the trigger
		     * for Carbon to set bNoVolumeSizes volume attribute.
		     * Without this the webdavfs files cannot be copied onto
		     * disk as they look huge. This change should not affect
		     * XSAN as they should not setting these to -1..
		     */
		    && (sfsp->f_blocks != 0xffffffffffffffffULL)
		    && (sfsp->f_bfree != 0xffffffffffffffffULL)
		    && (sfsp->f_bavail != 0xffffffffffffffffULL)) {
			int shift;

			/*
			 * Work out how far we have to shift the block count down to make it fit.
			 * Note that it's possible to have to shift so far that the resulting
			 * blocksize would be unreportably large. At that point, we will clip
			 * any values that don't fit.
			 *
			 * For safety's sake, we also ensure that f_iosize is never reported as
			 * being smaller than f_bsize.
			 */
			for (shift = 0; shift < 32; shift++) {
				if ((sfsp->f_blocks >> shift) <= INT_MAX) {
					break;
				}
				/* stop before the inflated blocksize itself overflows */
				if ((sfsp->f_bsize << (shift + 1)) > INT_MAX) {
					break;
				}
			}
#define __SHIFT_OR_CLIP(x, s) ((((x) >> (s)) > INT_MAX) ? INT_MAX : ((x) >> (s)))
			sfs.f_blocks = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_blocks, shift);
			sfs.f_bfree = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bfree, shift);
			sfs.f_bavail = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bavail, shift);
#undef __SHIFT_OR_CLIP
			sfs.f_bsize = (user32_long_t)(sfsp->f_bsize << shift);
			sfs.f_iosize = (int)lmax(sfsp->f_iosize, sfsp->f_bsize);
		} else {
			/* filesystem is small enough to be reported honestly */
			sfs.f_bsize = (user32_long_t)sfsp->f_bsize;
			sfs.f_iosize = (user32_long_t)sfsp->f_iosize;
			sfs.f_blocks = (user32_long_t)sfsp->f_blocks;
			sfs.f_bfree = (user32_long_t)sfsp->f_bfree;
			sfs.f_bavail = (user32_long_t)sfsp->f_bavail;
		}
		sfs.f_files = (user32_long_t)sfsp->f_files;
		sfs.f_ffree = (user32_long_t)sfsp->f_ffree;
		sfs.f_fsid = sfsp->f_fsid;
		sfs.f_owner = sfsp->f_owner;
		vfs_getfstypename(mp, sfs.f_fstypename, MFSNAMELEN);
		strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
		strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);

		if (partial_copy) {
			/* trailing reserved fields are omitted from a partial copy */
			copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
		}
		error = copyout((caddr_t)&sfs, bufp, copy_size);
	}

	if (sizep != NULL) {
		/* report the full native struct size, even after a partial copy */
		*sizep = my_size;
	}
	return error;
}
13972
13973 /*
13974 * copy stat structure into user_stat structure.
13975 */
void
munge_user64_stat(struct stat *sbp, struct user64_stat *usbp)
{
	/* Zero the whole destination first so padding and any unset fields
	 * never leak kernel memory to userspace. */
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
#ifndef _POSIX_C_SOURCE
	/* Timestamp field names depend on whether the struct exposes
	 * timespec members or flattened sec/nsec pairs. */
	usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
#else
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
14012
/*
 * Copy stat structure into a user32_stat structure (32-bit userland ABI).
 */
void
munge_user32_stat(struct stat *sbp, struct user32_stat *usbp)
{
	/* Zero the whole destination first so padding and any unset fields
	 * never leak kernel memory to userspace. */
	bzero(usbp, sizeof(*usbp));

	/* NOTE(review): several fields (e.g. st_ino, st_size) may narrow
	 * implicitly into the 32-bit layout — presumably intentional
	 * historical ABI behavior; verify against user32_stat definition. */
	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
#ifndef _POSIX_C_SOURCE
	/* Explicit casts document the intentional narrowing of the
	 * kernel timestamp types to the 32-bit user layout. */
	usbp->st_atimespec.tv_sec = (user32_time_t)sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = (user32_long_t)sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = (user32_time_t)sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = (user32_long_t)sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = (user32_time_t)sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = (user32_long_t)sbp->st_ctimespec.tv_nsec;
#else
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
14049
14050 /*
14051 * copy stat64 structure into user_stat64 structure.
14052 */
void
munge_user64_stat64(struct stat64 *sbp, struct user64_stat64 *usbp)
{
	/* Zero the whole destination first so padding and any unset fields
	 * never leak kernel memory to userspace. */
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
#ifndef _POSIX_C_SOURCE
	/* stat64 additionally carries the birth (creation) time. */
	usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
	usbp->st_birthtimespec.tv_sec = sbp->st_birthtimespec.tv_sec;
	usbp->st_birthtimespec.tv_nsec = sbp->st_birthtimespec.tv_nsec;
#else
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
	usbp->st_birthtime = sbp->st_birthtime;
	usbp->st_birthtimensec = sbp->st_birthtimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
14093
/*
 * Copy stat64 structure into a user32_stat64 structure (32-bit userland ABI).
 */
void
munge_user32_stat64(struct stat64 *sbp, struct user32_stat64 *usbp)
{
	/* Zero the whole destination first so padding and any unset fields
	 * never leak kernel memory to userspace. */
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
#ifndef _POSIX_C_SOURCE
	/* Explicit casts document the intentional narrowing of the kernel
	 * timestamp types to the 32-bit user layout; stat64 additionally
	 * carries the birth (creation) time. */
	usbp->st_atimespec.tv_sec = (user32_time_t)sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = (user32_long_t)sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = (user32_time_t)sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = (user32_long_t)sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = (user32_time_t)sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = (user32_long_t)sbp->st_ctimespec.tv_nsec;
	usbp->st_birthtimespec.tv_sec = (user32_time_t)sbp->st_birthtimespec.tv_sec;
	usbp->st_birthtimespec.tv_nsec = (user32_long_t)sbp->st_birthtimespec.tv_nsec;
#else
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
	usbp->st_birthtime = sbp->st_birthtime;
	usbp->st_birthtimensec = sbp->st_birthtimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
14134
14135 /*
14136 * Purge buffer cache for simulating cold starts
14137 */
/*
 * Per-vnode callback for vfs_purge(): push out any dirty pages and
 * invalidate all cached pages backing this vnode.
 * Always returns VNODE_RETURNED so iteration continues over every vnode.
 */
static int
vnode_purge_callback(struct vnode *vp, __unused void *cargs)
{
	ubc_msync(vp, (off_t)0, ubc_getsize(vp), NULL /* off_t *resid_off */, UBC_PUSHALL | UBC_INVALIDATE);

	return VNODE_RETURNED;
}
14145
/*
 * Per-mount callback for vfs_purge(): walk every vnode on the mount
 * (waiting for busy vnodes) and purge its cached pages.
 * Always returns VFS_RETURNED so iteration continues over every mount.
 */
static int
vfs_purge_callback(mount_t mp, __unused void * arg)
{
	vnode_iterate(mp, VNODE_WAIT | VNODE_ITERATE_ALL, vnode_purge_callback, NULL);

	return VFS_RETURNED;
}
14153
/*
 * Boot-arg tunable and runtime sysctl (vfs.purge_vm_pagers): when true
 * (the default), vfs_purge() also purges file-backed VM pagers.
 */
static TUNABLE_WRITEABLE(boolean_t, vfs_purge_vm_pagers, "vfs_purge_vm_pagers", TRUE);
SYSCTL_INT(_vfs, OID_AUTO, purge_vm_pagers, CTLFLAG_RW | CTLFLAG_LOCKED, &vfs_purge_vm_pagers, 0, "VFS purge also purges file-backed VM pagers");
14156
/*
 * vfs_purge system call: purge the buffer cache across all mounted
 * filesystems (used to simulate cold starts).  Superuser only.
 *
 * Returns 0 on success, EPERM if the caller is not superuser.
 */
int
vfs_purge(__unused struct proc *p, __unused struct vfs_purge_args *uap, __unused int32_t *retval)
{
	if (!kauth_cred_issuser(kauth_cred_get())) {
		return EPERM;
	}

	vfs_iterate(0 /* flags */, vfs_purge_callback, NULL);

	/* also flush any VM pagers backed by files */
	if (vfs_purge_vm_pagers) {
		vm_purge_filebacked_pagers();
	}

	return 0;
}
14173
14174 /*
14175 * gets the vnode associated with the (unnamed) snapshot directory
14176 * for a Filesystem. The snapshot directory vnode is returned with
14177 * an iocount on it.
14178 */
int
vnode_get_snapdir(vnode_t rvp, vnode_t *sdvpp, vfs_context_t ctx)
{
	/* Thin wrapper: delegate to the filesystem's VGET_SNAPDIR op for
	 * the mount that rvp belongs to. */
	return VFS_VGET_SNAPDIR(vnode_mount(rvp), sdvpp, ctx);
}
14184
14185 /*
14186 * Get the snapshot vnode.
14187 *
 * If successful, the call returns with an iocount on *rvpp, *sdvpp and
14189 * needs nameidone() on ndp.
14190 *
14191 * If the snapshot vnode exists it is returned in ndp->ni_vp.
14192 *
14193 * If it returns with an error, *rvpp, *sdvpp are NULL and nameidone() is
14194 * not needed.
14195 */
static int
vnode_get_snapshot(int dirfd, vnode_t *rvpp, vnode_t *sdvpp,
    user_addr_t name, struct nameidata *ndp, int32_t op,
#if !CONFIG_TRIGGERS
    __unused
#endif
    enum path_operation pathop,
    vfs_context_t ctx)
{
	int error, i;
	caddr_t name_buf;
	size_t name_len;
	struct vfs_attr vfa;

	/* Initialize outputs so the error path can safely test them. */
	*sdvpp = NULLVP;
	*rvpp = NULLVP;

	/* Resolve dirfd to a vnode; on success it carries an iocount. */
	error = vnode_getfromfd(ctx, dirfd, rvpp);
	if (error) {
		return error;
	}

	/* Snapshots are only taken relative to a filesystem root. */
	if (!vnode_isvroot(*rvpp)) {
		error = EINVAL;
		goto out;
	}

	/* Make sure the filesystem supports snapshots */
	VFSATTR_INIT(&vfa);
	VFSATTR_WANTED(&vfa, f_capabilities);
	if ((vfs_getattr(vnode_mount(*rvpp), &vfa, ctx) != 0) ||
	    !VFSATTR_IS_SUPPORTED(&vfa, f_capabilities) ||
	    !((vfa.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] &
	    VOL_CAP_INT_SNAPSHOT)) ||
	    !((vfa.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] &
	    VOL_CAP_INT_SNAPSHOT))) {
		error = ENOTSUP;
		goto out;
	}

	/* Get the (unnamed) snapshot directory; iocount on *sdvpp. */
	error = vnode_get_snapdir(*rvpp, sdvpp, ctx);
	if (error) {
		goto out;
	}

	name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
	if (error) {
		goto out1;
	}

	/*
	 * Some sanity checks- name can't be empty, "." or ".." or have slashes.
	 * (the length returned by copyinstr includes the terminating NUL)
	 */
	if ((name_len == 1) || (name_len == 2 && name_buf[0] == '.') ||
	    (name_len == 3 && name_buf[0] == '.' && name_buf[1] == '.')) {
		error = EINVAL;
		goto out1;
	}
	/* Scan for a '/'; if the loop stops early, one was found. */
	for (i = 0; i < (int)name_len && name_buf[i] != '/'; i++) {
		;
	}
	if (i < (int)name_len) {
		error = EINVAL;
		goto out1;
	}

#if CONFIG_MACF
	/* MAC policy check for create/delete; LOOKUP gets no check here. */
	if (op == CREATE) {
		error = mac_mount_check_snapshot_create(ctx, vnode_mount(*rvpp),
		    name_buf);
	} else if (op == DELETE) {
		error = mac_mount_check_snapshot_delete(ctx, vnode_mount(*rvpp),
		    name_buf);
	}
	if (error) {
		goto out1;
	}
#endif

	/* Check if the snapshot already exists ... */
	NDINIT(ndp, op, pathop, USEDVP | NOCACHE | AUDITVNPATH1,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(name_buf), ctx);
	ndp->ni_dvp = *sdvpp;

	error = namei(ndp);
out1:
	zfree(ZV_NAMEI, name_buf);
out:
	/* On any error, drop the iocounts we took and NULL the outputs so
	 * callers never see dangling references (per the contract above). */
	if (error) {
		if (*sdvpp) {
			vnode_put(*sdvpp);
			*sdvpp = NULLVP;
		}
		if (*rvpp) {
			vnode_put(*rvpp);
			*rvpp = NULLVP;
		}
	}
	return error;
}
14298
14299 /*
14300 * create a filesystem snapshot (for supporting filesystems)
14301 *
14302 * A much simplified version of openat(dirfd, name, O_CREAT | O_EXCL)
14303 * We get to the (unnamed) snapshot directory vnode and create the vnode
14304 * for the snapshot in it.
14305 *
14306 * Restrictions:
14307 *
14308 * a) Passed in name for snapshot cannot have slashes.
14309 * b) name can't be "." or ".."
14310 *
14311 * Since this requires superuser privileges, vnode_authorize calls are not
14312 * made.
14313 */
static int __attribute__((noinline))
snapshot_create(int dirfd, user_addr_t name, __unused uint32_t flags,
    vfs_context_t ctx)
{
	vnode_t rvp, snapdvp;
	int error;
	struct nameidata *ndp;

	/* nameidata is too large for the kernel stack; heap-allocate it. */
	ndp = kalloc_type(struct nameidata, Z_WAITOK);

	/* On success: iocounts held on rvp and snapdvp, namei state in ndp. */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, ndp, CREATE,
	    OP_LINK, ctx);
	if (error) {
		goto out;
	}

	if (ndp->ni_vp) {
		/* Snapshot already exists — mirror O_CREAT|O_EXCL semantics. */
		vnode_put(ndp->ni_vp);
		error = EEXIST;
	} else {
		struct vnode_attr *vap;
		vnode_t vp = NULLVP;

		vap = kalloc_type(struct vnode_attr, Z_WAITOK);

		/* Create the snapshot as a zero-mode regular file; auth and
		 * inheritance are skipped since the caller is privileged. */
		VATTR_INIT(vap);
		VATTR_SET(vap, va_type, VREG);
		VATTR_SET(vap, va_mode, 0);

		error = vn_create(snapdvp, &vp, ndp, vap,
		    VN_CREATE_NOAUTH | VN_CREATE_NOINHERIT, 0, NULL, ctx);
		if (!error && vp) {
			vnode_put(vp);
		}

		kfree_type(struct vnode_attr, vap);
	}

	nameidone(ndp);
	vnode_put(snapdvp);
	vnode_put(rvp);
out:
	kfree_type(struct nameidata, ndp);

	return error;
}
14360
14361 /*
14362 * Delete a Filesystem snapshot
14363 *
14364 * get the vnode for the unnamed snapshot directory and the snapshot and
14365 * delete the snapshot.
14366 */
static int __attribute__((noinline))
snapshot_delete(int dirfd, user_addr_t name, __unused uint32_t flags,
    vfs_context_t ctx)
{
	vnode_t rvp, snapdvp;
	int error;
	struct nameidata *ndp;

	/* nameidata is too large for the kernel stack; heap-allocate it. */
	ndp = kalloc_type(struct nameidata, Z_WAITOK);

	/* On success: iocounts on rvp, snapdvp, and the snapshot (ni_vp). */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, ndp, DELETE,
	    OP_UNLINK, ctx);
	if (error) {
		goto out;
	}

	/* Remove the snapshot; namespace events are deliberately skipped. */
	error = VNOP_REMOVE(snapdvp, ndp->ni_vp, &ndp->ni_cnd,
	    VNODE_REMOVE_SKIP_NAMESPACE_EVENT, ctx);

	vnode_put(ndp->ni_vp);
	nameidone(ndp);
	vnode_put(snapdvp);
	vnode_put(rvp);
out:
	kfree_type(struct nameidata, ndp);

	return error;
}
14395
14396 /*
14397 * Revert a filesystem to a snapshot
14398 *
14399 * Marks the filesystem to revert to the given snapshot on next mount.
14400 */
static int __attribute__((noinline))
snapshot_revert(int dirfd, user_addr_t name, __unused uint32_t flags,
    vfs_context_t ctx)
{
	int error;
	vnode_t rvp;
	mount_t mp;
	struct fs_snapshot_revert_args revert_data;
	struct componentname cnp;
	caddr_t name_buf;
	size_t name_len;

	error = vnode_getfromfd(ctx, dirfd, &rvp);
	if (error) {
		return error;
	}
	mp = vnode_mount(rvp);

	name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
	if (error) {
		zfree(ZV_NAMEI, name_buf);
		vnode_put(rvp);
		return error;
	}

#if CONFIG_MACF
	error = mac_mount_check_snapshot_revert(ctx, mp, name_buf);
	if (error) {
		zfree(ZV_NAMEI, name_buf);
		vnode_put(rvp);
		return error;
	}
#endif

	/*
	 * Grab mount_iterref so that we can release the vnode,
	 * since VFSIOC_REVERT_SNAPSHOT could conceivably cause a sync.
	 */
	error = mount_iterref(mp, 0);
	vnode_put(rvp);
	if (error) {
		zfree(ZV_NAMEI, name_buf);
		return error;
	}

	/* Build a synthetic componentname carrying the snapshot name;
	 * HASBUF tells the consumer that cn_pnbuf is a ZV_NAMEI buffer. */
	memset(&cnp, 0, sizeof(cnp));
	cnp.cn_pnbuf = (char *)name_buf;
	cnp.cn_nameiop = LOOKUP;
	cnp.cn_flags = ISLASTCN | HASBUF;
	cnp.cn_pnlen = MAXPATHLEN;
	cnp.cn_nameptr = cnp.cn_pnbuf;
	cnp.cn_namelen = (int)name_len;
	revert_data.sr_cnp = &cnp;

	error = VFS_IOCTL(mp, VFSIOC_REVERT_SNAPSHOT, (caddr_t)&revert_data, 0, ctx);
	mount_iterdrop(mp);
	zfree(ZV_NAMEI, name_buf);

	if (error) {
		/* If there was any error, try again using VNOP_IOCTL */

		vnode_t snapdvp;
		struct nameidata namend;

		error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, &namend, LOOKUP,
		    OP_LOOKUP, ctx);
		if (error) {
			return error;
		}


		/* Fallback path: ask the snapshot vnode itself to revert. */
		error = VNOP_IOCTL(namend.ni_vp, APFSIOC_REVERT_TO_SNAPSHOT, (caddr_t) NULL,
		    0, ctx);

		vnode_put(namend.ni_vp);
		nameidone(&namend);
		vnode_put(snapdvp);
		vnode_put(rvp);
	}

	return error;
}
14484
14485 /*
14486 * rename a Filesystem snapshot
14487 *
14488 * get the vnode for the unnamed snapshot directory and the snapshot and
14489 * rename the snapshot. This is a very specialised (and simple) case of
14490 * rename(2) (which has to deal with a lot more complications). It differs
14491 * slightly from rename(2) in that EEXIST is returned if the new name exists.
14492 */
static int __attribute__((noinline))
snapshot_rename(int dirfd, user_addr_t old, user_addr_t new,
    __unused uint32_t flags, vfs_context_t ctx)
{
	vnode_t rvp, snapdvp;
	int error, i;
	caddr_t newname_buf;
	size_t name_len;
	vnode_t fvp;
	struct nameidata *fromnd, *tond;
	/* carving out a chunk for structs that are too big to be on stack. */
	struct {
		struct nameidata from_node;
		struct nameidata to_node;
	} * __rename_data;

	__rename_data = kalloc_type(typeof(*__rename_data), Z_WAITOK);
	fromnd = &__rename_data->from_node;
	tond = &__rename_data->to_node;

	/* Look up the source snapshot (DELETE op: rename removes the old
	 * name).  On success: iocounts on rvp, snapdvp, and fromnd->ni_vp. */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, old, fromnd, DELETE,
	    OP_UNLINK, ctx);
	if (error) {
		goto out;
	}
	fvp = fromnd->ni_vp;

	newname_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	error = copyinstr(new, newname_buf, MAXPATHLEN, &name_len);
	if (error) {
		goto out1;
	}

	/*
	 * Some sanity checks- new name can't be empty, "." or ".." or have
	 * slashes.
	 * (the length returned by copyinstr includes the terminating NUL)
	 *
	 * The FS rename VNOP is supposed to handle this but we'll pick it
	 * off here itself.
	 */
	if ((name_len == 1) || (name_len == 2 && newname_buf[0] == '.') ||
	    (name_len == 3 && newname_buf[0] == '.' && newname_buf[1] == '.')) {
		error = EINVAL;
		goto out1;
	}
	/* Scan for a '/'; if the loop stops early, one was found. */
	for (i = 0; i < (int)name_len && newname_buf[i] != '/'; i++) {
		;
	}
	if (i < (int)name_len) {
		error = EINVAL;
		goto out1;
	}

#if CONFIG_MACF
	/* Creating the new name is a snapshot-create from MAC's view. */
	error = mac_mount_check_snapshot_create(ctx, vnode_mount(rvp),
	    newname_buf);
	if (error) {
		goto out1;
	}
#endif

	/* Look up the destination name inside the snapshot directory. */
	NDINIT(tond, RENAME, OP_RENAME, USEDVP | NOCACHE | AUDITVNPATH2,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(newname_buf), ctx);
	tond->ni_dvp = snapdvp;

	error = namei(tond);
	if (error) {
		goto out2;
	} else if (tond->ni_vp) {
		/*
		 * snapshot rename behaves differently than rename(2) - if the
		 * new name exists, EEXIST is returned.
		 */
		vnode_put(tond->ni_vp);
		error = EEXIST;
		goto out2;
	}

	error = VNOP_RENAME(snapdvp, fvp, &fromnd->ni_cnd, snapdvp, NULLVP,
	    &tond->ni_cnd, ctx);

out2:
	nameidone(tond);
out1:
	zfree(ZV_NAMEI, newname_buf);
	vnode_put(fvp);
	vnode_put(snapdvp);
	vnode_put(rvp);
	nameidone(fromnd);
out:
	kfree_type(typeof(*__rename_data), __rename_data);
	return error;
}
14587
14588 /*
14589 * Mount a Filesystem snapshot
14590 *
14591 * get the vnode for the unnamed snapshot directory and the snapshot and
14592 * mount the snapshot.
14593 */
14594 static int __attribute__((noinline))
snapshot_mount(int dirfd,user_addr_t name,user_addr_t directory,__unused user_addr_t mnt_data,__unused uint32_t flags,vfs_context_t ctx)14595 snapshot_mount(int dirfd, user_addr_t name, user_addr_t directory,
14596 __unused user_addr_t mnt_data, __unused uint32_t flags, vfs_context_t ctx)
14597 {
14598 mount_t mp;
14599 vnode_t rvp, snapdvp, snapvp, vp, pvp;
14600 struct fs_snapshot_mount_args smnt_data;
14601 int error;
14602 struct nameidata *snapndp, *dirndp;
14603 /* carving out a chunk for structs that are too big to be on stack. */
14604 struct {
14605 struct nameidata snapnd;
14606 struct nameidata dirnd;
14607 } * __snapshot_mount_data;
14608
14609 __snapshot_mount_data = kalloc_type(typeof(*__snapshot_mount_data), Z_WAITOK);
14610 snapndp = &__snapshot_mount_data->snapnd;
14611 dirndp = &__snapshot_mount_data->dirnd;
14612
14613 error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, snapndp, LOOKUP,
14614 OP_LOOKUP, ctx);
14615 if (error) {
14616 goto out;
14617 }
14618
14619 snapvp = snapndp->ni_vp;
14620 if (!vnode_mount(rvp) || (vnode_mount(rvp) == dead_mountp)) {
14621 error = EIO;
14622 goto out1;
14623 }
14624
14625 /* Get the vnode to be covered */
14626 NDINIT(dirndp, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
14627 UIO_USERSPACE, directory, ctx);
14628 error = namei(dirndp);
14629 if (error) {
14630 goto out1;
14631 }
14632
14633 vp = dirndp->ni_vp;
14634 pvp = dirndp->ni_dvp;
14635 mp = vnode_mount(rvp);
14636
14637 if ((vp->v_flag & VROOT) && (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
14638 error = EINVAL;
14639 goto out2;
14640 }
14641
14642 #if CONFIG_MACF
14643 error = mac_mount_check_snapshot_mount(ctx, rvp, vp, &dirndp->ni_cnd, snapndp->ni_cnd.cn_nameptr,
14644 mp->mnt_vfsstat.f_fstypename);
14645 if (error) {
14646 goto out2;
14647 }
14648 #endif
14649
14650 smnt_data.sm_mp = mp;
14651 smnt_data.sm_cnp = &snapndp->ni_cnd;
14652 error = mount_common(mp->mnt_vfsstat.f_fstypename, pvp, vp,
14653 &dirndp->ni_cnd, CAST_USER_ADDR_T(&smnt_data), flags & (MNT_DONTBROWSE | MNT_IGNORE_OWNERSHIP),
14654 KERNEL_MOUNT_SNAPSHOT, NULL, ctx);
14655
14656 out2:
14657 vnode_put(vp);
14658 vnode_put(pvp);
14659 nameidone(dirndp);
14660 out1:
14661 vnode_put(snapvp);
14662 vnode_put(snapdvp);
14663 vnode_put(rvp);
14664 nameidone(snapndp);
14665 out:
14666 kfree_type(typeof(*__snapshot_mount_data), __snapshot_mount_data);
14667 return error;
14668 }
14669
14670 /*
14671 * Root from a snapshot of the filesystem
14672 *
14673 * Marks the filesystem to root from the given snapshot on next boot.
14674 */
static int __attribute__((noinline))
snapshot_root(int dirfd, user_addr_t name, __unused uint32_t flags,
    vfs_context_t ctx)
{
	int error;
	vnode_t rvp;
	mount_t mp;
	struct fs_snapshot_root_args root_data;
	struct componentname cnp;
	caddr_t name_buf;
	size_t name_len;

	error = vnode_getfromfd(ctx, dirfd, &rvp);
	if (error) {
		return error;
	}
	mp = vnode_mount(rvp);

	name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
	if (error) {
		zfree(ZV_NAMEI, name_buf);
		vnode_put(rvp);
		return error;
	}

	// XXX MAC checks ?

	/*
	 * Grab mount_iterref so that we can release the vnode,
	 * since VFSIOC_ROOT_SNAPSHOT could conceivably cause a sync.
	 */
	error = mount_iterref(mp, 0);
	vnode_put(rvp);
	if (error) {
		zfree(ZV_NAMEI, name_buf);
		return error;
	}

	/* Build a synthetic componentname carrying the snapshot name;
	 * HASBUF tells the consumer that cn_pnbuf is a ZV_NAMEI buffer. */
	memset(&cnp, 0, sizeof(cnp));
	cnp.cn_pnbuf = (char *)name_buf;
	cnp.cn_nameiop = LOOKUP;
	cnp.cn_flags = ISLASTCN | HASBUF;
	cnp.cn_pnlen = MAXPATHLEN;
	cnp.cn_nameptr = cnp.cn_pnbuf;
	cnp.cn_namelen = (int)name_len;
	root_data.sr_cnp = &cnp;

	/* Ask the filesystem to root from this snapshot on next boot. */
	error = VFS_IOCTL(mp, VFSIOC_ROOT_SNAPSHOT, (caddr_t)&root_data, 0, ctx);

	mount_iterdrop(mp);
	zfree(ZV_NAMEI, name_buf);

	return error;
}
14730
14731 static boolean_t
vfs_context_can_snapshot(vfs_context_t ctx)14732 vfs_context_can_snapshot(vfs_context_t ctx)
14733 {
14734 static const char * const snapshot_entitlements[] = {
14735 "com.apple.private.vfs.snapshot",
14736 "com.apple.developer.vfs.snapshot",
14737 "com.apple.private.apfs.arv.limited.snapshot",
14738 };
14739 static const size_t nentitlements =
14740 sizeof(snapshot_entitlements) / sizeof(snapshot_entitlements[0]);
14741 size_t i;
14742
14743 task_t task = vfs_context_task(ctx);
14744 for (i = 0; i < nentitlements; i++) {
14745 if (IOTaskHasEntitlement(task, snapshot_entitlements[i])) {
14746 return TRUE;
14747 }
14748 }
14749 return FALSE;
14750 }
14751
14752 /*
14753 * FS snapshot operations dispatcher
14754 */
int
fs_snapshot(__unused proc_t p, struct fs_snapshot_args *uap,
    __unused int32_t *retval)
{
	int error;
	vfs_context_t ctx = vfs_context_current();

	AUDIT_ARG(fd, uap->dirfd);
	AUDIT_ARG(value32, uap->op);

	/* All snapshot operations require a snapshot entitlement. */
	if (!vfs_context_can_snapshot(ctx)) {
		return EPERM;
	}

	/*
	 * Enforce user authorization for snapshot modification operations,
	 * or if trying to root from snapshot.
	 */
	if (uap->op != SNAPSHOT_OP_MOUNT) {
		vnode_t dvp = NULLVP;
		vnode_t devvp = NULLVP;
		mount_t mp;

		error = vnode_getfromfd(ctx, uap->dirfd, &dvp);
		if (error) {
			return error;
		}
		mp = vnode_mount(dvp);
		devvp = mp->mnt_devvp;

		/* get an iocount on devvp */
		if (devvp == NULLVP) {
			error = vnode_lookup(mp->mnt_vfsstat.f_mntfromname, 0, &devvp, ctx);
			/* for mounts which aren't block devices */
			if (error == ENOENT) {
				error = ENXIO;
			}
		} else {
			error = vnode_getwithref(devvp);
		}

		if (error) {
			vnode_put(dvp);
			return error;
		}

		/* Authorized if: superuser, OR can write the backing device,
		 * OR holds the user snapshot entitlement. */
		if ((vfs_context_issuser(ctx) == 0) &&
		    (vnode_authorize(devvp, NULL, KAUTH_VNODE_WRITE_DATA, ctx) != 0) &&
		    (!IOTaskHasEntitlement(vfs_context_task(ctx), "com.apple.private.vfs.snapshot.user"))) {
			error = EPERM;
		}
		vnode_put(dvp);
		vnode_put(devvp);

		if (error) {
			return error;
		}
	}

	/* Dispatch to the per-operation handler. */
	switch (uap->op) {
	case SNAPSHOT_OP_CREATE:
		error = snapshot_create(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
	case SNAPSHOT_OP_DELETE:
		error = snapshot_delete(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
	case SNAPSHOT_OP_RENAME:
		error = snapshot_rename(uap->dirfd, uap->name1, uap->name2,
		    uap->flags, ctx);
		break;
	case SNAPSHOT_OP_MOUNT:
		error = snapshot_mount(uap->dirfd, uap->name1, uap->name2,
		    uap->data, uap->flags, ctx);
		break;
	case SNAPSHOT_OP_REVERT:
		error = snapshot_revert(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
#if CONFIG_MNT_ROOTSNAP
	case SNAPSHOT_OP_ROOT:
		error = snapshot_root(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
#endif /* CONFIG_MNT_ROOTSNAP */
	default:
		error = ENOSYS;
	}

	return error;
}
14843