1 /*
2 * Copyright (c) 1995-2021 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * Copyright (c) 1989, 1993
30 * The Regents of the University of California. All rights reserved.
31 * (c) UNIX System Laboratories, Inc.
32 * All or some portions of this file are derived from material licensed
33 * to the University of California by American Telephone and Telegraph
34 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
35 * the permission of UNIX System Laboratories, Inc.
36 *
37 * Redistribution and use in source and binary forms, with or without
38 * modification, are permitted provided that the following conditions
39 * are met:
40 * 1. Redistributions of source code must retain the above copyright
41 * notice, this list of conditions and the following disclaimer.
42 * 2. Redistributions in binary form must reproduce the above copyright
43 * notice, this list of conditions and the following disclaimer in the
44 * documentation and/or other materials provided with the distribution.
45 * 3. All advertising materials mentioning features or use of this software
46 * must display the following acknowledgement:
47 * This product includes software developed by the University of
48 * California, Berkeley and its contributors.
49 * 4. Neither the name of the University nor the names of its contributors
50 * may be used to endorse or promote products derived from this software
51 * without specific prior written permission.
52 *
53 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63 * SUCH DAMAGE.
64 *
65 * @(#)vfs_syscalls.c 8.41 (Berkeley) 6/15/95
66 */
67 /*
68 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
69 * support for mandatory and extensible security protections. This notice
70 * is included in support of clause 2.2 (b) of the Apple Public License,
71 * Version 2.0.
72 */
73
74 #include <sys/param.h>
75 #include <sys/systm.h>
76 #include <sys/namei.h>
77 #include <sys/filedesc.h>
78 #include <sys/kernel.h>
79 #include <sys/file_internal.h>
80 #include <sys/stat.h>
81 #include <sys/vnode_internal.h>
82 #include <sys/mount_internal.h>
83 #include <sys/proc_internal.h>
84 #include <sys/kauth.h>
85 #include <sys/uio_internal.h>
86 #include <kern/kalloc.h>
87 #include <sys/mman.h>
88 #include <sys/dirent.h>
89 #include <sys/attr.h>
90 #include <sys/sysctl.h>
91 #include <sys/ubc.h>
92 #include <sys/quota.h>
93 #include <sys/kdebug.h>
94 #include <sys/fsevents.h>
95 #include <sys/imgsrc.h>
96 #include <sys/sysproto.h>
97 #include <sys/sysctl.h>
98 #include <sys/xattr.h>
99 #include <sys/fcntl.h>
100 #include <sys/stdio.h>
101 #include <sys/fsctl.h>
102 #include <sys/ubc_internal.h>
103 #include <sys/disk.h>
104 #include <sys/content_protection.h>
105 #include <sys/clonefile.h>
106 #include <sys/snapshot.h>
107 #include <sys/priv.h>
108 #include <sys/fsgetpath.h>
109 #include <machine/cons.h>
110 #include <machine/limits.h>
111 #include <miscfs/specfs/specdev.h>
112
113 #include <vfs/vfs_disk_conditioner.h>
114
115 #include <security/audit/audit.h>
116 #include <bsm/audit_kevents.h>
117
118 #include <mach/mach_types.h>
119 #include <kern/kern_types.h>
120 #include <kern/kalloc.h>
121 #include <kern/task.h>
122
123 #include <vm/vm_pageout.h>
124 #include <vm/vm_protos.h>
125
126 #include <libkern/OSAtomic.h>
127 #include <os/atomic_private.h>
128 #include <pexpert/pexpert.h>
129 #include <IOKit/IOBSD.h>
130
131 // deps for MIG call
132 #include <kern/host.h>
133 #include <kern/ipc_misc.h>
134 #include <mach/host_priv.h>
135 #include <mach/vfs_nspace.h>
136 #include <os/log.h>
137
138 #include <nfs/nfs_conf.h>
139
140 #if ROUTEFS
141 #include <miscfs/routefs/routefs.h>
142 #endif /* ROUTEFS */
143
144 #if CONFIG_MACF
145 #include <security/mac.h>
146 #include <security/mac_framework.h>
147 #endif
148
149 #if CONFIG_FSE
150 #define GET_PATH(x) \
151 ((x) = get_pathbuff())
152 #define RELEASE_PATH(x) \
153 release_pathbuff(x)
154 #else
155 #define GET_PATH(x) \
156 ((x) = zalloc(ZV_NAMEI))
157 #define RELEASE_PATH(x) \
158 zfree(ZV_NAMEI, x)
159 #endif /* CONFIG_FSE */
160
161 #ifndef HFS_GET_BOOT_INFO
162 #define HFS_GET_BOOT_INFO (FCNTL_FS_SPECIFIC_BASE + 0x00004)
163 #endif
164
165 #ifndef HFS_SET_BOOT_INFO
166 #define HFS_SET_BOOT_INFO (FCNTL_FS_SPECIFIC_BASE + 0x00005)
167 #endif
168
169 #ifndef APFSIOC_REVERT_TO_SNAPSHOT
170 #define APFSIOC_REVERT_TO_SNAPSHOT _IOW('J', 1, u_int64_t)
171 #endif
172
173 extern void disk_conditioner_unmount(mount_t mp);
174
175 /* struct for checkdirs iteration */
176 struct cdirargs {
177 vnode_t olddp;
178 vnode_t newdp;
179 };
180 /* callback for checkdirs iteration */
181 static int checkdirs_callback(proc_t p, void * arg);
182
183 static int change_dir(struct nameidata *ndp, vfs_context_t ctx);
184 static int checkdirs(vnode_t olddp, vfs_context_t ctx);
185 void enablequotas(struct mount *mp, vfs_context_t ctx);
186 static int getfsstat_callback(mount_t mp, void * arg);
187 static int getutimes(user_addr_t usrtvp, struct timespec *tsp);
188 static int setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts, int nullflag);
189 static int sync_callback(mount_t, void *);
190 static int munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
191 user_addr_t bufp, int *sizep, boolean_t is_64_bit,
192 boolean_t partial_copy);
193 static int fsync_common(proc_t p, struct fsync_args *uap, int flags);
194 static int mount_common(const char *fstypename, vnode_t pvp, vnode_t vp,
195 struct componentname *cnp, user_addr_t fsmountargs,
196 int flags, uint32_t internal_flags, char *labelstr, vfs_context_t ctx);
197 void vfs_notify_mount(vnode_t pdvp);
198
199 int prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, uint32_t internal_flags);
200
201 struct fd_vn_data * fg_vn_data_alloc(void);
202
203 /*
204 * Max retries for ENOENT returns from vn_authorize_{rmdir, unlink, rename}
205 * Concurrent lookups (or lookups by ids) on hard links can cause the
206 * vn_getpath (which does not re-enter the filesystem as vn_getpath_fsenter
207 * does) to return ENOENT as the path cannot be returned from the name cache
208 * alone. We have no option but to retry and hope to get one namei->reverse path
209 * generation done without an intervening lookup, lookup by id on the hard link
210 * item. This is only an issue for MAC hooks which cannot reenter the filesystem
211 * which currently are the MAC hooks for rename, unlink and rmdir.
212 */
213 #define MAX_AUTHORIZE_ENOENT_RETRIES 1024
214
215 /* Max retry limit for rename due to vnode recycling. */
216 #define MAX_RENAME_ERECYCLE_RETRIES 1024
217
218 static int rmdirat_internal(vfs_context_t, int, user_addr_t, enum uio_seg,
219 int unlink_flags);
220
221 #ifdef CONFIG_IMGSRC_ACCESS
222 static int authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx);
223 static int place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx);
224 static void undo_place_on_covered_vp(mount_t mp, vnode_t vp);
225 static int mount_begin_update(mount_t mp, vfs_context_t ctx, int flags);
226 static void mount_end_update(mount_t mp);
227 static int relocate_imageboot_source(vnode_t pvp, vnode_t vp, struct componentname *cnp, const char *fsname, vfs_context_t ctx, boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index);
228 #endif /* CONFIG_IMGSRC_ACCESS */
229
230 //snapshot functions
231 #if CONFIG_MNT_ROOTSNAP
232 static int __attribute__ ((noinline)) snapshot_root(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx);
233 #else
234 static int __attribute__ ((noinline)) snapshot_root(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx) __attribute__((unused));
235 #endif
236
237 __private_extern__
238 int sync_internal(void);
239
240 __private_extern__
241 int unlink1(vfs_context_t, vnode_t, user_addr_t, enum uio_seg, int);
242
243 static LCK_GRP_DECLARE(fd_vn_lck_grp, "fd_vnode_data");
244 static LCK_ATTR_DECLARE(fd_vn_lck_attr, 0, 0);
245
246 /* vars for sync mutex */
247 static LCK_GRP_DECLARE(sync_mtx_lck_grp, "sync thread");
248 static LCK_MTX_DECLARE(sync_mtx_lck, &sync_mtx_lck_grp);
249
250 extern lck_rw_t rootvnode_rw_lock;
251
252 /*
253 * incremented each time a mount or unmount operation occurs
254 * used to invalidate the cached value of the rootvp in the
255 * mount structure utilized by cache_lookup_path
256 */
257 uint32_t mount_generation = 0;
258
259 /* counts number of mount and unmount operations */
260 unsigned int vfs_nummntops = 0;
261
262 /* system-wide, per-boot unique mount ID */
263 static _Atomic uint64_t mount_unique_id = 1;
264
265 extern const struct fileops vnops;
266 #if CONFIG_APPLEDOUBLE
267 extern errno_t rmdir_remove_orphaned_appleDouble(vnode_t, vfs_context_t, int *);
268 #endif /* CONFIG_APPLEDOUBLE */
269
270 /*
271 * Virtual File System System Calls
272 */
273
274 /*
275 * Private in-kernel mounting spi (specific use-cases only)
276 */
277 boolean_t
vfs_iskernelmount(mount_t mp)278 vfs_iskernelmount(mount_t mp)
279 {
280 return (mp->mnt_kern_flag & MNTK_KERNEL_MOUNT) ? TRUE : FALSE;
281 }
282
/*
 * Mount a filesystem on behalf of an in-kernel caller.
 *
 * If `vp' is NULLVP, `path' is resolved via namei() to obtain both the
 * vnode to be covered and its parent (both iocounted here and released
 * before returning).  Otherwise the caller supplies `vp'/`pvp' already
 * iocounted, and `path' is only used to fabricate the component name
 * handed to mount_common().
 *
 * Returns 0 on success, an errno value otherwise.
 */
__private_extern__
int
kernel_mount(const char *fstype, vnode_t pvp, vnode_t vp, const char *path,
    void *data, __unused size_t datalen, int syscall_flags, uint32_t kern_flags,
    vfs_context_t ctx)
{
	struct nameidata nd;
	boolean_t did_namei;	/* TRUE iff we hold iocounts/state from namei() */
	int error;

	NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(path), ctx);

	/* Strip any kernel-mount flag bits callers may not pass in. */
	kern_flags &= KERNEL_MOUNT_SANITIZE_MASK;

	/*
	 * Get the vnode to be covered if it's not supplied
	 */
	if (vp == NULLVP) {
		error = namei(&nd);
		if (error) {
			/* Log lookup failures for snapshot / by-role mounts. */
			if (kern_flags & (KERNEL_MOUNT_SNAPSHOT | KERNEL_MOUNT_VOLBYROLE_MASK)) {
				printf("failed to locate mount-on path: %s ", path);
			}
			return error;
		}
		vp = nd.ni_vp;
		pvp = nd.ni_dvp;
		did_namei = TRUE;
	} else {
		/*
		 * Caller supplied the covered vnode; synthesize the
		 * component name from `path' for mount_common().
		 */
		char *pnbuf = CAST_DOWN(char *, path);

		nd.ni_cnd.cn_pnbuf = pnbuf;
		nd.ni_cnd.cn_pnlen = (int)(strlen(pnbuf) + 1);
		did_namei = FALSE;
	}

	kern_flags |= KERNEL_MOUNT_KMOUNT;
	error = mount_common(fstype, pvp, vp, &nd.ni_cnd, CAST_USER_ADDR_T(data),
	    syscall_flags, kern_flags, NULL, ctx);

	if (did_namei) {
		/* Release the iocounts and namei state acquired above. */
		vnode_put(vp);
		vnode_put(pvp);
		nameidone(&nd);
	}

	return error;
}
332
333 int
vfs_mount_at_path(const char * fstype,const char * path,vnode_t pvp,vnode_t vp,void * data,size_t datalen,int mnt_flags,int flags)334 vfs_mount_at_path(const char *fstype, const char *path,
335 vnode_t pvp, vnode_t vp, void *data, size_t datalen,
336 int mnt_flags, int flags)
337 {
338 int syscall_flags = MNT_AUTOMOUNTED | mnt_flags;
339 int error, km_flags = 0;
340 vfs_context_t ctx = (flags & VFS_MOUNT_FLAG_CURRENT_CONTEXT) ? vfs_context_current() : vfs_context_kernel();
341
342 /*
343 * This call is currently restricted to specific use cases.
344 */
345 if ((strcmp(fstype, "lifs") != 0) && (strcmp(fstype, "nfs") != 0)) {
346 return ENOTSUP;
347 }
348
349 #if !defined(XNU_TARGET_OS_OSX)
350 if (strcmp(fstype, "lifs") == 0) {
351 syscall_flags |= MNT_NOEXEC;
352 }
353 #endif
354
355 if (flags & VFS_MOUNT_FLAG_NOAUTH) {
356 km_flags |= KERNEL_MOUNT_NOAUTH;
357 }
358 if (flags & VFS_MOUNT_FLAG_PERMIT_UNMOUNT) {
359 km_flags |= KERNEL_MOUNT_PERMIT_UNMOUNT;
360 }
361
362 error = kernel_mount(fstype, pvp, vp, path, data, datalen,
363 syscall_flags, km_flags, ctx);
364 if (error) {
365 printf("%s: mount on %s failed, error %d\n", __func__, path,
366 error);
367 }
368
369 return error;
370 }
371
372 int
vfs_mount_override_type_name(mount_t mp,const char * name)373 vfs_mount_override_type_name(mount_t mp, const char *name)
374 {
375 if (mp == NULL || name == NULL) {
376 return EINVAL;
377 }
378
379 /* Override the FS type name. */
380 mount_lock_spin(mp);
381 strlcpy(mp->fstypename_override, name, sizeof(mp->fstypename_override));
382 mp->mnt_kern_flag |= MNTK_TYPENAME_OVERRIDE;
383 mount_unlock(mp);
384
385 return 0;
386 }
387
388 /*
389 * Mount a file system.
390 */
391 /* ARGSUSED */
392 int
mount(proc_t p,struct mount_args * uap,__unused int32_t * retval)393 mount(proc_t p, struct mount_args *uap, __unused int32_t *retval)
394 {
395 struct __mac_mount_args muap;
396
397 muap.type = uap->type;
398 muap.path = uap->path;
399 muap.flags = uap->flags;
400 muap.data = uap->data;
401 muap.mac_p = USER_ADDR_NULL;
402 return __mac_mount(p, &muap, retval);
403 }
404
405 int
fmount(__unused proc_t p,struct fmount_args * uap,__unused int32_t * retval)406 fmount(__unused proc_t p, struct fmount_args *uap, __unused int32_t *retval)
407 {
408 struct componentname cn;
409 vfs_context_t ctx = vfs_context_current();
410 size_t dummy = 0;
411 int error;
412 int flags = uap->flags;
413 char fstypename[MFSNAMELEN];
414 char *labelstr = NULL; /* regular mount call always sets it to NULL for __mac_mount() */
415 vnode_t pvp;
416 vnode_t vp;
417
418 AUDIT_ARG(fd, uap->fd);
419 AUDIT_ARG(fflags, flags);
420 /* fstypename will get audited by mount_common */
421
422 /* Sanity check the flags */
423 if (flags & (MNT_IMGSRC_BY_INDEX | MNT_ROOTFS)) {
424 return ENOTSUP;
425 }
426
427 if (flags & MNT_UNION) {
428 return EPERM;
429 }
430
431 error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy);
432 if (error) {
433 return error;
434 }
435
436 if ((error = file_vnode(uap->fd, &vp)) != 0) {
437 return error;
438 }
439
440 if ((error = vnode_getwithref(vp)) != 0) {
441 file_drop(uap->fd);
442 return error;
443 }
444
445 pvp = vnode_getparent(vp);
446 if (pvp == NULL) {
447 if (vp->v_mountedhere || (vp->v_flag & VROOT) != 0) {
448 error = EBUSY;
449 } else {
450 error = EINVAL;
451 }
452 vnode_put(vp);
453 file_drop(uap->fd);
454 return error;
455 }
456
457 memset(&cn, 0, sizeof(struct componentname));
458 cn.cn_pnbuf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
459 cn.cn_pnlen = MAXPATHLEN;
460
461 if ((error = vn_getpath(vp, cn.cn_pnbuf, &cn.cn_pnlen)) != 0) {
462 zfree(ZV_NAMEI, cn.cn_pnbuf);
463 vnode_put(pvp);
464 vnode_put(vp);
465 file_drop(uap->fd);
466 return error;
467 }
468
469 error = mount_common(fstypename, pvp, vp, &cn, uap->data, flags, KERNEL_MOUNT_FMOUNT, labelstr, ctx);
470
471 zfree(ZV_NAMEI, cn.cn_pnbuf);
472 vnode_put(pvp);
473 vnode_put(vp);
474 file_drop(uap->fd);
475
476 return error;
477 }
478
479 #define MAX_GRAFT_METADATA_SIZE 16384 /* bytes */
480
481 /*
482 * Get the size of a graft file (a manifest or payload file).
483 * The vp should be an iocounted vnode.
484 */
485 static int
get_and_verify_graft_metadata_vp_size(vnode_t graft_vp,vfs_context_t vctx,size_t * size)486 get_and_verify_graft_metadata_vp_size(vnode_t graft_vp, vfs_context_t vctx, size_t *size)
487 {
488 struct stat64 sb = {};
489 int error;
490
491 *size = 0;
492
493 error = vn_stat(graft_vp, &sb, NULL, 1, 0, vctx);
494 if (error) {
495 return error;
496 }
497
498 if (sb.st_size == 0) {
499 error = ENODATA;
500 } else if ((size_t) sb.st_size > MAX_GRAFT_METADATA_SIZE) {
501 error = EFBIG;
502 } else {
503 *size = (size_t) sb.st_size;
504 }
505
506 return error;
507 }
508
509 /*
510 * Read in a graft file (a manifest or payload file) of size `size` into `buf`.
511 * `size` must already be validated.
512 */
513 static int
read_graft_metadata_vp(vnode_t graft_vp,vfs_context_t vctx,size_t size,void * buf)514 read_graft_metadata_vp(vnode_t graft_vp, vfs_context_t vctx, size_t size, void *buf)
515 {
516 return vn_rdwr(UIO_READ, graft_vp,
517 (caddr_t) buf, (int) size, /* offset */ 0,
518 UIO_SYSSPACE, IO_NOCACHE | IO_RAOFF | IO_UNIT,
519 vfs_context_ucred(vctx), /* resid */ NULL,
520 vfs_context_proc(vctx));
521 }
522
523 /*
524 * Convert a single graft file descriptor into a vnode, get its size (saving it to `size`),
525 * and read it into `buf`.
526 */
527 static int
graft_secureboot_read_fd(int fd,vfs_context_t vctx,size_t * size,void * buf)528 graft_secureboot_read_fd(int fd, vfs_context_t vctx, size_t *size, void *buf)
529 {
530 vnode_t metadata_vp = NULLVP;
531 int error;
532
533 // Convert this graft fd to a vnode.
534 if ((error = vnode_getfromfd(vctx, fd, &metadata_vp)) != 0) {
535 goto out;
536 }
537
538 // Get (and validate) size information.
539 if ((error = get_and_verify_graft_metadata_vp_size(metadata_vp, vctx, size)) != 0) {
540 goto out;
541 }
542
543 // Read each file into the provided buffer - we must get the expected amount of bytes.
544 if ((error = read_graft_metadata_vp(metadata_vp, vctx, *size, buf)) != 0) {
545 goto out;
546 }
547
548 out:
549 if (metadata_vp) {
550 vnode_put(metadata_vp);
551 metadata_vp = NULLVP;
552 }
553
554 return error;
555 }
556
557 /*
558 * Read graft file descriptors into buffers of size MAX_GRAFT_METADATA_SIZE
559 * provided in `gfs`, saving the size of data read in `gfs`.
560 */
561 static int
graft_secureboot_read_metadata(secure_boot_cryptex_args_t * sbc_args,vfs_context_t vctx,fsioc_graft_fs_t * gfs)562 graft_secureboot_read_metadata(secure_boot_cryptex_args_t *sbc_args, vfs_context_t vctx,
563 fsioc_graft_fs_t *gfs)
564 {
565 int error;
566
567 // Read the authentic manifest.
568 if ((error = graft_secureboot_read_fd(sbc_args->sbc_authentic_manifest_fd, vctx,
569 &gfs->authentic_manifest_size, gfs->authentic_manifest))) {
570 return error;
571 }
572
573 // The user manifest is currently unused, but set its size.
574 gfs->user_manifest_size = 0;
575
576 // Read the payload.
577 if ((error = graft_secureboot_read_fd(sbc_args->sbc_payload_fd, vctx,
578 &gfs->payload_size, gfs->payload))) {
579 return error;
580 }
581
582 return 0;
583 }
584
585 /*
586 * Call into the filesystem to verify and graft a cryptex.
587 */
588 static int
graft_secureboot_cryptex(uint32_t graft_type,secure_boot_cryptex_args_t * sbc_args,vfs_context_t vctx,vnode_t cryptex_vp,vnode_t mounton_vp)589 graft_secureboot_cryptex(uint32_t graft_type, secure_boot_cryptex_args_t *sbc_args,
590 vfs_context_t vctx, vnode_t cryptex_vp, vnode_t mounton_vp)
591 {
592 fsioc_graft_fs_t gfs = {};
593 uint64_t graft_dir_ino = 0;
594 struct stat64 sb = {};
595 int error;
596
597 // Pre-flight arguments.
598 if (sbc_args->sbc_version != GRAFTDMG_SECURE_BOOT_CRYPTEX_ARGS_VERSION) {
599 // Make sure that this graft version matches what we support.
600 return ENOTSUP;
601 } else if (mounton_vp && cryptex_vp->v_mount != mounton_vp->v_mount) {
602 // For this type, cryptex VP must live on same volume as the target of graft.
603 return EXDEV;
604 } else if (mounton_vp && mounton_vp->v_type != VDIR) {
605 // We cannot graft upon non-directories.
606 return ENOTDIR;
607 } else if (sbc_args->sbc_authentic_manifest_fd < 0 ||
608 sbc_args->sbc_payload_fd < 0) {
609 // We cannot graft without a manifest and payload.
610 return EINVAL;
611 }
612
613 if (mounton_vp) {
614 // Get the mounton's inode number.
615 error = vn_stat(mounton_vp, &sb, NULL, 1, 0, vctx);
616 if (error) {
617 return error;
618 }
619 graft_dir_ino = (uint64_t) sb.st_ino;
620 }
621
622 // Create buffers (of our maximum-defined size) to store authentication info.
623 gfs.authentic_manifest = kalloc_data(MAX_GRAFT_METADATA_SIZE, Z_WAITOK | Z_ZERO);
624 gfs.payload = kalloc_data(MAX_GRAFT_METADATA_SIZE, Z_WAITOK | Z_ZERO);
625
626 if (!gfs.authentic_manifest || !gfs.payload) {
627 error = ENOMEM;
628 goto out;
629 }
630
631 // Read our fd's into our buffers.
632 // (Note that this will set the buffer size fields in `gfs`.)
633 error = graft_secureboot_read_metadata(sbc_args, vctx, &gfs);
634 if (error) {
635 goto out;
636 }
637
638 gfs.graft_version = FSIOC_GRAFT_VERSION;
639 gfs.graft_type = graft_type;
640 gfs.graft_4cc = sbc_args->sbc_4cc;
641 if (sbc_args->sbc_flags & SBC_PRESERVE_MOUNT) {
642 gfs.graft_flags |= FSCTL_GRAFT_PRESERVE_MOUNT;
643 }
644 if (sbc_args->sbc_flags & SBC_ALTERNATE_SHARED_REGION) {
645 gfs.graft_flags |= FSCTL_GRAFT_ALTERNATE_SHARED_REGION;
646 }
647 if (sbc_args->sbc_flags & SBC_SYSTEM_CONTENT) {
648 gfs.graft_flags |= FSCTL_GRAFT_SYSTEM_CONTENT;
649 }
650 if (sbc_args->sbc_flags & SBC_PANIC_ON_AUTHFAIL) {
651 gfs.graft_flags |= FSCTL_GRAFT_PANIC_ON_AUTHFAIL;
652 }
653 if (sbc_args->sbc_flags & SBC_STRICT_AUTH) {
654 gfs.graft_flags |= FSCTL_GRAFT_STRICT_AUTH;
655 }
656 if (sbc_args->sbc_flags & SBC_PRESERVE_GRAFT) {
657 gfs.graft_flags |= FSCTL_GRAFT_PRESERVE_GRAFT;
658 }
659 gfs.dir_ino = graft_dir_ino; // ino from mounton_vp (if not provided, the parent directory)
660
661 // Call into the FS to perform the graft (and validation).
662 error = VNOP_IOCTL(cryptex_vp, FSIOC_GRAFT_FS, (caddr_t)&gfs, 0, vctx);
663
664 out:
665 if (gfs.authentic_manifest) {
666 kfree_data(gfs.authentic_manifest, MAX_GRAFT_METADATA_SIZE);
667 gfs.authentic_manifest = NULL;
668 }
669 if (gfs.payload) {
670 kfree_data(gfs.payload, MAX_GRAFT_METADATA_SIZE);
671 gfs.payload = NULL;
672 }
673
674 return error;
675 }
676
677 #define GRAFTDMG_ENTITLEMENT "com.apple.private.vfs.graftdmg"
678
679 /*
680 * Graft a cryptex disk image (via FD) onto the appropriate mount-point
681 * { int graftdmg(int dmg_fd, const char *mountdir, uint32_t graft_type, graftdmg_args_un *gda); }
682 */
683 int
graftdmg(__unused proc_t p,struct graftdmg_args * uap,__unused int32_t * retval)684 graftdmg(__unused proc_t p, struct graftdmg_args *uap, __unused int32_t *retval)
685 {
686 int ua_dmgfd = uap->dmg_fd;
687 user_addr_t ua_mountdir = uap->mountdir;
688 uint32_t ua_grafttype = uap->graft_type;
689 user_addr_t ua_graftargs = uap->gda;
690
691 graftdmg_args_un kern_gda = {};
692 int error = 0;
693 secure_boot_cryptex_args_t *sbc_args = NULL;
694
695 vnode_t cryptex_vp = NULLVP;
696 vnode_t mounton_vp = NULLVP;
697 struct nameidata nd = {};
698 vfs_context_t ctx = vfs_context_current();
699
700 if (!IOTaskHasEntitlement(vfs_context_task(ctx), GRAFTDMG_ENTITLEMENT)) {
701 return EPERM;
702 }
703
704 error = copyin(ua_graftargs, &kern_gda, sizeof(graftdmg_args_un));
705 if (error) {
706 return error;
707 }
708
709 // Copy mount dir in, if provided.
710 if (ua_mountdir != USER_ADDR_NULL) {
711 // Acquire vnode for mount-on path
712 NDINIT(&nd, LOOKUP, OP_MOUNT, (FOLLOW | AUDITVNPATH1),
713 UIO_USERSPACE, ua_mountdir, ctx);
714
715 error = namei(&nd);
716 if (error) {
717 return error;
718 }
719 mounton_vp = nd.ni_vp;
720 }
721
722 // Convert fd to vnode.
723 error = vnode_getfromfd(ctx, ua_dmgfd, &cryptex_vp);
724 if (error) {
725 goto graftout;
726 }
727
728 if (ua_grafttype == 0 || ua_grafttype > GRAFTDMG_CRYPTEX_DOWNLEVEL) {
729 error = EINVAL;
730 } else {
731 sbc_args = &kern_gda.sbc_args;
732 error = graft_secureboot_cryptex(ua_grafttype, sbc_args, ctx, cryptex_vp, mounton_vp);
733 }
734
735 graftout:
736 if (cryptex_vp) {
737 vnode_put(cryptex_vp);
738 cryptex_vp = NULLVP;
739 }
740 if (mounton_vp) {
741 vnode_put(mounton_vp);
742 mounton_vp = NULLVP;
743 }
744 if (ua_mountdir != USER_ADDR_NULL) {
745 nameidone(&nd);
746 }
747
748 return error;
749 }
750
751 /*
752 * Ungraft a cryptex disk image (via mount dir FD)
753 * { int ungraftdmg(const char *mountdir, uint64_t flags); }
754 */
755 int
ungraftdmg(__unused proc_t p,struct ungraftdmg_args * uap,__unused int32_t * retval)756 ungraftdmg(__unused proc_t p, struct ungraftdmg_args *uap, __unused int32_t *retval)
757 {
758 int error = 0;
759 user_addr_t ua_mountdir = uap->mountdir;
760 fsioc_ungraft_fs_t ugfs;
761 vnode_t mounton_vp = NULLVP;
762 struct nameidata nd = {};
763 vfs_context_t ctx = vfs_context_current();
764
765 if (!IOTaskHasEntitlement(vfs_context_task(ctx), GRAFTDMG_ENTITLEMENT)) {
766 return EPERM;
767 }
768
769 if (uap->flags != 0 || ua_mountdir == USER_ADDR_NULL) {
770 return EINVAL;
771 }
772
773 ugfs.ungraft_flags = 0;
774
775 // Acquire vnode for mount-on path
776 NDINIT(&nd, LOOKUP, OP_MOUNT, (FOLLOW | AUDITVNPATH1),
777 UIO_USERSPACE, ua_mountdir, ctx);
778
779 error = namei(&nd);
780 if (error) {
781 return error;
782 }
783 mounton_vp = nd.ni_vp;
784
785 // Call into the FS to perform the ungraft
786 error = VNOP_IOCTL(mounton_vp, FSIOC_UNGRAFT_FS, (caddr_t)&ugfs, 0, ctx);
787
788 vnode_put(mounton_vp);
789 nameidone(&nd);
790
791 return error;
792 }
793
794
/*
 * Notify interested parties that a mount has occurred: raise a VQ_MOUNT
 * vfs event, then post a NOTE_WRITE knote on `pdvp' (the parent of the
 * covered vnode) so watchers of that directory see the change.
 */
void
vfs_notify_mount(vnode_t pdvp)
{
	vfs_event_signal(NULL, VQ_MOUNT, (intptr_t)NULL);
	lock_vnode_and_post(pdvp, NOTE_WRITE);
}
801
802 /*
803 * __mac_mount:
804 * Mount a file system taking into account MAC label behavior.
805 * See mount(2) man page for more information
806 *
807 * Parameters: p Process requesting the mount
808 * uap User argument descriptor (see below)
809 * retval (ignored)
810 *
811 * Indirect: uap->type Filesystem type
812 * uap->path Path to mount
813 * uap->data Mount arguments
814 * uap->mac_p MAC info
815 * uap->flags Mount flags
816 *
817 *
818 * Returns: 0 Success
819 * !0 Not success
820 */
821 boolean_t root_fs_upgrade_try = FALSE;
822
int
__mac_mount(struct proc *p, register struct __mac_mount_args *uap, __unused int32_t *retval)
{
	vnode_t pvp = NULL;		/* parent of the vnode to be covered */
	vnode_t vp = NULL;		/* vnode to be covered */
	int need_nameidone = 0;		/* nonzero iff namei() succeeded below */
	vfs_context_t ctx = vfs_context_current();
	char fstypename[MFSNAMELEN];
	struct nameidata nd;
	size_t dummy = 0;
	char *labelstr = NULL;		/* MAC label string copied in from user space */
	size_t labelsz = 0;
	int flags = uap->flags;
	int error;
#if CONFIG_IMGSRC_ACCESS || CONFIG_MACF
	boolean_t is_64bit = IS_64BIT_PROCESS(p);
#else
#pragma unused(p)
#endif
	/*
	 * Get the fs type name from user space
	 */
	error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy);
	if (error) {
		return error;
	}

	/*
	 * Get the vnode to be covered
	 */
	NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
	    UIO_USERSPACE, uap->path, ctx);
	if (flags & MNT_NOFOLLOW) {
		/* Caller asked that no symlink anywhere in the path be followed. */
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}
	error = namei(&nd);
	if (error) {
		goto out;
	}
	need_nameidone = 1;
	vp = nd.ni_vp;
	pvp = nd.ni_dvp;

#ifdef CONFIG_IMGSRC_ACCESS
	/* Mounting image source cannot be batched with other operations */
	/* (hence the exact equality test rather than a bit test). */
	if (flags == MNT_IMGSRC_BY_INDEX) {
		error = relocate_imageboot_source(pvp, vp, &nd.ni_cnd, fstypename,
		    ctx, is_64bit, uap->data, (flags == MNT_IMGSRC_BY_INDEX));
		goto out;
	}
#endif /* CONFIG_IMGSRC_ACCESS */

#if CONFIG_MACF
	/*
	 * Get the label string (if any) from user space
	 */
	if (uap->mac_p != USER_ADDR_NULL) {
		struct user_mac mac;
		size_t ulen = 0;

		/* The user_mac layout differs between 32- and 64-bit callers. */
		if (is_64bit) {
			struct user64_mac mac64;
			error = copyin(uap->mac_p, &mac64, sizeof(mac64));
			mac.m_buflen = (user_size_t)mac64.m_buflen;
			mac.m_string = (user_addr_t)mac64.m_string;
		} else {
			struct user32_mac mac32;
			error = copyin(uap->mac_p, &mac32, sizeof(mac32));
			mac.m_buflen = mac32.m_buflen;
			mac.m_string = mac32.m_string;
		}
		if (error) {
			goto out;
		}
		if ((mac.m_buflen > MAC_MAX_LABEL_BUF_LEN) ||
		    (mac.m_buflen < 2)) {
			error = EINVAL;
			goto out;
		}
		labelsz = mac.m_buflen;
		labelstr = kalloc_data(labelsz, Z_WAITOK);
		error = copyinstr(mac.m_string, labelstr, mac.m_buflen, &ulen);
		if (error) {
			goto out;
		}
		AUDIT_ARG(mac_string, labelstr);
	}
#endif /* CONFIG_MACF */

	AUDIT_ARG(fflags, flags);

#if !CONFIG_UNION_MOUNTS
	if (flags & MNT_UNION) {
		error = EPERM;
		goto out;
	}
#endif

	/* Mounting on the root of the root filesystem is (usually) an update. */
	if ((vp->v_flag & VROOT) &&
	    (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
#if CONFIG_UNION_MOUNTS
		if (!(flags & MNT_UNION)) {
			flags |= MNT_UPDATE;
		} else {
			/*
			 * For a union mount on '/', treat it as fresh
			 * mount instead of update.
			 * Otherwise, union mounting on '/' used to panic the
			 * system before, since mnt_vnodecovered was found to
			 * be NULL for '/' which is required for unionlookup
			 * after it gets ENOENT on union mount.
			 */
			flags = (flags & ~(MNT_UPDATE));
		}
#else
		flags |= MNT_UPDATE;
#endif /* CONFIG_UNION_MOUNTS */

#if SECURE_KERNEL
		if ((flags & MNT_RDONLY) == 0) {
			/* Release kernels are not allowed to mount "/" as rw */
			error = EPERM;
			goto out;
		}
#endif

		/*
		 * See 7392553 for more details on why this check exists.
		 * Suffice to say: If this check is ON and something tries
		 * to mount the rootFS RW, we'll turn off the codesign
		 * bitmap optimization.
		 */
#if CHECK_CS_VALIDATION_BITMAP
		if ((flags & MNT_RDONLY) == 0) {
			root_fs_upgrade_try = TRUE;
		}
#endif
	}

	error = mount_common(fstypename, pvp, vp, &nd.ni_cnd, uap->data, flags, 0,
	    labelstr, ctx);

out:

#if CONFIG_MACF
	/* kfree_data() is a no-op for a NULL pointer / zero size. */
	kfree_data(labelstr, labelsz);
#endif /* CONFIG_MACF */

	if (vp) {
		vnode_put(vp);
	}
	if (pvp) {
		vnode_put(pvp);
	}
	if (need_nameidone) {
		nameidone(&nd);
	}

	return error;
}
983
984 /*
985 * common mount implementation (final stage of mounting)
986 *
987 * Arguments:
988 * fstypename file system type (ie it's vfs name)
989 * pvp parent of covered vnode
990 * vp covered vnode
991 * cnp component name (ie path) of covered vnode
992 * flags generic mount flags
993 * fsmountargs file system specific data
994 * labelstr optional MAC label
995 * kernelmount TRUE for mounts initiated from inside the kernel
996 * ctx caller's context
997 */
998 static int
mount_common(const char * fstypename,vnode_t pvp,vnode_t vp,struct componentname * cnp,user_addr_t fsmountargs,int flags,uint32_t internal_flags,char * labelstr,vfs_context_t ctx)999 mount_common(const char *fstypename, vnode_t pvp, vnode_t vp,
1000 struct componentname *cnp, user_addr_t fsmountargs, int flags, uint32_t internal_flags,
1001 char *labelstr, vfs_context_t ctx)
1002 {
1003 #if !CONFIG_MACF
1004 #pragma unused(labelstr)
1005 #endif
1006 struct vnode *devvp = NULLVP;
1007 struct vnode *device_vnode = NULLVP;
1008 #if CONFIG_MACF
1009 struct vnode *rvp;
1010 #endif
1011 struct mount *mp = NULL;
1012 struct vfstable *vfsp = (struct vfstable *)0;
1013 struct proc *p = vfs_context_proc(ctx);
1014 int error, flag = 0;
1015 bool flag_set = false;
1016 user_addr_t devpath = USER_ADDR_NULL;
1017 int ronly = 0;
1018 int mntalloc = 0;
1019 boolean_t vfsp_ref = FALSE;
1020 boolean_t is_rwlock_locked = FALSE;
1021 boolean_t did_rele = FALSE;
1022 boolean_t have_usecount = FALSE;
1023 boolean_t did_set_lmount = FALSE;
1024 boolean_t kernelmount = !!(internal_flags & KERNEL_MOUNT_KMOUNT);
1025
1026 #if CONFIG_ROSV_STARTUP || CONFIG_MOUNT_VM || CONFIG_BASESYSTEMROOT
1027 /* Check for mutually-exclusive flag bits */
1028 uint32_t checkflags = (internal_flags & (KERNEL_MOUNT_VOLBYROLE_MASK | KERNEL_MOUNT_BASESYSTEMROOT));
1029 int bitcount = 0;
1030 while (checkflags != 0) {
1031 checkflags &= (checkflags - 1);
1032 bitcount++;
1033 }
1034
1035 if (bitcount > 1) {
1036 //not allowed to request multiple mount-by-role flags
1037 error = EINVAL;
1038 goto out1;
1039 }
1040 #endif
1041
1042 /*
1043 * Process an update for an existing mount
1044 */
1045 if (flags & MNT_UPDATE) {
1046 if ((vp->v_flag & VROOT) == 0) {
1047 error = EINVAL;
1048 goto out1;
1049 }
1050 mp = vp->v_mount;
1051
1052 /* if unmount or mount in progress, return error */
1053 mount_lock_spin(mp);
1054 if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
1055 mount_unlock(mp);
1056 error = EBUSY;
1057 goto out1;
1058 }
1059 mp->mnt_lflag |= MNT_LMOUNT;
1060 did_set_lmount = TRUE;
1061 mount_unlock(mp);
1062 lck_rw_lock_exclusive(&mp->mnt_rwlock);
1063 is_rwlock_locked = TRUE;
1064 /*
1065 * We only allow the filesystem to be reloaded if it
1066 * is currently mounted read-only.
1067 */
1068 if ((flags & MNT_RELOAD) &&
1069 ((mp->mnt_flag & MNT_RDONLY) == 0)) {
1070 error = ENOTSUP;
1071 goto out1;
1072 }
1073
1074 /*
1075 * If content protection is enabled, update mounts are not
1076 * allowed to turn it off.
1077 */
1078 if ((mp->mnt_flag & MNT_CPROTECT) &&
1079 ((flags & MNT_CPROTECT) == 0)) {
1080 error = EINVAL;
1081 goto out1;
1082 }
1083
1084 /*
1085 * can't turn off MNT_REMOVABLE either but it may be an unexpected
1086 * failure to return an error for this so we'll just silently
1087 * add it if it is not passed in.
1088 */
1089 if ((mp->mnt_flag & MNT_REMOVABLE) &&
1090 ((flags & MNT_REMOVABLE) == 0)) {
1091 flags |= MNT_REMOVABLE;
1092 }
1093
1094 /* Can't downgrade the backer of the root FS */
1095 if ((mp->mnt_kern_flag & MNTK_BACKS_ROOT) &&
1096 (!vfs_isrdonly(mp)) && (flags & MNT_RDONLY)) {
1097 error = ENOTSUP;
1098 goto out1;
1099 }
1100
1101 /*
1102 * Only root, or the user that did the original mount is
1103 * permitted to update it.
1104 */
1105 if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
1106 (error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
1107 goto out1;
1108 }
1109 #if CONFIG_MACF
1110 error = mac_mount_check_remount(ctx, mp);
1111 if (error != 0) {
1112 goto out1;
1113 }
1114 #endif
1115 /*
1116 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV,
1117 * and MNT_NOEXEC if mount point is already MNT_NOEXEC.
1118 */
1119 if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
1120 flags |= MNT_NOSUID | MNT_NODEV;
1121 if (mp->mnt_flag & MNT_NOEXEC) {
1122 flags |= MNT_NOEXEC;
1123 }
1124 }
1125 flag = mp->mnt_flag;
1126 flag_set = true;
1127
1128
1129
1130 mp->mnt_flag |= flags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE);
1131
1132 vfsp = mp->mnt_vtable;
1133 goto update;
1134 } // MNT_UPDATE
1135
1136 /*
1137 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV, and
1138 * MNT_NOEXEC if mount point is already MNT_NOEXEC.
1139 */
1140 if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
1141 flags |= MNT_NOSUID | MNT_NODEV;
1142 if (vp->v_mount->mnt_flag & MNT_NOEXEC) {
1143 flags |= MNT_NOEXEC;
1144 }
1145 }
1146
1147 /* XXXAUDIT: Should we capture the type on the error path as well? */
1148 /* XXX cast-away const (audit_arg_text() does not modify its input) */
1149 AUDIT_ARG(text, (char *)(uintptr_t)fstypename);
1150 mount_list_lock();
1151 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
1152 if (!strncmp(vfsp->vfc_name, fstypename, MFSNAMELEN)) {
1153 vfsp->vfc_refcount++;
1154 vfsp_ref = TRUE;
1155 break;
1156 }
1157 }
1158 mount_list_unlock();
1159 if (vfsp == NULL) {
1160 error = ENODEV;
1161 goto out1;
1162 }
1163
1164 /*
1165 * VFC_VFSLOCALARGS is not currently supported for kernel mounts,
1166 * except in ROSV configs and for the initial BaseSystem root.
1167 */
1168 if (kernelmount && (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) &&
1169 ((internal_flags & KERNEL_MOUNT_VOLBYROLE_MASK) == 0) &&
1170 ((internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) == 0)) {
1171 error = EINVAL; /* unsupported request */
1172 goto out1;
1173 }
1174
1175 error = prepare_coveredvp(vp, ctx, cnp, fstypename, internal_flags);
1176 if (error != 0) {
1177 goto out1;
1178 }
1179
1180 /*
1181 * Allocate and initialize the filesystem (mount_t)
1182 */
1183 mp = zalloc_flags(mount_zone, Z_WAITOK | Z_ZERO);
1184 mntalloc = 1;
1185
1186 /* Initialize the default IO constraints */
1187 mp->mnt_maxreadcnt = mp->mnt_maxwritecnt = MAXPHYS;
1188 mp->mnt_segreadcnt = mp->mnt_segwritecnt = 32;
1189 mp->mnt_maxsegreadsize = mp->mnt_maxreadcnt;
1190 mp->mnt_maxsegwritesize = mp->mnt_maxwritecnt;
1191 mp->mnt_devblocksize = DEV_BSIZE;
1192 mp->mnt_alignmentmask = PAGE_MASK;
1193 mp->mnt_ioqueue_depth = MNT_DEFAULT_IOQUEUE_DEPTH;
1194 mp->mnt_ioscale = 1;
1195 mp->mnt_ioflags = 0;
1196 mp->mnt_realrootvp = NULLVP;
1197 mp->mnt_authcache_ttl = CACHED_LOOKUP_RIGHT_TTL;
1198
1199 mp->mnt_lflag |= MNT_LMOUNT;
1200 did_set_lmount = TRUE;
1201
1202 TAILQ_INIT(&mp->mnt_vnodelist);
1203 TAILQ_INIT(&mp->mnt_workerqueue);
1204 TAILQ_INIT(&mp->mnt_newvnodes);
1205 mount_lock_init(mp);
1206 lck_rw_lock_exclusive(&mp->mnt_rwlock);
1207 is_rwlock_locked = TRUE;
1208 mp->mnt_op = vfsp->vfc_vfsops;
1209 mp->mnt_vtable = vfsp;
1210 //mp->mnt_stat.f_type = vfsp->vfc_typenum;
1211 mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
1212 strlcpy(mp->mnt_vfsstat.f_fstypename, vfsp->vfc_name, MFSTYPENAMELEN);
1213 do {
1214 size_t pathlen = MAXPATHLEN;
1215
1216 if (vn_getpath_ext(vp, pvp, mp->mnt_vfsstat.f_mntonname, &pathlen, VN_GETPATH_FSENTER)) {
1217 strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);
1218 }
1219 } while (0);
1220 mp->mnt_vnodecovered = vp;
1221 mp->mnt_vfsstat.f_owner = kauth_cred_getuid(vfs_context_ucred(ctx));
1222 mp->mnt_throttle_mask = LOWPRI_MAX_NUM_DEV - 1;
1223 mp->mnt_devbsdunit = 0;
1224 mp->mnt_mount_id = os_atomic_inc_orig(&mount_unique_id, relaxed);
1225
1226 /* XXX 3762912 hack to support HFS filesystem 'owner' - filesystem may update later */
1227 vfs_setowner(mp, KAUTH_UID_NONE, KAUTH_GID_NONE);
1228
1229 if (kernelmount) {
1230 mp->mnt_kern_flag |= MNTK_KERNEL_MOUNT;
1231 }
1232 if ((internal_flags & KERNEL_MOUNT_PERMIT_UNMOUNT) != 0) {
1233 mp->mnt_kern_flag |= MNTK_PERMIT_UNMOUNT;
1234 }
1235
1236 if (KERNEL_MOUNT_DEVFS & internal_flags) {
1237 // kernel mounted devfs
1238 mp->mnt_kern_flag |= MNTK_SYSTEM;
1239 }
1240
1241 update:
1242
1243 /*
1244 * Set the mount level flags.
1245 */
1246 if (flags & MNT_RDONLY) {
1247 mp->mnt_flag |= MNT_RDONLY;
1248 } else if (mp->mnt_flag & MNT_RDONLY) {
1249 // disallow read/write upgrades of file systems that
1250 // had the TYPENAME_OVERRIDE feature set.
1251 if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
1252 error = EPERM;
1253 goto out1;
1254 }
1255 mp->mnt_kern_flag |= MNTK_WANTRDWR;
1256 }
1257 mp->mnt_flag &= ~(MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
1258 MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
1259 MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
1260 MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME | MNT_STRICTATIME |
1261 MNT_QUARANTINE | MNT_CPROTECT);
1262
1263 #if SECURE_KERNEL
1264 #if !CONFIG_MNT_SUID
1265 /*
1266 * On release builds of iOS based platforms, always enforce NOSUID on
1267 * all mounts. We do this here because we can catch update mounts as well as
1268 * non-update mounts in this case.
1269 */
1270 mp->mnt_flag |= (MNT_NOSUID);
1271 #endif
1272 #endif
1273
1274 mp->mnt_flag |= flags & (MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
1275 MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
1276 MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
1277 MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME | MNT_STRICTATIME |
1278 MNT_QUARANTINE | MNT_CPROTECT);
1279
1280 #if CONFIG_MACF
1281 if (flags & MNT_MULTILABEL) {
1282 if (vfsp->vfc_vfsflags & VFC_VFSNOMACLABEL) {
1283 error = EINVAL;
1284 goto out1;
1285 }
1286 mp->mnt_flag |= MNT_MULTILABEL;
1287 }
1288 #endif
1289 /*
1290 * Process device path for local file systems if requested.
1291 *
1292 * Snapshot and mount-by-role mounts do not use this path; they are
1293 * passing other opaque data in the device path field.
1294 *
1295 * Basesystemroot mounts pass a device path to be resolved here,
1296 * but it's just a char * already inside the kernel, which
1297 * kernel_mount() shoved into a user_addr_t to call us. So for such
1298 * mounts we must skip copyin (both of the address and of the string
1299 * (in NDINIT).
1300 */
1301 if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS &&
1302 !(internal_flags & (KERNEL_MOUNT_SNAPSHOT | KERNEL_MOUNT_VOLBYROLE_MASK))) {
1303 boolean_t do_copyin_devpath = true;
1304 #if CONFIG_BASESYSTEMROOT
1305 if (internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) {
1306 // KERNEL_MOUNT_BASESYSTEMROOT implies subtle behavior worh nothing:
1307 // We have been passed fsmountargs, which is typed as a user_addr_t,
1308 // but is actually a char ** pointing to a (kernelspace) string.
1309 // We manually unpack it with a series of casts and dereferences
1310 // that reverses what was done just above us on the stack in
1311 // imageboot_pivot_image().
1312 // After retrieving the path to the dev node (which we will NDINIT
1313 // in a moment), we pass NULL fsmountargs on to the filesystem.
1314 _Static_assert(sizeof(char **) == sizeof(fsmountargs), "fsmountargs should fit a (kernel) address");
1315 char **devnamepp = (char **)fsmountargs;
1316 char *devnamep = *devnamepp;
1317 devpath = CAST_USER_ADDR_T(devnamep);
1318 do_copyin_devpath = false;
1319 fsmountargs = USER_ADDR_NULL;
1320
1321 //Now that we have a mp, denote that this mount is for the basesystem.
1322 mp->mnt_supl_kern_flag |= MNTK_SUPL_BASESYSTEM;
1323 }
1324 #endif // CONFIG_BASESYSTEMROOT
1325
1326 if (do_copyin_devpath) {
1327 if (vfs_context_is64bit(ctx)) {
1328 if ((error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath)))) {
1329 goto out1;
1330 }
1331 fsmountargs += sizeof(devpath);
1332 } else {
1333 user32_addr_t tmp;
1334 if ((error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp)))) {
1335 goto out1;
1336 }
1337 /* munge into LP64 addr */
1338 devpath = CAST_USER_ADDR_T(tmp);
1339 fsmountargs += sizeof(tmp);
1340 }
1341 }
1342
1343 /* Lookup device and authorize access to it */
1344 if ((devpath)) {
1345 struct nameidata nd;
1346
1347 enum uio_seg seg = UIO_USERSPACE;
1348 #if CONFIG_BASESYSTEMROOT
1349 if (internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) {
1350 seg = UIO_SYSSPACE;
1351 }
1352 #endif // CONFIG_BASESYSTEMROOT
1353
1354 NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW, seg, devpath, ctx);
1355 if ((error = namei(&nd))) {
1356 goto out1;
1357 }
1358
1359 strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);
1360 devvp = nd.ni_vp;
1361
1362 nameidone(&nd);
1363
1364 if (devvp->v_type != VBLK) {
1365 error = ENOTBLK;
1366 goto out2;
1367 }
1368 if (major(devvp->v_rdev) >= nblkdev) {
1369 error = ENXIO;
1370 goto out2;
1371 }
1372 /*
1373 * If mount by non-root, then verify that user has necessary
1374 * permissions on the device.
1375 */
1376 if (suser(vfs_context_ucred(ctx), NULL) != 0) {
1377 mode_t accessmode = KAUTH_VNODE_READ_DATA;
1378
1379 if ((mp->mnt_flag & MNT_RDONLY) == 0) {
1380 accessmode |= KAUTH_VNODE_WRITE_DATA;
1381 }
1382 if ((error = vnode_authorize(devvp, NULL, accessmode, ctx)) != 0) {
1383 goto out2;
1384 }
1385 }
1386 }
1387 /* On first mount, preflight and open device */
1388 if (devpath && ((flags & MNT_UPDATE) == 0)) {
1389 if ((error = vnode_ref(devvp))) {
1390 goto out2;
1391 }
1392 /*
1393 * Disallow multiple mounts of the same device.
1394 * Disallow mounting of a device that is currently in use
1395 * (except for root, which might share swap device for miniroot).
1396 * Flush out any old buffers remaining from a previous use.
1397 */
1398 if ((error = vfs_mountedon(devvp))) {
1399 goto out3;
1400 }
1401
1402 if (vcount(devvp) > 1 && !(vfs_flags(mp) & MNT_ROOTFS)) {
1403 error = EBUSY;
1404 goto out3;
1405 }
1406 if ((error = VNOP_FSYNC(devvp, MNT_WAIT, ctx))) {
1407 error = ENOTBLK;
1408 goto out3;
1409 }
1410 if ((error = buf_invalidateblks(devvp, BUF_WRITE_DATA, 0, 0))) {
1411 goto out3;
1412 }
1413
1414 ronly = (mp->mnt_flag & MNT_RDONLY) != 0;
1415 #if CONFIG_MACF
1416 error = mac_vnode_check_open(ctx,
1417 devvp,
1418 ronly ? FREAD : FREAD | FWRITE);
1419 if (error) {
1420 goto out3;
1421 }
1422 #endif /* MAC */
1423 if ((error = VNOP_OPEN(devvp, ronly ? FREAD : FREAD | FWRITE, ctx))) {
1424 goto out3;
1425 }
1426
1427 mp->mnt_devvp = devvp;
1428 device_vnode = devvp;
1429 } else if ((mp->mnt_flag & MNT_RDONLY) &&
1430 (mp->mnt_kern_flag & MNTK_WANTRDWR) &&
1431 (device_vnode = mp->mnt_devvp)) {
1432 dev_t dev;
1433 int maj;
1434 /*
1435 * If upgrade to read-write by non-root, then verify
1436 * that user has necessary permissions on the device.
1437 */
1438 vnode_getalways(device_vnode);
1439
1440 if (suser(vfs_context_ucred(ctx), NULL) &&
1441 (error = vnode_authorize(device_vnode, NULL,
1442 KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA,
1443 ctx)) != 0) {
1444 vnode_put(device_vnode);
1445 goto out2;
1446 }
1447
1448 /* Tell the device that we're upgrading */
1449 dev = (dev_t)device_vnode->v_rdev;
1450 maj = major(dev);
1451
1452 if ((u_int)maj >= (u_int)nblkdev) {
1453 panic("Volume mounted on a device with invalid major number.");
1454 }
1455
1456 error = bdevsw[maj].d_open(dev, FREAD | FWRITE, S_IFBLK, p);
1457 vnode_put(device_vnode);
1458 device_vnode = NULLVP;
1459 if (error != 0) {
1460 goto out2;
1461 }
1462 }
1463 } // localargs && !(snapshot | data | vm)
1464
1465 #if CONFIG_MACF
1466 if ((flags & MNT_UPDATE) == 0) {
1467 mac_mount_label_init(mp);
1468 mac_mount_label_associate(ctx, mp);
1469 }
1470 if (labelstr) {
1471 if ((flags & MNT_UPDATE) != 0) {
1472 error = mac_mount_check_label_update(ctx, mp);
1473 if (error != 0) {
1474 goto out3;
1475 }
1476 }
1477 }
1478 #endif
1479 /*
1480 * Mount the filesystem. We already asserted that internal_flags
1481 * cannot have more than one mount-by-role bit set.
1482 */
1483 if (internal_flags & KERNEL_MOUNT_SNAPSHOT) {
1484 error = VFS_IOCTL(mp, VFSIOC_MOUNT_SNAPSHOT,
1485 (caddr_t)fsmountargs, 0, ctx);
1486 } else if (internal_flags & KERNEL_MOUNT_DATAVOL) {
1487 #if CONFIG_ROSV_STARTUP
1488 struct mount *origin_mp = (struct mount*)fsmountargs;
1489 fs_role_mount_args_t frma = {origin_mp, VFS_DATA_ROLE};
1490 error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
1491 if (error) {
1492 printf("MOUNT-BY-ROLE (%d) failed! (%d)", VFS_DATA_ROLE, error);
1493 } else {
1494 /* Mark volume associated with system volume */
1495 mp->mnt_kern_flag |= MNTK_SYSTEM;
1496
1497 /* Attempt to acquire the mnt_devvp and set it up */
1498 struct vnode *mp_devvp = NULL;
1499 if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
1500 errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
1501 0, &mp_devvp, vfs_context_kernel());
1502 if (!lerr) {
1503 mp->mnt_devvp = mp_devvp;
1504 //vnode_lookup took an iocount, need to drop it.
1505 vnode_put(mp_devvp);
1506 // now set `device_vnode` to the devvp that was acquired.
1507 // this is needed in order to ensure vfs_init_io_attributes is invoked.
1508 // note that though the iocount above was dropped, the mount acquires
1509 // an implicit reference against the device.
1510 device_vnode = mp_devvp;
1511 }
1512 }
1513 }
1514 #else
1515 error = EINVAL;
1516 #endif
1517 } else if (internal_flags & KERNEL_MOUNT_VMVOL) {
1518 #if CONFIG_MOUNT_VM
1519 struct mount *origin_mp = (struct mount*)fsmountargs;
1520 fs_role_mount_args_t frma = {origin_mp, VFS_VM_ROLE};
1521 error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
1522 if (error) {
1523 printf("MOUNT-BY-ROLE (%d) failed! (%d)", VFS_VM_ROLE, error);
1524 } else {
1525 /* Mark volume associated with system volume and a swap mount */
1526 mp->mnt_kern_flag |= (MNTK_SYSTEM | MNTK_SWAP_MOUNT);
1527 /* Attempt to acquire the mnt_devvp and set it up */
1528 struct vnode *mp_devvp = NULL;
1529 if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
1530 errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
1531 0, &mp_devvp, vfs_context_kernel());
1532 if (!lerr) {
1533 mp->mnt_devvp = mp_devvp;
1534 //vnode_lookup took an iocount, need to drop it.
1535 vnode_put(mp_devvp);
1536
1537 // now set `device_vnode` to the devvp that was acquired.
1538 // note that though the iocount above was dropped, the mount acquires
1539 // an implicit reference against the device.
1540 device_vnode = mp_devvp;
1541 }
1542 }
1543 }
1544 #else
1545 error = EINVAL;
1546 #endif
1547 } else if ((internal_flags & KERNEL_MOUNT_PREBOOTVOL) || (internal_flags & KERNEL_MOUNT_RECOVERYVOL)) {
1548 #if CONFIG_MOUNT_PREBOOTRECOVERY
1549 struct mount *origin_mp = (struct mount*)fsmountargs;
1550 uint32_t mount_role = 0;
1551 if (internal_flags & KERNEL_MOUNT_PREBOOTVOL) {
1552 mount_role = VFS_PREBOOT_ROLE;
1553 } else if (internal_flags & KERNEL_MOUNT_RECOVERYVOL) {
1554 mount_role = VFS_RECOVERY_ROLE;
1555 }
1556
1557 if (mount_role != 0) {
1558 fs_role_mount_args_t frma = {origin_mp, mount_role};
1559 error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
1560 if (error) {
1561 printf("MOUNT-BY-ROLE (%d) failed! (%d)", mount_role, error);
1562 } else {
1563 // NOT YET - need to qualify how this interacts with shutdown, ERP/ERB, etc
1564 /* Mark volume associated with system volume */
1565 //mp->mnt_kern_flag |= MNTK_SYSTEM;
1566 /* Attempt to acquire the mnt_devvp and set it up */
1567 struct vnode *mp_devvp = NULL;
1568 if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
1569 errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
1570 0, &mp_devvp, vfs_context_kernel());
1571 if (!lerr) {
1572 mp->mnt_devvp = mp_devvp;
1573 //vnode_lookup took an iocount, need to drop it.
1574 vnode_put(mp_devvp);
1575
1576 // now set `device_vnode` to the devvp that was acquired.
1577 // note that though the iocount above was dropped, the mount acquires
1578 // an implicit reference against the device.
1579 device_vnode = mp_devvp;
1580 }
1581 }
1582 }
1583 } else {
1584 printf("MOUNT-BY-ROLE (%d) failed - ROLE UNRECOGNIZED! (%d)", mount_role, error);
1585 error = EINVAL;
1586 }
1587 #else
1588 error = EINVAL;
1589 #endif
1590 } else {
1591 error = VFS_MOUNT(mp, device_vnode, fsmountargs, ctx);
1592 }
1593
1594 if (flags & MNT_UPDATE) {
1595 if (mp->mnt_kern_flag & MNTK_WANTRDWR) {
1596 mp->mnt_flag &= ~MNT_RDONLY;
1597 }
1598 mp->mnt_flag &= ~
1599 (MNT_UPDATE | MNT_RELOAD | MNT_FORCE);
1600 mp->mnt_kern_flag &= ~MNTK_WANTRDWR;
1601 if (error) {
1602 mp->mnt_flag = flag; /* restore flag value */
1603 }
1604 vfs_event_signal(NULL, VQ_UPDATE, (intptr_t)NULL);
1605 lck_rw_done(&mp->mnt_rwlock);
1606 is_rwlock_locked = FALSE;
1607 if (!error) {
1608 enablequotas(mp, ctx);
1609 }
1610 goto exit;
1611 }
1612
1613 /*
1614 * Put the new filesystem on the mount list after root.
1615 */
1616 if (error == 0) {
1617 struct vfs_attr vfsattr;
1618 if (device_vnode) {
1619 /*
1620 * cache the IO attributes for the underlying physical media...
1621 * an error return indicates the underlying driver doesn't
1622 * support all the queries necessary... however, reasonable
1623 * defaults will have been set, so no reason to bail or care
1624 *
1625 * Need to do this before calling the MAC hook as it needs
1626 * information from this call.
1627 */
1628 vfs_init_io_attributes(device_vnode, mp);
1629 }
1630
1631 #if CONFIG_MACF
1632 error = mac_mount_check_mount_late(ctx, mp);
1633 if (error != 0) {
1634 goto out4;
1635 }
1636
1637 if (vfs_flags(mp) & MNT_MULTILABEL) {
1638 error = VFS_ROOT(mp, &rvp, ctx);
1639 if (error) {
1640 printf("%s() VFS_ROOT returned %d\n", __func__, error);
1641 goto out4;
1642 }
1643 error = vnode_label(mp, NULL, rvp, NULL, 0, ctx);
1644 /*
1645 * drop reference provided by VFS_ROOT
1646 */
1647 vnode_put(rvp);
1648
1649 if (error) {
1650 goto out4;
1651 }
1652 }
1653 #endif /* MAC */
1654
1655 vnode_lock_spin(vp);
1656 CLR(vp->v_flag, VMOUNT);
1657 vp->v_mountedhere = mp;
1658 vnode_unlock(vp);
1659
1660 /*
1661 * taking the name_cache_lock exclusively will
1662 * insure that everyone is out of the fast path who
1663 * might be trying to use a now stale copy of
1664 * vp->v_mountedhere->mnt_realrootvp
1665 * bumping mount_generation causes the cached values
1666 * to be invalidated
1667 */
1668 name_cache_lock();
1669 mount_generation++;
1670 name_cache_unlock();
1671
1672 error = vnode_ref(vp);
1673 if (error != 0) {
1674 goto out4;
1675 }
1676
1677 have_usecount = TRUE;
1678
1679 error = checkdirs(vp, ctx);
1680 if (error != 0) {
1681 /* Unmount the filesystem as cdir/rdirs cannot be updated */
1682 goto out4;
1683 }
1684 /*
1685 * there is no cleanup code here so I have made it void
1686 * we need to revisit this
1687 */
1688 (void)VFS_START(mp, 0, ctx);
1689
1690 if (mount_list_add(mp) != 0) {
1691 /*
1692 * The system is shutting down trying to umount
1693 * everything, so fail with a plausible errno.
1694 */
1695 error = EBUSY;
1696 goto out4;
1697 }
1698 lck_rw_done(&mp->mnt_rwlock);
1699 is_rwlock_locked = FALSE;
1700
1701 /* Check if this mounted file system supports EAs or named streams. */
1702 /* Skip WebDAV file systems for now since they hang in VFS_GETATTR here. */
1703 VFSATTR_INIT(&vfsattr);
1704 VFSATTR_WANTED(&vfsattr, f_capabilities);
1705 if (strncmp(mp->mnt_vfsstat.f_fstypename, "webdav", sizeof("webdav")) != 0 &&
1706 vfs_getattr(mp, &vfsattr, ctx) == 0 &&
1707 VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
1708 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR) &&
1709 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR)) {
1710 mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
1711 }
1712 #if NAMEDSTREAMS
1713 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS) &&
1714 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS)) {
1715 mp->mnt_kern_flag |= MNTK_NAMED_STREAMS;
1716 }
1717 #endif
1718 /* Check if this file system supports path from id lookups. */
1719 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID) &&
1720 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID)) {
1721 mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
1722 } else if (mp->mnt_flag & MNT_DOVOLFS) {
1723 /* Legacy MNT_DOVOLFS flag also implies path from id lookups. */
1724 mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
1725 }
1726
1727 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS) &&
1728 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS)) {
1729 mp->mnt_kern_flag |= MNTK_DIR_HARDLINKS;
1730 }
1731 }
1732 if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSNATIVEXATTR) {
1733 mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
1734 }
1735 if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSPREFLIGHT) {
1736 mp->mnt_kern_flag |= MNTK_UNMOUNT_PREFLIGHT;
1737 }
1738 /* increment the operations count */
1739 OSAddAtomic(1, &vfs_nummntops);
1740 enablequotas(mp, ctx);
1741
1742 if (device_vnode) {
1743 device_vnode->v_specflags |= SI_MOUNTEDON;
1744 }
1745
1746 /* Now that mount is setup, notify the listeners */
1747 vfs_notify_mount(pvp);
1748 IOBSDMountChange(mp, kIOMountChangeMount);
1749 } else {
1750 /* If we fail a fresh mount, there should be no vnodes left hooked into the mountpoint. */
1751 if (mp->mnt_vnodelist.tqh_first != NULL) {
1752 panic("mount_common(): mount of %s filesystem failed with %d, but vnode list is not empty.",
1753 mp->mnt_vtable->vfc_name, error);
1754 }
1755
1756 vnode_lock_spin(vp);
1757 CLR(vp->v_flag, VMOUNT);
1758 vnode_unlock(vp);
1759 mount_list_lock();
1760 mp->mnt_vtable->vfc_refcount--;
1761 mount_list_unlock();
1762
1763 if (device_vnode) {
1764 vnode_rele(device_vnode);
1765 VNOP_CLOSE(device_vnode, ronly ? FREAD : FREAD | FWRITE, ctx);
1766 }
1767 lck_rw_done(&mp->mnt_rwlock);
1768 is_rwlock_locked = FALSE;
1769
1770 /*
1771 * if we get here, we have a mount structure that needs to be freed,
1772 * but since the coveredvp hasn't yet been updated to point at it,
1773 * no need to worry about other threads holding a crossref on this mp
1774 * so it's ok to just free it
1775 */
1776 mount_lock_destroy(mp);
1777 #if CONFIG_MACF
1778 mac_mount_label_destroy(mp);
1779 #endif
1780 zfree(mount_zone, mp);
1781 did_set_lmount = false;
1782 }
1783 exit:
1784 /*
1785 * drop I/O count on the device vp if there was one
1786 */
1787 if (devpath && devvp) {
1788 vnode_put(devvp);
1789 }
1790
1791 if (did_set_lmount) {
1792 mount_lock_spin(mp);
1793 mp->mnt_lflag &= ~MNT_LMOUNT;
1794 mount_unlock(mp);
1795 }
1796
1797 return error;
1798
1799 /* Error condition exits */
1800 out4:
1801 (void)VFS_UNMOUNT(mp, MNT_FORCE, ctx);
1802
1803 /*
1804 * If the mount has been placed on the covered vp,
1805 * it may have been discovered by now, so we have
1806 * to treat this just like an unmount
1807 */
1808 mount_lock_spin(mp);
1809 mp->mnt_lflag |= MNT_LDEAD;
1810 mount_unlock(mp);
1811
1812 if (device_vnode != NULLVP) {
1813 vnode_rele(device_vnode);
1814 VNOP_CLOSE(device_vnode, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD | FWRITE,
1815 ctx);
1816 did_rele = TRUE;
1817 }
1818
1819 vnode_lock_spin(vp);
1820
1821 mp->mnt_crossref++;
1822 vp->v_mountedhere = (mount_t) 0;
1823
1824 vnode_unlock(vp);
1825
1826 if (have_usecount) {
1827 vnode_rele(vp);
1828 }
1829 out3:
1830 if (devpath && ((flags & MNT_UPDATE) == 0) && (!did_rele)) {
1831 vnode_rele(devvp);
1832 }
1833 out2:
1834 if (devpath && devvp) {
1835 vnode_put(devvp);
1836 }
1837 out1:
1838 /* Release mnt_rwlock only when it was taken */
1839 if (is_rwlock_locked == TRUE) {
1840 if (flag_set) {
1841 mp->mnt_flag = flag; /* restore mnt_flag value */
1842 }
1843 lck_rw_done(&mp->mnt_rwlock);
1844 }
1845
1846 if (did_set_lmount) {
1847 mount_lock_spin(mp);
1848 mp->mnt_lflag &= ~MNT_LMOUNT;
1849 mount_unlock(mp);
1850 }
1851
1852 if (mntalloc) {
1853 if (mp->mnt_crossref) {
1854 mount_dropcrossref(mp, vp, 0);
1855 } else {
1856 mount_lock_destroy(mp);
1857 #if CONFIG_MACF
1858 mac_mount_label_destroy(mp);
1859 #endif
1860 zfree(mount_zone, mp);
1861 }
1862 }
1863 if (vfsp_ref) {
1864 mount_list_lock();
1865 vfsp->vfc_refcount--;
1866 mount_list_unlock();
1867 }
1868
1869 return error;
1870 }
1871
1872 /*
1873 * Flush in-core data, check for competing mount attempts,
1874 * and set VMOUNT
1875 */
1876 int
prepare_coveredvp(vnode_t vp,vfs_context_t ctx,struct componentname * cnp,const char * fsname,uint32_t internal_flags)1877 prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, uint32_t internal_flags)
1878 {
1879 #if !CONFIG_MACF
1880 #pragma unused(cnp,fsname)
1881 #endif
1882 struct vnode_attr va;
1883 int error;
1884 boolean_t skip_auth = !!(internal_flags & KERNEL_MOUNT_NOAUTH);
1885 boolean_t is_fmount = !!(internal_flags & KERNEL_MOUNT_FMOUNT);
1886 boolean_t is_busy;
1887
1888 if (!skip_auth) {
1889 /*
1890 * If the user is not root, ensure that they own the directory
1891 * onto which we are attempting to mount.
1892 */
1893 VATTR_INIT(&va);
1894 VATTR_WANTED(&va, va_uid);
1895 if ((error = vnode_getattr(vp, &va, ctx)) ||
1896 (va.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
1897 (!vfs_context_issuser(ctx)))) {
1898 error = EPERM;
1899 goto out;
1900 }
1901 }
1902
1903 if ((error = VNOP_FSYNC(vp, MNT_WAIT, ctx))) {
1904 goto out;
1905 }
1906
1907 if ((error = buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0))) {
1908 goto out;
1909 }
1910
1911 if (vp->v_type != VDIR) {
1912 error = ENOTDIR;
1913 goto out;
1914 }
1915
1916 vnode_lock_spin(vp);
1917 is_busy = is_fmount ?
1918 (ISSET(vp->v_flag, VMOUNT) || (vp->v_mountedhere != NULL)) :
1919 (ISSET(vp->v_flag, VMOUNT) && (vp->v_mountedhere != NULL));
1920 if (is_busy) {
1921 vnode_unlock(vp);
1922 error = EBUSY;
1923 goto out;
1924 }
1925 SET(vp->v_flag, VMOUNT);
1926 vnode_unlock(vp);
1927
1928 #if CONFIG_MACF
1929 error = mac_mount_check_mount(ctx, vp,
1930 cnp, fsname);
1931 if (error != 0) {
1932 vnode_lock_spin(vp);
1933 CLR(vp->v_flag, VMOUNT);
1934 vnode_unlock(vp);
1935 }
1936 #endif
1937
1938 out:
1939 return error;
1940 }
1941
1942 #if CONFIG_IMGSRC_ACCESS
1943
1944 #define DEBUG_IMGSRC 0
1945
1946 #if DEBUG_IMGSRC
1947 #define IMGSRC_DEBUG(args...) printf("imgsrc: " args)
1948 #else
1949 #define IMGSRC_DEBUG(args...) do { } while(0)
1950 #endif
1951
1952 static int
authorize_devpath_and_update_mntfromname(mount_t mp,user_addr_t devpath,vnode_t * devvpp,vfs_context_t ctx)1953 authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx)
1954 {
1955 struct nameidata nd;
1956 vnode_t vp, realdevvp;
1957 mode_t accessmode;
1958 int error;
1959 enum uio_seg uio = UIO_USERSPACE;
1960
1961 if (ctx == vfs_context_kernel()) {
1962 uio = UIO_SYSSPACE;
1963 }
1964
1965 NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW, uio, devpath, ctx);
1966 if ((error = namei(&nd))) {
1967 IMGSRC_DEBUG("namei() failed with %d\n", error);
1968 return error;
1969 }
1970
1971 vp = nd.ni_vp;
1972
1973 if (!vnode_isblk(vp)) {
1974 IMGSRC_DEBUG("Not block device.\n");
1975 error = ENOTBLK;
1976 goto out;
1977 }
1978
1979 realdevvp = mp->mnt_devvp;
1980 if (realdevvp == NULLVP) {
1981 IMGSRC_DEBUG("No device backs the mount.\n");
1982 error = ENXIO;
1983 goto out;
1984 }
1985
1986 error = vnode_getwithref(realdevvp);
1987 if (error != 0) {
1988 IMGSRC_DEBUG("Coudn't get iocount on device.\n");
1989 goto out;
1990 }
1991
1992 if (vnode_specrdev(vp) != vnode_specrdev(realdevvp)) {
1993 IMGSRC_DEBUG("Wrong dev_t.\n");
1994 error = ENXIO;
1995 goto out1;
1996 }
1997
1998 strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);
1999
2000 /*
2001 * If mount by non-root, then verify that user has necessary
2002 * permissions on the device.
2003 */
2004 if (!vfs_context_issuser(ctx)) {
2005 accessmode = KAUTH_VNODE_READ_DATA;
2006 if ((mp->mnt_flag & MNT_RDONLY) == 0) {
2007 accessmode |= KAUTH_VNODE_WRITE_DATA;
2008 }
2009 if ((error = vnode_authorize(vp, NULL, accessmode, ctx)) != 0) {
2010 IMGSRC_DEBUG("Access denied.\n");
2011 goto out1;
2012 }
2013 }
2014
2015 *devvpp = vp;
2016
2017 out1:
2018 vnode_put(realdevvp);
2019
2020 out:
2021 nameidone(&nd);
2022
2023 if (error) {
2024 vnode_put(vp);
2025 }
2026
2027 return error;
2028 }
2029
/*
 * Clear VMOUNT, set v_mountedhere, and mnt_vnodecovered, ref the vnode,
 * and call checkdirs()
 *
 * On success the covered vnode vp holds a usecount taken here and any
 * process whose cwd/root was vp has been migrated to the new mount's
 * root by checkdirs().  On failure mnt_vnodecovered is reset to NULLVP
 * and the usecount (if taken) has been dropped; note VMOUNT stays
 * cleared and v_mountedhere stays set — the caller unwinds those via
 * undo_place_on_covered_vp()/its own error path.
 */
static int
place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx)
{
	int error;

	mp->mnt_vnodecovered = vp; /* XXX This is normally only set at init-time ... */

	IMGSRC_DEBUG("placing: fsname = %s, vp = %s\n",
	    mp->mnt_vtable->vfc_name, vnode_getname(vp));

	/* Make the mount visible on the covered vnode; drop the in-progress marker. */
	vnode_lock_spin(vp);
	CLR(vp->v_flag, VMOUNT);
	vp->v_mountedhere = mp;
	vnode_unlock(vp);

	/*
	 * taking the name_cache_lock exclusively will
	 * insure that everyone is out of the fast path who
	 * might be trying to use a now stale copy of
	 * vp->v_mountedhere->mnt_realrootvp
	 * bumping mount_generation causes the cached values
	 * to be invalidated
	 */
	name_cache_lock();
	mount_generation++;
	name_cache_unlock();

	/* Hold a usecount on the covered vnode for the lifetime of the cover. */
	error = vnode_ref(vp);
	if (error != 0) {
		goto out;
	}

	error = checkdirs(vp, ctx);
	if (error != 0) {
		/* Unmount the filesystem as cdir/rdirs cannot be updated */
		vnode_rele(vp);
		goto out;
	}

out:
	if (error != 0) {
		mp->mnt_vnodecovered = NULLVP;
	}
	return error;
}
2079
/*
 * Undo place_mount_and_checkdirs(): drop the usecount it took on the
 * covered vnode and detach the mount from it (v_mountedhere and
 * mnt_vnodecovered are both reset).
 */
static void
undo_place_on_covered_vp(mount_t mp, vnode_t vp)
{
	/* Drop the usecount taken by place_mount_and_checkdirs(). */
	vnode_rele(vp);
	vnode_lock_spin(vp);
	vp->v_mountedhere = (mount_t)NULL;
	vnode_unlock(vp);

	mp->mnt_vnodecovered = NULLVP;
}
2090
2091 static int
mount_begin_update(mount_t mp,vfs_context_t ctx,int flags)2092 mount_begin_update(mount_t mp, vfs_context_t ctx, int flags)
2093 {
2094 int error;
2095
2096 /* unmount in progress return error */
2097 mount_lock_spin(mp);
2098 if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
2099 mount_unlock(mp);
2100 return EBUSY;
2101 }
2102 mount_unlock(mp);
2103 lck_rw_lock_exclusive(&mp->mnt_rwlock);
2104
2105 /*
2106 * We only allow the filesystem to be reloaded if it
2107 * is currently mounted read-only.
2108 */
2109 if ((flags & MNT_RELOAD) &&
2110 ((mp->mnt_flag & MNT_RDONLY) == 0)) {
2111 error = ENOTSUP;
2112 goto out;
2113 }
2114
2115 /*
2116 * Only root, or the user that did the original mount is
2117 * permitted to update it.
2118 */
2119 if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
2120 (!vfs_context_issuser(ctx))) {
2121 error = EPERM;
2122 goto out;
2123 }
2124 #if CONFIG_MACF
2125 error = mac_mount_check_remount(ctx, mp);
2126 if (error != 0) {
2127 goto out;
2128 }
2129 #endif
2130
2131 out:
2132 if (error) {
2133 lck_rw_done(&mp->mnt_rwlock);
2134 }
2135
2136 return error;
2137 }
2138
/*
 * Release the exclusive mount rwlock taken by mount_begin_update().
 */
static void
mount_end_update(mount_t mp)
{
	lck_rw_done(&mp->mnt_rwlock);
}
2144
2145 static int
get_imgsrc_rootvnode(uint32_t height,vnode_t * rvpp)2146 get_imgsrc_rootvnode(uint32_t height, vnode_t *rvpp)
2147 {
2148 vnode_t vp;
2149
2150 if (height >= MAX_IMAGEBOOT_NESTING) {
2151 return EINVAL;
2152 }
2153
2154 vp = imgsrc_rootvnodes[height];
2155 if ((vp != NULLVP) && (vnode_get(vp) == 0)) {
2156 *rvpp = vp;
2157 return 0;
2158 } else {
2159 return ENOENT;
2160 }
2161 }
2162
/*
 * Relocate the imageboot source filesystem (the one whose root is saved
 * in imgsrc_rootvnodes[height]) so that it covers vnode vp instead,
 * updating the mount-on name and, for local filesystems, the
 * mount-from name.  Root only; each mount can be moved exactly once
 * (tracked via MNTK_HAS_MOVED under the mount rwlock).
 *
 * pvp is the parent of the new covered vnode (used only for the mount
 * notification); cnp/fsname come from the caller's lookup; fsmountargs
 * is parsed per (is64bit, by_index) into {height, flags, devpath}.
 */
static int
relocate_imageboot_source(vnode_t pvp, vnode_t vp,
    struct componentname *cnp, const char *fsname, vfs_context_t ctx,
    boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index)
{
	int error;
	mount_t mp;
	boolean_t placed = FALSE;
	struct vfstable *vfsp;
	user_addr_t devpath;
	char *old_mntonname;
	vnode_t rvp;
	vnode_t devvp;
	uint32_t height;
	uint32_t flags;

	/* If we didn't imageboot, nothing to move */
	if (imgsrc_rootvnodes[0] == NULLVP) {
		return EINVAL;
	}

	/* Only root can do this */
	if (!vfs_context_issuser(ctx)) {
		return EPERM;
	}

	IMGSRC_DEBUG("looking for root vnode.\n");

	/*
	 * Get root vnode of filesystem we're moving.
	 */
	if (by_index) {
		if (is64bit) {
			struct user64_mnt_imgsrc_args mia64;
			error = copyin(fsmountargs, &mia64, sizeof(mia64));
			if (error != 0) {
				IMGSRC_DEBUG("Failed to copy in arguments.\n");
				return error;
			}

			height = mia64.mi_height;
			flags = mia64.mi_flags;
			devpath = (user_addr_t)mia64.mi_devpath;
		} else {
			struct user32_mnt_imgsrc_args mia32;
			error = copyin(fsmountargs, &mia32, sizeof(mia32));
			if (error != 0) {
				IMGSRC_DEBUG("Failed to copy in arguments.\n");
				return error;
			}

			height = mia32.mi_height;
			flags = mia32.mi_flags;
			devpath = mia32.mi_devpath;
		}
	} else {
		/*
		 * For binary compatibility--assumes one level of nesting.
		 */
		if (is64bit) {
			if ((error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath)))) {
				return error;
			}
		} else {
			user32_addr_t tmp;
			if ((error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp)))) {
				return error;
			}

			/* munge into LP64 addr */
			devpath = CAST_USER_ADDR_T(tmp);
		}

		height = 0;
		flags = 0;
	}

	/* No mi_flags bits are currently defined; reject anything nonzero. */
	if (flags != 0) {
		IMGSRC_DEBUG("%s: Got nonzero flags.\n", __FUNCTION__);
		return EINVAL;
	}

	/* rvp comes back with an iocount held; dropped on all exit paths. */
	error = get_imgsrc_rootvnode(height, &rvp);
	if (error != 0) {
		IMGSRC_DEBUG("getting old root vnode failed with %d\n", error);
		return error;
	}

	IMGSRC_DEBUG("got old root vnode\n");

	old_mntonname = zalloc_flags(ZV_NAMEI, Z_WAITOK);

	/* Can only move once */
	mp = vnode_mount(rvp);
	if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
		IMGSRC_DEBUG("Already moved.\n");
		error = EBUSY;
		goto out0;
	}

	IMGSRC_DEBUG("moving rvp: fsname = %s\n", mp->mnt_vtable->vfc_name);
	IMGSRC_DEBUG("Starting updated.\n");

	/* Get exclusive rwlock on mount, authorize update on mp */
	error = mount_begin_update(mp, ctx, 0);
	if (error != 0) {
		IMGSRC_DEBUG("Starting updated failed with %d\n", error);
		goto out0;
	}

	/*
	 * It can only be moved once. Flag is set under the rwlock,
	 * so we're now safe to proceed.
	 */
	if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
		IMGSRC_DEBUG("Already moved [2]\n");
		goto out1;
	}

	IMGSRC_DEBUG("Preparing coveredvp.\n");

	/* Mark covered vnode as mount in progress, authorize placing mount on top */
	error = prepare_coveredvp(vp, ctx, cnp, fsname, 0);
	if (error != 0) {
		IMGSRC_DEBUG("Preparing coveredvp failed with %d.\n", error);
		goto out1;
	}

	IMGSRC_DEBUG("Covered vp OK.\n");

	/* Sanity check the name caller has provided */
	vfsp = mp->mnt_vtable;
	if (strncmp(vfsp->vfc_name, fsname, MFSNAMELEN) != 0) {
		IMGSRC_DEBUG("Wrong fs name: actual = %s, expected = %s\n",
		    vfsp->vfc_name, fsname);
		error = EINVAL;
		goto out2;
	}

	/* Check the device vnode and update mount-from name, for local filesystems */
	if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) {
		IMGSRC_DEBUG("Local, doing device validation.\n");

		if (devpath != USER_ADDR_NULL) {
			error = authorize_devpath_and_update_mntfromname(mp, devpath, &devvp, ctx);
			if (error) {
				IMGSRC_DEBUG("authorize_devpath_and_update_mntfromname() failed.\n");
				goto out2;
			}

			/* Only needed the iocount for the validation; drop it now. */
			vnode_put(devvp);
		}
	}

	/*
	 * Place mp on top of vnode, ref the vnode, call checkdirs(),
	 * and increment the name cache's mount generation
	 */

	IMGSRC_DEBUG("About to call place_mount_and_checkdirs().\n");
	error = place_mount_and_checkdirs(mp, vp, ctx);
	if (error != 0) {
		goto out2;
	}

	placed = TRUE;

	/* Preserve the old mount-on name so out3 can restore it on failure. */
	strlcpy(old_mntonname, mp->mnt_vfsstat.f_mntonname, MAXPATHLEN);
	strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);

	/* Forbid future moves */
	mount_lock(mp);
	mp->mnt_kern_flag |= MNTK_HAS_MOVED;
	mount_unlock(mp);

	/* Finally, add to mount list, completely ready to go */
	if (mount_list_add(mp) != 0) {
		/*
		 * The system is shutting down trying to umount
		 * everything, so fail with a plausible errno.
		 */
		error = EBUSY;
		goto out3;
	}

	mount_end_update(mp);
	vnode_put(rvp);
	zfree(ZV_NAMEI, old_mntonname);

	vfs_notify_mount(pvp);

	return 0;
out3:
	/* Restore the previous mount-on name and allow a future move. */
	strlcpy(mp->mnt_vfsstat.f_mntonname, old_mntonname, MAXPATHLEN);

	mount_lock(mp);
	mp->mnt_kern_flag &= ~(MNTK_HAS_MOVED);
	mount_unlock(mp);

out2:
	/*
	 * Placing the mp on the vnode clears VMOUNT,
	 * so cleanup is different after that point
	 */
	if (placed) {
		/* Rele the vp, clear VMOUNT and v_mountedhere */
		undo_place_on_covered_vp(mp, vp);
	} else {
		vnode_lock_spin(vp);
		CLR(vp->v_flag, VMOUNT);
		vnode_unlock(vp);
	}
out1:
	mount_end_update(mp);

out0:
	vnode_put(rvp);
	zfree(ZV_NAMEI, old_mntonname);
	return error;
}
2383
2384 #endif /* CONFIG_IMGSRC_ACCESS */
2385
/*
 * Enable disk quotas on an HFS mount when the per-type quota trigger
 * files exist.
 *
 * For each quota type, look up "<mnt>/<QUOTAOPSNAME>.<type>"; when that
 * trigger file is present, issue Q_QUOTAON for the matching quota file
 * "<mnt>/<QUOTAFILENAME>.<type>".  Errors are deliberately ignored so
 * quota problems never interfere with completing the mount.
 */
void
enablequotas(struct mount *mp, vfs_context_t ctx)
{
	struct nameidata qnd;
	int type;
	char qfpath[MAXPATHLEN];
	const char *qfname = QUOTAFILENAME;
	const char *qfopsname = QUOTAOPSNAME;
	const char *qfextension[] = INITQFNAMES;

	/* XXX Should be an MNTK_ flag, instead of strncmp()'s */
	if (strncmp(mp->mnt_vfsstat.f_fstypename, "hfs", sizeof("hfs")) != 0) {
		return;
	}
	/*
	 * Enable filesystem disk quotas if necessary.
	 * We ignore errors as this should not interfere with final mount
	 */
	for (type = 0; type < MAXQUOTAS; type++) {
		snprintf(qfpath, sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfopsname, qfextension[type]);
		NDINIT(&qnd, LOOKUP, OP_MOUNT, FOLLOW, UIO_SYSSPACE,
		    CAST_USER_ADDR_T(qfpath), ctx);
		if (namei(&qnd) != 0) {
			continue; /* option file to trigger quotas is not present */
		}
		/* Only existence matters; release the looked-up vnode immediately. */
		vnode_put(qnd.ni_vp);
		nameidone(&qnd);
		snprintf(qfpath, sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfname, qfextension[type]);

		(void) VFS_QUOTACTL(mp, QCMD(Q_QUOTAON, type), 0, qfpath, ctx);
	}
	return;
}
2419
2420
/*
 * Per-process callback for checkdirs(): if this process's current or
 * root directory is the covered vnode (olddp), repoint it at the new
 * mount's root (newdp), transferring usecounts accordingly.
 *
 * Always returns PROC_RETURNED so the proc iteration continues.
 */
static int
checkdirs_callback(proc_t p, void * arg)
{
	struct cdirargs *cdrp = (struct cdirargs *)arg;
	vnode_t olddp = cdrp->olddp;
	vnode_t newdp = cdrp->newdp;
	struct filedesc *fdp = &p->p_fd;
	/* new_* start as "unused new ref"; old_* collect refs to release. */
	vnode_t new_cvp = newdp;
	vnode_t new_rvp = newdp;
	vnode_t old_cvp = NULL;
	vnode_t old_rvp = NULL;

	/*
	 * XXX Also needs to iterate each thread in the process to see if it
	 * XXX is using a per-thread current working directory, and, if so,
	 * XXX update that as well.
	 */

	/*
	 * First, with the proc_fdlock held, check to see if we will need
	 * to do any work. If not, we will get out fast.
	 */
	proc_fdlock(p);
	if (fdp->fd_cdir != olddp && fdp->fd_rdir != olddp) {
		proc_fdunlock(p);
		return PROC_RETURNED;
	}
	proc_fdunlock(p);

	/*
	 * Ok, we will have to do some work. Always take two refs
	 * because we might need that many. We'll dispose of whatever
	 * we ended up not using.
	 */
	if (vnode_ref(newdp) != 0) {
		return PROC_RETURNED;
	}
	if (vnode_ref(newdp) != 0) {
		vnode_rele(newdp);
		return PROC_RETURNED;
	}

	proc_dirs_lock_exclusive(p);
	/*
	 * Now do the work. Note: we dropped the proc_fdlock, so we
	 * have to do all of the checks again.
	 */
	proc_fdlock(p);
	if (fdp->fd_cdir == olddp) {
		old_cvp = olddp;
		fdp->fd_cdir = newdp;
		new_cvp = NULL;	/* this new ref was consumed by fd_cdir */
	}
	if (fdp->fd_rdir == olddp) {
		old_rvp = olddp;
		fdp->fd_rdir = newdp;
		new_rvp = NULL;	/* this new ref was consumed by fd_rdir */
	}
	proc_fdunlock(p);
	proc_dirs_unlock_exclusive(p);

	/*
	 * Dispose of any references that are no longer needed.
	 */
	if (old_cvp != NULL) {
		vnode_rele(old_cvp);
	}
	if (old_rvp != NULL) {
		vnode_rele(old_rvp);
	}
	if (new_cvp != NULL) {
		vnode_rele(new_cvp);
	}
	if (new_rvp != NULL) {
		vnode_rele(new_rvp);
	}

	return PROC_RETURNED;
}
2500
2501
2502
/*
 * Scan all active processes to see if any of them have a current
 * or root directory onto which the new filesystem has just been
 * mounted. If so, replace them with the new mount point.
 *
 * Returns 0 on success (including the fast-path no-op), or the error
 * from VFS_ROOT() if the new mount's root cannot be obtained.
 */
static int
checkdirs(vnode_t olddp, vfs_context_t ctx)
{
	vnode_t newdp;
	vnode_t tvp;
	int err;
	struct cdirargs cdr;

	/* If no one else holds a usecount on olddp, no cwd/root can be it. */
	if (olddp->v_usecount == 1) {
		return 0;
	}
	/* Root of the newly-mounted filesystem, returned with an iocount. */
	err = VFS_ROOT(olddp->v_mountedhere, &newdp, ctx);

	if (err != 0) {
#if DIAGNOSTIC
		panic("mount: lost mount: error %d", err);
#endif
		return err;
	}

	cdr.olddp = olddp;
	cdr.newdp = newdp;
	/* do not block for exec/fork trans as the vp in cwd & rootdir are not changing */
	proc_iterate(PROC_ALLPROCLIST | PROC_NOWAITTRANS, checkdirs_callback, (void *)&cdr, NULL, NULL);

	/* If the system root itself was covered, swap in the new root vnode. */
	if (rootvnode == olddp) {
		vnode_ref(newdp);
		lck_rw_lock_exclusive(&rootvnode_rw_lock);
		tvp = rootvnode;
		rootvnode = newdp;
		lck_rw_unlock_exclusive(&rootvnode_rw_lock);
		vnode_rele(tvp);
	}

	vnode_put(newdp);
	return 0;
}
2545
2546 #define ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT \
2547 "com.apple.private.vfs.role-account-unmount"
2548
/*
 * Unmount a file system.
 *
 * Note: unmount takes a path to the vnode mounted on as argument,
 * not special file (as before).
 *
 * The path must resolve to the root vnode of the mounted filesystem
 * (VROOT); the actual teardown is delegated to safedounmount().
 */
/* ARGSUSED */
int
unmount(__unused proc_t p, struct unmount_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	struct mount *mp;
	int error;
	struct nameidata nd;
	vfs_context_t ctx;

	/*
	 * If the process has the entitlement, use the kernel's context when
	 * performing lookup on the mount path as the process might lack proper
	 * permission to access the directory.
	 */
	ctx = IOCurrentTaskHasEntitlement(ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT) ?
	    vfs_context_kernel() : vfs_context_current();

	NDINIT(&nd, LOOKUP, OP_UNMOUNT, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;
	mp = vp->v_mount;
	nameidone(&nd);

#if CONFIG_MACF
	error = mac_mount_check_umount(ctx, mp);
	if (error != 0) {
		vnode_put(vp);
		return error;
	}
#endif
	/*
	 * Must be the root of the filesystem
	 */
	if ((vp->v_flag & VROOT) == 0) {
		vnode_put(vp);
		return EINVAL;
	}
	/* Take a mount ref so mp survives dropping the vnode iocount. */
	mount_ref(mp, 0);
	vnode_put(vp);
	/* safedounmount consumes the mount ref */
	return safedounmount(mp, uap->flags, ctx);
}
2602
/*
 * Unmount the filesystem identified by fsid.
 *
 * Returns ENOENT when no mounted filesystem matches; otherwise returns
 * the result of safedounmount(), which consumes the mount ref taken
 * here.
 */
int
vfs_unmountbyfsid(fsid_t *fsid, int flags, vfs_context_t ctx)
{
	mount_t mp;

	mp = mount_list_lookupby_fsid(fsid, 0, 1);
	if (mp == (mount_t)0) {
		return ENOENT;
	}
	/* Trade the lookup's iteration reference for a real mount ref. */
	mount_ref(mp, 0);
	mount_iterdrop(mp);
	/* safedounmount consumes the mount ref */
	return safedounmount(mp, flags, ctx);
}
2617
/*
 * The mount struct comes with a mount ref which will be consumed.
 * Do the actual file system unmount, prevent some common foot shooting.
 *
 * Pre-flight checks before dounmount():
 *  - EBUSY for unresponsive filesystems when MNT_NOBLOCK and not forced
 *  - permission: root, the original mounter, entitled processes, or
 *    non-forced unmounts of MNTK_PERMIT_UNMOUNT mounts
 *  - EBUSY for the root filesystem, MNTK_SYSTEM mounts, and the mount
 *    backing the root disk image (MNTK_BACKS_ROOT)
 * On any of these failures the mount ref is dropped here; on success
 * dounmount() takes over (and consumes the ref itself).
 */
int
safedounmount(struct mount *mp, int flags, vfs_context_t ctx)
{
	int error;
	proc_t p = vfs_context_proc(ctx);

	/*
	 * If the file system is not responding and MNT_NOBLOCK
	 * is set and not a forced unmount then return EBUSY.
	 */
	if ((mp->mnt_kern_flag & MNT_LNOTRESP) &&
	    (flags & MNT_NOBLOCK) && ((flags & MNT_FORCE) == 0)) {
		error = EBUSY;
		goto out;
	}

	/*
	 * Skip authorization in two cases:
	 * - If the process running the unmount has ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT.
	 *   This entitlement allows non-root processes unmount volumes mounted by
	 *   other processes.
	 * - If the mount is tagged as permissive and this is not a forced-unmount
	 *   attempt.
	 */
	if (!IOCurrentTaskHasEntitlement(ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT) &&
	    (!(((mp->mnt_kern_flag & MNTK_PERMIT_UNMOUNT) != 0) && ((flags & MNT_FORCE) == 0)))) {
		/*
		 * Only root, or the user that did the original mount is
		 * permitted to unmount this filesystem.
		 */
		if ((mp->mnt_vfsstat.f_owner != kauth_cred_getuid(kauth_cred_get())) &&
		    (error = suser(kauth_cred_get(), &p->p_acflag))) {
			goto out;
		}
	}
	/*
	 * Don't allow unmounting the root file system, or other volumes
	 * associated with it (for example, the associated VM or DATA mounts) .
	 */
	if ((mp->mnt_flag & MNT_ROOTFS) || (mp->mnt_kern_flag & MNTK_SYSTEM)) {
		if (!(mp->mnt_flag & MNT_ROOTFS)) {
			printf("attempt to unmount a system mount (%s), will return EBUSY\n",
			    mp->mnt_vfsstat.f_mntonname);
		}
		error = EBUSY; /* the root (or associated volumes) is always busy */
		goto out;
	}

	/*
	 * If the mount is providing the root filesystem's disk image
	 * (i.e. imageboot), don't allow unmounting
	 */
	if (mp->mnt_kern_flag & MNTK_BACKS_ROOT) {
		error = EBUSY;
		goto out;
	}

	return dounmount(mp, flags, 1, ctx);

out:
	/* Checks failed: drop the mount ref the caller handed us. */
	mount_drop(mp, 0);
	return error;
}
2685
/*
 * Do the actual file system unmount.
 *
 * withref != 0 means the caller passed in a mount ref, which is dropped
 * here.  On failure the unmount-in-progress flags are cleared and the
 * mount remains usable; on success the mount is removed from the mount
 * list and freed either via the covered-vnode crossref path or, for the
 * root filesystem, directly here.
 */
int
dounmount(struct mount *mp, int flags, int withref, vfs_context_t ctx)
{
	vnode_t coveredvp = (vnode_t)0;
	int error;
	int needwakeup = 0;
	int forcedunmount = 0;
	int lflags = 0;
	struct vnode *devvp = NULLVP;
#if CONFIG_TRIGGERS
	proc_t p = vfs_context_proc(ctx);
	int did_vflush = 0;
	int pflags_save = 0;
#endif /* CONFIG_TRIGGERS */

#if CONFIG_FSE
	if (!(flags & MNT_FORCE)) {
		fsevent_unmount(mp, ctx); /* has to come first! */
	}
#endif

	mount_lock(mp);

	/*
	 * If already an unmount in progress just return EBUSY.
	 * Even a forced unmount cannot override.
	 */
	if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
		if (withref != 0) {
			mount_drop(mp, 1);
		}
		mount_unlock(mp);
		return EBUSY;
	}

	if (flags & MNT_FORCE) {
		forcedunmount = 1;
		mp->mnt_lflag |= MNT_LFORCE;
	}

#if CONFIG_TRIGGERS
	/* Keep a hung remote FS from wedging this process during the unmount. */
	if (flags & MNT_NOBLOCK && p != kernproc) {
		pflags_save = OSBitOrAtomic(P_NOREMOTEHANG, &p->p_flag);
	}
#endif

	/* Mark the unmount in progress; cleared on every failure path below. */
	mp->mnt_kern_flag |= MNTK_UNMOUNT;
	mp->mnt_lflag |= MNT_LUNMOUNT;
	mp->mnt_flag &= ~MNT_ASYNC;
	/*
	 * anyone currently in the fast path that
	 * trips over the cached rootvp will be
	 * dumped out and forced into the slow path
	 * to regenerate a new cached value
	 */
	mp->mnt_realrootvp = NULLVP;
	mount_unlock(mp);

	if (forcedunmount && (flags & MNT_LNOSUB) == 0) {
		/*
		 * Force unmount any mounts in this filesystem.
		 * If any unmounts fail - just leave them dangling.
		 * Avoids recursion.
		 */
		(void) dounmount_submounts(mp, flags | MNT_LNOSUB, ctx);
	}

	/*
	 * taking the name_cache_lock exclusively will
	 * insure that everyone is out of the fast path who
	 * might be trying to use a now stale copy of
	 * vp->v_mountedhere->mnt_realrootvp
	 * bumping mount_generation causes the cached values
	 * to be invalidated
	 */
	name_cache_lock();
	mount_generation++;
	name_cache_unlock();


	lck_rw_lock_exclusive(&mp->mnt_rwlock);
	if (withref != 0) {
		mount_drop(mp, 0);
	}
	error = 0;
	if (forcedunmount == 0) {
		ubc_umount(mp); /* release cached vnodes */
		if ((mp->mnt_flag & MNT_RDONLY) == 0) {
			error = VFS_SYNC(mp, MNT_WAIT, ctx);
			if (error) {
				/* Sync failed: abort, restoring the mount to service. */
				mount_lock(mp);
				mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
				mp->mnt_lflag &= ~MNT_LUNMOUNT;
				mp->mnt_lflag &= ~MNT_LFORCE;
				goto out;
			}
		}
	}

	IOBSDMountChange(mp, kIOMountChangeUnmount);

#if CONFIG_TRIGGERS
	vfs_nested_trigger_unmounts(mp, flags, ctx);
	did_vflush = 1;
#endif
	if (forcedunmount) {
		lflags |= FORCECLOSE;
	}
	error = vflush(mp, NULLVP, SKIPSWAP | SKIPSYSTEM | SKIPROOT | lflags);
	if ((forcedunmount == 0) && error) {
		/* Busy vnodes remain: abort, restoring the mount to service. */
		mount_lock(mp);
		mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
		mp->mnt_lflag &= ~MNT_LUNMOUNT;
		mp->mnt_lflag &= ~MNT_LFORCE;
		goto out;
	}

	/* make sure there are no one in the mount iterations or lookup */
	mount_iterdrain(mp);

	error = VFS_UNMOUNT(mp, flags, ctx);
	if (error) {
		/* Filesystem refused: re-enable iteration and restore the mount. */
		mount_iterreset(mp);
		mount_lock(mp);
		mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
		mp->mnt_lflag &= ~MNT_LUNMOUNT;
		mp->mnt_lflag &= ~MNT_LFORCE;
		goto out;
	}

	/* increment the operations count */
	if (!error) {
		OSAddAtomic(1, &vfs_nummntops);
	}

	if (mp->mnt_devvp && mp->mnt_vtable->vfc_vfsflags & VFC_VFSLOCALARGS) {
		/* hold an io reference and drop the usecount before close */
		devvp = mp->mnt_devvp;
		vnode_getalways(devvp);
		vnode_rele(devvp);
		VNOP_CLOSE(devvp, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD | FWRITE,
		    ctx);
		vnode_clearmountedon(devvp);
		vnode_put(devvp);
	}
	lck_rw_done(&mp->mnt_rwlock);
	mount_list_remove(mp);
	lck_rw_lock_exclusive(&mp->mnt_rwlock);

	/* mark the mount point hook in the vp but not drop the ref yet */
	if ((coveredvp = mp->mnt_vnodecovered) != NULLVP) {
		/*
		 * The covered vnode needs special handling. Trying to get an
		 * iocount must not block here as this may lead to deadlocks
		 * if the Filesystem to which the covered vnode belongs is
		 * undergoing forced unmounts. Since we hold a usecount, the
		 * vnode cannot be reused (it can, however, still be terminated)
		 */
		vnode_getalways(coveredvp);
		vnode_lock_spin(coveredvp);

		/* Crossref keeps mp alive until mount_dropcrossref() below. */
		mp->mnt_crossref++;
		coveredvp->v_mountedhere = (struct mount *)0;
		CLR(coveredvp->v_flag, VMOUNT);

		vnode_unlock(coveredvp);
		vnode_put(coveredvp);
	}

	mount_list_lock();
	mp->mnt_vtable->vfc_refcount--;
	mount_list_unlock();

	cache_purgevfs(mp); /* remove cache entries for this file sys */
	vfs_event_signal(NULL, VQ_UNMOUNT, (intptr_t)NULL);
	mount_lock(mp);
	mp->mnt_lflag |= MNT_LDEAD;

	if (mp->mnt_lflag & MNT_LWAIT) {
		/*
		 * do the wakeup here
		 * in case we block in mount_refdrain
		 * which will drop the mount lock
		 * and allow anyone blocked in vfs_busy
		 * to wakeup and see the LDEAD state
		 */
		mp->mnt_lflag &= ~MNT_LWAIT;
		wakeup((caddr_t)mp);
	}
	mount_refdrain(mp);

	/* free disk_conditioner_info structure for this mount */
	disk_conditioner_unmount(mp);

out:
	if (mp->mnt_lflag & MNT_LWAIT) {
		mp->mnt_lflag &= ~MNT_LWAIT;
		needwakeup = 1;
	}

#if CONFIG_TRIGGERS
	if (flags & MNT_NOBLOCK && p != kernproc) {
		// Restore P_NOREMOTEHANG bit to its previous value
		if ((pflags_save & P_NOREMOTEHANG) == 0) {
			OSBitAndAtomic(~((uint32_t) P_NOREMOTEHANG), &p->p_flag);
		}
	}

	/*
	 * Callback and context are set together under the mount lock, and
	 * never cleared, so we're safe to examine them here, drop the lock,
	 * and call out.
	 */
	if (mp->mnt_triggercallback != NULL) {
		mount_unlock(mp);
		if (error == 0) {
			mp->mnt_triggercallback(mp, VTC_RELEASE, mp->mnt_triggerdata, ctx);
		} else if (did_vflush) {
			mp->mnt_triggercallback(mp, VTC_REPLACE, mp->mnt_triggerdata, ctx);
		}
	} else {
		mount_unlock(mp);
	}
#else
	mount_unlock(mp);
#endif /* CONFIG_TRIGGERS */

	lck_rw_done(&mp->mnt_rwlock);

	if (needwakeup) {
		wakeup((caddr_t)mp);
	}

	if (!error) {
		if ((coveredvp != NULLVP)) {
			vnode_t pvp = NULLVP;

			/*
			 * The covered vnode needs special handling. Trying to
			 * get an iocount must not block here as this may lead
			 * to deadlocks if the Filesystem to which the covered
			 * vnode belongs is undergoing forced unmounts. Since we
			 * hold a usecount, the vnode cannot be reused
			 * (it can, however, still be terminated).
			 */
			vnode_getalways(coveredvp);

			mount_dropcrossref(mp, coveredvp, 0);
			/*
			 * We'll _try_ to detect if this really needs to be
			 * done. The coveredvp can only be in termination (or
			 * terminated) if the coveredvp's mount point is in a
			 * forced unmount (or has been) since we still hold the
			 * ref.
			 */
			if (!vnode_isrecycled(coveredvp)) {
				pvp = vnode_getparent(coveredvp);
#if CONFIG_TRIGGERS
				if (coveredvp->v_resolve) {
					vnode_trigger_rearm(coveredvp, ctx);
				}
#endif
			}

			vnode_rele(coveredvp);
			vnode_put(coveredvp);
			coveredvp = NULLVP;

			/* Notify watchers of the parent directory, if we found one. */
			if (pvp) {
				lock_vnode_and_post(pvp, NOTE_WRITE);
				vnode_put(pvp);
			}
		} else if (mp->mnt_flag & MNT_ROOTFS) {
			/* Root mount has no covered vnode; free the mount directly. */
			mount_lock_destroy(mp);
#if CONFIG_MACF
			mac_mount_label_destroy(mp);
#endif
			zfree(mount_zone, mp);
		} else {
			panic("dounmount: no coveredvp");
		}
	}
	return error;
}
2973
/*
 * Unmount any mounts in this filesystem.
 *
 * Collects the fsids of all mounts whose covered vnode lies (directly
 * or transitively) on mp, then unmounts them in reverse mount order.
 * Best-effort: allocation failures and individual unmount errors are
 * ignored.
 */
void
dounmount_submounts(struct mount *mp, int flags, vfs_context_t ctx)
{
	mount_t smp;
	fsid_t *fsids, fsid;
	int fsids_sz;
	int count = 0, i, m = 0;	/* m = index of last collected submount */
	vnode_t vp;

	mount_list_lock();

	// Get an array to hold the submounts fsids.
	TAILQ_FOREACH(smp, &mountlist, mnt_list)
	count++;
	fsids_sz = count * sizeof(fsid_t);
	fsids = kalloc_data(fsids_sz, Z_NOWAIT);
	if (fsids == NULL) {
		mount_list_unlock();
		goto out;
	}
	fsids[0] = mp->mnt_vfsstat.f_fsid; // Prime the pump

	/*
	 * Fill the array with submount fsids.
	 * Since mounts are always added to the tail of the mount list, the
	 * list is always in mount order.
	 * For each mount check if the mounted-on vnode belongs to a
	 * mount that's already added to our array of mounts to be unmounted.
	 */
	for (smp = TAILQ_NEXT(mp, mnt_list); smp; smp = TAILQ_NEXT(smp, mnt_list)) {
		vp = smp->mnt_vnodecovered;
		if (vp == NULL) {
			continue;
		}
		fsid = vnode_mount(vp)->mnt_vfsstat.f_fsid; // Underlying fsid
		for (i = 0; i <= m; i++) {
			if (fsids[i].val[0] == fsid.val[0] &&
			    fsids[i].val[1] == fsid.val[1]) {
				/* Covered vnode sits on a known (sub)mount: collect smp too. */
				fsids[++m] = smp->mnt_vfsstat.f_fsid;
				break;
			}
		}
	}
	mount_list_unlock();

	// Unmount the submounts in reverse order. Ignore errors.
	for (i = m; i > 0; i--) {
		smp = mount_list_lookupby_fsid(&fsids[i], 0, 1);
		if (smp) {
			mount_ref(smp, 0);
			mount_iterdrop(smp);
			(void) dounmount(smp, flags, 1, ctx);
		}
	}
out:
	kfree_data(fsids, fsids_sz);
}
3034
/*
 * Drop one crossref on mount mp, held via covered vnode dp.  When this
 * was the last crossref and dp no longer points at mp
 * (mp != dp->v_mountedhere), free the mount structure.
 *
 * If need_put is set, an iocount on dp is also dropped.  dp's vnode
 * lock is taken and released here on every path.
 */
void
mount_dropcrossref(mount_t mp, vnode_t dp, int need_put)
{
	vnode_hold(dp);
	vnode_lock(dp);
	mp->mnt_crossref--;

	if (mp->mnt_crossref < 0) {
		panic("mount cross refs -ve");
	}

	/* Last crossref and the vnode is detached from mp: tear mp down. */
	if ((mp != dp->v_mountedhere) && (mp->mnt_crossref == 0)) {
		if (need_put) {
			vnode_put_locked(dp);
		}
		vnode_drop_and_unlock(dp);

		mount_lock_destroy(mp);
#if CONFIG_MACF
		mac_mount_label_destroy(mp);
#endif
		zfree(mount_zone, mp);
		return;
	}
	if (need_put) {
		vnode_put_locked(dp);
	}
	vnode_drop_and_unlock(dp);
}
3064
3065
/*
 * Sync each mounted filesystem.
 */
#if DIAGNOSTIC
int syncprt = 0;	/* when set, vfs_bufstats() is printed after each sync */
#endif

/* When set, vm_countdirtypages() is reported after each sync (debug aid). */
int print_vmpage_stat = 0;
3074
3075 /*
3076 * sync_callback: simple wrapper that calls VFS_SYNC() on volumes
3077 * mounted read-write with the passed waitfor value.
3078 *
3079 * Parameters: mp mount-point descriptor per mounted file-system instance.
3080 * arg user argument (please see below)
3081 *
3082 * User argument is a pointer to 32 bit unsigned integer which describes the
3083 * type of waitfor value to set for calling VFS_SYNC(). If user argument is
3084 * passed as NULL, VFS_SYNC() is called with MNT_NOWAIT set as the default
3085 * waitfor value.
3086 *
3087 * Returns: VFS_RETURNED
3088 */
3089 static int
sync_callback(mount_t mp,void * arg)3090 sync_callback(mount_t mp, void *arg)
3091 {
3092 if ((mp->mnt_flag & MNT_RDONLY) == 0) {
3093 int asyncflag = mp->mnt_flag & MNT_ASYNC;
3094 unsigned waitfor = MNT_NOWAIT;
3095
3096 if (arg) {
3097 waitfor = *(uint32_t*)arg;
3098 }
3099
3100 /* Sanity check for flags - these are the only valid combinations for the flag bits*/
3101 if (waitfor != MNT_WAIT &&
3102 waitfor != (MNT_WAIT | MNT_VOLUME) &&
3103 waitfor != MNT_NOWAIT &&
3104 waitfor != (MNT_NOWAIT | MNT_VOLUME) &&
3105 waitfor != MNT_DWAIT &&
3106 waitfor != (MNT_DWAIT | MNT_VOLUME)) {
3107 panic("Passed inappropriate waitfor %u to "
3108 "sync_callback()", waitfor);
3109 }
3110
3111 mp->mnt_flag &= ~MNT_ASYNC;
3112 (void)VFS_SYNC(mp, waitfor, vfs_context_kernel());
3113 if (asyncflag) {
3114 mp->mnt_flag |= MNT_ASYNC;
3115 }
3116 }
3117
3118 return VFS_RETURNED;
3119 }
3120
/*
 * sync() system call: kick off a no-wait sync of every read-write
 * mounted filesystem via sync_callback().
 */
/* ARGSUSED */
int
sync(__unused proc_t p, __unused struct sync_args *uap, __unused int32_t *retval)
{
	/* NULL arg => sync_callback() uses its MNT_NOWAIT default. */
	vfs_iterate(LK_NOWAIT, sync_callback, NULL);

	if (print_vmpage_stat) {
		vm_countdirtypages();
	}

#if DIAGNOSTIC
	if (syncprt) {
		vfs_bufstats();
	}
#endif /* DIAGNOSTIC */
	return 0;
}
3138
/*
 * Media selector for sync_internal_callback(): sync everything, only
 * reliable media (local and not a virtual device), or only unreliable
 * media.
 */
typedef enum {
	SYNC_ALL = 0,
	SYNC_ONLY_RELIABLE_MEDIA = 1,
	SYNC_ONLY_UNRELIABLE_MEDIA = 2
} sync_type_t;
3144
3145 static int
sync_internal_callback(mount_t mp,void * arg)3146 sync_internal_callback(mount_t mp, void *arg)
3147 {
3148 if (arg) {
3149 int is_reliable = !(mp->mnt_kern_flag & MNTK_VIRTUALDEV) &&
3150 (mp->mnt_flag & MNT_LOCAL);
3151 sync_type_t sync_type = *((sync_type_t *)arg);
3152
3153 if ((sync_type == SYNC_ONLY_RELIABLE_MEDIA) && !is_reliable) {
3154 return VFS_RETURNED;
3155 } else if ((sync_type == SYNC_ONLY_UNRELIABLE_MEDIA) && is_reliable) {
3156 return VFS_RETURNED;
3157 }
3158 }
3159
3160 (void)sync_callback(mp, NULL);
3161
3162 return VFS_RETURNED;
3163 }
3164
/* State bits below; read and written only under sync_mtx_lck. */
int sync_thread_state = 0;
/* Upper bound on how long sync_internal() waits for the sync thread. */
int sync_timeout_seconds = 5;

#define SYNC_THREAD_RUN 0x0001          /* request (another) sync pass */
#define SYNC_THREAD_RUNNING 0x0002      /* a sync_thread instance exists */

#if CONFIG_PHYS_WRITE_ACCT
thread_t pm_sync_thread;	/* identifies the active sync thread, NULL when none */
#endif /* CONFIG_PHYS_WRITE_ACCT */
3174
/*
 * Worker thread for sync_internal().  Loops as long as SYNC_THREAD_RUN
 * is set, each pass syncing reliable (local, non-virtual) media first
 * and unreliable media second.  All state transitions happen under
 * sync_mtx_lck.
 */
static void
sync_thread(__unused void *arg, __unused wait_result_t wr)
{
	sync_type_t sync_type;
#if CONFIG_PHYS_WRITE_ACCT
	pm_sync_thread = current_thread();
#endif /* CONFIG_PHYS_WRITE_ACCT */

	lck_mtx_lock(&sync_mtx_lck);
	while (sync_thread_state & SYNC_THREAD_RUN) {
		/* Consume the request; a new one may arrive while we sync. */
		sync_thread_state &= ~SYNC_THREAD_RUN;
		lck_mtx_unlock(&sync_mtx_lck);

		sync_type = SYNC_ONLY_RELIABLE_MEDIA;
		vfs_iterate(LK_NOWAIT, sync_internal_callback, &sync_type);
		sync_type = SYNC_ONLY_UNRELIABLE_MEDIA;
		vfs_iterate(LK_NOWAIT, sync_internal_callback, &sync_type);

		lck_mtx_lock(&sync_mtx_lck);
	}
	/*
	 * This wakeup _has_ to be issued before the lock is released otherwise
	 * we may end up waking up a thread in sync_internal which is
	 * expecting a wakeup from a thread it just created and not from this
	 * thread which is about to exit.
	 */
	wakeup(&sync_thread_state);
	sync_thread_state &= ~SYNC_THREAD_RUNNING;
#if CONFIG_PHYS_WRITE_ACCT
	pm_sync_thread = NULL;
#endif /* CONFIG_PHYS_WRITE_ACCT */
	lck_mtx_unlock(&sync_mtx_lck);

	if (print_vmpage_stat) {
		vm_countdirtypages();
	}

#if DIAGNOSTIC
	if (syncprt) {
		vfs_bufstats();
	}
#endif /* DIAGNOSTIC */
}
3218
3219 struct timeval sync_timeout_last_print = {.tv_sec = 0, .tv_usec = 0};
3220
3221 /*
3222 * An in-kernel sync for power management to call.
3223 * This function always returns within sync_timeout seconds.
3224 */
__private_extern__ int
sync_internal(void)
{
	thread_t thd = NULL;
	int error;
	int thread_created = FALSE;
	struct timespec ts = {.tv_sec = sync_timeout_seconds, .tv_nsec = 0};

	lck_mtx_lock(&sync_mtx_lck);
	/* Post a request; start a worker thread only if none is running. */
	sync_thread_state |= SYNC_THREAD_RUN;
	if (!(sync_thread_state & SYNC_THREAD_RUNNING)) {
		int kr;

		sync_thread_state |= SYNC_THREAD_RUNNING;
		kr = kernel_thread_start(sync_thread, NULL, &thd);
		if (kr != KERN_SUCCESS) {
			/* Could not create the worker; report success anyway. */
			sync_thread_state &= ~SYNC_THREAD_RUNNING;
			lck_mtx_unlock(&sync_mtx_lck);
			printf("sync_thread failed\n");
			return 0;
		}
		thread_created = TRUE;
	}

	/* Wait (bounded by ts) for the worker's wakeup; PDROP releases the mutex. */
	error = msleep((caddr_t)&sync_thread_state, &sync_mtx_lck,
	    (PVFS | PDROP | PCATCH), "sync_thread", &ts);
	if (error) {
		struct timeval now;

		/* Rate-limit the timeout message to one per 120 seconds. */
		microtime(&now);
		if (now.tv_sec - sync_timeout_last_print.tv_sec > 120) {
			printf("sync timed out: %d sec\n", sync_timeout_seconds);
			sync_timeout_last_print.tv_sec = now.tv_sec;
		}
	}

	/* Drop the thread reference returned by kernel_thread_start(). */
	if (thread_created) {
		thread_deallocate(thd);
	}

	return 0;
}
3267
3268 /*
3269 * Change filesystem quotas.
3270 */
3271 #if QUOTA
3272 int
quotactl(proc_t p,struct quotactl_args * uap,__unused int32_t * retval)3273 quotactl(proc_t p, struct quotactl_args *uap, __unused int32_t *retval)
3274 {
3275 struct mount *mp;
3276 int error, quota_cmd, quota_status = 0;
3277 caddr_t datap;
3278 size_t fnamelen;
3279 struct nameidata nd;
3280 vfs_context_t ctx = vfs_context_current();
3281 struct dqblk my_dqblk = {};
3282
3283 AUDIT_ARG(uid, uap->uid);
3284 AUDIT_ARG(cmd, uap->cmd);
3285 NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
3286 uap->path, ctx);
3287 error = namei(&nd);
3288 if (error) {
3289 return error;
3290 }
3291 mp = nd.ni_vp->v_mount;
3292 mount_ref(mp, 0);
3293 vnode_put(nd.ni_vp);
3294 nameidone(&nd);
3295
3296 #if CONFIG_MACF
3297 error = mac_mount_check_quotactl(ctx, mp, uap->cmd, uap->uid);
3298 if (error != 0) {
3299 goto out;
3300 }
3301 #endif
3302
3303 /* copyin any data we will need for downstream code */
3304 quota_cmd = uap->cmd >> SUBCMDSHIFT;
3305
3306 switch (quota_cmd) {
3307 case Q_QUOTAON:
3308 /* uap->arg specifies a file from which to take the quotas */
3309 fnamelen = MAXPATHLEN;
3310 datap = zalloc(ZV_NAMEI);
3311 error = copyinstr(uap->arg, datap, MAXPATHLEN, &fnamelen);
3312 break;
3313 case Q_GETQUOTA:
3314 /* uap->arg is a pointer to a dqblk structure. */
3315 datap = (caddr_t) &my_dqblk;
3316 break;
3317 case Q_SETQUOTA:
3318 case Q_SETUSE:
3319 /* uap->arg is a pointer to a dqblk structure. */
3320 datap = (caddr_t) &my_dqblk;
3321 if (proc_is64bit(p)) {
3322 struct user_dqblk my_dqblk64;
3323 error = copyin(uap->arg, (caddr_t)&my_dqblk64, sizeof(my_dqblk64));
3324 if (error == 0) {
3325 munge_dqblk(&my_dqblk, &my_dqblk64, FALSE);
3326 }
3327 } else {
3328 error = copyin(uap->arg, (caddr_t)&my_dqblk, sizeof(my_dqblk));
3329 }
3330 break;
3331 case Q_QUOTASTAT:
3332 /* uap->arg is a pointer to an integer */
3333 datap = (caddr_t) "a_status;
3334 break;
3335 default:
3336 datap = NULL;
3337 break;
3338 } /* switch */
3339
3340 if (error == 0) {
3341 error = VFS_QUOTACTL(mp, uap->cmd, uap->uid, datap, ctx);
3342 }
3343
3344 switch (quota_cmd) {
3345 case Q_QUOTAON:
3346 if (datap != NULL) {
3347 zfree(ZV_NAMEI, datap);
3348 }
3349 break;
3350 case Q_GETQUOTA:
3351 /* uap->arg is a pointer to a dqblk structure we need to copy out to */
3352 if (error == 0) {
3353 if (proc_is64bit(p)) {
3354 struct user_dqblk my_dqblk64;
3355
3356 memset(&my_dqblk64, 0, sizeof(my_dqblk64));
3357 munge_dqblk(&my_dqblk, &my_dqblk64, TRUE);
3358 error = copyout((caddr_t)&my_dqblk64, uap->arg, sizeof(my_dqblk64));
3359 } else {
3360 error = copyout(datap, uap->arg, sizeof(struct dqblk));
3361 }
3362 }
3363 break;
3364 case Q_QUOTASTAT:
3365 /* uap->arg is a pointer to an integer */
3366 if (error == 0) {
3367 error = copyout(datap, uap->arg, sizeof(quota_status));
3368 }
3369 break;
3370 default:
3371 break;
3372 } /* switch */
3373
3374 out:
3375 mount_drop(mp, 0);
3376 return error;
3377 }
3378 #else
/* Quota support compiled out: the syscall always fails. */
int
quotactl(__unused proc_t p, __unused struct quotactl_args *uap, __unused int32_t *retval)
{
	return EOPNOTSUPP;
}
3384 #endif /* QUOTA */
3385
3386 static int
statfs_internal(proc_t p,struct mount * mp,user_addr_t bufp)3387 statfs_internal(proc_t p, struct mount *mp, user_addr_t bufp)
3388 {
3389 int error;
3390 vfs_context_t ctx = vfs_context_current();
3391
3392 #if CONFIG_MACF
3393 error = mac_mount_check_stat(ctx, mp);
3394 if (error != 0) {
3395 return error;
3396 }
3397 #endif
3398
3399 error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT);
3400 if (error != 0) {
3401 return error;
3402 }
3403
3404 return munge_statfs(mp, &mp->mnt_vfsstat, bufp, NULL, IS_64BIT_PROCESS(p), TRUE);
3405 }
3406
3407 /*
3408 * Get filesystem statistics.
3409 *
3410 * Returns: 0 Success
3411 * namei:???
3412 * vfs_update_vfsstat:???
3413 * munge_statfs:EFAULT
3414 */
3415 /* ARGSUSED */
3416 int
statfs(proc_t p,struct statfs_args * uap,__unused int32_t * retval)3417 statfs(proc_t p, struct statfs_args *uap, __unused int32_t *retval)
3418 {
3419 int error;
3420 struct mount *mp;
3421 struct nameidata nd;
3422 vfs_context_t ctx = vfs_context_current();
3423 vnode_t vp;
3424
3425 NDINIT(&nd, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
3426 UIO_USERSPACE, uap->path, ctx);
3427 error = namei(&nd);
3428 if (error != 0) {
3429 return error;
3430 }
3431 vp = nd.ni_vp;
3432 mp = vp->v_mount;
3433 nameidone(&nd);
3434
3435 error = statfs_internal(p, mp, uap->buf);
3436 vnode_put(vp);
3437
3438 return error;
3439 }
3440
3441 /*
3442 * Get filesystem statistics.
3443 */
3444 /* ARGSUSED */
3445 int
fstatfs(proc_t p,struct fstatfs_args * uap,__unused int32_t * retval)3446 fstatfs(proc_t p, struct fstatfs_args *uap, __unused int32_t *retval)
3447 {
3448 int error;
3449 vnode_t vp = NULL;
3450 struct mount *mp;
3451
3452 AUDIT_ARG(fd, uap->fd);
3453
3454 if ((error = file_vnode(uap->fd, &vp)) ||
3455 (error = vnode_getwithref(vp))) {
3456 goto out;
3457 }
3458
3459 AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
3460
3461 mp = vp->v_mount;
3462 if (!mp) {
3463 error = EBADF;
3464 goto out_vnode;
3465 }
3466
3467 error = statfs_internal(p, mp, uap->buf);
3468
3469 out_vnode:
3470 vnode_put(vp);
3471
3472 out:
3473 if (vp != NULL) {
3474 file_drop(uap->fd);
3475 }
3476
3477 return error;
3478 }
3479
3480 void
vfs_get_statfs64(struct mount * mp,struct statfs64 * sfs)3481 vfs_get_statfs64(struct mount *mp, struct statfs64 *sfs)
3482 {
3483 struct vfsstatfs *vsfs = &mp->mnt_vfsstat;
3484
3485 bzero(sfs, sizeof(*sfs));
3486
3487 sfs->f_bsize = vsfs->f_bsize;
3488 sfs->f_iosize = (int32_t)vsfs->f_iosize;
3489 sfs->f_blocks = vsfs->f_blocks;
3490 sfs->f_bfree = vsfs->f_bfree;
3491 sfs->f_bavail = vsfs->f_bavail;
3492 sfs->f_files = vsfs->f_files;
3493 sfs->f_ffree = vsfs->f_ffree;
3494 sfs->f_fsid = vsfs->f_fsid;
3495 sfs->f_owner = vsfs->f_owner;
3496 sfs->f_type = mp->mnt_vtable->vfc_typenum;
3497 sfs->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
3498 sfs->f_fssubtype = vsfs->f_fssubtype;
3499 sfs->f_flags_ext = (mp->mnt_kern_flag & MNTK_SYSTEMDATA) ? MNT_EXT_ROOT_DATA_VOL : 0;
3500 if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
3501 strlcpy(&sfs->f_fstypename[0], &mp->fstypename_override[0], MFSTYPENAMELEN);
3502 } else {
3503 strlcpy(&sfs->f_fstypename[0], &vsfs->f_fstypename[0], MFSTYPENAMELEN);
3504 }
3505 strlcpy(&sfs->f_mntonname[0], &vsfs->f_mntonname[0], MAXPATHLEN);
3506 strlcpy(&sfs->f_mntfromname[0], &vsfs->f_mntfromname[0], MAXPATHLEN);
3507 }
3508
3509 /*
3510 * Get file system statistics in 64-bit mode
3511 */
int
statfs64(__unused struct proc *p, struct statfs64_args *uap, __unused int32_t *retval)
{
	struct mount *mp;
	int error;
	struct nameidata *ndp;
	struct statfs64 *sfsp;
	vfs_context_t ctxp = vfs_context_current();
	vnode_t vp;
	/* nameidata + statfs64 are too large for the stack; heap-allocate both. */
	struct {
		struct nameidata nd;
		struct statfs64 sfs;
	} *__nameidata_statfs64;

	__nameidata_statfs64 = kalloc_type(typeof(*__nameidata_statfs64),
	    Z_WAITOK);
	ndp = &__nameidata_statfs64->nd;

	NDINIT(ndp, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctxp);
	error = namei(ndp);
	if (error != 0) {
		goto out;
	}
	vp = ndp->ni_vp;
	mp = vp->v_mount;
	nameidone(ndp);

#if CONFIG_MACF
	error = mac_mount_check_stat(ctxp, mp);
	if (error != 0) {
		vnode_put(vp);
		goto out;
	}
#endif

	error = vfs_update_vfsstat(mp, ctxp, VFS_USER_EVENT);
	if (error != 0) {
		vnode_put(vp);
		goto out;
	}

	sfsp = &__nameidata_statfs64->sfs;
	vfs_get_statfs64(mp, sfsp);
	/*
	 * NOTE(review): p is read below despite being declared __unused;
	 * the attribute merely means "possibly unused".
	 */
	if ((mp->mnt_kern_flag & MNTK_SYSTEMDATA) &&
	    (p->p_vfs_iopolicy & P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME)) {
		/* This process does not want to see a separate data volume mountpoint */
		strlcpy(&sfsp->f_mntonname[0], "/", sizeof("/"));
	}
	error = copyout(sfsp, uap->buf, sizeof(*sfsp));
	vnode_put(vp);

out:
	kfree_type(typeof(*__nameidata_statfs64), __nameidata_statfs64);

	return error;
}
3569
3570 /*
3571 * Get file system statistics in 64-bit mode
3572 */
3573 int
fstatfs64(__unused struct proc * p,struct fstatfs64_args * uap,__unused int32_t * retval)3574 fstatfs64(__unused struct proc *p, struct fstatfs64_args *uap, __unused int32_t *retval)
3575 {
3576 struct vnode *vp;
3577 struct mount *mp;
3578 struct statfs64 sfs;
3579 int error;
3580
3581 AUDIT_ARG(fd, uap->fd);
3582
3583 if ((error = file_vnode(uap->fd, &vp))) {
3584 return error;
3585 }
3586
3587 error = vnode_getwithref(vp);
3588 if (error) {
3589 file_drop(uap->fd);
3590 return error;
3591 }
3592
3593 AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
3594
3595 mp = vp->v_mount;
3596 if (!mp) {
3597 error = EBADF;
3598 goto out;
3599 }
3600
3601 #if CONFIG_MACF
3602 error = mac_mount_check_stat(vfs_context_current(), mp);
3603 if (error != 0) {
3604 goto out;
3605 }
3606 #endif
3607
3608 if ((error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)) != 0) {
3609 goto out;
3610 }
3611
3612 vfs_get_statfs64(mp, &sfs);
3613 if ((mp->mnt_kern_flag & MNTK_SYSTEMDATA) &&
3614 (p->p_vfs_iopolicy & P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME)) {
3615 /* This process does not want to see a seperate data volume mountpoint */
3616 strlcpy(&sfs.f_mntonname[0], "/", sizeof("/"));
3617 }
3618 error = copyout(&sfs, uap->buf, sizeof(sfs));
3619
3620 out:
3621 file_drop(uap->fd);
3622 vnode_put(vp);
3623
3624 return error;
3625 }
3626
/* Iteration state shared with getfsstat_callback()/getfsstat64_callback(). */
struct getfsstat_struct {
	user_addr_t sfsp;       /* next user-space slot to fill; 0 = count only */
	user_addr_t *mp;        /* optional per-entry MAC label buffers, or NULL */
	int count;              /* mounts seen so far (may exceed maxcount) */
	int maxcount;           /* capacity of the user buffer, in entries */
	int flags;              /* MNT_NOWAIT / MNT_WAIT / MNT_DWAIT */
	int error;              /* first error encountered, if any */
};
3635
3636
/*
 * vfs_iterate() callback for getfsstat()/__mac_getfsstat(): copies one
 * mount's statfs (and optional MAC label) out to the user buffer, then
 * bumps the running count.  Mounts are still counted once the buffer is
 * full so the caller can detect truncation.
 */
static int
getfsstat_callback(mount_t mp, void * arg)
{
	struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
	struct vfsstatfs *sp;
	int error, my_size;
	vfs_context_t ctx = vfs_context_current();

	/* Copy out only while there is room left in the user buffer. */
	if (fstp->sfsp && fstp->count < fstp->maxcount) {
#if CONFIG_MACF
		error = mac_mount_check_stat(ctx, mp);
		if (error != 0) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
#endif
		sp = &mp->mnt_vfsstat;
		/*
		 * If MNT_NOWAIT is specified, do not refresh the
		 * fsstat cache. MNT_WAIT/MNT_DWAIT overrides MNT_NOWAIT.
		 */
		if ((mp->mnt_lflag & MNT_LDEAD) ||
		    (((fstp->flags & MNT_NOWAIT) == 0 || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
		    (!(mp->mnt_lflag & MNT_LUNMOUNT)) &&
		    (error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT)))) {
			/* Dead mount, or the refresh failed: skip this entry. */
			KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
			return VFS_RETURNED;
		}

		/*
		 * Need to handle LP64 version of struct statfs
		 */
		error = munge_statfs(mp, sp, fstp->sfsp, &my_size, IS_64BIT_PROCESS(vfs_context_proc(ctx)), FALSE);
		if (error) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
		/* munge_statfs() reports how many bytes it wrote. */
		fstp->sfsp += my_size;

		if (fstp->mp) {
#if CONFIG_MACF
			error = mac_mount_label_get(mp, *fstp->mp);
			if (error) {
				fstp->error = error;
				return VFS_RETURNED_DONE;
			}
#endif
			fstp->mp++;
		}
	}
	/* Count every mount, even when nothing was copied out. */
	fstp->count++;
	return VFS_RETURNED;
}
3690
3691 /*
3692 * Get statistics on all filesystems.
3693 */
3694 int
getfsstat(__unused proc_t p,struct getfsstat_args * uap,int * retval)3695 getfsstat(__unused proc_t p, struct getfsstat_args *uap, int *retval)
3696 {
3697 struct __mac_getfsstat_args muap;
3698
3699 muap.buf = uap->buf;
3700 muap.bufsize = uap->bufsize;
3701 muap.mac = USER_ADDR_NULL;
3702 muap.macsize = 0;
3703 muap.flags = uap->flags;
3704
3705 return __mac_getfsstat(p, &muap, retval);
3706 }
3707
3708 /*
3709 * __mac_getfsstat: Get MAC-related file system statistics
3710 *
3711 * Parameters: p (ignored)
3712 * uap User argument descriptor (see below)
3713 * retval Count of file system statistics (N stats)
3714 *
3715 * Indirect: uap->bufsize Buffer size
3716 * uap->macsize MAC info size
3717 * uap->buf Buffer where information will be returned
3718 * uap->mac MAC info
3719 * uap->flags File system flags
3720 *
3721 *
3722 * Returns: 0 Success
3723 * !0 Not success
3724 *
3725 */
int
__mac_getfsstat(__unused proc_t p, struct __mac_getfsstat_args *uap, int *retval)
{
	user_addr_t sfsp;
	user_addr_t *mp;
	size_t count, maxcount, bufsize, macsize;
	struct getfsstat_struct fst;

	/* Reject sizes that would overflow the int fields used below. */
	if ((unsigned)uap->bufsize > INT_MAX || (unsigned)uap->macsize > INT_MAX) {
		return EINVAL;
	}

	bufsize = (size_t) uap->bufsize;
	macsize = (size_t) uap->macsize;

	/* Entry size differs between 32- and 64-bit callers. */
	if (IS_64BIT_PROCESS(p)) {
		maxcount = bufsize / sizeof(struct user64_statfs);
	} else {
		maxcount = bufsize / sizeof(struct user32_statfs);
	}
	sfsp = uap->buf;
	count = 0;

	mp = NULL;

#if CONFIG_MACF
	if (uap->mac != USER_ADDR_NULL) {
		u_int32_t *mp0;
		int error;
		unsigned int i;

		/* One label pointer per statfs slot; the sizes must agree. */
		count = (macsize / (IS_64BIT_PROCESS(p) ? 8 : 4));
		if (count != maxcount) {
			return EINVAL;
		}

		/* Copy in the array */
		mp0 = kalloc_data(macsize, Z_WAITOK);
		if (mp0 == NULL) {
			return ENOMEM;
		}

		error = copyin(uap->mac, mp0, macsize);
		if (error) {
			kfree_data(mp0, macsize);
			return error;
		}

		/* Normalize to an array of user_addr_t */
		mp = kalloc_data(count * sizeof(user_addr_t), Z_WAITOK);
		if (mp == NULL) {
			kfree_data(mp0, macsize);
			return ENOMEM;
		}

		for (i = 0; i < count; i++) {
			if (IS_64BIT_PROCESS(p)) {
				mp[i] = ((user_addr_t *)mp0)[i];
			} else {
				mp[i] = (user_addr_t)mp0[i];
			}
		}
		kfree_data(mp0, macsize);
	}
#endif


	fst.sfsp = sfsp;
	fst.mp = mp;
	fst.flags = uap->flags;
	fst.count = 0;
	fst.error = 0;
	fst.maxcount = (int)maxcount;


	/* Walk every mount, including those currently being unmounted. */
	vfs_iterate(VFS_ITERATE_NOSKIP_UNMOUNT, getfsstat_callback, &fst);

	if (mp) {
		kfree_data(mp, count * sizeof(user_addr_t));
	}

	if (fst.error) {
		KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
		return fst.error;
	}

	/*
	 * When the buffer filled up, report only the entries that fit;
	 * fst.count kept growing in the callback, which lets callers
	 * detect truncation.
	 */
	if (fst.sfsp && fst.count > fst.maxcount) {
		*retval = fst.maxcount;
	} else {
		*retval = fst.count;
	}
	return 0;
}
3819
/*
 * vfs_iterate() callback for getfsstat64(): copies one mount's statfs64
 * out to the user buffer, then bumps the running count.  Mounts are still
 * counted once the buffer is full so the caller can detect truncation.
 */
static int
getfsstat64_callback(mount_t mp, void * arg)
{
	struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
	struct vfsstatfs *sp;
	struct statfs64 sfs;
	int error;

	/* Copy out only while there is room left in the user buffer. */
	if (fstp->sfsp && fstp->count < fstp->maxcount) {
#if CONFIG_MACF
		error = mac_mount_check_stat(vfs_context_current(), mp);
		if (error != 0) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
#endif
		/* NOTE(review): sp is assigned but never read below — dead store. */
		sp = &mp->mnt_vfsstat;
		/*
		 * If MNT_NOWAIT is specified, do not refresh the fsstat
		 * cache. MNT_WAIT overrides MNT_NOWAIT.
		 *
		 * We treat MNT_DWAIT as MNT_WAIT for all instances of
		 * getfsstat, since the constants are out of the same
		 * namespace.
		 */
		if ((mp->mnt_lflag & MNT_LDEAD) ||
		    ((((fstp->flags & MNT_NOWAIT) == 0) || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
		    (!(mp->mnt_lflag & MNT_LUNMOUNT)) &&
		    (error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)))) {
			/* Dead mount, or the refresh failed: skip this entry. */
			KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
			return VFS_RETURNED;
		}

		vfs_get_statfs64(mp, &sfs);
		error = copyout(&sfs, fstp->sfsp, sizeof(sfs));
		if (error) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
		fstp->sfsp += sizeof(sfs);
	}
	/* Count every mount, even when nothing was copied out. */
	fstp->count++;
	return VFS_RETURNED;
}
3864
3865 /*
3866 * Get statistics on all file systems in 64 bit mode.
3867 */
3868 int
getfsstat64(__unused proc_t p,struct getfsstat64_args * uap,int * retval)3869 getfsstat64(__unused proc_t p, struct getfsstat64_args *uap, int *retval)
3870 {
3871 user_addr_t sfsp;
3872 int count, maxcount;
3873 struct getfsstat_struct fst;
3874
3875 maxcount = uap->bufsize / sizeof(struct statfs64);
3876
3877 sfsp = uap->buf;
3878 count = 0;
3879
3880 fst.sfsp = sfsp;
3881 fst.flags = uap->flags;
3882 fst.count = 0;
3883 fst.error = 0;
3884 fst.maxcount = maxcount;
3885
3886 vfs_iterate(VFS_ITERATE_NOSKIP_UNMOUNT, getfsstat64_callback, &fst);
3887
3888 if (fst.error) {
3889 KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
3890 return fst.error;
3891 }
3892
3893 if (fst.sfsp && fst.count > fst.maxcount) {
3894 *retval = fst.maxcount;
3895 } else {
3896 *retval = fst.count;
3897 }
3898
3899 return 0;
3900 }
3901
3902 /*
3903 * gets the associated vnode with the file descriptor passed.
3904 * as input
3905 *
3906 * INPUT
3907 * ctx - vfs context of caller
3908 * fd - file descriptor for which vnode is required.
3909 * vpp - Pointer to pointer to vnode to be returned.
3910 *
3911 * The vnode is returned with an iocount so any vnode obtained
3912 * by this call needs a vnode_put
3913 *
3914 */
3915 int
vnode_getfromfd(vfs_context_t ctx,int fd,vnode_t * vpp)3916 vnode_getfromfd(vfs_context_t ctx, int fd, vnode_t *vpp)
3917 {
3918 int error;
3919 vnode_t vp;
3920 struct fileproc *fp;
3921 proc_t p = vfs_context_proc(ctx);
3922
3923 *vpp = NULLVP;
3924
3925 error = fp_getfvp(p, fd, &fp, &vp);
3926 if (error) {
3927 return error;
3928 }
3929
3930 error = vnode_getwithref(vp);
3931 if (error) {
3932 (void)fp_drop(p, fd, fp, 0);
3933 return error;
3934 }
3935
3936 (void)fp_drop(p, fd, fp, 0);
3937 *vpp = vp;
3938 return error;
3939 }
3940
3941 /*
3942 * Wrapper function around namei to start lookup from a directory
3943 * specified by a file descriptor ni_dirfd.
3944 *
3945 * In addition to all the errors returned by namei, this call can
3946 * return ENOTDIR if the file descriptor does not refer to a directory.
3947 * and EBADF if the file descriptor is not valid.
3948 */
int
nameiat(struct nameidata *ndp, int dirfd)
{
	/*
	 * Consult dirfd only for a fresh, relative lookup: skip it when the
	 * caller passed AT_FDCWD, when this continues an earlier lookup, or
	 * when a starting directory vnode was already supplied.
	 */
	if ((dirfd != AT_FDCWD) &&
	    !(ndp->ni_flag & NAMEI_CONTLOOKUP) &&
	    !(ndp->ni_cnd.cn_flags & USEDVP)) {
		int error = 0;
		char c;

		/* Peek at the first path byte to detect an absolute path. */
		if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
			error = copyin(ndp->ni_dirp, &c, sizeof(char));
			if (error) {
				return error;
			}
		} else {
			c = *((char *)(ndp->ni_dirp));
		}

		if (c != '/') {
			vnode_t dvp_at;

			/* Iocount on the directory backing dirfd. */
			error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
			    &dvp_at);
			if (error) {
				return error;
			}

			if (vnode_vtype(dvp_at) != VDIR) {
				vnode_put(dvp_at);
				return ENOTDIR;
			}

			/* Start the lookup at dvp_at instead of the CWD. */
			ndp->ni_dvp = dvp_at;
			ndp->ni_cnd.cn_flags |= USEDVP;
			error = namei(ndp);
			ndp->ni_cnd.cn_flags &= ~USEDVP;
			vnode_put(dvp_at);
			return error;
		}
	}

	/* Absolute path, AT_FDCWD, or continued lookup: plain namei(). */
	return namei(ndp);
}
3992
3993 /*
3994 * Change current working directory to a given file descriptor.
3995 */
3996 /* ARGSUSED */
/*
 * Shared implementation of fchdir(2) and __pthread_fchdir(): make the
 * directory open on uap->fd the working directory, for the whole process
 * (per_thread == 0) or for the calling thread only (per_thread != 0).
 */
static int
common_fchdir(proc_t p, struct fchdir_args *uap, int per_thread)
{
	vnode_t vp;
	vnode_t tdp;
	vnode_t tvp;
	struct mount *mp;
	int error, should_put = 1;
	vfs_context_t ctx = vfs_context_current();

	AUDIT_ARG(fd, uap->fd);
	if (per_thread && uap->fd == -1) {
		/*
		 * Switching back from per-thread to per process CWD; verify we
		 * in fact have one before proceeding. The only success case
		 * for this code path is to return 0 preemptively after zapping
		 * the thread structure contents.
		 */
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = NULLVP;
			if (tvp != NULLVP) {
				vnode_rele(tvp);
				return 0;
			}
		}
		return EBADF;
	}

	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	if (vp->v_type != VDIR) {
		error = ENOTDIR;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_chdir(ctx, vp);
	if (error) {
		goto out;
	}
#endif
	/* The caller needs search permission on the new directory. */
	error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
	if (error) {
		goto out;
	}

	/* If a file system is mounted on this directory, descend to its root. */
	while (!error && (mp = vp->v_mountedhere) != NULL) {
		if (vfs_busy(mp, LK_NOWAIT)) {
			error = EACCES;
			goto out;
		}
		error = VFS_ROOT(mp, &tdp, ctx);
		vfs_unbusy(mp);
		if (error) {
			break;
		}
		vnode_put(vp);
		vp = tdp;
	}
	if (error) {
		goto out;
	}
	/* Trade the transient iocount for a long-lived usecount on vp. */
	if ((error = vnode_ref(vp))) {
		goto out;
	}
	vnode_put(vp);
	should_put = 0;

	if (per_thread) {
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = vp;
			OSBitOrAtomic(P_THCWD, &p->p_flag);
		} else {
			vnode_rele(vp);
			error = ENOENT;
			goto out;
		}
	} else {
		/* Both locks are required to swap fd_cdir safely (see lookup). */
		proc_dirs_lock_exclusive(p);
		proc_fdlock(p);
		tvp = p->p_fd.fd_cdir;
		p->p_fd.fd_cdir = vp;
		proc_fdunlock(p);
		proc_dirs_unlock_exclusive(p);
	}

	/* Release the usecount held on the previous working directory. */
	if (tvp) {
		vnode_rele(tvp);
	}

out:
	if (should_put) {
		vnode_put(vp);
	}
	file_drop(uap->fd);

	return error;
}
4109
/* fchdir(2): change the per-process working directory to fd's directory. */
int
fchdir(proc_t p, struct fchdir_args *uap, __unused int32_t *retval)
{
	return common_fchdir(p, uap, 0);
}
4115
/* Per-thread variant of fchdir; the args layout matches fchdir's. */
int
__pthread_fchdir(proc_t p, struct __pthread_fchdir_args *uap, __unused int32_t *retval)
{
	return common_fchdir(p, (void *)uap, 1);
}
4121
4122
4123 /*
4124 * Change current working directory (".").
4125 *
4126 * Returns: 0 Success
4127 * change_dir:ENOTDIR
4128 * change_dir:???
4129 * vnode_ref:ENOENT No such file or directory
4130 */
4131 /* ARGSUSED */
int
chdir_internal(proc_t p, vfs_context_t ctx, struct nameidata *ndp, int per_thread)
{
	int error;
	vnode_t tvp;

	/* change_dir() returns the directory vnode with an iocount held. */
	error = change_dir(ndp, ctx);
	if (error) {
		return error;
	}
	/* Take a long-lived usecount before dropping the iocount. */
	if ((error = vnode_ref(ndp->ni_vp))) {
		vnode_put(ndp->ni_vp);
		return error;
	}
	/*
	 * drop the iocount we picked up in change_dir
	 */
	vnode_put(ndp->ni_vp);

	if (per_thread) {
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = ndp->ni_vp;
			/* Mark the process as having thread-local CWDs. */
			OSBitOrAtomic(P_THCWD, &p->p_flag);
		} else {
			vnode_rele(ndp->ni_vp);
			return ENOENT;
		}
	} else {
		proc_dirs_lock_exclusive(p);
		proc_fdlock(p);
		tvp = p->p_fd.fd_cdir;
		p->p_fd.fd_cdir = ndp->ni_vp;
		proc_fdunlock(p);
		proc_dirs_unlock_exclusive(p);
	}

	/* Release the usecount on the previous working directory, if any. */
	if (tvp) {
		vnode_rele(tvp);
	}

	return 0;
}
4177
4178
4179 /*
4180 * Change current working directory (".").
4181 *
4182 * Returns: 0 Success
4183 * chdir_internal:ENOTDIR
4184 * chdir_internal:ENOENT No such file or directory
4185 * chdir_internal:???
4186 */
4187 /* ARGSUSED */
/*
 * Shared front end of chdir(2)/__pthread_chdir(): set up the lookup of
 * uap->path and let chdir_internal() install the new working directory.
 */
static int
common_chdir(proc_t p, struct chdir_args *uap, int per_thread)
{
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();

	NDINIT(&nd, LOOKUP, OP_CHDIR, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);

	return chdir_internal(p, ctx, &nd, per_thread);
}
4199
4200
4201 /*
4202 * chdir
4203 *
4204 * Change current working directory (".") for the entire process
4205 *
4206 * Parameters: p Process requesting the call
4207 * uap User argument descriptor (see below)
4208 * retval (ignored)
4209 *
4210 * Indirect parameters: uap->path Directory path
4211 *
4212 * Returns: 0 Success
4213 * common_chdir: ENOTDIR
4214 * common_chdir: ENOENT No such file or directory
4215 * common_chdir: ???
4216 *
4217 */
int
chdir(proc_t p, struct chdir_args *uap, __unused int32_t *retval)
{
	/* per_thread == 0: change the process-wide working directory. */
	return common_chdir(p, (void *)uap, 0);
}
4223
4224 /*
4225 * __pthread_chdir
4226 *
4227 * Change current working directory (".") for a single thread
4228 *
4229 * Parameters: p Process requesting the call
4230 * uap User argument descriptor (see below)
4231 * retval (ignored)
4232 *
4233 * Indirect parameters: uap->path Directory path
4234 *
4235 * Returns: 0 Success
4236 * common_chdir: ENOTDIR
4237 * common_chdir: ENOENT No such file or directory
4238 * common_chdir: ???
4239 *
4240 */
int
__pthread_chdir(proc_t p, struct __pthread_chdir_args *uap, __unused int32_t *retval)
{
	/* per_thread == 1: change only the calling thread's directory. */
	return common_chdir(p, (void *)uap, 1);
}
4246
4247
4248 /*
4249 * Change notion of root (``/'') directory.
4250 */
4251 /* ARGSUSED */
int
chroot(proc_t p, struct chroot_args *uap, __unused int32_t *retval)
{
	struct filedesc *fdp = &p->p_fd;
	int error;
	struct nameidata nd;
	vnode_t tvp;
	vfs_context_t ctx = vfs_context_current();

	/* Only the superuser may change the root directory. */
	if ((error = suser(kauth_cred_get(), &p->p_acflag))) {
		return error;
	}

	NDINIT(&nd, LOOKUP, OP_CHROOT, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	/* change_dir() verifies VDIR + search access; returns an iocount. */
	error = change_dir(&nd, ctx);
	if (error) {
		return error;
	}

#if CONFIG_MACF
	error = mac_vnode_check_chroot(ctx, nd.ni_vp,
	    &nd.ni_cnd);
	if (error) {
		vnode_put(nd.ni_vp);
		return error;
	}
#endif

	/* Trade the iocount for a long-lived usecount on the new root. */
	if ((error = vnode_ref(nd.ni_vp))) {
		vnode_put(nd.ni_vp);
		return error;
	}
	vnode_put(nd.ni_vp);

	/*
	 * This lock provides the guarantee that as long as you hold the lock
	 * fdp->fd_rdir has a usecount on it. This is used to take an iocount
	 * on a referenced vnode in namei when determining the rootvnode for
	 * a process.
	 */
	/* needed for synchronization with lookup */
	proc_dirs_lock_exclusive(p);
	/* needed for setting the flag and other activities on the fd itself */
	proc_fdlock(p);
	tvp = fdp->fd_rdir;
	fdp->fd_rdir = nd.ni_vp;
	fdt_flag_set(fdp, FD_CHROOT);
	proc_fdunlock(p);
	proc_dirs_unlock_exclusive(p);

	/* Release the usecount on any previous root directory. */
	if (tvp != NULL) {
		vnode_rele(tvp);
	}

	return 0;
}
4309
/* Size of the on-stack path buffers tried before falling back to ZV_NAMEI. */
#define PATHSTATICBUFLEN 256
/* Entitlement required (in addition to being launchd) to call pivot_root(). */
#define PIVOT_ROOT_ENTITLEMENT \
	        "com.apple.private.vfs.pivot-root"
4313
4314 #if defined(XNU_TARGET_OS_OSX)
/*
 * pivot_root(2): switch the system root file system to the volume at
 * new_rootfs_path_before, re-mounting the old root at old_rootfs_path_after.
 * Restricted to launchd (pid 1) holding the pivot-root entitlement.
 */
int
pivot_root(proc_t p, struct pivot_root_args *uap, __unused int *retval)
{
	int error;
	/* Small on-stack buffers; heap fallback on ENAMETOOLONG below. */
	char new_rootfs_path_before[PATHSTATICBUFLEN] = {0};
	char old_rootfs_path_after[PATHSTATICBUFLEN] = {0};
	char *new_rootfs_path_before_buf = NULL;
	char *old_rootfs_path_after_buf = NULL;
	char *incoming = NULL;
	char *outgoing = NULL;
	vnode_t incoming_rootvp = NULLVP;
	size_t bytes_copied;

	/*
	 * XXX : Additional restrictions needed
	 * - perhaps callable only once.
	 */
	if ((error = suser(kauth_cred_get(), &p->p_acflag))) {
		return error;
	}

	/*
	 * pivot_root can be executed by launchd only.
	 * Enforce entitlement.
	 */
	if ((proc_getpid(p) != 1) || !IOCurrentTaskHasEntitlement(PIVOT_ROOT_ENTITLEMENT)) {
		return EPERM;
	}

	/* Copy in the path of the volume that will become the new root. */
	error = copyinstr(uap->new_rootfs_path_before, &new_rootfs_path_before[0], PATHSTATICBUFLEN, &bytes_copied);
	if (error == ENAMETOOLONG) {
		new_rootfs_path_before_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
		error = copyinstr(uap->new_rootfs_path_before, new_rootfs_path_before_buf, MAXPATHLEN, &bytes_copied);
	}

	if (error) {
		goto out;
	}

	/* Copy in the path where the old root should be re-mounted. */
	error = copyinstr(uap->old_rootfs_path_after, &old_rootfs_path_after[0], PATHSTATICBUFLEN, &bytes_copied);
	if (error == ENAMETOOLONG) {
		old_rootfs_path_after_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
		error = copyinstr(uap->old_rootfs_path_after, old_rootfs_path_after_buf, MAXPATHLEN, &bytes_copied);
	}
	if (error) {
		goto out;
	}

	/* Use whichever copy (static or heap) actually holds each path. */
	if (new_rootfs_path_before_buf) {
		incoming = new_rootfs_path_before_buf;
	} else {
		incoming = &new_rootfs_path_before[0];
	}

	if (old_rootfs_path_after_buf) {
		outgoing = old_rootfs_path_after_buf;
	} else {
		outgoing = &old_rootfs_path_after[0];
	}

	/*
	 * The proposed incoming FS MUST be authenticated (i.e. not a chunklist DMG).
	 * Userland is not allowed to pivot to an image.
	 */
	error = vnode_lookup(incoming, 0, &incoming_rootvp, vfs_context_kernel());
	if (error) {
		goto out;
	}
	error = VNOP_IOCTL(incoming_rootvp, FSIOC_KERNEL_ROOTAUTH, NULL, 0, vfs_context_kernel());
	if (error) {
		goto out;
	}

	error = vfs_switch_root(incoming, outgoing, VFSSR_VIRTUALDEV_PROHIBITED);

out:
	if (incoming_rootvp != NULLVP) {
		vnode_put(incoming_rootvp);
		incoming_rootvp = NULLVP;
	}

	if (old_rootfs_path_after_buf) {
		zfree(ZV_NAMEI, old_rootfs_path_after_buf);
	}

	if (new_rootfs_path_before_buf) {
		zfree(ZV_NAMEI, new_rootfs_path_before_buf);
	}

	return error;
}
4406 #else
4407 int
pivot_root(proc_t p,__unused struct pivot_root_args * uap,int * retval)4408 pivot_root(proc_t p, __unused struct pivot_root_args *uap, int *retval)
4409 {
4410 return nosys(p, NULL, retval);
4411 }
4412 #endif /* XNU_TARGET_OS_OSX */
4413
4414 /*
4415 * Common routine for chroot and chdir.
4416 *
4417 * Returns: 0 Success
4418 * ENOTDIR Not a directory
4419 * namei:??? [anything namei can return]
4420 * vnode_authorize:??? [anything vnode_authorize can return]
4421 */
4422 static int
change_dir(struct nameidata * ndp,vfs_context_t ctx)4423 change_dir(struct nameidata *ndp, vfs_context_t ctx)
4424 {
4425 vnode_t vp;
4426 int error;
4427
4428 if ((error = namei(ndp))) {
4429 return error;
4430 }
4431 nameidone(ndp);
4432 vp = ndp->ni_vp;
4433
4434 if (vp->v_type != VDIR) {
4435 vnode_put(vp);
4436 return ENOTDIR;
4437 }
4438
4439 #if CONFIG_MACF
4440 error = mac_vnode_check_chdir(ctx, vp);
4441 if (error) {
4442 vnode_put(vp);
4443 return error;
4444 }
4445 #endif
4446
4447 error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
4448 if (error) {
4449 vnode_put(vp);
4450 return error;
4451 }
4452
4453 return error;
4454 }
4455
4456 /*
4457 * Free the vnode data (for directories) associated with the file glob.
4458 */
4459 struct fd_vn_data *
fg_vn_data_alloc(void)4460 fg_vn_data_alloc(void)
4461 {
4462 struct fd_vn_data *fvdata;
4463
4464 /* Allocate per fd vnode data */
4465 fvdata = kalloc_type(struct fd_vn_data, Z_WAITOK | Z_ZERO);
4466 lck_mtx_init(&fvdata->fv_lock, &fd_vn_lck_grp, &fd_vn_lck_attr);
4467 return fvdata;
4468 }
4469
4470 /*
4471 * Free the vnode data (for directories) associated with the file glob.
4472 */
4473 void
fg_vn_data_free(void * fgvndata)4474 fg_vn_data_free(void *fgvndata)
4475 {
4476 struct fd_vn_data *fvdata = (struct fd_vn_data *)fgvndata;
4477
4478 kfree_data(fvdata->fv_buf, fvdata->fv_bufallocsiz);
4479 lck_mtx_destroy(&fvdata->fv_lock, &fd_vn_lck_grp);
4480 kfree_type(struct fd_vn_data, fvdata);
4481 }
4482
4483 /*
4484 * Check permissions, allocate an open file structure,
4485 * and call the device open routine if any.
4486 *
4487 * Returns: 0 Success
4488 * EINVAL
4489 * EINTR
4490 * falloc:ENFILE
4491 * falloc:EMFILE
4492 * falloc:ENOMEM
4493 * vn_open_auth:???
4494 * dupfdopen:???
4495 * VNOP_ADVLOCK:???
4496 * vnode_setsize:???
4497 *
4498 * XXX Need to implement uid, gid
4499 */
int
open1(vfs_context_t ctx, struct nameidata *ndp, int uflags,
    struct vnode_attr *vap, fp_initfn_t fp_init, void *initarg, int32_t *retval, int authfd)
{
	proc_t p = vfs_context_proc(ctx);
	uthread_t uu = get_bsdthread_info(vfs_context_thread(ctx));
	struct fileproc *fp;
	vnode_t vp;
	int flags, oflags, amode;
	int type, indx, error;
	struct vfs_context context;
	vnode_t authvp = NULLVP;

	oflags = uflags;

	amode = oflags & O_ACCMODE;
	/*
	 * Because O_RDONLY is 0, it is not possible to distinguish between
	 * O_EXEC | O_RDONLY and O_EXEC, therefore FEXEC/FSEARCH can't be set together
	 * with FREAD/FWRITE.
	 */
	if ((amode == O_ACCMODE) || (amode && (oflags & O_EXEC))) {
		return EINVAL;
	}

	flags = FFLAGS(uflags);
	/*
	 * Strip the raw-crypto bits from the caller's flags; they are carried
	 * into fg_flag below only if set again by lower layers.
	 * NOTE(review): presumably vn_open_auth is what re-sets them — confirm.
	 */
	CLR(flags, FENCRYPTED);
	CLR(flags, FUNENCRYPTED);

	AUDIT_ARG(fflags, oflags);
	AUDIT_ARG(mode, vap->va_mode);

	/* Reserve an fd slot and allocate the fileproc before opening. */
	if ((error = falloc_withinit(p, &fp, &indx, ctx, fp_init, initarg)) != 0) {
		return error;
	}
	if (flags & O_CLOEXEC) {
		fp->fp_flags |= FP_CLOEXEC;
	}
	if (flags & O_CLOFORK) {
		fp->fp_flags |= FP_CLOFORK;
	}

	/* setup state to recognize when fdesc_open was called */
	uu->uu_dupfd = -1;

	/*
	 * Disable read/write access if file is opened with O_EVTONLY and
	 * the process has requested to deny read/write access.
	 */
	if ((flags & O_EVTONLY) && proc_disallow_rw_for_o_evtonly(p)) {
		flags &= ~(FREAD | FWRITE);
	}

	/* Translate the optional authentication fd into a vnode for vn_open_auth. */
	if (authfd != AUTH_OPEN_NOAUTHFD) {
		error = vnode_getfromfd(ctx, authfd, &authvp);
		if (error) {
			fp_free(p, indx, fp);
			return error;
		}
	}

	if ((error = vn_open_auth(ndp, &flags, vap, authvp))) {
		if (authvp != NULLVP) {
			vnode_put(authvp);
		}
		/*
		 * ENODEV/ENXIO with uu_dupfd >= 0 means the /dev/fd filesystem
		 * intercepted the open: duplicate that descriptor instead.
		 */
		if ((error == ENODEV || error == ENXIO) && (uu->uu_dupfd >= 0)) {
			if ((error = dupfdopen(p, indx, uu->uu_dupfd, flags, error)) == 0) {
				*retval = indx;
				return 0;
			}
		}
		if (error == ERESTART) {
			error = EINTR;
		}
		fp_free(p, indx, fp);
		return error;
	}

	if (authvp != NULLVP) {
		vnode_put(authvp);
	}

	uu->uu_dupfd = 0;
	vp = ndp->ni_vp;

	/* Wire the opened vnode into the fileglob. */
	fp->fp_glob->fg_flag = flags & (FMASK | O_EVTONLY | FENCRYPTED | FUNENCRYPTED);
	fp->fp_glob->fg_ops = &vnops;
	fp_set_data(fp, vp);

#if CONFIG_FILE_LEASES
	/*
	 * If we are creating a file or open with truncate, we need to break the
	 * lease if there is a read lease placed on the parent dir.
	 */
	if ((vnode_vtype(vp) == VREG) && (flags & (O_CREAT | O_TRUNC))) {
		vnode_breakdirlease(vp, true, oflags);
	}
	/* Now check if there is a lease placed on the file itself. */
	error = vnode_breaklease(vp, oflags, ctx);
	if (error) {
		goto bad;
	}
#endif /* CONFIG_FILE_LEASES */

	/* O_EXLOCK/O_SHLOCK: take a whole-file flock-style advisory lock at open. */
	if (flags & (O_EXLOCK | O_SHLOCK)) {
		struct flock lf = {
			.l_whence = SEEK_SET,
		};

		if (flags & O_EXLOCK) {
			lf.l_type = F_WRLCK;
		} else {
			lf.l_type = F_RDLCK;
		}
		type = F_FLOCK;
		/* Block for the lock unless the open is non-blocking. */
		if ((flags & FNONBLOCK) == 0) {
			type |= F_WAIT;
		}
#if CONFIG_MACF
		error = mac_file_check_lock(vfs_context_ucred(ctx), fp->fp_glob,
		    F_SETLK, &lf);
		if (error) {
			goto bad;
		}
#endif
		if ((error = VNOP_ADVLOCK(vp, (caddr_t)fp->fp_glob, F_SETLK, &lf, type, ctx, NULL))) {
			goto bad;
		}
		/* Remember we hold the lock so 'bad:' (and close) can release it. */
		fp->fp_glob->fg_flag |= FWASLOCKED;
	}

	/* try to truncate by setting the size attribute */
	if ((flags & O_TRUNC) && ((error = vnode_setsize(vp, (off_t)0, 0, ctx)) != 0)) {
		goto bad;
	}

	/*
	 * For directories we hold some additional information in the fd.
	 */
	if (vnode_vtype(vp) == VDIR) {
		fp->fp_glob->fg_vn_data = fg_vn_data_alloc();
	} else {
		fp->fp_glob->fg_vn_data = NULL;
	}

	/*
	 * NOTE(review): this drops the namei iocount only; vp is still
	 * dereferenced below, presumably safe because the fileglob
	 * (fp_set_data above) keeps the vnode referenced — confirm against
	 * vn_open_auth's reference semantics.
	 */
	vnode_put(vp);

	/*
	 * The first terminal open (without a O_NOCTTY) by a session leader
	 * results in it being set as the controlling terminal.
	 */
	if (vnode_istty(vp) && !(p->p_flag & P_CONTROLT) &&
	    !(flags & O_NOCTTY)) {
		int tmp = 0;

		(void)(*fp->fp_glob->fg_ops->fo_ioctl)(fp, (int)TIOCSCTTY,
		    (caddr_t)&tmp, ctx);
	}

	/* Publish the descriptor in the process fd table. */
	proc_fdlock(p);
	procfdtbl_releasefd(p, indx, NULL);

#if CONFIG_SECLUDED_MEMORY
	/*
	 * Decide whether this file's page cache may use the secluded pool.
	 * Mode 1 whitelists /Applications/ content; mode 2 blacklists files
	 * needed for Camera launch or realtime audio.
	 */
	if (secluded_for_filecache &&
	    FILEGLOB_DTYPE(fp->fp_glob) == DTYPE_VNODE &&
	    vnode_vtype(vp) == VREG) {
		memory_object_control_t moc;

		moc = ubc_getobject(vp, UBC_FLAGS_NONE);

		if (moc == MEMORY_OBJECT_CONTROL_NULL) {
			/* nothing to do... */
		} else if (fp->fp_glob->fg_flag & FWRITE) {
			/* writable -> no longer eligible for secluded pages */
			memory_object_mark_eligible_for_secluded(moc,
			    FALSE);
		} else if (secluded_for_filecache == 1) {
			char pathname[32] = { 0, };
			size_t copied;
			/* XXX FBDP: better way to detect /Applications/ ? */
			if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
				(void)copyinstr(ndp->ni_dirp,
				    pathname,
				    sizeof(pathname),
				    &copied);
			} else {
				copystr(CAST_DOWN(void *, ndp->ni_dirp),
				    pathname,
				    sizeof(pathname),
				    &copied);
			}
			pathname[sizeof(pathname) - 1] = '\0';
			if (strncmp(pathname,
			    "/Applications/",
			    strlen("/Applications/")) == 0 &&
			    strncmp(pathname,
			    "/Applications/Camera.app/",
			    strlen("/Applications/Camera.app/")) != 0) {
				/*
				 * not writable
				 * AND from "/Applications/"
				 * AND not from "/Applications/Camera.app/"
				 * ==> eligible for secluded
				 */
				memory_object_mark_eligible_for_secluded(moc,
				    TRUE);
			}
		} else if (secluded_for_filecache == 2) {
			size_t len = strlen(vp->v_name);
			if (!strncmp(vp->v_name, "dyld", len) ||
			    !strncmp(vp->v_name, "launchd", len) ||
			    !strncmp(vp->v_name, "Camera", len) ||
			    !strncmp(vp->v_name, "SpringBoard", len) ||
			    !strncmp(vp->v_name, "backboardd", len)) {
				/*
				 * This file matters when launching Camera:
				 * do not store its contents in the secluded
				 * pool that will be drained on Camera launch.
				 */
				memory_object_mark_eligible_for_secluded(moc,
				    FALSE);
			} else if (!strncmp(vp->v_name, "mediaserverd", len)) {
				memory_object_mark_eligible_for_secluded(moc,
				    FALSE);
				memory_object_mark_for_realtime(moc,
				    true);
			} else if (!strncmp(vp->v_name, "bluetoothd", len)) {
				/*
				 * bluetoothd might be needed for realtime audio
				 * playback.
				 */
				memory_object_mark_eligible_for_secluded(moc,
				    FALSE);
				memory_object_mark_for_realtime(moc,
				    true);
			} else {
				char pathname[64] = { 0, };
				size_t copied;
				if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
					(void)copyinstr(ndp->ni_dirp,
					    pathname,
					    sizeof(pathname),
					    &copied);
				} else {
					copystr(CAST_DOWN(void *, ndp->ni_dirp),
					    pathname,
					    sizeof(pathname),
					    &copied);
				}
				pathname[sizeof(pathname) - 1] = '\0';
				if (strncmp(pathname,
				    "/Library/Audio/Plug-Ins/",
				    strlen("/Library/Audio/Plug-Ins/")) == 0 ||
				    strncmp(pathname,
				    "/System/Library/Audio/Plug-Ins/",
				    strlen("/System/Library/Audio/Plug-Ins/")) == 0) {
					/*
					 * This may be an audio plugin required
					 * for realtime playback.
					 * ==> NOT eligible for secluded.
					 */
					memory_object_mark_eligible_for_secluded(moc,
					    FALSE);
					memory_object_mark_for_realtime(moc,
					    true);
				}
			}
		}
	}
#endif /* CONFIG_SECLUDED_MEMORY */

	fp_drop(p, indx, fp, 1);
	proc_fdunlock(p);

	*retval = indx;

	return 0;
bad:
	/* Unwind a partially-completed open: unlock, close, free the fd slot. */
	context = *vfs_context_current();
	context.vc_ucred = fp->fp_glob->fg_cred;

	if ((fp->fp_glob->fg_flag & FWASLOCKED) &&
	    (FILEGLOB_DTYPE(fp->fp_glob) == DTYPE_VNODE)) {
		struct flock lf = {
			.l_whence = SEEK_SET,
			.l_type = F_UNLCK,
		};

		(void)VNOP_ADVLOCK(
			vp, (caddr_t)fp->fp_glob, F_UNLCK, &lf, F_FLOCK, ctx, NULL);
	}

	vn_close(vp, fp->fp_glob->fg_flag, &context);
	vnode_put(vp);
	fp_free(p, indx, fp);

	return error;
}
4798
4799 /*
4800 * While most of the *at syscall handlers can call nameiat() which
4801 * is a wrapper around namei, the use of namei and initialisation
4802 * of nameidata are far removed and in different functions - namei
4803 * gets called in vn_open_auth for open1. So we'll just do here what
4804 * nameiat() does.
4805 */
4806 static int
open1at(vfs_context_t ctx,struct nameidata * ndp,int uflags,struct vnode_attr * vap,fp_initfn_t fp_init,void * initarg,int32_t * retval,int dirfd,int authfd)4807 open1at(vfs_context_t ctx, struct nameidata *ndp, int uflags,
4808 struct vnode_attr *vap, fp_initfn_t fp_init, void *initarg, int32_t *retval,
4809 int dirfd, int authfd)
4810 {
4811 if ((dirfd != AT_FDCWD) && !(ndp->ni_cnd.cn_flags & USEDVP)) {
4812 int error;
4813 char c;
4814
4815 if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
4816 error = copyin(ndp->ni_dirp, &c, sizeof(char));
4817 if (error) {
4818 return error;
4819 }
4820 } else {
4821 c = *((char *)(ndp->ni_dirp));
4822 }
4823
4824 if (c != '/') {
4825 vnode_t dvp_at;
4826
4827 error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
4828 &dvp_at);
4829 if (error) {
4830 return error;
4831 }
4832
4833 if (vnode_vtype(dvp_at) != VDIR) {
4834 vnode_put(dvp_at);
4835 return ENOTDIR;
4836 }
4837
4838 ndp->ni_dvp = dvp_at;
4839 ndp->ni_cnd.cn_flags |= USEDVP;
4840 error = open1(ctx, ndp, uflags, vap, fp_init, initarg,
4841 retval, authfd);
4842 vnode_put(dvp_at);
4843 return error;
4844 }
4845 }
4846
4847 return open1(ctx, ndp, uflags, vap, fp_init, initarg, retval, authfd);
4848 }
4849
4850 /*
4851 * open_extended: open a file given a path name; with extended argument list (including extended security (ACL)).
4852 *
4853 * Parameters: p Process requesting the open
4854 * uap User argument descriptor (see below)
4855 * retval Pointer to an area to receive the
 *					return value from the system call
4857 *
4858 * Indirect: uap->path Path to open (same as 'open')
 *		uap->flags			Flags to open (same as 'open')
4860 * uap->uid UID to set, if creating
4861 * uap->gid GID to set, if creating
4862 * uap->mode File mode, if creating (same as 'open')
4863 * uap->xsecurity ACL to set, if creating
4864 *
4865 * Returns: 0 Success
4866 * !0 errno value
4867 *
4868 * Notes: The kauth_filesec_t in 'va', if any, is in host byte order.
4869 *
 * XXX:	We should enumerate the possible errno values here, and where
4871 * in the code they originated.
4872 */
4873 int
open_extended(proc_t p,struct open_extended_args * uap,int32_t * retval)4874 open_extended(proc_t p, struct open_extended_args *uap, int32_t *retval)
4875 {
4876 int ciferror;
4877 kauth_filesec_t xsecdst;
4878 struct vnode_attr va;
4879 struct nameidata nd;
4880 int cmode;
4881
4882 AUDIT_ARG(owner, uap->uid, uap->gid);
4883
4884 xsecdst = NULL;
4885 if ((uap->xsecurity != USER_ADDR_NULL) &&
4886 ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)) {
4887 return ciferror;
4888 }
4889
4890 VATTR_INIT(&va);
4891 cmode = ((uap->mode & ~p->p_fd.fd_cmask) & ALLPERMS) & ~S_ISTXT;
4892 VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
4893 if (uap->uid != KAUTH_UID_NONE) {
4894 VATTR_SET(&va, va_uid, uap->uid);
4895 }
4896 if (uap->gid != KAUTH_GID_NONE) {
4897 VATTR_SET(&va, va_gid, uap->gid);
4898 }
4899 if (xsecdst != NULL) {
4900 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
4901 va.va_vaflags |= VA_FILESEC_ACL;
4902 }
4903
4904 NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
4905 uap->path, vfs_context_current());
4906
4907 ciferror = open1(vfs_context_current(), &nd, uap->flags, &va,
4908 NULL, NULL, retval, AUTH_OPEN_NOAUTHFD);
4909 if (xsecdst != NULL) {
4910 kauth_filesec_free(xsecdst);
4911 }
4912
4913 return ciferror;
4914 }
4915
4916 /*
4917 * Go through the data-protected atomically controlled open (2)
4918 *
4919 * int open_dprotected_np(user_addr_t path, int flags, int class, int dpflags, int mode)
4920 */
4921 static int
openat_dprotected_internal(vfs_context_t ctx,user_addr_t path,int flags,int mode,int class,int dpflags,int fd,int authfd,enum uio_seg segflg,int * retval)4922 openat_dprotected_internal(vfs_context_t ctx, user_addr_t path, int flags, int mode,
4923 int class, int dpflags, int fd, int authfd, enum uio_seg segflg, int *retval)
4924 {
4925 /*
4926 * Follow the same path as normal open(2)
4927 * Look up the item if it exists, and acquire the vnode.
4928 */
4929 struct vnode_attr va;
4930 struct nameidata nd;
4931 int cmode;
4932 int error;
4933 struct filedesc *fdp = &vfs_context_proc(ctx)->p_fd;
4934
4935 VATTR_INIT(&va);
4936 /* Mask off all but regular access permissions */
4937 cmode = ((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
4938 VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
4939
4940 NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, segflg,
4941 path, ctx);
4942
4943 /*
4944 * Initialize the extra fields in vnode_attr to pass down our
4945 * extra fields.
4946 * 1. target cprotect class.
4947 * 2. set a flag to mark it as requiring open-raw-encrypted semantics.
4948 */
4949 if (flags & O_CREAT) {
4950 /* lower level kernel code validates that the class is valid before applying it. */
4951 if (class != PROTECTION_CLASS_DEFAULT) {
4952 /*
4953 * PROTECTION_CLASS_DEFAULT implies that we make the class for this
4954 * file behave the same as open (2)
4955 */
4956 VATTR_SET(&va, va_dataprotect_class, class);
4957 }
4958 }
4959
4960 if (dpflags & (O_DP_GETRAWENCRYPTED | O_DP_GETRAWUNENCRYPTED | O_DP_AUTHENTICATE)) {
4961 if (flags & (O_RDWR | O_WRONLY)) {
4962 /*
4963 * Not allowed to write raw encrypted bytes or when opening authenticated.
4964 */
4965 return EINVAL;
4966 }
4967 if (dpflags & O_DP_GETRAWENCRYPTED) {
4968 VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWENCRYPTED);
4969 }
4970 if (dpflags & O_DP_GETRAWUNENCRYPTED) {
4971 VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWUNENCRYPTED);
4972 }
4973 if (dpflags & O_DP_AUTHENTICATE) {
4974 VATTR_SET(&va, va_dataprotect_flags, VA_DP_AUTHENTICATE);
4975 }
4976 }
4977
4978 error = open1at(vfs_context_current(), &nd, flags, &va,
4979 NULL, NULL, retval, fd, authfd);
4980
4981 return error;
4982 }
4983
4984 int
openat_dprotected_np(__unused proc_t p,struct openat_dprotected_np_args * uap,int32_t * retval)4985 openat_dprotected_np(__unused proc_t p, struct openat_dprotected_np_args *uap, int32_t *retval)
4986 {
4987 if ((uap->dpflags & O_DP_AUTHENTICATE) && (uap->flags & O_CREAT)) {
4988 return EINVAL;
4989 }
4990
4991 return openat_dprotected_internal(vfs_context_current(), uap->path, uap->flags, uap->mode,
4992 uap->class, uap->dpflags, uap->fd, uap->authfd, UIO_USERSPACE, retval);
4993 }
4994
4995 int
open_dprotected_np(__unused proc_t p,struct open_dprotected_np_args * uap,int32_t * retval)4996 open_dprotected_np(__unused proc_t p, struct open_dprotected_np_args *uap, int32_t *retval)
4997 {
4998 if (uap->dpflags & O_DP_AUTHENTICATE) {
4999 return EINVAL;
5000 }
5001
5002 return openat_dprotected_internal(vfs_context_current(), uap->path, uap->flags, uap->mode,
5003 uap->class, uap->dpflags, AT_FDCWD, AUTH_OPEN_NOAUTHFD, UIO_USERSPACE, retval);
5004 }
5005
5006 static int
openat_internal(vfs_context_t ctx,user_addr_t path,int flags,int mode,int fd,enum uio_seg segflg,int * retval)5007 openat_internal(vfs_context_t ctx, user_addr_t path, int flags, int mode,
5008 int fd, enum uio_seg segflg, int *retval)
5009 {
5010 struct filedesc *fdp = &vfs_context_proc(ctx)->p_fd;
5011 struct {
5012 struct vnode_attr va;
5013 struct nameidata nd;
5014 } *__open_data;
5015 struct vnode_attr *vap;
5016 struct nameidata *ndp;
5017 int cmode;
5018 int error;
5019
5020 __open_data = kalloc_type(typeof(*__open_data), Z_WAITOK);
5021 vap = &__open_data->va;
5022 ndp = &__open_data->nd;
5023
5024 VATTR_INIT(vap);
5025 /* Mask off all but regular access permissions */
5026 cmode = ((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
5027 VATTR_SET(vap, va_mode, cmode & ACCESSPERMS);
5028
5029 NDINIT(ndp, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1,
5030 segflg, path, ctx);
5031
5032 error = open1at(ctx, ndp, flags, vap, NULL, NULL, retval, fd, AUTH_OPEN_NOAUTHFD);
5033
5034 kfree_type(typeof(*__open_data), __open_data);
5035
5036 return error;
5037 }
5038
5039 int
open(proc_t p,struct open_args * uap,int32_t * retval)5040 open(proc_t p, struct open_args *uap, int32_t *retval)
5041 {
5042 __pthread_testcancel(1);
5043 return open_nocancel(p, (struct open_nocancel_args *)uap, retval);
5044 }
5045
5046 int
open_nocancel(__unused proc_t p,struct open_nocancel_args * uap,int32_t * retval)5047 open_nocancel(__unused proc_t p, struct open_nocancel_args *uap,
5048 int32_t *retval)
5049 {
5050 return openat_internal(vfs_context_current(), uap->path, uap->flags,
5051 uap->mode, AT_FDCWD, UIO_USERSPACE, retval);
5052 }
5053
5054 int
openat_nocancel(__unused proc_t p,struct openat_nocancel_args * uap,int32_t * retval)5055 openat_nocancel(__unused proc_t p, struct openat_nocancel_args *uap,
5056 int32_t *retval)
5057 {
5058 return openat_internal(vfs_context_current(), uap->path, uap->flags,
5059 uap->mode, uap->fd, UIO_USERSPACE, retval);
5060 }
5061
5062 int
openat(proc_t p,struct openat_args * uap,int32_t * retval)5063 openat(proc_t p, struct openat_args *uap, int32_t *retval)
5064 {
5065 __pthread_testcancel(1);
5066 return openat_nocancel(p, (struct openat_nocancel_args *)uap, retval);
5067 }
5068
5069 #define OPEN_BY_ID_ENTITLEMENT "com.apple.private.vfs.open-by-id"
5070
5071 static boolean_t
vfs_context_can_open_by_id(vfs_context_t ctx)5072 vfs_context_can_open_by_id(vfs_context_t ctx)
5073 {
5074 if (csproc_get_platform_binary(vfs_context_proc(ctx))) {
5075 return TRUE;
5076 }
5077
5078 return IOTaskHasEntitlement(vfs_context_task(ctx),
5079 OPEN_BY_ID_ENTITLEMENT);
5080 }
5081
5082 /*
5083 * openbyid_np: open a file given a file system id and a file system object id
5084 * the hfs file system object id is an fsobj_id_t {uint32, uint32}
 * for file systems that don't support object ids, it is a node id (uint64_t).
5086 *
5087 * Parameters: p Process requesting the open
5088 * uap User argument descriptor (see below)
5089 * retval Pointer to an area to receive the
 *					return value from the system call
5091 *
5092 * Indirect: uap->path Path to open (same as 'open')
5093 *
5094 * uap->fsid id of target file system
5095 * uap->objid id of target file system object
5096 * uap->flags Flags to open (same as 'open')
5097 *
5098 * Returns: 0 Success
5099 * !0 errno value
5100 *
5101 *
 * XXX:	We should enumerate the possible errno values here, and where
5103 * in the code they originated.
5104 */
int
openbyid_np(__unused proc_t p, struct openbyid_np_args *uap, int *retval)
{
	fsid_t fsid;
	uint64_t objid;
	int error;
	char *buf = NULL;
	int buflen = MAXPATHLEN;
	int pathlen = 0;
	vfs_context_t ctx = vfs_context_current();

	/* Restricted to platform binaries and tasks holding the entitlement. */
	if (!vfs_context_can_open_by_id(ctx)) {
		return EPERM;
	}

	if ((error = copyin(uap->fsid, (caddr_t)&fsid, sizeof(fsid)))) {
		return error;
	}

	/* uap->objid is an fsobj_id_t defined as struct {uint32_t, uint32_t} */
	if ((error = copyin(uap->objid, (caddr_t)&objid, sizeof(uint64_t)))) {
		return error;
	}

	AUDIT_ARG(value32, fsid.val[0]);
	AUDIT_ARG(value64, objid);

	/*
	 * Resolve a path from (fsid, objid).  On ENOSPC the buffer was too
	 * small: grow it by MAXPATHLEN and retry (the += in the loop
	 * condition is always nonzero, so it never terminates the loop).
	 */
	do {
		buf = kalloc_data(buflen + 1, Z_WAITOK);
		if (buf == NULL) {
			return ENOMEM;
		}

		error = fsgetpath_internal( ctx, fsid.val[0], objid, buflen,
		    buf, FSOPT_ISREALFSID, &pathlen);

		if (error) {
			kfree_data(buf, buflen + 1);
			buf = NULL;
		}
	} while (error == ENOSPC && (buflen += MAXPATHLEN));

	if (error) {
		return error;
	}

	/* Ensure NUL termination before handing the path to the lookup. */
	buf[pathlen] = 0;

	/* Open by the resolved path, reading it from kernel address space. */
	error = openat_internal(
		ctx, (user_addr_t)buf, uap->oflags, 0, AT_FDCWD, UIO_SYSSPACE, retval);

	kfree_data(buf, buflen + 1);

	return error;
}
5161
5162
5163 /*
5164 * Create a special file.
5165 */
5166 static int mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap,
5167 int fd);
5168
/*
 * Common implementation for mknod(2) and mknodat(2): create a character or
 * block device node (FIFOs are redirected to mkfifo1()).  Requires superuser.
 */
static int
mknodat_internal(proc_t p, user_addr_t upath, struct vnode_attr *vap,
    mode_t mode, int fd)
{
	vfs_context_t ctx = vfs_context_current();
	struct nameidata nd;
	vnode_t vp, dvp;
	int error;

	/* If it's a mknod() of a FIFO, call mkfifo1() instead */
	if ((mode & S_IFMT) == S_IFIFO) {
		return mkfifo1(ctx, upath, vap, fd);
	}

	AUDIT_ARG(mode, mode);
	AUDIT_ARG(value32, vap->va_rdev);

	/* Only the superuser may create device nodes. */
	if ((error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
		return error;
	}
	/* Look up the target, keeping the parent locked for the create. */
	NDINIT(&nd, CREATE, OP_MKNOD, LOCKPARENT | AUDITVNPATH1,
	    UIO_USERSPACE, upath, ctx);
	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* The target must not already exist. */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}

	/* Translate the S_IFMT bits into a vnode type; only devices allowed here. */
	switch (mode & S_IFMT) {
	case S_IFCHR:
		VATTR_SET(vap, va_type, VCHR);
		break;
	case S_IFBLK:
		VATTR_SET(vap, va_type, VBLK);
		break;
	default:
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_create(ctx,
	    nd.ni_dvp, &nd.ni_cnd, vap);
	if (error) {
		goto out;
	}
#endif

	/* Caller needs permission to add entries to the parent directory. */
	if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
		goto out;
	}

#if CONFIG_FILE_LEASES
	/* Creating an entry invalidates read leases on the parent directory. */
	vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

	if ((error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx)) != 0) {
		goto out;
	}

	if (vp) {
		int update_flags = 0;

		// Make sure the name & parent pointers are hooked up
		if (vp->v_name == NULL) {
			update_flags |= VNODE_UPDATE_NAME;
		}
		if (vp->v_parent == NULLVP) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		if (update_flags) {
			vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
		}

#if CONFIG_FSE
		add_fsevent(FSE_CREATE_FILE, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
#endif
	}

out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	vnode_put(dvp);

	return error;
}
5271
5272 int
mknod(proc_t p,struct mknod_args * uap,__unused int32_t * retval)5273 mknod(proc_t p, struct mknod_args *uap, __unused int32_t *retval)
5274 {
5275 struct vnode_attr va;
5276
5277 VATTR_INIT(&va);
5278 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5279 VATTR_SET(&va, va_rdev, uap->dev);
5280
5281 return mknodat_internal(p, uap->path, &va, (mode_t)uap->mode, AT_FDCWD);
5282 }
5283
5284 int
mknodat(proc_t p,struct mknodat_args * uap,__unused int32_t * retval)5285 mknodat(proc_t p, struct mknodat_args *uap, __unused int32_t *retval)
5286 {
5287 struct vnode_attr va;
5288
5289 VATTR_INIT(&va);
5290 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5291 VATTR_SET(&va, va_rdev, uap->dev);
5292
5293 return mknodat_internal(p, uap->path, &va, (mode_t)uap->mode, uap->fd);
5294 }
5295
5296 /*
5297 * Create a named pipe.
5298 *
5299 * Returns: 0 Success
5300 * EEXIST
5301 * namei:???
5302 * vnode_authorize:???
5303 * vn_create:???
5304 */
static int
mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap, int fd)
{
	vnode_t vp, dvp;
	int error;
	struct nameidata nd;

	/* Look up the target path, keeping the parent locked for the create. */
	NDINIT(&nd, CREATE, OP_MKFIFO, LOCKPARENT | AUDITVNPATH1,
	    UIO_USERSPACE, upath, ctx);
	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* check that this is a new file and authorize addition */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}
	VATTR_SET(vap, va_type, VFIFO);

	if ((error = vn_authorize_create(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0) {
		goto out;
	}

	error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx);
out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	vnode_put(dvp);

	return error;
}
5347
5348
5349 /*
5350 * mkfifo_extended: Create a named pipe; with extended argument list (including extended security (ACL)).
5351 *
5352 * Parameters: p Process requesting the open
5353 * uap User argument descriptor (see below)
5354 * retval (Ignored)
5355 *
5356 * Indirect: uap->path Path to fifo (same as 'mkfifo')
5357 * uap->uid UID to set
5358 * uap->gid GID to set
5359 * uap->mode File mode to set (same as 'mkfifo')
5360 * uap->xsecurity ACL to set, if creating
5361 *
5362 * Returns: 0 Success
5363 * !0 errno value
5364 *
5365 * Notes: The kauth_filesec_t in 'va', if any, is in host byte order.
5366 *
 * XXX:	We should enumerate the possible errno values here, and where
5368 * in the code they originated.
5369 */
5370 int
mkfifo_extended(proc_t p,struct mkfifo_extended_args * uap,__unused int32_t * retval)5371 mkfifo_extended(proc_t p, struct mkfifo_extended_args *uap, __unused int32_t *retval)
5372 {
5373 int ciferror;
5374 kauth_filesec_t xsecdst;
5375 struct vnode_attr va;
5376
5377 AUDIT_ARG(owner, uap->uid, uap->gid);
5378
5379 xsecdst = KAUTH_FILESEC_NONE;
5380 if (uap->xsecurity != USER_ADDR_NULL) {
5381 if ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0) {
5382 return ciferror;
5383 }
5384 }
5385
5386 VATTR_INIT(&va);
5387 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5388 if (uap->uid != KAUTH_UID_NONE) {
5389 VATTR_SET(&va, va_uid, uap->uid);
5390 }
5391 if (uap->gid != KAUTH_GID_NONE) {
5392 VATTR_SET(&va, va_gid, uap->gid);
5393 }
5394 if (xsecdst != KAUTH_FILESEC_NONE) {
5395 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
5396 va.va_vaflags |= VA_FILESEC_ACL;
5397 }
5398
5399 ciferror = mkfifo1(vfs_context_current(), uap->path, &va, AT_FDCWD);
5400
5401 if (xsecdst != KAUTH_FILESEC_NONE) {
5402 kauth_filesec_free(xsecdst);
5403 }
5404 return ciferror;
5405 }
5406
5407 /* ARGSUSED */
5408 int
mkfifo(proc_t p,struct mkfifo_args * uap,__unused int32_t * retval)5409 mkfifo(proc_t p, struct mkfifo_args *uap, __unused int32_t *retval)
5410 {
5411 struct vnode_attr va;
5412
5413 VATTR_INIT(&va);
5414 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5415
5416 return mkfifo1(vfs_context_current(), uap->path, &va, AT_FDCWD);
5417 }
5418
5419 int
mkfifoat(proc_t p,struct mkfifoat_args * uap,__unused int32_t * retval)5420 mkfifoat(proc_t p, struct mkfifoat_args *uap, __unused int32_t *retval)
5421 {
5422 struct vnode_attr va;
5423
5424 VATTR_INIT(&va);
5425 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5426
5427 return mkfifo1(vfs_context_current(), uap->path, &va, uap->fd);
5428 }
5429
5430 extern int safe_getpath_new(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path, int firmlink);
5431 extern int safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path);
5432 extern int safe_getpath_no_firmlink(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path);
5433
/*
 * Resolve dvp (plus optional leafname) to a path that is always usable, even
 * if the full path cannot be reconstructed: on failure it falls back to the
 * nearest resolvable ancestor, the mount point, or "/", setting
 * *truncated_path.  Returns the length of the string placed in 'path'
 * (including the NUL), never an errno.
 */
int
safe_getpath_new(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path, int firmlink)
{
	int ret, len = _len;

	*truncated_path = 0;

	/* Build the directory's path, following or ignoring firmlinks. */
	if (firmlink) {
		ret = vn_getpath(dvp, path, &len);
	} else {
		ret = vn_getpath_no_firmlink(dvp, path, &len);
	}
	if (ret == 0 && len < (MAXPATHLEN - 1)) {
		if (leafname) {
			/*
			 * Append "/<leafname>": overwrite the trailing NUL
			 * with '/' and copy the leaf after it.  (The
			 * arithmetic assumes 'len' from vn_getpath includes
			 * the NUL — NOTE(review): confirm.)
			 */
			path[len - 1] = '/';
			len += strlcpy(&path[len], leafname, MAXPATHLEN - len) + 1;
			if (len > MAXPATHLEN) {
				char *ptr;

				// the string got truncated!
				*truncated_path = 1;
				ptr = strrchr(path, '/');
				if (ptr) {
					*ptr = '\0'; // chop off the string at the last directory component
				}
				len = (int)strlen(path) + 1;
			}
		}
	} else if (ret == 0) {
		/* Path resolved but used (nearly) the whole buffer: report truncation. */
		*truncated_path = 1;
	} else if (ret != 0) {
		struct vnode *mydvp = dvp;

		if (ret != ENOSPC) {
			printf("safe_getpath: failed to get the path for vp %p (%s) : err %d\n",
			    dvp, dvp->v_name ? dvp->v_name : "no-name", ret);
		}
		*truncated_path = 1;

		/*
		 * Fallback: climb the parent chain until some ancestor's path
		 * fits, settling for the mount point or "/" if nothing does.
		 */
		do {
			if (mydvp->v_parent != NULL) {
				mydvp = mydvp->v_parent;
			} else if (mydvp->v_mount) {
				strlcpy(path, mydvp->v_mount->mnt_vfsstat.f_mntonname, _len);
				break;
			} else {
				// no parent and no mount point? only thing is to punt and say "/" changed
				strlcpy(path, "/", _len);
				len = 2;
				mydvp = NULL;
			}

			if (mydvp == NULL) {
				break;
			}

			len = _len;
			if (firmlink) {
				ret = vn_getpath(mydvp, path, &len);
			} else {
				ret = vn_getpath_no_firmlink(mydvp, path, &len);
			}
		} while (ret == ENOSPC);
	}

	return len;
}
5501
/*
 * safe_getpath:
 *	Firmlink-following flavor of safe_getpath_new(); see that function
 *	for the full contract.  Returns the path length including the NUL.
 */
int
safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path)
{
	int len;

	len = safe_getpath_new(dvp, leafname, path, _len, truncated_path, 1);
	return len;
}
5507
/*
 * safe_getpath_no_firmlink:
 *	Firmlink-transparent flavor of safe_getpath_new(); see that
 *	function for the full contract.  Returns the path length
 *	including the NUL.
 */
int
safe_getpath_no_firmlink(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path)
{
	int len;

	len = safe_getpath_new(dvp, leafname, path, _len, truncated_path, 0);
	return len;
}
5513
5514 /*
5515 * Make a hard file link.
5516 *
5517 * Returns: 0 Success
5518 * EPERM
5519 * EEXIST
5520 * EXDEV
5521 * namei:???
5522 * vnode_authorize:???
5523 * VNOP_LINK:???
5524 */
5525 /* ARGSUSED */
/* ARGSUSED */
/*
 * linkat_internal:
 *	Common implementation for link(2) and linkat(2).  Looks up the
 *	existing object at (fd1, path), then creates the new name at
 *	(fd2, link) via VNOP_LINK.  'flag' controls symlink following on
 *	the source lookup (AT_SYMLINK_FOLLOW); 'segflg' says whether the
 *	path arguments live in user or kernel space.
 *
 *	Directory hardlinks are refused unless the filesystem advertises
 *	MNTK_DIR_HARDLINKS, and then only for the superuser or the
 *	directory's owner.
 */
static int
linkat_internal(vfs_context_t ctx, int fd1, user_addr_t path, int fd2,
    user_addr_t link, int flag, enum uio_seg segflg)
{
	vnode_t vp, pvp, dvp, lvp;
	struct nameidata nd;
	int follow;
	int error;
#if CONFIG_FSE
	fse_info finfo;
#endif
	int need_event, has_listeners, need_kpath2;
	char *target_path = NULL;
	char *no_firmlink_path = NULL;
	int truncated = 0;
	int truncated_no_firmlink_path = 0;

	vp = dvp = lvp = NULLVP;

	/* look up the object we are linking to */
	follow = (flag & AT_SYMLINK_FOLLOW) ? FOLLOW : NOFOLLOW;
	NDINIT(&nd, LOOKUP, OP_LOOKUP, AUDITVNPATH1 | follow,
	    segflg, path, ctx);

	error = nameiat(&nd, fd1);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;

	nameidone(&nd);

	/*
	 * Normally, linking to directories is not supported.
	 * However, some file systems may have limited support.
	 */
	if (vp->v_type == VDIR) {
		if (!ISSET(vp->v_mount->mnt_kern_flag, MNTK_DIR_HARDLINKS)) {
			error = EPERM;   /* POSIX */
			goto out;
		}

		/* Linking to a directory requires ownership. */
		if (!kauth_cred_issuser(vfs_context_ucred(ctx))) {
			struct vnode_attr dva;

			VATTR_INIT(&dva);
			VATTR_WANTED(&dva, va_uid);
			if (vnode_getattr(vp, &dva, ctx) != 0 ||
			    !VATTR_IS_SUPPORTED(&dva, va_uid) ||
			    (dva.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)))) {
				error = EACCES;
				goto out;
			}
		}
	}

	/*
	 * lookup the target node
	 *
	 * The nameidata is reused for the second lookup: only the op,
	 * flags, and path are re-seeded before calling nameiat() again.
	 */
#if CONFIG_TRIGGERS
	nd.ni_op = OP_LINK;
#endif
	nd.ni_cnd.cn_nameiop = CREATE;
	nd.ni_cnd.cn_flags = LOCKPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK;
	nd.ni_dirp = link;
	error = nameiat(&nd, fd2);
	if (error != 0) {
		goto out;
	}
	dvp = nd.ni_dvp;
	lvp = nd.ni_vp;

#if CONFIG_MACF
	if ((error = mac_vnode_check_link(ctx, dvp, vp, &nd.ni_cnd)) != 0) {
		goto out2;
	}
#endif

	/* or to anything that kauth doesn't want us to (eg. immutable items) */
	if ((error = vnode_authorize(vp, NULL, KAUTH_VNODE_LINKTARGET, ctx)) != 0) {
		goto out2;
	}

	/* target node must not exist */
	if (lvp != NULLVP) {
		error = EEXIST;
		goto out2;
	}
	/* cannot link across mountpoints */
	if (vnode_mount(vp) != vnode_mount(dvp)) {
		error = EXDEV;
		goto out2;
	}

	/* authorize creation of the target node */
	if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
		goto out2;
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

	/* and finally make the link */
	error = VNOP_LINK(vp, dvp, &nd.ni_cnd, ctx);
	if (error) {
		goto out2;
	}

#if CONFIG_MACF
	(void)mac_vnode_notify_link(ctx, vp, dvp, &nd.ni_cnd);
#endif

#if CONFIG_FSE
	need_event = need_fsevent(FSE_CREATE_FILE, dvp);
#else
	need_event = 0;
#endif
	has_listeners = kauth_authorize_fileop_has_listeners();

	need_kpath2 = 0;
#if CONFIG_AUDIT
	if (AUDIT_RECORD_EXISTS()) {
		need_kpath2 = 1;
	}
#endif

	/* post-link notifications: fsevents, kauth fileop listeners, audit */
	if (need_event || has_listeners || need_kpath2) {
		char *link_to_path = NULL;
		int len, link_name_len;
		int len_no_firmlink_path = 0;

		/* build the path to the new link file */
		GET_PATH(target_path);

		len = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, target_path, MAXPATHLEN, &truncated);
		if (no_firmlink_path == NULL) {
			GET_PATH(no_firmlink_path);
		}
		len_no_firmlink_path = safe_getpath_no_firmlink(dvp, nd.ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);

		AUDIT_ARG(kpath, target_path, ARG_KPATH2);

		if (has_listeners) {
			/* build the path to file we are linking to */
			GET_PATH(link_to_path);

			link_name_len = MAXPATHLEN;
			if (vn_getpath(vp, link_to_path, &link_name_len) == 0) {
				/*
				 * Call out to allow 3rd party notification of rename.
				 * Ignore result of kauth_authorize_fileop call.
				 */
				kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_LINK,
				    (uintptr_t)link_to_path,
				    (uintptr_t)target_path);
			}
			if (link_to_path != NULL) {
				RELEASE_PATH(link_to_path);
			}
		}
#if CONFIG_FSE
		if (need_event) {
			/* construct fsevent */
			if (get_fse_info(vp, &finfo, ctx) == 0) {
				if (truncated_no_firmlink_path) {
					finfo.mode |= FSE_TRUNCATED_PATH;
				}

				// build the path to the destination of the link
				add_fsevent(FSE_CREATE_FILE, ctx,
				    FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
				    FSE_ARG_FINFO, &finfo,
				    FSE_ARG_DONE);
			}

			pvp = vp->v_parent;
			// need an iocount on pvp in this case
			if (pvp && pvp != dvp) {
				error = vnode_get(pvp);
				if (error) {
					/* best effort: drop the stat event rather than fail the link */
					pvp = NULLVP;
					error = 0;
				}
			}
			if (pvp) {
				add_fsevent(FSE_STAT_CHANGED, ctx,
				    FSE_ARG_VNODE, pvp, FSE_ARG_DONE);
			}
			if (pvp && pvp != dvp) {
				vnode_put(pvp);
			}
		}
#endif
	}
out2:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);
	if (target_path != NULL) {
		RELEASE_PATH(target_path);
	}
	if (no_firmlink_path != NULL) {
		RELEASE_PATH(no_firmlink_path);
		no_firmlink_path = NULL;
	}
out:
	if (lvp) {
		vnode_put(lvp);
	}
	if (dvp) {
		vnode_put(dvp);
	}
	vnode_put(vp);
	return error;
}
5743
5744 int
link(__unused proc_t p,struct link_args * uap,__unused int32_t * retval)5745 link(__unused proc_t p, struct link_args *uap, __unused int32_t *retval)
5746 {
5747 return linkat_internal(vfs_context_current(), AT_FDCWD, uap->path,
5748 AT_FDCWD, uap->link, AT_SYMLINK_FOLLOW, UIO_USERSPACE);
5749 }
5750
5751 int
linkat(__unused proc_t p,struct linkat_args * uap,__unused int32_t * retval)5752 linkat(__unused proc_t p, struct linkat_args *uap, __unused int32_t *retval)
5753 {
5754 if (uap->flag & ~AT_SYMLINK_FOLLOW) {
5755 return EINVAL;
5756 }
5757
5758 return linkat_internal(vfs_context_current(), uap->fd1, uap->path,
5759 uap->fd2, uap->link, uap->flag, UIO_USERSPACE);
5760 }
5761
5762 /*
5763 * Make a symbolic link.
5764 *
5765 * We could add support for ACLs here too...
5766 */
5767 /* ARGSUSED */
/* ARGSUSED */
/*
 * symlinkat_internal:
 *	Common implementation for symlink(2) and symlinkat(2).  Creates a
 *	symbolic link at (fd, link) whose contents are the string at
 *	'path_data'.  'segflg' says whether the arguments are user- or
 *	kernel-space; a user-space link string is copied into a ZV_NAMEI
 *	buffer (freed on exit), a kernel-space one is used in place.
 */
static int
symlinkat_internal(vfs_context_t ctx, user_addr_t path_data, int fd,
    user_addr_t link, enum uio_seg segflg)
{
	struct vnode_attr va;
	char *path;
	int error;
	struct nameidata nd;
	vnode_t vp, dvp;
	size_t dummy = 0;
	proc_t p;

	error = 0;
	if (UIO_SEG_IS_USER_SPACE(segflg)) {
		path = zalloc(ZV_NAMEI);
		error = copyinstr(path_data, path, MAXPATHLEN, &dummy);
	} else {
		/* kernel-space caller: use the string directly, no copy */
		path = (char *)path_data;
	}
	if (error) {
		goto out;
	}
	AUDIT_ARG(text, path);  /* This is the link string */

	NDINIT(&nd, CREATE, OP_SYMLINK, LOCKPARENT | AUDITVNPATH1,
	    segflg, link, ctx);

	error = nameiat(&nd, fd);
	if (error) {
		goto out;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* new link gets VLNK type and ACCESSPERMS filtered by the umask */
	p = vfs_context_proc(ctx);
	VATTR_INIT(&va);
	VATTR_SET(&va, va_type, VLNK);
	VATTR_SET(&va, va_mode, ACCESSPERMS & ~p->p_fd.fd_cmask);

#if CONFIG_MACF
	error = mac_vnode_check_create(ctx,
	    dvp, &nd.ni_cnd, &va);
#endif
	if (error != 0) {
		goto skipit;
	}

	/* the link name must not already exist */
	if (vp != NULL) {
		error = EEXIST;
		goto skipit;
	}

	/* authorize */
	if (error == 0) {
		error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx);
	}
	/* get default ownership, etc. */
	if (error == 0) {
		error = vnode_authattr_new(dvp, &va, 0, ctx);
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

	if (error == 0) {
		error = VNOP_SYMLINK(dvp, &vp, &nd.ni_cnd, &va, path, ctx);
	}

	/* do fallback attribute handling */
	if (error == 0 && vp) {
		error = vnode_setattr_fallback(vp, &va, ctx);
	}

#if CONFIG_MACF
	if (error == 0 && vp) {
		error = vnode_label(vnode_mount(vp), dvp, vp, &nd.ni_cnd, VNODE_LABEL_CREATE, ctx);
	}
#endif

	if (error == 0) {
		int update_flags = 0;

		/*check if a new vnode was created, else try to get one*/
		if (vp == NULL) {
			/*
			 * VNOP_SYMLINK didn't hand back the vnode; redo the
			 * lookup (now as LOOKUP) to obtain it.
			 */
			nd.ni_cnd.cn_nameiop = LOOKUP;
#if CONFIG_TRIGGERS
			nd.ni_op = OP_LOOKUP;
#endif
			/*
			 * Clear all flags except HASBUF to prevent 'cn_pnbuf' buffer to be
			 * reallocated again in namei().
			 */
			nd.ni_cnd.cn_flags &= HASBUF;
			error = nameiat(&nd, fd);
			if (error) {
				goto skipit;
			}
			vp = nd.ni_vp;
		}

#if 0  /* XXX - kauth_todo - is KAUTH_FILEOP_SYMLINK needed? */
		/* call out to allow 3rd party notification of rename.
		 * Ignore result of kauth_authorize_fileop call.
		 */
		if (kauth_authorize_fileop_has_listeners() &&
		    namei(&nd) == 0) {
			char *new_link_path = NULL;
			int len;

			/* build the path to the new link file */
			new_link_path = get_pathbuff();
			len = MAXPATHLEN;
			vn_getpath(dvp, new_link_path, &len);
			if ((len + 1 + nd.ni_cnd.cn_namelen + 1) < MAXPATHLEN) {
				new_link_path[len - 1] = '/';
				strlcpy(&new_link_path[len], nd.ni_cnd.cn_nameptr, MAXPATHLEN - len);
			}

			kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_SYMLINK,
			    (uintptr_t)path, (uintptr_t)new_link_path);
			if (new_link_path != NULL) {
				release_pathbuff(new_link_path);
			}
		}
#endif
		// Make sure the name & parent pointers are hooked up
		if (vp->v_name == NULL) {
			update_flags |= VNODE_UPDATE_NAME;
		}
		if (vp->v_parent == NULLVP) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		if (update_flags) {
			vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
		}

#if CONFIG_FSE
		add_fsevent(FSE_CREATE_FILE, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
#endif
	}

skipit:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	vnode_put(dvp);
out:
	/* only free the buffer if we allocated it (user-space caller) */
	if (path && (path != (char *)path_data)) {
		zfree(ZV_NAMEI, path);
	}

	return error;
}
5931
5932 int
symlink(__unused proc_t p,struct symlink_args * uap,__unused int32_t * retval)5933 symlink(__unused proc_t p, struct symlink_args *uap, __unused int32_t *retval)
5934 {
5935 return symlinkat_internal(vfs_context_current(), uap->path, AT_FDCWD,
5936 uap->link, UIO_USERSPACE);
5937 }
5938
5939 int
symlinkat(__unused proc_t p,struct symlinkat_args * uap,__unused int32_t * retval)5940 symlinkat(__unused proc_t p, struct symlinkat_args *uap,
5941 __unused int32_t *retval)
5942 {
5943 return symlinkat_internal(vfs_context_current(), uap->path1, uap->fd,
5944 uap->path2, UIO_USERSPACE);
5945 }
5946
5947 /*
5948 * Delete a whiteout from the filesystem.
5949 * No longer supported.
5950 */
/*
 * undelete(2): historically removed a whiteout; the operation is no
 * longer supported and unconditionally fails with ENOTSUP.
 */
int
undelete(__unused proc_t p, __unused struct undelete_args *uap, __unused int32_t *retval)
{
	return ENOTSUP;
}
5956
5957 /*
5958 * Delete a name from the filesystem.
5959 */
5960 /* ARGSUSED */
/* ARGSUSED */
/*
 * unlinkat_internal:
 *	Common implementation for unlink(2), unlinkat(2), delete(2) and
 *	unlink1().  Removes the name at (fd, path_arg); if 'start_dvp' is
 *	non-NULL it is used as the starting directory vnode and trumps
 *	'fd'.  'unlink_flags' carries VNODE_REMOVE_* modifiers (e.g.
 *	NODELETEBUSY for Carbon semantics, SKIP_NAMESPACE_EVENT,
 *	NO_AUDIT_PATH).
 *
 *	Supports filesystems with compound remove VNOPs (lookup+remove in
 *	one call): EKEEPLOOKING restarts the lookup mid-operation, and
 *	ENOENT from a batched authorization is retried up to
 *	MAX_AUTHORIZE_ENOENT_RETRIES times to cope with racing hardlink
 *	lookups hitting the name cache.
 */
static int
unlinkat_internal(vfs_context_t ctx, int fd, vnode_t start_dvp,
    user_addr_t path_arg, enum uio_seg segflg, int unlink_flags)
{
	/* heap-allocated to keep this large state off the kernel stack */
	struct {
		struct nameidata nd;
#if CONFIG_FSE
		struct vnode_attr va;
		fse_info finfo;
#endif
	} *__unlink_data;
	struct nameidata *ndp;
	vnode_t vp, dvp;
	int error;
	struct componentname *cnp;
	char *path = NULL;
	char *no_firmlink_path = NULL;
	int len_path = 0;
	int len_no_firmlink_path = 0;
	int flags;
	int need_event;
	int has_listeners;
	int truncated_path;
	int truncated_no_firmlink_path;
	int batched;
	struct vnode_attr *vap;
	int do_retry;
	int retry_count = 0;
	int cn_flags;

	cn_flags = LOCKPARENT;
	if (!(unlink_flags & VNODE_REMOVE_NO_AUDIT_PATH)) {
		cn_flags |= AUDITVNPATH1;
	}
	/* If a starting dvp is passed, it trumps any fd passed. */
	if (start_dvp) {
		cn_flags |= USEDVP;
	}

#if NAMEDRSRCFORK
	/* unlink or delete is allowed on rsrc forks and named streams */
	cn_flags |= CN_ALLOWRSRCFORK;
#endif

	__unlink_data = kalloc_type(typeof(*__unlink_data), Z_WAITOK);
	ndp = &__unlink_data->nd;
#if CONFIG_FSE
	fse_info *finfop = &__unlink_data->finfo;
#endif

retry:
	/* reset per-attempt state; we may come back here on ENOENT retry */
	do_retry = 0;
	flags = 0;
	need_event = 0;
	has_listeners = 0;
	truncated_path = 0;
	truncated_no_firmlink_path = 0;
	vap = NULL;

	NDINIT(ndp, DELETE, OP_UNLINK, cn_flags, segflg, path_arg, ctx);

	ndp->ni_dvp = start_dvp;
	ndp->ni_flag |= NAMEI_COMPOUNDREMOVE;
	cnp = &ndp->ni_cnd;

continue_lookup:
	error = nameiat(ndp, fd);
	if (error) {
		goto early_out;
	}

	dvp = ndp->ni_dvp;
	vp = ndp->ni_vp;

	/* With Carbon delete semantics, busy files cannot be deleted */
	if (unlink_flags & VNODE_REMOVE_NODELETEBUSY) {
		flags |= VNODE_REMOVE_NODELETEBUSY;
	}

	/* Skip any potential upcalls if told to. */
	if (unlink_flags & VNODE_REMOVE_SKIP_NAMESPACE_EVENT) {
		flags |= VNODE_REMOVE_SKIP_NAMESPACE_EVENT;
	}

	if (vp) {
		batched = vnode_compound_remove_available(vp);
		/*
		 * The root of a mounted filesystem cannot be deleted.
		 */
		if ((vp->v_flag & VROOT) || (dvp->v_mount != vp->v_mount)) {
			error = EBUSY;
			goto out;
		}

#if DEVELOPMENT || DEBUG
		/*
		 * XXX VSWAP: Check for entitlements or special flag here
		 * so we can restrict access appropriately.
		 */
#else /* DEVELOPMENT || DEBUG */

		if (vnode_isswap(vp) && (ctx != vfs_context_kernel())) {
			error = EPERM;
			goto out;
		}
#endif /* DEVELOPMENT || DEBUG */

		if (!batched) {
			error = vn_authorize_unlink(dvp, vp, cnp, ctx, NULL);
			if (error) {
				if (error == ENOENT) {
					if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
						do_retry = 1;
						retry_count++;
					}
				}
				goto out;
			}
		}
	} else {
		/* no vp: the filesystem must do lookup+remove itself */
		batched = 1;

		if (!vnode_compound_remove_available(dvp)) {
			panic("No vp, but no compound remove?");
		}
	}

#if CONFIG_FSE
	need_event = need_fsevent(FSE_DELETE, dvp);
	if (need_event) {
		if (!batched) {
			if ((vp->v_flag & VISHARDLINK) == 0) {
				/* XXX need to get these data in batched VNOP */
				get_fse_info(vp, finfop, ctx);
			}
		} else {
			error =
			    vfs_get_notify_attributes(&__unlink_data->va);
			if (error) {
				goto out;
			}

			vap = &__unlink_data->va;
		}
	}
#endif
	has_listeners = kauth_authorize_fileop_has_listeners();
	if (need_event || has_listeners) {
		/* capture the paths before the entry is gone */
		if (path == NULL) {
			GET_PATH(path);
		}
		len_path = safe_getpath(dvp, ndp->ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path);
		if (no_firmlink_path == NULL) {
			GET_PATH(no_firmlink_path);
		}
		len_no_firmlink_path = safe_getpath_no_firmlink(dvp, ndp->ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);
	}

#if NAMEDRSRCFORK
	if (ndp->ni_cnd.cn_flags & CN_WANTSRSRCFORK) {
		error = vnode_removenamedstream(dvp, vp, XATTR_RESOURCEFORK_NAME, 0, ctx);
	} else
#endif
	{
#if CONFIG_FILE_LEASES
		vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

		error = vn_remove(dvp, &ndp->ni_vp, ndp, flags, vap, ctx);
		vp = ndp->ni_vp;
		if (error == EKEEPLOOKING) {
			/* compound VNOP needs the lookup continued */
			if (!batched) {
				panic("EKEEPLOOKING, but not a filesystem that supports compound VNOPs?");
			}

			if ((ndp->ni_flag & NAMEI_CONTLOOKUP) == 0) {
				panic("EKEEPLOOKING, but continue flag not set?");
			}

			if (vnode_isdir(vp)) {
				error = EISDIR;
				goto out;
			}
			goto continue_lookup;
		} else if (error == ENOENT && batched) {
			if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
				/*
				 * For compound VNOPs, the authorization callback may
				 * return ENOENT in case of racing hardlink lookups
				 * hitting the name cache, redrive the lookup.
				 */
				do_retry = 1;
				retry_count += 1;
				goto out;
			}
		}
	}

	/*
	 * Call out to allow 3rd party notification of delete.
	 * Ignore result of kauth_authorize_fileop call.
	 */
	if (!error) {
		if (has_listeners) {
			kauth_authorize_fileop(vfs_context_ucred(ctx),
			    KAUTH_FILEOP_DELETE,
			    (uintptr_t)vp,
			    (uintptr_t)path);
		}

		if (vp->v_flag & VISHARDLINK) {
			//
			// if a hardlink gets deleted we want to blow away the
			// v_parent link because the path that got us to this
			// instance of the link is no longer valid. this will
			// force the next call to get the path to ask the file
			// system instead of just following the v_parent link.
			//
			vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
		}

#if CONFIG_FSE
		if (need_event) {
			if (vp->v_flag & VISHARDLINK) {
				get_fse_info(vp, finfop, ctx);
			} else if (vap) {
				vnode_get_fse_info_from_vap(vp, finfop, vap);
			}
			if (truncated_path) {
				finfop->mode |= FSE_TRUNCATED_PATH;
			}
			add_fsevent(FSE_DELETE, ctx,
			    FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
			    FSE_ARG_FINFO, finfop,
			    FSE_ARG_DONE);
		}
#endif

#if CONFIG_MACF
		mac_vnode_notify_unlink(ctx, dvp, vp, cnp);
#endif
	}

out:
	if (path != NULL) {
		RELEASE_PATH(path);
		path = NULL;
	}

	if (no_firmlink_path != NULL) {
		RELEASE_PATH(no_firmlink_path);
		no_firmlink_path = NULL;
	}
#if NAMEDRSRCFORK
	/* recycle the deleted rsrc fork vnode to force a reclaim, which
	 * will cause its shadow file to go away if necessary.
	 */
	if (vp && (vnode_isnamedstream(vp)) &&
	    (vp->v_parent != NULLVP) &&
	    vnode_isshadow(vp)) {
		vnode_recycle(vp);
	}
#endif
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(ndp);
	vnode_put(dvp);
	if (vp) {
		vnode_put(vp);
	}

	if (do_retry) {
		goto retry;
	}

early_out:
	kfree_type(typeof(*__unlink_data), __unlink_data);
	return error;
}
6242
6243 int
unlink1(vfs_context_t ctx,vnode_t start_dvp,user_addr_t path_arg,enum uio_seg segflg,int unlink_flags)6244 unlink1(vfs_context_t ctx, vnode_t start_dvp, user_addr_t path_arg,
6245 enum uio_seg segflg, int unlink_flags)
6246 {
6247 return unlinkat_internal(ctx, AT_FDCWD, start_dvp, path_arg, segflg,
6248 unlink_flags);
6249 }
6250
6251 /*
6252 * Delete a name from the filesystem using Carbon semantics.
6253 */
6254 int
delete(__unused proc_t p,struct delete_args * uap,__unused int32_t * retval)6255 delete(__unused proc_t p, struct delete_args *uap, __unused int32_t *retval)
6256 {
6257 return unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
6258 uap->path, UIO_USERSPACE, VNODE_REMOVE_NODELETEBUSY);
6259 }
6260
6261 /*
6262 * Delete a name from the filesystem using POSIX semantics.
6263 */
6264 int
unlink(__unused proc_t p,struct unlink_args * uap,__unused int32_t * retval)6265 unlink(__unused proc_t p, struct unlink_args *uap, __unused int32_t *retval)
6266 {
6267 return unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
6268 uap->path, UIO_USERSPACE, 0);
6269 }
6270
6271 int
unlinkat(__unused proc_t p,struct unlinkat_args * uap,__unused int32_t * retval)6272 unlinkat(__unused proc_t p, struct unlinkat_args *uap, __unused int32_t *retval)
6273 {
6274 if (uap->flag & ~(AT_REMOVEDIR | AT_REMOVEDIR_DATALESS)) {
6275 return EINVAL;
6276 }
6277
6278 if (uap->flag & (AT_REMOVEDIR | AT_REMOVEDIR_DATALESS)) {
6279 int unlink_flags = 0;
6280
6281 if (uap->flag & AT_REMOVEDIR_DATALESS) {
6282 unlink_flags |= VNODE_REMOVE_DATALESS_DIR;
6283 }
6284 return rmdirat_internal(vfs_context_current(), uap->fd,
6285 uap->path, UIO_USERSPACE, unlink_flags);
6286 } else {
6287 return unlinkat_internal(vfs_context_current(), uap->fd,
6288 NULLVP, uap->path, UIO_USERSPACE, 0);
6289 }
6290 }
6291
6292 /*
6293 * Reposition read/write file offset.
6294 */
/*
 * lseek(2): reposition the read/write offset of an open file.
 *
 * Supports the classic L_SET/L_INCR/L_XTND whence values plus
 * SEEK_HOLE/SEEK_DATA (delegated to the filesystem via VNOP_IOCTL).
 * Fails with ESPIPE for fifos/pipes, EOVERFLOW when a relative move
 * wraps past the maximum offset, and EINVAL for a negative resulting
 * offset on anything but a character device.
 */
int
lseek(proc_t p, struct lseek_args *uap, off_t *retval)
{
	struct fileproc *fp;
	vnode_t vp;
	struct vfs_context *ctx;
	off_t offset = uap->offset, file_size;
	int error;

	if ((error = fp_getfvp(p, uap->fd, &fp, &vp))) {
		/* sockets/pipes report ENOTSUP here; POSIX wants ESPIPE */
		if (error == ENOTSUP) {
			return ESPIPE;
		}
		return error;
	}
	if (vnode_isfifo(vp)) {
		file_drop(uap->fd);
		return ESPIPE;
	}


	ctx = vfs_context_current();
#if CONFIG_MACF
	/* lseek(fd, 0, SEEK_CUR) only reads the offset; check accordingly */
	if (uap->whence == L_INCR && uap->offset == 0) {
		error = mac_file_check_get_offset(vfs_context_ucred(ctx),
		    fp->fp_glob);
	} else {
		error = mac_file_check_change_offset(vfs_context_ucred(ctx),
		    fp->fp_glob);
	}
	if (error) {
		file_drop(uap->fd);
		return error;
	}
#endif
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	switch (uap->whence) {
	case L_INCR:
		/* SEEK_CUR: relative to the current file offset */
		offset += fp->fp_glob->fg_offset;
		break;
	case L_XTND:
		/* SEEK_END: relative to the current file size */
		if ((error = vnode_size(vp, &file_size, ctx)) != 0) {
			break;
		}
		offset += file_size;
		break;
	case L_SET:
		break;
	case SEEK_HOLE:
		error = VNOP_IOCTL(vp, FSIOC_FIOSEEKHOLE, (caddr_t)&offset, 0, ctx);
		break;
	case SEEK_DATA:
		error = VNOP_IOCTL(vp, FSIOC_FIOSEEKDATA, (caddr_t)&offset, 0, ctx);
		break;
	default:
		error = EINVAL;
	}
	if (error == 0) {
		if (uap->offset > 0 && offset < 0) {
			/* Incremented/relative move past max size */
			error = EOVERFLOW;
		} else {
			/*
			 * Allow negative offsets on character devices, per
			 * POSIX 1003.1-2001. Most likely for writing disk
			 * labels.
			 */
			if (offset < 0 && vp->v_type != VCHR) {
				/* Decremented/relative move before start */
				error = EINVAL;
			} else {
				/* Success */
				fp->fp_glob->fg_offset = offset;
				*retval = fp->fp_glob->fg_offset;
			}
		}
	}

	/*
	 * An lseek can affect whether data is "available to read."  Use
	 * hint of NOTE_NONE so no EVFILT_VNODE events fire
	 */
	post_event_if_success(vp, error, NOTE_NONE);
	(void)vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
6386
6387
6388 /*
6389 * Check access permissions.
6390 *
6391 * Returns: 0 Success
6392 * vnode_authorize:???
6393 */
6394 static int
access1(vnode_t vp,vnode_t dvp,int uflags,vfs_context_t ctx)6395 access1(vnode_t vp, vnode_t dvp, int uflags, vfs_context_t ctx)
6396 {
6397 kauth_action_t action;
6398 int error;
6399
6400 /*
6401 * If just the regular access bits, convert them to something
6402 * that vnode_authorize will understand.
6403 */
6404 if (!(uflags & _ACCESS_EXTENDED_MASK)) {
6405 action = 0;
6406 if (uflags & R_OK) {
6407 action |= KAUTH_VNODE_READ_DATA; /* aka KAUTH_VNODE_LIST_DIRECTORY */
6408 }
6409 if (uflags & W_OK) {
6410 if (vnode_isdir(vp)) {
6411 action |= KAUTH_VNODE_ADD_FILE |
6412 KAUTH_VNODE_ADD_SUBDIRECTORY;
6413 /* might want delete rights here too */
6414 } else {
6415 action |= KAUTH_VNODE_WRITE_DATA;
6416 }
6417 }
6418 if (uflags & X_OK) {
6419 if (vnode_isdir(vp)) {
6420 action |= KAUTH_VNODE_SEARCH;
6421 } else {
6422 action |= KAUTH_VNODE_EXECUTE;
6423 }
6424 }
6425 } else {
6426 /* take advantage of definition of uflags */
6427 action = uflags >> 8;
6428 }
6429
6430 #if CONFIG_MACF
6431 error = mac_vnode_check_access(ctx, vp, uflags);
6432 if (error) {
6433 return error;
6434 }
6435 #endif /* MAC */
6436
6437 /* action == 0 means only check for existence */
6438 if (action != 0) {
6439 error = vnode_authorize(vp, dvp, action | KAUTH_VNODE_ACCESS, ctx);
6440 } else {
6441 error = 0;
6442 }
6443
6444 return error;
6445 }
6446
6447
6448
6449 /*
6450 * access_extended: Check access permissions in bulk.
6451 *
6452 * Description: uap->entries Pointer to an array of accessx
6453 * descriptor structs, plus one or
6454 * more NULL terminated strings (see
6455 * "Notes" section below).
6456 * uap->size Size of the area pointed to by
6457 * uap->entries.
6458 * uap->results Pointer to the results array.
6459 *
6460 * Returns: 0 Success
6461 * ENOMEM Insufficient memory
6462 * EINVAL Invalid arguments
6463 * namei:EFAULT Bad address
6464 * namei:ENAMETOOLONG Filename too long
6465 * namei:ENOENT No such file or directory
6466 * namei:ELOOP Too many levels of symbolic links
6467 * namei:EBADF Bad file descriptor
6468 * namei:ENOTDIR Not a directory
6469 * namei:???
6470 * access1:
6471 *
6472 * Implicit returns:
6473 * uap->results Array contents modified
6474 *
6475 * Notes: The uap->entries are structured as an arbitrary length array
6476 * of accessx descriptors, followed by one or more NULL terminated
6477 * strings
6478 *
6479 * struct accessx_descriptor[0]
6480 * ...
6481 * struct accessx_descriptor[n]
6482 * char name_data[0];
6483 *
6484 * We determine the entry count by walking the buffer containing
6485 * the uap->entries argument descriptor. For each descriptor we
6486 * see, the valid values for the offset ad_name_offset will be
6487 * in the byte range:
6488 *
6489 * [ uap->entries + sizeof(struct accessx_descriptor) ]
6490 * to
6491 * [ uap->entries + uap->size - 2 ]
6492 *
6493 * since we must have at least one string, and the string must
6494 * be at least one character plus the NULL terminator in length.
6495 *
6496 * XXX: Need to support the check-as uid argument
6497 */
6498 int
access_extended(__unused proc_t p,struct access_extended_args * uap,__unused int32_t * retval)6499 access_extended(__unused proc_t p, struct access_extended_args *uap, __unused int32_t *retval)
6500 {
6501 struct accessx_descriptor *input = NULL;
6502 errno_t *result = NULL;
6503 errno_t error = 0;
6504 int wantdelete = 0;
6505 size_t desc_max, desc_actual = 0;
6506 unsigned int i, j;
6507 struct vfs_context context;
6508 struct nameidata nd;
6509 int niopts;
6510 vnode_t vp = NULL;
6511 vnode_t dvp = NULL;
6512 #define ACCESSX_MAX_DESCR_ON_STACK 10
6513 struct accessx_descriptor stack_input[ACCESSX_MAX_DESCR_ON_STACK];
6514
6515 context.vc_ucred = NULL;
6516
6517 /*
6518 * Validate parameters; if valid, copy the descriptor array and string
6519 * arguments into local memory. Before proceeding, the following
6520 * conditions must have been met:
6521 *
6522 * o The total size is not permitted to exceed ACCESSX_MAX_TABLESIZE
6523 * o There must be sufficient room in the request for at least one
6524 * descriptor and a one yte NUL terminated string.
6525 * o The allocation of local storage must not fail.
6526 */
6527 if (uap->size > ACCESSX_MAX_TABLESIZE) {
6528 return ENOMEM;
6529 }
6530 if (uap->size < (sizeof(struct accessx_descriptor) + 2)) {
6531 return EINVAL;
6532 }
6533 if (uap->size <= sizeof(stack_input)) {
6534 input = stack_input;
6535 } else {
6536 input = kalloc_data(uap->size, Z_WAITOK);
6537 if (input == NULL) {
6538 error = ENOMEM;
6539 goto out;
6540 }
6541 }
6542 error = copyin(uap->entries, input, uap->size);
6543 if (error) {
6544 goto out;
6545 }
6546
6547 AUDIT_ARG(opaque, input, uap->size);
6548
6549 /*
	 * Force NUL termination of the copyin buffer to avoid namei() running
6551 * off the end. If the caller passes us bogus data, they may get a
6552 * bogus result.
6553 */
6554 ((char *)input)[uap->size - 1] = 0;
6555
6556 /*
6557 * Access is defined as checking against the process' real identity,
6558 * even if operations are checking the effective identity. This
6559 * requires that we use a local vfs context.
6560 */
6561 context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
6562 context.vc_thread = current_thread();
6563
6564 /*
6565 * Find out how many entries we have, so we can allocate the result
6566 * array by walking the list and adjusting the count downward by the
6567 * earliest string offset we see.
6568 */
6569 desc_max = (uap->size - 2) / sizeof(struct accessx_descriptor);
6570 desc_actual = desc_max;
6571 for (i = 0; i < desc_actual; i++) {
6572 /*
6573 * Take the offset to the name string for this entry and
6574 * convert to an input array index, which would be one off
6575 * the end of the array if this entry was the lowest-addressed
6576 * name string.
6577 */
6578 j = input[i].ad_name_offset / sizeof(struct accessx_descriptor);
6579
6580 /*
6581 * An offset greater than the max allowable offset is an error.
6582 * It is also an error for any valid entry to point
6583 * to a location prior to the end of the current entry, if
6584 * it's not a reference to the string of the previous entry.
6585 */
6586 if (j > desc_max || (j != 0 && j <= i)) {
6587 error = EINVAL;
6588 goto out;
6589 }
6590
6591 /* Also do not let ad_name_offset point to something beyond the size of the input */
6592 if (input[i].ad_name_offset >= uap->size) {
6593 error = EINVAL;
6594 goto out;
6595 }
6596
6597 /*
6598 * An offset of 0 means use the previous descriptor's offset;
6599 * this is used to chain multiple requests for the same file
6600 * to avoid multiple lookups.
6601 */
6602 if (j == 0) {
6603 /* This is not valid for the first entry */
6604 if (i == 0) {
6605 error = EINVAL;
6606 goto out;
6607 }
6608 continue;
6609 }
6610
6611 /*
6612 * If the offset of the string for this descriptor is before
6613 * what we believe is the current actual last descriptor,
6614 * then we need to adjust our estimate downward; this permits
6615 * the string table following the last descriptor to be out
6616 * of order relative to the descriptor list.
6617 */
6618 if (j < desc_actual) {
6619 desc_actual = j;
6620 }
6621 }
6622
6623 /*
6624 * We limit the actual number of descriptors we are willing to process
6625 * to a hard maximum of ACCESSX_MAX_DESCRIPTORS. If the number being
6626 * requested does not exceed this limit,
6627 */
6628 if (desc_actual > ACCESSX_MAX_DESCRIPTORS) {
6629 error = ENOMEM;
6630 goto out;
6631 }
6632 result = kalloc_data(desc_actual * sizeof(errno_t), Z_WAITOK | Z_ZERO);
6633 if (result == NULL) {
6634 error = ENOMEM;
6635 goto out;
6636 }
6637
6638 /*
6639 * Do the work by iterating over the descriptor entries we know to
6640 * at least appear to contain valid data.
6641 */
6642 error = 0;
6643 for (i = 0; i < desc_actual; i++) {
6644 /*
6645 * If the ad_name_offset is 0, then we use the previous
6646 * results to make the check; otherwise, we are looking up
6647 * a new file name.
6648 */
6649 if (input[i].ad_name_offset != 0) {
6650 /* discard old vnodes */
6651 if (vp) {
6652 vnode_put(vp);
6653 vp = NULL;
6654 }
6655 if (dvp) {
6656 vnode_put(dvp);
6657 dvp = NULL;
6658 }
6659
6660 /*
6661 * Scan forward in the descriptor list to see if we
6662 * need the parent vnode. We will need it if we are
6663 * deleting, since we must have rights to remove
6664 * entries in the parent directory, as well as the
6665 * rights to delete the object itself.
6666 */
6667 wantdelete = input[i].ad_flags & _DELETE_OK;
6668 for (j = i + 1; (j < desc_actual) && (input[j].ad_name_offset == 0); j++) {
6669 if (input[j].ad_flags & _DELETE_OK) {
6670 wantdelete = 1;
6671 }
6672 }
6673
6674 niopts = FOLLOW | AUDITVNPATH1;
6675
6676 /* need parent for vnode_authorize for deletion test */
6677 if (wantdelete) {
6678 niopts |= WANTPARENT;
6679 }
6680
6681 /* do the lookup */
6682 NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, UIO_SYSSPACE,
6683 CAST_USER_ADDR_T(((const char *)input) + input[i].ad_name_offset),
6684 &context);
6685 error = namei(&nd);
6686 if (!error) {
6687 vp = nd.ni_vp;
6688 if (wantdelete) {
6689 dvp = nd.ni_dvp;
6690 }
6691 }
6692 nameidone(&nd);
6693 }
6694
6695 /*
6696 * Handle lookup errors.
6697 */
6698 switch (error) {
6699 case ENOENT:
6700 case EACCES:
6701 case EPERM:
6702 case ENOTDIR:
6703 result[i] = error;
6704 break;
6705 case 0:
6706 /* run this access check */
6707 result[i] = access1(vp, dvp, input[i].ad_flags, &context);
6708 break;
6709 default:
6710 /* fatal lookup error */
6711
6712 goto out;
6713 }
6714 }
6715
6716 AUDIT_ARG(data, result, sizeof(errno_t), desc_actual);
6717
6718 /* copy out results */
6719 error = copyout(result, uap->results, desc_actual * sizeof(errno_t));
6720
6721 out:
6722 if (input && input != stack_input) {
6723 kfree_data(input, uap->size);
6724 }
6725 if (result) {
6726 kfree_data(result, desc_actual * sizeof(errno_t));
6727 }
6728 if (vp) {
6729 vnode_put(vp);
6730 }
6731 if (dvp) {
6732 vnode_put(dvp);
6733 }
6734 if (IS_VALID_CRED(context.vc_ucred)) {
6735 kauth_cred_unref(&context.vc_ucred);
6736 }
6737 return error;
6738 }
6739
6740
6741 /*
6742 * Returns: 0 Success
6743 * namei:EFAULT Bad address
6744 * namei:ENAMETOOLONG Filename too long
6745 * namei:ENOENT No such file or directory
6746 * namei:ELOOP Too many levels of symbolic links
6747 * namei:EBADF Bad file descriptor
6748 * namei:ENOTDIR Not a directory
6749 * namei:???
6750 * access1:
6751 */
static int
faccessat_internal(vfs_context_t ctx, int fd, user_addr_t path, int amode,
    int flag, enum uio_seg segflg)
{
	int error;
	struct nameidata nd;
	int niopts;
	struct vfs_context context;
#if NAMEDRSRCFORK
	int is_namedstream = 0;
#endif

	/*
	 * Unless the AT_EACCESS option is used, Access is defined as checking
	 * against the process' real identity, even if operations are checking
	 * the effective identity. So we need to tweak the credential
	 * in the context for that case.
	 *
	 * NB: in the AT_EACCESS case we borrow the caller's credential without
	 * taking a reference, which is why the unref at 'out:' below is
	 * conditional on the same flag.
	 */
	if (!(flag & AT_EACCESS)) {
		context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
	} else {
		context.vc_ucred = ctx->vc_ucred;
	}
	context.vc_thread = ctx->vc_thread;


	/* Both "don't follow the last symlink" flavors suppress FOLLOW. */
	niopts = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY) ? NOFOLLOW : FOLLOW) | AUDITVNPATH1;
	/* need parent for vnode_authorize for deletion test */
	if (amode & _DELETE_OK) {
		niopts |= WANTPARENT;
	}
	NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, segflg,
	    path, &context);
	if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}

#if NAMEDRSRCFORK
	/* access(F_OK) calls are allowed for resource forks. */
	if (amode == F_OK) {
		nd.ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
	}
#endif
	error = nameiat(&nd, fd);
	if (error) {
		goto out;
	}

#if NAMEDRSRCFORK
	/* Grab reference on the shadow stream file vnode to
	 * force an inactive on release which will mark it
	 * for recycle.
	 */
	if (vnode_isnamedstream(nd.ni_vp) &&
	    (nd.ni_vp->v_parent != NULLVP) &&
	    vnode_isshadow(nd.ni_vp)) {
		is_namedstream = 1;
		vnode_ref(nd.ni_vp);
	}
#endif

	/* Run the permission check against the (possibly tweaked) credential. */
	error = access1(nd.ni_vp, nd.ni_dvp, amode, &context);

#if NAMEDRSRCFORK
	if (is_namedstream) {
		vnode_rele(nd.ni_vp);
	}
#endif

	vnode_put(nd.ni_vp);
	/* ni_dvp carries an iocount only when WANTPARENT was set above. */
	if (amode & _DELETE_OK) {
		vnode_put(nd.ni_dvp);
	}
	nameidone(&nd);

out:
	/* Only the non-AT_EACCESS path took a credential reference. */
	if (!(flag & AT_EACCESS)) {
		kauth_cred_unref(&context.vc_ucred);
	}
	return error;
}
6833
6834 int
access(__unused proc_t p,struct access_args * uap,__unused int32_t * retval)6835 access(__unused proc_t p, struct access_args *uap, __unused int32_t *retval)
6836 {
6837 return faccessat_internal(vfs_context_current(), AT_FDCWD,
6838 uap->path, uap->flags, 0, UIO_USERSPACE);
6839 }
6840
6841 int
faccessat(__unused proc_t p,struct faccessat_args * uap,__unused int32_t * retval)6842 faccessat(__unused proc_t p, struct faccessat_args *uap,
6843 __unused int32_t *retval)
6844 {
6845 if (uap->flag & ~(AT_EACCESS | AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) {
6846 return EINVAL;
6847 }
6848
6849 return faccessat_internal(vfs_context_current(), uap->fd,
6850 uap->path, uap->amode, uap->flag, UIO_USERSPACE);
6851 }
6852
6853 /*
6854 * Returns: 0 Success
6855 * EFAULT
6856 * copyout:EFAULT
6857 * namei:???
6858 * vn_stat:???
6859 */
6860 static int
fstatat_internal(vfs_context_t ctx,user_addr_t path,user_addr_t ub,user_addr_t xsecurity,user_addr_t xsecurity_size,int isstat64,enum uio_seg segflg,int fd,int flag)6861 fstatat_internal(vfs_context_t ctx, user_addr_t path, user_addr_t ub,
6862 user_addr_t xsecurity, user_addr_t xsecurity_size, int isstat64,
6863 enum uio_seg segflg, int fd, int flag)
6864 {
6865 struct nameidata nd;
6866 int follow;
6867 union {
6868 struct stat sb;
6869 struct stat64 sb64;
6870 } source = {};
6871 union {
6872 struct user64_stat user64_sb;
6873 struct user32_stat user32_sb;
6874 struct user64_stat64 user64_sb64;
6875 struct user32_stat64 user32_sb64;
6876 } dest = {};
6877 caddr_t sbp;
6878 int error, my_size;
6879 kauth_filesec_t fsec;
6880 size_t xsecurity_bufsize;
6881 void * statptr;
6882 struct fileproc *fp = NULL;
6883 int needsrealdev = 0;
6884
6885 follow = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) ? NOFOLLOW : FOLLOW;
6886 NDINIT(&nd, LOOKUP, OP_GETATTR, follow | AUDITVNPATH1,
6887 segflg, path, ctx);
6888 if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
6889 nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
6890 }
6891
6892 #if NAMEDRSRCFORK
6893 int is_namedstream = 0;
6894 /* stat calls are allowed for resource forks. */
6895 nd.ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
6896 #endif
6897
6898 if (flag & AT_FDONLY) {
6899 vnode_t fvp;
6900
6901 error = fp_getfvp(vfs_context_proc(ctx), fd, &fp, &fvp);
6902 if (error) {
6903 return error;
6904 }
6905 if ((error = vnode_getwithref(fvp))) {
6906 file_drop(fd);
6907 return error;
6908 }
6909 nd.ni_vp = fvp;
6910 } else {
6911 error = nameiat(&nd, fd);
6912 if (error) {
6913 return error;
6914 }
6915 }
6916 fsec = KAUTH_FILESEC_NONE;
6917
6918 statptr = (void *)&source;
6919
6920 #if NAMEDRSRCFORK
6921 /* Grab reference on the shadow stream file vnode to
6922 * force an inactive on release which will mark it
6923 * for recycle.
6924 */
6925 if (vnode_isnamedstream(nd.ni_vp) &&
6926 (nd.ni_vp->v_parent != NULLVP) &&
6927 vnode_isshadow(nd.ni_vp)) {
6928 is_namedstream = 1;
6929 vnode_ref(nd.ni_vp);
6930 }
6931 #endif
6932
6933 needsrealdev = flag & AT_REALDEV ? 1 : 0;
6934 if (fp && (xsecurity == USER_ADDR_NULL)) {
6935 /*
6936 * If the caller has the file open, and is not
6937 * requesting extended security information, we are
6938 * going to let them get the basic stat information.
6939 */
6940 error = vn_stat_noauth(nd.ni_vp, statptr, NULL, isstat64, needsrealdev, ctx,
6941 fp->fp_glob->fg_cred);
6942 } else {
6943 error = vn_stat(nd.ni_vp, statptr, (xsecurity != USER_ADDR_NULL ? &fsec : NULL),
6944 isstat64, needsrealdev, ctx);
6945 }
6946
6947 #if NAMEDRSRCFORK
6948 if (is_namedstream) {
6949 vnode_rele(nd.ni_vp);
6950 }
6951 #endif
6952 vnode_put(nd.ni_vp);
6953 nameidone(&nd);
6954 if (fp) {
6955 file_drop(fd);
6956 fp = NULL;
6957 }
6958
6959 if (error) {
6960 return error;
6961 }
6962 /* Zap spare fields */
6963 if (isstat64 != 0) {
6964 source.sb64.st_lspare = 0;
6965 source.sb64.st_qspare[0] = 0LL;
6966 source.sb64.st_qspare[1] = 0LL;
6967 if (vfs_context_is64bit(ctx)) {
6968 munge_user64_stat64(&source.sb64, &dest.user64_sb64);
6969 my_size = sizeof(dest.user64_sb64);
6970 sbp = (caddr_t)&dest.user64_sb64;
6971 } else {
6972 munge_user32_stat64(&source.sb64, &dest.user32_sb64);
6973 my_size = sizeof(dest.user32_sb64);
6974 sbp = (caddr_t)&dest.user32_sb64;
6975 }
6976 /*
6977 * Check if we raced (post lookup) against the last unlink of a file.
6978 */
6979 if ((source.sb64.st_nlink == 0) && S_ISREG(source.sb64.st_mode)) {
6980 source.sb64.st_nlink = 1;
6981 }
6982 } else {
6983 source.sb.st_lspare = 0;
6984 source.sb.st_qspare[0] = 0LL;
6985 source.sb.st_qspare[1] = 0LL;
6986 if (vfs_context_is64bit(ctx)) {
6987 munge_user64_stat(&source.sb, &dest.user64_sb);
6988 my_size = sizeof(dest.user64_sb);
6989 sbp = (caddr_t)&dest.user64_sb;
6990 } else {
6991 munge_user32_stat(&source.sb, &dest.user32_sb);
6992 my_size = sizeof(dest.user32_sb);
6993 sbp = (caddr_t)&dest.user32_sb;
6994 }
6995
6996 /*
6997 * Check if we raced (post lookup) against the last unlink of a file.
6998 */
6999 if ((source.sb.st_nlink == 0) && S_ISREG(source.sb.st_mode)) {
7000 source.sb.st_nlink = 1;
7001 }
7002 }
7003 if ((error = copyout(sbp, ub, my_size)) != 0) {
7004 goto out;
7005 }
7006
7007 /* caller wants extended security information? */
7008 if (xsecurity != USER_ADDR_NULL) {
7009 /* did we get any? */
7010 if (fsec == KAUTH_FILESEC_NONE) {
7011 if (susize(xsecurity_size, 0) != 0) {
7012 error = EFAULT;
7013 goto out;
7014 }
7015 } else {
7016 /* find the user buffer size */
7017 xsecurity_bufsize = fusize(xsecurity_size);
7018
7019 /* copy out the actual data size */
7020 if (susize(xsecurity_size, KAUTH_FILESEC_COPYSIZE(fsec)) != 0) {
7021 error = EFAULT;
7022 goto out;
7023 }
7024
7025 /* if the caller supplied enough room, copy out to it */
7026 if (xsecurity_bufsize >= KAUTH_FILESEC_COPYSIZE(fsec)) {
7027 error = copyout(fsec, xsecurity, KAUTH_FILESEC_COPYSIZE(fsec));
7028 }
7029 }
7030 }
7031 out:
7032 if (fsec != KAUTH_FILESEC_NONE) {
7033 kauth_filesec_free(fsec);
7034 }
7035 return error;
7036 }
7037
7038 /*
7039 * stat_extended: Get file status; with extended security (ACL).
7040 *
7041 * Parameters: p (ignored)
7042 * uap User argument descriptor (see below)
7043 * retval (ignored)
7044 *
7045 * Indirect: uap->path Path of file to get status from
7046 * uap->ub User buffer (holds file status info)
7047 * uap->xsecurity ACL to get (extended security)
7048 * uap->xsecurity_size Size of ACL
7049 *
7050 * Returns: 0 Success
7051 * !0 errno value
7052 *
7053 */
7054 int
stat_extended(__unused proc_t p,struct stat_extended_args * uap,__unused int32_t * retval)7055 stat_extended(__unused proc_t p, struct stat_extended_args *uap,
7056 __unused int32_t *retval)
7057 {
7058 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7059 uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
7060 0);
7061 }
7062
7063 /*
7064 * Returns: 0 Success
7065 * fstatat_internal:??? [see fstatat_internal() in this file]
7066 */
7067 int
stat(__unused proc_t p,struct stat_args * uap,__unused int32_t * retval)7068 stat(__unused proc_t p, struct stat_args *uap, __unused int32_t *retval)
7069 {
7070 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7071 0, 0, 0, UIO_USERSPACE, AT_FDCWD, 0);
7072 }
7073
7074 int
stat64(__unused proc_t p,struct stat64_args * uap,__unused int32_t * retval)7075 stat64(__unused proc_t p, struct stat64_args *uap, __unused int32_t *retval)
7076 {
7077 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7078 0, 0, 1, UIO_USERSPACE, AT_FDCWD, 0);
7079 }
7080
7081 /*
7082 * stat64_extended: Get file status; can handle large inode numbers; with extended security (ACL).
7083 *
7084 * Parameters: p (ignored)
7085 * uap User argument descriptor (see below)
7086 * retval (ignored)
7087 *
7088 * Indirect: uap->path Path of file to get status from
7089 * uap->ub User buffer (holds file status info)
7090 * uap->xsecurity ACL to get (extended security)
7091 * uap->xsecurity_size Size of ACL
7092 *
7093 * Returns: 0 Success
7094 * !0 errno value
7095 *
7096 */
7097 int
stat64_extended(__unused proc_t p,struct stat64_extended_args * uap,__unused int32_t * retval)7098 stat64_extended(__unused proc_t p, struct stat64_extended_args *uap, __unused int32_t *retval)
7099 {
7100 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7101 uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
7102 0);
7103 }
7104
7105 /*
7106 * lstat_extended: Get file status; does not follow links; with extended security (ACL).
7107 *
7108 * Parameters: p (ignored)
7109 * uap User argument descriptor (see below)
7110 * retval (ignored)
7111 *
7112 * Indirect: uap->path Path of file to get status from
7113 * uap->ub User buffer (holds file status info)
7114 * uap->xsecurity ACL to get (extended security)
7115 * uap->xsecurity_size Size of ACL
7116 *
7117 * Returns: 0 Success
7118 * !0 errno value
7119 *
7120 */
7121 int
lstat_extended(__unused proc_t p,struct lstat_extended_args * uap,__unused int32_t * retval)7122 lstat_extended(__unused proc_t p, struct lstat_extended_args *uap, __unused int32_t *retval)
7123 {
7124 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7125 uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
7126 AT_SYMLINK_NOFOLLOW);
7127 }
7128
7129 /*
7130 * Get file status; this version does not follow links.
7131 */
7132 int
lstat(__unused proc_t p,struct lstat_args * uap,__unused int32_t * retval)7133 lstat(__unused proc_t p, struct lstat_args *uap, __unused int32_t *retval)
7134 {
7135 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7136 0, 0, 0, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW);
7137 }
7138
7139 int
lstat64(__unused proc_t p,struct lstat64_args * uap,__unused int32_t * retval)7140 lstat64(__unused proc_t p, struct lstat64_args *uap, __unused int32_t *retval)
7141 {
7142 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7143 0, 0, 1, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW);
7144 }
7145
7146 /*
7147 * lstat64_extended: Get file status; can handle large inode numbers; does not
7148 * follow links; with extended security (ACL).
7149 *
7150 * Parameters: p (ignored)
7151 * uap User argument descriptor (see below)
7152 * retval (ignored)
7153 *
7154 * Indirect: uap->path Path of file to get status from
7155 * uap->ub User buffer (holds file status info)
7156 * uap->xsecurity ACL to get (extended security)
7157 * uap->xsecurity_size Size of ACL
7158 *
7159 * Returns: 0 Success
7160 * !0 errno value
7161 *
7162 */
7163 int
lstat64_extended(__unused proc_t p,struct lstat64_extended_args * uap,__unused int32_t * retval)7164 lstat64_extended(__unused proc_t p, struct lstat64_extended_args *uap, __unused int32_t *retval)
7165 {
7166 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7167 uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
7168 AT_SYMLINK_NOFOLLOW);
7169 }
7170
7171 int
fstatat(__unused proc_t p,struct fstatat_args * uap,__unused int32_t * retval)7172 fstatat(__unused proc_t p, struct fstatat_args *uap, __unused int32_t *retval)
7173 {
7174 if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_REALDEV | AT_FDONLY | AT_SYMLINK_NOFOLLOW_ANY)) {
7175 return EINVAL;
7176 }
7177
7178 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7179 0, 0, 0, UIO_USERSPACE, uap->fd, uap->flag);
7180 }
7181
7182 int
fstatat64(__unused proc_t p,struct fstatat64_args * uap,__unused int32_t * retval)7183 fstatat64(__unused proc_t p, struct fstatat64_args *uap,
7184 __unused int32_t *retval)
7185 {
7186 if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_REALDEV | AT_FDONLY | AT_SYMLINK_NOFOLLOW_ANY)) {
7187 return EINVAL;
7188 }
7189
7190 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7191 0, 0, 1, UIO_USERSPACE, uap->fd, uap->flag);
7192 }
7193
7194 /*
7195 * Get configurable pathname variables.
7196 *
7197 * Returns: 0 Success
7198 * namei:???
7199 * vn_pathconf:???
7200 *
7201 * Notes: Global implementation constants are intended to be
7202 * implemented in this function directly; all other constants
7203 * are per-FS implementation, and therefore must be handled in
7204 * each respective FS, instead.
7205 *
7206 * XXX We implement some things globally right now that should actually be
7207 * XXX per-FS; we will need to deal with this at some point.
7208 */
7209 /* ARGSUSED */
7210 int
pathconf(__unused proc_t p,struct pathconf_args * uap,int32_t * retval)7211 pathconf(__unused proc_t p, struct pathconf_args *uap, int32_t *retval)
7212 {
7213 int error;
7214 struct nameidata nd;
7215 vfs_context_t ctx = vfs_context_current();
7216
7217 NDINIT(&nd, LOOKUP, OP_PATHCONF, FOLLOW | AUDITVNPATH1,
7218 UIO_USERSPACE, uap->path, ctx);
7219 error = namei(&nd);
7220 if (error) {
7221 return error;
7222 }
7223
7224 error = vn_pathconf(nd.ni_vp, uap->name, retval, ctx);
7225
7226 vnode_put(nd.ni_vp);
7227 nameidone(&nd);
7228 return error;
7229 }
7230
7231 /*
7232 * Return target name of a symbolic link.
7233 */
7234 /* ARGSUSED */
7235 static int
readlinkat_internal(vfs_context_t ctx,int fd,vnode_t lnk_vp,user_addr_t path,enum uio_seg seg,user_addr_t buf,size_t bufsize,enum uio_seg bufseg,int * retval)7236 readlinkat_internal(vfs_context_t ctx, int fd, vnode_t lnk_vp, user_addr_t path,
7237 enum uio_seg seg, user_addr_t buf, size_t bufsize, enum uio_seg bufseg,
7238 int *retval)
7239 {
7240 vnode_t vp;
7241 uio_t auio;
7242 int error;
7243 struct nameidata nd;
7244 uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];
7245 bool put_vnode;
7246
7247 if (bufsize > INT32_MAX) {
7248 return EINVAL;
7249 }
7250
7251 if (lnk_vp) {
7252 vp = lnk_vp;
7253 put_vnode = false;
7254 } else {
7255 NDINIT(&nd, LOOKUP, OP_READLINK, NOFOLLOW | AUDITVNPATH1,
7256 seg, path, ctx);
7257
7258 error = nameiat(&nd, fd);
7259 if (error) {
7260 return error;
7261 }
7262 vp = nd.ni_vp;
7263 put_vnode = true;
7264 nameidone(&nd);
7265 }
7266
7267 auio = uio_createwithbuffer(1, 0, bufseg, UIO_READ,
7268 &uio_buf[0], sizeof(uio_buf));
7269 uio_addiov(auio, buf, bufsize);
7270 if (vp->v_type != VLNK) {
7271 error = EINVAL;
7272 } else {
7273 #if CONFIG_MACF
7274 error = mac_vnode_check_readlink(ctx, vp);
7275 #endif
7276 if (error == 0) {
7277 error = vnode_authorize(vp, NULL, KAUTH_VNODE_READ_DATA,
7278 ctx);
7279 }
7280 if (error == 0) {
7281 error = VNOP_READLINK(vp, auio, ctx);
7282 }
7283 }
7284
7285 if (put_vnode) {
7286 vnode_put(vp);
7287 }
7288
7289 *retval = (int)(bufsize - uio_resid(auio));
7290 return error;
7291 }
7292
7293 int
freadlink(proc_t p,struct freadlink_args * uap,int32_t * retval)7294 freadlink(proc_t p, struct freadlink_args *uap, int32_t *retval)
7295 {
7296 enum uio_seg procseg;
7297 vnode_t vp;
7298 int error;
7299
7300 procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
7301
7302 AUDIT_ARG(fd, uap->fd);
7303
7304 if ((error = file_vnode(uap->fd, &vp))) {
7305 return error;
7306 }
7307 if ((error = vnode_getwithref(vp))) {
7308 file_drop(uap->fd);
7309 return error;
7310 }
7311
7312 error = readlinkat_internal(vfs_context_current(), -1,
7313 vp, 0, procseg, CAST_USER_ADDR_T(uap->buf),
7314 uap->bufsize, procseg, retval);
7315
7316 vnode_put(vp);
7317 file_drop(uap->fd);
7318 return error;
7319 }
7320
7321 int
readlink(proc_t p,struct readlink_args * uap,int32_t * retval)7322 readlink(proc_t p, struct readlink_args *uap, int32_t *retval)
7323 {
7324 enum uio_seg procseg;
7325
7326 procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
7327 return readlinkat_internal(vfs_context_current(), AT_FDCWD, NULL,
7328 CAST_USER_ADDR_T(uap->path), procseg, CAST_USER_ADDR_T(uap->buf),
7329 uap->count, procseg, retval);
7330 }
7331
7332 int
readlinkat(proc_t p,struct readlinkat_args * uap,int32_t * retval)7333 readlinkat(proc_t p, struct readlinkat_args *uap, int32_t *retval)
7334 {
7335 enum uio_seg procseg;
7336
7337 procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
7338 return readlinkat_internal(vfs_context_current(), uap->fd, NULL,
7339 CAST_USER_ADDR_T(uap->path), procseg, uap->buf, uap->bufsize, procseg,
7340 retval);
7341 }
7342
7343 /*
7344 * Change file flags, the deep inner layer.
7345 */
7346 static int
chflags0(vnode_t vp,struct vnode_attr * va,int (* setattr)(vnode_t,void *,vfs_context_t),void * arg,vfs_context_t ctx)7347 chflags0(vnode_t vp, struct vnode_attr *va,
7348 int (*setattr)(vnode_t, void *, vfs_context_t),
7349 void *arg, vfs_context_t ctx)
7350 {
7351 kauth_action_t action = 0;
7352 int error;
7353
7354 #if CONFIG_MACF
7355 error = mac_vnode_check_setflags(ctx, vp, va->va_flags);
7356 if (error) {
7357 goto out;
7358 }
7359 #endif
7360
7361 /* request authorisation, disregard immutability */
7362 if ((error = vnode_authattr(vp, va, &action, ctx)) != 0) {
7363 goto out;
7364 }
7365 /*
7366 * Request that the auth layer disregard those file flags it's allowed to when
7367 * authorizing this operation; we need to do this in order to be able to
7368 * clear immutable flags.
7369 */
7370 if (action && ((error = vnode_authorize(vp, NULL, action | KAUTH_VNODE_NOIMMUTABLE, ctx)) != 0)) {
7371 goto out;
7372 }
7373 error = (*setattr)(vp, arg, ctx);
7374
7375 #if CONFIG_MACF
7376 if (error == 0) {
7377 mac_vnode_notify_setflags(ctx, vp, va->va_flags);
7378 }
7379 #endif
7380
7381 out:
7382 return error;
7383 }
7384
7385 /*
7386 * Change file flags.
7387 *
7388 * NOTE: this will vnode_put() `vp'
7389 */
7390 static int
chflags1(vnode_t vp,int flags,vfs_context_t ctx)7391 chflags1(vnode_t vp, int flags, vfs_context_t ctx)
7392 {
7393 struct vnode_attr va;
7394 int error;
7395
7396 VATTR_INIT(&va);
7397 VATTR_SET(&va, va_flags, flags);
7398
7399 error = chflags0(vp, &va, (void *)vnode_setattr, &va, ctx);
7400 vnode_put(vp);
7401
7402 if ((error == 0) && !VATTR_IS_SUPPORTED(&va, va_flags)) {
7403 error = ENOTSUP;
7404 }
7405
7406 return error;
7407 }
7408
7409 /*
7410 * Change flags of a file given a path name.
7411 */
7412 /* ARGSUSED */
int
chflags(__unused proc_t p, struct chflags_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	vfs_context_t ctx = vfs_context_current();
	int error;
	struct nameidata nd;
	uint32_t wantparent = 0;

#if CONFIG_FILE_LEASES
	/* Parent vnode is needed only to break a directory lease below. */
	wantparent = WANTPARENT;
#endif

	AUDIT_ARG(fflags, uap->flags);
	NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1 | wantparent,
	    UIO_USERSPACE, uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;

#if CONFIG_FILE_LEASES
	/* Break any lease on the parent, then drop its iocount right away. */
	vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
	vnode_put(nd.ni_dvp);
#endif

	nameidone(&nd);

	/* we don't vnode_put() here because chflags1 does internally */
	error = chflags1(vp, uap->flags, ctx);

	return error;
}
7447
7448 /*
7449 * Change flags of a file given a file descriptor.
7450 */
7451 /* ARGSUSED */
7452 int
fchflags(__unused proc_t p,struct fchflags_args * uap,__unused int32_t * retval)7453 fchflags(__unused proc_t p, struct fchflags_args *uap, __unused int32_t *retval)
7454 {
7455 vnode_t vp;
7456 int error;
7457
7458 AUDIT_ARG(fd, uap->fd);
7459 AUDIT_ARG(fflags, uap->flags);
7460 if ((error = file_vnode(uap->fd, &vp))) {
7461 return error;
7462 }
7463
7464 if ((error = vnode_getwithref(vp))) {
7465 file_drop(uap->fd);
7466 return error;
7467 }
7468
7469 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
7470
7471 #if CONFIG_FILE_LEASES
7472 vnode_breakdirlease(vp, true, O_WRONLY);
7473 #endif
7474
7475 /* we don't vnode_put() here because chflags1 does internally */
7476 error = chflags1(vp, uap->flags, vfs_context_current());
7477
7478 file_drop(uap->fd);
7479 return error;
7480 }
7481
7482 /*
7483 * Change security information on a filesystem object.
7484 *
7485 * Returns: 0 Success
7486 * EPERM Operation not permitted
7487 * vnode_authattr:??? [anything vnode_authattr can return]
7488 * vnode_authorize:??? [anything vnode_authorize can return]
7489 * vnode_setattr:??? [anything vnode_setattr can return]
7490 *
7491 * Notes: If vnode_authattr or vnode_authorize return EACCES, it will be
7492 * translated to EPERM before being returned.
7493 */
static int
chmod_vnode(vfs_context_t ctx, vnode_t vp, struct vnode_attr *vap)
{
	kauth_action_t action;
	int error;

	AUDIT_ARG(mode, vap->va_mode);
	/* XXX audit new args */

#if NAMEDSTREAMS
	/* chmod calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		return EPERM;
	}
#endif

#if CONFIG_MACF
	/* Give MAC policies a chance to veto each kind of change requested. */
	if (VATTR_IS_ACTIVE(vap, va_mode) &&
	    (error = mac_vnode_check_setmode(ctx, vp, (mode_t)vap->va_mode)) != 0) {
		return error;
	}

	if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid)) {
		if ((error = mac_vnode_check_setowner(ctx, vp,
		    VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : -1,
		    VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : -1))) {
			return error;
		}
	}

	if (VATTR_IS_ACTIVE(vap, va_acl) &&
	    (error = mac_vnode_check_setacl(ctx, vp, vap->va_acl))) {
		return error;
	}
#endif

	/* make sure that the caller is allowed to set this security information */
	if (((error = vnode_authattr(vp, vap, &action, ctx)) != 0) ||
	    ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		/* EACCES is translated to EPERM per this function's contract. */
		if (error == EACCES) {
			error = EPERM;
		}
		return error;
	}

	if ((error = vnode_setattr(vp, vap, ctx)) != 0) {
		return error;
	}

#if CONFIG_MACF
	/* Change succeeded: mirror each pre-check with its MAC notification. */
	if (VATTR_IS_ACTIVE(vap, va_mode)) {
		mac_vnode_notify_setmode(ctx, vp, (mode_t)vap->va_mode);
	}

	if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid)) {
		mac_vnode_notify_setowner(ctx, vp,
		    VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : -1,
		    VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : -1);
	}

	if (VATTR_IS_ACTIVE(vap, va_acl)) {
		mac_vnode_notify_setacl(ctx, vp, vap->va_acl);
	}
#endif

	return error;
}
7561
7562
7563 /*
7564 * Change mode of a file given a path name.
7565 *
7566 * Returns: 0 Success
7567 * namei:??? [anything namei can return]
7568 * chmod_vnode:??? [anything chmod_vnode can return]
7569 */
static int
chmodat(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap,
    int fd, int flag, enum uio_seg segflg)
{
	struct nameidata nd;
	int follow, error;
	uint32_t wantparent = 0;

#if CONFIG_FILE_LEASES
	/* Parent vnode is needed only so its directory lease can be broken. */
	wantparent = WANTPARENT;
#endif

	follow = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) ? NOFOLLOW : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1 | wantparent,
	    segflg, path, ctx);
	if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}
	if ((error = nameiat(&nd, fd))) {
		return error;
	}

#if CONFIG_FILE_LEASES
	/* Break any parent lease, then drop its iocount before the chmod. */
	vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
	vnode_put(nd.ni_dvp);
#endif

	error = chmod_vnode(ctx, nd.ni_vp, vap);
	vnode_put(nd.ni_vp);
	nameidone(&nd);
	return error;
}
7602
7603 static int
chmod_extended_init(struct vnode_attr * pva,kauth_filesec_t * pxsecdst,int mode,uid_t uid,gid_t gid,user_addr_t xsecurity)7604 chmod_extended_init(struct vnode_attr *pva, kauth_filesec_t *pxsecdst, int mode, uid_t uid,
7605 gid_t gid, user_addr_t xsecurity)
7606 {
7607 int error;
7608
7609 VATTR_INIT(pva);
7610
7611 if (mode != -1) {
7612 VATTR_SET(pva, va_mode, mode & ALLPERMS);
7613 } else {
7614 pva->va_mode = 0;
7615 }
7616
7617 if (uid != KAUTH_UID_NONE) {
7618 VATTR_SET(pva, va_uid, uid);
7619 }
7620
7621 if (gid != KAUTH_GID_NONE) {
7622 VATTR_SET(pva, va_gid, gid);
7623 }
7624
7625 *pxsecdst = NULL;
7626 switch (xsecurity) {
7627 case USER_ADDR_NULL:
7628 break;
7629
7630 case CAST_USER_ADDR_T((void *)1): /* _FILESEC_REMOVE_ACL */
7631 VATTR_SET(pva, va_acl, NULL);
7632 break;
7633
7634 default:
7635 if ((error = kauth_copyinfilesec(xsecurity, pxsecdst)) != 0) {
7636 return error;
7637 }
7638
7639 VATTR_SET(pva, va_acl, &(*pxsecdst)->fsec_acl);
7640 pva->va_vaflags |= VA_FILESEC_ACL;
7641 KAUTH_DEBUG("CHMOD - setting ACL with %d entries", pva->va_acl->acl_entrycount);
7642 break;
7643 }
7644
7645 return 0;
7646 }
7647
7648 /*
7649 * chmod_extended: Change the mode of a file given a path name; with extended
7650 * argument list (including extended security (ACL)).
7651 *
7652 * Parameters: p Process requesting the open
7653 * uap User argument descriptor (see below)
7654 * retval (ignored)
7655 *
7656 * Indirect: uap->path Path to object (same as 'chmod')
7657 * uap->uid UID to set
7658 * uap->gid GID to set
7659 * uap->mode File mode to set (same as 'chmod')
7660 * uap->xsecurity ACL to set (or delete)
7661 *
7662 * Returns: 0 Success
7663 * !0 errno value
7664 *
7665 * Notes: The kauth_filesec_t in 'va', if any, is in host byte order.
7666 *
7667 * XXX: We should enummerate the possible errno values here, and where
7668 * in the code they originated.
7669 */
7670 int
chmod_extended(__unused proc_t p,struct chmod_extended_args * uap,__unused int32_t * retval)7671 chmod_extended(__unused proc_t p, struct chmod_extended_args *uap, __unused int32_t *retval)
7672 {
7673 int error;
7674 struct vnode_attr va;
7675 kauth_filesec_t xsecdst = NULL;
7676
7677 AUDIT_ARG(owner, uap->uid, uap->gid);
7678
7679 error = chmod_extended_init(&va, &xsecdst, uap->mode, uap->uid,
7680 uap->gid, uap->xsecurity);
7681
7682 if (error) {
7683 return error;
7684 }
7685
7686 error = chmodat(vfs_context_current(), uap->path, &va, AT_FDCWD, 0,
7687 UIO_USERSPACE);
7688
7689 if (xsecdst != NULL) {
7690 kauth_filesec_free(xsecdst);
7691 }
7692 return error;
7693 }
7694
7695 /*
7696 * Returns: 0 Success
7697 * chmodat:??? [anything chmodat can return]
7698 */
7699 static int
fchmodat_internal(vfs_context_t ctx,user_addr_t path,int mode,int fd,int flag,enum uio_seg segflg)7700 fchmodat_internal(vfs_context_t ctx, user_addr_t path, int mode, int fd,
7701 int flag, enum uio_seg segflg)
7702 {
7703 struct vnode_attr va;
7704
7705 VATTR_INIT(&va);
7706 VATTR_SET(&va, va_mode, mode & ALLPERMS);
7707
7708 return chmodat(ctx, path, &va, fd, flag, segflg);
7709 }
7710
7711 int
chmod(__unused proc_t p,struct chmod_args * uap,__unused int32_t * retval)7712 chmod(__unused proc_t p, struct chmod_args *uap, __unused int32_t *retval)
7713 {
7714 return fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
7715 AT_FDCWD, 0, UIO_USERSPACE);
7716 }
7717
7718 int
fchmodat(__unused proc_t p,struct fchmodat_args * uap,__unused int32_t * retval)7719 fchmodat(__unused proc_t p, struct fchmodat_args *uap, __unused int32_t *retval)
7720 {
7721 if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) {
7722 return EINVAL;
7723 }
7724
7725 return fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
7726 uap->fd, uap->flag, UIO_USERSPACE);
7727 }
7728
7729 /*
7730 * Change mode of a file given a file descriptor.
7731 */
7732 static int
fchmod1(__unused proc_t p,int fd,struct vnode_attr * vap)7733 fchmod1(__unused proc_t p, int fd, struct vnode_attr *vap)
7734 {
7735 vnode_t vp;
7736 int error;
7737
7738 AUDIT_ARG(fd, fd);
7739
7740 if ((error = file_vnode(fd, &vp)) != 0) {
7741 return error;
7742 }
7743 if ((error = vnode_getwithref(vp)) != 0) {
7744 file_drop(fd);
7745 return error;
7746 }
7747 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
7748
7749 #if CONFIG_FILE_LEASES
7750 vnode_breakdirlease(vp, true, O_WRONLY);
7751 #endif
7752
7753 error = chmod_vnode(vfs_context_current(), vp, vap);
7754 (void)vnode_put(vp);
7755 file_drop(fd);
7756
7757 return error;
7758 }
7759
7760 /*
7761 * fchmod_extended: Change mode of a file given a file descriptor; with
7762 * extended argument list (including extended security (ACL)).
7763 *
7764 * Parameters: p Process requesting to change file mode
7765 * uap User argument descriptor (see below)
7766 * retval (ignored)
7767 *
7768 * Indirect: uap->mode File mode to set (same as 'chmod')
7769 * uap->uid UID to set
7770 * uap->gid GID to set
7771 * uap->xsecurity ACL to set (or delete)
7772 * uap->fd File descriptor of file to change mode
7773 *
7774 * Returns: 0 Success
7775 * !0 errno value
7776 *
7777 */
7778 int
fchmod_extended(proc_t p,struct fchmod_extended_args * uap,__unused int32_t * retval)7779 fchmod_extended(proc_t p, struct fchmod_extended_args *uap, __unused int32_t *retval)
7780 {
7781 int error;
7782 struct vnode_attr va;
7783 kauth_filesec_t xsecdst = NULL;
7784
7785 AUDIT_ARG(owner, uap->uid, uap->gid);
7786
7787 error = chmod_extended_init(&va, &xsecdst, uap->mode, uap->uid,
7788 uap->gid, uap->xsecurity);
7789
7790 if (error) {
7791 return error;
7792 }
7793
7794 error = fchmod1(p, uap->fd, &va);
7795
7796 if (xsecdst != NULL) {
7797 kauth_filesec_free(xsecdst);
7798 }
7799 return error;
7800 }
7801
7802 int
fchmod(proc_t p,struct fchmod_args * uap,__unused int32_t * retval)7803 fchmod(proc_t p, struct fchmod_args *uap, __unused int32_t *retval)
7804 {
7805 struct vnode_attr va;
7806
7807 VATTR_INIT(&va);
7808 VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
7809
7810 return fchmod1(p, uap->fd, &va);
7811 }
7812
7813
7814 /*
7815 * Set ownership given a path name.
7816 */
7817 /* ARGSUSED */
7818 static int
fchownat_internal(vfs_context_t ctx,int fd,user_addr_t path,uid_t uid,gid_t gid,int flag,enum uio_seg segflg)7819 fchownat_internal(vfs_context_t ctx, int fd, user_addr_t path, uid_t uid,
7820 gid_t gid, int flag, enum uio_seg segflg)
7821 {
7822 vnode_t vp;
7823 struct vnode_attr va;
7824 int error;
7825 struct nameidata nd;
7826 int follow;
7827 kauth_action_t action;
7828 uint32_t wantparent = 0;
7829
7830 #if CONFIG_FILE_LEASES
7831 wantparent = WANTPARENT;
7832 #endif
7833
7834 AUDIT_ARG(owner, uid, gid);
7835
7836 follow = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) ? NOFOLLOW : FOLLOW;
7837 NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1 | wantparent, segflg,
7838 path, ctx);
7839 if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
7840 nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
7841 }
7842 error = nameiat(&nd, fd);
7843 if (error) {
7844 return error;
7845 }
7846 vp = nd.ni_vp;
7847
7848 VATTR_INIT(&va);
7849 if (uid != (uid_t)VNOVAL) {
7850 VATTR_SET(&va, va_uid, uid);
7851 }
7852 if (gid != (gid_t)VNOVAL) {
7853 VATTR_SET(&va, va_gid, gid);
7854 }
7855
7856 #if CONFIG_MACF
7857 error = mac_vnode_check_setowner(ctx, vp, uid, gid);
7858 if (error) {
7859 goto out;
7860 }
7861 #endif
7862
7863 /* preflight and authorize attribute changes */
7864 if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
7865 goto out;
7866 }
7867 if (action && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
7868 goto out;
7869 }
7870
7871 #if CONFIG_FILE_LEASES
7872 vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
7873 #endif
7874
7875 error = vnode_setattr(vp, &va, ctx);
7876
7877 #if CONFIG_MACF
7878 if (error == 0) {
7879 mac_vnode_notify_setowner(ctx, vp, uid, gid);
7880 }
7881 #endif
7882
7883 out:
7884 /*
7885 * EACCES is only allowed from namei(); permissions failure should
7886 * return EPERM, so we need to translate the error code.
7887 */
7888 if (error == EACCES) {
7889 error = EPERM;
7890 }
7891
7892 #if CONFIG_FILE_LEASES
7893 vnode_put(nd.ni_dvp);
7894 #endif
7895 nameidone(&nd);
7896 vnode_put(vp);
7897 return error;
7898 }
7899
7900 int
chown(__unused proc_t p,struct chown_args * uap,__unused int32_t * retval)7901 chown(__unused proc_t p, struct chown_args *uap, __unused int32_t *retval)
7902 {
7903 return fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
7904 uap->uid, uap->gid, 0, UIO_USERSPACE);
7905 }
7906
7907 int
lchown(__unused proc_t p,struct lchown_args * uap,__unused int32_t * retval)7908 lchown(__unused proc_t p, struct lchown_args *uap, __unused int32_t *retval)
7909 {
7910 return fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
7911 uap->owner, uap->group, AT_SYMLINK_NOFOLLOW, UIO_USERSPACE);
7912 }
7913
7914 int
fchownat(__unused proc_t p,struct fchownat_args * uap,__unused int32_t * retval)7915 fchownat(__unused proc_t p, struct fchownat_args *uap, __unused int32_t *retval)
7916 {
7917 if (uap->flag & ~AT_SYMLINK_NOFOLLOW) {
7918 return EINVAL;
7919 }
7920
7921 return fchownat_internal(vfs_context_current(), uap->fd, uap->path,
7922 uap->uid, uap->gid, uap->flag, UIO_USERSPACE);
7923 }
7924
7925 /*
7926 * Set ownership given a file descriptor.
7927 */
7928 /* ARGSUSED */
7929 int
fchown(__unused proc_t p,struct fchown_args * uap,__unused int32_t * retval)7930 fchown(__unused proc_t p, struct fchown_args *uap, __unused int32_t *retval)
7931 {
7932 struct vnode_attr va;
7933 vfs_context_t ctx = vfs_context_current();
7934 vnode_t vp;
7935 int error;
7936 kauth_action_t action;
7937
7938 AUDIT_ARG(owner, uap->uid, uap->gid);
7939 AUDIT_ARG(fd, uap->fd);
7940
7941 if ((error = file_vnode(uap->fd, &vp))) {
7942 return error;
7943 }
7944
7945 if ((error = vnode_getwithref(vp))) {
7946 file_drop(uap->fd);
7947 return error;
7948 }
7949 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
7950
7951 VATTR_INIT(&va);
7952 if (uap->uid != VNOVAL) {
7953 VATTR_SET(&va, va_uid, uap->uid);
7954 }
7955 if (uap->gid != VNOVAL) {
7956 VATTR_SET(&va, va_gid, uap->gid);
7957 }
7958
7959 #if NAMEDSTREAMS
7960 /* chown calls are not allowed for resource forks. */
7961 if (vp->v_flag & VISNAMEDSTREAM) {
7962 error = EPERM;
7963 goto out;
7964 }
7965 #endif
7966
7967 #if CONFIG_MACF
7968 error = mac_vnode_check_setowner(ctx, vp, uap->uid, uap->gid);
7969 if (error) {
7970 goto out;
7971 }
7972 #endif
7973
7974 /* preflight and authorize attribute changes */
7975 if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
7976 goto out;
7977 }
7978 if (action && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
7979 if (error == EACCES) {
7980 error = EPERM;
7981 }
7982 goto out;
7983 }
7984
7985 #if CONFIG_FILE_LEASES
7986 vnode_breakdirlease(vp, true, O_WRONLY);
7987 #endif
7988
7989 error = vnode_setattr(vp, &va, ctx);
7990
7991 #if CONFIG_MACF
7992 if (error == 0) {
7993 mac_vnode_notify_setowner(ctx, vp, uap->uid, uap->gid);
7994 }
7995 #endif
7996
7997 out:
7998 (void)vnode_put(vp);
7999 file_drop(uap->fd);
8000 return error;
8001 }
8002
/*
 * Copy in a pair of timevals from user space and convert them into
 * timespecs: tsp[0] receives the access time, tsp[1] the modification
 * time (as consumed by setutimes()). A NULL user pointer means "use
 * the current time" for both. Handles both 32- and 64-bit user ABIs.
 *
 * Returns 0 on success or a copyin errno.
 */
static int
getutimes(user_addr_t usrtvp, struct timespec *tsp)
{
	int error;

	if (usrtvp == USER_ADDR_NULL) {
		struct timeval old_tv;
		/* XXX Y2038 bug because of microtime argument */
		microtime(&old_tv);
		TIMEVAL_TO_TIMESPEC(&old_tv, &tsp[0]);
		tsp[1] = tsp[0];
	} else {
		/* Pick the copyin layout matching the caller's ABI. */
		if (IS_64BIT_PROCESS(current_proc())) {
			struct user64_timeval tv[2];
			error = copyin(usrtvp, (void *)tv, sizeof(tv));
			if (error) {
				return error;
			}
			TIMEVAL64_TO_TIMESPEC(&tv[0], &tsp[0]);
			TIMEVAL64_TO_TIMESPEC(&tv[1], &tsp[1]);
		} else {
			struct user32_timeval tv[2];
			error = copyin(usrtvp, (void *)tv, sizeof(tv));
			if (error) {
				return error;
			}
			TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]);
			TIMEVAL_TO_TIMESPEC(&tv[1], &tsp[1]);
		}
	}
	return 0;
}
8035
/*
 * Apply access (ts[0]) and modification (ts[1]) times to a vnode.
 *
 * 'nullflag' indicates the caller passed a NULL times pointer (i.e. the
 * current time is being set); in that case VA_UTIMES_NULL is set so the
 * filesystem can apply the looser "write access suffices" permission
 * check, and EACCES is not translated to EPERM.
 *
 * The caller supplies an iocounted vnode. Not permitted on named-stream
 * (resource fork) vnodes.
 */
static int
setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts,
    int nullflag)
{
	int error;
	struct vnode_attr va;
	kauth_action_t action;

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	VATTR_INIT(&va);
	VATTR_SET(&va, va_access_time, ts[0]);
	VATTR_SET(&va, va_modify_time, ts[1]);
	if (nullflag) {
		va.va_vaflags |= VA_UTIMES_NULL;
	}

#if NAMEDSTREAMS
	/* utimes calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		error = EPERM;
		goto out;
	}
#endif

#if CONFIG_MACF
	error = mac_vnode_check_setutimes(ctx, vp, ts[0], ts[1]);
	if (error) {
		goto out;
	}
#endif
	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
		/* Explicit times require ownership: report EPERM, not EACCES. */
		if (!nullflag && error == EACCES) {
			error = EPERM;
		}
		goto out;
	}

	/* since we may not need to auth anything, check here */
	if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		if (!nullflag && error == EACCES) {
			error = EPERM;
		}
		goto out;
	}
	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	if (error == 0) {
		mac_vnode_notify_setutimes(ctx, vp, ts[0], ts[1]);
	}
#endif

out:
	return error;
}
8092
8093 /*
8094 * Set the access and modification times of a file.
8095 */
8096 /* ARGSUSED */
8097 int
utimes(__unused proc_t p,struct utimes_args * uap,__unused int32_t * retval)8098 utimes(__unused proc_t p, struct utimes_args *uap, __unused int32_t *retval)
8099 {
8100 struct timespec ts[2];
8101 user_addr_t usrtvp;
8102 int error;
8103 struct nameidata nd;
8104 vfs_context_t ctx = vfs_context_current();
8105 uint32_t wantparent = 0;
8106
8107 #if CONFIG_FILE_LEASES
8108 wantparent = WANTPARENT;
8109 #endif
8110
8111 /*
8112 * AUDIT: Needed to change the order of operations to do the
8113 * name lookup first because auditing wants the path.
8114 */
8115 NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1 | wantparent,
8116 UIO_USERSPACE, uap->path, ctx);
8117 error = namei(&nd);
8118 if (error) {
8119 return error;
8120 }
8121
8122 /*
8123 * Fetch the user-supplied time. If usrtvp is USER_ADDR_NULL, we fetch
8124 * the current time instead.
8125 */
8126 usrtvp = uap->tptr;
8127 if ((error = getutimes(usrtvp, ts)) != 0) {
8128 goto out;
8129 }
8130
8131 #if CONFIG_FILE_LEASES
8132 vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
8133 #endif
8134
8135 error = setutimes(ctx, nd.ni_vp, ts, usrtvp == USER_ADDR_NULL);
8136
8137 out:
8138 #if CONFIG_FILE_LEASES
8139 vnode_put(nd.ni_dvp);
8140 #endif
8141 nameidone(&nd);
8142 vnode_put(nd.ni_vp);
8143 return error;
8144 }
8145
8146 /*
8147 * Set the access and modification times of a file.
8148 */
8149 /* ARGSUSED */
8150 int
futimes(__unused proc_t p,struct futimes_args * uap,__unused int32_t * retval)8151 futimes(__unused proc_t p, struct futimes_args *uap, __unused int32_t *retval)
8152 {
8153 struct timespec ts[2];
8154 vnode_t vp;
8155 user_addr_t usrtvp;
8156 int error;
8157
8158 AUDIT_ARG(fd, uap->fd);
8159 usrtvp = uap->tptr;
8160 if ((error = getutimes(usrtvp, ts)) != 0) {
8161 return error;
8162 }
8163 if ((error = file_vnode(uap->fd, &vp)) != 0) {
8164 return error;
8165 }
8166 if ((error = vnode_getwithref(vp))) {
8167 file_drop(uap->fd);
8168 return error;
8169 }
8170
8171 #if CONFIG_FILE_LEASES
8172 vnode_breakdirlease(vp, true, O_WRONLY);
8173 #endif
8174
8175 error = setutimes(vfs_context_current(), vp, ts, usrtvp == 0);
8176
8177 vnode_put(vp);
8178 file_drop(uap->fd);
8179 return error;
8180 }
8181
8182 static int
truncate_validate_common(proc_t p,off_t length)8183 truncate_validate_common(proc_t p, off_t length)
8184 {
8185 rlim_t fsize_limit;
8186
8187 if (length < 0) {
8188 return EINVAL;
8189 }
8190
8191 fsize_limit = proc_limitgetcur(p, RLIMIT_FSIZE);
8192 if ((rlim_t)length > fsize_limit) {
8193 psignal(p, SIGXFSZ);
8194 return EFBIG;
8195 }
8196
8197 return 0;
8198 }
8199
/*
 * Common truncate implementation: set va_data_size on 'vp'.
 *
 * 'need_auth' is false when coming from ftruncate(), where write access
 * was already authorized at open time; true for the path-based
 * truncate(), which must authorize here. 'cred' is used only for the
 * MAC truncate checks/notifications.
 *
 * The caller supplies an iocounted vnode.
 */
static int
truncate_internal(vnode_t vp, off_t length, kauth_cred_t cred,
    vfs_context_t ctx, boolean_t need_auth)
{
	struct vnode_attr va;
	kauth_action_t action;
	int error;

	VATTR_INIT(&va);
	VATTR_SET(&va, va_data_size, length);

#if CONFIG_MACF
	error = mac_vnode_check_truncate(ctx, cred, vp);
	if (error) {
		return error;
	}
#endif

	/*
	 * If we reached here from `ftruncate` then we already did an effective
	 * `vnode_authorize` upon open. We honour the result from then.
	 */
	if (need_auth) {
		if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
			return error;
		}

		if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
			return error;
		}
	}

#if CONFIG_FILE_LEASES
	/* Check if there is a lease placed on the parent directory. */
	vnode_breakdirlease(vp, true, O_WRONLY);

	/* Now check if there is a lease placed on the file itself. */
	(void)vnode_breaklease(vp, O_WRONLY, ctx);
#endif

	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	if (error == 0) {
		mac_vnode_notify_truncate(ctx, cred, vp);
	}
#endif

	return error;
}
8250
8251 /*
8252 * Truncate a file given its path name.
8253 */
8254 /* ARGSUSED */
8255 int
truncate(proc_t p,struct truncate_args * uap,__unused int32_t * retval)8256 truncate(proc_t p, struct truncate_args *uap, __unused int32_t *retval)
8257 {
8258 vfs_context_t ctx = vfs_context_current();
8259 vnode_t vp;
8260 int error;
8261 struct nameidata nd;
8262
8263 if ((error = truncate_validate_common(p, uap->length))) {
8264 return error;
8265 }
8266
8267 NDINIT(&nd, LOOKUP, OP_TRUNCATE, FOLLOW | AUDITVNPATH1,
8268 UIO_USERSPACE, uap->path, ctx);
8269
8270 if ((error = namei(&nd))) {
8271 return error;
8272 }
8273
8274 vp = nd.ni_vp;
8275 nameidone(&nd);
8276
8277 error = truncate_internal(vp, uap->length, NOCRED, ctx, true);
8278 vnode_put(vp);
8279
8280 return error;
8281 }
8282
8283 /*
8284 * Truncate a file given a file descriptor.
8285 */
8286 /* ARGSUSED */
8287 int
ftruncate(proc_t p,struct ftruncate_args * uap,int32_t * retval)8288 ftruncate(proc_t p, struct ftruncate_args *uap, int32_t *retval)
8289 {
8290 vnode_t vp;
8291 struct fileproc *fp;
8292 int error;
8293
8294 AUDIT_ARG(fd, uap->fd);
8295
8296 if ((error = truncate_validate_common(p, uap->length))) {
8297 return error;
8298 }
8299
8300 if ((error = fp_lookup(p, uap->fd, &fp, 0))) {
8301 return error;
8302 }
8303
8304 switch (FILEGLOB_DTYPE(fp->fp_glob)) {
8305 case DTYPE_PSXSHM:
8306 error = pshm_truncate(p, fp, uap->fd, uap->length, retval);
8307 goto out;
8308 case DTYPE_VNODE:
8309 break;
8310 default:
8311 error = EINVAL;
8312 goto out;
8313 }
8314
8315 vp = (vnode_t)fp_get_data(fp);
8316
8317 if ((fp->fp_glob->fg_flag & FWRITE) == 0) {
8318 AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
8319 error = EINVAL;
8320 goto out;
8321 }
8322
8323 if ((error = vnode_getwithref(vp)) != 0) {
8324 goto out;
8325 }
8326
8327 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
8328
8329 error = truncate_internal(vp, uap->length, fp->fp_glob->fg_cred,
8330 vfs_context_current(), false);
8331 vnode_put(vp);
8332
8333 out:
8334 file_drop(uap->fd);
8335 return error;
8336 }
8337
8338
8339 /*
8340 * Sync an open file with synchronized I/O _file_ integrity completion
8341 */
8342 /* ARGSUSED */
8343 int
fsync(proc_t p,struct fsync_args * uap,__unused int32_t * retval)8344 fsync(proc_t p, struct fsync_args *uap, __unused int32_t *retval)
8345 {
8346 __pthread_testcancel(1);
8347 return fsync_common(p, uap, MNT_WAIT);
8348 }
8349
8350
8351 /*
8352 * Sync an open file with synchronized I/O _file_ integrity completion
8353 *
8354 * Notes: This is a legacy support function that does not test for
8355 * thread cancellation points.
8356 */
8357 /* ARGSUSED */
8358 int
fsync_nocancel(proc_t p,struct fsync_nocancel_args * uap,__unused int32_t * retval)8359 fsync_nocancel(proc_t p, struct fsync_nocancel_args *uap, __unused int32_t *retval)
8360 {
8361 return fsync_common(p, (struct fsync_args *)uap, MNT_WAIT);
8362 }
8363
8364
8365 /*
8366 * Sync an open file with synchronized I/O _data_ integrity completion
8367 */
8368 /* ARGSUSED */
8369 int
fdatasync(proc_t p,struct fdatasync_args * uap,__unused int32_t * retval)8370 fdatasync(proc_t p, struct fdatasync_args *uap, __unused int32_t *retval)
8371 {
8372 __pthread_testcancel(1);
8373 return fsync_common(p, (struct fsync_args *)uap, MNT_DWAIT);
8374 }
8375
8376
8377 /*
8378 * fsync_common
8379 *
8380 * Common fsync code to support both synchronized I/O file integrity completion
8381 * (normal fsync) and synchronized I/O data integrity completion (fdatasync).
8382 *
8383 * If 'flags' is MNT_DWAIT, the caller is requesting data integrity, which
8384 * will only guarantee that the file data contents are retrievable. If
8385 * 'flags' is MNT_WAIT, the caller is rewuesting file integrity, which also
8386 * includes additional metadata unnecessary for retrieving the file data
8387 * contents, such as atime, mtime, ctime, etc., also be committed to stable
8388 * storage.
8389 *
8390 * Parameters: p The process
8391 * uap->fd The descriptor to synchronize
8392 * flags The data integrity flags
8393 *
8394 * Returns: int Success
8395 * fp_getfvp:EBADF Bad file descriptor
8396 * fp_getfvp:ENOTSUP fd does not refer to a vnode
8397 * VNOP_FSYNC:??? unspecified
8398 *
8399 * Notes: We use struct fsync_args because it is a short name, and all
8400 * caller argument structures are otherwise identical.
8401 */
8402 static int
fsync_common(proc_t p,struct fsync_args * uap,int flags)8403 fsync_common(proc_t p, struct fsync_args *uap, int flags)
8404 {
8405 vnode_t vp;
8406 struct fileproc *fp;
8407 vfs_context_t ctx = vfs_context_current();
8408 int error;
8409
8410 AUDIT_ARG(fd, uap->fd);
8411
8412 if ((error = fp_getfvp(p, uap->fd, &fp, &vp))) {
8413 return error;
8414 }
8415 if ((error = vnode_getwithref(vp))) {
8416 file_drop(uap->fd);
8417 return error;
8418 }
8419
8420 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
8421
8422 error = VNOP_FSYNC(vp, flags, ctx);
8423
8424 #if NAMEDRSRCFORK
8425 /* Sync resource fork shadow file if necessary. */
8426 if ((error == 0) &&
8427 (vp->v_flag & VISNAMEDSTREAM) &&
8428 (vp->v_parent != NULLVP) &&
8429 vnode_isshadow(vp) &&
8430 (fp->fp_glob->fg_flag & FWASWRITTEN)) {
8431 (void) vnode_flushnamedstream(vp->v_parent, vp, ctx);
8432 }
8433 #endif
8434
8435 (void)vnode_put(vp);
8436 file_drop(uap->fd);
8437 return error;
8438 }
8439
8440 /*
8441 * Duplicate files. Source must be a file, target must be a file or
8442 * must not exist.
8443 *
8444 * XXX Copyfile authorisation checking is woefully inadequate, and will not
8445 * perform inheritance correctly.
8446 */
8447 /* ARGSUSED */
8448 int
copyfile(__unused proc_t p,struct copyfile_args * uap,__unused int32_t * retval)8449 copyfile(__unused proc_t p, struct copyfile_args *uap, __unused int32_t *retval)
8450 {
8451 vnode_t tvp, fvp, tdvp, sdvp;
8452 struct nameidata fromnd, tond;
8453 int error;
8454 vfs_context_t ctx = vfs_context_current();
8455
8456 /* Check that the flags are valid. */
8457 if (uap->flags & ~CPF_MASK) {
8458 return EINVAL;
8459 }
8460
8461 NDINIT(&fromnd, LOOKUP, OP_COPYFILE, AUDITVNPATH1,
8462 UIO_USERSPACE, uap->from, ctx);
8463 if ((error = namei(&fromnd))) {
8464 return error;
8465 }
8466 fvp = fromnd.ni_vp;
8467
8468 NDINIT(&tond, CREATE, OP_LINK,
8469 LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART | AUDITVNPATH2 | CN_NBMOUNTLOOK,
8470 UIO_USERSPACE, uap->to, ctx);
8471 if ((error = namei(&tond))) {
8472 goto out1;
8473 }
8474 tdvp = tond.ni_dvp;
8475 tvp = tond.ni_vp;
8476
8477 if (tvp != NULL) {
8478 if (!(uap->flags & CPF_OVERWRITE)) {
8479 error = EEXIST;
8480 goto out;
8481 }
8482 }
8483
8484 if (fvp->v_type == VDIR || (tvp && tvp->v_type == VDIR)) {
8485 error = EISDIR;
8486 goto out;
8487 }
8488
8489 if (fvp->v_type == VSOCK && fvp->v_tag != VT_FDESC) {
8490 error = EOPNOTSUPP;
8491 goto out;
8492 }
8493
8494 #if CONFIG_MACF
8495 if ((error = mac_vnode_check_copyfile(ctx, tdvp, tvp, fvp, &tond.ni_cnd, (mode_t)uap->mode, uap->flags)) != 0) {
8496 goto out;
8497 }
8498 #endif /* CONFIG_MACF */
8499
8500 if ((error = vnode_authorize(fvp, NULL, KAUTH_VNODE_READ_DATA, ctx)) != 0) {
8501 goto out;
8502 }
8503 if (tvp) {
8504 if ((error = vnode_authorize(tvp, tdvp, KAUTH_VNODE_DELETE, ctx)) != 0) {
8505 goto out;
8506 }
8507 }
8508 if ((error = vnode_authorize(tdvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
8509 goto out;
8510 }
8511
8512 if (fvp == tdvp) {
8513 error = EINVAL;
8514 }
8515 /*
8516 * If source is the same as the destination (that is the
8517 * same inode number) then there is nothing to do.
8518 * (fixed to have POSIX semantics - CSM 3/2/98)
8519 */
8520 if (fvp == tvp) {
8521 error = -1;
8522 }
8523
8524 #if CONFIG_FILE_LEASES
8525 vnode_breakdirlease(tdvp, false, O_WRONLY);
8526 #endif
8527
8528 if (!error) {
8529 error = VNOP_COPYFILE(fvp, tdvp, tvp, &tond.ni_cnd, uap->mode, uap->flags, ctx);
8530 }
8531 out:
8532 sdvp = tond.ni_startdir;
8533 /*
8534 * nameidone has to happen before we vnode_put(tdvp)
8535 * since it may need to release the fs_nodelock on the tdvp
8536 */
8537 nameidone(&tond);
8538
8539 if (tvp) {
8540 vnode_put(tvp);
8541 }
8542 vnode_put(tdvp);
8543 vnode_put(sdvp);
8544 out1:
8545 vnode_put(fvp);
8546
8547 nameidone(&fromnd);
8548
8549 if (error == -1) {
8550 return 0;
8551 }
8552 return error;
8553 }
8554
8555 #define CLONE_SNAPSHOT_FALLBACKS_ENABLED 1
8556
8557 /*
8558 * Helper function for doing clones. The caller is expected to provide an
8559 * iocounted source vnode and release it.
8560 */
8561 static int
clonefile_internal(vnode_t fvp,boolean_t data_read_authorised,int dst_dirfd,user_addr_t dst,uint32_t flags,vfs_context_t ctx)8562 clonefile_internal(vnode_t fvp, boolean_t data_read_authorised, int dst_dirfd,
8563 user_addr_t dst, uint32_t flags, vfs_context_t ctx)
8564 {
8565 vnode_t tvp, tdvp;
8566 struct nameidata tond;
8567 int error;
8568 int follow;
8569 boolean_t free_src_acl;
8570 boolean_t attr_cleanup;
8571 enum vtype v_type;
8572 kauth_action_t action;
8573 struct componentname *cnp;
8574 uint32_t defaulted = 0;
8575 struct vnode_attr va;
8576 struct vnode_attr nva;
8577 uint32_t vnop_flags;
8578
8579 v_type = vnode_vtype(fvp);
8580 switch (v_type) {
8581 case VLNK:
8582 /* FALLTHRU */
8583 case VREG:
8584 action = KAUTH_VNODE_ADD_FILE;
8585 break;
8586 case VDIR:
8587 if (vnode_isvroot(fvp) || vnode_ismount(fvp) ||
8588 fvp->v_mountedhere) {
8589 return EINVAL;
8590 }
8591 action = KAUTH_VNODE_ADD_SUBDIRECTORY;
8592 break;
8593 default:
8594 return EINVAL;
8595 }
8596
8597 AUDIT_ARG(fd2, dst_dirfd);
8598 AUDIT_ARG(value32, flags);
8599
8600 follow = (flags & CLONE_NOFOLLOW) ? NOFOLLOW : FOLLOW;
8601 NDINIT(&tond, CREATE, OP_LINK, follow | WANTPARENT | AUDITVNPATH2,
8602 UIO_USERSPACE, dst, ctx);
8603 if ((error = nameiat(&tond, dst_dirfd))) {
8604 return error;
8605 }
8606 cnp = &tond.ni_cnd;
8607 tdvp = tond.ni_dvp;
8608 tvp = tond.ni_vp;
8609
8610 free_src_acl = FALSE;
8611 attr_cleanup = FALSE;
8612
8613 if (tvp != NULL) {
8614 error = EEXIST;
8615 goto out;
8616 }
8617
8618 if (vnode_mount(tdvp) != vnode_mount(fvp)) {
8619 error = EXDEV;
8620 goto out;
8621 }
8622
8623 #if CONFIG_MACF
8624 if ((error = mac_vnode_check_clone(ctx, tdvp, fvp, cnp))) {
8625 goto out;
8626 }
8627 #endif
8628 if ((error = vnode_authorize(tdvp, NULL, action, ctx))) {
8629 goto out;
8630 }
8631
8632 action = KAUTH_VNODE_GENERIC_READ_BITS;
8633 if (data_read_authorised) {
8634 action &= ~KAUTH_VNODE_READ_DATA;
8635 }
8636 if ((error = vnode_authorize(fvp, NULL, action, ctx))) {
8637 goto out;
8638 }
8639
8640 /*
8641 * certain attributes may need to be changed from the source, we ask for
8642 * those here with the exception of source file's ACLs unless the CLONE_ACL
8643 * flag is specified. By default, the clone file will inherit the target
8644 * directory's ACLs unless the the CLONE_ACL flag is specified then it
8645 * will inherit the source file's ACLs instead.
8646 */
8647 VATTR_INIT(&va);
8648 VATTR_WANTED(&va, va_uid);
8649 VATTR_WANTED(&va, va_gid);
8650 VATTR_WANTED(&va, va_mode);
8651 VATTR_WANTED(&va, va_flags);
8652 if (flags & CLONE_ACL) {
8653 VATTR_WANTED(&va, va_acl);
8654 }
8655
8656 if ((error = vnode_getattr(fvp, &va, ctx)) != 0) {
8657 goto out;
8658 }
8659
8660 VATTR_INIT(&nva);
8661 VATTR_SET(&nva, va_type, v_type);
8662 if (VATTR_IS_SUPPORTED(&va, va_acl) && va.va_acl != NULL) {
8663 VATTR_SET(&nva, va_acl, va.va_acl);
8664 free_src_acl = TRUE;
8665 }
8666
8667 /* Handle ACL inheritance, initialize vap. */
8668 if (v_type == VLNK) {
8669 error = vnode_authattr_new(tdvp, &nva, 0, ctx);
8670 } else {
8671 error = vn_attribute_prepare(tdvp, &nva, &defaulted, ctx);
8672 if (error) {
8673 goto out;
8674 }
8675 attr_cleanup = TRUE;
8676 }
8677
8678 vnop_flags = VNODE_CLONEFILE_DEFAULT;
8679 /*
8680 * We've got initial values for all security parameters,
8681 * If we are superuser, then we can change owners to be the
8682 * same as the source. Both superuser and the owner have default
8683 * WRITE_SECURITY privileges so all other fields can be taken
8684 * from source as well.
8685 */
8686 if (!(flags & CLONE_NOOWNERCOPY) && vfs_context_issuser(ctx)) {
8687 if (VATTR_IS_SUPPORTED(&va, va_uid)) {
8688 VATTR_SET(&nva, va_uid, va.va_uid);
8689 }
8690 if (VATTR_IS_SUPPORTED(&va, va_gid)) {
8691 VATTR_SET(&nva, va_gid, va.va_gid);
8692 }
8693 } else {
8694 vnop_flags |= VNODE_CLONEFILE_NOOWNERCOPY;
8695 }
8696
8697 if (VATTR_IS_SUPPORTED(&va, va_mode)) {
8698 VATTR_SET(&nva, va_mode, va.va_mode);
8699 }
8700 if (VATTR_IS_SUPPORTED(&va, va_flags)) {
8701 VATTR_SET(&nva, va_flags,
8702 ((va.va_flags & ~(UF_DATAVAULT | SF_RESTRICTED)) | /* Turn off from source */
8703 (nva.va_flags & (UF_DATAVAULT | SF_RESTRICTED))));
8704 }
8705
8706 #if CONFIG_FILE_LEASES
8707 vnode_breakdirlease(tdvp, false, O_WRONLY);
8708 #endif
8709
8710 error = VNOP_CLONEFILE(fvp, tdvp, &tvp, cnp, &nva, vnop_flags, ctx);
8711
8712 if (!error && tvp) {
8713 int update_flags = 0;
8714 #if CONFIG_FSE
8715 int fsevent;
8716 #endif /* CONFIG_FSE */
8717
8718 /*
8719 * If some of the requested attributes weren't handled by the
8720 * VNOP, use our fallback code.
8721 */
8722 if (!VATTR_ALL_SUPPORTED(&nva)) {
8723 (void)vnode_setattr_fallback(tvp, &nva, ctx);
8724 }
8725
8726 #if CONFIG_MACF
8727 (void)vnode_label(vnode_mount(tvp), tdvp, tvp, cnp,
8728 VNODE_LABEL_CREATE, ctx);
8729 #endif
8730
8731 // Make sure the name & parent pointers are hooked up
8732 if (tvp->v_name == NULL) {
8733 update_flags |= VNODE_UPDATE_NAME;
8734 }
8735 if (tvp->v_parent == NULLVP) {
8736 update_flags |= VNODE_UPDATE_PARENT;
8737 }
8738
8739 if (update_flags) {
8740 (void)vnode_update_identity(tvp, tdvp, cnp->cn_nameptr,
8741 cnp->cn_namelen, cnp->cn_hash, update_flags);
8742 }
8743
8744 #if CONFIG_FSE
8745 switch (vnode_vtype(tvp)) {
8746 case VLNK:
8747 /* FALLTHRU */
8748 case VREG:
8749 fsevent = FSE_CREATE_FILE;
8750 break;
8751 case VDIR:
8752 fsevent = FSE_CREATE_DIR;
8753 break;
8754 default:
8755 goto out;
8756 }
8757
8758 if (need_fsevent(fsevent, tvp)) {
8759 /*
8760 * The following is a sequence of three explicit events.
8761 * A pair of FSE_CLONE events representing the source and destination
8762 * followed by an FSE_CREATE_[FILE | DIR] for the destination.
8763 * fseventsd may coalesce the destination clone and create events
8764 * into a single event resulting in the following sequence for a client
8765 * FSE_CLONE (src)
8766 * FSE_CLONE | FSE_CREATE (dst)
8767 */
8768 add_fsevent(FSE_CLONE, ctx, FSE_ARG_VNODE, fvp, FSE_ARG_VNODE, tvp,
8769 FSE_ARG_DONE);
8770 add_fsevent(fsevent, ctx, FSE_ARG_VNODE, tvp,
8771 FSE_ARG_DONE);
8772 }
8773 #endif /* CONFIG_FSE */
8774 }
8775
8776 out:
8777 if (attr_cleanup) {
8778 vn_attribute_cleanup(&nva, defaulted);
8779 }
8780 if (free_src_acl && va.va_acl) {
8781 kauth_acl_free(va.va_acl);
8782 }
8783 nameidone(&tond);
8784 if (tvp) {
8785 vnode_put(tvp);
8786 }
8787 vnode_put(tdvp);
8788 return error;
8789 }
8790
8791 /*
8792 * clone files or directories, target must not exist.
8793 */
8794 /* ARGSUSED */
8795 int
clonefileat(__unused proc_t p,struct clonefileat_args * uap,__unused int32_t * retval)8796 clonefileat(__unused proc_t p, struct clonefileat_args *uap,
8797 __unused int32_t *retval)
8798 {
8799 vnode_t fvp;
8800 struct nameidata fromnd;
8801 int follow;
8802 int error;
8803 vfs_context_t ctx = vfs_context_current();
8804
8805 /* Check that the flags are valid. */
8806 if (uap->flags & ~(CLONE_NOFOLLOW | CLONE_NOOWNERCOPY | CLONE_ACL)) {
8807 return EINVAL;
8808 }
8809
8810 AUDIT_ARG(fd, uap->src_dirfd);
8811
8812 follow = (uap->flags & CLONE_NOFOLLOW) ? NOFOLLOW : FOLLOW;
8813 NDINIT(&fromnd, LOOKUP, OP_COPYFILE, follow | AUDITVNPATH1,
8814 UIO_USERSPACE, uap->src, ctx);
8815 if ((error = nameiat(&fromnd, uap->src_dirfd))) {
8816 return error;
8817 }
8818
8819 fvp = fromnd.ni_vp;
8820 nameidone(&fromnd);
8821
8822 error = clonefile_internal(fvp, FALSE, uap->dst_dirfd, uap->dst,
8823 uap->flags, ctx);
8824
8825 vnode_put(fvp);
8826 return error;
8827 }
8828
/*
 * Clone a file given a source file descriptor (which must be open for
 * reading) and a destination path relative to dst_dirfd. Since the
 * descriptor already grants read access, clonefile_internal is told to
 * skip the KAUTH_VNODE_READ_DATA check (data_read_authorised == TRUE).
 */
int
fclonefileat(__unused proc_t p, struct fclonefileat_args *uap,
    __unused int32_t *retval)
{
	vnode_t fvp;
	struct fileproc *fp;
	int error;
	vfs_context_t ctx = vfs_context_current();

	/* Check that the flags are valid. */
	if (uap->flags & ~(CLONE_NOFOLLOW | CLONE_NOOWNERCOPY | CLONE_ACL)) {
		return EINVAL;
	}

	AUDIT_ARG(fd, uap->src_fd);
	error = fp_getfvp(p, uap->src_fd, &fp, &fvp);
	if (error) {
		return error;
	}

	/* The source descriptor must have been opened for reading. */
	if ((fp->fp_glob->fg_flag & FREAD) == 0) {
		AUDIT_ARG(vnpath_withref, fvp, ARG_VNODE1);
		error = EBADF;
		goto out;
	}

	if ((error = vnode_getwithref(fvp))) {
		goto out;
	}

	AUDIT_ARG(vnpath, fvp, ARG_VNODE1);

	error = clonefile_internal(fvp, TRUE, uap->dst_dirfd, uap->dst,
	    uap->flags, ctx);

	vnode_put(fvp);
out:
	file_drop(uap->src_fd);
	return error;
}
8869
/*
 * Mount-iteration callback used when renaming a directory that has
 * mounts beneath it: for each mount 'mp' whose f_mntonname is strictly
 * under the renamed mount 'arg' (same prefix followed by '/'), refresh
 * its recorded mount-on path from the covered vnode.
 *
 * Returns 0 for mounts that don't match, -1 if a matching mount is
 * busy (NOTE(review): presumably aborts the iteration — confirm against
 * the vfs_iterate contract), or the vn_getpath_ext error otherwise.
 */
static int
rename_submounts_callback(mount_t mp, void *arg)
{
	int error = 0;
	mount_t pmp = (mount_t)arg;
	int prefix_len = (int)strlen(pmp->mnt_vfsstat.f_mntonname);

	/* Skip mounts whose path doesn't start with the parent's path. */
	if (strncmp(mp->mnt_vfsstat.f_mntonname, pmp->mnt_vfsstat.f_mntonname, prefix_len) != 0) {
		return 0;
	}

	/* Require a true sub-path, not just a common string prefix. */
	if (mp->mnt_vfsstat.f_mntonname[prefix_len] != '/') {
		return 0;
	}

	/* Don't block: a busy mount fails the whole update. */
	if ((error = vfs_busy(mp, LK_NOWAIT))) {
		printf("vfs_busy failed with %d for %s\n", error, mp->mnt_vfsstat.f_mntonname);
		return -1;
	}

	/* Recompute the mount-on path from the covered vnode, in place. */
	size_t pathlen = MAXPATHLEN;
	if ((error = vn_getpath_ext(mp->mnt_vnodecovered, NULL, mp->mnt_vfsstat.f_mntonname, &pathlen, VN_GETPATH_FSENTER))) {
		printf("vn_getpath_ext failed with %d for mnt_vnodecovered of %s\n", error, mp->mnt_vfsstat.f_mntonname);
	}

	vfs_unbusy(mp);

	return error;
}
8899
8900 /*
8901 * Rename files. Source and destination must either both be directories,
8902 * or both not be directories. If target is a directory, it must be empty.
8903 */
8904 /* ARGSUSED */
/*
 * renameat_internal: common implementation behind rename(2), renameat(2)
 * and renameatx_np(2).
 *
 * Performs the two lookups (source via DELETE/WANTPARENT, target via
 * RENAME/WANTPARENT), authorizes the operation, handles the special case
 * of renaming a mount point, serializes tree-reshaping renames under the
 * mount's rename lock, issues vn_rename(), and emits fsevents / kauth
 * fileop notifications.  Several races force a full restart ("retry"),
 * and compound-VNOP filesystems may ask for a continued lookup
 * ("continue_lookup" / EKEEPLOOKING).
 */
/* ARGSUSED */
static int
renameat_internal(vfs_context_t ctx, int fromfd, user_addr_t from,
    int tofd, user_addr_t to, int segflg, u_int uflags)
{
	vnode_t tvp, tdvp;
	vnode_t fvp, fdvp;
	vnode_t mnt_fvp;
	struct nameidata *fromnd, *tond;
	int error = 0;
	int do_retry;
	int retry_count;
	int mntrename;
	int need_event;
	int need_kpath2;
	int has_listeners;
	const char *oname = NULL;
	char *from_name = NULL, *to_name = NULL;
	char *from_name_no_firmlink = NULL, *to_name_no_firmlink = NULL;
	int from_len = 0, to_len = 0;
	int from_len_no_firmlink = 0, to_len_no_firmlink = 0;
	int holding_mntlock;
	int vn_authorize_skipped;
	mount_t locked_mp = NULL;
	vnode_t oparent = NULLVP;
#if CONFIG_FSE
	fse_info from_finfo = {}, to_finfo;
#endif
	int from_truncated = 0, to_truncated = 0;
	int from_truncated_no_firmlink = 0, to_truncated_no_firmlink = 0;
	int batched = 0;
	struct vnode_attr *fvap, *tvap;
	int continuing = 0;
	vfs_rename_flags_t flags = uflags & VFS_RENAME_FLAGS_MASK;
	int32_t nofollow_any = 0;
	/* carving out a chunk for structs that are too big to be on stack. */
	struct {
		struct nameidata from_node, to_node;
		struct vnode_attr fv_attr, tv_attr;
	} * __rename_data;

	__rename_data = kalloc_type(typeof(*__rename_data), Z_WAITOK);
	fromnd = &__rename_data->from_node;
	tond = &__rename_data->to_node;

	holding_mntlock = 0;
	do_retry = 0;
	retry_count = 0;
retry:
	/* Reset all per-attempt state; we may come back here after a race. */
	fvp = tvp = NULL;
	fdvp = tdvp = NULL;
	fvap = tvap = NULL;
	mnt_fvp = NULLVP;
	mntrename = FALSE;
	vn_authorize_skipped = FALSE;

	if (uflags & RENAME_NOFOLLOW_ANY) {
		nofollow_any = NAMEI_NOFOLLOW_ANY;
	}
	NDINIT(fromnd, DELETE, OP_UNLINK, WANTPARENT | AUDITVNPATH1,
	    segflg, from, ctx);
	fromnd->ni_flag = NAMEI_COMPOUNDRENAME | nofollow_any;

	NDINIT(tond, RENAME, OP_RENAME, WANTPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK,
	    segflg, to, ctx);
	tond->ni_flag = NAMEI_COMPOUNDRENAME | nofollow_any;

continue_lookup:
	/* (Re-)run whichever lookup the filesystem asked us to continue. */
	if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
		if ((error = nameiat(fromnd, fromfd))) {
			goto out1;
		}
		fdvp = fromnd->ni_dvp;
		fvp = fromnd->ni_vp;

		if (fvp && fvp->v_type == VDIR) {
			tond->ni_cnd.cn_flags |= WILLBEDIR;
		}
	}

	if ((tond->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
		if ((error = nameiat(tond, tofd))) {
			/*
			 * Translate error code for rename("dir1", "dir2/.").
			 */
			if (error == EISDIR && fvp->v_type == VDIR) {
				error = EINVAL;
			}
			goto out1;
		}
		tdvp = tond->ni_dvp;
		tvp = tond->ni_vp;
	}

#if DEVELOPMENT || DEBUG
	/*
	 * XXX VSWAP: Check for entitlements or special flag here
	 * so we can restrict access appropriately.
	 */
#else /* DEVELOPMENT || DEBUG */

	/* Userspace may not rename swap files. */
	if (fromnd->ni_vp && vnode_isswap(fromnd->ni_vp) && (ctx != vfs_context_kernel())) {
		error = EPERM;
		goto out1;
	}

	if (tond->ni_vp && vnode_isswap(tond->ni_vp) && (ctx != vfs_context_kernel())) {
		error = EPERM;
		goto out1;
	}
#endif /* DEVELOPMENT || DEBUG */

	/* Swap requires both ends to exist. */
	if (!tvp && ISSET(flags, VFS_RENAME_SWAP)) {
		error = ENOENT;
		goto out1;
	}

	if (tvp && ISSET(flags, VFS_RENAME_EXCL)) {
		int32_t pval = 0;
		int err = 0;

		/*
		 * We allow rename with VFS_RENAME_EXCL flag for an existing file which
		 * has the same name as target iff the following conditions are met:
		 * 1. the target file system is case insensitive
		 * 2. source and target directories are the same
		 * 3. source and target files are the same
		 * 4. name only differs in case (determined by underlying filesystem)
		 */
		if (fvp != tvp || fdvp != tdvp) {
			error = EEXIST;
			goto out1;
		}

		/*
		 * Assume that the target file system is case sensitive if
		 * _PC_CASE_SENSITIVE selector isn't supported.
		 */
		err = VNOP_PATHCONF(tvp, _PC_CASE_SENSITIVE, &pval, ctx);
		if (err != 0 || pval != 0) {
			error = EEXIST;
			goto out1;
		}
	}

	batched = vnode_compound_rename_available(fdvp);

#if CONFIG_FSE
	need_event = need_fsevent(FSE_RENAME, fdvp);
	if (need_event) {
		if (fvp) {
			get_fse_info(fvp, &from_finfo, ctx);
		} else {
			error = vfs_get_notify_attributes(&__rename_data->fv_attr);
			if (error) {
				goto out1;
			}

			fvap = &__rename_data->fv_attr;
		}

		if (tvp) {
			get_fse_info(tvp, &to_finfo, ctx);
		} else if (batched) {
			error = vfs_get_notify_attributes(&__rename_data->tv_attr);
			if (error) {
				goto out1;
			}

			tvap = &__rename_data->tv_attr;
		}
	}
#else
	need_event = 0;
#endif /* CONFIG_FSE */

	has_listeners = kauth_authorize_fileop_has_listeners();

	need_kpath2 = 0;
#if CONFIG_AUDIT
	if (AUDIT_RECORD_EXISTS()) {
		need_kpath2 = 1;
	}
#endif

	/* Build the source paths only if someone will consume them. */
	if (need_event || has_listeners) {
		if (from_name == NULL) {
			GET_PATH(from_name);
		}

		from_len = safe_getpath(fdvp, fromnd->ni_cnd.cn_nameptr, from_name, MAXPATHLEN, &from_truncated);

		if (from_name_no_firmlink == NULL) {
			GET_PATH(from_name_no_firmlink);
		}

		from_len_no_firmlink = safe_getpath_no_firmlink(fdvp, fromnd->ni_cnd.cn_nameptr, from_name_no_firmlink, MAXPATHLEN, &from_truncated_no_firmlink);
	}

	if (need_event || need_kpath2 || has_listeners) {
		if (to_name == NULL) {
			GET_PATH(to_name);
		}

		to_len = safe_getpath(tdvp, tond->ni_cnd.cn_nameptr, to_name, MAXPATHLEN, &to_truncated);

		if (to_name_no_firmlink == NULL) {
			GET_PATH(to_name_no_firmlink);
		}

		to_len_no_firmlink = safe_getpath_no_firmlink(tdvp, tond->ni_cnd.cn_nameptr, to_name_no_firmlink, MAXPATHLEN, &to_truncated_no_firmlink);
		if (to_name && need_kpath2) {
			AUDIT_ARG(kpath, to_name, ARG_KPATH2);
		}
	}
	if (!fvp) {
		/*
		 * Claim: this check will never reject a valid rename.
		 * For success, either fvp must be on the same mount as tdvp, or fvp must sit atop a vnode on the same mount as tdvp.
		 * Suppose fdvp and tdvp are not on the same mount.
		 * If fvp is on the same mount as tdvp, then fvp is not on the same mount as fdvp, so fvp is the root of its filesystem.  If fvp is the root,
		 * then you can't move it to within another dir on the same mountpoint.
		 * If fvp sits atop a vnode on the same mount as fdvp, then that vnode must be part of the same mount as fdvp, which is a contradiction.
		 *
		 * If this check passes, then we are safe to pass these vnodes to the same FS.
		 */
		if (fdvp->v_mount != tdvp->v_mount) {
			error = EXDEV;
			goto out1;
		}
		goto skipped_lookup;
	}

	/*
	 * If the source and destination are the same (i.e. they're
	 * links to the same vnode) and the target file system is
	 * case sensitive, then there is nothing to do.
	 *
	 * XXX Come back to this.
	 */
	if (fvp == tvp) {
		int pathconf_val;

		/*
		 * Note: if _PC_CASE_SENSITIVE selector isn't supported,
		 * then assume that this file system is case sensitive.
		 */
		if (VNOP_PATHCONF(fvp, _PC_CASE_SENSITIVE, &pathconf_val, ctx) != 0 ||
		    pathconf_val != 0) {
			/* Authorization is deferred to out1 in this case. */
			vn_authorize_skipped = TRUE;
			goto out1;
		}
	}

	/*
	 * Allow the renaming of mount points.
	 * - target must not exist
	 * - target must reside in the same directory as source
	 * - union mounts cannot be renamed
	 * - the root fs, and tightly-linked system volumes, cannot be renamed
	 *
	 * XXX Handle this in VFS after a continued lookup (if we missed
	 * in the cache to start off)
	 *
	 * N.B. If RENAME_SWAP is being used, then @tvp != NULL and so
	 * we'll skip past here.  The file system is responsible for
	 * checking that @tvp is not a descendent of @fvp and vice versa
	 * so it should always return EINVAL if either @tvp or @fvp is the
	 * root of a volume.
	 */
	if ((fvp->v_flag & VROOT) &&
	    (fvp->v_type == VDIR) &&
	    (tvp == NULL) &&
	    (fvp->v_mountedhere == NULL) &&
	    (fdvp == tdvp) &&
	    ((fvp->v_mount->mnt_flag & (MNT_UNION | MNT_ROOTFS)) == 0) &&
	    ((fvp->v_mount->mnt_kern_flag & MNTK_SYSTEM) == 0) &&
	    (fvp->v_mount->mnt_vnodecovered != NULLVP)) {
		vnode_t coveredvp;

		/* switch fvp to the covered vnode */
		coveredvp = fvp->v_mount->mnt_vnodecovered;
		if ((vnode_getwithref(coveredvp))) {
			error = ENOENT;
			goto out1;
		}
		/*
		 * Save the 'fvp' as it is needed for vn_authorize_renamex_with_paths()
		 * later.
		 */
		mnt_fvp = fvp;

		fvp = coveredvp;
		mntrename = TRUE;
	}
	/*
	 * Check for cross-device rename.
	 */
	if ((fvp->v_mount != tdvp->v_mount) ||
	    (tvp && (fvp->v_mount != tvp->v_mount))) {
		error = EXDEV;
		goto out1;
	}

	/*
	 * If source is the same as the destination (that is the
	 * same inode number) then there is nothing to do...
	 * EXCEPT if the underlying file system supports case
	 * insensitivity and is case preserving.  In this case
	 * the file system needs to handle the special case of
	 * getting the same vnode as target (fvp) and source (tvp).
	 *
	 * Only file systems that support pathconf selectors _PC_CASE_SENSITIVE
	 * and _PC_CASE_PRESERVING can have this exception, and they need to
	 * handle the special case of getting the same vnode as target and
	 * source.  NOTE: Then the target is unlocked going into vnop_rename,
	 * so not to cause locking problems. There is a single reference on tvp.
	 *
	 * NOTE - that fvp == tvp also occurs if they are hard linked and
	 * that correct behaviour then is just to return success without doing
	 * anything.
	 *
	 * XXX filesystem should take care of this itself, perhaps...
	 */
	if (fvp == tvp && fdvp == tdvp) {
		if (fromnd->ni_cnd.cn_namelen == tond->ni_cnd.cn_namelen &&
		    !bcmp(fromnd->ni_cnd.cn_nameptr, tond->ni_cnd.cn_nameptr,
		    fromnd->ni_cnd.cn_namelen)) {
			vn_authorize_skipped = TRUE;
			goto out1;
		}
	}

	if (holding_mntlock && fvp->v_mount != locked_mp) {
		/*
		 * we're holding a reference and lock
		 * on locked_mp, but it no longer matches
		 * what we want to do... so drop our hold
		 */
		mount_unlock_renames(locked_mp);
		mount_drop(locked_mp, 0);
		holding_mntlock = 0;
	}
	if (tdvp != fdvp && fvp->v_type == VDIR) {
		/*
		 * serialize renames that re-shape
		 * the tree... if holding_mntlock is
		 * set, then we're ready to go...
		 * otherwise we
		 * first need to drop the iocounts
		 * we picked up, second take the
		 * lock to serialize the access,
		 * then finally start the lookup
		 * process over with the lock held
		 */
		if (!holding_mntlock) {
			/*
			 * need to grab a reference on
			 * the mount point before we
			 * drop all the iocounts... once
			 * the iocounts are gone, the mount
			 * could follow
			 */
			locked_mp = fvp->v_mount;
			mount_ref(locked_mp, 0);

			/*
			 * nameidone has to happen before we vnode_put(tvp)
			 * since it may need to release the fs_nodelock on the tvp
			 */
			nameidone(tond);

			if (tvp) {
				vnode_put(tvp);
			}
			vnode_put(tdvp);

			/*
			 * nameidone has to happen before we vnode_put(fdvp)
			 * since it may need to release the fs_nodelock on the fvp
			 */
			nameidone(fromnd);

			vnode_put(fvp);
			vnode_put(fdvp);

			if (mnt_fvp != NULLVP) {
				vnode_put(mnt_fvp);
			}

			mount_lock_renames(locked_mp);
			holding_mntlock = 1;

			/* Redo both lookups with the rename lock held. */
			goto retry;
		}
	} else {
		/*
		 * when we dropped the iocounts to take
		 * the lock, we allowed the identity of
		 * the various vnodes to change... if they did,
		 * we may no longer be dealing with a rename
		 * that reshapes the tree... once we're holding
		 * the iocounts, the vnodes can't change type
		 * so we're free to drop the lock at this point
		 * and continue on
		 */
		if (holding_mntlock) {
			mount_unlock_renames(locked_mp);
			mount_drop(locked_mp, 0);
			holding_mntlock = 0;
		}
	}

	if (!batched) {
		error = vn_authorize_renamex_with_paths(fdvp, mntrename ? mnt_fvp : fvp,
		    &fromnd->ni_cnd, from_name, tdvp, tvp, &tond->ni_cnd, to_name, ctx,
		    flags, NULL);
		if (error) {
			if (error == ENOENT) {
				if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
					/*
					 * We encountered a race where after doing the namei,
					 * tvp stops being valid. If so, simply re-drive the rename
					 * call from the top.
					 */
					do_retry = 1;
					retry_count += 1;
				}
			}
			goto out1;
		}
	}

	/* Release the 'mnt_fvp' now that it is no longer needed. */
	if (mnt_fvp != NULLVP) {
		vnode_put(mnt_fvp);
		mnt_fvp = NULLVP;
	}

	// save these off so we can later verify that fvp is the same
	oname   = fvp->v_name;
	oparent = fvp->v_parent;

skipped_lookup:
#if CONFIG_FILE_LEASES
	/* Lease break needed for source's parent dir? */
	vnode_breakdirlease(fdvp, false, O_WRONLY);

	/* Lease break needed for target's parent dir? */
	vnode_breakdirlease(tdvp, false, O_WRONLY);
#endif

	error = vn_rename(fdvp, &fvp, &fromnd->ni_cnd, fvap,
	    tdvp, &tvp, &tond->ni_cnd, tvap,
	    flags, ctx);

	if (holding_mntlock) {
		/*
		 * we can drop our serialization
		 * lock now
		 */
		mount_unlock_renames(locked_mp);
		mount_drop(locked_mp, 0);
		holding_mntlock = 0;
	}
	if (error) {
		if (error == EDATALESS) {
			/*
			 * If we've been here before, something has gone
			 * horribly wrong and we should just get out lest
			 * we spiral around the drain forever.
			 */
			if (flags & VFS_RENAME_DATALESS) {
				error = EIO;
				goto out1;
			}

			/*
			 * The object we're renaming is dataless (or has a
			 * dataless descendent) and requires materialization
			 * before the rename occurs.  But we're holding the
			 * mount point's rename lock, so it's not safe to
			 * make the upcall.
			 *
			 * In this case, we release the lock, perform the
			 * materialization, and start the whole thing over.
			 */
			error = vnode_materialize_dataless_file(fvp,
			    NAMESPACE_HANDLER_RENAME_OP);

			if (error == 0) {
				/*
				 * The next time around we need to tell the
				 * file system that the materializtaion has
				 * been performed.
				 */
				flags |= VFS_RENAME_DATALESS;
				do_retry = 1;
			}
			goto out1;
		}
		if (error == EKEEPLOOKING) {
			/* Compound VNOP needs the lookup continued. */
			if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) == 0) {
				if ((tond->ni_flag & NAMEI_CONTLOOKUP) == 0) {
					panic("EKEEPLOOKING without NAMEI_CONTLOOKUP on either ndp?");
				}
			}

			fromnd->ni_vp = fvp;
			tond->ni_vp = tvp;

			goto continue_lookup;
		}

		/*
		 * We may encounter a race in the VNOP where the destination didn't
		 * exist when we did the namei, but it does by the time we go and
		 * try to create the entry. In this case, we should re-drive this rename
		 * call from the top again.  Currently, only HFS bubbles out ERECYCLE,
		 * but other filesystems susceptible to this race could return it, too.
		 */
		if (error == ERECYCLE) {
			if (retry_count < MAX_RENAME_ERECYCLE_RETRIES) {
				do_retry = 1;
				retry_count += 1;
			} else {
				printf("rename retry limit due to ERECYCLE reached\n");
				error = ENOENT;
			}
		}

		/*
		 * For compound VNOPs, the authorization callback may return
		 * ENOENT in case of racing hardlink lookups hitting the name
		 * cache, redrive the lookup.
		 */
		if (batched && error == ENOENT) {
			if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
				do_retry = 1;
				retry_count += 1;
			}
		}

		goto out1;
	}

	/* call out to allow 3rd party notification of rename.
	 * Ignore result of kauth_authorize_fileop call.
	 */
	kauth_authorize_fileop(vfs_context_ucred(ctx),
	    KAUTH_FILEOP_RENAME,
	    (uintptr_t)from_name, (uintptr_t)to_name);
	if (flags & VFS_RENAME_SWAP) {
		kauth_authorize_fileop(vfs_context_ucred(ctx),
		    KAUTH_FILEOP_RENAME,
		    (uintptr_t)to_name, (uintptr_t)from_name);
	}

#if CONFIG_FSE
	if (from_name != NULL && to_name != NULL) {
		if (from_truncated || to_truncated) {
			// set it here since only the from_finfo gets reported up to user space
			from_finfo.mode |= FSE_TRUNCATED_PATH;
		}

		if (tvap && tvp) {
			vnode_get_fse_info_from_vap(tvp, &to_finfo, tvap);
		}
		if (fvap) {
			vnode_get_fse_info_from_vap(fvp, &from_finfo, fvap);
		}

		if (tvp) {
			add_fsevent(FSE_RENAME, ctx,
			    FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
			    FSE_ARG_FINFO, &from_finfo,
			    FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
			    FSE_ARG_FINFO, &to_finfo,
			    FSE_ARG_DONE);
			if (flags & VFS_RENAME_SWAP) {
				/*
				 * Strictly speaking, swap is the equivalent of
				 * *three* renames.  FSEvents clients should only take
				 * the events as a hint, so we only bother reporting
				 * two.
				 */
				add_fsevent(FSE_RENAME, ctx,
				    FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
				    FSE_ARG_FINFO, &to_finfo,
				    FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
				    FSE_ARG_FINFO, &from_finfo,
				    FSE_ARG_DONE);
			}
		} else {
			add_fsevent(FSE_RENAME, ctx,
			    FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
			    FSE_ARG_FINFO, &from_finfo,
			    FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
			    FSE_ARG_DONE);
		}
	}
#endif /* CONFIG_FSE */

	/*
	 * update filesystem's mount point data
	 */
	if (mntrename) {
		char *cp, *pathend, *mpname;
		char * tobuf;
		struct mount *mp;
		int maxlen;
		size_t len = 0;

		mp = fvp->v_mountedhere;

		if (vfs_busy(mp, LK_NOWAIT)) {
			error = EBUSY;
			goto out1;
		}
		tobuf = zalloc(ZV_NAMEI);

		if (UIO_SEG_IS_USER_SPACE(segflg)) {
			error = copyinstr(to, tobuf, MAXPATHLEN, &len);
		} else {
			error = copystr((void *)to, tobuf, MAXPATHLEN, &len);
		}
		if (!error) {
			/* find current mount point prefix */
			pathend = &mp->mnt_vfsstat.f_mntonname[0];
			for (cp = pathend; *cp != '\0'; ++cp) {
				if (*cp == '/') {
					pathend = cp + 1;
				}
			}
			/* find last component of target name */
			for (mpname = cp = tobuf; *cp != '\0'; ++cp) {
				if (*cp == '/') {
					mpname = cp + 1;
				}
			}

			/* Update f_mntonname of sub mounts */
			vfs_iterate(0, rename_submounts_callback, (void *)mp);

			/* append name to prefix */
			maxlen = MAXPATHLEN - (int)(pathend - mp->mnt_vfsstat.f_mntonname);
			bzero(pathend, maxlen);

			strlcpy(pathend, mpname, maxlen);
		}
		zfree(ZV_NAMEI, tobuf);

		vfs_unbusy(mp);

		vfs_event_signal(NULL, VQ_UPDATE, (intptr_t)NULL);
	}
	/*
	 * fix up name & parent pointers.  note that we first
	 * check that fvp has the same name/parent pointers it
	 * had before the rename call... this is a 'weak' check
	 * at best...
	 *
	 * XXX oparent and oname may not be set in the compound vnop case
	 */
	if (batched || (oname == fvp->v_name && oparent == fvp->v_parent)) {
		int update_flags;

		update_flags = VNODE_UPDATE_NAME;

		if (fdvp != tdvp) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		vnode_update_identity(fvp, tdvp, tond->ni_cnd.cn_nameptr, tond->ni_cnd.cn_namelen, tond->ni_cnd.cn_hash, update_flags);
	}
out1:
	/*
	 * There are some cases (for e.g. 'fvp == tvp') when vn_authorize was
	 * skipped earlier as no actual rename was performed.
	 */
	if (vn_authorize_skipped && error == 0) {
		error = vn_authorize_renamex_with_paths(fdvp, fvp,
		    &fromnd->ni_cnd, from_name, tdvp, tvp, &tond->ni_cnd, to_name, ctx,
		    flags, NULL);
		if (error && error == ENOENT) {
			if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
				do_retry = 1;
				retry_count += 1;
			}
		}
	}
	if (to_name != NULL) {
		RELEASE_PATH(to_name);
		to_name = NULL;
	}
	if (to_name_no_firmlink != NULL) {
		RELEASE_PATH(to_name_no_firmlink);
		to_name_no_firmlink = NULL;
	}
	if (from_name != NULL) {
		RELEASE_PATH(from_name);
		from_name = NULL;
	}
	if (from_name_no_firmlink != NULL) {
		RELEASE_PATH(from_name_no_firmlink);
		from_name_no_firmlink = NULL;
	}
	if (holding_mntlock) {
		mount_unlock_renames(locked_mp);
		mount_drop(locked_mp, 0);
		holding_mntlock = 0;
	}
	if (tdvp) {
		/*
		 * nameidone has to happen before we vnode_put(tdvp)
		 * since it may need to release the fs_nodelock on the tdvp
		 */
		nameidone(tond);

		if (tvp) {
			vnode_put(tvp);
		}
		vnode_put(tdvp);
	}
	if (fdvp) {
		/*
		 * nameidone has to happen before we vnode_put(fdvp)
		 * since it may need to release the fs_nodelock on the fdvp
		 */
		nameidone(fromnd);

		if (fvp) {
			vnode_put(fvp);
		}
		vnode_put(fdvp);
	}
	if (mnt_fvp != NULLVP) {
		vnode_put(mnt_fvp);
	}
	/*
	 * If things changed after we did the namei, then we will re-drive
	 * this rename call from the top.
	 */
	if (do_retry) {
		do_retry = 0;
		goto retry;
	}

	kfree_type(typeof(*__rename_data), __rename_data);
	return error;
}
9655
9656 int
rename(__unused proc_t p,struct rename_args * uap,__unused int32_t * retval)9657 rename(__unused proc_t p, struct rename_args *uap, __unused int32_t *retval)
9658 {
9659 return renameat_internal(vfs_context_current(), AT_FDCWD, uap->from,
9660 AT_FDCWD, uap->to, UIO_USERSPACE, 0);
9661 }
9662
9663 int
renameatx_np(__unused proc_t p,struct renameatx_np_args * uap,__unused int32_t * retval)9664 renameatx_np(__unused proc_t p, struct renameatx_np_args *uap, __unused int32_t *retval)
9665 {
9666 if (uap->flags & ~(RENAME_SECLUDE | RENAME_EXCL | RENAME_SWAP | RENAME_NOFOLLOW_ANY)) {
9667 return EINVAL;
9668 }
9669
9670 if ((uap->flags & (RENAME_EXCL | RENAME_SWAP)) == (RENAME_EXCL | RENAME_SWAP)) {
9671 return EINVAL;
9672 }
9673
9674 return renameat_internal(vfs_context_current(), uap->fromfd, uap->from,
9675 uap->tofd, uap->to, UIO_USERSPACE, uap->flags);
9676 }
9677
9678 int
renameat(__unused proc_t p,struct renameat_args * uap,__unused int32_t * retval)9679 renameat(__unused proc_t p, struct renameat_args *uap, __unused int32_t *retval)
9680 {
9681 return renameat_internal(vfs_context_current(), uap->fromfd, uap->from,
9682 uap->tofd, uap->to, UIO_USERSPACE, 0);
9683 }
9684
9685 /*
9686 * Make a directory file.
9687 *
9688 * Returns: 0 Success
9689 * EEXIST
9690 * namei:???
9691 * vnode_authorize:???
9692 * vn_create:???
9693 */
9694 /* ARGSUSED */
/*
 * mkdir1at: create a directory at 'path' (resolved relative to 'fd')
 * with the attributes in 'vap'.  Supports compound-VNOP mkdir
 * (NAMEI_COMPOUNDMKDIR / EKEEPLOOKING continuation) and falls back to a
 * plain lookup on EACCES/EPERM so that an existing target reports EEXIST
 * rather than a permission error.
 */
static int
mkdir1at(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap, int fd,
    enum uio_seg segflg)
{
	vnode_t vp, dvp;
	int error;
	int update_flags = 0;
	int batched;
	struct nameidata nd;

	AUDIT_ARG(mode, vap->va_mode);
	NDINIT(&nd, CREATE, OP_MKDIR, LOCKPARENT | AUDITVNPATH1, segflg,
	    path, ctx);
	nd.ni_cnd.cn_flags |= WILLBEDIR;
	nd.ni_flag = NAMEI_COMPOUNDMKDIR;

continue_lookup:
	/* Re-entered if the filesystem returns EKEEPLOOKING below. */
	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* Target already exists. */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}

	batched = vnode_compound_mkdir_available(dvp);

	VATTR_SET(vap, va_type, VDIR);

	/*
	 * XXX
	 * Don't authorize in VFS for compound VNOP.... mkdir -p today assumes that it will
	 * only get EXISTS or EISDIR for existing path components, and not that it could see
	 * EACCESS/EPERM--so if we authorize for mkdir on "/" for "mkdir -p /tmp/foo/bar/baz"
	 * it will fail in a spurious manner.  Need to figure out if this is valid behavior.
	 */
	if ((error = vn_authorize_mkdir(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0) {
		if (error == EACCES || error == EPERM) {
			int error2;

			nameidone(&nd);
			vnode_put(dvp);
			dvp = NULLVP;

			/*
			 * Try a lookup without "NAMEI_COMPOUNDVNOP" to make sure we return EEXIST
			 * rather than EACCESS if the target exists.
			 */
			NDINIT(&nd, LOOKUP, OP_MKDIR, AUDITVNPATH1, segflg,
			    path, ctx);
			error2 = nameiat(&nd, fd);
			if (error2) {
				goto out;
			} else {
				/* Target exists: report EEXIST, not EACCES. */
				vp = nd.ni_vp;
				error = EEXIST;
				goto out;
			}
		}

		goto out;
	}

#if CONFIG_FILE_LEASES
	/* Break any directory lease on the parent before creating. */
	vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

	/*
	 * make the directory
	 */
	if ((error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx)) != 0) {
		if (error == EKEEPLOOKING) {
			/* Compound VNOP wants the lookup continued. */
			nd.ni_vp = vp;
			goto continue_lookup;
		}

		goto out;
	}

	// Make sure the name & parent pointers are hooked up
	if (vp->v_name == NULL) {
		update_flags |= VNODE_UPDATE_NAME;
	}
	if (vp->v_parent == NULLVP) {
		update_flags |= VNODE_UPDATE_PARENT;
	}

	if (update_flags) {
		vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
	}

#if CONFIG_FSE
	add_fsevent(FSE_CREATE_DIR, ctx, FSE_ARG_VNODE, vp, FSE_ARG_DONE);
#endif

out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	if (dvp) {
		vnode_put(dvp);
	}

	return error;
}
9810
9811 /*
9812 * mkdir_extended: Create a directory; with extended security (ACL).
9813 *
9814 * Parameters: p Process requesting to create the directory
9815 * uap User argument descriptor (see below)
9816 * retval (ignored)
9817 *
9818 * Indirect: uap->path Path of directory to create
9819 * uap->mode Access permissions to set
9820 * uap->xsecurity ACL to set
9821 *
9822 * Returns: 0 Success
9823 * !0 Not success
9824 *
9825 */
9826 int
mkdir_extended(proc_t p,struct mkdir_extended_args * uap,__unused int32_t * retval)9827 mkdir_extended(proc_t p, struct mkdir_extended_args *uap, __unused int32_t *retval)
9828 {
9829 int ciferror;
9830 kauth_filesec_t xsecdst;
9831 struct vnode_attr va;
9832
9833 AUDIT_ARG(owner, uap->uid, uap->gid);
9834
9835 xsecdst = NULL;
9836 if ((uap->xsecurity != USER_ADDR_NULL) &&
9837 ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)) {
9838 return ciferror;
9839 }
9840
9841 VATTR_INIT(&va);
9842 VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd.fd_cmask);
9843 if (xsecdst != NULL) {
9844 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
9845 va.va_vaflags |= VA_FILESEC_ACL;
9846 }
9847
9848 ciferror = mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
9849 UIO_USERSPACE);
9850 if (xsecdst != NULL) {
9851 kauth_filesec_free(xsecdst);
9852 }
9853 return ciferror;
9854 }
9855
9856 int
mkdir(proc_t p,struct mkdir_args * uap,__unused int32_t * retval)9857 mkdir(proc_t p, struct mkdir_args *uap, __unused int32_t *retval)
9858 {
9859 struct vnode_attr va;
9860
9861 VATTR_INIT(&va);
9862 VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd.fd_cmask);
9863
9864 return mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
9865 UIO_USERSPACE);
9866 }
9867
9868 int
mkdirat(proc_t p,struct mkdirat_args * uap,__unused int32_t * retval)9869 mkdirat(proc_t p, struct mkdirat_args *uap, __unused int32_t *retval)
9870 {
9871 struct vnode_attr va;
9872
9873 VATTR_INIT(&va);
9874 VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd.fd_cmask);
9875
9876 return mkdir1at(vfs_context_current(), uap->path, &va, uap->fd,
9877 UIO_USERSPACE);
9878 }
9879
9880 static int
rmdirat_internal(vfs_context_t ctx,int fd,user_addr_t dirpath,enum uio_seg segflg,int unlink_flags)9881 rmdirat_internal(vfs_context_t ctx, int fd, user_addr_t dirpath,
9882 enum uio_seg segflg, int unlink_flags)
9883 {
9884 struct {
9885 struct nameidata nd;
9886 #if CONFIG_FSE
9887 struct vnode_attr va;
9888 #endif /* CONFIG_FSE */
9889 } *__rmdir_data;
9890 vnode_t vp, dvp;
9891 int error;
9892 struct nameidata *ndp;
9893 char *path = NULL;
9894 char *no_firmlink_path = NULL;
9895 int len_path = 0;
9896 int len_no_firmlink_path = 0;
9897 int has_listeners = 0;
9898 int need_event = 0;
9899 int truncated_path = 0;
9900 int truncated_no_firmlink_path = 0;
9901 struct vnode_attr *vap = NULL;
9902 int restart_count = 0;
9903 int batched;
9904
9905 int restart_flag;
9906
9907 __rmdir_data = kalloc_type(typeof(*__rmdir_data), Z_WAITOK);
9908 ndp = &__rmdir_data->nd;
9909
9910 /*
9911 * This loop exists to restart rmdir in the unlikely case that two
9912 * processes are simultaneously trying to remove the same directory
9913 * containing orphaned appleDouble files.
9914 */
9915 do {
9916 NDINIT(ndp, DELETE, OP_RMDIR, LOCKPARENT | AUDITVNPATH1,
9917 segflg, dirpath, ctx);
9918 ndp->ni_flag = NAMEI_COMPOUNDRMDIR;
9919 continue_lookup:
9920 restart_flag = 0;
9921 vap = NULL;
9922
9923 error = nameiat(ndp, fd);
9924 if (error) {
9925 goto err_out;
9926 }
9927
9928 dvp = ndp->ni_dvp;
9929 vp = ndp->ni_vp;
9930
9931 if (vp) {
9932 batched = vnode_compound_rmdir_available(vp);
9933
9934 if (vp->v_flag & VROOT) {
9935 /*
9936 * The root of a mounted filesystem cannot be deleted.
9937 */
9938 error = EBUSY;
9939 goto out;
9940 }
9941
9942 #if DEVELOPMENT || DEBUG
9943 /*
9944 * XXX VSWAP: Check for entitlements or special flag here
9945 * so we can restrict access appropriately.
9946 */
9947 #else /* DEVELOPMENT || DEBUG */
9948
9949 if (vnode_isswap(vp) && (ctx != vfs_context_kernel())) {
9950 error = EPERM;
9951 goto out;
9952 }
9953 #endif /* DEVELOPMENT || DEBUG */
9954
9955 /*
9956 * Removed a check here; we used to abort if vp's vid
9957 * was not the same as what we'd seen the last time around.
9958 * I do not think that check was valid, because if we retry
9959 * and all dirents are gone, the directory could legitimately
9960 * be recycled but still be present in a situation where we would
9961 * have had permission to delete. Therefore, we won't make
9962 * an effort to preserve that check now that we may not have a
9963 * vp here.
9964 */
9965
9966 if (!batched) {
9967 error = vn_authorize_rmdir(dvp, vp, &ndp->ni_cnd, ctx, NULL);
9968 if (error) {
9969 if (error == ENOENT) {
9970 if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
9971 restart_flag = 1;
9972 restart_count += 1;
9973 }
9974 }
9975 goto out;
9976 }
9977 }
9978 } else {
9979 batched = 1;
9980
9981 if (!vnode_compound_rmdir_available(dvp)) {
9982 panic("No error, but no compound rmdir?");
9983 }
9984 }
9985
9986 #if CONFIG_FSE
9987 fse_info finfo = {0};
9988
9989 need_event = need_fsevent(FSE_DELETE, dvp);
9990 if (need_event) {
9991 if (!batched) {
9992 get_fse_info(vp, &finfo, ctx);
9993 } else {
9994 error = vfs_get_notify_attributes(&__rmdir_data->va);
9995 if (error) {
9996 goto out;
9997 }
9998
9999 vap = &__rmdir_data->va;
10000 }
10001 }
10002 #endif
10003 has_listeners = kauth_authorize_fileop_has_listeners();
10004 if (need_event || has_listeners) {
10005 if (path == NULL) {
10006 GET_PATH(path);
10007 }
10008
10009 len_path = safe_getpath(dvp, ndp->ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path);
10010
10011 if (no_firmlink_path == NULL) {
10012 GET_PATH(no_firmlink_path);
10013 }
10014
10015 len_no_firmlink_path = safe_getpath_no_firmlink(dvp, ndp->ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);
10016 #if CONFIG_FSE
10017 if (truncated_no_firmlink_path) {
10018 finfo.mode |= FSE_TRUNCATED_PATH;
10019 }
10020 #endif
10021 }
10022
10023 #if CONFIG_FILE_LEASES
10024 vnode_breakdirlease(dvp, false, O_WRONLY);
10025 #endif
10026
10027 error = vn_rmdir(dvp, &vp, ndp, vap, ctx);
10028 ndp->ni_vp = vp;
10029 if (vp == NULLVP) {
10030 /* Couldn't find a vnode */
10031 goto out;
10032 }
10033
10034 if (error == EKEEPLOOKING) {
10035 goto continue_lookup;
10036 } else if (batched && error == ENOENT) {
10037 if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
10038 /*
10039 * For compound VNOPs, the authorization callback
10040 * may return ENOENT in case of racing hard link lookups
10041 * redrive the lookup.
10042 */
10043 restart_flag = 1;
10044 restart_count += 1;
10045 goto out;
10046 }
10047 }
10048
10049 /*
10050 * XXX There's no provision for passing flags
10051 * to VNOP_RMDIR(). So, if vn_rmdir() fails
10052 * because it's not empty, then we try again
10053 * with VNOP_REMOVE(), passing in a special
10054 * flag that clever file systems will know
10055 * how to handle.
10056 */
10057 if (error == ENOTEMPTY &&
10058 (unlink_flags & VNODE_REMOVE_DATALESS_DIR) != 0) {
10059 /*
10060 * If this fails, we want to keep the original
10061 * error.
10062 */
10063 if (vn_remove(dvp, &vp, ndp,
10064 VNODE_REMOVE_DATALESS_DIR, vap, ctx) == 0) {
10065 error = 0;
10066 }
10067 }
10068
10069 #if CONFIG_APPLEDOUBLE
10070 /*
10071 * Special case to remove orphaned AppleDouble
10072 * files. I don't like putting this in the kernel,
10073 * but carbon does not like putting this in carbon either,
10074 * so here we are.
10075 */
10076 if (error == ENOTEMPTY) {
10077 int ad_error = rmdir_remove_orphaned_appleDouble(vp, ctx, &restart_flag);
10078 if (ad_error == EBUSY) {
10079 error = ad_error;
10080 goto out;
10081 }
10082
10083
10084 /*
10085 * Assuming everything went well, we will try the RMDIR again
10086 */
10087 if (!ad_error) {
10088 error = vn_rmdir(dvp, &vp, ndp, vap, ctx);
10089 }
10090 }
10091 #endif /* CONFIG_APPLEDOUBLE */
10092 /*
10093 * Call out to allow 3rd party notification of delete.
10094 * Ignore result of kauth_authorize_fileop call.
10095 */
10096 if (!error) {
10097 if (has_listeners) {
10098 kauth_authorize_fileop(vfs_context_ucred(ctx),
10099 KAUTH_FILEOP_DELETE,
10100 (uintptr_t)vp,
10101 (uintptr_t)path);
10102 }
10103
10104 if (vp->v_flag & VISHARDLINK) {
10105 // see the comment in unlink1() about why we update
10106 // the parent of a hard link when it is removed
10107 vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
10108 }
10109
10110 #if CONFIG_FSE
10111 if (need_event) {
10112 if (vap) {
10113 vnode_get_fse_info_from_vap(vp, &finfo, vap);
10114 }
10115 add_fsevent(FSE_DELETE, ctx,
10116 FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
10117 FSE_ARG_FINFO, &finfo,
10118 FSE_ARG_DONE);
10119 }
10120 #endif
10121
10122 #if CONFIG_MACF
10123 mac_vnode_notify_unlink(ctx, dvp, vp, &ndp->ni_cnd);
10124 #endif
10125 }
10126
10127 out:
10128 if (path != NULL) {
10129 RELEASE_PATH(path);
10130 path = NULL;
10131 }
10132
10133 if (no_firmlink_path != NULL) {
10134 RELEASE_PATH(no_firmlink_path);
10135 no_firmlink_path = NULL;
10136 }
10137
10138 /*
10139 * nameidone has to happen before we vnode_put(dvp)
10140 * since it may need to release the fs_nodelock on the dvp
10141 */
10142 nameidone(ndp);
10143 vnode_put(dvp);
10144
10145 if (vp) {
10146 vnode_put(vp);
10147 }
10148
10149 if (restart_flag == 0) {
10150 wakeup_one((caddr_t)vp);
10151 goto err_out;
10152 }
10153 tsleep(vp, PVFS, "rm AD", 1);
10154 } while (restart_flag != 0);
10155
10156 err_out:
10157 kfree_type(typeof(*__rmdir_data), __rmdir_data);
10158
10159 return error;
10160 }
10161
10162 /*
10163 * Remove a directory file.
10164 */
10165 /* ARGSUSED */
int
rmdir(__unused proc_t p, struct rmdir_args *uap, __unused int32_t *retval)
{
	/*
	 * rmdir(2): remove the directory named by uap->path, resolved
	 * relative to the current working directory (AT_FDCWD), with no
	 * special unlink flags.
	 */
	return rmdirat_internal(vfs_context_current(), AT_FDCWD,
	    CAST_USER_ADDR_T(uap->path), UIO_USERSPACE, 0);
}
10172
10173 /* Get direntry length padded to 8 byte alignment */
10174 #define DIRENT64_LEN(namlen) \
10175 ((sizeof(struct direntry) + (namlen) - (MAXPATHLEN-1) + 7) & ~7)
10176
10177 /* Get dirent length padded to 4 byte alignment */
10178 #define DIRENT_LEN(namelen) \
10179 ((sizeof(struct dirent) + (namelen + 1) - (__DARWIN_MAXNAMLEN + 1) + 3) & ~3)
10180
10181 /* Get the end of this dirent */
10182 #define DIRENT_END(dep) \
10183 (((char *)(dep)) + (dep)->d_reclen - 1)
10184
10185 errno_t
vnode_readdir64(struct vnode * vp,struct uio * uio,int flags,int * eofflag,int * numdirent,vfs_context_t ctxp)10186 vnode_readdir64(struct vnode *vp, struct uio *uio, int flags, int *eofflag,
10187 int *numdirent, vfs_context_t ctxp)
10188 {
10189 /* Check if fs natively supports VNODE_READDIR_EXTENDED */
10190 if ((vp->v_mount->mnt_vtable->vfc_vfsflags & VFC_VFSREADDIR_EXTENDED) &&
10191 ((vp->v_mount->mnt_kern_flag & MNTK_DENY_READDIREXT) == 0)) {
10192 return VNOP_READDIR(vp, uio, flags, eofflag, numdirent, ctxp);
10193 } else {
10194 size_t bufsize;
10195 void * bufptr;
10196 uio_t auio;
10197 struct direntry *entry64;
10198 struct dirent *dep;
10199 size_t bytesread;
10200 int error;
10201
10202 /*
10203 * We're here because the underlying file system does not
10204 * support direnties or we mounted denying support so we must
10205 * fall back to dirents and convert them to direntries.
10206 *
10207 * Our kernel buffer needs to be smaller since re-packing will
10208 * expand each dirent. The worse case (when the name length
10209 * is 3 or less) corresponds to a struct direntry size of 32
10210 * bytes (8-byte aligned) and a struct dirent size of 12 bytes
10211 * (4-byte aligned). So having a buffer that is 3/8 the size
10212 * will prevent us from reading more than we can pack.
10213 *
10214 * Since this buffer is wired memory, we will limit the
10215 * buffer size to a maximum of 32K. We would really like to
10216 * use 32K in the MIN(), but we use magic number 87371 to
10217 * prevent uio_resid() * 3 / 8 from overflowing.
10218 */
10219 bufsize = 3 * MIN((user_size_t)uio_resid(uio), 87371u) / 8;
10220 bufptr = kalloc_data(bufsize, Z_WAITOK);
10221 if (bufptr == NULL) {
10222 return ENOMEM;
10223 }
10224
10225 auio = uio_create(1, 0, UIO_SYSSPACE, UIO_READ);
10226 uio_addiov(auio, (uintptr_t)bufptr, bufsize);
10227 auio->uio_offset = uio->uio_offset;
10228
10229 error = VNOP_READDIR(vp, auio, 0, eofflag, numdirent, ctxp);
10230
10231 dep = (struct dirent *)bufptr;
10232 bytesread = bufsize - uio_resid(auio);
10233
10234 entry64 = kalloc_type(struct direntry, Z_WAITOK);
10235 /*
10236 * Convert all the entries and copy them out to user's buffer.
10237 */
10238 while (error == 0 && (char *)dep < ((char *)bufptr + bytesread)) {
10239 /* First check that the dirent struct up to d_name is within the buffer */
10240 if ((char*)dep + offsetof(struct dirent, d_name) > ((char *)bufptr + bytesread) ||
10241 /* Check that the length of the entire dirent is within the buffer */
10242 DIRENT_END(dep) > ((char *)bufptr + bytesread) ||
10243 /* Check that the actual length including the name doesn't exceed d_reclen */
10244 DIRENT_LEN(dep->d_namlen) > dep->d_reclen) {
10245 printf("%s: %s: Bad dirent recived from directory %s\n", __func__,
10246 vp->v_mount->mnt_vfsstat.f_mntonname,
10247 vp->v_name ? vp->v_name : "<unknown>");
10248 error = EIO;
10249 break;
10250 }
10251
10252 size_t enbufsize = DIRENT64_LEN(dep->d_namlen);
10253
10254 bzero(entry64, enbufsize);
10255 /* Convert a dirent to a dirent64. */
10256 entry64->d_ino = dep->d_ino;
10257 entry64->d_seekoff = 0;
10258 entry64->d_reclen = (uint16_t)enbufsize;
10259 entry64->d_namlen = dep->d_namlen;
10260 entry64->d_type = dep->d_type;
10261 bcopy(dep->d_name, entry64->d_name, dep->d_namlen + 1);
10262
10263 /* Move to next entry. */
10264 dep = (struct dirent *)((char *)dep + dep->d_reclen);
10265
10266 /* Copy entry64 to user's buffer. */
10267 error = uiomove((caddr_t)entry64, entry64->d_reclen, uio);
10268 }
10269
10270 /* Update the real offset using the offset we got from VNOP_READDIR. */
10271 if (error == 0) {
10272 uio->uio_offset = auio->uio_offset;
10273 }
10274 uio_free(auio);
10275 kfree_data(bufptr, bufsize);
10276 kfree_type(struct direntry, entry64);
10277 return error;
10278 }
10279 }
10280
10281 #define GETDIRENTRIES_MAXBUFSIZE (128 * 1024 * 1024U)
10282
10283 /*
10284 * Read a block of directory entries in a file system independent format.
10285 */
/*
 * Common implementation for getdirentries(2) and getdirentries64(2):
 * read a block of directory entries from 'fd' into the user buffer
 * 'bufp'/'bufsize' in a file-system-independent format.
 *
 * On success, *bytesread holds the number of bytes produced, *offset (if
 * non-NULL) the directory offset at which this read started, and
 * *eofflag whether the end of the directory was reached.  'flags' may
 * contain VNODE_READDIR_EXTENDED to request 64-bit direntry records
 * (routed via vnode_readdir64).
 *
 * Returns 0 on success or an errno value.
 */
static int
getdirentries_common(int fd, user_addr_t bufp, user_size_t bufsize, ssize_t *bytesread,
    off_t *offset, int *eofflag, int flags)
{
	vnode_t vp;
	struct vfs_context context = *vfs_context_current(); /* local copy */
	struct fileproc *fp;
	uio_t auio;
	int spacetype = proc_is64bit(vfs_context_proc(&context)) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	off_t loff;
	int error, numdirent;
	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];

get_from_fd:
	error = fp_getfvp(vfs_context_proc(&context), fd, &fp, &vp);
	if (error) {
		return error;
	}

	/*
	 * Serialize offset updates on this open file.  If the vnode backing
	 * the fd changed while we were acquiring the offset lock (e.g. a
	 * union-mount traversal replaced it), drop everything and retry the
	 * lookup from the fd.
	 */
	vn_offset_lock(fp->fp_glob);
	if (((vnode_t)fp_get_data(fp)) != vp) {
		vn_offset_unlock(fp->fp_glob);
		file_drop(fd);
		goto get_from_fd;
	}

	/* The descriptor must have been opened for reading. */
	if ((fp->fp_glob->fg_flag & FREAD) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EBADF;
		goto out;
	}

	/* Clamp the request to the maximum supported transfer size. */
	if (bufsize > GETDIRENTRIES_MAXBUFSIZE) {
		bufsize = GETDIRENTRIES_MAXBUFSIZE;
	}

#if CONFIG_MACF
	error = mac_file_check_change_offset(vfs_context_ucred(&context), fp->fp_glob);
	if (error) {
		goto out;
	}
#endif

	if ((error = vnode_getwithref(vp))) {
		goto out;
	}
	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

#if CONFIG_UNION_MOUNTS
unionread:
#endif /* CONFIG_UNION_MOUNTS */
	if (vp->v_type != VDIR) {
		(void)vnode_put(vp);
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_readdir(&context, vp);
	if (error != 0) {
		(void)vnode_put(vp);
		goto out;
	}
#endif /* MAC */

	/* Read starting at the file's current offset. */
	loff = fp->fp_glob->fg_offset;
	auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, bufp, bufsize);

	if (flags & VNODE_READDIR_EXTENDED) {
		error = vnode_readdir64(vp, auio, flags, eofflag, &numdirent, &context);
		fp->fp_glob->fg_offset = uio_offset(auio);
	} else {
		error = VNOP_READDIR(vp, auio, 0, eofflag, &numdirent, &context);
		fp->fp_glob->fg_offset = uio_offset(auio);
	}
	if (error) {
		(void)vnode_put(vp);
		goto out;
	}

#if CONFIG_UNION_MOUNTS
	/*
	 * An empty read on a union mount means this layer is exhausted:
	 * drop down to the covered directory, install it in the fd, and
	 * restart the read there.
	 */
	if ((user_ssize_t)bufsize == uio_resid(auio) &&
	    (vp->v_mount->mnt_flag & MNT_UNION)) {
		vnode_t uvp;

		if (lookup_traverse_union(vp, &uvp, &context) == 0) {
			if (vnode_ref(uvp) == 0) {
				fp_set_data(fp, uvp);
				fp->fp_glob->fg_offset = 0;
				vnode_rele(vp);
				vnode_put(vp);
				vp = uvp;
				goto unionread;
			} else {
				/* could not get a ref, can't replace in fd */
				vnode_put(uvp);
			}
		}
	}
#endif /* CONFIG_UNION_MOUNTS */

	vnode_put(vp);
	if (offset) {
		*offset = loff;
	}

	*bytesread = bufsize - uio_resid(auio);
out:
	vn_offset_unlock(fp->fp_glob);
	file_drop(fd);
	return error;
}
10399
10400
10401 int
getdirentries(__unused struct proc * p,struct getdirentries_args * uap,int32_t * retval)10402 getdirentries(__unused struct proc *p, struct getdirentries_args *uap, int32_t *retval)
10403 {
10404 off_t offset;
10405 ssize_t bytesread;
10406 int error, eofflag;
10407
10408 AUDIT_ARG(fd, uap->fd);
10409 error = getdirentries_common(uap->fd, uap->buf, uap->count,
10410 &bytesread, &offset, &eofflag, 0);
10411
10412 if (error == 0) {
10413 if (proc_is64bit(p)) {
10414 user64_long_t base = (user64_long_t)offset;
10415 error = copyout((caddr_t)&base, uap->basep, sizeof(user64_long_t));
10416 } else {
10417 user32_long_t base = (user32_long_t)offset;
10418 error = copyout((caddr_t)&base, uap->basep, sizeof(user32_long_t));
10419 }
10420 *retval = (int)bytesread;
10421 }
10422 return error;
10423 }
10424
10425 int
getdirentries64(__unused struct proc * p,struct getdirentries64_args * uap,user_ssize_t * retval)10426 getdirentries64(__unused struct proc *p, struct getdirentries64_args *uap, user_ssize_t *retval)
10427 {
10428 off_t offset;
10429 ssize_t bytesread;
10430 int error, eofflag;
10431 user_size_t bufsize;
10432
10433 AUDIT_ARG(fd, uap->fd);
10434
10435 /*
10436 * If the buffer is at least GETDIRENTRIES64_EXTENDED_BUFSIZE large,
10437 * then the kernel carves out the last 4 bytes to return extended
10438 * information to userspace (namely whether we reached EOF with this call).
10439 */
10440 if (uap->bufsize >= GETDIRENTRIES64_EXTENDED_BUFSIZE) {
10441 bufsize = uap->bufsize - sizeof(getdirentries64_flags_t);
10442 } else {
10443 bufsize = uap->bufsize;
10444 }
10445
10446 error = getdirentries_common(uap->fd, uap->buf, bufsize,
10447 &bytesread, &offset, &eofflag, VNODE_READDIR_EXTENDED);
10448
10449 if (error == 0) {
10450 *retval = bytesread;
10451 error = copyout((caddr_t)&offset, uap->position, sizeof(off_t));
10452
10453 if (error == 0 && uap->bufsize >= GETDIRENTRIES64_EXTENDED_BUFSIZE) {
10454 getdirentries64_flags_t flags = 0;
10455 if (eofflag) {
10456 flags |= GETDIRENTRIES64_EOF;
10457 }
10458 error = copyout(&flags, (user_addr_t)uap->buf + bufsize,
10459 sizeof(flags));
10460 }
10461 }
10462 return error;
10463 }
10464
10465
10466 /*
10467 * Set the mode mask for creation of filesystem nodes.
10468 * XXX implement xsecurity
10469 */
10470 #define UMASK_NOXSECURITY (void *)1 /* leave existing xsecurity alone */
10471 static int
umask1(proc_t p,int newmask,__unused kauth_filesec_t fsec,int32_t * retval)10472 umask1(proc_t p, int newmask, __unused kauth_filesec_t fsec, int32_t *retval)
10473 {
10474 AUDIT_ARG(mask, newmask);
10475 proc_fdlock(p);
10476 *retval = p->p_fd.fd_cmask;
10477 p->p_fd.fd_cmask = newmask & ALLPERMS;
10478 proc_fdunlock(p);
10479 return 0;
10480 }
10481
10482 /*
10483 * umask_extended: Set the mode mask for creation of filesystem nodes; with extended security (ACL).
10484 *
10485 * Parameters: p Process requesting to set the umask
10486 * uap User argument descriptor (see below)
10487 * retval umask of the process (parameter p)
10488 *
10489 * Indirect: uap->newmask umask to set
10490 * uap->xsecurity ACL to set
10491 *
10492 * Returns: 0 Success
10493 * !0 Not success
10494 *
10495 */
int
umask_extended(proc_t p, struct umask_extended_args *uap, int32_t *retval)
{
	/*
	 * Extended security (uap->xsecurity) is not implemented yet;
	 * the ACL argument is dropped and KAUTH_FILESEC_NONE passed down.
	 */
	return umask1(p, uap->newmask, KAUTH_FILESEC_NONE, retval);
}
10501
int
umask(proc_t p, struct umask_args *uap, int32_t *retval)
{
	/* Plain umask(2): leave any existing extended security alone. */
	return umask1(p, uap->newmask, UMASK_NOXSECURITY, retval);
}
10507
10508 #define REVOKE_MOUNTED_DEVICE_ENTITLEMENT \
10509 "com.apple.private.vfs.revoke-mounted-device"
10510
10511 /*
10512 * Void all references to file by ripping underlying filesystem
10513 * away from vnode.
10514 */
10515 /* ARGSUSED */
int
revoke(proc_t p, struct revoke_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	struct vnode_attr va;
	vfs_context_t ctx = vfs_context_current();
	int error;
	struct nameidata nd;

	/* Look up the target path; a trailing symlink is followed. */
	NDINIT(&nd, LOOKUP, OP_REVOKE, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
	    uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;

	nameidone(&nd);

	/* revoke(2) is only supported on character and block devices. */
	if (!(vnode_ischr(vp) || vnode_isblk(vp))) {
		error = ENOTSUP;
		goto out;
	}

	/* A block device with a file system mounted on it cannot be revoked. */
	if (vnode_isblk(vp) && vnode_ismountedon(vp)) {
		error = EBUSY;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_revoke(ctx, vp);
	if (error) {
		goto out;
	}
#endif

	/* Only the owner of the node or the superuser may revoke it. */
	VATTR_INIT(&va);
	VATTR_WANTED(&va, va_uid);
	if ((error = vnode_getattr(vp, &va, ctx))) {
		goto out;
	}
	if (kauth_cred_getuid(vfs_context_ucred(ctx)) != va.va_uid &&
	    (error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
		goto out;
	}
	/* Only issue the revoke if the device is actually in use or aliased. */
	if (vp->v_usecount > 0 || (vnode_isaliased(vp))) {
		VNOP_REVOKE(vp, REVOKEALL, ctx);
	}
out:
	vnode_put(vp);
	return error;
}
10568
10569
10570 /*
 * HFS/HFS Plus SPECIFIC SYSTEM CALLS
10572 * The following system calls are designed to support features
10573 * which are specific to the HFS & HFS Plus volume formats
10574 */
10575
10576
10577 /*
10578 * Obtain attribute information on objects in a directory while enumerating
10579 * the directory.
10580 */
10581 /* ARGSUSED */
int
getdirentriesattr(proc_t p, struct getdirentriesattr_args *uap, int32_t *retval)
{
	vnode_t vp;
	struct fileproc *fp;
	uio_t auio = NULL;
	int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	uint32_t count = 0, savecount = 0;
	uint32_t newstate = 0;
	int error, eofflag = 0;
	off_t loff = 0;
	struct attrlist attributelist;
	vfs_context_t ctx = vfs_context_current();
	int fd = uap->fd;
	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];
	kauth_action_t action;

	AUDIT_ARG(fd, fd);

	/* Get the attributes into kernel space */
	if ((error = copyin(uap->alist, (caddr_t)&attributelist, sizeof(attributelist)))) {
		return error;
	}
	if ((error = copyin(uap->count, (caddr_t)&count, sizeof(count)))) {
		return error;
	}
	/* Remember the requested count so it can be reset on union traversal. */
	savecount = count;

get_from_fd:
	if ((error = fp_getfvp(p, fd, &fp, &vp))) {
		return error;
	}

	/*
	 * Serialize offset updates on this open file.  If the vnode backing
	 * the fd changed while we were acquiring the offset lock (e.g. a
	 * union-mount traversal replaced it), drop everything and retry the
	 * lookup from the fd.
	 */
	vn_offset_lock(fp->fp_glob);
	if (((vnode_t)fp_get_data(fp)) != vp) {
		vn_offset_unlock(fp->fp_glob);
		file_drop(fd);
		goto get_from_fd;
	}

	/* The descriptor must have been opened for reading. */
	if ((fp->fp_glob->fg_flag & FREAD) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EBADF;
		goto out;
	}


#if CONFIG_MACF
	error = mac_file_check_change_offset(vfs_context_ucred(ctx),
	    fp->fp_glob);
	if (error) {
		goto out;
	}
#endif


	if ((error = vnode_getwithref(vp))) {
		goto out;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

#if CONFIG_UNION_MOUNTS
unionread:
#endif /* CONFIG_UNION_MOUNTS */
	if (vp->v_type != VDIR) {
		(void)vnode_put(vp);
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_readdir(ctx, vp);
	if (error != 0) {
		(void)vnode_put(vp);
		goto out;
	}
#endif /* MAC */

	/* set up the uio structure which will contain the users return buffer */
	loff = fp->fp_glob->fg_offset;
	auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, uap->buffer, uap->buffersize);

	/*
	 * If the only item requested is file names, we can let that past with
	 * just LIST_DIRECTORY. If they want any other attributes, that means
	 * they need SEARCH as well.
	 */
	action = KAUTH_VNODE_LIST_DIRECTORY;
	if ((attributelist.commonattr & ~ATTR_CMN_NAME) ||
	    attributelist.fileattr || attributelist.dirattr) {
		action |= KAUTH_VNODE_SEARCH;
	}

	if ((error = vnode_authorize(vp, NULL, action, ctx)) == 0) {
		/* Believe it or not, uap->options only has 32-bits of valid
		 * info, so truncate before extending again */

		error = VNOP_READDIRATTR(vp, &attributelist, auio, count,
		    (uint32_t)uap->options, &newstate, &eofflag, &count, ctx);
	}

	if (error) {
		(void) vnode_put(vp);
		goto out;
	}

#if CONFIG_UNION_MOUNTS
	/*
	 * If we've got the last entry of a directory in a union mount
	 * then reset the eofflag and pretend there's still more to come.
	 * The next call will again set eofflag and the buffer will be empty,
	 * so traverse to the underlying directory and do the directory
	 * read there.
	 */
	if (eofflag && vp->v_mount->mnt_flag & MNT_UNION) {
		if (uio_resid(auio) < (user_ssize_t) uap->buffersize) { // Got some entries
			eofflag = 0;
		} else { // Empty buffer
			vnode_t uvp;
			if (lookup_traverse_union(vp, &uvp, ctx) == 0) {
				if (vnode_ref_ext(uvp, fp->fp_glob->fg_flag & O_EVTONLY, 0) == 0) {
					fp_set_data(fp, uvp);
					fp->fp_glob->fg_offset = 0; // reset index for new dir
					count = savecount;
					vnode_rele_internal(vp, fp->fp_glob->fg_flag & O_EVTONLY, 0, 0);
					vnode_put(vp);
					vp = uvp;
					goto unionread;
				} else {
					/* could not get a ref, can't replace in fd */
					vnode_put(uvp);
				}
			}
		}
	}
#endif /* CONFIG_UNION_MOUNTS */

	(void)vnode_put(vp);

	if (error) {
		goto out;
	}
	fp->fp_glob->fg_offset = uio_offset(auio); /* should be multiple of dirent, not variable */

	/* Copy the entry count, directory state and base offset out. */
	if ((error = copyout((caddr_t) &count, uap->count, sizeof(count)))) {
		goto out;
	}
	if ((error = copyout((caddr_t) &newstate, uap->newstate, sizeof(newstate)))) {
		goto out;
	}
	if ((error = copyout((caddr_t) &loff, uap->basep, sizeof(loff)))) {
		goto out;
	}

	*retval = eofflag; /* similar to getdirentries */
	error = 0;
out:
	vn_offset_unlock(fp->fp_glob);
	file_drop(fd);
	return error; /* return error earlier, a retval of 0 or 1 now */
} /* end of getdirentriesattr system call */
10745
10746 /*
10747 * Exchange data between two files
10748 */
10749
10750 /* ARGSUSED */
int
exchangedata(__unused proc_t p, struct exchangedata_args *uap, __unused int32_t *retval)
{
	struct nameidata fnd, snd;
	vfs_context_t ctx = vfs_context_current();
	vnode_t fvp;
	vnode_t svp;
	int error;
	u_int32_t nameiflags;
	char *fpath = NULL;
	char *spath = NULL;
	int flen = 0, slen = 0;
	int from_truncated = 0, to_truncated = 0;
#if CONFIG_FSE
	fse_info f_finfo, s_finfo;
#endif

	/* FSOPT_NOFOLLOW suppresses following a trailing symlink. */
	nameiflags = 0;
	if ((uap->options & FSOPT_NOFOLLOW) == 0) {
		nameiflags |= FOLLOW;
	}

	/* Look up the first file. */
	NDINIT(&fnd, LOOKUP, OP_EXCHANGEDATA, nameiflags | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path1, ctx);

	error = namei(&fnd);
	if (error) {
		goto out2;
	}

	nameidone(&fnd);
	fvp = fnd.ni_vp;

	/* Look up the second file. */
	NDINIT(&snd, LOOKUP, OP_EXCHANGEDATA, CN_NBMOUNTLOOK | nameiflags | AUDITVNPATH2,
	    UIO_USERSPACE, uap->path2, ctx);

	error = namei(&snd);
	if (error) {
		vnode_put(fvp);
		goto out2;
	}
	nameidone(&snd);
	svp = snd.ni_vp;

	/*
	 * if the files are the same, return an inval error
	 */
	if (svp == fvp) {
		error = EINVAL;
		goto out;
	}

	/*
	 * if the files are on different volumes, return an error
	 */
	if (svp->v_mount != fvp->v_mount) {
		error = EXDEV;
		goto out;
	}

	/* If they're not files, return an error */
	if ((vnode_isreg(fvp) == 0) || (vnode_isreg(svp) == 0)) {
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_exchangedata(ctx,
	    fvp, svp);
	if (error) {
		goto out;
	}
#endif
	/* The caller needs read and write access to both files. */
	if (((error = vnode_authorize(fvp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0) ||
	    ((error = vnode_authorize(svp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0)) {
		goto out;
	}

	/*
	 * If anyone is interested (fsevents or kauth fileop listeners),
	 * capture both paths and file info before making the exchange.
	 */
	if (
#if CONFIG_FSE
		need_fsevent(FSE_EXCHANGE, fvp) ||
#endif
		kauth_authorize_fileop_has_listeners()) {
		GET_PATH(fpath);
		GET_PATH(spath);

		flen = safe_getpath(fvp, NULL, fpath, MAXPATHLEN, &from_truncated);
		slen = safe_getpath(svp, NULL, spath, MAXPATHLEN, &to_truncated);

#if CONFIG_FSE
		get_fse_info(fvp, &f_finfo, ctx);
		get_fse_info(svp, &s_finfo, ctx);
		if (from_truncated || to_truncated) {
			// set it here since only the f_finfo gets reported up to user space
			f_finfo.mode |= FSE_TRUNCATED_PATH;
		}
#endif
	}
	/* Ok, make the call */
	error = VNOP_EXCHANGE(fvp, svp, 0, ctx);

	if (error == 0) {
		const char *tmpname;

		if (fpath != NULL && spath != NULL) {
			/* call out to allow 3rd party notification of exchangedata.
			 * Ignore result of kauth_authorize_fileop call.
			 */
			kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_EXCHANGE,
			    (uintptr_t)fpath, (uintptr_t)spath);
		}

		/*
		 * The file contents were exchanged but the names were not,
		 * so swap the cached names (and parents, if they differ) to
		 * keep the name cache consistent with on-disk reality.
		 */
		name_cache_lock();

		tmpname = fvp->v_name;
		fvp->v_name = svp->v_name;
		svp->v_name = tmpname;

		if (fvp->v_parent != svp->v_parent) {
			vnode_t tmp;

			tmp = fvp->v_parent;
			fvp->v_parent = svp->v_parent;
			svp->v_parent = tmp;
		}
		name_cache_unlock();

#if CONFIG_FSE
		if (fpath != NULL && spath != NULL) {
			add_fsevent(FSE_EXCHANGE, ctx,
			    FSE_ARG_STRING, flen, fpath,
			    FSE_ARG_FINFO, &f_finfo,
			    FSE_ARG_STRING, slen, spath,
			    FSE_ARG_FINFO, &s_finfo,
			    FSE_ARG_DONE);
		}
#endif
	}

out:
	if (fpath != NULL) {
		RELEASE_PATH(fpath);
	}
	if (spath != NULL) {
		RELEASE_PATH(spath);
	}
	vnode_put(svp);
	vnode_put(fvp);
out2:
	return error;
}
10901
10902 /*
10903 * Return (in MB) the amount of freespace on the given vnode's volume.
10904 */
10905 uint32_t freespace_mb(vnode_t vp);
10906
10907 uint32_t
freespace_mb(vnode_t vp)10908 freespace_mb(vnode_t vp)
10909 {
10910 vfs_update_vfsstat(vp->v_mount, vfs_context_current(), VFS_USER_EVENT);
10911 return (uint32_t)(((uint64_t)vp->v_mount->mnt_vfsstat.f_bavail *
10912 vp->v_mount->mnt_vfsstat.f_bsize) >> 20);
10913 }
10914
10915 #if CONFIG_SEARCHFS
10916
10917 /* ARGSUSED */
10918
10919 int
searchfs(proc_t p,struct searchfs_args * uap,__unused int32_t * retval)10920 searchfs(proc_t p, struct searchfs_args *uap, __unused int32_t *retval)
10921 {
10922 vnode_t vp, tvp;
10923 int i, error = 0;
10924 int fserror = 0;
10925 struct nameidata nd;
10926 struct user64_fssearchblock searchblock;
10927 struct searchstate *state;
10928 struct attrlist *returnattrs;
10929 struct timeval timelimit;
10930 void *searchparams1, *searchparams2;
10931 uio_t auio = NULL;
10932 int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
10933 uint32_t nummatches;
10934 size_t mallocsize;
10935 uint32_t nameiflags;
10936 vfs_context_t ctx = vfs_context_current();
10937 uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];
10938
10939 /* Start by copying in fsearchblock parameter list */
10940 if (IS_64BIT_PROCESS(p)) {
10941 error = copyin(uap->searchblock, (caddr_t) &searchblock, sizeof(searchblock));
10942 timelimit.tv_sec = searchblock.timelimit.tv_sec;
10943 timelimit.tv_usec = searchblock.timelimit.tv_usec;
10944 } else {
10945 struct user32_fssearchblock tmp_searchblock;
10946
10947 error = copyin(uap->searchblock, (caddr_t) &tmp_searchblock, sizeof(tmp_searchblock));
10948 // munge into 64-bit version
10949 searchblock.returnattrs = CAST_USER_ADDR_T(tmp_searchblock.returnattrs);
10950 searchblock.returnbuffer = CAST_USER_ADDR_T(tmp_searchblock.returnbuffer);
10951 searchblock.returnbuffersize = tmp_searchblock.returnbuffersize;
10952 searchblock.maxmatches = tmp_searchblock.maxmatches;
10953 /*
10954 * These casts are safe. We will promote the tv_sec into a 64 bit long if necessary
10955 * from a 32 bit long, and tv_usec is already a signed 32 bit int.
10956 */
10957 timelimit.tv_sec = (__darwin_time_t) tmp_searchblock.timelimit.tv_sec;
10958 timelimit.tv_usec = (__darwin_useconds_t) tmp_searchblock.timelimit.tv_usec;
10959 searchblock.searchparams1 = CAST_USER_ADDR_T(tmp_searchblock.searchparams1);
10960 searchblock.sizeofsearchparams1 = tmp_searchblock.sizeofsearchparams1;
10961 searchblock.searchparams2 = CAST_USER_ADDR_T(tmp_searchblock.searchparams2);
10962 searchblock.sizeofsearchparams2 = tmp_searchblock.sizeofsearchparams2;
10963 searchblock.searchattrs = tmp_searchblock.searchattrs;
10964 }
10965 if (error) {
10966 return error;
10967 }
10968
10969 /* Do a sanity check on sizeofsearchparams1 and sizeofsearchparams2.
10970 */
10971 if (searchblock.sizeofsearchparams1 > SEARCHFS_MAX_SEARCHPARMS ||
10972 searchblock.sizeofsearchparams2 > SEARCHFS_MAX_SEARCHPARMS) {
10973 return EINVAL;
10974 }
10975
10976 /* Now malloc a big bunch of space to hold the search parameters, the attrlists and the search state. */
10977 /* It all has to do into local memory and it's not that big so we might as well put it all together. */
10978 /* Searchparams1 shall be first so we might as well use that to hold the base address of the allocated*/
10979 /* block. */
10980 /* */
10981 /* NOTE: we allocate an extra 8 bytes to account for the difference in size of the searchstate */
10982 /* due to the changes in rdar://problem/12438273. That way if a 3rd party file system */
10983 /* assumes the size is still 556 bytes it will continue to work */
10984
10985 mallocsize = searchblock.sizeofsearchparams1 + searchblock.sizeofsearchparams2 +
10986 sizeof(struct attrlist) + sizeof(struct searchstate) + (2 * sizeof(uint32_t));
10987
10988 searchparams1 = kalloc_data(mallocsize, Z_WAITOK);
10989
10990 /* Now set up the various pointers to the correct place in our newly allocated memory */
10991
10992 searchparams2 = (void *) (((caddr_t) searchparams1) + searchblock.sizeofsearchparams1);
10993 returnattrs = (struct attrlist *) (((caddr_t) searchparams2) + searchblock.sizeofsearchparams2);
10994 state = (struct searchstate *) (((caddr_t) returnattrs) + sizeof(struct attrlist));
10995
10996 /* Now copy in the stuff given our local variables. */
10997
10998 if ((error = copyin(searchblock.searchparams1, searchparams1, searchblock.sizeofsearchparams1))) {
10999 goto freeandexit;
11000 }
11001
11002 if ((error = copyin(searchblock.searchparams2, searchparams2, searchblock.sizeofsearchparams2))) {
11003 goto freeandexit;
11004 }
11005
11006 if ((error = copyin(searchblock.returnattrs, (caddr_t) returnattrs, sizeof(struct attrlist)))) {
11007 goto freeandexit;
11008 }
11009
11010 if ((error = copyin(uap->state, (caddr_t) state, sizeof(struct searchstate)))) {
11011 goto freeandexit;
11012 }
11013
11014 /*
11015 * When searching a union mount, need to set the
11016 * start flag at the first call on each layer to
11017 * reset state for the new volume.
11018 */
11019 if (uap->options & SRCHFS_START) {
11020 state->ss_union_layer = 0;
11021 } else {
11022 uap->options |= state->ss_union_flags;
11023 }
11024 state->ss_union_flags = 0;
11025
11026 /*
11027 * Because searchparams1 and searchparams2 may contain an ATTR_CMN_NAME search parameter,
11028 * which is passed in with an attrreference_t, we need to inspect the buffer manually here.
11029 * The KPI does not provide us the ability to pass in the length of the buffers searchparams1
11030 * and searchparams2. To obviate the need for all searchfs-supporting filesystems to
11031 * validate the user-supplied data offset of the attrreference_t, we'll do it here.
11032 */
11033
11034 if (searchblock.searchattrs.commonattr & ATTR_CMN_NAME) {
11035 attrreference_t* string_ref;
11036 u_int32_t* start_length;
11037 user64_size_t param_length;
11038
11039 /* validate searchparams1 */
11040 param_length = searchblock.sizeofsearchparams1;
11041 /* skip the word that specifies length of the buffer */
11042 start_length = (u_int32_t*) searchparams1;
11043 start_length = start_length + 1;
11044 string_ref = (attrreference_t*) start_length;
11045
11046 /* ensure no negative offsets or too big offsets */
11047 if (string_ref->attr_dataoffset < 0) {
11048 error = EINVAL;
11049 goto freeandexit;
11050 }
11051 if (string_ref->attr_length > MAXPATHLEN) {
11052 error = EINVAL;
11053 goto freeandexit;
11054 }
11055
11056 /* Check for pointer overflow in the string ref */
11057 if (((char*) string_ref + string_ref->attr_dataoffset) < (char*) string_ref) {
11058 error = EINVAL;
11059 goto freeandexit;
11060 }
11061
11062 if (((char*) string_ref + string_ref->attr_dataoffset) > ((char*)searchparams1 + param_length)) {
11063 error = EINVAL;
11064 goto freeandexit;
11065 }
11066 if (((char*)string_ref + string_ref->attr_dataoffset + string_ref->attr_length) > ((char*)searchparams1 + param_length)) {
11067 error = EINVAL;
11068 goto freeandexit;
11069 }
11070 }
11071
11072 /* set up the uio structure which will contain the users return buffer */
11073 auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
11074 uio_addiov(auio, searchblock.returnbuffer, searchblock.returnbuffersize);
11075
11076 nameiflags = 0;
11077 if ((uap->options & FSOPT_NOFOLLOW) == 0) {
11078 nameiflags |= FOLLOW;
11079 }
11080 NDINIT(&nd, LOOKUP, OP_SEARCHFS, nameiflags | AUDITVNPATH1,
11081 UIO_USERSPACE, uap->path, ctx);
11082
11083 error = namei(&nd);
11084 if (error) {
11085 goto freeandexit;
11086 }
11087 vp = nd.ni_vp;
11088 nameidone(&nd);
11089
11090 /*
11091 * Switch to the root vnode for the volume
11092 */
11093 error = VFS_ROOT(vnode_mount(vp), &tvp, ctx);
11094 vnode_put(vp);
11095 if (error) {
11096 goto freeandexit;
11097 }
11098 vp = tvp;
11099
11100 #if CONFIG_UNION_MOUNTS
11101 /*
11102 * If it's a union mount, the path lookup takes
11103 * us to the top layer. But we may need to descend
11104 * to a lower layer. For non-union mounts the layer
11105 * is always zero.
11106 */
11107 for (i = 0; i < (int) state->ss_union_layer; i++) {
11108 if ((vp->v_mount->mnt_flag & MNT_UNION) == 0) {
11109 break;
11110 }
11111 tvp = vp;
11112 vp = vp->v_mount->mnt_vnodecovered;
11113 if (vp == NULL) {
11114 vnode_put(tvp);
11115 error = ENOENT;
11116 goto freeandexit;
11117 }
11118 error = vnode_getwithref(vp);
11119 vnode_put(tvp);
11120 if (error) {
11121 goto freeandexit;
11122 }
11123 }
11124 #endif /* CONFIG_UNION_MOUNTS */
11125
11126 #if CONFIG_MACF
11127 error = mac_vnode_check_searchfs(ctx, vp, returnattrs, &searchblock.searchattrs);
11128 if (error) {
11129 vnode_put(vp);
11130 goto freeandexit;
11131 }
11132 #endif
11133
11134
11135 /*
11136 * If searchblock.maxmatches == 0, then skip the search. This has happened
	 * before and sometimes the underlying code doesn't deal with it well.
11138 */
11139 if (searchblock.maxmatches == 0) {
11140 nummatches = 0;
11141 goto saveandexit;
11142 }
11143
11144 /*
	 * Alright, we have everything we need, so let's make that call.
11146 *
11147 * We keep special track of the return value from the file system:
11148 * EAGAIN is an acceptable error condition that shouldn't keep us
11149 * from copying out any results...
11150 */
11151
11152 fserror = VNOP_SEARCHFS(vp,
11153 searchparams1,
11154 searchparams2,
11155 &searchblock.searchattrs,
11156 (uint32_t)searchblock.maxmatches,
11157 &timelimit,
11158 returnattrs,
11159 &nummatches,
11160 (uint32_t)uap->scriptcode,
11161 (uint32_t)uap->options,
11162 auio,
11163 (struct searchstate *) &state->ss_fsstate,
11164 ctx);
11165
11166 #if CONFIG_UNION_MOUNTS
11167 /*
11168 * If it's a union mount we need to be called again
11169 * to search the mounted-on filesystem.
11170 */
11171 if ((vp->v_mount->mnt_flag & MNT_UNION) && fserror == 0) {
11172 state->ss_union_flags = SRCHFS_START;
11173 state->ss_union_layer++; // search next layer down
11174 fserror = EAGAIN;
11175 }
11176 #endif /* CONFIG_UNION_MOUNTS */
11177
11178 saveandexit:
11179
11180 vnode_put(vp);
11181
11182 /* Now copy out the stuff that needs copying out. That means the number of matches, the
	 * search state. Everything was already put into the return buffer by the vop call. */
11184
11185 if ((error = copyout((caddr_t) state, uap->state, sizeof(struct searchstate))) != 0) {
11186 goto freeandexit;
11187 }
11188
11189 if ((error = suulong(uap->nummatches, (uint64_t)nummatches)) != 0) {
11190 goto freeandexit;
11191 }
11192
11193 error = fserror;
11194
11195 freeandexit:
11196
11197 kfree_data(searchparams1, mallocsize);
11198
11199 return error;
11200 } /* end of searchfs system call */
11201
11202 #else /* CONFIG_SEARCHFS */
11203
/*
 * searchfs stub for kernels built without CONFIG_SEARCHFS:
 * the system call is compiled out, so always report ENOTSUP.
 */
int
searchfs(__unused proc_t p, __unused struct searchfs_args *uap, __unused int32_t *retval)
{
	return ENOTSUP;
}
11209
11210 #endif /* CONFIG_SEARCHFS */
11211
11212
11213 #if CONFIG_DATALESS_FILES
11214
11215 /*
11216 * === Namespace Resolver Up-call Mechanism ===
11217 *
11218 * When I/O is performed to a dataless file or directory (read, write,
11219 * lookup-in, etc.), the file system performs an upcall to the namespace
11220 * resolver (filecoordinationd) to materialize the object.
11221 *
11222 * We need multiple up-calls to be in flight at once, and we need these
11223 * up-calls to be interruptible, thus the following implementation:
11224 *
11225 * => The nspace_resolver_request represents the in-kernel request state.
11226 * It contains a request ID, storage space for the errno code returned
11227 * by filecoordinationd, and flags.
11228 *
11229 * => The request ID is simply a global monotonically incrementing 32-bit
11230 * number. Outstanding requests are stored in a hash table, and the
11231 * hash function is extremely simple.
11232 *
11233 * => When an upcall is to be made to filecoordinationd, a request structure
11234 * is allocated on the stack (it is small, and needs to live only during
11235 * the duration of the call to resolve_nspace_item_ext()). It is
11236 * initialized and inserted into the table. Some backpressure from
 * filecoordinationd is applied by limiting the number of entries that
11238 * can be inserted into the table (and thus limiting the number of
11239 * outstanding requests issued to filecoordinationd); waiting for an
11240 * available slot is interruptible.
11241 *
11242 * => Once the request has been inserted into the table, the up-call is made
11243 * to filecoordinationd via a MiG-generated stub. The up-call returns
11244 * immediately and filecoordinationd processes the request asynchronously.
11245 *
 * => The caller now waits for the request to complete. This is achieved by
11247 * sleeping on the address of the request structure and waiting for
11248 * filecoordinationd to mark the request structure as complete. This
11249 * is an interruptible sleep call; if interrupted, the request structure
11250 * is removed from the table and EINTR is returned to the caller. If
11251 * this occurs, an advisory up-call is made to filecoordinationd with
11252 * the request ID to indicate that the request can be aborted or
11253 * de-prioritized at the discretion of filecoordinationd.
11254 *
11255 * => When filecoordinationd has completed the request, it signals completion
11256 * by writing to the vfs.nspace.complete sysctl node. Only a process
11257 * decorated as a namespace resolver can write to this sysctl node. The
11258 * value is a request ID / errno tuple passed as an array of 2 uint32_t's.
11259 * The request ID is looked up in the table, and if the request is found,
11260 * the error code is stored in the request structure and a wakeup()
11261 * issued on the address of the request structure. If the request is not
11262 * found, we simply drop the completion notification, assuming that the
11263 * caller was interrupted.
11264 *
11265 * => When the waiting thread wakes up, it extracts the error code from the
11266 * request structure, removes the request from the table, and returns the
11267 * error code to the calling function. Fini!
11268 */
11269
/*
 * In-kernel state for one outstanding materialization request to the
 * namespace resolver (filecoordinationd).  Allocated on the requesting
 * thread's stack and linked into the request hash table while the
 * up-call is in flight (see the big comment above).
 */
struct nspace_resolver_request {
	LIST_ENTRY(nspace_resolver_request) r_hashlink; /* hash-bucket linkage */
	vnode_t r_vp;           /* vnode being materialized */
	uint32_t r_req_id;      /* unique request ID (see next_nspace_req_id) */
	int r_resolver_error;   /* errno reported by filecoordinationd */
	int r_flags;            /* RRF_* flags below */
};

#define RRF_COMPLETE 0x0001     /* resolver has completed this request */
11279
11280 static uint32_t
next_nspace_req_id(void)11281 next_nspace_req_id(void)
11282 {
11283 static uint32_t next_req_id;
11284
11285 return OSAddAtomic(1, &next_req_id);
11286 }
11287
#define NSPACE_RESOLVER_REQ_HASHSIZE 32 /* XXX tune */
#define NSPACE_RESOLVER_MAX_OUTSTANDING 256 /* XXX tune */

/* Hash table of outstanding requests, keyed by request ID. */
static LIST_HEAD(nspace_resolver_requesthead,
    nspace_resolver_request) * nspace_resolver_request_hashtbl;
static u_long nspace_resolver_request_hashmask;
/* Number of requests currently in the table (bounded by MAX_OUTSTANDING). */
static u_int nspace_resolver_request_count;
/* True when some thread is sleeping for a free request slot. */
static bool nspace_resolver_request_wait_slot;
static LCK_GRP_DECLARE(nspace_resolver_request_lck_grp, "file namespace resolver");
/* Mutex guarding the table, the count, and the wait flag. */
static LCK_MTX_DECLARE(nspace_resolver_request_hash_mutex,
    &nspace_resolver_request_lck_grp);

#define NSPACE_REQ_LOCK() \
	lck_mtx_lock(&nspace_resolver_request_hash_mutex)
#define NSPACE_REQ_UNLOCK() \
	lck_mtx_unlock(&nspace_resolver_request_hash_mutex)

/* Map a request ID to its hash bucket. */
#define NSPACE_RESOLVER_HASH(req_id) \
	(&nspace_resolver_request_hashtbl[(req_id) & \
	nspace_resolver_request_hashmask])
11308
11309 static struct nspace_resolver_request *
nspace_resolver_req_lookup(uint32_t req_id)11310 nspace_resolver_req_lookup(uint32_t req_id)
11311 {
11312 struct nspace_resolver_requesthead *bucket;
11313 struct nspace_resolver_request *req;
11314
11315 bucket = NSPACE_RESOLVER_HASH(req_id);
11316 LIST_FOREACH(req, bucket, r_hashlink) {
11317 if (req->r_req_id == req_id) {
11318 return req;
11319 }
11320 }
11321
11322 return NULL;
11323 }
11324
11325 static int
nspace_resolver_req_add(struct nspace_resolver_request * req)11326 nspace_resolver_req_add(struct nspace_resolver_request *req)
11327 {
11328 struct nspace_resolver_requesthead *bucket;
11329 int error;
11330
11331 while (nspace_resolver_request_count >=
11332 NSPACE_RESOLVER_MAX_OUTSTANDING) {
11333 nspace_resolver_request_wait_slot = true;
11334 error = msleep(&nspace_resolver_request_count,
11335 &nspace_resolver_request_hash_mutex,
11336 PVFS | PCATCH, "nspacerq", NULL);
11337 if (error) {
11338 return error;
11339 }
11340 }
11341
11342 bucket = NSPACE_RESOLVER_HASH(req->r_req_id);
11343 #if DIAGNOSTIC
11344 assert(nspace_resolver_req_lookup(req->r_req_id) == NULL);
11345 #endif /* DIAGNOSTIC */
11346 LIST_INSERT_HEAD(bucket, req, r_hashlink);
11347 nspace_resolver_request_count++;
11348
11349 return 0;
11350 }
11351
11352 static void
nspace_resolver_req_remove(struct nspace_resolver_request * req)11353 nspace_resolver_req_remove(struct nspace_resolver_request *req)
11354 {
11355 struct nspace_resolver_requesthead *bucket;
11356
11357 bucket = NSPACE_RESOLVER_HASH(req->r_req_id);
11358 #if DIAGNOSTIC
11359 assert(nspace_resolver_req_lookup(req->r_req_id) != NULL);
11360 #endif /* DIAGNOSTIC */
11361 LIST_REMOVE(req, r_hashlink);
11362 nspace_resolver_request_count--;
11363
11364 if (nspace_resolver_request_wait_slot) {
11365 nspace_resolver_request_wait_slot = false;
11366 wakeup(&nspace_resolver_request_count);
11367 }
11368 }
11369
11370 static void
nspace_resolver_req_cancel(uint32_t req_id)11371 nspace_resolver_req_cancel(uint32_t req_id)
11372 {
11373 kern_return_t kr;
11374 mach_port_t mp;
11375
11376 // Failures here aren't fatal -- the cancellation message
11377 // sent to the resolver is merely advisory.
11378
11379 kr = host_get_filecoordinationd_port(host_priv_self(), &mp);
11380 if (kr != KERN_SUCCESS || !IPC_PORT_VALID(mp)) {
11381 return;
11382 }
11383
11384 kr = send_nspace_resolve_cancel(mp, req_id);
11385 if (kr != KERN_SUCCESS) {
11386 os_log_error(OS_LOG_DEFAULT,
11387 "NSPACE send_nspace_resolve_cancel failure: %d", kr);
11388 }
11389
11390 ipc_port_release_send(mp);
11391 }
11392
/*
 * Sleep until filecoordinationd marks the request complete (or the
 * sleep is interrupted), remove the request from the table, and return
 * the resolver's errno from the request structure.
 */
static int
nspace_resolver_req_wait(struct nspace_resolver_request *req)
{
	bool send_cancel_message = false;
	int error;

	NSPACE_REQ_LOCK();

	while ((req->r_flags & RRF_COMPLETE) == 0) {
		error = msleep(req, &nspace_resolver_request_hash_mutex,
		    PVFS | PCATCH, "nspace", NULL);
		/* ERESTART just means "keep waiting"; other errors end the wait. */
		if (error && error != ERESTART) {
			/*
			 * Interrupted: record the error ourselves (the
			 * resolver never got to), and remember to advise
			 * the resolver that it may abort this request.
			 */
			req->r_resolver_error = (error == EINTR) ? EINTR :
			    ETIMEDOUT;
			send_cancel_message = true;
			break;
		}
	}

	/* Done waiting either way; drop the request from the table. */
	nspace_resolver_req_remove(req);

	NSPACE_REQ_UNLOCK();

	/* The advisory cancel up-call is made outside the table lock. */
	if (send_cancel_message) {
		nspace_resolver_req_cancel(req->r_req_id);
	}

	return req->r_resolver_error;
}
11422
/*
 * Record the resolver's result on the request and wake the thread
 * sleeping in nspace_resolver_req_wait().  Callers hold the request
 * table lock (NSPACE_REQ_LOCK).
 */
static void
nspace_resolver_req_mark_complete(
	struct nspace_resolver_request *req,
	int resolver_error)
{
	req->r_resolver_error = resolver_error;
	req->r_flags |= RRF_COMPLETE;
	wakeup(req);
}
11432
/*
 * Completion path: called from the vfs.nspace.complete sysctl handler
 * when filecoordinationd reports that request req_id has finished with
 * the given error.  If orig_gencount is non-zero, also verify that the
 * directory's recursive gencount has not changed during
 * materialization; if it has, report EBUSY to the waiting thread.
 */
static void
nspace_resolver_req_completed(uint32_t req_id, int resolver_error, uint64_t orig_gencount)
{
	struct nspace_resolver_request *req;

	NSPACE_REQ_LOCK();

	// If we don't find the request corresponding to our req_id,
	// just drop the completion signal on the floor; it's likely
	// that the requester interrupted with a signal.

	req = nspace_resolver_req_lookup(req_id);
	if (req) {
		mount_t locked_mp = NULL;

		// Hold the mount's rename lock so the gencount
		// comparison below is stable against concurrent renames.
		locked_mp = req->r_vp->v_mount;
		mount_ref(locked_mp, 0);
		mount_lock_renames(locked_mp);

		//
		// if the resolver isn't already returning an error and we have an
		// orig_gencount, then get an iocount on the request vnode and check
		// that the gencount on req->r_vp has not changed.
		//
		// note: a ref was taken on req->r_vp when the request was created
		// and that ref will be dropped by that thread when it wakes up.
		//
		if (resolver_error == 0 &&
		    orig_gencount != 0 &&
		    vnode_getwithref(req->r_vp) == 0) {
			struct vnode_attr va;
			uint64_t cur_gencount;

			VATTR_INIT(&va);
			VATTR_WANTED(&va, va_recursive_gencount);

			if (vnode_getattr(req->r_vp, &va, vfs_context_kernel()) == 0) {
				cur_gencount = va.va_recursive_gencount;
			} else {
				// Couldn't read the attribute; 0 disables the comparison.
				cur_gencount = 0;
			}

			if (resolver_error == 0 && cur_gencount && orig_gencount && cur_gencount != orig_gencount) {
				printf("nspace.complete: gencount changed! (orig %llu cur %llu)\n", orig_gencount, cur_gencount);

				// this error will be returned to the thread that initiated the
				// materialization of req->r_vp.
				resolver_error = EBUSY;

				// note: we explicitly do not return an error to the caller (i.e.
				// the thread that did the materialization) because they said they
				// don't want one.
			}

			vnode_put(req->r_vp);
		}

		mount_unlock_renames(locked_mp);
		mount_drop(locked_mp, 0);

		nspace_resolver_req_mark_complete(req, resolver_error);
	}

	NSPACE_REQ_UNLOCK();

	return;
}
11500
/* The process currently decorated as the namespace resolver, or NULL. */
static struct proc *nspace_resolver_proc;
11502
11503 static int
nspace_resolver_get_proc_state(struct proc * p,int * is_resolver)11504 nspace_resolver_get_proc_state(struct proc *p, int *is_resolver)
11505 {
11506 *is_resolver = ((p->p_lflag & P_LNSPACE_RESOLVER) &&
11507 p == nspace_resolver_proc) ? 1 : 0;
11508 return 0;
11509 }
11510
11511 static boolean_t vfs_context_is_dataless_resolver(vfs_context_t);
11512
11513 static int
nspace_resolver_set_proc_state(struct proc * p,int is_resolver)11514 nspace_resolver_set_proc_state(struct proc *p, int is_resolver)
11515 {
11516 vfs_context_t ctx = vfs_context_current();
11517 int error = 0;
11518
11519 //
11520 // The system filecoordinationd runs as uid == 0. This also
11521 // has the nice side-effect of filtering out filecoordinationd
11522 // running in the simulator.
11523 //
11524 if (!vfs_context_issuser(ctx) ||
11525 !vfs_context_is_dataless_resolver(ctx)) {
11526 return EPERM;
11527 }
11528
11529 if (is_resolver) {
11530 NSPACE_REQ_LOCK();
11531
11532 if (nspace_resolver_proc == NULL) {
11533 proc_lock(p);
11534 p->p_lflag |= P_LNSPACE_RESOLVER;
11535 proc_unlock(p);
11536 nspace_resolver_proc = p;
11537 } else {
11538 error = EBUSY;
11539 }
11540
11541 NSPACE_REQ_UNLOCK();
11542 } else {
11543 // This is basically just like the exit case.
11544 // nspace_resolver_exited() will verify that the
11545 // process is the resolver, and will clear the
11546 // global.
11547 nspace_resolver_exited(p);
11548 }
11549
11550 return error;
11551 }
11552
11553 static int
nspace_materialization_get_proc_state(struct proc * p,int * is_prevented)11554 nspace_materialization_get_proc_state(struct proc *p, int *is_prevented)
11555 {
11556 if ((p->p_lflag & P_LNSPACE_RESOLVER) != 0 ||
11557 (p->p_vfs_iopolicy &
11558 P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES) == 0) {
11559 *is_prevented = 1;
11560 } else {
11561 *is_prevented = 0;
11562 }
11563 return 0;
11564 }
11565
11566 static int
nspace_materialization_set_proc_state(struct proc * p,int is_prevented)11567 nspace_materialization_set_proc_state(struct proc *p, int is_prevented)
11568 {
11569 if (p->p_lflag & P_LNSPACE_RESOLVER) {
11570 return is_prevented ? 0 : EBUSY;
11571 }
11572
11573 if (is_prevented) {
11574 OSBitAndAtomic16(~((uint16_t)P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES), &p->p_vfs_iopolicy);
11575 } else {
11576 OSBitOrAtomic16((uint16_t)P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES, &p->p_vfs_iopolicy);
11577 }
11578 return 0;
11579 }
11580
11581 static int
nspace_materialization_get_thread_state(int * is_prevented)11582 nspace_materialization_get_thread_state(int *is_prevented)
11583 {
11584 uthread_t ut = current_uthread();
11585
11586 *is_prevented = (ut->uu_flag & UT_NSPACE_NODATALESSFAULTS) ? 1 : 0;
11587 return 0;
11588 }
11589
11590 static int
nspace_materialization_set_thread_state(int is_prevented)11591 nspace_materialization_set_thread_state(int is_prevented)
11592 {
11593 uthread_t ut = current_uthread();
11594
11595 if (is_prevented) {
11596 ut->uu_flag |= UT_NSPACE_NODATALESSFAULTS;
11597 } else {
11598 ut->uu_flag &= ~UT_NSPACE_NODATALESSFAULTS;
11599 }
11600 return 0;
11601 }
11602
11603 /* the vfs.nspace branch */
11604 SYSCTL_NODE(_vfs, OID_AUTO, nspace, CTLFLAG_RW | CTLFLAG_LOCKED, NULL, "vfs nspace hinge");
11605
11606 static int
sysctl_nspace_resolver(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)11607 sysctl_nspace_resolver(__unused struct sysctl_oid *oidp,
11608 __unused void *arg1, __unused int arg2, struct sysctl_req *req)
11609 {
11610 struct proc *p = req->p;
11611 int new_value, old_value, changed = 0;
11612 int error;
11613
11614 error = nspace_resolver_get_proc_state(p, &old_value);
11615 if (error) {
11616 return error;
11617 }
11618
11619 error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
11620 &changed);
11621 if (error == 0 && changed) {
11622 error = nspace_resolver_set_proc_state(p, new_value);
11623 }
11624 return error;
11625 }
11626
11627 /* decorate this process as the dataless file resolver */
11628 SYSCTL_PROC(_vfs_nspace, OID_AUTO, resolver,
11629 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
11630 0, 0, sysctl_nspace_resolver, "I", "");
11631
11632 static int
sysctl_nspace_prevent_materialization(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)11633 sysctl_nspace_prevent_materialization(__unused struct sysctl_oid *oidp,
11634 __unused void *arg1, __unused int arg2, struct sysctl_req *req)
11635 {
11636 struct proc *p = req->p;
11637 int new_value, old_value, changed = 0;
11638 int error;
11639
11640 error = nspace_materialization_get_proc_state(p, &old_value);
11641 if (error) {
11642 return error;
11643 }
11644
11645 error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
11646 &changed);
11647 if (error == 0 && changed) {
11648 error = nspace_materialization_set_proc_state(p, new_value);
11649 }
11650 return error;
11651 }
11652
11653 /* decorate this process as not wanting to materialize dataless files */
11654 SYSCTL_PROC(_vfs_nspace, OID_AUTO, prevent_materialization,
11655 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
11656 0, 0, sysctl_nspace_prevent_materialization, "I", "");
11657
11658 static int
sysctl_nspace_thread_prevent_materialization(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)11659 sysctl_nspace_thread_prevent_materialization(__unused struct sysctl_oid *oidp,
11660 __unused void *arg1, __unused int arg2, struct sysctl_req *req)
11661 {
11662 int new_value, old_value, changed = 0;
11663 int error;
11664
11665 error = nspace_materialization_get_thread_state(&old_value);
11666 if (error) {
11667 return error;
11668 }
11669
11670 error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
11671 &changed);
11672 if (error == 0 && changed) {
11673 error = nspace_materialization_set_thread_state(new_value);
11674 }
11675 return error;
11676 }
11677
11678 /* decorate this thread as not wanting to materialize dataless files */
11679 SYSCTL_PROC(_vfs_nspace, OID_AUTO, thread_prevent_materialization,
11680 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
11681 0, 0, sysctl_nspace_thread_prevent_materialization, "I", "");
11682
/*
 * vfs.nspace.complete handler: the resolver writes a request ID /
 * errno tuple (two uint32_t's), optionally followed by a uint64_t
 * gencount, to report a completed materialization request.  Only the
 * decorated resolver process may write here.
 */
static int
sysctl_nspace_complete(__unused struct sysctl_oid *oidp, __unused void *arg1,
    __unused int arg2, struct sysctl_req *req)
{
	struct proc *p = req->p;
	uint32_t req_status[2] = { 0, 0 };
	uint64_t gencount = 0;
	int error, is_resolver, changed = 0, gencount_changed;

	error = nspace_resolver_get_proc_state(p, &is_resolver);
	if (error) {
		return error;
	}

	if (!is_resolver) {
		return EPERM;
	}

	error = sysctl_io_opaque(req, req_status, sizeof(req_status),
	    &changed);
	if (error) {
		return error;
	}

	// get the gencount if it was passed
	// (NOTE(review): second opaque read presumably consumes trailing
	// bytes after the tuple — confirm against sysctl_io_opaque)
	error = sysctl_io_opaque(req, &gencount, sizeof(gencount),
	    &gencount_changed);
	if (error) {
		gencount = 0;
		// we ignore the error because the gencount was optional
		error = 0;
	}

	/*
	 * req_status[0] is the req_id
	 *
	 * req_status[1] is the errno
	 */
	if (error == 0 && changed) {
		nspace_resolver_req_completed(req_status[0],
		    (int)req_status[1], gencount);
	}
	return error;
}
11727
11728 /* Resolver reports completed reqs here. */
11729 SYSCTL_PROC(_vfs_nspace, OID_AUTO, complete,
11730 CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
11731 0, 0, sysctl_nspace_complete, "-", "");
11732
11733 #endif /* CONFIG_DATALESS_FILES */
11734
11735 #if CONFIG_DATALESS_FILES
11736 #define __no_dataless_unused /* nothing */
11737 #else
11738 #define __no_dataless_unused __unused
11739 #endif
11740
/*
 * Decide whether the given context may materialize dataless files.
 *
 * Returns:
 *   0           materialization may proceed
 *   EDEADLK     materialization is prevented (the conventional error)
 *   EJUSTRETURN caller holds the dataless-manipulation entitlement;
 *               proceed as if the object were not dataless
 */
int
vfs_context_dataless_materialization_is_prevented(
	vfs_context_t const ctx __no_dataless_unused)
{
#if CONFIG_DATALESS_FILES
	proc_t const p = vfs_context_proc(ctx);
	thread_t const t = vfs_context_thread(ctx);
	uthread_t const ut = t ? get_bsdthread_info(t) : NULL;

	/*
	 * Kernel context ==> return EDEADLK, as we would with any random
	 * process decorated as no-materialize.
	 */
	if (ctx == vfs_context_kernel()) {
		return EDEADLK;
	}

	/*
	 * If the process has the dataless-manipulation entitlement,
	 * materialization is prevented, and depending on the kind
	 * of file system operation, things get to proceed as if the
	 * object is not dataless.
	 */
	if (vfs_context_is_dataless_manipulator(ctx)) {
		return EJUSTRETURN;
	}

	/*
	 * Per-thread decorations override any process-wide decorations.
	 * (Foundation uses this, and this overrides even the dataless-
	 * manipulation entitlement so as to make API contracts consistent.)
	 */
	if (ut != NULL) {
		if (ut->uu_flag & UT_NSPACE_NODATALESSFAULTS) {
			return EDEADLK;
		}
		if (ut->uu_flag & UT_NSPACE_FORCEDATALESSFAULTS) {
			return 0;
		}
	}

	/*
	 * If the process's iopolicy specifies that dataless files
	 * can be materialized, then we let it go ahead.
	 */
	if (p->p_vfs_iopolicy & P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES) {
		return 0;
	}
#endif /* CONFIG_DATALESS_FILES */

	/*
	 * The default behavior is to not materialize dataless files;
	 * return to the caller that deadlock was detected.
	 */
	return EDEADLK;
}
11797
/*
 * One-time initialization: allocate the hash table used to track
 * outstanding namespace-resolver requests.
 */
void
nspace_resolver_init(void)
{
#if CONFIG_DATALESS_FILES
	nspace_resolver_request_hashtbl =
	    hashinit(NSPACE_RESOLVER_REQ_HASHSIZE,
	    M_VNODE /* XXX */, &nspace_resolver_request_hashmask);
#endif /* CONFIG_DATALESS_FILES */
}
11807
/*
 * Called when a process exits (and from nspace_resolver_set_proc_state
 * when the role is relinquished).  If the process was the namespace
 * resolver, complete every outstanding request with ETIMEDOUT and
 * clear the resolver decoration so another process can claim it.
 */
void
nspace_resolver_exited(struct proc *p __no_dataless_unused)
{
#if CONFIG_DATALESS_FILES
	struct nspace_resolver_requesthead *bucket;
	struct nspace_resolver_request *req;
	u_long idx;

	NSPACE_REQ_LOCK();

	if ((p->p_lflag & P_LNSPACE_RESOLVER) &&
	    p == nspace_resolver_proc) {
		/* Wake every waiter; the resolver can no longer answer. */
		for (idx = 0; idx <= nspace_resolver_request_hashmask; idx++) {
			bucket = &nspace_resolver_request_hashtbl[idx];
			LIST_FOREACH(req, bucket, r_hashlink) {
				nspace_resolver_req_mark_complete(req,
				    ETIMEDOUT);
			}
		}
		nspace_resolver_proc = NULL;
	}

	NSPACE_REQ_UNLOCK();
#endif /* CONFIG_DATALESS_FILES */
}
11833
/*
 * Materialize a dataless item with no extra arguments; convenience
 * wrapper around resolve_nspace_item_ext().
 */
int
resolve_nspace_item(struct vnode *vp, uint64_t op)
{
	return resolve_nspace_item_ext(vp, op, NULL);
}
11839
11840 #define DATALESS_RESOLVER_ENTITLEMENT \
11841 "com.apple.private.vfs.dataless-resolver"
11842 #define DATALESS_MANIPULATION_ENTITLEMENT \
11843 "com.apple.private.vfs.dataless-manipulation"
11844
11845 #if CONFIG_DATALESS_FILES
11846 /*
11847 * Return TRUE if the vfs context is associated with the dataless
11848 * resolver.
11849 */
11850 static boolean_t
vfs_context_is_dataless_resolver(vfs_context_t ctx __no_dataless_unused)11851 vfs_context_is_dataless_resolver(vfs_context_t ctx __no_dataless_unused)
11852 {
11853 return IOTaskHasEntitlement(vfs_context_task(ctx),
11854 DATALESS_RESOLVER_ENTITLEMENT);
11855 }
11856 #endif /* CONFIG_DATALESS_FILES */
11857
11858 /*
11859 * Return TRUE if the vfs context is associated with a process entitled
11860 * for dataless manipulation.
11861 *
11862 * XXX Arguably belongs in vfs_subr.c, but is here because of the
11863 * complication around CONFIG_DATALESS_FILES.
11864 */
11865 boolean_t
vfs_context_is_dataless_manipulator(vfs_context_t ctx __no_dataless_unused)11866 vfs_context_is_dataless_manipulator(vfs_context_t ctx __no_dataless_unused)
11867 {
11868 #if CONFIG_DATALESS_FILES
11869 task_t task = vfs_context_task(ctx);
11870 return IOTaskHasEntitlement(task, DATALESS_MANIPULATION_ENTITLEMENT) ||
11871 IOTaskHasEntitlement(task, DATALESS_RESOLVER_ENTITLEMENT);
11872 #else
11873 return false;
11874 #endif /* CONFIG_DATALESS_FILES */
11875 }
11876
11877 #if CONFIG_DATALESS_FILES
11878 static void
log_materialization_prevented(vnode_t vp,uint64_t op)11879 log_materialization_prevented(vnode_t vp, uint64_t op)
11880 {
11881 char p_name[MAXCOMLEN + 1];
11882 char *vntype;
11883 proc_selfname(&p_name[0], sizeof(p_name));
11884
11885 if (vp->v_type == VREG) {
11886 vntype = "File";
11887 } else if (vp->v_type == VDIR) {
11888 vntype = "Dir";
11889 } else if (vp->v_type == VLNK) {
11890 vntype = "SymLink";
11891 } else {
11892 vntype = "Other";
11893 }
11894
11895 #if DEVELOPMENT
11896 char *path = NULL;
11897 int len;
11898
11899 path = get_pathbuff();
11900 len = MAXPATHLEN;
11901 if (path) {
11902 vn_getpath(vp, path, &len);
11903 }
11904
11905 os_log_debug(OS_LOG_DEFAULT,
11906 "NSPACE process %s (pid %d) is decorated as no-materialization (op %lld; %s) path: %s",
11907 p_name, proc_selfpid(),
11908 op, vntype, path ? path : "<unknown-path>");
11909 if (path) {
11910 release_pathbuff(path);
11911 }
11912 #else
11913 os_log_debug(OS_LOG_DEFAULT,
11914 "NSPACE process %s (pid %d) is decorated as no-materialization (op %lld; %s)",
11915 p_name, proc_selfpid(),
11916 op, vntype);
11917 #endif
11918 }
11919 #endif /* CONFIG_DATALESS_FILES */
11920
11921 static int
vfs_materialize_item(struct vnode * vp __no_dataless_unused,uint64_t op __no_dataless_unused,int64_t offset __no_dataless_unused,int64_t size __no_dataless_unused,char * lookup_name __no_dataless_unused,size_t const namelen __no_dataless_unused)11922 vfs_materialize_item(
11923 struct vnode *vp __no_dataless_unused,
11924 uint64_t op __no_dataless_unused,
11925 int64_t offset __no_dataless_unused,
11926 int64_t size __no_dataless_unused,
11927 char *lookup_name __no_dataless_unused,
11928 size_t const namelen __no_dataless_unused)
11929 {
11930 #if CONFIG_DATALESS_FILES
11931 struct nspace_resolver_request req;
11932 kern_return_t kern_ret;
11933 mach_port_t mach_port;
11934 char *path = NULL;
11935 vfs_context_t context;
11936 int path_len;
11937 int error;
11938 audit_token_t atoken;
11939
11940 /*
11941 * If this is a snapshot event and the vnode is on a disk image just
11942 * pretend nothing happened since any change to the disk image will
11943 * cause the disk image itself to get backed up and this avoids multi-
11944 * way deadlocks between the snapshot handler and the ever popular
11945 * diskimages-helper process. The variable nspace_allow_virtual_devs
11946 * allows this behavior to be overridden (for use by the Mobile
11947 * TimeMachine testing infrastructure which uses disk images).
11948 */
11949 if (op & NAMESPACE_HANDLER_SNAPSHOT_EVENT) {
11950 os_log_debug(OS_LOG_DEFAULT, "NSPACE SNAPSHOT not handled");
11951 return ENOTSUP;
11952 }
11953
11954 context = vfs_context_current();
11955
11956 error = vfs_context_dataless_materialization_is_prevented(context);
11957 if (error) {
11958 log_materialization_prevented(vp, op);
11959 return error;
11960 }
11961
11962 kern_ret = host_get_filecoordinationd_port(host_priv_self(),
11963 &mach_port);
11964 if (kern_ret != KERN_SUCCESS || !IPC_PORT_VALID(mach_port)) {
11965 os_log_error(OS_LOG_DEFAULT, "NSPACE no port");
11966 /*
11967 * Treat this like being unable to access the backing store
11968 * server.
11969 */
11970 return ETIMEDOUT;
11971 }
11972
11973 path = zalloc(ZV_NAMEI);
11974 path_len = MAXPATHLEN;
11975
11976 error = vn_getpath(vp, path, &path_len);
11977 if (error) {
11978 goto out_release_port;
11979 }
11980
11981 error = vfs_context_copy_audit_token(context, &atoken);
11982 if (error) {
11983 goto out_release_port;
11984 }
11985
11986 req.r_req_id = next_nspace_req_id();
11987 req.r_resolver_error = 0;
11988 req.r_flags = 0;
11989 req.r_vp = vp;
11990
11991 NSPACE_REQ_LOCK();
11992 error = nspace_resolver_req_add(&req);
11993 NSPACE_REQ_UNLOCK();
11994 if (error) {
11995 goto out_release_port;
11996 }
11997
11998 os_log_debug(OS_LOG_DEFAULT, "NSPACE resolve_path call");
11999 if (vp->v_type == VDIR) {
12000 char *tmpname = NULL;
12001
12002 /*
12003 * If the caller provided a lookup_name *and* a name length,
12004 * then we assume the lookup_name is not NUL-terminated.
12005 * Allocate a temporary buffer in this case to provide
12006 * a NUL-terminated path name to the IPC call.
12007 */
12008 if (lookup_name != NULL && namelen != 0) {
12009 if (namelen >= PATH_MAX) {
12010 error = EINVAL;
12011 goto out_release_port;
12012 }
12013 tmpname = zalloc(ZV_NAMEI);
12014 strlcpy(tmpname, lookup_name, namelen + 1);
12015 lookup_name = tmpname;
12016 } else if (lookup_name != NULL) {
12017 /*
12018 * If the caller provided a lookup_name with a
12019 * zero name length, then we assume it's NUL-
12020 * terminated. Verify it has a valid length.
12021 */
12022 if (strlen(lookup_name) >= PATH_MAX) {
12023 error = EINVAL;
12024 goto out_release_port;
12025 }
12026 }
12027
12028 kern_ret = send_vfs_resolve_dir_with_audit_token(mach_port,
12029 req.r_req_id, (uint32_t)(op & 0xffffffff),
12030 lookup_name == NULL ? "" : lookup_name, path, atoken);
12031
12032 if (tmpname != NULL) {
12033 zfree(ZV_NAMEI, tmpname);
12034
12035 /*
12036 * Poison lookup_name rather than reference
12037 * freed memory.
12038 */
12039 lookup_name = NULL;
12040 }
12041 } else {
12042 kern_ret = send_vfs_resolve_file_with_audit_token(mach_port,
12043 req.r_req_id, (uint32_t)(op & 0xffffffff),
12044 offset, size, path, atoken);
12045 }
12046 if (kern_ret != KERN_SUCCESS) {
12047 /*
12048 * Also treat this like being unable to access the backing
12049 * store server.
12050 */
12051 os_log_error(OS_LOG_DEFAULT, "NSPACE resolve failure: %d",
12052 kern_ret);
12053 error = ETIMEDOUT;
12054
12055 NSPACE_REQ_LOCK();
12056 nspace_resolver_req_remove(&req);
12057 NSPACE_REQ_UNLOCK();
12058 goto out_release_port;
12059 }
12060
12061 /*
12062 * Give back the memory we allocated earlier while we wait; we
12063 * no longer need it.
12064 */
12065 zfree(ZV_NAMEI, path);
12066 path = NULL;
12067
12068 /*
12069 * Request has been submitted to the resolver. Now (interruptibly)
12070 * wait for completion. Upon requrn, the request will have been
12071 * removed from the lookup table.
12072 */
12073 error = nspace_resolver_req_wait(&req);
12074
12075 out_release_port:
12076 if (path != NULL) {
12077 zfree(ZV_NAMEI, path);
12078 }
12079 ipc_port_release_send(mach_port);
12080
12081 return error;
12082 #else
12083 return ENOTSUP;
12084 #endif /* CONFIG_DATALESS_FILES */
12085 }
12086
12087 /*
12088 * vfs_materialize_file: Materialize a regular file.
12089 *
12090 * Inputs:
12091 * vp The dataless file to be materialized.
12092 *
12093 * op What kind of operation is being performed:
12094 * -> NAMESPACE_HANDLER_READ_OP
12095 * -> NAMESPACE_HANDLER_WRITE_OP
12096 * -> NAMESPACE_HANDLER_LINK_CREATE
12097 * -> NAMESPACE_HANDLER_DELETE_OP
12098 * -> NAMESPACE_HANDLER_TRUNCATE_OP
12099 * -> NAMESPACE_HANDLER_RENAME_OP
12100 *
12101 * offset offset of I/O for READ or WRITE. Ignored for
12102 * other ops.
12103 *
12104 * size size of I/O for READ or WRITE Ignored for
12105 * other ops.
12106 *
 * If offset or size are -1 for a READ or WRITE, then the resolver should
12108 * consider the range to be unknown.
12109 *
12110 * Upon successful return, the caller may proceed with the operation.
12111 * N.B. the file may still be "dataless" in this case.
12112 */
12113 int
vfs_materialize_file(struct vnode * vp,uint64_t op,int64_t offset,int64_t size)12114 vfs_materialize_file(
12115 struct vnode *vp,
12116 uint64_t op,
12117 int64_t offset,
12118 int64_t size)
12119 {
12120 if (vp->v_type != VREG) {
12121 return EFTYPE;
12122 }
12123 return vfs_materialize_item(vp, op, offset, size, NULL, 0);
12124 }
12125
12126 /*
12127 * vfs_materialize_dir:
12128 *
12129 * Inputs:
12130 * vp The dataless directory to be materialized.
12131 *
12132 * op What kind of operation is being performed:
12133 * -> NAMESPACE_HANDLER_READ_OP
12134 * -> NAMESPACE_HANDLER_WRITE_OP
12135 * -> NAMESPACE_HANDLER_DELETE_OP
12136 * -> NAMESPACE_HANDLER_RENAME_OP
12137 * -> NAMESPACE_HANDLER_LOOKUP_OP
12138 *
12139 * lookup_name Name being looked up for a LOOKUP op. Ignored for
12140 * other ops. May or may not be NUL-terminated; see below.
12141 *
12142 * namelen If non-zero, then lookup_name is assumed to not be NUL-
12143 * terminated and namelen is the number of valid bytes in
12144 * lookup_name. If zero, then lookup_name is assumed to be
12145 * NUL-terminated.
12146 *
12147 * Upon successful return, the caller may proceed with the operation.
12148 * N.B. the directory may still be "dataless" in this case.
12149 */
12150 int
vfs_materialize_dir(struct vnode * vp,uint64_t op,char * lookup_name,size_t namelen)12151 vfs_materialize_dir(
12152 struct vnode *vp,
12153 uint64_t op,
12154 char *lookup_name,
12155 size_t namelen)
12156 {
12157 if (vp->v_type != VDIR) {
12158 return EFTYPE;
12159 }
12160 if (op == NAMESPACE_HANDLER_LOOKUP_OP && lookup_name == NULL) {
12161 return EINVAL;
12162 }
12163 return vfs_materialize_item(vp, op, 0, 0, lookup_name, namelen);
12164 }
12165
/*
 * Legacy namespace-resolution entry point: build the vnode's path and hand
 * it to the resolver daemon via send_nspace_resolve_path(), then wait
 * (interruptibly) for the resolution to complete.  Returns 0 on success,
 * ETIMEDOUT when the resolver port/IPC is unavailable, or another errno.
 * The `arg` parameter is unused.
 */
int
resolve_nspace_item_ext(
	struct vnode *vp __no_dataless_unused,
	uint64_t op __no_dataless_unused,
	void *arg __unused)
{
#if CONFIG_DATALESS_FILES
	int error;
	mach_port_t mp;
	char *path = NULL;
	int path_len;
	kern_return_t kr;
	struct nspace_resolver_request req;

	// only allow namespace events on regular files, directories and symlinks.
	if (vp->v_type != VREG && vp->v_type != VDIR && vp->v_type != VLNK) {
		return EFTYPE;
	}

	//
	// if this is a snapshot event and the vnode is on a
	// disk image just pretend nothing happened since any
	// change to the disk image will cause the disk image
	// itself to get backed up and this avoids multi-way
	// deadlocks between the snapshot handler and the ever
	// popular diskimages-helper process. the variable
	// nspace_allow_virtual_devs allows this behavior to
	// be overridden (for use by the Mobile TimeMachine
	// testing infrastructure which uses disk images)
	//
	if (op & NAMESPACE_HANDLER_SNAPSHOT_EVENT) {
		os_log_debug(OS_LOG_DEFAULT, "NSPACE SNAPSHOT not handled");
		return ENOTSUP;
	}

	// the current context may have opted out of materialization
	error = vfs_context_dataless_materialization_is_prevented(
		vfs_context_current());
	if (error) {
		log_materialization_prevented(vp, op);
		return error;
	}

	kr = host_get_filecoordinationd_port(host_priv_self(), &mp);
	if (kr != KERN_SUCCESS || !IPC_PORT_VALID(mp)) {
		os_log_error(OS_LOG_DEFAULT, "NSPACE no port");
		// Treat this like being unable to access the backing
		// store server.
		return ETIMEDOUT;
	}

	path = zalloc(ZV_NAMEI);
	path_len = MAXPATHLEN;

	error = vn_getpath(vp, path, &path_len);
	if (error == 0) {
		int xxx_rdar44371223; /* XXX Mig bug */
		req.r_req_id = next_nspace_req_id();
		req.r_resolver_error = 0;
		req.r_flags = 0;

		if ((error = vnode_ref(vp)) == 0) { // take a ref so that the vnode doesn't go away
			req.r_vp = vp;
		} else {
			goto out_release_port;
		}

		// enter the request in the global lookup table so the
		// resolver's reply can find it by r_req_id
		NSPACE_REQ_LOCK();
		error = nspace_resolver_req_add(&req);
		NSPACE_REQ_UNLOCK();
		if (error) {
			vnode_rele(req.r_vp);
			goto out_release_port;
		}

		os_log_debug(OS_LOG_DEFAULT, "NSPACE resolve_path call");
		kr = send_nspace_resolve_path(mp, req.r_req_id,
		    proc_getpid(current_proc()), (uint32_t)(op & 0xffffffff),
		    path, &xxx_rdar44371223);
		if (kr != KERN_SUCCESS) {
			// Also treat this like being unable to access
			// the backing store server.
			os_log_error(OS_LOG_DEFAULT,
			    "NSPACE resolve_path failure: %d", kr);
			error = ETIMEDOUT;

			// the send failed, so no reply will ever remove the
			// request from the table; remove it ourselves
			NSPACE_REQ_LOCK();
			nspace_resolver_req_remove(&req);
			NSPACE_REQ_UNLOCK();
			vnode_rele(req.r_vp);
			goto out_release_port;
		}

		// Give back the memory we allocated earlier while
		// we wait; we no longer need it.
		zfree(ZV_NAMEI, path);
		path = NULL;

		// Request has been submitted to the resolver.
		// Now (interruptibly) wait for completion.
		// Upon return, the request will have been removed
		// from the lookup table.
		error = nspace_resolver_req_wait(&req);

		vnode_rele(req.r_vp);
	}

out_release_port:
	if (path != NULL) {
		zfree(ZV_NAMEI, path);
	}
	ipc_port_release_send(mp);

	return error;
#else
	return ENOTSUP;
#endif /* CONFIG_DATALESS_FILES */
}
12283
int
nspace_snapshot_event(__unused vnode_t vp, __unused time_t ctime,
    __unused uint64_t op_type, __unused void *arg)
{
	/*
	 * Stub: all arguments are ignored and success is returned
	 * unconditionally.  NOTE(review): snapshot events appear to be
	 * deliberately no-ops here — confirm against callers before
	 * expecting any side effects.
	 */
	return 0;
}
12290
#if 0
/*
 * Compiled out; retained for reference.  Formats a volfs-style
 * "/.vol/<fsid>/<fileid>" path for the given vnode into `path`, writing
 * the resulting length (including the NUL, per the "+ 1") through `len`.
 * Falls back to a placeholder string and returns -1 when vnode_getattr()
 * fails; returns 0 on success.
 */
static int
build_volfs_path(struct vnode *vp, char *path, int *len)
{
	struct vnode_attr va;
	int ret;

	VATTR_INIT(&va);
	VATTR_WANTED(&va, va_fsid);
	VATTR_WANTED(&va, va_fileid);

	if (vnode_getattr(vp, &va, vfs_context_kernel()) != 0) {
		*len = snprintf(path, *len, "/non/existent/path/because/vnode_getattr/failed") + 1;
		ret = -1;
	} else {
		*len = snprintf(path, *len, "/.vol/%d/%lld", (dev_t)va.va_fsid, va.va_fileid) + 1;
		ret = 0;
	}

	return ret;
}
#endif
12313
12314 static unsigned long
fsctl_bogus_command_compat(unsigned long cmd)12315 fsctl_bogus_command_compat(unsigned long cmd)
12316 {
12317 switch (cmd) {
12318 case IOCBASECMD(FSIOC_SYNC_VOLUME):
12319 return FSIOC_SYNC_VOLUME;
12320 case IOCBASECMD(FSIOC_ROUTEFS_SETROUTEID):
12321 return FSIOC_ROUTEFS_SETROUTEID;
12322 case IOCBASECMD(FSIOC_SET_PACKAGE_EXTS):
12323 return FSIOC_SET_PACKAGE_EXTS;
12324 case IOCBASECMD(FSIOC_SET_FSTYPENAME_OVERRIDE):
12325 return FSIOC_SET_FSTYPENAME_OVERRIDE;
12326 case IOCBASECMD(DISK_CONDITIONER_IOC_GET):
12327 return DISK_CONDITIONER_IOC_GET;
12328 case IOCBASECMD(DISK_CONDITIONER_IOC_SET):
12329 return DISK_CONDITIONER_IOC_SET;
12330 case IOCBASECMD(FSIOC_FIOSEEKHOLE):
12331 return FSIOC_FIOSEEKHOLE;
12332 case IOCBASECMD(FSIOC_FIOSEEKDATA):
12333 return FSIOC_FIOSEEKDATA;
12334 case IOCBASECMD(SPOTLIGHT_IOC_GET_MOUNT_TIME):
12335 return SPOTLIGHT_IOC_GET_MOUNT_TIME;
12336 case IOCBASECMD(SPOTLIGHT_IOC_GET_LAST_MTIME):
12337 return SPOTLIGHT_IOC_GET_LAST_MTIME;
12338 }
12339
12340 return cmd;
12341 }
12342
12343 static int
cas_bsdflags_setattr(vnode_t vp,void * arg,vfs_context_t ctx)12344 cas_bsdflags_setattr(vnode_t vp, void *arg, vfs_context_t ctx)
12345 {
12346 return VNOP_IOCTL(vp, FSIOC_CAS_BSDFLAGS, arg, FWRITE, ctx);
12347 }
12348
static int __attribute__((noinline))
handle_sync_volume(vnode_t vp, vnode_t *arg_vp, caddr_t data, vfs_context_t ctx)
{
	struct vfs_attr vfa;
	mount_t mp = vp->v_mount;
	unsigned arg;
	int error;

	/* record vid of vp so we can drop it below. */
	uint32_t vvid = vp->v_id;

	/*
	 * Then grab mount_iterref so that we can release the vnode.
	 * Without this, a thread may call vnode_iterate_prepare then
	 * get into a deadlock because we've never released the root vp
	 */
	error = mount_iterref(mp, 0);
	if (error) {
		return error;
	}
	/* Hold the vnode (no iocount) while the iocount is dropped. */
	vnode_hold(vp);
	vnode_put(vp);

	/* Translate the user's FSCTL_SYNC_* word into an MNT_* waitfor arg. */
	arg = MNT_NOWAIT;
	if (*(uint32_t*)data & FSCTL_SYNC_WAIT) {
		arg = MNT_WAIT;
	}

	/*
	 * If the filesystem supports multiple filesystems in a
	 * partition (e.g. APFS volumes in a container), it knows
	 * that the waitfor argument to VFS_SYNC are flags.
	 */
	VFSATTR_INIT(&vfa);
	VFSATTR_WANTED(&vfa, f_capabilities);
	if ((vfs_getattr(mp, &vfa, vfs_context_current()) == 0) &&
	    VFSATTR_IS_SUPPORTED(&vfa, f_capabilities) &&
	    ((vfa.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_SHARED_SPACE)) &&
	    ((vfa.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_SHARED_SPACE))) {
		arg |= MNT_VOLUME;
	}

	/* issue the sync for this volume */
	(void)sync_callback(mp, &arg);

	/*
	 * Then release the mount_iterref once we're done syncing; it's not
	 * needed for the VNOP_IOCTL below
	 */
	mount_iterdrop(mp);

	/*
	 * NOTE(review): `arg` holds MNT_* waitfor flags at this point, yet
	 * it is tested against FSCTL_SYNC_FULLSYNC, which is a userland
	 * fsctl option bit.  Presumably this was meant to test the
	 * user-supplied word (*(uint32_t *)data) — confirm intent before
	 * changing, since existing callers may depend on current behavior.
	 */
	if (arg & FSCTL_SYNC_FULLSYNC) {
		/* re-obtain vnode iocount on the root vp, if possible */
		error = vnode_getwithvid(vp, vvid);
		if (error == 0) {
			error = VNOP_IOCTL(vp, F_FULLFSYNC, (caddr_t)NULL, 0, ctx);
			vnode_put(vp);
		}
	}
	vnode_drop(vp);
	/* mark the argument VP as having been released */
	*arg_vp = NULL;
	return error;
}
12413
12414 #if ROUTEFS
12415 static int __attribute__((noinline))
handle_routes(user_addr_t udata)12416 handle_routes(user_addr_t udata)
12417 {
12418 char routepath[MAXPATHLEN];
12419 size_t len = 0;
12420 int error;
12421
12422 if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
12423 return error;
12424 }
12425 bzero(routepath, MAXPATHLEN);
12426 error = copyinstr(udata, &routepath[0], MAXPATHLEN, &len);
12427 if (error) {
12428 return error;
12429 }
12430 error = routefs_kernel_mount(routepath);
12431 return error;
12432 }
12433 #endif
12434
12435 static int __attribute__((noinline))
handle_flags(vnode_t vp,caddr_t data,vfs_context_t ctx)12436 handle_flags(vnode_t vp, caddr_t data, vfs_context_t ctx)
12437 {
12438 struct fsioc_cas_bsdflags *cas = (struct fsioc_cas_bsdflags *)data;
12439 struct vnode_attr va;
12440 int error;
12441
12442 VATTR_INIT(&va);
12443 VATTR_SET(&va, va_flags, cas->new_flags);
12444
12445 error = chflags0(vp, &va, cas_bsdflags_setattr, cas, ctx);
12446
12447 #if CONFIG_FSE
12448 if (error == 0 && cas->expected_flags == cas->actual_flags && need_fsevent(FSE_STAT_CHANGED, vp)) {
12449 add_fsevent(FSE_STAT_CHANGED, ctx, FSE_ARG_VNODE, vp, FSE_ARG_DONE);
12450 }
12451 #endif
12452
12453 return error;
12454 }
12455
12456 static int __attribute__((noinline))
handle_auth(vnode_t vp,u_long cmd,caddr_t data,u_long options,vfs_context_t ctx)12457 handle_auth(vnode_t vp, u_long cmd, caddr_t data, u_long options, vfs_context_t ctx)
12458 {
12459 struct mount *mp = NULL;
12460 errno_t rootauth = 0;
12461
12462 mp = vp->v_mount;
12463
12464 /*
12465 * query the underlying FS and see if it reports something
12466 * sane for this vnode. If volume is authenticated via
12467 * chunklist, leave that for the caller to determine.
12468 */
12469 rootauth = VNOP_IOCTL(vp, cmd, data, (int)options, ctx);
12470
12471 return rootauth;
12472 }
12473
12474 #define SET_PACKAGE_EXTENSION_ENTITLEMENT \
12475 "com.apple.private.kernel.set-package-extensions"
12476
12477 /*
12478 * Make a filesystem-specific control call:
12479 */
12480 /* ARGSUSED */
static int
fsctl_internal(proc_t p, vnode_t *arg_vp, u_long cmd, user_addr_t udata, u_long options, vfs_context_t ctx)
{
	int error = 0;
	boolean_t is64bit;
	u_int size;
#define STK_PARAMS 128
	char stkbuf[STK_PARAMS] = {0};
	caddr_t data, memp;
	vnode_t vp = *arg_vp;

	/* fsctl is not supported on character or block special files. */
	if (vp->v_type == VCHR || vp->v_type == VBLK) {
		return ENOTTY;
	}

	/* Translate legacy (IOCBASECMD-stripped) selectors first. */
	cmd = fsctl_bogus_command_compat(cmd);

	size = IOCPARM_LEN(cmd);
	if (size > IOCPARM_MAX) {
		return EINVAL;
	}

	is64bit = proc_is64bit(p);

	memp = NULL;

	/*
	 * Use the on-stack buffer for small parameter blocks; heap-allocate
	 * (memp) for anything larger, freed at outdrop.
	 */
	if (size > sizeof(stkbuf)) {
		if ((memp = (caddr_t)kalloc_data(size, Z_WAITOK)) == 0) {
			return ENOMEM;
		}
		data = memp;
	} else {
		data = &stkbuf[0];
	};

	if (cmd & IOC_IN) {
		if (size) {
			/* Copy the input parameter block in from user space. */
			error = copyin(udata, data, size);
			if (error) {
				if (memp) {
					kfree_data(memp, size);
				}
				return error;
			}
		} else {
			/* Zero-size IOC_IN: the "argument" is udata itself. */
			if (is64bit) {
				*(user_addr_t *)data = udata;
			} else {
				*(uint32_t *)data = (uint32_t)udata;
			}
		};
	} else if ((cmd & IOC_OUT) && size) {
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		bzero(data, size);
	} else if (cmd & IOC_VOID) {
		if (is64bit) {
			*(user_addr_t *)data = udata;
		} else {
			*(uint32_t *)data = (uint32_t)udata;
		}
	}

	/* Check to see if it's a generic command */
	switch (cmd) {
	case FSIOC_SYNC_VOLUME:
		/* NB: may consume vp and set *arg_vp to NULL. */
		error = handle_sync_volume(vp, arg_vp, data, ctx);
		break;

	case FSIOC_ROUTEFS_SETROUTEID:
#if ROUTEFS
		error = handle_routes(udata);
#endif
		/* When ROUTEFS is not configured, this silently succeeds. */
		break;

	case FSIOC_SET_PACKAGE_EXTS: {
		user_addr_t ext_strings;
		uint32_t num_entries;
		uint32_t max_width;

		if (!IOTaskHasEntitlement(vfs_context_task(ctx),
		    SET_PACKAGE_EXTENSION_ENTITLEMENT)) {
			error = EPERM;
			break;
		}

		if ((is64bit && size != sizeof(user64_package_ext_info))
		    || (is64bit == 0 && size != sizeof(user32_package_ext_info))) {
			// either you're 64-bit and passed a 64-bit struct or
			// you're 32-bit and passed a 32-bit struct. otherwise
			// it's not ok.
			error = EINVAL;
			break;
		}

		if (is64bit) {
			if (sizeof(user64_addr_t) > sizeof(user_addr_t)) {
				assert(((user64_package_ext_info *)data)->strings <= UINT32_MAX);
			}
			ext_strings = (user_addr_t)((user64_package_ext_info *)data)->strings;
			num_entries = ((user64_package_ext_info *)data)->num_entries;
			max_width = ((user64_package_ext_info *)data)->max_width;
		} else {
			ext_strings = CAST_USER_ADDR_T(((user32_package_ext_info *)data)->strings);
			num_entries = ((user32_package_ext_info *)data)->num_entries;
			max_width = ((user32_package_ext_info *)data)->max_width;
		}
		error = set_package_extensions_table(ext_strings, num_entries, max_width);
	}
	break;

	case FSIOC_SET_FSTYPENAME_OVERRIDE:
	{
		/* Root-only: override (or clear) the reported fs type name. */
		if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
			break;
		}
		if (vp->v_mount) {
			mount_lock(vp->v_mount);
			if (data[0] != 0) {
				int i;
				/* Require a NUL within the first MFSTYPENAMELEN bytes. */
				for (i = 0; i < MFSTYPENAMELEN; i++) {
					if (!data[i]) {
						goto continue_copy;
					}
				}
				/*
				 * Getting here means we have a user data string which has no
				 * NULL termination in its first MFSTYPENAMELEN bytes.
				 * This is bogus, let's avoid strlcpy-ing the read data and
				 * return an error.
				 */
				error = EINVAL;
				goto unlock;
continue_copy:
				strlcpy(&vp->v_mount->fstypename_override[0], data, MFSTYPENAMELEN);
				vp->v_mount->mnt_kern_flag |= MNTK_TYPENAME_OVERRIDE;
				/* Special-case flags for read-only "mtmfs" overrides. */
				if (vfs_isrdonly(vp->v_mount) && strcmp(vp->v_mount->fstypename_override, "mtmfs") == 0) {
					vp->v_mount->mnt_kern_flag |= MNTK_EXTENDED_SECURITY;
					vp->v_mount->mnt_kern_flag &= ~MNTK_AUTH_OPAQUE;
				}
			} else {
				/* Empty string: clear any existing override. */
				if (strcmp(vp->v_mount->fstypename_override, "mtmfs") == 0) {
					vp->v_mount->mnt_kern_flag &= ~MNTK_EXTENDED_SECURITY;
				}
				vp->v_mount->mnt_kern_flag &= ~MNTK_TYPENAME_OVERRIDE;
				vp->v_mount->fstypename_override[0] = '\0';
			}
unlock:
			mount_unlock(vp->v_mount);
		}
	}
	break;

	case DISK_CONDITIONER_IOC_GET: {
		error = disk_conditioner_get_info(vp->v_mount, (disk_conditioner_info *)data);
	}
	break;

	case DISK_CONDITIONER_IOC_SET: {
		error = disk_conditioner_set_info(vp->v_mount, (disk_conditioner_info *)data);
	}
	break;

	case FSIOC_CAS_BSDFLAGS:
		error = handle_flags(vp, data, ctx);
		break;

	case FSIOC_FD_ONLY_OPEN_ONCE: {
		/*
		 * Report EBUSY when anyone besides the caller has the vnode
		 * in use (named streams are accounted for separately).
		 */
		error = 0;
		if (vnode_usecount(vp) > 1) {
			vnode_lock_spin(vp);
			if (vp->v_lflag & VL_HASSTREAMS) {
				if (vnode_isinuse_locked(vp, 1, 1)) {
					error = EBUSY;
				}
			} else if (vnode_usecount(vp) > 1) {
				error = EBUSY;
			}
			vnode_unlock(vp);
		}
	}
	break;

	case FSIOC_EVAL_ROOTAUTH:
		error = handle_auth(vp, cmd, data, options, ctx);
		break;

	case FSIOC_TEST_FSE_ACCESS_GRANTED:
		error = test_fse_access_granted(vp, (unsigned long)udata, ctx);
		break;

	default: {
		/* other, known commands shouldn't be passed down here */
		switch (cmd) {
		case F_PUNCHHOLE:
		case F_TRIM_ACTIVE_FILE:
		case F_RDADVISE:
		case F_TRANSCODEKEY:
		case F_GETPROTECTIONLEVEL:
		case F_GETDEFAULTPROTLEVEL:
		case F_MAKECOMPRESSED:
		case F_SET_GREEDY_MODE:
		case F_SETSTATICCONTENT:
		case F_SETIOTYPE:
		case F_SETBACKINGSTORE:
		case F_GETPATH_MTMINFO:
		case APFSIOC_REVERT_TO_SNAPSHOT:
		case FSIOC_FIOSEEKHOLE:
		case FSIOC_FIOSEEKDATA:
		case HFS_GET_BOOT_INFO:
		case HFS_SET_BOOT_INFO:
		case FIOPINSWAP:
		case F_CHKCLEAN:
		case F_FULLFSYNC:
		case F_BARRIERFSYNC:
		case F_FREEZE_FS:
		case F_THAW_FS:
		case FSIOC_KERNEL_ROOTAUTH:
		case FSIOC_GRAFT_FS:
		case FSIOC_UNGRAFT_FS:
		case FSIOC_AUTH_FS:
			error = EINVAL;
			goto outdrop;
		}
		/* Invoke the filesystem-specific code */
		error = VNOP_IOCTL(vp, cmd, data, (int)options, ctx);
	}
	} /* end switch stmt */

	/*
	 * if no errors, copy any data to user. Size was
	 * already set and checked above.
	 */
	if (error == 0 && (cmd & IOC_OUT) && size) {
		error = copyout(data, udata, size);
	}

outdrop:
	if (memp) {
		kfree_data(memp, size);
	}

	return error;
}
12727
12728 /* ARGSUSED */
12729 int
fsctl(proc_t p,struct fsctl_args * uap,__unused int32_t * retval)12730 fsctl(proc_t p, struct fsctl_args *uap, __unused int32_t *retval)
12731 {
12732 int error;
12733 struct nameidata nd;
12734 uint32_t nameiflags;
12735 vnode_t vp = NULL;
12736 vfs_context_t ctx = vfs_context_current();
12737
12738 AUDIT_ARG(cmd, (int)uap->cmd);
12739 AUDIT_ARG(value32, uap->options);
12740 /* Get the vnode for the file we are getting info on: */
12741 nameiflags = 0;
12742 //
12743 // if we come through fsctl() then the file is by definition not open.
12744 // therefore for the FSIOC_FD_ONLY_OPEN_ONCE selector we return an error
12745 // lest the caller mistakenly thinks the only open is their own (but in
12746 // reality it's someone elses).
12747 //
12748 if (uap->cmd == FSIOC_FD_ONLY_OPEN_ONCE) {
12749 return EINVAL;
12750 }
12751 if ((uap->options & FSOPT_NOFOLLOW) == 0) {
12752 nameiflags |= FOLLOW;
12753 }
12754 if (uap->cmd == FSIOC_FIRMLINK_CTL) {
12755 nameiflags |= (CN_FIRMLINK_NOFOLLOW | NOCACHE);
12756 }
12757 NDINIT(&nd, LOOKUP, OP_FSCTL, nameiflags | AUDITVNPATH1,
12758 UIO_USERSPACE, uap->path, ctx);
12759 if ((error = namei(&nd))) {
12760 goto done;
12761 }
12762 vp = nd.ni_vp;
12763 nameidone(&nd);
12764
12765 #if CONFIG_MACF
12766 error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd);
12767 if (error) {
12768 goto done;
12769 }
12770 #endif
12771
12772 error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);
12773
12774 done:
12775 if (vp) {
12776 vnode_put(vp);
12777 }
12778 return error;
12779 }
12780 /* ARGSUSED */
12781 int
ffsctl(proc_t p,struct ffsctl_args * uap,__unused int32_t * retval)12782 ffsctl(proc_t p, struct ffsctl_args *uap, __unused int32_t *retval)
12783 {
12784 int error;
12785 vnode_t vp = NULL;
12786 vfs_context_t ctx = vfs_context_current();
12787 int fd = -1;
12788
12789 AUDIT_ARG(fd, uap->fd);
12790 AUDIT_ARG(cmd, (int)uap->cmd);
12791 AUDIT_ARG(value32, uap->options);
12792
12793 /* Get the vnode for the file we are getting info on: */
12794 if ((error = file_vnode(uap->fd, &vp))) {
12795 return error;
12796 }
12797 fd = uap->fd;
12798 if ((error = vnode_getwithref(vp))) {
12799 file_drop(fd);
12800 return error;
12801 }
12802
12803 #if CONFIG_MACF
12804 if ((error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd))) {
12805 file_drop(fd);
12806 vnode_put(vp);
12807 return error;
12808 }
12809 #endif
12810
12811 error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);
12812
12813 file_drop(fd);
12814
12815 /*validate vp; fsctl_internal() can drop iocount and reset vp to NULL*/
12816 if (vp) {
12817 vnode_put(vp);
12818 }
12819
12820 return error;
12821 }
12822 /* end of fsctl system call */
12823
12824 #define FILESEC_ACCESS_ENTITLEMENT \
12825 "com.apple.private.vfs.filesec-access"
12826
12827 static int
xattr_entitlement_check(const char * attrname,vfs_context_t ctx,bool setting)12828 xattr_entitlement_check(const char *attrname, vfs_context_t ctx, bool setting)
12829 {
12830 if (strcmp(attrname, KAUTH_FILESEC_XATTR) == 0) {
12831 /*
12832 * get: root and tasks with FILESEC_ACCESS_ENTITLEMENT.
12833 * set: only tasks with FILESEC_ACCESS_ENTITLEMENT.
12834 */
12835 if ((!setting && vfs_context_issuser(ctx)) ||
12836 IOTaskHasEntitlement(vfs_context_task(ctx),
12837 FILESEC_ACCESS_ENTITLEMENT)) {
12838 return 0;
12839 }
12840 }
12841
12842 return EPERM;
12843 }
12844
12845 /*
12846 * Retrieve the data of an extended attribute.
12847 */
int
getxattr(proc_t p, struct getxattr_args *uap, user_ssize_t *retval)
{
	vnode_t vp;
	struct nameidata nd;
	char attrname[XATTR_MAXNAMELEN + 1];
	vfs_context_t ctx = vfs_context_current();
	uio_t auio = NULL;
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	size_t attrsize = 0;
	size_t namelen;
	u_int32_t nameiflags;
	int error;
	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];

	/* These option bits are rejected for the path-based call. */
	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	/* Resolve the path (honoring XATTR_NOFOLLOW) to a vnode. */
	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_GETXATTR, nameiflags, spacetype, uap->path, ctx);
	if ((error = namei(&nd))) {
		return error;
	}
	vp = nd.ni_vp;
	nameidone(&nd);

	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
	if (error != 0) {
		goto out;
	}
	/* Protected attributes require root or a special entitlement. */
	if (xattr_protected(attrname) &&
	    (error = xattr_entitlement_check(attrname, ctx, false)) != 0) {
		goto out;
	}
	/*
	 * the specific check for 0xffffffff is a hack to preserve
	 * binary compatibility in K64 with applications that discovered
	 * that passing in a buf pointer and a size of -1 resulted in
	 * just the size of the indicated extended attribute being returned.
	 * this isn't part of the documented behavior, but because of the
	 * original implementation's check for "uap->size > 0", this behavior
	 * was allowed. In K32 that check turned into a signed comparison
	 * even though uap->size is unsigned... in K64, we blow by that
	 * check because uap->size is unsigned and doesn't get sign smeared
	 * in the munger for a 32 bit user app. we also need to add a
	 * check to limit the maximum size of the buffer being passed in...
	 * unfortunately, the underlying filesystems seem to just malloc
	 * the requested size even if the actual extended attribute is tiny.
	 * because that malloc is for kernel wired memory, we have to put a
	 * sane limit on it.
	 *
	 * U32 running on K64 will yield 0x00000000ffffffff for uap->size
	 * U64 running on K64 will yield -1 (64 bits wide)
	 * U32/U64 running on K32 will yield -1 (32 bits wide)
	 */
	if (uap->size == 0xffffffff || uap->size == (size_t)-1) {
		goto no_uio;
	}

	if (uap->value) {
		/* Clamp the request to the maximum attribute size. */
		if (uap->size > (size_t)XATTR_MAXSIZE) {
			uap->size = XATTR_MAXSIZE;
		}

		auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
		    &uio_buf[0], sizeof(uio_buf));
		uio_addiov(auio, uap->value, uap->size);
	}
no_uio:
	/* With a NULL auio, vn_getxattr() only reports the attribute size. */
	error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, ctx);
out:
	vnode_put(vp);

	/* Bytes transferred when a buffer was supplied; size probe otherwise. */
	if (auio) {
		*retval = uap->size - uio_resid(auio);
	} else {
		*retval = (user_ssize_t)attrsize;
	}

	return error;
}
12930
12931 /*
12932 * Retrieve the data of an extended attribute.
12933 */
12934 int
fgetxattr(proc_t p,struct fgetxattr_args * uap,user_ssize_t * retval)12935 fgetxattr(proc_t p, struct fgetxattr_args *uap, user_ssize_t *retval)
12936 {
12937 vnode_t vp;
12938 char attrname[XATTR_MAXNAMELEN + 1];
12939 vfs_context_t ctx = vfs_context_current();
12940 uio_t auio = NULL;
12941 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
12942 size_t attrsize = 0;
12943 size_t namelen;
12944 int error;
12945 uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];
12946
12947 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
12948 return EINVAL;
12949 }
12950
12951 if ((error = file_vnode(uap->fd, &vp))) {
12952 return error;
12953 }
12954 if ((error = vnode_getwithref(vp))) {
12955 file_drop(uap->fd);
12956 return error;
12957 }
12958 error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
12959 if (error != 0) {
12960 goto out;
12961 }
12962 if (xattr_protected(attrname) &&
12963 (error = xattr_entitlement_check(attrname, ctx, false)) != 0) {
12964 goto out;
12965 }
12966 if (uap->value && uap->size > 0) {
12967 if (uap->size > (size_t)XATTR_MAXSIZE) {
12968 uap->size = XATTR_MAXSIZE;
12969 }
12970
12971 auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
12972 &uio_buf[0], sizeof(uio_buf));
12973 uio_addiov(auio, uap->value, uap->size);
12974 }
12975
12976 error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, vfs_context_current());
12977 out:
12978 (void)vnode_put(vp);
12979 file_drop(uap->fd);
12980
12981 if (auio) {
12982 *retval = uap->size - uio_resid(auio);
12983 } else {
12984 *retval = (user_ssize_t)attrsize;
12985 }
12986 return error;
12987 }
12988
/*
 * Heap-allocated scratch state for setxattr(): the nameidata, attribute
 * name, and uio buffer are too large to keep on the kernel stack.
 */
struct setxattr_ctx {
	struct nameidata nd;
	char attrname[XATTR_MAXNAMELEN + 1];
	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];
};
12995
12996 /*
12997 * Set the data of an extended attribute.
12998 */
12999 int
setxattr(proc_t p,struct setxattr_args * uap,int * retval)13000 setxattr(proc_t p, struct setxattr_args *uap, int *retval)
13001 {
13002 vnode_t vp;
13003 vfs_context_t ctx = vfs_context_current();
13004 uio_t auio = NULL;
13005 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
13006 size_t namelen;
13007 u_int32_t nameiflags;
13008 int error;
13009 struct setxattr_ctx *sactx;
13010
13011 if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
13012 return EINVAL;
13013 }
13014
13015 sactx = kalloc_type(struct setxattr_ctx, Z_WAITOK);
13016 if (sactx == NULL) {
13017 return ENOMEM;
13018 }
13019
13020 error = copyinstr(uap->attrname, sactx->attrname, sizeof(sactx->attrname), &namelen);
13021 if (error != 0) {
13022 if (error == EPERM) {
13023 /* if the string won't fit in attrname, copyinstr emits EPERM */
13024 error = ENAMETOOLONG;
13025 }
13026 /* Otherwise return the default error from copyinstr to detect ERANGE, etc */
13027 goto out;
13028 }
13029 if (xattr_protected(sactx->attrname) &&
13030 (error = xattr_entitlement_check(sactx->attrname, ctx, true)) != 0) {
13031 goto out;
13032 }
13033 if (uap->size != 0 && uap->value == 0) {
13034 error = EINVAL;
13035 goto out;
13036 }
13037 if (uap->size > INT_MAX) {
13038 error = E2BIG;
13039 goto out;
13040 }
13041
13042 nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
13043 #if CONFIG_FILE_LEASES
13044 nameiflags |= WANTPARENT;
13045 #endif
13046 NDINIT(&sactx->nd, LOOKUP, OP_SETXATTR, nameiflags, spacetype, uap->path, ctx);
13047 if ((error = namei(&sactx->nd))) {
13048 goto out;
13049 }
13050 vp = sactx->nd.ni_vp;
13051 #if CONFIG_FILE_LEASES
13052 vnode_breakdirlease(sactx->nd.ni_dvp, false, O_WRONLY);
13053 vnode_put(sactx->nd.ni_dvp);
13054 #endif
13055 nameidone(&sactx->nd);
13056
13057 auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
13058 &sactx->uio_buf[0], sizeof(sactx->uio_buf));
13059 uio_addiov(auio, uap->value, uap->size);
13060
13061 error = vn_setxattr(vp, sactx->attrname, auio, uap->options, ctx);
13062 #if CONFIG_FSE
13063 if (error == 0) {
13064 add_fsevent(FSE_XATTR_MODIFIED, ctx,
13065 FSE_ARG_VNODE, vp,
13066 FSE_ARG_DONE);
13067 }
13068 #endif
13069 vnode_put(vp);
13070 out:
13071 kfree_type(struct setxattr_ctx, sactx);
13072 *retval = 0;
13073 return error;
13074 }
13075
13076 /*
13077 * Set the data of an extended attribute.
13078 */
13079 int
fsetxattr(proc_t p,struct fsetxattr_args * uap,int * retval)13080 fsetxattr(proc_t p, struct fsetxattr_args *uap, int *retval)
13081 {
13082 vnode_t vp;
13083 char attrname[XATTR_MAXNAMELEN + 1];
13084 vfs_context_t ctx = vfs_context_current();
13085 uio_t auio = NULL;
13086 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
13087 size_t namelen;
13088 int error;
13089 uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];
13090
13091 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
13092 return EINVAL;
13093 }
13094
13095 error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
13096 if (error != 0) {
13097 if (error == EPERM) {
13098 /* if the string won't fit in attrname, copyinstr emits EPERM */
13099 return ENAMETOOLONG;
13100 }
13101 /* Otherwise return the default error from copyinstr to detect ERANGE, etc */
13102 return error;
13103 }
13104 if (xattr_protected(attrname) &&
13105 (error = xattr_entitlement_check(attrname, ctx, true)) != 0) {
13106 return error;
13107 }
13108 if (uap->size != 0 && uap->value == 0) {
13109 return EINVAL;
13110 }
13111 if (uap->size > INT_MAX) {
13112 return E2BIG;
13113 }
13114 if ((error = file_vnode(uap->fd, &vp))) {
13115 return error;
13116 }
13117 if ((error = vnode_getwithref(vp))) {
13118 file_drop(uap->fd);
13119 return error;
13120 }
13121
13122 #if CONFIG_FILE_LEASES
13123 vnode_breakdirlease(vp, true, O_WRONLY);
13124 #endif
13125
13126 auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
13127 &uio_buf[0], sizeof(uio_buf));
13128 uio_addiov(auio, uap->value, uap->size);
13129
13130 error = vn_setxattr(vp, attrname, auio, uap->options, vfs_context_current());
13131 #if CONFIG_FSE
13132 if (error == 0) {
13133 add_fsevent(FSE_XATTR_MODIFIED, ctx,
13134 FSE_ARG_VNODE, vp,
13135 FSE_ARG_DONE);
13136 }
13137 #endif
13138 vnode_put(vp);
13139 file_drop(uap->fd);
13140 *retval = 0;
13141 return error;
13142 }
13143
13144 /*
13145 * Remove an extended attribute.
13146 * XXX Code duplication here.
13147 */
13148 int
removexattr(proc_t p,struct removexattr_args * uap,int * retval)13149 removexattr(proc_t p, struct removexattr_args *uap, int *retval)
13150 {
13151 vnode_t vp;
13152 struct nameidata nd;
13153 char attrname[XATTR_MAXNAMELEN + 1];
13154 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
13155 vfs_context_t ctx = vfs_context_current();
13156 size_t namelen;
13157 u_int32_t nameiflags;
13158 int error;
13159
13160 if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
13161 return EINVAL;
13162 }
13163
13164 error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
13165 if (error != 0) {
13166 return error;
13167 }
13168 if (xattr_protected(attrname)) {
13169 return EPERM;
13170 }
13171 nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
13172 #if CONFIG_FILE_LEASES
13173 nameiflags |= WANTPARENT;
13174 #endif
13175 NDINIT(&nd, LOOKUP, OP_REMOVEXATTR, nameiflags, spacetype, uap->path, ctx);
13176 if ((error = namei(&nd))) {
13177 return error;
13178 }
13179 vp = nd.ni_vp;
13180 #if CONFIG_FILE_LEASES
13181 vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
13182 vnode_put(nd.ni_dvp);
13183 #endif
13184 nameidone(&nd);
13185
13186 error = vn_removexattr(vp, attrname, uap->options, ctx);
13187 #if CONFIG_FSE
13188 if (error == 0) {
13189 add_fsevent(FSE_XATTR_REMOVED, ctx,
13190 FSE_ARG_VNODE, vp,
13191 FSE_ARG_DONE);
13192 }
13193 #endif
13194 vnode_put(vp);
13195 *retval = 0;
13196 return error;
13197 }
13198
13199 /*
13200 * Remove an extended attribute.
13201 * XXX Code duplication here.
13202 */
13203 int
fremovexattr(__unused proc_t p,struct fremovexattr_args * uap,int * retval)13204 fremovexattr(__unused proc_t p, struct fremovexattr_args *uap, int *retval)
13205 {
13206 vnode_t vp;
13207 char attrname[XATTR_MAXNAMELEN + 1];
13208 size_t namelen;
13209 int error;
13210 #if CONFIG_FSE
13211 vfs_context_t ctx = vfs_context_current();
13212 #endif
13213
13214 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
13215 return EINVAL;
13216 }
13217
13218 error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
13219 if (error != 0) {
13220 return error;
13221 }
13222 if (xattr_protected(attrname)) {
13223 return EPERM;
13224 }
13225 if ((error = file_vnode(uap->fd, &vp))) {
13226 return error;
13227 }
13228 if ((error = vnode_getwithref(vp))) {
13229 file_drop(uap->fd);
13230 return error;
13231 }
13232
13233 #if CONFIG_FILE_LEASES
13234 vnode_breakdirlease(vp, true, O_WRONLY);
13235 #endif
13236
13237 error = vn_removexattr(vp, attrname, uap->options, vfs_context_current());
13238 #if CONFIG_FSE
13239 if (error == 0) {
13240 add_fsevent(FSE_XATTR_REMOVED, ctx,
13241 FSE_ARG_VNODE, vp,
13242 FSE_ARG_DONE);
13243 }
13244 #endif
13245 vnode_put(vp);
13246 file_drop(uap->fd);
13247 *retval = 0;
13248 return error;
13249 }
13250
13251 /*
13252 * Retrieve the list of extended attribute names.
13253 * XXX Code duplication here.
13254 */
13255 int
listxattr(proc_t p,struct listxattr_args * uap,user_ssize_t * retval)13256 listxattr(proc_t p, struct listxattr_args *uap, user_ssize_t *retval)
13257 {
13258 vnode_t vp;
13259 struct nameidata nd;
13260 vfs_context_t ctx = vfs_context_current();
13261 uio_t auio = NULL;
13262 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
13263 size_t attrsize = 0;
13264 u_int32_t nameiflags;
13265 int error;
13266 uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];
13267
13268 if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
13269 return EINVAL;
13270 }
13271
13272 nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
13273 NDINIT(&nd, LOOKUP, OP_LISTXATTR, nameiflags, spacetype, uap->path, ctx);
13274 if ((error = namei(&nd))) {
13275 return error;
13276 }
13277 vp = nd.ni_vp;
13278 nameidone(&nd);
13279 if (uap->namebuf != 0 && uap->bufsize > 0) {
13280 auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ,
13281 &uio_buf[0], sizeof(uio_buf));
13282 uio_addiov(auio, uap->namebuf, uap->bufsize);
13283 }
13284
13285 error = vn_listxattr(vp, auio, &attrsize, uap->options, ctx);
13286
13287 vnode_put(vp);
13288 if (auio) {
13289 *retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
13290 } else {
13291 *retval = (user_ssize_t)attrsize;
13292 }
13293 return error;
13294 }
13295
13296 /*
13297 * Retrieve the list of extended attribute names.
13298 * XXX Code duplication here.
13299 */
13300 int
flistxattr(proc_t p,struct flistxattr_args * uap,user_ssize_t * retval)13301 flistxattr(proc_t p, struct flistxattr_args *uap, user_ssize_t *retval)
13302 {
13303 vnode_t vp;
13304 uio_t auio = NULL;
13305 int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
13306 size_t attrsize = 0;
13307 int error;
13308 uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];
13309
13310 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
13311 return EINVAL;
13312 }
13313
13314 if ((error = file_vnode(uap->fd, &vp))) {
13315 return error;
13316 }
13317 if ((error = vnode_getwithref(vp))) {
13318 file_drop(uap->fd);
13319 return error;
13320 }
13321 if (uap->namebuf != 0 && uap->bufsize > 0) {
13322 auio = uio_createwithbuffer(1, 0, spacetype,
13323 UIO_READ, &uio_buf[0], sizeof(uio_buf));
13324 uio_addiov(auio, uap->namebuf, uap->bufsize);
13325 }
13326
13327 error = vn_listxattr(vp, auio, &attrsize, uap->options, vfs_context_current());
13328
13329 vnode_put(vp);
13330 file_drop(uap->fd);
13331 if (auio) {
13332 *retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
13333 } else {
13334 *retval = (user_ssize_t)attrsize;
13335 }
13336 return error;
13337 }
13338
13339 int
fsgetpath_internal(vfs_context_t ctx,int volfs_id,uint64_t objid,vm_size_t bufsize,caddr_t buf,uint32_t options,int * pathlen)13340 fsgetpath_internal(vfs_context_t ctx, int volfs_id, uint64_t objid,
13341 vm_size_t bufsize, caddr_t buf, uint32_t options, int *pathlen)
13342 {
13343 int error;
13344 struct mount *mp = NULL;
13345 vnode_t vp;
13346 int length;
13347 int bpflags;
13348 /* maximum number of times to retry build_path */
13349 unsigned int retries = 0x10;
13350
13351 if (bufsize > PAGE_SIZE) {
13352 return EINVAL;
13353 }
13354
13355 if (buf == NULL) {
13356 return ENOMEM;
13357 }
13358
13359 retry:
13360 if ((mp = mount_lookupby_volfsid(volfs_id, 1)) == NULL) {
13361 error = ENOTSUP; /* unexpected failure */
13362 return ENOTSUP;
13363 }
13364
13365 #if CONFIG_UNION_MOUNTS
13366 unionget:
13367 #endif /* CONFIG_UNION_MOUNTS */
13368 if (objid == 2) {
13369 struct vfs_attr vfsattr;
13370 int use_vfs_root = TRUE;
13371
13372 VFSATTR_INIT(&vfsattr);
13373 VFSATTR_WANTED(&vfsattr, f_capabilities);
13374 if (!(options & FSOPT_ISREALFSID) &&
13375 vfs_getattr(mp, &vfsattr, vfs_context_kernel()) == 0 &&
13376 VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
13377 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS) &&
13378 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS)) {
13379 use_vfs_root = FALSE;
13380 }
13381 }
13382
13383 if (use_vfs_root) {
13384 error = VFS_ROOT(mp, &vp, ctx);
13385 } else {
13386 error = VFS_VGET(mp, objid, &vp, ctx);
13387 }
13388 } else {
13389 error = VFS_VGET(mp, (ino64_t)objid, &vp, ctx);
13390 }
13391
13392 #if CONFIG_UNION_MOUNTS
13393 if (error == ENOENT && (mp->mnt_flag & MNT_UNION)) {
13394 /*
13395 * If the fileid isn't found and we're in a union
13396 * mount volume, then see if the fileid is in the
13397 * mounted-on volume.
13398 */
13399 struct mount *tmp = mp;
13400 mp = vnode_mount(tmp->mnt_vnodecovered);
13401 vfs_unbusy(tmp);
13402 if (vfs_busy(mp, LK_NOWAIT) == 0) {
13403 goto unionget;
13404 }
13405 } else {
13406 vfs_unbusy(mp);
13407 }
13408 #else
13409 vfs_unbusy(mp);
13410 #endif /* CONFIG_UNION_MOUNTS */
13411
13412 if (error) {
13413 return error;
13414 }
13415
13416 #if CONFIG_MACF
13417 error = mac_vnode_check_fsgetpath(ctx, vp);
13418 if (error) {
13419 vnode_put(vp);
13420 return error;
13421 }
13422 #endif
13423
13424 /* Obtain the absolute path to this vnode. */
13425 bpflags = vfs_context_suser(ctx) ? BUILDPATH_CHECKACCESS : 0;
13426 if (options & FSOPT_NOFIRMLINKPATH) {
13427 bpflags |= BUILDPATH_NO_FIRMLINK;
13428 }
13429 bpflags |= BUILDPATH_CHECK_MOVED;
13430 error = build_path(vp, buf, (int)bufsize, &length, bpflags, ctx);
13431 vnode_put(vp);
13432
13433 if (error) {
13434 /* there was a race building the path, try a few more times */
13435 if (error == EAGAIN) {
13436 --retries;
13437 if (retries > 0) {
13438 goto retry;
13439 }
13440
13441 error = ENOENT;
13442 }
13443 goto out;
13444 }
13445
13446 AUDIT_ARG(text, buf);
13447
13448 if (kdebug_debugid_enabled(VFS_LOOKUP) && length > 0) {
13449 unsigned long path_words[NUMPARMS];
13450 size_t path_len = sizeof(path_words);
13451
13452 if ((size_t)length < path_len) {
13453 memcpy((char *)path_words, buf, length);
13454 memset((char *)path_words + length, 0, path_len - length);
13455
13456 path_len = length;
13457 } else {
13458 memcpy((char *)path_words, buf + (length - path_len), path_len);
13459 }
13460
13461 kdebug_vfs_lookup(path_words, (int)path_len, vp,
13462 KDBG_VFS_LOOKUP_FLAG_LOOKUP);
13463 }
13464
13465 *pathlen = length; /* may be superseded by error */
13466
13467 out:
13468 return error;
13469 }
13470
13471 /*
13472 * Obtain the full pathname of a file system object by id.
13473 */
13474 static int
fsgetpath_extended(user_addr_t buf,user_size_t bufsize,user_addr_t user_fsid,uint64_t objid,uint32_t options,user_ssize_t * retval)13475 fsgetpath_extended(user_addr_t buf, user_size_t bufsize, user_addr_t user_fsid, uint64_t objid,
13476 uint32_t options, user_ssize_t *retval)
13477 {
13478 vfs_context_t ctx = vfs_context_current();
13479 fsid_t fsid;
13480 char *realpath;
13481 int length;
13482 int error;
13483
13484 if (options & ~(FSOPT_NOFIRMLINKPATH | FSOPT_ISREALFSID)) {
13485 return EINVAL;
13486 }
13487
13488 if ((error = copyin(user_fsid, (caddr_t)&fsid, sizeof(fsid)))) {
13489 return error;
13490 }
13491 AUDIT_ARG(value32, fsid.val[0]);
13492 AUDIT_ARG(value64, objid);
13493 /* Restrict output buffer size for now. */
13494
13495 if (bufsize > PAGE_SIZE || bufsize <= 0) {
13496 return EINVAL;
13497 }
13498 realpath = kalloc_data(bufsize, Z_WAITOK | Z_ZERO);
13499 if (realpath == NULL) {
13500 return ENOMEM;
13501 }
13502
13503 error = fsgetpath_internal(ctx, fsid.val[0], objid, bufsize, realpath,
13504 options, &length);
13505
13506 if (error) {
13507 goto out;
13508 }
13509
13510 error = copyout((caddr_t)realpath, buf, length);
13511
13512 *retval = (user_ssize_t)length; /* may be superseded by error */
13513 out:
13514 kfree_data(realpath, bufsize);
13515 return error;
13516 }
13517
13518 int
fsgetpath(__unused proc_t p,struct fsgetpath_args * uap,user_ssize_t * retval)13519 fsgetpath(__unused proc_t p, struct fsgetpath_args *uap, user_ssize_t *retval)
13520 {
13521 return fsgetpath_extended(uap->buf, uap->bufsize, uap->fsid, uap->objid,
13522 0, retval);
13523 }
13524
13525 int
fsgetpath_ext(__unused proc_t p,struct fsgetpath_ext_args * uap,user_ssize_t * retval)13526 fsgetpath_ext(__unused proc_t p, struct fsgetpath_ext_args *uap, user_ssize_t *retval)
13527 {
13528 return fsgetpath_extended(uap->buf, uap->bufsize, uap->fsid, uap->objid,
13529 uap->options, retval);
13530 }
13531
13532 /*
13533 * Common routine to handle various flavors of statfs data heading out
13534 * to user space.
13535 *
13536 * Returns: 0 Success
13537 * EFAULT
13538 */
13539 static int
munge_statfs(struct mount * mp,struct vfsstatfs * sfsp,user_addr_t bufp,int * sizep,boolean_t is_64_bit,boolean_t partial_copy)13540 munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
13541 user_addr_t bufp, int *sizep, boolean_t is_64_bit,
13542 boolean_t partial_copy)
13543 {
13544 int error;
13545 int my_size, copy_size;
13546
13547 if (is_64_bit) {
13548 struct user64_statfs sfs;
13549 my_size = copy_size = sizeof(sfs);
13550 bzero(&sfs, my_size);
13551 sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
13552 sfs.f_type = (short)mp->mnt_vtable->vfc_typenum;
13553 sfs.f_reserved1 = (short)sfsp->f_fssubtype;
13554 sfs.f_bsize = (user64_long_t)sfsp->f_bsize;
13555 sfs.f_iosize = (user64_long_t)sfsp->f_iosize;
13556 sfs.f_blocks = (user64_long_t)sfsp->f_blocks;
13557 sfs.f_bfree = (user64_long_t)sfsp->f_bfree;
13558 sfs.f_bavail = (user64_long_t)sfsp->f_bavail;
13559 sfs.f_files = (user64_long_t)sfsp->f_files;
13560 sfs.f_ffree = (user64_long_t)sfsp->f_ffree;
13561 sfs.f_fsid = sfsp->f_fsid;
13562 sfs.f_owner = sfsp->f_owner;
13563 if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
13564 strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSNAMELEN);
13565 } else {
13566 strlcpy(&sfs.f_fstypename[0], &sfsp->f_fstypename[0], MFSNAMELEN);
13567 }
13568 strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
13569 strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);
13570
13571 if (partial_copy) {
13572 copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
13573 }
13574 error = copyout((caddr_t)&sfs, bufp, copy_size);
13575 } else {
13576 struct user32_statfs sfs;
13577
13578 my_size = copy_size = sizeof(sfs);
13579 bzero(&sfs, my_size);
13580
13581 sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
13582 sfs.f_type = (short)mp->mnt_vtable->vfc_typenum;
13583 sfs.f_reserved1 = (short)sfsp->f_fssubtype;
13584
13585 /*
13586 * It's possible for there to be more than 2^^31 blocks in the filesystem, so we
13587 * have to fudge the numbers here in that case. We inflate the blocksize in order
13588 * to reflect the filesystem size as best we can.
13589 */
13590 if ((sfsp->f_blocks > INT_MAX)
13591 /* Hack for 4061702 . I think the real fix is for Carbon to
13592 * look for some volume capability and not depend on hidden
13593 * semantics agreed between a FS and carbon.
13594 * f_blocks, f_bfree, and f_bavail set to -1 is the trigger
13595 * for Carbon to set bNoVolumeSizes volume attribute.
13596 * Without this the webdavfs files cannot be copied onto
13597 * disk as they look huge. This change should not affect
13598 * XSAN as they should not setting these to -1..
13599 */
13600 && (sfsp->f_blocks != 0xffffffffffffffffULL)
13601 && (sfsp->f_bfree != 0xffffffffffffffffULL)
13602 && (sfsp->f_bavail != 0xffffffffffffffffULL)) {
13603 int shift;
13604
13605 /*
13606 * Work out how far we have to shift the block count down to make it fit.
13607 * Note that it's possible to have to shift so far that the resulting
13608 * blocksize would be unreportably large. At that point, we will clip
13609 * any values that don't fit.
13610 *
13611 * For safety's sake, we also ensure that f_iosize is never reported as
13612 * being smaller than f_bsize.
13613 */
13614 for (shift = 0; shift < 32; shift++) {
13615 if ((sfsp->f_blocks >> shift) <= INT_MAX) {
13616 break;
13617 }
13618 if ((sfsp->f_bsize << (shift + 1)) > INT_MAX) {
13619 break;
13620 }
13621 }
13622 #define __SHIFT_OR_CLIP(x, s) ((((x) >> (s)) > INT_MAX) ? INT_MAX : ((x) >> (s)))
13623 sfs.f_blocks = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_blocks, shift);
13624 sfs.f_bfree = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bfree, shift);
13625 sfs.f_bavail = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bavail, shift);
13626 #undef __SHIFT_OR_CLIP
13627 sfs.f_bsize = (user32_long_t)(sfsp->f_bsize << shift);
13628 sfs.f_iosize = (int)lmax(sfsp->f_iosize, sfsp->f_bsize);
13629 } else {
13630 /* filesystem is small enough to be reported honestly */
13631 sfs.f_bsize = (user32_long_t)sfsp->f_bsize;
13632 sfs.f_iosize = (user32_long_t)sfsp->f_iosize;
13633 sfs.f_blocks = (user32_long_t)sfsp->f_blocks;
13634 sfs.f_bfree = (user32_long_t)sfsp->f_bfree;
13635 sfs.f_bavail = (user32_long_t)sfsp->f_bavail;
13636 }
13637 sfs.f_files = (user32_long_t)sfsp->f_files;
13638 sfs.f_ffree = (user32_long_t)sfsp->f_ffree;
13639 sfs.f_fsid = sfsp->f_fsid;
13640 sfs.f_owner = sfsp->f_owner;
13641 if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
13642 strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSNAMELEN);
13643 } else {
13644 strlcpy(&sfs.f_fstypename[0], &sfsp->f_fstypename[0], MFSNAMELEN);
13645 }
13646 strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
13647 strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);
13648
13649 if (partial_copy) {
13650 copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
13651 }
13652 error = copyout((caddr_t)&sfs, bufp, copy_size);
13653 }
13654
13655 if (sizep != NULL) {
13656 *sizep = my_size;
13657 }
13658 return error;
13659 }
13660
13661 /*
13662 * copy stat structure into user_stat structure.
13663 */
13664 void
munge_user64_stat(struct stat * sbp,struct user64_stat * usbp)13665 munge_user64_stat(struct stat *sbp, struct user64_stat *usbp)
13666 {
13667 bzero(usbp, sizeof(*usbp));
13668
13669 usbp->st_dev = sbp->st_dev;
13670 usbp->st_ino = sbp->st_ino;
13671 usbp->st_mode = sbp->st_mode;
13672 usbp->st_nlink = sbp->st_nlink;
13673 usbp->st_uid = sbp->st_uid;
13674 usbp->st_gid = sbp->st_gid;
13675 usbp->st_rdev = sbp->st_rdev;
13676 #ifndef _POSIX_C_SOURCE
13677 usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
13678 usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
13679 usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
13680 usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
13681 usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
13682 usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
13683 #else
13684 usbp->st_atime = sbp->st_atime;
13685 usbp->st_atimensec = sbp->st_atimensec;
13686 usbp->st_mtime = sbp->st_mtime;
13687 usbp->st_mtimensec = sbp->st_mtimensec;
13688 usbp->st_ctime = sbp->st_ctime;
13689 usbp->st_ctimensec = sbp->st_ctimensec;
13690 #endif
13691 usbp->st_size = sbp->st_size;
13692 usbp->st_blocks = sbp->st_blocks;
13693 usbp->st_blksize = sbp->st_blksize;
13694 usbp->st_flags = sbp->st_flags;
13695 usbp->st_gen = sbp->st_gen;
13696 usbp->st_lspare = sbp->st_lspare;
13697 usbp->st_qspare[0] = sbp->st_qspare[0];
13698 usbp->st_qspare[1] = sbp->st_qspare[1];
13699 }
13700
/*
 * Copy a kernel stat structure into the 32-bit user ABI layout.
 * Wide fields are narrowed with explicit casts; the destination is zeroed
 * first so padding never leaks kernel memory into a copyout.
 */
void
munge_user32_stat(struct stat *sbp, struct user32_stat *usbp)
{
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
	/* Timestamp field names differ between the POSIX and default variants. */
#ifndef _POSIX_C_SOURCE
	usbp->st_atimespec.tv_sec = (user32_time_t)sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = (user32_long_t)sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = (user32_time_t)sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = (user32_long_t)sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = (user32_time_t)sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = (user32_long_t)sbp->st_ctimespec.tv_nsec;
#else
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
13737
13738 /*
13739 * copy stat64 structure into user_stat64 structure.
13740 */
13741 void
munge_user64_stat64(struct stat64 * sbp,struct user64_stat64 * usbp)13742 munge_user64_stat64(struct stat64 *sbp, struct user64_stat64 *usbp)
13743 {
13744 bzero(usbp, sizeof(*usbp));
13745
13746 usbp->st_dev = sbp->st_dev;
13747 usbp->st_ino = sbp->st_ino;
13748 usbp->st_mode = sbp->st_mode;
13749 usbp->st_nlink = sbp->st_nlink;
13750 usbp->st_uid = sbp->st_uid;
13751 usbp->st_gid = sbp->st_gid;
13752 usbp->st_rdev = sbp->st_rdev;
13753 #ifndef _POSIX_C_SOURCE
13754 usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
13755 usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
13756 usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
13757 usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
13758 usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
13759 usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
13760 usbp->st_birthtimespec.tv_sec = sbp->st_birthtimespec.tv_sec;
13761 usbp->st_birthtimespec.tv_nsec = sbp->st_birthtimespec.tv_nsec;
13762 #else
13763 usbp->st_atime = sbp->st_atime;
13764 usbp->st_atimensec = sbp->st_atimensec;
13765 usbp->st_mtime = sbp->st_mtime;
13766 usbp->st_mtimensec = sbp->st_mtimensec;
13767 usbp->st_ctime = sbp->st_ctime;
13768 usbp->st_ctimensec = sbp->st_ctimensec;
13769 usbp->st_birthtime = sbp->st_birthtime;
13770 usbp->st_birthtimensec = sbp->st_birthtimensec;
13771 #endif
13772 usbp->st_size = sbp->st_size;
13773 usbp->st_blocks = sbp->st_blocks;
13774 usbp->st_blksize = sbp->st_blksize;
13775 usbp->st_flags = sbp->st_flags;
13776 usbp->st_gen = sbp->st_gen;
13777 usbp->st_lspare = sbp->st_lspare;
13778 usbp->st_qspare[0] = sbp->st_qspare[0];
13779 usbp->st_qspare[1] = sbp->st_qspare[1];
13780 }
13781
/*
 * Copy a kernel stat64 structure into the 32-bit user stat64 ABI layout.
 * Wide fields are narrowed with explicit casts; the destination is zeroed
 * first so padding never leaks kernel memory into a copyout.
 */
void
munge_user32_stat64(struct stat64 *sbp, struct user32_stat64 *usbp)
{
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
	/* Timestamp field names differ between the POSIX and default variants. */
#ifndef _POSIX_C_SOURCE
	usbp->st_atimespec.tv_sec = (user32_time_t)sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = (user32_long_t)sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = (user32_time_t)sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = (user32_long_t)sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = (user32_time_t)sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = (user32_long_t)sbp->st_ctimespec.tv_nsec;
	usbp->st_birthtimespec.tv_sec = (user32_time_t)sbp->st_birthtimespec.tv_sec;
	usbp->st_birthtimespec.tv_nsec = (user32_long_t)sbp->st_birthtimespec.tv_nsec;
#else
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
	usbp->st_birthtime = sbp->st_birthtime;
	usbp->st_birthtimensec = sbp->st_birthtimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
13822
13823 /*
13824 * Purge buffer cache for simulating cold starts
13825 */
13826 static int
vnode_purge_callback(struct vnode * vp,__unused void * cargs)13827 vnode_purge_callback(struct vnode *vp, __unused void *cargs)
13828 {
13829 ubc_msync(vp, (off_t)0, ubc_getsize(vp), NULL /* off_t *resid_off */, UBC_PUSHALL | UBC_INVALIDATE);
13830
13831 return VNODE_RETURNED;
13832 }
13833
13834 static int
vfs_purge_callback(mount_t mp,__unused void * arg)13835 vfs_purge_callback(mount_t mp, __unused void * arg)
13836 {
13837 vnode_iterate(mp, VNODE_WAIT | VNODE_ITERATE_ALL, vnode_purge_callback, NULL);
13838
13839 return VFS_RETURNED;
13840 }
13841
13842 int
vfs_purge(__unused struct proc * p,__unused struct vfs_purge_args * uap,__unused int32_t * retval)13843 vfs_purge(__unused struct proc *p, __unused struct vfs_purge_args *uap, __unused int32_t *retval)
13844 {
13845 if (!kauth_cred_issuser(kauth_cred_get())) {
13846 return EPERM;
13847 }
13848
13849 vfs_iterate(0 /* flags */, vfs_purge_callback, NULL);
13850
13851 return 0;
13852 }
13853
13854 /*
13855 * gets the vnode associated with the (unnamed) snapshot directory
13856 * for a Filesystem. The snapshot directory vnode is returned with
13857 * an iocount on it.
13858 */
13859 int
vnode_get_snapdir(vnode_t rvp,vnode_t * sdvpp,vfs_context_t ctx)13860 vnode_get_snapdir(vnode_t rvp, vnode_t *sdvpp, vfs_context_t ctx)
13861 {
13862 return VFS_VGET_SNAPDIR(vnode_mount(rvp), sdvpp, ctx);
13863 }
13864
13865 /*
13866 * Get the snapshot vnode.
13867 *
13868 * If successful, the call returns with an iocount on *rvpp ,*sdvpp and
13869 * needs nameidone() on ndp.
13870 *
13871 * If the snapshot vnode exists it is returned in ndp->ni_vp.
13872 *
13873 * If it returns with an error, *rvpp, *sdvpp are NULL and nameidone() is
13874 * not needed.
13875 */
13876 static int
vnode_get_snapshot(int dirfd,vnode_t * rvpp,vnode_t * sdvpp,user_addr_t name,struct nameidata * ndp,int32_t op,__unused enum path_operation pathop,vfs_context_t ctx)13877 vnode_get_snapshot(int dirfd, vnode_t *rvpp, vnode_t *sdvpp,
13878 user_addr_t name, struct nameidata *ndp, int32_t op,
13879 #if !CONFIG_TRIGGERS
13880 __unused
13881 #endif
13882 enum path_operation pathop,
13883 vfs_context_t ctx)
13884 {
13885 int error, i;
13886 caddr_t name_buf;
13887 size_t name_len;
13888 struct vfs_attr vfa;
13889
13890 *sdvpp = NULLVP;
13891 *rvpp = NULLVP;
13892
13893 error = vnode_getfromfd(ctx, dirfd, rvpp);
13894 if (error) {
13895 return error;
13896 }
13897
13898 if (!vnode_isvroot(*rvpp)) {
13899 error = EINVAL;
13900 goto out;
13901 }
13902
13903 /* Make sure the filesystem supports snapshots */
13904 VFSATTR_INIT(&vfa);
13905 VFSATTR_WANTED(&vfa, f_capabilities);
13906 if ((vfs_getattr(vnode_mount(*rvpp), &vfa, ctx) != 0) ||
13907 !VFSATTR_IS_SUPPORTED(&vfa, f_capabilities) ||
13908 !((vfa.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] &
13909 VOL_CAP_INT_SNAPSHOT)) ||
13910 !((vfa.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] &
13911 VOL_CAP_INT_SNAPSHOT))) {
13912 error = ENOTSUP;
13913 goto out;
13914 }
13915
13916 error = vnode_get_snapdir(*rvpp, sdvpp, ctx);
13917 if (error) {
13918 goto out;
13919 }
13920
13921 name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
13922 error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
13923 if (error) {
13924 goto out1;
13925 }
13926
13927 /*
13928 * Some sanity checks- name can't be empty, "." or ".." or have slashes.
13929 * (the length returned by copyinstr includes the terminating NUL)
13930 */
13931 if ((name_len == 1) || (name_len == 2 && name_buf[0] == '.') ||
13932 (name_len == 3 && name_buf[0] == '.' && name_buf[1] == '.')) {
13933 error = EINVAL;
13934 goto out1;
13935 }
13936 for (i = 0; i < (int)name_len && name_buf[i] != '/'; i++) {
13937 ;
13938 }
13939 if (i < (int)name_len) {
13940 error = EINVAL;
13941 goto out1;
13942 }
13943
13944 #if CONFIG_MACF
13945 if (op == CREATE) {
13946 error = mac_mount_check_snapshot_create(ctx, vnode_mount(*rvpp),
13947 name_buf);
13948 } else if (op == DELETE) {
13949 error = mac_mount_check_snapshot_delete(ctx, vnode_mount(*rvpp),
13950 name_buf);
13951 }
13952 if (error) {
13953 goto out1;
13954 }
13955 #endif
13956
13957 /* Check if the snapshot already exists ... */
13958 NDINIT(ndp, op, pathop, USEDVP | NOCACHE | AUDITVNPATH1,
13959 UIO_SYSSPACE, CAST_USER_ADDR_T(name_buf), ctx);
13960 ndp->ni_dvp = *sdvpp;
13961
13962 error = namei(ndp);
13963 out1:
13964 zfree(ZV_NAMEI, name_buf);
13965 out:
13966 if (error) {
13967 if (*sdvpp) {
13968 vnode_put(*sdvpp);
13969 *sdvpp = NULLVP;
13970 }
13971 if (*rvpp) {
13972 vnode_put(*rvpp);
13973 *rvpp = NULLVP;
13974 }
13975 }
13976 return error;
13977 }
13978
13979 /*
13980 * create a filesystem snapshot (for supporting filesystems)
13981 *
13982 * A much simplified version of openat(dirfd, name, O_CREAT | O_EXCL)
13983 * We get to the (unnamed) snapshot directory vnode and create the vnode
13984 * for the snapshot in it.
13985 *
13986 * Restrictions:
13987 *
13988 * a) Passed in name for snapshot cannot have slashes.
13989 * b) name can't be "." or ".."
13990 *
13991 * Since this requires superuser privileges, vnode_authorize calls are not
13992 * made.
13993 */
13994 static int __attribute__((noinline))
snapshot_create(int dirfd,user_addr_t name,__unused uint32_t flags,vfs_context_t ctx)13995 snapshot_create(int dirfd, user_addr_t name, __unused uint32_t flags,
13996 vfs_context_t ctx)
13997 {
13998 vnode_t rvp, snapdvp;
13999 int error;
14000 struct nameidata *ndp;
14001
14002 ndp = kalloc_type(struct nameidata, Z_WAITOK);
14003
14004 error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, ndp, CREATE,
14005 OP_LINK, ctx);
14006 if (error) {
14007 goto out;
14008 }
14009
14010 if (ndp->ni_vp) {
14011 vnode_put(ndp->ni_vp);
14012 error = EEXIST;
14013 } else {
14014 struct vnode_attr *vap;
14015 vnode_t vp = NULLVP;
14016
14017 vap = kalloc_type(struct vnode_attr, Z_WAITOK);
14018
14019 VATTR_INIT(vap);
14020 VATTR_SET(vap, va_type, VREG);
14021 VATTR_SET(vap, va_mode, 0);
14022
14023 error = vn_create(snapdvp, &vp, ndp, vap,
14024 VN_CREATE_NOAUTH | VN_CREATE_NOINHERIT, 0, NULL, ctx);
14025 if (!error && vp) {
14026 vnode_put(vp);
14027 }
14028
14029 kfree_type(struct vnode_attr, vap);
14030 }
14031
14032 nameidone(ndp);
14033 vnode_put(snapdvp);
14034 vnode_put(rvp);
14035 out:
14036 kfree_type(struct nameidata, ndp);
14037
14038 return error;
14039 }
14040
14041 /*
14042 * Delete a Filesystem snapshot
14043 *
14044 * get the vnode for the unnamed snapshot directory and the snapshot and
14045 * delete the snapshot.
14046 */
14047 static int __attribute__((noinline))
snapshot_delete(int dirfd,user_addr_t name,__unused uint32_t flags,vfs_context_t ctx)14048 snapshot_delete(int dirfd, user_addr_t name, __unused uint32_t flags,
14049 vfs_context_t ctx)
14050 {
14051 vnode_t rvp, snapdvp;
14052 int error;
14053 struct nameidata *ndp;
14054
14055 ndp = kalloc_type(struct nameidata, Z_WAITOK);
14056
14057 error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, ndp, DELETE,
14058 OP_UNLINK, ctx);
14059 if (error) {
14060 goto out;
14061 }
14062
14063 error = VNOP_REMOVE(snapdvp, ndp->ni_vp, &ndp->ni_cnd,
14064 VNODE_REMOVE_SKIP_NAMESPACE_EVENT, ctx);
14065
14066 vnode_put(ndp->ni_vp);
14067 nameidone(ndp);
14068 vnode_put(snapdvp);
14069 vnode_put(rvp);
14070 out:
14071 kfree_type(struct nameidata, ndp);
14072
14073 return error;
14074 }
14075
14076 /*
14077 * Revert a filesystem to a snapshot
14078 *
14079 * Marks the filesystem to revert to the given snapshot on next mount.
14080 */
14081 static int __attribute__((noinline))
snapshot_revert(int dirfd,user_addr_t name,__unused uint32_t flags,vfs_context_t ctx)14082 snapshot_revert(int dirfd, user_addr_t name, __unused uint32_t flags,
14083 vfs_context_t ctx)
14084 {
14085 int error;
14086 vnode_t rvp;
14087 mount_t mp;
14088 struct fs_snapshot_revert_args revert_data;
14089 struct componentname cnp;
14090 caddr_t name_buf;
14091 size_t name_len;
14092
14093 error = vnode_getfromfd(ctx, dirfd, &rvp);
14094 if (error) {
14095 return error;
14096 }
14097 mp = vnode_mount(rvp);
14098
14099 name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
14100 error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
14101 if (error) {
14102 zfree(ZV_NAMEI, name_buf);
14103 vnode_put(rvp);
14104 return error;
14105 }
14106
14107 #if CONFIG_MACF
14108 error = mac_mount_check_snapshot_revert(ctx, mp, name_buf);
14109 if (error) {
14110 zfree(ZV_NAMEI, name_buf);
14111 vnode_put(rvp);
14112 return error;
14113 }
14114 #endif
14115
14116 /*
14117 * Grab mount_iterref so that we can release the vnode,
14118 * since VFSIOC_REVERT_SNAPSHOT could conceivably cause a sync.
14119 */
14120 error = mount_iterref(mp, 0);
14121 vnode_put(rvp);
14122 if (error) {
14123 zfree(ZV_NAMEI, name_buf);
14124 return error;
14125 }
14126
14127 memset(&cnp, 0, sizeof(cnp));
14128 cnp.cn_pnbuf = (char *)name_buf;
14129 cnp.cn_nameiop = LOOKUP;
14130 cnp.cn_flags = ISLASTCN | HASBUF;
14131 cnp.cn_pnlen = MAXPATHLEN;
14132 cnp.cn_nameptr = cnp.cn_pnbuf;
14133 cnp.cn_namelen = (int)name_len;
14134 revert_data.sr_cnp = &cnp;
14135
14136 error = VFS_IOCTL(mp, VFSIOC_REVERT_SNAPSHOT, (caddr_t)&revert_data, 0, ctx);
14137 mount_iterdrop(mp);
14138 zfree(ZV_NAMEI, name_buf);
14139
14140 if (error) {
14141 /* If there was any error, try again using VNOP_IOCTL */
14142
14143 vnode_t snapdvp;
14144 struct nameidata namend;
14145
14146 error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, &namend, LOOKUP,
14147 OP_LOOKUP, ctx);
14148 if (error) {
14149 return error;
14150 }
14151
14152
14153 error = VNOP_IOCTL(namend.ni_vp, APFSIOC_REVERT_TO_SNAPSHOT, (caddr_t) NULL,
14154 0, ctx);
14155
14156 vnode_put(namend.ni_vp);
14157 nameidone(&namend);
14158 vnode_put(snapdvp);
14159 vnode_put(rvp);
14160 }
14161
14162 return error;
14163 }
14164
14165 /*
14166 * rename a Filesystem snapshot
14167 *
14168 * get the vnode for the unnamed snapshot directory and the snapshot and
14169 * rename the snapshot. This is a very specialised (and simple) case of
14170 * rename(2) (which has to deal with a lot more complications). It differs
14171 * slightly from rename(2) in that EEXIST is returned if the new name exists.
14172 */
static int __attribute__((noinline))
snapshot_rename(int dirfd, user_addr_t old, user_addr_t new,
    __unused uint32_t flags, vfs_context_t ctx)
{
	vnode_t rvp, snapdvp;
	int error, i;
	caddr_t newname_buf;
	size_t name_len;
	vnode_t fvp;
	struct nameidata *fromnd, *tond;
	/* carving out a chunk for structs that are too big to be on stack. */
	struct {
		struct nameidata from_node;
		struct nameidata to_node;
	} * __rename_data;

	__rename_data = kalloc_type(typeof(*__rename_data), Z_WAITOK);
	fromnd = &__rename_data->from_node;
	tond = &__rename_data->to_node;

	/*
	 * Look up the existing snapshot with DELETE/OP_UNLINK so the
	 * from-side componentname is set up as a rename source.
	 */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, old, fromnd, DELETE,
	    OP_UNLINK, ctx);
	if (error) {
		goto out;
	}
	fvp = fromnd->ni_vp;

	newname_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	error = copyinstr(new, newname_buf, MAXPATHLEN, &name_len);
	if (error) {
		goto out1;
	}

	/*
	 * Some sanity checks- new name can't be empty, "." or ".." or have
	 * slashes.
	 * (the length returned by copyinstr includes the terminating NUL)
	 *
	 * The FS rename VNOP is supposed to handle this but we'll pick it
	 * off here itself.
	 */
	if ((name_len == 1) || (name_len == 2 && newname_buf[0] == '.') ||
	    (name_len == 3 && newname_buf[0] == '.' && newname_buf[1] == '.')) {
		error = EINVAL;
		goto out1;
	}
	/* Scan for '/'; if the loop stops before name_len, one was found. */
	for (i = 0; i < (int)name_len && newname_buf[i] != '/'; i++) {
		;
	}
	if (i < (int)name_len) {
		error = EINVAL;
		goto out1;
	}

#if CONFIG_MACF
	/* Renaming to a new name is treated as creating that snapshot name. */
	error = mac_mount_check_snapshot_create(ctx, vnode_mount(rvp),
	    newname_buf);
	if (error) {
		goto out1;
	}
#endif

	/* Look up the destination name relative to the snapshot directory. */
	NDINIT(tond, RENAME, OP_RENAME, USEDVP | NOCACHE | AUDITVNPATH2,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(newname_buf), ctx);
	tond->ni_dvp = snapdvp;

	error = namei(tond);
	if (error) {
		goto out2;
	} else if (tond->ni_vp) {
		/*
		 * snapshot rename behaves differently than rename(2) - if the
		 * new name exists, EEXIST is returned.
		 */
		vnode_put(tond->ni_vp);
		error = EEXIST;
		goto out2;
	}

	/* Both source and destination live in the same snapshot directory. */
	error = VNOP_RENAME(snapdvp, fvp, &fromnd->ni_cnd, snapdvp, NULLVP,
	    &tond->ni_cnd, ctx);

	/* Cleanup labels unwind in reverse order of acquisition. */
out2:
	nameidone(tond);
out1:
	zfree(ZV_NAMEI, newname_buf);
	vnode_put(fvp);
	vnode_put(snapdvp);
	vnode_put(rvp);
	nameidone(fromnd);
out:
	kfree_type(typeof(*__rename_data), __rename_data);
	return error;
}
14267
14268 /*
14269 * Mount a Filesystem snapshot
14270 *
14271 * get the vnode for the unnamed snapshot directory and the snapshot and
14272 * mount the snapshot.
14273 */
static int __attribute__((noinline))
snapshot_mount(int dirfd, user_addr_t name, user_addr_t directory,
    __unused user_addr_t mnt_data, __unused uint32_t flags, vfs_context_t ctx)
{
	mount_t mp;
	vnode_t rvp, snapdvp, snapvp, vp, pvp;
	struct fs_snapshot_mount_args smnt_data;
	int error;
	struct nameidata *snapndp, *dirndp;
	/* carving out a chunk for structs that are too big to be on stack. */
	struct {
		struct nameidata snapnd;
		struct nameidata dirnd;
	} * __snapshot_mount_data;

	__snapshot_mount_data = kalloc_type(typeof(*__snapshot_mount_data), Z_WAITOK);
	snapndp = &__snapshot_mount_data->snapnd;
	dirndp = &__snapshot_mount_data->dirnd;

	/* Resolve the snapshot itself (must already exist: LOOKUP). */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, snapndp, LOOKUP,
	    OP_LOOKUP, ctx);
	if (error) {
		goto out;
	}

	snapvp = snapndp->ni_vp;
	/* Bail if the source mount has gone away underneath us. */
	if (!vnode_mount(rvp) || (vnode_mount(rvp) == dead_mountp)) {
		error = EIO;
		goto out1;
	}

	/* Get the vnode to be covered */
	NDINIT(dirndp, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
	    UIO_USERSPACE, directory, ctx);
	error = namei(dirndp);
	if (error) {
		goto out1;
	}

	vp = dirndp->ni_vp;
	pvp = dirndp->ni_dvp;
	mp = vnode_mount(rvp);

	/* Don't allow mounting a snapshot over the root filesystem's root. */
	if ((vp->v_flag & VROOT) && (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
		error = EINVAL;
		goto out2;
	}

#if CONFIG_MACF
	error = mac_mount_check_snapshot_mount(ctx, rvp, vp, &dirndp->ni_cnd, snapndp->ni_cnd.cn_nameptr,
	    mp->mnt_vfsstat.f_fstypename);
	if (error) {
		goto out2;
	}
#endif

	/*
	 * Pass the source mount and the snapshot's componentname to the
	 * filesystem via mount_common.  KERNEL_MOUNT_SNAPSHOT signals that
	 * the "user" data pointer is actually this kernel-space struct
	 * (hence the CAST_USER_ADDR_T of a kernel address).
	 */
	smnt_data.sm_mp = mp;
	smnt_data.sm_cnp = &snapndp->ni_cnd;
	error = mount_common(mp->mnt_vfsstat.f_fstypename, pvp, vp,
	    &dirndp->ni_cnd, CAST_USER_ADDR_T(&smnt_data), flags & MNT_DONTBROWSE,
	    KERNEL_MOUNT_SNAPSHOT, NULL, ctx);

	/* Cleanup labels unwind in reverse order of acquisition. */
out2:
	vnode_put(vp);
	vnode_put(pvp);
	nameidone(dirndp);
out1:
	vnode_put(snapvp);
	vnode_put(snapdvp);
	vnode_put(rvp);
	nameidone(snapndp);
out:
	kfree_type(typeof(*__snapshot_mount_data), __snapshot_mount_data);
	return error;
}
14349
14350 /*
14351 * Root from a snapshot of the filesystem
14352 *
14353 * Marks the filesystem to root from the given snapshot on next boot.
14354 */
14355 static int __attribute__((noinline))
snapshot_root(int dirfd,user_addr_t name,__unused uint32_t flags,vfs_context_t ctx)14356 snapshot_root(int dirfd, user_addr_t name, __unused uint32_t flags,
14357 vfs_context_t ctx)
14358 {
14359 int error;
14360 vnode_t rvp;
14361 mount_t mp;
14362 struct fs_snapshot_root_args root_data;
14363 struct componentname cnp;
14364 caddr_t name_buf;
14365 size_t name_len;
14366
14367 error = vnode_getfromfd(ctx, dirfd, &rvp);
14368 if (error) {
14369 return error;
14370 }
14371 mp = vnode_mount(rvp);
14372
14373 name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
14374 error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
14375 if (error) {
14376 zfree(ZV_NAMEI, name_buf);
14377 vnode_put(rvp);
14378 return error;
14379 }
14380
14381 // XXX MAC checks ?
14382
14383 /*
14384 * Grab mount_iterref so that we can release the vnode,
14385 * since VFSIOC_ROOT_SNAPSHOT could conceivably cause a sync.
14386 */
14387 error = mount_iterref(mp, 0);
14388 vnode_put(rvp);
14389 if (error) {
14390 zfree(ZV_NAMEI, name_buf);
14391 return error;
14392 }
14393
14394 memset(&cnp, 0, sizeof(cnp));
14395 cnp.cn_pnbuf = (char *)name_buf;
14396 cnp.cn_nameiop = LOOKUP;
14397 cnp.cn_flags = ISLASTCN | HASBUF;
14398 cnp.cn_pnlen = MAXPATHLEN;
14399 cnp.cn_nameptr = cnp.cn_pnbuf;
14400 cnp.cn_namelen = (int)name_len;
14401 root_data.sr_cnp = &cnp;
14402
14403 error = VFS_IOCTL(mp, VFSIOC_ROOT_SNAPSHOT, (caddr_t)&root_data, 0, ctx);
14404
14405 mount_iterdrop(mp);
14406 zfree(ZV_NAMEI, name_buf);
14407
14408 return error;
14409 }
14410
14411 static boolean_t
vfs_context_can_snapshot(vfs_context_t ctx)14412 vfs_context_can_snapshot(vfs_context_t ctx)
14413 {
14414 static const char * const snapshot_entitlements[] = {
14415 "com.apple.private.vfs.snapshot",
14416 "com.apple.developer.vfs.snapshot",
14417 "com.apple.private.apfs.arv.limited.snapshot",
14418 };
14419 static const size_t nentitlements =
14420 sizeof(snapshot_entitlements) / sizeof(snapshot_entitlements[0]);
14421 size_t i;
14422
14423 task_t task = vfs_context_task(ctx);
14424 for (i = 0; i < nentitlements; i++) {
14425 if (IOTaskHasEntitlement(task, snapshot_entitlements[i])) {
14426 return TRUE;
14427 }
14428 }
14429 return FALSE;
14430 }
14431
14432 /*
14433 * FS snapshot operations dispatcher
14434 */
int
fs_snapshot(__unused proc_t p, struct fs_snapshot_args *uap,
    __unused int32_t *retval)
{
	int error;
	vfs_context_t ctx = vfs_context_current();

	AUDIT_ARG(fd, uap->dirfd);
	AUDIT_ARG(value32, uap->op);

	/* All snapshot ops require one of the snapshot entitlements. */
	if (!vfs_context_can_snapshot(ctx)) {
		return EPERM;
	}

	/*
	 * Enforce user authorization for snapshot modification operations,
	 * or if trying to root from snapshot.
	 */
	if (uap->op != SNAPSHOT_OP_MOUNT) {
		vnode_t dvp = NULLVP;
		vnode_t devvp = NULLVP;
		mount_t mp;

		error = vnode_getfromfd(ctx, uap->dirfd, &dvp);
		if (error) {
			return error;
		}
		mp = vnode_mount(dvp);
		devvp = mp->mnt_devvp;

		/* get an iocount on devvp */
		if (devvp == NULLVP) {
			error = vnode_lookup(mp->mnt_vfsstat.f_mntfromname, 0, &devvp, ctx);
			/* for mounts which arent block devices */
			if (error == ENOENT) {
				error = ENXIO;
			}
		} else {
			error = vnode_getwithref(devvp);
		}

		if (error) {
			vnode_put(dvp);
			return error;
		}

		/*
		 * Allowed if any of: superuser, write access to the backing
		 * device, or the dedicated user-snapshot entitlement.
		 */
		if ((vfs_context_issuser(ctx) == 0) &&
		    (vnode_authorize(devvp, NULL, KAUTH_VNODE_WRITE_DATA, ctx) != 0) &&
		    (!IOTaskHasEntitlement(vfs_context_task(ctx), "com.apple.private.vfs.snapshot.user"))) {
			error = EPERM;
		}
		vnode_put(dvp);
		vnode_put(devvp);

		if (error) {
			return error;
		}
	}

	/* Dispatch to the per-operation implementation. */
	switch (uap->op) {
	case SNAPSHOT_OP_CREATE:
		error = snapshot_create(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
	case SNAPSHOT_OP_DELETE:
		error = snapshot_delete(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
	case SNAPSHOT_OP_RENAME:
		error = snapshot_rename(uap->dirfd, uap->name1, uap->name2,
		    uap->flags, ctx);
		break;
	case SNAPSHOT_OP_MOUNT:
		error = snapshot_mount(uap->dirfd, uap->name1, uap->name2,
		    uap->data, uap->flags, ctx);
		break;
	case SNAPSHOT_OP_REVERT:
		error = snapshot_revert(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
#if CONFIG_MNT_ROOTSNAP
	case SNAPSHOT_OP_ROOT:
		error = snapshot_root(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
#endif /* CONFIG_MNT_ROOTSNAP */
	default:
		error = ENOSYS;
	}

	return error;
}
14523