1 /*
2 * Copyright (c) 1995-2022 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * Copyright (c) 1989, 1993
30 * The Regents of the University of California. All rights reserved.
31 * (c) UNIX System Laboratories, Inc.
32 * All or some portions of this file are derived from material licensed
33 * to the University of California by American Telephone and Telegraph
34 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
35 * the permission of UNIX System Laboratories, Inc.
36 *
37 * Redistribution and use in source and binary forms, with or without
38 * modification, are permitted provided that the following conditions
39 * are met:
40 * 1. Redistributions of source code must retain the above copyright
41 * notice, this list of conditions and the following disclaimer.
42 * 2. Redistributions in binary form must reproduce the above copyright
43 * notice, this list of conditions and the following disclaimer in the
44 * documentation and/or other materials provided with the distribution.
45 * 3. All advertising materials mentioning features or use of this software
46 * must display the following acknowledgement:
47 * This product includes software developed by the University of
48 * California, Berkeley and its contributors.
49 * 4. Neither the name of the University nor the names of its contributors
50 * may be used to endorse or promote products derived from this software
51 * without specific prior written permission.
52 *
53 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63 * SUCH DAMAGE.
64 *
65 * @(#)vfs_syscalls.c 8.41 (Berkeley) 6/15/95
66 */
67 /*
68 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
69 * support for mandatory and extensible security protections. This notice
70 * is included in support of clause 2.2 (b) of the Apple Public License,
71 * Version 2.0.
72 */
73
74 #include <sys/param.h>
75 #include <sys/systm.h>
76 #include <sys/namei.h>
77 #include <sys/filedesc.h>
78 #include <sys/kernel.h>
79 #include <sys/file_internal.h>
80 #include <sys/stat.h>
81 #include <sys/vnode_internal.h>
82 #include <sys/mount_internal.h>
83 #include <sys/proc_internal.h>
84 #include <sys/kauth.h>
85 #include <sys/uio_internal.h>
86 #include <kern/kalloc.h>
87 #include <sys/mman.h>
88 #include <sys/dirent.h>
89 #include <sys/attr.h>
90 #include <sys/sysctl.h>
91 #include <sys/ubc.h>
92 #include <sys/quota.h>
93 #include <sys/kdebug.h>
94 #include <sys/fsevents.h>
95 #include <sys/imgsrc.h>
96 #include <sys/sysproto.h>
97 #include <sys/sysctl.h>
98 #include <sys/xattr.h>
99 #include <sys/fcntl.h>
100 #include <sys/stdio.h>
101 #include <sys/fsctl.h>
102 #include <sys/ubc_internal.h>
103 #include <sys/disk.h>
104 #include <sys/content_protection.h>
105 #include <sys/clonefile.h>
106 #include <sys/snapshot.h>
107 #include <sys/priv.h>
108 #include <sys/fsgetpath.h>
109 #include <machine/cons.h>
110 #include <machine/limits.h>
111 #include <miscfs/specfs/specdev.h>
112
113 #include <vfs/vfs_disk_conditioner.h>
114
115 #include <security/audit/audit.h>
116 #include <bsm/audit_kevents.h>
117
118 #include <mach/mach_types.h>
119 #include <kern/kern_types.h>
120 #include <kern/kalloc.h>
121 #include <kern/task.h>
122
123 #include <vm/vm_pageout.h>
124 #include <vm/vm_protos.h>
125
126 #include <libkern/OSAtomic.h>
127 #include <os/atomic_private.h>
128 #include <pexpert/pexpert.h>
129 #include <IOKit/IOBSD.h>
130
131 // deps for MIG call
132 #include <kern/host.h>
133 #include <kern/ipc_misc.h>
134 #include <mach/host_priv.h>
135 #include <mach/vfs_nspace.h>
136 #include <os/log.h>
137
138 #include <nfs/nfs_conf.h>
139
140 #if ROUTEFS
141 #include <miscfs/routefs/routefs.h>
142 #endif /* ROUTEFS */
143
144 #if CONFIG_MACF
145 #include <security/mac.h>
146 #include <security/mac_framework.h>
147 #endif
148
149 #if CONFIG_FSE
150 #define GET_PATH(x) \
151 ((x) = get_pathbuff())
152 #define RELEASE_PATH(x) \
153 release_pathbuff(x)
154 #else
155 #define GET_PATH(x) \
156 ((x) = zalloc(ZV_NAMEI))
157 #define RELEASE_PATH(x) \
158 zfree(ZV_NAMEI, x)
159 #endif /* CONFIG_FSE */
160
161 #ifndef HFS_GET_BOOT_INFO
162 #define HFS_GET_BOOT_INFO (FCNTL_FS_SPECIFIC_BASE + 0x00004)
163 #endif
164
165 #ifndef HFS_SET_BOOT_INFO
166 #define HFS_SET_BOOT_INFO (FCNTL_FS_SPECIFIC_BASE + 0x00005)
167 #endif
168
169 #ifndef APFSIOC_REVERT_TO_SNAPSHOT
170 #define APFSIOC_REVERT_TO_SNAPSHOT _IOW('J', 1, u_int64_t)
171 #endif
172
173 extern void disk_conditioner_unmount(mount_t mp);
174
175 /* struct for checkdirs iteration */
176 struct cdirargs {
177 vnode_t olddp;
178 vnode_t newdp;
179 };
180 /* callback for checkdirs iteration */
181 static int checkdirs_callback(proc_t p, void * arg);
182
183 static int change_dir(struct nameidata *ndp, vfs_context_t ctx);
184 static int checkdirs(vnode_t olddp, vfs_context_t ctx);
185 void enablequotas(struct mount *mp, vfs_context_t ctx);
186 static int getfsstat_callback(mount_t mp, void * arg);
187 static int getutimes(user_addr_t usrtvp, struct timespec *tsp);
188 static int setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts, int nullflag);
189 static int sync_callback(mount_t, void *);
190 static int munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
191 user_addr_t bufp, int *sizep, boolean_t is_64_bit,
192 boolean_t partial_copy);
193 static int fsync_common(proc_t p, struct fsync_args *uap, int flags);
194 static int mount_common(const char *fstypename, vnode_t pvp, vnode_t vp,
195 struct componentname *cnp, user_addr_t fsmountargs,
196 int flags, uint32_t internal_flags, char *labelstr, vfs_context_t ctx);
197 void vfs_notify_mount(vnode_t pdvp);
198
199 int prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, uint32_t internal_flags);
200
201 struct fd_vn_data * fg_vn_data_alloc(void);
202
203 /*
204 * Max retries for ENOENT returns from vn_authorize_{rmdir, unlink, rename}
205 * Concurrent lookups (or lookups by ids) on hard links can cause the
206 * vn_getpath (which does not re-enter the filesystem as vn_getpath_fsenter
207 * does) to return ENOENT as the path cannot be returned from the name cache
208 * alone. We have no option but to retry and hope to get one namei->reverse path
209 * generation done without an intervening lookup, lookup by id on the hard link
210 * item. This is only an issue for MAC hooks which cannot reenter the filesystem
211 * which currently are the MAC hooks for rename, unlink and rmdir.
212 */
213 #define MAX_AUTHORIZE_ENOENT_RETRIES 1024
214
215 /* Max retry limit for rename due to vnode recycling. */
216 #define MAX_RENAME_ERECYCLE_RETRIES 1024
217
218 static int rmdirat_internal(vfs_context_t, int, user_addr_t, enum uio_seg,
219 int unlink_flags);
220
221 #ifdef CONFIG_IMGSRC_ACCESS
222 static int authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx);
223 static int place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx);
224 static void undo_place_on_covered_vp(mount_t mp, vnode_t vp);
225 static int mount_begin_update(mount_t mp, vfs_context_t ctx, int flags);
226 static void mount_end_update(mount_t mp);
227 static int relocate_imageboot_source(vnode_t pvp, vnode_t vp, struct componentname *cnp, const char *fsname, vfs_context_t ctx, boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index);
228 #endif /* CONFIG_IMGSRC_ACCESS */
229
230 //snapshot functions
231 #if CONFIG_MNT_ROOTSNAP
232 static int __attribute__ ((noinline)) snapshot_root(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx);
233 #else
234 static int __attribute__ ((noinline)) snapshot_root(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx) __attribute__((unused));
235 #endif
236
237 __private_extern__
238 int sync_internal(void);
239
240 __private_extern__
241 int unlink1(vfs_context_t, vnode_t, user_addr_t, enum uio_seg, int);
242
243 static LCK_GRP_DECLARE(fd_vn_lck_grp, "fd_vnode_data");
244 static LCK_ATTR_DECLARE(fd_vn_lck_attr, 0, 0);
245
246 /* vars for sync mutex */
247 static LCK_GRP_DECLARE(sync_mtx_lck_grp, "sync thread");
248 static LCK_MTX_DECLARE(sync_mtx_lck, &sync_mtx_lck_grp);
249
250 extern lck_rw_t rootvnode_rw_lock;
251
252 VFS_SMR_DECLARE;
253 extern uint32_t nc_smr_enabled;
254
255 /*
256 * incremented each time a mount or unmount operation occurs
257 * used to invalidate the cached value of the rootvp in the
258 * mount structure utilized by cache_lookup_path
259 */
260 uint32_t mount_generation = 0;
261
262 /* counts number of mount and unmount operations */
263 unsigned int vfs_nummntops = 0;
264
265 /* system-wide, per-boot unique mount ID */
266 static _Atomic uint64_t mount_unique_id = 1;
267
268 extern const struct fileops vnops;
269 #if CONFIG_APPLEDOUBLE
270 extern errno_t rmdir_remove_orphaned_appleDouble(vnode_t, vfs_context_t, int *);
271 #endif /* CONFIG_APPLEDOUBLE */
272
273 /* Maximum buffer length supported by fsgetpath(2) */
274 #define FSGETPATH_MAXBUFLEN 8192
275
276 /*
277 * Virtual File System System Calls
278 */
279
280 /*
281 * Private in-kernel mounting spi (specific use-cases only)
282 */
283 boolean_t
vfs_iskernelmount(mount_t mp)284 vfs_iskernelmount(mount_t mp)
285 {
286 return (mp->mnt_kern_flag & MNTK_KERNEL_MOUNT) ? TRUE : FALSE;
287 }
288
/*
 * kernel_mount:
 *	In-kernel mount entry point (see vfs_mount_at_path()).  Resolves the
 *	mount-on vnode from `path` when the caller did not supply one, then
 *	hands off to mount_common() with KERNEL_MOUNT_KMOUNT set.
 *
 * Parameters:
 *	fstype		filesystem type (its vfs name)
 *	pvp		parent of the covered vnode (replaced via namei when
 *			vp is NULLVP)
 *	vp		vnode to cover, or NULLVP to resolve `path` here
 *	path		kernel-space path of the mount point
 *	data		filesystem-specific mount arguments
 *	datalen		unused
 *	syscall_flags	generic MNT_* flags
 *	kern_flags	KERNEL_MOUNT_* flags; restricted below to
 *			KERNEL_MOUNT_SANITIZE_MASK
 *	ctx		caller's vfs context
 *
 * Returns:	0 on success, an errno on failure.
 */
__private_extern__
int
kernel_mount(const char *fstype, vnode_t pvp, vnode_t vp, const char *path,
    void *data, __unused size_t datalen, int syscall_flags, uint32_t kern_flags,
    vfs_context_t ctx)
{
	struct nameidata nd;
	boolean_t did_namei;
	int error;

	NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(path), ctx);

	/* Drop any kernel-mount flag bits outside the permitted mask. */
	kern_flags &= KERNEL_MOUNT_SANITIZE_MASK;

	/*
	 * Get the vnode to be covered if it's not supplied
	 */
	if (vp == NULLVP) {
		error = namei(&nd);
		if (error) {
			/* Only log for snapshot / mount-by-role requests. */
			if (kern_flags & (KERNEL_MOUNT_SNAPSHOT | KERNEL_MOUNT_VOLBYROLE_MASK)) {
				printf("failed to locate mount-on path: %s ", path);
			}
			return error;
		}
		vp = nd.ni_vp;
		pvp = nd.ni_dvp;
		did_namei = TRUE;
	} else {
		/*
		 * Caller provided the covered vnode; fabricate just enough of
		 * the componentname (pathname buffer and length) for
		 * mount_common() to consume.
		 */
		char *pnbuf = CAST_DOWN(char *, path);

		nd.ni_cnd.cn_pnbuf = pnbuf;
		nd.ni_cnd.cn_pnlen = (int)(strlen(pnbuf) + 1);
		did_namei = FALSE;
	}

	kern_flags |= KERNEL_MOUNT_KMOUNT;
	error = mount_common(fstype, pvp, vp, &nd.ni_cnd, CAST_USER_ADDR_T(data),
	    syscall_flags, kern_flags, NULL, ctx);

	/* Release the iocounts and namei state only if we took them above. */
	if (did_namei) {
		vnode_put(vp);
		vnode_put(pvp);
		nameidone(&nd);
	}

	return error;
}
338
339 int
vfs_mount_at_path(const char * fstype,const char * path,vnode_t pvp,vnode_t vp,void * data,size_t datalen,int mnt_flags,int flags)340 vfs_mount_at_path(const char *fstype, const char *path,
341 vnode_t pvp, vnode_t vp, void *data, size_t datalen,
342 int mnt_flags, int flags)
343 {
344 int syscall_flags = MNT_AUTOMOUNTED | mnt_flags;
345 int error, km_flags = 0;
346 vfs_context_t ctx = (flags & VFS_MOUNT_FLAG_CURRENT_CONTEXT) ? vfs_context_current() : vfs_context_kernel();
347
348 /*
349 * This call is currently restricted to specific use cases.
350 */
351 if ((strcmp(fstype, "lifs") != 0) && (strcmp(fstype, "nfs") != 0)) {
352 return ENOTSUP;
353 }
354
355 #if !defined(XNU_TARGET_OS_OSX)
356 if (strcmp(fstype, "lifs") == 0) {
357 syscall_flags |= MNT_NOEXEC;
358 }
359 #endif
360
361 if (flags & VFS_MOUNT_FLAG_NOAUTH) {
362 km_flags |= KERNEL_MOUNT_NOAUTH;
363 }
364 if (flags & VFS_MOUNT_FLAG_PERMIT_UNMOUNT) {
365 km_flags |= KERNEL_MOUNT_PERMIT_UNMOUNT;
366 }
367
368 error = kernel_mount(fstype, pvp, vp, path, data, datalen,
369 syscall_flags, km_flags, ctx);
370 if (error) {
371 printf("%s: mount on %s failed, error %d\n", __func__, path,
372 error);
373 }
374
375 return error;
376 }
377
378 /*
379 * Mount a file system.
380 */
381 /* ARGSUSED */
382 int
mount(proc_t p,struct mount_args * uap,__unused int32_t * retval)383 mount(proc_t p, struct mount_args *uap, __unused int32_t *retval)
384 {
385 struct __mac_mount_args muap;
386
387 muap.type = uap->type;
388 muap.path = uap->path;
389 muap.flags = uap->flags;
390 muap.data = uap->data;
391 muap.mac_p = USER_ADDR_NULL;
392 return __mac_mount(p, &muap, retval);
393 }
394
/*
 * fmount: mount a filesystem on the directory identified by an open file
 * descriptor rather than by path.
 *
 * Parameters:
 *	uap->fd		open fd of the vnode to be covered
 *	uap->type	user pointer to the filesystem type name
 *	uap->flags	generic MNT_* mount flags (MNT_IMGSRC_BY_INDEX,
 *			MNT_ROOTFS and MNT_UNION are rejected)
 *	uap->data	filesystem-specific mount arguments
 *
 * Returns:	0 on success, an errno on failure.
 */
int
fmount(__unused proc_t p, struct fmount_args *uap, __unused int32_t *retval)
{
	struct componentname cn;
	vfs_context_t ctx = vfs_context_current();
	size_t dummy = 0;
	int error;
	int flags = uap->flags;
	char fstypename[MFSNAMELEN];
	char *labelstr = NULL; /* regular mount call always sets it to NULL for __mac_mount() */
	vnode_t pvp;
	vnode_t vp;

	AUDIT_ARG(fd, uap->fd);
	AUDIT_ARG(fflags, flags);
	/* fstypename will get audited by mount_common */

	/* Sanity check the flags */
	if (flags & (MNT_IMGSRC_BY_INDEX | MNT_ROOTFS)) {
		return ENOTSUP;
	}

	if (flags & MNT_UNION) {
		return EPERM;
	}

	error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy);
	if (error) {
		return error;
	}

	/* Holds a reference on the file until the matching file_drop() below. */
	if ((error = file_vnode(uap->fd, &vp)) != 0) {
		return error;
	}

	/* Take an iocount on the vnode; paired with vnode_put() on every exit. */
	if ((error = vnode_getwithref(vp)) != 0) {
		file_drop(uap->fd);
		return error;
	}

	/*
	 * Without a resolvable parent we cannot mount here: a vnode that is
	 * already covered or is a filesystem root reports EBUSY, anything
	 * else EINVAL.
	 */
	pvp = vnode_getparent(vp);
	if (pvp == NULL) {
		if (vp->v_mountedhere || (vp->v_flag & VROOT) != 0) {
			error = EBUSY;
		} else {
			error = EINVAL;
		}
		vnode_put(vp);
		file_drop(uap->fd);
		return error;
	}

	/* Build a componentname carrying the vnode's path for mount_common(). */
	memset(&cn, 0, sizeof(struct componentname));
	cn.cn_pnbuf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	cn.cn_pnlen = MAXPATHLEN;

	if ((error = vn_getpath(vp, cn.cn_pnbuf, &cn.cn_pnlen)) != 0) {
		zfree(ZV_NAMEI, cn.cn_pnbuf);
		vnode_put(pvp);
		vnode_put(vp);
		file_drop(uap->fd);
		return error;
	}

	error = mount_common(fstypename, pvp, vp, &cn, uap->data, flags, KERNEL_MOUNT_FMOUNT, labelstr, ctx);

	/* Release the path buffer, both iocounts, and the fd reference. */
	zfree(ZV_NAMEI, cn.cn_pnbuf);
	vnode_put(pvp);
	vnode_put(vp);
	file_drop(uap->fd);

	return error;
}
468
469 #define MAX_GRAFT_METADATA_SIZE 16384 /* bytes */
470
471 /*
472 * Get the size of a graft file (a manifest or payload file).
473 * The vp should be an iocounted vnode.
474 */
475 static int
get_and_verify_graft_metadata_vp_size(vnode_t graft_vp,vfs_context_t vctx,size_t * size)476 get_and_verify_graft_metadata_vp_size(vnode_t graft_vp, vfs_context_t vctx, size_t *size)
477 {
478 struct stat64 sb = {};
479 int error;
480
481 *size = 0;
482
483 error = vn_stat(graft_vp, &sb, NULL, 1, 0, vctx);
484 if (error) {
485 return error;
486 }
487
488 if (sb.st_size == 0) {
489 error = ENODATA;
490 } else if ((size_t) sb.st_size > MAX_GRAFT_METADATA_SIZE) {
491 error = EFBIG;
492 } else {
493 *size = (size_t) sb.st_size;
494 }
495
496 return error;
497 }
498
499 /*
500 * Read in a graft file (a manifest or payload file) of size `size` into `buf`.
501 * `size` must already be validated.
502 */
503 static int
read_graft_metadata_vp(vnode_t graft_vp,vfs_context_t vctx,size_t size,void * buf)504 read_graft_metadata_vp(vnode_t graft_vp, vfs_context_t vctx, size_t size, void *buf)
505 {
506 return vn_rdwr(UIO_READ, graft_vp,
507 (caddr_t) buf, (int) size, /* offset */ 0,
508 UIO_SYSSPACE, IO_NOCACHE | IO_RAOFF | IO_UNIT,
509 vfs_context_ucred(vctx), /* resid */ NULL,
510 vfs_context_proc(vctx));
511 }
512
513 /*
514 * Convert a single graft file descriptor into a vnode, get its size (saving it to `size`),
515 * and read it into `buf`.
516 */
517 static int
graft_secureboot_read_fd(int fd,vfs_context_t vctx,size_t * size,void * buf)518 graft_secureboot_read_fd(int fd, vfs_context_t vctx, size_t *size, void *buf)
519 {
520 vnode_t metadata_vp = NULLVP;
521 int error;
522
523 // Convert this graft fd to a vnode.
524 if ((error = vnode_getfromfd(vctx, fd, &metadata_vp)) != 0) {
525 goto out;
526 }
527
528 // Get (and validate) size information.
529 if ((error = get_and_verify_graft_metadata_vp_size(metadata_vp, vctx, size)) != 0) {
530 goto out;
531 }
532
533 // Read each file into the provided buffer - we must get the expected amount of bytes.
534 if ((error = read_graft_metadata_vp(metadata_vp, vctx, *size, buf)) != 0) {
535 goto out;
536 }
537
538 out:
539 if (metadata_vp) {
540 vnode_put(metadata_vp);
541 metadata_vp = NULLVP;
542 }
543
544 return error;
545 }
546
547 /*
548 * Read graft file descriptors into buffers of size MAX_GRAFT_METADATA_SIZE
549 * provided in `gfs`, saving the size of data read in `gfs`.
550 */
551 static int
graft_secureboot_read_metadata(secure_boot_cryptex_args_t * sbc_args,vfs_context_t vctx,fsioc_graft_fs_t * gfs)552 graft_secureboot_read_metadata(secure_boot_cryptex_args_t *sbc_args, vfs_context_t vctx,
553 fsioc_graft_fs_t *gfs)
554 {
555 int error;
556
557 // Read the authentic manifest.
558 if ((error = graft_secureboot_read_fd(sbc_args->sbc_authentic_manifest_fd, vctx,
559 &gfs->authentic_manifest_size, gfs->authentic_manifest))) {
560 return error;
561 }
562
563 // The user manifest is currently unused, but set its size.
564 gfs->user_manifest_size = 0;
565
566 // Read the payload.
567 if ((error = graft_secureboot_read_fd(sbc_args->sbc_payload_fd, vctx,
568 &gfs->payload_size, gfs->payload))) {
569 return error;
570 }
571
572 return 0;
573 }
574
575 /*
576 * Call into the filesystem to verify and graft a cryptex.
577 */
/*
 * Parameters:
 *	graft_type	pre-validated GRAFTDMG_CRYPTEX_* type from the caller
 *	sbc_args	copied-in secure-boot cryptex arguments
 *	vctx		caller's vfs context
 *	cryptex_vp	iocounted vnode of the cryptex disk image
 *	mounton_vp	iocounted vnode of the graft target dir, or NULLVP
 *
 * Returns:	0 on success, an errno from validation, allocation, the
 *		metadata reads, or the filesystem's FSIOC_GRAFT_FS handler.
 */
static int
graft_secureboot_cryptex(uint32_t graft_type, secure_boot_cryptex_args_t *sbc_args,
    vfs_context_t vctx, vnode_t cryptex_vp, vnode_t mounton_vp)
{
	fsioc_graft_fs_t gfs = {};
	uint64_t graft_dir_ino = 0;
	struct stat64 sb = {};
	int error;

	// Pre-flight arguments.
	if (sbc_args->sbc_version != GRAFTDMG_SECURE_BOOT_CRYPTEX_ARGS_VERSION) {
		// Make sure that this graft version matches what we support.
		return ENOTSUP;
	} else if (mounton_vp && cryptex_vp->v_mount != mounton_vp->v_mount) {
		// For this type, cryptex VP must live on same volume as the target of graft.
		return EXDEV;
	} else if (mounton_vp && mounton_vp->v_type != VDIR) {
		// We cannot graft upon non-directories.
		return ENOTDIR;
	} else if (sbc_args->sbc_authentic_manifest_fd < 0 ||
	    sbc_args->sbc_payload_fd < 0) {
		// We cannot graft without a manifest and payload.
		return EINVAL;
	}

	if (mounton_vp) {
		// Get the mounton's inode number.
		error = vn_stat(mounton_vp, &sb, NULL, 1, 0, vctx);
		if (error) {
			return error;
		}
		graft_dir_ino = (uint64_t) sb.st_ino;
	}

	// Create buffers (of our maximum-defined size) to store authentication info.
	// Actual data sizes are recorded in gfs by graft_secureboot_read_metadata().
	gfs.authentic_manifest = kalloc_data(MAX_GRAFT_METADATA_SIZE, Z_WAITOK | Z_ZERO);
	gfs.payload = kalloc_data(MAX_GRAFT_METADATA_SIZE, Z_WAITOK | Z_ZERO);

	if (!gfs.authentic_manifest || !gfs.payload) {
		error = ENOMEM;
		goto out;
	}

	// Read our fd's into our buffers.
	// (Note that this will set the buffer size fields in `gfs`.)
	error = graft_secureboot_read_metadata(sbc_args, vctx, &gfs);
	if (error) {
		goto out;
	}

	// Fill out the graft request: version, type, 4cc, and a one-to-one
	// translation of the caller's SBC_* flags into FSCTL_GRAFT_* flags.
	gfs.graft_version = FSIOC_GRAFT_VERSION;
	gfs.graft_type = graft_type;
	gfs.graft_4cc = sbc_args->sbc_4cc;
	if (sbc_args->sbc_flags & SBC_PRESERVE_MOUNT) {
		gfs.graft_flags |= FSCTL_GRAFT_PRESERVE_MOUNT;
	}
	if (sbc_args->sbc_flags & SBC_ALTERNATE_SHARED_REGION) {
		gfs.graft_flags |= FSCTL_GRAFT_ALTERNATE_SHARED_REGION;
	}
	if (sbc_args->sbc_flags & SBC_SYSTEM_CONTENT) {
		gfs.graft_flags |= FSCTL_GRAFT_SYSTEM_CONTENT;
	}
	if (sbc_args->sbc_flags & SBC_PANIC_ON_AUTHFAIL) {
		gfs.graft_flags |= FSCTL_GRAFT_PANIC_ON_AUTHFAIL;
	}
	if (sbc_args->sbc_flags & SBC_STRICT_AUTH) {
		gfs.graft_flags |= FSCTL_GRAFT_STRICT_AUTH;
	}
	if (sbc_args->sbc_flags & SBC_PRESERVE_GRAFT) {
		gfs.graft_flags |= FSCTL_GRAFT_PRESERVE_GRAFT;
	}
	gfs.dir_ino = graft_dir_ino; // ino from mounton_vp (if not provided, the parent directory)

	// Call into the FS to perform the graft (and validation).
	error = VNOP_IOCTL(cryptex_vp, FSIOC_GRAFT_FS, (caddr_t)&gfs, 0, vctx);

out:
	// Free the metadata buffers on all paths; kfree_data of the full
	// allocation size, regardless of how much was actually read.
	if (gfs.authentic_manifest) {
		kfree_data(gfs.authentic_manifest, MAX_GRAFT_METADATA_SIZE);
		gfs.authentic_manifest = NULL;
	}
	if (gfs.payload) {
		kfree_data(gfs.payload, MAX_GRAFT_METADATA_SIZE);
		gfs.payload = NULL;
	}

	return error;
}
666
667 #define GRAFTDMG_ENTITLEMENT "com.apple.private.vfs.graftdmg"
668
669 /*
670 * Graft a cryptex disk image (via FD) onto the appropriate mount-point
671 * { int graftdmg(int dmg_fd, const char *mountdir, uint32_t graft_type, graftdmg_args_un *gda); }
672 */
int
graftdmg(__unused proc_t p, struct graftdmg_args *uap, __unused int32_t *retval)
{
	int ua_dmgfd = uap->dmg_fd;
	user_addr_t ua_mountdir = uap->mountdir;
	uint32_t ua_grafttype = uap->graft_type;
	user_addr_t ua_graftargs = uap->gda;

	graftdmg_args_un kern_gda = {};
	int error = 0;
	secure_boot_cryptex_args_t *sbc_args = NULL;

	vnode_t cryptex_vp = NULLVP;
	vnode_t mounton_vp = NULLVP;
	struct nameidata nd = {};
	vfs_context_t ctx = vfs_context_current();

	/* Grafting is gated on the private graftdmg entitlement. */
	if (!IOTaskHasEntitlement(vfs_context_task(ctx), GRAFTDMG_ENTITLEMENT)) {
		return EPERM;
	}

	/* Copy in the union of graft arguments before touching any vnodes. */
	error = copyin(ua_graftargs, &kern_gda, sizeof(graftdmg_args_un));
	if (error) {
		return error;
	}

	// Copy mount dir in, if provided.
	if (ua_mountdir != USER_ADDR_NULL) {
		// Acquire vnode for mount-on path
		NDINIT(&nd, LOOKUP, OP_MOUNT, (FOLLOW | AUDITVNPATH1),
		    UIO_USERSPACE, ua_mountdir, ctx);

		error = namei(&nd);
		if (error) {
			return error;
		}
		mounton_vp = nd.ni_vp;
	}

	// Convert fd to vnode.
	error = vnode_getfromfd(ctx, ua_dmgfd, &cryptex_vp);
	if (error) {
		goto graftout;
	}

	/* Validate the graft type, then hand off to the secure-boot path. */
	if (ua_grafttype == 0 || ua_grafttype > GRAFTDMG_CRYPTEX_MAX) {
		error = EINVAL;
	} else {
		sbc_args = &kern_gda.sbc_args;
		error = graft_secureboot_cryptex(ua_grafttype, sbc_args, ctx, cryptex_vp, mounton_vp);
	}

graftout:
	/*
	 * Drop the iocounts taken above.  nameidone() is keyed off
	 * ua_mountdir, mirroring the condition under which namei() ran.
	 */
	if (cryptex_vp) {
		vnode_put(cryptex_vp);
		cryptex_vp = NULLVP;
	}
	if (mounton_vp) {
		vnode_put(mounton_vp);
		mounton_vp = NULLVP;
	}
	if (ua_mountdir != USER_ADDR_NULL) {
		nameidone(&nd);
	}

	return error;
}
740
741 /*
742 * Ungraft a cryptex disk image (via mount dir FD)
743 * { int ungraftdmg(const char *mountdir, uint64_t flags); }
744 */
745 int
ungraftdmg(__unused proc_t p,struct ungraftdmg_args * uap,__unused int32_t * retval)746 ungraftdmg(__unused proc_t p, struct ungraftdmg_args *uap, __unused int32_t *retval)
747 {
748 int error = 0;
749 user_addr_t ua_mountdir = uap->mountdir;
750 fsioc_ungraft_fs_t ugfs;
751 vnode_t mounton_vp = NULLVP;
752 struct nameidata nd = {};
753 vfs_context_t ctx = vfs_context_current();
754
755 if (!IOTaskHasEntitlement(vfs_context_task(ctx), GRAFTDMG_ENTITLEMENT)) {
756 return EPERM;
757 }
758
759 if (uap->flags != 0 || ua_mountdir == USER_ADDR_NULL) {
760 return EINVAL;
761 }
762
763 ugfs.ungraft_flags = 0;
764
765 // Acquire vnode for mount-on path
766 NDINIT(&nd, LOOKUP, OP_MOUNT, (FOLLOW | AUDITVNPATH1),
767 UIO_USERSPACE, ua_mountdir, ctx);
768
769 error = namei(&nd);
770 if (error) {
771 return error;
772 }
773 mounton_vp = nd.ni_vp;
774
775 // Call into the FS to perform the ungraft
776 error = VNOP_IOCTL(mounton_vp, FSIOC_UNGRAFT_FS, (caddr_t)&ugfs, 0, ctx);
777
778 vnode_put(mounton_vp);
779 nameidone(&nd);
780
781 return error;
782 }
783
784
/*
 * Notify interested parties that a mount occurred: signal a VQ_MOUNT vfs
 * event, then post a NOTE_WRITE knote on the parent directory of the
 * covered vnode.
 */
void
vfs_notify_mount(vnode_t pdvp)
{
	vfs_event_signal(NULL, VQ_MOUNT, (intptr_t)NULL);
	lock_vnode_and_post(pdvp, NOTE_WRITE);
}
791
792 /*
793 * __mac_mount:
794 * Mount a file system taking into account MAC label behavior.
795 * See mount(2) man page for more information
796 *
797 * Parameters: p Process requesting the mount
798 * uap User argument descriptor (see below)
799 * retval (ignored)
800 *
801 * Indirect: uap->type Filesystem type
802 * uap->path Path to mount
803 * uap->data Mount arguments
804 * uap->mac_p MAC info
805 * uap->flags Mount flags
806 *
807 *
808 * Returns: 0 Success
809 * !0 Not success
810 */
/* Set when something attempts to mount the root filesystem read/write;
 * consumed by the CHECK_CS_VALIDATION_BITMAP logic (see comment below). */
boolean_t root_fs_upgrade_try = FALSE;

int
__mac_mount(struct proc *p, register struct __mac_mount_args *uap, __unused int32_t *retval)
{
	vnode_t pvp = NULL;
	vnode_t vp = NULL;
	int need_nameidone = 0;
	vfs_context_t ctx = vfs_context_current();
	char fstypename[MFSNAMELEN];
	struct nameidata nd;
	size_t dummy = 0;
	char *labelstr = NULL;
	size_t labelsz = 0;
	int flags = uap->flags;
	int error;
#if CONFIG_IMGSRC_ACCESS || CONFIG_MACF
	boolean_t is_64bit = IS_64BIT_PROCESS(p);
#else
#pragma unused(p)
#endif
	/*
	 * Get the fs type name from user space
	 */
	error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy);
	if (error) {
		return error;
	}

	/*
	 * Get the vnode to be covered
	 */
	NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
	    UIO_USERSPACE, uap->path, ctx);
	if (flags & MNT_NOFOLLOW) {
		/* NAMEI_NOFOLLOW_ANY: presumably refuses symlinks anywhere in
		 * the path, not just the final component — see namei(). */
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}
	error = namei(&nd);
	if (error) {
		goto out;
	}
	need_nameidone = 1;
	vp = nd.ni_vp;
	pvp = nd.ni_dvp;

#ifdef CONFIG_IMGSRC_ACCESS
	/* Mounting image source cannot be batched with other operations */
	/* NOTE(review): exact equality — MNT_IMGSRC_BY_INDEX may not be
	 * combined with any other flag bits to take this path. */
	if (flags == MNT_IMGSRC_BY_INDEX) {
		error = relocate_imageboot_source(pvp, vp, &nd.ni_cnd, fstypename,
		    ctx, is_64bit, uap->data, (flags == MNT_IMGSRC_BY_INDEX));
		goto out;
	}
#endif /* CONFIG_IMGSRC_ACCESS */

#if CONFIG_MACF
	/*
	 * Get the label string (if any) from user space
	 */
	if (uap->mac_p != USER_ADDR_NULL) {
		struct user_mac mac;
		size_t ulen = 0;

		/* Normalize the 32/64-bit user_mac layouts into `mac`. */
		if (is_64bit) {
			struct user64_mac mac64;
			error = copyin(uap->mac_p, &mac64, sizeof(mac64));
			mac.m_buflen = (user_size_t)mac64.m_buflen;
			mac.m_string = (user_addr_t)mac64.m_string;
		} else {
			struct user32_mac mac32;
			error = copyin(uap->mac_p, &mac32, sizeof(mac32));
			mac.m_buflen = mac32.m_buflen;
			mac.m_string = mac32.m_string;
		}
		if (error) {
			goto out;
		}
		/* Label must be at least one character plus NUL, and bounded. */
		if ((mac.m_buflen > MAC_MAX_LABEL_BUF_LEN) ||
		    (mac.m_buflen < 2)) {
			error = EINVAL;
			goto out;
		}
		labelsz = mac.m_buflen;
		labelstr = kalloc_data(labelsz, Z_WAITOK);
		error = copyinstr(mac.m_string, labelstr, mac.m_buflen, &ulen);
		if (error) {
			goto out;
		}
		AUDIT_ARG(mac_string, labelstr);
	}
#endif /* CONFIG_MACF */

	AUDIT_ARG(fflags, flags);

#if !CONFIG_UNION_MOUNTS
	if (flags & MNT_UNION) {
		error = EPERM;
		goto out;
	}
#endif

	/* Mounting over the root of the root filesystem is an update mount
	 * (except for union mounts on '/' — see below). */
	if ((vp->v_flag & VROOT) &&
	    (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
#if CONFIG_UNION_MOUNTS
		if (!(flags & MNT_UNION)) {
			flags |= MNT_UPDATE;
		} else {
			/*
			 * For a union mount on '/', treat it as fresh
			 * mount instead of update.
			 * Otherwise, union mouting on '/' used to panic the
			 * system before, since mnt_vnodecovered was found to
			 * be NULL for '/' which is required for unionlookup
			 * after it gets ENOENT on union mount.
			 */
			flags = (flags & ~(MNT_UPDATE));
		}
#else
		flags |= MNT_UPDATE;
#endif /* CONFIG_UNION_MOUNTS */

#if SECURE_KERNEL
		if ((flags & MNT_RDONLY) == 0) {
			/* Release kernels are not allowed to mount "/" as rw */
			error = EPERM;
			goto out;
		}
#endif

		/*
		 * See 7392553 for more details on why this check exists.
		 * Suffice to say: If this check is ON and something tries
		 * to mount the rootFS RW, we'll turn off the codesign
		 * bitmap optimization.
		 */
#if CHECK_CS_VALIDATION_BITMAP
		if ((flags & MNT_RDONLY) == 0) {
			root_fs_upgrade_try = TRUE;
		}
#endif
	}

	error = mount_common(fstypename, pvp, vp, &nd.ni_cnd, uap->data, flags, 0,
	    labelstr, ctx);

out:

#if CONFIG_MACF
	/* May be reached with labelstr == NULL / labelsz == 0 when no label
	 * was copied in (including early-error paths). */
	kfree_data(labelstr, labelsz);
#endif /* CONFIG_MACF */

	/* Drop the namei iocounts on vp/pvp and release the namei state,
	 * each only if it was acquired above. */
	if (vp) {
		vnode_put(vp);
	}
	if (pvp) {
		vnode_put(pvp);
	}
	if (need_nameidone) {
		nameidone(&nd);
	}

	return error;
}
973
974 /*
975 * common mount implementation (final stage of mounting)
976 *
977 * Arguments:
978 * fstypename file system type (ie it's vfs name)
979 * pvp parent of covered vnode
980 * vp covered vnode
981 * cnp component name (ie path) of covered vnode
982 * flags generic mount flags
983 * fsmountargs file system specific data
984 * labelstr optional MAC label
985 * kernelmount TRUE for mounts initiated from inside the kernel
986 * ctx caller's context
987 */
static int
mount_common(const char *fstypename, vnode_t pvp, vnode_t vp,
    struct componentname *cnp, user_addr_t fsmountargs, int flags, uint32_t internal_flags,
    char *labelstr, vfs_context_t ctx)
{
#if !CONFIG_MACF
#pragma unused(labelstr)
#endif
	struct vnode *devvp = NULLVP;           /* device vnode from devpath lookup (holds an iocount) */
	struct vnode *device_vnode = NULLVP;    /* device actually handed to VFS_MOUNT / io-attr setup */
#if CONFIG_MACF
	struct vnode *rvp;
#endif
	struct mount *mp = NULL;
	struct vfstable *vfsp = (struct vfstable *)0;
	struct proc *p = vfs_context_proc(ctx);
	int error, flag = 0;
	bool flag_set = false;                  /* true once 'flag' snapshots mnt_flag (update path) */
	user_addr_t devpath = USER_ADDR_NULL;
	int ronly = 0;
	int mntalloc = 0;                       /* nonzero once mp came from mount_zone (error cleanup) */
	boolean_t vfsp_ref = FALSE;             /* we bumped vfsp->vfc_refcount */
	boolean_t is_rwlock_locked = FALSE;     /* we hold mnt_rwlock exclusive */
	boolean_t did_rele = FALSE;             /* device usecount already dropped on error path */
	boolean_t have_usecount = FALSE;        /* we hold a usecount on the covered vnode */
	boolean_t did_set_lmount = FALSE;       /* MNT_LMOUNT set by us; must be cleared on all exits */
	boolean_t kernelmount = !!(internal_flags & KERNEL_MOUNT_KMOUNT);

#if CONFIG_ROSV_STARTUP || CONFIG_MOUNT_VM || CONFIG_BASESYSTEMROOT
	/* Check for mutually-exclusive flag bits */
	uint32_t checkflags = (internal_flags & (KERNEL_MOUNT_VOLBYROLE_MASK | KERNEL_MOUNT_BASESYSTEMROOT));
	int bitcount = 0;
	/* Kernighan popcount: each iteration clears the lowest set bit. */
	while (checkflags != 0) {
		checkflags &= (checkflags - 1);
		bitcount++;
	}

	if (bitcount > 1) {
		//not allowed to request multiple mount-by-role flags
		error = EINVAL;
		goto out1;
	}
#endif

	/*
	 * Process an update for an existing mount
	 */
	if (flags & MNT_UPDATE) {
		/* Updates must target the root vnode of an existing mount. */
		if ((vp->v_flag & VROOT) == 0) {
			error = EINVAL;
			goto out1;
		}
		mp = vp->v_mount;

		/* if unmount or mount in progress, return error */
		mount_lock_spin(mp);
		if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
			mount_unlock(mp);
			error = EBUSY;
			goto out1;
		}
		mp->mnt_lflag |= MNT_LMOUNT;
		did_set_lmount = TRUE;
		mount_unlock(mp);
		lck_rw_lock_exclusive(&mp->mnt_rwlock);
		is_rwlock_locked = TRUE;
		/*
		 * We only allow the filesystem to be reloaded if it
		 * is currently mounted read-only.
		 */
		if ((flags & MNT_RELOAD) &&
		    ((mp->mnt_flag & MNT_RDONLY) == 0)) {
			error = ENOTSUP;
			goto out1;
		}

		/*
		 * If content protection is enabled, update mounts are not
		 * allowed to turn it off.
		 */
		if ((mp->mnt_flag & MNT_CPROTECT) &&
		    ((flags & MNT_CPROTECT) == 0)) {
			error = EINVAL;
			goto out1;
		}

		/*
		 * can't turn off MNT_REMOVABLE either but it may be an unexpected
		 * failure to return an error for this so we'll just silently
		 * add it if it is not passed in.
		 */
		if ((mp->mnt_flag & MNT_REMOVABLE) &&
		    ((flags & MNT_REMOVABLE) == 0)) {
			flags |= MNT_REMOVABLE;
		}

		/* Can't downgrade the backer of the root FS */
		if ((mp->mnt_kern_flag & MNTK_BACKS_ROOT) &&
		    (!vfs_isrdonly(mp)) && (flags & MNT_RDONLY)) {
			error = ENOTSUP;
			goto out1;
		}

		/*
		 * Only root, or the user that did the original mount is
		 * permitted to update it.
		 */
		if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
		    (error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
			goto out1;
		}
#if CONFIG_MACF
		error = mac_mount_check_remount(ctx, mp);
		if (error != 0) {
			goto out1;
		}
#endif
		/*
		 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV,
		 * and MNT_NOEXEC if mount point is already MNT_NOEXEC.
		 */
		if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
			flags |= MNT_NOSUID | MNT_NODEV;
			if (mp->mnt_flag & MNT_NOEXEC) {
				flags |= MNT_NOEXEC;
			}
		}
		/* Snapshot mnt_flag so a failed update can restore it. */
		flag = mp->mnt_flag;
		flag_set = true;



		mp->mnt_flag |= flags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE);

		vfsp = mp->mnt_vtable;
		goto update;
	} // MNT_UPDATE

	/*
	 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV, and
	 * MNT_NOEXEC if mount point is already MNT_NOEXEC.
	 */
	if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
		flags |= MNT_NOSUID | MNT_NODEV;
		if (vp->v_mount->mnt_flag & MNT_NOEXEC) {
			flags |= MNT_NOEXEC;
		}
	}

	/* XXXAUDIT: Should we capture the type on the error path as well? */
	/* XXX cast-away const (audit_arg_text() does not modify its input) */
	AUDIT_ARG(text, (char *)(uintptr_t)fstypename);
	/* Find the vfstable entry for the requested filesystem type. */
	mount_list_lock();
	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
		if (!strncmp(vfsp->vfc_name, fstypename, MFSNAMELEN)) {
			vfsp->vfc_refcount++;
			vfsp_ref = TRUE;
			break;
		}
	}
	mount_list_unlock();
	if (vfsp == NULL) {
		error = ENODEV;
		goto out1;
	}

	/*
	 * VFC_VFSLOCALARGS is not currently supported for kernel mounts,
	 * except in ROSV configs and for the initial BaseSystem root.
	 */
	if (kernelmount && (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) &&
	    ((internal_flags & KERNEL_MOUNT_VOLBYROLE_MASK) == 0) &&
	    ((internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) == 0)) {
		error = EINVAL; /* unsupported request */
		goto out1;
	}

	/* Flush the covered vnode and atomically mark it with VMOUNT. */
	error = prepare_coveredvp(vp, ctx, cnp, fstypename, internal_flags);
	if (error != 0) {
		goto out1;
	}

	/*
	 * Allocate and initialize the filesystem (mount_t)
	 */
	mp = zalloc_flags(mount_zone, Z_WAITOK | Z_ZERO);
	mntalloc = 1;

	/* Initialize the default IO constraints */
	mp->mnt_maxreadcnt = mp->mnt_maxwritecnt = MAXPHYS;
	mp->mnt_segreadcnt = mp->mnt_segwritecnt = 32;
	mp->mnt_maxsegreadsize = mp->mnt_maxreadcnt;
	mp->mnt_maxsegwritesize = mp->mnt_maxwritecnt;
	mp->mnt_devblocksize = DEV_BSIZE;
	mp->mnt_alignmentmask = PAGE_MASK;
	mp->mnt_ioqueue_depth = MNT_DEFAULT_IOQUEUE_DEPTH;
	mp->mnt_ioscale = 1;
	mp->mnt_ioflags = 0;
	mp->mnt_realrootvp = NULLVP;
	mp->mnt_authcache_ttl = CACHED_LOOKUP_RIGHT_TTL;

	mp->mnt_lflag |= MNT_LMOUNT;
	did_set_lmount = TRUE;

	TAILQ_INIT(&mp->mnt_vnodelist);
	TAILQ_INIT(&mp->mnt_workerqueue);
	TAILQ_INIT(&mp->mnt_newvnodes);
	mount_lock_init(mp);
	lck_rw_lock_exclusive(&mp->mnt_rwlock);
	is_rwlock_locked = TRUE;
	mp->mnt_op = vfsp->vfc_vfsops;
	mp->mnt_vtable = vfsp;
	//mp->mnt_stat.f_type = vfsp->vfc_typenum;
	mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
	strlcpy(mp->mnt_vfsstat.f_fstypename, vfsp->vfc_name, MFSTYPENAMELEN);
	/* Record the mount-on path; fall back to the caller's pathname buffer. */
	do {
		size_t pathlen = MAXPATHLEN;

		if (vn_getpath_ext(vp, pvp, mp->mnt_vfsstat.f_mntonname, &pathlen, VN_GETPATH_FSENTER)) {
			strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);
		}
	} while (0);
	mp->mnt_vnodecovered = vp;
	mp->mnt_vfsstat.f_owner = kauth_cred_getuid(vfs_context_ucred(ctx));
	mp->mnt_throttle_mask = LOWPRI_MAX_NUM_DEV - 1;
	mp->mnt_devbsdunit = 0;
	mp->mnt_mount_id = os_atomic_inc_orig(&mount_unique_id, relaxed);

	/* XXX 3762912 hack to support HFS filesystem 'owner' - filesystem may update later */
	vfs_setowner(mp, KAUTH_UID_NONE, KAUTH_GID_NONE);

	if (kernelmount) {
		mp->mnt_kern_flag |= MNTK_KERNEL_MOUNT;
	}
	if ((internal_flags & KERNEL_MOUNT_PERMIT_UNMOUNT) != 0) {
		mp->mnt_kern_flag |= MNTK_PERMIT_UNMOUNT;
	}

	if (KERNEL_MOUNT_DEVFS & internal_flags) {
		// kernel mounted devfs
		mp->mnt_kern_flag |= MNTK_SYSTEM;
	}

	/* The fresh-mount and MNT_UPDATE paths converge here. */
update:

	/*
	 * Set the mount level flags.
	 */
	if (flags & MNT_RDONLY) {
		mp->mnt_flag |= MNT_RDONLY;
	} else if (mp->mnt_flag & MNT_RDONLY) {
		// disallow read/write upgrades of file systems that
		// had the TYPENAME_OVERRIDE feature set.
		if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
			error = EPERM;
			goto out1;
		}
		mp->mnt_kern_flag |= MNTK_WANTRDWR;
	}
	mp->mnt_flag &= ~(MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
	    MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
	    MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
	    MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME | MNT_STRICTATIME |
	    MNT_QUARANTINE | MNT_CPROTECT);

#if SECURE_KERNEL
#if !CONFIG_MNT_SUID
	/*
	 * On release builds of iOS based platforms, always enforce NOSUID on
	 * all mounts. We do this here because we can catch update mounts as well as
	 * non-update mounts in this case.
	 */
	mp->mnt_flag |= (MNT_NOSUID);
#endif
#endif

	mp->mnt_flag |= flags & (MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
	    MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
	    MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
	    MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME | MNT_STRICTATIME |
	    MNT_QUARANTINE | MNT_CPROTECT);

#if CONFIG_MACF
	if (flags & MNT_MULTILABEL) {
		if (vfsp->vfc_vfsflags & VFC_VFSNOMACLABEL) {
			error = EINVAL;
			goto out1;
		}
		mp->mnt_flag |= MNT_MULTILABEL;
	}
#endif
	/*
	 * Process device path for local file systems if requested.
	 *
	 * Snapshot and mount-by-role mounts do not use this path; they are
	 * passing other opaque data in the device path field.
	 *
	 * Basesystemroot mounts pass a device path to be resolved here,
	 * but it's just a char * already inside the kernel, which
	 * kernel_mount() shoved into a user_addr_t to call us. So for such
	 * mounts we must skip copyin (both of the address and of the string
	 * (in NDINIT).
	 */
	if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS &&
	    !(internal_flags & (KERNEL_MOUNT_SNAPSHOT | KERNEL_MOUNT_VOLBYROLE_MASK))) {
		boolean_t do_copyin_devpath = true;
#if CONFIG_BASESYSTEMROOT
		if (internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) {
			// KERNEL_MOUNT_BASESYSTEMROOT implies subtle behavior worth noting:
			// We have been passed fsmountargs, which is typed as a user_addr_t,
			// but is actually a char ** pointing to a (kernelspace) string.
			// We manually unpack it with a series of casts and dereferences
			// that reverses what was done just above us on the stack in
			// imageboot_pivot_image().
			// After retrieving the path to the dev node (which we will NDINIT
			// in a moment), we pass NULL fsmountargs on to the filesystem.
			_Static_assert(sizeof(char **) == sizeof(fsmountargs), "fsmountargs should fit a (kernel) address");
			char **devnamepp = (char **)fsmountargs;
			char *devnamep = *devnamepp;
			devpath = CAST_USER_ADDR_T(devnamep);
			do_copyin_devpath = false;
			fsmountargs = USER_ADDR_NULL;

			//Now that we have a mp, denote that this mount is for the basesystem.
			mp->mnt_supl_kern_flag |= MNTK_SUPL_BASESYSTEM;
		}
#endif // CONFIG_BASESYSTEMROOT

		if (do_copyin_devpath) {
			if (vfs_context_is64bit(ctx)) {
				if ((error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath)))) {
					goto out1;
				}
				fsmountargs += sizeof(devpath);
			} else {
				user32_addr_t tmp;
				if ((error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp)))) {
					goto out1;
				}
				/* munge into LP64 addr */
				devpath = CAST_USER_ADDR_T(tmp);
				fsmountargs += sizeof(tmp);
			}
		}

		/* Lookup device and authorize access to it */
		if ((devpath)) {
			struct nameidata nd;

			enum uio_seg seg = UIO_USERSPACE;
#if CONFIG_BASESYSTEMROOT
			if (internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) {
				/* devpath is a kernel string in this case; see above. */
				seg = UIO_SYSSPACE;
			}
#endif // CONFIG_BASESYSTEMROOT

			NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW, seg, devpath, ctx);
			if ((error = namei(&nd))) {
				goto out1;
			}

			strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);
			devvp = nd.ni_vp;

			nameidone(&nd);

			if (devvp->v_type != VBLK) {
				error = ENOTBLK;
				goto out2;
			}
			if (major(devvp->v_rdev) >= nblkdev) {
				error = ENXIO;
				goto out2;
			}
			/*
			 * If mount by non-root, then verify that user has necessary
			 * permissions on the device.
			 */
			if (suser(vfs_context_ucred(ctx), NULL) != 0) {
				kauth_action_t accessmode = KAUTH_VNODE_READ_DATA;

				if ((mp->mnt_flag & MNT_RDONLY) == 0) {
					accessmode |= KAUTH_VNODE_WRITE_DATA;
				}
				if ((error = vnode_authorize(devvp, NULL, accessmode, ctx)) != 0) {
					goto out2;
				}
			}
		}
		/* On first mount, preflight and open device */
		if (devpath && ((flags & MNT_UPDATE) == 0)) {
			if ((error = vnode_ref(devvp))) {
				goto out2;
			}
			/*
			 * Disallow multiple mounts of the same device.
			 * Disallow mounting of a device that is currently in use
			 * (except for root, which might share swap device for miniroot).
			 * Flush out any old buffers remaining from a previous use.
			 */
			if ((error = vfs_setmounting(devvp))) {
				vnode_rele(devvp);
				goto out2;
			}

			if (vcount(devvp) > 1 && !(vfs_flags(mp) & MNT_ROOTFS)) {
				error = EBUSY;
				goto out3;
			}
			if ((error = VNOP_FSYNC(devvp, MNT_WAIT, ctx))) {
				error = ENOTBLK;
				goto out3;
			}
			if ((error = buf_invalidateblks(devvp, BUF_WRITE_DATA, 0, 0))) {
				goto out3;
			}

			ronly = (mp->mnt_flag & MNT_RDONLY) != 0;
#if CONFIG_MACF
			error = mac_vnode_check_open(ctx,
			    devvp,
			    ronly ? FREAD : FREAD | FWRITE);
			if (error) {
				goto out3;
			}
#endif /* MAC */
			if ((error = VNOP_OPEN(devvp, ronly ? FREAD : FREAD | FWRITE, ctx))) {
				goto out3;
			}

			mp->mnt_devvp = devvp;
			device_vnode = devvp;
		} else if ((mp->mnt_flag & MNT_RDONLY) &&
		    (mp->mnt_kern_flag & MNTK_WANTRDWR) &&
		    (device_vnode = mp->mnt_devvp)) {
			dev_t dev;
			int maj;
			/*
			 * If upgrade to read-write by non-root, then verify
			 * that user has necessary permissions on the device.
			 */
			vnode_getalways(device_vnode);

			if (suser(vfs_context_ucred(ctx), NULL) &&
			    (error = vnode_authorize(device_vnode, NULL,
			    KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA,
			    ctx)) != 0) {
				vnode_put(device_vnode);
				goto out2;
			}

			/* Tell the device that we're upgrading */
			dev = (dev_t)device_vnode->v_rdev;
			maj = major(dev);

			if ((u_int)maj >= (u_int)nblkdev) {
				panic("Volume mounted on a device with invalid major number.");
			}

			error = bdevsw[maj].d_open(dev, FREAD | FWRITE, S_IFBLK, p);
			vnode_put(device_vnode);
			device_vnode = NULLVP;
			if (error != 0) {
				goto out2;
			}
		}
	} // localargs && !(snapshot | data | vm)

#if CONFIG_MACF
	if ((flags & MNT_UPDATE) == 0) {
		mac_mount_label_init(mp);
		mac_mount_label_associate(ctx, mp);
	}
	if (labelstr) {
		if ((flags & MNT_UPDATE) != 0) {
			error = mac_mount_check_label_update(ctx, mp);
			if (error != 0) {
				goto out3;
			}
		}
	}
#endif
	/*
	 * Mount the filesystem. We already asserted that internal_flags
	 * cannot have more than one mount-by-role bit set.
	 */
	if (internal_flags & KERNEL_MOUNT_SNAPSHOT) {
		error = VFS_IOCTL(mp, VFSIOC_MOUNT_SNAPSHOT,
		    (caddr_t)fsmountargs, 0, ctx);
	} else if (internal_flags & KERNEL_MOUNT_DATAVOL) {
#if CONFIG_ROSV_STARTUP
		/* fsmountargs carries the system volume's mount_t for role mounts. */
		struct mount *origin_mp = (struct mount*)fsmountargs;
		fs_role_mount_args_t frma = {origin_mp, VFS_DATA_ROLE};
		error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
		if (error) {
			printf("MOUNT-BY-ROLE (%d) failed! (%d)", VFS_DATA_ROLE, error);
		} else {
			/* Mark volume associated with system volume */
			mp->mnt_kern_flag |= MNTK_SYSTEM;

			/* Attempt to acquire the mnt_devvp and set it up */
			struct vnode *mp_devvp = NULL;
			if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
				errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
				    0, &mp_devvp, vfs_context_kernel());
				if (!lerr) {
					mp->mnt_devvp = mp_devvp;
					//vnode_lookup took an iocount, need to drop it.
					vnode_put(mp_devvp);
					// now set `device_vnode` to the devvp that was acquired.
					// this is needed in order to ensure vfs_init_io_attributes is invoked.
					// note that though the iocount above was dropped, the mount acquires
					// an implicit reference against the device.
					device_vnode = mp_devvp;
				}
			}
		}
#else
		error = EINVAL;
#endif
	} else if (internal_flags & KERNEL_MOUNT_VMVOL) {
#if CONFIG_MOUNT_VM
		struct mount *origin_mp = (struct mount*)fsmountargs;
		fs_role_mount_args_t frma = {origin_mp, VFS_VM_ROLE};
		error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
		if (error) {
			printf("MOUNT-BY-ROLE (%d) failed! (%d)", VFS_VM_ROLE, error);
		} else {
			/* Mark volume associated with system volume and a swap mount */
			mp->mnt_kern_flag |= (MNTK_SYSTEM | MNTK_SWAP_MOUNT);
			/* Attempt to acquire the mnt_devvp and set it up */
			struct vnode *mp_devvp = NULL;
			if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
				errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
				    0, &mp_devvp, vfs_context_kernel());
				if (!lerr) {
					mp->mnt_devvp = mp_devvp;
					//vnode_lookup took an iocount, need to drop it.
					vnode_put(mp_devvp);

					// now set `device_vnode` to the devvp that was acquired.
					// note that though the iocount above was dropped, the mount acquires
					// an implicit reference against the device.
					device_vnode = mp_devvp;
				}
			}
		}
#else
		error = EINVAL;
#endif
	} else if ((internal_flags & KERNEL_MOUNT_PREBOOTVOL) || (internal_flags & KERNEL_MOUNT_RECOVERYVOL)) {
#if CONFIG_MOUNT_PREBOOTRECOVERY
		struct mount *origin_mp = (struct mount*)fsmountargs;
		uint32_t mount_role = 0;
		if (internal_flags & KERNEL_MOUNT_PREBOOTVOL) {
			mount_role = VFS_PREBOOT_ROLE;
		} else if (internal_flags & KERNEL_MOUNT_RECOVERYVOL) {
			mount_role = VFS_RECOVERY_ROLE;
		}

		if (mount_role != 0) {
			fs_role_mount_args_t frma = {origin_mp, mount_role};
			error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
			if (error) {
				printf("MOUNT-BY-ROLE (%d) failed! (%d)", mount_role, error);
			} else {
				// NOT YET - need to qualify how this interacts with shutdown, ERP/ERB, etc
				/* Mark volume associated with system volume */
				//mp->mnt_kern_flag |= MNTK_SYSTEM;
				/* Attempt to acquire the mnt_devvp and set it up */
				struct vnode *mp_devvp = NULL;
				if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
					errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
					    0, &mp_devvp, vfs_context_kernel());
					if (!lerr) {
						mp->mnt_devvp = mp_devvp;
						//vnode_lookup took an iocount, need to drop it.
						vnode_put(mp_devvp);

						// now set `device_vnode` to the devvp that was acquired.
						// note that though the iocount above was dropped, the mount acquires
						// an implicit reference against the device.
						device_vnode = mp_devvp;
					}
				}
			}
		} else {
			/* NOTE(review): this branch is unreachable (one of the two role
			 * flags is set whenever we get here), and `error` has not been
			 * assigned on this path when the printf reads it. */
			printf("MOUNT-BY-ROLE (%d) failed - ROLE UNRECOGNIZED! (%d)", mount_role, error);
			error = EINVAL;
		}
#else
		error = EINVAL;
#endif
	} else {
		/* Ordinary mount: hand the (possibly NULL) device to the filesystem. */
		error = VFS_MOUNT(mp, device_vnode, fsmountargs, ctx);
	}

	if (flags & MNT_UPDATE) {
		if (mp->mnt_kern_flag & MNTK_WANTRDWR) {
			mp->mnt_flag &= ~MNT_RDONLY;
		}
		mp->mnt_flag &= ~
		    (MNT_UPDATE | MNT_RELOAD | MNT_FORCE);
		mp->mnt_kern_flag &= ~MNTK_WANTRDWR;
		if (error) {
			mp->mnt_flag = flag; /* restore flag value */
		}
		vfs_event_signal(NULL, VQ_UPDATE, (intptr_t)NULL);
		lck_rw_done(&mp->mnt_rwlock);
		is_rwlock_locked = FALSE;
		if (!error) {
			enablequotas(mp, ctx);
		}
		goto exit;
	}

	/*
	 * Put the new filesystem on the mount list after root.
	 */
	if (error == 0) {
		struct vfs_attr vfsattr;
		if (device_vnode) {
			/*
			 * cache the IO attributes for the underlying physical media...
			 * an error return indicates the underlying driver doesn't
			 * support all the queries necessary... however, reasonable
			 * defaults will have been set, so no reason to bail or care
			 *
			 * Need to do this before calling the MAC hook as it needs
			 * information from this call.
			 */
			vfs_init_io_attributes(device_vnode, mp);
		}

#if CONFIG_MACF
		error = mac_mount_check_mount_late(ctx, mp);
		if (error != 0) {
			goto out4;
		}

		if (vfs_flags(mp) & MNT_MULTILABEL) {
			error = VFS_ROOT(mp, &rvp, ctx);
			if (error) {
				printf("%s() VFS_ROOT returned %d\n", __func__, error);
				goto out4;
			}
			error = vnode_label(mp, NULL, rvp, NULL, 0, ctx);
			/*
			 * drop reference provided by VFS_ROOT
			 */
			vnode_put(rvp);

			if (error) {
				goto out4;
			}
		}
#endif /* MAC */

		/* Publish the mount on the covered vnode. */
		vnode_lock_spin(vp);
		CLR(vp->v_flag, VMOUNT);
		vp->v_mountedhere = mp;
		SET(vp->v_flag, VMOUNTEDHERE);
		vnode_unlock(vp);

		/*
		 * taking the name_cache_lock exclusively will
		 * insure that everyone is out of the fast path who
		 * might be trying to use a now stale copy of
		 * vp->v_mountedhere->mnt_realrootvp
		 * bumping mount_generation causes the cached values
		 * to be invalidated
		 */
		name_cache_lock();
		mount_generation++;
		name_cache_unlock();

		error = vnode_ref(vp);
		if (error != 0) {
			goto out4;
		}

		have_usecount = TRUE;

		error = checkdirs(vp, ctx);
		if (error != 0) {
			/* Unmount the filesystem as cdir/rdirs cannot be updated */
			goto out4;
		}
		/*
		 * there is no cleanup code here so I have made it void
		 * we need to revisit this
		 */
		(void)VFS_START(mp, 0, ctx);

		if (mount_list_add(mp) != 0) {
			/*
			 * The system is shutting down trying to umount
			 * everything, so fail with a plausible errno.
			 */
			error = EBUSY;
			goto out4;
		}
		lck_rw_done(&mp->mnt_rwlock);
		is_rwlock_locked = FALSE;

		/* Check if this mounted file system supports EAs or named streams. */
		/* Skip WebDAV file systems for now since they hang in VFS_GETATTR here. */
		VFSATTR_INIT(&vfsattr);
		VFSATTR_WANTED(&vfsattr, f_capabilities);
		if (strncmp(mp->mnt_vfsstat.f_fstypename, "webdav", sizeof("webdav")) != 0 &&
		    vfs_getattr(mp, &vfsattr, ctx) == 0 &&
		    VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR) &&
			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR)) {
				mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
			}
#if NAMEDSTREAMS
			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS) &&
			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS)) {
				mp->mnt_kern_flag |= MNTK_NAMED_STREAMS;
			}
#endif
			/* Check if this file system supports path from id lookups. */
			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID) &&
			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID)) {
				mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
			} else if (mp->mnt_flag & MNT_DOVOLFS) {
				/* Legacy MNT_DOVOLFS flag also implies path from id lookups. */
				mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
			}

			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS) &&
			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS)) {
				mp->mnt_kern_flag |= MNTK_DIR_HARDLINKS;
			}
		}
		if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSNATIVEXATTR) {
			mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
		}
		if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSPREFLIGHT) {
			mp->mnt_kern_flag |= MNTK_UNMOUNT_PREFLIGHT;
		}
		/* increment the operations count */
		OSAddAtomic(1, &vfs_nummntops);
		enablequotas(mp, ctx);

		if (device_vnode) {
			vfs_setmountedon(device_vnode);
		}

		/* Now that mount is setup, notify the listeners */
		vfs_notify_mount(pvp);
		IOBSDMountChange(mp, kIOMountChangeMount);
	} else {
		/* If we fail a fresh mount, there should be no vnodes left hooked into the mountpoint. */
		if (mp->mnt_vnodelist.tqh_first != NULL) {
			panic("mount_common(): mount of %s filesystem failed with %d, but vnode list is not empty.",
			    mp->mnt_vtable->vfc_name, error);
		}

		vnode_lock_spin(vp);
		CLR(vp->v_flag, VMOUNT);
		vnode_unlock(vp);
		mount_list_lock();
		mp->mnt_vtable->vfc_refcount--;
		mount_list_unlock();

		if (device_vnode) {
			vnode_rele(device_vnode);
			VNOP_CLOSE(device_vnode, ronly ? FREAD : FREAD | FWRITE, ctx);
			vfs_clearmounting(device_vnode);
		}
		lck_rw_done(&mp->mnt_rwlock);
		is_rwlock_locked = FALSE;

		if (nc_smr_enabled) {
			vfs_smr_synchronize();
		}

		/*
		 * if we get here, we have a mount structure that needs to be freed,
		 * but since the coveredvp hasn't yet been updated to point at it,
		 * no need to worry about other threads holding a crossref on this mp
		 * so it's ok to just free it
		 */
		mount_lock_destroy(mp);
#if CONFIG_MACF
		mac_mount_label_destroy(mp);
#endif
		zfree(mount_zone, mp);
		did_set_lmount = false;
	}
exit:
	/*
	 * drop I/O count on the device vp if there was one
	 */
	if (devpath && devvp) {
		vnode_put(devvp);
	}

	if (did_set_lmount) {
		mount_lock_spin(mp);
		mp->mnt_lflag &= ~MNT_LMOUNT;
		mount_unlock(mp);
	}

	return error;

	/* Error condition exits */
out4:
	(void)VFS_UNMOUNT(mp, MNT_FORCE, ctx);

	/*
	 * If the mount has been placed on the covered vp,
	 * it may have been discovered by now, so we have
	 * to treat this just like an unmount
	 */
	mount_lock_spin(mp);
	mp->mnt_lflag |= MNT_LDEAD;
	mount_unlock(mp);

	if (device_vnode != NULLVP) {
		vnode_rele(device_vnode);
		VNOP_CLOSE(device_vnode, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD | FWRITE,
		    ctx);
		vfs_clearmounting(device_vnode);
		did_rele = TRUE;
	}

	vnode_lock_spin(vp);

	mp->mnt_crossref++;
	CLR(vp->v_flag, VMOUNTEDHERE);
	vp->v_mountedhere = (mount_t) 0;

	vnode_unlock(vp);

	if (have_usecount) {
		vnode_rele(vp);
	}
out3:
	if (devpath && ((flags & MNT_UPDATE) == 0) && (!did_rele)) {
		vnode_rele(devvp);
		vfs_clearmounting(devvp);
	}
out2:
	if (devpath && devvp) {
		vnode_put(devvp);
	}
out1:
	/* Release mnt_rwlock only when it was taken */
	if (is_rwlock_locked == TRUE) {
		if (flag_set) {
			mp->mnt_flag = flag; /* restore mnt_flag value */
		}
		lck_rw_done(&mp->mnt_rwlock);
	}

	if (did_set_lmount) {
		mount_lock_spin(mp);
		mp->mnt_lflag &= ~MNT_LMOUNT;
		mount_unlock(mp);
	}

	if (mntalloc) {
		if (mp->mnt_crossref) {
			mount_dropcrossref(mp, vp, 0);
		} else {
			if (nc_smr_enabled) {
				vfs_smr_synchronize();
			}

			mount_lock_destroy(mp);
#if CONFIG_MACF
			mac_mount_label_destroy(mp);
#endif
			zfree(mount_zone, mp);
		}
	}
	if (vfsp_ref) {
		mount_list_lock();
		vfsp->vfc_refcount--;
		mount_list_unlock();
	}

	return error;
}
1875
1876 /*
1877 * Flush in-core data, check for competing mount attempts,
1878 * and set VMOUNT
1879 */
1880 int
prepare_coveredvp(vnode_t vp,vfs_context_t ctx,struct componentname * cnp,const char * fsname,uint32_t internal_flags)1881 prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, uint32_t internal_flags)
1882 {
1883 #if !CONFIG_MACF
1884 #pragma unused(cnp,fsname)
1885 #endif
1886 struct vnode_attr va;
1887 int error;
1888 boolean_t skip_auth = !!(internal_flags & KERNEL_MOUNT_NOAUTH);
1889 boolean_t is_fmount = !!(internal_flags & KERNEL_MOUNT_FMOUNT);
1890 boolean_t is_busy;
1891
1892 if (!skip_auth) {
1893 /*
1894 * If the user is not root, ensure that they own the directory
1895 * onto which we are attempting to mount.
1896 */
1897 VATTR_INIT(&va);
1898 VATTR_WANTED(&va, va_uid);
1899 if ((error = vnode_getattr(vp, &va, ctx)) ||
1900 (va.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
1901 (!vfs_context_issuser(ctx)))) {
1902 error = EPERM;
1903 goto out;
1904 }
1905 }
1906
1907 if ((error = VNOP_FSYNC(vp, MNT_WAIT, ctx))) {
1908 goto out;
1909 }
1910
1911 if ((error = buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0))) {
1912 goto out;
1913 }
1914
1915 if (vp->v_type != VDIR) {
1916 error = ENOTDIR;
1917 goto out;
1918 }
1919
1920 vnode_lock_spin(vp);
1921 is_busy = is_fmount ?
1922 (ISSET(vp->v_flag, VMOUNT) || (vp->v_mountedhere != NULL)) :
1923 (ISSET(vp->v_flag, VMOUNT) && (vp->v_mountedhere != NULL));
1924 if (is_busy) {
1925 vnode_unlock(vp);
1926 error = EBUSY;
1927 goto out;
1928 }
1929 SET(vp->v_flag, VMOUNT);
1930 vnode_unlock(vp);
1931
1932 #if CONFIG_MACF
1933 error = mac_mount_check_mount(ctx, vp,
1934 cnp, fsname);
1935 if (error != 0) {
1936 vnode_lock_spin(vp);
1937 CLR(vp->v_flag, VMOUNT);
1938 vnode_unlock(vp);
1939 }
1940 #endif
1941
1942 out:
1943 return error;
1944 }
1945
1946 #if CONFIG_IMGSRC_ACCESS
1947
1948 #define DEBUG_IMGSRC 0
1949
1950 #if DEBUG_IMGSRC
1951 #define IMGSRC_DEBUG(args...) printf("imgsrc: " args)
1952 #else
1953 #define IMGSRC_DEBUG(args...) do { } while(0)
1954 #endif
1955
1956 static int
authorize_devpath_and_update_mntfromname(mount_t mp,user_addr_t devpath,vnode_t * devvpp,vfs_context_t ctx)1957 authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx)
1958 {
1959 struct nameidata nd;
1960 vnode_t vp, realdevvp;
1961 kauth_action_t accessmode;
1962 int error;
1963 enum uio_seg uio = UIO_USERSPACE;
1964
1965 if (ctx == vfs_context_kernel()) {
1966 uio = UIO_SYSSPACE;
1967 }
1968
1969 NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW, uio, devpath, ctx);
1970 if ((error = namei(&nd))) {
1971 IMGSRC_DEBUG("namei() failed with %d\n", error);
1972 return error;
1973 }
1974
1975 vp = nd.ni_vp;
1976
1977 if (!vnode_isblk(vp)) {
1978 IMGSRC_DEBUG("Not block device.\n");
1979 error = ENOTBLK;
1980 goto out;
1981 }
1982
1983 realdevvp = mp->mnt_devvp;
1984 if (realdevvp == NULLVP) {
1985 IMGSRC_DEBUG("No device backs the mount.\n");
1986 error = ENXIO;
1987 goto out;
1988 }
1989
1990 error = vnode_getwithref(realdevvp);
1991 if (error != 0) {
1992 IMGSRC_DEBUG("Coudn't get iocount on device.\n");
1993 goto out;
1994 }
1995
1996 if (vnode_specrdev(vp) != vnode_specrdev(realdevvp)) {
1997 IMGSRC_DEBUG("Wrong dev_t.\n");
1998 error = ENXIO;
1999 goto out1;
2000 }
2001
2002 strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);
2003
2004 /*
2005 * If mount by non-root, then verify that user has necessary
2006 * permissions on the device.
2007 */
2008 if (!vfs_context_issuser(ctx)) {
2009 accessmode = KAUTH_VNODE_READ_DATA;
2010 if ((mp->mnt_flag & MNT_RDONLY) == 0) {
2011 accessmode |= KAUTH_VNODE_WRITE_DATA;
2012 }
2013 if ((error = vnode_authorize(vp, NULL, accessmode, ctx)) != 0) {
2014 IMGSRC_DEBUG("Access denied.\n");
2015 goto out1;
2016 }
2017 }
2018
2019 *devvpp = vp;
2020
2021 out1:
2022 vnode_put(realdevvp);
2023
2024 out:
2025 nameidone(&nd);
2026
2027 if (error) {
2028 vnode_put(vp);
2029 }
2030
2031 return error;
2032 }
2033
2034 /*
2035 * Clear VMOUNT, set v_mountedhere, and mnt_vnodecovered, ref the vnode,
2036 * and call checkdirs()
2037 */
static int
place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx)
{
	int error;

	mp->mnt_vnodecovered = vp; /* XXX This is normally only set at init-time ... */

	IMGSRC_DEBUG("placing: fsname = %s, vp = %s\n",
	    mp->mnt_vtable->vfc_name, vnode_getname(vp));

	/* Publish the mount on the covered vnode under the vnode lock. */
	vnode_lock_spin(vp);
	CLR(vp->v_flag, VMOUNT);
	vp->v_mountedhere = mp;
	SET(vp->v_flag, VMOUNTEDHERE);
	vnode_unlock(vp);

	/*
	 * taking the name_cache_lock exclusively will
	 * insure that everyone is out of the fast path who
	 * might be trying to use a now stale copy of
	 * vp->v_mountedhere->mnt_realrootvp
	 * bumping mount_generation causes the cached values
	 * to be invalidated
	 */
	name_cache_lock();
	mount_generation++;
	name_cache_unlock();

	/* Take a usecount on the covered vnode for the duration of the mount. */
	error = vnode_ref(vp);
	if (error != 0) {
		goto out;
	}

	error = checkdirs(vp, ctx);
	if (error != 0) {
		/* Unmount the filesystem as cdir/rdirs cannot be updated */
		vnode_rele(vp);
		goto out;
	}

out:
	/* On any failure, detach the mount from the covered vnode again. */
	if (error != 0) {
		mp->mnt_vnodecovered = NULLVP;
	}
	return error;
}
2084
/*
 * Reverse the effects of place_mount_and_checkdirs(): drop the usecount
 * reference on the covered vnode, clear its mounted-here state, and detach
 * it from the mount.  The rele is done before taking the vnode lock since
 * vnode_rele() may itself need the vnode lock.
 */
static void
undo_place_on_covered_vp(mount_t mp, vnode_t vp)
{
	vnode_rele(vp);
	vnode_lock_spin(vp);
	CLR(vp->v_flag, (VMOUNT | VMOUNTEDHERE));
	vp->v_mountedhere = (mount_t)NULL;
	vnode_unlock(vp);

	mp->mnt_vnodecovered = NULLVP;
}
2096
2097 static int
mount_begin_update(mount_t mp,vfs_context_t ctx,int flags)2098 mount_begin_update(mount_t mp, vfs_context_t ctx, int flags)
2099 {
2100 int error;
2101
2102 /* unmount in progress return error */
2103 mount_lock_spin(mp);
2104 if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
2105 mount_unlock(mp);
2106 return EBUSY;
2107 }
2108 mount_unlock(mp);
2109 lck_rw_lock_exclusive(&mp->mnt_rwlock);
2110
2111 /*
2112 * We only allow the filesystem to be reloaded if it
2113 * is currently mounted read-only.
2114 */
2115 if ((flags & MNT_RELOAD) &&
2116 ((mp->mnt_flag & MNT_RDONLY) == 0)) {
2117 error = ENOTSUP;
2118 goto out;
2119 }
2120
2121 /*
2122 * Only root, or the user that did the original mount is
2123 * permitted to update it.
2124 */
2125 if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
2126 (!vfs_context_issuser(ctx))) {
2127 error = EPERM;
2128 goto out;
2129 }
2130 #if CONFIG_MACF
2131 error = mac_mount_check_remount(ctx, mp);
2132 if (error != 0) {
2133 goto out;
2134 }
2135 #endif
2136
2137 out:
2138 if (error) {
2139 lck_rw_done(&mp->mnt_rwlock);
2140 }
2141
2142 return error;
2143 }
2144
/*
 * Release the exclusive mount rwlock taken by mount_begin_update().
 */
static void
mount_end_update(mount_t mp)
{
	lck_rw_done(&mp->mnt_rwlock);
}
2150
2151 static int
get_imgsrc_rootvnode(uint32_t height,vnode_t * rvpp)2152 get_imgsrc_rootvnode(uint32_t height, vnode_t *rvpp)
2153 {
2154 vnode_t vp;
2155
2156 if (height >= MAX_IMAGEBOOT_NESTING) {
2157 return EINVAL;
2158 }
2159
2160 vp = imgsrc_rootvnodes[height];
2161 if ((vp != NULLVP) && (vnode_get(vp) == 0)) {
2162 *rvpp = vp;
2163 return 0;
2164 } else {
2165 return ENOENT;
2166 }
2167 }
2168
/*
 * Relocate the imageboot source filesystem (at nesting level "height")
 * so that it covers vnode vp instead of its boot-time location.
 *
 * The caller supplies either an explicit mnt_imgsrc_args structure
 * (by_index) or, for binary compatibility, a bare device path implying
 * height 0.  A mount may only be moved once (MNTK_HAS_MOVED).
 *
 * On success the mount is re-added to the mount list with updated
 * f_mntonname, and a mount notification is posted on pvp.
 */
static int
relocate_imageboot_source(vnode_t pvp, vnode_t vp,
    struct componentname *cnp, const char *fsname, vfs_context_t ctx,
    boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index)
{
	int error;
	mount_t mp;
	boolean_t placed = FALSE;
	struct vfstable *vfsp;
	user_addr_t devpath;
	char *old_mntonname;
	vnode_t rvp;
	vnode_t devvp;
	uint32_t height;
	uint32_t flags;

	/* If we didn't imageboot, nothing to move */
	if (imgsrc_rootvnodes[0] == NULLVP) {
		return EINVAL;
	}

	/* Only root can do this */
	if (!vfs_context_issuser(ctx)) {
		return EPERM;
	}

	IMGSRC_DEBUG("looking for root vnode.\n");

	/*
	 * Get root vnode of filesystem we're moving.
	 */
	if (by_index) {
		/* Copy in a user32/user64 mnt_imgsrc_args as appropriate. */
		if (is64bit) {
			struct user64_mnt_imgsrc_args mia64;
			error = copyin(fsmountargs, &mia64, sizeof(mia64));
			if (error != 0) {
				IMGSRC_DEBUG("Failed to copy in arguments.\n");
				return error;
			}

			height = mia64.mi_height;
			flags = mia64.mi_flags;
			devpath = (user_addr_t)mia64.mi_devpath;
		} else {
			struct user32_mnt_imgsrc_args mia32;
			error = copyin(fsmountargs, &mia32, sizeof(mia32));
			if (error != 0) {
				IMGSRC_DEBUG("Failed to copy in arguments.\n");
				return error;
			}

			height = mia32.mi_height;
			flags = mia32.mi_flags;
			devpath = mia32.mi_devpath;
		}
	} else {
		/*
		 * For binary compatibility--assumes one level of nesting.
		 */
		if (is64bit) {
			if ((error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath)))) {
				return error;
			}
		} else {
			user32_addr_t tmp;
			if ((error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp)))) {
				return error;
			}

			/* munge into LP64 addr */
			devpath = CAST_USER_ADDR_T(tmp);
		}

		height = 0;
		flags = 0;
	}

	/* No mi_flags are currently defined; reject anything nonzero. */
	if (flags != 0) {
		IMGSRC_DEBUG("%s: Got nonzero flags.\n", __FUNCTION__);
		return EINVAL;
	}

	/* rvp comes back with an iocount; released via vnode_put() below. */
	error = get_imgsrc_rootvnode(height, &rvp);
	if (error != 0) {
		IMGSRC_DEBUG("getting old root vnode failed with %d\n", error);
		return error;
	}

	IMGSRC_DEBUG("got old root vnode\n");

	old_mntonname = zalloc_flags(ZV_NAMEI, Z_WAITOK);

	/* Can only move once */
	mp = vnode_mount(rvp);
	if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
		IMGSRC_DEBUG("Already moved.\n");
		error = EBUSY;
		goto out0;
	}

	IMGSRC_DEBUG("moving rvp: fsname = %s\n", mp->mnt_vtable->vfc_name);
	IMGSRC_DEBUG("Starting updated.\n");

	/* Get exclusive rwlock on mount, authorize update on mp */
	error = mount_begin_update(mp, ctx, 0);
	if (error != 0) {
		IMGSRC_DEBUG("Starting updated failed with %d\n", error);
		goto out0;
	}

	/*
	 * It can only be moved once. Flag is set under the rwlock,
	 * so we're now safe to proceed.
	 */
	if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
		IMGSRC_DEBUG("Already moved [2]\n");
		goto out1;
	}

	IMGSRC_DEBUG("Preparing coveredvp.\n");

	/* Mark covered vnode as mount in progress, authorize placing mount on top */
	error = prepare_coveredvp(vp, ctx, cnp, fsname, 0);
	if (error != 0) {
		IMGSRC_DEBUG("Preparing coveredvp failed with %d.\n", error);
		goto out1;
	}

	IMGSRC_DEBUG("Covered vp OK.\n");

	/* Sanity check the name caller has provided */
	vfsp = mp->mnt_vtable;
	if (strncmp(vfsp->vfc_name, fsname, MFSNAMELEN) != 0) {
		IMGSRC_DEBUG("Wrong fs name: actual = %s, expected = %s\n",
		    vfsp->vfc_name, fsname);
		error = EINVAL;
		goto out2;
	}

	/* Check the device vnode and update mount-from name, for local filesystems */
	if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) {
		IMGSRC_DEBUG("Local, doing device validation.\n");

		if (devpath != USER_ADDR_NULL) {
			error = authorize_devpath_and_update_mntfromname(mp, devpath, &devvp, ctx);
			if (error) {
				IMGSRC_DEBUG("authorize_devpath_and_update_mntfromname() failed.\n");
				goto out2;
			}

			/* Only needed the iocount for the validation itself. */
			vnode_put(devvp);
		}
	}

	/*
	 * Place mp on top of vnode, ref the vnode, call checkdirs(),
	 * and increment the name cache's mount generation
	 */

	IMGSRC_DEBUG("About to call place_mount_and_checkdirs().\n");
	error = place_mount_and_checkdirs(mp, vp, ctx);
	if (error != 0) {
		goto out2;
	}

	/* Past this point, cleanup must use undo_place_on_covered_vp(). */
	placed = TRUE;

	strlcpy(old_mntonname, mp->mnt_vfsstat.f_mntonname, MAXPATHLEN);
	strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);

	/* Forbid future moves */
	mount_lock(mp);
	mp->mnt_kern_flag |= MNTK_HAS_MOVED;
	mount_unlock(mp);

	/* Finally, add to mount list, completely ready to go */
	if (mount_list_add(mp) != 0) {
		/*
		 * The system is shutting down trying to umount
		 * everything, so fail with a plausible errno.
		 */
		error = EBUSY;
		goto out3;
	}

	mount_end_update(mp);
	vnode_put(rvp);
	zfree(ZV_NAMEI, old_mntonname);

	vfs_notify_mount(pvp);

	return 0;
out3:
	/* Restore the original mount-on name and allow a future move attempt. */
	strlcpy(mp->mnt_vfsstat.f_mntonname, old_mntonname, MAXPATHLEN);

	mount_lock(mp);
	mp->mnt_kern_flag &= ~(MNTK_HAS_MOVED);
	mount_unlock(mp);

out2:
	/*
	 * Placing the mp on the vnode clears VMOUNT,
	 * so cleanup is different after that point
	 */
	if (placed) {
		/* Rele the vp, clear VMOUNT and v_mountedhere */
		undo_place_on_covered_vp(mp, vp);
	} else {
		vnode_lock_spin(vp);
		CLR(vp->v_flag, VMOUNT);
		vnode_unlock(vp);
	}
out1:
	mount_end_update(mp);

out0:
	vnode_put(rvp);
	zfree(ZV_NAMEI, old_mntonname);
	return error;
}
2389
2390 #endif /* CONFIG_IMGSRC_ACCESS */
2391
/*
 * Turn on disk quotas for an HFS mount if the per-type quota trigger
 * files are present under the mount point.  Errors are deliberately
 * ignored so quota setup cannot interfere with completing the mount.
 */
void
enablequotas(struct mount *mp, vfs_context_t ctx)
{
	struct nameidata qnd;
	int type;
	char qfpath[MAXPATHLEN];
	const char *qfname = QUOTAFILENAME;
	const char *qfopsname = QUOTAOPSNAME;
	const char *qfextension[] = INITQFNAMES;

	/* XXX Should be an MNTK_ flag, instead of strncmp()'s */
	if (strncmp(mp->mnt_vfsstat.f_fstypename, "hfs", sizeof("hfs")) != 0) {
		return;
	}
	/*
	 * Enable filesystem disk quotas if necessary.
	 * We ignore errors as this should not interfere with final mount
	 */
	for (type = 0; type < MAXQUOTAS; type++) {
		/* Probe for the ".quota.ops.<type>" trigger file first. */
		snprintf(qfpath, sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfopsname, qfextension[type]);
		NDINIT(&qnd, LOOKUP, OP_MOUNT, FOLLOW, UIO_SYSSPACE,
		    CAST_USER_ADDR_T(qfpath), ctx);
		if (namei(&qnd) != 0) {
			continue; /* option file to trigger quotas is not present */
		}
		vnode_put(qnd.ni_vp);
		nameidone(&qnd);
		/* Trigger file exists; turn quotas on using the data file path. */
		snprintf(qfpath, sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfname, qfextension[type]);

		(void) VFS_QUOTACTL(mp, QCMD(Q_QUOTAON, type), 0, qfpath, ctx);
	}
	return;
}
2425
2426
/*
 * Per-process callback for checkdirs(): if the process's current or root
 * directory is the newly-covered vnode (olddp), repoint it at the root of
 * the new mount (newdp), transferring usecount references appropriately.
 */
static int
checkdirs_callback(proc_t p, void * arg)
{
	struct cdirargs *cdrp = (struct cdirargs *)arg;
	vnode_t olddp = cdrp->olddp;
	vnode_t newdp = cdrp->newdp;
	struct filedesc *fdp = &p->p_fd;
	vnode_t new_cvp = newdp;
	vnode_t new_rvp = newdp;
	vnode_t old_cvp = NULL;
	vnode_t old_rvp = NULL;

	/*
	 * XXX Also needs to iterate each thread in the process to see if it
	 * XXX is using a per-thread current working directory, and, if so,
	 * XXX update that as well.
	 */

	/*
	 * First, with the proc_fdlock held, check to see if we will need
	 * to do any work. If not, we will get out fast.
	 */
	proc_fdlock(p);
	if (fdp->fd_cdir != olddp && fdp->fd_rdir != olddp) {
		proc_fdunlock(p);
		return PROC_RETURNED;
	}
	proc_fdunlock(p);

	/*
	 * Ok, we will have to do some work. Always take two refs
	 * because we might need that many. We'll dispose of whatever
	 * we ended up not using.
	 */
	if (vnode_ref(newdp) != 0) {
		return PROC_RETURNED;
	}
	if (vnode_ref(newdp) != 0) {
		/* Second ref failed; give back the first before bailing. */
		vnode_rele(newdp);
		return PROC_RETURNED;
	}

	proc_dirs_lock_exclusive(p);
	/*
	 * Now do the work. Note: we dropped the proc_fdlock, so we
	 * have to do all of the checks again.
	 */
	proc_fdlock(p);
	if (fdp->fd_cdir == olddp) {
		old_cvp = olddp;
		fdp->fd_cdir = newdp;
		new_cvp = NULL;
	}
	if (fdp->fd_rdir == olddp) {
		old_rvp = olddp;
		fdp->fd_rdir = newdp;
		new_rvp = NULL;
	}
	proc_fdunlock(p);
	proc_dirs_unlock_exclusive(p);

	/*
	 * Dispose of any references that are no longer needed.
	 * old_* are the displaced vnodes; new_* are the pre-taken refs on
	 * newdp that turned out not to be installed.
	 */
	if (old_cvp != NULL) {
		vnode_rele(old_cvp);
	}
	if (old_rvp != NULL) {
		vnode_rele(old_rvp);
	}
	if (new_cvp != NULL) {
		vnode_rele(new_cvp);
	}
	if (new_rvp != NULL) {
		vnode_rele(new_rvp);
	}

	return PROC_RETURNED;
}
2506
2507
2508
2509 /*
2510 * Scan all active processes to see if any of them have a current
2511 * or root directory onto which the new filesystem has just been
2512 * mounted. If so, replace them with the new mount point.
2513 */
static int
checkdirs(vnode_t olddp, vfs_context_t ctx)
{
	vnode_t newdp;
	vnode_t tvp;
	int err;
	struct cdirargs cdr;

	/*
	 * With only a single usecount outstanding, presumably no process
	 * holds olddp as cwd/root, so there is nothing to update.
	 * NOTE(review): confirm v_usecount may be read here without the
	 * vnode lock.
	 */
	if (olddp->v_usecount == 1) {
		return 0;
	}
	err = VFS_ROOT(olddp->v_mountedhere, &newdp, ctx);

	if (err != 0) {
#if DIAGNOSTIC
		panic("mount: lost mount: error %d", err);
#endif
		return err;
	}

	cdr.olddp = olddp;
	cdr.newdp = newdp;
	/* do not block for exec/fork trans as the vp in cwd & rootdir are not changing */
	proc_iterate(PROC_ALLPROCLIST | PROC_NOWAITTRANS, checkdirs_callback, (void *)&cdr, NULL, NULL);

	/* If the system root itself was covered, swap it under the rw lock. */
	if (rootvnode == olddp) {
		/* NOTE(review): vnode_ref() result is unchecked here. */
		vnode_ref(newdp);
		lck_rw_lock_exclusive(&rootvnode_rw_lock);
		tvp = rootvnode;
		rootvnode = newdp;
		lck_rw_unlock_exclusive(&rootvnode_rw_lock);
		vnode_rele(tvp);
	}

	/* Drop the iocount returned by VFS_ROOT(). */
	vnode_put(newdp);
	return 0;
}
2551
/*
 * Entitlement that lets a process unmount volumes it does not own and
 * perform the path lookup with the kernel context (see unmount() and
 * safedounmount()).
 */
#define ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT \
	"com.apple.private.vfs.role-account-unmount"
2554
2555 /*
2556 * Unmount a file system.
2557 *
2558 * Note: unmount takes a path to the vnode mounted on as argument,
2559 * not special file (as before).
2560 */
2561 /* ARGSUSED */
2562 int
unmount(__unused proc_t p,struct unmount_args * uap,__unused int32_t * retval)2563 unmount(__unused proc_t p, struct unmount_args *uap, __unused int32_t *retval)
2564 {
2565 vnode_t vp;
2566 struct mount *mp;
2567 int error;
2568 struct nameidata nd;
2569 vfs_context_t ctx;
2570
2571 /*
2572 * If the process has the entitlement, use the kernel's context when
2573 * performing lookup on the mount path as the process might lack proper
2574 * permission to access the directory.
2575 */
2576 ctx = IOCurrentTaskHasEntitlement(ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT) ?
2577 vfs_context_kernel() : vfs_context_current();
2578
2579 NDINIT(&nd, LOOKUP, OP_UNMOUNT, FOLLOW | AUDITVNPATH1,
2580 UIO_USERSPACE, uap->path, ctx);
2581 error = namei(&nd);
2582 if (error) {
2583 return error;
2584 }
2585 vp = nd.ni_vp;
2586 mp = vp->v_mount;
2587 nameidone(&nd);
2588
2589 /*
2590 * Must be the root of the filesystem
2591 */
2592 if ((vp->v_flag & VROOT) == 0) {
2593 vnode_put(vp);
2594 return EINVAL;
2595 }
2596 #if CONFIG_MACF
2597 error = mac_mount_check_umount(ctx, mp);
2598 if (error != 0) {
2599 vnode_put(vp);
2600 return error;
2601 }
2602 #endif
2603 mount_ref(mp, 0);
2604 vnode_put(vp);
2605 /* safedounmount consumes the mount ref */
2606 return safedounmount(mp, uap->flags, ctx);
2607 }
2608
2609 int
vfs_unmountbyfsid(fsid_t * fsid,int flags,vfs_context_t ctx)2610 vfs_unmountbyfsid(fsid_t *fsid, int flags, vfs_context_t ctx)
2611 {
2612 mount_t mp;
2613
2614 mp = mount_list_lookupby_fsid(fsid, 0, 1);
2615 if (mp == (mount_t)0) {
2616 return ENOENT;
2617 }
2618 mount_ref(mp, 0);
2619 mount_iterdrop(mp);
2620 /* safedounmount consumes the mount ref */
2621 return safedounmount(mp, flags, ctx);
2622 }
2623
2624 /*
2625 * The mount struct comes with a mount ref which will be consumed.
2626 * Do the actual file system unmount, prevent some common foot shooting.
2627 */
int
safedounmount(struct mount *mp, int flags, vfs_context_t ctx)
{
	int error;
	proc_t p = vfs_context_proc(ctx);

	/*
	 * If the file system is not responding and MNT_NOBLOCK
	 * is set and not a forced unmount then return EBUSY.
	 */
	if ((mp->mnt_kern_flag & MNT_LNOTRESP) &&
	    (flags & MNT_NOBLOCK) && ((flags & MNT_FORCE) == 0)) {
		error = EBUSY;
		goto out;
	}

	/*
	 * Skip authorization in two cases:
	 * - If the process running the unmount has ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT.
	 *   This entitlement allows non-root processes unmount volumes mounted by
	 *   other processes.
	 * - If the mount is tagged as permissive and this is not a forced-unmount
	 *   attempt.
	 */
	if (!IOCurrentTaskHasEntitlement(ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT) &&
	    (!(((mp->mnt_kern_flag & MNTK_PERMIT_UNMOUNT) != 0) && ((flags & MNT_FORCE) == 0)))) {
		/*
		 * Only root, or the user that did the original mount is
		 * permitted to unmount this filesystem.
		 */
		if ((mp->mnt_vfsstat.f_owner != kauth_cred_getuid(kauth_cred_get())) &&
		    (error = suser(kauth_cred_get(), &p->p_acflag))) {
			goto out;
		}
	}
	/*
	 * Don't allow unmounting the root file system, or other volumes
	 * associated with it (for example, the associated VM or DATA mounts) .
	 */
	if ((mp->mnt_flag & MNT_ROOTFS) || (mp->mnt_kern_flag & MNTK_SYSTEM)) {
		if (!(mp->mnt_flag & MNT_ROOTFS)) {
			printf("attempt to unmount a system mount (%s), will return EBUSY\n",
			    mp->mnt_vfsstat.f_mntonname);
		}
		error = EBUSY; /* the root (or associated volumes) is always busy */
		goto out;
	}

	/*
	 * If the mount is providing the root filesystem's disk image
	 * (i.e. imageboot), don't allow unmounting
	 */
	if (mp->mnt_kern_flag & MNTK_BACKS_ROOT) {
		error = EBUSY;
		goto out;
	}

	/* dounmount() inherits and consumes the caller's mount ref (withref=1). */
	return dounmount(mp, flags, 1, ctx);

out:
	/* All refusal paths must still consume the caller's mount ref. */
	mount_drop(mp, 0);
	return error;
}
2691
2692 /*
2693 * Do the actual file system unmount.
2694 */
int
dounmount(struct mount *mp, int flags, int withref, vfs_context_t ctx)
{
	vnode_t coveredvp = (vnode_t)0;
	int error;
	int needwakeup = 0;
	int forcedunmount = 0;
	int lflags = 0;
	struct vnode *devvp = NULLVP;
#if CONFIG_TRIGGERS
	proc_t p = vfs_context_proc(ctx);
	int did_vflush = 0;
	int pflags_save = 0;
#endif /* CONFIG_TRIGGERS */

#if CONFIG_FSE
	if (!(flags & MNT_FORCE)) {
		fsevent_unmount(mp, ctx); /* has to come first! */
	}
#endif

	mount_lock(mp);

	/*
	 * If already an unmount in progress just return EBUSY.
	 * Even a forced unmount cannot override.
	 */
	if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
		if (withref != 0) {
			mount_drop(mp, 1);
		}
		mount_unlock(mp);
		return EBUSY;
	}

	if (flags & MNT_FORCE) {
		forcedunmount = 1;
		mp->mnt_lflag |= MNT_LFORCE;
	}

#if CONFIG_TRIGGERS
	/* Keep this process from hanging on unresponsive remote mounts. */
	if (flags & MNT_NOBLOCK && p != kernproc) {
		pflags_save = OSBitOrAtomic(P_NOREMOTEHANG, &p->p_flag);
	}
#endif

	/* Mark the unmount in progress before dropping the mount lock. */
	mp->mnt_kern_flag |= MNTK_UNMOUNT;
	mp->mnt_lflag |= MNT_LUNMOUNT;
	mp->mnt_flag &= ~MNT_ASYNC;
	/*
	 * anyone currently in the fast path that
	 * trips over the cached rootvp will be
	 * dumped out and forced into the slow path
	 * to regenerate a new cached value
	 */
	mp->mnt_realrootvp = NULLVP;
	mount_unlock(mp);

	if (forcedunmount && (flags & MNT_LNOSUB) == 0) {
		/*
		 * Force unmount any mounts in this filesystem.
		 * If any unmounts fail - just leave them dangling.
		 * Avoids recursion.
		 */
		(void) dounmount_submounts(mp, flags | MNT_LNOSUB, ctx);
	}

	/*
	 * taking the name_cache_lock exclusively will
	 * insure that everyone is out of the fast path who
	 * might be trying to use a now stale copy of
	 * vp->v_mountedhere->mnt_realrootvp
	 * bumping mount_generation causes the cached values
	 * to be invalidated
	 */
	name_cache_lock();
	mount_generation++;
	name_cache_unlock();


	lck_rw_lock_exclusive(&mp->mnt_rwlock);
	if (withref != 0) {
		mount_drop(mp, 0);
	}
	error = 0;
	if (forcedunmount == 0) {
		ubc_umount(mp); /* release cached vnodes */
		if ((mp->mnt_flag & MNT_RDONLY) == 0) {
			error = VFS_SYNC(mp, MNT_WAIT, ctx);
			if (error) {
				/* Sync failed: back out the in-progress markers. */
				mount_lock(mp);
				mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
				mp->mnt_lflag &= ~MNT_LUNMOUNT;
				mp->mnt_lflag &= ~MNT_LFORCE;
				goto out;
			}
		}
	}

	IOBSDMountChange(mp, kIOMountChangeUnmount);

#if CONFIG_TRIGGERS
	vfs_nested_trigger_unmounts(mp, flags, ctx);
	did_vflush = 1;
#endif
	if (forcedunmount) {
		lflags |= FORCECLOSE;
	}
	/* Flush all vnodes of the mount (forced close if MNT_FORCE). */
	error = vflush(mp, NULLVP, SKIPSWAP | SKIPSYSTEM | SKIPROOT | lflags);
	if ((forcedunmount == 0) && error) {
		mount_lock(mp);
		mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
		mp->mnt_lflag &= ~MNT_LUNMOUNT;
		mp->mnt_lflag &= ~MNT_LFORCE;
		goto out;
	}

	/* make sure there are no one in the mount iterations or lookup */
	mount_iterdrain(mp);

	error = VFS_UNMOUNT(mp, flags, ctx);
	if (error) {
		/* Filesystem refused; reopen iteration and back out markers. */
		mount_iterreset(mp);
		mount_lock(mp);
		mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
		mp->mnt_lflag &= ~MNT_LUNMOUNT;
		mp->mnt_lflag &= ~MNT_LFORCE;
		goto out;
	}

	/* increment the operations count */
	if (!error) {
		OSAddAtomic(1, &vfs_nummntops);
	}

	if (mp->mnt_devvp && mp->mnt_vtable->vfc_vfsflags & VFC_VFSLOCALARGS) {
		/* hold an io reference and drop the usecount before close */
		devvp = mp->mnt_devvp;
		vnode_getalways(devvp);
		vnode_rele(devvp);
		VNOP_CLOSE(devvp, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD | FWRITE,
		    ctx);
		vnode_clearmountedon(devvp);
		vnode_put(devvp);
	}
	lck_rw_done(&mp->mnt_rwlock);
	mount_list_remove(mp);
	lck_rw_lock_exclusive(&mp->mnt_rwlock);

	/* mark the mount point hook in the vp but not drop the ref yet */
	if ((coveredvp = mp->mnt_vnodecovered) != NULLVP) {
		/*
		 * The covered vnode needs special handling. Trying to get an
		 * iocount must not block here as this may lead to deadlocks
		 * if the Filesystem to which the covered vnode belongs is
		 * undergoing forced unmounts. Since we hold a usecount, the
		 * vnode cannot be reused (it can, however, still be terminated)
		 */
		vnode_getalways(coveredvp);
		vnode_lock_spin(coveredvp);

		mp->mnt_crossref++;
		coveredvp->v_mountedhere = (struct mount *)0;
		CLR(coveredvp->v_flag, VMOUNT | VMOUNTEDHERE);
		vnode_unlock(coveredvp);
		vnode_put(coveredvp);
	}

	mount_list_lock();
	mp->mnt_vtable->vfc_refcount--;
	mount_list_unlock();

	cache_purgevfs(mp); /* remove cache entries for this file sys */
	vfs_event_signal(NULL, VQ_UNMOUNT, (intptr_t)NULL);
	mount_lock(mp);
	mp->mnt_lflag |= MNT_LDEAD;

	if (mp->mnt_lflag & MNT_LWAIT) {
		/*
		 * do the wakeup here
		 * in case we block in mount_refdrain
		 * which will drop the mount lock
		 * and allow anyone blocked in vfs_busy
		 * to wakeup and see the LDEAD state
		 */
		mp->mnt_lflag &= ~MNT_LWAIT;
		wakeup((caddr_t)mp);
	}
	mount_refdrain(mp);

	/* free disk_conditioner_info structure for this mount */
	disk_conditioner_unmount(mp);

out:
	/* Common exit: reached with the mount lock held on error paths. */
	if (mp->mnt_lflag & MNT_LWAIT) {
		mp->mnt_lflag &= ~MNT_LWAIT;
		needwakeup = 1;
	}

#if CONFIG_TRIGGERS
	if (flags & MNT_NOBLOCK && p != kernproc) {
		// Restore P_NOREMOTEHANG bit to its previous value
		if ((pflags_save & P_NOREMOTEHANG) == 0) {
			OSBitAndAtomic(~((uint32_t) P_NOREMOTEHANG), &p->p_flag);
		}
	}

	/*
	 * Callback and context are set together under the mount lock, and
	 * never cleared, so we're safe to examine them here, drop the lock,
	 * and call out.
	 */
	if (mp->mnt_triggercallback != NULL) {
		mount_unlock(mp);
		if (error == 0) {
			mp->mnt_triggercallback(mp, VTC_RELEASE, mp->mnt_triggerdata, ctx);
		} else if (did_vflush) {
			mp->mnt_triggercallback(mp, VTC_REPLACE, mp->mnt_triggerdata, ctx);
		}
	} else {
		mount_unlock(mp);
	}
#else
	mount_unlock(mp);
#endif /* CONFIG_TRIGGERS */

	lck_rw_done(&mp->mnt_rwlock);

	if (needwakeup) {
		wakeup((caddr_t)mp);
	}

	if (!error) {
		if ((coveredvp != NULLVP)) {
			vnode_t pvp = NULLVP;

			/*
			 * The covered vnode needs special handling. Trying to
			 * get an iocount must not block here as this may lead
			 * to deadlocks if the Filesystem to which the covered
			 * vnode belongs is undergoing forced unmounts. Since we
			 * hold a usecount, the vnode cannot be reused
			 * (it can, however, still be terminated).
			 */
			vnode_getalways(coveredvp);

			mount_dropcrossref(mp, coveredvp, 0);
			/*
			 * We'll _try_ to detect if this really needs to be
			 * done. The coveredvp can only be in termination (or
			 * terminated) if the coveredvp's mount point is in a
			 * forced unmount (or has been) since we still hold the
			 * ref.
			 */
			if (!vnode_isrecycled(coveredvp)) {
				pvp = vnode_getparent(coveredvp);
#if CONFIG_TRIGGERS
				if (coveredvp->v_resolve) {
					vnode_trigger_rearm(coveredvp, ctx);
				}
#endif
			}

			vnode_rele(coveredvp);
			vnode_put(coveredvp);
			coveredvp = NULLVP;

			/* Post a write event on the parent directory, if any. */
			if (pvp) {
				lock_vnode_and_post(pvp, NOTE_WRITE);
				vnode_put(pvp);
			}
		} else if (mp->mnt_flag & MNT_ROOTFS) {
			/* Root mount has no covered vnode: tear down mp directly. */
			if (nc_smr_enabled) {
				vfs_smr_synchronize();
			}

			mount_lock_destroy(mp);
#if CONFIG_MACF
			mac_mount_label_destroy(mp);
#endif
			zfree(mount_zone, mp);
		} else {
			panic("dounmount: no coveredvp");
		}
	}
	return error;
}
2982
2983 /*
2984 * Unmount any mounts in this filesystem.
2985 */
void
dounmount_submounts(struct mount *mp, int flags, vfs_context_t ctx)
{
	mount_t smp;
	fsid_t *fsids, fsid;
	int fsids_sz;
	int count = 0, i, m = 0;
	vnode_t vp;

	mount_list_lock();

	// Get an array to hold the submounts fsids.
	TAILQ_FOREACH(smp, &mountlist, mnt_list)
	count++;
	fsids_sz = count * sizeof(fsid_t);
	/* Z_NOWAIT: we hold mount_list_lock and must not block for memory. */
	fsids = kalloc_data(fsids_sz, Z_NOWAIT);
	if (fsids == NULL) {
		mount_list_unlock();
		goto out;
	}
	fsids[0] = mp->mnt_vfsstat.f_fsid; // Prime the pump

	/*
	 * Fill the array with submount fsids.
	 * Since mounts are always added to the tail of the mount list, the
	 * list is always in mount order.
	 * For each mount check if the mounted-on vnode belongs to a
	 * mount that's already added to our array of mounts to be unmounted.
	 */
	for (smp = TAILQ_NEXT(mp, mnt_list); smp; smp = TAILQ_NEXT(smp, mnt_list)) {
		vp = smp->mnt_vnodecovered;
		if (vp == NULL) {
			continue;
		}
		fsid = vnode_mount(vp)->mnt_vfsstat.f_fsid; // Underlying fsid
		for (i = 0; i <= m; i++) {
			if (fsids[i].val[0] == fsid.val[0] &&
			    fsids[i].val[1] == fsid.val[1]) {
				/* Covered by a known mount: smp is a (transitive) submount. */
				fsids[++m] = smp->mnt_vfsstat.f_fsid;
				break;
			}
		}
	}
	mount_list_unlock();

	// Unmount the submounts in reverse order. Ignore errors.
	for (i = m; i > 0; i--) {
		smp = mount_list_lookupby_fsid(&fsids[i], 0, 1);
		if (smp) {
			mount_ref(smp, 0);
			mount_iterdrop(smp);
			/* dounmount() consumes the mount ref (withref=1). */
			(void) dounmount(smp, flags, 1, ctx);
		}
	}
out:
	kfree_data(fsids, fsids_sz);
}
3043
/*
 * Drop one crossref on mp taken against the covered vnode dp.  If this was
 * the last crossref and dp no longer points at mp, the mount structure is
 * destroyed here.  When need_put is set, also release the caller's iocount
 * on dp while its lock is held.
 */
void
mount_dropcrossref(mount_t mp, vnode_t dp, int need_put)
{
	/* Hold dp across the unlock/free dance below. */
	vnode_hold(dp);
	vnode_lock(dp);
	mp->mnt_crossref--;

	if (mp->mnt_crossref < 0) {
		panic("mount cross refs -ve");
	}

	/* Last crossref gone and dp detached from mp: free the mount. */
	if ((mp != dp->v_mountedhere) && (mp->mnt_crossref == 0)) {
		if (need_put) {
			vnode_put_locked(dp);
		}
		vnode_drop_and_unlock(dp);

		if (nc_smr_enabled) {
			vfs_smr_synchronize();
		}

		mount_lock_destroy(mp);
#if CONFIG_MACF
		mac_mount_label_destroy(mp);
#endif
		zfree(mount_zone, mp);
		return;
	}
	if (need_put) {
		vnode_put_locked(dp);
	}
	vnode_drop_and_unlock(dp);
}
3077
3078
3079 /*
3080 * Sync each mounted filesystem.
3081 */
#if DIAGNOSTIC
int syncprt = 0;        /* when nonzero, sync() dumps buffer stats via vfs_bufstats() */
#endif

int print_vmpage_stat = 0;      /* when nonzero, sync paths report dirty-page counts */
3087
3088 /*
3089 * sync_callback: simple wrapper that calls VFS_SYNC() on volumes
3090 * mounted read-write with the passed waitfor value.
3091 *
3092 * Parameters: mp mount-point descriptor per mounted file-system instance.
3093 * arg user argument (please see below)
3094 *
3095 * User argument is a pointer to 32 bit unsigned integer which describes the
3096 * type of waitfor value to set for calling VFS_SYNC(). If user argument is
3097 * passed as NULL, VFS_SYNC() is called with MNT_NOWAIT set as the default
3098 * waitfor value.
3099 *
3100 * Returns: VFS_RETURNED
3101 */
3102 static int
sync_callback(mount_t mp,void * arg)3103 sync_callback(mount_t mp, void *arg)
3104 {
3105 if ((mp->mnt_flag & MNT_RDONLY) == 0) {
3106 int asyncflag = mp->mnt_flag & MNT_ASYNC;
3107 unsigned waitfor = MNT_NOWAIT;
3108
3109 if (arg) {
3110 waitfor = *(uint32_t*)arg;
3111 }
3112
3113 /* Sanity check for flags - these are the only valid combinations for the flag bits*/
3114 if (waitfor != MNT_WAIT &&
3115 waitfor != (MNT_WAIT | MNT_VOLUME) &&
3116 waitfor != MNT_NOWAIT &&
3117 waitfor != (MNT_NOWAIT | MNT_VOLUME) &&
3118 waitfor != MNT_DWAIT &&
3119 waitfor != (MNT_DWAIT | MNT_VOLUME)) {
3120 panic("Passed inappropriate waitfor %u to "
3121 "sync_callback()", waitfor);
3122 }
3123
3124 mp->mnt_flag &= ~MNT_ASYNC;
3125 (void)VFS_SYNC(mp, waitfor, vfs_context_kernel());
3126 if (asyncflag) {
3127 mp->mnt_flag |= MNT_ASYNC;
3128 }
3129 }
3130
3131 return VFS_RETURNED;
3132 }
3133
3134 /* ARGSUSED */
/*
 * sync(2): non-blocking, best-effort sync of every mounted read-write
 * filesystem.  Always returns 0.
 */
int
sync(__unused proc_t p, __unused struct sync_args *uap, __unused int32_t *retval)
{
	/* NULL arg => sync_callback uses MNT_NOWAIT. */
	vfs_iterate(LK_NOWAIT, sync_callback, NULL);

	if (print_vmpage_stat) {
		vm_countdirtypages();
	}

#if DIAGNOSTIC
	if (syncprt) {
		vfs_bufstats();
	}
#endif /* DIAGNOSTIC */
	return 0;
}
3151
/* Media-reliability filter used by sync_internal_callback(). */
typedef enum {
	SYNC_ALL = 0,
	SYNC_ONLY_RELIABLE_MEDIA = 1,
	SYNC_ONLY_UNRELIABLE_MEDIA = 2
} sync_type_t;
3157
3158 static int
sync_internal_callback(mount_t mp,void * arg)3159 sync_internal_callback(mount_t mp, void *arg)
3160 {
3161 if (arg) {
3162 int is_reliable = !(mp->mnt_kern_flag & MNTK_VIRTUALDEV) &&
3163 (mp->mnt_flag & MNT_LOCAL);
3164 sync_type_t sync_type = *((sync_type_t *)arg);
3165
3166 if ((sync_type == SYNC_ONLY_RELIABLE_MEDIA) && !is_reliable) {
3167 return VFS_RETURNED;
3168 } else if ((sync_type == SYNC_ONLY_UNRELIABLE_MEDIA) && is_reliable) {
3169 return VFS_RETURNED;
3170 }
3171 }
3172
3173 (void)sync_callback(mp, NULL);
3174
3175 return VFS_RETURNED;
3176 }
3177
int sync_thread_state = 0;      /* SYNC_THREAD_* bits; protected by sync_mtx_lck */
int sync_timeout_seconds = 5;   /* max time sync_internal() waits for the sync thread */

#define SYNC_THREAD_RUN         0x0001  /* a sync pass has been requested */
#define SYNC_THREAD_RUNNING     0x0002  /* the sync thread currently exists */

#if CONFIG_PHYS_WRITE_ACCT
thread_t pm_sync_thread;        /* identity of the running sync thread */
#endif /* CONFIG_PHYS_WRITE_ACCT */
3187
/*
 * Worker thread for sync_internal(): repeatedly performs a full sync
 * (reliable media first, then unreliable) for as long as SYNC_THREAD_RUN
 * keeps being re-requested, then wakes any waiters and exits.
 */
static void
sync_thread(__unused void *arg, __unused wait_result_t wr)
{
	sync_type_t sync_type;
#if CONFIG_PHYS_WRITE_ACCT
	pm_sync_thread = current_thread();
#endif /* CONFIG_PHYS_WRITE_ACCT */

	lck_mtx_lock(&sync_mtx_lck);
	while (sync_thread_state & SYNC_THREAD_RUN) {
		/* Consume the request; a new one may arrive while we sync. */
		sync_thread_state &= ~SYNC_THREAD_RUN;
		lck_mtx_unlock(&sync_mtx_lck);

		sync_type = SYNC_ONLY_RELIABLE_MEDIA;
		vfs_iterate(LK_NOWAIT, sync_internal_callback, &sync_type);
		sync_type = SYNC_ONLY_UNRELIABLE_MEDIA;
		vfs_iterate(LK_NOWAIT, sync_internal_callback, &sync_type);

		lck_mtx_lock(&sync_mtx_lck);
	}
	/*
	 * This wakeup _has_ to be issued before the lock is released otherwise
	 * we may end up waking up a thread in sync_internal which is
	 * expecting a wakeup from a thread it just created and not from this
	 * thread which is about to exit.
	 */
	wakeup(&sync_thread_state);
	sync_thread_state &= ~SYNC_THREAD_RUNNING;
#if CONFIG_PHYS_WRITE_ACCT
	pm_sync_thread = NULL;
#endif /* CONFIG_PHYS_WRITE_ACCT */
	lck_mtx_unlock(&sync_mtx_lck);

	if (print_vmpage_stat) {
		vm_countdirtypages();
	}

#if DIAGNOSTIC
	if (syncprt) {
		vfs_bufstats();
	}
#endif /* DIAGNOSTIC */
}
3231
/* Last time a sync timeout was logged; rate-limits the message (one per 120s). */
struct timeval sync_timeout_last_print = {.tv_sec = 0, .tv_usec = 0};
3233
3234 /*
3235 * An in-kernel sync for power management to call.
3236 * This function always returns within sync_timeout seconds.
3237 */
3238 __private_extern__ int
sync_internal(void)3239 sync_internal(void)
3240 {
3241 thread_t thd = NULL;
3242 int error;
3243 int thread_created = FALSE;
3244 struct timespec ts = {.tv_sec = sync_timeout_seconds, .tv_nsec = 0};
3245
3246 lck_mtx_lock(&sync_mtx_lck);
3247 sync_thread_state |= SYNC_THREAD_RUN;
3248 if (!(sync_thread_state & SYNC_THREAD_RUNNING)) {
3249 int kr;
3250
3251 sync_thread_state |= SYNC_THREAD_RUNNING;
3252 kr = kernel_thread_start(sync_thread, NULL, &thd);
3253 if (kr != KERN_SUCCESS) {
3254 sync_thread_state &= ~SYNC_THREAD_RUNNING;
3255 lck_mtx_unlock(&sync_mtx_lck);
3256 printf("sync_thread failed\n");
3257 return 0;
3258 }
3259 thread_created = TRUE;
3260 }
3261
3262 error = msleep((caddr_t)&sync_thread_state, &sync_mtx_lck,
3263 (PVFS | PDROP | PCATCH), "sync_thread", &ts);
3264 if (error) {
3265 struct timeval now;
3266
3267 microtime(&now);
3268 if (now.tv_sec - sync_timeout_last_print.tv_sec > 120) {
3269 printf("sync timed out: %d sec\n", sync_timeout_seconds);
3270 sync_timeout_last_print.tv_sec = now.tv_sec;
3271 }
3272 }
3273
3274 if (thread_created) {
3275 thread_deallocate(thd);
3276 }
3277
3278 return 0;
3279 } /* end of sync_internal call */
3280
3281 /*
3282 * Change filesystem quotas.
3283 */
3284 #if QUOTA
/*
 * quotactl(2): manipulate quotas for the filesystem containing uap->path.
 * The interpretation of uap->arg depends on the sub-command encoded in
 * uap->cmd (see the first switch below).
 */
int
quotactl(proc_t p, struct quotactl_args *uap, __unused int32_t *retval)
{
	struct mount *mp;
	int error, quota_cmd, quota_status = 0;
	caddr_t datap;
	size_t fnamelen;
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();
	struct dqblk my_dqblk = {};

	AUDIT_ARG(uid, uap->uid);
	AUDIT_ARG(cmd, uap->cmd);
	NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
	    uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	/* Hold the mount (not the vnode) across the quota operation. */
	mp = nd.ni_vp->v_mount;
	mount_ref(mp, 0);
	vnode_put(nd.ni_vp);
	nameidone(&nd);

#if CONFIG_MACF
	error = mac_mount_check_quotactl(ctx, mp, uap->cmd, uap->uid);
	if (error != 0) {
		goto out;
	}
#endif

	/* copyin any data we will need for downstream code */
	quota_cmd = uap->cmd >> SUBCMDSHIFT;

	switch (quota_cmd) {
	case Q_QUOTAON:
		/* uap->arg specifies a file from which to take the quotas */
		fnamelen = MAXPATHLEN;
		datap = zalloc(ZV_NAMEI);
		error = copyinstr(uap->arg, datap, MAXPATHLEN, &fnamelen);
		break;
	case Q_GETQUOTA:
		/* uap->arg is a pointer to a dqblk structure. */
		datap = (caddr_t) &my_dqblk;
		break;
	case Q_SETQUOTA:
	case Q_SETUSE:
		/* uap->arg is a pointer to a dqblk structure. */
		datap = (caddr_t) &my_dqblk;
		if (proc_is64bit(p)) {
			/* 64-bit callers pass user_dqblk; munge to kernel layout. */
			struct user_dqblk my_dqblk64;
			error = copyin(uap->arg, (caddr_t)&my_dqblk64, sizeof(my_dqblk64));
			if (error == 0) {
				munge_dqblk(&my_dqblk, &my_dqblk64, FALSE);
			}
		} else {
			error = copyin(uap->arg, (caddr_t)&my_dqblk, sizeof(my_dqblk));
		}
		break;
	case Q_QUOTASTAT:
		/* uap->arg is a pointer to an integer */
		datap = (caddr_t) &quota_status;
		break;
	default:
		datap = NULL;
		break;
	} /* switch */

	if (error == 0) {
		error = VFS_QUOTACTL(mp, uap->cmd, uap->uid, datap, ctx);
	}

	/* Copy results back out (or release buffers) per sub-command. */
	switch (quota_cmd) {
	case Q_QUOTAON:
		if (datap != NULL) {
			zfree(ZV_NAMEI, datap);
		}
		break;
	case Q_GETQUOTA:
		/* uap->arg is a pointer to a dqblk structure we need to copy out to */
		if (error == 0) {
			if (proc_is64bit(p)) {
				struct user_dqblk my_dqblk64;

				memset(&my_dqblk64, 0, sizeof(my_dqblk64));
				munge_dqblk(&my_dqblk, &my_dqblk64, TRUE);
				error = copyout((caddr_t)&my_dqblk64, uap->arg, sizeof(my_dqblk64));
			} else {
				error = copyout(datap, uap->arg, sizeof(struct dqblk));
			}
		}
		break;
	case Q_QUOTASTAT:
		/* uap->arg is a pointer to an integer */
		if (error == 0) {
			error = copyout(datap, uap->arg, sizeof(quota_status));
		}
		break;
	default:
		break;
	} /* switch */

out:
	mount_drop(mp, 0);
	return error;
}
3391 #else
int
quotactl(__unused proc_t p, __unused struct quotactl_args *uap, __unused int32_t *retval)
{
	/* Quota support was not compiled into this kernel (QUOTA off). */
	return EOPNOTSUPP;
}
3397 #endif /* QUOTA */
3398
3399 static int
statfs_internal(proc_t p,struct mount * mp,user_addr_t bufp)3400 statfs_internal(proc_t p, struct mount *mp, user_addr_t bufp)
3401 {
3402 int error;
3403 vfs_context_t ctx = vfs_context_current();
3404
3405 #if CONFIG_MACF
3406 error = mac_mount_check_stat(ctx, mp);
3407 if (error != 0) {
3408 return error;
3409 }
3410 #endif
3411
3412 error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT);
3413 if (error != 0) {
3414 return error;
3415 }
3416
3417 return munge_statfs(mp, &mp->mnt_vfsstat, bufp, NULL, IS_64BIT_PROCESS(p), TRUE);
3418 }
3419
3420 /*
3421 * Get filesystem statistics.
3422 *
3423 * Returns: 0 Success
3424 * namei:???
3425 * vfs_update_vfsstat:???
3426 * munge_statfs:EFAULT
3427 */
3428 /* ARGSUSED */
3429 int
statfs(proc_t p,struct statfs_args * uap,__unused int32_t * retval)3430 statfs(proc_t p, struct statfs_args *uap, __unused int32_t *retval)
3431 {
3432 int error;
3433 struct mount *mp;
3434 struct nameidata nd;
3435 vfs_context_t ctx = vfs_context_current();
3436 vnode_t vp;
3437
3438 NDINIT(&nd, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
3439 UIO_USERSPACE, uap->path, ctx);
3440 error = namei(&nd);
3441 if (error != 0) {
3442 return error;
3443 }
3444 vp = nd.ni_vp;
3445 mp = vp->v_mount;
3446 nameidone(&nd);
3447
3448 error = statfs_internal(p, mp, uap->buf);
3449 vnode_put(vp);
3450
3451 return error;
3452 }
3453
3454 /*
3455 * Get filesystem statistics.
3456 */
/* ARGSUSED */
int
fstatfs(proc_t p, struct fstatfs_args *uap, __unused int32_t *retval)
{
	int error;
	vnode_t vp = NULL;
	struct mount *mp;

	AUDIT_ARG(fd, uap->fd);

	/* Resolve the fd to its vnode and take an iocount on it. */
	if ((error = file_vnode(uap->fd, &vp)) ||
	    (error = vnode_getwithref(vp))) {
		goto out;
	}

	AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);

	mp = vp->v_mount;
	if (!mp) {
		error = EBADF;
		goto out_vnode;
	}

	error = statfs_internal(p, mp, uap->buf);

out_vnode:
	vnode_put(vp);

out:
	/* vp is non-NULL exactly when file_vnode() succeeded (fd is held). */
	if (vp != NULL) {
		file_drop(uap->fd);
	}

	return error;
}
3492
3493 void
vfs_get_statfs64(struct mount * mp,struct statfs64 * sfs)3494 vfs_get_statfs64(struct mount *mp, struct statfs64 *sfs)
3495 {
3496 struct vfsstatfs *vsfs = &mp->mnt_vfsstat;
3497
3498 bzero(sfs, sizeof(*sfs));
3499
3500 sfs->f_bsize = vsfs->f_bsize;
3501 sfs->f_iosize = (int32_t)vsfs->f_iosize;
3502 sfs->f_blocks = vsfs->f_blocks;
3503 sfs->f_bfree = vsfs->f_bfree;
3504 sfs->f_bavail = vsfs->f_bavail;
3505 sfs->f_files = vsfs->f_files;
3506 sfs->f_ffree = vsfs->f_ffree;
3507 sfs->f_fsid = vsfs->f_fsid;
3508 sfs->f_owner = vsfs->f_owner;
3509 sfs->f_type = mp->mnt_vtable->vfc_typenum;
3510 sfs->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
3511 sfs->f_fssubtype = vsfs->f_fssubtype;
3512 sfs->f_flags_ext = 0;
3513 if (mp->mnt_kern_flag & MNTK_SYSTEMDATA) {
3514 sfs->f_flags_ext |= MNT_EXT_ROOT_DATA_VOL;
3515 }
3516 if (mp->mnt_kern_flag & MNTK_FSKIT) {
3517 sfs->f_flags_ext |= MNT_EXT_FSKIT;
3518 }
3519 vfs_getfstypename(mp, sfs->f_fstypename, MFSTYPENAMELEN);
3520 strlcpy(&sfs->f_mntonname[0], &vsfs->f_mntonname[0], MAXPATHLEN);
3521 strlcpy(&sfs->f_mntfromname[0], &vsfs->f_mntfromname[0], MAXPATHLEN);
3522 }
3523
3524 /*
3525 * Get file system statistics in 64-bit mode
3526 */
3527 int
statfs64(__unused struct proc * p,struct statfs64_args * uap,__unused int32_t * retval)3528 statfs64(__unused struct proc *p, struct statfs64_args *uap, __unused int32_t *retval)
3529 {
3530 struct mount *mp;
3531 int error;
3532 struct nameidata *ndp;
3533 struct statfs64 *sfsp;
3534 vfs_context_t ctxp = vfs_context_current();
3535 vnode_t vp;
3536 struct {
3537 struct nameidata nd;
3538 struct statfs64 sfs;
3539 } *__nameidata_statfs64;
3540
3541 __nameidata_statfs64 = kalloc_type(typeof(*__nameidata_statfs64),
3542 Z_WAITOK);
3543 ndp = &__nameidata_statfs64->nd;
3544
3545 NDINIT(ndp, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
3546 UIO_USERSPACE, uap->path, ctxp);
3547 error = namei(ndp);
3548 if (error != 0) {
3549 goto out;
3550 }
3551 vp = ndp->ni_vp;
3552 mp = vp->v_mount;
3553 nameidone(ndp);
3554
3555 #if CONFIG_MACF
3556 error = mac_mount_check_stat(ctxp, mp);
3557 if (error != 0) {
3558 vnode_put(vp);
3559 goto out;
3560 }
3561 #endif
3562
3563 error = vfs_update_vfsstat(mp, ctxp, VFS_USER_EVENT);
3564 if (error != 0) {
3565 vnode_put(vp);
3566 goto out;
3567 }
3568
3569 sfsp = &__nameidata_statfs64->sfs;
3570 vfs_get_statfs64(mp, sfsp);
3571 if ((mp->mnt_kern_flag & MNTK_SYSTEMDATA) &&
3572 (p->p_vfs_iopolicy & P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME)) {
3573 /* This process does not want to see a seperate data volume mountpoint */
3574 strlcpy(&sfsp->f_mntonname[0], "/", sizeof("/"));
3575 }
3576 error = copyout(sfsp, uap->buf, sizeof(*sfsp));
3577 vnode_put(vp);
3578
3579 out:
3580 kfree_type(typeof(*__nameidata_statfs64), __nameidata_statfs64);
3581
3582 return error;
3583 }
3584
3585 /*
3586 * Get file system statistics in 64-bit mode
3587 */
3588 int
fstatfs64(__unused struct proc * p,struct fstatfs64_args * uap,__unused int32_t * retval)3589 fstatfs64(__unused struct proc *p, struct fstatfs64_args *uap, __unused int32_t *retval)
3590 {
3591 struct vnode *vp;
3592 struct mount *mp;
3593 struct statfs64 sfs;
3594 int error;
3595
3596 AUDIT_ARG(fd, uap->fd);
3597
3598 if ((error = file_vnode(uap->fd, &vp))) {
3599 return error;
3600 }
3601
3602 error = vnode_getwithref(vp);
3603 if (error) {
3604 file_drop(uap->fd);
3605 return error;
3606 }
3607
3608 AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
3609
3610 mp = vp->v_mount;
3611 if (!mp) {
3612 error = EBADF;
3613 goto out;
3614 }
3615
3616 #if CONFIG_MACF
3617 error = mac_mount_check_stat(vfs_context_current(), mp);
3618 if (error != 0) {
3619 goto out;
3620 }
3621 #endif
3622
3623 if ((error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)) != 0) {
3624 goto out;
3625 }
3626
3627 vfs_get_statfs64(mp, &sfs);
3628 if ((mp->mnt_kern_flag & MNTK_SYSTEMDATA) &&
3629 (p->p_vfs_iopolicy & P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME)) {
3630 /* This process does not want to see a seperate data volume mountpoint */
3631 strlcpy(&sfs.f_mntonname[0], "/", sizeof("/"));
3632 }
3633 error = copyout(&sfs, uap->buf, sizeof(sfs));
3634
3635 out:
3636 file_drop(uap->fd);
3637 vnode_put(vp);
3638
3639 return error;
3640 }
3641
/* Shared context between getfsstat()/getfsstat64() and their vfs_iterate callbacks. */
struct getfsstat_struct {
	user_addr_t sfsp;       /* user-buffer cursor for the next statfs record */
	user_addr_t *mp;        /* optional array of user MAC-label buffers, or NULL */
	int count;              /* mounts visited so far (copied out or not) */
	int maxcount;           /* capacity of the user buffer, in records */
	int flags;              /* caller's MNT_NOWAIT/MNT_WAIT/MNT_DWAIT flags */
	int error;              /* first error hit by the callback, if any */
};
3650
3651
/*
 * vfs_iterate() callback for __mac_getfsstat(): copy one mount's statfs
 * record (and optional MAC label) out to the user buffer.  Returns
 * VFS_RETURNED to keep iterating, VFS_RETURNED_DONE to stop on error.
 */
static int
getfsstat_callback(mount_t mp, void * arg)
{
	struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
	struct vfsstatfs *sp;
	int error, my_size;
	vfs_context_t ctx = vfs_context_current();

	if (fstp->sfsp && fstp->count < fstp->maxcount) {
#if CONFIG_MACF
		error = mac_mount_check_stat(ctx, mp);
		if (error != 0) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
#endif
		sp = &mp->mnt_vfsstat;
		/*
		 * If MNT_NOWAIT is specified, do not refresh the
		 * fsstat cache. MNT_WAIT/MNT_DWAIT overrides MNT_NOWAIT.
		 */
		if ((mp->mnt_lflag & MNT_LDEAD) ||
		    (((fstp->flags & MNT_NOWAIT) == 0 || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
		    (!(mp->mnt_lflag & MNT_LUNMOUNT)) &&
		    (error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT)))) {
			KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
			return VFS_RETURNED;
		}

		/*
		 * Need to handle LP64 version of struct statfs
		 */
		error = munge_statfs(mp, sp, fstp->sfsp, &my_size, IS_64BIT_PROCESS(vfs_context_proc(ctx)), FALSE);
		if (error) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
		/* Advance by the (32- or 64-bit) record size munge_statfs reported. */
		fstp->sfsp += my_size;

		if (fstp->mp) {
			/* Also hand back the mount's MAC label, one per record. */
#if CONFIG_MACF
			error = mac_mount_label_get(mp, *fstp->mp);
			if (error) {
				fstp->error = error;
				return VFS_RETURNED_DONE;
			}
#endif
			fstp->mp++;
		}
	}
	/* Counted even when not copied out, so the caller learns the total. */
	fstp->count++;
	return VFS_RETURNED;
}
3705
3706 /*
3707 * Get statistics on all filesystems.
3708 */
3709 int
getfsstat(__unused proc_t p,struct getfsstat_args * uap,int * retval)3710 getfsstat(__unused proc_t p, struct getfsstat_args *uap, int *retval)
3711 {
3712 struct __mac_getfsstat_args muap;
3713
3714 muap.buf = uap->buf;
3715 muap.bufsize = uap->bufsize;
3716 muap.mac = USER_ADDR_NULL;
3717 muap.macsize = 0;
3718 muap.flags = uap->flags;
3719
3720 return __mac_getfsstat(p, &muap, retval);
3721 }
3722
3723 /*
3724 * __mac_getfsstat: Get MAC-related file system statistics
3725 *
3726 * Parameters: p (ignored)
3727 * uap User argument descriptor (see below)
3728 * retval Count of file system statistics (N stats)
3729 *
3730 * Indirect: uap->bufsize Buffer size
3731 * uap->macsize MAC info size
3732 * uap->buf Buffer where information will be returned
3733 * uap->mac MAC info
3734 * uap->flags File system flags
3735 *
3736 *
3737 * Returns: 0 Success
3738 * !0 Not success
3739 *
3740 */
int
__mac_getfsstat(__unused proc_t p, struct __mac_getfsstat_args *uap, int *retval)
{
	user_addr_t sfsp;
	user_addr_t *mp;
	size_t count, maxcount, bufsize, macsize;
	struct getfsstat_struct fst;

	/* Reject sizes that would overflow the int bookkeeping below. */
	if ((unsigned)uap->bufsize > INT_MAX || (unsigned)uap->macsize > INT_MAX) {
		return EINVAL;
	}

	bufsize = (size_t) uap->bufsize;
	macsize = (size_t) uap->macsize;

	/* Record size depends on the caller's ABI (user32 vs user64 statfs). */
	if (IS_64BIT_PROCESS(p)) {
		maxcount = bufsize / sizeof(struct user64_statfs);
	} else {
		maxcount = bufsize / sizeof(struct user32_statfs);
	}
	sfsp = uap->buf;
	count = 0;

	mp = NULL;

#if CONFIG_MACF
	if (uap->mac != USER_ADDR_NULL) {
		u_int32_t *mp0;
		int error;
		unsigned int i;

		/* The MAC label array must have one pointer per statfs record. */
		count = (macsize / (IS_64BIT_PROCESS(p) ? 8 : 4));
		if (count != maxcount) {
			return EINVAL;
		}

		/* Copy in the array */
		mp0 = kalloc_data(macsize, Z_WAITOK);
		if (mp0 == NULL) {
			return ENOMEM;
		}

		error = copyin(uap->mac, mp0, macsize);
		if (error) {
			kfree_data(mp0, macsize);
			return error;
		}

		/* Normalize to an array of user_addr_t */
		mp = kalloc_data(count * sizeof(user_addr_t), Z_WAITOK);
		if (mp == NULL) {
			kfree_data(mp0, macsize);
			return ENOMEM;
		}

		for (i = 0; i < count; i++) {
			if (IS_64BIT_PROCESS(p)) {
				mp[i] = ((user_addr_t *)mp0)[i];
			} else {
				mp[i] = (user_addr_t)mp0[i];
			}
		}
		kfree_data(mp0, macsize);
	}
#endif


	fst.sfsp = sfsp;
	fst.mp = mp;
	fst.flags = uap->flags;
	fst.count = 0;
	fst.error = 0;
	fst.maxcount = (int)maxcount;


	/* Walk every mount, including those mid-unmount (NOSKIP_UNMOUNT). */
	vfs_iterate(VFS_ITERATE_NOSKIP_UNMOUNT, getfsstat_callback, &fst);

	if (mp) {
		kfree_data(mp, count * sizeof(user_addr_t));
	}

	if (fst.error) {
		KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
		return fst.error;
	}

	/* Buffer filled: report its capacity; otherwise the total mount count. */
	if (fst.sfsp && fst.count > fst.maxcount) {
		*retval = fst.maxcount;
	} else {
		*retval = fst.count;
	}
	return 0;
}
3834
3835 static int
getfsstat64_callback(mount_t mp,void * arg)3836 getfsstat64_callback(mount_t mp, void * arg)
3837 {
3838 struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
3839 struct vfsstatfs *sp;
3840 struct statfs64 sfs;
3841 int error;
3842
3843 if (fstp->sfsp && fstp->count < fstp->maxcount) {
3844 #if CONFIG_MACF
3845 error = mac_mount_check_stat(vfs_context_current(), mp);
3846 if (error != 0) {
3847 fstp->error = error;
3848 return VFS_RETURNED_DONE;
3849 }
3850 #endif
3851 sp = &mp->mnt_vfsstat;
3852 /*
3853 * If MNT_NOWAIT is specified, do not refresh the fsstat
3854 * cache. MNT_WAIT overrides MNT_NOWAIT.
3855 *
3856 * We treat MNT_DWAIT as MNT_WAIT for all instances of
3857 * getfsstat, since the constants are out of the same
3858 * namespace.
3859 */
3860 if ((mp->mnt_lflag & MNT_LDEAD) ||
3861 ((((fstp->flags & MNT_NOWAIT) == 0) || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
3862 (!(mp->mnt_lflag & MNT_LUNMOUNT)) &&
3863 (error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)))) {
3864 KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
3865 return VFS_RETURNED;
3866 }
3867
3868 vfs_get_statfs64(mp, &sfs);
3869 error = copyout(&sfs, fstp->sfsp, sizeof(sfs));
3870 if (error) {
3871 fstp->error = error;
3872 return VFS_RETURNED_DONE;
3873 }
3874 fstp->sfsp += sizeof(sfs);
3875 }
3876 fstp->count++;
3877 return VFS_RETURNED;
3878 }
3879
3880 /*
3881 * Get statistics on all file systems in 64 bit mode.
3882 */
3883 int
getfsstat64(__unused proc_t p,struct getfsstat64_args * uap,int * retval)3884 getfsstat64(__unused proc_t p, struct getfsstat64_args *uap, int *retval)
3885 {
3886 user_addr_t sfsp;
3887 int count, maxcount;
3888 struct getfsstat_struct fst;
3889
3890 maxcount = uap->bufsize / sizeof(struct statfs64);
3891
3892 sfsp = uap->buf;
3893 count = 0;
3894
3895 fst.sfsp = sfsp;
3896 fst.flags = uap->flags;
3897 fst.count = 0;
3898 fst.error = 0;
3899 fst.maxcount = maxcount;
3900
3901 vfs_iterate(VFS_ITERATE_NOSKIP_UNMOUNT, getfsstat64_callback, &fst);
3902
3903 if (fst.error) {
3904 KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
3905 return fst.error;
3906 }
3907
3908 if (fst.sfsp && fst.count > fst.maxcount) {
3909 *retval = fst.maxcount;
3910 } else {
3911 *retval = fst.count;
3912 }
3913
3914 return 0;
3915 }
3916
3917 /*
 * Gets the vnode associated with the file descriptor passed as input.
3920 *
3921 * INPUT
3922 * ctx - vfs context of caller
3923 * fd - file descriptor for which vnode is required.
3924 * vpp - Pointer to pointer to vnode to be returned.
3925 *
3926 * The vnode is returned with an iocount so any vnode obtained
3927 * by this call needs a vnode_put
3928 *
3929 */
3930 int
vnode_getfromfd(vfs_context_t ctx,int fd,vnode_t * vpp)3931 vnode_getfromfd(vfs_context_t ctx, int fd, vnode_t *vpp)
3932 {
3933 int error;
3934 vnode_t vp;
3935 struct fileproc *fp;
3936 proc_t p = vfs_context_proc(ctx);
3937
3938 *vpp = NULLVP;
3939
3940 error = fp_getfvp(p, fd, &fp, &vp);
3941 if (error) {
3942 return error;
3943 }
3944
3945 error = vnode_getwithref(vp);
3946 if (error) {
3947 (void)fp_drop(p, fd, fp, 0);
3948 return error;
3949 }
3950
3951 (void)fp_drop(p, fd, fp, 0);
3952 *vpp = vp;
3953 return error;
3954 }
3955
3956 /*
3957 * Wrapper function around namei to start lookup from a directory
3958 * specified by a file descriptor ni_dirfd.
3959 *
3960 * In addition to all the errors returned by namei, this call can
3961 * return ENOTDIR if the file descriptor does not refer to a directory.
3962 * and EBADF if the file descriptor is not valid.
3963 */
int
nameiat(struct nameidata *ndp, int dirfd)
{
	/*
	 * dirfd only matters for relative paths on a fresh lookup that has
	 * not already supplied its own starting directory (USEDVP set).
	 */
	if ((dirfd != AT_FDCWD) &&
	    !(ndp->ni_flag & NAMEI_CONTLOOKUP) &&
	    !(ndp->ni_cnd.cn_flags & USEDVP)) {
		int error = 0;
		char c;

		/* Peek at the first byte of the path to detect absolute paths. */
		if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
			error = copyin(ndp->ni_dirp, &c, sizeof(char));
			if (error) {
				return error;
			}
		} else {
			c = *((char *)(ndp->ni_dirp));
		}

		if (c != '/') {
			vnode_t dvp_at;

			/* Takes an iocount on dirfd's vnode. */
			error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
			    &dvp_at);
			if (error) {
				return error;
			}

			if (vnode_vtype(dvp_at) != VDIR) {
				vnode_put(dvp_at);
				return ENOTDIR;
			}

			/* Start the lookup at dirfd's vnode via USEDVP. */
			ndp->ni_dvp = dvp_at;
			ndp->ni_cnd.cn_flags |= USEDVP;
			error = namei(ndp);
			ndp->ni_cnd.cn_flags &= ~USEDVP;
			vnode_put(dvp_at);
			return error;
		}
	}

	/* Absolute path, AT_FDCWD, or continued lookup: plain namei(). */
	return namei(ndp);
}
4007
4008 /*
4009 * Change current working directory to a given file descriptor.
4010 */
/* ARGSUSED */
static int
common_fchdir(proc_t p, struct fchdir_args *uap, int per_thread)
{
	vnode_t vp;
	vnode_t tdp;
	vnode_t tvp;
	struct mount *mp;
	/* should_put: we still owe vnode_put(vp) on the way out. */
	int error, should_put = 1;
	vfs_context_t ctx = vfs_context_current();

	AUDIT_ARG(fd, uap->fd);
	if (per_thread && uap->fd == -1) {
		/*
		 * Switching back from per-thread to per process CWD; verify we
		 * in fact have one before proceeding. The only success case
		 * for this code path is to return 0 preemptively after zapping
		 * the thread structure contents.
		 */
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = NULLVP;
			if (tvp != NULLVP) {
				vnode_rele(tvp);
				return 0;
			}
		}
		return EBADF;
	}

	/* Resolve the fd and take an iocount on its vnode. */
	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	if (vp->v_type != VDIR) {
		error = ENOTDIR;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_chdir(ctx, vp);
	if (error) {
		goto out;
	}
#endif
	/* The caller must be able to search the new working directory. */
	error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
	if (error) {
		goto out;
	}

	/*
	 * If the directory is a mount point, descend to the root of the
	 * mounted filesystem (looping to handle stacked mounts).
	 */
	while (!error && (mp = vp->v_mountedhere) != NULL) {
		if (vfs_busy(mp, LK_NOWAIT)) {
			error = EACCES;
			goto out;
		}
		error = VFS_ROOT(mp, &tdp, ctx);
		vfs_unbusy(mp);
		if (error) {
			break;
		}
		vnode_put(vp);
		vp = tdp;
	}
	if (error) {
		goto out;
	}
	/* Trade the transient iocount for a long-lived usecount on the cwd. */
	if ((error = vnode_ref(vp))) {
		goto out;
	}
	vnode_put(vp);
	should_put = 0;

	if (per_thread) {
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = vp;
			OSBitOrAtomic(P_THCWD, &p->p_flag);
		} else {
			vnode_rele(vp);
			error = ENOENT;
			goto out;
		}
	} else {
		/* Swap in the new per-process cwd under both locks. */
		proc_dirs_lock_exclusive(p);
		proc_fdlock(p);
		tvp = p->p_fd.fd_cdir;
		p->p_fd.fd_cdir = vp;
		proc_fdunlock(p);
		proc_dirs_unlock_exclusive(p);
	}

	/* Release the previous working directory, if any. */
	if (tvp) {
		vnode_rele(tvp);
	}

out:
	if (should_put) {
		vnode_put(vp);
	}
	file_drop(uap->fd);

	return error;
}
4124
int
fchdir(proc_t p, struct fchdir_args *uap, __unused int32_t *retval)
{
	/* fchdir(2): change the per-process working directory to fd's vnode. */
	return common_fchdir(p, uap, 0);
}
4130
int
__pthread_fchdir(proc_t p, struct __pthread_fchdir_args *uap, __unused int32_t *retval)
{
	/*
	 * Per-thread variant; fd == -1 reverts the thread to the process cwd.
	 * NOTE(review): the cast assumes struct __pthread_fchdir_args is
	 * layout-compatible with struct fchdir_args — confirm in syscall args.
	 */
	return common_fchdir(p, (void *)uap, 1);
}
4136
4137
4138 /*
4139 * Change current working directory (".").
4140 *
4141 * Returns: 0 Success
4142 * change_dir:ENOTDIR
4143 * change_dir:???
4144 * vnode_ref:ENOENT No such file or directory
4145 */
4146 /* ARGSUSED */
int
chdir_internal(proc_t p, vfs_context_t ctx, struct nameidata *ndp, int per_thread)
{
	int error;
	vnode_t tvp;

	/* Look up and authorize the new directory (returns with an iocount). */
	error = change_dir(ndp, ctx);
	if (error) {
		return error;
	}
	/* Convert to a long-lived usecount before installing it as the cwd. */
	if ((error = vnode_ref(ndp->ni_vp))) {
		vnode_put(ndp->ni_vp);
		return error;
	}
	/*
	 * drop the iocount we picked up in change_dir
	 */
	vnode_put(ndp->ni_vp);

	if (per_thread) {
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = ndp->ni_vp;
			OSBitOrAtomic(P_THCWD, &p->p_flag);
		} else {
			vnode_rele(ndp->ni_vp);
			return ENOENT;
		}
	} else {
		/* Swap in the new per-process cwd under both locks. */
		proc_dirs_lock_exclusive(p);
		proc_fdlock(p);
		tvp = p->p_fd.fd_cdir;
		p->p_fd.fd_cdir = ndp->ni_vp;
		proc_fdunlock(p);
		proc_dirs_unlock_exclusive(p);
	}

	/* Release the previous working directory, if any. */
	if (tvp) {
		vnode_rele(tvp);
	}

	return 0;
}
4192
4193
4194 /*
4195 * Change current working directory (".").
4196 *
4197 * Returns: 0 Success
4198 * chdir_internal:ENOTDIR
4199 * chdir_internal:ENOENT No such file or directory
4200 * chdir_internal:???
4201 */
4202 /* ARGSUSED */
4203 static int
common_chdir(proc_t p,struct chdir_args * uap,int per_thread)4204 common_chdir(proc_t p, struct chdir_args *uap, int per_thread)
4205 {
4206 struct nameidata nd;
4207 vfs_context_t ctx = vfs_context_current();
4208
4209 NDINIT(&nd, LOOKUP, OP_CHDIR, FOLLOW | AUDITVNPATH1,
4210 UIO_USERSPACE, uap->path, ctx);
4211
4212 return chdir_internal(p, ctx, &nd, per_thread);
4213 }
4214
4215
4216 /*
4217 * chdir
4218 *
4219 * Change current working directory (".") for the entire process
4220 *
4221 * Parameters: p Process requesting the call
4222 * uap User argument descriptor (see below)
4223 * retval (ignored)
4224 *
4225 * Indirect parameters: uap->path Directory path
4226 *
4227 * Returns: 0 Success
4228 * common_chdir: ENOTDIR
4229 * common_chdir: ENOENT No such file or directory
4230 * common_chdir: ???
4231 *
4232 */
4233 int
chdir(proc_t p,struct chdir_args * uap,__unused int32_t * retval)4234 chdir(proc_t p, struct chdir_args *uap, __unused int32_t *retval)
4235 {
4236 return common_chdir(p, (void *)uap, 0);
4237 }
4238
4239 /*
4240 * __pthread_chdir
4241 *
4242 * Change current working directory (".") for a single thread
4243 *
4244 * Parameters: p Process requesting the call
4245 * uap User argument descriptor (see below)
4246 * retval (ignored)
4247 *
4248 * Indirect parameters: uap->path Directory path
4249 *
4250 * Returns: 0 Success
4251 * common_chdir: ENOTDIR
4252 * common_chdir: ENOENT No such file or directory
4253 * common_chdir: ???
4254 *
4255 */
int
__pthread_chdir(proc_t p, struct __pthread_chdir_args *uap, __unused int32_t *retval)
{
	/*
	 * NOTE(review): the cast assumes struct __pthread_chdir_args is
	 * layout-compatible with struct chdir_args — confirm in syscall args.
	 */
	return common_chdir(p, (void *)uap, 1);
}
4261
4262
4263 /*
4264 * Change notion of root (``/'') directory.
4265 */
4266 /* ARGSUSED */
int
chroot(proc_t p, struct chroot_args *uap, __unused int32_t *retval)
{
	struct filedesc *fdp = &p->p_fd;
	int error;
	struct nameidata nd;
	vnode_t tvp;
	vfs_context_t ctx = vfs_context_current();

	/* Only the superuser may change the root directory. */
	if ((error = suser(kauth_cred_get(), &p->p_acflag))) {
		return error;
	}

	NDINIT(&nd, LOOKUP, OP_CHROOT, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	/* Looks up, type-checks, and authorizes; returns with an iocount. */
	error = change_dir(&nd, ctx);
	if (error) {
		return error;
	}

#if CONFIG_MACF
	error = mac_vnode_check_chroot(ctx, nd.ni_vp,
	    &nd.ni_cnd);
	if (error) {
		vnode_put(nd.ni_vp);
		return error;
	}
#endif

	/* Trade the iocount for a long-lived usecount on the new root. */
	if ((error = vnode_ref(nd.ni_vp))) {
		vnode_put(nd.ni_vp);
		return error;
	}
	vnode_put(nd.ni_vp);

	/*
	 * This lock provides the guarantee that as long as you hold the lock
	 * fdp->fd_rdir has a usecount on it. This is used to take an iocount
	 * on a referenced vnode in namei when determining the rootvnode for
	 * a process.
	 */
	/* needed for synchronization with lookup */
	proc_dirs_lock_exclusive(p);
	/* needed for setting the flag and other activities on the fd itself */
	proc_fdlock(p);
	tvp = fdp->fd_rdir;
	fdp->fd_rdir = nd.ni_vp;
	fdt_flag_set(fdp, FD_CHROOT);
	proc_fdunlock(p);
	proc_dirs_unlock_exclusive(p);

	/* Release the previous root directory, if any. */
	if (tvp != NULL) {
		vnode_rele(tvp);
	}

	return 0;
}
4324
/* Stack buffer size tried before falling back to a ZV_NAMEI allocation. */
#define PATHSTATICBUFLEN 256
/* Entitlement required (in addition to being launchd) for pivot_root(2). */
#define PIVOT_ROOT_ENTITLEMENT              \
	"com.apple.private.vfs.pivot-root"
4328
4329 #if defined(XNU_TARGET_OS_OSX)
/*
 * pivot_root(2): atomically make the filesystem at new_rootfs_path_before
 * the system root, remounting the old root at old_rootfs_path_after.
 * Restricted to launchd (pid 1) holding PIVOT_ROOT_ENTITLEMENT.
 */
int
pivot_root(proc_t p, struct pivot_root_args *uap, __unused int *retval)
{
	int error;
	char new_rootfs_path_before[PATHSTATICBUFLEN] = {0};
	char old_rootfs_path_after[PATHSTATICBUFLEN] = {0};
	char *new_rootfs_path_before_buf = NULL;
	char *old_rootfs_path_after_buf = NULL;
	char *incoming = NULL;
	char *outgoing = NULL;
	vnode_t incoming_rootvp = NULLVP;
	size_t bytes_copied;

	/*
	 * XXX : Additional restrictions needed
	 * - perhaps callable only once.
	 */
	if ((error = suser(kauth_cred_get(), &p->p_acflag))) {
		return error;
	}

	/*
	 * pivot_root can be executed by launchd only.
	 * Enforce entitlement.
	 */
	if ((proc_getpid(p) != 1) || !IOCurrentTaskHasEntitlement(PIVOT_ROOT_ENTITLEMENT)) {
		return EPERM;
	}

	/* Try the small stack buffer first; fall back to a heap path buffer. */
	error = copyinstr(uap->new_rootfs_path_before, &new_rootfs_path_before[0], PATHSTATICBUFLEN, &bytes_copied);
	if (error == ENAMETOOLONG) {
		new_rootfs_path_before_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
		error = copyinstr(uap->new_rootfs_path_before, new_rootfs_path_before_buf, MAXPATHLEN, &bytes_copied);
	}

	if (error) {
		goto out;
	}

	/* Same two-step copyin for the old root's new mount point. */
	error = copyinstr(uap->old_rootfs_path_after, &old_rootfs_path_after[0], PATHSTATICBUFLEN, &bytes_copied);
	if (error == ENAMETOOLONG) {
		old_rootfs_path_after_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
		error = copyinstr(uap->old_rootfs_path_after, old_rootfs_path_after_buf, MAXPATHLEN, &bytes_copied);
	}
	if (error) {
		goto out;
	}

	if (new_rootfs_path_before_buf) {
		incoming = new_rootfs_path_before_buf;
	} else {
		incoming = &new_rootfs_path_before[0];
	}

	if (old_rootfs_path_after_buf) {
		outgoing = old_rootfs_path_after_buf;
	} else {
		outgoing = &old_rootfs_path_after[0];
	}

	/*
	 * The proposed incoming FS MUST be authenticated (i.e. not a chunklist DMG).
	 * Userland is not allowed to pivot to an image.
	 */
	error = vnode_lookup(incoming, 0, &incoming_rootvp, vfs_context_kernel());
	if (error) {
		goto out;
	}
	error = VNOP_IOCTL(incoming_rootvp, FSIOC_KERNEL_ROOTAUTH, NULL, 0, vfs_context_kernel());
	if (error) {
		goto out;
	}

	error = vfs_switch_root(incoming, outgoing, VFSSR_VIRTUALDEV_PROHIBITED);

out:
	if (incoming_rootvp != NULLVP) {
		vnode_put(incoming_rootvp);
		incoming_rootvp = NULLVP;
	}

	if (old_rootfs_path_after_buf) {
		zfree(ZV_NAMEI, old_rootfs_path_after_buf);
	}

	if (new_rootfs_path_before_buf) {
		zfree(ZV_NAMEI, new_rootfs_path_before_buf);
	}

	return error;
}
4421 #else
int
pivot_root(proc_t p, __unused struct pivot_root_args *uap, int *retval)
{
	/* pivot_root() is not built on this configuration; fail via nosys(). */
	return nosys(p, NULL, retval);
}
4427 #endif /* XNU_TARGET_OS_OSX */
4428
4429 /*
4430 * Common routine for chroot and chdir.
4431 *
4432 * Returns: 0 Success
4433 * ENOTDIR Not a directory
4434 * namei:??? [anything namei can return]
4435 * vnode_authorize:??? [anything vnode_authorize can return]
4436 */
4437 static int
change_dir(struct nameidata * ndp,vfs_context_t ctx)4438 change_dir(struct nameidata *ndp, vfs_context_t ctx)
4439 {
4440 vnode_t vp;
4441 int error;
4442
4443 if ((error = namei(ndp))) {
4444 return error;
4445 }
4446 nameidone(ndp);
4447 vp = ndp->ni_vp;
4448
4449 if (vp->v_type != VDIR) {
4450 vnode_put(vp);
4451 return ENOTDIR;
4452 }
4453
4454 #if CONFIG_MACF
4455 error = mac_vnode_check_chdir(ctx, vp);
4456 if (error) {
4457 vnode_put(vp);
4458 return error;
4459 }
4460 #endif
4461
4462 error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
4463 if (error) {
4464 vnode_put(vp);
4465 return error;
4466 }
4467
4468 return error;
4469 }
4470
/*
 * Allocate and initialize the vnode data (for directories) associated
 * with the file glob.
 */
4474 struct fd_vn_data *
fg_vn_data_alloc(void)4475 fg_vn_data_alloc(void)
4476 {
4477 struct fd_vn_data *fvdata;
4478
4479 /* Allocate per fd vnode data */
4480 fvdata = kalloc_type(struct fd_vn_data, Z_WAITOK | Z_ZERO);
4481 lck_mtx_init(&fvdata->fv_lock, &fd_vn_lck_grp, &fd_vn_lck_attr);
4482 return fvdata;
4483 }
4484
4485 /*
4486 * Free the vnode data (for directories) associated with the file glob.
4487 */
4488 void
fg_vn_data_free(void * fgvndata)4489 fg_vn_data_free(void *fgvndata)
4490 {
4491 struct fd_vn_data *fvdata = (struct fd_vn_data *)fgvndata;
4492
4493 kfree_data(fvdata->fv_buf, fvdata->fv_bufallocsiz);
4494 lck_mtx_destroy(&fvdata->fv_lock, &fd_vn_lck_grp);
4495 kfree_type(struct fd_vn_data, fvdata);
4496 }
4497
4498 /*
4499 * Check permissions, allocate an open file structure,
4500 * and call the device open routine if any.
4501 *
4502 * Returns: 0 Success
4503 * EINVAL
4504 * EINTR
4505 * falloc:ENFILE
4506 * falloc:EMFILE
4507 * falloc:ENOMEM
4508 * vn_open_auth:???
4509 * dupfdopen:???
4510 * VNOP_ADVLOCK:???
4511 * vnode_setsize:???
4512 *
4513 * XXX Need to implement uid, gid
4514 */
int
open1(vfs_context_t ctx, struct nameidata *ndp, int uflags,
    struct vnode_attr *vap, fp_initfn_t fp_init, void *initarg, int32_t *retval, int authfd)
{
	proc_t p = vfs_context_proc(ctx);
	uthread_t uu = get_bsdthread_info(vfs_context_thread(ctx));
	struct fileproc *fp;
	vnode_t vp;
	int flags, oflags, amode;
	int type, indx, error;
	struct vfs_context context;
	vnode_t authvp = NULLVP;

	oflags = uflags;

	amode = oflags & O_ACCMODE;
	/*
	 * Because O_RDONLY is 0, it is not possible to distinguish between
	 * O_EXEC | O_RDONLY and O_EXEC, therefore FEXEC/FSEARCH can't be set together
	 * with FREAD/FWRITE.
	 */
	if ((amode == O_ACCMODE) || (amode && (oflags & O_EXEC))) {
		return EINVAL;
	}

	flags = FFLAGS(uflags);
	/* Userland may not request raw-(un)encrypted access via open flags. */
	CLR(flags, FENCRYPTED);
	CLR(flags, FUNENCRYPTED);

	AUDIT_ARG(fflags, oflags);
	AUDIT_ARG(mode, vap->va_mode);

	/* Reserve an fd slot and fileproc; fp_init may pre-initialize it. */
	if ((error = falloc_withinit(p, &fp, &indx, ctx, fp_init, initarg)) != 0) {
		return error;
	}
	if (flags & O_CLOEXEC) {
		fp->fp_flags |= FP_CLOEXEC;
	}
	if (flags & O_CLOFORK) {
		fp->fp_flags |= FP_CLOFORK;
	}

	/* setup state to recognize when fdesc_open was called */
	uu->uu_dupfd = -1;

	/*
	 * Disable read/write access if file is opened with O_EVTONLY and
	 * the process has requested to deny read/write access.
	 */
	if ((flags & O_EVTONLY) && proc_disallow_rw_for_o_evtonly(p)) {
		flags &= ~(FREAD | FWRITE);
	}

	/* Resolve the optional authentication fd into a vnode for vn_open_auth(). */
	if (authfd != AUTH_OPEN_NOAUTHFD) {
		error = vnode_getfromfd(ctx, authfd, &authvp);
		if (error) {
			fp_free(p, indx, fp);
			return error;
		}
	}

	if ((error = vn_open_auth(ndp, &flags, vap, authvp))) {
		if (authvp != NULLVP) {
			vnode_put(authvp);
		}
		/*
		 * ENODEV/ENXIO with uu_dupfd >= 0 means fdesc_open() ran during
		 * the open: dup the already-open descriptor instead of failing.
		 */
		if ((error == ENODEV || error == ENXIO) && (uu->uu_dupfd >= 0)) {
			if ((error = dupfdopen(p, indx, uu->uu_dupfd, flags, error)) == 0) {
				*retval = indx;
				return 0;
			}
		}
		if (error == ERESTART) {
			error = EINTR;
		}
		fp_free(p, indx, fp);
		return error;
	}

	if (authvp != NULLVP) {
		vnode_put(authvp);
	}

	uu->uu_dupfd = 0;
	vp = ndp->ni_vp;

	/*
	 * FENCRYPTED/FUNENCRYPTED were cleared above, so if set here they were
	 * presumably re-set by vn_open_auth() — NOTE(review): confirm.
	 */
	fp->fp_glob->fg_flag = flags & (FMASK | O_EVTONLY | FENCRYPTED | FUNENCRYPTED);
	fp->fp_glob->fg_ops = &vnops;
	fp_set_data(fp, vp);

#if CONFIG_FILE_LEASES
	/*
	 * If we are creating a file or open with truncate, we need to break the
	 * lease if there is a read lease placed on the parent dir.
	 */
	if ((vnode_vtype(vp) == VREG) && (flags & (O_CREAT | O_TRUNC))) {
		vnode_breakdirlease(vp, true, oflags);
	}
	/* Now check if there is a lease placed on the file itself. */
	error = vnode_breaklease(vp, oflags, ctx);
	if (error) {
		goto bad;
	}
#endif /* CONFIG_FILE_LEASES */

	/* Apply an advisory flock()-style lock if requested at open time. */
	if (flags & (O_EXLOCK | O_SHLOCK)) {
		struct flock lf = {
			.l_whence = SEEK_SET,
		};

		if (flags & O_EXLOCK) {
			lf.l_type = F_WRLCK;
		} else {
			lf.l_type = F_RDLCK;
		}
		type = F_FLOCK;
		if ((flags & FNONBLOCK) == 0) {
			type |= F_WAIT;
		}
#if CONFIG_MACF
		error = mac_file_check_lock(vfs_context_ucred(ctx), fp->fp_glob,
		    F_SETLK, &lf);
		if (error) {
			goto bad;
		}
#endif
		if ((error = VNOP_ADVLOCK(vp, (caddr_t)fp->fp_glob, F_SETLK, &lf, type, ctx, NULL))) {
			goto bad;
		}
		fp->fp_glob->fg_flag |= FWASLOCKED;
	}

	/* try to truncate by setting the size attribute */
	if ((flags & O_TRUNC) && ((error = vnode_setsize(vp, (off_t)0, 0, ctx)) != 0)) {
		goto bad;
	}

	/*
	 * For directories we hold some additional information in the fd.
	 */
	if (vnode_vtype(vp) == VDIR) {
		fp->fp_glob->fg_vn_data = fg_vn_data_alloc();
	} else {
		fp->fp_glob->fg_vn_data = NULL;
	}

#if CONFIG_SECLUDED_MEMORY
	/*
	 * Policy: decide, from the file's identity, whether its cached pages
	 * may live in the secluded page pool.
	 */
	if (secluded_for_filecache && vnode_vtype(vp) == VREG) {
		memory_object_control_t moc;
		const char *v_name;

		moc = ubc_getobject(vp, UBC_FLAGS_NONE);

		if (moc == MEMORY_OBJECT_CONTROL_NULL) {
			/* nothing to do... */
		} else if (fp->fp_glob->fg_flag & FWRITE) {
			/* writable -> no longer eligible for secluded pages */
			memory_object_mark_eligible_for_secluded(moc,
			    FALSE);
		} else if (secluded_for_filecache == SECLUDED_FILECACHE_APPS) {
			char pathname[32] = { 0, };
			size_t copied;
			/* XXX FBDP: better way to detect /Applications/ ? */
			if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
				(void)copyinstr(ndp->ni_dirp,
				    pathname,
				    sizeof(pathname),
				    &copied);
			} else {
				copystr(CAST_DOWN(void *, ndp->ni_dirp),
				    pathname,
				    sizeof(pathname),
				    &copied);
			}
			pathname[sizeof(pathname) - 1] = '\0';
			if (strncmp(pathname,
			    "/Applications/",
			    strlen("/Applications/")) == 0 &&
			    strncmp(pathname,
			    "/Applications/Camera.app/",
			    strlen("/Applications/Camera.app/")) != 0) {
				/*
				 * not writable
				 * AND from "/Applications/"
				 * AND not from "/Applications/Camera.app/"
				 * ==> eligible for secluded
				 */
				memory_object_mark_eligible_for_secluded(moc,
				    TRUE);
			}
		} else if (secluded_for_filecache == SECLUDED_FILECACHE_RDONLY &&
		    (v_name = vnode_getname(vp))) {
			size_t len = strlen(v_name);

			if (!strncmp(v_name, "dyld", len) ||
			    !strncmp(v_name, "launchd", len) ||
			    !strncmp(v_name, "Camera", len) ||
			    !strncmp(v_name, "SpringBoard", len) ||
			    !strncmp(v_name, "backboardd", len)) {
				/*
				 * This file matters when launching Camera:
				 * do not store its contents in the secluded
				 * pool that will be drained on Camera launch.
				 */
				memory_object_mark_eligible_for_secluded(moc,
				    FALSE);
			} else if (!strncmp(v_name, "mediaserverd", len)) {
				memory_object_mark_eligible_for_secluded(moc,
				    FALSE);
				memory_object_mark_for_realtime(moc,
				    true);
			} else if (!strncmp(v_name, "bluetoothd", len)) {
				/*
				 * bluetoothd might be needed for realtime audio
				 * playback.
				 */
				memory_object_mark_eligible_for_secluded(moc,
				    FALSE);
				memory_object_mark_for_realtime(moc,
				    true);
			} else {
				char pathname[64] = { 0, };
				size_t copied;
				if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
					(void)copyinstr(ndp->ni_dirp,
					    pathname,
					    sizeof(pathname),
					    &copied);
				} else {
					copystr(CAST_DOWN(void *, ndp->ni_dirp),
					    pathname,
					    sizeof(pathname),
					    &copied);
				}
				pathname[sizeof(pathname) - 1] = '\0';
				if (strncmp(pathname,
				    "/Library/Audio/Plug-Ins/",
				    strlen("/Library/Audio/Plug-Ins/")) == 0 ||
				    strncmp(pathname,
				    "/System/Library/Audio/Plug-Ins/",
				    strlen("/System/Library/Audio/Plug-Ins/")) == 0) {
					/*
					 * This may be an audio plugin required
					 * for realtime playback.
					 * ==> NOT eligible for secluded.
					 */
					memory_object_mark_eligible_for_secluded(moc,
					    FALSE);
					memory_object_mark_for_realtime(moc,
					    true);
				}
			}
			vnode_putname(v_name);
		}
	}
#endif /* CONFIG_SECLUDED_MEMORY */

	/*
	 * Drop our iocount; fp_set_data() above keeps a reference on vp, so
	 * the vnode_istty() check below looks safe — NOTE(review): confirm
	 * the fileglob's reference keeps vp valid here.
	 */
	vnode_put(vp);

	/*
	 * The first terminal open (without a O_NOCTTY) by a session leader
	 * results in it being set as the controlling terminal.
	 */
	if (vnode_istty(vp) && !(p->p_flag & P_CONTROLT) &&
	    !(flags & O_NOCTTY)) {
		int tmp = 0;

		(void)(*fp->fp_glob->fg_ops->fo_ioctl)(fp, (int)TIOCSCTTY,
		    (caddr_t)&tmp, ctx);
	}

	/* Publish the fd: make the entry visible to the process and drop our ref. */
	proc_fdlock(p);
	procfdtbl_releasefd(p, indx, NULL);

	fp_drop(p, indx, fp, 1);
	proc_fdunlock(p);

	*retval = indx;

	return 0;
bad:
	/* Unwind a partially-completed open: unlock, close, free the fd slot. */
	context = *vfs_context_current();
	context.vc_ucred = fp->fp_glob->fg_cred;

	if ((fp->fp_glob->fg_flag & FWASLOCKED) &&
	    (FILEGLOB_DTYPE(fp->fp_glob) == DTYPE_VNODE)) {
		struct flock lf = {
			.l_whence = SEEK_SET,
			.l_type = F_UNLCK,
		};

		(void)VNOP_ADVLOCK(
			vp, (caddr_t)fp->fp_glob, F_UNLCK, &lf, F_FLOCK, ctx, NULL);
	}

	vn_close(vp, fp->fp_glob->fg_flag, &context);
	vnode_put(vp);
	fp_free(p, indx, fp);

	return error;
}
4815
4816 /*
4817 * While most of the *at syscall handlers can call nameiat() which
4818 * is a wrapper around namei, the use of namei and initialisation
4819 * of nameidata are far removed and in different functions - namei
4820 * gets called in vn_open_auth for open1. So we'll just do here what
4821 * nameiat() does.
4822 */
4823 static int
open1at(vfs_context_t ctx,struct nameidata * ndp,int uflags,struct vnode_attr * vap,fp_initfn_t fp_init,void * initarg,int32_t * retval,int dirfd,int authfd)4824 open1at(vfs_context_t ctx, struct nameidata *ndp, int uflags,
4825 struct vnode_attr *vap, fp_initfn_t fp_init, void *initarg, int32_t *retval,
4826 int dirfd, int authfd)
4827 {
4828 if ((dirfd != AT_FDCWD) && !(ndp->ni_cnd.cn_flags & USEDVP)) {
4829 int error;
4830 char c;
4831
4832 if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
4833 error = copyin(ndp->ni_dirp, &c, sizeof(char));
4834 if (error) {
4835 return error;
4836 }
4837 } else {
4838 c = *((char *)(ndp->ni_dirp));
4839 }
4840
4841 if (c != '/') {
4842 vnode_t dvp_at;
4843
4844 error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
4845 &dvp_at);
4846 if (error) {
4847 return error;
4848 }
4849
4850 if (vnode_vtype(dvp_at) != VDIR) {
4851 vnode_put(dvp_at);
4852 return ENOTDIR;
4853 }
4854
4855 ndp->ni_dvp = dvp_at;
4856 ndp->ni_cnd.cn_flags |= USEDVP;
4857 error = open1(ctx, ndp, uflags, vap, fp_init, initarg,
4858 retval, authfd);
4859 vnode_put(dvp_at);
4860 return error;
4861 }
4862 }
4863
4864 return open1(ctx, ndp, uflags, vap, fp_init, initarg, retval, authfd);
4865 }
4866
4867 /*
4868 * open_extended: open a file given a path name; with extended argument list (including extended security (ACL)).
4869 *
4870 * Parameters: p Process requesting the open
4871 * uap User argument descriptor (see below)
4872 * retval Pointer to an area to receive the
 *					return value from the system call
4874 *
4875 * Indirect: uap->path Path to open (same as 'open')
 *		uap->flags		Flags to open (same as 'open')
4877 * uap->uid UID to set, if creating
4878 * uap->gid GID to set, if creating
4879 * uap->mode File mode, if creating (same as 'open')
4880 * uap->xsecurity ACL to set, if creating
4881 *
4882 * Returns: 0 Success
4883 * !0 errno value
4884 *
4885 * Notes: The kauth_filesec_t in 'va', if any, is in host byte order.
4886 *
 * XXX:	We should enumerate the possible errno values here, and where
4888 * in the code they originated.
4889 */
4890 int
open_extended(proc_t p,struct open_extended_args * uap,int32_t * retval)4891 open_extended(proc_t p, struct open_extended_args *uap, int32_t *retval)
4892 {
4893 int ciferror;
4894 kauth_filesec_t xsecdst;
4895 struct vnode_attr va;
4896 struct nameidata nd;
4897 int cmode;
4898
4899 AUDIT_ARG(owner, uap->uid, uap->gid);
4900
4901 xsecdst = NULL;
4902 if ((uap->xsecurity != USER_ADDR_NULL) &&
4903 ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)) {
4904 return ciferror;
4905 }
4906
4907 VATTR_INIT(&va);
4908 cmode = ((uap->mode & ~p->p_fd.fd_cmask) & ALLPERMS) & ~S_ISTXT;
4909 VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
4910 if (uap->uid != KAUTH_UID_NONE) {
4911 VATTR_SET(&va, va_uid, uap->uid);
4912 }
4913 if (uap->gid != KAUTH_GID_NONE) {
4914 VATTR_SET(&va, va_gid, uap->gid);
4915 }
4916 if (xsecdst != NULL) {
4917 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
4918 va.va_vaflags |= VA_FILESEC_ACL;
4919 }
4920
4921 NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
4922 uap->path, vfs_context_current());
4923
4924 ciferror = open1(vfs_context_current(), &nd, uap->flags, &va,
4925 NULL, NULL, retval, AUTH_OPEN_NOAUTHFD);
4926 if (xsecdst != NULL) {
4927 kauth_filesec_free(xsecdst);
4928 }
4929
4930 return ciferror;
4931 }
4932
4933 /*
4934 * Go through the data-protected atomically controlled open (2)
4935 *
4936 * int open_dprotected_np(user_addr_t path, int flags, int class, int dpflags, int mode)
4937 */
4938 static int
openat_dprotected_internal(vfs_context_t ctx,user_addr_t path,int flags,int mode,int class,int dpflags,int fd,int authfd,enum uio_seg segflg,int * retval)4939 openat_dprotected_internal(vfs_context_t ctx, user_addr_t path, int flags, int mode,
4940 int class, int dpflags, int fd, int authfd, enum uio_seg segflg, int *retval)
4941 {
4942 /*
4943 * Follow the same path as normal open(2)
4944 * Look up the item if it exists, and acquire the vnode.
4945 */
4946 struct vnode_attr va;
4947 struct nameidata nd;
4948 int cmode;
4949 int error;
4950 struct filedesc *fdp = &vfs_context_proc(ctx)->p_fd;
4951
4952 VATTR_INIT(&va);
4953 /* Mask off all but regular access permissions */
4954 cmode = ((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
4955 VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
4956
4957 NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, segflg,
4958 path, ctx);
4959
4960 /*
4961 * Initialize the extra fields in vnode_attr to pass down our
4962 * extra fields.
4963 * 1. target cprotect class.
4964 * 2. set a flag to mark it as requiring open-raw-encrypted semantics.
4965 */
4966 if (flags & O_CREAT) {
4967 /* lower level kernel code validates that the class is valid before applying it. */
4968 if (class != PROTECTION_CLASS_DEFAULT) {
4969 /*
4970 * PROTECTION_CLASS_DEFAULT implies that we make the class for this
4971 * file behave the same as open (2)
4972 */
4973 VATTR_SET(&va, va_dataprotect_class, class);
4974 }
4975 }
4976
4977 if (dpflags & (O_DP_GETRAWENCRYPTED | O_DP_GETRAWUNENCRYPTED | O_DP_AUTHENTICATE)) {
4978 if (flags & (O_RDWR | O_WRONLY)) {
4979 /*
4980 * Not allowed to write raw encrypted bytes or when opening authenticated.
4981 */
4982 return EINVAL;
4983 }
4984 if (dpflags & O_DP_GETRAWENCRYPTED) {
4985 VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWENCRYPTED);
4986 }
4987 if (dpflags & O_DP_GETRAWUNENCRYPTED) {
4988 VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWUNENCRYPTED);
4989 }
4990 if (dpflags & O_DP_AUTHENTICATE) {
4991 VATTR_SET(&va, va_dataprotect_flags, VA_DP_AUTHENTICATE);
4992 }
4993 }
4994
4995 error = open1at(vfs_context_current(), &nd, flags, &va,
4996 NULL, NULL, retval, fd, authfd);
4997
4998 return error;
4999 }
5000
5001 int
openat_dprotected_np(__unused proc_t p,struct openat_dprotected_np_args * uap,int32_t * retval)5002 openat_dprotected_np(__unused proc_t p, struct openat_dprotected_np_args *uap, int32_t *retval)
5003 {
5004 if ((uap->dpflags & O_DP_AUTHENTICATE) && (uap->flags & O_CREAT)) {
5005 return EINVAL;
5006 }
5007
5008 return openat_dprotected_internal(vfs_context_current(), uap->path, uap->flags, uap->mode,
5009 uap->class, uap->dpflags, uap->fd, uap->authfd, UIO_USERSPACE, retval);
5010 }
5011
5012 int
open_dprotected_np(__unused proc_t p,struct open_dprotected_np_args * uap,int32_t * retval)5013 open_dprotected_np(__unused proc_t p, struct open_dprotected_np_args *uap, int32_t *retval)
5014 {
5015 if (uap->dpflags & O_DP_AUTHENTICATE) {
5016 return EINVAL;
5017 }
5018
5019 return openat_dprotected_internal(vfs_context_current(), uap->path, uap->flags, uap->mode,
5020 uap->class, uap->dpflags, AT_FDCWD, AUTH_OPEN_NOAUTHFD, UIO_USERSPACE, retval);
5021 }
5022
5023 static int
openat_internal(vfs_context_t ctx,user_addr_t path,int flags,int mode,int fd,enum uio_seg segflg,int * retval)5024 openat_internal(vfs_context_t ctx, user_addr_t path, int flags, int mode,
5025 int fd, enum uio_seg segflg, int *retval)
5026 {
5027 struct filedesc *fdp = &vfs_context_proc(ctx)->p_fd;
5028 struct {
5029 struct vnode_attr va;
5030 struct nameidata nd;
5031 } *__open_data;
5032 struct vnode_attr *vap;
5033 struct nameidata *ndp;
5034 int cmode;
5035 int error;
5036
5037 __open_data = kalloc_type(typeof(*__open_data), Z_WAITOK);
5038 vap = &__open_data->va;
5039 ndp = &__open_data->nd;
5040
5041 VATTR_INIT(vap);
5042 /* Mask off all but regular access permissions */
5043 cmode = ((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
5044 VATTR_SET(vap, va_mode, cmode & ACCESSPERMS);
5045
5046 NDINIT(ndp, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1,
5047 segflg, path, ctx);
5048
5049 error = open1at(ctx, ndp, flags, vap, NULL, NULL, retval, fd, AUTH_OPEN_NOAUTHFD);
5050
5051 kfree_type(typeof(*__open_data), __open_data);
5052
5053 return error;
5054 }
5055
int
open(proc_t p, struct open_args *uap, int32_t *retval)
{
	/* open(2) is a cancellation point; delegate to the nocancel variant. */
	__pthread_testcancel(1);
	return open_nocancel(p, (struct open_nocancel_args *)uap, retval);
}
5062
int
open_nocancel(__unused proc_t p, struct open_nocancel_args *uap,
    int32_t *retval)
{
	/* Plain open(2): relative paths resolve against the CWD (AT_FDCWD). */
	return openat_internal(vfs_context_current(), uap->path, uap->flags,
	    uap->mode, AT_FDCWD, UIO_USERSPACE, retval);
}
5070
int
openat_nocancel(__unused proc_t p, struct openat_nocancel_args *uap,
    int32_t *retval)
{
	/* openat(2): relative paths resolve against the directory fd uap->fd. */
	return openat_internal(vfs_context_current(), uap->path, uap->flags,
	    uap->mode, uap->fd, UIO_USERSPACE, retval);
}
5078
int
openat(proc_t p, struct openat_args *uap, int32_t *retval)
{
	/* openat(2) is a cancellation point; delegate to the nocancel variant. */
	__pthread_testcancel(1);
	return openat_nocancel(p, (struct openat_nocancel_args *)uap, retval);
}
5085
5086 #define OPEN_BY_ID_ENTITLEMENT "com.apple.private.vfs.open-by-id"
5087
5088 static boolean_t
vfs_context_can_open_by_id(vfs_context_t ctx)5089 vfs_context_can_open_by_id(vfs_context_t ctx)
5090 {
5091 if (csproc_get_platform_binary(vfs_context_proc(ctx))) {
5092 return TRUE;
5093 }
5094
5095 return IOTaskHasEntitlement(vfs_context_task(ctx),
5096 OPEN_BY_ID_ENTITLEMENT);
5097 }
5098
5099 /*
5100 * openbyid_np: open a file given a file system id and a file system object id
5101 * the hfs file system object id is an fsobj_id_t {uint32, uint32}
 * for file systems that don't support object ids, it is a node id (uint64_t).
5103 *
5104 * Parameters: p Process requesting the open
5105 * uap User argument descriptor (see below)
5106 * retval Pointer to an area to receive the
 *					return value from the system call
5108 *
5109 * Indirect: uap->path Path to open (same as 'open')
5110 *
5111 * uap->fsid id of target file system
5112 * uap->objid id of target file system object
5113 * uap->flags Flags to open (same as 'open')
5114 *
5115 * Returns: 0 Success
5116 * !0 errno value
5117 *
5118 *
 * XXX:	We should enumerate the possible errno values here, and where
5120 * in the code they originated.
5121 */
int
openbyid_np(__unused proc_t p, struct openbyid_np_args *uap, int *retval)
{
	fsid_t fsid;
	uint64_t objid;
	int error;
	char *buf = NULL;
	int buflen = MAXPATHLEN;
	int pathlen = 0;
	vfs_context_t ctx = vfs_context_current();

	/* Only platform binaries or suitably entitled tasks may open by id. */
	if (!vfs_context_can_open_by_id(ctx)) {
		return EPERM;
	}

	if ((error = copyin(uap->fsid, (caddr_t)&fsid, sizeof(fsid)))) {
		return error;
	}

	/* uap->objid is an fsobj_id_t, defined as struct { uint32_t, uint32_t }. */
	if ((error = copyin(uap->objid, (caddr_t)&objid, sizeof(uint64_t)))) {
		return error;
	}

	AUDIT_ARG(value32, fsid.val[0]);
	AUDIT_ARG(value64, objid);

	/*
	 * Resolve a path from (fsid, objid), growing the buffer by MAXPATHLEN
	 * each time fsgetpath_internal() reports it was too small (ENOSPC).
	 */
	do {
		buf = kalloc_data(buflen + 1, Z_WAITOK);
		if (buf == NULL) {
			return ENOMEM;
		}

		error = fsgetpath_internal( ctx, fsid.val[0], objid, buflen,
		    buf, FSOPT_ISREALFSID, &pathlen);

		if (error) {
			kfree_data(buf, buflen + 1);
			buf = NULL;
		}
	} while (error == ENOSPC && (buflen += MAXPATHLEN));

	if (error) {
		return error;
	}

	/* NUL-terminate at the length reported by fsgetpath_internal(). */
	buf[pathlen] = 0;

	/* The resolved path lives in kernel memory, hence UIO_SYSSPACE. */
	error = openat_internal(
		ctx, (user_addr_t)buf, uap->oflags, 0, AT_FDCWD, UIO_SYSSPACE, retval);

	kfree_data(buf, buflen + 1);

	return error;
}
5178
5179
5180 /*
5181 * Create a special file.
5182 */
5183 static int mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap,
5184 int fd);
5185
static int
mknodat_internal(proc_t p, user_addr_t upath, struct vnode_attr *vap,
    mode_t mode, int fd)
{
	vfs_context_t ctx = vfs_context_current();
	struct nameidata nd;
	vnode_t vp, dvp;
	int error;

	/* If it's a mknod() of a FIFO, call mkfifo1() instead */
	if ((mode & S_IFMT) == S_IFIFO) {
		return mkfifo1(ctx, upath, vap, fd);
	}

	AUDIT_ARG(mode, mode);
	AUDIT_ARG(value32, vap->va_rdev);

	/* Creating device nodes requires superuser privileges. */
	if ((error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
		return error;
	}
	NDINIT(&nd, CREATE, OP_MKNOD, LOCKPARENT | AUDITVNPATH1,
	    UIO_USERSPACE, upath, ctx);
	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* The lookup found an existing node: refuse to overwrite it. */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}

	/* Only character and block device nodes can be created here. */
	switch (mode & S_IFMT) {
	case S_IFCHR:
		VATTR_SET(vap, va_type, VCHR);
		break;
	case S_IFBLK:
		VATTR_SET(vap, va_type, VBLK);
		break;
	default:
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	/* Let MAC policies veto the creation. */
	error = mac_vnode_check_create(ctx,
	    nd.ni_dvp, &nd.ni_cnd, vap);
	if (error) {
		goto out;
	}
#endif

	/* The caller must be allowed to add entries to the parent directory. */
	if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
		goto out;
	}

#if CONFIG_FILE_LEASES
	/* Adding an entry modifies the directory: break any read lease on it. */
	vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

	if ((error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx)) != 0) {
		goto out;
	}

	if (vp) {
		int update_flags = 0;

		// Make sure the name & parent pointers are hooked up
		if (vp->v_name == NULL) {
			update_flags |= VNODE_UPDATE_NAME;
		}
		if (vp->v_parent == NULLVP) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		if (update_flags) {
			vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
		}

#if CONFIG_FSE
		/* Notify fseventsd watchers of the new node. */
		add_fsevent(FSE_CREATE_FILE, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
#endif
	}

out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	vnode_put(dvp);

	return error;
}
5288
5289 int
mknod(proc_t p,struct mknod_args * uap,__unused int32_t * retval)5290 mknod(proc_t p, struct mknod_args *uap, __unused int32_t *retval)
5291 {
5292 struct vnode_attr va;
5293
5294 VATTR_INIT(&va);
5295 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5296 VATTR_SET(&va, va_rdev, uap->dev);
5297
5298 return mknodat_internal(p, uap->path, &va, (mode_t)uap->mode, AT_FDCWD);
5299 }
5300
5301 int
mknodat(proc_t p,struct mknodat_args * uap,__unused int32_t * retval)5302 mknodat(proc_t p, struct mknodat_args *uap, __unused int32_t *retval)
5303 {
5304 struct vnode_attr va;
5305
5306 VATTR_INIT(&va);
5307 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5308 VATTR_SET(&va, va_rdev, uap->dev);
5309
5310 return mknodat_internal(p, uap->path, &va, (mode_t)uap->mode, uap->fd);
5311 }
5312
5313 /*
5314 * Create a named pipe.
5315 *
5316 * Returns: 0 Success
5317 * EEXIST
5318 * namei:???
5319 * vnode_authorize:???
5320 * vn_create:???
5321 */
static int
mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap, int fd)
{
	vnode_t vp, dvp;
	int error;
	struct nameidata nd;

	NDINIT(&nd, CREATE, OP_MKFIFO, LOCKPARENT | AUDITVNPATH1,
	    UIO_USERSPACE, upath, ctx);
	/* Resolve the path relative to fd (AT_FDCWD means the CWD). */
	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* check that this is a new file and authorize addition */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}
	VATTR_SET(vap, va_type, VFIFO);

	if ((error = vn_authorize_create(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0) {
		goto out;
	}

	error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx);
out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	vnode_put(dvp);

	return error;
}
5364
5365
5366 /*
5367 * mkfifo_extended: Create a named pipe; with extended argument list (including extended security (ACL)).
5368 *
5369 * Parameters: p Process requesting the open
5370 * uap User argument descriptor (see below)
5371 * retval (Ignored)
5372 *
5373 * Indirect: uap->path Path to fifo (same as 'mkfifo')
5374 * uap->uid UID to set
5375 * uap->gid GID to set
5376 * uap->mode File mode to set (same as 'mkfifo')
5377 * uap->xsecurity ACL to set, if creating
5378 *
5379 * Returns: 0 Success
5380 * !0 errno value
5381 *
5382 * Notes: The kauth_filesec_t in 'va', if any, is in host byte order.
5383 *
 * XXX:	We should enumerate the possible errno values here, and where
5385 * in the code they originated.
5386 */
5387 int
mkfifo_extended(proc_t p,struct mkfifo_extended_args * uap,__unused int32_t * retval)5388 mkfifo_extended(proc_t p, struct mkfifo_extended_args *uap, __unused int32_t *retval)
5389 {
5390 int ciferror;
5391 kauth_filesec_t xsecdst;
5392 struct vnode_attr va;
5393
5394 AUDIT_ARG(owner, uap->uid, uap->gid);
5395
5396 xsecdst = KAUTH_FILESEC_NONE;
5397 if (uap->xsecurity != USER_ADDR_NULL) {
5398 if ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0) {
5399 return ciferror;
5400 }
5401 }
5402
5403 VATTR_INIT(&va);
5404 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5405 if (uap->uid != KAUTH_UID_NONE) {
5406 VATTR_SET(&va, va_uid, uap->uid);
5407 }
5408 if (uap->gid != KAUTH_GID_NONE) {
5409 VATTR_SET(&va, va_gid, uap->gid);
5410 }
5411 if (xsecdst != KAUTH_FILESEC_NONE) {
5412 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
5413 va.va_vaflags |= VA_FILESEC_ACL;
5414 }
5415
5416 ciferror = mkfifo1(vfs_context_current(), uap->path, &va, AT_FDCWD);
5417
5418 if (xsecdst != KAUTH_FILESEC_NONE) {
5419 kauth_filesec_free(xsecdst);
5420 }
5421 return ciferror;
5422 }
5423
5424 /* ARGSUSED */
5425 int
mkfifo(proc_t p,struct mkfifo_args * uap,__unused int32_t * retval)5426 mkfifo(proc_t p, struct mkfifo_args *uap, __unused int32_t *retval)
5427 {
5428 struct vnode_attr va;
5429
5430 VATTR_INIT(&va);
5431 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5432
5433 return mkfifo1(vfs_context_current(), uap->path, &va, AT_FDCWD);
5434 }
5435
5436 int
mkfifoat(proc_t p,struct mkfifoat_args * uap,__unused int32_t * retval)5437 mkfifoat(proc_t p, struct mkfifoat_args *uap, __unused int32_t *retval)
5438 {
5439 struct vnode_attr va;
5440
5441 VATTR_INIT(&va);
5442 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5443
5444 return mkfifo1(vfs_context_current(), uap->path, &va, uap->fd);
5445 }
5446
5447 extern int safe_getpath_new(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path, int firmlink);
5448 extern int safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path);
5449 extern int safe_getpath_no_firmlink(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path);
5450
int
safe_getpath_new(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path, int firmlink)
{
	int ret, len = _len;

	*truncated_path = 0;

	/* Get dvp's path, with or without firmlink translation. */
	if (firmlink) {
		ret = vn_getpath(dvp, path, &len);
	} else {
		ret = vn_getpath_no_firmlink(dvp, path, &len);
	}
	if (ret == 0 && len < (MAXPATHLEN - 1)) {
		if (leafname) {
			/* Overwrite the NUL terminator with '/' and append the leaf. */
			path[len - 1] = '/';
			/* strlcpy returns strlen(leafname); +1 re-counts the NUL. */
			len += strlcpy(&path[len], leafname, MAXPATHLEN - len) + 1;
			if (len > MAXPATHLEN) {
				char *ptr;

				// the string got truncated!
				*truncated_path = 1;
				ptr = strrchr(path, '/');
				if (ptr) {
					*ptr = '\0'; // chop off the string at the last directory component
				}
				len = (int)strlen(path) + 1;
			}
		}
	} else if (ret == 0) {
		/* ret == 0 but len >= MAXPATHLEN - 1: no room left, report truncation. */
		*truncated_path = 1;
	} else if (ret != 0) {
		struct vnode *mydvp = dvp;

		if (ret != ENOSPC) {
			printf("safe_getpath: failed to get the path for vp %p (%s) : err %d\n",
			    dvp, dvp->v_name ? dvp->v_name : "no-name", ret);
		}
		*truncated_path = 1;

		/*
		 * Walk up the parent chain until some ancestor's path fits,
		 * falling back to the mount point or "/" as a last resort.
		 */
		do {
			if (mydvp->v_parent != NULL) {
				mydvp = mydvp->v_parent;
			} else if (mydvp->v_mount) {
				strlcpy(path, mydvp->v_mount->mnt_vfsstat.f_mntonname, _len);
				break;
			} else {
				// no parent and no mount point? only thing is to punt and say "/" changed
				strlcpy(path, "/", _len);
				len = 2;
				mydvp = NULL;
			}

			if (mydvp == NULL) {
				break;
			}

			len = _len;
			if (firmlink) {
				ret = vn_getpath(mydvp, path, &len);
			} else {
				ret = vn_getpath_no_firmlink(mydvp, path, &len);
			}
		} while (ret == ENOSPC);
	}

	/* Returns the path length including the NUL terminator. */
	return len;
}
5518
int
safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path)
{
	/* Firmlink-following flavor: delegate with firmlink == 1. */
	int len;

	len = safe_getpath_new(dvp, leafname, path, _len, truncated_path, 1);
	return len;
}
5524
int
safe_getpath_no_firmlink(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path)
{
	/* Non-firmlink flavor: delegate with firmlink == 0. */
	int len;

	len = safe_getpath_new(dvp, leafname, path, _len, truncated_path, 0);
	return len;
}
5530
5531 /*
5532 * Make a hard file link.
5533 *
5534 * Returns: 0 Success
5535 * EPERM
5536 * EEXIST
5537 * EXDEV
5538 * namei:???
5539 * vnode_authorize:???
5540 * VNOP_LINK:???
5541 */
5542 /* ARGSUSED */
static int
linkat_internal(vfs_context_t ctx, int fd1, user_addr_t path, int fd2,
    user_addr_t link, int flag, enum uio_seg segflg)
{
	vnode_t vp, pvp, dvp, lvp;
	struct nameidata nd;
	int follow;
	int error;
#if CONFIG_FSE
	fse_info finfo;
#endif
	int need_event, has_listeners, need_kpath2;
	char *target_path = NULL;
	char *no_firmlink_path = NULL;
	int truncated = 0;
	int truncated_no_firmlink_path = 0;

	vp = dvp = lvp = NULLVP;

	/* look up the object we are linking to */
	follow = (flag & AT_SYMLINK_FOLLOW) ? FOLLOW : NOFOLLOW;
	NDINIT(&nd, LOOKUP, OP_LOOKUP, AUDITVNPATH1 | follow,
	    segflg, path, ctx);

	error = nameiat(&nd, fd1);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;

	nameidone(&nd);

	/*
	 * Normally, linking to directories is not supported.
	 * However, some file systems may have limited support.
	 */
	if (vp->v_type == VDIR) {
		if (!ISSET(vp->v_mount->mnt_kern_flag, MNTK_DIR_HARDLINKS)) {
			error = EPERM;   /* POSIX */
			goto out;
		}

		/* Linking to a directory requires ownership. */
		if (!kauth_cred_issuser(vfs_context_ucred(ctx))) {
			struct vnode_attr dva;

			VATTR_INIT(&dva);
			VATTR_WANTED(&dva, va_uid);
			if (vnode_getattr(vp, &dva, ctx) != 0 ||
			    !VATTR_IS_SUPPORTED(&dva, va_uid) ||
			    (dva.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)))) {
				error = EACCES;
				goto out;
			}
		}
	}

	/*
	 * Look up the target node.  Note: 'nd' is deliberately reused for the
	 * second lookup; only the fields set below are changed.
	 */
#if CONFIG_TRIGGERS
	nd.ni_op = OP_LINK;
#endif
	nd.ni_cnd.cn_nameiop = CREATE;
	nd.ni_cnd.cn_flags = LOCKPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK;
	nd.ni_dirp = link;
	error = nameiat(&nd, fd2);
	if (error != 0) {
		goto out;
	}
	dvp = nd.ni_dvp;
	lvp = nd.ni_vp;

#if CONFIG_MACF
	if ((error = mac_vnode_check_link(ctx, dvp, vp, &nd.ni_cnd)) != 0) {
		goto out2;
	}
#endif

	/* or to anything that kauth doesn't want us to (eg. immutable items) */
	if ((error = vnode_authorize(vp, NULL, KAUTH_VNODE_LINKTARGET, ctx)) != 0) {
		goto out2;
	}

	/* target node must not exist */
	if (lvp != NULLVP) {
		error = EEXIST;
		goto out2;
	}
	/* cannot link across mountpoints */
	if (vnode_mount(vp) != vnode_mount(dvp)) {
		error = EXDEV;
		goto out2;
	}

	/* authorize creation of the target node */
	if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
		goto out2;
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

	/* and finally make the link */
	error = VNOP_LINK(vp, dvp, &nd.ni_cnd, ctx);
	if (error) {
		goto out2;
	}

#if CONFIG_MACF
	(void)mac_vnode_notify_link(ctx, vp, dvp, &nd.ni_cnd);
#endif

	/*
	 * Post-creation notification work: fsevents, kauth fileop listeners,
	 * and the audit trail all need path strings built below.
	 */
#if CONFIG_FSE
	need_event = need_fsevent(FSE_CREATE_FILE, dvp);
#else
	need_event = 0;
#endif
	has_listeners = kauth_authorize_fileop_has_listeners();

	need_kpath2 = 0;
#if CONFIG_AUDIT
	if (AUDIT_RECORD_EXISTS()) {
		need_kpath2 = 1;
	}
#endif

	if (need_event || has_listeners || need_kpath2) {
		char *link_to_path = NULL;
		int len, link_name_len;
		int len_no_firmlink_path = 0;

		/* build the path to the new link file */
		GET_PATH(target_path);

		len = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, target_path, MAXPATHLEN, &truncated);
		if (no_firmlink_path == NULL) {
			GET_PATH(no_firmlink_path);
		}
		len_no_firmlink_path = safe_getpath_no_firmlink(dvp, nd.ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);

		AUDIT_ARG(kpath, target_path, ARG_KPATH2);

		if (has_listeners) {
			/* build the path to file we are linking to */
			GET_PATH(link_to_path);

			link_name_len = MAXPATHLEN;
			if (vn_getpath(vp, link_to_path, &link_name_len) == 0) {
				/*
				 * Call out to allow 3rd party notification of rename.
				 * Ignore result of kauth_authorize_fileop call.
				 */
				kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_LINK,
				    (uintptr_t)link_to_path,
				    (uintptr_t)target_path);
			}
			if (link_to_path != NULL) {
				RELEASE_PATH(link_to_path);
			}
		}
#if CONFIG_FSE
		if (need_event) {
			/* construct fsevent */
			if (get_fse_info(vp, &finfo, ctx) == 0) {
				if (truncated_no_firmlink_path) {
					finfo.mode |= FSE_TRUNCATED_PATH;
				}

				// build the path to the destination of the link
				add_fsevent(FSE_CREATE_FILE, ctx,
				    FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
				    FSE_ARG_FINFO, &finfo,
				    FSE_ARG_DONE);
			}

			pvp = vp->v_parent;
			// need an iocount on parent vnode in this case
			if (pvp && pvp != dvp) {
				pvp = vnode_getparent_if_different(vp, dvp);
			}
			if (pvp) {
				add_fsevent(FSE_STAT_CHANGED, ctx,
				    FSE_ARG_VNODE, pvp, FSE_ARG_DONE);
			}
			/* drop the iocount only when we actually took one above */
			if (pvp && pvp != dvp) {
				vnode_put(pvp);
			}
		}
#endif
	}
out2:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);
	if (target_path != NULL) {
		RELEASE_PATH(target_path);
	}
	if (no_firmlink_path != NULL) {
		RELEASE_PATH(no_firmlink_path);
		no_firmlink_path = NULL;
	}
out:
	if (lvp) {
		vnode_put(lvp);
	}
	if (dvp) {
		vnode_put(dvp);
	}
	vnode_put(vp);
	return error;
}
5756
5757 int
link(__unused proc_t p,struct link_args * uap,__unused int32_t * retval)5758 link(__unused proc_t p, struct link_args *uap, __unused int32_t *retval)
5759 {
5760 return linkat_internal(vfs_context_current(), AT_FDCWD, uap->path,
5761 AT_FDCWD, uap->link, AT_SYMLINK_FOLLOW, UIO_USERSPACE);
5762 }
5763
5764 int
linkat(__unused proc_t p,struct linkat_args * uap,__unused int32_t * retval)5765 linkat(__unused proc_t p, struct linkat_args *uap, __unused int32_t *retval)
5766 {
5767 if (uap->flag & ~AT_SYMLINK_FOLLOW) {
5768 return EINVAL;
5769 }
5770
5771 return linkat_internal(vfs_context_current(), uap->fd1, uap->path,
5772 uap->fd2, uap->link, uap->flag, UIO_USERSPACE);
5773 }
5774
5775 /*
5776 * Make a symbolic link.
5777 *
5778 * We could add support for ACLs here too...
5779 */
5780 /* ARGSUSED */
static int
symlinkat_internal(vfs_context_t ctx, user_addr_t path_data, int fd,
    user_addr_t link, enum uio_seg segflg)
{
	struct vnode_attr va;
	char *path;
	int error;
	struct nameidata nd;
	vnode_t vp, dvp;
	size_t dummy = 0;
	proc_t p;

	error = 0;
	/* Copy the link-target string into a kernel buffer if it lives in user space. */
	if (UIO_SEG_IS_USER_SPACE(segflg)) {
		path = zalloc(ZV_NAMEI);
		error = copyinstr(path_data, path, MAXPATHLEN, &dummy);
	} else {
		path = (char *)path_data;
	}
	if (error) {
		goto out;
	}
	AUDIT_ARG(text, path);  /* This is the link string */

	/* Look up the parent directory of the symlink being created. */
	NDINIT(&nd, CREATE, OP_SYMLINK, LOCKPARENT | AUDITVNPATH1,
	    segflg, link, ctx);

	error = nameiat(&nd, fd);
	if (error) {
		goto out;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* New symlinks get ACCESSPERMS masked by the process umask. */
	p = vfs_context_proc(ctx);
	VATTR_INIT(&va);
	VATTR_SET(&va, va_type, VLNK);
	VATTR_SET(&va, va_mode, ACCESSPERMS & ~p->p_fd.fd_cmask);

#if CONFIG_MACF
	error = mac_vnode_check_create(ctx,
	    dvp, &nd.ni_cnd, &va);
#endif
	if (error != 0) {
		goto skipit;
	}

	/* The symlink name must not already exist. */
	if (vp != NULL) {
		error = EEXIST;
		goto skipit;
	}

	/* authorize */
	if (error == 0) {
		error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx);
	}
	/* get default ownership, etc. */
	if (error == 0) {
		error = vnode_authattr_new(dvp, &va, 0, ctx);
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

	if (error == 0) {
		error = VNOP_SYMLINK(dvp, &vp, &nd.ni_cnd, &va, path, ctx);
	}

	/* do fallback attribute handling */
	if (error == 0 && vp) {
		error = vnode_setattr_fallback(vp, &va, ctx);
	}

#if CONFIG_MACF
	if (error == 0 && vp) {
		error = vnode_label(vnode_mount(vp), dvp, vp, &nd.ni_cnd, VNODE_LABEL_CREATE, ctx);
	}
#endif

	if (error == 0) {
		int update_flags = 0;

		/* Check if a new vnode was created; else try to look one up. */
		if (vp == NULL) {
			nd.ni_cnd.cn_nameiop = LOOKUP;
#if CONFIG_TRIGGERS
			nd.ni_op = OP_LOOKUP;
#endif
			/*
			 * Clear all flags except HASBUF to prevent 'cn_pnbuf' buffer to be
			 * reallocated again in namei().
			 */
			nd.ni_cnd.cn_flags &= HASBUF;
			error = nameiat(&nd, fd);
			if (error) {
				goto skipit;
			}
			vp = nd.ni_vp;
		}

#if 0 /* XXX - kauth_todo - is KAUTH_FILEOP_SYMLINK needed? */
		/* call out to allow 3rd party notification of rename.
		 * Ignore result of kauth_authorize_fileop call.
		 */
		if (kauth_authorize_fileop_has_listeners() &&
		    namei(&nd) == 0) {
			char *new_link_path = NULL;
			int len;

			/* build the path to the new link file */
			new_link_path = get_pathbuff();
			len = MAXPATHLEN;
			vn_getpath(dvp, new_link_path, &len);
			if ((len + 1 + nd.ni_cnd.cn_namelen + 1) < MAXPATHLEN) {
				new_link_path[len - 1] = '/';
				strlcpy(&new_link_path[len], nd.ni_cnd.cn_nameptr, MAXPATHLEN - len);
			}

			kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_SYMLINK,
			    (uintptr_t)path, (uintptr_t)new_link_path);
			if (new_link_path != NULL) {
				release_pathbuff(new_link_path);
			}
		}
#endif
		// Make sure the name & parent pointers are hooked up
		if (vp->v_name == NULL) {
			update_flags |= VNODE_UPDATE_NAME;
		}
		if (vp->v_parent == NULLVP) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		if (update_flags) {
			vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
		}

#if CONFIG_FSE
		add_fsevent(FSE_CREATE_FILE, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
#endif
	}

skipit:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	vnode_put(dvp);
out:
	/* Free the kernel copy of the link target, if we allocated one above. */
	if (path && (path != (char *)path_data)) {
		zfree(ZV_NAMEI, path);
	}

	return error;
}
5944
5945 int
symlink(__unused proc_t p,struct symlink_args * uap,__unused int32_t * retval)5946 symlink(__unused proc_t p, struct symlink_args *uap, __unused int32_t *retval)
5947 {
5948 return symlinkat_internal(vfs_context_current(), uap->path, AT_FDCWD,
5949 uap->link, UIO_USERSPACE);
5950 }
5951
5952 int
symlinkat(__unused proc_t p,struct symlinkat_args * uap,__unused int32_t * retval)5953 symlinkat(__unused proc_t p, struct symlinkat_args *uap,
5954 __unused int32_t *retval)
5955 {
5956 return symlinkat_internal(vfs_context_current(), uap->path1, uap->fd,
5957 uap->path2, UIO_USERSPACE);
5958 }
5959
5960 /*
5961 * Delete a whiteout from the filesystem.
5962 * No longer supported.
5963 */
5964 int
undelete(__unused proc_t p,__unused struct undelete_args * uap,__unused int32_t * retval)5965 undelete(__unused proc_t p, __unused struct undelete_args *uap, __unused int32_t *retval)
5966 {
5967 return ENOTSUP;
5968 }
5969
5970 /*
5971 * Delete a name from the filesystem.
5972 */
5973 /* ARGSUSED */
static int
unlinkat_internal(vfs_context_t ctx, int fd, vnode_t start_dvp,
    user_addr_t path_arg, enum uio_seg segflg, int unlink_flags)
{
	/*
	 * The lookup/attribute state is heap-allocated in one chunk to keep
	 * this function's kernel stack footprint small.
	 */
	struct {
		struct nameidata nd;
#if CONFIG_FSE
		struct vnode_attr va;
		fse_info finfo;
#endif
	} *__unlink_data;
	struct nameidata *ndp;
	vnode_t vp, dvp;
	int error;
	struct componentname *cnp;
	char  *path = NULL;
	char  *no_firmlink_path = NULL;
	int  len_path = 0;
	int  len_no_firmlink_path = 0;
	int flags;
	int need_event;
	int has_listeners;
	int truncated_path;
	int truncated_no_firmlink_path;
	int batched;
	struct vnode_attr *vap;
	int do_retry;
	int retry_count = 0;
	int cn_flags;
	int nofollow_any = 0;

	cn_flags = LOCKPARENT;
	if (!(unlink_flags & VNODE_REMOVE_NO_AUDIT_PATH)) {
		cn_flags |= AUDITVNPATH1;
	}
	if (unlink_flags & VNODE_REMOVE_NOFOLLOW_ANY) {
		nofollow_any = NAMEI_NOFOLLOW_ANY;
		unlink_flags &= ~VNODE_REMOVE_NOFOLLOW_ANY;
	}
	/* If a starting dvp is passed, it trumps any fd passed. */
	if (start_dvp) {
		cn_flags |= USEDVP;
	}

#if NAMEDRSRCFORK
	/* unlink or delete is allowed on rsrc forks and named streams */
	cn_flags |= CN_ALLOWRSRCFORK;
#endif

	__unlink_data = kalloc_type(typeof(*__unlink_data), Z_WAITOK);
	ndp = &__unlink_data->nd;
#if CONFIG_FSE
	fse_info *finfop = &__unlink_data->finfo;
#endif

retry:
	/* Per-attempt state is reset here; we may come back after ENOENT races. */
	do_retry = 0;
	flags = 0;
	need_event = 0;
	has_listeners = 0;
	truncated_path = 0;
	truncated_no_firmlink_path = 0;
	vap = NULL;

	NDINIT(ndp, DELETE, OP_UNLINK, cn_flags, segflg, path_arg, ctx);

	ndp->ni_dvp = start_dvp;
	ndp->ni_flag |= NAMEI_COMPOUNDREMOVE | nofollow_any;
	cnp = &ndp->ni_cnd;

continue_lookup:
	error = nameiat(ndp, fd);
	if (error) {
		goto early_out;
	}

	dvp = ndp->ni_dvp;
	vp = ndp->ni_vp;

	/* With Carbon delete semantics, busy files cannot be deleted */
	if (unlink_flags & VNODE_REMOVE_NODELETEBUSY) {
		flags |= VNODE_REMOVE_NODELETEBUSY;
	}

	/* Skip any potential upcalls if told to. */
	if (unlink_flags & VNODE_REMOVE_SKIP_NAMESPACE_EVENT) {
		flags |= VNODE_REMOVE_SKIP_NAMESPACE_EVENT;
	}

	if (vp) {
		batched = vnode_compound_remove_available(vp);
		/*
		 * The root of a mounted filesystem cannot be deleted.
		 */
		if ((vp->v_flag & VROOT) || (dvp->v_mount != vp->v_mount)) {
			error = EBUSY;
			goto out;
		}

#if DEVELOPMENT || DEBUG
		/*
		 * XXX VSWAP: Check for entitlements or special flag here
		 * so we can restrict access appropriately.
		 */
#else /* DEVELOPMENT || DEBUG */

		if (vnode_isswap(vp) && (ctx != vfs_context_kernel())) {
			error = EPERM;
			goto out;
		}
#endif /* DEVELOPMENT || DEBUG */

		if (!batched) {
			error = vn_authorize_unlink(dvp, vp, cnp, ctx, NULL);
			if (error) {
				/* ENOENT from authorization may be a name-cache race; retry */
				if (error == ENOENT) {
					if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
						do_retry = 1;
						retry_count++;
					}
				}
				goto out;
			}
		}
	} else {
		/* No vp: the filesystem must handle lookup+remove as one compound op. */
		batched = 1;

		if (!vnode_compound_remove_available(dvp)) {
			panic("No vp, but no compound remove?");
		}
	}

#if CONFIG_FSE
	need_event = need_fsevent(FSE_DELETE, dvp);
	if (need_event) {
		if (!batched) {
			if ((vp->v_flag & VISHARDLINK) == 0) {
				/* XXX need to get these data in batched VNOP */
				get_fse_info(vp, finfop, ctx);
			}
		} else {
			error =
			    vfs_get_notify_attributes(&__unlink_data->va);
			if (error) {
				goto out;
			}

			vap = &__unlink_data->va;
		}
	}
#endif
	has_listeners = kauth_authorize_fileop_has_listeners();
	/* Build path strings only when fsevents or fileop listeners need them. */
	if (need_event || has_listeners) {
		if (path == NULL) {
			GET_PATH(path);
		}
		len_path = safe_getpath(dvp, ndp->ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path);
		if (no_firmlink_path == NULL) {
			GET_PATH(no_firmlink_path);
		}
		len_no_firmlink_path = safe_getpath_no_firmlink(dvp, ndp->ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);
	}

#if NAMEDRSRCFORK
	if (ndp->ni_cnd.cn_flags & CN_WANTSRSRCFORK) {
		error = vnode_removenamedstream(dvp, vp, XATTR_RESOURCEFORK_NAME, 0, ctx);
	} else
#endif
	{
#if CONFIG_FILE_LEASES
		vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

		error = vn_remove(dvp, &ndp->ni_vp, ndp, flags, vap, ctx);
		vp = ndp->ni_vp;
		if (error == EKEEPLOOKING) {
			if (!batched) {
				panic("EKEEPLOOKING, but not a filesystem that supports compound VNOPs?");
			}

			if ((ndp->ni_flag & NAMEI_CONTLOOKUP) == 0) {
				panic("EKEEPLOOKING, but continue flag not set?");
			}

			if (vnode_isdir(vp)) {
				error = EISDIR;
				goto out;
			}
			goto continue_lookup;
		} else if (error == ENOENT && batched) {
			if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
				/*
				 * For compound VNOPs, the authorization callback may
				 * return ENOENT in case of racing hardlink lookups
				 * hitting the name cache, redrive the lookup.
				 */
				do_retry = 1;
				retry_count += 1;
				goto out;
			}
		}
	}

	/*
	 * Call out to allow 3rd party notification of delete.
	 * Ignore result of kauth_authorize_fileop call.
	 */
	if (!error) {
		if (has_listeners) {
			kauth_authorize_fileop(vfs_context_ucred(ctx),
			    KAUTH_FILEOP_DELETE,
			    (uintptr_t)vp,
			    (uintptr_t)path);
		}

		if (vp->v_flag & VISHARDLINK) {
			//
			// if a hardlink gets deleted we want to blow away the
			// v_parent link because the path that got us to this
			// instance of the link is no longer valid.  this will
			// force the next call to get the path to ask the file
			// system instead of just following the v_parent link.
			//
			vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
		}

#if CONFIG_FSE
		if (need_event) {
			if (vp->v_flag & VISHARDLINK) {
				get_fse_info(vp, finfop, ctx);
			} else if (vap) {
				vnode_get_fse_info_from_vap(vp, finfop, vap);
			}
			if (truncated_path) {
				finfop->mode |= FSE_TRUNCATED_PATH;
			}
			add_fsevent(FSE_DELETE, ctx,
			    FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
			    FSE_ARG_FINFO, finfop,
			    FSE_ARG_DONE);
		}
#endif

#if CONFIG_MACF
		mac_vnode_notify_unlink(ctx, dvp, vp, cnp);
#endif
	}

out:
	if (path != NULL) {
		RELEASE_PATH(path);
		path = NULL;
	}

	if (no_firmlink_path != NULL) {
		RELEASE_PATH(no_firmlink_path);
		no_firmlink_path = NULL;
	}
#if NAMEDRSRCFORK
	/* recycle the deleted rsrc fork vnode to force a reclaim, which
	 * will cause its shadow file to go away if necessary.
	 */
	if (vp && (vnode_isnamedstream(vp)) &&
	    (vp->v_parent != NULLVP) &&
	    vnode_isshadow(vp)) {
		vnode_recycle(vp);
	}
#endif
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(ndp);
	vnode_put(dvp);
	if (vp) {
		vnode_put(vp);
	}

	if (do_retry) {
		goto retry;
	}

early_out:
	kfree_type(typeof(*__unlink_data), __unlink_data);
	return error;
}
6260
6261 int
unlink1(vfs_context_t ctx,vnode_t start_dvp,user_addr_t path_arg,enum uio_seg segflg,int unlink_flags)6262 unlink1(vfs_context_t ctx, vnode_t start_dvp, user_addr_t path_arg,
6263 enum uio_seg segflg, int unlink_flags)
6264 {
6265 return unlinkat_internal(ctx, AT_FDCWD, start_dvp, path_arg, segflg,
6266 unlink_flags);
6267 }
6268
6269 /*
6270 * Delete a name from the filesystem using Carbon semantics.
6271 */
6272 int
delete(__unused proc_t p,struct delete_args * uap,__unused int32_t * retval)6273 delete(__unused proc_t p, struct delete_args *uap, __unused int32_t *retval)
6274 {
6275 return unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
6276 uap->path, UIO_USERSPACE, VNODE_REMOVE_NODELETEBUSY);
6277 }
6278
6279 /*
6280 * Delete a name from the filesystem using POSIX semantics.
6281 */
6282 int
unlink(__unused proc_t p,struct unlink_args * uap,__unused int32_t * retval)6283 unlink(__unused proc_t p, struct unlink_args *uap, __unused int32_t *retval)
6284 {
6285 return unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
6286 uap->path, UIO_USERSPACE, 0);
6287 }
6288
6289 int
unlinkat(__unused proc_t p,struct unlinkat_args * uap,__unused int32_t * retval)6290 unlinkat(__unused proc_t p, struct unlinkat_args *uap, __unused int32_t *retval)
6291 {
6292 int unlink_flags = 0;
6293
6294 if (uap->flag & ~(AT_REMOVEDIR | AT_REMOVEDIR_DATALESS | AT_SYMLINK_NOFOLLOW_ANY)) {
6295 return EINVAL;
6296 }
6297
6298 if (uap->flag & AT_SYMLINK_NOFOLLOW_ANY) {
6299 unlink_flags |= VNODE_REMOVE_NOFOLLOW_ANY;
6300 }
6301
6302 if (uap->flag & (AT_REMOVEDIR | AT_REMOVEDIR_DATALESS)) {
6303 if (uap->flag & AT_REMOVEDIR_DATALESS) {
6304 unlink_flags |= VNODE_REMOVE_DATALESS_DIR;
6305 }
6306 return rmdirat_internal(vfs_context_current(), uap->fd,
6307 uap->path, UIO_USERSPACE, unlink_flags);
6308 } else {
6309 return unlinkat_internal(vfs_context_current(), uap->fd,
6310 NULLVP, uap->path, UIO_USERSPACE, unlink_flags);
6311 }
6312 }
6313
6314 /*
6315 * Reposition read/write file offset.
6316 */
int
lseek(proc_t p, struct lseek_args *uap, off_t *retval)
{
	struct fileproc *fp;
	vnode_t vp;
	struct vfs_context *ctx;
	off_t offset = uap->offset, file_size;
	int error;

	if ((error = fp_getfvp(p, uap->fd, &fp, &vp))) {
		/* fd exists but has no vnode (e.g. a socket): report ESPIPE */
		if (error == ENOTSUP) {
			return ESPIPE;
		}
		return error;
	}
	/* FIFOs are not seekable. */
	if (vnode_isfifo(vp)) {
		file_drop(uap->fd);
		return ESPIPE;
	}


	ctx = vfs_context_current();
#if CONFIG_MACF
	/*
	 * lseek(fd, 0, SEEK_CUR) only queries the current offset, so check
	 * the weaker "get offset" permission; everything else changes it.
	 */
	if (uap->whence == L_INCR && uap->offset == 0) {
		error = mac_file_check_get_offset(vfs_context_ucred(ctx),
		    fp->fp_glob);
	} else {
		error = mac_file_check_change_offset(vfs_context_ucred(ctx),
		    fp->fp_glob);
	}
	if (error) {
		file_drop(uap->fd);
		return error;
	}
#endif
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	/* Compute the candidate absolute offset according to 'whence'. */
	switch (uap->whence) {
	case L_INCR:
		offset += fp->fp_glob->fg_offset;
		break;
	case L_XTND:
		if ((error = vnode_size(vp, &file_size, ctx)) != 0) {
			break;
		}
		offset += file_size;
		break;
	case L_SET:
		break;
	case SEEK_HOLE:
		/* Let the filesystem find the next hole/data region in place. */
		error = VNOP_IOCTL(vp, FSIOC_FIOSEEKHOLE, (caddr_t)&offset, 0, ctx);
		break;
	case SEEK_DATA:
		error = VNOP_IOCTL(vp, FSIOC_FIOSEEKDATA, (caddr_t)&offset, 0, ctx);
		break;
	default:
		error = EINVAL;
	}
	if (error == 0) {
		if (uap->offset > 0 && offset < 0) {
			/* Incremented/relative move past max size */
			error = EOVERFLOW;
		} else {
			/*
			 * Allow negative offsets on character devices, per
			 * POSIX 1003.1-2001.  Most likely for writing disk
			 * labels.
			 */
			if (offset < 0 && vp->v_type != VCHR) {
				/* Decremented/relative move before start */
				error = EINVAL;
			} else {
				/* Success */
				fp->fp_glob->fg_offset = offset;
				*retval = fp->fp_glob->fg_offset;
			}
		}
	}

	/*
	 * An lseek can affect whether data is "available to read."  Use
	 * hint of NOTE_NONE so no EVFILT_VNODE events fire
	 */
	post_event_if_success(vp, error, NOTE_NONE);
	(void)vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
6408
6409
6410 /*
6411 * Check access permissions.
6412 *
6413 * Returns: 0 Success
6414 * vnode_authorize:???
6415 */
6416 static int
access1(vnode_t vp,vnode_t dvp,int uflags,vfs_context_t ctx)6417 access1(vnode_t vp, vnode_t dvp, int uflags, vfs_context_t ctx)
6418 {
6419 kauth_action_t action;
6420 int error;
6421
6422 /*
6423 * If just the regular access bits, convert them to something
6424 * that vnode_authorize will understand.
6425 */
6426 if (!(uflags & _ACCESS_EXTENDED_MASK)) {
6427 action = 0;
6428 if (uflags & R_OK) {
6429 action |= KAUTH_VNODE_READ_DATA; /* aka KAUTH_VNODE_LIST_DIRECTORY */
6430 }
6431 if (uflags & W_OK) {
6432 if (vnode_isdir(vp)) {
6433 action |= KAUTH_VNODE_ADD_FILE |
6434 KAUTH_VNODE_ADD_SUBDIRECTORY;
6435 /* might want delete rights here too */
6436 } else {
6437 action |= KAUTH_VNODE_WRITE_DATA;
6438 }
6439 }
6440 if (uflags & X_OK) {
6441 if (vnode_isdir(vp)) {
6442 action |= KAUTH_VNODE_SEARCH;
6443 } else {
6444 action |= KAUTH_VNODE_EXECUTE;
6445 }
6446 }
6447 } else {
6448 /* take advantage of definition of uflags */
6449 action = uflags >> 8;
6450 }
6451
6452 #if CONFIG_MACF
6453 error = mac_vnode_check_access(ctx, vp, uflags);
6454 if (error) {
6455 return error;
6456 }
6457 #endif /* MAC */
6458
6459 /* action == 0 means only check for existence */
6460 if (action != 0) {
6461 error = vnode_authorize(vp, dvp, action | KAUTH_VNODE_ACCESS, ctx);
6462 } else {
6463 error = 0;
6464 }
6465
6466 return error;
6467 }
6468
6469
6470
6471 /*
6472 * access_extended: Check access permissions in bulk.
6473 *
6474 * Description: uap->entries Pointer to an array of accessx
6475 * descriptor structs, plus one or
6476 * more NULL terminated strings (see
6477 * "Notes" section below).
6478 * uap->size Size of the area pointed to by
6479 * uap->entries.
6480 * uap->results Pointer to the results array.
6481 *
6482 * Returns: 0 Success
6483 * ENOMEM Insufficient memory
6484 * EINVAL Invalid arguments
6485 * namei:EFAULT Bad address
6486 * namei:ENAMETOOLONG Filename too long
6487 * namei:ENOENT No such file or directory
6488 * namei:ELOOP Too many levels of symbolic links
6489 * namei:EBADF Bad file descriptor
6490 * namei:ENOTDIR Not a directory
6491 * namei:???
6492 * access1:
6493 *
6494 * Implicit returns:
6495 * uap->results Array contents modified
6496 *
6497 * Notes: The uap->entries are structured as an arbitrary length array
6498 * of accessx descriptors, followed by one or more NULL terminated
6499 * strings
6500 *
6501 * struct accessx_descriptor[0]
6502 * ...
6503 * struct accessx_descriptor[n]
6504 * char name_data[0];
6505 *
6506 * We determine the entry count by walking the buffer containing
6507 * the uap->entries argument descriptor. For each descriptor we
6508 * see, the valid values for the offset ad_name_offset will be
6509 * in the byte range:
6510 *
6511 * [ uap->entries + sizeof(struct accessx_descriptor) ]
6512 * to
6513 * [ uap->entries + uap->size - 2 ]
6514 *
6515 * since we must have at least one string, and the string must
6516 * be at least one character plus the NULL terminator in length.
6517 *
6518 * XXX: Need to support the check-as uid argument
6519 */
6520 int
access_extended(__unused proc_t p,struct access_extended_args * uap,__unused int32_t * retval)6521 access_extended(__unused proc_t p, struct access_extended_args *uap, __unused int32_t *retval)
6522 {
6523 struct accessx_descriptor *input = NULL;
6524 errno_t *result = NULL;
6525 errno_t error = 0;
6526 int wantdelete = 0;
6527 size_t desc_max, desc_actual = 0;
6528 unsigned int i, j;
6529 struct vfs_context context;
6530 struct nameidata nd;
6531 int niopts;
6532 vnode_t vp = NULL;
6533 vnode_t dvp = NULL;
6534 #define ACCESSX_MAX_DESCR_ON_STACK 10
6535 struct accessx_descriptor stack_input[ACCESSX_MAX_DESCR_ON_STACK];
6536
6537 context.vc_ucred = NULL;
6538
6539 /*
6540 * Validate parameters; if valid, copy the descriptor array and string
6541 * arguments into local memory. Before proceeding, the following
6542 * conditions must have been met:
6543 *
6544 * o The total size is not permitted to exceed ACCESSX_MAX_TABLESIZE
6545 * o There must be sufficient room in the request for at least one
 *	  descriptor and a one byte NUL terminated string.
6547 * o The allocation of local storage must not fail.
6548 */
6549 if (uap->size > ACCESSX_MAX_TABLESIZE) {
6550 return ENOMEM;
6551 }
6552 if (uap->size < (sizeof(struct accessx_descriptor) + 2)) {
6553 return EINVAL;
6554 }
6555 if (uap->size <= sizeof(stack_input)) {
6556 input = stack_input;
6557 } else {
6558 input = kalloc_data(uap->size, Z_WAITOK);
6559 if (input == NULL) {
6560 error = ENOMEM;
6561 goto out;
6562 }
6563 }
6564 error = copyin(uap->entries, input, uap->size);
6565 if (error) {
6566 goto out;
6567 }
6568
6569 AUDIT_ARG(opaque, input, uap->size);
6570
6571 /*
	 * Force NUL termination of the copyin buffer to avoid namei() running
6573 * off the end. If the caller passes us bogus data, they may get a
6574 * bogus result.
6575 */
6576 ((char *)input)[uap->size - 1] = 0;
6577
6578 /*
6579 * Access is defined as checking against the process' real identity,
6580 * even if operations are checking the effective identity. This
6581 * requires that we use a local vfs context.
6582 */
6583 context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
6584 context.vc_thread = current_thread();
6585
6586 /*
6587 * Find out how many entries we have, so we can allocate the result
6588 * array by walking the list and adjusting the count downward by the
6589 * earliest string offset we see.
6590 */
6591 desc_max = (uap->size - 2) / sizeof(struct accessx_descriptor);
6592 desc_actual = desc_max;
6593 for (i = 0; i < desc_actual; i++) {
6594 /*
6595 * Take the offset to the name string for this entry and
6596 * convert to an input array index, which would be one off
6597 * the end of the array if this entry was the lowest-addressed
6598 * name string.
6599 */
6600 j = input[i].ad_name_offset / sizeof(struct accessx_descriptor);
6601
6602 /*
6603 * An offset greater than the max allowable offset is an error.
6604 * It is also an error for any valid entry to point
6605 * to a location prior to the end of the current entry, if
6606 * it's not a reference to the string of the previous entry.
6607 */
6608 if (j > desc_max || (j != 0 && j <= i)) {
6609 error = EINVAL;
6610 goto out;
6611 }
6612
6613 /* Also do not let ad_name_offset point to something beyond the size of the input */
6614 if (input[i].ad_name_offset >= uap->size) {
6615 error = EINVAL;
6616 goto out;
6617 }
6618
6619 /*
6620 * An offset of 0 means use the previous descriptor's offset;
6621 * this is used to chain multiple requests for the same file
6622 * to avoid multiple lookups.
6623 */
6624 if (j == 0) {
6625 /* This is not valid for the first entry */
6626 if (i == 0) {
6627 error = EINVAL;
6628 goto out;
6629 }
6630 continue;
6631 }
6632
6633 /*
6634 * If the offset of the string for this descriptor is before
6635 * what we believe is the current actual last descriptor,
6636 * then we need to adjust our estimate downward; this permits
6637 * the string table following the last descriptor to be out
6638 * of order relative to the descriptor list.
6639 */
6640 if (j < desc_actual) {
6641 desc_actual = j;
6642 }
6643 }
6644
6645 /*
6646 * We limit the actual number of descriptors we are willing to process
6647 * to a hard maximum of ACCESSX_MAX_DESCRIPTORS. If the number being
6648 * requested does not exceed this limit,
6649 */
6650 if (desc_actual > ACCESSX_MAX_DESCRIPTORS) {
6651 error = ENOMEM;
6652 goto out;
6653 }
6654 result = kalloc_data(desc_actual * sizeof(errno_t), Z_WAITOK | Z_ZERO);
6655 if (result == NULL) {
6656 error = ENOMEM;
6657 goto out;
6658 }
6659
6660 /*
6661 * Do the work by iterating over the descriptor entries we know to
6662 * at least appear to contain valid data.
6663 */
6664 error = 0;
6665 for (i = 0; i < desc_actual; i++) {
6666 /*
6667 * If the ad_name_offset is 0, then we use the previous
6668 * results to make the check; otherwise, we are looking up
6669 * a new file name.
6670 */
6671 if (input[i].ad_name_offset != 0) {
6672 /* discard old vnodes */
6673 if (vp) {
6674 vnode_put(vp);
6675 vp = NULL;
6676 }
6677 if (dvp) {
6678 vnode_put(dvp);
6679 dvp = NULL;
6680 }
6681
6682 /*
6683 * Scan forward in the descriptor list to see if we
6684 * need the parent vnode. We will need it if we are
6685 * deleting, since we must have rights to remove
6686 * entries in the parent directory, as well as the
6687 * rights to delete the object itself.
6688 */
6689 wantdelete = input[i].ad_flags & _DELETE_OK;
6690 for (j = i + 1; (j < desc_actual) && (input[j].ad_name_offset == 0); j++) {
6691 if (input[j].ad_flags & _DELETE_OK) {
6692 wantdelete = 1;
6693 }
6694 }
6695
6696 niopts = FOLLOW | AUDITVNPATH1;
6697
6698 /* need parent for vnode_authorize for deletion test */
6699 if (wantdelete) {
6700 niopts |= WANTPARENT;
6701 }
6702
6703 /* do the lookup */
6704 NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, UIO_SYSSPACE,
6705 CAST_USER_ADDR_T(((const char *)input) + input[i].ad_name_offset),
6706 &context);
6707 error = namei(&nd);
6708 if (!error) {
6709 vp = nd.ni_vp;
6710 if (wantdelete) {
6711 dvp = nd.ni_dvp;
6712 }
6713 }
6714 nameidone(&nd);
6715 }
6716
6717 /*
6718 * Handle lookup errors.
6719 */
6720 switch (error) {
6721 case ENOENT:
6722 case EACCES:
6723 case EPERM:
6724 case ENOTDIR:
6725 result[i] = error;
6726 break;
6727 case 0:
6728 /* run this access check */
6729 result[i] = access1(vp, dvp, input[i].ad_flags, &context);
6730 break;
6731 default:
6732 /* fatal lookup error */
6733
6734 goto out;
6735 }
6736 }
6737
6738 AUDIT_ARG(data, result, sizeof(errno_t), desc_actual);
6739
6740 /* copy out results */
6741 error = copyout(result, uap->results, desc_actual * sizeof(errno_t));
6742
6743 out:
6744 if (input && input != stack_input) {
6745 kfree_data(input, uap->size);
6746 }
6747 if (result) {
6748 kfree_data(result, desc_actual * sizeof(errno_t));
6749 }
6750 if (vp) {
6751 vnode_put(vp);
6752 }
6753 if (dvp) {
6754 vnode_put(dvp);
6755 }
6756 if (IS_VALID_CRED(context.vc_ucred)) {
6757 kauth_cred_unref(&context.vc_ucred);
6758 }
6759 return error;
6760 }
6761
6762
6763 /*
6764 * Returns: 0 Success
6765 * namei:EFAULT Bad address
6766 * namei:ENAMETOOLONG Filename too long
6767 * namei:ENOENT No such file or directory
6768 * namei:ELOOP Too many levels of symbolic links
6769 * namei:EBADF Bad file descriptor
6770 * namei:ENOTDIR Not a directory
6771 * namei:???
6772 * access1:
6773 */
static int
faccessat_internal(vfs_context_t ctx, int fd, user_addr_t path, int amode,
    int flag, enum uio_seg segflg)
{
	int error;
	struct nameidata nd;
	int niopts;
	struct vfs_context context;	/* local context so we can swap the cred */
#if NAMEDRSRCFORK
	int is_namedstream = 0;
#endif

	/*
	 * Unless the AT_EACCESS option is used, Access is defined as checking
	 * against the process' real identity, even if operations are checking
	 * the effective identity.  So we need to tweak the credential
	 * in the context for that case.
	 */
	if (!(flag & AT_EACCESS)) {
		/* takes a new reference; released at 'out' below */
		context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
	} else {
		/* borrowed reference from the caller's context; not unref'd */
		context.vc_ucred = ctx->vc_ucred;
	}
	context.vc_thread = ctx->vc_thread;


	niopts = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY) ? NOFOLLOW : FOLLOW) | AUDITVNPATH1;
	/* need parent for vnode_authorize for deletion test */
	if (amode & _DELETE_OK) {
		niopts |= WANTPARENT;
	}
	NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, segflg,
	    path, &context);
	if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
		/* refuse to traverse symlinks anywhere in the path, not just at the tail */
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}

#if NAMEDRSRCFORK
	/* access(F_OK) calls are allowed for resource forks. */
	if (amode == F_OK) {
		nd.ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
	}
#endif
	error = nameiat(&nd, fd);
	if (error) {
		goto out;
	}

#if NAMEDRSRCFORK
	/* Grab reference on the shadow stream file vnode to
	 * force an inactive on release which will mark it
	 * for recycle.
	 */
	if (vnode_isnamedstream(nd.ni_vp) &&
	    (nd.ni_vp->v_parent != NULLVP) &&
	    vnode_isshadow(nd.ni_vp)) {
		is_namedstream = 1;
		vnode_ref(nd.ni_vp);
	}
#endif

	/* run the actual permission check with the (possibly real-id) cred */
	error = access1(nd.ni_vp, nd.ni_dvp, amode, &context);

#if NAMEDRSRCFORK
	if (is_namedstream) {
		vnode_rele(nd.ni_vp);
	}
#endif

	vnode_put(nd.ni_vp);
	/* the parent iocount exists only when WANTPARENT was set above */
	if (amode & _DELETE_OK) {
		vnode_put(nd.ni_dvp);
	}
	nameidone(&nd);

out:
	/* release the real-identity credential taken above, if any */
	if (!(flag & AT_EACCESS)) {
		kauth_cred_unref(&context.vc_ucred);
	}
	return error;
}
6855
6856 int
access(__unused proc_t p,struct access_args * uap,__unused int32_t * retval)6857 access(__unused proc_t p, struct access_args *uap, __unused int32_t *retval)
6858 {
6859 return faccessat_internal(vfs_context_current(), AT_FDCWD,
6860 uap->path, uap->flags, 0, UIO_USERSPACE);
6861 }
6862
6863 int
faccessat(__unused proc_t p,struct faccessat_args * uap,__unused int32_t * retval)6864 faccessat(__unused proc_t p, struct faccessat_args *uap,
6865 __unused int32_t *retval)
6866 {
6867 if (uap->flag & ~(AT_EACCESS | AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) {
6868 return EINVAL;
6869 }
6870
6871 return faccessat_internal(vfs_context_current(), uap->fd,
6872 uap->path, uap->amode, uap->flag, UIO_USERSPACE);
6873 }
6874
6875 /*
6876 * Returns: 0 Success
6877 * EFAULT
6878 * copyout:EFAULT
6879 * namei:???
6880 * vn_stat:???
6881 */
6882 static int
fstatat_internal(vfs_context_t ctx,user_addr_t path,user_addr_t ub,user_addr_t xsecurity,user_addr_t xsecurity_size,int isstat64,enum uio_seg segflg,int fd,int flag)6883 fstatat_internal(vfs_context_t ctx, user_addr_t path, user_addr_t ub,
6884 user_addr_t xsecurity, user_addr_t xsecurity_size, int isstat64,
6885 enum uio_seg segflg, int fd, int flag)
6886 {
6887 struct nameidata *ndp = NULL;
6888 int follow;
6889 union {
6890 struct stat sb;
6891 struct stat64 sb64;
6892 } source = {};
6893 union {
6894 struct user64_stat user64_sb;
6895 struct user32_stat user32_sb;
6896 struct user64_stat64 user64_sb64;
6897 struct user32_stat64 user32_sb64;
6898 } dest = {};
6899 caddr_t sbp;
6900 int error, my_size;
6901 kauth_filesec_t fsec = KAUTH_FILESEC_NONE;
6902 size_t xsecurity_bufsize;
6903 void * statptr;
6904 struct fileproc *fp = NULL;
6905 int needsrealdev = 0;
6906
6907 follow = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) ? NOFOLLOW : FOLLOW;
6908 ndp = kalloc_type(struct nameidata, Z_WAITOK);
6909 NDINIT(ndp, LOOKUP, OP_GETATTR, follow | AUDITVNPATH1,
6910 segflg, path, ctx);
6911 if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
6912 ndp->ni_flag |= NAMEI_NOFOLLOW_ANY;
6913 }
6914
6915 #if NAMEDRSRCFORK
6916 int is_namedstream = 0;
6917 /* stat calls are allowed for resource forks. */
6918 ndp->ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
6919 #endif
6920
6921 if (flag & AT_FDONLY) {
6922 vnode_t fvp;
6923
6924 error = fp_getfvp(vfs_context_proc(ctx), fd, &fp, &fvp);
6925 if (error) {
6926 goto out;
6927 }
6928 if ((error = vnode_getwithref(fvp))) {
6929 file_drop(fd);
6930 goto out;
6931 }
6932 ndp->ni_vp = fvp;
6933 } else {
6934 error = nameiat(ndp, fd);
6935 if (error) {
6936 goto out;
6937 }
6938 }
6939
6940 statptr = (void *)&source;
6941
6942 #if NAMEDRSRCFORK
6943 /* Grab reference on the shadow stream file vnode to
6944 * force an inactive on release which will mark it
6945 * for recycle.
6946 */
6947 if (vnode_isnamedstream(ndp->ni_vp) &&
6948 (ndp->ni_vp->v_parent != NULLVP) &&
6949 vnode_isshadow(ndp->ni_vp)) {
6950 is_namedstream = 1;
6951 vnode_ref(ndp->ni_vp);
6952 }
6953 #endif
6954
6955 needsrealdev = flag & AT_REALDEV ? 1 : 0;
6956 if (fp && (xsecurity == USER_ADDR_NULL)) {
6957 /*
6958 * If the caller has the file open, and is not
6959 * requesting extended security information, we are
6960 * going to let them get the basic stat information.
6961 */
6962 error = vn_stat_noauth(ndp->ni_vp, statptr, NULL, isstat64, needsrealdev, ctx,
6963 fp->fp_glob->fg_cred);
6964 } else {
6965 error = vn_stat(ndp->ni_vp, statptr, (xsecurity != USER_ADDR_NULL ? &fsec : NULL),
6966 isstat64, needsrealdev, ctx);
6967 }
6968
6969 #if NAMEDRSRCFORK
6970 if (is_namedstream) {
6971 vnode_rele(ndp->ni_vp);
6972 }
6973 #endif
6974 vnode_put(ndp->ni_vp);
6975 nameidone(ndp);
6976
6977 if (fp) {
6978 file_drop(fd);
6979 fp = NULL;
6980 }
6981
6982 if (error) {
6983 goto out;
6984 }
6985 /* Zap spare fields */
6986 if (isstat64 != 0) {
6987 source.sb64.st_lspare = 0;
6988 source.sb64.st_qspare[0] = 0LL;
6989 source.sb64.st_qspare[1] = 0LL;
6990 if (vfs_context_is64bit(ctx)) {
6991 munge_user64_stat64(&source.sb64, &dest.user64_sb64);
6992 my_size = sizeof(dest.user64_sb64);
6993 sbp = (caddr_t)&dest.user64_sb64;
6994 } else {
6995 munge_user32_stat64(&source.sb64, &dest.user32_sb64);
6996 my_size = sizeof(dest.user32_sb64);
6997 sbp = (caddr_t)&dest.user32_sb64;
6998 }
6999 /*
7000 * Check if we raced (post lookup) against the last unlink of a file.
7001 */
7002 if ((source.sb64.st_nlink == 0) && S_ISREG(source.sb64.st_mode)) {
7003 source.sb64.st_nlink = 1;
7004 }
7005 } else {
7006 source.sb.st_lspare = 0;
7007 source.sb.st_qspare[0] = 0LL;
7008 source.sb.st_qspare[1] = 0LL;
7009 if (vfs_context_is64bit(ctx)) {
7010 munge_user64_stat(&source.sb, &dest.user64_sb);
7011 my_size = sizeof(dest.user64_sb);
7012 sbp = (caddr_t)&dest.user64_sb;
7013 } else {
7014 munge_user32_stat(&source.sb, &dest.user32_sb);
7015 my_size = sizeof(dest.user32_sb);
7016 sbp = (caddr_t)&dest.user32_sb;
7017 }
7018
7019 /*
7020 * Check if we raced (post lookup) against the last unlink of a file.
7021 */
7022 if ((source.sb.st_nlink == 0) && S_ISREG(source.sb.st_mode)) {
7023 source.sb.st_nlink = 1;
7024 }
7025 }
7026 if ((error = copyout(sbp, ub, my_size)) != 0) {
7027 goto out;
7028 }
7029
7030 /* caller wants extended security information? */
7031 if (xsecurity != USER_ADDR_NULL) {
7032 /* did we get any? */
7033 if (fsec == KAUTH_FILESEC_NONE) {
7034 if (susize(xsecurity_size, 0) != 0) {
7035 error = EFAULT;
7036 goto out;
7037 }
7038 } else {
7039 /* find the user buffer size */
7040 xsecurity_bufsize = fusize(xsecurity_size);
7041
7042 /* copy out the actual data size */
7043 if (susize(xsecurity_size, KAUTH_FILESEC_COPYSIZE(fsec)) != 0) {
7044 error = EFAULT;
7045 goto out;
7046 }
7047
7048 /* if the caller supplied enough room, copy out to it */
7049 if (xsecurity_bufsize >= KAUTH_FILESEC_COPYSIZE(fsec)) {
7050 error = copyout(fsec, xsecurity, KAUTH_FILESEC_COPYSIZE(fsec));
7051 }
7052 }
7053 }
7054 out:
7055 if (ndp) {
7056 kfree_type(struct nameidata, ndp);
7057 }
7058 if (fsec != KAUTH_FILESEC_NONE) {
7059 kauth_filesec_free(fsec);
7060 }
7061 return error;
7062 }
7063
7064 /*
7065 * stat_extended: Get file status; with extended security (ACL).
7066 *
7067 * Parameters: p (ignored)
7068 * uap User argument descriptor (see below)
7069 * retval (ignored)
7070 *
7071 * Indirect: uap->path Path of file to get status from
7072 * uap->ub User buffer (holds file status info)
7073 * uap->xsecurity ACL to get (extended security)
7074 * uap->xsecurity_size Size of ACL
7075 *
7076 * Returns: 0 Success
7077 * !0 errno value
7078 *
7079 */
7080 int
stat_extended(__unused proc_t p,struct stat_extended_args * uap,__unused int32_t * retval)7081 stat_extended(__unused proc_t p, struct stat_extended_args *uap,
7082 __unused int32_t *retval)
7083 {
7084 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7085 uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
7086 0);
7087 }
7088
7089 /*
7090 * Returns: 0 Success
7091 * fstatat_internal:??? [see fstatat_internal() in this file]
7092 */
7093 int
stat(__unused proc_t p,struct stat_args * uap,__unused int32_t * retval)7094 stat(__unused proc_t p, struct stat_args *uap, __unused int32_t *retval)
7095 {
7096 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7097 0, 0, 0, UIO_USERSPACE, AT_FDCWD, 0);
7098 }
7099
7100 int
stat64(__unused proc_t p,struct stat64_args * uap,__unused int32_t * retval)7101 stat64(__unused proc_t p, struct stat64_args *uap, __unused int32_t *retval)
7102 {
7103 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7104 0, 0, 1, UIO_USERSPACE, AT_FDCWD, 0);
7105 }
7106
7107 /*
7108 * stat64_extended: Get file status; can handle large inode numbers; with extended security (ACL).
7109 *
7110 * Parameters: p (ignored)
7111 * uap User argument descriptor (see below)
7112 * retval (ignored)
7113 *
7114 * Indirect: uap->path Path of file to get status from
7115 * uap->ub User buffer (holds file status info)
7116 * uap->xsecurity ACL to get (extended security)
7117 * uap->xsecurity_size Size of ACL
7118 *
7119 * Returns: 0 Success
7120 * !0 errno value
7121 *
7122 */
7123 int
stat64_extended(__unused proc_t p,struct stat64_extended_args * uap,__unused int32_t * retval)7124 stat64_extended(__unused proc_t p, struct stat64_extended_args *uap, __unused int32_t *retval)
7125 {
7126 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7127 uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
7128 0);
7129 }
7130
7131 /*
7132 * lstat_extended: Get file status; does not follow links; with extended security (ACL).
7133 *
7134 * Parameters: p (ignored)
7135 * uap User argument descriptor (see below)
7136 * retval (ignored)
7137 *
7138 * Indirect: uap->path Path of file to get status from
7139 * uap->ub User buffer (holds file status info)
7140 * uap->xsecurity ACL to get (extended security)
7141 * uap->xsecurity_size Size of ACL
7142 *
7143 * Returns: 0 Success
7144 * !0 errno value
7145 *
7146 */
7147 int
lstat_extended(__unused proc_t p,struct lstat_extended_args * uap,__unused int32_t * retval)7148 lstat_extended(__unused proc_t p, struct lstat_extended_args *uap, __unused int32_t *retval)
7149 {
7150 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7151 uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
7152 AT_SYMLINK_NOFOLLOW);
7153 }
7154
7155 /*
7156 * Get file status; this version does not follow links.
7157 */
7158 int
lstat(__unused proc_t p,struct lstat_args * uap,__unused int32_t * retval)7159 lstat(__unused proc_t p, struct lstat_args *uap, __unused int32_t *retval)
7160 {
7161 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7162 0, 0, 0, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW);
7163 }
7164
7165 int
lstat64(__unused proc_t p,struct lstat64_args * uap,__unused int32_t * retval)7166 lstat64(__unused proc_t p, struct lstat64_args *uap, __unused int32_t *retval)
7167 {
7168 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7169 0, 0, 1, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW);
7170 }
7171
7172 /*
7173 * lstat64_extended: Get file status; can handle large inode numbers; does not
7174 * follow links; with extended security (ACL).
7175 *
7176 * Parameters: p (ignored)
7177 * uap User argument descriptor (see below)
7178 * retval (ignored)
7179 *
7180 * Indirect: uap->path Path of file to get status from
7181 * uap->ub User buffer (holds file status info)
7182 * uap->xsecurity ACL to get (extended security)
7183 * uap->xsecurity_size Size of ACL
7184 *
7185 * Returns: 0 Success
7186 * !0 errno value
7187 *
7188 */
7189 int
lstat64_extended(__unused proc_t p,struct lstat64_extended_args * uap,__unused int32_t * retval)7190 lstat64_extended(__unused proc_t p, struct lstat64_extended_args *uap, __unused int32_t *retval)
7191 {
7192 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7193 uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
7194 AT_SYMLINK_NOFOLLOW);
7195 }
7196
7197 int
fstatat(__unused proc_t p,struct fstatat_args * uap,__unused int32_t * retval)7198 fstatat(__unused proc_t p, struct fstatat_args *uap, __unused int32_t *retval)
7199 {
7200 if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_REALDEV | AT_FDONLY | AT_SYMLINK_NOFOLLOW_ANY)) {
7201 return EINVAL;
7202 }
7203
7204 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7205 0, 0, 0, UIO_USERSPACE, uap->fd, uap->flag);
7206 }
7207
7208 int
fstatat64(__unused proc_t p,struct fstatat64_args * uap,__unused int32_t * retval)7209 fstatat64(__unused proc_t p, struct fstatat64_args *uap,
7210 __unused int32_t *retval)
7211 {
7212 if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_REALDEV | AT_FDONLY | AT_SYMLINK_NOFOLLOW_ANY)) {
7213 return EINVAL;
7214 }
7215
7216 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7217 0, 0, 1, UIO_USERSPACE, uap->fd, uap->flag);
7218 }
7219
7220 /*
7221 * Get configurable pathname variables.
7222 *
7223 * Returns: 0 Success
7224 * namei:???
7225 * vn_pathconf:???
7226 *
7227 * Notes: Global implementation constants are intended to be
7228 * implemented in this function directly; all other constants
7229 * are per-FS implementation, and therefore must be handled in
7230 * each respective FS, instead.
7231 *
7232 * XXX We implement some things globally right now that should actually be
7233 * XXX per-FS; we will need to deal with this at some point.
7234 */
7235 /* ARGSUSED */
7236 int
pathconf(__unused proc_t p,struct pathconf_args * uap,int32_t * retval)7237 pathconf(__unused proc_t p, struct pathconf_args *uap, int32_t *retval)
7238 {
7239 int error;
7240 struct nameidata nd;
7241 vfs_context_t ctx = vfs_context_current();
7242
7243 NDINIT(&nd, LOOKUP, OP_PATHCONF, FOLLOW | AUDITVNPATH1,
7244 UIO_USERSPACE, uap->path, ctx);
7245 error = namei(&nd);
7246 if (error) {
7247 return error;
7248 }
7249
7250 error = vn_pathconf(nd.ni_vp, uap->name, retval, ctx);
7251
7252 vnode_put(nd.ni_vp);
7253 nameidone(&nd);
7254 return error;
7255 }
7256
7257 /*
7258 * Return target name of a symbolic link.
7259 */
7260 /* ARGSUSED */
7261 static int
readlinkat_internal(vfs_context_t ctx,int fd,vnode_t lnk_vp,user_addr_t path,enum uio_seg seg,user_addr_t buf,size_t bufsize,enum uio_seg bufseg,int * retval)7262 readlinkat_internal(vfs_context_t ctx, int fd, vnode_t lnk_vp, user_addr_t path,
7263 enum uio_seg seg, user_addr_t buf, size_t bufsize, enum uio_seg bufseg,
7264 int *retval)
7265 {
7266 vnode_t vp;
7267 uio_t auio;
7268 int error;
7269 struct nameidata nd;
7270 UIO_STACKBUF(uio_buf, 1);
7271 bool put_vnode;
7272
7273 if (bufsize > INT32_MAX) {
7274 return EINVAL;
7275 }
7276
7277 if (lnk_vp) {
7278 vp = lnk_vp;
7279 put_vnode = false;
7280 } else {
7281 NDINIT(&nd, LOOKUP, OP_READLINK, NOFOLLOW | AUDITVNPATH1,
7282 seg, path, ctx);
7283
7284 error = nameiat(&nd, fd);
7285 if (error) {
7286 return error;
7287 }
7288 vp = nd.ni_vp;
7289 put_vnode = true;
7290 nameidone(&nd);
7291 }
7292
7293 auio = uio_createwithbuffer(1, 0, bufseg, UIO_READ,
7294 &uio_buf[0], sizeof(uio_buf));
7295 uio_addiov(auio, buf, bufsize);
7296 if (vp->v_type != VLNK) {
7297 error = EINVAL;
7298 } else {
7299 #if CONFIG_MACF
7300 error = mac_vnode_check_readlink(ctx, vp);
7301 #endif
7302 if (error == 0) {
7303 error = vnode_authorize(vp, NULL, KAUTH_VNODE_READ_DATA,
7304 ctx);
7305 }
7306 if (error == 0) {
7307 error = VNOP_READLINK(vp, auio, ctx);
7308 }
7309 }
7310
7311 if (put_vnode) {
7312 vnode_put(vp);
7313 }
7314
7315 *retval = (int)(bufsize - uio_resid(auio));
7316 return error;
7317 }
7318
7319 int
freadlink(proc_t p,struct freadlink_args * uap,int32_t * retval)7320 freadlink(proc_t p, struct freadlink_args *uap, int32_t *retval)
7321 {
7322 enum uio_seg procseg;
7323 vnode_t vp;
7324 int error;
7325
7326 procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
7327
7328 AUDIT_ARG(fd, uap->fd);
7329
7330 if ((error = file_vnode(uap->fd, &vp))) {
7331 return error;
7332 }
7333 if ((error = vnode_getwithref(vp))) {
7334 file_drop(uap->fd);
7335 return error;
7336 }
7337
7338 error = readlinkat_internal(vfs_context_current(), -1,
7339 vp, 0, procseg, CAST_USER_ADDR_T(uap->buf),
7340 uap->bufsize, procseg, retval);
7341
7342 vnode_put(vp);
7343 file_drop(uap->fd);
7344 return error;
7345 }
7346
7347 int
readlink(proc_t p,struct readlink_args * uap,int32_t * retval)7348 readlink(proc_t p, struct readlink_args *uap, int32_t *retval)
7349 {
7350 enum uio_seg procseg;
7351
7352 procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
7353 return readlinkat_internal(vfs_context_current(), AT_FDCWD, NULL,
7354 CAST_USER_ADDR_T(uap->path), procseg, CAST_USER_ADDR_T(uap->buf),
7355 uap->count, procseg, retval);
7356 }
7357
7358 int
readlinkat(proc_t p,struct readlinkat_args * uap,int32_t * retval)7359 readlinkat(proc_t p, struct readlinkat_args *uap, int32_t *retval)
7360 {
7361 enum uio_seg procseg;
7362
7363 procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
7364 return readlinkat_internal(vfs_context_current(), uap->fd, NULL,
7365 CAST_USER_ADDR_T(uap->path), procseg, uap->buf, uap->bufsize, procseg,
7366 retval);
7367 }
7368
7369 /*
7370 * Change file flags, the deep inner layer.
7371 */
7372 static int
chflags0(vnode_t vp,struct vnode_attr * va,int (* setattr)(vnode_t,void *,vfs_context_t),void * arg,vfs_context_t ctx)7373 chflags0(vnode_t vp, struct vnode_attr *va,
7374 int (*setattr)(vnode_t, void *, vfs_context_t),
7375 void *arg, vfs_context_t ctx)
7376 {
7377 kauth_action_t action = 0;
7378 int error;
7379
7380 #if CONFIG_MACF
7381 error = mac_vnode_check_setflags(ctx, vp, va->va_flags);
7382 if (error) {
7383 goto out;
7384 }
7385 #endif
7386
7387 /* request authorisation, disregard immutability */
7388 if ((error = vnode_authattr(vp, va, &action, ctx)) != 0) {
7389 goto out;
7390 }
7391 /*
7392 * Request that the auth layer disregard those file flags it's allowed to when
7393 * authorizing this operation; we need to do this in order to be able to
7394 * clear immutable flags.
7395 */
7396 if (action && ((error = vnode_authorize(vp, NULL, action | KAUTH_VNODE_NOIMMUTABLE, ctx)) != 0)) {
7397 goto out;
7398 }
7399 error = (*setattr)(vp, arg, ctx);
7400
7401 #if CONFIG_MACF
7402 if (error == 0) {
7403 mac_vnode_notify_setflags(ctx, vp, va->va_flags);
7404 }
7405 #endif
7406
7407 out:
7408 return error;
7409 }
7410
7411 /*
7412 * Change file flags.
7413 *
7414 * NOTE: this will vnode_put() `vp'
7415 */
7416 static int
chflags1(vnode_t vp,int flags,vfs_context_t ctx)7417 chflags1(vnode_t vp, int flags, vfs_context_t ctx)
7418 {
7419 struct vnode_attr va;
7420 int error;
7421
7422 VATTR_INIT(&va);
7423 VATTR_SET(&va, va_flags, flags);
7424
7425 error = chflags0(vp, &va, (void *)vnode_setattr, &va, ctx);
7426 vnode_put(vp);
7427
7428 if ((error == 0) && !VATTR_IS_SUPPORTED(&va, va_flags)) {
7429 error = ENOTSUP;
7430 }
7431
7432 return error;
7433 }
7434
7435 /*
7436 * Change flags of a file given a path name.
7437 */
7438 /* ARGSUSED */
7439 int
chflags(__unused proc_t p,struct chflags_args * uap,__unused int32_t * retval)7440 chflags(__unused proc_t p, struct chflags_args *uap, __unused int32_t *retval)
7441 {
7442 vnode_t vp;
7443 vfs_context_t ctx = vfs_context_current();
7444 int error;
7445 struct nameidata nd;
7446 uint32_t wantparent = 0;
7447
7448 #if CONFIG_FILE_LEASES
7449 wantparent = WANTPARENT;
7450 #endif
7451
7452 AUDIT_ARG(fflags, uap->flags);
7453 NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1 | wantparent,
7454 UIO_USERSPACE, uap->path, ctx);
7455 error = namei(&nd);
7456 if (error) {
7457 return error;
7458 }
7459 vp = nd.ni_vp;
7460
7461 #if CONFIG_FILE_LEASES
7462 vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
7463 vnode_put(nd.ni_dvp);
7464 #endif
7465
7466 nameidone(&nd);
7467
7468 /* we don't vnode_put() here because chflags1 does internally */
7469 error = chflags1(vp, uap->flags, ctx);
7470
7471 return error;
7472 }
7473
7474 /*
7475 * Change flags of a file given a file descriptor.
7476 */
7477 /* ARGSUSED */
7478 int
fchflags(__unused proc_t p,struct fchflags_args * uap,__unused int32_t * retval)7479 fchflags(__unused proc_t p, struct fchflags_args *uap, __unused int32_t *retval)
7480 {
7481 vnode_t vp;
7482 int error;
7483
7484 AUDIT_ARG(fd, uap->fd);
7485 AUDIT_ARG(fflags, uap->flags);
7486 if ((error = file_vnode(uap->fd, &vp))) {
7487 return error;
7488 }
7489
7490 if ((error = vnode_getwithref(vp))) {
7491 file_drop(uap->fd);
7492 return error;
7493 }
7494
7495 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
7496
7497 #if CONFIG_FILE_LEASES
7498 vnode_breakdirlease(vp, true, O_WRONLY);
7499 #endif
7500
7501 /* we don't vnode_put() here because chflags1 does internally */
7502 error = chflags1(vp, uap->flags, vfs_context_current());
7503
7504 file_drop(uap->fd);
7505 return error;
7506 }
7507
7508 /*
7509 * Change security information on a filesystem object.
7510 *
7511 * Returns: 0 Success
7512 * EPERM Operation not permitted
7513 * vnode_authattr:??? [anything vnode_authattr can return]
7514 * vnode_authorize:??? [anything vnode_authorize can return]
7515 * vnode_setattr:??? [anything vnode_setattr can return]
7516 *
7517 * Notes: If vnode_authattr or vnode_authorize return EACCES, it will be
7518 * translated to EPERM before being returned.
7519 */
static int
chmod_vnode(vfs_context_t ctx, vnode_t vp, struct vnode_attr *vap)
{
	kauth_action_t action;
	int error;

	AUDIT_ARG(mode, vap->va_mode);
	/* XXX audit new args */

#if NAMEDSTREAMS
	/* chmod calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		return EPERM;
	}
#endif

#if CONFIG_MACF
	/* MAC checks run first: mode, then ownership, then ACL */
	if (VATTR_IS_ACTIVE(vap, va_mode) &&
	    (error = mac_vnode_check_setmode(ctx, vp, (mode_t)vap->va_mode)) != 0) {
		return error;
	}

	if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid)) {
		if ((error = mac_vnode_check_setowner(ctx, vp,
		    VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : -1,
		    VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : -1))) {
			return error;
		}
	}

	if (VATTR_IS_ACTIVE(vap, va_acl) &&
	    (error = mac_vnode_check_setacl(ctx, vp, vap->va_acl))) {
		return error;
	}
#endif

	/* make sure that the caller is allowed to set this security information */
	if (((error = vnode_authattr(vp, vap, &action, ctx)) != 0) ||
	    ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		/* EACCES from auth is reported to callers as EPERM (see header) */
		if (error == EACCES) {
			error = EPERM;
		}
		return error;
	}

	if ((error = vnode_setattr(vp, vap, ctx)) != 0) {
		return error;
	}

#if CONFIG_MACF
	/* notify MAC modules only after the attribute change succeeded */
	if (VATTR_IS_ACTIVE(vap, va_mode)) {
		mac_vnode_notify_setmode(ctx, vp, (mode_t)vap->va_mode);
	}

	if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid)) {
		mac_vnode_notify_setowner(ctx, vp,
		    VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : -1,
		    VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : -1);
	}

	if (VATTR_IS_ACTIVE(vap, va_acl)) {
		mac_vnode_notify_setacl(ctx, vp, vap->va_acl);
	}
#endif

	return error;
}
7587
7588
7589 /*
7590 * Change mode of a file given a path name.
7591 *
7592 * Returns: 0 Success
7593 * namei:??? [anything namei can return]
7594 * chmod_vnode:??? [anything chmod_vnode can return]
7595 */
7596 static int
chmodat(vfs_context_t ctx,user_addr_t path,struct vnode_attr * vap,int fd,int flag,enum uio_seg segflg)7597 chmodat(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap,
7598 int fd, int flag, enum uio_seg segflg)
7599 {
7600 struct nameidata nd;
7601 int follow, error;
7602 uint32_t wantparent = 0;
7603
7604 #if CONFIG_FILE_LEASES
7605 wantparent = WANTPARENT;
7606 #endif
7607
7608 follow = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) ? NOFOLLOW : FOLLOW;
7609 NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1 | wantparent,
7610 segflg, path, ctx);
7611 if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
7612 nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
7613 }
7614 if ((error = nameiat(&nd, fd))) {
7615 return error;
7616 }
7617
7618 #if CONFIG_FILE_LEASES
7619 vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
7620 vnode_put(nd.ni_dvp);
7621 #endif
7622
7623 error = chmod_vnode(ctx, nd.ni_vp, vap);
7624 vnode_put(nd.ni_vp);
7625 nameidone(&nd);
7626 return error;
7627 }
7628
7629 static int
chmod_extended_init(struct vnode_attr * pva,kauth_filesec_t * pxsecdst,int mode,uid_t uid,gid_t gid,user_addr_t xsecurity)7630 chmod_extended_init(struct vnode_attr *pva, kauth_filesec_t *pxsecdst, int mode, uid_t uid,
7631 gid_t gid, user_addr_t xsecurity)
7632 {
7633 int error;
7634
7635 VATTR_INIT(pva);
7636
7637 if (mode != -1) {
7638 VATTR_SET(pva, va_mode, mode & ALLPERMS);
7639 } else {
7640 pva->va_mode = 0;
7641 }
7642
7643 if (uid != KAUTH_UID_NONE) {
7644 VATTR_SET(pva, va_uid, uid);
7645 }
7646
7647 if (gid != KAUTH_GID_NONE) {
7648 VATTR_SET(pva, va_gid, gid);
7649 }
7650
7651 *pxsecdst = NULL;
7652 switch (xsecurity) {
7653 case USER_ADDR_NULL:
7654 break;
7655
7656 case CAST_USER_ADDR_T((void *)1): /* _FILESEC_REMOVE_ACL */
7657 VATTR_SET(pva, va_acl, NULL);
7658 break;
7659
7660 default:
7661 if ((error = kauth_copyinfilesec(xsecurity, pxsecdst)) != 0) {
7662 return error;
7663 }
7664
7665 VATTR_SET(pva, va_acl, &(*pxsecdst)->fsec_acl);
7666 pva->va_vaflags |= VA_FILESEC_ACL;
7667 KAUTH_DEBUG("CHMOD - setting ACL with %d entries", pva->va_acl->acl_entrycount);
7668 break;
7669 }
7670
7671 return 0;
7672 }
7673
7674 /*
7675 * chmod_extended: Change the mode of a file given a path name; with extended
7676 * argument list (including extended security (ACL)).
7677 *
7678 * Parameters: p Process requesting the open
7679 * uap User argument descriptor (see below)
7680 * retval (ignored)
7681 *
7682 * Indirect: uap->path Path to object (same as 'chmod')
7683 * uap->uid UID to set
7684 * uap->gid GID to set
7685 * uap->mode File mode to set (same as 'chmod')
7686 * uap->xsecurity ACL to set (or delete)
7687 *
7688 * Returns: 0 Success
7689 * !0 errno value
7690 *
7691 * Notes: The kauth_filesec_t in 'va', if any, is in host byte order.
7692 *
7693 * XXX: We should enummerate the possible errno values here, and where
7694 * in the code they originated.
7695 */
7696 int
chmod_extended(__unused proc_t p,struct chmod_extended_args * uap,__unused int32_t * retval)7697 chmod_extended(__unused proc_t p, struct chmod_extended_args *uap, __unused int32_t *retval)
7698 {
7699 int error;
7700 struct vnode_attr va;
7701 kauth_filesec_t xsecdst = NULL;
7702
7703 AUDIT_ARG(owner, uap->uid, uap->gid);
7704
7705 error = chmod_extended_init(&va, &xsecdst, uap->mode, uap->uid,
7706 uap->gid, uap->xsecurity);
7707
7708 if (error) {
7709 return error;
7710 }
7711
7712 error = chmodat(vfs_context_current(), uap->path, &va, AT_FDCWD, 0,
7713 UIO_USERSPACE);
7714
7715 if (xsecdst != NULL) {
7716 kauth_filesec_free(xsecdst);
7717 }
7718 return error;
7719 }
7720
7721 /*
7722 * Returns: 0 Success
7723 * chmodat:??? [anything chmodat can return]
7724 */
7725 static int
fchmodat_internal(vfs_context_t ctx,user_addr_t path,int mode,int fd,int flag,enum uio_seg segflg)7726 fchmodat_internal(vfs_context_t ctx, user_addr_t path, int mode, int fd,
7727 int flag, enum uio_seg segflg)
7728 {
7729 struct vnode_attr va;
7730
7731 VATTR_INIT(&va);
7732 VATTR_SET(&va, va_mode, mode & ALLPERMS);
7733
7734 return chmodat(ctx, path, &va, fd, flag, segflg);
7735 }
7736
7737 int
chmod(__unused proc_t p,struct chmod_args * uap,__unused int32_t * retval)7738 chmod(__unused proc_t p, struct chmod_args *uap, __unused int32_t *retval)
7739 {
7740 return fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
7741 AT_FDCWD, 0, UIO_USERSPACE);
7742 }
7743
7744 int
fchmodat(__unused proc_t p,struct fchmodat_args * uap,__unused int32_t * retval)7745 fchmodat(__unused proc_t p, struct fchmodat_args *uap, __unused int32_t *retval)
7746 {
7747 if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) {
7748 return EINVAL;
7749 }
7750
7751 return fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
7752 uap->fd, uap->flag, UIO_USERSPACE);
7753 }
7754
7755 /*
7756 * Change mode of a file given a file descriptor.
7757 */
/*
 * Change mode of a file given a file descriptor.
 *
 * Translates 'fd' to a vnode, takes an iocount on it, and applies the
 * attributes in 'vap' via chmod_vnode().  Both the fd use-count and the
 * vnode iocount are released before returning.
 *
 * Returns:	0			Success
 *		file_vnode:???		fd invalid or does not refer to a vnode
 *		vnode_getwithref:???	could not get an iocount on the vnode
 *		chmod_vnode:???		[anything chmod_vnode can return]
 */
static int
fchmod1(__unused proc_t p, int fd, struct vnode_attr *vap)
{
	vnode_t vp;
	int error;

	AUDIT_ARG(fd, fd);

	/* Takes a use-count reference on the fd; dropped on all paths below. */
	if ((error = file_vnode(fd, &vp)) != 0) {
		return error;
	}
	/* Take an iocount so the vnode cannot be reclaimed while we use it. */
	if ((error = vnode_getwithref(vp)) != 0) {
		file_drop(fd);
		return error;
	}
	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

#if CONFIG_FILE_LEASES
	/* Break any directory lease before modifying the file's mode. */
	vnode_breakdirlease(vp, true, O_WRONLY);
#endif

	error = chmod_vnode(vfs_context_current(), vp, vap);
	(void)vnode_put(vp);
	file_drop(fd);

	return error;
}
7785
7786 /*
7787 * fchmod_extended: Change mode of a file given a file descriptor; with
7788 * extended argument list (including extended security (ACL)).
7789 *
7790 * Parameters: p Process requesting to change file mode
7791 * uap User argument descriptor (see below)
7792 * retval (ignored)
7793 *
7794 * Indirect: uap->mode File mode to set (same as 'chmod')
7795 * uap->uid UID to set
7796 * uap->gid GID to set
7797 * uap->xsecurity ACL to set (or delete)
7798 * uap->fd File descriptor of file to change mode
7799 *
7800 * Returns: 0 Success
7801 * !0 errno value
7802 *
7803 */
7804 int
fchmod_extended(proc_t p,struct fchmod_extended_args * uap,__unused int32_t * retval)7805 fchmod_extended(proc_t p, struct fchmod_extended_args *uap, __unused int32_t *retval)
7806 {
7807 int error;
7808 struct vnode_attr va;
7809 kauth_filesec_t xsecdst = NULL;
7810
7811 AUDIT_ARG(owner, uap->uid, uap->gid);
7812
7813 error = chmod_extended_init(&va, &xsecdst, uap->mode, uap->uid,
7814 uap->gid, uap->xsecurity);
7815
7816 if (error) {
7817 return error;
7818 }
7819
7820 error = fchmod1(p, uap->fd, &va);
7821
7822 if (xsecdst != NULL) {
7823 kauth_filesec_free(xsecdst);
7824 }
7825 return error;
7826 }
7827
7828 int
fchmod(proc_t p,struct fchmod_args * uap,__unused int32_t * retval)7829 fchmod(proc_t p, struct fchmod_args *uap, __unused int32_t *retval)
7830 {
7831 struct vnode_attr va;
7832
7833 VATTR_INIT(&va);
7834 VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
7835
7836 return fchmod1(p, uap->fd, &va);
7837 }
7838
/*
 * vn_chown_internal
 *
 * Change the ownership (uid and/or gid) of the vnode 'vp'.  Passing
 * VNOVAL for either field leaves that field unchanged.  The caller is
 * expected to hold an iocount on 'vp'.
 *
 * Returns:	0			Success
 *		EPERM			vp is a named stream (resource fork),
 *					or authorization denied (EACCES is
 *					translated to EPERM)
 *		vnode_authattr:???
 *		vnode_setattr:???
 */
static int
vn_chown_internal(__unused vfs_context_t ctx, vnode_t vp, uid_t uid, gid_t gid)
{
	struct vnode_attr va;
	kauth_action_t action;
	int error;

	/* Only request changes for the fields the caller actually supplied. */
	VATTR_INIT(&va);
	if (uid != (uid_t)VNOVAL) {
		VATTR_SET(&va, va_uid, uid);
	}
	if (gid != (gid_t)VNOVAL) {
		VATTR_SET(&va, va_gid, gid);
	}

#if NAMEDSTREAMS
	/* chown calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		error = EPERM;
		goto out;
	}
#endif

#if CONFIG_MACF
	/* MAC policy check runs before any authorization work. */
	error = mac_vnode_check_setowner(ctx, vp, uid, gid);
	if (error) {
		goto out;
	}
#endif

	/* preflight and authorize attribute changes */
	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
		goto out;
	}
	if (action && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		/*
		 * EACCES is only allowed from namei(); permissions failure should
		 * return EPERM, so we need to translate the error code.
		 */
		if (error == EACCES) {
			error = EPERM;
		}

		goto out;
	}

#if CONFIG_FILE_LEASES
	/* Break any directory lease before mutating ownership. */
	vnode_breakdirlease(vp, true, O_WRONLY);
#endif

	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	/* Notify MAC modules only after the change actually succeeded. */
	if (error == 0) {
		mac_vnode_notify_setowner(ctx, vp, uid, gid);
	}
#endif

out:
	return error;
}
7900
7901 /*
7902 * Set ownership given a path name.
7903 */
7904 /* ARGSUSED */
/*
 * Set ownership given a path name.
 *
 * Looks up 'path' (relative to 'fd' for relative paths) and applies
 * uid/gid via vn_chown_internal().  AT_SYMLINK_NOFOLLOW and
 * AT_SYMLINK_NOFOLLOW_ANY in 'flag' suppress symlink traversal.
 *
 * Returns:	0			Success
 *		nameiat:???		lookup failure
 *		vn_chown_internal:???	[anything it can return]
 */
/* ARGSUSED */
static int
fchownat_internal(vfs_context_t ctx, int fd, user_addr_t path, uid_t uid,
    gid_t gid, int flag, enum uio_seg segflg)
{
	vnode_t vp;
	int error;
	struct nameidata nd;
	int follow;

	AUDIT_ARG(owner, uid, gid);

	/* Either "no follow" flag disables following the terminal symlink. */
	follow = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) ? NOFOLLOW : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1, segflg, path, ctx);
	if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}

	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}

	vp = nd.ni_vp;
	error = vn_chown_internal(ctx, vp, uid, gid);

	/* nameidone before the vnode_put that releases the lookup's iocount */
	nameidone(&nd);
	vnode_put(vp);
	return error;
}
7934
7935 int
chown(__unused proc_t p,struct chown_args * uap,__unused int32_t * retval)7936 chown(__unused proc_t p, struct chown_args *uap, __unused int32_t *retval)
7937 {
7938 return fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
7939 uap->uid, uap->gid, 0, UIO_USERSPACE);
7940 }
7941
7942 int
lchown(__unused proc_t p,struct lchown_args * uap,__unused int32_t * retval)7943 lchown(__unused proc_t p, struct lchown_args *uap, __unused int32_t *retval)
7944 {
7945 return fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
7946 uap->owner, uap->group, AT_SYMLINK_NOFOLLOW, UIO_USERSPACE);
7947 }
7948
7949 int
fchownat(__unused proc_t p,struct fchownat_args * uap,__unused int32_t * retval)7950 fchownat(__unused proc_t p, struct fchownat_args *uap, __unused int32_t *retval)
7951 {
7952 if (uap->flag & ~AT_SYMLINK_NOFOLLOW) {
7953 return EINVAL;
7954 }
7955
7956 return fchownat_internal(vfs_context_current(), uap->fd, uap->path,
7957 uap->uid, uap->gid, uap->flag, UIO_USERSPACE);
7958 }
7959
7960 /*
7961 * Set ownership given a file descriptor.
7962 */
7963 /* ARGSUSED */
/*
 * Set ownership given a file descriptor.
 *
 * Translates uap->fd to a vnode, takes an iocount, and applies
 * uid/gid via vn_chown_internal().  fd use-count and vnode iocount
 * are released on all paths.
 */
/* ARGSUSED */
int
fchown(__unused proc_t p, struct fchown_args *uap, __unused int32_t *retval)
{
	vfs_context_t ctx = vfs_context_current();
	vnode_t vp;
	int error;

	AUDIT_ARG(owner, uap->uid, uap->gid);
	AUDIT_ARG(fd, uap->fd);

	/* Takes a use-count on the fd; dropped on all paths below. */
	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}

	/* Hold an iocount so the vnode cannot be reclaimed under us. */
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}
	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	error = vn_chown_internal(ctx, vp, uap->uid, uap->gid);

	(void)vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
7990
/*
 * getutimes
 *
 * Fetch the two timevals at user address 'usrtvp' and convert them to
 * timespecs: tsp[0] is the access time, tsp[1] the modification time.
 * If 'usrtvp' is USER_ADDR_NULL, both entries are set to the current
 * time instead.  The copyin layout depends on the calling process's
 * ABI (user64_timeval vs user32_timeval).
 *
 * Returns:	0		Success
 *		copyin:???	bad user address
 */
static int
getutimes(user_addr_t usrtvp, struct timespec *tsp)
{
	int error;

	if (usrtvp == USER_ADDR_NULL) {
		struct timeval old_tv;
		/* XXX Y2038 bug because of microtime argument */
		microtime(&old_tv);
		TIMEVAL_TO_TIMESPEC(&old_tv, &tsp[0]);
		tsp[1] = tsp[0];
	} else {
		if (IS_64BIT_PROCESS(current_proc())) {
			/* 64-bit caller: user timevals have 64-bit fields. */
			struct user64_timeval tv[2];
			error = copyin(usrtvp, (void *)tv, sizeof(tv));
			if (error) {
				return error;
			}
			TIMEVAL64_TO_TIMESPEC(&tv[0], &tsp[0]);
			TIMEVAL64_TO_TIMESPEC(&tv[1], &tsp[1]);
		} else {
			/* 32-bit caller: user timevals have 32-bit fields. */
			struct user32_timeval tv[2];
			error = copyin(usrtvp, (void *)tv, sizeof(tv));
			if (error) {
				return error;
			}
			TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]);
			TIMEVAL_TO_TIMESPEC(&tv[1], &tsp[1]);
		}
	}
	return 0;
}
8023
/*
 * setutimes
 *
 * Set the access (ts[0]) and modification (ts[1]) times on 'vp'.
 * 'nullflag' is non-zero when the caller passed a NULL times pointer
 * (i.e. "set to now"); in that case VA_UTIMES_NULL is set so the
 * filesystem can apply the owner/write-permission rule for utimes(NULL),
 * and EACCES from authorization is NOT translated to EPERM.
 *
 * The caller must hold an iocount on 'vp'.
 *
 * Returns:	0			Success
 *		EPERM			vp is a named stream, or (for an
 *					explicit-times call) access denied
 *		vnode_authattr:???
 *		vnode_setattr:???
 */
static int
setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts,
    int nullflag)
{
	int error;
	struct vnode_attr va;
	kauth_action_t action;

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	VATTR_INIT(&va);
	VATTR_SET(&va, va_access_time, ts[0]);
	VATTR_SET(&va, va_modify_time, ts[1]);
	if (nullflag) {
		va.va_vaflags |= VA_UTIMES_NULL;
	}

#if NAMEDSTREAMS
	/* utimes calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		error = EPERM;
		goto out;
	}
#endif

#if CONFIG_MACF
	error = mac_vnode_check_setutimes(ctx, vp, ts[0], ts[1]);
	if (error) {
		goto out;
	}
#endif
	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
		/* explicit-times failures report EPERM rather than EACCES */
		if (!nullflag && error == EACCES) {
			error = EPERM;
		}
		goto out;
	}

	/* since we may not need to auth anything, check here */
	if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		if (!nullflag && error == EACCES) {
			error = EPERM;
		}
		goto out;
	}
	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	/* Notify MAC modules only on success. */
	if (error == 0) {
		mac_vnode_notify_setutimes(ctx, vp, ts[0], ts[1]);
	}
#endif

out:
	return error;
}
8080
8081 /*
8082 * Set the access and modification times of a file.
8083 */
8084 /* ARGSUSED */
/*
 * Set the access and modification times of a file, by path.
 *
 * Note: WANTPARENT is requested only when CONFIG_FILE_LEASES is
 * enabled, so nd.ni_dvp holds an iocount (and must be released at
 * 'out') only in that configuration.
 */
/* ARGSUSED */
int
utimes(__unused proc_t p, struct utimes_args *uap, __unused int32_t *retval)
{
	struct timespec ts[2];
	user_addr_t usrtvp;
	int error;
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();
	uint32_t wantparent = 0;

#if CONFIG_FILE_LEASES
	/* Need the parent so we can break any lease placed on it. */
	wantparent = WANTPARENT;
#endif

	/*
	 * AUDIT: Needed to change the order of operations to do the
	 * name lookup first because auditing wants the path.
	 */
	NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1 | wantparent,
	    UIO_USERSPACE, uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}

	/*
	 * Fetch the user-supplied time. If usrtvp is USER_ADDR_NULL, we fetch
	 * the current time instead.
	 */
	usrtvp = uap->tptr;
	if ((error = getutimes(usrtvp, ts)) != 0) {
		goto out;
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
#endif

	error = setutimes(ctx, nd.ni_vp, ts, usrtvp == USER_ADDR_NULL);

out:
#if CONFIG_FILE_LEASES
	/* Release the parent iocount taken via WANTPARENT. */
	vnode_put(nd.ni_dvp);
#endif
	nameidone(&nd);
	vnode_put(nd.ni_vp);
	return error;
}
8133
8134 /*
8135 * Set the access and modification times of a file.
8136 */
8137 /* ARGSUSED */
/*
 * Set the access and modification times of a file, by file descriptor.
 *
 * The user times are fetched before the fd is translated so a bad
 * 'tptr' fails without touching the file table.
 */
/* ARGSUSED */
int
futimes(__unused proc_t p, struct futimes_args *uap, __unused int32_t *retval)
{
	struct timespec ts[2];
	vnode_t vp;
	user_addr_t usrtvp;
	int error;

	AUDIT_ARG(fd, uap->fd);
	usrtvp = uap->tptr;
	if ((error = getutimes(usrtvp, ts)) != 0) {
		return error;
	}
	/* Takes a use-count on the fd; dropped on all paths below. */
	if ((error = file_vnode(uap->fd, &vp)) != 0) {
		return error;
	}
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

#if CONFIG_FILE_LEASES
	/* Break any directory lease before changing the file's times. */
	vnode_breakdirlease(vp, true, O_WRONLY);
#endif

	error = setutimes(vfs_context_current(), vp, ts, usrtvp == 0);

	vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
8169
8170 static int
truncate_validate_common(proc_t p,off_t length)8171 truncate_validate_common(proc_t p, off_t length)
8172 {
8173 rlim_t fsize_limit;
8174
8175 if (length < 0) {
8176 return EINVAL;
8177 }
8178
8179 fsize_limit = proc_limitgetcur(p, RLIMIT_FSIZE);
8180 if ((rlim_t)length > fsize_limit) {
8181 psignal(p, SIGXFSZ);
8182 return EFBIG;
8183 }
8184
8185 return 0;
8186 }
8187
/*
 * truncate_internal
 *
 * Set the data size of 'vp' to 'length'.  'need_auth' selects whether
 * vnode_authattr/vnode_authorize are run here: truncate(2) passes true,
 * while ftruncate(2) passes false because authorization was effectively
 * performed when the file was opened for writing.  'cred' is used only
 * for the MAC checks/notifications.
 *
 * The caller must hold an iocount on 'vp'.
 */
static int
truncate_internal(vnode_t vp, off_t length, kauth_cred_t cred,
    vfs_context_t ctx, boolean_t need_auth)
{
	struct vnode_attr va;
	kauth_action_t action;
	int error;

	VATTR_INIT(&va);
	VATTR_SET(&va, va_data_size, length);

#if CONFIG_MACF
	error = mac_vnode_check_truncate(ctx, cred, vp);
	if (error) {
		return error;
	}
#endif

	/*
	 * If we reached here from `ftruncate` then we already did an effective
	 * `vnode_authorize` upon open. We honour the result from then.
	 */
	if (need_auth) {
		if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
			return error;
		}

		if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
			return error;
		}
	}

#if CONFIG_FILE_LEASES
	/* Check if there is a lease placed on the parent directory. */
	vnode_breakdirlease(vp, true, O_WRONLY);

	/* Now check if there is a lease placed on the file itself. */
	(void)vnode_breaklease(vp, O_WRONLY, ctx);
#endif

	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	/* Notify MAC modules only on success. */
	if (error == 0) {
		mac_vnode_notify_truncate(ctx, cred, vp);
	}
#endif

	return error;
}
8238
8239 /*
8240 * Truncate a file given its path name.
8241 */
8242 /* ARGSUSED */
/*
 * Truncate a file given its path name.
 *
 * Full authorization is performed here (need_auth == true) since,
 * unlike ftruncate, there is no open-time check to rely on.
 */
/* ARGSUSED */
int
truncate(proc_t p, struct truncate_args *uap, __unused int32_t *retval)
{
	vfs_context_t ctx = vfs_context_current();
	vnode_t vp;
	int error;
	struct nameidata nd;

	/* Validate length against EINVAL/RLIMIT_FSIZE before the lookup. */
	if ((error = truncate_validate_common(p, uap->length))) {
		return error;
	}

	NDINIT(&nd, LOOKUP, OP_TRUNCATE, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);

	if ((error = namei(&nd))) {
		return error;
	}

	/* Keep the vnode iocount from the lookup; release nameidata state. */
	vp = nd.ni_vp;
	nameidone(&nd);

	error = truncate_internal(vp, uap->length, NOCRED, ctx, true);
	vnode_put(vp);

	return error;
}
8270
8271 /*
8272 * Truncate a file given a file descriptor.
8273 */
8274 /* ARGSUSED */
/*
 * Truncate a file given a file descriptor.
 *
 * Accepts vnode-backed descriptors and POSIX shared memory objects
 * (which are handed to pshm_truncate); all other descriptor types get
 * EINVAL.  The descriptor must be open for writing.  Authorization is
 * skipped in truncate_internal (need_auth == false) because it was
 * effectively performed at open time.
 */
/* ARGSUSED */
int
ftruncate(proc_t p, struct ftruncate_args *uap, int32_t *retval)
{
	vnode_t vp;
	struct fileproc *fp;
	int error;

	AUDIT_ARG(fd, uap->fd);

	/* Validate length against EINVAL/RLIMIT_FSIZE before fd lookup. */
	if ((error = truncate_validate_common(p, uap->length))) {
		return error;
	}

	/* Takes a reference on the fileproc; dropped at 'out'. */
	if ((error = fp_lookup(p, uap->fd, &fp, 0))) {
		return error;
	}

	switch (FILEGLOB_DTYPE(fp->fp_glob)) {
	case DTYPE_PSXSHM:
		/* POSIX shared memory has its own truncate path. */
		error = pshm_truncate(p, fp, uap->fd, uap->length, retval);
		goto out;
	case DTYPE_VNODE:
		break;
	default:
		error = EINVAL;
		goto out;
	}

	vp = (vnode_t)fp_get_data(fp);

	/* The descriptor must have been opened for writing. */
	if ((fp->fp_glob->fg_flag & FWRITE) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EINVAL;
		goto out;
	}

	if ((error = vnode_getwithref(vp)) != 0) {
		goto out;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	error = truncate_internal(vp, uap->length, fp->fp_glob->fg_cred,
	    vfs_context_current(), false);
	vnode_put(vp);

out:
	file_drop(uap->fd);
	return error;
}
8325
8326
8327 /*
8328 * Sync an open file with synchronized I/O _file_ integrity completion
8329 */
8330 /* ARGSUSED */
8331 int
fsync(proc_t p,struct fsync_args * uap,__unused int32_t * retval)8332 fsync(proc_t p, struct fsync_args *uap, __unused int32_t *retval)
8333 {
8334 __pthread_testcancel(1);
8335 return fsync_common(p, uap, MNT_WAIT);
8336 }
8337
8338
8339 /*
8340 * Sync an open file with synchronized I/O _file_ integrity completion
8341 *
8342 * Notes: This is a legacy support function that does not test for
8343 * thread cancellation points.
8344 */
8345 /* ARGSUSED */
8346 int
fsync_nocancel(proc_t p,struct fsync_nocancel_args * uap,__unused int32_t * retval)8347 fsync_nocancel(proc_t p, struct fsync_nocancel_args *uap, __unused int32_t *retval)
8348 {
8349 return fsync_common(p, (struct fsync_args *)uap, MNT_WAIT);
8350 }
8351
8352
8353 /*
8354 * Sync an open file with synchronized I/O _data_ integrity completion
8355 */
8356 /* ARGSUSED */
8357 int
fdatasync(proc_t p,struct fdatasync_args * uap,__unused int32_t * retval)8358 fdatasync(proc_t p, struct fdatasync_args *uap, __unused int32_t *retval)
8359 {
8360 __pthread_testcancel(1);
8361 return fsync_common(p, (struct fsync_args *)uap, MNT_DWAIT);
8362 }
8363
8364
8365 /*
8366 * fsync_common
8367 *
8368 * Common fsync code to support both synchronized I/O file integrity completion
8369 * (normal fsync) and synchronized I/O data integrity completion (fdatasync).
8370 *
8371 * If 'flags' is MNT_DWAIT, the caller is requesting data integrity, which
8372 * will only guarantee that the file data contents are retrievable. If
 * 'flags' is MNT_WAIT, the caller is requesting file integrity, which
 * requires that additional metadata unnecessary for retrieving the file
 * data contents, such as atime, mtime, ctime, etc., also be committed to
 * stable storage.
8377 *
8378 * Parameters: p The process
8379 * uap->fd The descriptor to synchronize
8380 * flags The data integrity flags
8381 *
8382 * Returns: int Success
8383 * fp_getfvp:EBADF Bad file descriptor
8384 * fp_getfvp:ENOTSUP fd does not refer to a vnode
8385 * VNOP_FSYNC:??? unspecified
8386 *
8387 * Notes: We use struct fsync_args because it is a short name, and all
8388 * caller argument structures are otherwise identical.
8389 */
static int
fsync_common(proc_t p, struct fsync_args *uap, int flags)
{
	vnode_t vp;
	struct fileproc *fp;
	vfs_context_t ctx = vfs_context_current();
	int error;

	AUDIT_ARG(fd, uap->fd);

	/* Takes a use-count on the fd; dropped on all paths below. */
	if ((error = fp_getfvp(p, uap->fd, &fp, &vp))) {
		return error;
	}
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	/* 'flags' is MNT_WAIT (file integrity) or MNT_DWAIT (data integrity). */
	error = VNOP_FSYNC(vp, flags, ctx);

#if NAMEDRSRCFORK
	/* Sync resource fork shadow file if necessary. */
	if ((error == 0) &&
	    (vp->v_flag & VISNAMEDSTREAM) &&
	    (vp->v_parent != NULLVP) &&
	    vnode_isshadow(vp) &&
	    (fp->fp_glob->fg_flag & FWASWRITTEN)) {
		(void) vnode_flushnamedstream(vp->v_parent, vp, ctx);
	}
#endif

	(void)vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
8427
8428 /*
8429 * Duplicate files. Source must be a file, target must be a file or
8430 * must not exist.
8431 *
8432 * XXX Copyfile authorisation checking is woefully inadequate, and will not
8433 * perform inheritance correctly.
8434 */
8435 /* ARGSUSED */
/* ARGSUSED */
int
copyfile(__unused proc_t p, struct copyfile_args *uap, __unused int32_t *retval)
{
	vnode_t tvp, fvp, tdvp, sdvp;
	struct nameidata fromnd, tond;
	int error;
	vfs_context_t ctx = vfs_context_current();

	/* Check that the flags are valid. */
	if (uap->flags & ~CPF_MASK) {
		return EINVAL;
	}

	/* Look up the source; holds an iocount on fvp until 'out1'. */
	NDINIT(&fromnd, LOOKUP, OP_COPYFILE, AUDITVNPATH1,
	    UIO_USERSPACE, uap->from, ctx);
	if ((error = namei(&fromnd))) {
		return error;
	}
	fvp = fromnd.ni_vp;

	/*
	 * Look up the destination in CREATE mode; SAVESTART keeps
	 * tond.ni_startdir referenced so it can be released at 'out'.
	 */
	NDINIT(&tond, CREATE, OP_LINK,
	    LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART | AUDITVNPATH2 | CN_NBMOUNTLOOK,
	    UIO_USERSPACE, uap->to, ctx);
	if ((error = namei(&tond))) {
		goto out1;
	}
	tdvp = tond.ni_dvp;
	tvp = tond.ni_vp;

	/* Target exists: only proceed if the caller asked to overwrite. */
	if (tvp != NULL) {
		if (!(uap->flags & CPF_OVERWRITE)) {
			error = EEXIST;
			goto out;
		}
	}

	/* Directories may be neither source nor target. */
	if (fvp->v_type == VDIR || (tvp && tvp->v_type == VDIR)) {
		error = EISDIR;
		goto out;
	}

	if (fvp->v_type == VSOCK && fvp->v_tag != VT_FDESC) {
		error = EOPNOTSUPP;
		goto out;
	}

#if CONFIG_MACF
	if ((error = mac_vnode_check_copyfile(ctx, tdvp, tvp, fvp, &tond.ni_cnd, (mode_t)uap->mode, uap->flags)) != 0) {
		goto out;
	}
#endif /* CONFIG_MACF */

	/* Need: read source, delete existing target, add into target dir. */
	if ((error = vnode_authorize(fvp, NULL, KAUTH_VNODE_READ_DATA, ctx)) != 0) {
		goto out;
	}
	if (tvp) {
		if ((error = vnode_authorize(tvp, tdvp, KAUTH_VNODE_DELETE, ctx)) != 0) {
			goto out;
		}
	}
	if ((error = vnode_authorize(tdvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
		goto out;
	}

	/* Source may not be the destination's parent directory. */
	if (fvp == tdvp) {
		error = EINVAL;
	}
	/*
	 * If source is the same as the destination (that is the
	 * same inode number) then there is nothing to do.
	 * (fixed to have POSIX semantics - CSM 3/2/98)
	 */
	if (fvp == tvp) {
		error = -1;     /* sentinel: converted to success below */
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(tdvp, false, O_WRONLY);
#endif

	if (!error) {
		error = VNOP_COPYFILE(fvp, tdvp, tvp, &tond.ni_cnd, uap->mode, uap->flags, ctx);
	}
out:
	sdvp = tond.ni_startdir;
	/*
	 * nameidone has to happen before we vnode_put(tdvp)
	 * since it may need to release the fs_nodelock on the tdvp
	 */
	nameidone(&tond);

	if (tvp) {
		vnode_put(tvp);
	}
	vnode_put(tdvp);
	vnode_put(sdvp);
out1:
	vnode_put(fvp);

	nameidone(&fromnd);

	/* Same-vnode copy was marked with -1 above; report it as success. */
	if (error == -1) {
		return 0;
	}
	return error;
}
8542
8543 #define CLONE_SNAPSHOT_FALLBACKS_ENABLED 1
8544
8545 /*
8546 * Helper function for doing clones. The caller is expected to provide an
8547 * iocounted source vnode and release it.
8548 */
/*
 * Helper function for doing clones. The caller is expected to provide an
 * iocounted source vnode and release it.
 *
 * 'data_read_authorised' is true when the caller has already proven read
 * access to the source data (e.g. fclonefileat on an fd opened FREAD),
 * so KAUTH_VNODE_READ_DATA is dropped from the source authorization.
 * The destination is looked up relative to 'dst_dirfd' and must not
 * already exist; source and destination must be on the same mount.
 */
static int
clonefile_internal(vnode_t fvp, boolean_t data_read_authorised, int dst_dirfd,
    user_addr_t dst, uint32_t flags, vfs_context_t ctx)
{
	vnode_t tvp, tdvp;
	struct nameidata tond;
	int error;
	int follow;
	boolean_t free_src_acl;
	boolean_t attr_cleanup;
	enum vtype v_type;
	kauth_action_t action;
	struct componentname *cnp;
	uint32_t defaulted = 0;
	struct vnode_attr va;
	struct vnode_attr nva;
	uint32_t vnop_flags;

	/* Only regular files, symlinks and (non-root) directories clone. */
	v_type = vnode_vtype(fvp);
	switch (v_type) {
	case VLNK:
		/* FALLTHRU */
	case VREG:
		action = KAUTH_VNODE_ADD_FILE;
		break;
	case VDIR:
		/* Mount roots and mount points cannot be cloned. */
		if (vnode_isvroot(fvp) || vnode_ismount(fvp) ||
		    fvp->v_mountedhere) {
			return EINVAL;
		}
		action = KAUTH_VNODE_ADD_SUBDIRECTORY;
		break;
	default:
		return EINVAL;
	}

	AUDIT_ARG(fd2, dst_dirfd);
	AUDIT_ARG(value32, flags);

	/* Look up the destination; WANTPARENT gives us tdvp as well. */
	follow = (flags & CLONE_NOFOLLOW) ? NOFOLLOW : FOLLOW;
	NDINIT(&tond, CREATE, OP_LINK, follow | WANTPARENT | AUDITVNPATH2,
	    UIO_USERSPACE, dst, ctx);
	if ((error = nameiat(&tond, dst_dirfd))) {
		return error;
	}
	cnp = &tond.ni_cnd;
	tdvp = tond.ni_dvp;
	tvp = tond.ni_vp;

	free_src_acl = FALSE;
	attr_cleanup = FALSE;

	/* The destination must not exist. */
	if (tvp != NULL) {
		error = EEXIST;
		goto out;
	}

	/* Cloning is a same-filesystem operation. */
	if (vnode_mount(tdvp) != vnode_mount(fvp)) {
		error = EXDEV;
		goto out;
	}

#if CONFIG_MACF
	if ((error = mac_vnode_check_clone(ctx, tdvp, fvp, cnp))) {
		goto out;
	}
#endif
	/* Authorize adding the new entry to the destination directory. */
	if ((error = vnode_authorize(tdvp, NULL, action, ctx))) {
		goto out;
	}

	/* Authorize reading the source, minus any pre-authorized data read. */
	action = KAUTH_VNODE_GENERIC_READ_BITS;
	if (data_read_authorised) {
		action &= ~KAUTH_VNODE_READ_DATA;
	}
	if ((error = vnode_authorize(fvp, NULL, action, ctx))) {
		goto out;
	}

	/*
	 * certain attributes may need to be changed from the source, we ask for
	 * those here with the exception of source file's ACLs unless the CLONE_ACL
	 * flag is specified. By default, the clone file will inherit the target
	 * directory's ACLs unless the CLONE_ACL flag is specified, in which case
	 * it will inherit the source file's ACLs instead.
	 */
	VATTR_INIT(&va);
	VATTR_WANTED(&va, va_uid);
	VATTR_WANTED(&va, va_gid);
	VATTR_WANTED(&va, va_mode);
	VATTR_WANTED(&va, va_flags);
	if (flags & CLONE_ACL) {
		VATTR_WANTED(&va, va_acl);
	}

	if ((error = vnode_getattr(fvp, &va, ctx)) != 0) {
		goto out;
	}

	VATTR_INIT(&nva);
	VATTR_SET(&nva, va_type, v_type);
	if (VATTR_IS_SUPPORTED(&va, va_acl) && va.va_acl != NULL) {
		/* Source ACL is now owned by nva; freed at 'out'. */
		VATTR_SET(&nva, va_acl, va.va_acl);
		free_src_acl = TRUE;
	}

	/* Handle ACL inheritance, initialize vap. */
	if (v_type == VLNK) {
		error = vnode_authattr_new(tdvp, &nva, 0, ctx);
	} else {
		error = vn_attribute_prepare(tdvp, &nva, &defaulted, ctx);
		if (error) {
			goto out;
		}
		/* vn_attribute_prepare succeeded; must be paired with cleanup. */
		attr_cleanup = TRUE;
	}

	vnop_flags = VNODE_CLONEFILE_DEFAULT;
	/*
	 * We've got initial values for all security parameters,
	 * If we are superuser, then we can change owners to be the
	 * same as the source. Both superuser and the owner have default
	 * WRITE_SECURITY privileges so all other fields can be taken
	 * from source as well.
	 */
	if (!(flags & CLONE_NOOWNERCOPY) && vfs_context_issuser(ctx)) {
		if (VATTR_IS_SUPPORTED(&va, va_uid)) {
			VATTR_SET(&nva, va_uid, va.va_uid);
		}
		if (VATTR_IS_SUPPORTED(&va, va_gid)) {
			VATTR_SET(&nva, va_gid, va.va_gid);
		}
	} else {
		vnop_flags |= VNODE_CLONEFILE_NOOWNERCOPY;
	}

	if (VATTR_IS_SUPPORTED(&va, va_mode)) {
		VATTR_SET(&nva, va_mode, va.va_mode);
	}
	if (VATTR_IS_SUPPORTED(&va, va_flags)) {
		VATTR_SET(&nva, va_flags,
		    ((va.va_flags & ~(UF_DATAVAULT | SF_RESTRICTED)) | /* Turn off from source */
		    (nva.va_flags & (UF_DATAVAULT | SF_RESTRICTED))));
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(tdvp, false, O_WRONLY);
#endif

	error = VNOP_CLONEFILE(fvp, tdvp, &tvp, cnp, &nva, vnop_flags, ctx);

	if (!error && tvp) {
		int update_flags = 0;
#if CONFIG_FSE
		int fsevent;
#endif /* CONFIG_FSE */

		/*
		 * If some of the requested attributes weren't handled by the
		 * VNOP, use our fallback code.
		 */
		if (!VATTR_ALL_SUPPORTED(&nva)) {
			(void)vnode_setattr_fallback(tvp, &nva, ctx);
		}

#if CONFIG_MACF
		(void)vnode_label(vnode_mount(tvp), tdvp, tvp, cnp,
		    VNODE_LABEL_CREATE, ctx);
#endif

		// Make sure the name & parent pointers are hooked up
		if (tvp->v_name == NULL) {
			update_flags |= VNODE_UPDATE_NAME;
		}
		if (tvp->v_parent == NULLVP) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		if (update_flags) {
			(void)vnode_update_identity(tvp, tdvp, cnp->cn_nameptr,
			    cnp->cn_namelen, cnp->cn_hash, update_flags);
		}

#if CONFIG_FSE
		switch (vnode_vtype(tvp)) {
		case VLNK:
			/* FALLTHRU */
		case VREG:
			fsevent = FSE_CREATE_FILE;
			break;
		case VDIR:
			fsevent = FSE_CREATE_DIR;
			break;
		default:
			goto out;
		}

		if (need_fsevent(fsevent, tvp)) {
			/*
			 * The following is a sequence of three explicit events.
			 * A pair of FSE_CLONE events representing the source and destination
			 * followed by an FSE_CREATE_[FILE | DIR] for the destination.
			 * fseventsd may coalesce the destination clone and create events
			 * into a single event resulting in the following sequence for a client
			 * FSE_CLONE (src)
			 * FSE_CLONE | FSE_CREATE (dst)
			 */
			add_fsevent(FSE_CLONE, ctx, FSE_ARG_VNODE, fvp, FSE_ARG_VNODE, tvp,
			    FSE_ARG_DONE);
			add_fsevent(fsevent, ctx, FSE_ARG_VNODE, tvp,
			    FSE_ARG_DONE);
		}
#endif /* CONFIG_FSE */
	}

out:
	if (attr_cleanup) {
		vn_attribute_cleanup(&nva, defaulted);
	}
	if (free_src_acl && va.va_acl) {
		kauth_acl_free(va.va_acl);
	}
	nameidone(&tond);
	if (tvp) {
		vnode_put(tvp);
	}
	vnode_put(tdvp);
	return error;
}
8778
8779 /*
8780 * clone files or directories, target must not exist.
8781 */
8782 /* ARGSUSED */
/*
 * clone files or directories, target must not exist.
 *
 * Path-based variant: looks up the source (honouring CLONE_NOFOLLOW)
 * relative to src_dirfd, then defers to clonefile_internal with
 * data_read_authorised == FALSE so source read access is fully checked.
 */
/* ARGSUSED */
int
clonefileat(__unused proc_t p, struct clonefileat_args *uap,
    __unused int32_t *retval)
{
	vnode_t fvp;
	struct nameidata fromnd;
	int follow;
	int error;
	vfs_context_t ctx = vfs_context_current();

	/* Check that the flags are valid. */
	if (uap->flags & ~(CLONE_NOFOLLOW | CLONE_NOOWNERCOPY | CLONE_ACL)) {
		return EINVAL;
	}

	AUDIT_ARG(fd, uap->src_dirfd);

	follow = (uap->flags & CLONE_NOFOLLOW) ? NOFOLLOW : FOLLOW;
	NDINIT(&fromnd, LOOKUP, OP_COPYFILE, follow | AUDITVNPATH1,
	    UIO_USERSPACE, uap->src, ctx);
	if ((error = nameiat(&fromnd, uap->src_dirfd))) {
		return error;
	}

	/* Keep the iocount on fvp; release the rest of the lookup state. */
	fvp = fromnd.ni_vp;
	nameidone(&fromnd);

	error = clonefile_internal(fvp, FALSE, uap->dst_dirfd, uap->dst,
	    uap->flags, ctx);

	vnode_put(fvp);
	return error;
}
8816
/*
 * fd-based clone: the source is the open file uap->src_fd, which must
 * have been opened for reading.  Because FREAD was checked at open time,
 * clonefile_internal is told data read is already authorised (TRUE).
 */
int
fclonefileat(__unused proc_t p, struct fclonefileat_args *uap,
    __unused int32_t *retval)
{
	vnode_t fvp;
	struct fileproc *fp;
	int error;
	vfs_context_t ctx = vfs_context_current();

	/* Check that the flags are valid. */
	if (uap->flags & ~(CLONE_NOFOLLOW | CLONE_NOOWNERCOPY | CLONE_ACL)) {
		return EINVAL;
	}

	AUDIT_ARG(fd, uap->src_fd);
	/* Takes a use-count on the fd; dropped at 'out'. */
	error = fp_getfvp(p, uap->src_fd, &fp, &fvp);
	if (error) {
		return error;
	}

	/* Source descriptor must be open for reading. */
	if ((fp->fp_glob->fg_flag & FREAD) == 0) {
		AUDIT_ARG(vnpath_withref, fvp, ARG_VNODE1);
		error = EBADF;
		goto out;
	}

	if ((error = vnode_getwithref(fvp))) {
		goto out;
	}

	AUDIT_ARG(vnpath, fvp, ARG_VNODE1);

	error = clonefile_internal(fvp, TRUE, uap->dst_dirfd, uap->dst,
	    uap->flags, ctx);

	vnode_put(fvp);
out:
	file_drop(uap->src_fd);
	return error;
}
8857
/*
 * rename_submounts_callback
 *
 * Mount-iteration callback used during directory rename: for every mount
 * 'mp' whose mount path is strictly under that of the renamed parent
 * mount 'arg' (prefix match followed by '/'), refresh mp's recorded
 * f_mntonname from its covered vnode so it reflects the post-rename path.
 *
 * Returns 0 for mounts that are not submounts (skip), -1 if the mount
 * could not be busied, or the vn_getpath_ext error.
 */
static int
rename_submounts_callback(mount_t mp, void *arg)
{
	int error = 0;
	mount_t pmp = (mount_t)arg;
	int prefix_len = (int)strlen(pmp->mnt_vfsstat.f_mntonname);

	/* Not under the renamed parent's mount path: nothing to do. */
	if (strncmp(mp->mnt_vfsstat.f_mntonname, pmp->mnt_vfsstat.f_mntonname, prefix_len) != 0) {
		return 0;
	}

	/* Require a path-component boundary, not just a string prefix. */
	if (mp->mnt_vfsstat.f_mntonname[prefix_len] != '/') {
		return 0;
	}

	if ((error = vfs_busy(mp, LK_NOWAIT))) {
		printf("vfs_busy failed with %d for %s\n", error, mp->mnt_vfsstat.f_mntonname);
		return -1;
	}

	/* Rewrite f_mntonname in place from the covered vnode's new path. */
	size_t pathlen = MAXPATHLEN;
	if ((error = vn_getpath_ext(mp->mnt_vnodecovered, NULL, mp->mnt_vfsstat.f_mntonname, &pathlen, VN_GETPATH_FSENTER))) {
		printf("vn_getpath_ext failed with %d for mnt_vnodecovered of %s\n", error, mp->mnt_vfsstat.f_mntonname);
	}

	vfs_unbusy(mp);

	return error;
}
8887
8888 /*
8889 * Rename files. Source and destination must either both be directories,
8890 * or both not be directories. If target is a directory, it must be empty.
8891 */
8892 /* ARGSUSED */
static int
renameat_internal(vfs_context_t ctx, int fromfd, user_addr_t from,
    int tofd, user_addr_t to, int segflg, u_int uflags)
{
	vnode_t tvp, tdvp;
	vnode_t fvp, fdvp;
	vnode_t mnt_fvp;
	struct nameidata *fromnd, *tond;
	int error = 0;
	int do_retry;
	int retry_count;
	int mntrename;
	int need_event;
	int need_kpath2;
	int has_listeners;
	const char *oname = NULL;
	char *from_name = NULL, *to_name = NULL;
	char *from_name_no_firmlink = NULL, *to_name_no_firmlink = NULL;
	int from_len = 0, to_len = 0;
	int from_len_no_firmlink = 0, to_len_no_firmlink = 0;
	int holding_mntlock;
	int vn_authorize_skipped;
	mount_t locked_mp = NULL;
	vnode_t oparent = NULLVP;
#if CONFIG_FSE
	fse_info from_finfo = {}, to_finfo;
#endif
	int from_truncated = 0, to_truncated = 0;
	int from_truncated_no_firmlink = 0, to_truncated_no_firmlink = 0;
	int batched = 0;
	struct vnode_attr *fvap, *tvap;
	/* NOTE(review): 'continuing' is initialized to 0 and never set again in
	 * this function; the NAMEI_CONTLOOKUP checks below therefore always
	 * re-run nameiat() on re-entry via continue_lookup — confirm intent. */
	int continuing = 0;
	vfs_rename_flags_t flags = uflags & VFS_RENAME_FLAGS_MASK;
	int32_t nofollow_any = 0;
	/* carving out a chunk for structs that are too big to be on stack. */
	struct {
		struct nameidata from_node, to_node;
		struct vnode_attr fv_attr, tv_attr;
	} * __rename_data;

	__rename_data = kalloc_type(typeof(*__rename_data), Z_WAITOK);
	fromnd = &__rename_data->from_node;
	tond = &__rename_data->to_node;

	holding_mntlock = 0;
	do_retry = 0;
	retry_count = 0;
retry:
	/*
	 * Re-entry point when the whole rename must be re-driven: after
	 * taking the mount rename lock, after an ERECYCLE/ENOENT race, or
	 * after materializing a dataless object.  Reset per-attempt state.
	 */
	fvp = tvp = NULL;
	fdvp = tdvp = NULL;
	fvap = tvap = NULL;
	mnt_fvp = NULLVP;
	mntrename = FALSE;
	vn_authorize_skipped = FALSE;

	if (uflags & RENAME_NOFOLLOW_ANY) {
		nofollow_any = NAMEI_NOFOLLOW_ANY;
	}
	NDINIT(fromnd, DELETE, OP_UNLINK, WANTPARENT | AUDITVNPATH1,
	    segflg, from, ctx);
	fromnd->ni_flag = NAMEI_COMPOUNDRENAME | nofollow_any;

	NDINIT(tond, RENAME, OP_RENAME, WANTPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK,
	    segflg, to, ctx);
	tond->ni_flag = NAMEI_COMPOUNDRENAME | nofollow_any;

continue_lookup:
	/* Re-entry point after EKEEPLOOKING from a compound rename VNOP. */
	if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
		if ((error = nameiat(fromnd, fromfd))) {
			goto out1;
		}
		fdvp = fromnd->ni_dvp;
		fvp = fromnd->ni_vp;

		if (fvp && fvp->v_type == VDIR) {
			tond->ni_cnd.cn_flags |= WILLBEDIR;
		}
	}

	if ((tond->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
		if ((error = nameiat(tond, tofd))) {
			/*
			 * Translate error code for rename("dir1", "dir2/.").
			 */
			if (error == EISDIR && fvp->v_type == VDIR) {
				error = EINVAL;
			}
			goto out1;
		}
		tdvp = tond->ni_dvp;
		tvp = tond->ni_vp;
	}

#if DEVELOPMENT || DEBUG
	/*
	 * XXX VSWAP: Check for entitlements or special flag here
	 * so we can restrict access appropriately.
	 */
#else /* DEVELOPMENT || DEBUG */

	/* Only the kernel context may rename swap files. */
	if (fromnd->ni_vp && vnode_isswap(fromnd->ni_vp) && (ctx != vfs_context_kernel())) {
		error = EPERM;
		goto out1;
	}

	if (tond->ni_vp && vnode_isswap(tond->ni_vp) && (ctx != vfs_context_kernel())) {
		error = EPERM;
		goto out1;
	}
#endif /* DEVELOPMENT || DEBUG */

	/* RENAME_SWAP requires both endpoints to exist. */
	if (!tvp && ISSET(flags, VFS_RENAME_SWAP)) {
		error = ENOENT;
		goto out1;
	}

	if (tvp && ISSET(flags, VFS_RENAME_EXCL)) {
		int32_t pval = 0;
		int err = 0;

		/*
		 * We allow rename with VFS_RENAME_EXCL flag for an existing file which
		 * has the same name as target iff the following conditions are met:
		 * 1. the target file system is case insensitive
		 * 2. source and target directories are the same
		 * 3. source and target files are the same
		 * 4. name only differs in case (determined by underlying filesystem)
		 */
		if (fvp != tvp || fdvp != tdvp) {
			error = EEXIST;
			goto out1;
		}

		/*
		 * Assume that the target file system is case sensitive if
		 * _PC_CASE_SENSITIVE selector isn't supported.
		 */
		err = VNOP_PATHCONF(tvp, _PC_CASE_SENSITIVE, &pval, ctx);
		if (err != 0 || pval != 0) {
			error = EEXIST;
			goto out1;
		}
	}

	batched = vnode_compound_rename_available(fdvp);

#if CONFIG_FSE
	/*
	 * Gather fsevent info up front; for batched (compound) renames the
	 * vnodes may not exist yet, so capture notify attributes instead.
	 */
	need_event = need_fsevent(FSE_RENAME, fdvp);
	if (need_event) {
		if (fvp) {
			get_fse_info(fvp, &from_finfo, ctx);
		} else {
			error = vfs_get_notify_attributes(&__rename_data->fv_attr);
			if (error) {
				goto out1;
			}

			fvap = &__rename_data->fv_attr;
		}

		if (tvp) {
			get_fse_info(tvp, &to_finfo, ctx);
		} else if (batched) {
			error = vfs_get_notify_attributes(&__rename_data->tv_attr);
			if (error) {
				goto out1;
			}

			tvap = &__rename_data->tv_attr;
		}
	}
#else
	need_event = 0;
#endif /* CONFIG_FSE */

	has_listeners = kauth_authorize_fileop_has_listeners();

	need_kpath2 = 0;
#if CONFIG_AUDIT
	if (AUDIT_RECORD_EXISTS()) {
		need_kpath2 = 1;
	}
#endif

	/* Build full paths only if fsevents, kauth listeners or audit need them. */
	if (need_event || has_listeners) {
		if (from_name == NULL) {
			GET_PATH(from_name);
		}

		from_len = safe_getpath(fdvp, fromnd->ni_cnd.cn_nameptr, from_name, MAXPATHLEN, &from_truncated);

		if (from_name_no_firmlink == NULL) {
			GET_PATH(from_name_no_firmlink);
		}

		from_len_no_firmlink = safe_getpath_no_firmlink(fdvp, fromnd->ni_cnd.cn_nameptr, from_name_no_firmlink, MAXPATHLEN, &from_truncated_no_firmlink);
	}

	if (need_event || need_kpath2 || has_listeners) {
		if (to_name == NULL) {
			GET_PATH(to_name);
		}

		to_len = safe_getpath(tdvp, tond->ni_cnd.cn_nameptr, to_name, MAXPATHLEN, &to_truncated);

		if (to_name_no_firmlink == NULL) {
			GET_PATH(to_name_no_firmlink);
		}

		to_len_no_firmlink = safe_getpath_no_firmlink(tdvp, tond->ni_cnd.cn_nameptr, to_name_no_firmlink, MAXPATHLEN, &to_truncated_no_firmlink);
		if (to_name && need_kpath2) {
			AUDIT_ARG(kpath, to_name, ARG_KPATH2);
		}
	}
	if (!fvp) {
		/*
		 * Claim: this check will never reject a valid rename.
		 * For success, either fvp must be on the same mount as tdvp, or fvp must sit atop a vnode on the same mount as tdvp.
		 * Suppose fdvp and tdvp are not on the same mount.
		 * If fvp is on the same mount as tdvp, then fvp is not on the same mount as fdvp, so fvp is the root of its filesystem.  If fvp is the root,
		 * then you can't move it to within another dir on the same mountpoint.
		 * If fvp sits atop a vnode on the same mount as fdvp, then that vnode must be part of the same mount as fdvp, which is a contradiction.
		 *
		 * If this check passes, then we are safe to pass these vnodes to the same FS.
		 */
		if (fdvp->v_mount != tdvp->v_mount) {
			error = EXDEV;
			goto out1;
		}
		goto skipped_lookup;
	}

	/*
	 * If the source and destination are the same (i.e. they're
	 * links to the same vnode) and the target file system is
	 * case sensitive, then there is nothing to do.
	 *
	 * XXX Come back to this.
	 */
	if (fvp == tvp) {
		int pathconf_val;

		/*
		 * Note: if _PC_CASE_SENSITIVE selector isn't supported,
		 * then assume that this file system is case sensitive.
		 */
		if (VNOP_PATHCONF(fvp, _PC_CASE_SENSITIVE, &pathconf_val, ctx) != 0 ||
		    pathconf_val != 0) {
			vn_authorize_skipped = TRUE;
			goto out1;
		}
	}

	/*
	 * Allow the renaming of mount points.
	 * - target must not exist
	 * - target must reside in the same directory as source
	 * - union mounts cannot be renamed
	 * - the root fs, and tightly-linked system volumes, cannot be renamed
	 *
	 * XXX Handle this in VFS after a continued lookup (if we missed
	 * in the cache to start off)
	 *
	 * N.B. If RENAME_SWAP is being used, then @tvp != NULL and so
	 * we'll skip past here.  The file system is responsible for
	 * checking that @tvp is not a descendent of @fvp and vice versa
	 * so it should always return EINVAL if either @tvp or @fvp is the
	 * root of a volume.
	 */
	if ((fvp->v_flag & VROOT) &&
	    (fvp->v_type == VDIR) &&
	    (tvp == NULL) &&
	    (fvp->v_mountedhere == NULL) &&
	    (fdvp == tdvp) &&
	    ((fvp->v_mount->mnt_flag & (MNT_UNION | MNT_ROOTFS)) == 0) &&
	    ((fvp->v_mount->mnt_kern_flag & MNTK_SYSTEM) == 0) &&
	    (fvp->v_mount->mnt_vnodecovered != NULLVP)) {
		vnode_t coveredvp;

		/* switch fvp to the covered vnode */
		coveredvp = fvp->v_mount->mnt_vnodecovered;
		if ((vnode_getwithref(coveredvp))) {
			error = ENOENT;
			goto out1;
		}
		/*
		 * Save the 'fvp' as it is needed for vn_authorize_renamex_with_paths()
		 * later.
		 */
		mnt_fvp = fvp;

		fvp = coveredvp;
		mntrename = TRUE;
	}
	/*
	 * Check for cross-device rename.
	 */
	if ((fvp->v_mount != tdvp->v_mount) ||
	    (tvp && (fvp->v_mount != tvp->v_mount))) {
		error = EXDEV;
		goto out1;
	}

	/*
	 * If source is the same as the destination (that is the
	 * same inode number) then there is nothing to do...
	 * EXCEPT if the underlying file system supports case
	 * insensitivity and is case preserving.  In this case
	 * the file system needs to handle the special case of
	 * getting the same vnode as target (fvp) and source (tvp).
	 *
	 * Only file systems that support pathconf selectors _PC_CASE_SENSITIVE
	 * and _PC_CASE_PRESERVING can have this exception, and they need to
	 * handle the special case of getting the same vnode as target and
	 * source.  NOTE: Then the target is unlocked going into vnop_rename,
	 * so not to cause locking problems.  There is a single reference on tvp.
	 *
	 * NOTE - that fvp == tvp also occurs if they are hard linked and
	 * that correct behaviour then is just to return success without doing
	 * anything.
	 *
	 * XXX filesystem should take care of this itself, perhaps...
	 */
	if (fvp == tvp && fdvp == tdvp) {
		if (fromnd->ni_cnd.cn_namelen == tond->ni_cnd.cn_namelen &&
		    !bcmp(fromnd->ni_cnd.cn_nameptr, tond->ni_cnd.cn_nameptr,
		    fromnd->ni_cnd.cn_namelen)) {
			vn_authorize_skipped = TRUE;
			goto out1;
		}
	}

	if (holding_mntlock && fvp->v_mount != locked_mp) {
		/*
		 * we're holding a reference and lock
		 * on locked_mp, but it no longer matches
		 * what we want to do... so drop our hold
		 */
		mount_unlock_renames(locked_mp);
		mount_drop(locked_mp, 0);
		holding_mntlock = 0;
	}
	if (tdvp != fdvp && fvp->v_type == VDIR) {
		/*
		 * serialize renames that re-shape
		 * the tree... if holding_mntlock is
		 * set, then we're ready to go...
		 * otherwise we
		 * first need to drop the iocounts
		 * we picked up, second take the
		 * lock to serialize the access,
		 * then finally start the lookup
		 * process over with the lock held
		 */
		if (!holding_mntlock) {
			/*
			 * need to grab a reference on
			 * the mount point before we
			 * drop all the iocounts... once
			 * the iocounts are gone, the mount
			 * could follow
			 */
			locked_mp = fvp->v_mount;
			mount_ref(locked_mp, 0);

			/*
			 * nameidone has to happen before we vnode_put(tvp)
			 * since it may need to release the fs_nodelock on the tvp
			 */
			nameidone(tond);

			if (tvp) {
				vnode_put(tvp);
			}
			vnode_put(tdvp);

			/*
			 * nameidone has to happen before we vnode_put(fdvp)
			 * since it may need to release the fs_nodelock on the fvp
			 */
			nameidone(fromnd);

			vnode_put(fvp);
			vnode_put(fdvp);

			if (mnt_fvp != NULLVP) {
				vnode_put(mnt_fvp);
			}

			mount_lock_renames(locked_mp);
			holding_mntlock = 1;

			goto retry;
		}
	} else {
		/*
		 * when we dropped the iocounts to take
		 * the lock, we allowed the identity of
		 * the various vnodes to change... if they did,
		 * we may no longer be dealing with a rename
		 * that reshapes the tree... once we're holding
		 * the iocounts, the vnodes can't change type
		 * so we're free to drop the lock at this point
		 * and continue on
		 */
		if (holding_mntlock) {
			mount_unlock_renames(locked_mp);
			mount_drop(locked_mp, 0);
			holding_mntlock = 0;
		}
	}

	if (!batched) {
		error = vn_authorize_renamex_with_paths(fdvp, mntrename ? mnt_fvp : fvp,
		    &fromnd->ni_cnd, from_name, tdvp, tvp, &tond->ni_cnd, to_name, ctx,
		    flags, NULL);
		if (error) {
			if (error == ENOENT) {
				if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
					/*
					 * We encountered a race where after doing the namei,
					 * tvp stops being valid. If so, simply re-drive the rename
					 * call from the top.
					 */
					do_retry = 1;
					retry_count += 1;
				}
			}
			goto out1;
		}
	}

	/* Release the 'mnt_fvp' now that it is no longer needed. */
	if (mnt_fvp != NULLVP) {
		vnode_put(mnt_fvp);
		mnt_fvp = NULLVP;
	}

	// save these off so we can later verify that fvp is the same
	oname = fvp->v_name;
	oparent = fvp->v_parent;

skipped_lookup:
#if CONFIG_FILE_LEASES
	/* Lease break needed for source's parent dir? */
	vnode_breakdirlease(fdvp, false, O_WRONLY);

	/* Lease break needed for target's parent dir? */
	vnode_breakdirlease(tdvp, false, O_WRONLY);
#endif

	error = vn_rename(fdvp, &fvp, &fromnd->ni_cnd, fvap,
	    tdvp, &tvp, &tond->ni_cnd, tvap,
	    flags, ctx);

	if (holding_mntlock) {
		/*
		 * we can drop our serialization
		 * lock now
		 */
		mount_unlock_renames(locked_mp);
		mount_drop(locked_mp, 0);
		holding_mntlock = 0;
	}
	if (error) {
		if (error == EDATALESS) {
			/*
			 * If we've been here before, something has gone
			 * horribly wrong and we should just get out lest
			 * we spiral around the drain forever.
			 */
			if (flags & VFS_RENAME_DATALESS) {
				error = EIO;
				goto out1;
			}

			/*
			 * The object we're renaming is dataless (or has a
			 * dataless descendent) and requires materialization
			 * before the rename occurs.  But we're holding the
			 * mount point's rename lock, so it's not safe to
			 * make the upcall.
			 *
			 * In this case, we release the lock (above), perform
			 * the materialization, and start the whole thing over.
			 */
			error = vfs_materialize_reparent(fvp, tdvp);
			if (error == 0) {
				/*
				 * The next time around we need to tell the
				 * file system that the materializtaion has
				 * been performed.
				 */
				flags |= VFS_RENAME_DATALESS;
				do_retry = 1;
			}
			goto out1;
		}
		if (error == EKEEPLOOKING) {
			if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) == 0) {
				if ((tond->ni_flag & NAMEI_CONTLOOKUP) == 0) {
					panic("EKEEPLOOKING without NAMEI_CONTLOOKUP on either ndp?");
				}
			}

			fromnd->ni_vp = fvp;
			tond->ni_vp = tvp;

			goto continue_lookup;
		}

		/*
		 * We may encounter a race in the VNOP where the destination didn't
		 * exist when we did the namei, but it does by the time we go and
		 * try to create the entry. In this case, we should re-drive this rename
		 * call from the top again.  Currently, only HFS bubbles out ERECYCLE,
		 * but other filesystems susceptible to this race could return it, too.
		 */
		if (error == ERECYCLE) {
			if (retry_count < MAX_RENAME_ERECYCLE_RETRIES) {
				do_retry = 1;
				retry_count += 1;
			} else {
				printf("rename retry limit due to ERECYCLE reached\n");
				error = ENOENT;
			}
		}

		/*
		 * For compound VNOPs, the authorization callback may return
		 * ENOENT in case of racing hardlink lookups hitting the name
		 * cache, redrive the lookup.
		 */
		if (batched && error == ENOENT) {
			if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
				do_retry = 1;
				retry_count += 1;
			}
		}

		goto out1;
	}

	/* call out to allow 3rd party notification of rename.
	 * Ignore result of kauth_authorize_fileop call.
	 */
	kauth_authorize_fileop(vfs_context_ucred(ctx),
	    KAUTH_FILEOP_RENAME,
	    (uintptr_t)from_name, (uintptr_t)to_name);
	if (flags & VFS_RENAME_SWAP) {
		/* A swap is two renames; notify listeners of the reverse too. */
		kauth_authorize_fileop(vfs_context_ucred(ctx),
		    KAUTH_FILEOP_RENAME,
		    (uintptr_t)to_name, (uintptr_t)from_name);
	}

#if CONFIG_FSE
	if (from_name != NULL && to_name != NULL) {
		if (from_truncated || to_truncated) {
			// set it here since only the from_finfo gets reported up to user space
			from_finfo.mode |= FSE_TRUNCATED_PATH;
		}

		if (tvap && tvp) {
			vnode_get_fse_info_from_vap(tvp, &to_finfo, tvap);
		}
		if (fvap) {
			vnode_get_fse_info_from_vap(fvp, &from_finfo, fvap);
		}

		if (tvp) {
			add_fsevent(FSE_RENAME, ctx,
			    FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
			    FSE_ARG_FINFO, &from_finfo,
			    FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
			    FSE_ARG_FINFO, &to_finfo,
			    FSE_ARG_DONE);
			if (flags & VFS_RENAME_SWAP) {
				/*
				 * Strictly speaking, swap is the equivalent of
				 * *three* renames.  FSEvents clients should only take
				 * the events as a hint, so we only bother reporting
				 * two.
				 */
				add_fsevent(FSE_RENAME, ctx,
				    FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
				    FSE_ARG_FINFO, &to_finfo,
				    FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
				    FSE_ARG_FINFO, &from_finfo,
				    FSE_ARG_DONE);
			}
		} else {
			add_fsevent(FSE_RENAME, ctx,
			    FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
			    FSE_ARG_FINFO, &from_finfo,
			    FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
			    FSE_ARG_DONE);
		}
	}
#endif /* CONFIG_FSE */

	/*
	 * update filesystem's mount point data
	 */
	if (mntrename) {
		char *cp, *pathend, *mpname;
		char * tobuf;
		struct mount *mp;
		int maxlen;
		size_t len = 0;

		mp = fvp->v_mountedhere;

		if (vfs_busy(mp, LK_NOWAIT)) {
			error = EBUSY;
			goto out1;
		}
		tobuf = zalloc(ZV_NAMEI);

		if (UIO_SEG_IS_USER_SPACE(segflg)) {
			error = copyinstr(to, tobuf, MAXPATHLEN, &len);
		} else {
			error = copystr((void *)to, tobuf, MAXPATHLEN, &len);
		}
		if (!error) {
			/* find current mount point prefix */
			pathend = &mp->mnt_vfsstat.f_mntonname[0];
			for (cp = pathend; *cp != '\0'; ++cp) {
				if (*cp == '/') {
					pathend = cp + 1;
				}
			}
			/* find last component of target name */
			for (mpname = cp = tobuf; *cp != '\0'; ++cp) {
				if (*cp == '/') {
					mpname = cp + 1;
				}
			}

			/* Update f_mntonname of sub mounts */
			vfs_iterate(0, rename_submounts_callback, (void *)mp);

			/* append name to prefix */
			maxlen = MAXPATHLEN - (int)(pathend - mp->mnt_vfsstat.f_mntonname);
			bzero(pathend, maxlen);

			strlcpy(pathend, mpname, maxlen);
		}
		zfree(ZV_NAMEI, tobuf);

		vfs_unbusy(mp);

		vfs_event_signal(NULL, VQ_UPDATE, (intptr_t)NULL);
	}
	/*
	 * fix up name & parent pointers.  note that we first
	 * check that fvp has the same name/parent pointers it
	 * had before the rename call... this is a 'weak' check
	 * at best...
	 *
	 * XXX oparent and oname may not be set in the compound vnop case
	 */
	if (batched || (oname == fvp->v_name && oparent == fvp->v_parent)) {
		int update_flags;

		update_flags = VNODE_UPDATE_NAME;

		if (fdvp != tdvp) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		vnode_update_identity(fvp, tdvp, tond->ni_cnd.cn_nameptr, tond->ni_cnd.cn_namelen, tond->ni_cnd.cn_hash, update_flags);
	}
out1:
	/*
	 * There are some cases (for e.g. 'fvp == tvp') when vn_authorize was
	 * skipped earlier as no actual rename was performed.
	 */
	if (vn_authorize_skipped && error == 0) {
		error = vn_authorize_renamex_with_paths(fdvp, fvp,
		    &fromnd->ni_cnd, from_name, tdvp, tvp, &tond->ni_cnd, to_name, ctx,
		    flags, NULL);
		if (error && error == ENOENT) {
			if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
				do_retry = 1;
				retry_count += 1;
			}
		}
	}
	if (to_name != NULL) {
		RELEASE_PATH(to_name);
		to_name = NULL;
	}
	if (to_name_no_firmlink != NULL) {
		RELEASE_PATH(to_name_no_firmlink);
		to_name_no_firmlink = NULL;
	}
	if (from_name != NULL) {
		RELEASE_PATH(from_name);
		from_name = NULL;
	}
	if (from_name_no_firmlink != NULL) {
		RELEASE_PATH(from_name_no_firmlink);
		from_name_no_firmlink = NULL;
	}
	if (holding_mntlock) {
		mount_unlock_renames(locked_mp);
		mount_drop(locked_mp, 0);
		holding_mntlock = 0;
	}
	if (tdvp) {
		/*
		 * nameidone has to happen before we vnode_put(tdvp)
		 * since it may need to release the fs_nodelock on the tdvp
		 */
		nameidone(tond);

		if (tvp) {
			vnode_put(tvp);
		}
		vnode_put(tdvp);
	}
	if (fdvp) {
		/*
		 * nameidone has to happen before we vnode_put(fdvp)
		 * since it may need to release the fs_nodelock on the fdvp
		 */
		nameidone(fromnd);

		if (fvp) {
			vnode_put(fvp);
		}
		vnode_put(fdvp);
	}
	if (mnt_fvp != NULLVP) {
		vnode_put(mnt_fvp);
	}
	/*
	 * If things changed after we did the namei, then we will re-drive
	 * this rename call from the top.
	 */
	if (do_retry) {
		do_retry = 0;
		goto retry;
	}

	kfree_type(typeof(*__rename_data), __rename_data);
	return error;
}
9641
9642 int
rename(__unused proc_t p,struct rename_args * uap,__unused int32_t * retval)9643 rename(__unused proc_t p, struct rename_args *uap, __unused int32_t *retval)
9644 {
9645 return renameat_internal(vfs_context_current(), AT_FDCWD, uap->from,
9646 AT_FDCWD, uap->to, UIO_USERSPACE, 0);
9647 }
9648
9649 int
renameatx_np(__unused proc_t p,struct renameatx_np_args * uap,__unused int32_t * retval)9650 renameatx_np(__unused proc_t p, struct renameatx_np_args *uap, __unused int32_t *retval)
9651 {
9652 if (uap->flags & ~(RENAME_SECLUDE | RENAME_EXCL | RENAME_SWAP | RENAME_NOFOLLOW_ANY)) {
9653 return EINVAL;
9654 }
9655
9656 if ((uap->flags & (RENAME_EXCL | RENAME_SWAP)) == (RENAME_EXCL | RENAME_SWAP)) {
9657 return EINVAL;
9658 }
9659
9660 return renameat_internal(vfs_context_current(), uap->fromfd, uap->from,
9661 uap->tofd, uap->to, UIO_USERSPACE, uap->flags);
9662 }
9663
9664 int
renameat(__unused proc_t p,struct renameat_args * uap,__unused int32_t * retval)9665 renameat(__unused proc_t p, struct renameat_args *uap, __unused int32_t *retval)
9666 {
9667 return renameat_internal(vfs_context_current(), uap->fromfd, uap->from,
9668 uap->tofd, uap->to, UIO_USERSPACE, 0);
9669 }
9670
9671 /*
9672 * Make a directory file.
9673 *
9674 * Returns: 0 Success
9675 * EEXIST
9676 * namei:???
9677 * vnode_authorize:???
9678 * vn_create:???
9679 */
9680 /* ARGSUSED */
/*
 * mkdir1at: common implementation for mkdir(2)/mkdirat(2)/mkdir_extended(2).
 *
 * Parameters:	ctx	VFS context of the caller
 *		path	path of the directory to create
 *		vap	attributes for the new directory (va_mode set by
 *			callers; va_type is forced to VDIR here)
 *		fd	directory fd the path is resolved against
 *		segflg	address space the path resides in
 *
 * Returns:	0 on success, or an errno value (EEXIST, namei:???,
 *		vn_authorize_mkdir:???, vn_create:???).
 */
static int
mkdir1at(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap, int fd,
    enum uio_seg segflg)
{
	vnode_t vp, dvp;
	int error;
	int update_flags = 0;
	int batched;
	struct nameidata nd;

	AUDIT_ARG(mode, vap->va_mode);
	NDINIT(&nd, CREATE, OP_MKDIR, LOCKPARENT | AUDITVNPATH1, segflg,
	    path, ctx);
	nd.ni_cnd.cn_flags |= WILLBEDIR;
	nd.ni_flag = NAMEI_COMPOUNDMKDIR;

continue_lookup:
	/* Re-entered when a compound VNOP returns EKEEPLOOKING below. */
	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* A non-NULL vp means the name already exists. */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}

	batched = vnode_compound_mkdir_available(dvp);

	VATTR_SET(vap, va_type, VDIR);

	/*
	 * XXX
	 * Don't authorize in VFS for compound VNOP.... mkdir -p today assumes that it will
	 * only get EXISTS or EISDIR for existing path components, and not that it could see
	 * EACCESS/EPERM--so if we authorize for mkdir on "/" for "mkdir -p /tmp/foo/bar/baz"
	 * it will fail in a spurious  manner.  Need to figure out if this is valid behavior.
	 */
	if ((error = vn_authorize_mkdir(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0) {
		if (error == EACCES || error == EPERM) {
			int error2;

			nameidone(&nd);
			vnode_put(dvp);
			dvp = NULLVP;

			/*
			 * Try a lookup without "NAMEI_COMPOUNDVNOP" to make sure we return EEXIST
			 * rather than EACCESS if the target exists.
			 */
			NDINIT(&nd, LOOKUP, OP_MKDIR, AUDITVNPATH1, segflg,
			    path, ctx);
			error2 = nameiat(&nd, fd);
			if (error2) {
				goto out;
			} else {
				vp = nd.ni_vp;
				error = EEXIST;
				goto out;
			}
		}

		goto out;
	}

#if CONFIG_FILE_LEASES
	/* Writing a new entry into dvp may require breaking its dir lease. */
	vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

	/*
	 * make the directory
	 */
	if ((error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx)) != 0) {
		if (error == EKEEPLOOKING) {
			/* Compound VNOP needs the lookup continued from here. */
			nd.ni_vp = vp;
			goto continue_lookup;
		}

		goto out;
	}

	// Make sure the name & parent pointers are hooked up
	if (vp->v_name == NULL) {
		update_flags |= VNODE_UPDATE_NAME;
	}
	if (vp->v_parent == NULLVP) {
		update_flags |= VNODE_UPDATE_PARENT;
	}

	if (update_flags) {
		vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
	}

#if CONFIG_FSE
	add_fsevent(FSE_CREATE_DIR, ctx, FSE_ARG_VNODE, vp, FSE_ARG_DONE);
#endif

out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	if (dvp) {
		vnode_put(dvp);
	}

	return error;
}
9796
9797 /*
9798 * mkdir_extended: Create a directory; with extended security (ACL).
9799 *
9800 * Parameters: p Process requesting to create the directory
9801 * uap User argument descriptor (see below)
9802 * retval (ignored)
9803 *
9804 * Indirect: uap->path Path of directory to create
9805 * uap->mode Access permissions to set
9806 * uap->xsecurity ACL to set
9807 *
9808 * Returns: 0 Success
9809 * !0 Not success
9810 *
9811 */
9812 int
mkdir_extended(proc_t p,struct mkdir_extended_args * uap,__unused int32_t * retval)9813 mkdir_extended(proc_t p, struct mkdir_extended_args *uap, __unused int32_t *retval)
9814 {
9815 int ciferror;
9816 kauth_filesec_t xsecdst;
9817 struct vnode_attr va;
9818
9819 AUDIT_ARG(owner, uap->uid, uap->gid);
9820
9821 xsecdst = NULL;
9822 if ((uap->xsecurity != USER_ADDR_NULL) &&
9823 ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)) {
9824 return ciferror;
9825 }
9826
9827 VATTR_INIT(&va);
9828 VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd.fd_cmask);
9829 if (xsecdst != NULL) {
9830 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
9831 va.va_vaflags |= VA_FILESEC_ACL;
9832 }
9833
9834 ciferror = mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
9835 UIO_USERSPACE);
9836 if (xsecdst != NULL) {
9837 kauth_filesec_free(xsecdst);
9838 }
9839 return ciferror;
9840 }
9841
9842 int
mkdir(proc_t p,struct mkdir_args * uap,__unused int32_t * retval)9843 mkdir(proc_t p, struct mkdir_args *uap, __unused int32_t *retval)
9844 {
9845 struct vnode_attr va;
9846
9847 VATTR_INIT(&va);
9848 VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd.fd_cmask);
9849
9850 return mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
9851 UIO_USERSPACE);
9852 }
9853
9854 int
mkdirat(proc_t p,struct mkdirat_args * uap,__unused int32_t * retval)9855 mkdirat(proc_t p, struct mkdirat_args *uap, __unused int32_t *retval)
9856 {
9857 struct vnode_attr va;
9858
9859 VATTR_INIT(&va);
9860 VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd.fd_cmask);
9861
9862 return mkdir1at(vfs_context_current(), uap->path, &va, uap->fd,
9863 UIO_USERSPACE);
9864 }
9865
9866 static int
rmdirat_internal(vfs_context_t ctx,int fd,user_addr_t dirpath,enum uio_seg segflg,int unlink_flags)9867 rmdirat_internal(vfs_context_t ctx, int fd, user_addr_t dirpath,
9868 enum uio_seg segflg, int unlink_flags)
9869 {
9870 struct {
9871 struct nameidata nd;
9872 #if CONFIG_FSE
9873 struct vnode_attr va;
9874 #endif /* CONFIG_FSE */
9875 } *__rmdir_data;
9876 vnode_t vp, dvp;
9877 int error;
9878 struct nameidata *ndp;
9879 char *path = NULL;
9880 char *no_firmlink_path = NULL;
9881 int len_path = 0;
9882 int len_no_firmlink_path = 0;
9883 int has_listeners = 0;
9884 int need_event = 0;
9885 int truncated_path = 0;
9886 int truncated_no_firmlink_path = 0;
9887 struct vnode_attr *vap = NULL;
9888 int restart_count = 0;
9889 int batched;
9890
9891 int restart_flag;
9892 int nofollow_any = 0;
9893
9894 __rmdir_data = kalloc_type(typeof(*__rmdir_data), Z_WAITOK);
9895 ndp = &__rmdir_data->nd;
9896
9897 if (unlink_flags & VNODE_REMOVE_NOFOLLOW_ANY) {
9898 nofollow_any = NAMEI_NOFOLLOW_ANY;
9899 unlink_flags &= ~VNODE_REMOVE_NOFOLLOW_ANY;
9900 }
9901
9902 /*
9903 * This loop exists to restart rmdir in the unlikely case that two
9904 * processes are simultaneously trying to remove the same directory
9905 * containing orphaned appleDouble files.
9906 */
9907 do {
9908 NDINIT(ndp, DELETE, OP_RMDIR, LOCKPARENT | AUDITVNPATH1,
9909 segflg, dirpath, ctx);
9910 ndp->ni_flag = NAMEI_COMPOUNDRMDIR | nofollow_any;
9911 continue_lookup:
9912 restart_flag = 0;
9913 vap = NULL;
9914
9915 error = nameiat(ndp, fd);
9916 if (error) {
9917 goto err_out;
9918 }
9919
9920 dvp = ndp->ni_dvp;
9921 vp = ndp->ni_vp;
9922
9923 if (vp) {
9924 batched = vnode_compound_rmdir_available(vp);
9925
9926 if (vp->v_flag & VROOT) {
9927 /*
9928 * The root of a mounted filesystem cannot be deleted.
9929 */
9930 error = EBUSY;
9931 goto out;
9932 }
9933
9934 #if DEVELOPMENT || DEBUG
9935 /*
9936 * XXX VSWAP: Check for entitlements or special flag here
9937 * so we can restrict access appropriately.
9938 */
9939 #else /* DEVELOPMENT || DEBUG */
9940
9941 if (vnode_isswap(vp) && (ctx != vfs_context_kernel())) {
9942 error = EPERM;
9943 goto out;
9944 }
9945 #endif /* DEVELOPMENT || DEBUG */
9946
9947 /*
9948 * Removed a check here; we used to abort if vp's vid
9949 * was not the same as what we'd seen the last time around.
9950 * I do not think that check was valid, because if we retry
9951 * and all dirents are gone, the directory could legitimately
9952 * be recycled but still be present in a situation where we would
9953 * have had permission to delete. Therefore, we won't make
9954 * an effort to preserve that check now that we may not have a
9955 * vp here.
9956 */
9957
9958 if (!batched) {
9959 error = vn_authorize_rmdir(dvp, vp, &ndp->ni_cnd, ctx, NULL);
9960 if (error) {
9961 if (error == ENOENT) {
9962 if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
9963 restart_flag = 1;
9964 restart_count += 1;
9965 }
9966 }
9967 goto out;
9968 }
9969 }
9970 } else {
9971 batched = 1;
9972
9973 if (!vnode_compound_rmdir_available(dvp)) {
9974 panic("No error, but no compound rmdir?");
9975 }
9976 }
9977
9978 #if CONFIG_FSE
9979 fse_info finfo = {0};
9980
9981 need_event = need_fsevent(FSE_DELETE, dvp);
9982 if (need_event) {
9983 if (!batched) {
9984 get_fse_info(vp, &finfo, ctx);
9985 } else {
9986 error = vfs_get_notify_attributes(&__rmdir_data->va);
9987 if (error) {
9988 goto out;
9989 }
9990
9991 vap = &__rmdir_data->va;
9992 }
9993 }
9994 #endif
9995 has_listeners = kauth_authorize_fileop_has_listeners();
9996 if (need_event || has_listeners) {
9997 if (path == NULL) {
9998 GET_PATH(path);
9999 }
10000
10001 len_path = safe_getpath(dvp, ndp->ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path);
10002
10003 if (no_firmlink_path == NULL) {
10004 GET_PATH(no_firmlink_path);
10005 }
10006
10007 len_no_firmlink_path = safe_getpath_no_firmlink(dvp, ndp->ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);
10008 #if CONFIG_FSE
10009 if (truncated_no_firmlink_path) {
10010 finfo.mode |= FSE_TRUNCATED_PATH;
10011 }
10012 #endif
10013 }
10014
10015 #if CONFIG_FILE_LEASES
10016 vnode_breakdirlease(dvp, false, O_WRONLY);
10017 #endif
10018
10019 error = vn_rmdir(dvp, &vp, ndp, vap, ctx);
10020 ndp->ni_vp = vp;
10021 if (vp == NULLVP) {
10022 /* Couldn't find a vnode */
10023 goto out;
10024 }
10025
10026 if (error == EKEEPLOOKING) {
10027 goto continue_lookup;
10028 } else if (batched && error == ENOENT) {
10029 if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
10030 /*
10031 * For compound VNOPs, the authorization callback
10032 * may return ENOENT in case of racing hard link lookups
10033 * redrive the lookup.
10034 */
10035 restart_flag = 1;
10036 restart_count += 1;
10037 goto out;
10038 }
10039 }
10040
10041 /*
10042 * XXX There's no provision for passing flags
10043 * to VNOP_RMDIR(). So, if vn_rmdir() fails
10044 * because it's not empty, then we try again
10045 * with VNOP_REMOVE(), passing in a special
10046 * flag that clever file systems will know
10047 * how to handle.
10048 */
10049 if (error == ENOTEMPTY &&
10050 (unlink_flags & VNODE_REMOVE_DATALESS_DIR) != 0) {
10051 /*
10052 * Only do this if the directory is actually
10053 * marked as DATALESS.
10054 */
10055 struct vnode_attr *lvap =
10056 kalloc_type(struct vnode_attr, Z_WAITOK);
10057
10058 VATTR_INIT(lvap);
10059 VATTR_WANTED(lvap, va_flags);
10060 if (vnode_getattr(vp, lvap, ctx) == 0 &&
10061 VATTR_IS_SUPPORTED(lvap, va_flags) &&
10062 (lvap->va_flags & SF_DATALESS) != 0) {
10063 /*
10064 * If this fails, we want to keep the original
10065 * error.
10066 */
10067 if (vn_remove(dvp, &vp, ndp,
10068 VNODE_REMOVE_DATALESS_DIR, vap, ctx) == 0) {
10069 error = 0;
10070 }
10071 }
10072 kfree_type(struct vnode_attr, lvap);
10073 }
10074
10075 #if CONFIG_APPLEDOUBLE
10076 /*
10077 * Special case to remove orphaned AppleDouble
10078 * files. I don't like putting this in the kernel,
10079 * but carbon does not like putting this in carbon either,
10080 * so here we are.
10081 */
10082 if (error == ENOTEMPTY) {
10083 int ad_error = rmdir_remove_orphaned_appleDouble(vp, ctx, &restart_flag);
10084 if (ad_error == EBUSY) {
10085 error = ad_error;
10086 goto out;
10087 }
10088
10089
10090 /*
10091 * Assuming everything went well, we will try the RMDIR again
10092 */
10093 if (!ad_error) {
10094 error = vn_rmdir(dvp, &vp, ndp, vap, ctx);
10095 }
10096 }
10097 #endif /* CONFIG_APPLEDOUBLE */
10098 /*
10099 * Call out to allow 3rd party notification of delete.
10100 * Ignore result of kauth_authorize_fileop call.
10101 */
10102 if (!error) {
10103 if (has_listeners) {
10104 kauth_authorize_fileop(vfs_context_ucred(ctx),
10105 KAUTH_FILEOP_DELETE,
10106 (uintptr_t)vp,
10107 (uintptr_t)path);
10108 }
10109
10110 if (vp->v_flag & VISHARDLINK) {
10111 // see the comment in unlink1() about why we update
10112 // the parent of a hard link when it is removed
10113 vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
10114 }
10115
10116 #if CONFIG_FSE
10117 if (need_event) {
10118 if (vap) {
10119 vnode_get_fse_info_from_vap(vp, &finfo, vap);
10120 }
10121 add_fsevent(FSE_DELETE, ctx,
10122 FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
10123 FSE_ARG_FINFO, &finfo,
10124 FSE_ARG_DONE);
10125 }
10126 #endif
10127
10128 #if CONFIG_MACF
10129 mac_vnode_notify_unlink(ctx, dvp, vp, &ndp->ni_cnd);
10130 #endif
10131 }
10132
10133 out:
10134 if (path != NULL) {
10135 RELEASE_PATH(path);
10136 path = NULL;
10137 }
10138
10139 if (no_firmlink_path != NULL) {
10140 RELEASE_PATH(no_firmlink_path);
10141 no_firmlink_path = NULL;
10142 }
10143
10144 /*
10145 * nameidone has to happen before we vnode_put(dvp)
10146 * since it may need to release the fs_nodelock on the dvp
10147 */
10148 nameidone(ndp);
10149 vnode_put(dvp);
10150
10151 if (vp) {
10152 vnode_put(vp);
10153 }
10154
10155 if (restart_flag == 0) {
10156 wakeup_one((caddr_t)vp);
10157 goto err_out;
10158 }
10159 tsleep(vp, PVFS, "rm AD", 1);
10160 } while (restart_flag != 0);
10161
10162 err_out:
10163 kfree_type(typeof(*__rmdir_data), __rmdir_data);
10164
10165 return error;
10166 }
10167
10168 /*
10169 * Remove a directory file.
10170 */
10171 /* ARGSUSED */
10172 int
rmdir(__unused proc_t p,struct rmdir_args * uap,__unused int32_t * retval)10173 rmdir(__unused proc_t p, struct rmdir_args *uap, __unused int32_t *retval)
10174 {
10175 return rmdirat_internal(vfs_context_current(), AT_FDCWD,
10176 CAST_USER_ADDR_T(uap->path), UIO_USERSPACE, 0);
10177 }
10178
/*
 * Length of a struct direntry carrying a name of 'namlen' bytes, padded
 * up to 8-byte alignment.  Starts from sizeof(struct direntry) (which
 * embeds a maximal d_name array) and trims the unused tail of d_name.
 */
#define DIRENT64_LEN(namlen) \
	((sizeof(struct direntry) + (namlen) - (MAXPATHLEN-1) + 7) & ~7)

/*
 * Length of a struct dirent carrying a name of 'namelen' bytes (plus its
 * NUL terminator), padded up to 4-byte alignment.
 */
#define DIRENT_LEN(namelen) \
	((sizeof(struct dirent) + (namelen + 1) - (__DARWIN_MAXNAMLEN + 1) + 3) & ~3)

/*
 * Address of the last byte of a dirent record, per its d_reclen field.
 * Used to bounds-check records read from the file system.
 */
#define DIRENT_END(dep) \
	(((char *)(dep)) + (dep)->d_reclen - 1)
10190
10191 errno_t
vnode_readdir64(struct vnode * vp,struct uio * uio,int flags,int * eofflag,int * numdirent,vfs_context_t ctxp)10192 vnode_readdir64(struct vnode *vp, struct uio *uio, int flags, int *eofflag,
10193 int *numdirent, vfs_context_t ctxp)
10194 {
10195 /* Check if fs natively supports VNODE_READDIR_EXTENDED */
10196 if ((vp->v_mount->mnt_vtable->vfc_vfsflags & VFC_VFSREADDIR_EXTENDED) &&
10197 ((vp->v_mount->mnt_kern_flag & MNTK_DENY_READDIREXT) == 0)) {
10198 return VNOP_READDIR(vp, uio, flags, eofflag, numdirent, ctxp);
10199 } else {
10200 size_t bufsize;
10201 void * bufptr;
10202 uio_t auio;
10203 struct direntry *entry64;
10204 struct dirent *dep;
10205 size_t bytesread;
10206 int error;
10207
10208 /*
10209 * We're here because the underlying file system does not
10210 * support direnties or we mounted denying support so we must
10211 * fall back to dirents and convert them to direntries.
10212 *
10213 * Our kernel buffer needs to be smaller since re-packing will
10214 * expand each dirent. The worse case (when the name length
10215 * is 3 or less) corresponds to a struct direntry size of 32
10216 * bytes (8-byte aligned) and a struct dirent size of 12 bytes
10217 * (4-byte aligned). So having a buffer that is 3/8 the size
10218 * will prevent us from reading more than we can pack.
10219 *
10220 * Since this buffer is wired memory, we will limit the
10221 * buffer size to a maximum of 32K. We would really like to
10222 * use 32K in the MIN(), but we use magic number 87371 to
10223 * prevent uio_resid() * 3 / 8 from overflowing.
10224 */
10225 bufsize = 3 * MIN((user_size_t)uio_resid(uio), 87371u) / 8;
10226 bufptr = kalloc_data(bufsize, Z_WAITOK);
10227 if (bufptr == NULL) {
10228 return ENOMEM;
10229 }
10230
10231 auio = uio_create(1, 0, UIO_SYSSPACE, UIO_READ);
10232 uio_addiov(auio, (uintptr_t)bufptr, bufsize);
10233 auio->uio_offset = uio->uio_offset;
10234
10235 error = VNOP_READDIR(vp, auio, 0, eofflag, numdirent, ctxp);
10236
10237 dep = (struct dirent *)bufptr;
10238 bytesread = bufsize - uio_resid(auio);
10239
10240 entry64 = kalloc_type(struct direntry, Z_WAITOK);
10241 /*
10242 * Convert all the entries and copy them out to user's buffer.
10243 */
10244 while (error == 0 && (char *)dep < ((char *)bufptr + bytesread)) {
10245 /* First check that the dirent struct up to d_name is within the buffer */
10246 if ((char*)dep + offsetof(struct dirent, d_name) > ((char *)bufptr + bytesread) ||
10247 /* Check that the length of the entire dirent is within the buffer */
10248 DIRENT_END(dep) > ((char *)bufptr + bytesread) ||
10249 /* Check that the actual length including the name doesn't exceed d_reclen */
10250 DIRENT_LEN(dep->d_namlen) > dep->d_reclen) {
10251 printf("%s: %s: Bad dirent recived from directory %s\n", __func__,
10252 vp->v_mount->mnt_vfsstat.f_mntonname,
10253 vp->v_name ? vp->v_name : "<unknown>");
10254 error = EIO;
10255 break;
10256 }
10257
10258 size_t enbufsize = DIRENT64_LEN(dep->d_namlen);
10259
10260 bzero(entry64, enbufsize);
10261 /* Convert a dirent to a dirent64. */
10262 entry64->d_ino = dep->d_ino;
10263 entry64->d_seekoff = 0;
10264 entry64->d_reclen = (uint16_t)enbufsize;
10265 entry64->d_namlen = dep->d_namlen;
10266 entry64->d_type = dep->d_type;
10267 bcopy(dep->d_name, entry64->d_name, dep->d_namlen + 1);
10268
10269 /* Move to next entry. */
10270 dep = (struct dirent *)((char *)dep + dep->d_reclen);
10271
10272 /* Copy entry64 to user's buffer. */
10273 error = uiomove((caddr_t)entry64, entry64->d_reclen, uio);
10274 }
10275
10276 /* Update the real offset using the offset we got from VNOP_READDIR. */
10277 if (error == 0) {
10278 uio->uio_offset = auio->uio_offset;
10279 }
10280 uio_free(auio);
10281 kfree_data(bufptr, bufsize);
10282 kfree_type(struct direntry, entry64);
10283 return error;
10284 }
10285 }
10286
10287 #define GETDIRENTRIES_MAXBUFSIZE (128 * 1024 * 1024U)
10288
10289 /*
10290 * Read a block of directory entries in a file system independent format.
10291 */
/*
 * Common backend for getdirentries() and getdirentries64().
 *
 * Reads directory entries from the directory open on 'fd' into the user
 * buffer 'bufp' of size 'bufsize' (capped at GETDIRENTRIES_MAXBUFSIZE),
 * advancing the file's offset.  On success:
 *   *bytesread - number of bytes written to the user buffer
 *   *offset    - file offset BEFORE the read (if 'offset' is non-NULL)
 *   *eofflag   - set by the FS when end-of-directory was reached
 *
 * 'flags' selects the record format: VNODE_READDIR_EXTENDED routes
 * through vnode_readdir64() (struct direntry records); otherwise
 * VNOP_READDIR() is called directly for native struct dirent records.
 */
static int
getdirentries_common(int fd, user_addr_t bufp, user_size_t bufsize, ssize_t *bytesread,
    off_t *offset, int *eofflag, int flags)
{
	vnode_t vp;
	struct vfs_context context = *vfs_context_current(); /* local copy */
	struct fileproc *fp;
	uio_t auio;
	int spacetype = proc_is64bit(vfs_context_proc(&context)) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	off_t loff;
	int error, numdirent;
	UIO_STACKBUF(uio_buf, 1);

get_from_fd:
	error = fp_getfvp(vfs_context_proc(&context), fd, &fp, &vp);
	if (error) {
		return error;
	}

	/*
	 * Take the per-fileglob offset lock, then confirm the fd still maps
	 * to the vnode we resolved; the union-mount traversal below can swap
	 * the fd's data vnode, in which case drop everything and retry.
	 */
	vn_offset_lock(fp->fp_glob);
	if (((vnode_t)fp_get_data(fp)) != vp) {
		vn_offset_unlock(fp->fp_glob);
		file_drop(fd);
		goto get_from_fd;
	}

	if ((fp->fp_glob->fg_flag & FREAD) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EBADF;
		goto out;
	}

	/* Silently clamp oversized requests rather than failing them. */
	if (bufsize > GETDIRENTRIES_MAXBUFSIZE) {
		bufsize = GETDIRENTRIES_MAXBUFSIZE;
	}

#if CONFIG_MACF
	error = mac_file_check_change_offset(vfs_context_ucred(&context), fp->fp_glob);
	if (error) {
		goto out;
	}
#endif

	if ((error = vnode_getwithref(vp))) {
		goto out;
	}
	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

#if CONFIG_UNION_MOUNTS
unionread:
#endif /* CONFIG_UNION_MOUNTS */
	if (vp->v_type != VDIR) {
		(void)vnode_put(vp);
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_readdir(&context, vp);
	if (error != 0) {
		(void)vnode_put(vp);
		goto out;
	}
#endif /* MAC */

	/* Wrap the user buffer in a uio positioned at the current offset. */
	loff = fp->fp_glob->fg_offset;
	auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, bufp, bufsize);

	if (flags & VNODE_READDIR_EXTENDED) {
		error = vnode_readdir64(vp, auio, flags, eofflag, &numdirent, &context);
		fp->fp_glob->fg_offset = uio_offset(auio);
	} else {
		error = VNOP_READDIR(vp, auio, 0, eofflag, &numdirent, &context);
		fp->fp_glob->fg_offset = uio_offset(auio);
	}
	if (error) {
		(void)vnode_put(vp);
		goto out;
	}

#if CONFIG_UNION_MOUNTS
	/*
	 * Nothing was read and this is a union mount: descend to the
	 * covered directory, install it as the fd's vnode with a fresh
	 * offset, and read from there instead.
	 */
	if ((user_ssize_t)bufsize == uio_resid(auio) &&
	    (vp->v_mount->mnt_flag & MNT_UNION)) {
		vnode_t uvp;

		if (lookup_traverse_union(vp, &uvp, &context) == 0) {
			if (vnode_ref(uvp) == 0) {
				fp_set_data(fp, uvp);
				fp->fp_glob->fg_offset = 0;
				vnode_rele(vp);
				vnode_put(vp);
				vp = uvp;
				goto unionread;
			} else {
				/* could not get a ref, can't replace in fd */
				vnode_put(uvp);
			}
		}
	}
#endif /* CONFIG_UNION_MOUNTS */

	vnode_put(vp);
	if (offset) {
		*offset = loff;
	}

	*bytesread = bufsize - uio_resid(auio);
out:
	vn_offset_unlock(fp->fp_glob);
	file_drop(fd);
	return error;
}
10405
10406
10407 int
getdirentries(__unused struct proc * p,struct getdirentries_args * uap,int32_t * retval)10408 getdirentries(__unused struct proc *p, struct getdirentries_args *uap, int32_t *retval)
10409 {
10410 off_t offset;
10411 ssize_t bytesread;
10412 int error, eofflag;
10413
10414 AUDIT_ARG(fd, uap->fd);
10415 error = getdirentries_common(uap->fd, uap->buf, uap->count,
10416 &bytesread, &offset, &eofflag, 0);
10417
10418 if (error == 0) {
10419 if (proc_is64bit(p)) {
10420 user64_long_t base = (user64_long_t)offset;
10421 error = copyout((caddr_t)&base, uap->basep, sizeof(user64_long_t));
10422 } else {
10423 user32_long_t base = (user32_long_t)offset;
10424 error = copyout((caddr_t)&base, uap->basep, sizeof(user32_long_t));
10425 }
10426 *retval = (int)bytesread;
10427 }
10428 return error;
10429 }
10430
10431 int
getdirentries64(__unused struct proc * p,struct getdirentries64_args * uap,user_ssize_t * retval)10432 getdirentries64(__unused struct proc *p, struct getdirentries64_args *uap, user_ssize_t *retval)
10433 {
10434 off_t offset;
10435 ssize_t bytesread;
10436 int error, eofflag;
10437 user_size_t bufsize;
10438
10439 AUDIT_ARG(fd, uap->fd);
10440
10441 /*
10442 * If the buffer is at least GETDIRENTRIES64_EXTENDED_BUFSIZE large,
10443 * then the kernel carves out the last 4 bytes to return extended
10444 * information to userspace (namely whether we reached EOF with this call).
10445 */
10446 if (uap->bufsize >= GETDIRENTRIES64_EXTENDED_BUFSIZE) {
10447 bufsize = uap->bufsize - sizeof(getdirentries64_flags_t);
10448 } else {
10449 bufsize = uap->bufsize;
10450 }
10451
10452 error = getdirentries_common(uap->fd, uap->buf, bufsize,
10453 &bytesread, &offset, &eofflag, VNODE_READDIR_EXTENDED);
10454
10455 if (error == 0) {
10456 *retval = bytesread;
10457 error = copyout((caddr_t)&offset, uap->position, sizeof(off_t));
10458
10459 if (error == 0 && uap->bufsize >= GETDIRENTRIES64_EXTENDED_BUFSIZE) {
10460 getdirentries64_flags_t flags = 0;
10461 if (eofflag) {
10462 flags |= GETDIRENTRIES64_EOF;
10463 }
10464 error = copyout(&flags, (user_addr_t)uap->buf + bufsize,
10465 sizeof(flags));
10466 }
10467 }
10468 return error;
10469 }
10470
10471
10472 /*
10473 * Set the mode mask for creation of filesystem nodes.
10474 * XXX implement xsecurity
10475 */
10476 #define UMASK_NOXSECURITY (void *)1 /* leave existing xsecurity alone */
10477 static int
umask1(proc_t p,int newmask,__unused kauth_filesec_t fsec,int32_t * retval)10478 umask1(proc_t p, int newmask, __unused kauth_filesec_t fsec, int32_t *retval)
10479 {
10480 AUDIT_ARG(mask, newmask);
10481 proc_fdlock(p);
10482 *retval = p->p_fd.fd_cmask;
10483 p->p_fd.fd_cmask = newmask & ALLPERMS;
10484 proc_fdunlock(p);
10485 return 0;
10486 }
10487
10488 /*
10489 * umask_extended: Set the mode mask for creation of filesystem nodes; with extended security (ACL).
10490 *
10491 * Parameters: p Process requesting to set the umask
10492 * uap User argument descriptor (see below)
10493 * retval umask of the process (parameter p)
10494 *
10495 * Indirect: uap->newmask umask to set
10496 * uap->xsecurity ACL to set
10497 *
10498 * Returns: 0 Success
10499 * !0 Not success
10500 *
10501 */
int
umask_extended(proc_t p, struct umask_extended_args *uap, int32_t *retval)
{
	/*
	 * NOTE(review): uap->xsecurity is currently ignored — KAUTH_FILESEC_NONE
	 * is passed down and umask1() does not use its fsec argument (see the
	 * "XXX implement xsecurity" note above umask1).
	 */
	return umask1(p, uap->newmask, KAUTH_FILESEC_NONE, retval);
}
10507
int
umask(proc_t p, struct umask_args *uap, int32_t *retval)
{
	/* Plain umask(2): leaves any existing extended security alone. */
	return umask1(p, uap->newmask, UMASK_NOXSECURITY, retval);
}
10513
10514 #define REVOKE_MOUNTED_DEVICE_ENTITLEMENT \
10515 "com.apple.private.vfs.revoke-mounted-device"
10516
10517 /*
10518 * Void all references to file by ripping underlying filesystem
10519 * away from vnode.
10520 */
10521 /* ARGSUSED */
/*
 * revoke(2): void all references to the character or block device named
 * by uap->path by issuing VNOP_REVOKE.  Only the file's owner or the
 * superuser may revoke.  Block devices with a mounted filesystem cannot
 * be revoked (EBUSY); non-device vnodes return ENOTSUP.
 */
int
revoke(proc_t p, struct revoke_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	struct vnode_attr va;
	vfs_context_t ctx = vfs_context_current();
	int error;
	struct nameidata nd;

	NDINIT(&nd, LOOKUP, OP_REVOKE, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
	    uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;

	nameidone(&nd);

	/* Only device special files may be revoked. */
	if (!(vnode_ischr(vp) || vnode_isblk(vp))) {
		error = ENOTSUP;
		goto out;
	}

	/* Refuse to yank a block device out from under a mounted fs. */
	if (vnode_isblk(vp) && vnode_ismountedon(vp)) {
		error = EBUSY;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_revoke(ctx, vp);
	if (error) {
		goto out;
	}
#endif

	/* Caller must own the device or be superuser. */
	VATTR_INIT(&va);
	VATTR_WANTED(&va, va_uid);
	if ((error = vnode_getattr(vp, &va, ctx))) {
		goto out;
	}
	if (kauth_cred_getuid(vfs_context_ucred(ctx)) != va.va_uid &&
	    (error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
		goto out;
	}
	/* Only bother revoking if someone actually holds references. */
	if (vp->v_usecount > 0 || (vnode_isaliased(vp))) {
		VNOP_REVOKE(vp, REVOKEALL, ctx);
	}
out:
	vnode_put(vp);
	return error;
}
10574
10575
10576 /*
10577 * HFS/HFS PlUS SPECIFIC SYSTEM CALLS
10578 * The following system calls are designed to support features
10579 * which are specific to the HFS & HFS Plus volume formats
10580 */
10581
10582
10583 /*
10584 * Obtain attribute information on objects in a directory while enumerating
10585 * the directory.
10586 */
10587 /* ARGSUSED */
/*
 * getdirentriesattr(2): enumerate a directory open on uap->fd while
 * returning the attributes requested in uap->alist for each entry, via
 * VNOP_READDIRATTR.  On success *retval is the eofflag (1 at end of
 * directory), the matched-entry count is copied back to uap->count,
 * the directory's state token to uap->newstate, and the pre-read
 * offset to uap->basep.
 */
/* ARGSUSED */
int
getdirentriesattr(proc_t p, struct getdirentriesattr_args *uap, int32_t *retval)
{
	vnode_t vp;
	struct fileproc *fp;
	uio_t auio = NULL;
	int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	uint32_t count = 0, savecount = 0;
	uint32_t newstate = 0;
	int error, eofflag = 0;
	off_t loff = 0;
	struct attrlist attributelist;
	vfs_context_t ctx = vfs_context_current();
	int fd = uap->fd;
	UIO_STACKBUF(uio_buf, 1);
	kauth_action_t action;

	AUDIT_ARG(fd, fd);

	/* Get the attributes into kernel space */
	if ((error = copyin(uap->alist, (caddr_t)&attributelist, sizeof(attributelist)))) {
		return error;
	}
	if ((error = copyin(uap->count, (caddr_t)&count, sizeof(count)))) {
		return error;
	}
	/* Remember the caller's requested count for union-mount restarts. */
	savecount = count;

get_from_fd:
	if ((error = fp_getfvp(p, fd, &fp, &vp))) {
		return error;
	}

	/*
	 * Take the per-fileglob offset lock, then confirm the fd still maps
	 * to the vnode we resolved; the union-mount traversal below can swap
	 * the fd's data vnode, in which case drop everything and retry.
	 */
	vn_offset_lock(fp->fp_glob);
	if (((vnode_t)fp_get_data(fp)) != vp) {
		vn_offset_unlock(fp->fp_glob);
		file_drop(fd);
		goto get_from_fd;
	}

	if ((fp->fp_glob->fg_flag & FREAD) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EBADF;
		goto out;
	}


#if CONFIG_MACF
	error = mac_file_check_change_offset(vfs_context_ucred(ctx),
	    fp->fp_glob);
	if (error) {
		goto out;
	}
#endif


	if ((error = vnode_getwithref(vp))) {
		goto out;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

#if CONFIG_UNION_MOUNTS
unionread:
#endif /* CONFIG_UNION_MOUNTS */
	if (vp->v_type != VDIR) {
		(void)vnode_put(vp);
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_readdir(ctx, vp);
	if (error != 0) {
		(void)vnode_put(vp);
		goto out;
	}
#endif /* MAC */

	/* set up the uio structure which will contain the users return buffer */
	loff = fp->fp_glob->fg_offset;
	auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, uap->buffer, uap->buffersize);

	/*
	 * If the only item requested is file names, we can let that past with
	 * just LIST_DIRECTORY.  If they want any other attributes, that means
	 * they need SEARCH as well.
	 */
	action = KAUTH_VNODE_LIST_DIRECTORY;
	if ((attributelist.commonattr & ~ATTR_CMN_NAME) ||
	    attributelist.fileattr || attributelist.dirattr) {
		action |= KAUTH_VNODE_SEARCH;
	}

	if ((error = vnode_authorize(vp, NULL, action, ctx)) == 0) {
		/* Believe it or not, uap->options only has 32-bits of valid
		 * info, so truncate before extending again */

		error = VNOP_READDIRATTR(vp, &attributelist, auio, count,
		    (uint32_t)uap->options, &newstate, &eofflag, &count, ctx);
	}

	if (error) {
		(void) vnode_put(vp);
		goto out;
	}

#if CONFIG_UNION_MOUNTS
	/*
	 * If we've got the last entry of a directory in a union mount
	 * then reset the eofflag and pretend there's still more to come.
	 * The next call will again set eofflag and the buffer will be empty,
	 * so traverse to the underlying directory and do the directory
	 * read there.
	 */
	if (eofflag && vp->v_mount->mnt_flag & MNT_UNION) {
		if (uio_resid(auio) < (user_ssize_t) uap->buffersize) { // Got some entries
			eofflag = 0;
		} else { // Empty buffer
			vnode_t uvp;
			if (lookup_traverse_union(vp, &uvp, ctx) == 0) {
				if (vnode_ref_ext(uvp, fp->fp_glob->fg_flag & O_EVTONLY, 0) == 0) {
					fp_set_data(fp, uvp);
					fp->fp_glob->fg_offset = 0; // reset index for new dir
					count = savecount;
					vnode_rele_internal(vp, fp->fp_glob->fg_flag & O_EVTONLY, 0, 0);
					vnode_put(vp);
					vp = uvp;
					goto unionread;
				} else {
					/* could not get a ref, can't replace in fd */
					vnode_put(uvp);
				}
			}
		}
	}
#endif /* CONFIG_UNION_MOUNTS */

	(void)vnode_put(vp);

	if (error) {
		goto out;
	}
	fp->fp_glob->fg_offset = uio_offset(auio); /* should be multiple of dirent, not variable */

	/* Copy the results (count, state token, base offset) back out. */
	if ((error = copyout((caddr_t) &count, uap->count, sizeof(count)))) {
		goto out;
	}
	if ((error = copyout((caddr_t) &newstate, uap->newstate, sizeof(newstate)))) {
		goto out;
	}
	if ((error = copyout((caddr_t) &loff, uap->basep, sizeof(loff)))) {
		goto out;
	}

	*retval = eofflag; /* similar to getdirentries */
	error = 0;
out:
	vn_offset_unlock(fp->fp_glob);
	file_drop(fd);
	return error; /* return error earlier, a retval of 0 or 1 now */
} /* end of getdirentriesattr system call */
10751
10752 /*
10753 * Exchange data between two files
10754 */
10755
10756 /* ARGSUSED */
/*
 * exchangedata(2): atomically swap the data forks of the two files
 * named by uap->path1 and uap->path2 via VNOP_EXCHANGE.  Both must be
 * regular files on the same volume, and the caller needs read/write
 * authorization on each.  On success the cached names and parents of
 * the two vnodes are swapped so the name cache matches the new on-disk
 * identities, and fsevents/kauth listeners are notified.
 */
/* ARGSUSED */
int
exchangedata(__unused proc_t p, struct exchangedata_args *uap, __unused int32_t *retval)
{
	struct nameidata fnd, snd;
	vfs_context_t ctx = vfs_context_current();
	vnode_t fvp;
	vnode_t svp;
	int error;
	u_int32_t nameiflags;
	char *fpath = NULL;
	char *spath = NULL;
	int flen = 0, slen = 0;
	int from_truncated = 0, to_truncated = 0;
#if CONFIG_FSE
	fse_info f_finfo, s_finfo;
#endif

	nameiflags = 0;
	if ((uap->options & FSOPT_NOFOLLOW) == 0) {
		nameiflags |= FOLLOW;
	}

	/* Look up the first path ("from"). */
	NDINIT(&fnd, LOOKUP, OP_EXCHANGEDATA, nameiflags | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path1, ctx);

	error = namei(&fnd);
	if (error) {
		goto out2;
	}

	nameidone(&fnd);
	fvp = fnd.ni_vp;

	/* Look up the second path ("to"). */
	NDINIT(&snd, LOOKUP, OP_EXCHANGEDATA, CN_NBMOUNTLOOK | nameiflags | AUDITVNPATH2,
	    UIO_USERSPACE, uap->path2, ctx);

	error = namei(&snd);
	if (error) {
		vnode_put(fvp);
		goto out2;
	}
	nameidone(&snd);
	svp = snd.ni_vp;

	/*
	 * if the files are the same, return an inval error
	 */
	if (svp == fvp) {
		error = EINVAL;
		goto out;
	}

	/*
	 * if the files are on different volumes, return an error
	 */
	if (svp->v_mount != fvp->v_mount) {
		error = EXDEV;
		goto out;
	}

	/* If they're not files, return an error */
	if ((vnode_isreg(fvp) == 0) || (vnode_isreg(svp) == 0)) {
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_exchangedata(ctx,
	    fvp, svp);
	if (error) {
		goto out;
	}
#endif
	/* Caller must be able to read and write both files. */
	if (((error = vnode_authorize(fvp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0) ||
	    ((error = vnode_authorize(svp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0)) {
		goto out;
	}

	/* Resolve both paths up front only if someone will consume them. */
	if (
#if CONFIG_FSE
		need_fsevent(FSE_EXCHANGE, fvp) ||
#endif
		kauth_authorize_fileop_has_listeners()) {
		GET_PATH(fpath);
		GET_PATH(spath);

		flen = safe_getpath(fvp, NULL, fpath, MAXPATHLEN, &from_truncated);
		slen = safe_getpath(svp, NULL, spath, MAXPATHLEN, &to_truncated);

#if CONFIG_FSE
		get_fse_info(fvp, &f_finfo, ctx);
		get_fse_info(svp, &s_finfo, ctx);
		if (from_truncated || to_truncated) {
			// set it here since only the f_finfo gets reported up to user space
			f_finfo.mode |= FSE_TRUNCATED_PATH;
		}
#endif
	}
	/* Ok, make the call */
	error = VNOP_EXCHANGE(fvp, svp, 0, ctx);

	if (error == 0) {
		const char *tmpname;

		if (fpath != NULL && spath != NULL) {
			/* call out to allow 3rd party notification of exchangedata.
			 * Ignore result of kauth_authorize_fileop call.
			 */
			kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_EXCHANGE,
			    (uintptr_t)fpath, (uintptr_t)spath);
		}

		/*
		 * The on-disk identities were swapped; swap the cached
		 * names (and parents, if different) so the name cache
		 * stays consistent.
		 */
		name_cache_lock();

		tmpname = fvp->v_name;
		fvp->v_name = svp->v_name;
		svp->v_name = tmpname;

		if (fvp->v_parent != svp->v_parent) {
			vnode_t tmp;

			tmp = fvp->v_parent;
			fvp->v_parent = svp->v_parent;
			svp->v_parent = tmp;
		}
		name_cache_unlock();

#if CONFIG_FSE
		if (fpath != NULL && spath != NULL) {
			add_fsevent(FSE_EXCHANGE, ctx,
			    FSE_ARG_STRING, flen, fpath,
			    FSE_ARG_FINFO, &f_finfo,
			    FSE_ARG_STRING, slen, spath,
			    FSE_ARG_FINFO, &s_finfo,
			    FSE_ARG_DONE);
		}
#endif
	}

out:
	if (fpath != NULL) {
		RELEASE_PATH(fpath);
	}
	if (spath != NULL) {
		RELEASE_PATH(spath);
	}
	vnode_put(svp);
	vnode_put(fvp);
out2:
	return error;
}
10907
10908 /*
10909 * Return (in MB) the amount of freespace on the given vnode's volume.
10910 */
10911 uint32_t freespace_mb(vnode_t vp);
10912
10913 uint32_t
freespace_mb(vnode_t vp)10914 freespace_mb(vnode_t vp)
10915 {
10916 vfs_update_vfsstat(vp->v_mount, vfs_context_current(), VFS_USER_EVENT);
10917 return (uint32_t)(((uint64_t)vp->v_mount->mnt_vfsstat.f_bavail *
10918 vp->v_mount->mnt_vfsstat.f_bsize) >> 20);
10919 }
10920
10921 #if CONFIG_SEARCHFS
10922
10923 /* ARGSUSED */
10924
10925 int
searchfs(proc_t p,struct searchfs_args * uap,__unused int32_t * retval)10926 searchfs(proc_t p, struct searchfs_args *uap, __unused int32_t *retval)
10927 {
10928 vnode_t vp, tvp;
10929 int i, error = 0;
10930 int fserror = 0;
10931 struct nameidata nd;
10932 struct user64_fssearchblock searchblock;
10933 struct searchstate *state;
10934 struct attrlist *returnattrs;
10935 struct timeval timelimit;
10936 void *searchparams1, *searchparams2;
10937 uio_t auio = NULL;
10938 int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
10939 uint32_t nummatches;
10940 size_t mallocsize;
10941 uint32_t nameiflags;
10942 vfs_context_t ctx = vfs_context_current();
10943 UIO_STACKBUF(uio_buf, 1);
10944
10945 /* Start by copying in fsearchblock parameter list */
10946 if (IS_64BIT_PROCESS(p)) {
10947 error = copyin(uap->searchblock, (caddr_t) &searchblock, sizeof(searchblock));
10948 timelimit.tv_sec = searchblock.timelimit.tv_sec;
10949 timelimit.tv_usec = searchblock.timelimit.tv_usec;
10950 } else {
10951 struct user32_fssearchblock tmp_searchblock;
10952
10953 error = copyin(uap->searchblock, (caddr_t) &tmp_searchblock, sizeof(tmp_searchblock));
10954 // munge into 64-bit version
10955 searchblock.returnattrs = CAST_USER_ADDR_T(tmp_searchblock.returnattrs);
10956 searchblock.returnbuffer = CAST_USER_ADDR_T(tmp_searchblock.returnbuffer);
10957 searchblock.returnbuffersize = tmp_searchblock.returnbuffersize;
10958 searchblock.maxmatches = tmp_searchblock.maxmatches;
10959 /*
10960 * These casts are safe. We will promote the tv_sec into a 64 bit long if necessary
10961 * from a 32 bit long, and tv_usec is already a signed 32 bit int.
10962 */
10963 timelimit.tv_sec = (__darwin_time_t) tmp_searchblock.timelimit.tv_sec;
10964 timelimit.tv_usec = (__darwin_useconds_t) tmp_searchblock.timelimit.tv_usec;
10965 searchblock.searchparams1 = CAST_USER_ADDR_T(tmp_searchblock.searchparams1);
10966 searchblock.sizeofsearchparams1 = tmp_searchblock.sizeofsearchparams1;
10967 searchblock.searchparams2 = CAST_USER_ADDR_T(tmp_searchblock.searchparams2);
10968 searchblock.sizeofsearchparams2 = tmp_searchblock.sizeofsearchparams2;
10969 searchblock.searchattrs = tmp_searchblock.searchattrs;
10970 }
10971 if (error) {
10972 return error;
10973 }
10974
10975 /* Do a sanity check on sizeofsearchparams1 and sizeofsearchparams2.
10976 */
10977 if (searchblock.sizeofsearchparams1 > SEARCHFS_MAX_SEARCHPARMS ||
10978 searchblock.sizeofsearchparams2 > SEARCHFS_MAX_SEARCHPARMS) {
10979 return EINVAL;
10980 }
10981
10982 /* Now malloc a big bunch of space to hold the search parameters, the attrlists and the search state. */
10983 /* It all has to do into local memory and it's not that big so we might as well put it all together. */
10984 /* Searchparams1 shall be first so we might as well use that to hold the base address of the allocated*/
10985 /* block. */
10986 /* */
10987 /* NOTE: we allocate an extra 8 bytes to account for the difference in size of the searchstate */
10988 /* due to the changes in rdar://problem/12438273. That way if a 3rd party file system */
10989 /* assumes the size is still 556 bytes it will continue to work */
10990
10991 mallocsize = searchblock.sizeofsearchparams1 + searchblock.sizeofsearchparams2 +
10992 sizeof(struct attrlist) + sizeof(struct searchstate) + (2 * sizeof(uint32_t));
10993
10994 searchparams1 = kalloc_data(mallocsize, Z_WAITOK);
10995
10996 /* Now set up the various pointers to the correct place in our newly allocated memory */
10997
10998 searchparams2 = (void *) (((caddr_t) searchparams1) + searchblock.sizeofsearchparams1);
10999 returnattrs = (struct attrlist *) (((caddr_t) searchparams2) + searchblock.sizeofsearchparams2);
11000 state = (struct searchstate *) (((caddr_t) returnattrs) + sizeof(struct attrlist));
11001
11002 /* Now copy in the stuff given our local variables. */
11003
11004 if ((error = copyin(searchblock.searchparams1, searchparams1, searchblock.sizeofsearchparams1))) {
11005 goto freeandexit;
11006 }
11007
11008 if ((error = copyin(searchblock.searchparams2, searchparams2, searchblock.sizeofsearchparams2))) {
11009 goto freeandexit;
11010 }
11011
11012 if ((error = copyin(searchblock.returnattrs, (caddr_t) returnattrs, sizeof(struct attrlist)))) {
11013 goto freeandexit;
11014 }
11015
11016 if ((error = copyin(uap->state, (caddr_t) state, sizeof(struct searchstate)))) {
11017 goto freeandexit;
11018 }
11019
11020 /*
11021 * When searching a union mount, need to set the
11022 * start flag at the first call on each layer to
11023 * reset state for the new volume.
11024 */
11025 if (uap->options & SRCHFS_START) {
11026 state->ss_union_layer = 0;
11027 } else {
11028 uap->options |= state->ss_union_flags;
11029 }
11030 state->ss_union_flags = 0;
11031
11032 /*
11033 * Because searchparams1 and searchparams2 may contain an ATTR_CMN_NAME search parameter,
11034 * which is passed in with an attrreference_t, we need to inspect the buffer manually here.
11035 * The KPI does not provide us the ability to pass in the length of the buffers searchparams1
11036 * and searchparams2. To obviate the need for all searchfs-supporting filesystems to
11037 * validate the user-supplied data offset of the attrreference_t, we'll do it here.
11038 */
11039
11040 if (searchblock.searchattrs.commonattr & ATTR_CMN_NAME) {
11041 attrreference_t* string_ref;
11042 u_int32_t* start_length;
11043 user64_size_t param_length;
11044
11045 /* validate searchparams1 */
11046 param_length = searchblock.sizeofsearchparams1;
11047 /* skip the word that specifies length of the buffer */
11048 start_length = (u_int32_t*) searchparams1;
11049 start_length = start_length + 1;
11050 string_ref = (attrreference_t*) start_length;
11051
11052 /* ensure no negative offsets or too big offsets */
11053 if (string_ref->attr_dataoffset < 0) {
11054 error = EINVAL;
11055 goto freeandexit;
11056 }
11057 if (string_ref->attr_length > MAXPATHLEN) {
11058 error = EINVAL;
11059 goto freeandexit;
11060 }
11061
11062 /* Check for pointer overflow in the string ref */
11063 if (((char*) string_ref + string_ref->attr_dataoffset) < (char*) string_ref) {
11064 error = EINVAL;
11065 goto freeandexit;
11066 }
11067
11068 if (((char*) string_ref + string_ref->attr_dataoffset) > ((char*)searchparams1 + param_length)) {
11069 error = EINVAL;
11070 goto freeandexit;
11071 }
11072 if (((char*)string_ref + string_ref->attr_dataoffset + string_ref->attr_length) > ((char*)searchparams1 + param_length)) {
11073 error = EINVAL;
11074 goto freeandexit;
11075 }
11076 }
11077
11078 /* set up the uio structure which will contain the users return buffer */
11079 auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
11080 uio_addiov(auio, searchblock.returnbuffer, searchblock.returnbuffersize);
11081
11082 nameiflags = 0;
11083 if ((uap->options & FSOPT_NOFOLLOW) == 0) {
11084 nameiflags |= FOLLOW;
11085 }
11086 NDINIT(&nd, LOOKUP, OP_SEARCHFS, nameiflags | AUDITVNPATH1,
11087 UIO_USERSPACE, uap->path, ctx);
11088
11089 error = namei(&nd);
11090 if (error) {
11091 goto freeandexit;
11092 }
11093 vp = nd.ni_vp;
11094 nameidone(&nd);
11095
11096 /*
11097 * Switch to the root vnode for the volume
11098 */
11099 error = VFS_ROOT(vnode_mount(vp), &tvp, ctx);
11100 vnode_put(vp);
11101 if (error) {
11102 goto freeandexit;
11103 }
11104 vp = tvp;
11105
11106 #if CONFIG_UNION_MOUNTS
11107 /*
11108 * If it's a union mount, the path lookup takes
11109 * us to the top layer. But we may need to descend
11110 * to a lower layer. For non-union mounts the layer
11111 * is always zero.
11112 */
11113 for (i = 0; i < (int) state->ss_union_layer; i++) {
11114 if ((vp->v_mount->mnt_flag & MNT_UNION) == 0) {
11115 break;
11116 }
11117 tvp = vp;
11118 vp = vp->v_mount->mnt_vnodecovered;
11119 if (vp == NULL) {
11120 vnode_put(tvp);
11121 error = ENOENT;
11122 goto freeandexit;
11123 }
11124 error = vnode_getwithref(vp);
11125 vnode_put(tvp);
11126 if (error) {
11127 goto freeandexit;
11128 }
11129 }
11130 #endif /* CONFIG_UNION_MOUNTS */
11131
11132 #if CONFIG_MACF
11133 error = mac_vnode_check_searchfs(ctx, vp, returnattrs, &searchblock.searchattrs);
11134 if (error) {
11135 vnode_put(vp);
11136 goto freeandexit;
11137 }
11138 #endif
11139
11140
11141 /*
11142 * If searchblock.maxmatches == 0, then skip the search. This has happened
	 * before and sometimes the underlying code doesn't deal with it well.
11144 */
11145 if (searchblock.maxmatches == 0) {
11146 nummatches = 0;
11147 goto saveandexit;
11148 }
11149
11150 /*
11151 * Allright, we have everything we need, so lets make that call.
11152 *
11153 * We keep special track of the return value from the file system:
11154 * EAGAIN is an acceptable error condition that shouldn't keep us
11155 * from copying out any results...
11156 */
11157
11158 fserror = VNOP_SEARCHFS(vp,
11159 searchparams1,
11160 searchparams2,
11161 &searchblock.searchattrs,
11162 (uint32_t)searchblock.maxmatches,
11163 &timelimit,
11164 returnattrs,
11165 &nummatches,
11166 (uint32_t)uap->scriptcode,
11167 (uint32_t)uap->options,
11168 auio,
11169 (struct searchstate *) &state->ss_fsstate,
11170 ctx);
11171
11172 #if CONFIG_UNION_MOUNTS
11173 /*
11174 * If it's a union mount we need to be called again
11175 * to search the mounted-on filesystem.
11176 */
11177 if ((vp->v_mount->mnt_flag & MNT_UNION) && fserror == 0) {
11178 state->ss_union_flags = SRCHFS_START;
11179 state->ss_union_layer++; // search next layer down
11180 fserror = EAGAIN;
11181 }
11182 #endif /* CONFIG_UNION_MOUNTS */
11183
11184 saveandexit:
11185
11186 vnode_put(vp);
11187
11188 /* Now copy out the stuff that needs copying out. That means the number of matches, the
	 * search state. Everything was already put into the return buffer by the vop call. */
11190
11191 if ((error = copyout((caddr_t) state, uap->state, sizeof(struct searchstate))) != 0) {
11192 goto freeandexit;
11193 }
11194
11195 if ((error = suulong(uap->nummatches, (uint64_t)nummatches)) != 0) {
11196 goto freeandexit;
11197 }
11198
11199 error = fserror;
11200
11201 freeandexit:
11202
11203 kfree_data(searchparams1, mallocsize);
11204
11205 return error;
11206 } /* end of searchfs system call */
11207
11208 #else /* CONFIG_SEARCHFS */
11209
/*
 * searchfs stub for kernels built without CONFIG_SEARCHFS:
 * the system call always fails with ENOTSUP.
 */
int
searchfs(__unused proc_t p, __unused struct searchfs_args *uap, __unused int32_t *retval)
{
	return ENOTSUP;
}
11215
11216 #endif /* CONFIG_SEARCHFS */
11217
11218
11219 #if CONFIG_DATALESS_FILES
11220
11221 /*
11222 * === Namespace Resolver Up-call Mechanism ===
11223 *
11224 * When I/O is performed to a dataless file or directory (read, write,
11225 * lookup-in, etc.), the file system performs an upcall to the namespace
11226 * resolver (filecoordinationd) to materialize the object.
11227 *
11228 * We need multiple up-calls to be in flight at once, and we need these
11229 * up-calls to be interruptible, thus the following implementation:
11230 *
11231 * => The nspace_resolver_request represents the in-kernel request state.
11232 * It contains a request ID, storage space for the errno code returned
11233 * by filecoordinationd, and flags.
11234 *
11235 * => The request ID is simply a global monotonically incrementing 32-bit
11236 * number. Outstanding requests are stored in a hash table, and the
11237 * hash function is extremely simple.
11238 *
11239 * => When an upcall is to be made to filecoordinationd, a request structure
11240 * is allocated on the stack (it is small, and needs to live only during
11241 * the duration of the call to resolve_nspace_item_ext()). It is
11242 * initialized and inserted into the table. Some backpressure from
 *    filecoordinationd is applied by limiting the number of entries that
11244 * can be inserted into the table (and thus limiting the number of
11245 * outstanding requests issued to filecoordinationd); waiting for an
11246 * available slot is interruptible.
11247 *
11248 * => Once the request has been inserted into the table, the up-call is made
11249 * to filecoordinationd via a MiG-generated stub. The up-call returns
11250 * immediately and filecoordinationd processes the request asynchronously.
11251 *
 * => The caller now waits for the request to complete.  This is achieved by
11253 * sleeping on the address of the request structure and waiting for
11254 * filecoordinationd to mark the request structure as complete. This
11255 * is an interruptible sleep call; if interrupted, the request structure
11256 * is removed from the table and EINTR is returned to the caller. If
11257 * this occurs, an advisory up-call is made to filecoordinationd with
11258 * the request ID to indicate that the request can be aborted or
11259 * de-prioritized at the discretion of filecoordinationd.
11260 *
11261 * => When filecoordinationd has completed the request, it signals completion
11262 * by writing to the vfs.nspace.complete sysctl node. Only a process
11263 * decorated as a namespace resolver can write to this sysctl node. The
11264 * value is a request ID / errno tuple passed as an array of 2 uint32_t's.
11265 * The request ID is looked up in the table, and if the request is found,
11266 * the error code is stored in the request structure and a wakeup()
11267 * issued on the address of the request structure. If the request is not
11268 * found, we simply drop the completion notification, assuming that the
11269 * caller was interrupted.
11270 *
11271 * => When the waiting thread wakes up, it extracts the error code from the
11272 * request structure, removes the request from the table, and returns the
11273 * error code to the calling function. Fini!
11274 */
11275
/*
 * In-kernel state for one materialization up-call to the namespace
 * resolver.  Lives on the requesting thread's stack (see the block
 * comment above) and is linked into the request hash table while the
 * up-call is outstanding.
 */
struct nspace_resolver_request {
	LIST_ENTRY(nspace_resolver_request) r_hashlink; /* request-table linkage */
	vnode_t r_vp;           /* target vnode; used for gencount validation */
	vnode_t r_tdvp;         /* destination dir, if any; used for syncroot validation */
	uint32_t r_req_id;      /* ID the resolver echoes back on completion */
	int r_resolver_error;   /* errno reported by the resolver */
	int r_flags;            /* RRF_* flags below */
};

#define RRF_COMPLETE 0x0001   /* request has been completed */
#define RRF_COMPLETING 0x0002 /* completion handler is still using this request */

/*
 * Completion tuple delivered by the resolver via the vfs.nspace.complete
 * sysctl.  orig_gencount / orig_syncroot are optional namespace-shape
 * criteria; 0 means "not specified".
 */
struct nspace_resolver_completion_data {
	uint32_t req_id;        /* ID of the request being completed */
	int32_t resolver_error; /* errno result from the resolver */
	uint64_t orig_gencount; /* expected recursive gencount of r_vp */
	uint64_t orig_syncroot; /* expected sync-root ID for r_tdvp */
};
11294
/*
 * Return the next namespace-resolver request ID by atomically
 * incrementing a global 32-bit counter.  IDs are used only as keys
 * into the outstanding-request hash table.
 */
static uint32_t
next_nspace_req_id(void)
{
	static uint32_t next_req_id;

	return OSAddAtomic(1, &next_req_id);
}
11302
#define NSPACE_RESOLVER_REQ_HASHSIZE 32 /* XXX tune */
#define NSPACE_RESOLVER_MAX_OUTSTANDING 256 /* XXX tune */

/* Hash table of outstanding resolver requests, keyed by request ID. */
static LIST_HEAD(nspace_resolver_requesthead,
    nspace_resolver_request) * nspace_resolver_request_hashtbl;
/* Mask returned by hashinit(); table size is hashmask + 1. */
static u_long nspace_resolver_request_hashmask;
/* Number of requests currently in the table (backpressure limit). */
static u_int nspace_resolver_request_count;
/* True when some thread is sleeping for a free table slot. */
static bool nspace_resolver_request_wait_slot;
static LCK_GRP_DECLARE(nspace_resolver_request_lck_grp, "file namespace resolver");
/* Single mutex guarding the table, counts, and request flags. */
static LCK_MTX_DECLARE(nspace_resolver_request_hash_mutex,
    &nspace_resolver_request_lck_grp);

#define NSPACE_REQ_LOCK() \
	lck_mtx_lock(&nspace_resolver_request_hash_mutex)
#define NSPACE_REQ_UNLOCK() \
	lck_mtx_unlock(&nspace_resolver_request_hash_mutex)

/* Bucket selection: low bits of the request ID. */
#define NSPACE_RESOLVER_HASH(req_id) \
	(&nspace_resolver_request_hashtbl[(req_id) & \
	nspace_resolver_request_hashmask])
11323
11324 static struct nspace_resolver_request *
nspace_resolver_req_lookup(uint32_t req_id,bool skip_completing)11325 nspace_resolver_req_lookup(uint32_t req_id, bool skip_completing)
11326 {
11327 struct nspace_resolver_requesthead *bucket;
11328 struct nspace_resolver_request *req;
11329
11330 bucket = NSPACE_RESOLVER_HASH(req_id);
11331 LIST_FOREACH(req, bucket, r_hashlink) {
11332 if (req->r_req_id == req_id) {
11333 /*
11334 * If this request already has a completion
11335 * pending, don't return it again.
11336 */
11337 if ((req->r_flags & RRF_COMPLETING) != 0 &&
11338 skip_completing) {
11339 req = NULL;
11340 }
11341 return req;
11342 }
11343 }
11344
11345 return NULL;
11346 }
11347
/*
 * Insert 'req' into the global request table, applying backpressure:
 * if NSPACE_RESOLVER_MAX_OUTSTANDING requests are already outstanding,
 * sleep (interruptibly) until a slot frees up.
 *
 * Returns 0 on success, or the msleep() error if the wait was
 * interrupted (in which case the request was NOT inserted).
 */
static int
nspace_resolver_req_add(struct nspace_resolver_request *req)
{
	struct nspace_resolver_requesthead *bucket;
	int error;

	NSPACE_REQ_LOCK();

	while (nspace_resolver_request_count >=
	    NSPACE_RESOLVER_MAX_OUTSTANDING) {
		/* Ask removers to wakeup() on the count when a slot frees. */
		nspace_resolver_request_wait_slot = true;
		error = msleep(&nspace_resolver_request_count,
		    &nspace_resolver_request_hash_mutex,
		    PVFS | PCATCH, "nspacerq", NULL);
		if (error) {
			NSPACE_REQ_UNLOCK();
			return error;
		}
	}

	bucket = NSPACE_RESOLVER_HASH(req->r_req_id);
#if DIAGNOSTIC
	assert(nspace_resolver_req_lookup(req->r_req_id, false) == NULL);
#endif /* DIAGNOSTIC */
	LIST_INSERT_HEAD(bucket, req, r_hashlink);
	nspace_resolver_request_count++;

	NSPACE_REQ_UNLOCK();

	return 0;
}
11379
/*
 * Wait until any in-progress completion of 'req' has finished.
 * Called with the NSPACE_REQ_LOCK held; msleep() drops and re-acquires
 * it, so RRF_COMPLETING is re-checked each time around the loop.
 */
static void
nspace_resolver_req_wait_pending_completion(struct nspace_resolver_request *req)
{
	/*
	 * If a completion is in-progress, we have to wait for the
	 * completion handler to finish because it's still using 'req',
	 * which is allocated on our stack a couple of frames up.
	 */
	while ((req->r_flags & RRF_COMPLETING) != 0) {
		(void) msleep(req, &nspace_resolver_request_hash_mutex,
		    PVFS, "nspacecmplt", NULL);
	}
}
11393
11394 static void
nspace_resolver_req_remove_and_unlock(struct nspace_resolver_request * req)11395 nspace_resolver_req_remove_and_unlock(struct nspace_resolver_request *req)
11396 {
11397 struct nspace_resolver_requesthead *bucket;
11398
11399 /* We're called with NSPACE_REQ_LOCK held. */
11400
11401 bucket = NSPACE_RESOLVER_HASH(req->r_req_id);
11402 #if DIAGNOSTIC
11403 assert((req->r_flags & RRF_COMPLETING) == 0);
11404 assert(nspace_resolver_req_lookup(req->r_req_id, false) != NULL);
11405 #endif /* DIAGNOSTIC */
11406 LIST_REMOVE(req, r_hashlink);
11407 nspace_resolver_request_count--;
11408
11409 if (nspace_resolver_request_wait_slot) {
11410 nspace_resolver_request_wait_slot = false;
11411 wakeup(&nspace_resolver_request_count);
11412 }
11413
11414 nspace_resolver_req_wait_pending_completion(req);
11415
11416 NSPACE_REQ_UNLOCK();
11417 }
11418
/*
 * Convenience wrapper: take the table lock and remove 'req'
 * (nspace_resolver_req_remove_and_unlock() drops the lock).
 */
static void
nspace_resolver_req_remove(struct nspace_resolver_request *req)
{
	NSPACE_REQ_LOCK();
	nspace_resolver_req_remove_and_unlock(req);
}
11425
11426 static void
nspace_resolver_req_cancel(uint32_t req_id)11427 nspace_resolver_req_cancel(uint32_t req_id)
11428 {
11429 kern_return_t kr;
11430 mach_port_t mp;
11431
11432 // Failures here aren't fatal -- the cancellation message
11433 // sent to the resolver is merely advisory.
11434
11435 kr = host_get_filecoordinationd_port(host_priv_self(), &mp);
11436 if (kr != KERN_SUCCESS || !IPC_PORT_VALID(mp)) {
11437 return;
11438 }
11439
11440 kr = send_nspace_resolve_cancel(mp, req_id);
11441 if (kr != KERN_SUCCESS) {
11442 os_log_error(OS_LOG_DEFAULT,
11443 "NSPACE send_nspace_resolve_cancel failure: %d", kr);
11444 }
11445
11446 ipc_port_release_send(mp);
11447 }
11448
/*
 * Wait (interruptibly) for 'req' to be marked complete by the
 * resolver.  If the sleep is interrupted, the request is failed
 * locally with EINTR (or ETIMEDOUT for other errors) and an advisory
 * cancel message is sent to the resolver after the request has been
 * removed from the table.
 *
 * Returns the request's resolver error code (0 on success).
 */
static int
nspace_resolver_req_wait(struct nspace_resolver_request *req)
{
	bool send_cancel_message = false;
	int error;

	NSPACE_REQ_LOCK();

	while ((req->r_flags & RRF_COMPLETE) == 0) {
		error = msleep(req, &nspace_resolver_request_hash_mutex,
		    PVFS | PCATCH, "nspace", NULL);
		if (error && error != ERESTART) {
			/* Interrupted: record a local error for the caller. */
			req->r_resolver_error = (error == EINTR) ? EINTR :
			    ETIMEDOUT;
			send_cancel_message = true;
			break;
		}
	}

	nspace_resolver_req_remove_and_unlock(req);

	/*
	 * It's safe to continue referencing 'req' here because it's
	 * allocated on our caller's stack.
	 */

	if (send_cancel_message) {
		nspace_resolver_req_cancel(req->r_req_id);
	}

	return req->r_resolver_error;
}
11481
/*
 * Mark 'req' complete with the given resolver error and wake its
 * waiter.  Also clears RRF_COMPLETING, so a thread blocked in
 * nspace_resolver_req_wait_pending_completion() can proceed.
 * Called with the NSPACE_REQ_LOCK held.
 */
static void
nspace_resolver_req_mark_complete(
	struct nspace_resolver_request *req,
	int resolver_error)
{
	req->r_resolver_error = resolver_error;
	req->r_flags = (req->r_flags & ~RRF_COMPLETING) | RRF_COMPLETE;
	wakeup(req);
}
11491
/*
 * Mark 'req' as having a completion in progress so it cannot be
 * completed a second time while the NSPACE_REQ_LOCK is dropped.
 * Called with the NSPACE_REQ_LOCK held.
 */
static void
nspace_resolver_req_mark_completion_pending(struct nspace_resolver_request *req)
{
	req->r_flags |= RRF_COMPLETING;
}
11497
11498 static void
nspace_resolver_req_completed(const struct nspace_resolver_completion_data * c)11499 nspace_resolver_req_completed(const struct nspace_resolver_completion_data *c)
11500 {
11501 struct nspace_resolver_request *req;
11502 int error;
11503 struct vnode_attr va;
11504 vnode_t vp;
11505
11506 NSPACE_REQ_LOCK();
11507
11508 req = nspace_resolver_req_lookup(c->req_id, true);
11509 if (req == NULL) {
11510 /*
11511 * If we don't find the request corresponding to our req_id,
11512 * just drop the completion on the floor; it's likely that
11513 * the requester interrupted with a signal, or it may already
11514 * be completing.
11515 */
11516 NSPACE_REQ_UNLOCK();
11517 return;
11518 }
11519
11520 /*
11521 * Get out now if the resolver reported an error.
11522 */
11523 if ((error = c->resolver_error) != 0) {
11524 goto out;
11525 }
11526
11527 /*
11528 * If the resolver did not specify any namespace shape criteria
11529 * for letting the operation proceed, then get out now.
11530 */
11531 if (c->orig_gencount == 0 && c->orig_syncroot == 0) {
11532 goto out;
11533 }
11534
11535 /*
11536 * We're going to have to acquire the mount rename lock and do
11537 * some I/O in order to verify the criteria. Mark the request
11538 * as pending so no one else messes with it after we drop the
11539 * NSPACE_REQ_LOCK.
11540 */
11541 nspace_resolver_req_mark_completion_pending(req);
11542 NSPACE_REQ_UNLOCK();
11543
11544 /*
11545 * Lock out renames from changing the shape of the tree while
11546 * validate the criteria.
11547 */
11548 mount_t locked_mp = req->r_vp->v_mount;
11549 mount_ref(locked_mp, 0);
11550 mount_lock_renames(locked_mp);
11551
11552 if (c->orig_gencount != 0) {
11553 vp = req->r_vp;
11554 if (error) {
11555 goto out_dropmount;
11556 }
11557
11558 VATTR_INIT(&va);
11559 VATTR_WANTED(&va, va_recursive_gencount);
11560 error = vnode_getattr(vp, &va, vfs_context_kernel());
11561 if (error) {
11562 goto out_dropmount;
11563 }
11564 if (VATTR_NOT_RETURNED(&va, va_recursive_gencount) ||
11565 va.va_recursive_gencount != c->orig_gencount) {
11566 printf("nspace.complete: gencount changed! (orig %llu cur %llu)\n",
11567 c->orig_gencount, va.va_recursive_gencount);
11568 error = EBUSY;
11569 goto out_dropmount;
11570 }
11571 }
11572
11573 /*
11574 * Ignore orig_syncroot if a destination directory wasn't specified
11575 * in the request.
11576 */
11577 if (c->orig_syncroot != 0 && (vp = req->r_tdvp) != NULL) {
11578 uint64_t syncroot_id;
11579
11580 if (error) {
11581 goto out_dropmount;
11582 }
11583
11584 #ifndef APFSIOC_GET_SYNC_ROOT
11585 #define APFSIOC_GET_SYNC_ROOT _IOR('J', 115, uint64_t)
11586 #endif
11587
11588 error = VNOP_IOCTL(vp, APFSIOC_GET_SYNC_ROOT,
11589 (caddr_t)&syncroot_id, 0, vfs_context_kernel());
11590 if (error) {
11591 goto out_dropmount;
11592 }
11593 if (syncroot_id != c->orig_syncroot) {
11594 printf("nspace.complete: syncroot changed! (orig %llu cur %llu)\n",
11595 c->orig_syncroot, syncroot_id);
11596 error = EBUSY;
11597 goto out_dropmount;
11598 }
11599 }
11600
11601 out_dropmount:
11602 mount_unlock_renames(locked_mp);
11603 mount_drop(locked_mp, 0);
11604 NSPACE_REQ_LOCK();
11605
11606 out:
11607 nspace_resolver_req_mark_complete(req, error);
11608 NSPACE_REQ_UNLOCK();
11609 }
11610
/* The process currently decorated as the namespace resolver, if any. */
static struct proc *nspace_resolver_proc;
11612
11613 static int
nspace_resolver_get_proc_state(struct proc * p,int * is_resolver)11614 nspace_resolver_get_proc_state(struct proc *p, int *is_resolver)
11615 {
11616 *is_resolver = ((p->p_lflag & P_LNSPACE_RESOLVER) &&
11617 p == nspace_resolver_proc) ? 1 : 0;
11618 return 0;
11619 }
11620
/* Defined later in this file; needed by nspace_resolver_set_proc_state(). */
static boolean_t vfs_context_is_dataless_resolver(vfs_context_t);

/*
 * Decorate (is_resolver != 0) or un-decorate the calling process as
 * the namespace resolver.  The caller must be root AND hold the
 * dataless-resolver entitlement; otherwise EPERM.  Only one resolver
 * may be registered at a time (EBUSY if another is registered).
 */
static int
nspace_resolver_set_proc_state(struct proc *p, int is_resolver)
{
	vfs_context_t ctx = vfs_context_current();
	int error = 0;

	//
	// The system filecoordinationd runs as uid == 0. This also
	// has the nice side-effect of filtering out filecoordinationd
	// running in the simulator.
	//
	if (!vfs_context_issuser(ctx) ||
	    !vfs_context_is_dataless_resolver(ctx)) {
		return EPERM;
	}

	if (is_resolver) {
		NSPACE_REQ_LOCK();

		if (nspace_resolver_proc == NULL) {
			proc_lock(p);
			p->p_lflag |= P_LNSPACE_RESOLVER;
			proc_unlock(p);
			nspace_resolver_proc = p;
		} else {
			error = EBUSY;
		}

		NSPACE_REQ_UNLOCK();
	} else {
		// This is basically just like the exit case.
		// nspace_resolver_exited() will verify that the
		// process is the resolver, and will clear the
		// global.
		nspace_resolver_exited(p);
	}

	return error;
}
11662
11663 static int
nspace_materialization_get_proc_state(struct proc * p,int * is_prevented)11664 nspace_materialization_get_proc_state(struct proc *p, int *is_prevented)
11665 {
11666 if ((p->p_lflag & P_LNSPACE_RESOLVER) != 0 ||
11667 (p->p_vfs_iopolicy &
11668 P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES) == 0) {
11669 *is_prevented = 1;
11670 } else {
11671 *is_prevented = 0;
11672 }
11673 return 0;
11674 }
11675
/*
 * Enable (is_prevented == 0) or disable the materialize-dataless-files
 * iopolicy bit for 'p'.  The resolver process may never enable
 * materialization; attempting to do so returns EBUSY.
 */
static int
nspace_materialization_set_proc_state(struct proc *p, int is_prevented)
{
	if (p->p_lflag & P_LNSPACE_RESOLVER) {
		return is_prevented ? 0 : EBUSY;
	}

	if (is_prevented) {
		OSBitAndAtomic16(~((uint16_t)P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES), &p->p_vfs_iopolicy);
	} else {
		OSBitOrAtomic16((uint16_t)P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES, &p->p_vfs_iopolicy);
	}
	return 0;
}
11690
11691 static int
nspace_materialization_get_thread_state(int * is_prevented)11692 nspace_materialization_get_thread_state(int *is_prevented)
11693 {
11694 uthread_t ut = current_uthread();
11695
11696 *is_prevented = (ut->uu_flag & UT_NSPACE_NODATALESSFAULTS) ? 1 : 0;
11697 return 0;
11698 }
11699
/*
 * Set or clear the current thread's opt-out from dataless-file
 * materialization (UT_NSPACE_NODATALESSFAULTS).  Always succeeds.
 */
static int
nspace_materialization_set_thread_state(int is_prevented)
{
	uthread_t ut = current_uthread();

	if (is_prevented) {
		ut->uu_flag |= UT_NSPACE_NODATALESSFAULTS;
	} else {
		ut->uu_flag &= ~UT_NSPACE_NODATALESSFAULTS;
	}
	return 0;
}
11712
/* the vfs.nspace sysctl branch: namespace-resolver control knobs */
SYSCTL_NODE(_vfs, OID_AUTO, nspace, CTLFLAG_RW | CTLFLAG_LOCKED, NULL, "vfs nspace hinge");
11715
11716 static int
sysctl_nspace_resolver(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)11717 sysctl_nspace_resolver(__unused struct sysctl_oid *oidp,
11718 __unused void *arg1, __unused int arg2, struct sysctl_req *req)
11719 {
11720 struct proc *p = req->p;
11721 int new_value, old_value, changed = 0;
11722 int error;
11723
11724 error = nspace_resolver_get_proc_state(p, &old_value);
11725 if (error) {
11726 return error;
11727 }
11728
11729 error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
11730 &changed);
11731 if (error == 0 && changed) {
11732 error = nspace_resolver_set_proc_state(p, new_value);
11733 }
11734 return error;
11735 }
11736
/* vfs.nspace.resolver: decorate this process as the dataless file resolver */
SYSCTL_PROC(_vfs_nspace, OID_AUTO, resolver,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
    0, 0, sysctl_nspace_resolver, "I", "");
11741
11742 static int
sysctl_nspace_prevent_materialization(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)11743 sysctl_nspace_prevent_materialization(__unused struct sysctl_oid *oidp,
11744 __unused void *arg1, __unused int arg2, struct sysctl_req *req)
11745 {
11746 struct proc *p = req->p;
11747 int new_value, old_value, changed = 0;
11748 int error;
11749
11750 error = nspace_materialization_get_proc_state(p, &old_value);
11751 if (error) {
11752 return error;
11753 }
11754
11755 error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
11756 &changed);
11757 if (error == 0 && changed) {
11758 error = nspace_materialization_set_proc_state(p, new_value);
11759 }
11760 return error;
11761 }
11762
/* vfs.nspace.prevent_materialization: decorate this process as not wanting to materialize dataless files */
SYSCTL_PROC(_vfs_nspace, OID_AUTO, prevent_materialization,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
    0, 0, sysctl_nspace_prevent_materialization, "I", "");
11767
11768 static int
sysctl_nspace_thread_prevent_materialization(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)11769 sysctl_nspace_thread_prevent_materialization(__unused struct sysctl_oid *oidp,
11770 __unused void *arg1, __unused int arg2, struct sysctl_req *req)
11771 {
11772 int new_value, old_value, changed = 0;
11773 int error;
11774
11775 error = nspace_materialization_get_thread_state(&old_value);
11776 if (error) {
11777 return error;
11778 }
11779
11780 error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
11781 &changed);
11782 if (error == 0 && changed) {
11783 error = nspace_materialization_set_thread_state(new_value);
11784 }
11785 return error;
11786 }
11787
/* vfs.nspace.thread_prevent_materialization: decorate this thread as not wanting to materialize dataless files */
SYSCTL_PROC(_vfs_nspace, OID_AUTO, thread_prevent_materialization,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
    0, 0, sysctl_nspace_thread_prevent_materialization, "I", "");
11792
/*
 * vfs.nspace.complete handler: the resolver reports a completed
 * request here.  The written payload is a (req_id, errno) pair of
 * uint32_t's, optionally followed by a 64-bit original gencount and a
 * 64-bit original sync-root ID (read failures for the optional values
 * are ignored, and the values default to 0 / "not specified").
 * Only the decorated resolver process may write to this node (EPERM
 * otherwise).
 */
static int
sysctl_nspace_complete(__unused struct sysctl_oid *oidp, __unused void *arg1,
    __unused int arg2, struct sysctl_req *req)
{
	struct proc *p = req->p;
	uint32_t req_status[2] = { 0, 0 };
	uint64_t gencount = 0;
	uint64_t syncroot = 0;
	int error, is_resolver, changed = 0, other_changed;

	error = nspace_resolver_get_proc_state(p, &is_resolver);
	if (error) {
		return error;
	}

	if (!is_resolver) {
		return EPERM;
	}

	error = sysctl_io_opaque(req, req_status, sizeof(req_status),
	    &changed);
	if (error) {
		return error;
	}

	/*
	 * Get the gencount if it was passed. Ignore errors, because
	 * it's optional.
	 */
	error = sysctl_io_opaque(req, &gencount, sizeof(gencount),
	    &other_changed);
	if (error) {
		gencount = 0;
		error = 0;
	}

	/*
	 * ...and now the syncroot ID.
	 */
	error = sysctl_io_opaque(req, &syncroot, sizeof(syncroot),
	    &other_changed);
	if (error) {
		syncroot = 0;
		error = 0;
	}

	/*
	 * req_status[0] is the req_id
	 *
	 * req_status[1] is the errno
	 */
	if (error == 0 && changed) {
		const struct nspace_resolver_completion_data cd = {
			.req_id = req_status[0],
			.resolver_error = req_status[1],
			.orig_gencount = gencount,
			.orig_syncroot = syncroot,
		};
		nspace_resolver_req_completed(&cd);
	}
	return error;
}
11855
/* vfs.nspace.complete: the resolver reports completed requests here. */
SYSCTL_PROC(_vfs_nspace, OID_AUTO, complete,
    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
    0, 0, sysctl_nspace_complete, "-", "");
11860
11861 #endif /* CONFIG_DATALESS_FILES */
11862
/*
 * Marks parameters that are referenced only when CONFIG_DATALESS_FILES
 * is enabled, so non-dataless builds compile without unused-parameter
 * warnings.
 */
#if CONFIG_DATALESS_FILES
#define __no_dataless_unused /* nothing */
#else
#define __no_dataless_unused __unused
#endif
11868
11869 int
vfs_context_dataless_materialization_is_prevented(vfs_context_t const ctx __no_dataless_unused)11870 vfs_context_dataless_materialization_is_prevented(
11871 vfs_context_t const ctx __no_dataless_unused)
11872 {
11873 #if CONFIG_DATALESS_FILES
11874 proc_t const p = vfs_context_proc(ctx);
11875 thread_t const t = vfs_context_thread(ctx);
11876 uthread_t const ut = t ? get_bsdthread_info(t) : NULL;
11877
11878 /*
11879 * Kernel context ==> return EDEADLK, as we would with any random
11880 * process decorated as no-materialize.
11881 */
11882 if (ctx == vfs_context_kernel()) {
11883 return EDEADLK;
11884 }
11885
11886 /*
11887 * If the process has the dataless-manipulation entitlement,
11888 * materialization is prevented, and depending on the kind
11889 * of file system operation, things get to proceed as if the
11890 * object is not dataless.
11891 */
11892 if (vfs_context_is_dataless_manipulator(ctx)) {
11893 return EJUSTRETURN;
11894 }
11895
11896 /*
11897 * Per-thread decorations override any process-wide decorations.
11898 * (Foundation uses this, and this overrides even the dataless-
11899 * manipulation entitlement so as to make API contracts consistent.)
11900 */
11901 if (ut != NULL) {
11902 if (ut->uu_flag & UT_NSPACE_NODATALESSFAULTS) {
11903 return EDEADLK;
11904 }
11905 if (ut->uu_flag & UT_NSPACE_FORCEDATALESSFAULTS) {
11906 return 0;
11907 }
11908 }
11909
11910 /*
11911 * If the process's iopolicy specifies that dataless files
11912 * can be materialized, then we let it go ahead.
11913 */
11914 if (p->p_vfs_iopolicy & P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES) {
11915 return 0;
11916 }
11917 #endif /* CONFIG_DATALESS_FILES */
11918
11919 /*
11920 * The default behavior is to not materialize dataless files;
11921 * return to the caller that deadlock was detected.
11922 */
11923 return EDEADLK;
11924 }
11925
/*
 * One-time initialization of the namespace-resolver request hash
 * table.  No-op on kernels built without CONFIG_DATALESS_FILES.
 */
void
nspace_resolver_init(void)
{
#if CONFIG_DATALESS_FILES
	nspace_resolver_request_hashtbl =
	    hashinit(NSPACE_RESOLVER_REQ_HASHSIZE,
	    M_VNODE /* XXX */, &nspace_resolver_request_hashmask);
#endif /* CONFIG_DATALESS_FILES */
}
11935
/*
 * Called when 'p' exits (or voluntarily resigns as resolver).  If 'p'
 * is the decorated resolver, fail every outstanding request with
 * ETIMEDOUT -- no completions will ever arrive -- and clear the
 * resolver decoration.
 */
void
nspace_resolver_exited(struct proc *p __no_dataless_unused)
{
#if CONFIG_DATALESS_FILES
	struct nspace_resolver_requesthead *bucket;
	struct nspace_resolver_request *req;
	u_long idx;

	NSPACE_REQ_LOCK();

	if ((p->p_lflag & P_LNSPACE_RESOLVER) &&
	    p == nspace_resolver_proc) {
		/* Walk every hash bucket and complete each request. */
		for (idx = 0; idx <= nspace_resolver_request_hashmask; idx++) {
			bucket = &nspace_resolver_request_hashtbl[idx];
			LIST_FOREACH(req, bucket, r_hashlink) {
				nspace_resolver_req_wait_pending_completion(req);
				nspace_resolver_req_mark_complete(req,
				    ETIMEDOUT);
			}
		}
		nspace_resolver_proc = NULL;
	}

	NSPACE_REQ_UNLOCK();
#endif /* CONFIG_DATALESS_FILES */
}
11962
11963 #define DATALESS_RESOLVER_ENTITLEMENT \
11964 "com.apple.private.vfs.dataless-resolver"
11965 #define DATALESS_MANIPULATION_ENTITLEMENT \
11966 "com.apple.private.vfs.dataless-manipulation"
11967
#if CONFIG_DATALESS_FILES
/*
 * Return TRUE if the vfs context is associated with the dataless
 * resolver.
 *
 * Resolver status is granted purely by the task holding the
 * dataless-resolver entitlement; no per-process registration is
 * consulted here.
 */
static boolean_t
vfs_context_is_dataless_resolver(vfs_context_t ctx __no_dataless_unused)
{
	return IOTaskHasEntitlement(vfs_context_task(ctx),
	           DATALESS_RESOLVER_ENTITLEMENT);
}
#endif /* CONFIG_DATALESS_FILES */
11980
11981 /*
11982 * Return TRUE if the vfs context is associated with a process entitled
11983 * for dataless manipulation.
11984 *
11985 * XXX Arguably belongs in vfs_subr.c, but is here because of the
11986 * complication around CONFIG_DATALESS_FILES.
11987 */
11988 boolean_t
vfs_context_is_dataless_manipulator(vfs_context_t ctx __no_dataless_unused)11989 vfs_context_is_dataless_manipulator(vfs_context_t ctx __no_dataless_unused)
11990 {
11991 #if CONFIG_DATALESS_FILES
11992 task_t task = vfs_context_task(ctx);
11993 return IOTaskHasEntitlement(task, DATALESS_MANIPULATION_ENTITLEMENT) ||
11994 IOTaskHasEntitlement(task, DATALESS_RESOLVER_ENTITLEMENT);
11995 #else
11996 return false;
11997 #endif /* CONFIG_DATALESS_FILES */
11998 }
11999
#if CONFIG_DATALESS_FILES
/*
 * Emit a debug log recording that a materialization attempt on 'vp'
 * (performing namespace-handler operation 'op') was blocked because the
 * requesting process is decorated as no-materialization.
 *
 * Fixes vs. previous revision:
 * - 'vntype' points at string literals, so it is now const-qualified
 *   (writing through it would be undefined behavior).
 * - On DEVELOPMENT kernels, the vn_getpath() result was ignored; on
 *   failure the pathbuff contents are indeterminate and would have been
 *   logged as garbage.  Now a failed lookup releases the buffer so the
 *   "<unknown-path>" fallback is logged instead.
 */
static void
log_materialization_prevented(vnode_t vp, uint64_t op)
{
	char p_name[MAXCOMLEN + 1];
	const char *vntype;
	proc_selfname(&p_name[0], sizeof(p_name));

	switch (vp->v_type) {
	case VREG:
		vntype = "File";
		break;
	case VDIR:
		vntype = "Dir";
		break;
	case VLNK:
		vntype = "SymLink";
		break;
	default:
		vntype = "Other";
		break;
	}

#if DEVELOPMENT
	char *path = NULL;
	int len;

	path = get_pathbuff();
	len = MAXPATHLEN;
	if (path != NULL && vn_getpath(vp, path, &len) != 0) {
		/* Path unavailable; don't log an indeterminate buffer. */
		release_pathbuff(path);
		path = NULL;
	}

	os_log_debug(OS_LOG_DEFAULT,
	    "NSPACE process %s (pid %d) is decorated as no-materialization (op %lld; %s) path: %s",
	    p_name, proc_selfpid(),
	    op, vntype, path ? path : "<unknown-path>");
	if (path) {
		release_pathbuff(path);
	}
#else
	os_log_debug(OS_LOG_DEFAULT,
	    "NSPACE process %s (pid %d) is decorated as no-materialization (op %lld; %s)",
	    p_name, proc_selfpid(),
	    op, vntype);
#endif
}
#endif /* CONFIG_DATALESS_FILES */
12043
/*
 * vfs_materialize_item: common backend for materializing a dataless
 * file, directory, or reparent operation.
 *
 * Builds a materialization request, hands it to filecoordinationd via
 * MIG, and (interruptibly) waits for the resolver to complete it.
 * Returns 0 when the caller may proceed with the original operation,
 * EJUSTRETURN-derived results are remapped below, and resolver
 * unavailability is reported as ETIMEDOUT.
 */
static int
vfs_materialize_item(
	vnode_t vp __no_dataless_unused,
	uint32_t op __no_dataless_unused,
	int64_t offset __no_dataless_unused,
	int64_t size __no_dataless_unused,
	char *lookup_name __no_dataless_unused,
	size_t const namelen __no_dataless_unused,
	vnode_t tdvp __no_dataless_unused)
{
#if CONFIG_DATALESS_FILES
	kern_return_t kern_ret;
	mach_port_t mach_port;
	char *path = NULL;
	vfs_context_t context;
	int path_len;
	int error;
	audit_token_t atoken;
	enum vtype vp_vtype;

	/* Swap files are special; ignore them */
	if (vnode_isswap(vp)) {
		return 0;
	}

	/*
	 * NAMESPACE_HANDLER_SNAPSHOT_EVENT and NAMESPACE_HANDLER_TRACK_EVENT
	 * are no longer used nor supported.
	 */
	if (op & NAMESPACE_HANDLER_SNAPSHOT_EVENT) {
		os_log_debug(OS_LOG_DEFAULT, "NSPACE SNAPSHOT not handled");
		return ENOTSUP;
	}
	if (op & NAMESPACE_HANDLER_TRACK_EVENT) {
		os_log_debug(OS_LOG_DEFAULT, "NSPACE TRACK not handled");
		return ENOTSUP;
	}

	/* Normalize 'op': strip the event-type bits, keeping only the op. */
	op &= ~NAMESPACE_HANDLER_EVENT_TYPE_MASK;

	/*
	 * To-directory is only meaningful for rename operations;
	 * ignore it if someone handed one to us unexpectedly.
	 */
	if (op != NAMESPACE_HANDLER_RENAME_OP) {
		tdvp = NULL;
	}

	context = vfs_context_current();

	/* Remember this for later (used by the EJUSTRETURN remap below). */
	vp_vtype = vnode_vtype(vp);

	/*
	 * Policy check first: entitled/decorated processes either bypass
	 * materialization (EJUSTRETURN) or are denied (EDEADLK) without
	 * ever contacting the resolver.
	 */
	error = vfs_context_dataless_materialization_is_prevented(context);
	if (error) {
		log_materialization_prevented(vp, op);
		goto out_check_errors;
	}

	kern_ret = host_get_filecoordinationd_port(host_priv_self(),
	    &mach_port);
	if (kern_ret != KERN_SUCCESS || !IPC_PORT_VALID(mach_port)) {
		os_log_error(OS_LOG_DEFAULT, "NSPACE no port");
		/*
		 * Treat this like being unable to access the backing store
		 * server.
		 */
		return ETIMEDOUT;
	}

	/*
	 * Grow the path buffer in MAXPATHLEN increments (up to
	 * FSGETPATH_MAXBUFLEN) until vn_getpath() fits.
	 *
	 * NOTE(review): if vn_getpath() still returns ENOSPC at the
	 * FSGETPATH_MAXBUFLEN cap, the loop exits with path == NULL and
	 * error == ENOSPC, yet execution falls through to the MIG calls
	 * below with a NULL 'path' — confirm whether an explicit
	 * "goto out_release_port" is intended for that exhaustion case.
	 */
	int path_alloc_len = MAXPATHLEN;
	do {
		path = kalloc_data(path_alloc_len, Z_WAITOK | Z_ZERO);
		if (path == NULL) {
			return ENOMEM;
		}

		path_len = path_alloc_len;
		error = vn_getpath(vp, path, &path_len);
		if (error == 0) {
			break;
		} else if (error == ENOSPC) {
			kfree_data(path, path_alloc_len);
			path = NULL;
		} else {
			goto out_release_port;
		}
	} while (error == ENOSPC && (path_alloc_len += MAXPATHLEN) && path_alloc_len <= FSGETPATH_MAXBUFLEN);

	/* Identify the requesting task to the resolver. */
	error = vfs_context_copy_audit_token(context, &atoken);
	if (error) {
		goto out_release_port;
	}

	/* Stack-allocated request; lives in the hash table while pending. */
	struct nspace_resolver_request req = {
		.r_req_id = next_nspace_req_id(),
		.r_vp = vp,
		.r_tdvp = tdvp,
	};

	error = nspace_resolver_req_add(&req);
	if (error) {
		goto out_release_port;
	}

	os_log_debug(OS_LOG_DEFAULT, "NSPACE resolve_path call");

	if (op == NAMESPACE_HANDLER_RENAME_OP && tdvp != NULL) {
		/* Rename/reparent: also send the destination directory path. */
		char *dest_path = NULL;
		int dest_path_len;

		dest_path = zalloc(ZV_NAMEI);
		dest_path_len = MAXPATHLEN;

		error = vn_getpath(tdvp, dest_path, &dest_path_len);
		if (error) {
			zfree(ZV_NAMEI, dest_path);
			goto out_release_port;
		}

		/*
		 * Force setting NAMESPACE_HANDLER_NSPACE_EVENT for
		 * compatibility with existing agents in user-space
		 * who get passed this value.
		 */
		kern_ret = send_vfs_resolve_reparent_with_audit_token(mach_port,
		    req.r_req_id,
		    op | NAMESPACE_HANDLER_NSPACE_EVENT,
		    path, dest_path, atoken);

		zfree(ZV_NAMEI, dest_path);
	} else if (vp_vtype == VDIR) {
		char *tmpname = NULL;

		/*
		 * If the caller provided a lookup_name *and* a name length,
		 * then we assume the lookup_name is not NUL-terminated.
		 * Allocate a temporary buffer in this case to provide
		 * a NUL-terminated path name to the IPC call.
		 */
		if (lookup_name != NULL && namelen != 0) {
			if (namelen >= PATH_MAX) {
				error = EINVAL;
				goto out_req_remove;
			}
			tmpname = zalloc(ZV_NAMEI);
			strlcpy(tmpname, lookup_name, namelen + 1);
			lookup_name = tmpname;
		} else if (lookup_name != NULL) {
			/*
			 * If the caller provided a lookup_name with a
			 * zero name length, then we assume it's NUL-
			 * terminated. Verify it has a valid length.
			 */
			if (strlen(lookup_name) >= PATH_MAX) {
				error = EINVAL;
				goto out_req_remove;
			}
		}

		/* (See above re: NAMESPACE_HANDLER_NSPACE_EVENT.) */
		kern_ret = send_vfs_resolve_dir_with_audit_token(mach_port,
		    req.r_req_id,
		    op | NAMESPACE_HANDLER_NSPACE_EVENT,
		    lookup_name == NULL ? "" : lookup_name, path, atoken);

		if (tmpname != NULL) {
			zfree(ZV_NAMEI, tmpname);

			/*
			 * Poison lookup_name rather than reference
			 * freed memory.
			 */
			lookup_name = NULL;
		}
	} else {
		/* (See above re: NAMESPACE_HANDLER_NSPACE_EVENT.) */
		kern_ret = send_vfs_resolve_file_with_audit_token(mach_port,
		    req.r_req_id,
		    op | NAMESPACE_HANDLER_NSPACE_EVENT,
		    offset, size, path, atoken);
	}
	if (kern_ret != KERN_SUCCESS) {
		/*
		 * Also treat this like being unable to access the backing
		 * store server.
		 */
		os_log_error(OS_LOG_DEFAULT, "NSPACE resolve failure: %d",
		    kern_ret);
		error = ETIMEDOUT;
		goto out_req_remove;
	}

	/*
	 * Give back the memory we allocated earlier while we wait; we
	 * no longer need it.
	 */
	kfree_data(path, path_alloc_len);
	path = NULL;

	/*
	 * Request has been submitted to the resolver. Now (interruptibly)
	 * wait for completion. Upon return, the request will have been
	 * removed from the lookup table.
	 */
	error = nspace_resolver_req_wait(&req);

out_release_port:
	if (path != NULL) {
		kfree_data(path, path_alloc_len);
		path = NULL;
	}
	ipc_port_release_send(mach_port);

out_check_errors:
	/*
	 * The file resolver owns the logic about what error to return
	 * to the caller. We only need to handle a couple of special
	 * cases here:
	 */
	if (error == EJUSTRETURN) {
		/*
		 * The requesting process is allowed to interact with
		 * dataless objects. Make a couple of sanity-checks
		 * here to ensure the action makes sense.
		 */
		switch (op) {
		case NAMESPACE_HANDLER_WRITE_OP:
		case NAMESPACE_HANDLER_TRUNCATE_OP:
		case NAMESPACE_HANDLER_RENAME_OP:
			/*
			 * This handles the case of the resolver itself
			 * writing data to the file (or throwing it
			 * away).
			 */
			error = 0;
			break;
		case NAMESPACE_HANDLER_READ_OP:
		case NAMESPACE_HANDLER_LOOKUP_OP:
			/*
			 * This handles the case of the resolver needing
			 * to look up inside of a dataless directory while
			 * it's in the process of materializing it (for
			 * example, creating files or directories).
			 */
			error = (vp_vtype == VDIR) ? 0 : EBADF;
			break;
		default:
			error = EBADF;
			break;
		}
	}

	return error;

out_req_remove:
	nspace_resolver_req_remove(&req);
	goto out_release_port;
#else
	return ENOTSUP;
#endif /* CONFIG_DATALESS_FILES */
}
12307
12308 /*
12309 * vfs_materialize_file: Materialize a regular file.
12310 *
12311 * Inputs:
12312 * vp The dataless file to be materialized.
12313 *
12314 * op What kind of operation is being performed:
12315 * -> NAMESPACE_HANDLER_READ_OP
12316 * -> NAMESPACE_HANDLER_WRITE_OP
12317 * -> NAMESPACE_HANDLER_LINK_CREATE
12318 * -> NAMESPACE_HANDLER_DELETE_OP
12319 * -> NAMESPACE_HANDLER_TRUNCATE_OP
12320 * -> NAMESPACE_HANDLER_RENAME_OP
12321 *
12322 * offset offset of I/O for READ or WRITE. Ignored for
12323 * other ops.
12324 *
12325 * size size of I/O for READ or WRITE Ignored for
12326 * other ops.
12327 *
12328 * If offset or size are -1 for a READ or WRITE, then the resolver should
12329 * consider the range to be unknown.
12330 *
12331 * Upon successful return, the caller may proceed with the operation.
12332 * N.B. the file may still be "dataless" in this case.
12333 */
12334 int
vfs_materialize_file(struct vnode * vp,uint64_t op,int64_t offset,int64_t size)12335 vfs_materialize_file(
12336 struct vnode *vp,
12337 uint64_t op,
12338 int64_t offset,
12339 int64_t size)
12340 {
12341 if (vp->v_type != VREG) {
12342 return EFTYPE;
12343 }
12344 return vfs_materialize_item(vp, (uint32_t)op, offset, size, NULL, 0,
12345 NULL);
12346 }
12347
12348 /*
12349 * vfs_materialize_dir:
12350 *
12351 * Inputs:
12352 * vp The dataless directory to be materialized.
12353 *
12354 * op What kind of operation is being performed:
12355 * -> NAMESPACE_HANDLER_READ_OP
12356 * -> NAMESPACE_HANDLER_WRITE_OP
12357 * -> NAMESPACE_HANDLER_DELETE_OP
12358 * -> NAMESPACE_HANDLER_RENAME_OP
12359 * -> NAMESPACE_HANDLER_LOOKUP_OP
12360 *
12361 * lookup_name Name being looked up for a LOOKUP op. Ignored for
12362 * other ops. May or may not be NUL-terminated; see below.
12363 *
12364 * namelen If non-zero, then lookup_name is assumed to not be NUL-
12365 * terminated and namelen is the number of valid bytes in
12366 * lookup_name. If zero, then lookup_name is assumed to be
12367 * NUL-terminated.
12368 *
12369 * Upon successful return, the caller may proceed with the operation.
12370 * N.B. the directory may still be "dataless" in this case.
12371 */
12372 int
vfs_materialize_dir(struct vnode * vp,uint64_t op,char * lookup_name,size_t namelen)12373 vfs_materialize_dir(
12374 struct vnode *vp,
12375 uint64_t op,
12376 char *lookup_name,
12377 size_t namelen)
12378 {
12379 if (vp->v_type != VDIR) {
12380 return EFTYPE;
12381 }
12382 if (op == NAMESPACE_HANDLER_LOOKUP_OP && lookup_name == NULL) {
12383 return EINVAL;
12384 }
12385 return vfs_materialize_item(vp, (uint32_t)op, 0, 0, lookup_name,
12386 namelen, NULL);
12387 }
12388
12389 /*
12390 * vfs_materialize_reparent:
12391 *
12392 * Inputs:
12393 * vp The dataless file or directory to be materialized.
12394 *
12395 * tdvp The new parent directory for the dataless file.
12396 *
12397 * Upon successful return, the caller may proceed with the operation.
12398 * N.B. the item may still be "dataless" in this case.
12399 */
12400 int
vfs_materialize_reparent(vnode_t vp,vnode_t tdvp)12401 vfs_materialize_reparent(vnode_t vp, vnode_t tdvp)
12402 {
12403 if (vp->v_type != VDIR && vp->v_type != VREG) {
12404 return EFTYPE;
12405 }
12406 return vfs_materialize_item(vp, NAMESPACE_HANDLER_RENAME_OP,
12407 0, 0, NULL, 0, tdvp);
12408 }
12409
#if 0
/*
 * Compiled-out helper: synthesize a /.vol/<fsid>/<fileid> ("volfs")
 * path for 'vp' into 'path' (capacity and resulting length via 'len',
 * which includes the NUL).  Returns 0 on success, -1 if the vnode's
 * fsid/fileid attributes could not be fetched (a placeholder path is
 * written in that case).  Retained for reference only.
 */
static int
build_volfs_path(struct vnode *vp, char *path, int *len)
{
	struct vnode_attr va;
	int ret;

	VATTR_INIT(&va);
	VATTR_WANTED(&va, va_fsid);
	VATTR_WANTED(&va, va_fileid);

	if (vnode_getattr(vp, &va, vfs_context_kernel()) != 0) {
		*len = snprintf(path, *len, "/non/existent/path/because/vnode_getattr/failed") + 1;
		ret = -1;
	} else {
		*len = snprintf(path, *len, "/.vol/%d/%lld", (dev_t)va.va_fsid, va.va_fileid) + 1;
		ret = 0;
	}

	return ret;
}
#endif
12432
12433 static unsigned long
fsctl_bogus_command_compat(unsigned long cmd)12434 fsctl_bogus_command_compat(unsigned long cmd)
12435 {
12436 switch (cmd) {
12437 case IOCBASECMD(FSIOC_SYNC_VOLUME):
12438 return FSIOC_SYNC_VOLUME;
12439 case IOCBASECMD(FSIOC_ROUTEFS_SETROUTEID):
12440 return FSIOC_ROUTEFS_SETROUTEID;
12441 case IOCBASECMD(FSIOC_SET_PACKAGE_EXTS):
12442 return FSIOC_SET_PACKAGE_EXTS;
12443 case IOCBASECMD(FSIOC_SET_FSTYPENAME_OVERRIDE):
12444 return FSIOC_SET_FSTYPENAME_OVERRIDE;
12445 case IOCBASECMD(DISK_CONDITIONER_IOC_GET):
12446 return DISK_CONDITIONER_IOC_GET;
12447 case IOCBASECMD(DISK_CONDITIONER_IOC_SET):
12448 return DISK_CONDITIONER_IOC_SET;
12449 case IOCBASECMD(FSIOC_FIOSEEKHOLE):
12450 return FSIOC_FIOSEEKHOLE;
12451 case IOCBASECMD(FSIOC_FIOSEEKDATA):
12452 return FSIOC_FIOSEEKDATA;
12453 case IOCBASECMD(SPOTLIGHT_IOC_GET_MOUNT_TIME):
12454 return SPOTLIGHT_IOC_GET_MOUNT_TIME;
12455 case IOCBASECMD(SPOTLIGHT_IOC_GET_LAST_MTIME):
12456 return SPOTLIGHT_IOC_GET_LAST_MTIME;
12457 }
12458
12459 return cmd;
12460 }
12461
/*
 * chflags0() callback: apply a compare-and-swap of the BSD flags by
 * forwarding the fsioc_cas_bsdflags argument ('arg') to the filesystem
 * via the FSIOC_CAS_BSDFLAGS ioctl.
 */
static int
cas_bsdflags_setattr(vnode_t vp, void *arg, vfs_context_t ctx)
{
	return VNOP_IOCTL(vp, FSIOC_CAS_BSDFLAGS, arg, FWRITE, ctx);
}
12467
/*
 * FSIOC_SYNC_VOLUME backend: sync the volume containing 'vp' (which is
 * expected to be the mount's root vnode).  On return *arg_vp is set to
 * NULL to tell the caller that the vnode's iocount has been dropped.
 */
static int __attribute__((noinline))
handle_sync_volume(vnode_t vp, vnode_t *arg_vp, caddr_t data, vfs_context_t ctx)
{
	struct vfs_attr vfa;
	mount_t mp = vp->v_mount;
	unsigned arg;
	int error;

	/* record vid of vp so we can drop it below. */
	uint32_t vvid = vp->v_id;

	/*
	 * Then grab mount_iterref so that we can release the vnode.
	 * Without this, a thread may call vnode_iterate_prepare then
	 * get into a deadlock because we've never released the root vp
	 */
	error = mount_iterref(mp, 0);
	if (error) {
		return error;
	}
	/* Keep a hold (not an iocount) so the vnode can't be reclaimed. */
	vnode_hold(vp);
	vnode_put(vp);

	arg = MNT_NOWAIT;
	if (*(uint32_t*)data & FSCTL_SYNC_WAIT) {
		arg = MNT_WAIT;
	}

	/*
	 * If the filesystem supports multiple filesystems in a
	 * partition (for e.g. APFS volumes in a container), it knows
	 * that the waitfor argument to VFS_SYNC are flags.
	 */
	VFSATTR_INIT(&vfa);
	VFSATTR_WANTED(&vfa, f_capabilities);
	if ((vfs_getattr(mp, &vfa, vfs_context_current()) == 0) &&
	    VFSATTR_IS_SUPPORTED(&vfa, f_capabilities) &&
	    ((vfa.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_SHARED_SPACE)) &&
	    ((vfa.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_SHARED_SPACE))) {
		arg |= MNT_VOLUME;
	}

	/* issue the sync for this volume */
	(void)sync_callback(mp, &arg);

	/*
	 * Then release the mount_iterref once we're done syncing; it's not
	 * needed for the VNOP_IOCTL below
	 */
	mount_iterdrop(mp);

	/*
	 * NOTE(review): this tests FSCTL_SYNC_FULLSYNC against 'arg',
	 * which holds MNT_* wait flags, rather than against the
	 * user-supplied *(uint32_t*)data word — confirm whether the
	 * intent is "full-sync when the user asked for a waiting sync"
	 * or whether the user's FSCTL_SYNC_FULLSYNC bit should be
	 * consulted here.
	 */
	if (arg & FSCTL_SYNC_FULLSYNC) {
		/* re-obtain vnode iocount on the root vp, if possible */
		error = vnode_getwithvid(vp, vvid);
		if (error == 0) {
			error = VNOP_IOCTL(vp, F_FULLFSYNC, (caddr_t)NULL, 0, ctx);
			vnode_put(vp);
		}
	}
	vnode_drop(vp);
	/* mark the argument VP as having been released */
	*arg_vp = NULL;
	return error;
}
12532
#if ROUTEFS
/*
 * FSIOC_ROUTEFS_SETROUTEID backend: copy in a user-supplied path and
 * perform the routefs kernel mount on it.  Superuser only.
 */
static int __attribute__((noinline))
handle_routes(user_addr_t udata)
{
	char routepath[MAXPATHLEN];
	size_t len = 0;
	int error;

	error = suser(kauth_cred_get(), &(current_proc()->p_acflag));
	if (error != 0) {
		return error;
	}

	bzero(routepath, MAXPATHLEN);
	error = copyinstr(udata, &routepath[0], MAXPATHLEN, &len);
	if (error == 0) {
		error = routefs_kernel_mount(routepath);
	}
	return error;
}
#endif
12553
12554 static int __attribute__((noinline))
handle_flags(vnode_t vp,caddr_t data,vfs_context_t ctx)12555 handle_flags(vnode_t vp, caddr_t data, vfs_context_t ctx)
12556 {
12557 struct fsioc_cas_bsdflags *cas = (struct fsioc_cas_bsdflags *)data;
12558 struct vnode_attr va;
12559 int error;
12560
12561 VATTR_INIT(&va);
12562 VATTR_SET(&va, va_flags, cas->new_flags);
12563
12564 error = chflags0(vp, &va, cas_bsdflags_setattr, cas, ctx);
12565
12566 #if CONFIG_FSE
12567 if (error == 0 && cas->expected_flags == cas->actual_flags && need_fsevent(FSE_STAT_CHANGED, vp)) {
12568 add_fsevent(FSE_STAT_CHANGED, ctx, FSE_ARG_VNODE, vp, FSE_ARG_DONE);
12569 }
12570 #endif
12571
12572 return error;
12573 }
12574
12575 static int __attribute__((noinline))
handle_auth(vnode_t vp,u_long cmd,caddr_t data,u_long options,vfs_context_t ctx)12576 handle_auth(vnode_t vp, u_long cmd, caddr_t data, u_long options, vfs_context_t ctx)
12577 {
12578 struct mount *mp = NULL;
12579 errno_t rootauth = 0;
12580
12581 mp = vp->v_mount;
12582
12583 /*
12584 * query the underlying FS and see if it reports something
12585 * sane for this vnode. If volume is authenticated via
12586 * chunklist, leave that for the caller to determine.
12587 */
12588 rootauth = VNOP_IOCTL(vp, cmd, data, (int)options, ctx);
12589
12590 return rootauth;
12591 }
12592
12593 #define SET_PACKAGE_EXTENSION_ENTITLEMENT \
12594 "com.apple.private.kernel.set-package-extensions"
12595
12596 /*
12597 * Make a filesystem-specific control call:
12598 */
12599 /* ARGSUSED */
/*
 * Common implementation for the fsctl(2) and ffsctl(2) system calls:
 * marshal the ioctl-style argument (copyin for IOC_IN, zeroed buffer
 * for IOC_OUT), dispatch known generic selectors to their handlers,
 * pass everything else to the filesystem via VNOP_IOCTL, then copy any
 * IOC_OUT result back to user space.
 *
 * 'arg_vp' is in/out: handle_sync_volume() drops the vnode's iocount
 * and sets *arg_vp to NULL so callers know not to vnode_put() again.
 */
static int
fsctl_internal(proc_t p, vnode_t *arg_vp, u_long cmd, user_addr_t udata, u_long options, vfs_context_t ctx)
{
	int error = 0;
	boolean_t is64bit;
	u_int size;
#define STK_PARAMS 128
	char stkbuf[STK_PARAMS] = {0};
	caddr_t data, memp;
	vnode_t vp = *arg_vp;

	/* Device special files take the ioctl(2) path, not fsctl. */
	if (vp->v_type == VCHR || vp->v_type == VBLK) {
		return ENOTTY;
	}

	/* Map legacy base-command selectors back to full selectors. */
	cmd = fsctl_bogus_command_compat(cmd);

	size = IOCPARM_LEN(cmd);
	if (size > IOCPARM_MAX) {
		return EINVAL;
	}

	is64bit = proc_is64bit(p);

	memp = NULL;

	/* Small arguments live on the stack; large ones are heap-allocated. */
	if (size > sizeof(stkbuf)) {
		if ((memp = (caddr_t)kalloc_data(size, Z_WAITOK)) == 0) {
			return ENOMEM;
		}
		data = memp;
	} else {
		data = &stkbuf[0];
	};

	if (cmd & IOC_IN) {
		if (size) {
			error = copyin(udata, data, size);
			if (error) {
				if (memp) {
					kfree_data(memp, size);
				}
				return error;
			}
		} else {
			/* Zero-length IOC_IN: the "argument" is udata itself. */
			if (is64bit) {
				*(user_addr_t *)data = udata;
			} else {
				*(uint32_t *)data = (uint32_t)udata;
			}
		};
	} else if ((cmd & IOC_OUT) && size) {
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		bzero(data, size);
	} else if (cmd & IOC_VOID) {
		if (is64bit) {
			*(user_addr_t *)data = udata;
		} else {
			*(uint32_t *)data = (uint32_t)udata;
		}
	}

	/* Check to see if it's a generic command */
	switch (cmd) {
	case FSIOC_SYNC_VOLUME:
		error = handle_sync_volume(vp, arg_vp, data, ctx);
		break;

	case FSIOC_ROUTEFS_SETROUTEID:
#if ROUTEFS
		error = handle_routes(udata);
#endif
		break;

	case FSIOC_SET_PACKAGE_EXTS: {
		user_addr_t ext_strings;
		uint32_t num_entries;
		uint32_t max_width;

		if (!IOTaskHasEntitlement(vfs_context_task(ctx),
		    SET_PACKAGE_EXTENSION_ENTITLEMENT)) {
			error = EPERM;
			break;
		}

		if ((is64bit && size != sizeof(user64_package_ext_info))
		    || (is64bit == 0 && size != sizeof(user32_package_ext_info))) {
			// either you're 64-bit and passed a 64-bit struct or
			// you're 32-bit and passed a 32-bit struct. otherwise
			// it's not ok.
			error = EINVAL;
			break;
		}

		if (is64bit) {
			if (sizeof(user64_addr_t) > sizeof(user_addr_t)) {
				assert(((user64_package_ext_info *)data)->strings <= UINT32_MAX);
			}
			ext_strings = (user_addr_t)((user64_package_ext_info *)data)->strings;
			num_entries = ((user64_package_ext_info *)data)->num_entries;
			max_width = ((user64_package_ext_info *)data)->max_width;
		} else {
			ext_strings = CAST_USER_ADDR_T(((user32_package_ext_info *)data)->strings);
			num_entries = ((user32_package_ext_info *)data)->num_entries;
			max_width = ((user32_package_ext_info *)data)->max_width;
		}
		error = set_package_extensions_table(ext_strings, num_entries, max_width);
	}
	break;

	case FSIOC_SET_FSTYPENAME_OVERRIDE:
	{
		mount_t mp;

		if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
			break;
		}
		if ((mp = vp->v_mount) != NULL) {
			mount_lock(mp);
			if (data[0] != 0) {
				/* Require a NUL within the first MFSTYPENAMELEN bytes. */
				for (int i = 0; i < MFSTYPENAMELEN; i++) {
					if (!data[i]) {
						goto continue_copy;
					}
				}
				/*
				 * Getting here means we have a user data
				 * string which has no NULL termination in
				 * its first MFSTYPENAMELEN bytes. This is
				 * bogus, let's avoid strlcpy-ing the read
				 * data and return an error.
				 */
				error = EINVAL;
				goto unlock;
continue_copy:
				vfs_setfstypename_locked(mp, data);
				/* "mtmfs" read-only mounts get special security flags. */
				if (vfs_isrdonly(mp) &&
				    strcmp(data, "mtmfs") == 0) {
					mp->mnt_kern_flag |=
					    MNTK_EXTENDED_SECURITY;
					mp->mnt_kern_flag &=
					    ~MNTK_AUTH_OPAQUE;
				}
			} else if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
				/* Empty string: clear any existing override. */
				const char *name =
				    vfs_getfstypenameref_locked(mp, NULL);
				if (strcmp(name, "mtmfs") == 0) {
					mp->mnt_kern_flag &=
					    ~MNTK_EXTENDED_SECURITY;
				}
				vfs_setfstypename_locked(mp, NULL);
			}
unlock:
			mount_unlock(mp);
		}
	}
	break;

	case DISK_CONDITIONER_IOC_GET: {
		error = disk_conditioner_get_info(vp->v_mount, (disk_conditioner_info *)data);
	}
	break;

	case DISK_CONDITIONER_IOC_SET: {
		error = disk_conditioner_set_info(vp->v_mount, (disk_conditioner_info *)data);
	}
	break;

	case FSIOC_CAS_BSDFLAGS:
		error = handle_flags(vp, data, ctx);
		break;

	case FSIOC_FD_ONLY_OPEN_ONCE: {
		/* Succeed only when ours is the sole use of the vnode. */
		error = 0;
		if (vnode_usecount(vp) > 1) {
			vnode_lock_spin(vp);
			if (vp->v_lflag & VL_HASSTREAMS) {
				if (vnode_isinuse_locked(vp, 1, 1)) {
					error = EBUSY;
				}
			} else if (vnode_usecount(vp) > 1) {
				error = EBUSY;
			}
			vnode_unlock(vp);
		}
	}
	break;

	case FSIOC_EVAL_ROOTAUTH:
		error = handle_auth(vp, cmd, data, options, ctx);
		break;

	case FSIOC_TEST_FSE_ACCESS_GRANTED:
		error = test_fse_access_granted(vp, (unsigned long)udata, ctx);
		break;

	default: {
		/*
		 * Other, known commands shouldn't be passed down here.
		 * (When adding a selector to this list, it may be prudent
		 * to consider adding it to the list in sys_fcntl_nocancel() as well.)
		 */
		switch (cmd) {
		case F_PUNCHHOLE:
		case F_TRIM_ACTIVE_FILE:
		case F_RDADVISE:
		case F_TRANSCODEKEY:
		case F_GETPROTECTIONLEVEL:
		case F_GETDEFAULTPROTLEVEL:
		case F_MAKECOMPRESSED:
		case F_SET_GREEDY_MODE:
		case F_SETSTATICCONTENT:
		case F_SETIOTYPE:
		case F_SETBACKINGSTORE:
		case F_GETPATH_MTMINFO:
		case APFSIOC_REVERT_TO_SNAPSHOT:
		case FSIOC_FIOSEEKHOLE:
		case FSIOC_FIOSEEKDATA:
		case HFS_GET_BOOT_INFO:
		case HFS_SET_BOOT_INFO:
		case FIOPINSWAP:
		case F_CHKCLEAN:
		case F_FULLFSYNC:
		case F_BARRIERFSYNC:
		case F_FREEZE_FS:
		case F_THAW_FS:
		case FSIOC_KERNEL_ROOTAUTH:
		case FSIOC_GRAFT_FS:
		case FSIOC_UNGRAFT_FS:
		case FSIOC_AUTH_FS:
			error = EINVAL;
			goto outdrop;
		}
		/* Invoke the filesystem-specific code */
		error = VNOP_IOCTL(vp, cmd, data, (int)options, ctx);
	}
	} /* end switch stmt */

	/*
	 * if no errors, copy any data to user. Size was
	 * already set and checked above.
	 */
	if (error == 0 && (cmd & IOC_OUT) && size) {
		error = copyout(data, udata, size);
	}

outdrop:
	if (memp) {
		kfree_data(memp, size);
	}

	return error;
}
12856
/* ARGSUSED */
/*
 * fsctl(2): path-based filesystem control call.  Looks up the path
 * (honoring FSOPT_NOFOLLOW and firmlink-specific flags), performs the
 * MAC check, and forwards to fsctl_internal().
 */
int
fsctl(proc_t p, struct fsctl_args *uap, __unused int32_t *retval)
{
	int error;
	struct nameidata nd;
	uint32_t nameiflags;
	vnode_t vp = NULL;
	vfs_context_t ctx = vfs_context_current();

	AUDIT_ARG(cmd, (int)uap->cmd);
	AUDIT_ARG(value32, uap->options);
	/* Get the vnode for the file we are getting info on: */
	nameiflags = 0;
	//
	// if we come through fsctl() then the file is by definition not open.
	// therefore for the FSIOC_FD_ONLY_OPEN_ONCE selector we return an error
	// lest the caller mistakenly thinks the only open is their own (but in
	// reality it's someone else's).
	//
	if (uap->cmd == FSIOC_FD_ONLY_OPEN_ONCE) {
		return EINVAL;
	}
	if ((uap->options & FSOPT_NOFOLLOW) == 0) {
		nameiflags |= FOLLOW;
	}
	if (uap->cmd == FSIOC_FIRMLINK_CTL) {
		nameiflags |= (CN_FIRMLINK_NOFOLLOW | NOCACHE);
	}
	NDINIT(&nd, LOOKUP, OP_FSCTL, nameiflags | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	if ((error = namei(&nd))) {
		goto done;
	}
	vp = nd.ni_vp;
	nameidone(&nd);

#if CONFIG_MACF
	error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd);
	if (error) {
		goto done;
	}
#endif

	/* fsctl_internal() may drop the iocount and set vp to NULL. */
	error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);

done:
	if (vp) {
		vnode_put(vp);
	}
	return error;
}
/* ARGSUSED */
/*
 * ffsctl(2): fd-based variant of fsctl(2).  Resolves the descriptor to
 * its vnode, takes an iocount, performs the MAC check, and forwards to
 * fsctl_internal().
 */
int
ffsctl(proc_t p, struct ffsctl_args *uap, __unused int32_t *retval)
{
	int error;
	vnode_t vp = NULL;
	vfs_context_t ctx = vfs_context_current();
	int fd = -1;

	AUDIT_ARG(fd, uap->fd);
	AUDIT_ARG(cmd, (int)uap->cmd);
	AUDIT_ARG(value32, uap->options);

	/* Get the vnode for the file we are getting info on: */
	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}
	fd = uap->fd;
	if ((error = vnode_getwithref(vp))) {
		file_drop(fd);
		return error;
	}

#if CONFIG_MACF
	if ((error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd))) {
		file_drop(fd);
		vnode_put(vp);
		return error;
	}
#endif

	error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);

	file_drop(fd);

	/* validate vp; fsctl_internal() can drop iocount and reset vp to NULL */
	if (vp) {
		vnode_put(vp);
	}

	return error;
}
12951 /* end of fsctl system call */
12952
12953 #define FILESEC_ACCESS_ENTITLEMENT \
12954 "com.apple.private.vfs.filesec-access"
12955
12956 static int
xattr_entitlement_check(const char * attrname,vfs_context_t ctx,bool setting)12957 xattr_entitlement_check(const char *attrname, vfs_context_t ctx, bool setting)
12958 {
12959 if (strcmp(attrname, KAUTH_FILESEC_XATTR) == 0) {
12960 /*
12961 * get: root and tasks with FILESEC_ACCESS_ENTITLEMENT.
12962 * set: only tasks with FILESEC_ACCESS_ENTITLEMENT.
12963 */
12964 if ((!setting && vfs_context_issuser(ctx)) ||
12965 IOTaskHasEntitlement(vfs_context_task(ctx),
12966 FILESEC_ACCESS_ENTITLEMENT)) {
12967 return 0;
12968 }
12969 }
12970
12971 return EPERM;
12972 }
12973
12974 /*
12975 * Retrieve the data of an extended attribute.
12976 */
/*
 * getxattr(2): look up the target path, copy in the attribute name,
 * enforce protected-attribute policy, and fetch the attribute data (or
 * just its size when no buffer / a -1 size is supplied).
 */
int
getxattr(proc_t p, struct getxattr_args *uap, user_ssize_t *retval)
{
	vnode_t vp;
	struct nameidata nd;
	char attrname[XATTR_MAXNAMELEN + 1];
	vfs_context_t ctx = vfs_context_current();
	uio_t auio = NULL;
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	size_t attrsize = 0;
	size_t namelen;
	u_int32_t nameiflags;
	int error;
	UIO_STACKBUF(uio_buf, 1);

	/* These options are only meaningful for in-kernel callers. */
	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_GETXATTR, nameiflags, spacetype, uap->path, ctx);
	if ((error = namei(&nd))) {
		return error;
	}
	vp = nd.ni_vp;
	nameidone(&nd);

	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
	if (error != 0) {
		goto out;
	}
	/* Protected attributes require root or the filesec entitlement. */
	if (xattr_protected(attrname) &&
	    (error = xattr_entitlement_check(attrname, ctx, false)) != 0) {
		goto out;
	}
	/*
	 * the specific check for 0xffffffff is a hack to preserve
	 * binary compatibility in K64 with applications that discovered
	 * that passing in a buf pointer and a size of -1 resulted in
	 * just the size of the indicated extended attribute being returned.
	 * this isn't part of the documented behavior, but because of the
	 * original implementation's check for "uap->size > 0", this behavior
	 * was allowed. In K32 that check turned into a signed comparison
	 * even though uap->size is unsigned... in K64, we blow by that
	 * check because uap->size is unsigned and doesn't get sign smeared
	 * in the munger for a 32 bit user app. we also need to add a
	 * check to limit the maximum size of the buffer being passed in...
	 * unfortunately, the underlying filesystems seem to just malloc
	 * the requested size even if the actual extended attribute is tiny.
	 * because that malloc is for kernel wired memory, we have to put a
	 * sane limit on it.
	 *
	 * U32 running on K64 will yield 0x00000000ffffffff for uap->size
	 * U64 running on K64 will yield -1 (64 bits wide)
	 * U32/U64 running on K32 will yield -1 (32 bits wide)
	 */
	if (uap->size == 0xffffffff || uap->size == (size_t)-1) {
		goto no_uio;
	}

	if (uap->value) {
		/* Clamp to the sane wired-memory limit described above. */
		if (uap->size > (size_t)XATTR_MAXSIZE) {
			uap->size = XATTR_MAXSIZE;
		}

		auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
		    &uio_buf[0], sizeof(uio_buf));
		uio_addiov(auio, uap->value, uap->size);
	}
no_uio:
	error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, ctx);
out:
	vnode_put(vp);

	/* With a uio, report bytes transferred; otherwise the attr size. */
	if (auio) {
		*retval = uap->size - uio_resid(auio);
	} else {
		*retval = (user_ssize_t)attrsize;
	}

	return error;
}
13059
13060 /*
13061 * Retrieve the data of an extended attribute.
13062 */
13063 int
fgetxattr(proc_t p,struct fgetxattr_args * uap,user_ssize_t * retval)13064 fgetxattr(proc_t p, struct fgetxattr_args *uap, user_ssize_t *retval)
13065 {
13066 vnode_t vp;
13067 char attrname[XATTR_MAXNAMELEN + 1];
13068 vfs_context_t ctx = vfs_context_current();
13069 uio_t auio = NULL;
13070 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
13071 size_t attrsize = 0;
13072 size_t namelen;
13073 int error;
13074 UIO_STACKBUF(uio_buf, 1);
13075
13076 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
13077 return EINVAL;
13078 }
13079
13080 if ((error = file_vnode(uap->fd, &vp))) {
13081 return error;
13082 }
13083 if ((error = vnode_getwithref(vp))) {
13084 file_drop(uap->fd);
13085 return error;
13086 }
13087 error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
13088 if (error != 0) {
13089 goto out;
13090 }
13091 if (xattr_protected(attrname) &&
13092 (error = xattr_entitlement_check(attrname, ctx, false)) != 0) {
13093 goto out;
13094 }
13095 if (uap->value && uap->size > 0) {
13096 if (uap->size > (size_t)XATTR_MAXSIZE) {
13097 uap->size = XATTR_MAXSIZE;
13098 }
13099
13100 auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
13101 &uio_buf[0], sizeof(uio_buf));
13102 uio_addiov(auio, uap->value, uap->size);
13103 }
13104
13105 error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, vfs_context_current());
13106 out:
13107 (void)vnode_put(vp);
13108 file_drop(uap->fd);
13109
13110 if (auio) {
13111 *retval = uap->size - uio_resid(auio);
13112 } else {
13113 *retval = (user_ssize_t)attrsize;
13114 }
13115 return error;
13116 }
13117
/*
 * Heap-allocated working context for setxattr(): keeps the large
 * nameidata, the attribute-name buffer, and the uio buffer off the
 * kernel stack.
 */
struct setxattr_ctx {
	struct nameidata nd;
	char attrname[XATTR_MAXNAMELEN + 1];
	UIO_STACKBUF(uio_buf, 1);
};
13124
/*
 * Set the data of an extended attribute.
 *
 * The working state (nameidata, name, uio buffer) is heap-allocated via
 * struct setxattr_ctx to keep this frame small.
 */
int
setxattr(proc_t p, struct setxattr_args *uap, int *retval)
{
	vnode_t vp;
	vfs_context_t ctx = vfs_context_current();
	uio_t auio = NULL;
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	size_t namelen;
	u_int32_t nameiflags;
	int error;
	struct setxattr_ctx *sactx;

	/* XATTR_NOSECURITY and XATTR_NODEFAULT are kernel-internal options. */
	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	sactx = kalloc_type(struct setxattr_ctx, Z_WAITOK);
	if (sactx == NULL) {
		return ENOMEM;
	}

	error = copyinstr(uap->attrname, sactx->attrname, sizeof(sactx->attrname), &namelen);
	if (error != 0) {
		if (error == EPERM) {
			/* if the string won't fit in attrname, copyinstr emits EPERM */
			error = ENAMETOOLONG;
		}
		/* Otherwise return the default error from copyinstr to detect ERANGE, etc */
		goto out;
	}
	/* Writing a protected attribute requires the entitlement. */
	if (xattr_protected(sactx->attrname) &&
	    (error = xattr_entitlement_check(sactx->attrname, ctx, true)) != 0) {
		goto out;
	}
	/* A non-zero size with no source buffer is nonsensical. */
	if (uap->size != 0 && uap->value == 0) {
		error = EINVAL;
		goto out;
	}
	if (uap->size > INT_MAX) {
		error = E2BIG;
		goto out;
	}

	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
#if CONFIG_FILE_LEASES
	/* Need the parent directory to break its lease before modifying. */
	nameiflags |= WANTPARENT;
#endif
	NDINIT(&sactx->nd, LOOKUP, OP_SETXATTR, nameiflags, spacetype, uap->path, ctx);
	if ((error = namei(&sactx->nd))) {
		goto out;
	}
	vp = sactx->nd.ni_vp;	/* holds an iocount until the vnode_put below */
#if CONFIG_FILE_LEASES
	vnode_breakdirlease(sactx->nd.ni_dvp, false, O_WRONLY);
	vnode_put(sactx->nd.ni_dvp);
#endif
	nameidone(&sactx->nd);

	auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
	    &sactx->uio_buf[0], sizeof(sactx->uio_buf));
	uio_addiov(auio, uap->value, uap->size);

	error = vn_setxattr(vp, sactx->attrname, auio, uap->options, ctx);
#if CONFIG_FSE
	/* Notify fseventsd listeners of the attribute change. */
	if (error == 0) {
		add_fsevent(FSE_XATTR_MODIFIED, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
	}
#endif
	vnode_put(vp);
out:
	kfree_type(struct setxattr_ctx, sactx);
	*retval = 0;
	return error;
}
13204
13205 /*
13206 * Set the data of an extended attribute.
13207 */
13208 int
fsetxattr(proc_t p,struct fsetxattr_args * uap,int * retval)13209 fsetxattr(proc_t p, struct fsetxattr_args *uap, int *retval)
13210 {
13211 vnode_t vp;
13212 char attrname[XATTR_MAXNAMELEN + 1];
13213 vfs_context_t ctx = vfs_context_current();
13214 uio_t auio = NULL;
13215 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
13216 size_t namelen;
13217 int error;
13218 UIO_STACKBUF(uio_buf, 1);
13219
13220 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
13221 return EINVAL;
13222 }
13223
13224 error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
13225 if (error != 0) {
13226 if (error == EPERM) {
13227 /* if the string won't fit in attrname, copyinstr emits EPERM */
13228 return ENAMETOOLONG;
13229 }
13230 /* Otherwise return the default error from copyinstr to detect ERANGE, etc */
13231 return error;
13232 }
13233 if (xattr_protected(attrname) &&
13234 (error = xattr_entitlement_check(attrname, ctx, true)) != 0) {
13235 return error;
13236 }
13237 if (uap->size != 0 && uap->value == 0) {
13238 return EINVAL;
13239 }
13240 if (uap->size > INT_MAX) {
13241 return E2BIG;
13242 }
13243 if ((error = file_vnode(uap->fd, &vp))) {
13244 return error;
13245 }
13246 if ((error = vnode_getwithref(vp))) {
13247 file_drop(uap->fd);
13248 return error;
13249 }
13250
13251 #if CONFIG_FILE_LEASES
13252 vnode_breakdirlease(vp, true, O_WRONLY);
13253 #endif
13254
13255 auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
13256 &uio_buf[0], sizeof(uio_buf));
13257 uio_addiov(auio, uap->value, uap->size);
13258
13259 error = vn_setxattr(vp, attrname, auio, uap->options, vfs_context_current());
13260 #if CONFIG_FSE
13261 if (error == 0) {
13262 add_fsevent(FSE_XATTR_MODIFIED, ctx,
13263 FSE_ARG_VNODE, vp,
13264 FSE_ARG_DONE);
13265 }
13266 #endif
13267 vnode_put(vp);
13268 file_drop(uap->fd);
13269 *retval = 0;
13270 return error;
13271 }
13272
/*
 * Remove an extended attribute.
 * XXX Code duplication here.
 */
int
removexattr(proc_t p, struct removexattr_args *uap, int *retval)
{
	vnode_t vp;
	struct nameidata nd;
	char attrname[XATTR_MAXNAMELEN + 1];
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	vfs_context_t ctx = vfs_context_current();
	size_t namelen;
	u_int32_t nameiflags;
	int error;

	/* XATTR_NOSECURITY and XATTR_NODEFAULT are kernel-internal options. */
	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
	if (error != 0) {
		return error;
	}
	/*
	 * Protected attributes can never be removed — note there is no
	 * entitlement escape hatch here, unlike get/set.
	 */
	if (xattr_protected(attrname)) {
		return EPERM;
	}
	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
#if CONFIG_FILE_LEASES
	/* Need the parent directory to break its lease before modifying. */
	nameiflags |= WANTPARENT;
#endif
	NDINIT(&nd, LOOKUP, OP_REMOVEXATTR, nameiflags, spacetype, uap->path, ctx);
	if ((error = namei(&nd))) {
		return error;
	}
	vp = nd.ni_vp;		/* holds an iocount until the vnode_put below */
#if CONFIG_FILE_LEASES
	vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
	vnode_put(nd.ni_dvp);
#endif
	nameidone(&nd);

	error = vn_removexattr(vp, attrname, uap->options, ctx);
#if CONFIG_FSE
	/* Notify fseventsd listeners of the attribute removal. */
	if (error == 0) {
		add_fsevent(FSE_XATTR_REMOVED, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
	}
#endif
	vnode_put(vp);
	*retval = 0;
	return error;
}
13327
13328 /*
13329 * Remove an extended attribute.
13330 * XXX Code duplication here.
13331 */
13332 int
fremovexattr(__unused proc_t p,struct fremovexattr_args * uap,int * retval)13333 fremovexattr(__unused proc_t p, struct fremovexattr_args *uap, int *retval)
13334 {
13335 vnode_t vp;
13336 char attrname[XATTR_MAXNAMELEN + 1];
13337 size_t namelen;
13338 int error;
13339 #if CONFIG_FSE
13340 vfs_context_t ctx = vfs_context_current();
13341 #endif
13342
13343 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
13344 return EINVAL;
13345 }
13346
13347 error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
13348 if (error != 0) {
13349 return error;
13350 }
13351 if (xattr_protected(attrname)) {
13352 return EPERM;
13353 }
13354 if ((error = file_vnode(uap->fd, &vp))) {
13355 return error;
13356 }
13357 if ((error = vnode_getwithref(vp))) {
13358 file_drop(uap->fd);
13359 return error;
13360 }
13361
13362 #if CONFIG_FILE_LEASES
13363 vnode_breakdirlease(vp, true, O_WRONLY);
13364 #endif
13365
13366 error = vn_removexattr(vp, attrname, uap->options, vfs_context_current());
13367 #if CONFIG_FSE
13368 if (error == 0) {
13369 add_fsevent(FSE_XATTR_REMOVED, ctx,
13370 FSE_ARG_VNODE, vp,
13371 FSE_ARG_DONE);
13372 }
13373 #endif
13374 vnode_put(vp);
13375 file_drop(uap->fd);
13376 *retval = 0;
13377 return error;
13378 }
13379
/*
 * Retrieve the list of extended attribute names.
 * XXX Code duplication here.
 *
 * With a user buffer, *retval is the number of bytes copied out;
 * otherwise it is the total size of the name list.
 */
int
listxattr(proc_t p, struct listxattr_args *uap, user_ssize_t *retval)
{
	vnode_t vp;
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();
	uio_t auio = NULL;
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	size_t attrsize = 0;
	u_int32_t nameiflags;
	int error;
	UIO_STACKBUF(uio_buf, 1);

	/* XATTR_NOSECURITY and XATTR_NODEFAULT are kernel-internal options. */
	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_LISTXATTR, nameiflags, spacetype, uap->path, ctx);
	if ((error = namei(&nd))) {
		return error;
	}
	vp = nd.ni_vp;		/* holds an iocount until the vnode_put below */
	nameidone(&nd);
	if (uap->namebuf != 0 && uap->bufsize > 0) {
		auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ,
		    &uio_buf[0], sizeof(uio_buf));
		uio_addiov(auio, uap->namebuf, uap->bufsize);
	}

	error = vn_listxattr(vp, auio, &attrsize, uap->options, ctx);

	vnode_put(vp);
	if (auio) {
		*retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
	} else {
		*retval = (user_ssize_t)attrsize;
	}
	return error;
}
13424
13425 /*
13426 * Retrieve the list of extended attribute names.
13427 * XXX Code duplication here.
13428 */
13429 int
flistxattr(proc_t p,struct flistxattr_args * uap,user_ssize_t * retval)13430 flistxattr(proc_t p, struct flistxattr_args *uap, user_ssize_t *retval)
13431 {
13432 vnode_t vp;
13433 uio_t auio = NULL;
13434 int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
13435 size_t attrsize = 0;
13436 int error;
13437 UIO_STACKBUF(uio_buf, 1);
13438
13439 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
13440 return EINVAL;
13441 }
13442
13443 if ((error = file_vnode(uap->fd, &vp))) {
13444 return error;
13445 }
13446 if ((error = vnode_getwithref(vp))) {
13447 file_drop(uap->fd);
13448 return error;
13449 }
13450 if (uap->namebuf != 0 && uap->bufsize > 0) {
13451 auio = uio_createwithbuffer(1, 0, spacetype,
13452 UIO_READ, &uio_buf[0], sizeof(uio_buf));
13453 uio_addiov(auio, uap->namebuf, uap->bufsize);
13454 }
13455
13456 error = vn_listxattr(vp, auio, &attrsize, uap->options, vfs_context_current());
13457
13458 vnode_put(vp);
13459 file_drop(uap->fd);
13460 if (auio) {
13461 *retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
13462 } else {
13463 *retval = (user_ssize_t)attrsize;
13464 }
13465 return error;
13466 }
13467
13468 int
fsgetpath_internal(vfs_context_t ctx,int volfs_id,uint64_t objid,vm_size_t bufsize,caddr_t buf,uint32_t options,int * pathlen)13469 fsgetpath_internal(vfs_context_t ctx, int volfs_id, uint64_t objid,
13470 vm_size_t bufsize, caddr_t buf, uint32_t options, int *pathlen)
13471 {
13472 int error;
13473 struct mount *mp = NULL;
13474 vnode_t vp;
13475 int length;
13476 int bpflags;
13477 /* maximum number of times to retry build_path */
13478 unsigned int retries = 0x10;
13479
13480 if (bufsize > FSGETPATH_MAXBUFLEN) {
13481 return EINVAL;
13482 }
13483
13484 if (buf == NULL) {
13485 return ENOMEM;
13486 }
13487
13488 retry:
13489 if ((mp = mount_lookupby_volfsid(volfs_id, 1)) == NULL) {
13490 error = ENOTSUP; /* unexpected failure */
13491 return ENOTSUP;
13492 }
13493
13494 #if CONFIG_UNION_MOUNTS
13495 unionget:
13496 #endif /* CONFIG_UNION_MOUNTS */
13497 if (objid == 2) {
13498 struct vfs_attr vfsattr;
13499 int use_vfs_root = TRUE;
13500
13501 VFSATTR_INIT(&vfsattr);
13502 VFSATTR_WANTED(&vfsattr, f_capabilities);
13503 if (!(options & FSOPT_ISREALFSID) &&
13504 vfs_getattr(mp, &vfsattr, vfs_context_kernel()) == 0 &&
13505 VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
13506 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS) &&
13507 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS)) {
13508 use_vfs_root = FALSE;
13509 }
13510 }
13511
13512 if (use_vfs_root) {
13513 error = VFS_ROOT(mp, &vp, ctx);
13514 } else {
13515 error = VFS_VGET(mp, objid, &vp, ctx);
13516 }
13517 } else {
13518 error = VFS_VGET(mp, (ino64_t)objid, &vp, ctx);
13519 }
13520
13521 #if CONFIG_UNION_MOUNTS
13522 if (error == ENOENT && (mp->mnt_flag & MNT_UNION)) {
13523 /*
13524 * If the fileid isn't found and we're in a union
13525 * mount volume, then see if the fileid is in the
13526 * mounted-on volume.
13527 */
13528 struct mount *tmp = mp;
13529 mp = vnode_mount(tmp->mnt_vnodecovered);
13530 vfs_unbusy(tmp);
13531 if (vfs_busy(mp, LK_NOWAIT) == 0) {
13532 goto unionget;
13533 }
13534 } else {
13535 vfs_unbusy(mp);
13536 }
13537 #else
13538 vfs_unbusy(mp);
13539 #endif /* CONFIG_UNION_MOUNTS */
13540
13541 if (error) {
13542 return error;
13543 }
13544
13545 #if CONFIG_MACF
13546 error = mac_vnode_check_fsgetpath(ctx, vp);
13547 if (error) {
13548 vnode_put(vp);
13549 return error;
13550 }
13551 #endif
13552
13553 /* Obtain the absolute path to this vnode. */
13554 bpflags = vfs_context_suser(ctx) ? BUILDPATH_CHECKACCESS : 0;
13555 if (options & FSOPT_NOFIRMLINKPATH) {
13556 bpflags |= BUILDPATH_NO_FIRMLINK;
13557 }
13558 bpflags |= BUILDPATH_CHECK_MOVED;
13559 error = build_path(vp, buf, (int)bufsize, &length, bpflags, ctx);
13560 vnode_put(vp);
13561
13562 if (error) {
13563 /* there was a race building the path, try a few more times */
13564 if (error == EAGAIN) {
13565 --retries;
13566 if (retries > 0) {
13567 goto retry;
13568 }
13569
13570 error = ENOENT;
13571 }
13572 goto out;
13573 }
13574
13575 AUDIT_ARG(text, buf);
13576
13577 if (kdebug_debugid_enabled(VFS_LOOKUP) && length > 0) {
13578 unsigned long path_words[NUMPARMS];
13579 size_t path_len = sizeof(path_words);
13580
13581 if ((size_t)length < path_len) {
13582 memcpy((char *)path_words, buf, length);
13583 memset((char *)path_words + length, 0, path_len - length);
13584
13585 path_len = length;
13586 } else {
13587 memcpy((char *)path_words, buf + (length - path_len), path_len);
13588 }
13589
13590 kdebug_vfs_lookup(path_words, (int)path_len, vp,
13591 KDBG_VFS_LOOKUP_FLAG_LOOKUP);
13592 }
13593
13594 *pathlen = length; /* may be superseded by error */
13595
13596 out:
13597 return error;
13598 }
13599
/*
 * Obtain the full pathname of a file system object by id.
 *
 * Copies the fsid in from user space, resolves <fsid, objid> to a path
 * via fsgetpath_internal(), and copies the result out to buf.
 */
static int
fsgetpath_extended(user_addr_t buf, user_size_t bufsize, user_addr_t user_fsid, uint64_t objid,
    uint32_t options, user_ssize_t *retval)
{
	vfs_context_t ctx = vfs_context_current();
	fsid_t fsid;
	char *realpath;
	int length;
	int error;

	if (options & ~(FSOPT_NOFIRMLINKPATH | FSOPT_ISREALFSID)) {
		return EINVAL;
	}

	if ((error = copyin(user_fsid, (caddr_t)&fsid, sizeof(fsid)))) {
		return error;
	}
	AUDIT_ARG(value32, fsid.val[0]);
	AUDIT_ARG(value64, objid);
	/* Restrict output buffer size for now. */

	/* bufsize is unsigned, so "<= 0" only rejects a zero-length buffer. */
	if (bufsize > FSGETPATH_MAXBUFLEN || bufsize <= 0) {
		return EINVAL;
	}
	realpath = kalloc_data(bufsize, Z_WAITOK | Z_ZERO);
	if (realpath == NULL) {
		return ENOMEM;
	}

	error = fsgetpath_internal(ctx, fsid.val[0], objid, bufsize, realpath,
	    options, &length);

	if (error) {
		goto out;
	}

	error = copyout((caddr_t)realpath, buf, length);

	*retval = (user_ssize_t)length; /* may be superseded by error */
out:
	kfree_data(realpath, bufsize);
	return error;
}
13646
13647 int
fsgetpath(__unused proc_t p,struct fsgetpath_args * uap,user_ssize_t * retval)13648 fsgetpath(__unused proc_t p, struct fsgetpath_args *uap, user_ssize_t *retval)
13649 {
13650 return fsgetpath_extended(uap->buf, uap->bufsize, uap->fsid, uap->objid,
13651 0, retval);
13652 }
13653
13654 int
fsgetpath_ext(__unused proc_t p,struct fsgetpath_ext_args * uap,user_ssize_t * retval)13655 fsgetpath_ext(__unused proc_t p, struct fsgetpath_ext_args *uap, user_ssize_t *retval)
13656 {
13657 return fsgetpath_extended(uap->buf, uap->bufsize, uap->fsid, uap->objid,
13658 uap->options, retval);
13659 }
13660
/*
 * Common routine to handle various flavors of statfs data heading out
 * to user space.
 *
 * Copies *sfsp into the 32- or 64-bit user statfs layout and copies it
 * out to bufp.  When partial_copy is set, the trailing reserved fields
 * are omitted from the copyout (legacy short-struct callers), but
 * *sizep still reports the full native struct size.
 *
 * Returns: 0 Success
 *	EFAULT
 */
static int
munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
    user_addr_t bufp, int *sizep, boolean_t is_64_bit,
    boolean_t partial_copy)
{
	int error;
	int my_size, copy_size;

	if (is_64_bit) {
		struct user64_statfs sfs;
		my_size = copy_size = sizeof(sfs);
		/* Zero first so padding/unset fields don't leak kernel stack. */
		bzero(&sfs, my_size);
		sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
		sfs.f_type = (short)mp->mnt_vtable->vfc_typenum;
		sfs.f_reserved1 = (short)sfsp->f_fssubtype;
		sfs.f_bsize = (user64_long_t)sfsp->f_bsize;
		sfs.f_iosize = (user64_long_t)sfsp->f_iosize;
		sfs.f_blocks = (user64_long_t)sfsp->f_blocks;
		sfs.f_bfree = (user64_long_t)sfsp->f_bfree;
		sfs.f_bavail = (user64_long_t)sfsp->f_bavail;
		sfs.f_files = (user64_long_t)sfsp->f_files;
		sfs.f_ffree = (user64_long_t)sfsp->f_ffree;
		sfs.f_fsid = sfsp->f_fsid;
		sfs.f_owner = sfsp->f_owner;
		vfs_getfstypename(mp, sfs.f_fstypename, MFSNAMELEN);
		strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
		strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);

		if (partial_copy) {
			/* Legacy callers: omit the trailing reserved fields. */
			copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
		}
		error = copyout((caddr_t)&sfs, bufp, copy_size);
	} else {
		struct user32_statfs sfs;

		my_size = copy_size = sizeof(sfs);
		bzero(&sfs, my_size);

		sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
		sfs.f_type = (short)mp->mnt_vtable->vfc_typenum;
		sfs.f_reserved1 = (short)sfsp->f_fssubtype;

		/*
		 * It's possible for there to be more than 2^^31 blocks in the filesystem, so we
		 * have to fudge the numbers here in that case. We inflate the blocksize in order
		 * to reflect the filesystem size as best we can.
		 */
		if ((sfsp->f_blocks > INT_MAX)
		    /* Hack for 4061702 . I think the real fix is for Carbon to
		     * look for some volume capability and not depend on hidden
		     * semantics agreed between a FS and carbon.
		     * f_blocks, f_bfree, and f_bavail set to -1 is the trigger
		     * for Carbon to set bNoVolumeSizes volume attribute.
		     * Without this the webdavfs files cannot be copied onto
		     * disk as they look huge. This change should not affect
		     * XSAN as they should not setting these to -1..
		     */
		    && (sfsp->f_blocks != 0xffffffffffffffffULL)
		    && (sfsp->f_bfree != 0xffffffffffffffffULL)
		    && (sfsp->f_bavail != 0xffffffffffffffffULL)) {
			int shift;

			/*
			 * Work out how far we have to shift the block count down to make it fit.
			 * Note that it's possible to have to shift so far that the resulting
			 * blocksize would be unreportably large. At that point, we will clip
			 * any values that don't fit.
			 *
			 * For safety's sake, we also ensure that f_iosize is never reported as
			 * being smaller than f_bsize.
			 */
			for (shift = 0; shift < 32; shift++) {
				if ((sfsp->f_blocks >> shift) <= INT_MAX) {
					break;
				}
				if ((sfsp->f_bsize << (shift + 1)) > INT_MAX) {
					break;
				}
			}
#define __SHIFT_OR_CLIP(x, s) ((((x) >> (s)) > INT_MAX) ? INT_MAX : ((x) >> (s)))
			sfs.f_blocks = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_blocks, shift);
			sfs.f_bfree = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bfree, shift);
			sfs.f_bavail = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bavail, shift);
#undef __SHIFT_OR_CLIP
			sfs.f_bsize = (user32_long_t)(sfsp->f_bsize << shift);
			sfs.f_iosize = (int)lmax(sfsp->f_iosize, sfsp->f_bsize);
		} else {
			/* filesystem is small enough to be reported honestly */
			sfs.f_bsize = (user32_long_t)sfsp->f_bsize;
			sfs.f_iosize = (user32_long_t)sfsp->f_iosize;
			sfs.f_blocks = (user32_long_t)sfsp->f_blocks;
			sfs.f_bfree = (user32_long_t)sfsp->f_bfree;
			sfs.f_bavail = (user32_long_t)sfsp->f_bavail;
		}
		sfs.f_files = (user32_long_t)sfsp->f_files;
		sfs.f_ffree = (user32_long_t)sfsp->f_ffree;
		sfs.f_fsid = sfsp->f_fsid;
		sfs.f_owner = sfsp->f_owner;
		vfs_getfstypename(mp, sfs.f_fstypename, MFSNAMELEN);
		strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
		strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);

		if (partial_copy) {
			/* Legacy callers: omit the trailing reserved fields. */
			copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
		}
		error = copyout((caddr_t)&sfs, bufp, copy_size);
	}

	if (sizep != NULL) {
		/* Always the full native size, even for a partial copy. */
		*sizep = my_size;
	}
	return error;
}
13781
/*
 * copy stat structure into user_stat structure.
 * Field-for-field ABI marshaling; do not reorder casually.
 */
void
munge_user64_stat(struct stat *sbp, struct user64_stat *usbp)
{
	/* Zero first so padding/unset fields don't leak kernel memory. */
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
#ifndef _POSIX_C_SOURCE
	/* Full timespec layout. */
	usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
#else
	/* POSIX flat time fields. */
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
13821
/*
 * Copy stat structure into the 32-bit user_stat layout, narrowing time
 * fields to the 32-bit user types.  Field-for-field ABI marshaling.
 */
void
munge_user32_stat(struct stat *sbp, struct user32_stat *usbp)
{
	/* Zero first so padding/unset fields don't leak kernel memory. */
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
#ifndef _POSIX_C_SOURCE
	/* Full timespec layout, truncated to 32-bit user types. */
	usbp->st_atimespec.tv_sec = (user32_time_t)sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = (user32_long_t)sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = (user32_time_t)sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = (user32_long_t)sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = (user32_time_t)sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = (user32_long_t)sbp->st_ctimespec.tv_nsec;
#else
	/* POSIX flat time fields. */
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
13858
/*
 * copy stat64 structure into user_stat64 structure.
 * Like munge_user64_stat(), plus the birthtime fields.
 */
void
munge_user64_stat64(struct stat64 *sbp, struct user64_stat64 *usbp)
{
	/* Zero first so padding/unset fields don't leak kernel memory. */
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
#ifndef _POSIX_C_SOURCE
	/* Full timespec layout. */
	usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
	usbp->st_birthtimespec.tv_sec = sbp->st_birthtimespec.tv_sec;
	usbp->st_birthtimespec.tv_nsec = sbp->st_birthtimespec.tv_nsec;
#else
	/* POSIX flat time fields. */
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
	usbp->st_birthtime = sbp->st_birthtime;
	usbp->st_birthtimensec = sbp->st_birthtimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
13902
/*
 * Copy stat64 structure into the 32-bit user_stat64 layout, narrowing
 * time fields to the 32-bit user types.  Includes birthtime fields.
 */
void
munge_user32_stat64(struct stat64 *sbp, struct user32_stat64 *usbp)
{
	/* Zero first so padding/unset fields don't leak kernel memory. */
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
#ifndef _POSIX_C_SOURCE
	/* Full timespec layout, truncated to 32-bit user types. */
	usbp->st_atimespec.tv_sec = (user32_time_t)sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = (user32_long_t)sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = (user32_time_t)sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = (user32_long_t)sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = (user32_time_t)sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = (user32_long_t)sbp->st_ctimespec.tv_nsec;
	usbp->st_birthtimespec.tv_sec = (user32_time_t)sbp->st_birthtimespec.tv_sec;
	usbp->st_birthtimespec.tv_nsec = (user32_long_t)sbp->st_birthtimespec.tv_nsec;
#else
	/* POSIX flat time fields. */
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
	usbp->st_birthtime = sbp->st_birthtime;
	usbp->st_birthtimensec = sbp->st_birthtimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
13943
13944 /*
13945 * Purge buffer cache for simulating cold starts
13946 */
13947 static int
vnode_purge_callback(struct vnode * vp,__unused void * cargs)13948 vnode_purge_callback(struct vnode *vp, __unused void *cargs)
13949 {
13950 ubc_msync(vp, (off_t)0, ubc_getsize(vp), NULL /* off_t *resid_off */, UBC_PUSHALL | UBC_INVALIDATE);
13951
13952 return VNODE_RETURNED;
13953 }
13954
13955 static int
vfs_purge_callback(mount_t mp,__unused void * arg)13956 vfs_purge_callback(mount_t mp, __unused void * arg)
13957 {
13958 vnode_iterate(mp, VNODE_WAIT | VNODE_ITERATE_ALL, vnode_purge_callback, NULL);
13959
13960 return VFS_RETURNED;
13961 }
13962
/* Boot-arg / sysctl (vfs.purge_vm_pagers): vfs_purge() also purges file-backed VM pagers. */
static TUNABLE_WRITEABLE(boolean_t, vfs_purge_vm_pagers, "vfs_purge_vm_pagers", TRUE);
SYSCTL_INT(_vfs, OID_AUTO, purge_vm_pagers, CTLFLAG_RW | CTLFLAG_LOCKED, &vfs_purge_vm_pagers, 0, "VFS purge also purges file-backed VM pagers");
13965
13966 int
vfs_purge(__unused struct proc * p,__unused struct vfs_purge_args * uap,__unused int32_t * retval)13967 vfs_purge(__unused struct proc *p, __unused struct vfs_purge_args *uap, __unused int32_t *retval)
13968 {
13969 if (!kauth_cred_issuser(kauth_cred_get())) {
13970 return EPERM;
13971 }
13972
13973 vfs_iterate(0 /* flags */, vfs_purge_callback, NULL);
13974
13975 /* also flush any VM pagers backed by files */
13976 if (vfs_purge_vm_pagers) {
13977 vm_purge_filebacked_pagers();
13978 }
13979
13980 return 0;
13981 }
13982
13983 /*
13984 * gets the vnode associated with the (unnamed) snapshot directory
13985 * for a Filesystem. The snapshot directory vnode is returned with
13986 * an iocount on it.
13987 */
13988 int
vnode_get_snapdir(vnode_t rvp,vnode_t * sdvpp,vfs_context_t ctx)13989 vnode_get_snapdir(vnode_t rvp, vnode_t *sdvpp, vfs_context_t ctx)
13990 {
13991 return VFS_VGET_SNAPDIR(vnode_mount(rvp), sdvpp, ctx);
13992 }
13993
13994 /*
13995 * Get the snapshot vnode.
13996 *
13997 * If successful, the call returns with an iocount on *rvpp ,*sdvpp and
13998 * needs nameidone() on ndp.
13999 *
14000 * If the snapshot vnode exists it is returned in ndp->ni_vp.
14001 *
14002 * If it returns with an error, *rvpp, *sdvpp are NULL and nameidone() is
14003 * not needed.
14004 */
14005 static int
vnode_get_snapshot(int dirfd,vnode_t * rvpp,vnode_t * sdvpp,user_addr_t name,struct nameidata * ndp,int32_t op,__unused enum path_operation pathop,vfs_context_t ctx)14006 vnode_get_snapshot(int dirfd, vnode_t *rvpp, vnode_t *sdvpp,
14007 user_addr_t name, struct nameidata *ndp, int32_t op,
14008 #if !CONFIG_TRIGGERS
14009 __unused
14010 #endif
14011 enum path_operation pathop,
14012 vfs_context_t ctx)
14013 {
14014 int error, i;
14015 caddr_t name_buf;
14016 size_t name_len;
14017 struct vfs_attr vfa;
14018
14019 *sdvpp = NULLVP;
14020 *rvpp = NULLVP;
14021
14022 error = vnode_getfromfd(ctx, dirfd, rvpp);
14023 if (error) {
14024 return error;
14025 }
14026
14027 if (!vnode_isvroot(*rvpp)) {
14028 error = EINVAL;
14029 goto out;
14030 }
14031
14032 /* Make sure the filesystem supports snapshots */
14033 VFSATTR_INIT(&vfa);
14034 VFSATTR_WANTED(&vfa, f_capabilities);
14035 if ((vfs_getattr(vnode_mount(*rvpp), &vfa, ctx) != 0) ||
14036 !VFSATTR_IS_SUPPORTED(&vfa, f_capabilities) ||
14037 !((vfa.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] &
14038 VOL_CAP_INT_SNAPSHOT)) ||
14039 !((vfa.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] &
14040 VOL_CAP_INT_SNAPSHOT))) {
14041 error = ENOTSUP;
14042 goto out;
14043 }
14044
14045 error = vnode_get_snapdir(*rvpp, sdvpp, ctx);
14046 if (error) {
14047 goto out;
14048 }
14049
14050 name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
14051 error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
14052 if (error) {
14053 goto out1;
14054 }
14055
14056 /*
14057 * Some sanity checks- name can't be empty, "." or ".." or have slashes.
14058 * (the length returned by copyinstr includes the terminating NUL)
14059 */
14060 if ((name_len == 1) || (name_len == 2 && name_buf[0] == '.') ||
14061 (name_len == 3 && name_buf[0] == '.' && name_buf[1] == '.')) {
14062 error = EINVAL;
14063 goto out1;
14064 }
14065 for (i = 0; i < (int)name_len && name_buf[i] != '/'; i++) {
14066 ;
14067 }
14068 if (i < (int)name_len) {
14069 error = EINVAL;
14070 goto out1;
14071 }
14072
14073 #if CONFIG_MACF
14074 if (op == CREATE) {
14075 error = mac_mount_check_snapshot_create(ctx, vnode_mount(*rvpp),
14076 name_buf);
14077 } else if (op == DELETE) {
14078 error = mac_mount_check_snapshot_delete(ctx, vnode_mount(*rvpp),
14079 name_buf);
14080 }
14081 if (error) {
14082 goto out1;
14083 }
14084 #endif
14085
14086 /* Check if the snapshot already exists ... */
14087 NDINIT(ndp, op, pathop, USEDVP | NOCACHE | AUDITVNPATH1,
14088 UIO_SYSSPACE, CAST_USER_ADDR_T(name_buf), ctx);
14089 ndp->ni_dvp = *sdvpp;
14090
14091 error = namei(ndp);
14092 out1:
14093 zfree(ZV_NAMEI, name_buf);
14094 out:
14095 if (error) {
14096 if (*sdvpp) {
14097 vnode_put(*sdvpp);
14098 *sdvpp = NULLVP;
14099 }
14100 if (*rvpp) {
14101 vnode_put(*rvpp);
14102 *rvpp = NULLVP;
14103 }
14104 }
14105 return error;
14106 }
14107
14108 /*
14109 * create a filesystem snapshot (for supporting filesystems)
14110 *
14111 * A much simplified version of openat(dirfd, name, O_CREAT | O_EXCL)
14112 * We get to the (unnamed) snapshot directory vnode and create the vnode
14113 * for the snapshot in it.
14114 *
14115 * Restrictions:
14116 *
14117 * a) Passed in name for snapshot cannot have slashes.
14118 * b) name can't be "." or ".."
14119 *
14120 * Since this requires superuser privileges, vnode_authorize calls are not
14121 * made.
14122 */
14123 static int __attribute__((noinline))
snapshot_create(int dirfd,user_addr_t name,__unused uint32_t flags,vfs_context_t ctx)14124 snapshot_create(int dirfd, user_addr_t name, __unused uint32_t flags,
14125 vfs_context_t ctx)
14126 {
14127 vnode_t rvp, snapdvp;
14128 int error;
14129 struct nameidata *ndp;
14130
14131 ndp = kalloc_type(struct nameidata, Z_WAITOK);
14132
14133 error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, ndp, CREATE,
14134 OP_LINK, ctx);
14135 if (error) {
14136 goto out;
14137 }
14138
14139 if (ndp->ni_vp) {
14140 vnode_put(ndp->ni_vp);
14141 error = EEXIST;
14142 } else {
14143 struct vnode_attr *vap;
14144 vnode_t vp = NULLVP;
14145
14146 vap = kalloc_type(struct vnode_attr, Z_WAITOK);
14147
14148 VATTR_INIT(vap);
14149 VATTR_SET(vap, va_type, VREG);
14150 VATTR_SET(vap, va_mode, 0);
14151
14152 error = vn_create(snapdvp, &vp, ndp, vap,
14153 VN_CREATE_NOAUTH | VN_CREATE_NOINHERIT, 0, NULL, ctx);
14154 if (!error && vp) {
14155 vnode_put(vp);
14156 }
14157
14158 kfree_type(struct vnode_attr, vap);
14159 }
14160
14161 nameidone(ndp);
14162 vnode_put(snapdvp);
14163 vnode_put(rvp);
14164 out:
14165 kfree_type(struct nameidata, ndp);
14166
14167 return error;
14168 }
14169
14170 /*
14171 * Delete a Filesystem snapshot
14172 *
14173 * get the vnode for the unnamed snapshot directory and the snapshot and
14174 * delete the snapshot.
14175 */
14176 static int __attribute__((noinline))
snapshot_delete(int dirfd,user_addr_t name,__unused uint32_t flags,vfs_context_t ctx)14177 snapshot_delete(int dirfd, user_addr_t name, __unused uint32_t flags,
14178 vfs_context_t ctx)
14179 {
14180 vnode_t rvp, snapdvp;
14181 int error;
14182 struct nameidata *ndp;
14183
14184 ndp = kalloc_type(struct nameidata, Z_WAITOK);
14185
14186 error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, ndp, DELETE,
14187 OP_UNLINK, ctx);
14188 if (error) {
14189 goto out;
14190 }
14191
14192 error = VNOP_REMOVE(snapdvp, ndp->ni_vp, &ndp->ni_cnd,
14193 VNODE_REMOVE_SKIP_NAMESPACE_EVENT, ctx);
14194
14195 vnode_put(ndp->ni_vp);
14196 nameidone(ndp);
14197 vnode_put(snapdvp);
14198 vnode_put(rvp);
14199 out:
14200 kfree_type(struct nameidata, ndp);
14201
14202 return error;
14203 }
14204
14205 /*
14206 * Revert a filesystem to a snapshot
14207 *
14208 * Marks the filesystem to revert to the given snapshot on next mount.
14209 */
14210 static int __attribute__((noinline))
snapshot_revert(int dirfd,user_addr_t name,__unused uint32_t flags,vfs_context_t ctx)14211 snapshot_revert(int dirfd, user_addr_t name, __unused uint32_t flags,
14212 vfs_context_t ctx)
14213 {
14214 int error;
14215 vnode_t rvp;
14216 mount_t mp;
14217 struct fs_snapshot_revert_args revert_data;
14218 struct componentname cnp;
14219 caddr_t name_buf;
14220 size_t name_len;
14221
14222 error = vnode_getfromfd(ctx, dirfd, &rvp);
14223 if (error) {
14224 return error;
14225 }
14226 mp = vnode_mount(rvp);
14227
14228 name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
14229 error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
14230 if (error) {
14231 zfree(ZV_NAMEI, name_buf);
14232 vnode_put(rvp);
14233 return error;
14234 }
14235
14236 #if CONFIG_MACF
14237 error = mac_mount_check_snapshot_revert(ctx, mp, name_buf);
14238 if (error) {
14239 zfree(ZV_NAMEI, name_buf);
14240 vnode_put(rvp);
14241 return error;
14242 }
14243 #endif
14244
14245 /*
14246 * Grab mount_iterref so that we can release the vnode,
14247 * since VFSIOC_REVERT_SNAPSHOT could conceivably cause a sync.
14248 */
14249 error = mount_iterref(mp, 0);
14250 vnode_put(rvp);
14251 if (error) {
14252 zfree(ZV_NAMEI, name_buf);
14253 return error;
14254 }
14255
14256 memset(&cnp, 0, sizeof(cnp));
14257 cnp.cn_pnbuf = (char *)name_buf;
14258 cnp.cn_nameiop = LOOKUP;
14259 cnp.cn_flags = ISLASTCN | HASBUF;
14260 cnp.cn_pnlen = MAXPATHLEN;
14261 cnp.cn_nameptr = cnp.cn_pnbuf;
14262 cnp.cn_namelen = (int)name_len;
14263 revert_data.sr_cnp = &cnp;
14264
14265 error = VFS_IOCTL(mp, VFSIOC_REVERT_SNAPSHOT, (caddr_t)&revert_data, 0, ctx);
14266 mount_iterdrop(mp);
14267 zfree(ZV_NAMEI, name_buf);
14268
14269 if (error) {
14270 /* If there was any error, try again using VNOP_IOCTL */
14271
14272 vnode_t snapdvp;
14273 struct nameidata namend;
14274
14275 error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, &namend, LOOKUP,
14276 OP_LOOKUP, ctx);
14277 if (error) {
14278 return error;
14279 }
14280
14281
14282 error = VNOP_IOCTL(namend.ni_vp, APFSIOC_REVERT_TO_SNAPSHOT, (caddr_t) NULL,
14283 0, ctx);
14284
14285 vnode_put(namend.ni_vp);
14286 nameidone(&namend);
14287 vnode_put(snapdvp);
14288 vnode_put(rvp);
14289 }
14290
14291 return error;
14292 }
14293
14294 /*
14295 * rename a Filesystem snapshot
14296 *
14297 * get the vnode for the unnamed snapshot directory and the snapshot and
14298 * rename the snapshot. This is a very specialised (and simple) case of
14299 * rename(2) (which has to deal with a lot more complications). It differs
14300 * slightly from rename(2) in that EEXIST is returned if the new name exists.
14301 */
14302 static int __attribute__((noinline))
snapshot_rename(int dirfd,user_addr_t old,user_addr_t new,__unused uint32_t flags,vfs_context_t ctx)14303 snapshot_rename(int dirfd, user_addr_t old, user_addr_t new,
14304 __unused uint32_t flags, vfs_context_t ctx)
14305 {
14306 vnode_t rvp, snapdvp;
14307 int error, i;
14308 caddr_t newname_buf;
14309 size_t name_len;
14310 vnode_t fvp;
14311 struct nameidata *fromnd, *tond;
14312 /* carving out a chunk for structs that are too big to be on stack. */
14313 struct {
14314 struct nameidata from_node;
14315 struct nameidata to_node;
14316 } * __rename_data;
14317
14318 __rename_data = kalloc_type(typeof(*__rename_data), Z_WAITOK);
14319 fromnd = &__rename_data->from_node;
14320 tond = &__rename_data->to_node;
14321
14322 error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, old, fromnd, DELETE,
14323 OP_UNLINK, ctx);
14324 if (error) {
14325 goto out;
14326 }
14327 fvp = fromnd->ni_vp;
14328
14329 newname_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
14330 error = copyinstr(new, newname_buf, MAXPATHLEN, &name_len);
14331 if (error) {
14332 goto out1;
14333 }
14334
14335 /*
14336 * Some sanity checks- new name can't be empty, "." or ".." or have
14337 * slashes.
14338 * (the length returned by copyinstr includes the terminating NUL)
14339 *
14340 * The FS rename VNOP is suppossed to handle this but we'll pick it
14341 * off here itself.
14342 */
14343 if ((name_len == 1) || (name_len == 2 && newname_buf[0] == '.') ||
14344 (name_len == 3 && newname_buf[0] == '.' && newname_buf[1] == '.')) {
14345 error = EINVAL;
14346 goto out1;
14347 }
14348 for (i = 0; i < (int)name_len && newname_buf[i] != '/'; i++) {
14349 ;
14350 }
14351 if (i < (int)name_len) {
14352 error = EINVAL;
14353 goto out1;
14354 }
14355
14356 #if CONFIG_MACF
14357 error = mac_mount_check_snapshot_create(ctx, vnode_mount(rvp),
14358 newname_buf);
14359 if (error) {
14360 goto out1;
14361 }
14362 #endif
14363
14364 NDINIT(tond, RENAME, OP_RENAME, USEDVP | NOCACHE | AUDITVNPATH2,
14365 UIO_SYSSPACE, CAST_USER_ADDR_T(newname_buf), ctx);
14366 tond->ni_dvp = snapdvp;
14367
14368 error = namei(tond);
14369 if (error) {
14370 goto out2;
14371 } else if (tond->ni_vp) {
14372 /*
14373 * snapshot rename behaves differently than rename(2) - if the
14374 * new name exists, EEXIST is returned.
14375 */
14376 vnode_put(tond->ni_vp);
14377 error = EEXIST;
14378 goto out2;
14379 }
14380
14381 error = VNOP_RENAME(snapdvp, fvp, &fromnd->ni_cnd, snapdvp, NULLVP,
14382 &tond->ni_cnd, ctx);
14383
14384 out2:
14385 nameidone(tond);
14386 out1:
14387 zfree(ZV_NAMEI, newname_buf);
14388 vnode_put(fvp);
14389 vnode_put(snapdvp);
14390 vnode_put(rvp);
14391 nameidone(fromnd);
14392 out:
14393 kfree_type(typeof(*__rename_data), __rename_data);
14394 return error;
14395 }
14396
14397 /*
14398 * Mount a Filesystem snapshot
14399 *
14400 * get the vnode for the unnamed snapshot directory and the snapshot and
14401 * mount the snapshot.
14402 */
14403 static int __attribute__((noinline))
snapshot_mount(int dirfd,user_addr_t name,user_addr_t directory,__unused user_addr_t mnt_data,__unused uint32_t flags,vfs_context_t ctx)14404 snapshot_mount(int dirfd, user_addr_t name, user_addr_t directory,
14405 __unused user_addr_t mnt_data, __unused uint32_t flags, vfs_context_t ctx)
14406 {
14407 mount_t mp;
14408 vnode_t rvp, snapdvp, snapvp, vp, pvp;
14409 struct fs_snapshot_mount_args smnt_data;
14410 int error;
14411 struct nameidata *snapndp, *dirndp;
14412 /* carving out a chunk for structs that are too big to be on stack. */
14413 struct {
14414 struct nameidata snapnd;
14415 struct nameidata dirnd;
14416 } * __snapshot_mount_data;
14417
14418 __snapshot_mount_data = kalloc_type(typeof(*__snapshot_mount_data), Z_WAITOK);
14419 snapndp = &__snapshot_mount_data->snapnd;
14420 dirndp = &__snapshot_mount_data->dirnd;
14421
14422 error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, snapndp, LOOKUP,
14423 OP_LOOKUP, ctx);
14424 if (error) {
14425 goto out;
14426 }
14427
14428 snapvp = snapndp->ni_vp;
14429 if (!vnode_mount(rvp) || (vnode_mount(rvp) == dead_mountp)) {
14430 error = EIO;
14431 goto out1;
14432 }
14433
14434 /* Get the vnode to be covered */
14435 NDINIT(dirndp, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
14436 UIO_USERSPACE, directory, ctx);
14437 error = namei(dirndp);
14438 if (error) {
14439 goto out1;
14440 }
14441
14442 vp = dirndp->ni_vp;
14443 pvp = dirndp->ni_dvp;
14444 mp = vnode_mount(rvp);
14445
14446 if ((vp->v_flag & VROOT) && (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
14447 error = EINVAL;
14448 goto out2;
14449 }
14450
14451 #if CONFIG_MACF
14452 error = mac_mount_check_snapshot_mount(ctx, rvp, vp, &dirndp->ni_cnd, snapndp->ni_cnd.cn_nameptr,
14453 mp->mnt_vfsstat.f_fstypename);
14454 if (error) {
14455 goto out2;
14456 }
14457 #endif
14458
14459 smnt_data.sm_mp = mp;
14460 smnt_data.sm_cnp = &snapndp->ni_cnd;
14461 error = mount_common(mp->mnt_vfsstat.f_fstypename, pvp, vp,
14462 &dirndp->ni_cnd, CAST_USER_ADDR_T(&smnt_data), flags & (MNT_DONTBROWSE | MNT_IGNORE_OWNERSHIP),
14463 KERNEL_MOUNT_SNAPSHOT, NULL, ctx);
14464
14465 out2:
14466 vnode_put(vp);
14467 vnode_put(pvp);
14468 nameidone(dirndp);
14469 out1:
14470 vnode_put(snapvp);
14471 vnode_put(snapdvp);
14472 vnode_put(rvp);
14473 nameidone(snapndp);
14474 out:
14475 kfree_type(typeof(*__snapshot_mount_data), __snapshot_mount_data);
14476 return error;
14477 }
14478
14479 /*
14480 * Root from a snapshot of the filesystem
14481 *
14482 * Marks the filesystem to root from the given snapshot on next boot.
14483 */
14484 static int __attribute__((noinline))
snapshot_root(int dirfd,user_addr_t name,__unused uint32_t flags,vfs_context_t ctx)14485 snapshot_root(int dirfd, user_addr_t name, __unused uint32_t flags,
14486 vfs_context_t ctx)
14487 {
14488 int error;
14489 vnode_t rvp;
14490 mount_t mp;
14491 struct fs_snapshot_root_args root_data;
14492 struct componentname cnp;
14493 caddr_t name_buf;
14494 size_t name_len;
14495
14496 error = vnode_getfromfd(ctx, dirfd, &rvp);
14497 if (error) {
14498 return error;
14499 }
14500 mp = vnode_mount(rvp);
14501
14502 name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
14503 error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
14504 if (error) {
14505 zfree(ZV_NAMEI, name_buf);
14506 vnode_put(rvp);
14507 return error;
14508 }
14509
14510 // XXX MAC checks ?
14511
14512 /*
14513 * Grab mount_iterref so that we can release the vnode,
14514 * since VFSIOC_ROOT_SNAPSHOT could conceivably cause a sync.
14515 */
14516 error = mount_iterref(mp, 0);
14517 vnode_put(rvp);
14518 if (error) {
14519 zfree(ZV_NAMEI, name_buf);
14520 return error;
14521 }
14522
14523 memset(&cnp, 0, sizeof(cnp));
14524 cnp.cn_pnbuf = (char *)name_buf;
14525 cnp.cn_nameiop = LOOKUP;
14526 cnp.cn_flags = ISLASTCN | HASBUF;
14527 cnp.cn_pnlen = MAXPATHLEN;
14528 cnp.cn_nameptr = cnp.cn_pnbuf;
14529 cnp.cn_namelen = (int)name_len;
14530 root_data.sr_cnp = &cnp;
14531
14532 error = VFS_IOCTL(mp, VFSIOC_ROOT_SNAPSHOT, (caddr_t)&root_data, 0, ctx);
14533
14534 mount_iterdrop(mp);
14535 zfree(ZV_NAMEI, name_buf);
14536
14537 return error;
14538 }
14539
14540 static boolean_t
vfs_context_can_snapshot(vfs_context_t ctx)14541 vfs_context_can_snapshot(vfs_context_t ctx)
14542 {
14543 static const char * const snapshot_entitlements[] = {
14544 "com.apple.private.vfs.snapshot",
14545 "com.apple.developer.vfs.snapshot",
14546 "com.apple.private.apfs.arv.limited.snapshot",
14547 };
14548 static const size_t nentitlements =
14549 sizeof(snapshot_entitlements) / sizeof(snapshot_entitlements[0]);
14550 size_t i;
14551
14552 task_t task = vfs_context_task(ctx);
14553 for (i = 0; i < nentitlements; i++) {
14554 if (IOTaskHasEntitlement(task, snapshot_entitlements[i])) {
14555 return TRUE;
14556 }
14557 }
14558 return FALSE;
14559 }
14560
14561 /*
14562 * FS snapshot operations dispatcher
14563 */
14564 int
fs_snapshot(__unused proc_t p,struct fs_snapshot_args * uap,__unused int32_t * retval)14565 fs_snapshot(__unused proc_t p, struct fs_snapshot_args *uap,
14566 __unused int32_t *retval)
14567 {
14568 int error;
14569 vfs_context_t ctx = vfs_context_current();
14570
14571 AUDIT_ARG(fd, uap->dirfd);
14572 AUDIT_ARG(value32, uap->op);
14573
14574 if (!vfs_context_can_snapshot(ctx)) {
14575 return EPERM;
14576 }
14577
14578 /*
14579 * Enforce user authorization for snapshot modification operations,
14580 * or if trying to root from snapshot.
14581 */
14582 if (uap->op != SNAPSHOT_OP_MOUNT) {
14583 vnode_t dvp = NULLVP;
14584 vnode_t devvp = NULLVP;
14585 mount_t mp;
14586
14587 error = vnode_getfromfd(ctx, uap->dirfd, &dvp);
14588 if (error) {
14589 return error;
14590 }
14591 mp = vnode_mount(dvp);
14592 devvp = mp->mnt_devvp;
14593
14594 /* get an iocount on devvp */
14595 if (devvp == NULLVP) {
14596 error = vnode_lookup(mp->mnt_vfsstat.f_mntfromname, 0, &devvp, ctx);
14597 /* for mounts which arent block devices */
14598 if (error == ENOENT) {
14599 error = ENXIO;
14600 }
14601 } else {
14602 error = vnode_getwithref(devvp);
14603 }
14604
14605 if (error) {
14606 vnode_put(dvp);
14607 return error;
14608 }
14609
14610 if ((vfs_context_issuser(ctx) == 0) &&
14611 (vnode_authorize(devvp, NULL, KAUTH_VNODE_WRITE_DATA, ctx) != 0) &&
14612 (!IOTaskHasEntitlement(vfs_context_task(ctx), "com.apple.private.vfs.snapshot.user"))) {
14613 error = EPERM;
14614 }
14615 vnode_put(dvp);
14616 vnode_put(devvp);
14617
14618 if (error) {
14619 return error;
14620 }
14621 }
14622
14623 switch (uap->op) {
14624 case SNAPSHOT_OP_CREATE:
14625 error = snapshot_create(uap->dirfd, uap->name1, uap->flags, ctx);
14626 break;
14627 case SNAPSHOT_OP_DELETE:
14628 error = snapshot_delete(uap->dirfd, uap->name1, uap->flags, ctx);
14629 break;
14630 case SNAPSHOT_OP_RENAME:
14631 error = snapshot_rename(uap->dirfd, uap->name1, uap->name2,
14632 uap->flags, ctx);
14633 break;
14634 case SNAPSHOT_OP_MOUNT:
14635 error = snapshot_mount(uap->dirfd, uap->name1, uap->name2,
14636 uap->data, uap->flags, ctx);
14637 break;
14638 case SNAPSHOT_OP_REVERT:
14639 error = snapshot_revert(uap->dirfd, uap->name1, uap->flags, ctx);
14640 break;
14641 #if CONFIG_MNT_ROOTSNAP
14642 case SNAPSHOT_OP_ROOT:
14643 error = snapshot_root(uap->dirfd, uap->name1, uap->flags, ctx);
14644 break;
14645 #endif /* CONFIG_MNT_ROOTSNAP */
14646 default:
14647 error = ENOSYS;
14648 }
14649
14650 return error;
14651 }
14652