1 /*
2 * Copyright (c) 1995-2022 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * Copyright (c) 1989, 1993
30 * The Regents of the University of California. All rights reserved.
31 * (c) UNIX System Laboratories, Inc.
32 * All or some portions of this file are derived from material licensed
33 * to the University of California by American Telephone and Telegraph
34 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
35 * the permission of UNIX System Laboratories, Inc.
36 *
37 * Redistribution and use in source and binary forms, with or without
38 * modification, are permitted provided that the following conditions
39 * are met:
40 * 1. Redistributions of source code must retain the above copyright
41 * notice, this list of conditions and the following disclaimer.
42 * 2. Redistributions in binary form must reproduce the above copyright
43 * notice, this list of conditions and the following disclaimer in the
44 * documentation and/or other materials provided with the distribution.
45 * 3. All advertising materials mentioning features or use of this software
46 * must display the following acknowledgement:
47 * This product includes software developed by the University of
48 * California, Berkeley and its contributors.
49 * 4. Neither the name of the University nor the names of its contributors
50 * may be used to endorse or promote products derived from this software
51 * without specific prior written permission.
52 *
53 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63 * SUCH DAMAGE.
64 *
65 * @(#)vfs_syscalls.c 8.41 (Berkeley) 6/15/95
66 */
67 /*
68 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
69 * support for mandatory and extensible security protections. This notice
70 * is included in support of clause 2.2 (b) of the Apple Public License,
71 * Version 2.0.
72 */
73
74 #include <sys/param.h>
75 #include <sys/systm.h>
76 #include <sys/namei.h>
77 #include <sys/filedesc.h>
78 #include <sys/kernel.h>
79 #include <sys/file_internal.h>
80 #include <sys/stat.h>
81 #include <sys/vnode_internal.h>
82 #include <sys/mount_internal.h>
83 #include <sys/proc_internal.h>
84 #include <sys/kauth.h>
85 #include <sys/uio_internal.h>
86 #include <kern/kalloc.h>
87 #include <sys/mman.h>
88 #include <sys/dirent.h>
89 #include <sys/attr.h>
90 #include <sys/sysctl.h>
91 #include <sys/ubc.h>
92 #include <sys/quota.h>
93 #include <sys/kdebug.h>
94 #include <sys/fsevents.h>
95 #include <sys/imgsrc.h>
96 #include <sys/sysproto.h>
97 #include <sys/sysctl.h>
98 #include <sys/xattr.h>
99 #include <sys/fcntl.h>
100 #include <sys/stdio.h>
101 #include <sys/fsctl.h>
102 #include <sys/ubc_internal.h>
103 #include <sys/disk.h>
104 #include <sys/content_protection.h>
105 #include <sys/clonefile.h>
106 #include <sys/snapshot.h>
107 #include <sys/priv.h>
108 #include <sys/fsgetpath.h>
109 #include <machine/cons.h>
110 #include <machine/limits.h>
111 #include <miscfs/specfs/specdev.h>
112
113 #include <vfs/vfs_disk_conditioner.h>
114 #if CONFIG_EXCLAVES
115 #include <vfs/vfs_exclave_fs.h>
116 #endif
117
118 #include <security/audit/audit.h>
119 #include <bsm/audit_kevents.h>
120
121 #include <mach/mach_types.h>
122 #include <kern/kern_types.h>
123 #include <kern/kalloc.h>
124 #include <kern/task.h>
125
126 #include <vm/vm_pageout.h>
127 #include <vm/vm_protos.h>
128
129 #include <libkern/OSAtomic.h>
130 #include <os/atomic_private.h>
131 #include <pexpert/pexpert.h>
132 #include <IOKit/IOBSD.h>
133
134 // deps for MIG call
135 #include <kern/host.h>
136 #include <kern/ipc_misc.h>
137 #include <mach/host_priv.h>
138 #include <mach/vfs_nspace.h>
139 #include <os/log.h>
140
141 #include <nfs/nfs_conf.h>
142
143 #if ROUTEFS
144 #include <miscfs/routefs/routefs.h>
145 #endif /* ROUTEFS */
146
147 #if CONFIG_MACF
148 #include <security/mac.h>
149 #include <security/mac_framework.h>
150 #endif
151
152 #if CONFIG_FSE
153 #define GET_PATH(x) \
154 ((x) = get_pathbuff())
155 #define RELEASE_PATH(x) \
156 release_pathbuff(x)
157 #else
158 #define GET_PATH(x) \
159 ((x) = zalloc(ZV_NAMEI))
160 #define RELEASE_PATH(x) \
161 zfree(ZV_NAMEI, x)
162 #endif /* CONFIG_FSE */
163
164 #ifndef HFS_GET_BOOT_INFO
165 #define HFS_GET_BOOT_INFO (FCNTL_FS_SPECIFIC_BASE + 0x00004)
166 #endif
167
168 #ifndef HFS_SET_BOOT_INFO
169 #define HFS_SET_BOOT_INFO (FCNTL_FS_SPECIFIC_BASE + 0x00005)
170 #endif
171
172 #ifndef APFSIOC_REVERT_TO_SNAPSHOT
173 #define APFSIOC_REVERT_TO_SNAPSHOT _IOW('J', 1, u_int64_t)
174 #endif
175
176 extern void disk_conditioner_unmount(mount_t mp);
177
178 /* struct for checkdirs iteration */
179 struct cdirargs {
180 vnode_t olddp;
181 vnode_t newdp;
182 };
183 /* callback for checkdirs iteration */
184 static int checkdirs_callback(proc_t p, void * arg);
185
186 static int change_dir(struct nameidata *ndp, vfs_context_t ctx);
187 static int checkdirs(vnode_t olddp, vfs_context_t ctx);
188 void enablequotas(struct mount *mp, vfs_context_t ctx);
189 static int getfsstat_callback(mount_t mp, void * arg);
190 static int getutimes(user_addr_t usrtvp, struct timespec *tsp);
191 static int setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts, int nullflag);
192 static int sync_callback(mount_t, void *);
193 static int munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
194 user_addr_t bufp, int *sizep, boolean_t is_64_bit,
195 boolean_t partial_copy);
196 static int fsync_common(proc_t p, struct fsync_args *uap, int flags);
197 static int mount_common(const char *fstypename, vnode_t pvp, vnode_t vp,
198 struct componentname *cnp, user_addr_t fsmountargs,
199 int flags, uint32_t internal_flags, char *labelstr, vfs_context_t ctx);
200 void vfs_notify_mount(vnode_t pdvp);
201
202 int prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, uint32_t internal_flags);
203
204 struct fd_vn_data * fg_vn_data_alloc(void);
205
206 /*
207 * Max retries for ENOENT returns from vn_authorize_{rmdir, unlink, rename}
208 * Concurrent lookups (or lookups by ids) on hard links can cause the
209 * vn_getpath (which does not re-enter the filesystem as vn_getpath_fsenter
210 * does) to return ENOENT as the path cannot be returned from the name cache
211 * alone. We have no option but to retry and hope to get one namei->reverse path
212 * generation done without an intervening lookup, lookup by id on the hard link
213 * item. This is only an issue for MAC hooks which cannot reenter the filesystem
214 * which currently are the MAC hooks for rename, unlink and rmdir.
215 */
216 #define MAX_AUTHORIZE_ENOENT_RETRIES 1024
217
218 /* Max retry limit for rename due to vnode recycling. */
219 #define MAX_RENAME_ERECYCLE_RETRIES 1024
220
221 /* Max retries for concurrent mounts on the same covered vnode. */
222 #define MAX_MOUNT_RETRIES 10
223
224 static int rmdirat_internal(vfs_context_t, int, user_addr_t, enum uio_seg,
225 int unlink_flags);
226
227 #ifdef CONFIG_IMGSRC_ACCESS
228 static int authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx);
229 static int place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx);
230 static void undo_place_on_covered_vp(mount_t mp, vnode_t vp);
231 static int mount_begin_update(mount_t mp, vfs_context_t ctx, int flags);
232 static void mount_end_update(mount_t mp);
233 static int relocate_imageboot_source(vnode_t pvp, vnode_t vp, struct componentname *cnp, const char *fsname, vfs_context_t ctx, boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index);
234 #endif /* CONFIG_IMGSRC_ACCESS */
235
236 //snapshot functions
237 #if CONFIG_MNT_ROOTSNAP
238 static int __attribute__ ((noinline)) snapshot_root(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx);
239 #else
240 static int __attribute__ ((noinline)) snapshot_root(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx) __attribute__((unused));
241 #endif
242
243 __private_extern__
244 int sync_internal(void);
245
246 __private_extern__
247 int unlink1(vfs_context_t, vnode_t, user_addr_t, enum uio_seg, int);
248
249 static LCK_GRP_DECLARE(fd_vn_lck_grp, "fd_vnode_data");
250 static LCK_ATTR_DECLARE(fd_vn_lck_attr, 0, 0);
251
252 /* vars for sync mutex */
253 static LCK_GRP_DECLARE(sync_mtx_lck_grp, "sync thread");
254 static LCK_MTX_DECLARE(sync_mtx_lck, &sync_mtx_lck_grp);
255
256 extern lck_rw_t rootvnode_rw_lock;
257
258 VFS_SMR_DECLARE;
259 extern uint32_t nc_smr_enabled;
260
261 /*
262 * incremented each time a mount or unmount operation occurs
263 * used to invalidate the cached value of the rootvp in the
264 * mount structure utilized by cache_lookup_path
265 */
266 uint32_t mount_generation = 0;
267
268 /* counts number of mount and unmount operations */
269 unsigned int vfs_nummntops = 0;
270
271 /* system-wide, per-boot unique mount ID */
272 static _Atomic uint64_t mount_unique_id = 1;
273
274 extern const struct fileops vnops;
275 #if CONFIG_APPLEDOUBLE
276 extern errno_t rmdir_remove_orphaned_appleDouble(vnode_t, vfs_context_t, int *);
277 #endif /* CONFIG_APPLEDOUBLE */
278
279 /* Maximum buffer length supported by fsgetpath(2) */
280 #define FSGETPATH_MAXBUFLEN 8192
281
282 /*
283 * Virtual File System System Calls
284 */
285
286 /*
287 * Private in-kernel mounting spi (specific use-cases only)
288 */
289 boolean_t
vfs_iskernelmount(mount_t mp)290 vfs_iskernelmount(mount_t mp)
291 {
292 return (mp->mnt_kern_flag & MNTK_KERNEL_MOUNT) ? TRUE : FALSE;
293 }
294
/*
 * Mount a filesystem from within the kernel.
 *
 * Parameters:
 *	fstype		vfs name of the filesystem to mount
 *	pvp		parent of the vnode to be covered; ignored/overwritten
 *			when vp is NULLVP
 *	vp		vnode to be covered, or NULLVP to resolve `path` here
 *	path		kernel-space path to the mount point
 *	data		filesystem-specific mount arguments, passed through
 *			to mount_common()
 *	datalen		unused
 *	syscall_flags	generic MNT_* mount flags
 *	kern_flags	KERNEL_MOUNT_* flags; masked with
 *			KERNEL_MOUNT_SANITIZE_MASK and tagged with
 *			KERNEL_MOUNT_KMOUNT before use
 *	ctx		caller's vfs context
 *
 * Returns:	0 on success, otherwise an errno from namei() or
 *		mount_common().
 */
__private_extern__
int
kernel_mount(const char *fstype, vnode_t pvp, vnode_t vp, const char *path,
    void *data, __unused size_t datalen, int syscall_flags, uint32_t kern_flags,
    vfs_context_t ctx)
{
	struct nameidata nd;
	boolean_t did_namei;
	int error;

	NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(path), ctx);

	/* Restrict callers to the set of flags this interface supports. */
	kern_flags &= KERNEL_MOUNT_SANITIZE_MASK;

	/*
	 * Get the vnode to be covered if it's not supplied
	 */
	if (vp == NULLVP) {
		error = namei(&nd);
		if (error) {
			if (kern_flags & (KERNEL_MOUNT_SNAPSHOT | KERNEL_MOUNT_VOLBYROLE_MASK)) {
				printf("failed to locate mount-on path: %s ", path);
			}
			return error;
		}
		vp = nd.ni_vp;
		pvp = nd.ni_dvp;
		did_namei = TRUE;
	} else {
		/*
		 * Caller supplied the covered vnode; only fill in the
		 * component-name path fields that mount_common() consumes.
		 */
		char *pnbuf = CAST_DOWN(char *, path);

		nd.ni_cnd.cn_pnbuf = pnbuf;
		nd.ni_cnd.cn_pnlen = (int)(strlen(pnbuf) + 1);
		did_namei = FALSE;
	}

	kern_flags |= KERNEL_MOUNT_KMOUNT;
	error = mount_common(fstype, pvp, vp, &nd.ni_cnd, CAST_USER_ADDR_T(data),
	    syscall_flags, kern_flags, NULL, ctx);

	/* Drop only the references/state that the lookup above acquired. */
	if (did_namei) {
		vnode_put(vp);
		vnode_put(pvp);
		nameidone(&nd);
	}

	return error;
}
344
345 int
vfs_mount_at_path(const char * fstype,const char * path,vnode_t pvp,vnode_t vp,void * data,size_t datalen,int mnt_flags,int flags)346 vfs_mount_at_path(const char *fstype, const char *path,
347 vnode_t pvp, vnode_t vp, void *data, size_t datalen,
348 int mnt_flags, int flags)
349 {
350 int syscall_flags = MNT_AUTOMOUNTED | mnt_flags;
351 int error, km_flags = 0;
352 vfs_context_t ctx = (flags & VFS_MOUNT_FLAG_CURRENT_CONTEXT) ? vfs_context_current() : vfs_context_kernel();
353
354 /*
355 * This call is currently restricted to specific use cases.
356 */
357 if ((strcmp(fstype, "lifs") != 0) && (strcmp(fstype, "nfs") != 0)) {
358 return ENOTSUP;
359 }
360
361 #if !defined(XNU_TARGET_OS_OSX)
362 if (strcmp(fstype, "lifs") == 0) {
363 syscall_flags |= MNT_NOEXEC;
364 }
365 #endif
366
367 if (flags & VFS_MOUNT_FLAG_NOAUTH) {
368 km_flags |= KERNEL_MOUNT_NOAUTH;
369 }
370 if (flags & VFS_MOUNT_FLAG_PERMIT_UNMOUNT) {
371 km_flags |= KERNEL_MOUNT_PERMIT_UNMOUNT;
372 }
373
374 error = kernel_mount(fstype, pvp, vp, path, data, datalen,
375 syscall_flags, km_flags, ctx);
376 if (error) {
377 printf("%s: mount on %s failed, error %d\n", __func__, path,
378 error);
379 }
380
381 return error;
382 }
383
384 /*
385 * Mount a file system.
386 */
387 /* ARGSUSED */
388 int
mount(proc_t p,struct mount_args * uap,__unused int32_t * retval)389 mount(proc_t p, struct mount_args *uap, __unused int32_t *retval)
390 {
391 struct __mac_mount_args muap;
392
393 muap.type = uap->type;
394 muap.path = uap->path;
395 muap.flags = uap->flags;
396 muap.data = uap->data;
397 muap.mac_p = USER_ADDR_NULL;
398 return __mac_mount(p, &muap, retval);
399 }
400
/*
 * fmount(2): mount filesystem `uap->type` on the directory referenced by
 * file descriptor `uap->fd`, with mount flags `uap->flags` and
 * filesystem-specific arguments `uap->data`.
 *
 * Returns:	0 on success, otherwise an errno; ENOTSUP/EPERM for flag
 *		combinations this entry point refuses, EBUSY/EINVAL when a
 *		parent vnode for the mount point cannot be obtained.
 */
int
fmount(__unused proc_t p, struct fmount_args *uap, __unused int32_t *retval)
{
	struct componentname cn;
	vfs_context_t ctx = vfs_context_current();
	size_t dummy = 0;
	int error;
	int flags = uap->flags;
	char fstypename[MFSNAMELEN];
	char *labelstr = NULL; /* regular mount call always sets it to NULL for __mac_mount() */
	vnode_t pvp;
	vnode_t vp;

	AUDIT_ARG(fd, uap->fd);
	AUDIT_ARG(fflags, flags);
	/* fstypename will get audited by mount_common */

	/* Sanity check the flags */
	if (flags & (MNT_IMGSRC_BY_INDEX | MNT_ROOTFS)) {
		return ENOTSUP;
	}

	/* Union mounts are not allowed through the fd-based interface. */
	if (flags & MNT_UNION) {
		return EPERM;
	}

	error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy);
	if (error) {
		return error;
	}

	/* Resolve fd to a vnode; file_drop() must balance this on all paths. */
	if ((error = file_vnode(uap->fd, &vp)) != 0) {
		return error;
	}

	if ((error = vnode_getwithref(vp)) != 0) {
		file_drop(uap->fd);
		return error;
	}

	/* mount_common() needs the parent of the covered vnode. */
	pvp = vnode_getparent(vp);
	if (pvp == NULL) {
		/* EBUSY when vp is already covered or is a filesystem root. */
		if (vp->v_mountedhere || (vp->v_flag & VROOT) != 0) {
			error = EBUSY;
		} else {
			error = EINVAL;
		}
		vnode_put(vp);
		file_drop(uap->fd);
		return error;
	}

	/*
	 * Build a componentname holding vp's path, since this entry point
	 * has no user-supplied path to hand to mount_common().
	 */
	memset(&cn, 0, sizeof(struct componentname));
	cn.cn_pnbuf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	cn.cn_pnlen = MAXPATHLEN;

	if ((error = vn_getpath(vp, cn.cn_pnbuf, &cn.cn_pnlen)) != 0) {
		zfree(ZV_NAMEI, cn.cn_pnbuf);
		vnode_put(pvp);
		vnode_put(vp);
		file_drop(uap->fd);
		return error;
	}

	error = mount_common(fstypename, pvp, vp, &cn, uap->data, flags, KERNEL_MOUNT_FMOUNT, labelstr, ctx);

	/* Release everything acquired above, in reverse order. */
	zfree(ZV_NAMEI, cn.cn_pnbuf);
	vnode_put(pvp);
	vnode_put(vp);
	file_drop(uap->fd);

	return error;
}
474
475 #define MAX_GRAFT_METADATA_SIZE 16384 /* bytes */
476
477 /*
478 * Get the size of a graft file (a manifest or payload file).
479 * The vp should be an iocounted vnode.
480 */
481 static int
get_and_verify_graft_metadata_vp_size(vnode_t graft_vp,vfs_context_t vctx,size_t * size)482 get_and_verify_graft_metadata_vp_size(vnode_t graft_vp, vfs_context_t vctx, size_t *size)
483 {
484 struct stat64 sb = {};
485 int error;
486
487 *size = 0;
488
489 error = vn_stat(graft_vp, &sb, NULL, 1, 0, vctx);
490 if (error) {
491 return error;
492 }
493
494 if (sb.st_size == 0) {
495 error = ENODATA;
496 } else if ((size_t) sb.st_size > MAX_GRAFT_METADATA_SIZE) {
497 error = EFBIG;
498 } else {
499 *size = (size_t) sb.st_size;
500 }
501
502 return error;
503 }
504
505 /*
506 * Read in a graft file (a manifest or payload file) of size `size` into `buf`.
507 * `size` must already be validated.
508 */
509 static int
read_graft_metadata_vp(vnode_t graft_vp,vfs_context_t vctx,size_t size,void * buf)510 read_graft_metadata_vp(vnode_t graft_vp, vfs_context_t vctx, size_t size, void *buf)
511 {
512 return vn_rdwr(UIO_READ, graft_vp,
513 (caddr_t) buf, (int) size, /* offset */ 0,
514 UIO_SYSSPACE, IO_NOCACHE | IO_RAOFF | IO_UNIT,
515 vfs_context_ucred(vctx), /* resid */ NULL,
516 vfs_context_proc(vctx));
517 }
518
519 /*
520 * Convert a single graft file descriptor into a vnode, get its size (saving it to `size`),
521 * and read it into `buf`.
522 */
523 static int
graft_secureboot_read_fd(int fd,vfs_context_t vctx,size_t * size,void * buf)524 graft_secureboot_read_fd(int fd, vfs_context_t vctx, size_t *size, void *buf)
525 {
526 vnode_t metadata_vp = NULLVP;
527 int error;
528
529 // Convert this graft fd to a vnode.
530 if ((error = vnode_getfromfd(vctx, fd, &metadata_vp)) != 0) {
531 goto out;
532 }
533
534 // Get (and validate) size information.
535 if ((error = get_and_verify_graft_metadata_vp_size(metadata_vp, vctx, size)) != 0) {
536 goto out;
537 }
538
539 // Read each file into the provided buffer - we must get the expected amount of bytes.
540 if ((error = read_graft_metadata_vp(metadata_vp, vctx, *size, buf)) != 0) {
541 goto out;
542 }
543
544 out:
545 if (metadata_vp) {
546 vnode_put(metadata_vp);
547 metadata_vp = NULLVP;
548 }
549
550 return error;
551 }
552
553 /*
554 * Read graft file descriptors into buffers of size MAX_GRAFT_METADATA_SIZE
555 * provided in `gfs`, saving the size of data read in `gfs`.
556 */
557 static int
graft_secureboot_read_metadata(secure_boot_cryptex_args_t * sbc_args,vfs_context_t vctx,fsioc_graft_fs_t * gfs)558 graft_secureboot_read_metadata(secure_boot_cryptex_args_t *sbc_args, vfs_context_t vctx,
559 fsioc_graft_fs_t *gfs)
560 {
561 int error;
562
563 // Read the authentic manifest.
564 if ((error = graft_secureboot_read_fd(sbc_args->sbc_authentic_manifest_fd, vctx,
565 &gfs->authentic_manifest_size, gfs->authentic_manifest))) {
566 return error;
567 }
568
569 // The user manifest is currently unused, but set its size.
570 gfs->user_manifest_size = 0;
571
572 // Read the payload.
573 if ((error = graft_secureboot_read_fd(sbc_args->sbc_payload_fd, vctx,
574 &gfs->payload_size, gfs->payload))) {
575 return error;
576 }
577
578 return 0;
579 }
580
581 /*
582 * Call into the filesystem to verify and graft a cryptex.
583 */
584 static int
graft_secureboot_cryptex(uint32_t graft_type,secure_boot_cryptex_args_t * sbc_args,vfs_context_t vctx,vnode_t cryptex_vp,vnode_t mounton_vp)585 graft_secureboot_cryptex(uint32_t graft_type, secure_boot_cryptex_args_t *sbc_args,
586 vfs_context_t vctx, vnode_t cryptex_vp, vnode_t mounton_vp)
587 {
588 fsioc_graft_fs_t gfs = {};
589 uint64_t graft_dir_ino = 0;
590 struct stat64 sb = {};
591 int error;
592
593 // Pre-flight arguments.
594 if (sbc_args->sbc_version != GRAFTDMG_SECURE_BOOT_CRYPTEX_ARGS_VERSION) {
595 // Make sure that this graft version matches what we support.
596 return ENOTSUP;
597 } else if (mounton_vp && cryptex_vp->v_mount != mounton_vp->v_mount) {
598 // For this type, cryptex VP must live on same volume as the target of graft.
599 return EXDEV;
600 } else if (mounton_vp && mounton_vp->v_type != VDIR) {
601 // We cannot graft upon non-directories.
602 return ENOTDIR;
603 } else if (sbc_args->sbc_authentic_manifest_fd < 0 ||
604 sbc_args->sbc_payload_fd < 0) {
605 // We cannot graft without a manifest and payload.
606 return EINVAL;
607 }
608
609 if (mounton_vp) {
610 // Get the mounton's inode number.
611 error = vn_stat(mounton_vp, &sb, NULL, 1, 0, vctx);
612 if (error) {
613 return error;
614 }
615 graft_dir_ino = (uint64_t) sb.st_ino;
616 }
617
618 // Create buffers (of our maximum-defined size) to store authentication info.
619 gfs.authentic_manifest = kalloc_data(MAX_GRAFT_METADATA_SIZE, Z_WAITOK | Z_ZERO);
620 gfs.payload = kalloc_data(MAX_GRAFT_METADATA_SIZE, Z_WAITOK | Z_ZERO);
621
622 if (!gfs.authentic_manifest || !gfs.payload) {
623 error = ENOMEM;
624 goto out;
625 }
626
627 // Read our fd's into our buffers.
628 // (Note that this will set the buffer size fields in `gfs`.)
629 error = graft_secureboot_read_metadata(sbc_args, vctx, &gfs);
630 if (error) {
631 goto out;
632 }
633
634 gfs.graft_version = FSIOC_GRAFT_VERSION;
635 gfs.graft_type = graft_type;
636 gfs.graft_4cc = sbc_args->sbc_4cc;
637 if (sbc_args->sbc_flags & SBC_PRESERVE_MOUNT) {
638 gfs.graft_flags |= FSCTL_GRAFT_PRESERVE_MOUNT;
639 }
640 if (sbc_args->sbc_flags & SBC_ALTERNATE_SHARED_REGION) {
641 gfs.graft_flags |= FSCTL_GRAFT_ALTERNATE_SHARED_REGION;
642 }
643 if (sbc_args->sbc_flags & SBC_SYSTEM_CONTENT) {
644 gfs.graft_flags |= FSCTL_GRAFT_SYSTEM_CONTENT;
645 }
646 if (sbc_args->sbc_flags & SBC_PANIC_ON_AUTHFAIL) {
647 gfs.graft_flags |= FSCTL_GRAFT_PANIC_ON_AUTHFAIL;
648 }
649 if (sbc_args->sbc_flags & SBC_STRICT_AUTH) {
650 gfs.graft_flags |= FSCTL_GRAFT_STRICT_AUTH;
651 }
652 if (sbc_args->sbc_flags & SBC_PRESERVE_GRAFT) {
653 gfs.graft_flags |= FSCTL_GRAFT_PRESERVE_GRAFT;
654 }
655 gfs.dir_ino = graft_dir_ino; // ino from mounton_vp (if not provided, the parent directory)
656
657 // Call into the FS to perform the graft (and validation).
658 error = VNOP_IOCTL(cryptex_vp, FSIOC_GRAFT_FS, (caddr_t)&gfs, 0, vctx);
659
660 out:
661 if (gfs.authentic_manifest) {
662 kfree_data(gfs.authentic_manifest, MAX_GRAFT_METADATA_SIZE);
663 gfs.authentic_manifest = NULL;
664 }
665 if (gfs.payload) {
666 kfree_data(gfs.payload, MAX_GRAFT_METADATA_SIZE);
667 gfs.payload = NULL;
668 }
669
670 return error;
671 }
672
673 #define GRAFTDMG_ENTITLEMENT "com.apple.private.vfs.graftdmg"
674
675 /*
676 * Graft a cryptex disk image (via FD) onto the appropriate mount-point
677 * { int graftdmg(int dmg_fd, const char *mountdir, uint32_t graft_type, graftdmg_args_un *gda); }
678 */
int
graftdmg(__unused proc_t p, struct graftdmg_args *uap, __unused int32_t *retval)
{
	/* Unpack user-supplied syscall arguments. */
	int ua_dmgfd = uap->dmg_fd;
	user_addr_t ua_mountdir = uap->mountdir;
	uint32_t ua_grafttype = uap->graft_type;
	user_addr_t ua_graftargs = uap->gda;

	graftdmg_args_un kern_gda = {};
	int error = 0;
	secure_boot_cryptex_args_t *sbc_args = NULL;

	vnode_t cryptex_vp = NULLVP;
	vnode_t mounton_vp = NULLVP;
	struct nameidata nd = {};
	vfs_context_t ctx = vfs_context_current();

	/* Grafting requires the private graftdmg entitlement. */
	if (!IOTaskHasEntitlement(vfs_context_task(ctx), GRAFTDMG_ENTITLEMENT)) {
		return EPERM;
	}

	/* Copy in the graft argument union before looking anything up. */
	error = copyin(ua_graftargs, &kern_gda, sizeof(graftdmg_args_un));
	if (error) {
		return error;
	}

	// Copy mount dir in, if provided.
	if (ua_mountdir != USER_ADDR_NULL) {
		// Acquire vnode for mount-on path
		NDINIT(&nd, LOOKUP, OP_MOUNT, (FOLLOW | AUDITVNPATH1),
		    UIO_USERSPACE, ua_mountdir, ctx);

		error = namei(&nd);
		if (error) {
			return error;
		}
		mounton_vp = nd.ni_vp;
	}

	// Convert fd to vnode.
	error = vnode_getfromfd(ctx, ua_dmgfd, &cryptex_vp);
	if (error) {
		goto graftout;
	}

	/* Reject out-of-range graft types before doing any real work. */
	if (ua_grafttype == 0 || ua_grafttype > GRAFTDMG_CRYPTEX_MAX) {
		error = EINVAL;
	} else {
		sbc_args = &kern_gda.sbc_args;
		error = graft_secureboot_cryptex(ua_grafttype, sbc_args, ctx, cryptex_vp, mounton_vp);
	}

graftout:
	/* Release the iocounts taken above; nameidone() only if namei() ran. */
	if (cryptex_vp) {
		vnode_put(cryptex_vp);
		cryptex_vp = NULLVP;
	}
	if (mounton_vp) {
		vnode_put(mounton_vp);
		mounton_vp = NULLVP;
	}
	if (ua_mountdir != USER_ADDR_NULL) {
		nameidone(&nd);
	}

	return error;
}
746
747 /*
748 * Ungraft a cryptex disk image (via mount dir FD)
749 * { int ungraftdmg(const char *mountdir, uint64_t flags); }
750 */
751 int
ungraftdmg(__unused proc_t p,struct ungraftdmg_args * uap,__unused int32_t * retval)752 ungraftdmg(__unused proc_t p, struct ungraftdmg_args *uap, __unused int32_t *retval)
753 {
754 int error = 0;
755 user_addr_t ua_mountdir = uap->mountdir;
756 fsioc_ungraft_fs_t ugfs;
757 vnode_t mounton_vp = NULLVP;
758 struct nameidata nd = {};
759 vfs_context_t ctx = vfs_context_current();
760
761 if (!IOTaskHasEntitlement(vfs_context_task(ctx), GRAFTDMG_ENTITLEMENT)) {
762 return EPERM;
763 }
764
765 if (uap->flags != 0 || ua_mountdir == USER_ADDR_NULL) {
766 return EINVAL;
767 }
768
769 ugfs.ungraft_flags = 0;
770
771 // Acquire vnode for mount-on path
772 NDINIT(&nd, LOOKUP, OP_MOUNT, (FOLLOW | AUDITVNPATH1),
773 UIO_USERSPACE, ua_mountdir, ctx);
774
775 error = namei(&nd);
776 if (error) {
777 return error;
778 }
779 mounton_vp = nd.ni_vp;
780
781 // Call into the FS to perform the ungraft
782 error = VNOP_IOCTL(mounton_vp, FSIOC_UNGRAFT_FS, (caddr_t)&ugfs, 0, ctx);
783
784 vnode_put(mounton_vp);
785 nameidone(&nd);
786
787 return error;
788 }
789
790
/*
 * Notify interested parties that a mount has occurred: broadcast a
 * VQ_MOUNT vfs event, then post a NOTE_WRITE knote on the parent
 * directory of the covered vnode.
 */
void
vfs_notify_mount(vnode_t pdvp)
{
	vfs_event_signal(NULL, VQ_MOUNT, (intptr_t)NULL);
	lock_vnode_and_post(pdvp, NOTE_WRITE);
}
797
798 /*
799 * __mac_mount:
800 * Mount a file system taking into account MAC label behavior.
801 * See mount(2) man page for more information
802 *
803 * Parameters: p Process requesting the mount
804 * uap User argument descriptor (see below)
805 * retval (ignored)
806 *
807 * Indirect: uap->type Filesystem type
808 * uap->path Path to mount
809 * uap->data Mount arguments
810 * uap->mac_p MAC info
811 * uap->flags Mount flags
812 *
813 *
814 * Returns: 0 Success
815 * !0 Not success
816 */
817 boolean_t root_fs_upgrade_try = FALSE;
818
819 int
__mac_mount(struct proc * p,register struct __mac_mount_args * uap,__unused int32_t * retval)820 __mac_mount(struct proc *p, register struct __mac_mount_args *uap, __unused int32_t *retval)
821 {
822 vnode_t pvp = NULL;
823 vnode_t vp = NULL;
824 int need_nameidone = 0;
825 vfs_context_t ctx = vfs_context_current();
826 char fstypename[MFSNAMELEN];
827 struct nameidata nd;
828 size_t dummy = 0;
829 char *labelstr = NULL;
830 size_t labelsz = 0;
831 int flags = uap->flags;
832 int error;
833 int num_retries = 0;
834 #if CONFIG_IMGSRC_ACCESS || CONFIG_MACF
835 boolean_t is_64bit = IS_64BIT_PROCESS(p);
836 #else
837 #pragma unused(p)
838 #endif
839 /*
840 * Get the fs type name from user space
841 */
842 error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy);
843 if (error) {
844 return error;
845 }
846
847 retry:
848 /*
849 * Get the vnode to be covered
850 */
851 NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
852 UIO_USERSPACE, uap->path, ctx);
853 if (flags & MNT_NOFOLLOW) {
854 nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
855 }
856 error = namei(&nd);
857 if (error) {
858 goto out;
859 }
860 need_nameidone = 1;
861 vp = nd.ni_vp;
862 pvp = nd.ni_dvp;
863
864 #ifdef CONFIG_IMGSRC_ACCESS
865 /* Mounting image source cannot be batched with other operations */
866 if (flags == MNT_IMGSRC_BY_INDEX) {
867 error = relocate_imageboot_source(pvp, vp, &nd.ni_cnd, fstypename,
868 ctx, is_64bit, uap->data, (flags == MNT_IMGSRC_BY_INDEX));
869 goto out;
870 }
871 #endif /* CONFIG_IMGSRC_ACCESS */
872
873 #if CONFIG_MACF
874 /*
875 * Get the label string (if any) from user space
876 */
877 if (uap->mac_p != USER_ADDR_NULL) {
878 struct user_mac mac;
879 size_t ulen = 0;
880
881 if (is_64bit) {
882 struct user64_mac mac64;
883 error = copyin(uap->mac_p, &mac64, sizeof(mac64));
884 mac.m_buflen = (user_size_t)mac64.m_buflen;
885 mac.m_string = (user_addr_t)mac64.m_string;
886 } else {
887 struct user32_mac mac32;
888 error = copyin(uap->mac_p, &mac32, sizeof(mac32));
889 mac.m_buflen = mac32.m_buflen;
890 mac.m_string = mac32.m_string;
891 }
892 if (error) {
893 goto out;
894 }
895 if ((mac.m_buflen > MAC_MAX_LABEL_BUF_LEN) ||
896 (mac.m_buflen < 2)) {
897 error = EINVAL;
898 goto out;
899 }
900 labelsz = mac.m_buflen;
901 labelstr = kalloc_data(labelsz, Z_WAITOK);
902 error = copyinstr(mac.m_string, labelstr, mac.m_buflen, &ulen);
903 if (error) {
904 goto out;
905 }
906 AUDIT_ARG(mac_string, labelstr);
907 }
908 #endif /* CONFIG_MACF */
909
910 AUDIT_ARG(fflags, flags);
911
912 #if !CONFIG_UNION_MOUNTS
913 if (flags & MNT_UNION) {
914 error = EPERM;
915 goto out;
916 }
917 #endif
918
919 if ((vp->v_flag & VROOT) &&
920 (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
921 #if CONFIG_UNION_MOUNTS
922 if (!(flags & MNT_UNION)) {
923 flags |= MNT_UPDATE;
924 } else {
925 /*
926 * For a union mount on '/', treat it as fresh
927 * mount instead of update.
928 * Otherwise, union mouting on '/' used to panic the
929 * system before, since mnt_vnodecovered was found to
930 * be NULL for '/' which is required for unionlookup
931 * after it gets ENOENT on union mount.
932 */
933 flags = (flags & ~(MNT_UPDATE));
934 }
935 #else
936 flags |= MNT_UPDATE;
937 #endif /* CONFIG_UNION_MOUNTS */
938
939 #if SECURE_KERNEL
940 if ((flags & MNT_RDONLY) == 0) {
941 /* Release kernels are not allowed to mount "/" as rw */
942 error = EPERM;
943 goto out;
944 }
945 #endif
946
947 /*
948 * See 7392553 for more details on why this check exists.
949 * Suffice to say: If this check is ON and something tries
950 * to mount the rootFS RW, we'll turn off the codesign
951 * bitmap optimization.
952 */
953 #if CHECK_CS_VALIDATION_BITMAP
954 if ((flags & MNT_RDONLY) == 0) {
955 root_fs_upgrade_try = TRUE;
956 }
957 #endif
958 }
959
960 error = mount_common(fstypename, pvp, vp, &nd.ni_cnd, uap->data, flags, 0,
961 labelstr, ctx);
962
963 out:
964
965 #if CONFIG_MACF
966 kfree_data(labelstr, labelsz);
967 #endif /* CONFIG_MACF */
968
969 if (vp) {
970 vnode_put(vp);
971 }
972 if (pvp) {
973 vnode_put(pvp);
974 }
975 if (need_nameidone) {
976 nameidone(&nd);
977 }
978
979 if (error == EBUSY) {
980 /* Retry the lookup and mount again due to concurrent mounts. */
981 if (++num_retries < MAX_MOUNT_RETRIES) {
982 goto retry;
983 }
984 }
985
986 return error;
987 }
988
989 /*
990 * common mount implementation (final stage of mounting)
991 *
992 * Arguments:
993 * fstypename file system type (ie it's vfs name)
994 * pvp parent of covered vnode
995 * vp covered vnode
996 * cnp component name (ie path) of covered vnode
997 * flags generic mount flags
998 * fsmountargs file system specific data
999 * labelstr optional MAC label
1000 * kernelmount TRUE for mounts initiated from inside the kernel
1001 * ctx caller's context
1002 */
1003 static int
mount_common(const char * fstypename,vnode_t pvp,vnode_t vp,struct componentname * cnp,user_addr_t fsmountargs,int flags,uint32_t internal_flags,char * labelstr,vfs_context_t ctx)1004 mount_common(const char *fstypename, vnode_t pvp, vnode_t vp,
1005 struct componentname *cnp, user_addr_t fsmountargs, int flags, uint32_t internal_flags,
1006 char *labelstr, vfs_context_t ctx)
1007 {
1008 #if !CONFIG_MACF
1009 #pragma unused(labelstr)
1010 #endif
1011 struct vnode *devvp = NULLVP;
1012 struct vnode *device_vnode = NULLVP;
1013 #if CONFIG_MACF
1014 struct vnode *rvp;
1015 #endif
1016 struct mount *mp = NULL;
1017 struct vfstable *vfsp = (struct vfstable *)0;
1018 struct proc *p = vfs_context_proc(ctx);
1019 int error, flag = 0;
1020 bool flag_set = false;
1021 user_addr_t devpath = USER_ADDR_NULL;
1022 int ronly = 0;
1023 int mntalloc = 0;
1024 boolean_t vfsp_ref = FALSE;
1025 boolean_t is_rwlock_locked = FALSE;
1026 boolean_t did_rele = FALSE;
1027 boolean_t have_usecount = FALSE;
1028 boolean_t did_set_lmount = FALSE;
1029 boolean_t did_set_vmount = FALSE;
1030 boolean_t kernelmount = !!(internal_flags & KERNEL_MOUNT_KMOUNT);
1031
1032 #if CONFIG_ROSV_STARTUP || CONFIG_MOUNT_VM || CONFIG_BASESYSTEMROOT
1033 /* Check for mutually-exclusive flag bits */
1034 uint32_t checkflags = (internal_flags & (KERNEL_MOUNT_VOLBYROLE_MASK | KERNEL_MOUNT_BASESYSTEMROOT));
1035 int bitcount = 0;
1036 while (checkflags != 0) {
1037 checkflags &= (checkflags - 1);
1038 bitcount++;
1039 }
1040
1041 if (bitcount > 1) {
1042 //not allowed to request multiple mount-by-role flags
1043 error = EINVAL;
1044 goto out1;
1045 }
1046 #endif
1047
1048 /*
1049 * Process an update for an existing mount
1050 */
1051 if (flags & MNT_UPDATE) {
1052 if ((vp->v_flag & VROOT) == 0) {
1053 error = EINVAL;
1054 goto out1;
1055 }
1056 mp = vp->v_mount;
1057
1058 /* if unmount or mount in progress, return error */
1059 mount_lock_spin(mp);
1060 if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
1061 mount_unlock(mp);
1062 error = EBUSY;
1063 goto out1;
1064 }
1065 mp->mnt_lflag |= MNT_LMOUNT;
1066 did_set_lmount = TRUE;
1067 mount_unlock(mp);
1068 lck_rw_lock_exclusive(&mp->mnt_rwlock);
1069 is_rwlock_locked = TRUE;
1070 /*
1071 * We only allow the filesystem to be reloaded if it
1072 * is currently mounted read-only.
1073 */
1074 if ((flags & MNT_RELOAD) &&
1075 ((mp->mnt_flag & MNT_RDONLY) == 0)) {
1076 error = ENOTSUP;
1077 goto out1;
1078 }
1079
1080 /*
1081 * If content protection is enabled, update mounts are not
1082 * allowed to turn it off.
1083 */
1084 if ((mp->mnt_flag & MNT_CPROTECT) &&
1085 ((flags & MNT_CPROTECT) == 0)) {
1086 error = EINVAL;
1087 goto out1;
1088 }
1089
1090 /*
1091 * can't turn off MNT_REMOVABLE either but it may be an unexpected
1092 * failure to return an error for this so we'll just silently
1093 * add it if it is not passed in.
1094 */
1095 if ((mp->mnt_flag & MNT_REMOVABLE) &&
1096 ((flags & MNT_REMOVABLE) == 0)) {
1097 flags |= MNT_REMOVABLE;
1098 }
1099
1100 /* Can't downgrade the backer of the root FS */
1101 if ((mp->mnt_kern_flag & MNTK_BACKS_ROOT) &&
1102 (!vfs_isrdonly(mp)) && (flags & MNT_RDONLY)) {
1103 error = ENOTSUP;
1104 goto out1;
1105 }
1106
1107 /*
1108 * Only root, or the user that did the original mount is
1109 * permitted to update it.
1110 */
1111 if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
1112 (error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
1113 goto out1;
1114 }
1115 #if CONFIG_MACF
1116 error = mac_mount_check_remount(ctx, mp);
1117 if (error != 0) {
1118 goto out1;
1119 }
1120 #endif
1121 /*
1122 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV,
1123 * and MNT_NOEXEC if mount point is already MNT_NOEXEC.
1124 */
1125 if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
1126 flags |= MNT_NOSUID | MNT_NODEV;
1127 if (mp->mnt_flag & MNT_NOEXEC) {
1128 flags |= MNT_NOEXEC;
1129 }
1130 }
1131 flag = mp->mnt_flag;
1132 flag_set = true;
1133
1134
1135
1136 mp->mnt_flag |= flags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE);
1137
1138 vfsp = mp->mnt_vtable;
1139 goto update;
1140 } // MNT_UPDATE
1141
1142 /*
1143 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV, and
1144 * MNT_NOEXEC if mount point is already MNT_NOEXEC.
1145 */
1146 if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
1147 flags |= MNT_NOSUID | MNT_NODEV;
1148 if (vp->v_mount->mnt_flag & MNT_NOEXEC) {
1149 flags |= MNT_NOEXEC;
1150 }
1151 }
1152
1153 /* XXXAUDIT: Should we capture the type on the error path as well? */
1154 /* XXX cast-away const (audit_arg_text() does not modify its input) */
1155 AUDIT_ARG(text, (char *)(uintptr_t)fstypename);
1156 mount_list_lock();
1157 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
1158 if (!strncmp(vfsp->vfc_name, fstypename, MFSNAMELEN)) {
1159 vfsp->vfc_refcount++;
1160 vfsp_ref = TRUE;
1161 break;
1162 }
1163 }
1164 mount_list_unlock();
1165 if (vfsp == NULL) {
1166 error = ENODEV;
1167 goto out1;
1168 }
1169
1170 /*
1171 * VFC_VFSLOCALARGS is not currently supported for kernel mounts,
1172 * except in ROSV configs and for the initial BaseSystem root.
1173 */
1174 if (kernelmount && (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) &&
1175 ((internal_flags & KERNEL_MOUNT_VOLBYROLE_MASK) == 0) &&
1176 ((internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) == 0)) {
1177 error = EINVAL; /* unsupported request */
1178 goto out1;
1179 }
1180
1181 error = prepare_coveredvp(vp, ctx, cnp, fstypename, internal_flags);
1182 if (error != 0) {
1183 goto out1;
1184 }
1185
1186 /*
1187 * Upon successful of prepare_coveredvp(), VMOUNT is set for the covered vp.
1188 */
1189 did_set_vmount = TRUE;
1190
1191 /*
1192 * Allocate and initialize the filesystem (mount_t)
1193 */
1194 mp = zalloc_flags(mount_zone, Z_WAITOK | Z_ZERO);
1195 mntalloc = 1;
1196
1197 /* Initialize the default IO constraints */
1198 mp->mnt_maxreadcnt = mp->mnt_maxwritecnt = MAXPHYS;
1199 mp->mnt_segreadcnt = mp->mnt_segwritecnt = 32;
1200 mp->mnt_maxsegreadsize = mp->mnt_maxreadcnt;
1201 mp->mnt_maxsegwritesize = mp->mnt_maxwritecnt;
1202 mp->mnt_devblocksize = DEV_BSIZE;
1203 mp->mnt_alignmentmask = PAGE_MASK;
1204 mp->mnt_ioqueue_depth = MNT_DEFAULT_IOQUEUE_DEPTH;
1205 mp->mnt_ioscale = 1;
1206 mp->mnt_ioflags = 0;
1207 mp->mnt_realrootvp = NULLVP;
1208 mp->mnt_authcache_ttl = CACHED_LOOKUP_RIGHT_TTL;
1209
1210 mp->mnt_lflag |= MNT_LMOUNT;
1211 did_set_lmount = TRUE;
1212
1213 TAILQ_INIT(&mp->mnt_vnodelist);
1214 TAILQ_INIT(&mp->mnt_workerqueue);
1215 TAILQ_INIT(&mp->mnt_newvnodes);
1216 mount_lock_init(mp);
1217 lck_rw_lock_exclusive(&mp->mnt_rwlock);
1218 is_rwlock_locked = TRUE;
1219 mp->mnt_op = vfsp->vfc_vfsops;
1220 mp->mnt_vtable = vfsp;
1221 //mp->mnt_stat.f_type = vfsp->vfc_typenum;
1222 mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
1223 strlcpy(mp->mnt_vfsstat.f_fstypename, vfsp->vfc_name, MFSTYPENAMELEN);
1224 do {
1225 size_t pathlen = MAXPATHLEN;
1226
1227 if (vn_getpath_ext(vp, pvp, mp->mnt_vfsstat.f_mntonname, &pathlen, VN_GETPATH_FSENTER)) {
1228 strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);
1229 }
1230 } while (0);
1231 mp->mnt_vnodecovered = vp;
1232 mp->mnt_vfsstat.f_owner = kauth_cred_getuid(vfs_context_ucred(ctx));
1233 mp->mnt_throttle_mask = LOWPRI_MAX_NUM_DEV - 1;
1234 mp->mnt_devbsdunit = 0;
1235 mp->mnt_mount_id = os_atomic_inc_orig(&mount_unique_id, relaxed);
1236
1237 /* XXX 3762912 hack to support HFS filesystem 'owner' - filesystem may update later */
1238 vfs_setowner(mp, KAUTH_UID_NONE, KAUTH_GID_NONE);
1239
1240 if (kernelmount) {
1241 mp->mnt_kern_flag |= MNTK_KERNEL_MOUNT;
1242 }
1243 if ((internal_flags & KERNEL_MOUNT_PERMIT_UNMOUNT) != 0) {
1244 mp->mnt_kern_flag |= MNTK_PERMIT_UNMOUNT;
1245 }
1246
1247 if (KERNEL_MOUNT_DEVFS & internal_flags) {
1248 // kernel mounted devfs
1249 mp->mnt_kern_flag |= MNTK_SYSTEM;
1250 }
1251
1252 update:
1253
1254 /*
1255 * Set the mount level flags.
1256 */
1257 if (flags & MNT_RDONLY) {
1258 mp->mnt_flag |= MNT_RDONLY;
1259 } else if (mp->mnt_flag & MNT_RDONLY) {
1260 // disallow read/write upgrades of file systems that
1261 // had the TYPENAME_OVERRIDE feature set.
1262 if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
1263 error = EPERM;
1264 goto out1;
1265 }
1266 mp->mnt_kern_flag |= MNTK_WANTRDWR;
1267 }
1268 mp->mnt_flag &= ~(MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
1269 MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
1270 MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
1271 MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME | MNT_STRICTATIME |
1272 MNT_QUARANTINE | MNT_CPROTECT);
1273
1274 #if SECURE_KERNEL
1275 #if !CONFIG_MNT_SUID
1276 /*
1277 * On release builds of iOS based platforms, always enforce NOSUID on
1278 * all mounts. We do this here because we can catch update mounts as well as
1279 * non-update mounts in this case.
1280 */
1281 mp->mnt_flag |= (MNT_NOSUID);
1282 #endif
1283 #endif
1284
1285 mp->mnt_flag |= flags & (MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
1286 MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
1287 MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
1288 MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME | MNT_STRICTATIME |
1289 MNT_QUARANTINE | MNT_CPROTECT);
1290
1291 #if CONFIG_MACF
1292 if (flags & MNT_MULTILABEL) {
1293 if (vfsp->vfc_vfsflags & VFC_VFSNOMACLABEL) {
1294 error = EINVAL;
1295 goto out1;
1296 }
1297 mp->mnt_flag |= MNT_MULTILABEL;
1298 }
1299 #endif
1300 /*
1301 * Process device path for local file systems if requested.
1302 *
1303 * Snapshot and mount-by-role mounts do not use this path; they are
1304 * passing other opaque data in the device path field.
1305 *
1306 * Basesystemroot mounts pass a device path to be resolved here,
1307 * but it's just a char * already inside the kernel, which
1308 * kernel_mount() shoved into a user_addr_t to call us. So for such
1309 * mounts we must skip copyin (both of the address and of the string
1310 * (in NDINIT).
1311 */
1312 if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS &&
1313 !(internal_flags & (KERNEL_MOUNT_SNAPSHOT | KERNEL_MOUNT_VOLBYROLE_MASK))) {
1314 boolean_t do_copyin_devpath = true;
1315 #if CONFIG_BASESYSTEMROOT
1316 if (internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) {
1317 // KERNEL_MOUNT_BASESYSTEMROOT implies subtle behavior worh nothing:
1318 // We have been passed fsmountargs, which is typed as a user_addr_t,
1319 // but is actually a char ** pointing to a (kernelspace) string.
1320 // We manually unpack it with a series of casts and dereferences
1321 // that reverses what was done just above us on the stack in
1322 // imageboot_pivot_image().
1323 // After retrieving the path to the dev node (which we will NDINIT
1324 // in a moment), we pass NULL fsmountargs on to the filesystem.
1325 _Static_assert(sizeof(char **) == sizeof(fsmountargs), "fsmountargs should fit a (kernel) address");
1326 char **devnamepp = (char **)fsmountargs;
1327 char *devnamep = *devnamepp;
1328 devpath = CAST_USER_ADDR_T(devnamep);
1329 do_copyin_devpath = false;
1330 fsmountargs = USER_ADDR_NULL;
1331
1332 //Now that we have a mp, denote that this mount is for the basesystem.
1333 mp->mnt_supl_kern_flag |= MNTK_SUPL_BASESYSTEM;
1334 }
1335 #endif // CONFIG_BASESYSTEMROOT
1336
1337 if (do_copyin_devpath) {
1338 if (vfs_context_is64bit(ctx)) {
1339 if ((error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath)))) {
1340 goto out1;
1341 }
1342 fsmountargs += sizeof(devpath);
1343 } else {
1344 user32_addr_t tmp;
1345 if ((error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp)))) {
1346 goto out1;
1347 }
1348 /* munge into LP64 addr */
1349 devpath = CAST_USER_ADDR_T(tmp);
1350 fsmountargs += sizeof(tmp);
1351 }
1352 }
1353
1354 /* Lookup device and authorize access to it */
1355 if ((devpath)) {
1356 struct nameidata nd;
1357
1358 enum uio_seg seg = UIO_USERSPACE;
1359 #if CONFIG_BASESYSTEMROOT
1360 if (internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) {
1361 seg = UIO_SYSSPACE;
1362 }
1363 #endif // CONFIG_BASESYSTEMROOT
1364
1365 NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW, seg, devpath, ctx);
1366 if ((error = namei(&nd))) {
1367 goto out1;
1368 }
1369
1370 devvp = nd.ni_vp;
1371
1372 if (devvp->v_type != VBLK) {
1373 error = ENOTBLK;
1374 nameidone(&nd);
1375 goto out2;
1376 }
1377 if (major(devvp->v_rdev) >= nblkdev) {
1378 error = ENXIO;
1379 nameidone(&nd);
1380 goto out2;
1381 }
1382 /*
1383 * If mount by non-root, then verify that user has necessary
1384 * permissions on the device.
1385 */
1386 if (suser(vfs_context_ucred(ctx), NULL) != 0) {
1387 kauth_action_t accessmode = KAUTH_VNODE_READ_DATA;
1388
1389 if ((mp->mnt_flag & MNT_RDONLY) == 0) {
1390 accessmode |= KAUTH_VNODE_WRITE_DATA;
1391 }
1392 if ((error = vnode_authorize(devvp, NULL, accessmode, ctx)) != 0) {
1393 nameidone(&nd);
1394 goto out2;
1395 }
1396 }
1397
1398 strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);
1399 nameidone(&nd);
1400 }
1401 /* On first mount, preflight and open device */
1402 if (devpath && ((flags & MNT_UPDATE) == 0)) {
1403 if ((error = vnode_ref(devvp))) {
1404 goto out2;
1405 }
1406 /*
1407 * Disallow multiple mounts of the same device.
1408 * Disallow mounting of a device that is currently in use
1409 * (except for root, which might share swap device for miniroot).
1410 * Flush out any old buffers remaining from a previous use.
1411 */
1412 if ((error = vfs_setmounting(devvp))) {
1413 vnode_rele(devvp);
1414 goto out2;
1415 }
1416
1417 if (vcount(devvp) > 1 && !(vfs_flags(mp) & MNT_ROOTFS)) {
1418 error = EBUSY;
1419 goto out3;
1420 }
1421 if ((error = VNOP_FSYNC(devvp, MNT_WAIT, ctx))) {
1422 error = ENOTBLK;
1423 goto out3;
1424 }
1425 if ((error = buf_invalidateblks(devvp, BUF_WRITE_DATA, 0, 0))) {
1426 goto out3;
1427 }
1428
1429 ronly = (mp->mnt_flag & MNT_RDONLY) != 0;
1430 #if CONFIG_MACF
1431 error = mac_vnode_check_open(ctx,
1432 devvp,
1433 ronly ? FREAD : FREAD | FWRITE);
1434 if (error) {
1435 goto out3;
1436 }
1437 #endif /* MAC */
1438 if ((error = VNOP_OPEN(devvp, ronly ? FREAD : FREAD | FWRITE, ctx))) {
1439 goto out3;
1440 }
1441
1442 mp->mnt_devvp = devvp;
1443 device_vnode = devvp;
1444 } else if ((mp->mnt_flag & MNT_RDONLY) &&
1445 (mp->mnt_kern_flag & MNTK_WANTRDWR) &&
1446 (device_vnode = mp->mnt_devvp)) {
1447 dev_t dev;
1448 int maj;
1449 /*
1450 * If upgrade to read-write by non-root, then verify
1451 * that user has necessary permissions on the device.
1452 */
1453 vnode_getalways(device_vnode);
1454
1455 if (suser(vfs_context_ucred(ctx), NULL) &&
1456 (error = vnode_authorize(device_vnode, NULL,
1457 KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA,
1458 ctx)) != 0) {
1459 vnode_put(device_vnode);
1460 goto out2;
1461 }
1462
1463 /* Tell the device that we're upgrading */
1464 dev = (dev_t)device_vnode->v_rdev;
1465 maj = major(dev);
1466
1467 if ((u_int)maj >= (u_int)nblkdev) {
1468 panic("Volume mounted on a device with invalid major number.");
1469 }
1470
1471 error = bdevsw[maj].d_open(dev, FREAD | FWRITE, S_IFBLK, p);
1472 vnode_put(device_vnode);
1473 device_vnode = NULLVP;
1474 if (error != 0) {
1475 goto out2;
1476 }
1477 }
1478 } // localargs && !(snapshot | data | vm)
1479
1480 #if CONFIG_MACF
1481 if ((flags & MNT_UPDATE) == 0) {
1482 mac_mount_label_init(mp);
1483 mac_mount_label_associate(ctx, mp);
1484 }
1485 if (labelstr) {
1486 if ((flags & MNT_UPDATE) != 0) {
1487 error = mac_mount_check_label_update(ctx, mp);
1488 if (error != 0) {
1489 goto out3;
1490 }
1491 }
1492 }
1493 #endif
1494 /*
1495 * Mount the filesystem. We already asserted that internal_flags
1496 * cannot have more than one mount-by-role bit set.
1497 */
1498 if (internal_flags & KERNEL_MOUNT_SNAPSHOT) {
1499 error = VFS_IOCTL(mp, VFSIOC_MOUNT_SNAPSHOT,
1500 (caddr_t)fsmountargs, 0, ctx);
1501 } else if (internal_flags & KERNEL_MOUNT_DATAVOL) {
1502 #if CONFIG_ROSV_STARTUP
1503 struct mount *origin_mp = (struct mount*)fsmountargs;
1504 fs_role_mount_args_t frma = {origin_mp, VFS_DATA_ROLE};
1505 error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
1506 if (error) {
1507 printf("MOUNT-BY-ROLE (%d) failed! (%d)", VFS_DATA_ROLE, error);
1508 } else {
1509 /* Mark volume associated with system volume */
1510 mp->mnt_kern_flag |= MNTK_SYSTEM;
1511
1512 /* Attempt to acquire the mnt_devvp and set it up */
1513 struct vnode *mp_devvp = NULL;
1514 if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
1515 errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
1516 0, &mp_devvp, vfs_context_kernel());
1517 if (!lerr) {
1518 mp->mnt_devvp = mp_devvp;
1519 //vnode_lookup took an iocount, need to drop it.
1520 vnode_put(mp_devvp);
1521 // now set `device_vnode` to the devvp that was acquired.
1522 // this is needed in order to ensure vfs_init_io_attributes is invoked.
1523 // note that though the iocount above was dropped, the mount acquires
1524 // an implicit reference against the device.
1525 device_vnode = mp_devvp;
1526 }
1527 }
1528 }
1529 #else
1530 error = EINVAL;
1531 #endif
1532 } else if (internal_flags & KERNEL_MOUNT_VMVOL) {
1533 #if CONFIG_MOUNT_VM
1534 struct mount *origin_mp = (struct mount*)fsmountargs;
1535 fs_role_mount_args_t frma = {origin_mp, VFS_VM_ROLE};
1536 error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
1537 if (error) {
1538 printf("MOUNT-BY-ROLE (%d) failed! (%d)", VFS_VM_ROLE, error);
1539 } else {
1540 /* Mark volume associated with system volume and a swap mount */
1541 mp->mnt_kern_flag |= (MNTK_SYSTEM | MNTK_SWAP_MOUNT);
1542 /* Attempt to acquire the mnt_devvp and set it up */
1543 struct vnode *mp_devvp = NULL;
1544 if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
1545 errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
1546 0, &mp_devvp, vfs_context_kernel());
1547 if (!lerr) {
1548 mp->mnt_devvp = mp_devvp;
1549 //vnode_lookup took an iocount, need to drop it.
1550 vnode_put(mp_devvp);
1551
1552 // now set `device_vnode` to the devvp that was acquired.
1553 // note that though the iocount above was dropped, the mount acquires
1554 // an implicit reference against the device.
1555 device_vnode = mp_devvp;
1556 }
1557 }
1558 }
1559 #else
1560 error = EINVAL;
1561 #endif
1562 } else if ((internal_flags & KERNEL_MOUNT_PREBOOTVOL) || (internal_flags & KERNEL_MOUNT_RECOVERYVOL)) {
1563 #if CONFIG_MOUNT_PREBOOTRECOVERY
1564 struct mount *origin_mp = (struct mount*)fsmountargs;
1565 uint32_t mount_role = 0;
1566 if (internal_flags & KERNEL_MOUNT_PREBOOTVOL) {
1567 mount_role = VFS_PREBOOT_ROLE;
1568 } else if (internal_flags & KERNEL_MOUNT_RECOVERYVOL) {
1569 mount_role = VFS_RECOVERY_ROLE;
1570 }
1571
1572 if (mount_role != 0) {
1573 fs_role_mount_args_t frma = {origin_mp, mount_role};
1574 error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
1575 if (error) {
1576 printf("MOUNT-BY-ROLE (%d) failed! (%d)", mount_role, error);
1577 } else {
1578 // NOT YET - need to qualify how this interacts with shutdown, ERP/ERB, etc
1579 /* Mark volume associated with system volume */
1580 //mp->mnt_kern_flag |= MNTK_SYSTEM;
1581 /* Attempt to acquire the mnt_devvp and set it up */
1582 struct vnode *mp_devvp = NULL;
1583 if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
1584 errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
1585 0, &mp_devvp, vfs_context_kernel());
1586 if (!lerr) {
1587 mp->mnt_devvp = mp_devvp;
1588 //vnode_lookup took an iocount, need to drop it.
1589 vnode_put(mp_devvp);
1590
1591 // now set `device_vnode` to the devvp that was acquired.
1592 // note that though the iocount above was dropped, the mount acquires
1593 // an implicit reference against the device.
1594 device_vnode = mp_devvp;
1595 }
1596 }
1597 }
1598 } else {
1599 printf("MOUNT-BY-ROLE (%d) failed - ROLE UNRECOGNIZED! (%d)", mount_role, error);
1600 error = EINVAL;
1601 }
1602 #else
1603 error = EINVAL;
1604 #endif
1605 } else {
1606 error = VFS_MOUNT(mp, device_vnode, fsmountargs, ctx);
1607 }
1608
1609 if (flags & MNT_UPDATE) {
1610 if (mp->mnt_kern_flag & MNTK_WANTRDWR) {
1611 mp->mnt_flag &= ~MNT_RDONLY;
1612 }
1613 mp->mnt_flag &= ~
1614 (MNT_UPDATE | MNT_RELOAD | MNT_FORCE);
1615 mp->mnt_kern_flag &= ~MNTK_WANTRDWR;
1616 if (error) {
1617 mp->mnt_flag = flag; /* restore flag value */
1618 }
1619 vfs_event_signal(NULL, VQ_UPDATE, (intptr_t)NULL);
1620 lck_rw_done(&mp->mnt_rwlock);
1621 is_rwlock_locked = FALSE;
1622 if (!error) {
1623 enablequotas(mp, ctx);
1624 }
1625 goto exit;
1626 }
1627
1628 /*
1629 * Put the new filesystem on the mount list after root.
1630 */
1631 if (error == 0) {
1632 struct vfs_attr vfsattr;
1633 if (device_vnode) {
1634 /*
1635 * cache the IO attributes for the underlying physical media...
1636 * an error return indicates the underlying driver doesn't
1637 * support all the queries necessary... however, reasonable
1638 * defaults will have been set, so no reason to bail or care
1639 *
1640 * Need to do this before calling the MAC hook as it needs
1641 * information from this call.
1642 */
1643 vfs_init_io_attributes(device_vnode, mp);
1644 }
1645
1646 #if CONFIG_MACF
1647 error = mac_mount_check_mount_late(ctx, mp);
1648 if (error != 0) {
1649 goto out4;
1650 }
1651
1652 if (vfs_flags(mp) & MNT_MULTILABEL) {
1653 error = VFS_ROOT(mp, &rvp, ctx);
1654 if (error) {
1655 printf("%s() VFS_ROOT returned %d\n", __func__, error);
1656 goto out4;
1657 }
1658 error = vnode_label(mp, NULL, rvp, NULL, 0, ctx);
1659 /*
1660 * drop reference provided by VFS_ROOT
1661 */
1662 vnode_put(rvp);
1663
1664 if (error) {
1665 goto out4;
1666 }
1667 }
1668 #endif /* MAC */
1669
1670 vnode_lock_spin(vp);
1671 CLR(vp->v_flag, VMOUNT);
1672 vp->v_mountedhere = mp;
1673 SET(vp->v_flag, VMOUNTEDHERE);
1674
1675 /*
1676 * Wakeup any waiter(s) in prepare_coveredvp() that is waiting for the
1677 * 'v_mountedhere' to be planted.
1678 */
1679 wakeup(&vp->v_flag);
1680 vnode_unlock(vp);
1681
1682 /*
1683 * taking the name_cache_lock exclusively will
1684 * insure that everyone is out of the fast path who
1685 * might be trying to use a now stale copy of
1686 * vp->v_mountedhere->mnt_realrootvp
1687 * bumping mount_generation causes the cached values
1688 * to be invalidated
1689 */
1690 name_cache_lock();
1691 mount_generation++;
1692 name_cache_unlock();
1693
1694 error = vnode_ref(vp);
1695 if (error != 0) {
1696 goto out4;
1697 }
1698
1699 have_usecount = TRUE;
1700
1701 error = checkdirs(vp, ctx);
1702 if (error != 0) {
1703 /* Unmount the filesystem as cdir/rdirs cannot be updated */
1704 goto out4;
1705 }
1706 /*
1707 * there is no cleanup code here so I have made it void
1708 * we need to revisit this
1709 */
1710 (void)VFS_START(mp, 0, ctx);
1711
1712 if (mount_list_add(mp) != 0) {
1713 /*
1714 * The system is shutting down trying to umount
1715 * everything, so fail with a plausible errno.
1716 */
1717 error = EBUSY;
1718 goto out4;
1719 }
1720 lck_rw_done(&mp->mnt_rwlock);
1721 is_rwlock_locked = FALSE;
1722
1723 /* Check if this mounted file system supports EAs or named streams. */
1724 /* Skip WebDAV file systems for now since they hang in VFS_GETATTR here. */
1725 VFSATTR_INIT(&vfsattr);
1726 VFSATTR_WANTED(&vfsattr, f_capabilities);
1727 if (strncmp(mp->mnt_vfsstat.f_fstypename, "webdav", sizeof("webdav")) != 0 &&
1728 vfs_getattr(mp, &vfsattr, ctx) == 0 &&
1729 VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
1730 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR) &&
1731 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR)) {
1732 mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
1733 }
1734 #if NAMEDSTREAMS
1735 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS) &&
1736 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS)) {
1737 mp->mnt_kern_flag |= MNTK_NAMED_STREAMS;
1738 }
1739 #endif
1740 /* Check if this file system supports path from id lookups. */
1741 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID) &&
1742 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID)) {
1743 mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
1744 } else if (mp->mnt_flag & MNT_DOVOLFS) {
1745 /* Legacy MNT_DOVOLFS flag also implies path from id lookups. */
1746 mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
1747 }
1748
1749 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS) &&
1750 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS)) {
1751 mp->mnt_kern_flag |= MNTK_DIR_HARDLINKS;
1752 }
1753 }
1754 if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSNATIVEXATTR) {
1755 mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
1756 }
1757 if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSPREFLIGHT) {
1758 mp->mnt_kern_flag |= MNTK_UNMOUNT_PREFLIGHT;
1759 }
1760 /* increment the operations count */
1761 OSAddAtomic(1, &vfs_nummntops);
1762 enablequotas(mp, ctx);
1763
1764 if (device_vnode) {
1765 vfs_setmountedon(device_vnode);
1766 }
1767
1768 /* Now that mount is setup, notify the listeners */
1769 vfs_notify_mount(pvp);
1770 IOBSDMountChange(mp, kIOMountChangeMount);
1771 } else {
1772 /* If we fail a fresh mount, there should be no vnodes left hooked into the mountpoint. */
1773 if (mp->mnt_vnodelist.tqh_first != NULL) {
1774 panic("mount_common(): mount of %s filesystem failed with %d, but vnode list is not empty.",
1775 mp->mnt_vtable->vfc_name, error);
1776 }
1777
1778 vnode_lock_spin(vp);
1779 CLR(vp->v_flag, VMOUNT);
1780 /* Wakeup waiter(s) waiting for in-progress mount to finish. */
1781 wakeup(&vp->v_flag);
1782 vnode_unlock(vp);
1783 mount_list_lock();
1784 mp->mnt_vtable->vfc_refcount--;
1785 mount_list_unlock();
1786
1787 if (device_vnode) {
1788 vnode_rele(device_vnode);
1789 VNOP_CLOSE(device_vnode, ronly ? FREAD : FREAD | FWRITE, ctx);
1790 vfs_clearmounting(device_vnode);
1791 }
1792 lck_rw_done(&mp->mnt_rwlock);
1793 is_rwlock_locked = FALSE;
1794
1795 if (nc_smr_enabled) {
1796 vfs_smr_synchronize();
1797 }
1798
1799 /*
1800 * if we get here, we have a mount structure that needs to be freed,
1801 * but since the coveredvp hasn't yet been updated to point at it,
1802 * no need to worry about other threads holding a crossref on this mp
1803 * so it's ok to just free it
1804 */
1805 mount_lock_destroy(mp);
1806 #if CONFIG_MACF
1807 mac_mount_label_destroy(mp);
1808 #endif
1809 zfree(mount_zone, mp);
1810 did_set_lmount = false;
1811 }
1812 exit:
1813 /*
1814 * drop I/O count on the device vp if there was one
1815 */
1816 if (devpath && devvp) {
1817 vnode_put(devvp);
1818 }
1819
1820 if (did_set_lmount) {
1821 mount_lock_spin(mp);
1822 mp->mnt_lflag &= ~MNT_LMOUNT;
1823 mount_unlock(mp);
1824 }
1825
1826 return error;
1827
1828 /* Error condition exits */
1829 out4:
1830 (void)VFS_UNMOUNT(mp, MNT_FORCE, ctx);
1831
1832 /*
1833 * If the mount has been placed on the covered vp,
1834 * it may have been discovered by now, so we have
1835 * to treat this just like an unmount
1836 */
1837 mount_lock_spin(mp);
1838 mp->mnt_lflag |= MNT_LDEAD;
1839 mount_unlock(mp);
1840
1841 if (device_vnode != NULLVP) {
1842 vnode_rele(device_vnode);
1843 VNOP_CLOSE(device_vnode, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD | FWRITE,
1844 ctx);
1845 vfs_clearmounting(device_vnode);
1846 did_rele = TRUE;
1847 }
1848
1849 vnode_lock_spin(vp);
1850
1851 mp->mnt_crossref++;
1852 CLR(vp->v_flag, VMOUNTEDHERE);
1853 vp->v_mountedhere = (mount_t) 0;
1854
1855 vnode_unlock(vp);
1856
1857 if (have_usecount) {
1858 vnode_rele(vp);
1859 }
1860 out3:
1861 if (devpath && ((flags & MNT_UPDATE) == 0) && (!did_rele)) {
1862 vnode_rele(devvp);
1863 vfs_clearmounting(devvp);
1864 }
1865 out2:
1866 if (devpath && devvp) {
1867 vnode_put(devvp);
1868 }
1869 out1:
1870 /* Release mnt_rwlock only when it was taken */
1871 if (is_rwlock_locked == TRUE) {
1872 if (flag_set) {
1873 mp->mnt_flag = flag; /* restore mnt_flag value */
1874 }
1875 lck_rw_done(&mp->mnt_rwlock);
1876 }
1877
1878 if (did_set_lmount) {
1879 mount_lock_spin(mp);
1880 mp->mnt_lflag &= ~MNT_LMOUNT;
1881 mount_unlock(mp);
1882 }
1883
1884 if (did_set_vmount) {
1885 vnode_lock_spin(vp);
1886 CLR(vp->v_flag, VMOUNT);
1887 /* Wakeup waiter(s) waiting for in-progress mount to finish. */
1888 wakeup(&vp->v_flag);
1889 vnode_unlock(vp);
1890 }
1891
1892 if (mntalloc) {
1893 if (mp->mnt_crossref) {
1894 mount_dropcrossref(mp, vp, 0);
1895 } else {
1896 if (nc_smr_enabled) {
1897 vfs_smr_synchronize();
1898 }
1899
1900 mount_lock_destroy(mp);
1901 #if CONFIG_MACF
1902 mac_mount_label_destroy(mp);
1903 #endif
1904 zfree(mount_zone, mp);
1905 }
1906 }
1907 if (vfsp_ref) {
1908 mount_list_lock();
1909 vfsp->vfc_refcount--;
1910 mount_list_unlock();
1911 }
1912
1913 return error;
1914 }
1915
1916 /*
1917 * Flush in-core data, check for competing mount attempts,
1918 * and set VMOUNT
1919 */
1920 int
prepare_coveredvp(vnode_t vp,vfs_context_t ctx,struct componentname * cnp,const char * fsname,uint32_t internal_flags)1921 prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, uint32_t internal_flags)
1922 {
1923 #if !CONFIG_MACF
1924 #pragma unused(cnp,fsname)
1925 #endif
1926 struct vnode_attr va;
1927 int error;
1928 boolean_t skip_auth = !!(internal_flags & KERNEL_MOUNT_NOAUTH);
1929 boolean_t is_fmount = !!(internal_flags & KERNEL_MOUNT_FMOUNT);
1930 boolean_t is_kmount = !!(internal_flags & KERNEL_MOUNT_KMOUNT);
1931
1932 if (!skip_auth) {
1933 /*
1934 * If the user is not root, ensure that they own the directory
1935 * onto which we are attempting to mount.
1936 */
1937 VATTR_INIT(&va);
1938 VATTR_WANTED(&va, va_uid);
1939 if ((error = vnode_getattr(vp, &va, ctx)) ||
1940 (va.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
1941 (!vfs_context_issuser(ctx)))) {
1942 error = EPERM;
1943 goto out;
1944 }
1945 }
1946
1947 if ((error = VNOP_FSYNC(vp, MNT_WAIT, ctx))) {
1948 goto out;
1949 }
1950
1951 if ((error = buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0))) {
1952 goto out;
1953 }
1954
1955 if (vp->v_type != VDIR) {
1956 error = ENOTDIR;
1957 goto out;
1958 }
1959
1960 vnode_lock_spin(vp);
1961
1962 if (is_fmount && (ISSET(vp->v_flag, VMOUNT) || (vp->v_mountedhere != NULL))) {
1963 error = EBUSY;
1964 } else if (!is_kmount && (ISSET(vp->v_flag, VMOUNT) ||
1965 (vp->v_mountedhere != NULL))) {
1966 /*
1967 * For mount triggered from mount() call, we want to wait for the
1968 * current in-progress mount to complete, redo lookup and retry the
1969 * mount again. Similarly, we also want to retry if we lost the race
1970 * due to concurrent mounts and the 'VMOUNT' flag has been cleared and
1971 * 'v_mountedhere' has been planted after initial lookup.
1972 */
1973 if (ISSET(vp->v_flag, VMOUNT)) {
1974 vnode_lock_convert(vp);
1975 msleep(&vp->v_flag, &vp->v_lock, PVFS, "vnode_waitformount", NULL);
1976 }
1977 error = EBUSY;
1978 } else if (ISSET(vp->v_flag, VMOUNT) && (vp->v_mountedhere != NULL)) {
1979 error = EBUSY;
1980 }
1981
1982 if (error) {
1983 vnode_unlock(vp);
1984 goto out;
1985 }
1986 SET(vp->v_flag, VMOUNT);
1987 vnode_unlock(vp);
1988
1989 #if CONFIG_MACF
1990 error = mac_mount_check_mount(ctx, vp,
1991 cnp, fsname);
1992 if (error != 0) {
1993 vnode_lock_spin(vp);
1994 CLR(vp->v_flag, VMOUNT);
1995 /* Wakeup waiter(s) waiting for in-progress mount to finish. */
1996 wakeup(&vp->v_flag);
1997 vnode_unlock(vp);
1998 }
1999 #endif
2000
2001 out:
2002 return error;
2003 }
2004
2005 #if CONFIG_IMGSRC_ACCESS
2006
2007 #define DEBUG_IMGSRC 0
2008
2009 #if DEBUG_IMGSRC
2010 #define IMGSRC_DEBUG(args...) printf("imgsrc: " args)
2011 #else
2012 #define IMGSRC_DEBUG(args...) do { } while(0)
2013 #endif
2014
/*
 * Look up 'devpath' (user-space path, or kernel-space when ctx is the
 * kernel context), verify that it names the same block device that
 * already backs 'mp', check that a non-root caller has sufficient
 * access to the device, and copy the path into
 * mp->mnt_vfsstat.f_mntfromname.
 *
 * On success, *devvpp holds the looked-up device vnode with an iocount
 * that the caller must release with vnode_put().  On failure every
 * reference taken here is dropped before returning.
 */
static int
authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx)
{
	struct nameidata nd;
	vnode_t vp, realdevvp;
	kauth_action_t accessmode;
	int error;
	enum uio_seg uio = UIO_USERSPACE;

	/* Kernel-context callers pass a kernel-space path string. */
	if (ctx == vfs_context_kernel()) {
		uio = UIO_SYSSPACE;
	}

	NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW, uio, devpath, ctx);
	if ((error = namei(&nd))) {
		IMGSRC_DEBUG("namei() failed with %d\n", error);
		return error;
	}

	vp = nd.ni_vp;	/* carries an iocount from namei() */

	if (!vnode_isblk(vp)) {
		IMGSRC_DEBUG("Not block device.\n");
		error = ENOTBLK;
		goto out;
	}

	realdevvp = mp->mnt_devvp;
	if (realdevvp == NULLVP) {
		IMGSRC_DEBUG("No device backs the mount.\n");
		error = ENXIO;
		goto out;
	}

	error = vnode_getwithref(realdevvp);
	if (error != 0) {
		IMGSRC_DEBUG("Coudn't get iocount on device.\n");
		goto out;
	}

	/* The supplied path must resolve to the very same dev_t. */
	if (vnode_specrdev(vp) != vnode_specrdev(realdevvp)) {
		IMGSRC_DEBUG("Wrong dev_t.\n");
		error = ENXIO;
		goto out1;
	}

	strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);

	/*
	 * If mount by non-root, then verify that user has necessary
	 * permissions on the device.
	 */
	if (!vfs_context_issuser(ctx)) {
		accessmode = KAUTH_VNODE_READ_DATA;
		if ((mp->mnt_flag & MNT_RDONLY) == 0) {
			accessmode |= KAUTH_VNODE_WRITE_DATA;
		}
		if ((error = vnode_authorize(vp, NULL, accessmode, ctx)) != 0) {
			IMGSRC_DEBUG("Access denied.\n");
			goto out1;
		}
	}

	*devvpp = vp;

out1:
	vnode_put(realdevvp);

out:
	nameidone(&nd);

	/* On error also drop the iocount namei() took on vp. */
	if (error) {
		vnode_put(vp);
	}

	return error;
}
2092
/*
 * Clear VMOUNT, set v_mountedhere, and mnt_vnodecovered, ref the vnode,
 * and call checkdirs()
 *
 * 'vp' must already be marked in-progress (VMOUNT) by prepare_coveredvp().
 * On failure mnt_vnodecovered is reset to NULLVP; note VMOUNT has been
 * cleared by then, so the caller must unwind with the "placed" path
 * (undo_place_on_covered_vp()).
 */
static int
place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx)
{
	int error;

	mp->mnt_vnodecovered = vp; /* XXX This is normally only set at init-time ... */

	IMGSRC_DEBUG("placing: fsname = %s, vp = %s\n",
	    mp->mnt_vtable->vfc_name, vnode_getname(vp));

	/* Publish the mount on the covered vnode; drop the VMOUNT marker. */
	vnode_lock_spin(vp);
	CLR(vp->v_flag, VMOUNT);
	vp->v_mountedhere = mp;
	SET(vp->v_flag, VMOUNTEDHERE);
	/* Wakeup waiter(s) waiting for in-progress mount to finish. */
	wakeup(&vp->v_flag);
	vnode_unlock(vp);

	/*
	 * taking the name_cache_lock exclusively will
	 * insure that everyone is out of the fast path who
	 * might be trying to use a now stale copy of
	 * vp->v_mountedhere->mnt_realrootvp
	 * bumping mount_generation causes the cached values
	 * to be invalidated
	 */
	name_cache_lock();
	mount_generation++;
	name_cache_unlock();

	/* Persistent usecount keeps the covered vnode alive while mounted on. */
	error = vnode_ref(vp);
	if (error != 0) {
		goto out;
	}

	error = checkdirs(vp, ctx);
	if (error != 0) {
		/* Unmount the filesystem as cdir/rdirs cannot be updated */
		vnode_rele(vp);
		goto out;
	}

out:
	if (error != 0) {
		mp->mnt_vnodecovered = NULLVP;
	}
	return error;
}
2145
/*
 * Back out place_mount_and_checkdirs(): drop the usecount taken on the
 * covered vnode, clear the mount markers and v_mountedhere, wake any
 * threads waiting on the in-progress mount, and detach mp from vp.
 */
static void
undo_place_on_covered_vp(mount_t mp, vnode_t vp)
{
	vnode_rele(vp);
	vnode_lock_spin(vp);
	CLR(vp->v_flag, (VMOUNT | VMOUNTEDHERE));
	vp->v_mountedhere = (mount_t)NULL;
	/* Wakeup waiter(s) waiting for in-progress mount to finish. */
	wakeup(&vp->v_flag);
	vnode_unlock(vp);

	mp->mnt_vnodecovered = NULLVP;
}
2159
2160 static int
mount_begin_update(mount_t mp,vfs_context_t ctx,int flags)2161 mount_begin_update(mount_t mp, vfs_context_t ctx, int flags)
2162 {
2163 int error;
2164
2165 /* unmount in progress return error */
2166 mount_lock_spin(mp);
2167 if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
2168 mount_unlock(mp);
2169 return EBUSY;
2170 }
2171 mount_unlock(mp);
2172 lck_rw_lock_exclusive(&mp->mnt_rwlock);
2173
2174 /*
2175 * We only allow the filesystem to be reloaded if it
2176 * is currently mounted read-only.
2177 */
2178 if ((flags & MNT_RELOAD) &&
2179 ((mp->mnt_flag & MNT_RDONLY) == 0)) {
2180 error = ENOTSUP;
2181 goto out;
2182 }
2183
2184 /*
2185 * Only root, or the user that did the original mount is
2186 * permitted to update it.
2187 */
2188 if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
2189 (!vfs_context_issuser(ctx))) {
2190 error = EPERM;
2191 goto out;
2192 }
2193 #if CONFIG_MACF
2194 error = mac_mount_check_remount(ctx, mp);
2195 if (error != 0) {
2196 goto out;
2197 }
2198 #endif
2199
2200 out:
2201 if (error) {
2202 lck_rw_done(&mp->mnt_rwlock);
2203 }
2204
2205 return error;
2206 }
2207
/*
 * Release the exclusive mnt_rwlock taken by a successful
 * mount_begin_update().
 */
static void
mount_end_update(mount_t mp)
{
	lck_rw_done(&mp->mnt_rwlock);
}
2213
2214 static int
get_imgsrc_rootvnode(uint32_t height,vnode_t * rvpp)2215 get_imgsrc_rootvnode(uint32_t height, vnode_t *rvpp)
2216 {
2217 vnode_t vp;
2218
2219 if (height >= MAX_IMAGEBOOT_NESTING) {
2220 return EINVAL;
2221 }
2222
2223 vp = imgsrc_rootvnodes[height];
2224 if ((vp != NULLVP) && (vnode_get(vp) == 0)) {
2225 *rvpp = vp;
2226 return 0;
2227 } else {
2228 return ENOENT;
2229 }
2230 }
2231
2232 static int
relocate_imageboot_source(vnode_t pvp,vnode_t vp,struct componentname * cnp,const char * fsname,vfs_context_t ctx,boolean_t is64bit,user_addr_t fsmountargs,boolean_t by_index)2233 relocate_imageboot_source(vnode_t pvp, vnode_t vp,
2234 struct componentname *cnp, const char *fsname, vfs_context_t ctx,
2235 boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index)
2236 {
2237 int error;
2238 mount_t mp;
2239 boolean_t placed = FALSE;
2240 struct vfstable *vfsp;
2241 user_addr_t devpath;
2242 char *old_mntonname;
2243 vnode_t rvp;
2244 vnode_t devvp;
2245 uint32_t height;
2246 uint32_t flags;
2247
2248 /* If we didn't imageboot, nothing to move */
2249 if (imgsrc_rootvnodes[0] == NULLVP) {
2250 return EINVAL;
2251 }
2252
2253 /* Only root can do this */
2254 if (!vfs_context_issuser(ctx)) {
2255 return EPERM;
2256 }
2257
2258 IMGSRC_DEBUG("looking for root vnode.\n");
2259
2260 /*
2261 * Get root vnode of filesystem we're moving.
2262 */
2263 if (by_index) {
2264 if (is64bit) {
2265 struct user64_mnt_imgsrc_args mia64;
2266 error = copyin(fsmountargs, &mia64, sizeof(mia64));
2267 if (error != 0) {
2268 IMGSRC_DEBUG("Failed to copy in arguments.\n");
2269 return error;
2270 }
2271
2272 height = mia64.mi_height;
2273 flags = mia64.mi_flags;
2274 devpath = (user_addr_t)mia64.mi_devpath;
2275 } else {
2276 struct user32_mnt_imgsrc_args mia32;
2277 error = copyin(fsmountargs, &mia32, sizeof(mia32));
2278 if (error != 0) {
2279 IMGSRC_DEBUG("Failed to copy in arguments.\n");
2280 return error;
2281 }
2282
2283 height = mia32.mi_height;
2284 flags = mia32.mi_flags;
2285 devpath = mia32.mi_devpath;
2286 }
2287 } else {
2288 /*
2289 * For binary compatibility--assumes one level of nesting.
2290 */
2291 if (is64bit) {
2292 if ((error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath)))) {
2293 return error;
2294 }
2295 } else {
2296 user32_addr_t tmp;
2297 if ((error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp)))) {
2298 return error;
2299 }
2300
2301 /* munge into LP64 addr */
2302 devpath = CAST_USER_ADDR_T(tmp);
2303 }
2304
2305 height = 0;
2306 flags = 0;
2307 }
2308
2309 if (flags != 0) {
2310 IMGSRC_DEBUG("%s: Got nonzero flags.\n", __FUNCTION__);
2311 return EINVAL;
2312 }
2313
2314 error = get_imgsrc_rootvnode(height, &rvp);
2315 if (error != 0) {
2316 IMGSRC_DEBUG("getting old root vnode failed with %d\n", error);
2317 return error;
2318 }
2319
2320 IMGSRC_DEBUG("got old root vnode\n");
2321
2322 old_mntonname = zalloc_flags(ZV_NAMEI, Z_WAITOK);
2323
2324 /* Can only move once */
2325 mp = vnode_mount(rvp);
2326 if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
2327 IMGSRC_DEBUG("Already moved.\n");
2328 error = EBUSY;
2329 goto out0;
2330 }
2331
2332 IMGSRC_DEBUG("moving rvp: fsname = %s\n", mp->mnt_vtable->vfc_name);
2333 IMGSRC_DEBUG("Starting updated.\n");
2334
2335 /* Get exclusive rwlock on mount, authorize update on mp */
2336 error = mount_begin_update(mp, ctx, 0);
2337 if (error != 0) {
2338 IMGSRC_DEBUG("Starting updated failed with %d\n", error);
2339 goto out0;
2340 }
2341
2342 /*
2343 * It can only be moved once. Flag is set under the rwlock,
2344 * so we're now safe to proceed.
2345 */
2346 if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
2347 IMGSRC_DEBUG("Already moved [2]\n");
2348 goto out1;
2349 }
2350
2351 IMGSRC_DEBUG("Preparing coveredvp.\n");
2352
2353 /* Mark covered vnode as mount in progress, authorize placing mount on top */
2354 error = prepare_coveredvp(vp, ctx, cnp, fsname, 0);
2355 if (error != 0) {
2356 IMGSRC_DEBUG("Preparing coveredvp failed with %d.\n", error);
2357 goto out1;
2358 }
2359
2360 IMGSRC_DEBUG("Covered vp OK.\n");
2361
2362 /* Sanity check the name caller has provided */
2363 vfsp = mp->mnt_vtable;
2364 if (strncmp(vfsp->vfc_name, fsname, MFSNAMELEN) != 0) {
2365 IMGSRC_DEBUG("Wrong fs name: actual = %s, expected = %s\n",
2366 vfsp->vfc_name, fsname);
2367 error = EINVAL;
2368 goto out2;
2369 }
2370
2371 /* Check the device vnode and update mount-from name, for local filesystems */
2372 if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) {
2373 IMGSRC_DEBUG("Local, doing device validation.\n");
2374
2375 if (devpath != USER_ADDR_NULL) {
2376 error = authorize_devpath_and_update_mntfromname(mp, devpath, &devvp, ctx);
2377 if (error) {
2378 IMGSRC_DEBUG("authorize_devpath_and_update_mntfromname() failed.\n");
2379 goto out2;
2380 }
2381
2382 vnode_put(devvp);
2383 }
2384 }
2385
2386 /*
2387 * Place mp on top of vnode, ref the vnode, call checkdirs(),
2388 * and increment the name cache's mount generation
2389 */
2390
2391 IMGSRC_DEBUG("About to call place_mount_and_checkdirs().\n");
2392 error = place_mount_and_checkdirs(mp, vp, ctx);
2393 if (error != 0) {
2394 goto out2;
2395 }
2396
2397 placed = TRUE;
2398
2399 strlcpy(old_mntonname, mp->mnt_vfsstat.f_mntonname, MAXPATHLEN);
2400 strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);
2401
2402 /* Forbid future moves */
2403 mount_lock(mp);
2404 mp->mnt_kern_flag |= MNTK_HAS_MOVED;
2405 mount_unlock(mp);
2406
2407 /* Finally, add to mount list, completely ready to go */
2408 if (mount_list_add(mp) != 0) {
2409 /*
2410 * The system is shutting down trying to umount
2411 * everything, so fail with a plausible errno.
2412 */
2413 error = EBUSY;
2414 goto out3;
2415 }
2416
2417 mount_end_update(mp);
2418 vnode_put(rvp);
2419 zfree(ZV_NAMEI, old_mntonname);
2420
2421 vfs_notify_mount(pvp);
2422
2423 return 0;
2424 out3:
2425 strlcpy(mp->mnt_vfsstat.f_mntonname, old_mntonname, MAXPATHLEN);
2426
2427 mount_lock(mp);
2428 mp->mnt_kern_flag &= ~(MNTK_HAS_MOVED);
2429 mount_unlock(mp);
2430
2431 out2:
2432 /*
2433 * Placing the mp on the vnode clears VMOUNT,
2434 * so cleanup is different after that point
2435 */
2436 if (placed) {
2437 /* Rele the vp, clear VMOUNT and v_mountedhere */
2438 undo_place_on_covered_vp(mp, vp);
2439 } else {
2440 vnode_lock_spin(vp);
2441 CLR(vp->v_flag, VMOUNT);
2442 /* Wakeup waiter(s) waiting for in-progress mount to finish. */
2443 wakeup(&vp->v_flag);
2444 vnode_unlock(vp);
2445 }
2446 out1:
2447 mount_end_update(mp);
2448
2449 out0:
2450 vnode_put(rvp);
2451 zfree(ZV_NAMEI, old_mntonname);
2452 return error;
2453 }
2454
2455 #endif /* CONFIG_IMGSRC_ACCESS */
2456
/*
 * Turn on disk quotas for a freshly mounted HFS volume when the quota
 * "ops" trigger files exist at the mount root.  Errors are deliberately
 * ignored so that quota setup can never fail a mount.
 */
void
enablequotas(struct mount *mp, vfs_context_t ctx)
{
	struct nameidata qnd;
	int type;
	char qfpath[MAXPATHLEN];
	const char *qfname = QUOTAFILENAME;
	const char *qfopsname = QUOTAOPSNAME;
	const char *qfextension[] = INITQFNAMES;

	/* XXX Should be an MNTK_ flag, instead of strncmp()'s */
	if (strncmp(mp->mnt_vfsstat.f_fstypename, "hfs", sizeof("hfs")) != 0) {
		return;
	}
	/*
	 * Enable filesystem disk quotas if necessary.
	 * We ignore errors as this should not interfere with final mount
	 */
	for (type = 0; type < MAXQUOTAS; type++) {
		/* Presence of "<mntonname>/<opsname>.<ext>" triggers quota-on. */
		snprintf(qfpath, sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfopsname, qfextension[type]);
		NDINIT(&qnd, LOOKUP, OP_MOUNT, FOLLOW, UIO_SYSSPACE,
		    CAST_USER_ADDR_T(qfpath), ctx);
		if (namei(&qnd) != 0) {
			continue; /* option file to trigger quotas is not present */
		}
		vnode_put(qnd.ni_vp);
		nameidone(&qnd);
		/* Hand the actual quota-file path to the filesystem. */
		snprintf(qfpath, sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfname, qfextension[type]);

		(void) VFS_QUOTACTL(mp, QCMD(Q_QUOTAON, type), 0, qfpath, ctx);
	}
	return;
}
2490
2491
/*
 * Per-process callback for checkdirs(): if the process's current or
 * root directory is the just-covered vnode (cdrp->olddp), swap it for
 * the new mount's root (cdrp->newdp), transferring usecounts.
 * Always returns PROC_RETURNED so iteration continues.
 */
static int
checkdirs_callback(proc_t p, void * arg)
{
	struct cdirargs *cdrp = (struct cdirargs *)arg;
	vnode_t olddp = cdrp->olddp;
	vnode_t newdp = cdrp->newdp;
	struct filedesc *fdp = &p->p_fd;
	/* new_* start non-NULL and are cleared when consumed; old_* the reverse. */
	vnode_t new_cvp = newdp;
	vnode_t new_rvp = newdp;
	vnode_t old_cvp = NULL;
	vnode_t old_rvp = NULL;

	/*
	 * XXX Also needs to iterate each thread in the process to see if it
	 * XXX is using a per-thread current working directory, and, if so,
	 * XXX update that as well.
	 */

	/*
	 * First, with the proc_fdlock held, check to see if we will need
	 * to do any work.  If not, we will get out fast.
	 */
	proc_fdlock(p);
	if (fdp->fd_cdir != olddp && fdp->fd_rdir != olddp) {
		proc_fdunlock(p);
		return PROC_RETURNED;
	}
	proc_fdunlock(p);

	/*
	 * Ok, we will have to do some work.  Always take two refs
	 * because we might need that many.  We'll dispose of whatever
	 * we ended up not using.
	 */
	if (vnode_ref(newdp) != 0) {
		return PROC_RETURNED;
	}
	if (vnode_ref(newdp) != 0) {
		vnode_rele(newdp);
		return PROC_RETURNED;
	}

	proc_dirs_lock_exclusive(p);
	/*
	 * Now do the work.  Note: we dropped the proc_fdlock, so we
	 * have to do all of the checks again.
	 */
	proc_fdlock(p);
	if (fdp->fd_cdir == olddp) {
		old_cvp = olddp;
		fdp->fd_cdir = newdp;
		new_cvp = NULL;	/* ref consumed by fd_cdir */
	}
	if (fdp->fd_rdir == olddp) {
		old_rvp = olddp;
		fdp->fd_rdir = newdp;
		new_rvp = NULL;	/* ref consumed by fd_rdir */
	}
	proc_fdunlock(p);
	proc_dirs_unlock_exclusive(p);

	/*
	 * Dispose of any references that are no longer needed.
	 */
	if (old_cvp != NULL) {
		vnode_rele(old_cvp);
	}
	if (old_rvp != NULL) {
		vnode_rele(old_rvp);
	}
	if (new_cvp != NULL) {
		vnode_rele(new_cvp);
	}
	if (new_rvp != NULL) {
		vnode_rele(new_rvp);
	}

	return PROC_RETURNED;
}
2571
2572
2573
/*
 * Scan all active processes to see if any of them have a current
 * or root directory onto which the new filesystem has just been
 * mounted. If so, replace them with the new mount point.
 *
 * 'olddp' is the covered vnode; its v_mountedhere must already point
 * at the new mount so VFS_ROOT() can produce the replacement vnode.
 * Also swaps the global rootvnode when the root itself was covered.
 */
static int
checkdirs(vnode_t olddp, vfs_context_t ctx)
{
	vnode_t newdp;
	vnode_t tvp;
	int err;
	struct cdirargs cdr;

	/* Sole usecount is the mount's own ref: no process can be using it. */
	if (olddp->v_usecount == 1) {
		return 0;
	}
	err = VFS_ROOT(olddp->v_mountedhere, &newdp, ctx);

	if (err != 0) {
#if DIAGNOSTIC
		panic("mount: lost mount: error %d", err);
#endif
		return err;
	}

	cdr.olddp = olddp;
	cdr.newdp = newdp;
	/* do not block for exec/fork trans as the vp in cwd & rootdir are not changing */
	proc_iterate(PROC_ALLPROCLIST | PROC_NOWAITTRANS, checkdirs_callback, (void *)&cdr, NULL, NULL);

	/* If the system root was covered, repoint the global rootvnode. */
	if (rootvnode == olddp) {
		vnode_ref(newdp);
		lck_rw_lock_exclusive(&rootvnode_rw_lock);
		tvp = rootvnode;
		rootvnode = newdp;
		lck_rw_unlock_exclusive(&rootvnode_rw_lock);
		vnode_rele(tvp);
	}

	vnode_put(newdp);
	return 0;
}
2616
/*
 * Entitlement that lets a process unmount volumes it does not own and
 * bypass path-lookup permission checks (see unmount()/safedounmount()).
 */
#define ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT \
	"com.apple.private.vfs.role-account-unmount"
2619
/*
 * Unmount a file system.
 *
 * Note: unmount takes a path to the vnode mounted on as argument,
 * not special file (as before).
 *
 * Resolves uap->path, requires it to be the root of a mounted
 * filesystem, runs the MAC unmount check, then hands the mount (with a
 * mount ref) to safedounmount(), which consumes the ref.
 */
/* ARGSUSED */
int
unmount(__unused proc_t p, struct unmount_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	struct mount *mp;
	int error;
	struct nameidata nd;
	vfs_context_t ctx;

	/*
	 * If the process has the entitlement, use the kernel's context when
	 * performing lookup on the mount path as the process might lack proper
	 * permission to access the directory.
	 */
	ctx = IOCurrentTaskHasEntitlement(ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT) ?
	    vfs_context_kernel() : vfs_context_current();

	NDINIT(&nd, LOOKUP, OP_UNMOUNT, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;
	mp = vp->v_mount;
	nameidone(&nd);

	/*
	 * Must be the root of the filesystem
	 */
	if ((vp->v_flag & VROOT) == 0) {
		vnode_put(vp);
		return EINVAL;
	}
#if CONFIG_MACF
	error = mac_mount_check_umount(ctx, mp);
	if (error != 0) {
		vnode_put(vp);
		return error;
	}
#endif
	/* Take a mount ref so mp stays valid after the vnode is released. */
	mount_ref(mp, 0);
	vnode_put(vp);
	/* safedounmount consumes the mount ref */
	return safedounmount(mp, uap->flags, ctx);
}
2673
2674 int
vfs_unmountbyfsid(fsid_t * fsid,int flags,vfs_context_t ctx)2675 vfs_unmountbyfsid(fsid_t *fsid, int flags, vfs_context_t ctx)
2676 {
2677 mount_t mp;
2678
2679 mp = mount_list_lookupby_fsid(fsid, 0, 1);
2680 if (mp == (mount_t)0) {
2681 return ENOENT;
2682 }
2683 mount_ref(mp, 0);
2684 mount_iterdrop(mp);
2685 /* safedounmount consumes the mount ref */
2686 return safedounmount(mp, flags, ctx);
2687 }
2688
/*
 * The mount struct comes with a mount ref which will be consumed.
 * Do the actual file system unmount, prevent some common foot shooting.
 *
 * Rejects: non-responsive filesystems under MNT_NOBLOCK (EBUSY),
 * callers who are neither root nor the mounter (unless entitled or the
 * mount permits it), the root filesystem and its associated system
 * mounts, and the mount backing the root disk image.  On any rejection
 * the mount ref is dropped here; otherwise dounmount() consumes it.
 */
int
safedounmount(struct mount *mp, int flags, vfs_context_t ctx)
{
	int error;
	proc_t p = vfs_context_proc(ctx);

	/*
	 * If the file system is not responding and MNT_NOBLOCK
	 * is set and not a forced unmount then return EBUSY.
	 */
	/*
	 * NOTE(review): MNT_LNOTRESP is tested against mnt_kern_flag here
	 * rather than mnt_lflag — confirm the intended flag/field pairing.
	 */
	if ((mp->mnt_kern_flag & MNT_LNOTRESP) &&
	    (flags & MNT_NOBLOCK) && ((flags & MNT_FORCE) == 0)) {
		error = EBUSY;
		goto out;
	}

	/*
	 * Skip authorization in two cases:
	 * - If the process running the unmount has ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT.
	 *   This entitlement allows non-root processes unmount volumes mounted by
	 *   other processes.
	 * - If the mount is tagged as permissive and this is not a forced-unmount
	 *   attempt.
	 */
	if (!IOCurrentTaskHasEntitlement(ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT) &&
	    (!(((mp->mnt_kern_flag & MNTK_PERMIT_UNMOUNT) != 0) && ((flags & MNT_FORCE) == 0)))) {
		/*
		 * Only root, or the user that did the original mount is
		 * permitted to unmount this filesystem.
		 */
		if ((mp->mnt_vfsstat.f_owner != kauth_cred_getuid(kauth_cred_get())) &&
		    (error = suser(kauth_cred_get(), &p->p_acflag))) {
			goto out;
		}
	}
	/*
	 * Don't allow unmounting the root file system, or other volumes
	 * associated with it (for example, the associated VM or DATA mounts) .
	 */
	if ((mp->mnt_flag & MNT_ROOTFS) || (mp->mnt_kern_flag & MNTK_SYSTEM)) {
		if (!(mp->mnt_flag & MNT_ROOTFS)) {
			printf("attempt to unmount a system mount (%s), will return EBUSY\n",
			    mp->mnt_vfsstat.f_mntonname);
		}
		error = EBUSY; /* the root (or associated volumes) is always busy */
		goto out;
	}

	/*
	 * If the mount is providing the root filesystem's disk image
	 * (i.e. imageboot), don't allow unmounting
	 */
	if (mp->mnt_kern_flag & MNTK_BACKS_ROOT) {
		error = EBUSY;
		goto out;
	}

	/* dounmount() takes over the mount ref (withref == 1). */
	return dounmount(mp, flags, 1, ctx);

out:
	mount_drop(mp, 0);
	return error;
}
2756
/*
 * Do the actual file system unmount.
 *
 * 'withref' indicates the caller passed in a mount ref that must be
 * dropped here.  Phases: mark the unmount in progress (MNTK_UNMOUNT /
 * MNT_LUNMOUNT), optionally unmount submounts on a forced unmount,
 * sync and flush vnodes, call VFS_UNMOUNT(), close the backing device,
 * detach from the covered vnode, and finally tear the mount down or
 * restore state on failure.
 */
int
dounmount(struct mount *mp, int flags, int withref, vfs_context_t ctx)
{
	vnode_t coveredvp = (vnode_t)0;
	int error;
	int needwakeup = 0;
	int forcedunmount = 0;
	int lflags = 0;
	struct vnode *devvp = NULLVP;
#if CONFIG_TRIGGERS
	proc_t p = vfs_context_proc(ctx);
	int did_vflush = 0;
	int pflags_save = 0;
#endif /* CONFIG_TRIGGERS */

#if CONFIG_FSE
	if (!(flags & MNT_FORCE)) {
		fsevent_unmount(mp, ctx);  /* has to come first! */
	}
#endif

	mount_lock(mp);

	/*
	 * If already an unmount in progress just return EBUSY.
	 * Even a forced unmount cannot override.
	 */
	if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
		if (withref != 0) {
			mount_drop(mp, 1);
		}
		mount_unlock(mp);
		return EBUSY;
	}

	if (flags & MNT_FORCE) {
		forcedunmount = 1;
		mp->mnt_lflag |= MNT_LFORCE;
	}

#if CONFIG_TRIGGERS
	/* Keep a non-kernel NOBLOCK caller from hanging on remote FSes. */
	if (flags & MNT_NOBLOCK && p != kernproc) {
		pflags_save = OSBitOrAtomic(P_NOREMOTEHANG, &p->p_flag);
	}
#endif

	mp->mnt_kern_flag |= MNTK_UNMOUNT;
	mp->mnt_lflag |= MNT_LUNMOUNT;
	mp->mnt_flag &= ~MNT_ASYNC;
	/*
	 * anyone currently in the fast path that
	 * trips over the cached rootvp will be
	 * dumped out and forced into the slow path
	 * to regenerate a new cached value
	 */
	mp->mnt_realrootvp = NULLVP;
	mount_unlock(mp);

	if (forcedunmount && (flags & MNT_LNOSUB) == 0) {
		/*
		 * Force unmount any mounts in this filesystem.
		 * If any unmounts fail - just leave them dangling.
		 * Avoids recursion.
		 */
		(void) dounmount_submounts(mp, flags | MNT_LNOSUB, ctx);
	}

	/*
	 * taking the name_cache_lock exclusively will
	 * insure that everyone is out of the fast path who
	 * might be trying to use a now stale copy of
	 * vp->v_mountedhere->mnt_realrootvp
	 * bumping mount_generation causes the cached values
	 * to be invalidated
	 */
	name_cache_lock();
	mount_generation++;
	name_cache_unlock();


	lck_rw_lock_exclusive(&mp->mnt_rwlock);
	if (withref != 0) {
		mount_drop(mp, 0);
	}
	error = 0;
	if (forcedunmount == 0) {
		ubc_umount(mp); /* release cached vnodes */
		if ((mp->mnt_flag & MNT_RDONLY) == 0) {
			error = VFS_SYNC(mp, MNT_WAIT, ctx);
			if (error) {
				/* Sync failed: back out the in-progress markers. */
				mount_lock(mp);
				mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
				mp->mnt_lflag &= ~MNT_LUNMOUNT;
				mp->mnt_lflag &= ~MNT_LFORCE;
				goto out;
			}
		}
	}

	IOBSDMountChange(mp, kIOMountChangeUnmount);

#if CONFIG_TRIGGERS
	vfs_nested_trigger_unmounts(mp, flags, ctx);
	did_vflush = 1;
#endif
	if (forcedunmount) {
		lflags |= FORCECLOSE;
	}
	error = vflush(mp, NULLVP, SKIPSWAP | SKIPSYSTEM  | SKIPROOT | lflags);
	if ((forcedunmount == 0) && error) {
		/* Busy vnodes remain: back out the in-progress markers. */
		mount_lock(mp);
		mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
		mp->mnt_lflag &= ~MNT_LUNMOUNT;
		mp->mnt_lflag &= ~MNT_LFORCE;
		goto out;
	}

	/* make sure there are no one in the mount iterations or lookup */
	mount_iterdrain(mp);

	error = VFS_UNMOUNT(mp, flags, ctx);
	if (error) {
		/* Filesystem refused: reopen for iteration and back out. */
		mount_iterreset(mp);
		mount_lock(mp);
		mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
		mp->mnt_lflag &= ~MNT_LUNMOUNT;
		mp->mnt_lflag &= ~MNT_LFORCE;
		goto out;
	}

	/* increment the operations count */
	if (!error) {
		OSAddAtomic(1, &vfs_nummntops);
	}

	if (mp->mnt_devvp && mp->mnt_vtable->vfc_vfsflags & VFC_VFSLOCALARGS) {
		/* hold an io reference and drop the usecount before close */
		devvp = mp->mnt_devvp;
		vnode_getalways(devvp);
		vnode_rele(devvp);
		VNOP_CLOSE(devvp, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD | FWRITE,
		    ctx);
		vnode_clearmountedon(devvp);
		vnode_put(devvp);
	}
	lck_rw_done(&mp->mnt_rwlock);
	mount_list_remove(mp);
	lck_rw_lock_exclusive(&mp->mnt_rwlock);

	/* mark the mount point hook in the vp but not drop the ref yet */
	if ((coveredvp = mp->mnt_vnodecovered) != NULLVP) {
		/*
		 * The covered vnode needs special handling. Trying to get an
		 * iocount must not block here as this may lead to deadlocks
		 * if the Filesystem to which the covered vnode belongs is
		 * undergoing forced unmounts. Since we hold a usecount, the
		 * vnode cannot be reused (it can, however, still be terminated)
		 */
		vnode_getalways(coveredvp);
		vnode_lock_spin(coveredvp);

		mp->mnt_crossref++;
		coveredvp->v_mountedhere = (struct mount *)0;
		CLR(coveredvp->v_flag, VMOUNT | VMOUNTEDHERE);
		/* Wakeup waiter(s) waiting for in-progress mount to finish. */
		wakeup(&coveredvp->v_flag);
		vnode_unlock(coveredvp);
		vnode_put(coveredvp);
	}

	mount_list_lock();
	mp->mnt_vtable->vfc_refcount--;
	mount_list_unlock();

	cache_purgevfs(mp);     /* remove cache entries for this file sys */
	vfs_event_signal(NULL, VQ_UNMOUNT, (intptr_t)NULL);
	mount_lock(mp);
	mp->mnt_lflag |= MNT_LDEAD;

	if (mp->mnt_lflag & MNT_LWAIT) {
		/*
		 * do the wakeup here
		 * in case we block in mount_refdrain
		 * which will drop the mount lock
		 * and allow anyone blocked in vfs_busy
		 * to wakeup and see the LDEAD state
		 */
		mp->mnt_lflag &= ~MNT_LWAIT;
		wakeup((caddr_t)mp);
	}
	mount_refdrain(mp);

	/* free disk_conditioner_info structure for this mount */
	disk_conditioner_unmount(mp);

out:
	/* Reached with the mount lock held on both success and failure. */
	if (mp->mnt_lflag & MNT_LWAIT) {
		mp->mnt_lflag &= ~MNT_LWAIT;
		needwakeup = 1;
	}

#if CONFIG_TRIGGERS
	if (flags & MNT_NOBLOCK && p != kernproc) {
		// Restore P_NOREMOTEHANG bit to its previous value
		if ((pflags_save & P_NOREMOTEHANG) == 0) {
			OSBitAndAtomic(~((uint32_t) P_NOREMOTEHANG), &p->p_flag);
		}
	}

	/*
	 * Callback and context are set together under the mount lock, and
	 * never cleared, so we're safe to examine them here, drop the lock,
	 * and call out.
	 */
	if (mp->mnt_triggercallback != NULL) {
		mount_unlock(mp);
		if (error == 0) {
			mp->mnt_triggercallback(mp, VTC_RELEASE, mp->mnt_triggerdata, ctx);
		} else if (did_vflush) {
			mp->mnt_triggercallback(mp, VTC_REPLACE, mp->mnt_triggerdata, ctx);
		}
	} else {
		mount_unlock(mp);
	}
#else
	mount_unlock(mp);
#endif /* CONFIG_TRIGGERS */

	lck_rw_done(&mp->mnt_rwlock);

	if (needwakeup) {
		wakeup((caddr_t)mp);
	}

	if (!error) {
		if ((coveredvp != NULLVP)) {
			vnode_t pvp = NULLVP;

			/*
			 * The covered vnode needs special handling. Trying to
			 * get an iocount must not block here as this may lead
			 * to deadlocks if the Filesystem to which the covered
			 * vnode belongs is undergoing forced unmounts. Since we
			 * hold a usecount, the vnode cannot be reused
			 * (it can, however, still be terminated).
			 */
			vnode_getalways(coveredvp);

			mount_dropcrossref(mp, coveredvp, 0);
			/*
			 * We'll _try_ to detect if this really needs to be
			 * done. The coveredvp can only be in termination (or
			 * terminated) if the coveredvp's mount point is in a
			 * forced unmount (or has been) since we still hold the
			 * ref.
			 */
			if (!vnode_isrecycled(coveredvp)) {
				pvp = vnode_getparent(coveredvp);
#if CONFIG_TRIGGERS
				if (coveredvp->v_resolve) {
					vnode_trigger_rearm(coveredvp, ctx);
				}
#endif
			}

			vnode_rele(coveredvp);
			vnode_put(coveredvp);
			coveredvp = NULLVP;

			if (pvp) {
				lock_vnode_and_post(pvp, NOTE_WRITE);
				vnode_put(pvp);
			}
		} else if (mp->mnt_flag & MNT_ROOTFS) {
			/* Root has no covered vnode: free the mount directly. */
			if (nc_smr_enabled) {
				vfs_smr_synchronize();
			}

			mount_lock_destroy(mp);
#if CONFIG_MACF
			mac_mount_label_destroy(mp);
#endif
			zfree(mount_zone, mp);
		} else {
			panic("dounmount: no coveredvp");
		}
	}
	return error;
}
3049
/*
 * Unmount any mounts in this filesystem.
 *
 * Collects, under the mount-list lock, the fsids of all mounts that
 * transitively sit on top of 'mp', then unmounts them in reverse
 * (deepest-first) order outside the lock.  Errors are ignored.
 */
void
dounmount_submounts(struct mount *mp, int flags, vfs_context_t ctx)
{
	mount_t smp;
	fsid_t *fsids, fsid;
	int fsids_sz;
	int count = 0, i, m = 0;
	vnode_t vp;

	mount_list_lock();

	// Get an array to hold the submounts fsids.
	TAILQ_FOREACH(smp, &mountlist, mnt_list)
	count++;
	fsids_sz = count * sizeof(fsid_t);
	/* Z_NOWAIT: cannot block while holding the mount-list lock. */
	fsids = kalloc_data(fsids_sz, Z_NOWAIT);
	if (fsids == NULL) {
		mount_list_unlock();
		goto out;
	}
	fsids[0] = mp->mnt_vfsstat.f_fsid;      // Prime the pump

	/*
	 * Fill the array with submount fsids.
	 * Since mounts are always added to the tail of the mount list, the
	 * list is always in mount order.
	 * For each mount check if the mounted-on vnode belongs to a
	 * mount that's already added to our array of mounts to be unmounted.
	 */
	for (smp = TAILQ_NEXT(mp, mnt_list); smp; smp = TAILQ_NEXT(smp, mnt_list)) {
		vp = smp->mnt_vnodecovered;
		if (vp == NULL) {
			continue;
		}
		fsid = vnode_mount(vp)->mnt_vfsstat.f_fsid;     // Underlying fsid
		for (i = 0; i <= m; i++) {
			if (fsids[i].val[0] == fsid.val[0] &&
			    fsids[i].val[1] == fsid.val[1]) {
				fsids[++m] = smp->mnt_vfsstat.f_fsid;
				break;
			}
		}
	}
	mount_list_unlock();

	// Unmount the submounts in reverse order. Ignore errors.
	/* Index 0 is mp itself and is deliberately skipped. */
	for (i = m; i > 0; i--) {
		smp = mount_list_lookupby_fsid(&fsids[i], 0, 1);
		if (smp) {
			mount_ref(smp, 0);
			mount_iterdrop(smp);
			(void) dounmount(smp, flags, 1, ctx);
		}
	}
out:
	kfree_data(fsids, fsids_sz);
}
3110
/*
 * Drop one crossref on 'mp' taken while detaching it from covered
 * vnode 'dp'.  When the last crossref is gone and dp no longer points
 * at mp, the mount structure itself is freed.  'need_put' additionally
 * releases the caller's iocount on dp.
 */
void
mount_dropcrossref(mount_t mp, vnode_t dp, int need_put)
{
	/* Holdcount keeps dp's memory valid across the unlock below. */
	vnode_hold(dp);
	vnode_lock(dp);
	mp->mnt_crossref--;

	if (mp->mnt_crossref < 0) {
		panic("mount cross refs -ve");
	}

	if ((mp != dp->v_mountedhere) && (mp->mnt_crossref == 0)) {
		/* Last reference and fully detached: destroy the mount. */
		if (need_put) {
			vnode_put_locked(dp);
		}
		vnode_drop_and_unlock(dp);

		if (nc_smr_enabled) {
			vfs_smr_synchronize();
		}

		mount_lock_destroy(mp);
#if CONFIG_MACF
		mac_mount_label_destroy(mp);
#endif
		zfree(mount_zone, mp);
		return;
	}
	if (need_put) {
		vnode_put_locked(dp);
	}
	vnode_drop_and_unlock(dp);
}
3144
3145
/*
 * Sync each mounted filesystem.
 */
#if DIAGNOSTIC
int syncprt = 0;	/* when set, dump buffer statistics after sync() */
#endif

int print_vmpage_stat = 0;	/* when set, log dirty-page counts after sync() */
3154
3155 /*
3156 * sync_callback: simple wrapper that calls VFS_SYNC() on volumes
3157 * mounted read-write with the passed waitfor value.
3158 *
3159 * Parameters: mp mount-point descriptor per mounted file-system instance.
3160 * arg user argument (please see below)
3161 *
3162 * User argument is a pointer to 32 bit unsigned integer which describes the
3163 * type of waitfor value to set for calling VFS_SYNC(). If user argument is
3164 * passed as NULL, VFS_SYNC() is called with MNT_NOWAIT set as the default
3165 * waitfor value.
3166 *
3167 * Returns: VFS_RETURNED
3168 */
3169 static int
sync_callback(mount_t mp,void * arg)3170 sync_callback(mount_t mp, void *arg)
3171 {
3172 if ((mp->mnt_flag & MNT_RDONLY) == 0) {
3173 int asyncflag = mp->mnt_flag & MNT_ASYNC;
3174 unsigned waitfor = MNT_NOWAIT;
3175
3176 if (arg) {
3177 waitfor = *(uint32_t*)arg;
3178 }
3179
3180 /* Sanity check for flags - these are the only valid combinations for the flag bits*/
3181 if (waitfor != MNT_WAIT &&
3182 waitfor != (MNT_WAIT | MNT_VOLUME) &&
3183 waitfor != MNT_NOWAIT &&
3184 waitfor != (MNT_NOWAIT | MNT_VOLUME) &&
3185 waitfor != MNT_DWAIT &&
3186 waitfor != (MNT_DWAIT | MNT_VOLUME)) {
3187 panic("Passed inappropriate waitfor %u to "
3188 "sync_callback()", waitfor);
3189 }
3190
3191 mp->mnt_flag &= ~MNT_ASYNC;
3192 (void)VFS_SYNC(mp, waitfor, vfs_context_kernel());
3193 if (asyncflag) {
3194 mp->mnt_flag |= MNT_ASYNC;
3195 }
3196 }
3197
3198 return VFS_RETURNED;
3199 }
3200
3201 /* ARGSUSED */
3202 int
sync(__unused proc_t p,__unused struct sync_args * uap,__unused int32_t * retval)3203 sync(__unused proc_t p, __unused struct sync_args *uap, __unused int32_t *retval)
3204 {
3205 vfs_iterate(LK_NOWAIT, sync_callback, NULL);
3206
3207 if (print_vmpage_stat) {
3208 vm_countdirtypages();
3209 }
3210
3211 #if DIAGNOSTIC
3212 if (syncprt) {
3213 vfs_bufstats();
3214 }
3215 #endif /* DIAGNOSTIC */
3216 return 0;
3217 }
3218
/* Media classes used by sync_internal_callback() to filter mounts. */
typedef enum {
	SYNC_ALL = 0,                   /* sync every mount */
	SYNC_ONLY_RELIABLE_MEDIA = 1,   /* local, non-virtual devices only */
	SYNC_ONLY_UNRELIABLE_MEDIA = 2  /* virtual or non-local devices only */
} sync_type_t;
3224
3225 static int
sync_internal_callback(mount_t mp,void * arg)3226 sync_internal_callback(mount_t mp, void *arg)
3227 {
3228 if (arg) {
3229 int is_reliable = !(mp->mnt_kern_flag & MNTK_VIRTUALDEV) &&
3230 (mp->mnt_flag & MNT_LOCAL);
3231 sync_type_t sync_type = *((sync_type_t *)arg);
3232
3233 if ((sync_type == SYNC_ONLY_RELIABLE_MEDIA) && !is_reliable) {
3234 return VFS_RETURNED;
3235 } else if ((sync_type == SYNC_ONLY_UNRELIABLE_MEDIA) && is_reliable) {
3236 return VFS_RETURNED;
3237 }
3238 }
3239
3240 (void)sync_callback(mp, NULL);
3241
3242 return VFS_RETURNED;
3243 }
3244
/* Protected by sync_mtx_lck; holds the SYNC_THREAD_* flags below. */
int sync_thread_state = 0;
/* Maximum time, in seconds, sync_internal() waits for the sync thread. */
int sync_timeout_seconds = 5;

#define SYNC_THREAD_RUN         0x0001  /* another sync pass has been requested */
#define SYNC_THREAD_RUNNING     0x0002  /* a sync_thread() instance is alive */

#if CONFIG_PHYS_WRITE_ACCT
/* Thread currently running sync_thread(), for physical-write accounting. */
thread_t pm_sync_thread;
#endif /* CONFIG_PHYS_WRITE_ACCT */
3254
/*
 * Worker thread started by sync_internal(): drains SYNC_THREAD_RUN
 * requests, syncing reliable (local, non-virtual) media first, then
 * unreliable media, and exits when no more work is queued.
 */
static void
sync_thread(__unused void *arg, __unused wait_result_t wr)
{
	sync_type_t sync_type;
#if CONFIG_PHYS_WRITE_ACCT
	pm_sync_thread = current_thread();
#endif /* CONFIG_PHYS_WRITE_ACCT */

	lck_mtx_lock(&sync_mtx_lck);
	/* Keep draining as long as sync_internal() has queued more work. */
	while (sync_thread_state & SYNC_THREAD_RUN) {
		sync_thread_state &= ~SYNC_THREAD_RUN;
		lck_mtx_unlock(&sync_mtx_lck);

		/* Reliable media first, then virtual/non-local mounts. */
		sync_type = SYNC_ONLY_RELIABLE_MEDIA;
		vfs_iterate(LK_NOWAIT, sync_internal_callback, &sync_type);
		sync_type = SYNC_ONLY_UNRELIABLE_MEDIA;
		vfs_iterate(LK_NOWAIT, sync_internal_callback, &sync_type);

		lck_mtx_lock(&sync_mtx_lck);
	}
	/*
	 * This wakeup _has_ to be issued before the lock is released otherwise
	 * we may end up waking up a thread in sync_internal which is
	 * expecting a wakeup from a thread it just created and not from this
	 * thread which is about to exit.
	 */
	wakeup(&sync_thread_state);
	sync_thread_state &= ~SYNC_THREAD_RUNNING;
#if CONFIG_PHYS_WRITE_ACCT
	pm_sync_thread = NULL;
#endif /* CONFIG_PHYS_WRITE_ACCT */
	lck_mtx_unlock(&sync_mtx_lck);

	if (print_vmpage_stat) {
		vm_countdirtypages();
	}

#if DIAGNOSTIC
	if (syncprt) {
		vfs_bufstats();
	}
#endif /* DIAGNOSTIC */
}
3298
/* Timestamp of the last "sync timed out" log; limits it to one per 120s. */
struct timeval sync_timeout_last_print = {.tv_sec = 0, .tv_usec = 0};
3300
3301 /*
3302 * An in-kernel sync for power management to call.
3303 * This function always returns within sync_timeout seconds.
3304 */
__private_extern__ int
sync_internal(void)
{
	thread_t thd = NULL;
	int error;
	int thread_created = FALSE;
	struct timespec ts = {.tv_sec = sync_timeout_seconds, .tv_nsec = 0};

	lck_mtx_lock(&sync_mtx_lck);
	/* Queue work for the sync thread; start one if none is running. */
	sync_thread_state |= SYNC_THREAD_RUN;
	if (!(sync_thread_state & SYNC_THREAD_RUNNING)) {
		int kr;

		sync_thread_state |= SYNC_THREAD_RUNNING;
		kr = kernel_thread_start(sync_thread, NULL, &thd);
		if (kr != KERN_SUCCESS) {
			sync_thread_state &= ~SYNC_THREAD_RUNNING;
			lck_mtx_unlock(&sync_mtx_lck);
			printf("sync_thread failed\n");
			return 0;
		}
		thread_created = TRUE;
	}

	/*
	 * Wait (bounded by sync_timeout_seconds) for the sync thread to
	 * signal completion; PDROP releases sync_mtx_lck on return.
	 */
	error = msleep((caddr_t)&sync_thread_state, &sync_mtx_lck,
	    (PVFS | PDROP | PCATCH), "sync_thread", &ts);
	if (error) {
		struct timeval now;

		/* Rate-limit the timeout message to once every 120 seconds. */
		microtime(&now);
		if (now.tv_sec - sync_timeout_last_print.tv_sec > 120) {
			printf("sync timed out: %d sec\n", sync_timeout_seconds);
			sync_timeout_last_print.tv_sec = now.tv_sec;
		}
	}

	/* Drop the reference kernel_thread_start() returned on the thread. */
	if (thread_created) {
		thread_deallocate(thd);
	}

	return 0;
} /* end of sync_internal call */
3347
3348 /*
3349 * Change filesystem quotas.
3350 */
3351 #if QUOTA
/*
 * quotactl(2): manipulate filesystem quotas.
 *
 * Looks up uap->path to find the mount, takes a mount reference so the
 * mount cannot disappear while we operate on it, copies in any
 * command-specific argument, dispatches to the filesystem via
 * VFS_QUOTACTL(), and copies results back out for commands that return
 * data.
 */
int
quotactl(proc_t p, struct quotactl_args *uap, __unused int32_t *retval)
{
	struct mount *mp;
	int error, quota_cmd, quota_status = 0;
	caddr_t datap;
	size_t fnamelen;
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();
	struct dqblk my_dqblk = {};

	AUDIT_ARG(uid, uap->uid);
	AUDIT_ARG(cmd, uap->cmd);
	NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
	    uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	/* Hold a mount reference; the vnode iocount can be dropped now. */
	mp = nd.ni_vp->v_mount;
	mount_ref(mp, 0);
	vnode_put(nd.ni_vp);
	nameidone(&nd);

#if CONFIG_MACF
	error = mac_mount_check_quotactl(ctx, mp, uap->cmd, uap->uid);
	if (error != 0) {
		goto out;
	}
#endif

	/* copyin any data we will need for downstream code */
	quota_cmd = uap->cmd >> SUBCMDSHIFT;

	switch (quota_cmd) {
	case Q_QUOTAON:
		/* uap->arg specifies a file from which to take the quotas */
		fnamelen = MAXPATHLEN;
		datap = zalloc(ZV_NAMEI);
		error = copyinstr(uap->arg, datap, MAXPATHLEN, &fnamelen);
		break;
	case Q_GETQUOTA:
		/* uap->arg is a pointer to a dqblk structure. */
		datap = (caddr_t) &my_dqblk;
		break;
	case Q_SETQUOTA:
	case Q_SETUSE:
		/* uap->arg is a pointer to a dqblk structure. */
		datap = (caddr_t) &my_dqblk;
		if (proc_is64bit(p)) {
			/* 64-bit userland dqblk layout differs; munge it down. */
			struct user_dqblk my_dqblk64;
			error = copyin(uap->arg, (caddr_t)&my_dqblk64, sizeof(my_dqblk64));
			if (error == 0) {
				munge_dqblk(&my_dqblk, &my_dqblk64, FALSE);
			}
		} else {
			error = copyin(uap->arg, (caddr_t)&my_dqblk, sizeof(my_dqblk));
		}
		break;
	case Q_QUOTASTAT:
		/* uap->arg is a pointer to an integer */
		datap = (caddr_t) &quota_status;
		break;
	default:
		datap = NULL;
		break;
	} /* switch */

	/* Only dispatch to the filesystem if the copyin above succeeded. */
	if (error == 0) {
		error = VFS_QUOTACTL(mp, uap->cmd, uap->uid, datap, ctx);
	}

	switch (quota_cmd) {
	case Q_QUOTAON:
		/* Free the quota-file pathname buffer even on error. */
		if (datap != NULL) {
			zfree(ZV_NAMEI, datap);
		}
		break;
	case Q_GETQUOTA:
		/* uap->arg is a pointer to a dqblk structure we need to copy out to */
		if (error == 0) {
			if (proc_is64bit(p)) {
				struct user_dqblk my_dqblk64;

				memset(&my_dqblk64, 0, sizeof(my_dqblk64));
				munge_dqblk(&my_dqblk, &my_dqblk64, TRUE);
				error = copyout((caddr_t)&my_dqblk64, uap->arg, sizeof(my_dqblk64));
			} else {
				error = copyout(datap, uap->arg, sizeof(struct dqblk));
			}
		}
		break;
	case Q_QUOTASTAT:
		/* uap->arg is a pointer to an integer */
		if (error == 0) {
			error = copyout(datap, uap->arg, sizeof(quota_status));
		}
		break;
	default:
		break;
	} /* switch */

out:
	mount_drop(mp, 0);
	return error;
}
3458 #else
int
quotactl(__unused proc_t p, __unused struct quotactl_args *uap, __unused int32_t *retval)
{
	/* Quota support compiled out; the syscall always fails. */
	return EOPNOTSUPP;
}
3464 #endif /* QUOTA */
3465
3466 static int
statfs_internal(proc_t p,struct mount * mp,user_addr_t bufp)3467 statfs_internal(proc_t p, struct mount *mp, user_addr_t bufp)
3468 {
3469 int error;
3470 vfs_context_t ctx = vfs_context_current();
3471
3472 #if CONFIG_MACF
3473 error = mac_mount_check_stat(ctx, mp);
3474 if (error != 0) {
3475 return error;
3476 }
3477 #endif
3478
3479 error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT);
3480 if (error != 0) {
3481 return error;
3482 }
3483
3484 return munge_statfs(mp, &mp->mnt_vfsstat, bufp, NULL, IS_64BIT_PROCESS(p), TRUE);
3485 }
3486
3487 /*
3488 * Get filesystem statistics.
3489 *
3490 * Returns: 0 Success
3491 * namei:???
3492 * vfs_update_vfsstat:???
3493 * munge_statfs:EFAULT
3494 */
3495 /* ARGSUSED */
3496 int
statfs(proc_t p,struct statfs_args * uap,__unused int32_t * retval)3497 statfs(proc_t p, struct statfs_args *uap, __unused int32_t *retval)
3498 {
3499 int error;
3500 struct mount *mp;
3501 struct nameidata nd;
3502 vfs_context_t ctx = vfs_context_current();
3503 vnode_t vp;
3504
3505 NDINIT(&nd, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
3506 UIO_USERSPACE, uap->path, ctx);
3507 error = namei(&nd);
3508 if (error != 0) {
3509 return error;
3510 }
3511 vp = nd.ni_vp;
3512 mp = vp->v_mount;
3513 nameidone(&nd);
3514
3515 error = statfs_internal(p, mp, uap->buf);
3516 vnode_put(vp);
3517
3518 return error;
3519 }
3520
3521 /*
3522 * Get filesystem statistics.
3523 */
3524 /* ARGSUSED */
3525 int
fstatfs(proc_t p,struct fstatfs_args * uap,__unused int32_t * retval)3526 fstatfs(proc_t p, struct fstatfs_args *uap, __unused int32_t *retval)
3527 {
3528 int error;
3529 vnode_t vp = NULL;
3530 struct mount *mp;
3531
3532 AUDIT_ARG(fd, uap->fd);
3533
3534 if ((error = file_vnode(uap->fd, &vp)) ||
3535 (error = vnode_getwithref(vp))) {
3536 goto out;
3537 }
3538
3539 AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
3540
3541 mp = vp->v_mount;
3542 if (!mp) {
3543 error = EBADF;
3544 goto out_vnode;
3545 }
3546
3547 error = statfs_internal(p, mp, uap->buf);
3548
3549 out_vnode:
3550 vnode_put(vp);
3551
3552 out:
3553 if (vp != NULL) {
3554 file_drop(uap->fd);
3555 }
3556
3557 return error;
3558 }
3559
3560 void
vfs_get_statfs64(struct mount * mp,struct statfs64 * sfs)3561 vfs_get_statfs64(struct mount *mp, struct statfs64 *sfs)
3562 {
3563 struct vfsstatfs *vsfs = &mp->mnt_vfsstat;
3564
3565 bzero(sfs, sizeof(*sfs));
3566
3567 sfs->f_bsize = vsfs->f_bsize;
3568 sfs->f_iosize = (int32_t)vsfs->f_iosize;
3569 sfs->f_blocks = vsfs->f_blocks;
3570 sfs->f_bfree = vsfs->f_bfree;
3571 sfs->f_bavail = vsfs->f_bavail;
3572 sfs->f_files = vsfs->f_files;
3573 sfs->f_ffree = vsfs->f_ffree;
3574 sfs->f_fsid = vsfs->f_fsid;
3575 sfs->f_owner = vsfs->f_owner;
3576 sfs->f_type = mp->mnt_vtable->vfc_typenum;
3577 sfs->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
3578 sfs->f_fssubtype = vsfs->f_fssubtype;
3579 sfs->f_flags_ext = 0;
3580 if (mp->mnt_kern_flag & MNTK_SYSTEMDATA) {
3581 sfs->f_flags_ext |= MNT_EXT_ROOT_DATA_VOL;
3582 }
3583 if (mp->mnt_kern_flag & MNTK_FSKIT) {
3584 sfs->f_flags_ext |= MNT_EXT_FSKIT;
3585 }
3586 vfs_getfstypename(mp, sfs->f_fstypename, MFSTYPENAMELEN);
3587 strlcpy(&sfs->f_mntonname[0], &vsfs->f_mntonname[0], MAXPATHLEN);
3588 strlcpy(&sfs->f_mntfromname[0], &vsfs->f_mntfromname[0], MAXPATHLEN);
3589 }
3590
3591 /*
3592 * Get file system statistics in 64-bit mode
3593 */
3594 int
statfs64(__unused struct proc * p,struct statfs64_args * uap,__unused int32_t * retval)3595 statfs64(__unused struct proc *p, struct statfs64_args *uap, __unused int32_t *retval)
3596 {
3597 struct mount *mp;
3598 int error;
3599 struct nameidata *ndp;
3600 struct statfs64 *sfsp;
3601 vfs_context_t ctxp = vfs_context_current();
3602 vnode_t vp;
3603 struct {
3604 struct nameidata nd;
3605 struct statfs64 sfs;
3606 } *__nameidata_statfs64;
3607
3608 __nameidata_statfs64 = kalloc_type(typeof(*__nameidata_statfs64),
3609 Z_WAITOK);
3610 ndp = &__nameidata_statfs64->nd;
3611
3612 NDINIT(ndp, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
3613 UIO_USERSPACE, uap->path, ctxp);
3614 error = namei(ndp);
3615 if (error != 0) {
3616 goto out;
3617 }
3618 vp = ndp->ni_vp;
3619 mp = vp->v_mount;
3620 nameidone(ndp);
3621
3622 #if CONFIG_MACF
3623 error = mac_mount_check_stat(ctxp, mp);
3624 if (error != 0) {
3625 vnode_put(vp);
3626 goto out;
3627 }
3628 #endif
3629
3630 error = vfs_update_vfsstat(mp, ctxp, VFS_USER_EVENT);
3631 if (error != 0) {
3632 vnode_put(vp);
3633 goto out;
3634 }
3635
3636 sfsp = &__nameidata_statfs64->sfs;
3637 vfs_get_statfs64(mp, sfsp);
3638 if ((mp->mnt_kern_flag & MNTK_SYSTEMDATA) &&
3639 (p->p_vfs_iopolicy & P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME)) {
3640 /* This process does not want to see a seperate data volume mountpoint */
3641 strlcpy(&sfsp->f_mntonname[0], "/", sizeof("/"));
3642 }
3643 error = copyout(sfsp, uap->buf, sizeof(*sfsp));
3644 vnode_put(vp);
3645
3646 out:
3647 kfree_type(typeof(*__nameidata_statfs64), __nameidata_statfs64);
3648
3649 return error;
3650 }
3651
3652 /*
3653 * Get file system statistics in 64-bit mode
3654 */
3655 int
fstatfs64(__unused struct proc * p,struct fstatfs64_args * uap,__unused int32_t * retval)3656 fstatfs64(__unused struct proc *p, struct fstatfs64_args *uap, __unused int32_t *retval)
3657 {
3658 struct vnode *vp;
3659 struct mount *mp;
3660 struct statfs64 sfs;
3661 int error;
3662
3663 AUDIT_ARG(fd, uap->fd);
3664
3665 if ((error = file_vnode(uap->fd, &vp))) {
3666 return error;
3667 }
3668
3669 error = vnode_getwithref(vp);
3670 if (error) {
3671 file_drop(uap->fd);
3672 return error;
3673 }
3674
3675 AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
3676
3677 mp = vp->v_mount;
3678 if (!mp) {
3679 error = EBADF;
3680 goto out;
3681 }
3682
3683 #if CONFIG_MACF
3684 error = mac_mount_check_stat(vfs_context_current(), mp);
3685 if (error != 0) {
3686 goto out;
3687 }
3688 #endif
3689
3690 if ((error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)) != 0) {
3691 goto out;
3692 }
3693
3694 vfs_get_statfs64(mp, &sfs);
3695 if ((mp->mnt_kern_flag & MNTK_SYSTEMDATA) &&
3696 (p->p_vfs_iopolicy & P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME)) {
3697 /* This process does not want to see a seperate data volume mountpoint */
3698 strlcpy(&sfs.f_mntonname[0], "/", sizeof("/"));
3699 }
3700 error = copyout(&sfs, uap->buf, sizeof(sfs));
3701
3702 out:
3703 file_drop(uap->fd);
3704 vnode_put(vp);
3705
3706 return error;
3707 }
3708
/*
 * Shared iteration state for the getfsstat()-family vfs_iterate callbacks.
 */
struct getfsstat_struct {
	user_addr_t sfsp;       /* user buffer cursor; advanced per entry */
	user_addr_t *mp;        /* optional array of user MAC label pointers */
	int count;              /* number of mounts visited so far */
	int maxcount;           /* capacity of the user buffer, in entries */
	int flags;              /* caller's MNT_WAIT/MNT_NOWAIT/... flags */
	int error;              /* first error encountered, or 0 */
};
3717
3718
3719 static int
getfsstat_callback(mount_t mp,void * arg)3720 getfsstat_callback(mount_t mp, void * arg)
3721 {
3722 struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
3723 struct vfsstatfs *sp;
3724 int error, my_size;
3725 vfs_context_t ctx = vfs_context_current();
3726
3727 if (fstp->sfsp && fstp->count < fstp->maxcount) {
3728 #if CONFIG_MACF
3729 error = mac_mount_check_stat(ctx, mp);
3730 if (error != 0) {
3731 fstp->error = error;
3732 return VFS_RETURNED_DONE;
3733 }
3734 #endif
3735 sp = &mp->mnt_vfsstat;
3736 /*
3737 * If MNT_NOWAIT is specified, do not refresh the
3738 * fsstat cache. MNT_WAIT/MNT_DWAIT overrides MNT_NOWAIT.
3739 */
3740 if ((mp->mnt_lflag & MNT_LDEAD) ||
3741 (((fstp->flags & MNT_NOWAIT) == 0 || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
3742 (!(mp->mnt_lflag & MNT_LUNMOUNT)) &&
3743 (error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT)))) {
3744 KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
3745 return VFS_RETURNED;
3746 }
3747
3748 /*
3749 * Need to handle LP64 version of struct statfs
3750 */
3751 error = munge_statfs(mp, sp, fstp->sfsp, &my_size, IS_64BIT_PROCESS(vfs_context_proc(ctx)), FALSE);
3752 if (error) {
3753 fstp->error = error;
3754 return VFS_RETURNED_DONE;
3755 }
3756 fstp->sfsp += my_size;
3757
3758 if (fstp->mp) {
3759 #if CONFIG_MACF
3760 error = mac_mount_label_get(mp, *fstp->mp);
3761 if (error) {
3762 fstp->error = error;
3763 return VFS_RETURNED_DONE;
3764 }
3765 #endif
3766 fstp->mp++;
3767 }
3768 }
3769 fstp->count++;
3770 return VFS_RETURNED;
3771 }
3772
3773 /*
3774 * Get statistics on all filesystems.
3775 */
3776 int
getfsstat(__unused proc_t p,struct getfsstat_args * uap,int * retval)3777 getfsstat(__unused proc_t p, struct getfsstat_args *uap, int *retval)
3778 {
3779 struct __mac_getfsstat_args muap;
3780
3781 muap.buf = uap->buf;
3782 muap.bufsize = uap->bufsize;
3783 muap.mac = USER_ADDR_NULL;
3784 muap.macsize = 0;
3785 muap.flags = uap->flags;
3786
3787 return __mac_getfsstat(p, &muap, retval);
3788 }
3789
3790 /*
3791 * __mac_getfsstat: Get MAC-related file system statistics
3792 *
3793 * Parameters: p (ignored)
3794 * uap User argument descriptor (see below)
3795 * retval Count of file system statistics (N stats)
3796 *
3797 * Indirect: uap->bufsize Buffer size
3798 * uap->macsize MAC info size
3799 * uap->buf Buffer where information will be returned
3800 * uap->mac MAC info
3801 * uap->flags File system flags
3802 *
3803 *
3804 * Returns: 0 Success
3805 * !0 Not success
3806 *
3807 */
3808 int
__mac_getfsstat(__unused proc_t p,struct __mac_getfsstat_args * uap,int * retval)3809 __mac_getfsstat(__unused proc_t p, struct __mac_getfsstat_args *uap, int *retval)
3810 {
3811 user_addr_t sfsp;
3812 user_addr_t *mp;
3813 size_t count, maxcount, bufsize, macsize;
3814 struct getfsstat_struct fst;
3815
3816 if ((unsigned)uap->bufsize > INT_MAX || (unsigned)uap->macsize > INT_MAX) {
3817 return EINVAL;
3818 }
3819
3820 bufsize = (size_t) uap->bufsize;
3821 macsize = (size_t) uap->macsize;
3822
3823 if (IS_64BIT_PROCESS(p)) {
3824 maxcount = bufsize / sizeof(struct user64_statfs);
3825 } else {
3826 maxcount = bufsize / sizeof(struct user32_statfs);
3827 }
3828 sfsp = uap->buf;
3829 count = 0;
3830
3831 mp = NULL;
3832
3833 #if CONFIG_MACF
3834 if (uap->mac != USER_ADDR_NULL) {
3835 u_int32_t *mp0;
3836 int error;
3837 unsigned int i;
3838
3839 count = (macsize / (IS_64BIT_PROCESS(p) ? 8 : 4));
3840 if (count != maxcount) {
3841 return EINVAL;
3842 }
3843
3844 /* Copy in the array */
3845 mp0 = kalloc_data(macsize, Z_WAITOK);
3846 if (mp0 == NULL) {
3847 return ENOMEM;
3848 }
3849
3850 error = copyin(uap->mac, mp0, macsize);
3851 if (error) {
3852 kfree_data(mp0, macsize);
3853 return error;
3854 }
3855
3856 /* Normalize to an array of user_addr_t */
3857 mp = kalloc_data(count * sizeof(user_addr_t), Z_WAITOK);
3858 if (mp == NULL) {
3859 kfree_data(mp0, macsize);
3860 return ENOMEM;
3861 }
3862
3863 for (i = 0; i < count; i++) {
3864 if (IS_64BIT_PROCESS(p)) {
3865 mp[i] = ((user_addr_t *)mp0)[i];
3866 } else {
3867 mp[i] = (user_addr_t)mp0[i];
3868 }
3869 }
3870 kfree_data(mp0, macsize);
3871 }
3872 #endif
3873
3874
3875 fst.sfsp = sfsp;
3876 fst.mp = mp;
3877 fst.flags = uap->flags;
3878 fst.count = 0;
3879 fst.error = 0;
3880 fst.maxcount = (int)maxcount;
3881
3882
3883 vfs_iterate(VFS_ITERATE_NOSKIP_UNMOUNT, getfsstat_callback, &fst);
3884
3885 if (mp) {
3886 kfree_data(mp, count * sizeof(user_addr_t));
3887 }
3888
3889 if (fst.error) {
3890 KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
3891 return fst.error;
3892 }
3893
3894 if (fst.sfsp && fst.count > fst.maxcount) {
3895 *retval = fst.maxcount;
3896 } else {
3897 *retval = fst.count;
3898 }
3899 return 0;
3900 }
3901
3902 static int
getfsstat64_callback(mount_t mp,void * arg)3903 getfsstat64_callback(mount_t mp, void * arg)
3904 {
3905 struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
3906 struct vfsstatfs *sp;
3907 struct statfs64 sfs;
3908 int error;
3909
3910 if (fstp->sfsp && fstp->count < fstp->maxcount) {
3911 #if CONFIG_MACF
3912 error = mac_mount_check_stat(vfs_context_current(), mp);
3913 if (error != 0) {
3914 fstp->error = error;
3915 return VFS_RETURNED_DONE;
3916 }
3917 #endif
3918 sp = &mp->mnt_vfsstat;
3919 /*
3920 * If MNT_NOWAIT is specified, do not refresh the fsstat
3921 * cache. MNT_WAIT overrides MNT_NOWAIT.
3922 *
3923 * We treat MNT_DWAIT as MNT_WAIT for all instances of
3924 * getfsstat, since the constants are out of the same
3925 * namespace.
3926 */
3927 if ((mp->mnt_lflag & MNT_LDEAD) ||
3928 ((((fstp->flags & MNT_NOWAIT) == 0) || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
3929 (!(mp->mnt_lflag & MNT_LUNMOUNT)) &&
3930 (error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)))) {
3931 KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
3932 return VFS_RETURNED;
3933 }
3934
3935 vfs_get_statfs64(mp, &sfs);
3936 error = copyout(&sfs, fstp->sfsp, sizeof(sfs));
3937 if (error) {
3938 fstp->error = error;
3939 return VFS_RETURNED_DONE;
3940 }
3941 fstp->sfsp += sizeof(sfs);
3942 }
3943 fstp->count++;
3944 return VFS_RETURNED;
3945 }
3946
3947 /*
3948 * Get statistics on all file systems in 64 bit mode.
3949 */
3950 int
getfsstat64(__unused proc_t p,struct getfsstat64_args * uap,int * retval)3951 getfsstat64(__unused proc_t p, struct getfsstat64_args *uap, int *retval)
3952 {
3953 user_addr_t sfsp;
3954 int count, maxcount;
3955 struct getfsstat_struct fst;
3956
3957 maxcount = uap->bufsize / sizeof(struct statfs64);
3958
3959 sfsp = uap->buf;
3960 count = 0;
3961
3962 fst.sfsp = sfsp;
3963 fst.flags = uap->flags;
3964 fst.count = 0;
3965 fst.error = 0;
3966 fst.maxcount = maxcount;
3967
3968 vfs_iterate(VFS_ITERATE_NOSKIP_UNMOUNT, getfsstat64_callback, &fst);
3969
3970 if (fst.error) {
3971 KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
3972 return fst.error;
3973 }
3974
3975 if (fst.sfsp && fst.count > fst.maxcount) {
3976 *retval = fst.maxcount;
3977 } else {
3978 *retval = fst.count;
3979 }
3980
3981 return 0;
3982 }
3983
3984 /*
3985 * gets the associated vnode with the file descriptor passed.
3986 * as input
3987 *
3988 * INPUT
3989 * ctx - vfs context of caller
3990 * fd - file descriptor for which vnode is required.
3991 * vpp - Pointer to pointer to vnode to be returned.
3992 *
3993 * The vnode is returned with an iocount so any vnode obtained
3994 * by this call needs a vnode_put
3995 *
3996 */
3997 int
vnode_getfromfd(vfs_context_t ctx,int fd,vnode_t * vpp)3998 vnode_getfromfd(vfs_context_t ctx, int fd, vnode_t *vpp)
3999 {
4000 int error;
4001 vnode_t vp;
4002 struct fileproc *fp;
4003 proc_t p = vfs_context_proc(ctx);
4004
4005 *vpp = NULLVP;
4006
4007 error = fp_getfvp(p, fd, &fp, &vp);
4008 if (error) {
4009 return error;
4010 }
4011
4012 error = vnode_getwithref(vp);
4013 if (error) {
4014 (void)fp_drop(p, fd, fp, 0);
4015 return error;
4016 }
4017
4018 (void)fp_drop(p, fd, fp, 0);
4019 *vpp = vp;
4020 return error;
4021 }
4022
4023 /*
4024 * Wrapper function around namei to start lookup from a directory
4025 * specified by a file descriptor ni_dirfd.
4026 *
4027 * In addition to all the errors returned by namei, this call can
4028 * return ENOTDIR if the file descriptor does not refer to a directory.
4029 * and EBADF if the file descriptor is not valid.
4030 */
int
nameiat(struct nameidata *ndp, int dirfd)
{
	/*
	 * Only consult dirfd when the lookup is relative: not AT_FDCWD, not
	 * a continued lookup, and the caller has not already supplied a dvp.
	 */
	if ((dirfd != AT_FDCWD) &&
	    !(ndp->ni_flag & NAMEI_CONTLOOKUP) &&
	    !(ndp->ni_cnd.cn_flags & USEDVP)) {
		int error = 0;
		char c;

		/* Peek at the first byte of the path to detect absolute paths. */
		if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
			error = copyin(ndp->ni_dirp, &c, sizeof(char));
			if (error) {
				return error;
			}
		} else {
			c = *((char *)(ndp->ni_dirp));
		}

		if (c != '/') {
			vnode_t dvp_at;

			error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
			    &dvp_at);
			if (error) {
				return error;
			}

			/* dirfd must refer to a directory. */
			if (vnode_vtype(dvp_at) != VDIR) {
				vnode_put(dvp_at);
				return ENOTDIR;
			}

			/* Start the lookup from dirfd's vnode via USEDVP. */
			ndp->ni_dvp = dvp_at;
			ndp->ni_cnd.cn_flags |= USEDVP;
			error = namei(ndp);
			ndp->ni_cnd.cn_flags &= ~USEDVP;
			vnode_put(dvp_at);
			return error;
		}
	}

	/* Absolute path or AT_FDCWD: plain namei from the CWD/root. */
	return namei(ndp);
}
4074
4075 /*
4076 * Change current working directory to a given file descriptor.
4077 */
4078 /* ARGSUSED */
/*
 * Change the current working directory to the directory open at 'fd',
 * either process-wide or (per_thread) for just the calling thread.
 * With per_thread set and fd == -1, revert the thread to the
 * process-wide CWD instead.
 */
int
fchdir(proc_t p, vfs_context_t ctx, int fd, bool per_thread)
{
	vnode_t vp;
	vnode_t tdp;
	vnode_t tvp;
	struct mount *mp;
	int error, should_put = 1;

	AUDIT_ARG(fd, fd);
	if (per_thread && fd == -1) {
		/*
		 * Switching back from per-thread to per process CWD; verify we
		 * in fact have one before proceeding. The only success case
		 * for this code path is to return 0 preemptively after zapping
		 * the thread structure contents.
		 */
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = NULLVP;
			if (tvp != NULLVP) {
				vnode_rele(tvp);
				return 0;
			}
		}
		return EBADF;
	}

	if ((error = file_vnode(fd, &vp))) {
		return error;
	}
	if ((error = vnode_getwithref(vp))) {
		file_drop(fd);
		return error;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	/* The descriptor must refer to a directory. */
	if (vp->v_type != VDIR) {
		error = ENOTDIR;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_chdir(ctx, vp);
	if (error) {
		goto out;
	}
#endif
	/* The caller needs search permission on the directory. */
	error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
	if (error) {
		goto out;
	}

	/*
	 * If a filesystem is mounted here, descend to the root of the
	 * (possibly stacked) mounted filesystem(s).
	 */
	while (!error && (mp = vp->v_mountedhere) != NULL) {
		if (vfs_busy(mp, LK_NOWAIT)) {
			error = EACCES;
			goto out;
		}
		error = VFS_ROOT(mp, &tdp, ctx);
		vfs_unbusy(mp);
		if (error) {
			break;
		}
		vnode_put(vp);
		vp = tdp;
	}
	if (error) {
		goto out;
	}
	/* Trade the iocount for a long-term usecount on the new CWD. */
	if ((error = vnode_ref(vp))) {
		goto out;
	}
	vnode_put(vp);
	should_put = 0;

	if (per_thread) {
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = vp;
			OSBitOrAtomic(P_THCWD, &p->p_flag);
		} else {
			vnode_rele(vp);
			error = ENOENT;
			goto out;
		}
	} else {
		/* Swap the process CWD under the dirs and fd locks. */
		proc_dirs_lock_exclusive(p);
		proc_fdlock(p);
		tvp = p->p_fd.fd_cdir;
		p->p_fd.fd_cdir = vp;
		proc_fdunlock(p);
		proc_dirs_unlock_exclusive(p);
	}

	/* Release the usecount on the previous CWD, if any. */
	if (tvp) {
		vnode_rele(tvp);
	}

out:
	if (should_put) {
		vnode_put(vp);
	}
	file_drop(fd);

	return error;
}
4190
4191 int
sys_fchdir(proc_t p,struct fchdir_args * uap,__unused int32_t * retval)4192 sys_fchdir(proc_t p, struct fchdir_args *uap, __unused int32_t *retval)
4193 {
4194 return fchdir(p, vfs_context_current(), uap->fd, false);
4195 }
4196
4197 int
__pthread_fchdir(proc_t p,struct __pthread_fchdir_args * uap,__unused int32_t * retval)4198 __pthread_fchdir(proc_t p, struct __pthread_fchdir_args *uap, __unused int32_t *retval)
4199 {
4200 return fchdir(p, vfs_context_current(), uap->fd, true);
4201 }
4202
4203
4204 /*
4205 * Change current working directory (".").
4206 *
4207 * Returns: 0 Success
4208 * change_dir:ENOTDIR
4209 * change_dir:???
4210 * vnode_ref:ENOENT No such file or directory
4211 */
4212 /* ARGSUSED */
int
chdir_internal(proc_t p, vfs_context_t ctx, struct nameidata *ndp, int per_thread)
{
	int error;
	vnode_t tvp;

	/* Resolve the path and verify it is a searchable directory. */
	error = change_dir(ndp, ctx);
	if (error) {
		return error;
	}
	/* Take a long-term usecount before dropping the lookup iocount. */
	if ((error = vnode_ref(ndp->ni_vp))) {
		vnode_put(ndp->ni_vp);
		return error;
	}
	/*
	 * drop the iocount we picked up in change_dir
	 */
	vnode_put(ndp->ni_vp);

	if (per_thread) {
		/* Per-thread CWD: stash the vnode in the uthread. */
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = ndp->ni_vp;
			OSBitOrAtomic(P_THCWD, &p->p_flag);
		} else {
			vnode_rele(ndp->ni_vp);
			return ENOENT;
		}
	} else {
		/* Process-wide CWD: swap under the dirs and fd locks. */
		proc_dirs_lock_exclusive(p);
		proc_fdlock(p);
		tvp = p->p_fd.fd_cdir;
		p->p_fd.fd_cdir = ndp->ni_vp;
		proc_fdunlock(p);
		proc_dirs_unlock_exclusive(p);
	}

	/* Release the usecount on the previous CWD, if there was one. */
	if (tvp) {
		vnode_rele(tvp);
	}

	return 0;
}
4258
4259
4260 /*
4261 * Change current working directory (".").
4262 *
4263 * Returns: 0 Success
4264 * chdir_internal:ENOTDIR
4265 * chdir_internal:ENOENT No such file or directory
4266 * chdir_internal:???
4267 */
4268 /* ARGSUSED */
4269 static int
common_chdir(proc_t p,struct chdir_args * uap,int per_thread)4270 common_chdir(proc_t p, struct chdir_args *uap, int per_thread)
4271 {
4272 struct nameidata nd;
4273 vfs_context_t ctx = vfs_context_current();
4274
4275 NDINIT(&nd, LOOKUP, OP_CHDIR, FOLLOW | AUDITVNPATH1,
4276 UIO_USERSPACE, uap->path, ctx);
4277
4278 return chdir_internal(p, ctx, &nd, per_thread);
4279 }
4280
4281
4282 /*
4283 * chdir
4284 *
4285 * Change current working directory (".") for the entire process
4286 *
4287 * Parameters: p Process requesting the call
4288 * uap User argument descriptor (see below)
4289 * retval (ignored)
4290 *
4291 * Indirect parameters: uap->path Directory path
4292 *
4293 * Returns: 0 Success
4294 * common_chdir: ENOTDIR
4295 * common_chdir: ENOENT No such file or directory
4296 * common_chdir: ???
4297 *
4298 */
4299 int
sys_chdir(proc_t p,struct chdir_args * uap,__unused int32_t * retval)4300 sys_chdir(proc_t p, struct chdir_args *uap, __unused int32_t *retval)
4301 {
4302 return common_chdir(p, (void *)uap, 0);
4303 }
4304
4305 /*
4306 * __pthread_chdir
4307 *
4308 * Change current working directory (".") for a single thread
4309 *
4310 * Parameters: p Process requesting the call
4311 * uap User argument descriptor (see below)
4312 * retval (ignored)
4313 *
4314 * Indirect parameters: uap->path Directory path
4315 *
4316 * Returns: 0 Success
4317 * common_chdir: ENOTDIR
4318 * common_chdir: ENOENT No such file or directory
4319 * common_chdir: ???
4320 *
4321 */
4322 int
__pthread_chdir(proc_t p,struct __pthread_chdir_args * uap,__unused int32_t * retval)4323 __pthread_chdir(proc_t p, struct __pthread_chdir_args *uap, __unused int32_t *retval)
4324 {
4325 return common_chdir(p, (void *)uap, 1);
4326 }
4327
4328
4329 /*
4330 * Change notion of root (``/'') directory.
4331 */
4332 /* ARGSUSED */
int
chroot(proc_t p, struct chroot_args *uap, __unused int32_t *retval)
{
	struct filedesc *fdp = &p->p_fd;
	int error;
	struct nameidata nd;
	vnode_t tvp;
	vfs_context_t ctx = vfs_context_current();

	/* Only the superuser may change the root directory. */
	if ((error = suser(kauth_cred_get(), &p->p_acflag))) {
		return error;
	}

	NDINIT(&nd, LOOKUP, OP_CHROOT, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	error = change_dir(&nd, ctx);
	if (error) {
		return error;
	}

#if CONFIG_MACF
	error = mac_vnode_check_chroot(ctx, nd.ni_vp,
	    &nd.ni_cnd);
	if (error) {
		vnode_put(nd.ni_vp);
		return error;
	}
#endif

	/* Trade the lookup iocount for a long-term usecount. */
	if ((error = vnode_ref(nd.ni_vp))) {
		vnode_put(nd.ni_vp);
		return error;
	}
	vnode_put(nd.ni_vp);

	/*
	 * This lock provides the guarantee that as long as you hold the lock
	 * fdp->fd_rdir has a usecount on it. This is used to take an iocount
	 * on a referenced vnode in namei when determining the rootvnode for
	 * a process.
	 */
	/* needed for synchronization with lookup */
	proc_dirs_lock_exclusive(p);
	/* needed for setting the flag and other activities on the fd itself */
	proc_fdlock(p);
	tvp = fdp->fd_rdir;
	fdp->fd_rdir = nd.ni_vp;
	fdt_flag_set(fdp, FD_CHROOT);
	proc_fdunlock(p);
	proc_dirs_unlock_exclusive(p);

	/* Release the previous root directory's usecount, if any. */
	if (tvp != NULL) {
		vnode_rele(tvp);
	}

	return 0;
}
4390
/*
 * Size of the on-stack path buffers pivot_root() tries first before
 * falling back to a full MAXPATHLEN heap allocation.
 */
#define PATHSTATICBUFLEN 256
/* Entitlement required (in addition to being launchd) for pivot_root(). */
#define PIVOT_ROOT_ENTITLEMENT \
	        "com.apple.private.vfs.pivot-root"
4394
4395 #if defined(XNU_TARGET_OS_OSX)
4396 int
pivot_root(proc_t p,struct pivot_root_args * uap,__unused int * retval)4397 pivot_root(proc_t p, struct pivot_root_args *uap, __unused int *retval)
4398 {
4399 int error;
4400 char new_rootfs_path_before[PATHSTATICBUFLEN] = {0};
4401 char old_rootfs_path_after[PATHSTATICBUFLEN] = {0};
4402 char *new_rootfs_path_before_buf = NULL;
4403 char *old_rootfs_path_after_buf = NULL;
4404 char *incoming = NULL;
4405 char *outgoing = NULL;
4406 vnode_t incoming_rootvp = NULLVP;
4407 size_t bytes_copied;
4408
4409 /*
4410 * XXX : Additional restrictions needed
4411 * - perhaps callable only once.
4412 */
4413 if ((error = suser(kauth_cred_get(), &p->p_acflag))) {
4414 return error;
4415 }
4416
4417 /*
4418 * pivot_root can be executed by launchd only.
4419 * Enforce entitlement.
4420 */
4421 if ((proc_getpid(p) != 1) || !IOCurrentTaskHasEntitlement(PIVOT_ROOT_ENTITLEMENT)) {
4422 return EPERM;
4423 }
4424
4425 error = copyinstr(uap->new_rootfs_path_before, &new_rootfs_path_before[0], PATHSTATICBUFLEN, &bytes_copied);
4426 if (error == ENAMETOOLONG) {
4427 new_rootfs_path_before_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
4428 error = copyinstr(uap->new_rootfs_path_before, new_rootfs_path_before_buf, MAXPATHLEN, &bytes_copied);
4429 }
4430
4431 if (error) {
4432 goto out;
4433 }
4434
4435 error = copyinstr(uap->old_rootfs_path_after, &old_rootfs_path_after[0], PATHSTATICBUFLEN, &bytes_copied);
4436 if (error == ENAMETOOLONG) {
4437 old_rootfs_path_after_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
4438 error = copyinstr(uap->old_rootfs_path_after, old_rootfs_path_after_buf, MAXPATHLEN, &bytes_copied);
4439 }
4440 if (error) {
4441 goto out;
4442 }
4443
4444 if (new_rootfs_path_before_buf) {
4445 incoming = new_rootfs_path_before_buf;
4446 } else {
4447 incoming = &new_rootfs_path_before[0];
4448 }
4449
4450 if (old_rootfs_path_after_buf) {
4451 outgoing = old_rootfs_path_after_buf;
4452 } else {
4453 outgoing = &old_rootfs_path_after[0];
4454 }
4455
4456 /*
4457 * The proposed incoming FS MUST be authenticated (i.e. not a chunklist DMG).
4458 * Userland is not allowed to pivot to an image.
4459 */
4460 error = vnode_lookup(incoming, 0, &incoming_rootvp, vfs_context_kernel());
4461 if (error) {
4462 goto out;
4463 }
4464 error = VNOP_IOCTL(incoming_rootvp, FSIOC_KERNEL_ROOTAUTH, NULL, 0, vfs_context_kernel());
4465 if (error) {
4466 goto out;
4467 }
4468
4469 error = vfs_switch_root(incoming, outgoing, VFSSR_VIRTUALDEV_PROHIBITED);
4470
4471 out:
4472 if (incoming_rootvp != NULLVP) {
4473 vnode_put(incoming_rootvp);
4474 incoming_rootvp = NULLVP;
4475 }
4476
4477 if (old_rootfs_path_after_buf) {
4478 zfree(ZV_NAMEI, old_rootfs_path_after_buf);
4479 }
4480
4481 if (new_rootfs_path_before_buf) {
4482 zfree(ZV_NAMEI, new_rootfs_path_before_buf);
4483 }
4484
4485 return error;
4486 }
4487 #else
4488 int
pivot_root(proc_t p,__unused struct pivot_root_args * uap,int * retval)4489 pivot_root(proc_t p, __unused struct pivot_root_args *uap, int *retval)
4490 {
4491 return nosys(p, NULL, retval);
4492 }
4493 #endif /* XNU_TARGET_OS_OSX */
4494
4495 /*
4496 * Common routine for chroot and chdir.
4497 *
4498 * Returns: 0 Success
4499 * ENOTDIR Not a directory
4500 * namei:??? [anything namei can return]
4501 * vnode_authorize:??? [anything vnode_authorize can return]
4502 */
4503 static int
change_dir(struct nameidata * ndp,vfs_context_t ctx)4504 change_dir(struct nameidata *ndp, vfs_context_t ctx)
4505 {
4506 vnode_t vp;
4507 int error;
4508
4509 if ((error = namei(ndp))) {
4510 return error;
4511 }
4512 nameidone(ndp);
4513 vp = ndp->ni_vp;
4514
4515 if (vp->v_type != VDIR) {
4516 vnode_put(vp);
4517 return ENOTDIR;
4518 }
4519
4520 #if CONFIG_MACF
4521 error = mac_vnode_check_chdir(ctx, vp);
4522 if (error) {
4523 vnode_put(vp);
4524 return error;
4525 }
4526 #endif
4527
4528 error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
4529 if (error) {
4530 vnode_put(vp);
4531 return error;
4532 }
4533
4534 return error;
4535 }
4536
/*
 * Allocate the per-fd vnode data used for directory file descriptors.
 * (The original header comment said "Free" — copy/paste error; this is
 * the allocator paired with fg_vn_data_free().)
 *
 * Returns a zero-filled fd_vn_data with its mutex initialized.
 */
struct fd_vn_data *
fg_vn_data_alloc(void)
{
	struct fd_vn_data *fvdata;

	/* Allocate per fd vnode data */
	fvdata = kalloc_type(struct fd_vn_data, Z_WAITOK | Z_ZERO);
	lck_mtx_init(&fvdata->fv_lock, &fd_vn_lck_grp, &fd_vn_lck_attr);
	return fvdata;
}
4550
4551 /*
4552 * Free the vnode data (for directories) associated with the file glob.
4553 */
4554 void
fg_vn_data_free(void * fgvndata)4555 fg_vn_data_free(void *fgvndata)
4556 {
4557 struct fd_vn_data *fvdata = (struct fd_vn_data *)fgvndata;
4558
4559 kfree_data(fvdata->fv_buf, fvdata->fv_bufallocsiz);
4560 lck_mtx_destroy(&fvdata->fv_lock, &fd_vn_lck_grp);
4561 kfree_type(struct fd_vn_data, fvdata);
4562 }
4563
/*
 * Check permissions, allocate an open file structure,
 * and call the device open routine if any.
 *
 * Returns: 0 Success
 * EINVAL
 * EINTR
 * falloc:ENFILE
 * falloc:EMFILE
 * falloc:ENOMEM
 * vn_open_auth:???
 * dupfdopen:???
 * VNOP_ADVLOCK:???
 * vnode_setsize:???
 *
 * XXX Need to implement uid, gid
 */
int
open1(vfs_context_t ctx, struct nameidata *ndp, int uflags,
    struct vnode_attr *vap, fp_initfn_t fp_init, void *initarg, int32_t *retval, int authfd)
{
	proc_t p = vfs_context_proc(ctx);
	kauth_cred_t p_cred = current_cached_proc_cred(PROC_NULL);
	uthread_t uu = get_bsdthread_info(vfs_context_thread(ctx));
	struct fileproc *fp;
	vnode_t vp;
	int flags, oflags, amode;
	int type, indx, error;
	struct vfs_context context;
	vnode_t authvp = NULLVP;

	oflags = uflags;

	amode = oflags & O_ACCMODE;
	/*
	 * Because O_RDONLY is 0, it is not possible to distinguish between
	 * O_EXEC | O_RDONLY and O_EXEC, therefore FEXEC/FSEARCH can't be set together
	 * with FREAD/FWRITE.
	 */
	if ((amode == O_ACCMODE) || (amode && (oflags & O_EXEC))) {
		return EINVAL;
	}

	/* Convert open(2) flags to kernel f-flags; callers of this path may
	 * not request raw-(un)encrypted access directly. */
	flags = FFLAGS(uflags);
	CLR(flags, FENCRYPTED);
	CLR(flags, FUNENCRYPTED);

	AUDIT_ARG(fflags, oflags);
	AUDIT_ARG(mode, vap->va_mode);

	/* Reserve a descriptor slot (indx) and fileproc up front. */
	if ((error = falloc_withinit(p, p_cred, ctx, &fp, &indx, fp_init, initarg)) != 0) {
		return error;
	}
	if (flags & O_CLOEXEC) {
		fp->fp_flags |= FP_CLOEXEC;
	}
	if (flags & O_CLOFORK) {
		fp->fp_flags |= FP_CLOFORK;
	}

	/* setup state to recognize when fdesc_open was called */
	uu->uu_dupfd = -1;

	/*
	 * Disable read/write access if file is opened with O_EVTONLY and
	 * the process has requested to deny read/write access.
	 */
	if ((flags & O_EVTONLY) && proc_disallow_rw_for_o_evtonly(p)) {
		flags &= ~(FREAD | FWRITE);
	}

	/* Resolve an optional authorization fd to a vnode for vn_open_auth. */
	if (authfd != AUTH_OPEN_NOAUTHFD) {
		error = vnode_getfromfd(ctx, authfd, &authvp);
		if (error) {
			fp_free(p, indx, fp);
			return error;
		}
	}

	if ((error = vn_open_auth(ndp, &flags, vap, authvp))) {
		if (authvp != NULLVP) {
			vnode_put(authvp);
		}
		/* uu_dupfd >= 0 means /dev/fd (fdesc_open) intercepted the
		 * open: duplicate the existing descriptor instead. */
		if ((error == ENODEV || error == ENXIO) && (uu->uu_dupfd >= 0)) {
			if ((error = dupfdopen(p, indx, uu->uu_dupfd, flags, error)) == 0) {
				*retval = indx;
				return 0;
			}
		}
		if (error == ERESTART) {
			error = EINTR;
		}
		fp_free(p, indx, fp);
		return error;
	}

	if (authvp != NULLVP) {
		vnode_put(authvp);
	}

	uu->uu_dupfd = 0;
	vp = ndp->ni_vp;

	/* Wire the opened vnode into the fileglob. */
	fp->fp_glob->fg_flag = flags & (FMASK | O_EVTONLY | FENCRYPTED | FUNENCRYPTED);
	fp->fp_glob->fg_ops = &vnops;
	fp_set_data(fp, vp);

#if CONFIG_FILE_LEASES
	/*
	 * If we are creating a file or open with truncate, we need to break the
	 * lease if there is a read lease placed on the parent dir.
	 */
	if ((vnode_vtype(vp) == VREG) && (flags & (O_CREAT | O_TRUNC))) {
		vnode_breakdirlease(vp, true, oflags);
	}
	/* Now check if there is a lease placed on the file itself. */
	error = vnode_breaklease(vp, oflags, ctx);
	if (error) {
		goto bad;
	}
#endif /* CONFIG_FILE_LEASES */

	/* Apply flock-style advisory locking requested at open time. */
	if (flags & (O_EXLOCK | O_SHLOCK)) {
		struct flock lf = {
			.l_whence = SEEK_SET,
		};

		if (flags & O_EXLOCK) {
			lf.l_type = F_WRLCK;
		} else {
			lf.l_type = F_RDLCK;
		}
		type = F_FLOCK;
		if ((flags & FNONBLOCK) == 0) {
			type |= F_WAIT;
		}
#if CONFIG_MACF
		error = mac_file_check_lock(vfs_context_ucred(ctx), fp->fp_glob,
		    F_SETLK, &lf);
		if (error) {
			goto bad;
		}
#endif
		if ((error = VNOP_ADVLOCK(vp, (caddr_t)fp->fp_glob, F_SETLK, &lf, type, ctx, NULL))) {
			goto bad;
		}
		/* Remember the lock so the error path (and close) can undo it. */
		fp->fp_glob->fg_flag |= FWASLOCKED;
	}

	/* try to truncate by setting the size attribute */
	if ((flags & O_TRUNC) && ((error = vnode_setsize(vp, (off_t)0, 0, ctx)) != 0)) {
		goto bad;
	}

	/*
	 * For directories we hold some additional information in the fd.
	 */
	if (vnode_vtype(vp) == VDIR) {
		fp->fp_glob->fg_vn_data = fg_vn_data_alloc();
	} else {
		fp->fp_glob->fg_vn_data = NULL;
	}

#if CONFIG_SECLUDED_MEMORY
	/* Decide whether this regular file's cached pages may live in the
	 * secluded pool, based on writability, path, and filename. */
	if (secluded_for_filecache && vnode_vtype(vp) == VREG) {
		memory_object_control_t moc;
		const char *v_name;

		moc = ubc_getobject(vp, UBC_FLAGS_NONE);

		if (moc == MEMORY_OBJECT_CONTROL_NULL) {
			/* nothing to do... */
		} else if (fp->fp_glob->fg_flag & FWRITE) {
			/* writable -> no longer eligible for secluded pages */
			memory_object_mark_eligible_for_secluded(moc,
			    FALSE);
		} else if (secluded_for_filecache == SECLUDED_FILECACHE_APPS) {
			char pathname[32] = { 0, };
			size_t copied;
			/* XXX FBDP: better way to detect /Applications/ ? */
			if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
				(void)copyinstr(ndp->ni_dirp,
				    pathname,
				    sizeof(pathname),
				    &copied);
			} else {
				copystr(CAST_DOWN(void *, ndp->ni_dirp),
				    pathname,
				    sizeof(pathname),
				    &copied);
			}
			pathname[sizeof(pathname) - 1] = '\0';
			if (strncmp(pathname,
			    "/Applications/",
			    strlen("/Applications/")) == 0 &&
			    strncmp(pathname,
			    "/Applications/Camera.app/",
			    strlen("/Applications/Camera.app/")) != 0) {
				/*
				 * not writable
				 * AND from "/Applications/"
				 * AND not from "/Applications/Camera.app/"
				 * ==> eligible for secluded
				 */
				memory_object_mark_eligible_for_secluded(moc,
				    TRUE);
			}
		} else if (secluded_for_filecache == SECLUDED_FILECACHE_RDONLY &&
		    (v_name = vnode_getname(vp))) {
			size_t len = strlen(v_name);

			if (!strncmp(v_name, "dyld", len) ||
			    !strncmp(v_name, "launchd", len) ||
			    !strncmp(v_name, "Camera", len) ||
			    !strncmp(v_name, "SpringBoard", len) ||
			    !strncmp(v_name, "backboardd", len)) {
				/*
				 * This file matters when launching Camera:
				 * do not store its contents in the secluded
				 * pool that will be drained on Camera launch.
				 */
				memory_object_mark_eligible_for_secluded(moc,
				    FALSE);
			} else if (!strncmp(v_name, "audiomxd", len) ||
			    !strncmp(v_name, "mediaplaybackd", len)) {
				/* Media daemons: keep out of the secluded pool
				 * and flag for realtime treatment. */
				memory_object_mark_eligible_for_secluded(moc,
				    FALSE);
				memory_object_mark_for_realtime(moc,
				    true);
			} else if (!strncmp(v_name, "bluetoothd", len)) {
				/*
				 * bluetoothd might be needed for realtime audio
				 * playback.
				 */
				memory_object_mark_eligible_for_secluded(moc,
				    FALSE);
				memory_object_mark_for_realtime(moc,
				    true);
			} else {
				char pathname[64] = { 0, };
				size_t copied;
				if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
					(void)copyinstr(ndp->ni_dirp,
					    pathname,
					    sizeof(pathname),
					    &copied);
				} else {
					copystr(CAST_DOWN(void *, ndp->ni_dirp),
					    pathname,
					    sizeof(pathname),
					    &copied);
				}
				pathname[sizeof(pathname) - 1] = '\0';
				if (strncmp(pathname,
				    "/Library/Audio/Plug-Ins/",
				    strlen("/Library/Audio/Plug-Ins/")) == 0 ||
				    strncmp(pathname,
				    "/System/Library/Audio/Plug-Ins/",
				    strlen("/System/Library/Audio/Plug-Ins/")) == 0) {
					/*
					 * This may be an audio plugin required
					 * for realtime playback.
					 * ==> NOT eligible for secluded.
					 */
					memory_object_mark_eligible_for_secluded(moc,
					    FALSE);
					memory_object_mark_for_realtime(moc,
					    true);
				}
			}
			vnode_putname(v_name);
		}
	}
#endif /* CONFIG_SECLUDED_MEMORY */

	/* Drop the iocount from vn_open_auth.  The fileglob keeps vp via
	 * fp_set_data(); the vnode_istty() read below relies on the open
	 * itself still holding the vnode — NOTE(review): confirm vn_open_auth
	 * leaves a usecount in place here. */
	vnode_put(vp);

	/*
	 * The first terminal open (without a O_NOCTTY) by a session leader
	 * results in it being set as the controlling terminal.
	 */
	if (vnode_istty(vp) && !(p->p_flag & P_CONTROLT) &&
	    !(flags & O_NOCTTY)) {
		int tmp = 0;

		(void)(*fp->fp_glob->fg_ops->fo_ioctl)(fp, (int)TIOCSCTTY,
		    (caddr_t)&tmp, ctx);
	}

	/* Publish the descriptor in the fd table and drop our fp reference. */
	proc_fdlock(p);
	procfdtbl_releasefd(p, indx, NULL);

	fp_drop(p, indx, fp, 1);
	proc_fdunlock(p);

	*retval = indx;

	return 0;
bad:
	/* Failure after the vnode was opened: undo any advisory lock, close
	 * the vnode with the fileglob's credential, and free the fd slot. */
	context = *vfs_context_current();
	context.vc_ucred = fp->fp_glob->fg_cred;

	if ((fp->fp_glob->fg_flag & FWASLOCKED) &&
	    (FILEGLOB_DTYPE(fp->fp_glob) == DTYPE_VNODE)) {
		struct flock lf = {
			.l_whence = SEEK_SET,
			.l_type = F_UNLCK,
		};

		(void)VNOP_ADVLOCK(
			vp, (caddr_t)fp->fp_glob, F_UNLCK, &lf, F_FLOCK, ctx, NULL);
	}

	vn_close(vp, fp->fp_glob->fg_flag, &context);
	vnode_put(vp);
	fp_free(p, indx, fp);

	return error;
}
4883
/*
 * While most of the *at syscall handlers can call nameiat() which
 * is a wrapper around namei, the use of namei and initialisation
 * of nameidata are far removed and in different functions - namei
 * gets called in vn_open_auth for open1. So we'll just do here what
 * nameiat() does.
 */
static int
open1at(vfs_context_t ctx, struct nameidata *ndp, int uflags,
    struct vnode_attr *vap, fp_initfn_t fp_init, void *initarg, int32_t *retval,
    int dirfd, int authfd)
{
	/* dirfd only matters for relative paths, and only if the caller has
	 * not already anchored the lookup via USEDVP. */
	if ((dirfd != AT_FDCWD) && !(ndp->ni_cnd.cn_flags & USEDVP)) {
		int error;
		char c;

		/* Peek at the first byte of the path to detect absolute paths. */
		if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
			error = copyin(ndp->ni_dirp, &c, sizeof(char));
			if (error) {
				return error;
			}
		} else {
			c = *((char *)(ndp->ni_dirp));
		}

		if (c != '/') {
			vnode_t dvp_at;

			error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
			    &dvp_at);
			if (error) {
				return error;
			}

			/* dirfd must refer to a directory. */
			if (vnode_vtype(dvp_at) != VDIR) {
				vnode_put(dvp_at);
				return ENOTDIR;
			}

			/* Anchor the lookup at dirfd's vnode, then open. */
			ndp->ni_dvp = dvp_at;
			ndp->ni_cnd.cn_flags |= USEDVP;
			error = open1(ctx, ndp, uflags, vap, fp_init, initarg,
			    retval, authfd);
			vnode_put(dvp_at);
			return error;
		}
	}

	/* Absolute path, AT_FDCWD, or caller-supplied dvp: plain open1. */
	return open1(ctx, ndp, uflags, vap, fp_init, initarg, retval, authfd);
}
4934
/*
 * open_extended: open a file given a path name; with extended argument list (including extended security (ACL)).
 *
 * Parameters: p Process requesting the open
 * uap User argument descriptor (see below)
 * retval Pointer to an area to receive the
 * return calue from the system call
 *
 * Indirect: uap->path Path to open (same as 'open')
 * uap->flags Flags to open (same as 'open'
 * uap->uid UID to set, if creating
 * uap->gid GID to set, if creating
 * uap->mode File mode, if creating (same as 'open')
 * uap->xsecurity ACL to set, if creating
 *
 * Returns: 0 Success
 * !0 errno value
 *
 * Notes: The kauth_filesec_t in 'va', if any, is in host byte order.
 *
 * XXX: We should enummerate the possible errno values here, and where
 * in the code they originated.
 */
int
open_extended(proc_t p, struct open_extended_args *uap, int32_t *retval)
{
	int ciferror;
	kauth_filesec_t xsecdst;
	struct vnode_attr va;
	struct nameidata nd;
	int cmode;

	AUDIT_ARG(owner, uap->uid, uap->gid);

	/* Copy in the optional ACL before touching anything else. */
	xsecdst = NULL;
	if ((uap->xsecurity != USER_ADDR_NULL) &&
	    ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)) {
		return ciferror;
	}

	/* Build creation attributes: umask-filtered mode, optional uid/gid/ACL. */
	VATTR_INIT(&va);
	cmode = ((uap->mode & ~p->p_fd.fd_cmask) & ALLPERMS) & ~S_ISTXT;
	VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
	if (uap->uid != KAUTH_UID_NONE) {
		VATTR_SET(&va, va_uid, uap->uid);
	}
	if (uap->gid != KAUTH_GID_NONE) {
		VATTR_SET(&va, va_gid, uap->gid);
	}
	if (xsecdst != NULL) {
		VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
		va.va_vaflags |= VA_FILESEC_ACL;
	}

	NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
	    uap->path, vfs_context_current());

	ciferror = open1(vfs_context_current(), &nd, uap->flags, &va,
	    NULL, NULL, retval, AUTH_OPEN_NOAUTHFD);
	if (xsecdst != NULL) {
		kauth_filesec_free(xsecdst);
	}

	return ciferror;
}
5000
5001 /*
5002 * Go through the data-protected atomically controlled open (2)
5003 *
5004 * int open_dprotected_np(user_addr_t path, int flags, int class, int dpflags, int mode)
5005 */
5006 static int
openat_dprotected_internal(vfs_context_t ctx,user_addr_t path,int flags,int mode,int class,int dpflags,int fd,int authfd,enum uio_seg segflg,int * retval)5007 openat_dprotected_internal(vfs_context_t ctx, user_addr_t path, int flags, int mode,
5008 int class, int dpflags, int fd, int authfd, enum uio_seg segflg, int *retval)
5009 {
5010 /*
5011 * Follow the same path as normal open(2)
5012 * Look up the item if it exists, and acquire the vnode.
5013 */
5014 struct vnode_attr va;
5015 struct nameidata nd;
5016 int cmode;
5017 int error;
5018 struct filedesc *fdp = &vfs_context_proc(ctx)->p_fd;
5019
5020 VATTR_INIT(&va);
5021 /* Mask off all but regular access permissions */
5022 cmode = ((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
5023 VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
5024
5025 NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, segflg,
5026 path, ctx);
5027
5028 /*
5029 * Initialize the extra fields in vnode_attr to pass down our
5030 * extra fields.
5031 * 1. target cprotect class.
5032 * 2. set a flag to mark it as requiring open-raw-encrypted semantics.
5033 */
5034 if (flags & O_CREAT) {
5035 /* lower level kernel code validates that the class is valid before applying it. */
5036 if (class != PROTECTION_CLASS_DEFAULT) {
5037 /*
5038 * PROTECTION_CLASS_DEFAULT implies that we make the class for this
5039 * file behave the same as open (2)
5040 */
5041 VATTR_SET(&va, va_dataprotect_class, class);
5042 }
5043 }
5044
5045 if (dpflags & (O_DP_GETRAWENCRYPTED | O_DP_GETRAWUNENCRYPTED | O_DP_AUTHENTICATE)) {
5046 if (flags & (O_RDWR | O_WRONLY)) {
5047 /*
5048 * Not allowed to write raw encrypted bytes or when opening authenticated.
5049 */
5050 return EINVAL;
5051 }
5052 if (dpflags & O_DP_GETRAWENCRYPTED) {
5053 VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWENCRYPTED);
5054 }
5055 if (dpflags & O_DP_GETRAWUNENCRYPTED) {
5056 VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWUNENCRYPTED);
5057 }
5058 if (dpflags & O_DP_AUTHENTICATE) {
5059 VATTR_SET(&va, va_dataprotect_flags, VA_DP_AUTHENTICATE);
5060 }
5061 }
5062
5063 error = open1at(vfs_context_current(), &nd, flags, &va,
5064 NULL, NULL, retval, fd, authfd);
5065
5066 return error;
5067 }
5068
5069 int
openat_dprotected_np(__unused proc_t p,struct openat_dprotected_np_args * uap,int32_t * retval)5070 openat_dprotected_np(__unused proc_t p, struct openat_dprotected_np_args *uap, int32_t *retval)
5071 {
5072 if ((uap->dpflags & O_DP_AUTHENTICATE) && (uap->flags & O_CREAT)) {
5073 return EINVAL;
5074 }
5075
5076 return openat_dprotected_internal(vfs_context_current(), uap->path, uap->flags, uap->mode,
5077 uap->class, uap->dpflags, uap->fd, uap->authfd, UIO_USERSPACE, retval);
5078 }
5079
5080 int
open_dprotected_np(__unused proc_t p,struct open_dprotected_np_args * uap,int32_t * retval)5081 open_dprotected_np(__unused proc_t p, struct open_dprotected_np_args *uap, int32_t *retval)
5082 {
5083 if (uap->dpflags & O_DP_AUTHENTICATE) {
5084 return EINVAL;
5085 }
5086
5087 return openat_dprotected_internal(vfs_context_current(), uap->path, uap->flags, uap->mode,
5088 uap->class, uap->dpflags, AT_FDCWD, AUTH_OPEN_NOAUTHFD, UIO_USERSPACE, retval);
5089 }
5090
/*
 * Common implementation for open(2)/openat(2): builds the creation
 * attributes and nameidata, then defers to open1at().  The large
 * vnode_attr/nameidata pair is heap-allocated to limit kernel stack use.
 */
static int
openat_internal(vfs_context_t ctx, user_addr_t path, int flags, int mode,
    int fd, enum uio_seg segflg, int *retval)
{
	struct filedesc *fdp = &vfs_context_proc(ctx)->p_fd;
	struct {
		struct vnode_attr va;
		struct nameidata nd;
	} *__open_data;
	struct vnode_attr *vap;
	struct nameidata *ndp;
	int cmode;
	int error;

	__open_data = kalloc_type(typeof(*__open_data), Z_WAITOK);
	vap = &__open_data->va;
	ndp = &__open_data->nd;

	VATTR_INIT(vap);
	/* Mask off all but regular access permissions */
	cmode = ((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
	VATTR_SET(vap, va_mode, cmode & ACCESSPERMS);

	NDINIT(ndp, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1,
	    segflg, path, ctx);

	error = open1at(ctx, ndp, flags, vap, NULL, NULL, retval, fd, AUTH_OPEN_NOAUTHFD);

	kfree_type(typeof(*__open_data), __open_data);

	return error;
}
5123
/*
 * open(2): cancellation-point wrapper around open_nocancel().
 */
int
open(proc_t p, struct open_args *uap, int32_t *retval)
{
	/* Act on any pending pthread cancellation before doing work. */
	__pthread_testcancel(1);
	return open_nocancel(p, (struct open_nocancel_args *)uap, retval);
}
5130
5131 int
open_nocancel(__unused proc_t p,struct open_nocancel_args * uap,int32_t * retval)5132 open_nocancel(__unused proc_t p, struct open_nocancel_args *uap,
5133 int32_t *retval)
5134 {
5135 return openat_internal(vfs_context_current(), uap->path, uap->flags,
5136 uap->mode, AT_FDCWD, UIO_USERSPACE, retval);
5137 }
5138
5139 int
openat_nocancel(__unused proc_t p,struct openat_nocancel_args * uap,int32_t * retval)5140 openat_nocancel(__unused proc_t p, struct openat_nocancel_args *uap,
5141 int32_t *retval)
5142 {
5143 return openat_internal(vfs_context_current(), uap->path, uap->flags,
5144 uap->mode, uap->fd, UIO_USERSPACE, retval);
5145 }
5146
/*
 * openat(2): cancellation-point wrapper around openat_nocancel().
 */
int
openat(proc_t p, struct openat_args *uap, int32_t *retval)
{
	/* Act on any pending pthread cancellation before doing work. */
	__pthread_testcancel(1);
	return openat_nocancel(p, (struct openat_nocancel_args *)uap, retval);
}
5153
5154 #define OPEN_BY_ID_ENTITLEMENT "com.apple.private.vfs.open-by-id"
5155
5156 static boolean_t
vfs_context_can_open_by_id(vfs_context_t ctx)5157 vfs_context_can_open_by_id(vfs_context_t ctx)
5158 {
5159 if (csproc_get_platform_binary(vfs_context_proc(ctx))) {
5160 return TRUE;
5161 }
5162
5163 return IOTaskHasEntitlement(vfs_context_task(ctx),
5164 OPEN_BY_ID_ENTITLEMENT);
5165 }
5166
/*
 * openbyid_np: open a file given a file system id and a file system object id
 * the hfs file system object id is an fsobj_id_t {uint32, uint32}
 * file systems that don't support object ids it is a node id (uint64_t).
 *
 * Parameters: p Process requesting the open
 * uap User argument descriptor (see below)
 * retval Pointer to an area to receive the
 * return calue from the system call
 *
 * Indirect: uap->path Path to open (same as 'open')
 *
 * uap->fsid id of target file system
 * uap->objid id of target file system object
 * uap->flags Flags to open (same as 'open')
 *
 * Returns: 0 Success
 * !0 errno value
 *
 *
 * XXX: We should enummerate the possible errno values here, and where
 * in the code they originated.
 */
int
openbyid_np(__unused proc_t p, struct openbyid_np_args *uap, int *retval)
{
	fsid_t fsid;
	uint64_t objid;
	int error;
	char *buf = NULL;
	int buflen = MAXPATHLEN;
	int pathlen = 0;
	vfs_context_t ctx = vfs_context_current();

	/* Gated on platform-binary status or the open-by-id entitlement. */
	if (!vfs_context_can_open_by_id(ctx)) {
		return EPERM;
	}

	if ((error = copyin(uap->fsid, (caddr_t)&fsid, sizeof(fsid)))) {
		return error;
	}

	/*uap->obj is an fsobj_id_t defined as struct {uint32_t, uint32_t} */
	if ((error = copyin(uap->objid, (caddr_t)&objid, sizeof(uint64_t)))) {
		return error;
	}

	AUDIT_ARG(value32, fsid.val[0]);
	AUDIT_ARG(value64, objid);

	/*resolve path from fsis, objid*/
	/* Grow the buffer by MAXPATHLEN and retry while the path does not fit. */
	do {
		buf = kalloc_data(buflen + 1, Z_WAITOK);
		if (buf == NULL) {
			return ENOMEM;
		}

		error = fsgetpath_internal( ctx, fsid.val[0], objid, buflen,
		    buf, FSOPT_ISREALFSID, &pathlen);

		if (error) {
			kfree_data(buf, buflen + 1);
			buf = NULL;
		}
	} while (error == ENOSPC && (buflen += MAXPATHLEN));

	if (error) {
		return error;
	}

	/* NUL-terminate the resolved path and open it as a kernel-space path. */
	buf[pathlen] = 0;

	error = openat_internal(
		ctx, (user_addr_t)buf, uap->oflags, 0, AT_FDCWD, UIO_SYSSPACE, retval);

	kfree_data(buf, buflen + 1);

	return error;
}
5246
5247
/*
 * Create a special file.
 */
static int mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap,
    int fd);

/*
 * Common implementation of mknod(2)/mknodat(2): create a character or
 * block special file (FIFOs are delegated to mkfifo1).  Requires
 * superuser.
 *
 * Returns 0 on success, EEXIST if the path already exists, EINVAL for an
 * unsupported file type, or an errno from suser/nameiat/MAC/
 * vnode_authorize/vn_create.
 */
static int
mknodat_internal(proc_t p, user_addr_t upath, struct vnode_attr *vap,
    mode_t mode, int fd)
{
	vfs_context_t ctx = vfs_context_current();
	struct nameidata nd;
	vnode_t vp, dvp;
	int error;

	/* If it's a mknod() of a FIFO, call mkfifo1() instead */
	if ((mode & S_IFMT) == S_IFIFO) {
		return mkfifo1(ctx, upath, vap, fd);
	}

	AUDIT_ARG(mode, mode);
	AUDIT_ARG(value32, vap->va_rdev);

	/* Creating device nodes is restricted to the superuser. */
	if ((error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
		return error;
	}
	NDINIT(&nd, CREATE, OP_MKNOD, LOCKPARENT | AUDITVNPATH1,
	    UIO_USERSPACE, upath, ctx);
	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* A non-NULL vp means the name already exists. */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}

	/* Only character and block specials are handled here. */
	switch (mode & S_IFMT) {
	case S_IFCHR:
		VATTR_SET(vap, va_type, VCHR);
		break;
	case S_IFBLK:
		VATTR_SET(vap, va_type, VBLK);
		break;
	default:
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_create(ctx,
	    nd.ni_dvp, &nd.ni_cnd, vap);
	if (error) {
		goto out;
	}
#endif

	if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
		goto out;
	}

#if CONFIG_FILE_LEASES
	/* Adding an entry modifies the parent: break any directory lease. */
	vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

	if ((error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx)) != 0) {
		goto out;
	}

	if (vp) {
		int update_flags = 0;

		// Make sure the name & parent pointers are hooked up
		if (vp->v_name == NULL) {
			update_flags |= VNODE_UPDATE_NAME;
		}
		if (vp->v_parent == NULLVP) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		if (update_flags) {
			vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
		}

#if CONFIG_FSE
		add_fsevent(FSE_CREATE_FILE, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
#endif
	}

out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	vnode_put(dvp);

	return error;
}
5356
5357 int
mknod(proc_t p,struct mknod_args * uap,__unused int32_t * retval)5358 mknod(proc_t p, struct mknod_args *uap, __unused int32_t *retval)
5359 {
5360 struct vnode_attr va;
5361
5362 VATTR_INIT(&va);
5363 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5364 VATTR_SET(&va, va_rdev, uap->dev);
5365
5366 return mknodat_internal(p, uap->path, &va, (mode_t)uap->mode, AT_FDCWD);
5367 }
5368
5369 int
mknodat(proc_t p,struct mknodat_args * uap,__unused int32_t * retval)5370 mknodat(proc_t p, struct mknodat_args *uap, __unused int32_t *retval)
5371 {
5372 struct vnode_attr va;
5373
5374 VATTR_INIT(&va);
5375 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5376 VATTR_SET(&va, va_rdev, uap->dev);
5377
5378 return mknodat_internal(p, uap->path, &va, (mode_t)uap->mode, uap->fd);
5379 }
5380
/*
 * Create a named pipe.
 *
 * Returns: 0 Success
 * EEXIST
 * namei:???
 * vnode_authorize:???
 * vn_create:???
 */
static int
mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap, int fd)
{
	vnode_t vp, dvp;
	int error;
	struct nameidata nd;

	NDINIT(&nd, CREATE, OP_MKFIFO, LOCKPARENT | AUDITVNPATH1,
	    UIO_USERSPACE, upath, ctx);
	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* check that this is a new file and authorize addition */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}
	VATTR_SET(vap, va_type, VFIFO);

	if ((error = vn_authorize_create(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0) {
		goto out;
	}

	error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx);
out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	vnode_put(dvp);

	return error;
}
5432
5433
5434 /*
5435 * mkfifo_extended: Create a named pipe; with extended argument list (including extended security (ACL)).
5436 *
5437 * Parameters: p Process requesting the open
5438 * uap User argument descriptor (see below)
5439 * retval (Ignored)
5440 *
5441 * Indirect: uap->path Path to fifo (same as 'mkfifo')
5442 * uap->uid UID to set
5443 * uap->gid GID to set
5444 * uap->mode File mode to set (same as 'mkfifo')
5445 * uap->xsecurity ACL to set, if creating
5446 *
5447 * Returns: 0 Success
5448 * !0 errno value
5449 *
5450 * Notes: The kauth_filesec_t in 'va', if any, is in host byte order.
5451 *
5452 * XXX: We should enummerate the possible errno values here, and where
5453 * in the code they originated.
5454 */
5455 int
mkfifo_extended(proc_t p,struct mkfifo_extended_args * uap,__unused int32_t * retval)5456 mkfifo_extended(proc_t p, struct mkfifo_extended_args *uap, __unused int32_t *retval)
5457 {
5458 int ciferror;
5459 kauth_filesec_t xsecdst;
5460 struct vnode_attr va;
5461
5462 AUDIT_ARG(owner, uap->uid, uap->gid);
5463
5464 xsecdst = KAUTH_FILESEC_NONE;
5465 if (uap->xsecurity != USER_ADDR_NULL) {
5466 if ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0) {
5467 return ciferror;
5468 }
5469 }
5470
5471 VATTR_INIT(&va);
5472 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5473 if (uap->uid != KAUTH_UID_NONE) {
5474 VATTR_SET(&va, va_uid, uap->uid);
5475 }
5476 if (uap->gid != KAUTH_GID_NONE) {
5477 VATTR_SET(&va, va_gid, uap->gid);
5478 }
5479 if (xsecdst != KAUTH_FILESEC_NONE) {
5480 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
5481 va.va_vaflags |= VA_FILESEC_ACL;
5482 }
5483
5484 ciferror = mkfifo1(vfs_context_current(), uap->path, &va, AT_FDCWD);
5485
5486 if (xsecdst != KAUTH_FILESEC_NONE) {
5487 kauth_filesec_free(xsecdst);
5488 }
5489 return ciferror;
5490 }
5491
5492 /* ARGSUSED */
5493 int
mkfifo(proc_t p,struct mkfifo_args * uap,__unused int32_t * retval)5494 mkfifo(proc_t p, struct mkfifo_args *uap, __unused int32_t *retval)
5495 {
5496 struct vnode_attr va;
5497
5498 VATTR_INIT(&va);
5499 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5500
5501 return mkfifo1(vfs_context_current(), uap->path, &va, AT_FDCWD);
5502 }
5503
5504 int
mkfifoat(proc_t p,struct mkfifoat_args * uap,__unused int32_t * retval)5505 mkfifoat(proc_t p, struct mkfifoat_args *uap, __unused int32_t *retval)
5506 {
5507 struct vnode_attr va;
5508
5509 VATTR_INIT(&va);
5510 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5511
5512 return mkfifo1(vfs_context_current(), uap->path, &va, uap->fd);
5513 }
5514
5515 extern int safe_getpath_new(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path, int firmlink);
5516 extern int safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path);
5517 extern int safe_getpath_no_firmlink(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path);
5518
/*
 * safe_getpath_new: build a path string for 'dvp' (optionally with
 * 'leafname' appended), never failing outright: on lookup errors it
 * walks up toward an ancestor or the mount point so callers always
 * receive a usable (possibly truncated) path.
 *
 * Parameters:	dvp		directory vnode to resolve
 *		leafname	optional final component to append (may be NULL)
 *		path		caller-supplied output buffer
 *		_len		size of 'path' in bytes
 *		truncated_path	out: set to 1 if the result was truncated
 *				or only partially resolved
 *		firmlink	nonzero: resolve through firmlinks
 *				(vn_getpath); zero: vn_getpath_no_firmlink
 *
 * Returns the length of the string in 'path' including the NUL
 * terminator (as maintained below; see NOTE on the mount-name fallback).
 */
int
safe_getpath_new(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path, int firmlink)
{
	int ret, len = _len;

	*truncated_path = 0;

	/* First attempt: resolve dvp itself. 'len' comes back including the NUL. */
	if (firmlink) {
		ret = vn_getpath(dvp, path, &len);
	} else {
		ret = vn_getpath_no_firmlink(dvp, path, &len);
	}
	if (ret == 0 && len < (MAXPATHLEN - 1)) {
		if (leafname) {
			/* Overwrite the NUL at path[len-1] with '/' and append the leaf. */
			path[len - 1] = '/';
			len += strlcpy(&path[len], leafname, MAXPATHLEN - len) + 1;
			if (len > MAXPATHLEN) {
				char *ptr;

				// the string got truncated!
				*truncated_path = 1;
				/* Drop the partial final component so the path stays well-formed. */
				ptr = strrchr(path, '/');
				if (ptr) {
					*ptr = '\0'; // chop off the string at the last directory component
				}
				len = (int)strlen(path) + 1;
			}
		}
	} else if (ret == 0) {
		/* Resolved, but the buffer was already nearly full: mark truncated. */
		*truncated_path = 1;
	} else if (ret != 0) {
		struct vnode *mydvp = dvp;

		if (ret != ENOSPC) {
			printf("safe_getpath: failed to get the path for vp %p (%s) : err %d\n",
			    dvp, dvp->v_name ? dvp->v_name : "no-name", ret);
		}
		*truncated_path = 1;

		/*
		 * Fallback: retry with successively higher ancestors until a
		 * path fits, bottoming out at the mount point or "/".
		 */
		do {
			if (mydvp->v_parent != NULL) {
				mydvp = mydvp->v_parent;
			} else if (mydvp->v_mount) {
				/*
				 * NOTE(review): 'len' is not updated on this branch, so the
				 * value returned is whatever the last vn_getpath attempt left
				 * behind, not the mount-name length — confirm intentional.
				 */
				strlcpy(path, mydvp->v_mount->mnt_vfsstat.f_mntonname, _len);
				break;
			} else {
				// no parent and no mount point?  only thing is to punt and say "/" changed
				strlcpy(path, "/", _len);
				len = 2;
				mydvp = NULL;
			}

			if (mydvp == NULL) {
				break;
			}

			len = _len;
			if (firmlink) {
				ret = vn_getpath(mydvp, path, &len);
			} else {
				ret = vn_getpath_no_firmlink(mydvp, path, &len);
			}
		} while (ret == ENOSPC);
	}

	return len;
}
5586
int
safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path)
{
	/* Firmlink-following flavor of the common path builder. */
	return safe_getpath_new(dvp, leafname, path, _len, truncated_path, 1 /* firmlink */);
}
5592
int
safe_getpath_no_firmlink(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path)
{
	/* Non-firmlink flavor of the common path builder. */
	return safe_getpath_new(dvp, leafname, path, _len, truncated_path, 0 /* firmlink */);
}
5598
5599 /*
5600 * Make a hard file link.
5601 *
5602 * Returns: 0 Success
5603 * EPERM
5604 * EEXIST
5605 * EXDEV
5606 * namei:???
5607 * vnode_authorize:???
5608 * VNOP_LINK:???
5609 */
5610 /* ARGSUSED */
/*
 * linkat_internal: common implementation for link(2) / linkat(2).
 *
 * Looks up the link source via fd1/path (following symlinks only when
 * AT_SYMLINK_FOLLOW is set in 'flag'), then looks up the new name via
 * fd2/link, authorizes both ends, and calls VNOP_LINK.  On success,
 * emits fsevents / kauth fileop notifications as configured.
 *
 * Locking/ordering: nameidone() must run before vnode_put(dvp) — see
 * the comment at out2.
 */
static int
linkat_internal(vfs_context_t ctx, int fd1, user_addr_t path, int fd2,
    user_addr_t link, int flag, enum uio_seg segflg)
{
	vnode_t vp, pvp, dvp, lvp;
	struct nameidata nd;
	int follow;
	int error;
#if CONFIG_FSE
	fse_info finfo;
#endif
	int need_event, has_listeners, need_kpath2;
	char *target_path = NULL;
	char *no_firmlink_path = NULL;
	int truncated = 0;
	int truncated_no_firmlink_path = 0;

	vp = dvp = lvp = NULLVP;

	/* look up the object we are linking to */
	follow = (flag & AT_SYMLINK_FOLLOW) ? FOLLOW : NOFOLLOW;
	NDINIT(&nd, LOOKUP, OP_LOOKUP, AUDITVNPATH1 | follow,
	    segflg, path, ctx);

	error = nameiat(&nd, fd1);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;

	nameidone(&nd);

	/*
	 * Normally, linking to directories is not supported.
	 * However, some file systems may have limited support.
	 */
	if (vp->v_type == VDIR) {
		if (!ISSET(vp->v_mount->mnt_kern_flag, MNTK_DIR_HARDLINKS)) {
			error = EPERM;   /* POSIX */
			goto out;
		}

		/* Linking to a directory requires ownership (unless superuser). */
		if (!kauth_cred_issuser(vfs_context_ucred(ctx))) {
			struct vnode_attr dva;

			VATTR_INIT(&dva);
			VATTR_WANTED(&dva, va_uid);
			if (vnode_getattr(vp, &dva, ctx) != 0 ||
			    !VATTR_IS_SUPPORTED(&dva, va_uid) ||
			    (dva.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)))) {
				error = EACCES;
				goto out;
			}
		}
	}

	/* lookup the target node: reuse 'nd' for the CREATE-mode lookup of the new name */
#if CONFIG_TRIGGERS
	nd.ni_op = OP_LINK;
#endif
	nd.ni_cnd.cn_nameiop = CREATE;
	nd.ni_cnd.cn_flags = LOCKPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK;
	nd.ni_dirp = link;
	error = nameiat(&nd, fd2);
	if (error != 0) {
		goto out;
	}
	dvp = nd.ni_dvp;
	lvp = nd.ni_vp;

#if CONFIG_MACF
	if ((error = mac_vnode_check_link(ctx, dvp, vp, &nd.ni_cnd)) != 0) {
		goto out2;
	}
#endif

	/* or to anything that kauth doesn't want us to (eg. immutable items) */
	if ((error = vnode_authorize(vp, NULL, KAUTH_VNODE_LINKTARGET, ctx)) != 0) {
		goto out2;
	}

	/* target node must not exist */
	if (lvp != NULLVP) {
		error = EEXIST;
		goto out2;
	}
	/* cannot link across mountpoints */
	if (vnode_mount(vp) != vnode_mount(dvp)) {
		error = EXDEV;
		goto out2;
	}

	/* authorize creation of the target node */
	if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
		goto out2;
	}

#if CONFIG_FILE_LEASES
	/* Creating an entry writes the directory: break any directory lease. */
	vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

	/* and finally make the link */
	error = VNOP_LINK(vp, dvp, &nd.ni_cnd, ctx);
	if (error) {
		goto out2;
	}

#if CONFIG_MACF
	(void)mac_vnode_notify_link(ctx, vp, dvp, &nd.ni_cnd);
#endif

	/* Decide whether any post-op notification paths need to be built. */
#if CONFIG_FSE
	need_event = need_fsevent(FSE_CREATE_FILE, dvp);
#else
	need_event = 0;
#endif
	has_listeners = kauth_authorize_fileop_has_listeners();

	need_kpath2 = 0;
#if CONFIG_AUDIT
	if (AUDIT_RECORD_EXISTS()) {
		need_kpath2 = 1;
	}
#endif

	if (need_event || has_listeners || need_kpath2) {
		char *link_to_path = NULL;
		int len, link_name_len;
		int len_no_firmlink_path = 0;

		/* build the path to the new link file */
		GET_PATH(target_path);

		len = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, target_path, MAXPATHLEN, &truncated);
		if (no_firmlink_path == NULL) {
			GET_PATH(no_firmlink_path);
		}
		len_no_firmlink_path = safe_getpath_no_firmlink(dvp, nd.ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);

		AUDIT_ARG(kpath, target_path, ARG_KPATH2);

		if (has_listeners) {
			/* build the path to file we are linking to */
			GET_PATH(link_to_path);

			link_name_len = MAXPATHLEN;
			if (vn_getpath(vp, link_to_path, &link_name_len) == 0) {
				/*
				 * Call out to allow 3rd party notification of rename.
				 * Ignore result of kauth_authorize_fileop call.
				 */
				kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_LINK,
				    (uintptr_t)link_to_path,
				    (uintptr_t)target_path);
			}
			if (link_to_path != NULL) {
				RELEASE_PATH(link_to_path);
			}
		}
#if CONFIG_FSE
		if (need_event) {
			/* construct fsevent */
			if (get_fse_info(vp, &finfo, ctx) == 0) {
				if (truncated_no_firmlink_path) {
					finfo.mode |= FSE_TRUNCATED_PATH;
				}

				// build the path to the destination of the link
				add_fsevent(FSE_CREATE_FILE, ctx,
				    FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
				    FSE_ARG_FINFO, &finfo,
				    FSE_ARG_DONE);
			}

			pvp = vp->v_parent;
			// need an iocount on parent vnode in this case
			/*
			 * NOTE(review): pvp is first read without an iocount and only
			 * replaced by vnode_getparent_if_different() when it differs from
			 * dvp; the matching vnode_put below uses the same condition.
			 */
			if (pvp && pvp != dvp) {
				pvp = vnode_getparent_if_different(vp, dvp);
			}
			if (pvp) {
				add_fsevent(FSE_STAT_CHANGED, ctx,
				    FSE_ARG_VNODE, pvp, FSE_ARG_DONE);
			}
			if (pvp && pvp != dvp) {
				vnode_put(pvp);
			}
		}
#endif
	}
out2:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);
	if (target_path != NULL) {
		RELEASE_PATH(target_path);
	}
	if (no_firmlink_path != NULL) {
		RELEASE_PATH(no_firmlink_path);
		no_firmlink_path = NULL;
	}
out:
	/* Drop iocounts in reverse acquisition order; vp is always held here. */
	if (lvp) {
		vnode_put(lvp);
	}
	if (dvp) {
		vnode_put(dvp);
	}
	vnode_put(vp);
	return error;
}
5824
5825 int
link(__unused proc_t p,struct link_args * uap,__unused int32_t * retval)5826 link(__unused proc_t p, struct link_args *uap, __unused int32_t *retval)
5827 {
5828 return linkat_internal(vfs_context_current(), AT_FDCWD, uap->path,
5829 AT_FDCWD, uap->link, AT_SYMLINK_FOLLOW, UIO_USERSPACE);
5830 }
5831
5832 int
linkat(__unused proc_t p,struct linkat_args * uap,__unused int32_t * retval)5833 linkat(__unused proc_t p, struct linkat_args *uap, __unused int32_t *retval)
5834 {
5835 if (uap->flag & ~AT_SYMLINK_FOLLOW) {
5836 return EINVAL;
5837 }
5838
5839 return linkat_internal(vfs_context_current(), uap->fd1, uap->path,
5840 uap->fd2, uap->link, uap->flag, UIO_USERSPACE);
5841 }
5842
5843 /*
5844 * Make a symbolic link.
5845 *
5846 * We could add support for ACLs here too...
5847 */
5848 /* ARGSUSED */
/*
 * symlinkat_internal: common implementation for symlink(2) / symlinkat(2).
 *
 * Copies in the link contents ('path_data'), looks up the new name via
 * fd/link, authorizes creation, calls VNOP_SYMLINK, and emits the
 * FSE_CREATE_FILE event.  'segflg' selects user vs. kernel addressing
 * for both strings; for kernel-space callers path_data is used in place
 * (no copy, no free).
 */
static int
symlinkat_internal(vfs_context_t ctx, user_addr_t path_data, int fd,
    user_addr_t link, enum uio_seg segflg)
{
	struct vnode_attr va;
	char *path;
	int error;
	struct nameidata nd;
	vnode_t vp, dvp;
	size_t dummy = 0;
	proc_t p;

	error = 0;
	/* Copy in the symlink target string only for user-space callers. */
	if (UIO_SEG_IS_USER_SPACE(segflg)) {
		path = zalloc(ZV_NAMEI);
		error = copyinstr(path_data, path, MAXPATHLEN, &dummy);
	} else {
		path = (char *)path_data;
	}
	if (error) {
		goto out;
	}
	AUDIT_ARG(text, path);  /* This is the link string */

	NDINIT(&nd, CREATE, OP_SYMLINK, LOCKPARENT | AUDITVNPATH1,
	    segflg, link, ctx);

	error = nameiat(&nd, fd);
	if (error) {
		goto out;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* New symlinks get ACCESSPERMS filtered through the process umask. */
	p = vfs_context_proc(ctx);
	VATTR_INIT(&va);
	VATTR_SET(&va, va_type, VLNK);
	VATTR_SET(&va, va_mode, ACCESSPERMS & ~p->p_fd.fd_cmask);

#if CONFIG_MACF
	error = mac_vnode_check_create(ctx,
	    dvp, &nd.ni_cnd, &va);
#endif
	if (error != 0) {
		goto skipit;
	}

	/* The name must not already exist. */
	if (vp != NULL) {
		error = EEXIST;
		goto skipit;
	}

	/* authorize */
	if (error == 0) {
		error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx);
	}
	/* get default ownership, etc. */
	if (error == 0) {
		error = vnode_authattr_new(dvp, &va, 0, ctx);
	}

#if CONFIG_FILE_LEASES
	/* Creating an entry writes the directory: break any directory lease. */
	vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

	if (error == 0) {
		error = VNOP_SYMLINK(dvp, &vp, &nd.ni_cnd, &va, path, ctx);
	}

	/* do fallback attribute handling for attrs the FS didn't apply */
	if (error == 0 && vp) {
		error = vnode_setattr_fallback(vp, &va, ctx);
	}

#if CONFIG_MACF
	if (error == 0 && vp) {
		error = vnode_label(vnode_mount(vp), dvp, vp, &nd.ni_cnd, VNODE_LABEL_CREATE, ctx);
	}
#endif

	if (error == 0) {
		int update_flags = 0;

		/*check if a new vnode was created, else try to get one*/
		if (vp == NULL) {
			/* VNOP_SYMLINK didn't return the vnode; look it up explicitly. */
			nd.ni_cnd.cn_nameiop = LOOKUP;
#if CONFIG_TRIGGERS
			nd.ni_op = OP_LOOKUP;
#endif
			/*
			 * Clear all flags except HASBUF to prevent 'cn_pnbuf' buffer to be
			 * reallocated again in namei().
			 */
			nd.ni_cnd.cn_flags &= HASBUF;
			error = nameiat(&nd, fd);
			if (error) {
				goto skipit;
			}
			vp = nd.ni_vp;
		}

#if 0  /* XXX - kauth_todo - is KAUTH_FILEOP_SYMLINK needed? */
		/* call out to allow 3rd party notification of rename.
		 * Ignore result of kauth_authorize_fileop call.
		 */
		if (kauth_authorize_fileop_has_listeners() &&
		    namei(&nd) == 0) {
			char *new_link_path = NULL;
			int len;

			/* build the path to the new link file */
			new_link_path = get_pathbuff();
			len = MAXPATHLEN;
			vn_getpath(dvp, new_link_path, &len);
			if ((len + 1 + nd.ni_cnd.cn_namelen + 1) < MAXPATHLEN) {
				new_link_path[len - 1] = '/';
				strlcpy(&new_link_path[len], nd.ni_cnd.cn_nameptr, MAXPATHLEN - len);
			}

			kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_SYMLINK,
			    (uintptr_t)path, (uintptr_t)new_link_path);
			if (new_link_path != NULL) {
				release_pathbuff(new_link_path);
			}
		}
#endif
		// Make sure the name & parent pointers are hooked up
		if (vp->v_name == NULL) {
			update_flags |= VNODE_UPDATE_NAME;
		}
		if (vp->v_parent == NULLVP) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		if (update_flags) {
			vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
		}

#if CONFIG_FSE
		add_fsevent(FSE_CREATE_FILE, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
#endif
	}

skipit:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	vnode_put(dvp);
out:
	/* Free the copied-in string only if we allocated it above. */
	if (path && (path != (char *)path_data)) {
		zfree(ZV_NAMEI, path);
	}

	return error;
}
6012
6013 int
symlink(__unused proc_t p,struct symlink_args * uap,__unused int32_t * retval)6014 symlink(__unused proc_t p, struct symlink_args *uap, __unused int32_t *retval)
6015 {
6016 return symlinkat_internal(vfs_context_current(), uap->path, AT_FDCWD,
6017 uap->link, UIO_USERSPACE);
6018 }
6019
6020 int
symlinkat(__unused proc_t p,struct symlinkat_args * uap,__unused int32_t * retval)6021 symlinkat(__unused proc_t p, struct symlinkat_args *uap,
6022 __unused int32_t *retval)
6023 {
6024 return symlinkat_internal(vfs_context_current(), uap->path1, uap->fd,
6025 uap->path2, UIO_USERSPACE);
6026 }
6027
6028 /*
6029 * Delete a whiteout from the filesystem.
6030 * No longer supported.
6031 */
6032 int
undelete(__unused proc_t p,__unused struct undelete_args * uap,__unused int32_t * retval)6033 undelete(__unused proc_t p, __unused struct undelete_args *uap, __unused int32_t *retval)
6034 {
6035 return ENOTSUP;
6036 }
6037
6038 /*
6039 * Delete a name from the filesystem.
6040 */
6041 /* ARGSUSED */
/*
 * unlinkat_internal: common implementation for unlink(2) / unlinkat(2) /
 * delete(2) / unlink1().
 *
 * Looks up the target (relative to 'start_dvp' if given, else 'fd'),
 * authorizes and performs the removal via vn_remove(), and emits
 * fsevents / kauth / MAC notifications.  Supports filesystems with
 * compound-remove VNOPs (lookup deferred into the VNOP, EKEEPLOOKING
 * redrive) and retries bounded times on racy ENOENT.
 *
 * unlink_flags: VNODE_REMOVE_* modifiers (NODELETEBUSY,
 * SKIP_NAMESPACE_EVENT, NOFOLLOW_ANY, NO_AUDIT_PATH, ...).
 */
static int
unlinkat_internal(vfs_context_t ctx, int fd, vnode_t start_dvp,
    user_addr_t path_arg, enum uio_seg segflg, int unlink_flags)
{
	/* Heap-allocate the large lookup/notification state to keep the kernel stack small. */
	struct {
		struct nameidata nd;
#if CONFIG_FSE
		struct vnode_attr va;
		fse_info finfo;
#endif
	} *__unlink_data;
	struct nameidata *ndp;
	vnode_t vp, dvp;
	int error;
	struct componentname *cnp;
	char *path = NULL;
	char *no_firmlink_path = NULL;
	int len_path = 0;
	int len_no_firmlink_path = 0;
	int flags;
	int need_event;
	int has_listeners;
	int truncated_path;
	int truncated_no_firmlink_path;
	int batched;
	struct vnode_attr *vap;
	int do_retry;
	int retry_count = 0;
	int cn_flags;
	int nofollow_any = 0;

	/* Translate caller flags into namei componentname flags. */
	cn_flags = LOCKPARENT;
	if (!(unlink_flags & VNODE_REMOVE_NO_AUDIT_PATH)) {
		cn_flags |= AUDITVNPATH1;
	}
	if (unlink_flags & VNODE_REMOVE_NOFOLLOW_ANY) {
		nofollow_any = NAMEI_NOFOLLOW_ANY;
		unlink_flags &= ~VNODE_REMOVE_NOFOLLOW_ANY;
	}
	/* If a starting dvp is passed, it trumps any fd passed. */
	if (start_dvp) {
		cn_flags |= USEDVP;
	}

#if NAMEDRSRCFORK
	/* unlink or delete is allowed on rsrc forks and named streams */
	cn_flags |= CN_ALLOWRSRCFORK;
#endif

	__unlink_data = kalloc_type(typeof(*__unlink_data), Z_WAITOK);
	ndp = &__unlink_data->nd;
#if CONFIG_FSE
	fse_info *finfop = &__unlink_data->finfo;
#endif

retry:
	/* Reset per-attempt state; we may come back here after a racy ENOENT. */
	do_retry = 0;
	flags = 0;
	need_event = 0;
	has_listeners = 0;
	truncated_path = 0;
	truncated_no_firmlink_path = 0;
	vap = NULL;

	NDINIT(ndp, DELETE, OP_UNLINK, cn_flags, segflg, path_arg, ctx);

	ndp->ni_dvp = start_dvp;
	ndp->ni_flag |= NAMEI_COMPOUNDREMOVE | nofollow_any;
	cnp = &ndp->ni_cnd;

continue_lookup:
	error = nameiat(ndp, fd);
	if (error) {
		goto early_out;
	}

	dvp = ndp->ni_dvp;
	vp = ndp->ni_vp;

	/* With Carbon delete semantics, busy files cannot be deleted */
	if (unlink_flags & VNODE_REMOVE_NODELETEBUSY) {
		flags |= VNODE_REMOVE_NODELETEBUSY;
	}

	/* Skip any potential upcalls if told to. */
	if (unlink_flags & VNODE_REMOVE_SKIP_NAMESPACE_EVENT) {
		flags |= VNODE_REMOVE_SKIP_NAMESPACE_EVENT;
	}

	if (vp) {
		batched = vnode_compound_remove_available(vp);
		/*
		 * The root of a mounted filesystem cannot be deleted.
		 */
		if ((vp->v_flag & VROOT) || (dvp->v_mount != vp->v_mount)) {
			error = EBUSY;
			goto out;
		}

#if DEVELOPMENT || DEBUG
		/*
		 * XXX VSWAP: Check for entitlements or special flag here
		 * so we can restrict access appropriately.
		 */
#else /* DEVELOPMENT || DEBUG */

		/* Swap files may only be removed by the kernel itself. */
		if (vnode_isswap(vp) && (ctx != vfs_context_kernel())) {
			error = EPERM;
			goto out;
		}
#endif /* DEVELOPMENT || DEBUG */

		if (!batched) {
			error = vn_authorize_unlink(dvp, vp, cnp, ctx, NULL);
			if (error) {
				/* Racy hardlink lookups may return ENOENT; retry a bounded number of times. */
				if (error == ENOENT) {
					if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
						do_retry = 1;
						retry_count++;
					}
				}
				goto out;
			}
		}
	} else {
		/* No vp: the filesystem will do lookup+remove as one compound VNOP. */
		batched = 1;

		if (!vnode_compound_remove_available(dvp)) {
			panic("No vp, but no compound remove?");
		}
	}

#if CONFIG_FSE
	need_event = need_fsevent(FSE_DELETE, dvp);
	if (need_event) {
		if (!batched) {
			if ((vp->v_flag & VISHARDLINK) == 0) {
				/* XXX need to get these data in batched VNOP */
				get_fse_info(vp, finfop, ctx);
			}
		} else {
			error =
			    vfs_get_notify_attributes(&__unlink_data->va);
			if (error) {
				goto out;
			}

			vap = &__unlink_data->va;
		}
	}
#endif
	has_listeners = kauth_authorize_fileop_has_listeners();
	if (need_event || has_listeners) {
		/* Build both the regular and no-firmlink paths for notifications. */
		if (path == NULL) {
			GET_PATH(path);
		}
		len_path = safe_getpath(dvp, ndp->ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path);
		if (no_firmlink_path == NULL) {
			GET_PATH(no_firmlink_path);
		}
		len_no_firmlink_path = safe_getpath_no_firmlink(dvp, ndp->ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);
	}

#if NAMEDRSRCFORK
	if (ndp->ni_cnd.cn_flags & CN_WANTSRSRCFORK) {
		error = vnode_removenamedstream(dvp, vp, XATTR_RESOURCEFORK_NAME, 0, ctx);
	} else
#endif
	{
#if CONFIG_FILE_LEASES
		/* Removing an entry writes the directory: break any directory lease. */
		vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

		error = vn_remove(dvp, &ndp->ni_vp, ndp, flags, vap, ctx);
		vp = ndp->ni_vp;
		if (error == EKEEPLOOKING) {
			/* Compound VNOP asked us to continue the lookup from where it stopped. */
			if (!batched) {
				panic("EKEEPLOOKING, but not a filesystem that supports compound VNOPs?");
			}

			if ((ndp->ni_flag & NAMEI_CONTLOOKUP) == 0) {
				panic("EKEEPLOOKING, but continue flag not set?");
			}

			if (vnode_isdir(vp)) {
				error = EISDIR;
				goto out;
			}
			goto continue_lookup;
		} else if (error == ENOENT && batched) {
			if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
				/*
				 * For compound VNOPs, the authorization callback may
				 * return ENOENT in case of racing hardlink lookups
				 * hitting the name cache, redrive the lookup.
				 */
				do_retry = 1;
				retry_count += 1;
				goto out;
			}
		}
	}

	/*
	 * Call out to allow 3rd party notification of delete.
	 * Ignore result of kauth_authorize_fileop call.
	 */
	if (!error) {
		if (has_listeners) {
			kauth_authorize_fileop(vfs_context_ucred(ctx),
			    KAUTH_FILEOP_DELETE,
			    (uintptr_t)vp,
			    (uintptr_t)path);
		}

		if (vp->v_flag & VISHARDLINK) {
			//
			// if a hardlink gets deleted we want to blow away the
			// v_parent link because the path that got us to this
			// instance of the link is no longer valid.  this will
			// force the next call to get the path to ask the file
			// system instead of just following the v_parent link.
			//
			vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
		}

#if CONFIG_FSE
		if (need_event) {
			if (vp->v_flag & VISHARDLINK) {
				get_fse_info(vp, finfop, ctx);
			} else if (vap) {
				vnode_get_fse_info_from_vap(vp, finfop, vap);
			}
			if (truncated_path) {
				finfop->mode |= FSE_TRUNCATED_PATH;
			}
			add_fsevent(FSE_DELETE, ctx,
			    FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
			    FSE_ARG_FINFO, finfop,
			    FSE_ARG_DONE);
		}
#endif

#if CONFIG_MACF
		mac_vnode_notify_unlink(ctx, dvp, vp, cnp);
#endif
	}

out:
	if (path != NULL) {
		RELEASE_PATH(path);
		path = NULL;
	}

	if (no_firmlink_path != NULL) {
		RELEASE_PATH(no_firmlink_path);
		no_firmlink_path = NULL;
	}
#if NAMEDRSRCFORK
	/* recycle the deleted rsrc fork vnode to force a reclaim, which
	 * will cause its shadow file to go away if necessary.
	 */
	if (vp && (vnode_isnamedstream(vp)) &&
	    (vp->v_parent != NULLVP) &&
	    vnode_isshadow(vp)) {
		vnode_recycle(vp);
	}
#endif
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(ndp);
	vnode_put(dvp);
	if (vp) {
		vnode_put(vp);
	}

	if (do_retry) {
		goto retry;
	}

early_out:
	kfree_type(typeof(*__unlink_data), __unlink_data);
	return error;
}
6328
6329 int
unlink1(vfs_context_t ctx,vnode_t start_dvp,user_addr_t path_arg,enum uio_seg segflg,int unlink_flags)6330 unlink1(vfs_context_t ctx, vnode_t start_dvp, user_addr_t path_arg,
6331 enum uio_seg segflg, int unlink_flags)
6332 {
6333 return unlinkat_internal(ctx, AT_FDCWD, start_dvp, path_arg, segflg,
6334 unlink_flags);
6335 }
6336
6337 /*
6338 * Delete a name from the filesystem using Carbon semantics.
6339 */
6340 int
delete(__unused proc_t p,struct delete_args * uap,__unused int32_t * retval)6341 delete(__unused proc_t p, struct delete_args *uap, __unused int32_t *retval)
6342 {
6343 return unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
6344 uap->path, UIO_USERSPACE, VNODE_REMOVE_NODELETEBUSY);
6345 }
6346
6347 /*
6348 * Delete a name from the filesystem using POSIX semantics.
6349 */
6350 int
unlink(__unused proc_t p,struct unlink_args * uap,__unused int32_t * retval)6351 unlink(__unused proc_t p, struct unlink_args *uap, __unused int32_t *retval)
6352 {
6353 return unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
6354 uap->path, UIO_USERSPACE, 0);
6355 }
6356
6357 int
unlinkat(__unused proc_t p,struct unlinkat_args * uap,__unused int32_t * retval)6358 unlinkat(__unused proc_t p, struct unlinkat_args *uap, __unused int32_t *retval)
6359 {
6360 int unlink_flags = 0;
6361
6362 if (uap->flag & ~(AT_REMOVEDIR | AT_REMOVEDIR_DATALESS | AT_SYMLINK_NOFOLLOW_ANY)) {
6363 return EINVAL;
6364 }
6365
6366 if (uap->flag & AT_SYMLINK_NOFOLLOW_ANY) {
6367 unlink_flags |= VNODE_REMOVE_NOFOLLOW_ANY;
6368 }
6369
6370 if (uap->flag & (AT_REMOVEDIR | AT_REMOVEDIR_DATALESS)) {
6371 if (uap->flag & AT_REMOVEDIR_DATALESS) {
6372 unlink_flags |= VNODE_REMOVE_DATALESS_DIR;
6373 }
6374 return rmdirat_internal(vfs_context_current(), uap->fd,
6375 uap->path, UIO_USERSPACE, unlink_flags);
6376 } else {
6377 return unlinkat_internal(vfs_context_current(), uap->fd,
6378 NULLVP, uap->path, UIO_USERSPACE, unlink_flags);
6379 }
6380 }
6381
6382 /*
6383 * Reposition read/write file offset.
6384 */
/*
 * lseek: reposition the read/write offset of an open file.
 *
 * uap->whence selects the base (L_SET / L_INCR / L_XTND, plus
 * SEEK_HOLE / SEEK_DATA which are delegated to the filesystem via
 * VNOP_IOCTL).  On success *retval carries the new offset.
 *
 * Returns: 0 on success; ESPIPE for fifos/pipes; EINVAL for bad whence
 * or negative result on non-character devices; EOVERFLOW on wraparound;
 * or an error from the underlying vnode operations.
 */
int
lseek(proc_t p, struct lseek_args *uap, off_t *retval)
{
	struct fileproc *fp;
	vnode_t vp;
	struct vfs_context *ctx;
	off_t offset = uap->offset, file_size;
	int error;

	if ((error = fp_getfvp(p, uap->fd, &fp, &vp))) {
		/* fd does not refer to a vnode (e.g. a pipe): report ESPIPE per POSIX. */
		if (error == ENOTSUP) {
			return ESPIPE;
		}
		return error;
	}
	if (vnode_isfifo(vp)) {
		file_drop(uap->fd);
		return ESPIPE;
	}


	ctx = vfs_context_current();
#if CONFIG_MACF
	/* lseek(fd, 0, SEEK_CUR) only queries the offset; check "get" not "change". */
	if (uap->whence == L_INCR && uap->offset == 0) {
		error = mac_file_check_get_offset(vfs_context_ucred(ctx),
		    fp->fp_glob);
	} else {
		error = mac_file_check_change_offset(vfs_context_ucred(ctx),
		    fp->fp_glob);
	}
	if (error) {
		file_drop(uap->fd);
		return error;
	}
#endif
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	/* Compute the absolute target offset according to 'whence'. */
	switch (uap->whence) {
	case L_INCR:
		offset += fp->fp_glob->fg_offset;
		break;
	case L_XTND:
		if ((error = vnode_size(vp, &file_size, ctx)) != 0) {
			break;
		}
		offset += file_size;
		break;
	case L_SET:
		break;
	case SEEK_HOLE:
		error = VNOP_IOCTL(vp, FSIOC_FIOSEEKHOLE, (caddr_t)&offset, 0, ctx);
		break;
	case SEEK_DATA:
		error = VNOP_IOCTL(vp, FSIOC_FIOSEEKDATA, (caddr_t)&offset, 0, ctx);
		break;
	default:
		error = EINVAL;
	}
	if (error == 0) {
		/* A positive displacement that produced a negative offset means signed overflow. */
		if (uap->offset > 0 && offset < 0) {
			/* Incremented/relative move past max size */
			error = EOVERFLOW;
		} else {
			/*
			 * Allow negative offsets on character devices, per
			 * POSIX 1003.1-2001. Most likely for writing disk
			 * labels.
			 */
			if (offset < 0 && vp->v_type != VCHR) {
				/* Decremented/relative move before start */
				error = EINVAL;
			} else {
				/* Success */
				fp->fp_glob->fg_offset = offset;
				*retval = fp->fp_glob->fg_offset;
			}
		}
	}

	/*
	 * An lseek can affect whether data is "available to read."  Use
	 * hint of NOTE_NONE so no EVFILT_VNODE events fire
	 */
	post_event_if_success(vp, error, NOTE_NONE);
	(void)vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
6476
6477
6478 /*
6479 * Check access permissions.
6480 *
6481 * Returns: 0 Success
6482 * vnode_authorize:???
6483 */
6484 static int
access1(vnode_t vp,vnode_t dvp,int uflags,vfs_context_t ctx)6485 access1(vnode_t vp, vnode_t dvp, int uflags, vfs_context_t ctx)
6486 {
6487 kauth_action_t action;
6488 int error;
6489
6490 /*
6491 * If just the regular access bits, convert them to something
6492 * that vnode_authorize will understand.
6493 */
6494 if (!(uflags & _ACCESS_EXTENDED_MASK)) {
6495 action = 0;
6496 if (uflags & R_OK) {
6497 action |= KAUTH_VNODE_READ_DATA; /* aka KAUTH_VNODE_LIST_DIRECTORY */
6498 }
6499 if (uflags & W_OK) {
6500 if (vnode_isdir(vp)) {
6501 action |= KAUTH_VNODE_ADD_FILE |
6502 KAUTH_VNODE_ADD_SUBDIRECTORY;
6503 /* might want delete rights here too */
6504 } else {
6505 action |= KAUTH_VNODE_WRITE_DATA;
6506 }
6507 }
6508 if (uflags & X_OK) {
6509 if (vnode_isdir(vp)) {
6510 action |= KAUTH_VNODE_SEARCH;
6511 } else {
6512 action |= KAUTH_VNODE_EXECUTE;
6513 }
6514 }
6515 } else {
6516 /* take advantage of definition of uflags */
6517 action = uflags >> 8;
6518 }
6519
6520 #if CONFIG_MACF
6521 error = mac_vnode_check_access(ctx, vp, uflags);
6522 if (error) {
6523 return error;
6524 }
6525 #endif /* MAC */
6526
6527 /* action == 0 means only check for existence */
6528 if (action != 0) {
6529 error = vnode_authorize(vp, dvp, action | KAUTH_VNODE_ACCESS, ctx);
6530 } else {
6531 error = 0;
6532 }
6533
6534 return error;
6535 }
6536
6537
6538
6539 /*
6540 * access_extended: Check access permissions in bulk.
6541 *
6542 * Description: uap->entries Pointer to an array of accessx
6543 * descriptor structs, plus one or
6544 * more NULL terminated strings (see
6545 * "Notes" section below).
6546 * uap->size Size of the area pointed to by
6547 * uap->entries.
6548 * uap->results Pointer to the results array.
6549 *
6550 * Returns: 0 Success
6551 * ENOMEM Insufficient memory
6552 * EINVAL Invalid arguments
6553 * namei:EFAULT Bad address
6554 * namei:ENAMETOOLONG Filename too long
6555 * namei:ENOENT No such file or directory
6556 * namei:ELOOP Too many levels of symbolic links
6557 * namei:EBADF Bad file descriptor
6558 * namei:ENOTDIR Not a directory
6559 * namei:???
6560 * access1:
6561 *
6562 * Implicit returns:
6563 * uap->results Array contents modified
6564 *
6565 * Notes: The uap->entries are structured as an arbitrary length array
6566 * of accessx descriptors, followed by one or more NULL terminated
6567 * strings
6568 *
6569 * struct accessx_descriptor[0]
6570 * ...
6571 * struct accessx_descriptor[n]
6572 * char name_data[0];
6573 *
6574 * We determine the entry count by walking the buffer containing
6575 * the uap->entries argument descriptor. For each descriptor we
6576 * see, the valid values for the offset ad_name_offset will be
6577 * in the byte range:
6578 *
6579 * [ uap->entries + sizeof(struct accessx_descriptor) ]
6580 * to
6581 * [ uap->entries + uap->size - 2 ]
6582 *
6583 * since we must have at least one string, and the string must
6584 * be at least one character plus the NULL terminator in length.
6585 *
6586 * XXX: Need to support the check-as uid argument
6587 */
6588 int
access_extended(__unused proc_t p,struct access_extended_args * uap,__unused int32_t * retval)6589 access_extended(__unused proc_t p, struct access_extended_args *uap, __unused int32_t *retval)
6590 {
6591 struct accessx_descriptor *input = NULL;
6592 errno_t *result = NULL;
6593 errno_t error = 0;
6594 int wantdelete = 0;
6595 size_t desc_max, desc_actual = 0;
6596 unsigned int i, j;
6597 struct vfs_context context;
6598 struct nameidata nd;
6599 int niopts;
6600 vnode_t vp = NULL;
6601 vnode_t dvp = NULL;
6602 #define ACCESSX_MAX_DESCR_ON_STACK 10
6603 struct accessx_descriptor stack_input[ACCESSX_MAX_DESCR_ON_STACK];
6604
6605 context.vc_ucred = NULL;
6606
6607 /*
6608 * Validate parameters; if valid, copy the descriptor array and string
6609 * arguments into local memory. Before proceeding, the following
6610 * conditions must have been met:
6611 *
6612 * o The total size is not permitted to exceed ACCESSX_MAX_TABLESIZE
6613 * o There must be sufficient room in the request for at least one
 * descriptor and a one-byte NUL-terminated string.
6615 * o The allocation of local storage must not fail.
6616 */
6617 if (uap->size > ACCESSX_MAX_TABLESIZE) {
6618 return ENOMEM;
6619 }
6620 if (uap->size < (sizeof(struct accessx_descriptor) + 2)) {
6621 return EINVAL;
6622 }
6623 if (uap->size <= sizeof(stack_input)) {
6624 input = stack_input;
6625 } else {
6626 input = kalloc_data(uap->size, Z_WAITOK);
6627 if (input == NULL) {
6628 error = ENOMEM;
6629 goto out;
6630 }
6631 }
6632 error = copyin(uap->entries, input, uap->size);
6633 if (error) {
6634 goto out;
6635 }
6636
6637 AUDIT_ARG(opaque, input, uap->size);
6638
6639 /*
6640 * Force NUL termination of the copyin buffer to avoid nami() running
6641 * off the end. If the caller passes us bogus data, they may get a
6642 * bogus result.
6643 */
6644 ((char *)input)[uap->size - 1] = 0;
6645
6646 /*
6647 * Access is defined as checking against the process' real identity,
6648 * even if operations are checking the effective identity. This
6649 * requires that we use a local vfs context.
6650 */
6651 context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
6652 context.vc_thread = current_thread();
6653
6654 /*
6655 * Find out how many entries we have, so we can allocate the result
6656 * array by walking the list and adjusting the count downward by the
6657 * earliest string offset we see.
6658 */
6659 desc_max = (uap->size - 2) / sizeof(struct accessx_descriptor);
6660 desc_actual = desc_max;
6661 for (i = 0; i < desc_actual; i++) {
6662 /*
6663 * Take the offset to the name string for this entry and
6664 * convert to an input array index, which would be one off
6665 * the end of the array if this entry was the lowest-addressed
6666 * name string.
6667 */
6668 j = input[i].ad_name_offset / sizeof(struct accessx_descriptor);
6669
6670 /*
6671 * An offset greater than the max allowable offset is an error.
6672 * It is also an error for any valid entry to point
6673 * to a location prior to the end of the current entry, if
6674 * it's not a reference to the string of the previous entry.
6675 */
6676 if (j > desc_max || (j != 0 && j <= i)) {
6677 error = EINVAL;
6678 goto out;
6679 }
6680
6681 /* Also do not let ad_name_offset point to something beyond the size of the input */
6682 if (input[i].ad_name_offset >= uap->size) {
6683 error = EINVAL;
6684 goto out;
6685 }
6686
6687 /*
6688 * An offset of 0 means use the previous descriptor's offset;
6689 * this is used to chain multiple requests for the same file
6690 * to avoid multiple lookups.
6691 */
6692 if (j == 0) {
6693 /* This is not valid for the first entry */
6694 if (i == 0) {
6695 error = EINVAL;
6696 goto out;
6697 }
6698 continue;
6699 }
6700
6701 /*
6702 * If the offset of the string for this descriptor is before
6703 * what we believe is the current actual last descriptor,
6704 * then we need to adjust our estimate downward; this permits
6705 * the string table following the last descriptor to be out
6706 * of order relative to the descriptor list.
6707 */
6708 if (j < desc_actual) {
6709 desc_actual = j;
6710 }
6711 }
6712
6713 /*
6714 * We limit the actual number of descriptors we are willing to process
6715 * to a hard maximum of ACCESSX_MAX_DESCRIPTORS. If the number being
6716 * requested does not exceed this limit,
6717 */
6718 if (desc_actual > ACCESSX_MAX_DESCRIPTORS) {
6719 error = ENOMEM;
6720 goto out;
6721 }
6722 result = kalloc_data(desc_actual * sizeof(errno_t), Z_WAITOK | Z_ZERO);
6723 if (result == NULL) {
6724 error = ENOMEM;
6725 goto out;
6726 }
6727
6728 /*
6729 * Do the work by iterating over the descriptor entries we know to
6730 * at least appear to contain valid data.
6731 */
6732 error = 0;
6733 for (i = 0; i < desc_actual; i++) {
6734 /*
6735 * If the ad_name_offset is 0, then we use the previous
6736 * results to make the check; otherwise, we are looking up
6737 * a new file name.
6738 */
6739 if (input[i].ad_name_offset != 0) {
6740 /* discard old vnodes */
6741 if (vp) {
6742 vnode_put(vp);
6743 vp = NULL;
6744 }
6745 if (dvp) {
6746 vnode_put(dvp);
6747 dvp = NULL;
6748 }
6749
6750 /*
6751 * Scan forward in the descriptor list to see if we
6752 * need the parent vnode. We will need it if we are
6753 * deleting, since we must have rights to remove
6754 * entries in the parent directory, as well as the
6755 * rights to delete the object itself.
6756 */
6757 wantdelete = input[i].ad_flags & _DELETE_OK;
6758 for (j = i + 1; (j < desc_actual) && (input[j].ad_name_offset == 0); j++) {
6759 if (input[j].ad_flags & _DELETE_OK) {
6760 wantdelete = 1;
6761 }
6762 }
6763
6764 niopts = FOLLOW | AUDITVNPATH1;
6765
6766 /* need parent for vnode_authorize for deletion test */
6767 if (wantdelete) {
6768 niopts |= WANTPARENT;
6769 }
6770
6771 /* do the lookup */
6772 NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, UIO_SYSSPACE,
6773 CAST_USER_ADDR_T(((const char *)input) + input[i].ad_name_offset),
6774 &context);
6775 error = namei(&nd);
6776 if (!error) {
6777 vp = nd.ni_vp;
6778 if (wantdelete) {
6779 dvp = nd.ni_dvp;
6780 }
6781 }
6782 nameidone(&nd);
6783 }
6784
6785 /*
6786 * Handle lookup errors.
6787 */
6788 switch (error) {
6789 case ENOENT:
6790 case EACCES:
6791 case EPERM:
6792 case ENOTDIR:
6793 result[i] = error;
6794 break;
6795 case 0:
6796 /* run this access check */
6797 result[i] = access1(vp, dvp, input[i].ad_flags, &context);
6798 break;
6799 default:
6800 /* fatal lookup error */
6801
6802 goto out;
6803 }
6804 }
6805
6806 AUDIT_ARG(data, result, sizeof(errno_t), desc_actual);
6807
6808 /* copy out results */
6809 error = copyout(result, uap->results, desc_actual * sizeof(errno_t));
6810
6811 out:
6812 if (input && input != stack_input) {
6813 kfree_data(input, uap->size);
6814 }
6815 if (result) {
6816 kfree_data(result, desc_actual * sizeof(errno_t));
6817 }
6818 if (vp) {
6819 vnode_put(vp);
6820 }
6821 if (dvp) {
6822 vnode_put(dvp);
6823 }
6824 if (IS_VALID_CRED(context.vc_ucred)) {
6825 kauth_cred_unref(&context.vc_ucred);
6826 }
6827 return error;
6828 }
6829
6830
6831 /*
6832 * Returns: 0 Success
6833 * namei:EFAULT Bad address
6834 * namei:ENAMETOOLONG Filename too long
6835 * namei:ENOENT No such file or directory
6836 * namei:ELOOP Too many levels of symbolic links
6837 * namei:EBADF Bad file descriptor
6838 * namei:ENOTDIR Not a directory
6839 * namei:???
6840 * access1:
6841 */
/*
 * Common implementation for access(2) and faccessat(2): resolve `path'
 * relative to `fd' and check the rights in `amode' against either the
 * process' real identity (default) or its effective identity (AT_EACCESS).
 */
static int
faccessat_internal(vfs_context_t ctx, int fd, user_addr_t path, int amode,
    int flag, enum uio_seg segflg)
{
	int error;
	struct nameidata nd;
	int niopts;
	struct vfs_context context;	/* local context so we can swap in the real cred */
#if NAMEDRSRCFORK
	int is_namedstream = 0;
#endif

	/*
	 * Unless the AT_EACCESS option is used, Access is defined as checking
	 * against the process' real identity, even if operations are checking
	 * the effective identity.  So we need to tweak the credential
	 * in the context for that case.
	 */
	if (!(flag & AT_EACCESS)) {
		/* takes a reference; dropped at `out' below */
		context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
	} else {
		/* borrow the caller's cred; no extra reference taken */
		context.vc_ucred = ctx->vc_ucred;
	}
	context.vc_thread = ctx->vc_thread;


	niopts = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY) ? NOFOLLOW : FOLLOW) | AUDITVNPATH1;
	/* need parent for vnode_authorize for deletion test */
	if (amode & _DELETE_OK) {
		niopts |= WANTPARENT;
	}
	NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, segflg,
	    path, &context);
	if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}

#if NAMEDRSRCFORK
	/* access(F_OK) calls are allowed for resource forks. */
	if (amode == F_OK) {
		nd.ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
	}
#endif
	error = nameiat(&nd, fd);
	if (error) {
		goto out;
	}

#if NAMEDRSRCFORK
	/* Grab reference on the shadow stream file vnode to
	 * force an inactive on release which will mark it
	 * for recycle.
	 */
	if (vnode_isnamedstream(nd.ni_vp) &&
	    (nd.ni_vp->v_parent != NULLVP) &&
	    vnode_isshadow(nd.ni_vp)) {
		is_namedstream = 1;
		vnode_ref(nd.ni_vp);
	}
#endif

	/* perform the actual rights check against vp (and dvp for _DELETE_OK) */
	error = access1(nd.ni_vp, nd.ni_dvp, amode, &context);

#if NAMEDRSRCFORK
	if (is_namedstream) {
		vnode_rele(nd.ni_vp);
	}
#endif

	vnode_put(nd.ni_vp);
	/* the parent iocount only exists when WANTPARENT was requested above */
	if (amode & _DELETE_OK) {
		vnode_put(nd.ni_dvp);
	}
	nameidone(&nd);

out:
	if (!(flag & AT_EACCESS)) {
		kauth_cred_unref(&context.vc_ucred);
	}
	return error;
}
6923
6924 int
access(__unused proc_t p,struct access_args * uap,__unused int32_t * retval)6925 access(__unused proc_t p, struct access_args *uap, __unused int32_t *retval)
6926 {
6927 return faccessat_internal(vfs_context_current(), AT_FDCWD,
6928 uap->path, uap->flags, 0, UIO_USERSPACE);
6929 }
6930
6931 int
faccessat(__unused proc_t p,struct faccessat_args * uap,__unused int32_t * retval)6932 faccessat(__unused proc_t p, struct faccessat_args *uap,
6933 __unused int32_t *retval)
6934 {
6935 if (uap->flag & ~(AT_EACCESS | AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) {
6936 return EINVAL;
6937 }
6938
6939 return faccessat_internal(vfs_context_current(), uap->fd,
6940 uap->path, uap->amode, uap->flag, UIO_USERSPACE);
6941 }
6942
6943 /*
6944 * Returns: 0 Success
6945 * EFAULT
6946 * copyout:EFAULT
6947 * namei:???
6948 * vn_stat:???
6949 */
6950 static int
fstatat_internal(vfs_context_t ctx,user_addr_t path,user_addr_t ub,user_addr_t xsecurity,user_addr_t xsecurity_size,int isstat64,enum uio_seg segflg,int fd,int flag)6951 fstatat_internal(vfs_context_t ctx, user_addr_t path, user_addr_t ub,
6952 user_addr_t xsecurity, user_addr_t xsecurity_size, int isstat64,
6953 enum uio_seg segflg, int fd, int flag)
6954 {
6955 struct nameidata *ndp = NULL;
6956 int follow;
6957 union {
6958 struct stat sb;
6959 struct stat64 sb64;
6960 } source = {};
6961 union {
6962 struct user64_stat user64_sb;
6963 struct user32_stat user32_sb;
6964 struct user64_stat64 user64_sb64;
6965 struct user32_stat64 user32_sb64;
6966 } dest = {};
6967 caddr_t sbp;
6968 int error, my_size;
6969 kauth_filesec_t fsec = KAUTH_FILESEC_NONE;
6970 size_t xsecurity_bufsize;
6971 void * statptr;
6972 struct fileproc *fp = NULL;
6973 int needsrealdev = 0;
6974
6975 follow = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) ? NOFOLLOW : FOLLOW;
6976 ndp = kalloc_type(struct nameidata, Z_WAITOK);
6977 NDINIT(ndp, LOOKUP, OP_GETATTR, follow | AUDITVNPATH1,
6978 segflg, path, ctx);
6979 if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
6980 ndp->ni_flag |= NAMEI_NOFOLLOW_ANY;
6981 }
6982
6983 #if NAMEDRSRCFORK
6984 int is_namedstream = 0;
6985 /* stat calls are allowed for resource forks. */
6986 ndp->ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
6987 #endif
6988
6989 if (flag & AT_FDONLY) {
6990 vnode_t fvp;
6991
6992 error = fp_getfvp(vfs_context_proc(ctx), fd, &fp, &fvp);
6993 if (error) {
6994 goto out;
6995 }
6996 if ((error = vnode_getwithref(fvp))) {
6997 file_drop(fd);
6998 goto out;
6999 }
7000 ndp->ni_vp = fvp;
7001 } else {
7002 error = nameiat(ndp, fd);
7003 if (error) {
7004 goto out;
7005 }
7006 }
7007
7008 statptr = (void *)&source;
7009
7010 #if NAMEDRSRCFORK
7011 /* Grab reference on the shadow stream file vnode to
7012 * force an inactive on release which will mark it
7013 * for recycle.
7014 */
7015 if (vnode_isnamedstream(ndp->ni_vp) &&
7016 (ndp->ni_vp->v_parent != NULLVP) &&
7017 vnode_isshadow(ndp->ni_vp)) {
7018 is_namedstream = 1;
7019 vnode_ref(ndp->ni_vp);
7020 }
7021 #endif
7022
7023 needsrealdev = flag & AT_REALDEV ? 1 : 0;
7024 if (fp && (xsecurity == USER_ADDR_NULL)) {
7025 /*
7026 * If the caller has the file open, and is not
7027 * requesting extended security information, we are
7028 * going to let them get the basic stat information.
7029 */
7030 error = vn_stat_noauth(ndp->ni_vp, statptr, NULL, isstat64, needsrealdev, ctx,
7031 fp->fp_glob->fg_cred);
7032 } else {
7033 error = vn_stat(ndp->ni_vp, statptr, (xsecurity != USER_ADDR_NULL ? &fsec : NULL),
7034 isstat64, needsrealdev, ctx);
7035 }
7036
7037 #if NAMEDRSRCFORK
7038 if (is_namedstream) {
7039 vnode_rele(ndp->ni_vp);
7040 }
7041 #endif
7042 vnode_put(ndp->ni_vp);
7043 nameidone(ndp);
7044
7045 if (fp) {
7046 file_drop(fd);
7047 fp = NULL;
7048 }
7049
7050 if (error) {
7051 goto out;
7052 }
7053 /* Zap spare fields */
7054 if (isstat64 != 0) {
7055 source.sb64.st_lspare = 0;
7056 source.sb64.st_qspare[0] = 0LL;
7057 source.sb64.st_qspare[1] = 0LL;
7058 if (vfs_context_is64bit(ctx)) {
7059 munge_user64_stat64(&source.sb64, &dest.user64_sb64);
7060 my_size = sizeof(dest.user64_sb64);
7061 sbp = (caddr_t)&dest.user64_sb64;
7062 } else {
7063 munge_user32_stat64(&source.sb64, &dest.user32_sb64);
7064 my_size = sizeof(dest.user32_sb64);
7065 sbp = (caddr_t)&dest.user32_sb64;
7066 }
7067 /*
7068 * Check if we raced (post lookup) against the last unlink of a file.
7069 */
7070 if ((source.sb64.st_nlink == 0) && S_ISREG(source.sb64.st_mode)) {
7071 source.sb64.st_nlink = 1;
7072 }
7073 } else {
7074 source.sb.st_lspare = 0;
7075 source.sb.st_qspare[0] = 0LL;
7076 source.sb.st_qspare[1] = 0LL;
7077 if (vfs_context_is64bit(ctx)) {
7078 munge_user64_stat(&source.sb, &dest.user64_sb);
7079 my_size = sizeof(dest.user64_sb);
7080 sbp = (caddr_t)&dest.user64_sb;
7081 } else {
7082 munge_user32_stat(&source.sb, &dest.user32_sb);
7083 my_size = sizeof(dest.user32_sb);
7084 sbp = (caddr_t)&dest.user32_sb;
7085 }
7086
7087 /*
7088 * Check if we raced (post lookup) against the last unlink of a file.
7089 */
7090 if ((source.sb.st_nlink == 0) && S_ISREG(source.sb.st_mode)) {
7091 source.sb.st_nlink = 1;
7092 }
7093 }
7094 if ((error = copyout(sbp, ub, my_size)) != 0) {
7095 goto out;
7096 }
7097
7098 /* caller wants extended security information? */
7099 if (xsecurity != USER_ADDR_NULL) {
7100 /* did we get any? */
7101 if (fsec == KAUTH_FILESEC_NONE) {
7102 if (susize(xsecurity_size, 0) != 0) {
7103 error = EFAULT;
7104 goto out;
7105 }
7106 } else {
7107 /* find the user buffer size */
7108 xsecurity_bufsize = fusize(xsecurity_size);
7109
7110 /* copy out the actual data size */
7111 if (susize(xsecurity_size, KAUTH_FILESEC_COPYSIZE(fsec)) != 0) {
7112 error = EFAULT;
7113 goto out;
7114 }
7115
7116 /* if the caller supplied enough room, copy out to it */
7117 if (xsecurity_bufsize >= KAUTH_FILESEC_COPYSIZE(fsec)) {
7118 error = copyout(fsec, xsecurity, KAUTH_FILESEC_COPYSIZE(fsec));
7119 }
7120 }
7121 }
7122 out:
7123 if (ndp) {
7124 kfree_type(struct nameidata, ndp);
7125 }
7126 if (fsec != KAUTH_FILESEC_NONE) {
7127 kauth_filesec_free(fsec);
7128 }
7129 return error;
7130 }
7131
7132 /*
7133 * stat_extended: Get file status; with extended security (ACL).
7134 *
7135 * Parameters: p (ignored)
7136 * uap User argument descriptor (see below)
7137 * retval (ignored)
7138 *
7139 * Indirect: uap->path Path of file to get status from
7140 * uap->ub User buffer (holds file status info)
7141 * uap->xsecurity ACL to get (extended security)
7142 * uap->xsecurity_size Size of ACL
7143 *
7144 * Returns: 0 Success
7145 * !0 errno value
7146 *
7147 */
7148 int
stat_extended(__unused proc_t p,struct stat_extended_args * uap,__unused int32_t * retval)7149 stat_extended(__unused proc_t p, struct stat_extended_args *uap,
7150 __unused int32_t *retval)
7151 {
7152 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7153 uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
7154 0);
7155 }
7156
7157 /*
7158 * Returns: 0 Success
7159 * fstatat_internal:??? [see fstatat_internal() in this file]
7160 */
7161 int
stat(__unused proc_t p,struct stat_args * uap,__unused int32_t * retval)7162 stat(__unused proc_t p, struct stat_args *uap, __unused int32_t *retval)
7163 {
7164 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7165 0, 0, 0, UIO_USERSPACE, AT_FDCWD, 0);
7166 }
7167
7168 int
stat64(__unused proc_t p,struct stat64_args * uap,__unused int32_t * retval)7169 stat64(__unused proc_t p, struct stat64_args *uap, __unused int32_t *retval)
7170 {
7171 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7172 0, 0, 1, UIO_USERSPACE, AT_FDCWD, 0);
7173 }
7174
7175 /*
7176 * stat64_extended: Get file status; can handle large inode numbers; with extended security (ACL).
7177 *
7178 * Parameters: p (ignored)
7179 * uap User argument descriptor (see below)
7180 * retval (ignored)
7181 *
7182 * Indirect: uap->path Path of file to get status from
7183 * uap->ub User buffer (holds file status info)
7184 * uap->xsecurity ACL to get (extended security)
7185 * uap->xsecurity_size Size of ACL
7186 *
7187 * Returns: 0 Success
7188 * !0 errno value
7189 *
7190 */
7191 int
stat64_extended(__unused proc_t p,struct stat64_extended_args * uap,__unused int32_t * retval)7192 stat64_extended(__unused proc_t p, struct stat64_extended_args *uap, __unused int32_t *retval)
7193 {
7194 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7195 uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
7196 0);
7197 }
7198
7199 /*
7200 * lstat_extended: Get file status; does not follow links; with extended security (ACL).
7201 *
7202 * Parameters: p (ignored)
7203 * uap User argument descriptor (see below)
7204 * retval (ignored)
7205 *
7206 * Indirect: uap->path Path of file to get status from
7207 * uap->ub User buffer (holds file status info)
7208 * uap->xsecurity ACL to get (extended security)
7209 * uap->xsecurity_size Size of ACL
7210 *
7211 * Returns: 0 Success
7212 * !0 errno value
7213 *
7214 */
7215 int
lstat_extended(__unused proc_t p,struct lstat_extended_args * uap,__unused int32_t * retval)7216 lstat_extended(__unused proc_t p, struct lstat_extended_args *uap, __unused int32_t *retval)
7217 {
7218 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7219 uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
7220 AT_SYMLINK_NOFOLLOW);
7221 }
7222
7223 /*
7224 * Get file status; this version does not follow links.
7225 */
7226 int
lstat(__unused proc_t p,struct lstat_args * uap,__unused int32_t * retval)7227 lstat(__unused proc_t p, struct lstat_args *uap, __unused int32_t *retval)
7228 {
7229 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7230 0, 0, 0, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW);
7231 }
7232
7233 int
lstat64(__unused proc_t p,struct lstat64_args * uap,__unused int32_t * retval)7234 lstat64(__unused proc_t p, struct lstat64_args *uap, __unused int32_t *retval)
7235 {
7236 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7237 0, 0, 1, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW);
7238 }
7239
7240 /*
7241 * lstat64_extended: Get file status; can handle large inode numbers; does not
7242 * follow links; with extended security (ACL).
7243 *
7244 * Parameters: p (ignored)
7245 * uap User argument descriptor (see below)
7246 * retval (ignored)
7247 *
7248 * Indirect: uap->path Path of file to get status from
7249 * uap->ub User buffer (holds file status info)
7250 * uap->xsecurity ACL to get (extended security)
7251 * uap->xsecurity_size Size of ACL
7252 *
7253 * Returns: 0 Success
7254 * !0 errno value
7255 *
7256 */
7257 int
lstat64_extended(__unused proc_t p,struct lstat64_extended_args * uap,__unused int32_t * retval)7258 lstat64_extended(__unused proc_t p, struct lstat64_extended_args *uap, __unused int32_t *retval)
7259 {
7260 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7261 uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
7262 AT_SYMLINK_NOFOLLOW);
7263 }
7264
7265 int
fstatat(__unused proc_t p,struct fstatat_args * uap,__unused int32_t * retval)7266 fstatat(__unused proc_t p, struct fstatat_args *uap, __unused int32_t *retval)
7267 {
7268 if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_REALDEV | AT_FDONLY | AT_SYMLINK_NOFOLLOW_ANY)) {
7269 return EINVAL;
7270 }
7271
7272 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7273 0, 0, 0, UIO_USERSPACE, uap->fd, uap->flag);
7274 }
7275
7276 int
fstatat64(__unused proc_t p,struct fstatat64_args * uap,__unused int32_t * retval)7277 fstatat64(__unused proc_t p, struct fstatat64_args *uap,
7278 __unused int32_t *retval)
7279 {
7280 if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_REALDEV | AT_FDONLY | AT_SYMLINK_NOFOLLOW_ANY)) {
7281 return EINVAL;
7282 }
7283
7284 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7285 0, 0, 1, UIO_USERSPACE, uap->fd, uap->flag);
7286 }
7287
7288 /*
7289 * Get configurable pathname variables.
7290 *
7291 * Returns: 0 Success
7292 * namei:???
7293 * vn_pathconf:???
7294 *
7295 * Notes: Global implementation constants are intended to be
7296 * implemented in this function directly; all other constants
7297 * are per-FS implementation, and therefore must be handled in
7298 * each respective FS, instead.
7299 *
7300 * XXX We implement some things globally right now that should actually be
7301 * XXX per-FS; we will need to deal with this at some point.
7302 */
7303 /* ARGSUSED */
7304 int
pathconf(__unused proc_t p,struct pathconf_args * uap,int32_t * retval)7305 pathconf(__unused proc_t p, struct pathconf_args *uap, int32_t *retval)
7306 {
7307 int error;
7308 struct nameidata nd;
7309 vfs_context_t ctx = vfs_context_current();
7310
7311 NDINIT(&nd, LOOKUP, OP_PATHCONF, FOLLOW | AUDITVNPATH1,
7312 UIO_USERSPACE, uap->path, ctx);
7313 error = namei(&nd);
7314 if (error) {
7315 return error;
7316 }
7317
7318 error = vn_pathconf(nd.ni_vp, uap->name, retval, ctx);
7319
7320 vnode_put(nd.ni_vp);
7321 nameidone(&nd);
7322 return error;
7323 }
7324
7325 /*
7326 * Return target name of a symbolic link.
7327 */
7328 /* ARGSUSED */
7329 static int
readlinkat_internal(vfs_context_t ctx,int fd,vnode_t lnk_vp,user_addr_t path,enum uio_seg seg,user_addr_t buf,size_t bufsize,enum uio_seg bufseg,int * retval)7330 readlinkat_internal(vfs_context_t ctx, int fd, vnode_t lnk_vp, user_addr_t path,
7331 enum uio_seg seg, user_addr_t buf, size_t bufsize, enum uio_seg bufseg,
7332 int *retval)
7333 {
7334 vnode_t vp;
7335 uio_t auio;
7336 int error;
7337 struct nameidata nd;
7338 UIO_STACKBUF(uio_buf, 1);
7339 bool put_vnode;
7340
7341 if (bufsize > INT32_MAX) {
7342 return EINVAL;
7343 }
7344
7345 if (lnk_vp) {
7346 vp = lnk_vp;
7347 put_vnode = false;
7348 } else {
7349 NDINIT(&nd, LOOKUP, OP_READLINK, NOFOLLOW | AUDITVNPATH1,
7350 seg, path, ctx);
7351
7352 error = nameiat(&nd, fd);
7353 if (error) {
7354 return error;
7355 }
7356 vp = nd.ni_vp;
7357 put_vnode = true;
7358 nameidone(&nd);
7359 }
7360
7361 auio = uio_createwithbuffer(1, 0, bufseg, UIO_READ,
7362 &uio_buf[0], sizeof(uio_buf));
7363 uio_addiov(auio, buf, bufsize);
7364 if (vp->v_type != VLNK) {
7365 error = EINVAL;
7366 } else {
7367 #if CONFIG_MACF
7368 error = mac_vnode_check_readlink(ctx, vp);
7369 #endif
7370 if (error == 0) {
7371 error = vnode_authorize(vp, NULL, KAUTH_VNODE_READ_DATA,
7372 ctx);
7373 }
7374 if (error == 0) {
7375 error = VNOP_READLINK(vp, auio, ctx);
7376 }
7377 }
7378
7379 if (put_vnode) {
7380 vnode_put(vp);
7381 }
7382
7383 *retval = (int)(bufsize - uio_resid(auio));
7384 return error;
7385 }
7386
7387 int
freadlink(proc_t p,struct freadlink_args * uap,int32_t * retval)7388 freadlink(proc_t p, struct freadlink_args *uap, int32_t *retval)
7389 {
7390 enum uio_seg procseg;
7391 vnode_t vp;
7392 int error;
7393
7394 procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
7395
7396 AUDIT_ARG(fd, uap->fd);
7397
7398 if ((error = file_vnode(uap->fd, &vp))) {
7399 return error;
7400 }
7401 if ((error = vnode_getwithref(vp))) {
7402 file_drop(uap->fd);
7403 return error;
7404 }
7405
7406 error = readlinkat_internal(vfs_context_current(), -1,
7407 vp, 0, procseg, CAST_USER_ADDR_T(uap->buf),
7408 uap->bufsize, procseg, retval);
7409
7410 vnode_put(vp);
7411 file_drop(uap->fd);
7412 return error;
7413 }
7414
7415 int
readlink(proc_t p,struct readlink_args * uap,int32_t * retval)7416 readlink(proc_t p, struct readlink_args *uap, int32_t *retval)
7417 {
7418 enum uio_seg procseg;
7419
7420 procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
7421 return readlinkat_internal(vfs_context_current(), AT_FDCWD, NULL,
7422 CAST_USER_ADDR_T(uap->path), procseg, CAST_USER_ADDR_T(uap->buf),
7423 uap->count, procseg, retval);
7424 }
7425
7426 int
readlinkat(proc_t p,struct readlinkat_args * uap,int32_t * retval)7427 readlinkat(proc_t p, struct readlinkat_args *uap, int32_t *retval)
7428 {
7429 enum uio_seg procseg;
7430
7431 procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
7432 return readlinkat_internal(vfs_context_current(), uap->fd, NULL,
7433 CAST_USER_ADDR_T(uap->path), procseg, uap->buf, uap->bufsize, procseg,
7434 retval);
7435 }
7436
7437 /*
7438 * Change file flags, the deep inner layer.
7439 */
7440 static int
chflags0(vnode_t vp,struct vnode_attr * va,int (* setattr)(vnode_t,void *,vfs_context_t),void * arg,vfs_context_t ctx)7441 chflags0(vnode_t vp, struct vnode_attr *va,
7442 int (*setattr)(vnode_t, void *, vfs_context_t),
7443 void *arg, vfs_context_t ctx)
7444 {
7445 kauth_action_t action = 0;
7446 int error;
7447
7448 #if CONFIG_MACF
7449 error = mac_vnode_check_setflags(ctx, vp, va->va_flags);
7450 if (error) {
7451 goto out;
7452 }
7453 #endif
7454
7455 /* request authorisation, disregard immutability */
7456 if ((error = vnode_authattr(vp, va, &action, ctx)) != 0) {
7457 goto out;
7458 }
7459 /*
7460 * Request that the auth layer disregard those file flags it's allowed to when
7461 * authorizing this operation; we need to do this in order to be able to
7462 * clear immutable flags.
7463 */
7464 if (action && ((error = vnode_authorize(vp, NULL, action | KAUTH_VNODE_NOIMMUTABLE, ctx)) != 0)) {
7465 goto out;
7466 }
7467 error = (*setattr)(vp, arg, ctx);
7468
7469 #if CONFIG_MACF
7470 if (error == 0) {
7471 mac_vnode_notify_setflags(ctx, vp, va->va_flags);
7472 }
7473 #endif
7474
7475 out:
7476 return error;
7477 }
7478
7479 /*
7480 * Change file flags.
7481 *
7482 * NOTE: this will vnode_put() `vp'
7483 */
7484 static int
chflags1(vnode_t vp,int flags,vfs_context_t ctx)7485 chflags1(vnode_t vp, int flags, vfs_context_t ctx)
7486 {
7487 struct vnode_attr va;
7488 int error;
7489
7490 VATTR_INIT(&va);
7491 VATTR_SET(&va, va_flags, flags);
7492
7493 error = chflags0(vp, &va, (void *)vnode_setattr, &va, ctx);
7494 vnode_put(vp);
7495
7496 if ((error == 0) && !VATTR_IS_SUPPORTED(&va, va_flags)) {
7497 error = ENOTSUP;
7498 }
7499
7500 return error;
7501 }
7502
7503 /*
7504 * Change flags of a file given a path name.
7505 */
7506 /* ARGSUSED */
7507 int
chflags(__unused proc_t p,struct chflags_args * uap,__unused int32_t * retval)7508 chflags(__unused proc_t p, struct chflags_args *uap, __unused int32_t *retval)
7509 {
7510 vnode_t vp;
7511 vfs_context_t ctx = vfs_context_current();
7512 int error;
7513 struct nameidata nd;
7514 uint32_t wantparent = 0;
7515
7516 #if CONFIG_FILE_LEASES
7517 wantparent = WANTPARENT;
7518 #endif
7519
7520 AUDIT_ARG(fflags, uap->flags);
7521 NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1 | wantparent,
7522 UIO_USERSPACE, uap->path, ctx);
7523 error = namei(&nd);
7524 if (error) {
7525 return error;
7526 }
7527 vp = nd.ni_vp;
7528
7529 #if CONFIG_FILE_LEASES
7530 vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
7531 vnode_put(nd.ni_dvp);
7532 #endif
7533
7534 nameidone(&nd);
7535
7536 /* we don't vnode_put() here because chflags1 does internally */
7537 error = chflags1(vp, uap->flags, ctx);
7538
7539 return error;
7540 }
7541
7542 /*
7543 * Change flags of a file given a file descriptor.
7544 */
7545 /* ARGSUSED */
7546 int
fchflags(__unused proc_t p,struct fchflags_args * uap,__unused int32_t * retval)7547 fchflags(__unused proc_t p, struct fchflags_args *uap, __unused int32_t *retval)
7548 {
7549 vnode_t vp;
7550 int error;
7551
7552 AUDIT_ARG(fd, uap->fd);
7553 AUDIT_ARG(fflags, uap->flags);
7554 if ((error = file_vnode(uap->fd, &vp))) {
7555 return error;
7556 }
7557
7558 if ((error = vnode_getwithref(vp))) {
7559 file_drop(uap->fd);
7560 return error;
7561 }
7562
7563 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
7564
7565 #if CONFIG_FILE_LEASES
7566 vnode_breakdirlease(vp, true, O_WRONLY);
7567 #endif
7568
7569 /* we don't vnode_put() here because chflags1 does internally */
7570 error = chflags1(vp, uap->flags, vfs_context_current());
7571
7572 file_drop(uap->fd);
7573 return error;
7574 }
7575
7576 /*
7577 * Change security information on a filesystem object.
7578 *
7579 * Returns: 0 Success
7580 * EPERM Operation not permitted
7581 * vnode_authattr:??? [anything vnode_authattr can return]
7582 * vnode_authorize:??? [anything vnode_authorize can return]
7583 * vnode_setattr:??? [anything vnode_setattr can return]
7584 *
7585 * Notes: If vnode_authattr or vnode_authorize return EACCES, it will be
7586 * translated to EPERM before being returned.
7587 */
static int
chmod_vnode(vfs_context_t ctx, vnode_t vp, struct vnode_attr *vap)
{
	kauth_action_t action;
	int error;

	AUDIT_ARG(mode, vap->va_mode);
	/* XXX audit new args */

#if NAMEDSTREAMS
	/* chmod calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		return EPERM;
	}
#endif

#if CONFIG_MACF
	/* MAC pre-checks: each active attribute is vetted individually. */
	if (VATTR_IS_ACTIVE(vap, va_mode) &&
	    (error = mac_vnode_check_setmode(ctx, vp, (mode_t)vap->va_mode)) != 0) {
		return error;
	}

	if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid)) {
		/* -1 signals "not being changed" for the inactive id */
		if ((error = mac_vnode_check_setowner(ctx, vp,
		    VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : -1,
		    VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : -1))) {
			return error;
		}
	}

	if (VATTR_IS_ACTIVE(vap, va_acl) &&
	    (error = mac_vnode_check_setacl(ctx, vp, vap->va_acl))) {
		return error;
	}
#endif

	/* make sure that the caller is allowed to set this security information */
	if (((error = vnode_authattr(vp, vap, &action, ctx)) != 0) ||
	    ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		/* EACCES from the auth layer is reported as EPERM (see header) */
		if (error == EACCES) {
			error = EPERM;
		}
		return error;
	}

	if ((error = vnode_setattr(vp, vap, ctx)) != 0) {
		return error;
	}

#if CONFIG_MACF
	/* MAC post-notifications, mirroring the pre-checks above. */
	if (VATTR_IS_ACTIVE(vap, va_mode)) {
		mac_vnode_notify_setmode(ctx, vp, (mode_t)vap->va_mode);
	}

	if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid)) {
		mac_vnode_notify_setowner(ctx, vp,
		    VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : -1,
		    VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : -1);
	}

	if (VATTR_IS_ACTIVE(vap, va_acl)) {
		mac_vnode_notify_setacl(ctx, vp, vap->va_acl);
	}
#endif

	return error;
}
7655
7656
7657 /*
7658 * Change mode of a file given a path name.
7659 *
7660 * Returns: 0 Success
7661 * namei:??? [anything namei can return]
7662 * chmod_vnode:??? [anything chmod_vnode can return]
7663 */
static int
chmodat(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap,
    int fd, int flag, enum uio_seg segflg)
{
	struct nameidata nd;
	int follow, error;
	uint32_t wantparent = 0;

#if CONFIG_FILE_LEASES
	/* Ask namei for the parent too, so its directory lease can be broken. */
	wantparent = WANTPARENT;
#endif

	/* Either "no follow" flag suppresses following a trailing symlink. */
	follow = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) ? NOFOLLOW : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1 | wantparent,
	    segflg, path, ctx);
	if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
		/* NOFOLLOW_ANY also rejects symlinks in intermediate components. */
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}
	if ((error = nameiat(&nd, fd))) {
		return error;
	}

#if CONFIG_FILE_LEASES
	/* Break any parent-directory lease, then drop the parent iocount early. */
	vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
	vnode_put(nd.ni_dvp);
#endif

	error = chmod_vnode(ctx, nd.ni_vp, vap);
	vnode_put(nd.ni_vp);
	nameidone(&nd);
	return error;
}
7696
/*
 * Translate the mode/uid/gid/xsecurity arguments shared by chmod_extended()
 * and fchmod_extended() into a vnode_attr.  On success, *pxsecdst is either
 * NULL or a kauth_filesec_t copied in from user space; the caller owns it
 * and must release it with kauth_filesec_free().
 */
static int
chmod_extended_init(struct vnode_attr *pva, kauth_filesec_t *pxsecdst, int mode, uid_t uid,
    gid_t gid, user_addr_t xsecurity)
{
	int error;

	VATTR_INIT(pva);

	/* A mode of -1 means "leave the mode unchanged". */
	if (mode != -1) {
		VATTR_SET(pva, va_mode, mode & ALLPERMS);
	} else {
		pva->va_mode = 0;
	}

	if (uid != KAUTH_UID_NONE) {
		VATTR_SET(pva, va_uid, uid);
	}

	if (gid != KAUTH_GID_NONE) {
		VATTR_SET(pva, va_gid, gid);
	}

	*pxsecdst = NULL;
	switch (xsecurity) {
	case USER_ADDR_NULL:
		/* No ACL change requested. */
		break;

	case CAST_USER_ADDR_T((void *)1): /* _FILESEC_REMOVE_ACL */
		/* Sentinel address 1 requests deletion of the existing ACL. */
		VATTR_SET(pva, va_acl, NULL);
		break;

	default:
		/* Copy the filesec in from user space; caller must free it. */
		if ((error = kauth_copyinfilesec(xsecurity, pxsecdst)) != 0) {
			return error;
		}

		VATTR_SET(pva, va_acl, &(*pxsecdst)->fsec_acl);
		pva->va_vaflags |= VA_FILESEC_ACL;
		KAUTH_DEBUG("CHMOD - setting ACL with %d entries", pva->va_acl->acl_entrycount);
		break;
	}

	return 0;
}
7741
7742 /*
7743 * chmod_extended: Change the mode of a file given a path name; with extended
7744 * argument list (including extended security (ACL)).
7745 *
7746 * Parameters: p Process requesting the open
7747 * uap User argument descriptor (see below)
7748 * retval (ignored)
7749 *
7750 * Indirect: uap->path Path to object (same as 'chmod')
7751 * uap->uid UID to set
7752 * uap->gid GID to set
7753 * uap->mode File mode to set (same as 'chmod')
7754 * uap->xsecurity ACL to set (or delete)
7755 *
7756 * Returns: 0 Success
7757 * !0 errno value
7758 *
7759 * Notes: The kauth_filesec_t in 'va', if any, is in host byte order.
7760 *
 * XXX:		We should enumerate the possible errno values here, and where
7762 * in the code they originated.
7763 */
int
chmod_extended(__unused proc_t p, struct chmod_extended_args *uap, __unused int32_t *retval)
{
	int error;
	struct vnode_attr va;
	kauth_filesec_t xsecdst = NULL;

	AUDIT_ARG(owner, uap->uid, uap->gid);

	/* Build the vnode_attr (and possibly copy in an ACL) from the args. */
	error = chmod_extended_init(&va, &xsecdst, uap->mode, uap->uid,
	    uap->gid, uap->xsecurity);

	if (error) {
		return error;
	}

	/* Same path-based flow as chmod(2), relative to the CWD. */
	error = chmodat(vfs_context_current(), uap->path, &va, AT_FDCWD, 0,
	    UIO_USERSPACE);

	/* Release the filesec allocated by chmod_extended_init(), if any. */
	if (xsecdst != NULL) {
		kauth_filesec_free(xsecdst);
	}
	return error;
}
7788
7789 /*
7790 * Returns: 0 Success
7791 * chmodat:??? [anything chmodat can return]
7792 */
7793 static int
fchmodat_internal(vfs_context_t ctx,user_addr_t path,int mode,int fd,int flag,enum uio_seg segflg)7794 fchmodat_internal(vfs_context_t ctx, user_addr_t path, int mode, int fd,
7795 int flag, enum uio_seg segflg)
7796 {
7797 struct vnode_attr va;
7798
7799 VATTR_INIT(&va);
7800 VATTR_SET(&va, va_mode, mode & ALLPERMS);
7801
7802 return chmodat(ctx, path, &va, fd, flag, segflg);
7803 }
7804
7805 int
chmod(__unused proc_t p,struct chmod_args * uap,__unused int32_t * retval)7806 chmod(__unused proc_t p, struct chmod_args *uap, __unused int32_t *retval)
7807 {
7808 return fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
7809 AT_FDCWD, 0, UIO_USERSPACE);
7810 }
7811
7812 int
fchmodat(__unused proc_t p,struct fchmodat_args * uap,__unused int32_t * retval)7813 fchmodat(__unused proc_t p, struct fchmodat_args *uap, __unused int32_t *retval)
7814 {
7815 if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) {
7816 return EINVAL;
7817 }
7818
7819 return fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
7820 uap->fd, uap->flag, UIO_USERSPACE);
7821 }
7822
7823 /*
7824 * Change mode of a file given a file descriptor.
7825 */
static int
fchmod1(__unused proc_t p, int fd, struct vnode_attr *vap)
{
	vnode_t vp;
	int error;

	AUDIT_ARG(fd, fd);

	/* file_vnode() takes a use-count on the fd; paired with file_drop(). */
	if ((error = file_vnode(fd, &vp)) != 0) {
		return error;
	}
	if ((error = vnode_getwithref(vp)) != 0) {
		file_drop(fd);
		return error;
	}
	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

#if CONFIG_FILE_LEASES
	/* Break any lease held on the parent directory before the change. */
	vnode_breakdirlease(vp, true, O_WRONLY);
#endif

	error = chmod_vnode(vfs_context_current(), vp, vap);
	(void)vnode_put(vp);
	file_drop(fd);

	return error;
}
7853
7854 /*
7855 * fchmod_extended: Change mode of a file given a file descriptor; with
7856 * extended argument list (including extended security (ACL)).
7857 *
7858 * Parameters: p Process requesting to change file mode
7859 * uap User argument descriptor (see below)
7860 * retval (ignored)
7861 *
7862 * Indirect: uap->mode File mode to set (same as 'chmod')
7863 * uap->uid UID to set
7864 * uap->gid GID to set
7865 * uap->xsecurity ACL to set (or delete)
7866 * uap->fd File descriptor of file to change mode
7867 *
7868 * Returns: 0 Success
7869 * !0 errno value
7870 *
7871 */
int
fchmod_extended(proc_t p, struct fchmod_extended_args *uap, __unused int32_t *retval)
{
	int error;
	struct vnode_attr va;
	kauth_filesec_t xsecdst = NULL;

	AUDIT_ARG(owner, uap->uid, uap->gid);

	/* Build the vnode_attr (and possibly copy in an ACL) from the args. */
	error = chmod_extended_init(&va, &xsecdst, uap->mode, uap->uid,
	    uap->gid, uap->xsecurity);

	if (error) {
		return error;
	}

	error = fchmod1(p, uap->fd, &va);

	/* Release the filesec allocated by chmod_extended_init(), if any. */
	if (xsecdst != NULL) {
		kauth_filesec_free(xsecdst);
	}
	return error;
}
7895
7896 int
fchmod(proc_t p,struct fchmod_args * uap,__unused int32_t * retval)7897 fchmod(proc_t p, struct fchmod_args *uap, __unused int32_t *retval)
7898 {
7899 struct vnode_attr va;
7900
7901 VATTR_INIT(&va);
7902 VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
7903
7904 return fchmod1(p, uap->fd, &va);
7905 }
7906
/*
 * Common implementation for the chown()-family syscalls: change the owner
 * and/or group of 'vp'.  A uid/gid of VNOVAL leaves that id unchanged.
 * Authorization failures are reported as EPERM rather than EACCES.
 */
static int
vn_chown_internal(__unused vfs_context_t ctx, vnode_t vp, uid_t uid, gid_t gid)
{
	struct vnode_attr va;
	kauth_action_t action;
	int error;

	VATTR_INIT(&va);
	if (uid != (uid_t)VNOVAL) {
		VATTR_SET(&va, va_uid, uid);
	}
	if (gid != (gid_t)VNOVAL) {
		VATTR_SET(&va, va_gid, gid);
	}

#if NAMEDSTREAMS
	/* chown calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		error = EPERM;
		goto out;
	}
#endif

#if CONFIG_MACF
	/* Let MAC policy modules veto the ownership change first. */
	error = mac_vnode_check_setowner(ctx, vp, uid, gid);
	if (error) {
		goto out;
	}
#endif

	/* preflight and authorize attribute changes */
	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
		goto out;
	}
	if (action && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		/*
		 * EACCES is only allowed from namei(); permissions failure should
		 * return EPERM, so we need to translate the error code.
		 */
		if (error == EACCES) {
			error = EPERM;
		}

		goto out;
	}

#if CONFIG_FILE_LEASES
	/* Break any lease on the parent directory before the change. */
	vnode_breakdirlease(vp, true, O_WRONLY);
#endif

	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	/* Notify MAC modules only on a successful ownership change. */
	if (error == 0) {
		mac_vnode_notify_setowner(ctx, vp, uid, gid);
	}
#endif

out:
	return error;
}
7968
7969 /*
7970 * Set ownership given a path name.
7971 */
7972 /* ARGSUSED */
static int
fchownat_internal(vfs_context_t ctx, int fd, user_addr_t path, uid_t uid,
    gid_t gid, int flag, enum uio_seg segflg)
{
	vnode_t vp;
	int error;
	struct nameidata nd;
	int follow;

	AUDIT_ARG(owner, uid, gid);

	/* Either "no follow" flag suppresses following a trailing symlink. */
	follow = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) ? NOFOLLOW : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1, segflg, path, ctx);
	if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}

	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}

	vp = nd.ni_vp;
	error = vn_chown_internal(ctx, vp, uid, gid);

	/* nameidone() before dropping the iocount obtained by nameiat(). */
	nameidone(&nd);
	vnode_put(vp);
	return error;
}
8002
8003 int
chown(__unused proc_t p,struct chown_args * uap,__unused int32_t * retval)8004 chown(__unused proc_t p, struct chown_args *uap, __unused int32_t *retval)
8005 {
8006 return fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
8007 uap->uid, uap->gid, 0, UIO_USERSPACE);
8008 }
8009
8010 int
lchown(__unused proc_t p,struct lchown_args * uap,__unused int32_t * retval)8011 lchown(__unused proc_t p, struct lchown_args *uap, __unused int32_t *retval)
8012 {
8013 return fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
8014 uap->owner, uap->group, AT_SYMLINK_NOFOLLOW, UIO_USERSPACE);
8015 }
8016
8017 int
fchownat(__unused proc_t p,struct fchownat_args * uap,__unused int32_t * retval)8018 fchownat(__unused proc_t p, struct fchownat_args *uap, __unused int32_t *retval)
8019 {
8020 if (uap->flag & ~AT_SYMLINK_NOFOLLOW) {
8021 return EINVAL;
8022 }
8023
8024 return fchownat_internal(vfs_context_current(), uap->fd, uap->path,
8025 uap->uid, uap->gid, uap->flag, UIO_USERSPACE);
8026 }
8027
8028 /*
8029 * Set ownership given a file descriptor.
8030 */
8031 /* ARGSUSED */
int
fchown(__unused proc_t p, struct fchown_args *uap, __unused int32_t *retval)
{
	vfs_context_t ctx = vfs_context_current();
	vnode_t vp;
	int error;

	AUDIT_ARG(owner, uap->uid, uap->gid);
	AUDIT_ARG(fd, uap->fd);

	/* file_vnode() takes a use-count on the fd; paired with file_drop(). */
	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}

	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}
	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	error = vn_chown_internal(ctx, vp, uap->uid, uap->gid);

	(void)vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
8058
8059 static int
getutimes(user_addr_t usrtvp,struct timespec * tsp)8060 getutimes(user_addr_t usrtvp, struct timespec *tsp)
8061 {
8062 int error;
8063
8064 if (usrtvp == USER_ADDR_NULL) {
8065 struct timeval old_tv;
8066 /* XXX Y2038 bug because of microtime argument */
8067 microtime(&old_tv);
8068 TIMEVAL_TO_TIMESPEC(&old_tv, &tsp[0]);
8069 tsp[1] = tsp[0];
8070 } else {
8071 if (IS_64BIT_PROCESS(current_proc())) {
8072 struct user64_timeval tv[2];
8073 error = copyin(usrtvp, (void *)tv, sizeof(tv));
8074 if (error) {
8075 return error;
8076 }
8077 TIMEVAL64_TO_TIMESPEC(&tv[0], &tsp[0]);
8078 TIMEVAL64_TO_TIMESPEC(&tv[1], &tsp[1]);
8079 } else {
8080 struct user32_timeval tv[2];
8081 error = copyin(usrtvp, (void *)tv, sizeof(tv));
8082 if (error) {
8083 return error;
8084 }
8085 TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]);
8086 TIMEVAL_TO_TIMESPEC(&tv[1], &tsp[1]);
8087 }
8088 }
8089 return 0;
8090 }
8091
/*
 * Apply an (access, modify) timespec pair to 'vp'.  'nullflag' is non-zero
 * when the caller passed a NULL times pointer (i.e. "set to now"), which
 * sets VA_UTIMES_NULL and suppresses the EACCES->EPERM translation.
 */
static int
setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts,
    int nullflag)
{
	int error;
	struct vnode_attr va;
	kauth_action_t action;

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	VATTR_INIT(&va);
	VATTR_SET(&va, va_access_time, ts[0]);
	VATTR_SET(&va, va_modify_time, ts[1]);
	if (nullflag) {
		va.va_vaflags |= VA_UTIMES_NULL;
	}

#if NAMEDSTREAMS
	/* utimes calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		error = EPERM;
		goto out;
	}
#endif

#if CONFIG_MACF
	/* Let MAC policy modules veto the timestamp change first. */
	error = mac_vnode_check_setutimes(ctx, vp, ts[0], ts[1]);
	if (error) {
		goto out;
	}
#endif
	/* Explicit times require stronger rights; map EACCES to EPERM then. */
	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
		if (!nullflag && error == EACCES) {
			error = EPERM;
		}
		goto out;
	}

	/* since we may not need to auth anything, check here */
	if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		if (!nullflag && error == EACCES) {
			error = EPERM;
		}
		goto out;
	}
	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	/* Notify MAC modules only on a successful timestamp change. */
	if (error == 0) {
		mac_vnode_notify_setutimes(ctx, vp, ts[0], ts[1]);
	}
#endif

out:
	return error;
}
8148
8149 /*
8150 * Set the access and modification times of a file.
8151 */
8152 /* ARGSUSED */
int
utimes(__unused proc_t p, struct utimes_args *uap, __unused int32_t *retval)
{
	struct timespec ts[2];
	user_addr_t usrtvp;
	int error;
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();
	uint32_t wantparent = 0;

#if CONFIG_FILE_LEASES
	/* Ask namei for the parent too, so its directory lease can be broken. */
	wantparent = WANTPARENT;
#endif

	/*
	 * AUDIT: Needed to change the order of operations to do the
	 * name lookup first because auditing wants the path.
	 */
	NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1 | wantparent,
	    UIO_USERSPACE, uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}

	/*
	 * Fetch the user-supplied time.  If usrtvp is USER_ADDR_NULL, we fetch
	 * the current time instead.
	 */
	usrtvp = uap->tptr;
	if ((error = getutimes(usrtvp, ts)) != 0) {
		goto out;
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
#endif

	error = setutimes(ctx, nd.ni_vp, ts, usrtvp == USER_ADDR_NULL);

out:
	/* The parent iocount is only held when WANTPARENT was requested. */
#if CONFIG_FILE_LEASES
	vnode_put(nd.ni_dvp);
#endif
	nameidone(&nd);
	vnode_put(nd.ni_vp);
	return error;
}
8201
8202 /*
8203 * Set the access and modification times of a file.
8204 */
8205 /* ARGSUSED */
int
futimes(__unused proc_t p, struct futimes_args *uap, __unused int32_t *retval)
{
	struct timespec ts[2];
	vnode_t vp;
	user_addr_t usrtvp;
	int error;

	AUDIT_ARG(fd, uap->fd);
	/* Fetch user times (or "now") before taking any fd references. */
	usrtvp = uap->tptr;
	if ((error = getutimes(usrtvp, ts)) != 0) {
		return error;
	}
	/* file_vnode() takes a use-count on the fd; paired with file_drop(). */
	if ((error = file_vnode(uap->fd, &vp)) != 0) {
		return error;
	}
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

#if CONFIG_FILE_LEASES
	/* Break any lease on the parent directory before the change. */
	vnode_breakdirlease(vp, true, O_WRONLY);
#endif

	error = setutimes(vfs_context_current(), vp, ts, usrtvp == 0);

	vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
8237
8238 static int
truncate_validate_common(proc_t p,off_t length)8239 truncate_validate_common(proc_t p, off_t length)
8240 {
8241 rlim_t fsize_limit;
8242
8243 if (length < 0) {
8244 return EINVAL;
8245 }
8246
8247 fsize_limit = proc_limitgetcur(p, RLIMIT_FSIZE);
8248 if ((rlim_t)length > fsize_limit) {
8249 psignal(p, SIGXFSZ);
8250 return EFBIG;
8251 }
8252
8253 return 0;
8254 }
8255
/*
 * Set the data size of 'vp' to 'length'.  When 'need_auth' is false the
 * caller (ftruncate) already holds write authorization from open time and
 * the vnode_authattr/vnode_authorize step is skipped.
 */
static int
truncate_internal(vnode_t vp, off_t length, kauth_cred_t cred,
    vfs_context_t ctx, boolean_t need_auth)
{
	struct vnode_attr va;
	kauth_action_t action;
	int error;

	VATTR_INIT(&va);
	VATTR_SET(&va, va_data_size, length);

#if CONFIG_MACF
	/* Let MAC policy modules veto the truncate first. */
	error = mac_vnode_check_truncate(ctx, cred, vp);
	if (error) {
		return error;
	}
#endif

	/*
	 * If we reached here from `ftruncate` then we already did an effective
	 * `vnode_authorize` upon open. We honour the result from then.
	 */
	if (need_auth) {
		if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
			return error;
		}

		if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
			return error;
		}
	}

#if CONFIG_FILE_LEASES
	/* Check if there is a lease placed on the parent directory. */
	vnode_breakdirlease(vp, true, O_WRONLY);

	/* Now check if there is a lease placed on the file itself. */
	(void)vnode_breaklease(vp, O_WRONLY, ctx);
#endif

	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	/* Notify MAC modules only on a successful truncate. */
	if (error == 0) {
		mac_vnode_notify_truncate(ctx, cred, vp);
	}
#endif

	return error;
}
8306
8307 /*
8308 * Truncate a file given its path name.
8309 */
8310 /* ARGSUSED */
8311 int
truncate(proc_t p,struct truncate_args * uap,__unused int32_t * retval)8312 truncate(proc_t p, struct truncate_args *uap, __unused int32_t *retval)
8313 {
8314 vfs_context_t ctx = vfs_context_current();
8315 vnode_t vp;
8316 int error;
8317 struct nameidata nd;
8318
8319 if ((error = truncate_validate_common(p, uap->length))) {
8320 return error;
8321 }
8322
8323 NDINIT(&nd, LOOKUP, OP_TRUNCATE, FOLLOW | AUDITVNPATH1,
8324 UIO_USERSPACE, uap->path, ctx);
8325
8326 if ((error = namei(&nd))) {
8327 return error;
8328 }
8329
8330 vp = nd.ni_vp;
8331 nameidone(&nd);
8332
8333 error = truncate_internal(vp, uap->length, NOCRED, ctx, true);
8334 vnode_put(vp);
8335
8336 return error;
8337 }
8338
8339 /*
8340 * Truncate a file given a file descriptor.
8341 */
8342 /* ARGSUSED */
int
ftruncate(proc_t p, struct ftruncate_args *uap, int32_t *retval)
{
	vnode_t vp;
	struct fileproc *fp;
	int error;

	AUDIT_ARG(fd, uap->fd);

	/* Reject negative lengths and enforce RLIMIT_FSIZE up front. */
	if ((error = truncate_validate_common(p, uap->length))) {
		return error;
	}

	/* fp_lookup() takes a reference on the fileproc; paired with file_drop(). */
	if ((error = fp_lookup(p, uap->fd, &fp, 0))) {
		return error;
	}

	/* Only vnodes and POSIX shared-memory objects can be truncated. */
	switch (FILEGLOB_DTYPE(fp->fp_glob)) {
	case DTYPE_PSXSHM:
		error = pshm_truncate(p, fp, uap->fd, uap->length, retval);
		goto out;
	case DTYPE_VNODE:
		break;
	default:
		error = EINVAL;
		goto out;
	}

	vp = (vnode_t)fp_get_data(fp);

	/* The descriptor must have been opened for writing. */
	if ((fp->fp_glob->fg_flag & FWRITE) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EINVAL;
		goto out;
	}

	if ((error = vnode_getwithref(vp)) != 0) {
		goto out;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	/* need_auth is false: write access was authorized at open time. */
	error = truncate_internal(vp, uap->length, fp->fp_glob->fg_cred,
	    vfs_context_current(), false);
	vnode_put(vp);

out:
	file_drop(uap->fd);
	return error;
}
8393
8394
8395 /*
8396 * Sync an open file with synchronized I/O _file_ integrity completion
8397 */
8398 /* ARGSUSED */
8399 int
fsync(proc_t p,struct fsync_args * uap,__unused int32_t * retval)8400 fsync(proc_t p, struct fsync_args *uap, __unused int32_t *retval)
8401 {
8402 __pthread_testcancel(1);
8403 return fsync_common(p, uap, MNT_WAIT);
8404 }
8405
8406
8407 /*
8408 * Sync an open file with synchronized I/O _file_ integrity completion
8409 *
8410 * Notes: This is a legacy support function that does not test for
8411 * thread cancellation points.
8412 */
8413 /* ARGSUSED */
8414 int
fsync_nocancel(proc_t p,struct fsync_nocancel_args * uap,__unused int32_t * retval)8415 fsync_nocancel(proc_t p, struct fsync_nocancel_args *uap, __unused int32_t *retval)
8416 {
8417 return fsync_common(p, (struct fsync_args *)uap, MNT_WAIT);
8418 }
8419
8420
8421 /*
8422 * Sync an open file with synchronized I/O _data_ integrity completion
8423 */
8424 /* ARGSUSED */
8425 int
fdatasync(proc_t p,struct fdatasync_args * uap,__unused int32_t * retval)8426 fdatasync(proc_t p, struct fdatasync_args *uap, __unused int32_t *retval)
8427 {
8428 __pthread_testcancel(1);
8429 return fsync_common(p, (struct fsync_args *)uap, MNT_DWAIT);
8430 }
8431
8432
8433 /*
8434 * fsync_common
8435 *
8436 * Common fsync code to support both synchronized I/O file integrity completion
8437 * (normal fsync) and synchronized I/O data integrity completion (fdatasync).
8438 *
8439 * If 'flags' is MNT_DWAIT, the caller is requesting data integrity, which
8440 * will only guarantee that the file data contents are retrievable. If
 * 'flags' is MNT_WAIT, the caller is requesting file integrity, which also
8442 * includes additional metadata unnecessary for retrieving the file data
8443 * contents, such as atime, mtime, ctime, etc., also be committed to stable
8444 * storage.
8445 *
8446 * Parameters: p The process
8447 * uap->fd The descriptor to synchronize
8448 * flags The data integrity flags
8449 *
8450 * Returns: int Success
8451 * fp_getfvp:EBADF Bad file descriptor
8452 * fp_getfvp:ENOTSUP fd does not refer to a vnode
8453 * VNOP_FSYNC:??? unspecified
8454 *
8455 * Notes: We use struct fsync_args because it is a short name, and all
8456 * caller argument structures are otherwise identical.
8457 */
static int
fsync_common(proc_t p, struct fsync_args *uap, int flags)
{
	vnode_t vp;
	struct fileproc *fp;
	vfs_context_t ctx = vfs_context_current();
	int error;

	AUDIT_ARG(fd, uap->fd);

	/* fp_getfvp() takes a use-count on the fd; paired with file_drop(). */
	if ((error = fp_getfvp(p, uap->fd, &fp, &vp))) {
		return error;
	}
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	error = VNOP_FSYNC(vp, flags, ctx);

#if NAMEDRSRCFORK
	/* Sync resource fork shadow file if necessary. */
	if ((error == 0) &&
	    (vp->v_flag & VISNAMEDSTREAM) &&
	    (vp->v_parent != NULLVP) &&
	    vnode_isshadow(vp) &&
	    (fp->fp_glob->fg_flag & FWASWRITTEN)) {
		/* Best-effort: flush failure is deliberately ignored. */
		(void) vnode_flushnamedstream(vp->v_parent, vp, ctx);
	}
#endif

	(void)vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
8495
8496 /*
8497 * Duplicate files. Source must be a file, target must be a file or
8498 * must not exist.
8499 *
8500 * XXX Copyfile authorisation checking is woefully inadequate, and will not
8501 * perform inheritance correctly.
8502 */
8503 /* ARGSUSED */
int
copyfile(__unused proc_t p, struct copyfile_args *uap, __unused int32_t *retval)
{
	vnode_t tvp, fvp, tdvp, sdvp;
	struct nameidata fromnd, tond;
	int error;
	vfs_context_t ctx = vfs_context_current();

	/* Check that the flags are valid. */
	if (uap->flags & ~CPF_MASK) {
		return EINVAL;
	}

	/* Look up the source first (auditing wants its path). */
	NDINIT(&fromnd, LOOKUP, OP_COPYFILE, AUDITVNPATH1,
	    UIO_USERSPACE, uap->from, ctx);
	if ((error = namei(&fromnd))) {
		return error;
	}
	fvp = fromnd.ni_vp;

	/* CREATE lookup for the target; SAVESTART keeps the start dir (sdvp). */
	NDINIT(&tond, CREATE, OP_LINK,
	    LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART | AUDITVNPATH2 | CN_NBMOUNTLOOK,
	    UIO_USERSPACE, uap->to, ctx);
	if ((error = namei(&tond))) {
		goto out1;
	}
	tdvp = tond.ni_dvp;
	tvp = tond.ni_vp;

	/* An existing target is only acceptable with CPF_OVERWRITE. */
	if (tvp != NULL) {
		if (!(uap->flags & CPF_OVERWRITE)) {
			error = EEXIST;
			goto out;
		}
	}

	if (fvp->v_type == VDIR || (tvp && tvp->v_type == VDIR)) {
		error = EISDIR;
		goto out;
	}

	if (fvp->v_type == VSOCK && fvp->v_tag != VT_FDESC) {
		error = EOPNOTSUPP;
		goto out;
	}

#if CONFIG_MACF
	if ((error = mac_vnode_check_copyfile(ctx, tdvp, tvp, fvp, &tond.ni_cnd, (mode_t)uap->mode, uap->flags)) != 0) {
		goto out;
	}
#endif /* CONFIG_MACF */

	/* Need read on source, delete on an existing target, add on target dir. */
	if ((error = vnode_authorize(fvp, NULL, KAUTH_VNODE_READ_DATA, ctx)) != 0) {
		goto out;
	}
	if (tvp) {
		if ((error = vnode_authorize(tvp, tdvp, KAUTH_VNODE_DELETE, ctx)) != 0) {
			goto out;
		}
	}
	if ((error = vnode_authorize(tdvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
		goto out;
	}

	if (fvp == tdvp) {
		error = EINVAL;
	}
	/*
	 * If source is the same as the destination (that is the
	 * same inode number) then there is nothing to do.
	 * (fixed to have POSIX semantics - CSM 3/2/98)
	 */
	if (fvp == tvp) {
		/* -1 is an internal "silent success" sentinel, mapped to 0 below. */
		error = -1;
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(tdvp, false, O_WRONLY);
#endif

	if (!error) {
		error = VNOP_COPYFILE(fvp, tdvp, tvp, &tond.ni_cnd, uap->mode, uap->flags, ctx);
	}
out:
	sdvp = tond.ni_startdir;
	/*
	 * nameidone has to happen before we vnode_put(tdvp)
	 * since it may need to release the fs_nodelock on the tdvp
	 */
	nameidone(&tond);

	if (tvp) {
		vnode_put(tvp);
	}
	vnode_put(tdvp);
	vnode_put(sdvp);
out1:
	vnode_put(fvp);

	nameidone(&fromnd);

	if (error == -1) {
		return 0;
	}
	return error;
}
8610
8611 #define CLONE_SNAPSHOT_FALLBACKS_ENABLED 1
8612
8613 /*
8614 * Helper function for doing clones. The caller is expected to provide an
8615 * iocounted source vnode and release it.
8616 */
static int
clonefile_internal(vnode_t fvp, boolean_t data_read_authorised, int dst_dirfd,
    user_addr_t dst, uint32_t flags, vfs_context_t ctx)
{
	vnode_t tvp, tdvp;
	struct nameidata tond;
	int error;
	int follow;
	boolean_t free_src_acl;
	boolean_t attr_cleanup;
	enum vtype v_type;
	kauth_action_t action;
	struct componentname *cnp;
	uint32_t defaulted = 0;
	struct vnode_attr va;
	struct vnode_attr nva;
	uint32_t vnop_flags;

	/* Only regular files, symlinks and (non-root, non-mounted-on) dirs. */
	v_type = vnode_vtype(fvp);
	switch (v_type) {
	case VLNK:
	/* FALLTHRU */
	case VREG:
		action = KAUTH_VNODE_ADD_FILE;
		break;
	case VDIR:
		if (vnode_isvroot(fvp) || vnode_ismount(fvp) ||
		    fvp->v_mountedhere) {
			return EINVAL;
		}
		action = KAUTH_VNODE_ADD_SUBDIRECTORY;
		break;
	default:
		return EINVAL;
	}

	AUDIT_ARG(fd2, dst_dirfd);
	AUDIT_ARG(value32, flags);

	follow = (flags & CLONE_NOFOLLOW) ? NOFOLLOW : FOLLOW;
	NDINIT(&tond, CREATE, OP_LINK, follow | WANTPARENT | AUDITVNPATH2,
	    UIO_USERSPACE, dst, ctx);
	if ((error = nameiat(&tond, dst_dirfd))) {
		return error;
	}
	cnp = &tond.ni_cnd;
	tdvp = tond.ni_dvp;
	tvp = tond.ni_vp;

	free_src_acl = FALSE;
	attr_cleanup = FALSE;

	/* The destination must not already exist. */
	if (tvp != NULL) {
		error = EEXIST;
		goto out;
	}

	/* Clones cannot cross mount points. */
	if (vnode_mount(tdvp) != vnode_mount(fvp)) {
		error = EXDEV;
		goto out;
	}

#if CONFIG_MACF
	if ((error = mac_vnode_check_clone(ctx, tdvp, fvp, cnp))) {
		goto out;
	}
#endif
	if ((error = vnode_authorize(tdvp, NULL, action, ctx))) {
		goto out;
	}

	/* fclonefileat() already authorized data-read at open time. */
	action = KAUTH_VNODE_GENERIC_READ_BITS;
	if (data_read_authorised) {
		action &= ~KAUTH_VNODE_READ_DATA;
	}
	if ((error = vnode_authorize(fvp, NULL, action, ctx))) {
		goto out;
	}

	/*
	 * certain attributes may need to be changed from the source, we ask for
	 * those here with the exception of source file's ACLs unless the CLONE_ACL
	 * flag is specified. By default, the clone file will inherit the target
	 * directory's ACLs unless the CLONE_ACL flag is specified then it
	 * will inherit the source file's ACLs instead.
	 */
	VATTR_INIT(&va);
	VATTR_WANTED(&va, va_uid);
	VATTR_WANTED(&va, va_gid);
	VATTR_WANTED(&va, va_mode);
	VATTR_WANTED(&va, va_flags);
	if (flags & CLONE_ACL) {
		VATTR_WANTED(&va, va_acl);
	}

	if ((error = vnode_getattr(fvp, &va, ctx)) != 0) {
		goto out;
	}

	VATTR_INIT(&nva);
	VATTR_SET(&nva, va_type, v_type);
	if (VATTR_IS_SUPPORTED(&va, va_acl) && va.va_acl != NULL) {
		/* The ACL returned by vnode_getattr() must be freed at out:. */
		VATTR_SET(&nva, va_acl, va.va_acl);
		free_src_acl = TRUE;
	}

	/* Handle ACL inheritance, initialize vap. */
	if (v_type == VLNK) {
		error = vnode_authattr_new(tdvp, &nva, 0, ctx);
	} else {
		/* vn_attribute_prepare() requires vn_attribute_cleanup() at out:. */
		error = vn_attribute_prepare(tdvp, &nva, &defaulted, ctx);
		if (error) {
			goto out;
		}
		attr_cleanup = TRUE;
	}

	vnop_flags = VNODE_CLONEFILE_DEFAULT;
	/*
	 * We've got initial values for all security parameters,
	 * If we are superuser, then we can change owners to be the
	 * same as the source. Both superuser and the owner have default
	 * WRITE_SECURITY privileges so all other fields can be taken
	 * from source as well.
	 */
	if (!(flags & CLONE_NOOWNERCOPY) && vfs_context_issuser(ctx)) {
		if (VATTR_IS_SUPPORTED(&va, va_uid)) {
			VATTR_SET(&nva, va_uid, va.va_uid);
		}
		if (VATTR_IS_SUPPORTED(&va, va_gid)) {
			VATTR_SET(&nva, va_gid, va.va_gid);
		}
	} else {
		vnop_flags |= VNODE_CLONEFILE_NOOWNERCOPY;
	}

	if (VATTR_IS_SUPPORTED(&va, va_mode)) {
		VATTR_SET(&nva, va_mode, va.va_mode);
	}
	if (VATTR_IS_SUPPORTED(&va, va_flags)) {
		VATTR_SET(&nva, va_flags,
		    ((va.va_flags & ~(UF_DATAVAULT | SF_RESTRICTED)) | /* Turn off from source */
		    (nva.va_flags & (UF_DATAVAULT | SF_RESTRICTED))));
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(tdvp, false, O_WRONLY);
#endif

	error = VNOP_CLONEFILE(fvp, tdvp, &tvp, cnp, &nva, vnop_flags, ctx);

	if (!error && tvp) {
		int update_flags = 0;
#if CONFIG_FSE
		int fsevent;
#endif /* CONFIG_FSE */

		/*
		 * If some of the requested attributes weren't handled by the
		 * VNOP, use our fallback code.
		 */
		if (!VATTR_ALL_SUPPORTED(&nva)) {
			(void)vnode_setattr_fallback(tvp, &nva, ctx);
		}

#if CONFIG_MACF
		(void)vnode_label(vnode_mount(tvp), tdvp, tvp, cnp,
		    VNODE_LABEL_CREATE, ctx);
#endif

		// Make sure the name & parent pointers are hooked up
		if (tvp->v_name == NULL) {
			update_flags |= VNODE_UPDATE_NAME;
		}
		if (tvp->v_parent == NULLVP) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		if (update_flags) {
			(void)vnode_update_identity(tvp, tdvp, cnp->cn_nameptr,
			    cnp->cn_namelen, cnp->cn_hash, update_flags);
		}

#if CONFIG_FSE
		switch (vnode_vtype(tvp)) {
		case VLNK:
		/* FALLTHRU */
		case VREG:
			fsevent = FSE_CREATE_FILE;
			break;
		case VDIR:
			fsevent = FSE_CREATE_DIR;
			break;
		default:
			goto out;
		}

		if (need_fsevent(fsevent, tvp)) {
			/*
			 * The following is a sequence of three explicit events.
			 * A pair of FSE_CLONE events representing the source and destination
			 * followed by an FSE_CREATE_[FILE | DIR] for the destination.
			 * fseventsd may coalesce the destination clone and create events
			 * into a single event resulting in the following sequence for a client
			 * FSE_CLONE (src)
			 * FSE_CLONE | FSE_CREATE (dst)
			 */
			add_fsevent(FSE_CLONE, ctx, FSE_ARG_VNODE, fvp, FSE_ARG_VNODE, tvp,
			    FSE_ARG_DONE);
			add_fsevent(fsevent, ctx, FSE_ARG_VNODE, tvp,
			    FSE_ARG_DONE);
		}
#endif /* CONFIG_FSE */
	}

out:
	if (attr_cleanup) {
		vn_attribute_cleanup(&nva, defaulted);
	}
	if (free_src_acl && va.va_acl) {
		kauth_acl_free(va.va_acl);
	}
	nameidone(&tond);
	if (tvp) {
		vnode_put(tvp);
	}
	vnode_put(tdvp);
	return error;
}
8846
8847 /*
8848 * clone files or directories, target must not exist.
8849 */
8850 /* ARGSUSED */
8851 int
clonefileat(__unused proc_t p,struct clonefileat_args * uap,__unused int32_t * retval)8852 clonefileat(__unused proc_t p, struct clonefileat_args *uap,
8853 __unused int32_t *retval)
8854 {
8855 vnode_t fvp;
8856 struct nameidata fromnd;
8857 int follow;
8858 int error;
8859 vfs_context_t ctx = vfs_context_current();
8860
8861 /* Check that the flags are valid. */
8862 if (uap->flags & ~(CLONE_NOFOLLOW | CLONE_NOOWNERCOPY | CLONE_ACL)) {
8863 return EINVAL;
8864 }
8865
8866 AUDIT_ARG(fd, uap->src_dirfd);
8867
8868 follow = (uap->flags & CLONE_NOFOLLOW) ? NOFOLLOW : FOLLOW;
8869 NDINIT(&fromnd, LOOKUP, OP_COPYFILE, follow | AUDITVNPATH1,
8870 UIO_USERSPACE, uap->src, ctx);
8871 if ((error = nameiat(&fromnd, uap->src_dirfd))) {
8872 return error;
8873 }
8874
8875 fvp = fromnd.ni_vp;
8876 nameidone(&fromnd);
8877
8878 error = clonefile_internal(fvp, FALSE, uap->dst_dirfd, uap->dst,
8879 uap->flags, ctx);
8880
8881 vnode_put(fvp);
8882 return error;
8883 }
8884
8885 int
fclonefileat(__unused proc_t p,struct fclonefileat_args * uap,__unused int32_t * retval)8886 fclonefileat(__unused proc_t p, struct fclonefileat_args *uap,
8887 __unused int32_t *retval)
8888 {
8889 vnode_t fvp;
8890 struct fileproc *fp;
8891 int error;
8892 vfs_context_t ctx = vfs_context_current();
8893
8894 /* Check that the flags are valid. */
8895 if (uap->flags & ~(CLONE_NOFOLLOW | CLONE_NOOWNERCOPY | CLONE_ACL)) {
8896 return EINVAL;
8897 }
8898
8899 AUDIT_ARG(fd, uap->src_fd);
8900 error = fp_getfvp(p, uap->src_fd, &fp, &fvp);
8901 if (error) {
8902 return error;
8903 }
8904
8905 if ((fp->fp_glob->fg_flag & FREAD) == 0) {
8906 AUDIT_ARG(vnpath_withref, fvp, ARG_VNODE1);
8907 error = EBADF;
8908 goto out;
8909 }
8910
8911 if ((error = vnode_getwithref(fvp))) {
8912 goto out;
8913 }
8914
8915 AUDIT_ARG(vnpath, fvp, ARG_VNODE1);
8916
8917 error = clonefile_internal(fvp, TRUE, uap->dst_dirfd, uap->dst,
8918 uap->flags, ctx);
8919
8920 vnode_put(fvp);
8921 out:
8922 file_drop(uap->src_fd);
8923 return error;
8924 }
8925
8926 static int
rename_submounts_callback(mount_t mp,void * arg)8927 rename_submounts_callback(mount_t mp, void *arg)
8928 {
8929 int error = 0;
8930 mount_t pmp = (mount_t)arg;
8931 int prefix_len = (int)strlen(pmp->mnt_vfsstat.f_mntonname);
8932
8933 if (strncmp(mp->mnt_vfsstat.f_mntonname, pmp->mnt_vfsstat.f_mntonname, prefix_len) != 0) {
8934 return 0;
8935 }
8936
8937 if (mp->mnt_vfsstat.f_mntonname[prefix_len] != '/') {
8938 return 0;
8939 }
8940
8941 if ((error = vfs_busy(mp, LK_NOWAIT))) {
8942 printf("vfs_busy failed with %d for %s\n", error, mp->mnt_vfsstat.f_mntonname);
8943 return -1;
8944 }
8945
8946 size_t pathlen = MAXPATHLEN;
8947 if ((error = vn_getpath_ext(mp->mnt_vnodecovered, NULL, mp->mnt_vfsstat.f_mntonname, &pathlen, VN_GETPATH_FSENTER))) {
8948 printf("vn_getpath_ext failed with %d for mnt_vnodecovered of %s\n", error, mp->mnt_vfsstat.f_mntonname);
8949 }
8950
8951 vfs_unbusy(mp);
8952
8953 return error;
8954 }
8955
8956 /*
8957 * Rename files. Source and destination must either both be directories,
8958 * or both not be directories. If target is a directory, it must be empty.
8959 */
8960 /* ARGSUSED */
8961 static int
renameat_internal(vfs_context_t ctx,int fromfd,user_addr_t from,int tofd,user_addr_t to,int segflg,u_int uflags)8962 renameat_internal(vfs_context_t ctx, int fromfd, user_addr_t from,
8963 int tofd, user_addr_t to, int segflg, u_int uflags)
8964 {
8965 vnode_t tvp, tdvp;
8966 vnode_t fvp, fdvp;
8967 vnode_t mnt_fvp;
8968 struct nameidata *fromnd, *tond;
8969 int error = 0;
8970 int do_retry;
8971 int retry_count;
8972 int mntrename;
8973 int need_event;
8974 int need_kpath2;
8975 int has_listeners;
8976 const char *oname = NULL;
8977 char *from_name = NULL, *to_name = NULL;
8978 char *from_name_no_firmlink = NULL, *to_name_no_firmlink = NULL;
8979 int from_len = 0, to_len = 0;
8980 int from_len_no_firmlink = 0, to_len_no_firmlink = 0;
8981 int holding_mntlock;
8982 int vn_authorize_skipped;
8983 mount_t locked_mp = NULL;
8984 vnode_t oparent = NULLVP;
8985 #if CONFIG_FSE
8986 fse_info from_finfo = {}, to_finfo;
8987 #endif
8988 int from_truncated = 0, to_truncated = 0;
8989 int from_truncated_no_firmlink = 0, to_truncated_no_firmlink = 0;
8990 int batched = 0;
8991 struct vnode_attr *fvap, *tvap;
8992 int continuing = 0;
8993 vfs_rename_flags_t flags = uflags & VFS_RENAME_FLAGS_MASK;
8994 int32_t nofollow_any = 0;
8995 /* carving out a chunk for structs that are too big to be on stack. */
8996 struct {
8997 struct nameidata from_node, to_node;
8998 struct vnode_attr fv_attr, tv_attr;
8999 } * __rename_data;
9000
9001 __rename_data = kalloc_type(typeof(*__rename_data), Z_WAITOK);
9002 fromnd = &__rename_data->from_node;
9003 tond = &__rename_data->to_node;
9004
9005 holding_mntlock = 0;
9006 do_retry = 0;
9007 retry_count = 0;
9008 retry:
9009 fvp = tvp = NULL;
9010 fdvp = tdvp = NULL;
9011 fvap = tvap = NULL;
9012 mnt_fvp = NULLVP;
9013 mntrename = FALSE;
9014 vn_authorize_skipped = FALSE;
9015
9016 if (uflags & RENAME_NOFOLLOW_ANY) {
9017 nofollow_any = NAMEI_NOFOLLOW_ANY;
9018 }
9019 NDINIT(fromnd, DELETE, OP_UNLINK, WANTPARENT | AUDITVNPATH1,
9020 segflg, from, ctx);
9021 fromnd->ni_flag = NAMEI_COMPOUNDRENAME | nofollow_any;
9022
9023 NDINIT(tond, RENAME, OP_RENAME, WANTPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK,
9024 segflg, to, ctx);
9025 tond->ni_flag = NAMEI_COMPOUNDRENAME | nofollow_any;
9026
9027 continue_lookup:
9028 if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
9029 if ((error = nameiat(fromnd, fromfd))) {
9030 goto out1;
9031 }
9032 fdvp = fromnd->ni_dvp;
9033 fvp = fromnd->ni_vp;
9034
9035 if (fvp && fvp->v_type == VDIR) {
9036 tond->ni_cnd.cn_flags |= WILLBEDIR;
9037 }
9038 }
9039
9040 if ((tond->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
9041 if ((error = nameiat(tond, tofd))) {
9042 /*
9043 * Translate error code for rename("dir1", "dir2/.").
9044 */
9045 if (error == EISDIR && fvp->v_type == VDIR) {
9046 error = EINVAL;
9047 }
9048 goto out1;
9049 }
9050 tdvp = tond->ni_dvp;
9051 tvp = tond->ni_vp;
9052 }
9053
9054 #if DEVELOPMENT || DEBUG
9055 /*
9056 * XXX VSWAP: Check for entitlements or special flag here
9057 * so we can restrict access appropriately.
9058 */
9059 #else /* DEVELOPMENT || DEBUG */
9060
9061 if (fromnd->ni_vp && vnode_isswap(fromnd->ni_vp) && (ctx != vfs_context_kernel())) {
9062 error = EPERM;
9063 goto out1;
9064 }
9065
9066 if (tond->ni_vp && vnode_isswap(tond->ni_vp) && (ctx != vfs_context_kernel())) {
9067 error = EPERM;
9068 goto out1;
9069 }
9070 #endif /* DEVELOPMENT || DEBUG */
9071
9072 if (!tvp && ISSET(flags, VFS_RENAME_SWAP)) {
9073 error = ENOENT;
9074 goto out1;
9075 }
9076
9077 if (tvp && ISSET(flags, VFS_RENAME_EXCL)) {
9078 int32_t pval = 0;
9079 int err = 0;
9080
9081 /*
9082 * We allow rename with VFS_RENAME_EXCL flag for an existing file which
9083 * has the same name as target iff the following conditions are met:
9084 * 1. the target file system is case insensitive
9085 * 2. source and target directories are the same
9086 * 3. source and target files are the same
9087 * 4. name only differs in case (determined by underlying filesystem)
9088 */
9089 if (fvp != tvp || fdvp != tdvp) {
9090 error = EEXIST;
9091 goto out1;
9092 }
9093
9094 /*
9095 * Assume that the target file system is case sensitive if
9096 * _PC_CASE_SENSITIVE selector isn't supported.
9097 */
9098 err = VNOP_PATHCONF(tvp, _PC_CASE_SENSITIVE, &pval, ctx);
9099 if (err != 0 || pval != 0) {
9100 error = EEXIST;
9101 goto out1;
9102 }
9103 }
9104
9105 batched = vnode_compound_rename_available(fdvp);
9106
9107 #if CONFIG_FSE
9108 need_event = need_fsevent(FSE_RENAME, fdvp);
9109 if (need_event) {
9110 if (fvp) {
9111 get_fse_info(fvp, &from_finfo, ctx);
9112 } else {
9113 error = vfs_get_notify_attributes(&__rename_data->fv_attr);
9114 if (error) {
9115 goto out1;
9116 }
9117
9118 fvap = &__rename_data->fv_attr;
9119 }
9120
9121 if (tvp) {
9122 get_fse_info(tvp, &to_finfo, ctx);
9123 } else if (batched) {
9124 error = vfs_get_notify_attributes(&__rename_data->tv_attr);
9125 if (error) {
9126 goto out1;
9127 }
9128
9129 tvap = &__rename_data->tv_attr;
9130 }
9131 }
9132 #else
9133 need_event = 0;
9134 #endif /* CONFIG_FSE */
9135
9136 has_listeners = kauth_authorize_fileop_has_listeners();
9137
9138 need_kpath2 = 0;
9139 #if CONFIG_AUDIT
9140 if (AUDIT_RECORD_EXISTS()) {
9141 need_kpath2 = 1;
9142 }
9143 #endif
9144
9145 if (need_event || has_listeners) {
9146 if (from_name == NULL) {
9147 GET_PATH(from_name);
9148 }
9149
9150 from_len = safe_getpath(fdvp, fromnd->ni_cnd.cn_nameptr, from_name, MAXPATHLEN, &from_truncated);
9151
9152 if (from_name_no_firmlink == NULL) {
9153 GET_PATH(from_name_no_firmlink);
9154 }
9155
9156 from_len_no_firmlink = safe_getpath_no_firmlink(fdvp, fromnd->ni_cnd.cn_nameptr, from_name_no_firmlink, MAXPATHLEN, &from_truncated_no_firmlink);
9157 }
9158
9159 if (need_event || need_kpath2 || has_listeners) {
9160 if (to_name == NULL) {
9161 GET_PATH(to_name);
9162 }
9163
9164 to_len = safe_getpath(tdvp, tond->ni_cnd.cn_nameptr, to_name, MAXPATHLEN, &to_truncated);
9165
9166 if (to_name_no_firmlink == NULL) {
9167 GET_PATH(to_name_no_firmlink);
9168 }
9169
9170 to_len_no_firmlink = safe_getpath_no_firmlink(tdvp, tond->ni_cnd.cn_nameptr, to_name_no_firmlink, MAXPATHLEN, &to_truncated_no_firmlink);
9171 if (to_name && need_kpath2) {
9172 AUDIT_ARG(kpath, to_name, ARG_KPATH2);
9173 }
9174 }
9175 if (!fvp) {
9176 /*
9177 * Claim: this check will never reject a valid rename.
9178 * For success, either fvp must be on the same mount as tdvp, or fvp must sit atop a vnode on the same mount as tdvp.
9179 * Suppose fdvp and tdvp are not on the same mount.
9180 * If fvp is on the same mount as tdvp, then fvp is not on the same mount as fdvp, so fvp is the root of its filesystem. If fvp is the root,
9181 * then you can't move it to within another dir on the same mountpoint.
9182 * If fvp sits atop a vnode on the same mount as fdvp, then that vnode must be part of the same mount as fdvp, which is a contradiction.
9183 *
9184 * If this check passes, then we are safe to pass these vnodes to the same FS.
9185 */
9186 if (fdvp->v_mount != tdvp->v_mount) {
9187 error = EXDEV;
9188 goto out1;
9189 }
9190 goto skipped_lookup;
9191 }
9192
9193 /*
9194 * If the source and destination are the same (i.e. they're
9195 * links to the same vnode) and the target file system is
9196 * case sensitive, then there is nothing to do.
9197 *
9198 * XXX Come back to this.
9199 */
9200 if (fvp == tvp) {
9201 int pathconf_val;
9202
9203 /*
9204 * Note: if _PC_CASE_SENSITIVE selector isn't supported,
9205 * then assume that this file system is case sensitive.
9206 */
9207 if (VNOP_PATHCONF(fvp, _PC_CASE_SENSITIVE, &pathconf_val, ctx) != 0 ||
9208 pathconf_val != 0) {
9209 vn_authorize_skipped = TRUE;
9210 goto out1;
9211 }
9212 }
9213
9214 /*
9215 * Allow the renaming of mount points.
9216 * - target must not exist
9217 * - target must reside in the same directory as source
9218 * - union mounts cannot be renamed
9219 * - the root fs, and tightly-linked system volumes, cannot be renamed
9220 *
9221 * XXX Handle this in VFS after a continued lookup (if we missed
9222 * in the cache to start off)
9223 *
9224 * N.B. If RENAME_SWAP is being used, then @tvp != NULL and so
9225 * we'll skip past here. The file system is responsible for
9226 * checking that @tvp is not a descendent of @fvp and vice versa
9227 * so it should always return EINVAL if either @tvp or @fvp is the
9228 * root of a volume.
9229 */
9230 if ((fvp->v_flag & VROOT) &&
9231 (fvp->v_type == VDIR) &&
9232 (tvp == NULL) &&
9233 (fvp->v_mountedhere == NULL) &&
9234 (fdvp == tdvp) &&
9235 ((fvp->v_mount->mnt_flag & (MNT_UNION | MNT_ROOTFS)) == 0) &&
9236 ((fvp->v_mount->mnt_kern_flag & MNTK_SYSTEM) == 0) &&
9237 (fvp->v_mount->mnt_vnodecovered != NULLVP)) {
9238 vnode_t coveredvp;
9239
9240 /* switch fvp to the covered vnode */
9241 coveredvp = fvp->v_mount->mnt_vnodecovered;
9242 if ((vnode_getwithref(coveredvp))) {
9243 error = ENOENT;
9244 goto out1;
9245 }
9246 /*
9247 * Save the 'fvp' as it is needed for vn_authorize_renamex_with_paths()
9248 * later.
9249 */
9250 mnt_fvp = fvp;
9251
9252 fvp = coveredvp;
9253 mntrename = TRUE;
9254 }
9255 /*
9256 * Check for cross-device rename.
9257 */
9258 if ((fvp->v_mount != tdvp->v_mount) ||
9259 (tvp && (fvp->v_mount != tvp->v_mount))) {
9260 error = EXDEV;
9261 goto out1;
9262 }
9263
9264 /*
9265 * If source is the same as the destination (that is the
9266 * same inode number) then there is nothing to do...
9267 * EXCEPT if the underlying file system supports case
9268 * insensitivity and is case preserving. In this case
9269 * the file system needs to handle the special case of
9270 * getting the same vnode as target (fvp) and source (tvp).
9271 *
9272 * Only file systems that support pathconf selectors _PC_CASE_SENSITIVE
9273 * and _PC_CASE_PRESERVING can have this exception, and they need to
9274 * handle the special case of getting the same vnode as target and
9275 * source. NOTE: Then the target is unlocked going into vnop_rename,
9276 * so not to cause locking problems. There is a single reference on tvp.
9277 *
9278 * NOTE - that fvp == tvp also occurs if they are hard linked and
9279 * that correct behaviour then is just to return success without doing
9280 * anything.
9281 *
9282 * XXX filesystem should take care of this itself, perhaps...
9283 */
9284 if (fvp == tvp && fdvp == tdvp) {
9285 if (fromnd->ni_cnd.cn_namelen == tond->ni_cnd.cn_namelen &&
9286 !bcmp(fromnd->ni_cnd.cn_nameptr, tond->ni_cnd.cn_nameptr,
9287 fromnd->ni_cnd.cn_namelen)) {
9288 vn_authorize_skipped = TRUE;
9289 goto out1;
9290 }
9291 }
9292
9293 if (holding_mntlock && fvp->v_mount != locked_mp) {
9294 /*
9295 * we're holding a reference and lock
9296 * on locked_mp, but it no longer matches
9297 * what we want to do... so drop our hold
9298 */
9299 mount_unlock_renames(locked_mp);
9300 mount_drop(locked_mp, 0);
9301 holding_mntlock = 0;
9302 }
9303 if (tdvp != fdvp && fvp->v_type == VDIR) {
9304 /*
9305 * serialize renames that re-shape
9306 * the tree... if holding_mntlock is
9307 * set, then we're ready to go...
9308 * otherwise we
9309 * first need to drop the iocounts
9310 * we picked up, second take the
9311 * lock to serialize the access,
9312 * then finally start the lookup
9313 * process over with the lock held
9314 */
9315 if (!holding_mntlock) {
9316 /*
9317 * need to grab a reference on
9318 * the mount point before we
9319 * drop all the iocounts... once
9320 * the iocounts are gone, the mount
9321 * could follow
9322 */
9323 locked_mp = fvp->v_mount;
9324 mount_ref(locked_mp, 0);
9325
9326 /*
9327 * nameidone has to happen before we vnode_put(tvp)
9328 * since it may need to release the fs_nodelock on the tvp
9329 */
9330 nameidone(tond);
9331
9332 if (tvp) {
9333 vnode_put(tvp);
9334 }
9335 vnode_put(tdvp);
9336
9337 /*
9338 * nameidone has to happen before we vnode_put(fdvp)
9339 * since it may need to release the fs_nodelock on the fvp
9340 */
9341 nameidone(fromnd);
9342
9343 vnode_put(fvp);
9344 vnode_put(fdvp);
9345
9346 if (mnt_fvp != NULLVP) {
9347 vnode_put(mnt_fvp);
9348 }
9349
9350 mount_lock_renames(locked_mp);
9351 holding_mntlock = 1;
9352
9353 goto retry;
9354 }
9355 } else {
9356 /*
9357 * when we dropped the iocounts to take
9358 * the lock, we allowed the identity of
9359 * the various vnodes to change... if they did,
9360 * we may no longer be dealing with a rename
9361 * that reshapes the tree... once we're holding
9362 * the iocounts, the vnodes can't change type
9363 * so we're free to drop the lock at this point
9364 * and continue on
9365 */
9366 if (holding_mntlock) {
9367 mount_unlock_renames(locked_mp);
9368 mount_drop(locked_mp, 0);
9369 holding_mntlock = 0;
9370 }
9371 }
9372
9373 if (!batched) {
9374 error = vn_authorize_renamex_with_paths(fdvp, mntrename ? mnt_fvp : fvp,
9375 &fromnd->ni_cnd, from_name, tdvp, tvp, &tond->ni_cnd, to_name, ctx,
9376 flags, NULL);
9377 if (error) {
9378 if (error == ENOENT) {
9379 if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
9380 /*
9381 * We encountered a race where after doing the namei,
9382 * tvp stops being valid. If so, simply re-drive the rename
9383 * call from the top.
9384 */
9385 do_retry = 1;
9386 retry_count += 1;
9387 }
9388 }
9389 goto out1;
9390 }
9391 }
9392
9393 /* Release the 'mnt_fvp' now that it is no longer needed. */
9394 if (mnt_fvp != NULLVP) {
9395 vnode_put(mnt_fvp);
9396 mnt_fvp = NULLVP;
9397 }
9398
9399 // save these off so we can later verify that fvp is the same
9400 oname = fvp->v_name;
9401 oparent = fvp->v_parent;
9402
9403 skipped_lookup:
9404 #if CONFIG_FILE_LEASES
9405 /* Lease break needed for source's parent dir? */
9406 vnode_breakdirlease(fdvp, false, O_WRONLY);
9407
9408 /* Lease break needed for target's parent dir? */
9409 vnode_breakdirlease(tdvp, false, O_WRONLY);
9410 #endif
9411
9412 error = vn_rename(fdvp, &fvp, &fromnd->ni_cnd, fvap,
9413 tdvp, &tvp, &tond->ni_cnd, tvap,
9414 flags, ctx);
9415
9416 if (holding_mntlock) {
9417 /*
9418 * we can drop our serialization
9419 * lock now
9420 */
9421 mount_unlock_renames(locked_mp);
9422 mount_drop(locked_mp, 0);
9423 holding_mntlock = 0;
9424 }
9425 if (error) {
9426 if (error == EDATALESS) {
9427 /*
9428 * If we've been here before, something has gone
9429 * horribly wrong and we should just get out lest
9430 * we spiral around the drain forever.
9431 */
9432 if (flags & VFS_RENAME_DATALESS) {
9433 error = EIO;
9434 goto out1;
9435 }
9436
9437 /*
9438 * The object we're renaming is dataless (or has a
9439 * dataless descendent) and requires materialization
9440 * before the rename occurs. But we're holding the
9441 * mount point's rename lock, so it's not safe to
9442 * make the upcall.
9443 *
9444 * In this case, we release the lock (above), perform
9445 * the materialization, and start the whole thing over.
9446 */
9447 error = vfs_materialize_reparent(fvp, tdvp);
9448 if (error == 0) {
9449 /*
9450 * The next time around we need to tell the
9451 * file system that the materializtaion has
9452 * been performed.
9453 */
9454 flags |= VFS_RENAME_DATALESS;
9455 do_retry = 1;
9456 }
9457 goto out1;
9458 }
9459 if (error == EKEEPLOOKING) {
9460 if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) == 0) {
9461 if ((tond->ni_flag & NAMEI_CONTLOOKUP) == 0) {
9462 panic("EKEEPLOOKING without NAMEI_CONTLOOKUP on either ndp?");
9463 }
9464 }
9465
9466 fromnd->ni_vp = fvp;
9467 tond->ni_vp = tvp;
9468
9469 goto continue_lookup;
9470 }
9471
9472 /*
9473 * We may encounter a race in the VNOP where the destination didn't
9474 * exist when we did the namei, but it does by the time we go and
9475 * try to create the entry. In this case, we should re-drive this rename
9476 * call from the top again. Currently, only HFS bubbles out ERECYCLE,
9477 * but other filesystems susceptible to this race could return it, too.
9478 */
9479 if (error == ERECYCLE) {
9480 if (retry_count < MAX_RENAME_ERECYCLE_RETRIES) {
9481 do_retry = 1;
9482 retry_count += 1;
9483 } else {
9484 printf("rename retry limit due to ERECYCLE reached\n");
9485 error = ENOENT;
9486 }
9487 }
9488
9489 /*
9490 * For compound VNOPs, the authorization callback may return
9491 * ENOENT in case of racing hardlink lookups hitting the name
9492 * cache, redrive the lookup.
9493 */
9494 if (batched && error == ENOENT) {
9495 if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
9496 do_retry = 1;
9497 retry_count += 1;
9498 }
9499 }
9500
9501 goto out1;
9502 }
9503
9504 /* call out to allow 3rd party notification of rename.
9505 * Ignore result of kauth_authorize_fileop call.
9506 */
9507 kauth_authorize_fileop(vfs_context_ucred(ctx),
9508 KAUTH_FILEOP_RENAME,
9509 (uintptr_t)from_name, (uintptr_t)to_name);
9510 if (flags & VFS_RENAME_SWAP) {
9511 kauth_authorize_fileop(vfs_context_ucred(ctx),
9512 KAUTH_FILEOP_RENAME,
9513 (uintptr_t)to_name, (uintptr_t)from_name);
9514 }
9515
9516 #if CONFIG_FSE
9517 if (from_name != NULL && to_name != NULL) {
9518 if (from_truncated || to_truncated) {
9519 // set it here since only the from_finfo gets reported up to user space
9520 from_finfo.mode |= FSE_TRUNCATED_PATH;
9521 }
9522
9523 if (tvap && tvp) {
9524 vnode_get_fse_info_from_vap(tvp, &to_finfo, tvap);
9525 }
9526 if (fvap) {
9527 vnode_get_fse_info_from_vap(fvp, &from_finfo, fvap);
9528 }
9529
9530 if (tvp) {
9531 add_fsevent(FSE_RENAME, ctx,
9532 FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
9533 FSE_ARG_FINFO, &from_finfo,
9534 FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
9535 FSE_ARG_FINFO, &to_finfo,
9536 FSE_ARG_DONE);
9537 if (flags & VFS_RENAME_SWAP) {
9538 /*
9539 * Strictly speaking, swap is the equivalent of
9540 * *three* renames. FSEvents clients should only take
9541 * the events as a hint, so we only bother reporting
9542 * two.
9543 */
9544 add_fsevent(FSE_RENAME, ctx,
9545 FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
9546 FSE_ARG_FINFO, &to_finfo,
9547 FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
9548 FSE_ARG_FINFO, &from_finfo,
9549 FSE_ARG_DONE);
9550 }
9551 } else {
9552 add_fsevent(FSE_RENAME, ctx,
9553 FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
9554 FSE_ARG_FINFO, &from_finfo,
9555 FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
9556 FSE_ARG_DONE);
9557 }
9558 }
9559 #endif /* CONFIG_FSE */
9560
9561 /*
9562 * update filesystem's mount point data
9563 */
9564 if (mntrename) {
9565 char *cp, *pathend, *mpname;
9566 char * tobuf;
9567 struct mount *mp;
9568 int maxlen;
9569 size_t len = 0;
9570
9571 mp = fvp->v_mountedhere;
9572
9573 if (vfs_busy(mp, LK_NOWAIT)) {
9574 error = EBUSY;
9575 goto out1;
9576 }
9577 tobuf = zalloc(ZV_NAMEI);
9578
9579 if (UIO_SEG_IS_USER_SPACE(segflg)) {
9580 error = copyinstr(to, tobuf, MAXPATHLEN, &len);
9581 } else {
9582 error = copystr((void *)to, tobuf, MAXPATHLEN, &len);
9583 }
9584 if (!error) {
9585 /* find current mount point prefix */
9586 pathend = &mp->mnt_vfsstat.f_mntonname[0];
9587 for (cp = pathend; *cp != '\0'; ++cp) {
9588 if (*cp == '/') {
9589 pathend = cp + 1;
9590 }
9591 }
9592 /* find last component of target name */
9593 for (mpname = cp = tobuf; *cp != '\0'; ++cp) {
9594 if (*cp == '/') {
9595 mpname = cp + 1;
9596 }
9597 }
9598
9599 /* Update f_mntonname of sub mounts */
9600 vfs_iterate(0, rename_submounts_callback, (void *)mp);
9601
9602 /* append name to prefix */
9603 maxlen = MAXPATHLEN - (int)(pathend - mp->mnt_vfsstat.f_mntonname);
9604 bzero(pathend, maxlen);
9605
9606 strlcpy(pathend, mpname, maxlen);
9607 }
9608 zfree(ZV_NAMEI, tobuf);
9609
9610 vfs_unbusy(mp);
9611
9612 vfs_event_signal(NULL, VQ_UPDATE, (intptr_t)NULL);
9613 }
9614 /*
9615 * fix up name & parent pointers. note that we first
9616 * check that fvp has the same name/parent pointers it
9617 * had before the rename call... this is a 'weak' check
9618 * at best...
9619 *
9620 * XXX oparent and oname may not be set in the compound vnop case
9621 */
9622 if (batched || (oname == fvp->v_name && oparent == fvp->v_parent)) {
9623 int update_flags;
9624
9625 update_flags = VNODE_UPDATE_NAME;
9626
9627 if (fdvp != tdvp) {
9628 update_flags |= VNODE_UPDATE_PARENT;
9629 }
9630
9631 vnode_update_identity(fvp, tdvp, tond->ni_cnd.cn_nameptr, tond->ni_cnd.cn_namelen, tond->ni_cnd.cn_hash, update_flags);
9632 }
9633 out1:
9634 /*
9635 * There are some cases (for e.g. 'fvp == tvp') when vn_authorize was
9636 * skipped earlier as no actual rename was performed.
9637 */
9638 if (vn_authorize_skipped && error == 0) {
9639 error = vn_authorize_renamex_with_paths(fdvp, fvp,
9640 &fromnd->ni_cnd, from_name, tdvp, tvp, &tond->ni_cnd, to_name, ctx,
9641 flags, NULL);
9642 if (error && error == ENOENT) {
9643 if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
9644 do_retry = 1;
9645 retry_count += 1;
9646 }
9647 }
9648 }
9649 if (to_name != NULL) {
9650 RELEASE_PATH(to_name);
9651 to_name = NULL;
9652 }
9653 if (to_name_no_firmlink != NULL) {
9654 RELEASE_PATH(to_name_no_firmlink);
9655 to_name_no_firmlink = NULL;
9656 }
9657 if (from_name != NULL) {
9658 RELEASE_PATH(from_name);
9659 from_name = NULL;
9660 }
9661 if (from_name_no_firmlink != NULL) {
9662 RELEASE_PATH(from_name_no_firmlink);
9663 from_name_no_firmlink = NULL;
9664 }
9665 if (holding_mntlock) {
9666 mount_unlock_renames(locked_mp);
9667 mount_drop(locked_mp, 0);
9668 holding_mntlock = 0;
9669 }
9670 if (tdvp) {
9671 /*
9672 * nameidone has to happen before we vnode_put(tdvp)
9673 * since it may need to release the fs_nodelock on the tdvp
9674 */
9675 nameidone(tond);
9676
9677 if (tvp) {
9678 vnode_put(tvp);
9679 }
9680 vnode_put(tdvp);
9681 }
9682 if (fdvp) {
9683 /*
9684 * nameidone has to happen before we vnode_put(fdvp)
9685 * since it may need to release the fs_nodelock on the fdvp
9686 */
9687 nameidone(fromnd);
9688
9689 if (fvp) {
9690 vnode_put(fvp);
9691 }
9692 vnode_put(fdvp);
9693 }
9694 if (mnt_fvp != NULLVP) {
9695 vnode_put(mnt_fvp);
9696 }
9697 /*
9698 * If things changed after we did the namei, then we will re-drive
9699 * this rename call from the top.
9700 */
9701 if (do_retry) {
9702 do_retry = 0;
9703 goto retry;
9704 }
9705
9706 kfree_type(typeof(*__rename_data), __rename_data);
9707 return error;
9708 }
9709
9710 int
rename(__unused proc_t p,struct rename_args * uap,__unused int32_t * retval)9711 rename(__unused proc_t p, struct rename_args *uap, __unused int32_t *retval)
9712 {
9713 return renameat_internal(vfs_context_current(), AT_FDCWD, uap->from,
9714 AT_FDCWD, uap->to, UIO_USERSPACE, 0);
9715 }
9716
9717 int
renameatx_np(__unused proc_t p,struct renameatx_np_args * uap,__unused int32_t * retval)9718 renameatx_np(__unused proc_t p, struct renameatx_np_args *uap, __unused int32_t *retval)
9719 {
9720 if (uap->flags & ~(RENAME_SECLUDE | RENAME_EXCL | RENAME_SWAP | RENAME_NOFOLLOW_ANY)) {
9721 return EINVAL;
9722 }
9723
9724 if ((uap->flags & (RENAME_EXCL | RENAME_SWAP)) == (RENAME_EXCL | RENAME_SWAP)) {
9725 return EINVAL;
9726 }
9727
9728 return renameat_internal(vfs_context_current(), uap->fromfd, uap->from,
9729 uap->tofd, uap->to, UIO_USERSPACE, uap->flags);
9730 }
9731
9732 int
renameat(__unused proc_t p,struct renameat_args * uap,__unused int32_t * retval)9733 renameat(__unused proc_t p, struct renameat_args *uap, __unused int32_t *retval)
9734 {
9735 return renameat_internal(vfs_context_current(), uap->fromfd, uap->from,
9736 uap->tofd, uap->to, UIO_USERSPACE, 0);
9737 }
9738
9739 /*
9740 * Make a directory file.
9741 *
9742 * Returns: 0 Success
9743 * EEXIST
9744 * namei:???
9745 * vnode_authorize:???
9746 * vn_create:???
9747 */
9748 /* ARGSUSED */
9749 static int
mkdir1at(vfs_context_t ctx,user_addr_t path,struct vnode_attr * vap,int fd,enum uio_seg segflg)9750 mkdir1at(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap, int fd,
9751 enum uio_seg segflg)
9752 {
9753 vnode_t vp, dvp;
9754 int error;
9755 int update_flags = 0;
9756 int batched;
9757 struct nameidata nd;
9758
9759 AUDIT_ARG(mode, vap->va_mode);
9760 NDINIT(&nd, CREATE, OP_MKDIR, LOCKPARENT | AUDITVNPATH1, segflg,
9761 path, ctx);
9762 nd.ni_cnd.cn_flags |= WILLBEDIR;
9763 nd.ni_flag = NAMEI_COMPOUNDMKDIR;
9764
9765 continue_lookup:
9766 error = nameiat(&nd, fd);
9767 if (error) {
9768 return error;
9769 }
9770 dvp = nd.ni_dvp;
9771 vp = nd.ni_vp;
9772
9773 if (vp != NULL) {
9774 error = EEXIST;
9775 goto out;
9776 }
9777
9778 batched = vnode_compound_mkdir_available(dvp);
9779
9780 VATTR_SET(vap, va_type, VDIR);
9781
9782 /*
9783 * XXX
9784 * Don't authorize in VFS for compound VNOP.... mkdir -p today assumes that it will
9785 * only get EXISTS or EISDIR for existing path components, and not that it could see
9786 * EACCESS/EPERM--so if we authorize for mkdir on "/" for "mkdir -p /tmp/foo/bar/baz"
9787 * it will fail in a spurious manner. Need to figure out if this is valid behavior.
9788 */
9789 if ((error = vn_authorize_mkdir(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0) {
9790 if (error == EACCES || error == EPERM) {
9791 int error2;
9792
9793 nameidone(&nd);
9794 vnode_put(dvp);
9795 dvp = NULLVP;
9796
9797 /*
9798 * Try a lookup without "NAMEI_COMPOUNDVNOP" to make sure we return EEXIST
9799 * rather than EACCESS if the target exists.
9800 */
9801 NDINIT(&nd, LOOKUP, OP_MKDIR, AUDITVNPATH1, segflg,
9802 path, ctx);
9803 error2 = nameiat(&nd, fd);
9804 if (error2) {
9805 goto out;
9806 } else {
9807 vp = nd.ni_vp;
9808 error = EEXIST;
9809 goto out;
9810 }
9811 }
9812
9813 goto out;
9814 }
9815
9816 #if CONFIG_FILE_LEASES
9817 vnode_breakdirlease(dvp, false, O_WRONLY);
9818 #endif
9819
9820 /*
9821 * make the directory
9822 */
9823 if ((error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx)) != 0) {
9824 if (error == EKEEPLOOKING) {
9825 nd.ni_vp = vp;
9826 goto continue_lookup;
9827 }
9828
9829 goto out;
9830 }
9831
9832 // Make sure the name & parent pointers are hooked up
9833 if (vp->v_name == NULL) {
9834 update_flags |= VNODE_UPDATE_NAME;
9835 }
9836 if (vp->v_parent == NULLVP) {
9837 update_flags |= VNODE_UPDATE_PARENT;
9838 }
9839
9840 if (update_flags) {
9841 vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
9842 }
9843
9844 #if CONFIG_FSE
9845 add_fsevent(FSE_CREATE_DIR, ctx, FSE_ARG_VNODE, vp, FSE_ARG_DONE);
9846 #endif
9847
9848 out:
9849 /*
9850 * nameidone has to happen before we vnode_put(dvp)
9851 * since it may need to release the fs_nodelock on the dvp
9852 */
9853 nameidone(&nd);
9854
9855 if (vp) {
9856 vnode_put(vp);
9857 }
9858 if (dvp) {
9859 vnode_put(dvp);
9860 }
9861
9862 return error;
9863 }
9864
9865 /*
9866 * mkdir_extended: Create a directory; with extended security (ACL).
9867 *
9868 * Parameters: p Process requesting to create the directory
9869 * uap User argument descriptor (see below)
9870 * retval (ignored)
9871 *
9872 * Indirect: uap->path Path of directory to create
9873 * uap->mode Access permissions to set
9874 * uap->xsecurity ACL to set
9875 *
9876 * Returns: 0 Success
9877 * !0 Not success
9878 *
9879 */
9880 int
mkdir_extended(proc_t p,struct mkdir_extended_args * uap,__unused int32_t * retval)9881 mkdir_extended(proc_t p, struct mkdir_extended_args *uap, __unused int32_t *retval)
9882 {
9883 int ciferror;
9884 kauth_filesec_t xsecdst;
9885 struct vnode_attr va;
9886
9887 AUDIT_ARG(owner, uap->uid, uap->gid);
9888
9889 xsecdst = NULL;
9890 if ((uap->xsecurity != USER_ADDR_NULL) &&
9891 ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)) {
9892 return ciferror;
9893 }
9894
9895 VATTR_INIT(&va);
9896 VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd.fd_cmask);
9897 if (xsecdst != NULL) {
9898 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
9899 va.va_vaflags |= VA_FILESEC_ACL;
9900 }
9901
9902 ciferror = mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
9903 UIO_USERSPACE);
9904 if (xsecdst != NULL) {
9905 kauth_filesec_free(xsecdst);
9906 }
9907 return ciferror;
9908 }
9909
9910 int
mkdir(proc_t p,struct mkdir_args * uap,__unused int32_t * retval)9911 mkdir(proc_t p, struct mkdir_args *uap, __unused int32_t *retval)
9912 {
9913 struct vnode_attr va;
9914
9915 VATTR_INIT(&va);
9916 VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd.fd_cmask);
9917
9918 return mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
9919 UIO_USERSPACE);
9920 }
9921
9922 int
mkdirat(proc_t p,struct mkdirat_args * uap,__unused int32_t * retval)9923 mkdirat(proc_t p, struct mkdirat_args *uap, __unused int32_t *retval)
9924 {
9925 struct vnode_attr va;
9926
9927 VATTR_INIT(&va);
9928 VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd.fd_cmask);
9929
9930 return mkdir1at(vfs_context_current(), uap->path, &va, uap->fd,
9931 UIO_USERSPACE);
9932 }
9933
/*
 * Common backend for rmdir(2) and the AT_REMOVEDIR family.
 *
 * Looks up `dirpath` relative to `fd` (AT_FDCWD for cwd), authorizes and
 * performs the directory removal, and emits fsevents / kauth fileop
 * notifications.  The outer do/while loop redrives the whole operation
 * when a bounded-retry race is detected (racing ENOENT from a compound
 * VNOP authorization, or a concurrent orphaned-AppleDouble cleanup).
 *
 * Returns 0 on success or an errno value.
 */
static int
rmdirat_internal(vfs_context_t ctx, int fd, user_addr_t dirpath,
    enum uio_seg segflg, int unlink_flags)
{
	/* Heap-allocate the large nameidata (+ notify attrs) to keep the kernel stack small. */
	struct {
		struct nameidata nd;
#if CONFIG_FSE
		struct vnode_attr va;
#endif /* CONFIG_FSE */
	} *__rmdir_data;
	vnode_t vp, dvp;
	int error;
	struct nameidata *ndp;
	char *path = NULL;                  /* firmlink-resolved path for fileop listeners */
	char *no_firmlink_path = NULL;      /* physical path for fsevents */
	int len_path = 0;
	int len_no_firmlink_path = 0;
	int has_listeners = 0;
	int need_event = 0;
	int truncated_path = 0;
	int truncated_no_firmlink_path = 0;
	struct vnode_attr *vap = NULL;
	int restart_count = 0;              /* bounds ENOENT redrive attempts */
	int batched;                        /* nonzero: fs handles rmdir as a compound VNOP */

	int restart_flag;
	int nofollow_any = 0;

	__rmdir_data = kalloc_type(typeof(*__rmdir_data), Z_WAITOK);
	ndp = &__rmdir_data->nd;

	/* Consume the NOFOLLOW_ANY request before passing flags on to the fs. */
	if (unlink_flags & VNODE_REMOVE_NOFOLLOW_ANY) {
		nofollow_any = NAMEI_NOFOLLOW_ANY;
		unlink_flags &= ~VNODE_REMOVE_NOFOLLOW_ANY;
	}

	/*
	 * This loop exists to restart rmdir in the unlikely case that two
	 * processes are simultaneously trying to remove the same directory
	 * containing orphaned appleDouble files.
	 */
	do {
		NDINIT(ndp, DELETE, OP_RMDIR, LOCKPARENT | AUDITVNPATH1,
		    segflg, dirpath, ctx);
		ndp->ni_flag = NAMEI_COMPOUNDRMDIR | nofollow_any;
continue_lookup:
		restart_flag = 0;
		vap = NULL;

		error = nameiat(ndp, fd);
		if (error) {
			goto err_out;
		}

		dvp = ndp->ni_dvp;
		vp = ndp->ni_vp;

		if (vp) {
			batched = vnode_compound_rmdir_available(vp);

			if (vp->v_flag & VROOT) {
				/*
				 * The root of a mounted filesystem cannot be deleted.
				 */
				error = EBUSY;
				goto out;
			}

#if DEVELOPMENT || DEBUG
			/*
			 * XXX VSWAP: Check for entitlements or special flag here
			 * so we can restrict access appropriately.
			 */
#else /* DEVELOPMENT || DEBUG */

			/* Swap-backing files may only be removed by the kernel itself. */
			if (vnode_isswap(vp) && (ctx != vfs_context_kernel())) {
				error = EPERM;
				goto out;
			}
#endif /* DEVELOPMENT || DEBUG */

			/*
			 * Removed a check here; we used to abort if vp's vid
			 * was not the same as what we'd seen the last time around.
			 * I do not think that check was valid, because if we retry
			 * and all dirents are gone, the directory could legitimately
			 * be recycled but still be present in a situation where we would
			 * have had permission to delete.  Therefore, we won't make
			 * an effort to preserve that check now that we may not have a
			 * vp here.
			 */

			if (!batched) {
				error = vn_authorize_rmdir(dvp, vp, &ndp->ni_cnd, ctx, NULL);
				if (error) {
					if (error == ENOENT) {
						/* Racing lookup; redrive a bounded number of times. */
						if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
							restart_flag = 1;
							restart_count += 1;
						}
					}
					goto out;
				}
			}
		} else {
			/* No vp: the fs deferred lookup to the compound rmdir VNOP. */
			batched = 1;

			if (!vnode_compound_rmdir_available(dvp)) {
				panic("No error, but no compound rmdir?");
			}
		}

#if CONFIG_FSE
		fse_info finfo = {0};

		need_event = need_fsevent(FSE_DELETE, dvp);
		if (need_event) {
			if (!batched) {
				get_fse_info(vp, &finfo, ctx);
			} else {
				/* Compound case: the fs fills these attrs during the VNOP. */
				error = vfs_get_notify_attributes(&__rmdir_data->va);
				if (error) {
					goto out;
				}

				vap = &__rmdir_data->va;
			}
		}
#endif
		has_listeners = kauth_authorize_fileop_has_listeners();
		if (need_event || has_listeners) {
			if (path == NULL) {
				GET_PATH(path);
			}

			len_path = safe_getpath(dvp, ndp->ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path);

			if (no_firmlink_path == NULL) {
				GET_PATH(no_firmlink_path);
			}

			len_no_firmlink_path = safe_getpath_no_firmlink(dvp, ndp->ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);
#if CONFIG_FSE
			if (truncated_no_firmlink_path) {
				finfo.mode |= FSE_TRUNCATED_PATH;
			}
#endif
		}

#if CONFIG_FILE_LEASES
		/* Removing an entry modifies the parent: break any directory lease. */
		vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

		error = vn_rmdir(dvp, &vp, ndp, vap, ctx);
		ndp->ni_vp = vp;
		if (vp == NULLVP) {
			/* Couldn't find a vnode */
			goto out;
		}

		if (error == EKEEPLOOKING) {
			/* The fs wants the compound lookup redriven. */
			goto continue_lookup;
		} else if (batched && error == ENOENT) {
			if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
				/*
				 * For compound VNOPs, the authorization callback
				 * may return ENOENT in case of racing hard link lookups
				 * redrive the lookup.
				 */
				restart_flag = 1;
				restart_count += 1;
				goto out;
			}
		}

		/*
		 * XXX There's no provision for passing flags
		 * to VNOP_RMDIR().  So, if vn_rmdir() fails
		 * because it's not empty, then we try again
		 * with VNOP_REMOVE(), passing in a special
		 * flag that clever file systems will know
		 * how to handle.
		 */
		if (error == ENOTEMPTY &&
		    (unlink_flags & VNODE_REMOVE_DATALESS_DIR) != 0) {
			/*
			 * Only do this if the directory is actually
			 * marked as DATALESS.
			 */
			struct vnode_attr *lvap =
			    kalloc_type(struct vnode_attr, Z_WAITOK);

			VATTR_INIT(lvap);
			VATTR_WANTED(lvap, va_flags);
			if (vnode_getattr(vp, lvap, ctx) == 0 &&
			    VATTR_IS_SUPPORTED(lvap, va_flags) &&
			    (lvap->va_flags & SF_DATALESS) != 0) {
				/*
				 * If this fails, we want to keep the original
				 * error.
				 */
				if (vn_remove(dvp, &vp, ndp,
				    VNODE_REMOVE_DATALESS_DIR, vap, ctx) == 0) {
					error = 0;
				}
			}
			kfree_type(struct vnode_attr, lvap);
		}

#if CONFIG_APPLEDOUBLE
		/*
		 * Special case to remove orphaned AppleDouble
		 * files.  I don't like putting this in the kernel,
		 * but carbon does not like putting this in carbon either,
		 * so here we are.
		 */
		if (error == ENOTEMPTY) {
			int ad_error = rmdir_remove_orphaned_appleDouble(vp, ctx, &restart_flag);
			if (ad_error == EBUSY) {
				error = ad_error;
				goto out;
			}


			/*
			 * Assuming everything went well, we will try the RMDIR again
			 */
			if (!ad_error) {
				error = vn_rmdir(dvp, &vp, ndp, vap, ctx);
			}
		}
#endif /* CONFIG_APPLEDOUBLE */
		/*
		 * Call out to allow 3rd party notification of delete.
		 * Ignore result of kauth_authorize_fileop call.
		 */
		if (!error) {
			if (has_listeners) {
				kauth_authorize_fileop(vfs_context_ucred(ctx),
				    KAUTH_FILEOP_DELETE,
				    (uintptr_t)vp,
				    (uintptr_t)path);
			}

			if (vp->v_flag & VISHARDLINK) {
				// see the comment in unlink1() about why we update
				// the parent of a hard link when it is removed
				vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
			}

#if CONFIG_FSE
			if (need_event) {
				if (vap) {
					vnode_get_fse_info_from_vap(vp, &finfo, vap);
				}
				add_fsevent(FSE_DELETE, ctx,
				    FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
				    FSE_ARG_FINFO, &finfo,
				    FSE_ARG_DONE);
			}
#endif

#if CONFIG_MACF
			mac_vnode_notify_unlink(ctx, dvp, vp, &ndp->ni_cnd);
#endif
		}

out:
		if (path != NULL) {
			RELEASE_PATH(path);
			path = NULL;
		}

		if (no_firmlink_path != NULL) {
			RELEASE_PATH(no_firmlink_path);
			no_firmlink_path = NULL;
		}

		/*
		 * nameidone has to happen before we vnode_put(dvp)
		 * since it may need to release the fs_nodelock on the dvp
		 */
		nameidone(ndp);
		vnode_put(dvp);

		if (vp) {
			vnode_put(vp);
		}

		if (restart_flag == 0) {
			/* No redrive needed: wake any peer waiting on this vnode address. */
			wakeup_one((caddr_t)vp);
			goto err_out;
		}
		/* Brief backoff before redriving the removal. */
		tsleep(vp, PVFS, "rm AD", 1);
	} while (restart_flag != 0);

err_out:
	kfree_type(typeof(*__rmdir_data), __rmdir_data);

	return error;
}
10235
10236 /*
10237 * Remove a directory file.
10238 */
10239 /* ARGSUSED */
10240 int
rmdir(__unused proc_t p,struct rmdir_args * uap,__unused int32_t * retval)10241 rmdir(__unused proc_t p, struct rmdir_args *uap, __unused int32_t *retval)
10242 {
10243 return rmdirat_internal(vfs_context_current(), AT_FDCWD,
10244 CAST_USER_ADDR_T(uap->path), UIO_USERSPACE, 0);
10245 }
10246
/*
 * Get direntry length padded to 8 byte alignment.
 * (struct direntry embeds a MAXPATHLEN-sized name; subtract the unused
 * portion and keep only namlen bytes of name, rounded up to 8.)
 */
#define DIRENT64_LEN(namlen) \
	((sizeof(struct direntry) + (namlen) - (MAXPATHLEN-1) + 7) & ~7)

/*
 * Get dirent length padded to 4 byte alignment.
 * (struct dirent embeds a __DARWIN_MAXNAMLEN+1 name buffer; keep only
 * namelen+1 bytes of it — name plus NUL — rounded up to 4.)
 */
#define DIRENT_LEN(namelen) \
	((sizeof(struct dirent) + (namelen + 1) - (__DARWIN_MAXNAMLEN + 1) + 3) & ~3)

/* Get the end of this dirent (address of its last byte, per d_reclen). */
#define DIRENT_END(dep) \
	(((char *)(dep)) + (dep)->d_reclen - 1)
10258
10259 errno_t
vnode_readdir64(struct vnode * vp,struct uio * uio,int flags,int * eofflag,int * numdirent,vfs_context_t ctxp)10260 vnode_readdir64(struct vnode *vp, struct uio *uio, int flags, int *eofflag,
10261 int *numdirent, vfs_context_t ctxp)
10262 {
10263 /* Check if fs natively supports VNODE_READDIR_EXTENDED */
10264 if ((vp->v_mount->mnt_vtable->vfc_vfsflags & VFC_VFSREADDIR_EXTENDED) &&
10265 ((vp->v_mount->mnt_kern_flag & MNTK_DENY_READDIREXT) == 0)) {
10266 return VNOP_READDIR(vp, uio, flags, eofflag, numdirent, ctxp);
10267 } else {
10268 size_t bufsize;
10269 void * bufptr;
10270 uio_t auio;
10271 struct direntry *entry64;
10272 struct dirent *dep;
10273 size_t bytesread;
10274 int error;
10275
10276 /*
10277 * We're here because the underlying file system does not
10278 * support direnties or we mounted denying support so we must
10279 * fall back to dirents and convert them to direntries.
10280 *
10281 * Our kernel buffer needs to be smaller since re-packing will
10282 * expand each dirent. The worse case (when the name length
10283 * is 3 or less) corresponds to a struct direntry size of 32
10284 * bytes (8-byte aligned) and a struct dirent size of 12 bytes
10285 * (4-byte aligned). So having a buffer that is 3/8 the size
10286 * will prevent us from reading more than we can pack.
10287 *
10288 * Since this buffer is wired memory, we will limit the
10289 * buffer size to a maximum of 32K. We would really like to
10290 * use 32K in the MIN(), but we use magic number 87371 to
10291 * prevent uio_resid() * 3 / 8 from overflowing.
10292 */
10293 bufsize = 3 * MIN((user_size_t)uio_resid(uio), 87371u) / 8;
10294 bufptr = kalloc_data(bufsize, Z_WAITOK);
10295 if (bufptr == NULL) {
10296 return ENOMEM;
10297 }
10298
10299 auio = uio_create(1, 0, UIO_SYSSPACE, UIO_READ);
10300 uio_addiov(auio, (uintptr_t)bufptr, bufsize);
10301 auio->uio_offset = uio->uio_offset;
10302
10303 error = VNOP_READDIR(vp, auio, 0, eofflag, numdirent, ctxp);
10304
10305 dep = (struct dirent *)bufptr;
10306 bytesread = bufsize - uio_resid(auio);
10307
10308 entry64 = kalloc_type(struct direntry, Z_WAITOK);
10309 /*
10310 * Convert all the entries and copy them out to user's buffer.
10311 */
10312 while (error == 0 && (char *)dep < ((char *)bufptr + bytesread)) {
10313 /* First check that the dirent struct up to d_name is within the buffer */
10314 if ((char*)dep + offsetof(struct dirent, d_name) > ((char *)bufptr + bytesread) ||
10315 /* Check that the length of the entire dirent is within the buffer */
10316 DIRENT_END(dep) > ((char *)bufptr + bytesread) ||
10317 /* Check that the actual length including the name doesn't exceed d_reclen */
10318 DIRENT_LEN(dep->d_namlen) > dep->d_reclen) {
10319 printf("%s: %s: Bad dirent recived from directory %s\n", __func__,
10320 vp->v_mount->mnt_vfsstat.f_mntonname,
10321 vp->v_name ? vp->v_name : "<unknown>");
10322 error = EIO;
10323 break;
10324 }
10325
10326 size_t enbufsize = DIRENT64_LEN(dep->d_namlen);
10327
10328 bzero(entry64, enbufsize);
10329 /* Convert a dirent to a dirent64. */
10330 entry64->d_ino = dep->d_ino;
10331 entry64->d_seekoff = 0;
10332 entry64->d_reclen = (uint16_t)enbufsize;
10333 entry64->d_namlen = dep->d_namlen;
10334 entry64->d_type = dep->d_type;
10335 bcopy(dep->d_name, entry64->d_name, dep->d_namlen + 1);
10336
10337 /* Move to next entry. */
10338 dep = (struct dirent *)((char *)dep + dep->d_reclen);
10339
10340 /* Copy entry64 to user's buffer. */
10341 error = uiomove((caddr_t)entry64, entry64->d_reclen, uio);
10342 }
10343
10344 /* Update the real offset using the offset we got from VNOP_READDIR. */
10345 if (error == 0) {
10346 uio->uio_offset = auio->uio_offset;
10347 }
10348 uio_free(auio);
10349 kfree_data(bufptr, bufsize);
10350 kfree_type(struct direntry, entry64);
10351 return error;
10352 }
10353 }
10354
10355 #define GETDIRENTRIES_MAXBUFSIZE (128 * 1024 * 1024U)
10356
/*
 * Read a block of directory entries in a file system independent format.
 *
 * Shared backend for getdirentries(2) and getdirentries64(2).  Resolves
 * `fd` to a directory vnode, serializes concurrent offset updates via the
 * per-fileglob offset lock, reads up to `bufsize` bytes of entries into
 * the user buffer `bufp`, and reports the bytes read, the pre-read offset
 * and the EOF state.  `flags` selects the extended (direntry) format.
 */
static int
getdirentries_common(int fd, user_addr_t bufp, user_size_t bufsize, ssize_t *bytesread,
    off_t *offset, int *eofflag, int flags)
{
	vnode_t vp;
	struct vfs_context context = *vfs_context_current(); /* local copy */
	struct fileproc *fp;
	uio_t auio;
	int spacetype = proc_is64bit(vfs_context_proc(&context)) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	off_t loff;
	int error, numdirent;
	UIO_STACKBUF(uio_buf, 1);

get_from_fd:
	error = fp_getfvp(vfs_context_proc(&context), fd, &fp, &vp);
	if (error) {
		return error;
	}

	/*
	 * Take the offset lock, then re-check that the fd still maps to the
	 * vnode we looked up; a union-mount traversal (below, possibly in a
	 * racing thread) can swap the fd's backing vnode, so retry if it did.
	 */
	vn_offset_lock(fp->fp_glob);
	if (((vnode_t)fp_get_data(fp)) != vp) {
		vn_offset_unlock(fp->fp_glob);
		file_drop(fd);
		goto get_from_fd;
	}

	if ((fp->fp_glob->fg_flag & FREAD) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EBADF;
		goto out;
	}

	/* Silently clamp oversized requests rather than failing them. */
	if (bufsize > GETDIRENTRIES_MAXBUFSIZE) {
		bufsize = GETDIRENTRIES_MAXBUFSIZE;
	}

#if CONFIG_MACF
	error = mac_file_check_change_offset(vfs_context_ucred(&context), fp->fp_glob);
	if (error) {
		goto out;
	}
#endif

	if ((error = vnode_getwithref(vp))) {
		goto out;
	}
	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

#if CONFIG_UNION_MOUNTS
unionread:
#endif /* CONFIG_UNION_MOUNTS */
	if (vp->v_type != VDIR) {
		(void)vnode_put(vp);
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_readdir(&context, vp);
	if (error != 0) {
		(void)vnode_put(vp);
		goto out;
	}
#endif /* MAC */

	/* Remember the starting offset; it is what we report back via *offset. */
	loff = fp->fp_glob->fg_offset;
	auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, bufp, bufsize);

	if (flags & VNODE_READDIR_EXTENDED) {
		error = vnode_readdir64(vp, auio, flags, eofflag, &numdirent, &context);
		fp->fp_glob->fg_offset = uio_offset(auio);
	} else {
		error = VNOP_READDIR(vp, auio, 0, eofflag, &numdirent, &context);
		fp->fp_glob->fg_offset = uio_offset(auio);
	}
	if (error) {
		(void)vnode_put(vp);
		goto out;
	}

#if CONFIG_UNION_MOUNTS
	/*
	 * Nothing was read and this is a union mount: descend to the lower
	 * layer directory, swap it into the fd, and read from there.
	 */
	if ((user_ssize_t)bufsize == uio_resid(auio) &&
	    (vp->v_mount->mnt_flag & MNT_UNION)) {
		vnode_t uvp;

		if (lookup_traverse_union(vp, &uvp, &context) == 0) {
			if (vnode_ref(uvp) == 0) {
				fp_set_data(fp, uvp);
				fp->fp_glob->fg_offset = 0;
				vnode_rele(vp);
				vnode_put(vp);
				vp = uvp;
				goto unionread;
			} else {
				/* could not get a ref, can't replace in fd */
				vnode_put(uvp);
			}
		}
	}
#endif /* CONFIG_UNION_MOUNTS */

	vnode_put(vp);
	if (offset) {
		*offset = loff;
	}

	*bytesread = bufsize - uio_resid(auio);
out:
	vn_offset_unlock(fp->fp_glob);
	file_drop(fd);
	return error;
}
10473
10474
10475 int
getdirentries(__unused struct proc * p,struct getdirentries_args * uap,int32_t * retval)10476 getdirentries(__unused struct proc *p, struct getdirentries_args *uap, int32_t *retval)
10477 {
10478 off_t offset;
10479 ssize_t bytesread;
10480 int error, eofflag;
10481
10482 AUDIT_ARG(fd, uap->fd);
10483 error = getdirentries_common(uap->fd, uap->buf, uap->count,
10484 &bytesread, &offset, &eofflag, 0);
10485
10486 if (error == 0) {
10487 if (proc_is64bit(p)) {
10488 user64_long_t base = (user64_long_t)offset;
10489 error = copyout((caddr_t)&base, uap->basep, sizeof(user64_long_t));
10490 } else {
10491 user32_long_t base = (user32_long_t)offset;
10492 error = copyout((caddr_t)&base, uap->basep, sizeof(user32_long_t));
10493 }
10494 *retval = (int)bytesread;
10495 }
10496 return error;
10497 }
10498
10499 int
getdirentries64(__unused struct proc * p,struct getdirentries64_args * uap,user_ssize_t * retval)10500 getdirentries64(__unused struct proc *p, struct getdirentries64_args *uap, user_ssize_t *retval)
10501 {
10502 off_t offset;
10503 ssize_t bytesread;
10504 int error, eofflag;
10505 user_size_t bufsize;
10506
10507 AUDIT_ARG(fd, uap->fd);
10508
10509 /*
10510 * If the buffer is at least GETDIRENTRIES64_EXTENDED_BUFSIZE large,
10511 * then the kernel carves out the last 4 bytes to return extended
10512 * information to userspace (namely whether we reached EOF with this call).
10513 */
10514 if (uap->bufsize >= GETDIRENTRIES64_EXTENDED_BUFSIZE) {
10515 bufsize = uap->bufsize - sizeof(getdirentries64_flags_t);
10516 } else {
10517 bufsize = uap->bufsize;
10518 }
10519
10520 error = getdirentries_common(uap->fd, uap->buf, bufsize,
10521 &bytesread, &offset, &eofflag, VNODE_READDIR_EXTENDED);
10522
10523 if (error == 0) {
10524 *retval = bytesread;
10525 error = copyout((caddr_t)&offset, uap->position, sizeof(off_t));
10526
10527 if (error == 0 && uap->bufsize >= GETDIRENTRIES64_EXTENDED_BUFSIZE) {
10528 getdirentries64_flags_t flags = 0;
10529 if (eofflag) {
10530 flags |= GETDIRENTRIES64_EOF;
10531 }
10532 error = copyout(&flags, (user_addr_t)uap->buf + bufsize,
10533 sizeof(flags));
10534 }
10535 }
10536 return error;
10537 }
10538
10539
10540 /*
10541 * Set the mode mask for creation of filesystem nodes.
10542 * XXX implement xsecurity
10543 */
10544 #define UMASK_NOXSECURITY (void *)1 /* leave existing xsecurity alone */
10545 static int
umask1(proc_t p,int newmask,__unused kauth_filesec_t fsec,int32_t * retval)10546 umask1(proc_t p, int newmask, __unused kauth_filesec_t fsec, int32_t *retval)
10547 {
10548 AUDIT_ARG(mask, newmask);
10549 proc_fdlock(p);
10550 *retval = p->p_fd.fd_cmask;
10551 p->p_fd.fd_cmask = newmask & ALLPERMS;
10552 proc_fdunlock(p);
10553 return 0;
10554 }
10555
/*
 * umask_extended: Set the mode mask for creation of filesystem nodes; with extended security (ACL).
 *
 * Parameters:	p	Process requesting to set the umask
 *		uap	User argument descriptor (see below)
 *		retval	umask of the process (parameter p)
 *
 * Indirect:	uap->newmask	umask to set
 *		uap->xsecurity	ACL to set
 *
 * Returns:	0	Success
 *		!0	Not success
 *
 * Note: the xsecurity argument is currently ignored (KAUTH_FILESEC_NONE
 * is passed down); see the XXX on umask1().
 */
int
umask_extended(proc_t p, struct umask_extended_args *uap, int32_t *retval)
{
	return umask1(p, uap->newmask, KAUTH_FILESEC_NONE, retval);
}
10575
10576 int
umask(proc_t p,struct umask_args * uap,int32_t * retval)10577 umask(proc_t p, struct umask_args *uap, int32_t *retval)
10578 {
10579 return umask1(p, uap->newmask, UMASK_NOXSECURITY, retval);
10580 }
10581
10582 #define REVOKE_MOUNTED_DEVICE_ENTITLEMENT \
10583 "com.apple.private.vfs.revoke-mounted-device"
10584
10585 /*
10586 * Void all references to file by ripping underlying filesystem
10587 * away from vnode.
10588 */
10589 /* ARGSUSED */
10590 int
revoke(proc_t p,struct revoke_args * uap,__unused int32_t * retval)10591 revoke(proc_t p, struct revoke_args *uap, __unused int32_t *retval)
10592 {
10593 vnode_t vp;
10594 struct vnode_attr va;
10595 vfs_context_t ctx = vfs_context_current();
10596 int error;
10597 struct nameidata nd;
10598
10599 NDINIT(&nd, LOOKUP, OP_REVOKE, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
10600 uap->path, ctx);
10601 error = namei(&nd);
10602 if (error) {
10603 return error;
10604 }
10605 vp = nd.ni_vp;
10606
10607 nameidone(&nd);
10608
10609 if (!(vnode_ischr(vp) || vnode_isblk(vp))) {
10610 error = ENOTSUP;
10611 goto out;
10612 }
10613
10614 if (vnode_isblk(vp) && vnode_ismountedon(vp)) {
10615 error = EBUSY;
10616 goto out;
10617 }
10618
10619 #if CONFIG_MACF
10620 error = mac_vnode_check_revoke(ctx, vp);
10621 if (error) {
10622 goto out;
10623 }
10624 #endif
10625
10626 VATTR_INIT(&va);
10627 VATTR_WANTED(&va, va_uid);
10628 if ((error = vnode_getattr(vp, &va, ctx))) {
10629 goto out;
10630 }
10631 if (kauth_cred_getuid(vfs_context_ucred(ctx)) != va.va_uid &&
10632 (error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
10633 goto out;
10634 }
10635 if (vp->v_usecount > 0 || (vnode_isaliased(vp))) {
10636 VNOP_REVOKE(vp, REVOKEALL, ctx);
10637 }
10638 out:
10639 vnode_put(vp);
10640 return error;
10641 }
10642
10643
10644 /*
 * HFS/HFS Plus SPECIFIC SYSTEM CALLS
10646 * The following system calls are designed to support features
10647 * which are specific to the HFS & HFS Plus volume formats
10648 */
10649
10650
/*
 * Obtain attribute information on objects in a directory while enumerating
 * the directory.
 *
 * Reads entries from the directory open on uap->fd via VNOP_READDIRATTR,
 * returning packed attribute records in uap->buffer and copying back the
 * entry count, directory state token and base offset.  *retval is the
 * EOF indicator (0 or 1).
 */
/* ARGSUSED */
int
getdirentriesattr(proc_t p, struct getdirentriesattr_args *uap, int32_t *retval)
{
	vnode_t vp;
	struct fileproc *fp;
	uio_t auio = NULL;
	int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	uint32_t count = 0, savecount = 0;
	uint32_t newstate = 0;
	int error, eofflag = 0;
	off_t loff = 0;
	struct attrlist attributelist;
	vfs_context_t ctx = vfs_context_current();
	int fd = uap->fd;
	UIO_STACKBUF(uio_buf, 1);
	kauth_action_t action;

	AUDIT_ARG(fd, fd);

	/* Get the attributes into kernel space */
	if ((error = copyin(uap->alist, (caddr_t)&attributelist, sizeof(attributelist)))) {
		return error;
	}
	if ((error = copyin(uap->count, (caddr_t)&count, sizeof(count)))) {
		return error;
	}
	/* Remember the requested count so it can be restored on union traversal. */
	savecount = count;

get_from_fd:
	if ((error = fp_getfvp(p, fd, &fp, &vp))) {
		return error;
	}

	/*
	 * Take the offset lock, then re-check that the fd still maps to the
	 * vnode we looked up (a union-mount traversal can swap it); retry if
	 * it changed underneath us.
	 */
	vn_offset_lock(fp->fp_glob);
	if (((vnode_t)fp_get_data(fp)) != vp) {
		vn_offset_unlock(fp->fp_glob);
		file_drop(fd);
		goto get_from_fd;
	}

	if ((fp->fp_glob->fg_flag & FREAD) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EBADF;
		goto out;
	}


#if CONFIG_MACF
	error = mac_file_check_change_offset(vfs_context_ucred(ctx),
	    fp->fp_glob);
	if (error) {
		goto out;
	}
#endif


	if ((error = vnode_getwithref(vp))) {
		goto out;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

#if CONFIG_UNION_MOUNTS
unionread:
#endif /* CONFIG_UNION_MOUNTS */
	if (vp->v_type != VDIR) {
		(void)vnode_put(vp);
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_readdir(ctx, vp);
	if (error != 0) {
		(void)vnode_put(vp);
		goto out;
	}
#endif /* MAC */

	/* set up the uio structure which will contain the users return buffer */
	loff = fp->fp_glob->fg_offset;
	auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, uap->buffer, uap->buffersize);

	/*
	 * If the only item requested is file names, we can let that past with
	 * just LIST_DIRECTORY.  If they want any other attributes, that means
	 * they need SEARCH as well.
	 */
	action = KAUTH_VNODE_LIST_DIRECTORY;
	if ((attributelist.commonattr & ~ATTR_CMN_NAME) ||
	    attributelist.fileattr || attributelist.dirattr) {
		action |= KAUTH_VNODE_SEARCH;
	}

	if ((error = vnode_authorize(vp, NULL, action, ctx)) == 0) {
		/* Believe it or not, uap->options only has 32-bits of valid
		 * info, so truncate before extending again */

		error = VNOP_READDIRATTR(vp, &attributelist, auio, count,
		    (uint32_t)uap->options, &newstate, &eofflag, &count, ctx);
	}

	if (error) {
		(void) vnode_put(vp);
		goto out;
	}

#if CONFIG_UNION_MOUNTS
	/*
	 * If we've got the last entry of a directory in a union mount
	 * then reset the eofflag and pretend there's still more to come.
	 * The next call will again set eofflag and the buffer will be empty,
	 * so traverse to the underlying directory and do the directory
	 * read there.
	 */
	if (eofflag && vp->v_mount->mnt_flag & MNT_UNION) {
		if (uio_resid(auio) < (user_ssize_t) uap->buffersize) { // Got some entries
			eofflag = 0;
		} else { // Empty buffer
			vnode_t uvp;
			if (lookup_traverse_union(vp, &uvp, ctx) == 0) {
				if (vnode_ref_ext(uvp, fp->fp_glob->fg_flag & O_EVTONLY, 0) == 0) {
					fp_set_data(fp, uvp);
					fp->fp_glob->fg_offset = 0; // reset index for new dir
					count = savecount;
					vnode_rele_internal(vp, fp->fp_glob->fg_flag & O_EVTONLY, 0, 0);
					vnode_put(vp);
					vp = uvp;
					goto unionread;
				} else {
					/* could not get a ref, can't replace in fd */
					vnode_put(uvp);
				}
			}
		}
	}
#endif /* CONFIG_UNION_MOUNTS */

	(void)vnode_put(vp);

	if (error) {
		goto out;
	}
	fp->fp_glob->fg_offset = uio_offset(auio); /* should be multiple of dirent, not variable */

	/* Copy the results (count, state token, base offset) back out. */
	if ((error = copyout((caddr_t) &count, uap->count, sizeof(count)))) {
		goto out;
	}
	if ((error = copyout((caddr_t) &newstate, uap->newstate, sizeof(newstate)))) {
		goto out;
	}
	if ((error = copyout((caddr_t) &loff, uap->basep, sizeof(loff)))) {
		goto out;
	}

	*retval = eofflag;  /* similar to getdirentries */
	error = 0;
out:
	vn_offset_unlock(fp->fp_glob);
	file_drop(fd);
	return error; /* return error earlier, a retval of 0 or 1 now */
} /* end of getdirentriesattr system call */
10819
/*
 * Exchange data between two files.
 *
 * Atomically swaps the data forks of path1 and path2 via VNOP_EXCHANGE,
 * then swaps the cached v_name/v_parent identities so the name cache
 * matches the new on-disk state.  Both paths must resolve to regular
 * files on the same volume.  Emits FSE_EXCHANGE fsevents and the
 * KAUTH_FILEOP_EXCHANGE fileop notification when listeners exist.
 */

/* ARGSUSED */
int
exchangedata(__unused proc_t p, struct exchangedata_args *uap, __unused int32_t *retval)
{
	struct nameidata fnd, snd;
	vfs_context_t ctx = vfs_context_current();
	vnode_t fvp;
	vnode_t svp;
	int error;
	u_int32_t nameiflags;
	char *fpath = NULL;
	char *spath = NULL;
	int flen = 0, slen = 0;
	int from_truncated = 0, to_truncated = 0;
#if CONFIG_FSE
	fse_info f_finfo, s_finfo;
#endif

	nameiflags = 0;
	/* FSOPT_NOFOLLOW suppresses symlink traversal on both paths. */
	if ((uap->options & FSOPT_NOFOLLOW) == 0) {
		nameiflags |= FOLLOW;
	}

	NDINIT(&fnd, LOOKUP, OP_EXCHANGEDATA, nameiflags | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path1, ctx);

	error = namei(&fnd);
	if (error) {
		goto out2;
	}

	nameidone(&fnd);
	fvp = fnd.ni_vp;

	NDINIT(&snd, LOOKUP, OP_EXCHANGEDATA, CN_NBMOUNTLOOK | nameiflags | AUDITVNPATH2,
	    UIO_USERSPACE, uap->path2, ctx);

	error = namei(&snd);
	if (error) {
		vnode_put(fvp);
		goto out2;
	}
	nameidone(&snd);
	svp = snd.ni_vp;

	/*
	 * if the files are the same, return an inval error
	 */
	if (svp == fvp) {
		error = EINVAL;
		goto out;
	}

	/*
	 * if the files are on different volumes, return an error
	 */
	if (svp->v_mount != fvp->v_mount) {
		error = EXDEV;
		goto out;
	}

	/* If they're not files, return an error */
	if ((vnode_isreg(fvp) == 0) || (vnode_isreg(svp) == 0)) {
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_exchangedata(ctx,
	    fvp, svp);
	if (error) {
		goto out;
	}
#endif
	/* The caller needs read and write access to both files. */
	if (((error = vnode_authorize(fvp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0) ||
	    ((error = vnode_authorize(svp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0)) {
		goto out;
	}

	/* Gather paths and fse_info up front, while both vnodes are stable. */
	if (
#if CONFIG_FSE
		need_fsevent(FSE_EXCHANGE, fvp) ||
#endif
		kauth_authorize_fileop_has_listeners()) {
		GET_PATH(fpath);
		GET_PATH(spath);

		flen = safe_getpath(fvp, NULL, fpath, MAXPATHLEN, &from_truncated);
		slen = safe_getpath(svp, NULL, spath, MAXPATHLEN, &to_truncated);

#if CONFIG_FSE
		get_fse_info(fvp, &f_finfo, ctx);
		get_fse_info(svp, &s_finfo, ctx);
		if (from_truncated || to_truncated) {
			// set it here since only the f_finfo gets reported up to user space
			f_finfo.mode |= FSE_TRUNCATED_PATH;
		}
#endif
	}
	/* Ok, make the call */
	error = VNOP_EXCHANGE(fvp, svp, 0, ctx);

	if (error == 0) {
		const char *tmpname;

		if (fpath != NULL && spath != NULL) {
			/* call out to allow 3rd party notification of exchangedata.
			 * Ignore result of kauth_authorize_fileop call.
			 */
			kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_EXCHANGE,
			    (uintptr_t)fpath, (uintptr_t)spath);
		}

		/*
		 * The on-disk data swapped, so swap the cached identities
		 * (name and parent) under the name cache lock to match.
		 */
		name_cache_lock();

		tmpname = fvp->v_name;
		fvp->v_name = svp->v_name;
		svp->v_name = tmpname;

		if (fvp->v_parent != svp->v_parent) {
			vnode_t tmp;

			tmp = fvp->v_parent;
			fvp->v_parent = svp->v_parent;
			svp->v_parent = tmp;
		}
		name_cache_unlock();

#if CONFIG_FSE
		if (fpath != NULL && spath != NULL) {
			add_fsevent(FSE_EXCHANGE, ctx,
			    FSE_ARG_STRING, flen, fpath,
			    FSE_ARG_FINFO, &f_finfo,
			    FSE_ARG_STRING, slen, spath,
			    FSE_ARG_FINFO, &s_finfo,
			    FSE_ARG_DONE);
		}
#endif
	}

out:
	if (fpath != NULL) {
		RELEASE_PATH(fpath);
	}
	if (spath != NULL) {
		RELEASE_PATH(spath);
	}
	vnode_put(svp);
	vnode_put(fvp);
out2:
	return error;
}
10975
10976 /*
10977 * Return (in MB) the amount of freespace on the given vnode's volume.
10978 */
10979 uint32_t freespace_mb(vnode_t vp);
10980
10981 uint32_t
freespace_mb(vnode_t vp)10982 freespace_mb(vnode_t vp)
10983 {
10984 vfs_update_vfsstat(vp->v_mount, vfs_context_current(), VFS_USER_EVENT);
10985 return (uint32_t)(((uint64_t)vp->v_mount->mnt_vfsstat.f_bavail *
10986 vp->v_mount->mnt_vfsstat.f_bsize) >> 20);
10987 }
10988
10989 #if CONFIG_SEARCHFS
10990
10991 /* ARGSUSED */
10992
/*
 * searchfs() system call: ask a volume's file system to search for
 * objects matching the caller-supplied criteria via VNOP_SEARCHFS,
 * copying matches into the caller's return buffer and persisting the
 * search position in the caller-held searchstate between calls.
 * Returns 0, EAGAIN (more matches remain; call again), or an errno.
 */
int
searchfs(proc_t p, struct searchfs_args *uap, __unused int32_t *retval)
{
	vnode_t vp, tvp;
	int i, error = 0;
	int fserror = 0;                /* VNOP_SEARCHFS result, tracked separately (EAGAIN is not fatal) */
	struct nameidata nd;
	struct user64_fssearchblock searchblock;
	struct searchstate *state;
	struct attrlist *returnattrs;
	struct timeval timelimit;
	void *searchparams1, *searchparams2;
	uio_t auio = NULL;
	int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	uint32_t nummatches;
	size_t mallocsize;
	uint32_t nameiflags;
	vfs_context_t ctx = vfs_context_current();
	UIO_STACKBUF(uio_buf, 1);

	/* Start by copying in fsearchblock parameter list */
	if (IS_64BIT_PROCESS(p)) {
		error = copyin(uap->searchblock, (caddr_t) &searchblock, sizeof(searchblock));
		timelimit.tv_sec = searchblock.timelimit.tv_sec;
		timelimit.tv_usec = searchblock.timelimit.tv_usec;
	} else {
		struct user32_fssearchblock tmp_searchblock;

		error = copyin(uap->searchblock, (caddr_t) &tmp_searchblock, sizeof(tmp_searchblock));
		// munge into 64-bit version
		searchblock.returnattrs = CAST_USER_ADDR_T(tmp_searchblock.returnattrs);
		searchblock.returnbuffer = CAST_USER_ADDR_T(tmp_searchblock.returnbuffer);
		searchblock.returnbuffersize = tmp_searchblock.returnbuffersize;
		searchblock.maxmatches = tmp_searchblock.maxmatches;
		/*
		 * These casts are safe. We will promote the tv_sec into a 64 bit long if necessary
		 * from a 32 bit long, and tv_usec is already a signed 32 bit int.
		 */
		timelimit.tv_sec = (__darwin_time_t) tmp_searchblock.timelimit.tv_sec;
		timelimit.tv_usec = (__darwin_useconds_t) tmp_searchblock.timelimit.tv_usec;
		searchblock.searchparams1 = CAST_USER_ADDR_T(tmp_searchblock.searchparams1);
		searchblock.sizeofsearchparams1 = tmp_searchblock.sizeofsearchparams1;
		searchblock.searchparams2 = CAST_USER_ADDR_T(tmp_searchblock.searchparams2);
		searchblock.sizeofsearchparams2 = tmp_searchblock.sizeofsearchparams2;
		searchblock.searchattrs = tmp_searchblock.searchattrs;
	}
	if (error) {
		return error;
	}

	/* Do a sanity check on sizeofsearchparams1 and sizeofsearchparams2;
	 * this also bounds the mallocsize computation below.
	 */
	if (searchblock.sizeofsearchparams1 > SEARCHFS_MAX_SEARCHPARMS ||
	    searchblock.sizeofsearchparams2 > SEARCHFS_MAX_SEARCHPARMS) {
		return EINVAL;
	}

	/* Now malloc a big bunch of space to hold the search parameters, the attrlists and the search state. */
	/* It all has to go into local memory and it's not that big so we might as well put it all together. */
	/* Searchparams1 shall be first so we might as well use that to hold the base address of the allocated*/
	/* block. */
	/* */
	/* NOTE: we allocate an extra 8 bytes to account for the difference in size of the searchstate */
	/* due to the changes in rdar://problem/12438273. That way if a 3rd party file system */
	/* assumes the size is still 556 bytes it will continue to work */

	mallocsize = searchblock.sizeofsearchparams1 + searchblock.sizeofsearchparams2 +
	    sizeof(struct attrlist) + sizeof(struct searchstate) + (2 * sizeof(uint32_t));

	searchparams1 = kalloc_data(mallocsize, Z_WAITOK);

	/* Now set up the various pointers to the correct place in our newly allocated memory */

	searchparams2 = (void *) (((caddr_t) searchparams1) + searchblock.sizeofsearchparams1);
	returnattrs = (struct attrlist *) (((caddr_t) searchparams2) + searchblock.sizeofsearchparams2);
	state = (struct searchstate *) (((caddr_t) returnattrs) + sizeof(struct attrlist));

	/* Now copy in the stuff given our local variables. */

	if ((error = copyin(searchblock.searchparams1, searchparams1, searchblock.sizeofsearchparams1))) {
		goto freeandexit;
	}

	if ((error = copyin(searchblock.searchparams2, searchparams2, searchblock.sizeofsearchparams2))) {
		goto freeandexit;
	}

	if ((error = copyin(searchblock.returnattrs, (caddr_t) returnattrs, sizeof(struct attrlist)))) {
		goto freeandexit;
	}

	if ((error = copyin(uap->state, (caddr_t) state, sizeof(struct searchstate)))) {
		goto freeandexit;
	}

	/*
	 * When searching a union mount, need to set the
	 * start flag at the first call on each layer to
	 * reset state for the new volume.
	 */
	if (uap->options & SRCHFS_START) {
		state->ss_union_layer = 0;
	} else {
		uap->options |= state->ss_union_flags;
	}
	state->ss_union_flags = 0;

	/*
	 * Because searchparams1 and searchparams2 may contain an ATTR_CMN_NAME search parameter,
	 * which is passed in with an attrreference_t, we need to inspect the buffer manually here.
	 * The KPI does not provide us the ability to pass in the length of the buffers searchparams1
	 * and searchparams2. To obviate the need for all searchfs-supporting filesystems to
	 * validate the user-supplied data offset of the attrreference_t, we'll do it here.
	 */

	if (searchblock.searchattrs.commonattr & ATTR_CMN_NAME) {
		attrreference_t* string_ref;
		u_int32_t* start_length;
		user64_size_t param_length;

		/* validate searchparams1 */
		param_length = searchblock.sizeofsearchparams1;
		/* skip the word that specifies length of the buffer */
		start_length = (u_int32_t*) searchparams1;
		start_length = start_length + 1;
		string_ref = (attrreference_t*) start_length;

		/* ensure no negative offsets or too big offsets */
		if (string_ref->attr_dataoffset < 0) {
			error = EINVAL;
			goto freeandexit;
		}
		if (string_ref->attr_length > MAXPATHLEN) {
			error = EINVAL;
			goto freeandexit;
		}

		/* Check for pointer overflow in the string ref */
		if (((char*) string_ref + string_ref->attr_dataoffset) < (char*) string_ref) {
			error = EINVAL;
			goto freeandexit;
		}

		/* The referenced name data must start and end within searchparams1. */
		if (((char*) string_ref + string_ref->attr_dataoffset) > ((char*)searchparams1 + param_length)) {
			error = EINVAL;
			goto freeandexit;
		}
		if (((char*)string_ref + string_ref->attr_dataoffset + string_ref->attr_length) > ((char*)searchparams1 + param_length)) {
			error = EINVAL;
			goto freeandexit;
		}
	}

	/* set up the uio structure which will contain the users return buffer */
	auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, searchblock.returnbuffer, searchblock.returnbuffersize);

	nameiflags = 0;
	if ((uap->options & FSOPT_NOFOLLOW) == 0) {
		nameiflags |= FOLLOW;
	}
	NDINIT(&nd, LOOKUP, OP_SEARCHFS, nameiflags | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);

	error = namei(&nd);
	if (error) {
		goto freeandexit;
	}
	vp = nd.ni_vp;
	nameidone(&nd);

	/*
	 * Switch to the root vnode for the volume
	 * (the search always covers the whole volume).
	 */
	error = VFS_ROOT(vnode_mount(vp), &tvp, ctx);
	vnode_put(vp);
	if (error) {
		goto freeandexit;
	}
	vp = tvp;

#if CONFIG_UNION_MOUNTS
	/*
	 * If it's a union mount, the path lookup takes
	 * us to the top layer. But we may need to descend
	 * to a lower layer. For non-union mounts the layer
	 * is always zero.
	 */
	for (i = 0; i < (int) state->ss_union_layer; i++) {
		if ((vp->v_mount->mnt_flag & MNT_UNION) == 0) {
			break;
		}
		tvp = vp;
		vp = vp->v_mount->mnt_vnodecovered;
		if (vp == NULL) {
			vnode_put(tvp);
			error = ENOENT;
			goto freeandexit;
		}
		error = vnode_getwithref(vp);
		vnode_put(tvp);
		if (error) {
			goto freeandexit;
		}
	}
#endif /* CONFIG_UNION_MOUNTS */

#if CONFIG_MACF
	error = mac_vnode_check_searchfs(ctx, vp, returnattrs, &searchblock.searchattrs);
	if (error) {
		vnode_put(vp);
		goto freeandexit;
	}
#endif


	/*
	 * If searchblock.maxmatches == 0, then skip the search. This has happened
	 * before and sometimes the underlying code doesn't deal with it well.
	 */
	if (searchblock.maxmatches == 0) {
		nummatches = 0;
		goto saveandexit;
	}

	/*
	 * Alright, we have everything we need, so lets make that call.
	 *
	 * We keep special track of the return value from the file system:
	 * EAGAIN is an acceptable error condition that shouldn't keep us
	 * from copying out any results...
	 */

	fserror = VNOP_SEARCHFS(vp,
	    searchparams1,
	    searchparams2,
	    &searchblock.searchattrs,
	    (uint32_t)searchblock.maxmatches,
	    &timelimit,
	    returnattrs,
	    &nummatches,
	    (uint32_t)uap->scriptcode,
	    (uint32_t)uap->options,
	    auio,
	    (struct searchstate *) &state->ss_fsstate,
	    ctx);

#if CONFIG_UNION_MOUNTS
	/*
	 * If it's a union mount we need to be called again
	 * to search the mounted-on filesystem.
	 */
	if ((vp->v_mount->mnt_flag & MNT_UNION) && fserror == 0) {
		state->ss_union_flags = SRCHFS_START;
		state->ss_union_layer++;        // search next layer down
		fserror = EAGAIN;
	}
#endif /* CONFIG_UNION_MOUNTS */

saveandexit:

	vnode_put(vp);

	/* Now copy out the stuff that needs copying out. That means the number of matches, the
	 * search state. Everything was already put into the return buffer by the vop call. */

	if ((error = copyout((caddr_t) state, uap->state, sizeof(struct searchstate))) != 0) {
		goto freeandexit;
	}

	if ((error = suulong(uap->nummatches, (uint64_t)nummatches)) != 0) {
		goto freeandexit;
	}

	error = fserror;

freeandexit:

	kfree_data(searchparams1, mallocsize);

	return error;
} /* end of searchfs system call */
11275
11276 #else /* CONFIG_SEARCHFS */
11277
int
searchfs(__unused proc_t p, __unused struct searchfs_args *uap, __unused int32_t *retval)
{
	/* searchfs() is unavailable when built without CONFIG_SEARCHFS. */
	return ENOTSUP;
}
11283
11284 #endif /* CONFIG_SEARCHFS */
11285
11286
11287 #if CONFIG_DATALESS_FILES
11288
11289 /*
11290 * === Namespace Resolver Up-call Mechanism ===
11291 *
11292 * When I/O is performed to a dataless file or directory (read, write,
11293 * lookup-in, etc.), the file system performs an upcall to the namespace
11294 * resolver (filecoordinationd) to materialize the object.
11295 *
11296 * We need multiple up-calls to be in flight at once, and we need these
11297 * up-calls to be interruptible, thus the following implementation:
11298 *
11299 * => The nspace_resolver_request represents the in-kernel request state.
11300 * It contains a request ID, storage space for the errno code returned
11301 * by filecoordinationd, and flags.
11302 *
11303 * => The request ID is simply a global monotonically incrementing 32-bit
11304 * number. Outstanding requests are stored in a hash table, and the
11305 * hash function is extremely simple.
11306 *
11307 * => When an upcall is to be made to filecoordinationd, a request structure
11308 * is allocated on the stack (it is small, and needs to live only during
11309 * the duration of the call to resolve_nspace_item_ext()). It is
11310 * initialized and inserted into the table. Some backpressure from
 * filecoordinationd is applied by limiting the number of entries that
11312 * can be inserted into the table (and thus limiting the number of
11313 * outstanding requests issued to filecoordinationd); waiting for an
11314 * available slot is interruptible.
11315 *
11316 * => Once the request has been inserted into the table, the up-call is made
11317 * to filecoordinationd via a MiG-generated stub. The up-call returns
11318 * immediately and filecoordinationd processes the request asynchronously.
11319 *
 * => The caller now waits for the request to complete.  This is achieved by
11321 * sleeping on the address of the request structure and waiting for
11322 * filecoordinationd to mark the request structure as complete. This
11323 * is an interruptible sleep call; if interrupted, the request structure
11324 * is removed from the table and EINTR is returned to the caller. If
11325 * this occurs, an advisory up-call is made to filecoordinationd with
11326 * the request ID to indicate that the request can be aborted or
11327 * de-prioritized at the discretion of filecoordinationd.
11328 *
11329 * => When filecoordinationd has completed the request, it signals completion
11330 * by writing to the vfs.nspace.complete sysctl node. Only a process
11331 * decorated as a namespace resolver can write to this sysctl node. The
11332 * value is a request ID / errno tuple passed as an array of 2 uint32_t's.
11333 * The request ID is looked up in the table, and if the request is found,
11334 * the error code is stored in the request structure and a wakeup()
11335 * issued on the address of the request structure. If the request is not
11336 * found, we simply drop the completion notification, assuming that the
11337 * caller was interrupted.
11338 *
11339 * => When the waiting thread wakes up, it extracts the error code from the
11340 * request structure, removes the request from the table, and returns the
11341 * error code to the calling function. Fini!
11342 */
11343
/*
 * In-kernel state for one outstanding up-call to the namespace
 * resolver.  Allocated on the requesting thread's stack and linked
 * into the request hash table while the up-call is in flight.
 */
struct nspace_resolver_request {
	LIST_ENTRY(nspace_resolver_request) r_hashlink;  /* hash-bucket linkage */
	vnode_t r_vp;                   /* vnode being materialized */
	vnode_t r_tdvp;                 /* optional destination dir vnode; may be NULL (see nspace_resolver_req_completed) */
	uint32_t r_req_id;              /* ID pairing the resolver's reply with this request */
	int r_resolver_error;           /* errno reported back by filecoordinationd */
	int r_flags;                    /* RRF_* flags below */
};
11352
/* r_flags values */
#define RRF_COMPLETE    0x0001  /* request finished; r_resolver_error is valid */
#define RRF_COMPLETING  0x0002  /* completion handler is still referencing the request */

/*
 * Completion tuple delivered by filecoordinationd through the
 * vfs.nspace.complete sysctl.
 */
struct nspace_resolver_completion_data {
	uint32_t req_id;                /* ID of the request being completed */
	int32_t resolver_error;         /* errno result; 0 on success */
	uint64_t orig_gencount;         /* expected recursive gencount; 0 == don't verify */
	uint64_t orig_syncroot;         /* expected sync-root ID; 0 == don't verify */
};
11362
11363 static uint32_t
next_nspace_req_id(void)11364 next_nspace_req_id(void)
11365 {
11366 static uint32_t next_req_id;
11367
11368 return OSAddAtomic(1, &next_req_id);
11369 }
11370
#define NSPACE_RESOLVER_REQ_HASHSIZE    32      /* XXX tune */
#define NSPACE_RESOLVER_MAX_OUTSTANDING 256     /* XXX tune */

/* Hash table of outstanding resolver requests, keyed by request ID. */
static LIST_HEAD(nspace_resolver_requesthead,
    nspace_resolver_request) * nspace_resolver_request_hashtbl;
/* Mask returned by hashinit(); table size minus one. */
static u_long nspace_resolver_request_hashmask;
/* Number of requests currently in the table (backpressure limit). */
static u_int nspace_resolver_request_count;
/* True when some thread is sleeping, waiting for a table slot. */
static bool nspace_resolver_request_wait_slot;
static LCK_GRP_DECLARE(nspace_resolver_request_lck_grp, "file namespace resolver");
/* Protects the table, count, and wait_slot flag above. */
static LCK_MTX_DECLARE(nspace_resolver_request_hash_mutex,
    &nspace_resolver_request_lck_grp);

#define NSPACE_REQ_LOCK() \
	lck_mtx_lock(&nspace_resolver_request_hash_mutex)
#define NSPACE_REQ_UNLOCK() \
	lck_mtx_unlock(&nspace_resolver_request_hash_mutex)

/* Map a request ID to its hash bucket (hashmask is size-1). */
#define NSPACE_RESOLVER_HASH(req_id) \
	(&nspace_resolver_request_hashtbl[(req_id) & \
	nspace_resolver_request_hashmask])
11391
11392 static struct nspace_resolver_request *
nspace_resolver_req_lookup(uint32_t req_id,bool skip_completing)11393 nspace_resolver_req_lookup(uint32_t req_id, bool skip_completing)
11394 {
11395 struct nspace_resolver_requesthead *bucket;
11396 struct nspace_resolver_request *req;
11397
11398 bucket = NSPACE_RESOLVER_HASH(req_id);
11399 LIST_FOREACH(req, bucket, r_hashlink) {
11400 if (req->r_req_id == req_id) {
11401 /*
11402 * If this request already has a completion
11403 * pending, don't return it again.
11404 */
11405 if ((req->r_flags & RRF_COMPLETING) != 0 &&
11406 skip_completing) {
11407 req = NULL;
11408 }
11409 return req;
11410 }
11411 }
11412
11413 return NULL;
11414 }
11415
11416 static int
nspace_resolver_req_add(struct nspace_resolver_request * req)11417 nspace_resolver_req_add(struct nspace_resolver_request *req)
11418 {
11419 struct nspace_resolver_requesthead *bucket;
11420 int error;
11421
11422 NSPACE_REQ_LOCK();
11423
11424 while (nspace_resolver_request_count >=
11425 NSPACE_RESOLVER_MAX_OUTSTANDING) {
11426 nspace_resolver_request_wait_slot = true;
11427 error = msleep(&nspace_resolver_request_count,
11428 &nspace_resolver_request_hash_mutex,
11429 PVFS | PCATCH, "nspacerq", NULL);
11430 if (error) {
11431 NSPACE_REQ_UNLOCK();
11432 return error;
11433 }
11434 }
11435
11436 bucket = NSPACE_RESOLVER_HASH(req->r_req_id);
11437 #if DIAGNOSTIC
11438 assert(nspace_resolver_req_lookup(req->r_req_id, false) == NULL);
11439 #endif /* DIAGNOSTIC */
11440 LIST_INSERT_HEAD(bucket, req, r_hashlink);
11441 nspace_resolver_request_count++;
11442
11443 NSPACE_REQ_UNLOCK();
11444
11445 return 0;
11446 }
11447
11448 static void
nspace_resolver_req_wait_pending_completion(struct nspace_resolver_request * req)11449 nspace_resolver_req_wait_pending_completion(struct nspace_resolver_request *req)
11450 {
11451 /*
11452 * If a completion is in-progress, we have to wait for the
11453 * completion handler to finish because it's still using 'req',
11454 * which is allocated on our stack a couple of frames up.
11455 */
11456 while ((req->r_flags & RRF_COMPLETING) != 0) {
11457 (void) msleep(req, &nspace_resolver_request_hash_mutex,
11458 PVFS, "nspacecmplt", NULL);
11459 }
11460 }
11461
11462 static void
nspace_resolver_req_remove_and_unlock(struct nspace_resolver_request * req)11463 nspace_resolver_req_remove_and_unlock(struct nspace_resolver_request *req)
11464 {
11465 struct nspace_resolver_requesthead *bucket;
11466
11467 /* We're called with NSPACE_REQ_LOCK held. */
11468
11469 bucket = NSPACE_RESOLVER_HASH(req->r_req_id);
11470 #if DIAGNOSTIC
11471 assert((req->r_flags & RRF_COMPLETING) == 0);
11472 assert(nspace_resolver_req_lookup(req->r_req_id, false) != NULL);
11473 #endif /* DIAGNOSTIC */
11474 LIST_REMOVE(req, r_hashlink);
11475 nspace_resolver_request_count--;
11476
11477 if (nspace_resolver_request_wait_slot) {
11478 nspace_resolver_request_wait_slot = false;
11479 wakeup(&nspace_resolver_request_count);
11480 }
11481
11482 nspace_resolver_req_wait_pending_completion(req);
11483
11484 NSPACE_REQ_UNLOCK();
11485 }
11486
/*
 * Remove 'req' from the request table.
 * Takes NSPACE_REQ_LOCK; the helper drops it before returning.
 */
static void
nspace_resolver_req_remove(struct nspace_resolver_request *req)
{
	NSPACE_REQ_LOCK();
	nspace_resolver_req_remove_and_unlock(req);
}
11493
11494 static void
nspace_resolver_req_cancel(uint32_t req_id)11495 nspace_resolver_req_cancel(uint32_t req_id)
11496 {
11497 kern_return_t kr;
11498 mach_port_t mp;
11499
11500 // Failures here aren't fatal -- the cancellation message
11501 // sent to the resolver is merely advisory.
11502
11503 kr = host_get_filecoordinationd_port(host_priv_self(), &mp);
11504 if (kr != KERN_SUCCESS || !IPC_PORT_VALID(mp)) {
11505 return;
11506 }
11507
11508 kr = send_nspace_resolve_cancel(mp, req_id);
11509 if (kr != KERN_SUCCESS) {
11510 os_log_error(OS_LOG_DEFAULT,
11511 "NSPACE send_nspace_resolve_cancel failure: %d", kr);
11512 }
11513
11514 ipc_port_release_send(mp);
11515 }
11516
/*
 * Wait (interruptibly) for 'req' to be completed by the resolver,
 * remove it from the request table, and return the resolver's errno.
 * If the sleep is interrupted, EINTR (or ETIMEDOUT for other errors)
 * is recorded as the result and an advisory cancellation is sent.
 */
static int
nspace_resolver_req_wait(struct nspace_resolver_request *req)
{
	bool send_cancel_message = false;
	int error;

	NSPACE_REQ_LOCK();

	while ((req->r_flags & RRF_COMPLETE) == 0) {
		error = msleep(req, &nspace_resolver_request_hash_mutex,
		    PVFS | PCATCH, "nspace", NULL);
		/* ERESTART just loops back and re-checks the flag. */
		if (error && error != ERESTART) {
			req->r_resolver_error = (error == EINTR) ? EINTR :
			    ETIMEDOUT;
			send_cancel_message = true;
			break;
		}
	}

	nspace_resolver_req_remove_and_unlock(req);

	/*
	 * It's safe to continue referencing 'req' here because it's
	 * allocated on our caller's stack.
	 */

	if (send_cancel_message) {
		nspace_resolver_req_cancel(req->r_req_id);
	}

	return req->r_resolver_error;
}
11549
11550 static void
nspace_resolver_req_mark_complete(struct nspace_resolver_request * req,int resolver_error)11551 nspace_resolver_req_mark_complete(
11552 struct nspace_resolver_request *req,
11553 int resolver_error)
11554 {
11555 req->r_resolver_error = resolver_error;
11556 req->r_flags = (req->r_flags & ~RRF_COMPLETING) | RRF_COMPLETE;
11557 wakeup(req);
11558 }
11559
/*
 * Mark 'req' as having a completion in progress, so nothing tears it
 * down while the completion handler works outside NSPACE_REQ_LOCK.
 * Called with NSPACE_REQ_LOCK held.
 */
static void
nspace_resolver_req_mark_completion_pending(struct nspace_resolver_request *req)
{
	req->r_flags |= RRF_COMPLETING;
}
11565
/*
 * Process a completion tuple reported by filecoordinationd via the
 * vfs.nspace.complete sysctl.  Looks up the outstanding request and,
 * if the resolver supplied namespace-shape criteria (a recursive
 * gencount and/or sync-root ID), verifies the namespace is unchanged
 * before marking the request complete and waking the waiter.
 */
static void
nspace_resolver_req_completed(const struct nspace_resolver_completion_data *c)
{
	struct nspace_resolver_request *req;
	int error;
	struct vnode_attr va;
	vnode_t vp;

	NSPACE_REQ_LOCK();

	/* skip_completing: never process the same completion twice. */
	req = nspace_resolver_req_lookup(c->req_id, true);
	if (req == NULL) {
		/*
		 * If we don't find the request corresponding to our req_id,
		 * just drop the completion on the floor; it's likely that
		 * the requester interrupted with a signal, or it may already
		 * be completing.
		 */
		NSPACE_REQ_UNLOCK();
		return;
	}

	/*
	 * Get out now if the resolver reported an error.
	 */
	if ((error = c->resolver_error) != 0) {
		goto out;
	}

	/*
	 * If the resolver did not specify any namespace shape criteria
	 * for letting the operation proceed, then get out now.
	 */
	if (c->orig_gencount == 0 && c->orig_syncroot == 0) {
		goto out;
	}

	/*
	 * We're going to have to acquire the mount rename lock and do
	 * some I/O in order to verify the criteria. Mark the request
	 * as pending so no one else messes with it after we drop the
	 * NSPACE_REQ_LOCK.
	 */
	nspace_resolver_req_mark_completion_pending(req);
	NSPACE_REQ_UNLOCK();

	/*
	 * Lock out renames from changing the shape of the tree while
	 * we validate the criteria.
	 */
	mount_t locked_mp = req->r_vp->v_mount;
	mount_ref(locked_mp, 0);
	mount_lock_renames(locked_mp);

	if (c->orig_gencount != 0) {
		vp = req->r_vp;
		/* NOTE(review): 'error' is always 0 at this point (checked above); this guard appears dead — confirm. */
		if (error) {
			goto out_dropmount;
		}

		VATTR_INIT(&va);
		VATTR_WANTED(&va, va_recursive_gencount);
		error = vnode_getattr(vp, &va, vfs_context_kernel());
		if (error) {
			goto out_dropmount;
		}
		/* Subtree changed since the request was issued -> EBUSY. */
		if (VATTR_NOT_RETURNED(&va, va_recursive_gencount) ||
		    va.va_recursive_gencount != c->orig_gencount) {
			printf("nspace.complete: gencount changed! (orig %llu cur %llu)\n",
			    c->orig_gencount, va.va_recursive_gencount);
			error = EBUSY;
			goto out_dropmount;
		}
	}

	/*
	 * Ignore orig_syncroot if a destination directory wasn't specified
	 * in the request.
	 */
	if (c->orig_syncroot != 0 && (vp = req->r_tdvp) != NULL) {
		uint64_t syncroot_id;

		/* NOTE(review): as above, 'error' is 0 here; guard appears dead — confirm. */
		if (error) {
			goto out_dropmount;
		}

#ifndef APFSIOC_GET_SYNC_ROOT
#define APFSIOC_GET_SYNC_ROOT _IOR('J', 115, uint64_t)
#endif

		/* Ask the file system for the destination's current sync-root ID. */
		error = VNOP_IOCTL(vp, APFSIOC_GET_SYNC_ROOT,
		    (caddr_t)&syncroot_id, 0, vfs_context_kernel());
		if (error) {
			goto out_dropmount;
		}
		if (syncroot_id != c->orig_syncroot) {
			printf("nspace.complete: syncroot changed! (orig %llu cur %llu)\n",
			    c->orig_syncroot, syncroot_id);
			error = EBUSY;
			goto out_dropmount;
		}
	}

out_dropmount:
	mount_unlock_renames(locked_mp);
	mount_drop(locked_mp, 0);
	NSPACE_REQ_LOCK();

out:
	nspace_resolver_req_mark_complete(req, error);
	NSPACE_REQ_UNLOCK();
}
11678
/* The process currently decorated as the namespace resolver, if any. */
static struct proc *nspace_resolver_proc;

/*
 * Report (via *is_resolver) whether 'p' is the registered namespace
 * resolver.  Always returns 0.
 */
static int
nspace_resolver_get_proc_state(struct proc *p, int *is_resolver)
{
	*is_resolver = ((p->p_lflag & P_LNSPACE_RESOLVER) &&
	    p == nspace_resolver_proc) ? 1 : 0;
	return 0;
}
11688
11689 static boolean_t vfs_context_is_dataless_resolver(vfs_context_t);
11690
11691 static int
nspace_resolver_set_proc_state(struct proc * p,int is_resolver)11692 nspace_resolver_set_proc_state(struct proc *p, int is_resolver)
11693 {
11694 vfs_context_t ctx = vfs_context_current();
11695 int error = 0;
11696
11697 //
11698 // The system filecoordinationd runs as uid == 0. This also
11699 // has the nice side-effect of filtering out filecoordinationd
11700 // running in the simulator.
11701 //
11702 if (!vfs_context_issuser(ctx) ||
11703 !vfs_context_is_dataless_resolver(ctx)) {
11704 return EPERM;
11705 }
11706
11707 if (is_resolver) {
11708 NSPACE_REQ_LOCK();
11709
11710 if (nspace_resolver_proc == NULL) {
11711 proc_lock(p);
11712 p->p_lflag |= P_LNSPACE_RESOLVER;
11713 proc_unlock(p);
11714 nspace_resolver_proc = p;
11715 } else {
11716 error = EBUSY;
11717 }
11718
11719 NSPACE_REQ_UNLOCK();
11720 } else {
11721 // This is basically just like the exit case.
11722 // nspace_resolver_exited() will verify that the
11723 // process is the resolver, and will clear the
11724 // global.
11725 nspace_resolver_exited(p);
11726 }
11727
11728 return error;
11729 }
11730
11731 static int
nspace_materialization_get_proc_state(struct proc * p,int * is_prevented)11732 nspace_materialization_get_proc_state(struct proc *p, int *is_prevented)
11733 {
11734 if ((p->p_lflag & P_LNSPACE_RESOLVER) != 0 ||
11735 (p->p_vfs_iopolicy &
11736 P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES) == 0) {
11737 *is_prevented = 1;
11738 } else {
11739 *is_prevented = 0;
11740 }
11741 return 0;
11742 }
11743
11744 static int
nspace_materialization_set_proc_state(struct proc * p,int is_prevented)11745 nspace_materialization_set_proc_state(struct proc *p, int is_prevented)
11746 {
11747 if (p->p_lflag & P_LNSPACE_RESOLVER) {
11748 return is_prevented ? 0 : EBUSY;
11749 }
11750
11751 if (is_prevented) {
11752 OSBitAndAtomic16(~((uint16_t)P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES), &p->p_vfs_iopolicy);
11753 } else {
11754 OSBitOrAtomic16((uint16_t)P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES, &p->p_vfs_iopolicy);
11755 }
11756 return 0;
11757 }
11758
11759 static int
nspace_materialization_get_thread_state(int * is_prevented)11760 nspace_materialization_get_thread_state(int *is_prevented)
11761 {
11762 uthread_t ut = current_uthread();
11763
11764 *is_prevented = (ut->uu_flag & UT_NSPACE_NODATALESSFAULTS) ? 1 : 0;
11765 return 0;
11766 }
11767
11768 static int
nspace_materialization_set_thread_state(int is_prevented)11769 nspace_materialization_set_thread_state(int is_prevented)
11770 {
11771 uthread_t ut = current_uthread();
11772
11773 if (is_prevented) {
11774 ut->uu_flag |= UT_NSPACE_NODATALESSFAULTS;
11775 } else {
11776 ut->uu_flag &= ~UT_NSPACE_NODATALESSFAULTS;
11777 }
11778 return 0;
11779 }
11780
/* the vfs.nspace branch: root of the namespace-resolver sysctl tree */
SYSCTL_NODE(_vfs, OID_AUTO, nspace, CTLFLAG_RW | CTLFLAG_LOCKED, NULL, "vfs nspace hinge");
11783
11784 static int
sysctl_nspace_resolver(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)11785 sysctl_nspace_resolver(__unused struct sysctl_oid *oidp,
11786 __unused void *arg1, __unused int arg2, struct sysctl_req *req)
11787 {
11788 struct proc *p = req->p;
11789 int new_value, old_value, changed = 0;
11790 int error;
11791
11792 error = nspace_resolver_get_proc_state(p, &old_value);
11793 if (error) {
11794 return error;
11795 }
11796
11797 error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
11798 &changed);
11799 if (error == 0 && changed) {
11800 error = nspace_resolver_set_proc_state(p, new_value);
11801 }
11802 return error;
11803 }
11804
/* vfs.nspace.resolver: decorate this process as the dataless file resolver */
SYSCTL_PROC(_vfs_nspace, OID_AUTO, resolver,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
    0, 0, sysctl_nspace_resolver, "I", "");
11809
11810 static int
sysctl_nspace_prevent_materialization(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)11811 sysctl_nspace_prevent_materialization(__unused struct sysctl_oid *oidp,
11812 __unused void *arg1, __unused int arg2, struct sysctl_req *req)
11813 {
11814 struct proc *p = req->p;
11815 int new_value, old_value, changed = 0;
11816 int error;
11817
11818 error = nspace_materialization_get_proc_state(p, &old_value);
11819 if (error) {
11820 return error;
11821 }
11822
11823 error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
11824 &changed);
11825 if (error == 0 && changed) {
11826 error = nspace_materialization_set_proc_state(p, new_value);
11827 }
11828 return error;
11829 }
11830
/* vfs.nspace.prevent_materialization: decorate this process as not wanting to materialize dataless files */
SYSCTL_PROC(_vfs_nspace, OID_AUTO, prevent_materialization,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
    0, 0, sysctl_nspace_prevent_materialization, "I", "");
11835
11836 static int
sysctl_nspace_thread_prevent_materialization(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)11837 sysctl_nspace_thread_prevent_materialization(__unused struct sysctl_oid *oidp,
11838 __unused void *arg1, __unused int arg2, struct sysctl_req *req)
11839 {
11840 int new_value, old_value, changed = 0;
11841 int error;
11842
11843 error = nspace_materialization_get_thread_state(&old_value);
11844 if (error) {
11845 return error;
11846 }
11847
11848 error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
11849 &changed);
11850 if (error == 0 && changed) {
11851 error = nspace_materialization_set_thread_state(new_value);
11852 }
11853 return error;
11854 }
11855
/* vfs.nspace.thread_prevent_materialization: decorate this thread as not wanting to materialize dataless files */
SYSCTL_PROC(_vfs_nspace, OID_AUTO, thread_prevent_materialization,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
    0, 0, sysctl_nspace_thread_prevent_materialization, "I", "");
11860
/*
 * Handler for vfs.nspace.complete: the decorated resolver writes a
 * { req_id, errno } pair (two uint32_t's), optionally followed by a
 * 64-bit gencount and a 64-bit sync-root ID, to report completion of
 * an outstanding request.  Only the registered resolver may write.
 */
static int
sysctl_nspace_complete(__unused struct sysctl_oid *oidp, __unused void *arg1,
    __unused int arg2, struct sysctl_req *req)
{
	struct proc *p = req->p;
	uint32_t req_status[2] = { 0, 0 };
	uint64_t gencount = 0;
	uint64_t syncroot = 0;
	int error, is_resolver, changed = 0, other_changed;

	error = nspace_resolver_get_proc_state(p, &is_resolver);
	if (error) {
		return error;
	}

	if (!is_resolver) {
		return EPERM;
	}

	/* Mandatory leading tuple: request ID and resolver errno. */
	error = sysctl_io_opaque(req, req_status, sizeof(req_status),
	    &changed);
	if (error) {
		return error;
	}

	/*
	 * Get the gencount if it was passed. Ignore errors, because
	 * it's optional.  (NOTE(review): each subsequent
	 * sysctl_io_opaque call is presumed to consume the next chunk
	 * of the caller's write buffer — confirm against
	 * kern_newsysctl behavior.)
	 */
	error = sysctl_io_opaque(req, &gencount, sizeof(gencount),
	    &other_changed);
	if (error) {
		gencount = 0;
		error = 0;
	}

	/*
	 * ...and now the syncroot ID.  Also optional.
	 */
	error = sysctl_io_opaque(req, &syncroot, sizeof(syncroot),
	    &other_changed);
	if (error) {
		syncroot = 0;
		error = 0;
	}

	/*
	 * req_status[0] is the req_id
	 *
	 * req_status[1] is the errno
	 */
	if (error == 0 && changed) {
		const struct nspace_resolver_completion_data cd = {
			.req_id = req_status[0],
			.resolver_error = req_status[1],
			.orig_gencount = gencount,
			.orig_syncroot = syncroot,
		};
		nspace_resolver_req_completed(&cd);
	}
	return error;
}
11923
/* vfs.nspace.complete: the resolver reports completed requests here. */
SYSCTL_PROC(_vfs_nspace, OID_AUTO, complete,
    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
    0, 0, sysctl_nspace_complete, "-", "");
11928
11929 #endif /* CONFIG_DATALESS_FILES */
11930
/*
 * Annotation for parameters that are referenced only when
 * CONFIG_DATALESS_FILES is enabled; expands to __unused otherwise.
 */
#if CONFIG_DATALESS_FILES
#define __no_dataless_unused    /* nothing */
#else
#define __no_dataless_unused    __unused
#endif
11936
/*
 * Decide whether dataless-file materialization is prevented for 'ctx'.
 * Returns:
 *   0            materialization may proceed
 *   EDEADLK      materialization is prevented
 *   EJUSTRETURN  caller is a dataless manipulator; the operation
 *                proceeds as if the object were not dataless
 */
int
vfs_context_dataless_materialization_is_prevented(
	vfs_context_t const ctx __no_dataless_unused)
{
#if CONFIG_DATALESS_FILES
	proc_t const p = vfs_context_proc(ctx);
	thread_t const t = vfs_context_thread(ctx);
	uthread_t const ut = t ? get_bsdthread_info(t) : NULL;

	/*
	 * Kernel context ==> return EDEADLK, as we would with any random
	 * process decorated as no-materialize.
	 */
	if (ctx == vfs_context_kernel()) {
		return EDEADLK;
	}

	/*
	 * If the process has the dataless-manipulation entitlement,
	 * materialization is prevented, and depending on the kind
	 * of file system operation, things get to proceed as if the
	 * object is not dataless.
	 */
	if (vfs_context_is_dataless_manipulator(ctx)) {
		return EJUSTRETURN;
	}

	/*
	 * Per-thread decorations override any process-wide decorations.
	 * (Foundation uses this, and this overrides even the dataless-
	 * manipulation entitlement so as to make API contracts consistent.)
	 */
	if (ut != NULL) {
		if (ut->uu_flag & UT_NSPACE_NODATALESSFAULTS) {
			return EDEADLK;
		}
		if (ut->uu_flag & UT_NSPACE_FORCEDATALESSFAULTS) {
			return 0;
		}
	}

	/*
	 * If the process's iopolicy specifies that dataless files
	 * can be materialized, then we let it go ahead.
	 */
	if (p->p_vfs_iopolicy & P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES) {
		return 0;
	}
#endif /* CONFIG_DATALESS_FILES */

	/*
	 * The default behavior is to not materialize dataless files;
	 * return to the caller that deadlock was detected.
	 */
	return EDEADLK;
}
11993
/*
 * One-time initialization for the dataless-file resolver machinery:
 * allocate the hash table used to look up in-flight resolver requests.
 */
void
nspace_resolver_init(void)
{
#if CONFIG_DATALESS_FILES
	/* hashinit() also fills in the bucket mask used for indexing. */
	nspace_resolver_request_hashtbl =
	    hashinit(NSPACE_RESOLVER_REQ_HASHSIZE,
	    M_VNODE /* XXX */, &nspace_resolver_request_hashmask);
#endif /* CONFIG_DATALESS_FILES */
}
12003
12004 void
nspace_resolver_exited(struct proc * p __no_dataless_unused)12005 nspace_resolver_exited(struct proc *p __no_dataless_unused)
12006 {
12007 #if CONFIG_DATALESS_FILES
12008 struct nspace_resolver_requesthead *bucket;
12009 struct nspace_resolver_request *req;
12010 u_long idx;
12011
12012 NSPACE_REQ_LOCK();
12013
12014 if ((p->p_lflag & P_LNSPACE_RESOLVER) &&
12015 p == nspace_resolver_proc) {
12016 for (idx = 0; idx <= nspace_resolver_request_hashmask; idx++) {
12017 bucket = &nspace_resolver_request_hashtbl[idx];
12018 LIST_FOREACH(req, bucket, r_hashlink) {
12019 nspace_resolver_req_wait_pending_completion(req);
12020 nspace_resolver_req_mark_complete(req,
12021 ETIMEDOUT);
12022 }
12023 }
12024 nspace_resolver_proc = NULL;
12025 }
12026
12027 NSPACE_REQ_UNLOCK();
12028 #endif /* CONFIG_DATALESS_FILES */
12029 }
12030
12031 #define DATALESS_RESOLVER_ENTITLEMENT \
12032 "com.apple.private.vfs.dataless-resolver"
12033 #define DATALESS_MANIPULATION_ENTITLEMENT \
12034 "com.apple.private.vfs.dataless-manipulation"
12035
#if CONFIG_DATALESS_FILES
/*
 * Return TRUE if the vfs context is associated with the dataless
 * resolver, i.e. its task holds the dataless-resolver entitlement.
 */
static boolean_t
vfs_context_is_dataless_resolver(vfs_context_t ctx __no_dataless_unused)
{
	task_t const caller_task = vfs_context_task(ctx);

	return IOTaskHasEntitlement(caller_task,
	           DATALESS_RESOLVER_ENTITLEMENT);
}
#endif /* CONFIG_DATALESS_FILES */
12048
12049 /*
12050 * Return TRUE if the vfs context is associated with a process entitled
12051 * for dataless manipulation.
12052 *
12053 * XXX Arguably belongs in vfs_subr.c, but is here because of the
12054 * complication around CONFIG_DATALESS_FILES.
12055 */
12056 boolean_t
vfs_context_is_dataless_manipulator(vfs_context_t ctx __no_dataless_unused)12057 vfs_context_is_dataless_manipulator(vfs_context_t ctx __no_dataless_unused)
12058 {
12059 #if CONFIG_DATALESS_FILES
12060 task_t task = vfs_context_task(ctx);
12061 return IOTaskHasEntitlement(task, DATALESS_MANIPULATION_ENTITLEMENT) ||
12062 IOTaskHasEntitlement(task, DATALESS_RESOLVER_ENTITLEMENT);
12063 #else
12064 return false;
12065 #endif /* CONFIG_DATALESS_FILES */
12066 }
12067
#if CONFIG_DATALESS_FILES
/*
 * Emit a debug log noting that the current process was blocked from
 * materializing the dataless vnode `vp` while attempting operation `op`.
 * On DEVELOPMENT builds the vnode's path is included when available.
 */
static void
log_materialization_prevented(vnode_t vp, uint64_t op)
{
	char p_name[MAXCOMLEN + 1];
	const char *vntype;	/* points at string literals; never written */

	proc_selfname(&p_name[0], sizeof(p_name));

	switch (vp->v_type) {
	case VREG:
		vntype = "File";
		break;
	case VDIR:
		vntype = "Dir";
		break;
	case VLNK:
		vntype = "SymLink";
		break;
	default:
		vntype = "Other";
		break;
	}

#if DEVELOPMENT
	char *path = NULL;
	int len;

	path = get_pathbuff();
	len = MAXPATHLEN;
	if (path) {
		/*
		 * Best-effort path lookup: the return value is ignored
		 * (as before), but pre-terminate the buffer so the %s
		 * below never reads an un-terminated zone buffer when
		 * vn_getpath() fails.
		 */
		path[0] = '\0';
		vn_getpath(vp, path, &len);
	}

	os_log_debug(OS_LOG_DEFAULT,
	    "NSPACE process %s (pid %d) is decorated as no-materialization (op %lld; %s) path: %s",
	    p_name, proc_selfpid(),
	    op, vntype, path ? path : "<unknown-path>");
	if (path) {
		release_pathbuff(path);
	}
#else
	os_log_debug(OS_LOG_DEFAULT,
	    "NSPACE process %s (pid %d) is decorated as no-materialization (op %lld; %s)",
	    p_name, proc_selfpid(),
	    op, vntype);
#endif
}
#endif /* CONFIG_DATALESS_FILES */
12111
12112 static int
vfs_materialize_item(vnode_t vp __no_dataless_unused,uint32_t op __no_dataless_unused,int64_t offset __no_dataless_unused,int64_t size __no_dataless_unused,char * lookup_name __no_dataless_unused,size_t const namelen __no_dataless_unused,vnode_t tdvp __no_dataless_unused)12113 vfs_materialize_item(
12114 vnode_t vp __no_dataless_unused,
12115 uint32_t op __no_dataless_unused,
12116 int64_t offset __no_dataless_unused,
12117 int64_t size __no_dataless_unused,
12118 char *lookup_name __no_dataless_unused,
12119 size_t const namelen __no_dataless_unused,
12120 vnode_t tdvp __no_dataless_unused)
12121 {
12122 #if CONFIG_DATALESS_FILES
12123 kern_return_t kern_ret;
12124 mach_port_t mach_port;
12125 char *path = NULL;
12126 vfs_context_t context;
12127 int path_len;
12128 int error;
12129 audit_token_t atoken;
12130 enum vtype vp_vtype;
12131
12132 /* Swap files are special; ignore them */
12133 if (vnode_isswap(vp)) {
12134 return 0;
12135 }
12136
12137 /*
12138 * NAMESPACE_HANDLER_SNAPSHOT_EVENT and NAMESPACE_HANDLER_TRACK_EVENT
12139 * are no longer used nor supported.
12140 */
12141 if (op & NAMESPACE_HANDLER_SNAPSHOT_EVENT) {
12142 os_log_debug(OS_LOG_DEFAULT, "NSPACE SNAPSHOT not handled");
12143 return ENOTSUP;
12144 }
12145 if (op & NAMESPACE_HANDLER_TRACK_EVENT) {
12146 os_log_debug(OS_LOG_DEFAULT, "NSPACE TRACK not handled");
12147 return ENOTSUP;
12148 }
12149
12150 /* Normalize 'op'. */
12151 op &= ~NAMESPACE_HANDLER_EVENT_TYPE_MASK;
12152
12153 /*
12154 * To-directory is only meaningful for rename operations;
12155 * ignore it if someone handed one to us unexpectedly.
12156 */
12157 if (op != NAMESPACE_HANDLER_RENAME_OP) {
12158 tdvp = NULL;
12159 }
12160
12161 context = vfs_context_current();
12162
12163 /* Remember this for later. */
12164 vp_vtype = vnode_vtype(vp);
12165
12166 error = vfs_context_dataless_materialization_is_prevented(context);
12167 if (error) {
12168 log_materialization_prevented(vp, op);
12169 goto out_check_errors;
12170 }
12171
12172 kern_ret = host_get_filecoordinationd_port(host_priv_self(),
12173 &mach_port);
12174 if (kern_ret != KERN_SUCCESS || !IPC_PORT_VALID(mach_port)) {
12175 os_log_error(OS_LOG_DEFAULT, "NSPACE no port");
12176 /*
12177 * Treat this like being unable to access the backing store
12178 * server.
12179 */
12180 return ETIMEDOUT;
12181 }
12182
12183 int path_alloc_len = MAXPATHLEN;
12184 do {
12185 path = kalloc_data(path_alloc_len, Z_WAITOK | Z_ZERO);
12186 if (path == NULL) {
12187 return ENOMEM;
12188 }
12189
12190 path_len = path_alloc_len;
12191 error = vn_getpath(vp, path, &path_len);
12192 if (error == 0) {
12193 break;
12194 } else if (error == ENOSPC) {
12195 kfree_data(path, path_alloc_len);
12196 path = NULL;
12197 } else {
12198 goto out_release_port;
12199 }
12200 } while (error == ENOSPC && (path_alloc_len += MAXPATHLEN) && path_alloc_len <= FSGETPATH_MAXBUFLEN);
12201
12202 error = vfs_context_copy_audit_token(context, &atoken);
12203 if (error) {
12204 goto out_release_port;
12205 }
12206
12207 struct nspace_resolver_request req = {
12208 .r_req_id = next_nspace_req_id(),
12209 .r_vp = vp,
12210 .r_tdvp = tdvp,
12211 };
12212
12213 error = nspace_resolver_req_add(&req);
12214 if (error) {
12215 goto out_release_port;
12216 }
12217
12218 os_log_debug(OS_LOG_DEFAULT, "NSPACE resolve_path call");
12219
12220 if (op == NAMESPACE_HANDLER_RENAME_OP && tdvp != NULL) {
12221 char *dest_path = NULL;
12222 int dest_path_len;
12223
12224 dest_path = zalloc(ZV_NAMEI);
12225 dest_path_len = MAXPATHLEN;
12226
12227 error = vn_getpath(tdvp, dest_path, &dest_path_len);
12228 if (error) {
12229 zfree(ZV_NAMEI, dest_path);
12230 goto out_release_port;
12231 }
12232
12233 /*
12234 * Force setting NAMESPACE_HANDLER_NSPACE_EVENT for
12235 * compatibility with existing agents in user-space
12236 * who get passed this value.
12237 */
12238 kern_ret = send_vfs_resolve_reparent_with_audit_token(mach_port,
12239 req.r_req_id,
12240 op | NAMESPACE_HANDLER_NSPACE_EVENT,
12241 path, dest_path, atoken);
12242
12243 zfree(ZV_NAMEI, dest_path);
12244 } else if (vp_vtype == VDIR) {
12245 char *tmpname = NULL;
12246
12247 /*
12248 * If the caller provided a lookup_name *and* a name length,
12249 * then we assume the lookup_name is not NUL-terminated.
12250 * Allocate a temporary buffer in this case to provide
12251 * a NUL-terminated path name to the IPC call.
12252 */
12253 if (lookup_name != NULL && namelen != 0) {
12254 if (namelen >= PATH_MAX) {
12255 error = EINVAL;
12256 goto out_req_remove;
12257 }
12258 tmpname = zalloc(ZV_NAMEI);
12259 strlcpy(tmpname, lookup_name, namelen + 1);
12260 lookup_name = tmpname;
12261 } else if (lookup_name != NULL) {
12262 /*
12263 * If the caller provided a lookup_name with a
12264 * zero name length, then we assume it's NUL-
12265 * terminated. Verify it has a valid length.
12266 */
12267 if (strlen(lookup_name) >= PATH_MAX) {
12268 error = EINVAL;
12269 goto out_req_remove;
12270 }
12271 }
12272
12273 /* (See above.) */
12274 kern_ret = send_vfs_resolve_dir_with_audit_token(mach_port,
12275 req.r_req_id,
12276 op | NAMESPACE_HANDLER_NSPACE_EVENT,
12277 lookup_name == NULL ? "" : lookup_name, path, atoken);
12278
12279 if (tmpname != NULL) {
12280 zfree(ZV_NAMEI, tmpname);
12281
12282 /*
12283 * Poison lookup_name rather than reference
12284 * freed memory.
12285 */
12286 lookup_name = NULL;
12287 }
12288 } else {
12289 /* (See above.) */
12290 kern_ret = send_vfs_resolve_file_with_audit_token(mach_port,
12291 req.r_req_id,
12292 op | NAMESPACE_HANDLER_NSPACE_EVENT,
12293 offset, size, path, atoken);
12294 }
12295 if (kern_ret != KERN_SUCCESS) {
12296 /*
12297 * Also treat this like being unable to access the backing
12298 * store server.
12299 */
12300 os_log_error(OS_LOG_DEFAULT, "NSPACE resolve failure: %d",
12301 kern_ret);
12302 error = ETIMEDOUT;
12303 goto out_req_remove;
12304 }
12305
12306 /*
12307 * Give back the memory we allocated earlier while we wait; we
12308 * no longer need it.
12309 */
12310 kfree_data(path, path_alloc_len);
12311 path = NULL;
12312
12313 /*
12314 * Request has been submitted to the resolver. Now (interruptibly)
12315 * wait for completion. Upon requrn, the request will have been
12316 * removed from the lookup table.
12317 */
12318 error = nspace_resolver_req_wait(&req);
12319
12320 out_release_port:
12321 if (path != NULL) {
12322 kfree_data(path, path_alloc_len);
12323 path = NULL;
12324 }
12325 ipc_port_release_send(mach_port);
12326
12327 out_check_errors:
12328 /*
12329 * The file resolver owns the logic about what error to return
12330 * to the caller. We only need to handle a couple of special
12331 * cases here:
12332 */
12333 if (error == EJUSTRETURN) {
12334 /*
12335 * The requesting process is allowed to interact with
12336 * dataless objects. Make a couple of sanity-checks
12337 * here to ensure the action makes sense.
12338 */
12339 switch (op) {
12340 case NAMESPACE_HANDLER_WRITE_OP:
12341 case NAMESPACE_HANDLER_TRUNCATE_OP:
12342 case NAMESPACE_HANDLER_RENAME_OP:
12343 /*
12344 * This handles the case of the resolver itself
12345 * writing data to the file (or throwing it
12346 * away).
12347 */
12348 error = 0;
12349 break;
12350 case NAMESPACE_HANDLER_READ_OP:
12351 case NAMESPACE_HANDLER_LOOKUP_OP:
12352 /*
12353 * This handles the case of the resolver needing
12354 * to look up inside of a dataless directory while
12355 * it's in the process of materializing it (for
12356 * example, creating files or directories).
12357 */
12358 error = (vp_vtype == VDIR) ? 0 : EBADF;
12359 break;
12360 default:
12361 error = EBADF;
12362 break;
12363 }
12364 }
12365
12366 return error;
12367
12368 out_req_remove:
12369 nspace_resolver_req_remove(&req);
12370 goto out_release_port;
12371 #else
12372 return ENOTSUP;
12373 #endif /* CONFIG_DATALESS_FILES */
12374 }
12375
12376 /*
12377 * vfs_materialize_file: Materialize a regular file.
12378 *
12379 * Inputs:
12380 * vp The dataless file to be materialized.
12381 *
12382 * op What kind of operation is being performed:
12383 * -> NAMESPACE_HANDLER_READ_OP
12384 * -> NAMESPACE_HANDLER_WRITE_OP
12385 * -> NAMESPACE_HANDLER_LINK_CREATE
12386 * -> NAMESPACE_HANDLER_DELETE_OP
12387 * -> NAMESPACE_HANDLER_TRUNCATE_OP
12388 * -> NAMESPACE_HANDLER_RENAME_OP
12389 *
12390 * offset offset of I/O for READ or WRITE. Ignored for
12391 * other ops.
12392 *
12393 * size size of I/O for READ or WRITE Ignored for
12394 * other ops.
12395 *
12396 * If offset or size are -1 for a READ or WRITE, then the resolver should
12397 * consider the range to be unknown.
12398 *
12399 * Upon successful return, the caller may proceed with the operation.
12400 * N.B. the file may still be "dataless" in this case.
12401 */
12402 int
vfs_materialize_file(struct vnode * vp,uint64_t op,int64_t offset,int64_t size)12403 vfs_materialize_file(
12404 struct vnode *vp,
12405 uint64_t op,
12406 int64_t offset,
12407 int64_t size)
12408 {
12409 if (vp->v_type != VREG) {
12410 return EFTYPE;
12411 }
12412 return vfs_materialize_item(vp, (uint32_t)op, offset, size, NULL, 0,
12413 NULL);
12414 }
12415
12416 /*
12417 * vfs_materialize_dir:
12418 *
12419 * Inputs:
12420 * vp The dataless directory to be materialized.
12421 *
12422 * op What kind of operation is being performed:
12423 * -> NAMESPACE_HANDLER_READ_OP
12424 * -> NAMESPACE_HANDLER_WRITE_OP
12425 * -> NAMESPACE_HANDLER_DELETE_OP
12426 * -> NAMESPACE_HANDLER_RENAME_OP
12427 * -> NAMESPACE_HANDLER_LOOKUP_OP
12428 *
12429 * lookup_name Name being looked up for a LOOKUP op. Ignored for
12430 * other ops. May or may not be NUL-terminated; see below.
12431 *
12432 * namelen If non-zero, then lookup_name is assumed to not be NUL-
12433 * terminated and namelen is the number of valid bytes in
12434 * lookup_name. If zero, then lookup_name is assumed to be
12435 * NUL-terminated.
12436 *
12437 * Upon successful return, the caller may proceed with the operation.
12438 * N.B. the directory may still be "dataless" in this case.
12439 */
12440 int
vfs_materialize_dir(struct vnode * vp,uint64_t op,char * lookup_name,size_t namelen)12441 vfs_materialize_dir(
12442 struct vnode *vp,
12443 uint64_t op,
12444 char *lookup_name,
12445 size_t namelen)
12446 {
12447 if (vp->v_type != VDIR) {
12448 return EFTYPE;
12449 }
12450 if (op == NAMESPACE_HANDLER_LOOKUP_OP && lookup_name == NULL) {
12451 return EINVAL;
12452 }
12453 return vfs_materialize_item(vp, (uint32_t)op, 0, 0, lookup_name,
12454 namelen, NULL);
12455 }
12456
12457 /*
12458 * vfs_materialize_reparent:
12459 *
12460 * Inputs:
12461 * vp The dataless file or directory to be materialized.
12462 *
12463 * tdvp The new parent directory for the dataless file.
12464 *
12465 * Upon successful return, the caller may proceed with the operation.
12466 * N.B. the item may still be "dataless" in this case.
12467 */
12468 int
vfs_materialize_reparent(vnode_t vp,vnode_t tdvp)12469 vfs_materialize_reparent(vnode_t vp, vnode_t tdvp)
12470 {
12471 if (vp->v_type != VDIR && vp->v_type != VREG) {
12472 return EFTYPE;
12473 }
12474 return vfs_materialize_item(vp, NAMESPACE_HANDLER_RENAME_OP,
12475 0, 0, NULL, 0, tdvp);
12476 }
12477
#if 0
/*
 * Compiled out: legacy helper that rendered a /.vol/<fsid>/<fileid>
 * ("volfs") path for a vnode.  Retained for reference only; nothing in
 * this file calls it.
 */
static int
build_volfs_path(struct vnode *vp, char *path, int *len)
{
	struct vnode_attr va;
	int ret;

	VATTR_INIT(&va);
	VATTR_WANTED(&va, va_fsid);
	VATTR_WANTED(&va, va_fileid);

	if (vnode_getattr(vp, &va, vfs_context_kernel()) != 0) {
		*len = snprintf(path, *len, "/non/existent/path/because/vnode_getattr/failed") + 1;
		ret = -1;
	} else {
		*len = snprintf(path, *len, "/.vol/%d/%lld", (dev_t)va.va_fsid, va.va_fileid) + 1;
		ret = 0;
	}

	return ret;
}
#endif
12500
12501 static unsigned long
fsctl_bogus_command_compat(unsigned long cmd)12502 fsctl_bogus_command_compat(unsigned long cmd)
12503 {
12504 switch (cmd) {
12505 case IOCBASECMD(FSIOC_SYNC_VOLUME):
12506 return FSIOC_SYNC_VOLUME;
12507 case IOCBASECMD(FSIOC_ROUTEFS_SETROUTEID):
12508 return FSIOC_ROUTEFS_SETROUTEID;
12509 case IOCBASECMD(FSIOC_SET_PACKAGE_EXTS):
12510 return FSIOC_SET_PACKAGE_EXTS;
12511 case IOCBASECMD(FSIOC_SET_FSTYPENAME_OVERRIDE):
12512 return FSIOC_SET_FSTYPENAME_OVERRIDE;
12513 case IOCBASECMD(DISK_CONDITIONER_IOC_GET):
12514 return DISK_CONDITIONER_IOC_GET;
12515 case IOCBASECMD(DISK_CONDITIONER_IOC_SET):
12516 return DISK_CONDITIONER_IOC_SET;
12517 case IOCBASECMD(FSIOC_FIOSEEKHOLE):
12518 return FSIOC_FIOSEEKHOLE;
12519 case IOCBASECMD(FSIOC_FIOSEEKDATA):
12520 return FSIOC_FIOSEEKDATA;
12521 case IOCBASECMD(SPOTLIGHT_IOC_GET_MOUNT_TIME):
12522 return SPOTLIGHT_IOC_GET_MOUNT_TIME;
12523 case IOCBASECMD(SPOTLIGHT_IOC_GET_LAST_MTIME):
12524 return SPOTLIGHT_IOC_GET_LAST_MTIME;
12525 }
12526
12527 return cmd;
12528 }
12529
/*
 * chflags0() setattr callback for FSIOC_CAS_BSDFLAGS: forwards the
 * compare-and-swap request (arg points at a struct fsioc_cas_bsdflags)
 * to the filesystem via VNOP_IOCTL.
 */
static int
cas_bsdflags_setattr(vnode_t vp, void *arg, vfs_context_t ctx)
{
	return VNOP_IOCTL(vp, FSIOC_CAS_BSDFLAGS, arg, FWRITE, ctx);
}
12535
12536 static int __attribute__((noinline))
handle_sync_volume(vnode_t vp,vnode_t * arg_vp,caddr_t data,vfs_context_t ctx)12537 handle_sync_volume(vnode_t vp, vnode_t *arg_vp, caddr_t data, vfs_context_t ctx)
12538 {
12539 struct vfs_attr vfa;
12540 mount_t mp = vp->v_mount;
12541 unsigned arg;
12542 int error;
12543
12544 /* record vid of vp so we can drop it below. */
12545 uint32_t vvid = vp->v_id;
12546
12547 /*
12548 * Then grab mount_iterref so that we can release the vnode.
12549 * Without this, a thread may call vnode_iterate_prepare then
12550 * get into a deadlock because we've never released the root vp
12551 */
12552 error = mount_iterref(mp, 0);
12553 if (error) {
12554 return error;
12555 }
12556 vnode_hold(vp);
12557 vnode_put(vp);
12558
12559 arg = MNT_NOWAIT;
12560 if (*(uint32_t*)data & FSCTL_SYNC_WAIT) {
12561 arg = MNT_WAIT;
12562 }
12563
12564 /*
12565 * If the filessytem supports multiple filesytems in a
12566 * partition (For eg APFS volumes in a container, it knows
12567 * that the waitfor argument to VFS_SYNC are flags.
12568 */
12569 VFSATTR_INIT(&vfa);
12570 VFSATTR_WANTED(&vfa, f_capabilities);
12571 if ((vfs_getattr(mp, &vfa, vfs_context_current()) == 0) &&
12572 VFSATTR_IS_SUPPORTED(&vfa, f_capabilities) &&
12573 ((vfa.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_SHARED_SPACE)) &&
12574 ((vfa.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_SHARED_SPACE))) {
12575 arg |= MNT_VOLUME;
12576 }
12577
12578 /* issue the sync for this volume */
12579 (void)sync_callback(mp, &arg);
12580
12581 /*
12582 * Then release the mount_iterref once we're done syncing; it's not
12583 * needed for the VNOP_IOCTL below
12584 */
12585 mount_iterdrop(mp);
12586
12587 if (arg & FSCTL_SYNC_FULLSYNC) {
12588 /* re-obtain vnode iocount on the root vp, if possible */
12589 error = vnode_getwithvid(vp, vvid);
12590 if (error == 0) {
12591 error = VNOP_IOCTL(vp, F_FULLFSYNC, (caddr_t)NULL, 0, ctx);
12592 vnode_put(vp);
12593 }
12594 }
12595 vnode_drop(vp);
12596 /* mark the argument VP as having been released */
12597 *arg_vp = NULL;
12598 return error;
12599 }
12600
#if ROUTEFS
/*
 * FSIOC_ROUTEFS_SETROUTEID handler: copy the route path in from user
 * space and mount routefs there.  Superuser only.
 */
static int __attribute__((noinline))
handle_routes(user_addr_t udata)
{
	char routepath[MAXPATHLEN] = { 0 };
	size_t copied = 0;
	int error;

	error = suser(kauth_cred_get(), &(current_proc()->p_acflag));
	if (error != 0) {
		return error;
	}
	error = copyinstr(udata, &routepath[0], MAXPATHLEN, &copied);
	if (error != 0) {
		return error;
	}
	return routefs_kernel_mount(routepath);
}
#endif
12621
12622 static int __attribute__((noinline))
handle_flags(vnode_t vp,caddr_t data,vfs_context_t ctx)12623 handle_flags(vnode_t vp, caddr_t data, vfs_context_t ctx)
12624 {
12625 struct fsioc_cas_bsdflags *cas = (struct fsioc_cas_bsdflags *)data;
12626 struct vnode_attr va;
12627 int error;
12628
12629 VATTR_INIT(&va);
12630 VATTR_SET(&va, va_flags, cas->new_flags);
12631
12632 error = chflags0(vp, &va, cas_bsdflags_setattr, cas, ctx);
12633
12634 #if CONFIG_FSE
12635 if (error == 0 && cas->expected_flags == cas->actual_flags && need_fsevent(FSE_STAT_CHANGED, vp)) {
12636 add_fsevent(FSE_STAT_CHANGED, ctx, FSE_ARG_VNODE, vp, FSE_ARG_DONE);
12637 }
12638 #endif
12639
12640 return error;
12641 }
12642
12643 static int __attribute__((noinline))
handle_auth(vnode_t vp,u_long cmd,caddr_t data,u_long options,vfs_context_t ctx)12644 handle_auth(vnode_t vp, u_long cmd, caddr_t data, u_long options, vfs_context_t ctx)
12645 {
12646 struct mount *mp = NULL;
12647 errno_t rootauth = 0;
12648
12649 mp = vp->v_mount;
12650
12651 /*
12652 * query the underlying FS and see if it reports something
12653 * sane for this vnode. If volume is authenticated via
12654 * chunklist, leave that for the caller to determine.
12655 */
12656 rootauth = VNOP_IOCTL(vp, cmd, data, (int)options, ctx);
12657
12658 return rootauth;
12659 }
12660
12661 #define SET_PACKAGE_EXTENSION_ENTITLEMENT \
12662 "com.apple.private.kernel.set-package-extensions"
12663
12664 /*
12665 * Make a filesystem-specific control call:
12666 */
12667 /* ARGSUSED */
12668 static int
fsctl_internal(proc_t p,vnode_t * arg_vp,u_long cmd,user_addr_t udata,u_long options,vfs_context_t ctx)12669 fsctl_internal(proc_t p, vnode_t *arg_vp, u_long cmd, user_addr_t udata, u_long options, vfs_context_t ctx)
12670 {
12671 int error = 0;
12672 boolean_t is64bit;
12673 u_int size;
12674 #define STK_PARAMS 128
12675 char stkbuf[STK_PARAMS] = {0};
12676 caddr_t data, memp;
12677 vnode_t vp = *arg_vp;
12678
12679 if (vp->v_type == VCHR || vp->v_type == VBLK) {
12680 return ENOTTY;
12681 }
12682
12683 cmd = fsctl_bogus_command_compat(cmd);
12684
12685 size = IOCPARM_LEN(cmd);
12686 if (size > IOCPARM_MAX) {
12687 return EINVAL;
12688 }
12689
12690 is64bit = proc_is64bit(p);
12691
12692 memp = NULL;
12693
12694 if (size > sizeof(stkbuf)) {
12695 if ((memp = (caddr_t)kalloc_data(size, Z_WAITOK)) == 0) {
12696 return ENOMEM;
12697 }
12698 data = memp;
12699 } else {
12700 data = &stkbuf[0];
12701 };
12702
12703 if (cmd & IOC_IN) {
12704 if (size) {
12705 error = copyin(udata, data, size);
12706 if (error) {
12707 if (memp) {
12708 kfree_data(memp, size);
12709 }
12710 return error;
12711 }
12712 } else {
12713 if (is64bit) {
12714 *(user_addr_t *)data = udata;
12715 } else {
12716 *(uint32_t *)data = (uint32_t)udata;
12717 }
12718 };
12719 } else if ((cmd & IOC_OUT) && size) {
12720 /*
12721 * Zero the buffer so the user always
12722 * gets back something deterministic.
12723 */
12724 bzero(data, size);
12725 } else if (cmd & IOC_VOID) {
12726 if (is64bit) {
12727 *(user_addr_t *)data = udata;
12728 } else {
12729 *(uint32_t *)data = (uint32_t)udata;
12730 }
12731 }
12732
12733 /* Check to see if it's a generic command */
12734 switch (cmd) {
12735 case FSIOC_SYNC_VOLUME:
12736 error = handle_sync_volume(vp, arg_vp, data, ctx);
12737 break;
12738
12739 case FSIOC_ROUTEFS_SETROUTEID:
12740 #if ROUTEFS
12741 error = handle_routes(udata);
12742 #endif
12743 break;
12744
12745 case FSIOC_SET_PACKAGE_EXTS: {
12746 user_addr_t ext_strings;
12747 uint32_t num_entries;
12748 uint32_t max_width;
12749
12750 if (!IOTaskHasEntitlement(vfs_context_task(ctx),
12751 SET_PACKAGE_EXTENSION_ENTITLEMENT)) {
12752 error = EPERM;
12753 break;
12754 }
12755
12756 if ((is64bit && size != sizeof(user64_package_ext_info))
12757 || (is64bit == 0 && size != sizeof(user32_package_ext_info))) {
12758 // either you're 64-bit and passed a 64-bit struct or
12759 // you're 32-bit and passed a 32-bit struct. otherwise
12760 // it's not ok.
12761 error = EINVAL;
12762 break;
12763 }
12764
12765 if (is64bit) {
12766 if (sizeof(user64_addr_t) > sizeof(user_addr_t)) {
12767 assert(((user64_package_ext_info *)data)->strings <= UINT32_MAX);
12768 }
12769 ext_strings = (user_addr_t)((user64_package_ext_info *)data)->strings;
12770 num_entries = ((user64_package_ext_info *)data)->num_entries;
12771 max_width = ((user64_package_ext_info *)data)->max_width;
12772 } else {
12773 ext_strings = CAST_USER_ADDR_T(((user32_package_ext_info *)data)->strings);
12774 num_entries = ((user32_package_ext_info *)data)->num_entries;
12775 max_width = ((user32_package_ext_info *)data)->max_width;
12776 }
12777 error = set_package_extensions_table(ext_strings, num_entries, max_width);
12778 }
12779 break;
12780
12781 case FSIOC_SET_FSTYPENAME_OVERRIDE:
12782 {
12783 mount_t mp;
12784
12785 if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
12786 break;
12787 }
12788 if ((mp = vp->v_mount) != NULL) {
12789 mount_lock(mp);
12790 if (data[0] != 0) {
12791 for (int i = 0; i < MFSTYPENAMELEN; i++) {
12792 if (!data[i]) {
12793 goto continue_copy;
12794 }
12795 }
12796 /*
12797 * Getting here means we have a user data
12798 * string which has no NULL termination in
12799 * its first MFSTYPENAMELEN bytes. This is
12800 * bogus, let's avoid strlcpy-ing the read
12801 * data and return an error.
12802 */
12803 error = EINVAL;
12804 goto unlock;
12805 continue_copy:
12806 vfs_setfstypename_locked(mp, data);
12807 if (vfs_isrdonly(mp) &&
12808 strcmp(data, "mtmfs") == 0) {
12809 mp->mnt_kern_flag |=
12810 MNTK_EXTENDED_SECURITY;
12811 mp->mnt_kern_flag &=
12812 ~MNTK_AUTH_OPAQUE;
12813 }
12814 } else if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
12815 const char *name =
12816 vfs_getfstypenameref_locked(mp, NULL);
12817 if (strcmp(name, "mtmfs") == 0) {
12818 mp->mnt_kern_flag &=
12819 ~MNTK_EXTENDED_SECURITY;
12820 }
12821 vfs_setfstypename_locked(mp, NULL);
12822 }
12823 unlock:
12824 mount_unlock(mp);
12825 }
12826 }
12827 break;
12828
12829 case DISK_CONDITIONER_IOC_GET: {
12830 error = disk_conditioner_get_info(vp->v_mount, (disk_conditioner_info *)data);
12831 }
12832 break;
12833
12834 case DISK_CONDITIONER_IOC_SET: {
12835 error = disk_conditioner_set_info(vp->v_mount, (disk_conditioner_info *)data);
12836 }
12837 break;
12838
12839 case FSIOC_CAS_BSDFLAGS:
12840 error = handle_flags(vp, data, ctx);
12841 break;
12842
12843 case FSIOC_FD_ONLY_OPEN_ONCE: {
12844 error = 0;
12845 if (vnode_usecount(vp) > 1) {
12846 vnode_lock_spin(vp);
12847 if (vp->v_lflag & VL_HASSTREAMS) {
12848 if (vnode_isinuse_locked(vp, 1, 1)) {
12849 error = EBUSY;
12850 }
12851 } else if (vnode_usecount(vp) > 1) {
12852 error = EBUSY;
12853 }
12854 vnode_unlock(vp);
12855 }
12856 }
12857 break;
12858
12859 case FSIOC_EVAL_ROOTAUTH:
12860 error = handle_auth(vp, cmd, data, options, ctx);
12861 break;
12862
12863 case FSIOC_TEST_FSE_ACCESS_GRANTED:
12864 error = test_fse_access_granted(vp, (unsigned long)udata, ctx);
12865 break;
12866
12867 #if CONFIG_EXCLAVES
12868 case FSIOC_EXCLAVE_FS_REGISTER:
12869 if (IOTaskHasEntitlement(vfs_context_task(ctx), EXCLAVE_FS_REGISTER_ENTITLEMENT)) {
12870 error = vfs_exclave_fs_register(((fsioc_exclave_fs_register_t *)data)->fs_tag, vp);
12871 } else {
12872 error = EPERM;
12873 }
12874 break;
12875
12876 case FSIOC_EXCLAVE_FS_UNREGISTER:
12877 if (IOTaskHasEntitlement(vfs_context_task(ctx), EXCLAVE_FS_REGISTER_ENTITLEMENT)) {
12878 error = vfs_exclave_fs_unregister(vp);
12879 } else {
12880 error = EPERM;
12881 }
12882 break;
12883
12884 case FSIOC_EXCLAVE_FS_GET_BASE_DIRS: {
12885 exclave_fs_get_base_dirs_t *get_base_dirs = ((exclave_fs_get_base_dirs_t *)data);
12886 exclave_fs_base_dir_t *dirs = NULL;
12887 if (!IOTaskHasEntitlement(vfs_context_task(ctx), EXCLAVE_FS_REGISTER_ENTITLEMENT)) {
12888 error = EPERM;
12889 break;
12890 }
12891 if (get_base_dirs->base_dirs) {
12892 if ((get_base_dirs->count == 0) || (get_base_dirs->count > EXCLAVE_FS_GET_BASE_DIRS_MAX_COUNT)) {
12893 error = EINVAL;
12894 break;
12895 }
12896 dirs = kalloc_type(exclave_fs_base_dir_t, get_base_dirs->count, Z_WAITOK | Z_ZERO);
12897 if (!dirs) {
12898 error = ENOSPC;
12899 break;
12900 }
12901 }
12902 error = vfs_exclave_fs_get_base_dirs(dirs, &get_base_dirs->count);
12903 if (!error && dirs) {
12904 error = copyout(dirs, (user_addr_t)get_base_dirs->base_dirs,
12905 get_base_dirs->count * sizeof(exclave_fs_base_dir_t));
12906 }
12907 if (dirs) {
12908 kfree_type(exclave_fs_base_dir_t, get_base_dirs->count, dirs);
12909 }
12910 }
12911 break;
12912 #endif
12913
12914 default: {
12915 /*
12916 * Other, known commands shouldn't be passed down here.
12917 * (When adding a selector to this list, it may be prudent
12918 * to consider adding it to the list in sys_fcntl_nocancel() as well.)
12919 */
12920 switch (cmd) {
12921 case F_PUNCHHOLE:
12922 case F_TRIM_ACTIVE_FILE:
12923 case F_RDADVISE:
12924 case F_TRANSCODEKEY:
12925 case F_GETPROTECTIONLEVEL:
12926 case F_GETDEFAULTPROTLEVEL:
12927 case F_MAKECOMPRESSED:
12928 case F_SET_GREEDY_MODE:
12929 case F_SETSTATICCONTENT:
12930 case F_SETIOTYPE:
12931 case F_SETBACKINGSTORE:
12932 case F_GETPATH_MTMINFO:
12933 case APFSIOC_REVERT_TO_SNAPSHOT:
12934 case FSIOC_FIOSEEKHOLE:
12935 case FSIOC_FIOSEEKDATA:
12936 case HFS_GET_BOOT_INFO:
12937 case HFS_SET_BOOT_INFO:
12938 case FIOPINSWAP:
12939 case F_CHKCLEAN:
12940 case F_FULLFSYNC:
12941 case F_BARRIERFSYNC:
12942 case F_FREEZE_FS:
12943 case F_THAW_FS:
12944 case FSIOC_KERNEL_ROOTAUTH:
12945 case FSIOC_GRAFT_FS:
12946 case FSIOC_UNGRAFT_FS:
12947 case FSIOC_AUTH_FS:
12948 error = EINVAL;
12949 goto outdrop;
12950 }
12951 /* Invoke the filesystem-specific code */
12952 error = VNOP_IOCTL(vp, cmd, data, (int)options, ctx);
12953 }
12954 } /* end switch stmt */
12955
12956 /*
12957 * if no errors, copy any data to user. Size was
12958 * already set and checked above.
12959 */
12960 if (error == 0 && (cmd & IOC_OUT) && size) {
12961 error = copyout(data, udata, size);
12962 }
12963
12964 outdrop:
12965 if (memp) {
12966 kfree_data(memp, size);
12967 }
12968
12969 return error;
12970 }
12971
/* ARGSUSED */
/*
 * fsctl: perform a filesystem-specific control operation on the object
 * named by uap->path.
 *
 * Resolves the path to a vnode (following symlinks unless the caller set
 * FSOPT_NOFOLLOW), runs the MAC fsctl policy check, and dispatches to
 * fsctl_internal() for the actual command handling.
 *
 * Returns: 0 on success, otherwise an errno value.
 */
int
fsctl(proc_t p, struct fsctl_args *uap, __unused int32_t *retval)
{
	int error;
	struct nameidata nd;
	uint32_t nameiflags;
	vnode_t vp = NULL;
	vfs_context_t ctx = vfs_context_current();

	AUDIT_ARG(cmd, (int)uap->cmd);
	AUDIT_ARG(value32, uap->options);
	/* Get the vnode for the file we are getting info on: */
	nameiflags = 0;
	//
	// if we come through fsctl() then the file is by definition not open.
	// therefore for the FSIOC_FD_ONLY_OPEN_ONCE selector we return an error
	// lest the caller mistakenly thinks the only open is their own (but in
	// reality it's someone else's).
	//
	if (uap->cmd == FSIOC_FD_ONLY_OPEN_ONCE) {
		return EINVAL;
	}
	if ((uap->options & FSOPT_NOFOLLOW) == 0) {
		nameiflags |= FOLLOW;
	}
	if (uap->cmd == FSIOC_FIRMLINK_CTL) {
		/* firmlink control must act on the firmlink itself, bypassing the name cache */
		nameiflags |= (CN_FIRMLINK_NOFOLLOW | NOCACHE);
	}
	NDINIT(&nd, LOOKUP, OP_FSCTL, nameiflags | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	if ((error = namei(&nd))) {
		goto done;
	}
	vp = nd.ni_vp;
	nameidone(&nd);

#if CONFIG_MACF
	error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd);
	if (error) {
		goto done;
	}
#endif

	error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);

done:
	/* fsctl_internal() may drop the iocount and NULL out vp; only put if still held */
	if (vp) {
		vnode_put(vp);
	}
	return error;
}
/* ARGSUSED */
/*
 * ffsctl: perform a filesystem-specific control operation on an open
 * file descriptor.
 *
 * Takes an iocount on the fd's vnode, runs the MAC fsctl policy check,
 * then dispatches to fsctl_internal().  The file reference is held
 * across the call and dropped before returning.
 *
 * Returns: 0 on success, otherwise an errno value.
 */
int
ffsctl(proc_t p, struct ffsctl_args *uap, __unused int32_t *retval)
{
	int error;
	vnode_t vp = NULL;
	vfs_context_t ctx = vfs_context_current();
	int fd = -1;

	AUDIT_ARG(fd, uap->fd);
	AUDIT_ARG(cmd, (int)uap->cmd);
	AUDIT_ARG(value32, uap->options);

	/* Get the vnode for the file we are getting info on: */
	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}
	fd = uap->fd;
	if ((error = vnode_getwithref(vp))) {
		file_drop(fd);
		return error;
	}

#if CONFIG_MACF
	if ((error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd))) {
		file_drop(fd);
		vnode_put(vp);
		return error;
	}
#endif

	error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);

	file_drop(fd);

	/* validate vp; fsctl_internal() can drop iocount and reset vp to NULL */
	if (vp) {
		vnode_put(vp);
	}

	return error;
}
13066 /* end of fsctl system call */
13067
13068 #define FILESEC_ACCESS_ENTITLEMENT \
13069 "com.apple.private.vfs.filesec-access"
13070
13071 static int
xattr_entitlement_check(const char * attrname,vfs_context_t ctx,bool setting)13072 xattr_entitlement_check(const char *attrname, vfs_context_t ctx, bool setting)
13073 {
13074 if (strcmp(attrname, KAUTH_FILESEC_XATTR) == 0) {
13075 /*
13076 * get: root and tasks with FILESEC_ACCESS_ENTITLEMENT.
13077 * set: only tasks with FILESEC_ACCESS_ENTITLEMENT.
13078 */
13079 if ((!setting && vfs_context_issuser(ctx)) ||
13080 IOTaskHasEntitlement(vfs_context_task(ctx),
13081 FILESEC_ACCESS_ENTITLEMENT)) {
13082 return 0;
13083 }
13084 }
13085
13086 return EPERM;
13087 }
13088
/*
 * Retrieve the data of an extended attribute.
 *
 * getxattr() system call: look up uap->path (following symlinks unless
 * XATTR_NOFOLLOW was given), copy in the attribute name, and read the
 * attribute via vn_getxattr().  With no destination buffer (or with the
 * legacy "size == -1" form) only the attribute's size is returned.
 */
int
getxattr(proc_t p, struct getxattr_args *uap, user_ssize_t *retval)
{
	vnode_t vp;
	struct nameidata nd;
	char attrname[XATTR_MAXNAMELEN + 1];
	vfs_context_t ctx = vfs_context_current();
	uio_t auio = NULL;
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	size_t attrsize = 0;
	size_t namelen;
	u_int32_t nameiflags;
	int error;
	UIO_STACKBUF(uio_buf, 1);

	/* these option bits are kernel-internal; reject them from user space */
	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_GETXATTR, nameiflags, spacetype, uap->path, ctx);
	if ((error = namei(&nd))) {
		return error;
	}
	vp = nd.ni_vp;
	nameidone(&nd);

	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
	if (error != 0) {
		goto out;
	}
	/* protected attributes require root or the filesec entitlement to read */
	if (xattr_protected(attrname) &&
	    (error = xattr_entitlement_check(attrname, ctx, false)) != 0) {
		goto out;
	}
	/*
	 * the specific check for 0xffffffff is a hack to preserve
	 * binary compatibility in K64 with applications that discovered
	 * that passing in a buf pointer and a size of -1 resulted in
	 * just the size of the indicated extended attribute being returned.
	 * this isn't part of the documented behavior, but because of the
	 * original implementation's check for "uap->size > 0", this behavior
	 * was allowed. In K32 that check turned into a signed comparison
	 * even though uap->size is unsigned... in K64, we blow by that
	 * check because uap->size is unsigned and doesn't get sign smeared
	 * in the munger for a 32 bit user app. we also need to add a
	 * check to limit the maximum size of the buffer being passed in...
	 * unfortunately, the underlying filesystems seem to just malloc
	 * the requested size even if the actual extended attribute is tiny.
	 * because that malloc is for kernel wired memory, we have to put a
	 * sane limit on it.
	 *
	 * U32 running on K64 will yield 0x00000000ffffffff for uap->size
	 * U64 running on K64 will yield -1 (64 bits wide)
	 * U32/U64 running on K32 will yield -1 (32 bits wide)
	 */
	if (uap->size == 0xffffffff || uap->size == (size_t)-1) {
		goto no_uio;
	}

	if (uap->value) {
		/* clamp the size to bound the kernel-wired allocation made by the FS */
		if (uap->size > (size_t)XATTR_MAXSIZE) {
			uap->size = XATTR_MAXSIZE;
		}

		auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
		    &uio_buf[0], sizeof(uio_buf));
		uio_addiov(auio, uap->value, uap->size);
	}
no_uio:
	error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, ctx);
out:
	vnode_put(vp);

	/* with a uio, report bytes transferred; otherwise the attribute's size */
	if (auio) {
		*retval = uap->size - uio_resid(auio);
	} else {
		*retval = (user_ssize_t)attrsize;
	}

	return error;
}
13174
13175 /*
13176 * Retrieve the data of an extended attribute.
13177 */
13178 int
fgetxattr(proc_t p,struct fgetxattr_args * uap,user_ssize_t * retval)13179 fgetxattr(proc_t p, struct fgetxattr_args *uap, user_ssize_t *retval)
13180 {
13181 vnode_t vp;
13182 char attrname[XATTR_MAXNAMELEN + 1];
13183 vfs_context_t ctx = vfs_context_current();
13184 uio_t auio = NULL;
13185 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
13186 size_t attrsize = 0;
13187 size_t namelen;
13188 int error;
13189 UIO_STACKBUF(uio_buf, 1);
13190
13191 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
13192 return EINVAL;
13193 }
13194
13195 if ((error = file_vnode(uap->fd, &vp))) {
13196 return error;
13197 }
13198 if ((error = vnode_getwithref(vp))) {
13199 file_drop(uap->fd);
13200 return error;
13201 }
13202 error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
13203 if (error != 0) {
13204 goto out;
13205 }
13206 if (xattr_protected(attrname) &&
13207 (error = xattr_entitlement_check(attrname, ctx, false)) != 0) {
13208 goto out;
13209 }
13210 if (uap->value && uap->size > 0) {
13211 if (uap->size > (size_t)XATTR_MAXSIZE) {
13212 uap->size = XATTR_MAXSIZE;
13213 }
13214
13215 auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
13216 &uio_buf[0], sizeof(uio_buf));
13217 uio_addiov(auio, uap->value, uap->size);
13218 }
13219
13220 error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, vfs_context_current());
13221 out:
13222 (void)vnode_put(vp);
13223 file_drop(uap->fd);
13224
13225 if (auio) {
13226 *retval = uap->size - uio_resid(auio);
13227 } else {
13228 *retval = (user_ssize_t)attrsize;
13229 }
13230 return error;
13231 }
13232
/*
 * Heap-allocated working state for setxattr().  (The nameidata plus the
 * attribute name and uio backing store are too large to keep on the
 * kernel stack, so setxattr() kalloc's one of these per call.)
 */
struct setxattr_ctx {
	struct nameidata nd;                  /* path lookup state */
	char attrname[XATTR_MAXNAMELEN + 1];  /* NUL-terminated attribute name */
	UIO_STACKBUF(uio_buf, 1);             /* backing store for the value uio */
};
13239
/*
 * Set the data of an extended attribute.
 *
 * setxattr() system call: copies in the attribute name, looks up
 * uap->path (following symlinks unless XATTR_NOFOLLOW), breaks any
 * directory lease on the parent, and writes the attribute value via
 * vn_setxattr().  Working state lives in a heap-allocated
 * struct setxattr_ctx to keep the kernel stack small.
 */
int
setxattr(proc_t p, struct setxattr_args *uap, int *retval)
{
	vnode_t vp;
	vfs_context_t ctx = vfs_context_current();
	uio_t auio = NULL;
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	size_t namelen;
	u_int32_t nameiflags;
	int error;
	struct setxattr_ctx *sactx;

	/* these option bits are kernel-internal; reject them from user space */
	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	sactx = kalloc_type(struct setxattr_ctx, Z_WAITOK);
	if (sactx == NULL) {
		return ENOMEM;
	}

	error = copyinstr(uap->attrname, sactx->attrname, sizeof(sactx->attrname), &namelen);
	if (error != 0) {
		if (error == EPERM) {
			/* if the string won't fit in attrname, copyinstr emits EPERM */
			error = ENAMETOOLONG;
		}
		/* Otherwise return the default error from copyinstr to detect ERANGE, etc */
		goto out;
	}
	/* protected attributes require the filesec entitlement to write */
	if (xattr_protected(sactx->attrname) &&
	    (error = xattr_entitlement_check(sactx->attrname, ctx, true)) != 0) {
		goto out;
	}
	if (uap->size != 0 && uap->value == 0) {
		error = EINVAL;
		goto out;
	}
	if (uap->size > INT_MAX) {
		error = E2BIG;
		goto out;
	}

	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
#if CONFIG_FILE_LEASES
	/* need the parent directory so its lease can be broken below */
	nameiflags |= WANTPARENT;
#endif
	NDINIT(&sactx->nd, LOOKUP, OP_SETXATTR, nameiflags, spacetype, uap->path, ctx);
	if ((error = namei(&sactx->nd))) {
		goto out;
	}
	vp = sactx->nd.ni_vp;
#if CONFIG_FILE_LEASES
	vnode_breakdirlease(sactx->nd.ni_dvp, false, O_WRONLY);
	vnode_put(sactx->nd.ni_dvp);
#endif
	nameidone(&sactx->nd);

	auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
	    &sactx->uio_buf[0], sizeof(sactx->uio_buf));
	uio_addiov(auio, uap->value, uap->size);

	error = vn_setxattr(vp, sactx->attrname, auio, uap->options, ctx);
#if CONFIG_FSE
	if (error == 0) {
		add_fsevent(FSE_XATTR_MODIFIED, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
	}
#endif
	vnode_put(vp);
out:
	kfree_type(struct setxattr_ctx, sactx);
	*retval = 0;
	return error;
}
13319
13320 /*
13321 * Set the data of an extended attribute.
13322 */
13323 int
fsetxattr(proc_t p,struct fsetxattr_args * uap,int * retval)13324 fsetxattr(proc_t p, struct fsetxattr_args *uap, int *retval)
13325 {
13326 vnode_t vp;
13327 char attrname[XATTR_MAXNAMELEN + 1];
13328 vfs_context_t ctx = vfs_context_current();
13329 uio_t auio = NULL;
13330 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
13331 size_t namelen;
13332 int error;
13333 UIO_STACKBUF(uio_buf, 1);
13334
13335 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
13336 return EINVAL;
13337 }
13338
13339 error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
13340 if (error != 0) {
13341 if (error == EPERM) {
13342 /* if the string won't fit in attrname, copyinstr emits EPERM */
13343 return ENAMETOOLONG;
13344 }
13345 /* Otherwise return the default error from copyinstr to detect ERANGE, etc */
13346 return error;
13347 }
13348 if (xattr_protected(attrname) &&
13349 (error = xattr_entitlement_check(attrname, ctx, true)) != 0) {
13350 return error;
13351 }
13352 if (uap->size != 0 && uap->value == 0) {
13353 return EINVAL;
13354 }
13355 if (uap->size > INT_MAX) {
13356 return E2BIG;
13357 }
13358 if ((error = file_vnode(uap->fd, &vp))) {
13359 return error;
13360 }
13361 if ((error = vnode_getwithref(vp))) {
13362 file_drop(uap->fd);
13363 return error;
13364 }
13365
13366 #if CONFIG_FILE_LEASES
13367 vnode_breakdirlease(vp, true, O_WRONLY);
13368 #endif
13369
13370 auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
13371 &uio_buf[0], sizeof(uio_buf));
13372 uio_addiov(auio, uap->value, uap->size);
13373
13374 error = vn_setxattr(vp, attrname, auio, uap->options, vfs_context_current());
13375 #if CONFIG_FSE
13376 if (error == 0) {
13377 add_fsevent(FSE_XATTR_MODIFIED, ctx,
13378 FSE_ARG_VNODE, vp,
13379 FSE_ARG_DONE);
13380 }
13381 #endif
13382 vnode_put(vp);
13383 file_drop(uap->fd);
13384 *retval = 0;
13385 return error;
13386 }
13387
/*
 * Remove an extended attribute.
 * XXX Code duplication here.
 *
 * removexattr() system call: copies in the attribute name (protected
 * names are flatly refused), looks up uap->path, breaks any directory
 * lease on the parent, and removes the attribute via vn_removexattr().
 */
int
removexattr(proc_t p, struct removexattr_args *uap, int *retval)
{
	vnode_t vp;
	struct nameidata nd;
	char attrname[XATTR_MAXNAMELEN + 1];
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	vfs_context_t ctx = vfs_context_current();
	size_t namelen;
	u_int32_t nameiflags;
	int error;

	/* these option bits are kernel-internal; reject them from user space */
	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
	if (error != 0) {
		return error;
	}
	/* protected attributes can never be removed (no entitlement path here) */
	if (xattr_protected(attrname)) {
		return EPERM;
	}
	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
#if CONFIG_FILE_LEASES
	/* need the parent directory so its lease can be broken below */
	nameiflags |= WANTPARENT;
#endif
	NDINIT(&nd, LOOKUP, OP_REMOVEXATTR, nameiflags, spacetype, uap->path, ctx);
	if ((error = namei(&nd))) {
		return error;
	}
	vp = nd.ni_vp;
#if CONFIG_FILE_LEASES
	vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
	vnode_put(nd.ni_dvp);
#endif
	nameidone(&nd);

	error = vn_removexattr(vp, attrname, uap->options, ctx);
#if CONFIG_FSE
	if (error == 0) {
		add_fsevent(FSE_XATTR_REMOVED, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
	}
#endif
	vnode_put(vp);
	*retval = 0;
	return error;
}
13442
13443 /*
13444 * Remove an extended attribute.
13445 * XXX Code duplication here.
13446 */
13447 int
fremovexattr(__unused proc_t p,struct fremovexattr_args * uap,int * retval)13448 fremovexattr(__unused proc_t p, struct fremovexattr_args *uap, int *retval)
13449 {
13450 vnode_t vp;
13451 char attrname[XATTR_MAXNAMELEN + 1];
13452 size_t namelen;
13453 int error;
13454 #if CONFIG_FSE
13455 vfs_context_t ctx = vfs_context_current();
13456 #endif
13457
13458 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
13459 return EINVAL;
13460 }
13461
13462 error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
13463 if (error != 0) {
13464 return error;
13465 }
13466 if (xattr_protected(attrname)) {
13467 return EPERM;
13468 }
13469 if ((error = file_vnode(uap->fd, &vp))) {
13470 return error;
13471 }
13472 if ((error = vnode_getwithref(vp))) {
13473 file_drop(uap->fd);
13474 return error;
13475 }
13476
13477 #if CONFIG_FILE_LEASES
13478 vnode_breakdirlease(vp, true, O_WRONLY);
13479 #endif
13480
13481 error = vn_removexattr(vp, attrname, uap->options, vfs_context_current());
13482 #if CONFIG_FSE
13483 if (error == 0) {
13484 add_fsevent(FSE_XATTR_REMOVED, ctx,
13485 FSE_ARG_VNODE, vp,
13486 FSE_ARG_DONE);
13487 }
13488 #endif
13489 vnode_put(vp);
13490 file_drop(uap->fd);
13491 *retval = 0;
13492 return error;
13493 }
13494
13495 /*
13496 * Retrieve the list of extended attribute names.
13497 * XXX Code duplication here.
13498 */
13499 int
listxattr(proc_t p,struct listxattr_args * uap,user_ssize_t * retval)13500 listxattr(proc_t p, struct listxattr_args *uap, user_ssize_t *retval)
13501 {
13502 vnode_t vp;
13503 struct nameidata nd;
13504 vfs_context_t ctx = vfs_context_current();
13505 uio_t auio = NULL;
13506 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
13507 size_t attrsize = 0;
13508 u_int32_t nameiflags;
13509 int error;
13510 UIO_STACKBUF(uio_buf, 1);
13511
13512 if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
13513 return EINVAL;
13514 }
13515
13516 nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
13517 NDINIT(&nd, LOOKUP, OP_LISTXATTR, nameiflags, spacetype, uap->path, ctx);
13518 if ((error = namei(&nd))) {
13519 return error;
13520 }
13521 vp = nd.ni_vp;
13522 nameidone(&nd);
13523 if (uap->namebuf != 0 && uap->bufsize > 0) {
13524 auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ,
13525 &uio_buf[0], sizeof(uio_buf));
13526 uio_addiov(auio, uap->namebuf, uap->bufsize);
13527 }
13528
13529 error = vn_listxattr(vp, auio, &attrsize, uap->options, ctx);
13530
13531 vnode_put(vp);
13532 if (auio) {
13533 *retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
13534 } else {
13535 *retval = (user_ssize_t)attrsize;
13536 }
13537 return error;
13538 }
13539
13540 /*
13541 * Retrieve the list of extended attribute names.
13542 * XXX Code duplication here.
13543 */
13544 int
flistxattr(proc_t p,struct flistxattr_args * uap,user_ssize_t * retval)13545 flistxattr(proc_t p, struct flistxattr_args *uap, user_ssize_t *retval)
13546 {
13547 vnode_t vp;
13548 uio_t auio = NULL;
13549 int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
13550 size_t attrsize = 0;
13551 int error;
13552 UIO_STACKBUF(uio_buf, 1);
13553
13554 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
13555 return EINVAL;
13556 }
13557
13558 if ((error = file_vnode(uap->fd, &vp))) {
13559 return error;
13560 }
13561 if ((error = vnode_getwithref(vp))) {
13562 file_drop(uap->fd);
13563 return error;
13564 }
13565 if (uap->namebuf != 0 && uap->bufsize > 0) {
13566 auio = uio_createwithbuffer(1, 0, spacetype,
13567 UIO_READ, &uio_buf[0], sizeof(uio_buf));
13568 uio_addiov(auio, uap->namebuf, uap->bufsize);
13569 }
13570
13571 error = vn_listxattr(vp, auio, &attrsize, uap->options, vfs_context_current());
13572
13573 vnode_put(vp);
13574 file_drop(uap->fd);
13575 if (auio) {
13576 *retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
13577 } else {
13578 *retval = (user_ssize_t)attrsize;
13579 }
13580 return error;
13581 }
13582
13583 int
fsgetpath_internal(vfs_context_t ctx,int volfs_id,uint64_t objid,vm_size_t bufsize,caddr_t buf,uint32_t options,int * pathlen)13584 fsgetpath_internal(vfs_context_t ctx, int volfs_id, uint64_t objid,
13585 vm_size_t bufsize, caddr_t buf, uint32_t options, int *pathlen)
13586 {
13587 int error;
13588 struct mount *mp = NULL;
13589 vnode_t vp;
13590 int length;
13591 int bpflags;
13592 /* maximum number of times to retry build_path */
13593 unsigned int retries = 0x10;
13594
13595 if (bufsize > FSGETPATH_MAXBUFLEN) {
13596 return EINVAL;
13597 }
13598
13599 if (buf == NULL) {
13600 return ENOMEM;
13601 }
13602
13603 retry:
13604 if ((mp = mount_lookupby_volfsid(volfs_id, 1)) == NULL) {
13605 error = ENOTSUP; /* unexpected failure */
13606 return ENOTSUP;
13607 }
13608
13609 #if CONFIG_UNION_MOUNTS
13610 unionget:
13611 #endif /* CONFIG_UNION_MOUNTS */
13612 if (objid == 2) {
13613 struct vfs_attr vfsattr;
13614 int use_vfs_root = TRUE;
13615
13616 VFSATTR_INIT(&vfsattr);
13617 VFSATTR_WANTED(&vfsattr, f_capabilities);
13618 if (!(options & FSOPT_ISREALFSID) &&
13619 vfs_getattr(mp, &vfsattr, vfs_context_kernel()) == 0 &&
13620 VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
13621 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS) &&
13622 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS)) {
13623 use_vfs_root = FALSE;
13624 }
13625 }
13626
13627 if (use_vfs_root) {
13628 error = VFS_ROOT(mp, &vp, ctx);
13629 } else {
13630 error = VFS_VGET(mp, objid, &vp, ctx);
13631 }
13632 } else {
13633 error = VFS_VGET(mp, (ino64_t)objid, &vp, ctx);
13634 }
13635
13636 #if CONFIG_UNION_MOUNTS
13637 if (error == ENOENT && (mp->mnt_flag & MNT_UNION)) {
13638 /*
13639 * If the fileid isn't found and we're in a union
13640 * mount volume, then see if the fileid is in the
13641 * mounted-on volume.
13642 */
13643 struct mount *tmp = mp;
13644 mp = vnode_mount(tmp->mnt_vnodecovered);
13645 vfs_unbusy(tmp);
13646 if (vfs_busy(mp, LK_NOWAIT) == 0) {
13647 goto unionget;
13648 }
13649 } else {
13650 vfs_unbusy(mp);
13651 }
13652 #else
13653 vfs_unbusy(mp);
13654 #endif /* CONFIG_UNION_MOUNTS */
13655
13656 if (error) {
13657 return error;
13658 }
13659
13660 #if CONFIG_MACF
13661 error = mac_vnode_check_fsgetpath(ctx, vp);
13662 if (error) {
13663 vnode_put(vp);
13664 return error;
13665 }
13666 #endif
13667
13668 /* Obtain the absolute path to this vnode. */
13669 bpflags = vfs_context_suser(ctx) ? BUILDPATH_CHECKACCESS : 0;
13670 if (options & FSOPT_NOFIRMLINKPATH) {
13671 bpflags |= BUILDPATH_NO_FIRMLINK;
13672 }
13673 bpflags |= BUILDPATH_CHECK_MOVED;
13674 error = build_path(vp, buf, (int)bufsize, &length, bpflags, ctx);
13675 vnode_put(vp);
13676
13677 if (error) {
13678 /* there was a race building the path, try a few more times */
13679 if (error == EAGAIN) {
13680 --retries;
13681 if (retries > 0) {
13682 goto retry;
13683 }
13684
13685 error = ENOENT;
13686 }
13687 goto out;
13688 }
13689
13690 AUDIT_ARG(text, buf);
13691
13692 if (kdebug_debugid_enabled(VFS_LOOKUP) && length > 0) {
13693 unsigned long path_words[NUMPARMS];
13694 size_t path_len = sizeof(path_words);
13695
13696 if ((size_t)length < path_len) {
13697 memcpy((char *)path_words, buf, length);
13698 memset((char *)path_words + length, 0, path_len - length);
13699
13700 path_len = length;
13701 } else {
13702 memcpy((char *)path_words, buf + (length - path_len), path_len);
13703 }
13704
13705 kdebug_vfs_lookup(path_words, (int)path_len, vp,
13706 KDBG_VFS_LOOKUP_FLAG_LOOKUP);
13707 }
13708
13709 *pathlen = length; /* may be superseded by error */
13710
13711 out:
13712 return error;
13713 }
13714
13715 /*
13716 * Obtain the full pathname of a file system object by id.
13717 */
13718 static int
fsgetpath_extended(user_addr_t buf,user_size_t bufsize,user_addr_t user_fsid,uint64_t objid,uint32_t options,user_ssize_t * retval)13719 fsgetpath_extended(user_addr_t buf, user_size_t bufsize, user_addr_t user_fsid, uint64_t objid,
13720 uint32_t options, user_ssize_t *retval)
13721 {
13722 vfs_context_t ctx = vfs_context_current();
13723 fsid_t fsid;
13724 char *realpath;
13725 int length;
13726 int error;
13727
13728 if (options & ~(FSOPT_NOFIRMLINKPATH | FSOPT_ISREALFSID)) {
13729 return EINVAL;
13730 }
13731
13732 if ((error = copyin(user_fsid, (caddr_t)&fsid, sizeof(fsid)))) {
13733 return error;
13734 }
13735 AUDIT_ARG(value32, fsid.val[0]);
13736 AUDIT_ARG(value64, objid);
13737 /* Restrict output buffer size for now. */
13738
13739 if (bufsize > FSGETPATH_MAXBUFLEN || bufsize <= 0) {
13740 return EINVAL;
13741 }
13742 realpath = kalloc_data(bufsize, Z_WAITOK | Z_ZERO);
13743 if (realpath == NULL) {
13744 return ENOMEM;
13745 }
13746
13747 error = fsgetpath_internal(ctx, fsid.val[0], objid, bufsize, realpath,
13748 options, &length);
13749
13750 if (error) {
13751 goto out;
13752 }
13753
13754 error = copyout((caddr_t)realpath, buf, length);
13755
13756 *retval = (user_ssize_t)length; /* may be superseded by error */
13757 out:
13758 kfree_data(realpath, bufsize);
13759 return error;
13760 }
13761
13762 int
fsgetpath(__unused proc_t p,struct fsgetpath_args * uap,user_ssize_t * retval)13763 fsgetpath(__unused proc_t p, struct fsgetpath_args *uap, user_ssize_t *retval)
13764 {
13765 return fsgetpath_extended(uap->buf, uap->bufsize, uap->fsid, uap->objid,
13766 0, retval);
13767 }
13768
13769 int
fsgetpath_ext(__unused proc_t p,struct fsgetpath_ext_args * uap,user_ssize_t * retval)13770 fsgetpath_ext(__unused proc_t p, struct fsgetpath_ext_args *uap, user_ssize_t *retval)
13771 {
13772 return fsgetpath_extended(uap->buf, uap->bufsize, uap->fsid, uap->objid,
13773 uap->options, retval);
13774 }
13775
/*
 * Common routine to handle various flavors of statfs data heading out
 * to user space.
 *
 * Fills a user32_statfs or user64_statfs from the mount's vfsstatfs and
 * copies it out to bufp.  For 32-bit consumers, block counts that exceed
 * INT_MAX are scaled down by inflating the reported blocksize (and
 * clipped if even that overflows).  When partial_copy is set, the
 * trailing reserved fields are omitted from the copyout.
 *
 * If sizep is non-NULL it receives the full (unabridged) structure size.
 *
 * Returns: 0 Success
 * EFAULT
 */
static int
munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
    user_addr_t bufp, int *sizep, boolean_t is_64_bit,
    boolean_t partial_copy)
{
	int error;
	int my_size, copy_size;

	if (is_64_bit) {
		struct user64_statfs sfs;
		my_size = copy_size = sizeof(sfs);
		bzero(&sfs, my_size);
		sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
		sfs.f_type = (short)mp->mnt_vtable->vfc_typenum;
		sfs.f_reserved1 = (short)sfsp->f_fssubtype;
		sfs.f_bsize = (user64_long_t)sfsp->f_bsize;
		sfs.f_iosize = (user64_long_t)sfsp->f_iosize;
		sfs.f_blocks = (user64_long_t)sfsp->f_blocks;
		sfs.f_bfree = (user64_long_t)sfsp->f_bfree;
		sfs.f_bavail = (user64_long_t)sfsp->f_bavail;
		sfs.f_files = (user64_long_t)sfsp->f_files;
		sfs.f_ffree = (user64_long_t)sfsp->f_ffree;
		sfs.f_fsid = sfsp->f_fsid;
		sfs.f_owner = sfsp->f_owner;
		vfs_getfstypename(mp, sfs.f_fstypename, MFSNAMELEN);
		strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
		strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);

		if (partial_copy) {
			/* skip the trailing reserved fields on partial copies */
			copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
		}
		error = copyout((caddr_t)&sfs, bufp, copy_size);
	} else {
		struct user32_statfs sfs;

		my_size = copy_size = sizeof(sfs);
		bzero(&sfs, my_size);

		sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
		sfs.f_type = (short)mp->mnt_vtable->vfc_typenum;
		sfs.f_reserved1 = (short)sfsp->f_fssubtype;

		/*
		 * It's possible for there to be more than 2^^31 blocks in the filesystem, so we
		 * have to fudge the numbers here in that case. We inflate the blocksize in order
		 * to reflect the filesystem size as best we can.
		 */
		if ((sfsp->f_blocks > INT_MAX)
		    /* Hack for 4061702 . I think the real fix is for Carbon to
		     * look for some volume capability and not depend on hidden
		     * semantics agreed between a FS and carbon.
		     * f_blocks, f_bfree, and f_bavail set to -1 is the trigger
		     * for Carbon to set bNoVolumeSizes volume attribute.
		     * Without this the webdavfs files cannot be copied onto
		     * disk as they look huge. This change should not affect
		     * XSAN as they should not setting these to -1..
		     */
		    && (sfsp->f_blocks != 0xffffffffffffffffULL)
		    && (sfsp->f_bfree != 0xffffffffffffffffULL)
		    && (sfsp->f_bavail != 0xffffffffffffffffULL)) {
			int shift;

			/*
			 * Work out how far we have to shift the block count down to make it fit.
			 * Note that it's possible to have to shift so far that the resulting
			 * blocksize would be unreportably large. At that point, we will clip
			 * any values that don't fit.
			 *
			 * For safety's sake, we also ensure that f_iosize is never reported as
			 * being smaller than f_bsize.
			 */
			for (shift = 0; shift < 32; shift++) {
				if ((sfsp->f_blocks >> shift) <= INT_MAX) {
					break;
				}
				if ((sfsp->f_bsize << (shift + 1)) > INT_MAX) {
					break;
				}
			}
#define __SHIFT_OR_CLIP(x, s) ((((x) >> (s)) > INT_MAX) ? INT_MAX : ((x) >> (s)))
			sfs.f_blocks = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_blocks, shift);
			sfs.f_bfree = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bfree, shift);
			sfs.f_bavail = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bavail, shift);
#undef __SHIFT_OR_CLIP
			sfs.f_bsize = (user32_long_t)(sfsp->f_bsize << shift);
			sfs.f_iosize = (int)lmax(sfsp->f_iosize, sfsp->f_bsize);
		} else {
			/* filesystem is small enough to be reported honestly */
			sfs.f_bsize = (user32_long_t)sfsp->f_bsize;
			sfs.f_iosize = (user32_long_t)sfsp->f_iosize;
			sfs.f_blocks = (user32_long_t)sfsp->f_blocks;
			sfs.f_bfree = (user32_long_t)sfsp->f_bfree;
			sfs.f_bavail = (user32_long_t)sfsp->f_bavail;
		}
		sfs.f_files = (user32_long_t)sfsp->f_files;
		sfs.f_ffree = (user32_long_t)sfsp->f_ffree;
		sfs.f_fsid = sfsp->f_fsid;
		sfs.f_owner = sfsp->f_owner;
		vfs_getfstypename(mp, sfs.f_fstypename, MFSNAMELEN);
		strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
		strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);

		if (partial_copy) {
			/* skip the trailing reserved fields on partial copies */
			copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
		}
		error = copyout((caddr_t)&sfs, bufp, copy_size);
	}

	if (sizep != NULL) {
		/* always report the full structure size, even on partial copies */
		*sizep = my_size;
	}
	return error;
}
13896
/*
 * copy stat structure into user_stat structure.
 *
 * Field-for-field copy of a kernel struct stat into the 64-bit
 * user-visible layout; usbp is zeroed first so padding and any
 * uncopied fields read as 0.
 */
void
munge_user64_stat(struct stat *sbp, struct user64_stat *usbp)
{
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
#ifndef _POSIX_C_SOURCE
	/* timespec layout: seconds + nanoseconds as separate fields */
	usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
#else
	/* POSIX-namespace layout: flat time + nanosecond fields */
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
13936
/*
 * Copy a kernel struct stat into the 32-bit user-visible layout.
 * Time fields are narrowed with explicit casts; usbp is zeroed first
 * so padding and any uncopied fields read as 0.
 */
void
munge_user32_stat(struct stat *sbp, struct user32_stat *usbp)
{
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
#ifndef _POSIX_C_SOURCE
	/* timespec layout: narrow 64-bit kernel time values to 32-bit user types */
	usbp->st_atimespec.tv_sec = (user32_time_t)sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = (user32_long_t)sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = (user32_time_t)sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = (user32_long_t)sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = (user32_time_t)sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = (user32_long_t)sbp->st_ctimespec.tv_nsec;
#else
	/* POSIX-namespace layout: flat time + nanosecond fields */
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
13973
/*
 * copy stat64 structure into user_stat64 structure.
 *
 * Like munge_user64_stat(), but for the stat64 layout, which adds the
 * birthtime fields; usbp is zeroed first so padding reads as 0.
 */
void
munge_user64_stat64(struct stat64 *sbp, struct user64_stat64 *usbp)
{
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
#ifndef _POSIX_C_SOURCE
	/* timespec layout: seconds + nanoseconds as separate fields */
	usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
	usbp->st_birthtimespec.tv_sec = sbp->st_birthtimespec.tv_sec;
	usbp->st_birthtimespec.tv_nsec = sbp->st_birthtimespec.tv_nsec;
#else
	/* POSIX-namespace layout: flat time + nanosecond fields */
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
	usbp->st_birthtime = sbp->st_birthtime;
	usbp->st_birthtimensec = sbp->st_birthtimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
14017
/*
 * Copy a kernel stat64 structure into the user32_stat64 layout handed
 * to 32-bit user processes.  Timestamp fields are explicitly narrowed
 * to the 32-bit process's time_t/long widths.
 *
 * sbp:  source kernel stat64 (read only)
 * usbp: destination user32_stat64, fully overwritten
 */
void
munge_user32_stat64(struct stat64 *sbp, struct user32_stat64 *usbp)
{
	/* Zero first so struct padding never leaks kernel memory to userspace. */
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
#ifndef _POSIX_C_SOURCE
	/* Non-POSIX layout: timespec members, narrowed to 32-bit user widths. */
	usbp->st_atimespec.tv_sec = (user32_time_t)sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = (user32_long_t)sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = (user32_time_t)sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = (user32_long_t)sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = (user32_time_t)sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = (user32_long_t)sbp->st_ctimespec.tv_nsec;
	usbp->st_birthtimespec.tv_sec = (user32_time_t)sbp->st_birthtimespec.tv_sec;
	usbp->st_birthtimespec.tv_nsec = (user32_long_t)sbp->st_birthtimespec.tv_nsec;
#else
	/* Strict POSIX layout uses flattened seconds/nanoseconds fields. */
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
	usbp->st_birthtime = sbp->st_birthtime;
	usbp->st_birthtimensec = sbp->st_birthtimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
14058
14059 /*
14060 * Purge buffer cache for simulating cold starts
14061 */
14062 static int
vnode_purge_callback(struct vnode * vp,__unused void * cargs)14063 vnode_purge_callback(struct vnode *vp, __unused void *cargs)
14064 {
14065 ubc_msync(vp, (off_t)0, ubc_getsize(vp), NULL /* off_t *resid_off */, UBC_PUSHALL | UBC_INVALIDATE);
14066
14067 return VNODE_RETURNED;
14068 }
14069
14070 static int
vfs_purge_callback(mount_t mp,__unused void * arg)14071 vfs_purge_callback(mount_t mp, __unused void * arg)
14072 {
14073 vnode_iterate(mp, VNODE_WAIT | VNODE_ITERATE_ALL, vnode_purge_callback, NULL);
14074
14075 return VFS_RETURNED;
14076 }
14077
/*
 * Boot-arg tunable (also exposed as vfs.purge_vm_pagers sysctl): when set,
 * vfs_purge() additionally purges file-backed VM pagers after flushing the
 * buffer cache.  Defaults to TRUE.
 */
static TUNABLE_WRITEABLE(boolean_t, vfs_purge_vm_pagers, "vfs_purge_vm_pagers", TRUE);
SYSCTL_INT(_vfs, OID_AUTO, purge_vm_pagers, CTLFLAG_RW | CTLFLAG_LOCKED, &vfs_purge_vm_pagers, 0, "VFS purge also purges file-backed VM pagers");
14080
/*
 * vfs_purge() system call: flush and invalidate cached pages for every
 * vnode on every mounted filesystem (and optionally file-backed VM
 * pagers), simulating a cold start.  Superuser only; returns EPERM
 * otherwise, 0 on success.
 */
int
vfs_purge(__unused struct proc *p, __unused struct vfs_purge_args *uap, __unused int32_t *retval)
{
	/* Privilege check must precede the side-effectful iteration below. */
	if (!kauth_cred_issuser(kauth_cred_get())) {
		return EPERM;
	}

	vfs_iterate(0 /* flags */, vfs_purge_callback, NULL);

	/* also flush any VM pagers backed by files */
	if (vfs_purge_vm_pagers) {
		vm_purge_filebacked_pagers();
	}

	return 0;
}
14097
14098 /*
14099 * gets the vnode associated with the (unnamed) snapshot directory
14100 * for a Filesystem. The snapshot directory vnode is returned with
14101 * an iocount on it.
14102 */
14103 int
vnode_get_snapdir(vnode_t rvp,vnode_t * sdvpp,vfs_context_t ctx)14104 vnode_get_snapdir(vnode_t rvp, vnode_t *sdvpp, vfs_context_t ctx)
14105 {
14106 return VFS_VGET_SNAPDIR(vnode_mount(rvp), sdvpp, ctx);
14107 }
14108
/*
 * Get the snapshot vnode.
 *
 * If successful, the call returns with an iocount on *rvpp, *sdvpp and
 * needs nameidone() on ndp.
 *
 * If the snapshot vnode exists it is returned in ndp->ni_vp.
 *
 * If it returns with an error, *rvpp, *sdvpp are NULL and nameidone() is
 * not needed.
 */
static int
vnode_get_snapshot(int dirfd, vnode_t *rvpp, vnode_t *sdvpp,
    user_addr_t name, struct nameidata *ndp, int32_t op,
#if !CONFIG_TRIGGERS
    __unused
#endif
    enum path_operation pathop,
    vfs_context_t ctx)
{
	int error, i;
	caddr_t name_buf;
	size_t name_len;
	struct vfs_attr vfa;

	*sdvpp = NULLVP;
	*rvpp = NULLVP;

	/* Resolve dirfd to a vnode; takes an iocount on *rvpp on success. */
	error = vnode_getfromfd(ctx, dirfd, rvpp);
	if (error) {
		return error;
	}

	/* Snapshot operations are only valid on the root of a mount. */
	if (!vnode_isvroot(*rvpp)) {
		error = EINVAL;
		goto out;
	}

	/* Make sure the filesystem supports snapshots */
	VFSATTR_INIT(&vfa);
	VFSATTR_WANTED(&vfa, f_capabilities);
	if ((vfs_getattr(vnode_mount(*rvpp), &vfa, ctx) != 0) ||
	    !VFSATTR_IS_SUPPORTED(&vfa, f_capabilities) ||
	    !((vfa.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] &
	    VOL_CAP_INT_SNAPSHOT)) ||
	    !((vfa.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] &
	    VOL_CAP_INT_SNAPSHOT))) {
		error = ENOTSUP;
		goto out;
	}

	/* Takes an iocount on *sdvpp on success. */
	error = vnode_get_snapdir(*rvpp, sdvpp, ctx);
	if (error) {
		goto out;
	}

	name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
	if (error) {
		goto out1;
	}

	/*
	 * Some sanity checks- name can't be empty, "." or ".." or have slashes.
	 * (the length returned by copyinstr includes the terminating NUL)
	 */
	if ((name_len == 1) || (name_len == 2 && name_buf[0] == '.') ||
	    (name_len == 3 && name_buf[0] == '.' && name_buf[1] == '.')) {
		error = EINVAL;
		goto out1;
	}
	/* Scan for a '/'; finding one before the end rejects the name. */
	for (i = 0; i < (int)name_len && name_buf[i] != '/'; i++) {
		;
	}
	if (i < (int)name_len) {
		error = EINVAL;
		goto out1;
	}

#if CONFIG_MACF
	/* MAC policy hooks: only create and delete have dedicated checks here. */
	if (op == CREATE) {
		error = mac_mount_check_snapshot_create(ctx, vnode_mount(*rvpp),
		    name_buf);
	} else if (op == DELETE) {
		error = mac_mount_check_snapshot_delete(ctx, vnode_mount(*rvpp),
		    name_buf);
	}
	if (error) {
		goto out1;
	}
#endif

	/* Check if the snapshot already exists ... */
	NDINIT(ndp, op, pathop, USEDVP | NOCACHE | AUDITVNPATH1,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(name_buf), ctx);
	ndp->ni_dvp = *sdvpp;

	error = namei(ndp);
out1:
	zfree(ZV_NAMEI, name_buf);
out:
	if (error) {
		/* On any failure, drop both iocounts and NULL the out params. */
		if (*sdvpp) {
			vnode_put(*sdvpp);
			*sdvpp = NULLVP;
		}
		if (*rvpp) {
			vnode_put(*rvpp);
			*rvpp = NULLVP;
		}
	}
	return error;
}
14222
/*
 * create a filesystem snapshot (for supporting filesystems)
 *
 * A much simplified version of openat(dirfd, name, O_CREAT | O_EXCL)
 * We get to the (unnamed) snapshot directory vnode and create the vnode
 * for the snapshot in it.
 *
 * Restrictions:
 *
 *    a) Passed in name for snapshot cannot have slashes.
 *    b) name can't be "." or ".."
 *
 * Since this requires superuser privileges, vnode_authorize calls are not
 * made.
 */
static int __attribute__((noinline))
snapshot_create(int dirfd, user_addr_t name, __unused uint32_t flags,
    vfs_context_t ctx)
{
	vnode_t rvp, snapdvp;
	int error;
	struct nameidata *ndp;

	/* struct nameidata is too large for the kernel stack; heap-allocate it. */
	ndp = kalloc_type(struct nameidata, Z_WAITOK);

	/* On success: iocounts on rvp and snapdvp; ndp->ni_vp set if it exists. */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, ndp, CREATE,
	    OP_LINK, ctx);
	if (error) {
		goto out;
	}

	if (ndp->ni_vp) {
		/* Snapshot already exists: mirror O_CREAT | O_EXCL semantics. */
		vnode_put(ndp->ni_vp);
		error = EEXIST;
	} else {
		struct vnode_attr *vap;
		vnode_t vp = NULLVP;

		vap = kalloc_type(struct vnode_attr, Z_WAITOK);

		VATTR_INIT(vap);
		VATTR_SET(vap, va_type, VREG);
		VATTR_SET(vap, va_mode, 0);

		/* NOAUTH/NOINHERIT: caller is privileged; see header comment. */
		error = vn_create(snapdvp, &vp, ndp, vap,
		    VN_CREATE_NOAUTH | VN_CREATE_NOINHERIT, 0, NULL, ctx);
		if (!error && vp) {
			vnode_put(vp);
		}

		kfree_type(struct vnode_attr, vap);
	}

	nameidone(ndp);
	vnode_put(snapdvp);
	vnode_put(rvp);
out:
	kfree_type(struct nameidata, ndp);

	return error;
}
14284
/*
 * Delete a Filesystem snapshot
 *
 * get the vnode for the unnamed snapshot directory and the snapshot and
 * delete the snapshot.
 */
static int __attribute__((noinline))
snapshot_delete(int dirfd, user_addr_t name, __unused uint32_t flags,
    vfs_context_t ctx)
{
	vnode_t rvp, snapdvp;
	int error;
	struct nameidata *ndp;

	/* struct nameidata is too large for the kernel stack; heap-allocate it. */
	ndp = kalloc_type(struct nameidata, Z_WAITOK);

	/* On success: iocounts held on rvp, snapdvp and ndp->ni_vp. */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, ndp, DELETE,
	    OP_UNLINK, ctx);
	if (error) {
		goto out;
	}

	/* Remove the snapshot vnode; namespace event generation is skipped. */
	error = VNOP_REMOVE(snapdvp, ndp->ni_vp, &ndp->ni_cnd,
	    VNODE_REMOVE_SKIP_NAMESPACE_EVENT, ctx);

	vnode_put(ndp->ni_vp);
	nameidone(ndp);
	vnode_put(snapdvp);
	vnode_put(rvp);
out:
	kfree_type(struct nameidata, ndp);

	return error;
}
14319
14320 /*
14321 * Revert a filesystem to a snapshot
14322 *
14323 * Marks the filesystem to revert to the given snapshot on next mount.
14324 */
14325 static int __attribute__((noinline))
snapshot_revert(int dirfd,user_addr_t name,__unused uint32_t flags,vfs_context_t ctx)14326 snapshot_revert(int dirfd, user_addr_t name, __unused uint32_t flags,
14327 vfs_context_t ctx)
14328 {
14329 int error;
14330 vnode_t rvp;
14331 mount_t mp;
14332 struct fs_snapshot_revert_args revert_data;
14333 struct componentname cnp;
14334 caddr_t name_buf;
14335 size_t name_len;
14336
14337 error = vnode_getfromfd(ctx, dirfd, &rvp);
14338 if (error) {
14339 return error;
14340 }
14341 mp = vnode_mount(rvp);
14342
14343 name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
14344 error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
14345 if (error) {
14346 zfree(ZV_NAMEI, name_buf);
14347 vnode_put(rvp);
14348 return error;
14349 }
14350
14351 #if CONFIG_MACF
14352 error = mac_mount_check_snapshot_revert(ctx, mp, name_buf);
14353 if (error) {
14354 zfree(ZV_NAMEI, name_buf);
14355 vnode_put(rvp);
14356 return error;
14357 }
14358 #endif
14359
14360 /*
14361 * Grab mount_iterref so that we can release the vnode,
14362 * since VFSIOC_REVERT_SNAPSHOT could conceivably cause a sync.
14363 */
14364 error = mount_iterref(mp, 0);
14365 vnode_put(rvp);
14366 if (error) {
14367 zfree(ZV_NAMEI, name_buf);
14368 return error;
14369 }
14370
14371 memset(&cnp, 0, sizeof(cnp));
14372 cnp.cn_pnbuf = (char *)name_buf;
14373 cnp.cn_nameiop = LOOKUP;
14374 cnp.cn_flags = ISLASTCN | HASBUF;
14375 cnp.cn_pnlen = MAXPATHLEN;
14376 cnp.cn_nameptr = cnp.cn_pnbuf;
14377 cnp.cn_namelen = (int)name_len;
14378 revert_data.sr_cnp = &cnp;
14379
14380 error = VFS_IOCTL(mp, VFSIOC_REVERT_SNAPSHOT, (caddr_t)&revert_data, 0, ctx);
14381 mount_iterdrop(mp);
14382 zfree(ZV_NAMEI, name_buf);
14383
14384 if (error) {
14385 /* If there was any error, try again using VNOP_IOCTL */
14386
14387 vnode_t snapdvp;
14388 struct nameidata namend;
14389
14390 error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, &namend, LOOKUP,
14391 OP_LOOKUP, ctx);
14392 if (error) {
14393 return error;
14394 }
14395
14396
14397 error = VNOP_IOCTL(namend.ni_vp, APFSIOC_REVERT_TO_SNAPSHOT, (caddr_t) NULL,
14398 0, ctx);
14399
14400 vnode_put(namend.ni_vp);
14401 nameidone(&namend);
14402 vnode_put(snapdvp);
14403 vnode_put(rvp);
14404 }
14405
14406 return error;
14407 }
14408
/*
 * rename a Filesystem snapshot
 *
 * get the vnode for the unnamed snapshot directory and the snapshot and
 * rename the snapshot. This is a very specialised (and simple) case of
 * rename(2) (which has to deal with a lot more complications). It differs
 * slightly from rename(2) in that EEXIST is returned if the new name exists.
 */
static int __attribute__((noinline))
snapshot_rename(int dirfd, user_addr_t old, user_addr_t new,
    __unused uint32_t flags, vfs_context_t ctx)
{
	vnode_t rvp, snapdvp;
	int error, i;
	caddr_t newname_buf;
	size_t name_len;
	vnode_t fvp;
	struct nameidata *fromnd, *tond;
	/* carving out a chunk for structs that are too big to be on stack. */
	struct {
		struct nameidata from_node;
		struct nameidata to_node;
	} * __rename_data;

	__rename_data = kalloc_type(typeof(*__rename_data), Z_WAITOK);
	fromnd = &__rename_data->from_node;
	tond = &__rename_data->to_node;

	/* Look up the source snapshot with DELETE intent (as rename does). */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, old, fromnd, DELETE,
	    OP_UNLINK, ctx);
	if (error) {
		goto out;
	}
	fvp = fromnd->ni_vp;

	newname_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	error = copyinstr(new, newname_buf, MAXPATHLEN, &name_len);
	if (error) {
		goto out1;
	}

	/*
	 * Some sanity checks- new name can't be empty, "." or ".." or have
	 * slashes.
	 * (the length returned by copyinstr includes the terminating NUL)
	 *
	 * The FS rename VNOP is supposed to handle this but we'll pick it
	 * off here itself.
	 */
	if ((name_len == 1) || (name_len == 2 && newname_buf[0] == '.') ||
	    (name_len == 3 && newname_buf[0] == '.' && newname_buf[1] == '.')) {
		error = EINVAL;
		goto out1;
	}
	/* Scan for a '/'; finding one before the end rejects the name. */
	for (i = 0; i < (int)name_len && newname_buf[i] != '/'; i++) {
		;
	}
	if (i < (int)name_len) {
		error = EINVAL;
		goto out1;
	}

#if CONFIG_MACF
	/* Creating the new name is subject to the snapshot-create MAC check. */
	error = mac_mount_check_snapshot_create(ctx, vnode_mount(rvp),
	    newname_buf);
	if (error) {
		goto out1;
	}
#endif

	/* Look up the destination name within the snapshot directory. */
	NDINIT(tond, RENAME, OP_RENAME, USEDVP | NOCACHE | AUDITVNPATH2,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(newname_buf), ctx);
	tond->ni_dvp = snapdvp;

	error = namei(tond);
	if (error) {
		goto out2;
	} else if (tond->ni_vp) {
		/*
		 * snapshot rename behaves differently than rename(2) - if the
		 * new name exists, EEXIST is returned.
		 */
		vnode_put(tond->ni_vp);
		error = EEXIST;
		goto out2;
	}

	error = VNOP_RENAME(snapdvp, fvp, &fromnd->ni_cnd, snapdvp, NULLVP,
	    &tond->ni_cnd, ctx);

out2:
	nameidone(tond);
out1:
	zfree(ZV_NAMEI, newname_buf);
	vnode_put(fvp);
	vnode_put(snapdvp);
	vnode_put(rvp);
	nameidone(fromnd);
out:
	kfree_type(typeof(*__rename_data), __rename_data);
	return error;
}
14511
14512 /*
14513 * Mount a Filesystem snapshot
14514 *
14515 * get the vnode for the unnamed snapshot directory and the snapshot and
14516 * mount the snapshot.
14517 */
14518 static int __attribute__((noinline))
snapshot_mount(int dirfd,user_addr_t name,user_addr_t directory,__unused user_addr_t mnt_data,__unused uint32_t flags,vfs_context_t ctx)14519 snapshot_mount(int dirfd, user_addr_t name, user_addr_t directory,
14520 __unused user_addr_t mnt_data, __unused uint32_t flags, vfs_context_t ctx)
14521 {
14522 mount_t mp;
14523 vnode_t rvp, snapdvp, snapvp, vp, pvp;
14524 struct fs_snapshot_mount_args smnt_data;
14525 int error;
14526 struct nameidata *snapndp, *dirndp;
14527 /* carving out a chunk for structs that are too big to be on stack. */
14528 struct {
14529 struct nameidata snapnd;
14530 struct nameidata dirnd;
14531 } * __snapshot_mount_data;
14532
14533 __snapshot_mount_data = kalloc_type(typeof(*__snapshot_mount_data), Z_WAITOK);
14534 snapndp = &__snapshot_mount_data->snapnd;
14535 dirndp = &__snapshot_mount_data->dirnd;
14536
14537 error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, snapndp, LOOKUP,
14538 OP_LOOKUP, ctx);
14539 if (error) {
14540 goto out;
14541 }
14542
14543 snapvp = snapndp->ni_vp;
14544 if (!vnode_mount(rvp) || (vnode_mount(rvp) == dead_mountp)) {
14545 error = EIO;
14546 goto out1;
14547 }
14548
14549 /* Get the vnode to be covered */
14550 NDINIT(dirndp, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
14551 UIO_USERSPACE, directory, ctx);
14552 error = namei(dirndp);
14553 if (error) {
14554 goto out1;
14555 }
14556
14557 vp = dirndp->ni_vp;
14558 pvp = dirndp->ni_dvp;
14559 mp = vnode_mount(rvp);
14560
14561 if ((vp->v_flag & VROOT) && (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
14562 error = EINVAL;
14563 goto out2;
14564 }
14565
14566 #if CONFIG_MACF
14567 error = mac_mount_check_snapshot_mount(ctx, rvp, vp, &dirndp->ni_cnd, snapndp->ni_cnd.cn_nameptr,
14568 mp->mnt_vfsstat.f_fstypename);
14569 if (error) {
14570 goto out2;
14571 }
14572 #endif
14573
14574 smnt_data.sm_mp = mp;
14575 smnt_data.sm_cnp = &snapndp->ni_cnd;
14576 error = mount_common(mp->mnt_vfsstat.f_fstypename, pvp, vp,
14577 &dirndp->ni_cnd, CAST_USER_ADDR_T(&smnt_data), flags & (MNT_DONTBROWSE | MNT_IGNORE_OWNERSHIP),
14578 KERNEL_MOUNT_SNAPSHOT, NULL, ctx);
14579
14580 out2:
14581 vnode_put(vp);
14582 vnode_put(pvp);
14583 nameidone(dirndp);
14584 out1:
14585 vnode_put(snapvp);
14586 vnode_put(snapdvp);
14587 vnode_put(rvp);
14588 nameidone(snapndp);
14589 out:
14590 kfree_type(typeof(*__snapshot_mount_data), __snapshot_mount_data);
14591 return error;
14592 }
14593
/*
 * Root from a snapshot of the filesystem
 *
 * Marks the filesystem to root from the given snapshot on next boot.
 */
static int __attribute__((noinline))
snapshot_root(int dirfd, user_addr_t name, __unused uint32_t flags,
    vfs_context_t ctx)
{
	int error;
	vnode_t rvp;
	mount_t mp;
	struct fs_snapshot_root_args root_data;
	struct componentname cnp;
	caddr_t name_buf;
	size_t name_len;

	/* Resolve dirfd; takes an iocount on rvp. */
	error = vnode_getfromfd(ctx, dirfd, &rvp);
	if (error) {
		return error;
	}
	mp = vnode_mount(rvp);

	name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
	if (error) {
		zfree(ZV_NAMEI, name_buf);
		vnode_put(rvp);
		return error;
	}

	// XXX MAC checks ?

	/*
	 * Grab mount_iterref so that we can release the vnode,
	 * since VFSIOC_ROOT_SNAPSHOT could conceivably cause a sync.
	 */
	error = mount_iterref(mp, 0);
	vnode_put(rvp);
	if (error) {
		zfree(ZV_NAMEI, name_buf);
		return error;
	}

	/* Build a componentname describing the snapshot name for the ioctl. */
	memset(&cnp, 0, sizeof(cnp));
	cnp.cn_pnbuf = (char *)name_buf;
	cnp.cn_nameiop = LOOKUP;
	cnp.cn_flags = ISLASTCN | HASBUF;
	cnp.cn_pnlen = MAXPATHLEN;
	cnp.cn_nameptr = cnp.cn_pnbuf;
	cnp.cn_namelen = (int)name_len;
	root_data.sr_cnp = &cnp;

	/* Ask the filesystem to root from this snapshot on next boot. */
	error = VFS_IOCTL(mp, VFSIOC_ROOT_SNAPSHOT, (caddr_t)&root_data, 0, ctx);

	mount_iterdrop(mp);
	zfree(ZV_NAMEI, name_buf);

	return error;
}
14654
14655 static boolean_t
vfs_context_can_snapshot(vfs_context_t ctx)14656 vfs_context_can_snapshot(vfs_context_t ctx)
14657 {
14658 static const char * const snapshot_entitlements[] = {
14659 "com.apple.private.vfs.snapshot",
14660 "com.apple.developer.vfs.snapshot",
14661 "com.apple.private.apfs.arv.limited.snapshot",
14662 };
14663 static const size_t nentitlements =
14664 sizeof(snapshot_entitlements) / sizeof(snapshot_entitlements[0]);
14665 size_t i;
14666
14667 task_t task = vfs_context_task(ctx);
14668 for (i = 0; i < nentitlements; i++) {
14669 if (IOTaskHasEntitlement(task, snapshot_entitlements[i])) {
14670 return TRUE;
14671 }
14672 }
14673 return FALSE;
14674 }
14675
/*
 * FS snapshot operations dispatcher
 *
 * Entry point for the fs_snapshot() syscall: checks entitlement and
 * (for modifying ops) authorization, then dispatches to the per-op
 * helper selected by uap->op.
 */
int
fs_snapshot(__unused proc_t p, struct fs_snapshot_args *uap,
    __unused int32_t *retval)
{
	int error;
	vfs_context_t ctx = vfs_context_current();

	AUDIT_ARG(fd, uap->dirfd);
	AUDIT_ARG(value32, uap->op);

	/* Caller must hold one of the snapshot entitlements. */
	if (!vfs_context_can_snapshot(ctx)) {
		return EPERM;
	}

	/*
	 * Enforce user authorization for snapshot modification operations,
	 * or if trying to root from snapshot.
	 */
	if (uap->op != SNAPSHOT_OP_MOUNT) {
		vnode_t dvp = NULLVP;
		vnode_t devvp = NULLVP;
		mount_t mp;

		error = vnode_getfromfd(ctx, uap->dirfd, &dvp);
		if (error) {
			return error;
		}
		mp = vnode_mount(dvp);
		devvp = mp->mnt_devvp;

		/* get an iocount on devvp */
		if (devvp == NULLVP) {
			error = vnode_lookup(mp->mnt_vfsstat.f_mntfromname, 0, &devvp, ctx);
			/* for mounts which arent block devices */
			if (error == ENOENT) {
				error = ENXIO;
			}
		} else {
			error = vnode_getwithref(devvp);
		}

		if (error) {
			vnode_put(dvp);
			return error;
		}

		/*
		 * Permitted if any of: superuser, can write the backing
		 * device, or holds the user-snapshot entitlement.
		 */
		if ((vfs_context_issuser(ctx) == 0) &&
		    (vnode_authorize(devvp, NULL, KAUTH_VNODE_WRITE_DATA, ctx) != 0) &&
		    (!IOTaskHasEntitlement(vfs_context_task(ctx), "com.apple.private.vfs.snapshot.user"))) {
			error = EPERM;
		}
		vnode_put(dvp);
		vnode_put(devvp);

		if (error) {
			return error;
		}
	}

	/* Dispatch to the per-operation helper. */
	switch (uap->op) {
	case SNAPSHOT_OP_CREATE:
		error = snapshot_create(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
	case SNAPSHOT_OP_DELETE:
		error = snapshot_delete(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
	case SNAPSHOT_OP_RENAME:
		error = snapshot_rename(uap->dirfd, uap->name1, uap->name2,
		    uap->flags, ctx);
		break;
	case SNAPSHOT_OP_MOUNT:
		error = snapshot_mount(uap->dirfd, uap->name1, uap->name2,
		    uap->data, uap->flags, ctx);
		break;
	case SNAPSHOT_OP_REVERT:
		error = snapshot_revert(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
#if CONFIG_MNT_ROOTSNAP
	case SNAPSHOT_OP_ROOT:
		error = snapshot_root(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
#endif /* CONFIG_MNT_ROOTSNAP */
	default:
		error = ENOSYS;
	}

	return error;
}
14767