1 /*
2 * Copyright (c) 1995-2021 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * Copyright (c) 1989, 1993
30 * The Regents of the University of California. All rights reserved.
31 * (c) UNIX System Laboratories, Inc.
32 * All or some portions of this file are derived from material licensed
33 * to the University of California by American Telephone and Telegraph
34 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
35 * the permission of UNIX System Laboratories, Inc.
36 *
37 * Redistribution and use in source and binary forms, with or without
38 * modification, are permitted provided that the following conditions
39 * are met:
40 * 1. Redistributions of source code must retain the above copyright
41 * notice, this list of conditions and the following disclaimer.
42 * 2. Redistributions in binary form must reproduce the above copyright
43 * notice, this list of conditions and the following disclaimer in the
44 * documentation and/or other materials provided with the distribution.
45 * 3. All advertising materials mentioning features or use of this software
46 * must display the following acknowledgement:
47 * This product includes software developed by the University of
48 * California, Berkeley and its contributors.
49 * 4. Neither the name of the University nor the names of its contributors
50 * may be used to endorse or promote products derived from this software
51 * without specific prior written permission.
52 *
53 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63 * SUCH DAMAGE.
64 *
65 * @(#)vfs_syscalls.c 8.41 (Berkeley) 6/15/95
66 */
67 /*
68 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
69 * support for mandatory and extensible security protections. This notice
70 * is included in support of clause 2.2 (b) of the Apple Public License,
71 * Version 2.0.
72 */
73
74 #include <sys/param.h>
75 #include <sys/systm.h>
76 #include <sys/namei.h>
77 #include <sys/filedesc.h>
78 #include <sys/kernel.h>
79 #include <sys/file_internal.h>
80 #include <sys/stat.h>
81 #include <sys/vnode_internal.h>
82 #include <sys/mount_internal.h>
83 #include <sys/proc_internal.h>
84 #include <sys/kauth.h>
85 #include <sys/uio_internal.h>
86 #include <kern/kalloc.h>
87 #include <sys/mman.h>
88 #include <sys/dirent.h>
89 #include <sys/attr.h>
90 #include <sys/sysctl.h>
91 #include <sys/ubc.h>
92 #include <sys/quota.h>
93 #include <sys/kdebug.h>
94 #include <sys/fsevents.h>
95 #include <sys/imgsrc.h>
96 #include <sys/sysproto.h>
97 #include <sys/sysctl.h>
98 #include <sys/xattr.h>
99 #include <sys/fcntl.h>
100 #include <sys/stdio.h>
101 #include <sys/fsctl.h>
102 #include <sys/ubc_internal.h>
103 #include <sys/disk.h>
104 #include <sys/content_protection.h>
105 #include <sys/clonefile.h>
106 #include <sys/snapshot.h>
107 #include <sys/priv.h>
108 #include <sys/fsgetpath.h>
109 #include <machine/cons.h>
110 #include <machine/limits.h>
111 #include <miscfs/specfs/specdev.h>
112
113 #include <vfs/vfs_disk_conditioner.h>
114
115 #include <security/audit/audit.h>
116 #include <bsm/audit_kevents.h>
117
118 #include <mach/mach_types.h>
119 #include <kern/kern_types.h>
120 #include <kern/kalloc.h>
121 #include <kern/task.h>
122
123 #include <vm/vm_pageout.h>
124 #include <vm/vm_protos.h>
125
126 #include <libkern/OSAtomic.h>
127 #include <os/atomic_private.h>
128 #include <pexpert/pexpert.h>
129 #include <IOKit/IOBSD.h>
130
131 // deps for MIG call
132 #include <kern/host.h>
133 #include <kern/ipc_misc.h>
134 #include <mach/host_priv.h>
135 #include <mach/vfs_nspace.h>
136 #include <os/log.h>
137
138 #include <nfs/nfs_conf.h>
139
140 #if ROUTEFS
141 #include <miscfs/routefs/routefs.h>
142 #endif /* ROUTEFS */
143
144 #if CONFIG_MACF
145 #include <security/mac.h>
146 #include <security/mac_framework.h>
147 #endif
148
149 #if CONFIG_FSE
150 #define GET_PATH(x) \
151 ((x) = get_pathbuff())
152 #define RELEASE_PATH(x) \
153 release_pathbuff(x)
154 #else
155 #define GET_PATH(x) \
156 ((x) = zalloc(ZV_NAMEI))
157 #define RELEASE_PATH(x) \
158 zfree(ZV_NAMEI, x)
159 #endif /* CONFIG_FSE */
160
161 #ifndef HFS_GET_BOOT_INFO
162 #define HFS_GET_BOOT_INFO (FCNTL_FS_SPECIFIC_BASE + 0x00004)
163 #endif
164
165 #ifndef HFS_SET_BOOT_INFO
166 #define HFS_SET_BOOT_INFO (FCNTL_FS_SPECIFIC_BASE + 0x00005)
167 #endif
168
169 #ifndef APFSIOC_REVERT_TO_SNAPSHOT
170 #define APFSIOC_REVERT_TO_SNAPSHOT _IOW('J', 1, u_int64_t)
171 #endif
172
173 extern void disk_conditioner_unmount(mount_t mp);
174
175 /* struct for checkdirs iteration */
176 struct cdirargs {
177 vnode_t olddp;
178 vnode_t newdp;
179 };
180 /* callback for checkdirs iteration */
181 static int checkdirs_callback(proc_t p, void * arg);
182
183 static int change_dir(struct nameidata *ndp, vfs_context_t ctx);
184 static int checkdirs(vnode_t olddp, vfs_context_t ctx);
185 void enablequotas(struct mount *mp, vfs_context_t ctx);
186 static int getfsstat_callback(mount_t mp, void * arg);
187 static int getutimes(user_addr_t usrtvp, struct timespec *tsp);
188 static int setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts, int nullflag);
189 static int sync_callback(mount_t, void *);
190 static int munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
191 user_addr_t bufp, int *sizep, boolean_t is_64_bit,
192 boolean_t partial_copy);
193 static int fsync_common(proc_t p, struct fsync_args *uap, int flags);
194 static int mount_common(const char *fstypename, vnode_t pvp, vnode_t vp,
195 struct componentname *cnp, user_addr_t fsmountargs,
196 int flags, uint32_t internal_flags, char *labelstr, vfs_context_t ctx);
197 void vfs_notify_mount(vnode_t pdvp);
198
199 int prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, uint32_t internal_flags);
200
201 struct fd_vn_data * fg_vn_data_alloc(void);
202
203 /*
204 * Max retries for ENOENT returns from vn_authorize_{rmdir, unlink, rename}
205 * Concurrent lookups (or lookups by ids) on hard links can cause the
206 * vn_getpath (which does not re-enter the filesystem as vn_getpath_fsenter
207 * does) to return ENOENT as the path cannot be returned from the name cache
208 * alone. We have no option but to retry and hope to get one namei->reverse path
209 * generation done without an intervening lookup, lookup by id on the hard link
210 * item. This is only an issue for MAC hooks which cannot reenter the filesystem
211 * which currently are the MAC hooks for rename, unlink and rmdir.
212 */
213 #define MAX_AUTHORIZE_ENOENT_RETRIES 1024
214
215 /* Max retry limit for rename due to vnode recycling. */
216 #define MAX_RENAME_ERECYCLE_RETRIES 1024
217
218 static int rmdirat_internal(vfs_context_t, int, user_addr_t, enum uio_seg,
219 int unlink_flags);
220
221 #ifdef CONFIG_IMGSRC_ACCESS
222 static int authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx);
223 static int place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx);
224 static void undo_place_on_covered_vp(mount_t mp, vnode_t vp);
225 static int mount_begin_update(mount_t mp, vfs_context_t ctx, int flags);
226 static void mount_end_update(mount_t mp);
227 static int relocate_imageboot_source(vnode_t pvp, vnode_t vp, struct componentname *cnp, const char *fsname, vfs_context_t ctx, boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index);
228 #endif /* CONFIG_IMGSRC_ACCESS */
229
230 //snapshot functions
231 #if CONFIG_MNT_ROOTSNAP
232 static int __attribute__ ((noinline)) snapshot_root(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx);
233 #else
234 static int __attribute__ ((noinline)) snapshot_root(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx) __attribute__((unused));
235 #endif
236
237 __private_extern__
238 int sync_internal(void);
239
240 __private_extern__
241 int unlink1(vfs_context_t, vnode_t, user_addr_t, enum uio_seg, int);
242
243 static LCK_GRP_DECLARE(fd_vn_lck_grp, "fd_vnode_data");
244 static LCK_ATTR_DECLARE(fd_vn_lck_attr, 0, 0);
245
246 /* vars for sync mutex */
247 static LCK_GRP_DECLARE(sync_mtx_lck_grp, "sync thread");
248 static LCK_MTX_DECLARE(sync_mtx_lck, &sync_mtx_lck_grp);
249
250 extern lck_rw_t rootvnode_rw_lock;
251
252 /*
253 * incremented each time a mount or unmount operation occurs
254 * used to invalidate the cached value of the rootvp in the
255 * mount structure utilized by cache_lookup_path
256 */
257 uint32_t mount_generation = 0;
258
259 /* counts number of mount and unmount operations */
260 unsigned int vfs_nummntops = 0;
261
262 /* system-wide, per-boot unique mount ID */
263 static _Atomic uint64_t mount_unique_id = 1;
264
265 extern const struct fileops vnops;
266 #if CONFIG_APPLEDOUBLE
267 extern errno_t rmdir_remove_orphaned_appleDouble(vnode_t, vfs_context_t, int *);
268 #endif /* CONFIG_APPLEDOUBLE */
269
270 /*
271 * Virtual File System System Calls
272 */
273
274 /*
275 * Private in-kernel mounting spi (specific use-cases only)
276 */
277 boolean_t
vfs_iskernelmount(mount_t mp)278 vfs_iskernelmount(mount_t mp)
279 {
280 return (mp->mnt_kern_flag & MNTK_KERNEL_MOUNT) ? TRUE : FALSE;
281 }
282
/*
 * kernel_mount:
 *	In-kernel entry point for mounting a filesystem, bypassing the
 *	mount(2) syscall path.
 *
 * Parameters:
 *	fstype		filesystem type (vfs) name
 *	pvp		parent of the covered vnode (ignored/reset when vp is NULLVP)
 *	vp		vnode to cover; if NULLVP, 'path' is looked up here
 *	path		mount-on path (kernel address space)
 *	data		FS-specific mount arguments, passed through to mount_common()
 *	datalen		unused
 *	syscall_flags	generic MNT_* mount flags
 *	kern_flags	KERNEL_MOUNT_* flags; sanitized below and then marked
 *			with KERNEL_MOUNT_KMOUNT
 *	ctx		caller's VFS context
 *
 * Returns:	0 on success, an errno value otherwise.
 */
__private_extern__
int
kernel_mount(const char *fstype, vnode_t pvp, vnode_t vp, const char *path,
    void *data, __unused size_t datalen, int syscall_flags, uint32_t kern_flags,
    vfs_context_t ctx)
{
	struct nameidata nd;
	boolean_t did_namei;
	int error;

	NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(path), ctx);

	/* Strip any kernel-mount flags callers are not permitted to pass in. */
	kern_flags &= KERNEL_MOUNT_SANITIZE_MASK;

	/*
	 * Get the vnode to be covered if it's not supplied
	 */
	if (vp == NULLVP) {
		error = namei(&nd);
		if (error) {
			/* Log lookup failures for snapshot / by-role mounts only. */
			if (kern_flags & (KERNEL_MOUNT_SNAPSHOT | KERNEL_MOUNT_VOLBYROLE_MASK)) {
				printf("failed to locate mount-on path: %s ", path);
			}
			return error;
		}
		vp = nd.ni_vp;
		pvp = nd.ni_dvp;
		did_namei = TRUE;
	} else {
		/*
		 * Caller supplied the vnodes: seed just the component-name
		 * buffer so mount_common() can see the path string.
		 */
		char *pnbuf = CAST_DOWN(char *, path);

		nd.ni_cnd.cn_pnbuf = pnbuf;
		nd.ni_cnd.cn_pnlen = (int)(strlen(pnbuf) + 1);
		did_namei = FALSE;
	}

	/* Tag this as an in-kernel mount for mount_common(). */
	kern_flags |= KERNEL_MOUNT_KMOUNT;
	error = mount_common(fstype, pvp, vp, &nd.ni_cnd, CAST_USER_ADDR_T(data),
	    syscall_flags, kern_flags, NULL, ctx);

	/* Only drop iocounts/namei state if we took them ourselves above. */
	if (did_namei) {
		vnode_put(vp);
		vnode_put(pvp);
		nameidone(&nd);
	}

	return error;
}
332
333 int
vfs_mount_at_path(const char * fstype,const char * path,vnode_t pvp,vnode_t vp,void * data,size_t datalen,int mnt_flags,int flags)334 vfs_mount_at_path(const char *fstype, const char *path,
335 vnode_t pvp, vnode_t vp, void *data, size_t datalen,
336 int mnt_flags, int flags)
337 {
338 int syscall_flags = MNT_AUTOMOUNTED | mnt_flags;
339 int error, km_flags = 0;
340 vfs_context_t ctx = (flags & VFS_MOUNT_FLAG_CURRENT_CONTEXT) ? vfs_context_current() : vfs_context_kernel();
341
342 /*
343 * This call is currently restricted to specific use cases.
344 */
345 if ((strcmp(fstype, "lifs") != 0) && (strcmp(fstype, "nfs") != 0)) {
346 return ENOTSUP;
347 }
348
349 #if !defined(XNU_TARGET_OS_OSX)
350 if (strcmp(fstype, "lifs") == 0) {
351 syscall_flags |= MNT_NOEXEC;
352 }
353 #endif
354
355 if (flags & VFS_MOUNT_FLAG_NOAUTH) {
356 km_flags |= KERNEL_MOUNT_NOAUTH;
357 }
358 if (flags & VFS_MOUNT_FLAG_PERMIT_UNMOUNT) {
359 km_flags |= KERNEL_MOUNT_PERMIT_UNMOUNT;
360 }
361
362 error = kernel_mount(fstype, pvp, vp, path, data, datalen,
363 syscall_flags, km_flags, ctx);
364 if (error) {
365 printf("%s: mount on %s failed, error %d\n", __func__, path,
366 error);
367 }
368
369 return error;
370 }
371
372 int
vfs_mount_override_type_name(mount_t mp,const char * name)373 vfs_mount_override_type_name(mount_t mp, const char *name)
374 {
375 if (mp == NULL || name == NULL) {
376 return EINVAL;
377 }
378
379 /* Override the FS type name. */
380 mount_lock_spin(mp);
381 strlcpy(mp->fstypename_override, name, sizeof(mp->fstypename_override));
382 mp->mnt_kern_flag |= MNTK_TYPENAME_OVERRIDE;
383 mount_unlock(mp);
384
385 return 0;
386 }
387
388 /*
389 * Mount a file system.
390 */
391 /* ARGSUSED */
392 int
mount(proc_t p,struct mount_args * uap,__unused int32_t * retval)393 mount(proc_t p, struct mount_args *uap, __unused int32_t *retval)
394 {
395 struct __mac_mount_args muap;
396
397 muap.type = uap->type;
398 muap.path = uap->path;
399 muap.flags = uap->flags;
400 muap.data = uap->data;
401 muap.mac_p = USER_ADDR_NULL;
402 return __mac_mount(p, &muap, retval);
403 }
404
/*
 * fmount:
 *	Mount a filesystem on the directory referenced by an open file
 *	descriptor (uap->fd) instead of a path.  The covered vnode's path
 *	is reconstructed with vn_getpath() and the request is forwarded to
 *	mount_common() with KERNEL_MOUNT_FMOUNT.
 *
 * Returns:	0 on success, an errno value otherwise.
 */
int
fmount(__unused proc_t p, struct fmount_args *uap, __unused int32_t *retval)
{
	struct componentname cn;
	vfs_context_t ctx = vfs_context_current();
	size_t dummy = 0;
	int error;
	int flags = uap->flags;
	char fstypename[MFSNAMELEN];
	char *labelstr = NULL; /* regular mount call always sets it to NULL for __mac_mount() */
	vnode_t pvp;
	vnode_t vp;

	AUDIT_ARG(fd, uap->fd);
	AUDIT_ARG(fflags, flags);
	/* fstypename will get audited by mount_common */

	/* Sanity check the flags */
	if (flags & (MNT_IMGSRC_BY_INDEX | MNT_ROOTFS)) {
		return ENOTSUP;
	}

	/* Union mounts are not permitted through the fd-based interface. */
	if (flags & MNT_UNION) {
		return EPERM;
	}

	error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy);
	if (error) {
		return error;
	}

	/* Resolve the descriptor to its vnode (takes a file reference). */
	if ((error = file_vnode(uap->fd, &vp)) != 0) {
		return error;
	}

	/* Take an iocount on the vnode; drop the file ref on failure. */
	if ((error = vnode_getwithref(vp)) != 0) {
		file_drop(uap->fd);
		return error;
	}

	/*
	 * We need the parent (to play the role namei's WANTPARENT fills in
	 * the path-based mount).  Without one: a root / already-mounted-on
	 * vnode is busy; anything else is simply unmountable-on.
	 */
	pvp = vnode_getparent(vp);
	if (pvp == NULL) {
		if (vp->v_mountedhere || (vp->v_flag & VROOT) != 0) {
			error = EBUSY;
		} else {
			error = EINVAL;
		}
		vnode_put(vp);
		file_drop(uap->fd);
		return error;
	}

	/* Build a componentname holding the reconstructed mount-on path. */
	memset(&cn, 0, sizeof(struct componentname));
	cn.cn_pnbuf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	cn.cn_pnlen = MAXPATHLEN;

	if ((error = vn_getpath(vp, cn.cn_pnbuf, &cn.cn_pnlen)) != 0) {
		zfree(ZV_NAMEI, cn.cn_pnbuf);
		vnode_put(pvp);
		vnode_put(vp);
		file_drop(uap->fd);
		return error;
	}

	error = mount_common(fstypename, pvp, vp, &cn, uap->data, flags, KERNEL_MOUNT_FMOUNT, labelstr, ctx);

	/* Release everything acquired above, in reverse order. */
	zfree(ZV_NAMEI, cn.cn_pnbuf);
	vnode_put(pvp);
	vnode_put(vp);
	file_drop(uap->fd);

	return error;
}
478
479 #define MAX_GRAFT_METADATA_SIZE 16384 /* bytes */
480
481 /*
482 * Get the size of a graft file (a manifest or payload file).
483 * The vp should be an iocounted vnode.
484 */
485 static int
get_and_verify_graft_metadata_vp_size(vnode_t graft_vp,vfs_context_t vctx,size_t * size)486 get_and_verify_graft_metadata_vp_size(vnode_t graft_vp, vfs_context_t vctx, size_t *size)
487 {
488 struct stat64 sb = {};
489 int error;
490
491 *size = 0;
492
493 error = vn_stat(graft_vp, &sb, NULL, 1, 0, vctx);
494 if (error) {
495 return error;
496 }
497
498 if (sb.st_size == 0) {
499 error = ENODATA;
500 } else if (sb.st_size > MAX_GRAFT_METADATA_SIZE) {
501 error = EFBIG;
502 } else {
503 *size = (size_t) sb.st_size;
504 }
505
506 return error;
507 }
508
509 /*
510 * Read in a graft file (a manifest or payload file) of size `size` into `buf`.
511 * `size` must already be validated.
512 */
513 static int
read_graft_metadata_vp(vnode_t graft_vp,vfs_context_t vctx,size_t size,void * buf)514 read_graft_metadata_vp(vnode_t graft_vp, vfs_context_t vctx, size_t size, void *buf)
515 {
516 return vn_rdwr(UIO_READ, graft_vp,
517 (caddr_t) buf, (int) size, /* offset */ 0,
518 UIO_SYSSPACE, IO_NOCACHE | IO_RAOFF | IO_UNIT,
519 vfs_context_ucred(vctx), /* resid */ NULL,
520 vfs_context_proc(vctx));
521 }
522
523 /*
524 * Convert a single graft file descriptor into a vnode, get its size (saving it to `size`),
525 * and read it into `buf`.
526 */
527 static int
graft_secureboot_read_fd(int fd,vfs_context_t vctx,size_t * size,void * buf)528 graft_secureboot_read_fd(int fd, vfs_context_t vctx, size_t *size, void *buf)
529 {
530 vnode_t metadata_vp = NULLVP;
531 int error;
532
533 // Convert this graft fd to a vnode.
534 if ((error = vnode_getfromfd(vctx, fd, &metadata_vp)) != 0) {
535 goto out;
536 }
537
538 // Get (and validate) size information.
539 if ((error = get_and_verify_graft_metadata_vp_size(metadata_vp, vctx, size)) != 0) {
540 goto out;
541 }
542
543 // Read each file into the provided buffer - we must get the expected amount of bytes.
544 if ((error = read_graft_metadata_vp(metadata_vp, vctx, *size, buf)) != 0) {
545 goto out;
546 }
547
548 out:
549 if (metadata_vp) {
550 vnode_put(metadata_vp);
551 metadata_vp = NULLVP;
552 }
553
554 return error;
555 }
556
557 /*
558 * Read graft file descriptors into buffers of size MAX_GRAFT_METADATA_SIZE
559 * provided in `gfs`, saving the size of data read in `gfs`.
560 */
561 static int
graft_secureboot_read_metadata(secure_boot_cryptex_args_t * sbc_args,vfs_context_t vctx,fsioc_graft_fs_t * gfs)562 graft_secureboot_read_metadata(secure_boot_cryptex_args_t *sbc_args, vfs_context_t vctx,
563 fsioc_graft_fs_t *gfs)
564 {
565 int error;
566
567 // Read the authentic manifest.
568 if ((error = graft_secureboot_read_fd(sbc_args->sbc_authentic_manifest_fd, vctx,
569 &gfs->authentic_manifest_size, gfs->authentic_manifest))) {
570 return error;
571 }
572
573 // The user manifest is currently unused, but set its size.
574 gfs->user_manifest_size = 0;
575
576 // Read the payload.
577 if ((error = graft_secureboot_read_fd(sbc_args->sbc_payload_fd, vctx,
578 &gfs->payload_size, gfs->payload))) {
579 return error;
580 }
581
582 return 0;
583 }
584
585 /*
586 * Call into the filesystem to verify and graft a cryptex.
587 */
/*
 * Validates the caller-supplied args, reads the manifest/payload files
 * into fixed-size buffers, translates SBC_* flags to FSCTL_GRAFT_* flags,
 * and asks the filesystem (via FSIOC_GRAFT_FS on cryptex_vp) to perform
 * the verification and graft.  mounton_vp is optional; when present it
 * must be a directory on the same volume as cryptex_vp.
 */
static int
graft_secureboot_cryptex(uint32_t graft_type, secure_boot_cryptex_args_t *sbc_args,
    vfs_context_t vctx, vnode_t cryptex_vp, vnode_t mounton_vp)
{
	fsioc_graft_fs_t gfs = {};
	uint64_t graft_dir_ino = 0;
	struct stat64 sb = {};
	int error;

	// Pre-flight arguments.
	if (sbc_args->sbc_version != GRAFTDMG_SECURE_BOOT_CRYPTEX_ARGS_VERSION) {
		// Make sure that this graft version matches what we support.
		return ENOTSUP;
	} else if (mounton_vp && cryptex_vp->v_mount != mounton_vp->v_mount) {
		// For this type, cryptex VP must live on same volume as the target of graft.
		return EXDEV;
	} else if (mounton_vp && mounton_vp->v_type != VDIR) {
		// We cannot graft upon non-directories.
		return ENOTDIR;
	} else if (sbc_args->sbc_authentic_manifest_fd < 0 ||
	    sbc_args->sbc_payload_fd < 0) {
		// We cannot graft without a manifest and payload.
		return EINVAL;
	}

	if (mounton_vp) {
		// Get the mounton's inode number.
		error = vn_stat(mounton_vp, &sb, NULL, 1, 0, vctx);
		if (error) {
			return error;
		}
		graft_dir_ino = (uint64_t) sb.st_ino;
	}

	// Create buffers (of our maximum-defined size) to store authentication info.
	gfs.authentic_manifest = kalloc_data(MAX_GRAFT_METADATA_SIZE, Z_WAITOK | Z_ZERO);
	gfs.payload = kalloc_data(MAX_GRAFT_METADATA_SIZE, Z_WAITOK | Z_ZERO);

	if (!gfs.authentic_manifest || !gfs.payload) {
		error = ENOMEM;
		goto out;
	}

	// Read our fd's into our buffers.
	// (Note that this will set the buffer size fields in `gfs`.)
	error = graft_secureboot_read_metadata(sbc_args, vctx, &gfs);
	if (error) {
		goto out;
	}

	// Translate caller-visible SBC_* options into FSCTL_GRAFT_* flags.
	gfs.graft_version = FSIOC_GRAFT_VERSION;
	gfs.graft_type = graft_type;
	gfs.graft_4cc = sbc_args->sbc_4cc;
	if (sbc_args->sbc_flags & SBC_PRESERVE_MOUNT) {
		gfs.graft_flags |= FSCTL_GRAFT_PRESERVE_MOUNT;
	}
	if (sbc_args->sbc_flags & SBC_ALTERNATE_SHARED_REGION) {
		gfs.graft_flags |= FSCTL_GRAFT_ALTERNATE_SHARED_REGION;
	}
	if (sbc_args->sbc_flags & SBC_SYSTEM_CONTENT) {
		gfs.graft_flags |= FSCTL_GRAFT_SYSTEM_CONTENT;
	}
	if (sbc_args->sbc_flags & SBC_PANIC_ON_AUTHFAIL) {
		gfs.graft_flags |= FSCTL_GRAFT_PANIC_ON_AUTHFAIL;
	}
	if (sbc_args->sbc_flags & SBC_STRICT_AUTH) {
		gfs.graft_flags |= FSCTL_GRAFT_STRICT_AUTH;
	}
	if (sbc_args->sbc_flags & SBC_PRESERVE_GRAFT) {
		gfs.graft_flags |= FSCTL_GRAFT_PRESERVE_GRAFT;
	}
	gfs.dir_ino = graft_dir_ino; // ino from mounton_vp (if not provided, the parent directory)

	// Call into the FS to perform the graft (and validation).
	error = VNOP_IOCTL(cryptex_vp, FSIOC_GRAFT_FS, (caddr_t)&gfs, 0, vctx);

out:
	// The metadata buffers are owned here; release them on every path.
	if (gfs.authentic_manifest) {
		kfree_data(gfs.authentic_manifest, MAX_GRAFT_METADATA_SIZE);
		gfs.authentic_manifest = NULL;
	}
	if (gfs.payload) {
		kfree_data(gfs.payload, MAX_GRAFT_METADATA_SIZE);
		gfs.payload = NULL;
	}

	return error;
}
676
677 #define GRAFTDMG_ENTITLEMENT "com.apple.private.vfs.graftdmg"
678
679 /*
680 * Graft a cryptex disk image (via FD) onto the appropriate mount-point
681 * { int graftdmg(int dmg_fd, const char *mountdir, uint32_t graft_type, graftdmg_args_un *gda); }
682 */
/*
 * Entitlement-gated syscall: copies in the graft args, resolves the
 * optional mount-on directory and the disk-image fd to vnodes, and
 * delegates to graft_secureboot_cryptex().
 */
int
graftdmg(__unused proc_t p, struct graftdmg_args *uap, __unused int32_t *retval)
{
	int ua_dmgfd = uap->dmg_fd;
	user_addr_t ua_mountdir = uap->mountdir;
	uint32_t ua_grafttype = uap->graft_type;
	user_addr_t ua_graftargs = uap->gda;

	graftdmg_args_un kern_gda = {};
	int error = 0;
	secure_boot_cryptex_args_t *sbc_args = NULL;

	vnode_t cryptex_vp = NULLVP;
	vnode_t mounton_vp = NULLVP;
	struct nameidata nd = {};
	vfs_context_t ctx = vfs_context_current();

	/* Grafting is restricted to tasks holding the private entitlement. */
	if (!IOTaskHasEntitlement(vfs_context_task(ctx), GRAFTDMG_ENTITLEMENT)) {
		return EPERM;
	}

	error = copyin(ua_graftargs, &kern_gda, sizeof(graftdmg_args_un));
	if (error) {
		return error;
	}

	// Copy mount dir in, if provided.
	if (ua_mountdir != USER_ADDR_NULL) {
		// Acquire vnode for mount-on path
		NDINIT(&nd, LOOKUP, OP_MOUNT, (FOLLOW | AUDITVNPATH1),
		    UIO_USERSPACE, ua_mountdir, ctx);

		error = namei(&nd);
		if (error) {
			return error;
		}
		mounton_vp = nd.ni_vp;
	}

	// Convert fd to vnode.
	error = vnode_getfromfd(ctx, ua_dmgfd, &cryptex_vp);
	if (error) {
		goto graftout;
	}

	/* Only the known graft types (1..GRAFTDMG_CRYPTEX_DOWNLEVEL) are valid. */
	if (ua_grafttype == 0 || ua_grafttype > GRAFTDMG_CRYPTEX_DOWNLEVEL) {
		error = EINVAL;
	} else {
		sbc_args = &kern_gda.sbc_args;
		error = graft_secureboot_cryptex(ua_grafttype, sbc_args, ctx, cryptex_vp, mounton_vp);
	}

	graftout:
	/* Drop iocounts taken above; nameidone only if namei() was run. */
	if (cryptex_vp) {
		vnode_put(cryptex_vp);
		cryptex_vp = NULLVP;
	}
	if (mounton_vp) {
		vnode_put(mounton_vp);
		mounton_vp = NULLVP;
	}
	if (ua_mountdir != USER_ADDR_NULL) {
		nameidone(&nd);
	}

	return error;
}
750
751 /*
752 * Ungraft a cryptex disk image (via mount dir FD)
753 * { int ungraftdmg(const char *mountdir, uint64_t flags); }
754 */
755 int
ungraftdmg(__unused proc_t p,struct ungraftdmg_args * uap,__unused int32_t * retval)756 ungraftdmg(__unused proc_t p, struct ungraftdmg_args *uap, __unused int32_t *retval)
757 {
758 int error = 0;
759 user_addr_t ua_mountdir = uap->mountdir;
760 fsioc_ungraft_fs_t ugfs;
761 vnode_t mounton_vp = NULLVP;
762 struct nameidata nd = {};
763 vfs_context_t ctx = vfs_context_current();
764
765 if (!IOTaskHasEntitlement(vfs_context_task(ctx), GRAFTDMG_ENTITLEMENT)) {
766 return EPERM;
767 }
768
769 if (uap->flags != 0 || ua_mountdir == USER_ADDR_NULL) {
770 return EINVAL;
771 }
772
773 ugfs.ungraft_flags = 0;
774
775 // Acquire vnode for mount-on path
776 NDINIT(&nd, LOOKUP, OP_MOUNT, (FOLLOW | AUDITVNPATH1),
777 UIO_USERSPACE, ua_mountdir, ctx);
778
779 error = namei(&nd);
780 if (error) {
781 return error;
782 }
783 mounton_vp = nd.ni_vp;
784
785 // Call into the FS to perform the ungraft
786 error = VNOP_IOCTL(mounton_vp, FSIOC_UNGRAFT_FS, (caddr_t)&ugfs, 0, ctx);
787
788 vnode_put(mounton_vp);
789 nameidone(&nd);
790
791 return error;
792 }
793
794
/*
 * Announce a new mount: signal a VQ_MOUNT vfs event to subscribers and
 * post a NOTE_WRITE knote on the parent of the covered vnode so watchers
 * of that directory see the change.
 */
void
vfs_notify_mount(vnode_t pdvp)
{
	vfs_event_signal(NULL, VQ_MOUNT, (intptr_t)NULL);
	lock_vnode_and_post(pdvp, NOTE_WRITE);
}
801
802 /*
803 * __mac_mount:
804 * Mount a file system taking into account MAC label behavior.
805 * See mount(2) man page for more information
806 *
807 * Parameters: p Process requesting the mount
808 * uap User argument descriptor (see below)
809 * retval (ignored)
810 *
811 * Indirect: uap->type Filesystem type
812 * uap->path Path to mount
813 * uap->data Mount arguments
814 * uap->mac_p MAC info
815 * uap->flags Mount flags
816 *
817 *
818 * Returns: 0 Success
819 * !0 Not success
820 */
821 boolean_t root_fs_upgrade_try = FALSE;
822
int
__mac_mount(struct proc *p, register struct __mac_mount_args *uap, __unused int32_t *retval)
{
	vnode_t pvp = NULL;
	vnode_t vp = NULL;
	int need_nameidone = 0;
	vfs_context_t ctx = vfs_context_current();
	char fstypename[MFSNAMELEN];
	struct nameidata nd;
	size_t dummy = 0;
	char *labelstr = NULL;  /* MAC label string copied in from user space */
	size_t labelsz = 0;
	int flags = uap->flags;
	int error;
#if CONFIG_IMGSRC_ACCESS || CONFIG_MACF
	boolean_t is_64bit = IS_64BIT_PROCESS(p);
#else
#pragma unused(p)
#endif
	/*
	 * Get the fs type name from user space
	 */
	error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy);
	if (error) {
		return error;
	}

	/*
	 * Get the vnode to be covered (WANTPARENT also yields its parent)
	 */
	NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
	    UIO_USERSPACE, uap->path, ctx);
	if (flags & MNT_NOFOLLOW) {
		/* Caller asked that no symlinks be followed during the lookup. */
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}
	error = namei(&nd);
	if (error) {
		goto out;
	}
	need_nameidone = 1;
	vp = nd.ni_vp;
	pvp = nd.ni_dvp;

#ifdef CONFIG_IMGSRC_ACCESS
	/* Mounting image source cannot be batched with other operations */
	/* NOTE: exact equality, not a bit test — no other flag may accompany it. */
	if (flags == MNT_IMGSRC_BY_INDEX) {
		error = relocate_imageboot_source(pvp, vp, &nd.ni_cnd, fstypename,
		    ctx, is_64bit, uap->data, (flags == MNT_IMGSRC_BY_INDEX));
		goto out;
	}
#endif /* CONFIG_IMGSRC_ACCESS */

#if CONFIG_MACF
	/*
	 * Get the label string (if any) from user space
	 */
	if (uap->mac_p != USER_ADDR_NULL) {
		struct user_mac mac;
		size_t ulen = 0;

		/* Normalize the 32-bit and 64-bit user_mac layouts into 'mac'. */
		if (is_64bit) {
			struct user64_mac mac64;
			error = copyin(uap->mac_p, &mac64, sizeof(mac64));
			mac.m_buflen = (user_size_t)mac64.m_buflen;
			mac.m_string = (user_addr_t)mac64.m_string;
		} else {
			struct user32_mac mac32;
			error = copyin(uap->mac_p, &mac32, sizeof(mac32));
			mac.m_buflen = mac32.m_buflen;
			mac.m_string = mac32.m_string;
		}
		if (error) {
			goto out;
		}
		/* Label must be at least one char plus NUL, and within the cap. */
		if ((mac.m_buflen > MAC_MAX_LABEL_BUF_LEN) ||
		    (mac.m_buflen < 2)) {
			error = EINVAL;
			goto out;
		}
		labelsz = mac.m_buflen;
		labelstr = kalloc_data(labelsz, Z_WAITOK);
		error = copyinstr(mac.m_string, labelstr, mac.m_buflen, &ulen);
		if (error) {
			goto out;
		}
		AUDIT_ARG(mac_string, labelstr);
	}
#endif /* CONFIG_MACF */

	AUDIT_ARG(fflags, flags);

#if !CONFIG_UNION_MOUNTS
	/* Union mounts are compiled out on this configuration. */
	if (flags & MNT_UNION) {
		error = EPERM;
		goto out;
	}
#endif

	/* Mounting over the root vnode of the root fs is treated as an update. */
	if ((vp->v_flag & VROOT) &&
	    (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
#if CONFIG_UNION_MOUNTS
		if (!(flags & MNT_UNION)) {
			flags |= MNT_UPDATE;
		} else {
			/*
			 * For a union mount on '/', treat it as fresh
			 * mount instead of update.
			 * Otherwise, union mouting on '/' used to panic the
			 * system before, since mnt_vnodecovered was found to
			 * be NULL for '/' which is required for unionlookup
			 * after it gets ENOENT on union mount.
			 */
			flags = (flags & ~(MNT_UPDATE));
		}
#else
		flags |= MNT_UPDATE;
#endif /* CONFIG_UNION_MOUNTS */

#if SECURE_KERNEL
		if ((flags & MNT_RDONLY) == 0) {
			/* Release kernels are not allowed to mount "/" as rw */
			error = EPERM;
			goto out;
		}
#endif

		/*
		 * See 7392553 for more details on why this check exists.
		 * Suffice to say: If this check is ON and something tries
		 * to mount the rootFS RW, we'll turn off the codesign
		 * bitmap optimization.
		 */
#if CHECK_CS_VALIDATION_BITMAP
		if ((flags & MNT_RDONLY) == 0) {
			root_fs_upgrade_try = TRUE;
		}
#endif
	}

	error = mount_common(fstypename, pvp, vp, &nd.ni_cnd, uap->data, flags, 0,
	    labelstr, ctx);

out:

#if CONFIG_MACF
	/* labelstr may still be NULL here — presumably kfree_data tolerates
	 * a NULL/zero-size pair; TODO confirm against kalloc KPI. */
	kfree_data(labelstr, labelsz);
#endif /* CONFIG_MACF */

	/* Release iocounts / namei state only if the lookup succeeded. */
	if (vp) {
		vnode_put(vp);
	}
	if (pvp) {
		vnode_put(pvp);
	}
	if (need_nameidone) {
		nameidone(&nd);
	}

	return error;
}
983
984 /*
985 * common mount implementation (final stage of mounting)
986 *
987 * Arguments:
988 * fstypename file system type (ie it's vfs name)
989 * pvp parent of covered vnode
990 * vp covered vnode
991 * cnp component name (ie path) of covered vnode
992 * flags generic mount flags
993 * fsmountargs file system specific data
994 * labelstr optional MAC label
995 * kernelmount TRUE for mounts initiated from inside the kernel
996 * ctx caller's context
997 */
static int
mount_common(const char *fstypename, vnode_t pvp, vnode_t vp,
    struct componentname *cnp, user_addr_t fsmountargs, int flags, uint32_t internal_flags,
    char *labelstr, vfs_context_t ctx)
{
#if !CONFIG_MACF
#pragma unused(labelstr)
#endif
	struct vnode *devvp = NULLVP;		/* device vnode looked up from devpath (holds an iocount) */
	struct vnode *device_vnode = NULLVP;	/* device vnode handed to VFS_MOUNT / IO attr setup */
#if CONFIG_MACF
	struct vnode *rvp;
#endif
	struct mount *mp = NULL;
	struct vfstable *vfsp = (struct vfstable *)0;
	struct proc *p = vfs_context_proc(ctx);
	int error, flag = 0;
	bool flag_set = false;			/* true once 'flag' caches mnt_flag for restore on error */
	user_addr_t devpath = USER_ADDR_NULL;
	int ronly = 0;
	int mntalloc = 0;			/* nonzero once 'mp' was allocated here (fresh mount) */
	boolean_t vfsp_ref = FALSE;		/* we bumped vfsp->vfc_refcount and must drop it on error */
	boolean_t is_rwlock_locked = FALSE;
	boolean_t did_rele = FALSE;
	boolean_t have_usecount = FALSE;
	boolean_t did_set_lmount = FALSE;
	boolean_t kernelmount = !!(internal_flags & KERNEL_MOUNT_KMOUNT);

#if CONFIG_ROSV_STARTUP || CONFIG_MOUNT_VM || CONFIG_BASESYSTEMROOT
	/* Check for mutually-exclusive flag bits */
	uint32_t checkflags = (internal_flags & (KERNEL_MOUNT_VOLBYROLE_MASK | KERNEL_MOUNT_BASESYSTEMROOT));
	int bitcount = 0;
	/* Kernighan popcount: each pass clears the lowest set bit */
	while (checkflags != 0) {
		checkflags &= (checkflags - 1);
		bitcount++;
	}

	if (bitcount > 1) {
		//not allowed to request multiple mount-by-role flags
		error = EINVAL;
		goto out1;
	}
#endif

	/*
	 * Process an update for an existing mount
	 */
	if (flags & MNT_UPDATE) {
		/* Updates may only target the root vnode of a mounted file system */
		if ((vp->v_flag & VROOT) == 0) {
			error = EINVAL;
			goto out1;
		}
		mp = vp->v_mount;

		/* if unmount or mount in progress, return error */
		mount_lock_spin(mp);
		if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
			mount_unlock(mp);
			error = EBUSY;
			goto out1;
		}
		mp->mnt_lflag |= MNT_LMOUNT;
		did_set_lmount = TRUE;
		mount_unlock(mp);
		lck_rw_lock_exclusive(&mp->mnt_rwlock);
		is_rwlock_locked = TRUE;
		/*
		 * We only allow the filesystem to be reloaded if it
		 * is currently mounted read-only.
		 */
		if ((flags & MNT_RELOAD) &&
		    ((mp->mnt_flag & MNT_RDONLY) == 0)) {
			error = ENOTSUP;
			goto out1;
		}

		/*
		 * If content protection is enabled, update mounts are not
		 * allowed to turn it off.
		 */
		if ((mp->mnt_flag & MNT_CPROTECT) &&
		    ((flags & MNT_CPROTECT) == 0)) {
			error = EINVAL;
			goto out1;
		}

		/*
		 * can't turn off MNT_REMOVABLE either but it may be an unexpected
		 * failure to return an error for this so we'll just silently
		 * add it if it is not passed in.
		 */
		if ((mp->mnt_flag & MNT_REMOVABLE) &&
		    ((flags & MNT_REMOVABLE) == 0)) {
			flags |= MNT_REMOVABLE;
		}

		/* Can't downgrade the backer of the root FS */
		if ((mp->mnt_kern_flag & MNTK_BACKS_ROOT) &&
		    (!vfs_isrdonly(mp)) && (flags & MNT_RDONLY)) {
			error = ENOTSUP;
			goto out1;
		}

		/*
		 * Only root, or the user that did the original mount is
		 * permitted to update it.
		 */
		if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
		    (error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
			goto out1;
		}
#if CONFIG_MACF
		error = mac_mount_check_remount(ctx, mp);
		if (error != 0) {
			goto out1;
		}
#endif
		/*
		 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV,
		 * and MNT_NOEXEC if mount point is already MNT_NOEXEC.
		 */
		if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
			flags |= MNT_NOSUID | MNT_NODEV;
			if (mp->mnt_flag & MNT_NOEXEC) {
				flags |= MNT_NOEXEC;
			}
		}
		/* Cache current mnt_flag so a failed update can restore it (see out1) */
		flag = mp->mnt_flag;
		flag_set = true;



		mp->mnt_flag |= flags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE);

		vfsp = mp->mnt_vtable;
		goto update;
	} // MNT_UPDATE

	/*
	 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV, and
	 * MNT_NOEXEC if mount point is already MNT_NOEXEC.
	 */
	if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
		flags |= MNT_NOSUID | MNT_NODEV;
		if (vp->v_mount->mnt_flag & MNT_NOEXEC) {
			flags |= MNT_NOEXEC;
		}
	}

	/* XXXAUDIT: Should we capture the type on the error path as well? */
	/* XXX cast-away const (audit_arg_text() does not modify its input) */
	AUDIT_ARG(text, (char *)(uintptr_t)fstypename);
	/* Find the vfs table entry for the requested fs type; take a refcount on it */
	mount_list_lock();
	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
		if (!strncmp(vfsp->vfc_name, fstypename, MFSNAMELEN)) {
			vfsp->vfc_refcount++;
			vfsp_ref = TRUE;
			break;
		}
	}
	mount_list_unlock();
	if (vfsp == NULL) {
		error = ENODEV;
		goto out1;
	}

	/*
	 * VFC_VFSLOCALARGS is not currently supported for kernel mounts,
	 * except in ROSV configs and for the initial BaseSystem root.
	 */
	if (kernelmount && (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) &&
	    ((internal_flags & KERNEL_MOUNT_VOLBYROLE_MASK) == 0) &&
	    ((internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) == 0)) {
		error = EINVAL;  /* unsupported request */
		goto out1;
	}

	/* Flushes the covered vnode and marks it VMOUNT (competing-mount check) */
	error = prepare_coveredvp(vp, ctx, cnp, fstypename, internal_flags);
	if (error != 0) {
		goto out1;
	}

	/*
	 * Allocate and initialize the filesystem (mount_t)
	 */
	mp = zalloc_flags(mount_zone, Z_WAITOK | Z_ZERO);
	mntalloc = 1;

	/* Initialize the default IO constraints */
	mp->mnt_maxreadcnt = mp->mnt_maxwritecnt = MAXPHYS;
	mp->mnt_segreadcnt = mp->mnt_segwritecnt = 32;
	mp->mnt_maxsegreadsize = mp->mnt_maxreadcnt;
	mp->mnt_maxsegwritesize = mp->mnt_maxwritecnt;
	mp->mnt_devblocksize = DEV_BSIZE;
	mp->mnt_alignmentmask = PAGE_MASK;
	mp->mnt_ioqueue_depth = MNT_DEFAULT_IOQUEUE_DEPTH;
	mp->mnt_ioscale = 1;
	mp->mnt_ioflags = 0;
	mp->mnt_realrootvp = NULLVP;
	mp->mnt_authcache_ttl = CACHED_LOOKUP_RIGHT_TTL;

	/* Mark mount-in-progress; cleared at 'exit'/'out1' */
	mp->mnt_lflag |= MNT_LMOUNT;
	did_set_lmount = TRUE;

	TAILQ_INIT(&mp->mnt_vnodelist);
	TAILQ_INIT(&mp->mnt_workerqueue);
	TAILQ_INIT(&mp->mnt_newvnodes);
	mount_lock_init(mp);
	lck_rw_lock_exclusive(&mp->mnt_rwlock);
	is_rwlock_locked = TRUE;
	mp->mnt_op = vfsp->vfc_vfsops;
	mp->mnt_vtable = vfsp;
	//mp->mnt_stat.f_type = vfsp->vfc_typenum;
	mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
	strlcpy(mp->mnt_vfsstat.f_fstypename, vfsp->vfc_name, MFSTYPENAMELEN);
	do {
		size_t pathlen = MAXPATHLEN;

		/* Prefer the fully-resolved path; fall back to the caller-supplied one */
		if (vn_getpath_ext(vp, pvp, mp->mnt_vfsstat.f_mntonname, &pathlen, VN_GETPATH_FSENTER)) {
			strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);
		}
	} while (0);
	mp->mnt_vnodecovered = vp;
	mp->mnt_vfsstat.f_owner = kauth_cred_getuid(vfs_context_ucred(ctx));
	mp->mnt_throttle_mask = LOWPRI_MAX_NUM_DEV - 1;
	mp->mnt_devbsdunit = 0;
	mp->mnt_mount_id = os_atomic_inc_orig(&mount_unique_id, relaxed);

	/* XXX 3762912 hack to support HFS filesystem 'owner' - filesystem may update later */
	vfs_setowner(mp, KAUTH_UID_NONE, KAUTH_GID_NONE);

	if (kernelmount) {
		mp->mnt_kern_flag |= MNTK_KERNEL_MOUNT;
	}
	if ((internal_flags & KERNEL_MOUNT_PERMIT_UNMOUNT) != 0) {
		mp->mnt_kern_flag |= MNTK_PERMIT_UNMOUNT;
	}

	if (KERNEL_MOUNT_DEVFS & internal_flags) {
		// kernel mounted devfs
		mp->mnt_kern_flag |= MNTK_SYSTEM;
	}

update:
	/* Both the fresh-mount path and the MNT_UPDATE path converge here */

	/*
	 * Set the mount level flags.
	 */
	if (flags & MNT_RDONLY) {
		mp->mnt_flag |= MNT_RDONLY;
	} else if (mp->mnt_flag & MNT_RDONLY) {
		// disallow read/write upgrades of file systems that
		// had the TYPENAME_OVERRIDE feature set.
		if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
			error = EPERM;
			goto out1;
		}
		mp->mnt_kern_flag |= MNTK_WANTRDWR;
	}
	/* Clear then re-apply all user-settable flags from 'flags' */
	mp->mnt_flag &= ~(MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
	    MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
	    MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
	    MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME | MNT_STRICTATIME |
	    MNT_QUARANTINE | MNT_CPROTECT);

#if SECURE_KERNEL
#if !CONFIG_MNT_SUID
	/*
	 * On release builds of iOS based platforms, always enforce NOSUID on
	 * all mounts. We do this here because we can catch update mounts as well as
	 * non-update mounts in this case.
	 */
	mp->mnt_flag |= (MNT_NOSUID);
#endif
#endif

	mp->mnt_flag |= flags & (MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
	    MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
	    MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
	    MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME | MNT_STRICTATIME |
	    MNT_QUARANTINE | MNT_CPROTECT);

#if CONFIG_MACF
	if (flags & MNT_MULTILABEL) {
		if (vfsp->vfc_vfsflags & VFC_VFSNOMACLABEL) {
			error = EINVAL;
			goto out1;
		}
		mp->mnt_flag |= MNT_MULTILABEL;
	}
#endif
	/*
	 * Process device path for local file systems if requested.
	 *
	 * Snapshot and mount-by-role mounts do not use this path; they are
	 * passing other opaque data in the device path field.
	 *
	 * Basesystemroot mounts pass a device path to be resolved here,
	 * but it's just a char * already inside the kernel, which
	 * kernel_mount() shoved into a user_addr_t to call us. So for such
	 * mounts we must skip copyin (both of the address and of the string
	 * (in NDINIT).
	 */
	if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS &&
	    !(internal_flags & (KERNEL_MOUNT_SNAPSHOT | KERNEL_MOUNT_VOLBYROLE_MASK))) {
		boolean_t do_copyin_devpath = true;
#if CONFIG_BASESYSTEMROOT
		if (internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) {
			// KERNEL_MOUNT_BASESYSTEMROOT implies subtle behavior worth noting:
			// We have been passed fsmountargs, which is typed as a user_addr_t,
			// but is actually a char ** pointing to a (kernelspace) string.
			// We manually unpack it with a series of casts and dereferences
			// that reverses what was done just above us on the stack in
			// imageboot_pivot_image().
			// After retrieving the path to the dev node (which we will NDINIT
			// in a moment), we pass NULL fsmountargs on to the filesystem.
			_Static_assert(sizeof(char **) == sizeof(fsmountargs), "fsmountargs should fit a (kernel) address");
			char **devnamepp = (char **)fsmountargs;
			char *devnamep = *devnamepp;
			devpath = CAST_USER_ADDR_T(devnamep);
			do_copyin_devpath = false;
			fsmountargs = USER_ADDR_NULL;

			//Now that we have a mp, denote that this mount is for the basesystem.
			mp->mnt_supl_kern_flag |= MNTK_SUPL_BASESYSTEM;
		}
#endif // CONFIG_BASESYSTEMROOT

		if (do_copyin_devpath) {
			if (vfs_context_is64bit(ctx)) {
				if ((error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath)))) {
					goto out1;
				}
				fsmountargs += sizeof(devpath);
			} else {
				user32_addr_t tmp;
				if ((error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp)))) {
					goto out1;
				}
				/* munge into LP64 addr */
				devpath = CAST_USER_ADDR_T(tmp);
				fsmountargs += sizeof(tmp);
			}
		}

		/* Lookup device and authorize access to it */
		if ((devpath)) {
			struct nameidata nd;

			enum uio_seg seg = UIO_USERSPACE;
#if CONFIG_BASESYSTEMROOT
			if (internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) {
				/* path string is already in kernel space; skip copyin in namei */
				seg = UIO_SYSSPACE;
			}
#endif // CONFIG_BASESYSTEMROOT

			NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW, seg, devpath, ctx);
			if ((error = namei(&nd))) {
				goto out1;
			}

			strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);
			devvp = nd.ni_vp;	/* iocount held; dropped at 'exit'/'out2' */

			nameidone(&nd);

			if (devvp->v_type != VBLK) {
				error = ENOTBLK;
				goto out2;
			}
			if (major(devvp->v_rdev) >= nblkdev) {
				error = ENXIO;
				goto out2;
			}
			/*
			 * If mount by non-root, then verify that user has necessary
			 * permissions on the device.
			 */
			if (suser(vfs_context_ucred(ctx), NULL) != 0) {
				mode_t accessmode = KAUTH_VNODE_READ_DATA;

				if ((mp->mnt_flag & MNT_RDONLY) == 0) {
					accessmode |= KAUTH_VNODE_WRITE_DATA;
				}
				if ((error = vnode_authorize(devvp, NULL, accessmode, ctx)) != 0) {
					goto out2;
				}
			}
		}
		/* On first mount, preflight and open device */
		if (devpath && ((flags & MNT_UPDATE) == 0)) {
			/* usecount on devvp; dropped on failure at out3 or on unmount */
			if ((error = vnode_ref(devvp))) {
				goto out2;
			}
			/*
			 * Disallow multiple mounts of the same device.
			 * Disallow mounting of a device that is currently in use
			 * (except for root, which might share swap device for miniroot).
			 * Flush out any old buffers remaining from a previous use.
			 */
			if ((error = vfs_mountedon(devvp))) {
				goto out3;
			}

			if (vcount(devvp) > 1 && !(vfs_flags(mp) & MNT_ROOTFS)) {
				error = EBUSY;
				goto out3;
			}
			if ((error = VNOP_FSYNC(devvp, MNT_WAIT, ctx))) {
				error = ENOTBLK;
				goto out3;
			}
			if ((error = buf_invalidateblks(devvp, BUF_WRITE_DATA, 0, 0))) {
				goto out3;
			}

			ronly = (mp->mnt_flag & MNT_RDONLY) != 0;
#if CONFIG_MACF
			error = mac_vnode_check_open(ctx,
			    devvp,
			    ronly ? FREAD : FREAD | FWRITE);
			if (error) {
				goto out3;
			}
#endif /* MAC */
			if ((error = VNOP_OPEN(devvp, ronly ? FREAD : FREAD | FWRITE, ctx))) {
				goto out3;
			}

			mp->mnt_devvp = devvp;
			device_vnode = devvp;
		} else if ((mp->mnt_flag & MNT_RDONLY) &&
		    (mp->mnt_kern_flag & MNTK_WANTRDWR) &&
		    (device_vnode = mp->mnt_devvp)) {
			dev_t dev;
			int maj;
			/*
			 * If upgrade to read-write by non-root, then verify
			 * that user has necessary permissions on the device.
			 */
			vnode_getalways(device_vnode);

			if (suser(vfs_context_ucred(ctx), NULL) &&
			    (error = vnode_authorize(device_vnode, NULL,
			    KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA,
			    ctx)) != 0) {
				vnode_put(device_vnode);
				goto out2;
			}

			/* Tell the device that we're upgrading */
			dev = (dev_t)device_vnode->v_rdev;
			maj = major(dev);

			if ((u_int)maj >= (u_int)nblkdev) {
				panic("Volume mounted on a device with invalid major number.");
			}

			error = bdevsw[maj].d_open(dev, FREAD | FWRITE, S_IFBLK, p);
			vnode_put(device_vnode);
			device_vnode = NULLVP;
			if (error != 0) {
				goto out2;
			}
		}
	} // localargs && !(snapshot | data | vm)

#if CONFIG_MACF
	if ((flags & MNT_UPDATE) == 0) {
		mac_mount_label_init(mp);
		mac_mount_label_associate(ctx, mp);
	}
	if (labelstr) {
		if ((flags & MNT_UPDATE) != 0) {
			error = mac_mount_check_label_update(ctx, mp);
			if (error != 0) {
				goto out3;
			}
		}
	}
#endif
	/*
	 * Mount the filesystem. We already asserted that internal_flags
	 * cannot have more than one mount-by-role bit set.
	 */
	if (internal_flags & KERNEL_MOUNT_SNAPSHOT) {
		error = VFS_IOCTL(mp, VFSIOC_MOUNT_SNAPSHOT,
		    (caddr_t)fsmountargs, 0, ctx);
	} else if (internal_flags & KERNEL_MOUNT_DATAVOL) {
#if CONFIG_ROSV_STARTUP
		/* fsmountargs carries the system (origin) mount for role mounts */
		struct mount *origin_mp = (struct mount*)fsmountargs;
		fs_role_mount_args_t frma = {origin_mp, VFS_DATA_ROLE};
		error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
		if (error) {
			printf("MOUNT-BY-ROLE (%d) failed! (%d)", VFS_DATA_ROLE, error);
		} else {
			/* Mark volume associated with system volume */
			mp->mnt_kern_flag |= MNTK_SYSTEM;

			/* Attempt to acquire the mnt_devvp and set it up */
			struct vnode *mp_devvp = NULL;
			if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
				errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
				    0, &mp_devvp, vfs_context_kernel());
				if (!lerr) {
					mp->mnt_devvp = mp_devvp;
					//vnode_lookup took an iocount, need to drop it.
					vnode_put(mp_devvp);
					// now set `device_vnode` to the devvp that was acquired.
					// this is needed in order to ensure vfs_init_io_attributes is invoked.
					// note that though the iocount above was dropped, the mount acquires
					// an implicit reference against the device.
					device_vnode = mp_devvp;
				}
			}
		}
#else
		error = EINVAL;
#endif
	} else if (internal_flags & KERNEL_MOUNT_VMVOL) {
#if CONFIG_MOUNT_VM
		struct mount *origin_mp = (struct mount*)fsmountargs;
		fs_role_mount_args_t frma = {origin_mp, VFS_VM_ROLE};
		error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
		if (error) {
			printf("MOUNT-BY-ROLE (%d) failed! (%d)", VFS_VM_ROLE, error);
		} else {
			/* Mark volume associated with system volume and a swap mount */
			mp->mnt_kern_flag |= (MNTK_SYSTEM | MNTK_SWAP_MOUNT);
			/* Attempt to acquire the mnt_devvp and set it up */
			struct vnode *mp_devvp = NULL;
			if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
				errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
				    0, &mp_devvp, vfs_context_kernel());
				if (!lerr) {
					mp->mnt_devvp = mp_devvp;
					//vnode_lookup took an iocount, need to drop it.
					vnode_put(mp_devvp);

					// now set `device_vnode` to the devvp that was acquired.
					// note that though the iocount above was dropped, the mount acquires
					// an implicit reference against the device.
					device_vnode = mp_devvp;
				}
			}
		}
#else
		error = EINVAL;
#endif
	} else if ((internal_flags & KERNEL_MOUNT_PREBOOTVOL) || (internal_flags & KERNEL_MOUNT_RECOVERYVOL)) {
#if CONFIG_MOUNT_PREBOOTRECOVERY
		struct mount *origin_mp = (struct mount*)fsmountargs;
		uint32_t mount_role = 0;
		if (internal_flags & KERNEL_MOUNT_PREBOOTVOL) {
			mount_role = VFS_PREBOOT_ROLE;
		} else if (internal_flags & KERNEL_MOUNT_RECOVERYVOL) {
			mount_role = VFS_RECOVERY_ROLE;
		}

		if (mount_role != 0) {
			fs_role_mount_args_t frma = {origin_mp, mount_role};
			error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
			if (error) {
				printf("MOUNT-BY-ROLE (%d) failed! (%d)", mount_role, error);
			} else {
				// NOT YET - need to qualify how this interacts with shutdown, ERP/ERB, etc
				/* Mark volume associated with system volume */
				//mp->mnt_kern_flag |= MNTK_SYSTEM;
				/* Attempt to acquire the mnt_devvp and set it up */
				struct vnode *mp_devvp = NULL;
				if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
					errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
					    0, &mp_devvp, vfs_context_kernel());
					if (!lerr) {
						mp->mnt_devvp = mp_devvp;
						//vnode_lookup took an iocount, need to drop it.
						vnode_put(mp_devvp);

						// now set `device_vnode` to the devvp that was acquired.
						// note that though the iocount above was dropped, the mount acquires
						// an implicit reference against the device.
						device_vnode = mp_devvp;
					}
				}
			}
		} else {
			printf("MOUNT-BY-ROLE (%d) failed - ROLE UNRECOGNIZED! (%d)", mount_role, error);
			error = EINVAL;
		}
#else
		error = EINVAL;
#endif
	} else {
		/* Ordinary mount: hand control to the file system */
		error = VFS_MOUNT(mp, device_vnode, fsmountargs, ctx);
	}

	if (flags & MNT_UPDATE) {
		if (mp->mnt_kern_flag & MNTK_WANTRDWR) {
			mp->mnt_flag &= ~MNT_RDONLY;
		}
		mp->mnt_flag &= ~
		    (MNT_UPDATE | MNT_RELOAD | MNT_FORCE);
		mp->mnt_kern_flag &= ~MNTK_WANTRDWR;
		if (error) {
			mp->mnt_flag = flag;  /* restore flag value */
		}
		vfs_event_signal(NULL, VQ_UPDATE, (intptr_t)NULL);
		lck_rw_done(&mp->mnt_rwlock);
		is_rwlock_locked = FALSE;
		if (!error) {
			enablequotas(mp, ctx);
		}
		goto exit;
	}

	/*
	 * Put the new filesystem on the mount list after root.
	 */
	if (error == 0) {
		struct vfs_attr vfsattr;
		if (device_vnode) {
			/*
			 * cache the IO attributes for the underlying physical media...
			 * an error return indicates the underlying driver doesn't
			 * support all the queries necessary... however, reasonable
			 * defaults will have been set, so no reason to bail or care
			 *
			 * Need to do this before calling the MAC hook as it needs
			 * information from this call.
			 */
			vfs_init_io_attributes(device_vnode, mp);
		}

#if CONFIG_MACF
		error = mac_mount_check_mount_late(ctx, mp);
		if (error != 0) {
			goto out4;
		}

		if (vfs_flags(mp) & MNT_MULTILABEL) {
			error = VFS_ROOT(mp, &rvp, ctx);
			if (error) {
				printf("%s() VFS_ROOT returned %d\n", __func__, error);
				goto out4;
			}
			error = vnode_label(mp, NULL, rvp, NULL, 0, ctx);
			/*
			 * drop reference provided by VFS_ROOT
			 */
			vnode_put(rvp);

			if (error) {
				goto out4;
			}
		}
#endif /* MAC */

		/* Hook the mount onto the covered vnode */
		vnode_lock_spin(vp);
		CLR(vp->v_flag, VMOUNT);
		vp->v_mountedhere = mp;
		vnode_unlock(vp);

		/*
		 * taking the name_cache_lock exclusively will
		 * ensure that everyone is out of the fast path who
		 * might be trying to use a now stale copy of
		 * vp->v_mountedhere->mnt_realrootvp
		 * bumping mount_generation causes the cached values
		 * to be invalidated
		 */
		name_cache_lock();
		mount_generation++;
		name_cache_unlock();

		error = vnode_ref(vp);
		if (error != 0) {
			goto out4;
		}

		have_usecount = TRUE;

		error = checkdirs(vp, ctx);
		if (error != 0) {
			/* Unmount the filesystem as cdir/rdirs cannot be updated */
			goto out4;
		}
		/*
		 * there is no cleanup code here so I have made it void
		 * we need to revisit this
		 */
		(void)VFS_START(mp, 0, ctx);

		if (mount_list_add(mp) != 0) {
			/*
			 * The system is shutting down trying to umount
			 * everything, so fail with a plausible errno.
			 */
			error = EBUSY;
			goto out4;
		}
		lck_rw_done(&mp->mnt_rwlock);
		is_rwlock_locked = FALSE;

		/* Check if this mounted file system supports EAs or named streams. */
		/* Skip WebDAV file systems for now since they hang in VFS_GETATTR here. */
		VFSATTR_INIT(&vfsattr);
		VFSATTR_WANTED(&vfsattr, f_capabilities);
		if (strncmp(mp->mnt_vfsstat.f_fstypename, "webdav", sizeof("webdav")) != 0 &&
		    vfs_getattr(mp, &vfsattr, ctx) == 0 &&
		    VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR) &&
			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR)) {
				mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
			}
#if NAMEDSTREAMS
			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS) &&
			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS)) {
				mp->mnt_kern_flag |= MNTK_NAMED_STREAMS;
			}
#endif
			/* Check if this file system supports path from id lookups. */
			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID) &&
			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID)) {
				mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
			} else if (mp->mnt_flag & MNT_DOVOLFS) {
				/* Legacy MNT_DOVOLFS flag also implies path from id lookups. */
				mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
			}

			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS) &&
			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS)) {
				mp->mnt_kern_flag |= MNTK_DIR_HARDLINKS;
			}
		}
		if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSNATIVEXATTR) {
			mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
		}
		if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSPREFLIGHT) {
			mp->mnt_kern_flag |= MNTK_UNMOUNT_PREFLIGHT;
		}
		/* increment the operations count */
		OSAddAtomic(1, &vfs_nummntops);
		enablequotas(mp, ctx);

		if (device_vnode) {
			device_vnode->v_specflags |= SI_MOUNTEDON;
		}

		/* Now that mount is setup, notify the listeners */
		vfs_notify_mount(pvp);
		IOBSDMountChange(mp, kIOMountChangeMount);
	} else {
		/* If we fail a fresh mount, there should be no vnodes left hooked into the mountpoint. */
		if (mp->mnt_vnodelist.tqh_first != NULL) {
			panic("mount_common(): mount of %s filesystem failed with %d, but vnode list is not empty.",
			    mp->mnt_vtable->vfc_name, error);
		}

		vnode_lock_spin(vp);
		CLR(vp->v_flag, VMOUNT);
		vnode_unlock(vp);
		mount_list_lock();
		mp->mnt_vtable->vfc_refcount--;
		mount_list_unlock();

		if (device_vnode) {
			vnode_rele(device_vnode);
			VNOP_CLOSE(device_vnode, ronly ? FREAD : FREAD | FWRITE, ctx);
		}
		lck_rw_done(&mp->mnt_rwlock);
		is_rwlock_locked = FALSE;

		/*
		 * if we get here, we have a mount structure that needs to be freed,
		 * but since the coveredvp hasn't yet been updated to point at it,
		 * no need to worry about other threads holding a crossref on this mp
		 * so it's ok to just free it
		 */
		mount_lock_destroy(mp);
#if CONFIG_MACF
		mac_mount_label_destroy(mp);
#endif
		zfree(mount_zone, mp);
		/* mp is gone; don't touch mnt_lflag below */
		did_set_lmount = false;
	}
exit:
	/*
	 * drop I/O count on the device vp if there was one
	 */
	if (devpath && devvp) {
		vnode_put(devvp);
	}

	if (did_set_lmount) {
		mount_lock_spin(mp);
		mp->mnt_lflag &= ~MNT_LMOUNT;
		mount_unlock(mp);
	}

	return error;

	/* Error condition exits */
out4:
	/* Failure after the FS mounted: force it back off and undo coverage */
	(void)VFS_UNMOUNT(mp, MNT_FORCE, ctx);

	/*
	 * If the mount has been placed on the covered vp,
	 * it may have been discovered by now, so we have
	 * to treat this just like an unmount
	 */
	mount_lock_spin(mp);
	mp->mnt_lflag |= MNT_LDEAD;
	mount_unlock(mp);

	if (device_vnode != NULLVP) {
		vnode_rele(device_vnode);
		VNOP_CLOSE(device_vnode, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD | FWRITE,
		    ctx);
		did_rele = TRUE;
	}

	vnode_lock_spin(vp);

	mp->mnt_crossref++;
	vp->v_mountedhere = (mount_t) 0;

	vnode_unlock(vp);

	if (have_usecount) {
		vnode_rele(vp);
	}
out3:
	if (devpath && ((flags & MNT_UPDATE) == 0) && (!did_rele)) {
		vnode_rele(devvp);
	}
out2:
	if (devpath && devvp) {
		vnode_put(devvp);
	}
out1:
	/* Release mnt_rwlock only when it was taken */
	if (is_rwlock_locked == TRUE) {
		if (flag_set) {
			mp->mnt_flag = flag;  /* restore mnt_flag value */
		}
		lck_rw_done(&mp->mnt_rwlock);
	}

	if (did_set_lmount) {
		mount_lock_spin(mp);
		mp->mnt_lflag &= ~MNT_LMOUNT;
		mount_unlock(mp);
	}

	if (mntalloc) {
		if (mp->mnt_crossref) {
			mount_dropcrossref(mp, vp, 0);
		} else {
			mount_lock_destroy(mp);
#if CONFIG_MACF
			mac_mount_label_destroy(mp);
#endif
			zfree(mount_zone, mp);
		}
	}
	if (vfsp_ref) {
		mount_list_lock();
		vfsp->vfc_refcount--;
		mount_list_unlock();
	}

	return error;
}
1871
1872 /*
1873 * Flush in-core data, check for competing mount attempts,
1874 * and set VMOUNT
1875 */
1876 int
prepare_coveredvp(vnode_t vp,vfs_context_t ctx,struct componentname * cnp,const char * fsname,uint32_t internal_flags)1877 prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, uint32_t internal_flags)
1878 {
1879 #if !CONFIG_MACF
1880 #pragma unused(cnp,fsname)
1881 #endif
1882 struct vnode_attr va;
1883 int error;
1884 boolean_t skip_auth = !!(internal_flags & KERNEL_MOUNT_NOAUTH);
1885 boolean_t is_fmount = !!(internal_flags & KERNEL_MOUNT_FMOUNT);
1886 boolean_t is_busy;
1887
1888 if (!skip_auth) {
1889 /*
1890 * If the user is not root, ensure that they own the directory
1891 * onto which we are attempting to mount.
1892 */
1893 VATTR_INIT(&va);
1894 VATTR_WANTED(&va, va_uid);
1895 if ((error = vnode_getattr(vp, &va, ctx)) ||
1896 (va.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
1897 (!vfs_context_issuser(ctx)))) {
1898 error = EPERM;
1899 goto out;
1900 }
1901 }
1902
1903 if ((error = VNOP_FSYNC(vp, MNT_WAIT, ctx))) {
1904 goto out;
1905 }
1906
1907 if ((error = buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0))) {
1908 goto out;
1909 }
1910
1911 if (vp->v_type != VDIR) {
1912 error = ENOTDIR;
1913 goto out;
1914 }
1915
1916 vnode_lock_spin(vp);
1917 is_busy = is_fmount ?
1918 (ISSET(vp->v_flag, VMOUNT) || (vp->v_mountedhere != NULL)) :
1919 (ISSET(vp->v_flag, VMOUNT) && (vp->v_mountedhere != NULL));
1920 if (is_busy) {
1921 vnode_unlock(vp);
1922 error = EBUSY;
1923 goto out;
1924 }
1925 SET(vp->v_flag, VMOUNT);
1926 vnode_unlock(vp);
1927
1928 #if CONFIG_MACF
1929 error = mac_mount_check_mount(ctx, vp,
1930 cnp, fsname);
1931 if (error != 0) {
1932 vnode_lock_spin(vp);
1933 CLR(vp->v_flag, VMOUNT);
1934 vnode_unlock(vp);
1935 }
1936 #endif
1937
1938 out:
1939 return error;
1940 }
1941
1942 #if CONFIG_IMGSRC_ACCESS
1943
1944 #define DEBUG_IMGSRC 0
1945
1946 #if DEBUG_IMGSRC
1947 #define IMGSRC_DEBUG(args...) printf("imgsrc: " args)
1948 #else
1949 #define IMGSRC_DEBUG(args...) do { } while(0)
1950 #endif
1951
1952 static int
authorize_devpath_and_update_mntfromname(mount_t mp,user_addr_t devpath,vnode_t * devvpp,vfs_context_t ctx)1953 authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx)
1954 {
1955 struct nameidata nd;
1956 vnode_t vp, realdevvp;
1957 mode_t accessmode;
1958 int error;
1959 enum uio_seg uio = UIO_USERSPACE;
1960
1961 if (ctx == vfs_context_kernel()) {
1962 uio = UIO_SYSSPACE;
1963 }
1964
1965 NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW, uio, devpath, ctx);
1966 if ((error = namei(&nd))) {
1967 IMGSRC_DEBUG("namei() failed with %d\n", error);
1968 return error;
1969 }
1970
1971 vp = nd.ni_vp;
1972
1973 if (!vnode_isblk(vp)) {
1974 IMGSRC_DEBUG("Not block device.\n");
1975 error = ENOTBLK;
1976 goto out;
1977 }
1978
1979 realdevvp = mp->mnt_devvp;
1980 if (realdevvp == NULLVP) {
1981 IMGSRC_DEBUG("No device backs the mount.\n");
1982 error = ENXIO;
1983 goto out;
1984 }
1985
1986 error = vnode_getwithref(realdevvp);
1987 if (error != 0) {
1988 IMGSRC_DEBUG("Coudn't get iocount on device.\n");
1989 goto out;
1990 }
1991
1992 if (vnode_specrdev(vp) != vnode_specrdev(realdevvp)) {
1993 IMGSRC_DEBUG("Wrong dev_t.\n");
1994 error = ENXIO;
1995 goto out1;
1996 }
1997
1998 strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);
1999
2000 /*
2001 * If mount by non-root, then verify that user has necessary
2002 * permissions on the device.
2003 */
2004 if (!vfs_context_issuser(ctx)) {
2005 accessmode = KAUTH_VNODE_READ_DATA;
2006 if ((mp->mnt_flag & MNT_RDONLY) == 0) {
2007 accessmode |= KAUTH_VNODE_WRITE_DATA;
2008 }
2009 if ((error = vnode_authorize(vp, NULL, accessmode, ctx)) != 0) {
2010 IMGSRC_DEBUG("Access denied.\n");
2011 goto out1;
2012 }
2013 }
2014
2015 *devvpp = vp;
2016
2017 out1:
2018 vnode_put(realdevvp);
2019
2020 out:
2021 nameidone(&nd);
2022
2023 if (error) {
2024 vnode_put(vp);
2025 }
2026
2027 return error;
2028 }
2029
2030 /*
2031 * Clear VMOUNT, set v_mountedhere, and mnt_vnodecovered, ref the vnode,
2032 * and call checkdirs()
2033 */
/*
 * Publish mount mp on covered vnode vp: clear VMOUNT, set vp->v_mountedhere
 * and mp->mnt_vnodecovered, bump the name cache generation, take a long-term
 * usecount reference on vp, and retarget any process cwd/rootdir via
 * checkdirs().  On failure mnt_vnodecovered is reset here, but v_mountedhere
 * stays set and VMOUNT stays cleared -- the caller unwinds those (see
 * undo_place_on_covered_vp()).
 */
static int
place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx)
{
	int error;

	mp->mnt_vnodecovered = vp; /* XXX This is normally only set at init-time ... */

	IMGSRC_DEBUG("placing: fsname = %s, vp = %s\n",
	    mp->mnt_vtable->vfc_name, vnode_getname(vp));

	/* Swap the mount-in-progress marker (VMOUNT) for the real mount hook */
	vnode_lock_spin(vp);
	CLR(vp->v_flag, VMOUNT);
	vp->v_mountedhere = mp;
	vnode_unlock(vp);

	/*
	 * taking the name_cache_lock exclusively will
	 * insure that everyone is out of the fast path who
	 * might be trying to use a now stale copy of
	 * vp->v_mountedhere->mnt_realrootvp
	 * bumping mount_generation causes the cached values
	 * to be invalidated
	 */
	name_cache_lock();
	mount_generation++;
	name_cache_unlock();

	/* Usecount reference on the covered vnode for the lifetime of the mount */
	error = vnode_ref(vp);
	if (error != 0) {
		goto out;
	}

	error = checkdirs(vp, ctx);
	if (error != 0) {
		/* Unmount the filesystem as cdir/rdirs cannot be updated */
		vnode_rele(vp);
		goto out;
	}

out:
	if (error != 0) {
		mp->mnt_vnodecovered = NULLVP;
	}
	return error;
}
2079
/*
 * Undo place_mount_and_checkdirs(): drop the usecount reference taken on the
 * covered vnode and detach the mount from it.  VMOUNT is not re-set; the
 * mount attempt is being abandoned, not restarted.
 */
static void
undo_place_on_covered_vp(mount_t mp, vnode_t vp)
{
	vnode_rele(vp);
	vnode_lock_spin(vp);
	vp->v_mountedhere = (mount_t)NULL;
	vnode_unlock(vp);

	mp->mnt_vnodecovered = NULLVP;
}
2090
2091 static int
mount_begin_update(mount_t mp,vfs_context_t ctx,int flags)2092 mount_begin_update(mount_t mp, vfs_context_t ctx, int flags)
2093 {
2094 int error;
2095
2096 /* unmount in progress return error */
2097 mount_lock_spin(mp);
2098 if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
2099 mount_unlock(mp);
2100 return EBUSY;
2101 }
2102 mount_unlock(mp);
2103 lck_rw_lock_exclusive(&mp->mnt_rwlock);
2104
2105 /*
2106 * We only allow the filesystem to be reloaded if it
2107 * is currently mounted read-only.
2108 */
2109 if ((flags & MNT_RELOAD) &&
2110 ((mp->mnt_flag & MNT_RDONLY) == 0)) {
2111 error = ENOTSUP;
2112 goto out;
2113 }
2114
2115 /*
2116 * Only root, or the user that did the original mount is
2117 * permitted to update it.
2118 */
2119 if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
2120 (!vfs_context_issuser(ctx))) {
2121 error = EPERM;
2122 goto out;
2123 }
2124 #if CONFIG_MACF
2125 error = mac_mount_check_remount(ctx, mp);
2126 if (error != 0) {
2127 goto out;
2128 }
2129 #endif
2130
2131 out:
2132 if (error) {
2133 lck_rw_done(&mp->mnt_rwlock);
2134 }
2135
2136 return error;
2137 }
2138
/*
 * End an update operation started by mount_begin_update(): release the
 * exclusive mount rwlock.
 */
static void
mount_end_update(mount_t mp)
{
	lck_rw_done(&mp->mnt_rwlock);
}
2144
2145 static int
get_imgsrc_rootvnode(uint32_t height,vnode_t * rvpp)2146 get_imgsrc_rootvnode(uint32_t height, vnode_t *rvpp)
2147 {
2148 vnode_t vp;
2149
2150 if (height >= MAX_IMAGEBOOT_NESTING) {
2151 return EINVAL;
2152 }
2153
2154 vp = imgsrc_rootvnodes[height];
2155 if ((vp != NULLVP) && (vnode_get(vp) == 0)) {
2156 *rvpp = vp;
2157 return 0;
2158 } else {
2159 return ENOENT;
2160 }
2161 }
2162
2163 static int
relocate_imageboot_source(vnode_t pvp,vnode_t vp,struct componentname * cnp,const char * fsname,vfs_context_t ctx,boolean_t is64bit,user_addr_t fsmountargs,boolean_t by_index)2164 relocate_imageboot_source(vnode_t pvp, vnode_t vp,
2165 struct componentname *cnp, const char *fsname, vfs_context_t ctx,
2166 boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index)
2167 {
2168 int error;
2169 mount_t mp;
2170 boolean_t placed = FALSE;
2171 struct vfstable *vfsp;
2172 user_addr_t devpath;
2173 char *old_mntonname;
2174 vnode_t rvp;
2175 vnode_t devvp;
2176 uint32_t height;
2177 uint32_t flags;
2178
2179 /* If we didn't imageboot, nothing to move */
2180 if (imgsrc_rootvnodes[0] == NULLVP) {
2181 return EINVAL;
2182 }
2183
2184 /* Only root can do this */
2185 if (!vfs_context_issuser(ctx)) {
2186 return EPERM;
2187 }
2188
2189 IMGSRC_DEBUG("looking for root vnode.\n");
2190
2191 /*
2192 * Get root vnode of filesystem we're moving.
2193 */
2194 if (by_index) {
2195 if (is64bit) {
2196 struct user64_mnt_imgsrc_args mia64;
2197 error = copyin(fsmountargs, &mia64, sizeof(mia64));
2198 if (error != 0) {
2199 IMGSRC_DEBUG("Failed to copy in arguments.\n");
2200 return error;
2201 }
2202
2203 height = mia64.mi_height;
2204 flags = mia64.mi_flags;
2205 devpath = (user_addr_t)mia64.mi_devpath;
2206 } else {
2207 struct user32_mnt_imgsrc_args mia32;
2208 error = copyin(fsmountargs, &mia32, sizeof(mia32));
2209 if (error != 0) {
2210 IMGSRC_DEBUG("Failed to copy in arguments.\n");
2211 return error;
2212 }
2213
2214 height = mia32.mi_height;
2215 flags = mia32.mi_flags;
2216 devpath = mia32.mi_devpath;
2217 }
2218 } else {
2219 /*
2220 * For binary compatibility--assumes one level of nesting.
2221 */
2222 if (is64bit) {
2223 if ((error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath)))) {
2224 return error;
2225 }
2226 } else {
2227 user32_addr_t tmp;
2228 if ((error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp)))) {
2229 return error;
2230 }
2231
2232 /* munge into LP64 addr */
2233 devpath = CAST_USER_ADDR_T(tmp);
2234 }
2235
2236 height = 0;
2237 flags = 0;
2238 }
2239
2240 if (flags != 0) {
2241 IMGSRC_DEBUG("%s: Got nonzero flags.\n", __FUNCTION__);
2242 return EINVAL;
2243 }
2244
2245 error = get_imgsrc_rootvnode(height, &rvp);
2246 if (error != 0) {
2247 IMGSRC_DEBUG("getting old root vnode failed with %d\n", error);
2248 return error;
2249 }
2250
2251 IMGSRC_DEBUG("got old root vnode\n");
2252
2253 old_mntonname = zalloc_flags(ZV_NAMEI, Z_WAITOK);
2254
2255 /* Can only move once */
2256 mp = vnode_mount(rvp);
2257 if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
2258 IMGSRC_DEBUG("Already moved.\n");
2259 error = EBUSY;
2260 goto out0;
2261 }
2262
2263 IMGSRC_DEBUG("moving rvp: fsname = %s\n", mp->mnt_vtable->vfc_name);
2264 IMGSRC_DEBUG("Starting updated.\n");
2265
2266 /* Get exclusive rwlock on mount, authorize update on mp */
2267 error = mount_begin_update(mp, ctx, 0);
2268 if (error != 0) {
2269 IMGSRC_DEBUG("Starting updated failed with %d\n", error);
2270 goto out0;
2271 }
2272
2273 /*
2274 * It can only be moved once. Flag is set under the rwlock,
2275 * so we're now safe to proceed.
2276 */
2277 if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
2278 IMGSRC_DEBUG("Already moved [2]\n");
2279 goto out1;
2280 }
2281
2282 IMGSRC_DEBUG("Preparing coveredvp.\n");
2283
2284 /* Mark covered vnode as mount in progress, authorize placing mount on top */
2285 error = prepare_coveredvp(vp, ctx, cnp, fsname, 0);
2286 if (error != 0) {
2287 IMGSRC_DEBUG("Preparing coveredvp failed with %d.\n", error);
2288 goto out1;
2289 }
2290
2291 IMGSRC_DEBUG("Covered vp OK.\n");
2292
2293 /* Sanity check the name caller has provided */
2294 vfsp = mp->mnt_vtable;
2295 if (strncmp(vfsp->vfc_name, fsname, MFSNAMELEN) != 0) {
2296 IMGSRC_DEBUG("Wrong fs name: actual = %s, expected = %s\n",
2297 vfsp->vfc_name, fsname);
2298 error = EINVAL;
2299 goto out2;
2300 }
2301
2302 /* Check the device vnode and update mount-from name, for local filesystems */
2303 if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) {
2304 IMGSRC_DEBUG("Local, doing device validation.\n");
2305
2306 if (devpath != USER_ADDR_NULL) {
2307 error = authorize_devpath_and_update_mntfromname(mp, devpath, &devvp, ctx);
2308 if (error) {
2309 IMGSRC_DEBUG("authorize_devpath_and_update_mntfromname() failed.\n");
2310 goto out2;
2311 }
2312
2313 vnode_put(devvp);
2314 }
2315 }
2316
2317 /*
2318 * Place mp on top of vnode, ref the vnode, call checkdirs(),
2319 * and increment the name cache's mount generation
2320 */
2321
2322 IMGSRC_DEBUG("About to call place_mount_and_checkdirs().\n");
2323 error = place_mount_and_checkdirs(mp, vp, ctx);
2324 if (error != 0) {
2325 goto out2;
2326 }
2327
2328 placed = TRUE;
2329
2330 strlcpy(old_mntonname, mp->mnt_vfsstat.f_mntonname, MAXPATHLEN);
2331 strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);
2332
2333 /* Forbid future moves */
2334 mount_lock(mp);
2335 mp->mnt_kern_flag |= MNTK_HAS_MOVED;
2336 mount_unlock(mp);
2337
2338 /* Finally, add to mount list, completely ready to go */
2339 if (mount_list_add(mp) != 0) {
2340 /*
2341 * The system is shutting down trying to umount
2342 * everything, so fail with a plausible errno.
2343 */
2344 error = EBUSY;
2345 goto out3;
2346 }
2347
2348 mount_end_update(mp);
2349 vnode_put(rvp);
2350 zfree(ZV_NAMEI, old_mntonname);
2351
2352 vfs_notify_mount(pvp);
2353
2354 return 0;
2355 out3:
2356 strlcpy(mp->mnt_vfsstat.f_mntonname, old_mntonname, MAXPATHLEN);
2357
2358 mount_lock(mp);
2359 mp->mnt_kern_flag &= ~(MNTK_HAS_MOVED);
2360 mount_unlock(mp);
2361
2362 out2:
2363 /*
2364 * Placing the mp on the vnode clears VMOUNT,
2365 * so cleanup is different after that point
2366 */
2367 if (placed) {
2368 /* Rele the vp, clear VMOUNT and v_mountedhere */
2369 undo_place_on_covered_vp(mp, vp);
2370 } else {
2371 vnode_lock_spin(vp);
2372 CLR(vp->v_flag, VMOUNT);
2373 vnode_unlock(vp);
2374 }
2375 out1:
2376 mount_end_update(mp);
2377
2378 out0:
2379 vnode_put(rvp);
2380 zfree(ZV_NAMEI, old_mntonname);
2381 return error;
2382 }
2383
2384 #endif /* CONFIG_IMGSRC_ACCESS */
2385
/*
 * Enable disk quotas on a freshly mounted HFS filesystem, for each quota
 * type whose quota-ops trigger file exists at the mount root.  Errors are
 * deliberately ignored so quota setup never blocks the mount itself.
 * No-op for non-HFS filesystems.
 */
void
enablequotas(struct mount *mp, vfs_context_t ctx)
{
	struct nameidata qnd;
	int type;
	char qfpath[MAXPATHLEN];
	const char *qfname = QUOTAFILENAME;
	const char *qfopsname = QUOTAOPSNAME;
	const char *qfextension[] = INITQFNAMES;

	/* XXX Should be an MNTK_ flag, instead of strncmp()'s */
	if (strncmp(mp->mnt_vfsstat.f_fstypename, "hfs", sizeof("hfs")) != 0) {
		return;
	}
	/*
	 * Enable filesystem disk quotas if necessary.
	 * We ignore errors as this should not interfere with final mount
	 */
	for (type = 0; type < MAXQUOTAS; type++) {
		/* Probe for the per-type quota-ops trigger file */
		snprintf(qfpath, sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfopsname, qfextension[type]);
		NDINIT(&qnd, LOOKUP, OP_MOUNT, FOLLOW, UIO_SYSSPACE,
		    CAST_USER_ADDR_T(qfpath), ctx);
		if (namei(&qnd) != 0) {
			continue; /* option file to trigger quotas is not present */
		}
		vnode_put(qnd.ni_vp);
		nameidone(&qnd);
		/* Trigger file exists: turn quotas on using the real quota file */
		snprintf(qfpath, sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfname, qfextension[type]);

		(void) VFS_QUOTACTL(mp, QCMD(Q_QUOTAON, type), 0, qfpath, ctx);
	}
	return;
}
2419
2420
/*
 * Per-process callback for checkdirs().  If the process's current or root
 * directory is olddp (the vnode just covered by a new mount), repoint it at
 * newdp (the new filesystem's root).  Reference accounting: two references
 * on newdp are taken up front; for each directory actually switched, the
 * new reference is kept and the displaced olddp reference is dropped;
 * unused newdp references are dropped at the end.
 */
static int
checkdirs_callback(proc_t p, void * arg)
{
	struct cdirargs *cdrp = (struct cdirargs *)arg;
	vnode_t olddp = cdrp->olddp;
	vnode_t newdp = cdrp->newdp;
	struct filedesc *fdp = &p->p_fd;
	vnode_t new_cvp = newdp;
	vnode_t new_rvp = newdp;
	vnode_t old_cvp = NULL;
	vnode_t old_rvp = NULL;

	/*
	 * XXX Also needs to iterate each thread in the process to see if it
	 * XXX is using a per-thread current working directory, and, if so,
	 * XXX update that as well.
	 */

	/*
	 * First, with the proc_fdlock held, check to see if we will need
	 * to do any work. If not, we will get out fast.
	 */
	proc_fdlock(p);
	if (fdp->fd_cdir != olddp && fdp->fd_rdir != olddp) {
		proc_fdunlock(p);
		return PROC_RETURNED;
	}
	proc_fdunlock(p);

	/*
	 * Ok, we will have to do some work. Always take two refs
	 * because we might need that many. We'll dispose of whatever
	 * we ended up not using.
	 */
	if (vnode_ref(newdp) != 0) {
		return PROC_RETURNED;
	}
	if (vnode_ref(newdp) != 0) {
		/* second ref failed: give back the first and bail */
		vnode_rele(newdp);
		return PROC_RETURNED;
	}

	proc_dirs_lock_exclusive(p);
	/*
	 * Now do the work. Note: we dropped the proc_fdlock, so we
	 * have to do all of the checks again.
	 */
	proc_fdlock(p);
	if (fdp->fd_cdir == olddp) {
		old_cvp = olddp;
		fdp->fd_cdir = newdp;
		new_cvp = NULL;	/* this newdp ref is now owned by fd_cdir */
	}
	if (fdp->fd_rdir == olddp) {
		old_rvp = olddp;
		fdp->fd_rdir = newdp;
		new_rvp = NULL;	/* this newdp ref is now owned by fd_rdir */
	}
	proc_fdunlock(p);
	proc_dirs_unlock_exclusive(p);

	/*
	 * Dispose of any references that are no longer needed.
	 */
	if (old_cvp != NULL) {
		vnode_rele(old_cvp);
	}
	if (old_rvp != NULL) {
		vnode_rele(old_rvp);
	}
	if (new_cvp != NULL) {
		vnode_rele(new_cvp);
	}
	if (new_rvp != NULL) {
		vnode_rele(new_rvp);
	}

	return PROC_RETURNED;
}
2500
2501
2502
2503 /*
2504 * Scan all active processes to see if any of them have a current
2505 * or root directory onto which the new filesystem has just been
2506 * mounted. If so, replace them with the new mount point.
2507 */
/*
 * Scan all active processes to see if any of them have a current
 * or root directory onto which the new filesystem has just been
 * mounted. If so, replace them with the new mount point.
 * Also swaps the system rootvnode if it was the covered vnode.
 */
static int
checkdirs(vnode_t olddp, vfs_context_t ctx)
{
	vnode_t newdp;
	vnode_t tvp;
	int err;
	struct cdirargs cdr;

	/*
	 * Unlocked fast path: a usecount of 1 means only the mount itself
	 * references olddp, so no process can have it as cwd/rootdir.
	 */
	if (olddp->v_usecount == 1) {
		return 0;
	}
	/* Get the root of the filesystem now mounted on olddp (with iocount) */
	err = VFS_ROOT(olddp->v_mountedhere, &newdp, ctx);

	if (err != 0) {
#if DIAGNOSTIC
		panic("mount: lost mount: error %d", err);
#endif
		return err;
	}

	cdr.olddp = olddp;
	cdr.newdp = newdp;
	/* do not block for exec/fork trans as the vp in cwd & rootdir are not changing */
	proc_iterate(PROC_ALLPROCLIST | PROC_NOWAITTRANS, checkdirs_callback, (void *)&cdr, NULL, NULL);

	if (rootvnode == olddp) {
		/* take the new ref before publishing, drop the old one after */
		vnode_ref(newdp);
		lck_rw_lock_exclusive(&rootvnode_rw_lock);
		tvp = rootvnode;
		rootvnode = newdp;
		lck_rw_unlock_exclusive(&rootvnode_rw_lock);
		vnode_rele(tvp);
	}

	vnode_put(newdp);
	return 0;
}
2545
2546 #define ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT \
2547 "com.apple.private.vfs.role-account-unmount"
2548
2549 /*
2550 * Unmount a file system.
2551 *
2552 * Note: unmount takes a path to the vnode mounted on as argument,
2553 * not special file (as before).
2554 */
2555 /* ARGSUSED */
int
unmount(__unused proc_t p, struct unmount_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	struct mount *mp;
	int error;
	struct nameidata nd;
	vfs_context_t ctx;

	/*
	 * If the process has the entitlement, use the kernel's context when
	 * performing lookup on the mount path as the process might lack proper
	 * permission to access the directory.
	 */
	ctx = IOCurrentTaskHasEntitlement(ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT) ?
	    vfs_context_kernel() : vfs_context_current();

	NDINIT(&nd, LOOKUP, OP_UNMOUNT, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;
	mp = vp->v_mount;
	nameidone(&nd);

#if CONFIG_MACF
	error = mac_mount_check_umount(ctx, mp);
	if (error != 0) {
		vnode_put(vp);
		return error;
	}
#endif
	/*
	 * Must be the root of the filesystem
	 */
	if ((vp->v_flag & VROOT) == 0) {
		vnode_put(vp);
		return EINVAL;
	}
	/* hold the mount across dropping the vnode iocount */
	mount_ref(mp, 0);
	vnode_put(vp);
	/* safedounmount consumes the mount ref */
	return safedounmount(mp, uap->flags, ctx);
}
2602
2603 int
vfs_unmountbyfsid(fsid_t * fsid,int flags,vfs_context_t ctx)2604 vfs_unmountbyfsid(fsid_t *fsid, int flags, vfs_context_t ctx)
2605 {
2606 mount_t mp;
2607
2608 mp = mount_list_lookupby_fsid(fsid, 0, 1);
2609 if (mp == (mount_t)0) {
2610 return ENOENT;
2611 }
2612 mount_ref(mp, 0);
2613 mount_iterdrop(mp);
2614 /* safedounmount consumes the mount ref */
2615 return safedounmount(mp, flags, ctx);
2616 }
2617
2618 /*
2619 * The mount struct comes with a mount ref which will be consumed.
2620 * Do the actual file system unmount, prevent some common foot shooting.
2621 */
int
safedounmount(struct mount *mp, int flags, vfs_context_t ctx)
{
	int error;
	proc_t p = vfs_context_proc(ctx);

	/*
	 * If the file system is not responding and MNT_NOBLOCK
	 * is set and not a forced unmount then return EBUSY.
	 */
	if ((mp->mnt_kern_flag & MNT_LNOTRESP) &&
	    (flags & MNT_NOBLOCK) && ((flags & MNT_FORCE) == 0)) {
		error = EBUSY;
		goto out;
	}

	/*
	 * Skip authorization in two cases:
	 * - If the process running the unmount has ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT.
	 *   This entitlement allows non-root processes unmount volumes mounted by
	 *   other processes.
	 * - If the mount is tagged as permissive and this is not a forced-unmount
	 *   attempt.
	 */
	if (!IOCurrentTaskHasEntitlement(ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT) &&
	    (!(((mp->mnt_kern_flag & MNTK_PERMIT_UNMOUNT) != 0) && ((flags & MNT_FORCE) == 0)))) {
		/*
		 * Only root, or the user that did the original mount is
		 * permitted to unmount this filesystem.
		 */
		if ((mp->mnt_vfsstat.f_owner != kauth_cred_getuid(kauth_cred_get())) &&
		    (error = suser(kauth_cred_get(), &p->p_acflag))) {
			goto out;
		}
	}
	/*
	 * Don't allow unmounting the root file system, or other volumes
	 * associated with it (for example, the associated VM or DATA mounts) .
	 */
	if ((mp->mnt_flag & MNT_ROOTFS) || (mp->mnt_kern_flag & MNTK_SYSTEM)) {
		if (!(mp->mnt_flag & MNT_ROOTFS)) {
			printf("attempt to unmount a system mount (%s), will return EBUSY\n",
			    mp->mnt_vfsstat.f_mntonname);
		}
		error = EBUSY; /* the root (or associated volumes) is always busy */
		goto out;
	}

	/*
	 * If the mount is providing the root filesystem's disk image
	 * (i.e. imageboot), don't allow unmounting
	 */
	if (mp->mnt_kern_flag & MNTK_BACKS_ROOT) {
		error = EBUSY;
		goto out;
	}

	/* dounmount() consumes the mount ref (withref == 1) */
	return dounmount(mp, flags, 1, ctx);

out:
	/* failed before dounmount(): consume the caller's mount ref here */
	mount_drop(mp, 0);
	return error;
}
2685
2686 /*
2687 * Do the actual file system unmount.
2688 */
int
dounmount(struct mount *mp, int flags, int withref, vfs_context_t ctx)
{
	vnode_t coveredvp = (vnode_t)0;
	int error;
	int needwakeup = 0;
	int forcedunmount = 0;
	int lflags = 0;
	struct vnode *devvp = NULLVP;
#if CONFIG_TRIGGERS
	proc_t p = vfs_context_proc(ctx);
	int did_vflush = 0;
	int pflags_save = 0;
#endif /* CONFIG_TRIGGERS */

#if CONFIG_FSE
	if (!(flags & MNT_FORCE)) {
		fsevent_unmount(mp, ctx); /* has to come first! */
	}
#endif

	mount_lock(mp);

	/*
	 * If already an unmount in progress just return EBUSY.
	 * Even a forced unmount cannot override.
	 */
	if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
		if (withref != 0) {
			mount_drop(mp, 1);
		}
		mount_unlock(mp);
		return EBUSY;
	}

	if (flags & MNT_FORCE) {
		forcedunmount = 1;
		mp->mnt_lflag |= MNT_LFORCE;
	}

#if CONFIG_TRIGGERS
	if (flags & MNT_NOBLOCK && p != kernproc) {
		/* keep this process from hanging on unresponsive remote fs */
		pflags_save = OSBitOrAtomic(P_NOREMOTEHANG, &p->p_flag);
	}
#endif

	/* Publish unmount-in-progress state before dropping the mount lock */
	mp->mnt_kern_flag |= MNTK_UNMOUNT;
	mp->mnt_lflag |= MNT_LUNMOUNT;
	mp->mnt_flag &= ~MNT_ASYNC;
	/*
	 * anyone currently in the fast path that
	 * trips over the cached rootvp will be
	 * dumped out and forced into the slow path
	 * to regenerate a new cached value
	 */
	mp->mnt_realrootvp = NULLVP;
	mount_unlock(mp);

	if (forcedunmount && (flags & MNT_LNOSUB) == 0) {
		/*
		 * Force unmount any mounts in this filesystem.
		 * If any unmounts fail - just leave them dangling.
		 * Avoids recursion.
		 */
		(void) dounmount_submounts(mp, flags | MNT_LNOSUB, ctx);
	}

	/*
	 * taking the name_cache_lock exclusively will
	 * insure that everyone is out of the fast path who
	 * might be trying to use a now stale copy of
	 * vp->v_mountedhere->mnt_realrootvp
	 * bumping mount_generation causes the cached values
	 * to be invalidated
	 */
	name_cache_lock();
	mount_generation++;
	name_cache_unlock();


	lck_rw_lock_exclusive(&mp->mnt_rwlock);
	if (withref != 0) {
		mount_drop(mp, 0);
	}
	error = 0;
	if (forcedunmount == 0) {
		ubc_umount(mp); /* release cached vnodes */
		if ((mp->mnt_flag & MNT_RDONLY) == 0) {
			error = VFS_SYNC(mp, MNT_WAIT, ctx);
			if (error) {
				/* sync failed: abort, restoring the mount to usable state */
				mount_lock(mp);
				mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
				mp->mnt_lflag &= ~MNT_LUNMOUNT;
				mp->mnt_lflag &= ~MNT_LFORCE;
				goto out;
			}
		}
	}

	IOBSDMountChange(mp, kIOMountChangeUnmount);

#if CONFIG_TRIGGERS
	vfs_nested_trigger_unmounts(mp, flags, ctx);
	did_vflush = 1;
#endif
	if (forcedunmount) {
		lflags |= FORCECLOSE;
	}
	/* flush all vnodes on this mount (FORCECLOSE reclaims busy ones too) */
	error = vflush(mp, NULLVP, SKIPSWAP | SKIPSYSTEM | SKIPROOT | lflags);
	if ((forcedunmount == 0) && error) {
		mount_lock(mp);
		mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
		mp->mnt_lflag &= ~MNT_LUNMOUNT;
		mp->mnt_lflag &= ~MNT_LFORCE;
		goto out;
	}

	/* make sure there are no one in the mount iterations or lookup */
	mount_iterdrain(mp);

	error = VFS_UNMOUNT(mp, flags, ctx);
	if (error) {
		/* filesystem refused: re-open for iteration and restore flags */
		mount_iterreset(mp);
		mount_lock(mp);
		mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
		mp->mnt_lflag &= ~MNT_LUNMOUNT;
		mp->mnt_lflag &= ~MNT_LFORCE;
		goto out;
	}

	/* increment the operations count */
	if (!error) {
		OSAddAtomic(1, &vfs_nummntops);
	}

	if (mp->mnt_devvp && mp->mnt_vtable->vfc_vfsflags & VFC_VFSLOCALARGS) {
		/* hold an io reference and drop the usecount before close */
		devvp = mp->mnt_devvp;
		vnode_getalways(devvp);
		vnode_rele(devvp);
		VNOP_CLOSE(devvp, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD | FWRITE,
		    ctx);
		vnode_clearmountedon(devvp);
		vnode_put(devvp);
	}
	lck_rw_done(&mp->mnt_rwlock);
	mount_list_remove(mp);
	lck_rw_lock_exclusive(&mp->mnt_rwlock);

	/* mark the mount point hook in the vp but not drop the ref yet */
	if ((coveredvp = mp->mnt_vnodecovered) != NULLVP) {
		/*
		 * The covered vnode needs special handling. Trying to get an
		 * iocount must not block here as this may lead to deadlocks
		 * if the Filesystem to which the covered vnode belongs is
		 * undergoing forced unmounts. Since we hold a usecount, the
		 * vnode cannot be reused (it can, however, still be terminated)
		 */
		vnode_getalways(coveredvp);
		vnode_lock_spin(coveredvp);

		mp->mnt_crossref++;
		coveredvp->v_mountedhere = (struct mount *)0;
		CLR(coveredvp->v_flag, VMOUNT);

		vnode_unlock(coveredvp);
		vnode_put(coveredvp);
	}

	mount_list_lock();
	mp->mnt_vtable->vfc_refcount--;
	mount_list_unlock();

	cache_purgevfs(mp); /* remove cache entries for this file sys */
	vfs_event_signal(NULL, VQ_UNMOUNT, (intptr_t)NULL);
	mount_lock(mp);
	mp->mnt_lflag |= MNT_LDEAD;

	if (mp->mnt_lflag & MNT_LWAIT) {
		/*
		 * do the wakeup here
		 * in case we block in mount_refdrain
		 * which will drop the mount lock
		 * and allow anyone blocked in vfs_busy
		 * to wakeup and see the LDEAD state
		 */
		mp->mnt_lflag &= ~MNT_LWAIT;
		wakeup((caddr_t)mp);
	}
	mount_refdrain(mp);

	/* free disk_conditioner_info structure for this mount */
	disk_conditioner_unmount(mp);

out:
	/* common exit: reached on both success (fallthrough) and failure paths */
	if (mp->mnt_lflag & MNT_LWAIT) {
		mp->mnt_lflag &= ~MNT_LWAIT;
		needwakeup = 1;
	}

#if CONFIG_TRIGGERS
	if (flags & MNT_NOBLOCK && p != kernproc) {
		// Restore P_NOREMOTEHANG bit to its previous value
		if ((pflags_save & P_NOREMOTEHANG) == 0) {
			OSBitAndAtomic(~((uint32_t) P_NOREMOTEHANG), &p->p_flag);
		}
	}

	/*
	 * Callback and context are set together under the mount lock, and
	 * never cleared, so we're safe to examine them here, drop the lock,
	 * and call out.
	 */
	if (mp->mnt_triggercallback != NULL) {
		mount_unlock(mp);
		if (error == 0) {
			mp->mnt_triggercallback(mp, VTC_RELEASE, mp->mnt_triggerdata, ctx);
		} else if (did_vflush) {
			mp->mnt_triggercallback(mp, VTC_REPLACE, mp->mnt_triggerdata, ctx);
		}
	} else {
		mount_unlock(mp);
	}
#else
	mount_unlock(mp);
#endif /* CONFIG_TRIGGERS */

	lck_rw_done(&mp->mnt_rwlock);

	if (needwakeup) {
		wakeup((caddr_t)mp);
	}

	if (!error) {
		if ((coveredvp != NULLVP)) {
			vnode_t pvp = NULLVP;

			/*
			 * The covered vnode needs special handling. Trying to
			 * get an iocount must not block here as this may lead
			 * to deadlocks if the Filesystem to which the covered
			 * vnode belongs is undergoing forced unmounts. Since we
			 * hold a usecount, the vnode cannot be reused
			 * (it can, however, still be terminated).
			 */
			vnode_getalways(coveredvp);

			mount_dropcrossref(mp, coveredvp, 0);
			/*
			 * We'll _try_ to detect if this really needs to be
			 * done. The coveredvp can only be in termination (or
			 * terminated) if the coveredvp's mount point is in a
			 * forced unmount (or has been) since we still hold the
			 * ref.
			 */
			if (!vnode_isrecycled(coveredvp)) {
				pvp = vnode_getparent(coveredvp);
#if CONFIG_TRIGGERS
				if (coveredvp->v_resolve) {
					vnode_trigger_rearm(coveredvp, ctx);
				}
#endif
			}

			vnode_rele(coveredvp);
			vnode_put(coveredvp);
			coveredvp = NULLVP;

			if (pvp) {
				/* tell watchers the parent directory changed */
				lock_vnode_and_post(pvp, NOTE_WRITE);
				vnode_put(pvp);
			}
		} else if (mp->mnt_flag & MNT_ROOTFS) {
			/* root mount has no covered vnode; tear the mount down here */
			mount_lock_destroy(mp);
#if CONFIG_MACF
			mac_mount_label_destroy(mp);
#endif
			zfree(mount_zone, mp);
		} else {
			panic("dounmount: no coveredvp");
		}
	}
	return error;
}
2973
2974 /*
2975 * Unmount any mounts in this filesystem.
2976 */
void
dounmount_submounts(struct mount *mp, int flags, vfs_context_t ctx)
{
	mount_t smp;
	fsid_t *fsids, fsid;
	int fsids_sz;
	int count = 0, i, m = 0;	/* m indexes the last fsid collected */
	vnode_t vp;

	mount_list_lock();

	// Get an array to hold the submounts fsids.
	TAILQ_FOREACH(smp, &mountlist, mnt_list)
	count++;
	fsids_sz = count * sizeof(fsid_t);
	fsids = kalloc_data(fsids_sz, Z_NOWAIT);
	if (fsids == NULL) {
		mount_list_unlock();
		goto out;
	}
	fsids[0] = mp->mnt_vfsstat.f_fsid; // Prime the pump

	/*
	 * Fill the array with submount fsids.
	 * Since mounts are always added to the tail of the mount list, the
	 * list is always in mount order.
	 * For each mount check if the mounted-on vnode belongs to a
	 * mount that's already added to our array of mounts to be unmounted.
	 */
	for (smp = TAILQ_NEXT(mp, mnt_list); smp; smp = TAILQ_NEXT(smp, mnt_list)) {
		vp = smp->mnt_vnodecovered;
		if (vp == NULL) {
			continue;
		}
		fsid = vnode_mount(vp)->mnt_vfsstat.f_fsid; // Underlying fsid
		for (i = 0; i <= m; i++) {
			if (fsids[i].val[0] == fsid.val[0] &&
			    fsids[i].val[1] == fsid.val[1]) {
				/* covered vnode lives on a mount already in the set: smp is a (transitive) submount */
				fsids[++m] = smp->mnt_vfsstat.f_fsid;
				break;
			}
		}
	}
	mount_list_unlock();

	// Unmount the submounts in reverse order. Ignore errors.
	for (i = m; i > 0; i--) {
		smp = mount_list_lookupby_fsid(&fsids[i], 0, 1);
		if (smp) {
			/* trade iteration ref for a mount ref; dounmount consumes it */
			mount_ref(smp, 0);
			mount_iterdrop(smp);
			(void) dounmount(smp, flags, 1, ctx);
		}
	}
out:
	kfree_data(fsids, fsids_sz);
}
3034
/*
 * Drop one cross reference on mp held via covered vnode dp.  When the
 * crossref count reaches zero and dp no longer points at mp, the mount
 * structure is destroyed and freed.  If need_put is set, dp's iocount is
 * also released (while its lock is still held).
 */
void
mount_dropcrossref(mount_t mp, vnode_t dp, int need_put)
{
	vnode_hold(dp);
	vnode_lock(dp);
	mp->mnt_crossref--;

	if (mp->mnt_crossref < 0) {
		panic("mount cross refs -ve");
	}

	if ((mp != dp->v_mountedhere) && (mp->mnt_crossref == 0)) {
		/* last crossref and mount detached from dp: free the mount */
		if (need_put) {
			vnode_put_locked(dp);
		}
		vnode_drop_and_unlock(dp);

		mount_lock_destroy(mp);
#if CONFIG_MACF
		mac_mount_label_destroy(mp);
#endif
		zfree(mount_zone, mp);
		return;
	}
	if (need_put) {
		vnode_put_locked(dp);
	}
	vnode_drop_and_unlock(dp);
}
3064
3065
3066 /*
3067 * Sync each mounted filesystem.
3068 */
#if DIAGNOSTIC
int syncprt = 0;	/* when set, sync() dumps buffer statistics via vfs_bufstats() */
#endif

int print_vmpage_stat = 0;	/* when set, sync() reports dirty VM pages via vm_countdirtypages() */
3074
3075 /*
3076 * sync_callback: simple wrapper that calls VFS_SYNC() on volumes
3077 * mounted read-write with the passed waitfor value.
3078 *
3079 * Parameters: mp mount-point descriptor per mounted file-system instance.
3080 * arg user argument (please see below)
3081 *
3082 * User argument is a pointer to 32 bit unsigned integer which describes the
3083 * type of waitfor value to set for calling VFS_SYNC(). If user argument is
3084 * passed as NULL, VFS_SYNC() is called with MNT_NOWAIT set as the default
3085 * waitfor value.
3086 *
3087 * Returns: VFS_RETURNED
3088 */
static int
sync_callback(mount_t mp, void *arg)
{
	/* Read-only mounts have nothing to flush */
	if ((mp->mnt_flag & MNT_RDONLY) == 0) {
		int asyncflag = mp->mnt_flag & MNT_ASYNC;	/* remember so we can restore it */
		unsigned waitfor = MNT_NOWAIT;

		if (arg) {
			waitfor = *(uint32_t*)arg;
		}

		/* Sanity check for flags - these are the only valid combinations for the flag bits*/
		if (waitfor != MNT_WAIT &&
		    waitfor != (MNT_WAIT | MNT_VOLUME) &&
		    waitfor != MNT_NOWAIT &&
		    waitfor != (MNT_NOWAIT | MNT_VOLUME) &&
		    waitfor != MNT_DWAIT &&
		    waitfor != (MNT_DWAIT | MNT_VOLUME)) {
			panic("Passed inappropriate waitfor %u to "
			    "sync_callback()", waitfor);
		}

		/* Temporarily clear MNT_ASYNC for the duration of the sync, then restore */
		mp->mnt_flag &= ~MNT_ASYNC;
		(void)VFS_SYNC(mp, waitfor, vfs_context_kernel());
		if (asyncflag) {
			mp->mnt_flag |= MNT_ASYNC;
		}
	}

	return VFS_RETURNED;
}
3120
3121 /* ARGSUSED */
/*
 * sync() system call: sync every read-write mounted filesystem
 * (non-blocking iteration; NULL arg means MNT_NOWAIT in sync_callback).
 */
int
sync(__unused proc_t p, __unused struct sync_args *uap, __unused int32_t *retval)
{
	vfs_iterate(LK_NOWAIT, sync_callback, NULL);

	if (print_vmpage_stat) {
		vm_countdirtypages();
	}

#if DIAGNOSTIC
	if (syncprt) {
		vfs_bufstats();
	}
#endif /* DIAGNOSTIC */
	return 0;
}
3138
/* Selector for which class of media sync_internal_callback() should sync */
typedef enum {
	SYNC_ALL = 0,			/* sync every mount */
	SYNC_ONLY_RELIABLE_MEDIA = 1,	/* local, non-virtual-device mounts only */
	SYNC_ONLY_UNRELIABLE_MEDIA = 2	/* virtual-device or non-local mounts only */
} sync_type_t;
3144
3145 static int
sync_internal_callback(mount_t mp,void * arg)3146 sync_internal_callback(mount_t mp, void *arg)
3147 {
3148 if (arg) {
3149 int is_reliable = !(mp->mnt_kern_flag & MNTK_VIRTUALDEV) &&
3150 (mp->mnt_flag & MNT_LOCAL);
3151 sync_type_t sync_type = *((sync_type_t *)arg);
3152
3153 if ((sync_type == SYNC_ONLY_RELIABLE_MEDIA) && !is_reliable) {
3154 return VFS_RETURNED;
3155 } else if ((sync_type == SYNC_ONLY_UNRELIABLE_MEDIA) && is_reliable) {
3156 return VFS_RETURNED;
3157 }
3158 }
3159
3160 (void)sync_callback(mp, NULL);
3161
3162 return VFS_RETURNED;
3163 }
3164
int sync_thread_state = 0;	/* SYNC_THREAD_* bits; protected by sync_mtx_lck */
int sync_timeout_seconds = 5;	/* how long sync_internal() waits for the sync thread */

#define SYNC_THREAD_RUN 0x0001		/* a sync pass has been requested */
#define SYNC_THREAD_RUNNING 0x0002	/* a sync thread currently exists */

#if CONFIG_PHYS_WRITE_ACCT
thread_t pm_sync_thread;	/* identity of the running sync thread, for write accounting */
#endif /* CONFIG_PHYS_WRITE_ACCT */
3174
/*
 * Body of the kernel sync thread started by sync_internal().  Loops while
 * new passes are requested (SYNC_THREAD_RUN), each pass syncing reliable
 * media first, then unreliable media.  Exits after waking any waiter in
 * sync_internal() and clearing SYNC_THREAD_RUNNING.
 */
static void
sync_thread(__unused void *arg, __unused wait_result_t wr)
{
	sync_type_t sync_type;
#if CONFIG_PHYS_WRITE_ACCT
	pm_sync_thread = current_thread();
#endif /* CONFIG_PHYS_WRITE_ACCT */

	lck_mtx_lock(&sync_mtx_lck);
	while (sync_thread_state & SYNC_THREAD_RUN) {
		/* consume the request; iterate without holding the mutex */
		sync_thread_state &= ~SYNC_THREAD_RUN;
		lck_mtx_unlock(&sync_mtx_lck);

		sync_type = SYNC_ONLY_RELIABLE_MEDIA;
		vfs_iterate(LK_NOWAIT, sync_internal_callback, &sync_type);
		sync_type = SYNC_ONLY_UNRELIABLE_MEDIA;
		vfs_iterate(LK_NOWAIT, sync_internal_callback, &sync_type);

		lck_mtx_lock(&sync_mtx_lck);
	}
	/*
	 * This wakeup _has_ to be issued before the lock is released otherwise
	 * we may end up waking up a thread in sync_internal which is
	 * expecting a wakeup from a thread it just created and not from this
	 * thread which is about to exit.
	 */
	wakeup(&sync_thread_state);
	sync_thread_state &= ~SYNC_THREAD_RUNNING;
#if CONFIG_PHYS_WRITE_ACCT
	pm_sync_thread = NULL;
#endif /* CONFIG_PHYS_WRITE_ACCT */
	lck_mtx_unlock(&sync_mtx_lck);

	if (print_vmpage_stat) {
		vm_countdirtypages();
	}

#if DIAGNOSTIC
	if (syncprt) {
		vfs_bufstats();
	}
#endif /* DIAGNOSTIC */
}
3218
/* Last time a sync timeout was logged; rate-limits the message to one per 120s. */
struct timeval sync_timeout_last_print = {.tv_sec = 0, .tv_usec = 0};
3220
3221 /*
3222 * An in-kernel sync for power management to call.
3223 * This function always returns within sync_timeout seconds.
3224 */
3225 __private_extern__ int
sync_internal(void)3226 sync_internal(void)
3227 {
3228 thread_t thd = NULL;
3229 int error;
3230 int thread_created = FALSE;
3231 struct timespec ts = {.tv_sec = sync_timeout_seconds, .tv_nsec = 0};
3232
3233 lck_mtx_lock(&sync_mtx_lck);
3234 sync_thread_state |= SYNC_THREAD_RUN;
3235 if (!(sync_thread_state & SYNC_THREAD_RUNNING)) {
3236 int kr;
3237
3238 sync_thread_state |= SYNC_THREAD_RUNNING;
3239 kr = kernel_thread_start(sync_thread, NULL, &thd);
3240 if (kr != KERN_SUCCESS) {
3241 sync_thread_state &= ~SYNC_THREAD_RUNNING;
3242 lck_mtx_unlock(&sync_mtx_lck);
3243 printf("sync_thread failed\n");
3244 return 0;
3245 }
3246 thread_created = TRUE;
3247 }
3248
3249 error = msleep((caddr_t)&sync_thread_state, &sync_mtx_lck,
3250 (PVFS | PDROP | PCATCH), "sync_thread", &ts);
3251 if (error) {
3252 struct timeval now;
3253
3254 microtime(&now);
3255 if (now.tv_sec - sync_timeout_last_print.tv_sec > 120) {
3256 printf("sync timed out: %d sec\n", sync_timeout_seconds);
3257 sync_timeout_last_print.tv_sec = now.tv_sec;
3258 }
3259 }
3260
3261 if (thread_created) {
3262 thread_deallocate(thd);
3263 }
3264
3265 return 0;
3266 } /* end of sync_internal call */
3267
3268 /*
3269 * Change filesystem quotas.
3270 */
3271 #if QUOTA
3272 int
quotactl(proc_t p,struct quotactl_args * uap,__unused int32_t * retval)3273 quotactl(proc_t p, struct quotactl_args *uap, __unused int32_t *retval)
3274 {
3275 struct mount *mp;
3276 int error, quota_cmd, quota_status = 0;
3277 caddr_t datap;
3278 size_t fnamelen;
3279 struct nameidata nd;
3280 vfs_context_t ctx = vfs_context_current();
3281 struct dqblk my_dqblk = {};
3282
3283 AUDIT_ARG(uid, uap->uid);
3284 AUDIT_ARG(cmd, uap->cmd);
3285 NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
3286 uap->path, ctx);
3287 error = namei(&nd);
3288 if (error) {
3289 return error;
3290 }
3291 mp = nd.ni_vp->v_mount;
3292 mount_ref(mp, 0);
3293 vnode_put(nd.ni_vp);
3294 nameidone(&nd);
3295
3296 #if CONFIG_MACF
3297 error = mac_mount_check_quotactl(ctx, mp, uap->cmd, uap->uid);
3298 if (error != 0) {
3299 goto out;
3300 }
3301 #endif
3302
3303 /* copyin any data we will need for downstream code */
3304 quota_cmd = uap->cmd >> SUBCMDSHIFT;
3305
3306 switch (quota_cmd) {
3307 case Q_QUOTAON:
3308 /* uap->arg specifies a file from which to take the quotas */
3309 fnamelen = MAXPATHLEN;
3310 datap = zalloc(ZV_NAMEI);
3311 error = copyinstr(uap->arg, datap, MAXPATHLEN, &fnamelen);
3312 break;
3313 case Q_GETQUOTA:
3314 /* uap->arg is a pointer to a dqblk structure. */
3315 datap = (caddr_t) &my_dqblk;
3316 break;
3317 case Q_SETQUOTA:
3318 case Q_SETUSE:
3319 /* uap->arg is a pointer to a dqblk structure. */
3320 datap = (caddr_t) &my_dqblk;
3321 if (proc_is64bit(p)) {
3322 struct user_dqblk my_dqblk64;
3323 error = copyin(uap->arg, (caddr_t)&my_dqblk64, sizeof(my_dqblk64));
3324 if (error == 0) {
3325 munge_dqblk(&my_dqblk, &my_dqblk64, FALSE);
3326 }
3327 } else {
3328 error = copyin(uap->arg, (caddr_t)&my_dqblk, sizeof(my_dqblk));
3329 }
3330 break;
3331 case Q_QUOTASTAT:
3332 /* uap->arg is a pointer to an integer */
3333 datap = (caddr_t) "a_status;
3334 break;
3335 default:
3336 datap = NULL;
3337 break;
3338 } /* switch */
3339
3340 if (error == 0) {
3341 error = VFS_QUOTACTL(mp, uap->cmd, uap->uid, datap, ctx);
3342 }
3343
3344 switch (quota_cmd) {
3345 case Q_QUOTAON:
3346 if (datap != NULL) {
3347 zfree(ZV_NAMEI, datap);
3348 }
3349 break;
3350 case Q_GETQUOTA:
3351 /* uap->arg is a pointer to a dqblk structure we need to copy out to */
3352 if (error == 0) {
3353 if (proc_is64bit(p)) {
3354 struct user_dqblk my_dqblk64;
3355
3356 memset(&my_dqblk64, 0, sizeof(my_dqblk64));
3357 munge_dqblk(&my_dqblk, &my_dqblk64, TRUE);
3358 error = copyout((caddr_t)&my_dqblk64, uap->arg, sizeof(my_dqblk64));
3359 } else {
3360 error = copyout(datap, uap->arg, sizeof(struct dqblk));
3361 }
3362 }
3363 break;
3364 case Q_QUOTASTAT:
3365 /* uap->arg is a pointer to an integer */
3366 if (error == 0) {
3367 error = copyout(datap, uap->arg, sizeof(quota_status));
3368 }
3369 break;
3370 default:
3371 break;
3372 } /* switch */
3373
3374 out:
3375 mount_drop(mp, 0);
3376 return error;
3377 }
3378 #else
/* Quota support not compiled in: reject all quotactl(2) requests. */
int
quotactl(__unused proc_t p, __unused struct quotactl_args *uap, __unused int32_t *retval)
{
	return EOPNOTSUPP;
}
3384 #endif /* QUOTA */
3385
3386 static int
statfs_internal(proc_t p,struct mount * mp,user_addr_t bufp)3387 statfs_internal(proc_t p, struct mount *mp, user_addr_t bufp)
3388 {
3389 int error;
3390 vfs_context_t ctx = vfs_context_current();
3391
3392 #if CONFIG_MACF
3393 error = mac_mount_check_stat(ctx, mp);
3394 if (error != 0) {
3395 return error;
3396 }
3397 #endif
3398
3399 error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT);
3400 if (error != 0) {
3401 return error;
3402 }
3403
3404 return munge_statfs(mp, &mp->mnt_vfsstat, bufp, NULL, IS_64BIT_PROCESS(p), TRUE);
3405 }
3406
3407 /*
3408 * Get filesystem statistics.
3409 *
3410 * Returns: 0 Success
3411 * namei:???
3412 * vfs_update_vfsstat:???
3413 * munge_statfs:EFAULT
3414 */
3415 /* ARGSUSED */
3416 int
statfs(proc_t p,struct statfs_args * uap,__unused int32_t * retval)3417 statfs(proc_t p, struct statfs_args *uap, __unused int32_t *retval)
3418 {
3419 int error;
3420 struct mount *mp;
3421 struct nameidata nd;
3422 vfs_context_t ctx = vfs_context_current();
3423 vnode_t vp;
3424
3425 NDINIT(&nd, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
3426 UIO_USERSPACE, uap->path, ctx);
3427 error = namei(&nd);
3428 if (error != 0) {
3429 return error;
3430 }
3431 vp = nd.ni_vp;
3432 mp = vp->v_mount;
3433 nameidone(&nd);
3434
3435 error = statfs_internal(p, mp, uap->buf);
3436 vnode_put(vp);
3437
3438 return error;
3439 }
3440
3441 /*
3442 * Get filesystem statistics.
3443 */
3444 /* ARGSUSED */
3445 int
fstatfs(proc_t p,struct fstatfs_args * uap,__unused int32_t * retval)3446 fstatfs(proc_t p, struct fstatfs_args *uap, __unused int32_t *retval)
3447 {
3448 int error;
3449 vnode_t vp = NULL;
3450 struct mount *mp;
3451
3452 AUDIT_ARG(fd, uap->fd);
3453
3454 if ((error = file_vnode(uap->fd, &vp)) ||
3455 (error = vnode_getwithref(vp))) {
3456 goto out;
3457 }
3458
3459 AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
3460
3461 mp = vp->v_mount;
3462 if (!mp) {
3463 error = EBADF;
3464 goto out_vnode;
3465 }
3466
3467 error = statfs_internal(p, mp, uap->buf);
3468
3469 out_vnode:
3470 vnode_put(vp);
3471
3472 out:
3473 if (vp != NULL) {
3474 file_drop(uap->fd);
3475 }
3476
3477 return error;
3478 }
3479
3480 void
vfs_get_statfs64(struct mount * mp,struct statfs64 * sfs)3481 vfs_get_statfs64(struct mount *mp, struct statfs64 *sfs)
3482 {
3483 struct vfsstatfs *vsfs = &mp->mnt_vfsstat;
3484
3485 bzero(sfs, sizeof(*sfs));
3486
3487 sfs->f_bsize = vsfs->f_bsize;
3488 sfs->f_iosize = (int32_t)vsfs->f_iosize;
3489 sfs->f_blocks = vsfs->f_blocks;
3490 sfs->f_bfree = vsfs->f_bfree;
3491 sfs->f_bavail = vsfs->f_bavail;
3492 sfs->f_files = vsfs->f_files;
3493 sfs->f_ffree = vsfs->f_ffree;
3494 sfs->f_fsid = vsfs->f_fsid;
3495 sfs->f_owner = vsfs->f_owner;
3496 sfs->f_type = mp->mnt_vtable->vfc_typenum;
3497 sfs->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
3498 sfs->f_fssubtype = vsfs->f_fssubtype;
3499 sfs->f_flags_ext = (mp->mnt_kern_flag & MNTK_SYSTEMDATA) ? MNT_EXT_ROOT_DATA_VOL : 0;
3500 if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
3501 strlcpy(&sfs->f_fstypename[0], &mp->fstypename_override[0], MFSTYPENAMELEN);
3502 } else {
3503 strlcpy(&sfs->f_fstypename[0], &vsfs->f_fstypename[0], MFSTYPENAMELEN);
3504 }
3505 strlcpy(&sfs->f_mntonname[0], &vsfs->f_mntonname[0], MAXPATHLEN);
3506 strlcpy(&sfs->f_mntfromname[0], &vsfs->f_mntfromname[0], MAXPATHLEN);
3507 }
3508
3509 /*
3510 * Get file system statistics in 64-bit mode
3511 */
3512 int
statfs64(__unused struct proc * p,struct statfs64_args * uap,__unused int32_t * retval)3513 statfs64(__unused struct proc *p, struct statfs64_args *uap, __unused int32_t *retval)
3514 {
3515 struct mount *mp;
3516 int error;
3517 struct nameidata *ndp;
3518 struct statfs64 *sfsp;
3519 vfs_context_t ctxp = vfs_context_current();
3520 vnode_t vp;
3521 struct {
3522 struct nameidata nd;
3523 struct statfs64 sfs;
3524 } *__nameidata_statfs64;
3525
3526 __nameidata_statfs64 = kalloc_type(typeof(*__nameidata_statfs64),
3527 Z_WAITOK);
3528 ndp = &__nameidata_statfs64->nd;
3529
3530 NDINIT(ndp, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
3531 UIO_USERSPACE, uap->path, ctxp);
3532 error = namei(ndp);
3533 if (error != 0) {
3534 goto out;
3535 }
3536 vp = ndp->ni_vp;
3537 mp = vp->v_mount;
3538 nameidone(ndp);
3539
3540 #if CONFIG_MACF
3541 error = mac_mount_check_stat(ctxp, mp);
3542 if (error != 0) {
3543 vnode_put(vp);
3544 goto out;
3545 }
3546 #endif
3547
3548 error = vfs_update_vfsstat(mp, ctxp, VFS_USER_EVENT);
3549 if (error != 0) {
3550 vnode_put(vp);
3551 goto out;
3552 }
3553
3554 sfsp = &__nameidata_statfs64->sfs;
3555 vfs_get_statfs64(mp, sfsp);
3556 if ((mp->mnt_kern_flag & MNTK_SYSTEMDATA) &&
3557 (p->p_vfs_iopolicy & P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME)) {
3558 /* This process does not want to see a seperate data volume mountpoint */
3559 strlcpy(&sfsp->f_mntonname[0], "/", sizeof("/"));
3560 }
3561 error = copyout(sfsp, uap->buf, sizeof(*sfsp));
3562 vnode_put(vp);
3563
3564 out:
3565 kfree_type(typeof(*__nameidata_statfs64), __nameidata_statfs64);
3566
3567 return error;
3568 }
3569
3570 /*
3571 * Get file system statistics in 64-bit mode
3572 */
3573 int
fstatfs64(__unused struct proc * p,struct fstatfs64_args * uap,__unused int32_t * retval)3574 fstatfs64(__unused struct proc *p, struct fstatfs64_args *uap, __unused int32_t *retval)
3575 {
3576 struct vnode *vp;
3577 struct mount *mp;
3578 struct statfs64 sfs;
3579 int error;
3580
3581 AUDIT_ARG(fd, uap->fd);
3582
3583 if ((error = file_vnode(uap->fd, &vp))) {
3584 return error;
3585 }
3586
3587 error = vnode_getwithref(vp);
3588 if (error) {
3589 file_drop(uap->fd);
3590 return error;
3591 }
3592
3593 AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
3594
3595 mp = vp->v_mount;
3596 if (!mp) {
3597 error = EBADF;
3598 goto out;
3599 }
3600
3601 #if CONFIG_MACF
3602 error = mac_mount_check_stat(vfs_context_current(), mp);
3603 if (error != 0) {
3604 goto out;
3605 }
3606 #endif
3607
3608 if ((error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)) != 0) {
3609 goto out;
3610 }
3611
3612 vfs_get_statfs64(mp, &sfs);
3613 if ((mp->mnt_kern_flag & MNTK_SYSTEMDATA) &&
3614 (p->p_vfs_iopolicy & P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME)) {
3615 /* This process does not want to see a seperate data volume mountpoint */
3616 strlcpy(&sfs.f_mntonname[0], "/", sizeof("/"));
3617 }
3618 error = copyout(&sfs, uap->buf, sizeof(sfs));
3619
3620 out:
3621 file_drop(uap->fd);
3622 vnode_put(vp);
3623
3624 return error;
3625 }
3626
/*
 * Shared iteration state for the getfsstat family; one instance is
 * threaded through vfs_iterate() as the callback argument.
 */
struct getfsstat_struct {
	user_addr_t sfsp;   /* user buffer cursor; advanced after each copyout */
	user_addr_t *mp;    /* optional array of user MAC-label pointers, or NULL */
	int count;          /* number of mounts visited so far */
	int maxcount;       /* capacity of the user buffer, in entries */
	int flags;          /* caller's MNT_NOWAIT / MNT_WAIT / MNT_DWAIT flags */
	int error;          /* first error encountered, if any */
};
3635
3636
3637 static int
getfsstat_callback(mount_t mp,void * arg)3638 getfsstat_callback(mount_t mp, void * arg)
3639 {
3640 struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
3641 struct vfsstatfs *sp;
3642 int error, my_size;
3643 vfs_context_t ctx = vfs_context_current();
3644
3645 if (fstp->sfsp && fstp->count < fstp->maxcount) {
3646 #if CONFIG_MACF
3647 error = mac_mount_check_stat(ctx, mp);
3648 if (error != 0) {
3649 fstp->error = error;
3650 return VFS_RETURNED_DONE;
3651 }
3652 #endif
3653 sp = &mp->mnt_vfsstat;
3654 /*
3655 * If MNT_NOWAIT is specified, do not refresh the
3656 * fsstat cache. MNT_WAIT/MNT_DWAIT overrides MNT_NOWAIT.
3657 */
3658 if ((mp->mnt_lflag & MNT_LDEAD) ||
3659 (((fstp->flags & MNT_NOWAIT) == 0 || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
3660 (!(mp->mnt_lflag & MNT_LUNMOUNT)) &&
3661 (error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT)))) {
3662 KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
3663 return VFS_RETURNED;
3664 }
3665
3666 /*
3667 * Need to handle LP64 version of struct statfs
3668 */
3669 error = munge_statfs(mp, sp, fstp->sfsp, &my_size, IS_64BIT_PROCESS(vfs_context_proc(ctx)), FALSE);
3670 if (error) {
3671 fstp->error = error;
3672 return VFS_RETURNED_DONE;
3673 }
3674 fstp->sfsp += my_size;
3675
3676 if (fstp->mp) {
3677 #if CONFIG_MACF
3678 error = mac_mount_label_get(mp, *fstp->mp);
3679 if (error) {
3680 fstp->error = error;
3681 return VFS_RETURNED_DONE;
3682 }
3683 #endif
3684 fstp->mp++;
3685 }
3686 }
3687 fstp->count++;
3688 return VFS_RETURNED;
3689 }
3690
3691 /*
3692 * Get statistics on all filesystems.
3693 */
3694 int
getfsstat(__unused proc_t p,struct getfsstat_args * uap,int * retval)3695 getfsstat(__unused proc_t p, struct getfsstat_args *uap, int *retval)
3696 {
3697 struct __mac_getfsstat_args muap;
3698
3699 muap.buf = uap->buf;
3700 muap.bufsize = uap->bufsize;
3701 muap.mac = USER_ADDR_NULL;
3702 muap.macsize = 0;
3703 muap.flags = uap->flags;
3704
3705 return __mac_getfsstat(p, &muap, retval);
3706 }
3707
3708 /*
3709 * __mac_getfsstat: Get MAC-related file system statistics
3710 *
3711 * Parameters: p (ignored)
3712 * uap User argument descriptor (see below)
3713 * retval Count of file system statistics (N stats)
3714 *
3715 * Indirect: uap->bufsize Buffer size
3716 * uap->macsize MAC info size
3717 * uap->buf Buffer where information will be returned
3718 * uap->mac MAC info
3719 * uap->flags File system flags
3720 *
3721 *
3722 * Returns: 0 Success
3723 * !0 Not success
3724 *
3725 */
int
__mac_getfsstat(__unused proc_t p, struct __mac_getfsstat_args *uap, int *retval)
{
	user_addr_t sfsp;
	user_addr_t *mp;
	size_t count, maxcount, bufsize, macsize;
	struct getfsstat_struct fst;

	/* Reject sizes that would overflow the int fields used below. */
	if ((unsigned)uap->bufsize > INT_MAX || (unsigned)uap->macsize > INT_MAX) {
		return EINVAL;
	}

	bufsize = (size_t) uap->bufsize;
	macsize = (size_t) uap->macsize;

	/* Entry size in the user buffer depends on the caller's ABI. */
	if (IS_64BIT_PROCESS(p)) {
		maxcount = bufsize / sizeof(struct user64_statfs);
	} else {
		maxcount = bufsize / sizeof(struct user32_statfs);
	}
	sfsp = uap->buf;
	count = 0;

	mp = NULL;

#if CONFIG_MACF
	if (uap->mac != USER_ADDR_NULL) {
		u_int32_t *mp0;
		int error;
		unsigned int i;

		/* The MAC label array must have exactly one slot per statfs slot. */
		count = (macsize / (IS_64BIT_PROCESS(p) ? 8 : 4));
		if (count != maxcount) {
			return EINVAL;
		}

		/* Copy in the array */
		mp0 = kalloc_data(macsize, Z_WAITOK);
		if (mp0 == NULL) {
			return ENOMEM;
		}

		error = copyin(uap->mac, mp0, macsize);
		if (error) {
			kfree_data(mp0, macsize);
			return error;
		}

		/* Normalize to an array of user_addr_t */
		mp = kalloc_data(count * sizeof(user_addr_t), Z_WAITOK);
		if (mp == NULL) {
			kfree_data(mp0, macsize);
			return ENOMEM;
		}

		/* Widen 32-bit user pointers; 64-bit entries copy through as-is. */
		for (i = 0; i < count; i++) {
			if (IS_64BIT_PROCESS(p)) {
				mp[i] = ((user_addr_t *)mp0)[i];
			} else {
				mp[i] = (user_addr_t)mp0[i];
			}
		}
		kfree_data(mp0, macsize);
	}
#endif


	fst.sfsp = sfsp;
	fst.mp = mp;
	fst.flags = uap->flags;
	fst.count = 0;
	fst.error = 0;
	fst.maxcount = (int)maxcount;


	vfs_iterate(VFS_ITERATE_NOSKIP_UNMOUNT, getfsstat_callback, &fst);

	/* 'count' still holds the label-array length when mp was allocated. */
	if (mp) {
		kfree_data(mp, count * sizeof(user_addr_t));
	}

	if (fst.error) {
		KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
		return fst.error;
	}

	/* If the buffer filled, report capacity; otherwise the mounts visited. */
	if (fst.sfsp && fst.count > fst.maxcount) {
		*retval = fst.maxcount;
	} else {
		*retval = fst.count;
	}
	return 0;
}
3819
/*
 * vfs_iterate() callback for getfsstat64(): copy one mount's statfs64
 * into the user buffer tracked by the getfsstat_struct argument.
 * Always counts the mount, even when the buffer is full.
 */
static int
getfsstat64_callback(mount_t mp, void * arg)
{
	struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
	struct vfsstatfs *sp;
	struct statfs64 sfs;
	int error;

	if (fstp->sfsp && fstp->count < fstp->maxcount) {
#if CONFIG_MACF
		error = mac_mount_check_stat(vfs_context_current(), mp);
		if (error != 0) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
#endif
		sp = &mp->mnt_vfsstat;
		/*
		 * If MNT_NOWAIT is specified, do not refresh the fsstat
		 * cache. MNT_WAIT overrides MNT_NOWAIT.
		 *
		 * We treat MNT_DWAIT as MNT_WAIT for all instances of
		 * getfsstat, since the constants are out of the same
		 * namespace.
		 */
		/* Dead mounts, and live mounts whose refresh fails, are skipped. */
		if ((mp->mnt_lflag & MNT_LDEAD) ||
		    ((((fstp->flags & MNT_NOWAIT) == 0) || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
		    (!(mp->mnt_lflag & MNT_LUNMOUNT)) &&
		    (error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)))) {
			KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
			return VFS_RETURNED;
		}

		vfs_get_statfs64(mp, &sfs);
		error = copyout(&sfs, fstp->sfsp, sizeof(sfs));
		if (error) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
		fstp->sfsp += sizeof(sfs);
	}
	fstp->count++;
	return VFS_RETURNED;
}
3864
3865 /*
3866 * Get statistics on all file systems in 64 bit mode.
3867 */
3868 int
getfsstat64(__unused proc_t p,struct getfsstat64_args * uap,int * retval)3869 getfsstat64(__unused proc_t p, struct getfsstat64_args *uap, int *retval)
3870 {
3871 user_addr_t sfsp;
3872 int count, maxcount;
3873 struct getfsstat_struct fst;
3874
3875 maxcount = uap->bufsize / sizeof(struct statfs64);
3876
3877 sfsp = uap->buf;
3878 count = 0;
3879
3880 fst.sfsp = sfsp;
3881 fst.flags = uap->flags;
3882 fst.count = 0;
3883 fst.error = 0;
3884 fst.maxcount = maxcount;
3885
3886 vfs_iterate(VFS_ITERATE_NOSKIP_UNMOUNT, getfsstat64_callback, &fst);
3887
3888 if (fst.error) {
3889 KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
3890 return fst.error;
3891 }
3892
3893 if (fst.sfsp && fst.count > fst.maxcount) {
3894 *retval = fst.maxcount;
3895 } else {
3896 *retval = fst.count;
3897 }
3898
3899 return 0;
3900 }
3901
3902 /*
3903 * gets the associated vnode with the file descriptor passed.
3904 * as input
3905 *
3906 * INPUT
3907 * ctx - vfs context of caller
3908 * fd - file descriptor for which vnode is required.
3909 * vpp - Pointer to pointer to vnode to be returned.
3910 *
3911 * The vnode is returned with an iocount so any vnode obtained
3912 * by this call needs a vnode_put
3913 *
3914 */
3915 int
vnode_getfromfd(vfs_context_t ctx,int fd,vnode_t * vpp)3916 vnode_getfromfd(vfs_context_t ctx, int fd, vnode_t *vpp)
3917 {
3918 int error;
3919 vnode_t vp;
3920 struct fileproc *fp;
3921 proc_t p = vfs_context_proc(ctx);
3922
3923 *vpp = NULLVP;
3924
3925 error = fp_getfvp(p, fd, &fp, &vp);
3926 if (error) {
3927 return error;
3928 }
3929
3930 error = vnode_getwithref(vp);
3931 if (error) {
3932 (void)fp_drop(p, fd, fp, 0);
3933 return error;
3934 }
3935
3936 (void)fp_drop(p, fd, fp, 0);
3937 *vpp = vp;
3938 return error;
3939 }
3940
3941 /*
3942 * Wrapper function around namei to start lookup from a directory
3943 * specified by a file descriptor ni_dirfd.
3944 *
3945 * In addition to all the errors returned by namei, this call can
3946 * return ENOTDIR if the file descriptor does not refer to a directory.
3947 * and EBADF if the file descriptor is not valid.
3948 */
int
nameiat(struct nameidata *ndp, int dirfd)
{
	/*
	 * Only anchor the lookup at dirfd when the caller gave a real fd,
	 * this is not a continued lookup, and no starting dvp was supplied.
	 */
	if ((dirfd != AT_FDCWD) &&
	    !(ndp->ni_flag & NAMEI_CONTLOOKUP) &&
	    !(ndp->ni_cnd.cn_flags & USEDVP)) {
		int error = 0;
		char c;

		/* Peek at the first path byte to detect an absolute path. */
		if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
			error = copyin(ndp->ni_dirp, &c, sizeof(char));
			if (error) {
				return error;
			}
		} else {
			c = *((char *)(ndp->ni_dirp));
		}

		/* Absolute paths ignore dirfd and fall through to plain namei. */
		if (c != '/') {
			vnode_t dvp_at;

			error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
			    &dvp_at);
			if (error) {
				return error;
			}

			if (vnode_vtype(dvp_at) != VDIR) {
				vnode_put(dvp_at);
				return ENOTDIR;
			}

			/*
			 * USEDVP tells namei to start from ni_dvp; it is
			 * cleared again so the nameidata can be reused.
			 */
			ndp->ni_dvp = dvp_at;
			ndp->ni_cnd.cn_flags |= USEDVP;
			error = namei(ndp);
			ndp->ni_cnd.cn_flags &= ~USEDVP;
			vnode_put(dvp_at);
			return error;
		}
	}

	return namei(ndp);
}
3992
3993 /*
3994 * Change current working directory to a given file descriptor.
3995 */
3996 /* ARGSUSED */
static int
common_fchdir(proc_t p, struct fchdir_args *uap, int per_thread)
{
	vnode_t vp;
	vnode_t tdp;
	vnode_t tvp;
	struct mount *mp;
	int error, should_put = 1;
	vfs_context_t ctx = vfs_context_current();

	AUDIT_ARG(fd, uap->fd);
	if (per_thread && uap->fd == -1) {
		/*
		 * Switching back from per-thread to per process CWD; verify we
		 * in fact have one before proceeding. The only success case
		 * for this code path is to return 0 preemptively after zapping
		 * the thread structure contents.
		 */
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = NULLVP;
			if (tvp != NULLVP) {
				vnode_rele(tvp);
				return 0;
			}
		}
		return EBADF;
	}

	/* Map the fd to a vnode and take an iocount on it. */
	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	if (vp->v_type != VDIR) {
		error = ENOTDIR;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_chdir(ctx, vp);
	if (error) {
		goto out;
	}
#endif
	/* The caller needs search permission on the new directory. */
	error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
	if (error) {
		goto out;
	}

	/*
	 * If the directory is a mount point, descend to the root of the
	 * covering filesystem (repeatedly, for stacked mounts), swapping
	 * our iocount to the new vnode each time.
	 */
	while (!error && (mp = vp->v_mountedhere) != NULL) {
		if (vfs_busy(mp, LK_NOWAIT)) {
			error = EACCES;
			goto out;
		}
		error = VFS_ROOT(mp, &tdp, ctx);
		vfs_unbusy(mp);
		if (error) {
			break;
		}
		vnode_put(vp);
		vp = tdp;
	}
	if (error) {
		goto out;
	}
	/* Convert the transient iocount into a persistent usecount. */
	if ((error = vnode_ref(vp))) {
		goto out;
	}
	vnode_put(vp);
	should_put = 0;

	if (per_thread) {
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = vp;
			OSBitOrAtomic(P_THCWD, &p->p_flag);
		} else {
			vnode_rele(vp);
			error = ENOENT;
			goto out;
		}
	} else {
		/* Lock order: dirs lock before fdlock; see chroot() comment. */
		proc_dirs_lock_exclusive(p);
		proc_fdlock(p);
		tvp = p->p_fd.fd_cdir;
		p->p_fd.fd_cdir = vp;
		proc_fdunlock(p);
		proc_dirs_unlock_exclusive(p);
	}

	/* Release the usecount held by the previous cwd, if any. */
	if (tvp) {
		vnode_rele(tvp);
	}

out:
	if (should_put) {
		vnode_put(vp);
	}
	file_drop(uap->fd);

	return error;
}
4109
/* fchdir(2): change the process-wide working directory to fd's directory. */
int
fchdir(proc_t p, struct fchdir_args *uap, __unused int32_t *retval)
{
	return common_fchdir(p, uap, 0);
}
4115
/* Per-thread variant of fchdir(2); fd == -1 reverts to the process cwd. */
int
__pthread_fchdir(proc_t p, struct __pthread_fchdir_args *uap, __unused int32_t *retval)
{
	return common_fchdir(p, (void *)uap, 1);
}
4121
4122
4123 /*
4124 * Change current working directory (".").
4125 *
4126 * Returns: 0 Success
4127 * change_dir:ENOTDIR
4128 * change_dir:???
4129 * vnode_ref:ENOENT No such file or directory
4130 */
4131 /* ARGSUSED */
int
chdir_internal(proc_t p, vfs_context_t ctx, struct nameidata *ndp, int per_thread)
{
	int error;
	vnode_t tvp;

	/* Resolve and validate the target directory (takes an iocount). */
	error = change_dir(ndp, ctx);
	if (error) {
		return error;
	}
	/* Convert the iocount into a persistent usecount for the cwd. */
	if ((error = vnode_ref(ndp->ni_vp))) {
		vnode_put(ndp->ni_vp);
		return error;
	}
	/*
	 * drop the iocount we picked up in change_dir
	 */
	vnode_put(ndp->ni_vp);

	if (per_thread) {
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = ndp->ni_vp;
			OSBitOrAtomic(P_THCWD, &p->p_flag);
		} else {
			vnode_rele(ndp->ni_vp);
			return ENOENT;
		}
	} else {
		/* Lock order: dirs lock before fdlock. */
		proc_dirs_lock_exclusive(p);
		proc_fdlock(p);
		tvp = p->p_fd.fd_cdir;
		p->p_fd.fd_cdir = ndp->ni_vp;
		proc_fdunlock(p);
		proc_dirs_unlock_exclusive(p);
	}

	/* Release the usecount held by the previous cwd, if any. */
	if (tvp) {
		vnode_rele(tvp);
	}

	return 0;
}
4177
4178
4179 /*
4180 * Change current working directory (".").
4181 *
4182 * Returns: 0 Success
4183 * chdir_internal:ENOTDIR
4184 * chdir_internal:ENOENT No such file or directory
4185 * chdir_internal:???
4186 */
4187 /* ARGSUSED */
4188 static int
common_chdir(proc_t p,struct chdir_args * uap,int per_thread)4189 common_chdir(proc_t p, struct chdir_args *uap, int per_thread)
4190 {
4191 struct nameidata nd;
4192 vfs_context_t ctx = vfs_context_current();
4193
4194 NDINIT(&nd, LOOKUP, OP_CHDIR, FOLLOW | AUDITVNPATH1,
4195 UIO_USERSPACE, uap->path, ctx);
4196
4197 return chdir_internal(p, ctx, &nd, per_thread);
4198 }
4199
4200
4201 /*
4202 * chdir
4203 *
4204 * Change current working directory (".") for the entire process
4205 *
4206 * Parameters: p Process requesting the call
4207 * uap User argument descriptor (see below)
4208 * retval (ignored)
4209 *
4210 * Indirect parameters: uap->path Directory path
4211 *
4212 * Returns: 0 Success
4213 * common_chdir: ENOTDIR
4214 * common_chdir: ENOENT No such file or directory
4215 * common_chdir: ???
4216 *
4217 */
int
chdir(proc_t p, struct chdir_args *uap, __unused int32_t *retval)
{
	/* per_thread == 0: affect the whole process. */
	return common_chdir(p, (void *)uap, 0);
}
4223
4224 /*
4225 * __pthread_chdir
4226 *
4227 * Change current working directory (".") for a single thread
4228 *
4229 * Parameters: p Process requesting the call
4230 * uap User argument descriptor (see below)
4231 * retval (ignored)
4232 *
4233 * Indirect parameters: uap->path Directory path
4234 *
4235 * Returns: 0 Success
4236 * common_chdir: ENOTDIR
4237 * common_chdir: ENOENT No such file or directory
4238 * common_chdir: ???
4239 *
4240 */
int
__pthread_chdir(proc_t p, struct __pthread_chdir_args *uap, __unused int32_t *retval)
{
	/* per_thread == 1: affect only the calling thread. */
	return common_chdir(p, (void *)uap, 1);
}
4246
4247
4248 /*
4249 * Change notion of root (``/'') directory.
4250 */
4251 /* ARGSUSED */
int
chroot(proc_t p, struct chroot_args *uap, __unused int32_t *retval)
{
	struct filedesc *fdp = &p->p_fd;
	int error;
	struct nameidata nd;
	vnode_t tvp;
	vfs_context_t ctx = vfs_context_current();

	/* chroot(2) requires superuser. */
	if ((error = suser(kauth_cred_get(), &p->p_acflag))) {
		return error;
	}

	NDINIT(&nd, LOOKUP, OP_CHROOT, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	/* change_dir validates VDIR + search permission and takes an iocount. */
	error = change_dir(&nd, ctx);
	if (error) {
		return error;
	}

#if CONFIG_MACF
	error = mac_vnode_check_chroot(ctx, nd.ni_vp,
	    &nd.ni_cnd);
	if (error) {
		vnode_put(nd.ni_vp);
		return error;
	}
#endif

	/* Swap the iocount for a persistent usecount held by fd_rdir. */
	if ((error = vnode_ref(nd.ni_vp))) {
		vnode_put(nd.ni_vp);
		return error;
	}
	vnode_put(nd.ni_vp);

	/*
	 * This lock provides the guarantee that as long as you hold the lock
	 * fdp->fd_rdir has a usecount on it. This is used to take an iocount
	 * on a referenced vnode in namei when determining the rootvnode for
	 * a process.
	 */
	/* needed for synchronization with lookup */
	proc_dirs_lock_exclusive(p);
	/* needed for setting the flag and other activities on the fd itself */
	proc_fdlock(p);
	tvp = fdp->fd_rdir;
	fdp->fd_rdir = nd.ni_vp;
	fdt_flag_set(fdp, FD_CHROOT);
	proc_fdunlock(p);
	proc_dirs_unlock_exclusive(p);

	/* Release the usecount held by the previous root, if any. */
	if (tvp != NULL) {
		vnode_rele(tvp);
	}

	return 0;
}
4309
/* Size of pivot_root()'s on-stack path buffers before falling back to ZV_NAMEI. */
#define PATHSTATICBUFLEN 256
/* Entitlement required (in addition to superuser + pid 1) to call pivot_root(). */
#define PIVOT_ROOT_ENTITLEMENT \
	"com.apple.private.vfs.pivot-root"
4313
4314 #if defined(XNU_TARGET_OS_OSX)
/*
 * pivot_root(2): atomically switch the system root filesystem to the
 * volume mounted at new_rootfs_path_before, remounting the old root at
 * old_rootfs_path_after.  Restricted to launchd (pid 1) holding the
 * pivot-root entitlement, on macOS only.
 */
int
pivot_root(proc_t p, struct pivot_root_args *uap, __unused int *retval)
{
	int error;
	char new_rootfs_path_before[PATHSTATICBUFLEN] = {0};
	char old_rootfs_path_after[PATHSTATICBUFLEN] = {0};
	char *new_rootfs_path_before_buf = NULL;
	char *old_rootfs_path_after_buf = NULL;
	char *incoming = NULL;
	char *outgoing = NULL;
	vnode_t incoming_rootvp = NULLVP;
	size_t bytes_copied;

	/*
	 * XXX : Additional restrictions needed
	 * - perhaps callable only once.
	 */
	if ((error = suser(kauth_cred_get(), &p->p_acflag))) {
		return error;
	}

	/*
	 * pivot_root can be executed by launchd only.
	 * Enforce entitlement.
	 */
	if ((proc_getpid(p) != 1) || !IOCurrentTaskHasEntitlement(PIVOT_ROOT_ENTITLEMENT)) {
		return EPERM;
	}

	/* Try the small stack buffer first; fall back to a MAXPATHLEN heap buffer. */
	error = copyinstr(uap->new_rootfs_path_before, &new_rootfs_path_before[0], PATHSTATICBUFLEN, &bytes_copied);
	if (error == ENAMETOOLONG) {
		new_rootfs_path_before_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
		error = copyinstr(uap->new_rootfs_path_before, new_rootfs_path_before_buf, MAXPATHLEN, &bytes_copied);
	}

	if (error) {
		goto out;
	}

	/* Same stack-then-heap strategy for the outgoing root's new path. */
	error = copyinstr(uap->old_rootfs_path_after, &old_rootfs_path_after[0], PATHSTATICBUFLEN, &bytes_copied);
	if (error == ENAMETOOLONG) {
		old_rootfs_path_after_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
		error = copyinstr(uap->old_rootfs_path_after, old_rootfs_path_after_buf, MAXPATHLEN, &bytes_copied);
	}
	if (error) {
		goto out;
	}

	if (new_rootfs_path_before_buf) {
		incoming = new_rootfs_path_before_buf;
	} else {
		incoming = &new_rootfs_path_before[0];
	}

	if (old_rootfs_path_after_buf) {
		outgoing = old_rootfs_path_after_buf;
	} else {
		outgoing = &old_rootfs_path_after[0];
	}

	/*
	 * The proposed incoming FS MUST be authenticated (i.e. not a chunklist DMG).
	 * Userland is not allowed to pivot to an image.
	 */
	error = vnode_lookup(incoming, 0, &incoming_rootvp, vfs_context_kernel());
	if (error) {
		goto out;
	}
	error = VNOP_IOCTL(incoming_rootvp, FSIOC_KERNEL_ROOTAUTH, NULL, 0, vfs_context_kernel());
	if (error) {
		goto out;
	}

	error = vfs_switch_root(incoming, outgoing, VFSSR_VIRTUALDEV_PROHIBITED);

out:
	if (incoming_rootvp != NULLVP) {
		vnode_put(incoming_rootvp);
		incoming_rootvp = NULLVP;
	}

	if (old_rootfs_path_after_buf) {
		zfree(ZV_NAMEI, old_rootfs_path_after_buf);
	}

	if (new_rootfs_path_before_buf) {
		zfree(ZV_NAMEI, new_rootfs_path_before_buf);
	}

	return error;
}
4406 #else
int
pivot_root(proc_t p, __unused struct pivot_root_args *uap, int *retval)
{
	/* pivot_root(2) is only implemented for XNU_TARGET_OS_OSX; elsewhere it is ENOSYS. */
	return nosys(p, NULL, retval);
}
4412 #endif /* XNU_TARGET_OS_OSX */
4413
4414 /*
4415 * Common routine for chroot and chdir.
4416 *
4417 * Returns: 0 Success
4418 * ENOTDIR Not a directory
4419 * namei:??? [anything namei can return]
4420 * vnode_authorize:??? [anything vnode_authorize can return]
4421 */
4422 static int
change_dir(struct nameidata * ndp,vfs_context_t ctx)4423 change_dir(struct nameidata *ndp, vfs_context_t ctx)
4424 {
4425 vnode_t vp;
4426 int error;
4427
4428 if ((error = namei(ndp))) {
4429 return error;
4430 }
4431 nameidone(ndp);
4432 vp = ndp->ni_vp;
4433
4434 if (vp->v_type != VDIR) {
4435 vnode_put(vp);
4436 return ENOTDIR;
4437 }
4438
4439 #if CONFIG_MACF
4440 error = mac_vnode_check_chdir(ctx, vp);
4441 if (error) {
4442 vnode_put(vp);
4443 return error;
4444 }
4445 #endif
4446
4447 error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
4448 if (error) {
4449 vnode_put(vp);
4450 return error;
4451 }
4452
4453 return error;
4454 }
4455
4456 /*
4457 * Free the vnode data (for directories) associated with the file glob.
4458 */
4459 struct fd_vn_data *
fg_vn_data_alloc(void)4460 fg_vn_data_alloc(void)
4461 {
4462 struct fd_vn_data *fvdata;
4463
4464 /* Allocate per fd vnode data */
4465 fvdata = kalloc_type(struct fd_vn_data, Z_WAITOK | Z_ZERO);
4466 lck_mtx_init(&fvdata->fv_lock, &fd_vn_lck_grp, &fd_vn_lck_attr);
4467 return fvdata;
4468 }
4469
4470 /*
4471 * Free the vnode data (for directories) associated with the file glob.
4472 */
4473 void
fg_vn_data_free(void * fgvndata)4474 fg_vn_data_free(void *fgvndata)
4475 {
4476 struct fd_vn_data *fvdata = (struct fd_vn_data *)fgvndata;
4477
4478 kfree_data(fvdata->fv_buf, fvdata->fv_bufallocsiz);
4479 lck_mtx_destroy(&fvdata->fv_lock, &fd_vn_lck_grp);
4480 kfree_type(struct fd_vn_data, fvdata);
4481 }
4482
4483 /*
4484 * Check permissions, allocate an open file structure,
4485 * and call the device open routine if any.
4486 *
4487 * Returns: 0 Success
4488 * EINVAL
4489 * EINTR
4490 * falloc:ENFILE
4491 * falloc:EMFILE
4492 * falloc:ENOMEM
4493 * vn_open_auth:???
4494 * dupfdopen:???
4495 * VNOP_ADVLOCK:???
4496 * vnode_setsize:???
4497 *
4498 * XXX Need to implement uid, gid
4499 */
int
open1(vfs_context_t ctx, struct nameidata *ndp, int uflags,
    struct vnode_attr *vap, fp_initfn_t fp_init, void *initarg, int32_t *retval, int authfd)
{
	proc_t p = vfs_context_proc(ctx);
	uthread_t uu = get_bsdthread_info(vfs_context_thread(ctx));
	struct fileproc *fp;
	vnode_t vp;
	int flags, oflags, amode;
	int type, indx, error;
	struct vfs_context context;
	vnode_t authvp = NULLVP;

	oflags = uflags;

	amode = oflags & O_ACCMODE;
	/*
	 * Because O_RDONLY is 0, it is not possible to distinguish between
	 * O_EXEC | O_RDONLY and O_EXEC, therefore FEXEC/FSEARCH can't be set together
	 * with FREAD/FWRITE.
	 */
	if ((amode == O_ACCMODE) || (amode && (oflags & O_EXEC))) {
		return EINVAL;
	}

	/* Convert userland O_* flags to in-kernel F* flags. */
	flags = FFLAGS(uflags);
	/* FENCRYPTED/FUNENCRYPTED are internal; strip them from caller-supplied flags. */
	CLR(flags, FENCRYPTED);
	CLR(flags, FUNENCRYPTED);

	AUDIT_ARG(fflags, oflags);
	AUDIT_ARG(mode, vap->va_mode);

	/* Reserve a descriptor slot and fileproc up front. */
	if ((error = falloc_withinit(p, &fp, &indx, ctx, fp_init, initarg)) != 0) {
		return error;
	}
	if (flags & O_CLOEXEC) {
		fp->fp_flags |= FP_CLOEXEC;
	}
	if (flags & O_CLOFORK) {
		fp->fp_flags |= FP_CLOFORK;
	}

	/* setup state to recognize when fdesc_open was called */
	uu->uu_dupfd = -1;

	/*
	 * Disable read/write access if file is opened with O_EVTONLY and
	 * the process has requested to deny read/write access.
	 */
	if ((flags & O_EVTONLY) && proc_disallow_rw_for_o_evtonly(p)) {
		flags &= ~(FREAD | FWRITE);
	}

	/* Optional authentication vnode (openat_dprotected_np with an authfd). */
	if (authfd != AUTH_OPEN_NOAUTHFD) {
		error = vnode_getfromfd(ctx, authfd, &authvp);
		if (error) {
			fp_free(p, indx, fp);
			return error;
		}
	}

	if ((error = vn_open_auth(ndp, &flags, vap, authvp))) {
		if (authvp != NULLVP) {
			vnode_put(authvp);
		}
		/*
		 * ENODEV/ENXIO with uu_dupfd >= 0 means the lookup hit /dev/fd/N
		 * (fdesc_open); satisfy the open by duplicating that descriptor.
		 */
		if ((error == ENODEV || error == ENXIO) && (uu->uu_dupfd >= 0)) {
			if ((error = dupfdopen(p, indx, uu->uu_dupfd, flags, error)) == 0) {
				*retval = indx;
				return 0;
			}
		}
		if (error == ERESTART) {
			error = EINTR;
		}
		fp_free(p, indx, fp);
		return error;
	}

	if (authvp != NULLVP) {
		vnode_put(authvp);
	}

	uu->uu_dupfd = 0;
	/* vn_open_auth() returned ni_vp with an iocount held. */
	vp = ndp->ni_vp;

	fp->fp_glob->fg_flag = flags & (FMASK | O_EVTONLY | FENCRYPTED | FUNENCRYPTED);
	fp->fp_glob->fg_ops = &vnops;
	fp_set_data(fp, vp);

#if CONFIG_FILE_LEASES
	/*
	 * If we are creating a file or open with truncate, we need to break the
	 * lease if there is a read lease placed on the parent dir.
	 */
	if ((vnode_vtype(vp) == VREG) && (flags & (O_CREAT | O_TRUNC))) {
		vnode_breakdirlease(vp, true, oflags);
	}
	/* Now check if there is a lease placed on the file itself. */
	error = vnode_breaklease(vp, oflags, ctx);
	if (error) {
		goto bad;
	}
#endif /* CONFIG_FILE_LEASES */

	/* Whole-file advisory lock requested at open time (BSD O_EXLOCK/O_SHLOCK). */
	if (flags & (O_EXLOCK | O_SHLOCK)) {
		struct flock lf = {
			.l_whence = SEEK_SET,
		};

		if (flags & O_EXLOCK) {
			lf.l_type = F_WRLCK;
		} else {
			lf.l_type = F_RDLCK;
		}
		type = F_FLOCK;
		if ((flags & FNONBLOCK) == 0) {
			type |= F_WAIT;
		}
#if CONFIG_MACF
		error = mac_file_check_lock(vfs_context_ucred(ctx), fp->fp_glob,
		    F_SETLK, &lf);
		if (error) {
			goto bad;
		}
#endif
		if ((error = VNOP_ADVLOCK(vp, (caddr_t)fp->fp_glob, F_SETLK, &lf, type, ctx, NULL))) {
			goto bad;
		}
		/* Remember we hold the lock so the error path (and close) can undo it. */
		fp->fp_glob->fg_flag |= FWASLOCKED;
	}

	/* try to truncate by setting the size attribute */
	if ((flags & O_TRUNC) && ((error = vnode_setsize(vp, (off_t)0, 0, ctx)) != 0)) {
		goto bad;
	}

	/*
	 * For directories we hold some additional information in the fd.
	 */
	if (vnode_vtype(vp) == VDIR) {
		fp->fp_glob->fg_vn_data = fg_vn_data_alloc();
	} else {
		fp->fp_glob->fg_vn_data = NULL;
	}

	/*
	 * NOTE(review): vp is still read below (vnode_istty, secluded checks)
	 * after this vnode_put; presumably the fileglob's open reference keeps
	 * the vnode valid — confirm against vn_open_auth/vn_close semantics.
	 */
	vnode_put(vp);

	/*
	 * The first terminal open (without a O_NOCTTY) by a session leader
	 * results in it being set as the controlling terminal.
	 */
	if (vnode_istty(vp) && !(p->p_flag & P_CONTROLT) &&
	    !(flags & O_NOCTTY)) {
		int tmp = 0;

		(void)(*fp->fp_glob->fg_ops->fo_ioctl)(fp, (int)TIOCSCTTY,
		    (caddr_t)&tmp, ctx);
	}

	proc_fdlock(p);
	/* Publish the descriptor: clear the reserved-slot marker in the fd table. */
	procfdtbl_releasefd(p, indx, NULL);

#if CONFIG_SECLUDED_MEMORY
	/* Decide whether this file's page cache may live in the secluded pool. */
	if (secluded_for_filecache &&
	    FILEGLOB_DTYPE(fp->fp_glob) == DTYPE_VNODE &&
	    vnode_vtype(vp) == VREG) {
		memory_object_control_t moc;

		moc = ubc_getobject(vp, UBC_FLAGS_NONE);

		if (moc == MEMORY_OBJECT_CONTROL_NULL) {
			/* nothing to do... */
		} else if (fp->fp_glob->fg_flag & FWRITE) {
			/* writable -> no longer eligible for secluded pages */
			memory_object_mark_eligible_for_secluded(moc,
			    FALSE);
		} else if (secluded_for_filecache == 1) {
			char pathname[32] = { 0, };
			size_t copied;
			/* XXX FBDP: better way to detect /Applications/ ? */
			if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
				(void)copyinstr(ndp->ni_dirp,
				    pathname,
				    sizeof(pathname),
				    &copied);
			} else {
				copystr(CAST_DOWN(void *, ndp->ni_dirp),
				    pathname,
				    sizeof(pathname),
				    &copied);
			}
			pathname[sizeof(pathname) - 1] = '\0';
			if (strncmp(pathname,
			    "/Applications/",
			    strlen("/Applications/")) == 0 &&
			    strncmp(pathname,
			    "/Applications/Camera.app/",
			    strlen("/Applications/Camera.app/")) != 0) {
				/*
				 * not writable
				 * AND from "/Applications/"
				 * AND not from "/Applications/Camera.app/"
				 * ==> eligible for secluded
				 */
				memory_object_mark_eligible_for_secluded(moc,
				    TRUE);
			}
		} else if (secluded_for_filecache == 2) {
			size_t len = strlen(vp->v_name);
			if (!strncmp(vp->v_name, "dyld", len) ||
			    !strncmp(vp->v_name, "launchd", len) ||
			    !strncmp(vp->v_name, "Camera", len) ||
			    !strncmp(vp->v_name, "mediaserverd", len) ||
			    !strncmp(vp->v_name, "SpringBoard", len) ||
			    !strncmp(vp->v_name, "backboardd", len)) {
				/*
				 * This file matters when launching Camera:
				 * do not store its contents in the secluded
				 * pool that will be drained on Camera launch.
				 */
				memory_object_mark_eligible_for_secluded(moc,
				    FALSE);
			}
		}
	}
#endif /* CONFIG_SECLUDED_MEMORY */

	fp_drop(p, indx, fp, 1);
	proc_fdunlock(p);

	*retval = indx;

	return 0;
bad:
	/* Error path: unlock (if we locked), close, and release the reserved fd. */
	context = *vfs_context_current();
	context.vc_ucred = fp->fp_glob->fg_cred;

	if ((fp->fp_glob->fg_flag & FWASLOCKED) &&
	    (FILEGLOB_DTYPE(fp->fp_glob) == DTYPE_VNODE)) {
		struct flock lf = {
			.l_whence = SEEK_SET,
			.l_type = F_UNLCK,
		};

		(void)VNOP_ADVLOCK(
			vp, (caddr_t)fp->fp_glob, F_UNLCK, &lf, F_FLOCK, ctx, NULL);
	}

	vn_close(vp, fp->fp_glob->fg_flag, &context);
	vnode_put(vp);
	fp_free(p, indx, fp);

	return error;
}
4754
4755 /*
4756 * While most of the *at syscall handlers can call nameiat() which
4757 * is a wrapper around namei, the use of namei and initialisation
4758 * of nameidata are far removed and in different functions - namei
4759 * gets called in vn_open_auth for open1. So we'll just do here what
4760 * nameiat() does.
4761 */
static int
open1at(vfs_context_t ctx, struct nameidata *ndp, int uflags,
    struct vnode_attr *vap, fp_initfn_t fp_init, void *initarg, int32_t *retval,
    int dirfd, int authfd)
{
	/*
	 * Emulate nameiat(): for a relative path with an explicit dirfd,
	 * resolve dirfd to a directory vnode and pass it to open1() via
	 * ni_dvp + USEDVP.  If USEDVP is already set, the caller supplied
	 * its own starting directory and dirfd is ignored.
	 */
	if ((dirfd != AT_FDCWD) && !(ndp->ni_cnd.cn_flags & USEDVP)) {
		int error;
		char c;

		/* Peek at the first byte of the path to detect an absolute path. */
		if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
			error = copyin(ndp->ni_dirp, &c, sizeof(char));
			if (error) {
				return error;
			}
		} else {
			c = *((char *)(ndp->ni_dirp));
		}

		if (c != '/') {
			vnode_t dvp_at;

			/* Takes an iocount on the fd's vnode. */
			error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
			    &dvp_at);
			if (error) {
				return error;
			}

			/* dirfd must name a directory. */
			if (vnode_vtype(dvp_at) != VDIR) {
				vnode_put(dvp_at);
				return ENOTDIR;
			}

			ndp->ni_dvp = dvp_at;
			ndp->ni_cnd.cn_flags |= USEDVP;
			error = open1(ctx, ndp, uflags, vap, fp_init, initarg,
			    retval, authfd);
			/* Drop the iocount only after open1() is done with ni_dvp. */
			vnode_put(dvp_at);
			return error;
		}
	}

	/* Absolute path, AT_FDCWD, or caller-provided dvp: plain open1(). */
	return open1(ctx, ndp, uflags, vap, fp_init, initarg, retval, authfd);
}
4805
4806 /*
4807 * open_extended: open a file given a path name; with extended argument list (including extended security (ACL)).
4808 *
4809 * Parameters: p Process requesting the open
4810 * uap User argument descriptor (see below)
4811 * retval Pointer to an area to receive the
 *				return value from the system call
4813 *
4814 * Indirect: uap->path Path to open (same as 'open')
 *		uap->flags			Flags to open (same as 'open')
4816 * uap->uid UID to set, if creating
4817 * uap->gid GID to set, if creating
4818 * uap->mode File mode, if creating (same as 'open')
4819 * uap->xsecurity ACL to set, if creating
4820 *
4821 * Returns: 0 Success
4822 * !0 errno value
4823 *
4824 * Notes: The kauth_filesec_t in 'va', if any, is in host byte order.
4825 *
 * XXX:	We should enumerate the possible errno values here, and where
4827 * in the code they originated.
4828 */
4829 int
open_extended(proc_t p,struct open_extended_args * uap,int32_t * retval)4830 open_extended(proc_t p, struct open_extended_args *uap, int32_t *retval)
4831 {
4832 int ciferror;
4833 kauth_filesec_t xsecdst;
4834 struct vnode_attr va;
4835 struct nameidata nd;
4836 int cmode;
4837
4838 AUDIT_ARG(owner, uap->uid, uap->gid);
4839
4840 xsecdst = NULL;
4841 if ((uap->xsecurity != USER_ADDR_NULL) &&
4842 ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)) {
4843 return ciferror;
4844 }
4845
4846 VATTR_INIT(&va);
4847 cmode = ((uap->mode & ~p->p_fd.fd_cmask) & ALLPERMS) & ~S_ISTXT;
4848 VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
4849 if (uap->uid != KAUTH_UID_NONE) {
4850 VATTR_SET(&va, va_uid, uap->uid);
4851 }
4852 if (uap->gid != KAUTH_GID_NONE) {
4853 VATTR_SET(&va, va_gid, uap->gid);
4854 }
4855 if (xsecdst != NULL) {
4856 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
4857 va.va_vaflags |= VA_FILESEC_ACL;
4858 }
4859
4860 NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
4861 uap->path, vfs_context_current());
4862
4863 ciferror = open1(vfs_context_current(), &nd, uap->flags, &va,
4864 NULL, NULL, retval, AUTH_OPEN_NOAUTHFD);
4865 if (xsecdst != NULL) {
4866 kauth_filesec_free(xsecdst);
4867 }
4868
4869 return ciferror;
4870 }
4871
4872 /*
4873 * Go through the data-protected atomically controlled open (2)
4874 *
4875 * int open_dprotected_np(user_addr_t path, int flags, int class, int dpflags, int mode)
4876 */
4877 static int
openat_dprotected_internal(vfs_context_t ctx,user_addr_t path,int flags,int mode,int class,int dpflags,int fd,int authfd,enum uio_seg segflg,int * retval)4878 openat_dprotected_internal(vfs_context_t ctx, user_addr_t path, int flags, int mode,
4879 int class, int dpflags, int fd, int authfd, enum uio_seg segflg, int *retval)
4880 {
4881 /*
4882 * Follow the same path as normal open(2)
4883 * Look up the item if it exists, and acquire the vnode.
4884 */
4885 struct vnode_attr va;
4886 struct nameidata nd;
4887 int cmode;
4888 int error;
4889 struct filedesc *fdp = &vfs_context_proc(ctx)->p_fd;
4890
4891 VATTR_INIT(&va);
4892 /* Mask off all but regular access permissions */
4893 cmode = ((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
4894 VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
4895
4896 NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, segflg,
4897 path, ctx);
4898
4899 /*
4900 * Initialize the extra fields in vnode_attr to pass down our
4901 * extra fields.
4902 * 1. target cprotect class.
4903 * 2. set a flag to mark it as requiring open-raw-encrypted semantics.
4904 */
4905 if (flags & O_CREAT) {
4906 /* lower level kernel code validates that the class is valid before applying it. */
4907 if (class != PROTECTION_CLASS_DEFAULT) {
4908 /*
4909 * PROTECTION_CLASS_DEFAULT implies that we make the class for this
4910 * file behave the same as open (2)
4911 */
4912 VATTR_SET(&va, va_dataprotect_class, class);
4913 }
4914 }
4915
4916 if (dpflags & (O_DP_GETRAWENCRYPTED | O_DP_GETRAWUNENCRYPTED | O_DP_AUTHENTICATE)) {
4917 if (flags & (O_RDWR | O_WRONLY)) {
4918 /*
4919 * Not allowed to write raw encrypted bytes or when opening authenticated.
4920 */
4921 return EINVAL;
4922 }
4923 if (dpflags & O_DP_GETRAWENCRYPTED) {
4924 VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWENCRYPTED);
4925 }
4926 if (dpflags & O_DP_GETRAWUNENCRYPTED) {
4927 VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWUNENCRYPTED);
4928 }
4929 if (dpflags & O_DP_AUTHENTICATE) {
4930 VATTR_SET(&va, va_dataprotect_flags, VA_DP_AUTHENTICATE);
4931 }
4932 }
4933
4934 error = open1at(vfs_context_current(), &nd, flags, &va,
4935 NULL, NULL, retval, fd, authfd);
4936
4937 return error;
4938 }
4939
4940 int
openat_dprotected_np(__unused proc_t p,struct openat_dprotected_np_args * uap,int32_t * retval)4941 openat_dprotected_np(__unused proc_t p, struct openat_dprotected_np_args *uap, int32_t *retval)
4942 {
4943 if ((uap->dpflags & O_DP_AUTHENTICATE) && (uap->flags & O_CREAT)) {
4944 return EINVAL;
4945 }
4946
4947 return openat_dprotected_internal(vfs_context_current(), uap->path, uap->flags, uap->mode,
4948 uap->class, uap->dpflags, uap->fd, uap->authfd, UIO_USERSPACE, retval);
4949 }
4950
4951 int
open_dprotected_np(__unused proc_t p,struct open_dprotected_np_args * uap,int32_t * retval)4952 open_dprotected_np(__unused proc_t p, struct open_dprotected_np_args *uap, int32_t *retval)
4953 {
4954 if (uap->dpflags & O_DP_AUTHENTICATE) {
4955 return EINVAL;
4956 }
4957
4958 return openat_dprotected_internal(vfs_context_current(), uap->path, uap->flags, uap->mode,
4959 uap->class, uap->dpflags, AT_FDCWD, AUTH_OPEN_NOAUTHFD, UIO_USERSPACE, retval);
4960 }
4961
4962 static int
openat_internal(vfs_context_t ctx,user_addr_t path,int flags,int mode,int fd,enum uio_seg segflg,int * retval)4963 openat_internal(vfs_context_t ctx, user_addr_t path, int flags, int mode,
4964 int fd, enum uio_seg segflg, int *retval)
4965 {
4966 struct filedesc *fdp = &vfs_context_proc(ctx)->p_fd;
4967 struct {
4968 struct vnode_attr va;
4969 struct nameidata nd;
4970 } *__open_data;
4971 struct vnode_attr *vap;
4972 struct nameidata *ndp;
4973 int cmode;
4974 int error;
4975
4976 __open_data = kalloc_type(typeof(*__open_data), Z_WAITOK);
4977 vap = &__open_data->va;
4978 ndp = &__open_data->nd;
4979
4980 VATTR_INIT(vap);
4981 /* Mask off all but regular access permissions */
4982 cmode = ((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
4983 VATTR_SET(vap, va_mode, cmode & ACCESSPERMS);
4984
4985 NDINIT(ndp, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1,
4986 segflg, path, ctx);
4987
4988 error = open1at(ctx, ndp, flags, vap, NULL, NULL, retval, fd, AUTH_OPEN_NOAUTHFD);
4989
4990 kfree_type(typeof(*__open_data), __open_data);
4991
4992 return error;
4993 }
4994
int
open(proc_t p, struct open_args *uap, int32_t *retval)
{
	/* open(2) is a cancellation point; honor pending cancellation, then delegate. */
	__pthread_testcancel(1);
	return open_nocancel(p, (struct open_nocancel_args *)uap, retval);
}
5001
int
open_nocancel(__unused proc_t p, struct open_nocancel_args *uap,
    int32_t *retval)
{
	/* open(2) without the cancellation point; path is relative to the CWD. */
	return openat_internal(vfs_context_current(), uap->path, uap->flags,
	    uap->mode, AT_FDCWD, UIO_USERSPACE, retval);
}
5009
int
openat_nocancel(__unused proc_t p, struct openat_nocancel_args *uap,
    int32_t *retval)
{
	/* openat(2) without the cancellation point; relative paths resolve against uap->fd. */
	return openat_internal(vfs_context_current(), uap->path, uap->flags,
	    uap->mode, uap->fd, UIO_USERSPACE, retval);
}
5017
int
openat(proc_t p, struct openat_args *uap, int32_t *retval)
{
	/* openat(2) is a cancellation point; honor pending cancellation, then delegate. */
	__pthread_testcancel(1);
	return openat_nocancel(p, (struct openat_nocancel_args *)uap, retval);
}
5024
5025 #define OPEN_BY_ID_ENTITLEMENT "com.apple.private.vfs.open-by-id"
5026
5027 static boolean_t
vfs_context_can_open_by_id(vfs_context_t ctx)5028 vfs_context_can_open_by_id(vfs_context_t ctx)
5029 {
5030 if (csproc_get_platform_binary(vfs_context_proc(ctx))) {
5031 return TRUE;
5032 }
5033
5034 return IOTaskHasEntitlement(vfs_context_task(ctx),
5035 OPEN_BY_ID_ENTITLEMENT);
5036 }
5037
5038 /*
5039 * openbyid_np: open a file given a file system id and a file system object id
5040 * the hfs file system object id is an fsobj_id_t {uint32, uint32}
5041 * file systems that don't support object ids it is a node id (uint64_t).
5042 *
5043 * Parameters: p Process requesting the open
5044 * uap User argument descriptor (see below)
5045 * retval Pointer to an area to receive the
 *				return value from the system call
5047 *
5048 * Indirect: uap->path Path to open (same as 'open')
5049 *
5050 * uap->fsid id of target file system
5051 * uap->objid id of target file system object
5052 * uap->flags Flags to open (same as 'open')
5053 *
5054 * Returns: 0 Success
5055 * !0 errno value
5056 *
5057 *
 * XXX:	We should enumerate the possible errno values here, and where
5059 * in the code they originated.
5060 */
int
openbyid_np(__unused proc_t p, struct openbyid_np_args *uap, int *retval)
{
	fsid_t fsid;
	uint64_t objid;
	int error;
	char *buf = NULL;
	int buflen = MAXPATHLEN;
	int pathlen = 0;
	vfs_context_t ctx = vfs_context_current();

	/* Restricted facility: platform binary or open-by-id entitlement required. */
	if (!vfs_context_can_open_by_id(ctx)) {
		return EPERM;
	}

	if ((error = copyin(uap->fsid, (caddr_t)&fsid, sizeof(fsid)))) {
		return error;
	}

	/* uap->objid is an fsobj_id_t defined as struct {uint32_t, uint32_t} */
	if ((error = copyin(uap->objid, (caddr_t)&objid, sizeof(uint64_t)))) {
		return error;
	}

	AUDIT_ARG(value32, fsid.val[0]);
	AUDIT_ARG(value64, objid);

	/* Resolve a path from (fsid, objid), growing the buffer on ENOSPC. */
	do {
		buf = kalloc_data(buflen + 1, Z_WAITOK);
		if (buf == NULL) {
			return ENOMEM;
		}

		error = fsgetpath_internal( ctx, fsid.val[0], objid, buflen,
		    buf, FSOPT_ISREALFSID, &pathlen);

		if (error) {
			kfree_data(buf, buflen + 1);
			buf = NULL;
		}
	} while (error == ENOSPC && (buflen += MAXPATHLEN));

	if (error) {
		return error;
	}

	/* buf was allocated with buflen + 1 bytes, so this terminator store is in bounds. */
	buf[pathlen] = 0;

	/* Open the resolved path; it is a kernel-space string (UIO_SYSSPACE). */
	error = openat_internal(
		ctx, (user_addr_t)buf, uap->oflags, 0, AT_FDCWD, UIO_SYSSPACE, retval);

	kfree_data(buf, buflen + 1);

	return error;
}
5117
5118
5119 /*
5120 * Create a special file.
5121 */
5122 static int mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap,
5123 int fd);
5124
/*
 * Common implementation of mknod(2)/mknodat(2): create a character or block
 * special file.  FIFO requests are routed to mkfifo1().  Requires superuser.
 * 'fd' is the directory fd used for relative paths (AT_FDCWD for mknod(2)).
 */
static int
mknodat_internal(proc_t p, user_addr_t upath, struct vnode_attr *vap,
    mode_t mode, int fd)
{
	vfs_context_t ctx = vfs_context_current();
	struct nameidata nd;
	vnode_t vp, dvp;
	int error;

	/* If it's a mknod() of a FIFO, call mkfifo1() instead */
	if ((mode & S_IFMT) == S_IFIFO) {
		return mkfifo1(ctx, upath, vap, fd);
	}

	AUDIT_ARG(mode, mode);
	AUDIT_ARG(value32, vap->va_rdev);

	/* Creating device nodes is restricted to the superuser. */
	if ((error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
		return error;
	}
	NDINIT(&nd, CREATE, OP_MKNOD, LOCKPARENT | AUDITVNPATH1,
	    UIO_USERSPACE, upath, ctx);
	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* A non-NULL ni_vp means the last path component already exists. */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}

	switch (mode & S_IFMT) {
	case S_IFCHR:
		VATTR_SET(vap, va_type, VCHR);
		break;
	case S_IFBLK:
		VATTR_SET(vap, va_type, VBLK);
		break;
	default:
		/* Only character and block specials are created here. */
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_create(ctx,
	    nd.ni_dvp, &nd.ni_cnd, vap);
	if (error) {
		goto out;
	}
#endif

	if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
		goto out;
	}

#if CONFIG_FILE_LEASES
	/* Creating an entry modifies the directory: break any read lease on it. */
	vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

	if ((error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx)) != 0) {
		goto out;
	}

	if (vp) {
		int update_flags = 0;

		// Make sure the name & parent pointers are hooked up
		if (vp->v_name == NULL) {
			update_flags |= VNODE_UPDATE_NAME;
		}
		if (vp->v_parent == NULLVP) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		if (update_flags) {
			vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
		}

#if CONFIG_FSE
		add_fsevent(FSE_CREATE_FILE, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
#endif
	}

out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	vnode_put(dvp);

	return error;
}
5227
5228 int
mknod(proc_t p,struct mknod_args * uap,__unused int32_t * retval)5229 mknod(proc_t p, struct mknod_args *uap, __unused int32_t *retval)
5230 {
5231 struct vnode_attr va;
5232
5233 VATTR_INIT(&va);
5234 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5235 VATTR_SET(&va, va_rdev, uap->dev);
5236
5237 return mknodat_internal(p, uap->path, &va, (mode_t)uap->mode, AT_FDCWD);
5238 }
5239
5240 int
mknodat(proc_t p,struct mknodat_args * uap,__unused int32_t * retval)5241 mknodat(proc_t p, struct mknodat_args *uap, __unused int32_t *retval)
5242 {
5243 struct vnode_attr va;
5244
5245 VATTR_INIT(&va);
5246 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5247 VATTR_SET(&va, va_rdev, uap->dev);
5248
5249 return mknodat_internal(p, uap->path, &va, (mode_t)uap->mode, uap->fd);
5250 }
5251
5252 /*
5253 * Create a named pipe.
5254 *
5255 * Returns: 0 Success
5256 * EEXIST
5257 * namei:???
5258 * vnode_authorize:???
5259 * vn_create:???
5260 */
static int
mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap, int fd)
{
	vnode_t vp, dvp;
	int error;
	struct nameidata nd;

	NDINIT(&nd, CREATE, OP_MKFIFO, LOCKPARENT | AUDITVNPATH1,
	    UIO_USERSPACE, upath, ctx);
	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* check that this is a new file and authorize addition */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}
	VATTR_SET(vap, va_type, VFIFO);

	/* MAC check plus vnode authorization for adding an entry to dvp. */
	if ((error = vn_authorize_create(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0) {
		goto out;
	}

	error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx);
out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	vnode_put(dvp);

	return error;
}
5303
5304
5305 /*
5306 * mkfifo_extended: Create a named pipe; with extended argument list (including extended security (ACL)).
5307 *
5308 * Parameters: p Process requesting the open
5309 * uap User argument descriptor (see below)
5310 * retval (Ignored)
5311 *
5312 * Indirect: uap->path Path to fifo (same as 'mkfifo')
5313 * uap->uid UID to set
5314 * uap->gid GID to set
5315 * uap->mode File mode to set (same as 'mkfifo')
5316 * uap->xsecurity ACL to set, if creating
5317 *
5318 * Returns: 0 Success
5319 * !0 errno value
5320 *
5321 * Notes: The kauth_filesec_t in 'va', if any, is in host byte order.
5322 *
5323 * XXX: We should enummerate the possible errno values here, and where
5324 * in the code they originated.
5325 */
5326 int
mkfifo_extended(proc_t p,struct mkfifo_extended_args * uap,__unused int32_t * retval)5327 mkfifo_extended(proc_t p, struct mkfifo_extended_args *uap, __unused int32_t *retval)
5328 {
5329 int ciferror;
5330 kauth_filesec_t xsecdst;
5331 struct vnode_attr va;
5332
5333 AUDIT_ARG(owner, uap->uid, uap->gid);
5334
5335 xsecdst = KAUTH_FILESEC_NONE;
5336 if (uap->xsecurity != USER_ADDR_NULL) {
5337 if ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0) {
5338 return ciferror;
5339 }
5340 }
5341
5342 VATTR_INIT(&va);
5343 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5344 if (uap->uid != KAUTH_UID_NONE) {
5345 VATTR_SET(&va, va_uid, uap->uid);
5346 }
5347 if (uap->gid != KAUTH_GID_NONE) {
5348 VATTR_SET(&va, va_gid, uap->gid);
5349 }
5350 if (xsecdst != KAUTH_FILESEC_NONE) {
5351 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
5352 va.va_vaflags |= VA_FILESEC_ACL;
5353 }
5354
5355 ciferror = mkfifo1(vfs_context_current(), uap->path, &va, AT_FDCWD);
5356
5357 if (xsecdst != KAUTH_FILESEC_NONE) {
5358 kauth_filesec_free(xsecdst);
5359 }
5360 return ciferror;
5361 }
5362
5363 /* ARGSUSED */
5364 int
mkfifo(proc_t p,struct mkfifo_args * uap,__unused int32_t * retval)5365 mkfifo(proc_t p, struct mkfifo_args *uap, __unused int32_t *retval)
5366 {
5367 struct vnode_attr va;
5368
5369 VATTR_INIT(&va);
5370 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5371
5372 return mkfifo1(vfs_context_current(), uap->path, &va, AT_FDCWD);
5373 }
5374
5375 int
mkfifoat(proc_t p,struct mkfifoat_args * uap,__unused int32_t * retval)5376 mkfifoat(proc_t p, struct mkfifoat_args *uap, __unused int32_t *retval)
5377 {
5378 struct vnode_attr va;
5379
5380 VATTR_INIT(&va);
5381 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5382
5383 return mkfifo1(vfs_context_current(), uap->path, &va, uap->fd);
5384 }
5385
5386 extern int safe_getpath_new(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path, int firmlink);
5387 extern int safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path);
5388 extern int safe_getpath_no_firmlink(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path);
5389
/*
 * Build a best-effort path for dvp, optionally appending leafname.  Never
 * fails: on lookup failure it walks up v_parent chains (or falls back to the
 * mount point or "/") and sets *truncated_path.  Returns the length of the
 * resulting string including the NUL.  'firmlink' selects whether firmlinks
 * are followed when constructing the path.
 *
 * NOTE(review): the append logic bounds itself with MAXPATHLEN rather than
 * _len — callers are presumed to pass MAXPATHLEN-sized buffers; confirm at
 * call sites.
 */
int
safe_getpath_new(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path, int firmlink)
{
	int ret, len = _len;

	*truncated_path = 0;

	if (firmlink) {
		ret = vn_getpath(dvp, path, &len);
	} else {
		ret = vn_getpath_no_firmlink(dvp, path, &len);
	}
	if (ret == 0 && len < (MAXPATHLEN - 1)) {
		if (leafname) {
			/* Overwrite the NUL with '/' and append the leaf name. */
			path[len - 1] = '/';
			len += strlcpy(&path[len], leafname, MAXPATHLEN - len) + 1;
			if (len > MAXPATHLEN) {
				char *ptr;

				// the string got truncated!
				*truncated_path = 1;
				ptr = strrchr(path, '/');
				if (ptr) {
					*ptr = '\0'; // chop off the string at the last directory component
				}
				len = (int)strlen(path) + 1;
			}
		}
	} else if (ret == 0) {
		/* Directory path (nearly) fills the buffer; no room to append — report truncated. */
		*truncated_path = 1;
	} else if (ret != 0) {
		struct vnode *mydvp = dvp;

		if (ret != ENOSPC) {
			printf("safe_getpath: failed to get the path for vp %p (%s) : err %d\n",
			    dvp, dvp->v_name ? dvp->v_name : "no-name", ret);
		}
		*truncated_path = 1;

		/* Walk up the ancestry until some prefix path can be produced. */
		do {
			if (mydvp->v_parent != NULL) {
				mydvp = mydvp->v_parent;
			} else if (mydvp->v_mount) {
				strlcpy(path, mydvp->v_mount->mnt_vfsstat.f_mntonname, _len);
				break;
			} else {
				// no parent and no mount point? only thing is to punt and say "/" changed
				strlcpy(path, "/", _len);
				len = 2;
				mydvp = NULL;
			}

			if (mydvp == NULL) {
				break;
			}

			len = _len;
			if (firmlink) {
				ret = vn_getpath(mydvp, path, &len);
			} else {
				ret = vn_getpath_no_firmlink(mydvp, path, &len);
			}
		} while (ret == ENOSPC);
	}

	return len;
}
5457
/* Best-effort path for dvp (+ optional leafname), following firmlinks. */
int
safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path)
{
	return safe_getpath_new(dvp, leafname, path, _len, truncated_path, 1);
}
5463
/* Best-effort path for dvp (+ optional leafname), without following firmlinks. */
int
safe_getpath_no_firmlink(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path)
{
	return safe_getpath_new(dvp, leafname, path, _len, truncated_path, 0);
}
5469
5470 /*
5471 * Make a hard file link.
5472 *
5473 * Returns: 0 Success
5474 * EPERM
5475 * EEXIST
5476 * EXDEV
5477 * namei:???
5478 * vnode_authorize:???
5479 * VNOP_LINK:???
5480 */
/* ARGSUSED */
static int
linkat_internal(vfs_context_t ctx, int fd1, user_addr_t path, int fd2,
    user_addr_t link, int flag, enum uio_seg segflg)
{
    vnode_t vp, pvp, dvp, lvp;
    struct nameidata nd;
    int follow;
    int error;
#if CONFIG_FSE
    fse_info finfo;
#endif
    int need_event, has_listeners, need_kpath2;
    char *target_path = NULL;
    char *no_firmlink_path = NULL;
    int truncated = 0;
    int truncated_no_firmlink_path = 0;

    vp = dvp = lvp = NULLVP;

    /* look up the object we are linking to */
    follow = (flag & AT_SYMLINK_FOLLOW) ? FOLLOW : NOFOLLOW;
    NDINIT(&nd, LOOKUP, OP_LOOKUP, AUDITVNPATH1 | follow,
        segflg, path, ctx);

    error = nameiat(&nd, fd1);
    if (error) {
        return error;
    }
    vp = nd.ni_vp; /* holds an iocount until the final vnode_put() below */

    nameidone(&nd);

    /*
     * Normally, linking to directories is not supported.
     * However, some file systems may have limited support.
     */
    if (vp->v_type == VDIR) {
        if (!ISSET(vp->v_mount->mnt_kern_flag, MNTK_DIR_HARDLINKS)) {
            error = EPERM;   /* POSIX */
            goto out;
        }

        /* Linking to a directory requires ownership. */
        if (!kauth_cred_issuser(vfs_context_ucred(ctx))) {
            struct vnode_attr dva;

            VATTR_INIT(&dva);
            VATTR_WANTED(&dva, va_uid);
            if (vnode_getattr(vp, &dva, ctx) != 0 ||
                !VATTR_IS_SUPPORTED(&dva, va_uid) ||
                (dva.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)))) {
                error = EACCES;
                goto out;
            }
        }
    }

    /*
     * lookup the target node: the same nameidata is re-armed for a
     * CREATE-style lookup of the new link's name.
     */
#if CONFIG_TRIGGERS
    nd.ni_op = OP_LINK;
#endif
    nd.ni_cnd.cn_nameiop = CREATE;
    nd.ni_cnd.cn_flags = LOCKPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK;
    nd.ni_dirp = link;
    error = nameiat(&nd, fd2);
    if (error != 0) {
        goto out;
    }
    dvp = nd.ni_dvp;    /* parent directory of the new link */
    lvp = nd.ni_vp;     /* non-NULL iff the link name already exists */

#if CONFIG_MACF
    if ((error = mac_vnode_check_link(ctx, dvp, vp, &nd.ni_cnd)) != 0) {
        goto out2;
    }
#endif

    /* or to anything that kauth doesn't want us to (eg. immutable items) */
    if ((error = vnode_authorize(vp, NULL, KAUTH_VNODE_LINKTARGET, ctx)) != 0) {
        goto out2;
    }

    /* target node must not exist */
    if (lvp != NULLVP) {
        error = EEXIST;
        goto out2;
    }
    /* cannot link across mountpoints */
    if (vnode_mount(vp) != vnode_mount(dvp)) {
        error = EXDEV;
        goto out2;
    }

    /* authorize creation of the target node */
    if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
        goto out2;
    }

#if CONFIG_FILE_LEASES
    vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

    /* and finally make the link */
    error = VNOP_LINK(vp, dvp, &nd.ni_cnd, ctx);
    if (error) {
        goto out2;
    }

#if CONFIG_MACF
    (void)mac_vnode_notify_link(ctx, vp, dvp, &nd.ni_cnd);
#endif

#if CONFIG_FSE
    need_event = need_fsevent(FSE_CREATE_FILE, dvp);
#else
    need_event = 0;
#endif
    has_listeners = kauth_authorize_fileop_has_listeners();

    need_kpath2 = 0;
#if CONFIG_AUDIT
    if (AUDIT_RECORD_EXISTS()) {
        need_kpath2 = 1;
    }
#endif

    /*
     * Only build pathnames when someone (fsevents, kauth fileop
     * listeners, or audit) will actually consume them.
     */
    if (need_event || has_listeners || need_kpath2) {
        char *link_to_path = NULL;
        int len, link_name_len;
        int len_no_firmlink_path = 0;

        /* build the path to the new link file */
        GET_PATH(target_path);

        len = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, target_path, MAXPATHLEN, &truncated);
        if (no_firmlink_path == NULL) {
            GET_PATH(no_firmlink_path);
        }
        len_no_firmlink_path = safe_getpath_no_firmlink(dvp, nd.ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);

        AUDIT_ARG(kpath, target_path, ARG_KPATH2);

        if (has_listeners) {
            /* build the path to file we are linking to */
            GET_PATH(link_to_path);

            link_name_len = MAXPATHLEN;
            if (vn_getpath(vp, link_to_path, &link_name_len) == 0) {
                /*
                 * Call out to allow 3rd party notification of rename.
                 * Ignore result of kauth_authorize_fileop call.
                 */
                kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_LINK,
                    (uintptr_t)link_to_path,
                    (uintptr_t)target_path);
            }
            if (link_to_path != NULL) {
                RELEASE_PATH(link_to_path);
            }
        }
#if CONFIG_FSE
        if (need_event) {
            /* construct fsevent */
            if (get_fse_info(vp, &finfo, ctx) == 0) {
                if (truncated_no_firmlink_path) {
                    finfo.mode |= FSE_TRUNCATED_PATH;
                }

                // build the path to the destination of the link
                add_fsevent(FSE_CREATE_FILE, ctx,
                    FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
                    FSE_ARG_FINFO, &finfo,
                    FSE_ARG_DONE);
            }

            pvp = vp->v_parent;
            // need an iocount on pvp in this case
            if (pvp && pvp != dvp) {
                error = vnode_get(pvp);
                if (error) {
                    pvp = NULLVP;
                    error = 0; /* event is best-effort: don't fail the link */
                }
            }
            if (pvp) {
                add_fsevent(FSE_STAT_CHANGED, ctx,
                    FSE_ARG_VNODE, pvp, FSE_ARG_DONE);
            }
            if (pvp && pvp != dvp) {
                vnode_put(pvp);
            }
        }
#endif
    }
out2:
    /*
     * nameidone has to happen before we vnode_put(dvp)
     * since it may need to release the fs_nodelock on the dvp
     */
    nameidone(&nd);
    if (target_path != NULL) {
        RELEASE_PATH(target_path);
    }
    if (no_firmlink_path != NULL) {
        RELEASE_PATH(no_firmlink_path);
        no_firmlink_path = NULL;
    }
out:
    /* lvp/dvp are only non-NULL after the second lookup succeeded */
    if (lvp) {
        vnode_put(lvp);
    }
    if (dvp) {
        vnode_put(dvp);
    }
    vnode_put(vp);
    return error;
}
5699
5700 int
link(__unused proc_t p,struct link_args * uap,__unused int32_t * retval)5701 link(__unused proc_t p, struct link_args *uap, __unused int32_t *retval)
5702 {
5703 return linkat_internal(vfs_context_current(), AT_FDCWD, uap->path,
5704 AT_FDCWD, uap->link, AT_SYMLINK_FOLLOW, UIO_USERSPACE);
5705 }
5706
5707 int
linkat(__unused proc_t p,struct linkat_args * uap,__unused int32_t * retval)5708 linkat(__unused proc_t p, struct linkat_args *uap, __unused int32_t *retval)
5709 {
5710 if (uap->flag & ~AT_SYMLINK_FOLLOW) {
5711 return EINVAL;
5712 }
5713
5714 return linkat_internal(vfs_context_current(), uap->fd1, uap->path,
5715 uap->fd2, uap->link, uap->flag, UIO_USERSPACE);
5716 }
5717
5718 /*
5719 * Make a symbolic link.
5720 *
5721 * We could add support for ACLs here too...
5722 */
/* ARGSUSED */
static int
symlinkat_internal(vfs_context_t ctx, user_addr_t path_data, int fd,
    user_addr_t link, enum uio_seg segflg)
{
    struct vnode_attr va;
    char *path;       /* the link's target string (what it will point at) */
    int error;
    struct nameidata nd;
    vnode_t vp, dvp;
    size_t dummy = 0;
    proc_t p;

    error = 0;
    if (UIO_SEG_IS_USER_SPACE(segflg)) {
        /* copy the target string in; freed at "out" */
        path = zalloc(ZV_NAMEI);
        error = copyinstr(path_data, path, MAXPATHLEN, &dummy);
    } else {
        /* kernel caller: use the string in place, no allocation */
        path = (char *)path_data;
    }
    if (error) {
        goto out;
    }
    AUDIT_ARG(text, path);  /* This is the link string */

    NDINIT(&nd, CREATE, OP_SYMLINK, LOCKPARENT | AUDITVNPATH1,
        segflg, link, ctx);

    error = nameiat(&nd, fd);
    if (error) {
        goto out;
    }
    dvp = nd.ni_dvp;
    vp = nd.ni_vp;   /* non-NULL means the link name already exists */

    p = vfs_context_proc(ctx);
    VATTR_INIT(&va);
    VATTR_SET(&va, va_type, VLNK);
    VATTR_SET(&va, va_mode, ACCESSPERMS & ~p->p_fd.fd_cmask);

#if CONFIG_MACF
    error = mac_vnode_check_create(ctx,
        dvp, &nd.ni_cnd, &va);
#endif
    /* error is still 0 here when CONFIG_MACF is disabled */
    if (error != 0) {
        goto skipit;
    }

    if (vp != NULL) {
        error = EEXIST;
        goto skipit;
    }

    /* authorize */
    if (error == 0) {
        error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx);
    }
    /* get default ownership, etc. */
    if (error == 0) {
        error = vnode_authattr_new(dvp, &va, 0, ctx);
    }

#if CONFIG_FILE_LEASES
    vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

    if (error == 0) {
        error = VNOP_SYMLINK(dvp, &vp, &nd.ni_cnd, &va, path, ctx);
    }

    /* do fallback attribute handling */
    if (error == 0 && vp) {
        error = vnode_setattr_fallback(vp, &va, ctx);
    }

#if CONFIG_MACF
    if (error == 0 && vp) {
        error = vnode_label(vnode_mount(vp), dvp, vp, &nd.ni_cnd, VNODE_LABEL_CREATE, ctx);
    }
#endif

    if (error == 0) {
        int update_flags = 0;

        /*check if a new vnode was created, else try to get one*/
        if (vp == NULL) {
            /* VNOP_SYMLINK did not return the vnode; look it up */
            nd.ni_cnd.cn_nameiop = LOOKUP;
#if CONFIG_TRIGGERS
            nd.ni_op = OP_LOOKUP;
#endif
            /*
             * Clear all flags except HASBUF to prevent 'cn_pnbuf' buffer to be
             * reallocated again in namei().
             */
            nd.ni_cnd.cn_flags &= HASBUF;
            error = nameiat(&nd, fd);
            if (error) {
                goto skipit;
            }
            vp = nd.ni_vp;
        }

#if 0 /* XXX - kauth_todo - is KAUTH_FILEOP_SYMLINK needed? */
        /* call out to allow 3rd party notification of rename.
         * Ignore result of kauth_authorize_fileop call.
         */
        if (kauth_authorize_fileop_has_listeners() &&
            namei(&nd) == 0) {
            char *new_link_path = NULL;
            int len;

            /* build the path to the new link file */
            new_link_path = get_pathbuff();
            len = MAXPATHLEN;
            vn_getpath(dvp, new_link_path, &len);
            if ((len + 1 + nd.ni_cnd.cn_namelen + 1) < MAXPATHLEN) {
                new_link_path[len - 1] = '/';
                strlcpy(&new_link_path[len], nd.ni_cnd.cn_nameptr, MAXPATHLEN - len);
            }

            kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_SYMLINK,
                (uintptr_t)path, (uintptr_t)new_link_path);
            if (new_link_path != NULL) {
                release_pathbuff(new_link_path);
            }
        }
#endif
        // Make sure the name & parent pointers are hooked up
        if (vp->v_name == NULL) {
            update_flags |= VNODE_UPDATE_NAME;
        }
        if (vp->v_parent == NULLVP) {
            update_flags |= VNODE_UPDATE_PARENT;
        }

        if (update_flags) {
            vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
        }

#if CONFIG_FSE
        add_fsevent(FSE_CREATE_FILE, ctx,
            FSE_ARG_VNODE, vp,
            FSE_ARG_DONE);
#endif
    }

skipit:
    /*
     * nameidone has to happen before we vnode_put(dvp)
     * since it may need to release the fs_nodelock on the dvp
     */
    nameidone(&nd);

    if (vp) {
        vnode_put(vp);
    }
    vnode_put(dvp);
out:
    /* free the copied-in target string (userspace callers only) */
    if (path && (path != (char *)path_data)) {
        zfree(ZV_NAMEI, path);
    }

    return error;
}
5887
5888 int
symlink(__unused proc_t p,struct symlink_args * uap,__unused int32_t * retval)5889 symlink(__unused proc_t p, struct symlink_args *uap, __unused int32_t *retval)
5890 {
5891 return symlinkat_internal(vfs_context_current(), uap->path, AT_FDCWD,
5892 uap->link, UIO_USERSPACE);
5893 }
5894
5895 int
symlinkat(__unused proc_t p,struct symlinkat_args * uap,__unused int32_t * retval)5896 symlinkat(__unused proc_t p, struct symlinkat_args *uap,
5897 __unused int32_t *retval)
5898 {
5899 return symlinkat_internal(vfs_context_current(), uap->path1, uap->fd,
5900 uap->path2, UIO_USERSPACE);
5901 }
5902
5903 /*
5904 * Delete a whiteout from the filesystem.
5905 * No longer supported.
5906 */
int
undelete(__unused proc_t p, __unused struct undelete_args *uap, __unused int32_t *retval)
{
    /* Whiteout removal is no longer supported; always fail. */
    return ENOTSUP;
}
5912
5913 /*
5914 * Delete a name from the filesystem.
5915 */
/* ARGSUSED */
static int
unlinkat_internal(vfs_context_t ctx, int fd, vnode_t start_dvp,
    user_addr_t path_arg, enum uio_seg segflg, int unlink_flags)
{
    /*
     * Heap-allocated scratch state: the nameidata (plus the fsevent
     * attribute/info buffers) is too large to keep on the kernel stack.
     */
    struct {
        struct nameidata nd;
#if CONFIG_FSE
        struct vnode_attr va;
        fse_info finfo;
#endif
    } *__unlink_data;
    struct nameidata *ndp;
    vnode_t vp, dvp;
    int error;
    struct componentname *cnp;
    char *path = NULL;
    char *no_firmlink_path = NULL;
    int len_path = 0;
    int len_no_firmlink_path = 0;
    int flags;
    int need_event;
    int has_listeners;
    int truncated_path;
    int truncated_no_firmlink_path;
    int batched;        /* nonzero: filesystem supports compound remove */
    struct vnode_attr *vap;
    int do_retry;
    int retry_count = 0;
    int cn_flags;

    cn_flags = LOCKPARENT;
    if (!(unlink_flags & VNODE_REMOVE_NO_AUDIT_PATH)) {
        cn_flags |= AUDITVNPATH1;
    }
    /* If a starting dvp is passed, it trumps any fd passed. */
    if (start_dvp) {
        cn_flags |= USEDVP;
    }

#if NAMEDRSRCFORK
    /* unlink or delete is allowed on rsrc forks and named streams */
    cn_flags |= CN_ALLOWRSRCFORK;
#endif

    __unlink_data = kalloc_type(typeof(*__unlink_data), Z_WAITOK);
    ndp = &__unlink_data->nd;
#if CONFIG_FSE
    fse_info *finfop = &__unlink_data->finfo;
#endif

retry:
    /* reinitialize all per-attempt state for each retry pass */
    do_retry = 0;
    flags = 0;
    need_event = 0;
    has_listeners = 0;
    truncated_path = 0;
    truncated_no_firmlink_path = 0;
    vap = NULL;

    NDINIT(ndp, DELETE, OP_UNLINK, cn_flags, segflg, path_arg, ctx);

    ndp->ni_dvp = start_dvp;
    ndp->ni_flag |= NAMEI_COMPOUNDREMOVE;
    cnp = &ndp->ni_cnd;

continue_lookup:
    error = nameiat(ndp, fd);
    if (error) {
        goto early_out;
    }

    dvp = ndp->ni_dvp;
    vp = ndp->ni_vp;

    /* With Carbon delete semantics, busy files cannot be deleted */
    if (unlink_flags & VNODE_REMOVE_NODELETEBUSY) {
        flags |= VNODE_REMOVE_NODELETEBUSY;
    }

    /* Skip any potential upcalls if told to. */
    if (unlink_flags & VNODE_REMOVE_SKIP_NAMESPACE_EVENT) {
        flags |= VNODE_REMOVE_SKIP_NAMESPACE_EVENT;
    }

    if (vp) {
        batched = vnode_compound_remove_available(vp);
        /*
         * The root of a mounted filesystem cannot be deleted.
         */
        if ((vp->v_flag & VROOT) || (dvp->v_mount != vp->v_mount)) {
            error = EBUSY;
            goto out;
        }

#if DEVELOPMENT || DEBUG
        /*
         * XXX VSWAP: Check for entitlements or special flag here
         * so we can restrict access appropriately.
         */
#else /* DEVELOPMENT || DEBUG */

        /* swap-backing files may only be removed by the kernel itself */
        if (vnode_isswap(vp) && (ctx != vfs_context_kernel())) {
            error = EPERM;
            goto out;
        }
#endif /* DEVELOPMENT || DEBUG */

        if (!batched) {
            error = vn_authorize_unlink(dvp, vp, cnp, ctx, NULL);
            if (error) {
                if (error == ENOENT) {
                    /* likely a stale name-cache race; redrive the lookup */
                    if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
                        do_retry = 1;
                        retry_count++;
                    }
                }
                goto out;
            }
        }
    } else {
        /* no vp: the filesystem will do lookup+remove in one VNOP */
        batched = 1;

        if (!vnode_compound_remove_available(dvp)) {
            panic("No vp, but no compound remove?");
        }
    }

#if CONFIG_FSE
    need_event = need_fsevent(FSE_DELETE, dvp);
    if (need_event) {
        if (!batched) {
            if ((vp->v_flag & VISHARDLINK) == 0) {
                /* XXX need to get these data in batched VNOP */
                get_fse_info(vp, finfop, ctx);
            }
        } else {
            error =
                vfs_get_notify_attributes(&__unlink_data->va);
            if (error) {
                goto out;
            }

            vap = &__unlink_data->va;
        }
    }
#endif
    has_listeners = kauth_authorize_fileop_has_listeners();
    if (need_event || has_listeners) {
        if (path == NULL) {
            GET_PATH(path);
        }
        len_path = safe_getpath(dvp, ndp->ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path);
        if (no_firmlink_path == NULL) {
            GET_PATH(no_firmlink_path);
        }
        len_no_firmlink_path = safe_getpath_no_firmlink(dvp, ndp->ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);
    }

#if NAMEDRSRCFORK
    if (ndp->ni_cnd.cn_flags & CN_WANTSRSRCFORK) {
        error = vnode_removenamedstream(dvp, vp, XATTR_RESOURCEFORK_NAME, 0, ctx);
    } else
#endif
    {
#if CONFIG_FILE_LEASES
        vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

        error = vn_remove(dvp, &ndp->ni_vp, ndp, flags, vap, ctx);
        vp = ndp->ni_vp;
        if (error == EKEEPLOOKING) {
            /* the filesystem wants the compound lookup continued */
            if (!batched) {
                panic("EKEEPLOOKING, but not a filesystem that supports compound VNOPs?");
            }

            if ((ndp->ni_flag & NAMEI_CONTLOOKUP) == 0) {
                panic("EKEEPLOOKING, but continue flag not set?");
            }

            if (vnode_isdir(vp)) {
                error = EISDIR;
                goto out;
            }
            goto continue_lookup;
        } else if (error == ENOENT && batched) {
            if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
                /*
                 * For compound VNOPs, the authorization callback may
                 * return ENOENT in case of racing hardlink lookups
                 * hitting the name cache, redrive the lookup.
                 */
                do_retry = 1;
                retry_count += 1;
                goto out;
            }
        }
    }

    /*
     * Call out to allow 3rd party notification of delete.
     * Ignore result of kauth_authorize_fileop call.
     */
    if (!error) {
        if (has_listeners) {
            kauth_authorize_fileop(vfs_context_ucred(ctx),
                KAUTH_FILEOP_DELETE,
                (uintptr_t)vp,
                (uintptr_t)path);
        }

        if (vp->v_flag & VISHARDLINK) {
            //
            // if a hardlink gets deleted we want to blow away the
            // v_parent link because the path that got us to this
            // instance of the link is no longer valid. this will
            // force the next call to get the path to ask the file
            // system instead of just following the v_parent link.
            //
            vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
        }

#if CONFIG_FSE
        if (need_event) {
            if (vp->v_flag & VISHARDLINK) {
                get_fse_info(vp, finfop, ctx);
            } else if (vap) {
                vnode_get_fse_info_from_vap(vp, finfop, vap);
            }
            if (truncated_path) {
                finfop->mode |= FSE_TRUNCATED_PATH;
            }
            add_fsevent(FSE_DELETE, ctx,
                FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
                FSE_ARG_FINFO, finfop,
                FSE_ARG_DONE);
        }
#endif

#if CONFIG_MACF
        mac_vnode_notify_unlink(ctx, dvp, vp, cnp);
#endif
    }

out:
    if (path != NULL) {
        RELEASE_PATH(path);
        path = NULL;
    }

    if (no_firmlink_path != NULL) {
        RELEASE_PATH(no_firmlink_path);
        no_firmlink_path = NULL;
    }
#if NAMEDRSRCFORK
    /* recycle the deleted rsrc fork vnode to force a reclaim, which
     * will cause its shadow file to go away if necessary.
     */
    if (vp && (vnode_isnamedstream(vp)) &&
        (vp->v_parent != NULLVP) &&
        vnode_isshadow(vp)) {
        vnode_recycle(vp);
    }
#endif
    /*
     * nameidone has to happen before we vnode_put(dvp)
     * since it may need to release the fs_nodelock on the dvp
     */
    nameidone(ndp);
    vnode_put(dvp);
    if (vp) {
        vnode_put(vp);
    }

    if (do_retry) {
        goto retry;
    }

early_out:
    kfree_type(typeof(*__unlink_data), __unlink_data);
    return error;
}
6198
6199 int
unlink1(vfs_context_t ctx,vnode_t start_dvp,user_addr_t path_arg,enum uio_seg segflg,int unlink_flags)6200 unlink1(vfs_context_t ctx, vnode_t start_dvp, user_addr_t path_arg,
6201 enum uio_seg segflg, int unlink_flags)
6202 {
6203 return unlinkat_internal(ctx, AT_FDCWD, start_dvp, path_arg, segflg,
6204 unlink_flags);
6205 }
6206
6207 /*
6208 * Delete a name from the filesystem using Carbon semantics.
6209 */
6210 int
delete(__unused proc_t p,struct delete_args * uap,__unused int32_t * retval)6211 delete(__unused proc_t p, struct delete_args *uap, __unused int32_t *retval)
6212 {
6213 return unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
6214 uap->path, UIO_USERSPACE, VNODE_REMOVE_NODELETEBUSY);
6215 }
6216
6217 /*
6218 * Delete a name from the filesystem using POSIX semantics.
6219 */
6220 int
unlink(__unused proc_t p,struct unlink_args * uap,__unused int32_t * retval)6221 unlink(__unused proc_t p, struct unlink_args *uap, __unused int32_t *retval)
6222 {
6223 return unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
6224 uap->path, UIO_USERSPACE, 0);
6225 }
6226
6227 int
unlinkat(__unused proc_t p,struct unlinkat_args * uap,__unused int32_t * retval)6228 unlinkat(__unused proc_t p, struct unlinkat_args *uap, __unused int32_t *retval)
6229 {
6230 if (uap->flag & ~(AT_REMOVEDIR | AT_REMOVEDIR_DATALESS)) {
6231 return EINVAL;
6232 }
6233
6234 if (uap->flag & (AT_REMOVEDIR | AT_REMOVEDIR_DATALESS)) {
6235 int unlink_flags = 0;
6236
6237 if (uap->flag & AT_REMOVEDIR_DATALESS) {
6238 unlink_flags |= VNODE_REMOVE_DATALESS_DIR;
6239 }
6240 return rmdirat_internal(vfs_context_current(), uap->fd,
6241 uap->path, UIO_USERSPACE, unlink_flags);
6242 } else {
6243 return unlinkat_internal(vfs_context_current(), uap->fd,
6244 NULLVP, uap->path, UIO_USERSPACE, 0);
6245 }
6246 }
6247
6248 /*
6249 * Reposition read/write file offset.
6250 */
int
lseek(proc_t p, struct lseek_args *uap, off_t *retval)
{
    struct fileproc *fp;
    vnode_t vp;
    struct vfs_context *ctx;
    off_t offset = uap->offset, file_size;
    int error;

    if ((error = fp_getfvp(p, uap->fd, &fp, &vp))) {
        /* fd is valid but not vnode-backed (e.g. a socket/pipe) */
        if (error == ENOTSUP) {
            return ESPIPE;
        }
        return error;
    }
    if (vnode_isfifo(vp)) {
        file_drop(uap->fd);
        return ESPIPE;
    }


    ctx = vfs_context_current();
#if CONFIG_MACF
    /*
     * lseek(fd, 0, SEEK_CUR) only reads the current offset; any other
     * combination is treated as a change for the MAC policy check.
     */
    if (uap->whence == L_INCR && uap->offset == 0) {
        error = mac_file_check_get_offset(vfs_context_ucred(ctx),
            fp->fp_glob);
    } else {
        error = mac_file_check_change_offset(vfs_context_ucred(ctx),
            fp->fp_glob);
    }
    if (error) {
        file_drop(uap->fd);
        return error;
    }
#endif
    if ((error = vnode_getwithref(vp))) {
        file_drop(uap->fd);
        return error;
    }

    switch (uap->whence) {
    case L_INCR:    /* SEEK_CUR: relative to the current file offset */
        offset += fp->fp_glob->fg_offset;
        break;
    case L_XTND:    /* SEEK_END: relative to the end of file */
        if ((error = vnode_size(vp, &file_size, ctx)) != 0) {
            break;
        }
        offset += file_size;
        break;
    case L_SET:     /* SEEK_SET: absolute offset */
        break;
    case SEEK_HOLE:
        /* hole/data probing is delegated to the filesystem via ioctl */
        error = VNOP_IOCTL(vp, FSIOC_FIOSEEKHOLE, (caddr_t)&offset, 0, ctx);
        break;
    case SEEK_DATA:
        error = VNOP_IOCTL(vp, FSIOC_FIOSEEKDATA, (caddr_t)&offset, 0, ctx);
        break;
    default:
        error = EINVAL;
    }
    if (error == 0) {
        if (uap->offset > 0 && offset < 0) {
            /* Incremented/relative move past max size */
            error = EOVERFLOW;
        } else {
            /*
             * Allow negative offsets on character devices, per
             * POSIX 1003.1-2001. Most likely for writing disk
             * labels.
             */
            if (offset < 0 && vp->v_type != VCHR) {
                /* Decremented/relative move before start */
                error = EINVAL;
            } else {
                /* Success */
                fp->fp_glob->fg_offset = offset;
                *retval = fp->fp_glob->fg_offset;
            }
        }
    }

    /*
     * An lseek can affect whether data is "available to read." Use
     * hint of NOTE_NONE so no EVFILT_VNODE events fire
     */
    post_event_if_success(vp, error, NOTE_NONE);
    (void)vnode_put(vp);
    file_drop(uap->fd);
    return error;
}
6342
6343
6344 /*
6345 * Check access permissions.
6346 *
6347 * Returns: 0 Success
6348 * vnode_authorize:???
6349 */
6350 static int
access1(vnode_t vp,vnode_t dvp,int uflags,vfs_context_t ctx)6351 access1(vnode_t vp, vnode_t dvp, int uflags, vfs_context_t ctx)
6352 {
6353 kauth_action_t action;
6354 int error;
6355
6356 /*
6357 * If just the regular access bits, convert them to something
6358 * that vnode_authorize will understand.
6359 */
6360 if (!(uflags & _ACCESS_EXTENDED_MASK)) {
6361 action = 0;
6362 if (uflags & R_OK) {
6363 action |= KAUTH_VNODE_READ_DATA; /* aka KAUTH_VNODE_LIST_DIRECTORY */
6364 }
6365 if (uflags & W_OK) {
6366 if (vnode_isdir(vp)) {
6367 action |= KAUTH_VNODE_ADD_FILE |
6368 KAUTH_VNODE_ADD_SUBDIRECTORY;
6369 /* might want delete rights here too */
6370 } else {
6371 action |= KAUTH_VNODE_WRITE_DATA;
6372 }
6373 }
6374 if (uflags & X_OK) {
6375 if (vnode_isdir(vp)) {
6376 action |= KAUTH_VNODE_SEARCH;
6377 } else {
6378 action |= KAUTH_VNODE_EXECUTE;
6379 }
6380 }
6381 } else {
6382 /* take advantage of definition of uflags */
6383 action = uflags >> 8;
6384 }
6385
6386 #if CONFIG_MACF
6387 error = mac_vnode_check_access(ctx, vp, uflags);
6388 if (error) {
6389 return error;
6390 }
6391 #endif /* MAC */
6392
6393 /* action == 0 means only check for existence */
6394 if (action != 0) {
6395 error = vnode_authorize(vp, dvp, action | KAUTH_VNODE_ACCESS, ctx);
6396 } else {
6397 error = 0;
6398 }
6399
6400 return error;
6401 }
6402
6403
6404
6405 /*
6406 * access_extended: Check access permissions in bulk.
6407 *
6408 * Description: uap->entries Pointer to an array of accessx
6409 * descriptor structs, plus one or
6410 * more NULL terminated strings (see
6411 * "Notes" section below).
6412 * uap->size Size of the area pointed to by
6413 * uap->entries.
6414 * uap->results Pointer to the results array.
6415 *
6416 * Returns: 0 Success
6417 * ENOMEM Insufficient memory
6418 * EINVAL Invalid arguments
6419 * namei:EFAULT Bad address
6420 * namei:ENAMETOOLONG Filename too long
6421 * namei:ENOENT No such file or directory
6422 * namei:ELOOP Too many levels of symbolic links
6423 * namei:EBADF Bad file descriptor
6424 * namei:ENOTDIR Not a directory
6425 * namei:???
6426 * access1:
6427 *
6428 * Implicit returns:
6429 * uap->results Array contents modified
6430 *
6431 * Notes: The uap->entries are structured as an arbitrary length array
6432 * of accessx descriptors, followed by one or more NULL terminated
6433 * strings
6434 *
6435 * struct accessx_descriptor[0]
6436 * ...
6437 * struct accessx_descriptor[n]
6438 * char name_data[0];
6439 *
6440 * We determine the entry count by walking the buffer containing
6441 * the uap->entries argument descriptor. For each descriptor we
6442 * see, the valid values for the offset ad_name_offset will be
6443 * in the byte range:
6444 *
6445 * [ uap->entries + sizeof(struct accessx_descriptor) ]
6446 * to
6447 * [ uap->entries + uap->size - 2 ]
6448 *
6449 * since we must have at least one string, and the string must
6450 * be at least one character plus the NULL terminator in length.
6451 *
6452 * XXX: Need to support the check-as uid argument
6453 */
int
access_extended(__unused proc_t p, struct access_extended_args *uap, __unused int32_t *retval)
{
    struct accessx_descriptor *input = NULL;
    errno_t *result = NULL;
    errno_t error = 0;
    int wantdelete = 0;
    size_t desc_max, desc_actual = 0;
    unsigned int i, j;
    struct vfs_context context;
    struct nameidata nd;
    int niopts;
    vnode_t vp = NULL;
    vnode_t dvp = NULL;
#define ACCESSX_MAX_DESCR_ON_STACK 10
    /* small requests are served from the stack to avoid an allocation */
    struct accessx_descriptor stack_input[ACCESSX_MAX_DESCR_ON_STACK];

    context.vc_ucred = NULL;

    /*
     * Validate parameters; if valid, copy the descriptor array and string
     * arguments into local memory.  Before proceeding, the following
     * conditions must have been met:
     *
     * o The total size is not permitted to exceed ACCESSX_MAX_TABLESIZE
     * o There must be sufficient room in the request for at least one
     *   descriptor and a one byte NUL terminated string.
     * o The allocation of local storage must not fail.
     */
    if (uap->size > ACCESSX_MAX_TABLESIZE) {
        return ENOMEM;
    }
    if (uap->size < (sizeof(struct accessx_descriptor) + 2)) {
        return EINVAL;
    }
    if (uap->size <= sizeof(stack_input)) {
        input = stack_input;
    } else {
        input = kalloc_data(uap->size, Z_WAITOK);
        if (input == NULL) {
            error = ENOMEM;
            goto out;
        }
    }
    error = copyin(uap->entries, input, uap->size);
    if (error) {
        goto out;
    }

    AUDIT_ARG(opaque, input, uap->size);

    /*
     * Force NUL termination of the copyin buffer to avoid namei() running
     * off the end.  If the caller passes us bogus data, they may get a
     * bogus result.
     */
    ((char *)input)[uap->size - 1] = 0;

    /*
     * Access is defined as checking against the process' real identity,
     * even if operations are checking the effective identity.  This
     * requires that we use a local vfs context.
     */
    context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
    context.vc_thread = current_thread();

    /*
     * Find out how many entries we have, so we can allocate the result
     * array by walking the list and adjusting the count downward by the
     * earliest string offset we see.
     */
    desc_max = (uap->size - 2) / sizeof(struct accessx_descriptor);
    desc_actual = desc_max;
    for (i = 0; i < desc_actual; i++) {
        /*
         * Take the offset to the name string for this entry and
         * convert to an input array index, which would be one off
         * the end of the array if this entry was the lowest-addressed
         * name string.
         */
        j = input[i].ad_name_offset / sizeof(struct accessx_descriptor);

        /*
         * An offset greater than the max allowable offset is an error.
         * It is also an error for any valid entry to point
         * to a location prior to the end of the current entry, if
         * it's not a reference to the string of the previous entry.
         */
        if (j > desc_max || (j != 0 && j <= i)) {
            error = EINVAL;
            goto out;
        }

        /* Also do not let ad_name_offset point to something beyond the size of the input */
        if (input[i].ad_name_offset >= uap->size) {
            error = EINVAL;
            goto out;
        }

        /*
         * An offset of 0 means use the previous descriptor's offset;
         * this is used to chain multiple requests for the same file
         * to avoid multiple lookups.
         */
        if (j == 0) {
            /* This is not valid for the first entry */
            if (i == 0) {
                error = EINVAL;
                goto out;
            }
            continue;
        }

        /*
         * If the offset of the string for this descriptor is before
         * what we believe is the current actual last descriptor,
         * then we need to adjust our estimate downward; this permits
         * the string table following the last descriptor to be out
         * of order relative to the descriptor list.
         */
        if (j < desc_actual) {
            desc_actual = j;
        }
    }

    /*
     * We limit the actual number of descriptors we are willing to process
     * to a hard maximum of ACCESSX_MAX_DESCRIPTORS.  If the number being
     * requested exceeds that limit, fail the request outright.
     */
    if (desc_actual > ACCESSX_MAX_DESCRIPTORS) {
        error = ENOMEM;
        goto out;
    }
    result = kalloc_data(desc_actual * sizeof(errno_t), Z_WAITOK | Z_ZERO);
    if (result == NULL) {
        error = ENOMEM;
        goto out;
    }

    /*
     * Do the work by iterating over the descriptor entries we know to
     * at least appear to contain valid data.
     */
    error = 0;
    for (i = 0; i < desc_actual; i++) {
        /*
         * If the ad_name_offset is 0, then we use the previous
         * results to make the check; otherwise, we are looking up
         * a new file name.
         */
        if (input[i].ad_name_offset != 0) {
            /* discard old vnodes */
            if (vp) {
                vnode_put(vp);
                vp = NULL;
            }
            if (dvp) {
                vnode_put(dvp);
                dvp = NULL;
            }

            /*
             * Scan forward in the descriptor list to see if we
             * need the parent vnode.  We will need it if we are
             * deleting, since we must have rights to remove
             * entries in the parent directory, as well as the
             * rights to delete the object itself.
             */
            wantdelete = input[i].ad_flags & _DELETE_OK;
            for (j = i + 1; (j < desc_actual) && (input[j].ad_name_offset == 0); j++) {
                if (input[j].ad_flags & _DELETE_OK) {
                    wantdelete = 1;
                }
            }

            niopts = FOLLOW | AUDITVNPATH1;

            /* need parent for vnode_authorize for deletion test */
            if (wantdelete) {
                niopts |= WANTPARENT;
            }

            /* do the lookup */
            NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, UIO_SYSSPACE,
                CAST_USER_ADDR_T(((const char *)input) + input[i].ad_name_offset),
                &context);
            error = namei(&nd);
            if (!error) {
                vp = nd.ni_vp;
                if (wantdelete) {
                    dvp = nd.ni_dvp;
                }
            }
            nameidone(&nd);
        }

        /*
         * Handle lookup errors.
         */
        switch (error) {
        case ENOENT:
        case EACCES:
        case EPERM:
        case ENOTDIR:
            /* per-entry failure: record it and keep going */
            result[i] = error;
            break;
        case 0:
            /* run this access check */
            result[i] = access1(vp, dvp, input[i].ad_flags, &context);
            break;
        default:
            /* fatal lookup error */

            goto out;
        }
    }

    AUDIT_ARG(data, result, sizeof(errno_t), desc_actual);

    /* copy out results */
    error = copyout(result, uap->results, desc_actual * sizeof(errno_t));

out:
    if (input && input != stack_input) {
        kfree_data(input, uap->size);
    }
    if (result) {
        kfree_data(result, desc_actual * sizeof(errno_t));
    }
    if (vp) {
        vnode_put(vp);
    }
    if (dvp) {
        vnode_put(dvp);
    }
    if (IS_VALID_CRED(context.vc_ucred)) {
        kauth_cred_unref(&context.vc_ucred);
    }
    return error;
}
6695
6696
6697 /*
6698 * Returns: 0 Success
6699 * namei:EFAULT Bad address
6700 * namei:ENAMETOOLONG Filename too long
6701 * namei:ENOENT No such file or directory
6702 * namei:ELOOP Too many levels of symbolic links
6703 * namei:EBADF Bad file descriptor
6704 * namei:ENOTDIR Not a directory
6705 * namei:???
6706 * access1:
6707 */
static int
faccessat_internal(vfs_context_t ctx, int fd, user_addr_t path, int amode,
    int flag, enum uio_seg segflg)
{
	int error;
	struct nameidata nd;
	int niopts;
	struct vfs_context context;
#if NAMEDRSRCFORK
	/* set when we took an extra vnode_ref on a shadow stream vnode */
	int is_namedstream = 0;
#endif

	/*
	 * Unless the AT_EACCESS option is used, Access is defined as checking
	 * against the process' real identity, even if operations are checking
	 * the effective identity. So we need to tweak the credential
	 * in the context for that case.
	 */
	if (!(flag & AT_EACCESS)) {
		/* takes a reference that must be dropped at "out:" below */
		context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
	} else {
		/* borrow the caller's credential; no extra reference taken */
		context.vc_ucred = ctx->vc_ucred;
	}
	context.vc_thread = ctx->vc_thread;


	/* either NOFOLLOW flavor suppresses following a trailing symlink */
	niopts = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY) ? NOFOLLOW : FOLLOW) | AUDITVNPATH1;
	/* need parent for vnode_authorize for deletion test */
	if (amode & _DELETE_OK) {
		niopts |= WANTPARENT;
	}
	NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, segflg,
	    path, &context);
	if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
		/* refuse to traverse symlinks anywhere in the path */
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}

#if NAMEDRSRCFORK
	/* access(F_OK) calls are allowed for resource forks. */
	if (amode == F_OK) {
		nd.ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
	}
#endif
	error = nameiat(&nd, fd);
	if (error) {
		goto out;
	}

#if NAMEDRSRCFORK
	/* Grab reference on the shadow stream file vnode to
	 * force an inactive on release which will mark it
	 * for recycle.
	 */
	if (vnode_isnamedstream(nd.ni_vp) &&
	    (nd.ni_vp->v_parent != NULLVP) &&
	    vnode_isshadow(nd.ni_vp)) {
		is_namedstream = 1;
		vnode_ref(nd.ni_vp);
	}
#endif

	/* perform the actual access check with the tweaked credential */
	error = access1(nd.ni_vp, nd.ni_dvp, amode, &context);

#if NAMEDRSRCFORK
	if (is_namedstream) {
		vnode_rele(nd.ni_vp);
	}
#endif

	vnode_put(nd.ni_vp);
	/* ni_dvp only holds an iocount when WANTPARENT was requested above */
	if (amode & _DELETE_OK) {
		vnode_put(nd.ni_dvp);
	}
	nameidone(&nd);

out:
	/* drop the real-identity credential copied above, if we made one */
	if (!(flag & AT_EACCESS)) {
		kauth_cred_unref(&context.vc_ucred);
	}
	return error;
}
6789
6790 int
access(__unused proc_t p,struct access_args * uap,__unused int32_t * retval)6791 access(__unused proc_t p, struct access_args *uap, __unused int32_t *retval)
6792 {
6793 return faccessat_internal(vfs_context_current(), AT_FDCWD,
6794 uap->path, uap->flags, 0, UIO_USERSPACE);
6795 }
6796
6797 int
faccessat(__unused proc_t p,struct faccessat_args * uap,__unused int32_t * retval)6798 faccessat(__unused proc_t p, struct faccessat_args *uap,
6799 __unused int32_t *retval)
6800 {
6801 if (uap->flag & ~(AT_EACCESS | AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) {
6802 return EINVAL;
6803 }
6804
6805 return faccessat_internal(vfs_context_current(), uap->fd,
6806 uap->path, uap->amode, uap->flag, UIO_USERSPACE);
6807 }
6808
6809 /*
6810 * Returns: 0 Success
6811 * EFAULT
6812 * copyout:EFAULT
6813 * namei:???
6814 * vn_stat:???
6815 */
6816 static int
fstatat_internal(vfs_context_t ctx,user_addr_t path,user_addr_t ub,user_addr_t xsecurity,user_addr_t xsecurity_size,int isstat64,enum uio_seg segflg,int fd,int flag)6817 fstatat_internal(vfs_context_t ctx, user_addr_t path, user_addr_t ub,
6818 user_addr_t xsecurity, user_addr_t xsecurity_size, int isstat64,
6819 enum uio_seg segflg, int fd, int flag)
6820 {
6821 struct nameidata nd;
6822 int follow;
6823 union {
6824 struct stat sb;
6825 struct stat64 sb64;
6826 } source = {};
6827 union {
6828 struct user64_stat user64_sb;
6829 struct user32_stat user32_sb;
6830 struct user64_stat64 user64_sb64;
6831 struct user32_stat64 user32_sb64;
6832 } dest = {};
6833 caddr_t sbp;
6834 int error, my_size;
6835 kauth_filesec_t fsec;
6836 size_t xsecurity_bufsize;
6837 void * statptr;
6838 struct fileproc *fp = NULL;
6839 int needsrealdev = 0;
6840
6841 follow = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) ? NOFOLLOW : FOLLOW;
6842 NDINIT(&nd, LOOKUP, OP_GETATTR, follow | AUDITVNPATH1,
6843 segflg, path, ctx);
6844 if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
6845 nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
6846 }
6847
6848 #if NAMEDRSRCFORK
6849 int is_namedstream = 0;
6850 /* stat calls are allowed for resource forks. */
6851 nd.ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
6852 #endif
6853
6854 if (flag & AT_FDONLY) {
6855 vnode_t fvp;
6856
6857 error = fp_getfvp(vfs_context_proc(ctx), fd, &fp, &fvp);
6858 if (error) {
6859 return error;
6860 }
6861 if ((error = vnode_getwithref(fvp))) {
6862 file_drop(fd);
6863 return error;
6864 }
6865 nd.ni_vp = fvp;
6866 } else {
6867 error = nameiat(&nd, fd);
6868 if (error) {
6869 return error;
6870 }
6871 }
6872 fsec = KAUTH_FILESEC_NONE;
6873
6874 statptr = (void *)&source;
6875
6876 #if NAMEDRSRCFORK
6877 /* Grab reference on the shadow stream file vnode to
6878 * force an inactive on release which will mark it
6879 * for recycle.
6880 */
6881 if (vnode_isnamedstream(nd.ni_vp) &&
6882 (nd.ni_vp->v_parent != NULLVP) &&
6883 vnode_isshadow(nd.ni_vp)) {
6884 is_namedstream = 1;
6885 vnode_ref(nd.ni_vp);
6886 }
6887 #endif
6888
6889 needsrealdev = flag & AT_REALDEV ? 1 : 0;
6890 if (fp && (xsecurity == USER_ADDR_NULL)) {
6891 /*
6892 * If the caller has the file open, and is not
6893 * requesting extended security information, we are
6894 * going to let them get the basic stat information.
6895 */
6896 error = vn_stat_noauth(nd.ni_vp, statptr, NULL, isstat64, needsrealdev, ctx,
6897 fp->fp_glob->fg_cred);
6898 } else {
6899 error = vn_stat(nd.ni_vp, statptr, (xsecurity != USER_ADDR_NULL ? &fsec : NULL),
6900 isstat64, needsrealdev, ctx);
6901 }
6902
6903 #if NAMEDRSRCFORK
6904 if (is_namedstream) {
6905 vnode_rele(nd.ni_vp);
6906 }
6907 #endif
6908 vnode_put(nd.ni_vp);
6909 nameidone(&nd);
6910 if (fp) {
6911 file_drop(fd);
6912 fp = NULL;
6913 }
6914
6915 if (error) {
6916 return error;
6917 }
6918 /* Zap spare fields */
6919 if (isstat64 != 0) {
6920 source.sb64.st_lspare = 0;
6921 source.sb64.st_qspare[0] = 0LL;
6922 source.sb64.st_qspare[1] = 0LL;
6923 if (vfs_context_is64bit(ctx)) {
6924 munge_user64_stat64(&source.sb64, &dest.user64_sb64);
6925 my_size = sizeof(dest.user64_sb64);
6926 sbp = (caddr_t)&dest.user64_sb64;
6927 } else {
6928 munge_user32_stat64(&source.sb64, &dest.user32_sb64);
6929 my_size = sizeof(dest.user32_sb64);
6930 sbp = (caddr_t)&dest.user32_sb64;
6931 }
6932 /*
6933 * Check if we raced (post lookup) against the last unlink of a file.
6934 */
6935 if ((source.sb64.st_nlink == 0) && S_ISREG(source.sb64.st_mode)) {
6936 source.sb64.st_nlink = 1;
6937 }
6938 } else {
6939 source.sb.st_lspare = 0;
6940 source.sb.st_qspare[0] = 0LL;
6941 source.sb.st_qspare[1] = 0LL;
6942 if (vfs_context_is64bit(ctx)) {
6943 munge_user64_stat(&source.sb, &dest.user64_sb);
6944 my_size = sizeof(dest.user64_sb);
6945 sbp = (caddr_t)&dest.user64_sb;
6946 } else {
6947 munge_user32_stat(&source.sb, &dest.user32_sb);
6948 my_size = sizeof(dest.user32_sb);
6949 sbp = (caddr_t)&dest.user32_sb;
6950 }
6951
6952 /*
6953 * Check if we raced (post lookup) against the last unlink of a file.
6954 */
6955 if ((source.sb.st_nlink == 0) && S_ISREG(source.sb.st_mode)) {
6956 source.sb.st_nlink = 1;
6957 }
6958 }
6959 if ((error = copyout(sbp, ub, my_size)) != 0) {
6960 goto out;
6961 }
6962
6963 /* caller wants extended security information? */
6964 if (xsecurity != USER_ADDR_NULL) {
6965 /* did we get any? */
6966 if (fsec == KAUTH_FILESEC_NONE) {
6967 if (susize(xsecurity_size, 0) != 0) {
6968 error = EFAULT;
6969 goto out;
6970 }
6971 } else {
6972 /* find the user buffer size */
6973 xsecurity_bufsize = fusize(xsecurity_size);
6974
6975 /* copy out the actual data size */
6976 if (susize(xsecurity_size, KAUTH_FILESEC_COPYSIZE(fsec)) != 0) {
6977 error = EFAULT;
6978 goto out;
6979 }
6980
6981 /* if the caller supplied enough room, copy out to it */
6982 if (xsecurity_bufsize >= KAUTH_FILESEC_COPYSIZE(fsec)) {
6983 error = copyout(fsec, xsecurity, KAUTH_FILESEC_COPYSIZE(fsec));
6984 }
6985 }
6986 }
6987 out:
6988 if (fsec != KAUTH_FILESEC_NONE) {
6989 kauth_filesec_free(fsec);
6990 }
6991 return error;
6992 }
6993
6994 /*
6995 * stat_extended: Get file status; with extended security (ACL).
6996 *
6997 * Parameters: p (ignored)
6998 * uap User argument descriptor (see below)
6999 * retval (ignored)
7000 *
7001 * Indirect: uap->path Path of file to get status from
7002 * uap->ub User buffer (holds file status info)
7003 * uap->xsecurity ACL to get (extended security)
7004 * uap->xsecurity_size Size of ACL
7005 *
7006 * Returns: 0 Success
7007 * !0 errno value
7008 *
7009 */
7010 int
stat_extended(__unused proc_t p,struct stat_extended_args * uap,__unused int32_t * retval)7011 stat_extended(__unused proc_t p, struct stat_extended_args *uap,
7012 __unused int32_t *retval)
7013 {
7014 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7015 uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
7016 0);
7017 }
7018
7019 /*
7020 * Returns: 0 Success
7021 * fstatat_internal:??? [see fstatat_internal() in this file]
7022 */
7023 int
stat(__unused proc_t p,struct stat_args * uap,__unused int32_t * retval)7024 stat(__unused proc_t p, struct stat_args *uap, __unused int32_t *retval)
7025 {
7026 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7027 0, 0, 0, UIO_USERSPACE, AT_FDCWD, 0);
7028 }
7029
7030 int
stat64(__unused proc_t p,struct stat64_args * uap,__unused int32_t * retval)7031 stat64(__unused proc_t p, struct stat64_args *uap, __unused int32_t *retval)
7032 {
7033 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7034 0, 0, 1, UIO_USERSPACE, AT_FDCWD, 0);
7035 }
7036
7037 /*
7038 * stat64_extended: Get file status; can handle large inode numbers; with extended security (ACL).
7039 *
7040 * Parameters: p (ignored)
7041 * uap User argument descriptor (see below)
7042 * retval (ignored)
7043 *
7044 * Indirect: uap->path Path of file to get status from
7045 * uap->ub User buffer (holds file status info)
7046 * uap->xsecurity ACL to get (extended security)
7047 * uap->xsecurity_size Size of ACL
7048 *
7049 * Returns: 0 Success
7050 * !0 errno value
7051 *
7052 */
7053 int
stat64_extended(__unused proc_t p,struct stat64_extended_args * uap,__unused int32_t * retval)7054 stat64_extended(__unused proc_t p, struct stat64_extended_args *uap, __unused int32_t *retval)
7055 {
7056 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7057 uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
7058 0);
7059 }
7060
7061 /*
7062 * lstat_extended: Get file status; does not follow links; with extended security (ACL).
7063 *
7064 * Parameters: p (ignored)
7065 * uap User argument descriptor (see below)
7066 * retval (ignored)
7067 *
7068 * Indirect: uap->path Path of file to get status from
7069 * uap->ub User buffer (holds file status info)
7070 * uap->xsecurity ACL to get (extended security)
7071 * uap->xsecurity_size Size of ACL
7072 *
7073 * Returns: 0 Success
7074 * !0 errno value
7075 *
7076 */
7077 int
lstat_extended(__unused proc_t p,struct lstat_extended_args * uap,__unused int32_t * retval)7078 lstat_extended(__unused proc_t p, struct lstat_extended_args *uap, __unused int32_t *retval)
7079 {
7080 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7081 uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
7082 AT_SYMLINK_NOFOLLOW);
7083 }
7084
7085 /*
7086 * Get file status; this version does not follow links.
7087 */
7088 int
lstat(__unused proc_t p,struct lstat_args * uap,__unused int32_t * retval)7089 lstat(__unused proc_t p, struct lstat_args *uap, __unused int32_t *retval)
7090 {
7091 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7092 0, 0, 0, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW);
7093 }
7094
7095 int
lstat64(__unused proc_t p,struct lstat64_args * uap,__unused int32_t * retval)7096 lstat64(__unused proc_t p, struct lstat64_args *uap, __unused int32_t *retval)
7097 {
7098 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7099 0, 0, 1, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW);
7100 }
7101
7102 /*
7103 * lstat64_extended: Get file status; can handle large inode numbers; does not
7104 * follow links; with extended security (ACL).
7105 *
7106 * Parameters: p (ignored)
7107 * uap User argument descriptor (see below)
7108 * retval (ignored)
7109 *
7110 * Indirect: uap->path Path of file to get status from
7111 * uap->ub User buffer (holds file status info)
7112 * uap->xsecurity ACL to get (extended security)
7113 * uap->xsecurity_size Size of ACL
7114 *
7115 * Returns: 0 Success
7116 * !0 errno value
7117 *
7118 */
7119 int
lstat64_extended(__unused proc_t p,struct lstat64_extended_args * uap,__unused int32_t * retval)7120 lstat64_extended(__unused proc_t p, struct lstat64_extended_args *uap, __unused int32_t *retval)
7121 {
7122 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7123 uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
7124 AT_SYMLINK_NOFOLLOW);
7125 }
7126
7127 int
fstatat(__unused proc_t p,struct fstatat_args * uap,__unused int32_t * retval)7128 fstatat(__unused proc_t p, struct fstatat_args *uap, __unused int32_t *retval)
7129 {
7130 if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_REALDEV | AT_FDONLY | AT_SYMLINK_NOFOLLOW_ANY)) {
7131 return EINVAL;
7132 }
7133
7134 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7135 0, 0, 0, UIO_USERSPACE, uap->fd, uap->flag);
7136 }
7137
7138 int
fstatat64(__unused proc_t p,struct fstatat64_args * uap,__unused int32_t * retval)7139 fstatat64(__unused proc_t p, struct fstatat64_args *uap,
7140 __unused int32_t *retval)
7141 {
7142 if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_REALDEV | AT_FDONLY | AT_SYMLINK_NOFOLLOW_ANY)) {
7143 return EINVAL;
7144 }
7145
7146 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7147 0, 0, 1, UIO_USERSPACE, uap->fd, uap->flag);
7148 }
7149
7150 /*
7151 * Get configurable pathname variables.
7152 *
7153 * Returns: 0 Success
7154 * namei:???
7155 * vn_pathconf:???
7156 *
7157 * Notes: Global implementation constants are intended to be
7158 * implemented in this function directly; all other constants
7159 * are per-FS implementation, and therefore must be handled in
7160 * each respective FS, instead.
7161 *
7162 * XXX We implement some things globally right now that should actually be
7163 * XXX per-FS; we will need to deal with this at some point.
7164 */
7165 /* ARGSUSED */
7166 int
pathconf(__unused proc_t p,struct pathconf_args * uap,int32_t * retval)7167 pathconf(__unused proc_t p, struct pathconf_args *uap, int32_t *retval)
7168 {
7169 int error;
7170 struct nameidata nd;
7171 vfs_context_t ctx = vfs_context_current();
7172
7173 NDINIT(&nd, LOOKUP, OP_PATHCONF, FOLLOW | AUDITVNPATH1,
7174 UIO_USERSPACE, uap->path, ctx);
7175 error = namei(&nd);
7176 if (error) {
7177 return error;
7178 }
7179
7180 error = vn_pathconf(nd.ni_vp, uap->name, retval, ctx);
7181
7182 vnode_put(nd.ni_vp);
7183 nameidone(&nd);
7184 return error;
7185 }
7186
7187 /*
7188 * Return target name of a symbolic link.
7189 */
7190 /* ARGSUSED */
7191 static int
readlinkat_internal(vfs_context_t ctx,int fd,vnode_t lnk_vp,user_addr_t path,enum uio_seg seg,user_addr_t buf,size_t bufsize,enum uio_seg bufseg,int * retval)7192 readlinkat_internal(vfs_context_t ctx, int fd, vnode_t lnk_vp, user_addr_t path,
7193 enum uio_seg seg, user_addr_t buf, size_t bufsize, enum uio_seg bufseg,
7194 int *retval)
7195 {
7196 vnode_t vp;
7197 uio_t auio;
7198 int error;
7199 struct nameidata nd;
7200 uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];
7201 bool put_vnode;
7202
7203 if (bufsize > INT32_MAX) {
7204 return EINVAL;
7205 }
7206
7207 if (lnk_vp) {
7208 vp = lnk_vp;
7209 put_vnode = false;
7210 } else {
7211 NDINIT(&nd, LOOKUP, OP_READLINK, NOFOLLOW | AUDITVNPATH1,
7212 seg, path, ctx);
7213
7214 error = nameiat(&nd, fd);
7215 if (error) {
7216 return error;
7217 }
7218 vp = nd.ni_vp;
7219 put_vnode = true;
7220 nameidone(&nd);
7221 }
7222
7223 auio = uio_createwithbuffer(1, 0, bufseg, UIO_READ,
7224 &uio_buf[0], sizeof(uio_buf));
7225 uio_addiov(auio, buf, bufsize);
7226 if (vp->v_type != VLNK) {
7227 error = EINVAL;
7228 } else {
7229 #if CONFIG_MACF
7230 error = mac_vnode_check_readlink(ctx, vp);
7231 #endif
7232 if (error == 0) {
7233 error = vnode_authorize(vp, NULL, KAUTH_VNODE_READ_DATA,
7234 ctx);
7235 }
7236 if (error == 0) {
7237 error = VNOP_READLINK(vp, auio, ctx);
7238 }
7239 }
7240
7241 if (put_vnode) {
7242 vnode_put(vp);
7243 }
7244
7245 *retval = (int)(bufsize - uio_resid(auio));
7246 return error;
7247 }
7248
7249 int
freadlink(proc_t p,struct freadlink_args * uap,int32_t * retval)7250 freadlink(proc_t p, struct freadlink_args *uap, int32_t *retval)
7251 {
7252 enum uio_seg procseg;
7253 vnode_t vp;
7254 int error;
7255
7256 procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
7257
7258 AUDIT_ARG(fd, uap->fd);
7259
7260 if ((error = file_vnode(uap->fd, &vp))) {
7261 return error;
7262 }
7263 if ((error = vnode_getwithref(vp))) {
7264 file_drop(uap->fd);
7265 return error;
7266 }
7267
7268 error = readlinkat_internal(vfs_context_current(), -1,
7269 vp, 0, procseg, CAST_USER_ADDR_T(uap->buf),
7270 uap->bufsize, procseg, retval);
7271
7272 vnode_put(vp);
7273 file_drop(uap->fd);
7274 return error;
7275 }
7276
7277 int
readlink(proc_t p,struct readlink_args * uap,int32_t * retval)7278 readlink(proc_t p, struct readlink_args *uap, int32_t *retval)
7279 {
7280 enum uio_seg procseg;
7281
7282 procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
7283 return readlinkat_internal(vfs_context_current(), AT_FDCWD, NULL,
7284 CAST_USER_ADDR_T(uap->path), procseg, CAST_USER_ADDR_T(uap->buf),
7285 uap->count, procseg, retval);
7286 }
7287
7288 int
readlinkat(proc_t p,struct readlinkat_args * uap,int32_t * retval)7289 readlinkat(proc_t p, struct readlinkat_args *uap, int32_t *retval)
7290 {
7291 enum uio_seg procseg;
7292
7293 procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
7294 return readlinkat_internal(vfs_context_current(), uap->fd, NULL,
7295 CAST_USER_ADDR_T(uap->path), procseg, uap->buf, uap->bufsize, procseg,
7296 retval);
7297 }
7298
7299 /*
7300 * Change file flags, the deep inner layer.
7301 */
7302 static int
chflags0(vnode_t vp,struct vnode_attr * va,int (* setattr)(vnode_t,void *,vfs_context_t),void * arg,vfs_context_t ctx)7303 chflags0(vnode_t vp, struct vnode_attr *va,
7304 int (*setattr)(vnode_t, void *, vfs_context_t),
7305 void *arg, vfs_context_t ctx)
7306 {
7307 kauth_action_t action = 0;
7308 int error;
7309
7310 #if CONFIG_MACF
7311 error = mac_vnode_check_setflags(ctx, vp, va->va_flags);
7312 if (error) {
7313 goto out;
7314 }
7315 #endif
7316
7317 /* request authorisation, disregard immutability */
7318 if ((error = vnode_authattr(vp, va, &action, ctx)) != 0) {
7319 goto out;
7320 }
7321 /*
7322 * Request that the auth layer disregard those file flags it's allowed to when
7323 * authorizing this operation; we need to do this in order to be able to
7324 * clear immutable flags.
7325 */
7326 if (action && ((error = vnode_authorize(vp, NULL, action | KAUTH_VNODE_NOIMMUTABLE, ctx)) != 0)) {
7327 goto out;
7328 }
7329 error = (*setattr)(vp, arg, ctx);
7330
7331 #if CONFIG_MACF
7332 if (error == 0) {
7333 mac_vnode_notify_setflags(ctx, vp, va->va_flags);
7334 }
7335 #endif
7336
7337 out:
7338 return error;
7339 }
7340
7341 /*
7342 * Change file flags.
7343 *
7344 * NOTE: this will vnode_put() `vp'
7345 */
7346 static int
chflags1(vnode_t vp,int flags,vfs_context_t ctx)7347 chflags1(vnode_t vp, int flags, vfs_context_t ctx)
7348 {
7349 struct vnode_attr va;
7350 int error;
7351
7352 VATTR_INIT(&va);
7353 VATTR_SET(&va, va_flags, flags);
7354
7355 error = chflags0(vp, &va, (void *)vnode_setattr, &va, ctx);
7356 vnode_put(vp);
7357
7358 if ((error == 0) && !VATTR_IS_SUPPORTED(&va, va_flags)) {
7359 error = ENOTSUP;
7360 }
7361
7362 return error;
7363 }
7364
7365 /*
7366 * Change flags of a file given a path name.
7367 */
7368 /* ARGSUSED */
7369 int
chflags(__unused proc_t p,struct chflags_args * uap,__unused int32_t * retval)7370 chflags(__unused proc_t p, struct chflags_args *uap, __unused int32_t *retval)
7371 {
7372 vnode_t vp;
7373 vfs_context_t ctx = vfs_context_current();
7374 int error;
7375 struct nameidata nd;
7376 uint32_t wantparent = 0;
7377
7378 #if CONFIG_FILE_LEASES
7379 wantparent = WANTPARENT;
7380 #endif
7381
7382 AUDIT_ARG(fflags, uap->flags);
7383 NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1 | wantparent,
7384 UIO_USERSPACE, uap->path, ctx);
7385 error = namei(&nd);
7386 if (error) {
7387 return error;
7388 }
7389 vp = nd.ni_vp;
7390
7391 #if CONFIG_FILE_LEASES
7392 vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
7393 vnode_put(nd.ni_dvp);
7394 #endif
7395
7396 nameidone(&nd);
7397
7398 /* we don't vnode_put() here because chflags1 does internally */
7399 error = chflags1(vp, uap->flags, ctx);
7400
7401 return error;
7402 }
7403
7404 /*
7405 * Change flags of a file given a file descriptor.
7406 */
7407 /* ARGSUSED */
7408 int
fchflags(__unused proc_t p,struct fchflags_args * uap,__unused int32_t * retval)7409 fchflags(__unused proc_t p, struct fchflags_args *uap, __unused int32_t *retval)
7410 {
7411 vnode_t vp;
7412 int error;
7413
7414 AUDIT_ARG(fd, uap->fd);
7415 AUDIT_ARG(fflags, uap->flags);
7416 if ((error = file_vnode(uap->fd, &vp))) {
7417 return error;
7418 }
7419
7420 if ((error = vnode_getwithref(vp))) {
7421 file_drop(uap->fd);
7422 return error;
7423 }
7424
7425 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
7426
7427 #if CONFIG_FILE_LEASES
7428 vnode_breakdirlease(vp, true, O_WRONLY);
7429 #endif
7430
7431 /* we don't vnode_put() here because chflags1 does internally */
7432 error = chflags1(vp, uap->flags, vfs_context_current());
7433
7434 file_drop(uap->fd);
7435 return error;
7436 }
7437
7438 /*
7439 * Change security information on a filesystem object.
7440 *
7441 * Returns: 0 Success
7442 * EPERM Operation not permitted
7443 * vnode_authattr:??? [anything vnode_authattr can return]
7444 * vnode_authorize:??? [anything vnode_authorize can return]
7445 * vnode_setattr:??? [anything vnode_setattr can return]
7446 *
7447 * Notes: If vnode_authattr or vnode_authorize return EACCES, it will be
7448 * translated to EPERM before being returned.
7449 */
static int
chmod_vnode(vfs_context_t ctx, vnode_t vp, struct vnode_attr *vap)
{
	kauth_action_t action;
	int error;

	AUDIT_ARG(mode, vap->va_mode);
	/* XXX audit new args */

#if NAMEDSTREAMS
	/* chmod calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		return EPERM;
	}
#endif

#if CONFIG_MACF
	/* MAC pre-checks: one per attribute class being changed (mode,
	 * ownership, ACL); any veto aborts before authorization. */
	if (VATTR_IS_ACTIVE(vap, va_mode) &&
	    (error = mac_vnode_check_setmode(ctx, vp, (mode_t)vap->va_mode)) != 0) {
		return error;
	}

	if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid)) {
		if ((error = mac_vnode_check_setowner(ctx, vp,
		    VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : -1,
		    VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : -1))) {
			return error;
		}
	}

	if (VATTR_IS_ACTIVE(vap, va_acl) &&
	    (error = mac_vnode_check_setacl(ctx, vp, vap->va_acl))) {
		return error;
	}
#endif

	/* make sure that the caller is allowed to set this security information */
	/* EACCES is remapped to EPERM per the documented contract above. */
	if (((error = vnode_authattr(vp, vap, &action, ctx)) != 0) ||
	    ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		if (error == EACCES) {
			error = EPERM;
		}
		return error;
	}

	if ((error = vnode_setattr(vp, vap, ctx)) != 0) {
		return error;
	}

#if CONFIG_MACF
	/* MAC post-notifications mirror the pre-checks, fired only after the
	 * attribute change succeeded. */
	if (VATTR_IS_ACTIVE(vap, va_mode)) {
		mac_vnode_notify_setmode(ctx, vp, (mode_t)vap->va_mode);
	}

	if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid)) {
		mac_vnode_notify_setowner(ctx, vp,
		    VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : -1,
		    VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : -1);
	}

	if (VATTR_IS_ACTIVE(vap, va_acl)) {
		mac_vnode_notify_setacl(ctx, vp, vap->va_acl);
	}
#endif

	return error;
}
7517
7518
7519 /*
7520 * Change mode of a file given a path name.
7521 *
7522 * Returns: 0 Success
7523 * namei:??? [anything namei can return]
7524 * chmod_vnode:??? [anything chmod_vnode can return]
7525 */
7526 static int
chmodat(vfs_context_t ctx,user_addr_t path,struct vnode_attr * vap,int fd,int flag,enum uio_seg segflg)7527 chmodat(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap,
7528 int fd, int flag, enum uio_seg segflg)
7529 {
7530 struct nameidata nd;
7531 int follow, error;
7532 uint32_t wantparent = 0;
7533
7534 #if CONFIG_FILE_LEASES
7535 wantparent = WANTPARENT;
7536 #endif
7537
7538 follow = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) ? NOFOLLOW : FOLLOW;
7539 NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1 | wantparent,
7540 segflg, path, ctx);
7541 if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
7542 nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
7543 }
7544 if ((error = nameiat(&nd, fd))) {
7545 return error;
7546 }
7547
7548 #if CONFIG_FILE_LEASES
7549 vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
7550 vnode_put(nd.ni_dvp);
7551 #endif
7552
7553 error = chmod_vnode(ctx, nd.ni_vp, vap);
7554 vnode_put(nd.ni_vp);
7555 nameidone(&nd);
7556 return error;
7557 }
7558
7559 static int
chmod_extended_init(struct vnode_attr * pva,kauth_filesec_t * pxsecdst,int mode,uid_t uid,gid_t gid,user_addr_t xsecurity)7560 chmod_extended_init(struct vnode_attr *pva, kauth_filesec_t *pxsecdst, int mode, uid_t uid,
7561 gid_t gid, user_addr_t xsecurity)
7562 {
7563 int error;
7564
7565 VATTR_INIT(pva);
7566
7567 if (mode != -1) {
7568 VATTR_SET(pva, va_mode, mode & ALLPERMS);
7569 } else {
7570 pva->va_mode = 0;
7571 }
7572
7573 if (uid != KAUTH_UID_NONE) {
7574 VATTR_SET(pva, va_uid, uid);
7575 }
7576
7577 if (gid != KAUTH_GID_NONE) {
7578 VATTR_SET(pva, va_gid, gid);
7579 }
7580
7581 *pxsecdst = NULL;
7582 switch (xsecurity) {
7583 case USER_ADDR_NULL:
7584 break;
7585
7586 case CAST_USER_ADDR_T((void *)1): /* _FILESEC_REMOVE_ACL */
7587 VATTR_SET(pva, va_acl, NULL);
7588 break;
7589
7590 default:
7591 if ((error = kauth_copyinfilesec(xsecurity, pxsecdst)) != 0) {
7592 return error;
7593 }
7594
7595 VATTR_SET(pva, va_acl, &(*pxsecdst)->fsec_acl);
7596 pva->va_vaflags |= VA_FILESEC_ACL;
7597 KAUTH_DEBUG("CHMOD - setting ACL with %d entries", pva->va_acl->acl_entrycount);
7598 break;
7599 }
7600
7601 return 0;
7602 }
7603
7604 /*
7605 * chmod_extended: Change the mode of a file given a path name; with extended
7606 * argument list (including extended security (ACL)).
7607 *
7608 * Parameters: p Process requesting the open
7609 * uap User argument descriptor (see below)
7610 * retval (ignored)
7611 *
7612 * Indirect: uap->path Path to object (same as 'chmod')
7613 * uap->uid UID to set
7614 * uap->gid GID to set
7615 * uap->mode File mode to set (same as 'chmod')
7616 * uap->xsecurity ACL to set (or delete)
7617 *
7618 * Returns: 0 Success
7619 * !0 errno value
7620 *
7621 * Notes: The kauth_filesec_t in 'va', if any, is in host byte order.
7622 *
7623 * XXX: We should enummerate the possible errno values here, and where
7624 * in the code they originated.
7625 */
7626 int
chmod_extended(__unused proc_t p,struct chmod_extended_args * uap,__unused int32_t * retval)7627 chmod_extended(__unused proc_t p, struct chmod_extended_args *uap, __unused int32_t *retval)
7628 {
7629 int error;
7630 struct vnode_attr va;
7631 kauth_filesec_t xsecdst = NULL;
7632
7633 AUDIT_ARG(owner, uap->uid, uap->gid);
7634
7635 error = chmod_extended_init(&va, &xsecdst, uap->mode, uap->uid,
7636 uap->gid, uap->xsecurity);
7637
7638 if (error) {
7639 return error;
7640 }
7641
7642 error = chmodat(vfs_context_current(), uap->path, &va, AT_FDCWD, 0,
7643 UIO_USERSPACE);
7644
7645 if (xsecdst != NULL) {
7646 kauth_filesec_free(xsecdst);
7647 }
7648 return error;
7649 }
7650
7651 /*
7652 * Returns: 0 Success
7653 * chmodat:??? [anything chmodat can return]
7654 */
7655 static int
fchmodat_internal(vfs_context_t ctx,user_addr_t path,int mode,int fd,int flag,enum uio_seg segflg)7656 fchmodat_internal(vfs_context_t ctx, user_addr_t path, int mode, int fd,
7657 int flag, enum uio_seg segflg)
7658 {
7659 struct vnode_attr va;
7660
7661 VATTR_INIT(&va);
7662 VATTR_SET(&va, va_mode, mode & ALLPERMS);
7663
7664 return chmodat(ctx, path, &va, fd, flag, segflg);
7665 }
7666
7667 int
chmod(__unused proc_t p,struct chmod_args * uap,__unused int32_t * retval)7668 chmod(__unused proc_t p, struct chmod_args *uap, __unused int32_t *retval)
7669 {
7670 return fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
7671 AT_FDCWD, 0, UIO_USERSPACE);
7672 }
7673
7674 int
fchmodat(__unused proc_t p,struct fchmodat_args * uap,__unused int32_t * retval)7675 fchmodat(__unused proc_t p, struct fchmodat_args *uap, __unused int32_t *retval)
7676 {
7677 if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) {
7678 return EINVAL;
7679 }
7680
7681 return fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
7682 uap->fd, uap->flag, UIO_USERSPACE);
7683 }
7684
7685 /*
7686 * Change mode of a file given a file descriptor.
7687 */
static int
fchmod1(__unused proc_t p, int fd, struct vnode_attr *vap)
{
	vnode_t vp;
	int error;

	AUDIT_ARG(fd, fd);

	/* Map the descriptor to its vnode; must be paired with file_drop(). */
	if ((error = file_vnode(fd, &vp)) != 0) {
		return error;
	}
	/* Take an iocount so the vnode cannot be reclaimed while in use. */
	if ((error = vnode_getwithref(vp)) != 0) {
		file_drop(fd);
		return error;
	}
	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

#if CONFIG_FILE_LEASES
	/* Break any directory lease before the mode change takes effect. */
	vnode_breakdirlease(vp, true, O_WRONLY);
#endif

	error = chmod_vnode(vfs_context_current(), vp, vap);
	(void)vnode_put(vp);
	file_drop(fd);

	return error;
}
7715
7716 /*
7717 * fchmod_extended: Change mode of a file given a file descriptor; with
7718 * extended argument list (including extended security (ACL)).
7719 *
7720 * Parameters: p Process requesting to change file mode
7721 * uap User argument descriptor (see below)
7722 * retval (ignored)
7723 *
7724 * Indirect: uap->mode File mode to set (same as 'chmod')
7725 * uap->uid UID to set
7726 * uap->gid GID to set
7727 * uap->xsecurity ACL to set (or delete)
7728 * uap->fd File descriptor of file to change mode
7729 *
7730 * Returns: 0 Success
7731 * !0 errno value
7732 *
7733 */
int
fchmod_extended(proc_t p, struct fchmod_extended_args *uap, __unused int32_t *retval)
{
	int error;
	struct vnode_attr va;
	kauth_filesec_t xsecdst = NULL;

	AUDIT_ARG(owner, uap->uid, uap->gid);

	/*
	 * Build the vnode_attr from the extended arguments; this may copy
	 * in an ACL from uap->xsecurity, returned via xsecdst.
	 */
	error = chmod_extended_init(&va, &xsecdst, uap->mode, uap->uid,
	    uap->gid, uap->xsecurity);

	if (error) {
		return error;
	}

	error = fchmod1(p, uap->fd, &va);

	/* Free the kernel copy of the filesec, if one was allocated. */
	if (xsecdst != NULL) {
		kauth_filesec_free(xsecdst);
	}
	return error;
}
7757
7758 int
fchmod(proc_t p,struct fchmod_args * uap,__unused int32_t * retval)7759 fchmod(proc_t p, struct fchmod_args *uap, __unused int32_t *retval)
7760 {
7761 struct vnode_attr va;
7762
7763 VATTR_INIT(&va);
7764 VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
7765
7766 return fchmod1(p, uap->fd, &va);
7767 }
7768
7769
7770 /*
7771 * Set ownership given a path name.
7772 */
7773 /* ARGSUSED */
static int
fchownat_internal(vfs_context_t ctx, int fd, user_addr_t path, uid_t uid,
    gid_t gid, int flag, enum uio_seg segflg)
{
	vnode_t vp;
	struct vnode_attr va;
	int error;
	struct nameidata nd;
	int follow;
	kauth_action_t action;
	uint32_t wantparent = 0;

#if CONFIG_FILE_LEASES
	/*
	 * Keep a reference on the parent directory from the lookup so its
	 * lease can be broken before the ownership change (dropped in "out").
	 */
	wantparent = WANTPARENT;
#endif

	AUDIT_ARG(owner, uid, gid);

	/* Either NOFOLLOW flag means operate on the symlink itself. */
	follow = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) ? NOFOLLOW : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1 | wantparent, segflg,
	    path, ctx);
	if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}
	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;

	/* VNOVAL in either id means "leave that id unchanged". */
	VATTR_INIT(&va);
	if (uid != (uid_t)VNOVAL) {
		VATTR_SET(&va, va_uid, uid);
	}
	if (gid != (gid_t)VNOVAL) {
		VATTR_SET(&va, va_gid, gid);
	}

#if CONFIG_MACF
	error = mac_vnode_check_setowner(ctx, vp, uid, gid);
	if (error) {
		goto out;
	}
#endif

	/* preflight and authorize attribute changes */
	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
		goto out;
	}
	if (action && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		goto out;
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
#endif

	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	if (error == 0) {
		mac_vnode_notify_setowner(ctx, vp, uid, gid);
	}
#endif

out:
	/*
	 * EACCES is only allowed from namei(); permissions failure should
	 * return EPERM, so we need to translate the error code.
	 */
	if (error == EACCES) {
		error = EPERM;
	}

#if CONFIG_FILE_LEASES
	/* Drop the parent reference obtained via WANTPARENT. */
	vnode_put(nd.ni_dvp);
#endif
	nameidone(&nd);
	vnode_put(vp);
	return error;
}
7855
int
chown(__unused proc_t p, struct chown_args *uap, __unused int32_t *retval)
{
	/* chown(2): change ownership of uap->path, following symlinks. */
	return fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
	    uap->uid, uap->gid, 0, UIO_USERSPACE);
}
7862
int
lchown(__unused proc_t p, struct lchown_args *uap, __unused int32_t *retval)
{
	/* lchown(2): like chown(2), but operates on a symlink itself. */
	return fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
	    uap->owner, uap->group, AT_SYMLINK_NOFOLLOW, UIO_USERSPACE);
}
7869
7870 int
fchownat(__unused proc_t p,struct fchownat_args * uap,__unused int32_t * retval)7871 fchownat(__unused proc_t p, struct fchownat_args *uap, __unused int32_t *retval)
7872 {
7873 if (uap->flag & ~AT_SYMLINK_NOFOLLOW) {
7874 return EINVAL;
7875 }
7876
7877 return fchownat_internal(vfs_context_current(), uap->fd, uap->path,
7878 uap->uid, uap->gid, uap->flag, UIO_USERSPACE);
7879 }
7880
7881 /*
7882 * Set ownership given a file descriptor.
7883 */
7884 /* ARGSUSED */
int
fchown(__unused proc_t p, struct fchown_args *uap, __unused int32_t *retval)
{
	struct vnode_attr va;
	vfs_context_t ctx = vfs_context_current();
	vnode_t vp;
	int error;
	kauth_action_t action;

	AUDIT_ARG(owner, uap->uid, uap->gid);
	AUDIT_ARG(fd, uap->fd);

	/* Map the descriptor to its vnode and take an iocount on it. */
	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}

	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}
	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	/* VNOVAL in either id means "leave that id unchanged". */
	VATTR_INIT(&va);
	if (uap->uid != VNOVAL) {
		VATTR_SET(&va, va_uid, uap->uid);
	}
	if (uap->gid != VNOVAL) {
		VATTR_SET(&va, va_gid, uap->gid);
	}

#if NAMEDSTREAMS
	/* chown calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		error = EPERM;
		goto out;
	}
#endif

#if CONFIG_MACF
	error = mac_vnode_check_setowner(ctx, vp, uap->uid, uap->gid);
	if (error) {
		goto out;
	}
#endif

	/* preflight and authorize attribute changes */
	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
		goto out;
	}
	if (action && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		/* Ownership authorization failure is reported as EPERM. */
		if (error == EACCES) {
			error = EPERM;
		}
		goto out;
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(vp, true, O_WRONLY);
#endif

	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	if (error == 0) {
		mac_vnode_notify_setowner(ctx, vp, uap->uid, uap->gid);
	}
#endif

out:
	(void)vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
7958
/*
 * Fetch a pair of timestamps (access, modify) into tsp[0..1].  If the
 * user pointer is NULL, both are set to the current time; otherwise a
 * timeval[2] array is copied in, sized for the caller's 32/64-bit ABI.
 */
static int
getutimes(user_addr_t usrtvp, struct timespec *tsp)
{
	int error;

	if (usrtvp == USER_ADDR_NULL) {
		struct timeval old_tv;
		/* XXX Y2038 bug because of microtime argument */
		microtime(&old_tv);
		TIMEVAL_TO_TIMESPEC(&old_tv, &tsp[0]);
		tsp[1] = tsp[0];
	} else {
		if (IS_64BIT_PROCESS(current_proc())) {
			/* 64-bit process: user supplies user64_timeval[2]. */
			struct user64_timeval tv[2];
			error = copyin(usrtvp, (void *)tv, sizeof(tv));
			if (error) {
				return error;
			}
			TIMEVAL64_TO_TIMESPEC(&tv[0], &tsp[0]);
			TIMEVAL64_TO_TIMESPEC(&tv[1], &tsp[1]);
		} else {
			/* 32-bit process: user supplies user32_timeval[2]. */
			struct user32_timeval tv[2];
			error = copyin(usrtvp, (void *)tv, sizeof(tv));
			if (error) {
				return error;
			}
			TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]);
			TIMEVAL_TO_TIMESPEC(&tv[1], &tsp[1]);
		}
	}
	return 0;
}
7991
/*
 * Apply access (ts[0]) and modify (ts[1]) timestamps to a vnode.
 * 'nullflag' is set when the caller passed a NULL times pointer, i.e.
 * "set to now"; that relaxes both the authorization requirements
 * (VA_UTIMES_NULL) and the EACCES->EPERM error translation below.
 */
static int
setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts,
    int nullflag)
{
	int error;
	struct vnode_attr va;
	kauth_action_t action;

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	VATTR_INIT(&va);
	VATTR_SET(&va, va_access_time, ts[0]);
	VATTR_SET(&va, va_modify_time, ts[1]);
	if (nullflag) {
		va.va_vaflags |= VA_UTIMES_NULL;
	}

#if NAMEDSTREAMS
	/* utimes calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		error = EPERM;
		goto out;
	}
#endif

#if CONFIG_MACF
	error = mac_vnode_check_setutimes(ctx, vp, ts[0], ts[1]);
	if (error) {
		goto out;
	}
#endif
	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
		/* Explicit times require ownership; report EPERM not EACCES. */
		if (!nullflag && error == EACCES) {
			error = EPERM;
		}
		goto out;
	}

	/* since we may not need to auth anything, check here */
	if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		if (!nullflag && error == EACCES) {
			error = EPERM;
		}
		goto out;
	}
	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	if (error == 0) {
		mac_vnode_notify_setutimes(ctx, vp, ts[0], ts[1]);
	}
#endif

out:
	return error;
}
8048
8049 /*
8050 * Set the access and modification times of a file.
8051 */
8052 /* ARGSUSED */
int
utimes(__unused proc_t p, struct utimes_args *uap, __unused int32_t *retval)
{
	struct timespec ts[2];
	user_addr_t usrtvp;
	int error;
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();
	uint32_t wantparent = 0;

#if CONFIG_FILE_LEASES
	/* Keep the parent so its lease can be broken (dropped at "out"). */
	wantparent = WANTPARENT;
#endif

	/*
	 * AUDIT: Needed to change the order of operations to do the
	 * name lookup first because auditing wants the path.
	 */
	NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1 | wantparent,
	    UIO_USERSPACE, uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}

	/*
	 * Fetch the user-supplied time. If usrtvp is USER_ADDR_NULL, we fetch
	 * the current time instead.
	 */
	usrtvp = uap->tptr;
	if ((error = getutimes(usrtvp, ts)) != 0) {
		goto out;
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
#endif

	error = setutimes(ctx, nd.ni_vp, ts, usrtvp == USER_ADDR_NULL);

out:
#if CONFIG_FILE_LEASES
	vnode_put(nd.ni_dvp);
#endif
	nameidone(&nd);
	vnode_put(nd.ni_vp);
	return error;
}
8101
8102 /*
8103 * Set the access and modification times of a file.
8104 */
8105 /* ARGSUSED */
int
futimes(__unused proc_t p, struct futimes_args *uap, __unused int32_t *retval)
{
	struct timespec ts[2];
	vnode_t vp;
	user_addr_t usrtvp;
	int error;

	AUDIT_ARG(fd, uap->fd);
	/* NULL tptr means "set both times to now" (see getutimes()). */
	usrtvp = uap->tptr;
	if ((error = getutimes(usrtvp, ts)) != 0) {
		return error;
	}
	if ((error = file_vnode(uap->fd, &vp)) != 0) {
		return error;
	}
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(vp, true, O_WRONLY);
#endif

	error = setutimes(vfs_context_current(), vp, ts, usrtvp == 0);

	vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
8137
8138 static int
truncate_validate_common(proc_t p,off_t length)8139 truncate_validate_common(proc_t p, off_t length)
8140 {
8141 rlim_t fsize_limit;
8142
8143 if (length < 0) {
8144 return EINVAL;
8145 }
8146
8147 fsize_limit = proc_limitgetcur(p, RLIMIT_FSIZE);
8148 if ((rlim_t)length > fsize_limit) {
8149 psignal(p, SIGXFSZ);
8150 return EFBIG;
8151 }
8152
8153 return 0;
8154 }
8155
/*
 * Set a vnode's data size to 'length'.  'need_auth' is false when the
 * caller already holds write authorization (the ftruncate path, where
 * access was checked at open time); 'cred' is used only by the MACF
 * truncate hooks.
 */
static int
truncate_internal(vnode_t vp, off_t length, kauth_cred_t cred,
    vfs_context_t ctx, boolean_t need_auth)
{
	struct vnode_attr va;
	kauth_action_t action;
	int error;

	VATTR_INIT(&va);
	VATTR_SET(&va, va_data_size, length);

#if CONFIG_MACF
	error = mac_vnode_check_truncate(ctx, cred, vp);
	if (error) {
		return error;
	}
#endif

	/*
	 * If we reached here from `ftruncate` then we already did an effective
	 * `vnode_authorize` upon open. We honour the result from then.
	 */
	if (need_auth) {
		if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
			return error;
		}

		if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
			return error;
		}
	}

#if CONFIG_FILE_LEASES
	/* Check if there is a lease placed on the parent directory. */
	vnode_breakdirlease(vp, true, O_WRONLY);

	/* Now check if there is a lease placed on the file itself. */
	(void)vnode_breaklease(vp, O_WRONLY, ctx);
#endif

	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	if (error == 0) {
		mac_vnode_notify_truncate(ctx, cred, vp);
	}
#endif

	return error;
}
8206
8207 /*
8208 * Truncate a file given its path name.
8209 */
8210 /* ARGSUSED */
int
truncate(proc_t p, struct truncate_args *uap, __unused int32_t *retval)
{
	vfs_context_t ctx = vfs_context_current();
	vnode_t vp;
	int error;
	struct nameidata nd;

	/* Validate length and enforce RLIMIT_FSIZE before the lookup. */
	if ((error = truncate_validate_common(p, uap->length))) {
		return error;
	}

	NDINIT(&nd, LOOKUP, OP_TRUNCATE, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);

	if ((error = namei(&nd))) {
		return error;
	}

	vp = nd.ni_vp;
	nameidone(&nd);

	/* Path-based truncate must authorize the write (need_auth=true). */
	error = truncate_internal(vp, uap->length, NOCRED, ctx, true);
	vnode_put(vp);

	return error;
}
8238
8239 /*
8240 * Truncate a file given a file descriptor.
8241 */
8242 /* ARGSUSED */
int
ftruncate(proc_t p, struct ftruncate_args *uap, int32_t *retval)
{
	vnode_t vp;
	struct fileproc *fp;
	int error;

	AUDIT_ARG(fd, uap->fd);

	/* Validate length and enforce RLIMIT_FSIZE up front. */
	if ((error = truncate_validate_common(p, uap->length))) {
		return error;
	}

	if ((error = fp_lookup(p, uap->fd, &fp, 0))) {
		return error;
	}

	/* Only vnodes and POSIX shared memory objects can be truncated. */
	switch (FILEGLOB_DTYPE(fp->fp_glob)) {
	case DTYPE_PSXSHM:
		error = pshm_truncate(p, fp, uap->fd, uap->length, retval);
		goto out;
	case DTYPE_VNODE:
		break;
	default:
		error = EINVAL;
		goto out;
	}

	vp = (vnode_t)fp_get_data(fp);

	/* The descriptor must have been opened for writing. */
	if ((fp->fp_glob->fg_flag & FWRITE) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EINVAL;
		goto out;
	}

	if ((error = vnode_getwithref(vp)) != 0) {
		goto out;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	/* Write access was authorized at open time (need_auth=false). */
	error = truncate_internal(vp, uap->length, fp->fp_glob->fg_cred,
	    vfs_context_current(), false);
	vnode_put(vp);

out:
	file_drop(uap->fd);
	return error;
}
8293
8294
8295 /*
8296 * Sync an open file with synchronized I/O _file_ integrity completion
8297 */
8298 /* ARGSUSED */
int
fsync(proc_t p, struct fsync_args *uap, __unused int32_t *retval)
{
	/* fsync(2): file-integrity sync; this is a cancellation point. */
	__pthread_testcancel(1);
	return fsync_common(p, uap, MNT_WAIT);
}
8305
8306
8307 /*
8308 * Sync an open file with synchronized I/O _file_ integrity completion
8309 *
8310 * Notes: This is a legacy support function that does not test for
8311 * thread cancellation points.
8312 */
8313 /* ARGSUSED */
int
fsync_nocancel(proc_t p, struct fsync_nocancel_args *uap, __unused int32_t *retval)
{
	/* Same as fsync(2) but without the thread-cancellation test. */
	return fsync_common(p, (struct fsync_args *)uap, MNT_WAIT);
}
8319
8320
8321 /*
8322 * Sync an open file with synchronized I/O _data_ integrity completion
8323 */
8324 /* ARGSUSED */
int
fdatasync(proc_t p, struct fdatasync_args *uap, __unused int32_t *retval)
{
	/* fdatasync(2): data-integrity-only sync (MNT_DWAIT). */
	__pthread_testcancel(1);
	return fsync_common(p, (struct fsync_args *)uap, MNT_DWAIT);
}
8331
8332
8333 /*
8334 * fsync_common
8335 *
8336 * Common fsync code to support both synchronized I/O file integrity completion
8337 * (normal fsync) and synchronized I/O data integrity completion (fdatasync).
8338 *
8339 * If 'flags' is MNT_DWAIT, the caller is requesting data integrity, which
8340 * will only guarantee that the file data contents are retrievable. If
 * 'flags' is MNT_WAIT, the caller is requesting file integrity, which also
8342 * includes additional metadata unnecessary for retrieving the file data
8343 * contents, such as atime, mtime, ctime, etc., also be committed to stable
8344 * storage.
8345 *
8346 * Parameters: p The process
8347 * uap->fd The descriptor to synchronize
8348 * flags The data integrity flags
8349 *
8350 * Returns: int Success
8351 * fp_getfvp:EBADF Bad file descriptor
8352 * fp_getfvp:ENOTSUP fd does not refer to a vnode
8353 * VNOP_FSYNC:??? unspecified
8354 *
8355 * Notes: We use struct fsync_args because it is a short name, and all
8356 * caller argument structures are otherwise identical.
8357 */
static int
fsync_common(proc_t p, struct fsync_args *uap, int flags)
{
	vnode_t vp;
	struct fileproc *fp;
	vfs_context_t ctx = vfs_context_current();
	int error;

	AUDIT_ARG(fd, uap->fd);

	if ((error = fp_getfvp(p, uap->fd, &fp, &vp))) {
		return error;
	}
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	/* 'flags' is MNT_WAIT (fsync) or MNT_DWAIT (fdatasync). */
	error = VNOP_FSYNC(vp, flags, ctx);

#if NAMEDRSRCFORK
	/* Sync resource fork shadow file if necessary. */
	if ((error == 0) &&
	    (vp->v_flag & VISNAMEDSTREAM) &&
	    (vp->v_parent != NULLVP) &&
	    vnode_isshadow(vp) &&
	    (fp->fp_glob->fg_flag & FWASWRITTEN)) {
		(void) vnode_flushnamedstream(vp->v_parent, vp, ctx);
	}
#endif

	(void)vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
8395
8396 /*
8397 * Duplicate files. Source must be a file, target must be a file or
8398 * must not exist.
8399 *
8400 * XXX Copyfile authorisation checking is woefully inadequate, and will not
8401 * perform inheritance correctly.
8402 */
8403 /* ARGSUSED */
int
copyfile(__unused proc_t p, struct copyfile_args *uap, __unused int32_t *retval)
{
	vnode_t tvp, fvp, tdvp, sdvp;
	struct nameidata fromnd, tond;
	int error;
	vfs_context_t ctx = vfs_context_current();

	/* Check that the flags are valid. */
	if (uap->flags & ~CPF_MASK) {
		return EINVAL;
	}

	NDINIT(&fromnd, LOOKUP, OP_COPYFILE, AUDITVNPATH1,
	    UIO_USERSPACE, uap->from, ctx);
	if ((error = namei(&fromnd))) {
		return error;
	}
	fvp = fromnd.ni_vp;

	/* SAVESTART keeps the start directory (sdvp) for cleanup below. */
	NDINIT(&tond, CREATE, OP_LINK,
	    LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART | AUDITVNPATH2 | CN_NBMOUNTLOOK,
	    UIO_USERSPACE, uap->to, ctx);
	if ((error = namei(&tond))) {
		goto out1;
	}
	tdvp = tond.ni_dvp;
	tvp = tond.ni_vp;

	/* An existing target is only acceptable with CPF_OVERWRITE. */
	if (tvp != NULL) {
		if (!(uap->flags & CPF_OVERWRITE)) {
			error = EEXIST;
			goto out;
		}
	}

	if (fvp->v_type == VDIR || (tvp && tvp->v_type == VDIR)) {
		error = EISDIR;
		goto out;
	}

	if (fvp->v_type == VSOCK && fvp->v_tag != VT_FDESC) {
		error = EOPNOTSUPP;
		goto out;
	}

#if CONFIG_MACF
	if ((error = mac_vnode_check_copyfile(ctx, tdvp, tvp, fvp, &tond.ni_cnd, (mode_t)uap->mode, uap->flags)) != 0) {
		goto out;
	}
#endif /* CONFIG_MACF */

	/* Read from source; delete existing target; add to target dir. */
	if ((error = vnode_authorize(fvp, NULL, KAUTH_VNODE_READ_DATA, ctx)) != 0) {
		goto out;
	}
	if (tvp) {
		if ((error = vnode_authorize(tvp, tdvp, KAUTH_VNODE_DELETE, ctx)) != 0) {
			goto out;
		}
	}
	if ((error = vnode_authorize(tdvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
		goto out;
	}

	/* Source must not be the destination's parent directory. */
	if (fvp == tdvp) {
		error = EINVAL;
	}
	/*
	 * If source is the same as the destination (that is the
	 * same inode number) then there is nothing to do.
	 * (fixed to have POSIX semantics - CSM 3/2/98)
	 */
	if (fvp == tvp) {
		error = -1;     /* sentinel: converted to success below */
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(tdvp, false, O_WRONLY);
#endif

	if (!error) {
		error = VNOP_COPYFILE(fvp, tdvp, tvp, &tond.ni_cnd, uap->mode, uap->flags, ctx);
	}
out:
	sdvp = tond.ni_startdir;
	/*
	 * nameidone has to happen before we vnode_put(tdvp)
	 * since it may need to release the fs_nodelock on the tdvp
	 */
	nameidone(&tond);

	if (tvp) {
		vnode_put(tvp);
	}
	vnode_put(tdvp);
	vnode_put(sdvp);
out1:
	vnode_put(fvp);

	nameidone(&fromnd);

	/* Translate the same-file sentinel into success. */
	if (error == -1) {
		return 0;
	}
	return error;
}
8510
8511 #define CLONE_SNAPSHOT_FALLBACKS_ENABLED 1
8512
8513 /*
8514 * Helper function for doing clones. The caller is expected to provide an
8515 * iocounted source vnode and release it.
8516 */
static int
clonefile_internal(vnode_t fvp, boolean_t data_read_authorised, int dst_dirfd,
    user_addr_t dst, uint32_t flags, vfs_context_t ctx)
{
	vnode_t tvp, tdvp;
	struct nameidata tond;
	int error;
	int follow;
	boolean_t free_src_acl;
	boolean_t attr_cleanup;
	enum vtype v_type;
	kauth_action_t action;
	struct componentname *cnp;
	uint32_t defaulted = 0;
	struct vnode_attr va;       /* attributes read from the source */
	struct vnode_attr nva;      /* attributes to apply to the clone */
	uint32_t vnop_flags;

	/* Only regular files, symlinks and (non-root) directories clone. */
	v_type = vnode_vtype(fvp);
	switch (v_type) {
	case VLNK:
		/* FALLTHRU */
	case VREG:
		action = KAUTH_VNODE_ADD_FILE;
		break;
	case VDIR:
		if (vnode_isvroot(fvp) || vnode_ismount(fvp) ||
		    fvp->v_mountedhere) {
			return EINVAL;
		}
		action = KAUTH_VNODE_ADD_SUBDIRECTORY;
		break;
	default:
		return EINVAL;
	}

	AUDIT_ARG(fd2, dst_dirfd);
	AUDIT_ARG(value32, flags);

	follow = (flags & CLONE_NOFOLLOW) ? NOFOLLOW : FOLLOW;
	NDINIT(&tond, CREATE, OP_LINK, follow | WANTPARENT | AUDITVNPATH2,
	    UIO_USERSPACE, dst, ctx);
	if ((error = nameiat(&tond, dst_dirfd))) {
		return error;
	}
	cnp = &tond.ni_cnd;
	tdvp = tond.ni_dvp;
	tvp = tond.ni_vp;

	free_src_acl = FALSE;
	attr_cleanup = FALSE;

	/* The destination must not already exist. */
	if (tvp != NULL) {
		error = EEXIST;
		goto out;
	}

	/* Clones cannot cross filesystems. */
	if (vnode_mount(tdvp) != vnode_mount(fvp)) {
		error = EXDEV;
		goto out;
	}

#if CONFIG_MACF
	if ((error = mac_vnode_check_clone(ctx, tdvp, fvp, cnp))) {
		goto out;
	}
#endif
	if ((error = vnode_authorize(tdvp, NULL, action, ctx))) {
		goto out;
	}

	/* Data-read may already have been authorized at open (fclonefileat). */
	action = KAUTH_VNODE_GENERIC_READ_BITS;
	if (data_read_authorised) {
		action &= ~KAUTH_VNODE_READ_DATA;
	}
	if ((error = vnode_authorize(fvp, NULL, action, ctx))) {
		goto out;
	}

	/*
	 * certain attributes may need to be changed from the source, we ask for
	 * those here with the exception of source file's ACLs unless the CLONE_ACL
	 * flag is specified. By default, the clone file will inherit the target
	 * directory's ACLs unless the CLONE_ACL flag is specified then it
	 * will inherit the source file's ACLs instead.
	 */
	VATTR_INIT(&va);
	VATTR_WANTED(&va, va_uid);
	VATTR_WANTED(&va, va_gid);
	VATTR_WANTED(&va, va_mode);
	VATTR_WANTED(&va, va_flags);
	if (flags & CLONE_ACL) {
		VATTR_WANTED(&va, va_acl);
	}

	if ((error = vnode_getattr(fvp, &va, ctx)) != 0) {
		goto out;
	}

	VATTR_INIT(&nva);
	VATTR_SET(&nva, va_type, v_type);
	if (VATTR_IS_SUPPORTED(&va, va_acl) && va.va_acl != NULL) {
		VATTR_SET(&nva, va_acl, va.va_acl);
		free_src_acl = TRUE;
	}

	/* Handle ACL inheritance, initialize vap. */
	if (v_type == VLNK) {
		error = vnode_authattr_new(tdvp, &nva, 0, ctx);
	} else {
		error = vn_attribute_prepare(tdvp, &nva, &defaulted, ctx);
		if (error) {
			goto out;
		}
		/* vn_attribute_prepare() succeeded: cleanup needed on exit. */
		attr_cleanup = TRUE;
	}

	vnop_flags = VNODE_CLONEFILE_DEFAULT;
	/*
	 * We've got initial values for all security parameters,
	 * If we are superuser, then we can change owners to be the
	 * same as the source. Both superuser and the owner have default
	 * WRITE_SECURITY privileges so all other fields can be taken
	 * from source as well.
	 */
	if (!(flags & CLONE_NOOWNERCOPY) && vfs_context_issuser(ctx)) {
		if (VATTR_IS_SUPPORTED(&va, va_uid)) {
			VATTR_SET(&nva, va_uid, va.va_uid);
		}
		if (VATTR_IS_SUPPORTED(&va, va_gid)) {
			VATTR_SET(&nva, va_gid, va.va_gid);
		}
	} else {
		vnop_flags |= VNODE_CLONEFILE_NOOWNERCOPY;
	}

	if (VATTR_IS_SUPPORTED(&va, va_mode)) {
		VATTR_SET(&nva, va_mode, va.va_mode);
	}
	if (VATTR_IS_SUPPORTED(&va, va_flags)) {
		VATTR_SET(&nva, va_flags,
		    ((va.va_flags & ~(UF_DATAVAULT | SF_RESTRICTED)) | /* Turn off from source */
		    (nva.va_flags & (UF_DATAVAULT | SF_RESTRICTED))));
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(tdvp, false, O_WRONLY);
#endif

	error = VNOP_CLONEFILE(fvp, tdvp, &tvp, cnp, &nva, vnop_flags, ctx);

	if (!error && tvp) {
		int update_flags = 0;
#if CONFIG_FSE
		int fsevent;
#endif /* CONFIG_FSE */

		/*
		 * If some of the requested attributes weren't handled by the
		 * VNOP, use our fallback code.
		 */
		if (!VATTR_ALL_SUPPORTED(&nva)) {
			(void)vnode_setattr_fallback(tvp, &nva, ctx);
		}

#if CONFIG_MACF
		(void)vnode_label(vnode_mount(tvp), tdvp, tvp, cnp,
		    VNODE_LABEL_CREATE, ctx);
#endif

		// Make sure the name & parent pointers are hooked up
		if (tvp->v_name == NULL) {
			update_flags |= VNODE_UPDATE_NAME;
		}
		if (tvp->v_parent == NULLVP) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		if (update_flags) {
			(void)vnode_update_identity(tvp, tdvp, cnp->cn_nameptr,
			    cnp->cn_namelen, cnp->cn_hash, update_flags);
		}

#if CONFIG_FSE
		switch (vnode_vtype(tvp)) {
		case VLNK:
			/* FALLTHRU */
		case VREG:
			fsevent = FSE_CREATE_FILE;
			break;
		case VDIR:
			fsevent = FSE_CREATE_DIR;
			break;
		default:
			goto out;
		}

		if (need_fsevent(fsevent, tvp)) {
			/*
			 * The following is a sequence of three explicit events.
			 * A pair of FSE_CLONE events representing the source and destination
			 * followed by an FSE_CREATE_[FILE | DIR] for the destination.
			 * fseventsd may coalesce the destination clone and create events
			 * into a single event resulting in the following sequence for a client
			 * FSE_CLONE (src)
			 * FSE_CLONE | FSE_CREATE (dst)
			 */
			add_fsevent(FSE_CLONE, ctx, FSE_ARG_VNODE, fvp, FSE_ARG_VNODE, tvp,
			    FSE_ARG_DONE);
			add_fsevent(fsevent, ctx, FSE_ARG_VNODE, tvp,
			    FSE_ARG_DONE);
		}
#endif /* CONFIG_FSE */
	}

out:
	if (attr_cleanup) {
		vn_attribute_cleanup(&nva, defaulted);
	}
	if (free_src_acl && va.va_acl) {
		kauth_acl_free(va.va_acl);
	}
	nameidone(&tond);
	if (tvp) {
		vnode_put(tvp);
	}
	vnode_put(tdvp);
	return error;
}
8746
8747 /*
8748 * clone files or directories, target must not exist.
8749 */
8750 /* ARGSUSED */
int
clonefileat(__unused proc_t p, struct clonefileat_args *uap,
    __unused int32_t *retval)
{
	vnode_t fvp;
	struct nameidata fromnd;
	int follow;
	int error;
	vfs_context_t ctx = vfs_context_current();

	/* Check that the flags are valid. */
	if (uap->flags & ~(CLONE_NOFOLLOW | CLONE_NOOWNERCOPY | CLONE_ACL)) {
		return EINVAL;
	}

	AUDIT_ARG(fd, uap->src_dirfd);

	follow = (uap->flags & CLONE_NOFOLLOW) ? NOFOLLOW : FOLLOW;
	NDINIT(&fromnd, LOOKUP, OP_COPYFILE, follow | AUDITVNPATH1,
	    UIO_USERSPACE, uap->src, ctx);
	if ((error = nameiat(&fromnd, uap->src_dirfd))) {
		return error;
	}

	fvp = fromnd.ni_vp;
	nameidone(&fromnd);

	/* Source data-read not pre-authorized (FALSE): full read auth runs. */
	error = clonefile_internal(fvp, FALSE, uap->dst_dirfd, uap->dst,
	    uap->flags, ctx);

	vnode_put(fvp);
	return error;
}
8784
int
fclonefileat(__unused proc_t p, struct fclonefileat_args *uap,
    __unused int32_t *retval)
{
	vnode_t fvp;
	struct fileproc *fp;
	int error;
	vfs_context_t ctx = vfs_context_current();

	/* Check that the flags are valid. */
	if (uap->flags & ~(CLONE_NOFOLLOW | CLONE_NOOWNERCOPY | CLONE_ACL)) {
		return EINVAL;
	}

	AUDIT_ARG(fd, uap->src_fd);
	error = fp_getfvp(p, uap->src_fd, &fp, &fvp);
	if (error) {
		return error;
	}

	/* The source descriptor must have been opened for reading. */
	if ((fp->fp_glob->fg_flag & FREAD) == 0) {
		AUDIT_ARG(vnpath_withref, fvp, ARG_VNODE1);
		error = EBADF;
		goto out;
	}

	if ((error = vnode_getwithref(fvp))) {
		goto out;
	}

	AUDIT_ARG(vnpath, fvp, ARG_VNODE1);

	/* Data-read was authorized at open time (TRUE skips re-check). */
	error = clonefile_internal(fvp, TRUE, uap->dst_dirfd, uap->dst,
	    uap->flags, ctx);

	vnode_put(fvp);
out:
	file_drop(uap->src_fd);
	return error;
}
8825
/*
 * Mount-iteration callback: for each mount whose mount point lies
 * strictly beneath the renamed mount 'arg' (same prefix followed by
 * '/'), refresh its recorded f_mntonname from its covered vnode's
 * current path.  Returns -1 to the iterator if the mount is busy.
 */
static int
rename_submounts_callback(mount_t mp, void *arg)
{
	int error = 0;
	mount_t pmp = (mount_t)arg;
	int prefix_len = (int)strlen(pmp->mnt_vfsstat.f_mntonname);

	if (strncmp(mp->mnt_vfsstat.f_mntonname, pmp->mnt_vfsstat.f_mntonname, prefix_len) != 0) {
		return 0;
	}

	/* Require a '/' after the prefix so "/foo" doesn't match "/foobar". */
	if (mp->mnt_vfsstat.f_mntonname[prefix_len] != '/') {
		return 0;
	}

	if ((error = vfs_busy(mp, LK_NOWAIT))) {
		printf("vfs_busy failed with %d for %s\n", error, mp->mnt_vfsstat.f_mntonname);
		return -1;
	}

	/* Rewrite f_mntonname in place with the covered vnode's new path. */
	size_t pathlen = MAXPATHLEN;
	if ((error = vn_getpath_ext(mp->mnt_vnodecovered, NULL, mp->mnt_vfsstat.f_mntonname, &pathlen, VN_GETPATH_FSENTER))) {
		printf("vn_getpath_ext failed with %d for mnt_vnodecovered of %s\n", error, mp->mnt_vfsstat.f_mntonname);
	}

	vfs_unbusy(mp);

	return error;
}
8855
8856 /*
8857 * Rename files. Source and destination must either both be directories,
8858 * or both not be directories. If target is a directory, it must be empty.
8859 */
8860 /* ARGSUSED */
8861 static int
renameat_internal(vfs_context_t ctx,int fromfd,user_addr_t from,int tofd,user_addr_t to,int segflg,u_int uflags)8862 renameat_internal(vfs_context_t ctx, int fromfd, user_addr_t from,
8863 int tofd, user_addr_t to, int segflg, u_int uflags)
8864 {
8865 vnode_t tvp, tdvp;
8866 vnode_t fvp, fdvp;
8867 vnode_t mnt_fvp;
8868 struct nameidata *fromnd, *tond;
8869 int error = 0;
8870 int do_retry;
8871 int retry_count;
8872 int mntrename;
8873 int need_event;
8874 int need_kpath2;
8875 int has_listeners;
8876 const char *oname = NULL;
8877 char *from_name = NULL, *to_name = NULL;
8878 char *from_name_no_firmlink = NULL, *to_name_no_firmlink = NULL;
8879 int from_len = 0, to_len = 0;
8880 int from_len_no_firmlink = 0, to_len_no_firmlink = 0;
8881 int holding_mntlock;
8882 int vn_authorize_skipped;
8883 mount_t locked_mp = NULL;
8884 vnode_t oparent = NULLVP;
8885 #if CONFIG_FSE
8886 fse_info from_finfo = {}, to_finfo;
8887 #endif
8888 int from_truncated = 0, to_truncated = 0;
8889 int from_truncated_no_firmlink = 0, to_truncated_no_firmlink = 0;
8890 int batched = 0;
8891 struct vnode_attr *fvap, *tvap;
8892 int continuing = 0;
8893 vfs_rename_flags_t flags = uflags & VFS_RENAME_FLAGS_MASK;
8894 int32_t nofollow_any = 0;
8895 /* carving out a chunk for structs that are too big to be on stack. */
8896 struct {
8897 struct nameidata from_node, to_node;
8898 struct vnode_attr fv_attr, tv_attr;
8899 } * __rename_data;
8900
8901 __rename_data = kalloc_type(typeof(*__rename_data), Z_WAITOK);
8902 fromnd = &__rename_data->from_node;
8903 tond = &__rename_data->to_node;
8904
8905 holding_mntlock = 0;
8906 do_retry = 0;
8907 retry_count = 0;
8908 retry:
8909 fvp = tvp = NULL;
8910 fdvp = tdvp = NULL;
8911 fvap = tvap = NULL;
8912 mnt_fvp = NULLVP;
8913 mntrename = FALSE;
8914 vn_authorize_skipped = FALSE;
8915
8916 if (uflags & RENAME_NOFOLLOW_ANY) {
8917 nofollow_any = NAMEI_NOFOLLOW_ANY;
8918 }
8919 NDINIT(fromnd, DELETE, OP_UNLINK, WANTPARENT | AUDITVNPATH1,
8920 segflg, from, ctx);
8921 fromnd->ni_flag = NAMEI_COMPOUNDRENAME | nofollow_any;
8922
8923 NDINIT(tond, RENAME, OP_RENAME, WANTPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK,
8924 segflg, to, ctx);
8925 tond->ni_flag = NAMEI_COMPOUNDRENAME | nofollow_any;
8926
8927 continue_lookup:
8928 if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
8929 if ((error = nameiat(fromnd, fromfd))) {
8930 goto out1;
8931 }
8932 fdvp = fromnd->ni_dvp;
8933 fvp = fromnd->ni_vp;
8934
8935 if (fvp && fvp->v_type == VDIR) {
8936 tond->ni_cnd.cn_flags |= WILLBEDIR;
8937 }
8938 }
8939
8940 if ((tond->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
8941 if ((error = nameiat(tond, tofd))) {
8942 /*
8943 * Translate error code for rename("dir1", "dir2/.").
8944 */
8945 if (error == EISDIR && fvp->v_type == VDIR) {
8946 error = EINVAL;
8947 }
8948 goto out1;
8949 }
8950 tdvp = tond->ni_dvp;
8951 tvp = tond->ni_vp;
8952 }
8953
8954 #if DEVELOPMENT || DEBUG
8955 /*
8956 * XXX VSWAP: Check for entitlements or special flag here
8957 * so we can restrict access appropriately.
8958 */
8959 #else /* DEVELOPMENT || DEBUG */
8960
8961 if (fromnd->ni_vp && vnode_isswap(fromnd->ni_vp) && (ctx != vfs_context_kernel())) {
8962 error = EPERM;
8963 goto out1;
8964 }
8965
8966 if (tond->ni_vp && vnode_isswap(tond->ni_vp) && (ctx != vfs_context_kernel())) {
8967 error = EPERM;
8968 goto out1;
8969 }
8970 #endif /* DEVELOPMENT || DEBUG */
8971
8972 if (!tvp && ISSET(flags, VFS_RENAME_SWAP)) {
8973 error = ENOENT;
8974 goto out1;
8975 }
8976
8977 if (tvp && ISSET(flags, VFS_RENAME_EXCL)) {
8978 int32_t pval = 0;
8979 int err = 0;
8980
8981 /*
8982 * We allow rename with VFS_RENAME_EXCL flag for an existing file which
8983 * has the same name as target iff the following conditions are met:
8984 * 1. the target file system is case insensitive
8985 * 2. source and target directories are the same
8986 * 3. source and target files are the same
8987 * 4. name only differs in case (determined by underlying filesystem)
8988 */
8989 if (fvp != tvp || fdvp != tdvp) {
8990 error = EEXIST;
8991 goto out1;
8992 }
8993
8994 /*
8995 * Assume that the target file system is case sensitive if
8996 * _PC_CASE_SENSITIVE selector isn't supported.
8997 */
8998 err = VNOP_PATHCONF(tvp, _PC_CASE_SENSITIVE, &pval, ctx);
8999 if (err != 0 || pval != 0) {
9000 error = EEXIST;
9001 goto out1;
9002 }
9003 }
9004
9005 batched = vnode_compound_rename_available(fdvp);
9006
9007 #if CONFIG_FSE
9008 need_event = need_fsevent(FSE_RENAME, fdvp);
9009 if (need_event) {
9010 if (fvp) {
9011 get_fse_info(fvp, &from_finfo, ctx);
9012 } else {
9013 error = vfs_get_notify_attributes(&__rename_data->fv_attr);
9014 if (error) {
9015 goto out1;
9016 }
9017
9018 fvap = &__rename_data->fv_attr;
9019 }
9020
9021 if (tvp) {
9022 get_fse_info(tvp, &to_finfo, ctx);
9023 } else if (batched) {
9024 error = vfs_get_notify_attributes(&__rename_data->tv_attr);
9025 if (error) {
9026 goto out1;
9027 }
9028
9029 tvap = &__rename_data->tv_attr;
9030 }
9031 }
9032 #else
9033 need_event = 0;
9034 #endif /* CONFIG_FSE */
9035
9036 has_listeners = kauth_authorize_fileop_has_listeners();
9037
9038 need_kpath2 = 0;
9039 #if CONFIG_AUDIT
9040 if (AUDIT_RECORD_EXISTS()) {
9041 need_kpath2 = 1;
9042 }
9043 #endif
9044
9045 if (need_event || has_listeners) {
9046 if (from_name == NULL) {
9047 GET_PATH(from_name);
9048 }
9049
9050 from_len = safe_getpath(fdvp, fromnd->ni_cnd.cn_nameptr, from_name, MAXPATHLEN, &from_truncated);
9051
9052 if (from_name_no_firmlink == NULL) {
9053 GET_PATH(from_name_no_firmlink);
9054 }
9055
9056 from_len_no_firmlink = safe_getpath_no_firmlink(fdvp, fromnd->ni_cnd.cn_nameptr, from_name_no_firmlink, MAXPATHLEN, &from_truncated_no_firmlink);
9057 }
9058
9059 if (need_event || need_kpath2 || has_listeners) {
9060 if (to_name == NULL) {
9061 GET_PATH(to_name);
9062 }
9063
9064 to_len = safe_getpath(tdvp, tond->ni_cnd.cn_nameptr, to_name, MAXPATHLEN, &to_truncated);
9065
9066 if (to_name_no_firmlink == NULL) {
9067 GET_PATH(to_name_no_firmlink);
9068 }
9069
9070 to_len_no_firmlink = safe_getpath_no_firmlink(tdvp, tond->ni_cnd.cn_nameptr, to_name_no_firmlink, MAXPATHLEN, &to_truncated_no_firmlink);
9071 if (to_name && need_kpath2) {
9072 AUDIT_ARG(kpath, to_name, ARG_KPATH2);
9073 }
9074 }
9075 if (!fvp) {
9076 /*
9077 * Claim: this check will never reject a valid rename.
9078 * For success, either fvp must be on the same mount as tdvp, or fvp must sit atop a vnode on the same mount as tdvp.
9079 * Suppose fdvp and tdvp are not on the same mount.
9080 * If fvp is on the same mount as tdvp, then fvp is not on the same mount as fdvp, so fvp is the root of its filesystem. If fvp is the root,
9081 * then you can't move it to within another dir on the same mountpoint.
9082 * If fvp sits atop a vnode on the same mount as fdvp, then that vnode must be part of the same mount as fdvp, which is a contradiction.
9083 *
9084 * If this check passes, then we are safe to pass these vnodes to the same FS.
9085 */
9086 if (fdvp->v_mount != tdvp->v_mount) {
9087 error = EXDEV;
9088 goto out1;
9089 }
9090 goto skipped_lookup;
9091 }
9092
9093 /*
9094 * If the source and destination are the same (i.e. they're
9095 * links to the same vnode) and the target file system is
9096 * case sensitive, then there is nothing to do.
9097 *
9098 * XXX Come back to this.
9099 */
9100 if (fvp == tvp) {
9101 int pathconf_val;
9102
9103 /*
9104 * Note: if _PC_CASE_SENSITIVE selector isn't supported,
9105 * then assume that this file system is case sensitive.
9106 */
9107 if (VNOP_PATHCONF(fvp, _PC_CASE_SENSITIVE, &pathconf_val, ctx) != 0 ||
9108 pathconf_val != 0) {
9109 vn_authorize_skipped = TRUE;
9110 goto out1;
9111 }
9112 }
9113
9114 /*
9115 * Allow the renaming of mount points.
9116 * - target must not exist
9117 * - target must reside in the same directory as source
9118 * - union mounts cannot be renamed
9119 * - the root fs, and tightly-linked system volumes, cannot be renamed
9120 *
9121 * XXX Handle this in VFS after a continued lookup (if we missed
9122 * in the cache to start off)
9123 *
9124 * N.B. If RENAME_SWAP is being used, then @tvp != NULL and so
9125 * we'll skip past here. The file system is responsible for
9126 * checking that @tvp is not a descendent of @fvp and vice versa
9127 * so it should always return EINVAL if either @tvp or @fvp is the
9128 * root of a volume.
9129 */
9130 if ((fvp->v_flag & VROOT) &&
9131 (fvp->v_type == VDIR) &&
9132 (tvp == NULL) &&
9133 (fvp->v_mountedhere == NULL) &&
9134 (fdvp == tdvp) &&
9135 ((fvp->v_mount->mnt_flag & (MNT_UNION | MNT_ROOTFS)) == 0) &&
9136 ((fvp->v_mount->mnt_kern_flag & MNTK_SYSTEM) == 0) &&
9137 (fvp->v_mount->mnt_vnodecovered != NULLVP)) {
9138 vnode_t coveredvp;
9139
9140 /* switch fvp to the covered vnode */
9141 coveredvp = fvp->v_mount->mnt_vnodecovered;
9142 if ((vnode_getwithref(coveredvp))) {
9143 error = ENOENT;
9144 goto out1;
9145 }
9146 /*
9147 * Save the 'fvp' as it is needed for vn_authorize_renamex_with_paths()
9148 * later.
9149 */
9150 mnt_fvp = fvp;
9151
9152 fvp = coveredvp;
9153 mntrename = TRUE;
9154 }
9155 /*
9156 * Check for cross-device rename.
9157 */
9158 if ((fvp->v_mount != tdvp->v_mount) ||
9159 (tvp && (fvp->v_mount != tvp->v_mount))) {
9160 error = EXDEV;
9161 goto out1;
9162 }
9163
9164 /*
9165 * If source is the same as the destination (that is the
9166 * same inode number) then there is nothing to do...
9167 * EXCEPT if the underlying file system supports case
9168 * insensitivity and is case preserving. In this case
9169 * the file system needs to handle the special case of
9170 * getting the same vnode as target (fvp) and source (tvp).
9171 *
9172 * Only file systems that support pathconf selectors _PC_CASE_SENSITIVE
9173 * and _PC_CASE_PRESERVING can have this exception, and they need to
9174 * handle the special case of getting the same vnode as target and
9175 * source. NOTE: Then the target is unlocked going into vnop_rename,
9176 * so not to cause locking problems. There is a single reference on tvp.
9177 *
9178 * NOTE - that fvp == tvp also occurs if they are hard linked and
9179 * that correct behaviour then is just to return success without doing
9180 * anything.
9181 *
9182 * XXX filesystem should take care of this itself, perhaps...
9183 */
9184 if (fvp == tvp && fdvp == tdvp) {
9185 if (fromnd->ni_cnd.cn_namelen == tond->ni_cnd.cn_namelen &&
9186 !bcmp(fromnd->ni_cnd.cn_nameptr, tond->ni_cnd.cn_nameptr,
9187 fromnd->ni_cnd.cn_namelen)) {
9188 vn_authorize_skipped = TRUE;
9189 goto out1;
9190 }
9191 }
9192
9193 if (holding_mntlock && fvp->v_mount != locked_mp) {
9194 /*
9195 * we're holding a reference and lock
9196 * on locked_mp, but it no longer matches
9197 * what we want to do... so drop our hold
9198 */
9199 mount_unlock_renames(locked_mp);
9200 mount_drop(locked_mp, 0);
9201 holding_mntlock = 0;
9202 }
9203 if (tdvp != fdvp && fvp->v_type == VDIR) {
9204 /*
9205 * serialize renames that re-shape
9206 * the tree... if holding_mntlock is
9207 * set, then we're ready to go...
9208 * otherwise we
9209 * first need to drop the iocounts
9210 * we picked up, second take the
9211 * lock to serialize the access,
9212 * then finally start the lookup
9213 * process over with the lock held
9214 */
9215 if (!holding_mntlock) {
9216 /*
9217 * need to grab a reference on
9218 * the mount point before we
9219 * drop all the iocounts... once
9220 * the iocounts are gone, the mount
9221 * could follow
9222 */
9223 locked_mp = fvp->v_mount;
9224 mount_ref(locked_mp, 0);
9225
9226 /*
9227 * nameidone has to happen before we vnode_put(tvp)
9228 * since it may need to release the fs_nodelock on the tvp
9229 */
9230 nameidone(tond);
9231
9232 if (tvp) {
9233 vnode_put(tvp);
9234 }
9235 vnode_put(tdvp);
9236
9237 /*
9238 * nameidone has to happen before we vnode_put(fdvp)
9239 * since it may need to release the fs_nodelock on the fvp
9240 */
9241 nameidone(fromnd);
9242
9243 vnode_put(fvp);
9244 vnode_put(fdvp);
9245
9246 if (mnt_fvp != NULLVP) {
9247 vnode_put(mnt_fvp);
9248 }
9249
9250 mount_lock_renames(locked_mp);
9251 holding_mntlock = 1;
9252
9253 goto retry;
9254 }
9255 } else {
9256 /*
9257 * when we dropped the iocounts to take
9258 * the lock, we allowed the identity of
9259 * the various vnodes to change... if they did,
9260 * we may no longer be dealing with a rename
9261 * that reshapes the tree... once we're holding
9262 * the iocounts, the vnodes can't change type
9263 * so we're free to drop the lock at this point
9264 * and continue on
9265 */
9266 if (holding_mntlock) {
9267 mount_unlock_renames(locked_mp);
9268 mount_drop(locked_mp, 0);
9269 holding_mntlock = 0;
9270 }
9271 }
9272
9273 if (!batched) {
9274 error = vn_authorize_renamex_with_paths(fdvp, mntrename ? mnt_fvp : fvp,
9275 &fromnd->ni_cnd, from_name, tdvp, tvp, &tond->ni_cnd, to_name, ctx,
9276 flags, NULL);
9277 if (error) {
9278 if (error == ENOENT) {
9279 if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
9280 /*
9281 * We encountered a race where after doing the namei,
9282 * tvp stops being valid. If so, simply re-drive the rename
9283 * call from the top.
9284 */
9285 do_retry = 1;
9286 retry_count += 1;
9287 }
9288 }
9289 goto out1;
9290 }
9291 }
9292
9293 /* Release the 'mnt_fvp' now that it is no longer needed. */
9294 if (mnt_fvp != NULLVP) {
9295 vnode_put(mnt_fvp);
9296 mnt_fvp = NULLVP;
9297 }
9298
9299 // save these off so we can later verify that fvp is the same
9300 oname = fvp->v_name;
9301 oparent = fvp->v_parent;
9302
9303 skipped_lookup:
9304 #if CONFIG_FILE_LEASES
9305 /* Lease break needed for source's parent dir? */
9306 vnode_breakdirlease(fdvp, false, O_WRONLY);
9307
9308 /* Lease break needed for target's parent dir? */
9309 vnode_breakdirlease(tdvp, false, O_WRONLY);
9310 #endif
9311
9312 error = vn_rename(fdvp, &fvp, &fromnd->ni_cnd, fvap,
9313 tdvp, &tvp, &tond->ni_cnd, tvap,
9314 flags, ctx);
9315
9316 if (holding_mntlock) {
9317 /*
9318 * we can drop our serialization
9319 * lock now
9320 */
9321 mount_unlock_renames(locked_mp);
9322 mount_drop(locked_mp, 0);
9323 holding_mntlock = 0;
9324 }
9325 if (error) {
9326 if (error == EDATALESS) {
9327 /*
9328 * If we've been here before, something has gone
9329 * horribly wrong and we should just get out lest
9330 * we spiral around the drain forever.
9331 */
9332 if (flags & VFS_RENAME_DATALESS) {
9333 error = EIO;
9334 goto out1;
9335 }
9336
9337 /*
9338 * The object we're renaming is dataless (or has a
9339 * dataless descendent) and requires materialization
9340 * before the rename occurs. But we're holding the
9341 * mount point's rename lock, so it's not safe to
9342 * make the upcall.
9343 *
9344 * In this case, we release the lock, perform the
9345 * materialization, and start the whole thing over.
9346 */
9347 error = vnode_materialize_dataless_file(fvp,
9348 NAMESPACE_HANDLER_RENAME_OP);
9349
9350 if (error == 0) {
9351 /*
9352 * The next time around we need to tell the
9353 * file system that the materializtaion has
9354 * been performed.
9355 */
9356 flags |= VFS_RENAME_DATALESS;
9357 do_retry = 1;
9358 }
9359 goto out1;
9360 }
9361 if (error == EKEEPLOOKING) {
9362 if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) == 0) {
9363 if ((tond->ni_flag & NAMEI_CONTLOOKUP) == 0) {
9364 panic("EKEEPLOOKING without NAMEI_CONTLOOKUP on either ndp?");
9365 }
9366 }
9367
9368 fromnd->ni_vp = fvp;
9369 tond->ni_vp = tvp;
9370
9371 goto continue_lookup;
9372 }
9373
9374 /*
9375 * We may encounter a race in the VNOP where the destination didn't
9376 * exist when we did the namei, but it does by the time we go and
9377 * try to create the entry. In this case, we should re-drive this rename
9378 * call from the top again. Currently, only HFS bubbles out ERECYCLE,
9379 * but other filesystems susceptible to this race could return it, too.
9380 */
9381 if (error == ERECYCLE) {
9382 if (retry_count < MAX_RENAME_ERECYCLE_RETRIES) {
9383 do_retry = 1;
9384 retry_count += 1;
9385 } else {
9386 printf("rename retry limit due to ERECYCLE reached\n");
9387 error = ENOENT;
9388 }
9389 }
9390
9391 /*
9392 * For compound VNOPs, the authorization callback may return
9393 * ENOENT in case of racing hardlink lookups hitting the name
9394 * cache, redrive the lookup.
9395 */
9396 if (batched && error == ENOENT) {
9397 if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
9398 do_retry = 1;
9399 retry_count += 1;
9400 }
9401 }
9402
9403 goto out1;
9404 }
9405
9406 /* call out to allow 3rd party notification of rename.
9407 * Ignore result of kauth_authorize_fileop call.
9408 */
9409 kauth_authorize_fileop(vfs_context_ucred(ctx),
9410 KAUTH_FILEOP_RENAME,
9411 (uintptr_t)from_name, (uintptr_t)to_name);
9412 if (flags & VFS_RENAME_SWAP) {
9413 kauth_authorize_fileop(vfs_context_ucred(ctx),
9414 KAUTH_FILEOP_RENAME,
9415 (uintptr_t)to_name, (uintptr_t)from_name);
9416 }
9417
9418 #if CONFIG_FSE
9419 if (from_name != NULL && to_name != NULL) {
9420 if (from_truncated || to_truncated) {
9421 // set it here since only the from_finfo gets reported up to user space
9422 from_finfo.mode |= FSE_TRUNCATED_PATH;
9423 }
9424
9425 if (tvap && tvp) {
9426 vnode_get_fse_info_from_vap(tvp, &to_finfo, tvap);
9427 }
9428 if (fvap) {
9429 vnode_get_fse_info_from_vap(fvp, &from_finfo, fvap);
9430 }
9431
9432 if (tvp) {
9433 add_fsevent(FSE_RENAME, ctx,
9434 FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
9435 FSE_ARG_FINFO, &from_finfo,
9436 FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
9437 FSE_ARG_FINFO, &to_finfo,
9438 FSE_ARG_DONE);
9439 if (flags & VFS_RENAME_SWAP) {
9440 /*
9441 * Strictly speaking, swap is the equivalent of
9442 * *three* renames. FSEvents clients should only take
9443 * the events as a hint, so we only bother reporting
9444 * two.
9445 */
9446 add_fsevent(FSE_RENAME, ctx,
9447 FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
9448 FSE_ARG_FINFO, &to_finfo,
9449 FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
9450 FSE_ARG_FINFO, &from_finfo,
9451 FSE_ARG_DONE);
9452 }
9453 } else {
9454 add_fsevent(FSE_RENAME, ctx,
9455 FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
9456 FSE_ARG_FINFO, &from_finfo,
9457 FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
9458 FSE_ARG_DONE);
9459 }
9460 }
9461 #endif /* CONFIG_FSE */
9462
9463 /*
9464 * update filesystem's mount point data
9465 */
9466 if (mntrename) {
9467 char *cp, *pathend, *mpname;
9468 char * tobuf;
9469 struct mount *mp;
9470 int maxlen;
9471 size_t len = 0;
9472
9473 mp = fvp->v_mountedhere;
9474
9475 if (vfs_busy(mp, LK_NOWAIT)) {
9476 error = EBUSY;
9477 goto out1;
9478 }
9479 tobuf = zalloc(ZV_NAMEI);
9480
9481 if (UIO_SEG_IS_USER_SPACE(segflg)) {
9482 error = copyinstr(to, tobuf, MAXPATHLEN, &len);
9483 } else {
9484 error = copystr((void *)to, tobuf, MAXPATHLEN, &len);
9485 }
9486 if (!error) {
9487 /* find current mount point prefix */
9488 pathend = &mp->mnt_vfsstat.f_mntonname[0];
9489 for (cp = pathend; *cp != '\0'; ++cp) {
9490 if (*cp == '/') {
9491 pathend = cp + 1;
9492 }
9493 }
9494 /* find last component of target name */
9495 for (mpname = cp = tobuf; *cp != '\0'; ++cp) {
9496 if (*cp == '/') {
9497 mpname = cp + 1;
9498 }
9499 }
9500
9501 /* Update f_mntonname of sub mounts */
9502 vfs_iterate(0, rename_submounts_callback, (void *)mp);
9503
9504 /* append name to prefix */
9505 maxlen = MAXPATHLEN - (int)(pathend - mp->mnt_vfsstat.f_mntonname);
9506 bzero(pathend, maxlen);
9507
9508 strlcpy(pathend, mpname, maxlen);
9509 }
9510 zfree(ZV_NAMEI, tobuf);
9511
9512 vfs_unbusy(mp);
9513
9514 vfs_event_signal(NULL, VQ_UPDATE, (intptr_t)NULL);
9515 }
9516 /*
9517 * fix up name & parent pointers. note that we first
9518 * check that fvp has the same name/parent pointers it
9519 * had before the rename call... this is a 'weak' check
9520 * at best...
9521 *
9522 * XXX oparent and oname may not be set in the compound vnop case
9523 */
9524 if (batched || (oname == fvp->v_name && oparent == fvp->v_parent)) {
9525 int update_flags;
9526
9527 update_flags = VNODE_UPDATE_NAME;
9528
9529 if (fdvp != tdvp) {
9530 update_flags |= VNODE_UPDATE_PARENT;
9531 }
9532
9533 vnode_update_identity(fvp, tdvp, tond->ni_cnd.cn_nameptr, tond->ni_cnd.cn_namelen, tond->ni_cnd.cn_hash, update_flags);
9534 }
9535 out1:
9536 /*
9537 * There are some cases (for e.g. 'fvp == tvp') when vn_authorize was
9538 * skipped earlier as no actual rename was performed.
9539 */
9540 if (vn_authorize_skipped && error == 0) {
9541 error = vn_authorize_renamex_with_paths(fdvp, fvp,
9542 &fromnd->ni_cnd, from_name, tdvp, tvp, &tond->ni_cnd, to_name, ctx,
9543 flags, NULL);
9544 if (error && error == ENOENT) {
9545 if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
9546 do_retry = 1;
9547 retry_count += 1;
9548 }
9549 }
9550 }
9551 if (to_name != NULL) {
9552 RELEASE_PATH(to_name);
9553 to_name = NULL;
9554 }
9555 if (to_name_no_firmlink != NULL) {
9556 RELEASE_PATH(to_name_no_firmlink);
9557 to_name_no_firmlink = NULL;
9558 }
9559 if (from_name != NULL) {
9560 RELEASE_PATH(from_name);
9561 from_name = NULL;
9562 }
9563 if (from_name_no_firmlink != NULL) {
9564 RELEASE_PATH(from_name_no_firmlink);
9565 from_name_no_firmlink = NULL;
9566 }
9567 if (holding_mntlock) {
9568 mount_unlock_renames(locked_mp);
9569 mount_drop(locked_mp, 0);
9570 holding_mntlock = 0;
9571 }
9572 if (tdvp) {
9573 /*
9574 * nameidone has to happen before we vnode_put(tdvp)
9575 * since it may need to release the fs_nodelock on the tdvp
9576 */
9577 nameidone(tond);
9578
9579 if (tvp) {
9580 vnode_put(tvp);
9581 }
9582 vnode_put(tdvp);
9583 }
9584 if (fdvp) {
9585 /*
9586 * nameidone has to happen before we vnode_put(fdvp)
9587 * since it may need to release the fs_nodelock on the fdvp
9588 */
9589 nameidone(fromnd);
9590
9591 if (fvp) {
9592 vnode_put(fvp);
9593 }
9594 vnode_put(fdvp);
9595 }
9596 if (mnt_fvp != NULLVP) {
9597 vnode_put(mnt_fvp);
9598 }
9599 /*
9600 * If things changed after we did the namei, then we will re-drive
9601 * this rename call from the top.
9602 */
9603 if (do_retry) {
9604 do_retry = 0;
9605 goto retry;
9606 }
9607
9608 kfree_type(typeof(*__rename_data), __rename_data);
9609 return error;
9610 }
9611
9612 int
rename(__unused proc_t p,struct rename_args * uap,__unused int32_t * retval)9613 rename(__unused proc_t p, struct rename_args *uap, __unused int32_t *retval)
9614 {
9615 return renameat_internal(vfs_context_current(), AT_FDCWD, uap->from,
9616 AT_FDCWD, uap->to, UIO_USERSPACE, 0);
9617 }
9618
9619 int
renameatx_np(__unused proc_t p,struct renameatx_np_args * uap,__unused int32_t * retval)9620 renameatx_np(__unused proc_t p, struct renameatx_np_args *uap, __unused int32_t *retval)
9621 {
9622 if (uap->flags & ~(RENAME_SECLUDE | RENAME_EXCL | RENAME_SWAP | RENAME_NOFOLLOW_ANY)) {
9623 return EINVAL;
9624 }
9625
9626 if ((uap->flags & (RENAME_EXCL | RENAME_SWAP)) == (RENAME_EXCL | RENAME_SWAP)) {
9627 return EINVAL;
9628 }
9629
9630 return renameat_internal(vfs_context_current(), uap->fromfd, uap->from,
9631 uap->tofd, uap->to, UIO_USERSPACE, uap->flags);
9632 }
9633
9634 int
renameat(__unused proc_t p,struct renameat_args * uap,__unused int32_t * retval)9635 renameat(__unused proc_t p, struct renameat_args *uap, __unused int32_t *retval)
9636 {
9637 return renameat_internal(vfs_context_current(), uap->fromfd, uap->from,
9638 uap->tofd, uap->to, UIO_USERSPACE, 0);
9639 }
9640
9641 /*
9642 * Make a directory file.
9643 *
9644 * Returns: 0 Success
9645 * EEXIST
9646 * namei:???
9647 * vnode_authorize:???
9648 * vn_create:???
9649 */
9650 /* ARGSUSED */
static int
mkdir1at(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap, int fd,
    enum uio_seg segflg)
{
    vnode_t vp, dvp;
    int error;
    int update_flags = 0;
    int batched;
    struct nameidata nd;

    AUDIT_ARG(mode, vap->va_mode);
    /*
     * Look up the parent directory; request a compound mkdir so file
     * systems that support it can do lookup+create in one VNOP.
     */
    NDINIT(&nd, CREATE, OP_MKDIR, LOCKPARENT | AUDITVNPATH1, segflg,
        path, ctx);
    nd.ni_cnd.cn_flags |= WILLBEDIR;
    nd.ni_flag = NAMEI_COMPOUNDMKDIR;

continue_lookup:
    error = nameiat(&nd, fd);
    if (error) {
        return error;
    }
    dvp = nd.ni_dvp;
    vp = nd.ni_vp;

    /* An existing vnode at the target name means mkdir must fail. */
    if (vp != NULL) {
        error = EEXIST;
        goto out;
    }

    batched = vnode_compound_mkdir_available(dvp);

    VATTR_SET(vap, va_type, VDIR);

    /*
     * XXX
     * Don't authorize in VFS for compound VNOP.... mkdir -p today assumes that it will
     * only get EXISTS or EISDIR for existing path components, and not that it could see
     * EACCESS/EPERM--so if we authorize for mkdir on "/" for "mkdir -p /tmp/foo/bar/baz"
     * it will fail in a spurious  manner.  Need to figure out if this is valid behavior.
     */
    if ((error = vn_authorize_mkdir(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0) {
        if (error == EACCES || error == EPERM) {
            int error2;

            /* Drop the first lookup's state before re-driving it. */
            nameidone(&nd);
            vnode_put(dvp);
            dvp = NULLVP;

            /*
             * Try a lookup without "NAMEI_COMPOUNDVNOP" to make sure we return EEXIST
             * rather than EACCESS if the target exists.
             */
            NDINIT(&nd, LOOKUP, OP_MKDIR, AUDITVNPATH1, segflg,
                path, ctx);
            error2 = nameiat(&nd, fd);
            if (error2) {
                /* Target really doesn't exist: report the auth error. */
                goto out;
            } else {
                vp = nd.ni_vp;
                error = EEXIST;
                goto out;
            }
        }

        goto out;
    }

#if CONFIG_FILE_LEASES
    /* Creating an entry in 'dvp' may require breaking a directory lease. */
    vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

    /*
     * make the directory
     */
    if ((error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx)) != 0) {
        /* EKEEPLOOKING: the FS asked VFS to continue the compound lookup. */
        if (error == EKEEPLOOKING) {
            nd.ni_vp = vp;
            goto continue_lookup;
        }

        goto out;
    }

    // Make sure the name & parent pointers are hooked up
    if (vp->v_name == NULL) {
        update_flags |= VNODE_UPDATE_NAME;
    }
    if (vp->v_parent == NULLVP) {
        update_flags |= VNODE_UPDATE_PARENT;
    }

    if (update_flags) {
        vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
    }

#if CONFIG_FSE
    add_fsevent(FSE_CREATE_DIR, ctx, FSE_ARG_VNODE, vp, FSE_ARG_DONE);
#endif

out:
    /*
     * nameidone has to happen before we vnode_put(dvp)
     * since it may need to release the fs_nodelock on the dvp
     */
    nameidone(&nd);

    if (vp) {
        vnode_put(vp);
    }
    if (dvp) {
        vnode_put(dvp);
    }

    return error;
}
9766
9767 /*
9768 * mkdir_extended: Create a directory; with extended security (ACL).
9769 *
9770 * Parameters: p Process requesting to create the directory
9771 * uap User argument descriptor (see below)
9772 * retval (ignored)
9773 *
9774 * Indirect: uap->path Path of directory to create
9775 * uap->mode Access permissions to set
9776 * uap->xsecurity ACL to set
9777 *
9778 * Returns: 0 Success
9779 * !0 Not success
9780 *
9781 */
9782 int
mkdir_extended(proc_t p,struct mkdir_extended_args * uap,__unused int32_t * retval)9783 mkdir_extended(proc_t p, struct mkdir_extended_args *uap, __unused int32_t *retval)
9784 {
9785 int ciferror;
9786 kauth_filesec_t xsecdst;
9787 struct vnode_attr va;
9788
9789 AUDIT_ARG(owner, uap->uid, uap->gid);
9790
9791 xsecdst = NULL;
9792 if ((uap->xsecurity != USER_ADDR_NULL) &&
9793 ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)) {
9794 return ciferror;
9795 }
9796
9797 VATTR_INIT(&va);
9798 VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd.fd_cmask);
9799 if (xsecdst != NULL) {
9800 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
9801 va.va_vaflags |= VA_FILESEC_ACL;
9802 }
9803
9804 ciferror = mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
9805 UIO_USERSPACE);
9806 if (xsecdst != NULL) {
9807 kauth_filesec_free(xsecdst);
9808 }
9809 return ciferror;
9810 }
9811
9812 int
mkdir(proc_t p,struct mkdir_args * uap,__unused int32_t * retval)9813 mkdir(proc_t p, struct mkdir_args *uap, __unused int32_t *retval)
9814 {
9815 struct vnode_attr va;
9816
9817 VATTR_INIT(&va);
9818 VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd.fd_cmask);
9819
9820 return mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
9821 UIO_USERSPACE);
9822 }
9823
9824 int
mkdirat(proc_t p,struct mkdirat_args * uap,__unused int32_t * retval)9825 mkdirat(proc_t p, struct mkdirat_args *uap, __unused int32_t *retval)
9826 {
9827 struct vnode_attr va;
9828
9829 VATTR_INIT(&va);
9830 VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd.fd_cmask);
9831
9832 return mkdir1at(vfs_context_current(), uap->path, &va, uap->fd,
9833 UIO_USERSPACE);
9834 }
9835
9836 static int
rmdirat_internal(vfs_context_t ctx,int fd,user_addr_t dirpath,enum uio_seg segflg,int unlink_flags)9837 rmdirat_internal(vfs_context_t ctx, int fd, user_addr_t dirpath,
9838 enum uio_seg segflg, int unlink_flags)
9839 {
9840 struct {
9841 struct nameidata nd;
9842 #if CONFIG_FSE
9843 struct vnode_attr va;
9844 #endif /* CONFIG_FSE */
9845 } *__rmdir_data;
9846 vnode_t vp, dvp;
9847 int error;
9848 struct nameidata *ndp;
9849 char *path = NULL;
9850 char *no_firmlink_path = NULL;
9851 int len_path = 0;
9852 int len_no_firmlink_path = 0;
9853 int has_listeners = 0;
9854 int need_event = 0;
9855 int truncated_path = 0;
9856 int truncated_no_firmlink_path = 0;
9857 struct vnode_attr *vap = NULL;
9858 int restart_count = 0;
9859 int batched;
9860
9861 int restart_flag;
9862
9863 __rmdir_data = kalloc_type(typeof(*__rmdir_data), Z_WAITOK);
9864 ndp = &__rmdir_data->nd;
9865
9866 /*
9867 * This loop exists to restart rmdir in the unlikely case that two
9868 * processes are simultaneously trying to remove the same directory
9869 * containing orphaned appleDouble files.
9870 */
9871 do {
9872 NDINIT(ndp, DELETE, OP_RMDIR, LOCKPARENT | AUDITVNPATH1,
9873 segflg, dirpath, ctx);
9874 ndp->ni_flag = NAMEI_COMPOUNDRMDIR;
9875 continue_lookup:
9876 restart_flag = 0;
9877 vap = NULL;
9878
9879 error = nameiat(ndp, fd);
9880 if (error) {
9881 goto err_out;
9882 }
9883
9884 dvp = ndp->ni_dvp;
9885 vp = ndp->ni_vp;
9886
9887 if (vp) {
9888 batched = vnode_compound_rmdir_available(vp);
9889
9890 if (vp->v_flag & VROOT) {
9891 /*
9892 * The root of a mounted filesystem cannot be deleted.
9893 */
9894 error = EBUSY;
9895 goto out;
9896 }
9897
9898 #if DEVELOPMENT || DEBUG
9899 /*
9900 * XXX VSWAP: Check for entitlements or special flag here
9901 * so we can restrict access appropriately.
9902 */
9903 #else /* DEVELOPMENT || DEBUG */
9904
9905 if (vnode_isswap(vp) && (ctx != vfs_context_kernel())) {
9906 error = EPERM;
9907 goto out;
9908 }
9909 #endif /* DEVELOPMENT || DEBUG */
9910
9911 /*
9912 * Removed a check here; we used to abort if vp's vid
9913 * was not the same as what we'd seen the last time around.
9914 * I do not think that check was valid, because if we retry
9915 * and all dirents are gone, the directory could legitimately
9916 * be recycled but still be present in a situation where we would
9917 * have had permission to delete. Therefore, we won't make
9918 * an effort to preserve that check now that we may not have a
9919 * vp here.
9920 */
9921
9922 if (!batched) {
9923 error = vn_authorize_rmdir(dvp, vp, &ndp->ni_cnd, ctx, NULL);
9924 if (error) {
9925 if (error == ENOENT) {
9926 if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
9927 restart_flag = 1;
9928 restart_count += 1;
9929 }
9930 }
9931 goto out;
9932 }
9933 }
9934 } else {
9935 batched = 1;
9936
9937 if (!vnode_compound_rmdir_available(dvp)) {
9938 panic("No error, but no compound rmdir?");
9939 }
9940 }
9941
9942 #if CONFIG_FSE
9943 fse_info finfo = {0};
9944
9945 need_event = need_fsevent(FSE_DELETE, dvp);
9946 if (need_event) {
9947 if (!batched) {
9948 get_fse_info(vp, &finfo, ctx);
9949 } else {
9950 error = vfs_get_notify_attributes(&__rmdir_data->va);
9951 if (error) {
9952 goto out;
9953 }
9954
9955 vap = &__rmdir_data->va;
9956 }
9957 }
9958 #endif
9959 has_listeners = kauth_authorize_fileop_has_listeners();
9960 if (need_event || has_listeners) {
9961 if (path == NULL) {
9962 GET_PATH(path);
9963 }
9964
9965 len_path = safe_getpath(dvp, ndp->ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path);
9966
9967 if (no_firmlink_path == NULL) {
9968 GET_PATH(no_firmlink_path);
9969 }
9970
9971 len_no_firmlink_path = safe_getpath_no_firmlink(dvp, ndp->ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);
9972 #if CONFIG_FSE
9973 if (truncated_no_firmlink_path) {
9974 finfo.mode |= FSE_TRUNCATED_PATH;
9975 }
9976 #endif
9977 }
9978
9979 #if CONFIG_FILE_LEASES
9980 vnode_breakdirlease(dvp, false, O_WRONLY);
9981 #endif
9982
9983 error = vn_rmdir(dvp, &vp, ndp, vap, ctx);
9984 ndp->ni_vp = vp;
9985 if (vp == NULLVP) {
9986 /* Couldn't find a vnode */
9987 goto out;
9988 }
9989
9990 if (error == EKEEPLOOKING) {
9991 goto continue_lookup;
9992 } else if (batched && error == ENOENT) {
9993 if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
9994 /*
9995 * For compound VNOPs, the authorization callback
9996 * may return ENOENT in case of racing hard link lookups
9997 * redrive the lookup.
9998 */
9999 restart_flag = 1;
10000 restart_count += 1;
10001 goto out;
10002 }
10003 }
10004
10005 /*
10006 * XXX There's no provision for passing flags
10007 * to VNOP_RMDIR(). So, if vn_rmdir() fails
10008 * because it's not empty, then we try again
10009 * with VNOP_REMOVE(), passing in a special
10010 * flag that clever file systems will know
10011 * how to handle.
10012 */
10013 if (error == ENOTEMPTY &&
10014 (unlink_flags & VNODE_REMOVE_DATALESS_DIR) != 0) {
10015 /*
10016 * If this fails, we want to keep the original
10017 * error.
10018 */
10019 if (vn_remove(dvp, &vp, ndp,
10020 VNODE_REMOVE_DATALESS_DIR, vap, ctx) == 0) {
10021 error = 0;
10022 }
10023 }
10024
10025 #if CONFIG_APPLEDOUBLE
10026 /*
10027 * Special case to remove orphaned AppleDouble
10028 * files. I don't like putting this in the kernel,
10029 * but carbon does not like putting this in carbon either,
10030 * so here we are.
10031 */
10032 if (error == ENOTEMPTY) {
10033 int ad_error = rmdir_remove_orphaned_appleDouble(vp, ctx, &restart_flag);
10034 if (ad_error == EBUSY) {
10035 error = ad_error;
10036 goto out;
10037 }
10038
10039
10040 /*
10041 * Assuming everything went well, we will try the RMDIR again
10042 */
10043 if (!ad_error) {
10044 error = vn_rmdir(dvp, &vp, ndp, vap, ctx);
10045 }
10046 }
10047 #endif /* CONFIG_APPLEDOUBLE */
10048 /*
10049 * Call out to allow 3rd party notification of delete.
10050 * Ignore result of kauth_authorize_fileop call.
10051 */
10052 if (!error) {
10053 if (has_listeners) {
10054 kauth_authorize_fileop(vfs_context_ucred(ctx),
10055 KAUTH_FILEOP_DELETE,
10056 (uintptr_t)vp,
10057 (uintptr_t)path);
10058 }
10059
10060 if (vp->v_flag & VISHARDLINK) {
10061 // see the comment in unlink1() about why we update
10062 // the parent of a hard link when it is removed
10063 vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
10064 }
10065
10066 #if CONFIG_FSE
10067 if (need_event) {
10068 if (vap) {
10069 vnode_get_fse_info_from_vap(vp, &finfo, vap);
10070 }
10071 add_fsevent(FSE_DELETE, ctx,
10072 FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
10073 FSE_ARG_FINFO, &finfo,
10074 FSE_ARG_DONE);
10075 }
10076 #endif
10077
10078 #if CONFIG_MACF
10079 mac_vnode_notify_unlink(ctx, dvp, vp, &ndp->ni_cnd);
10080 #endif
10081 }
10082
10083 out:
10084 if (path != NULL) {
10085 RELEASE_PATH(path);
10086 path = NULL;
10087 }
10088
10089 if (no_firmlink_path != NULL) {
10090 RELEASE_PATH(no_firmlink_path);
10091 no_firmlink_path = NULL;
10092 }
10093
10094 /*
10095 * nameidone has to happen before we vnode_put(dvp)
10096 * since it may need to release the fs_nodelock on the dvp
10097 */
10098 nameidone(ndp);
10099 vnode_put(dvp);
10100
10101 if (vp) {
10102 vnode_put(vp);
10103 }
10104
10105 if (restart_flag == 0) {
10106 wakeup_one((caddr_t)vp);
10107 goto err_out;
10108 }
10109 tsleep(vp, PVFS, "rm AD", 1);
10110 } while (restart_flag != 0);
10111
10112 err_out:
10113 kfree_type(typeof(*__rmdir_data), __rmdir_data);
10114
10115 return error;
10116 }
10117
10118 /*
10119 * Remove a directory file.
10120 */
10121 /* ARGSUSED */
10122 int
rmdir(__unused proc_t p,struct rmdir_args * uap,__unused int32_t * retval)10123 rmdir(__unused proc_t p, struct rmdir_args *uap, __unused int32_t *retval)
10124 {
10125 return rmdirat_internal(vfs_context_current(), AT_FDCWD,
10126 CAST_USER_ADDR_T(uap->path), UIO_USERSPACE, 0);
10127 }
10128
/*
 * Length of a 64-bit direntry holding a name of length 'namlen', padded
 * to 8-byte alignment.  struct direntry embeds a MAXPATHLEN-1 byte name
 * buffer, so that slack is subtracted back out before padding.
 */
#define DIRENT64_LEN(namlen) \
	((sizeof(struct direntry) + (namlen) - (MAXPATHLEN-1) + 7) & ~7)

/*
 * Length of a legacy dirent holding a name of length 'namelen' (plus its
 * NUL terminator), padded to 4-byte alignment.
 */
#define DIRENT_LEN(namelen) \
	((sizeof(struct dirent) + (namelen + 1) - (__DARWIN_MAXNAMLEN + 1) + 3) & ~3)

/* Address of the last byte of this dirent, according to its d_reclen. */
#define DIRENT_END(dep) \
	(((char *)(dep)) + (dep)->d_reclen - 1)
10140
10141 errno_t
vnode_readdir64(struct vnode * vp,struct uio * uio,int flags,int * eofflag,int * numdirent,vfs_context_t ctxp)10142 vnode_readdir64(struct vnode *vp, struct uio *uio, int flags, int *eofflag,
10143 int *numdirent, vfs_context_t ctxp)
10144 {
10145 /* Check if fs natively supports VNODE_READDIR_EXTENDED */
10146 if ((vp->v_mount->mnt_vtable->vfc_vfsflags & VFC_VFSREADDIR_EXTENDED) &&
10147 ((vp->v_mount->mnt_kern_flag & MNTK_DENY_READDIREXT) == 0)) {
10148 return VNOP_READDIR(vp, uio, flags, eofflag, numdirent, ctxp);
10149 } else {
10150 size_t bufsize;
10151 void * bufptr;
10152 uio_t auio;
10153 struct direntry *entry64;
10154 struct dirent *dep;
10155 size_t bytesread;
10156 int error;
10157
10158 /*
10159 * We're here because the underlying file system does not
10160 * support direnties or we mounted denying support so we must
10161 * fall back to dirents and convert them to direntries.
10162 *
10163 * Our kernel buffer needs to be smaller since re-packing will
10164 * expand each dirent. The worse case (when the name length
10165 * is 3 or less) corresponds to a struct direntry size of 32
10166 * bytes (8-byte aligned) and a struct dirent size of 12 bytes
10167 * (4-byte aligned). So having a buffer that is 3/8 the size
10168 * will prevent us from reading more than we can pack.
10169 *
10170 * Since this buffer is wired memory, we will limit the
10171 * buffer size to a maximum of 32K. We would really like to
10172 * use 32K in the MIN(), but we use magic number 87371 to
10173 * prevent uio_resid() * 3 / 8 from overflowing.
10174 */
10175 bufsize = 3 * MIN((user_size_t)uio_resid(uio), 87371u) / 8;
10176 bufptr = kalloc_data(bufsize, Z_WAITOK);
10177 if (bufptr == NULL) {
10178 return ENOMEM;
10179 }
10180
10181 auio = uio_create(1, 0, UIO_SYSSPACE, UIO_READ);
10182 uio_addiov(auio, (uintptr_t)bufptr, bufsize);
10183 auio->uio_offset = uio->uio_offset;
10184
10185 error = VNOP_READDIR(vp, auio, 0, eofflag, numdirent, ctxp);
10186
10187 dep = (struct dirent *)bufptr;
10188 bytesread = bufsize - uio_resid(auio);
10189
10190 entry64 = kalloc_type(struct direntry, Z_WAITOK);
10191 /*
10192 * Convert all the entries and copy them out to user's buffer.
10193 */
10194 while (error == 0 && (char *)dep < ((char *)bufptr + bytesread)) {
10195 /* First check that the dirent struct up to d_name is within the buffer */
10196 if ((char*)dep + offsetof(struct dirent, d_name) > ((char *)bufptr + bytesread) ||
10197 /* Check that the length of the entire dirent is within the buffer */
10198 DIRENT_END(dep) > ((char *)bufptr + bytesread) ||
10199 /* Check that the actual length including the name doesn't exceed d_reclen */
10200 DIRENT_LEN(dep->d_namlen) > dep->d_reclen) {
10201 printf("%s: %s: Bad dirent recived from directory %s\n", __func__,
10202 vp->v_mount->mnt_vfsstat.f_mntonname,
10203 vp->v_name ? vp->v_name : "<unknown>");
10204 error = EIO;
10205 break;
10206 }
10207
10208 size_t enbufsize = DIRENT64_LEN(dep->d_namlen);
10209
10210 bzero(entry64, enbufsize);
10211 /* Convert a dirent to a dirent64. */
10212 entry64->d_ino = dep->d_ino;
10213 entry64->d_seekoff = 0;
10214 entry64->d_reclen = (uint16_t)enbufsize;
10215 entry64->d_namlen = dep->d_namlen;
10216 entry64->d_type = dep->d_type;
10217 bcopy(dep->d_name, entry64->d_name, dep->d_namlen + 1);
10218
10219 /* Move to next entry. */
10220 dep = (struct dirent *)((char *)dep + dep->d_reclen);
10221
10222 /* Copy entry64 to user's buffer. */
10223 error = uiomove((caddr_t)entry64, entry64->d_reclen, uio);
10224 }
10225
10226 /* Update the real offset using the offset we got from VNOP_READDIR. */
10227 if (error == 0) {
10228 uio->uio_offset = auio->uio_offset;
10229 }
10230 uio_free(auio);
10231 kfree_data(bufptr, bufsize);
10232 kfree_type(struct direntry, entry64);
10233 return error;
10234 }
10235 }
10236
/* Upper bound on a single getdirentries request; larger buffers are clamped. */
#define GETDIRENTRIES_MAXBUFSIZE (128 * 1024 * 1024U)

/*
 * Read a block of directory entries in a file system independent format.
 *
 * Shared implementation for getdirentries() and getdirentries64().  On
 * success, *bytesread is the byte count delivered to the user buffer,
 * *offset (if non-NULL) is the directory offset before the read, and
 * *eofflag reflects end-of-directory.  When 'flags' contains
 * VNODE_READDIR_EXTENDED, entries are produced as struct direntry via
 * vnode_readdir64().
 */
static int
getdirentries_common(int fd, user_addr_t bufp, user_size_t bufsize, ssize_t *bytesread,
    off_t *offset, int *eofflag, int flags)
{
	vnode_t vp;
	struct vfs_context context = *vfs_context_current(); /* local copy */
	struct fileproc *fp;
	uio_t auio;
	int spacetype = proc_is64bit(vfs_context_proc(&context)) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	off_t loff;
	int error, numdirent;
	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];

get_from_fd:
	error = fp_getfvp(vfs_context_proc(&context), fd, &fp, &vp);
	if (error) {
		return error;
	}

	/*
	 * Take the per-file offset lock, then re-check that the fd still
	 * maps to the vnode we looked up (the union-mount traversal below
	 * can swap it); retry from scratch if it changed underneath us.
	 */
	vn_offset_lock(fp->fp_glob);
	if (((vnode_t)fp_get_data(fp)) != vp) {
		vn_offset_unlock(fp->fp_glob);
		file_drop(fd);
		goto get_from_fd;
	}

	if ((fp->fp_glob->fg_flag & FREAD) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EBADF;
		goto out;
	}

	/* Silently clamp oversized requests rather than failing them. */
	if (bufsize > GETDIRENTRIES_MAXBUFSIZE) {
		bufsize = GETDIRENTRIES_MAXBUFSIZE;
	}

#if CONFIG_MACF
	error = mac_file_check_change_offset(vfs_context_ucred(&context), fp->fp_glob);
	if (error) {
		goto out;
	}
#endif

	if ((error = vnode_getwithref(vp))) {
		goto out;
	}
	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

#if CONFIG_UNION_MOUNTS
unionread:
#endif /* CONFIG_UNION_MOUNTS */
	if (vp->v_type != VDIR) {
		(void)vnode_put(vp);
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_readdir(&context, vp);
	if (error != 0) {
		(void)vnode_put(vp);
		goto out;
	}
#endif /* MAC */

	/* Read starting at the fd's current offset into the caller's buffer. */
	loff = fp->fp_glob->fg_offset;
	auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, bufp, bufsize);

	if (flags & VNODE_READDIR_EXTENDED) {
		error = vnode_readdir64(vp, auio, flags, eofflag, &numdirent, &context);
		fp->fp_glob->fg_offset = uio_offset(auio);
	} else {
		error = VNOP_READDIR(vp, auio, 0, eofflag, &numdirent, &context);
		fp->fp_glob->fg_offset = uio_offset(auio);
	}
	if (error) {
		(void)vnode_put(vp);
		goto out;
	}

#if CONFIG_UNION_MOUNTS
	/*
	 * Nothing was read and this is a union mount: drop down to the
	 * lower layer, install it as the fd's vnode, reset the offset and
	 * read again from the underlying directory.
	 */
	if ((user_ssize_t)bufsize == uio_resid(auio) &&
	    (vp->v_mount->mnt_flag & MNT_UNION)) {
		vnode_t uvp;

		if (lookup_traverse_union(vp, &uvp, &context) == 0) {
			if (vnode_ref(uvp) == 0) {
				fp_set_data(fp, uvp);
				fp->fp_glob->fg_offset = 0;
				vnode_rele(vp);
				vnode_put(vp);
				vp = uvp;
				goto unionread;
			} else {
				/* could not get a ref, can't replace in fd */
				vnode_put(uvp);
			}
		}
	}
#endif /* CONFIG_UNION_MOUNTS */

	vnode_put(vp);
	if (offset) {
		*offset = loff;
	}

	*bytesread = bufsize - uio_resid(auio);
out:
	vn_offset_unlock(fp->fp_glob);
	file_drop(fd);
	return error;
}
10355
10356
10357 int
getdirentries(__unused struct proc * p,struct getdirentries_args * uap,int32_t * retval)10358 getdirentries(__unused struct proc *p, struct getdirentries_args *uap, int32_t *retval)
10359 {
10360 off_t offset;
10361 ssize_t bytesread;
10362 int error, eofflag;
10363
10364 AUDIT_ARG(fd, uap->fd);
10365 error = getdirentries_common(uap->fd, uap->buf, uap->count,
10366 &bytesread, &offset, &eofflag, 0);
10367
10368 if (error == 0) {
10369 if (proc_is64bit(p)) {
10370 user64_long_t base = (user64_long_t)offset;
10371 error = copyout((caddr_t)&base, uap->basep, sizeof(user64_long_t));
10372 } else {
10373 user32_long_t base = (user32_long_t)offset;
10374 error = copyout((caddr_t)&base, uap->basep, sizeof(user32_long_t));
10375 }
10376 *retval = (int)bytesread;
10377 }
10378 return error;
10379 }
10380
10381 int
getdirentries64(__unused struct proc * p,struct getdirentries64_args * uap,user_ssize_t * retval)10382 getdirentries64(__unused struct proc *p, struct getdirentries64_args *uap, user_ssize_t *retval)
10383 {
10384 off_t offset;
10385 ssize_t bytesread;
10386 int error, eofflag;
10387 user_size_t bufsize;
10388
10389 AUDIT_ARG(fd, uap->fd);
10390
10391 /*
10392 * If the buffer is at least GETDIRENTRIES64_EXTENDED_BUFSIZE large,
10393 * then the kernel carves out the last 4 bytes to return extended
10394 * information to userspace (namely whether we reached EOF with this call).
10395 */
10396 if (uap->bufsize >= GETDIRENTRIES64_EXTENDED_BUFSIZE) {
10397 bufsize = uap->bufsize - sizeof(getdirentries64_flags_t);
10398 } else {
10399 bufsize = uap->bufsize;
10400 }
10401
10402 error = getdirentries_common(uap->fd, uap->buf, bufsize,
10403 &bytesread, &offset, &eofflag, VNODE_READDIR_EXTENDED);
10404
10405 if (error == 0) {
10406 *retval = bytesread;
10407 error = copyout((caddr_t)&offset, uap->position, sizeof(off_t));
10408
10409 if (error == 0 && uap->bufsize >= GETDIRENTRIES64_EXTENDED_BUFSIZE) {
10410 getdirentries64_flags_t flags = 0;
10411 if (eofflag) {
10412 flags |= GETDIRENTRIES64_EOF;
10413 }
10414 error = copyout(&flags, (user_addr_t)uap->buf + bufsize,
10415 sizeof(flags));
10416 }
10417 }
10418 return error;
10419 }
10420
10421
10422 /*
10423 * Set the mode mask for creation of filesystem nodes.
10424 * XXX implement xsecurity
10425 */
10426 #define UMASK_NOXSECURITY (void *)1 /* leave existing xsecurity alone */
10427 static int
umask1(proc_t p,int newmask,__unused kauth_filesec_t fsec,int32_t * retval)10428 umask1(proc_t p, int newmask, __unused kauth_filesec_t fsec, int32_t *retval)
10429 {
10430 AUDIT_ARG(mask, newmask);
10431 proc_fdlock(p);
10432 *retval = p->p_fd.fd_cmask;
10433 p->p_fd.fd_cmask = newmask & ALLPERMS;
10434 proc_fdunlock(p);
10435 return 0;
10436 }
10437
10438 /*
10439 * umask_extended: Set the mode mask for creation of filesystem nodes; with extended security (ACL).
10440 *
10441 * Parameters: p Process requesting to set the umask
10442 * uap User argument descriptor (see below)
10443 * retval umask of the process (parameter p)
10444 *
10445 * Indirect: uap->newmask umask to set
10446 * uap->xsecurity ACL to set
10447 *
10448 * Returns: 0 Success
10449 * !0 Not success
10450 *
10451 */
10452 int
umask_extended(proc_t p,struct umask_extended_args * uap,int32_t * retval)10453 umask_extended(proc_t p, struct umask_extended_args *uap, int32_t *retval)
10454 {
10455 return umask1(p, uap->newmask, KAUTH_FILESEC_NONE, retval);
10456 }
10457
/*
 * umask(2): set the file-creation mode mask, leaving any existing
 * extended security (ACL) information alone.  The previous mask is
 * returned via *retval; this call cannot fail.
 */
int
umask(proc_t p, struct umask_args *uap, int32_t *retval)
{
	return umask1(p, uap->newmask, UMASK_NOXSECURITY, retval);
}
10463
/* NOTE(review): not referenced within this chunk — presumably consumed
 * elsewhere in the file; verify before removing. */
#define REVOKE_MOUNTED_DEVICE_ENTITLEMENT \
	"com.apple.private.vfs.revoke-mounted-device"

/*
 * Void all references to file by ripping underlying filesystem
 * away from vnode.
 *
 * revoke(2): only block/character special files are accepted; the
 * caller must own the node or pass the superuser check.  Revocation is
 * skipped when nothing holds a use count on (or aliases) the vnode.
 */
/* ARGSUSED */
int
revoke(proc_t p, struct revoke_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	struct vnode_attr va;
	vfs_context_t ctx = vfs_context_current();
	int error;
	struct nameidata nd;

	NDINIT(&nd, LOOKUP, OP_REVOKE, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
	    uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;

	nameidone(&nd);

	/* Revocation is only meaningful for device special files. */
	if (!(vnode_ischr(vp) || vnode_isblk(vp))) {
		error = ENOTSUP;
		goto out;
	}

	/* Refuse to revoke a block device that has a filesystem mounted on it. */
	if (vnode_isblk(vp) && vnode_ismountedon(vp)) {
		error = EBUSY;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_revoke(ctx, vp);
	if (error) {
		goto out;
	}
#endif

	/* Caller must own the node, or else be superuser. */
	VATTR_INIT(&va);
	VATTR_WANTED(&va, va_uid);
	if ((error = vnode_getattr(vp, &va, ctx))) {
		goto out;
	}
	if (kauth_cred_getuid(vfs_context_ucred(ctx)) != va.va_uid &&
	    (error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
		goto out;
	}
	/* Only bother revoking if someone actually has the node in use. */
	if (vp->v_usecount > 0 || (vnode_isaliased(vp))) {
		VNOP_REVOKE(vp, REVOKEALL, ctx);
	}
out:
	vnode_put(vp);
	return error;
}
10524
10525
10526 /*
10527 * HFS/HFS PlUS SPECIFIC SYSTEM CALLS
10528 * The following system calls are designed to support features
10529 * which are specific to the HFS & HFS Plus volume formats
10530 */
10531
10532
10533 /*
10534 * Obtain attribute information on objects in a directory while enumerating
10535 * the directory.
10536 */
10537 /* ARGSUSED */
10538 int
getdirentriesattr(proc_t p,struct getdirentriesattr_args * uap,int32_t * retval)10539 getdirentriesattr(proc_t p, struct getdirentriesattr_args *uap, int32_t *retval)
10540 {
10541 vnode_t vp;
10542 struct fileproc *fp;
10543 uio_t auio = NULL;
10544 int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
10545 uint32_t count = 0, savecount = 0;
10546 uint32_t newstate = 0;
10547 int error, eofflag = 0;
10548 off_t loff = 0;
10549 struct attrlist attributelist;
10550 vfs_context_t ctx = vfs_context_current();
10551 int fd = uap->fd;
10552 uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];
10553 kauth_action_t action;
10554
10555 AUDIT_ARG(fd, fd);
10556
10557 /* Get the attributes into kernel space */
10558 if ((error = copyin(uap->alist, (caddr_t)&attributelist, sizeof(attributelist)))) {
10559 return error;
10560 }
10561 if ((error = copyin(uap->count, (caddr_t)&count, sizeof(count)))) {
10562 return error;
10563 }
10564 savecount = count;
10565
10566 get_from_fd:
10567 if ((error = fp_getfvp(p, fd, &fp, &vp))) {
10568 return error;
10569 }
10570
10571 vn_offset_lock(fp->fp_glob);
10572 if (((vnode_t)fp_get_data(fp)) != vp) {
10573 vn_offset_unlock(fp->fp_glob);
10574 file_drop(fd);
10575 goto get_from_fd;
10576 }
10577
10578 if ((fp->fp_glob->fg_flag & FREAD) == 0) {
10579 AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
10580 error = EBADF;
10581 goto out;
10582 }
10583
10584
10585 #if CONFIG_MACF
10586 error = mac_file_check_change_offset(vfs_context_ucred(ctx),
10587 fp->fp_glob);
10588 if (error) {
10589 goto out;
10590 }
10591 #endif
10592
10593
10594 if ((error = vnode_getwithref(vp))) {
10595 goto out;
10596 }
10597
10598 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
10599
10600 #if CONFIG_UNION_MOUNTS
10601 unionread:
10602 #endif /* CONFIG_UNION_MOUNTS */
10603 if (vp->v_type != VDIR) {
10604 (void)vnode_put(vp);
10605 error = EINVAL;
10606 goto out;
10607 }
10608
10609 #if CONFIG_MACF
10610 error = mac_vnode_check_readdir(ctx, vp);
10611 if (error != 0) {
10612 (void)vnode_put(vp);
10613 goto out;
10614 }
10615 #endif /* MAC */
10616
10617 /* set up the uio structure which will contain the users return buffer */
10618 loff = fp->fp_glob->fg_offset;
10619 auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
10620 uio_addiov(auio, uap->buffer, uap->buffersize);
10621
10622 /*
10623 * If the only item requested is file names, we can let that past with
10624 * just LIST_DIRECTORY. If they want any other attributes, that means
10625 * they need SEARCH as well.
10626 */
10627 action = KAUTH_VNODE_LIST_DIRECTORY;
10628 if ((attributelist.commonattr & ~ATTR_CMN_NAME) ||
10629 attributelist.fileattr || attributelist.dirattr) {
10630 action |= KAUTH_VNODE_SEARCH;
10631 }
10632
10633 if ((error = vnode_authorize(vp, NULL, action, ctx)) == 0) {
10634 /* Believe it or not, uap->options only has 32-bits of valid
10635 * info, so truncate before extending again */
10636
10637 error = VNOP_READDIRATTR(vp, &attributelist, auio, count,
10638 (uint32_t)uap->options, &newstate, &eofflag, &count, ctx);
10639 }
10640
10641 if (error) {
10642 (void) vnode_put(vp);
10643 goto out;
10644 }
10645
10646 #if CONFIG_UNION_MOUNTS
10647 /*
10648 * If we've got the last entry of a directory in a union mount
10649 * then reset the eofflag and pretend there's still more to come.
10650 * The next call will again set eofflag and the buffer will be empty,
10651 * so traverse to the underlying directory and do the directory
10652 * read there.
10653 */
10654 if (eofflag && vp->v_mount->mnt_flag & MNT_UNION) {
10655 if (uio_resid(auio) < (user_ssize_t) uap->buffersize) { // Got some entries
10656 eofflag = 0;
10657 } else { // Empty buffer
10658 vnode_t uvp;
10659 if (lookup_traverse_union(vp, &uvp, ctx) == 0) {
10660 if (vnode_ref_ext(uvp, fp->fp_glob->fg_flag & O_EVTONLY, 0) == 0) {
10661 fp_set_data(fp, uvp);
10662 fp->fp_glob->fg_offset = 0; // reset index for new dir
10663 count = savecount;
10664 vnode_rele_internal(vp, fp->fp_glob->fg_flag & O_EVTONLY, 0, 0);
10665 vnode_put(vp);
10666 vp = uvp;
10667 goto unionread;
10668 } else {
10669 /* could not get a ref, can't replace in fd */
10670 vnode_put(uvp);
10671 }
10672 }
10673 }
10674 }
10675 #endif /* CONFIG_UNION_MOUNTS */
10676
10677 (void)vnode_put(vp);
10678
10679 if (error) {
10680 goto out;
10681 }
10682 fp->fp_glob->fg_offset = uio_offset(auio); /* should be multiple of dirent, not variable */
10683
10684 if ((error = copyout((caddr_t) &count, uap->count, sizeof(count)))) {
10685 goto out;
10686 }
10687 if ((error = copyout((caddr_t) &newstate, uap->newstate, sizeof(newstate)))) {
10688 goto out;
10689 }
10690 if ((error = copyout((caddr_t) &loff, uap->basep, sizeof(loff)))) {
10691 goto out;
10692 }
10693
10694 *retval = eofflag; /* similar to getdirentries */
10695 error = 0;
10696 out:
10697 vn_offset_unlock(fp->fp_glob);
10698 file_drop(fd);
10699 return error; /* return error earlier, an retval of 0 or 1 now */
10700 } /* end of getdirentriesattr system call */
10701
10702 /*
10703 * Exchange data between two files
10704 */
10705
10706 /* ARGSUSED */
10707 int
exchangedata(__unused proc_t p,struct exchangedata_args * uap,__unused int32_t * retval)10708 exchangedata(__unused proc_t p, struct exchangedata_args *uap, __unused int32_t *retval)
10709 {
10710 struct nameidata fnd, snd;
10711 vfs_context_t ctx = vfs_context_current();
10712 vnode_t fvp;
10713 vnode_t svp;
10714 int error;
10715 u_int32_t nameiflags;
10716 char *fpath = NULL;
10717 char *spath = NULL;
10718 int flen = 0, slen = 0;
10719 int from_truncated = 0, to_truncated = 0;
10720 #if CONFIG_FSE
10721 fse_info f_finfo, s_finfo;
10722 #endif
10723
10724 nameiflags = 0;
10725 if ((uap->options & FSOPT_NOFOLLOW) == 0) {
10726 nameiflags |= FOLLOW;
10727 }
10728
10729 NDINIT(&fnd, LOOKUP, OP_EXCHANGEDATA, nameiflags | AUDITVNPATH1,
10730 UIO_USERSPACE, uap->path1, ctx);
10731
10732 error = namei(&fnd);
10733 if (error) {
10734 goto out2;
10735 }
10736
10737 nameidone(&fnd);
10738 fvp = fnd.ni_vp;
10739
10740 NDINIT(&snd, LOOKUP, OP_EXCHANGEDATA, CN_NBMOUNTLOOK | nameiflags | AUDITVNPATH2,
10741 UIO_USERSPACE, uap->path2, ctx);
10742
10743 error = namei(&snd);
10744 if (error) {
10745 vnode_put(fvp);
10746 goto out2;
10747 }
10748 nameidone(&snd);
10749 svp = snd.ni_vp;
10750
10751 /*
10752 * if the files are the same, return an inval error
10753 */
10754 if (svp == fvp) {
10755 error = EINVAL;
10756 goto out;
10757 }
10758
10759 /*
10760 * if the files are on different volumes, return an error
10761 */
10762 if (svp->v_mount != fvp->v_mount) {
10763 error = EXDEV;
10764 goto out;
10765 }
10766
10767 /* If they're not files, return an error */
10768 if ((vnode_isreg(fvp) == 0) || (vnode_isreg(svp) == 0)) {
10769 error = EINVAL;
10770 goto out;
10771 }
10772
10773 #if CONFIG_MACF
10774 error = mac_vnode_check_exchangedata(ctx,
10775 fvp, svp);
10776 if (error) {
10777 goto out;
10778 }
10779 #endif
10780 if (((error = vnode_authorize(fvp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0) ||
10781 ((error = vnode_authorize(svp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0)) {
10782 goto out;
10783 }
10784
10785 if (
10786 #if CONFIG_FSE
10787 need_fsevent(FSE_EXCHANGE, fvp) ||
10788 #endif
10789 kauth_authorize_fileop_has_listeners()) {
10790 GET_PATH(fpath);
10791 GET_PATH(spath);
10792
10793 flen = safe_getpath(fvp, NULL, fpath, MAXPATHLEN, &from_truncated);
10794 slen = safe_getpath(svp, NULL, spath, MAXPATHLEN, &to_truncated);
10795
10796 #if CONFIG_FSE
10797 get_fse_info(fvp, &f_finfo, ctx);
10798 get_fse_info(svp, &s_finfo, ctx);
10799 if (from_truncated || to_truncated) {
10800 // set it here since only the f_finfo gets reported up to user space
10801 f_finfo.mode |= FSE_TRUNCATED_PATH;
10802 }
10803 #endif
10804 }
10805 /* Ok, make the call */
10806 error = VNOP_EXCHANGE(fvp, svp, 0, ctx);
10807
10808 if (error == 0) {
10809 const char *tmpname;
10810
10811 if (fpath != NULL && spath != NULL) {
10812 /* call out to allow 3rd party notification of exchangedata.
10813 * Ignore result of kauth_authorize_fileop call.
10814 */
10815 kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_EXCHANGE,
10816 (uintptr_t)fpath, (uintptr_t)spath);
10817 }
10818 name_cache_lock();
10819
10820 tmpname = fvp->v_name;
10821 fvp->v_name = svp->v_name;
10822 svp->v_name = tmpname;
10823
10824 if (fvp->v_parent != svp->v_parent) {
10825 vnode_t tmp;
10826
10827 tmp = fvp->v_parent;
10828 fvp->v_parent = svp->v_parent;
10829 svp->v_parent = tmp;
10830 }
10831 name_cache_unlock();
10832
10833 #if CONFIG_FSE
10834 if (fpath != NULL && spath != NULL) {
10835 add_fsevent(FSE_EXCHANGE, ctx,
10836 FSE_ARG_STRING, flen, fpath,
10837 FSE_ARG_FINFO, &f_finfo,
10838 FSE_ARG_STRING, slen, spath,
10839 FSE_ARG_FINFO, &s_finfo,
10840 FSE_ARG_DONE);
10841 }
10842 #endif
10843 }
10844
10845 out:
10846 if (fpath != NULL) {
10847 RELEASE_PATH(fpath);
10848 }
10849 if (spath != NULL) {
10850 RELEASE_PATH(spath);
10851 }
10852 vnode_put(svp);
10853 vnode_put(fvp);
10854 out2:
10855 return error;
10856 }
10857
10858 /*
10859 * Return (in MB) the amount of freespace on the given vnode's volume.
10860 */
10861 uint32_t freespace_mb(vnode_t vp);
10862
10863 uint32_t
freespace_mb(vnode_t vp)10864 freespace_mb(vnode_t vp)
10865 {
10866 vfs_update_vfsstat(vp->v_mount, vfs_context_current(), VFS_USER_EVENT);
10867 return (uint32_t)(((uint64_t)vp->v_mount->mnt_vfsstat.f_bavail *
10868 vp->v_mount->mnt_vfsstat.f_bsize) >> 20);
10869 }
10870
10871 #if CONFIG_SEARCHFS
10872
10873 /* ARGSUSED */
10874
10875 int
searchfs(proc_t p,struct searchfs_args * uap,__unused int32_t * retval)10876 searchfs(proc_t p, struct searchfs_args *uap, __unused int32_t *retval)
10877 {
10878 vnode_t vp, tvp;
10879 int i, error = 0;
10880 int fserror = 0;
10881 struct nameidata nd;
10882 struct user64_fssearchblock searchblock;
10883 struct searchstate *state;
10884 struct attrlist *returnattrs;
10885 struct timeval timelimit;
10886 void *searchparams1, *searchparams2;
10887 uio_t auio = NULL;
10888 int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
10889 uint32_t nummatches;
10890 size_t mallocsize;
10891 uint32_t nameiflags;
10892 vfs_context_t ctx = vfs_context_current();
10893 uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];
10894
10895 /* Start by copying in fsearchblock parameter list */
10896 if (IS_64BIT_PROCESS(p)) {
10897 error = copyin(uap->searchblock, (caddr_t) &searchblock, sizeof(searchblock));
10898 timelimit.tv_sec = searchblock.timelimit.tv_sec;
10899 timelimit.tv_usec = searchblock.timelimit.tv_usec;
10900 } else {
10901 struct user32_fssearchblock tmp_searchblock;
10902
10903 error = copyin(uap->searchblock, (caddr_t) &tmp_searchblock, sizeof(tmp_searchblock));
10904 // munge into 64-bit version
10905 searchblock.returnattrs = CAST_USER_ADDR_T(tmp_searchblock.returnattrs);
10906 searchblock.returnbuffer = CAST_USER_ADDR_T(tmp_searchblock.returnbuffer);
10907 searchblock.returnbuffersize = tmp_searchblock.returnbuffersize;
10908 searchblock.maxmatches = tmp_searchblock.maxmatches;
10909 /*
10910 * These casts are safe. We will promote the tv_sec into a 64 bit long if necessary
10911 * from a 32 bit long, and tv_usec is already a signed 32 bit int.
10912 */
10913 timelimit.tv_sec = (__darwin_time_t) tmp_searchblock.timelimit.tv_sec;
10914 timelimit.tv_usec = (__darwin_useconds_t) tmp_searchblock.timelimit.tv_usec;
10915 searchblock.searchparams1 = CAST_USER_ADDR_T(tmp_searchblock.searchparams1);
10916 searchblock.sizeofsearchparams1 = tmp_searchblock.sizeofsearchparams1;
10917 searchblock.searchparams2 = CAST_USER_ADDR_T(tmp_searchblock.searchparams2);
10918 searchblock.sizeofsearchparams2 = tmp_searchblock.sizeofsearchparams2;
10919 searchblock.searchattrs = tmp_searchblock.searchattrs;
10920 }
10921 if (error) {
10922 return error;
10923 }
10924
10925 /* Do a sanity check on sizeofsearchparams1 and sizeofsearchparams2.
10926 */
10927 if (searchblock.sizeofsearchparams1 > SEARCHFS_MAX_SEARCHPARMS ||
10928 searchblock.sizeofsearchparams2 > SEARCHFS_MAX_SEARCHPARMS) {
10929 return EINVAL;
10930 }
10931
10932 /* Now malloc a big bunch of space to hold the search parameters, the attrlists and the search state. */
10933 /* It all has to do into local memory and it's not that big so we might as well put it all together. */
10934 /* Searchparams1 shall be first so we might as well use that to hold the base address of the allocated*/
10935 /* block. */
10936 /* */
10937 /* NOTE: we allocate an extra 8 bytes to account for the difference in size of the searchstate */
10938 /* due to the changes in rdar://problem/12438273. That way if a 3rd party file system */
10939 /* assumes the size is still 556 bytes it will continue to work */
10940
10941 mallocsize = searchblock.sizeofsearchparams1 + searchblock.sizeofsearchparams2 +
10942 sizeof(struct attrlist) + sizeof(struct searchstate) + (2 * sizeof(uint32_t));
10943
10944 searchparams1 = kalloc_data(mallocsize, Z_WAITOK);
10945
10946 /* Now set up the various pointers to the correct place in our newly allocated memory */
10947
10948 searchparams2 = (void *) (((caddr_t) searchparams1) + searchblock.sizeofsearchparams1);
10949 returnattrs = (struct attrlist *) (((caddr_t) searchparams2) + searchblock.sizeofsearchparams2);
10950 state = (struct searchstate *) (((caddr_t) returnattrs) + sizeof(struct attrlist));
10951
10952 /* Now copy in the stuff given our local variables. */
10953
10954 if ((error = copyin(searchblock.searchparams1, searchparams1, searchblock.sizeofsearchparams1))) {
10955 goto freeandexit;
10956 }
10957
10958 if ((error = copyin(searchblock.searchparams2, searchparams2, searchblock.sizeofsearchparams2))) {
10959 goto freeandexit;
10960 }
10961
10962 if ((error = copyin(searchblock.returnattrs, (caddr_t) returnattrs, sizeof(struct attrlist)))) {
10963 goto freeandexit;
10964 }
10965
10966 if ((error = copyin(uap->state, (caddr_t) state, sizeof(struct searchstate)))) {
10967 goto freeandexit;
10968 }
10969
10970 /*
10971 * When searching a union mount, need to set the
10972 * start flag at the first call on each layer to
10973 * reset state for the new volume.
10974 */
10975 if (uap->options & SRCHFS_START) {
10976 state->ss_union_layer = 0;
10977 } else {
10978 uap->options |= state->ss_union_flags;
10979 }
10980 state->ss_union_flags = 0;
10981
10982 /*
10983 * Because searchparams1 and searchparams2 may contain an ATTR_CMN_NAME search parameter,
10984 * which is passed in with an attrreference_t, we need to inspect the buffer manually here.
10985 * The KPI does not provide us the ability to pass in the length of the buffers searchparams1
10986 * and searchparams2. To obviate the need for all searchfs-supporting filesystems to
10987 * validate the user-supplied data offset of the attrreference_t, we'll do it here.
10988 */
10989
10990 if (searchblock.searchattrs.commonattr & ATTR_CMN_NAME) {
10991 attrreference_t* string_ref;
10992 u_int32_t* start_length;
10993 user64_size_t param_length;
10994
10995 /* validate searchparams1 */
10996 param_length = searchblock.sizeofsearchparams1;
10997 /* skip the word that specifies length of the buffer */
10998 start_length = (u_int32_t*) searchparams1;
10999 start_length = start_length + 1;
11000 string_ref = (attrreference_t*) start_length;
11001
11002 /* ensure no negative offsets or too big offsets */
11003 if (string_ref->attr_dataoffset < 0) {
11004 error = EINVAL;
11005 goto freeandexit;
11006 }
11007 if (string_ref->attr_length > MAXPATHLEN) {
11008 error = EINVAL;
11009 goto freeandexit;
11010 }
11011
11012 /* Check for pointer overflow in the string ref */
11013 if (((char*) string_ref + string_ref->attr_dataoffset) < (char*) string_ref) {
11014 error = EINVAL;
11015 goto freeandexit;
11016 }
11017
11018 if (((char*) string_ref + string_ref->attr_dataoffset) > ((char*)searchparams1 + param_length)) {
11019 error = EINVAL;
11020 goto freeandexit;
11021 }
11022 if (((char*)string_ref + string_ref->attr_dataoffset + string_ref->attr_length) > ((char*)searchparams1 + param_length)) {
11023 error = EINVAL;
11024 goto freeandexit;
11025 }
11026 }
11027
11028 /* set up the uio structure which will contain the users return buffer */
11029 auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
11030 uio_addiov(auio, searchblock.returnbuffer, searchblock.returnbuffersize);
11031
11032 nameiflags = 0;
11033 if ((uap->options & FSOPT_NOFOLLOW) == 0) {
11034 nameiflags |= FOLLOW;
11035 }
11036 NDINIT(&nd, LOOKUP, OP_SEARCHFS, nameiflags | AUDITVNPATH1,
11037 UIO_USERSPACE, uap->path, ctx);
11038
11039 error = namei(&nd);
11040 if (error) {
11041 goto freeandexit;
11042 }
11043 vp = nd.ni_vp;
11044 nameidone(&nd);
11045
11046 /*
11047 * Switch to the root vnode for the volume
11048 */
11049 error = VFS_ROOT(vnode_mount(vp), &tvp, ctx);
11050 vnode_put(vp);
11051 if (error) {
11052 goto freeandexit;
11053 }
11054 vp = tvp;
11055
11056 #if CONFIG_UNION_MOUNTS
11057 /*
11058 * If it's a union mount, the path lookup takes
11059 * us to the top layer. But we may need to descend
11060 * to a lower layer. For non-union mounts the layer
11061 * is always zero.
11062 */
11063 for (i = 0; i < (int) state->ss_union_layer; i++) {
11064 if ((vp->v_mount->mnt_flag & MNT_UNION) == 0) {
11065 break;
11066 }
11067 tvp = vp;
11068 vp = vp->v_mount->mnt_vnodecovered;
11069 if (vp == NULL) {
11070 vnode_put(tvp);
11071 error = ENOENT;
11072 goto freeandexit;
11073 }
11074 error = vnode_getwithref(vp);
11075 vnode_put(tvp);
11076 if (error) {
11077 goto freeandexit;
11078 }
11079 }
11080 #endif /* CONFIG_UNION_MOUNTS */
11081
11082 #if CONFIG_MACF
11083 error = mac_vnode_check_searchfs(ctx, vp, returnattrs, &searchblock.searchattrs);
11084 if (error) {
11085 vnode_put(vp);
11086 goto freeandexit;
11087 }
11088 #endif
11089
11090
11091 /*
11092 * If searchblock.maxmatches == 0, then skip the search. This has happened
	 * before and sometimes the underlying code doesn't deal with it well.
11094 */
11095 if (searchblock.maxmatches == 0) {
11096 nummatches = 0;
11097 goto saveandexit;
11098 }
11099
11100 /*
11101 * Allright, we have everything we need, so lets make that call.
11102 *
11103 * We keep special track of the return value from the file system:
11104 * EAGAIN is an acceptable error condition that shouldn't keep us
11105 * from copying out any results...
11106 */
11107
11108 fserror = VNOP_SEARCHFS(vp,
11109 searchparams1,
11110 searchparams2,
11111 &searchblock.searchattrs,
11112 (uint32_t)searchblock.maxmatches,
11113 &timelimit,
11114 returnattrs,
11115 &nummatches,
11116 (uint32_t)uap->scriptcode,
11117 (uint32_t)uap->options,
11118 auio,
11119 (struct searchstate *) &state->ss_fsstate,
11120 ctx);
11121
11122 #if CONFIG_UNION_MOUNTS
11123 /*
11124 * If it's a union mount we need to be called again
11125 * to search the mounted-on filesystem.
11126 */
11127 if ((vp->v_mount->mnt_flag & MNT_UNION) && fserror == 0) {
11128 state->ss_union_flags = SRCHFS_START;
11129 state->ss_union_layer++; // search next layer down
11130 fserror = EAGAIN;
11131 }
11132 #endif /* CONFIG_UNION_MOUNTS */
11133
11134 saveandexit:
11135
11136 vnode_put(vp);
11137
11138 /* Now copy out the stuff that needs copying out. That means the number of matches, the
	 * search state. Everything was already put into the return buffer by the vop call. */
11140
11141 if ((error = copyout((caddr_t) state, uap->state, sizeof(struct searchstate))) != 0) {
11142 goto freeandexit;
11143 }
11144
11145 if ((error = suulong(uap->nummatches, (uint64_t)nummatches)) != 0) {
11146 goto freeandexit;
11147 }
11148
11149 error = fserror;
11150
11151 freeandexit:
11152
11153 kfree_data(searchparams1, mallocsize);
11154
11155 return error;
11156 } /* end of searchfs system call */
11157
11158 #else /* CONFIG_SEARCHFS */
11159
/* Stub returned when searchfs support is compiled out (CONFIG_SEARCHFS=0). */
int
searchfs(__unused proc_t p, __unused struct searchfs_args *uap, __unused int32_t *retval)
{
	return ENOTSUP;
}
11165
11166 #endif /* CONFIG_SEARCHFS */
11167
11168
11169 #if CONFIG_DATALESS_FILES
11170
11171 /*
11172 * === Namespace Resolver Up-call Mechanism ===
11173 *
11174 * When I/O is performed to a dataless file or directory (read, write,
11175 * lookup-in, etc.), the file system performs an upcall to the namespace
11176 * resolver (filecoordinationd) to materialize the object.
11177 *
11178 * We need multiple up-calls to be in flight at once, and we need these
11179 * up-calls to be interruptible, thus the following implementation:
11180 *
11181 * => The nspace_resolver_request represents the in-kernel request state.
11182 * It contains a request ID, storage space for the errno code returned
11183 * by filecoordinationd, and flags.
11184 *
11185 * => The request ID is simply a global monotonically incrementing 32-bit
11186 * number. Outstanding requests are stored in a hash table, and the
11187 * hash function is extremely simple.
11188 *
11189 * => When an upcall is to be made to filecoordinationd, a request structure
11190 * is allocated on the stack (it is small, and needs to live only during
11191 * the duration of the call to resolve_nspace_item_ext()). It is
11192 * initialized and inserted into the table. Some backpressure from
 *    filecoordinationd is applied by limiting the number of entries that
11194 * can be inserted into the table (and thus limiting the number of
11195 * outstanding requests issued to filecoordinationd); waiting for an
11196 * available slot is interruptible.
11197 *
11198 * => Once the request has been inserted into the table, the up-call is made
11199 * to filecoordinationd via a MiG-generated stub. The up-call returns
11200 * immediately and filecoordinationd processes the request asynchronously.
11201 *
 * => The caller now waits for the request to complete.  This is achieved by
11203 * sleeping on the address of the request structure and waiting for
11204 * filecoordinationd to mark the request structure as complete. This
11205 * is an interruptible sleep call; if interrupted, the request structure
11206 * is removed from the table and EINTR is returned to the caller. If
11207 * this occurs, an advisory up-call is made to filecoordinationd with
11208 * the request ID to indicate that the request can be aborted or
11209 * de-prioritized at the discretion of filecoordinationd.
11210 *
11211 * => When filecoordinationd has completed the request, it signals completion
11212 * by writing to the vfs.nspace.complete sysctl node. Only a process
11213 * decorated as a namespace resolver can write to this sysctl node. The
11214 * value is a request ID / errno tuple passed as an array of 2 uint32_t's.
11215 * The request ID is looked up in the table, and if the request is found,
11216 * the error code is stored in the request structure and a wakeup()
11217 * issued on the address of the request structure. If the request is not
11218 * found, we simply drop the completion notification, assuming that the
11219 * caller was interrupted.
11220 *
11221 * => When the waiting thread wakes up, it extracts the error code from the
11222 * request structure, removes the request from the table, and returns the
11223 * error code to the calling function. Fini!
11224 */
11225
/*
 * In-kernel state for one materialization request issued to the
 * namespace resolver (filecoordinationd).  Per the design notes above,
 * the structure lives on the requesting thread's stack and is linked
 * into the request hash table while the up-call is outstanding.
 */
struct nspace_resolver_request {
	LIST_ENTRY(nspace_resolver_request) r_hashlink; /* hash-bucket linkage */
	vnode_t r_vp;           /* vnode being materialized */
	uint32_t r_req_id;      /* matches completions to requests */
	int r_resolver_error;   /* errno reported by the resolver */
	int r_flags;            /* RRF_* flags below */
};

/* r_flags: resolver has completed; r_resolver_error is valid */
#define RRF_COMPLETE 0x0001
11235
/*
 * Return the next namespace-resolver request ID.
 *
 * OSAddAtomic() returns the value *before* the increment, so IDs start
 * at 0 and increase monotonically, wrapping at UINT32_MAX.
 */
static uint32_t
next_nspace_req_id(void)
{
	static uint32_t next_req_id;

	return OSAddAtomic(1, &next_req_id);
}
11243
#define NSPACE_RESOLVER_REQ_HASHSIZE    32      /* XXX tune */
#define NSPACE_RESOLVER_MAX_OUTSTANDING 256     /* XXX tune */

/* Hash table of outstanding resolver requests, keyed by request ID. */
static LIST_HEAD(nspace_resolver_requesthead,
    nspace_resolver_request) * nspace_resolver_request_hashtbl;
static u_long nspace_resolver_request_hashmask;
/* Number of requests currently in the table (used for back-pressure). */
static u_int nspace_resolver_request_count;
/* True when a thread is sleeping, waiting for a table slot to free up. */
static bool nspace_resolver_request_wait_slot;
static LCK_GRP_DECLARE(nspace_resolver_request_lck_grp, "file namespace resolver");
/* Protects the hash table, the request count, and nspace_resolver_proc. */
static LCK_MTX_DECLARE(nspace_resolver_request_hash_mutex,
    &nspace_resolver_request_lck_grp);

#define NSPACE_REQ_LOCK() \
	lck_mtx_lock(&nspace_resolver_request_hash_mutex)
#define NSPACE_REQ_UNLOCK() \
	lck_mtx_unlock(&nspace_resolver_request_hash_mutex)

/* Map a request ID to its hash bucket (hashmask is a power-of-two minus 1). */
#define NSPACE_RESOLVER_HASH(req_id) \
	(&nspace_resolver_request_hashtbl[(req_id) & \
	nspace_resolver_request_hashmask])
11264
11265 static struct nspace_resolver_request *
nspace_resolver_req_lookup(uint32_t req_id)11266 nspace_resolver_req_lookup(uint32_t req_id)
11267 {
11268 struct nspace_resolver_requesthead *bucket;
11269 struct nspace_resolver_request *req;
11270
11271 bucket = NSPACE_RESOLVER_HASH(req_id);
11272 LIST_FOREACH(req, bucket, r_hashlink) {
11273 if (req->r_req_id == req_id) {
11274 return req;
11275 }
11276 }
11277
11278 return NULL;
11279 }
11280
11281 static int
nspace_resolver_req_add(struct nspace_resolver_request * req)11282 nspace_resolver_req_add(struct nspace_resolver_request *req)
11283 {
11284 struct nspace_resolver_requesthead *bucket;
11285 int error;
11286
11287 while (nspace_resolver_request_count >=
11288 NSPACE_RESOLVER_MAX_OUTSTANDING) {
11289 nspace_resolver_request_wait_slot = true;
11290 error = msleep(&nspace_resolver_request_count,
11291 &nspace_resolver_request_hash_mutex,
11292 PVFS | PCATCH, "nspacerq", NULL);
11293 if (error) {
11294 return error;
11295 }
11296 }
11297
11298 bucket = NSPACE_RESOLVER_HASH(req->r_req_id);
11299 #if DIAGNOSTIC
11300 assert(nspace_resolver_req_lookup(req->r_req_id) == NULL);
11301 #endif /* DIAGNOSTIC */
11302 LIST_INSERT_HEAD(bucket, req, r_hashlink);
11303 nspace_resolver_request_count++;
11304
11305 return 0;
11306 }
11307
11308 static void
nspace_resolver_req_remove(struct nspace_resolver_request * req)11309 nspace_resolver_req_remove(struct nspace_resolver_request *req)
11310 {
11311 struct nspace_resolver_requesthead *bucket;
11312
11313 bucket = NSPACE_RESOLVER_HASH(req->r_req_id);
11314 #if DIAGNOSTIC
11315 assert(nspace_resolver_req_lookup(req->r_req_id) != NULL);
11316 #endif /* DIAGNOSTIC */
11317 LIST_REMOVE(req, r_hashlink);
11318 nspace_resolver_request_count--;
11319
11320 if (nspace_resolver_request_wait_slot) {
11321 nspace_resolver_request_wait_slot = false;
11322 wakeup(&nspace_resolver_request_count);
11323 }
11324 }
11325
11326 static void
nspace_resolver_req_cancel(uint32_t req_id)11327 nspace_resolver_req_cancel(uint32_t req_id)
11328 {
11329 kern_return_t kr;
11330 mach_port_t mp;
11331
11332 // Failures here aren't fatal -- the cancellation message
11333 // sent to the resolver is merely advisory.
11334
11335 kr = host_get_filecoordinationd_port(host_priv_self(), &mp);
11336 if (kr != KERN_SUCCESS || !IPC_PORT_VALID(mp)) {
11337 return;
11338 }
11339
11340 kr = send_nspace_resolve_cancel(mp, req_id);
11341 if (kr != KERN_SUCCESS) {
11342 os_log_error(OS_LOG_DEFAULT,
11343 "NSPACE send_nspace_resolve_cancel failure: %d", kr);
11344 }
11345
11346 ipc_port_release_send(mp);
11347 }
11348
/*
 * Wait (interruptibly) for the resolver to complete REQ.
 *
 * Sleeps on the address of the request structure until
 * nspace_resolver_req_completed() sets RRF_COMPLETE.  ERESTART from
 * msleep() simply re-enters the sleep; any other error fails the
 * request locally (EINTR stays EINTR, everything else maps to
 * ETIMEDOUT) and triggers an advisory cancel up-call.
 *
 * Always removes REQ from the table before returning.  REQ lives on
 * the caller's stack, so r_resolver_error may be read after the lock
 * is dropped.
 */
static int
nspace_resolver_req_wait(struct nspace_resolver_request *req)
{
	bool send_cancel_message = false;
	int error;

	NSPACE_REQ_LOCK();

	while ((req->r_flags & RRF_COMPLETE) == 0) {
		error = msleep(req, &nspace_resolver_request_hash_mutex,
		    PVFS | PCATCH, "nspace", NULL);
		if (error && error != ERESTART) {
			/* Interrupted: record a local error for the caller. */
			req->r_resolver_error = (error == EINTR) ? EINTR :
			    ETIMEDOUT;
			send_cancel_message = true;
			break;
		}
	}

	nspace_resolver_req_remove(req);

	NSPACE_REQ_UNLOCK();

	if (send_cancel_message) {
		/* Advisory only; failures are ignored. */
		nspace_resolver_req_cancel(req->r_req_id);
	}

	return req->r_resolver_error;
}
11378
/*
 * Mark REQ complete with RESOLVER_ERROR and wake its waiter.
 *
 * Caller must hold NSPACE_REQ_LOCK(); the flag is set before the
 * wakeup so the waiter's RRF_COMPLETE re-check (done under the same
 * lock) observes the completion.
 */
static void
nspace_resolver_req_mark_complete(
	struct nspace_resolver_request *req,
	int resolver_error)
{
	req->r_resolver_error = resolver_error;
	req->r_flags |= RRF_COMPLETE;
	wakeup(req);
}
11388
/*
 * Handle a completion notification from the resolver for REQ_ID.
 *
 * Looks up the outstanding request; if it is not found the
 * notification is silently dropped (the requester was most likely
 * interrupted).  If the resolver reported success and supplied an
 * ORIG_GENCOUNT, the vnode's recursive gencount is re-checked (under
 * the mount rename lock, to hold off concurrent renames) and the
 * waiter is given EBUSY if the directory changed during
 * materialization.  Finally the request is marked complete and its
 * waiter woken.
 */
static void
nspace_resolver_req_completed(uint32_t req_id, int resolver_error, uint64_t orig_gencount)
{
	struct nspace_resolver_request *req;

	NSPACE_REQ_LOCK();

	// If we don't find the request corresponding to our req_id,
	// just drop the completion signal on the floor; it's likely
	// that the requester interrupted with a signal.

	req = nspace_resolver_req_lookup(req_id);
	if (req) {
		mount_t locked_mp = NULL;

		locked_mp = req->r_vp->v_mount;
		mount_ref(locked_mp, 0);
		mount_lock_renames(locked_mp);

		//
		// if the resolver isn't already returning an error and we have an
		// orig_gencount, then get an iocount on the request vnode and check
		// that the gencount on req->r_vp has not changed.
		//
		// note: a ref was taken on req->r_vp when the request was created
		// and that ref will be dropped by that thread when it wakes up.
		//
		if (resolver_error == 0 &&
		    orig_gencount != 0 &&
		    vnode_getwithref(req->r_vp) == 0) {
			struct vnode_attr va;
			uint64_t cur_gencount;

			VATTR_INIT(&va);
			VATTR_WANTED(&va, va_recursive_gencount);

			// cur_gencount == 0 (attr unavailable) skips the comparison below
			if (vnode_getattr(req->r_vp, &va, vfs_context_kernel()) == 0) {
				cur_gencount = va.va_recursive_gencount;
			} else {
				cur_gencount = 0;
			}

			if (resolver_error == 0 && cur_gencount && orig_gencount && cur_gencount != orig_gencount) {
				printf("nspace.complete: gencount changed! (orig %llu cur %llu)\n", orig_gencount, cur_gencount);

				// this error will be returned to the thread that initiated the
				// materialization of req->r_vp.
				resolver_error = EBUSY;

				// note: we explicitly do not return an error to the caller (i.e.
				// the thread that did the materialization) because they said they
				// don't want one.
			}

			vnode_put(req->r_vp);
		}

		mount_unlock_renames(locked_mp);
		mount_drop(locked_mp, 0);

		nspace_resolver_req_mark_complete(req, resolver_error);
	}

	NSPACE_REQ_UNLOCK();

	return;
}
11456
/* The process currently decorated as the namespace resolver (or NULL). */
static struct proc *nspace_resolver_proc;

/*
 * Report whether P is the registered namespace resolver.
 * Sets *IS_RESOLVER to 1 or 0; always returns 0.
 */
static int
nspace_resolver_get_proc_state(struct proc *p, int *is_resolver)
{
	*is_resolver = ((p->p_lflag & P_LNSPACE_RESOLVER) &&
	    p == nspace_resolver_proc) ? 1 : 0;
	return 0;
}
11466
11467 static boolean_t vfs_context_is_dataless_resolver(vfs_context_t);
11468
11469 static int
nspace_resolver_set_proc_state(struct proc * p,int is_resolver)11470 nspace_resolver_set_proc_state(struct proc *p, int is_resolver)
11471 {
11472 vfs_context_t ctx = vfs_context_current();
11473 int error = 0;
11474
11475 //
11476 // The system filecoordinationd runs as uid == 0. This also
11477 // has the nice side-effect of filtering out filecoordinationd
11478 // running in the simulator.
11479 //
11480 if (!vfs_context_issuser(ctx) ||
11481 !vfs_context_is_dataless_resolver(ctx)) {
11482 return EPERM;
11483 }
11484
11485 if (is_resolver) {
11486 NSPACE_REQ_LOCK();
11487
11488 if (nspace_resolver_proc == NULL) {
11489 proc_lock(p);
11490 p->p_lflag |= P_LNSPACE_RESOLVER;
11491 proc_unlock(p);
11492 nspace_resolver_proc = p;
11493 } else {
11494 error = EBUSY;
11495 }
11496
11497 NSPACE_REQ_UNLOCK();
11498 } else {
11499 // This is basically just like the exit case.
11500 // nspace_resolver_exited() will verify that the
11501 // process is the resolver, and will clear the
11502 // global.
11503 nspace_resolver_exited(p);
11504 }
11505
11506 return error;
11507 }
11508
11509 static int
nspace_materialization_get_proc_state(struct proc * p,int * is_prevented)11510 nspace_materialization_get_proc_state(struct proc *p, int *is_prevented)
11511 {
11512 if ((p->p_lflag & P_LNSPACE_RESOLVER) != 0 ||
11513 (p->p_vfs_iopolicy &
11514 P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES) == 0) {
11515 *is_prevented = 1;
11516 } else {
11517 *is_prevented = 0;
11518 }
11519 return 0;
11520 }
11521
11522 static int
nspace_materialization_set_proc_state(struct proc * p,int is_prevented)11523 nspace_materialization_set_proc_state(struct proc *p, int is_prevented)
11524 {
11525 if (p->p_lflag & P_LNSPACE_RESOLVER) {
11526 return is_prevented ? 0 : EBUSY;
11527 }
11528
11529 if (is_prevented) {
11530 OSBitAndAtomic16(~((uint16_t)P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES), &p->p_vfs_iopolicy);
11531 } else {
11532 OSBitOrAtomic16((uint16_t)P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES, &p->p_vfs_iopolicy);
11533 }
11534 return 0;
11535 }
11536
11537 static int
nspace_materialization_get_thread_state(int * is_prevented)11538 nspace_materialization_get_thread_state(int *is_prevented)
11539 {
11540 uthread_t ut = current_uthread();
11541
11542 *is_prevented = (ut->uu_flag & UT_NSPACE_NODATALESSFAULTS) ? 1 : 0;
11543 return 0;
11544 }
11545
11546 static int
nspace_materialization_set_thread_state(int is_prevented)11547 nspace_materialization_set_thread_state(int is_prevented)
11548 {
11549 uthread_t ut = current_uthread();
11550
11551 if (is_prevented) {
11552 ut->uu_flag |= UT_NSPACE_NODATALESSFAULTS;
11553 } else {
11554 ut->uu_flag &= ~UT_NSPACE_NODATALESSFAULTS;
11555 }
11556 return 0;
11557 }
11558
11559 /* the vfs.nspace branch */
11560 SYSCTL_NODE(_vfs, OID_AUTO, nspace, CTLFLAG_RW | CTLFLAG_LOCKED, NULL, "vfs nspace hinge");
11561
11562 static int
sysctl_nspace_resolver(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)11563 sysctl_nspace_resolver(__unused struct sysctl_oid *oidp,
11564 __unused void *arg1, __unused int arg2, struct sysctl_req *req)
11565 {
11566 struct proc *p = req->p;
11567 int new_value, old_value, changed = 0;
11568 int error;
11569
11570 error = nspace_resolver_get_proc_state(p, &old_value);
11571 if (error) {
11572 return error;
11573 }
11574
11575 error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
11576 &changed);
11577 if (error == 0 && changed) {
11578 error = nspace_resolver_set_proc_state(p, new_value);
11579 }
11580 return error;
11581 }
11582
11583 /* decorate this process as the dataless file resolver */
11584 SYSCTL_PROC(_vfs_nspace, OID_AUTO, resolver,
11585 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
11586 0, 0, sysctl_nspace_resolver, "I", "");
11587
11588 static int
sysctl_nspace_prevent_materialization(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)11589 sysctl_nspace_prevent_materialization(__unused struct sysctl_oid *oidp,
11590 __unused void *arg1, __unused int arg2, struct sysctl_req *req)
11591 {
11592 struct proc *p = req->p;
11593 int new_value, old_value, changed = 0;
11594 int error;
11595
11596 error = nspace_materialization_get_proc_state(p, &old_value);
11597 if (error) {
11598 return error;
11599 }
11600
11601 error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
11602 &changed);
11603 if (error == 0 && changed) {
11604 error = nspace_materialization_set_proc_state(p, new_value);
11605 }
11606 return error;
11607 }
11608
11609 /* decorate this process as not wanting to materialize dataless files */
11610 SYSCTL_PROC(_vfs_nspace, OID_AUTO, prevent_materialization,
11611 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
11612 0, 0, sysctl_nspace_prevent_materialization, "I", "");
11613
11614 static int
sysctl_nspace_thread_prevent_materialization(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)11615 sysctl_nspace_thread_prevent_materialization(__unused struct sysctl_oid *oidp,
11616 __unused void *arg1, __unused int arg2, struct sysctl_req *req)
11617 {
11618 int new_value, old_value, changed = 0;
11619 int error;
11620
11621 error = nspace_materialization_get_thread_state(&old_value);
11622 if (error) {
11623 return error;
11624 }
11625
11626 error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
11627 &changed);
11628 if (error == 0 && changed) {
11629 error = nspace_materialization_set_thread_state(new_value);
11630 }
11631 return error;
11632 }
11633
11634 /* decorate this thread as not wanting to materialize dataless files */
11635 SYSCTL_PROC(_vfs_nspace, OID_AUTO, thread_prevent_materialization,
11636 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
11637 0, 0, sysctl_nspace_thread_prevent_materialization, "I", "");
11638
/*
 * sysctl handler for vfs.nspace.complete: the resolver reports request
 * completions here.  The write payload is two uint32_t's (request ID,
 * errno), optionally followed by a uint64_t recursive gencount that was
 * captured when materialization began.  Only the process decorated as
 * the resolver may write; everyone else gets EPERM.
 */
static int
sysctl_nspace_complete(__unused struct sysctl_oid *oidp, __unused void *arg1,
    __unused int arg2, struct sysctl_req *req)
{
	struct proc *p = req->p;
	uint32_t req_status[2] = { 0, 0 };
	uint64_t gencount = 0;
	int error, is_resolver, changed = 0, gencount_changed;

	error = nspace_resolver_get_proc_state(p, &is_resolver);
	if (error) {
		return error;
	}

	if (!is_resolver) {
		return EPERM;
	}

	error = sysctl_io_opaque(req, req_status, sizeof(req_status),
	    &changed);
	if (error) {
		return error;
	}

	// get the gencount if it was passed
	// (a second sysctl_io_opaque consumes the next bytes of the
	// same write buffer, if the resolver supplied them)
	error = sysctl_io_opaque(req, &gencount, sizeof(gencount),
	    &gencount_changed);
	if (error) {
		gencount = 0;
		// we ignore the error because the gencount was optional
		error = 0;
	}

	/*
	 * req_status[0] is the req_id
	 *
	 * req_status[1] is the errno
	 */
	if (error == 0 && changed) {
		nspace_resolver_req_completed(req_status[0],
		    (int)req_status[1], gencount);
	}
	return error;
}
11683
11684 /* Resolver reports completed reqs here. */
11685 SYSCTL_PROC(_vfs_nspace, OID_AUTO, complete,
11686 CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
11687 0, 0, sysctl_nspace_complete, "-", "");
11688
11689 #endif /* CONFIG_DATALESS_FILES */
11690
11691 #if CONFIG_DATALESS_FILES
11692 #define __no_dataless_unused /* nothing */
11693 #else
11694 #define __no_dataless_unused __unused
11695 #endif
11696
/*
 * Decide whether CTX may materialize dataless files.
 *
 * Returns:
 *   0            materialization may proceed
 *   EJUSTRETURN  caller is a dataless manipulator: proceed as if the
 *                object were not dataless, without materializing it
 *   EDEADLK      materialization is prevented (the default, and always
 *                the result when CONFIG_DATALESS_FILES is off)
 */
int
vfs_context_dataless_materialization_is_prevented(
	vfs_context_t const ctx __no_dataless_unused)
{
#if CONFIG_DATALESS_FILES
	proc_t const p = vfs_context_proc(ctx);
	thread_t const t = vfs_context_thread(ctx);
	uthread_t const ut = t ? get_bsdthread_info(t) : NULL;

	/*
	 * Kernel context ==> return EDEADLK, as we would with any random
	 * process decorated as no-materialize.
	 */
	if (ctx == vfs_context_kernel()) {
		return EDEADLK;
	}

	/*
	 * If the process has the dataless-manipulation entitlement,
	 * materialization is prevented, and depending on the kind
	 * of file system operation, things get to proceed as if the
	 * object is not dataless.
	 */
	if (vfs_context_is_dataless_manipulator(ctx)) {
		return EJUSTRETURN;
	}

	/*
	 * Per-thread decorations override any process-wide decorations.
	 * (Foundation uses this, and this overrides even the dataless-
	 * manipulation entitlement so as to make API contracts consistent.)
	 */
	if (ut != NULL) {
		if (ut->uu_flag & UT_NSPACE_NODATALESSFAULTS) {
			return EDEADLK;
		}
		if (ut->uu_flag & UT_NSPACE_FORCEDATALESSFAULTS) {
			return 0;
		}
	}

	/*
	 * If the process's iopolicy specifies that dataless files
	 * can be materialized, then we let it go ahead.
	 */
	if (p->p_vfs_iopolicy & P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES) {
		return 0;
	}
#endif /* CONFIG_DATALESS_FILES */

	/*
	 * The default behavior is to not materialize dataless files;
	 * return to the caller that deadlock was detected.
	 */
	return EDEADLK;
}
11753
/*
 * Allocate the namespace-resolver request hash table.
 * No-op when CONFIG_DATALESS_FILES is off.
 */
void
nspace_resolver_init(void)
{
#if CONFIG_DATALESS_FILES
	nspace_resolver_request_hashtbl =
	    hashinit(NSPACE_RESOLVER_REQ_HASHSIZE,
	    M_VNODE /* XXX */, &nspace_resolver_request_hashmask);
#endif /* CONFIG_DATALESS_FILES */
}
11763
11764 void
nspace_resolver_exited(struct proc * p __no_dataless_unused)11765 nspace_resolver_exited(struct proc *p __no_dataless_unused)
11766 {
11767 #if CONFIG_DATALESS_FILES
11768 struct nspace_resolver_requesthead *bucket;
11769 struct nspace_resolver_request *req;
11770 u_long idx;
11771
11772 NSPACE_REQ_LOCK();
11773
11774 if ((p->p_lflag & P_LNSPACE_RESOLVER) &&
11775 p == nspace_resolver_proc) {
11776 for (idx = 0; idx <= nspace_resolver_request_hashmask; idx++) {
11777 bucket = &nspace_resolver_request_hashtbl[idx];
11778 LIST_FOREACH(req, bucket, r_hashlink) {
11779 nspace_resolver_req_mark_complete(req,
11780 ETIMEDOUT);
11781 }
11782 }
11783 nspace_resolver_proc = NULL;
11784 }
11785
11786 NSPACE_REQ_UNLOCK();
11787 #endif /* CONFIG_DATALESS_FILES */
11788 }
11789
/*
 * Materialize the dataless item VP for operation OP.
 * Thin wrapper around resolve_nspace_item_ext() with no lookup name.
 */
int
resolve_nspace_item(struct vnode *vp, uint64_t op)
{
	return resolve_nspace_item_ext(vp, op, NULL);
}
11795
11796 #define DATALESS_RESOLVER_ENTITLEMENT \
11797 "com.apple.private.vfs.dataless-resolver"
11798 #define DATALESS_MANIPULATION_ENTITLEMENT \
11799 "com.apple.private.vfs.dataless-manipulation"
11800
11801 #if CONFIG_DATALESS_FILES
11802 /*
11803 * Return TRUE if the vfs context is associated with the dataless
11804 * resolver.
11805 */
11806 static boolean_t
vfs_context_is_dataless_resolver(vfs_context_t ctx __no_dataless_unused)11807 vfs_context_is_dataless_resolver(vfs_context_t ctx __no_dataless_unused)
11808 {
11809 return IOTaskHasEntitlement(vfs_context_task(ctx),
11810 DATALESS_RESOLVER_ENTITLEMENT);
11811 }
11812 #endif /* CONFIG_DATALESS_FILES */
11813
11814 /*
11815 * Return TRUE if the vfs context is associated with a process entitled
11816 * for dataless manipulation.
11817 *
11818 * XXX Arguably belongs in vfs_subr.c, but is here because of the
11819 * complication around CONFIG_DATALESS_FILES.
11820 */
11821 boolean_t
vfs_context_is_dataless_manipulator(vfs_context_t ctx __no_dataless_unused)11822 vfs_context_is_dataless_manipulator(vfs_context_t ctx __no_dataless_unused)
11823 {
11824 #if CONFIG_DATALESS_FILES
11825 task_t task = vfs_context_task(ctx);
11826 return IOTaskHasEntitlement(task, DATALESS_MANIPULATION_ENTITLEMENT) ||
11827 IOTaskHasEntitlement(task, DATALESS_RESOLVER_ENTITLEMENT);
11828 #else
11829 return false;
11830 #endif /* CONFIG_DATALESS_FILES */
11831 }
11832
11833 #if CONFIG_DATALESS_FILES
11834 static void
log_materialization_prevented(vnode_t vp,uint64_t op)11835 log_materialization_prevented(vnode_t vp, uint64_t op)
11836 {
11837 char p_name[MAXCOMLEN + 1];
11838 char *vntype;
11839 proc_selfname(&p_name[0], sizeof(p_name));
11840
11841 if (vp->v_type == VREG) {
11842 vntype = "File";
11843 } else if (vp->v_type == VDIR) {
11844 vntype = "Dir";
11845 } else if (vp->v_type == VLNK) {
11846 vntype = "SymLink";
11847 } else {
11848 vntype = "Other";
11849 }
11850
11851 #if DEVELOPMENT
11852 char *path = NULL;
11853 int len;
11854
11855 path = get_pathbuff();
11856 len = MAXPATHLEN;
11857 if (path) {
11858 vn_getpath(vp, path, &len);
11859 }
11860
11861 os_log_debug(OS_LOG_DEFAULT,
11862 "NSPACE process %s (pid %d) is decorated as no-materialization (op %lld; %s) path: %s",
11863 p_name, proc_selfpid(),
11864 op, vntype, path ? path : "<unknown-path>");
11865 if (path) {
11866 release_pathbuff(path);
11867 }
11868 #else
11869 os_log_debug(OS_LOG_DEFAULT,
11870 "NSPACE process %s (pid %d) is decorated as no-materialization (op %lld; %s)",
11871 p_name, proc_selfpid(),
11872 op, vntype);
11873 #endif
11874 }
11875 #endif /* CONFIG_DATALESS_FILES */
11876
11877 static int
vfs_materialize_item(
	struct vnode *vp __no_dataless_unused,
	uint64_t op __no_dataless_unused,
	int64_t offset __no_dataless_unused,
	int64_t size __no_dataless_unused,
	char *lookup_name __no_dataless_unused,
	size_t const namelen __no_dataless_unused)
{
#if CONFIG_DATALESS_FILES
	/*
	 * Ask the user-space dataless-file resolver (via filecoordinationd's
	 * Mach port) to materialize 'vp', then wait for it to answer.
	 *
	 * For directories, 'lookup_name'/'namelen' optionally name the entry
	 * being looked up; 'offset'/'size' describe the I/O range for files.
	 *
	 * Returns 0 on success; ETIMEDOUT when the resolver cannot be
	 * reached; other errnos from validation, lookup, or the resolver.
	 */
	struct nspace_resolver_request req;
	kern_return_t kern_ret;
	mach_port_t mach_port;
	char *path = NULL;
	char *tmpname = NULL;	/* NUL-terminated copy of lookup_name, if needed */
	vfs_context_t context;
	int path_len;
	int error;
	audit_token_t atoken;

	/*
	 * If this is a snapshot event and the vnode is on a disk image just
	 * pretend nothing happened since any change to the disk image will
	 * cause the disk image itself to get backed up and this avoids multi-
	 * way deadlocks between the snapshot handler and the ever popular
	 * diskimages-helper process. The variable nspace_allow_virtual_devs
	 * allows this behavior to be overridden (for use by the Mobile
	 * TimeMachine testing infrastructure which uses disk images).
	 */
	if (op & NAMESPACE_HANDLER_SNAPSHOT_EVENT) {
		os_log_debug(OS_LOG_DEFAULT, "NSPACE SNAPSHOT not handled");
		return ENOTSUP;
	}

	context = vfs_context_current();

	error = vfs_context_dataless_materialization_is_prevented(context);
	if (error) {
		log_materialization_prevented(vp, op);
		return error;
	}

	kern_ret = host_get_filecoordinationd_port(host_priv_self(),
	    &mach_port);
	if (kern_ret != KERN_SUCCESS || !IPC_PORT_VALID(mach_port)) {
		os_log_error(OS_LOG_DEFAULT, "NSPACE no port");
		/*
		 * Treat this like being unable to access the backing store
		 * server.
		 */
		return ETIMEDOUT;
	}

	path = zalloc(ZV_NAMEI);
	path_len = MAXPATHLEN;

	error = vn_getpath(vp, path, &path_len);
	if (error) {
		goto out_release_port;
	}

	error = vfs_context_copy_audit_token(context, &atoken);
	if (error) {
		goto out_release_port;
	}

	/*
	 * Validate (and, if necessary, copy) lookup_name BEFORE registering
	 * the request.  The previous code performed these checks after
	 * nspace_resolver_req_add() and bailed to out_release_port on
	 * failure, leaving the resolver table with an entry pointing at
	 * this (about to be dead) stack frame.
	 */
	if (vp->v_type == VDIR && lookup_name != NULL) {
		if (namelen != 0) {
			/*
			 * A non-zero name length means lookup_name is not
			 * NUL-terminated; make a NUL-terminated copy for
			 * the IPC call.
			 */
			if (namelen >= PATH_MAX) {
				error = EINVAL;
				goto out_release_port;
			}
			tmpname = zalloc(ZV_NAMEI);
			strlcpy(tmpname, lookup_name, namelen + 1);
			lookup_name = tmpname;
		} else if (strlen(lookup_name) >= PATH_MAX) {
			/*
			 * Zero name length means lookup_name is assumed
			 * NUL-terminated; just verify its length.
			 */
			error = EINVAL;
			goto out_release_port;
		}
	}

	req.r_req_id = next_nspace_req_id();
	req.r_resolver_error = 0;
	req.r_flags = 0;
	req.r_vp = vp;

	NSPACE_REQ_LOCK();
	error = nspace_resolver_req_add(&req);
	NSPACE_REQ_UNLOCK();
	if (error) {
		goto out_release_port;
	}

	os_log_debug(OS_LOG_DEFAULT, "NSPACE resolve_path call");
	if (vp->v_type == VDIR) {
		kern_ret = send_vfs_resolve_dir_with_audit_token(mach_port,
		    req.r_req_id, (uint32_t)(op & 0xffffffff),
		    lookup_name == NULL ? "" : lookup_name, path, atoken);
	} else {
		kern_ret = send_vfs_resolve_file_with_audit_token(mach_port,
		    req.r_req_id, (uint32_t)(op & 0xffffffff),
		    offset, size, path, atoken);
	}

	if (tmpname != NULL) {
		zfree(ZV_NAMEI, tmpname);
		tmpname = NULL;

		/*
		 * Poison lookup_name rather than reference
		 * freed memory.
		 */
		lookup_name = NULL;
	}

	if (kern_ret != KERN_SUCCESS) {
		/*
		 * Also treat this like being unable to access the backing
		 * store server.
		 */
		os_log_error(OS_LOG_DEFAULT, "NSPACE resolve failure: %d",
		    kern_ret);
		error = ETIMEDOUT;

		/* Un-register the request so no one dereferences it later. */
		NSPACE_REQ_LOCK();
		nspace_resolver_req_remove(&req);
		NSPACE_REQ_UNLOCK();
		goto out_release_port;
	}

	/*
	 * Give back the memory we allocated earlier while we wait; we
	 * no longer need it.
	 */
	zfree(ZV_NAMEI, path);
	path = NULL;

	/*
	 * Request has been submitted to the resolver. Now (interruptibly)
	 * wait for completion. Upon return, the request will have been
	 * removed from the lookup table.
	 */
	error = nspace_resolver_req_wait(&req);

out_release_port:
	if (tmpname != NULL) {
		zfree(ZV_NAMEI, tmpname);
	}
	if (path != NULL) {
		zfree(ZV_NAMEI, path);
	}
	ipc_port_release_send(mach_port);

	return error;
#else
	return ENOTSUP;
#endif /* CONFIG_DATALESS_FILES */
}
12042
12043 /*
12044 * vfs_materialize_file: Materialize a regular file.
12045 *
12046 * Inputs:
12047 * vp The dataless file to be materialized.
12048 *
12049 * op What kind of operation is being performed:
12050 * -> NAMESPACE_HANDLER_READ_OP
12051 * -> NAMESPACE_HANDLER_WRITE_OP
12052 * -> NAMESPACE_HANDLER_LINK_CREATE
12053 * -> NAMESPACE_HANDLER_DELETE_OP
12054 * -> NAMESPACE_HANDLER_TRUNCATE_OP
12055 * -> NAMESPACE_HANDLER_RENAME_OP
12056 *
12057 * offset offset of I/O for READ or WRITE. Ignored for
12058 * other ops.
12059 *
 * size		size of I/O for READ or WRITE.  Ignored for
 *		other ops.
 *
 * If offset or size is -1 for a READ or WRITE, then the resolver should
 * consider the range to be unknown.
12065 *
12066 * Upon successful return, the caller may proceed with the operation.
12067 * N.B. the file may still be "dataless" in this case.
12068 */
12069 int
vfs_materialize_file(struct vnode * vp,uint64_t op,int64_t offset,int64_t size)12070 vfs_materialize_file(
12071 struct vnode *vp,
12072 uint64_t op,
12073 int64_t offset,
12074 int64_t size)
12075 {
12076 if (vp->v_type != VREG) {
12077 return EFTYPE;
12078 }
12079 return vfs_materialize_item(vp, op, offset, size, NULL, 0);
12080 }
12081
12082 /*
12083 * vfs_materialize_dir:
12084 *
12085 * Inputs:
12086 * vp The dataless directory to be materialized.
12087 *
12088 * op What kind of operation is being performed:
12089 * -> NAMESPACE_HANDLER_READ_OP
12090 * -> NAMESPACE_HANDLER_WRITE_OP
12091 * -> NAMESPACE_HANDLER_DELETE_OP
12092 * -> NAMESPACE_HANDLER_RENAME_OP
12093 * -> NAMESPACE_HANDLER_LOOKUP_OP
12094 *
12095 * lookup_name Name being looked up for a LOOKUP op. Ignored for
12096 * other ops. May or may not be NUL-terminated; see below.
12097 *
12098 * namelen If non-zero, then lookup_name is assumed to not be NUL-
12099 * terminated and namelen is the number of valid bytes in
12100 * lookup_name. If zero, then lookup_name is assumed to be
12101 * NUL-terminated.
12102 *
12103 * Upon successful return, the caller may proceed with the operation.
12104 * N.B. the directory may still be "dataless" in this case.
12105 */
12106 int
vfs_materialize_dir(struct vnode * vp,uint64_t op,char * lookup_name,size_t namelen)12107 vfs_materialize_dir(
12108 struct vnode *vp,
12109 uint64_t op,
12110 char *lookup_name,
12111 size_t namelen)
12112 {
12113 if (vp->v_type != VDIR) {
12114 return EFTYPE;
12115 }
12116 if (op == NAMESPACE_HANDLER_LOOKUP_OP && lookup_name == NULL) {
12117 return EINVAL;
12118 }
12119 return vfs_materialize_item(vp, op, 0, 0, lookup_name, namelen);
12120 }
12121
/*
 * Legacy namespace-resolver entry point: ask the user-space resolver
 * (via filecoordinationd's Mach port) to handle a namespace event for
 * 'vp', then wait (interruptibly) for completion.  Unlike
 * vfs_materialize_item(), this path takes a usecount reference on the
 * vnode for the duration of the request and uses the older
 * send_nspace_resolve_path() message.
 *
 * Returns 0 on success; EFTYPE for unsupported vnode types; ENOTSUP for
 * snapshot events; ETIMEDOUT when the resolver cannot be reached; other
 * errnos from path lookup or the resolver itself.
 */
int
resolve_nspace_item_ext(
	struct vnode *vp __no_dataless_unused,
	uint64_t op __no_dataless_unused,
	void *arg __unused)
{
#if CONFIG_DATALESS_FILES
	int error;
	mach_port_t mp;
	char *path = NULL;
	int path_len;
	kern_return_t kr;
	struct nspace_resolver_request req;

	// only allow namespace events on regular files, directories and symlinks.
	if (vp->v_type != VREG && vp->v_type != VDIR && vp->v_type != VLNK) {
		return EFTYPE;
	}

	//
	// if this is a snapshot event and the vnode is on a
	// disk image just pretend nothing happened since any
	// change to the disk image will cause the disk image
	// itself to get backed up and this avoids multi-way
	// deadlocks between the snapshot handler and the ever
	// popular diskimages-helper process. the variable
	// nspace_allow_virtual_devs allows this behavior to
	// be overridden (for use by the Mobile TimeMachine
	// testing infrastructure which uses disk images)
	//
	if (op & NAMESPACE_HANDLER_SNAPSHOT_EVENT) {
		os_log_debug(OS_LOG_DEFAULT, "NSPACE SNAPSHOT not handled");
		return ENOTSUP;
	}

	error = vfs_context_dataless_materialization_is_prevented(
		vfs_context_current());
	if (error) {
		log_materialization_prevented(vp, op);
		return error;
	}

	kr = host_get_filecoordinationd_port(host_priv_self(), &mp);
	if (kr != KERN_SUCCESS || !IPC_PORT_VALID(mp)) {
		os_log_error(OS_LOG_DEFAULT, "NSPACE no port");
		// Treat this like being unable to access the backing
		// store server.
		return ETIMEDOUT;
	}

	path = zalloc(ZV_NAMEI);
	path_len = MAXPATHLEN;

	error = vn_getpath(vp, path, &path_len);
	if (error == 0) {
		int xxx_rdar44371223; /* XXX Mig bug */
		req.r_req_id = next_nspace_req_id();
		req.r_resolver_error = 0;
		req.r_flags = 0;

		if ((error = vnode_ref(vp)) == 0) { // take a ref so that the vnode doesn't go away
			req.r_vp = vp;
		} else {
			goto out_release_port;
		}

		// Register the request so the resolver's reply can find it.
		NSPACE_REQ_LOCK();
		error = nspace_resolver_req_add(&req);
		NSPACE_REQ_UNLOCK();
		if (error) {
			vnode_rele(req.r_vp);
			goto out_release_port;
		}

		os_log_debug(OS_LOG_DEFAULT, "NSPACE resolve_path call");
		kr = send_nspace_resolve_path(mp, req.r_req_id,
		    proc_getpid(current_proc()), (uint32_t)(op & 0xffffffff),
		    path, &xxx_rdar44371223);
		if (kr != KERN_SUCCESS) {
			// Also treat this like being unable to access
			// the backing store server.
			os_log_error(OS_LOG_DEFAULT,
			    "NSPACE resolve_path failure: %d", kr);
			error = ETIMEDOUT;

			// Un-register the request; it lives on this stack
			// frame and must not outlive the call.
			NSPACE_REQ_LOCK();
			nspace_resolver_req_remove(&req);
			NSPACE_REQ_UNLOCK();
			vnode_rele(req.r_vp);
			goto out_release_port;
		}

		// Give back the memory we allocated earlier while
		// we wait; we no longer need it.
		zfree(ZV_NAMEI, path);
		path = NULL;

		// Request has been submitted to the resolver.
		// Now (interruptibly) wait for completion.
		// Upon return, the request will have been removed
		// from the lookup table.
		error = nspace_resolver_req_wait(&req);

		vnode_rele(req.r_vp);
	}

out_release_port:
	if (path != NULL) {
		zfree(ZV_NAMEI, path);
	}
	ipc_port_release_send(mp);

	return error;
#else
	return ENOTSUP;
#endif /* CONFIG_DATALESS_FILES */
}
12239
int
nspace_snapshot_event(__unused vnode_t vp, __unused time_t ctime,
    __unused uint64_t op_type, __unused void *arg)
{
	/*
	 * Intentional no-op: snapshot namespace events are accepted and
	 * ignored here, and the caller is always told the event succeeded.
	 */
	return 0;
}
12246
#if 0
/*
 * Compiled out (#if 0): builds a "volfs"-style path (/.vol/<fsid>/<fileid>)
 * for a vnode from its va_fsid/va_fileid attributes, writing into 'path'
 * and returning the resulting length (including the NUL) through 'len'.
 * Returns 0 on success, -1 if the attributes could not be fetched (in
 * which case a sentinel path is written instead).  Retained for reference.
 */
static int
build_volfs_path(struct vnode *vp, char *path, int *len)
{
	struct vnode_attr va;
	int ret;

	VATTR_INIT(&va);
	VATTR_WANTED(&va, va_fsid);
	VATTR_WANTED(&va, va_fileid);

	if (vnode_getattr(vp, &va, vfs_context_kernel()) != 0) {
		*len = snprintf(path, *len, "/non/existent/path/because/vnode_getattr/failed") + 1;
		ret = -1;
	} else {
		*len = snprintf(path, *len, "/.vol/%d/%lld", (dev_t)va.va_fsid, va.va_fileid) + 1;
		ret = 0;
	}

	return ret;
}
#endif
12269
12270 static unsigned long
fsctl_bogus_command_compat(unsigned long cmd)12271 fsctl_bogus_command_compat(unsigned long cmd)
12272 {
12273 switch (cmd) {
12274 case IOCBASECMD(FSIOC_SYNC_VOLUME):
12275 return FSIOC_SYNC_VOLUME;
12276 case IOCBASECMD(FSIOC_ROUTEFS_SETROUTEID):
12277 return FSIOC_ROUTEFS_SETROUTEID;
12278 case IOCBASECMD(FSIOC_SET_PACKAGE_EXTS):
12279 return FSIOC_SET_PACKAGE_EXTS;
12280 case IOCBASECMD(FSIOC_SET_FSTYPENAME_OVERRIDE):
12281 return FSIOC_SET_FSTYPENAME_OVERRIDE;
12282 case IOCBASECMD(DISK_CONDITIONER_IOC_GET):
12283 return DISK_CONDITIONER_IOC_GET;
12284 case IOCBASECMD(DISK_CONDITIONER_IOC_SET):
12285 return DISK_CONDITIONER_IOC_SET;
12286 case IOCBASECMD(FSIOC_FIOSEEKHOLE):
12287 return FSIOC_FIOSEEKHOLE;
12288 case IOCBASECMD(FSIOC_FIOSEEKDATA):
12289 return FSIOC_FIOSEEKDATA;
12290 case IOCBASECMD(SPOTLIGHT_IOC_GET_MOUNT_TIME):
12291 return SPOTLIGHT_IOC_GET_MOUNT_TIME;
12292 case IOCBASECMD(SPOTLIGHT_IOC_GET_LAST_MTIME):
12293 return SPOTLIGHT_IOC_GET_LAST_MTIME;
12294 }
12295
12296 return cmd;
12297 }
12298
12299 static int
cas_bsdflags_setattr(vnode_t vp,void * arg,vfs_context_t ctx)12300 cas_bsdflags_setattr(vnode_t vp, void *arg, vfs_context_t ctx)
12301 {
12302 return VNOP_IOCTL(vp, FSIOC_CAS_BSDFLAGS, arg, FWRITE, ctx);
12303 }
12304
12305 static int __attribute__((noinline))
handle_sync_volume(vnode_t vp,vnode_t * arg_vp,caddr_t data,vfs_context_t ctx)12306 handle_sync_volume(vnode_t vp, vnode_t *arg_vp, caddr_t data, vfs_context_t ctx)
12307 {
12308 struct vfs_attr vfa;
12309 mount_t mp = vp->v_mount;
12310 unsigned arg;
12311 int error;
12312
12313 /* record vid of vp so we can drop it below. */
12314 uint32_t vvid = vp->v_id;
12315
12316 /*
12317 * Then grab mount_iterref so that we can release the vnode.
12318 * Without this, a thread may call vnode_iterate_prepare then
12319 * get into a deadlock because we've never released the root vp
12320 */
12321 error = mount_iterref(mp, 0);
12322 if (error) {
12323 return error;
12324 }
12325 vnode_hold(vp);
12326 vnode_put(vp);
12327
12328 arg = MNT_NOWAIT;
12329 if (*(uint32_t*)data & FSCTL_SYNC_WAIT) {
12330 arg = MNT_WAIT;
12331 }
12332
12333 /*
12334 * If the filessytem supports multiple filesytems in a
12335 * partition (For eg APFS volumes in a container, it knows
12336 * that the waitfor argument to VFS_SYNC are flags.
12337 */
12338 VFSATTR_INIT(&vfa);
12339 VFSATTR_WANTED(&vfa, f_capabilities);
12340 if ((vfs_getattr(mp, &vfa, vfs_context_current()) == 0) &&
12341 VFSATTR_IS_SUPPORTED(&vfa, f_capabilities) &&
12342 ((vfa.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_SHARED_SPACE)) &&
12343 ((vfa.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_SHARED_SPACE))) {
12344 arg |= MNT_VOLUME;
12345 }
12346
12347 /* issue the sync for this volume */
12348 (void)sync_callback(mp, &arg);
12349
12350 /*
12351 * Then release the mount_iterref once we're done syncing; it's not
12352 * needed for the VNOP_IOCTL below
12353 */
12354 mount_iterdrop(mp);
12355
12356 if (arg & FSCTL_SYNC_FULLSYNC) {
12357 /* re-obtain vnode iocount on the root vp, if possible */
12358 error = vnode_getwithvid(vp, vvid);
12359 if (error == 0) {
12360 error = VNOP_IOCTL(vp, F_FULLFSYNC, (caddr_t)NULL, 0, ctx);
12361 vnode_put(vp);
12362 }
12363 }
12364 vnode_drop(vp);
12365 /* mark the argument VP as having been released */
12366 *arg_vp = NULL;
12367 return error;
12368 }
12369
#if ROUTEFS
/*
 * FSIOC_ROUTEFS_SETROUTEID handler: copy in the user-supplied path and
 * mount routefs there.  Superuser only.
 */
static int __attribute__((noinline))
handle_routes(user_addr_t udata)
{
	char routepath[MAXPATHLEN];
	size_t copied = 0;
	int error;

	error = suser(kauth_cred_get(), &(current_proc()->p_acflag));
	if (error != 0) {
		return error;
	}

	/* Zero-fill first so the buffer is always NUL-terminated. */
	bzero(routepath, MAXPATHLEN);
	error = copyinstr(udata, &routepath[0], MAXPATHLEN, &copied);
	if (error != 0) {
		return error;
	}

	return routefs_kernel_mount(routepath);
}
#endif
12390
12391 static int __attribute__((noinline))
handle_flags(vnode_t vp,caddr_t data,vfs_context_t ctx)12392 handle_flags(vnode_t vp, caddr_t data, vfs_context_t ctx)
12393 {
12394 struct fsioc_cas_bsdflags *cas = (struct fsioc_cas_bsdflags *)data;
12395 struct vnode_attr va;
12396 int error;
12397
12398 VATTR_INIT(&va);
12399 VATTR_SET(&va, va_flags, cas->new_flags);
12400
12401 error = chflags0(vp, &va, cas_bsdflags_setattr, cas, ctx);
12402
12403 #if CONFIG_FSE
12404 if (error == 0 && cas->expected_flags == cas->actual_flags && need_fsevent(FSE_STAT_CHANGED, vp)) {
12405 add_fsevent(FSE_STAT_CHANGED, ctx, FSE_ARG_VNODE, vp, FSE_ARG_DONE);
12406 }
12407 #endif
12408
12409 return error;
12410 }
12411
12412 static int __attribute__((noinline))
handle_auth(vnode_t vp,u_long cmd,caddr_t data,u_long options,vfs_context_t ctx)12413 handle_auth(vnode_t vp, u_long cmd, caddr_t data, u_long options, vfs_context_t ctx)
12414 {
12415 struct mount *mp = NULL;
12416 errno_t rootauth = 0;
12417
12418 mp = vp->v_mount;
12419
12420 /*
12421 * query the underlying FS and see if it reports something
12422 * sane for this vnode. If volume is authenticated via
12423 * chunklist, leave that for the caller to determine.
12424 */
12425 rootauth = VNOP_IOCTL(vp, cmd, data, (int)options, ctx);
12426
12427 return rootauth;
12428 }
12429
#define SET_PACKAGE_EXTENSION_ENTITLEMENT \
	"com.apple.private.kernel.set-package-extensions"

/*
 * Make a filesystem-specific control call:
 *
 * Shared worker for fsctl() and ffsctl().  Decodes the ioctl-style
 * command 'cmd', stages the argument data in a kernel buffer (stack for
 * small arguments, heap otherwise), dispatches generic FSIOC_* commands
 * to their handlers, and forwards everything else to the filesystem via
 * VNOP_IOCTL.  For IOC_OUT commands the result is copied back to 'udata'.
 *
 * NOTE: a handler (handle_sync_volume) may drop the vnode's iocount; it
 * signals this by NULLing *arg_vp, which the callers must respect.
 */
/* ARGSUSED */
static int
fsctl_internal(proc_t p, vnode_t *arg_vp, u_long cmd, user_addr_t udata, u_long options, vfs_context_t ctx)
{
	int error = 0;
	boolean_t is64bit;
	u_int size;
#define STK_PARAMS 128
	char stkbuf[STK_PARAMS] = {0};
	caddr_t data, memp;
	vnode_t vp = *arg_vp;

	/* No fsctl on device nodes. */
	if (vp->v_type == VCHR || vp->v_type == VBLK) {
		return ENOTTY;
	}

	/* Map legacy commands passed without direction/size bits. */
	cmd = fsctl_bogus_command_compat(cmd);

	size = IOCPARM_LEN(cmd);
	if (size > IOCPARM_MAX) {
		return EINVAL;
	}

	is64bit = proc_is64bit(p);

	memp = NULL;

	/* Stage argument storage: heap for large, stack buffer otherwise. */
	if (size > sizeof(stkbuf)) {
		if ((memp = (caddr_t)kalloc_data(size, Z_WAITOK)) == 0) {
			return ENOMEM;
		}
		data = memp;
	} else {
		data = &stkbuf[0];
	};

	if (cmd & IOC_IN) {
		if (size) {
			/* Copy the caller's argument struct in. */
			error = copyin(udata, data, size);
			if (error) {
				if (memp) {
					kfree_data(memp, size);
				}
				return error;
			}
		} else {
			/* Zero-size IOC_IN: the "argument" is the pointer itself. */
			if (is64bit) {
				*(user_addr_t *)data = udata;
			} else {
				*(uint32_t *)data = (uint32_t)udata;
			}
		};
	} else if ((cmd & IOC_OUT) && size) {
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		bzero(data, size);
	} else if (cmd & IOC_VOID) {
		if (is64bit) {
			*(user_addr_t *)data = udata;
		} else {
			*(uint32_t *)data = (uint32_t)udata;
		}
	}

	/* Check to see if it's a generic command */
	switch (cmd) {
	case FSIOC_SYNC_VOLUME:
		/* May drop vp's iocount and NULL *arg_vp; see handler. */
		error = handle_sync_volume(vp, arg_vp, data, ctx);
		break;

	case FSIOC_ROUTEFS_SETROUTEID:
#if ROUTEFS
		error = handle_routes(udata);
#endif
		break;

	case FSIOC_SET_PACKAGE_EXTS: {
		user_addr_t ext_strings;
		uint32_t num_entries;
		uint32_t max_width;

		/* Entitlement-gated: updates the global package-extension table. */
		if (!IOTaskHasEntitlement(vfs_context_task(ctx),
		    SET_PACKAGE_EXTENSION_ENTITLEMENT)) {
			error = EPERM;
			break;
		}

		if ((is64bit && size != sizeof(user64_package_ext_info))
		    || (is64bit == 0 && size != sizeof(user32_package_ext_info))) {
			// either you're 64-bit and passed a 64-bit struct or
			// you're 32-bit and passed a 32-bit struct. otherwise
			// it's not ok.
			error = EINVAL;
			break;
		}

		if (is64bit) {
			if (sizeof(user64_addr_t) > sizeof(user_addr_t)) {
				assert(((user64_package_ext_info *)data)->strings <= UINT32_MAX);
			}
			ext_strings = (user_addr_t)((user64_package_ext_info *)data)->strings;
			num_entries = ((user64_package_ext_info *)data)->num_entries;
			max_width = ((user64_package_ext_info *)data)->max_width;
		} else {
			ext_strings = CAST_USER_ADDR_T(((user32_package_ext_info *)data)->strings);
			num_entries = ((user32_package_ext_info *)data)->num_entries;
			max_width = ((user32_package_ext_info *)data)->max_width;
		}
		error = set_package_extensions_table(ext_strings, num_entries, max_width);
	}
	break;

	case FSIOC_SET_FSTYPENAME_OVERRIDE:
	{
		/* Superuser only: override the reported filesystem type name. */
		if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
			break;
		}
		if (vp->v_mount) {
			mount_lock(vp->v_mount);
			if (data[0] != 0) {
				/* Non-empty name: install the override. */
				int i;
				for (i = 0; i < MFSTYPENAMELEN; i++) {
					if (!data[i]) {
						goto continue_copy;
					}
				}
				/*
				 * Getting here means we have a user data string which has no
				 * NULL termination in its first MFSTYPENAMELEN bytes.
				 * This is bogus, let's avoid strlcpy-ing the read data and
				 * return an error.
				 */
				error = EINVAL;
				goto unlock;
continue_copy:
				strlcpy(&vp->v_mount->fstypename_override[0], data, MFSTYPENAMELEN);
				vp->v_mount->mnt_kern_flag |= MNTK_TYPENAME_OVERRIDE;
				if (vfs_isrdonly(vp->v_mount) && strcmp(vp->v_mount->fstypename_override, "mtmfs") == 0) {
					vp->v_mount->mnt_kern_flag |= MNTK_EXTENDED_SECURITY;
					vp->v_mount->mnt_kern_flag &= ~MNTK_AUTH_OPAQUE;
				}
			} else {
				/* Empty name: clear any existing override. */
				if (strcmp(vp->v_mount->fstypename_override, "mtmfs") == 0) {
					vp->v_mount->mnt_kern_flag &= ~MNTK_EXTENDED_SECURITY;
				}
				vp->v_mount->mnt_kern_flag &= ~MNTK_TYPENAME_OVERRIDE;
				vp->v_mount->fstypename_override[0] = '\0';
			}
unlock:
			mount_unlock(vp->v_mount);
		}
	}
	break;

	case DISK_CONDITIONER_IOC_GET: {
		error = disk_conditioner_get_info(vp->v_mount, (disk_conditioner_info *)data);
	}
	break;

	case DISK_CONDITIONER_IOC_SET: {
		error = disk_conditioner_set_info(vp->v_mount, (disk_conditioner_info *)data);
	}
	break;

	case FSIOC_CAS_BSDFLAGS:
		error = handle_flags(vp, data, ctx);
		break;

	case FSIOC_FD_ONLY_OPEN_ONCE: {
		/*
		 * Report EBUSY if anyone other than the caller has the file
		 * open (named streams on the vnode are tolerated).
		 */
		error = 0;
		if (vnode_usecount(vp) > 1) {
			vnode_lock_spin(vp);
			if (vp->v_lflag & VL_HASSTREAMS) {
				if (vnode_isinuse_locked(vp, 1, 1)) {
					error = EBUSY;
				}
			} else if (vnode_usecount(vp) > 1) {
				error = EBUSY;
			}
			vnode_unlock(vp);
		}
	}
	break;

	case FSIOC_EVAL_ROOTAUTH:
		error = handle_auth(vp, cmd, data, options, ctx);
		break;

	case FSIOC_TEST_FSE_ACCESS_GRANTED:
		error = test_fse_access_granted(vp, (unsigned long)udata, ctx);
		break;

	default: {
		/* other, known commands shouldn't be passed down here */
		switch (cmd) {
		case F_PUNCHHOLE:
		case F_TRIM_ACTIVE_FILE:
		case F_RDADVISE:
		case F_TRANSCODEKEY:
		case F_GETPROTECTIONLEVEL:
		case F_GETDEFAULTPROTLEVEL:
		case F_MAKECOMPRESSED:
		case F_SET_GREEDY_MODE:
		case F_SETSTATICCONTENT:
		case F_SETIOTYPE:
		case F_SETBACKINGSTORE:
		case F_GETPATH_MTMINFO:
		case APFSIOC_REVERT_TO_SNAPSHOT:
		case FSIOC_FIOSEEKHOLE:
		case FSIOC_FIOSEEKDATA:
		case HFS_GET_BOOT_INFO:
		case HFS_SET_BOOT_INFO:
		case FIOPINSWAP:
		case F_CHKCLEAN:
		case F_FULLFSYNC:
		case F_BARRIERFSYNC:
		case F_FREEZE_FS:
		case F_THAW_FS:
		case FSIOC_KERNEL_ROOTAUTH:
		case FSIOC_GRAFT_FS:
		case FSIOC_UNGRAFT_FS:
		case FSIOC_AUTH_FS:
			error = EINVAL;
			goto outdrop;
		}
		/* Invoke the filesystem-specific code */
		error = VNOP_IOCTL(vp, cmd, data, (int)options, ctx);
	}
	} /* end switch stmt */

	/*
	 * if no errors, copy any data to user. Size was
	 * already set and checked above.
	 */
	if (error == 0 && (cmd & IOC_OUT) && size) {
		error = copyout(data, udata, size);
	}

outdrop:
	if (memp) {
		kfree_data(memp, size);
	}

	return error;
}
12683
12684 /* ARGSUSED */
12685 int
fsctl(proc_t p,struct fsctl_args * uap,__unused int32_t * retval)12686 fsctl(proc_t p, struct fsctl_args *uap, __unused int32_t *retval)
12687 {
12688 int error;
12689 struct nameidata nd;
12690 uint32_t nameiflags;
12691 vnode_t vp = NULL;
12692 vfs_context_t ctx = vfs_context_current();
12693
12694 AUDIT_ARG(cmd, (int)uap->cmd);
12695 AUDIT_ARG(value32, uap->options);
12696 /* Get the vnode for the file we are getting info on: */
12697 nameiflags = 0;
12698 //
12699 // if we come through fsctl() then the file is by definition not open.
12700 // therefore for the FSIOC_FD_ONLY_OPEN_ONCE selector we return an error
12701 // lest the caller mistakenly thinks the only open is their own (but in
12702 // reality it's someone elses).
12703 //
12704 if (uap->cmd == FSIOC_FD_ONLY_OPEN_ONCE) {
12705 return EINVAL;
12706 }
12707 if ((uap->options & FSOPT_NOFOLLOW) == 0) {
12708 nameiflags |= FOLLOW;
12709 }
12710 if (uap->cmd == FSIOC_FIRMLINK_CTL) {
12711 nameiflags |= (CN_FIRMLINK_NOFOLLOW | NOCACHE);
12712 }
12713 NDINIT(&nd, LOOKUP, OP_FSCTL, nameiflags | AUDITVNPATH1,
12714 UIO_USERSPACE, uap->path, ctx);
12715 if ((error = namei(&nd))) {
12716 goto done;
12717 }
12718 vp = nd.ni_vp;
12719 nameidone(&nd);
12720
12721 #if CONFIG_MACF
12722 error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd);
12723 if (error) {
12724 goto done;
12725 }
12726 #endif
12727
12728 error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);
12729
12730 done:
12731 if (vp) {
12732 vnode_put(vp);
12733 }
12734 return error;
12735 }
12736 /* ARGSUSED */
12737 int
ffsctl(proc_t p,struct ffsctl_args * uap,__unused int32_t * retval)12738 ffsctl(proc_t p, struct ffsctl_args *uap, __unused int32_t *retval)
12739 {
12740 int error;
12741 vnode_t vp = NULL;
12742 vfs_context_t ctx = vfs_context_current();
12743 int fd = -1;
12744
12745 AUDIT_ARG(fd, uap->fd);
12746 AUDIT_ARG(cmd, (int)uap->cmd);
12747 AUDIT_ARG(value32, uap->options);
12748
12749 /* Get the vnode for the file we are getting info on: */
12750 if ((error = file_vnode(uap->fd, &vp))) {
12751 return error;
12752 }
12753 fd = uap->fd;
12754 if ((error = vnode_getwithref(vp))) {
12755 file_drop(fd);
12756 return error;
12757 }
12758
12759 #if CONFIG_MACF
12760 if ((error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd))) {
12761 file_drop(fd);
12762 vnode_put(vp);
12763 return error;
12764 }
12765 #endif
12766
12767 error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);
12768
12769 file_drop(fd);
12770
12771 /*validate vp; fsctl_internal() can drop iocount and reset vp to NULL*/
12772 if (vp) {
12773 vnode_put(vp);
12774 }
12775
12776 return error;
12777 }
12778 /* end of fsctl system call */
12779
12780 #define FILESEC_ACCESS_ENTITLEMENT \
12781 "com.apple.private.vfs.filesec-access"
12782
12783 static int
xattr_entitlement_check(const char * attrname,vfs_context_t ctx,bool setting)12784 xattr_entitlement_check(const char *attrname, vfs_context_t ctx, bool setting)
12785 {
12786 if (strcmp(attrname, KAUTH_FILESEC_XATTR) == 0) {
12787 /*
12788 * get: root and tasks with FILESEC_ACCESS_ENTITLEMENT.
12789 * set: only tasks with FILESEC_ACCESS_ENTITLEMENT.
12790 */
12791 if ((!setting && vfs_context_issuser(ctx)) ||
12792 IOTaskHasEntitlement(vfs_context_task(ctx),
12793 FILESEC_ACCESS_ENTITLEMENT)) {
12794 return 0;
12795 }
12796 }
12797
12798 return EPERM;
12799 }
12800
/*
 * Retrieve the data of an extended attribute.
 *
 * Path-based getxattr(2): looks up uap->path, copies in the attribute
 * name, and either reads the attribute value into uap->value or (when no
 * buffer is supplied, or the magic size below) returns just its size via
 * *retval.
 */
int
getxattr(proc_t p, struct getxattr_args *uap, user_ssize_t *retval)
{
	vnode_t vp;
	struct nameidata nd;
	char attrname[XATTR_MAXNAMELEN + 1];
	vfs_context_t ctx = vfs_context_current();
	uio_t auio = NULL;
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	size_t attrsize = 0;
	size_t namelen;
	u_int32_t nameiflags;
	int error;
	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];

	/* These options are kernel-internal; reject them from user space. */
	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_GETXATTR, nameiflags, spacetype, uap->path, ctx);
	if ((error = namei(&nd))) {
		return error;
	}
	vp = nd.ni_vp;
	nameidone(&nd);

	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
	if (error != 0) {
		goto out;
	}
	/* Protected attributes require root or a special entitlement. */
	if (xattr_protected(attrname) &&
	    (error = xattr_entitlement_check(attrname, ctx, false)) != 0) {
		goto out;
	}
	/*
	 * the specific check for 0xffffffff is a hack to preserve
	 * binary compatibility in K64 with applications that discovered
	 * that passing in a buf pointer and a size of -1 resulted in
	 * just the size of the indicated extended attribute being returned.
	 * this isn't part of the documented behavior, but because of the
	 * original implementation's check for "uap->size > 0", this behavior
	 * was allowed. In K32 that check turned into a signed comparison
	 * even though uap->size is unsigned... in K64, we blow by that
	 * check because uap->size is unsigned and doesn't get sign smeared
	 * in the munger for a 32 bit user app. we also need to add a
	 * check to limit the maximum size of the buffer being passed in...
	 * unfortunately, the underlying filesystems seem to just malloc
	 * the requested size even if the actual extended attribute is tiny.
	 * because that malloc is for kernel wired memory, we have to put a
	 * sane limit on it.
	 *
	 * U32 running on K64 will yield 0x00000000ffffffff for uap->size
	 * U64 running on K64 will yield -1 (64 bits wide)
	 * U32/U64 running on K32 will yield -1 (32 bits wide)
	 */
	if (uap->size == 0xffffffff || uap->size == (size_t)-1) {
		goto no_uio;
	}

	if (uap->value) {
		/* Clamp oversized requests; see wired-memory note above. */
		if (uap->size > (size_t)XATTR_MAXSIZE) {
			uap->size = XATTR_MAXSIZE;
		}

		auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
		    &uio_buf[0], sizeof(uio_buf));
		uio_addiov(auio, uap->value, uap->size);
	}
no_uio:
	/* With auio == NULL, vn_getxattr only reports the size via attrsize. */
	error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, ctx);
out:
	vnode_put(vp);

	/* Return bytes copied when a buffer was used, else the attr size. */
	if (auio) {
		*retval = uap->size - uio_resid(auio);
	} else {
		*retval = (user_ssize_t)attrsize;
	}

	return error;
}
12886
/*
 * Retrieve the data of an extended attribute.
 *
 * Descriptor-based getxattr(2) variant: operates on uap->fd instead of a
 * path, so XATTR_NOFOLLOW makes no sense here and is rejected.
 */
int
fgetxattr(proc_t p, struct fgetxattr_args *uap, user_ssize_t *retval)
{
	vnode_t vp;
	char attrname[XATTR_MAXNAMELEN + 1];
	vfs_context_t ctx = vfs_context_current();
	uio_t auio = NULL;
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	size_t attrsize = 0;
	size_t namelen;
	int error;
	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];

	/* NOFOLLOW is path-only; the other two are kernel-internal. */
	if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}
	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
	if (error != 0) {
		goto out;
	}
	/* Protected attributes require root or a special entitlement. */
	if (xattr_protected(attrname) &&
	    (error = xattr_entitlement_check(attrname, ctx, false)) != 0) {
		goto out;
	}
	if (uap->value && uap->size > 0) {
		/* Clamp oversized requests to avoid huge wired allocations. */
		if (uap->size > (size_t)XATTR_MAXSIZE) {
			uap->size = XATTR_MAXSIZE;
		}

		auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
		    &uio_buf[0], sizeof(uio_buf));
		uio_addiov(auio, uap->value, uap->size);
	}

	/* With auio == NULL, vn_getxattr only reports the size via attrsize. */
	error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, vfs_context_current());
out:
	(void)vnode_put(vp);
	file_drop(uap->fd);

	/* Return bytes copied when a buffer was used, else the attr size. */
	if (auio) {
		*retval = uap->size - uio_resid(auio);
	} else {
		*retval = (user_ssize_t)attrsize;
	}
	return error;
}
12944
/*
 * Heap-allocated working state for setxattr(): the nameidata, attribute
 * name, and uio backing store together are too large to keep on the
 * kernel stack.  (The previous "checkdirs iteration" comment was a
 * copy-paste error.)
 */
struct setxattr_ctx {
	struct nameidata nd;                    /* path lookup state */
	char attrname[XATTR_MAXNAMELEN + 1];    /* NUL-terminated attribute name */
	uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];  /* backing store for the value uio */
};
12951
/*
 * Set the data of an extended attribute.
 *
 * Path-based setxattr(2): copies in the attribute name, looks up
 * uap->path, and writes uap->size bytes from uap->value as the attribute
 * value via vn_setxattr().  Emits an FSE_XATTR_MODIFIED fsevent on
 * success when fsevents are configured.
 */
int
setxattr(proc_t p, struct setxattr_args *uap, int *retval)
{
	vnode_t vp;
	vfs_context_t ctx = vfs_context_current();
	uio_t auio = NULL;
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	size_t namelen;
	u_int32_t nameiflags;
	int error;
	struct setxattr_ctx *sactx;

	/* These options are kernel-internal; reject them from user space. */
	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	/* Working state is too big for the stack; allocate it. */
	sactx = kalloc_type(struct setxattr_ctx, Z_WAITOK);
	if (sactx == NULL) {
		return ENOMEM;
	}

	error = copyinstr(uap->attrname, sactx->attrname, sizeof(sactx->attrname), &namelen);
	if (error != 0) {
		if (error == EPERM) {
			/* if the string won't fit in attrname, copyinstr emits EPERM */
			error = ENAMETOOLONG;
		}
		/* Otherwise return the default error from copyinstr to detect ERANGE, etc */
		goto out;
	}
	/* Protected attributes require root or a special entitlement. */
	if (xattr_protected(sactx->attrname) &&
	    (error = xattr_entitlement_check(sactx->attrname, ctx, true)) != 0) {
		goto out;
	}
	/* A non-zero size requires a value buffer to read from. */
	if (uap->size != 0 && uap->value == 0) {
		error = EINVAL;
		goto out;
	}
	if (uap->size > INT_MAX) {
		error = E2BIG;
		goto out;
	}

	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
#if CONFIG_FILE_LEASES
	/* Need the parent too, so any directory lease can be broken below. */
	nameiflags |= WANTPARENT;
#endif
	NDINIT(&sactx->nd, LOOKUP, OP_SETXATTR, nameiflags, spacetype, uap->path, ctx);
	if ((error = namei(&sactx->nd))) {
		goto out;
	}
	vp = sactx->nd.ni_vp;
#if CONFIG_FILE_LEASES
	vnode_breakdirlease(sactx->nd.ni_dvp, false, O_WRONLY);
	vnode_put(sactx->nd.ni_dvp);
#endif
	nameidone(&sactx->nd);

	auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
	    &sactx->uio_buf[0], sizeof(sactx->uio_buf));
	uio_addiov(auio, uap->value, uap->size);

	error = vn_setxattr(vp, sactx->attrname, auio, uap->options, ctx);
#if CONFIG_FSE
	if (error == 0) {
		add_fsevent(FSE_XATTR_MODIFIED, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
	}
#endif
	vnode_put(vp);
out:
	kfree_type(struct setxattr_ctx, sactx);
	*retval = 0;
	return error;
}
13031
13032 /*
13033 * Set the data of an extended attribute.
13034 */
13035 int
fsetxattr(proc_t p,struct fsetxattr_args * uap,int * retval)13036 fsetxattr(proc_t p, struct fsetxattr_args *uap, int *retval)
13037 {
13038 vnode_t vp;
13039 char attrname[XATTR_MAXNAMELEN + 1];
13040 vfs_context_t ctx = vfs_context_current();
13041 uio_t auio = NULL;
13042 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
13043 size_t namelen;
13044 int error;
13045 uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];
13046
13047 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
13048 return EINVAL;
13049 }
13050
13051 error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
13052 if (error != 0) {
13053 if (error == EPERM) {
13054 /* if the string won't fit in attrname, copyinstr emits EPERM */
13055 return ENAMETOOLONG;
13056 }
13057 /* Otherwise return the default error from copyinstr to detect ERANGE, etc */
13058 return error;
13059 }
13060 if (xattr_protected(attrname) &&
13061 (error = xattr_entitlement_check(attrname, ctx, true)) != 0) {
13062 return error;
13063 }
13064 if (uap->size != 0 && uap->value == 0) {
13065 return EINVAL;
13066 }
13067 if (uap->size > INT_MAX) {
13068 return E2BIG;
13069 }
13070 if ((error = file_vnode(uap->fd, &vp))) {
13071 return error;
13072 }
13073 if ((error = vnode_getwithref(vp))) {
13074 file_drop(uap->fd);
13075 return error;
13076 }
13077
13078 #if CONFIG_FILE_LEASES
13079 vnode_breakdirlease(vp, true, O_WRONLY);
13080 #endif
13081
13082 auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
13083 &uio_buf[0], sizeof(uio_buf));
13084 uio_addiov(auio, uap->value, uap->size);
13085
13086 error = vn_setxattr(vp, attrname, auio, uap->options, vfs_context_current());
13087 #if CONFIG_FSE
13088 if (error == 0) {
13089 add_fsevent(FSE_XATTR_MODIFIED, ctx,
13090 FSE_ARG_VNODE, vp,
13091 FSE_ARG_DONE);
13092 }
13093 #endif
13094 vnode_put(vp);
13095 file_drop(uap->fd);
13096 *retval = 0;
13097 return error;
13098 }
13099
13100 /*
13101 * Remove an extended attribute.
13102 * XXX Code duplication here.
13103 */
13104 int
removexattr(proc_t p,struct removexattr_args * uap,int * retval)13105 removexattr(proc_t p, struct removexattr_args *uap, int *retval)
13106 {
13107 vnode_t vp;
13108 struct nameidata nd;
13109 char attrname[XATTR_MAXNAMELEN + 1];
13110 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
13111 vfs_context_t ctx = vfs_context_current();
13112 size_t namelen;
13113 u_int32_t nameiflags;
13114 int error;
13115
13116 if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
13117 return EINVAL;
13118 }
13119
13120 error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
13121 if (error != 0) {
13122 return error;
13123 }
13124 if (xattr_protected(attrname)) {
13125 return EPERM;
13126 }
13127 nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
13128 #if CONFIG_FILE_LEASES
13129 nameiflags |= WANTPARENT;
13130 #endif
13131 NDINIT(&nd, LOOKUP, OP_REMOVEXATTR, nameiflags, spacetype, uap->path, ctx);
13132 if ((error = namei(&nd))) {
13133 return error;
13134 }
13135 vp = nd.ni_vp;
13136 #if CONFIG_FILE_LEASES
13137 vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
13138 vnode_put(nd.ni_dvp);
13139 #endif
13140 nameidone(&nd);
13141
13142 error = vn_removexattr(vp, attrname, uap->options, ctx);
13143 #if CONFIG_FSE
13144 if (error == 0) {
13145 add_fsevent(FSE_XATTR_REMOVED, ctx,
13146 FSE_ARG_VNODE, vp,
13147 FSE_ARG_DONE);
13148 }
13149 #endif
13150 vnode_put(vp);
13151 *retval = 0;
13152 return error;
13153 }
13154
13155 /*
13156 * Remove an extended attribute.
13157 * XXX Code duplication here.
13158 */
13159 int
fremovexattr(__unused proc_t p,struct fremovexattr_args * uap,int * retval)13160 fremovexattr(__unused proc_t p, struct fremovexattr_args *uap, int *retval)
13161 {
13162 vnode_t vp;
13163 char attrname[XATTR_MAXNAMELEN + 1];
13164 size_t namelen;
13165 int error;
13166 #if CONFIG_FSE
13167 vfs_context_t ctx = vfs_context_current();
13168 #endif
13169
13170 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
13171 return EINVAL;
13172 }
13173
13174 error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
13175 if (error != 0) {
13176 return error;
13177 }
13178 if (xattr_protected(attrname)) {
13179 return EPERM;
13180 }
13181 if ((error = file_vnode(uap->fd, &vp))) {
13182 return error;
13183 }
13184 if ((error = vnode_getwithref(vp))) {
13185 file_drop(uap->fd);
13186 return error;
13187 }
13188
13189 #if CONFIG_FILE_LEASES
13190 vnode_breakdirlease(vp, true, O_WRONLY);
13191 #endif
13192
13193 error = vn_removexattr(vp, attrname, uap->options, vfs_context_current());
13194 #if CONFIG_FSE
13195 if (error == 0) {
13196 add_fsevent(FSE_XATTR_REMOVED, ctx,
13197 FSE_ARG_VNODE, vp,
13198 FSE_ARG_DONE);
13199 }
13200 #endif
13201 vnode_put(vp);
13202 file_drop(uap->fd);
13203 *retval = 0;
13204 return error;
13205 }
13206
13207 /*
13208 * Retrieve the list of extended attribute names.
13209 * XXX Code duplication here.
13210 */
13211 int
listxattr(proc_t p,struct listxattr_args * uap,user_ssize_t * retval)13212 listxattr(proc_t p, struct listxattr_args *uap, user_ssize_t *retval)
13213 {
13214 vnode_t vp;
13215 struct nameidata nd;
13216 vfs_context_t ctx = vfs_context_current();
13217 uio_t auio = NULL;
13218 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
13219 size_t attrsize = 0;
13220 u_int32_t nameiflags;
13221 int error;
13222 uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];
13223
13224 if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
13225 return EINVAL;
13226 }
13227
13228 nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
13229 NDINIT(&nd, LOOKUP, OP_LISTXATTR, nameiflags, spacetype, uap->path, ctx);
13230 if ((error = namei(&nd))) {
13231 return error;
13232 }
13233 vp = nd.ni_vp;
13234 nameidone(&nd);
13235 if (uap->namebuf != 0 && uap->bufsize > 0) {
13236 auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ,
13237 &uio_buf[0], sizeof(uio_buf));
13238 uio_addiov(auio, uap->namebuf, uap->bufsize);
13239 }
13240
13241 error = vn_listxattr(vp, auio, &attrsize, uap->options, ctx);
13242
13243 vnode_put(vp);
13244 if (auio) {
13245 *retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
13246 } else {
13247 *retval = (user_ssize_t)attrsize;
13248 }
13249 return error;
13250 }
13251
13252 /*
13253 * Retrieve the list of extended attribute names.
13254 * XXX Code duplication here.
13255 */
13256 int
flistxattr(proc_t p,struct flistxattr_args * uap,user_ssize_t * retval)13257 flistxattr(proc_t p, struct flistxattr_args *uap, user_ssize_t *retval)
13258 {
13259 vnode_t vp;
13260 uio_t auio = NULL;
13261 int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
13262 size_t attrsize = 0;
13263 int error;
13264 uio_stackbuf_t uio_buf[UIO_SIZEOF(1)];
13265
13266 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
13267 return EINVAL;
13268 }
13269
13270 if ((error = file_vnode(uap->fd, &vp))) {
13271 return error;
13272 }
13273 if ((error = vnode_getwithref(vp))) {
13274 file_drop(uap->fd);
13275 return error;
13276 }
13277 if (uap->namebuf != 0 && uap->bufsize > 0) {
13278 auio = uio_createwithbuffer(1, 0, spacetype,
13279 UIO_READ, &uio_buf[0], sizeof(uio_buf));
13280 uio_addiov(auio, uap->namebuf, uap->bufsize);
13281 }
13282
13283 error = vn_listxattr(vp, auio, &attrsize, uap->options, vfs_context_current());
13284
13285 vnode_put(vp);
13286 file_drop(uap->fd);
13287 if (auio) {
13288 *retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
13289 } else {
13290 *retval = (user_ssize_t)attrsize;
13291 }
13292 return error;
13293 }
13294
13295 int
fsgetpath_internal(vfs_context_t ctx,int volfs_id,uint64_t objid,vm_size_t bufsize,caddr_t buf,uint32_t options,int * pathlen)13296 fsgetpath_internal(vfs_context_t ctx, int volfs_id, uint64_t objid,
13297 vm_size_t bufsize, caddr_t buf, uint32_t options, int *pathlen)
13298 {
13299 int error;
13300 struct mount *mp = NULL;
13301 vnode_t vp;
13302 int length;
13303 int bpflags;
13304 /* maximum number of times to retry build_path */
13305 unsigned int retries = 0x10;
13306
13307 if (bufsize > PAGE_SIZE) {
13308 return EINVAL;
13309 }
13310
13311 if (buf == NULL) {
13312 return ENOMEM;
13313 }
13314
13315 retry:
13316 if ((mp = mount_lookupby_volfsid(volfs_id, 1)) == NULL) {
13317 error = ENOTSUP; /* unexpected failure */
13318 return ENOTSUP;
13319 }
13320
13321 #if CONFIG_UNION_MOUNTS
13322 unionget:
13323 #endif /* CONFIG_UNION_MOUNTS */
13324 if (objid == 2) {
13325 struct vfs_attr vfsattr;
13326 int use_vfs_root = TRUE;
13327
13328 VFSATTR_INIT(&vfsattr);
13329 VFSATTR_WANTED(&vfsattr, f_capabilities);
13330 if (!(options & FSOPT_ISREALFSID) &&
13331 vfs_getattr(mp, &vfsattr, vfs_context_kernel()) == 0 &&
13332 VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
13333 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS) &&
13334 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS)) {
13335 use_vfs_root = FALSE;
13336 }
13337 }
13338
13339 if (use_vfs_root) {
13340 error = VFS_ROOT(mp, &vp, ctx);
13341 } else {
13342 error = VFS_VGET(mp, objid, &vp, ctx);
13343 }
13344 } else {
13345 error = VFS_VGET(mp, (ino64_t)objid, &vp, ctx);
13346 }
13347
13348 #if CONFIG_UNION_MOUNTS
13349 if (error == ENOENT && (mp->mnt_flag & MNT_UNION)) {
13350 /*
13351 * If the fileid isn't found and we're in a union
13352 * mount volume, then see if the fileid is in the
13353 * mounted-on volume.
13354 */
13355 struct mount *tmp = mp;
13356 mp = vnode_mount(tmp->mnt_vnodecovered);
13357 vfs_unbusy(tmp);
13358 if (vfs_busy(mp, LK_NOWAIT) == 0) {
13359 goto unionget;
13360 }
13361 } else {
13362 vfs_unbusy(mp);
13363 }
13364 #else
13365 vfs_unbusy(mp);
13366 #endif /* CONFIG_UNION_MOUNTS */
13367
13368 if (error) {
13369 return error;
13370 }
13371
13372 #if CONFIG_MACF
13373 error = mac_vnode_check_fsgetpath(ctx, vp);
13374 if (error) {
13375 vnode_put(vp);
13376 return error;
13377 }
13378 #endif
13379
13380 /* Obtain the absolute path to this vnode. */
13381 bpflags = vfs_context_suser(ctx) ? BUILDPATH_CHECKACCESS : 0;
13382 if (options & FSOPT_NOFIRMLINKPATH) {
13383 bpflags |= BUILDPATH_NO_FIRMLINK;
13384 }
13385 bpflags |= BUILDPATH_CHECK_MOVED;
13386 error = build_path(vp, buf, (int)bufsize, &length, bpflags, ctx);
13387 vnode_put(vp);
13388
13389 if (error) {
13390 /* there was a race building the path, try a few more times */
13391 if (error == EAGAIN) {
13392 --retries;
13393 if (retries > 0) {
13394 goto retry;
13395 }
13396
13397 error = ENOENT;
13398 }
13399 goto out;
13400 }
13401
13402 AUDIT_ARG(text, buf);
13403
13404 if (kdebug_debugid_enabled(VFS_LOOKUP) && length > 0) {
13405 unsigned long path_words[NUMPARMS];
13406 size_t path_len = sizeof(path_words);
13407
13408 if ((size_t)length < path_len) {
13409 memcpy((char *)path_words, buf, length);
13410 memset((char *)path_words + length, 0, path_len - length);
13411
13412 path_len = length;
13413 } else {
13414 memcpy((char *)path_words, buf + (length - path_len), path_len);
13415 }
13416
13417 kdebug_vfs_lookup(path_words, (int)path_len, vp,
13418 KDBG_VFS_LOOKUP_FLAG_LOOKUP);
13419 }
13420
13421 *pathlen = length; /* may be superseded by error */
13422
13423 out:
13424 return error;
13425 }
13426
13427 /*
13428 * Obtain the full pathname of a file system object by id.
13429 */
13430 static int
fsgetpath_extended(user_addr_t buf,user_size_t bufsize,user_addr_t user_fsid,uint64_t objid,uint32_t options,user_ssize_t * retval)13431 fsgetpath_extended(user_addr_t buf, user_size_t bufsize, user_addr_t user_fsid, uint64_t objid,
13432 uint32_t options, user_ssize_t *retval)
13433 {
13434 vfs_context_t ctx = vfs_context_current();
13435 fsid_t fsid;
13436 char *realpath;
13437 int length;
13438 int error;
13439
13440 if (options & ~(FSOPT_NOFIRMLINKPATH | FSOPT_ISREALFSID)) {
13441 return EINVAL;
13442 }
13443
13444 if ((error = copyin(user_fsid, (caddr_t)&fsid, sizeof(fsid)))) {
13445 return error;
13446 }
13447 AUDIT_ARG(value32, fsid.val[0]);
13448 AUDIT_ARG(value64, objid);
13449 /* Restrict output buffer size for now. */
13450
13451 if (bufsize > PAGE_SIZE || bufsize <= 0) {
13452 return EINVAL;
13453 }
13454 realpath = kalloc_data(bufsize, Z_WAITOK | Z_ZERO);
13455 if (realpath == NULL) {
13456 return ENOMEM;
13457 }
13458
13459 error = fsgetpath_internal(ctx, fsid.val[0], objid, bufsize, realpath,
13460 options, &length);
13461
13462 if (error) {
13463 goto out;
13464 }
13465
13466 error = copyout((caddr_t)realpath, buf, length);
13467
13468 *retval = (user_ssize_t)length; /* may be superseded by error */
13469 out:
13470 kfree_data(realpath, bufsize);
13471 return error;
13472 }
13473
13474 int
fsgetpath(__unused proc_t p,struct fsgetpath_args * uap,user_ssize_t * retval)13475 fsgetpath(__unused proc_t p, struct fsgetpath_args *uap, user_ssize_t *retval)
13476 {
13477 return fsgetpath_extended(uap->buf, uap->bufsize, uap->fsid, uap->objid,
13478 0, retval);
13479 }
13480
13481 int
fsgetpath_ext(__unused proc_t p,struct fsgetpath_ext_args * uap,user_ssize_t * retval)13482 fsgetpath_ext(__unused proc_t p, struct fsgetpath_ext_args *uap, user_ssize_t *retval)
13483 {
13484 return fsgetpath_extended(uap->buf, uap->bufsize, uap->fsid, uap->objid,
13485 uap->options, retval);
13486 }
13487
13488 /*
13489 * Common routine to handle various flavors of statfs data heading out
13490 * to user space.
13491 *
13492 * Returns: 0 Success
13493 * EFAULT
13494 */
13495 static int
munge_statfs(struct mount * mp,struct vfsstatfs * sfsp,user_addr_t bufp,int * sizep,boolean_t is_64_bit,boolean_t partial_copy)13496 munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
13497 user_addr_t bufp, int *sizep, boolean_t is_64_bit,
13498 boolean_t partial_copy)
13499 {
13500 int error;
13501 int my_size, copy_size;
13502
13503 if (is_64_bit) {
13504 struct user64_statfs sfs;
13505 my_size = copy_size = sizeof(sfs);
13506 bzero(&sfs, my_size);
13507 sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
13508 sfs.f_type = (short)mp->mnt_vtable->vfc_typenum;
13509 sfs.f_reserved1 = (short)sfsp->f_fssubtype;
13510 sfs.f_bsize = (user64_long_t)sfsp->f_bsize;
13511 sfs.f_iosize = (user64_long_t)sfsp->f_iosize;
13512 sfs.f_blocks = (user64_long_t)sfsp->f_blocks;
13513 sfs.f_bfree = (user64_long_t)sfsp->f_bfree;
13514 sfs.f_bavail = (user64_long_t)sfsp->f_bavail;
13515 sfs.f_files = (user64_long_t)sfsp->f_files;
13516 sfs.f_ffree = (user64_long_t)sfsp->f_ffree;
13517 sfs.f_fsid = sfsp->f_fsid;
13518 sfs.f_owner = sfsp->f_owner;
13519 if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
13520 strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSNAMELEN);
13521 } else {
13522 strlcpy(&sfs.f_fstypename[0], &sfsp->f_fstypename[0], MFSNAMELEN);
13523 }
13524 strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
13525 strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);
13526
13527 if (partial_copy) {
13528 copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
13529 }
13530 error = copyout((caddr_t)&sfs, bufp, copy_size);
13531 } else {
13532 struct user32_statfs sfs;
13533
13534 my_size = copy_size = sizeof(sfs);
13535 bzero(&sfs, my_size);
13536
13537 sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
13538 sfs.f_type = (short)mp->mnt_vtable->vfc_typenum;
13539 sfs.f_reserved1 = (short)sfsp->f_fssubtype;
13540
13541 /*
13542 * It's possible for there to be more than 2^^31 blocks in the filesystem, so we
13543 * have to fudge the numbers here in that case. We inflate the blocksize in order
13544 * to reflect the filesystem size as best we can.
13545 */
13546 if ((sfsp->f_blocks > INT_MAX)
13547 /* Hack for 4061702 . I think the real fix is for Carbon to
13548 * look for some volume capability and not depend on hidden
13549 * semantics agreed between a FS and carbon.
13550 * f_blocks, f_bfree, and f_bavail set to -1 is the trigger
13551 * for Carbon to set bNoVolumeSizes volume attribute.
13552 * Without this the webdavfs files cannot be copied onto
13553 * disk as they look huge. This change should not affect
13554 * XSAN as they should not setting these to -1..
13555 */
13556 && (sfsp->f_blocks != 0xffffffffffffffffULL)
13557 && (sfsp->f_bfree != 0xffffffffffffffffULL)
13558 && (sfsp->f_bavail != 0xffffffffffffffffULL)) {
13559 int shift;
13560
13561 /*
13562 * Work out how far we have to shift the block count down to make it fit.
13563 * Note that it's possible to have to shift so far that the resulting
13564 * blocksize would be unreportably large. At that point, we will clip
13565 * any values that don't fit.
13566 *
13567 * For safety's sake, we also ensure that f_iosize is never reported as
13568 * being smaller than f_bsize.
13569 */
13570 for (shift = 0; shift < 32; shift++) {
13571 if ((sfsp->f_blocks >> shift) <= INT_MAX) {
13572 break;
13573 }
13574 if ((sfsp->f_bsize << (shift + 1)) > INT_MAX) {
13575 break;
13576 }
13577 }
13578 #define __SHIFT_OR_CLIP(x, s) ((((x) >> (s)) > INT_MAX) ? INT_MAX : ((x) >> (s)))
13579 sfs.f_blocks = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_blocks, shift);
13580 sfs.f_bfree = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bfree, shift);
13581 sfs.f_bavail = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bavail, shift);
13582 #undef __SHIFT_OR_CLIP
13583 sfs.f_bsize = (user32_long_t)(sfsp->f_bsize << shift);
13584 sfs.f_iosize = (int)lmax(sfsp->f_iosize, sfsp->f_bsize);
13585 } else {
13586 /* filesystem is small enough to be reported honestly */
13587 sfs.f_bsize = (user32_long_t)sfsp->f_bsize;
13588 sfs.f_iosize = (user32_long_t)sfsp->f_iosize;
13589 sfs.f_blocks = (user32_long_t)sfsp->f_blocks;
13590 sfs.f_bfree = (user32_long_t)sfsp->f_bfree;
13591 sfs.f_bavail = (user32_long_t)sfsp->f_bavail;
13592 }
13593 sfs.f_files = (user32_long_t)sfsp->f_files;
13594 sfs.f_ffree = (user32_long_t)sfsp->f_ffree;
13595 sfs.f_fsid = sfsp->f_fsid;
13596 sfs.f_owner = sfsp->f_owner;
13597 if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
13598 strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSNAMELEN);
13599 } else {
13600 strlcpy(&sfs.f_fstypename[0], &sfsp->f_fstypename[0], MFSNAMELEN);
13601 }
13602 strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
13603 strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);
13604
13605 if (partial_copy) {
13606 copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
13607 }
13608 error = copyout((caddr_t)&sfs, bufp, copy_size);
13609 }
13610
13611 if (sizep != NULL) {
13612 *sizep = my_size;
13613 }
13614 return error;
13615 }
13616
13617 /*
13618 * copy stat structure into user_stat structure.
13619 */
13620 void
munge_user64_stat(struct stat * sbp,struct user64_stat * usbp)13621 munge_user64_stat(struct stat *sbp, struct user64_stat *usbp)
13622 {
13623 bzero(usbp, sizeof(*usbp));
13624
13625 usbp->st_dev = sbp->st_dev;
13626 usbp->st_ino = sbp->st_ino;
13627 usbp->st_mode = sbp->st_mode;
13628 usbp->st_nlink = sbp->st_nlink;
13629 usbp->st_uid = sbp->st_uid;
13630 usbp->st_gid = sbp->st_gid;
13631 usbp->st_rdev = sbp->st_rdev;
13632 #ifndef _POSIX_C_SOURCE
13633 usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
13634 usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
13635 usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
13636 usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
13637 usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
13638 usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
13639 #else
13640 usbp->st_atime = sbp->st_atime;
13641 usbp->st_atimensec = sbp->st_atimensec;
13642 usbp->st_mtime = sbp->st_mtime;
13643 usbp->st_mtimensec = sbp->st_mtimensec;
13644 usbp->st_ctime = sbp->st_ctime;
13645 usbp->st_ctimensec = sbp->st_ctimensec;
13646 #endif
13647 usbp->st_size = sbp->st_size;
13648 usbp->st_blocks = sbp->st_blocks;
13649 usbp->st_blksize = sbp->st_blksize;
13650 usbp->st_flags = sbp->st_flags;
13651 usbp->st_gen = sbp->st_gen;
13652 usbp->st_lspare = sbp->st_lspare;
13653 usbp->st_qspare[0] = sbp->st_qspare[0];
13654 usbp->st_qspare[1] = sbp->st_qspare[1];
13655 }
13656
/*
 * Copy a kernel stat structure into a 32-bit user_stat structure.
 * Wide timestamp fields are explicitly narrowed; bzero() first so padding
 * is zeroed (avoids leaking stack bytes on a later copyout).
 */
void
munge_user32_stat(struct stat *sbp, struct user32_stat *usbp)
{
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
	/* The timestamp field names differ depending on the POSIX namespace mode. */
#ifndef _POSIX_C_SOURCE
	usbp->st_atimespec.tv_sec = (user32_time_t)sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = (user32_long_t)sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = (user32_time_t)sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = (user32_long_t)sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = (user32_time_t)sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = (user32_long_t)sbp->st_ctimespec.tv_nsec;
#else
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
13693
13694 /*
13695 * copy stat64 structure into user_stat64 structure.
13696 */
13697 void
munge_user64_stat64(struct stat64 * sbp,struct user64_stat64 * usbp)13698 munge_user64_stat64(struct stat64 *sbp, struct user64_stat64 *usbp)
13699 {
13700 bzero(usbp, sizeof(*usbp));
13701
13702 usbp->st_dev = sbp->st_dev;
13703 usbp->st_ino = sbp->st_ino;
13704 usbp->st_mode = sbp->st_mode;
13705 usbp->st_nlink = sbp->st_nlink;
13706 usbp->st_uid = sbp->st_uid;
13707 usbp->st_gid = sbp->st_gid;
13708 usbp->st_rdev = sbp->st_rdev;
13709 #ifndef _POSIX_C_SOURCE
13710 usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
13711 usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
13712 usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
13713 usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
13714 usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
13715 usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
13716 usbp->st_birthtimespec.tv_sec = sbp->st_birthtimespec.tv_sec;
13717 usbp->st_birthtimespec.tv_nsec = sbp->st_birthtimespec.tv_nsec;
13718 #else
13719 usbp->st_atime = sbp->st_atime;
13720 usbp->st_atimensec = sbp->st_atimensec;
13721 usbp->st_mtime = sbp->st_mtime;
13722 usbp->st_mtimensec = sbp->st_mtimensec;
13723 usbp->st_ctime = sbp->st_ctime;
13724 usbp->st_ctimensec = sbp->st_ctimensec;
13725 usbp->st_birthtime = sbp->st_birthtime;
13726 usbp->st_birthtimensec = sbp->st_birthtimensec;
13727 #endif
13728 usbp->st_size = sbp->st_size;
13729 usbp->st_blocks = sbp->st_blocks;
13730 usbp->st_blksize = sbp->st_blksize;
13731 usbp->st_flags = sbp->st_flags;
13732 usbp->st_gen = sbp->st_gen;
13733 usbp->st_lspare = sbp->st_lspare;
13734 usbp->st_qspare[0] = sbp->st_qspare[0];
13735 usbp->st_qspare[1] = sbp->st_qspare[1];
13736 }
13737
/*
 * Copy a kernel stat64 structure into a 32-bit user_stat64 structure.
 * Wide timestamp fields are explicitly narrowed; bzero() first so padding
 * is zeroed (avoids leaking stack bytes on a later copyout).
 */
void
munge_user32_stat64(struct stat64 *sbp, struct user32_stat64 *usbp)
{
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
	/* The timestamp field names differ depending on the POSIX namespace mode. */
#ifndef _POSIX_C_SOURCE
	usbp->st_atimespec.tv_sec = (user32_time_t)sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = (user32_long_t)sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = (user32_time_t)sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = (user32_long_t)sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = (user32_time_t)sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = (user32_long_t)sbp->st_ctimespec.tv_nsec;
	usbp->st_birthtimespec.tv_sec = (user32_time_t)sbp->st_birthtimespec.tv_sec;
	usbp->st_birthtimespec.tv_nsec = (user32_long_t)sbp->st_birthtimespec.tv_nsec;
#else
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
	usbp->st_birthtime = sbp->st_birthtime;
	usbp->st_birthtimensec = sbp->st_birthtimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
13778
13779 /*
13780 * Purge buffer cache for simulating cold starts
13781 */
13782 static int
vnode_purge_callback(struct vnode * vp,__unused void * cargs)13783 vnode_purge_callback(struct vnode *vp, __unused void *cargs)
13784 {
13785 ubc_msync(vp, (off_t)0, ubc_getsize(vp), NULL /* off_t *resid_off */, UBC_PUSHALL | UBC_INVALIDATE);
13786
13787 return VNODE_RETURNED;
13788 }
13789
13790 static int
vfs_purge_callback(mount_t mp,__unused void * arg)13791 vfs_purge_callback(mount_t mp, __unused void * arg)
13792 {
13793 vnode_iterate(mp, VNODE_WAIT | VNODE_ITERATE_ALL, vnode_purge_callback, NULL);
13794
13795 return VFS_RETURNED;
13796 }
13797
13798 int
vfs_purge(__unused struct proc * p,__unused struct vfs_purge_args * uap,__unused int32_t * retval)13799 vfs_purge(__unused struct proc *p, __unused struct vfs_purge_args *uap, __unused int32_t *retval)
13800 {
13801 if (!kauth_cred_issuser(kauth_cred_get())) {
13802 return EPERM;
13803 }
13804
13805 vfs_iterate(0 /* flags */, vfs_purge_callback, NULL);
13806
13807 return 0;
13808 }
13809
13810 /*
13811 * gets the vnode associated with the (unnamed) snapshot directory
13812 * for a Filesystem. The snapshot directory vnode is returned with
13813 * an iocount on it.
13814 */
13815 int
vnode_get_snapdir(vnode_t rvp,vnode_t * sdvpp,vfs_context_t ctx)13816 vnode_get_snapdir(vnode_t rvp, vnode_t *sdvpp, vfs_context_t ctx)
13817 {
13818 return VFS_VGET_SNAPDIR(vnode_mount(rvp), sdvpp, ctx);
13819 }
13820
13821 /*
13822 * Get the snapshot vnode.
13823 *
13824 * If successful, the call returns with an iocount on *rvpp ,*sdvpp and
13825 * needs nameidone() on ndp.
13826 *
13827 * If the snapshot vnode exists it is returned in ndp->ni_vp.
13828 *
13829 * If it returns with an error, *rvpp, *sdvpp are NULL and nameidone() is
13830 * not needed.
13831 */
13832 static int
vnode_get_snapshot(int dirfd,vnode_t * rvpp,vnode_t * sdvpp,user_addr_t name,struct nameidata * ndp,int32_t op,__unused enum path_operation pathop,vfs_context_t ctx)13833 vnode_get_snapshot(int dirfd, vnode_t *rvpp, vnode_t *sdvpp,
13834 user_addr_t name, struct nameidata *ndp, int32_t op,
13835 #if !CONFIG_TRIGGERS
13836 __unused
13837 #endif
13838 enum path_operation pathop,
13839 vfs_context_t ctx)
13840 {
13841 int error, i;
13842 caddr_t name_buf;
13843 size_t name_len;
13844 struct vfs_attr vfa;
13845
13846 *sdvpp = NULLVP;
13847 *rvpp = NULLVP;
13848
13849 error = vnode_getfromfd(ctx, dirfd, rvpp);
13850 if (error) {
13851 return error;
13852 }
13853
13854 if (!vnode_isvroot(*rvpp)) {
13855 error = EINVAL;
13856 goto out;
13857 }
13858
13859 /* Make sure the filesystem supports snapshots */
13860 VFSATTR_INIT(&vfa);
13861 VFSATTR_WANTED(&vfa, f_capabilities);
13862 if ((vfs_getattr(vnode_mount(*rvpp), &vfa, ctx) != 0) ||
13863 !VFSATTR_IS_SUPPORTED(&vfa, f_capabilities) ||
13864 !((vfa.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] &
13865 VOL_CAP_INT_SNAPSHOT)) ||
13866 !((vfa.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] &
13867 VOL_CAP_INT_SNAPSHOT))) {
13868 error = ENOTSUP;
13869 goto out;
13870 }
13871
13872 error = vnode_get_snapdir(*rvpp, sdvpp, ctx);
13873 if (error) {
13874 goto out;
13875 }
13876
13877 name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
13878 error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
13879 if (error) {
13880 goto out1;
13881 }
13882
13883 /*
13884 * Some sanity checks- name can't be empty, "." or ".." or have slashes.
13885 * (the length returned by copyinstr includes the terminating NUL)
13886 */
13887 if ((name_len == 1) || (name_len == 2 && name_buf[0] == '.') ||
13888 (name_len == 3 && name_buf[0] == '.' && name_buf[1] == '.')) {
13889 error = EINVAL;
13890 goto out1;
13891 }
13892 for (i = 0; i < (int)name_len && name_buf[i] != '/'; i++) {
13893 ;
13894 }
13895 if (i < (int)name_len) {
13896 error = EINVAL;
13897 goto out1;
13898 }
13899
13900 #if CONFIG_MACF
13901 if (op == CREATE) {
13902 error = mac_mount_check_snapshot_create(ctx, vnode_mount(*rvpp),
13903 name_buf);
13904 } else if (op == DELETE) {
13905 error = mac_mount_check_snapshot_delete(ctx, vnode_mount(*rvpp),
13906 name_buf);
13907 }
13908 if (error) {
13909 goto out1;
13910 }
13911 #endif
13912
13913 /* Check if the snapshot already exists ... */
13914 NDINIT(ndp, op, pathop, USEDVP | NOCACHE | AUDITVNPATH1,
13915 UIO_SYSSPACE, CAST_USER_ADDR_T(name_buf), ctx);
13916 ndp->ni_dvp = *sdvpp;
13917
13918 error = namei(ndp);
13919 out1:
13920 zfree(ZV_NAMEI, name_buf);
13921 out:
13922 if (error) {
13923 if (*sdvpp) {
13924 vnode_put(*sdvpp);
13925 *sdvpp = NULLVP;
13926 }
13927 if (*rvpp) {
13928 vnode_put(*rvpp);
13929 *rvpp = NULLVP;
13930 }
13931 }
13932 return error;
13933 }
13934
13935 /*
13936 * create a filesystem snapshot (for supporting filesystems)
13937 *
13938 * A much simplified version of openat(dirfd, name, O_CREAT | O_EXCL)
13939 * We get to the (unnamed) snapshot directory vnode and create the vnode
13940 * for the snapshot in it.
13941 *
13942 * Restrictions:
13943 *
13944 * a) Passed in name for snapshot cannot have slashes.
13945 * b) name can't be "." or ".."
13946 *
13947 * Since this requires superuser privileges, vnode_authorize calls are not
13948 * made.
13949 */
static int __attribute__((noinline))
snapshot_create(int dirfd, user_addr_t name, __unused uint32_t flags,
    vfs_context_t ctx)
{
	vnode_t rvp, snapdvp;
	int error;
	struct nameidata *ndp;

	/* struct nameidata is too large for the kernel stack. */
	ndp = kalloc_type(struct nameidata, Z_WAITOK);

	/*
	 * Resolve the mount root (rvp) and the unnamed snapshot directory
	 * (snapdvp), and run namei on "name" within it (state left in *ndp).
	 * On success all returned vnodes hold iocounts we release below.
	 */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, ndp, CREATE,
	    OP_LINK, ctx);
	if (error) {
		goto out;
	}

	if (ndp->ni_vp) {
		/* The lookup found an existing snapshot with this name. */
		vnode_put(ndp->ni_vp);
		error = EEXIST;
	} else {
		struct vnode_attr *vap;
		vnode_t vp = NULLVP;

		vap = kalloc_type(struct vnode_attr, Z_WAITOK);

		/* Snapshots are created as mode-0 regular files. */
		VATTR_INIT(vap);
		VATTR_SET(vap, va_type, VREG);
		VATTR_SET(vap, va_mode, 0);

		/*
		 * Caller has already passed the privilege checks, so skip
		 * authorization and attribute inheritance here.
		 */
		error = vn_create(snapdvp, &vp, ndp, vap,
		    VN_CREATE_NOAUTH | VN_CREATE_NOINHERIT, 0, NULL, ctx);
		if (!error && vp) {
			/* Drop the iocount vn_create returned on the new vnode. */
			vnode_put(vp);
		}

		kfree_type(struct vnode_attr, vap);
	}

	nameidone(ndp);
	vnode_put(snapdvp);
	vnode_put(rvp);
out:
	kfree_type(struct nameidata, ndp);

	return error;
}
13996
13997 /*
13998 * Delete a Filesystem snapshot
13999 *
14000 * get the vnode for the unnamed snapshot directory and the snapshot and
14001 * delete the snapshot.
14002 */
static int __attribute__((noinline))
snapshot_delete(int dirfd, user_addr_t name, __unused uint32_t flags,
    vfs_context_t ctx)
{
	vnode_t rvp, snapdvp;
	int error;
	struct nameidata *ndp;

	/* struct nameidata is too large for the kernel stack. */
	ndp = kalloc_type(struct nameidata, Z_WAITOK);

	/*
	 * Get iocounted references to the mount root (rvp), the unnamed
	 * snapshot directory (snapdvp), and the snapshot itself (ndp->ni_vp).
	 */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, ndp, DELETE,
	    OP_UNLINK, ctx);
	if (error) {
		goto out;
	}

	/* Remove the snapshot, suppressing the namespace event. */
	error = VNOP_REMOVE(snapdvp, ndp->ni_vp, &ndp->ni_cnd,
	    VNODE_REMOVE_SKIP_NAMESPACE_EVENT, ctx);

	vnode_put(ndp->ni_vp);
	nameidone(ndp);
	vnode_put(snapdvp);
	vnode_put(rvp);
out:
	kfree_type(struct nameidata, ndp);

	return error;
}
14031
14032 /*
14033 * Revert a filesystem to a snapshot
14034 *
14035 * Marks the filesystem to revert to the given snapshot on next mount.
14036 */
14037 static int __attribute__((noinline))
snapshot_revert(int dirfd,user_addr_t name,__unused uint32_t flags,vfs_context_t ctx)14038 snapshot_revert(int dirfd, user_addr_t name, __unused uint32_t flags,
14039 vfs_context_t ctx)
14040 {
14041 int error;
14042 vnode_t rvp;
14043 mount_t mp;
14044 struct fs_snapshot_revert_args revert_data;
14045 struct componentname cnp;
14046 caddr_t name_buf;
14047 size_t name_len;
14048
14049 error = vnode_getfromfd(ctx, dirfd, &rvp);
14050 if (error) {
14051 return error;
14052 }
14053 mp = vnode_mount(rvp);
14054
14055 name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
14056 error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
14057 if (error) {
14058 zfree(ZV_NAMEI, name_buf);
14059 vnode_put(rvp);
14060 return error;
14061 }
14062
14063 #if CONFIG_MACF
14064 error = mac_mount_check_snapshot_revert(ctx, mp, name_buf);
14065 if (error) {
14066 zfree(ZV_NAMEI, name_buf);
14067 vnode_put(rvp);
14068 return error;
14069 }
14070 #endif
14071
14072 /*
14073 * Grab mount_iterref so that we can release the vnode,
14074 * since VFSIOC_REVERT_SNAPSHOT could conceivably cause a sync.
14075 */
14076 error = mount_iterref(mp, 0);
14077 vnode_put(rvp);
14078 if (error) {
14079 zfree(ZV_NAMEI, name_buf);
14080 return error;
14081 }
14082
14083 memset(&cnp, 0, sizeof(cnp));
14084 cnp.cn_pnbuf = (char *)name_buf;
14085 cnp.cn_nameiop = LOOKUP;
14086 cnp.cn_flags = ISLASTCN | HASBUF;
14087 cnp.cn_pnlen = MAXPATHLEN;
14088 cnp.cn_nameptr = cnp.cn_pnbuf;
14089 cnp.cn_namelen = (int)name_len;
14090 revert_data.sr_cnp = &cnp;
14091
14092 error = VFS_IOCTL(mp, VFSIOC_REVERT_SNAPSHOT, (caddr_t)&revert_data, 0, ctx);
14093 mount_iterdrop(mp);
14094 zfree(ZV_NAMEI, name_buf);
14095
14096 if (error) {
14097 /* If there was any error, try again using VNOP_IOCTL */
14098
14099 vnode_t snapdvp;
14100 struct nameidata namend;
14101
14102 error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, &namend, LOOKUP,
14103 OP_LOOKUP, ctx);
14104 if (error) {
14105 return error;
14106 }
14107
14108
14109 error = VNOP_IOCTL(namend.ni_vp, APFSIOC_REVERT_TO_SNAPSHOT, (caddr_t) NULL,
14110 0, ctx);
14111
14112 vnode_put(namend.ni_vp);
14113 nameidone(&namend);
14114 vnode_put(snapdvp);
14115 vnode_put(rvp);
14116 }
14117
14118 return error;
14119 }
14120
14121 /*
14122 * rename a Filesystem snapshot
14123 *
14124 * get the vnode for the unnamed snapshot directory and the snapshot and
14125 * rename the snapshot. This is a very specialised (and simple) case of
14126 * rename(2) (which has to deal with a lot more complications). It differs
14127 * slightly from rename(2) in that EEXIST is returned if the new name exists.
14128 */
static int __attribute__((noinline))
snapshot_rename(int dirfd, user_addr_t old, user_addr_t new,
    __unused uint32_t flags, vfs_context_t ctx)
{
	vnode_t rvp, snapdvp;
	int error, i;
	caddr_t newname_buf;
	size_t name_len;
	vnode_t fvp;
	struct nameidata *fromnd, *tond;
	/* carving out a chunk for structs that are too big to be on stack. */
	struct {
		struct nameidata from_node;
		struct nameidata to_node;
	} * __rename_data;

	__rename_data = kalloc_type(typeof(*__rename_data), Z_WAITOK);
	fromnd = &__rename_data->from_node;
	tond = &__rename_data->to_node;

	/* Look up the existing snapshot; must exist (DELETE/OP_UNLINK lookup). */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, old, fromnd, DELETE,
	    OP_UNLINK, ctx);
	if (error) {
		goto out;
	}
	fvp = fromnd->ni_vp;

	newname_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	error = copyinstr(new, newname_buf, MAXPATHLEN, &name_len);
	if (error) {
		goto out1;
	}

	/*
	 * Some sanity checks- new name can't be empty, "." or ".." or have
	 * slashes.
	 * (the length returned by copyinstr includes the terminating NUL)
	 *
	 * The FS rename VNOP is supposed to handle this but we'll pick it
	 * off here itself.
	 */
	if ((name_len == 1) || (name_len == 2 && newname_buf[0] == '.') ||
	    (name_len == 3 && newname_buf[0] == '.' && newname_buf[1] == '.')) {
		error = EINVAL;
		goto out1;
	}
	/* Scan for '/'; if the loop stops early, a slash was found. */
	for (i = 0; i < (int)name_len && newname_buf[i] != '/'; i++) {
		;
	}
	if (i < (int)name_len) {
		error = EINVAL;
		goto out1;
	}

#if CONFIG_MACF
	/* Creating the destination name is the MAC-relevant operation. */
	error = mac_mount_check_snapshot_create(ctx, vnode_mount(rvp),
	    newname_buf);
	if (error) {
		goto out1;
	}
#endif

	/* Look up the new name in the snapshot directory. */
	NDINIT(tond, RENAME, OP_RENAME, USEDVP | NOCACHE | AUDITVNPATH2,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(newname_buf), ctx);
	tond->ni_dvp = snapdvp;

	error = namei(tond);
	if (error) {
		goto out2;
	} else if (tond->ni_vp) {
		/*
		 * snapshot rename behaves differently than rename(2) - if the
		 * new name exists, EEXIST is returned.
		 */
		vnode_put(tond->ni_vp);
		error = EEXIST;
		goto out2;
	}

	/* Source and target directory are both snapdvp; no target vnode. */
	error = VNOP_RENAME(snapdvp, fvp, &fromnd->ni_cnd, snapdvp, NULLVP,
	    &tond->ni_cnd, ctx);

out2:
	nameidone(tond);
out1:
	zfree(ZV_NAMEI, newname_buf);
	vnode_put(fvp);
	vnode_put(snapdvp);
	vnode_put(rvp);
	nameidone(fromnd);
out:
	kfree_type(typeof(*__rename_data), __rename_data);
	return error;
}
14223
14224 /*
14225 * Mount a Filesystem snapshot
14226 *
14227 * get the vnode for the unnamed snapshot directory and the snapshot and
14228 * mount the snapshot.
14229 */
static int __attribute__((noinline))
snapshot_mount(int dirfd, user_addr_t name, user_addr_t directory,
    __unused user_addr_t mnt_data, __unused uint32_t flags, vfs_context_t ctx)
{
	mount_t mp;
	vnode_t rvp, snapdvp, snapvp, vp, pvp;
	struct fs_snapshot_mount_args smnt_data;
	int error;
	struct nameidata *snapndp, *dirndp;
	/* carving out a chunk for structs that are too big to be on stack. */
	struct {
		struct nameidata snapnd;
		struct nameidata dirnd;
	} * __snapshot_mount_data;

	__snapshot_mount_data = kalloc_type(typeof(*__snapshot_mount_data), Z_WAITOK);
	snapndp = &__snapshot_mount_data->snapnd;
	dirndp = &__snapshot_mount_data->dirnd;

	/* Resolve the snapshot to be mounted (iocounts on rvp/snapdvp/ni_vp). */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, snapndp, LOOKUP,
	    OP_LOOKUP, ctx);
	if (error) {
		goto out;
	}

	snapvp = snapndp->ni_vp;
	/* Bail if the source filesystem was unmounted/forced dead under us. */
	if (!vnode_mount(rvp) || (vnode_mount(rvp) == dead_mountp)) {
		error = EIO;
		goto out1;
	}

	/* Get the vnode to be covered */
	NDINIT(dirndp, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
	    UIO_USERSPACE, directory, ctx);
	error = namei(dirndp);
	if (error) {
		goto out1;
	}

	vp = dirndp->ni_vp;
	pvp = dirndp->ni_dvp;
	mp = vnode_mount(rvp);

	/* Refuse to cover the root of the root filesystem. */
	if ((vp->v_flag & VROOT) && (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
		error = EINVAL;
		goto out2;
	}

#if CONFIG_MACF
	error = mac_mount_check_snapshot_mount(ctx, rvp, vp, &dirndp->ni_cnd, snapndp->ni_cnd.cn_nameptr,
	    mp->mnt_vfsstat.f_fstypename);
	if (error) {
		goto out2;
	}
#endif

	/*
	 * Pass the source mount and the snapshot's componentname to the
	 * filesystem via KERNEL_MOUNT_SNAPSHOT; only MNT_DONTBROWSE is
	 * honored from the caller-supplied flags.
	 */
	smnt_data.sm_mp = mp;
	smnt_data.sm_cnp = &snapndp->ni_cnd;
	error = mount_common(mp->mnt_vfsstat.f_fstypename, pvp, vp,
	    &dirndp->ni_cnd, CAST_USER_ADDR_T(&smnt_data), flags & MNT_DONTBROWSE,
	    KERNEL_MOUNT_SNAPSHOT, NULL, ctx);

out2:
	vnode_put(vp);
	vnode_put(pvp);
	nameidone(dirndp);
out1:
	vnode_put(snapvp);
	vnode_put(snapdvp);
	vnode_put(rvp);
	nameidone(snapndp);
out:
	kfree_type(typeof(*__snapshot_mount_data), __snapshot_mount_data);
	return error;
}
14305
14306 /*
14307 * Root from a snapshot of the filesystem
14308 *
14309 * Marks the filesystem to root from the given snapshot on next boot.
14310 */
14311 static int __attribute__((noinline))
snapshot_root(int dirfd,user_addr_t name,__unused uint32_t flags,vfs_context_t ctx)14312 snapshot_root(int dirfd, user_addr_t name, __unused uint32_t flags,
14313 vfs_context_t ctx)
14314 {
14315 int error;
14316 vnode_t rvp;
14317 mount_t mp;
14318 struct fs_snapshot_root_args root_data;
14319 struct componentname cnp;
14320 caddr_t name_buf;
14321 size_t name_len;
14322
14323 error = vnode_getfromfd(ctx, dirfd, &rvp);
14324 if (error) {
14325 return error;
14326 }
14327 mp = vnode_mount(rvp);
14328
14329 name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
14330 error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
14331 if (error) {
14332 zfree(ZV_NAMEI, name_buf);
14333 vnode_put(rvp);
14334 return error;
14335 }
14336
14337 // XXX MAC checks ?
14338
14339 /*
14340 * Grab mount_iterref so that we can release the vnode,
14341 * since VFSIOC_ROOT_SNAPSHOT could conceivably cause a sync.
14342 */
14343 error = mount_iterref(mp, 0);
14344 vnode_put(rvp);
14345 if (error) {
14346 zfree(ZV_NAMEI, name_buf);
14347 return error;
14348 }
14349
14350 memset(&cnp, 0, sizeof(cnp));
14351 cnp.cn_pnbuf = (char *)name_buf;
14352 cnp.cn_nameiop = LOOKUP;
14353 cnp.cn_flags = ISLASTCN | HASBUF;
14354 cnp.cn_pnlen = MAXPATHLEN;
14355 cnp.cn_nameptr = cnp.cn_pnbuf;
14356 cnp.cn_namelen = (int)name_len;
14357 root_data.sr_cnp = &cnp;
14358
14359 error = VFS_IOCTL(mp, VFSIOC_ROOT_SNAPSHOT, (caddr_t)&root_data, 0, ctx);
14360
14361 mount_iterdrop(mp);
14362 zfree(ZV_NAMEI, name_buf);
14363
14364 return error;
14365 }
14366
14367 static boolean_t
vfs_context_can_snapshot(vfs_context_t ctx)14368 vfs_context_can_snapshot(vfs_context_t ctx)
14369 {
14370 static const char * const snapshot_entitlements[] = {
14371 "com.apple.private.vfs.snapshot",
14372 "com.apple.developer.vfs.snapshot",
14373 "com.apple.private.apfs.arv.limited.snapshot",
14374 };
14375 static const size_t nentitlements =
14376 sizeof(snapshot_entitlements) / sizeof(snapshot_entitlements[0]);
14377 size_t i;
14378
14379 task_t task = vfs_context_task(ctx);
14380 for (i = 0; i < nentitlements; i++) {
14381 if (IOTaskHasEntitlement(task, snapshot_entitlements[i])) {
14382 return TRUE;
14383 }
14384 }
14385 return FALSE;
14386 }
14387
14388 /*
14389 * FS snapshot operations dispatcher
14390 */
int
fs_snapshot(__unused proc_t p, struct fs_snapshot_args *uap,
    __unused int32_t *retval)
{
	int error;
	vfs_context_t ctx = vfs_context_current();

	AUDIT_ARG(fd, uap->dirfd);
	AUDIT_ARG(value32, uap->op);

	/* All snapshot operations require one of the snapshot entitlements. */
	if (!vfs_context_can_snapshot(ctx)) {
		return EPERM;
	}

	/*
	 * Enforce user authorization for snapshot modification operations,
	 * or if trying to root from snapshot.
	 */
	if (uap->op != SNAPSHOT_OP_MOUNT) {
		vnode_t dvp = NULLVP;
		vnode_t devvp = NULLVP;
		mount_t mp;

		error = vnode_getfromfd(ctx, uap->dirfd, &dvp);
		if (error) {
			return error;
		}
		mp = vnode_mount(dvp);
		devvp = mp->mnt_devvp;

		/* get an iocount on devvp */
		if (devvp == NULLVP) {
			error = vnode_lookup(mp->mnt_vfsstat.f_mntfromname, 0, &devvp, ctx);
			/* for mounts which arent block devices */
			if (error == ENOENT) {
				error = ENXIO;
			}
		} else {
			error = vnode_getwithref(devvp);
		}

		if (error) {
			vnode_put(dvp);
			return error;
		}

		/*
		 * Permitted if any of: superuser, authorized to write the
		 * backing device, or holding the user snapshot entitlement.
		 */
		if ((vfs_context_issuser(ctx) == 0) &&
		    (vnode_authorize(devvp, NULL, KAUTH_VNODE_WRITE_DATA, ctx) != 0) &&
		    (!IOTaskHasEntitlement(vfs_context_task(ctx), "com.apple.private.vfs.snapshot.user"))) {
			error = EPERM;
		}
		vnode_put(dvp);
		vnode_put(devvp);

		if (error) {
			return error;
		}
	}

	/* Dispatch to the per-operation handler. */
	switch (uap->op) {
	case SNAPSHOT_OP_CREATE:
		error = snapshot_create(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
	case SNAPSHOT_OP_DELETE:
		error = snapshot_delete(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
	case SNAPSHOT_OP_RENAME:
		error = snapshot_rename(uap->dirfd, uap->name1, uap->name2,
		    uap->flags, ctx);
		break;
	case SNAPSHOT_OP_MOUNT:
		error = snapshot_mount(uap->dirfd, uap->name1, uap->name2,
		    uap->data, uap->flags, ctx);
		break;
	case SNAPSHOT_OP_REVERT:
		error = snapshot_revert(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
#if CONFIG_MNT_ROOTSNAP
	case SNAPSHOT_OP_ROOT:
		error = snapshot_root(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
#endif /* CONFIG_MNT_ROOTSNAP */
	default:
		error = ENOSYS;
	}

	return error;
}
14479