1 /*
2 * Copyright (c) 1995-2022 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * Copyright (c) 1989, 1993
30 * The Regents of the University of California. All rights reserved.
31 * (c) UNIX System Laboratories, Inc.
32 * All or some portions of this file are derived from material licensed
33 * to the University of California by American Telephone and Telegraph
34 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
35 * the permission of UNIX System Laboratories, Inc.
36 *
37 * Redistribution and use in source and binary forms, with or without
38 * modification, are permitted provided that the following conditions
39 * are met:
40 * 1. Redistributions of source code must retain the above copyright
41 * notice, this list of conditions and the following disclaimer.
42 * 2. Redistributions in binary form must reproduce the above copyright
43 * notice, this list of conditions and the following disclaimer in the
44 * documentation and/or other materials provided with the distribution.
45 * 3. All advertising materials mentioning features or use of this software
46 * must display the following acknowledgement:
47 * This product includes software developed by the University of
48 * California, Berkeley and its contributors.
49 * 4. Neither the name of the University nor the names of its contributors
50 * may be used to endorse or promote products derived from this software
51 * without specific prior written permission.
52 *
53 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63 * SUCH DAMAGE.
64 *
65 * @(#)vfs_syscalls.c 8.41 (Berkeley) 6/15/95
66 */
67 /*
68 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
69 * support for mandatory and extensible security protections. This notice
70 * is included in support of clause 2.2 (b) of the Apple Public License,
71 * Version 2.0.
72 */
73
74 #include <sys/param.h>
75 #include <sys/systm.h>
76 #include <sys/namei.h>
77 #include <sys/filedesc.h>
78 #include <sys/kernel.h>
79 #include <sys/file_internal.h>
80 #include <sys/stat.h>
81 #include <sys/vnode_internal.h>
82 #include <sys/mount_internal.h>
83 #include <sys/proc_internal.h>
84 #include <sys/kauth.h>
85 #include <sys/uio_internal.h>
86 #include <kern/kalloc.h>
87 #include <sys/mman.h>
88 #include <sys/dirent.h>
89 #include <sys/attr.h>
90 #include <sys/sysctl.h>
91 #include <sys/ubc.h>
92 #include <sys/quota.h>
93 #include <sys/kdebug.h>
94 #include <sys/fsevents.h>
95 #include <sys/imgsrc.h>
96 #include <sys/sysproto.h>
97 #include <sys/sysctl.h>
98 #include <sys/xattr.h>
99 #include <sys/fcntl.h>
100 #include <sys/stdio.h>
101 #include <sys/fsctl.h>
102 #include <sys/ubc_internal.h>
103 #include <sys/disk.h>
104 #include <sys/content_protection.h>
105 #include <sys/clonefile.h>
106 #include <sys/snapshot.h>
107 #include <sys/priv.h>
108 #include <sys/fsgetpath.h>
109 #include <machine/cons.h>
110 #include <machine/limits.h>
111 #include <miscfs/specfs/specdev.h>
112
113 #include <vfs/vfs_disk_conditioner.h>
114 #if CONFIG_EXCLAVES
115 #include <vfs/vfs_exclave_fs.h>
116 #endif
117
118 #include <security/audit/audit.h>
119 #include <bsm/audit_kevents.h>
120
121 #include <mach/mach_types.h>
122 #include <kern/kern_types.h>
123 #include <kern/kalloc.h>
124 #include <kern/task.h>
125
126 #include <vm/vm_pageout.h>
127 #include <vm/vm_protos.h>
128
129 #include <libkern/OSAtomic.h>
130 #include <os/atomic_private.h>
131 #include <pexpert/pexpert.h>
132 #include <IOKit/IOBSD.h>
133
134 // deps for MIG call
135 #include <kern/host.h>
136 #include <kern/ipc_misc.h>
137 #include <mach/host_priv.h>
138 #include <mach/vfs_nspace.h>
139 #include <os/log.h>
140
141 #include <nfs/nfs_conf.h>
142
143 #if ROUTEFS
144 #include <miscfs/routefs/routefs.h>
145 #endif /* ROUTEFS */
146
147 #if CONFIG_MACF
148 #include <security/mac.h>
149 #include <security/mac_framework.h>
150 #endif
151
152 #if CONFIG_FSE
153 #define GET_PATH(x) \
154 ((x) = get_pathbuff())
155 #define RELEASE_PATH(x) \
156 release_pathbuff(x)
157 #else
158 #define GET_PATH(x) \
159 ((x) = zalloc(ZV_NAMEI))
160 #define RELEASE_PATH(x) \
161 zfree(ZV_NAMEI, x)
162 #endif /* CONFIG_FSE */
163
164 #ifndef HFS_GET_BOOT_INFO
165 #define HFS_GET_BOOT_INFO (FCNTL_FS_SPECIFIC_BASE + 0x00004)
166 #endif
167
168 #ifndef HFS_SET_BOOT_INFO
169 #define HFS_SET_BOOT_INFO (FCNTL_FS_SPECIFIC_BASE + 0x00005)
170 #endif
171
172 #ifndef APFSIOC_REVERT_TO_SNAPSHOT
173 #define APFSIOC_REVERT_TO_SNAPSHOT _IOW('J', 1, u_int64_t)
174 #endif
175
176 extern void disk_conditioner_unmount(mount_t mp);
177
178 /* struct for checkdirs iteration */
179 struct cdirargs {
180 vnode_t olddp;
181 vnode_t newdp;
182 };
183 /* callback for checkdirs iteration */
184 static int checkdirs_callback(proc_t p, void * arg);
185
186 static int change_dir(struct nameidata *ndp, vfs_context_t ctx);
187 static int checkdirs(vnode_t olddp, vfs_context_t ctx);
188 void enablequotas(struct mount *mp, vfs_context_t ctx);
189 static int getfsstat_callback(mount_t mp, void * arg);
190 static int getutimes(user_addr_t usrtvp, struct timespec *tsp);
191 static int setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts, int nullflag);
192 static int sync_callback(mount_t, void *);
193 static int munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
194 user_addr_t bufp, int *sizep, boolean_t is_64_bit,
195 boolean_t partial_copy);
196 static int fsync_common(proc_t p, struct fsync_args *uap, int flags);
197 static int mount_common(const char *fstypename, vnode_t pvp, vnode_t vp,
198 struct componentname *cnp, user_addr_t fsmountargs,
199 int flags, uint32_t internal_flags, char *labelstr, vfs_context_t ctx);
200 void vfs_notify_mount(vnode_t pdvp);
201
202 int prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, uint32_t internal_flags);
203
204 struct fd_vn_data * fg_vn_data_alloc(void);
205
206 /*
207 * Max retries for ENOENT returns from vn_authorize_{rmdir, unlink, rename}
208 * Concurrent lookups (or lookups by ids) on hard links can cause the
209 * vn_getpath (which does not re-enter the filesystem as vn_getpath_fsenter
210 * does) to return ENOENT as the path cannot be returned from the name cache
 * alone. We have no option but to retry and hope to complete one namei->reverse-path
 * generation without an intervening lookup (or a lookup by id) on the hard-linked
 * item. This is only an issue for MAC hooks that cannot reenter the filesystem,
 * which currently are the MAC hooks for rename, unlink and rmdir.
215 */
216 #define MAX_AUTHORIZE_ENOENT_RETRIES 1024
217
218 /* Max retry limit for rename due to vnode recycling. */
219 #define MAX_RENAME_ERECYCLE_RETRIES 1024
220
221 static int rmdirat_internal(vfs_context_t, int, user_addr_t, enum uio_seg,
222 int unlink_flags);
223
224 #ifdef CONFIG_IMGSRC_ACCESS
225 static int authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx);
226 static int place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx);
227 static void undo_place_on_covered_vp(mount_t mp, vnode_t vp);
228 static int mount_begin_update(mount_t mp, vfs_context_t ctx, int flags);
229 static void mount_end_update(mount_t mp);
230 static int relocate_imageboot_source(vnode_t pvp, vnode_t vp, struct componentname *cnp, const char *fsname, vfs_context_t ctx, boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index);
231 #endif /* CONFIG_IMGSRC_ACCESS */
232
// Snapshot function prototypes
234 #if CONFIG_MNT_ROOTSNAP
235 static int __attribute__ ((noinline)) snapshot_root(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx);
236 #else
237 static int __attribute__ ((noinline)) snapshot_root(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx) __attribute__((unused));
238 #endif
239
240 __private_extern__
241 int sync_internal(void);
242
243 __private_extern__
244 int unlink1(vfs_context_t, vnode_t, user_addr_t, enum uio_seg, int);
245
246 static LCK_GRP_DECLARE(fd_vn_lck_grp, "fd_vnode_data");
247 static LCK_ATTR_DECLARE(fd_vn_lck_attr, 0, 0);
248
249 /* vars for sync mutex */
250 static LCK_GRP_DECLARE(sync_mtx_lck_grp, "sync thread");
251 static LCK_MTX_DECLARE(sync_mtx_lck, &sync_mtx_lck_grp);
252
253 extern lck_rw_t rootvnode_rw_lock;
254
255 VFS_SMR_DECLARE;
256 extern uint32_t nc_smr_enabled;
257
258 /*
259 * incremented each time a mount or unmount operation occurs
260 * used to invalidate the cached value of the rootvp in the
261 * mount structure utilized by cache_lookup_path
262 */
263 uint32_t mount_generation = 0;
264
265 /* counts number of mount and unmount operations */
266 unsigned int vfs_nummntops = 0;
267
268 /* system-wide, per-boot unique mount ID */
269 static _Atomic uint64_t mount_unique_id = 1;
270
271 extern const struct fileops vnops;
272 #if CONFIG_APPLEDOUBLE
273 extern errno_t rmdir_remove_orphaned_appleDouble(vnode_t, vfs_context_t, int *);
274 #endif /* CONFIG_APPLEDOUBLE */
275
276 /* Maximum buffer length supported by fsgetpath(2) */
277 #define FSGETPATH_MAXBUFLEN 8192
278
279 /*
280 * Virtual File System System Calls
281 */
282
283 /*
284 * Private in-kernel mounting spi (specific use-cases only)
285 */
286 boolean_t
vfs_iskernelmount(mount_t mp)287 vfs_iskernelmount(mount_t mp)
288 {
289 return (mp->mnt_kern_flag & MNTK_KERNEL_MOUNT) ? TRUE : FALSE;
290 }
291
/*
 * kernel_mount:
 *	Mount a filesystem from within the kernel on behalf of a specific
 *	use case (see vfs_mount_at_path() and other in-kernel callers).
 *
 *	Either the covered vnode is supplied by the caller (vp != NULLVP,
 *	in which case pvp must be its parent and both must carry iocounts
 *	held by the caller), or it is looked up here from 'path'.
 *
 *	Returns 0 on success, an errno value otherwise. On return, any
 *	iocounts taken by the lookup performed here have been dropped;
 *	caller-supplied iocounts are untouched.
 */
__private_extern__
int
kernel_mount(const char *fstype, vnode_t pvp, vnode_t vp, const char *path,
    void *data, __unused size_t datalen, int syscall_flags, uint32_t kern_flags,
    vfs_context_t ctx)
{
	struct nameidata nd;
	boolean_t did_namei;	/* TRUE iff we took iocounts via namei() and must release them */
	int error;

	/* Path comes from kernel space, hence UIO_SYSSPACE. */
	NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(path), ctx);

	/* Strip any internal flag bits the caller is not allowed to pass in. */
	kern_flags &= KERNEL_MOUNT_SANITIZE_MASK;

	/*
	 * Get the vnode to be covered if it's not supplied
	 */
	if (vp == NULLVP) {
		error = namei(&nd);
		if (error) {
			/* Log lookup failures only for snapshot / by-role mounts. */
			if (kern_flags & (KERNEL_MOUNT_SNAPSHOT | KERNEL_MOUNT_VOLBYROLE_MASK)) {
				printf("failed to locate mount-on path: %s ", path);
			}
			return error;
		}
		vp = nd.ni_vp;
		pvp = nd.ni_dvp;
		did_namei = TRUE;
	} else {
		/*
		 * Caller supplied the vnodes; just point the componentname
		 * at the caller's path buffer for mount_common()'s benefit.
		 */
		char *pnbuf = CAST_DOWN(char *, path);

		nd.ni_cnd.cn_pnbuf = pnbuf;
		nd.ni_cnd.cn_pnlen = (int)(strlen(pnbuf) + 1);
		did_namei = FALSE;
	}

	/* Mark this as a kernel-initiated mount for mount_common(). */
	kern_flags |= KERNEL_MOUNT_KMOUNT;
	error = mount_common(fstype, pvp, vp, &nd.ni_cnd, CAST_USER_ADDR_T(data),
	    syscall_flags, kern_flags, NULL, ctx);

	/* Drop only the references we took ourselves. */
	if (did_namei) {
		vnode_put(vp);
		vnode_put(pvp);
		nameidone(&nd);
	}

	return error;
}
341
342 int
vfs_mount_at_path(const char * fstype,const char * path,vnode_t pvp,vnode_t vp,void * data,size_t datalen,int mnt_flags,int flags)343 vfs_mount_at_path(const char *fstype, const char *path,
344 vnode_t pvp, vnode_t vp, void *data, size_t datalen,
345 int mnt_flags, int flags)
346 {
347 int syscall_flags = MNT_AUTOMOUNTED | mnt_flags;
348 int error, km_flags = 0;
349 vfs_context_t ctx = (flags & VFS_MOUNT_FLAG_CURRENT_CONTEXT) ? vfs_context_current() : vfs_context_kernel();
350
351 /*
352 * This call is currently restricted to specific use cases.
353 */
354 if ((strcmp(fstype, "lifs") != 0) && (strcmp(fstype, "nfs") != 0)) {
355 return ENOTSUP;
356 }
357
358 #if !defined(XNU_TARGET_OS_OSX)
359 if (strcmp(fstype, "lifs") == 0) {
360 syscall_flags |= MNT_NOEXEC;
361 }
362 #endif
363
364 if (flags & VFS_MOUNT_FLAG_NOAUTH) {
365 km_flags |= KERNEL_MOUNT_NOAUTH;
366 }
367 if (flags & VFS_MOUNT_FLAG_PERMIT_UNMOUNT) {
368 km_flags |= KERNEL_MOUNT_PERMIT_UNMOUNT;
369 }
370
371 error = kernel_mount(fstype, pvp, vp, path, data, datalen,
372 syscall_flags, km_flags, ctx);
373 if (error) {
374 printf("%s: mount on %s failed, error %d\n", __func__, path,
375 error);
376 }
377
378 return error;
379 }
380
381 /*
382 * Mount a file system.
383 */
384 /* ARGSUSED */
385 int
mount(proc_t p,struct mount_args * uap,__unused int32_t * retval)386 mount(proc_t p, struct mount_args *uap, __unused int32_t *retval)
387 {
388 struct __mac_mount_args muap;
389
390 muap.type = uap->type;
391 muap.path = uap->path;
392 muap.flags = uap->flags;
393 muap.data = uap->data;
394 muap.mac_p = USER_ADDR_NULL;
395 return __mac_mount(p, &muap, retval);
396 }
397
398 int
fmount(__unused proc_t p,struct fmount_args * uap,__unused int32_t * retval)399 fmount(__unused proc_t p, struct fmount_args *uap, __unused int32_t *retval)
400 {
401 struct componentname cn;
402 vfs_context_t ctx = vfs_context_current();
403 size_t dummy = 0;
404 int error;
405 int flags = uap->flags;
406 char fstypename[MFSNAMELEN];
407 char *labelstr = NULL; /* regular mount call always sets it to NULL for __mac_mount() */
408 vnode_t pvp;
409 vnode_t vp;
410
411 AUDIT_ARG(fd, uap->fd);
412 AUDIT_ARG(fflags, flags);
413 /* fstypename will get audited by mount_common */
414
415 /* Sanity check the flags */
416 if (flags & (MNT_IMGSRC_BY_INDEX | MNT_ROOTFS)) {
417 return ENOTSUP;
418 }
419
420 if (flags & MNT_UNION) {
421 return EPERM;
422 }
423
424 error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy);
425 if (error) {
426 return error;
427 }
428
429 if ((error = file_vnode(uap->fd, &vp)) != 0) {
430 return error;
431 }
432
433 if ((error = vnode_getwithref(vp)) != 0) {
434 file_drop(uap->fd);
435 return error;
436 }
437
438 pvp = vnode_getparent(vp);
439 if (pvp == NULL) {
440 if (vp->v_mountedhere || (vp->v_flag & VROOT) != 0) {
441 error = EBUSY;
442 } else {
443 error = EINVAL;
444 }
445 vnode_put(vp);
446 file_drop(uap->fd);
447 return error;
448 }
449
450 memset(&cn, 0, sizeof(struct componentname));
451 cn.cn_pnbuf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
452 cn.cn_pnlen = MAXPATHLEN;
453
454 if ((error = vn_getpath(vp, cn.cn_pnbuf, &cn.cn_pnlen)) != 0) {
455 zfree(ZV_NAMEI, cn.cn_pnbuf);
456 vnode_put(pvp);
457 vnode_put(vp);
458 file_drop(uap->fd);
459 return error;
460 }
461
462 error = mount_common(fstypename, pvp, vp, &cn, uap->data, flags, KERNEL_MOUNT_FMOUNT, labelstr, ctx);
463
464 zfree(ZV_NAMEI, cn.cn_pnbuf);
465 vnode_put(pvp);
466 vnode_put(vp);
467 file_drop(uap->fd);
468
469 return error;
470 }
471
472 #define MAX_GRAFT_METADATA_SIZE 16384 /* bytes */
473
474 /*
475 * Get the size of a graft file (a manifest or payload file).
476 * The vp should be an iocounted vnode.
477 */
478 static int
get_and_verify_graft_metadata_vp_size(vnode_t graft_vp,vfs_context_t vctx,size_t * size)479 get_and_verify_graft_metadata_vp_size(vnode_t graft_vp, vfs_context_t vctx, size_t *size)
480 {
481 struct stat64 sb = {};
482 int error;
483
484 *size = 0;
485
486 error = vn_stat(graft_vp, &sb, NULL, 1, 0, vctx);
487 if (error) {
488 return error;
489 }
490
491 if (sb.st_size == 0) {
492 error = ENODATA;
493 } else if ((size_t) sb.st_size > MAX_GRAFT_METADATA_SIZE) {
494 error = EFBIG;
495 } else {
496 *size = (size_t) sb.st_size;
497 }
498
499 return error;
500 }
501
502 /*
503 * Read in a graft file (a manifest or payload file) of size `size` into `buf`.
504 * `size` must already be validated.
505 */
506 static int
read_graft_metadata_vp(vnode_t graft_vp,vfs_context_t vctx,size_t size,void * buf)507 read_graft_metadata_vp(vnode_t graft_vp, vfs_context_t vctx, size_t size, void *buf)
508 {
509 return vn_rdwr(UIO_READ, graft_vp,
510 (caddr_t) buf, (int) size, /* offset */ 0,
511 UIO_SYSSPACE, IO_NOCACHE | IO_RAOFF | IO_UNIT,
512 vfs_context_ucred(vctx), /* resid */ NULL,
513 vfs_context_proc(vctx));
514 }
515
516 /*
517 * Convert a single graft file descriptor into a vnode, get its size (saving it to `size`),
518 * and read it into `buf`.
519 */
520 static int
graft_secureboot_read_fd(int fd,vfs_context_t vctx,size_t * size,void * buf)521 graft_secureboot_read_fd(int fd, vfs_context_t vctx, size_t *size, void *buf)
522 {
523 vnode_t metadata_vp = NULLVP;
524 int error;
525
526 // Convert this graft fd to a vnode.
527 if ((error = vnode_getfromfd(vctx, fd, &metadata_vp)) != 0) {
528 goto out;
529 }
530
531 // Get (and validate) size information.
532 if ((error = get_and_verify_graft_metadata_vp_size(metadata_vp, vctx, size)) != 0) {
533 goto out;
534 }
535
536 // Read each file into the provided buffer - we must get the expected amount of bytes.
537 if ((error = read_graft_metadata_vp(metadata_vp, vctx, *size, buf)) != 0) {
538 goto out;
539 }
540
541 out:
542 if (metadata_vp) {
543 vnode_put(metadata_vp);
544 metadata_vp = NULLVP;
545 }
546
547 return error;
548 }
549
550 /*
551 * Read graft file descriptors into buffers of size MAX_GRAFT_METADATA_SIZE
552 * provided in `gfs`, saving the size of data read in `gfs`.
553 */
554 static int
graft_secureboot_read_metadata(secure_boot_cryptex_args_t * sbc_args,vfs_context_t vctx,fsioc_graft_fs_t * gfs)555 graft_secureboot_read_metadata(secure_boot_cryptex_args_t *sbc_args, vfs_context_t vctx,
556 fsioc_graft_fs_t *gfs)
557 {
558 int error;
559
560 // Read the authentic manifest.
561 if ((error = graft_secureboot_read_fd(sbc_args->sbc_authentic_manifest_fd, vctx,
562 &gfs->authentic_manifest_size, gfs->authentic_manifest))) {
563 return error;
564 }
565
566 // The user manifest is currently unused, but set its size.
567 gfs->user_manifest_size = 0;
568
569 // Read the payload.
570 if ((error = graft_secureboot_read_fd(sbc_args->sbc_payload_fd, vctx,
571 &gfs->payload_size, gfs->payload))) {
572 return error;
573 }
574
575 return 0;
576 }
577
578 /*
579 * Call into the filesystem to verify and graft a cryptex.
580 */
581 static int
graft_secureboot_cryptex(uint32_t graft_type,secure_boot_cryptex_args_t * sbc_args,vfs_context_t vctx,vnode_t cryptex_vp,vnode_t mounton_vp)582 graft_secureboot_cryptex(uint32_t graft_type, secure_boot_cryptex_args_t *sbc_args,
583 vfs_context_t vctx, vnode_t cryptex_vp, vnode_t mounton_vp)
584 {
585 fsioc_graft_fs_t gfs = {};
586 uint64_t graft_dir_ino = 0;
587 struct stat64 sb = {};
588 int error;
589
590 // Pre-flight arguments.
591 if (sbc_args->sbc_version != GRAFTDMG_SECURE_BOOT_CRYPTEX_ARGS_VERSION) {
592 // Make sure that this graft version matches what we support.
593 return ENOTSUP;
594 } else if (mounton_vp && cryptex_vp->v_mount != mounton_vp->v_mount) {
595 // For this type, cryptex VP must live on same volume as the target of graft.
596 return EXDEV;
597 } else if (mounton_vp && mounton_vp->v_type != VDIR) {
598 // We cannot graft upon non-directories.
599 return ENOTDIR;
600 } else if (sbc_args->sbc_authentic_manifest_fd < 0 ||
601 sbc_args->sbc_payload_fd < 0) {
602 // We cannot graft without a manifest and payload.
603 return EINVAL;
604 }
605
606 if (mounton_vp) {
607 // Get the mounton's inode number.
608 error = vn_stat(mounton_vp, &sb, NULL, 1, 0, vctx);
609 if (error) {
610 return error;
611 }
612 graft_dir_ino = (uint64_t) sb.st_ino;
613 }
614
615 // Create buffers (of our maximum-defined size) to store authentication info.
616 gfs.authentic_manifest = kalloc_data(MAX_GRAFT_METADATA_SIZE, Z_WAITOK | Z_ZERO);
617 gfs.payload = kalloc_data(MAX_GRAFT_METADATA_SIZE, Z_WAITOK | Z_ZERO);
618
619 if (!gfs.authentic_manifest || !gfs.payload) {
620 error = ENOMEM;
621 goto out;
622 }
623
624 // Read our fd's into our buffers.
625 // (Note that this will set the buffer size fields in `gfs`.)
626 error = graft_secureboot_read_metadata(sbc_args, vctx, &gfs);
627 if (error) {
628 goto out;
629 }
630
631 gfs.graft_version = FSIOC_GRAFT_VERSION;
632 gfs.graft_type = graft_type;
633 gfs.graft_4cc = sbc_args->sbc_4cc;
634 if (sbc_args->sbc_flags & SBC_PRESERVE_MOUNT) {
635 gfs.graft_flags |= FSCTL_GRAFT_PRESERVE_MOUNT;
636 }
637 if (sbc_args->sbc_flags & SBC_ALTERNATE_SHARED_REGION) {
638 gfs.graft_flags |= FSCTL_GRAFT_ALTERNATE_SHARED_REGION;
639 }
640 if (sbc_args->sbc_flags & SBC_SYSTEM_CONTENT) {
641 gfs.graft_flags |= FSCTL_GRAFT_SYSTEM_CONTENT;
642 }
643 if (sbc_args->sbc_flags & SBC_PANIC_ON_AUTHFAIL) {
644 gfs.graft_flags |= FSCTL_GRAFT_PANIC_ON_AUTHFAIL;
645 }
646 if (sbc_args->sbc_flags & SBC_STRICT_AUTH) {
647 gfs.graft_flags |= FSCTL_GRAFT_STRICT_AUTH;
648 }
649 if (sbc_args->sbc_flags & SBC_PRESERVE_GRAFT) {
650 gfs.graft_flags |= FSCTL_GRAFT_PRESERVE_GRAFT;
651 }
652 gfs.dir_ino = graft_dir_ino; // ino from mounton_vp (if not provided, the parent directory)
653
654 // Call into the FS to perform the graft (and validation).
655 error = VNOP_IOCTL(cryptex_vp, FSIOC_GRAFT_FS, (caddr_t)&gfs, 0, vctx);
656
657 out:
658 if (gfs.authentic_manifest) {
659 kfree_data(gfs.authentic_manifest, MAX_GRAFT_METADATA_SIZE);
660 gfs.authentic_manifest = NULL;
661 }
662 if (gfs.payload) {
663 kfree_data(gfs.payload, MAX_GRAFT_METADATA_SIZE);
664 gfs.payload = NULL;
665 }
666
667 return error;
668 }
669
670 #define GRAFTDMG_ENTITLEMENT "com.apple.private.vfs.graftdmg"
671
672 /*
673 * Graft a cryptex disk image (via FD) onto the appropriate mount-point
674 * { int graftdmg(int dmg_fd, const char *mountdir, uint32_t graft_type, graftdmg_args_un *gda); }
675 */
676 int
graftdmg(__unused proc_t p,struct graftdmg_args * uap,__unused int32_t * retval)677 graftdmg(__unused proc_t p, struct graftdmg_args *uap, __unused int32_t *retval)
678 {
679 int ua_dmgfd = uap->dmg_fd;
680 user_addr_t ua_mountdir = uap->mountdir;
681 uint32_t ua_grafttype = uap->graft_type;
682 user_addr_t ua_graftargs = uap->gda;
683
684 graftdmg_args_un kern_gda = {};
685 int error = 0;
686 secure_boot_cryptex_args_t *sbc_args = NULL;
687
688 vnode_t cryptex_vp = NULLVP;
689 vnode_t mounton_vp = NULLVP;
690 struct nameidata nd = {};
691 vfs_context_t ctx = vfs_context_current();
692
693 if (!IOTaskHasEntitlement(vfs_context_task(ctx), GRAFTDMG_ENTITLEMENT)) {
694 return EPERM;
695 }
696
697 error = copyin(ua_graftargs, &kern_gda, sizeof(graftdmg_args_un));
698 if (error) {
699 return error;
700 }
701
702 // Copy mount dir in, if provided.
703 if (ua_mountdir != USER_ADDR_NULL) {
704 // Acquire vnode for mount-on path
705 NDINIT(&nd, LOOKUP, OP_MOUNT, (FOLLOW | AUDITVNPATH1),
706 UIO_USERSPACE, ua_mountdir, ctx);
707
708 error = namei(&nd);
709 if (error) {
710 return error;
711 }
712 mounton_vp = nd.ni_vp;
713 }
714
715 // Convert fd to vnode.
716 error = vnode_getfromfd(ctx, ua_dmgfd, &cryptex_vp);
717 if (error) {
718 goto graftout;
719 }
720
721 if (ua_grafttype == 0 || ua_grafttype > GRAFTDMG_CRYPTEX_MAX) {
722 error = EINVAL;
723 } else {
724 sbc_args = &kern_gda.sbc_args;
725 error = graft_secureboot_cryptex(ua_grafttype, sbc_args, ctx, cryptex_vp, mounton_vp);
726 }
727
728 graftout:
729 if (cryptex_vp) {
730 vnode_put(cryptex_vp);
731 cryptex_vp = NULLVP;
732 }
733 if (mounton_vp) {
734 vnode_put(mounton_vp);
735 mounton_vp = NULLVP;
736 }
737 if (ua_mountdir != USER_ADDR_NULL) {
738 nameidone(&nd);
739 }
740
741 return error;
742 }
743
744 /*
745 * Ungraft a cryptex disk image (via mount dir FD)
746 * { int ungraftdmg(const char *mountdir, uint64_t flags); }
747 */
748 int
ungraftdmg(__unused proc_t p,struct ungraftdmg_args * uap,__unused int32_t * retval)749 ungraftdmg(__unused proc_t p, struct ungraftdmg_args *uap, __unused int32_t *retval)
750 {
751 int error = 0;
752 user_addr_t ua_mountdir = uap->mountdir;
753 fsioc_ungraft_fs_t ugfs;
754 vnode_t mounton_vp = NULLVP;
755 struct nameidata nd = {};
756 vfs_context_t ctx = vfs_context_current();
757
758 if (!IOTaskHasEntitlement(vfs_context_task(ctx), GRAFTDMG_ENTITLEMENT)) {
759 return EPERM;
760 }
761
762 if (uap->flags != 0 || ua_mountdir == USER_ADDR_NULL) {
763 return EINVAL;
764 }
765
766 ugfs.ungraft_flags = 0;
767
768 // Acquire vnode for mount-on path
769 NDINIT(&nd, LOOKUP, OP_MOUNT, (FOLLOW | AUDITVNPATH1),
770 UIO_USERSPACE, ua_mountdir, ctx);
771
772 error = namei(&nd);
773 if (error) {
774 return error;
775 }
776 mounton_vp = nd.ni_vp;
777
778 // Call into the FS to perform the ungraft
779 error = VNOP_IOCTL(mounton_vp, FSIOC_UNGRAFT_FS, (caddr_t)&ugfs, 0, ctx);
780
781 vnode_put(mounton_vp);
782 nameidone(&nd);
783
784 return error;
785 }
786
787
/*
 * Notify interested parties that a mount has occurred: broadcast a
 * VQ_MOUNT vfs event, then post a NOTE_WRITE knote on the parent
 * directory of the covered vnode.
 */
void
vfs_notify_mount(vnode_t pdvp)
{
	vfs_event_signal(NULL, VQ_MOUNT, (intptr_t)NULL);
	lock_vnode_and_post(pdvp, NOTE_WRITE);
}
794
795 /*
796 * __mac_mount:
797 * Mount a file system taking into account MAC label behavior.
798 * See mount(2) man page for more information
799 *
800 * Parameters: p Process requesting the mount
801 * uap User argument descriptor (see below)
802 * retval (ignored)
803 *
804 * Indirect: uap->type Filesystem type
805 * uap->path Path to mount
806 * uap->data Mount arguments
807 * uap->mac_p MAC info
808 * uap->flags Mount flags
809 *
810 *
811 * Returns: 0 Success
812 * !0 Not success
813 */
/*
 * Set to TRUE (under CHECK_CS_VALIDATION_BITMAP) when someone attempts to
 * mount the root filesystem read/write; used to turn off the codesign
 * validation bitmap optimization (see 7392553).
 */
boolean_t root_fs_upgrade_try = FALSE;

int
__mac_mount(struct proc *p, register struct __mac_mount_args *uap, __unused int32_t *retval)
{
	vnode_t pvp = NULL;		/* parent of the covered vnode (from WANTPARENT) */
	vnode_t vp = NULL;		/* vnode to be covered by the mount */
	int need_nameidone = 0;		/* nonzero once namei() succeeds; gates nameidone() */
	vfs_context_t ctx = vfs_context_current();
	char fstypename[MFSNAMELEN];
	struct nameidata nd;
	size_t dummy = 0;
	char *labelstr = NULL;		/* MAC label copied in from user space, if any */
	size_t labelsz = 0;
	int flags = uap->flags;
	int error;
#if CONFIG_IMGSRC_ACCESS || CONFIG_MACF
	boolean_t is_64bit = IS_64BIT_PROCESS(p);
#else
#pragma unused(p)
#endif
	/*
	 * Get the fs type name from user space
	 */
	error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy);
	if (error) {
		return error;
	}

	/*
	 * Get the vnode to be covered
	 */
	NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
	    UIO_USERSPACE, uap->path, ctx);
	/* MNT_NOFOLLOW forbids symlinks anywhere along the path. */
	if (flags & MNT_NOFOLLOW) {
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}
	error = namei(&nd);
	if (error) {
		goto out;
	}
	need_nameidone = 1;
	vp = nd.ni_vp;
	pvp = nd.ni_dvp;

#ifdef CONFIG_IMGSRC_ACCESS
	/* Mounting image source cannot be batched with other operations */
	if (flags == MNT_IMGSRC_BY_INDEX) {
		error = relocate_imageboot_source(pvp, vp, &nd.ni_cnd, fstypename,
		    ctx, is_64bit, uap->data, (flags == MNT_IMGSRC_BY_INDEX));
		goto out;
	}
#endif /* CONFIG_IMGSRC_ACCESS */

#if CONFIG_MACF
	/*
	 * Get the label string (if any) from user space
	 */
	if (uap->mac_p != USER_ADDR_NULL) {
		struct user_mac mac;
		size_t ulen = 0;

		/* Copy in the 32- or 64-bit user_mac struct as appropriate. */
		if (is_64bit) {
			struct user64_mac mac64;
			error = copyin(uap->mac_p, &mac64, sizeof(mac64));
			mac.m_buflen = (user_size_t)mac64.m_buflen;
			mac.m_string = (user_addr_t)mac64.m_string;
		} else {
			struct user32_mac mac32;
			error = copyin(uap->mac_p, &mac32, sizeof(mac32));
			mac.m_buflen = mac32.m_buflen;
			mac.m_string = mac32.m_string;
		}
		if (error) {
			goto out;
		}
		/* Bound the label buffer: at least one char plus NUL, at most the max. */
		if ((mac.m_buflen > MAC_MAX_LABEL_BUF_LEN) ||
		    (mac.m_buflen < 2)) {
			error = EINVAL;
			goto out;
		}
		labelsz = mac.m_buflen;
		labelstr = kalloc_data(labelsz, Z_WAITOK);
		error = copyinstr(mac.m_string, labelstr, mac.m_buflen, &ulen);
		if (error) {
			goto out;
		}
		AUDIT_ARG(mac_string, labelstr);
	}
#endif /* CONFIG_MACF */

	AUDIT_ARG(fflags, flags);

#if !CONFIG_UNION_MOUNTS
	/* Union mounts are compiled out; reject them outright. */
	if (flags & MNT_UNION) {
		error = EPERM;
		goto out;
	}
#endif

	/* Special handling when the target is the root of the root filesystem. */
	if ((vp->v_flag & VROOT) &&
	    (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
#if CONFIG_UNION_MOUNTS
		if (!(flags & MNT_UNION)) {
			/* A non-union mount on '/' is treated as an update. */
			flags |= MNT_UPDATE;
		} else {
			/*
			 * For a union mount on '/', treat it as fresh
			 * mount instead of update.
			 * Otherwise, union mounting on '/' used to panic the
			 * system before, since mnt_vnodecovered was found to
			 * be NULL for '/' which is required for unionlookup
			 * after it gets ENOENT on union mount.
			 */
			flags = (flags & ~(MNT_UPDATE));
		}
#else
		flags |= MNT_UPDATE;
#endif /* CONFIG_UNION_MOUNTS */

#if SECURE_KERNEL
		if ((flags & MNT_RDONLY) == 0) {
			/* Release kernels are not allowed to mount "/" as rw */
			error = EPERM;
			goto out;
		}
#endif

		/*
		 * See 7392553 for more details on why this check exists.
		 * Suffice to say: If this check is ON and something tries
		 * to mount the rootFS RW, we'll turn off the codesign
		 * bitmap optimization.
		 */
#if CHECK_CS_VALIDATION_BITMAP
		if ((flags & MNT_RDONLY) == 0) {
			root_fs_upgrade_try = TRUE;
		}
#endif
	}

	error = mount_common(fstypename, pvp, vp, &nd.ni_cnd, uap->data, flags, 0,
	    labelstr, ctx);

out:

#if CONFIG_MACF
	/* kfree_data() tolerates a NULL labelstr (labelsz is 0 in that case). */
	kfree_data(labelstr, labelsz);
#endif /* CONFIG_MACF */

	/* Drop the iocounts/lookup state taken by namei(), if any. */
	if (vp) {
		vnode_put(vp);
	}
	if (pvp) {
		vnode_put(pvp);
	}
	if (need_nameidone) {
		nameidone(&nd);
	}

	return error;
}
976
977 /*
978 * common mount implementation (final stage of mounting)
979 *
980 * Arguments:
981 * fstypename file system type (ie it's vfs name)
982 * pvp parent of covered vnode
983 * vp covered vnode
984 * cnp component name (ie path) of covered vnode
985 * flags generic mount flags
986 * fsmountargs file system specific data
987 * labelstr optional MAC label
988 * kernelmount TRUE for mounts initiated from inside the kernel
989 * ctx caller's context
990 */
/*
 * mount_common: the final stage of mounting (argument descriptions are in
 * the block comment above).
 *
 * Serializes against concurrent mount/unmount activity via MNT_LMOUNT and
 * the mount rwlock, resolves and opens the backing device for local file
 * systems, invokes the file system's VFS_MOUNT entry point, and on success
 * publishes the new mount on the covered vnode and the global mount list.
 */
static int
mount_common(const char *fstypename, vnode_t pvp, vnode_t vp,
    struct componentname *cnp, user_addr_t fsmountargs, int flags, uint32_t internal_flags,
    char *labelstr, vfs_context_t ctx)
{
#if !CONFIG_MACF
#pragma unused(labelstr)
#endif
	struct vnode *devvp = NULLVP;           /* device vnode looked up from devpath (namei iocount held) */
	struct vnode *device_vnode = NULLVP;    /* device vnode handed to VFS_MOUNT / IO-attribute setup */
#if CONFIG_MACF
	struct vnode *rvp;
#endif
	struct mount *mp = NULL;
	struct vfstable *vfsp = (struct vfstable *)0;
	struct proc *p = vfs_context_proc(ctx);
	int error, flag = 0;
	bool flag_set = false;                  /* 'flag' holds the pre-update mnt_flag, restored on error */
	user_addr_t devpath = USER_ADDR_NULL;
	int ronly = 0;
	int mntalloc = 0;                       /* nonzero once mp was allocated here (we must free it on error) */
	boolean_t vfsp_ref = FALSE;             /* we bumped vfsp->vfc_refcount */
	boolean_t is_rwlock_locked = FALSE;     /* mnt_rwlock is held exclusively */
	boolean_t did_rele = FALSE;             /* device usecount already dropped on the out4 path */
	boolean_t have_usecount = FALSE;        /* vnode_ref() taken on the covered vnode */
	boolean_t did_set_lmount = FALSE;       /* MNT_LMOUNT set on mp and must be cleared before return */
	boolean_t kernelmount = !!(internal_flags & KERNEL_MOUNT_KMOUNT);

#if CONFIG_ROSV_STARTUP || CONFIG_MOUNT_VM || CONFIG_BASESYSTEMROOT
	/* Check for mutually-exclusive flag bits */
	uint32_t checkflags = (internal_flags & (KERNEL_MOUNT_VOLBYROLE_MASK | KERNEL_MOUNT_BASESYSTEMROOT));
	int bitcount = 0;
	while (checkflags != 0) {
		/* clear the lowest set bit each pass (Kernighan population count) */
		checkflags &= (checkflags - 1);
		bitcount++;
	}

	if (bitcount > 1) {
		//not allowed to request multiple mount-by-role flags
		error = EINVAL;
		goto out1;
	}
#endif

	/*
	 * Process an update for an existing mount
	 */
	if (flags & MNT_UPDATE) {
		/* updates must target the root vnode of an already-mounted file system */
		if ((vp->v_flag & VROOT) == 0) {
			error = EINVAL;
			goto out1;
		}
		mp = vp->v_mount;

		/* if unmount or mount in progress, return error */
		mount_lock_spin(mp);
		if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
			mount_unlock(mp);
			error = EBUSY;
			goto out1;
		}
		mp->mnt_lflag |= MNT_LMOUNT;
		did_set_lmount = TRUE;
		mount_unlock(mp);
		lck_rw_lock_exclusive(&mp->mnt_rwlock);
		is_rwlock_locked = TRUE;
		/*
		 * We only allow the filesystem to be reloaded if it
		 * is currently mounted read-only.
		 */
		if ((flags & MNT_RELOAD) &&
		    ((mp->mnt_flag & MNT_RDONLY) == 0)) {
			error = ENOTSUP;
			goto out1;
		}

		/*
		 * If content protection is enabled, update mounts are not
		 * allowed to turn it off.
		 */
		if ((mp->mnt_flag & MNT_CPROTECT) &&
		    ((flags & MNT_CPROTECT) == 0)) {
			error = EINVAL;
			goto out1;
		}

		/*
		 * can't turn off MNT_REMOVABLE either but it may be an unexpected
		 * failure to return an error for this so we'll just silently
		 * add it if it is not passed in.
		 */
		if ((mp->mnt_flag & MNT_REMOVABLE) &&
		    ((flags & MNT_REMOVABLE) == 0)) {
			flags |= MNT_REMOVABLE;
		}

		/* Can't downgrade the backer of the root FS */
		if ((mp->mnt_kern_flag & MNTK_BACKS_ROOT) &&
		    (!vfs_isrdonly(mp)) && (flags & MNT_RDONLY)) {
			error = ENOTSUP;
			goto out1;
		}

		/*
		 * Only root, or the user that did the original mount is
		 * permitted to update it.
		 */
		if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
		    (error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
			goto out1;
		}
#if CONFIG_MACF
		error = mac_mount_check_remount(ctx, mp);
		if (error != 0) {
			goto out1;
		}
#endif
		/*
		 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV,
		 * and MNT_NOEXEC if mount point is already MNT_NOEXEC.
		 */
		if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
			flags |= MNT_NOSUID | MNT_NODEV;
			if (mp->mnt_flag & MNT_NOEXEC) {
				flags |= MNT_NOEXEC;
			}
		}
		/* remember the current flags so a failed update can restore them */
		flag = mp->mnt_flag;
		flag_set = true;



		mp->mnt_flag |= flags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE);

		vfsp = mp->mnt_vtable;
		goto update;
	} // MNT_UPDATE

	/*
	 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV, and
	 * MNT_NOEXEC if mount point is already MNT_NOEXEC.
	 */
	if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
		flags |= MNT_NOSUID | MNT_NODEV;
		if (vp->v_mount->mnt_flag & MNT_NOEXEC) {
			flags |= MNT_NOEXEC;
		}
	}

	/* XXXAUDIT: Should we capture the type on the error path as well? */
	/* XXX cast-away const (audit_arg_text() does not modify its input) */
	AUDIT_ARG(text, (char *)(uintptr_t)fstypename);
	/* Look up the requested file system type and take a ref on its vfstable entry */
	mount_list_lock();
	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
		if (!strncmp(vfsp->vfc_name, fstypename, MFSNAMELEN)) {
			vfsp->vfc_refcount++;
			vfsp_ref = TRUE;
			break;
		}
	}
	mount_list_unlock();
	if (vfsp == NULL) {
		error = ENODEV;
		goto out1;
	}

	/*
	 * VFC_VFSLOCALARGS is not currently supported for kernel mounts,
	 * except in ROSV configs and for the initial BaseSystem root.
	 */
	if (kernelmount && (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) &&
	    ((internal_flags & KERNEL_MOUNT_VOLBYROLE_MASK) == 0) &&
	    ((internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) == 0)) {
		error = EINVAL; /* unsupported request */
		goto out1;
	}

	/* flush the covered vnode and mark it VMOUNT (cleared again on failure below) */
	error = prepare_coveredvp(vp, ctx, cnp, fstypename, internal_flags);
	if (error != 0) {
		goto out1;
	}

	/*
	 * Allocate and initialize the filesystem (mount_t)
	 */
	mp = zalloc_flags(mount_zone, Z_WAITOK | Z_ZERO);
	mntalloc = 1;

	/* Initialize the default IO constraints */
	mp->mnt_maxreadcnt = mp->mnt_maxwritecnt = MAXPHYS;
	mp->mnt_segreadcnt = mp->mnt_segwritecnt = 32;
	mp->mnt_maxsegreadsize = mp->mnt_maxreadcnt;
	mp->mnt_maxsegwritesize = mp->mnt_maxwritecnt;
	mp->mnt_devblocksize = DEV_BSIZE;
	mp->mnt_alignmentmask = PAGE_MASK;
	mp->mnt_ioqueue_depth = MNT_DEFAULT_IOQUEUE_DEPTH;
	mp->mnt_ioscale = 1;
	mp->mnt_ioflags = 0;
	mp->mnt_realrootvp = NULLVP;
	mp->mnt_authcache_ttl = CACHED_LOOKUP_RIGHT_TTL;

	/* mark a mount in progress so concurrent mount/unmount attempts back off */
	mp->mnt_lflag |= MNT_LMOUNT;
	did_set_lmount = TRUE;

	TAILQ_INIT(&mp->mnt_vnodelist);
	TAILQ_INIT(&mp->mnt_workerqueue);
	TAILQ_INIT(&mp->mnt_newvnodes);
	mount_lock_init(mp);
	lck_rw_lock_exclusive(&mp->mnt_rwlock);
	is_rwlock_locked = TRUE;
	mp->mnt_op = vfsp->vfc_vfsops;
	mp->mnt_vtable = vfsp;
	//mp->mnt_stat.f_type = vfsp->vfc_typenum;
	mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
	strlcpy(mp->mnt_vfsstat.f_fstypename, vfsp->vfc_name, MFSTYPENAMELEN);
	do {
		size_t pathlen = MAXPATHLEN;

		/* fall back to the caller-supplied component path if vn_getpath_ext() fails */
		if (vn_getpath_ext(vp, pvp, mp->mnt_vfsstat.f_mntonname, &pathlen, VN_GETPATH_FSENTER)) {
			strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);
		}
	} while (0);
	mp->mnt_vnodecovered = vp;
	mp->mnt_vfsstat.f_owner = kauth_cred_getuid(vfs_context_ucred(ctx));
	mp->mnt_throttle_mask = LOWPRI_MAX_NUM_DEV - 1;
	mp->mnt_devbsdunit = 0;
	mp->mnt_mount_id = os_atomic_inc_orig(&mount_unique_id, relaxed);

	/* XXX 3762912 hack to support HFS filesystem 'owner' - filesystem may update later */
	vfs_setowner(mp, KAUTH_UID_NONE, KAUTH_GID_NONE);

	if (kernelmount) {
		mp->mnt_kern_flag |= MNTK_KERNEL_MOUNT;
	}
	if ((internal_flags & KERNEL_MOUNT_PERMIT_UNMOUNT) != 0) {
		mp->mnt_kern_flag |= MNTK_PERMIT_UNMOUNT;
	}

	if (KERNEL_MOUNT_DEVFS & internal_flags) {
		// kernel mounted devfs
		mp->mnt_kern_flag |= MNTK_SYSTEM;
	}

update:
	/* Both fresh mounts and MNT_UPDATE converge here */

	/*
	 * Set the mount level flags.
	 */
	if (flags & MNT_RDONLY) {
		mp->mnt_flag |= MNT_RDONLY;
	} else if (mp->mnt_flag & MNT_RDONLY) {
		// disallow read/write upgrades of file systems that
		// had the TYPENAME_OVERRIDE feature set.
		if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
			error = EPERM;
			goto out1;
		}
		mp->mnt_kern_flag |= MNTK_WANTRDWR;
	}
	/* clear, then re-apply, the caller-controllable visible flags */
	mp->mnt_flag &= ~(MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
	    MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
	    MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
	    MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME | MNT_STRICTATIME |
	    MNT_QUARANTINE | MNT_CPROTECT);

#if SECURE_KERNEL
#if !CONFIG_MNT_SUID
	/*
	 * On release builds of iOS based platforms, always enforce NOSUID on
	 * all mounts. We do this here because we can catch update mounts as well as
	 * non-update mounts in this case.
	 */
	mp->mnt_flag |= (MNT_NOSUID);
#endif
#endif

	mp->mnt_flag |= flags & (MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
	    MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
	    MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
	    MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME | MNT_STRICTATIME |
	    MNT_QUARANTINE | MNT_CPROTECT);

#if CONFIG_MACF
	if (flags & MNT_MULTILABEL) {
		if (vfsp->vfc_vfsflags & VFC_VFSNOMACLABEL) {
			error = EINVAL;
			goto out1;
		}
		mp->mnt_flag |= MNT_MULTILABEL;
	}
#endif
	/*
	 * Process device path for local file systems if requested.
	 *
	 * Snapshot and mount-by-role mounts do not use this path; they are
	 * passing other opaque data in the device path field.
	 *
	 * Basesystemroot mounts pass a device path to be resolved here,
	 * but it's just a char * already inside the kernel, which
	 * kernel_mount() shoved into a user_addr_t to call us. So for such
	 * mounts we must skip copyin (both of the address and of the string
	 * (in NDINIT)).
	 */
	if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS &&
	    !(internal_flags & (KERNEL_MOUNT_SNAPSHOT | KERNEL_MOUNT_VOLBYROLE_MASK))) {
		boolean_t do_copyin_devpath = true;
#if CONFIG_BASESYSTEMROOT
		if (internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) {
			// KERNEL_MOUNT_BASESYSTEMROOT implies subtle behavior worth noting:
			// We have been passed fsmountargs, which is typed as a user_addr_t,
			// but is actually a char ** pointing to a (kernelspace) string.
			// We manually unpack it with a series of casts and dereferences
			// that reverses what was done just above us on the stack in
			// imageboot_pivot_image().
			// After retrieving the path to the dev node (which we will NDINIT
			// in a moment), we pass NULL fsmountargs on to the filesystem.
			_Static_assert(sizeof(char **) == sizeof(fsmountargs), "fsmountargs should fit a (kernel) address");
			char **devnamepp = (char **)fsmountargs;
			char *devnamep = *devnamepp;
			devpath = CAST_USER_ADDR_T(devnamep);
			do_copyin_devpath = false;
			fsmountargs = USER_ADDR_NULL;

			//Now that we have a mp, denote that this mount is for the basesystem.
			mp->mnt_supl_kern_flag |= MNTK_SUPL_BASESYSTEM;
		}
#endif // CONFIG_BASESYSTEMROOT

		if (do_copyin_devpath) {
			/* first element of fsmountargs is the user pointer to the device path */
			if (vfs_context_is64bit(ctx)) {
				if ((error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath)))) {
					goto out1;
				}
				fsmountargs += sizeof(devpath);
			} else {
				user32_addr_t tmp;
				if ((error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp)))) {
					goto out1;
				}
				/* munge into LP64 addr */
				devpath = CAST_USER_ADDR_T(tmp);
				fsmountargs += sizeof(tmp);
			}
		}

		/* Lookup device and authorize access to it */
		if ((devpath)) {
			struct nameidata nd;

			enum uio_seg seg = UIO_USERSPACE;
#if CONFIG_BASESYSTEMROOT
			if (internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) {
				/* devpath is a kernel-space string in this case (see above) */
				seg = UIO_SYSSPACE;
			}
#endif // CONFIG_BASESYSTEMROOT

			NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW, seg, devpath, ctx);
			if ((error = namei(&nd))) {
				goto out1;
			}

			strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);
			devvp = nd.ni_vp;

			nameidone(&nd);

			if (devvp->v_type != VBLK) {
				error = ENOTBLK;
				goto out2;
			}
			if (major(devvp->v_rdev) >= nblkdev) {
				error = ENXIO;
				goto out2;
			}
			/*
			 * If mount by non-root, then verify that user has necessary
			 * permissions on the device.
			 */
			if (suser(vfs_context_ucred(ctx), NULL) != 0) {
				kauth_action_t accessmode = KAUTH_VNODE_READ_DATA;

				if ((mp->mnt_flag & MNT_RDONLY) == 0) {
					accessmode |= KAUTH_VNODE_WRITE_DATA;
				}
				if ((error = vnode_authorize(devvp, NULL, accessmode, ctx)) != 0) {
					goto out2;
				}
			}
		}
		/* On first mount, preflight and open device */
		if (devpath && ((flags & MNT_UPDATE) == 0)) {
			/* take a usecount so the device vnode outlives this call */
			if ((error = vnode_ref(devvp))) {
				goto out2;
			}
			/*
			 * Disallow multiple mounts of the same device.
			 * Disallow mounting of a device that is currently in use
			 * (except for root, which might share swap device for miniroot).
			 * Flush out any old buffers remaining from a previous use.
			 */
			if ((error = vfs_setmounting(devvp))) {
				vnode_rele(devvp);
				goto out2;
			}

			if (vcount(devvp) > 1 && !(vfs_flags(mp) & MNT_ROOTFS)) {
				error = EBUSY;
				goto out3;
			}
			if ((error = VNOP_FSYNC(devvp, MNT_WAIT, ctx))) {
				error = ENOTBLK;
				goto out3;
			}
			if ((error = buf_invalidateblks(devvp, BUF_WRITE_DATA, 0, 0))) {
				goto out3;
			}

			ronly = (mp->mnt_flag & MNT_RDONLY) != 0;
#if CONFIG_MACF
			error = mac_vnode_check_open(ctx,
			    devvp,
			    ronly ? FREAD : FREAD | FWRITE);
			if (error) {
				goto out3;
			}
#endif /* MAC */
			if ((error = VNOP_OPEN(devvp, ronly ? FREAD : FREAD | FWRITE, ctx))) {
				goto out3;
			}

			mp->mnt_devvp = devvp;
			device_vnode = devvp;
		} else if ((mp->mnt_flag & MNT_RDONLY) &&
		    (mp->mnt_kern_flag & MNTK_WANTRDWR) &&
		    (device_vnode = mp->mnt_devvp)) {
			dev_t dev;
			int maj;
			/*
			 * If upgrade to read-write by non-root, then verify
			 * that user has necessary permissions on the device.
			 */
			vnode_getalways(device_vnode);

			if (suser(vfs_context_ucred(ctx), NULL) &&
			    (error = vnode_authorize(device_vnode, NULL,
			    KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA,
			    ctx)) != 0) {
				vnode_put(device_vnode);
				goto out2;
			}

			/* Tell the device that we're upgrading */
			dev = (dev_t)device_vnode->v_rdev;
			maj = major(dev);

			if ((u_int)maj >= (u_int)nblkdev) {
				panic("Volume mounted on a device with invalid major number.");
			}

			error = bdevsw[maj].d_open(dev, FREAD | FWRITE, S_IFBLK, p);
			vnode_put(device_vnode);
			/* cleared so the rw-upgrade path doesn't re-run device IO-attr setup below */
			device_vnode = NULLVP;
			if (error != 0) {
				goto out2;
			}
		}
	} // localargs && !(snapshot | data | vm)

#if CONFIG_MACF
	if ((flags & MNT_UPDATE) == 0) {
		mac_mount_label_init(mp);
		mac_mount_label_associate(ctx, mp);
	}
	if (labelstr) {
		if ((flags & MNT_UPDATE) != 0) {
			error = mac_mount_check_label_update(ctx, mp);
			if (error != 0) {
				goto out3;
			}
		}
	}
#endif
	/*
	 * Mount the filesystem. We already asserted that internal_flags
	 * cannot have more than one mount-by-role bit set.
	 */
	if (internal_flags & KERNEL_MOUNT_SNAPSHOT) {
		/* fsmountargs carries snapshot-specific data for this ioctl */
		error = VFS_IOCTL(mp, VFSIOC_MOUNT_SNAPSHOT,
		    (caddr_t)fsmountargs, 0, ctx);
	} else if (internal_flags & KERNEL_MOUNT_DATAVOL) {
#if CONFIG_ROSV_STARTUP
		/* fsmountargs is repurposed as the originating (system) mount here */
		struct mount *origin_mp = (struct mount*)fsmountargs;
		fs_role_mount_args_t frma = {origin_mp, VFS_DATA_ROLE};
		error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
		if (error) {
			printf("MOUNT-BY-ROLE (%d) failed! (%d)", VFS_DATA_ROLE, error);
		} else {
			/* Mark volume associated with system volume */
			mp->mnt_kern_flag |= MNTK_SYSTEM;

			/* Attempt to acquire the mnt_devvp and set it up */
			struct vnode *mp_devvp = NULL;
			if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
				errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
				    0, &mp_devvp, vfs_context_kernel());
				if (!lerr) {
					mp->mnt_devvp = mp_devvp;
					//vnode_lookup took an iocount, need to drop it.
					vnode_put(mp_devvp);
					// now set `device_vnode` to the devvp that was acquired.
					// this is needed in order to ensure vfs_init_io_attributes is invoked.
					// note that though the iocount above was dropped, the mount acquires
					// an implicit reference against the device.
					device_vnode = mp_devvp;
				}
			}
		}
#else
		error = EINVAL;
#endif
	} else if (internal_flags & KERNEL_MOUNT_VMVOL) {
#if CONFIG_MOUNT_VM
		struct mount *origin_mp = (struct mount*)fsmountargs;
		fs_role_mount_args_t frma = {origin_mp, VFS_VM_ROLE};
		error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
		if (error) {
			printf("MOUNT-BY-ROLE (%d) failed! (%d)", VFS_VM_ROLE, error);
		} else {
			/* Mark volume associated with system volume and a swap mount */
			mp->mnt_kern_flag |= (MNTK_SYSTEM | MNTK_SWAP_MOUNT);
			/* Attempt to acquire the mnt_devvp and set it up */
			struct vnode *mp_devvp = NULL;
			if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
				errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
				    0, &mp_devvp, vfs_context_kernel());
				if (!lerr) {
					mp->mnt_devvp = mp_devvp;
					//vnode_lookup took an iocount, need to drop it.
					vnode_put(mp_devvp);

					// now set `device_vnode` to the devvp that was acquired.
					// note that though the iocount above was dropped, the mount acquires
					// an implicit reference against the device.
					device_vnode = mp_devvp;
				}
			}
		}
#else
		error = EINVAL;
#endif
	} else if ((internal_flags & KERNEL_MOUNT_PREBOOTVOL) || (internal_flags & KERNEL_MOUNT_RECOVERYVOL)) {
#if CONFIG_MOUNT_PREBOOTRECOVERY
		struct mount *origin_mp = (struct mount*)fsmountargs;
		uint32_t mount_role = 0;
		if (internal_flags & KERNEL_MOUNT_PREBOOTVOL) {
			mount_role = VFS_PREBOOT_ROLE;
		} else if (internal_flags & KERNEL_MOUNT_RECOVERYVOL) {
			mount_role = VFS_RECOVERY_ROLE;
		}

		if (mount_role != 0) {
			fs_role_mount_args_t frma = {origin_mp, mount_role};
			error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
			if (error) {
				printf("MOUNT-BY-ROLE (%d) failed! (%d)", mount_role, error);
			} else {
				// NOT YET - need to qualify how this interacts with shutdown, ERP/ERB, etc
				/* Mark volume associated with system volume */
				//mp->mnt_kern_flag |= MNTK_SYSTEM;
				/* Attempt to acquire the mnt_devvp and set it up */
				struct vnode *mp_devvp = NULL;
				if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
					errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
					    0, &mp_devvp, vfs_context_kernel());
					if (!lerr) {
						mp->mnt_devvp = mp_devvp;
						//vnode_lookup took an iocount, need to drop it.
						vnode_put(mp_devvp);

						// now set `device_vnode` to the devvp that was acquired.
						// note that though the iocount above was dropped, the mount acquires
						// an implicit reference against the device.
						device_vnode = mp_devvp;
					}
				}
			}
		} else {
			/* NOTE(review): unreachable — the enclosing 'else if' guarantees one role bit is set */
			printf("MOUNT-BY-ROLE (%d) failed - ROLE UNRECOGNIZED! (%d)", mount_role, error);
			error = EINVAL;
		}
#else
		error = EINVAL;
#endif
	} else {
		/* the common case: hand off to the file system's mount entry point */
		error = VFS_MOUNT(mp, device_vnode, fsmountargs, ctx);
	}

	if (flags & MNT_UPDATE) {
		if (mp->mnt_kern_flag & MNTK_WANTRDWR) {
			mp->mnt_flag &= ~MNT_RDONLY;
		}
		mp->mnt_flag &= ~
		    (MNT_UPDATE | MNT_RELOAD | MNT_FORCE);
		mp->mnt_kern_flag &= ~MNTK_WANTRDWR;
		if (error) {
			mp->mnt_flag = flag; /* restore flag value */
		}
		vfs_event_signal(NULL, VQ_UPDATE, (intptr_t)NULL);
		lck_rw_done(&mp->mnt_rwlock);
		is_rwlock_locked = FALSE;
		if (!error) {
			enablequotas(mp, ctx);
		}
		goto exit;
	}

	/*
	 * Put the new filesystem on the mount list after root.
	 */
	if (error == 0) {
		struct vfs_attr vfsattr;
		if (device_vnode) {
			/*
			 * cache the IO attributes for the underlying physical media...
			 * an error return indicates the underlying driver doesn't
			 * support all the queries necessary... however, reasonable
			 * defaults will have been set, so no reason to bail or care
			 *
			 * Need to do this before calling the MAC hook as it needs
			 * information from this call.
			 */
			vfs_init_io_attributes(device_vnode, mp);
		}

#if CONFIG_MACF
		error = mac_mount_check_mount_late(ctx, mp);
		if (error != 0) {
			goto out4;
		}

		if (vfs_flags(mp) & MNT_MULTILABEL) {
			error = VFS_ROOT(mp, &rvp, ctx);
			if (error) {
				printf("%s() VFS_ROOT returned %d\n", __func__, error);
				goto out4;
			}
			error = vnode_label(mp, NULL, rvp, NULL, 0, ctx);
			/*
			 * drop reference provided by VFS_ROOT
			 */
			vnode_put(rvp);

			if (error) {
				goto out4;
			}
		}
#endif /* MAC */

		/* publish the mount on the covered vnode */
		vnode_lock_spin(vp);
		CLR(vp->v_flag, VMOUNT);
		vp->v_mountedhere = mp;
		SET(vp->v_flag, VMOUNTEDHERE);
		vnode_unlock(vp);

		/*
		 * taking the name_cache_lock exclusively will
		 * insure that everyone is out of the fast path who
		 * might be trying to use a now stale copy of
		 * vp->v_mountedhere->mnt_realrootvp
		 * bumping mount_generation causes the cached values
		 * to be invalidated
		 */
		name_cache_lock();
		mount_generation++;
		name_cache_unlock();

		/* usecount on the covered vnode for the lifetime of the mount */
		error = vnode_ref(vp);
		if (error != 0) {
			goto out4;
		}

		have_usecount = TRUE;

		error = checkdirs(vp, ctx);
		if (error != 0) {
			/* Unmount the filesystem as cdir/rdirs cannot be updated */
			goto out4;
		}
		/*
		 * there is no cleanup code here so I have made it void
		 * we need to revisit this
		 */
		(void)VFS_START(mp, 0, ctx);

		if (mount_list_add(mp) != 0) {
			/*
			 * The system is shutting down trying to umount
			 * everything, so fail with a plausible errno.
			 */
			error = EBUSY;
			goto out4;
		}
		lck_rw_done(&mp->mnt_rwlock);
		is_rwlock_locked = FALSE;

		/* Check if this mounted file system supports EAs or named streams. */
		/* Skip WebDAV file systems for now since they hang in VFS_GETATTR here. */
		VFSATTR_INIT(&vfsattr);
		VFSATTR_WANTED(&vfsattr, f_capabilities);
		if (strncmp(mp->mnt_vfsstat.f_fstypename, "webdav", sizeof("webdav")) != 0 &&
		    vfs_getattr(mp, &vfsattr, ctx) == 0 &&
		    VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR) &&
			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR)) {
				mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
			}
#if NAMEDSTREAMS
			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS) &&
			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS)) {
				mp->mnt_kern_flag |= MNTK_NAMED_STREAMS;
			}
#endif
			/* Check if this file system supports path from id lookups. */
			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID) &&
			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID)) {
				mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
			} else if (mp->mnt_flag & MNT_DOVOLFS) {
				/* Legacy MNT_DOVOLFS flag also implies path from id lookups. */
				mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
			}

			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS) &&
			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS)) {
				mp->mnt_kern_flag |= MNTK_DIR_HARDLINKS;
			}
		}
		if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSNATIVEXATTR) {
			mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
		}
		if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSPREFLIGHT) {
			mp->mnt_kern_flag |= MNTK_UNMOUNT_PREFLIGHT;
		}
		/* increment the operations count */
		OSAddAtomic(1, &vfs_nummntops);
		enablequotas(mp, ctx);

		if (device_vnode) {
			vfs_setmountedon(device_vnode);
		}

		/* Now that mount is setup, notify the listeners */
		vfs_notify_mount(pvp);
		IOBSDMountChange(mp, kIOMountChangeMount);
	} else {
		/* If we fail a fresh mount, there should be no vnodes left hooked into the mountpoint. */
		if (mp->mnt_vnodelist.tqh_first != NULL) {
			panic("mount_common(): mount of %s filesystem failed with %d, but vnode list is not empty.",
			    mp->mnt_vtable->vfc_name, error);
		}

		/* VFS_MOUNT failed: unwind the VMOUNT claim and the vfstable ref */
		vnode_lock_spin(vp);
		CLR(vp->v_flag, VMOUNT);
		vnode_unlock(vp);
		mount_list_lock();
		mp->mnt_vtable->vfc_refcount--;
		mount_list_unlock();

		if (device_vnode) {
			vnode_rele(device_vnode);
			VNOP_CLOSE(device_vnode, ronly ? FREAD : FREAD | FWRITE, ctx);
			vfs_clearmounting(device_vnode);
		}
		lck_rw_done(&mp->mnt_rwlock);
		is_rwlock_locked = FALSE;

		if (nc_smr_enabled) {
			vfs_smr_synchronize();
		}

		/*
		 * if we get here, we have a mount structure that needs to be freed,
		 * but since the coveredvp hasn't yet been updated to point at it,
		 * no need to worry about other threads holding a crossref on this mp
		 * so it's ok to just free it
		 */
		mount_lock_destroy(mp);
#if CONFIG_MACF
		mac_mount_label_destroy(mp);
#endif
		zfree(mount_zone, mp);
		/* mp is gone; don't touch its lflags in the exit path */
		did_set_lmount = false;
	}
exit:
	/*
	 * drop I/O count on the device vp if there was one
	 */
	if (devpath && devvp) {
		vnode_put(devvp);
	}

	if (did_set_lmount) {
		mount_lock_spin(mp);
		mp->mnt_lflag &= ~MNT_LMOUNT;
		mount_unlock(mp);
	}

	return error;

	/* Error condition exits */
out4:
	(void)VFS_UNMOUNT(mp, MNT_FORCE, ctx);

	/*
	 * If the mount has been placed on the covered vp,
	 * it may have been discovered by now, so we have
	 * to treat this just like an unmount
	 */
	mount_lock_spin(mp);
	mp->mnt_lflag |= MNT_LDEAD;
	mount_unlock(mp);

	if (device_vnode != NULLVP) {
		vnode_rele(device_vnode);
		VNOP_CLOSE(device_vnode, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD | FWRITE,
		    ctx);
		vfs_clearmounting(device_vnode);
		did_rele = TRUE;
	}

	/* detach the (possibly discovered) mount from the covered vnode */
	vnode_lock_spin(vp);

	mp->mnt_crossref++;
	CLR(vp->v_flag, VMOUNTEDHERE);
	vp->v_mountedhere = (mount_t) 0;

	vnode_unlock(vp);

	if (have_usecount) {
		vnode_rele(vp);
	}
out3:
	if (devpath && ((flags & MNT_UPDATE) == 0) && (!did_rele)) {
		vnode_rele(devvp);
		vfs_clearmounting(devvp);
	}
out2:
	if (devpath && devvp) {
		vnode_put(devvp);
	}
out1:
	/* Release mnt_rwlock only when it was taken */
	if (is_rwlock_locked == TRUE) {
		if (flag_set) {
			mp->mnt_flag = flag; /* restore mnt_flag value */
		}
		lck_rw_done(&mp->mnt_rwlock);
	}

	if (did_set_lmount) {
		mount_lock_spin(mp);
		mp->mnt_lflag &= ~MNT_LMOUNT;
		mount_unlock(mp);
	}

	if (mntalloc) {
		if (mp->mnt_crossref) {
			/* other threads may hold a crossref; defer the free */
			mount_dropcrossref(mp, vp, 0);
		} else {
			if (nc_smr_enabled) {
				vfs_smr_synchronize();
			}

			mount_lock_destroy(mp);
#if CONFIG_MACF
			mac_mount_label_destroy(mp);
#endif
			zfree(mount_zone, mp);
		}
	}
	if (vfsp_ref) {
		mount_list_lock();
		vfsp->vfc_refcount--;
		mount_list_unlock();
	}

	return error;
}
1878
1879 /*
1880 * Flush in-core data, check for competing mount attempts,
1881 * and set VMOUNT
1882 */
int
prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, uint32_t internal_flags)
{
#if !CONFIG_MACF
#pragma unused(cnp,fsname)
#endif
	struct vnode_attr va;
	int error;
	/* KERNEL_MOUNT_NOAUTH: kernel-initiated mount; skip the ownership check */
	boolean_t skip_auth = !!(internal_flags & KERNEL_MOUNT_NOAUTH);
	boolean_t is_fmount = !!(internal_flags & KERNEL_MOUNT_FMOUNT);
	boolean_t is_busy;

	if (!skip_auth) {
		/*
		 * If the user is not root, ensure that they own the directory
		 * onto which we are attempting to mount.
		 */
		VATTR_INIT(&va);
		VATTR_WANTED(&va, va_uid);
		if ((error = vnode_getattr(vp, &va, ctx)) ||
		    (va.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
		    (!vfs_context_issuser(ctx)))) {
			error = EPERM;
			goto out;
		}
	}

	/* flush dirty data for the covered vnode before the mount hides it */
	if ((error = VNOP_FSYNC(vp, MNT_WAIT, ctx))) {
		goto out;
	}

	if ((error = buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0))) {
		goto out;
	}

	/* only directories can be covered by a mount */
	if (vp->v_type != VDIR) {
		error = ENOTDIR;
		goto out;
	}

	vnode_lock_spin(vp);
	/*
	 * fmount: busy if a mount is in flight (VMOUNT) OR something is already
	 * mounted here; regular mount: busy only when BOTH are true, i.e. a
	 * competing mount has both claimed and covered this vnode.
	 */
	is_busy = is_fmount ?
	    (ISSET(vp->v_flag, VMOUNT) || (vp->v_mountedhere != NULL)) :
	    (ISSET(vp->v_flag, VMOUNT) && (vp->v_mountedhere != NULL));
	if (is_busy) {
		vnode_unlock(vp);
		error = EBUSY;
		goto out;
	}
	/* claim the vnode: a mount is now in progress on it (caller clears on failure) */
	SET(vp->v_flag, VMOUNT);
	vnode_unlock(vp);

#if CONFIG_MACF
	error = mac_mount_check_mount(ctx, vp,
	    cnp, fsname);
	if (error != 0) {
		/* MAC policy vetoed the mount: undo the VMOUNT claim */
		vnode_lock_spin(vp);
		CLR(vp->v_flag, VMOUNT);
		vnode_unlock(vp);
	}
#endif

out:
	return error;
}
1948
1949 #if CONFIG_IMGSRC_ACCESS
1950
1951 #define DEBUG_IMGSRC 0
1952
1953 #if DEBUG_IMGSRC
1954 #define IMGSRC_DEBUG(args...) printf("imgsrc: " args)
1955 #else
1956 #define IMGSRC_DEBUG(args...) do { } while(0)
1957 #endif
1958
1959 static int
authorize_devpath_and_update_mntfromname(mount_t mp,user_addr_t devpath,vnode_t * devvpp,vfs_context_t ctx)1960 authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx)
1961 {
1962 struct nameidata nd;
1963 vnode_t vp, realdevvp;
1964 kauth_action_t accessmode;
1965 int error;
1966 enum uio_seg uio = UIO_USERSPACE;
1967
1968 if (ctx == vfs_context_kernel()) {
1969 uio = UIO_SYSSPACE;
1970 }
1971
1972 NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW, uio, devpath, ctx);
1973 if ((error = namei(&nd))) {
1974 IMGSRC_DEBUG("namei() failed with %d\n", error);
1975 return error;
1976 }
1977
1978 vp = nd.ni_vp;
1979
1980 if (!vnode_isblk(vp)) {
1981 IMGSRC_DEBUG("Not block device.\n");
1982 error = ENOTBLK;
1983 goto out;
1984 }
1985
1986 realdevvp = mp->mnt_devvp;
1987 if (realdevvp == NULLVP) {
1988 IMGSRC_DEBUG("No device backs the mount.\n");
1989 error = ENXIO;
1990 goto out;
1991 }
1992
1993 error = vnode_getwithref(realdevvp);
1994 if (error != 0) {
1995 IMGSRC_DEBUG("Coudn't get iocount on device.\n");
1996 goto out;
1997 }
1998
1999 if (vnode_specrdev(vp) != vnode_specrdev(realdevvp)) {
2000 IMGSRC_DEBUG("Wrong dev_t.\n");
2001 error = ENXIO;
2002 goto out1;
2003 }
2004
2005 strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);
2006
2007 /*
2008 * If mount by non-root, then verify that user has necessary
2009 * permissions on the device.
2010 */
2011 if (!vfs_context_issuser(ctx)) {
2012 accessmode = KAUTH_VNODE_READ_DATA;
2013 if ((mp->mnt_flag & MNT_RDONLY) == 0) {
2014 accessmode |= KAUTH_VNODE_WRITE_DATA;
2015 }
2016 if ((error = vnode_authorize(vp, NULL, accessmode, ctx)) != 0) {
2017 IMGSRC_DEBUG("Access denied.\n");
2018 goto out1;
2019 }
2020 }
2021
2022 *devvpp = vp;
2023
2024 out1:
2025 vnode_put(realdevvp);
2026
2027 out:
2028 nameidone(&nd);
2029
2030 if (error) {
2031 vnode_put(vp);
2032 }
2033
2034 return error;
2035 }
2036
2037 /*
2038 * Clear VMOUNT, set v_mountedhere, and mnt_vnodecovered, ref the vnode,
2039 * and call checkdirs()
2040 */
static int
place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx)
{
	int error;

	mp->mnt_vnodecovered = vp; /* XXX This is normally only set at init-time ... */

	IMGSRC_DEBUG("placing: fsname = %s, vp = %s\n",
	    mp->mnt_vtable->vfc_name, vnode_getname(vp));

	/* clear the in-flight marker and publish the mount on the covered vnode */
	vnode_lock_spin(vp);
	CLR(vp->v_flag, VMOUNT);
	vp->v_mountedhere = mp;
	SET(vp->v_flag, VMOUNTEDHERE);
	vnode_unlock(vp);

	/*
	 * taking the name_cache_lock exclusively will
	 * insure that everyone is out of the fast path who
	 * might be trying to use a now stale copy of
	 * vp->v_mountedhere->mnt_realrootvp
	 * bumping mount_generation causes the cached values
	 * to be invalidated
	 */
	name_cache_lock();
	mount_generation++;
	name_cache_unlock();

	/* hold a usecount on the covered vnode for the lifetime of the mount */
	error = vnode_ref(vp);
	if (error != 0) {
		goto out;
	}

	error = checkdirs(vp, ctx);
	if (error != 0) {
		/* Unmount the filesystem as cdir/rdirs cannot be updated */
		vnode_rele(vp);
		goto out;
	}

out:
	if (error != 0) {
		/* on failure, detach the mount from the covered vnode again */
		mp->mnt_vnodecovered = NULLVP;
	}
	return error;
}
2087
/*
 * Undo place_mount_and_checkdirs(): drop the usecount it took on the
 * covered vnode, clear the mount-in-progress/mounted-here flags and the
 * v_mountedhere link, and detach mp from the covered vnode.
 */
static void
undo_place_on_covered_vp(mount_t mp, vnode_t vp)
{
	/* Drop the usecount taken by place_mount_and_checkdirs(). */
	vnode_rele(vp);
	vnode_lock_spin(vp);
	CLR(vp->v_flag, (VMOUNT | VMOUNTEDHERE));
	vp->v_mountedhere = (mount_t)NULL;
	vnode_unlock(vp);

	mp->mnt_vnodecovered = NULLVP;
}
2099
2100 static int
mount_begin_update(mount_t mp,vfs_context_t ctx,int flags)2101 mount_begin_update(mount_t mp, vfs_context_t ctx, int flags)
2102 {
2103 int error;
2104
2105 /* unmount in progress return error */
2106 mount_lock_spin(mp);
2107 if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
2108 mount_unlock(mp);
2109 return EBUSY;
2110 }
2111 mount_unlock(mp);
2112 lck_rw_lock_exclusive(&mp->mnt_rwlock);
2113
2114 /*
2115 * We only allow the filesystem to be reloaded if it
2116 * is currently mounted read-only.
2117 */
2118 if ((flags & MNT_RELOAD) &&
2119 ((mp->mnt_flag & MNT_RDONLY) == 0)) {
2120 error = ENOTSUP;
2121 goto out;
2122 }
2123
2124 /*
2125 * Only root, or the user that did the original mount is
2126 * permitted to update it.
2127 */
2128 if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
2129 (!vfs_context_issuser(ctx))) {
2130 error = EPERM;
2131 goto out;
2132 }
2133 #if CONFIG_MACF
2134 error = mac_mount_check_remount(ctx, mp);
2135 if (error != 0) {
2136 goto out;
2137 }
2138 #endif
2139
2140 out:
2141 if (error) {
2142 lck_rw_done(&mp->mnt_rwlock);
2143 }
2144
2145 return error;
2146 }
2147
/*
 * Release the exclusive mount rwlock taken by mount_begin_update().
 */
static void
mount_end_update(mount_t mp)
{
	lck_rw_done(&mp->mnt_rwlock);
}
2153
2154 static int
get_imgsrc_rootvnode(uint32_t height,vnode_t * rvpp)2155 get_imgsrc_rootvnode(uint32_t height, vnode_t *rvpp)
2156 {
2157 vnode_t vp;
2158
2159 if (height >= MAX_IMAGEBOOT_NESTING) {
2160 return EINVAL;
2161 }
2162
2163 vp = imgsrc_rootvnodes[height];
2164 if ((vp != NULLVP) && (vnode_get(vp) == 0)) {
2165 *rvpp = vp;
2166 return 0;
2167 } else {
2168 return ENOENT;
2169 }
2170 }
2171
/*
 * Relocate an imageboot source filesystem: take the still-unlisted mount
 * that backed the boot image (recorded in imgsrc_rootvnodes[]) and place
 * it on top of the caller-supplied covered vnode vp, then add it to the
 * global mount list.  Root only; each image mount can be moved only once
 * (MNTK_HAS_MOVED).
 *
 * pvp/vp    — parent and covered vnodes from the caller's namei()
 * cnp       — component name for vp (used for f_mntonname)
 * fsname    — expected filesystem name; must match mp->mnt_vtable
 * fsmountargs — user address of mnt_imgsrc_args (by_index) or a devpath
 * by_index  — selects the argument layout above
 *
 * Cleanup is a goto ladder: out3 undoes the mount-list step, out2 undoes
 * covered-vnode preparation (two ways, depending on `placed`), out1 drops
 * the mount rwlock, out0 drops rvp and the name buffer.
 */
static int
relocate_imageboot_source(vnode_t pvp, vnode_t vp,
    struct componentname *cnp, const char *fsname, vfs_context_t ctx,
    boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index)
{
	int error;
	mount_t mp;
	boolean_t placed = FALSE;	/* TRUE once mp is attached to vp; selects cleanup path */
	struct vfstable *vfsp;
	user_addr_t devpath;
	char *old_mntonname;		/* saved f_mntonname, restored on late failure */
	vnode_t rvp;
	vnode_t devvp;
	uint32_t height;
	uint32_t flags;

	/* If we didn't imageboot, nothing to move */
	if (imgsrc_rootvnodes[0] == NULLVP) {
		return EINVAL;
	}

	/* Only root can do this */
	if (!vfs_context_issuser(ctx)) {
		return EPERM;
	}

	IMGSRC_DEBUG("looking for root vnode.\n");

	/*
	 * Get root vnode of filesystem we're moving.
	 */
	if (by_index) {
		if (is64bit) {
			struct user64_mnt_imgsrc_args mia64;
			error = copyin(fsmountargs, &mia64, sizeof(mia64));
			if (error != 0) {
				IMGSRC_DEBUG("Failed to copy in arguments.\n");
				return error;
			}

			height = mia64.mi_height;
			flags = mia64.mi_flags;
			devpath = (user_addr_t)mia64.mi_devpath;
		} else {
			struct user32_mnt_imgsrc_args mia32;
			error = copyin(fsmountargs, &mia32, sizeof(mia32));
			if (error != 0) {
				IMGSRC_DEBUG("Failed to copy in arguments.\n");
				return error;
			}

			height = mia32.mi_height;
			flags = mia32.mi_flags;
			devpath = mia32.mi_devpath;
		}
	} else {
		/*
		 * For binary compatibility--assumes one level of nesting.
		 */
		if (is64bit) {
			if ((error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath)))) {
				return error;
			}
		} else {
			user32_addr_t tmp;
			if ((error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp)))) {
				return error;
			}

			/* munge into LP64 addr */
			devpath = CAST_USER_ADDR_T(tmp);
		}

		height = 0;
		flags = 0;
	}

	/* No flag bits are defined yet; reject anything set. */
	if (flags != 0) {
		IMGSRC_DEBUG("%s: Got nonzero flags.\n", __FUNCTION__);
		return EINVAL;
	}

	error = get_imgsrc_rootvnode(height, &rvp);
	if (error != 0) {
		IMGSRC_DEBUG("getting old root vnode failed with %d\n", error);
		return error;
	}

	IMGSRC_DEBUG("got old root vnode\n");

	old_mntonname = zalloc_flags(ZV_NAMEI, Z_WAITOK);

	/* Can only move once */
	mp = vnode_mount(rvp);
	if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
		IMGSRC_DEBUG("Already moved.\n");
		error = EBUSY;
		goto out0;
	}

	IMGSRC_DEBUG("moving rvp: fsname = %s\n", mp->mnt_vtable->vfc_name);
	IMGSRC_DEBUG("Starting updated.\n");

	/* Get exclusive rwlock on mount, authorize update on mp */
	error = mount_begin_update(mp, ctx, 0);
	if (error != 0) {
		IMGSRC_DEBUG("Starting updated failed with %d\n", error);
		goto out0;
	}

	/*
	 * It can only be moved once. Flag is set under the rwlock,
	 * so we're now safe to proceed.
	 *
	 * NOTE(review): error is still 0 here, so losing this race returns
	 * success (0) rather than EBUSY as the pre-lock check does — confirm
	 * that is intended.
	 */
	if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
		IMGSRC_DEBUG("Already moved [2]\n");
		goto out1;
	}

	IMGSRC_DEBUG("Preparing coveredvp.\n");

	/* Mark covered vnode as mount in progress, authorize placing mount on top */
	error = prepare_coveredvp(vp, ctx, cnp, fsname, 0);
	if (error != 0) {
		IMGSRC_DEBUG("Preparing coveredvp failed with %d.\n", error);
		goto out1;
	}

	IMGSRC_DEBUG("Covered vp OK.\n");

	/* Sanity check the name caller has provided */
	vfsp = mp->mnt_vtable;
	if (strncmp(vfsp->vfc_name, fsname, MFSNAMELEN) != 0) {
		IMGSRC_DEBUG("Wrong fs name: actual = %s, expected = %s\n",
		    vfsp->vfc_name, fsname);
		error = EINVAL;
		goto out2;
	}

	/* Check the device vnode and update mount-from name, for local filesystems */
	if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) {
		IMGSRC_DEBUG("Local, doing device validation.\n");

		if (devpath != USER_ADDR_NULL) {
			error = authorize_devpath_and_update_mntfromname(mp, devpath, &devvp, ctx);
			if (error) {
				IMGSRC_DEBUG("authorize_devpath_and_update_mntfromname() failed.\n");
				goto out2;
			}

			/* Only needed the device for validation; drop the iocount. */
			vnode_put(devvp);
		}
	}

	/*
	 * Place mp on top of vnode, ref the vnode, call checkdirs(),
	 * and increment the name cache's mount generation
	 */

	IMGSRC_DEBUG("About to call place_mount_and_checkdirs().\n");
	error = place_mount_and_checkdirs(mp, vp, ctx);
	if (error != 0) {
		goto out2;
	}

	placed = TRUE;

	/* Save the old mount-on name so out3 can restore it. */
	strlcpy(old_mntonname, mp->mnt_vfsstat.f_mntonname, MAXPATHLEN);
	strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);

	/* Forbid future moves */
	mount_lock(mp);
	mp->mnt_kern_flag |= MNTK_HAS_MOVED;
	mount_unlock(mp);

	/* Finally, add to mount list, completely ready to go */
	if (mount_list_add(mp) != 0) {
		/*
		 * The system is shutting down trying to umount
		 * everything, so fail with a plausible errno.
		 */
		error = EBUSY;
		goto out3;
	}

	mount_end_update(mp);
	vnode_put(rvp);
	zfree(ZV_NAMEI, old_mntonname);

	vfs_notify_mount(pvp);

	return 0;
out3:
	strlcpy(mp->mnt_vfsstat.f_mntonname, old_mntonname, MAXPATHLEN);

	mount_lock(mp);
	mp->mnt_kern_flag &= ~(MNTK_HAS_MOVED);
	mount_unlock(mp);

out2:
	/*
	 * Placing the mp on the vnode clears VMOUNT,
	 * so cleanup is different after that point
	 */
	if (placed) {
		/* Rele the vp, clear VMOUNT and v_mountedhere */
		undo_place_on_covered_vp(mp, vp);
	} else {
		vnode_lock_spin(vp);
		CLR(vp->v_flag, VMOUNT);
		vnode_unlock(vp);
	}
out1:
	mount_end_update(mp);

out0:
	vnode_put(rvp);
	zfree(ZV_NAMEI, old_mntonname);
	return error;
}
2392
2393 #endif /* CONFIG_IMGSRC_ACCESS */
2394
/*
 * Enable disk quotas on an HFS mount, if the per-type quota trigger file
 * ("<mnt>/QUOTAOPSNAME.<ext>") exists.  Errors are deliberately ignored so
 * quota problems never interfere with completing the mount.
 */
void
enablequotas(struct mount *mp, vfs_context_t ctx)
{
	struct nameidata qnd;
	int type;
	char qfpath[MAXPATHLEN];
	const char *qfname = QUOTAFILENAME;
	const char *qfopsname = QUOTAOPSNAME;
	const char *qfextension[] = INITQFNAMES;

	/* XXX Should be an MNTK_ flag, instead of strncmp()'s */
	if (strncmp(mp->mnt_vfsstat.f_fstypename, "hfs", sizeof("hfs")) != 0) {
		return;
	}
	/*
	 * Enable filesystem disk quotas if necessary.
	 * We ignore errors as this should not interfere with final mount
	 */
	for (type = 0; type < MAXQUOTAS; type++) {
		snprintf(qfpath, sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfopsname, qfextension[type]);
		NDINIT(&qnd, LOOKUP, OP_MOUNT, FOLLOW, UIO_SYSSPACE,
		    CAST_USER_ADDR_T(qfpath), ctx);
		if (namei(&qnd) != 0) {
			continue;           /* option file to trigger quotas is not present */
		}
		/* Trigger file exists; we only needed the lookup, not the vnode. */
		vnode_put(qnd.ni_vp);
		nameidone(&qnd);
		snprintf(qfpath, sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfname, qfextension[type]);

		(void) VFS_QUOTACTL(mp, QCMD(Q_QUOTAON, type), 0, qfpath, ctx);
	}
	return;
}
2428
2429
/*
 * Per-process callback for checkdirs(): if p's current or root directory
 * is the just-covered vnode (olddp), repoint it at the new mount's root
 * (newdp), moving the usecount references accordingly.
 *
 * Two references on newdp are taken up front (one per directory that might
 * be replaced); whichever ends up unused is released at the end, as is the
 * reference previously held on each replaced olddp.
 */
static int
checkdirs_callback(proc_t p, void * arg)
{
	struct cdirargs *cdrp = (struct cdirargs *)arg;
	vnode_t olddp = cdrp->olddp;
	vnode_t newdp = cdrp->newdp;
	struct filedesc *fdp = &p->p_fd;
	vnode_t new_cvp = newdp;	/* non-NULL => this newdp ref was NOT consumed */
	vnode_t new_rvp = newdp;
	vnode_t old_cvp = NULL;		/* non-NULL => this olddp ref must be dropped */
	vnode_t old_rvp = NULL;

	/*
	 * XXX Also needs to iterate each thread in the process to see if it
	 * XXX is using a per-thread current working directory, and, if so,
	 * XXX update that as well.
	 */

	/*
	 * First, with the proc_fdlock held, check to see if we will need
	 * to do any work. If not, we will get out fast.
	 */
	proc_fdlock(p);
	if (fdp->fd_cdir != olddp && fdp->fd_rdir != olddp) {
		proc_fdunlock(p);
		return PROC_RETURNED;
	}
	proc_fdunlock(p);

	/*
	 * Ok, we will have to do some work. Always take two refs
	 * because we might need that many. We'll dispose of whatever
	 * we ended up not using.
	 */
	if (vnode_ref(newdp) != 0) {
		return PROC_RETURNED;
	}
	if (vnode_ref(newdp) != 0) {
		vnode_rele(newdp);
		return PROC_RETURNED;
	}

	proc_dirs_lock_exclusive(p);
	/*
	 * Now do the work. Note: we dropped the proc_fdlock, so we
	 * have to do all of the checks again.
	 */
	proc_fdlock(p);
	if (fdp->fd_cdir == olddp) {
		old_cvp = olddp;
		fdp->fd_cdir = newdp;
		new_cvp = NULL;
	}
	if (fdp->fd_rdir == olddp) {
		old_rvp = olddp;
		fdp->fd_rdir = newdp;
		new_rvp = NULL;
	}
	proc_fdunlock(p);
	proc_dirs_unlock_exclusive(p);

	/*
	 * Dispose of any references that are no longer needed.
	 */
	if (old_cvp != NULL) {
		vnode_rele(old_cvp);
	}
	if (old_rvp != NULL) {
		vnode_rele(old_rvp);
	}
	if (new_cvp != NULL) {
		vnode_rele(new_cvp);
	}
	if (new_rvp != NULL) {
		vnode_rele(new_rvp);
	}

	return PROC_RETURNED;
}
2509
2510
2511
2512 /*
2513 * Scan all active processes to see if any of them have a current
2514 * or root directory onto which the new filesystem has just been
2515 * mounted. If so, replace them with the new mount point.
2516 */
2517 static int
checkdirs(vnode_t olddp,vfs_context_t ctx)2518 checkdirs(vnode_t olddp, vfs_context_t ctx)
2519 {
2520 vnode_t newdp;
2521 vnode_t tvp;
2522 int err;
2523 struct cdirargs cdr;
2524
2525 if (olddp->v_usecount == 1) {
2526 return 0;
2527 }
2528 err = VFS_ROOT(olddp->v_mountedhere, &newdp, ctx);
2529
2530 if (err != 0) {
2531 #if DIAGNOSTIC
2532 panic("mount: lost mount: error %d", err);
2533 #endif
2534 return err;
2535 }
2536
2537 cdr.olddp = olddp;
2538 cdr.newdp = newdp;
2539 /* do not block for exec/fork trans as the vp in cwd & rootdir are not changing */
2540 proc_iterate(PROC_ALLPROCLIST | PROC_NOWAITTRANS, checkdirs_callback, (void *)&cdr, NULL, NULL);
2541
2542 if (rootvnode == olddp) {
2543 vnode_ref(newdp);
2544 lck_rw_lock_exclusive(&rootvnode_rw_lock);
2545 tvp = rootvnode;
2546 rootvnode = newdp;
2547 lck_rw_unlock_exclusive(&rootvnode_rw_lock);
2548 vnode_rele(tvp);
2549 }
2550
2551 vnode_put(newdp);
2552 return 0;
2553 }
2554
2555 #define ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT \
2556 "com.apple.private.vfs.role-account-unmount"
2557
2558 /*
2559 * Unmount a file system.
2560 *
2561 * Note: unmount takes a path to the vnode mounted on as argument,
2562 * not special file (as before).
2563 */
2564 /* ARGSUSED */
2565 int
unmount(__unused proc_t p,struct unmount_args * uap,__unused int32_t * retval)2566 unmount(__unused proc_t p, struct unmount_args *uap, __unused int32_t *retval)
2567 {
2568 vnode_t vp;
2569 struct mount *mp;
2570 int error;
2571 struct nameidata nd;
2572 vfs_context_t ctx;
2573
2574 /*
2575 * If the process has the entitlement, use the kernel's context when
2576 * performing lookup on the mount path as the process might lack proper
2577 * permission to access the directory.
2578 */
2579 ctx = IOCurrentTaskHasEntitlement(ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT) ?
2580 vfs_context_kernel() : vfs_context_current();
2581
2582 NDINIT(&nd, LOOKUP, OP_UNMOUNT, FOLLOW | AUDITVNPATH1,
2583 UIO_USERSPACE, uap->path, ctx);
2584 error = namei(&nd);
2585 if (error) {
2586 return error;
2587 }
2588 vp = nd.ni_vp;
2589 mp = vp->v_mount;
2590 nameidone(&nd);
2591
2592 /*
2593 * Must be the root of the filesystem
2594 */
2595 if ((vp->v_flag & VROOT) == 0) {
2596 vnode_put(vp);
2597 return EINVAL;
2598 }
2599 #if CONFIG_MACF
2600 error = mac_mount_check_umount(ctx, mp);
2601 if (error != 0) {
2602 vnode_put(vp);
2603 return error;
2604 }
2605 #endif
2606 mount_ref(mp, 0);
2607 vnode_put(vp);
2608 /* safedounmount consumes the mount ref */
2609 return safedounmount(mp, uap->flags, ctx);
2610 }
2611
2612 int
vfs_unmountbyfsid(fsid_t * fsid,int flags,vfs_context_t ctx)2613 vfs_unmountbyfsid(fsid_t *fsid, int flags, vfs_context_t ctx)
2614 {
2615 mount_t mp;
2616
2617 mp = mount_list_lookupby_fsid(fsid, 0, 1);
2618 if (mp == (mount_t)0) {
2619 return ENOENT;
2620 }
2621 mount_ref(mp, 0);
2622 mount_iterdrop(mp);
2623 /* safedounmount consumes the mount ref */
2624 return safedounmount(mp, flags, ctx);
2625 }
2626
2627 /*
2628 * The mount struct comes with a mount ref which will be consumed.
2629 * Do the actual file system unmount, prevent some common foot shooting.
2630 */
2631 int
safedounmount(struct mount * mp,int flags,vfs_context_t ctx)2632 safedounmount(struct mount *mp, int flags, vfs_context_t ctx)
2633 {
2634 int error;
2635 proc_t p = vfs_context_proc(ctx);
2636
2637 /*
2638 * If the file system is not responding and MNT_NOBLOCK
2639 * is set and not a forced unmount then return EBUSY.
2640 */
2641 if ((mp->mnt_kern_flag & MNT_LNOTRESP) &&
2642 (flags & MNT_NOBLOCK) && ((flags & MNT_FORCE) == 0)) {
2643 error = EBUSY;
2644 goto out;
2645 }
2646
2647 /*
2648 * Skip authorization in two cases:
2649 * - If the process running the unmount has ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT.
2650 * This entitlement allows non-root processes unmount volumes mounted by
2651 * other processes.
2652 * - If the mount is tagged as permissive and this is not a forced-unmount
2653 * attempt.
2654 */
2655 if (!IOCurrentTaskHasEntitlement(ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT) &&
2656 (!(((mp->mnt_kern_flag & MNTK_PERMIT_UNMOUNT) != 0) && ((flags & MNT_FORCE) == 0)))) {
2657 /*
2658 * Only root, or the user that did the original mount is
2659 * permitted to unmount this filesystem.
2660 */
2661 if ((mp->mnt_vfsstat.f_owner != kauth_cred_getuid(kauth_cred_get())) &&
2662 (error = suser(kauth_cred_get(), &p->p_acflag))) {
2663 goto out;
2664 }
2665 }
2666 /*
2667 * Don't allow unmounting the root file system, or other volumes
2668 * associated with it (for example, the associated VM or DATA mounts) .
2669 */
2670 if ((mp->mnt_flag & MNT_ROOTFS) || (mp->mnt_kern_flag & MNTK_SYSTEM)) {
2671 if (!(mp->mnt_flag & MNT_ROOTFS)) {
2672 printf("attempt to unmount a system mount (%s), will return EBUSY\n",
2673 mp->mnt_vfsstat.f_mntonname);
2674 }
2675 error = EBUSY; /* the root (or associated volumes) is always busy */
2676 goto out;
2677 }
2678
2679 /*
2680 * If the mount is providing the root filesystem's disk image
2681 * (i.e. imageboot), don't allow unmounting
2682 */
2683 if (mp->mnt_kern_flag & MNTK_BACKS_ROOT) {
2684 error = EBUSY;
2685 goto out;
2686 }
2687
2688 return dounmount(mp, flags, 1, ctx);
2689
2690 out:
2691 mount_drop(mp, 0);
2692 return error;
2693 }
2694
2695 /*
2696 * Do the actual file system unmount.
2697 */
2698 int
dounmount(struct mount * mp,int flags,int withref,vfs_context_t ctx)2699 dounmount(struct mount *mp, int flags, int withref, vfs_context_t ctx)
2700 {
2701 vnode_t coveredvp = (vnode_t)0;
2702 int error;
2703 int needwakeup = 0;
2704 int forcedunmount = 0;
2705 int lflags = 0;
2706 struct vnode *devvp = NULLVP;
2707 #if CONFIG_TRIGGERS
2708 proc_t p = vfs_context_proc(ctx);
2709 int did_vflush = 0;
2710 int pflags_save = 0;
2711 #endif /* CONFIG_TRIGGERS */
2712
2713 #if CONFIG_FSE
2714 if (!(flags & MNT_FORCE)) {
2715 fsevent_unmount(mp, ctx); /* has to come first! */
2716 }
2717 #endif
2718
2719 mount_lock(mp);
2720
2721 /*
2722 * If already an unmount in progress just return EBUSY.
2723 * Even a forced unmount cannot override.
2724 */
2725 if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
2726 if (withref != 0) {
2727 mount_drop(mp, 1);
2728 }
2729 mount_unlock(mp);
2730 return EBUSY;
2731 }
2732
2733 if (flags & MNT_FORCE) {
2734 forcedunmount = 1;
2735 mp->mnt_lflag |= MNT_LFORCE;
2736 }
2737
2738 #if CONFIG_TRIGGERS
2739 if (flags & MNT_NOBLOCK && p != kernproc) {
2740 pflags_save = OSBitOrAtomic(P_NOREMOTEHANG, &p->p_flag);
2741 }
2742 #endif
2743
2744 mp->mnt_kern_flag |= MNTK_UNMOUNT;
2745 mp->mnt_lflag |= MNT_LUNMOUNT;
2746 mp->mnt_flag &= ~MNT_ASYNC;
2747 /*
2748 * anyone currently in the fast path that
2749 * trips over the cached rootvp will be
2750 * dumped out and forced into the slow path
2751 * to regenerate a new cached value
2752 */
2753 mp->mnt_realrootvp = NULLVP;
2754 mount_unlock(mp);
2755
2756 if (forcedunmount && (flags & MNT_LNOSUB) == 0) {
2757 /*
2758 * Force unmount any mounts in this filesystem.
2759 * If any unmounts fail - just leave them dangling.
2760 * Avoids recursion.
2761 */
2762 (void) dounmount_submounts(mp, flags | MNT_LNOSUB, ctx);
2763 }
2764
2765 /*
2766 * taking the name_cache_lock exclusively will
2767 * insure that everyone is out of the fast path who
2768 * might be trying to use a now stale copy of
2769 * vp->v_mountedhere->mnt_realrootvp
2770 * bumping mount_generation causes the cached values
2771 * to be invalidated
2772 */
2773 name_cache_lock();
2774 mount_generation++;
2775 name_cache_unlock();
2776
2777
2778 lck_rw_lock_exclusive(&mp->mnt_rwlock);
2779 if (withref != 0) {
2780 mount_drop(mp, 0);
2781 }
2782 error = 0;
2783 if (forcedunmount == 0) {
2784 ubc_umount(mp); /* release cached vnodes */
2785 if ((mp->mnt_flag & MNT_RDONLY) == 0) {
2786 error = VFS_SYNC(mp, MNT_WAIT, ctx);
2787 if (error) {
2788 mount_lock(mp);
2789 mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
2790 mp->mnt_lflag &= ~MNT_LUNMOUNT;
2791 mp->mnt_lflag &= ~MNT_LFORCE;
2792 goto out;
2793 }
2794 }
2795 }
2796
2797 IOBSDMountChange(mp, kIOMountChangeUnmount);
2798
2799 #if CONFIG_TRIGGERS
2800 vfs_nested_trigger_unmounts(mp, flags, ctx);
2801 did_vflush = 1;
2802 #endif
2803 if (forcedunmount) {
2804 lflags |= FORCECLOSE;
2805 }
2806 error = vflush(mp, NULLVP, SKIPSWAP | SKIPSYSTEM | SKIPROOT | lflags);
2807 if ((forcedunmount == 0) && error) {
2808 mount_lock(mp);
2809 mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
2810 mp->mnt_lflag &= ~MNT_LUNMOUNT;
2811 mp->mnt_lflag &= ~MNT_LFORCE;
2812 goto out;
2813 }
2814
2815 /* make sure there are no one in the mount iterations or lookup */
2816 mount_iterdrain(mp);
2817
2818 error = VFS_UNMOUNT(mp, flags, ctx);
2819 if (error) {
2820 mount_iterreset(mp);
2821 mount_lock(mp);
2822 mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
2823 mp->mnt_lflag &= ~MNT_LUNMOUNT;
2824 mp->mnt_lflag &= ~MNT_LFORCE;
2825 goto out;
2826 }
2827
2828 /* increment the operations count */
2829 if (!error) {
2830 OSAddAtomic(1, &vfs_nummntops);
2831 }
2832
2833 if (mp->mnt_devvp && mp->mnt_vtable->vfc_vfsflags & VFC_VFSLOCALARGS) {
2834 /* hold an io reference and drop the usecount before close */
2835 devvp = mp->mnt_devvp;
2836 vnode_getalways(devvp);
2837 vnode_rele(devvp);
2838 VNOP_CLOSE(devvp, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD | FWRITE,
2839 ctx);
2840 vnode_clearmountedon(devvp);
2841 vnode_put(devvp);
2842 }
2843 lck_rw_done(&mp->mnt_rwlock);
2844 mount_list_remove(mp);
2845 lck_rw_lock_exclusive(&mp->mnt_rwlock);
2846
2847 /* mark the mount point hook in the vp but not drop the ref yet */
2848 if ((coveredvp = mp->mnt_vnodecovered) != NULLVP) {
2849 /*
2850 * The covered vnode needs special handling. Trying to get an
2851 * iocount must not block here as this may lead to deadlocks
2852 * if the Filesystem to which the covered vnode belongs is
2853 * undergoing forced unmounts. Since we hold a usecount, the
2854 * vnode cannot be reused (it can, however, still be terminated)
2855 */
2856 vnode_getalways(coveredvp);
2857 vnode_lock_spin(coveredvp);
2858
2859 mp->mnt_crossref++;
2860 coveredvp->v_mountedhere = (struct mount *)0;
2861 CLR(coveredvp->v_flag, VMOUNT | VMOUNTEDHERE);
2862 vnode_unlock(coveredvp);
2863 vnode_put(coveredvp);
2864 }
2865
2866 mount_list_lock();
2867 mp->mnt_vtable->vfc_refcount--;
2868 mount_list_unlock();
2869
2870 cache_purgevfs(mp); /* remove cache entries for this file sys */
2871 vfs_event_signal(NULL, VQ_UNMOUNT, (intptr_t)NULL);
2872 mount_lock(mp);
2873 mp->mnt_lflag |= MNT_LDEAD;
2874
2875 if (mp->mnt_lflag & MNT_LWAIT) {
2876 /*
2877 * do the wakeup here
2878 * in case we block in mount_refdrain
2879 * which will drop the mount lock
2880 * and allow anyone blocked in vfs_busy
2881 * to wakeup and see the LDEAD state
2882 */
2883 mp->mnt_lflag &= ~MNT_LWAIT;
2884 wakeup((caddr_t)mp);
2885 }
2886 mount_refdrain(mp);
2887
2888 /* free disk_conditioner_info structure for this mount */
2889 disk_conditioner_unmount(mp);
2890
2891 out:
2892 if (mp->mnt_lflag & MNT_LWAIT) {
2893 mp->mnt_lflag &= ~MNT_LWAIT;
2894 needwakeup = 1;
2895 }
2896
2897 #if CONFIG_TRIGGERS
2898 if (flags & MNT_NOBLOCK && p != kernproc) {
2899 // Restore P_NOREMOTEHANG bit to its previous value
2900 if ((pflags_save & P_NOREMOTEHANG) == 0) {
2901 OSBitAndAtomic(~((uint32_t) P_NOREMOTEHANG), &p->p_flag);
2902 }
2903 }
2904
2905 /*
2906 * Callback and context are set together under the mount lock, and
2907 * never cleared, so we're safe to examine them here, drop the lock,
2908 * and call out.
2909 */
2910 if (mp->mnt_triggercallback != NULL) {
2911 mount_unlock(mp);
2912 if (error == 0) {
2913 mp->mnt_triggercallback(mp, VTC_RELEASE, mp->mnt_triggerdata, ctx);
2914 } else if (did_vflush) {
2915 mp->mnt_triggercallback(mp, VTC_REPLACE, mp->mnt_triggerdata, ctx);
2916 }
2917 } else {
2918 mount_unlock(mp);
2919 }
2920 #else
2921 mount_unlock(mp);
2922 #endif /* CONFIG_TRIGGERS */
2923
2924 lck_rw_done(&mp->mnt_rwlock);
2925
2926 if (needwakeup) {
2927 wakeup((caddr_t)mp);
2928 }
2929
2930 if (!error) {
2931 if ((coveredvp != NULLVP)) {
2932 vnode_t pvp = NULLVP;
2933
2934 /*
2935 * The covered vnode needs special handling. Trying to
2936 * get an iocount must not block here as this may lead
2937 * to deadlocks if the Filesystem to which the covered
2938 * vnode belongs is undergoing forced unmounts. Since we
2939 * hold a usecount, the vnode cannot be reused
2940 * (it can, however, still be terminated).
2941 */
2942 vnode_getalways(coveredvp);
2943
2944 mount_dropcrossref(mp, coveredvp, 0);
2945 /*
2946 * We'll _try_ to detect if this really needs to be
2947 * done. The coveredvp can only be in termination (or
2948 * terminated) if the coveredvp's mount point is in a
2949 * forced unmount (or has been) since we still hold the
2950 * ref.
2951 */
2952 if (!vnode_isrecycled(coveredvp)) {
2953 pvp = vnode_getparent(coveredvp);
2954 #if CONFIG_TRIGGERS
2955 if (coveredvp->v_resolve) {
2956 vnode_trigger_rearm(coveredvp, ctx);
2957 }
2958 #endif
2959 }
2960
2961 vnode_rele(coveredvp);
2962 vnode_put(coveredvp);
2963 coveredvp = NULLVP;
2964
2965 if (pvp) {
2966 lock_vnode_and_post(pvp, NOTE_WRITE);
2967 vnode_put(pvp);
2968 }
2969 } else if (mp->mnt_flag & MNT_ROOTFS) {
2970 if (nc_smr_enabled) {
2971 vfs_smr_synchronize();
2972 }
2973
2974 mount_lock_destroy(mp);
2975 #if CONFIG_MACF
2976 mac_mount_label_destroy(mp);
2977 #endif
2978 zfree(mount_zone, mp);
2979 } else {
2980 panic("dounmount: no coveredvp");
2981 }
2982 }
2983 return error;
2984 }
2985
2986 /*
2987 * Unmount any mounts in this filesystem.
2988 */
2989 void
dounmount_submounts(struct mount * mp,int flags,vfs_context_t ctx)2990 dounmount_submounts(struct mount *mp, int flags, vfs_context_t ctx)
2991 {
2992 mount_t smp;
2993 fsid_t *fsids, fsid;
2994 int fsids_sz;
2995 int count = 0, i, m = 0;
2996 vnode_t vp;
2997
2998 mount_list_lock();
2999
3000 // Get an array to hold the submounts fsids.
3001 TAILQ_FOREACH(smp, &mountlist, mnt_list)
3002 count++;
3003 fsids_sz = count * sizeof(fsid_t);
3004 fsids = kalloc_data(fsids_sz, Z_NOWAIT);
3005 if (fsids == NULL) {
3006 mount_list_unlock();
3007 goto out;
3008 }
3009 fsids[0] = mp->mnt_vfsstat.f_fsid; // Prime the pump
3010
3011 /*
3012 * Fill the array with submount fsids.
3013 * Since mounts are always added to the tail of the mount list, the
3014 * list is always in mount order.
3015 * For each mount check if the mounted-on vnode belongs to a
3016 * mount that's already added to our array of mounts to be unmounted.
3017 */
3018 for (smp = TAILQ_NEXT(mp, mnt_list); smp; smp = TAILQ_NEXT(smp, mnt_list)) {
3019 vp = smp->mnt_vnodecovered;
3020 if (vp == NULL) {
3021 continue;
3022 }
3023 fsid = vnode_mount(vp)->mnt_vfsstat.f_fsid; // Underlying fsid
3024 for (i = 0; i <= m; i++) {
3025 if (fsids[i].val[0] == fsid.val[0] &&
3026 fsids[i].val[1] == fsid.val[1]) {
3027 fsids[++m] = smp->mnt_vfsstat.f_fsid;
3028 break;
3029 }
3030 }
3031 }
3032 mount_list_unlock();
3033
3034 // Unmount the submounts in reverse order. Ignore errors.
3035 for (i = m; i > 0; i--) {
3036 smp = mount_list_lookupby_fsid(&fsids[i], 0, 1);
3037 if (smp) {
3038 mount_ref(smp, 0);
3039 mount_iterdrop(smp);
3040 (void) dounmount(smp, flags, 1, ctx);
3041 }
3042 }
3043 out:
3044 kfree_data(fsids, fsids_sz);
3045 }
3046
/*
 * Drop one crossref on mp taken against the covered vnode dp.  If this was
 * the last crossref and dp no longer points at mp via v_mountedhere, the
 * mount structure itself is destroyed and freed.  When need_put is set,
 * dp's iocount is also released (dp is locked at that point).
 */
void
mount_dropcrossref(mount_t mp, vnode_t dp, int need_put)
{
	vnode_hold(dp);
	vnode_lock(dp);
	mp->mnt_crossref--;

	if (mp->mnt_crossref < 0) {
		panic("mount cross refs -ve");
	}

	/* Last crossref gone and dp detached from mp: free the mount. */
	if ((mp != dp->v_mountedhere) && (mp->mnt_crossref == 0)) {
		if (need_put) {
			vnode_put_locked(dp);
		}
		vnode_drop_and_unlock(dp);

		if (nc_smr_enabled) {
			vfs_smr_synchronize();
		}

		mount_lock_destroy(mp);
#if CONFIG_MACF
		mac_mount_label_destroy(mp);
#endif
		zfree(mount_zone, mp);
		return;
	}
	if (need_put) {
		vnode_put_locked(dp);
	}
	vnode_drop_and_unlock(dp);
}
3080
3081
3082 /*
3083 * Sync each mounted filesystem.
3084 */
3085 #if DIAGNOSTIC
3086 int syncprt = 0;
3087 #endif
3088
3089 int print_vmpage_stat = 0;
3090
3091 /*
3092 * sync_callback: simple wrapper that calls VFS_SYNC() on volumes
3093 * mounted read-write with the passed waitfor value.
3094 *
3095 * Parameters: mp mount-point descriptor per mounted file-system instance.
3096 * arg user argument (please see below)
3097 *
3098 * User argument is a pointer to 32 bit unsigned integer which describes the
3099 * type of waitfor value to set for calling VFS_SYNC(). If user argument is
3100 * passed as NULL, VFS_SYNC() is called with MNT_NOWAIT set as the default
3101 * waitfor value.
3102 *
3103 * Returns: VFS_RETURNED
3104 */
3105 static int
sync_callback(mount_t mp,void * arg)3106 sync_callback(mount_t mp, void *arg)
3107 {
3108 if ((mp->mnt_flag & MNT_RDONLY) == 0) {
3109 int asyncflag = mp->mnt_flag & MNT_ASYNC;
3110 unsigned waitfor = MNT_NOWAIT;
3111
3112 if (arg) {
3113 waitfor = *(uint32_t*)arg;
3114 }
3115
3116 /* Sanity check for flags - these are the only valid combinations for the flag bits*/
3117 if (waitfor != MNT_WAIT &&
3118 waitfor != (MNT_WAIT | MNT_VOLUME) &&
3119 waitfor != MNT_NOWAIT &&
3120 waitfor != (MNT_NOWAIT | MNT_VOLUME) &&
3121 waitfor != MNT_DWAIT &&
3122 waitfor != (MNT_DWAIT | MNT_VOLUME)) {
3123 panic("Passed inappropriate waitfor %u to "
3124 "sync_callback()", waitfor);
3125 }
3126
3127 mp->mnt_flag &= ~MNT_ASYNC;
3128 (void)VFS_SYNC(mp, waitfor, vfs_context_kernel());
3129 if (asyncflag) {
3130 mp->mnt_flag |= MNT_ASYNC;
3131 }
3132 }
3133
3134 return VFS_RETURNED;
3135 }
3136
/*
 * sync() system call: ask every mounted read-write filesystem to flush
 * dirty data, without waiting (sync_callback with NULL arg => MNT_NOWAIT).
 * Always returns 0.
 */
/* ARGSUSED */
int
sync(__unused proc_t p, __unused struct sync_args *uap, __unused int32_t *retval)
{
	vfs_iterate(LK_NOWAIT, sync_callback, NULL);

	if (print_vmpage_stat) {
		vm_countdirtypages();
	}

#if DIAGNOSTIC
	if (syncprt) {
		vfs_bufstats();
	}
#endif /* DIAGNOSTIC */
	return 0;
}
3154
/*
 * Media selection for sync_internal_callback(): lets the sync thread flush
 * reliable media (local, non-virtual devices) separately from unreliable
 * media.  SYNC_ALL applies no filter.
 */
typedef enum {
	SYNC_ALL = 0,
	SYNC_ONLY_RELIABLE_MEDIA = 1,
	SYNC_ONLY_UNRELIABLE_MEDIA = 2
} sync_type_t;
3160
3161 static int
sync_internal_callback(mount_t mp,void * arg)3162 sync_internal_callback(mount_t mp, void *arg)
3163 {
3164 if (arg) {
3165 int is_reliable = !(mp->mnt_kern_flag & MNTK_VIRTUALDEV) &&
3166 (mp->mnt_flag & MNT_LOCAL);
3167 sync_type_t sync_type = *((sync_type_t *)arg);
3168
3169 if ((sync_type == SYNC_ONLY_RELIABLE_MEDIA) && !is_reliable) {
3170 return VFS_RETURNED;
3171 } else if ((sync_type == SYNC_ONLY_UNRELIABLE_MEDIA) && is_reliable) {
3172 return VFS_RETURNED;
3173 }
3174 }
3175
3176 (void)sync_callback(mp, NULL);
3177
3178 return VFS_RETURNED;
3179 }
3180
int sync_thread_state = 0;	/* SYNC_THREAD_* bits below; protected by sync_mtx_lck */
int sync_timeout_seconds = 5;	/* upper bound on how long sync_internal() waits */

#define SYNC_THREAD_RUN 0x0001		/* another sync pass has been requested */
#define SYNC_THREAD_RUNNING 0x0002	/* a sync_thread instance is alive */

#if CONFIG_PHYS_WRITE_ACCT
thread_t pm_sync_thread;	/* identity of the running sync thread */
#endif /* CONFIG_PHYS_WRITE_ACCT */
3190
/*
 * Worker thread behind sync_internal(): while SYNC_THREAD_RUN is set
 * (re-armed by concurrent requests), sync reliable media first, then
 * unreliable media.  Wakes any sync_internal() waiters and clears
 * SYNC_THREAD_RUNNING before exiting.  State transitions are protected by
 * sync_mtx_lck.
 */
static void
sync_thread(__unused void *arg, __unused wait_result_t wr)
{
	sync_type_t sync_type;
#if CONFIG_PHYS_WRITE_ACCT
	pm_sync_thread = current_thread();
#endif /* CONFIG_PHYS_WRITE_ACCT */

	lck_mtx_lock(&sync_mtx_lck);
	while (sync_thread_state & SYNC_THREAD_RUN) {
		/* Consume the request; it may be re-set while we run unlocked. */
		sync_thread_state &= ~SYNC_THREAD_RUN;
		lck_mtx_unlock(&sync_mtx_lck);

		sync_type = SYNC_ONLY_RELIABLE_MEDIA;
		vfs_iterate(LK_NOWAIT, sync_internal_callback, &sync_type);
		sync_type = SYNC_ONLY_UNRELIABLE_MEDIA;
		vfs_iterate(LK_NOWAIT, sync_internal_callback, &sync_type);

		lck_mtx_lock(&sync_mtx_lck);
	}
	/*
	 * This wakeup _has_ to be issued before the lock is released otherwise
	 * we may end up waking up a thread in sync_internal which is
	 * expecting a wakeup from a thread it just created and not from this
	 * thread which is about to exit.
	 */
	wakeup(&sync_thread_state);
	sync_thread_state &= ~SYNC_THREAD_RUNNING;
#if CONFIG_PHYS_WRITE_ACCT
	pm_sync_thread = NULL;
#endif /* CONFIG_PHYS_WRITE_ACCT */
	lck_mtx_unlock(&sync_mtx_lck);

	if (print_vmpage_stat) {
		vm_countdirtypages();
	}

#if DIAGNOSTIC
	if (syncprt) {
		vfs_bufstats();
	}
#endif /* DIAGNOSTIC */
}
3234
/* Timestamp of the last "sync timed out" message; rate-limits that log line. */
struct timeval sync_timeout_last_print = {.tv_sec = 0, .tv_usec = 0};
3236
3237 /*
3238 * An in-kernel sync for power management to call.
3239 * This function always returns within sync_timeout seconds.
3240 */
__private_extern__ int
sync_internal(void)
{
	thread_t thd = NULL;
	int error;
	int thread_created = FALSE;
	/* Upper bound on how long we block waiting for the worker. */
	struct timespec ts = {.tv_sec = sync_timeout_seconds, .tv_nsec = 0};

	lck_mtx_lock(&sync_mtx_lck);
	/* Post a request; spawn the worker only if one isn't already running. */
	sync_thread_state |= SYNC_THREAD_RUN;
	if (!(sync_thread_state & SYNC_THREAD_RUNNING)) {
		int kr;

		sync_thread_state |= SYNC_THREAD_RUNNING;
		kr = kernel_thread_start(sync_thread, NULL, &thd);
		if (kr != KERN_SUCCESS) {
			sync_thread_state &= ~SYNC_THREAD_RUNNING;
			lck_mtx_unlock(&sync_mtx_lck);
			printf("sync_thread failed\n");
			/* Best effort: the contract is "return within the timeout". */
			return 0;
		}
		thread_created = TRUE;
	}

	/*
	 * Sleep (dropping the mutex via PDROP) until the worker's wakeup,
	 * but never longer than sync_timeout_seconds.
	 */
	error = msleep((caddr_t)&sync_thread_state, &sync_mtx_lck,
	    (PVFS | PDROP | PCATCH), "sync_thread", &ts);
	if (error) {
		struct timeval now;

		/* Rate-limit the timeout message to once every 120 seconds. */
		microtime(&now);
		if (now.tv_sec - sync_timeout_last_print.tv_sec > 120) {
			printf("sync timed out: %d sec\n", sync_timeout_seconds);
			sync_timeout_last_print.tv_sec = now.tv_sec;
		}
	}

	if (thread_created) {
		/* Drop the thread reference returned by kernel_thread_start(). */
		thread_deallocate(thd);
	}

	return 0;
} /* end of sync_internal call */
3283
3284 /*
3285 * Change filesystem quotas.
3286 */
3287 #if QUOTA
3288 int
quotactl(proc_t p,struct quotactl_args * uap,__unused int32_t * retval)3289 quotactl(proc_t p, struct quotactl_args *uap, __unused int32_t *retval)
3290 {
3291 struct mount *mp;
3292 int error, quota_cmd, quota_status = 0;
3293 caddr_t datap;
3294 size_t fnamelen;
3295 struct nameidata nd;
3296 vfs_context_t ctx = vfs_context_current();
3297 struct dqblk my_dqblk = {};
3298
3299 AUDIT_ARG(uid, uap->uid);
3300 AUDIT_ARG(cmd, uap->cmd);
3301 NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
3302 uap->path, ctx);
3303 error = namei(&nd);
3304 if (error) {
3305 return error;
3306 }
3307 mp = nd.ni_vp->v_mount;
3308 mount_ref(mp, 0);
3309 vnode_put(nd.ni_vp);
3310 nameidone(&nd);
3311
3312 #if CONFIG_MACF
3313 error = mac_mount_check_quotactl(ctx, mp, uap->cmd, uap->uid);
3314 if (error != 0) {
3315 goto out;
3316 }
3317 #endif
3318
3319 /* copyin any data we will need for downstream code */
3320 quota_cmd = uap->cmd >> SUBCMDSHIFT;
3321
3322 switch (quota_cmd) {
3323 case Q_QUOTAON:
3324 /* uap->arg specifies a file from which to take the quotas */
3325 fnamelen = MAXPATHLEN;
3326 datap = zalloc(ZV_NAMEI);
3327 error = copyinstr(uap->arg, datap, MAXPATHLEN, &fnamelen);
3328 break;
3329 case Q_GETQUOTA:
3330 /* uap->arg is a pointer to a dqblk structure. */
3331 datap = (caddr_t) &my_dqblk;
3332 break;
3333 case Q_SETQUOTA:
3334 case Q_SETUSE:
3335 /* uap->arg is a pointer to a dqblk structure. */
3336 datap = (caddr_t) &my_dqblk;
3337 if (proc_is64bit(p)) {
3338 struct user_dqblk my_dqblk64;
3339 error = copyin(uap->arg, (caddr_t)&my_dqblk64, sizeof(my_dqblk64));
3340 if (error == 0) {
3341 munge_dqblk(&my_dqblk, &my_dqblk64, FALSE);
3342 }
3343 } else {
3344 error = copyin(uap->arg, (caddr_t)&my_dqblk, sizeof(my_dqblk));
3345 }
3346 break;
3347 case Q_QUOTASTAT:
3348 /* uap->arg is a pointer to an integer */
3349 datap = (caddr_t) "a_status;
3350 break;
3351 default:
3352 datap = NULL;
3353 break;
3354 } /* switch */
3355
3356 if (error == 0) {
3357 error = VFS_QUOTACTL(mp, uap->cmd, uap->uid, datap, ctx);
3358 }
3359
3360 switch (quota_cmd) {
3361 case Q_QUOTAON:
3362 if (datap != NULL) {
3363 zfree(ZV_NAMEI, datap);
3364 }
3365 break;
3366 case Q_GETQUOTA:
3367 /* uap->arg is a pointer to a dqblk structure we need to copy out to */
3368 if (error == 0) {
3369 if (proc_is64bit(p)) {
3370 struct user_dqblk my_dqblk64;
3371
3372 memset(&my_dqblk64, 0, sizeof(my_dqblk64));
3373 munge_dqblk(&my_dqblk, &my_dqblk64, TRUE);
3374 error = copyout((caddr_t)&my_dqblk64, uap->arg, sizeof(my_dqblk64));
3375 } else {
3376 error = copyout(datap, uap->arg, sizeof(struct dqblk));
3377 }
3378 }
3379 break;
3380 case Q_QUOTASTAT:
3381 /* uap->arg is a pointer to an integer */
3382 if (error == 0) {
3383 error = copyout(datap, uap->arg, sizeof(quota_status));
3384 }
3385 break;
3386 default:
3387 break;
3388 } /* switch */
3389
3390 out:
3391 mount_drop(mp, 0);
3392 return error;
3393 }
3394 #else
int
quotactl(__unused proc_t p, __unused struct quotactl_args *uap, __unused int32_t *retval)
{
	/* Quota support compiled out: the syscall always fails. */
	return EOPNOTSUPP;
}
3400 #endif /* QUOTA */
3401
3402 static int
statfs_internal(proc_t p,struct mount * mp,user_addr_t bufp)3403 statfs_internal(proc_t p, struct mount *mp, user_addr_t bufp)
3404 {
3405 int error;
3406 vfs_context_t ctx = vfs_context_current();
3407
3408 #if CONFIG_MACF
3409 error = mac_mount_check_stat(ctx, mp);
3410 if (error != 0) {
3411 return error;
3412 }
3413 #endif
3414
3415 error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT);
3416 if (error != 0) {
3417 return error;
3418 }
3419
3420 return munge_statfs(mp, &mp->mnt_vfsstat, bufp, NULL, IS_64BIT_PROCESS(p), TRUE);
3421 }
3422
3423 /*
3424 * Get filesystem statistics.
3425 *
3426 * Returns: 0 Success
3427 * namei:???
3428 * vfs_update_vfsstat:???
3429 * munge_statfs:EFAULT
3430 */
3431 /* ARGSUSED */
3432 int
statfs(proc_t p,struct statfs_args * uap,__unused int32_t * retval)3433 statfs(proc_t p, struct statfs_args *uap, __unused int32_t *retval)
3434 {
3435 int error;
3436 struct mount *mp;
3437 struct nameidata nd;
3438 vfs_context_t ctx = vfs_context_current();
3439 vnode_t vp;
3440
3441 NDINIT(&nd, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
3442 UIO_USERSPACE, uap->path, ctx);
3443 error = namei(&nd);
3444 if (error != 0) {
3445 return error;
3446 }
3447 vp = nd.ni_vp;
3448 mp = vp->v_mount;
3449 nameidone(&nd);
3450
3451 error = statfs_internal(p, mp, uap->buf);
3452 vnode_put(vp);
3453
3454 return error;
3455 }
3456
3457 /*
3458 * Get filesystem statistics.
3459 */
3460 /* ARGSUSED */
int
fstatfs(proc_t p, struct fstatfs_args *uap, __unused int32_t *retval)
{
	int error;
	vnode_t vp = NULL;
	struct mount *mp;

	AUDIT_ARG(fd, uap->fd);

	/*
	 * file_vnode() takes an fd reference (dropped at "out").  If
	 * vnode_getwithref() fails we skip "out_vnode" because no iocount
	 * was obtained.
	 */
	if ((error = file_vnode(uap->fd, &vp)) ||
	    (error = vnode_getwithref(vp))) {
		goto out;
	}

	AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);

	mp = vp->v_mount;
	if (!mp) {
		error = EBADF;
		goto out_vnode;
	}

	error = statfs_internal(p, mp, uap->buf);

out_vnode:
	vnode_put(vp);

out:
	/* vp != NULL means file_vnode() succeeded, so the fd is still held. */
	if (vp != NULL) {
		file_drop(uap->fd);
	}

	return error;
}
3495
3496 void
vfs_get_statfs64(struct mount * mp,struct statfs64 * sfs)3497 vfs_get_statfs64(struct mount *mp, struct statfs64 *sfs)
3498 {
3499 struct vfsstatfs *vsfs = &mp->mnt_vfsstat;
3500
3501 bzero(sfs, sizeof(*sfs));
3502
3503 sfs->f_bsize = vsfs->f_bsize;
3504 sfs->f_iosize = (int32_t)vsfs->f_iosize;
3505 sfs->f_blocks = vsfs->f_blocks;
3506 sfs->f_bfree = vsfs->f_bfree;
3507 sfs->f_bavail = vsfs->f_bavail;
3508 sfs->f_files = vsfs->f_files;
3509 sfs->f_ffree = vsfs->f_ffree;
3510 sfs->f_fsid = vsfs->f_fsid;
3511 sfs->f_owner = vsfs->f_owner;
3512 sfs->f_type = mp->mnt_vtable->vfc_typenum;
3513 sfs->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
3514 sfs->f_fssubtype = vsfs->f_fssubtype;
3515 sfs->f_flags_ext = 0;
3516 if (mp->mnt_kern_flag & MNTK_SYSTEMDATA) {
3517 sfs->f_flags_ext |= MNT_EXT_ROOT_DATA_VOL;
3518 }
3519 if (mp->mnt_kern_flag & MNTK_FSKIT) {
3520 sfs->f_flags_ext |= MNT_EXT_FSKIT;
3521 }
3522 vfs_getfstypename(mp, sfs->f_fstypename, MFSTYPENAMELEN);
3523 strlcpy(&sfs->f_mntonname[0], &vsfs->f_mntonname[0], MAXPATHLEN);
3524 strlcpy(&sfs->f_mntfromname[0], &vsfs->f_mntfromname[0], MAXPATHLEN);
3525 }
3526
3527 /*
3528 * Get file system statistics in 64-bit mode
3529 */
3530 int
statfs64(__unused struct proc * p,struct statfs64_args * uap,__unused int32_t * retval)3531 statfs64(__unused struct proc *p, struct statfs64_args *uap, __unused int32_t *retval)
3532 {
3533 struct mount *mp;
3534 int error;
3535 struct nameidata *ndp;
3536 struct statfs64 *sfsp;
3537 vfs_context_t ctxp = vfs_context_current();
3538 vnode_t vp;
3539 struct {
3540 struct nameidata nd;
3541 struct statfs64 sfs;
3542 } *__nameidata_statfs64;
3543
3544 __nameidata_statfs64 = kalloc_type(typeof(*__nameidata_statfs64),
3545 Z_WAITOK);
3546 ndp = &__nameidata_statfs64->nd;
3547
3548 NDINIT(ndp, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
3549 UIO_USERSPACE, uap->path, ctxp);
3550 error = namei(ndp);
3551 if (error != 0) {
3552 goto out;
3553 }
3554 vp = ndp->ni_vp;
3555 mp = vp->v_mount;
3556 nameidone(ndp);
3557
3558 #if CONFIG_MACF
3559 error = mac_mount_check_stat(ctxp, mp);
3560 if (error != 0) {
3561 vnode_put(vp);
3562 goto out;
3563 }
3564 #endif
3565
3566 error = vfs_update_vfsstat(mp, ctxp, VFS_USER_EVENT);
3567 if (error != 0) {
3568 vnode_put(vp);
3569 goto out;
3570 }
3571
3572 sfsp = &__nameidata_statfs64->sfs;
3573 vfs_get_statfs64(mp, sfsp);
3574 if ((mp->mnt_kern_flag & MNTK_SYSTEMDATA) &&
3575 (p->p_vfs_iopolicy & P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME)) {
3576 /* This process does not want to see a seperate data volume mountpoint */
3577 strlcpy(&sfsp->f_mntonname[0], "/", sizeof("/"));
3578 }
3579 error = copyout(sfsp, uap->buf, sizeof(*sfsp));
3580 vnode_put(vp);
3581
3582 out:
3583 kfree_type(typeof(*__nameidata_statfs64), __nameidata_statfs64);
3584
3585 return error;
3586 }
3587
3588 /*
3589 * Get file system statistics in 64-bit mode
3590 */
3591 int
fstatfs64(__unused struct proc * p,struct fstatfs64_args * uap,__unused int32_t * retval)3592 fstatfs64(__unused struct proc *p, struct fstatfs64_args *uap, __unused int32_t *retval)
3593 {
3594 struct vnode *vp;
3595 struct mount *mp;
3596 struct statfs64 sfs;
3597 int error;
3598
3599 AUDIT_ARG(fd, uap->fd);
3600
3601 if ((error = file_vnode(uap->fd, &vp))) {
3602 return error;
3603 }
3604
3605 error = vnode_getwithref(vp);
3606 if (error) {
3607 file_drop(uap->fd);
3608 return error;
3609 }
3610
3611 AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
3612
3613 mp = vp->v_mount;
3614 if (!mp) {
3615 error = EBADF;
3616 goto out;
3617 }
3618
3619 #if CONFIG_MACF
3620 error = mac_mount_check_stat(vfs_context_current(), mp);
3621 if (error != 0) {
3622 goto out;
3623 }
3624 #endif
3625
3626 if ((error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)) != 0) {
3627 goto out;
3628 }
3629
3630 vfs_get_statfs64(mp, &sfs);
3631 if ((mp->mnt_kern_flag & MNTK_SYSTEMDATA) &&
3632 (p->p_vfs_iopolicy & P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME)) {
3633 /* This process does not want to see a seperate data volume mountpoint */
3634 strlcpy(&sfs.f_mntonname[0], "/", sizeof("/"));
3635 }
3636 error = copyout(&sfs, uap->buf, sizeof(sfs));
3637
3638 out:
3639 file_drop(uap->fd);
3640 vnode_put(vp);
3641
3642 return error;
3643 }
3644
/*
 * Iteration state shared between getfsstat()/getfsstat64() and their
 * per-mount vfs_iterate callbacks.
 */
struct getfsstat_struct {
	user_addr_t sfsp;       /* user buffer cursor; advanced per mount copied */
	user_addr_t *mp;        /* per-mount MAC label buffers, or NULL */
	int count;              /* mounts seen so far (may exceed maxcount) */
	int maxcount;           /* capacity of the user buffer, in entries */
	int flags;              /* MNT_NOWAIT / MNT_WAIT / MNT_DWAIT from caller */
	int error;              /* first error encountered, if any */
};
3653
3654
/*
 * Per-mount callback for __mac_getfsstat(): copy one mount's statfs
 * data (and optional MAC label) to the user buffer while room remains,
 * optionally refreshing the cached statistics first.  The mount count
 * keeps accumulating even after the buffer fills, so the caller can
 * report the total number of mounts.
 */
static int
getfsstat_callback(mount_t mp, void * arg)
{
	struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
	struct vfsstatfs *sp;
	int error, my_size;
	vfs_context_t ctx = vfs_context_current();

	if (fstp->sfsp && fstp->count < fstp->maxcount) {
#if CONFIG_MACF
		error = mac_mount_check_stat(ctx, mp);
		if (error != 0) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
#endif
		sp = &mp->mnt_vfsstat;
		/*
		 * If MNT_NOWAIT is specified, do not refresh the
		 * fsstat cache. MNT_WAIT/MNT_DWAIT overrides MNT_NOWAIT.
		 */
		if ((mp->mnt_lflag & MNT_LDEAD) ||
		    (((fstp->flags & MNT_NOWAIT) == 0 || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
		    (!(mp->mnt_lflag & MNT_LUNMOUNT)) &&
		    (error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT)))) {
			/* Skip dead mounts and mounts whose refresh failed. */
			KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
			return VFS_RETURNED;
		}

		/*
		 * Need to handle LP64 version of struct statfs
		 */
		error = munge_statfs(mp, sp, fstp->sfsp, &my_size, IS_64BIT_PROCESS(vfs_context_proc(ctx)), FALSE);
		if (error) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
		/* Advance by however many bytes munge_statfs() copied out. */
		fstp->sfsp += my_size;

		if (fstp->mp) {
#if CONFIG_MACF
			error = mac_mount_label_get(mp, *fstp->mp);
			if (error) {
				fstp->error = error;
				return VFS_RETURNED_DONE;
			}
#endif
			fstp->mp++;
		}
	}
	/* Count every mount, even once the user buffer is full. */
	fstp->count++;
	return VFS_RETURNED;
}
3708
3709 /*
3710 * Get statistics on all filesystems.
3711 */
3712 int
getfsstat(__unused proc_t p,struct getfsstat_args * uap,int * retval)3713 getfsstat(__unused proc_t p, struct getfsstat_args *uap, int *retval)
3714 {
3715 struct __mac_getfsstat_args muap;
3716
3717 muap.buf = uap->buf;
3718 muap.bufsize = uap->bufsize;
3719 muap.mac = USER_ADDR_NULL;
3720 muap.macsize = 0;
3721 muap.flags = uap->flags;
3722
3723 return __mac_getfsstat(p, &muap, retval);
3724 }
3725
3726 /*
3727 * __mac_getfsstat: Get MAC-related file system statistics
3728 *
3729 * Parameters: p (ignored)
3730 * uap User argument descriptor (see below)
3731 * retval Count of file system statistics (N stats)
3732 *
3733 * Indirect: uap->bufsize Buffer size
3734 * uap->macsize MAC info size
3735 * uap->buf Buffer where information will be returned
3736 * uap->mac MAC info
3737 * uap->flags File system flags
3738 *
3739 *
3740 * Returns: 0 Success
3741 * !0 Not success
3742 *
3743 */
int
__mac_getfsstat(__unused proc_t p, struct __mac_getfsstat_args *uap, int *retval)
{
	user_addr_t sfsp;
	user_addr_t *mp;
	size_t count, maxcount, bufsize, macsize;
	struct getfsstat_struct fst;

	/* Reject sizes that would overflow the int fields used below. */
	if ((unsigned)uap->bufsize > INT_MAX || (unsigned)uap->macsize > INT_MAX) {
		return EINVAL;
	}

	bufsize = (size_t) uap->bufsize;
	macsize = (size_t) uap->macsize;

	/* Buffer capacity in entries depends on the caller's statfs layout. */
	if (IS_64BIT_PROCESS(p)) {
		maxcount = bufsize / sizeof(struct user64_statfs);
	} else {
		maxcount = bufsize / sizeof(struct user32_statfs);
	}
	sfsp = uap->buf;
	count = 0;

	mp = NULL;

#if CONFIG_MACF
	if (uap->mac != USER_ADDR_NULL) {
		u_int32_t *mp0;
		int error;
		unsigned int i;

		/* One label pointer per statfs slot, sized by the user ABI. */
		count = (macsize / (IS_64BIT_PROCESS(p) ? 8 : 4));
		if (count != maxcount) {
			return EINVAL;
		}

		/* Copy in the array */
		mp0 = kalloc_data(macsize, Z_WAITOK);
		if (mp0 == NULL) {
			return ENOMEM;
		}

		error = copyin(uap->mac, mp0, macsize);
		if (error) {
			kfree_data(mp0, macsize);
			return error;
		}

		/* Normalize to an array of user_addr_t */
		mp = kalloc_data(count * sizeof(user_addr_t), Z_WAITOK);
		if (mp == NULL) {
			kfree_data(mp0, macsize);
			return ENOMEM;
		}

		for (i = 0; i < count; i++) {
			if (IS_64BIT_PROCESS(p)) {
				mp[i] = ((user_addr_t *)mp0)[i];
			} else {
				mp[i] = (user_addr_t)mp0[i];
			}
		}
		kfree_data(mp0, macsize);
	}
#endif


	fst.sfsp = sfsp;
	fst.mp = mp;
	fst.flags = uap->flags;
	fst.count = 0;
	fst.error = 0;
	fst.maxcount = (int)maxcount;


	vfs_iterate(VFS_ITERATE_NOSKIP_UNMOUNT, getfsstat_callback, &fst);

	if (mp) {
		kfree_data(mp, count * sizeof(user_addr_t));
	}

	if (fst.error) {
		KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
		return fst.error;
	}

	/* If the buffer filled, report its capacity; otherwise the total seen. */
	if (fst.sfsp && fst.count > fst.maxcount) {
		*retval = fst.maxcount;
	} else {
		*retval = fst.count;
	}
	return 0;
}
3837
/*
 * Per-mount callback for getfsstat64(): like getfsstat_callback(), but
 * emits fixed-size struct statfs64 records so no 32/64-bit munging is
 * needed.  (Note: `sp` is assigned but unused here; kept as-is.)
 */
static int
getfsstat64_callback(mount_t mp, void * arg)
{
	struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
	struct vfsstatfs *sp;
	struct statfs64 sfs;
	int error;

	if (fstp->sfsp && fstp->count < fstp->maxcount) {
#if CONFIG_MACF
		error = mac_mount_check_stat(vfs_context_current(), mp);
		if (error != 0) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
#endif
		sp = &mp->mnt_vfsstat;
		/*
		 * If MNT_NOWAIT is specified, do not refresh the fsstat
		 * cache. MNT_WAIT overrides MNT_NOWAIT.
		 *
		 * We treat MNT_DWAIT as MNT_WAIT for all instances of
		 * getfsstat, since the constants are out of the same
		 * namespace.
		 */
		if ((mp->mnt_lflag & MNT_LDEAD) ||
		    ((((fstp->flags & MNT_NOWAIT) == 0) || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
		    (!(mp->mnt_lflag & MNT_LUNMOUNT)) &&
		    (error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)))) {
			/* Skip dead mounts and mounts whose refresh failed. */
			KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
			return VFS_RETURNED;
		}

		vfs_get_statfs64(mp, &sfs);
		error = copyout(&sfs, fstp->sfsp, sizeof(sfs));
		if (error) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
		fstp->sfsp += sizeof(sfs);
	}
	/* Count every mount, even once the user buffer is full. */
	fstp->count++;
	return VFS_RETURNED;
}
3882
3883 /*
3884 * Get statistics on all file systems in 64 bit mode.
3885 */
3886 int
getfsstat64(__unused proc_t p,struct getfsstat64_args * uap,int * retval)3887 getfsstat64(__unused proc_t p, struct getfsstat64_args *uap, int *retval)
3888 {
3889 user_addr_t sfsp;
3890 int count, maxcount;
3891 struct getfsstat_struct fst;
3892
3893 maxcount = uap->bufsize / sizeof(struct statfs64);
3894
3895 sfsp = uap->buf;
3896 count = 0;
3897
3898 fst.sfsp = sfsp;
3899 fst.flags = uap->flags;
3900 fst.count = 0;
3901 fst.error = 0;
3902 fst.maxcount = maxcount;
3903
3904 vfs_iterate(VFS_ITERATE_NOSKIP_UNMOUNT, getfsstat64_callback, &fst);
3905
3906 if (fst.error) {
3907 KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
3908 return fst.error;
3909 }
3910
3911 if (fst.sfsp && fst.count > fst.maxcount) {
3912 *retval = fst.maxcount;
3913 } else {
3914 *retval = fst.count;
3915 }
3916
3917 return 0;
3918 }
3919
3920 /*
3921 * gets the associated vnode with the file descriptor passed.
3922 * as input
3923 *
3924 * INPUT
3925 * ctx - vfs context of caller
3926 * fd - file descriptor for which vnode is required.
3927 * vpp - Pointer to pointer to vnode to be returned.
3928 *
3929 * The vnode is returned with an iocount so any vnode obtained
3930 * by this call needs a vnode_put
3931 *
3932 */
3933 int
vnode_getfromfd(vfs_context_t ctx,int fd,vnode_t * vpp)3934 vnode_getfromfd(vfs_context_t ctx, int fd, vnode_t *vpp)
3935 {
3936 int error;
3937 vnode_t vp;
3938 struct fileproc *fp;
3939 proc_t p = vfs_context_proc(ctx);
3940
3941 *vpp = NULLVP;
3942
3943 error = fp_getfvp(p, fd, &fp, &vp);
3944 if (error) {
3945 return error;
3946 }
3947
3948 error = vnode_getwithref(vp);
3949 if (error) {
3950 (void)fp_drop(p, fd, fp, 0);
3951 return error;
3952 }
3953
3954 (void)fp_drop(p, fd, fp, 0);
3955 *vpp = vp;
3956 return error;
3957 }
3958
3959 /*
3960 * Wrapper function around namei to start lookup from a directory
3961 * specified by a file descriptor ni_dirfd.
3962 *
3963 * In addition to all the errors returned by namei, this call can
3964 * return ENOTDIR if the file descriptor does not refer to a directory.
3965 * and EBADF if the file descriptor is not valid.
3966 */
int
nameiat(struct nameidata *ndp, int dirfd)
{
	/*
	 * Only redirect the lookup when dirfd is a real descriptor, this is
	 * a fresh (non-continued) lookup, and the caller has not already
	 * supplied a starting directory via USEDVP.
	 */
	if ((dirfd != AT_FDCWD) &&
	    !(ndp->ni_flag & NAMEI_CONTLOOKUP) &&
	    !(ndp->ni_cnd.cn_flags & USEDVP)) {
		int error = 0;
		char c;

		/* Peek at the first path byte to detect an absolute path. */
		if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
			error = copyin(ndp->ni_dirp, &c, sizeof(char));
			if (error) {
				return error;
			}
		} else {
			c = *((char *)(ndp->ni_dirp));
		}

		if (c != '/') {
			vnode_t dvp_at;

			error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
			    &dvp_at);
			if (error) {
				return error;
			}

			/* The *at() starting point must be a directory. */
			if (vnode_vtype(dvp_at) != VDIR) {
				vnode_put(dvp_at);
				return ENOTDIR;
			}

			/* Hand namei the starting directory via USEDVP. */
			ndp->ni_dvp = dvp_at;
			ndp->ni_cnd.cn_flags |= USEDVP;
			error = namei(ndp);
			ndp->ni_cnd.cn_flags &= ~USEDVP;
			vnode_put(dvp_at);
			return error;
		}
	}

	/* Absolute path, continued lookup, or AT_FDCWD: plain namei(). */
	return namei(ndp);
}
4010
4011 /*
4012 * Change current working directory to a given file descriptor.
4013 */
4014 /* ARGSUSED */
/*
 * Common implementation for sys_fchdir() and __pthread_fchdir(): change
 * the process-wide (or, when per_thread is true, the calling thread's)
 * working directory to the directory open on fd.  fd == -1 with
 * per_thread reverts the thread to the process-wide cwd.
 */
int
fchdir(proc_t p, vfs_context_t ctx, int fd, bool per_thread)
{
	vnode_t vp;
	vnode_t tdp;
	vnode_t tvp;
	struct mount *mp;
	int error, should_put = 1;

	AUDIT_ARG(fd, fd);
	if (per_thread && fd == -1) {
		/*
		 * Switching back from per-thread to per process CWD; verify we
		 * in fact have one before proceeding. The only success case
		 * for this code path is to return 0 preemptively after zapping
		 * the thread structure contents.
		 */
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = NULLVP;
			if (tvp != NULLVP) {
				vnode_rele(tvp);
				return 0;
			}
		}
		return EBADF;
	}

	if ((error = file_vnode(fd, &vp))) {
		return error;
	}
	if ((error = vnode_getwithref(vp))) {
		file_drop(fd);
		return error;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	if (vp->v_type != VDIR) {
		error = ENOTDIR;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_chdir(ctx, vp);
	if (error) {
		goto out;
	}
#endif
	/* The caller needs search permission on the new directory. */
	error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
	if (error) {
		goto out;
	}

	/* If a filesystem is mounted on vp, descend to that mount's root. */
	while (!error && (mp = vp->v_mountedhere) != NULL) {
		if (vfs_busy(mp, LK_NOWAIT)) {
			error = EACCES;
			goto out;
		}
		error = VFS_ROOT(mp, &tdp, ctx);
		vfs_unbusy(mp);
		if (error) {
			break;
		}
		vnode_put(vp);
		vp = tdp;
	}
	if (error) {
		goto out;
	}
	/* Trade the iocount for a long-lived usecount before installing vp. */
	if ((error = vnode_ref(vp))) {
		goto out;
	}
	vnode_put(vp);
	should_put = 0;

	if (per_thread) {
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = vp;
			/* Mark the process as using per-thread cwds. */
			OSBitOrAtomic(P_THCWD, &p->p_flag);
		} else {
			vnode_rele(vp);
			error = ENOENT;
			goto out;
		}
	} else {
		/* Both locks: lookup synchronization plus fd-table update. */
		proc_dirs_lock_exclusive(p);
		proc_fdlock(p);
		tvp = p->p_fd.fd_cdir;
		p->p_fd.fd_cdir = vp;
		proc_fdunlock(p);
		proc_dirs_unlock_exclusive(p);
	}

	/* Release the previous cwd, if there was one. */
	if (tvp) {
		vnode_rele(tvp);
	}

out:
	if (should_put) {
		vnode_put(vp);
	}
	file_drop(fd);

	return error;
}
4126
int
sys_fchdir(proc_t p, struct fchdir_args *uap, __unused int32_t *retval)
{
	/* fchdir(2): change the process-wide working directory to uap->fd. */
	return fchdir(p, vfs_context_current(), uap->fd, false);
}
4132
int
__pthread_fchdir(proc_t p, struct __pthread_fchdir_args *uap, __unused int32_t *retval)
{
	/* Per-thread variant; uap->fd == -1 reverts to the process-wide cwd. */
	return fchdir(p, vfs_context_current(), uap->fd, true);
}
4138
4139
4140 /*
4141 * Change current working directory (".").
4142 *
4143 * Returns: 0 Success
4144 * change_dir:ENOTDIR
4145 * change_dir:???
4146 * vnode_ref:ENOENT No such file or directory
4147 */
4148 /* ARGSUSED */
int
chdir_internal(proc_t p, vfs_context_t ctx, struct nameidata *ndp, int per_thread)
{
	int error;
	vnode_t tvp;

	/* change_dir() returns ndp->ni_vp with an iocount on success. */
	error = change_dir(ndp, ctx);
	if (error) {
		return error;
	}
	/* Convert to a usecount so the cwd reference outlives this call. */
	if ((error = vnode_ref(ndp->ni_vp))) {
		vnode_put(ndp->ni_vp);
		return error;
	}
	/*
	 * drop the iocount we picked up in change_dir
	 */
	vnode_put(ndp->ni_vp);

	if (per_thread) {
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = ndp->ni_vp;
			/* Mark the process as using per-thread cwds. */
			OSBitOrAtomic(P_THCWD, &p->p_flag);
		} else {
			vnode_rele(ndp->ni_vp);
			return ENOENT;
		}
	} else {
		/* Both locks: lookup synchronization plus fd-table update. */
		proc_dirs_lock_exclusive(p);
		proc_fdlock(p);
		tvp = p->p_fd.fd_cdir;
		p->p_fd.fd_cdir = ndp->ni_vp;
		proc_fdunlock(p);
		proc_dirs_unlock_exclusive(p);
	}

	/* Release the previous cwd, if there was one. */
	if (tvp) {
		vnode_rele(tvp);
	}

	return 0;
}
4194
4195
4196 /*
4197 * Change current working directory (".").
4198 *
4199 * Returns: 0 Success
4200 * chdir_internal:ENOTDIR
4201 * chdir_internal:ENOENT No such file or directory
4202 * chdir_internal:???
4203 */
4204 /* ARGSUSED */
static int
common_chdir(proc_t p, struct chdir_args *uap, int per_thread)
{
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();

	/* Set up the lookup; chdir_internal() runs namei() via change_dir(). */
	NDINIT(&nd, LOOKUP, OP_CHDIR, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);

	return chdir_internal(p, ctx, &nd, per_thread);
}
4216
4217
4218 /*
4219 * chdir
4220 *
4221 * Change current working directory (".") for the entire process
4222 *
4223 * Parameters: p Process requesting the call
4224 * uap User argument descriptor (see below)
4225 * retval (ignored)
4226 *
4227 * Indirect parameters: uap->path Directory path
4228 *
4229 * Returns: 0 Success
4230 * common_chdir: ENOTDIR
4231 * common_chdir: ENOENT No such file or directory
4232 * common_chdir: ???
4233 *
4234 */
int
sys_chdir(proc_t p, struct chdir_args *uap, __unused int32_t *retval)
{
	/* Process-wide chdir(2); cast adapts chdir_args for common_chdir(). */
	return common_chdir(p, (void *)uap, 0);
}
4240
4241 /*
4242 * __pthread_chdir
4243 *
4244 * Change current working directory (".") for a single thread
4245 *
4246 * Parameters: p Process requesting the call
4247 * uap User argument descriptor (see below)
4248 * retval (ignored)
4249 *
4250 * Indirect parameters: uap->path Directory path
4251 *
4252 * Returns: 0 Success
4253 * common_chdir: ENOTDIR
4254 * common_chdir: ENOENT No such file or directory
4255 * common_chdir: ???
4256 *
4257 */
int
__pthread_chdir(proc_t p, struct __pthread_chdir_args *uap, __unused int32_t *retval)
{
	/* Per-thread chdir; cast adapts the args struct for common_chdir(). */
	return common_chdir(p, (void *)uap, 1);
}
4263
4264
4265 /*
4266 * Change notion of root (``/'') directory.
4267 */
4268 /* ARGSUSED */
int
chroot(proc_t p, struct chroot_args *uap, __unused int32_t *retval)
{
	struct filedesc *fdp = &p->p_fd;
	int error;
	struct nameidata nd;
	vnode_t tvp;
	vfs_context_t ctx = vfs_context_current();

	/* Only the superuser may change the root directory. */
	if ((error = suser(kauth_cred_get(), &p->p_acflag))) {
		return error;
	}

	NDINIT(&nd, LOOKUP, OP_CHROOT, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	/* change_dir() validates the target is a searchable directory. */
	error = change_dir(&nd, ctx);
	if (error) {
		return error;
	}

#if CONFIG_MACF
	error = mac_vnode_check_chroot(ctx, nd.ni_vp,
	    &nd.ni_cnd);
	if (error) {
		vnode_put(nd.ni_vp);
		return error;
	}
#endif

	/* Trade the iocount from change_dir() for a long-lived usecount. */
	if ((error = vnode_ref(nd.ni_vp))) {
		vnode_put(nd.ni_vp);
		return error;
	}
	vnode_put(nd.ni_vp);

	/*
	 * This lock provides the guarantee that as long as you hold the lock
	 * fdp->fd_rdir has a usecount on it. This is used to take an iocount
	 * on a referenced vnode in namei when determining the rootvnode for
	 * a process.
	 */
	/* needed for synchronization with lookup */
	proc_dirs_lock_exclusive(p);
	/* needed for setting the flag and other activities on the fd itself */
	proc_fdlock(p);
	tvp = fdp->fd_rdir;
	fdp->fd_rdir = nd.ni_vp;
	fdt_flag_set(fdp, FD_CHROOT);
	proc_fdunlock(p);
	proc_dirs_unlock_exclusive(p);

	/* Release the previous root, if one was set. */
	if (tvp != NULL) {
		vnode_rele(tvp);
	}

	return 0;
}
4326
/* Stack-buffer size for pivot_root() path copyin before falling back to heap. */
#define PATHSTATICBUFLEN 256
/* Entitlement required (in addition to pid 1) to invoke pivot_root(). */
#define PIVOT_ROOT_ENTITLEMENT \
	"com.apple.private.vfs.pivot-root"
4330
4331 #if defined(XNU_TARGET_OS_OSX)
/*
 * pivot_root(): atomically switch the system root filesystem to the
 * volume mounted at new_rootfs_path_before, remounting the old root at
 * old_rootfs_path_after.  Restricted to launchd (pid 1) holding the
 * pivot-root entitlement.
 */
int
pivot_root(proc_t p, struct pivot_root_args *uap, __unused int *retval)
{
	int error;
	char new_rootfs_path_before[PATHSTATICBUFLEN] = {0};
	char old_rootfs_path_after[PATHSTATICBUFLEN] = {0};
	char *new_rootfs_path_before_buf = NULL;
	char *old_rootfs_path_after_buf = NULL;
	char *incoming = NULL;
	char *outgoing = NULL;
	vnode_t incoming_rootvp = NULLVP;
	size_t bytes_copied;

	/*
	 * XXX : Additional restrictions needed
	 * - perhaps callable only once.
	 */
	if ((error = suser(kauth_cred_get(), &p->p_acflag))) {
		return error;
	}

	/*
	 * pivot_root can be executed by launchd only.
	 * Enforce entitlement.
	 */
	if ((proc_getpid(p) != 1) || !IOCurrentTaskHasEntitlement(PIVOT_ROOT_ENTITLEMENT)) {
		return EPERM;
	}

	/* Try the stack buffer first; fall back to a heap path buffer. */
	error = copyinstr(uap->new_rootfs_path_before, &new_rootfs_path_before[0], PATHSTATICBUFLEN, &bytes_copied);
	if (error == ENAMETOOLONG) {
		new_rootfs_path_before_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
		error = copyinstr(uap->new_rootfs_path_before, new_rootfs_path_before_buf, MAXPATHLEN, &bytes_copied);
	}

	if (error) {
		goto out;
	}

	error = copyinstr(uap->old_rootfs_path_after, &old_rootfs_path_after[0], PATHSTATICBUFLEN, &bytes_copied);
	if (error == ENAMETOOLONG) {
		old_rootfs_path_after_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
		error = copyinstr(uap->old_rootfs_path_after, old_rootfs_path_after_buf, MAXPATHLEN, &bytes_copied);
	}
	if (error) {
		goto out;
	}

	/* Point incoming/outgoing at whichever buffer holds each path. */
	if (new_rootfs_path_before_buf) {
		incoming = new_rootfs_path_before_buf;
	} else {
		incoming = &new_rootfs_path_before[0];
	}

	if (old_rootfs_path_after_buf) {
		outgoing = old_rootfs_path_after_buf;
	} else {
		outgoing = &old_rootfs_path_after[0];
	}

	/*
	 * The proposed incoming FS MUST be authenticated (i.e. not a chunklist DMG).
	 * Userland is not allowed to pivot to an image.
	 */
	error = vnode_lookup(incoming, 0, &incoming_rootvp, vfs_context_kernel());
	if (error) {
		goto out;
	}
	error = VNOP_IOCTL(incoming_rootvp, FSIOC_KERNEL_ROOTAUTH, NULL, 0, vfs_context_kernel());
	if (error) {
		goto out;
	}

	error = vfs_switch_root(incoming, outgoing, VFSSR_VIRTUALDEV_PROHIBITED);

out:
	if (incoming_rootvp != NULLVP) {
		vnode_put(incoming_rootvp);
		incoming_rootvp = NULLVP;
	}

	if (old_rootfs_path_after_buf) {
		zfree(ZV_NAMEI, old_rootfs_path_after_buf);
	}

	if (new_rootfs_path_before_buf) {
		zfree(ZV_NAMEI, new_rootfs_path_before_buf);
	}

	return error;
}
4423 #else
/* Stub for platforms without XNU_TARGET_OS_OSX: pivot_root is unimplemented. */
int
pivot_root(proc_t p, __unused struct pivot_root_args *uap, int *retval)
{
	return nosys(p, NULL, retval);
}
4429 #endif /* XNU_TARGET_OS_OSX */
4430
4431 /*
4432 * Common routine for chroot and chdir.
4433 *
4434 * Returns: 0 Success
4435 * ENOTDIR Not a directory
4436 * namei:??? [anything namei can return]
4437 * vnode_authorize:??? [anything vnode_authorize can return]
4438 */
4439 static int
change_dir(struct nameidata * ndp,vfs_context_t ctx)4440 change_dir(struct nameidata *ndp, vfs_context_t ctx)
4441 {
4442 vnode_t vp;
4443 int error;
4444
4445 if ((error = namei(ndp))) {
4446 return error;
4447 }
4448 nameidone(ndp);
4449 vp = ndp->ni_vp;
4450
4451 if (vp->v_type != VDIR) {
4452 vnode_put(vp);
4453 return ENOTDIR;
4454 }
4455
4456 #if CONFIG_MACF
4457 error = mac_vnode_check_chdir(ctx, vp);
4458 if (error) {
4459 vnode_put(vp);
4460 return error;
4461 }
4462 #endif
4463
4464 error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
4465 if (error) {
4466 vnode_put(vp);
4467 return error;
4468 }
4469
4470 return error;
4471 }
4472
4473 /*
4474 * Free the vnode data (for directories) associated with the file glob.
4475 */
4476 struct fd_vn_data *
fg_vn_data_alloc(void)4477 fg_vn_data_alloc(void)
4478 {
4479 struct fd_vn_data *fvdata;
4480
4481 /* Allocate per fd vnode data */
4482 fvdata = kalloc_type(struct fd_vn_data, Z_WAITOK | Z_ZERO);
4483 lck_mtx_init(&fvdata->fv_lock, &fd_vn_lck_grp, &fd_vn_lck_attr);
4484 return fvdata;
4485 }
4486
4487 /*
4488 * Free the vnode data (for directories) associated with the file glob.
4489 */
4490 void
fg_vn_data_free(void * fgvndata)4491 fg_vn_data_free(void *fgvndata)
4492 {
4493 struct fd_vn_data *fvdata = (struct fd_vn_data *)fgvndata;
4494
4495 kfree_data(fvdata->fv_buf, fvdata->fv_bufallocsiz);
4496 lck_mtx_destroy(&fvdata->fv_lock, &fd_vn_lck_grp);
4497 kfree_type(struct fd_vn_data, fvdata);
4498 }
4499
4500 /*
4501 * Check permissions, allocate an open file structure,
4502 * and call the device open routine if any.
4503 *
4504 * Returns: 0 Success
4505 * EINVAL
4506 * EINTR
4507 * falloc:ENFILE
4508 * falloc:EMFILE
4509 * falloc:ENOMEM
4510 * vn_open_auth:???
4511 * dupfdopen:???
4512 * VNOP_ADVLOCK:???
4513 * vnode_setsize:???
4514 *
4515 * XXX Need to implement uid, gid
4516 */
int
open1(vfs_context_t ctx, struct nameidata *ndp, int uflags,
    struct vnode_attr *vap, fp_initfn_t fp_init, void *initarg, int32_t *retval, int authfd)
{
	proc_t p = vfs_context_proc(ctx);
	kauth_cred_t p_cred = current_cached_proc_cred(PROC_NULL);
	uthread_t uu = get_bsdthread_info(vfs_context_thread(ctx));
	struct fileproc *fp;
	vnode_t vp;
	int flags, oflags, amode;
	int type, indx, error;
	struct vfs_context context;
	vnode_t authvp = NULLVP;

	oflags = uflags;

	amode = oflags & O_ACCMODE;
	/*
	 * Because O_RDONLY is 0, it is not possible to distinguish between
	 * O_EXEC | O_RDONLY and O_EXEC, therefore FEXEC/FSEARCH can't be set together
	 * with FREAD/FWRITE.
	 */
	if ((amode == O_ACCMODE) || (amode && (oflags & O_EXEC))) {
		return EINVAL;
	}

	/* Convert user open(2) flags to kernel F* flags. */
	flags = FFLAGS(uflags);
	/* Encryption-state flags are set internally by vn_open_auth, never by callers. */
	CLR(flags, FENCRYPTED);
	CLR(flags, FUNENCRYPTED);

	AUDIT_ARG(fflags, oflags);
	AUDIT_ARG(mode, vap->va_mode);

	/* Reserve a descriptor slot + fileproc; not yet published to the fd table. */
	if ((error = falloc_withinit(p, p_cred, ctx, &fp, &indx, fp_init, initarg)) != 0) {
		return error;
	}
	if (flags & O_CLOEXEC) {
		fp->fp_flags |= FP_CLOEXEC;
	}
	if (flags & O_CLOFORK) {
		fp->fp_flags |= FP_CLOFORK;
	}

	/* setup state to recognize when fdesc_open was called */
	uu->uu_dupfd = -1;

	/*
	 * Disable read/write access if file is opened with O_EVTONLY and
	 * the process has requested to deny read/write access.
	 */
	if ((flags & O_EVTONLY) && proc_disallow_rw_for_o_evtonly(p)) {
		flags &= ~(FREAD | FWRITE);
	}

	/* If the caller supplied an auth fd, resolve it to a vnode for vn_open_auth. */
	if (authfd != AUTH_OPEN_NOAUTHFD) {
		error = vnode_getfromfd(ctx, authfd, &authvp);
		if (error) {
			fp_free(p, indx, fp);
			return error;
		}
	}

	if ((error = vn_open_auth(ndp, &flags, vap, authvp))) {
		if (authvp != NULLVP) {
			vnode_put(authvp);
		}
		/*
		 * ENODEV/ENXIO with uu_dupfd set means fdesc_open() ran:
		 * the open was of /dev/fd/N, so dup that descriptor instead.
		 */
		if ((error == ENODEV || error == ENXIO) && (uu->uu_dupfd >= 0)) {
			if ((error = dupfdopen(p, indx, uu->uu_dupfd, flags, error)) == 0) {
				*retval = indx;
				return 0;
			}
		}
		if (error == ERESTART) {
			error = EINTR;
		}
		fp_free(p, indx, fp);
		return error;
	}

	if (authvp != NULLVP) {
		vnode_put(authvp);
	}

	uu->uu_dupfd = 0;
	vp = ndp->ni_vp;

	/* Wire the opened vnode into the fileglob. */
	fp->fp_glob->fg_flag = flags & (FMASK | O_EVTONLY | FENCRYPTED | FUNENCRYPTED);
	fp->fp_glob->fg_ops = &vnops;
	fp_set_data(fp, vp);

#if CONFIG_FILE_LEASES
	/*
	 * If we are creating a file or open with truncate, we need to break the
	 * lease if there is a read lease placed on the parent dir.
	 */
	if ((vnode_vtype(vp) == VREG) && (flags & (O_CREAT | O_TRUNC))) {
		vnode_breakdirlease(vp, true, oflags);
	}
	/* Now check if there is a lease placed on the file itself. */
	error = vnode_breaklease(vp, oflags, ctx);
	if (error) {
		goto bad;
	}
#endif /* CONFIG_FILE_LEASES */

	/* Apply an advisory flock if O_EXLOCK/O_SHLOCK was requested. */
	if (flags & (O_EXLOCK | O_SHLOCK)) {
		struct flock lf = {
			.l_whence = SEEK_SET,
		};

		if (flags & O_EXLOCK) {
			lf.l_type = F_WRLCK;
		} else {
			lf.l_type = F_RDLCK;
		}
		type = F_FLOCK;
		if ((flags & FNONBLOCK) == 0) {
			type |= F_WAIT;
		}
#if CONFIG_MACF
		error = mac_file_check_lock(vfs_context_ucred(ctx), fp->fp_glob,
		    F_SETLK, &lf);
		if (error) {
			goto bad;
		}
#endif
		if ((error = VNOP_ADVLOCK(vp, (caddr_t)fp->fp_glob, F_SETLK, &lf, type, ctx, NULL))) {
			goto bad;
		}
		/* Remember we hold the lock so 'bad' (and close) can release it. */
		fp->fp_glob->fg_flag |= FWASLOCKED;
	}

	/* try to truncate by setting the size attribute */
	if ((flags & O_TRUNC) && ((error = vnode_setsize(vp, (off_t)0, 0, ctx)) != 0)) {
		goto bad;
	}

	/*
	 * For directories we hold some additional information in the fd.
	 */
	if (vnode_vtype(vp) == VDIR) {
		fp->fp_glob->fg_vn_data = fg_vn_data_alloc();
	} else {
		fp->fp_glob->fg_vn_data = NULL;
	}

#if CONFIG_SECLUDED_MEMORY
	/* Heuristics deciding whether this file's pages may live in the secluded pool. */
	if (secluded_for_filecache && vnode_vtype(vp) == VREG) {
		memory_object_control_t moc;
		const char *v_name;

		moc = ubc_getobject(vp, UBC_FLAGS_NONE);

		if (moc == MEMORY_OBJECT_CONTROL_NULL) {
			/* nothing to do... */
		} else if (fp->fp_glob->fg_flag & FWRITE) {
			/* writable -> no longer eligible for secluded pages */
			memory_object_mark_eligible_for_secluded(moc,
			    FALSE);
		} else if (secluded_for_filecache == SECLUDED_FILECACHE_APPS) {
			char pathname[32] = { 0, };
			size_t copied;
			/* XXX FBDP: better way to detect /Applications/ ? */
			if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
				(void)copyinstr(ndp->ni_dirp,
				    pathname,
				    sizeof(pathname),
				    &copied);
			} else {
				copystr(CAST_DOWN(void *, ndp->ni_dirp),
				    pathname,
				    sizeof(pathname),
				    &copied);
			}
			pathname[sizeof(pathname) - 1] = '\0';
			if (strncmp(pathname,
			    "/Applications/",
			    strlen("/Applications/")) == 0 &&
			    strncmp(pathname,
			    "/Applications/Camera.app/",
			    strlen("/Applications/Camera.app/")) != 0) {
				/*
				 * not writable
				 * AND from "/Applications/"
				 * AND not from "/Applications/Camera.app/"
				 * ==> eligible for secluded
				 */
				memory_object_mark_eligible_for_secluded(moc,
				    TRUE);
			}
		} else if (secluded_for_filecache == SECLUDED_FILECACHE_RDONLY &&
		    (v_name = vnode_getname(vp))) {
			size_t len = strlen(v_name);

			if (!strncmp(v_name, "dyld", len) ||
			    !strncmp(v_name, "launchd", len) ||
			    !strncmp(v_name, "Camera", len) ||
			    !strncmp(v_name, "SpringBoard", len) ||
			    !strncmp(v_name, "backboardd", len)) {
				/*
				 * This file matters when launching Camera:
				 * do not store its contents in the secluded
				 * pool that will be drained on Camera launch.
				 */
				memory_object_mark_eligible_for_secluded(moc,
				    FALSE);
			} else if (!strncmp(v_name, "mediaserverd", len)) {
				memory_object_mark_eligible_for_secluded(moc,
				    FALSE);
				memory_object_mark_for_realtime(moc,
				    true);
			} else if (!strncmp(v_name, "bluetoothd", len)) {
				/*
				 * bluetoothd might be needed for realtime audio
				 * playback.
				 */
				memory_object_mark_eligible_for_secluded(moc,
				    FALSE);
				memory_object_mark_for_realtime(moc,
				    true);
			} else {
				char pathname[64] = { 0, };
				size_t copied;
				if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
					(void)copyinstr(ndp->ni_dirp,
					    pathname,
					    sizeof(pathname),
					    &copied);
				} else {
					copystr(CAST_DOWN(void *, ndp->ni_dirp),
					    pathname,
					    sizeof(pathname),
					    &copied);
				}
				pathname[sizeof(pathname) - 1] = '\0';
				if (strncmp(pathname,
				    "/Library/Audio/Plug-Ins/",
				    strlen("/Library/Audio/Plug-Ins/")) == 0 ||
				    strncmp(pathname,
				    "/System/Library/Audio/Plug-Ins/",
				    strlen("/System/Library/Audio/Plug-Ins/")) == 0) {
					/*
					 * This may be an audio plugin required
					 * for realtime playback.
					 * ==> NOT eligible for secluded.
					 */
					memory_object_mark_eligible_for_secluded(moc,
					    FALSE);
					memory_object_mark_for_realtime(moc,
					    true);
				}
			}
			vnode_putname(v_name);
		}
	}
#endif /* CONFIG_SECLUDED_MEMORY */

	/*
	 * Drop our iocount; the fileglob still references vp via fg_data.
	 * NOTE(review): vp is still inspected below (vnode_istty) after this
	 * put — presumably safe because of the fileglob's reference; confirm.
	 */
	vnode_put(vp);

	/*
	 * The first terminal open (without a O_NOCTTY) by a session leader
	 * results in it being set as the controlling terminal.
	 */
	if (vnode_istty(vp) && !(p->p_flag & P_CONTROLT) &&
	    !(flags & O_NOCTTY)) {
		int tmp = 0;

		(void)(*fp->fp_glob->fg_ops->fo_ioctl)(fp, (int)TIOCSCTTY,
		    (caddr_t)&tmp, ctx);
	}

	/* Publish the descriptor to the process fd table. */
	proc_fdlock(p);
	procfdtbl_releasefd(p, indx, NULL);

	fp_drop(p, indx, fp, 1);
	proc_fdunlock(p);

	*retval = indx;

	return 0;
bad:
	/* Undo: release any advisory lock, close the vnode, free the fd slot. */
	context = *vfs_context_current();
	context.vc_ucred = fp->fp_glob->fg_cred;

	if ((fp->fp_glob->fg_flag & FWASLOCKED) &&
	    (FILEGLOB_DTYPE(fp->fp_glob) == DTYPE_VNODE)) {
		struct flock lf = {
			.l_whence = SEEK_SET,
			.l_type = F_UNLCK,
		};

		(void)VNOP_ADVLOCK(
			vp, (caddr_t)fp->fp_glob, F_UNLCK, &lf, F_FLOCK, ctx, NULL);
	}

	vn_close(vp, fp->fp_glob->fg_flag, &context);
	vnode_put(vp);
	fp_free(p, indx, fp);

	return error;
}
4818
4819 /*
4820 * While most of the *at syscall handlers can call nameiat() which
4821 * is a wrapper around namei, the use of namei and initialisation
4822 * of nameidata are far removed and in different functions - namei
4823 * gets called in vn_open_auth for open1. So we'll just do here what
4824 * nameiat() does.
4825 */
static int
open1at(vfs_context_t ctx, struct nameidata *ndp, int uflags,
    struct vnode_attr *vap, fp_initfn_t fp_init, void *initarg, int32_t *retval,
    int dirfd, int authfd)
{
	/* Only consult dirfd when the caller did not already supply a dvp. */
	if ((dirfd != AT_FDCWD) && !(ndp->ni_cnd.cn_flags & USEDVP)) {
		int error;
		char c;

		/* Peek at the first byte of the path to detect absolute paths. */
		if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
			error = copyin(ndp->ni_dirp, &c, sizeof(char));
			if (error) {
				return error;
			}
		} else {
			c = *((char *)(ndp->ni_dirp));
		}

		if (c != '/') {
			vnode_t dvp_at;

			/* Relative path: anchor the lookup at dirfd's vnode. */
			error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
			    &dvp_at);
			if (error) {
				return error;
			}

			if (vnode_vtype(dvp_at) != VDIR) {
				vnode_put(dvp_at);
				return ENOTDIR;
			}

			ndp->ni_dvp = dvp_at;
			ndp->ni_cnd.cn_flags |= USEDVP;
			error = open1(ctx, ndp, uflags, vap, fp_init, initarg,
			    retval, authfd);
			vnode_put(dvp_at);
			return error;
		}
	}

	/* Absolute path, AT_FDCWD, or caller-supplied dvp: plain open1(). */
	return open1(ctx, ndp, uflags, vap, fp_init, initarg, retval, authfd);
}
4869
4870 /*
4871 * open_extended: open a file given a path name; with extended argument list (including extended security (ACL)).
4872 *
4873 * Parameters: p Process requesting the open
4874 * uap User argument descriptor (see below)
4875 * retval Pointer to an area to receive the
 * return value from the system call
4877 *
4878 * Indirect: uap->path Path to open (same as 'open')
 * uap->flags			Flags to open (same as 'open')
4880 * uap->uid UID to set, if creating
4881 * uap->gid GID to set, if creating
4882 * uap->mode File mode, if creating (same as 'open')
4883 * uap->xsecurity ACL to set, if creating
4884 *
4885 * Returns: 0 Success
4886 * !0 errno value
4887 *
4888 * Notes: The kauth_filesec_t in 'va', if any, is in host byte order.
4889 *
 * XXX: We should enumerate the possible errno values here, and where
4891 * in the code they originated.
4892 */
4893 int
open_extended(proc_t p,struct open_extended_args * uap,int32_t * retval)4894 open_extended(proc_t p, struct open_extended_args *uap, int32_t *retval)
4895 {
4896 int ciferror;
4897 kauth_filesec_t xsecdst;
4898 struct vnode_attr va;
4899 struct nameidata nd;
4900 int cmode;
4901
4902 AUDIT_ARG(owner, uap->uid, uap->gid);
4903
4904 xsecdst = NULL;
4905 if ((uap->xsecurity != USER_ADDR_NULL) &&
4906 ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)) {
4907 return ciferror;
4908 }
4909
4910 VATTR_INIT(&va);
4911 cmode = ((uap->mode & ~p->p_fd.fd_cmask) & ALLPERMS) & ~S_ISTXT;
4912 VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
4913 if (uap->uid != KAUTH_UID_NONE) {
4914 VATTR_SET(&va, va_uid, uap->uid);
4915 }
4916 if (uap->gid != KAUTH_GID_NONE) {
4917 VATTR_SET(&va, va_gid, uap->gid);
4918 }
4919 if (xsecdst != NULL) {
4920 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
4921 va.va_vaflags |= VA_FILESEC_ACL;
4922 }
4923
4924 NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
4925 uap->path, vfs_context_current());
4926
4927 ciferror = open1(vfs_context_current(), &nd, uap->flags, &va,
4928 NULL, NULL, retval, AUTH_OPEN_NOAUTHFD);
4929 if (xsecdst != NULL) {
4930 kauth_filesec_free(xsecdst);
4931 }
4932
4933 return ciferror;
4934 }
4935
4936 /*
4937 * Go through the data-protected atomically controlled open (2)
4938 *
4939 * int open_dprotected_np(user_addr_t path, int flags, int class, int dpflags, int mode)
4940 */
4941 static int
openat_dprotected_internal(vfs_context_t ctx,user_addr_t path,int flags,int mode,int class,int dpflags,int fd,int authfd,enum uio_seg segflg,int * retval)4942 openat_dprotected_internal(vfs_context_t ctx, user_addr_t path, int flags, int mode,
4943 int class, int dpflags, int fd, int authfd, enum uio_seg segflg, int *retval)
4944 {
4945 /*
4946 * Follow the same path as normal open(2)
4947 * Look up the item if it exists, and acquire the vnode.
4948 */
4949 struct vnode_attr va;
4950 struct nameidata nd;
4951 int cmode;
4952 int error;
4953 struct filedesc *fdp = &vfs_context_proc(ctx)->p_fd;
4954
4955 VATTR_INIT(&va);
4956 /* Mask off all but regular access permissions */
4957 cmode = ((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
4958 VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
4959
4960 NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, segflg,
4961 path, ctx);
4962
4963 /*
4964 * Initialize the extra fields in vnode_attr to pass down our
4965 * extra fields.
4966 * 1. target cprotect class.
4967 * 2. set a flag to mark it as requiring open-raw-encrypted semantics.
4968 */
4969 if (flags & O_CREAT) {
4970 /* lower level kernel code validates that the class is valid before applying it. */
4971 if (class != PROTECTION_CLASS_DEFAULT) {
4972 /*
4973 * PROTECTION_CLASS_DEFAULT implies that we make the class for this
4974 * file behave the same as open (2)
4975 */
4976 VATTR_SET(&va, va_dataprotect_class, class);
4977 }
4978 }
4979
4980 if (dpflags & (O_DP_GETRAWENCRYPTED | O_DP_GETRAWUNENCRYPTED | O_DP_AUTHENTICATE)) {
4981 if (flags & (O_RDWR | O_WRONLY)) {
4982 /*
4983 * Not allowed to write raw encrypted bytes or when opening authenticated.
4984 */
4985 return EINVAL;
4986 }
4987 if (dpflags & O_DP_GETRAWENCRYPTED) {
4988 VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWENCRYPTED);
4989 }
4990 if (dpflags & O_DP_GETRAWUNENCRYPTED) {
4991 VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWUNENCRYPTED);
4992 }
4993 if (dpflags & O_DP_AUTHENTICATE) {
4994 VATTR_SET(&va, va_dataprotect_flags, VA_DP_AUTHENTICATE);
4995 }
4996 }
4997
4998 error = open1at(vfs_context_current(), &nd, flags, &va,
4999 NULL, NULL, retval, fd, authfd);
5000
5001 return error;
5002 }
5003
5004 int
openat_dprotected_np(__unused proc_t p,struct openat_dprotected_np_args * uap,int32_t * retval)5005 openat_dprotected_np(__unused proc_t p, struct openat_dprotected_np_args *uap, int32_t *retval)
5006 {
5007 if ((uap->dpflags & O_DP_AUTHENTICATE) && (uap->flags & O_CREAT)) {
5008 return EINVAL;
5009 }
5010
5011 return openat_dprotected_internal(vfs_context_current(), uap->path, uap->flags, uap->mode,
5012 uap->class, uap->dpflags, uap->fd, uap->authfd, UIO_USERSPACE, retval);
5013 }
5014
5015 int
open_dprotected_np(__unused proc_t p,struct open_dprotected_np_args * uap,int32_t * retval)5016 open_dprotected_np(__unused proc_t p, struct open_dprotected_np_args *uap, int32_t *retval)
5017 {
5018 if (uap->dpflags & O_DP_AUTHENTICATE) {
5019 return EINVAL;
5020 }
5021
5022 return openat_dprotected_internal(vfs_context_current(), uap->path, uap->flags, uap->mode,
5023 uap->class, uap->dpflags, AT_FDCWD, AUTH_OPEN_NOAUTHFD, UIO_USERSPACE, retval);
5024 }
5025
/* Common open path for open(2)/openat(2): build attrs + nameidata, call open1at. */
static int
openat_internal(vfs_context_t ctx, user_addr_t path, int flags, int mode,
    int fd, enum uio_seg segflg, int *retval)
{
	struct filedesc *fdp = &vfs_context_proc(ctx)->p_fd;
	/*
	 * vnode_attr + nameidata are large; allocate them together on the
	 * heap rather than consuming kernel stack.
	 */
	struct {
		struct vnode_attr va;
		struct nameidata nd;
	} *__open_data;
	struct vnode_attr *vap;
	struct nameidata *ndp;
	int cmode;
	int error;

	__open_data = kalloc_type(typeof(*__open_data), Z_WAITOK);
	vap = &__open_data->va;
	ndp = &__open_data->nd;

	VATTR_INIT(vap);
	/* Mask off all but regular access permissions */
	cmode = ((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
	VATTR_SET(vap, va_mode, cmode & ACCESSPERMS);

	NDINIT(ndp, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1,
	    segflg, path, ctx);

	error = open1at(ctx, ndp, flags, vap, NULL, NULL, retval, fd, AUTH_OPEN_NOAUTHFD);

	kfree_type(typeof(*__open_data), __open_data);

	return error;
}
5058
/* open(2): cancellation point; delegates to open_nocancel. */
int
open(proc_t p, struct open_args *uap, int32_t *retval)
{
	__pthread_testcancel(1);
	return open_nocancel(p, (struct open_nocancel_args *)uap, retval);
}
5065
/* open(2) without the cancellation check; resolves relative to the cwd. */
int
open_nocancel(__unused proc_t p, struct open_nocancel_args *uap,
    int32_t *retval)
{
	return openat_internal(vfs_context_current(), uap->path, uap->flags,
	           uap->mode, AT_FDCWD, UIO_USERSPACE, retval);
}
5073
/* openat(2) without the cancellation check; resolves relative to uap->fd. */
int
openat_nocancel(__unused proc_t p, struct openat_nocancel_args *uap,
    int32_t *retval)
{
	return openat_internal(vfs_context_current(), uap->path, uap->flags,
	           uap->mode, uap->fd, UIO_USERSPACE, retval);
}
5081
/* openat(2): cancellation point; delegates to openat_nocancel. */
int
openat(proc_t p, struct openat_args *uap, int32_t *retval)
{
	__pthread_testcancel(1);
	return openat_nocancel(p, (struct openat_nocancel_args *)uap, retval);
}
5088
5089 #define OPEN_BY_ID_ENTITLEMENT "com.apple.private.vfs.open-by-id"
5090
5091 static boolean_t
vfs_context_can_open_by_id(vfs_context_t ctx)5092 vfs_context_can_open_by_id(vfs_context_t ctx)
5093 {
5094 if (csproc_get_platform_binary(vfs_context_proc(ctx))) {
5095 return TRUE;
5096 }
5097
5098 return IOTaskHasEntitlement(vfs_context_task(ctx),
5099 OPEN_BY_ID_ENTITLEMENT);
5100 }
5101
5102 /*
5103 * openbyid_np: open a file given a file system id and a file system object id
5104 * the hfs file system object id is an fsobj_id_t {uint32, uint32}
5105 * file systems that don't support object ids it is a node id (uint64_t).
5106 *
5107 * Parameters: p Process requesting the open
5108 * uap User argument descriptor (see below)
5109 * retval Pointer to an area to receive the
 * return value from the system call
5111 *
5112 * Indirect: uap->path Path to open (same as 'open')
5113 *
5114 * uap->fsid id of target file system
5115 * uap->objid id of target file system object
5116 * uap->flags Flags to open (same as 'open')
5117 *
5118 * Returns: 0 Success
5119 * !0 errno value
5120 *
5121 *
 * XXX: We should enumerate the possible errno values here, and where
5123 * in the code they originated.
5124 */
int
openbyid_np(__unused proc_t p, struct openbyid_np_args *uap, int *retval)
{
	fsid_t fsid;
	uint64_t objid;
	int error;
	char *buf = NULL;
	int buflen = MAXPATHLEN;
	int pathlen = 0;
	vfs_context_t ctx = vfs_context_current();

	/* Restricted: platform binary or open-by-id entitlement required. */
	if (!vfs_context_can_open_by_id(ctx)) {
		return EPERM;
	}

	if ((error = copyin(uap->fsid, (caddr_t)&fsid, sizeof(fsid)))) {
		return error;
	}

	/* uap->objid is an fsobj_id_t defined as struct {uint32_t, uint32_t} */
	if ((error = copyin(uap->objid, (caddr_t)&objid, sizeof(uint64_t)))) {
		return error;
	}

	AUDIT_ARG(value32, fsid.val[0]);
	AUDIT_ARG(value64, objid);

	/*
	 * Resolve a path from (fsid, objid), growing the buffer on ENOSPC.
	 * NOTE(review): buflen grows by MAXPATHLEN per pass with no upper
	 * bound visible here — confirm fsgetpath_internal bounds the result.
	 */
	do {
		buf = kalloc_data(buflen + 1, Z_WAITOK);
		if (buf == NULL) {
			return ENOMEM;
		}

		error = fsgetpath_internal( ctx, fsid.val[0], objid, buflen,
		    buf, FSOPT_ISREALFSID, &pathlen);

		if (error) {
			kfree_data(buf, buflen + 1);
			buf = NULL;
		}
	} while (error == ENOSPC && (buflen += MAXPATHLEN));

	if (error) {
		return error;
	}

	/* NUL-terminate the resolved path before handing it to open. */
	buf[pathlen] = 0;

	error = openat_internal(
		ctx, (user_addr_t)buf, uap->oflags, 0, AT_FDCWD, UIO_SYSSPACE, retval);

	kfree_data(buf, buflen + 1);

	return error;
}
5181
5182
5183 /*
5184 * Create a special file.
5185 */
5186 static int mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap,
5187 int fd);
5188
/*
 * Create a device special file (or a FIFO, via mkfifo1). Requires superuser.
 */
static int
mknodat_internal(proc_t p, user_addr_t upath, struct vnode_attr *vap,
    mode_t mode, int fd)
{
	vfs_context_t ctx = vfs_context_current();
	struct nameidata nd;
	vnode_t vp, dvp;
	int error;

	/* If it's a mknod() of a FIFO, call mkfifo1() instead */
	if ((mode & S_IFMT) == S_IFIFO) {
		return mkfifo1(ctx, upath, vap, fd);
	}

	AUDIT_ARG(mode, mode);
	AUDIT_ARG(value32, vap->va_rdev);

	/* Only the superuser may create device nodes. */
	if ((error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
		return error;
	}
	NDINIT(&nd, CREATE, OP_MKNOD, LOCKPARENT | AUDITVNPATH1,
	    UIO_USERSPACE, upath, ctx);
	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* An existing vnode at the target path means the name is taken. */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}

	switch (mode & S_IFMT) {
	case S_IFCHR:
		VATTR_SET(vap, va_type, VCHR);
		break;
	case S_IFBLK:
		VATTR_SET(vap, va_type, VBLK);
		break;
	default:
		/* Only character and block devices are valid here. */
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_create(ctx,
	    nd.ni_dvp, &nd.ni_cnd, vap);
	if (error) {
		goto out;
	}
#endif

	if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
		goto out;
	}

#if CONFIG_FILE_LEASES
	/* Creating an entry modifies the directory; break any read lease on it. */
	vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

	if ((error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx)) != 0) {
		goto out;
	}

	if (vp) {
		int update_flags = 0;

		// Make sure the name & parent pointers are hooked up
		if (vp->v_name == NULL) {
			update_flags |= VNODE_UPDATE_NAME;
		}
		if (vp->v_parent == NULLVP) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		if (update_flags) {
			vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
		}

#if CONFIG_FSE
		add_fsevent(FSE_CREATE_FILE, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
#endif
	}

out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	vnode_put(dvp);

	return error;
}
5291
5292 int
mknod(proc_t p,struct mknod_args * uap,__unused int32_t * retval)5293 mknod(proc_t p, struct mknod_args *uap, __unused int32_t *retval)
5294 {
5295 struct vnode_attr va;
5296
5297 VATTR_INIT(&va);
5298 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5299 VATTR_SET(&va, va_rdev, uap->dev);
5300
5301 return mknodat_internal(p, uap->path, &va, (mode_t)uap->mode, AT_FDCWD);
5302 }
5303
5304 int
mknodat(proc_t p,struct mknodat_args * uap,__unused int32_t * retval)5305 mknodat(proc_t p, struct mknodat_args *uap, __unused int32_t *retval)
5306 {
5307 struct vnode_attr va;
5308
5309 VATTR_INIT(&va);
5310 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
5311 VATTR_SET(&va, va_rdev, uap->dev);
5312
5313 return mknodat_internal(p, uap->path, &va, (mode_t)uap->mode, uap->fd);
5314 }
5315
5316 /*
5317 * Create a named pipe.
5318 *
5319 * Returns: 0 Success
5320 * EEXIST
5321 * namei:???
5322 * vnode_authorize:???
5323 * vn_create:???
5324 */
static int
mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap, int fd)
{
	vnode_t vp, dvp;
	int error;
	struct nameidata nd;

	NDINIT(&nd, CREATE, OP_MKFIFO, LOCKPARENT | AUDITVNPATH1,
	    UIO_USERSPACE, upath, ctx);
	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* check that this is a new file and authorize addition */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}
	VATTR_SET(vap, va_type, VFIFO);

	if ((error = vn_authorize_create(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0) {
		goto out;
	}

	error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx);
out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	vnode_put(dvp);

	return error;
}
5367
5368
5369 /*
5370 * mkfifo_extended: Create a named pipe; with extended argument list (including extended security (ACL)).
5371 *
5372 * Parameters: p Process requesting the open
5373 * uap User argument descriptor (see below)
5374 * retval (Ignored)
5375 *
5376 * Indirect: uap->path Path to fifo (same as 'mkfifo')
5377 * uap->uid UID to set
5378 * uap->gid GID to set
5379 * uap->mode File mode to set (same as 'mkfifo')
5380 * uap->xsecurity ACL to set, if creating
5381 *
5382 * Returns: 0 Success
5383 * !0 errno value
5384 *
5385 * Notes: The kauth_filesec_t in 'va', if any, is in host byte order.
5386 *
 * XXX: We should enumerate the possible errno values here, and where
5388 * in the code they originated.
5389 */
int
mkfifo_extended(proc_t p, struct mkfifo_extended_args *uap, __unused int32_t *retval)
{
	int ciferror;
	kauth_filesec_t xsecdst;
	struct vnode_attr va;

	AUDIT_ARG(owner, uap->uid, uap->gid);

	/* Copy in the caller-supplied filesec (ACL), if any. */
	xsecdst = KAUTH_FILESEC_NONE;
	if (uap->xsecurity != USER_ADDR_NULL) {
		if ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0) {
			return ciferror;
		}
	}

	VATTR_INIT(&va);
	/* Apply the process umask to the requested mode. */
	VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);
	if (uap->uid != KAUTH_UID_NONE) {
		VATTR_SET(&va, va_uid, uap->uid);
	}
	if (uap->gid != KAUTH_GID_NONE) {
		VATTR_SET(&va, va_gid, uap->gid);
	}
	if (xsecdst != KAUTH_FILESEC_NONE) {
		VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
		va.va_vaflags |= VA_FILESEC_ACL;
	}

	ciferror = mkfifo1(vfs_context_current(), uap->path, &va, AT_FDCWD);

	if (xsecdst != KAUTH_FILESEC_NONE) {
		kauth_filesec_free(xsecdst);
	}
	return ciferror;
}
5426
5427 /* ARGSUSED */
/* mkfifo(2): create a named pipe relative to the cwd. */
int
mkfifo(proc_t p, struct mkfifo_args *uap, __unused int32_t *retval)
{
	struct vnode_attr va;

	VATTR_INIT(&va);
	/* Apply the process umask to the requested mode. */
	VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);

	return mkfifo1(vfs_context_current(), uap->path, &va, AT_FDCWD);
}
5438
/* mkfifoat(2): create a named pipe relative to uap->fd. */
int
mkfifoat(proc_t p, struct mkfifoat_args *uap, __unused int32_t *retval)
{
	struct vnode_attr va;

	VATTR_INIT(&va);
	/* Apply the process umask to the requested mode. */
	VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd.fd_cmask);

	return mkfifo1(vfs_context_current(), uap->path, &va, uap->fd);
}
5449
5450 extern int safe_getpath_new(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path, int firmlink);
5451 extern int safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path);
5452 extern int safe_getpath_no_firmlink(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path);
5453
/*
 * Build a best-effort path for dvp (optionally appending leafname), never
 * failing outright: on lookup failure it walks up parents / falls back to
 * the mount point or "/". Returns the resulting length (including NUL) and
 * sets *truncated_path when the result is not the complete path.
 */
int
safe_getpath_new(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path, int firmlink)
{
	int ret, len = _len;

	*truncated_path = 0;

	if (firmlink) {
		ret = vn_getpath(dvp, path, &len);
	} else {
		ret = vn_getpath_no_firmlink(dvp, path, &len);
	}
	if (ret == 0 && len < (MAXPATHLEN - 1)) {
		if (leafname) {
			/* Overwrite the NUL with '/' and append the leaf name. */
			path[len - 1] = '/';
			len += strlcpy(&path[len], leafname, MAXPATHLEN - len) + 1;
			if (len > MAXPATHLEN) {
				char *ptr;

				// the string got truncated!
				*truncated_path = 1;
				ptr = strrchr(path, '/');
				if (ptr) {
					*ptr = '\0'; // chop off the string at the last directory component
				}
				len = (int)strlen(path) + 1;
			}
		}
	} else if (ret == 0) {
		/* Path fit but is at/over the limit: report it as truncated. */
		*truncated_path = 1;
	} else if (ret != 0) {
		struct vnode *mydvp = dvp;

		if (ret != ENOSPC) {
			printf("safe_getpath: failed to get the path for vp %p (%s) : err %d\n",
			    dvp, dvp->v_name ? dvp->v_name : "no-name", ret);
		}
		*truncated_path = 1;

		/* Walk up the parent chain until some ancestor's path fits. */
		do {
			if (mydvp->v_parent != NULL) {
				mydvp = mydvp->v_parent;
			} else if (mydvp->v_mount) {
				strlcpy(path, mydvp->v_mount->mnt_vfsstat.f_mntonname, _len);
				break;
			} else {
				// no parent and no mount point? only thing is to punt and say "/" changed
				strlcpy(path, "/", _len);
				len = 2;
				mydvp = NULL;
			}

			if (mydvp == NULL) {
				break;
			}

			len = _len;
			if (firmlink) {
				ret = vn_getpath(mydvp, path, &len);
			} else {
				ret = vn_getpath_no_firmlink(mydvp, path, &len);
			}
		} while (ret == ENOSPC);
	}

	return len;
}
5521
int
safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path)
{
	/* Path builder that resolves firmlinks. */
	return safe_getpath_new(dvp, leafname, path, _len, truncated_path,
	           1 /* firmlink */);
}
5527
int
safe_getpath_no_firmlink(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path)
{
	/* Path builder that does NOT resolve firmlinks. */
	return safe_getpath_new(dvp, leafname, path, _len, truncated_path,
	           0 /* firmlink */);
}
5533
5534 /*
5535 * Make a hard file link.
5536 *
5537 * Returns: 0 Success
5538 * EPERM
5539 * EEXIST
5540 * EXDEV
5541 * namei:???
5542 * vnode_authorize:???
5543 * VNOP_LINK:???
5544 */
5545 /* ARGSUSED */
static int
linkat_internal(vfs_context_t ctx, int fd1, user_addr_t path, int fd2,
    user_addr_t link, int flag, enum uio_seg segflg)
{
	vnode_t vp, pvp, dvp, lvp;
	struct nameidata nd;
	int follow;
	int error;
#if CONFIG_FSE
	fse_info finfo;
#endif
	int need_event, has_listeners, need_kpath2;
	char *target_path = NULL;
	char *no_firmlink_path = NULL;
	int truncated = 0;
	int truncated_no_firmlink_path = 0;

	vp = dvp = lvp = NULLVP;

	/* look up the object we are linking to */
	follow = (flag & AT_SYMLINK_FOLLOW) ? FOLLOW : NOFOLLOW;
	NDINIT(&nd, LOOKUP, OP_LOOKUP, AUDITVNPATH1 | follow,
	    segflg, path, ctx);

	error = nameiat(&nd, fd1);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;

	nameidone(&nd);

	/*
	 * Normally, linking to directories is not supported.
	 * However, some file systems may have limited support.
	 */
	if (vp->v_type == VDIR) {
		if (!ISSET(vp->v_mount->mnt_kern_flag, MNTK_DIR_HARDLINKS)) {
			error = EPERM;   /* POSIX */
			goto out;
		}

		/* Linking to a directory requires ownership. */
		if (!kauth_cred_issuser(vfs_context_ucred(ctx))) {
			struct vnode_attr dva;

			VATTR_INIT(&dva);
			VATTR_WANTED(&dva, va_uid);
			if (vnode_getattr(vp, &dva, ctx) != 0 ||
			    !VATTR_IS_SUPPORTED(&dva, va_uid) ||
			    (dva.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)))) {
				error = EACCES;
				goto out;
			}
		}
	}

	/*
	 * Look up the target node.  The nameidata is reused: switch the
	 * operation to CREATE and point ni_dirp at the link path.
	 */
#if CONFIG_TRIGGERS
	nd.ni_op = OP_LINK;
#endif
	nd.ni_cnd.cn_nameiop = CREATE;
	nd.ni_cnd.cn_flags = LOCKPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK;
	nd.ni_dirp = link;
	error = nameiat(&nd, fd2);
	if (error != 0) {
		goto out;
	}
	dvp = nd.ni_dvp;
	lvp = nd.ni_vp;

#if CONFIG_MACF
	if ((error = mac_vnode_check_link(ctx, dvp, vp, &nd.ni_cnd)) != 0) {
		goto out2;
	}
#endif

	/* or to anything that kauth doesn't want us to (eg. immutable items) */
	if ((error = vnode_authorize(vp, NULL, KAUTH_VNODE_LINKTARGET, ctx)) != 0) {
		goto out2;
	}

	/* target node must not exist */
	if (lvp != NULLVP) {
		error = EEXIST;
		goto out2;
	}
	/* cannot link across mountpoints */
	if (vnode_mount(vp) != vnode_mount(dvp)) {
		error = EXDEV;
		goto out2;
	}

	/* authorize creation of the target node */
	if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
		goto out2;
	}

#if CONFIG_FILE_LEASES
	/* holders of a directory lease on dvp must be notified of the write */
	vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

	/* and finally make the link */
	error = VNOP_LINK(vp, dvp, &nd.ni_cnd, ctx);
	if (error) {
		goto out2;
	}

#if CONFIG_MACF
	(void)mac_vnode_notify_link(ctx, vp, dvp, &nd.ni_cnd);
#endif

#if CONFIG_FSE
	need_event = need_fsevent(FSE_CREATE_FILE, dvp);
#else
	need_event = 0;
#endif
	has_listeners = kauth_authorize_fileop_has_listeners();

	need_kpath2 = 0;
#if CONFIG_AUDIT
	if (AUDIT_RECORD_EXISTS()) {
		need_kpath2 = 1;
	}
#endif

	/* only pay for path reconstruction if someone will consume it */
	if (need_event || has_listeners || need_kpath2) {
		char *link_to_path = NULL;
		int len, link_name_len;
		int len_no_firmlink_path = 0;

		/* build the path to the new link file */
		GET_PATH(target_path);

		len = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, target_path, MAXPATHLEN, &truncated);
		if (no_firmlink_path == NULL) {
			GET_PATH(no_firmlink_path);
		}
		len_no_firmlink_path = safe_getpath_no_firmlink(dvp, nd.ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);

		AUDIT_ARG(kpath, target_path, ARG_KPATH2);

		if (has_listeners) {
			/* build the path to file we are linking to */
			GET_PATH(link_to_path);

			link_name_len = MAXPATHLEN;
			if (vn_getpath(vp, link_to_path, &link_name_len) == 0) {
				/*
				 * Call out to allow 3rd party notification of link.
				 * Ignore result of kauth_authorize_fileop call.
				 */
				kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_LINK,
				    (uintptr_t)link_to_path,
				    (uintptr_t)target_path);
			}
			if (link_to_path != NULL) {
				RELEASE_PATH(link_to_path);
			}
		}
#if CONFIG_FSE
		if (need_event) {
			/* construct fsevent */
			if (get_fse_info(vp, &finfo, ctx) == 0) {
				if (truncated_no_firmlink_path) {
					finfo.mode |= FSE_TRUNCATED_PATH;
				}

				// build the path to the destination of the link
				add_fsevent(FSE_CREATE_FILE, ctx,
				    FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
				    FSE_ARG_FINFO, &finfo,
				    FSE_ARG_DONE);
			}

			pvp = vp->v_parent;
			// need an iocount on parent vnode in this case
			if (pvp && pvp != dvp) {
				pvp = vnode_getparent_if_different(vp, dvp);
			}
			if (pvp) {
				add_fsevent(FSE_STAT_CHANGED, ctx,
				    FSE_ARG_VNODE, pvp, FSE_ARG_DONE);
			}
			if (pvp && pvp != dvp) {
				/* drop the iocount taken by vnode_getparent_if_different */
				vnode_put(pvp);
			}
		}
#endif
	}
out2:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);
	if (target_path != NULL) {
		RELEASE_PATH(target_path);
	}
	if (no_firmlink_path != NULL) {
		RELEASE_PATH(no_firmlink_path);
		no_firmlink_path = NULL;
	}
out:
	if (lvp) {
		vnode_put(lvp);
	}
	if (dvp) {
		vnode_put(dvp);
	}
	/* iocount on vp was taken by the first nameiat() */
	vnode_put(vp);
	return error;
}
5759
5760 int
link(__unused proc_t p,struct link_args * uap,__unused int32_t * retval)5761 link(__unused proc_t p, struct link_args *uap, __unused int32_t *retval)
5762 {
5763 return linkat_internal(vfs_context_current(), AT_FDCWD, uap->path,
5764 AT_FDCWD, uap->link, AT_SYMLINK_FOLLOW, UIO_USERSPACE);
5765 }
5766
5767 int
linkat(__unused proc_t p,struct linkat_args * uap,__unused int32_t * retval)5768 linkat(__unused proc_t p, struct linkat_args *uap, __unused int32_t *retval)
5769 {
5770 if (uap->flag & ~AT_SYMLINK_FOLLOW) {
5771 return EINVAL;
5772 }
5773
5774 return linkat_internal(vfs_context_current(), uap->fd1, uap->path,
5775 uap->fd2, uap->link, uap->flag, UIO_USERSPACE);
5776 }
5777
5778 /*
5779 * Make a symbolic link.
5780 *
5781 * We could add support for ACLs here too...
5782 */
5783 /* ARGSUSED */
static int
symlinkat_internal(vfs_context_t ctx, user_addr_t path_data, int fd,
    user_addr_t link, enum uio_seg segflg)
{
	struct vnode_attr va;
	char *path;
	int error;
	struct nameidata nd;
	vnode_t vp, dvp;
	size_t dummy = 0;
	proc_t p;

	error = 0;
	if (UIO_SEG_IS_USER_SPACE(segflg)) {
		/* copy the link contents in from user space */
		path = zalloc(ZV_NAMEI);
		error = copyinstr(path_data, path, MAXPATHLEN, &dummy);
	} else {
		/* kernel address: use the caller's buffer directly (not freed below) */
		path = (char *)path_data;
	}
	if (error) {
		goto out;
	}
	AUDIT_ARG(text, path);  /* This is the link string */

	NDINIT(&nd, CREATE, OP_SYMLINK, LOCKPARENT | AUDITVNPATH1,
	    segflg, link, ctx);

	error = nameiat(&nd, fd);
	if (error) {
		goto out;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* default attributes: type VLNK, mode restricted by the umask */
	p = vfs_context_proc(ctx);
	VATTR_INIT(&va);
	VATTR_SET(&va, va_type, VLNK);
	VATTR_SET(&va, va_mode, ACCESSPERMS & ~p->p_fd.fd_cmask);

#if CONFIG_MACF
	error = mac_vnode_check_create(ctx,
	    dvp, &nd.ni_cnd, &va);
#endif
	if (error != 0) {
		goto skipit;
	}

	/* the link name must not already exist */
	if (vp != NULL) {
		error = EEXIST;
		goto skipit;
	}

	/* authorize */
	if (error == 0) {
		error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx);
	}
	/* get default ownership, etc. */
	if (error == 0) {
		error = vnode_authattr_new(dvp, &va, 0, ctx);
	}

#if CONFIG_FILE_LEASES
	/* holders of a directory lease on dvp must be notified of the write */
	vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

	if (error == 0) {
		error = VNOP_SYMLINK(dvp, &vp, &nd.ni_cnd, &va, path, ctx);
	}

	/* do fallback attribute handling */
	if (error == 0 && vp) {
		error = vnode_setattr_fallback(vp, &va, ctx);
	}

#if CONFIG_MACF
	if (error == 0 && vp) {
		error = vnode_label(vnode_mount(vp), dvp, vp, &nd.ni_cnd, VNODE_LABEL_CREATE, ctx);
	}
#endif

	if (error == 0) {
		int update_flags = 0;

		/*check if a new vnode was created, else try to get one*/
		if (vp == NULL) {
			/*
			 * The filesystem did not hand back the new vnode from
			 * VNOP_SYMLINK; re-lookup the name to obtain it.
			 */
			nd.ni_cnd.cn_nameiop = LOOKUP;
#if CONFIG_TRIGGERS
			nd.ni_op = OP_LOOKUP;
#endif
			/*
			 * Clear all flags except HASBUF to prevent 'cn_pnbuf' buffer to be
			 * reallocated again in namei().
			 */
			nd.ni_cnd.cn_flags &= HASBUF;
			error = nameiat(&nd, fd);
			if (error) {
				goto skipit;
			}
			vp = nd.ni_vp;
		}

#if 0 /* XXX - kauth_todo - is KAUTH_FILEOP_SYMLINK needed? */
		/* call out to allow 3rd party notification of rename.
		 * Ignore result of kauth_authorize_fileop call.
		 */
		if (kauth_authorize_fileop_has_listeners() &&
		    namei(&nd) == 0) {
			char *new_link_path = NULL;
			int len;

			/* build the path to the new link file */
			new_link_path = get_pathbuff();
			len = MAXPATHLEN;
			vn_getpath(dvp, new_link_path, &len);
			if ((len + 1 + nd.ni_cnd.cn_namelen + 1) < MAXPATHLEN) {
				new_link_path[len - 1] = '/';
				strlcpy(&new_link_path[len], nd.ni_cnd.cn_nameptr, MAXPATHLEN - len);
			}

			kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_SYMLINK,
			    (uintptr_t)path, (uintptr_t)new_link_path);
			if (new_link_path != NULL) {
				release_pathbuff(new_link_path);
			}
		}
#endif
		// Make sure the name & parent pointers are hooked up
		if (vp->v_name == NULL) {
			update_flags |= VNODE_UPDATE_NAME;
		}
		if (vp->v_parent == NULLVP) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		if (update_flags) {
			vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
		}

#if CONFIG_FSE
		add_fsevent(FSE_CREATE_FILE, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
#endif
	}

skipit:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	vnode_put(dvp);
out:
	/* only free the buffer we allocated for the user-space copy */
	if (path && (path != (char *)path_data)) {
		zfree(ZV_NAMEI, path);
	}

	return error;
}
5947
5948 int
symlink(__unused proc_t p,struct symlink_args * uap,__unused int32_t * retval)5949 symlink(__unused proc_t p, struct symlink_args *uap, __unused int32_t *retval)
5950 {
5951 return symlinkat_internal(vfs_context_current(), uap->path, AT_FDCWD,
5952 uap->link, UIO_USERSPACE);
5953 }
5954
5955 int
symlinkat(__unused proc_t p,struct symlinkat_args * uap,__unused int32_t * retval)5956 symlinkat(__unused proc_t p, struct symlinkat_args *uap,
5957 __unused int32_t *retval)
5958 {
5959 return symlinkat_internal(vfs_context_current(), uap->path1, uap->fd,
5960 uap->path2, UIO_USERSPACE);
5961 }
5962
5963 /*
5964 * Delete a whiteout from the filesystem.
5965 * No longer supported.
5966 */
5967 int
undelete(__unused proc_t p,__unused struct undelete_args * uap,__unused int32_t * retval)5968 undelete(__unused proc_t p, __unused struct undelete_args *uap, __unused int32_t *retval)
5969 {
5970 return ENOTSUP;
5971 }
5972
5973 /*
5974 * Delete a name from the filesystem.
5975 */
5976 /* ARGSUSED */
static int
unlinkat_internal(vfs_context_t ctx, int fd, vnode_t start_dvp,
    user_addr_t path_arg, enum uio_seg segflg, int unlink_flags)
{
	/* large locals are heap-allocated to keep kernel stack usage down */
	struct {
		struct nameidata nd;
#if CONFIG_FSE
		struct vnode_attr va;
		fse_info finfo;
#endif
	} *__unlink_data;
	struct nameidata *ndp;
	vnode_t vp, dvp;
	int error;
	struct componentname *cnp;
	char *path = NULL;
	char *no_firmlink_path = NULL;
	int len_path = 0;
	int len_no_firmlink_path = 0;
	int flags;
	int need_event;
	int has_listeners;
	int truncated_path;
	int truncated_no_firmlink_path;
	int batched;
	struct vnode_attr *vap;
	int do_retry;
	int retry_count = 0;
	int cn_flags;
	int nofollow_any = 0;

	cn_flags = LOCKPARENT;
	if (!(unlink_flags & VNODE_REMOVE_NO_AUDIT_PATH)) {
		cn_flags |= AUDITVNPATH1;
	}
	if (unlink_flags & VNODE_REMOVE_NOFOLLOW_ANY) {
		/* converted to a namei flag; must not reach the filesystem */
		nofollow_any = NAMEI_NOFOLLOW_ANY;
		unlink_flags &= ~VNODE_REMOVE_NOFOLLOW_ANY;
	}
	/* If a starting dvp is passed, it trumps any fd passed. */
	if (start_dvp) {
		cn_flags |= USEDVP;
	}

#if NAMEDRSRCFORK
	/* unlink or delete is allowed on rsrc forks and named streams */
	cn_flags |= CN_ALLOWRSRCFORK;
#endif

	__unlink_data = kalloc_type(typeof(*__unlink_data), Z_WAITOK);
	ndp = &__unlink_data->nd;
#if CONFIG_FSE
	fse_info *finfop = &__unlink_data->finfo;
#endif

retry:
	/* per-attempt state; reset on each redrive of the lookup */
	do_retry = 0;
	flags = 0;
	need_event = 0;
	has_listeners = 0;
	truncated_path = 0;
	truncated_no_firmlink_path = 0;
	vap = NULL;

	NDINIT(ndp, DELETE, OP_UNLINK, cn_flags, segflg, path_arg, ctx);

	ndp->ni_dvp = start_dvp;
	ndp->ni_flag |= NAMEI_COMPOUNDREMOVE | nofollow_any;
	cnp = &ndp->ni_cnd;

continue_lookup:
	error = nameiat(ndp, fd);
	if (error) {
		goto early_out;
	}

	dvp = ndp->ni_dvp;
	vp = ndp->ni_vp;


	/* With Carbon delete semantics, busy files cannot be deleted */
	if (unlink_flags & VNODE_REMOVE_NODELETEBUSY) {
		flags |= VNODE_REMOVE_NODELETEBUSY;
	}

	/* Skip any potential upcalls if told to. */
	if (unlink_flags & VNODE_REMOVE_SKIP_NAMESPACE_EVENT) {
		flags |= VNODE_REMOVE_SKIP_NAMESPACE_EVENT;
	}

	if (vp) {
		batched = vnode_compound_remove_available(vp);
		/*
		 * The root of a mounted filesystem cannot be deleted.
		 */
		if ((vp->v_flag & VROOT) || (dvp->v_mount != vp->v_mount)) {
			error = EBUSY;
			goto out;
		}

#if DEVELOPMENT || DEBUG
		/*
		 * XXX VSWAP: Check for entitlements or special flag here
		 * so we can restrict access appropriately.
		 */
#else /* DEVELOPMENT || DEBUG */

		/* only the kernel itself may delete a swap file */
		if (vnode_isswap(vp) && (ctx != vfs_context_kernel())) {
			error = EPERM;
			goto out;
		}
#endif /* DEVELOPMENT || DEBUG */

		if (!batched) {
			error = vn_authorize_unlink(dvp, vp, cnp, ctx, NULL);
			if (error) {
				if (error == ENOENT) {
					/* racing lookup; redrive a bounded number of times */
					if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
						do_retry = 1;
						retry_count++;
					}
				}
				goto out;
			}
		}
	} else {
		/* no vp: the filesystem performs lookup+remove in one VNOP */
		batched = 1;

		if (!vnode_compound_remove_available(dvp)) {
			panic("No vp, but no compound remove?");
		}
	}

#if CONFIG_FSE
	need_event = need_fsevent(FSE_DELETE, dvp);
	if (need_event) {
		if (!batched) {
			if ((vp->v_flag & VISHARDLINK) == 0) {
				/* XXX need to get these data in batched VNOP */
				get_fse_info(vp, finfop, ctx);
			}
		} else {
			error =
			    vfs_get_notify_attributes(&__unlink_data->va);
			if (error) {
				goto out;
			}

			vap = &__unlink_data->va;
		}
	}
#endif
	has_listeners = kauth_authorize_fileop_has_listeners();
	if (need_event || has_listeners) {
		/* build both path flavors only when someone will consume them */
		if (path == NULL) {
			GET_PATH(path);
		}
		len_path = safe_getpath(dvp, ndp->ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path);
		if (no_firmlink_path == NULL) {
			GET_PATH(no_firmlink_path);
		}
		len_no_firmlink_path = safe_getpath_no_firmlink(dvp, ndp->ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);
	}

#if NAMEDRSRCFORK
	if (ndp->ni_cnd.cn_flags & CN_WANTSRSRCFORK) {
		error = vnode_removenamedstream(dvp, vp, XATTR_RESOURCEFORK_NAME, 0, ctx);
	} else
#endif
	{
#if CONFIG_FILE_LEASES
		/* holders of a directory lease on dvp must be notified of the write */
		vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

		error = vn_remove(dvp, &ndp->ni_vp, ndp, flags, vap, ctx);
		vp = ndp->ni_vp;
		if (error == EKEEPLOOKING) {
			/* compound remove needs the lookup continued */
			if (!batched) {
				panic("EKEEPLOOKING, but not a filesystem that supports compound VNOPs?");
			}

			if ((ndp->ni_flag & NAMEI_CONTLOOKUP) == 0) {
				panic("EKEEPLOOKING, but continue flag not set?");
			}

			if (vnode_isdir(vp)) {
				error = EISDIR;
				goto out;
			}
			goto continue_lookup;
		} else if (error == ENOENT && batched) {
			if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
				/*
				 * For compound VNOPs, the authorization callback may
				 * return ENOENT in case of racing hardlink lookups
				 * hitting the name cache, redrive the lookup.
				 */
				do_retry = 1;
				retry_count += 1;
				goto out;
			}
		}
	}

	/*
	 * Call out to allow 3rd party notification of delete.
	 * Ignore result of kauth_authorize_fileop call.
	 */
	if (!error) {
		if (has_listeners) {
			kauth_authorize_fileop(vfs_context_ucred(ctx),
			    KAUTH_FILEOP_DELETE,
			    (uintptr_t)vp,
			    (uintptr_t)path);
		}

		if (vp->v_flag & VISHARDLINK) {
			//
			// if a hardlink gets deleted we want to blow away the
			// v_parent link because the path that got us to this
			// instance of the link is no longer valid. this will
			// force the next call to get the path to ask the file
			// system instead of just following the v_parent link.
			//
			vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
		}

#if CONFIG_FSE
		if (need_event) {
			if (vp->v_flag & VISHARDLINK) {
				get_fse_info(vp, finfop, ctx);
			} else if (vap) {
				vnode_get_fse_info_from_vap(vp, finfop, vap);
			}
			if (truncated_path) {
				finfop->mode |= FSE_TRUNCATED_PATH;
			}
			add_fsevent(FSE_DELETE, ctx,
			    FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
			    FSE_ARG_FINFO, finfop,
			    FSE_ARG_DONE);
		}
#endif

#if CONFIG_MACF
		mac_vnode_notify_unlink(ctx, dvp, vp, cnp);
#endif
	}

out:
	if (path != NULL) {
		RELEASE_PATH(path);
		path = NULL;
	}

	if (no_firmlink_path != NULL) {
		RELEASE_PATH(no_firmlink_path);
		no_firmlink_path = NULL;
	}
#if NAMEDRSRCFORK
	/* recycle the deleted rsrc fork vnode to force a reclaim, which
	 * will cause its shadow file to go away if necessary.
	 */
	if (vp && (vnode_isnamedstream(vp)) &&
	    (vp->v_parent != NULLVP) &&
	    vnode_isshadow(vp)) {
		vnode_recycle(vp);
	}
#endif
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(ndp);
	vnode_put(dvp);
	if (vp) {
		vnode_put(vp);
	}

	if (do_retry) {
		goto retry;
	}

early_out:
	kfree_type(typeof(*__unlink_data), __unlink_data);
	return error;
}
6263
6264 int
unlink1(vfs_context_t ctx,vnode_t start_dvp,user_addr_t path_arg,enum uio_seg segflg,int unlink_flags)6265 unlink1(vfs_context_t ctx, vnode_t start_dvp, user_addr_t path_arg,
6266 enum uio_seg segflg, int unlink_flags)
6267 {
6268 return unlinkat_internal(ctx, AT_FDCWD, start_dvp, path_arg, segflg,
6269 unlink_flags);
6270 }
6271
6272 /*
6273 * Delete a name from the filesystem using Carbon semantics.
6274 */
6275 int
delete(__unused proc_t p,struct delete_args * uap,__unused int32_t * retval)6276 delete(__unused proc_t p, struct delete_args *uap, __unused int32_t *retval)
6277 {
6278 return unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
6279 uap->path, UIO_USERSPACE, VNODE_REMOVE_NODELETEBUSY);
6280 }
6281
6282 /*
6283 * Delete a name from the filesystem using POSIX semantics.
6284 */
6285 int
unlink(__unused proc_t p,struct unlink_args * uap,__unused int32_t * retval)6286 unlink(__unused proc_t p, struct unlink_args *uap, __unused int32_t *retval)
6287 {
6288 return unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
6289 uap->path, UIO_USERSPACE, 0);
6290 }
6291
6292 int
unlinkat(__unused proc_t p,struct unlinkat_args * uap,__unused int32_t * retval)6293 unlinkat(__unused proc_t p, struct unlinkat_args *uap, __unused int32_t *retval)
6294 {
6295 int unlink_flags = 0;
6296
6297 if (uap->flag & ~(AT_REMOVEDIR | AT_REMOVEDIR_DATALESS | AT_SYMLINK_NOFOLLOW_ANY)) {
6298 return EINVAL;
6299 }
6300
6301 if (uap->flag & AT_SYMLINK_NOFOLLOW_ANY) {
6302 unlink_flags |= VNODE_REMOVE_NOFOLLOW_ANY;
6303 }
6304
6305 if (uap->flag & (AT_REMOVEDIR | AT_REMOVEDIR_DATALESS)) {
6306 if (uap->flag & AT_REMOVEDIR_DATALESS) {
6307 unlink_flags |= VNODE_REMOVE_DATALESS_DIR;
6308 }
6309 return rmdirat_internal(vfs_context_current(), uap->fd,
6310 uap->path, UIO_USERSPACE, unlink_flags);
6311 } else {
6312 return unlinkat_internal(vfs_context_current(), uap->fd,
6313 NULLVP, uap->path, UIO_USERSPACE, unlink_flags);
6314 }
6315 }
6316
6317 /*
6318 * Reposition read/write file offset.
6319 */
int
lseek(proc_t p, struct lseek_args *uap, off_t *retval)
{
	struct fileproc *fp;
	vnode_t vp;
	struct vfs_context *ctx;
	off_t offset = uap->offset, file_size;
	int error;

	if ((error = fp_getfvp(p, uap->fd, &fp, &vp))) {
		/* non-vnode fds come back ENOTSUP; POSIX wants ESPIPE here */
		if (error == ENOTSUP) {
			return ESPIPE;
		}
		return error;
	}
	if (vnode_isfifo(vp)) {
		/* seeking a FIFO is not meaningful */
		file_drop(uap->fd);
		return ESPIPE;
	}


	ctx = vfs_context_current();
#if CONFIG_MACF
	/* lseek(fd, 0, SEEK_CUR) only reads the offset; check accordingly */
	if (uap->whence == L_INCR && uap->offset == 0) {
		error = mac_file_check_get_offset(vfs_context_ucred(ctx),
		    fp->fp_glob);
	} else {
		error = mac_file_check_change_offset(vfs_context_ucred(ctx),
		    fp->fp_glob);
	}
	if (error) {
		file_drop(uap->fd);
		return error;
	}
#endif
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	/* turn the (whence, offset) pair into an absolute offset */
	switch (uap->whence) {
	case L_INCR:
		offset += fp->fp_glob->fg_offset;
		break;
	case L_XTND:
		if ((error = vnode_size(vp, &file_size, ctx)) != 0) {
			break;
		}
		offset += file_size;
		break;
	case L_SET:
		break;
	case SEEK_HOLE:
		error = VNOP_IOCTL(vp, FSIOC_FIOSEEKHOLE, (caddr_t)&offset, 0, ctx);
		break;
	case SEEK_DATA:
		error = VNOP_IOCTL(vp, FSIOC_FIOSEEKDATA, (caddr_t)&offset, 0, ctx);
		break;
	default:
		error = EINVAL;
	}
	if (error == 0) {
		/* a positive delta that produced a negative offset overflowed */
		if (uap->offset > 0 && offset < 0) {
			/* Incremented/relative move past max size */
			error = EOVERFLOW;
		} else {
			/*
			 * Allow negative offsets on character devices, per
			 * POSIX 1003.1-2001. Most likely for writing disk
			 * labels.
			 */
			if (offset < 0 && vp->v_type != VCHR) {
				/* Decremented/relative move before start */
				error = EINVAL;
			} else {
				/* Success */
				fp->fp_glob->fg_offset = offset;
				*retval = fp->fp_glob->fg_offset;
			}
		}
	}

	/*
	 * An lseek can affect whether data is "available to read."  Use
	 * hint of NOTE_NONE so no EVFILT_VNODE events fire
	 */
	post_event_if_success(vp, error, NOTE_NONE);
	(void)vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
6411
6412
6413 /*
6414 * Check access permissions.
6415 *
6416 * Returns: 0 Success
6417 * vnode_authorize:???
6418 */
6419 static int
access1(vnode_t vp,vnode_t dvp,int uflags,vfs_context_t ctx)6420 access1(vnode_t vp, vnode_t dvp, int uflags, vfs_context_t ctx)
6421 {
6422 kauth_action_t action;
6423 int error;
6424
6425 /*
6426 * If just the regular access bits, convert them to something
6427 * that vnode_authorize will understand.
6428 */
6429 if (!(uflags & _ACCESS_EXTENDED_MASK)) {
6430 action = 0;
6431 if (uflags & R_OK) {
6432 action |= KAUTH_VNODE_READ_DATA; /* aka KAUTH_VNODE_LIST_DIRECTORY */
6433 }
6434 if (uflags & W_OK) {
6435 if (vnode_isdir(vp)) {
6436 action |= KAUTH_VNODE_ADD_FILE |
6437 KAUTH_VNODE_ADD_SUBDIRECTORY;
6438 /* might want delete rights here too */
6439 } else {
6440 action |= KAUTH_VNODE_WRITE_DATA;
6441 }
6442 }
6443 if (uflags & X_OK) {
6444 if (vnode_isdir(vp)) {
6445 action |= KAUTH_VNODE_SEARCH;
6446 } else {
6447 action |= KAUTH_VNODE_EXECUTE;
6448 }
6449 }
6450 } else {
6451 /* take advantage of definition of uflags */
6452 action = uflags >> 8;
6453 }
6454
6455 #if CONFIG_MACF
6456 error = mac_vnode_check_access(ctx, vp, uflags);
6457 if (error) {
6458 return error;
6459 }
6460 #endif /* MAC */
6461
6462 /* action == 0 means only check for existence */
6463 if (action != 0) {
6464 error = vnode_authorize(vp, dvp, action | KAUTH_VNODE_ACCESS, ctx);
6465 } else {
6466 error = 0;
6467 }
6468
6469 return error;
6470 }
6471
6472
6473
6474 /*
6475 * access_extended: Check access permissions in bulk.
6476 *
6477 * Description: uap->entries Pointer to an array of accessx
6478 * descriptor structs, plus one or
6479 * more NULL terminated strings (see
6480 * "Notes" section below).
6481 * uap->size Size of the area pointed to by
6482 * uap->entries.
6483 * uap->results Pointer to the results array.
6484 *
6485 * Returns: 0 Success
6486 * ENOMEM Insufficient memory
6487 * EINVAL Invalid arguments
6488 * namei:EFAULT Bad address
6489 * namei:ENAMETOOLONG Filename too long
6490 * namei:ENOENT No such file or directory
6491 * namei:ELOOP Too many levels of symbolic links
6492 * namei:EBADF Bad file descriptor
6493 * namei:ENOTDIR Not a directory
6494 * namei:???
6495 * access1:
6496 *
6497 * Implicit returns:
6498 * uap->results Array contents modified
6499 *
6500 * Notes: The uap->entries are structured as an arbitrary length array
6501 * of accessx descriptors, followed by one or more NULL terminated
6502 * strings
6503 *
6504 * struct accessx_descriptor[0]
6505 * ...
6506 * struct accessx_descriptor[n]
6507 * char name_data[0];
6508 *
6509 * We determine the entry count by walking the buffer containing
6510 * the uap->entries argument descriptor. For each descriptor we
6511 * see, the valid values for the offset ad_name_offset will be
6512 * in the byte range:
6513 *
6514 * [ uap->entries + sizeof(struct accessx_descriptor) ]
6515 * to
6516 * [ uap->entries + uap->size - 2 ]
6517 *
6518 * since we must have at least one string, and the string must
6519 * be at least one character plus the NULL terminator in length.
6520 *
6521 * XXX: Need to support the check-as uid argument
6522 */
6523 int
access_extended(__unused proc_t p,struct access_extended_args * uap,__unused int32_t * retval)6524 access_extended(__unused proc_t p, struct access_extended_args *uap, __unused int32_t *retval)
6525 {
6526 struct accessx_descriptor *input = NULL;
6527 errno_t *result = NULL;
6528 errno_t error = 0;
6529 int wantdelete = 0;
6530 size_t desc_max, desc_actual = 0;
6531 unsigned int i, j;
6532 struct vfs_context context;
6533 struct nameidata nd;
6534 int niopts;
6535 vnode_t vp = NULL;
6536 vnode_t dvp = NULL;
6537 #define ACCESSX_MAX_DESCR_ON_STACK 10
6538 struct accessx_descriptor stack_input[ACCESSX_MAX_DESCR_ON_STACK];
6539
6540 context.vc_ucred = NULL;
6541
6542 /*
6543 * Validate parameters; if valid, copy the descriptor array and string
6544 * arguments into local memory. Before proceeding, the following
6545 * conditions must have been met:
6546 *
6547 * o The total size is not permitted to exceed ACCESSX_MAX_TABLESIZE
6548 * o There must be sufficient room in the request for at least one
 * descriptor and a one-byte NUL terminated string.
6550 * o The allocation of local storage must not fail.
6551 */
6552 if (uap->size > ACCESSX_MAX_TABLESIZE) {
6553 return ENOMEM;
6554 }
6555 if (uap->size < (sizeof(struct accessx_descriptor) + 2)) {
6556 return EINVAL;
6557 }
6558 if (uap->size <= sizeof(stack_input)) {
6559 input = stack_input;
6560 } else {
6561 input = kalloc_data(uap->size, Z_WAITOK);
6562 if (input == NULL) {
6563 error = ENOMEM;
6564 goto out;
6565 }
6566 }
6567 error = copyin(uap->entries, input, uap->size);
6568 if (error) {
6569 goto out;
6570 }
6571
6572 AUDIT_ARG(opaque, input, uap->size);
6573
6574 /*
 * Force NUL termination of the copyin buffer to avoid namei() running
6576 * off the end. If the caller passes us bogus data, they may get a
6577 * bogus result.
6578 */
6579 ((char *)input)[uap->size - 1] = 0;
6580
6581 /*
6582 * Access is defined as checking against the process' real identity,
6583 * even if operations are checking the effective identity. This
6584 * requires that we use a local vfs context.
6585 */
6586 context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
6587 context.vc_thread = current_thread();
6588
6589 /*
6590 * Find out how many entries we have, so we can allocate the result
6591 * array by walking the list and adjusting the count downward by the
6592 * earliest string offset we see.
6593 */
6594 desc_max = (uap->size - 2) / sizeof(struct accessx_descriptor);
6595 desc_actual = desc_max;
6596 for (i = 0; i < desc_actual; i++) {
6597 /*
6598 * Take the offset to the name string for this entry and
6599 * convert to an input array index, which would be one off
6600 * the end of the array if this entry was the lowest-addressed
6601 * name string.
6602 */
6603 j = input[i].ad_name_offset / sizeof(struct accessx_descriptor);
6604
6605 /*
6606 * An offset greater than the max allowable offset is an error.
6607 * It is also an error for any valid entry to point
6608 * to a location prior to the end of the current entry, if
6609 * it's not a reference to the string of the previous entry.
6610 */
6611 if (j > desc_max || (j != 0 && j <= i)) {
6612 error = EINVAL;
6613 goto out;
6614 }
6615
6616 /* Also do not let ad_name_offset point to something beyond the size of the input */
6617 if (input[i].ad_name_offset >= uap->size) {
6618 error = EINVAL;
6619 goto out;
6620 }
6621
6622 /*
6623 * An offset of 0 means use the previous descriptor's offset;
6624 * this is used to chain multiple requests for the same file
6625 * to avoid multiple lookups.
6626 */
6627 if (j == 0) {
6628 /* This is not valid for the first entry */
6629 if (i == 0) {
6630 error = EINVAL;
6631 goto out;
6632 }
6633 continue;
6634 }
6635
6636 /*
6637 * If the offset of the string for this descriptor is before
6638 * what we believe is the current actual last descriptor,
6639 * then we need to adjust our estimate downward; this permits
6640 * the string table following the last descriptor to be out
6641 * of order relative to the descriptor list.
6642 */
6643 if (j < desc_actual) {
6644 desc_actual = j;
6645 }
6646 }
6647
6648 /*
6649 * We limit the actual number of descriptors we are willing to process
6650 * to a hard maximum of ACCESSX_MAX_DESCRIPTORS. If the number being
6651 * requested does not exceed this limit,
6652 */
6653 if (desc_actual > ACCESSX_MAX_DESCRIPTORS) {
6654 error = ENOMEM;
6655 goto out;
6656 }
6657 result = kalloc_data(desc_actual * sizeof(errno_t), Z_WAITOK | Z_ZERO);
6658 if (result == NULL) {
6659 error = ENOMEM;
6660 goto out;
6661 }
6662
6663 /*
6664 * Do the work by iterating over the descriptor entries we know to
6665 * at least appear to contain valid data.
6666 */
6667 error = 0;
6668 for (i = 0; i < desc_actual; i++) {
6669 /*
6670 * If the ad_name_offset is 0, then we use the previous
6671 * results to make the check; otherwise, we are looking up
6672 * a new file name.
6673 */
6674 if (input[i].ad_name_offset != 0) {
6675 /* discard old vnodes */
6676 if (vp) {
6677 vnode_put(vp);
6678 vp = NULL;
6679 }
6680 if (dvp) {
6681 vnode_put(dvp);
6682 dvp = NULL;
6683 }
6684
6685 /*
6686 * Scan forward in the descriptor list to see if we
6687 * need the parent vnode. We will need it if we are
6688 * deleting, since we must have rights to remove
6689 * entries in the parent directory, as well as the
6690 * rights to delete the object itself.
6691 */
6692 wantdelete = input[i].ad_flags & _DELETE_OK;
6693 for (j = i + 1; (j < desc_actual) && (input[j].ad_name_offset == 0); j++) {
6694 if (input[j].ad_flags & _DELETE_OK) {
6695 wantdelete = 1;
6696 }
6697 }
6698
6699 niopts = FOLLOW | AUDITVNPATH1;
6700
6701 /* need parent for vnode_authorize for deletion test */
6702 if (wantdelete) {
6703 niopts |= WANTPARENT;
6704 }
6705
6706 /* do the lookup */
6707 NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, UIO_SYSSPACE,
6708 CAST_USER_ADDR_T(((const char *)input) + input[i].ad_name_offset),
6709 &context);
6710 error = namei(&nd);
6711 if (!error) {
6712 vp = nd.ni_vp;
6713 if (wantdelete) {
6714 dvp = nd.ni_dvp;
6715 }
6716 }
6717 nameidone(&nd);
6718 }
6719
6720 /*
6721 * Handle lookup errors.
6722 */
6723 switch (error) {
6724 case ENOENT:
6725 case EACCES:
6726 case EPERM:
6727 case ENOTDIR:
6728 result[i] = error;
6729 break;
6730 case 0:
6731 /* run this access check */
6732 result[i] = access1(vp, dvp, input[i].ad_flags, &context);
6733 break;
6734 default:
6735 /* fatal lookup error */
6736
6737 goto out;
6738 }
6739 }
6740
6741 AUDIT_ARG(data, result, sizeof(errno_t), desc_actual);
6742
6743 /* copy out results */
6744 error = copyout(result, uap->results, desc_actual * sizeof(errno_t));
6745
6746 out:
6747 if (input && input != stack_input) {
6748 kfree_data(input, uap->size);
6749 }
6750 if (result) {
6751 kfree_data(result, desc_actual * sizeof(errno_t));
6752 }
6753 if (vp) {
6754 vnode_put(vp);
6755 }
6756 if (dvp) {
6757 vnode_put(dvp);
6758 }
6759 if (IS_VALID_CRED(context.vc_ucred)) {
6760 kauth_cred_unref(&context.vc_ucred);
6761 }
6762 return error;
6763 }
6764
6765
6766 /*
6767 * Returns: 0 Success
6768 * namei:EFAULT Bad address
6769 * namei:ENAMETOOLONG Filename too long
6770 * namei:ENOENT No such file or directory
6771 * namei:ELOOP Too many levels of symbolic links
6772 * namei:EBADF Bad file descriptor
6773 * namei:ENOTDIR Not a directory
6774 * namei:???
6775 * access1:
6776 */
static int
faccessat_internal(vfs_context_t ctx, int fd, user_addr_t path, int amode,
    int flag, enum uio_seg segflg)
{
	int error;
	struct nameidata nd;
	int niopts;
	struct vfs_context context;
#if NAMEDRSRCFORK
	int is_namedstream = 0;
#endif

	/*
	 * Unless the AT_EACCESS option is used, Access is defined as checking
	 * against the process' real identity, even if operations are checking
	 * the effective identity. So we need to tweak the credential
	 * in the context for that case.
	 */
	if (!(flag & AT_EACCESS)) {
		/* takes a credential reference; dropped at "out" below */
		context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
	} else {
		/* borrow the caller's credential; no extra reference taken */
		context.vc_ucred = ctx->vc_ucred;
	}
	context.vc_thread = ctx->vc_thread;


	niopts = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY) ? NOFOLLOW : FOLLOW) | AUDITVNPATH1;
	/* need parent for vnode_authorize for deletion test */
	if (amode & _DELETE_OK) {
		niopts |= WANTPARENT;
	}
	NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, segflg,
	    path, &context);
	if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
		/* refuse to traverse a symlink anywhere in the path */
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}

#if NAMEDRSRCFORK
	/* access(F_OK) calls are allowed for resource forks. */
	if (amode == F_OK) {
		nd.ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
	}
#endif
	error = nameiat(&nd, fd);
	if (error) {
		goto out;
	}

#if NAMEDRSRCFORK
	/* Grab reference on the shadow stream file vnode to
	 * force an inactive on release which will mark it
	 * for recycle.
	 */
	if (vnode_isnamedstream(nd.ni_vp) &&
	    (nd.ni_vp->v_parent != NULLVP) &&
	    vnode_isshadow(nd.ni_vp)) {
		is_namedstream = 1;
		vnode_ref(nd.ni_vp);
	}
#endif

	/* run the actual permission check against vp (and dvp for _DELETE_OK) */
	error = access1(nd.ni_vp, nd.ni_dvp, amode, &context);

#if NAMEDRSRCFORK
	if (is_namedstream) {
		vnode_rele(nd.ni_vp);
	}
#endif

	vnode_put(nd.ni_vp);
	if (amode & _DELETE_OK) {
		/* WANTPARENT was set, so we also hold an iocount on the parent */
		vnode_put(nd.ni_dvp);
	}
	nameidone(&nd);

out:
	if (!(flag & AT_EACCESS)) {
		/* drop the real-identity credential taken above */
		kauth_cred_unref(&context.vc_ucred);
	}
	return error;
}
6858
6859 int
access(__unused proc_t p,struct access_args * uap,__unused int32_t * retval)6860 access(__unused proc_t p, struct access_args *uap, __unused int32_t *retval)
6861 {
6862 return faccessat_internal(vfs_context_current(), AT_FDCWD,
6863 uap->path, uap->flags, 0, UIO_USERSPACE);
6864 }
6865
6866 int
faccessat(__unused proc_t p,struct faccessat_args * uap,__unused int32_t * retval)6867 faccessat(__unused proc_t p, struct faccessat_args *uap,
6868 __unused int32_t *retval)
6869 {
6870 if (uap->flag & ~(AT_EACCESS | AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) {
6871 return EINVAL;
6872 }
6873
6874 return faccessat_internal(vfs_context_current(), uap->fd,
6875 uap->path, uap->amode, uap->flag, UIO_USERSPACE);
6876 }
6877
6878 /*
6879 * Returns: 0 Success
6880 * EFAULT
6881 * copyout:EFAULT
6882 * namei:???
6883 * vn_stat:???
6884 */
6885 static int
fstatat_internal(vfs_context_t ctx,user_addr_t path,user_addr_t ub,user_addr_t xsecurity,user_addr_t xsecurity_size,int isstat64,enum uio_seg segflg,int fd,int flag)6886 fstatat_internal(vfs_context_t ctx, user_addr_t path, user_addr_t ub,
6887 user_addr_t xsecurity, user_addr_t xsecurity_size, int isstat64,
6888 enum uio_seg segflg, int fd, int flag)
6889 {
6890 struct nameidata *ndp = NULL;
6891 int follow;
6892 union {
6893 struct stat sb;
6894 struct stat64 sb64;
6895 } source = {};
6896 union {
6897 struct user64_stat user64_sb;
6898 struct user32_stat user32_sb;
6899 struct user64_stat64 user64_sb64;
6900 struct user32_stat64 user32_sb64;
6901 } dest = {};
6902 caddr_t sbp;
6903 int error, my_size;
6904 kauth_filesec_t fsec = KAUTH_FILESEC_NONE;
6905 size_t xsecurity_bufsize;
6906 void * statptr;
6907 struct fileproc *fp = NULL;
6908 int needsrealdev = 0;
6909
6910 follow = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) ? NOFOLLOW : FOLLOW;
6911 ndp = kalloc_type(struct nameidata, Z_WAITOK);
6912 NDINIT(ndp, LOOKUP, OP_GETATTR, follow | AUDITVNPATH1,
6913 segflg, path, ctx);
6914 if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
6915 ndp->ni_flag |= NAMEI_NOFOLLOW_ANY;
6916 }
6917
6918 #if NAMEDRSRCFORK
6919 int is_namedstream = 0;
6920 /* stat calls are allowed for resource forks. */
6921 ndp->ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
6922 #endif
6923
6924 if (flag & AT_FDONLY) {
6925 vnode_t fvp;
6926
6927 error = fp_getfvp(vfs_context_proc(ctx), fd, &fp, &fvp);
6928 if (error) {
6929 goto out;
6930 }
6931 if ((error = vnode_getwithref(fvp))) {
6932 file_drop(fd);
6933 goto out;
6934 }
6935 ndp->ni_vp = fvp;
6936 } else {
6937 error = nameiat(ndp, fd);
6938 if (error) {
6939 goto out;
6940 }
6941 }
6942
6943 statptr = (void *)&source;
6944
6945 #if NAMEDRSRCFORK
6946 /* Grab reference on the shadow stream file vnode to
6947 * force an inactive on release which will mark it
6948 * for recycle.
6949 */
6950 if (vnode_isnamedstream(ndp->ni_vp) &&
6951 (ndp->ni_vp->v_parent != NULLVP) &&
6952 vnode_isshadow(ndp->ni_vp)) {
6953 is_namedstream = 1;
6954 vnode_ref(ndp->ni_vp);
6955 }
6956 #endif
6957
6958 needsrealdev = flag & AT_REALDEV ? 1 : 0;
6959 if (fp && (xsecurity == USER_ADDR_NULL)) {
6960 /*
6961 * If the caller has the file open, and is not
6962 * requesting extended security information, we are
6963 * going to let them get the basic stat information.
6964 */
6965 error = vn_stat_noauth(ndp->ni_vp, statptr, NULL, isstat64, needsrealdev, ctx,
6966 fp->fp_glob->fg_cred);
6967 } else {
6968 error = vn_stat(ndp->ni_vp, statptr, (xsecurity != USER_ADDR_NULL ? &fsec : NULL),
6969 isstat64, needsrealdev, ctx);
6970 }
6971
6972 #if NAMEDRSRCFORK
6973 if (is_namedstream) {
6974 vnode_rele(ndp->ni_vp);
6975 }
6976 #endif
6977 vnode_put(ndp->ni_vp);
6978 nameidone(ndp);
6979
6980 if (fp) {
6981 file_drop(fd);
6982 fp = NULL;
6983 }
6984
6985 if (error) {
6986 goto out;
6987 }
6988 /* Zap spare fields */
6989 if (isstat64 != 0) {
6990 source.sb64.st_lspare = 0;
6991 source.sb64.st_qspare[0] = 0LL;
6992 source.sb64.st_qspare[1] = 0LL;
6993 if (vfs_context_is64bit(ctx)) {
6994 munge_user64_stat64(&source.sb64, &dest.user64_sb64);
6995 my_size = sizeof(dest.user64_sb64);
6996 sbp = (caddr_t)&dest.user64_sb64;
6997 } else {
6998 munge_user32_stat64(&source.sb64, &dest.user32_sb64);
6999 my_size = sizeof(dest.user32_sb64);
7000 sbp = (caddr_t)&dest.user32_sb64;
7001 }
7002 /*
7003 * Check if we raced (post lookup) against the last unlink of a file.
7004 */
7005 if ((source.sb64.st_nlink == 0) && S_ISREG(source.sb64.st_mode)) {
7006 source.sb64.st_nlink = 1;
7007 }
7008 } else {
7009 source.sb.st_lspare = 0;
7010 source.sb.st_qspare[0] = 0LL;
7011 source.sb.st_qspare[1] = 0LL;
7012 if (vfs_context_is64bit(ctx)) {
7013 munge_user64_stat(&source.sb, &dest.user64_sb);
7014 my_size = sizeof(dest.user64_sb);
7015 sbp = (caddr_t)&dest.user64_sb;
7016 } else {
7017 munge_user32_stat(&source.sb, &dest.user32_sb);
7018 my_size = sizeof(dest.user32_sb);
7019 sbp = (caddr_t)&dest.user32_sb;
7020 }
7021
7022 /*
7023 * Check if we raced (post lookup) against the last unlink of a file.
7024 */
7025 if ((source.sb.st_nlink == 0) && S_ISREG(source.sb.st_mode)) {
7026 source.sb.st_nlink = 1;
7027 }
7028 }
7029 if ((error = copyout(sbp, ub, my_size)) != 0) {
7030 goto out;
7031 }
7032
7033 /* caller wants extended security information? */
7034 if (xsecurity != USER_ADDR_NULL) {
7035 /* did we get any? */
7036 if (fsec == KAUTH_FILESEC_NONE) {
7037 if (susize(xsecurity_size, 0) != 0) {
7038 error = EFAULT;
7039 goto out;
7040 }
7041 } else {
7042 /* find the user buffer size */
7043 xsecurity_bufsize = fusize(xsecurity_size);
7044
7045 /* copy out the actual data size */
7046 if (susize(xsecurity_size, KAUTH_FILESEC_COPYSIZE(fsec)) != 0) {
7047 error = EFAULT;
7048 goto out;
7049 }
7050
7051 /* if the caller supplied enough room, copy out to it */
7052 if (xsecurity_bufsize >= KAUTH_FILESEC_COPYSIZE(fsec)) {
7053 error = copyout(fsec, xsecurity, KAUTH_FILESEC_COPYSIZE(fsec));
7054 }
7055 }
7056 }
7057 out:
7058 if (ndp) {
7059 kfree_type(struct nameidata, ndp);
7060 }
7061 if (fsec != KAUTH_FILESEC_NONE) {
7062 kauth_filesec_free(fsec);
7063 }
7064 return error;
7065 }
7066
7067 /*
7068 * stat_extended: Get file status; with extended security (ACL).
7069 *
7070 * Parameters: p (ignored)
7071 * uap User argument descriptor (see below)
7072 * retval (ignored)
7073 *
7074 * Indirect: uap->path Path of file to get status from
7075 * uap->ub User buffer (holds file status info)
7076 * uap->xsecurity ACL to get (extended security)
7077 * uap->xsecurity_size Size of ACL
7078 *
7079 * Returns: 0 Success
7080 * !0 errno value
7081 *
7082 */
7083 int
stat_extended(__unused proc_t p,struct stat_extended_args * uap,__unused int32_t * retval)7084 stat_extended(__unused proc_t p, struct stat_extended_args *uap,
7085 __unused int32_t *retval)
7086 {
7087 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7088 uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
7089 0);
7090 }
7091
7092 /*
7093 * Returns: 0 Success
7094 * fstatat_internal:??? [see fstatat_internal() in this file]
7095 */
7096 int
stat(__unused proc_t p,struct stat_args * uap,__unused int32_t * retval)7097 stat(__unused proc_t p, struct stat_args *uap, __unused int32_t *retval)
7098 {
7099 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7100 0, 0, 0, UIO_USERSPACE, AT_FDCWD, 0);
7101 }
7102
7103 int
stat64(__unused proc_t p,struct stat64_args * uap,__unused int32_t * retval)7104 stat64(__unused proc_t p, struct stat64_args *uap, __unused int32_t *retval)
7105 {
7106 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7107 0, 0, 1, UIO_USERSPACE, AT_FDCWD, 0);
7108 }
7109
7110 /*
7111 * stat64_extended: Get file status; can handle large inode numbers; with extended security (ACL).
7112 *
7113 * Parameters: p (ignored)
7114 * uap User argument descriptor (see below)
7115 * retval (ignored)
7116 *
7117 * Indirect: uap->path Path of file to get status from
7118 * uap->ub User buffer (holds file status info)
7119 * uap->xsecurity ACL to get (extended security)
7120 * uap->xsecurity_size Size of ACL
7121 *
7122 * Returns: 0 Success
7123 * !0 errno value
7124 *
7125 */
7126 int
stat64_extended(__unused proc_t p,struct stat64_extended_args * uap,__unused int32_t * retval)7127 stat64_extended(__unused proc_t p, struct stat64_extended_args *uap, __unused int32_t *retval)
7128 {
7129 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7130 uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
7131 0);
7132 }
7133
7134 /*
7135 * lstat_extended: Get file status; does not follow links; with extended security (ACL).
7136 *
7137 * Parameters: p (ignored)
7138 * uap User argument descriptor (see below)
7139 * retval (ignored)
7140 *
7141 * Indirect: uap->path Path of file to get status from
7142 * uap->ub User buffer (holds file status info)
7143 * uap->xsecurity ACL to get (extended security)
7144 * uap->xsecurity_size Size of ACL
7145 *
7146 * Returns: 0 Success
7147 * !0 errno value
7148 *
7149 */
7150 int
lstat_extended(__unused proc_t p,struct lstat_extended_args * uap,__unused int32_t * retval)7151 lstat_extended(__unused proc_t p, struct lstat_extended_args *uap, __unused int32_t *retval)
7152 {
7153 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7154 uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
7155 AT_SYMLINK_NOFOLLOW);
7156 }
7157
7158 /*
7159 * Get file status; this version does not follow links.
7160 */
7161 int
lstat(__unused proc_t p,struct lstat_args * uap,__unused int32_t * retval)7162 lstat(__unused proc_t p, struct lstat_args *uap, __unused int32_t *retval)
7163 {
7164 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7165 0, 0, 0, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW);
7166 }
7167
7168 int
lstat64(__unused proc_t p,struct lstat64_args * uap,__unused int32_t * retval)7169 lstat64(__unused proc_t p, struct lstat64_args *uap, __unused int32_t *retval)
7170 {
7171 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7172 0, 0, 1, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW);
7173 }
7174
7175 /*
7176 * lstat64_extended: Get file status; can handle large inode numbers; does not
7177 * follow links; with extended security (ACL).
7178 *
7179 * Parameters: p (ignored)
7180 * uap User argument descriptor (see below)
7181 * retval (ignored)
7182 *
7183 * Indirect: uap->path Path of file to get status from
7184 * uap->ub User buffer (holds file status info)
7185 * uap->xsecurity ACL to get (extended security)
7186 * uap->xsecurity_size Size of ACL
7187 *
7188 * Returns: 0 Success
7189 * !0 errno value
7190 *
7191 */
7192 int
lstat64_extended(__unused proc_t p,struct lstat64_extended_args * uap,__unused int32_t * retval)7193 lstat64_extended(__unused proc_t p, struct lstat64_extended_args *uap, __unused int32_t *retval)
7194 {
7195 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7196 uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
7197 AT_SYMLINK_NOFOLLOW);
7198 }
7199
7200 int
fstatat(__unused proc_t p,struct fstatat_args * uap,__unused int32_t * retval)7201 fstatat(__unused proc_t p, struct fstatat_args *uap, __unused int32_t *retval)
7202 {
7203 if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_REALDEV | AT_FDONLY | AT_SYMLINK_NOFOLLOW_ANY)) {
7204 return EINVAL;
7205 }
7206
7207 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7208 0, 0, 0, UIO_USERSPACE, uap->fd, uap->flag);
7209 }
7210
7211 int
fstatat64(__unused proc_t p,struct fstatat64_args * uap,__unused int32_t * retval)7212 fstatat64(__unused proc_t p, struct fstatat64_args *uap,
7213 __unused int32_t *retval)
7214 {
7215 if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_REALDEV | AT_FDONLY | AT_SYMLINK_NOFOLLOW_ANY)) {
7216 return EINVAL;
7217 }
7218
7219 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
7220 0, 0, 1, UIO_USERSPACE, uap->fd, uap->flag);
7221 }
7222
7223 /*
7224 * Get configurable pathname variables.
7225 *
7226 * Returns: 0 Success
7227 * namei:???
7228 * vn_pathconf:???
7229 *
7230 * Notes: Global implementation constants are intended to be
7231 * implemented in this function directly; all other constants
7232 * are per-FS implementation, and therefore must be handled in
7233 * each respective FS, instead.
7234 *
7235 * XXX We implement some things globally right now that should actually be
7236 * XXX per-FS; we will need to deal with this at some point.
7237 */
7238 /* ARGSUSED */
7239 int
pathconf(__unused proc_t p,struct pathconf_args * uap,int32_t * retval)7240 pathconf(__unused proc_t p, struct pathconf_args *uap, int32_t *retval)
7241 {
7242 int error;
7243 struct nameidata nd;
7244 vfs_context_t ctx = vfs_context_current();
7245
7246 NDINIT(&nd, LOOKUP, OP_PATHCONF, FOLLOW | AUDITVNPATH1,
7247 UIO_USERSPACE, uap->path, ctx);
7248 error = namei(&nd);
7249 if (error) {
7250 return error;
7251 }
7252
7253 error = vn_pathconf(nd.ni_vp, uap->name, retval, ctx);
7254
7255 vnode_put(nd.ni_vp);
7256 nameidone(&nd);
7257 return error;
7258 }
7259
7260 /*
7261 * Return target name of a symbolic link.
7262 */
7263 /* ARGSUSED */
7264 static int
readlinkat_internal(vfs_context_t ctx,int fd,vnode_t lnk_vp,user_addr_t path,enum uio_seg seg,user_addr_t buf,size_t bufsize,enum uio_seg bufseg,int * retval)7265 readlinkat_internal(vfs_context_t ctx, int fd, vnode_t lnk_vp, user_addr_t path,
7266 enum uio_seg seg, user_addr_t buf, size_t bufsize, enum uio_seg bufseg,
7267 int *retval)
7268 {
7269 vnode_t vp;
7270 uio_t auio;
7271 int error;
7272 struct nameidata nd;
7273 UIO_STACKBUF(uio_buf, 1);
7274 bool put_vnode;
7275
7276 if (bufsize > INT32_MAX) {
7277 return EINVAL;
7278 }
7279
7280 if (lnk_vp) {
7281 vp = lnk_vp;
7282 put_vnode = false;
7283 } else {
7284 NDINIT(&nd, LOOKUP, OP_READLINK, NOFOLLOW | AUDITVNPATH1,
7285 seg, path, ctx);
7286
7287 error = nameiat(&nd, fd);
7288 if (error) {
7289 return error;
7290 }
7291 vp = nd.ni_vp;
7292 put_vnode = true;
7293 nameidone(&nd);
7294 }
7295
7296 auio = uio_createwithbuffer(1, 0, bufseg, UIO_READ,
7297 &uio_buf[0], sizeof(uio_buf));
7298 uio_addiov(auio, buf, bufsize);
7299 if (vp->v_type != VLNK) {
7300 error = EINVAL;
7301 } else {
7302 #if CONFIG_MACF
7303 error = mac_vnode_check_readlink(ctx, vp);
7304 #endif
7305 if (error == 0) {
7306 error = vnode_authorize(vp, NULL, KAUTH_VNODE_READ_DATA,
7307 ctx);
7308 }
7309 if (error == 0) {
7310 error = VNOP_READLINK(vp, auio, ctx);
7311 }
7312 }
7313
7314 if (put_vnode) {
7315 vnode_put(vp);
7316 }
7317
7318 *retval = (int)(bufsize - uio_resid(auio));
7319 return error;
7320 }
7321
7322 int
freadlink(proc_t p,struct freadlink_args * uap,int32_t * retval)7323 freadlink(proc_t p, struct freadlink_args *uap, int32_t *retval)
7324 {
7325 enum uio_seg procseg;
7326 vnode_t vp;
7327 int error;
7328
7329 procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
7330
7331 AUDIT_ARG(fd, uap->fd);
7332
7333 if ((error = file_vnode(uap->fd, &vp))) {
7334 return error;
7335 }
7336 if ((error = vnode_getwithref(vp))) {
7337 file_drop(uap->fd);
7338 return error;
7339 }
7340
7341 error = readlinkat_internal(vfs_context_current(), -1,
7342 vp, 0, procseg, CAST_USER_ADDR_T(uap->buf),
7343 uap->bufsize, procseg, retval);
7344
7345 vnode_put(vp);
7346 file_drop(uap->fd);
7347 return error;
7348 }
7349
7350 int
readlink(proc_t p,struct readlink_args * uap,int32_t * retval)7351 readlink(proc_t p, struct readlink_args *uap, int32_t *retval)
7352 {
7353 enum uio_seg procseg;
7354
7355 procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
7356 return readlinkat_internal(vfs_context_current(), AT_FDCWD, NULL,
7357 CAST_USER_ADDR_T(uap->path), procseg, CAST_USER_ADDR_T(uap->buf),
7358 uap->count, procseg, retval);
7359 }
7360
7361 int
readlinkat(proc_t p,struct readlinkat_args * uap,int32_t * retval)7362 readlinkat(proc_t p, struct readlinkat_args *uap, int32_t *retval)
7363 {
7364 enum uio_seg procseg;
7365
7366 procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
7367 return readlinkat_internal(vfs_context_current(), uap->fd, NULL,
7368 CAST_USER_ADDR_T(uap->path), procseg, uap->buf, uap->bufsize, procseg,
7369 retval);
7370 }
7371
7372 /*
7373 * Change file flags, the deep inner layer.
7374 */
7375 static int
chflags0(vnode_t vp,struct vnode_attr * va,int (* setattr)(vnode_t,void *,vfs_context_t),void * arg,vfs_context_t ctx)7376 chflags0(vnode_t vp, struct vnode_attr *va,
7377 int (*setattr)(vnode_t, void *, vfs_context_t),
7378 void *arg, vfs_context_t ctx)
7379 {
7380 kauth_action_t action = 0;
7381 int error;
7382
7383 #if CONFIG_MACF
7384 error = mac_vnode_check_setflags(ctx, vp, va->va_flags);
7385 if (error) {
7386 goto out;
7387 }
7388 #endif
7389
7390 /* request authorisation, disregard immutability */
7391 if ((error = vnode_authattr(vp, va, &action, ctx)) != 0) {
7392 goto out;
7393 }
7394 /*
7395 * Request that the auth layer disregard those file flags it's allowed to when
7396 * authorizing this operation; we need to do this in order to be able to
7397 * clear immutable flags.
7398 */
7399 if (action && ((error = vnode_authorize(vp, NULL, action | KAUTH_VNODE_NOIMMUTABLE, ctx)) != 0)) {
7400 goto out;
7401 }
7402 error = (*setattr)(vp, arg, ctx);
7403
7404 #if CONFIG_MACF
7405 if (error == 0) {
7406 mac_vnode_notify_setflags(ctx, vp, va->va_flags);
7407 }
7408 #endif
7409
7410 out:
7411 return error;
7412 }
7413
7414 /*
7415 * Change file flags.
7416 *
7417 * NOTE: this will vnode_put() `vp'
7418 */
7419 static int
chflags1(vnode_t vp,int flags,vfs_context_t ctx)7420 chflags1(vnode_t vp, int flags, vfs_context_t ctx)
7421 {
7422 struct vnode_attr va;
7423 int error;
7424
7425 VATTR_INIT(&va);
7426 VATTR_SET(&va, va_flags, flags);
7427
7428 error = chflags0(vp, &va, (void *)vnode_setattr, &va, ctx);
7429 vnode_put(vp);
7430
7431 if ((error == 0) && !VATTR_IS_SUPPORTED(&va, va_flags)) {
7432 error = ENOTSUP;
7433 }
7434
7435 return error;
7436 }
7437
7438 /*
7439 * Change flags of a file given a path name.
7440 */
7441 /* ARGSUSED */
7442 int
chflags(__unused proc_t p,struct chflags_args * uap,__unused int32_t * retval)7443 chflags(__unused proc_t p, struct chflags_args *uap, __unused int32_t *retval)
7444 {
7445 vnode_t vp;
7446 vfs_context_t ctx = vfs_context_current();
7447 int error;
7448 struct nameidata nd;
7449 uint32_t wantparent = 0;
7450
7451 #if CONFIG_FILE_LEASES
7452 wantparent = WANTPARENT;
7453 #endif
7454
7455 AUDIT_ARG(fflags, uap->flags);
7456 NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1 | wantparent,
7457 UIO_USERSPACE, uap->path, ctx);
7458 error = namei(&nd);
7459 if (error) {
7460 return error;
7461 }
7462 vp = nd.ni_vp;
7463
7464 #if CONFIG_FILE_LEASES
7465 vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
7466 vnode_put(nd.ni_dvp);
7467 #endif
7468
7469 nameidone(&nd);
7470
7471 /* we don't vnode_put() here because chflags1 does internally */
7472 error = chflags1(vp, uap->flags, ctx);
7473
7474 return error;
7475 }
7476
7477 /*
7478 * Change flags of a file given a file descriptor.
7479 */
7480 /* ARGSUSED */
7481 int
fchflags(__unused proc_t p,struct fchflags_args * uap,__unused int32_t * retval)7482 fchflags(__unused proc_t p, struct fchflags_args *uap, __unused int32_t *retval)
7483 {
7484 vnode_t vp;
7485 int error;
7486
7487 AUDIT_ARG(fd, uap->fd);
7488 AUDIT_ARG(fflags, uap->flags);
7489 if ((error = file_vnode(uap->fd, &vp))) {
7490 return error;
7491 }
7492
7493 if ((error = vnode_getwithref(vp))) {
7494 file_drop(uap->fd);
7495 return error;
7496 }
7497
7498 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
7499
7500 #if CONFIG_FILE_LEASES
7501 vnode_breakdirlease(vp, true, O_WRONLY);
7502 #endif
7503
7504 /* we don't vnode_put() here because chflags1 does internally */
7505 error = chflags1(vp, uap->flags, vfs_context_current());
7506
7507 file_drop(uap->fd);
7508 return error;
7509 }
7510
7511 /*
7512 * Change security information on a filesystem object.
7513 *
7514 * Returns: 0 Success
7515 * EPERM Operation not permitted
7516 * vnode_authattr:??? [anything vnode_authattr can return]
7517 * vnode_authorize:??? [anything vnode_authorize can return]
7518 * vnode_setattr:??? [anything vnode_setattr can return]
7519 *
7520 * Notes: If vnode_authattr or vnode_authorize return EACCES, it will be
7521 * translated to EPERM before being returned.
7522 */
static int
chmod_vnode(vfs_context_t ctx, vnode_t vp, struct vnode_attr *vap)
{
	kauth_action_t action;
	int error;

	AUDIT_ARG(mode, vap->va_mode);
	/* XXX audit new args */

#if NAMEDSTREAMS
	/* chmod calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		return EPERM;
	}
#endif

#if CONFIG_MACF
	/* MAC pre-checks: one hook per class of attribute being changed */
	if (VATTR_IS_ACTIVE(vap, va_mode) &&
	    (error = mac_vnode_check_setmode(ctx, vp, (mode_t)vap->va_mode)) != 0) {
		return error;
	}

	if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid)) {
		if ((error = mac_vnode_check_setowner(ctx, vp,
		    VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : -1,
		    VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : -1))) {
			return error;
		}
	}

	if (VATTR_IS_ACTIVE(vap, va_acl) &&
	    (error = mac_vnode_check_setacl(ctx, vp, vap->va_acl))) {
		return error;
	}
#endif

	/* make sure that the caller is allowed to set this security information */
	if (((error = vnode_authattr(vp, vap, &action, ctx)) != 0) ||
	    ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		/* authorization failures are reported as EPERM, not EACCES */
		if (error == EACCES) {
			error = EPERM;
		}
		return error;
	}

	if ((error = vnode_setattr(vp, vap, ctx)) != 0) {
		return error;
	}

#if CONFIG_MACF
	/* MAC post-notifications, mirroring the pre-checks above */
	if (VATTR_IS_ACTIVE(vap, va_mode)) {
		mac_vnode_notify_setmode(ctx, vp, (mode_t)vap->va_mode);
	}

	if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid)) {
		mac_vnode_notify_setowner(ctx, vp,
		    VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : -1,
		    VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : -1);
	}

	if (VATTR_IS_ACTIVE(vap, va_acl)) {
		mac_vnode_notify_setacl(ctx, vp, vap->va_acl);
	}
#endif

	/* error is 0 here (vnode_setattr succeeded above) */
	return error;
}
7590
7591
7592 /*
7593 * Change mode of a file given a path name.
7594 *
7595 * Returns: 0 Success
7596 * namei:??? [anything namei can return]
7597 * chmod_vnode:??? [anything chmod_vnode can return]
7598 */
7599 static int
chmodat(vfs_context_t ctx,user_addr_t path,struct vnode_attr * vap,int fd,int flag,enum uio_seg segflg)7600 chmodat(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap,
7601 int fd, int flag, enum uio_seg segflg)
7602 {
7603 struct nameidata nd;
7604 int follow, error;
7605 uint32_t wantparent = 0;
7606
7607 #if CONFIG_FILE_LEASES
7608 wantparent = WANTPARENT;
7609 #endif
7610
7611 follow = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) ? NOFOLLOW : FOLLOW;
7612 NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1 | wantparent,
7613 segflg, path, ctx);
7614 if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
7615 nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
7616 }
7617 if ((error = nameiat(&nd, fd))) {
7618 return error;
7619 }
7620
7621 #if CONFIG_FILE_LEASES
7622 vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
7623 vnode_put(nd.ni_dvp);
7624 #endif
7625
7626 error = chmod_vnode(ctx, nd.ni_vp, vap);
7627 vnode_put(nd.ni_vp);
7628 nameidone(&nd);
7629 return error;
7630 }
7631
7632 static int
chmod_extended_init(struct vnode_attr * pva,kauth_filesec_t * pxsecdst,int mode,uid_t uid,gid_t gid,user_addr_t xsecurity)7633 chmod_extended_init(struct vnode_attr *pva, kauth_filesec_t *pxsecdst, int mode, uid_t uid,
7634 gid_t gid, user_addr_t xsecurity)
7635 {
7636 int error;
7637
7638 VATTR_INIT(pva);
7639
7640 if (mode != -1) {
7641 VATTR_SET(pva, va_mode, mode & ALLPERMS);
7642 } else {
7643 pva->va_mode = 0;
7644 }
7645
7646 if (uid != KAUTH_UID_NONE) {
7647 VATTR_SET(pva, va_uid, uid);
7648 }
7649
7650 if (gid != KAUTH_GID_NONE) {
7651 VATTR_SET(pva, va_gid, gid);
7652 }
7653
7654 *pxsecdst = NULL;
7655 switch (xsecurity) {
7656 case USER_ADDR_NULL:
7657 break;
7658
7659 case CAST_USER_ADDR_T((void *)1): /* _FILESEC_REMOVE_ACL */
7660 VATTR_SET(pva, va_acl, NULL);
7661 break;
7662
7663 default:
7664 if ((error = kauth_copyinfilesec(xsecurity, pxsecdst)) != 0) {
7665 return error;
7666 }
7667
7668 VATTR_SET(pva, va_acl, &(*pxsecdst)->fsec_acl);
7669 pva->va_vaflags |= VA_FILESEC_ACL;
7670 KAUTH_DEBUG("CHMOD - setting ACL with %d entries", pva->va_acl->acl_entrycount);
7671 break;
7672 }
7673
7674 return 0;
7675 }
7676
7677 /*
7678 * chmod_extended: Change the mode of a file given a path name; with extended
7679 * argument list (including extended security (ACL)).
7680 *
7681 * Parameters: p Process requesting the open
7682 * uap User argument descriptor (see below)
7683 * retval (ignored)
7684 *
7685 * Indirect: uap->path Path to object (same as 'chmod')
7686 * uap->uid UID to set
7687 * uap->gid GID to set
7688 * uap->mode File mode to set (same as 'chmod')
7689 * uap->xsecurity ACL to set (or delete)
7690 *
7691 * Returns: 0 Success
7692 * !0 errno value
7693 *
7694 * Notes: The kauth_filesec_t in 'va', if any, is in host byte order.
7695 *
7696 * XXX: We should enummerate the possible errno values here, and where
7697 * in the code they originated.
7698 */
7699 int
chmod_extended(__unused proc_t p,struct chmod_extended_args * uap,__unused int32_t * retval)7700 chmod_extended(__unused proc_t p, struct chmod_extended_args *uap, __unused int32_t *retval)
7701 {
7702 int error;
7703 struct vnode_attr va;
7704 kauth_filesec_t xsecdst = NULL;
7705
7706 AUDIT_ARG(owner, uap->uid, uap->gid);
7707
7708 error = chmod_extended_init(&va, &xsecdst, uap->mode, uap->uid,
7709 uap->gid, uap->xsecurity);
7710
7711 if (error) {
7712 return error;
7713 }
7714
7715 error = chmodat(vfs_context_current(), uap->path, &va, AT_FDCWD, 0,
7716 UIO_USERSPACE);
7717
7718 if (xsecdst != NULL) {
7719 kauth_filesec_free(xsecdst);
7720 }
7721 return error;
7722 }
7723
7724 /*
7725 * Returns: 0 Success
7726 * chmodat:??? [anything chmodat can return]
7727 */
7728 static int
fchmodat_internal(vfs_context_t ctx,user_addr_t path,int mode,int fd,int flag,enum uio_seg segflg)7729 fchmodat_internal(vfs_context_t ctx, user_addr_t path, int mode, int fd,
7730 int flag, enum uio_seg segflg)
7731 {
7732 struct vnode_attr va;
7733
7734 VATTR_INIT(&va);
7735 VATTR_SET(&va, va_mode, mode & ALLPERMS);
7736
7737 return chmodat(ctx, path, &va, fd, flag, segflg);
7738 }
7739
7740 int
chmod(__unused proc_t p,struct chmod_args * uap,__unused int32_t * retval)7741 chmod(__unused proc_t p, struct chmod_args *uap, __unused int32_t *retval)
7742 {
7743 return fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
7744 AT_FDCWD, 0, UIO_USERSPACE);
7745 }
7746
7747 int
fchmodat(__unused proc_t p,struct fchmodat_args * uap,__unused int32_t * retval)7748 fchmodat(__unused proc_t p, struct fchmodat_args *uap, __unused int32_t *retval)
7749 {
7750 if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) {
7751 return EINVAL;
7752 }
7753
7754 return fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
7755 uap->fd, uap->flag, UIO_USERSPACE);
7756 }
7757
7758 /*
7759 * Change mode of a file given a file descriptor.
7760 */
static int
fchmod1(__unused proc_t p, int fd, struct vnode_attr *vap)
{
	vnode_t vp;
	int error;

	/* Apply the prepared attributes in *vap to the vnode behind fd. */
	AUDIT_ARG(fd, fd);

	/* Resolve fd to its vnode; pairs with file_drop() below. */
	if ((error = file_vnode(fd, &vp)) != 0) {
		return error;
	}
	/* Take an iocount before operating on the vnode. */
	if ((error = vnode_getwithref(vp)) != 0) {
		file_drop(fd);
		return error;
	}
	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

#if CONFIG_FILE_LEASES
	/* Break any parent-directory lease before the metadata write. */
	vnode_breakdirlease(vp, true, O_WRONLY);
#endif

	error = chmod_vnode(vfs_context_current(), vp, vap);
	/* Release in reverse order of acquisition: iocount, then fd reference. */
	(void)vnode_put(vp);
	file_drop(fd);

	return error;
}
7788
7789 /*
7790 * fchmod_extended: Change mode of a file given a file descriptor; with
7791 * extended argument list (including extended security (ACL)).
7792 *
7793 * Parameters: p Process requesting to change file mode
7794 * uap User argument descriptor (see below)
7795 * retval (ignored)
7796 *
7797 * Indirect: uap->mode File mode to set (same as 'chmod')
7798 * uap->uid UID to set
7799 * uap->gid GID to set
7800 * uap->xsecurity ACL to set (or delete)
7801 * uap->fd File descriptor of file to change mode
7802 *
7803 * Returns: 0 Success
7804 * !0 errno value
7805 *
7806 */
7807 int
fchmod_extended(proc_t p,struct fchmod_extended_args * uap,__unused int32_t * retval)7808 fchmod_extended(proc_t p, struct fchmod_extended_args *uap, __unused int32_t *retval)
7809 {
7810 int error;
7811 struct vnode_attr va;
7812 kauth_filesec_t xsecdst = NULL;
7813
7814 AUDIT_ARG(owner, uap->uid, uap->gid);
7815
7816 error = chmod_extended_init(&va, &xsecdst, uap->mode, uap->uid,
7817 uap->gid, uap->xsecurity);
7818
7819 if (error) {
7820 return error;
7821 }
7822
7823 error = fchmod1(p, uap->fd, &va);
7824
7825 if (xsecdst != NULL) {
7826 kauth_filesec_free(xsecdst);
7827 }
7828 return error;
7829 }
7830
7831 int
fchmod(proc_t p,struct fchmod_args * uap,__unused int32_t * retval)7832 fchmod(proc_t p, struct fchmod_args *uap, __unused int32_t *retval)
7833 {
7834 struct vnode_attr va;
7835
7836 VATTR_INIT(&va);
7837 VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
7838
7839 return fchmod1(p, uap->fd, &va);
7840 }
7841
static int
vn_chown_internal(__unused vfs_context_t ctx, vnode_t vp, uid_t uid, gid_t gid)
{
	struct vnode_attr va;
	kauth_action_t action;
	int error;

	/*
	 * Common ownership-change path for chown()/lchown()/fchown()/fchownat().
	 * VNOVAL for either id means "leave that id unchanged".  The caller
	 * supplies an iocounted vnode and is responsible for releasing it.
	 */
	VATTR_INIT(&va);
	if (uid != (uid_t)VNOVAL) {
		VATTR_SET(&va, va_uid, uid);
	}
	if (gid != (gid_t)VNOVAL) {
		VATTR_SET(&va, va_gid, gid);
	}

#if NAMEDSTREAMS
	/* chown calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		error = EPERM;
		goto out;
	}
#endif

#if CONFIG_MACF
	/* MAC policies get the first chance to veto the change. */
	error = mac_vnode_check_setowner(ctx, vp, uid, gid);
	if (error) {
		goto out;
	}
#endif

	/* preflight and authorize attribute changes */
	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
		goto out;
	}
	if (action && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		/*
		 * EACCES is only allowed from namei(); permissions failure should
		 * return EPERM, so we need to translate the error code.
		 */
		if (error == EACCES) {
			error = EPERM;
		}

		goto out;
	}

#if CONFIG_FILE_LEASES
	/* Break any parent-directory lease before the metadata write. */
	vnode_breakdirlease(vp, true, O_WRONLY);
#endif

	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	/* Notify MAC policies only after a successful change. */
	if (error == 0) {
		mac_vnode_notify_setowner(ctx, vp, uid, gid);
	}
#endif

out:
	return error;
}
7903
7904 /*
7905 * Set ownership given a path name.
7906 */
7907 /* ARGSUSED */
static int
fchownat_internal(vfs_context_t ctx, int fd, user_addr_t path, uid_t uid,
    gid_t gid, int flag, enum uio_seg segflg)
{
	vnode_t vp;
	int error;
	struct nameidata nd;
	int follow;

	AUDIT_ARG(owner, uid, gid);

	/* Either "no follow" flag suppresses following a trailing symlink. */
	follow = (flag & (AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) ? NOFOLLOW : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1, segflg, path, ctx);
	if (flag & AT_SYMLINK_NOFOLLOW_ANY) {
		/* Refuse symlinks anywhere in the path, not just at the leaf. */
		nd.ni_flag |= NAMEI_NOFOLLOW_ANY;
	}

	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}

	vp = nd.ni_vp;
	error = vn_chown_internal(ctx, vp, uid, gid);

	/* Tear down namei state before dropping the vnode iocount. */
	nameidone(&nd);
	vnode_put(vp);
	return error;
}
7937
7938 int
chown(__unused proc_t p,struct chown_args * uap,__unused int32_t * retval)7939 chown(__unused proc_t p, struct chown_args *uap, __unused int32_t *retval)
7940 {
7941 return fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
7942 uap->uid, uap->gid, 0, UIO_USERSPACE);
7943 }
7944
7945 int
lchown(__unused proc_t p,struct lchown_args * uap,__unused int32_t * retval)7946 lchown(__unused proc_t p, struct lchown_args *uap, __unused int32_t *retval)
7947 {
7948 return fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
7949 uap->owner, uap->group, AT_SYMLINK_NOFOLLOW, UIO_USERSPACE);
7950 }
7951
7952 int
fchownat(__unused proc_t p,struct fchownat_args * uap,__unused int32_t * retval)7953 fchownat(__unused proc_t p, struct fchownat_args *uap, __unused int32_t *retval)
7954 {
7955 if (uap->flag & ~AT_SYMLINK_NOFOLLOW) {
7956 return EINVAL;
7957 }
7958
7959 return fchownat_internal(vfs_context_current(), uap->fd, uap->path,
7960 uap->uid, uap->gid, uap->flag, UIO_USERSPACE);
7961 }
7962
7963 /*
7964 * Set ownership given a file descriptor.
7965 */
7966 /* ARGSUSED */
7967 int
fchown(__unused proc_t p,struct fchown_args * uap,__unused int32_t * retval)7968 fchown(__unused proc_t p, struct fchown_args *uap, __unused int32_t *retval)
7969 {
7970 vfs_context_t ctx = vfs_context_current();
7971 vnode_t vp;
7972 int error;
7973
7974 AUDIT_ARG(owner, uap->uid, uap->gid);
7975 AUDIT_ARG(fd, uap->fd);
7976
7977 if ((error = file_vnode(uap->fd, &vp))) {
7978 return error;
7979 }
7980
7981 if ((error = vnode_getwithref(vp))) {
7982 file_drop(uap->fd);
7983 return error;
7984 }
7985 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
7986
7987 error = vn_chown_internal(ctx, vp, uap->uid, uap->gid);
7988
7989 (void)vnode_put(vp);
7990 file_drop(uap->fd);
7991 return error;
7992 }
7993
7994 static int
getutimes(user_addr_t usrtvp,struct timespec * tsp)7995 getutimes(user_addr_t usrtvp, struct timespec *tsp)
7996 {
7997 int error;
7998
7999 if (usrtvp == USER_ADDR_NULL) {
8000 struct timeval old_tv;
8001 /* XXX Y2038 bug because of microtime argument */
8002 microtime(&old_tv);
8003 TIMEVAL_TO_TIMESPEC(&old_tv, &tsp[0]);
8004 tsp[1] = tsp[0];
8005 } else {
8006 if (IS_64BIT_PROCESS(current_proc())) {
8007 struct user64_timeval tv[2];
8008 error = copyin(usrtvp, (void *)tv, sizeof(tv));
8009 if (error) {
8010 return error;
8011 }
8012 TIMEVAL64_TO_TIMESPEC(&tv[0], &tsp[0]);
8013 TIMEVAL64_TO_TIMESPEC(&tv[1], &tsp[1]);
8014 } else {
8015 struct user32_timeval tv[2];
8016 error = copyin(usrtvp, (void *)tv, sizeof(tv));
8017 if (error) {
8018 return error;
8019 }
8020 TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]);
8021 TIMEVAL_TO_TIMESPEC(&tv[1], &tsp[1]);
8022 }
8023 }
8024 return 0;
8025 }
8026
static int
setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts,
    int nullflag)
{
	int error;
	struct vnode_attr va;
	kauth_action_t action;

	/*
	 * Apply access (ts[0]) and modification (ts[1]) times to an iocounted
	 * vnode.  'nullflag' is set when the caller passed a NULL timeval
	 * pointer ("set to now"); it adds VA_UTIMES_NULL to the request and
	 * suppresses the EACCES->EPERM translation below.
	 */
	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	VATTR_INIT(&va);
	VATTR_SET(&va, va_access_time, ts[0]);
	VATTR_SET(&va, va_modify_time, ts[1]);
	if (nullflag) {
		va.va_vaflags |= VA_UTIMES_NULL;
	}

#if NAMEDSTREAMS
	/* utimes calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		error = EPERM;
		goto out;
	}
#endif

#if CONFIG_MACF
	/* MAC policies get the first chance to refuse the change. */
	error = mac_vnode_check_setutimes(ctx, vp, ts[0], ts[1]);
	if (error) {
		goto out;
	}
#endif
	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
		/* translate EACCES to EPERM when explicit times were supplied */
		if (!nullflag && error == EACCES) {
			error = EPERM;
		}
		goto out;
	}

	/* since we may not need to auth anything, check here */
	if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		/* same EACCES->EPERM translation as above */
		if (!nullflag && error == EACCES) {
			error = EPERM;
		}
		goto out;
	}
	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	/* Notify MAC policies only on success. */
	if (error == 0) {
		mac_vnode_notify_setutimes(ctx, vp, ts[0], ts[1]);
	}
#endif

out:
	return error;
}
8083
8084 /*
8085 * Set the access and modification times of a file.
8086 */
8087 /* ARGSUSED */
int
utimes(__unused proc_t p, struct utimes_args *uap, __unused int32_t *retval)
{
	struct timespec ts[2];
	user_addr_t usrtvp;
	int error;
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();
	uint32_t wantparent = 0;

#if CONFIG_FILE_LEASES
	/* Ask namei for the parent too, so its lease can be broken below. */
	wantparent = WANTPARENT;
#endif

	/*
	 * AUDIT: Needed to change the order of operations to do the
	 * name lookup first because auditing wants the path.
	 */
	NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1 | wantparent,
	    UIO_USERSPACE, uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}

	/*
	 * Fetch the user-supplied time. If usrtvp is USER_ADDR_NULL, we fetch
	 * the current time instead.
	 */
	usrtvp = uap->tptr;
	if ((error = getutimes(usrtvp, ts)) != 0) {
		goto out;
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
#endif

	/* nullflag is true when the caller asked for "now". */
	error = setutimes(ctx, nd.ni_vp, ts, usrtvp == USER_ADDR_NULL);

out:
#if CONFIG_FILE_LEASES
	/* WANTPARENT gave us an iocount on the parent; release it. */
	vnode_put(nd.ni_dvp);
#endif
	nameidone(&nd);
	vnode_put(nd.ni_vp);
	return error;
}
8136
8137 /*
8138 * Set the access and modification times of a file.
8139 */
8140 /* ARGSUSED */
int
futimes(__unused proc_t p, struct futimes_args *uap, __unused int32_t *retval)
{
	struct timespec ts[2];
	vnode_t vp;
	user_addr_t usrtvp;
	int error;

	AUDIT_ARG(fd, uap->fd);
	/* Resolve the times first so a bad tptr fails before any fd work. */
	usrtvp = uap->tptr;
	if ((error = getutimes(usrtvp, ts)) != 0) {
		return error;
	}
	/* Resolve fd to a vnode and take an iocount; release both on exit. */
	if ((error = file_vnode(uap->fd, &vp)) != 0) {
		return error;
	}
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

#if CONFIG_FILE_LEASES
	/* Break any parent-directory lease before the metadata write. */
	vnode_breakdirlease(vp, true, O_WRONLY);
#endif

	/* nullflag is true when the caller asked for "now". */
	error = setutimes(vfs_context_current(), vp, ts, usrtvp == 0);

	vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
8172
8173 static int
truncate_validate_common(proc_t p,off_t length)8174 truncate_validate_common(proc_t p, off_t length)
8175 {
8176 rlim_t fsize_limit;
8177
8178 if (length < 0) {
8179 return EINVAL;
8180 }
8181
8182 fsize_limit = proc_limitgetcur(p, RLIMIT_FSIZE);
8183 if ((rlim_t)length > fsize_limit) {
8184 psignal(p, SIGXFSZ);
8185 return EFBIG;
8186 }
8187
8188 return 0;
8189 }
8190
static int
truncate_internal(vnode_t vp, off_t length, kauth_cred_t cred,
    vfs_context_t ctx, boolean_t need_auth)
{
	struct vnode_attr va;
	kauth_action_t action;
	int error;

	/*
	 * Common truncate path: set va_data_size on an iocounted vnode.
	 * 'cred' is passed through to the MAC checks; 'need_auth' is false
	 * when the caller (ftruncate) relies on the authorization performed
	 * at open time.
	 */
	VATTR_INIT(&va);
	VATTR_SET(&va, va_data_size, length);

#if CONFIG_MACF
	error = mac_vnode_check_truncate(ctx, cred, vp);
	if (error) {
		return error;
	}
#endif

	/*
	 * If we reached here from `ftruncate` then we already did an effective
	 * `vnode_authorize` upon open. We honour the result from then.
	 */
	if (need_auth) {
		if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
			return error;
		}

		/* authattr may determine that no rights need checking */
		if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
			return error;
		}
	}

#if CONFIG_FILE_LEASES
	/* Check if there is a lease placed on the parent directory. */
	vnode_breakdirlease(vp, true, O_WRONLY);

	/* Now check if there is a lease placed on the file itself. */
	(void)vnode_breaklease(vp, O_WRONLY, ctx);
#endif

	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	/* Notify MAC policies only on success. */
	if (error == 0) {
		mac_vnode_notify_truncate(ctx, cred, vp);
	}
#endif

	return error;
}
8241
8242 /*
8243 * Truncate a file given its path name.
8244 */
8245 /* ARGSUSED */
8246 int
truncate(proc_t p,struct truncate_args * uap,__unused int32_t * retval)8247 truncate(proc_t p, struct truncate_args *uap, __unused int32_t *retval)
8248 {
8249 vfs_context_t ctx = vfs_context_current();
8250 vnode_t vp;
8251 int error;
8252 struct nameidata nd;
8253
8254 if ((error = truncate_validate_common(p, uap->length))) {
8255 return error;
8256 }
8257
8258 NDINIT(&nd, LOOKUP, OP_TRUNCATE, FOLLOW | AUDITVNPATH1,
8259 UIO_USERSPACE, uap->path, ctx);
8260
8261 if ((error = namei(&nd))) {
8262 return error;
8263 }
8264
8265 vp = nd.ni_vp;
8266 nameidone(&nd);
8267
8268 error = truncate_internal(vp, uap->length, NOCRED, ctx, true);
8269 vnode_put(vp);
8270
8271 return error;
8272 }
8273
8274 /*
8275 * Truncate a file given a file descriptor.
8276 */
8277 /* ARGSUSED */
int
ftruncate(proc_t p, struct ftruncate_args *uap, int32_t *retval)
{
	vnode_t vp;
	struct fileproc *fp;
	int error;

	AUDIT_ARG(fd, uap->fd);

	/* Reject negative lengths / RLIMIT_FSIZE violations up front. */
	if ((error = truncate_validate_common(p, uap->length))) {
		return error;
	}

	/* Hold the fileproc until the out: label. */
	if ((error = fp_lookup(p, uap->fd, &fp, 0))) {
		return error;
	}

	/* Only vnodes and POSIX shared memory objects can be truncated by fd. */
	switch (FILEGLOB_DTYPE(fp->fp_glob)) {
	case DTYPE_PSXSHM:
		error = pshm_truncate(p, fp, uap->fd, uap->length, retval);
		goto out;
	case DTYPE_VNODE:
		break;
	default:
		error = EINVAL;
		goto out;
	}

	vp = (vnode_t)fp_get_data(fp);

	/* The descriptor must have been opened for writing. */
	if ((fp->fp_glob->fg_flag & FWRITE) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EINVAL;
		goto out;
	}

	if ((error = vnode_getwithref(vp)) != 0) {
		goto out;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	/* need_auth == false: write access was effectively authorized at open. */
	error = truncate_internal(vp, uap->length, fp->fp_glob->fg_cred,
	    vfs_context_current(), false);
	vnode_put(vp);

out:
	file_drop(uap->fd);
	return error;
}
8328
8329
8330 /*
8331 * Sync an open file with synchronized I/O _file_ integrity completion
8332 */
8333 /* ARGSUSED */
8334 int
fsync(proc_t p,struct fsync_args * uap,__unused int32_t * retval)8335 fsync(proc_t p, struct fsync_args *uap, __unused int32_t *retval)
8336 {
8337 __pthread_testcancel(1);
8338 return fsync_common(p, uap, MNT_WAIT);
8339 }
8340
8341
8342 /*
8343 * Sync an open file with synchronized I/O _file_ integrity completion
8344 *
8345 * Notes: This is a legacy support function that does not test for
8346 * thread cancellation points.
8347 */
8348 /* ARGSUSED */
8349 int
fsync_nocancel(proc_t p,struct fsync_nocancel_args * uap,__unused int32_t * retval)8350 fsync_nocancel(proc_t p, struct fsync_nocancel_args *uap, __unused int32_t *retval)
8351 {
8352 return fsync_common(p, (struct fsync_args *)uap, MNT_WAIT);
8353 }
8354
8355
8356 /*
8357 * Sync an open file with synchronized I/O _data_ integrity completion
8358 */
8359 /* ARGSUSED */
8360 int
fdatasync(proc_t p,struct fdatasync_args * uap,__unused int32_t * retval)8361 fdatasync(proc_t p, struct fdatasync_args *uap, __unused int32_t *retval)
8362 {
8363 __pthread_testcancel(1);
8364 return fsync_common(p, (struct fsync_args *)uap, MNT_DWAIT);
8365 }
8366
8367
8368 /*
8369 * fsync_common
8370 *
8371 * Common fsync code to support both synchronized I/O file integrity completion
8372 * (normal fsync) and synchronized I/O data integrity completion (fdatasync).
8373 *
8374 * If 'flags' is MNT_DWAIT, the caller is requesting data integrity, which
8375 * will only guarantee that the file data contents are retrievable. If
 * 'flags' is MNT_WAIT, the caller is requesting file integrity, which also
8377 * includes additional metadata unnecessary for retrieving the file data
8378 * contents, such as atime, mtime, ctime, etc., also be committed to stable
8379 * storage.
8380 *
8381 * Parameters: p The process
8382 * uap->fd The descriptor to synchronize
8383 * flags The data integrity flags
8384 *
8385 * Returns: int Success
8386 * fp_getfvp:EBADF Bad file descriptor
8387 * fp_getfvp:ENOTSUP fd does not refer to a vnode
8388 * VNOP_FSYNC:??? unspecified
8389 *
8390 * Notes: We use struct fsync_args because it is a short name, and all
8391 * caller argument structures are otherwise identical.
8392 */
static int
fsync_common(proc_t p, struct fsync_args *uap, int flags)
{
	vnode_t vp;
	struct fileproc *fp;
	vfs_context_t ctx = vfs_context_current();
	int error;

	/*
	 * Shared implementation for fsync()/fsync_nocancel()/fdatasync().
	 * 'flags' is MNT_WAIT for full file integrity or MNT_DWAIT for data
	 * integrity only.
	 */
	AUDIT_ARG(fd, uap->fd);

	if ((error = fp_getfvp(p, uap->fd, &fp, &vp))) {
		return error;
	}
	/* Take an iocount before issuing the fsync. */
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	error = VNOP_FSYNC(vp, flags, ctx);

#if NAMEDRSRCFORK
	/* Sync resource fork shadow file if necessary. */
	if ((error == 0) &&
	    (vp->v_flag & VISNAMEDSTREAM) &&
	    (vp->v_parent != NULLVP) &&
	    vnode_isshadow(vp) &&
	    (fp->fp_glob->fg_flag & FWASWRITTEN)) {
		/* best effort: the flush result is intentionally ignored */
		(void) vnode_flushnamedstream(vp->v_parent, vp, ctx);
	}
#endif

	(void)vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
8430
8431 /*
8432 * Duplicate files. Source must be a file, target must be a file or
8433 * must not exist.
8434 *
8435 * XXX Copyfile authorisation checking is woefully inadequate, and will not
8436 * perform inheritance correctly.
8437 */
8438 /* ARGSUSED */
int
copyfile(__unused proc_t p, struct copyfile_args *uap, __unused int32_t *retval)
{
	vnode_t tvp, fvp, tdvp, sdvp;
	struct nameidata fromnd, tond;
	int error;
	vfs_context_t ctx = vfs_context_current();

	/* Check that the flags are valid. */
	if (uap->flags & ~CPF_MASK) {
		return EINVAL;
	}

	/* Look up the source. */
	NDINIT(&fromnd, LOOKUP, OP_COPYFILE, AUDITVNPATH1,
	    UIO_USERSPACE, uap->from, ctx);
	if ((error = namei(&fromnd))) {
		return error;
	}
	fvp = fromnd.ni_vp;

	/* Look up (or prepare to create) the target; SAVESTART keeps ni_startdir. */
	NDINIT(&tond, CREATE, OP_LINK,
	    LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART | AUDITVNPATH2 | CN_NBMOUNTLOOK,
	    UIO_USERSPACE, uap->to, ctx);
	if ((error = namei(&tond))) {
		goto out1;
	}
	tdvp = tond.ni_dvp;
	tvp = tond.ni_vp;

	/* An existing target is only replaced when CPF_OVERWRITE is set. */
	if (tvp != NULL) {
		if (!(uap->flags & CPF_OVERWRITE)) {
			error = EEXIST;
			goto out;
		}
	}

	/* Directories may be neither source nor target. */
	if (fvp->v_type == VDIR || (tvp && tvp->v_type == VDIR)) {
		error = EISDIR;
		goto out;
	}

	/* Sockets cannot be copied, except fdesc nodes (VT_FDESC). */
	if (fvp->v_type == VSOCK && fvp->v_tag != VT_FDESC) {
		error = EOPNOTSUPP;
		goto out;
	}

#if CONFIG_MACF
	if ((error = mac_vnode_check_copyfile(ctx, tdvp, tvp, fvp, &tond.ni_cnd, (mode_t)uap->mode, uap->flags)) != 0) {
		goto out;
	}
#endif /* CONFIG_MACF */

	/* Need read on the source, delete on an existing target, add on the parent. */
	if ((error = vnode_authorize(fvp, NULL, KAUTH_VNODE_READ_DATA, ctx)) != 0) {
		goto out;
	}
	if (tvp) {
		if ((error = vnode_authorize(tvp, tdvp, KAUTH_VNODE_DELETE, ctx)) != 0) {
			goto out;
		}
	}
	if ((error = vnode_authorize(tdvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
		goto out;
	}

	/* Copying a file over its own parent directory is rejected. */
	if (fvp == tdvp) {
		error = EINVAL;
	}
	/*
	 * If source is the same as the destination (that is the
	 * same inode number) then there is nothing to do.
	 * (fixed to have POSIX semantics - CSM 3/2/98)
	 */
	if (fvp == tvp) {
		/* -1 is an internal marker meaning "succeed without copying" */
		error = -1;
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(tdvp, false, O_WRONLY);
#endif

	if (!error) {
		error = VNOP_COPYFILE(fvp, tdvp, tvp, &tond.ni_cnd, uap->mode, uap->flags, ctx);
	}
out:
	sdvp = tond.ni_startdir;
	/*
	 * nameidone has to happen before we vnode_put(tdvp)
	 * since it may need to release the fs_nodelock on the tdvp
	 */
	nameidone(&tond);

	if (tvp) {
		vnode_put(tvp);
	}
	vnode_put(tdvp);
	/* Drop the starting-directory reference retained by SAVESTART. */
	vnode_put(sdvp);
out1:
	vnode_put(fvp);

	nameidone(&fromnd);

	/* The -1 marker (source == target) is reported as success. */
	if (error == -1) {
		return 0;
	}
	return error;
}
8545
8546 #define CLONE_SNAPSHOT_FALLBACKS_ENABLED 1
8547
8548 /*
8549 * Helper function for doing clones. The caller is expected to provide an
8550 * iocounted source vnode and release it.
8551 */
static int
clonefile_internal(vnode_t fvp, boolean_t data_read_authorised, int dst_dirfd,
    user_addr_t dst, uint32_t flags, vfs_context_t ctx)
{
	vnode_t tvp, tdvp;
	struct nameidata tond;
	int error;
	int follow;
	boolean_t free_src_acl;
	boolean_t attr_cleanup;
	enum vtype v_type;
	kauth_action_t action;
	struct componentname *cnp;
	uint32_t defaulted = 0;
	struct vnode_attr va;
	struct vnode_attr nva;
	uint32_t vnop_flags;

	/* Only regular files, symlinks and plain (non-root, non-mounted) dirs clone. */
	v_type = vnode_vtype(fvp);
	switch (v_type) {
	case VLNK:
		/* FALLTHRU */
	case VREG:
		action = KAUTH_VNODE_ADD_FILE;
		break;
	case VDIR:
		if (vnode_isvroot(fvp) || vnode_ismount(fvp) ||
		    fvp->v_mountedhere) {
			return EINVAL;
		}
		action = KAUTH_VNODE_ADD_SUBDIRECTORY;
		break;
	default:
		return EINVAL;
	}

	AUDIT_ARG(fd2, dst_dirfd);
	AUDIT_ARG(value32, flags);

	/* Look up the destination; it must not already exist. */
	follow = (flags & CLONE_NOFOLLOW) ? NOFOLLOW : FOLLOW;
	NDINIT(&tond, CREATE, OP_LINK, follow | WANTPARENT | AUDITVNPATH2,
	    UIO_USERSPACE, dst, ctx);
	if ((error = nameiat(&tond, dst_dirfd))) {
		return error;
	}
	cnp = &tond.ni_cnd;
	tdvp = tond.ni_dvp;
	tvp = tond.ni_vp;

	free_src_acl = FALSE;
	attr_cleanup = FALSE;

	if (tvp != NULL) {
		error = EEXIST;
		goto out;
	}

	/* Cloning only works within a single mount. */
	if (vnode_mount(tdvp) != vnode_mount(fvp)) {
		error = EXDEV;
		goto out;
	}

#if CONFIG_MACF
	if ((error = mac_vnode_check_clone(ctx, tdvp, fvp, cnp))) {
		goto out;
	}
#endif
	/* Authorize adding the new entry to the destination directory. */
	if ((error = vnode_authorize(tdvp, NULL, action, ctx))) {
		goto out;
	}

	/* READ_DATA may already have been vouched for by the caller (fd open). */
	action = KAUTH_VNODE_GENERIC_READ_BITS;
	if (data_read_authorised) {
		action &= ~KAUTH_VNODE_READ_DATA;
	}
	if ((error = vnode_authorize(fvp, NULL, action, ctx))) {
		goto out;
	}

	/*
	 * certain attributes may need to be changed from the source, we ask for
	 * those here with the exception of source file's ACLs unless the CLONE_ACL
	 * flag is specified. By default, the clone file will inherit the target
	 * directory's ACLs unless the CLONE_ACL flag is specified then it
	 * will inherit the source file's ACLs instead.
	 */
	VATTR_INIT(&va);
	VATTR_WANTED(&va, va_uid);
	VATTR_WANTED(&va, va_gid);
	VATTR_WANTED(&va, va_mode);
	VATTR_WANTED(&va, va_flags);
	if (flags & CLONE_ACL) {
		VATTR_WANTED(&va, va_acl);
	}

	if ((error = vnode_getattr(fvp, &va, ctx)) != 0) {
		goto out;
	}

	VATTR_INIT(&nva);
	VATTR_SET(&nva, va_type, v_type);
	if (VATTR_IS_SUPPORTED(&va, va_acl) && va.va_acl != NULL) {
		/* the returned ACL must be freed at out: (free_src_acl) */
		VATTR_SET(&nva, va_acl, va.va_acl);
		free_src_acl = TRUE;
	}

	/* Handle ACL inheritance, initialize vap. */
	if (v_type == VLNK) {
		error = vnode_authattr_new(tdvp, &nva, 0, ctx);
	} else {
		/* vn_attribute_prepare requires a matching cleanup at out: */
		error = vn_attribute_prepare(tdvp, &nva, &defaulted, ctx);
		if (error) {
			goto out;
		}
		attr_cleanup = TRUE;
	}

	vnop_flags = VNODE_CLONEFILE_DEFAULT;
	/*
	 * We've got initial values for all security parameters,
	 * If we are superuser, then we can change owners to be the
	 * same as the source. Both superuser and the owner have default
	 * WRITE_SECURITY privileges so all other fields can be taken
	 * from source as well.
	 */
	if (!(flags & CLONE_NOOWNERCOPY) && vfs_context_issuser(ctx)) {
		if (VATTR_IS_SUPPORTED(&va, va_uid)) {
			VATTR_SET(&nva, va_uid, va.va_uid);
		}
		if (VATTR_IS_SUPPORTED(&va, va_gid)) {
			VATTR_SET(&nva, va_gid, va.va_gid);
		}
	} else {
		vnop_flags |= VNODE_CLONEFILE_NOOWNERCOPY;
	}

	if (VATTR_IS_SUPPORTED(&va, va_mode)) {
		VATTR_SET(&nva, va_mode, va.va_mode);
	}
	if (VATTR_IS_SUPPORTED(&va, va_flags)) {
		VATTR_SET(&nva, va_flags,
		    ((va.va_flags & ~(UF_DATAVAULT | SF_RESTRICTED)) | /* Turn off from source */
		    (nva.va_flags & (UF_DATAVAULT | SF_RESTRICTED))));
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(tdvp, false, O_WRONLY);
#endif

	error = VNOP_CLONEFILE(fvp, tdvp, &tvp, cnp, &nva, vnop_flags, ctx);

	if (!error && tvp) {
		int update_flags = 0;
#if CONFIG_FSE
		int fsevent;
#endif /* CONFIG_FSE */

		/*
		 * If some of the requested attributes weren't handled by the
		 * VNOP, use our fallback code.
		 */
		if (!VATTR_ALL_SUPPORTED(&nva)) {
			(void)vnode_setattr_fallback(tvp, &nva, ctx);
		}

#if CONFIG_MACF
		(void)vnode_label(vnode_mount(tvp), tdvp, tvp, cnp,
		    VNODE_LABEL_CREATE, ctx);
#endif

		// Make sure the name & parent pointers are hooked up
		if (tvp->v_name == NULL) {
			update_flags |= VNODE_UPDATE_NAME;
		}
		if (tvp->v_parent == NULLVP) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		if (update_flags) {
			(void)vnode_update_identity(tvp, tdvp, cnp->cn_nameptr,
			    cnp->cn_namelen, cnp->cn_hash, update_flags);
		}

#if CONFIG_FSE
		switch (vnode_vtype(tvp)) {
		case VLNK:
			/* FALLTHRU */
		case VREG:
			fsevent = FSE_CREATE_FILE;
			break;
		case VDIR:
			fsevent = FSE_CREATE_DIR;
			break;
		default:
			goto out;
		}

		if (need_fsevent(fsevent, tvp)) {
			/*
			 * The following is a sequence of three explicit events.
			 * A pair of FSE_CLONE events representing the source and destination
			 * followed by an FSE_CREATE_[FILE | DIR] for the destination.
			 * fseventsd may coalesce the destination clone and create events
			 * into a single event resulting in the following sequence for a client
			 * FSE_CLONE (src)
			 * FSE_CLONE | FSE_CREATE (dst)
			 */
			add_fsevent(FSE_CLONE, ctx, FSE_ARG_VNODE, fvp, FSE_ARG_VNODE, tvp,
			    FSE_ARG_DONE);
			add_fsevent(fsevent, ctx, FSE_ARG_VNODE, tvp,
			    FSE_ARG_DONE);
		}
#endif /* CONFIG_FSE */
	}

out:
	if (attr_cleanup) {
		vn_attribute_cleanup(&nva, defaulted);
	}
	if (free_src_acl && va.va_acl) {
		kauth_acl_free(va.va_acl);
	}
	nameidone(&tond);
	if (tvp) {
		vnode_put(tvp);
	}
	vnode_put(tdvp);
	return error;
}
8781
8782 /*
8783 * clone files or directories, target must not exist.
8784 */
8785 /* ARGSUSED */
8786 int
clonefileat(__unused proc_t p,struct clonefileat_args * uap,__unused int32_t * retval)8787 clonefileat(__unused proc_t p, struct clonefileat_args *uap,
8788 __unused int32_t *retval)
8789 {
8790 vnode_t fvp;
8791 struct nameidata fromnd;
8792 int follow;
8793 int error;
8794 vfs_context_t ctx = vfs_context_current();
8795
8796 /* Check that the flags are valid. */
8797 if (uap->flags & ~(CLONE_NOFOLLOW | CLONE_NOOWNERCOPY | CLONE_ACL)) {
8798 return EINVAL;
8799 }
8800
8801 AUDIT_ARG(fd, uap->src_dirfd);
8802
8803 follow = (uap->flags & CLONE_NOFOLLOW) ? NOFOLLOW : FOLLOW;
8804 NDINIT(&fromnd, LOOKUP, OP_COPYFILE, follow | AUDITVNPATH1,
8805 UIO_USERSPACE, uap->src, ctx);
8806 if ((error = nameiat(&fromnd, uap->src_dirfd))) {
8807 return error;
8808 }
8809
8810 fvp = fromnd.ni_vp;
8811 nameidone(&fromnd);
8812
8813 error = clonefile_internal(fvp, FALSE, uap->dst_dirfd, uap->dst,
8814 uap->flags, ctx);
8815
8816 vnode_put(fvp);
8817 return error;
8818 }
8819
int
fclonefileat(__unused proc_t p, struct fclonefileat_args *uap,
    __unused int32_t *retval)
{
	vnode_t fvp;
	struct fileproc *fp;
	int error;
	vfs_context_t ctx = vfs_context_current();

	/* Check that the flags are valid. */
	if (uap->flags & ~(CLONE_NOFOLLOW | CLONE_NOOWNERCOPY | CLONE_ACL)) {
		return EINVAL;
	}

	AUDIT_ARG(fd, uap->src_fd);
	/* Hold the fileproc until the out: label. */
	error = fp_getfvp(p, uap->src_fd, &fp, &fvp);
	if (error) {
		return error;
	}

	/* The source descriptor must have been opened for reading. */
	if ((fp->fp_glob->fg_flag & FREAD) == 0) {
		AUDIT_ARG(vnpath_withref, fvp, ARG_VNODE1);
		error = EBADF;
		goto out;
	}

	if ((error = vnode_getwithref(fvp))) {
		goto out;
	}

	AUDIT_ARG(vnpath, fvp, ARG_VNODE1);

	/* FREAD above covers READ_DATA, so pass data_read_authorised == TRUE. */
	error = clonefile_internal(fvp, TRUE, uap->dst_dirfd, uap->dst,
	    uap->flags, ctx);

	vnode_put(fvp);
out:
	file_drop(uap->src_fd);
	return error;
}
8860
static int
rename_submounts_callback(mount_t mp, void *arg)
{
	int error = 0;
	mount_t pmp = (mount_t)arg;
	int prefix_len = (int)strlen(pmp->mnt_vfsstat.f_mntonname);

	/*
	 * Mount-iterator callback, presumably invoked from the rename path:
	 * for each mount strictly below pmp's mount-on path, refresh the
	 * recorded f_mntonname from the mount's covered vnode so it reflects
	 * the post-rename location.  NOTE(review): exact caller is outside
	 * this chunk — confirm against renameat_internal.
	 */
	if (strncmp(mp->mnt_vfsstat.f_mntonname, pmp->mnt_vfsstat.f_mntonname, prefix_len) != 0) {
		return 0;
	}

	/* Require a real path-component boundary, not just a string prefix. */
	if (mp->mnt_vfsstat.f_mntonname[prefix_len] != '/') {
		return 0;
	}

	/* LK_NOWAIT: abort the iteration (-1) rather than block on a busy mount. */
	if ((error = vfs_busy(mp, LK_NOWAIT))) {
		printf("vfs_busy failed with %d for %s\n", error, mp->mnt_vfsstat.f_mntonname);
		return -1;
	}

	size_t pathlen = MAXPATHLEN;
	/* Rewrite f_mntonname in place from the covered vnode's current path. */
	if ((error = vn_getpath_ext(mp->mnt_vnodecovered, NULL, mp->mnt_vfsstat.f_mntonname, &pathlen, VN_GETPATH_FSENTER))) {
		printf("vn_getpath_ext failed with %d for mnt_vnodecovered of %s\n", error, mp->mnt_vfsstat.f_mntonname);
	}

	vfs_unbusy(mp);

	return error;
}
8890
8891 /*
8892 * Rename files. Source and destination must either both be directories,
8893 * or both not be directories. If target is a directory, it must be empty.
8894 */
8895 /* ARGSUSED */
/*
 * renameat_internal: common implementation backing rename(2), renameat(2)
 * and renameatx_np(2).
 *
 * Parameters:	ctx	vfs context supplying credentials for authorization
 *		fromfd	directory fd the 'from' path is relative to (or AT_FDCWD)
 *		from	address of the source path (interpreted per segflg)
 *		tofd	directory fd the 'to' path is relative to (or AT_FDCWD)
 *		to	address of the destination path (interpreted per segflg)
 *		segflg	UIO segment flag: user- vs. system-space path strings
 *		uflags	RENAME_* user flags; the VFS_RENAME_FLAGS_MASK subset
 *			is forwarded to the filesystem as vfs_rename_flags_t
 *
 * Returns:	0 on success, otherwise an errno value.
 *
 * The function may re-drive itself from 'retry:' when it loses a race
 * (ENOENT/ERECYCLE), when it must take the per-mount rename lock to
 * serialize tree-reshaping renames, or after materializing a dataless
 * object (EDATALESS).
 */
static int
renameat_internal(vfs_context_t ctx, int fromfd, user_addr_t from,
    int tofd, user_addr_t to, int segflg, u_int uflags)
{
	vnode_t tvp, tdvp;
	vnode_t fvp, fdvp;
	vnode_t mnt_fvp;
	struct nameidata *fromnd, *tond;
	int error = 0;
	int do_retry;
	int retry_count;
	int mntrename;
	int need_event;
	int need_kpath2;
	int has_listeners;
	const char *oname = NULL;
	char *from_name = NULL, *to_name = NULL;
	char *from_name_no_firmlink = NULL, *to_name_no_firmlink = NULL;
	int from_len = 0, to_len = 0;
	int from_len_no_firmlink = 0, to_len_no_firmlink = 0;
	int holding_mntlock;
	int vn_authorize_skipped;
	mount_t locked_mp = NULL;
	vnode_t oparent = NULLVP;
#if CONFIG_FSE
	/* to_finfo is only read when tvp/tvap is set, so it may stay uninitialized here */
	fse_info from_finfo = {}, to_finfo;
#endif
	int from_truncated = 0, to_truncated = 0;
	int from_truncated_no_firmlink = 0, to_truncated_no_firmlink = 0;
	int batched = 0;
	struct vnode_attr *fvap, *tvap;
	int continuing = 0;
	vfs_rename_flags_t flags = uflags & VFS_RENAME_FLAGS_MASK;
	int32_t nofollow_any = 0;
	/* carving out a chunk for structs that are too big to be on stack. */
	struct {
		struct nameidata from_node, to_node;
		struct vnode_attr fv_attr, tv_attr;
	} * __rename_data;

	__rename_data = kalloc_type(typeof(*__rename_data), Z_WAITOK);
	fromnd = &__rename_data->from_node;
	tond = &__rename_data->to_node;

	holding_mntlock = 0;
	do_retry = 0;
	retry_count = 0;
retry:
	/* Reset all per-attempt state; we may arrive here multiple times. */
	fvp = tvp = NULL;
	fdvp = tdvp = NULL;
	fvap = tvap = NULL;
	mnt_fvp = NULLVP;
	mntrename = FALSE;
	vn_authorize_skipped = FALSE;

	if (uflags & RENAME_NOFOLLOW_ANY) {
		nofollow_any = NAMEI_NOFOLLOW_ANY;
	}
	NDINIT(fromnd, DELETE, OP_UNLINK, WANTPARENT | AUDITVNPATH1,
	    segflg, from, ctx);
	fromnd->ni_flag = NAMEI_COMPOUNDRENAME | nofollow_any;

	NDINIT(tond, RENAME, OP_RENAME, WANTPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK,
	    segflg, to, ctx);
	tond->ni_flag = NAMEI_COMPOUNDRENAME | nofollow_any;

continue_lookup:
	/*
	 * On a continued compound lookup (EKEEPLOOKING), only re-run the
	 * namei whose NAMEI_CONTLOOKUP flag is set; on a fresh pass run both.
	 */
	if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
		if ((error = nameiat(fromnd, fromfd))) {
			goto out1;
		}
		fdvp = fromnd->ni_dvp;
		fvp = fromnd->ni_vp;

		if (fvp && fvp->v_type == VDIR) {
			tond->ni_cnd.cn_flags |= WILLBEDIR;
		}
	}

	if ((tond->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
		if ((error = nameiat(tond, tofd))) {
			/*
			 * Translate error code for rename("dir1", "dir2/.").
			 */
			if (error == EISDIR && fvp->v_type == VDIR) {
				error = EINVAL;
			}
			goto out1;
		}
		tdvp = tond->ni_dvp;
		tvp = tond->ni_vp;
	}

#if DEVELOPMENT || DEBUG
	/*
	 * XXX VSWAP: Check for entitlements or special flag here
	 * so we can restrict access appropriately.
	 */
#else /* DEVELOPMENT || DEBUG */

	/* Swap-backing files may only be renamed by the kernel itself. */
	if (fromnd->ni_vp && vnode_isswap(fromnd->ni_vp) && (ctx != vfs_context_kernel())) {
		error = EPERM;
		goto out1;
	}

	if (tond->ni_vp && vnode_isswap(tond->ni_vp) && (ctx != vfs_context_kernel())) {
		error = EPERM;
		goto out1;
	}
#endif /* DEVELOPMENT || DEBUG */

	/* RENAME_SWAP requires both endpoints to exist. */
	if (!tvp && ISSET(flags, VFS_RENAME_SWAP)) {
		error = ENOENT;
		goto out1;
	}

	if (tvp && ISSET(flags, VFS_RENAME_EXCL)) {
		int32_t pval = 0;
		int err = 0;

		/*
		 * We allow rename with VFS_RENAME_EXCL flag for an existing file which
		 * has the same name as target iff the following conditions are met:
		 * 1. the target file system is case insensitive
		 * 2. source and target directories are the same
		 * 3. source and target files are the same
		 * 4. name only differs in case (determined by underlying filesystem)
		 */
		if (fvp != tvp || fdvp != tdvp) {
			error = EEXIST;
			goto out1;
		}

		/*
		 * Assume that the target file system is case sensitive if
		 * _PC_CASE_SENSITIVE selector isn't supported.
		 */
		err = VNOP_PATHCONF(tvp, _PC_CASE_SENSITIVE, &pval, ctx);
		if (err != 0 || pval != 0) {
			error = EEXIST;
			goto out1;
		}
	}

	/* Can the filesystem perform lookup+rename as one compound VNOP? */
	batched = vnode_compound_rename_available(fdvp);

#if CONFIG_FSE
	need_event = need_fsevent(FSE_RENAME, fdvp);
	if (need_event) {
		if (fvp) {
			get_fse_info(fvp, &from_finfo, ctx);
		} else {
			/* Compound case: ask the FS to fill attributes during the VNOP. */
			error = vfs_get_notify_attributes(&__rename_data->fv_attr);
			if (error) {
				goto out1;
			}

			fvap = &__rename_data->fv_attr;
		}

		if (tvp) {
			get_fse_info(tvp, &to_finfo, ctx);
		} else if (batched) {
			error = vfs_get_notify_attributes(&__rename_data->tv_attr);
			if (error) {
				goto out1;
			}

			tvap = &__rename_data->tv_attr;
		}
	}
#else
	need_event = 0;
#endif /* CONFIG_FSE */

	has_listeners = kauth_authorize_fileop_has_listeners();

	need_kpath2 = 0;
#if CONFIG_AUDIT
	if (AUDIT_RECORD_EXISTS()) {
		need_kpath2 = 1;
	}
#endif

	/* Build the textual paths only if someone (fsevents/kauth/audit) needs them. */
	if (need_event || has_listeners) {
		if (from_name == NULL) {
			GET_PATH(from_name);
		}

		from_len = safe_getpath(fdvp, fromnd->ni_cnd.cn_nameptr, from_name, MAXPATHLEN, &from_truncated);

		if (from_name_no_firmlink == NULL) {
			GET_PATH(from_name_no_firmlink);
		}

		from_len_no_firmlink = safe_getpath_no_firmlink(fdvp, fromnd->ni_cnd.cn_nameptr, from_name_no_firmlink, MAXPATHLEN, &from_truncated_no_firmlink);
	}

	if (need_event || need_kpath2 || has_listeners) {
		if (to_name == NULL) {
			GET_PATH(to_name);
		}

		to_len = safe_getpath(tdvp, tond->ni_cnd.cn_nameptr, to_name, MAXPATHLEN, &to_truncated);

		if (to_name_no_firmlink == NULL) {
			GET_PATH(to_name_no_firmlink);
		}

		to_len_no_firmlink = safe_getpath_no_firmlink(tdvp, tond->ni_cnd.cn_nameptr, to_name_no_firmlink, MAXPATHLEN, &to_truncated_no_firmlink);
		if (to_name && need_kpath2) {
			AUDIT_ARG(kpath, to_name, ARG_KPATH2);
		}
	}
	if (!fvp) {
		/*
		 * Claim: this check will never reject a valid rename.
		 * For success, either fvp must be on the same mount as tdvp, or fvp must sit atop a vnode on the same mount as tdvp.
		 * Suppose fdvp and tdvp are not on the same mount.
		 * If fvp is on the same mount as tdvp, then fvp is not on the same mount as fdvp, so fvp is the root of its filesystem. If fvp is the root,
		 * then you can't move it to within another dir on the same mountpoint.
		 * If fvp sits atop a vnode on the same mount as fdvp, then that vnode must be part of the same mount as fdvp, which is a contradiction.
		 *
		 * If this check passes, then we are safe to pass these vnodes to the same FS.
		 */
		if (fdvp->v_mount != tdvp->v_mount) {
			error = EXDEV;
			goto out1;
		}
		goto skipped_lookup;
	}

	/*
	 * If the source and destination are the same (i.e. they're
	 * links to the same vnode) and the target file system is
	 * case sensitive, then there is nothing to do.
	 *
	 * XXX Come back to this.
	 */
	if (fvp == tvp) {
		int pathconf_val;

		/*
		 * Note: if _PC_CASE_SENSITIVE selector isn't supported,
		 * then assume that this file system is case sensitive.
		 */
		if (VNOP_PATHCONF(fvp, _PC_CASE_SENSITIVE, &pathconf_val, ctx) != 0 ||
		    pathconf_val != 0) {
			/* No-op rename; authorization still happens at out1. */
			vn_authorize_skipped = TRUE;
			goto out1;
		}
	}

	/*
	 * Allow the renaming of mount points.
	 * - target must not exist
	 * - target must reside in the same directory as source
	 * - union mounts cannot be renamed
	 * - the root fs, and tightly-linked system volumes, cannot be renamed
	 *
	 * XXX Handle this in VFS after a continued lookup (if we missed
	 * in the cache to start off)
	 *
	 * N.B. If RENAME_SWAP is being used, then @tvp != NULL and so
	 * we'll skip past here.  The file system is responsible for
	 * checking that @tvp is not a descendent of @fvp and vice versa
	 * so it should always return EINVAL if either @tvp or @fvp is the
	 * root of a volume.
	 */
	if ((fvp->v_flag & VROOT) &&
	    (fvp->v_type == VDIR) &&
	    (tvp == NULL) &&
	    (fvp->v_mountedhere == NULL) &&
	    (fdvp == tdvp) &&
	    ((fvp->v_mount->mnt_flag & (MNT_UNION | MNT_ROOTFS)) == 0) &&
	    ((fvp->v_mount->mnt_kern_flag & MNTK_SYSTEM) == 0) &&
	    (fvp->v_mount->mnt_vnodecovered != NULLVP)) {
		vnode_t coveredvp;

		/* switch fvp to the covered vnode */
		coveredvp = fvp->v_mount->mnt_vnodecovered;
		if ((vnode_getwithref(coveredvp))) {
			error = ENOENT;
			goto out1;
		}
		/*
		 * Save the 'fvp' as it is needed for vn_authorize_renamex_with_paths()
		 * later.
		 */
		mnt_fvp = fvp;

		fvp = coveredvp;
		mntrename = TRUE;
	}
	/*
	 * Check for cross-device rename.
	 */
	if ((fvp->v_mount != tdvp->v_mount) ||
	    (tvp && (fvp->v_mount != tvp->v_mount))) {
		error = EXDEV;
		goto out1;
	}

	/*
	 * If source is the same as the destination (that is the
	 * same inode number) then there is nothing to do...
	 * EXCEPT if the underlying file system supports case
	 * insensitivity and is case preserving.  In this case
	 * the file system needs to handle the special case of
	 * getting the same vnode as target (fvp) and source (tvp).
	 *
	 * Only file systems that support pathconf selectors _PC_CASE_SENSITIVE
	 * and _PC_CASE_PRESERVING can have this exception, and they need to
	 * handle the special case of getting the same vnode as target and
	 * source.  NOTE: Then the target is unlocked going into vnop_rename,
	 * so not to cause locking problems. There is a single reference on tvp.
	 *
	 * NOTE - that fvp == tvp also occurs if they are hard linked and
	 * that correct behaviour then is just to return success without doing
	 * anything.
	 *
	 * XXX filesystem should take care of this itself, perhaps...
	 */
	if (fvp == tvp && fdvp == tdvp) {
		if (fromnd->ni_cnd.cn_namelen == tond->ni_cnd.cn_namelen &&
		    !bcmp(fromnd->ni_cnd.cn_nameptr, tond->ni_cnd.cn_nameptr,
		    fromnd->ni_cnd.cn_namelen)) {
			vn_authorize_skipped = TRUE;
			goto out1;
		}
	}

	if (holding_mntlock && fvp->v_mount != locked_mp) {
		/*
		 * we're holding a reference and lock
		 * on locked_mp, but it no longer matches
		 * what we want to do... so drop our hold
		 */
		mount_unlock_renames(locked_mp);
		mount_drop(locked_mp, 0);
		holding_mntlock = 0;
	}
	if (tdvp != fdvp && fvp->v_type == VDIR) {
		/*
		 * serialize renames that re-shape
		 * the tree... if holding_mntlock is
		 * set, then we're ready to go...
		 * otherwise we
		 * first need to drop the iocounts
		 * we picked up, second take the
		 * lock to serialize the access,
		 * then finally start the lookup
		 * process over with the lock held
		 */
		if (!holding_mntlock) {
			/*
			 * need to grab a reference on
			 * the mount point before we
			 * drop all the iocounts... once
			 * the iocounts are gone, the mount
			 * could follow
			 */
			locked_mp = fvp->v_mount;
			mount_ref(locked_mp, 0);

			/*
			 * nameidone has to happen before we vnode_put(tvp)
			 * since it may need to release the fs_nodelock on the tvp
			 */
			nameidone(tond);

			if (tvp) {
				vnode_put(tvp);
			}
			vnode_put(tdvp);

			/*
			 * nameidone has to happen before we vnode_put(fdvp)
			 * since it may need to release the fs_nodelock on the fvp
			 */
			nameidone(fromnd);

			vnode_put(fvp);
			vnode_put(fdvp);

			if (mnt_fvp != NULLVP) {
				vnode_put(mnt_fvp);
			}

			mount_lock_renames(locked_mp);
			holding_mntlock = 1;

			goto retry;
		}
	} else {
		/*
		 * when we dropped the iocounts to take
		 * the lock, we allowed the identity of
		 * the various vnodes to change... if they did,
		 * we may no longer be dealing with a rename
		 * that reshapes the tree... once we're holding
		 * the iocounts, the vnodes can't change type
		 * so we're free to drop the lock at this point
		 * and continue on
		 */
		if (holding_mntlock) {
			mount_unlock_renames(locked_mp);
			mount_drop(locked_mp, 0);
			holding_mntlock = 0;
		}
	}

	/* Non-compound case: authorize here; compound VNOPs authorize in the FS. */
	if (!batched) {
		error = vn_authorize_renamex_with_paths(fdvp, mntrename ? mnt_fvp : fvp,
		    &fromnd->ni_cnd, from_name, tdvp, tvp, &tond->ni_cnd, to_name, ctx,
		    flags, NULL);
		if (error) {
			if (error == ENOENT) {
				if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
					/*
					 * We encountered a race where after doing the namei,
					 * tvp stops being valid. If so, simply re-drive the rename
					 * call from the top.
					 */
					do_retry = 1;
					retry_count += 1;
				}
			}
			goto out1;
		}
	}

	/* Release the 'mnt_fvp' now that it is no longer needed. */
	if (mnt_fvp != NULLVP) {
		vnode_put(mnt_fvp);
		mnt_fvp = NULLVP;
	}

	// save these off so we can later verify that fvp is the same
	oname = fvp->v_name;
	oparent = fvp->v_parent;

skipped_lookup:
#if CONFIG_FILE_LEASES
	/* Lease break needed for source's parent dir? */
	vnode_breakdirlease(fdvp, false, O_WRONLY);

	/* Lease break needed for target's parent dir? */
	vnode_breakdirlease(tdvp, false, O_WRONLY);
#endif

	error = vn_rename(fdvp, &fvp, &fromnd->ni_cnd, fvap,
	    tdvp, &tvp, &tond->ni_cnd, tvap,
	    flags, ctx);

	if (holding_mntlock) {
		/*
		 * we can drop our serialization
		 * lock now
		 */
		mount_unlock_renames(locked_mp);
		mount_drop(locked_mp, 0);
		holding_mntlock = 0;
	}
	if (error) {
		if (error == EDATALESS) {
			/*
			 * If we've been here before, something has gone
			 * horribly wrong and we should just get out lest
			 * we spiral around the drain forever.
			 */
			if (flags & VFS_RENAME_DATALESS) {
				error = EIO;
				goto out1;
			}

			/*
			 * The object we're renaming is dataless (or has a
			 * dataless descendent) and requires materialization
			 * before the rename occurs.  But we're holding the
			 * mount point's rename lock, so it's not safe to
			 * make the upcall.
			 *
			 * In this case, we release the lock (above), perform
			 * the materialization, and start the whole thing over.
			 */
			error = vfs_materialize_reparent(fvp, tdvp);
			if (error == 0) {
				/*
				 * The next time around we need to tell the
				 * file system that the materializtaion has
				 * been performed.
				 */
				flags |= VFS_RENAME_DATALESS;
				do_retry = 1;
			}
			goto out1;
		}
		if (error == EKEEPLOOKING) {
			/* FS wants the compound lookup continued; resume at continue_lookup. */
			if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) == 0) {
				if ((tond->ni_flag & NAMEI_CONTLOOKUP) == 0) {
					panic("EKEEPLOOKING without NAMEI_CONTLOOKUP on either ndp?");
				}
			}

			fromnd->ni_vp = fvp;
			tond->ni_vp = tvp;

			goto continue_lookup;
		}

		/*
		 * We may encounter a race in the VNOP where the destination didn't
		 * exist when we did the namei, but it does by the time we go and
		 * try to create the entry. In this case, we should re-drive this rename
		 * call from the top again.  Currently, only HFS bubbles out ERECYCLE,
		 * but other filesystems susceptible to this race could return it, too.
		 */
		if (error == ERECYCLE) {
			if (retry_count < MAX_RENAME_ERECYCLE_RETRIES) {
				do_retry = 1;
				retry_count += 1;
			} else {
				printf("rename retry limit due to ERECYCLE reached\n");
				error = ENOENT;
			}
		}

		/*
		 * For compound VNOPs, the authorization callback may return
		 * ENOENT in case of racing hardlink lookups hitting the name
		 * cache, redrive the lookup.
		 */
		if (batched && error == ENOENT) {
			if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
				do_retry = 1;
				retry_count += 1;
			}
		}

		goto out1;
	}

	/* call out to allow 3rd party notification of rename.
	 * Ignore result of kauth_authorize_fileop call.
	 */
	kauth_authorize_fileop(vfs_context_ucred(ctx),
	    KAUTH_FILEOP_RENAME,
	    (uintptr_t)from_name, (uintptr_t)to_name);
	if (flags & VFS_RENAME_SWAP) {
		/* A swap moves both ways; report the reverse direction too. */
		kauth_authorize_fileop(vfs_context_ucred(ctx),
		    KAUTH_FILEOP_RENAME,
		    (uintptr_t)to_name, (uintptr_t)from_name);
	}

#if CONFIG_FSE
	if (from_name != NULL && to_name != NULL) {
		if (from_truncated || to_truncated) {
			// set it here since only the from_finfo gets reported up to user space
			from_finfo.mode |= FSE_TRUNCATED_PATH;
		}

		if (tvap && tvp) {
			vnode_get_fse_info_from_vap(tvp, &to_finfo, tvap);
		}
		if (fvap) {
			vnode_get_fse_info_from_vap(fvp, &from_finfo, fvap);
		}

		if (tvp) {
			add_fsevent(FSE_RENAME, ctx,
			    FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
			    FSE_ARG_FINFO, &from_finfo,
			    FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
			    FSE_ARG_FINFO, &to_finfo,
			    FSE_ARG_DONE);
			if (flags & VFS_RENAME_SWAP) {
				/*
				 * Strictly speaking, swap is the equivalent of
				 * *three* renames.  FSEvents clients should only take
				 * the events as a hint, so we only bother reporting
				 * two.
				 */
				add_fsevent(FSE_RENAME, ctx,
				    FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
				    FSE_ARG_FINFO, &to_finfo,
				    FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
				    FSE_ARG_FINFO, &from_finfo,
				    FSE_ARG_DONE);
			}
		} else {
			add_fsevent(FSE_RENAME, ctx,
			    FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
			    FSE_ARG_FINFO, &from_finfo,
			    FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
			    FSE_ARG_DONE);
		}
	}
#endif /* CONFIG_FSE */

	/*
	 * update filesystem's mount point data
	 */
	if (mntrename) {
		char *cp, *pathend, *mpname;
		char * tobuf;
		struct mount *mp;
		int maxlen;
		size_t len = 0;

		mp = fvp->v_mountedhere;

		if (vfs_busy(mp, LK_NOWAIT)) {
			error = EBUSY;
			goto out1;
		}
		tobuf = zalloc(ZV_NAMEI);

		if (UIO_SEG_IS_USER_SPACE(segflg)) {
			error = copyinstr(to, tobuf, MAXPATHLEN, &len);
		} else {
			error = copystr((void *)to, tobuf, MAXPATHLEN, &len);
		}
		if (!error) {
			/* find current mount point prefix */
			pathend = &mp->mnt_vfsstat.f_mntonname[0];
			for (cp = pathend; *cp != '\0'; ++cp) {
				if (*cp == '/') {
					pathend = cp + 1;
				}
			}
			/* find last component of target name */
			for (mpname = cp = tobuf; *cp != '\0'; ++cp) {
				if (*cp == '/') {
					mpname = cp + 1;
				}
			}

			/* Update f_mntonname of sub mounts */
			vfs_iterate(0, rename_submounts_callback, (void *)mp);

			/* append name to prefix */
			maxlen = MAXPATHLEN - (int)(pathend - mp->mnt_vfsstat.f_mntonname);
			bzero(pathend, maxlen);

			strlcpy(pathend, mpname, maxlen);
		}
		zfree(ZV_NAMEI, tobuf);

		vfs_unbusy(mp);

		vfs_event_signal(NULL, VQ_UPDATE, (intptr_t)NULL);
	}
	/*
	 * fix up name & parent pointers.  note that we first
	 * check that fvp has the same name/parent pointers it
	 * had before the rename call... this is a 'weak' check
	 * at best...
	 *
	 * XXX oparent and oname may not be set in the compound vnop case
	 */
	if (batched || (oname == fvp->v_name && oparent == fvp->v_parent)) {
		int update_flags;

		update_flags = VNODE_UPDATE_NAME;

		if (fdvp != tdvp) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		vnode_update_identity(fvp, tdvp, tond->ni_cnd.cn_nameptr, tond->ni_cnd.cn_namelen, tond->ni_cnd.cn_hash, update_flags);
	}
out1:
	/*
	 * There are some cases (for e.g. 'fvp == tvp') when vn_authorize was
	 * skipped earlier as no actual rename was performed.
	 */
	if (vn_authorize_skipped && error == 0) {
		error = vn_authorize_renamex_with_paths(fdvp, fvp,
		    &fromnd->ni_cnd, from_name, tdvp, tvp, &tond->ni_cnd, to_name, ctx,
		    flags, NULL);
		if (error && error == ENOENT) {
			if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
				do_retry = 1;
				retry_count += 1;
			}
		}
	}
	if (to_name != NULL) {
		RELEASE_PATH(to_name);
		to_name = NULL;
	}
	if (to_name_no_firmlink != NULL) {
		RELEASE_PATH(to_name_no_firmlink);
		to_name_no_firmlink = NULL;
	}
	if (from_name != NULL) {
		RELEASE_PATH(from_name);
		from_name = NULL;
	}
	if (from_name_no_firmlink != NULL) {
		RELEASE_PATH(from_name_no_firmlink);
		from_name_no_firmlink = NULL;
	}
	if (holding_mntlock) {
		mount_unlock_renames(locked_mp);
		mount_drop(locked_mp, 0);
		holding_mntlock = 0;
	}
	if (tdvp) {
		/*
		 * nameidone has to happen before we vnode_put(tdvp)
		 * since it may need to release the fs_nodelock on the tdvp
		 */
		nameidone(tond);

		if (tvp) {
			vnode_put(tvp);
		}
		vnode_put(tdvp);
	}
	if (fdvp) {
		/*
		 * nameidone has to happen before we vnode_put(fdvp)
		 * since it may need to release the fs_nodelock on the fdvp
		 */
		nameidone(fromnd);

		if (fvp) {
			vnode_put(fvp);
		}
		vnode_put(fdvp);
	}
	if (mnt_fvp != NULLVP) {
		vnode_put(mnt_fvp);
	}
	/*
	 * If things changed after we did the namei, then we will re-drive
	 * this rename call from the top.
	 */
	if (do_retry) {
		do_retry = 0;
		goto retry;
	}

	kfree_type(typeof(*__rename_data), __rename_data);
	return error;
}
9644
9645 int
rename(__unused proc_t p,struct rename_args * uap,__unused int32_t * retval)9646 rename(__unused proc_t p, struct rename_args *uap, __unused int32_t *retval)
9647 {
9648 return renameat_internal(vfs_context_current(), AT_FDCWD, uap->from,
9649 AT_FDCWD, uap->to, UIO_USERSPACE, 0);
9650 }
9651
9652 int
renameatx_np(__unused proc_t p,struct renameatx_np_args * uap,__unused int32_t * retval)9653 renameatx_np(__unused proc_t p, struct renameatx_np_args *uap, __unused int32_t *retval)
9654 {
9655 if (uap->flags & ~(RENAME_SECLUDE | RENAME_EXCL | RENAME_SWAP | RENAME_NOFOLLOW_ANY)) {
9656 return EINVAL;
9657 }
9658
9659 if ((uap->flags & (RENAME_EXCL | RENAME_SWAP)) == (RENAME_EXCL | RENAME_SWAP)) {
9660 return EINVAL;
9661 }
9662
9663 return renameat_internal(vfs_context_current(), uap->fromfd, uap->from,
9664 uap->tofd, uap->to, UIO_USERSPACE, uap->flags);
9665 }
9666
9667 int
renameat(__unused proc_t p,struct renameat_args * uap,__unused int32_t * retval)9668 renameat(__unused proc_t p, struct renameat_args *uap, __unused int32_t *retval)
9669 {
9670 return renameat_internal(vfs_context_current(), uap->fromfd, uap->from,
9671 uap->tofd, uap->to, UIO_USERSPACE, 0);
9672 }
9673
9674 /*
9675 * Make a directory file.
9676 *
9677 * Returns: 0 Success
9678 * EEXIST
9679 * namei:???
9680 * vnode_authorize:???
9681 * vn_create:???
9682 */
9683 /* ARGSUSED */
/*
 * mkdir1at: create a directory named by 'path' (relative to directory fd
 * 'fd', or the CWD for AT_FDCWD), with the attributes in 'vap'.
 * 'segflg' says whether 'path' lives in user or system address space.
 * Returns 0 on success or an errno value; EEXIST if the path already exists.
 */
static int
mkdir1at(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap, int fd,
    enum uio_seg segflg)
{
	vnode_t vp, dvp;
	int error;
	int update_flags = 0;
	int batched;
	struct nameidata nd;

	AUDIT_ARG(mode, vap->va_mode);
	NDINIT(&nd, CREATE, OP_MKDIR, LOCKPARENT | AUDITVNPATH1, segflg,
	    path, ctx);
	nd.ni_cnd.cn_flags |= WILLBEDIR;
	/* Request a compound lookup+mkdir if the FS supports it. */
	nd.ni_flag = NAMEI_COMPOUNDMKDIR;

continue_lookup:
	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* A vnode came back from the lookup: the path already exists. */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}

	batched = vnode_compound_mkdir_available(dvp);

	VATTR_SET(vap, va_type, VDIR);

	/*
	 * XXX
	 * Don't authorize in VFS for compound VNOP.... mkdir -p today assumes that it will
	 * only get EXISTS or EISDIR for existing path components, and not that it could see
	 * EACCESS/EPERM--so if we authorize for mkdir on "/" for "mkdir -p /tmp/foo/bar/baz"
	 * it will fail in a spurious manner.  Need to figure out if this is valid behavior.
	 */
	if ((error = vn_authorize_mkdir(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0) {
		if (error == EACCES || error == EPERM) {
			int error2;

			/* Drop the first lookup's state before re-driving a plain lookup. */
			nameidone(&nd);
			vnode_put(dvp);
			dvp = NULLVP;

			/*
			 * Try a lookup without "NAMEI_COMPOUNDVNOP" to make sure we return EEXIST
			 * rather than EACCESS if the target exists.
			 */
			NDINIT(&nd, LOOKUP, OP_MKDIR, AUDITVNPATH1, segflg,
			    path, ctx);
			error2 = nameiat(&nd, fd);
			if (error2) {
				goto out;
			} else {
				/* Target exists: report EEXIST instead of the authorization error. */
				vp = nd.ni_vp;
				error = EEXIST;
				goto out;
			}
		}

		goto out;
	}

#if CONFIG_FILE_LEASES
	vnode_breakdirlease(dvp, false, O_WRONLY);
#endif

	/*
	 * make the directory
	 */
	if ((error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx)) != 0) {
		if (error == EKEEPLOOKING) {
			/* Compound VNOP asked us to continue the lookup. */
			nd.ni_vp = vp;
			goto continue_lookup;
		}

		goto out;
	}

	// Make sure the name & parent pointers are hooked up
	if (vp->v_name == NULL) {
		update_flags |= VNODE_UPDATE_NAME;
	}
	if (vp->v_parent == NULLVP) {
		update_flags |= VNODE_UPDATE_PARENT;
	}

	if (update_flags) {
		vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
	}

#if CONFIG_FSE
	add_fsevent(FSE_CREATE_DIR, ctx, FSE_ARG_VNODE, vp, FSE_ARG_DONE);
#endif

out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	if (dvp) {
		vnode_put(dvp);
	}

	return error;
}
9799
9800 /*
9801 * mkdir_extended: Create a directory; with extended security (ACL).
9802 *
9803 * Parameters: p Process requesting to create the directory
9804 * uap User argument descriptor (see below)
9805 * retval (ignored)
9806 *
9807 * Indirect: uap->path Path of directory to create
9808 * uap->mode Access permissions to set
9809 * uap->xsecurity ACL to set
9810 *
9811 * Returns: 0 Success
9812 * !0 Not success
9813 *
9814 */
9815 int
mkdir_extended(proc_t p,struct mkdir_extended_args * uap,__unused int32_t * retval)9816 mkdir_extended(proc_t p, struct mkdir_extended_args *uap, __unused int32_t *retval)
9817 {
9818 int ciferror;
9819 kauth_filesec_t xsecdst;
9820 struct vnode_attr va;
9821
9822 AUDIT_ARG(owner, uap->uid, uap->gid);
9823
9824 xsecdst = NULL;
9825 if ((uap->xsecurity != USER_ADDR_NULL) &&
9826 ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)) {
9827 return ciferror;
9828 }
9829
9830 VATTR_INIT(&va);
9831 VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd.fd_cmask);
9832 if (xsecdst != NULL) {
9833 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
9834 va.va_vaflags |= VA_FILESEC_ACL;
9835 }
9836
9837 ciferror = mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
9838 UIO_USERSPACE);
9839 if (xsecdst != NULL) {
9840 kauth_filesec_free(xsecdst);
9841 }
9842 return ciferror;
9843 }
9844
9845 int
mkdir(proc_t p,struct mkdir_args * uap,__unused int32_t * retval)9846 mkdir(proc_t p, struct mkdir_args *uap, __unused int32_t *retval)
9847 {
9848 struct vnode_attr va;
9849
9850 VATTR_INIT(&va);
9851 VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd.fd_cmask);
9852
9853 return mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
9854 UIO_USERSPACE);
9855 }
9856
9857 int
mkdirat(proc_t p,struct mkdirat_args * uap,__unused int32_t * retval)9858 mkdirat(proc_t p, struct mkdirat_args *uap, __unused int32_t *retval)
9859 {
9860 struct vnode_attr va;
9861
9862 VATTR_INIT(&va);
9863 VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd.fd_cmask);
9864
9865 return mkdir1at(vfs_context_current(), uap->path, &va, uap->fd,
9866 UIO_USERSPACE);
9867 }
9868
9869 static int
rmdirat_internal(vfs_context_t ctx,int fd,user_addr_t dirpath,enum uio_seg segflg,int unlink_flags)9870 rmdirat_internal(vfs_context_t ctx, int fd, user_addr_t dirpath,
9871 enum uio_seg segflg, int unlink_flags)
9872 {
9873 struct {
9874 struct nameidata nd;
9875 #if CONFIG_FSE
9876 struct vnode_attr va;
9877 #endif /* CONFIG_FSE */
9878 } *__rmdir_data;
9879 vnode_t vp, dvp;
9880 int error;
9881 struct nameidata *ndp;
9882 char *path = NULL;
9883 char *no_firmlink_path = NULL;
9884 int len_path = 0;
9885 int len_no_firmlink_path = 0;
9886 int has_listeners = 0;
9887 int need_event = 0;
9888 int truncated_path = 0;
9889 int truncated_no_firmlink_path = 0;
9890 struct vnode_attr *vap = NULL;
9891 int restart_count = 0;
9892 int batched;
9893
9894 int restart_flag;
9895 int nofollow_any = 0;
9896
9897 __rmdir_data = kalloc_type(typeof(*__rmdir_data), Z_WAITOK);
9898 ndp = &__rmdir_data->nd;
9899
9900 if (unlink_flags & VNODE_REMOVE_NOFOLLOW_ANY) {
9901 nofollow_any = NAMEI_NOFOLLOW_ANY;
9902 unlink_flags &= ~VNODE_REMOVE_NOFOLLOW_ANY;
9903 }
9904
9905 /*
9906 * This loop exists to restart rmdir in the unlikely case that two
9907 * processes are simultaneously trying to remove the same directory
9908 * containing orphaned appleDouble files.
9909 */
9910 do {
9911 NDINIT(ndp, DELETE, OP_RMDIR, LOCKPARENT | AUDITVNPATH1,
9912 segflg, dirpath, ctx);
9913 ndp->ni_flag = NAMEI_COMPOUNDRMDIR | nofollow_any;
9914 continue_lookup:
9915 restart_flag = 0;
9916 vap = NULL;
9917
9918 error = nameiat(ndp, fd);
9919 if (error) {
9920 goto err_out;
9921 }
9922
9923 dvp = ndp->ni_dvp;
9924 vp = ndp->ni_vp;
9925
9926 if (vp) {
9927 batched = vnode_compound_rmdir_available(vp);
9928
9929 if (vp->v_flag & VROOT) {
9930 /*
9931 * The root of a mounted filesystem cannot be deleted.
9932 */
9933 error = EBUSY;
9934 goto out;
9935 }
9936
9937 #if DEVELOPMENT || DEBUG
9938 /*
9939 * XXX VSWAP: Check for entitlements or special flag here
9940 * so we can restrict access appropriately.
9941 */
9942 #else /* DEVELOPMENT || DEBUG */
9943
9944 if (vnode_isswap(vp) && (ctx != vfs_context_kernel())) {
9945 error = EPERM;
9946 goto out;
9947 }
9948 #endif /* DEVELOPMENT || DEBUG */
9949
9950 /*
9951 * Removed a check here; we used to abort if vp's vid
9952 * was not the same as what we'd seen the last time around.
9953 * I do not think that check was valid, because if we retry
9954 * and all dirents are gone, the directory could legitimately
9955 * be recycled but still be present in a situation where we would
9956 * have had permission to delete. Therefore, we won't make
9957 * an effort to preserve that check now that we may not have a
9958 * vp here.
9959 */
9960
9961 if (!batched) {
9962 error = vn_authorize_rmdir(dvp, vp, &ndp->ni_cnd, ctx, NULL);
9963 if (error) {
9964 if (error == ENOENT) {
9965 if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
9966 restart_flag = 1;
9967 restart_count += 1;
9968 }
9969 }
9970 goto out;
9971 }
9972 }
9973 } else {
9974 batched = 1;
9975
9976 if (!vnode_compound_rmdir_available(dvp)) {
9977 panic("No error, but no compound rmdir?");
9978 }
9979 }
9980
9981 #if CONFIG_FSE
9982 fse_info finfo = {0};
9983
9984 need_event = need_fsevent(FSE_DELETE, dvp);
9985 if (need_event) {
9986 if (!batched) {
9987 get_fse_info(vp, &finfo, ctx);
9988 } else {
9989 error = vfs_get_notify_attributes(&__rmdir_data->va);
9990 if (error) {
9991 goto out;
9992 }
9993
9994 vap = &__rmdir_data->va;
9995 }
9996 }
9997 #endif
9998 has_listeners = kauth_authorize_fileop_has_listeners();
9999 if (need_event || has_listeners) {
10000 if (path == NULL) {
10001 GET_PATH(path);
10002 }
10003
10004 len_path = safe_getpath(dvp, ndp->ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path);
10005
10006 if (no_firmlink_path == NULL) {
10007 GET_PATH(no_firmlink_path);
10008 }
10009
10010 len_no_firmlink_path = safe_getpath_no_firmlink(dvp, ndp->ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);
10011 #if CONFIG_FSE
10012 if (truncated_no_firmlink_path) {
10013 finfo.mode |= FSE_TRUNCATED_PATH;
10014 }
10015 #endif
10016 }
10017
10018 #if CONFIG_FILE_LEASES
10019 vnode_breakdirlease(dvp, false, O_WRONLY);
10020 #endif
10021
10022 error = vn_rmdir(dvp, &vp, ndp, vap, ctx);
10023 ndp->ni_vp = vp;
10024 if (vp == NULLVP) {
10025 /* Couldn't find a vnode */
10026 goto out;
10027 }
10028
10029 if (error == EKEEPLOOKING) {
10030 goto continue_lookup;
10031 } else if (batched && error == ENOENT) {
10032 if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
10033 /*
10034 * For compound VNOPs, the authorization callback
10035 * may return ENOENT in case of racing hard link lookups
10036 * redrive the lookup.
10037 */
10038 restart_flag = 1;
10039 restart_count += 1;
10040 goto out;
10041 }
10042 }
10043
10044 /*
10045 * XXX There's no provision for passing flags
10046 * to VNOP_RMDIR(). So, if vn_rmdir() fails
10047 * because it's not empty, then we try again
10048 * with VNOP_REMOVE(), passing in a special
10049 * flag that clever file systems will know
10050 * how to handle.
10051 */
10052 if (error == ENOTEMPTY &&
10053 (unlink_flags & VNODE_REMOVE_DATALESS_DIR) != 0) {
10054 /*
10055 * Only do this if the directory is actually
10056 * marked as DATALESS.
10057 */
10058 struct vnode_attr *lvap =
10059 kalloc_type(struct vnode_attr, Z_WAITOK);
10060
10061 VATTR_INIT(lvap);
10062 VATTR_WANTED(lvap, va_flags);
10063 if (vnode_getattr(vp, lvap, ctx) == 0 &&
10064 VATTR_IS_SUPPORTED(lvap, va_flags) &&
10065 (lvap->va_flags & SF_DATALESS) != 0) {
10066 /*
10067 * If this fails, we want to keep the original
10068 * error.
10069 */
10070 if (vn_remove(dvp, &vp, ndp,
10071 VNODE_REMOVE_DATALESS_DIR, vap, ctx) == 0) {
10072 error = 0;
10073 }
10074 }
10075 kfree_type(struct vnode_attr, lvap);
10076 }
10077
10078 #if CONFIG_APPLEDOUBLE
10079 /*
10080 * Special case to remove orphaned AppleDouble
10081 * files. I don't like putting this in the kernel,
10082 * but carbon does not like putting this in carbon either,
10083 * so here we are.
10084 */
10085 if (error == ENOTEMPTY) {
10086 int ad_error = rmdir_remove_orphaned_appleDouble(vp, ctx, &restart_flag);
10087 if (ad_error == EBUSY) {
10088 error = ad_error;
10089 goto out;
10090 }
10091
10092
10093 /*
10094 * Assuming everything went well, we will try the RMDIR again
10095 */
10096 if (!ad_error) {
10097 error = vn_rmdir(dvp, &vp, ndp, vap, ctx);
10098 }
10099 }
10100 #endif /* CONFIG_APPLEDOUBLE */
10101 /*
10102 * Call out to allow 3rd party notification of delete.
10103 * Ignore result of kauth_authorize_fileop call.
10104 */
10105 if (!error) {
10106 if (has_listeners) {
10107 kauth_authorize_fileop(vfs_context_ucred(ctx),
10108 KAUTH_FILEOP_DELETE,
10109 (uintptr_t)vp,
10110 (uintptr_t)path);
10111 }
10112
10113 if (vp->v_flag & VISHARDLINK) {
10114 // see the comment in unlink1() about why we update
10115 // the parent of a hard link when it is removed
10116 vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
10117 }
10118
10119 #if CONFIG_FSE
10120 if (need_event) {
10121 if (vap) {
10122 vnode_get_fse_info_from_vap(vp, &finfo, vap);
10123 }
10124 add_fsevent(FSE_DELETE, ctx,
10125 FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
10126 FSE_ARG_FINFO, &finfo,
10127 FSE_ARG_DONE);
10128 }
10129 #endif
10130
10131 #if CONFIG_MACF
10132 mac_vnode_notify_unlink(ctx, dvp, vp, &ndp->ni_cnd);
10133 #endif
10134 }
10135
10136 out:
10137 if (path != NULL) {
10138 RELEASE_PATH(path);
10139 path = NULL;
10140 }
10141
10142 if (no_firmlink_path != NULL) {
10143 RELEASE_PATH(no_firmlink_path);
10144 no_firmlink_path = NULL;
10145 }
10146
10147 /*
10148 * nameidone has to happen before we vnode_put(dvp)
10149 * since it may need to release the fs_nodelock on the dvp
10150 */
10151 nameidone(ndp);
10152 vnode_put(dvp);
10153
10154 if (vp) {
10155 vnode_put(vp);
10156 }
10157
10158 if (restart_flag == 0) {
10159 wakeup_one((caddr_t)vp);
10160 goto err_out;
10161 }
10162 tsleep(vp, PVFS, "rm AD", 1);
10163 } while (restart_flag != 0);
10164
10165 err_out:
10166 kfree_type(typeof(*__rmdir_data), __rmdir_data);
10167
10168 return error;
10169 }
10170
10171 /*
10172 * Remove a directory file.
10173 */
10174 /* ARGSUSED */
10175 int
rmdir(__unused proc_t p,struct rmdir_args * uap,__unused int32_t * retval)10176 rmdir(__unused proc_t p, struct rmdir_args *uap, __unused int32_t *retval)
10177 {
10178 return rmdirat_internal(vfs_context_current(), AT_FDCWD,
10179 CAST_USER_ADDR_T(uap->path), UIO_USERSPACE, 0);
10180 }
10181
/*
 * Length of a struct direntry (the extended/64-bit record) holding a name
 * of `namlen` bytes, padded up to 8 byte alignment.  The subtraction of
 * (MAXPATHLEN-1) removes the unused portion of the fixed-size d_name
 * array while keeping room for the name and its NUL terminator.
 */
#define DIRENT64_LEN(namlen) \
	((sizeof(struct direntry) + (namlen) - (MAXPATHLEN-1) + 7) & ~7)

/*
 * Length of a classic struct dirent holding a name of `namelen` bytes,
 * padded up to 4 byte alignment (analogous to DIRENT64_LEN above, with
 * __DARWIN_MAXNAMLEN+1 as the size of the fixed d_name array).
 */
#define DIRENT_LEN(namelen) \
	((sizeof(struct dirent) + (namelen + 1) - (__DARWIN_MAXNAMLEN + 1) + 3) & ~3)

/* Pointer to the last byte of this dirent record, per its d_reclen. */
#define DIRENT_END(dep) \
	(((char *)(dep)) + (dep)->d_reclen - 1)
10193
10194 errno_t
vnode_readdir64(struct vnode * vp,struct uio * uio,int flags,int * eofflag,int * numdirent,vfs_context_t ctxp)10195 vnode_readdir64(struct vnode *vp, struct uio *uio, int flags, int *eofflag,
10196 int *numdirent, vfs_context_t ctxp)
10197 {
10198 /* Check if fs natively supports VNODE_READDIR_EXTENDED */
10199 if ((vp->v_mount->mnt_vtable->vfc_vfsflags & VFC_VFSREADDIR_EXTENDED) &&
10200 ((vp->v_mount->mnt_kern_flag & MNTK_DENY_READDIREXT) == 0)) {
10201 return VNOP_READDIR(vp, uio, flags, eofflag, numdirent, ctxp);
10202 } else {
10203 size_t bufsize;
10204 void * bufptr;
10205 uio_t auio;
10206 struct direntry *entry64;
10207 struct dirent *dep;
10208 size_t bytesread;
10209 int error;
10210
10211 /*
10212 * We're here because the underlying file system does not
10213 * support direnties or we mounted denying support so we must
10214 * fall back to dirents and convert them to direntries.
10215 *
10216 * Our kernel buffer needs to be smaller since re-packing will
10217 * expand each dirent. The worse case (when the name length
10218 * is 3 or less) corresponds to a struct direntry size of 32
10219 * bytes (8-byte aligned) and a struct dirent size of 12 bytes
10220 * (4-byte aligned). So having a buffer that is 3/8 the size
10221 * will prevent us from reading more than we can pack.
10222 *
10223 * Since this buffer is wired memory, we will limit the
10224 * buffer size to a maximum of 32K. We would really like to
10225 * use 32K in the MIN(), but we use magic number 87371 to
10226 * prevent uio_resid() * 3 / 8 from overflowing.
10227 */
10228 bufsize = 3 * MIN((user_size_t)uio_resid(uio), 87371u) / 8;
10229 bufptr = kalloc_data(bufsize, Z_WAITOK);
10230 if (bufptr == NULL) {
10231 return ENOMEM;
10232 }
10233
10234 auio = uio_create(1, 0, UIO_SYSSPACE, UIO_READ);
10235 uio_addiov(auio, (uintptr_t)bufptr, bufsize);
10236 auio->uio_offset = uio->uio_offset;
10237
10238 error = VNOP_READDIR(vp, auio, 0, eofflag, numdirent, ctxp);
10239
10240 dep = (struct dirent *)bufptr;
10241 bytesread = bufsize - uio_resid(auio);
10242
10243 entry64 = kalloc_type(struct direntry, Z_WAITOK);
10244 /*
10245 * Convert all the entries and copy them out to user's buffer.
10246 */
10247 while (error == 0 && (char *)dep < ((char *)bufptr + bytesread)) {
10248 /* First check that the dirent struct up to d_name is within the buffer */
10249 if ((char*)dep + offsetof(struct dirent, d_name) > ((char *)bufptr + bytesread) ||
10250 /* Check that the length of the entire dirent is within the buffer */
10251 DIRENT_END(dep) > ((char *)bufptr + bytesread) ||
10252 /* Check that the actual length including the name doesn't exceed d_reclen */
10253 DIRENT_LEN(dep->d_namlen) > dep->d_reclen) {
10254 printf("%s: %s: Bad dirent recived from directory %s\n", __func__,
10255 vp->v_mount->mnt_vfsstat.f_mntonname,
10256 vp->v_name ? vp->v_name : "<unknown>");
10257 error = EIO;
10258 break;
10259 }
10260
10261 size_t enbufsize = DIRENT64_LEN(dep->d_namlen);
10262
10263 bzero(entry64, enbufsize);
10264 /* Convert a dirent to a dirent64. */
10265 entry64->d_ino = dep->d_ino;
10266 entry64->d_seekoff = 0;
10267 entry64->d_reclen = (uint16_t)enbufsize;
10268 entry64->d_namlen = dep->d_namlen;
10269 entry64->d_type = dep->d_type;
10270 bcopy(dep->d_name, entry64->d_name, dep->d_namlen + 1);
10271
10272 /* Move to next entry. */
10273 dep = (struct dirent *)((char *)dep + dep->d_reclen);
10274
10275 /* Copy entry64 to user's buffer. */
10276 error = uiomove((caddr_t)entry64, entry64->d_reclen, uio);
10277 }
10278
10279 /* Update the real offset using the offset we got from VNOP_READDIR. */
10280 if (error == 0) {
10281 uio->uio_offset = auio->uio_offset;
10282 }
10283 uio_free(auio);
10284 kfree_data(bufptr, bufsize);
10285 kfree_type(struct direntry, entry64);
10286 return error;
10287 }
10288 }
10289
10290 #define GETDIRENTRIES_MAXBUFSIZE (128 * 1024 * 1024U)
10291
10292 /*
10293 * Read a block of directory entries in a file system independent format.
10294 */
10295 static int
getdirentries_common(int fd,user_addr_t bufp,user_size_t bufsize,ssize_t * bytesread,off_t * offset,int * eofflag,int flags)10296 getdirentries_common(int fd, user_addr_t bufp, user_size_t bufsize, ssize_t *bytesread,
10297 off_t *offset, int *eofflag, int flags)
10298 {
10299 vnode_t vp;
10300 struct vfs_context context = *vfs_context_current(); /* local copy */
10301 struct fileproc *fp;
10302 uio_t auio;
10303 int spacetype = proc_is64bit(vfs_context_proc(&context)) ? UIO_USERSPACE64 : UIO_USERSPACE32;
10304 off_t loff;
10305 int error, numdirent;
10306 UIO_STACKBUF(uio_buf, 1);
10307
10308 get_from_fd:
10309 error = fp_getfvp(vfs_context_proc(&context), fd, &fp, &vp);
10310 if (error) {
10311 return error;
10312 }
10313
10314 vn_offset_lock(fp->fp_glob);
10315 if (((vnode_t)fp_get_data(fp)) != vp) {
10316 vn_offset_unlock(fp->fp_glob);
10317 file_drop(fd);
10318 goto get_from_fd;
10319 }
10320
10321 if ((fp->fp_glob->fg_flag & FREAD) == 0) {
10322 AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
10323 error = EBADF;
10324 goto out;
10325 }
10326
10327 if (bufsize > GETDIRENTRIES_MAXBUFSIZE) {
10328 bufsize = GETDIRENTRIES_MAXBUFSIZE;
10329 }
10330
10331 #if CONFIG_MACF
10332 error = mac_file_check_change_offset(vfs_context_ucred(&context), fp->fp_glob);
10333 if (error) {
10334 goto out;
10335 }
10336 #endif
10337
10338 if ((error = vnode_getwithref(vp))) {
10339 goto out;
10340 }
10341 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
10342
10343 #if CONFIG_UNION_MOUNTS
10344 unionread:
10345 #endif /* CONFIG_UNION_MOUNTS */
10346 if (vp->v_type != VDIR) {
10347 (void)vnode_put(vp);
10348 error = EINVAL;
10349 goto out;
10350 }
10351
10352 #if CONFIG_MACF
10353 error = mac_vnode_check_readdir(&context, vp);
10354 if (error != 0) {
10355 (void)vnode_put(vp);
10356 goto out;
10357 }
10358 #endif /* MAC */
10359
10360 loff = fp->fp_glob->fg_offset;
10361 auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
10362 uio_addiov(auio, bufp, bufsize);
10363
10364 if (flags & VNODE_READDIR_EXTENDED) {
10365 error = vnode_readdir64(vp, auio, flags, eofflag, &numdirent, &context);
10366 fp->fp_glob->fg_offset = uio_offset(auio);
10367 } else {
10368 error = VNOP_READDIR(vp, auio, 0, eofflag, &numdirent, &context);
10369 fp->fp_glob->fg_offset = uio_offset(auio);
10370 }
10371 if (error) {
10372 (void)vnode_put(vp);
10373 goto out;
10374 }
10375
10376 #if CONFIG_UNION_MOUNTS
10377 if ((user_ssize_t)bufsize == uio_resid(auio) &&
10378 (vp->v_mount->mnt_flag & MNT_UNION)) {
10379 vnode_t uvp;
10380
10381 if (lookup_traverse_union(vp, &uvp, &context) == 0) {
10382 if (vnode_ref(uvp) == 0) {
10383 fp_set_data(fp, uvp);
10384 fp->fp_glob->fg_offset = 0;
10385 vnode_rele(vp);
10386 vnode_put(vp);
10387 vp = uvp;
10388 goto unionread;
10389 } else {
10390 /* could not get a ref, can't replace in fd */
10391 vnode_put(uvp);
10392 }
10393 }
10394 }
10395 #endif /* CONFIG_UNION_MOUNTS */
10396
10397 vnode_put(vp);
10398 if (offset) {
10399 *offset = loff;
10400 }
10401
10402 *bytesread = bufsize - uio_resid(auio);
10403 out:
10404 vn_offset_unlock(fp->fp_glob);
10405 file_drop(fd);
10406 return error;
10407 }
10408
10409
10410 int
getdirentries(__unused struct proc * p,struct getdirentries_args * uap,int32_t * retval)10411 getdirentries(__unused struct proc *p, struct getdirentries_args *uap, int32_t *retval)
10412 {
10413 off_t offset;
10414 ssize_t bytesread;
10415 int error, eofflag;
10416
10417 AUDIT_ARG(fd, uap->fd);
10418 error = getdirentries_common(uap->fd, uap->buf, uap->count,
10419 &bytesread, &offset, &eofflag, 0);
10420
10421 if (error == 0) {
10422 if (proc_is64bit(p)) {
10423 user64_long_t base = (user64_long_t)offset;
10424 error = copyout((caddr_t)&base, uap->basep, sizeof(user64_long_t));
10425 } else {
10426 user32_long_t base = (user32_long_t)offset;
10427 error = copyout((caddr_t)&base, uap->basep, sizeof(user32_long_t));
10428 }
10429 *retval = (int)bytesread;
10430 }
10431 return error;
10432 }
10433
10434 int
getdirentries64(__unused struct proc * p,struct getdirentries64_args * uap,user_ssize_t * retval)10435 getdirentries64(__unused struct proc *p, struct getdirentries64_args *uap, user_ssize_t *retval)
10436 {
10437 off_t offset;
10438 ssize_t bytesread;
10439 int error, eofflag;
10440 user_size_t bufsize;
10441
10442 AUDIT_ARG(fd, uap->fd);
10443
10444 /*
10445 * If the buffer is at least GETDIRENTRIES64_EXTENDED_BUFSIZE large,
10446 * then the kernel carves out the last 4 bytes to return extended
10447 * information to userspace (namely whether we reached EOF with this call).
10448 */
10449 if (uap->bufsize >= GETDIRENTRIES64_EXTENDED_BUFSIZE) {
10450 bufsize = uap->bufsize - sizeof(getdirentries64_flags_t);
10451 } else {
10452 bufsize = uap->bufsize;
10453 }
10454
10455 error = getdirentries_common(uap->fd, uap->buf, bufsize,
10456 &bytesread, &offset, &eofflag, VNODE_READDIR_EXTENDED);
10457
10458 if (error == 0) {
10459 *retval = bytesread;
10460 error = copyout((caddr_t)&offset, uap->position, sizeof(off_t));
10461
10462 if (error == 0 && uap->bufsize >= GETDIRENTRIES64_EXTENDED_BUFSIZE) {
10463 getdirentries64_flags_t flags = 0;
10464 if (eofflag) {
10465 flags |= GETDIRENTRIES64_EOF;
10466 }
10467 error = copyout(&flags, (user_addr_t)uap->buf + bufsize,
10468 sizeof(flags));
10469 }
10470 }
10471 return error;
10472 }
10473
10474
10475 /*
10476 * Set the mode mask for creation of filesystem nodes.
10477 * XXX implement xsecurity
10478 */
10479 #define UMASK_NOXSECURITY (void *)1 /* leave existing xsecurity alone */
10480 static int
umask1(proc_t p,int newmask,__unused kauth_filesec_t fsec,int32_t * retval)10481 umask1(proc_t p, int newmask, __unused kauth_filesec_t fsec, int32_t *retval)
10482 {
10483 AUDIT_ARG(mask, newmask);
10484 proc_fdlock(p);
10485 *retval = p->p_fd.fd_cmask;
10486 p->p_fd.fd_cmask = newmask & ALLPERMS;
10487 proc_fdunlock(p);
10488 return 0;
10489 }
10490
10491 /*
10492 * umask_extended: Set the mode mask for creation of filesystem nodes; with extended security (ACL).
10493 *
10494 * Parameters: p Process requesting to set the umask
10495 * uap User argument descriptor (see below)
10496 * retval umask of the process (parameter p)
10497 *
10498 * Indirect: uap->newmask umask to set
10499 * uap->xsecurity ACL to set
10500 *
10501 * Returns: 0 Success
10502 * !0 Not success
10503 *
10504 */
10505 int
umask_extended(proc_t p,struct umask_extended_args * uap,int32_t * retval)10506 umask_extended(proc_t p, struct umask_extended_args *uap, int32_t *retval)
10507 {
10508 return umask1(p, uap->newmask, KAUTH_FILESEC_NONE, retval);
10509 }
10510
10511 int
umask(proc_t p,struct umask_args * uap,int32_t * retval)10512 umask(proc_t p, struct umask_args *uap, int32_t *retval)
10513 {
10514 return umask1(p, uap->newmask, UMASK_NOXSECURITY, retval);
10515 }
10516
/* Entitlement string for revoking a device that is currently mounted.
 * NOTE(review): not referenced in this visible portion of the file —
 * presumably checked by revoke-related code elsewhere; confirm usage. */
#define REVOKE_MOUNTED_DEVICE_ENTITLEMENT \
	"com.apple.private.vfs.revoke-mounted-device"
10519
10520 /*
10521 * Void all references to file by ripping underlying filesystem
10522 * away from vnode.
10523 */
10524 /* ARGSUSED */
10525 int
revoke(proc_t p,struct revoke_args * uap,__unused int32_t * retval)10526 revoke(proc_t p, struct revoke_args *uap, __unused int32_t *retval)
10527 {
10528 vnode_t vp;
10529 struct vnode_attr va;
10530 vfs_context_t ctx = vfs_context_current();
10531 int error;
10532 struct nameidata nd;
10533
10534 NDINIT(&nd, LOOKUP, OP_REVOKE, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
10535 uap->path, ctx);
10536 error = namei(&nd);
10537 if (error) {
10538 return error;
10539 }
10540 vp = nd.ni_vp;
10541
10542 nameidone(&nd);
10543
10544 if (!(vnode_ischr(vp) || vnode_isblk(vp))) {
10545 error = ENOTSUP;
10546 goto out;
10547 }
10548
10549 if (vnode_isblk(vp) && vnode_ismountedon(vp)) {
10550 error = EBUSY;
10551 goto out;
10552 }
10553
10554 #if CONFIG_MACF
10555 error = mac_vnode_check_revoke(ctx, vp);
10556 if (error) {
10557 goto out;
10558 }
10559 #endif
10560
10561 VATTR_INIT(&va);
10562 VATTR_WANTED(&va, va_uid);
10563 if ((error = vnode_getattr(vp, &va, ctx))) {
10564 goto out;
10565 }
10566 if (kauth_cred_getuid(vfs_context_ucred(ctx)) != va.va_uid &&
10567 (error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
10568 goto out;
10569 }
10570 if (vp->v_usecount > 0 || (vnode_isaliased(vp))) {
10571 VNOP_REVOKE(vp, REVOKEALL, ctx);
10572 }
10573 out:
10574 vnode_put(vp);
10575 return error;
10576 }
10577
10578
10579 /*
10580 * HFS/HFS PlUS SPECIFIC SYSTEM CALLS
10581 * The following system calls are designed to support features
10582 * which are specific to the HFS & HFS Plus volume formats
10583 */
10584
10585
10586 /*
10587 * Obtain attribute information on objects in a directory while enumerating
10588 * the directory.
10589 */
10590 /* ARGSUSED */
10591 int
getdirentriesattr(proc_t p,struct getdirentriesattr_args * uap,int32_t * retval)10592 getdirentriesattr(proc_t p, struct getdirentriesattr_args *uap, int32_t *retval)
10593 {
10594 vnode_t vp;
10595 struct fileproc *fp;
10596 uio_t auio = NULL;
10597 int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
10598 uint32_t count = 0, savecount = 0;
10599 uint32_t newstate = 0;
10600 int error, eofflag = 0;
10601 off_t loff = 0;
10602 struct attrlist attributelist;
10603 vfs_context_t ctx = vfs_context_current();
10604 int fd = uap->fd;
10605 UIO_STACKBUF(uio_buf, 1);
10606 kauth_action_t action;
10607
10608 AUDIT_ARG(fd, fd);
10609
10610 /* Get the attributes into kernel space */
10611 if ((error = copyin(uap->alist, (caddr_t)&attributelist, sizeof(attributelist)))) {
10612 return error;
10613 }
10614 if ((error = copyin(uap->count, (caddr_t)&count, sizeof(count)))) {
10615 return error;
10616 }
10617 savecount = count;
10618
10619 get_from_fd:
10620 if ((error = fp_getfvp(p, fd, &fp, &vp))) {
10621 return error;
10622 }
10623
10624 vn_offset_lock(fp->fp_glob);
10625 if (((vnode_t)fp_get_data(fp)) != vp) {
10626 vn_offset_unlock(fp->fp_glob);
10627 file_drop(fd);
10628 goto get_from_fd;
10629 }
10630
10631 if ((fp->fp_glob->fg_flag & FREAD) == 0) {
10632 AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
10633 error = EBADF;
10634 goto out;
10635 }
10636
10637
10638 #if CONFIG_MACF
10639 error = mac_file_check_change_offset(vfs_context_ucred(ctx),
10640 fp->fp_glob);
10641 if (error) {
10642 goto out;
10643 }
10644 #endif
10645
10646
10647 if ((error = vnode_getwithref(vp))) {
10648 goto out;
10649 }
10650
10651 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
10652
10653 #if CONFIG_UNION_MOUNTS
10654 unionread:
10655 #endif /* CONFIG_UNION_MOUNTS */
10656 if (vp->v_type != VDIR) {
10657 (void)vnode_put(vp);
10658 error = EINVAL;
10659 goto out;
10660 }
10661
10662 #if CONFIG_MACF
10663 error = mac_vnode_check_readdir(ctx, vp);
10664 if (error != 0) {
10665 (void)vnode_put(vp);
10666 goto out;
10667 }
10668 #endif /* MAC */
10669
10670 /* set up the uio structure which will contain the users return buffer */
10671 loff = fp->fp_glob->fg_offset;
10672 auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
10673 uio_addiov(auio, uap->buffer, uap->buffersize);
10674
10675 /*
10676 * If the only item requested is file names, we can let that past with
10677 * just LIST_DIRECTORY. If they want any other attributes, that means
10678 * they need SEARCH as well.
10679 */
10680 action = KAUTH_VNODE_LIST_DIRECTORY;
10681 if ((attributelist.commonattr & ~ATTR_CMN_NAME) ||
10682 attributelist.fileattr || attributelist.dirattr) {
10683 action |= KAUTH_VNODE_SEARCH;
10684 }
10685
10686 if ((error = vnode_authorize(vp, NULL, action, ctx)) == 0) {
10687 /* Believe it or not, uap->options only has 32-bits of valid
10688 * info, so truncate before extending again */
10689
10690 error = VNOP_READDIRATTR(vp, &attributelist, auio, count,
10691 (uint32_t)uap->options, &newstate, &eofflag, &count, ctx);
10692 }
10693
10694 if (error) {
10695 (void) vnode_put(vp);
10696 goto out;
10697 }
10698
10699 #if CONFIG_UNION_MOUNTS
10700 /*
10701 * If we've got the last entry of a directory in a union mount
10702 * then reset the eofflag and pretend there's still more to come.
10703 * The next call will again set eofflag and the buffer will be empty,
10704 * so traverse to the underlying directory and do the directory
10705 * read there.
10706 */
10707 if (eofflag && vp->v_mount->mnt_flag & MNT_UNION) {
10708 if (uio_resid(auio) < (user_ssize_t) uap->buffersize) { // Got some entries
10709 eofflag = 0;
10710 } else { // Empty buffer
10711 vnode_t uvp;
10712 if (lookup_traverse_union(vp, &uvp, ctx) == 0) {
10713 if (vnode_ref_ext(uvp, fp->fp_glob->fg_flag & O_EVTONLY, 0) == 0) {
10714 fp_set_data(fp, uvp);
10715 fp->fp_glob->fg_offset = 0; // reset index for new dir
10716 count = savecount;
10717 vnode_rele_internal(vp, fp->fp_glob->fg_flag & O_EVTONLY, 0, 0);
10718 vnode_put(vp);
10719 vp = uvp;
10720 goto unionread;
10721 } else {
10722 /* could not get a ref, can't replace in fd */
10723 vnode_put(uvp);
10724 }
10725 }
10726 }
10727 }
10728 #endif /* CONFIG_UNION_MOUNTS */
10729
10730 (void)vnode_put(vp);
10731
10732 if (error) {
10733 goto out;
10734 }
10735 fp->fp_glob->fg_offset = uio_offset(auio); /* should be multiple of dirent, not variable */
10736
10737 if ((error = copyout((caddr_t) &count, uap->count, sizeof(count)))) {
10738 goto out;
10739 }
10740 if ((error = copyout((caddr_t) &newstate, uap->newstate, sizeof(newstate)))) {
10741 goto out;
10742 }
10743 if ((error = copyout((caddr_t) &loff, uap->basep, sizeof(loff)))) {
10744 goto out;
10745 }
10746
10747 *retval = eofflag; /* similar to getdirentries */
10748 error = 0;
10749 out:
10750 vn_offset_unlock(fp->fp_glob);
10751 file_drop(fd);
10752 return error; /* return error earlier, an retval of 0 or 1 now */
10753 } /* end of getdirentriesattr system call */
10754
10755 /*
10756 * Exchange data between two files
10757 */
10758
10759 /* ARGSUSED */
10760 int
exchangedata(__unused proc_t p,struct exchangedata_args * uap,__unused int32_t * retval)10761 exchangedata(__unused proc_t p, struct exchangedata_args *uap, __unused int32_t *retval)
10762 {
10763 struct nameidata fnd, snd;
10764 vfs_context_t ctx = vfs_context_current();
10765 vnode_t fvp;
10766 vnode_t svp;
10767 int error;
10768 u_int32_t nameiflags;
10769 char *fpath = NULL;
10770 char *spath = NULL;
10771 int flen = 0, slen = 0;
10772 int from_truncated = 0, to_truncated = 0;
10773 #if CONFIG_FSE
10774 fse_info f_finfo, s_finfo;
10775 #endif
10776
10777 nameiflags = 0;
10778 if ((uap->options & FSOPT_NOFOLLOW) == 0) {
10779 nameiflags |= FOLLOW;
10780 }
10781
10782 NDINIT(&fnd, LOOKUP, OP_EXCHANGEDATA, nameiflags | AUDITVNPATH1,
10783 UIO_USERSPACE, uap->path1, ctx);
10784
10785 error = namei(&fnd);
10786 if (error) {
10787 goto out2;
10788 }
10789
10790 nameidone(&fnd);
10791 fvp = fnd.ni_vp;
10792
10793 NDINIT(&snd, LOOKUP, OP_EXCHANGEDATA, CN_NBMOUNTLOOK | nameiflags | AUDITVNPATH2,
10794 UIO_USERSPACE, uap->path2, ctx);
10795
10796 error = namei(&snd);
10797 if (error) {
10798 vnode_put(fvp);
10799 goto out2;
10800 }
10801 nameidone(&snd);
10802 svp = snd.ni_vp;
10803
10804 /*
10805 * if the files are the same, return an inval error
10806 */
10807 if (svp == fvp) {
10808 error = EINVAL;
10809 goto out;
10810 }
10811
10812 /*
10813 * if the files are on different volumes, return an error
10814 */
10815 if (svp->v_mount != fvp->v_mount) {
10816 error = EXDEV;
10817 goto out;
10818 }
10819
10820 /* If they're not files, return an error */
10821 if ((vnode_isreg(fvp) == 0) || (vnode_isreg(svp) == 0)) {
10822 error = EINVAL;
10823 goto out;
10824 }
10825
10826 #if CONFIG_MACF
10827 error = mac_vnode_check_exchangedata(ctx,
10828 fvp, svp);
10829 if (error) {
10830 goto out;
10831 }
10832 #endif
10833 if (((error = vnode_authorize(fvp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0) ||
10834 ((error = vnode_authorize(svp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0)) {
10835 goto out;
10836 }
10837
10838 if (
10839 #if CONFIG_FSE
10840 need_fsevent(FSE_EXCHANGE, fvp) ||
10841 #endif
10842 kauth_authorize_fileop_has_listeners()) {
10843 GET_PATH(fpath);
10844 GET_PATH(spath);
10845
10846 flen = safe_getpath(fvp, NULL, fpath, MAXPATHLEN, &from_truncated);
10847 slen = safe_getpath(svp, NULL, spath, MAXPATHLEN, &to_truncated);
10848
10849 #if CONFIG_FSE
10850 get_fse_info(fvp, &f_finfo, ctx);
10851 get_fse_info(svp, &s_finfo, ctx);
10852 if (from_truncated || to_truncated) {
10853 // set it here since only the f_finfo gets reported up to user space
10854 f_finfo.mode |= FSE_TRUNCATED_PATH;
10855 }
10856 #endif
10857 }
10858 /* Ok, make the call */
10859 error = VNOP_EXCHANGE(fvp, svp, 0, ctx);
10860
10861 if (error == 0) {
10862 const char *tmpname;
10863
10864 if (fpath != NULL && spath != NULL) {
10865 /* call out to allow 3rd party notification of exchangedata.
10866 * Ignore result of kauth_authorize_fileop call.
10867 */
10868 kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_EXCHANGE,
10869 (uintptr_t)fpath, (uintptr_t)spath);
10870 }
10871 name_cache_lock();
10872
10873 tmpname = fvp->v_name;
10874 fvp->v_name = svp->v_name;
10875 svp->v_name = tmpname;
10876
10877 if (fvp->v_parent != svp->v_parent) {
10878 vnode_t tmp;
10879
10880 tmp = fvp->v_parent;
10881 fvp->v_parent = svp->v_parent;
10882 svp->v_parent = tmp;
10883 }
10884 name_cache_unlock();
10885
10886 #if CONFIG_FSE
10887 if (fpath != NULL && spath != NULL) {
10888 add_fsevent(FSE_EXCHANGE, ctx,
10889 FSE_ARG_STRING, flen, fpath,
10890 FSE_ARG_FINFO, &f_finfo,
10891 FSE_ARG_STRING, slen, spath,
10892 FSE_ARG_FINFO, &s_finfo,
10893 FSE_ARG_DONE);
10894 }
10895 #endif
10896 }
10897
10898 out:
10899 if (fpath != NULL) {
10900 RELEASE_PATH(fpath);
10901 }
10902 if (spath != NULL) {
10903 RELEASE_PATH(spath);
10904 }
10905 vnode_put(svp);
10906 vnode_put(fvp);
10907 out2:
10908 return error;
10909 }
10910
10911 /*
10912 * Return (in MB) the amount of freespace on the given vnode's volume.
10913 */
10914 uint32_t freespace_mb(vnode_t vp);
10915
10916 uint32_t
freespace_mb(vnode_t vp)10917 freespace_mb(vnode_t vp)
10918 {
10919 vfs_update_vfsstat(vp->v_mount, vfs_context_current(), VFS_USER_EVENT);
10920 return (uint32_t)(((uint64_t)vp->v_mount->mnt_vfsstat.f_bavail *
10921 vp->v_mount->mnt_vfsstat.f_bsize) >> 20);
10922 }
10923
10924 #if CONFIG_SEARCHFS
10925
10926 /* ARGSUSED */
10927
10928 int
searchfs(proc_t p,struct searchfs_args * uap,__unused int32_t * retval)10929 searchfs(proc_t p, struct searchfs_args *uap, __unused int32_t *retval)
10930 {
10931 vnode_t vp, tvp;
10932 int i, error = 0;
10933 int fserror = 0;
10934 struct nameidata nd;
10935 struct user64_fssearchblock searchblock;
10936 struct searchstate *state;
10937 struct attrlist *returnattrs;
10938 struct timeval timelimit;
10939 void *searchparams1, *searchparams2;
10940 uio_t auio = NULL;
10941 int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
10942 uint32_t nummatches;
10943 size_t mallocsize;
10944 uint32_t nameiflags;
10945 vfs_context_t ctx = vfs_context_current();
10946 UIO_STACKBUF(uio_buf, 1);
10947
10948 /* Start by copying in fsearchblock parameter list */
10949 if (IS_64BIT_PROCESS(p)) {
10950 error = copyin(uap->searchblock, (caddr_t) &searchblock, sizeof(searchblock));
10951 timelimit.tv_sec = searchblock.timelimit.tv_sec;
10952 timelimit.tv_usec = searchblock.timelimit.tv_usec;
10953 } else {
10954 struct user32_fssearchblock tmp_searchblock;
10955
10956 error = copyin(uap->searchblock, (caddr_t) &tmp_searchblock, sizeof(tmp_searchblock));
10957 // munge into 64-bit version
10958 searchblock.returnattrs = CAST_USER_ADDR_T(tmp_searchblock.returnattrs);
10959 searchblock.returnbuffer = CAST_USER_ADDR_T(tmp_searchblock.returnbuffer);
10960 searchblock.returnbuffersize = tmp_searchblock.returnbuffersize;
10961 searchblock.maxmatches = tmp_searchblock.maxmatches;
10962 /*
10963 * These casts are safe. We will promote the tv_sec into a 64 bit long if necessary
10964 * from a 32 bit long, and tv_usec is already a signed 32 bit int.
10965 */
10966 timelimit.tv_sec = (__darwin_time_t) tmp_searchblock.timelimit.tv_sec;
10967 timelimit.tv_usec = (__darwin_useconds_t) tmp_searchblock.timelimit.tv_usec;
10968 searchblock.searchparams1 = CAST_USER_ADDR_T(tmp_searchblock.searchparams1);
10969 searchblock.sizeofsearchparams1 = tmp_searchblock.sizeofsearchparams1;
10970 searchblock.searchparams2 = CAST_USER_ADDR_T(tmp_searchblock.searchparams2);
10971 searchblock.sizeofsearchparams2 = tmp_searchblock.sizeofsearchparams2;
10972 searchblock.searchattrs = tmp_searchblock.searchattrs;
10973 }
10974 if (error) {
10975 return error;
10976 }
10977
10978 /* Do a sanity check on sizeofsearchparams1 and sizeofsearchparams2.
10979 */
10980 if (searchblock.sizeofsearchparams1 > SEARCHFS_MAX_SEARCHPARMS ||
10981 searchblock.sizeofsearchparams2 > SEARCHFS_MAX_SEARCHPARMS) {
10982 return EINVAL;
10983 }
10984
10985 /* Now malloc a big bunch of space to hold the search parameters, the attrlists and the search state. */
10986 /* It all has to do into local memory and it's not that big so we might as well put it all together. */
10987 /* Searchparams1 shall be first so we might as well use that to hold the base address of the allocated*/
10988 /* block. */
10989 /* */
10990 /* NOTE: we allocate an extra 8 bytes to account for the difference in size of the searchstate */
10991 /* due to the changes in rdar://problem/12438273. That way if a 3rd party file system */
10992 /* assumes the size is still 556 bytes it will continue to work */
10993
10994 mallocsize = searchblock.sizeofsearchparams1 + searchblock.sizeofsearchparams2 +
10995 sizeof(struct attrlist) + sizeof(struct searchstate) + (2 * sizeof(uint32_t));
10996
10997 searchparams1 = kalloc_data(mallocsize, Z_WAITOK);
10998
10999 /* Now set up the various pointers to the correct place in our newly allocated memory */
11000
11001 searchparams2 = (void *) (((caddr_t) searchparams1) + searchblock.sizeofsearchparams1);
11002 returnattrs = (struct attrlist *) (((caddr_t) searchparams2) + searchblock.sizeofsearchparams2);
11003 state = (struct searchstate *) (((caddr_t) returnattrs) + sizeof(struct attrlist));
11004
11005 /* Now copy in the stuff given our local variables. */
11006
11007 if ((error = copyin(searchblock.searchparams1, searchparams1, searchblock.sizeofsearchparams1))) {
11008 goto freeandexit;
11009 }
11010
11011 if ((error = copyin(searchblock.searchparams2, searchparams2, searchblock.sizeofsearchparams2))) {
11012 goto freeandexit;
11013 }
11014
11015 if ((error = copyin(searchblock.returnattrs, (caddr_t) returnattrs, sizeof(struct attrlist)))) {
11016 goto freeandexit;
11017 }
11018
11019 if ((error = copyin(uap->state, (caddr_t) state, sizeof(struct searchstate)))) {
11020 goto freeandexit;
11021 }
11022
11023 /*
11024 * When searching a union mount, need to set the
11025 * start flag at the first call on each layer to
11026 * reset state for the new volume.
11027 */
11028 if (uap->options & SRCHFS_START) {
11029 state->ss_union_layer = 0;
11030 } else {
11031 uap->options |= state->ss_union_flags;
11032 }
11033 state->ss_union_flags = 0;
11034
11035 /*
11036 * Because searchparams1 and searchparams2 may contain an ATTR_CMN_NAME search parameter,
11037 * which is passed in with an attrreference_t, we need to inspect the buffer manually here.
11038 * The KPI does not provide us the ability to pass in the length of the buffers searchparams1
11039 * and searchparams2. To obviate the need for all searchfs-supporting filesystems to
11040 * validate the user-supplied data offset of the attrreference_t, we'll do it here.
11041 */
11042
11043 if (searchblock.searchattrs.commonattr & ATTR_CMN_NAME) {
11044 attrreference_t* string_ref;
11045 u_int32_t* start_length;
11046 user64_size_t param_length;
11047
11048 /* validate searchparams1 */
11049 param_length = searchblock.sizeofsearchparams1;
11050 /* skip the word that specifies length of the buffer */
11051 start_length = (u_int32_t*) searchparams1;
11052 start_length = start_length + 1;
11053 string_ref = (attrreference_t*) start_length;
11054
11055 /* ensure no negative offsets or too big offsets */
11056 if (string_ref->attr_dataoffset < 0) {
11057 error = EINVAL;
11058 goto freeandexit;
11059 }
11060 if (string_ref->attr_length > MAXPATHLEN) {
11061 error = EINVAL;
11062 goto freeandexit;
11063 }
11064
11065 /* Check for pointer overflow in the string ref */
11066 if (((char*) string_ref + string_ref->attr_dataoffset) < (char*) string_ref) {
11067 error = EINVAL;
11068 goto freeandexit;
11069 }
11070
11071 if (((char*) string_ref + string_ref->attr_dataoffset) > ((char*)searchparams1 + param_length)) {
11072 error = EINVAL;
11073 goto freeandexit;
11074 }
11075 if (((char*)string_ref + string_ref->attr_dataoffset + string_ref->attr_length) > ((char*)searchparams1 + param_length)) {
11076 error = EINVAL;
11077 goto freeandexit;
11078 }
11079 }
11080
11081 /* set up the uio structure which will contain the users return buffer */
11082 auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
11083 uio_addiov(auio, searchblock.returnbuffer, searchblock.returnbuffersize);
11084
11085 nameiflags = 0;
11086 if ((uap->options & FSOPT_NOFOLLOW) == 0) {
11087 nameiflags |= FOLLOW;
11088 }
11089 NDINIT(&nd, LOOKUP, OP_SEARCHFS, nameiflags | AUDITVNPATH1,
11090 UIO_USERSPACE, uap->path, ctx);
11091
11092 error = namei(&nd);
11093 if (error) {
11094 goto freeandexit;
11095 }
11096 vp = nd.ni_vp;
11097 nameidone(&nd);
11098
11099 /*
11100 * Switch to the root vnode for the volume
11101 */
11102 error = VFS_ROOT(vnode_mount(vp), &tvp, ctx);
11103 vnode_put(vp);
11104 if (error) {
11105 goto freeandexit;
11106 }
11107 vp = tvp;
11108
11109 #if CONFIG_UNION_MOUNTS
11110 /*
11111 * If it's a union mount, the path lookup takes
11112 * us to the top layer. But we may need to descend
11113 * to a lower layer. For non-union mounts the layer
11114 * is always zero.
11115 */
11116 for (i = 0; i < (int) state->ss_union_layer; i++) {
11117 if ((vp->v_mount->mnt_flag & MNT_UNION) == 0) {
11118 break;
11119 }
11120 tvp = vp;
11121 vp = vp->v_mount->mnt_vnodecovered;
11122 if (vp == NULL) {
11123 vnode_put(tvp);
11124 error = ENOENT;
11125 goto freeandexit;
11126 }
11127 error = vnode_getwithref(vp);
11128 vnode_put(tvp);
11129 if (error) {
11130 goto freeandexit;
11131 }
11132 }
11133 #endif /* CONFIG_UNION_MOUNTS */
11134
11135 #if CONFIG_MACF
11136 error = mac_vnode_check_searchfs(ctx, vp, returnattrs, &searchblock.searchattrs);
11137 if (error) {
11138 vnode_put(vp);
11139 goto freeandexit;
11140 }
11141 #endif
11142
11143
11144 /*
11145 * If searchblock.maxmatches == 0, then skip the search. This has happened
11146 * before and sometimes the underlying code doesnt deal with it well.
11147 */
11148 if (searchblock.maxmatches == 0) {
11149 nummatches = 0;
11150 goto saveandexit;
11151 }
11152
11153 /*
11154 * Allright, we have everything we need, so lets make that call.
11155 *
11156 * We keep special track of the return value from the file system:
11157 * EAGAIN is an acceptable error condition that shouldn't keep us
11158 * from copying out any results...
11159 */
11160
11161 fserror = VNOP_SEARCHFS(vp,
11162 searchparams1,
11163 searchparams2,
11164 &searchblock.searchattrs,
11165 (uint32_t)searchblock.maxmatches,
11166 &timelimit,
11167 returnattrs,
11168 &nummatches,
11169 (uint32_t)uap->scriptcode,
11170 (uint32_t)uap->options,
11171 auio,
11172 (struct searchstate *) &state->ss_fsstate,
11173 ctx);
11174
11175 #if CONFIG_UNION_MOUNTS
11176 /*
11177 * If it's a union mount we need to be called again
11178 * to search the mounted-on filesystem.
11179 */
11180 if ((vp->v_mount->mnt_flag & MNT_UNION) && fserror == 0) {
11181 state->ss_union_flags = SRCHFS_START;
11182 state->ss_union_layer++; // search next layer down
11183 fserror = EAGAIN;
11184 }
11185 #endif /* CONFIG_UNION_MOUNTS */
11186
11187 saveandexit:
11188
11189 vnode_put(vp);
11190
11191 /* Now copy out the stuff that needs copying out. That means the number of matches, the
11192 * search state. Everything was already put into he return buffer by the vop call. */
11193
11194 if ((error = copyout((caddr_t) state, uap->state, sizeof(struct searchstate))) != 0) {
11195 goto freeandexit;
11196 }
11197
11198 if ((error = suulong(uap->nummatches, (uint64_t)nummatches)) != 0) {
11199 goto freeandexit;
11200 }
11201
11202 error = fserror;
11203
11204 freeandexit:
11205
11206 kfree_data(searchparams1, mallocsize);
11207
11208 return error;
11209 } /* end of searchfs system call */
11210
11211 #else /* CONFIG_SEARCHFS */
11212
int
searchfs(__unused proc_t p, __unused struct searchfs_args *uap, __unused int32_t *retval)
{
	/* searchfs() is compiled out (!CONFIG_SEARCHFS); report it as unsupported. */
	return ENOTSUP;
}
11218
11219 #endif /* CONFIG_SEARCHFS */
11220
11221
11222 #if CONFIG_DATALESS_FILES
11223
11224 /*
11225 * === Namespace Resolver Up-call Mechanism ===
11226 *
11227 * When I/O is performed to a dataless file or directory (read, write,
11228 * lookup-in, etc.), the file system performs an upcall to the namespace
11229 * resolver (filecoordinationd) to materialize the object.
11230 *
11231 * We need multiple up-calls to be in flight at once, and we need these
11232 * up-calls to be interruptible, thus the following implementation:
11233 *
11234 * => The nspace_resolver_request represents the in-kernel request state.
11235 * It contains a request ID, storage space for the errno code returned
11236 * by filecoordinationd, and flags.
11237 *
11238 * => The request ID is simply a global monotonically incrementing 32-bit
11239 * number. Outstanding requests are stored in a hash table, and the
11240 * hash function is extremely simple.
11241 *
11242 * => When an upcall is to be made to filecoordinationd, a request structure
11243 * is allocated on the stack (it is small, and needs to live only during
11244 * the duration of the call to resolve_nspace_item_ext()). It is
11245 * initialized and inserted into the table. Some backpressure from
 * filecoordinationd is applied by limiting the number of entries that
11247 * can be inserted into the table (and thus limiting the number of
11248 * outstanding requests issued to filecoordinationd); waiting for an
11249 * available slot is interruptible.
11250 *
11251 * => Once the request has been inserted into the table, the up-call is made
11252 * to filecoordinationd via a MiG-generated stub. The up-call returns
11253 * immediately and filecoordinationd processes the request asynchronously.
11254 *
 * => The caller now waits for the request to complete.  This is achieved by
11256 * sleeping on the address of the request structure and waiting for
11257 * filecoordinationd to mark the request structure as complete. This
11258 * is an interruptible sleep call; if interrupted, the request structure
11259 * is removed from the table and EINTR is returned to the caller. If
11260 * this occurs, an advisory up-call is made to filecoordinationd with
11261 * the request ID to indicate that the request can be aborted or
11262 * de-prioritized at the discretion of filecoordinationd.
11263 *
11264 * => When filecoordinationd has completed the request, it signals completion
11265 * by writing to the vfs.nspace.complete sysctl node. Only a process
11266 * decorated as a namespace resolver can write to this sysctl node. The
11267 * value is a request ID / errno tuple passed as an array of 2 uint32_t's.
11268 * The request ID is looked up in the table, and if the request is found,
11269 * the error code is stored in the request structure and a wakeup()
11270 * issued on the address of the request structure. If the request is not
11271 * found, we simply drop the completion notification, assuming that the
11272 * caller was interrupted.
11273 *
11274 * => When the waiting thread wakes up, it extracts the error code from the
11275 * request structure, removes the request from the table, and returns the
11276 * error code to the calling function. Fini!
11277 */
11278
/*
 * In-kernel state for one materialization request issued to the namespace
 * resolver (filecoordinationd).  Allocated on the requesting thread's stack
 * and linked into the request hash table for the lifetime of the up-call.
 */
struct nspace_resolver_request {
	LIST_ENTRY(nspace_resolver_request) r_hashlink; /* hash bucket linkage */
	vnode_t r_vp;           /* vnode being materialized */
	vnode_t r_tdvp;         /* destination directory vnode, if the request specified one */
	uint32_t r_req_id;      /* request ID (key into the hash table) */
	int r_resolver_error;   /* errno reported back by the resolver */
	int r_flags;            /* RRF_* flags, below */
};
11287
/* r_flags values */
#define RRF_COMPLETE 0x0001     /* completion delivered; r_resolver_error is valid */
#define RRF_COMPLETING 0x0002   /* completion handler is still using this request */

/*
 * Completion tuple delivered by filecoordinationd via the
 * vfs.nspace.complete sysctl (see sysctl_nspace_complete()).
 */
struct nspace_resolver_completion_data {
	uint32_t req_id;        /* ID of the request being completed */
	int32_t resolver_error; /* errno result from the resolver */
	uint64_t orig_gencount; /* expected recursive gencount; 0 == don't verify */
	uint64_t orig_syncroot; /* expected sync-root ID; 0 == don't verify */
};
11297
11298 static uint32_t
next_nspace_req_id(void)11299 next_nspace_req_id(void)
11300 {
11301 static uint32_t next_req_id;
11302
11303 return OSAddAtomic(1, &next_req_id);
11304 }
11305
#define NSPACE_RESOLVER_REQ_HASHSIZE 32 /* XXX tune */
#define NSPACE_RESOLVER_MAX_OUTSTANDING 256 /* XXX tune */

/* Hash table of outstanding resolver requests, keyed by request ID. */
static LIST_HEAD(nspace_resolver_requesthead,
    nspace_resolver_request) * nspace_resolver_request_hashtbl;
static u_long nspace_resolver_request_hashmask;
/* Number of requests currently in the table (bounded by MAX_OUTSTANDING). */
static u_int nspace_resolver_request_count;
/* Set when a thread is sleeping in nspace_resolver_req_add() for a free slot. */
static bool nspace_resolver_request_wait_slot;
static LCK_GRP_DECLARE(nspace_resolver_request_lck_grp, "file namespace resolver");
/* Protects the hash table and all of the request bookkeeping above. */
static LCK_MTX_DECLARE(nspace_resolver_request_hash_mutex,
    &nspace_resolver_request_lck_grp);

#define NSPACE_REQ_LOCK() \
	lck_mtx_lock(&nspace_resolver_request_hash_mutex)
#define NSPACE_REQ_UNLOCK() \
	lck_mtx_unlock(&nspace_resolver_request_hash_mutex)

/* Map a request ID to its hash bucket. */
#define NSPACE_RESOLVER_HASH(req_id) \
	(&nspace_resolver_request_hashtbl[(req_id) & \
	nspace_resolver_request_hashmask])
11326
11327 static struct nspace_resolver_request *
nspace_resolver_req_lookup(uint32_t req_id,bool skip_completing)11328 nspace_resolver_req_lookup(uint32_t req_id, bool skip_completing)
11329 {
11330 struct nspace_resolver_requesthead *bucket;
11331 struct nspace_resolver_request *req;
11332
11333 bucket = NSPACE_RESOLVER_HASH(req_id);
11334 LIST_FOREACH(req, bucket, r_hashlink) {
11335 if (req->r_req_id == req_id) {
11336 /*
11337 * If this request already has a completion
11338 * pending, don't return it again.
11339 */
11340 if ((req->r_flags & RRF_COMPLETING) != 0 &&
11341 skip_completing) {
11342 req = NULL;
11343 }
11344 return req;
11345 }
11346 }
11347
11348 return NULL;
11349 }
11350
/*
 * Insert 'req' into the request hash table, waiting (interruptibly) for a
 * free slot if NSPACE_RESOLVER_MAX_OUTSTANDING requests are already in
 * flight.  Returns 0 on success, or the msleep() error (e.g. EINTR) if
 * the wait for a slot was interrupted.
 */
static int
nspace_resolver_req_add(struct nspace_resolver_request *req)
{
	struct nspace_resolver_requesthead *bucket;
	int error;

	NSPACE_REQ_LOCK();

	/* Backpressure: cap the number of outstanding resolver requests. */
	while (nspace_resolver_request_count >=
	    NSPACE_RESOLVER_MAX_OUTSTANDING) {
		nspace_resolver_request_wait_slot = true;
		error = msleep(&nspace_resolver_request_count,
		    &nspace_resolver_request_hash_mutex,
		    PVFS | PCATCH, "nspacerq", NULL);
		if (error) {
			NSPACE_REQ_UNLOCK();
			return error;
		}
	}

	bucket = NSPACE_RESOLVER_HASH(req->r_req_id);
#if DIAGNOSTIC
	assert(nspace_resolver_req_lookup(req->r_req_id, false) == NULL);
#endif /* DIAGNOSTIC */
	LIST_INSERT_HEAD(bucket, req, r_hashlink);
	nspace_resolver_request_count++;

	NSPACE_REQ_UNLOCK();

	return 0;
}
11382
/*
 * Wait out any in-progress completion of 'req'.  Called with
 * NSPACE_REQ_LOCK held; msleep() drops and re-acquires it.
 */
static void
nspace_resolver_req_wait_pending_completion(struct nspace_resolver_request *req)
{
	/*
	 * If a completion is in-progress, we have to wait for the
	 * completion handler to finish because it's still using 'req',
	 * which is allocated on our stack a couple of frames up.
	 */
	while ((req->r_flags & RRF_COMPLETING) != 0) {
		(void) msleep(req, &nspace_resolver_request_hash_mutex,
		    PVFS, "nspacecmplt", NULL);
	}
}
11396
11397 static void
nspace_resolver_req_remove_and_unlock(struct nspace_resolver_request * req)11398 nspace_resolver_req_remove_and_unlock(struct nspace_resolver_request *req)
11399 {
11400 struct nspace_resolver_requesthead *bucket;
11401
11402 /* We're called with NSPACE_REQ_LOCK held. */
11403
11404 bucket = NSPACE_RESOLVER_HASH(req->r_req_id);
11405 #if DIAGNOSTIC
11406 assert((req->r_flags & RRF_COMPLETING) == 0);
11407 assert(nspace_resolver_req_lookup(req->r_req_id, false) != NULL);
11408 #endif /* DIAGNOSTIC */
11409 LIST_REMOVE(req, r_hashlink);
11410 nspace_resolver_request_count--;
11411
11412 if (nspace_resolver_request_wait_slot) {
11413 nspace_resolver_request_wait_slot = false;
11414 wakeup(&nspace_resolver_request_count);
11415 }
11416
11417 nspace_resolver_req_wait_pending_completion(req);
11418
11419 NSPACE_REQ_UNLOCK();
11420 }
11421
/*
 * Remove 'req' from the request hash table.  Acquires NSPACE_REQ_LOCK;
 * the lock is released inside nspace_resolver_req_remove_and_unlock().
 */
static void
nspace_resolver_req_remove(struct nspace_resolver_request *req)
{
	NSPACE_REQ_LOCK();
	nspace_resolver_req_remove_and_unlock(req);
}
11428
11429 static void
nspace_resolver_req_cancel(uint32_t req_id)11430 nspace_resolver_req_cancel(uint32_t req_id)
11431 {
11432 kern_return_t kr;
11433 mach_port_t mp;
11434
11435 // Failures here aren't fatal -- the cancellation message
11436 // sent to the resolver is merely advisory.
11437
11438 kr = host_get_filecoordinationd_port(host_priv_self(), &mp);
11439 if (kr != KERN_SUCCESS || !IPC_PORT_VALID(mp)) {
11440 return;
11441 }
11442
11443 kr = send_nspace_resolve_cancel(mp, req_id);
11444 if (kr != KERN_SUCCESS) {
11445 os_log_error(OS_LOG_DEFAULT,
11446 "NSPACE send_nspace_resolve_cancel failure: %d", kr);
11447 }
11448
11449 ipc_port_release_send(mp);
11450 }
11451
/*
 * Sleep (interruptibly) until the resolver completes 'req'.  If the
 * sleep is interrupted, synthesize EINTR/ETIMEDOUT as the result and
 * send an advisory cancel to the resolver.  The request is always
 * removed from the table before returning.  Returns the request's
 * final errno.
 */
static int
nspace_resolver_req_wait(struct nspace_resolver_request *req)
{
	bool send_cancel_message = false;
	int error;

	NSPACE_REQ_LOCK();

	while ((req->r_flags & RRF_COMPLETE) == 0) {
		error = msleep(req, &nspace_resolver_request_hash_mutex,
		    PVFS | PCATCH, "nspace", NULL);
		if (error && error != ERESTART) {
			/* Interrupted: record our own completion status. */
			req->r_resolver_error = (error == EINTR) ? EINTR :
			    ETIMEDOUT;
			send_cancel_message = true;
			break;
		}
	}

	nspace_resolver_req_remove_and_unlock(req);

	/*
	 * It's safe to continue referencing 'req' here because it's
	 * allocated on our caller's stack.
	 */

	if (send_cancel_message) {
		nspace_resolver_req_cancel(req->r_req_id);
	}

	return req->r_resolver_error;
}
11484
11485 static void
nspace_resolver_req_mark_complete(struct nspace_resolver_request * req,int resolver_error)11486 nspace_resolver_req_mark_complete(
11487 struct nspace_resolver_request *req,
11488 int resolver_error)
11489 {
11490 req->r_resolver_error = resolver_error;
11491 req->r_flags = (req->r_flags & ~RRF_COMPLETING) | RRF_COMPLETE;
11492 wakeup(req);
11493 }
11494
/*
 * Flag 'req' as having a completion in progress so that it is skipped by
 * lookups (skip_completing) and waited on before removal.
 * Called with NSPACE_REQ_LOCK held.
 */
static void
nspace_resolver_req_mark_completion_pending(struct nspace_resolver_request *req)
{
	req->r_flags |= RRF_COMPLETING;
}
11500
/*
 * Handle a completion notification from filecoordinationd (delivered via
 * the vfs.nspace.complete sysctl).  Looks up the request by ID, verifies
 * any namespace-shape criteria the resolver supplied (recursive gencount
 * and/or sync-root ID), then marks the request complete and wakes the
 * waiting thread.  Completions for unknown IDs are silently dropped.
 */
static void
nspace_resolver_req_completed(const struct nspace_resolver_completion_data *c)
{
	struct nspace_resolver_request *req;
	int error;
	struct vnode_attr va;
	vnode_t vp;

	NSPACE_REQ_LOCK();

	req = nspace_resolver_req_lookup(c->req_id, true);
	if (req == NULL) {
		/*
		 * If we don't find the request corresponding to our req_id,
		 * just drop the completion on the floor; it's likely that
		 * the requester interrupted with a signal, or it may already
		 * be completing.
		 */
		NSPACE_REQ_UNLOCK();
		return;
	}

	/*
	 * Get out now if the resolver reported an error.
	 */
	if ((error = c->resolver_error) != 0) {
		goto out;
	}

	/*
	 * If the resolver did not specify any namespace shape criteria
	 * for letting the operation proceed, then get out now.
	 */
	if (c->orig_gencount == 0 && c->orig_syncroot == 0) {
		goto out;
	}

	/*
	 * We're going to have to acquire the mount rename lock and do
	 * some I/O in order to verify the criteria.  Mark the request
	 * as pending so no one else messes with it after we drop the
	 * NSPACE_REQ_LOCK.
	 */
	nspace_resolver_req_mark_completion_pending(req);
	NSPACE_REQ_UNLOCK();

	/*
	 * Lock out renames from changing the shape of the tree while
	 * we validate the criteria.
	 */
	mount_t locked_mp = req->r_vp->v_mount;
	mount_ref(locked_mp, 0);
	mount_lock_renames(locked_mp);

	if (c->orig_gencount != 0) {
		vp = req->r_vp;
		/* NOTE(review): 'error' is always 0 here (nonzero paths jumped to
		 * 'out' above); this check looks vestigial — confirm before removing. */
		if (error) {
			goto out_dropmount;
		}

		VATTR_INIT(&va);
		VATTR_WANTED(&va, va_recursive_gencount);
		error = vnode_getattr(vp, &va, vfs_context_kernel());
		if (error) {
			goto out_dropmount;
		}
		/* EBUSY if the subtree changed since the resolver sampled it. */
		if (VATTR_NOT_RETURNED(&va, va_recursive_gencount) ||
		    va.va_recursive_gencount != c->orig_gencount) {
			printf("nspace.complete: gencount changed! (orig %llu cur %llu)\n",
			    c->orig_gencount, va.va_recursive_gencount);
			error = EBUSY;
			goto out_dropmount;
		}
	}

	/*
	 * Ignore orig_syncroot if a destination directory wasn't specified
	 * in the request.
	 */
	if (c->orig_syncroot != 0 && (vp = req->r_tdvp) != NULL) {
		uint64_t syncroot_id;

		/* NOTE(review): as above, 'error' is always 0 at this point. */
		if (error) {
			goto out_dropmount;
		}

#ifndef APFSIOC_GET_SYNC_ROOT
#define APFSIOC_GET_SYNC_ROOT _IOR('J', 115, uint64_t)
#endif

		error = VNOP_IOCTL(vp, APFSIOC_GET_SYNC_ROOT,
		    (caddr_t)&syncroot_id, 0, vfs_context_kernel());
		if (error) {
			goto out_dropmount;
		}
		if (syncroot_id != c->orig_syncroot) {
			printf("nspace.complete: syncroot changed! (orig %llu cur %llu)\n",
			    c->orig_syncroot, syncroot_id);
			error = EBUSY;
			goto out_dropmount;
		}
	}

out_dropmount:
	mount_unlock_renames(locked_mp);
	mount_drop(locked_mp, 0);
	NSPACE_REQ_LOCK();

out:
	nspace_resolver_req_mark_complete(req, error);
	NSPACE_REQ_UNLOCK();
}
11613
/* The process currently registered as the namespace resolver, or NULL. */
static struct proc *nspace_resolver_proc;
11615
11616 static int
nspace_resolver_get_proc_state(struct proc * p,int * is_resolver)11617 nspace_resolver_get_proc_state(struct proc *p, int *is_resolver)
11618 {
11619 *is_resolver = ((p->p_lflag & P_LNSPACE_RESOLVER) &&
11620 p == nspace_resolver_proc) ? 1 : 0;
11621 return 0;
11622 }
11623
11624 static boolean_t vfs_context_is_dataless_resolver(vfs_context_t);
11625
/*
 * Register (is_resolver != 0) or unregister 'p' as the namespace
 * resolver.  The caller must be root and hold the dataless-resolver
 * entitlement (EPERM otherwise); only one resolver may be registered
 * at a time (EBUSY otherwise).
 */
static int
nspace_resolver_set_proc_state(struct proc *p, int is_resolver)
{
	vfs_context_t ctx = vfs_context_current();
	int error = 0;

	//
	// The system filecoordinationd runs as uid == 0.  This also
	// has the nice side-effect of filtering out filecoordinationd
	// running in the simulator.
	//
	if (!vfs_context_issuser(ctx) ||
	    !vfs_context_is_dataless_resolver(ctx)) {
		return EPERM;
	}

	if (is_resolver) {
		NSPACE_REQ_LOCK();

		if (nspace_resolver_proc == NULL) {
			/* Decorate the process and record it as the resolver. */
			proc_lock(p);
			p->p_lflag |= P_LNSPACE_RESOLVER;
			proc_unlock(p);
			nspace_resolver_proc = p;
		} else {
			error = EBUSY;
		}

		NSPACE_REQ_UNLOCK();
	} else {
		// This is basically just like the exit case.
		// nspace_resolver_exited() will verify that the
		// process is the resolver, and will clear the
		// global.
		nspace_resolver_exited(p);
	}

	return error;
}
11665
11666 static int
nspace_materialization_get_proc_state(struct proc * p,int * is_prevented)11667 nspace_materialization_get_proc_state(struct proc *p, int *is_prevented)
11668 {
11669 if ((p->p_lflag & P_LNSPACE_RESOLVER) != 0 ||
11670 (p->p_vfs_iopolicy &
11671 P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES) == 0) {
11672 *is_prevented = 1;
11673 } else {
11674 *is_prevented = 0;
11675 }
11676 return 0;
11677 }
11678
11679 static int
nspace_materialization_set_proc_state(struct proc * p,int is_prevented)11680 nspace_materialization_set_proc_state(struct proc *p, int is_prevented)
11681 {
11682 if (p->p_lflag & P_LNSPACE_RESOLVER) {
11683 return is_prevented ? 0 : EBUSY;
11684 }
11685
11686 if (is_prevented) {
11687 OSBitAndAtomic16(~((uint16_t)P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES), &p->p_vfs_iopolicy);
11688 } else {
11689 OSBitOrAtomic16((uint16_t)P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES, &p->p_vfs_iopolicy);
11690 }
11691 return 0;
11692 }
11693
11694 static int
nspace_materialization_get_thread_state(int * is_prevented)11695 nspace_materialization_get_thread_state(int *is_prevented)
11696 {
11697 uthread_t ut = current_uthread();
11698
11699 *is_prevented = (ut->uu_flag & UT_NSPACE_NODATALESSFAULTS) ? 1 : 0;
11700 return 0;
11701 }
11702
11703 static int
nspace_materialization_set_thread_state(int is_prevented)11704 nspace_materialization_set_thread_state(int is_prevented)
11705 {
11706 uthread_t ut = current_uthread();
11707
11708 if (is_prevented) {
11709 ut->uu_flag |= UT_NSPACE_NODATALESSFAULTS;
11710 } else {
11711 ut->uu_flag &= ~UT_NSPACE_NODATALESSFAULTS;
11712 }
11713 return 0;
11714 }
11715
11716 /* the vfs.nspace branch */
11717 SYSCTL_NODE(_vfs, OID_AUTO, nspace, CTLFLAG_RW | CTLFLAG_LOCKED, NULL, "vfs nspace hinge");
11718
11719 static int
sysctl_nspace_resolver(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)11720 sysctl_nspace_resolver(__unused struct sysctl_oid *oidp,
11721 __unused void *arg1, __unused int arg2, struct sysctl_req *req)
11722 {
11723 struct proc *p = req->p;
11724 int new_value, old_value, changed = 0;
11725 int error;
11726
11727 error = nspace_resolver_get_proc_state(p, &old_value);
11728 if (error) {
11729 return error;
11730 }
11731
11732 error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
11733 &changed);
11734 if (error == 0 && changed) {
11735 error = nspace_resolver_set_proc_state(p, new_value);
11736 }
11737 return error;
11738 }
11739
11740 /* decorate this process as the dataless file resolver */
11741 SYSCTL_PROC(_vfs_nspace, OID_AUTO, resolver,
11742 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
11743 0, 0, sysctl_nspace_resolver, "I", "");
11744
11745 static int
sysctl_nspace_prevent_materialization(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)11746 sysctl_nspace_prevent_materialization(__unused struct sysctl_oid *oidp,
11747 __unused void *arg1, __unused int arg2, struct sysctl_req *req)
11748 {
11749 struct proc *p = req->p;
11750 int new_value, old_value, changed = 0;
11751 int error;
11752
11753 error = nspace_materialization_get_proc_state(p, &old_value);
11754 if (error) {
11755 return error;
11756 }
11757
11758 error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
11759 &changed);
11760 if (error == 0 && changed) {
11761 error = nspace_materialization_set_proc_state(p, new_value);
11762 }
11763 return error;
11764 }
11765
11766 /* decorate this process as not wanting to materialize dataless files */
11767 SYSCTL_PROC(_vfs_nspace, OID_AUTO, prevent_materialization,
11768 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
11769 0, 0, sysctl_nspace_prevent_materialization, "I", "");
11770
11771 static int
sysctl_nspace_thread_prevent_materialization(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)11772 sysctl_nspace_thread_prevent_materialization(__unused struct sysctl_oid *oidp,
11773 __unused void *arg1, __unused int arg2, struct sysctl_req *req)
11774 {
11775 int new_value, old_value, changed = 0;
11776 int error;
11777
11778 error = nspace_materialization_get_thread_state(&old_value);
11779 if (error) {
11780 return error;
11781 }
11782
11783 error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
11784 &changed);
11785 if (error == 0 && changed) {
11786 error = nspace_materialization_set_thread_state(new_value);
11787 }
11788 return error;
11789 }
11790
11791 /* decorate this thread as not wanting to materialize dataless files */
11792 SYSCTL_PROC(_vfs_nspace, OID_AUTO, thread_prevent_materialization,
11793 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
11794 0, 0, sysctl_nspace_thread_prevent_materialization, "I", "");
11795
/*
 * Handler for vfs.nspace.complete.  The resolver writes a
 * { req_id, errno } pair (two uint32_t's), optionally followed by a
 * 64-bit original gencount and a 64-bit original sync-root ID, to
 * report completion of a materialization request.  Only the registered
 * resolver process may write here (EPERM otherwise).
 */
static int
sysctl_nspace_complete(__unused struct sysctl_oid *oidp, __unused void *arg1,
    __unused int arg2, struct sysctl_req *req)
{
	struct proc *p = req->p;
	uint32_t req_status[2] = { 0, 0 };
	uint64_t gencount = 0;
	uint64_t syncroot = 0;
	int error, is_resolver, changed = 0, other_changed;

	error = nspace_resolver_get_proc_state(p, &is_resolver);
	if (error) {
		return error;
	}

	if (!is_resolver) {
		return EPERM;
	}

	/* First: the mandatory { req_id, errno } tuple. */
	error = sysctl_io_opaque(req, req_status, sizeof(req_status),
	    &changed);
	if (error) {
		return error;
	}

	/*
	 * Get the gencount if it was passed.  Ignore errors, because
	 * it's optional.
	 */
	error = sysctl_io_opaque(req, &gencount, sizeof(gencount),
	    &other_changed);
	if (error) {
		gencount = 0;
		error = 0;
	}

	/*
	 * ...and now the syncroot ID.
	 */
	error = sysctl_io_opaque(req, &syncroot, sizeof(syncroot),
	    &other_changed);
	if (error) {
		syncroot = 0;
		error = 0;
	}

	/*
	 * req_status[0] is the req_id
	 *
	 * req_status[1] is the errno
	 */
	if (error == 0 && changed) {
		const struct nspace_resolver_completion_data cd = {
			.req_id = req_status[0],
			.resolver_error = req_status[1],
			.orig_gencount = gencount,
			.orig_syncroot = syncroot,
		};
		nspace_resolver_req_completed(&cd);
	}
	return error;
}
11858
11859 /* Resolver reports completed reqs here. */
11860 SYSCTL_PROC(_vfs_nspace, OID_AUTO, complete,
11861 CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
11862 0, 0, sysctl_nspace_complete, "-", "");
11863
11864 #endif /* CONFIG_DATALESS_FILES */
11865
11866 #if CONFIG_DATALESS_FILES
11867 #define __no_dataless_unused /* nothing */
11868 #else
11869 #define __no_dataless_unused __unused
11870 #endif
11871
/*
 * Decide whether materialization of dataless files is prevented for the
 * given vfs context.
 *
 * Returns:
 *   0           materialization may proceed
 *   EDEADLK     materialization is prevented
 *   EJUSTRETURN the context is a dataless manipulator; the operation
 *               proceeds as if the object were not dataless
 *
 * Note: the ordering of the checks below is semantic — per-thread
 * decorations override process-wide ones, which override the default.
 */
int
vfs_context_dataless_materialization_is_prevented(
	vfs_context_t const ctx __no_dataless_unused)
{
#if CONFIG_DATALESS_FILES
	proc_t const p = vfs_context_proc(ctx);
	thread_t const t = vfs_context_thread(ctx);
	uthread_t const ut = t ? get_bsdthread_info(t) : NULL;

	/*
	 * Kernel context ==> return EDEADLK, as we would with any random
	 * process decorated as no-materialize.
	 */
	if (ctx == vfs_context_kernel()) {
		return EDEADLK;
	}

	/*
	 * If the process has the dataless-manipulation entitlement,
	 * materialization is prevented, and depending on the kind
	 * of file system operation, things get to proceed as if the
	 * object is not dataless.
	 */
	if (vfs_context_is_dataless_manipulator(ctx)) {
		return EJUSTRETURN;
	}

	/*
	 * Per-thread decorations override any process-wide decorations.
	 * (Foundation uses this, and this overrides even the dataless-
	 * manipulation entitlement so as to make API contracts consistent.)
	 */
	if (ut != NULL) {
		if (ut->uu_flag & UT_NSPACE_NODATALESSFAULTS) {
			return EDEADLK;
		}
		if (ut->uu_flag & UT_NSPACE_FORCEDATALESSFAULTS) {
			return 0;
		}
	}

	/*
	 * If the process's iopolicy specifies that dataless files
	 * can be materialized, then we let it go ahead.
	 */
	if (p->p_vfs_iopolicy & P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES) {
		return 0;
	}
#endif /* CONFIG_DATALESS_FILES */

	/*
	 * The default behavior is to not materialize dataless files;
	 * return to the caller that deadlock was detected.
	 */
	return EDEADLK;
}
11928
/*
 * One-time initialization of the namespace-resolver request hash table.
 */
void
nspace_resolver_init(void)
{
#if CONFIG_DATALESS_FILES
	nspace_resolver_request_hashtbl =
	    hashinit(NSPACE_RESOLVER_REQ_HASHSIZE,
	    M_VNODE /* XXX */, &nspace_resolver_request_hashmask);
#endif /* CONFIG_DATALESS_FILES */
}
11938
/*
 * Called when a process exits (or explicitly unregisters via the
 * vfs.nspace.resolver sysctl).  If 'p' is the registered resolver,
 * complete every outstanding request with ETIMEDOUT and clear the
 * resolver registration.
 */
void
nspace_resolver_exited(struct proc *p __no_dataless_unused)
{
#if CONFIG_DATALESS_FILES
	struct nspace_resolver_requesthead *bucket;
	struct nspace_resolver_request *req;
	u_long idx;

	NSPACE_REQ_LOCK();

	if ((p->p_lflag & P_LNSPACE_RESOLVER) &&
	    p == nspace_resolver_proc) {
		/* Walk every bucket, failing each pending request. */
		for (idx = 0; idx <= nspace_resolver_request_hashmask; idx++) {
			bucket = &nspace_resolver_request_hashtbl[idx];
			LIST_FOREACH(req, bucket, r_hashlink) {
				nspace_resolver_req_wait_pending_completion(req);
				nspace_resolver_req_mark_complete(req,
				    ETIMEDOUT);
			}
		}
		nspace_resolver_proc = NULL;
	}

	NSPACE_REQ_UNLOCK();
#endif /* CONFIG_DATALESS_FILES */
}
11965
11966 #define DATALESS_RESOLVER_ENTITLEMENT \
11967 "com.apple.private.vfs.dataless-resolver"
11968 #define DATALESS_MANIPULATION_ENTITLEMENT \
11969 "com.apple.private.vfs.dataless-manipulation"
11970
#if CONFIG_DATALESS_FILES
/*
 * Return TRUE if the vfs context belongs to the dataless resolver,
 * i.e. the calling task holds the dataless-resolver entitlement.
 */
static boolean_t
vfs_context_is_dataless_resolver(vfs_context_t ctx __no_dataless_unused)
{
	task_t task = vfs_context_task(ctx);

	return IOTaskHasEntitlement(task, DATALESS_RESOLVER_ENTITLEMENT);
}
#endif /* CONFIG_DATALESS_FILES */
11983
11984 /*
11985 * Return TRUE if the vfs context is associated with a process entitled
11986 * for dataless manipulation.
11987 *
11988 * XXX Arguably belongs in vfs_subr.c, but is here because of the
11989 * complication around CONFIG_DATALESS_FILES.
11990 */
11991 boolean_t
vfs_context_is_dataless_manipulator(vfs_context_t ctx __no_dataless_unused)11992 vfs_context_is_dataless_manipulator(vfs_context_t ctx __no_dataless_unused)
11993 {
11994 #if CONFIG_DATALESS_FILES
11995 task_t task = vfs_context_task(ctx);
11996 return IOTaskHasEntitlement(task, DATALESS_MANIPULATION_ENTITLEMENT) ||
11997 IOTaskHasEntitlement(task, DATALESS_RESOLVER_ENTITLEMENT);
11998 #else
11999 return false;
12000 #endif /* CONFIG_DATALESS_FILES */
12001 }
12002
#if CONFIG_DATALESS_FILES
/*
 * Emit a debug log noting that materialization of a dataless object
 * was prevented because the current process is decorated as
 * no-materialization.  DEVELOPMENT kernels also include the path.
 */
static void
log_materialization_prevented(vnode_t vp, uint64_t op)
{
	char procname[MAXCOMLEN + 1];
	const char *kind;

	proc_selfname(&procname[0], sizeof(procname));

	switch (vp->v_type) {
	case VREG:
		kind = "File";
		break;
	case VDIR:
		kind = "Dir";
		break;
	case VLNK:
		kind = "SymLink";
		break;
	default:
		kind = "Other";
		break;
	}

#if DEVELOPMENT
	char *path = get_pathbuff();
	int len = MAXPATHLEN;

	if (path != NULL) {
		vn_getpath(vp, path, &len);
	}

	os_log_debug(OS_LOG_DEFAULT,
	    "NSPACE process %s (pid %d) is decorated as no-materialization (op %lld; %s) path: %s",
	    procname, proc_selfpid(),
	    op, kind, path ? path : "<unknown-path>");
	if (path != NULL) {
		release_pathbuff(path);
	}
#else
	os_log_debug(OS_LOG_DEFAULT,
	    "NSPACE process %s (pid %d) is decorated as no-materialization (op %lld; %s)",
	    procname, proc_selfpid(),
	    op, kind);
#endif
}
#endif /* CONFIG_DATALESS_FILES */
12046
12047 static int
vfs_materialize_item(vnode_t vp __no_dataless_unused,uint32_t op __no_dataless_unused,int64_t offset __no_dataless_unused,int64_t size __no_dataless_unused,char * lookup_name __no_dataless_unused,size_t const namelen __no_dataless_unused,vnode_t tdvp __no_dataless_unused)12048 vfs_materialize_item(
12049 vnode_t vp __no_dataless_unused,
12050 uint32_t op __no_dataless_unused,
12051 int64_t offset __no_dataless_unused,
12052 int64_t size __no_dataless_unused,
12053 char *lookup_name __no_dataless_unused,
12054 size_t const namelen __no_dataless_unused,
12055 vnode_t tdvp __no_dataless_unused)
12056 {
12057 #if CONFIG_DATALESS_FILES
12058 kern_return_t kern_ret;
12059 mach_port_t mach_port;
12060 char *path = NULL;
12061 vfs_context_t context;
12062 int path_len;
12063 int error;
12064 audit_token_t atoken;
12065 enum vtype vp_vtype;
12066
12067 /* Swap files are special; ignore them */
12068 if (vnode_isswap(vp)) {
12069 return 0;
12070 }
12071
12072 /*
12073 * NAMESPACE_HANDLER_SNAPSHOT_EVENT and NAMESPACE_HANDLER_TRACK_EVENT
12074 * are no longer used nor supported.
12075 */
12076 if (op & NAMESPACE_HANDLER_SNAPSHOT_EVENT) {
12077 os_log_debug(OS_LOG_DEFAULT, "NSPACE SNAPSHOT not handled");
12078 return ENOTSUP;
12079 }
12080 if (op & NAMESPACE_HANDLER_TRACK_EVENT) {
12081 os_log_debug(OS_LOG_DEFAULT, "NSPACE TRACK not handled");
12082 return ENOTSUP;
12083 }
12084
12085 /* Normalize 'op'. */
12086 op &= ~NAMESPACE_HANDLER_EVENT_TYPE_MASK;
12087
12088 /*
12089 * To-directory is only meaningful for rename operations;
12090 * ignore it if someone handed one to us unexpectedly.
12091 */
12092 if (op != NAMESPACE_HANDLER_RENAME_OP) {
12093 tdvp = NULL;
12094 }
12095
12096 context = vfs_context_current();
12097
12098 /* Remember this for later. */
12099 vp_vtype = vnode_vtype(vp);
12100
12101 error = vfs_context_dataless_materialization_is_prevented(context);
12102 if (error) {
12103 log_materialization_prevented(vp, op);
12104 goto out_check_errors;
12105 }
12106
12107 kern_ret = host_get_filecoordinationd_port(host_priv_self(),
12108 &mach_port);
12109 if (kern_ret != KERN_SUCCESS || !IPC_PORT_VALID(mach_port)) {
12110 os_log_error(OS_LOG_DEFAULT, "NSPACE no port");
12111 /*
12112 * Treat this like being unable to access the backing store
12113 * server.
12114 */
12115 return ETIMEDOUT;
12116 }
12117
12118 int path_alloc_len = MAXPATHLEN;
12119 do {
12120 path = kalloc_data(path_alloc_len, Z_WAITOK | Z_ZERO);
12121 if (path == NULL) {
12122 return ENOMEM;
12123 }
12124
12125 path_len = path_alloc_len;
12126 error = vn_getpath(vp, path, &path_len);
12127 if (error == 0) {
12128 break;
12129 } else if (error == ENOSPC) {
12130 kfree_data(path, path_alloc_len);
12131 path = NULL;
12132 } else {
12133 goto out_release_port;
12134 }
12135 } while (error == ENOSPC && (path_alloc_len += MAXPATHLEN) && path_alloc_len <= FSGETPATH_MAXBUFLEN);
12136
12137 error = vfs_context_copy_audit_token(context, &atoken);
12138 if (error) {
12139 goto out_release_port;
12140 }
12141
12142 struct nspace_resolver_request req = {
12143 .r_req_id = next_nspace_req_id(),
12144 .r_vp = vp,
12145 .r_tdvp = tdvp,
12146 };
12147
12148 error = nspace_resolver_req_add(&req);
12149 if (error) {
12150 goto out_release_port;
12151 }
12152
12153 os_log_debug(OS_LOG_DEFAULT, "NSPACE resolve_path call");
12154
12155 if (op == NAMESPACE_HANDLER_RENAME_OP && tdvp != NULL) {
12156 char *dest_path = NULL;
12157 int dest_path_len;
12158
12159 dest_path = zalloc(ZV_NAMEI);
12160 dest_path_len = MAXPATHLEN;
12161
12162 error = vn_getpath(tdvp, dest_path, &dest_path_len);
12163 if (error) {
12164 zfree(ZV_NAMEI, dest_path);
12165 goto out_release_port;
12166 }
12167
12168 /*
12169 * Force setting NAMESPACE_HANDLER_NSPACE_EVENT for
12170 * compatibility with existing agents in user-space
12171 * who get passed this value.
12172 */
12173 kern_ret = send_vfs_resolve_reparent_with_audit_token(mach_port,
12174 req.r_req_id,
12175 op | NAMESPACE_HANDLER_NSPACE_EVENT,
12176 path, dest_path, atoken);
12177
12178 zfree(ZV_NAMEI, dest_path);
12179 } else if (vp_vtype == VDIR) {
12180 char *tmpname = NULL;
12181
12182 /*
12183 * If the caller provided a lookup_name *and* a name length,
12184 * then we assume the lookup_name is not NUL-terminated.
12185 * Allocate a temporary buffer in this case to provide
12186 * a NUL-terminated path name to the IPC call.
12187 */
12188 if (lookup_name != NULL && namelen != 0) {
12189 if (namelen >= PATH_MAX) {
12190 error = EINVAL;
12191 goto out_req_remove;
12192 }
12193 tmpname = zalloc(ZV_NAMEI);
12194 strlcpy(tmpname, lookup_name, namelen + 1);
12195 lookup_name = tmpname;
12196 } else if (lookup_name != NULL) {
12197 /*
12198 * If the caller provided a lookup_name with a
12199 * zero name length, then we assume it's NUL-
12200 * terminated. Verify it has a valid length.
12201 */
12202 if (strlen(lookup_name) >= PATH_MAX) {
12203 error = EINVAL;
12204 goto out_req_remove;
12205 }
12206 }
12207
12208 /* (See above.) */
12209 kern_ret = send_vfs_resolve_dir_with_audit_token(mach_port,
12210 req.r_req_id,
12211 op | NAMESPACE_HANDLER_NSPACE_EVENT,
12212 lookup_name == NULL ? "" : lookup_name, path, atoken);
12213
12214 if (tmpname != NULL) {
12215 zfree(ZV_NAMEI, tmpname);
12216
12217 /*
12218 * Poison lookup_name rather than reference
12219 * freed memory.
12220 */
12221 lookup_name = NULL;
12222 }
12223 } else {
12224 /* (See above.) */
12225 kern_ret = send_vfs_resolve_file_with_audit_token(mach_port,
12226 req.r_req_id,
12227 op | NAMESPACE_HANDLER_NSPACE_EVENT,
12228 offset, size, path, atoken);
12229 }
12230 if (kern_ret != KERN_SUCCESS) {
12231 /*
12232 * Also treat this like being unable to access the backing
12233 * store server.
12234 */
12235 os_log_error(OS_LOG_DEFAULT, "NSPACE resolve failure: %d",
12236 kern_ret);
12237 error = ETIMEDOUT;
12238 goto out_req_remove;
12239 }
12240
12241 /*
12242 * Give back the memory we allocated earlier while we wait; we
12243 * no longer need it.
12244 */
12245 kfree_data(path, path_alloc_len);
12246 path = NULL;
12247
12248 /*
12249 * Request has been submitted to the resolver. Now (interruptibly)
12250 * wait for completion. Upon requrn, the request will have been
12251 * removed from the lookup table.
12252 */
12253 error = nspace_resolver_req_wait(&req);
12254
12255 out_release_port:
12256 if (path != NULL) {
12257 kfree_data(path, path_alloc_len);
12258 path = NULL;
12259 }
12260 ipc_port_release_send(mach_port);
12261
12262 out_check_errors:
12263 /*
12264 * The file resolver owns the logic about what error to return
12265 * to the caller. We only need to handle a couple of special
12266 * cases here:
12267 */
12268 if (error == EJUSTRETURN) {
12269 /*
12270 * The requesting process is allowed to interact with
12271 * dataless objects. Make a couple of sanity-checks
12272 * here to ensure the action makes sense.
12273 */
12274 switch (op) {
12275 case NAMESPACE_HANDLER_WRITE_OP:
12276 case NAMESPACE_HANDLER_TRUNCATE_OP:
12277 case NAMESPACE_HANDLER_RENAME_OP:
12278 /*
12279 * This handles the case of the resolver itself
12280 * writing data to the file (or throwing it
12281 * away).
12282 */
12283 error = 0;
12284 break;
12285 case NAMESPACE_HANDLER_READ_OP:
12286 case NAMESPACE_HANDLER_LOOKUP_OP:
12287 /*
12288 * This handles the case of the resolver needing
12289 * to look up inside of a dataless directory while
12290 * it's in the process of materializing it (for
12291 * example, creating files or directories).
12292 */
12293 error = (vp_vtype == VDIR) ? 0 : EBADF;
12294 break;
12295 default:
12296 error = EBADF;
12297 break;
12298 }
12299 }
12300
12301 return error;
12302
12303 out_req_remove:
12304 nspace_resolver_req_remove(&req);
12305 goto out_release_port;
12306 #else
12307 return ENOTSUP;
12308 #endif /* CONFIG_DATALESS_FILES */
12309 }
12310
12311 /*
12312 * vfs_materialize_file: Materialize a regular file.
12313 *
12314 * Inputs:
12315 * vp The dataless file to be materialized.
12316 *
12317 * op What kind of operation is being performed:
12318 * -> NAMESPACE_HANDLER_READ_OP
12319 * -> NAMESPACE_HANDLER_WRITE_OP
12320 * -> NAMESPACE_HANDLER_LINK_CREATE
12321 * -> NAMESPACE_HANDLER_DELETE_OP
12322 * -> NAMESPACE_HANDLER_TRUNCATE_OP
12323 * -> NAMESPACE_HANDLER_RENAME_OP
12324 *
12325 * offset offset of I/O for READ or WRITE. Ignored for
12326 * other ops.
12327 *
12328 * size size of I/O for READ or WRITE Ignored for
12329 * other ops.
12330 *
12331 * If offset or size are -1 for a READ or WRITE, then the resolver should
12332 * consider the range to be unknown.
12333 *
12334 * Upon successful return, the caller may proceed with the operation.
12335 * N.B. the file may still be "dataless" in this case.
12336 */
12337 int
vfs_materialize_file(struct vnode * vp,uint64_t op,int64_t offset,int64_t size)12338 vfs_materialize_file(
12339 struct vnode *vp,
12340 uint64_t op,
12341 int64_t offset,
12342 int64_t size)
12343 {
12344 if (vp->v_type != VREG) {
12345 return EFTYPE;
12346 }
12347 return vfs_materialize_item(vp, (uint32_t)op, offset, size, NULL, 0,
12348 NULL);
12349 }
12350
12351 /*
12352 * vfs_materialize_dir:
12353 *
12354 * Inputs:
12355 * vp The dataless directory to be materialized.
12356 *
12357 * op What kind of operation is being performed:
12358 * -> NAMESPACE_HANDLER_READ_OP
12359 * -> NAMESPACE_HANDLER_WRITE_OP
12360 * -> NAMESPACE_HANDLER_DELETE_OP
12361 * -> NAMESPACE_HANDLER_RENAME_OP
12362 * -> NAMESPACE_HANDLER_LOOKUP_OP
12363 *
12364 * lookup_name Name being looked up for a LOOKUP op. Ignored for
12365 * other ops. May or may not be NUL-terminated; see below.
12366 *
12367 * namelen If non-zero, then lookup_name is assumed to not be NUL-
12368 * terminated and namelen is the number of valid bytes in
12369 * lookup_name. If zero, then lookup_name is assumed to be
12370 * NUL-terminated.
12371 *
12372 * Upon successful return, the caller may proceed with the operation.
12373 * N.B. the directory may still be "dataless" in this case.
12374 */
12375 int
vfs_materialize_dir(struct vnode * vp,uint64_t op,char * lookup_name,size_t namelen)12376 vfs_materialize_dir(
12377 struct vnode *vp,
12378 uint64_t op,
12379 char *lookup_name,
12380 size_t namelen)
12381 {
12382 if (vp->v_type != VDIR) {
12383 return EFTYPE;
12384 }
12385 if (op == NAMESPACE_HANDLER_LOOKUP_OP && lookup_name == NULL) {
12386 return EINVAL;
12387 }
12388 return vfs_materialize_item(vp, (uint32_t)op, 0, 0, lookup_name,
12389 namelen, NULL);
12390 }
12391
12392 /*
12393 * vfs_materialize_reparent:
12394 *
12395 * Inputs:
12396 * vp The dataless file or directory to be materialized.
12397 *
12398 * tdvp The new parent directory for the dataless file.
12399 *
12400 * Upon successful return, the caller may proceed with the operation.
12401 * N.B. the item may still be "dataless" in this case.
12402 */
12403 int
vfs_materialize_reparent(vnode_t vp,vnode_t tdvp)12404 vfs_materialize_reparent(vnode_t vp, vnode_t tdvp)
12405 {
12406 if (vp->v_type != VDIR && vp->v_type != VREG) {
12407 return EFTYPE;
12408 }
12409 return vfs_materialize_item(vp, NAMESPACE_HANDLER_RENAME_OP,
12410 0, 0, NULL, 0, tdvp);
12411 }
12412
#if 0
/*
 * Dead code, deliberately compiled out: builds a legacy volfs-style
 * "/.vol/<fsid>/<fileid>" path for a vnode.  Retained for reference.
 */
static int
build_volfs_path(struct vnode *vp, char *path, int *len)
{
	struct vnode_attr va;
	int ret;

	VATTR_INIT(&va);
	VATTR_WANTED(&va, va_fsid);
	VATTR_WANTED(&va, va_fileid);

	if (vnode_getattr(vp, &va, vfs_context_kernel()) != 0) {
		/* Can't get the attrs; emit a recognizable bogus path. */
		*len = snprintf(path, *len, "/non/existent/path/because/vnode_getattr/failed") + 1;
		ret = -1;
	} else {
		*len = snprintf(path, *len, "/.vol/%d/%lld", (dev_t)va.va_fsid, va.va_fileid) + 1;
		ret = 0;
	}

	return ret;
}
#endif
12435
12436 static unsigned long
fsctl_bogus_command_compat(unsigned long cmd)12437 fsctl_bogus_command_compat(unsigned long cmd)
12438 {
12439 switch (cmd) {
12440 case IOCBASECMD(FSIOC_SYNC_VOLUME):
12441 return FSIOC_SYNC_VOLUME;
12442 case IOCBASECMD(FSIOC_ROUTEFS_SETROUTEID):
12443 return FSIOC_ROUTEFS_SETROUTEID;
12444 case IOCBASECMD(FSIOC_SET_PACKAGE_EXTS):
12445 return FSIOC_SET_PACKAGE_EXTS;
12446 case IOCBASECMD(FSIOC_SET_FSTYPENAME_OVERRIDE):
12447 return FSIOC_SET_FSTYPENAME_OVERRIDE;
12448 case IOCBASECMD(DISK_CONDITIONER_IOC_GET):
12449 return DISK_CONDITIONER_IOC_GET;
12450 case IOCBASECMD(DISK_CONDITIONER_IOC_SET):
12451 return DISK_CONDITIONER_IOC_SET;
12452 case IOCBASECMD(FSIOC_FIOSEEKHOLE):
12453 return FSIOC_FIOSEEKHOLE;
12454 case IOCBASECMD(FSIOC_FIOSEEKDATA):
12455 return FSIOC_FIOSEEKDATA;
12456 case IOCBASECMD(SPOTLIGHT_IOC_GET_MOUNT_TIME):
12457 return SPOTLIGHT_IOC_GET_MOUNT_TIME;
12458 case IOCBASECMD(SPOTLIGHT_IOC_GET_LAST_MTIME):
12459 return SPOTLIGHT_IOC_GET_LAST_MTIME;
12460 }
12461
12462 return cmd;
12463 }
12464
12465 static int
cas_bsdflags_setattr(vnode_t vp,void * arg,vfs_context_t ctx)12466 cas_bsdflags_setattr(vnode_t vp, void *arg, vfs_context_t ctx)
12467 {
12468 return VNOP_IOCTL(vp, FSIOC_CAS_BSDFLAGS, arg, FWRITE, ctx);
12469 }
12470
/*
 * FSIOC_SYNC_VOLUME: sync the volume containing vp.
 *
 * On return, *arg_vp is set to NULL to tell the caller that this
 * routine has already released the vnode's iocount.
 */
static int __attribute__((noinline))
handle_sync_volume(vnode_t vp, vnode_t *arg_vp, caddr_t data, vfs_context_t ctx)
{
	struct vfs_attr vfa;
	mount_t mp = vp->v_mount;
	unsigned arg;
	int error;

	/* record vid of vp so we can drop it below. */
	uint32_t vvid = vp->v_id;

	/*
	 * Then grab mount_iterref so that we can release the vnode.
	 * Without this, a thread may call vnode_iterate_prepare then
	 * get into a deadlock because we've never released the root vp
	 */
	error = mount_iterref(mp, 0);
	if (error) {
		return error;
	}
	/* Keep a holdcount so vp can't be freed while we use it below. */
	vnode_hold(vp);
	vnode_put(vp);

	/* Translate the user's wait request into MNT_* wait flags. */
	arg = MNT_NOWAIT;
	if (*(uint32_t*)data & FSCTL_SYNC_WAIT) {
		arg = MNT_WAIT;
	}

	/*
	 * If the filesystem supports multiple filesystems in a
	 * partition (e.g. APFS volumes in a container), it knows
	 * that the waitfor argument to VFS_SYNC are flags.
	 */
	VFSATTR_INIT(&vfa);
	VFSATTR_WANTED(&vfa, f_capabilities);
	if ((vfs_getattr(mp, &vfa, vfs_context_current()) == 0) &&
	    VFSATTR_IS_SUPPORTED(&vfa, f_capabilities) &&
	    ((vfa.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_SHARED_SPACE)) &&
	    ((vfa.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_SHARED_SPACE))) {
		arg |= MNT_VOLUME;
	}

	/* issue the sync for this volume */
	(void)sync_callback(mp, &arg);

	/*
	 * Then release the mount_iterref once we're done syncing; it's not
	 * needed for the VNOP_IOCTL below
	 */
	mount_iterdrop(mp);

	/*
	 * NOTE(review): this tests the MNT_* wait flags computed above,
	 * not the user-supplied FSCTL_SYNC_FULLSYNC bit in *data, which
	 * looks suspicious -- confirm intent before changing.
	 */
	if (arg & FSCTL_SYNC_FULLSYNC) {
		/* re-obtain vnode iocount on the root vp, if possible */
		error = vnode_getwithvid(vp, vvid);
		if (error == 0) {
			error = VNOP_IOCTL(vp, F_FULLFSYNC, (caddr_t)NULL, 0, ctx);
			vnode_put(vp);
		}
	}
	vnode_drop(vp);
	/* mark the argument VP as having been released */
	*arg_vp = NULL;
	return error;
}
12535
#if ROUTEFS
/*
 * FSIOC_ROUTEFS_SETROUTEID: mount routefs at a caller-supplied path.
 * Superuser only; the path is copied in from user space.
 */
static int __attribute__((noinline))
handle_routes(user_addr_t udata)
{
	char routepath[MAXPATHLEN] = {0};
	size_t copied = 0;
	int error;

	error = suser(kauth_cred_get(), &(current_proc()->p_acflag));
	if (error != 0) {
		return error;
	}
	error = copyinstr(udata, &routepath[0], MAXPATHLEN, &copied);
	if (error != 0) {
		return error;
	}
	return routefs_kernel_mount(routepath);
}
#endif
12556
12557 static int __attribute__((noinline))
handle_flags(vnode_t vp,caddr_t data,vfs_context_t ctx)12558 handle_flags(vnode_t vp, caddr_t data, vfs_context_t ctx)
12559 {
12560 struct fsioc_cas_bsdflags *cas = (struct fsioc_cas_bsdflags *)data;
12561 struct vnode_attr va;
12562 int error;
12563
12564 VATTR_INIT(&va);
12565 VATTR_SET(&va, va_flags, cas->new_flags);
12566
12567 error = chflags0(vp, &va, cas_bsdflags_setattr, cas, ctx);
12568
12569 #if CONFIG_FSE
12570 if (error == 0 && cas->expected_flags == cas->actual_flags && need_fsevent(FSE_STAT_CHANGED, vp)) {
12571 add_fsevent(FSE_STAT_CHANGED, ctx, FSE_ARG_VNODE, vp, FSE_ARG_DONE);
12572 }
12573 #endif
12574
12575 return error;
12576 }
12577
12578 static int __attribute__((noinline))
handle_auth(vnode_t vp,u_long cmd,caddr_t data,u_long options,vfs_context_t ctx)12579 handle_auth(vnode_t vp, u_long cmd, caddr_t data, u_long options, vfs_context_t ctx)
12580 {
12581 struct mount *mp = NULL;
12582 errno_t rootauth = 0;
12583
12584 mp = vp->v_mount;
12585
12586 /*
12587 * query the underlying FS and see if it reports something
12588 * sane for this vnode. If volume is authenticated via
12589 * chunklist, leave that for the caller to determine.
12590 */
12591 rootauth = VNOP_IOCTL(vp, cmd, data, (int)options, ctx);
12592
12593 return rootauth;
12594 }
12595
12596 #define SET_PACKAGE_EXTENSION_ENTITLEMENT \
12597 "com.apple.private.kernel.set-package-extensions"
12598
12599 /*
12600 * Make a filesystem-specific control call:
12601 */
12602 /* ARGSUSED */
12603 static int
fsctl_internal(proc_t p,vnode_t * arg_vp,u_long cmd,user_addr_t udata,u_long options,vfs_context_t ctx)12604 fsctl_internal(proc_t p, vnode_t *arg_vp, u_long cmd, user_addr_t udata, u_long options, vfs_context_t ctx)
12605 {
12606 int error = 0;
12607 boolean_t is64bit;
12608 u_int size;
12609 #define STK_PARAMS 128
12610 char stkbuf[STK_PARAMS] = {0};
12611 caddr_t data, memp;
12612 vnode_t vp = *arg_vp;
12613
12614 if (vp->v_type == VCHR || vp->v_type == VBLK) {
12615 return ENOTTY;
12616 }
12617
12618 cmd = fsctl_bogus_command_compat(cmd);
12619
12620 size = IOCPARM_LEN(cmd);
12621 if (size > IOCPARM_MAX) {
12622 return EINVAL;
12623 }
12624
12625 is64bit = proc_is64bit(p);
12626
12627 memp = NULL;
12628
12629 if (size > sizeof(stkbuf)) {
12630 if ((memp = (caddr_t)kalloc_data(size, Z_WAITOK)) == 0) {
12631 return ENOMEM;
12632 }
12633 data = memp;
12634 } else {
12635 data = &stkbuf[0];
12636 };
12637
12638 if (cmd & IOC_IN) {
12639 if (size) {
12640 error = copyin(udata, data, size);
12641 if (error) {
12642 if (memp) {
12643 kfree_data(memp, size);
12644 }
12645 return error;
12646 }
12647 } else {
12648 if (is64bit) {
12649 *(user_addr_t *)data = udata;
12650 } else {
12651 *(uint32_t *)data = (uint32_t)udata;
12652 }
12653 };
12654 } else if ((cmd & IOC_OUT) && size) {
12655 /*
12656 * Zero the buffer so the user always
12657 * gets back something deterministic.
12658 */
12659 bzero(data, size);
12660 } else if (cmd & IOC_VOID) {
12661 if (is64bit) {
12662 *(user_addr_t *)data = udata;
12663 } else {
12664 *(uint32_t *)data = (uint32_t)udata;
12665 }
12666 }
12667
12668 /* Check to see if it's a generic command */
12669 switch (cmd) {
12670 case FSIOC_SYNC_VOLUME:
12671 error = handle_sync_volume(vp, arg_vp, data, ctx);
12672 break;
12673
12674 case FSIOC_ROUTEFS_SETROUTEID:
12675 #if ROUTEFS
12676 error = handle_routes(udata);
12677 #endif
12678 break;
12679
12680 case FSIOC_SET_PACKAGE_EXTS: {
12681 user_addr_t ext_strings;
12682 uint32_t num_entries;
12683 uint32_t max_width;
12684
12685 if (!IOTaskHasEntitlement(vfs_context_task(ctx),
12686 SET_PACKAGE_EXTENSION_ENTITLEMENT)) {
12687 error = EPERM;
12688 break;
12689 }
12690
12691 if ((is64bit && size != sizeof(user64_package_ext_info))
12692 || (is64bit == 0 && size != sizeof(user32_package_ext_info))) {
12693 // either you're 64-bit and passed a 64-bit struct or
12694 // you're 32-bit and passed a 32-bit struct. otherwise
12695 // it's not ok.
12696 error = EINVAL;
12697 break;
12698 }
12699
12700 if (is64bit) {
12701 if (sizeof(user64_addr_t) > sizeof(user_addr_t)) {
12702 assert(((user64_package_ext_info *)data)->strings <= UINT32_MAX);
12703 }
12704 ext_strings = (user_addr_t)((user64_package_ext_info *)data)->strings;
12705 num_entries = ((user64_package_ext_info *)data)->num_entries;
12706 max_width = ((user64_package_ext_info *)data)->max_width;
12707 } else {
12708 ext_strings = CAST_USER_ADDR_T(((user32_package_ext_info *)data)->strings);
12709 num_entries = ((user32_package_ext_info *)data)->num_entries;
12710 max_width = ((user32_package_ext_info *)data)->max_width;
12711 }
12712 error = set_package_extensions_table(ext_strings, num_entries, max_width);
12713 }
12714 break;
12715
12716 case FSIOC_SET_FSTYPENAME_OVERRIDE:
12717 {
12718 mount_t mp;
12719
12720 if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
12721 break;
12722 }
12723 if ((mp = vp->v_mount) != NULL) {
12724 mount_lock(mp);
12725 if (data[0] != 0) {
12726 for (int i = 0; i < MFSTYPENAMELEN; i++) {
12727 if (!data[i]) {
12728 goto continue_copy;
12729 }
12730 }
12731 /*
12732 * Getting here means we have a user data
12733 * string which has no NULL termination in
12734 * its first MFSTYPENAMELEN bytes. This is
12735 * bogus, let's avoid strlcpy-ing the read
12736 * data and return an error.
12737 */
12738 error = EINVAL;
12739 goto unlock;
12740 continue_copy:
12741 vfs_setfstypename_locked(mp, data);
12742 if (vfs_isrdonly(mp) &&
12743 strcmp(data, "mtmfs") == 0) {
12744 mp->mnt_kern_flag |=
12745 MNTK_EXTENDED_SECURITY;
12746 mp->mnt_kern_flag &=
12747 ~MNTK_AUTH_OPAQUE;
12748 }
12749 } else if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
12750 const char *name =
12751 vfs_getfstypenameref_locked(mp, NULL);
12752 if (strcmp(name, "mtmfs") == 0) {
12753 mp->mnt_kern_flag &=
12754 ~MNTK_EXTENDED_SECURITY;
12755 }
12756 vfs_setfstypename_locked(mp, NULL);
12757 }
12758 unlock:
12759 mount_unlock(mp);
12760 }
12761 }
12762 break;
12763
12764 case DISK_CONDITIONER_IOC_GET: {
12765 error = disk_conditioner_get_info(vp->v_mount, (disk_conditioner_info *)data);
12766 }
12767 break;
12768
12769 case DISK_CONDITIONER_IOC_SET: {
12770 error = disk_conditioner_set_info(vp->v_mount, (disk_conditioner_info *)data);
12771 }
12772 break;
12773
12774 case FSIOC_CAS_BSDFLAGS:
12775 error = handle_flags(vp, data, ctx);
12776 break;
12777
12778 case FSIOC_FD_ONLY_OPEN_ONCE: {
12779 error = 0;
12780 if (vnode_usecount(vp) > 1) {
12781 vnode_lock_spin(vp);
12782 if (vp->v_lflag & VL_HASSTREAMS) {
12783 if (vnode_isinuse_locked(vp, 1, 1)) {
12784 error = EBUSY;
12785 }
12786 } else if (vnode_usecount(vp) > 1) {
12787 error = EBUSY;
12788 }
12789 vnode_unlock(vp);
12790 }
12791 }
12792 break;
12793
12794 case FSIOC_EVAL_ROOTAUTH:
12795 error = handle_auth(vp, cmd, data, options, ctx);
12796 break;
12797
12798 case FSIOC_TEST_FSE_ACCESS_GRANTED:
12799 error = test_fse_access_granted(vp, (unsigned long)udata, ctx);
12800 break;
12801
12802 #if CONFIG_EXCLAVES
12803 case FSIOC_EXCLAVE_FS_REGISTER:
12804 if (IOTaskHasEntitlement(vfs_context_task(ctx), EXCLAVE_FS_REGISTER_ENTITLEMENT)) {
12805 error = vfs_exclave_fs_register(((fsioc_exclave_fs_register_t *)data)->fs_tag, vp);
12806 } else {
12807 error = EPERM;
12808 }
12809 break;
12810
12811 case FSIOC_EXCLAVE_FS_UNREGISTER:
12812 if (IOTaskHasEntitlement(vfs_context_task(ctx), EXCLAVE_FS_REGISTER_ENTITLEMENT)) {
12813 error = vfs_exclave_fs_unregister(vp);
12814 } else {
12815 error = EPERM;
12816 }
12817 break;
12818
12819 case FSIOC_EXCLAVE_FS_GET_BASE_DIRS: {
12820 exclave_fs_get_base_dirs_t *get_base_dirs = ((exclave_fs_get_base_dirs_t *)data);
12821 exclave_fs_base_dir_t *dirs = NULL;
12822 if (!IOTaskHasEntitlement(vfs_context_task(ctx), EXCLAVE_FS_REGISTER_ENTITLEMENT)) {
12823 error = EPERM;
12824 break;
12825 }
12826 if (get_base_dirs->base_dirs) {
12827 if ((get_base_dirs->count == 0) || (get_base_dirs->count > EXCLAVE_FS_GET_BASE_DIRS_MAX_COUNT)) {
12828 error = EINVAL;
12829 break;
12830 }
12831 dirs = kalloc_type(exclave_fs_base_dir_t, get_base_dirs->count, Z_WAITOK | Z_ZERO);
12832 if (!dirs) {
12833 error = ENOSPC;
12834 break;
12835 }
12836 }
12837 error = vfs_exclave_fs_get_base_dirs(dirs, &get_base_dirs->count);
12838 if (!error && dirs) {
12839 error = copyout(dirs, (user_addr_t)get_base_dirs->base_dirs,
12840 get_base_dirs->count * sizeof(exclave_fs_base_dir_t));
12841 }
12842 if (dirs) {
12843 kfree_type(exclave_fs_base_dir_t, get_base_dirs->count, dirs);
12844 }
12845 }
12846 break;
12847 #endif
12848
12849 default: {
12850 /*
12851 * Other, known commands shouldn't be passed down here.
12852 * (When adding a selector to this list, it may be prudent
12853 * to consider adding it to the list in sys_fcntl_nocancel() as well.)
12854 */
12855 switch (cmd) {
12856 case F_PUNCHHOLE:
12857 case F_TRIM_ACTIVE_FILE:
12858 case F_RDADVISE:
12859 case F_TRANSCODEKEY:
12860 case F_GETPROTECTIONLEVEL:
12861 case F_GETDEFAULTPROTLEVEL:
12862 case F_MAKECOMPRESSED:
12863 case F_SET_GREEDY_MODE:
12864 case F_SETSTATICCONTENT:
12865 case F_SETIOTYPE:
12866 case F_SETBACKINGSTORE:
12867 case F_GETPATH_MTMINFO:
12868 case APFSIOC_REVERT_TO_SNAPSHOT:
12869 case FSIOC_FIOSEEKHOLE:
12870 case FSIOC_FIOSEEKDATA:
12871 case HFS_GET_BOOT_INFO:
12872 case HFS_SET_BOOT_INFO:
12873 case FIOPINSWAP:
12874 case F_CHKCLEAN:
12875 case F_FULLFSYNC:
12876 case F_BARRIERFSYNC:
12877 case F_FREEZE_FS:
12878 case F_THAW_FS:
12879 case FSIOC_KERNEL_ROOTAUTH:
12880 case FSIOC_GRAFT_FS:
12881 case FSIOC_UNGRAFT_FS:
12882 case FSIOC_AUTH_FS:
12883 error = EINVAL;
12884 goto outdrop;
12885 }
12886 /* Invoke the filesystem-specific code */
12887 error = VNOP_IOCTL(vp, cmd, data, (int)options, ctx);
12888 }
12889 } /* end switch stmt */
12890
12891 /*
12892 * if no errors, copy any data to user. Size was
12893 * already set and checked above.
12894 */
12895 if (error == 0 && (cmd & IOC_OUT) && size) {
12896 error = copyout(data, udata, size);
12897 }
12898
12899 outdrop:
12900 if (memp) {
12901 kfree_data(memp, size);
12902 }
12903
12904 return error;
12905 }
12906
12907 /* ARGSUSED */
12908 int
fsctl(proc_t p,struct fsctl_args * uap,__unused int32_t * retval)12909 fsctl(proc_t p, struct fsctl_args *uap, __unused int32_t *retval)
12910 {
12911 int error;
12912 struct nameidata nd;
12913 uint32_t nameiflags;
12914 vnode_t vp = NULL;
12915 vfs_context_t ctx = vfs_context_current();
12916
12917 AUDIT_ARG(cmd, (int)uap->cmd);
12918 AUDIT_ARG(value32, uap->options);
12919 /* Get the vnode for the file we are getting info on: */
12920 nameiflags = 0;
12921 //
12922 // if we come through fsctl() then the file is by definition not open.
12923 // therefore for the FSIOC_FD_ONLY_OPEN_ONCE selector we return an error
12924 // lest the caller mistakenly thinks the only open is their own (but in
12925 // reality it's someone elses).
12926 //
12927 if (uap->cmd == FSIOC_FD_ONLY_OPEN_ONCE) {
12928 return EINVAL;
12929 }
12930 if ((uap->options & FSOPT_NOFOLLOW) == 0) {
12931 nameiflags |= FOLLOW;
12932 }
12933 if (uap->cmd == FSIOC_FIRMLINK_CTL) {
12934 nameiflags |= (CN_FIRMLINK_NOFOLLOW | NOCACHE);
12935 }
12936 NDINIT(&nd, LOOKUP, OP_FSCTL, nameiflags | AUDITVNPATH1,
12937 UIO_USERSPACE, uap->path, ctx);
12938 if ((error = namei(&nd))) {
12939 goto done;
12940 }
12941 vp = nd.ni_vp;
12942 nameidone(&nd);
12943
12944 #if CONFIG_MACF
12945 error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd);
12946 if (error) {
12947 goto done;
12948 }
12949 #endif
12950
12951 error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);
12952
12953 done:
12954 if (vp) {
12955 vnode_put(vp);
12956 }
12957 return error;
12958 }
12959 /* ARGSUSED */
12960 int
ffsctl(proc_t p,struct ffsctl_args * uap,__unused int32_t * retval)12961 ffsctl(proc_t p, struct ffsctl_args *uap, __unused int32_t *retval)
12962 {
12963 int error;
12964 vnode_t vp = NULL;
12965 vfs_context_t ctx = vfs_context_current();
12966 int fd = -1;
12967
12968 AUDIT_ARG(fd, uap->fd);
12969 AUDIT_ARG(cmd, (int)uap->cmd);
12970 AUDIT_ARG(value32, uap->options);
12971
12972 /* Get the vnode for the file we are getting info on: */
12973 if ((error = file_vnode(uap->fd, &vp))) {
12974 return error;
12975 }
12976 fd = uap->fd;
12977 if ((error = vnode_getwithref(vp))) {
12978 file_drop(fd);
12979 return error;
12980 }
12981
12982 #if CONFIG_MACF
12983 if ((error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd))) {
12984 file_drop(fd);
12985 vnode_put(vp);
12986 return error;
12987 }
12988 #endif
12989
12990 error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);
12991
12992 file_drop(fd);
12993
12994 /*validate vp; fsctl_internal() can drop iocount and reset vp to NULL*/
12995 if (vp) {
12996 vnode_put(vp);
12997 }
12998
12999 return error;
13000 }
13001 /* end of fsctl system call */
13002
13003 #define FILESEC_ACCESS_ENTITLEMENT \
13004 "com.apple.private.vfs.filesec-access"
13005
13006 static int
xattr_entitlement_check(const char * attrname,vfs_context_t ctx,bool setting)13007 xattr_entitlement_check(const char *attrname, vfs_context_t ctx, bool setting)
13008 {
13009 if (strcmp(attrname, KAUTH_FILESEC_XATTR) == 0) {
13010 /*
13011 * get: root and tasks with FILESEC_ACCESS_ENTITLEMENT.
13012 * set: only tasks with FILESEC_ACCESS_ENTITLEMENT.
13013 */
13014 if ((!setting && vfs_context_issuser(ctx)) ||
13015 IOTaskHasEntitlement(vfs_context_task(ctx),
13016 FILESEC_ACCESS_ENTITLEMENT)) {
13017 return 0;
13018 }
13019 }
13020
13021 return EPERM;
13022 }
13023
13024 /*
13025 * Retrieve the data of an extended attribute.
13026 */
13027 int
getxattr(proc_t p,struct getxattr_args * uap,user_ssize_t * retval)13028 getxattr(proc_t p, struct getxattr_args *uap, user_ssize_t *retval)
13029 {
13030 vnode_t vp;
13031 struct nameidata nd;
13032 char attrname[XATTR_MAXNAMELEN + 1];
13033 vfs_context_t ctx = vfs_context_current();
13034 uio_t auio = NULL;
13035 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
13036 size_t attrsize = 0;
13037 size_t namelen;
13038 u_int32_t nameiflags;
13039 int error;
13040 UIO_STACKBUF(uio_buf, 1);
13041
13042 if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
13043 return EINVAL;
13044 }
13045
13046 nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
13047 NDINIT(&nd, LOOKUP, OP_GETXATTR, nameiflags, spacetype, uap->path, ctx);
13048 if ((error = namei(&nd))) {
13049 return error;
13050 }
13051 vp = nd.ni_vp;
13052 nameidone(&nd);
13053
13054 error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
13055 if (error != 0) {
13056 goto out;
13057 }
13058 if (xattr_protected(attrname) &&
13059 (error = xattr_entitlement_check(attrname, ctx, false)) != 0) {
13060 goto out;
13061 }
13062 /*
13063 * the specific check for 0xffffffff is a hack to preserve
13064 * binaray compatibilty in K64 with applications that discovered
13065 * that passing in a buf pointer and a size of -1 resulted in
13066 * just the size of the indicated extended attribute being returned.
13067 * this isn't part of the documented behavior, but because of the
13068 * original implemtation's check for "uap->size > 0", this behavior
13069 * was allowed. In K32 that check turned into a signed comparison
13070 * even though uap->size is unsigned... in K64, we blow by that
13071 * check because uap->size is unsigned and doesn't get sign smeared
13072 * in the munger for a 32 bit user app. we also need to add a
13073 * check to limit the maximum size of the buffer being passed in...
13074 * unfortunately, the underlying fileystems seem to just malloc
13075 * the requested size even if the actual extended attribute is tiny.
13076 * because that malloc is for kernel wired memory, we have to put a
13077 * sane limit on it.
13078 *
13079 * U32 running on K64 will yield 0x00000000ffffffff for uap->size
13080 * U64 running on K64 will yield -1 (64 bits wide)
13081 * U32/U64 running on K32 will yield -1 (32 bits wide)
13082 */
13083 if (uap->size == 0xffffffff || uap->size == (size_t)-1) {
13084 goto no_uio;
13085 }
13086
13087 if (uap->value) {
13088 if (uap->size > (size_t)XATTR_MAXSIZE) {
13089 uap->size = XATTR_MAXSIZE;
13090 }
13091
13092 auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
13093 &uio_buf[0], sizeof(uio_buf));
13094 uio_addiov(auio, uap->value, uap->size);
13095 }
13096 no_uio:
13097 error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, ctx);
13098 out:
13099 vnode_put(vp);
13100
13101 if (auio) {
13102 *retval = uap->size - uio_resid(auio);
13103 } else {
13104 *retval = (user_ssize_t)attrsize;
13105 }
13106
13107 return error;
13108 }
13109
13110 /*
13111 * Retrieve the data of an extended attribute.
13112 */
13113 int
fgetxattr(proc_t p,struct fgetxattr_args * uap,user_ssize_t * retval)13114 fgetxattr(proc_t p, struct fgetxattr_args *uap, user_ssize_t *retval)
13115 {
13116 vnode_t vp;
13117 char attrname[XATTR_MAXNAMELEN + 1];
13118 vfs_context_t ctx = vfs_context_current();
13119 uio_t auio = NULL;
13120 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
13121 size_t attrsize = 0;
13122 size_t namelen;
13123 int error;
13124 UIO_STACKBUF(uio_buf, 1);
13125
13126 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
13127 return EINVAL;
13128 }
13129
13130 if ((error = file_vnode(uap->fd, &vp))) {
13131 return error;
13132 }
13133 if ((error = vnode_getwithref(vp))) {
13134 file_drop(uap->fd);
13135 return error;
13136 }
13137 error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
13138 if (error != 0) {
13139 goto out;
13140 }
13141 if (xattr_protected(attrname) &&
13142 (error = xattr_entitlement_check(attrname, ctx, false)) != 0) {
13143 goto out;
13144 }
13145 if (uap->value && uap->size > 0) {
13146 if (uap->size > (size_t)XATTR_MAXSIZE) {
13147 uap->size = XATTR_MAXSIZE;
13148 }
13149
13150 auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
13151 &uio_buf[0], sizeof(uio_buf));
13152 uio_addiov(auio, uap->value, uap->size);
13153 }
13154
13155 error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, vfs_context_current());
13156 out:
13157 (void)vnode_put(vp);
13158 file_drop(uap->fd);
13159
13160 if (auio) {
13161 *retval = uap->size - uio_resid(auio);
13162 } else {
13163 *retval = (user_ssize_t)attrsize;
13164 }
13165 return error;
13166 }
13167
/*
 * State used by setxattr(); allocated from the heap (see the kalloc_type()
 * in setxattr()) rather than living on the kernel stack.
 */
struct setxattr_ctx {
	struct nameidata nd;                    /* pathname lookup state */
	char attrname[XATTR_MAXNAMELEN + 1];    /* NUL-terminated attribute name */
	UIO_STACKBUF(uio_buf, 1);               /* backing store for the value uio */
};
13174
13175 /*
13176 * Set the data of an extended attribute.
13177 */
13178 int
setxattr(proc_t p,struct setxattr_args * uap,int * retval)13179 setxattr(proc_t p, struct setxattr_args *uap, int *retval)
13180 {
13181 vnode_t vp;
13182 vfs_context_t ctx = vfs_context_current();
13183 uio_t auio = NULL;
13184 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
13185 size_t namelen;
13186 u_int32_t nameiflags;
13187 int error;
13188 struct setxattr_ctx *sactx;
13189
13190 if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
13191 return EINVAL;
13192 }
13193
13194 sactx = kalloc_type(struct setxattr_ctx, Z_WAITOK);
13195 if (sactx == NULL) {
13196 return ENOMEM;
13197 }
13198
13199 error = copyinstr(uap->attrname, sactx->attrname, sizeof(sactx->attrname), &namelen);
13200 if (error != 0) {
13201 if (error == EPERM) {
13202 /* if the string won't fit in attrname, copyinstr emits EPERM */
13203 error = ENAMETOOLONG;
13204 }
13205 /* Otherwise return the default error from copyinstr to detect ERANGE, etc */
13206 goto out;
13207 }
13208 if (xattr_protected(sactx->attrname) &&
13209 (error = xattr_entitlement_check(sactx->attrname, ctx, true)) != 0) {
13210 goto out;
13211 }
13212 if (uap->size != 0 && uap->value == 0) {
13213 error = EINVAL;
13214 goto out;
13215 }
13216 if (uap->size > INT_MAX) {
13217 error = E2BIG;
13218 goto out;
13219 }
13220
13221 nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
13222 #if CONFIG_FILE_LEASES
13223 nameiflags |= WANTPARENT;
13224 #endif
13225 NDINIT(&sactx->nd, LOOKUP, OP_SETXATTR, nameiflags, spacetype, uap->path, ctx);
13226 if ((error = namei(&sactx->nd))) {
13227 goto out;
13228 }
13229 vp = sactx->nd.ni_vp;
13230 #if CONFIG_FILE_LEASES
13231 vnode_breakdirlease(sactx->nd.ni_dvp, false, O_WRONLY);
13232 vnode_put(sactx->nd.ni_dvp);
13233 #endif
13234 nameidone(&sactx->nd);
13235
13236 auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
13237 &sactx->uio_buf[0], sizeof(sactx->uio_buf));
13238 uio_addiov(auio, uap->value, uap->size);
13239
13240 error = vn_setxattr(vp, sactx->attrname, auio, uap->options, ctx);
13241 #if CONFIG_FSE
13242 if (error == 0) {
13243 add_fsevent(FSE_XATTR_MODIFIED, ctx,
13244 FSE_ARG_VNODE, vp,
13245 FSE_ARG_DONE);
13246 }
13247 #endif
13248 vnode_put(vp);
13249 out:
13250 kfree_type(struct setxattr_ctx, sactx);
13251 *retval = 0;
13252 return error;
13253 }
13254
13255 /*
13256 * Set the data of an extended attribute.
13257 */
13258 int
fsetxattr(proc_t p,struct fsetxattr_args * uap,int * retval)13259 fsetxattr(proc_t p, struct fsetxattr_args *uap, int *retval)
13260 {
13261 vnode_t vp;
13262 char attrname[XATTR_MAXNAMELEN + 1];
13263 vfs_context_t ctx = vfs_context_current();
13264 uio_t auio = NULL;
13265 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
13266 size_t namelen;
13267 int error;
13268 UIO_STACKBUF(uio_buf, 1);
13269
13270 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
13271 return EINVAL;
13272 }
13273
13274 error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
13275 if (error != 0) {
13276 if (error == EPERM) {
13277 /* if the string won't fit in attrname, copyinstr emits EPERM */
13278 return ENAMETOOLONG;
13279 }
13280 /* Otherwise return the default error from copyinstr to detect ERANGE, etc */
13281 return error;
13282 }
13283 if (xattr_protected(attrname) &&
13284 (error = xattr_entitlement_check(attrname, ctx, true)) != 0) {
13285 return error;
13286 }
13287 if (uap->size != 0 && uap->value == 0) {
13288 return EINVAL;
13289 }
13290 if (uap->size > INT_MAX) {
13291 return E2BIG;
13292 }
13293 if ((error = file_vnode(uap->fd, &vp))) {
13294 return error;
13295 }
13296 if ((error = vnode_getwithref(vp))) {
13297 file_drop(uap->fd);
13298 return error;
13299 }
13300
13301 #if CONFIG_FILE_LEASES
13302 vnode_breakdirlease(vp, true, O_WRONLY);
13303 #endif
13304
13305 auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
13306 &uio_buf[0], sizeof(uio_buf));
13307 uio_addiov(auio, uap->value, uap->size);
13308
13309 error = vn_setxattr(vp, attrname, auio, uap->options, vfs_context_current());
13310 #if CONFIG_FSE
13311 if (error == 0) {
13312 add_fsevent(FSE_XATTR_MODIFIED, ctx,
13313 FSE_ARG_VNODE, vp,
13314 FSE_ARG_DONE);
13315 }
13316 #endif
13317 vnode_put(vp);
13318 file_drop(uap->fd);
13319 *retval = 0;
13320 return error;
13321 }
13322
13323 /*
13324 * Remove an extended attribute.
13325 * XXX Code duplication here.
13326 */
13327 int
removexattr(proc_t p,struct removexattr_args * uap,int * retval)13328 removexattr(proc_t p, struct removexattr_args *uap, int *retval)
13329 {
13330 vnode_t vp;
13331 struct nameidata nd;
13332 char attrname[XATTR_MAXNAMELEN + 1];
13333 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
13334 vfs_context_t ctx = vfs_context_current();
13335 size_t namelen;
13336 u_int32_t nameiflags;
13337 int error;
13338
13339 if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
13340 return EINVAL;
13341 }
13342
13343 error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
13344 if (error != 0) {
13345 return error;
13346 }
13347 if (xattr_protected(attrname)) {
13348 return EPERM;
13349 }
13350 nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
13351 #if CONFIG_FILE_LEASES
13352 nameiflags |= WANTPARENT;
13353 #endif
13354 NDINIT(&nd, LOOKUP, OP_REMOVEXATTR, nameiflags, spacetype, uap->path, ctx);
13355 if ((error = namei(&nd))) {
13356 return error;
13357 }
13358 vp = nd.ni_vp;
13359 #if CONFIG_FILE_LEASES
13360 vnode_breakdirlease(nd.ni_dvp, false, O_WRONLY);
13361 vnode_put(nd.ni_dvp);
13362 #endif
13363 nameidone(&nd);
13364
13365 error = vn_removexattr(vp, attrname, uap->options, ctx);
13366 #if CONFIG_FSE
13367 if (error == 0) {
13368 add_fsevent(FSE_XATTR_REMOVED, ctx,
13369 FSE_ARG_VNODE, vp,
13370 FSE_ARG_DONE);
13371 }
13372 #endif
13373 vnode_put(vp);
13374 *retval = 0;
13375 return error;
13376 }
13377
13378 /*
13379 * Remove an extended attribute.
13380 * XXX Code duplication here.
13381 */
13382 int
fremovexattr(__unused proc_t p,struct fremovexattr_args * uap,int * retval)13383 fremovexattr(__unused proc_t p, struct fremovexattr_args *uap, int *retval)
13384 {
13385 vnode_t vp;
13386 char attrname[XATTR_MAXNAMELEN + 1];
13387 size_t namelen;
13388 int error;
13389 #if CONFIG_FSE
13390 vfs_context_t ctx = vfs_context_current();
13391 #endif
13392
13393 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
13394 return EINVAL;
13395 }
13396
13397 error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
13398 if (error != 0) {
13399 return error;
13400 }
13401 if (xattr_protected(attrname)) {
13402 return EPERM;
13403 }
13404 if ((error = file_vnode(uap->fd, &vp))) {
13405 return error;
13406 }
13407 if ((error = vnode_getwithref(vp))) {
13408 file_drop(uap->fd);
13409 return error;
13410 }
13411
13412 #if CONFIG_FILE_LEASES
13413 vnode_breakdirlease(vp, true, O_WRONLY);
13414 #endif
13415
13416 error = vn_removexattr(vp, attrname, uap->options, vfs_context_current());
13417 #if CONFIG_FSE
13418 if (error == 0) {
13419 add_fsevent(FSE_XATTR_REMOVED, ctx,
13420 FSE_ARG_VNODE, vp,
13421 FSE_ARG_DONE);
13422 }
13423 #endif
13424 vnode_put(vp);
13425 file_drop(uap->fd);
13426 *retval = 0;
13427 return error;
13428 }
13429
13430 /*
13431 * Retrieve the list of extended attribute names.
13432 * XXX Code duplication here.
13433 */
13434 int
listxattr(proc_t p,struct listxattr_args * uap,user_ssize_t * retval)13435 listxattr(proc_t p, struct listxattr_args *uap, user_ssize_t *retval)
13436 {
13437 vnode_t vp;
13438 struct nameidata nd;
13439 vfs_context_t ctx = vfs_context_current();
13440 uio_t auio = NULL;
13441 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
13442 size_t attrsize = 0;
13443 u_int32_t nameiflags;
13444 int error;
13445 UIO_STACKBUF(uio_buf, 1);
13446
13447 if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
13448 return EINVAL;
13449 }
13450
13451 nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
13452 NDINIT(&nd, LOOKUP, OP_LISTXATTR, nameiflags, spacetype, uap->path, ctx);
13453 if ((error = namei(&nd))) {
13454 return error;
13455 }
13456 vp = nd.ni_vp;
13457 nameidone(&nd);
13458 if (uap->namebuf != 0 && uap->bufsize > 0) {
13459 auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ,
13460 &uio_buf[0], sizeof(uio_buf));
13461 uio_addiov(auio, uap->namebuf, uap->bufsize);
13462 }
13463
13464 error = vn_listxattr(vp, auio, &attrsize, uap->options, ctx);
13465
13466 vnode_put(vp);
13467 if (auio) {
13468 *retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
13469 } else {
13470 *retval = (user_ssize_t)attrsize;
13471 }
13472 return error;
13473 }
13474
13475 /*
13476 * Retrieve the list of extended attribute names.
13477 * XXX Code duplication here.
13478 */
13479 int
flistxattr(proc_t p,struct flistxattr_args * uap,user_ssize_t * retval)13480 flistxattr(proc_t p, struct flistxattr_args *uap, user_ssize_t *retval)
13481 {
13482 vnode_t vp;
13483 uio_t auio = NULL;
13484 int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
13485 size_t attrsize = 0;
13486 int error;
13487 UIO_STACKBUF(uio_buf, 1);
13488
13489 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
13490 return EINVAL;
13491 }
13492
13493 if ((error = file_vnode(uap->fd, &vp))) {
13494 return error;
13495 }
13496 if ((error = vnode_getwithref(vp))) {
13497 file_drop(uap->fd);
13498 return error;
13499 }
13500 if (uap->namebuf != 0 && uap->bufsize > 0) {
13501 auio = uio_createwithbuffer(1, 0, spacetype,
13502 UIO_READ, &uio_buf[0], sizeof(uio_buf));
13503 uio_addiov(auio, uap->namebuf, uap->bufsize);
13504 }
13505
13506 error = vn_listxattr(vp, auio, &attrsize, uap->options, vfs_context_current());
13507
13508 vnode_put(vp);
13509 file_drop(uap->fd);
13510 if (auio) {
13511 *retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
13512 } else {
13513 *retval = (user_ssize_t)attrsize;
13514 }
13515 return error;
13516 }
13517
13518 int
fsgetpath_internal(vfs_context_t ctx,int volfs_id,uint64_t objid,vm_size_t bufsize,caddr_t buf,uint32_t options,int * pathlen)13519 fsgetpath_internal(vfs_context_t ctx, int volfs_id, uint64_t objid,
13520 vm_size_t bufsize, caddr_t buf, uint32_t options, int *pathlen)
13521 {
13522 int error;
13523 struct mount *mp = NULL;
13524 vnode_t vp;
13525 int length;
13526 int bpflags;
13527 /* maximum number of times to retry build_path */
13528 unsigned int retries = 0x10;
13529
13530 if (bufsize > FSGETPATH_MAXBUFLEN) {
13531 return EINVAL;
13532 }
13533
13534 if (buf == NULL) {
13535 return ENOMEM;
13536 }
13537
13538 retry:
13539 if ((mp = mount_lookupby_volfsid(volfs_id, 1)) == NULL) {
13540 error = ENOTSUP; /* unexpected failure */
13541 return ENOTSUP;
13542 }
13543
13544 #if CONFIG_UNION_MOUNTS
13545 unionget:
13546 #endif /* CONFIG_UNION_MOUNTS */
13547 if (objid == 2) {
13548 struct vfs_attr vfsattr;
13549 int use_vfs_root = TRUE;
13550
13551 VFSATTR_INIT(&vfsattr);
13552 VFSATTR_WANTED(&vfsattr, f_capabilities);
13553 if (!(options & FSOPT_ISREALFSID) &&
13554 vfs_getattr(mp, &vfsattr, vfs_context_kernel()) == 0 &&
13555 VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
13556 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS) &&
13557 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS)) {
13558 use_vfs_root = FALSE;
13559 }
13560 }
13561
13562 if (use_vfs_root) {
13563 error = VFS_ROOT(mp, &vp, ctx);
13564 } else {
13565 error = VFS_VGET(mp, objid, &vp, ctx);
13566 }
13567 } else {
13568 error = VFS_VGET(mp, (ino64_t)objid, &vp, ctx);
13569 }
13570
13571 #if CONFIG_UNION_MOUNTS
13572 if (error == ENOENT && (mp->mnt_flag & MNT_UNION)) {
13573 /*
13574 * If the fileid isn't found and we're in a union
13575 * mount volume, then see if the fileid is in the
13576 * mounted-on volume.
13577 */
13578 struct mount *tmp = mp;
13579 mp = vnode_mount(tmp->mnt_vnodecovered);
13580 vfs_unbusy(tmp);
13581 if (vfs_busy(mp, LK_NOWAIT) == 0) {
13582 goto unionget;
13583 }
13584 } else {
13585 vfs_unbusy(mp);
13586 }
13587 #else
13588 vfs_unbusy(mp);
13589 #endif /* CONFIG_UNION_MOUNTS */
13590
13591 if (error) {
13592 return error;
13593 }
13594
13595 #if CONFIG_MACF
13596 error = mac_vnode_check_fsgetpath(ctx, vp);
13597 if (error) {
13598 vnode_put(vp);
13599 return error;
13600 }
13601 #endif
13602
13603 /* Obtain the absolute path to this vnode. */
13604 bpflags = vfs_context_suser(ctx) ? BUILDPATH_CHECKACCESS : 0;
13605 if (options & FSOPT_NOFIRMLINKPATH) {
13606 bpflags |= BUILDPATH_NO_FIRMLINK;
13607 }
13608 bpflags |= BUILDPATH_CHECK_MOVED;
13609 error = build_path(vp, buf, (int)bufsize, &length, bpflags, ctx);
13610 vnode_put(vp);
13611
13612 if (error) {
13613 /* there was a race building the path, try a few more times */
13614 if (error == EAGAIN) {
13615 --retries;
13616 if (retries > 0) {
13617 goto retry;
13618 }
13619
13620 error = ENOENT;
13621 }
13622 goto out;
13623 }
13624
13625 AUDIT_ARG(text, buf);
13626
13627 if (kdebug_debugid_enabled(VFS_LOOKUP) && length > 0) {
13628 unsigned long path_words[NUMPARMS];
13629 size_t path_len = sizeof(path_words);
13630
13631 if ((size_t)length < path_len) {
13632 memcpy((char *)path_words, buf, length);
13633 memset((char *)path_words + length, 0, path_len - length);
13634
13635 path_len = length;
13636 } else {
13637 memcpy((char *)path_words, buf + (length - path_len), path_len);
13638 }
13639
13640 kdebug_vfs_lookup(path_words, (int)path_len, vp,
13641 KDBG_VFS_LOOKUP_FLAG_LOOKUP);
13642 }
13643
13644 *pathlen = length; /* may be superseded by error */
13645
13646 out:
13647 return error;
13648 }
13649
13650 /*
13651 * Obtain the full pathname of a file system object by id.
13652 */
13653 static int
fsgetpath_extended(user_addr_t buf,user_size_t bufsize,user_addr_t user_fsid,uint64_t objid,uint32_t options,user_ssize_t * retval)13654 fsgetpath_extended(user_addr_t buf, user_size_t bufsize, user_addr_t user_fsid, uint64_t objid,
13655 uint32_t options, user_ssize_t *retval)
13656 {
13657 vfs_context_t ctx = vfs_context_current();
13658 fsid_t fsid;
13659 char *realpath;
13660 int length;
13661 int error;
13662
13663 if (options & ~(FSOPT_NOFIRMLINKPATH | FSOPT_ISREALFSID)) {
13664 return EINVAL;
13665 }
13666
13667 if ((error = copyin(user_fsid, (caddr_t)&fsid, sizeof(fsid)))) {
13668 return error;
13669 }
13670 AUDIT_ARG(value32, fsid.val[0]);
13671 AUDIT_ARG(value64, objid);
13672 /* Restrict output buffer size for now. */
13673
13674 if (bufsize > FSGETPATH_MAXBUFLEN || bufsize <= 0) {
13675 return EINVAL;
13676 }
13677 realpath = kalloc_data(bufsize, Z_WAITOK | Z_ZERO);
13678 if (realpath == NULL) {
13679 return ENOMEM;
13680 }
13681
13682 error = fsgetpath_internal(ctx, fsid.val[0], objid, bufsize, realpath,
13683 options, &length);
13684
13685 if (error) {
13686 goto out;
13687 }
13688
13689 error = copyout((caddr_t)realpath, buf, length);
13690
13691 *retval = (user_ssize_t)length; /* may be superseded by error */
13692 out:
13693 kfree_data(realpath, bufsize);
13694 return error;
13695 }
13696
/* fsgetpath(2): legacy entry point with no options. */
int
fsgetpath(__unused proc_t p, struct fsgetpath_args *uap, user_ssize_t *retval)
{
	return fsgetpath_extended(uap->buf, uap->bufsize, uap->fsid, uap->objid,
	           0, retval);
}
13703
/* fsgetpath_ext(2): extended entry point that accepts FSOPT_* options. */
int
fsgetpath_ext(__unused proc_t p, struct fsgetpath_ext_args *uap, user_ssize_t *retval)
{
	return fsgetpath_extended(uap->buf, uap->bufsize, uap->fsid, uap->objid,
	           uap->options, retval);
}
13710
13711 /*
13712 * Common routine to handle various flavors of statfs data heading out
13713 * to user space.
13714 *
13715 * Returns: 0 Success
13716 * EFAULT
13717 */
13718 static int
munge_statfs(struct mount * mp,struct vfsstatfs * sfsp,user_addr_t bufp,int * sizep,boolean_t is_64_bit,boolean_t partial_copy)13719 munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
13720 user_addr_t bufp, int *sizep, boolean_t is_64_bit,
13721 boolean_t partial_copy)
13722 {
13723 int error;
13724 int my_size, copy_size;
13725
13726 if (is_64_bit) {
13727 struct user64_statfs sfs;
13728 my_size = copy_size = sizeof(sfs);
13729 bzero(&sfs, my_size);
13730 sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
13731 sfs.f_type = (short)mp->mnt_vtable->vfc_typenum;
13732 sfs.f_reserved1 = (short)sfsp->f_fssubtype;
13733 sfs.f_bsize = (user64_long_t)sfsp->f_bsize;
13734 sfs.f_iosize = (user64_long_t)sfsp->f_iosize;
13735 sfs.f_blocks = (user64_long_t)sfsp->f_blocks;
13736 sfs.f_bfree = (user64_long_t)sfsp->f_bfree;
13737 sfs.f_bavail = (user64_long_t)sfsp->f_bavail;
13738 sfs.f_files = (user64_long_t)sfsp->f_files;
13739 sfs.f_ffree = (user64_long_t)sfsp->f_ffree;
13740 sfs.f_fsid = sfsp->f_fsid;
13741 sfs.f_owner = sfsp->f_owner;
13742 vfs_getfstypename(mp, sfs.f_fstypename, MFSNAMELEN);
13743 strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
13744 strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);
13745
13746 if (partial_copy) {
13747 copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
13748 }
13749 error = copyout((caddr_t)&sfs, bufp, copy_size);
13750 } else {
13751 struct user32_statfs sfs;
13752
13753 my_size = copy_size = sizeof(sfs);
13754 bzero(&sfs, my_size);
13755
13756 sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
13757 sfs.f_type = (short)mp->mnt_vtable->vfc_typenum;
13758 sfs.f_reserved1 = (short)sfsp->f_fssubtype;
13759
13760 /*
13761 * It's possible for there to be more than 2^^31 blocks in the filesystem, so we
13762 * have to fudge the numbers here in that case. We inflate the blocksize in order
13763 * to reflect the filesystem size as best we can.
13764 */
13765 if ((sfsp->f_blocks > INT_MAX)
13766 /* Hack for 4061702 . I think the real fix is for Carbon to
13767 * look for some volume capability and not depend on hidden
13768 * semantics agreed between a FS and carbon.
13769 * f_blocks, f_bfree, and f_bavail set to -1 is the trigger
13770 * for Carbon to set bNoVolumeSizes volume attribute.
13771 * Without this the webdavfs files cannot be copied onto
13772 * disk as they look huge. This change should not affect
13773 * XSAN as they should not setting these to -1..
13774 */
13775 && (sfsp->f_blocks != 0xffffffffffffffffULL)
13776 && (sfsp->f_bfree != 0xffffffffffffffffULL)
13777 && (sfsp->f_bavail != 0xffffffffffffffffULL)) {
13778 int shift;
13779
13780 /*
13781 * Work out how far we have to shift the block count down to make it fit.
13782 * Note that it's possible to have to shift so far that the resulting
13783 * blocksize would be unreportably large. At that point, we will clip
13784 * any values that don't fit.
13785 *
13786 * For safety's sake, we also ensure that f_iosize is never reported as
13787 * being smaller than f_bsize.
13788 */
13789 for (shift = 0; shift < 32; shift++) {
13790 if ((sfsp->f_blocks >> shift) <= INT_MAX) {
13791 break;
13792 }
13793 if ((sfsp->f_bsize << (shift + 1)) > INT_MAX) {
13794 break;
13795 }
13796 }
13797 #define __SHIFT_OR_CLIP(x, s) ((((x) >> (s)) > INT_MAX) ? INT_MAX : ((x) >> (s)))
13798 sfs.f_blocks = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_blocks, shift);
13799 sfs.f_bfree = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bfree, shift);
13800 sfs.f_bavail = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bavail, shift);
13801 #undef __SHIFT_OR_CLIP
13802 sfs.f_bsize = (user32_long_t)(sfsp->f_bsize << shift);
13803 sfs.f_iosize = (int)lmax(sfsp->f_iosize, sfsp->f_bsize);
13804 } else {
13805 /* filesystem is small enough to be reported honestly */
13806 sfs.f_bsize = (user32_long_t)sfsp->f_bsize;
13807 sfs.f_iosize = (user32_long_t)sfsp->f_iosize;
13808 sfs.f_blocks = (user32_long_t)sfsp->f_blocks;
13809 sfs.f_bfree = (user32_long_t)sfsp->f_bfree;
13810 sfs.f_bavail = (user32_long_t)sfsp->f_bavail;
13811 }
13812 sfs.f_files = (user32_long_t)sfsp->f_files;
13813 sfs.f_ffree = (user32_long_t)sfsp->f_ffree;
13814 sfs.f_fsid = sfsp->f_fsid;
13815 sfs.f_owner = sfsp->f_owner;
13816 vfs_getfstypename(mp, sfs.f_fstypename, MFSNAMELEN);
13817 strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
13818 strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);
13819
13820 if (partial_copy) {
13821 copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
13822 }
13823 error = copyout((caddr_t)&sfs, bufp, copy_size);
13824 }
13825
13826 if (sizep != NULL) {
13827 *sizep = my_size;
13828 }
13829 return error;
13830 }
13831
13832 /*
13833 * copy stat structure into user_stat structure.
13834 */
13835 void
munge_user64_stat(struct stat * sbp,struct user64_stat * usbp)13836 munge_user64_stat(struct stat *sbp, struct user64_stat *usbp)
13837 {
13838 bzero(usbp, sizeof(*usbp));
13839
13840 usbp->st_dev = sbp->st_dev;
13841 usbp->st_ino = sbp->st_ino;
13842 usbp->st_mode = sbp->st_mode;
13843 usbp->st_nlink = sbp->st_nlink;
13844 usbp->st_uid = sbp->st_uid;
13845 usbp->st_gid = sbp->st_gid;
13846 usbp->st_rdev = sbp->st_rdev;
13847 #ifndef _POSIX_C_SOURCE
13848 usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
13849 usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
13850 usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
13851 usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
13852 usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
13853 usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
13854 #else
13855 usbp->st_atime = sbp->st_atime;
13856 usbp->st_atimensec = sbp->st_atimensec;
13857 usbp->st_mtime = sbp->st_mtime;
13858 usbp->st_mtimensec = sbp->st_mtimensec;
13859 usbp->st_ctime = sbp->st_ctime;
13860 usbp->st_ctimensec = sbp->st_ctimensec;
13861 #endif
13862 usbp->st_size = sbp->st_size;
13863 usbp->st_blocks = sbp->st_blocks;
13864 usbp->st_blksize = sbp->st_blksize;
13865 usbp->st_flags = sbp->st_flags;
13866 usbp->st_gen = sbp->st_gen;
13867 usbp->st_lspare = sbp->st_lspare;
13868 usbp->st_qspare[0] = sbp->st_qspare[0];
13869 usbp->st_qspare[1] = sbp->st_qspare[1];
13870 }
13871
/*
 * copy stat structure into a 32-bit user_stat structure; time fields are
 * narrowed to the 32-bit user types.  Destination is bzero'd first so
 * padding never leaks kernel memory to user space.
 */
void
munge_user32_stat(struct stat *sbp, struct user32_stat *usbp)
{
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
#ifndef _POSIX_C_SOURCE
	usbp->st_atimespec.tv_sec = (user32_time_t)sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = (user32_long_t)sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = (user32_time_t)sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = (user32_long_t)sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = (user32_time_t)sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = (user32_long_t)sbp->st_ctimespec.tv_nsec;
#else
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
13908
13909 /*
13910 * copy stat64 structure into user_stat64 structure.
13911 */
13912 void
munge_user64_stat64(struct stat64 * sbp,struct user64_stat64 * usbp)13913 munge_user64_stat64(struct stat64 *sbp, struct user64_stat64 *usbp)
13914 {
13915 bzero(usbp, sizeof(*usbp));
13916
13917 usbp->st_dev = sbp->st_dev;
13918 usbp->st_ino = sbp->st_ino;
13919 usbp->st_mode = sbp->st_mode;
13920 usbp->st_nlink = sbp->st_nlink;
13921 usbp->st_uid = sbp->st_uid;
13922 usbp->st_gid = sbp->st_gid;
13923 usbp->st_rdev = sbp->st_rdev;
13924 #ifndef _POSIX_C_SOURCE
13925 usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
13926 usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
13927 usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
13928 usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
13929 usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
13930 usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
13931 usbp->st_birthtimespec.tv_sec = sbp->st_birthtimespec.tv_sec;
13932 usbp->st_birthtimespec.tv_nsec = sbp->st_birthtimespec.tv_nsec;
13933 #else
13934 usbp->st_atime = sbp->st_atime;
13935 usbp->st_atimensec = sbp->st_atimensec;
13936 usbp->st_mtime = sbp->st_mtime;
13937 usbp->st_mtimensec = sbp->st_mtimensec;
13938 usbp->st_ctime = sbp->st_ctime;
13939 usbp->st_ctimensec = sbp->st_ctimensec;
13940 usbp->st_birthtime = sbp->st_birthtime;
13941 usbp->st_birthtimensec = sbp->st_birthtimensec;
13942 #endif
13943 usbp->st_size = sbp->st_size;
13944 usbp->st_blocks = sbp->st_blocks;
13945 usbp->st_blksize = sbp->st_blksize;
13946 usbp->st_flags = sbp->st_flags;
13947 usbp->st_gen = sbp->st_gen;
13948 usbp->st_lspare = sbp->st_lspare;
13949 usbp->st_qspare[0] = sbp->st_qspare[0];
13950 usbp->st_qspare[1] = sbp->st_qspare[1];
13951 }
13952
/*
 * copy stat64 structure into a 32-bit user_stat64 structure; time fields
 * are narrowed to the 32-bit user types.  Destination is bzero'd first so
 * padding never leaks kernel memory to user space.
 */
void
munge_user32_stat64(struct stat64 *sbp, struct user32_stat64 *usbp)
{
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
#ifndef _POSIX_C_SOURCE
	usbp->st_atimespec.tv_sec = (user32_time_t)sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = (user32_long_t)sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = (user32_time_t)sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = (user32_long_t)sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = (user32_time_t)sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = (user32_long_t)sbp->st_ctimespec.tv_nsec;
	usbp->st_birthtimespec.tv_sec = (user32_time_t)sbp->st_birthtimespec.tv_sec;
	usbp->st_birthtimespec.tv_nsec = (user32_long_t)sbp->st_birthtimespec.tv_nsec;
#else
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
	usbp->st_birthtime = sbp->st_birthtime;
	usbp->st_birthtimensec = sbp->st_birthtimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
13993
13994 /*
13995 * Purge buffer cache for simulating cold starts
13996 */
13997 static int
vnode_purge_callback(struct vnode * vp,__unused void * cargs)13998 vnode_purge_callback(struct vnode *vp, __unused void *cargs)
13999 {
14000 ubc_msync(vp, (off_t)0, ubc_getsize(vp), NULL /* off_t *resid_off */, UBC_PUSHALL | UBC_INVALIDATE);
14001
14002 return VNODE_RETURNED;
14003 }
14004
14005 static int
vfs_purge_callback(mount_t mp,__unused void * arg)14006 vfs_purge_callback(mount_t mp, __unused void * arg)
14007 {
14008 vnode_iterate(mp, VNODE_WAIT | VNODE_ITERATE_ALL, vnode_purge_callback, NULL);
14009
14010 return VFS_RETURNED;
14011 }
14012
/*
 * Boot-arg tunable and read-write sysctl (vfs.purge_vm_pagers, default TRUE):
 * when set, vfs_purge() also asks the VM layer to purge file-backed pagers
 * in addition to flushing the buffer cache.
 */
static TUNABLE_WRITEABLE(boolean_t, vfs_purge_vm_pagers, "vfs_purge_vm_pagers", TRUE);
SYSCTL_INT(_vfs, OID_AUTO, purge_vm_pagers, CTLFLAG_RW | CTLFLAG_LOCKED, &vfs_purge_vm_pagers, 0, "VFS purge also purges file-backed VM pagers");
14015
14016 int
vfs_purge(__unused struct proc * p,__unused struct vfs_purge_args * uap,__unused int32_t * retval)14017 vfs_purge(__unused struct proc *p, __unused struct vfs_purge_args *uap, __unused int32_t *retval)
14018 {
14019 if (!kauth_cred_issuser(kauth_cred_get())) {
14020 return EPERM;
14021 }
14022
14023 vfs_iterate(0 /* flags */, vfs_purge_callback, NULL);
14024
14025 /* also flush any VM pagers backed by files */
14026 if (vfs_purge_vm_pagers) {
14027 vm_purge_filebacked_pagers();
14028 }
14029
14030 return 0;
14031 }
14032
14033 /*
14034 * gets the vnode associated with the (unnamed) snapshot directory
14035 * for a Filesystem. The snapshot directory vnode is returned with
14036 * an iocount on it.
14037 */
int
vnode_get_snapdir(vnode_t rvp, vnode_t *sdvpp, vfs_context_t ctx)
{
	/* Delegate to the filesystem; callers in this file pass the volume root. */
	return VFS_VGET_SNAPDIR(vnode_mount(rvp), sdvpp, ctx);
}
14043
14044 /*
14045 * Get the snapshot vnode.
14046 *
14047 * If successful, the call returns with an iocount on *rvpp ,*sdvpp and
14048 * needs nameidone() on ndp.
14049 *
14050 * If the snapshot vnode exists it is returned in ndp->ni_vp.
14051 *
14052 * If it returns with an error, *rvpp, *sdvpp are NULL and nameidone() is
14053 * not needed.
14054 */
static int
vnode_get_snapshot(int dirfd, vnode_t *rvpp, vnode_t *sdvpp,
    user_addr_t name, struct nameidata *ndp, int32_t op,
#if !CONFIG_TRIGGERS
    __unused
#endif
    enum path_operation pathop,
    vfs_context_t ctx)
{
	int error, i;
	caddr_t name_buf;
	size_t name_len;
	struct vfs_attr vfa;

	*sdvpp = NULLVP;
	*rvpp = NULLVP;

	/* Take an iocount on the vnode referenced by dirfd. */
	error = vnode_getfromfd(ctx, dirfd, rvpp);
	if (error) {
		return error;
	}

	/* Snapshots are addressed relative to the volume root only. */
	if (!vnode_isvroot(*rvpp)) {
		error = EINVAL;
		goto out;
	}

	/* Make sure the filesystem supports snapshots */
	VFSATTR_INIT(&vfa);
	VFSATTR_WANTED(&vfa, f_capabilities);
	if ((vfs_getattr(vnode_mount(*rvpp), &vfa, ctx) != 0) ||
	    !VFSATTR_IS_SUPPORTED(&vfa, f_capabilities) ||
	    !((vfa.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] &
	    VOL_CAP_INT_SNAPSHOT)) ||
	    !((vfa.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] &
	    VOL_CAP_INT_SNAPSHOT))) {
		error = ENOTSUP;
		goto out;
	}

	/* Get the (unnamed) snapshot directory; returns with an iocount on *sdvpp. */
	error = vnode_get_snapdir(*rvpp, sdvpp, ctx);
	if (error) {
		goto out;
	}

	/* Copy the snapshot name in from user space. */
	name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
	if (error) {
		goto out1;
	}

	/*
	 * Some sanity checks- name can't be empty, "." or ".." or have slashes.
	 * (the length returned by copyinstr includes the terminating NUL)
	 */
	if ((name_len == 1) || (name_len == 2 && name_buf[0] == '.') ||
	    (name_len == 3 && name_buf[0] == '.' && name_buf[1] == '.')) {
		error = EINVAL;
		goto out1;
	}
	/* Scan for a '/'; the loop stops early only if one is found. */
	for (i = 0; i < (int)name_len && name_buf[i] != '/'; i++) {
		;
	}
	if (i < (int)name_len) {
		error = EINVAL;
		goto out1;
	}

#if CONFIG_MACF
	/* MAC policy check keyed on the intended operation. */
	if (op == CREATE) {
		error = mac_mount_check_snapshot_create(ctx, vnode_mount(*rvpp),
		    name_buf);
	} else if (op == DELETE) {
		error = mac_mount_check_snapshot_delete(ctx, vnode_mount(*rvpp),
		    name_buf);
	}
	if (error) {
		goto out1;
	}
#endif

	/* Check if the snapshot already exists ... */
	NDINIT(ndp, op, pathop, USEDVP | NOCACHE | AUDITVNPATH1,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(name_buf), ctx);
	ndp->ni_dvp = *sdvpp;

	/* On success the caller owns the namei state and must call nameidone(). */
	error = namei(ndp);
out1:
	zfree(ZV_NAMEI, name_buf);
out:
	/* On any error, drop both iocounts so the caller has nothing to undo. */
	if (error) {
		if (*sdvpp) {
			vnode_put(*sdvpp);
			*sdvpp = NULLVP;
		}
		if (*rvpp) {
			vnode_put(*rvpp);
			*rvpp = NULLVP;
		}
	}
	return error;
}
14157
14158 /*
14159 * create a filesystem snapshot (for supporting filesystems)
14160 *
14161 * A much simplified version of openat(dirfd, name, O_CREAT | O_EXCL)
14162 * We get to the (unnamed) snapshot directory vnode and create the vnode
14163 * for the snapshot in it.
14164 *
14165 * Restrictions:
14166 *
14167 * a) Passed in name for snapshot cannot have slashes.
14168 * b) name can't be "." or ".."
14169 *
14170 * Since this requires superuser privileges, vnode_authorize calls are not
14171 * made.
14172 */
static int __attribute__((noinline))
snapshot_create(int dirfd, user_addr_t name, __unused uint32_t flags,
    vfs_context_t ctx)
{
	vnode_t rvp, snapdvp;
	int error;
	struct nameidata *ndp;

	/* nameidata is too large for the kernel stack; heap-allocate it. */
	ndp = kalloc_type(struct nameidata, Z_WAITOK);

	/* On success: iocounts on rvp and snapdvp, and live namei state in ndp. */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, ndp, CREATE,
	    OP_LINK, ctx);
	if (error) {
		goto out;
	}

	if (ndp->ni_vp) {
		/* A snapshot with this name already exists. */
		vnode_put(ndp->ni_vp);
		error = EEXIST;
	} else {
		struct vnode_attr *vap;
		vnode_t vp = NULLVP;

		vap = kalloc_type(struct vnode_attr, Z_WAITOK);

		/* Create the snapshot entry as a regular file with mode 0. */
		VATTR_INIT(vap);
		VATTR_SET(vap, va_type, VREG);
		VATTR_SET(vap, va_mode, 0);

		/* Superuser-only path: skip authorization and attr inheritance. */
		error = vn_create(snapdvp, &vp, ndp, vap,
		    VN_CREATE_NOAUTH | VN_CREATE_NOINHERIT, 0, NULL, ctx);
		if (!error && vp) {
			vnode_put(vp);
		}

		kfree_type(struct vnode_attr, vap);
	}

	nameidone(ndp);
	vnode_put(snapdvp);
	vnode_put(rvp);
out:
	kfree_type(struct nameidata, ndp);

	return error;
}
14219
14220 /*
14221 * Delete a Filesystem snapshot
14222 *
14223 * get the vnode for the unnamed snapshot directory and the snapshot and
14224 * delete the snapshot.
14225 */
static int __attribute__((noinline))
snapshot_delete(int dirfd, user_addr_t name, __unused uint32_t flags,
    vfs_context_t ctx)
{
	vnode_t rvp, snapdvp;
	int error;
	struct nameidata *ndp;

	/* nameidata is too large for the kernel stack; heap-allocate it. */
	ndp = kalloc_type(struct nameidata, Z_WAITOK);

	/* On success: iocounts on rvp, snapdvp, and the snapshot (ndp->ni_vp). */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, ndp, DELETE,
	    OP_UNLINK, ctx);
	if (error) {
		goto out;
	}

	/* Remove the snapshot entry, suppressing namespace event delivery. */
	error = VNOP_REMOVE(snapdvp, ndp->ni_vp, &ndp->ni_cnd,
	    VNODE_REMOVE_SKIP_NAMESPACE_EVENT, ctx);

	vnode_put(ndp->ni_vp);
	nameidone(ndp);
	vnode_put(snapdvp);
	vnode_put(rvp);
out:
	kfree_type(struct nameidata, ndp);

	return error;
}
14254
14255 /*
14256 * Revert a filesystem to a snapshot
14257 *
14258 * Marks the filesystem to revert to the given snapshot on next mount.
14259 */
static int __attribute__((noinline))
snapshot_revert(int dirfd, user_addr_t name, __unused uint32_t flags,
    vfs_context_t ctx)
{
	int error;
	vnode_t rvp;
	mount_t mp;
	struct fs_snapshot_revert_args revert_data;
	struct componentname cnp;
	caddr_t name_buf;
	size_t name_len;

	/* Take an iocount on the vnode referenced by dirfd. */
	error = vnode_getfromfd(ctx, dirfd, &rvp);
	if (error) {
		return error;
	}
	mp = vnode_mount(rvp);

	/* Copy the snapshot name in from user space. */
	name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
	if (error) {
		zfree(ZV_NAMEI, name_buf);
		vnode_put(rvp);
		return error;
	}

#if CONFIG_MACF
	error = mac_mount_check_snapshot_revert(ctx, mp, name_buf);
	if (error) {
		zfree(ZV_NAMEI, name_buf);
		vnode_put(rvp);
		return error;
	}
#endif

	/*
	 * Grab mount_iterref so that we can release the vnode,
	 * since VFSIOC_REVERT_SNAPSHOT could conceivably cause a sync.
	 */
	error = mount_iterref(mp, 0);
	vnode_put(rvp);
	if (error) {
		zfree(ZV_NAMEI, name_buf);
		return error;
	}

	/* Build a componentname describing the snapshot for the filesystem. */
	memset(&cnp, 0, sizeof(cnp));
	cnp.cn_pnbuf = (char *)name_buf;
	cnp.cn_nameiop = LOOKUP;
	cnp.cn_flags = ISLASTCN | HASBUF;
	cnp.cn_pnlen = MAXPATHLEN;
	cnp.cn_nameptr = cnp.cn_pnbuf;
	cnp.cn_namelen = (int)name_len;
	revert_data.sr_cnp = &cnp;

	error = VFS_IOCTL(mp, VFSIOC_REVERT_SNAPSHOT, (caddr_t)&revert_data, 0, ctx);
	mount_iterdrop(mp);
	zfree(ZV_NAMEI, name_buf);

	if (error) {
		/* If there was any error, try again using VNOP_IOCTL */

		vnode_t snapdvp;
		/*
		 * NOTE(review): unlike the other snapshot_* paths, this
		 * nameidata lives on the stack rather than the heap —
		 * confirm the stack budget allows it.
		 */
		struct nameidata namend;

		/* Re-acquire rvp/snapdvp and look up the snapshot vnode itself. */
		error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, &namend, LOOKUP,
		    OP_LOOKUP, ctx);
		if (error) {
			return error;
		}


		error = VNOP_IOCTL(namend.ni_vp, APFSIOC_REVERT_TO_SNAPSHOT, (caddr_t) NULL,
		    0, ctx);

		vnode_put(namend.ni_vp);
		nameidone(&namend);
		vnode_put(snapdvp);
		vnode_put(rvp);
	}

	return error;
}
14343
14344 /*
14345 * rename a Filesystem snapshot
14346 *
14347 * get the vnode for the unnamed snapshot directory and the snapshot and
14348 * rename the snapshot. This is a very specialised (and simple) case of
14349 * rename(2) (which has to deal with a lot more complications). It differs
14350 * slightly from rename(2) in that EEXIST is returned if the new name exists.
14351 */
static int __attribute__((noinline))
snapshot_rename(int dirfd, user_addr_t old, user_addr_t new,
    __unused uint32_t flags, vfs_context_t ctx)
{
	vnode_t rvp, snapdvp;
	int error, i;
	caddr_t newname_buf;
	size_t name_len;
	vnode_t fvp;
	struct nameidata *fromnd, *tond;
	/* carving out a chunk for structs that are too big to be on stack. */
	struct {
		struct nameidata from_node;
		struct nameidata to_node;
	} * __rename_data;

	__rename_data = kalloc_type(typeof(*__rename_data), Z_WAITOK);
	fromnd = &__rename_data->from_node;
	tond = &__rename_data->to_node;

	/* Look up the source snapshot (DELETE op, as for an unlink-style lookup). */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, old, fromnd, DELETE,
	    OP_UNLINK, ctx);
	if (error) {
		goto out;
	}
	fvp = fromnd->ni_vp;

	/* Copy the destination name in from user space. */
	newname_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	error = copyinstr(new, newname_buf, MAXPATHLEN, &name_len);
	if (error) {
		goto out1;
	}

	/*
	 * Some sanity checks- new name can't be empty, "." or ".." or have
	 * slashes.
	 * (the length returned by copyinstr includes the terminating NUL)
	 *
	 * The FS rename VNOP is supposed to handle this but we'll pick it
	 * off here itself.
	 */
	if ((name_len == 1) || (name_len == 2 && newname_buf[0] == '.') ||
	    (name_len == 3 && newname_buf[0] == '.' && newname_buf[1] == '.')) {
		error = EINVAL;
		goto out1;
	}
	/* Reject any '/' in the new name. */
	for (i = 0; i < (int)name_len && newname_buf[i] != '/'; i++) {
		;
	}
	if (i < (int)name_len) {
		error = EINVAL;
		goto out1;
	}

#if CONFIG_MACF
	/* Renaming to a new name is, MAC-wise, equivalent to creating it. */
	error = mac_mount_check_snapshot_create(ctx, vnode_mount(rvp),
	    newname_buf);
	if (error) {
		goto out1;
	}
#endif

	/* Look up the destination name relative to the snapshot directory. */
	NDINIT(tond, RENAME, OP_RENAME, USEDVP | NOCACHE | AUDITVNPATH2,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(newname_buf), ctx);
	tond->ni_dvp = snapdvp;

	error = namei(tond);
	if (error) {
		goto out2;
	} else if (tond->ni_vp) {
		/*
		 * snapshot rename behaves differently than rename(2) - if the
		 * new name exists, EEXIST is returned.
		 */
		vnode_put(tond->ni_vp);
		error = EEXIST;
		goto out2;
	}

	/* Both source and destination live in the same (snapshot) directory. */
	error = VNOP_RENAME(snapdvp, fvp, &fromnd->ni_cnd, snapdvp, NULLVP,
	    &tond->ni_cnd, ctx);

out2:
	nameidone(tond);
out1:
	zfree(ZV_NAMEI, newname_buf);
	vnode_put(fvp);
	vnode_put(snapdvp);
	vnode_put(rvp);
	nameidone(fromnd);
out:
	kfree_type(typeof(*__rename_data), __rename_data);
	return error;
}
14446
14447 /*
14448 * Mount a Filesystem snapshot
14449 *
14450 * get the vnode for the unnamed snapshot directory and the snapshot and
14451 * mount the snapshot.
14452 */
14453 static int __attribute__((noinline))
snapshot_mount(int dirfd,user_addr_t name,user_addr_t directory,__unused user_addr_t mnt_data,__unused uint32_t flags,vfs_context_t ctx)14454 snapshot_mount(int dirfd, user_addr_t name, user_addr_t directory,
14455 __unused user_addr_t mnt_data, __unused uint32_t flags, vfs_context_t ctx)
14456 {
14457 mount_t mp;
14458 vnode_t rvp, snapdvp, snapvp, vp, pvp;
14459 struct fs_snapshot_mount_args smnt_data;
14460 int error;
14461 struct nameidata *snapndp, *dirndp;
14462 /* carving out a chunk for structs that are too big to be on stack. */
14463 struct {
14464 struct nameidata snapnd;
14465 struct nameidata dirnd;
14466 } * __snapshot_mount_data;
14467
14468 __snapshot_mount_data = kalloc_type(typeof(*__snapshot_mount_data), Z_WAITOK);
14469 snapndp = &__snapshot_mount_data->snapnd;
14470 dirndp = &__snapshot_mount_data->dirnd;
14471
14472 error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, snapndp, LOOKUP,
14473 OP_LOOKUP, ctx);
14474 if (error) {
14475 goto out;
14476 }
14477
14478 snapvp = snapndp->ni_vp;
14479 if (!vnode_mount(rvp) || (vnode_mount(rvp) == dead_mountp)) {
14480 error = EIO;
14481 goto out1;
14482 }
14483
14484 /* Get the vnode to be covered */
14485 NDINIT(dirndp, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
14486 UIO_USERSPACE, directory, ctx);
14487 error = namei(dirndp);
14488 if (error) {
14489 goto out1;
14490 }
14491
14492 vp = dirndp->ni_vp;
14493 pvp = dirndp->ni_dvp;
14494 mp = vnode_mount(rvp);
14495
14496 if ((vp->v_flag & VROOT) && (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
14497 error = EINVAL;
14498 goto out2;
14499 }
14500
14501 #if CONFIG_MACF
14502 error = mac_mount_check_snapshot_mount(ctx, rvp, vp, &dirndp->ni_cnd, snapndp->ni_cnd.cn_nameptr,
14503 mp->mnt_vfsstat.f_fstypename);
14504 if (error) {
14505 goto out2;
14506 }
14507 #endif
14508
14509 smnt_data.sm_mp = mp;
14510 smnt_data.sm_cnp = &snapndp->ni_cnd;
14511 error = mount_common(mp->mnt_vfsstat.f_fstypename, pvp, vp,
14512 &dirndp->ni_cnd, CAST_USER_ADDR_T(&smnt_data), flags & (MNT_DONTBROWSE | MNT_IGNORE_OWNERSHIP),
14513 KERNEL_MOUNT_SNAPSHOT, NULL, ctx);
14514
14515 out2:
14516 vnode_put(vp);
14517 vnode_put(pvp);
14518 nameidone(dirndp);
14519 out1:
14520 vnode_put(snapvp);
14521 vnode_put(snapdvp);
14522 vnode_put(rvp);
14523 nameidone(snapndp);
14524 out:
14525 kfree_type(typeof(*__snapshot_mount_data), __snapshot_mount_data);
14526 return error;
14527 }
14528
14529 /*
14530 * Root from a snapshot of the filesystem
14531 *
14532 * Marks the filesystem to root from the given snapshot on next boot.
14533 */
static int __attribute__((noinline))
snapshot_root(int dirfd, user_addr_t name, __unused uint32_t flags,
    vfs_context_t ctx)
{
	int error;
	vnode_t rvp;
	mount_t mp;
	struct fs_snapshot_root_args root_data;
	struct componentname cnp;
	caddr_t name_buf;
	size_t name_len;

	/* Take an iocount on the vnode referenced by dirfd. */
	error = vnode_getfromfd(ctx, dirfd, &rvp);
	if (error) {
		return error;
	}
	mp = vnode_mount(rvp);

	/* Copy the snapshot name in from user space. */
	name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
	if (error) {
		zfree(ZV_NAMEI, name_buf);
		vnode_put(rvp);
		return error;
	}

	// XXX MAC checks ?

	/*
	 * Grab mount_iterref so that we can release the vnode,
	 * since VFSIOC_ROOT_SNAPSHOT could conceivably cause a sync.
	 */
	error = mount_iterref(mp, 0);
	vnode_put(rvp);
	if (error) {
		zfree(ZV_NAMEI, name_buf);
		return error;
	}

	/* Build a componentname describing the snapshot for the filesystem. */
	memset(&cnp, 0, sizeof(cnp));
	cnp.cn_pnbuf = (char *)name_buf;
	cnp.cn_nameiop = LOOKUP;
	cnp.cn_flags = ISLASTCN | HASBUF;
	cnp.cn_pnlen = MAXPATHLEN;
	cnp.cn_nameptr = cnp.cn_pnbuf;
	cnp.cn_namelen = (int)name_len;
	root_data.sr_cnp = &cnp;

	/* Ask the filesystem to root from this snapshot on next boot. */
	error = VFS_IOCTL(mp, VFSIOC_ROOT_SNAPSHOT, (caddr_t)&root_data, 0, ctx);

	mount_iterdrop(mp);
	zfree(ZV_NAMEI, name_buf);

	return error;
}
14589
14590 static boolean_t
vfs_context_can_snapshot(vfs_context_t ctx)14591 vfs_context_can_snapshot(vfs_context_t ctx)
14592 {
14593 static const char * const snapshot_entitlements[] = {
14594 "com.apple.private.vfs.snapshot",
14595 "com.apple.developer.vfs.snapshot",
14596 "com.apple.private.apfs.arv.limited.snapshot",
14597 };
14598 static const size_t nentitlements =
14599 sizeof(snapshot_entitlements) / sizeof(snapshot_entitlements[0]);
14600 size_t i;
14601
14602 task_t task = vfs_context_task(ctx);
14603 for (i = 0; i < nentitlements; i++) {
14604 if (IOTaskHasEntitlement(task, snapshot_entitlements[i])) {
14605 return TRUE;
14606 }
14607 }
14608 return FALSE;
14609 }
14610
14611 /*
14612 * FS snapshot operations dispatcher
14613 */
int
fs_snapshot(__unused proc_t p, struct fs_snapshot_args *uap,
    __unused int32_t *retval)
{
	int error;
	vfs_context_t ctx = vfs_context_current();

	AUDIT_ARG(fd, uap->dirfd);
	AUDIT_ARG(value32, uap->op);

	/* Caller must hold one of the snapshot entitlements. */
	if (!vfs_context_can_snapshot(ctx)) {
		return EPERM;
	}

	/*
	 * Enforce user authorization for snapshot modification operations,
	 * or if trying to root from snapshot.
	 */
	if (uap->op != SNAPSHOT_OP_MOUNT) {
		vnode_t dvp = NULLVP;
		vnode_t devvp = NULLVP;
		mount_t mp;

		error = vnode_getfromfd(ctx, uap->dirfd, &dvp);
		if (error) {
			return error;
		}
		mp = vnode_mount(dvp);
		devvp = mp->mnt_devvp;

		/* get an iocount on devvp */
		if (devvp == NULLVP) {
			/* No device vnode cached; resolve the mount-from path instead. */
			error = vnode_lookup(mp->mnt_vfsstat.f_mntfromname, 0, &devvp, ctx);
			/* for mounts which arent block devices */
			if (error == ENOENT) {
				error = ENXIO;
			}
		} else {
			error = vnode_getwithref(devvp);
		}

		if (error) {
			vnode_put(dvp);
			return error;
		}

		/*
		 * Authorized if any of: superuser, can write the backing
		 * device, or holds the user-snapshot entitlement.
		 */
		if ((vfs_context_issuser(ctx) == 0) &&
		    (vnode_authorize(devvp, NULL, KAUTH_VNODE_WRITE_DATA, ctx) != 0) &&
		    (!IOTaskHasEntitlement(vfs_context_task(ctx), "com.apple.private.vfs.snapshot.user"))) {
			error = EPERM;
		}
		vnode_put(dvp);
		vnode_put(devvp);

		if (error) {
			return error;
		}
	}

	/* Dispatch to the per-operation handler. */
	switch (uap->op) {
	case SNAPSHOT_OP_CREATE:
		error = snapshot_create(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
	case SNAPSHOT_OP_DELETE:
		error = snapshot_delete(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
	case SNAPSHOT_OP_RENAME:
		error = snapshot_rename(uap->dirfd, uap->name1, uap->name2,
		    uap->flags, ctx);
		break;
	case SNAPSHOT_OP_MOUNT:
		error = snapshot_mount(uap->dirfd, uap->name1, uap->name2,
		    uap->data, uap->flags, ctx);
		break;
	case SNAPSHOT_OP_REVERT:
		error = snapshot_revert(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
#if CONFIG_MNT_ROOTSNAP
	case SNAPSHOT_OP_ROOT:
		error = snapshot_root(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
#endif /* CONFIG_MNT_ROOTSNAP */
	default:
		error = ENOSYS;
	}

	return error;
}
14702