xref: /xnu-8020.140.41/bsd/vfs/vfs_cluster.c (revision 27b03b360a988dfd3dfdf34262bb0042026747cc)
1 /*
2  * Copyright (c) 2000-2020 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29 /*
30  * Copyright (c) 1993
31  *	The Regents of the University of California.  All rights reserved.
32  *
33  * Redistribution and use in source and binary forms, with or without
34  * modification, are permitted provided that the following conditions
35  * are met:
36  * 1. Redistributions of source code must retain the above copyright
37  *    notice, this list of conditions and the following disclaimer.
38  * 2. Redistributions in binary form must reproduce the above copyright
39  *    notice, this list of conditions and the following disclaimer in the
40  *    documentation and/or other materials provided with the distribution.
41  * 3. All advertising materials mentioning features or use of this software
42  *    must display the following acknowledgement:
43  *	This product includes software developed by the University of
44  *	California, Berkeley and its contributors.
45  * 4. Neither the name of the University nor the names of its contributors
46  *    may be used to endorse or promote products derived from this software
47  *    without specific prior written permission.
48  *
49  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59  * SUCH DAMAGE.
60  *
61  *	@(#)vfs_cluster.c	8.10 (Berkeley) 3/28/95
62  */
63 
64 #include <sys/param.h>
65 #include <sys/proc_internal.h>
66 #include <sys/buf_internal.h>
67 #include <sys/mount_internal.h>
68 #include <sys/vnode_internal.h>
69 #include <sys/trace.h>
70 #include <kern/kalloc.h>
71 #include <sys/time.h>
72 #include <sys/kernel.h>
73 #include <sys/resourcevar.h>
74 #include <miscfs/specfs/specdev.h>
75 #include <sys/uio_internal.h>
76 #include <libkern/libkern.h>
77 #include <machine/machine_routines.h>
78 
79 #include <sys/ubc_internal.h>
80 #include <vm/vnode_pager.h>
81 
82 #include <mach/mach_types.h>
83 #include <mach/memory_object_types.h>
84 #include <mach/vm_map.h>
85 #include <mach/upl.h>
86 #include <kern/task.h>
87 #include <kern/policy_internal.h>
88 
89 #include <vm/vm_kern.h>
90 #include <vm/vm_map.h>
91 #include <vm/vm_pageout.h>
92 #include <vm/vm_fault.h>
93 
94 #include <sys/kdebug.h>
95 #include <sys/kdebug_triage.h>
96 #include <libkern/OSAtomic.h>
97 
98 #include <sys/sdt.h>
99 
100 #include <stdbool.h>
101 
102 #include <vfs/vfs_disk_conditioner.h>
103 
104 #if 0
105 #undef KERNEL_DEBUG
106 #define KERNEL_DEBUG KERNEL_DEBUG_CONSTANT
107 #endif
108 
109 
110 #define CL_READ         0x01
111 #define CL_WRITE        0x02
112 #define CL_ASYNC        0x04
113 #define CL_COMMIT       0x08
114 #define CL_PAGEOUT      0x10
115 #define CL_AGE          0x20
116 #define CL_NOZERO       0x40
117 #define CL_PAGEIN       0x80
118 #define CL_DEV_MEMORY   0x100
119 #define CL_PRESERVE     0x200
120 #define CL_THROTTLE     0x400
121 #define CL_KEEPCACHED   0x800
122 #define CL_DIRECT_IO    0x1000
123 #define CL_PASSIVE      0x2000
124 #define CL_IOSTREAMING  0x4000
125 #define CL_CLOSE        0x8000
126 #define CL_ENCRYPTED    0x10000
127 #define CL_RAW_ENCRYPTED        0x20000
128 #define CL_NOCACHE      0x40000
129 
130 #define MAX_VECTOR_UPL_ELEMENTS 8
131 #define MAX_VECTOR_UPL_SIZE     (2 * MAX_UPL_SIZE_BYTES)
132 
133 #define CLUSTER_IO_WAITING              ((buf_t)1)
134 
135 extern upl_t vector_upl_create(vm_offset_t);
136 extern boolean_t vector_upl_is_valid(upl_t);
137 extern boolean_t vector_upl_set_subupl(upl_t, upl_t, u_int32_t);
138 extern void vector_upl_set_pagelist(upl_t);
139 extern void vector_upl_set_iostate(upl_t, upl_t, vm_offset_t, u_int32_t);
140 
141 struct clios {
142 	lck_mtx_t io_mtxp;
143 	u_int  io_completed;       /* amount of io that has currently completed */
144 	u_int  io_issued;          /* amount of io that was successfully issued */
145 	int    io_error;           /* error code of first error encountered */
146 	int    io_wanted;          /* someone is sleeping waiting for a change in state */
147 };
148 
149 struct cl_direct_read_lock {
150 	LIST_ENTRY(cl_direct_read_lock)         chain;
151 	int32_t                                                         ref_count;
152 	vnode_t                                                         vp;
153 	lck_rw_t                                                        rw_lock;
154 };
155 
156 #define CL_DIRECT_READ_LOCK_BUCKETS 61
157 
158 static LIST_HEAD(cl_direct_read_locks, cl_direct_read_lock)
159 cl_direct_read_locks[CL_DIRECT_READ_LOCK_BUCKETS];
160 
161 static LCK_GRP_DECLARE(cl_mtx_grp, "cluster I/O");
162 static LCK_MTX_DECLARE(cl_transaction_mtxp, &cl_mtx_grp);
163 static LCK_SPIN_DECLARE(cl_direct_read_spin_lock, &cl_mtx_grp);
164 
165 static ZONE_DEFINE(cl_rd_zone, "cluster_read",
166     sizeof(struct cl_readahead), ZC_ZFREE_CLEARMEM);
167 
168 static ZONE_DEFINE(cl_wr_zone, "cluster_write",
169     sizeof(struct cl_writebehind), ZC_ZFREE_CLEARMEM);
170 
171 #define IO_UNKNOWN      0
172 #define IO_DIRECT       1
173 #define IO_CONTIG       2
174 #define IO_COPY         3
175 
176 #define PUSH_DELAY      0x01
177 #define PUSH_ALL        0x02
178 #define PUSH_SYNC       0x04
179 
180 
181 static void cluster_EOT(buf_t cbp_head, buf_t cbp_tail, int zero_offset, size_t verify_block_size);
182 static void cluster_wait_IO(buf_t cbp_head, int async);
183 static void cluster_complete_transaction(buf_t *cbp_head, void *callback_arg, int *retval, int flags, int needwait);
184 
185 static int cluster_io_type(struct uio *uio, int *io_type, u_int32_t *io_length, u_int32_t min_length);
186 
187 static int cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int non_rounded_size,
188     int flags, buf_t real_bp, struct clios *iostate, int (*)(buf_t, void *), void *callback_arg);
189 static int cluster_iodone(buf_t bp, void *callback_arg);
190 static int cluster_ioerror(upl_t upl, int upl_offset, int abort_size, int error, int io_flags, vnode_t vp);
191 static int cluster_is_throttled(vnode_t vp);
192 
193 static void cluster_iostate_wait(struct clios *iostate, u_int target, const char *wait_name);
194 
195 static void cluster_syncup(vnode_t vp, off_t newEOF, int (*)(buf_t, void *), void *callback_arg, int flags);
196 
197 static void cluster_read_upl_release(upl_t upl, int start_pg, int last_pg, int take_reference);
198 static int cluster_copy_ubc_data_internal(vnode_t vp, struct uio *uio, int *io_resid, int mark_dirty, int take_reference);
199 
200 static int cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t filesize, int flags,
201     int (*)(buf_t, void *), void *callback_arg) __attribute__((noinline));
202 static int cluster_read_direct(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t *read_length,
203     int flags, int (*)(buf_t, void *), void *callback_arg) __attribute__((noinline));
204 static int cluster_read_contig(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t *read_length,
205     int (*)(buf_t, void *), void *callback_arg, int flags) __attribute__((noinline));
206 
207 static int cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t oldEOF, off_t newEOF,
208     off_t headOff, off_t tailOff, int flags, int (*)(buf_t, void *), void *callback_arg) __attribute__((noinline));
209 static int cluster_write_direct(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF,
210     int *write_type, u_int32_t *write_length, int flags, int (*)(buf_t, void *), void *callback_arg) __attribute__((noinline));
211 static int cluster_write_contig(vnode_t vp, struct uio *uio, off_t newEOF,
212     int *write_type, u_int32_t *write_length, int (*)(buf_t, void *), void *callback_arg, int bflag) __attribute__((noinline));
213 
214 static void cluster_update_state_internal(vnode_t vp, struct cl_extent *cl, int flags, boolean_t defer_writes, boolean_t *first_pass,
215     off_t write_off, int write_cnt, off_t newEOF, int (*callback)(buf_t, void *), void *callback_arg, boolean_t vm_initiated);
216 
217 static int cluster_align_phys_io(vnode_t vp, struct uio *uio, addr64_t usr_paddr, u_int32_t xsize, int flags, int (*)(buf_t, void *), void *callback_arg);
218 
219 static int      cluster_read_prefetch(vnode_t vp, off_t f_offset, u_int size, off_t filesize, int (*callback)(buf_t, void *), void *callback_arg, int bflag);
220 static void     cluster_read_ahead(vnode_t vp, struct cl_extent *extent, off_t filesize, struct cl_readahead *ra,
221     int (*callback)(buf_t, void *), void *callback_arg, int bflag);
222 
223 static int      cluster_push_now(vnode_t vp, struct cl_extent *, off_t EOF, int flags, int (*)(buf_t, void *), void *callback_arg, boolean_t vm_ioitiated);
224 
225 static int      cluster_try_push(struct cl_writebehind *, vnode_t vp, off_t EOF, int push_flag, int flags, int (*)(buf_t, void *),
226     void *callback_arg, int *err, boolean_t vm_initiated);
227 
228 static int      sparse_cluster_switch(struct cl_writebehind *, vnode_t vp, off_t EOF, int (*)(buf_t, void *), void *callback_arg, boolean_t vm_initiated);
229 static int      sparse_cluster_push(struct cl_writebehind *, void **cmapp, vnode_t vp, off_t EOF, int push_flag,
230     int io_flags, int (*)(buf_t, void *), void *callback_arg, boolean_t vm_initiated);
231 static int      sparse_cluster_add(struct cl_writebehind *, void **cmapp, vnode_t vp, struct cl_extent *, off_t EOF,
232     int (*)(buf_t, void *), void *callback_arg, boolean_t vm_initiated);
233 
234 static kern_return_t vfs_drt_mark_pages(void **cmapp, off_t offset, u_int length, u_int *setcountp);
235 static kern_return_t vfs_drt_get_cluster(void **cmapp, off_t *offsetp, u_int *lengthp);
236 static kern_return_t vfs_drt_control(void **cmapp, int op_type);
237 static kern_return_t vfs_get_scmap_push_behavior_internal(void **cmapp, int *push_flag);
238 
239 
240 /*
241  * For throttled IO to check whether
242  * a block is cached by the boot cache
243  * and thus it can avoid delaying the IO.
244  *
245  * bootcache_contains_block is initially
246  * NULL. The BootCache will set it while
247  * the cache is active and clear it when
248  * the cache is jettisoned.
249  *
250  * Returns 0 if the block is not
251  * contained in the cache, 1 if it is
252  * contained.
253  *
254  * The function pointer remains valid
255  * after the cache has been evicted even
256  * if bootcache_contains_block has been
257  * cleared.
258  *
259  * See rdar://9974130 The new throttling mechanism breaks the boot cache for throttled IOs
260  */
261 int (*bootcache_contains_block)(dev_t device, u_int64_t blkno) = NULL;
262 
263 
264 /*
265  * limit the internal I/O size so that we
266  * can represent it in a 32 bit int
267  */
268 #define MAX_IO_REQUEST_SIZE     (1024 * 1024 * 512)
269 #define MAX_IO_CONTIG_SIZE      MAX_UPL_SIZE_BYTES
270 #define MAX_VECTS               16
271 /*
272  * The MIN_DIRECT_WRITE_SIZE governs how much I/O should be issued before we consider
273  * allowing the caller to bypass the buffer cache.  For small I/Os (less than 16k),
274  * we have not historically allowed the write to bypass the UBC.
275  */
276 #define MIN_DIRECT_WRITE_SIZE   (16384)
277 
278 #define WRITE_THROTTLE          6
279 #define WRITE_THROTTLE_SSD      2
280 #define WRITE_BEHIND            1
281 #define WRITE_BEHIND_SSD        1
282 
283 #if !defined(XNU_TARGET_OS_OSX)
284 #define PREFETCH                1
285 #define PREFETCH_SSD            1
286 uint32_t speculative_prefetch_max = (2048 * 1024);              /* maximum bytes in a specluative read-ahead */
287 uint32_t speculative_prefetch_max_iosize = (512 * 1024);        /* maximum I/O size to use in a specluative read-ahead */
288 #else /* XNU_TARGET_OS_OSX */
289 #define PREFETCH                3
290 #define PREFETCH_SSD            2
291 uint32_t speculative_prefetch_max = (MAX_UPL_SIZE_BYTES * 3);   /* maximum bytes in a specluative read-ahead */
292 uint32_t speculative_prefetch_max_iosize = (512 * 1024);        /* maximum I/O size to use in a specluative read-ahead on SSDs*/
293 #endif /* ! XNU_TARGET_OS_OSX */
294 
295 
296 #define IO_SCALE(vp, base)              (vp->v_mount->mnt_ioscale * (base))
297 #define MAX_CLUSTER_SIZE(vp)            (cluster_max_io_size(vp->v_mount, CL_WRITE))
298 #define MAX_PREFETCH(vp, size, is_ssd)  (size * IO_SCALE(vp, ((is_ssd) ? PREFETCH_SSD : PREFETCH)))
299 
300 int     speculative_reads_disabled = 0;
301 
302 /*
303  * throttle the number of async writes that
304  * can be outstanding on a single vnode
305  * before we issue a synchronous write
306  */
307 #define THROTTLE_MAXCNT 0
308 
309 uint32_t throttle_max_iosize = (128 * 1024);
310 
311 #define THROTTLE_MAX_IOSIZE (throttle_max_iosize)
312 
313 SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_max_iosize, CTLFLAG_RW | CTLFLAG_LOCKED, &throttle_max_iosize, 0, "");
314 
315 
316 void
cluster_init(void)317 cluster_init(void)
318 {
319 	for (int i = 0; i < CL_DIRECT_READ_LOCK_BUCKETS; ++i) {
320 		LIST_INIT(&cl_direct_read_locks[i]);
321 	}
322 }
323 
324 
325 uint32_t
cluster_max_io_size(mount_t mp,int type)326 cluster_max_io_size(mount_t mp, int type)
327 {
328 	uint32_t        max_io_size;
329 	uint32_t        segcnt;
330 	uint32_t        maxcnt;
331 
332 	switch (type) {
333 	case CL_READ:
334 		segcnt = mp->mnt_segreadcnt;
335 		maxcnt = mp->mnt_maxreadcnt;
336 		break;
337 	case CL_WRITE:
338 		segcnt = mp->mnt_segwritecnt;
339 		maxcnt = mp->mnt_maxwritecnt;
340 		break;
341 	default:
342 		segcnt = min(mp->mnt_segreadcnt, mp->mnt_segwritecnt);
343 		maxcnt = min(mp->mnt_maxreadcnt, mp->mnt_maxwritecnt);
344 		break;
345 	}
346 	if (segcnt > (MAX_UPL_SIZE_BYTES >> PAGE_SHIFT)) {
347 		/*
348 		 * don't allow a size beyond the max UPL size we can create
349 		 */
350 		segcnt = MAX_UPL_SIZE_BYTES >> PAGE_SHIFT;
351 	}
352 	max_io_size = min((segcnt * PAGE_SIZE), maxcnt);
353 
354 	if (max_io_size < MAX_UPL_TRANSFER_BYTES) {
355 		/*
356 		 * don't allow a size smaller than the old fixed limit
357 		 */
358 		max_io_size = MAX_UPL_TRANSFER_BYTES;
359 	} else {
360 		/*
361 		 * make sure the size specified is a multiple of PAGE_SIZE
362 		 */
363 		max_io_size &= ~PAGE_MASK;
364 	}
365 	return max_io_size;
366 }
367 
368 
369 
370 
371 #define CLW_ALLOCATE            0x01
372 #define CLW_RETURNLOCKED        0x02
373 #define CLW_IONOCACHE           0x04
374 #define CLW_IOPASSIVE   0x08
375 
376 /*
377  * if the read ahead context doesn't yet exist,
378  * allocate and initialize it...
379  * the vnode lock serializes multiple callers
380  * during the actual assignment... first one
381  * to grab the lock wins... the other callers
382  * will release the now unnecessary storage
383  *
384  * once the context is present, try to grab (but don't block on)
385  * the lock associated with it... if someone
386  * else currently owns it, than the read
387  * will run without read-ahead.  this allows
388  * multiple readers to run in parallel and
389  * since there's only 1 read ahead context,
390  * there's no real loss in only allowing 1
391  * reader to have read-ahead enabled.
392  */
393 static struct cl_readahead *
cluster_get_rap(vnode_t vp)394 cluster_get_rap(vnode_t vp)
395 {
396 	struct ubc_info         *ubc;
397 	struct cl_readahead     *rap;
398 
399 	ubc = vp->v_ubcinfo;
400 
401 	if ((rap = ubc->cl_rahead) == NULL) {
402 		rap = zalloc_flags(cl_rd_zone, Z_WAITOK | Z_ZERO);
403 		rap->cl_lastr = -1;
404 		lck_mtx_init(&rap->cl_lockr, &cl_mtx_grp, LCK_ATTR_NULL);
405 
406 		vnode_lock(vp);
407 
408 		if (ubc->cl_rahead == NULL) {
409 			ubc->cl_rahead = rap;
410 		} else {
411 			lck_mtx_destroy(&rap->cl_lockr, &cl_mtx_grp);
412 			zfree(cl_rd_zone, rap);
413 			rap = ubc->cl_rahead;
414 		}
415 		vnode_unlock(vp);
416 	}
417 	if (lck_mtx_try_lock(&rap->cl_lockr) == TRUE) {
418 		return rap;
419 	}
420 
421 	return (struct cl_readahead *)NULL;
422 }
423 
424 
425 /*
426  * if the write behind context doesn't yet exist,
427  * and CLW_ALLOCATE is specified, allocate and initialize it...
428  * the vnode lock serializes multiple callers
429  * during the actual assignment... first one
430  * to grab the lock wins... the other callers
431  * will release the now unnecessary storage
432  *
433  * if CLW_RETURNLOCKED is set, grab (blocking if necessary)
434  * the lock associated with the write behind context before
435  * returning
436  */
437 
438 static struct cl_writebehind *
cluster_get_wbp(vnode_t vp,int flags)439 cluster_get_wbp(vnode_t vp, int flags)
440 {
441 	struct ubc_info *ubc;
442 	struct cl_writebehind *wbp;
443 
444 	ubc = vp->v_ubcinfo;
445 
446 	if ((wbp = ubc->cl_wbehind) == NULL) {
447 		if (!(flags & CLW_ALLOCATE)) {
448 			return (struct cl_writebehind *)NULL;
449 		}
450 
451 		wbp = zalloc_flags(cl_wr_zone, Z_WAITOK | Z_ZERO);
452 
453 		lck_mtx_init(&wbp->cl_lockw, &cl_mtx_grp, LCK_ATTR_NULL);
454 
455 		vnode_lock(vp);
456 
457 		if (ubc->cl_wbehind == NULL) {
458 			ubc->cl_wbehind = wbp;
459 		} else {
460 			lck_mtx_destroy(&wbp->cl_lockw, &cl_mtx_grp);
461 			zfree(cl_wr_zone, wbp);
462 			wbp = ubc->cl_wbehind;
463 		}
464 		vnode_unlock(vp);
465 	}
466 	if (flags & CLW_RETURNLOCKED) {
467 		lck_mtx_lock(&wbp->cl_lockw);
468 	}
469 
470 	return wbp;
471 }
472 
473 
474 static void
cluster_syncup(vnode_t vp,off_t newEOF,int (* callback)(buf_t,void *),void * callback_arg,int flags)475 cluster_syncup(vnode_t vp, off_t newEOF, int (*callback)(buf_t, void *), void *callback_arg, int flags)
476 {
477 	struct cl_writebehind *wbp;
478 
479 	if ((wbp = cluster_get_wbp(vp, 0)) != NULL) {
480 		if (wbp->cl_number) {
481 			lck_mtx_lock(&wbp->cl_lockw);
482 
483 			cluster_try_push(wbp, vp, newEOF, PUSH_ALL | flags, 0, callback, callback_arg, NULL, FALSE);
484 
485 			lck_mtx_unlock(&wbp->cl_lockw);
486 		}
487 	}
488 }
489 
490 
491 static int
cluster_io_present_in_BC(vnode_t vp,off_t f_offset)492 cluster_io_present_in_BC(vnode_t vp, off_t f_offset)
493 {
494 	daddr64_t blkno;
495 	size_t    io_size;
496 	int (*bootcache_check_fn)(dev_t device, u_int64_t blkno) = bootcache_contains_block;
497 
498 	if (bootcache_check_fn && vp->v_mount && vp->v_mount->mnt_devvp) {
499 		if (VNOP_BLOCKMAP(vp, f_offset, PAGE_SIZE, &blkno, &io_size, NULL, VNODE_READ | VNODE_BLOCKMAP_NO_TRACK, NULL)) {
500 			return 0;
501 		}
502 
503 		if (io_size == 0) {
504 			return 0;
505 		}
506 
507 		if (bootcache_check_fn(vp->v_mount->mnt_devvp->v_rdev, blkno)) {
508 			return 1;
509 		}
510 	}
511 	return 0;
512 }
513 
514 
515 static int
cluster_is_throttled(vnode_t vp)516 cluster_is_throttled(vnode_t vp)
517 {
518 	return throttle_io_will_be_throttled(-1, vp->v_mount);
519 }
520 
521 
522 static void
cluster_iostate_wait(struct clios * iostate,u_int target,const char * wait_name)523 cluster_iostate_wait(struct clios *iostate, u_int target, const char *wait_name)
524 {
525 	lck_mtx_lock(&iostate->io_mtxp);
526 
527 	while ((iostate->io_issued - iostate->io_completed) > target) {
528 		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_START,
529 		    iostate->io_issued, iostate->io_completed, target, 0, 0);
530 
531 		iostate->io_wanted = 1;
532 		msleep((caddr_t)&iostate->io_wanted, &iostate->io_mtxp, PRIBIO + 1, wait_name, NULL);
533 
534 		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_END,
535 		    iostate->io_issued, iostate->io_completed, target, 0, 0);
536 	}
537 	lck_mtx_unlock(&iostate->io_mtxp);
538 }
539 
540 static void
cluster_handle_associated_upl(struct clios * iostate,upl_t upl,upl_offset_t upl_offset,upl_size_t size)541 cluster_handle_associated_upl(struct clios *iostate, upl_t upl,
542     upl_offset_t upl_offset, upl_size_t size)
543 {
544 	if (!size) {
545 		return;
546 	}
547 
548 	upl_t associated_upl = upl_associated_upl(upl);
549 
550 	if (!associated_upl) {
551 		return;
552 	}
553 
554 #if 0
555 	printf("1: %d %d\n", upl_offset, upl_offset + size);
556 #endif
557 
558 	/*
559 	 * The associated UPL is page aligned to file offsets whereas the
560 	 * UPL it's attached to has different alignment requirements.  The
561 	 * upl_offset that we have refers to @upl.  The code that follows
562 	 * has to deal with the first and last pages in this transaction
563 	 * which might straddle pages in the associated UPL.  To keep
564 	 * track of these pages, we use the mark bits: if the mark bit is
565 	 * set, we know another transaction has completed its part of that
566 	 * page and so we can unlock that page here.
567 	 *
568 	 * The following illustrates what we have to deal with:
569 	 *
570 	 *    MEM u <------------ 1 PAGE ------------> e
571 	 *        +-------------+----------------------+-----------------
572 	 *        |             |######################|#################
573 	 *        +-------------+----------------------+-----------------
574 	 *   FILE | <--- a ---> o <------------ 1 PAGE ------------>
575 	 *
576 	 * So here we show a write to offset @o.  The data that is to be
577 	 * written is in a buffer that is not page aligned; it has offset
578 	 * @a in the page.  The upl that carries the data starts in memory
579 	 * at @u.  The associated upl starts in the file at offset @o.  A
580 	 * transaction will always end on a page boundary (like @e above)
581 	 * except for the very last transaction in the group.  We cannot
582 	 * unlock the page at @o in the associated upl until both the
583 	 * transaction ending at @e and the following transaction (that
584 	 * starts at @e) has completed.
585 	 */
586 
587 	/*
588 	 * We record whether or not the two UPLs are aligned as the mark
589 	 * bit in the first page of @upl.
590 	 */
591 	upl_page_info_t *pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
592 	bool is_unaligned = upl_page_get_mark(pl, 0);
593 
594 	if (is_unaligned) {
595 		upl_page_info_t *assoc_pl = UPL_GET_INTERNAL_PAGE_LIST(associated_upl);
596 
597 		upl_offset_t upl_end = upl_offset + size;
598 		assert(upl_end >= PAGE_SIZE);
599 
600 		upl_size_t assoc_upl_size = upl_get_size(associated_upl);
601 
602 		/*
603 		 * In the very first transaction in the group, upl_offset will
604 		 * not be page aligned, but after that it will be and in that
605 		 * case we want the preceding page in the associated UPL hence
606 		 * the minus one.
607 		 */
608 		assert(upl_offset);
609 		if (upl_offset) {
610 			upl_offset = trunc_page_32(upl_offset - 1);
611 		}
612 
613 		lck_mtx_lock_spin(&iostate->io_mtxp);
614 
615 		// Look at the first page...
616 		if (upl_offset
617 		    && !upl_page_get_mark(assoc_pl, upl_offset >> PAGE_SHIFT)) {
618 			/*
619 			 * The first page isn't marked so let another transaction
620 			 * completion handle it.
621 			 */
622 			upl_page_set_mark(assoc_pl, upl_offset >> PAGE_SHIFT, true);
623 			upl_offset += PAGE_SIZE;
624 		}
625 
626 		// And now the last page...
627 
628 		/*
629 		 * This needs to be > rather than >= because if it's equal, it
630 		 * means there's another transaction that is sharing the last
631 		 * page.
632 		 */
633 		if (upl_end > assoc_upl_size) {
634 			upl_end = assoc_upl_size;
635 		} else {
636 			upl_end = trunc_page_32(upl_end);
637 			const int last_pg = (upl_end >> PAGE_SHIFT) - 1;
638 
639 			if (!upl_page_get_mark(assoc_pl, last_pg)) {
640 				/*
641 				 * The last page isn't marked so mark the page and let another
642 				 * transaction completion handle it.
643 				 */
644 				upl_page_set_mark(assoc_pl, last_pg, true);
645 				upl_end -= PAGE_SIZE;
646 			}
647 		}
648 
649 		lck_mtx_unlock(&iostate->io_mtxp);
650 
651 #if 0
652 		printf("2: %d %d\n", upl_offset, upl_end);
653 #endif
654 
655 		if (upl_end <= upl_offset) {
656 			return;
657 		}
658 
659 		size = upl_end - upl_offset;
660 	} else {
661 		assert(!(upl_offset & PAGE_MASK));
662 		assert(!(size & PAGE_MASK));
663 	}
664 
665 	boolean_t empty;
666 
667 	/*
668 	 * We can unlock these pages now and as this is for a
669 	 * direct/uncached write, we want to dump the pages too.
670 	 */
671 	kern_return_t kr = upl_abort_range(associated_upl, upl_offset, size,
672 	    UPL_ABORT_DUMP_PAGES, &empty);
673 
674 	assert(!kr);
675 
676 	if (!kr && empty) {
677 		upl_set_associated_upl(upl, NULL);
678 		upl_deallocate(associated_upl);
679 	}
680 }
681 
682 static int
cluster_ioerror(upl_t upl,int upl_offset,int abort_size,int error,int io_flags,vnode_t vp)683 cluster_ioerror(upl_t upl, int upl_offset, int abort_size, int error, int io_flags, vnode_t vp)
684 {
685 	int upl_abort_code = 0;
686 	int page_in  = 0;
687 	int page_out = 0;
688 
689 	if ((io_flags & (B_PHYS | B_CACHE)) == (B_PHYS | B_CACHE)) {
690 		/*
691 		 * direct write of any flavor, or a direct read that wasn't aligned
692 		 */
693 		ubc_upl_commit_range(upl, upl_offset, abort_size, UPL_COMMIT_FREE_ON_EMPTY);
694 	} else {
695 		if (io_flags & B_PAGEIO) {
696 			if (io_flags & B_READ) {
697 				page_in  = 1;
698 			} else {
699 				page_out = 1;
700 			}
701 		}
702 		if (io_flags & B_CACHE) {
703 			/*
704 			 * leave pages in the cache unchanged on error
705 			 */
706 			upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
707 		} else if (((io_flags & B_READ) == 0) && ((error != ENXIO) || vnode_isswap(vp))) {
708 			/*
709 			 * transient error on pageout/write path... leave pages unchanged
710 			 */
711 			upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
712 		} else if (page_in) {
713 			upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
714 		} else {
715 			upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;
716 		}
717 
718 		ubc_upl_abort_range(upl, upl_offset, abort_size, upl_abort_code);
719 	}
720 	return upl_abort_code;
721 }
722 
723 
724 static int
cluster_iodone(buf_t bp,void * callback_arg)725 cluster_iodone(buf_t bp, void *callback_arg)
726 {
727 	int     b_flags;
728 	int     error;
729 	int     total_size;
730 	int     total_resid;
731 	int     upl_offset;
732 	int     zero_offset;
733 	int     pg_offset = 0;
734 	int     commit_size = 0;
735 	int     upl_flags = 0;
736 	int     transaction_size = 0;
737 	upl_t   upl;
738 	buf_t   cbp;
739 	buf_t   cbp_head;
740 	buf_t   cbp_next;
741 	buf_t   real_bp;
742 	vnode_t vp;
743 	struct  clios *iostate;
744 	void    *verify_ctx;
745 	boolean_t       transaction_complete = FALSE;
746 
747 	__IGNORE_WCASTALIGN(cbp_head = (buf_t)(bp->b_trans_head));
748 
749 	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_START,
750 	    cbp_head, bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);
751 
752 	if (cbp_head->b_trans_next || !(cbp_head->b_flags & B_EOT)) {
753 		lck_mtx_lock_spin(&cl_transaction_mtxp);
754 
755 		bp->b_flags |= B_TDONE;
756 
757 		for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) {
758 			/*
759 			 * all I/O requests that are part of this transaction
760 			 * have to complete before we can process it
761 			 */
762 			if (!(cbp->b_flags & B_TDONE)) {
763 				KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
764 				    cbp_head, cbp, cbp->b_bcount, cbp->b_flags, 0);
765 
766 				lck_mtx_unlock(&cl_transaction_mtxp);
767 
768 				return 0;
769 			}
770 
771 			if (cbp->b_trans_next == CLUSTER_IO_WAITING) {
772 				KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
773 				    cbp_head, cbp, cbp->b_bcount, cbp->b_flags, 0);
774 
775 				lck_mtx_unlock(&cl_transaction_mtxp);
776 				wakeup(cbp);
777 
778 				return 0;
779 			}
780 
781 			if (cbp->b_flags & B_EOT) {
782 				transaction_complete = TRUE;
783 			}
784 		}
785 		lck_mtx_unlock(&cl_transaction_mtxp);
786 
787 		if (transaction_complete == FALSE) {
788 			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
789 			    cbp_head, 0, 0, 0, 0);
790 			return 0;
791 		}
792 	}
793 	error       = 0;
794 	total_size  = 0;
795 	total_resid = 0;
796 
797 	cbp        = cbp_head;
798 	vp         = cbp->b_vp;
799 	upl_offset = cbp->b_uploffset;
800 	upl        = cbp->b_upl;
801 	b_flags    = cbp->b_flags;
802 	real_bp    = cbp->b_real_bp;
803 	zero_offset = cbp->b_validend;
804 	iostate    = (struct clios *)cbp->b_iostate;
805 
806 	if (real_bp) {
807 		real_bp->b_dev = cbp->b_dev;
808 	}
809 
810 	while (cbp) {
811 		if ((cbp->b_flags & B_ERROR) && error == 0) {
812 			error = cbp->b_error;
813 		}
814 
815 		total_resid += cbp->b_resid;
816 		total_size  += cbp->b_bcount;
817 
818 		cbp_next = cbp->b_trans_next;
819 
820 		if (cbp_next == NULL) {
821 			/*
822 			 * compute the overall size of the transaction
823 			 * in case we created one that has 'holes' in it
824 			 * 'total_size' represents the amount of I/O we
825 			 * did, not the span of the transaction w/r to the UPL
826 			 */
827 			transaction_size = cbp->b_uploffset + cbp->b_bcount - upl_offset;
828 		}
829 
830 		if (cbp != cbp_head) {
831 			free_io_buf(cbp);
832 		}
833 
834 		cbp = cbp_next;
835 	}
836 
837 	if (ISSET(b_flags, B_COMMIT_UPL)) {
838 		cluster_handle_associated_upl(iostate,
839 		    cbp_head->b_upl,
840 		    upl_offset,
841 		    transaction_size);
842 	}
843 
844 	if (error == 0 && total_resid) {
845 		error = EIO;
846 	}
847 
848 	if (error == 0) {
849 		int     (*cliodone_func)(buf_t, void *) = (int (*)(buf_t, void *))(cbp_head->b_cliodone);
850 
851 		if (cliodone_func != NULL) {
852 			cbp_head->b_bcount = transaction_size;
853 
854 			error = (*cliodone_func)(cbp_head, callback_arg);
855 		}
856 	}
857 	if (zero_offset) {
858 		cluster_zero(upl, zero_offset, PAGE_SIZE - (zero_offset & PAGE_MASK), real_bp);
859 	}
860 
861 	verify_ctx = cbp_head->b_attr.ba_verify_ctx;
862 	cbp_head->b_attr.ba_verify_ctx = NULL;
863 	if (verify_ctx) {
864 		vnode_verify_flags_t verify_flags = VNODE_VERIFY_CONTEXT_FREE;
865 		caddr_t verify_buf = NULL;
866 		off_t start_off = cbp_head->b_lblkno * cbp_head->b_lblksize;
867 		size_t verify_length = transaction_size;
868 		vm_offset_t vaddr;
869 
870 		if (!error) {
871 			verify_flags |= VNODE_VERIFY_WITH_CONTEXT;
872 			error = ubc_upl_map_range(upl, upl_offset, round_page(transaction_size), VM_PROT_DEFAULT, &vaddr);    /* Map it in */
873 			if (error) {
874 				panic("ubc_upl_map_range returned error %d, upl = %p, upl_offset = %d, size = %d",
875 				    error, upl, (int)upl_offset, (int)round_page(transaction_size));
876 			} else {
877 				verify_buf = (caddr_t)vaddr;
878 			}
879 		}
880 
881 		error = VNOP_VERIFY(vp, start_off, (uint8_t *)verify_buf, verify_length, 0, &verify_ctx, verify_flags, NULL);
882 
883 		if (verify_buf) {
884 			(void)ubc_upl_unmap_range(upl, upl_offset, round_page(transaction_size));
885 			verify_buf = NULL;
886 		}
887 	}
888 
889 	free_io_buf(cbp_head);
890 
891 	if (iostate) {
892 		int need_wakeup = 0;
893 
894 		/*
895 		 * someone has issued multiple I/Os asynchrounsly
896 		 * and is waiting for them to complete (streaming)
897 		 */
898 		lck_mtx_lock_spin(&iostate->io_mtxp);
899 
900 		if (error && iostate->io_error == 0) {
901 			iostate->io_error = error;
902 		}
903 
904 		iostate->io_completed += total_size;
905 
906 		if (iostate->io_wanted) {
907 			/*
908 			 * someone is waiting for the state of
909 			 * this io stream to change
910 			 */
911 			iostate->io_wanted = 0;
912 			need_wakeup = 1;
913 		}
914 		lck_mtx_unlock(&iostate->io_mtxp);
915 
916 		if (need_wakeup) {
917 			wakeup((caddr_t)&iostate->io_wanted);
918 		}
919 	}
920 
921 	if (b_flags & B_COMMIT_UPL) {
922 		pg_offset   = upl_offset & PAGE_MASK;
923 		commit_size = (pg_offset + transaction_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
924 
925 		if (error) {
926 			upl_set_iodone_error(upl, error);
927 
928 			upl_flags = cluster_ioerror(upl, upl_offset - pg_offset, commit_size, error, b_flags, vp);
929 		} else {
930 			upl_flags = UPL_COMMIT_FREE_ON_EMPTY;
931 
932 			if ((b_flags & B_PHYS) && (b_flags & B_READ)) {
933 				upl_flags |= UPL_COMMIT_SET_DIRTY;
934 			}
935 
936 			if (b_flags & B_AGE) {
937 				upl_flags |= UPL_COMMIT_INACTIVATE;
938 			}
939 
940 			ubc_upl_commit_range(upl, upl_offset - pg_offset, commit_size, upl_flags);
941 		}
942 	}
943 	if (real_bp) {
944 		if (error) {
945 			real_bp->b_flags |= B_ERROR;
946 			real_bp->b_error = error;
947 		}
948 		real_bp->b_resid = total_resid;
949 
950 		buf_biodone(real_bp);
951 	}
952 	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
953 	    upl, upl_offset - pg_offset, commit_size, (error << 24) | upl_flags, 0);
954 
955 	return error;
956 }
957 
958 
959 uint32_t
cluster_throttle_io_limit(vnode_t vp,uint32_t * limit)960 cluster_throttle_io_limit(vnode_t vp, uint32_t *limit)
961 {
962 	if (cluster_is_throttled(vp)) {
963 		*limit = THROTTLE_MAX_IOSIZE;
964 		return 1;
965 	}
966 	return 0;
967 }
968 
969 
970 void
cluster_zero(upl_t upl,upl_offset_t upl_offset,int size,buf_t bp)971 cluster_zero(upl_t upl, upl_offset_t upl_offset, int size, buf_t bp)
972 {
973 	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_START,
974 	    upl_offset, size, bp, 0, 0);
975 
976 	if (bp == NULL || bp->b_datap == 0) {
977 		upl_page_info_t *pl;
978 		addr64_t        zero_addr;
979 
980 		pl = ubc_upl_pageinfo(upl);
981 
982 		if (upl_device_page(pl) == TRUE) {
983 			zero_addr = ((addr64_t)upl_phys_page(pl, 0) << PAGE_SHIFT) + upl_offset;
984 
985 			bzero_phys_nc(zero_addr, size);
986 		} else {
987 			while (size) {
988 				int     page_offset;
989 				int     page_index;
990 				int     zero_cnt;
991 
992 				page_index  = upl_offset / PAGE_SIZE;
993 				page_offset = upl_offset & PAGE_MASK;
994 
995 				zero_addr = ((addr64_t)upl_phys_page(pl, page_index) << PAGE_SHIFT) + page_offset;
996 				zero_cnt  = min(PAGE_SIZE - page_offset, size);
997 
998 				bzero_phys(zero_addr, zero_cnt);
999 
1000 				size       -= zero_cnt;
1001 				upl_offset += zero_cnt;
1002 			}
1003 		}
1004 	} else {
1005 		bzero((caddr_t)((vm_offset_t)bp->b_datap + upl_offset), size);
1006 	}
1007 
1008 	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_END,
1009 	    upl_offset, size, 0, 0, 0);
1010 }
1011 
1012 
1013 static void
cluster_EOT(buf_t cbp_head,buf_t cbp_tail,int zero_offset,size_t verify_block_size)1014 cluster_EOT(buf_t cbp_head, buf_t cbp_tail, int zero_offset, size_t verify_block_size)
1015 {
1016 	/*
1017 	 * We will assign a verification context to cbp_head.
1018 	 * This will be passed back to the filesystem  when
1019 	 * verifying (in cluster_iodone).
1020 	 */
1021 	if (verify_block_size) {
1022 		off_t start_off = cbp_head->b_lblkno * cbp_head->b_lblksize;
1023 		size_t length;
1024 		void *verify_ctx = NULL;
1025 		int error = 0;
1026 		vnode_t vp = buf_vnode(cbp_head);
1027 
1028 		if (cbp_head == cbp_tail) {
1029 			length = cbp_head->b_bcount;
1030 		} else {
1031 			length = ((cbp_tail->b_lblkno * cbp_tail->b_lblksize) + cbp_tail->b_bcount) - start_off;
1032 		}
1033 
1034 		/*
1035 		 * zero_offset is non zero for the transaction containing the EOF
1036 		 * (if the filesize is not page aligned). In that case we might
1037 		 * have the transaction size not be page/verify block size aligned
1038 		 */
1039 		if ((zero_offset == 0) &&
1040 		    ((length < verify_block_size) || (length % verify_block_size)) != 0) {
1041 			panic("%s length = %zu, verify_block_size = %zu",
1042 			    __FUNCTION__, length, verify_block_size);
1043 		}
1044 
1045 		error = VNOP_VERIFY(vp, start_off, NULL, length,
1046 		    &verify_block_size, &verify_ctx, VNODE_VERIFY_CONTEXT_ALLOC, NULL);
1047 
1048 		if (!verify_ctx) {
1049 			if (!error && verify_block_size) {
1050 				/*
1051 				 * fetch the verify block size again, it is
1052 				 * possible that the verification was turned off
1053 				 * in the filesystem between the time it was
1054 				 * checked last and now.
1055 				 */
1056 				error = VNOP_VERIFY(vp, start_off, NULL, 0, &verify_block_size, NULL, VNODE_VERIFY_DEFAULT, NULL);
1057 			}
1058 
1059 			if (error || verify_block_size) {
1060 				panic("No verify context for vp = %p, start_off = %lld, length = %zu, error = %d",
1061 				    buf_vnode(cbp_head), start_off, length, error);
1062 			}
1063 		}
1064 
1065 		cbp_head->b_attr.ba_verify_ctx = verify_ctx;
1066 	} else {
1067 		cbp_head->b_attr.ba_verify_ctx = NULL;
1068 	}
1069 
1070 	cbp_head->b_validend = zero_offset;
1071 	cbp_tail->b_flags |= B_EOT;
1072 }
1073 
1074 static void
cluster_wait_IO(buf_t cbp_head,int async)1075 cluster_wait_IO(buf_t cbp_head, int async)
1076 {
1077 	buf_t   cbp;
1078 
1079 	if (async) {
1080 		/*
1081 		 * Async callback completion will not normally generate a
1082 		 * wakeup upon I/O completion.  To get woken up, we set
1083 		 * b_trans_next (which is safe for us to modify) on the last
1084 		 * buffer to CLUSTER_IO_WAITING so that cluster_iodone knows
1085 		 * to wake us up when all buffers as part of this transaction
1086 		 * are completed.  This is done under the umbrella of
1087 		 * cl_transaction_mtxp which is also taken in cluster_iodone.
1088 		 */
1089 		bool done = true;
1090 		buf_t last = NULL;
1091 
1092 		lck_mtx_lock_spin(&cl_transaction_mtxp);
1093 
1094 		for (cbp = cbp_head; cbp; last = cbp, cbp = cbp->b_trans_next) {
1095 			if (!ISSET(cbp->b_flags, B_TDONE)) {
1096 				done = false;
1097 			}
1098 		}
1099 
1100 		if (!done) {
1101 			last->b_trans_next = CLUSTER_IO_WAITING;
1102 
1103 			DTRACE_IO1(wait__start, buf_t, last);
1104 			do {
1105 				msleep(last, &cl_transaction_mtxp, PSPIN | (PRIBIO + 1), "cluster_wait_IO", NULL);
1106 
1107 				/*
1108 				 * We should only have been woken up if all the
1109 				 * buffers are completed, but just in case...
1110 				 */
1111 				done = true;
1112 				for (cbp = cbp_head; cbp != CLUSTER_IO_WAITING; cbp = cbp->b_trans_next) {
1113 					if (!ISSET(cbp->b_flags, B_TDONE)) {
1114 						done = false;
1115 						break;
1116 					}
1117 				}
1118 			} while (!done);
1119 			DTRACE_IO1(wait__done, buf_t, last);
1120 
1121 			last->b_trans_next = NULL;
1122 		}
1123 
1124 		lck_mtx_unlock(&cl_transaction_mtxp);
1125 	} else { // !async
1126 		for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) {
1127 			buf_biowait(cbp);
1128 		}
1129 	}
1130 }
1131 
1132 static void
cluster_complete_transaction(buf_t * cbp_head,void * callback_arg,int * retval,int flags,int needwait)1133 cluster_complete_transaction(buf_t *cbp_head, void *callback_arg, int *retval, int flags, int needwait)
1134 {
1135 	buf_t   cbp;
1136 	int     error;
1137 	boolean_t isswapout = FALSE;
1138 
1139 	/*
1140 	 * cluster_complete_transaction will
1141 	 * only be called if we've issued a complete chain in synchronous mode
1142 	 * or, we've already done a cluster_wait_IO on an incomplete chain
1143 	 */
1144 	if (needwait) {
1145 		for (cbp = *cbp_head; cbp; cbp = cbp->b_trans_next) {
1146 			buf_biowait(cbp);
1147 		}
1148 	}
1149 	/*
1150 	 * we've already waited on all of the I/Os in this transaction,
1151 	 * so mark all of the buf_t's in this transaction as B_TDONE
1152 	 * so that cluster_iodone sees the transaction as completed
1153 	 */
1154 	for (cbp = *cbp_head; cbp; cbp = cbp->b_trans_next) {
1155 		cbp->b_flags |= B_TDONE;
1156 	}
1157 	cbp = *cbp_head;
1158 
1159 	if ((flags & (CL_ASYNC | CL_PAGEOUT)) == CL_PAGEOUT && vnode_isswap(cbp->b_vp)) {
1160 		isswapout = TRUE;
1161 	}
1162 
1163 	error = cluster_iodone(cbp, callback_arg);
1164 
1165 	if (!(flags & CL_ASYNC) && error && *retval == 0) {
1166 		if (((flags & (CL_PAGEOUT | CL_KEEPCACHED)) != CL_PAGEOUT) || (error != ENXIO)) {
1167 			*retval = error;
1168 		} else if (isswapout == TRUE) {
1169 			*retval = error;
1170 		}
1171 	}
1172 	*cbp_head = (buf_t)NULL;
1173 }
1174 
1175 
1176 static int
cluster_io(vnode_t vp,upl_t upl,vm_offset_t upl_offset,off_t f_offset,int non_rounded_size,int flags,buf_t real_bp,struct clios * iostate,int (* callback)(buf_t,void *),void * callback_arg)1177 cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int non_rounded_size,
1178     int flags, buf_t real_bp, struct clios *iostate, int (*callback)(buf_t, void *), void *callback_arg)
1179 {
1180 	buf_t   cbp;
1181 	u_int   size;
1182 	u_int   io_size;
1183 	int     io_flags;
1184 	int     bmap_flags;
1185 	int     error = 0;
1186 	int     retval = 0;
1187 	buf_t   cbp_head = NULL;
1188 	buf_t   cbp_tail = NULL;
1189 	int     trans_count = 0;
1190 	int     max_trans_count;
1191 	u_int   pg_count;
1192 	int     pg_offset;
1193 	u_int   max_iosize;
1194 	u_int   max_vectors;
1195 	int     priv;
1196 	int     zero_offset = 0;
1197 	int     async_throttle = 0;
1198 	mount_t mp;
1199 	vm_offset_t upl_end_offset;
1200 	boolean_t   need_EOT = FALSE;
1201 	size_t verify_block_size = 0;
1202 
1203 	/*
1204 	 * we currently don't support buffers larger than a page
1205 	 */
1206 	if (real_bp && non_rounded_size > PAGE_SIZE) {
1207 		panic("%s(): Called with real buffer of size %d bytes which "
1208 		    "is greater than the maximum allowed size of "
1209 		    "%d bytes (the system PAGE_SIZE).\n",
1210 		    __FUNCTION__, non_rounded_size, PAGE_SIZE);
1211 	}
1212 
1213 	mp = vp->v_mount;
1214 
1215 	/*
1216 	 * we don't want to do any funny rounding of the size for IO requests
1217 	 * coming through the DIRECT or CONTIGUOUS paths...  those pages don't
1218 	 * belong to us... we can't extend (nor do we need to) the I/O to fill
1219 	 * out a page
1220 	 */
1221 	if (mp->mnt_devblocksize > 1 && !(flags & (CL_DEV_MEMORY | CL_DIRECT_IO))) {
1222 		/*
1223 		 * round the requested size up so that this I/O ends on a
1224 		 * page boundary in case this is a 'write'... if the filesystem
1225 		 * has blocks allocated to back the page beyond the EOF, we want to
1226 		 * make sure to write out the zero's that are sitting beyond the EOF
1227 		 * so that in case the filesystem doesn't explicitly zero this area
1228 		 * if a hole is created via a lseek/write beyond the current EOF,
1229 		 * it will return zeros when it's read back from the disk.  If the
1230 		 * physical allocation doesn't extend for the whole page, we'll
1231 		 * only write/read from the disk up to the end of this allocation
1232 		 * via the extent info returned from the VNOP_BLOCKMAP call.
1233 		 */
1234 		pg_offset = upl_offset & PAGE_MASK;
1235 
1236 		size = (((non_rounded_size + pg_offset) + (PAGE_SIZE - 1)) & ~PAGE_MASK) - pg_offset;
1237 	} else {
1238 		/*
1239 		 * anyone advertising a blocksize of 1 byte probably
1240 		 * can't deal with us rounding up the request size
1241 		 * AFP is one such filesystem/device
1242 		 */
1243 		size = non_rounded_size;
1244 	}
1245 	upl_end_offset = upl_offset + size;
1246 
1247 	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_START, (int)f_offset, size, upl_offset, flags, 0);
1248 
1249 	/*
1250 	 * Set the maximum transaction size to the maximum desired number of
1251 	 * buffers.
1252 	 */
1253 	max_trans_count = 8;
1254 	if (flags & CL_DEV_MEMORY) {
1255 		max_trans_count = 16;
1256 	}
1257 
1258 	if (flags & CL_READ) {
1259 		io_flags = B_READ;
1260 		bmap_flags = VNODE_READ;
1261 
1262 		max_iosize  = mp->mnt_maxreadcnt;
1263 		max_vectors = mp->mnt_segreadcnt;
1264 
1265 		if ((flags & CL_PAGEIN) && /* Cluster layer verification will be limited to pagein for now */
1266 		    !(mp->mnt_kern_flag & MNTK_VIRTUALDEV) &&
1267 		    (VNOP_VERIFY(vp, f_offset, NULL, 0, &verify_block_size, NULL, VNODE_VERIFY_DEFAULT, NULL) == 0) &&
1268 		    verify_block_size) {
1269 			if (verify_block_size != PAGE_SIZE) {
1270 				verify_block_size = 0;
1271 			}
1272 			if (real_bp && verify_block_size) {
1273 				panic("%s(): Called with real buffer and needs verification ",
1274 				    __FUNCTION__);
1275 			}
1276 		}
1277 	} else {
1278 		io_flags = B_WRITE;
1279 		bmap_flags = VNODE_WRITE;
1280 
1281 		max_iosize  = mp->mnt_maxwritecnt;
1282 		max_vectors = mp->mnt_segwritecnt;
1283 	}
1284 	if (verify_block_size) {
1285 		bmap_flags |= VNODE_CLUSTER_VERIFY;
1286 	}
1287 	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_NONE, max_iosize, max_vectors, mp->mnt_devblocksize, 0, 0);
1288 
1289 	/*
1290 	 * make sure the maximum iosize is a
1291 	 * multiple of the page size
1292 	 */
1293 	max_iosize  &= ~PAGE_MASK;
1294 
1295 	/*
1296 	 * Ensure the maximum iosize is sensible.
1297 	 */
1298 	if (!max_iosize) {
1299 		max_iosize = PAGE_SIZE;
1300 	}
1301 
1302 	if (flags & CL_THROTTLE) {
1303 		if (!(flags & CL_PAGEOUT) && cluster_is_throttled(vp)) {
1304 			if (max_iosize > THROTTLE_MAX_IOSIZE) {
1305 				max_iosize = THROTTLE_MAX_IOSIZE;
1306 			}
1307 			async_throttle = THROTTLE_MAXCNT;
1308 		} else {
1309 			if ((flags & CL_DEV_MEMORY)) {
1310 				async_throttle = IO_SCALE(vp, VNODE_ASYNC_THROTTLE);
1311 			} else {
1312 				u_int max_cluster;
1313 				u_int max_cluster_size;
1314 				u_int scale;
1315 
1316 				if (vp->v_mount->mnt_minsaturationbytecount) {
1317 					max_cluster_size = vp->v_mount->mnt_minsaturationbytecount;
1318 
1319 					scale = 1;
1320 				} else {
1321 					max_cluster_size = MAX_CLUSTER_SIZE(vp);
1322 
1323 					if (disk_conditioner_mount_is_ssd(vp->v_mount)) {
1324 						scale = WRITE_THROTTLE_SSD;
1325 					} else {
1326 						scale = WRITE_THROTTLE;
1327 					}
1328 				}
1329 				if (max_iosize > max_cluster_size) {
1330 					max_cluster = max_cluster_size;
1331 				} else {
1332 					max_cluster = max_iosize;
1333 				}
1334 
1335 				if (size < max_cluster) {
1336 					max_cluster = size;
1337 				}
1338 
1339 				if (flags & CL_CLOSE) {
1340 					scale += MAX_CLUSTERS;
1341 				}
1342 
1343 				async_throttle = min(IO_SCALE(vp, VNODE_ASYNC_THROTTLE), ((scale * max_cluster_size) / max_cluster) - 1);
1344 			}
1345 		}
1346 	}
1347 	if (flags & CL_AGE) {
1348 		io_flags |= B_AGE;
1349 	}
1350 	if (flags & (CL_PAGEIN | CL_PAGEOUT)) {
1351 		io_flags |= B_PAGEIO;
1352 	}
1353 	if (flags & (CL_IOSTREAMING)) {
1354 		io_flags |= B_IOSTREAMING;
1355 	}
1356 	if (flags & CL_COMMIT) {
1357 		io_flags |= B_COMMIT_UPL;
1358 	}
1359 	if (flags & CL_DIRECT_IO) {
1360 		io_flags |= B_PHYS;
1361 	}
1362 	if (flags & (CL_PRESERVE | CL_KEEPCACHED)) {
1363 		io_flags |= B_CACHE;
1364 	}
1365 	if (flags & CL_PASSIVE) {
1366 		io_flags |= B_PASSIVE;
1367 	}
1368 	if (flags & CL_ENCRYPTED) {
1369 		io_flags |= B_ENCRYPTED_IO;
1370 	}
1371 
1372 	if (vp->v_flag & VSYSTEM) {
1373 		io_flags |= B_META;
1374 	}
1375 
1376 	if ((flags & CL_READ) && ((upl_offset + non_rounded_size) & PAGE_MASK) && (!(flags & CL_NOZERO))) {
1377 		/*
1378 		 * then we are going to end up
1379 		 * with a page that we can't complete (the file size wasn't a multiple
1380 		 * of PAGE_SIZE and we're trying to read to the end of the file
1381 		 * so we'll go ahead and zero out the portion of the page we can't
1382 		 * read in from the file
1383 		 */
1384 		zero_offset = (int)(upl_offset + non_rounded_size);
1385 	} else if (!ISSET(flags, CL_READ) && ISSET(flags, CL_DIRECT_IO)) {
1386 		assert(ISSET(flags, CL_COMMIT));
1387 
1388 		// For a direct/uncached write, we need to lock pages...
1389 
1390 		upl_t cached_upl;
1391 
1392 		/*
1393 		 * Create a UPL to lock the pages in the cache whilst the
1394 		 * write is in progress.
1395 		 */
1396 		ubc_create_upl_kernel(vp, f_offset, non_rounded_size, &cached_upl,
1397 		    NULL, UPL_SET_LITE, VM_KERN_MEMORY_FILE);
1398 
1399 		/*
1400 		 * Attach this UPL to the other UPL so that we can find it
1401 		 * later.
1402 		 */
1403 		upl_set_associated_upl(upl, cached_upl);
1404 
1405 		if (upl_offset & PAGE_MASK) {
1406 			/*
1407 			 * The two UPLs are not aligned, so mark the first page in
1408 			 * @upl so that cluster_handle_associated_upl can handle
1409 			 * it accordingly.
1410 			 */
1411 			upl_page_info_t *pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
1412 			upl_page_set_mark(pl, 0, true);
1413 		}
1414 	}
1415 
1416 	while (size) {
1417 		daddr64_t blkno;
1418 		daddr64_t lblkno;
1419 		size_t  io_size_tmp;
1420 		u_int   io_size_wanted;
1421 		uint32_t lblksize;
1422 
1423 		if (size > max_iosize) {
1424 			io_size = max_iosize;
1425 		} else {
1426 			io_size = size;
1427 		}
1428 
1429 		io_size_wanted = io_size;
1430 		io_size_tmp = (size_t)io_size;
1431 
1432 		if ((error = VNOP_BLOCKMAP(vp, f_offset, io_size, &blkno, &io_size_tmp, NULL, bmap_flags, NULL))) {
1433 			break;
1434 		}
1435 
1436 		if (io_size_tmp > io_size_wanted) {
1437 			io_size = io_size_wanted;
1438 		} else {
1439 			io_size = (u_int)io_size_tmp;
1440 		}
1441 
1442 		if (real_bp && (real_bp->b_blkno == real_bp->b_lblkno)) {
1443 			real_bp->b_blkno = blkno;
1444 		}
1445 
1446 		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 24)) | DBG_FUNC_NONE,
1447 		    (int)f_offset, (int)(blkno >> 32), (int)blkno, io_size, 0);
1448 
1449 		if (io_size == 0) {
1450 			/*
1451 			 * vnop_blockmap didn't return an error... however, it did
1452 			 * return an extent size of 0 which means we can't
1453 			 * make forward progress on this I/O... a hole in the
1454 			 * file would be returned as a blkno of -1 with a non-zero io_size
1455 			 * a real extent is returned with a blkno != -1 and a non-zero io_size
1456 			 */
1457 			error = EINVAL;
1458 			break;
1459 		}
1460 		if (!(flags & CL_READ) && blkno == -1) {
1461 			off_t   e_offset;
1462 			int     pageout_flags;
1463 
1464 			if (upl_get_internal_vectorupl(upl)) {
1465 				panic("Vector UPLs should not take this code-path");
1466 			}
1467 			/*
1468 			 * we're writing into a 'hole'
1469 			 */
1470 			if (flags & CL_PAGEOUT) {
1471 				/*
1472 				 * if we got here via cluster_pageout
1473 				 * then just error the request and return
1474 				 * the 'hole' should already have been covered
1475 				 */
1476 				error = EINVAL;
1477 				break;
1478 			}
1479 			/*
1480 			 * we can get here if the cluster code happens to
1481 			 * pick up a page that was dirtied via mmap vs
1482 			 * a 'write' and the page targets a 'hole'...
1483 			 * i.e. the writes to the cluster were sparse
1484 			 * and the file was being written for the first time
1485 			 *
1486 			 * we can also get here if the filesystem supports
1487 			 * 'holes' that are less than PAGE_SIZE.... because
1488 			 * we can't know if the range in the page that covers
1489 			 * the 'hole' has been dirtied via an mmap or not,
1490 			 * we have to assume the worst and try to push the
1491 			 * entire page to storage.
1492 			 *
1493 			 * Try paging out the page individually before
1494 			 * giving up entirely and dumping it (the pageout
1495 			 * path will insure that the zero extent accounting
1496 			 * has been taken care of before we get back into cluster_io)
1497 			 *
1498 			 * go direct to vnode_pageout so that we don't have to
1499 			 * unbusy the page from the UPL... we used to do this
1500 			 * so that we could call ubc_msync, but that results
1501 			 * in a potential deadlock if someone else races us to acquire
1502 			 * that page and wins and in addition needs one of the pages
1503 			 * we're continuing to hold in the UPL
1504 			 */
1505 			pageout_flags = UPL_MSYNC | UPL_VNODE_PAGER | UPL_NESTED_PAGEOUT;
1506 
1507 			if (!(flags & CL_ASYNC)) {
1508 				pageout_flags |= UPL_IOSYNC;
1509 			}
1510 			if (!(flags & CL_COMMIT)) {
1511 				pageout_flags |= UPL_NOCOMMIT;
1512 			}
1513 
1514 			if (cbp_head) {
1515 				buf_t prev_cbp;
1516 				uint32_t   bytes_in_last_page;
1517 
1518 				/*
1519 				 * first we have to wait for the the current outstanding I/Os
1520 				 * to complete... EOT hasn't been set yet on this transaction
1521 				 * so the pages won't be released
1522 				 */
1523 				cluster_wait_IO(cbp_head, (flags & CL_ASYNC));
1524 
1525 				bytes_in_last_page = cbp_head->b_uploffset & PAGE_MASK;
1526 				for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) {
1527 					bytes_in_last_page += cbp->b_bcount;
1528 				}
1529 				bytes_in_last_page &= PAGE_MASK;
1530 
1531 				while (bytes_in_last_page) {
1532 					/*
1533 					 * we've got a transcation that
1534 					 * includes the page we're about to push out through vnode_pageout...
1535 					 * find the bp's in the list which intersect this page and either
1536 					 * remove them entirely from the transaction (there could be multiple bp's), or
1537 					 * round it's iosize down to the page boundary (there can only be one)...
1538 					 *
1539 					 * find the last bp in the list and act on it
1540 					 */
1541 					for (prev_cbp = cbp = cbp_head; cbp->b_trans_next; cbp = cbp->b_trans_next) {
1542 						prev_cbp = cbp;
1543 					}
1544 
1545 					if (bytes_in_last_page >= cbp->b_bcount) {
1546 						/*
1547 						 * this buf no longer has any I/O associated with it
1548 						 */
1549 						bytes_in_last_page -= cbp->b_bcount;
1550 						cbp->b_bcount = 0;
1551 
1552 						free_io_buf(cbp);
1553 
1554 						if (cbp == cbp_head) {
1555 							assert(bytes_in_last_page == 0);
1556 							/*
1557 							 * the buf we just freed was the only buf in
1558 							 * this transaction... so there's no I/O to do
1559 							 */
1560 							cbp_head = NULL;
1561 							cbp_tail = NULL;
1562 						} else {
1563 							/*
1564 							 * remove the buf we just freed from
1565 							 * the transaction list
1566 							 */
1567 							prev_cbp->b_trans_next = NULL;
1568 							cbp_tail = prev_cbp;
1569 						}
1570 					} else {
1571 						/*
1572 						 * this is the last bp that has I/O
1573 						 * intersecting the page of interest
1574 						 * only some of the I/O is in the intersection
1575 						 * so clip the size but keep it in the transaction list
1576 						 */
1577 						cbp->b_bcount -= bytes_in_last_page;
1578 						cbp_tail = cbp;
1579 						bytes_in_last_page = 0;
1580 					}
1581 				}
1582 				if (cbp_head) {
1583 					/*
1584 					 * there was more to the current transaction
1585 					 * than just the page we are pushing out via vnode_pageout...
1586 					 * mark it as finished and complete it... we've already
1587 					 * waited for the I/Os to complete above in the call to cluster_wait_IO
1588 					 */
1589 					cluster_EOT(cbp_head, cbp_tail, 0, 0);
1590 
1591 					cluster_complete_transaction(&cbp_head, callback_arg, &retval, flags, 0);
1592 
1593 					trans_count = 0;
1594 				}
1595 			}
1596 			if (vnode_pageout(vp, upl, (upl_offset_t)trunc_page(upl_offset), trunc_page_64(f_offset), PAGE_SIZE, pageout_flags, NULL) != PAGER_SUCCESS) {
1597 				error = EINVAL;
1598 			}
1599 			e_offset = round_page_64(f_offset + 1);
1600 			io_size = (u_int)(e_offset - f_offset);
1601 
1602 			f_offset   += io_size;
1603 			upl_offset += io_size;
1604 
1605 			if (size >= io_size) {
1606 				size -= io_size;
1607 			} else {
1608 				size = 0;
1609 			}
1610 			/*
1611 			 * keep track of how much of the original request
1612 			 * that we've actually completed... non_rounded_size
1613 			 * may go negative due to us rounding the request
1614 			 * to a page size multiple (i.e.  size > non_rounded_size)
1615 			 */
1616 			non_rounded_size -= io_size;
1617 
1618 			if (non_rounded_size <= 0) {
1619 				/*
1620 				 * we've transferred all of the data in the original
1621 				 * request, but we were unable to complete the tail
1622 				 * of the last page because the file didn't have
1623 				 * an allocation to back that portion... this is ok.
1624 				 */
1625 				size = 0;
1626 			}
1627 			if (error) {
1628 				if (size == 0) {
1629 					flags &= ~CL_COMMIT;
1630 				}
1631 				break;
1632 			}
1633 			continue;
1634 		}
1635 
1636 		lblksize = CLUSTER_IO_BLOCK_SIZE;
1637 		lblkno = (daddr64_t)(f_offset / lblksize);
1638 
1639 		/*
1640 		 * we have now figured out how much I/O we can do - this is in 'io_size'
1641 		 * pg_offset is the starting point in the first page for the I/O
1642 		 * pg_count is the number of full and partial pages that 'io_size' encompasses
1643 		 */
1644 		pg_offset = upl_offset & PAGE_MASK;
1645 
1646 		if (flags & CL_DEV_MEMORY) {
1647 			/*
1648 			 * treat physical requests as one 'giant' page
1649 			 */
1650 			pg_count = 1;
1651 		} else {
1652 			pg_count  = (io_size + pg_offset + (PAGE_SIZE - 1)) / PAGE_SIZE;
1653 		}
1654 
1655 		if ((flags & CL_READ) && blkno == -1) {
1656 			vm_offset_t  commit_offset;
1657 			int bytes_to_zero;
1658 			int complete_transaction_now = 0;
1659 
1660 			/*
1661 			 * if we're reading and blkno == -1, then we've got a
1662 			 * 'hole' in the file that we need to deal with by zeroing
1663 			 * out the affected area in the upl
1664 			 */
1665 			if (io_size >= (u_int)non_rounded_size) {
1666 				/*
1667 				 * if this upl contains the EOF and it is not a multiple of PAGE_SIZE
1668 				 * than 'zero_offset' will be non-zero
1669 				 * if the 'hole' returned by vnop_blockmap extends all the way to the eof
1670 				 * (indicated by the io_size finishing off the I/O request for this UPL)
1671 				 * than we're not going to issue an I/O for the
1672 				 * last page in this upl... we need to zero both the hole and the tail
1673 				 * of the page beyond the EOF, since the delayed zero-fill won't kick in
1674 				 */
1675 				bytes_to_zero = non_rounded_size;
1676 				if (!(flags & CL_NOZERO)) {
1677 					bytes_to_zero = (int)((((upl_offset + io_size) + (PAGE_SIZE - 1)) & ~PAGE_MASK) - upl_offset);
1678 				}
1679 
1680 				zero_offset = 0;
1681 			} else {
1682 				bytes_to_zero = io_size;
1683 			}
1684 
1685 			pg_count = 0;
1686 
1687 			cluster_zero(upl, (upl_offset_t)upl_offset, bytes_to_zero, real_bp);
1688 
1689 			if (cbp_head) {
1690 				int     pg_resid;
1691 
1692 				/*
1693 				 * if there is a current I/O chain pending
1694 				 * then the first page of the group we just zero'd
1695 				 * will be handled by the I/O completion if the zero
1696 				 * fill started in the middle of the page
1697 				 */
1698 				commit_offset = (upl_offset + (PAGE_SIZE - 1)) & ~PAGE_MASK;
1699 
1700 				pg_resid = (int)(commit_offset - upl_offset);
1701 
1702 				if (bytes_to_zero >= pg_resid) {
1703 					/*
1704 					 * the last page of the current I/O
1705 					 * has been completed...
1706 					 * compute the number of fully zero'd
1707 					 * pages that are beyond it
1708 					 * plus the last page if its partial
1709 					 * and we have no more I/O to issue...
1710 					 * otherwise a partial page is left
1711 					 * to begin the next I/O
1712 					 */
1713 					if ((int)io_size >= non_rounded_size) {
1714 						pg_count = (bytes_to_zero - pg_resid + (PAGE_SIZE - 1)) / PAGE_SIZE;
1715 					} else {
1716 						pg_count = (bytes_to_zero - pg_resid) / PAGE_SIZE;
1717 					}
1718 
1719 					complete_transaction_now = 1;
1720 				}
1721 			} else {
1722 				/*
1723 				 * no pending I/O to deal with
1724 				 * so, commit all of the fully zero'd pages
1725 				 * plus the last page if its partial
1726 				 * and we have no more I/O to issue...
1727 				 * otherwise a partial page is left
1728 				 * to begin the next I/O
1729 				 */
1730 				if ((int)io_size >= non_rounded_size) {
1731 					pg_count = (pg_offset + bytes_to_zero + (PAGE_SIZE - 1)) / PAGE_SIZE;
1732 				} else {
1733 					pg_count = (pg_offset + bytes_to_zero) / PAGE_SIZE;
1734 				}
1735 
1736 				commit_offset = upl_offset & ~PAGE_MASK;
1737 			}
1738 
1739 			// Associated UPL is currently only used in the direct write path
1740 			assert(!upl_associated_upl(upl));
1741 
1742 			if ((flags & CL_COMMIT) && pg_count) {
1743 				ubc_upl_commit_range(upl, (upl_offset_t)commit_offset,
1744 				    pg_count * PAGE_SIZE,
1745 				    UPL_COMMIT_CLEAR_DIRTY | UPL_COMMIT_FREE_ON_EMPTY);
1746 			}
1747 			upl_offset += io_size;
1748 			f_offset   += io_size;
1749 			size       -= io_size;
1750 
1751 			/*
1752 			 * keep track of how much of the original request
1753 			 * that we've actually completed... non_rounded_size
1754 			 * may go negative due to us rounding the request
1755 			 * to a page size multiple (i.e.  size > non_rounded_size)
1756 			 */
1757 			non_rounded_size -= io_size;
1758 
1759 			if (non_rounded_size <= 0) {
1760 				/*
1761 				 * we've transferred all of the data in the original
1762 				 * request, but we were unable to complete the tail
1763 				 * of the last page because the file didn't have
1764 				 * an allocation to back that portion... this is ok.
1765 				 */
1766 				size = 0;
1767 			}
1768 			if (cbp_head && (complete_transaction_now || size == 0)) {
1769 				cluster_wait_IO(cbp_head, (flags & CL_ASYNC));
1770 
1771 				cluster_EOT(cbp_head, cbp_tail, size == 0 ? zero_offset : 0, verify_block_size);
1772 
1773 				cluster_complete_transaction(&cbp_head, callback_arg, &retval, flags, 0);
1774 
1775 				trans_count = 0;
1776 			}
1777 			continue;
1778 		}
1779 		if (pg_count > max_vectors) {
1780 			if (((pg_count - max_vectors) * PAGE_SIZE) > io_size) {
1781 				io_size = PAGE_SIZE - pg_offset;
1782 				pg_count = 1;
1783 			} else {
1784 				io_size -= (pg_count - max_vectors) * PAGE_SIZE;
1785 				pg_count = max_vectors;
1786 			}
1787 		}
1788 		/*
1789 		 * If the transaction is going to reach the maximum number of
1790 		 * desired elements, truncate the i/o to the nearest page so
1791 		 * that the actual i/o is initiated after this buffer is
1792 		 * created and added to the i/o chain.
1793 		 *
1794 		 * I/O directed to physically contiguous memory
1795 		 * doesn't have a requirement to make sure we 'fill' a page
1796 		 */
1797 		if (!(flags & CL_DEV_MEMORY) && trans_count >= max_trans_count &&
1798 		    ((upl_offset + io_size) & PAGE_MASK)) {
1799 			vm_offset_t aligned_ofs;
1800 
1801 			aligned_ofs = (upl_offset + io_size) & ~PAGE_MASK;
1802 			/*
1803 			 * If the io_size does not actually finish off even a
1804 			 * single page we have to keep adding buffers to the
1805 			 * transaction despite having reached the desired limit.
1806 			 *
1807 			 * Eventually we get here with the page being finished
1808 			 * off (and exceeded) and then we truncate the size of
1809 			 * this i/o request so that it is page aligned so that
1810 			 * we can finally issue the i/o on the transaction.
1811 			 */
1812 			if (aligned_ofs > upl_offset) {
1813 				io_size = (u_int)(aligned_ofs - upl_offset);
1814 				pg_count--;
1815 			}
1816 		}
1817 
1818 		if (!(mp->mnt_kern_flag & MNTK_VIRTUALDEV)) {
1819 			/*
1820 			 * if we're not targeting a virtual device i.e. a disk image
1821 			 * it's safe to dip into the reserve pool since real devices
1822 			 * can complete this I/O request without requiring additional
1823 			 * bufs from the alloc_io_buf pool
1824 			 */
1825 			priv = 1;
1826 		} else if ((flags & CL_ASYNC) && !(flags & CL_PAGEOUT) && !cbp_head) {
1827 			/*
1828 			 * Throttle the speculative IO
1829 			 *
1830 			 * We can only throttle this if it is the first iobuf
1831 			 * for the transaction. alloc_io_buf implements
1832 			 * additional restrictions for diskimages anyway.
1833 			 */
1834 			priv = 0;
1835 		} else {
1836 			priv = 1;
1837 		}
1838 
1839 		cbp = alloc_io_buf(vp, priv);
1840 
1841 		if (flags & CL_PAGEOUT) {
1842 			u_int i;
1843 
1844 			/*
1845 			 * since blocks are in offsets of lblksize (CLUSTER_IO_BLOCK_SIZE), scale
1846 			 * iteration to (PAGE_SIZE * pg_count) of blks.
1847 			 */
1848 			for (i = 0; i < (PAGE_SIZE * pg_count) / lblksize; i++) {
1849 				if (buf_invalblkno(vp, lblkno + i, 0) == EBUSY) {
1850 					panic("BUSY bp found in cluster_io");
1851 				}
1852 			}
1853 		}
1854 		if (flags & CL_ASYNC) {
1855 			if (buf_setcallback(cbp, (void *)cluster_iodone, callback_arg)) {
1856 				panic("buf_setcallback failed");
1857 			}
1858 		}
1859 		cbp->b_cliodone = (void *)callback;
1860 		cbp->b_flags |= io_flags;
1861 		if (flags & CL_NOCACHE) {
1862 			cbp->b_attr.ba_flags |= BA_NOCACHE;
1863 		}
1864 		if (verify_block_size) {
1865 			cbp->b_attr.ba_flags |= BA_WILL_VERIFY;
1866 		}
1867 
1868 		cbp->b_lblkno = lblkno;
1869 		cbp->b_lblksize = lblksize;
1870 		cbp->b_blkno  = blkno;
1871 		cbp->b_bcount = io_size;
1872 
1873 		if (buf_setupl(cbp, upl, (uint32_t)upl_offset)) {
1874 			panic("buf_setupl failed");
1875 		}
1876 #if CONFIG_IOSCHED
1877 		upl_set_blkno(upl, upl_offset, io_size, blkno);
1878 #endif
1879 		cbp->b_trans_next = (buf_t)NULL;
1880 
1881 		if ((cbp->b_iostate = (void *)iostate)) {
1882 			/*
1883 			 * caller wants to track the state of this
1884 			 * io... bump the amount issued against this stream
1885 			 */
1886 			iostate->io_issued += io_size;
1887 		}
1888 
1889 		if (flags & CL_READ) {
1890 			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 26)) | DBG_FUNC_NONE,
1891 			    (int)cbp->b_lblkno, (int)cbp->b_blkno, upl_offset, io_size, 0);
1892 		} else {
1893 			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 27)) | DBG_FUNC_NONE,
1894 			    (int)cbp->b_lblkno, (int)cbp->b_blkno, upl_offset, io_size, 0);
1895 		}
1896 
1897 		if (cbp_head) {
1898 			cbp_tail->b_trans_next = cbp;
1899 			cbp_tail = cbp;
1900 		} else {
1901 			cbp_head = cbp;
1902 			cbp_tail = cbp;
1903 
1904 			if ((cbp_head->b_real_bp = real_bp)) {
1905 				real_bp = (buf_t)NULL;
1906 			}
1907 		}
1908 		*(buf_t *)(&cbp->b_trans_head) = cbp_head;
1909 
1910 		trans_count++;
1911 
1912 		upl_offset += io_size;
1913 		f_offset   += io_size;
1914 		size       -= io_size;
1915 		/*
1916 		 * keep track of how much of the original request
1917 		 * that we've actually completed... non_rounded_size
1918 		 * may go negative due to us rounding the request
1919 		 * to a page size multiple (i.e.  size > non_rounded_size)
1920 		 */
1921 		non_rounded_size -= io_size;
1922 
1923 		if (non_rounded_size <= 0) {
1924 			/*
1925 			 * we've transferred all of the data in the original
1926 			 * request, but we were unable to complete the tail
1927 			 * of the last page because the file didn't have
1928 			 * an allocation to back that portion... this is ok.
1929 			 */
1930 			size = 0;
1931 		}
1932 		if (size == 0) {
1933 			/*
1934 			 * we have no more I/O to issue, so go
1935 			 * finish the final transaction
1936 			 */
1937 			need_EOT = TRUE;
1938 		} else if (((flags & CL_DEV_MEMORY) || (upl_offset & PAGE_MASK) == 0) &&
1939 		    ((flags & CL_ASYNC) || trans_count > max_trans_count)) {
1940 			/*
1941 			 * I/O directed to physically contiguous memory...
1942 			 * which doesn't have a requirement to make sure we 'fill' a page
1943 			 * or...
1944 			 * the current I/O we've prepared fully
1945 			 * completes the last page in this request
1946 			 * and ...
1947 			 * it's either an ASYNC request or
1948 			 * we've already accumulated more than 8 I/O's into
1949 			 * this transaction so mark it as complete so that
1950 			 * it can finish asynchronously or via the cluster_complete_transaction
1951 			 * below if the request is synchronous
1952 			 */
1953 			need_EOT = TRUE;
1954 		}
1955 		if (need_EOT == TRUE) {
1956 			cluster_EOT(cbp_head, cbp_tail, size == 0 ? zero_offset : 0, verify_block_size);
1957 		}
1958 
1959 		if (flags & CL_THROTTLE) {
1960 			(void)vnode_waitforwrites(vp, async_throttle, 0, 0, "cluster_io");
1961 		}
1962 
1963 		if (!(io_flags & B_READ)) {
1964 			vnode_startwrite(vp);
1965 		}
1966 
1967 		if (flags & CL_RAW_ENCRYPTED) {
1968 			/*
1969 			 * User requested raw encrypted bytes.
1970 			 * Twiddle the bit in the ba_flags for the buffer
1971 			 */
1972 			cbp->b_attr.ba_flags |= BA_RAW_ENCRYPTED_IO;
1973 		}
1974 
1975 		(void) VNOP_STRATEGY(cbp);
1976 
1977 		if (need_EOT == TRUE) {
1978 			if (!(flags & CL_ASYNC)) {
1979 				cluster_complete_transaction(&cbp_head, callback_arg, &retval, flags, 1);
1980 			}
1981 
1982 			need_EOT = FALSE;
1983 			trans_count = 0;
1984 			cbp_head = NULL;
1985 		}
1986 	}
1987 	if (error) {
1988 		int abort_size;
1989 
1990 		io_size = 0;
1991 
1992 		if (cbp_head) {
1993 			/*
1994 			 * Wait until all of the outstanding I/O
1995 			 * for this partial transaction has completed
1996 			 */
1997 			cluster_wait_IO(cbp_head, (flags & CL_ASYNC));
1998 
1999 			/*
2000 			 * Rewind the upl offset to the beginning of the
2001 			 * transaction.
2002 			 */
2003 			upl_offset = cbp_head->b_uploffset;
2004 		}
2005 
2006 		if (ISSET(flags, CL_COMMIT)) {
2007 			cluster_handle_associated_upl(iostate, upl,
2008 			    (upl_offset_t)upl_offset,
2009 			    (upl_size_t)(upl_end_offset - upl_offset));
2010 		}
2011 
2012 		// Free all the IO buffers in this transaction
2013 		for (cbp = cbp_head; cbp;) {
2014 			buf_t   cbp_next;
2015 
2016 			size       += cbp->b_bcount;
2017 			io_size    += cbp->b_bcount;
2018 
2019 			cbp_next = cbp->b_trans_next;
2020 			free_io_buf(cbp);
2021 			cbp = cbp_next;
2022 		}
2023 
2024 		if (iostate) {
2025 			int need_wakeup = 0;
2026 
2027 			/*
2028 			 * update the error condition for this stream
2029 			 * since we never really issued the io
2030 			 * just go ahead and adjust it back
2031 			 */
2032 			lck_mtx_lock_spin(&iostate->io_mtxp);
2033 
2034 			if (iostate->io_error == 0) {
2035 				iostate->io_error = error;
2036 			}
2037 			iostate->io_issued -= io_size;
2038 
2039 			if (iostate->io_wanted) {
2040 				/*
2041 				 * someone is waiting for the state of
2042 				 * this io stream to change
2043 				 */
2044 				iostate->io_wanted = 0;
2045 				need_wakeup = 1;
2046 			}
2047 			lck_mtx_unlock(&iostate->io_mtxp);
2048 
2049 			if (need_wakeup) {
2050 				wakeup((caddr_t)&iostate->io_wanted);
2051 			}
2052 		}
2053 
2054 		if (flags & CL_COMMIT) {
2055 			int     upl_flags;
2056 
2057 			pg_offset  = upl_offset & PAGE_MASK;
2058 			abort_size = (int)((upl_end_offset - upl_offset + PAGE_MASK) & ~PAGE_MASK);
2059 
2060 			upl_flags = cluster_ioerror(upl, (int)(upl_offset - pg_offset),
2061 			    abort_size, error, io_flags, vp);
2062 
2063 			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 28)) | DBG_FUNC_NONE,
2064 			    upl, upl_offset - pg_offset, abort_size, (error << 24) | upl_flags, 0);
2065 		}
2066 		if (retval == 0) {
2067 			retval = error;
2068 		}
2069 	} else if (cbp_head) {
2070 		panic("%s(): cbp_head is not NULL.", __FUNCTION__);
2071 	}
2072 
2073 	if (real_bp) {
2074 		/*
2075 		 * can get here if we either encountered an error
2076 		 * or we completely zero-filled the request and
2077 		 * no I/O was issued
2078 		 */
2079 		if (error) {
2080 			real_bp->b_flags |= B_ERROR;
2081 			real_bp->b_error = error;
2082 		}
2083 		buf_biodone(real_bp);
2084 	}
2085 	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_END, (int)f_offset, size, upl_offset, retval, 0);
2086 
2087 	return retval;
2088 }
2089 
2090 #define reset_vector_run_state()                                                                                \
2091 	issueVectorUPL = vector_upl_offset = vector_upl_index = vector_upl_iosize = vector_upl_size = 0;
2092 
2093 static int
vector_cluster_io(vnode_t vp,upl_t vector_upl,vm_offset_t vector_upl_offset,off_t v_upl_uio_offset,int vector_upl_iosize,int io_flag,buf_t real_bp,struct clios * iostate,int (* callback)(buf_t,void *),void * callback_arg)2094 vector_cluster_io(vnode_t vp, upl_t vector_upl, vm_offset_t vector_upl_offset, off_t v_upl_uio_offset, int vector_upl_iosize,
2095     int io_flag, buf_t real_bp, struct clios *iostate, int (*callback)(buf_t, void *), void *callback_arg)
2096 {
2097 	vector_upl_set_pagelist(vector_upl);
2098 
2099 	if (io_flag & CL_READ) {
2100 		if (vector_upl_offset == 0 && ((vector_upl_iosize & PAGE_MASK) == 0)) {
2101 			io_flag &= ~CL_PRESERVE; /*don't zero fill*/
2102 		} else {
2103 			io_flag |= CL_PRESERVE; /*zero fill*/
2104 		}
2105 	}
2106 	return cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, real_bp, iostate, callback, callback_arg);
2107 }
2108 
2109 static int
cluster_read_prefetch(vnode_t vp,off_t f_offset,u_int size,off_t filesize,int (* callback)(buf_t,void *),void * callback_arg,int bflag)2110 cluster_read_prefetch(vnode_t vp, off_t f_offset, u_int size, off_t filesize, int (*callback)(buf_t, void *), void *callback_arg, int bflag)
2111 {
2112 	int           pages_in_prefetch;
2113 
2114 	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_START,
2115 	    (int)f_offset, size, (int)filesize, 0, 0);
2116 
2117 	if (f_offset >= filesize) {
2118 		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
2119 		    (int)f_offset, 0, 0, 0, 0);
2120 		return 0;
2121 	}
2122 	if ((off_t)size > (filesize - f_offset)) {
2123 		size = (u_int)(filesize - f_offset);
2124 	}
2125 	pages_in_prefetch = (size + (PAGE_SIZE - 1)) / PAGE_SIZE;
2126 
2127 	advisory_read_ext(vp, filesize, f_offset, size, callback, callback_arg, bflag);
2128 
2129 	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
2130 	    (int)f_offset + size, pages_in_prefetch, 0, 1, 0);
2131 
2132 	return pages_in_prefetch;
2133 }
2134 
2135 
2136 
2137 static void
cluster_read_ahead(vnode_t vp,struct cl_extent * extent,off_t filesize,struct cl_readahead * rap,int (* callback)(buf_t,void *),void * callback_arg,int bflag)2138 cluster_read_ahead(vnode_t vp, struct cl_extent *extent, off_t filesize, struct cl_readahead *rap, int (*callback)(buf_t, void *), void *callback_arg,
2139     int bflag)
2140 {
2141 	daddr64_t       r_addr;
2142 	off_t           f_offset;
2143 	int             size_of_prefetch;
2144 	u_int           max_prefetch;
2145 
2146 
2147 	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_START,
2148 	    (int)extent->b_addr, (int)extent->e_addr, (int)rap->cl_lastr, 0, 0);
2149 
2150 	if (extent->b_addr == rap->cl_lastr && extent->b_addr == extent->e_addr) {
2151 		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
2152 		    rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 0, 0);
2153 		return;
2154 	}
2155 	if (rap->cl_lastr == -1 || (extent->b_addr != rap->cl_lastr && extent->b_addr != (rap->cl_lastr + 1))) {
2156 		rap->cl_ralen = 0;
2157 		rap->cl_maxra = 0;
2158 
2159 		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
2160 		    rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 1, 0);
2161 
2162 		return;
2163 	}
2164 	max_prefetch = MAX_PREFETCH(vp, cluster_max_io_size(vp->v_mount, CL_READ), disk_conditioner_mount_is_ssd(vp->v_mount));
2165 
2166 	if (max_prefetch > speculative_prefetch_max) {
2167 		max_prefetch = speculative_prefetch_max;
2168 	}
2169 
2170 	if (max_prefetch <= PAGE_SIZE) {
2171 		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
2172 		    rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 6, 0);
2173 		return;
2174 	}
2175 	if (extent->e_addr < rap->cl_maxra && rap->cl_ralen >= 4) {
2176 		if ((rap->cl_maxra - extent->e_addr) > (rap->cl_ralen / 4)) {
2177 			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
2178 			    rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 2, 0);
2179 			return;
2180 		}
2181 	}
2182 	r_addr = MAX(extent->e_addr, rap->cl_maxra) + 1;
2183 	f_offset = (off_t)(r_addr * PAGE_SIZE_64);
2184 
2185 	size_of_prefetch = 0;
2186 
2187 	ubc_range_op(vp, f_offset, f_offset + PAGE_SIZE_64, UPL_ROP_PRESENT, &size_of_prefetch);
2188 
2189 	if (size_of_prefetch) {
2190 		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
2191 		    rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 3, 0);
2192 		return;
2193 	}
2194 	if (f_offset < filesize) {
2195 		daddr64_t read_size;
2196 
2197 		rap->cl_ralen = rap->cl_ralen ? min(max_prefetch / PAGE_SIZE, rap->cl_ralen << 1) : 1;
2198 
2199 		read_size = (extent->e_addr + 1) - extent->b_addr;
2200 
2201 		if (read_size > rap->cl_ralen) {
2202 			if (read_size > max_prefetch / PAGE_SIZE) {
2203 				rap->cl_ralen = max_prefetch / PAGE_SIZE;
2204 			} else {
2205 				rap->cl_ralen = (int)read_size;
2206 			}
2207 		}
2208 		size_of_prefetch = cluster_read_prefetch(vp, f_offset, rap->cl_ralen * PAGE_SIZE, filesize, callback, callback_arg, bflag);
2209 
2210 		if (size_of_prefetch) {
2211 			rap->cl_maxra = (r_addr + size_of_prefetch) - 1;
2212 		}
2213 	}
2214 	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
2215 	    rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 4, 0);
2216 }
2217 
2218 
2219 int
cluster_pageout(vnode_t vp,upl_t upl,upl_offset_t upl_offset,off_t f_offset,int size,off_t filesize,int flags)2220 cluster_pageout(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset,
2221     int size, off_t filesize, int flags)
2222 {
2223 	return cluster_pageout_ext(vp, upl, upl_offset, f_offset, size, filesize, flags, NULL, NULL);
2224 }
2225 
2226 
2227 int
cluster_pageout_ext(vnode_t vp,upl_t upl,upl_offset_t upl_offset,off_t f_offset,int size,off_t filesize,int flags,int (* callback)(buf_t,void *),void * callback_arg)2228 cluster_pageout_ext(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset,
2229     int size, off_t filesize, int flags, int (*callback)(buf_t, void *), void *callback_arg)
2230 {
2231 	int           io_size;
2232 	int           rounded_size;
2233 	off_t         max_size;
2234 	int           local_flags;
2235 
2236 	local_flags = CL_PAGEOUT | CL_THROTTLE;
2237 
2238 	if ((flags & UPL_IOSYNC) == 0) {
2239 		local_flags |= CL_ASYNC;
2240 	}
2241 	if ((flags & UPL_NOCOMMIT) == 0) {
2242 		local_flags |= CL_COMMIT;
2243 	}
2244 	if ((flags & UPL_KEEPCACHED)) {
2245 		local_flags |= CL_KEEPCACHED;
2246 	}
2247 	if (flags & UPL_PAGING_ENCRYPTED) {
2248 		local_flags |= CL_ENCRYPTED;
2249 	}
2250 
2251 
2252 	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 52)) | DBG_FUNC_NONE,
2253 	    (int)f_offset, size, (int)filesize, local_flags, 0);
2254 
2255 	/*
2256 	 * If they didn't specify any I/O, then we are done...
2257 	 * we can't issue an abort because we don't know how
2258 	 * big the upl really is
2259 	 */
2260 	if (size <= 0) {
2261 		return EINVAL;
2262 	}
2263 
2264 	if (vp->v_mount->mnt_flag & MNT_RDONLY) {
2265 		if (local_flags & CL_COMMIT) {
2266 			ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
2267 		}
2268 		return EROFS;
2269 	}
2270 	/*
2271 	 * can't page-in from a negative offset
2272 	 * or if we're starting beyond the EOF
2273 	 * or if the file offset isn't page aligned
2274 	 * or the size requested isn't a multiple of PAGE_SIZE
2275 	 */
2276 	if (f_offset < 0 || f_offset >= filesize ||
2277 	    (f_offset & PAGE_MASK_64) || (size & PAGE_MASK)) {
2278 		if (local_flags & CL_COMMIT) {
2279 			ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
2280 		}
2281 		return EINVAL;
2282 	}
2283 	max_size = filesize - f_offset;
2284 
2285 	if (size < max_size) {
2286 		io_size = size;
2287 	} else {
2288 		io_size = (int)max_size;
2289 	}
2290 
2291 	rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
2292 
2293 	if (size > rounded_size) {
2294 		if (local_flags & CL_COMMIT) {
2295 			ubc_upl_abort_range(upl, upl_offset + rounded_size, size - rounded_size,
2296 			    UPL_ABORT_FREE_ON_EMPTY);
2297 		}
2298 	}
2299 	return cluster_io(vp, upl, upl_offset, f_offset, io_size,
2300 	           local_flags, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
2301 }
2302 
2303 
2304 int
cluster_pagein(vnode_t vp,upl_t upl,upl_offset_t upl_offset,off_t f_offset,int size,off_t filesize,int flags)2305 cluster_pagein(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset,
2306     int size, off_t filesize, int flags)
2307 {
2308 	return cluster_pagein_ext(vp, upl, upl_offset, f_offset, size, filesize, flags, NULL, NULL);
2309 }
2310 
2311 
2312 int
cluster_pagein_ext(vnode_t vp,upl_t upl,upl_offset_t upl_offset,off_t f_offset,int size,off_t filesize,int flags,int (* callback)(buf_t,void *),void * callback_arg)2313 cluster_pagein_ext(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset,
2314     int size, off_t filesize, int flags, int (*callback)(buf_t, void *), void *callback_arg)
2315 {
2316 	u_int         io_size;
2317 	int           rounded_size;
2318 	off_t         max_size;
2319 	int           retval;
2320 	int           local_flags = 0;
2321 
2322 	if (upl == NULL || size < 0) {
2323 		panic("cluster_pagein: NULL upl passed in");
2324 	}
2325 
2326 	if ((flags & UPL_IOSYNC) == 0) {
2327 		local_flags |= CL_ASYNC;
2328 	}
2329 	if ((flags & UPL_NOCOMMIT) == 0) {
2330 		local_flags |= CL_COMMIT;
2331 	}
2332 	if (flags & UPL_IOSTREAMING) {
2333 		local_flags |= CL_IOSTREAMING;
2334 	}
2335 	if (flags & UPL_PAGING_ENCRYPTED) {
2336 		local_flags |= CL_ENCRYPTED;
2337 	}
2338 
2339 
2340 	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 56)) | DBG_FUNC_NONE,
2341 	    (int)f_offset, size, (int)filesize, local_flags, 0);
2342 
2343 	/*
2344 	 * can't page-in from a negative offset
2345 	 * or if we're starting beyond the EOF
2346 	 * or if the file offset isn't page aligned
2347 	 * or the size requested isn't a multiple of PAGE_SIZE
2348 	 */
2349 	if (f_offset < 0 || f_offset >= filesize ||
2350 	    (f_offset & PAGE_MASK_64) || (size & PAGE_MASK) || (upl_offset & PAGE_MASK)) {
2351 		if (local_flags & CL_COMMIT) {
2352 			ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
2353 		}
2354 
2355 		if (f_offset >= filesize) {
2356 			kernel_triage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_CLUSTER, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_CL_PGIN_PAST_EOF), 0 /* arg */);
2357 		}
2358 
2359 		return EINVAL;
2360 	}
2361 	max_size = filesize - f_offset;
2362 
2363 	if (size < max_size) {
2364 		io_size = size;
2365 	} else {
2366 		io_size = (int)max_size;
2367 	}
2368 
2369 	rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
2370 
2371 	if (size > rounded_size && (local_flags & CL_COMMIT)) {
2372 		ubc_upl_abort_range(upl, upl_offset + rounded_size,
2373 		    size - rounded_size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
2374 	}
2375 
2376 	retval = cluster_io(vp, upl, upl_offset, f_offset, io_size,
2377 	    local_flags | CL_READ | CL_PAGEIN, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
2378 
2379 	return retval;
2380 }
2381 
2382 
2383 int
cluster_bp(buf_t bp)2384 cluster_bp(buf_t bp)
2385 {
2386 	return cluster_bp_ext(bp, NULL, NULL);
2387 }
2388 
2389 
2390 int
cluster_bp_ext(buf_t bp,int (* callback)(buf_t,void *),void * callback_arg)2391 cluster_bp_ext(buf_t bp, int (*callback)(buf_t, void *), void *callback_arg)
2392 {
2393 	off_t  f_offset;
2394 	int    flags;
2395 
2396 	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 19)) | DBG_FUNC_START,
2397 	    bp, (int)bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);
2398 
2399 	if (bp->b_flags & B_READ) {
2400 		flags = CL_ASYNC | CL_READ;
2401 	} else {
2402 		flags = CL_ASYNC;
2403 	}
2404 	if (bp->b_flags & B_PASSIVE) {
2405 		flags |= CL_PASSIVE;
2406 	}
2407 
2408 	f_offset = ubc_blktooff(bp->b_vp, bp->b_lblkno);
2409 
2410 	return cluster_io(bp->b_vp, bp->b_upl, 0, f_offset, bp->b_bcount, flags, bp, (struct clios *)NULL, callback, callback_arg);
2411 }
2412 
2413 
2414 
2415 int
cluster_write(vnode_t vp,struct uio * uio,off_t oldEOF,off_t newEOF,off_t headOff,off_t tailOff,int xflags)2416 cluster_write(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, off_t headOff, off_t tailOff, int xflags)
2417 {
2418 	return cluster_write_ext(vp, uio, oldEOF, newEOF, headOff, tailOff, xflags, NULL, NULL);
2419 }
2420 
2421 
2422 int
cluster_write_ext(vnode_t vp,struct uio * uio,off_t oldEOF,off_t newEOF,off_t headOff,off_t tailOff,int xflags,int (* callback)(buf_t,void *),void * callback_arg)2423 cluster_write_ext(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, off_t headOff, off_t tailOff,
2424     int xflags, int (*callback)(buf_t, void *), void *callback_arg)
2425 {
2426 	user_ssize_t    cur_resid;
2427 	int             retval = 0;
2428 	int             flags;
2429 	int             zflags;
2430 	int             bflag;
2431 	int             write_type = IO_COPY;
2432 	u_int32_t       write_length;
2433 
2434 	flags = xflags;
2435 
2436 	if (flags & IO_PASSIVE) {
2437 		bflag = CL_PASSIVE;
2438 	} else {
2439 		bflag = 0;
2440 	}
2441 
2442 	if (vp->v_flag & VNOCACHE_DATA) {
2443 		flags |= IO_NOCACHE;
2444 		bflag |= CL_NOCACHE;
2445 	}
2446 	if (uio == NULL) {
2447 		/*
2448 		 * no user data...
2449 		 * this call is being made to zero-fill some range in the file
2450 		 */
2451 		retval = cluster_write_copy(vp, NULL, (u_int32_t)0, oldEOF, newEOF, headOff, tailOff, flags, callback, callback_arg);
2452 
2453 		return retval;
2454 	}
2455 	/*
2456 	 * do a write through the cache if one of the following is true....
2457 	 *   NOCACHE is not true or NODIRECT is true
2458 	 *   the uio request doesn't target USERSPACE
2459 	 * otherwise, find out if we want the direct or contig variant for
2460 	 * the first vector in the uio request
2461 	 */
2462 	if (((flags & (IO_NOCACHE | IO_NODIRECT)) == IO_NOCACHE) && UIO_SEG_IS_USER_SPACE(uio->uio_segflg)) {
2463 		retval = cluster_io_type(uio, &write_type, &write_length, MIN_DIRECT_WRITE_SIZE);
2464 	}
2465 
2466 	if ((flags & (IO_TAILZEROFILL | IO_HEADZEROFILL)) && write_type == IO_DIRECT) {
2467 		/*
2468 		 * must go through the cached variant in this case
2469 		 */
2470 		write_type = IO_COPY;
2471 	}
2472 
2473 	while ((cur_resid = uio_resid(uio)) && uio->uio_offset < newEOF && retval == 0) {
2474 		switch (write_type) {
2475 		case IO_COPY:
2476 			/*
2477 			 * make sure the uio_resid isn't too big...
2478 			 * internally, we want to handle all of the I/O in
2479 			 * chunk sizes that fit in a 32 bit int
2480 			 */
2481 			if (cur_resid > (user_ssize_t)(MAX_IO_REQUEST_SIZE)) {
2482 				/*
2483 				 * we're going to have to call cluster_write_copy
2484 				 * more than once...
2485 				 *
2486 				 * only want the last call to cluster_write_copy to
2487 				 * have the IO_TAILZEROFILL flag set and only the
2488 				 * first call should have IO_HEADZEROFILL
2489 				 */
2490 				zflags = flags & ~IO_TAILZEROFILL;
2491 				flags &= ~IO_HEADZEROFILL;
2492 
2493 				write_length = MAX_IO_REQUEST_SIZE;
2494 			} else {
2495 				/*
2496 				 * last call to cluster_write_copy
2497 				 */
2498 				zflags = flags;
2499 
2500 				write_length = (u_int32_t)cur_resid;
2501 			}
2502 			retval = cluster_write_copy(vp, uio, write_length, oldEOF, newEOF, headOff, tailOff, zflags, callback, callback_arg);
2503 			break;
2504 
2505 		case IO_CONTIG:
2506 			zflags = flags & ~(IO_TAILZEROFILL | IO_HEADZEROFILL);
2507 
2508 			if (flags & IO_HEADZEROFILL) {
2509 				/*
2510 				 * only do this once per request
2511 				 */
2512 				flags &= ~IO_HEADZEROFILL;
2513 
2514 				retval = cluster_write_copy(vp, (struct uio *)0, (u_int32_t)0, (off_t)0, uio->uio_offset,
2515 				    headOff, (off_t)0, zflags | IO_HEADZEROFILL | IO_SYNC, callback, callback_arg);
2516 				if (retval) {
2517 					break;
2518 				}
2519 			}
2520 			retval = cluster_write_contig(vp, uio, newEOF, &write_type, &write_length, callback, callback_arg, bflag);
2521 
2522 			if (retval == 0 && (flags & IO_TAILZEROFILL) && uio_resid(uio) == 0) {
2523 				/*
2524 				 * we're done with the data from the user specified buffer(s)
2525 				 * and we've been requested to zero fill at the tail
2526 				 * treat this as an IO_HEADZEROFILL which doesn't require a uio
2527 				 * by rearranging the args and passing in IO_HEADZEROFILL
2528 				 */
2529 
2530 				/*
2531 				 * Update the oldEOF to reflect the current EOF. If the UPL page
2532 				 * to zero-fill is not valid (when F_NOCACHE is set), the
2533 				 * cluster_write_copy() will perform RMW on the UPL page when
2534 				 * the oldEOF is not aligned on page boundary due to unaligned
2535 				 * write.
2536 				 */
2537 				if (uio->uio_offset > oldEOF) {
2538 					oldEOF = uio->uio_offset;
2539 				}
2540 				retval = cluster_write_copy(vp, (struct uio *)0, (u_int32_t)0, (off_t)oldEOF, tailOff, uio->uio_offset,
2541 				    (off_t)0, zflags | IO_HEADZEROFILL | IO_SYNC, callback, callback_arg);
2542 			}
2543 			break;
2544 
2545 		case IO_DIRECT:
2546 			/*
2547 			 * cluster_write_direct is never called with IO_TAILZEROFILL || IO_HEADZEROFILL
2548 			 */
2549 			retval = cluster_write_direct(vp, uio, oldEOF, newEOF, &write_type, &write_length, flags, callback, callback_arg);
2550 			break;
2551 
2552 		case IO_UNKNOWN:
2553 			retval = cluster_io_type(uio, &write_type, &write_length, MIN_DIRECT_WRITE_SIZE);
2554 			break;
2555 		}
2556 		/*
2557 		 * in case we end up calling cluster_write_copy (from cluster_write_direct)
2558 		 * multiple times to service a multi-vector request that is not aligned properly
2559 		 * we need to update the oldEOF so that we
2560 		 * don't zero-fill the head of a page if we've successfully written
2561 		 * data to that area... 'cluster_write_copy' will zero-fill the head of a
2562 		 * page that is beyond the oldEOF if the write is unaligned... we only
2563 		 * want that to happen for the very first page of the cluster_write,
2564 		 * NOT the first page of each vector making up a multi-vector write.
2565 		 */
2566 		if (uio->uio_offset > oldEOF) {
2567 			oldEOF = uio->uio_offset;
2568 		}
2569 	}
2570 	return retval;
2571 }
2572 
2573 
2574 static int
cluster_write_direct(vnode_t vp,struct uio * uio,off_t oldEOF,off_t newEOF,int * write_type,u_int32_t * write_length,int flags,int (* callback)(buf_t,void *),void * callback_arg)2575 cluster_write_direct(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, int *write_type, u_int32_t *write_length,
2576     int flags, int (*callback)(buf_t, void *), void *callback_arg)
2577 {
2578 	upl_t            upl;
2579 	upl_page_info_t  *pl;
2580 	vm_offset_t      upl_offset;
2581 	vm_offset_t      vector_upl_offset = 0;
2582 	u_int32_t        io_req_size;
2583 	u_int32_t        offset_in_file;
2584 	u_int32_t        offset_in_iovbase;
2585 	u_int32_t        io_size;
2586 	int              io_flag = 0;
2587 	upl_size_t       upl_size, vector_upl_size = 0;
2588 	vm_size_t        upl_needed_size;
2589 	mach_msg_type_number_t  pages_in_pl;
2590 	upl_control_flags_t upl_flags;
2591 	kern_return_t    kret;
2592 	mach_msg_type_number_t  i;
2593 	int              force_data_sync;
2594 	int              retval = 0;
2595 	int              first_IO = 1;
2596 	struct clios     iostate;
2597 	user_addr_t      iov_base;
2598 	u_int32_t        mem_alignment_mask;
2599 	u_int32_t        devblocksize;
2600 	u_int32_t        max_io_size;
2601 	u_int32_t        max_upl_size;
2602 	u_int32_t        max_vector_size;
2603 	u_int32_t        bytes_outstanding_limit;
2604 	boolean_t        io_throttled = FALSE;
2605 
2606 	u_int32_t        vector_upl_iosize = 0;
2607 	int              issueVectorUPL = 0, useVectorUPL = (uio->uio_iovcnt > 1);
2608 	off_t            v_upl_uio_offset = 0;
2609 	int              vector_upl_index = 0;
2610 	upl_t            vector_upl = NULL;
2611 
2612 
2613 	/*
2614 	 * When we enter this routine, we know
2615 	 *  -- the resid will not exceed iov_len
2616 	 */
2617 	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_START,
2618 	    (int)uio->uio_offset, *write_length, (int)newEOF, 0, 0);
2619 
2620 	assert(vm_map_page_shift(current_map()) >= PAGE_SHIFT);
2621 
2622 	max_upl_size = cluster_max_io_size(vp->v_mount, CL_WRITE);
2623 
2624 	io_flag = CL_ASYNC | CL_PRESERVE | CL_COMMIT | CL_THROTTLE | CL_DIRECT_IO;
2625 
2626 	if (flags & IO_PASSIVE) {
2627 		io_flag |= CL_PASSIVE;
2628 	}
2629 
2630 	if (flags & IO_NOCACHE) {
2631 		io_flag |= CL_NOCACHE;
2632 	}
2633 
2634 	if (flags & IO_SKIP_ENCRYPTION) {
2635 		io_flag |= CL_ENCRYPTED;
2636 	}
2637 
2638 	iostate.io_completed = 0;
2639 	iostate.io_issued = 0;
2640 	iostate.io_error = 0;
2641 	iostate.io_wanted = 0;
2642 
2643 	lck_mtx_init(&iostate.io_mtxp, &cl_mtx_grp, LCK_ATTR_NULL);
2644 
2645 	mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask;
2646 	devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize;
2647 
2648 	if (devblocksize == 1) {
2649 		/*
2650 		 * the AFP client advertises a devblocksize of 1
2651 		 * however, its BLOCKMAP routine maps to physical
2652 		 * blocks that are PAGE_SIZE in size...
2653 		 * therefore we can't ask for I/Os that aren't page aligned
2654 		 * or aren't multiples of PAGE_SIZE in size
2655 		 * by setting devblocksize to PAGE_SIZE, we re-instate
2656 		 * the old behavior we had before the mem_alignment_mask
2657 		 * changes went in...
2658 		 */
2659 		devblocksize = PAGE_SIZE;
2660 	}
2661 
2662 next_dwrite:
2663 	io_req_size = *write_length;
2664 	iov_base = uio_curriovbase(uio);
2665 
2666 	offset_in_file = (u_int32_t)uio->uio_offset & PAGE_MASK;
2667 	offset_in_iovbase = (u_int32_t)iov_base & mem_alignment_mask;
2668 
2669 	if (offset_in_file || offset_in_iovbase) {
2670 		/*
2671 		 * one of the 2 important offsets is misaligned
2672 		 * so fire an I/O through the cache for this entire vector
2673 		 */
2674 		goto wait_for_dwrites;
2675 	}
2676 	if (iov_base & (devblocksize - 1)) {
2677 		/*
2678 		 * the offset in memory must be on a device block boundary
2679 		 * so that we can guarantee that we can generate an
2680 		 * I/O that ends on a page boundary in cluster_io
2681 		 */
2682 		goto wait_for_dwrites;
2683 	}
2684 
2685 	task_update_logical_writes(current_task(), (io_req_size & ~PAGE_MASK), TASK_WRITE_IMMEDIATE, vp);
2686 	while (io_req_size >= PAGE_SIZE && uio->uio_offset < newEOF && retval == 0) {
2687 		int     throttle_type;
2688 
2689 		if ((throttle_type = cluster_is_throttled(vp))) {
2690 			/*
2691 			 * we're in the throttle window, at the very least
2692 			 * we want to limit the size of the I/O we're about
2693 			 * to issue
2694 			 */
2695 			if ((flags & IO_RETURN_ON_THROTTLE) && throttle_type == THROTTLE_NOW) {
2696 				/*
2697 				 * we're in the throttle window and at least 1 I/O
2698 				 * has already been issued by a throttleable thread
2699 				 * in this window, so return with EAGAIN to indicate
2700 				 * to the FS issuing the cluster_write call that it
2701 				 * should now throttle after dropping any locks
2702 				 */
2703 				throttle_info_update_by_mount(vp->v_mount);
2704 
2705 				io_throttled = TRUE;
2706 				goto wait_for_dwrites;
2707 			}
2708 			max_vector_size = THROTTLE_MAX_IOSIZE;
2709 			max_io_size = THROTTLE_MAX_IOSIZE;
2710 		} else {
2711 			max_vector_size = MAX_VECTOR_UPL_SIZE;
2712 			max_io_size = max_upl_size;
2713 		}
2714 
2715 		if (first_IO) {
2716 			cluster_syncup(vp, newEOF, callback, callback_arg, callback ? PUSH_SYNC : 0);
2717 			first_IO = 0;
2718 		}
2719 		io_size  = io_req_size & ~PAGE_MASK;
2720 		iov_base = uio_curriovbase(uio);
2721 
2722 		if (io_size > max_io_size) {
2723 			io_size = max_io_size;
2724 		}
2725 
2726 		if (useVectorUPL && (iov_base & PAGE_MASK)) {
2727 			/*
2728 			 * We have an iov_base that's not page-aligned.
2729 			 * Issue all I/O's that have been collected within
2730 			 * this Vectored UPL.
2731 			 */
2732 			if (vector_upl_index) {
2733 				retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
2734 				reset_vector_run_state();
2735 			}
2736 
2737 			/*
2738 			 * After this point, if we are using the Vector UPL path and the base is
2739 			 * not page-aligned then the UPL with that base will be the first in the vector UPL.
2740 			 */
2741 		}
2742 
2743 		upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK);
2744 		upl_needed_size = (upl_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
2745 
2746 		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_START,
2747 		    (int)upl_offset, upl_needed_size, (int)iov_base, io_size, 0);
2748 
2749 		vm_map_t map = UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ? current_map() : kernel_map;
2750 		for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) {
2751 			pages_in_pl = 0;
2752 			upl_size = (upl_size_t)upl_needed_size;
2753 			upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
2754 			    UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
2755 
2756 			kret = vm_map_get_upl(map,
2757 			    (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
2758 			    &upl_size,
2759 			    &upl,
2760 			    NULL,
2761 			    &pages_in_pl,
2762 			    &upl_flags,
2763 			    VM_KERN_MEMORY_FILE,
2764 			    force_data_sync);
2765 
2766 			if (kret != KERN_SUCCESS) {
2767 				KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
2768 				    0, 0, 0, kret, 0);
2769 				/*
2770 				 * failed to get pagelist
2771 				 *
2772 				 * we may have already spun some portion of this request
2773 				 * off as async requests... we need to wait for the I/O
2774 				 * to complete before returning
2775 				 */
2776 				goto wait_for_dwrites;
2777 			}
2778 			pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
2779 			pages_in_pl = upl_size / PAGE_SIZE;
2780 
2781 			for (i = 0; i < pages_in_pl; i++) {
2782 				if (!upl_valid_page(pl, i)) {
2783 					break;
2784 				}
2785 			}
2786 			if (i == pages_in_pl) {
2787 				break;
2788 			}
2789 
2790 			/*
2791 			 * didn't get all the pages back that we
2792 			 * needed... release this upl and try again
2793 			 */
2794 			ubc_upl_abort(upl, 0);
2795 		}
2796 		if (force_data_sync >= 3) {
2797 			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
2798 			    i, pages_in_pl, upl_size, kret, 0);
2799 			/*
2800 			 * for some reason, we couldn't acquire a hold on all
2801 			 * the pages needed in the user's address space
2802 			 *
2803 			 * we may have already spun some portion of this request
2804 			 * off as async requests... we need to wait for the I/O
2805 			 * to complete before returning
2806 			 */
2807 			goto wait_for_dwrites;
2808 		}
2809 
2810 		/*
2811 		 * Consider the possibility that upl_size wasn't satisfied.
2812 		 */
2813 		if (upl_size < upl_needed_size) {
2814 			if (upl_size && upl_offset == 0) {
2815 				io_size = upl_size;
2816 			} else {
2817 				io_size = 0;
2818 			}
2819 		}
2820 		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
2821 		    (int)upl_offset, upl_size, (int)iov_base, io_size, 0);
2822 
2823 		if (io_size == 0) {
2824 			ubc_upl_abort(upl, 0);
2825 			/*
2826 			 * we may have already spun some portion of this request
2827 			 * off as async requests... we need to wait for the I/O
2828 			 * to complete before returning
2829 			 */
2830 			goto wait_for_dwrites;
2831 		}
2832 
2833 		if (useVectorUPL) {
2834 			vm_offset_t end_off = ((iov_base + io_size) & PAGE_MASK);
2835 			if (end_off) {
2836 				issueVectorUPL = 1;
2837 			}
2838 			/*
2839 			 * After this point, if we are using a vector UPL, then
2840 			 * either all the UPL elements end on a page boundary OR
2841 			 * this UPL is the last element because it does not end
2842 			 * on a page boundary.
2843 			 */
2844 		}
2845 
2846 		/*
2847 		 * we want push out these writes asynchronously so that we can overlap
2848 		 * the preparation of the next I/O
2849 		 * if there are already too many outstanding writes
2850 		 * wait until some complete before issuing the next
2851 		 */
2852 		if (vp->v_mount->mnt_minsaturationbytecount) {
2853 			bytes_outstanding_limit = vp->v_mount->mnt_minsaturationbytecount;
2854 		} else {
2855 			bytes_outstanding_limit = max_upl_size * IO_SCALE(vp, 2);
2856 		}
2857 
2858 		cluster_iostate_wait(&iostate, bytes_outstanding_limit, "cluster_write_direct");
2859 
2860 		if (iostate.io_error) {
2861 			/*
2862 			 * one of the earlier writes we issued ran into a hard error
2863 			 * don't issue any more writes, cleanup the UPL
2864 			 * that was just created but not used, then
2865 			 * go wait for all writes that are part of this stream
2866 			 * to complete before returning the error to the caller
2867 			 */
2868 			ubc_upl_abort(upl, 0);
2869 
2870 			goto wait_for_dwrites;
2871 		}
2872 
2873 		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_START,
2874 		    (int)upl_offset, (int)uio->uio_offset, io_size, io_flag, 0);
2875 
2876 		if (!useVectorUPL) {
2877 			retval = cluster_io(vp, upl, upl_offset, uio->uio_offset,
2878 			    io_size, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
2879 		} else {
2880 			if (!vector_upl_index) {
2881 				vector_upl = vector_upl_create(upl_offset);
2882 				v_upl_uio_offset = uio->uio_offset;
2883 				vector_upl_offset = upl_offset;
2884 			}
2885 
2886 			vector_upl_set_subupl(vector_upl, upl, upl_size);
2887 			vector_upl_set_iostate(vector_upl, upl, vector_upl_size, upl_size);
2888 			vector_upl_index++;
2889 			vector_upl_iosize += io_size;
2890 			vector_upl_size += upl_size;
2891 
2892 			if (issueVectorUPL || vector_upl_index == MAX_VECTOR_UPL_ELEMENTS || vector_upl_size >= max_vector_size) {
2893 				retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
2894 				reset_vector_run_state();
2895 			}
2896 		}
2897 
2898 		/*
2899 		 * update the uio structure to
2900 		 * reflect the I/O that we just issued
2901 		 */
2902 		uio_update(uio, (user_size_t)io_size);
2903 
2904 		/*
2905 		 * in case we end up calling through to cluster_write_copy to finish
2906 		 * the tail of this request, we need to update the oldEOF so that we
2907 		 * don't zero-fill the head of a page if we've successfully written
2908 		 * data to that area... 'cluster_write_copy' will zero-fill the head of a
2909 		 * page that is beyond the oldEOF if the write is unaligned... we only
2910 		 * want that to happen for the very first page of the cluster_write,
2911 		 * NOT the first page of each vector making up a multi-vector write.
2912 		 */
2913 		if (uio->uio_offset > oldEOF) {
2914 			oldEOF = uio->uio_offset;
2915 		}
2916 
2917 		io_req_size -= io_size;
2918 
2919 		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_END,
2920 		    (int)upl_offset, (int)uio->uio_offset, io_req_size, retval, 0);
2921 	} /* end while */
2922 
2923 	if (retval == 0 && iostate.io_error == 0 && io_req_size == 0) {
2924 		retval = cluster_io_type(uio, write_type, write_length, MIN_DIRECT_WRITE_SIZE);
2925 
2926 		if (retval == 0 && *write_type == IO_DIRECT) {
2927 			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_NONE,
2928 			    (int)uio->uio_offset, *write_length, (int)newEOF, 0, 0);
2929 
2930 			goto next_dwrite;
2931 		}
2932 	}
2933 
2934 wait_for_dwrites:
2935 
2936 	if (retval == 0 && iostate.io_error == 0 && useVectorUPL && vector_upl_index) {
2937 		retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
2938 		reset_vector_run_state();
2939 	}
2940 	/*
2941 	 * make sure all async writes issued as part of this stream
2942 	 * have completed before we return
2943 	 */
2944 	cluster_iostate_wait(&iostate, 0, "cluster_write_direct");
2945 
2946 	if (iostate.io_error) {
2947 		retval = iostate.io_error;
2948 	}
2949 
2950 	lck_mtx_destroy(&iostate.io_mtxp, &cl_mtx_grp);
2951 
2952 	if (io_throttled == TRUE && retval == 0) {
2953 		retval = EAGAIN;
2954 	}
2955 
2956 	if (io_req_size && retval == 0) {
2957 		/*
2958 		 * we couldn't handle the tail of this request in DIRECT mode
2959 		 * so fire it through the copy path
2960 		 *
2961 		 * note that flags will never have IO_HEADZEROFILL or IO_TAILZEROFILL set
2962 		 * so we can just pass 0 in for the headOff and tailOff
2963 		 */
2964 		if (uio->uio_offset > oldEOF) {
2965 			oldEOF = uio->uio_offset;
2966 		}
2967 
2968 		retval = cluster_write_copy(vp, uio, io_req_size, oldEOF, newEOF, (off_t)0, (off_t)0, flags, callback, callback_arg);
2969 
2970 		*write_type = IO_UNKNOWN;
2971 	}
2972 	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_END,
2973 	    (int)uio->uio_offset, io_req_size, retval, 4, 0);
2974 
2975 	return retval;
2976 }
2977 
2978 
2979 static int
cluster_write_contig(vnode_t vp,struct uio * uio,off_t newEOF,int * write_type,u_int32_t * write_length,int (* callback)(buf_t,void *),void * callback_arg,int bflag)2980 cluster_write_contig(vnode_t vp, struct uio *uio, off_t newEOF, int *write_type, u_int32_t *write_length,
2981     int (*callback)(buf_t, void *), void *callback_arg, int bflag)
2982 {
2983 	upl_page_info_t *pl;
2984 	addr64_t         src_paddr = 0;
2985 	upl_t            upl[MAX_VECTS];
2986 	vm_offset_t      upl_offset;
2987 	u_int32_t        tail_size = 0;
2988 	u_int32_t        io_size;
2989 	u_int32_t        xsize;
2990 	upl_size_t       upl_size;
2991 	vm_size_t        upl_needed_size;
2992 	mach_msg_type_number_t  pages_in_pl;
2993 	upl_control_flags_t upl_flags;
2994 	kern_return_t    kret;
2995 	struct clios     iostate;
2996 	int              error  = 0;
2997 	int              cur_upl = 0;
2998 	int              num_upl = 0;
2999 	int              n;
3000 	user_addr_t      iov_base;
3001 	u_int32_t        devblocksize;
3002 	u_int32_t        mem_alignment_mask;
3003 
3004 	/*
3005 	 * When we enter this routine, we know
3006 	 *  -- the io_req_size will not exceed iov_len
3007 	 *  -- the target address is physically contiguous
3008 	 */
3009 	cluster_syncup(vp, newEOF, callback, callback_arg, callback ? PUSH_SYNC : 0);
3010 
3011 	devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize;
3012 	mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask;
3013 
3014 	iostate.io_completed = 0;
3015 	iostate.io_issued = 0;
3016 	iostate.io_error = 0;
3017 	iostate.io_wanted = 0;
3018 
3019 	lck_mtx_init(&iostate.io_mtxp, &cl_mtx_grp, LCK_ATTR_NULL);
3020 
3021 next_cwrite:
3022 	io_size = *write_length;
3023 
3024 	iov_base = uio_curriovbase(uio);
3025 
3026 	upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK);
3027 	upl_needed_size = upl_offset + io_size;
3028 
3029 	pages_in_pl = 0;
3030 	upl_size = (upl_size_t)upl_needed_size;
3031 	upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
3032 	    UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
3033 
3034 	vm_map_t map = UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ? current_map() : kernel_map;
3035 	kret = vm_map_get_upl(map,
3036 	    vm_map_trunc_page(iov_base, vm_map_page_mask(map)),
3037 	    &upl_size, &upl[cur_upl], NULL, &pages_in_pl, &upl_flags, VM_KERN_MEMORY_FILE, 0);
3038 
3039 	if (kret != KERN_SUCCESS) {
3040 		/*
3041 		 * failed to get pagelist
3042 		 */
3043 		error = EINVAL;
3044 		goto wait_for_cwrites;
3045 	}
3046 	num_upl++;
3047 
3048 	/*
3049 	 * Consider the possibility that upl_size wasn't satisfied.
3050 	 */
3051 	if (upl_size < upl_needed_size) {
3052 		/*
3053 		 * This is a failure in the physical memory case.
3054 		 */
3055 		error = EINVAL;
3056 		goto wait_for_cwrites;
3057 	}
3058 	pl = ubc_upl_pageinfo(upl[cur_upl]);
3059 
3060 	src_paddr = ((addr64_t)upl_phys_page(pl, 0) << PAGE_SHIFT) + (addr64_t)upl_offset;
3061 
3062 	while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {
3063 		u_int32_t   head_size;
3064 
3065 		head_size = devblocksize - (u_int32_t)(uio->uio_offset & (devblocksize - 1));
3066 
3067 		if (head_size > io_size) {
3068 			head_size = io_size;
3069 		}
3070 
3071 		error = cluster_align_phys_io(vp, uio, src_paddr, head_size, 0, callback, callback_arg);
3072 
3073 		if (error) {
3074 			goto wait_for_cwrites;
3075 		}
3076 
3077 		upl_offset += head_size;
3078 		src_paddr  += head_size;
3079 		io_size    -= head_size;
3080 
3081 		iov_base   += head_size;
3082 	}
3083 	if ((u_int32_t)iov_base & mem_alignment_mask) {
3084 		/*
3085 		 * request doesn't set up on a memory boundary
3086 		 * the underlying DMA engine can handle...
3087 		 * return an error instead of going through
3088 		 * the slow copy path since the intent of this
3089 		 * path is direct I/O from device memory
3090 		 */
3091 		error = EINVAL;
3092 		goto wait_for_cwrites;
3093 	}
3094 
3095 	tail_size = io_size & (devblocksize - 1);
3096 	io_size  -= tail_size;
3097 
3098 	while (io_size && error == 0) {
3099 		if (io_size > MAX_IO_CONTIG_SIZE) {
3100 			xsize = MAX_IO_CONTIG_SIZE;
3101 		} else {
3102 			xsize = io_size;
3103 		}
3104 		/*
3105 		 * request asynchronously so that we can overlap
3106 		 * the preparation of the next I/O... we'll do
3107 		 * the commit after all the I/O has completed
3108 		 * since its all issued against the same UPL
3109 		 * if there are already too many outstanding writes
3110 		 * wait until some have completed before issuing the next
3111 		 */
3112 		cluster_iostate_wait(&iostate, MAX_IO_CONTIG_SIZE * IO_SCALE(vp, 2), "cluster_write_contig");
3113 
3114 		if (iostate.io_error) {
3115 			/*
3116 			 * one of the earlier writes we issued ran into a hard error
3117 			 * don't issue any more writes...
3118 			 * go wait for all writes that are part of this stream
3119 			 * to complete before returning the error to the caller
3120 			 */
3121 			goto wait_for_cwrites;
3122 		}
3123 		/*
3124 		 * issue an asynchronous write to cluster_io
3125 		 */
3126 		error = cluster_io(vp, upl[cur_upl], upl_offset, uio->uio_offset,
3127 		    xsize, CL_DEV_MEMORY | CL_ASYNC | bflag, (buf_t)NULL, (struct clios *)&iostate, callback, callback_arg);
3128 
3129 		if (error == 0) {
3130 			/*
3131 			 * The cluster_io write completed successfully,
3132 			 * update the uio structure
3133 			 */
3134 			uio_update(uio, (user_size_t)xsize);
3135 
3136 			upl_offset += xsize;
3137 			src_paddr  += xsize;
3138 			io_size    -= xsize;
3139 		}
3140 	}
3141 	if (error == 0 && iostate.io_error == 0 && tail_size == 0 && num_upl < MAX_VECTS) {
3142 		error = cluster_io_type(uio, write_type, write_length, 0);
3143 
3144 		if (error == 0 && *write_type == IO_CONTIG) {
3145 			cur_upl++;
3146 			goto next_cwrite;
3147 		}
3148 	} else {
3149 		*write_type = IO_UNKNOWN;
3150 	}
3151 
3152 wait_for_cwrites:
3153 	/*
3154 	 * make sure all async writes that are part of this stream
3155 	 * have completed before we proceed
3156 	 */
3157 	cluster_iostate_wait(&iostate, 0, "cluster_write_contig");
3158 
3159 	if (iostate.io_error) {
3160 		error = iostate.io_error;
3161 	}
3162 
3163 	lck_mtx_destroy(&iostate.io_mtxp, &cl_mtx_grp);
3164 
3165 	if (error == 0 && tail_size) {
3166 		error = cluster_align_phys_io(vp, uio, src_paddr, tail_size, 0, callback, callback_arg);
3167 	}
3168 
3169 	for (n = 0; n < num_upl; n++) {
3170 		/*
3171 		 * just release our hold on each physically contiguous
3172 		 * region without changing any state
3173 		 */
3174 		ubc_upl_abort(upl[n], 0);
3175 	}
3176 
3177 	return error;
3178 }
3179 
3180 
3181 /*
3182  * need to avoid a race between an msync of a range of pages dirtied via mmap
3183  * vs a filesystem such as HFS deciding to write a 'hole' to disk via cluster_write's
3184  * zerofill mechanism before it has seen the VNOP_PAGEOUTs for the pages being msync'd
3185  *
3186  * we should never force-zero-fill pages that are already valid in the cache...
3187  * the entire page contains valid data (either from disk, zero-filled or dirtied
3188  * via an mmap) so we can only do damage by trying to zero-fill
3189  *
3190  */
3191 static int
cluster_zero_range(upl_t upl,upl_page_info_t * pl,int flags,int io_offset,off_t zero_off,off_t upl_f_offset,int bytes_to_zero)3192 cluster_zero_range(upl_t upl, upl_page_info_t *pl, int flags, int io_offset, off_t zero_off, off_t upl_f_offset, int bytes_to_zero)
3193 {
3194 	int zero_pg_index;
3195 	boolean_t need_cluster_zero = TRUE;
3196 
3197 	if ((flags & (IO_NOZEROVALID | IO_NOZERODIRTY))) {
3198 		bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off & PAGE_MASK_64));
3199 		zero_pg_index = (int)((zero_off - upl_f_offset) / PAGE_SIZE_64);
3200 
3201 		if (upl_valid_page(pl, zero_pg_index)) {
3202 			/*
3203 			 * never force zero valid pages - dirty or clean
3204 			 * we'll leave these in the UPL for cluster_write_copy to deal with
3205 			 */
3206 			need_cluster_zero = FALSE;
3207 		}
3208 	}
3209 	if (need_cluster_zero == TRUE) {
3210 		cluster_zero(upl, io_offset, bytes_to_zero, NULL);
3211 	}
3212 
3213 	return bytes_to_zero;
3214 }
3215 
3216 
3217 void
cluster_update_state(vnode_t vp,vm_object_offset_t s_offset,vm_object_offset_t e_offset,boolean_t vm_initiated)3218 cluster_update_state(vnode_t vp, vm_object_offset_t s_offset, vm_object_offset_t e_offset, boolean_t vm_initiated)
3219 {
3220 	struct cl_extent cl;
3221 	boolean_t first_pass = TRUE;
3222 
3223 	assert(s_offset < e_offset);
3224 	assert((s_offset & PAGE_MASK_64) == 0);
3225 	assert((e_offset & PAGE_MASK_64) == 0);
3226 
3227 	cl.b_addr = (daddr64_t)(s_offset / PAGE_SIZE_64);
3228 	cl.e_addr = (daddr64_t)(e_offset / PAGE_SIZE_64);
3229 
3230 	cluster_update_state_internal(vp, &cl, 0, TRUE, &first_pass, s_offset, (int)(e_offset - s_offset),
3231 	    vp->v_un.vu_ubcinfo->ui_size, NULL, NULL, vm_initiated);
3232 }
3233 
3234 
3235 static void
cluster_update_state_internal(vnode_t vp,struct cl_extent * cl,int flags,boolean_t defer_writes,boolean_t * first_pass,off_t write_off,int write_cnt,off_t newEOF,int (* callback)(buf_t,void *),void * callback_arg,boolean_t vm_initiated)3236 cluster_update_state_internal(vnode_t vp, struct cl_extent *cl, int flags, boolean_t defer_writes,
3237     boolean_t *first_pass, off_t write_off, int write_cnt, off_t newEOF,
3238     int (*callback)(buf_t, void *), void *callback_arg, boolean_t vm_initiated)
3239 {
3240 	struct cl_writebehind *wbp;
3241 	int     cl_index;
3242 	int     ret_cluster_try_push;
3243 	u_int   max_cluster_pgcount;
3244 
3245 
3246 	max_cluster_pgcount = MAX_CLUSTER_SIZE(vp) / PAGE_SIZE;
3247 
3248 	/*
3249 	 * take the lock to protect our accesses
3250 	 * of the writebehind and sparse cluster state
3251 	 */
3252 	wbp = cluster_get_wbp(vp, CLW_ALLOCATE | CLW_RETURNLOCKED);
3253 
3254 	if (wbp->cl_scmap) {
3255 		if (!(flags & IO_NOCACHE)) {
3256 			/*
3257 			 * we've fallen into the sparse
3258 			 * cluster method of delaying dirty pages
3259 			 */
3260 			sparse_cluster_add(wbp, &(wbp->cl_scmap), vp, cl, newEOF, callback, callback_arg, vm_initiated);
3261 
3262 			lck_mtx_unlock(&wbp->cl_lockw);
3263 			return;
3264 		}
3265 		/*
3266 		 * must have done cached writes that fell into
3267 		 * the sparse cluster mechanism... we've switched
3268 		 * to uncached writes on the file, so go ahead
3269 		 * and push whatever's in the sparse map
3270 		 * and switch back to normal clustering
3271 		 */
3272 		wbp->cl_number = 0;
3273 
3274 		sparse_cluster_push(wbp, &(wbp->cl_scmap), vp, newEOF, PUSH_ALL, 0, callback, callback_arg, vm_initiated);
3275 		/*
3276 		 * no clusters of either type present at this point
3277 		 * so just go directly to start_new_cluster since
3278 		 * we know we need to delay this I/O since we've
3279 		 * already released the pages back into the cache
3280 		 * to avoid the deadlock with sparse_cluster_push
3281 		 */
3282 		goto start_new_cluster;
3283 	}
3284 	if (*first_pass == TRUE) {
3285 		if (write_off == wbp->cl_last_write) {
3286 			wbp->cl_seq_written += write_cnt;
3287 		} else {
3288 			wbp->cl_seq_written = write_cnt;
3289 		}
3290 
3291 		wbp->cl_last_write = write_off + write_cnt;
3292 
3293 		*first_pass = FALSE;
3294 	}
3295 	if (wbp->cl_number == 0) {
3296 		/*
3297 		 * no clusters currently present
3298 		 */
3299 		goto start_new_cluster;
3300 	}
3301 
3302 	for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
3303 		/*
3304 		 * check each cluster that we currently hold
3305 		 * try to merge some or all of this write into
3306 		 * one or more of the existing clusters... if
3307 		 * any portion of the write remains, start a
3308 		 * new cluster
3309 		 */
3310 		if (cl->b_addr >= wbp->cl_clusters[cl_index].b_addr) {
3311 			/*
3312 			 * the current write starts at or after the current cluster
3313 			 */
3314 			if (cl->e_addr <= (wbp->cl_clusters[cl_index].b_addr + max_cluster_pgcount)) {
3315 				/*
3316 				 * we have a write that fits entirely
3317 				 * within the existing cluster limits
3318 				 */
3319 				if (cl->e_addr > wbp->cl_clusters[cl_index].e_addr) {
3320 					/*
3321 					 * update our idea of where the cluster ends
3322 					 */
3323 					wbp->cl_clusters[cl_index].e_addr = cl->e_addr;
3324 				}
3325 				break;
3326 			}
3327 			if (cl->b_addr < (wbp->cl_clusters[cl_index].b_addr + max_cluster_pgcount)) {
3328 				/*
3329 				 * we have a write that starts in the middle of the current cluster
3330 				 * but extends beyond the cluster's limit... we know this because
3331 				 * of the previous checks
3332 				 * we'll extend the current cluster to the max
3333 				 * and update the b_addr for the current write to reflect that
3334 				 * the head of it was absorbed into this cluster...
3335 				 * note that we'll always have a leftover tail in this case since
3336 				 * full absorbtion would have occurred in the clause above
3337 				 */
3338 				wbp->cl_clusters[cl_index].e_addr = wbp->cl_clusters[cl_index].b_addr + max_cluster_pgcount;
3339 
3340 				cl->b_addr = wbp->cl_clusters[cl_index].e_addr;
3341 			}
3342 			/*
3343 			 * we come here for the case where the current write starts
3344 			 * beyond the limit of the existing cluster or we have a leftover
3345 			 * tail after a partial absorbtion
3346 			 *
3347 			 * in either case, we'll check the remaining clusters before
3348 			 * starting a new one
3349 			 */
3350 		} else {
3351 			/*
3352 			 * the current write starts in front of the cluster we're currently considering
3353 			 */
3354 			if ((wbp->cl_clusters[cl_index].e_addr - cl->b_addr) <= max_cluster_pgcount) {
3355 				/*
3356 				 * we can just merge the new request into
3357 				 * this cluster and leave it in the cache
3358 				 * since the resulting cluster is still
3359 				 * less than the maximum allowable size
3360 				 */
3361 				wbp->cl_clusters[cl_index].b_addr = cl->b_addr;
3362 
3363 				if (cl->e_addr > wbp->cl_clusters[cl_index].e_addr) {
3364 					/*
3365 					 * the current write completely
3366 					 * envelops the existing cluster and since
3367 					 * each write is limited to at most max_cluster_pgcount pages
3368 					 * we can just use the start and last blocknos of the write
3369 					 * to generate the cluster limits
3370 					 */
3371 					wbp->cl_clusters[cl_index].e_addr = cl->e_addr;
3372 				}
3373 				break;
3374 			}
3375 			/*
3376 			 * if we were to combine this write with the current cluster
3377 			 * we would exceed the cluster size limit.... so,
3378 			 * let's see if there's any overlap of the new I/O with
3379 			 * the cluster we're currently considering... in fact, we'll
3380 			 * stretch the cluster out to it's full limit and see if we
3381 			 * get an intersection with the current write
3382 			 *
3383 			 */
3384 			if (cl->e_addr > wbp->cl_clusters[cl_index].e_addr - max_cluster_pgcount) {
3385 				/*
3386 				 * the current write extends into the proposed cluster
3387 				 * clip the length of the current write after first combining it's
3388 				 * tail with the newly shaped cluster
3389 				 */
3390 				wbp->cl_clusters[cl_index].b_addr = wbp->cl_clusters[cl_index].e_addr - max_cluster_pgcount;
3391 
3392 				cl->e_addr = wbp->cl_clusters[cl_index].b_addr;
3393 			}
3394 			/*
3395 			 * if we get here, there was no way to merge
3396 			 * any portion of this write with this cluster
3397 			 * or we could only merge part of it which
3398 			 * will leave a tail...
3399 			 * we'll check the remaining clusters before starting a new one
3400 			 */
3401 		}
3402 	}
3403 	if (cl_index < wbp->cl_number) {
3404 		/*
3405 		 * we found an existing cluster(s) that we
3406 		 * could entirely merge this I/O into
3407 		 */
3408 		goto delay_io;
3409 	}
3410 
3411 	if (defer_writes == FALSE &&
3412 	    wbp->cl_number == MAX_CLUSTERS &&
3413 	    wbp->cl_seq_written >= (MAX_CLUSTERS * (max_cluster_pgcount * PAGE_SIZE))) {
3414 		uint32_t        n;
3415 
3416 		if (vp->v_mount->mnt_minsaturationbytecount) {
3417 			n = vp->v_mount->mnt_minsaturationbytecount / MAX_CLUSTER_SIZE(vp);
3418 
3419 			if (n > MAX_CLUSTERS) {
3420 				n = MAX_CLUSTERS;
3421 			}
3422 		} else {
3423 			n = 0;
3424 		}
3425 
3426 		if (n == 0) {
3427 			if (disk_conditioner_mount_is_ssd(vp->v_mount)) {
3428 				n = WRITE_BEHIND_SSD;
3429 			} else {
3430 				n = WRITE_BEHIND;
3431 			}
3432 		}
3433 		while (n--) {
3434 			cluster_try_push(wbp, vp, newEOF, 0, 0, callback, callback_arg, NULL, vm_initiated);
3435 		}
3436 	}
3437 	if (wbp->cl_number < MAX_CLUSTERS) {
3438 		/*
3439 		 * we didn't find an existing cluster to
3440 		 * merge into, but there's room to start
3441 		 * a new one
3442 		 */
3443 		goto start_new_cluster;
3444 	}
3445 	/*
3446 	 * no exisitng cluster to merge with and no
3447 	 * room to start a new one... we'll try
3448 	 * pushing one of the existing ones... if none of
3449 	 * them are able to be pushed, we'll switch
3450 	 * to the sparse cluster mechanism
3451 	 * cluster_try_push updates cl_number to the
3452 	 * number of remaining clusters... and
3453 	 * returns the number of currently unused clusters
3454 	 */
3455 	ret_cluster_try_push = 0;
3456 
3457 	/*
3458 	 * if writes are not deferred, call cluster push immediately
3459 	 */
3460 	if (defer_writes == FALSE) {
3461 		ret_cluster_try_push = cluster_try_push(wbp, vp, newEOF, (flags & IO_NOCACHE) ? 0 : PUSH_DELAY, 0, callback, callback_arg, NULL, vm_initiated);
3462 	}
3463 	/*
3464 	 * execute following regardless of writes being deferred or not
3465 	 */
3466 	if (ret_cluster_try_push == 0) {
3467 		/*
3468 		 * no more room in the normal cluster mechanism
3469 		 * so let's switch to the more expansive but expensive
3470 		 * sparse mechanism....
3471 		 */
3472 		sparse_cluster_switch(wbp, vp, newEOF, callback, callback_arg, vm_initiated);
3473 		sparse_cluster_add(wbp, &(wbp->cl_scmap), vp, cl, newEOF, callback, callback_arg, vm_initiated);
3474 
3475 		lck_mtx_unlock(&wbp->cl_lockw);
3476 		return;
3477 	}
3478 start_new_cluster:
3479 	wbp->cl_clusters[wbp->cl_number].b_addr = cl->b_addr;
3480 	wbp->cl_clusters[wbp->cl_number].e_addr = cl->e_addr;
3481 
3482 	wbp->cl_clusters[wbp->cl_number].io_flags = 0;
3483 
3484 	if (flags & IO_NOCACHE) {
3485 		wbp->cl_clusters[wbp->cl_number].io_flags |= CLW_IONOCACHE;
3486 	}
3487 
3488 	if (flags & IO_PASSIVE) {
3489 		wbp->cl_clusters[wbp->cl_number].io_flags |= CLW_IOPASSIVE;
3490 	}
3491 
3492 	wbp->cl_number++;
3493 delay_io:
3494 	lck_mtx_unlock(&wbp->cl_lockw);
3495 	return;
3496 }
3497 
3498 
3499 static int
cluster_write_copy(vnode_t vp,struct uio * uio,u_int32_t io_req_size,off_t oldEOF,off_t newEOF,off_t headOff,off_t tailOff,int flags,int (* callback)(buf_t,void *),void * callback_arg)3500 cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t oldEOF, off_t newEOF, off_t headOff,
3501     off_t tailOff, int flags, int (*callback)(buf_t, void *), void *callback_arg)
3502 {
3503 	upl_page_info_t *pl;
3504 	upl_t            upl;
3505 	vm_offset_t      upl_offset = 0;
3506 	vm_size_t        upl_size;
3507 	off_t            upl_f_offset;
3508 	int              pages_in_upl;
3509 	int              start_offset;
3510 	int              xfer_resid;
3511 	int              io_size;
3512 	int              io_offset;
3513 	int              bytes_to_zero;
3514 	int              bytes_to_move;
3515 	kern_return_t    kret;
3516 	int              retval = 0;
3517 	int              io_resid;
3518 	long long        total_size;
3519 	long long        zero_cnt;
3520 	off_t            zero_off;
3521 	long long        zero_cnt1;
3522 	off_t            zero_off1;
3523 	off_t            write_off = 0;
3524 	int              write_cnt = 0;
3525 	boolean_t        first_pass = FALSE;
3526 	struct cl_extent cl;
3527 	int              bflag;
3528 	u_int            max_io_size;
3529 
3530 	if (uio) {
3531 		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
3532 		    (int)uio->uio_offset, io_req_size, (int)oldEOF, (int)newEOF, 0);
3533 
3534 		io_resid = io_req_size;
3535 	} else {
3536 		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
3537 		    0, 0, (int)oldEOF, (int)newEOF, 0);
3538 
3539 		io_resid = 0;
3540 	}
3541 	if (flags & IO_PASSIVE) {
3542 		bflag = CL_PASSIVE;
3543 	} else {
3544 		bflag = 0;
3545 	}
3546 	if (flags & IO_NOCACHE) {
3547 		bflag |= CL_NOCACHE;
3548 	}
3549 
3550 	if (flags & IO_SKIP_ENCRYPTION) {
3551 		bflag |= CL_ENCRYPTED;
3552 	}
3553 
3554 	zero_cnt  = 0;
3555 	zero_cnt1 = 0;
3556 	zero_off  = 0;
3557 	zero_off1 = 0;
3558 
3559 	max_io_size = cluster_max_io_size(vp->v_mount, CL_WRITE);
3560 
3561 	if (flags & IO_HEADZEROFILL) {
3562 		/*
3563 		 * some filesystems (HFS is one) don't support unallocated holes within a file...
3564 		 * so we zero fill the intervening space between the old EOF and the offset
3565 		 * where the next chunk of real data begins.... ftruncate will also use this
3566 		 * routine to zero fill to the new EOF when growing a file... in this case, the
3567 		 * uio structure will not be provided
3568 		 */
3569 		if (uio) {
3570 			if (headOff < uio->uio_offset) {
3571 				zero_cnt = uio->uio_offset - headOff;
3572 				zero_off = headOff;
3573 			}
3574 		} else if (headOff < newEOF) {
3575 			zero_cnt = newEOF - headOff;
3576 			zero_off = headOff;
3577 		}
3578 	} else {
3579 		if (uio && uio->uio_offset > oldEOF) {
3580 			zero_off = uio->uio_offset & ~PAGE_MASK_64;
3581 
3582 			if (zero_off >= oldEOF) {
3583 				zero_cnt = uio->uio_offset - zero_off;
3584 
3585 				flags |= IO_HEADZEROFILL;
3586 			}
3587 		}
3588 	}
3589 	if (flags & IO_TAILZEROFILL) {
3590 		if (uio) {
3591 			zero_off1 = uio->uio_offset + io_req_size;
3592 
3593 			if (zero_off1 < tailOff) {
3594 				zero_cnt1 = tailOff - zero_off1;
3595 			}
3596 		}
3597 	} else {
3598 		if (uio && newEOF > oldEOF) {
3599 			zero_off1 = uio->uio_offset + io_req_size;
3600 
3601 			if (zero_off1 == newEOF && (zero_off1 & PAGE_MASK_64)) {
3602 				zero_cnt1 = PAGE_SIZE_64 - (zero_off1 & PAGE_MASK_64);
3603 
3604 				flags |= IO_TAILZEROFILL;
3605 			}
3606 		}
3607 	}
3608 	if (zero_cnt == 0 && uio == (struct uio *) 0) {
3609 		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END,
3610 		    retval, 0, 0, 0, 0);
3611 		return 0;
3612 	}
3613 	if (uio) {
3614 		write_off = uio->uio_offset;
3615 		write_cnt = (int)uio_resid(uio);
3616 		/*
3617 		 * delay updating the sequential write info
3618 		 * in the control block until we've obtained
3619 		 * the lock for it
3620 		 */
3621 		first_pass = TRUE;
3622 	}
3623 	while ((total_size = (io_resid + zero_cnt + zero_cnt1)) && retval == 0) {
3624 		/*
3625 		 * for this iteration of the loop, figure out where our starting point is
3626 		 */
3627 		if (zero_cnt) {
3628 			start_offset = (int)(zero_off & PAGE_MASK_64);
3629 			upl_f_offset = zero_off - start_offset;
3630 		} else if (io_resid) {
3631 			start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
3632 			upl_f_offset = uio->uio_offset - start_offset;
3633 		} else {
3634 			start_offset = (int)(zero_off1 & PAGE_MASK_64);
3635 			upl_f_offset = zero_off1 - start_offset;
3636 		}
3637 		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 46)) | DBG_FUNC_NONE,
3638 		    (int)zero_off, (int)zero_cnt, (int)zero_off1, (int)zero_cnt1, 0);
3639 
3640 		if (total_size > max_io_size) {
3641 			total_size = max_io_size;
3642 		}
3643 
3644 		cl.b_addr = (daddr64_t)(upl_f_offset / PAGE_SIZE_64);
3645 
3646 		if (uio && ((flags & (IO_SYNC | IO_HEADZEROFILL | IO_TAILZEROFILL)) == 0)) {
3647 			/*
3648 			 * assumption... total_size <= io_resid
3649 			 * because IO_HEADZEROFILL and IO_TAILZEROFILL not set
3650 			 */
3651 			if ((start_offset + total_size) > max_io_size) {
3652 				total_size = max_io_size - start_offset;
3653 			}
3654 			xfer_resid = (int)total_size;
3655 
3656 			retval = cluster_copy_ubc_data_internal(vp, uio, &xfer_resid, 1, 1);
3657 
3658 			if (retval) {
3659 				break;
3660 			}
3661 
3662 			io_resid    -= (total_size - xfer_resid);
3663 			total_size   = xfer_resid;
3664 			start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
3665 			upl_f_offset = uio->uio_offset - start_offset;
3666 
3667 			if (total_size == 0) {
3668 				if (start_offset) {
3669 					/*
3670 					 * the write did not finish on a page boundary
3671 					 * which will leave upl_f_offset pointing to the
3672 					 * beginning of the last page written instead of
3673 					 * the page beyond it... bump it in this case
3674 					 * so that the cluster code records the last page
3675 					 * written as dirty
3676 					 */
3677 					upl_f_offset += PAGE_SIZE_64;
3678 				}
3679 				upl_size = 0;
3680 
3681 				goto check_cluster;
3682 			}
3683 		}
3684 		/*
3685 		 * compute the size of the upl needed to encompass
3686 		 * the requested write... limit each call to cluster_io
3687 		 * to the maximum UPL size... cluster_io will clip if
3688 		 * this exceeds the maximum io_size for the device,
3689 		 * make sure to account for
3690 		 * a starting offset that's not page aligned
3691 		 */
3692 		upl_size = (start_offset + total_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
3693 
3694 		if (upl_size > max_io_size) {
3695 			upl_size = max_io_size;
3696 		}
3697 
3698 		pages_in_upl = (int)(upl_size / PAGE_SIZE);
3699 		io_size      = (int)(upl_size - start_offset);
3700 
3701 		if ((long long)io_size > total_size) {
3702 			io_size = (int)total_size;
3703 		}
3704 
3705 		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_START, upl_size, io_size, total_size, 0, 0);
3706 
3707 
3708 		/*
3709 		 * Gather the pages from the buffer cache.
3710 		 * The UPL_WILL_MODIFY flag lets the UPL subsystem know
3711 		 * that we intend to modify these pages.
3712 		 */
3713 		kret = ubc_create_upl_kernel(vp,
3714 		    upl_f_offset,
3715 		    (int)upl_size,
3716 		    &upl,
3717 		    &pl,
3718 		    UPL_SET_LITE | ((uio != NULL && (uio->uio_flags & UIO_FLAGS_IS_COMPRESSED_FILE)) ? 0 : UPL_WILL_MODIFY),
3719 		    VM_KERN_MEMORY_FILE);
3720 		if (kret != KERN_SUCCESS) {
3721 			panic("cluster_write_copy: failed to get pagelist");
3722 		}
3723 
3724 		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_END,
3725 		    upl, (int)upl_f_offset, start_offset, 0, 0);
3726 
3727 		if (start_offset && upl_f_offset < oldEOF && !upl_valid_page(pl, 0)) {
3728 			int   read_size;
3729 
3730 			/*
3731 			 * we're starting in the middle of the first page of the upl
3732 			 * and the page isn't currently valid, so we're going to have
3733 			 * to read it in first... this is a synchronous operation
3734 			 */
3735 			read_size = PAGE_SIZE;
3736 
3737 			if ((upl_f_offset + read_size) > oldEOF) {
3738 				read_size = (int)(oldEOF - upl_f_offset);
3739 			}
3740 
3741 			retval = cluster_io(vp, upl, 0, upl_f_offset, read_size,
3742 			    CL_READ | bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
3743 			if (retval) {
3744 				/*
3745 				 * we had an error during the read which causes us to abort
3746 				 * the current cluster_write request... before we do, we need
3747 				 * to release the rest of the pages in the upl without modifying
3748 				 * there state and mark the failed page in error
3749 				 */
3750 				ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
3751 
3752 				if (upl_size > PAGE_SIZE) {
3753 					ubc_upl_abort_range(upl, 0, (upl_size_t)upl_size,
3754 					    UPL_ABORT_FREE_ON_EMPTY);
3755 				}
3756 
3757 				KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
3758 				    upl, 0, 0, retval, 0);
3759 				break;
3760 			}
3761 		}
3762 		if ((start_offset == 0 || upl_size > PAGE_SIZE) && ((start_offset + io_size) & PAGE_MASK)) {
3763 			/*
3764 			 * the last offset we're writing to in this upl does not end on a page
3765 			 * boundary... if it's not beyond the old EOF, then we'll also need to
3766 			 * pre-read this page in if it isn't already valid
3767 			 */
3768 			upl_offset = upl_size - PAGE_SIZE;
3769 
3770 			if ((upl_f_offset + start_offset + io_size) < oldEOF &&
3771 			    !upl_valid_page(pl, (int)(upl_offset / PAGE_SIZE))) {
3772 				int   read_size;
3773 
3774 				read_size = PAGE_SIZE;
3775 
3776 				if ((off_t)(upl_f_offset + upl_offset + read_size) > oldEOF) {
3777 					read_size = (int)(oldEOF - (upl_f_offset + upl_offset));
3778 				}
3779 
3780 				retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, read_size,
3781 				    CL_READ | bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
3782 				if (retval) {
3783 					/*
3784 					 * we had an error during the read which causes us to abort
3785 					 * the current cluster_write request... before we do, we
3786 					 * need to release the rest of the pages in the upl without
3787 					 * modifying there state and mark the failed page in error
3788 					 */
3789 					ubc_upl_abort_range(upl, (upl_offset_t)upl_offset, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
3790 
3791 					if (upl_size > PAGE_SIZE) {
3792 						ubc_upl_abort_range(upl, 0, (upl_size_t)upl_size, UPL_ABORT_FREE_ON_EMPTY);
3793 					}
3794 
3795 					KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
3796 					    upl, 0, 0, retval, 0);
3797 					break;
3798 				}
3799 			}
3800 		}
3801 		xfer_resid = io_size;
3802 		io_offset = start_offset;
3803 
3804 		while (zero_cnt && xfer_resid) {
3805 			if (zero_cnt < (long long)xfer_resid) {
3806 				bytes_to_zero = (int)zero_cnt;
3807 			} else {
3808 				bytes_to_zero = xfer_resid;
3809 			}
3810 
3811 			bytes_to_zero = cluster_zero_range(upl, pl, flags, io_offset, zero_off, upl_f_offset, bytes_to_zero);
3812 
3813 			xfer_resid -= bytes_to_zero;
3814 			zero_cnt   -= bytes_to_zero;
3815 			zero_off   += bytes_to_zero;
3816 			io_offset  += bytes_to_zero;
3817 		}
3818 		if (xfer_resid && io_resid) {
3819 			u_int32_t  io_requested;
3820 
3821 			bytes_to_move = min(io_resid, xfer_resid);
3822 			io_requested = bytes_to_move;
3823 
3824 			retval = cluster_copy_upl_data(uio, upl, io_offset, (int *)&io_requested);
3825 
3826 			if (retval) {
3827 				ubc_upl_abort_range(upl, 0, (upl_size_t)upl_size, UPL_ABORT_FREE_ON_EMPTY);
3828 
3829 				KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
3830 				    upl, 0, 0, retval, 0);
3831 			} else {
3832 				io_resid   -= bytes_to_move;
3833 				xfer_resid -= bytes_to_move;
3834 				io_offset  += bytes_to_move;
3835 			}
3836 		}
3837 		while (xfer_resid && zero_cnt1 && retval == 0) {
3838 			if (zero_cnt1 < (long long)xfer_resid) {
3839 				bytes_to_zero = (int)zero_cnt1;
3840 			} else {
3841 				bytes_to_zero = xfer_resid;
3842 			}
3843 
3844 			bytes_to_zero = cluster_zero_range(upl, pl, flags, io_offset, zero_off1, upl_f_offset, bytes_to_zero);
3845 
3846 			xfer_resid -= bytes_to_zero;
3847 			zero_cnt1  -= bytes_to_zero;
3848 			zero_off1  += bytes_to_zero;
3849 			io_offset  += bytes_to_zero;
3850 		}
3851 		if (retval == 0) {
3852 			int do_zeroing = 1;
3853 
3854 			io_size += start_offset;
3855 
3856 			/* Force more restrictive zeroing behavior only on APFS */
3857 			if ((vnode_tag(vp) == VT_APFS) && (newEOF < oldEOF)) {
3858 				do_zeroing = 0;
3859 			}
3860 
3861 			if (do_zeroing && (upl_f_offset + io_size) >= newEOF && (u_int)io_size < upl_size) {
3862 				/*
3863 				 * if we're extending the file with this write
3864 				 * we'll zero fill the rest of the page so that
3865 				 * if the file gets extended again in such a way as to leave a
3866 				 * hole starting at this EOF, we'll have zero's in the correct spot
3867 				 */
3868 				cluster_zero(upl, io_size, (int)(upl_size - io_size), NULL);
3869 			}
3870 			/*
3871 			 * release the upl now if we hold one since...
3872 			 * 1) pages in it may be present in the sparse cluster map
3873 			 *    and may span 2 separate buckets there... if they do and
3874 			 *    we happen to have to flush a bucket to make room and it intersects
3875 			 *    this upl, a deadlock may result on page BUSY
3876 			 * 2) we're delaying the I/O... from this point forward we're just updating
3877 			 *    the cluster state... no need to hold the pages, so commit them
3878 			 * 3) IO_SYNC is set...
3879 			 *    because we had to ask for a UPL that provides currenty non-present pages, the
3880 			 *    UPL has been automatically set to clear the dirty flags (both software and hardware)
3881 			 *    upon committing it... this is not the behavior we want since it's possible for
3882 			 *    pages currently present as part of a mapped file to be dirtied while the I/O is in flight.
3883 			 *    we'll pick these pages back up later with the correct behavior specified.
3884 			 * 4) we don't want to hold pages busy in a UPL and then block on the cluster lock... if a flush
3885 			 *    of this vnode is in progress, we will deadlock if the pages being flushed intersect the pages
3886 			 *    we hold since the flushing context is holding the cluster lock.
3887 			 */
3888 			ubc_upl_commit_range(upl, 0, (upl_size_t)upl_size,
3889 			    UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
3890 check_cluster:
3891 			/*
3892 			 * calculate the last logical block number
3893 			 * that this delayed I/O encompassed
3894 			 */
3895 			cl.e_addr = (daddr64_t)((upl_f_offset + (off_t)upl_size) / PAGE_SIZE_64);
3896 
3897 			if (flags & IO_SYNC) {
3898 				/*
3899 				 * if the IO_SYNC flag is set than we need to bypass
3900 				 * any clustering and immediately issue the I/O
3901 				 *
3902 				 * we don't hold the lock at this point
3903 				 *
3904 				 * we've already dropped the current upl, so pick it back up with COPYOUT_FROM set
3905 				 * so that we correctly deal with a change in state of the hardware modify bit...
3906 				 * we do this via cluster_push_now... by passing along the IO_SYNC flag, we force
3907 				 * cluster_push_now to wait until all the I/Os have completed... cluster_push_now is also
3908 				 * responsible for generating the correct sized I/O(s)
3909 				 */
3910 				retval = cluster_push_now(vp, &cl, newEOF, flags, callback, callback_arg, FALSE);
3911 			} else {
3912 				boolean_t defer_writes = FALSE;
3913 
3914 				if (vfs_flags(vp->v_mount) & MNT_DEFWRITE) {
3915 					defer_writes = TRUE;
3916 				}
3917 
3918 				cluster_update_state_internal(vp, &cl, flags, defer_writes, &first_pass,
3919 				    write_off, write_cnt, newEOF, callback, callback_arg, FALSE);
3920 			}
3921 		}
3922 	}
3923 	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END, retval, 0, io_resid, 0, 0);
3924 
3925 	return retval;
3926 }
3927 
3928 
3929 
3930 int
cluster_read(vnode_t vp,struct uio * uio,off_t filesize,int xflags)3931 cluster_read(vnode_t vp, struct uio *uio, off_t filesize, int xflags)
3932 {
3933 	return cluster_read_ext(vp, uio, filesize, xflags, NULL, NULL);
3934 }
3935 
3936 
3937 int
cluster_read_ext(vnode_t vp,struct uio * uio,off_t filesize,int xflags,int (* callback)(buf_t,void *),void * callback_arg)3938 cluster_read_ext(vnode_t vp, struct uio *uio, off_t filesize, int xflags, int (*callback)(buf_t, void *), void *callback_arg)
3939 {
3940 	int             retval = 0;
3941 	int             flags;
3942 	user_ssize_t    cur_resid;
3943 	u_int32_t       io_size;
3944 	u_int32_t       read_length = 0;
3945 	int             read_type = IO_COPY;
3946 
3947 	flags = xflags;
3948 
3949 	if (vp->v_flag & VNOCACHE_DATA) {
3950 		flags |= IO_NOCACHE;
3951 	}
3952 	if ((vp->v_flag & VRAOFF) || speculative_reads_disabled) {
3953 		flags |= IO_RAOFF;
3954 	}
3955 
3956 	if (flags & IO_SKIP_ENCRYPTION) {
3957 		flags |= IO_ENCRYPTED;
3958 	}
3959 
3960 	/*
3961 	 * do a read through the cache if one of the following is true....
3962 	 *   NOCACHE is not true
3963 	 *   the uio request doesn't target USERSPACE
3964 	 * Alternatively, if IO_ENCRYPTED is set, then we want to bypass the cache as well.
3965 	 * Reading encrypted data from a CP filesystem should never result in the data touching
3966 	 * the UBC.
3967 	 *
3968 	 * otherwise, find out if we want the direct or contig variant for
3969 	 * the first vector in the uio request
3970 	 */
3971 	if (((flags & IO_NOCACHE) && UIO_SEG_IS_USER_SPACE(uio->uio_segflg)) || (flags & IO_ENCRYPTED)) {
3972 		retval = cluster_io_type(uio, &read_type, &read_length, 0);
3973 	}
3974 
3975 	while ((cur_resid = uio_resid(uio)) && uio->uio_offset < filesize && retval == 0) {
3976 		switch (read_type) {
3977 		case IO_COPY:
3978 			/*
3979 			 * make sure the uio_resid isn't too big...
3980 			 * internally, we want to handle all of the I/O in
3981 			 * chunk sizes that fit in a 32 bit int
3982 			 */
3983 			if (cur_resid > (user_ssize_t)(MAX_IO_REQUEST_SIZE)) {
3984 				io_size = MAX_IO_REQUEST_SIZE;
3985 			} else {
3986 				io_size = (u_int32_t)cur_resid;
3987 			}
3988 
3989 			retval = cluster_read_copy(vp, uio, io_size, filesize, flags, callback, callback_arg);
3990 			break;
3991 
3992 		case IO_DIRECT:
3993 			retval = cluster_read_direct(vp, uio, filesize, &read_type, &read_length, flags, callback, callback_arg);
3994 			break;
3995 
3996 		case IO_CONTIG:
3997 			retval = cluster_read_contig(vp, uio, filesize, &read_type, &read_length, callback, callback_arg, flags);
3998 			break;
3999 
4000 		case IO_UNKNOWN:
4001 			retval = cluster_io_type(uio, &read_type, &read_length, 0);
4002 			break;
4003 		}
4004 	}
4005 	return retval;
4006 }
4007 
4008 
4009 
4010 static void
cluster_read_upl_release(upl_t upl,int start_pg,int last_pg,int take_reference)4011 cluster_read_upl_release(upl_t upl, int start_pg, int last_pg, int take_reference)
4012 {
4013 	int range;
4014 	int abort_flags = UPL_ABORT_FREE_ON_EMPTY;
4015 
4016 	if ((range = last_pg - start_pg)) {
4017 		if (take_reference) {
4018 			abort_flags |= UPL_ABORT_REFERENCE;
4019 		}
4020 
4021 		ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, range * PAGE_SIZE, abort_flags);
4022 	}
4023 }
4024 
4025 
4026 static int
cluster_read_copy(vnode_t vp,struct uio * uio,u_int32_t io_req_size,off_t filesize,int flags,int (* callback)(buf_t,void *),void * callback_arg)4027 cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t filesize, int flags, int (*callback)(buf_t, void *), void *callback_arg)
4028 {
4029 	upl_page_info_t *pl;
4030 	upl_t            upl;
4031 	vm_offset_t      upl_offset;
4032 	u_int32_t        upl_size;
4033 	off_t            upl_f_offset;
4034 	int              start_offset;
4035 	int              start_pg;
4036 	int              last_pg;
4037 	int              uio_last = 0;
4038 	int              pages_in_upl;
4039 	off_t            max_size;
4040 	off_t            last_ioread_offset;
4041 	off_t            last_request_offset;
4042 	kern_return_t    kret;
4043 	int              error  = 0;
4044 	int              retval = 0;
4045 	u_int32_t        size_of_prefetch;
4046 	u_int32_t        xsize;
4047 	u_int32_t        io_size;
4048 	u_int32_t        max_rd_size;
4049 	u_int32_t        max_io_size;
4050 	u_int32_t        max_prefetch;
4051 	u_int            rd_ahead_enabled = 1;
4052 	u_int            prefetch_enabled = 1;
4053 	struct cl_readahead *   rap;
4054 	struct clios            iostate;
4055 	struct cl_extent        extent;
4056 	int              bflag;
4057 	int              take_reference = 1;
4058 	int              policy = IOPOL_DEFAULT;
4059 	boolean_t        iolock_inited = FALSE;
4060 
4061 	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_START,
4062 	    (int)uio->uio_offset, io_req_size, (int)filesize, flags, 0);
4063 
4064 	if (flags & IO_ENCRYPTED) {
4065 		panic("encrypted blocks will hit UBC!");
4066 	}
4067 
4068 	policy = throttle_get_io_policy(NULL);
4069 
4070 	if (policy == THROTTLE_LEVEL_TIER3 || policy == THROTTLE_LEVEL_TIER2 || (flags & IO_NOCACHE)) {
4071 		take_reference = 0;
4072 	}
4073 
4074 	if (flags & IO_PASSIVE) {
4075 		bflag = CL_PASSIVE;
4076 	} else {
4077 		bflag = 0;
4078 	}
4079 
4080 	if (flags & IO_NOCACHE) {
4081 		bflag |= CL_NOCACHE;
4082 	}
4083 
4084 	if (flags & IO_SKIP_ENCRYPTION) {
4085 		bflag |= CL_ENCRYPTED;
4086 	}
4087 
4088 	max_io_size = cluster_max_io_size(vp->v_mount, CL_READ);
4089 	max_prefetch = MAX_PREFETCH(vp, max_io_size, disk_conditioner_mount_is_ssd(vp->v_mount));
4090 	max_rd_size = max_prefetch;
4091 
4092 	last_request_offset = uio->uio_offset + io_req_size;
4093 
4094 	if (last_request_offset > filesize) {
4095 		last_request_offset = filesize;
4096 	}
4097 
4098 	if ((flags & (IO_RAOFF | IO_NOCACHE)) || ((last_request_offset & ~PAGE_MASK_64) == (uio->uio_offset & ~PAGE_MASK_64))) {
4099 		rd_ahead_enabled = 0;
4100 		rap = NULL;
4101 	} else {
4102 		if (cluster_is_throttled(vp)) {
4103 			/*
4104 			 * we're in the throttle window, at the very least
4105 			 * we want to limit the size of the I/O we're about
4106 			 * to issue
4107 			 */
4108 			rd_ahead_enabled = 0;
4109 			prefetch_enabled = 0;
4110 
4111 			max_rd_size = THROTTLE_MAX_IOSIZE;
4112 		}
4113 		if ((rap = cluster_get_rap(vp)) == NULL) {
4114 			rd_ahead_enabled = 0;
4115 		} else {
4116 			extent.b_addr = uio->uio_offset / PAGE_SIZE_64;
4117 			extent.e_addr = (last_request_offset - 1) / PAGE_SIZE_64;
4118 		}
4119 	}
4120 	if (rap != NULL && rap->cl_ralen && (rap->cl_lastr == extent.b_addr || (rap->cl_lastr + 1) == extent.b_addr)) {
4121 		/*
4122 		 * determine if we already have a read-ahead in the pipe courtesy of the
4123 		 * last read systemcall that was issued...
4124 		 * if so, pick up it's extent to determine where we should start
4125 		 * with respect to any read-ahead that might be necessary to
4126 		 * garner all the data needed to complete this read systemcall
4127 		 */
4128 		last_ioread_offset = (rap->cl_maxra * PAGE_SIZE_64) + PAGE_SIZE_64;
4129 
4130 		if (last_ioread_offset < uio->uio_offset) {
4131 			last_ioread_offset = (off_t)0;
4132 		} else if (last_ioread_offset > last_request_offset) {
4133 			last_ioread_offset = last_request_offset;
4134 		}
4135 	} else {
4136 		last_ioread_offset = (off_t)0;
4137 	}
4138 
4139 	while (io_req_size && uio->uio_offset < filesize && retval == 0) {
4140 		max_size = filesize - uio->uio_offset;
4141 		bool leftover_upl_aborted = false;
4142 
4143 		if ((off_t)(io_req_size) < max_size) {
4144 			io_size = io_req_size;
4145 		} else {
4146 			io_size = (u_int32_t)max_size;
4147 		}
4148 
4149 		if (!(flags & IO_NOCACHE)) {
4150 			while (io_size) {
4151 				u_int32_t io_resid;
4152 				u_int32_t io_requested;
4153 
4154 				/*
4155 				 * if we keep finding the pages we need already in the cache, then
4156 				 * don't bother to call cluster_read_prefetch since it costs CPU cycles
4157 				 * to determine that we have all the pages we need... once we miss in
4158 				 * the cache and have issued an I/O, than we'll assume that we're likely
4159 				 * to continue to miss in the cache and it's to our advantage to try and prefetch
4160 				 */
4161 				if (last_request_offset && last_ioread_offset && (size_of_prefetch = (u_int32_t)(last_request_offset - last_ioread_offset))) {
4162 					if ((last_ioread_offset - uio->uio_offset) <= max_rd_size && prefetch_enabled) {
4163 						/*
4164 						 * we've already issued I/O for this request and
4165 						 * there's still work to do and
4166 						 * our prefetch stream is running dry, so issue a
4167 						 * pre-fetch I/O... the I/O latency will overlap
4168 						 * with the copying of the data
4169 						 */
4170 						if (size_of_prefetch > max_rd_size) {
4171 							size_of_prefetch = max_rd_size;
4172 						}
4173 
4174 						size_of_prefetch = cluster_read_prefetch(vp, last_ioread_offset, size_of_prefetch, filesize, callback, callback_arg, bflag);
4175 
4176 						last_ioread_offset += (off_t)(size_of_prefetch * PAGE_SIZE);
4177 
4178 						if (last_ioread_offset > last_request_offset) {
4179 							last_ioread_offset = last_request_offset;
4180 						}
4181 					}
4182 				}
4183 				/*
4184 				 * limit the size of the copy we're about to do so that
4185 				 * we can notice that our I/O pipe is running dry and
4186 				 * get the next I/O issued before it does go dry
4187 				 */
4188 				if (last_ioread_offset && io_size > (max_io_size / 4)) {
4189 					io_resid = (max_io_size / 4);
4190 				} else {
4191 					io_resid = io_size;
4192 				}
4193 
4194 				io_requested = io_resid;
4195 
4196 				retval = cluster_copy_ubc_data_internal(vp, uio, (int *)&io_resid, 0, take_reference);
4197 
4198 				xsize = io_requested - io_resid;
4199 
4200 				io_size -= xsize;
4201 				io_req_size -= xsize;
4202 
4203 				if (retval || io_resid) {
4204 					/*
4205 					 * if we run into a real error or
4206 					 * a page that is not in the cache
4207 					 * we need to leave streaming mode
4208 					 */
4209 					break;
4210 				}
4211 
4212 				if (rd_ahead_enabled && (io_size == 0 || last_ioread_offset == last_request_offset)) {
4213 					/*
4214 					 * we're already finished the I/O for this read request
4215 					 * let's see if we should do a read-ahead
4216 					 */
4217 					cluster_read_ahead(vp, &extent, filesize, rap, callback, callback_arg, bflag);
4218 				}
4219 			}
4220 			if (retval) {
4221 				break;
4222 			}
4223 			if (io_size == 0) {
4224 				if (rap != NULL) {
4225 					if (extent.e_addr < rap->cl_lastr) {
4226 						rap->cl_maxra = 0;
4227 					}
4228 					rap->cl_lastr = extent.e_addr;
4229 				}
4230 				break;
4231 			}
4232 			/*
4233 			 * recompute max_size since cluster_copy_ubc_data_internal
4234 			 * may have advanced uio->uio_offset
4235 			 */
4236 			max_size = filesize - uio->uio_offset;
4237 		}
4238 
4239 		iostate.io_completed = 0;
4240 		iostate.io_issued = 0;
4241 		iostate.io_error = 0;
4242 		iostate.io_wanted = 0;
4243 
4244 		if ((flags & IO_RETURN_ON_THROTTLE)) {
4245 			if (cluster_is_throttled(vp) == THROTTLE_NOW) {
4246 				if (!cluster_io_present_in_BC(vp, uio->uio_offset)) {
4247 					/*
4248 					 * we're in the throttle window and at least 1 I/O
4249 					 * has already been issued by a throttleable thread
4250 					 * in this window, so return with EAGAIN to indicate
4251 					 * to the FS issuing the cluster_read call that it
4252 					 * should now throttle after dropping any locks
4253 					 */
4254 					throttle_info_update_by_mount(vp->v_mount);
4255 
4256 					retval = EAGAIN;
4257 					break;
4258 				}
4259 			}
4260 		}
4261 
4262 		/*
4263 		 * compute the size of the upl needed to encompass
4264 		 * the requested read... limit each call to cluster_io
4265 		 * to the maximum UPL size... cluster_io will clip if
4266 		 * this exceeds the maximum io_size for the device,
4267 		 * make sure to account for
4268 		 * a starting offset that's not page aligned
4269 		 */
4270 		start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
4271 		upl_f_offset = uio->uio_offset - (off_t)start_offset;
4272 
4273 		if (io_size > max_rd_size) {
4274 			io_size = max_rd_size;
4275 		}
4276 
4277 		upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
4278 
4279 		if (flags & IO_NOCACHE) {
4280 			if (upl_size > max_io_size) {
4281 				upl_size = max_io_size;
4282 			}
4283 		} else {
4284 			if (upl_size > max_io_size / 4) {
4285 				upl_size = max_io_size / 4;
4286 				upl_size &= ~PAGE_MASK;
4287 
4288 				if (upl_size == 0) {
4289 					upl_size = PAGE_SIZE;
4290 				}
4291 			}
4292 		}
4293 		pages_in_upl = upl_size / PAGE_SIZE;
4294 
4295 		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_START,
4296 		    upl, (int)upl_f_offset, upl_size, start_offset, 0);
4297 
4298 		kret = ubc_create_upl_kernel(vp,
4299 		    upl_f_offset,
4300 		    upl_size,
4301 		    &upl,
4302 		    &pl,
4303 		    UPL_FILE_IO | UPL_SET_LITE,
4304 		    VM_KERN_MEMORY_FILE);
4305 		if (kret != KERN_SUCCESS) {
4306 			panic("cluster_read_copy: failed to get pagelist");
4307 		}
4308 
4309 		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_END,
4310 		    upl, (int)upl_f_offset, upl_size, start_offset, 0);
4311 
4312 		/*
4313 		 * scan from the beginning of the upl looking for the first
4314 		 * non-valid page.... this will become the first page in
4315 		 * the request we're going to make to 'cluster_io'... if all
4316 		 * of the pages are valid, we won't call through to 'cluster_io'
4317 		 */
4318 		for (start_pg = 0; start_pg < pages_in_upl; start_pg++) {
4319 			if (!upl_valid_page(pl, start_pg)) {
4320 				break;
4321 			}
4322 		}
4323 
4324 		/*
4325 		 * scan from the starting invalid page looking for a valid
4326 		 * page before the end of the upl is reached, if we
4327 		 * find one, then it will be the last page of the request to
4328 		 * 'cluster_io'
4329 		 */
4330 		for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
4331 			if (upl_valid_page(pl, last_pg)) {
4332 				break;
4333 			}
4334 		}
4335 
4336 		if (start_pg < last_pg) {
4337 			/*
4338 			 * we found a range of 'invalid' pages that must be filled
4339 			 * if the last page in this range is the last page of the file
4340 			 * we may have to clip the size of it to keep from reading past
4341 			 * the end of the last physical block associated with the file
4342 			 */
4343 			if (iolock_inited == FALSE) {
4344 				lck_mtx_init(&iostate.io_mtxp, &cl_mtx_grp, LCK_ATTR_NULL);
4345 
4346 				iolock_inited = TRUE;
4347 			}
4348 			upl_offset = start_pg * PAGE_SIZE;
4349 			io_size    = (last_pg - start_pg) * PAGE_SIZE;
4350 
4351 			if ((off_t)(upl_f_offset + upl_offset + io_size) > filesize) {
4352 				io_size = (u_int32_t)(filesize - (upl_f_offset + upl_offset));
4353 			}
4354 
4355 			/*
4356 			 * Find out if this needs verification, we'll have to manage the UPL
4357 			 * diffrently if so. Note that this call only lets us know if
4358 			 * verification is enabled on this mount point, the actual verification
4359 			 * is performed in the File system.
4360 			 */
4361 			size_t verify_block_size = 0;
4362 			if ((VNOP_VERIFY(vp, start_offset, NULL, 0, &verify_block_size, NULL, VNODE_VERIFY_DEFAULT, NULL) == 0) /* && verify_block_size */) {
4363 				for (uio_last = last_pg; uio_last < pages_in_upl; uio_last++) {
4364 					if (!upl_valid_page(pl, uio_last)) {
4365 						break;
4366 					}
4367 				}
4368 				if (uio_last < pages_in_upl) {
4369 					/*
4370 					 * there were some invalid pages beyond the valid pages
4371 					 * that we didn't issue an I/O for, just release them
4372 					 * unchanged now, so that any prefetch/readahed can
4373 					 * include them
4374 					 */
4375 					ubc_upl_abort_range(upl, uio_last * PAGE_SIZE,
4376 					    (pages_in_upl - uio_last) * PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
4377 					leftover_upl_aborted = true;
4378 				}
4379 			}
4380 
4381 			/*
4382 			 * issue an asynchronous read to cluster_io
4383 			 */
4384 
4385 			error = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset,
4386 			    io_size, CL_READ | CL_ASYNC | bflag, (buf_t)NULL, &iostate, callback, callback_arg);
4387 
4388 			if (rap) {
4389 				if (extent.e_addr < rap->cl_maxra) {
4390 					/*
4391 					 * we've just issued a read for a block that should have been
4392 					 * in the cache courtesy of the read-ahead engine... something
4393 					 * has gone wrong with the pipeline, so reset the read-ahead
4394 					 * logic which will cause us to restart from scratch
4395 					 */
4396 					rap->cl_maxra = 0;
4397 				}
4398 			}
4399 		}
4400 		if (error == 0) {
4401 			/*
4402 			 * if the read completed successfully, or there was no I/O request
4403 			 * issued, than copy the data into user land via 'cluster_upl_copy_data'
4404 			 * we'll first add on any 'valid'
4405 			 * pages that were present in the upl when we acquired it.
4406 			 */
4407 			u_int  val_size;
4408 
4409 			if (!leftover_upl_aborted) {
4410 				for (uio_last = last_pg; uio_last < pages_in_upl; uio_last++) {
4411 					if (!upl_valid_page(pl, uio_last)) {
4412 						break;
4413 					}
4414 				}
4415 				if (uio_last < pages_in_upl) {
4416 					/*
4417 					 * there were some invalid pages beyond the valid pages
4418 					 * that we didn't issue an I/O for, just release them
4419 					 * unchanged now, so that any prefetch/readahed can
4420 					 * include them
4421 					 */
4422 					ubc_upl_abort_range(upl, uio_last * PAGE_SIZE,
4423 					    (pages_in_upl - uio_last) * PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
4424 				}
4425 			}
4426 
4427 			/*
4428 			 * compute size to transfer this round,  if io_req_size is
4429 			 * still non-zero after this attempt, we'll loop around and
4430 			 * set up for another I/O.
4431 			 */
4432 			val_size = (uio_last * PAGE_SIZE) - start_offset;
4433 
4434 			if (val_size > max_size) {
4435 				val_size = (u_int)max_size;
4436 			}
4437 
4438 			if (val_size > io_req_size) {
4439 				val_size = io_req_size;
4440 			}
4441 
4442 			if ((uio->uio_offset + val_size) > last_ioread_offset) {
4443 				last_ioread_offset = uio->uio_offset + val_size;
4444 			}
4445 
4446 			if ((size_of_prefetch = (u_int32_t)(last_request_offset - last_ioread_offset)) && prefetch_enabled) {
4447 				if ((last_ioread_offset - (uio->uio_offset + val_size)) <= upl_size) {
4448 					/*
4449 					 * if there's still I/O left to do for this request, and...
4450 					 * we're not in hard throttle mode, and...
4451 					 * we're close to using up the previous prefetch, then issue a
4452 					 * new pre-fetch I/O... the I/O latency will overlap
4453 					 * with the copying of the data
4454 					 */
4455 					if (size_of_prefetch > max_rd_size) {
4456 						size_of_prefetch = max_rd_size;
4457 					}
4458 
4459 					size_of_prefetch = cluster_read_prefetch(vp, last_ioread_offset, size_of_prefetch, filesize, callback, callback_arg, bflag);
4460 
4461 					last_ioread_offset += (off_t)(size_of_prefetch * PAGE_SIZE);
4462 
4463 					if (last_ioread_offset > last_request_offset) {
4464 						last_ioread_offset = last_request_offset;
4465 					}
4466 				}
4467 			} else if ((uio->uio_offset + val_size) == last_request_offset) {
4468 				/*
4469 				 * this transfer will finish this request, so...
4470 				 * let's try to read ahead if we're in
4471 				 * a sequential access pattern and we haven't
4472 				 * explicitly disabled it
4473 				 */
4474 				if (rd_ahead_enabled) {
4475 					cluster_read_ahead(vp, &extent, filesize, rap, callback, callback_arg, bflag);
4476 				}
4477 
4478 				if (rap != NULL) {
4479 					if (extent.e_addr < rap->cl_lastr) {
4480 						rap->cl_maxra = 0;
4481 					}
4482 					rap->cl_lastr = extent.e_addr;
4483 				}
4484 			}
4485 			if (iolock_inited == TRUE) {
4486 				cluster_iostate_wait(&iostate, 0, "cluster_read_copy");
4487 			}
4488 
4489 			if (iostate.io_error) {
4490 				error = iostate.io_error;
4491 			} else {
4492 				u_int32_t io_requested;
4493 
4494 				io_requested = val_size;
4495 
4496 				retval = cluster_copy_upl_data(uio, upl, start_offset, (int *)&io_requested);
4497 
4498 				io_req_size -= (val_size - io_requested);
4499 			}
4500 		} else {
4501 			if (iolock_inited == TRUE) {
4502 				cluster_iostate_wait(&iostate, 0, "cluster_read_copy");
4503 			}
4504 		}
4505 		if (start_pg < last_pg) {
4506 			/*
4507 			 * compute the range of pages that we actually issued an I/O for
4508 			 * and either commit them as valid if the I/O succeeded
4509 			 * or abort them if the I/O failed or we're not supposed to
4510 			 * keep them in the cache
4511 			 */
4512 			io_size = (last_pg - start_pg) * PAGE_SIZE;
4513 
4514 			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START, upl, start_pg * PAGE_SIZE, io_size, error, 0);
4515 
4516 			if (error || (flags & IO_NOCACHE)) {
4517 				ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, io_size,
4518 				    UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
4519 			} else {
4520 				int     commit_flags = UPL_COMMIT_CLEAR_DIRTY | UPL_COMMIT_FREE_ON_EMPTY;
4521 
4522 				if (take_reference) {
4523 					commit_flags |= UPL_COMMIT_INACTIVATE;
4524 				} else {
4525 					commit_flags |= UPL_COMMIT_SPECULATE;
4526 				}
4527 
4528 				ubc_upl_commit_range(upl, start_pg * PAGE_SIZE, io_size, commit_flags);
4529 			}
4530 			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END, upl, start_pg * PAGE_SIZE, io_size, error, 0);
4531 		}
4532 		if ((last_pg - start_pg) < pages_in_upl) {
4533 			/*
4534 			 * the set of pages that we issued an I/O for did not encompass
4535 			 * the entire upl... so just release these without modifying
4536 			 * their state
4537 			 */
4538 			if (error) {
4539 				if (leftover_upl_aborted) {
4540 					ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, (uio_last - start_pg) * PAGE_SIZE,
4541 					    UPL_ABORT_FREE_ON_EMPTY);
4542 				} else {
4543 					ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
4544 				}
4545 			} else {
4546 				KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START,
4547 				    upl, -1, pages_in_upl - (last_pg - start_pg), 0, 0);
4548 
4549 				/*
4550 				 * handle any valid pages at the beginning of
4551 				 * the upl... release these appropriately
4552 				 */
4553 				cluster_read_upl_release(upl, 0, start_pg, take_reference);
4554 
4555 				/*
4556 				 * handle any valid pages immediately after the
4557 				 * pages we issued I/O for... ... release these appropriately
4558 				 */
4559 				cluster_read_upl_release(upl, last_pg, uio_last, take_reference);
4560 
4561 				KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END, upl, -1, -1, 0, 0);
4562 			}
4563 		}
4564 		if (retval == 0) {
4565 			retval = error;
4566 		}
4567 
4568 		if (io_req_size) {
4569 			if (cluster_is_throttled(vp)) {
4570 				/*
4571 				 * we're in the throttle window, at the very least
4572 				 * we want to limit the size of the I/O we're about
4573 				 * to issue
4574 				 */
4575 				rd_ahead_enabled = 0;
4576 				prefetch_enabled = 0;
4577 				max_rd_size = THROTTLE_MAX_IOSIZE;
4578 			} else {
4579 				if (max_rd_size == THROTTLE_MAX_IOSIZE) {
4580 					/*
4581 					 * coming out of throttled state
4582 					 */
4583 					if (policy != THROTTLE_LEVEL_TIER3 && policy != THROTTLE_LEVEL_TIER2) {
4584 						if (rap != NULL) {
4585 							rd_ahead_enabled = 1;
4586 						}
4587 						prefetch_enabled = 1;
4588 					}
4589 					max_rd_size = max_prefetch;
4590 					last_ioread_offset = 0;
4591 				}
4592 			}
4593 		}
4594 	}
4595 	if (iolock_inited == TRUE) {
4596 		/*
4597 		 * cluster_io returned an error after it
4598 		 * had already issued some I/O.  we need
4599 		 * to wait for that I/O to complete before
4600 		 * we can destroy the iostate mutex...
4601 		 * 'retval' already contains the early error
4602 		 * so no need to pick it up from iostate.io_error
4603 		 */
4604 		cluster_iostate_wait(&iostate, 0, "cluster_read_copy");
4605 
4606 		lck_mtx_destroy(&iostate.io_mtxp, &cl_mtx_grp);
4607 	}
4608 	if (rap != NULL) {
4609 		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
4610 		    (int)uio->uio_offset, io_req_size, rap->cl_lastr, retval, 0);
4611 
4612 		lck_mtx_unlock(&rap->cl_lockr);
4613 	} else {
4614 		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
4615 		    (int)uio->uio_offset, io_req_size, 0, retval, 0);
4616 	}
4617 
4618 	return retval;
4619 }
4620 
4621 /*
4622  * We don't want another read/write lock for every vnode in the system
4623  * so we keep a hash of them here.  There should never be very many of
4624  * these around at any point in time.
4625  */
4626 cl_direct_read_lock_t *
cluster_lock_direct_read(vnode_t vp,lck_rw_type_t type)4627 cluster_lock_direct_read(vnode_t vp, lck_rw_type_t type)
4628 {
4629 	struct cl_direct_read_locks *head
4630 	        = &cl_direct_read_locks[(uintptr_t)vp / sizeof(*vp)
4631 	    % CL_DIRECT_READ_LOCK_BUCKETS];
4632 
4633 	struct cl_direct_read_lock *lck, *new_lck = NULL;
4634 
4635 	for (;;) {
4636 		lck_spin_lock(&cl_direct_read_spin_lock);
4637 
4638 		LIST_FOREACH(lck, head, chain) {
4639 			if (lck->vp == vp) {
4640 				++lck->ref_count;
4641 				lck_spin_unlock(&cl_direct_read_spin_lock);
4642 				if (new_lck) {
4643 					// Someone beat us to it, ditch the allocation
4644 					lck_rw_destroy(&new_lck->rw_lock, &cl_mtx_grp);
4645 					kfree_type(cl_direct_read_lock_t, new_lck);
4646 				}
4647 				lck_rw_lock(&lck->rw_lock, type);
4648 				return lck;
4649 			}
4650 		}
4651 
4652 		if (new_lck) {
4653 			// Use the lock we allocated
4654 			LIST_INSERT_HEAD(head, new_lck, chain);
4655 			lck_spin_unlock(&cl_direct_read_spin_lock);
4656 			lck_rw_lock(&new_lck->rw_lock, type);
4657 			return new_lck;
4658 		}
4659 
4660 		lck_spin_unlock(&cl_direct_read_spin_lock);
4661 
4662 		// Allocate a new lock
4663 		new_lck = kalloc_type(cl_direct_read_lock_t, Z_WAITOK);
4664 		lck_rw_init(&new_lck->rw_lock, &cl_mtx_grp, LCK_ATTR_NULL);
4665 		new_lck->vp = vp;
4666 		new_lck->ref_count = 1;
4667 
4668 		// Got to go round again
4669 	}
4670 }
4671 
4672 void
cluster_unlock_direct_read(cl_direct_read_lock_t * lck)4673 cluster_unlock_direct_read(cl_direct_read_lock_t *lck)
4674 {
4675 	lck_rw_done(&lck->rw_lock);
4676 
4677 	lck_spin_lock(&cl_direct_read_spin_lock);
4678 	if (lck->ref_count == 1) {
4679 		LIST_REMOVE(lck, chain);
4680 		lck_spin_unlock(&cl_direct_read_spin_lock);
4681 		lck_rw_destroy(&lck->rw_lock, &cl_mtx_grp);
4682 		kfree_type(cl_direct_read_lock_t, lck);
4683 	} else {
4684 		--lck->ref_count;
4685 		lck_spin_unlock(&cl_direct_read_spin_lock);
4686 	}
4687 }
4688 
4689 static int
cluster_read_direct(vnode_t vp,struct uio * uio,off_t filesize,int * read_type,u_int32_t * read_length,int flags,int (* callback)(buf_t,void *),void * callback_arg)4690 cluster_read_direct(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t *read_length,
4691     int flags, int (*callback)(buf_t, void *), void *callback_arg)
4692 {
4693 	upl_t            upl;
4694 	upl_page_info_t  *pl;
4695 	off_t            max_io_size;
4696 	vm_offset_t      upl_offset, vector_upl_offset = 0;
4697 	upl_size_t       upl_size, vector_upl_size = 0;
4698 	vm_size_t        upl_needed_size;
4699 	unsigned int     pages_in_pl;
4700 	upl_control_flags_t upl_flags;
4701 	kern_return_t    kret;
4702 	unsigned int     i;
4703 	int              force_data_sync;
4704 	int              retval = 0;
4705 	int              no_zero_fill = 0;
4706 	int              io_flag = 0;
4707 	int              misaligned = 0;
4708 	struct clios     iostate;
4709 	user_addr_t      iov_base;
4710 	u_int32_t        io_req_size;
4711 	u_int32_t        offset_in_file;
4712 	u_int32_t        offset_in_iovbase;
4713 	u_int32_t        io_size;
4714 	u_int32_t        io_min;
4715 	u_int32_t        xsize;
4716 	u_int32_t        devblocksize;
4717 	u_int32_t        mem_alignment_mask;
4718 	u_int32_t        max_upl_size;
4719 	u_int32_t        max_rd_size;
4720 	u_int32_t        max_rd_ahead;
4721 	u_int32_t        max_vector_size;
4722 	boolean_t        io_throttled = FALSE;
4723 
4724 	u_int32_t        vector_upl_iosize = 0;
4725 	int              issueVectorUPL = 0, useVectorUPL = (uio->uio_iovcnt > 1);
4726 	off_t            v_upl_uio_offset = 0;
4727 	int              vector_upl_index = 0;
4728 	upl_t            vector_upl = NULL;
4729 	cl_direct_read_lock_t *lock = NULL;
4730 
4731 	assert(vm_map_page_shift(current_map()) >= PAGE_SHIFT);
4732 
4733 	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_START,
4734 	    (int)uio->uio_offset, (int)filesize, *read_type, *read_length, 0);
4735 
4736 	max_upl_size = cluster_max_io_size(vp->v_mount, CL_READ);
4737 
4738 	max_rd_size = max_upl_size;
4739 	max_rd_ahead = max_rd_size * IO_SCALE(vp, 2);
4740 
4741 	io_flag = CL_COMMIT | CL_READ | CL_ASYNC | CL_NOZERO | CL_DIRECT_IO;
4742 
4743 	if (flags & IO_PASSIVE) {
4744 		io_flag |= CL_PASSIVE;
4745 	}
4746 
4747 	if (flags & IO_ENCRYPTED) {
4748 		io_flag |= CL_RAW_ENCRYPTED;
4749 	}
4750 
4751 	if (flags & IO_NOCACHE) {
4752 		io_flag |= CL_NOCACHE;
4753 	}
4754 
4755 	if (flags & IO_SKIP_ENCRYPTION) {
4756 		io_flag |= CL_ENCRYPTED;
4757 	}
4758 
4759 	iostate.io_completed = 0;
4760 	iostate.io_issued = 0;
4761 	iostate.io_error = 0;
4762 	iostate.io_wanted = 0;
4763 
4764 	lck_mtx_init(&iostate.io_mtxp, &cl_mtx_grp, LCK_ATTR_NULL);
4765 
4766 	devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize;
4767 	mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask;
4768 
4769 	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_NONE,
4770 	    (int)devblocksize, (int)mem_alignment_mask, 0, 0, 0);
4771 
4772 	if (devblocksize == 1) {
4773 		/*
4774 		 * the AFP client advertises a devblocksize of 1
4775 		 * however, its BLOCKMAP routine maps to physical
4776 		 * blocks that are PAGE_SIZE in size...
4777 		 * therefore we can't ask for I/Os that aren't page aligned
4778 		 * or aren't multiples of PAGE_SIZE in size
4779 		 * by setting devblocksize to PAGE_SIZE, we re-instate
4780 		 * the old behavior we had before the mem_alignment_mask
4781 		 * changes went in...
4782 		 */
4783 		devblocksize = PAGE_SIZE;
4784 	}
4785 
4786 	/*
4787 	 * We are going to need this uio for the prefaulting later
4788 	 * especially for the cases where multiple non-contiguous
4789 	 * iovs are passed into this routine.
4790 	 */
4791 	uio_t uio_acct = uio_duplicate(uio);
4792 
4793 next_dread:
4794 	io_req_size = *read_length;
4795 	iov_base = uio_curriovbase(uio);
4796 
4797 	offset_in_file = (u_int32_t)uio->uio_offset & (devblocksize - 1);
4798 	offset_in_iovbase = (u_int32_t)iov_base & mem_alignment_mask;
4799 
4800 	if (vm_map_page_mask(current_map()) < PAGE_MASK) {
4801 		/*
4802 		 * XXX TODO4K
4803 		 * Direct I/O might not work as expected from a 16k kernel space
4804 		 * to a 4k user space because each 4k chunk might point to
4805 		 * a different 16k physical page...
4806 		 * Let's go the "misaligned" way.
4807 		 */
4808 		if (!misaligned) {
4809 			DEBUG4K_VFS("forcing misaligned\n");
4810 		}
4811 		misaligned = 1;
4812 	}
4813 
4814 	if (offset_in_file || offset_in_iovbase) {
4815 		/*
4816 		 * one of the 2 important offsets is misaligned
4817 		 * so fire an I/O through the cache for this entire vector
4818 		 */
4819 		misaligned = 1;
4820 	}
4821 	if (iov_base & (devblocksize - 1)) {
4822 		/*
4823 		 * the offset in memory must be on a device block boundary
4824 		 * so that we can guarantee that we can generate an
4825 		 * I/O that ends on a page boundary in cluster_io
4826 		 */
4827 		misaligned = 1;
4828 	}
4829 
4830 	max_io_size = filesize - uio->uio_offset;
4831 
4832 	/*
4833 	 * The user must request IO in aligned chunks.  If the
4834 	 * offset into the file is bad, or the userland pointer
4835 	 * is non-aligned, then we cannot service the encrypted IO request.
4836 	 */
4837 	if (flags & IO_ENCRYPTED) {
4838 		if (misaligned || (io_req_size & (devblocksize - 1))) {
4839 			retval = EINVAL;
4840 		}
4841 
4842 		max_io_size = roundup(max_io_size, devblocksize);
4843 	}
4844 
4845 	if ((off_t)io_req_size > max_io_size) {
4846 		io_req_size = (u_int32_t)max_io_size;
4847 	}
4848 
4849 	/*
4850 	 * When we get to this point, we know...
4851 	 *  -- the offset into the file is on a devblocksize boundary
4852 	 */
4853 
4854 	while (io_req_size && retval == 0) {
4855 		u_int32_t io_start;
4856 
4857 		if (cluster_is_throttled(vp)) {
4858 			/*
4859 			 * we're in the throttle window, at the very least
4860 			 * we want to limit the size of the I/O we're about
4861 			 * to issue
4862 			 */
4863 			max_rd_size  = THROTTLE_MAX_IOSIZE;
4864 			max_rd_ahead = THROTTLE_MAX_IOSIZE - 1;
4865 			max_vector_size = THROTTLE_MAX_IOSIZE;
4866 		} else {
4867 			max_rd_size  = max_upl_size;
4868 			max_rd_ahead = max_rd_size * IO_SCALE(vp, 2);
4869 			max_vector_size = MAX_VECTOR_UPL_SIZE;
4870 		}
4871 		io_start = io_size = io_req_size;
4872 
4873 		/*
4874 		 * First look for pages already in the cache
4875 		 * and move them to user space.  But only do this
4876 		 * check if we are not retrieving encrypted data directly
4877 		 * from the filesystem;  those blocks should never
4878 		 * be in the UBC.
4879 		 *
4880 		 * cluster_copy_ubc_data returns the resid
4881 		 * in io_size
4882 		 */
4883 		if ((flags & IO_ENCRYPTED) == 0) {
4884 			retval = cluster_copy_ubc_data_internal(vp, uio, (int *)&io_size, 0, 0);
4885 		}
4886 		/*
4887 		 * calculate the number of bytes actually copied
4888 		 * starting size - residual
4889 		 */
4890 		xsize = io_start - io_size;
4891 
4892 		io_req_size -= xsize;
4893 
4894 		if (useVectorUPL && (xsize || (iov_base & PAGE_MASK))) {
4895 			/*
4896 			 * We found something in the cache or we have an iov_base that's not
4897 			 * page-aligned.
4898 			 *
4899 			 * Issue all I/O's that have been collected within this Vectored UPL.
4900 			 */
4901 			if (vector_upl_index) {
4902 				retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
4903 				reset_vector_run_state();
4904 			}
4905 
4906 			if (xsize) {
4907 				useVectorUPL = 0;
4908 			}
4909 
4910 			/*
4911 			 * After this point, if we are using the Vector UPL path and the base is
4912 			 * not page-aligned then the UPL with that base will be the first in the vector UPL.
4913 			 */
4914 		}
4915 
4916 		/*
4917 		 * check to see if we are finished with this request.
4918 		 *
4919 		 * If we satisfied this IO already, then io_req_size will be 0.
4920 		 * Otherwise, see if the IO was mis-aligned and needs to go through
4921 		 * the UBC to deal with the 'tail'.
4922 		 *
4923 		 */
4924 		if (io_req_size == 0 || (misaligned)) {
4925 			/*
4926 			 * see if there's another uio vector to
4927 			 * process that's of type IO_DIRECT
4928 			 *
4929 			 * break out of while loop to get there
4930 			 */
4931 			break;
4932 		}
4933 		/*
4934 		 * assume the request ends on a device block boundary
4935 		 */
4936 		io_min = devblocksize;
4937 
4938 		/*
4939 		 * we can handle I/O's in multiples of the device block size
4940 		 * however, if io_size isn't a multiple of devblocksize we
4941 		 * want to clip it back to the nearest page boundary since
4942 		 * we are going to have to go through cluster_read_copy to
4943 		 * deal with the 'overhang'... by clipping it to a PAGE_SIZE
4944 		 * multiple, we avoid asking the drive for the same physical
4945 		 * blocks twice.. once for the partial page at the end of the
4946 		 * request and a 2nd time for the page we read into the cache
4947 		 * (which overlaps the end of the direct read) in order to
4948 		 * get at the overhang bytes
4949 		 */
4950 		if (io_size & (devblocksize - 1)) {
4951 			assert(!(flags & IO_ENCRYPTED));
4952 			/*
4953 			 * Clip the request to the previous page size boundary
4954 			 * since request does NOT end on a device block boundary
4955 			 */
4956 			io_size &= ~PAGE_MASK;
4957 			io_min = PAGE_SIZE;
4958 		}
4959 		if (retval || io_size < io_min) {
4960 			/*
4961 			 * either an error or we only have the tail left to
4962 			 * complete via the copy path...
4963 			 * we may have already spun some portion of this request
4964 			 * off as async requests... we need to wait for the I/O
4965 			 * to complete before returning
4966 			 */
4967 			goto wait_for_dreads;
4968 		}
4969 
4970 		/*
4971 		 * Don't re-check the UBC data if we are looking for uncached IO
4972 		 * or asking for encrypted blocks.
4973 		 */
4974 		if ((flags & IO_ENCRYPTED) == 0) {
4975 			if ((xsize = io_size) > max_rd_size) {
4976 				xsize = max_rd_size;
4977 			}
4978 
4979 			io_size = 0;
4980 
4981 			if (!lock) {
4982 				/*
4983 				 * We hold a lock here between the time we check the
4984 				 * cache and the time we issue I/O.  This saves us
4985 				 * from having to lock the pages in the cache.  Not
4986 				 * all clients will care about this lock but some
4987 				 * clients may want to guarantee stability between
4988 				 * here and when the I/O is issued in which case they
4989 				 * will take the lock exclusively.
4990 				 */
4991 				lock = cluster_lock_direct_read(vp, LCK_RW_TYPE_SHARED);
4992 			}
4993 
4994 			ubc_range_op(vp, uio->uio_offset, uio->uio_offset + xsize, UPL_ROP_ABSENT, (int *)&io_size);
4995 
4996 			if (io_size == 0) {
4997 				/*
4998 				 * a page must have just come into the cache
4999 				 * since the first page in this range is no
5000 				 * longer absent, go back and re-evaluate
5001 				 */
5002 				continue;
5003 			}
5004 		}
5005 		if ((flags & IO_RETURN_ON_THROTTLE)) {
5006 			if (cluster_is_throttled(vp) == THROTTLE_NOW) {
5007 				if (!cluster_io_present_in_BC(vp, uio->uio_offset)) {
5008 					/*
5009 					 * we're in the throttle window and at least 1 I/O
5010 					 * has already been issued by a throttleable thread
5011 					 * in this window, so return with EAGAIN to indicate
5012 					 * to the FS issuing the cluster_read call that it
5013 					 * should now throttle after dropping any locks
5014 					 */
5015 					throttle_info_update_by_mount(vp->v_mount);
5016 
5017 					io_throttled = TRUE;
5018 					goto wait_for_dreads;
5019 				}
5020 			}
5021 		}
5022 		if (io_size > max_rd_size) {
5023 			io_size = max_rd_size;
5024 		}
5025 
5026 		iov_base = uio_curriovbase(uio);
5027 
5028 		upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK);
5029 		upl_needed_size = (upl_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
5030 
5031 		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_START,
5032 		    (int)upl_offset, upl_needed_size, (int)iov_base, io_size, 0);
5033 
5034 		if (upl_offset == 0 && ((io_size & PAGE_MASK) == 0)) {
5035 			no_zero_fill = 1;
5036 		} else {
5037 			no_zero_fill = 0;
5038 		}
5039 
5040 		vm_map_t map = UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ? current_map() : kernel_map;
5041 		for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) {
5042 			pages_in_pl = 0;
5043 			upl_size = (upl_size_t)upl_needed_size;
5044 			upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
5045 			if (no_zero_fill) {
5046 				upl_flags |= UPL_NOZEROFILL;
5047 			}
5048 			if (force_data_sync) {
5049 				upl_flags |= UPL_FORCE_DATA_SYNC;
5050 			}
5051 
5052 			kret = vm_map_create_upl(map,
5053 			    (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
5054 			    &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, VM_KERN_MEMORY_FILE);
5055 
5056 			if (kret != KERN_SUCCESS) {
5057 				KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
5058 				    (int)upl_offset, upl_size, io_size, kret, 0);
5059 				/*
5060 				 * failed to get pagelist
5061 				 *
5062 				 * we may have already spun some portion of this request
5063 				 * off as async requests... we need to wait for the I/O
5064 				 * to complete before returning
5065 				 */
5066 				goto wait_for_dreads;
5067 			}
5068 			pages_in_pl = upl_size / PAGE_SIZE;
5069 			pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
5070 
5071 			for (i = 0; i < pages_in_pl; i++) {
5072 				if (!upl_page_present(pl, i)) {
5073 					break;
5074 				}
5075 			}
5076 			if (i == pages_in_pl) {
5077 				break;
5078 			}
5079 
5080 			ubc_upl_abort(upl, 0);
5081 		}
5082 		if (force_data_sync >= 3) {
5083 			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
5084 			    (int)upl_offset, upl_size, io_size, kret, 0);
5085 
5086 			goto wait_for_dreads;
5087 		}
5088 		/*
5089 		 * Consider the possibility that upl_size wasn't satisfied.
5090 		 */
5091 		if (upl_size < upl_needed_size) {
5092 			if (upl_size && upl_offset == 0) {
5093 				io_size = upl_size;
5094 			} else {
5095 				io_size = 0;
5096 			}
5097 		}
5098 		if (io_size == 0) {
5099 			ubc_upl_abort(upl, 0);
5100 			goto wait_for_dreads;
5101 		}
5102 		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
5103 		    (int)upl_offset, upl_size, io_size, kret, 0);
5104 
5105 		if (useVectorUPL) {
5106 			vm_offset_t end_off = ((iov_base + io_size) & PAGE_MASK);
5107 			if (end_off) {
5108 				issueVectorUPL = 1;
5109 			}
5110 			/*
5111 			 * After this point, if we are using a vector UPL, then
5112 			 * either all the UPL elements end on a page boundary OR
5113 			 * this UPL is the last element because it does not end
5114 			 * on a page boundary.
5115 			 */
5116 		}
5117 
5118 		/*
5119 		 * request asynchronously so that we can overlap
5120 		 * the preparation of the next I/O
5121 		 * if there are already too many outstanding reads
5122 		 * wait until some have completed before issuing the next read
5123 		 */
5124 		cluster_iostate_wait(&iostate, max_rd_ahead, "cluster_read_direct");
5125 
5126 		if (iostate.io_error) {
5127 			/*
5128 			 * one of the earlier reads we issued ran into a hard error
5129 			 * don't issue any more reads, cleanup the UPL
5130 			 * that was just created but not used, then
5131 			 * go wait for any other reads to complete before
5132 			 * returning the error to the caller
5133 			 */
5134 			ubc_upl_abort(upl, 0);
5135 
5136 			goto wait_for_dreads;
5137 		}
5138 		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_START,
5139 		    upl, (int)upl_offset, (int)uio->uio_offset, io_size, 0);
5140 
5141 		if (!useVectorUPL) {
5142 			if (no_zero_fill) {
5143 				io_flag &= ~CL_PRESERVE;
5144 			} else {
5145 				io_flag |= CL_PRESERVE;
5146 			}
5147 
5148 			retval = cluster_io(vp, upl, upl_offset, uio->uio_offset, io_size, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
5149 		} else {
5150 			if (!vector_upl_index) {
5151 				vector_upl = vector_upl_create(upl_offset);
5152 				v_upl_uio_offset = uio->uio_offset;
5153 				vector_upl_offset = upl_offset;
5154 			}
5155 
5156 			vector_upl_set_subupl(vector_upl, upl, upl_size);
5157 			vector_upl_set_iostate(vector_upl, upl, vector_upl_size, upl_size);
5158 			vector_upl_index++;
5159 			vector_upl_size += upl_size;
5160 			vector_upl_iosize += io_size;
5161 
5162 			if (issueVectorUPL || vector_upl_index == MAX_VECTOR_UPL_ELEMENTS || vector_upl_size >= max_vector_size) {
5163 				retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
5164 				reset_vector_run_state();
5165 			}
5166 		}
5167 
5168 		if (lock) {
5169 			// We don't need to wait for the I/O to complete
5170 			cluster_unlock_direct_read(lock);
5171 			lock = NULL;
5172 		}
5173 
5174 		/*
5175 		 * update the uio structure
5176 		 */
5177 		if ((flags & IO_ENCRYPTED) && (max_io_size < io_size)) {
5178 			uio_update(uio, (user_size_t)max_io_size);
5179 		} else {
5180 			uio_update(uio, (user_size_t)io_size);
5181 		}
5182 
5183 		io_req_size -= io_size;
5184 
5185 		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_END,
5186 		    upl, (int)uio->uio_offset, io_req_size, retval, 0);
5187 	} /* end while */
5188 
5189 	if (retval == 0 && iostate.io_error == 0 && io_req_size == 0 && uio->uio_offset < filesize) {
5190 		retval = cluster_io_type(uio, read_type, read_length, 0);
5191 
5192 		if (retval == 0 && *read_type == IO_DIRECT) {
5193 			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_NONE,
5194 			    (int)uio->uio_offset, (int)filesize, *read_type, *read_length, 0);
5195 
5196 			goto next_dread;
5197 		}
5198 	}
5199 
5200 wait_for_dreads:
5201 
5202 	if (retval == 0 && iostate.io_error == 0 && useVectorUPL && vector_upl_index) {
5203 		retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
5204 		reset_vector_run_state();
5205 	}
5206 
5207 	// We don't need to wait for the I/O to complete
5208 	if (lock) {
5209 		cluster_unlock_direct_read(lock);
5210 	}
5211 
5212 	/*
5213 	 * make sure all async reads that are part of this stream
5214 	 * have completed before we return
5215 	 */
5216 	cluster_iostate_wait(&iostate, 0, "cluster_read_direct");
5217 
5218 	if (iostate.io_error) {
5219 		retval = iostate.io_error;
5220 	}
5221 
5222 	lck_mtx_destroy(&iostate.io_mtxp, &cl_mtx_grp);
5223 
5224 	if (io_throttled == TRUE && retval == 0) {
5225 		retval = EAGAIN;
5226 	}
5227 
5228 	vm_map_offset_t current_page_size, current_page_mask;
5229 	current_page_size = vm_map_page_size(current_map());
5230 	current_page_mask = vm_map_page_mask(current_map());
5231 	if (uio_acct) {
5232 		off_t bytes_to_prefault = 0, bytes_prefaulted = 0;
5233 		user_addr_t curr_iov_base = 0;
5234 		user_addr_t curr_iov_end = 0;
5235 		user_size_t curr_iov_len = 0;
5236 
5237 		bytes_to_prefault = uio_offset(uio) - uio_offset(uio_acct);
5238 
5239 		for (; bytes_prefaulted < bytes_to_prefault;) {
5240 			curr_iov_base = uio_curriovbase(uio_acct);
5241 			curr_iov_len = MIN(uio_curriovlen(uio_acct), bytes_to_prefault - bytes_prefaulted);
5242 			curr_iov_end = curr_iov_base + curr_iov_len;
5243 
5244 			for (; curr_iov_base < curr_iov_end;) {
5245 				/*
5246 				 * This is specifically done for pmap accounting purposes.
5247 				 * vm_pre_fault() will call vm_fault() to enter the page into
5248 				 * the pmap if there isn't _a_ physical page for that VA already.
5249 				 */
5250 				vm_pre_fault(vm_map_trunc_page(curr_iov_base, current_page_mask), VM_PROT_READ);
5251 				curr_iov_base += current_page_size;
5252 				bytes_prefaulted += current_page_size;
5253 			}
5254 			/*
5255 			 * Use update instead of advance so we can see how many iovs we processed.
5256 			 */
5257 			uio_update(uio_acct, curr_iov_len);
5258 		}
5259 		uio_free(uio_acct);
5260 		uio_acct = NULL;
5261 	}
5262 
5263 	if (io_req_size && retval == 0) {
5264 		/*
5265 		 * we couldn't handle the tail of this request in DIRECT mode
5266 		 * so fire it through the copy path
5267 		 */
5268 		if (flags & IO_ENCRYPTED) {
5269 			/*
5270 			 * We cannot fall back to the copy path for encrypted I/O. If this
5271 			 * happens, there is something wrong with the user buffer passed
5272 			 * down.
5273 			 */
5274 			retval = EFAULT;
5275 		} else {
5276 			retval = cluster_read_copy(vp, uio, io_req_size, filesize, flags, callback, callback_arg);
5277 		}
5278 
5279 		*read_type = IO_UNKNOWN;
5280 	}
5281 	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_END,
5282 	    (int)uio->uio_offset, (int)uio_resid(uio), io_req_size, retval, 0);
5283 
5284 	return retval;
5285 }
5286 
5287 
5288 static int
cluster_read_contig(vnode_t vp,struct uio * uio,off_t filesize,int * read_type,u_int32_t * read_length,int (* callback)(buf_t,void *),void * callback_arg,int flags)5289 cluster_read_contig(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t *read_length,
5290     int (*callback)(buf_t, void *), void *callback_arg, int flags)
5291 {
5292 	upl_page_info_t *pl;
5293 	upl_t            upl[MAX_VECTS];
5294 	vm_offset_t      upl_offset;
5295 	addr64_t         dst_paddr = 0;
5296 	user_addr_t      iov_base;
5297 	off_t            max_size;
5298 	upl_size_t       upl_size;
5299 	vm_size_t        upl_needed_size;
5300 	mach_msg_type_number_t  pages_in_pl;
5301 	upl_control_flags_t upl_flags;
5302 	kern_return_t    kret;
5303 	struct clios     iostate;
5304 	int              error = 0;
5305 	int              cur_upl = 0;
5306 	int              num_upl = 0;
5307 	int              n;
5308 	u_int32_t        xsize;
5309 	u_int32_t        io_size;
5310 	u_int32_t        devblocksize;
5311 	u_int32_t        mem_alignment_mask;
5312 	u_int32_t        tail_size = 0;
5313 	int              bflag;
5314 
5315 	if (flags & IO_PASSIVE) {
5316 		bflag = CL_PASSIVE;
5317 	} else {
5318 		bflag = 0;
5319 	}
5320 
5321 	if (flags & IO_NOCACHE) {
5322 		bflag |= CL_NOCACHE;
5323 	}
5324 
5325 	/*
5326 	 * When we enter this routine, we know
5327 	 *  -- the read_length will not exceed the current iov_len
5328 	 *  -- the target address is physically contiguous for read_length
5329 	 */
5330 	cluster_syncup(vp, filesize, callback, callback_arg, PUSH_SYNC);
5331 
5332 	devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize;
5333 	mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask;
5334 
5335 	iostate.io_completed = 0;
5336 	iostate.io_issued = 0;
5337 	iostate.io_error = 0;
5338 	iostate.io_wanted = 0;
5339 
5340 	lck_mtx_init(&iostate.io_mtxp, &cl_mtx_grp, LCK_ATTR_NULL);
5341 
5342 next_cread:
5343 	io_size = *read_length;
5344 
5345 	max_size = filesize - uio->uio_offset;
5346 
5347 	if (io_size > max_size) {
5348 		io_size = (u_int32_t)max_size;
5349 	}
5350 
5351 	iov_base = uio_curriovbase(uio);
5352 
5353 	upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK);
5354 	upl_needed_size = upl_offset + io_size;
5355 
5356 	pages_in_pl = 0;
5357 	upl_size = (upl_size_t)upl_needed_size;
5358 	upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
5359 
5360 
5361 	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 92)) | DBG_FUNC_START,
5362 	    (int)upl_offset, (int)upl_size, (int)iov_base, io_size, 0);
5363 
5364 	vm_map_t map = UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ? current_map() : kernel_map;
5365 	kret = vm_map_get_upl(map,
5366 	    vm_map_trunc_page(iov_base, vm_map_page_mask(map)),
5367 	    &upl_size, &upl[cur_upl], NULL, &pages_in_pl, &upl_flags, VM_KERN_MEMORY_FILE, 0);
5368 
5369 	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 92)) | DBG_FUNC_END,
5370 	    (int)upl_offset, upl_size, io_size, kret, 0);
5371 
5372 	if (kret != KERN_SUCCESS) {
5373 		/*
5374 		 * failed to get pagelist
5375 		 */
5376 		error = EINVAL;
5377 		goto wait_for_creads;
5378 	}
5379 	num_upl++;
5380 
5381 	if (upl_size < upl_needed_size) {
5382 		/*
5383 		 * The upl_size wasn't satisfied.
5384 		 */
5385 		error = EINVAL;
5386 		goto wait_for_creads;
5387 	}
5388 	pl = ubc_upl_pageinfo(upl[cur_upl]);
5389 
5390 	dst_paddr = ((addr64_t)upl_phys_page(pl, 0) << PAGE_SHIFT) + (addr64_t)upl_offset;
5391 
5392 	while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {
5393 		u_int32_t   head_size;
5394 
5395 		head_size = devblocksize - (u_int32_t)(uio->uio_offset & (devblocksize - 1));
5396 
5397 		if (head_size > io_size) {
5398 			head_size = io_size;
5399 		}
5400 
5401 		error = cluster_align_phys_io(vp, uio, dst_paddr, head_size, CL_READ, callback, callback_arg);
5402 
5403 		if (error) {
5404 			goto wait_for_creads;
5405 		}
5406 
5407 		upl_offset += head_size;
5408 		dst_paddr  += head_size;
5409 		io_size    -= head_size;
5410 
5411 		iov_base   += head_size;
5412 	}
5413 	if ((u_int32_t)iov_base & mem_alignment_mask) {
5414 		/*
5415 		 * request doesn't set up on a memory boundary
5416 		 * the underlying DMA engine can handle...
5417 		 * return an error instead of going through
5418 		 * the slow copy path since the intent of this
5419 		 * path is direct I/O to device memory
5420 		 */
5421 		error = EINVAL;
5422 		goto wait_for_creads;
5423 	}
5424 
5425 	tail_size = io_size & (devblocksize - 1);
5426 
5427 	io_size  -= tail_size;
5428 
5429 	while (io_size && error == 0) {
5430 		if (io_size > MAX_IO_CONTIG_SIZE) {
5431 			xsize = MAX_IO_CONTIG_SIZE;
5432 		} else {
5433 			xsize = io_size;
5434 		}
5435 		/*
5436 		 * request asynchronously so that we can overlap
5437 		 * the preparation of the next I/O... we'll do
5438 		 * the commit after all the I/O has completed
5439 		 * since its all issued against the same UPL
5440 		 * if there are already too many outstanding reads
5441 		 * wait until some have completed before issuing the next
5442 		 */
5443 		cluster_iostate_wait(&iostate, MAX_IO_CONTIG_SIZE * IO_SCALE(vp, 2), "cluster_read_contig");
5444 
5445 		if (iostate.io_error) {
5446 			/*
5447 			 * one of the earlier reads we issued ran into a hard error
5448 			 * don't issue any more reads...
5449 			 * go wait for any other reads to complete before
5450 			 * returning the error to the caller
5451 			 */
5452 			goto wait_for_creads;
5453 		}
5454 		error = cluster_io(vp, upl[cur_upl], upl_offset, uio->uio_offset, xsize,
5455 		    CL_READ | CL_NOZERO | CL_DEV_MEMORY | CL_ASYNC | bflag,
5456 		    (buf_t)NULL, &iostate, callback, callback_arg);
5457 		/*
5458 		 * The cluster_io read was issued successfully,
5459 		 * update the uio structure
5460 		 */
5461 		if (error == 0) {
5462 			uio_update(uio, (user_size_t)xsize);
5463 
5464 			dst_paddr  += xsize;
5465 			upl_offset += xsize;
5466 			io_size    -= xsize;
5467 		}
5468 	}
5469 	if (error == 0 && iostate.io_error == 0 && tail_size == 0 && num_upl < MAX_VECTS && uio->uio_offset < filesize) {
5470 		error = cluster_io_type(uio, read_type, read_length, 0);
5471 
5472 		if (error == 0 && *read_type == IO_CONTIG) {
5473 			cur_upl++;
5474 			goto next_cread;
5475 		}
5476 	} else {
5477 		*read_type = IO_UNKNOWN;
5478 	}
5479 
5480 wait_for_creads:
5481 	/*
5482 	 * make sure all async reads that are part of this stream
5483 	 * have completed before we proceed
5484 	 */
5485 	cluster_iostate_wait(&iostate, 0, "cluster_read_contig");
5486 
5487 	if (iostate.io_error) {
5488 		error = iostate.io_error;
5489 	}
5490 
5491 	lck_mtx_destroy(&iostate.io_mtxp, &cl_mtx_grp);
5492 
5493 	if (error == 0 && tail_size) {
5494 		error = cluster_align_phys_io(vp, uio, dst_paddr, tail_size, CL_READ, callback, callback_arg);
5495 	}
5496 
5497 	for (n = 0; n < num_upl; n++) {
5498 		/*
5499 		 * just release our hold on each physically contiguous
5500 		 * region without changing any state
5501 		 */
5502 		ubc_upl_abort(upl[n], 0);
5503 	}
5504 
5505 	return error;
5506 }
5507 
5508 
5509 static int
cluster_io_type(struct uio * uio,int * io_type,u_int32_t * io_length,u_int32_t min_length)5510 cluster_io_type(struct uio *uio, int *io_type, u_int32_t *io_length, u_int32_t min_length)
5511 {
5512 	user_size_t      iov_len;
5513 	user_addr_t      iov_base = 0;
5514 	upl_t            upl;
5515 	upl_size_t       upl_size;
5516 	upl_control_flags_t upl_flags;
5517 	int              retval = 0;
5518 
5519 	/*
5520 	 * skip over any emtpy vectors
5521 	 */
5522 	uio_update(uio, (user_size_t)0);
5523 
5524 	iov_len = uio_curriovlen(uio);
5525 
5526 	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 94)) | DBG_FUNC_START, uio, (int)iov_len, 0, 0, 0);
5527 
5528 	if (iov_len) {
5529 		iov_base = uio_curriovbase(uio);
5530 		/*
5531 		 * make sure the size of the vector isn't too big...
5532 		 * internally, we want to handle all of the I/O in
5533 		 * chunk sizes that fit in a 32 bit int
5534 		 */
5535 		if (iov_len > (user_size_t)MAX_IO_REQUEST_SIZE) {
5536 			upl_size = MAX_IO_REQUEST_SIZE;
5537 		} else {
5538 			upl_size = (u_int32_t)iov_len;
5539 		}
5540 
5541 		upl_flags = UPL_QUERY_OBJECT_TYPE;
5542 
5543 		vm_map_t map = UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ? current_map() : kernel_map;
5544 		if ((vm_map_get_upl(map,
5545 		    vm_map_trunc_page(iov_base, vm_map_page_mask(map)),
5546 		    &upl_size, &upl, NULL, NULL, &upl_flags, VM_KERN_MEMORY_FILE, 0)) != KERN_SUCCESS) {
5547 			/*
5548 			 * the user app must have passed in an invalid address
5549 			 */
5550 			retval = EFAULT;
5551 		}
5552 		if (upl_size == 0) {
5553 			retval = EFAULT;
5554 		}
5555 
5556 		*io_length = upl_size;
5557 
5558 		if (upl_flags & UPL_PHYS_CONTIG) {
5559 			*io_type = IO_CONTIG;
5560 		} else if (iov_len >= min_length) {
5561 			*io_type = IO_DIRECT;
5562 		} else {
5563 			*io_type = IO_COPY;
5564 		}
5565 	} else {
5566 		/*
5567 		 * nothing left to do for this uio
5568 		 */
5569 		*io_length = 0;
5570 		*io_type   = IO_UNKNOWN;
5571 	}
5572 	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 94)) | DBG_FUNC_END, iov_base, *io_type, *io_length, retval, 0);
5573 
5574 	if (*io_type == IO_DIRECT &&
5575 	    vm_map_page_shift(current_map()) < PAGE_SHIFT) {
5576 		/* no direct I/O for sub-page-size address spaces */
5577 		DEBUG4K_VFS("io_type IO_DIRECT -> IO_COPY\n");
5578 		*io_type = IO_COPY;
5579 	}
5580 
5581 	return retval;
5582 }
5583 
5584 
5585 /*
5586  * generate advisory I/O's in the largest chunks possible
5587  * the completed pages will be released into the VM cache
5588  */
5589 int
advisory_read(vnode_t vp,off_t filesize,off_t f_offset,int resid)5590 advisory_read(vnode_t vp, off_t filesize, off_t f_offset, int resid)
5591 {
5592 	return advisory_read_ext(vp, filesize, f_offset, resid, NULL, NULL, CL_PASSIVE);
5593 }
5594 
5595 int
advisory_read_ext(vnode_t vp,off_t filesize,off_t f_offset,int resid,int (* callback)(buf_t,void *),void * callback_arg,int bflag)5596 advisory_read_ext(vnode_t vp, off_t filesize, off_t f_offset, int resid, int (*callback)(buf_t, void *), void *callback_arg, int bflag)
5597 {
5598 	upl_page_info_t *pl;
5599 	upl_t            upl;
5600 	vm_offset_t      upl_offset;
5601 	int              upl_size;
5602 	off_t            upl_f_offset;
5603 	int              start_offset;
5604 	int              start_pg;
5605 	int              last_pg;
5606 	int              pages_in_upl;
5607 	off_t            max_size;
5608 	int              io_size;
5609 	kern_return_t    kret;
5610 	int              retval = 0;
5611 	int              issued_io;
5612 	int              skip_range;
5613 	uint32_t         max_io_size;
5614 
5615 
5616 	if (!UBCINFOEXISTS(vp)) {
5617 		return EINVAL;
5618 	}
5619 
5620 	if (f_offset < 0 || resid < 0) {
5621 		return EINVAL;
5622 	}
5623 
5624 	max_io_size = cluster_max_io_size(vp->v_mount, CL_READ);
5625 
5626 	if (disk_conditioner_mount_is_ssd(vp->v_mount)) {
5627 		if (max_io_size > speculative_prefetch_max_iosize) {
5628 			max_io_size = speculative_prefetch_max_iosize;
5629 		}
5630 	}
5631 
5632 	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_START,
5633 	    (int)f_offset, resid, (int)filesize, 0, 0);
5634 
5635 	while (resid && f_offset < filesize && retval == 0) {
5636 		/*
5637 		 * compute the size of the upl needed to encompass
5638 		 * the requested read... limit each call to cluster_io
5639 		 * to the maximum UPL size... cluster_io will clip if
5640 		 * this exceeds the maximum io_size for the device,
5641 		 * make sure to account for
5642 		 * a starting offset that's not page aligned
5643 		 */
5644 		start_offset = (int)(f_offset & PAGE_MASK_64);
5645 		upl_f_offset = f_offset - (off_t)start_offset;
5646 		max_size     = filesize - f_offset;
5647 
5648 		if (resid < max_size) {
5649 			io_size = resid;
5650 		} else {
5651 			io_size = (int)max_size;
5652 		}
5653 
5654 		upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
5655 		if ((uint32_t)upl_size > max_io_size) {
5656 			upl_size = max_io_size;
5657 		}
5658 
5659 		skip_range = 0;
5660 		/*
5661 		 * return the number of contiguously present pages in the cache
5662 		 * starting at upl_f_offset within the file
5663 		 */
5664 		ubc_range_op(vp, upl_f_offset, upl_f_offset + upl_size, UPL_ROP_PRESENT, &skip_range);
5665 
5666 		if (skip_range) {
5667 			/*
5668 			 * skip over pages already present in the cache
5669 			 */
5670 			io_size = skip_range - start_offset;
5671 
5672 			f_offset += io_size;
5673 			resid    -= io_size;
5674 
5675 			if (skip_range == upl_size) {
5676 				continue;
5677 			}
5678 			/*
5679 			 * have to issue some real I/O
5680 			 * at this point, we know it's starting on a page boundary
5681 			 * because we've skipped over at least the first page in the request
5682 			 */
5683 			start_offset = 0;
5684 			upl_f_offset += skip_range;
5685 			upl_size     -= skip_range;
5686 		}
5687 		pages_in_upl = upl_size / PAGE_SIZE;
5688 
5689 		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) | DBG_FUNC_START,
5690 		    upl, (int)upl_f_offset, upl_size, start_offset, 0);
5691 
5692 		kret = ubc_create_upl_kernel(vp,
5693 		    upl_f_offset,
5694 		    upl_size,
5695 		    &upl,
5696 		    &pl,
5697 		    UPL_RET_ONLY_ABSENT | UPL_SET_LITE,
5698 		    VM_KERN_MEMORY_FILE);
5699 		if (kret != KERN_SUCCESS) {
5700 			return retval;
5701 		}
5702 		issued_io = 0;
5703 
5704 		/*
5705 		 * before we start marching forward, we must make sure we end on
5706 		 * a present page, otherwise we will be working with a freed
5707 		 * upl
5708 		 */
5709 		for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--) {
5710 			if (upl_page_present(pl, last_pg)) {
5711 				break;
5712 			}
5713 		}
5714 		pages_in_upl = last_pg + 1;
5715 
5716 
5717 		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) | DBG_FUNC_END,
5718 		    upl, (int)upl_f_offset, upl_size, start_offset, 0);
5719 
5720 
5721 		for (last_pg = 0; last_pg < pages_in_upl;) {
5722 			/*
5723 			 * scan from the beginning of the upl looking for the first
5724 			 * page that is present.... this will become the first page in
5725 			 * the request we're going to make to 'cluster_io'... if all
5726 			 * of the pages are absent, we won't call through to 'cluster_io'
5727 			 */
5728 			for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
5729 				if (upl_page_present(pl, start_pg)) {
5730 					break;
5731 				}
5732 			}
5733 
5734 			/*
5735 			 * scan from the starting present page looking for an absent
5736 			 * page before the end of the upl is reached, if we
5737 			 * find one, then it will terminate the range of pages being
5738 			 * presented to 'cluster_io'
5739 			 */
5740 			for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
5741 				if (!upl_page_present(pl, last_pg)) {
5742 					break;
5743 				}
5744 			}
5745 
5746 			if (last_pg > start_pg) {
5747 				/*
5748 				 * we found a range of pages that must be filled
5749 				 * if the last page in this range is the last page of the file
5750 				 * we may have to clip the size of it to keep from reading past
5751 				 * the end of the last physical block associated with the file
5752 				 */
5753 				upl_offset = start_pg * PAGE_SIZE;
5754 				io_size    = (last_pg - start_pg) * PAGE_SIZE;
5755 
5756 				if ((off_t)(upl_f_offset + upl_offset + io_size) > filesize) {
5757 					io_size = (int)(filesize - (upl_f_offset + upl_offset));
5758 				}
5759 
5760 				/*
5761 				 * issue an asynchronous read to cluster_io
5762 				 */
5763 				retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size,
5764 				    CL_ASYNC | CL_READ | CL_COMMIT | CL_AGE | bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
5765 
5766 				issued_io = 1;
5767 			}
5768 		}
5769 		if (issued_io == 0) {
5770 			ubc_upl_abort(upl, 0);
5771 		}
5772 
5773 		io_size = upl_size - start_offset;
5774 
5775 		if (io_size > resid) {
5776 			io_size = resid;
5777 		}
5778 		f_offset += io_size;
5779 		resid    -= io_size;
5780 	}
5781 
5782 	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_END,
5783 	    (int)f_offset, resid, retval, 0, 0);
5784 
5785 	return retval;
5786 }
5787 
5788 
5789 int
cluster_push(vnode_t vp,int flags)5790 cluster_push(vnode_t vp, int flags)
5791 {
5792 	return cluster_push_ext(vp, flags, NULL, NULL);
5793 }
5794 
5795 
5796 int
cluster_push_ext(vnode_t vp,int flags,int (* callback)(buf_t,void *),void * callback_arg)5797 cluster_push_ext(vnode_t vp, int flags, int (*callback)(buf_t, void *), void *callback_arg)
5798 {
5799 	return cluster_push_err(vp, flags, callback, callback_arg, NULL);
5800 }
5801 
5802 /* write errors via err, but return the number of clusters written */
5803 extern uint32_t system_inshutdown;
5804 uint32_t cl_sparse_push_error = 0;
5805 int
cluster_push_err(vnode_t vp,int flags,int (* callback)(buf_t,void *),void * callback_arg,int * err)5806 cluster_push_err(vnode_t vp, int flags, int (*callback)(buf_t, void *), void *callback_arg, int *err)
5807 {
5808 	int     retval;
5809 	int     my_sparse_wait = 0;
5810 	struct  cl_writebehind *wbp;
5811 	int     local_err = 0;
5812 
5813 	if (err) {
5814 		*err = 0;
5815 	}
5816 
5817 	if (!UBCINFOEXISTS(vp)) {
5818 		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, kdebug_vnode(vp), flags, 0, -1, 0);
5819 		return 0;
5820 	}
5821 	/* return if deferred write is set */
5822 	if (((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE) && (flags & IO_DEFWRITE)) {
5823 		return 0;
5824 	}
5825 	if ((wbp = cluster_get_wbp(vp, CLW_RETURNLOCKED)) == NULL) {
5826 		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, kdebug_vnode(vp), flags, 0, -2, 0);
5827 		return 0;
5828 	}
5829 	if (!ISSET(flags, IO_SYNC) && wbp->cl_number == 0 && wbp->cl_scmap == NULL) {
5830 		lck_mtx_unlock(&wbp->cl_lockw);
5831 
5832 		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, kdebug_vnode(vp), flags, 0, -3, 0);
5833 		return 0;
5834 	}
5835 	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_START,
5836 	    wbp->cl_scmap, wbp->cl_number, flags, 0, 0);
5837 
5838 	/*
5839 	 * if we have an fsync in progress, we don't want to allow any additional
5840 	 * sync/fsync/close(s) to occur until it finishes.
5841 	 * note that its possible for writes to continue to occur to this file
5842 	 * while we're waiting and also once the fsync starts to clean if we're
5843 	 * in the sparse map case
5844 	 */
5845 	while (wbp->cl_sparse_wait) {
5846 		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_START, kdebug_vnode(vp), 0, 0, 0, 0);
5847 
5848 		msleep((caddr_t)&wbp->cl_sparse_wait, &wbp->cl_lockw, PRIBIO + 1, "cluster_push_ext", NULL);
5849 
5850 		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_END, kdebug_vnode(vp), 0, 0, 0, 0);
5851 	}
5852 	if (flags & IO_SYNC) {
5853 		my_sparse_wait = 1;
5854 		wbp->cl_sparse_wait = 1;
5855 
5856 		/*
5857 		 * this is an fsync (or equivalent)... we must wait for any existing async
5858 		 * cleaning operations to complete before we evaulate the current state
5859 		 * and finish cleaning... this insures that all writes issued before this
5860 		 * fsync actually get cleaned to the disk before this fsync returns
5861 		 */
5862 		while (wbp->cl_sparse_pushes) {
5863 			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 98)) | DBG_FUNC_START, kdebug_vnode(vp), 0, 0, 0, 0);
5864 
5865 			msleep((caddr_t)&wbp->cl_sparse_pushes, &wbp->cl_lockw, PRIBIO + 1, "cluster_push_ext", NULL);
5866 
5867 			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 98)) | DBG_FUNC_END, kdebug_vnode(vp), 0, 0, 0, 0);
5868 		}
5869 	}
5870 	if (wbp->cl_scmap) {
5871 		void    *scmap;
5872 
5873 		if (wbp->cl_sparse_pushes < SPARSE_PUSH_LIMIT) {
5874 			scmap = wbp->cl_scmap;
5875 			wbp->cl_scmap = NULL;
5876 
5877 			wbp->cl_sparse_pushes++;
5878 
5879 			lck_mtx_unlock(&wbp->cl_lockw);
5880 
5881 			retval = sparse_cluster_push(wbp, &scmap, vp, ubc_getsize(vp), PUSH_ALL, flags, callback, callback_arg, FALSE);
5882 
5883 			lck_mtx_lock(&wbp->cl_lockw);
5884 
5885 			wbp->cl_sparse_pushes--;
5886 
5887 			if (retval) {
5888 				if (wbp->cl_scmap != NULL) {
5889 					/*
5890 					 * panic("cluster_push_err: Expected NULL cl_scmap\n");
5891 					 *
5892 					 * This can happen if we get an error from the underlying FS
5893 					 * e.g. ENOSPC, EPERM or EIO etc. We hope that these errors
5894 					 * are transient and the I/Os will succeed at a later point.
5895 					 *
5896 					 * The tricky part here is that a new sparse cluster has been
5897 					 * allocated and tracking a different set of dirty pages. So these
5898 					 * pages are not going to be pushed out with the next sparse_cluster_push.
5899 					 * An explicit msync or file close will, however, push the pages out.
5900 					 *
5901 					 * What if those calls still don't work? And so, during shutdown we keep
5902 					 * trying till we succeed...
5903 					 */
5904 
5905 					if (system_inshutdown) {
5906 						if ((retval == ENOSPC) && (vp->v_mount->mnt_flag & (MNT_LOCAL | MNT_REMOVABLE)) == MNT_LOCAL) {
5907 							os_atomic_inc(&cl_sparse_push_error, relaxed);
5908 						}
5909 					} else {
5910 						vfs_drt_control(&scmap, 0); /* emit stats and free this memory. Dirty pages stay intact. */
5911 						scmap = NULL;
5912 					}
5913 				} else {
5914 					wbp->cl_scmap = scmap;
5915 				}
5916 			}
5917 
5918 			if (wbp->cl_sparse_wait && wbp->cl_sparse_pushes == 0) {
5919 				wakeup((caddr_t)&wbp->cl_sparse_pushes);
5920 			}
5921 		} else {
5922 			retval = sparse_cluster_push(wbp, &(wbp->cl_scmap), vp, ubc_getsize(vp), PUSH_ALL, flags, callback, callback_arg, FALSE);
5923 		}
5924 
5925 		local_err = retval;
5926 
5927 		if (err) {
5928 			*err = retval;
5929 		}
5930 		retval = 1;
5931 	} else {
5932 		retval = cluster_try_push(wbp, vp, ubc_getsize(vp), PUSH_ALL, flags, callback, callback_arg, &local_err, FALSE);
5933 		if (err) {
5934 			*err = local_err;
5935 		}
5936 	}
5937 	lck_mtx_unlock(&wbp->cl_lockw);
5938 
5939 	if (flags & IO_SYNC) {
5940 		(void)vnode_waitforwrites(vp, 0, 0, 0, "cluster_push");
5941 	}
5942 
5943 	if (my_sparse_wait) {
5944 		/*
5945 		 * I'm the owner of the serialization token
5946 		 * clear it and wakeup anyone that is waiting
5947 		 * for me to finish
5948 		 */
5949 		lck_mtx_lock(&wbp->cl_lockw);
5950 
5951 		wbp->cl_sparse_wait = 0;
5952 		wakeup((caddr_t)&wbp->cl_sparse_wait);
5953 
5954 		lck_mtx_unlock(&wbp->cl_lockw);
5955 	}
5956 	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_END,
5957 	    wbp->cl_scmap, wbp->cl_number, retval, local_err, 0);
5958 
5959 	return retval;
5960 }
5961 
5962 
5963 __private_extern__ void
cluster_release(struct ubc_info * ubc)5964 cluster_release(struct ubc_info *ubc)
5965 {
5966 	struct cl_writebehind *wbp;
5967 	struct cl_readahead   *rap;
5968 
5969 	if ((wbp = ubc->cl_wbehind)) {
5970 		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_START, ubc, wbp->cl_scmap, 0, 0, 0);
5971 
5972 		if (wbp->cl_scmap) {
5973 			vfs_drt_control(&(wbp->cl_scmap), 0);
5974 		}
5975 		lck_mtx_destroy(&wbp->cl_lockw, &cl_mtx_grp);
5976 		zfree(cl_wr_zone, wbp);
5977 		ubc->cl_wbehind = NULL;
5978 	} else {
5979 		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_START, ubc, 0, 0, 0, 0);
5980 	}
5981 
5982 	if ((rap = ubc->cl_rahead)) {
5983 		lck_mtx_destroy(&rap->cl_lockr, &cl_mtx_grp);
5984 		zfree(cl_rd_zone, rap);
5985 		ubc->cl_rahead  = NULL;
5986 	}
5987 
5988 	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_END, ubc, rap, wbp, 0, 0);
5989 }
5990 
5991 
5992 static int
cluster_try_push(struct cl_writebehind * wbp,vnode_t vp,off_t EOF,int push_flag,int io_flags,int (* callback)(buf_t,void *),void * callback_arg,int * err,boolean_t vm_initiated)5993 cluster_try_push(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int push_flag, int io_flags, int (*callback)(buf_t, void *), void *callback_arg, int *err, boolean_t vm_initiated)
5994 {
5995 	int cl_index;
5996 	int cl_index1;
5997 	int min_index;
5998 	int cl_len;
5999 	int cl_pushed = 0;
6000 	struct cl_wextent l_clusters[MAX_CLUSTERS];
6001 	u_int  max_cluster_pgcount;
6002 	int error = 0;
6003 
6004 	max_cluster_pgcount = MAX_CLUSTER_SIZE(vp) / PAGE_SIZE;
6005 	/*
6006 	 * the write behind context exists and has
6007 	 * already been locked...
6008 	 */
6009 	if (wbp->cl_number == 0) {
6010 		/*
6011 		 * no clusters to push
6012 		 * return number of empty slots
6013 		 */
6014 		return MAX_CLUSTERS;
6015 	}
6016 
6017 	/*
6018 	 * make a local 'sorted' copy of the clusters
6019 	 * and clear wbp->cl_number so that new clusters can
6020 	 * be developed
6021 	 */
6022 	for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
6023 		for (min_index = -1, cl_index1 = 0; cl_index1 < wbp->cl_number; cl_index1++) {
6024 			if (wbp->cl_clusters[cl_index1].b_addr == wbp->cl_clusters[cl_index1].e_addr) {
6025 				continue;
6026 			}
6027 			if (min_index == -1) {
6028 				min_index = cl_index1;
6029 			} else if (wbp->cl_clusters[cl_index1].b_addr < wbp->cl_clusters[min_index].b_addr) {
6030 				min_index = cl_index1;
6031 			}
6032 		}
6033 		if (min_index == -1) {
6034 			break;
6035 		}
6036 
6037 		l_clusters[cl_index].b_addr = wbp->cl_clusters[min_index].b_addr;
6038 		l_clusters[cl_index].e_addr = wbp->cl_clusters[min_index].e_addr;
6039 		l_clusters[cl_index].io_flags = wbp->cl_clusters[min_index].io_flags;
6040 
6041 		wbp->cl_clusters[min_index].b_addr = wbp->cl_clusters[min_index].e_addr;
6042 	}
6043 	wbp->cl_number = 0;
6044 
6045 	cl_len = cl_index;
6046 
6047 	/* skip switching to the sparse cluster mechanism if on diskimage */
6048 	if (((push_flag & PUSH_DELAY) && cl_len == MAX_CLUSTERS) &&
6049 	    !(vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV)) {
6050 		int   i;
6051 
6052 		/*
6053 		 * determine if we appear to be writing the file sequentially
6054 		 * if not, by returning without having pushed any clusters
6055 		 * we will cause this vnode to be pushed into the sparse cluster mechanism
6056 		 * used for managing more random I/O patterns
6057 		 *
6058 		 * we know that we've got all clusters currently in use and the next write doesn't fit into one of them...
6059 		 * that's why we're in try_push with PUSH_DELAY...
6060 		 *
6061 		 * check to make sure that all the clusters except the last one are 'full'... and that each cluster
6062 		 * is adjacent to the next (i.e. we're looking for sequential writes) they were sorted above
6063 		 * so we can just make a simple pass through, up to, but not including the last one...
6064 		 * note that e_addr is not inclusive, so it will be equal to the b_addr of the next cluster if they
6065 		 * are sequential
6066 		 *
6067 		 * we let the last one be partial as long as it was adjacent to the previous one...
6068 		 * we need to do this to deal with multi-threaded servers that might write an I/O or 2 out
6069 		 * of order... if this occurs at the tail of the last cluster, we don't want to fall into the sparse cluster world...
6070 		 */
6071 		for (i = 0; i < MAX_CLUSTERS - 1; i++) {
6072 			if ((l_clusters[i].e_addr - l_clusters[i].b_addr) != max_cluster_pgcount) {
6073 				goto dont_try;
6074 			}
6075 			if (l_clusters[i].e_addr != l_clusters[i + 1].b_addr) {
6076 				goto dont_try;
6077 			}
6078 		}
6079 	}
6080 	if (vm_initiated == TRUE) {
6081 		lck_mtx_unlock(&wbp->cl_lockw);
6082 	}
6083 
6084 	for (cl_index = 0; cl_index < cl_len; cl_index++) {
6085 		int     flags;
6086 		struct  cl_extent cl;
6087 		int retval;
6088 
6089 		flags = io_flags & (IO_PASSIVE | IO_CLOSE);
6090 
6091 		/*
6092 		 * try to push each cluster in turn...
6093 		 */
6094 		if (l_clusters[cl_index].io_flags & CLW_IONOCACHE) {
6095 			flags |= IO_NOCACHE;
6096 		}
6097 
6098 		if (l_clusters[cl_index].io_flags & CLW_IOPASSIVE) {
6099 			flags |= IO_PASSIVE;
6100 		}
6101 
6102 		if (push_flag & PUSH_SYNC) {
6103 			flags |= IO_SYNC;
6104 		}
6105 
6106 		cl.b_addr = l_clusters[cl_index].b_addr;
6107 		cl.e_addr = l_clusters[cl_index].e_addr;
6108 
6109 		retval = cluster_push_now(vp, &cl, EOF, flags, callback, callback_arg, vm_initiated);
6110 
6111 		if (retval == 0) {
6112 			cl_pushed++;
6113 
6114 			l_clusters[cl_index].b_addr = 0;
6115 			l_clusters[cl_index].e_addr = 0;
6116 		} else if (error == 0) {
6117 			error = retval;
6118 		}
6119 
6120 		if (!(push_flag & PUSH_ALL)) {
6121 			break;
6122 		}
6123 	}
6124 	if (vm_initiated == TRUE) {
6125 		lck_mtx_lock(&wbp->cl_lockw);
6126 	}
6127 
6128 	if (err) {
6129 		*err = error;
6130 	}
6131 
6132 dont_try:
6133 	if (cl_len > cl_pushed) {
6134 		/*
6135 		 * we didn't push all of the clusters, so
6136 		 * lets try to merge them back in to the vnode
6137 		 */
6138 		if ((MAX_CLUSTERS - wbp->cl_number) < (cl_len - cl_pushed)) {
6139 			/*
6140 			 * we picked up some new clusters while we were trying to
6141 			 * push the old ones... this can happen because I've dropped
6142 			 * the vnode lock... the sum of the
6143 			 * leftovers plus the new cluster count exceeds our ability
6144 			 * to represent them, so switch to the sparse cluster mechanism
6145 			 *
6146 			 * collect the active public clusters...
6147 			 */
6148 			sparse_cluster_switch(wbp, vp, EOF, callback, callback_arg, vm_initiated);
6149 
6150 			for (cl_index = 0, cl_index1 = 0; cl_index < cl_len; cl_index++) {
6151 				if (l_clusters[cl_index].b_addr == l_clusters[cl_index].e_addr) {
6152 					continue;
6153 				}
6154 				wbp->cl_clusters[cl_index1].b_addr = l_clusters[cl_index].b_addr;
6155 				wbp->cl_clusters[cl_index1].e_addr = l_clusters[cl_index].e_addr;
6156 				wbp->cl_clusters[cl_index1].io_flags = l_clusters[cl_index].io_flags;
6157 
6158 				cl_index1++;
6159 			}
6160 			/*
6161 			 * update the cluster count
6162 			 */
6163 			wbp->cl_number = cl_index1;
6164 
6165 			/*
6166 			 * and collect the original clusters that were moved into the
6167 			 * local storage for sorting purposes
6168 			 */
6169 			sparse_cluster_switch(wbp, vp, EOF, callback, callback_arg, vm_initiated);
6170 		} else {
6171 			/*
6172 			 * we've got room to merge the leftovers back in
6173 			 * just append them starting at the next 'hole'
6174 			 * represented by wbp->cl_number
6175 			 */
6176 			for (cl_index = 0, cl_index1 = wbp->cl_number; cl_index < cl_len; cl_index++) {
6177 				if (l_clusters[cl_index].b_addr == l_clusters[cl_index].e_addr) {
6178 					continue;
6179 				}
6180 
6181 				wbp->cl_clusters[cl_index1].b_addr = l_clusters[cl_index].b_addr;
6182 				wbp->cl_clusters[cl_index1].e_addr = l_clusters[cl_index].e_addr;
6183 				wbp->cl_clusters[cl_index1].io_flags = l_clusters[cl_index].io_flags;
6184 
6185 				cl_index1++;
6186 			}
6187 			/*
6188 			 * update the cluster count
6189 			 */
6190 			wbp->cl_number = cl_index1;
6191 		}
6192 	}
6193 	return MAX_CLUSTERS - wbp->cl_number;
6194 }
6195 
6196 
6197 
6198 static int
cluster_push_now(vnode_t vp,struct cl_extent * cl,off_t EOF,int flags,int (* callback)(buf_t,void *),void * callback_arg,boolean_t vm_initiated)6199 cluster_push_now(vnode_t vp, struct cl_extent *cl, off_t EOF, int flags,
6200     int (*callback)(buf_t, void *), void *callback_arg, boolean_t vm_initiated)
6201 {
6202 	upl_page_info_t *pl;
6203 	upl_t            upl;
6204 	vm_offset_t      upl_offset;
6205 	int              upl_size;
6206 	off_t            upl_f_offset;
6207 	int              pages_in_upl;
6208 	int              start_pg;
6209 	int              last_pg;
6210 	int              io_size;
6211 	int              io_flags;
6212 	int              upl_flags;
6213 	int              bflag;
6214 	int              size;
6215 	int              error = 0;
6216 	int              retval;
6217 	kern_return_t    kret;
6218 
6219 	if (flags & IO_PASSIVE) {
6220 		bflag = CL_PASSIVE;
6221 	} else {
6222 		bflag = 0;
6223 	}
6224 
6225 	if (flags & IO_SKIP_ENCRYPTION) {
6226 		bflag |= CL_ENCRYPTED;
6227 	}
6228 
6229 	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_START,
6230 	    (int)cl->b_addr, (int)cl->e_addr, (int)EOF, flags, 0);
6231 
6232 	if ((pages_in_upl = (int)(cl->e_addr - cl->b_addr)) == 0) {
6233 		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 0, 0, 0, 0);
6234 
6235 		return 0;
6236 	}
6237 	upl_size = pages_in_upl * PAGE_SIZE;
6238 	upl_f_offset = (off_t)(cl->b_addr * PAGE_SIZE_64);
6239 
6240 	if (upl_f_offset + upl_size >= EOF) {
6241 		if (upl_f_offset >= EOF) {
6242 			/*
6243 			 * must have truncated the file and missed
6244 			 * clearing a dangling cluster (i.e. it's completely
6245 			 * beyond the new EOF
6246 			 */
6247 			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 1, 0, 0, 0);
6248 
6249 			return 0;
6250 		}
6251 		size = (int)(EOF - upl_f_offset);
6252 
6253 		upl_size = (size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
6254 		pages_in_upl = upl_size / PAGE_SIZE;
6255 	} else {
6256 		size = upl_size;
6257 	}
6258 
6259 
6260 	if (vm_initiated) {
6261 		vnode_pageout(vp, NULL, (upl_offset_t)0, upl_f_offset, (upl_size_t)upl_size,
6262 		    UPL_MSYNC | UPL_VNODE_PAGER | UPL_KEEPCACHED, &error);
6263 
6264 		return error;
6265 	}
6266 	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_START, upl_size, size, 0, 0, 0);
6267 
6268 	/*
6269 	 * by asking for UPL_COPYOUT_FROM and UPL_RET_ONLY_DIRTY, we get the following desirable behavior
6270 	 *
6271 	 * - only pages that are currently dirty are returned... these are the ones we need to clean
6272 	 * - the hardware dirty bit is cleared when the page is gathered into the UPL... the software dirty bit is set
6273 	 * - if we have to abort the I/O for some reason, the software dirty bit is left set since we didn't clean the page
6274 	 * - when we commit the page, the software dirty bit is cleared... the hardware dirty bit is untouched so that if
6275 	 *   someone dirties this page while the I/O is in progress, we don't lose track of the new state
6276 	 *
6277 	 * when the I/O completes, we no longer ask for an explicit clear of the DIRTY state (either soft or hard)
6278 	 */
6279 
6280 	if ((vp->v_flag & VNOCACHE_DATA) || (flags & IO_NOCACHE)) {
6281 		upl_flags = UPL_COPYOUT_FROM | UPL_RET_ONLY_DIRTY | UPL_SET_LITE | UPL_WILL_BE_DUMPED;
6282 	} else {
6283 		upl_flags = UPL_COPYOUT_FROM | UPL_RET_ONLY_DIRTY | UPL_SET_LITE;
6284 	}
6285 
6286 	kret = ubc_create_upl_kernel(vp,
6287 	    upl_f_offset,
6288 	    upl_size,
6289 	    &upl,
6290 	    &pl,
6291 	    upl_flags,
6292 	    VM_KERN_MEMORY_FILE);
6293 	if (kret != KERN_SUCCESS) {
6294 		panic("cluster_push: failed to get pagelist");
6295 	}
6296 
6297 	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_END, upl, upl_f_offset, 0, 0, 0);
6298 
6299 	/*
6300 	 * since we only asked for the dirty pages back
6301 	 * it's possible that we may only get a few or even none, so...
6302 	 * before we start marching forward, we must make sure we know
6303 	 * where the last present page is in the UPL, otherwise we could
6304 	 * end up working with a freed upl due to the FREE_ON_EMPTY semantics
6305 	 * employed by commit_range and abort_range.
6306 	 */
6307 	for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--) {
6308 		if (upl_page_present(pl, last_pg)) {
6309 			break;
6310 		}
6311 	}
6312 	pages_in_upl = last_pg + 1;
6313 
6314 	if (pages_in_upl == 0) {
6315 		ubc_upl_abort(upl, 0);
6316 
6317 		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 2, 0, 0, 0);
6318 		return 0;
6319 	}
6320 
6321 	for (last_pg = 0; last_pg < pages_in_upl;) {
6322 		/*
6323 		 * find the next dirty page in the UPL
6324 		 * this will become the first page in the
6325 		 * next I/O to generate
6326 		 */
6327 		for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
6328 			if (upl_dirty_page(pl, start_pg)) {
6329 				break;
6330 			}
6331 			if (upl_page_present(pl, start_pg)) {
6332 				/*
6333 				 * RET_ONLY_DIRTY will return non-dirty 'precious' pages
6334 				 * just release these unchanged since we're not going
6335 				 * to steal them or change their state
6336 				 */
6337 				ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
6338 			}
6339 		}
6340 		if (start_pg >= pages_in_upl) {
6341 			/*
6342 			 * done... no more dirty pages to push
6343 			 */
6344 			break;
6345 		}
6346 		if (start_pg > last_pg) {
6347 			/*
6348 			 * skipped over some non-dirty pages
6349 			 */
6350 			size -= ((start_pg - last_pg) * PAGE_SIZE);
6351 		}
6352 
6353 		/*
6354 		 * find a range of dirty pages to write
6355 		 */
6356 		for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
6357 			if (!upl_dirty_page(pl, last_pg)) {
6358 				break;
6359 			}
6360 		}
6361 		upl_offset = start_pg * PAGE_SIZE;
6362 
6363 		io_size = min(size, (last_pg - start_pg) * PAGE_SIZE);
6364 
6365 		io_flags = CL_THROTTLE | CL_COMMIT | CL_AGE | bflag;
6366 
6367 		if (!(flags & IO_SYNC)) {
6368 			io_flags |= CL_ASYNC;
6369 		}
6370 
6371 		if (flags & IO_CLOSE) {
6372 			io_flags |= CL_CLOSE;
6373 		}
6374 
6375 		if (flags & IO_NOCACHE) {
6376 			io_flags |= CL_NOCACHE;
6377 		}
6378 
6379 		retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size,
6380 		    io_flags, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
6381 
6382 		if (error == 0 && retval) {
6383 			error = retval;
6384 		}
6385 
6386 		size -= io_size;
6387 	}
6388 	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 3, error, 0, 0);
6389 
6390 	return error;
6391 }
6392 
6393 
6394 /*
6395  * sparse_cluster_switch is called with the write behind lock held
6396  */
6397 static int
sparse_cluster_switch(struct cl_writebehind * wbp,vnode_t vp,off_t EOF,int (* callback)(buf_t,void *),void * callback_arg,boolean_t vm_initiated)6398 sparse_cluster_switch(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int (*callback)(buf_t, void *), void *callback_arg, boolean_t vm_initiated)
6399 {
6400 	int     cl_index;
6401 	int     error = 0;
6402 
6403 	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_START, kdebug_vnode(vp), wbp->cl_scmap, wbp->cl_number, 0, 0);
6404 
6405 	for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
6406 		int       flags;
6407 		struct cl_extent cl;
6408 
6409 		for (cl.b_addr = wbp->cl_clusters[cl_index].b_addr; cl.b_addr < wbp->cl_clusters[cl_index].e_addr; cl.b_addr++) {
6410 			if (ubc_page_op(vp, (off_t)(cl.b_addr * PAGE_SIZE_64), 0, NULL, &flags) == KERN_SUCCESS) {
6411 				if (flags & UPL_POP_DIRTY) {
6412 					cl.e_addr = cl.b_addr + 1;
6413 
6414 					error = sparse_cluster_add(wbp, &(wbp->cl_scmap), vp, &cl, EOF, callback, callback_arg, vm_initiated);
6415 
6416 					if (error) {
6417 						break;
6418 					}
6419 				}
6420 			}
6421 		}
6422 	}
6423 	wbp->cl_number -= cl_index;
6424 
6425 	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_END, kdebug_vnode(vp), wbp->cl_scmap, wbp->cl_number, error, 0);
6426 
6427 	return error;
6428 }
6429 
6430 
6431 /*
6432  * sparse_cluster_push must be called with the write-behind lock held if the scmap is
6433  * still associated with the write-behind context... however, if the scmap has been disassociated
6434  * from the write-behind context (the cluster_push case), the wb lock is not held
6435  */
6436 static int
sparse_cluster_push(struct cl_writebehind * wbp,void ** scmap,vnode_t vp,off_t EOF,int push_flag,int io_flags,int (* callback)(buf_t,void *),void * callback_arg,boolean_t vm_initiated)6437 sparse_cluster_push(struct cl_writebehind *wbp, void **scmap, vnode_t vp, off_t EOF, int push_flag,
6438     int io_flags, int (*callback)(buf_t, void *), void *callback_arg, boolean_t vm_initiated)
6439 {
6440 	struct cl_extent cl;
6441 	off_t           offset;
6442 	u_int           length;
6443 	void            *l_scmap;
6444 	int error = 0;
6445 
6446 	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_START, kdebug_vnode(vp), (*scmap), 0, push_flag, 0);
6447 
6448 	if (push_flag & PUSH_ALL) {
6449 		vfs_drt_control(scmap, 1);
6450 	}
6451 
6452 	l_scmap = *scmap;
6453 
6454 	for (;;) {
6455 		int retval;
6456 
6457 		if (vfs_drt_get_cluster(scmap, &offset, &length) != KERN_SUCCESS) {
6458 			/*
6459 			 * Not finding anything to push will return KERN_FAILURE.
6460 			 * Confusing since it isn't really a failure. But that's the
6461 			 * reason we don't set 'error' here like we do below.
6462 			 */
6463 			break;
6464 		}
6465 
6466 		if (vm_initiated == TRUE) {
6467 			lck_mtx_unlock(&wbp->cl_lockw);
6468 		}
6469 
6470 		cl.b_addr = (daddr64_t)(offset / PAGE_SIZE_64);
6471 		cl.e_addr = (daddr64_t)((offset + length) / PAGE_SIZE_64);
6472 
6473 		retval = cluster_push_now(vp, &cl, EOF, io_flags, callback, callback_arg, vm_initiated);
6474 		if (error == 0 && retval) {
6475 			error = retval;
6476 		}
6477 
6478 		if (vm_initiated == TRUE) {
6479 			lck_mtx_lock(&wbp->cl_lockw);
6480 
6481 			if (*scmap != l_scmap) {
6482 				break;
6483 			}
6484 		}
6485 
6486 		if (error) {
6487 			if (vfs_drt_mark_pages(scmap, offset, length, NULL) != KERN_SUCCESS) {
6488 				panic("Failed to restore dirty state on failure");
6489 			}
6490 
6491 			break;
6492 		}
6493 
6494 		if (!(push_flag & PUSH_ALL)) {
6495 			break;
6496 		}
6497 	}
6498 	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_END, kdebug_vnode(vp), (*scmap), error, 0, 0);
6499 
6500 	return error;
6501 }
6502 
6503 
6504 /*
6505  * sparse_cluster_add is called with the write behind lock held
6506  */
6507 static int
sparse_cluster_add(struct cl_writebehind * wbp,void ** scmap,vnode_t vp,struct cl_extent * cl,off_t EOF,int (* callback)(buf_t,void *),void * callback_arg,boolean_t vm_initiated)6508 sparse_cluster_add(struct cl_writebehind *wbp, void **scmap, vnode_t vp, struct cl_extent *cl, off_t EOF,
6509     int (*callback)(buf_t, void *), void *callback_arg, boolean_t vm_initiated)
6510 {
6511 	u_int   new_dirty;
6512 	u_int   length;
6513 	off_t   offset;
6514 	int     error = 0;
6515 	int     push_flag = 0; /* Is this a valid value? */
6516 
6517 	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_START, (*scmap), 0, cl->b_addr, (int)cl->e_addr, 0);
6518 
6519 	offset = (off_t)(cl->b_addr * PAGE_SIZE_64);
6520 	length = ((u_int)(cl->e_addr - cl->b_addr)) * PAGE_SIZE;
6521 
6522 	while (vfs_drt_mark_pages(scmap, offset, length, &new_dirty) != KERN_SUCCESS) {
6523 		/*
6524 		 * no room left in the map
6525 		 * only a partial update was done
6526 		 * push out some pages and try again
6527 		 */
6528 
6529 		if (vfs_get_scmap_push_behavior_internal(scmap, &push_flag)) {
6530 			push_flag = 0;
6531 		}
6532 
6533 		error = sparse_cluster_push(wbp, scmap, vp, EOF, push_flag, 0, callback, callback_arg, vm_initiated);
6534 
6535 		if (error) {
6536 			break;
6537 		}
6538 
6539 		offset += (new_dirty * PAGE_SIZE_64);
6540 		length -= (new_dirty * PAGE_SIZE);
6541 	}
6542 	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_END, kdebug_vnode(vp), (*scmap), error, 0, 0);
6543 
6544 	return error;
6545 }
6546 
6547 
6548 static int
cluster_align_phys_io(vnode_t vp,struct uio * uio,addr64_t usr_paddr,u_int32_t xsize,int flags,int (* callback)(buf_t,void *),void * callback_arg)6549 cluster_align_phys_io(vnode_t vp, struct uio *uio, addr64_t usr_paddr, u_int32_t xsize, int flags, int (*callback)(buf_t, void *), void *callback_arg)
6550 {
6551 	upl_page_info_t  *pl;
6552 	upl_t            upl;
6553 	addr64_t         ubc_paddr;
6554 	kern_return_t    kret;
6555 	int              error = 0;
6556 	int              did_read = 0;
6557 	int              abort_flags;
6558 	int              upl_flags;
6559 	int              bflag;
6560 
6561 	if (flags & IO_PASSIVE) {
6562 		bflag = CL_PASSIVE;
6563 	} else {
6564 		bflag = 0;
6565 	}
6566 
6567 	if (flags & IO_NOCACHE) {
6568 		bflag |= CL_NOCACHE;
6569 	}
6570 
6571 	upl_flags = UPL_SET_LITE;
6572 
6573 	if (!(flags & CL_READ)) {
6574 		/*
6575 		 * "write" operation:  let the UPL subsystem know
6576 		 * that we intend to modify the buffer cache pages
6577 		 * we're gathering.
6578 		 */
6579 		upl_flags |= UPL_WILL_MODIFY;
6580 	} else {
6581 		/*
6582 		 * indicate that there is no need to pull the
6583 		 * mapping for this page... we're only going
6584 		 * to read from it, not modify it.
6585 		 */
6586 		upl_flags |= UPL_FILE_IO;
6587 	}
6588 	kret = ubc_create_upl_kernel(vp,
6589 	    uio->uio_offset & ~PAGE_MASK_64,
6590 	    PAGE_SIZE,
6591 	    &upl,
6592 	    &pl,
6593 	    upl_flags,
6594 	    VM_KERN_MEMORY_FILE);
6595 
6596 	if (kret != KERN_SUCCESS) {
6597 		return EINVAL;
6598 	}
6599 
6600 	if (!upl_valid_page(pl, 0)) {
6601 		/*
6602 		 * issue a synchronous read to cluster_io
6603 		 */
6604 		error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE,
6605 		    CL_READ | bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
6606 		if (error) {
6607 			ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
6608 
6609 			return error;
6610 		}
6611 		did_read = 1;
6612 	}
6613 	ubc_paddr = ((addr64_t)upl_phys_page(pl, 0) << PAGE_SHIFT) + (addr64_t)(uio->uio_offset & PAGE_MASK_64);
6614 
6615 /*
6616  *	NOTE:  There is no prototype for the following in BSD. It, and the definitions
6617  *	of the defines for cppvPsrc, cppvPsnk, cppvFsnk, and cppvFsrc will be found in
6618  *	osfmk/ppc/mappings.h.  They are not included here because there appears to be no
6619  *	way to do so without exporting them to kexts as well.
6620  */
6621 	if (flags & CL_READ) {
6622 //		copypv(ubc_paddr, usr_paddr, xsize, cppvPsrc | cppvPsnk | cppvFsnk);	/* Copy physical to physical and flush the destination */
6623 		copypv(ubc_paddr, usr_paddr, xsize, 2 |        1 |        4);           /* Copy physical to physical and flush the destination */
6624 	} else {
6625 //		copypv(usr_paddr, ubc_paddr, xsize, cppvPsrc | cppvPsnk | cppvFsrc);	/* Copy physical to physical and flush the source */
6626 		copypv(usr_paddr, ubc_paddr, xsize, 2 |        1 |        8);           /* Copy physical to physical and flush the source */
6627 	}
6628 	if (!(flags & CL_READ) || (upl_valid_page(pl, 0) && upl_dirty_page(pl, 0))) {
6629 		/*
6630 		 * issue a synchronous write to cluster_io
6631 		 */
6632 		error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE,
6633 		    bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
6634 	}
6635 	if (error == 0) {
6636 		uio_update(uio, (user_size_t)xsize);
6637 	}
6638 
6639 	if (did_read) {
6640 		abort_flags = UPL_ABORT_FREE_ON_EMPTY;
6641 	} else {
6642 		abort_flags = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;
6643 	}
6644 
6645 	ubc_upl_abort_range(upl, 0, PAGE_SIZE, abort_flags);
6646 
6647 	return error;
6648 }
6649 
6650 int
cluster_copy_upl_data(struct uio * uio,upl_t upl,int upl_offset,int * io_resid)6651 cluster_copy_upl_data(struct uio *uio, upl_t upl, int upl_offset, int *io_resid)
6652 {
6653 	int       pg_offset;
6654 	int       pg_index;
6655 	int       csize;
6656 	int       segflg;
6657 	int       retval = 0;
6658 	int       xsize;
6659 	upl_page_info_t *pl;
6660 	int       dirty_count;
6661 
6662 	xsize = *io_resid;
6663 
6664 	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
6665 	    (int)uio->uio_offset, upl_offset, xsize, 0, 0);
6666 
6667 	segflg = uio->uio_segflg;
6668 
6669 	switch (segflg) {
6670 	case UIO_USERSPACE32:
6671 	case UIO_USERISPACE32:
6672 		uio->uio_segflg = UIO_PHYS_USERSPACE32;
6673 		break;
6674 
6675 	case UIO_USERSPACE:
6676 	case UIO_USERISPACE:
6677 		uio->uio_segflg = UIO_PHYS_USERSPACE;
6678 		break;
6679 
6680 	case UIO_USERSPACE64:
6681 	case UIO_USERISPACE64:
6682 		uio->uio_segflg = UIO_PHYS_USERSPACE64;
6683 		break;
6684 
6685 	case UIO_SYSSPACE:
6686 		uio->uio_segflg = UIO_PHYS_SYSSPACE;
6687 		break;
6688 	}
6689 	pl = ubc_upl_pageinfo(upl);
6690 
6691 	pg_index  = upl_offset / PAGE_SIZE;
6692 	pg_offset = upl_offset & PAGE_MASK;
6693 	csize     = min(PAGE_SIZE - pg_offset, xsize);
6694 
6695 	dirty_count = 0;
6696 	while (xsize && retval == 0) {
6697 		addr64_t  paddr;
6698 
6699 		paddr = ((addr64_t)upl_phys_page(pl, pg_index) << PAGE_SHIFT) + pg_offset;
6700 		if ((uio->uio_rw == UIO_WRITE) && (upl_dirty_page(pl, pg_index) == FALSE)) {
6701 			dirty_count++;
6702 		}
6703 
6704 		retval = uiomove64(paddr, csize, uio);
6705 
6706 		pg_index += 1;
6707 		pg_offset = 0;
6708 		xsize    -= csize;
6709 		csize     = min(PAGE_SIZE, xsize);
6710 	}
6711 	*io_resid = xsize;
6712 
6713 	uio->uio_segflg = segflg;
6714 
6715 	task_update_logical_writes(current_task(), (dirty_count * PAGE_SIZE), TASK_WRITE_DEFERRED, upl_lookup_vnode(upl));
6716 	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
6717 	    (int)uio->uio_offset, xsize, retval, segflg, 0);
6718 
6719 	return retval;
6720 }
6721 
6722 
6723 int
cluster_copy_ubc_data(vnode_t vp,struct uio * uio,int * io_resid,int mark_dirty)6724 cluster_copy_ubc_data(vnode_t vp, struct uio *uio, int *io_resid, int mark_dirty)
6725 {
6726 	return cluster_copy_ubc_data_internal(vp, uio, io_resid, mark_dirty, 1);
6727 }
6728 
6729 
6730 static int
cluster_copy_ubc_data_internal(vnode_t vp,struct uio * uio,int * io_resid,int mark_dirty,int take_reference)6731 cluster_copy_ubc_data_internal(vnode_t vp, struct uio *uio, int *io_resid, int mark_dirty, int take_reference)
6732 {
6733 	int       segflg;
6734 	int       io_size;
6735 	int       xsize;
6736 	int       start_offset;
6737 	int       retval = 0;
6738 	memory_object_control_t  control;
6739 
6740 	io_size = *io_resid;
6741 
6742 	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
6743 	    (int)uio->uio_offset, io_size, mark_dirty, take_reference, 0);
6744 
6745 	control = ubc_getobject(vp, UBC_FLAGS_NONE);
6746 
6747 	if (control == MEMORY_OBJECT_CONTROL_NULL) {
6748 		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
6749 		    (int)uio->uio_offset, io_size, retval, 3, 0);
6750 
6751 		return 0;
6752 	}
6753 	segflg = uio->uio_segflg;
6754 
6755 	switch (segflg) {
6756 	case UIO_USERSPACE32:
6757 	case UIO_USERISPACE32:
6758 		uio->uio_segflg = UIO_PHYS_USERSPACE32;
6759 		break;
6760 
6761 	case UIO_USERSPACE64:
6762 	case UIO_USERISPACE64:
6763 		uio->uio_segflg = UIO_PHYS_USERSPACE64;
6764 		break;
6765 
6766 	case UIO_USERSPACE:
6767 	case UIO_USERISPACE:
6768 		uio->uio_segflg = UIO_PHYS_USERSPACE;
6769 		break;
6770 
6771 	case UIO_SYSSPACE:
6772 		uio->uio_segflg = UIO_PHYS_SYSSPACE;
6773 		break;
6774 	}
6775 
6776 	if ((io_size = *io_resid)) {
6777 		start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
6778 		xsize = (int)uio_resid(uio);
6779 
6780 		retval = memory_object_control_uiomove(control, uio->uio_offset - start_offset, uio,
6781 		    start_offset, io_size, mark_dirty, take_reference);
6782 		xsize -= uio_resid(uio);
6783 		io_size -= xsize;
6784 	}
6785 	uio->uio_segflg = segflg;
6786 	*io_resid       = io_size;
6787 
6788 	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
6789 	    (int)uio->uio_offset, io_size, retval, 0x80000000 | segflg, 0);
6790 
6791 	return retval;
6792 }
6793 
6794 
6795 int
is_file_clean(vnode_t vp,off_t filesize)6796 is_file_clean(vnode_t vp, off_t filesize)
6797 {
6798 	off_t f_offset;
6799 	int   flags;
6800 	int   total_dirty = 0;
6801 
6802 	for (f_offset = 0; f_offset < filesize; f_offset += PAGE_SIZE_64) {
6803 		if (ubc_page_op(vp, f_offset, 0, NULL, &flags) == KERN_SUCCESS) {
6804 			if (flags & UPL_POP_DIRTY) {
6805 				total_dirty++;
6806 			}
6807 		}
6808 	}
6809 	if (total_dirty) {
6810 		return EINVAL;
6811 	}
6812 
6813 	return 0;
6814 }
6815 
6816 
6817 
6818 /*
6819  * Dirty region tracking/clustering mechanism.
6820  *
6821  * This code (vfs_drt_*) provides a mechanism for tracking and clustering
6822  * dirty regions within a larger space (file).  It is primarily intended to
6823  * support clustering in large files with many dirty areas.
6824  *
6825  * The implementation assumes that the dirty regions are pages.
6826  *
6827  * To represent dirty pages within the file, we store bit vectors in a
6828  * variable-size circular hash.
6829  */
6830 
6831 /*
6832  * Bitvector size.  This determines the number of pages we group in a
6833  * single hashtable entry.  Each hashtable entry is aligned to this
6834  * size within the file.
6835  */
6836 #define DRT_BITVECTOR_PAGES             ((1024 * 256) / PAGE_SIZE)
6837 
6838 /*
6839  * File offset handling.
6840  *
6841  * DRT_ADDRESS_MASK is dependent on DRT_BITVECTOR_PAGES;
6842  * the correct formula is  (~((DRT_BITVECTOR_PAGES * PAGE_SIZE) - 1))
6843  */
6844 #define DRT_ADDRESS_MASK                (~((DRT_BITVECTOR_PAGES * PAGE_SIZE) - 1))
6845 #define DRT_ALIGN_ADDRESS(addr)         ((addr) & DRT_ADDRESS_MASK)
6846 
6847 /*
6848  * Hashtable address field handling.
6849  *
6850  * The low-order bits of the hashtable address are used to conserve
6851  * space.
6852  *
6853  * DRT_HASH_COUNT_MASK must be large enough to store the range
6854  * 0-DRT_BITVECTOR_PAGES inclusive, as well as have one value
6855  * to indicate that the bucket is actually unoccupied.
6856  */
6857 #define DRT_HASH_GET_ADDRESS(scm, i)    ((scm)->scm_hashtable[(i)].dhe_control & DRT_ADDRESS_MASK)
6858 #define DRT_HASH_SET_ADDRESS(scm, i, a)                                                                 \
6859 	do {                                                                                            \
6860 	        (scm)->scm_hashtable[(i)].dhe_control =                                                 \
6861 	            ((scm)->scm_hashtable[(i)].dhe_control & ~DRT_ADDRESS_MASK) | DRT_ALIGN_ADDRESS(a); \
6862 	} while (0)
6863 #define DRT_HASH_COUNT_MASK             0x1ff
6864 #define DRT_HASH_GET_COUNT(scm, i)      ((scm)->scm_hashtable[(i)].dhe_control & DRT_HASH_COUNT_MASK)
6865 #define DRT_HASH_SET_COUNT(scm, i, c)                                                                                   \
6866 	do {                                                                                                            \
6867 	        (scm)->scm_hashtable[(i)].dhe_control =                                                                 \
6868 	            ((scm)->scm_hashtable[(i)].dhe_control & ~DRT_HASH_COUNT_MASK) | ((c) & DRT_HASH_COUNT_MASK);       \
6869 	} while (0)
6870 #define DRT_HASH_CLEAR(scm, i)                                                                                          \
6871 	do {                                                                                                            \
6872 	        (scm)->scm_hashtable[(i)].dhe_control =	0;                                                              \
6873 	} while (0)
6874 #define DRT_HASH_VACATE(scm, i)         DRT_HASH_SET_COUNT((scm), (i), DRT_HASH_COUNT_MASK)
6875 #define DRT_HASH_VACANT(scm, i)         (DRT_HASH_GET_COUNT((scm), (i)) == DRT_HASH_COUNT_MASK)
6876 #define DRT_HASH_COPY(oscm, oi, scm, i)                                                                 \
6877 	do {                                                                                            \
6878 	        (scm)->scm_hashtable[(i)].dhe_control = (oscm)->scm_hashtable[(oi)].dhe_control;        \
6879 	        DRT_BITVECTOR_COPY(oscm, oi, scm, i);                                                   \
6880 	} while(0);
6881 
6882 
6883 #if !defined(XNU_TARGET_OS_OSX)
6884 /*
6885  * Hash table moduli.
6886  *
6887  * Since the hashtable entry's size is dependent on the size of
6888  * the bitvector, and since the hashtable size is constrained to
6889  * both being prime and fitting within the desired allocation
6890  * size, these values need to be manually determined.
6891  *
6892  * For DRT_BITVECTOR_SIZE = 64, the entry size is 16 bytes.
6893  *
6894  * The small hashtable allocation is 4096 bytes, so the modulus is 251.
6895  * The large hashtable allocation is 32768 bytes, so the modulus is 2039.
6896  * The xlarge hashtable allocation is 131072 bytes, so the modulus is 8179.
6897  */
6898 
6899 #define DRT_HASH_SMALL_MODULUS  251
6900 #define DRT_HASH_LARGE_MODULUS  2039
6901 #define DRT_HASH_XLARGE_MODULUS  8179
6902 
6903 /*
6904  * Physical memory required before the large hash modulus is permitted.
6905  *
6906  * On small memory systems, the large hash modulus can lead to phsyical
6907  * memory starvation, so we avoid using it there.
6908  */
6909 #define DRT_HASH_LARGE_MEMORY_REQUIRED  (1024LL * 1024LL * 1024LL)      /* 1GiB */
6910 #define DRT_HASH_XLARGE_MEMORY_REQUIRED  (8 * 1024LL * 1024LL * 1024LL)  /* 8GiB */
6911 
6912 #define DRT_SMALL_ALLOCATION    4096    /* 80 bytes spare */
6913 #define DRT_LARGE_ALLOCATION    32768   /* 144 bytes spare */
6914 #define DRT_XLARGE_ALLOCATION    131072  /* 208 bytes spare */
6915 
6916 #else /* XNU_TARGET_OS_OSX */
6917 /*
6918  * Hash table moduli.
6919  *
6920  * Since the hashtable entry's size is dependent on the size of
6921  * the bitvector, and since the hashtable size is constrained to
6922  * both being prime and fitting within the desired allocation
6923  * size, these values need to be manually determined.
6924  *
6925  * For DRT_BITVECTOR_SIZE = 64, the entry size is 16 bytes.
6926  *
6927  * The small hashtable allocation is 16384 bytes, so the modulus is 1019.
6928  * The large hashtable allocation is 131072 bytes, so the modulus is 8179.
6929  * The xlarge hashtable allocation is 524288 bytes, so the modulus is 32749.
6930  */
6931 
6932 #define DRT_HASH_SMALL_MODULUS  1019
6933 #define DRT_HASH_LARGE_MODULUS  8179
6934 #define DRT_HASH_XLARGE_MODULUS  32749
6935 
6936 /*
6937  * Physical memory required before the large hash modulus is permitted.
6938  *
6939  * On small memory systems, the large hash modulus can lead to phsyical
6940  * memory starvation, so we avoid using it there.
6941  */
6942 #define DRT_HASH_LARGE_MEMORY_REQUIRED  (4 * 1024LL * 1024LL * 1024LL)  /* 4GiB */
6943 #define DRT_HASH_XLARGE_MEMORY_REQUIRED  (32 * 1024LL * 1024LL * 1024LL)  /* 32GiB */
6944 
6945 #define DRT_SMALL_ALLOCATION    16384   /* 80 bytes spare */
6946 #define DRT_LARGE_ALLOCATION    131072  /* 208 bytes spare */
6947 #define DRT_XLARGE_ALLOCATION   524288  /* 304 bytes spare */
6948 
6949 #endif /* ! XNU_TARGET_OS_OSX */
6950 
6951 /* *** nothing below here has secret dependencies on DRT_BITVECTOR_PAGES *** */
6952 
6953 /*
6954  * Hashtable entry.
6955  */
6956 struct vfs_drt_hashentry {
6957 	u_int64_t       dhe_control;
6958 /*
6959  * dhe_bitvector was declared as dhe_bitvector[DRT_BITVECTOR_PAGES / 32];
6960  * DRT_BITVECTOR_PAGES is defined as ((1024 * 256) / PAGE_SIZE)
6961  * Since PAGE_SIZE is only known at boot time,
6962  *	-define MAX_DRT_BITVECTOR_PAGES for smallest supported page size (4k)
6963  *	-declare dhe_bitvector array for largest possible length
6964  */
6965 #define MAX_DRT_BITVECTOR_PAGES (1024 * 256)/( 4 * 1024)
6966 	u_int32_t       dhe_bitvector[MAX_DRT_BITVECTOR_PAGES / 32];
6967 };
6968 
6969 /*
6970  * Hashtable bitvector handling.
6971  *
6972  * Bitvector fields are 32 bits long.
6973  */
6974 
6975 #define DRT_HASH_SET_BIT(scm, i, bit)                           \
6976 	(scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] |= (1 << ((bit) % 32))
6977 
6978 #define DRT_HASH_CLEAR_BIT(scm, i, bit)                         \
6979 	(scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] &= ~(1 << ((bit) % 32))
6980 
6981 #define DRT_HASH_TEST_BIT(scm, i, bit)                          \
6982 	((scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] & (1 << ((bit) % 32)))
6983 
6984 #define DRT_BITVECTOR_CLEAR(scm, i)                             \
6985 	bzero(&(scm)->scm_hashtable[(i)].dhe_bitvector[0], (MAX_DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t))
6986 
6987 #define DRT_BITVECTOR_COPY(oscm, oi, scm, i)                    \
6988 	bcopy(&(oscm)->scm_hashtable[(oi)].dhe_bitvector[0],    \
6989 	    &(scm)->scm_hashtable[(i)].dhe_bitvector[0],        \
6990 	    (MAX_DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t))
6991 
6992 /*
6993  * Dirty Region Tracking structure.
6994  *
6995  * The hashtable is allocated entirely inside the DRT structure.
6996  *
6997  * The hash is a simple circular prime modulus arrangement, the structure
6998  * is resized from small to large if it overflows.
6999  */
7000 
7001 struct vfs_drt_clustermap {
7002 	u_int32_t               scm_magic;      /* sanity/detection */
7003 #define DRT_SCM_MAGIC           0x12020003
7004 	u_int32_t               scm_modulus;    /* current ring size */
7005 	u_int32_t               scm_buckets;    /* number of occupied buckets */
7006 	u_int32_t               scm_lastclean;  /* last entry we cleaned */
7007 	u_int32_t               scm_iskips;     /* number of slot skips */
7008 
7009 	struct vfs_drt_hashentry scm_hashtable[0];
7010 };
7011 
7012 
7013 #define DRT_HASH(scm, addr)             ((addr) % (scm)->scm_modulus)
7014 #define DRT_HASH_NEXT(scm, addr)        (((addr) + 1) % (scm)->scm_modulus)
7015 
7016 /*
7017  * Debugging codes and arguments.
7018  */
7019 #define DRT_DEBUG_EMPTYFREE     (FSDBG_CODE(DBG_FSRW, 82)) /* nil */
7020 #define DRT_DEBUG_RETCLUSTER    (FSDBG_CODE(DBG_FSRW, 83)) /* offset, length */
7021 #define DRT_DEBUG_ALLOC         (FSDBG_CODE(DBG_FSRW, 84)) /* copycount */
7022 #define DRT_DEBUG_INSERT        (FSDBG_CODE(DBG_FSRW, 85)) /* offset, iskip */
7023 #define DRT_DEBUG_MARK          (FSDBG_CODE(DBG_FSRW, 86)) /* offset, length,
7024 	                                                    * dirty */
7025                                                            /* 0, setcount */
7026                                                            /* 1 (clean, no map) */
7027                                                            /* 2 (map alloc fail) */
7028                                                            /* 3, resid (partial) */
7029 #define DRT_DEBUG_6             (FSDBG_CODE(DBG_FSRW, 87))
7030 #define DRT_DEBUG_SCMDATA       (FSDBG_CODE(DBG_FSRW, 88)) /* modulus, buckets,
7031 	                                                    * lastclean, iskips */
7032 
7033 
7034 static kern_return_t    vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp);
7035 static kern_return_t    vfs_drt_free_map(struct vfs_drt_clustermap *cmap);
7036 static kern_return_t    vfs_drt_search_index(struct vfs_drt_clustermap *cmap,
7037     u_int64_t offset, int *indexp);
7038 static kern_return_t    vfs_drt_get_index(struct vfs_drt_clustermap **cmapp,
7039     u_int64_t offset,
7040     int *indexp,
7041     int recursed);
7042 static kern_return_t    vfs_drt_do_mark_pages(
7043 	void            **cmapp,
7044 	u_int64_t       offset,
7045 	u_int           length,
7046 	u_int           *setcountp,
7047 	int             dirty);
7048 static void             vfs_drt_trace(
7049 	struct vfs_drt_clustermap *cmap,
7050 	int code,
7051 	int arg1,
7052 	int arg2,
7053 	int arg3,
7054 	int arg4);
7055 
7056 
7057 /*
7058  * Allocate and initialise a sparse cluster map.
7059  *
7060  * Will allocate a new map, resize or compact an existing map.
7061  *
7062  * XXX we should probably have at least one intermediate map size,
7063  * as the 1:16 ratio seems a bit drastic.
7064  */
7065 static kern_return_t
vfs_drt_alloc_map(struct vfs_drt_clustermap ** cmapp)7066 vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp)
7067 {
7068 	struct vfs_drt_clustermap *cmap = NULL, *ocmap = NULL;
7069 	kern_return_t   kret = KERN_SUCCESS;
7070 	u_int64_t       offset = 0;
7071 	u_int32_t       i = 0;
7072 	int             modulus_size = 0, map_size = 0, active_buckets = 0, index = 0, copycount = 0;
7073 
7074 	ocmap = NULL;
7075 	if (cmapp != NULL) {
7076 		ocmap = *cmapp;
7077 	}
7078 
7079 	/*
7080 	 * Decide on the size of the new map.
7081 	 */
7082 	if (ocmap == NULL) {
7083 		modulus_size = DRT_HASH_SMALL_MODULUS;
7084 		map_size = DRT_SMALL_ALLOCATION;
7085 	} else {
7086 		/* count the number of active buckets in the old map */
7087 		active_buckets = 0;
7088 		for (i = 0; i < ocmap->scm_modulus; i++) {
7089 			if (!DRT_HASH_VACANT(ocmap, i) &&
7090 			    (DRT_HASH_GET_COUNT(ocmap, i) != 0)) {
7091 				active_buckets++;
7092 			}
7093 		}
7094 		/*
7095 		 * If we're currently using the small allocation, check to
7096 		 * see whether we should grow to the large one.
7097 		 */
7098 		if (ocmap->scm_modulus == DRT_HASH_SMALL_MODULUS) {
7099 			/*
7100 			 * If the ring is nearly full and we are allowed to
7101 			 * use the large modulus, upgrade.
7102 			 */
7103 			if ((active_buckets > (DRT_HASH_SMALL_MODULUS - 5)) &&
7104 			    (max_mem >= DRT_HASH_LARGE_MEMORY_REQUIRED)) {
7105 				modulus_size = DRT_HASH_LARGE_MODULUS;
7106 				map_size = DRT_LARGE_ALLOCATION;
7107 			} else {
7108 				modulus_size = DRT_HASH_SMALL_MODULUS;
7109 				map_size = DRT_SMALL_ALLOCATION;
7110 			}
7111 		} else if (ocmap->scm_modulus == DRT_HASH_LARGE_MODULUS) {
7112 			if ((active_buckets > (DRT_HASH_LARGE_MODULUS - 5)) &&
7113 			    (max_mem >= DRT_HASH_XLARGE_MEMORY_REQUIRED)) {
7114 				modulus_size = DRT_HASH_XLARGE_MODULUS;
7115 				map_size = DRT_XLARGE_ALLOCATION;
7116 			} else {
7117 				/*
7118 				 * If the ring is completely full and we can't
7119 				 * expand, there's nothing useful for us to do.
7120 				 * Behave as though we had compacted into the new
7121 				 * array and return.
7122 				 */
7123 				return KERN_SUCCESS;
7124 			}
7125 		} else {
7126 			/* already using the xlarge modulus */
7127 			modulus_size = DRT_HASH_XLARGE_MODULUS;
7128 			map_size = DRT_XLARGE_ALLOCATION;
7129 
7130 			/*
7131 			 * If the ring is completely full, there's
7132 			 * nothing useful for us to do.  Behave as
7133 			 * though we had compacted into the new
7134 			 * array and return.
7135 			 */
7136 			if (active_buckets >= DRT_HASH_XLARGE_MODULUS) {
7137 				return KERN_SUCCESS;
7138 			}
7139 		}
7140 	}
7141 
7142 	/*
7143 	 * Allocate and initialise the new map.
7144 	 */
7145 
7146 	kret = kmem_alloc(kernel_map, (vm_offset_t *)&cmap, map_size,
7147 	    KMA_DATA, VM_KERN_MEMORY_FILE);
7148 	if (kret != KERN_SUCCESS) {
7149 		return kret;
7150 	}
7151 	cmap->scm_magic = DRT_SCM_MAGIC;
7152 	cmap->scm_modulus = modulus_size;
7153 	cmap->scm_buckets = 0;
7154 	cmap->scm_lastclean = 0;
7155 	cmap->scm_iskips = 0;
7156 	for (i = 0; i < cmap->scm_modulus; i++) {
7157 		DRT_HASH_CLEAR(cmap, i);
7158 		DRT_HASH_VACATE(cmap, i);
7159 		DRT_BITVECTOR_CLEAR(cmap, i);
7160 	}
7161 
7162 	/*
7163 	 * If there's an old map, re-hash entries from it into the new map.
7164 	 */
7165 	copycount = 0;
7166 	if (ocmap != NULL) {
7167 		for (i = 0; i < ocmap->scm_modulus; i++) {
7168 			/* skip empty buckets */
7169 			if (DRT_HASH_VACANT(ocmap, i) ||
7170 			    (DRT_HASH_GET_COUNT(ocmap, i) == 0)) {
7171 				continue;
7172 			}
7173 			/* get new index */
7174 			offset = DRT_HASH_GET_ADDRESS(ocmap, i);
7175 			kret = vfs_drt_get_index(&cmap, offset, &index, 1);
7176 			if (kret != KERN_SUCCESS) {
7177 				/* XXX need to bail out gracefully here */
7178 				panic("vfs_drt: new cluster map mysteriously too small");
7179 				index = 0;
7180 			}
7181 			/* copy */
7182 			DRT_HASH_COPY(ocmap, i, cmap, index);
7183 			copycount++;
7184 		}
7185 	}
7186 
7187 	/* log what we've done */
7188 	vfs_drt_trace(cmap, DRT_DEBUG_ALLOC, copycount, 0, 0, 0);
7189 
7190 	/*
7191 	 * It's important to ensure that *cmapp always points to
7192 	 * a valid map, so we must overwrite it before freeing
7193 	 * the old map.
7194 	 */
7195 	*cmapp = cmap;
7196 	if (ocmap != NULL) {
7197 		/* emit stats into trace buffer */
7198 		vfs_drt_trace(ocmap, DRT_DEBUG_SCMDATA,
7199 		    ocmap->scm_modulus,
7200 		    ocmap->scm_buckets,
7201 		    ocmap->scm_lastclean,
7202 		    ocmap->scm_iskips);
7203 
7204 		vfs_drt_free_map(ocmap);
7205 	}
7206 	return KERN_SUCCESS;
7207 }
7208 
7209 
7210 /*
7211  * Free a sparse cluster map.
7212  */
7213 static kern_return_t
vfs_drt_free_map(struct vfs_drt_clustermap * cmap)7214 vfs_drt_free_map(struct vfs_drt_clustermap *cmap)
7215 {
7216 	vm_size_t map_size = 0;
7217 
7218 	if (cmap->scm_modulus == DRT_HASH_SMALL_MODULUS) {
7219 		map_size = DRT_SMALL_ALLOCATION;
7220 	} else if (cmap->scm_modulus == DRT_HASH_LARGE_MODULUS) {
7221 		map_size = DRT_LARGE_ALLOCATION;
7222 	} else if (cmap->scm_modulus == DRT_HASH_XLARGE_MODULUS) {
7223 		map_size = DRT_XLARGE_ALLOCATION;
7224 	} else {
7225 		panic("vfs_drt_free_map: Invalid modulus %d", cmap->scm_modulus);
7226 	}
7227 
7228 	kmem_free(kernel_map, (vm_offset_t)cmap, map_size);
7229 	return KERN_SUCCESS;
7230 }
7231 
7232 
7233 /*
7234  * Find the hashtable slot currently occupied by an entry for the supplied offset.
7235  */
7236 static kern_return_t
vfs_drt_search_index(struct vfs_drt_clustermap * cmap,u_int64_t offset,int * indexp)7237 vfs_drt_search_index(struct vfs_drt_clustermap *cmap, u_int64_t offset, int *indexp)
7238 {
7239 	int             index;
7240 	u_int32_t       i;
7241 
7242 	offset = DRT_ALIGN_ADDRESS(offset);
7243 	index = DRT_HASH(cmap, offset);
7244 
7245 	/* traverse the hashtable */
7246 	for (i = 0; i < cmap->scm_modulus; i++) {
7247 		/*
7248 		 * If the slot is vacant, we can stop.
7249 		 */
7250 		if (DRT_HASH_VACANT(cmap, index)) {
7251 			break;
7252 		}
7253 
7254 		/*
7255 		 * If the address matches our offset, we have success.
7256 		 */
7257 		if (DRT_HASH_GET_ADDRESS(cmap, index) == offset) {
7258 			*indexp = index;
7259 			return KERN_SUCCESS;
7260 		}
7261 
7262 		/*
7263 		 * Move to the next slot, try again.
7264 		 */
7265 		index = DRT_HASH_NEXT(cmap, index);
7266 	}
7267 	/*
7268 	 * It's not there.
7269 	 */
7270 	return KERN_FAILURE;
7271 }
7272 
7273 /*
7274  * Find the hashtable slot for the supplied offset.  If we haven't allocated
7275  * one yet, allocate one and populate the address field.  Note that it will
7276  * not have a nonzero page count and thus will still technically be free, so
7277  * in the case where we are called to clean pages, the slot will remain free.
7278  */
7279 static kern_return_t
vfs_drt_get_index(struct vfs_drt_clustermap ** cmapp,u_int64_t offset,int * indexp,int recursed)7280 vfs_drt_get_index(struct vfs_drt_clustermap **cmapp, u_int64_t offset, int *indexp, int recursed)
7281 {
7282 	struct vfs_drt_clustermap *cmap;
7283 	kern_return_t   kret;
7284 	u_int32_t       index;
7285 	u_int32_t       i;
7286 
7287 	cmap = *cmapp;
7288 
7289 	/* look for an existing entry */
7290 	kret = vfs_drt_search_index(cmap, offset, indexp);
7291 	if (kret == KERN_SUCCESS) {
7292 		return kret;
7293 	}
7294 
7295 	/* need to allocate an entry */
7296 	offset = DRT_ALIGN_ADDRESS(offset);
7297 	index = DRT_HASH(cmap, offset);
7298 
7299 	/* scan from the index forwards looking for a vacant slot */
7300 	for (i = 0; i < cmap->scm_modulus; i++) {
7301 		/* slot vacant? */
7302 		if (DRT_HASH_VACANT(cmap, index) || DRT_HASH_GET_COUNT(cmap, index) == 0) {
7303 			cmap->scm_buckets++;
7304 			if (index < cmap->scm_lastclean) {
7305 				cmap->scm_lastclean = index;
7306 			}
7307 			DRT_HASH_SET_ADDRESS(cmap, index, offset);
7308 			DRT_HASH_SET_COUNT(cmap, index, 0);
7309 			DRT_BITVECTOR_CLEAR(cmap, index);
7310 			*indexp = index;
7311 			vfs_drt_trace(cmap, DRT_DEBUG_INSERT, (int)offset, i, 0, 0);
7312 			return KERN_SUCCESS;
7313 		}
7314 		cmap->scm_iskips += i;
7315 		index = DRT_HASH_NEXT(cmap, index);
7316 	}
7317 
7318 	/*
7319 	 * We haven't found a vacant slot, so the map is full.  If we're not
7320 	 * already recursed, try reallocating/compacting it.
7321 	 */
7322 	if (recursed) {
7323 		return KERN_FAILURE;
7324 	}
7325 	kret = vfs_drt_alloc_map(cmapp);
7326 	if (kret == KERN_SUCCESS) {
7327 		/* now try to insert again */
7328 		kret = vfs_drt_get_index(cmapp, offset, indexp, 1);
7329 	}
7330 	return kret;
7331 }
7332 
7333 /*
7334  * Implementation of set dirty/clean.
7335  *
7336  * In the 'clean' case, not finding a map is OK.
7337  */
7338 static kern_return_t
vfs_drt_do_mark_pages(void ** private,u_int64_t offset,u_int length,u_int * setcountp,int dirty)7339 vfs_drt_do_mark_pages(
7340 	void            **private,
7341 	u_int64_t       offset,
7342 	u_int           length,
7343 	u_int           *setcountp,
7344 	int             dirty)
7345 {
7346 	struct vfs_drt_clustermap *cmap, **cmapp;
7347 	kern_return_t   kret;
7348 	int             i, index, pgoff, pgcount, setcount, ecount;
7349 
7350 	cmapp = (struct vfs_drt_clustermap **)private;
7351 	cmap = *cmapp;
7352 
7353 	vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_START, (int)offset, (int)length, dirty, 0);
7354 
7355 	if (setcountp != NULL) {
7356 		*setcountp = 0;
7357 	}
7358 
7359 	/* allocate a cluster map if we don't already have one */
7360 	if (cmap == NULL) {
7361 		/* no cluster map, nothing to clean */
7362 		if (!dirty) {
7363 			vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 1, 0, 0, 0);
7364 			return KERN_SUCCESS;
7365 		}
7366 		kret = vfs_drt_alloc_map(cmapp);
7367 		if (kret != KERN_SUCCESS) {
7368 			vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 2, 0, 0, 0);
7369 			return kret;
7370 		}
7371 	}
7372 	setcount = 0;
7373 
7374 	/*
7375 	 * Iterate over the length of the region.
7376 	 */
7377 	while (length > 0) {
7378 		/*
7379 		 * Get the hashtable index for this offset.
7380 		 *
7381 		 * XXX this will add blank entries if we are clearing a range
7382 		 * that hasn't been dirtied.
7383 		 */
7384 		kret = vfs_drt_get_index(cmapp, offset, &index, 0);
7385 		cmap = *cmapp;  /* may have changed! */
7386 		/* this may be a partial-success return */
7387 		if (kret != KERN_SUCCESS) {
7388 			if (setcountp != NULL) {
7389 				*setcountp = setcount;
7390 			}
7391 			vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 3, (int)length, 0, 0);
7392 
7393 			return kret;
7394 		}
7395 
7396 		/*
7397 		 * Work out how many pages we're modifying in this
7398 		 * hashtable entry.
7399 		 */
7400 		pgoff = (int)((offset - DRT_ALIGN_ADDRESS(offset)) / PAGE_SIZE);
7401 		pgcount = min((length / PAGE_SIZE), (DRT_BITVECTOR_PAGES - pgoff));
7402 
7403 		/*
7404 		 * Iterate over pages, dirty/clearing as we go.
7405 		 */
7406 		ecount = DRT_HASH_GET_COUNT(cmap, index);
7407 		for (i = 0; i < pgcount; i++) {
7408 			if (dirty) {
7409 				if (!DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) {
7410 					if (ecount >= DRT_BITVECTOR_PAGES) {
7411 						panic("ecount >= DRT_BITVECTOR_PAGES, cmap = %p, index = %d, bit = %d", cmap, index, pgoff + i);
7412 					}
7413 					DRT_HASH_SET_BIT(cmap, index, pgoff + i);
7414 					ecount++;
7415 					setcount++;
7416 				}
7417 			} else {
7418 				if (DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) {
7419 					if (ecount <= 0) {
7420 						panic("ecount <= 0, cmap = %p, index = %d, bit = %d", cmap, index, pgoff + i);
7421 					}
7422 					assert(ecount > 0);
7423 					DRT_HASH_CLEAR_BIT(cmap, index, pgoff + i);
7424 					ecount--;
7425 					setcount++;
7426 				}
7427 			}
7428 		}
7429 		DRT_HASH_SET_COUNT(cmap, index, ecount);
7430 
7431 		offset += pgcount * PAGE_SIZE;
7432 		length -= pgcount * PAGE_SIZE;
7433 	}
7434 	if (setcountp != NULL) {
7435 		*setcountp = setcount;
7436 	}
7437 
7438 	vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 0, setcount, 0, 0);
7439 
7440 	return KERN_SUCCESS;
7441 }
7442 
7443 /*
7444  * Mark a set of pages as dirty/clean.
7445  *
7446  * This is a public interface.
7447  *
7448  * cmapp
7449  *	Pointer to storage suitable for holding a pointer.  Note that
7450  *	this must either be NULL or a value set by this function.
7451  *
7452  * size
7453  *	Current file size in bytes.
7454  *
7455  * offset
7456  *	Offset of the first page to be marked as dirty, in bytes.  Must be
7457  *	page-aligned.
7458  *
7459  * length
7460  *	Length of dirty region, in bytes.  Must be a multiple of PAGE_SIZE.
7461  *
7462  * setcountp
7463  *	Number of pages newly marked dirty by this call (optional).
7464  *
7465  * Returns KERN_SUCCESS if all the pages were successfully marked.
7466  */
7467 static kern_return_t
vfs_drt_mark_pages(void ** cmapp,off_t offset,u_int length,u_int * setcountp)7468 vfs_drt_mark_pages(void **cmapp, off_t offset, u_int length, u_int *setcountp)
7469 {
7470 	/* XXX size unused, drop from interface */
7471 	return vfs_drt_do_mark_pages(cmapp, offset, length, setcountp, 1);
7472 }
7473 
7474 #if 0
7475 static kern_return_t
7476 vfs_drt_unmark_pages(void **cmapp, off_t offset, u_int length)
7477 {
7478 	return vfs_drt_do_mark_pages(cmapp, offset, length, NULL, 0);
7479 }
7480 #endif
7481 
7482 /*
7483  * Get a cluster of dirty pages.
7484  *
7485  * This is a public interface.
7486  *
7487  * cmapp
7488  *	Pointer to storage managed by drt_mark_pages.  Note that this must
7489  *	be NULL or a value set by drt_mark_pages.
7490  *
7491  * offsetp
7492  *	Returns the byte offset into the file of the first page in the cluster.
7493  *
7494  * lengthp
7495  *	Returns the length in bytes of the cluster of dirty pages.
7496  *
7497  * Returns success if a cluster was found.  If KERN_FAILURE is returned, there
7498  * are no dirty pages meeting the minmum size criteria.  Private storage will
7499  * be released if there are no more dirty pages left in the map
7500  *
7501  */
7502 static kern_return_t
vfs_drt_get_cluster(void ** cmapp,off_t * offsetp,u_int * lengthp)7503 vfs_drt_get_cluster(void **cmapp, off_t *offsetp, u_int *lengthp)
7504 {
7505 	struct vfs_drt_clustermap *cmap;
7506 	u_int64_t       offset;
7507 	u_int           length;
7508 	u_int32_t       j;
7509 	int             index, i, fs, ls;
7510 
7511 	/* sanity */
7512 	if ((cmapp == NULL) || (*cmapp == NULL)) {
7513 		return KERN_FAILURE;
7514 	}
7515 	cmap = *cmapp;
7516 
7517 	/* walk the hashtable */
7518 	for (offset = 0, j = 0; j < cmap->scm_modulus; offset += (DRT_BITVECTOR_PAGES * PAGE_SIZE), j++) {
7519 		index = DRT_HASH(cmap, offset);
7520 
7521 		if (DRT_HASH_VACANT(cmap, index) || (DRT_HASH_GET_COUNT(cmap, index) == 0)) {
7522 			continue;
7523 		}
7524 
7525 		/* scan the bitfield for a string of bits */
7526 		fs = -1;
7527 
7528 		for (i = 0; i < DRT_BITVECTOR_PAGES; i++) {
7529 			if (DRT_HASH_TEST_BIT(cmap, index, i)) {
7530 				fs = i;
7531 				break;
7532 			}
7533 		}
7534 		if (fs == -1) {
7535 			/*  didn't find any bits set */
7536 			panic("vfs_drt: entry summary count > 0 but no bits set in map, cmap = %p, index = %d, count = %lld",
7537 			    cmap, index, DRT_HASH_GET_COUNT(cmap, index));
7538 		}
7539 		for (ls = 0; i < DRT_BITVECTOR_PAGES; i++, ls++) {
7540 			if (!DRT_HASH_TEST_BIT(cmap, index, i)) {
7541 				break;
7542 			}
7543 		}
7544 
7545 		/* compute offset and length, mark pages clean */
7546 		offset = DRT_HASH_GET_ADDRESS(cmap, index) + (PAGE_SIZE * fs);
7547 		length = ls * PAGE_SIZE;
7548 		vfs_drt_do_mark_pages(cmapp, offset, length, NULL, 0);
7549 		cmap->scm_lastclean = index;
7550 
7551 		/* return successful */
7552 		*offsetp = (off_t)offset;
7553 		*lengthp = length;
7554 
7555 		vfs_drt_trace(cmap, DRT_DEBUG_RETCLUSTER, (int)offset, (int)length, 0, 0);
7556 		return KERN_SUCCESS;
7557 	}
7558 	/*
7559 	 * We didn't find anything... hashtable is empty
7560 	 * emit stats into trace buffer and
7561 	 * then free it
7562 	 */
7563 	vfs_drt_trace(cmap, DRT_DEBUG_SCMDATA,
7564 	    cmap->scm_modulus,
7565 	    cmap->scm_buckets,
7566 	    cmap->scm_lastclean,
7567 	    cmap->scm_iskips);
7568 
7569 	vfs_drt_free_map(cmap);
7570 	*cmapp = NULL;
7571 
7572 	return KERN_FAILURE;
7573 }
7574 
7575 
7576 static kern_return_t
vfs_drt_control(void ** cmapp,int op_type)7577 vfs_drt_control(void **cmapp, int op_type)
7578 {
7579 	struct vfs_drt_clustermap *cmap;
7580 
7581 	/* sanity */
7582 	if ((cmapp == NULL) || (*cmapp == NULL)) {
7583 		return KERN_FAILURE;
7584 	}
7585 	cmap = *cmapp;
7586 
7587 	switch (op_type) {
7588 	case 0:
7589 		/* emit stats into trace buffer */
7590 		vfs_drt_trace(cmap, DRT_DEBUG_SCMDATA,
7591 		    cmap->scm_modulus,
7592 		    cmap->scm_buckets,
7593 		    cmap->scm_lastclean,
7594 		    cmap->scm_iskips);
7595 
7596 		vfs_drt_free_map(cmap);
7597 		*cmapp = NULL;
7598 		break;
7599 
7600 	case 1:
7601 		cmap->scm_lastclean = 0;
7602 		break;
7603 	}
7604 	return KERN_SUCCESS;
7605 }
7606 
7607 
7608 
7609 /*
7610  * Emit a summary of the state of the clustermap into the trace buffer
7611  * along with some caller-provided data.
7612  */
7613 #if KDEBUG
7614 static void
vfs_drt_trace(__unused struct vfs_drt_clustermap * cmap,int code,int arg1,int arg2,int arg3,int arg4)7615 vfs_drt_trace(__unused struct vfs_drt_clustermap *cmap, int code, int arg1, int arg2, int arg3, int arg4)
7616 {
7617 	KERNEL_DEBUG(code, arg1, arg2, arg3, arg4, 0);
7618 }
7619 #else
7620 static void
vfs_drt_trace(__unused struct vfs_drt_clustermap * cmap,__unused int code,__unused int arg1,__unused int arg2,__unused int arg3,__unused int arg4)7621 vfs_drt_trace(__unused struct vfs_drt_clustermap *cmap, __unused int code,
7622     __unused int arg1, __unused int arg2, __unused int arg3,
7623     __unused int arg4)
7624 {
7625 }
7626 #endif
7627 
7628 #if 0
7629 /*
7630  * Perform basic sanity check on the hash entry summary count
7631  * vs. the actual bits set in the entry.
7632  */
7633 static void
7634 vfs_drt_sanity(struct vfs_drt_clustermap *cmap)
7635 {
7636 	int index, i;
7637 	int bits_on;
7638 
7639 	for (index = 0; index < cmap->scm_modulus; index++) {
7640 		if (DRT_HASH_VACANT(cmap, index)) {
7641 			continue;
7642 		}
7643 
7644 		for (bits_on = 0, i = 0; i < DRT_BITVECTOR_PAGES; i++) {
7645 			if (DRT_HASH_TEST_BIT(cmap, index, i)) {
7646 				bits_on++;
7647 			}
7648 		}
7649 		if (bits_on != DRT_HASH_GET_COUNT(cmap, index)) {
7650 			panic("bits_on = %d,  index = %d", bits_on, index);
7651 		}
7652 	}
7653 }
7654 #endif
7655 
7656 /*
7657  * Internal interface only.
7658  */
7659 static kern_return_t
vfs_get_scmap_push_behavior_internal(void ** cmapp,int * push_flag)7660 vfs_get_scmap_push_behavior_internal(void **cmapp, int *push_flag)
7661 {
7662 	struct vfs_drt_clustermap *cmap;
7663 
7664 	/* sanity */
7665 	if ((cmapp == NULL) || (*cmapp == NULL) || (push_flag == NULL)) {
7666 		return KERN_FAILURE;
7667 	}
7668 	cmap = *cmapp;
7669 
7670 	if (cmap->scm_modulus == DRT_HASH_XLARGE_MODULUS) {
7671 		/*
7672 		 * If we have a full xlarge sparse cluster,
7673 		 * we push it out all at once so the cluster
7674 		 * map can be available to absorb more I/Os.
7675 		 * This is done on large memory configs so
7676 		 * the small I/Os don't interfere with the
7677 		 * pro workloads.
7678 		 */
7679 		*push_flag = PUSH_ALL;
7680 	}
7681 	return KERN_SUCCESS;
7682 }
7683