xref: /xnu-8792.81.2/bsd/vfs/vfs_cluster.c (revision 19c3b8c28c31cb8130e034cfb5df6bf9ba342d90)
1 /*
2  * Copyright (c) 2000-2020 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29 /*
30  * Copyright (c) 1993
31  *	The Regents of the University of California.  All rights reserved.
32  *
33  * Redistribution and use in source and binary forms, with or without
34  * modification, are permitted provided that the following conditions
35  * are met:
36  * 1. Redistributions of source code must retain the above copyright
37  *    notice, this list of conditions and the following disclaimer.
38  * 2. Redistributions in binary form must reproduce the above copyright
39  *    notice, this list of conditions and the following disclaimer in the
40  *    documentation and/or other materials provided with the distribution.
41  * 3. All advertising materials mentioning features or use of this software
42  *    must display the following acknowledgement:
43  *	This product includes software developed by the University of
44  *	California, Berkeley and its contributors.
45  * 4. Neither the name of the University nor the names of its contributors
46  *    may be used to endorse or promote products derived from this software
47  *    without specific prior written permission.
48  *
49  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59  * SUCH DAMAGE.
60  *
61  *	@(#)vfs_cluster.c	8.10 (Berkeley) 3/28/95
62  */
63 
64 #include <sys/param.h>
65 #include <sys/proc_internal.h>
66 #include <sys/buf_internal.h>
67 #include <sys/mount_internal.h>
68 #include <sys/vnode_internal.h>
69 #include <sys/trace.h>
70 #include <kern/kalloc.h>
71 #include <sys/time.h>
72 #include <sys/kernel.h>
73 #include <sys/resourcevar.h>
74 #include <miscfs/specfs/specdev.h>
75 #include <sys/uio_internal.h>
76 #include <libkern/libkern.h>
77 #include <machine/machine_routines.h>
78 
79 #include <sys/ubc_internal.h>
80 #include <vm/vnode_pager.h>
81 
82 #include <mach/mach_types.h>
83 #include <mach/memory_object_types.h>
84 #include <mach/vm_map.h>
85 #include <mach/upl.h>
86 #include <kern/task.h>
87 #include <kern/policy_internal.h>
88 
89 #include <vm/vm_kern.h>
90 #include <vm/vm_map.h>
91 #include <vm/vm_pageout.h>
92 #include <vm/vm_fault.h>
93 
94 #include <sys/kdebug.h>
95 #include <sys/kdebug_triage.h>
96 #include <libkern/OSAtomic.h>
97 
98 #include <sys/sdt.h>
99 
100 #include <stdbool.h>
101 
102 #include <vfs/vfs_disk_conditioner.h>
103 
104 #if 0
105 #undef KERNEL_DEBUG
106 #define KERNEL_DEBUG KERNEL_DEBUG_CONSTANT
107 #endif
108 
109 
110 #define CL_READ         0x01
111 #define CL_WRITE        0x02
112 #define CL_ASYNC        0x04
113 #define CL_COMMIT       0x08
114 #define CL_PAGEOUT      0x10
115 #define CL_AGE          0x20
116 #define CL_NOZERO       0x40
117 #define CL_PAGEIN       0x80
118 #define CL_DEV_MEMORY   0x100
119 #define CL_PRESERVE     0x200
120 #define CL_THROTTLE     0x400
121 #define CL_KEEPCACHED   0x800
122 #define CL_DIRECT_IO    0x1000
123 #define CL_PASSIVE      0x2000
124 #define CL_IOSTREAMING  0x4000
125 #define CL_CLOSE        0x8000
126 #define CL_ENCRYPTED    0x10000
127 #define CL_RAW_ENCRYPTED        0x20000
128 #define CL_NOCACHE      0x40000
129 
130 #define MAX_VECTOR_UPL_ELEMENTS 8
131 #define MAX_VECTOR_UPL_SIZE     (2 * MAX_UPL_SIZE_BYTES)
132 
133 #define CLUSTER_IO_WAITING              ((buf_t)1)
134 
135 extern upl_t vector_upl_create(vm_offset_t);
136 extern boolean_t vector_upl_is_valid(upl_t);
137 extern boolean_t vector_upl_set_subupl(upl_t, upl_t, u_int32_t);
138 extern void vector_upl_set_pagelist(upl_t);
139 extern void vector_upl_set_iostate(upl_t, upl_t, vm_offset_t, u_int32_t);
140 
141 struct clios {
142 	lck_mtx_t io_mtxp;
143 	u_int  io_completed;       /* amount of io that has currently completed */
144 	u_int  io_issued;          /* amount of io that was successfully issued */
145 	int    io_error;           /* error code of first error encountered */
146 	int    io_wanted;          /* someone is sleeping waiting for a change in state */
147 };
148 
149 struct cl_direct_read_lock {
150 	LIST_ENTRY(cl_direct_read_lock)         chain;
151 	int32_t                                                         ref_count;
152 	vnode_t                                                         vp;
153 	lck_rw_t                                                        rw_lock;
154 };
155 
156 #define CL_DIRECT_READ_LOCK_BUCKETS 61
157 
158 static LIST_HEAD(cl_direct_read_locks, cl_direct_read_lock)
159 cl_direct_read_locks[CL_DIRECT_READ_LOCK_BUCKETS];
160 
161 static LCK_GRP_DECLARE(cl_mtx_grp, "cluster I/O");
162 static LCK_MTX_DECLARE(cl_transaction_mtxp, &cl_mtx_grp);
163 static LCK_SPIN_DECLARE(cl_direct_read_spin_lock, &cl_mtx_grp);
164 
165 static ZONE_DEFINE(cl_rd_zone, "cluster_read",
166     sizeof(struct cl_readahead), ZC_ZFREE_CLEARMEM);
167 
168 static ZONE_DEFINE(cl_wr_zone, "cluster_write",
169     sizeof(struct cl_writebehind), ZC_ZFREE_CLEARMEM);
170 
171 #define IO_UNKNOWN      0
172 #define IO_DIRECT       1
173 #define IO_CONTIG       2
174 #define IO_COPY         3
175 
176 #define PUSH_DELAY      0x01
177 #define PUSH_ALL        0x02
178 #define PUSH_SYNC       0x04
179 
180 
181 static void cluster_EOT(buf_t cbp_head, buf_t cbp_tail, int zero_offset, size_t verify_block_size);
182 static void cluster_wait_IO(buf_t cbp_head, int async);
183 static void cluster_complete_transaction(buf_t *cbp_head, void *callback_arg, int *retval, int flags, int needwait);
184 
185 static int cluster_io_type(struct uio *uio, int *io_type, u_int32_t *io_length, u_int32_t min_length);
186 
187 static int cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int non_rounded_size,
188     int flags, buf_t real_bp, struct clios *iostate, int (*)(buf_t, void *), void *callback_arg);
189 static int cluster_iodone(buf_t bp, void *callback_arg);
190 static int cluster_ioerror(upl_t upl, int upl_offset, int abort_size, int error, int io_flags, vnode_t vp);
191 static int cluster_is_throttled(vnode_t vp);
192 
193 static void cluster_iostate_wait(struct clios *iostate, u_int target, const char *wait_name);
194 
195 static void cluster_syncup(vnode_t vp, off_t newEOF, int (*)(buf_t, void *), void *callback_arg, int flags);
196 
197 static void cluster_read_upl_release(upl_t upl, int start_pg, int last_pg, int take_reference);
198 static int cluster_copy_ubc_data_internal(vnode_t vp, struct uio *uio, int *io_resid, int mark_dirty, int take_reference);
199 
200 static int cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t filesize, int flags,
201     int (*)(buf_t, void *), void *callback_arg) __attribute__((noinline));
202 static int cluster_read_direct(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t *read_length,
203     int flags, int (*)(buf_t, void *), void *callback_arg) __attribute__((noinline));
204 static int cluster_read_contig(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t *read_length,
205     int (*)(buf_t, void *), void *callback_arg, int flags) __attribute__((noinline));
206 
207 static int cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t oldEOF, off_t newEOF,
208     off_t headOff, off_t tailOff, int flags, int (*)(buf_t, void *), void *callback_arg) __attribute__((noinline));
209 static int cluster_write_direct(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF,
210     int *write_type, u_int32_t *write_length, int flags, int (*)(buf_t, void *), void *callback_arg) __attribute__((noinline));
211 static int cluster_write_contig(vnode_t vp, struct uio *uio, off_t newEOF,
212     int *write_type, u_int32_t *write_length, int (*)(buf_t, void *), void *callback_arg, int bflag) __attribute__((noinline));
213 
214 static void cluster_update_state_internal(vnode_t vp, struct cl_extent *cl, int flags, boolean_t defer_writes, boolean_t *first_pass,
215     off_t write_off, int write_cnt, off_t newEOF, int (*callback)(buf_t, void *), void *callback_arg, boolean_t vm_initiated);
216 
217 static int cluster_align_phys_io(vnode_t vp, struct uio *uio, addr64_t usr_paddr, u_int32_t xsize, int flags, int (*)(buf_t, void *), void *callback_arg);
218 
219 static int      cluster_read_prefetch(vnode_t vp, off_t f_offset, u_int size, off_t filesize, int (*callback)(buf_t, void *), void *callback_arg, int bflag);
220 static void     cluster_read_ahead(vnode_t vp, struct cl_extent *extent, off_t filesize, struct cl_readahead *ra,
221     int (*callback)(buf_t, void *), void *callback_arg, int bflag);
222 
223 static int      cluster_push_now(vnode_t vp, struct cl_extent *, off_t EOF, int flags, int (*)(buf_t, void *), void *callback_arg, boolean_t vm_ioitiated);
224 
225 static int      cluster_try_push(struct cl_writebehind *, vnode_t vp, off_t EOF, int push_flag, int flags, int (*)(buf_t, void *),
226     void *callback_arg, int *err, boolean_t vm_initiated);
227 
228 static int      sparse_cluster_switch(struct cl_writebehind *, vnode_t vp, off_t EOF, int (*)(buf_t, void *), void *callback_arg, boolean_t vm_initiated);
229 static int      sparse_cluster_push(struct cl_writebehind *, void **cmapp, vnode_t vp, off_t EOF, int push_flag,
230     int io_flags, int (*)(buf_t, void *), void *callback_arg, boolean_t vm_initiated);
231 static int      sparse_cluster_add(struct cl_writebehind *, void **cmapp, vnode_t vp, struct cl_extent *, off_t EOF,
232     int (*)(buf_t, void *), void *callback_arg, boolean_t vm_initiated);
233 
234 static kern_return_t vfs_drt_mark_pages(void **cmapp, off_t offset, u_int length, u_int *setcountp);
235 static kern_return_t vfs_drt_get_cluster(void **cmapp, off_t *offsetp, u_int *lengthp);
236 static kern_return_t vfs_drt_control(void **cmapp, int op_type);
237 static kern_return_t vfs_get_scmap_push_behavior_internal(void **cmapp, int *push_flag);
238 
239 
240 /*
241  * For throttled IO to check whether
242  * a block is cached by the boot cache
243  * and thus it can avoid delaying the IO.
244  *
245  * bootcache_contains_block is initially
246  * NULL. The BootCache will set it while
247  * the cache is active and clear it when
248  * the cache is jettisoned.
249  *
250  * Returns 0 if the block is not
251  * contained in the cache, 1 if it is
252  * contained.
253  *
254  * The function pointer remains valid
255  * after the cache has been evicted even
256  * if bootcache_contains_block has been
257  * cleared.
258  *
259  * See rdar://9974130 The new throttling mechanism breaks the boot cache for throttled IOs
260  */
261 int (*bootcache_contains_block)(dev_t device, u_int64_t blkno) = NULL;
262 
263 
264 /*
265  * limit the internal I/O size so that we
266  * can represent it in a 32 bit int
267  */
268 #define MAX_IO_REQUEST_SIZE     (1024 * 1024 * 512)
269 #define MAX_IO_CONTIG_SIZE      MAX_UPL_SIZE_BYTES
270 #define MAX_VECTS               16
271 /*
272  * The MIN_DIRECT_WRITE_SIZE governs how much I/O should be issued before we consider
273  * allowing the caller to bypass the buffer cache.  For small I/Os (less than 16k),
274  * we have not historically allowed the write to bypass the UBC.
275  */
276 #define MIN_DIRECT_WRITE_SIZE   (16384)
277 
278 #define WRITE_THROTTLE          6
279 #define WRITE_THROTTLE_SSD      2
280 #define WRITE_BEHIND            1
281 #define WRITE_BEHIND_SSD        1
282 
283 #if !defined(XNU_TARGET_OS_OSX)
284 #define PREFETCH                1
285 #define PREFETCH_SSD            1
286 uint32_t speculative_prefetch_max = (2048 * 1024);              /* maximum bytes in a specluative read-ahead */
287 uint32_t speculative_prefetch_max_iosize = (512 * 1024);        /* maximum I/O size to use in a specluative read-ahead */
288 #else /* XNU_TARGET_OS_OSX */
289 #define PREFETCH                3
290 #define PREFETCH_SSD            2
291 uint32_t speculative_prefetch_max = (MAX_UPL_SIZE_BYTES * 3);   /* maximum bytes in a specluative read-ahead */
292 uint32_t speculative_prefetch_max_iosize = (512 * 1024);        /* maximum I/O size to use in a specluative read-ahead on SSDs*/
293 #endif /* ! XNU_TARGET_OS_OSX */
294 
295 /* maximum bytes for read-ahead */
296 uint32_t prefetch_max = (1024 * 1024 * 1024);
297 /* maximum bytes for outstanding reads */
298 uint32_t overlapping_read_max = (1024 * 1024 * 1024);
299 /* maximum bytes for outstanding writes */
300 uint32_t overlapping_write_max = (1024 * 1024 * 1024);
301 
302 #define IO_SCALE(vp, base)              (vp->v_mount->mnt_ioscale * (base))
303 #define MAX_CLUSTER_SIZE(vp)            (cluster_max_io_size(vp->v_mount, CL_WRITE))
304 
305 int     speculative_reads_disabled = 0;
306 
307 /*
308  * throttle the number of async writes that
309  * can be outstanding on a single vnode
310  * before we issue a synchronous write
311  */
312 #define THROTTLE_MAXCNT 0
313 
314 uint32_t throttle_max_iosize = (128 * 1024);
315 
316 #define THROTTLE_MAX_IOSIZE (throttle_max_iosize)
317 
318 SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_max_iosize, CTLFLAG_RW | CTLFLAG_LOCKED, &throttle_max_iosize, 0, "");
319 
320 
321 void
cluster_init(void)322 cluster_init(void)
323 {
324 	for (int i = 0; i < CL_DIRECT_READ_LOCK_BUCKETS; ++i) {
325 		LIST_INIT(&cl_direct_read_locks[i]);
326 	}
327 }
328 
329 
330 uint32_t
cluster_max_io_size(mount_t mp,int type)331 cluster_max_io_size(mount_t mp, int type)
332 {
333 	uint32_t        max_io_size;
334 	uint32_t        segcnt;
335 	uint32_t        maxcnt;
336 
337 	switch (type) {
338 	case CL_READ:
339 		segcnt = mp->mnt_segreadcnt;
340 		maxcnt = mp->mnt_maxreadcnt;
341 		break;
342 	case CL_WRITE:
343 		segcnt = mp->mnt_segwritecnt;
344 		maxcnt = mp->mnt_maxwritecnt;
345 		break;
346 	default:
347 		segcnt = min(mp->mnt_segreadcnt, mp->mnt_segwritecnt);
348 		maxcnt = min(mp->mnt_maxreadcnt, mp->mnt_maxwritecnt);
349 		break;
350 	}
351 	if (segcnt > (MAX_UPL_SIZE_BYTES >> PAGE_SHIFT)) {
352 		/*
353 		 * don't allow a size beyond the max UPL size we can create
354 		 */
355 		segcnt = MAX_UPL_SIZE_BYTES >> PAGE_SHIFT;
356 	}
357 	max_io_size = min((segcnt * PAGE_SIZE), maxcnt);
358 
359 	if (max_io_size < MAX_UPL_TRANSFER_BYTES) {
360 		/*
361 		 * don't allow a size smaller than the old fixed limit
362 		 */
363 		max_io_size = MAX_UPL_TRANSFER_BYTES;
364 	} else {
365 		/*
366 		 * make sure the size specified is a multiple of PAGE_SIZE
367 		 */
368 		max_io_size &= ~PAGE_MASK;
369 	}
370 	return max_io_size;
371 }
372 
373 /*
374  * Returns max prefetch value. If the value overflows or exceeds the specified
375  * 'prefetch_limit', it will be capped at 'prefetch_limit' value.
376  */
377 static inline uint32_t
cluster_max_prefetch(vnode_t vp,uint32_t max_io_size,uint32_t prefetch_limit)378 cluster_max_prefetch(vnode_t vp, uint32_t max_io_size, uint32_t prefetch_limit)
379 {
380 	bool is_ssd = disk_conditioner_mount_is_ssd(vp->v_mount);
381 	uint32_t io_scale = IO_SCALE(vp, is_ssd ? PREFETCH_SSD : PREFETCH);
382 	uint32_t prefetch = 0;
383 
384 	if (__improbable(os_mul_overflow(max_io_size, io_scale, &prefetch) ||
385 	    (prefetch > prefetch_limit))) {
386 		prefetch = prefetch_limit;
387 	}
388 
389 	return prefetch;
390 }
391 
392 #define CLW_ALLOCATE            0x01
393 #define CLW_RETURNLOCKED        0x02
394 #define CLW_IONOCACHE           0x04
395 #define CLW_IOPASSIVE   0x08
396 
397 /*
398  * if the read ahead context doesn't yet exist,
399  * allocate and initialize it...
400  * the vnode lock serializes multiple callers
401  * during the actual assignment... first one
402  * to grab the lock wins... the other callers
403  * will release the now unnecessary storage
404  *
405  * once the context is present, try to grab (but don't block on)
406  * the lock associated with it... if someone
407  * else currently owns it, than the read
408  * will run without read-ahead.  this allows
409  * multiple readers to run in parallel and
410  * since there's only 1 read ahead context,
411  * there's no real loss in only allowing 1
412  * reader to have read-ahead enabled.
413  */
414 static struct cl_readahead *
cluster_get_rap(vnode_t vp)415 cluster_get_rap(vnode_t vp)
416 {
417 	struct ubc_info         *ubc;
418 	struct cl_readahead     *rap;
419 
420 	ubc = vp->v_ubcinfo;
421 
422 	if ((rap = ubc->cl_rahead) == NULL) {
423 		rap = zalloc_flags(cl_rd_zone, Z_WAITOK | Z_ZERO);
424 		rap->cl_lastr = -1;
425 		lck_mtx_init(&rap->cl_lockr, &cl_mtx_grp, LCK_ATTR_NULL);
426 
427 		vnode_lock(vp);
428 
429 		if (ubc->cl_rahead == NULL) {
430 			ubc->cl_rahead = rap;
431 		} else {
432 			lck_mtx_destroy(&rap->cl_lockr, &cl_mtx_grp);
433 			zfree(cl_rd_zone, rap);
434 			rap = ubc->cl_rahead;
435 		}
436 		vnode_unlock(vp);
437 	}
438 	if (lck_mtx_try_lock(&rap->cl_lockr) == TRUE) {
439 		return rap;
440 	}
441 
442 	return (struct cl_readahead *)NULL;
443 }
444 
445 
446 /*
447  * if the write behind context doesn't yet exist,
448  * and CLW_ALLOCATE is specified, allocate and initialize it...
449  * the vnode lock serializes multiple callers
450  * during the actual assignment... first one
451  * to grab the lock wins... the other callers
452  * will release the now unnecessary storage
453  *
454  * if CLW_RETURNLOCKED is set, grab (blocking if necessary)
455  * the lock associated with the write behind context before
456  * returning
457  */
458 
459 static struct cl_writebehind *
cluster_get_wbp(vnode_t vp,int flags)460 cluster_get_wbp(vnode_t vp, int flags)
461 {
462 	struct ubc_info *ubc;
463 	struct cl_writebehind *wbp;
464 
465 	ubc = vp->v_ubcinfo;
466 
467 	if ((wbp = ubc->cl_wbehind) == NULL) {
468 		if (!(flags & CLW_ALLOCATE)) {
469 			return (struct cl_writebehind *)NULL;
470 		}
471 
472 		wbp = zalloc_flags(cl_wr_zone, Z_WAITOK | Z_ZERO);
473 
474 		lck_mtx_init(&wbp->cl_lockw, &cl_mtx_grp, LCK_ATTR_NULL);
475 
476 		vnode_lock(vp);
477 
478 		if (ubc->cl_wbehind == NULL) {
479 			ubc->cl_wbehind = wbp;
480 		} else {
481 			lck_mtx_destroy(&wbp->cl_lockw, &cl_mtx_grp);
482 			zfree(cl_wr_zone, wbp);
483 			wbp = ubc->cl_wbehind;
484 		}
485 		vnode_unlock(vp);
486 	}
487 	if (flags & CLW_RETURNLOCKED) {
488 		lck_mtx_lock(&wbp->cl_lockw);
489 	}
490 
491 	return wbp;
492 }
493 
494 
495 static void
cluster_syncup(vnode_t vp,off_t newEOF,int (* callback)(buf_t,void *),void * callback_arg,int flags)496 cluster_syncup(vnode_t vp, off_t newEOF, int (*callback)(buf_t, void *), void *callback_arg, int flags)
497 {
498 	struct cl_writebehind *wbp;
499 
500 	if ((wbp = cluster_get_wbp(vp, 0)) != NULL) {
501 		if (wbp->cl_number) {
502 			lck_mtx_lock(&wbp->cl_lockw);
503 
504 			cluster_try_push(wbp, vp, newEOF, PUSH_ALL | flags, 0, callback, callback_arg, NULL, FALSE);
505 
506 			lck_mtx_unlock(&wbp->cl_lockw);
507 		}
508 	}
509 }
510 
511 
512 static int
cluster_io_present_in_BC(vnode_t vp,off_t f_offset)513 cluster_io_present_in_BC(vnode_t vp, off_t f_offset)
514 {
515 	daddr64_t blkno;
516 	size_t    io_size;
517 	int (*bootcache_check_fn)(dev_t device, u_int64_t blkno) = bootcache_contains_block;
518 
519 	if (bootcache_check_fn && vp->v_mount && vp->v_mount->mnt_devvp) {
520 		if (VNOP_BLOCKMAP(vp, f_offset, PAGE_SIZE, &blkno, &io_size, NULL, VNODE_READ | VNODE_BLOCKMAP_NO_TRACK, NULL)) {
521 			return 0;
522 		}
523 
524 		if (io_size == 0) {
525 			return 0;
526 		}
527 
528 		if (bootcache_check_fn(vp->v_mount->mnt_devvp->v_rdev, blkno)) {
529 			return 1;
530 		}
531 	}
532 	return 0;
533 }
534 
535 
536 static int
cluster_is_throttled(vnode_t vp)537 cluster_is_throttled(vnode_t vp)
538 {
539 	return throttle_io_will_be_throttled(-1, vp->v_mount);
540 }
541 
542 
543 static void
cluster_iostate_wait(struct clios * iostate,u_int target,const char * wait_name)544 cluster_iostate_wait(struct clios *iostate, u_int target, const char *wait_name)
545 {
546 	lck_mtx_lock(&iostate->io_mtxp);
547 
548 	while ((iostate->io_issued - iostate->io_completed) > target) {
549 		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_START,
550 		    iostate->io_issued, iostate->io_completed, target, 0, 0);
551 
552 		iostate->io_wanted = 1;
553 		msleep((caddr_t)&iostate->io_wanted, &iostate->io_mtxp, PRIBIO + 1, wait_name, NULL);
554 
555 		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_END,
556 		    iostate->io_issued, iostate->io_completed, target, 0, 0);
557 	}
558 	lck_mtx_unlock(&iostate->io_mtxp);
559 }
560 
561 static void
cluster_handle_associated_upl(struct clios * iostate,upl_t upl,upl_offset_t upl_offset,upl_size_t size)562 cluster_handle_associated_upl(struct clios *iostate, upl_t upl,
563     upl_offset_t upl_offset, upl_size_t size)
564 {
565 	if (!size) {
566 		return;
567 	}
568 
569 	upl_t associated_upl = upl_associated_upl(upl);
570 
571 	if (!associated_upl) {
572 		return;
573 	}
574 
575 #if 0
576 	printf("1: %d %d\n", upl_offset, upl_offset + size);
577 #endif
578 
579 	/*
580 	 * The associated UPL is page aligned to file offsets whereas the
581 	 * UPL it's attached to has different alignment requirements.  The
582 	 * upl_offset that we have refers to @upl.  The code that follows
583 	 * has to deal with the first and last pages in this transaction
584 	 * which might straddle pages in the associated UPL.  To keep
585 	 * track of these pages, we use the mark bits: if the mark bit is
586 	 * set, we know another transaction has completed its part of that
587 	 * page and so we can unlock that page here.
588 	 *
589 	 * The following illustrates what we have to deal with:
590 	 *
591 	 *    MEM u <------------ 1 PAGE ------------> e
592 	 *        +-------------+----------------------+-----------------
593 	 *        |             |######################|#################
594 	 *        +-------------+----------------------+-----------------
595 	 *   FILE | <--- a ---> o <------------ 1 PAGE ------------>
596 	 *
597 	 * So here we show a write to offset @o.  The data that is to be
598 	 * written is in a buffer that is not page aligned; it has offset
599 	 * @a in the page.  The upl that carries the data starts in memory
600 	 * at @u.  The associated upl starts in the file at offset @o.  A
601 	 * transaction will always end on a page boundary (like @e above)
602 	 * except for the very last transaction in the group.  We cannot
603 	 * unlock the page at @o in the associated upl until both the
604 	 * transaction ending at @e and the following transaction (that
605 	 * starts at @e) has completed.
606 	 */
607 
608 	/*
609 	 * We record whether or not the two UPLs are aligned as the mark
610 	 * bit in the first page of @upl.
611 	 */
612 	upl_page_info_t *pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
613 	bool is_unaligned = upl_page_get_mark(pl, 0);
614 
615 	if (is_unaligned) {
616 		upl_page_info_t *assoc_pl = UPL_GET_INTERNAL_PAGE_LIST(associated_upl);
617 
618 		upl_offset_t upl_end = upl_offset + size;
619 		assert(upl_end >= PAGE_SIZE);
620 
621 		upl_size_t assoc_upl_size = upl_get_size(associated_upl);
622 
623 		/*
624 		 * In the very first transaction in the group, upl_offset will
625 		 * not be page aligned, but after that it will be and in that
626 		 * case we want the preceding page in the associated UPL hence
627 		 * the minus one.
628 		 */
629 		assert(upl_offset);
630 		if (upl_offset) {
631 			upl_offset = trunc_page_32(upl_offset - 1);
632 		}
633 
634 		lck_mtx_lock_spin(&iostate->io_mtxp);
635 
636 		// Look at the first page...
637 		if (upl_offset
638 		    && !upl_page_get_mark(assoc_pl, upl_offset >> PAGE_SHIFT)) {
639 			/*
640 			 * The first page isn't marked so let another transaction
641 			 * completion handle it.
642 			 */
643 			upl_page_set_mark(assoc_pl, upl_offset >> PAGE_SHIFT, true);
644 			upl_offset += PAGE_SIZE;
645 		}
646 
647 		// And now the last page...
648 
649 		/*
650 		 * This needs to be > rather than >= because if it's equal, it
651 		 * means there's another transaction that is sharing the last
652 		 * page.
653 		 */
654 		if (upl_end > assoc_upl_size) {
655 			upl_end = assoc_upl_size;
656 		} else {
657 			upl_end = trunc_page_32(upl_end);
658 			const int last_pg = (upl_end >> PAGE_SHIFT) - 1;
659 
660 			if (!upl_page_get_mark(assoc_pl, last_pg)) {
661 				/*
662 				 * The last page isn't marked so mark the page and let another
663 				 * transaction completion handle it.
664 				 */
665 				upl_page_set_mark(assoc_pl, last_pg, true);
666 				upl_end -= PAGE_SIZE;
667 			}
668 		}
669 
670 		lck_mtx_unlock(&iostate->io_mtxp);
671 
672 #if 0
673 		printf("2: %d %d\n", upl_offset, upl_end);
674 #endif
675 
676 		if (upl_end <= upl_offset) {
677 			return;
678 		}
679 
680 		size = upl_end - upl_offset;
681 	} else {
682 		assert(!(upl_offset & PAGE_MASK));
683 		assert(!(size & PAGE_MASK));
684 	}
685 
686 	boolean_t empty;
687 
688 	/*
689 	 * We can unlock these pages now and as this is for a
690 	 * direct/uncached write, we want to dump the pages too.
691 	 */
692 	kern_return_t kr = upl_abort_range(associated_upl, upl_offset, size,
693 	    UPL_ABORT_DUMP_PAGES, &empty);
694 
695 	assert(!kr);
696 
697 	if (!kr && empty) {
698 		upl_set_associated_upl(upl, NULL);
699 		upl_deallocate(associated_upl);
700 	}
701 }
702 
703 static int
cluster_ioerror(upl_t upl,int upl_offset,int abort_size,int error,int io_flags,vnode_t vp)704 cluster_ioerror(upl_t upl, int upl_offset, int abort_size, int error, int io_flags, vnode_t vp)
705 {
706 	int upl_abort_code = 0;
707 	int page_in  = 0;
708 	int page_out = 0;
709 
710 	if ((io_flags & (B_PHYS | B_CACHE)) == (B_PHYS | B_CACHE)) {
711 		/*
712 		 * direct write of any flavor, or a direct read that wasn't aligned
713 		 */
714 		ubc_upl_commit_range(upl, upl_offset, abort_size, UPL_COMMIT_FREE_ON_EMPTY);
715 	} else {
716 		if (io_flags & B_PAGEIO) {
717 			if (io_flags & B_READ) {
718 				page_in  = 1;
719 			} else {
720 				page_out = 1;
721 			}
722 		}
723 		if (io_flags & B_CACHE) {
724 			/*
725 			 * leave pages in the cache unchanged on error
726 			 */
727 			upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
728 		} else if (((io_flags & B_READ) == 0) && ((error != ENXIO) || vnode_isswap(vp))) {
729 			/*
730 			 * transient error on pageout/write path... leave pages unchanged
731 			 */
732 			upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
733 		} else if (page_in) {
734 			upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
735 		} else {
736 			upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;
737 		}
738 
739 		ubc_upl_abort_range(upl, upl_offset, abort_size, upl_abort_code);
740 	}
741 	return upl_abort_code;
742 }
743 
744 
745 static int
cluster_iodone(buf_t bp,void * callback_arg)746 cluster_iodone(buf_t bp, void *callback_arg)
747 {
748 	int     b_flags;
749 	int     error;
750 	int     total_size;
751 	int     total_resid;
752 	int     upl_offset;
753 	int     zero_offset;
754 	int     pg_offset = 0;
755 	int     commit_size = 0;
756 	int     upl_flags = 0;
757 	int     transaction_size = 0;
758 	upl_t   upl;
759 	buf_t   cbp;
760 	buf_t   cbp_head;
761 	buf_t   cbp_next;
762 	buf_t   real_bp;
763 	vnode_t vp;
764 	struct  clios *iostate;
765 	void    *verify_ctx;
766 	boolean_t       transaction_complete = FALSE;
767 
768 	__IGNORE_WCASTALIGN(cbp_head = (buf_t)(bp->b_trans_head));
769 
770 	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_START,
771 	    cbp_head, bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);
772 
773 	if (cbp_head->b_trans_next || !(cbp_head->b_flags & B_EOT)) {
774 		lck_mtx_lock_spin(&cl_transaction_mtxp);
775 
776 		bp->b_flags |= B_TDONE;
777 
778 		for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) {
779 			/*
780 			 * all I/O requests that are part of this transaction
781 			 * have to complete before we can process it
782 			 */
783 			if (!(cbp->b_flags & B_TDONE)) {
784 				KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
785 				    cbp_head, cbp, cbp->b_bcount, cbp->b_flags, 0);
786 
787 				lck_mtx_unlock(&cl_transaction_mtxp);
788 
789 				return 0;
790 			}
791 
792 			if (cbp->b_trans_next == CLUSTER_IO_WAITING) {
793 				KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
794 				    cbp_head, cbp, cbp->b_bcount, cbp->b_flags, 0);
795 
796 				lck_mtx_unlock(&cl_transaction_mtxp);
797 				wakeup(cbp);
798 
799 				return 0;
800 			}
801 
802 			if (cbp->b_flags & B_EOT) {
803 				transaction_complete = TRUE;
804 			}
805 		}
806 		lck_mtx_unlock(&cl_transaction_mtxp);
807 
808 		if (transaction_complete == FALSE) {
809 			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
810 			    cbp_head, 0, 0, 0, 0);
811 			return 0;
812 		}
813 	}
814 	error       = 0;
815 	total_size  = 0;
816 	total_resid = 0;
817 
818 	cbp        = cbp_head;
819 	vp         = cbp->b_vp;
820 	upl_offset = cbp->b_uploffset;
821 	upl        = cbp->b_upl;
822 	b_flags    = cbp->b_flags;
823 	real_bp    = cbp->b_real_bp;
824 	zero_offset = cbp->b_validend;
825 	iostate    = (struct clios *)cbp->b_iostate;
826 
827 	if (real_bp) {
828 		real_bp->b_dev = cbp->b_dev;
829 	}
830 
831 	while (cbp) {
832 		if ((cbp->b_flags & B_ERROR) && error == 0) {
833 			error = cbp->b_error;
834 		}
835 
836 		total_resid += cbp->b_resid;
837 		total_size  += cbp->b_bcount;
838 
839 		cbp_next = cbp->b_trans_next;
840 
841 		if (cbp_next == NULL) {
842 			/*
843 			 * compute the overall size of the transaction
844 			 * in case we created one that has 'holes' in it
845 			 * 'total_size' represents the amount of I/O we
846 			 * did, not the span of the transaction w/r to the UPL
847 			 */
848 			transaction_size = cbp->b_uploffset + cbp->b_bcount - upl_offset;
849 		}
850 
851 		if (cbp != cbp_head) {
852 			free_io_buf(cbp);
853 		}
854 
855 		cbp = cbp_next;
856 	}
857 
858 	if (ISSET(b_flags, B_COMMIT_UPL)) {
859 		cluster_handle_associated_upl(iostate,
860 		    cbp_head->b_upl,
861 		    upl_offset,
862 		    transaction_size);
863 	}
864 
865 	if (error == 0 && total_resid) {
866 		error = EIO;
867 	}
868 
869 	if (error == 0) {
870 		int     (*cliodone_func)(buf_t, void *) = (int (*)(buf_t, void *))(cbp_head->b_cliodone);
871 
872 		if (cliodone_func != NULL) {
873 			cbp_head->b_bcount = transaction_size;
874 
875 			error = (*cliodone_func)(cbp_head, callback_arg);
876 		}
877 	}
878 	if (zero_offset) {
879 		cluster_zero(upl, zero_offset, PAGE_SIZE - (zero_offset & PAGE_MASK), real_bp);
880 	}
881 
882 	verify_ctx = cbp_head->b_attr.ba_verify_ctx;
883 	cbp_head->b_attr.ba_verify_ctx = NULL;
884 	if (verify_ctx) {
885 		vnode_verify_flags_t verify_flags = VNODE_VERIFY_CONTEXT_FREE;
886 		caddr_t verify_buf = NULL;
887 		off_t start_off = cbp_head->b_lblkno * cbp_head->b_lblksize;
888 		size_t verify_length = transaction_size;
889 		vm_offset_t vaddr;
890 
891 		if (!error) {
892 			verify_flags |= VNODE_VERIFY_WITH_CONTEXT;
893 			error = ubc_upl_map_range(upl, upl_offset, round_page(transaction_size), VM_PROT_DEFAULT, &vaddr);    /* Map it in */
894 			if (error) {
895 				panic("ubc_upl_map_range returned error %d, upl = %p, upl_offset = %d, size = %d",
896 				    error, upl, (int)upl_offset, (int)round_page(transaction_size));
897 			} else {
898 				verify_buf = (caddr_t)vaddr;
899 			}
900 		}
901 
902 		error = VNOP_VERIFY(vp, start_off, (uint8_t *)verify_buf, verify_length, 0, &verify_ctx, verify_flags, NULL);
903 
904 		if (verify_buf) {
905 			(void)ubc_upl_unmap_range(upl, upl_offset, round_page(transaction_size));
906 			verify_buf = NULL;
907 		}
908 	}
909 
910 	free_io_buf(cbp_head);
911 
912 	if (iostate) {
913 		int need_wakeup = 0;
914 
915 		/*
916 		 * someone has issued multiple I/Os asynchrounsly
917 		 * and is waiting for them to complete (streaming)
918 		 */
919 		lck_mtx_lock_spin(&iostate->io_mtxp);
920 
921 		if (error && iostate->io_error == 0) {
922 			iostate->io_error = error;
923 		}
924 
925 		iostate->io_completed += total_size;
926 
927 		if (iostate->io_wanted) {
928 			/*
929 			 * someone is waiting for the state of
930 			 * this io stream to change
931 			 */
932 			iostate->io_wanted = 0;
933 			need_wakeup = 1;
934 		}
935 		lck_mtx_unlock(&iostate->io_mtxp);
936 
937 		if (need_wakeup) {
938 			wakeup((caddr_t)&iostate->io_wanted);
939 		}
940 	}
941 
942 	if (b_flags & B_COMMIT_UPL) {
943 		pg_offset   = upl_offset & PAGE_MASK;
944 		commit_size = (pg_offset + transaction_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
945 
946 		if (error) {
947 			upl_set_iodone_error(upl, error);
948 
949 			upl_flags = cluster_ioerror(upl, upl_offset - pg_offset, commit_size, error, b_flags, vp);
950 		} else {
951 			upl_flags = UPL_COMMIT_FREE_ON_EMPTY;
952 
953 			if ((b_flags & B_PHYS) && (b_flags & B_READ)) {
954 				upl_flags |= UPL_COMMIT_SET_DIRTY;
955 			}
956 
957 			if (b_flags & B_AGE) {
958 				upl_flags |= UPL_COMMIT_INACTIVATE;
959 			}
960 
961 			ubc_upl_commit_range(upl, upl_offset - pg_offset, commit_size, upl_flags);
962 		}
963 	}
964 	if (real_bp) {
965 		if (error) {
966 			real_bp->b_flags |= B_ERROR;
967 			real_bp->b_error = error;
968 		}
969 		real_bp->b_resid = total_resid;
970 
971 		buf_biodone(real_bp);
972 	}
973 	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
974 	    upl, upl_offset - pg_offset, commit_size, (error << 24) | upl_flags, 0);
975 
976 	return error;
977 }
978 
979 
980 uint32_t
cluster_throttle_io_limit(vnode_t vp,uint32_t * limit)981 cluster_throttle_io_limit(vnode_t vp, uint32_t *limit)
982 {
983 	if (cluster_is_throttled(vp)) {
984 		*limit = THROTTLE_MAX_IOSIZE;
985 		return 1;
986 	}
987 	return 0;
988 }
989 
990 
991 void
cluster_zero(upl_t upl,upl_offset_t upl_offset,int size,buf_t bp)992 cluster_zero(upl_t upl, upl_offset_t upl_offset, int size, buf_t bp)
993 {
994 	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_START,
995 	    upl_offset, size, bp, 0, 0);
996 
997 	if (bp == NULL || bp->b_datap == 0) {
998 		upl_page_info_t *pl;
999 		addr64_t        zero_addr;
1000 
1001 		pl = ubc_upl_pageinfo(upl);
1002 
1003 		if (upl_device_page(pl) == TRUE) {
1004 			zero_addr = ((addr64_t)upl_phys_page(pl, 0) << PAGE_SHIFT) + upl_offset;
1005 
1006 			bzero_phys_nc(zero_addr, size);
1007 		} else {
1008 			while (size) {
1009 				int     page_offset;
1010 				int     page_index;
1011 				int     zero_cnt;
1012 
1013 				page_index  = upl_offset / PAGE_SIZE;
1014 				page_offset = upl_offset & PAGE_MASK;
1015 
1016 				zero_addr = ((addr64_t)upl_phys_page(pl, page_index) << PAGE_SHIFT) + page_offset;
1017 				zero_cnt  = min(PAGE_SIZE - page_offset, size);
1018 
1019 				bzero_phys(zero_addr, zero_cnt);
1020 
1021 				size       -= zero_cnt;
1022 				upl_offset += zero_cnt;
1023 			}
1024 		}
1025 	} else {
1026 		bzero((caddr_t)((vm_offset_t)bp->b_datap + upl_offset), size);
1027 	}
1028 
1029 	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_END,
1030 	    upl_offset, size, 0, 0, 0);
1031 }
1032 
1033 
1034 static void
cluster_EOT(buf_t cbp_head,buf_t cbp_tail,int zero_offset,size_t verify_block_size)1035 cluster_EOT(buf_t cbp_head, buf_t cbp_tail, int zero_offset, size_t verify_block_size)
1036 {
1037 	/*
1038 	 * We will assign a verification context to cbp_head.
1039 	 * This will be passed back to the filesystem  when
1040 	 * verifying (in cluster_iodone).
1041 	 */
1042 	if (verify_block_size) {
1043 		off_t start_off = cbp_head->b_lblkno * cbp_head->b_lblksize;
1044 		size_t length;
1045 		void *verify_ctx = NULL;
1046 		int error = 0;
1047 		vnode_t vp = buf_vnode(cbp_head);
1048 
1049 		if (cbp_head == cbp_tail) {
1050 			length = cbp_head->b_bcount;
1051 		} else {
1052 			length = ((cbp_tail->b_lblkno * cbp_tail->b_lblksize) + cbp_tail->b_bcount) - start_off;
1053 		}
1054 
1055 		/*
1056 		 * zero_offset is non zero for the transaction containing the EOF
1057 		 * (if the filesize is not page aligned). In that case we might
1058 		 * have the transaction size not be page/verify block size aligned
1059 		 */
1060 		if ((zero_offset == 0) &&
1061 		    ((length < verify_block_size) || (length % verify_block_size)) != 0) {
1062 			panic("%s length = %zu, verify_block_size = %zu",
1063 			    __FUNCTION__, length, verify_block_size);
1064 		}
1065 
1066 		error = VNOP_VERIFY(vp, start_off, NULL, length,
1067 		    &verify_block_size, &verify_ctx, VNODE_VERIFY_CONTEXT_ALLOC, NULL);
1068 
1069 		if (!verify_ctx) {
1070 			if (!error && verify_block_size) {
1071 				/*
1072 				 * fetch the verify block size again, it is
1073 				 * possible that the verification was turned off
1074 				 * in the filesystem between the time it was
1075 				 * checked last and now.
1076 				 */
1077 				error = VNOP_VERIFY(vp, start_off, NULL, 0, &verify_block_size, NULL, VNODE_VERIFY_DEFAULT, NULL);
1078 			}
1079 
1080 			if (error || verify_block_size) {
1081 				panic("No verify context for vp = %p, start_off = %lld, length = %zu, error = %d",
1082 				    buf_vnode(cbp_head), start_off, length, error);
1083 			}
1084 		}
1085 
1086 		cbp_head->b_attr.ba_verify_ctx = verify_ctx;
1087 	} else {
1088 		cbp_head->b_attr.ba_verify_ctx = NULL;
1089 	}
1090 
1091 	cbp_head->b_validend = zero_offset;
1092 	cbp_tail->b_flags |= B_EOT;
1093 }
1094 
1095 static void
cluster_wait_IO(buf_t cbp_head,int async)1096 cluster_wait_IO(buf_t cbp_head, int async)
1097 {
1098 	buf_t   cbp;
1099 
1100 	if (async) {
1101 		/*
1102 		 * Async callback completion will not normally generate a
1103 		 * wakeup upon I/O completion.  To get woken up, we set
1104 		 * b_trans_next (which is safe for us to modify) on the last
1105 		 * buffer to CLUSTER_IO_WAITING so that cluster_iodone knows
1106 		 * to wake us up when all buffers as part of this transaction
1107 		 * are completed.  This is done under the umbrella of
1108 		 * cl_transaction_mtxp which is also taken in cluster_iodone.
1109 		 */
1110 		bool done = true;
1111 		buf_t last = NULL;
1112 
1113 		lck_mtx_lock_spin(&cl_transaction_mtxp);
1114 
1115 		for (cbp = cbp_head; cbp; last = cbp, cbp = cbp->b_trans_next) {
1116 			if (!ISSET(cbp->b_flags, B_TDONE)) {
1117 				done = false;
1118 			}
1119 		}
1120 
1121 		if (!done) {
1122 			last->b_trans_next = CLUSTER_IO_WAITING;
1123 
1124 			DTRACE_IO1(wait__start, buf_t, last);
1125 			do {
1126 				msleep(last, &cl_transaction_mtxp, PSPIN | (PRIBIO + 1), "cluster_wait_IO", NULL);
1127 
1128 				/*
1129 				 * We should only have been woken up if all the
1130 				 * buffers are completed, but just in case...
1131 				 */
1132 				done = true;
1133 				for (cbp = cbp_head; cbp != CLUSTER_IO_WAITING; cbp = cbp->b_trans_next) {
1134 					if (!ISSET(cbp->b_flags, B_TDONE)) {
1135 						done = false;
1136 						break;
1137 					}
1138 				}
1139 			} while (!done);
1140 			DTRACE_IO1(wait__done, buf_t, last);
1141 
1142 			last->b_trans_next = NULL;
1143 		}
1144 
1145 		lck_mtx_unlock(&cl_transaction_mtxp);
1146 	} else { // !async
1147 		for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) {
1148 			buf_biowait(cbp);
1149 		}
1150 	}
1151 }
1152 
1153 static void
cluster_complete_transaction(buf_t * cbp_head,void * callback_arg,int * retval,int flags,int needwait)1154 cluster_complete_transaction(buf_t *cbp_head, void *callback_arg, int *retval, int flags, int needwait)
1155 {
1156 	buf_t   cbp;
1157 	int     error;
1158 	boolean_t isswapout = FALSE;
1159 
1160 	/*
1161 	 * cluster_complete_transaction will
1162 	 * only be called if we've issued a complete chain in synchronous mode
1163 	 * or, we've already done a cluster_wait_IO on an incomplete chain
1164 	 */
1165 	if (needwait) {
1166 		for (cbp = *cbp_head; cbp; cbp = cbp->b_trans_next) {
1167 			buf_biowait(cbp);
1168 		}
1169 	}
1170 	/*
1171 	 * we've already waited on all of the I/Os in this transaction,
1172 	 * so mark all of the buf_t's in this transaction as B_TDONE
1173 	 * so that cluster_iodone sees the transaction as completed
1174 	 */
1175 	for (cbp = *cbp_head; cbp; cbp = cbp->b_trans_next) {
1176 		cbp->b_flags |= B_TDONE;
1177 	}
1178 	cbp = *cbp_head;
1179 
1180 	if ((flags & (CL_ASYNC | CL_PAGEOUT)) == CL_PAGEOUT && vnode_isswap(cbp->b_vp)) {
1181 		isswapout = TRUE;
1182 	}
1183 
1184 	error = cluster_iodone(cbp, callback_arg);
1185 
1186 	if (!(flags & CL_ASYNC) && error && *retval == 0) {
1187 		if (((flags & (CL_PAGEOUT | CL_KEEPCACHED)) != CL_PAGEOUT) || (error != ENXIO)) {
1188 			*retval = error;
1189 		} else if (isswapout == TRUE) {
1190 			*retval = error;
1191 		}
1192 	}
1193 	*cbp_head = (buf_t)NULL;
1194 }
1195 
1196 
1197 static int
cluster_io(vnode_t vp,upl_t upl,vm_offset_t upl_offset,off_t f_offset,int non_rounded_size,int flags,buf_t real_bp,struct clios * iostate,int (* callback)(buf_t,void *),void * callback_arg)1198 cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int non_rounded_size,
1199     int flags, buf_t real_bp, struct clios *iostate, int (*callback)(buf_t, void *), void *callback_arg)
1200 {
1201 	buf_t   cbp;
1202 	u_int   size;
1203 	u_int   io_size;
1204 	int     io_flags;
1205 	int     bmap_flags;
1206 	int     error = 0;
1207 	int     retval = 0;
1208 	buf_t   cbp_head = NULL;
1209 	buf_t   cbp_tail = NULL;
1210 	int     trans_count = 0;
1211 	int     max_trans_count;
1212 	u_int   pg_count;
1213 	int     pg_offset;
1214 	u_int   max_iosize;
1215 	u_int   max_vectors;
1216 	int     priv;
1217 	int     zero_offset = 0;
1218 	int     async_throttle = 0;
1219 	mount_t mp;
1220 	vm_offset_t upl_end_offset;
1221 	boolean_t   need_EOT = FALSE;
1222 	size_t verify_block_size = 0;
1223 
1224 	/*
1225 	 * we currently don't support buffers larger than a page
1226 	 */
1227 	if (real_bp && non_rounded_size > PAGE_SIZE) {
1228 		panic("%s(): Called with real buffer of size %d bytes which "
1229 		    "is greater than the maximum allowed size of "
1230 		    "%d bytes (the system PAGE_SIZE).\n",
1231 		    __FUNCTION__, non_rounded_size, PAGE_SIZE);
1232 	}
1233 
1234 	mp = vp->v_mount;
1235 
1236 	/*
1237 	 * we don't want to do any funny rounding of the size for IO requests
1238 	 * coming through the DIRECT or CONTIGUOUS paths...  those pages don't
1239 	 * belong to us... we can't extend (nor do we need to) the I/O to fill
1240 	 * out a page
1241 	 */
1242 	if (mp->mnt_devblocksize > 1 && !(flags & (CL_DEV_MEMORY | CL_DIRECT_IO))) {
1243 		/*
1244 		 * round the requested size up so that this I/O ends on a
1245 		 * page boundary in case this is a 'write'... if the filesystem
1246 		 * has blocks allocated to back the page beyond the EOF, we want to
1247 		 * make sure to write out the zero's that are sitting beyond the EOF
1248 		 * so that in case the filesystem doesn't explicitly zero this area
1249 		 * if a hole is created via a lseek/write beyond the current EOF,
1250 		 * it will return zeros when it's read back from the disk.  If the
1251 		 * physical allocation doesn't extend for the whole page, we'll
1252 		 * only write/read from the disk up to the end of this allocation
1253 		 * via the extent info returned from the VNOP_BLOCKMAP call.
1254 		 */
1255 		pg_offset = upl_offset & PAGE_MASK;
1256 
1257 		size = (((non_rounded_size + pg_offset) + (PAGE_SIZE - 1)) & ~PAGE_MASK) - pg_offset;
1258 	} else {
1259 		/*
1260 		 * anyone advertising a blocksize of 1 byte probably
1261 		 * can't deal with us rounding up the request size
1262 		 * AFP is one such filesystem/device
1263 		 */
1264 		size = non_rounded_size;
1265 	}
1266 	upl_end_offset = upl_offset + size;
1267 
1268 	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_START, (int)f_offset, size, upl_offset, flags, 0);
1269 
1270 	/*
1271 	 * Set the maximum transaction size to the maximum desired number of
1272 	 * buffers.
1273 	 */
1274 	max_trans_count = 8;
1275 	if (flags & CL_DEV_MEMORY) {
1276 		max_trans_count = 16;
1277 	}
1278 
1279 	if (flags & CL_READ) {
1280 		io_flags = B_READ;
1281 		bmap_flags = VNODE_READ;
1282 
1283 		max_iosize  = mp->mnt_maxreadcnt;
1284 		max_vectors = mp->mnt_segreadcnt;
1285 
1286 		if ((flags & CL_PAGEIN) && /* Cluster layer verification will be limited to pagein for now */
1287 		    !(mp->mnt_kern_flag & MNTK_VIRTUALDEV) &&
1288 		    (VNOP_VERIFY(vp, f_offset, NULL, 0, &verify_block_size, NULL, VNODE_VERIFY_DEFAULT, NULL) == 0) &&
1289 		    verify_block_size) {
1290 			if (verify_block_size != PAGE_SIZE) {
1291 				verify_block_size = 0;
1292 			}
1293 			if (real_bp && verify_block_size) {
1294 				panic("%s(): Called with real buffer and needs verification ",
1295 				    __FUNCTION__);
1296 			}
1297 		}
1298 	} else {
1299 		io_flags = B_WRITE;
1300 		bmap_flags = VNODE_WRITE;
1301 
1302 		max_iosize  = mp->mnt_maxwritecnt;
1303 		max_vectors = mp->mnt_segwritecnt;
1304 	}
1305 	if (verify_block_size) {
1306 		bmap_flags |= VNODE_CLUSTER_VERIFY;
1307 	}
1308 	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_NONE, max_iosize, max_vectors, mp->mnt_devblocksize, 0, 0);
1309 
1310 	/*
1311 	 * make sure the maximum iosize is a
1312 	 * multiple of the page size
1313 	 */
1314 	max_iosize  &= ~PAGE_MASK;
1315 
1316 	/*
1317 	 * Ensure the maximum iosize is sensible.
1318 	 */
1319 	if (!max_iosize) {
1320 		max_iosize = PAGE_SIZE;
1321 	}
1322 
1323 	if (flags & CL_THROTTLE) {
1324 		if (!(flags & CL_PAGEOUT) && cluster_is_throttled(vp)) {
1325 			if (max_iosize > THROTTLE_MAX_IOSIZE) {
1326 				max_iosize = THROTTLE_MAX_IOSIZE;
1327 			}
1328 			async_throttle = THROTTLE_MAXCNT;
1329 		} else {
1330 			if ((flags & CL_DEV_MEMORY)) {
1331 				async_throttle = IO_SCALE(vp, VNODE_ASYNC_THROTTLE);
1332 			} else {
1333 				u_int max_cluster;
1334 				u_int max_cluster_size;
1335 				u_int scale;
1336 
1337 				if (vp->v_mount->mnt_minsaturationbytecount) {
1338 					max_cluster_size = vp->v_mount->mnt_minsaturationbytecount;
1339 
1340 					scale = 1;
1341 				} else {
1342 					max_cluster_size = MAX_CLUSTER_SIZE(vp);
1343 
1344 					if (disk_conditioner_mount_is_ssd(vp->v_mount)) {
1345 						scale = WRITE_THROTTLE_SSD;
1346 					} else {
1347 						scale = WRITE_THROTTLE;
1348 					}
1349 				}
1350 				if (max_iosize > max_cluster_size) {
1351 					max_cluster = max_cluster_size;
1352 				} else {
1353 					max_cluster = max_iosize;
1354 				}
1355 
1356 				if (size < max_cluster) {
1357 					max_cluster = size;
1358 				}
1359 
1360 				if (flags & CL_CLOSE) {
1361 					scale += MAX_CLUSTERS;
1362 				}
1363 
1364 				async_throttle = min(IO_SCALE(vp, VNODE_ASYNC_THROTTLE), ((scale * max_cluster_size) / max_cluster) - 1);
1365 			}
1366 		}
1367 	}
1368 	if (flags & CL_AGE) {
1369 		io_flags |= B_AGE;
1370 	}
1371 	if (flags & (CL_PAGEIN | CL_PAGEOUT)) {
1372 		io_flags |= B_PAGEIO;
1373 	}
1374 	if (flags & (CL_IOSTREAMING)) {
1375 		io_flags |= B_IOSTREAMING;
1376 	}
1377 	if (flags & CL_COMMIT) {
1378 		io_flags |= B_COMMIT_UPL;
1379 	}
1380 	if (flags & CL_DIRECT_IO) {
1381 		io_flags |= B_PHYS;
1382 	}
1383 	if (flags & (CL_PRESERVE | CL_KEEPCACHED)) {
1384 		io_flags |= B_CACHE;
1385 	}
1386 	if (flags & CL_PASSIVE) {
1387 		io_flags |= B_PASSIVE;
1388 	}
1389 	if (flags & CL_ENCRYPTED) {
1390 		io_flags |= B_ENCRYPTED_IO;
1391 	}
1392 
1393 	if (vp->v_flag & VSYSTEM) {
1394 		io_flags |= B_META;
1395 	}
1396 
1397 	if ((flags & CL_READ) && ((upl_offset + non_rounded_size) & PAGE_MASK) && (!(flags & CL_NOZERO))) {
1398 		/*
1399 		 * then we are going to end up
1400 		 * with a page that we can't complete (the file size wasn't a multiple
1401 		 * of PAGE_SIZE and we're trying to read to the end of the file
1402 		 * so we'll go ahead and zero out the portion of the page we can't
1403 		 * read in from the file
1404 		 */
1405 		zero_offset = (int)(upl_offset + non_rounded_size);
1406 	} else if (!ISSET(flags, CL_READ) && ISSET(flags, CL_DIRECT_IO)) {
1407 		assert(ISSET(flags, CL_COMMIT));
1408 
1409 		// For a direct/uncached write, we need to lock pages...
1410 
1411 		upl_t cached_upl;
1412 
1413 		/*
1414 		 * Create a UPL to lock the pages in the cache whilst the
1415 		 * write is in progress.
1416 		 */
1417 		ubc_create_upl_kernel(vp, f_offset, non_rounded_size, &cached_upl,
1418 		    NULL, UPL_SET_LITE, VM_KERN_MEMORY_FILE);
1419 
1420 		/*
1421 		 * Attach this UPL to the other UPL so that we can find it
1422 		 * later.
1423 		 */
1424 		upl_set_associated_upl(upl, cached_upl);
1425 
1426 		if (upl_offset & PAGE_MASK) {
1427 			/*
1428 			 * The two UPLs are not aligned, so mark the first page in
1429 			 * @upl so that cluster_handle_associated_upl can handle
1430 			 * it accordingly.
1431 			 */
1432 			upl_page_info_t *pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
1433 			upl_page_set_mark(pl, 0, true);
1434 		}
1435 	}
1436 
1437 	while (size) {
1438 		daddr64_t blkno;
1439 		daddr64_t lblkno;
1440 		size_t  io_size_tmp;
1441 		u_int   io_size_wanted;
1442 		uint32_t lblksize;
1443 
1444 		if (size > max_iosize) {
1445 			io_size = max_iosize;
1446 		} else {
1447 			io_size = size;
1448 		}
1449 
1450 		io_size_wanted = io_size;
1451 		io_size_tmp = (size_t)io_size;
1452 
1453 		if ((error = VNOP_BLOCKMAP(vp, f_offset, io_size, &blkno, &io_size_tmp, NULL, bmap_flags, NULL))) {
1454 			break;
1455 		}
1456 
1457 		if (io_size_tmp > io_size_wanted) {
1458 			io_size = io_size_wanted;
1459 		} else {
1460 			io_size = (u_int)io_size_tmp;
1461 		}
1462 
1463 		if (real_bp && (real_bp->b_blkno == real_bp->b_lblkno)) {
1464 			real_bp->b_blkno = blkno;
1465 		}
1466 
1467 		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 24)) | DBG_FUNC_NONE,
1468 		    (int)f_offset, (int)(blkno >> 32), (int)blkno, io_size, 0);
1469 
1470 		if (io_size == 0) {
1471 			/*
1472 			 * vnop_blockmap didn't return an error... however, it did
1473 			 * return an extent size of 0 which means we can't
1474 			 * make forward progress on this I/O... a hole in the
1475 			 * file would be returned as a blkno of -1 with a non-zero io_size
1476 			 * a real extent is returned with a blkno != -1 and a non-zero io_size
1477 			 */
1478 			error = EINVAL;
1479 			break;
1480 		}
1481 		if (!(flags & CL_READ) && blkno == -1) {
1482 			off_t   e_offset;
1483 			int     pageout_flags;
1484 
1485 			if (upl_get_internal_vectorupl(upl)) {
1486 				panic("Vector UPLs should not take this code-path");
1487 			}
1488 			/*
1489 			 * we're writing into a 'hole'
1490 			 */
1491 			if (flags & CL_PAGEOUT) {
1492 				/*
1493 				 * if we got here via cluster_pageout
1494 				 * then just error the request and return
1495 				 * the 'hole' should already have been covered
1496 				 */
1497 				error = EINVAL;
1498 				break;
1499 			}
1500 			/*
1501 			 * we can get here if the cluster code happens to
1502 			 * pick up a page that was dirtied via mmap vs
1503 			 * a 'write' and the page targets a 'hole'...
1504 			 * i.e. the writes to the cluster were sparse
1505 			 * and the file was being written for the first time
1506 			 *
1507 			 * we can also get here if the filesystem supports
1508 			 * 'holes' that are less than PAGE_SIZE.... because
1509 			 * we can't know if the range in the page that covers
1510 			 * the 'hole' has been dirtied via an mmap or not,
1511 			 * we have to assume the worst and try to push the
1512 			 * entire page to storage.
1513 			 *
1514 			 * Try paging out the page individually before
1515 			 * giving up entirely and dumping it (the pageout
1516 			 * path will insure that the zero extent accounting
1517 			 * has been taken care of before we get back into cluster_io)
1518 			 *
1519 			 * go direct to vnode_pageout so that we don't have to
1520 			 * unbusy the page from the UPL... we used to do this
1521 			 * so that we could call ubc_msync, but that results
1522 			 * in a potential deadlock if someone else races us to acquire
1523 			 * that page and wins and in addition needs one of the pages
1524 			 * we're continuing to hold in the UPL
1525 			 */
1526 			pageout_flags = UPL_MSYNC | UPL_VNODE_PAGER | UPL_NESTED_PAGEOUT;
1527 
1528 			if (!(flags & CL_ASYNC)) {
1529 				pageout_flags |= UPL_IOSYNC;
1530 			}
1531 			if (!(flags & CL_COMMIT)) {
1532 				pageout_flags |= UPL_NOCOMMIT;
1533 			}
1534 
1535 			if (cbp_head) {
1536 				buf_t prev_cbp;
1537 				uint32_t   bytes_in_last_page;
1538 
1539 				/*
1540 				 * first we have to wait for the the current outstanding I/Os
1541 				 * to complete... EOT hasn't been set yet on this transaction
1542 				 * so the pages won't be released
1543 				 */
1544 				cluster_wait_IO(cbp_head, (flags & CL_ASYNC));
1545 
1546 				bytes_in_last_page = cbp_head->b_uploffset & PAGE_MASK;
1547 				for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) {
1548 					bytes_in_last_page += cbp->b_bcount;
1549 				}
1550 				bytes_in_last_page &= PAGE_MASK;
1551 
1552 				while (bytes_in_last_page) {
1553 					/*
1554 					 * we've got a transcation that
1555 					 * includes the page we're about to push out through vnode_pageout...
1556 					 * find the bp's in the list which intersect this page and either
1557 					 * remove them entirely from the transaction (there could be multiple bp's), or
1558 					 * round it's iosize down to the page boundary (there can only be one)...
1559 					 *
1560 					 * find the last bp in the list and act on it
1561 					 */
1562 					for (prev_cbp = cbp = cbp_head; cbp->b_trans_next; cbp = cbp->b_trans_next) {
1563 						prev_cbp = cbp;
1564 					}
1565 
1566 					if (bytes_in_last_page >= cbp->b_bcount) {
1567 						/*
1568 						 * this buf no longer has any I/O associated with it
1569 						 */
1570 						bytes_in_last_page -= cbp->b_bcount;
1571 						cbp->b_bcount = 0;
1572 
1573 						free_io_buf(cbp);
1574 
1575 						if (cbp == cbp_head) {
1576 							assert(bytes_in_last_page == 0);
1577 							/*
1578 							 * the buf we just freed was the only buf in
1579 							 * this transaction... so there's no I/O to do
1580 							 */
1581 							cbp_head = NULL;
1582 							cbp_tail = NULL;
1583 						} else {
1584 							/*
1585 							 * remove the buf we just freed from
1586 							 * the transaction list
1587 							 */
1588 							prev_cbp->b_trans_next = NULL;
1589 							cbp_tail = prev_cbp;
1590 						}
1591 					} else {
1592 						/*
1593 						 * this is the last bp that has I/O
1594 						 * intersecting the page of interest
1595 						 * only some of the I/O is in the intersection
1596 						 * so clip the size but keep it in the transaction list
1597 						 */
1598 						cbp->b_bcount -= bytes_in_last_page;
1599 						cbp_tail = cbp;
1600 						bytes_in_last_page = 0;
1601 					}
1602 				}
1603 				if (cbp_head) {
1604 					/*
1605 					 * there was more to the current transaction
1606 					 * than just the page we are pushing out via vnode_pageout...
1607 					 * mark it as finished and complete it... we've already
1608 					 * waited for the I/Os to complete above in the call to cluster_wait_IO
1609 					 */
1610 					cluster_EOT(cbp_head, cbp_tail, 0, 0);
1611 
1612 					cluster_complete_transaction(&cbp_head, callback_arg, &retval, flags, 0);
1613 
1614 					trans_count = 0;
1615 				}
1616 			}
1617 			if (vnode_pageout(vp, upl, (upl_offset_t)trunc_page(upl_offset), trunc_page_64(f_offset), PAGE_SIZE, pageout_flags, NULL) != PAGER_SUCCESS) {
1618 				error = EINVAL;
1619 			}
1620 			e_offset = round_page_64(f_offset + 1);
1621 			io_size = (u_int)(e_offset - f_offset);
1622 
1623 			f_offset   += io_size;
1624 			upl_offset += io_size;
1625 
1626 			if (size >= io_size) {
1627 				size -= io_size;
1628 			} else {
1629 				size = 0;
1630 			}
1631 			/*
1632 			 * keep track of how much of the original request
1633 			 * that we've actually completed... non_rounded_size
1634 			 * may go negative due to us rounding the request
1635 			 * to a page size multiple (i.e.  size > non_rounded_size)
1636 			 */
1637 			non_rounded_size -= io_size;
1638 
1639 			if (non_rounded_size <= 0) {
1640 				/*
1641 				 * we've transferred all of the data in the original
1642 				 * request, but we were unable to complete the tail
1643 				 * of the last page because the file didn't have
1644 				 * an allocation to back that portion... this is ok.
1645 				 */
1646 				size = 0;
1647 			}
1648 			if (error) {
1649 				if (size == 0) {
1650 					flags &= ~CL_COMMIT;
1651 				}
1652 				break;
1653 			}
1654 			continue;
1655 		}
1656 
1657 		lblksize = CLUSTER_IO_BLOCK_SIZE;
1658 		lblkno = (daddr64_t)(f_offset / lblksize);
1659 
1660 		/*
1661 		 * we have now figured out how much I/O we can do - this is in 'io_size'
1662 		 * pg_offset is the starting point in the first page for the I/O
1663 		 * pg_count is the number of full and partial pages that 'io_size' encompasses
1664 		 */
1665 		pg_offset = upl_offset & PAGE_MASK;
1666 
1667 		if (flags & CL_DEV_MEMORY) {
1668 			/*
1669 			 * treat physical requests as one 'giant' page
1670 			 */
1671 			pg_count = 1;
1672 		} else {
1673 			pg_count  = (io_size + pg_offset + (PAGE_SIZE - 1)) / PAGE_SIZE;
1674 		}
1675 
1676 		if ((flags & CL_READ) && blkno == -1) {
1677 			vm_offset_t  commit_offset;
1678 			int bytes_to_zero;
1679 			int complete_transaction_now = 0;
1680 
1681 			/*
1682 			 * if we're reading and blkno == -1, then we've got a
1683 			 * 'hole' in the file that we need to deal with by zeroing
1684 			 * out the affected area in the upl
1685 			 */
1686 			if (io_size >= (u_int)non_rounded_size) {
1687 				/*
1688 				 * if this upl contains the EOF and it is not a multiple of PAGE_SIZE
1689 				 * than 'zero_offset' will be non-zero
1690 				 * if the 'hole' returned by vnop_blockmap extends all the way to the eof
1691 				 * (indicated by the io_size finishing off the I/O request for this UPL)
1692 				 * than we're not going to issue an I/O for the
1693 				 * last page in this upl... we need to zero both the hole and the tail
1694 				 * of the page beyond the EOF, since the delayed zero-fill won't kick in
1695 				 */
1696 				bytes_to_zero = non_rounded_size;
1697 				if (!(flags & CL_NOZERO)) {
1698 					bytes_to_zero = (int)((((upl_offset + io_size) + (PAGE_SIZE - 1)) & ~PAGE_MASK) - upl_offset);
1699 				}
1700 
1701 				zero_offset = 0;
1702 			} else {
1703 				bytes_to_zero = io_size;
1704 			}
1705 
1706 			pg_count = 0;
1707 
1708 			cluster_zero(upl, (upl_offset_t)upl_offset, bytes_to_zero, real_bp);
1709 
1710 			if (cbp_head) {
1711 				int     pg_resid;
1712 
1713 				/*
1714 				 * if there is a current I/O chain pending
1715 				 * then the first page of the group we just zero'd
1716 				 * will be handled by the I/O completion if the zero
1717 				 * fill started in the middle of the page
1718 				 */
1719 				commit_offset = (upl_offset + (PAGE_SIZE - 1)) & ~PAGE_MASK;
1720 
1721 				pg_resid = (int)(commit_offset - upl_offset);
1722 
1723 				if (bytes_to_zero >= pg_resid) {
1724 					/*
1725 					 * the last page of the current I/O
1726 					 * has been completed...
1727 					 * compute the number of fully zero'd
1728 					 * pages that are beyond it
1729 					 * plus the last page if its partial
1730 					 * and we have no more I/O to issue...
1731 					 * otherwise a partial page is left
1732 					 * to begin the next I/O
1733 					 */
1734 					if ((int)io_size >= non_rounded_size) {
1735 						pg_count = (bytes_to_zero - pg_resid + (PAGE_SIZE - 1)) / PAGE_SIZE;
1736 					} else {
1737 						pg_count = (bytes_to_zero - pg_resid) / PAGE_SIZE;
1738 					}
1739 
1740 					complete_transaction_now = 1;
1741 				}
1742 			} else {
1743 				/*
1744 				 * no pending I/O to deal with
1745 				 * so, commit all of the fully zero'd pages
1746 				 * plus the last page if its partial
1747 				 * and we have no more I/O to issue...
1748 				 * otherwise a partial page is left
1749 				 * to begin the next I/O
1750 				 */
1751 				if ((int)io_size >= non_rounded_size) {
1752 					pg_count = (pg_offset + bytes_to_zero + (PAGE_SIZE - 1)) / PAGE_SIZE;
1753 				} else {
1754 					pg_count = (pg_offset + bytes_to_zero) / PAGE_SIZE;
1755 				}
1756 
1757 				commit_offset = upl_offset & ~PAGE_MASK;
1758 			}
1759 
1760 			// Associated UPL is currently only used in the direct write path
1761 			assert(!upl_associated_upl(upl));
1762 
1763 			if ((flags & CL_COMMIT) && pg_count) {
1764 				ubc_upl_commit_range(upl, (upl_offset_t)commit_offset,
1765 				    pg_count * PAGE_SIZE,
1766 				    UPL_COMMIT_CLEAR_DIRTY | UPL_COMMIT_FREE_ON_EMPTY);
1767 			}
1768 			upl_offset += io_size;
1769 			f_offset   += io_size;
1770 			size       -= io_size;
1771 
1772 			/*
1773 			 * keep track of how much of the original request
1774 			 * that we've actually completed... non_rounded_size
1775 			 * may go negative due to us rounding the request
1776 			 * to a page size multiple (i.e.  size > non_rounded_size)
1777 			 */
1778 			non_rounded_size -= io_size;
1779 
1780 			if (non_rounded_size <= 0) {
1781 				/*
1782 				 * we've transferred all of the data in the original
1783 				 * request, but we were unable to complete the tail
1784 				 * of the last page because the file didn't have
1785 				 * an allocation to back that portion... this is ok.
1786 				 */
1787 				size = 0;
1788 			}
1789 			if (cbp_head && (complete_transaction_now || size == 0)) {
1790 				cluster_wait_IO(cbp_head, (flags & CL_ASYNC));
1791 
1792 				cluster_EOT(cbp_head, cbp_tail, size == 0 ? zero_offset : 0, verify_block_size);
1793 
1794 				cluster_complete_transaction(&cbp_head, callback_arg, &retval, flags, 0);
1795 
1796 				trans_count = 0;
1797 			}
1798 			continue;
1799 		}
1800 		if (pg_count > max_vectors) {
1801 			if (((pg_count - max_vectors) * PAGE_SIZE) > io_size) {
1802 				io_size = PAGE_SIZE - pg_offset;
1803 				pg_count = 1;
1804 			} else {
1805 				io_size -= (pg_count - max_vectors) * PAGE_SIZE;
1806 				pg_count = max_vectors;
1807 			}
1808 		}
1809 		/*
1810 		 * If the transaction is going to reach the maximum number of
1811 		 * desired elements, truncate the i/o to the nearest page so
1812 		 * that the actual i/o is initiated after this buffer is
1813 		 * created and added to the i/o chain.
1814 		 *
1815 		 * I/O directed to physically contiguous memory
1816 		 * doesn't have a requirement to make sure we 'fill' a page
1817 		 */
1818 		if (!(flags & CL_DEV_MEMORY) && trans_count >= max_trans_count &&
1819 		    ((upl_offset + io_size) & PAGE_MASK)) {
1820 			vm_offset_t aligned_ofs;
1821 
1822 			aligned_ofs = (upl_offset + io_size) & ~PAGE_MASK;
1823 			/*
1824 			 * If the io_size does not actually finish off even a
1825 			 * single page we have to keep adding buffers to the
1826 			 * transaction despite having reached the desired limit.
1827 			 *
1828 			 * Eventually we get here with the page being finished
1829 			 * off (and exceeded) and then we truncate the size of
1830 			 * this i/o request so that it is page aligned so that
1831 			 * we can finally issue the i/o on the transaction.
1832 			 */
1833 			if (aligned_ofs > upl_offset) {
1834 				io_size = (u_int)(aligned_ofs - upl_offset);
1835 				pg_count--;
1836 			}
1837 		}
1838 
1839 		if (!(mp->mnt_kern_flag & MNTK_VIRTUALDEV)) {
1840 			/*
1841 			 * if we're not targeting a virtual device i.e. a disk image
1842 			 * it's safe to dip into the reserve pool since real devices
1843 			 * can complete this I/O request without requiring additional
1844 			 * bufs from the alloc_io_buf pool
1845 			 */
1846 			priv = 1;
1847 		} else if ((flags & CL_ASYNC) && !(flags & CL_PAGEOUT) && !cbp_head) {
1848 			/*
1849 			 * Throttle the speculative IO
1850 			 *
1851 			 * We can only throttle this if it is the first iobuf
1852 			 * for the transaction. alloc_io_buf implements
1853 			 * additional restrictions for diskimages anyway.
1854 			 */
1855 			priv = 0;
1856 		} else {
1857 			priv = 1;
1858 		}
1859 
1860 		cbp = alloc_io_buf(vp, priv);
1861 
1862 		if (flags & CL_PAGEOUT) {
1863 			u_int i;
1864 
1865 			/*
1866 			 * since blocks are in offsets of lblksize (CLUSTER_IO_BLOCK_SIZE), scale
1867 			 * iteration to (PAGE_SIZE * pg_count) of blks.
1868 			 */
1869 			for (i = 0; i < (PAGE_SIZE * pg_count) / lblksize; i++) {
1870 				if (buf_invalblkno(vp, lblkno + i, 0) == EBUSY) {
1871 					panic("BUSY bp found in cluster_io");
1872 				}
1873 			}
1874 		}
1875 		if (flags & CL_ASYNC) {
1876 			if (buf_setcallback(cbp, (void *)cluster_iodone, callback_arg)) {
1877 				panic("buf_setcallback failed");
1878 			}
1879 		}
1880 		cbp->b_cliodone = (void *)callback;
1881 		cbp->b_flags |= io_flags;
1882 		if (flags & CL_NOCACHE) {
1883 			cbp->b_attr.ba_flags |= BA_NOCACHE;
1884 		}
1885 		if (verify_block_size) {
1886 			cbp->b_attr.ba_flags |= BA_WILL_VERIFY;
1887 		}
1888 
1889 		cbp->b_lblkno = lblkno;
1890 		cbp->b_lblksize = lblksize;
1891 		cbp->b_blkno  = blkno;
1892 		cbp->b_bcount = io_size;
1893 
1894 		if (buf_setupl(cbp, upl, (uint32_t)upl_offset)) {
1895 			panic("buf_setupl failed");
1896 		}
1897 #if CONFIG_IOSCHED
1898 		upl_set_blkno(upl, upl_offset, io_size, blkno);
1899 #endif
1900 		cbp->b_trans_next = (buf_t)NULL;
1901 
1902 		if ((cbp->b_iostate = (void *)iostate)) {
1903 			/*
1904 			 * caller wants to track the state of this
1905 			 * io... bump the amount issued against this stream
1906 			 */
1907 			iostate->io_issued += io_size;
1908 		}
1909 
1910 		if (flags & CL_READ) {
1911 			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 26)) | DBG_FUNC_NONE,
1912 			    (int)cbp->b_lblkno, (int)cbp->b_blkno, upl_offset, io_size, 0);
1913 		} else {
1914 			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 27)) | DBG_FUNC_NONE,
1915 			    (int)cbp->b_lblkno, (int)cbp->b_blkno, upl_offset, io_size, 0);
1916 		}
1917 
1918 		if (cbp_head) {
1919 			cbp_tail->b_trans_next = cbp;
1920 			cbp_tail = cbp;
1921 		} else {
1922 			cbp_head = cbp;
1923 			cbp_tail = cbp;
1924 
1925 			if ((cbp_head->b_real_bp = real_bp)) {
1926 				real_bp = (buf_t)NULL;
1927 			}
1928 		}
1929 		*(buf_t *)(&cbp->b_trans_head) = cbp_head;
1930 
1931 		trans_count++;
1932 
1933 		upl_offset += io_size;
1934 		f_offset   += io_size;
1935 		size       -= io_size;
1936 		/*
1937 		 * keep track of how much of the original request
1938 		 * that we've actually completed... non_rounded_size
1939 		 * may go negative due to us rounding the request
1940 		 * to a page size multiple (i.e.  size > non_rounded_size)
1941 		 */
1942 		non_rounded_size -= io_size;
1943 
1944 		if (non_rounded_size <= 0) {
1945 			/*
1946 			 * we've transferred all of the data in the original
1947 			 * request, but we were unable to complete the tail
1948 			 * of the last page because the file didn't have
1949 			 * an allocation to back that portion... this is ok.
1950 			 */
1951 			size = 0;
1952 		}
1953 		if (size == 0) {
1954 			/*
1955 			 * we have no more I/O to issue, so go
1956 			 * finish the final transaction
1957 			 */
1958 			need_EOT = TRUE;
1959 		} else if (((flags & CL_DEV_MEMORY) || (upl_offset & PAGE_MASK) == 0) &&
1960 		    ((flags & CL_ASYNC) || trans_count > max_trans_count)) {
1961 			/*
1962 			 * I/O directed to physically contiguous memory...
1963 			 * which doesn't have a requirement to make sure we 'fill' a page
1964 			 * or...
1965 			 * the current I/O we've prepared fully
1966 			 * completes the last page in this request
1967 			 * and ...
1968 			 * it's either an ASYNC request or
1969 			 * we've already accumulated more than 8 I/O's into
1970 			 * this transaction so mark it as complete so that
1971 			 * it can finish asynchronously or via the cluster_complete_transaction
1972 			 * below if the request is synchronous
1973 			 */
1974 			need_EOT = TRUE;
1975 		}
1976 		if (need_EOT == TRUE) {
1977 			cluster_EOT(cbp_head, cbp_tail, size == 0 ? zero_offset : 0, verify_block_size);
1978 		}
1979 
1980 		if (flags & CL_THROTTLE) {
1981 			(void)vnode_waitforwrites(vp, async_throttle, 0, 0, "cluster_io");
1982 		}
1983 
1984 		if (!(io_flags & B_READ)) {
1985 			vnode_startwrite(vp);
1986 		}
1987 
1988 		if (flags & CL_RAW_ENCRYPTED) {
1989 			/*
1990 			 * User requested raw encrypted bytes.
1991 			 * Twiddle the bit in the ba_flags for the buffer
1992 			 */
1993 			cbp->b_attr.ba_flags |= BA_RAW_ENCRYPTED_IO;
1994 		}
1995 
1996 		(void) VNOP_STRATEGY(cbp);
1997 
1998 		if (need_EOT == TRUE) {
1999 			if (!(flags & CL_ASYNC)) {
2000 				cluster_complete_transaction(&cbp_head, callback_arg, &retval, flags, 1);
2001 			}
2002 
2003 			need_EOT = FALSE;
2004 			trans_count = 0;
2005 			cbp_head = NULL;
2006 		}
2007 	}
2008 	if (error) {
2009 		int abort_size;
2010 
2011 		io_size = 0;
2012 
2013 		if (cbp_head) {
2014 			/*
2015 			 * Wait until all of the outstanding I/O
2016 			 * for this partial transaction has completed
2017 			 */
2018 			cluster_wait_IO(cbp_head, (flags & CL_ASYNC));
2019 
2020 			/*
2021 			 * Rewind the upl offset to the beginning of the
2022 			 * transaction.
2023 			 */
2024 			upl_offset = cbp_head->b_uploffset;
2025 		}
2026 
2027 		if (ISSET(flags, CL_COMMIT)) {
2028 			cluster_handle_associated_upl(iostate, upl,
2029 			    (upl_offset_t)upl_offset,
2030 			    (upl_size_t)(upl_end_offset - upl_offset));
2031 		}
2032 
2033 		// Free all the IO buffers in this transaction
2034 		for (cbp = cbp_head; cbp;) {
2035 			buf_t   cbp_next;
2036 
2037 			size       += cbp->b_bcount;
2038 			io_size    += cbp->b_bcount;
2039 
2040 			cbp_next = cbp->b_trans_next;
2041 			free_io_buf(cbp);
2042 			cbp = cbp_next;
2043 		}
2044 
2045 		if (iostate) {
2046 			int need_wakeup = 0;
2047 
2048 			/*
2049 			 * update the error condition for this stream
2050 			 * since we never really issued the io
2051 			 * just go ahead and adjust it back
2052 			 */
2053 			lck_mtx_lock_spin(&iostate->io_mtxp);
2054 
2055 			if (iostate->io_error == 0) {
2056 				iostate->io_error = error;
2057 			}
2058 			iostate->io_issued -= io_size;
2059 
2060 			if (iostate->io_wanted) {
2061 				/*
2062 				 * someone is waiting for the state of
2063 				 * this io stream to change
2064 				 */
2065 				iostate->io_wanted = 0;
2066 				need_wakeup = 1;
2067 			}
2068 			lck_mtx_unlock(&iostate->io_mtxp);
2069 
2070 			if (need_wakeup) {
2071 				wakeup((caddr_t)&iostate->io_wanted);
2072 			}
2073 		}
2074 
2075 		if (flags & CL_COMMIT) {
2076 			int     upl_flags;
2077 
2078 			pg_offset  = upl_offset & PAGE_MASK;
2079 			abort_size = (int)((upl_end_offset - upl_offset + PAGE_MASK) & ~PAGE_MASK);
2080 
2081 			upl_flags = cluster_ioerror(upl, (int)(upl_offset - pg_offset),
2082 			    abort_size, error, io_flags, vp);
2083 
2084 			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 28)) | DBG_FUNC_NONE,
2085 			    upl, upl_offset - pg_offset, abort_size, (error << 24) | upl_flags, 0);
2086 		}
2087 		if (retval == 0) {
2088 			retval = error;
2089 		}
2090 	} else if (cbp_head) {
2091 		panic("%s(): cbp_head is not NULL.", __FUNCTION__);
2092 	}
2093 
2094 	if (real_bp) {
2095 		/*
2096 		 * can get here if we either encountered an error
2097 		 * or we completely zero-filled the request and
2098 		 * no I/O was issued
2099 		 */
2100 		if (error) {
2101 			real_bp->b_flags |= B_ERROR;
2102 			real_bp->b_error = error;
2103 		}
2104 		buf_biodone(real_bp);
2105 	}
2106 	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_END, (int)f_offset, size, upl_offset, retval, 0);
2107 
2108 	return retval;
2109 }
2110 
2111 #define reset_vector_run_state()                                                                                \
2112 	issueVectorUPL = vector_upl_offset = vector_upl_index = vector_upl_iosize = vector_upl_size = 0;
2113 
2114 static int
vector_cluster_io(vnode_t vp,upl_t vector_upl,vm_offset_t vector_upl_offset,off_t v_upl_uio_offset,int vector_upl_iosize,int io_flag,buf_t real_bp,struct clios * iostate,int (* callback)(buf_t,void *),void * callback_arg)2115 vector_cluster_io(vnode_t vp, upl_t vector_upl, vm_offset_t vector_upl_offset, off_t v_upl_uio_offset, int vector_upl_iosize,
2116     int io_flag, buf_t real_bp, struct clios *iostate, int (*callback)(buf_t, void *), void *callback_arg)
2117 {
2118 	vector_upl_set_pagelist(vector_upl);
2119 
2120 	if (io_flag & CL_READ) {
2121 		if (vector_upl_offset == 0 && ((vector_upl_iosize & PAGE_MASK) == 0)) {
2122 			io_flag &= ~CL_PRESERVE; /*don't zero fill*/
2123 		} else {
2124 			io_flag |= CL_PRESERVE; /*zero fill*/
2125 		}
2126 	}
2127 	return cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, real_bp, iostate, callback, callback_arg);
2128 }
2129 
2130 static int
cluster_read_prefetch(vnode_t vp,off_t f_offset,u_int size,off_t filesize,int (* callback)(buf_t,void *),void * callback_arg,int bflag)2131 cluster_read_prefetch(vnode_t vp, off_t f_offset, u_int size, off_t filesize, int (*callback)(buf_t, void *), void *callback_arg, int bflag)
2132 {
2133 	int           pages_in_prefetch;
2134 
2135 	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_START,
2136 	    (int)f_offset, size, (int)filesize, 0, 0);
2137 
2138 	if (f_offset >= filesize) {
2139 		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
2140 		    (int)f_offset, 0, 0, 0, 0);
2141 		return 0;
2142 	}
2143 	if ((off_t)size > (filesize - f_offset)) {
2144 		size = (u_int)(filesize - f_offset);
2145 	}
2146 	pages_in_prefetch = (size + (PAGE_SIZE - 1)) / PAGE_SIZE;
2147 
2148 	advisory_read_ext(vp, filesize, f_offset, size, callback, callback_arg, bflag);
2149 
2150 	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
2151 	    (int)f_offset + size, pages_in_prefetch, 0, 1, 0);
2152 
2153 	return pages_in_prefetch;
2154 }
2155 
2156 
2157 
2158 static void
cluster_read_ahead(vnode_t vp,struct cl_extent * extent,off_t filesize,struct cl_readahead * rap,int (* callback)(buf_t,void *),void * callback_arg,int bflag)2159 cluster_read_ahead(vnode_t vp, struct cl_extent *extent, off_t filesize, struct cl_readahead *rap, int (*callback)(buf_t, void *), void *callback_arg,
2160     int bflag)
2161 {
2162 	daddr64_t       r_addr;
2163 	off_t           f_offset;
2164 	int             size_of_prefetch;
2165 	u_int           max_prefetch;
2166 
2167 
2168 	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_START,
2169 	    (int)extent->b_addr, (int)extent->e_addr, (int)rap->cl_lastr, 0, 0);
2170 
2171 	if (extent->b_addr == rap->cl_lastr && extent->b_addr == extent->e_addr) {
2172 		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
2173 		    rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 0, 0);
2174 		return;
2175 	}
2176 	if (rap->cl_lastr == -1 || (extent->b_addr != rap->cl_lastr && extent->b_addr != (rap->cl_lastr + 1))) {
2177 		rap->cl_ralen = 0;
2178 		rap->cl_maxra = 0;
2179 
2180 		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
2181 		    rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 1, 0);
2182 
2183 		return;
2184 	}
2185 
2186 	max_prefetch = cluster_max_prefetch(vp,
2187 	    cluster_max_io_size(vp->v_mount, CL_READ), speculative_prefetch_max);
2188 
2189 	if (max_prefetch <= PAGE_SIZE) {
2190 		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
2191 		    rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 6, 0);
2192 		return;
2193 	}
2194 	if (extent->e_addr < rap->cl_maxra && rap->cl_ralen >= 4) {
2195 		if ((rap->cl_maxra - extent->e_addr) > (rap->cl_ralen / 4)) {
2196 			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
2197 			    rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 2, 0);
2198 			return;
2199 		}
2200 	}
2201 	r_addr = MAX(extent->e_addr, rap->cl_maxra) + 1;
2202 	f_offset = (off_t)(r_addr * PAGE_SIZE_64);
2203 
2204 	size_of_prefetch = 0;
2205 
2206 	ubc_range_op(vp, f_offset, f_offset + PAGE_SIZE_64, UPL_ROP_PRESENT, &size_of_prefetch);
2207 
2208 	if (size_of_prefetch) {
2209 		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
2210 		    rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 3, 0);
2211 		return;
2212 	}
2213 	if (f_offset < filesize) {
2214 		daddr64_t read_size;
2215 
2216 		rap->cl_ralen = rap->cl_ralen ? min(max_prefetch / PAGE_SIZE, rap->cl_ralen << 1) : 1;
2217 
2218 		read_size = (extent->e_addr + 1) - extent->b_addr;
2219 
2220 		if (read_size > rap->cl_ralen) {
2221 			if (read_size > max_prefetch / PAGE_SIZE) {
2222 				rap->cl_ralen = max_prefetch / PAGE_SIZE;
2223 			} else {
2224 				rap->cl_ralen = (int)read_size;
2225 			}
2226 		}
2227 		size_of_prefetch = cluster_read_prefetch(vp, f_offset, rap->cl_ralen * PAGE_SIZE, filesize, callback, callback_arg, bflag);
2228 
2229 		if (size_of_prefetch) {
2230 			rap->cl_maxra = (r_addr + size_of_prefetch) - 1;
2231 		}
2232 	}
2233 	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
2234 	    rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 4, 0);
2235 }
2236 
2237 
2238 int
cluster_pageout(vnode_t vp,upl_t upl,upl_offset_t upl_offset,off_t f_offset,int size,off_t filesize,int flags)2239 cluster_pageout(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset,
2240     int size, off_t filesize, int flags)
2241 {
2242 	return cluster_pageout_ext(vp, upl, upl_offset, f_offset, size, filesize, flags, NULL, NULL);
2243 }
2244 
2245 
2246 int
cluster_pageout_ext(vnode_t vp,upl_t upl,upl_offset_t upl_offset,off_t f_offset,int size,off_t filesize,int flags,int (* callback)(buf_t,void *),void * callback_arg)2247 cluster_pageout_ext(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset,
2248     int size, off_t filesize, int flags, int (*callback)(buf_t, void *), void *callback_arg)
2249 {
2250 	int           io_size;
2251 	int           rounded_size;
2252 	off_t         max_size;
2253 	int           local_flags;
2254 
2255 	local_flags = CL_PAGEOUT | CL_THROTTLE;
2256 
2257 	if ((flags & UPL_IOSYNC) == 0) {
2258 		local_flags |= CL_ASYNC;
2259 	}
2260 	if ((flags & UPL_NOCOMMIT) == 0) {
2261 		local_flags |= CL_COMMIT;
2262 	}
2263 	if ((flags & UPL_KEEPCACHED)) {
2264 		local_flags |= CL_KEEPCACHED;
2265 	}
2266 	if (flags & UPL_PAGING_ENCRYPTED) {
2267 		local_flags |= CL_ENCRYPTED;
2268 	}
2269 
2270 
2271 	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 52)) | DBG_FUNC_NONE,
2272 	    (int)f_offset, size, (int)filesize, local_flags, 0);
2273 
2274 	/*
2275 	 * If they didn't specify any I/O, then we are done...
2276 	 * we can't issue an abort because we don't know how
2277 	 * big the upl really is
2278 	 */
2279 	if (size <= 0) {
2280 		return EINVAL;
2281 	}
2282 
2283 	if (vp->v_mount->mnt_flag & MNT_RDONLY) {
2284 		if (local_flags & CL_COMMIT) {
2285 			ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
2286 		}
2287 		return EROFS;
2288 	}
2289 	/*
2290 	 * can't page-in from a negative offset
2291 	 * or if we're starting beyond the EOF
2292 	 * or if the file offset isn't page aligned
2293 	 * or the size requested isn't a multiple of PAGE_SIZE
2294 	 */
2295 	if (f_offset < 0 || f_offset >= filesize ||
2296 	    (f_offset & PAGE_MASK_64) || (size & PAGE_MASK)) {
2297 		if (local_flags & CL_COMMIT) {
2298 			ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
2299 		}
2300 		return EINVAL;
2301 	}
2302 	max_size = filesize - f_offset;
2303 
2304 	if (size < max_size) {
2305 		io_size = size;
2306 	} else {
2307 		io_size = (int)max_size;
2308 	}
2309 
2310 	rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
2311 
2312 	if (size > rounded_size) {
2313 		if (local_flags & CL_COMMIT) {
2314 			ubc_upl_abort_range(upl, upl_offset + rounded_size, size - rounded_size,
2315 			    UPL_ABORT_FREE_ON_EMPTY);
2316 		}
2317 	}
2318 	return cluster_io(vp, upl, upl_offset, f_offset, io_size,
2319 	           local_flags, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
2320 }
2321 
2322 
2323 int
cluster_pagein(vnode_t vp,upl_t upl,upl_offset_t upl_offset,off_t f_offset,int size,off_t filesize,int flags)2324 cluster_pagein(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset,
2325     int size, off_t filesize, int flags)
2326 {
2327 	return cluster_pagein_ext(vp, upl, upl_offset, f_offset, size, filesize, flags, NULL, NULL);
2328 }
2329 
2330 
2331 int
cluster_pagein_ext(vnode_t vp,upl_t upl,upl_offset_t upl_offset,off_t f_offset,int size,off_t filesize,int flags,int (* callback)(buf_t,void *),void * callback_arg)2332 cluster_pagein_ext(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset,
2333     int size, off_t filesize, int flags, int (*callback)(buf_t, void *), void *callback_arg)
2334 {
2335 	u_int         io_size;
2336 	int           rounded_size;
2337 	off_t         max_size;
2338 	int           retval;
2339 	int           local_flags = 0;
2340 
2341 	if (upl == NULL || size < 0) {
2342 		panic("cluster_pagein: NULL upl passed in");
2343 	}
2344 
2345 	if ((flags & UPL_IOSYNC) == 0) {
2346 		local_flags |= CL_ASYNC;
2347 	}
2348 	if ((flags & UPL_NOCOMMIT) == 0) {
2349 		local_flags |= CL_COMMIT;
2350 	}
2351 	if (flags & UPL_IOSTREAMING) {
2352 		local_flags |= CL_IOSTREAMING;
2353 	}
2354 	if (flags & UPL_PAGING_ENCRYPTED) {
2355 		local_flags |= CL_ENCRYPTED;
2356 	}
2357 
2358 
2359 	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 56)) | DBG_FUNC_NONE,
2360 	    (int)f_offset, size, (int)filesize, local_flags, 0);
2361 
2362 	/*
2363 	 * can't page-in from a negative offset
2364 	 * or if we're starting beyond the EOF
2365 	 * or if the file offset isn't page aligned
2366 	 * or the size requested isn't a multiple of PAGE_SIZE
2367 	 */
2368 	if (f_offset < 0 || f_offset >= filesize ||
2369 	    (f_offset & PAGE_MASK_64) || (size & PAGE_MASK) || (upl_offset & PAGE_MASK)) {
2370 		if (local_flags & CL_COMMIT) {
2371 			ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
2372 		}
2373 
2374 		if (f_offset >= filesize) {
2375 			ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_CLUSTER, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_CL_PGIN_PAST_EOF), 0 /* arg */);
2376 		}
2377 
2378 		return EINVAL;
2379 	}
2380 	max_size = filesize - f_offset;
2381 
2382 	if (size < max_size) {
2383 		io_size = size;
2384 	} else {
2385 		io_size = (int)max_size;
2386 	}
2387 
2388 	rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
2389 
2390 	if (size > rounded_size && (local_flags & CL_COMMIT)) {
2391 		ubc_upl_abort_range(upl, upl_offset + rounded_size,
2392 		    size - rounded_size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
2393 	}
2394 
2395 	retval = cluster_io(vp, upl, upl_offset, f_offset, io_size,
2396 	    local_flags | CL_READ | CL_PAGEIN, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
2397 
2398 	return retval;
2399 }
2400 
2401 
2402 int
cluster_bp(buf_t bp)2403 cluster_bp(buf_t bp)
2404 {
2405 	return cluster_bp_ext(bp, NULL, NULL);
2406 }
2407 
2408 
2409 int
cluster_bp_ext(buf_t bp,int (* callback)(buf_t,void *),void * callback_arg)2410 cluster_bp_ext(buf_t bp, int (*callback)(buf_t, void *), void *callback_arg)
2411 {
2412 	off_t  f_offset;
2413 	int    flags;
2414 
2415 	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 19)) | DBG_FUNC_START,
2416 	    bp, (int)bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);
2417 
2418 	if (bp->b_flags & B_READ) {
2419 		flags = CL_ASYNC | CL_READ;
2420 	} else {
2421 		flags = CL_ASYNC;
2422 	}
2423 	if (bp->b_flags & B_PASSIVE) {
2424 		flags |= CL_PASSIVE;
2425 	}
2426 
2427 	f_offset = ubc_blktooff(bp->b_vp, bp->b_lblkno);
2428 
2429 	return cluster_io(bp->b_vp, bp->b_upl, 0, f_offset, bp->b_bcount, flags, bp, (struct clios *)NULL, callback, callback_arg);
2430 }
2431 
2432 
2433 
2434 int
cluster_write(vnode_t vp,struct uio * uio,off_t oldEOF,off_t newEOF,off_t headOff,off_t tailOff,int xflags)2435 cluster_write(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, off_t headOff, off_t tailOff, int xflags)
2436 {
2437 	return cluster_write_ext(vp, uio, oldEOF, newEOF, headOff, tailOff, xflags, NULL, NULL);
2438 }
2439 
2440 
2441 int
cluster_write_ext(vnode_t vp,struct uio * uio,off_t oldEOF,off_t newEOF,off_t headOff,off_t tailOff,int xflags,int (* callback)(buf_t,void *),void * callback_arg)2442 cluster_write_ext(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, off_t headOff, off_t tailOff,
2443     int xflags, int (*callback)(buf_t, void *), void *callback_arg)
2444 {
2445 	user_ssize_t    cur_resid;
2446 	int             retval = 0;
2447 	int             flags;
2448 	int             zflags;
2449 	int             bflag;
2450 	int             write_type = IO_COPY;
2451 	u_int32_t       write_length;
2452 
2453 	flags = xflags;
2454 
2455 	if (flags & IO_PASSIVE) {
2456 		bflag = CL_PASSIVE;
2457 	} else {
2458 		bflag = 0;
2459 	}
2460 
2461 	if (vp->v_flag & VNOCACHE_DATA) {
2462 		flags |= IO_NOCACHE;
2463 		bflag |= CL_NOCACHE;
2464 	}
2465 	if (uio == NULL) {
2466 		/*
2467 		 * no user data...
2468 		 * this call is being made to zero-fill some range in the file
2469 		 */
2470 		retval = cluster_write_copy(vp, NULL, (u_int32_t)0, oldEOF, newEOF, headOff, tailOff, flags, callback, callback_arg);
2471 
2472 		return retval;
2473 	}
2474 	/*
2475 	 * do a write through the cache if one of the following is true....
2476 	 *   NOCACHE is not true or NODIRECT is true
2477 	 *   the uio request doesn't target USERSPACE
2478 	 * otherwise, find out if we want the direct or contig variant for
2479 	 * the first vector in the uio request
2480 	 */
2481 	if (((flags & (IO_NOCACHE | IO_NODIRECT)) == IO_NOCACHE) && UIO_SEG_IS_USER_SPACE(uio->uio_segflg)) {
2482 		retval = cluster_io_type(uio, &write_type, &write_length, MIN_DIRECT_WRITE_SIZE);
2483 	}
2484 
2485 	if ((flags & (IO_TAILZEROFILL | IO_HEADZEROFILL)) && write_type == IO_DIRECT) {
2486 		/*
2487 		 * must go through the cached variant in this case
2488 		 */
2489 		write_type = IO_COPY;
2490 	}
2491 
2492 	while ((cur_resid = uio_resid(uio)) && uio->uio_offset < newEOF && retval == 0) {
2493 		switch (write_type) {
2494 		case IO_COPY:
2495 			/*
2496 			 * make sure the uio_resid isn't too big...
2497 			 * internally, we want to handle all of the I/O in
2498 			 * chunk sizes that fit in a 32 bit int
2499 			 */
2500 			if (cur_resid > (user_ssize_t)(MAX_IO_REQUEST_SIZE)) {
2501 				/*
2502 				 * we're going to have to call cluster_write_copy
2503 				 * more than once...
2504 				 *
2505 				 * only want the last call to cluster_write_copy to
2506 				 * have the IO_TAILZEROFILL flag set and only the
2507 				 * first call should have IO_HEADZEROFILL
2508 				 */
2509 				zflags = flags & ~IO_TAILZEROFILL;
2510 				flags &= ~IO_HEADZEROFILL;
2511 
2512 				write_length = MAX_IO_REQUEST_SIZE;
2513 			} else {
2514 				/*
2515 				 * last call to cluster_write_copy
2516 				 */
2517 				zflags = flags;
2518 
2519 				write_length = (u_int32_t)cur_resid;
2520 			}
2521 			retval = cluster_write_copy(vp, uio, write_length, oldEOF, newEOF, headOff, tailOff, zflags, callback, callback_arg);
2522 			break;
2523 
2524 		case IO_CONTIG:
2525 			zflags = flags & ~(IO_TAILZEROFILL | IO_HEADZEROFILL);
2526 
2527 			if (flags & IO_HEADZEROFILL) {
2528 				/*
2529 				 * only do this once per request
2530 				 */
2531 				flags &= ~IO_HEADZEROFILL;
2532 
2533 				retval = cluster_write_copy(vp, (struct uio *)0, (u_int32_t)0, (off_t)0, uio->uio_offset,
2534 				    headOff, (off_t)0, zflags | IO_HEADZEROFILL | IO_SYNC, callback, callback_arg);
2535 				if (retval) {
2536 					break;
2537 				}
2538 			}
2539 			retval = cluster_write_contig(vp, uio, newEOF, &write_type, &write_length, callback, callback_arg, bflag);
2540 
2541 			if (retval == 0 && (flags & IO_TAILZEROFILL) && uio_resid(uio) == 0) {
2542 				/*
2543 				 * we're done with the data from the user specified buffer(s)
2544 				 * and we've been requested to zero fill at the tail
2545 				 * treat this as an IO_HEADZEROFILL which doesn't require a uio
2546 				 * by rearranging the args and passing in IO_HEADZEROFILL
2547 				 */
2548 
2549 				/*
2550 				 * Update the oldEOF to reflect the current EOF. If the UPL page
2551 				 * to zero-fill is not valid (when F_NOCACHE is set), the
2552 				 * cluster_write_copy() will perform RMW on the UPL page when
2553 				 * the oldEOF is not aligned on page boundary due to unaligned
2554 				 * write.
2555 				 */
2556 				if (uio->uio_offset > oldEOF) {
2557 					oldEOF = uio->uio_offset;
2558 				}
2559 				retval = cluster_write_copy(vp, (struct uio *)0, (u_int32_t)0, (off_t)oldEOF, tailOff, uio->uio_offset,
2560 				    (off_t)0, zflags | IO_HEADZEROFILL | IO_SYNC, callback, callback_arg);
2561 			}
2562 			break;
2563 
2564 		case IO_DIRECT:
2565 			/*
2566 			 * cluster_write_direct is never called with IO_TAILZEROFILL || IO_HEADZEROFILL
2567 			 */
2568 			retval = cluster_write_direct(vp, uio, oldEOF, newEOF, &write_type, &write_length, flags, callback, callback_arg);
2569 			break;
2570 
2571 		case IO_UNKNOWN:
2572 			retval = cluster_io_type(uio, &write_type, &write_length, MIN_DIRECT_WRITE_SIZE);
2573 			break;
2574 		}
2575 		/*
2576 		 * in case we end up calling cluster_write_copy (from cluster_write_direct)
2577 		 * multiple times to service a multi-vector request that is not aligned properly
2578 		 * we need to update the oldEOF so that we
2579 		 * don't zero-fill the head of a page if we've successfully written
2580 		 * data to that area... 'cluster_write_copy' will zero-fill the head of a
2581 		 * page that is beyond the oldEOF if the write is unaligned... we only
2582 		 * want that to happen for the very first page of the cluster_write,
2583 		 * NOT the first page of each vector making up a multi-vector write.
2584 		 */
2585 		if (uio->uio_offset > oldEOF) {
2586 			oldEOF = uio->uio_offset;
2587 		}
2588 	}
2589 	return retval;
2590 }
2591 
2592 
2593 static int
cluster_write_direct(vnode_t vp,struct uio * uio,off_t oldEOF,off_t newEOF,int * write_type,u_int32_t * write_length,int flags,int (* callback)(buf_t,void *),void * callback_arg)2594 cluster_write_direct(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, int *write_type, u_int32_t *write_length,
2595     int flags, int (*callback)(buf_t, void *), void *callback_arg)
2596 {
2597 	upl_t            upl = NULL;
2598 	upl_page_info_t  *pl;
2599 	vm_offset_t      upl_offset;
2600 	vm_offset_t      vector_upl_offset = 0;
2601 	u_int32_t        io_req_size;
2602 	u_int32_t        offset_in_file;
2603 	u_int32_t        offset_in_iovbase;
2604 	u_int32_t        io_size;
2605 	int              io_flag = 0;
2606 	upl_size_t       upl_size = 0, vector_upl_size = 0;
2607 	vm_size_t        upl_needed_size;
2608 	mach_msg_type_number_t  pages_in_pl;
2609 	upl_control_flags_t upl_flags;
2610 	kern_return_t    kret;
2611 	mach_msg_type_number_t  i;
2612 	int              force_data_sync;
2613 	int              retval = 0;
2614 	int              first_IO = 1;
2615 	struct clios     iostate;
2616 	user_addr_t      iov_base;
2617 	u_int32_t        mem_alignment_mask;
2618 	u_int32_t        devblocksize;
2619 	u_int32_t        max_io_size;
2620 	u_int32_t        max_upl_size;
2621 	u_int32_t        max_vector_size;
2622 	u_int32_t        bytes_outstanding_limit;
2623 	boolean_t        io_throttled = FALSE;
2624 
2625 	u_int32_t        vector_upl_iosize = 0;
2626 	int              issueVectorUPL = 0, useVectorUPL = (uio->uio_iovcnt > 1);
2627 	off_t            v_upl_uio_offset = 0;
2628 	int              vector_upl_index = 0;
2629 	upl_t            vector_upl = NULL;
2630 
2631 
2632 	/*
2633 	 * When we enter this routine, we know
2634 	 *  -- the resid will not exceed iov_len
2635 	 */
2636 	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_START,
2637 	    (int)uio->uio_offset, *write_length, (int)newEOF, 0, 0);
2638 
2639 	assert(vm_map_page_shift(current_map()) >= PAGE_SHIFT);
2640 
2641 	max_upl_size = cluster_max_io_size(vp->v_mount, CL_WRITE);
2642 
2643 	io_flag = CL_ASYNC | CL_PRESERVE | CL_COMMIT | CL_THROTTLE | CL_DIRECT_IO;
2644 
2645 	if (flags & IO_PASSIVE) {
2646 		io_flag |= CL_PASSIVE;
2647 	}
2648 
2649 	if (flags & IO_NOCACHE) {
2650 		io_flag |= CL_NOCACHE;
2651 	}
2652 
2653 	if (flags & IO_SKIP_ENCRYPTION) {
2654 		io_flag |= CL_ENCRYPTED;
2655 	}
2656 
2657 	iostate.io_completed = 0;
2658 	iostate.io_issued = 0;
2659 	iostate.io_error = 0;
2660 	iostate.io_wanted = 0;
2661 
2662 	lck_mtx_init(&iostate.io_mtxp, &cl_mtx_grp, LCK_ATTR_NULL);
2663 
2664 	mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask;
2665 	devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize;
2666 
2667 	if (devblocksize == 1) {
2668 		/*
2669 		 * the AFP client advertises a devblocksize of 1
2670 		 * however, its BLOCKMAP routine maps to physical
2671 		 * blocks that are PAGE_SIZE in size...
2672 		 * therefore we can't ask for I/Os that aren't page aligned
2673 		 * or aren't multiples of PAGE_SIZE in size
2674 		 * by setting devblocksize to PAGE_SIZE, we re-instate
2675 		 * the old behavior we had before the mem_alignment_mask
2676 		 * changes went in...
2677 		 */
2678 		devblocksize = PAGE_SIZE;
2679 	}
2680 
2681 next_dwrite:
2682 	io_req_size = *write_length;
2683 	iov_base = uio_curriovbase(uio);
2684 
2685 	offset_in_file = (u_int32_t)uio->uio_offset & PAGE_MASK;
2686 	offset_in_iovbase = (u_int32_t)iov_base & mem_alignment_mask;
2687 
2688 	if (offset_in_file || offset_in_iovbase) {
2689 		/*
2690 		 * one of the 2 important offsets is misaligned
2691 		 * so fire an I/O through the cache for this entire vector
2692 		 */
2693 		goto wait_for_dwrites;
2694 	}
2695 	if (iov_base & (devblocksize - 1)) {
2696 		/*
2697 		 * the offset in memory must be on a device block boundary
2698 		 * so that we can guarantee that we can generate an
2699 		 * I/O that ends on a page boundary in cluster_io
2700 		 */
2701 		goto wait_for_dwrites;
2702 	}
2703 
2704 	task_update_logical_writes(current_task(), (io_req_size & ~PAGE_MASK), TASK_WRITE_IMMEDIATE, vp);
2705 	while (io_req_size >= PAGE_SIZE && uio->uio_offset < newEOF && retval == 0) {
2706 		int     throttle_type;
2707 
2708 		if ((throttle_type = cluster_is_throttled(vp))) {
2709 			/*
2710 			 * we're in the throttle window, at the very least
2711 			 * we want to limit the size of the I/O we're about
2712 			 * to issue
2713 			 */
2714 			if ((flags & IO_RETURN_ON_THROTTLE) && throttle_type == THROTTLE_NOW) {
2715 				/*
2716 				 * we're in the throttle window and at least 1 I/O
2717 				 * has already been issued by a throttleable thread
2718 				 * in this window, so return with EAGAIN to indicate
2719 				 * to the FS issuing the cluster_write call that it
2720 				 * should now throttle after dropping any locks
2721 				 */
2722 				throttle_info_update_by_mount(vp->v_mount);
2723 
2724 				io_throttled = TRUE;
2725 				goto wait_for_dwrites;
2726 			}
2727 			max_vector_size = THROTTLE_MAX_IOSIZE;
2728 			max_io_size = THROTTLE_MAX_IOSIZE;
2729 		} else {
2730 			max_vector_size = MAX_VECTOR_UPL_SIZE;
2731 			max_io_size = max_upl_size;
2732 		}
2733 
2734 		if (first_IO) {
2735 			cluster_syncup(vp, newEOF, callback, callback_arg, callback ? PUSH_SYNC : 0);
2736 			first_IO = 0;
2737 		}
2738 		io_size  = io_req_size & ~PAGE_MASK;
2739 		iov_base = uio_curriovbase(uio);
2740 
2741 		if (io_size > max_io_size) {
2742 			io_size = max_io_size;
2743 		}
2744 
2745 		if (useVectorUPL && (iov_base & PAGE_MASK)) {
2746 			/*
2747 			 * We have an iov_base that's not page-aligned.
2748 			 * Issue all I/O's that have been collected within
2749 			 * this Vectored UPL.
2750 			 */
2751 			if (vector_upl_index) {
2752 				retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
2753 				reset_vector_run_state();
2754 			}
2755 
2756 			/*
2757 			 * After this point, if we are using the Vector UPL path and the base is
2758 			 * not page-aligned then the UPL with that base will be the first in the vector UPL.
2759 			 */
2760 		}
2761 
2762 		upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK);
2763 		upl_needed_size = (upl_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
2764 
2765 		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_START,
2766 		    (int)upl_offset, upl_needed_size, (int)iov_base, io_size, 0);
2767 
2768 		vm_map_t map = UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ? current_map() : kernel_map;
2769 		for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) {
2770 			pages_in_pl = 0;
2771 			upl_size = (upl_size_t)upl_needed_size;
2772 			upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
2773 			    UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
2774 
2775 			kret = vm_map_get_upl(map,
2776 			    (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
2777 			    &upl_size,
2778 			    &upl,
2779 			    NULL,
2780 			    &pages_in_pl,
2781 			    &upl_flags,
2782 			    VM_KERN_MEMORY_FILE,
2783 			    force_data_sync);
2784 
2785 			if (kret != KERN_SUCCESS) {
2786 				KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
2787 				    0, 0, 0, kret, 0);
2788 				/*
2789 				 * failed to get pagelist
2790 				 *
2791 				 * we may have already spun some portion of this request
2792 				 * off as async requests... we need to wait for the I/O
2793 				 * to complete before returning
2794 				 */
2795 				goto wait_for_dwrites;
2796 			}
2797 			pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
2798 			pages_in_pl = upl_size / PAGE_SIZE;
2799 
2800 			for (i = 0; i < pages_in_pl; i++) {
2801 				if (!upl_valid_page(pl, i)) {
2802 					break;
2803 				}
2804 			}
2805 			if (i == pages_in_pl) {
2806 				break;
2807 			}
2808 
2809 			/*
2810 			 * didn't get all the pages back that we
2811 			 * needed... release this upl and try again
2812 			 */
2813 			ubc_upl_abort(upl, 0);
2814 		}
2815 		if (force_data_sync >= 3) {
2816 			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
2817 			    i, pages_in_pl, upl_size, kret, 0);
2818 			/*
2819 			 * for some reason, we couldn't acquire a hold on all
2820 			 * the pages needed in the user's address space
2821 			 *
2822 			 * we may have already spun some portion of this request
2823 			 * off as async requests... we need to wait for the I/O
2824 			 * to complete before returning
2825 			 */
2826 			goto wait_for_dwrites;
2827 		}
2828 
2829 		/*
2830 		 * Consider the possibility that upl_size wasn't satisfied.
2831 		 */
2832 		if (upl_size < upl_needed_size) {
2833 			if (upl_size && upl_offset == 0) {
2834 				io_size = upl_size;
2835 			} else {
2836 				io_size = 0;
2837 			}
2838 		}
2839 		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
2840 		    (int)upl_offset, upl_size, (int)iov_base, io_size, 0);
2841 
2842 		if (io_size == 0) {
2843 			ubc_upl_abort(upl, 0);
2844 			/*
2845 			 * we may have already spun some portion of this request
2846 			 * off as async requests... we need to wait for the I/O
2847 			 * to complete before returning
2848 			 */
2849 			goto wait_for_dwrites;
2850 		}
2851 
2852 		if (useVectorUPL) {
2853 			vm_offset_t end_off = ((iov_base + io_size) & PAGE_MASK);
2854 			if (end_off) {
2855 				issueVectorUPL = 1;
2856 			}
2857 			/*
2858 			 * After this point, if we are using a vector UPL, then
2859 			 * either all the UPL elements end on a page boundary OR
2860 			 * this UPL is the last element because it does not end
2861 			 * on a page boundary.
2862 			 */
2863 		}
2864 
2865 		/*
2866 		 * we want push out these writes asynchronously so that we can overlap
2867 		 * the preparation of the next I/O
2868 		 * if there are already too many outstanding writes
2869 		 * wait until some complete before issuing the next
2870 		 */
2871 		if (vp->v_mount->mnt_minsaturationbytecount) {
2872 			bytes_outstanding_limit = vp->v_mount->mnt_minsaturationbytecount;
2873 		} else {
2874 			if (__improbable(os_mul_overflow(max_upl_size, IO_SCALE(vp, 2),
2875 			    &bytes_outstanding_limit) ||
2876 			    (bytes_outstanding_limit > overlapping_write_max))) {
2877 				bytes_outstanding_limit = overlapping_write_max;
2878 			}
2879 		}
2880 
2881 		cluster_iostate_wait(&iostate, bytes_outstanding_limit, "cluster_write_direct");
2882 
2883 		if (iostate.io_error) {
2884 			/*
2885 			 * one of the earlier writes we issued ran into a hard error
2886 			 * don't issue any more writes, cleanup the UPL
2887 			 * that was just created but not used, then
2888 			 * go wait for all writes that are part of this stream
2889 			 * to complete before returning the error to the caller
2890 			 */
2891 			ubc_upl_abort(upl, 0);
2892 
2893 			goto wait_for_dwrites;
2894 		}
2895 
2896 		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_START,
2897 		    (int)upl_offset, (int)uio->uio_offset, io_size, io_flag, 0);
2898 
2899 		if (!useVectorUPL) {
2900 			retval = cluster_io(vp, upl, upl_offset, uio->uio_offset,
2901 			    io_size, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
2902 		} else {
2903 			if (!vector_upl_index) {
2904 				vector_upl = vector_upl_create(upl_offset);
2905 				v_upl_uio_offset = uio->uio_offset;
2906 				vector_upl_offset = upl_offset;
2907 			}
2908 
2909 			vector_upl_set_subupl(vector_upl, upl, upl_size);
2910 			vector_upl_set_iostate(vector_upl, upl, vector_upl_size, upl_size);
2911 			vector_upl_index++;
2912 			vector_upl_iosize += io_size;
2913 			vector_upl_size += upl_size;
2914 
2915 			if (issueVectorUPL || vector_upl_index == MAX_VECTOR_UPL_ELEMENTS || vector_upl_size >= max_vector_size) {
2916 				retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
2917 				reset_vector_run_state();
2918 			}
2919 		}
2920 
2921 		/*
2922 		 * update the uio structure to
2923 		 * reflect the I/O that we just issued
2924 		 */
2925 		uio_update(uio, (user_size_t)io_size);
2926 
2927 		/*
2928 		 * in case we end up calling through to cluster_write_copy to finish
2929 		 * the tail of this request, we need to update the oldEOF so that we
2930 		 * don't zero-fill the head of a page if we've successfully written
2931 		 * data to that area... 'cluster_write_copy' will zero-fill the head of a
2932 		 * page that is beyond the oldEOF if the write is unaligned... we only
2933 		 * want that to happen for the very first page of the cluster_write,
2934 		 * NOT the first page of each vector making up a multi-vector write.
2935 		 */
2936 		if (uio->uio_offset > oldEOF) {
2937 			oldEOF = uio->uio_offset;
2938 		}
2939 
2940 		io_req_size -= io_size;
2941 
2942 		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_END,
2943 		    (int)upl_offset, (int)uio->uio_offset, io_req_size, retval, 0);
2944 	} /* end while */
2945 
2946 	if (retval == 0 && iostate.io_error == 0 && io_req_size == 0) {
2947 		retval = cluster_io_type(uio, write_type, write_length, MIN_DIRECT_WRITE_SIZE);
2948 
2949 		if (retval == 0 && *write_type == IO_DIRECT) {
2950 			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_NONE,
2951 			    (int)uio->uio_offset, *write_length, (int)newEOF, 0, 0);
2952 
2953 			goto next_dwrite;
2954 		}
2955 	}
2956 
2957 wait_for_dwrites:
2958 
2959 	if (retval == 0 && iostate.io_error == 0 && useVectorUPL && vector_upl_index) {
2960 		retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
2961 		reset_vector_run_state();
2962 	}
2963 	/*
2964 	 * make sure all async writes issued as part of this stream
2965 	 * have completed before we return
2966 	 */
2967 	cluster_iostate_wait(&iostate, 0, "cluster_write_direct");
2968 
2969 	if (iostate.io_error) {
2970 		retval = iostate.io_error;
2971 	}
2972 
2973 	lck_mtx_destroy(&iostate.io_mtxp, &cl_mtx_grp);
2974 
2975 	if (io_throttled == TRUE && retval == 0) {
2976 		retval = EAGAIN;
2977 	}
2978 
2979 	if (io_req_size && retval == 0) {
2980 		/*
2981 		 * we couldn't handle the tail of this request in DIRECT mode
2982 		 * so fire it through the copy path
2983 		 *
2984 		 * note that flags will never have IO_HEADZEROFILL or IO_TAILZEROFILL set
2985 		 * so we can just pass 0 in for the headOff and tailOff
2986 		 */
2987 		if (uio->uio_offset > oldEOF) {
2988 			oldEOF = uio->uio_offset;
2989 		}
2990 
2991 		retval = cluster_write_copy(vp, uio, io_req_size, oldEOF, newEOF, (off_t)0, (off_t)0, flags, callback, callback_arg);
2992 
2993 		*write_type = IO_UNKNOWN;
2994 	}
2995 	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_END,
2996 	    (int)uio->uio_offset, io_req_size, retval, 4, 0);
2997 
2998 	return retval;
2999 }
3000 
3001 
3002 static int
cluster_write_contig(vnode_t vp,struct uio * uio,off_t newEOF,int * write_type,u_int32_t * write_length,int (* callback)(buf_t,void *),void * callback_arg,int bflag)3003 cluster_write_contig(vnode_t vp, struct uio *uio, off_t newEOF, int *write_type, u_int32_t *write_length,
3004     int (*callback)(buf_t, void *), void *callback_arg, int bflag)
3005 {
3006 	upl_page_info_t *pl;
3007 	addr64_t         src_paddr = 0;
3008 	upl_t            upl[MAX_VECTS];
3009 	vm_offset_t      upl_offset;
3010 	u_int32_t        tail_size = 0;
3011 	u_int32_t        io_size;
3012 	u_int32_t        xsize;
3013 	upl_size_t       upl_size;
3014 	vm_size_t        upl_needed_size;
3015 	mach_msg_type_number_t  pages_in_pl;
3016 	upl_control_flags_t upl_flags;
3017 	kern_return_t    kret;
3018 	struct clios     iostate;
3019 	int              error  = 0;
3020 	int              cur_upl = 0;
3021 	int              num_upl = 0;
3022 	int              n;
3023 	user_addr_t      iov_base;
3024 	u_int32_t        devblocksize;
3025 	u_int32_t        mem_alignment_mask;
3026 
3027 	/*
3028 	 * When we enter this routine, we know
3029 	 *  -- the io_req_size will not exceed iov_len
3030 	 *  -- the target address is physically contiguous
3031 	 */
3032 	cluster_syncup(vp, newEOF, callback, callback_arg, callback ? PUSH_SYNC : 0);
3033 
3034 	devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize;
3035 	mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask;
3036 
3037 	iostate.io_completed = 0;
3038 	iostate.io_issued = 0;
3039 	iostate.io_error = 0;
3040 	iostate.io_wanted = 0;
3041 
3042 	lck_mtx_init(&iostate.io_mtxp, &cl_mtx_grp, LCK_ATTR_NULL);
3043 
3044 next_cwrite:
3045 	io_size = *write_length;
3046 
3047 	iov_base = uio_curriovbase(uio);
3048 
3049 	upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK);
3050 	upl_needed_size = upl_offset + io_size;
3051 
3052 	pages_in_pl = 0;
3053 	upl_size = (upl_size_t)upl_needed_size;
3054 	upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
3055 	    UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
3056 
3057 	vm_map_t map = UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ? current_map() : kernel_map;
3058 	kret = vm_map_get_upl(map,
3059 	    vm_map_trunc_page(iov_base, vm_map_page_mask(map)),
3060 	    &upl_size, &upl[cur_upl], NULL, &pages_in_pl, &upl_flags, VM_KERN_MEMORY_FILE, 0);
3061 
3062 	if (kret != KERN_SUCCESS) {
3063 		/*
3064 		 * failed to get pagelist
3065 		 */
3066 		error = EINVAL;
3067 		goto wait_for_cwrites;
3068 	}
3069 	num_upl++;
3070 
3071 	/*
3072 	 * Consider the possibility that upl_size wasn't satisfied.
3073 	 */
3074 	if (upl_size < upl_needed_size) {
3075 		/*
3076 		 * This is a failure in the physical memory case.
3077 		 */
3078 		error = EINVAL;
3079 		goto wait_for_cwrites;
3080 	}
3081 	pl = ubc_upl_pageinfo(upl[cur_upl]);
3082 
3083 	src_paddr = ((addr64_t)upl_phys_page(pl, 0) << PAGE_SHIFT) + (addr64_t)upl_offset;
3084 
3085 	while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {
3086 		u_int32_t   head_size;
3087 
3088 		head_size = devblocksize - (u_int32_t)(uio->uio_offset & (devblocksize - 1));
3089 
3090 		if (head_size > io_size) {
3091 			head_size = io_size;
3092 		}
3093 
3094 		error = cluster_align_phys_io(vp, uio, src_paddr, head_size, 0, callback, callback_arg);
3095 
3096 		if (error) {
3097 			goto wait_for_cwrites;
3098 		}
3099 
3100 		upl_offset += head_size;
3101 		src_paddr  += head_size;
3102 		io_size    -= head_size;
3103 
3104 		iov_base   += head_size;
3105 	}
3106 	if ((u_int32_t)iov_base & mem_alignment_mask) {
3107 		/*
3108 		 * request doesn't set up on a memory boundary
3109 		 * the underlying DMA engine can handle...
3110 		 * return an error instead of going through
3111 		 * the slow copy path since the intent of this
3112 		 * path is direct I/O from device memory
3113 		 */
3114 		error = EINVAL;
3115 		goto wait_for_cwrites;
3116 	}
3117 
3118 	tail_size = io_size & (devblocksize - 1);
3119 	io_size  -= tail_size;
3120 
3121 	while (io_size && error == 0) {
3122 		if (io_size > MAX_IO_CONTIG_SIZE) {
3123 			xsize = MAX_IO_CONTIG_SIZE;
3124 		} else {
3125 			xsize = io_size;
3126 		}
3127 		/*
3128 		 * request asynchronously so that we can overlap
3129 		 * the preparation of the next I/O... we'll do
3130 		 * the commit after all the I/O has completed
3131 		 * since its all issued against the same UPL
3132 		 * if there are already too many outstanding writes
3133 		 * wait until some have completed before issuing the next
3134 		 */
3135 		cluster_iostate_wait(&iostate, MAX_IO_CONTIG_SIZE * IO_SCALE(vp, 2), "cluster_write_contig");
3136 
3137 		if (iostate.io_error) {
3138 			/*
3139 			 * one of the earlier writes we issued ran into a hard error
3140 			 * don't issue any more writes...
3141 			 * go wait for all writes that are part of this stream
3142 			 * to complete before returning the error to the caller
3143 			 */
3144 			goto wait_for_cwrites;
3145 		}
3146 		/*
3147 		 * issue an asynchronous write to cluster_io
3148 		 */
3149 		error = cluster_io(vp, upl[cur_upl], upl_offset, uio->uio_offset,
3150 		    xsize, CL_DEV_MEMORY | CL_ASYNC | bflag, (buf_t)NULL, (struct clios *)&iostate, callback, callback_arg);
3151 
3152 		if (error == 0) {
3153 			/*
3154 			 * The cluster_io write completed successfully,
3155 			 * update the uio structure
3156 			 */
3157 			uio_update(uio, (user_size_t)xsize);
3158 
3159 			upl_offset += xsize;
3160 			src_paddr  += xsize;
3161 			io_size    -= xsize;
3162 		}
3163 	}
3164 	if (error == 0 && iostate.io_error == 0 && tail_size == 0 && num_upl < MAX_VECTS) {
3165 		error = cluster_io_type(uio, write_type, write_length, 0);
3166 
3167 		if (error == 0 && *write_type == IO_CONTIG) {
3168 			cur_upl++;
3169 			goto next_cwrite;
3170 		}
3171 	} else {
3172 		*write_type = IO_UNKNOWN;
3173 	}
3174 
3175 wait_for_cwrites:
3176 	/*
3177 	 * make sure all async writes that are part of this stream
3178 	 * have completed before we proceed
3179 	 */
3180 	cluster_iostate_wait(&iostate, 0, "cluster_write_contig");
3181 
3182 	if (iostate.io_error) {
3183 		error = iostate.io_error;
3184 	}
3185 
3186 	lck_mtx_destroy(&iostate.io_mtxp, &cl_mtx_grp);
3187 
3188 	if (error == 0 && tail_size) {
3189 		error = cluster_align_phys_io(vp, uio, src_paddr, tail_size, 0, callback, callback_arg);
3190 	}
3191 
3192 	for (n = 0; n < num_upl; n++) {
3193 		/*
3194 		 * just release our hold on each physically contiguous
3195 		 * region without changing any state
3196 		 */
3197 		ubc_upl_abort(upl[n], 0);
3198 	}
3199 
3200 	return error;
3201 }
3202 
3203 
3204 /*
3205  * need to avoid a race between an msync of a range of pages dirtied via mmap
3206  * vs a filesystem such as HFS deciding to write a 'hole' to disk via cluster_write's
3207  * zerofill mechanism before it has seen the VNOP_PAGEOUTs for the pages being msync'd
3208  *
3209  * we should never force-zero-fill pages that are already valid in the cache...
3210  * the entire page contains valid data (either from disk, zero-filled or dirtied
3211  * via an mmap) so we can only do damage by trying to zero-fill
3212  *
3213  */
3214 static int
cluster_zero_range(upl_t upl,upl_page_info_t * pl,int flags,int io_offset,off_t zero_off,off_t upl_f_offset,int bytes_to_zero)3215 cluster_zero_range(upl_t upl, upl_page_info_t *pl, int flags, int io_offset, off_t zero_off, off_t upl_f_offset, int bytes_to_zero)
3216 {
3217 	int zero_pg_index;
3218 	boolean_t need_cluster_zero = TRUE;
3219 
3220 	if ((flags & (IO_NOZEROVALID | IO_NOZERODIRTY))) {
3221 		bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off & PAGE_MASK_64));
3222 		zero_pg_index = (int)((zero_off - upl_f_offset) / PAGE_SIZE_64);
3223 
3224 		if (upl_valid_page(pl, zero_pg_index)) {
3225 			/*
3226 			 * never force zero valid pages - dirty or clean
3227 			 * we'll leave these in the UPL for cluster_write_copy to deal with
3228 			 */
3229 			need_cluster_zero = FALSE;
3230 		}
3231 	}
3232 	if (need_cluster_zero == TRUE) {
3233 		cluster_zero(upl, io_offset, bytes_to_zero, NULL);
3234 	}
3235 
3236 	return bytes_to_zero;
3237 }
3238 
3239 
3240 void
cluster_update_state(vnode_t vp,vm_object_offset_t s_offset,vm_object_offset_t e_offset,boolean_t vm_initiated)3241 cluster_update_state(vnode_t vp, vm_object_offset_t s_offset, vm_object_offset_t e_offset, boolean_t vm_initiated)
3242 {
3243 	struct cl_extent cl;
3244 	boolean_t first_pass = TRUE;
3245 
3246 	assert(s_offset < e_offset);
3247 	assert((s_offset & PAGE_MASK_64) == 0);
3248 	assert((e_offset & PAGE_MASK_64) == 0);
3249 
3250 	cl.b_addr = (daddr64_t)(s_offset / PAGE_SIZE_64);
3251 	cl.e_addr = (daddr64_t)(e_offset / PAGE_SIZE_64);
3252 
3253 	cluster_update_state_internal(vp, &cl, 0, TRUE, &first_pass, s_offset, (int)(e_offset - s_offset),
3254 	    vp->v_un.vu_ubcinfo->ui_size, NULL, NULL, vm_initiated);
3255 }
3256 
3257 
3258 static void
cluster_update_state_internal(vnode_t vp,struct cl_extent * cl,int flags,boolean_t defer_writes,boolean_t * first_pass,off_t write_off,int write_cnt,off_t newEOF,int (* callback)(buf_t,void *),void * callback_arg,boolean_t vm_initiated)3259 cluster_update_state_internal(vnode_t vp, struct cl_extent *cl, int flags, boolean_t defer_writes,
3260     boolean_t *first_pass, off_t write_off, int write_cnt, off_t newEOF,
3261     int (*callback)(buf_t, void *), void *callback_arg, boolean_t vm_initiated)
3262 {
3263 	struct cl_writebehind *wbp;
3264 	int     cl_index;
3265 	int     ret_cluster_try_push;
3266 	u_int   max_cluster_pgcount;
3267 
3268 
3269 	max_cluster_pgcount = MAX_CLUSTER_SIZE(vp) / PAGE_SIZE;
3270 
3271 	/*
3272 	 * take the lock to protect our accesses
3273 	 * of the writebehind and sparse cluster state
3274 	 */
3275 	wbp = cluster_get_wbp(vp, CLW_ALLOCATE | CLW_RETURNLOCKED);
3276 
3277 	if (wbp->cl_scmap) {
3278 		if (!(flags & IO_NOCACHE)) {
3279 			/*
3280 			 * we've fallen into the sparse
3281 			 * cluster method of delaying dirty pages
3282 			 */
3283 			sparse_cluster_add(wbp, &(wbp->cl_scmap), vp, cl, newEOF, callback, callback_arg, vm_initiated);
3284 
3285 			lck_mtx_unlock(&wbp->cl_lockw);
3286 			return;
3287 		}
3288 		/*
3289 		 * must have done cached writes that fell into
3290 		 * the sparse cluster mechanism... we've switched
3291 		 * to uncached writes on the file, so go ahead
3292 		 * and push whatever's in the sparse map
3293 		 * and switch back to normal clustering
3294 		 */
3295 		wbp->cl_number = 0;
3296 
3297 		sparse_cluster_push(wbp, &(wbp->cl_scmap), vp, newEOF, PUSH_ALL, 0, callback, callback_arg, vm_initiated);
3298 		/*
3299 		 * no clusters of either type present at this point
3300 		 * so just go directly to start_new_cluster since
3301 		 * we know we need to delay this I/O since we've
3302 		 * already released the pages back into the cache
3303 		 * to avoid the deadlock with sparse_cluster_push
3304 		 */
3305 		goto start_new_cluster;
3306 	}
3307 	if (*first_pass == TRUE) {
3308 		if (write_off == wbp->cl_last_write) {
3309 			wbp->cl_seq_written += write_cnt;
3310 		} else {
3311 			wbp->cl_seq_written = write_cnt;
3312 		}
3313 
3314 		wbp->cl_last_write = write_off + write_cnt;
3315 
3316 		*first_pass = FALSE;
3317 	}
3318 	if (wbp->cl_number == 0) {
3319 		/*
3320 		 * no clusters currently present
3321 		 */
3322 		goto start_new_cluster;
3323 	}
3324 
3325 	for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
3326 		/*
3327 		 * check each cluster that we currently hold
3328 		 * try to merge some or all of this write into
3329 		 * one or more of the existing clusters... if
3330 		 * any portion of the write remains, start a
3331 		 * new cluster
3332 		 */
3333 		if (cl->b_addr >= wbp->cl_clusters[cl_index].b_addr) {
3334 			/*
3335 			 * the current write starts at or after the current cluster
3336 			 */
3337 			if (cl->e_addr <= (wbp->cl_clusters[cl_index].b_addr + max_cluster_pgcount)) {
3338 				/*
3339 				 * we have a write that fits entirely
3340 				 * within the existing cluster limits
3341 				 */
3342 				if (cl->e_addr > wbp->cl_clusters[cl_index].e_addr) {
3343 					/*
3344 					 * update our idea of where the cluster ends
3345 					 */
3346 					wbp->cl_clusters[cl_index].e_addr = cl->e_addr;
3347 				}
3348 				break;
3349 			}
3350 			if (cl->b_addr < (wbp->cl_clusters[cl_index].b_addr + max_cluster_pgcount)) {
3351 				/*
3352 				 * we have a write that starts in the middle of the current cluster
3353 				 * but extends beyond the cluster's limit... we know this because
3354 				 * of the previous checks
3355 				 * we'll extend the current cluster to the max
3356 				 * and update the b_addr for the current write to reflect that
3357 				 * the head of it was absorbed into this cluster...
3358 				 * note that we'll always have a leftover tail in this case since
3359 				 * full absorbtion would have occurred in the clause above
3360 				 */
3361 				wbp->cl_clusters[cl_index].e_addr = wbp->cl_clusters[cl_index].b_addr + max_cluster_pgcount;
3362 
3363 				cl->b_addr = wbp->cl_clusters[cl_index].e_addr;
3364 			}
3365 			/*
3366 			 * we come here for the case where the current write starts
3367 			 * beyond the limit of the existing cluster or we have a leftover
3368 			 * tail after a partial absorbtion
3369 			 *
3370 			 * in either case, we'll check the remaining clusters before
3371 			 * starting a new one
3372 			 */
3373 		} else {
3374 			/*
3375 			 * the current write starts in front of the cluster we're currently considering
3376 			 */
3377 			if ((wbp->cl_clusters[cl_index].e_addr - cl->b_addr) <= max_cluster_pgcount) {
3378 				/*
3379 				 * we can just merge the new request into
3380 				 * this cluster and leave it in the cache
3381 				 * since the resulting cluster is still
3382 				 * less than the maximum allowable size
3383 				 */
3384 				wbp->cl_clusters[cl_index].b_addr = cl->b_addr;
3385 
3386 				if (cl->e_addr > wbp->cl_clusters[cl_index].e_addr) {
3387 					/*
3388 					 * the current write completely
3389 					 * envelops the existing cluster and since
3390 					 * each write is limited to at most max_cluster_pgcount pages
3391 					 * we can just use the start and last blocknos of the write
3392 					 * to generate the cluster limits
3393 					 */
3394 					wbp->cl_clusters[cl_index].e_addr = cl->e_addr;
3395 				}
3396 				break;
3397 			}
3398 			/*
3399 			 * if we were to combine this write with the current cluster
3400 			 * we would exceed the cluster size limit.... so,
3401 			 * let's see if there's any overlap of the new I/O with
3402 			 * the cluster we're currently considering... in fact, we'll
3403 			 * stretch the cluster out to it's full limit and see if we
3404 			 * get an intersection with the current write
3405 			 *
3406 			 */
3407 			if (cl->e_addr > wbp->cl_clusters[cl_index].e_addr - max_cluster_pgcount) {
3408 				/*
3409 				 * the current write extends into the proposed cluster
3410 				 * clip the length of the current write after first combining it's
3411 				 * tail with the newly shaped cluster
3412 				 */
3413 				wbp->cl_clusters[cl_index].b_addr = wbp->cl_clusters[cl_index].e_addr - max_cluster_pgcount;
3414 
3415 				cl->e_addr = wbp->cl_clusters[cl_index].b_addr;
3416 			}
3417 			/*
3418 			 * if we get here, there was no way to merge
3419 			 * any portion of this write with this cluster
3420 			 * or we could only merge part of it which
3421 			 * will leave a tail...
3422 			 * we'll check the remaining clusters before starting a new one
3423 			 */
3424 		}
3425 	}
3426 	if (cl_index < wbp->cl_number) {
3427 		/*
3428 		 * we found an existing cluster(s) that we
3429 		 * could entirely merge this I/O into
3430 		 */
3431 		goto delay_io;
3432 	}
3433 
3434 	if (defer_writes == FALSE &&
3435 	    wbp->cl_number == MAX_CLUSTERS &&
3436 	    wbp->cl_seq_written >= (MAX_CLUSTERS * (max_cluster_pgcount * PAGE_SIZE))) {
3437 		uint32_t        n;
3438 
3439 		if (vp->v_mount->mnt_minsaturationbytecount) {
3440 			n = vp->v_mount->mnt_minsaturationbytecount / MAX_CLUSTER_SIZE(vp);
3441 
3442 			if (n > MAX_CLUSTERS) {
3443 				n = MAX_CLUSTERS;
3444 			}
3445 		} else {
3446 			n = 0;
3447 		}
3448 
3449 		if (n == 0) {
3450 			if (disk_conditioner_mount_is_ssd(vp->v_mount)) {
3451 				n = WRITE_BEHIND_SSD;
3452 			} else {
3453 				n = WRITE_BEHIND;
3454 			}
3455 		}
3456 		while (n--) {
3457 			cluster_try_push(wbp, vp, newEOF, 0, 0, callback, callback_arg, NULL, vm_initiated);
3458 		}
3459 	}
3460 	if (wbp->cl_number < MAX_CLUSTERS) {
3461 		/*
3462 		 * we didn't find an existing cluster to
3463 		 * merge into, but there's room to start
3464 		 * a new one
3465 		 */
3466 		goto start_new_cluster;
3467 	}
3468 	/*
3469 	 * no exisitng cluster to merge with and no
3470 	 * room to start a new one... we'll try
3471 	 * pushing one of the existing ones... if none of
3472 	 * them are able to be pushed, we'll switch
3473 	 * to the sparse cluster mechanism
3474 	 * cluster_try_push updates cl_number to the
3475 	 * number of remaining clusters... and
3476 	 * returns the number of currently unused clusters
3477 	 */
3478 	ret_cluster_try_push = 0;
3479 
3480 	/*
3481 	 * if writes are not deferred, call cluster push immediately
3482 	 */
3483 	if (defer_writes == FALSE) {
3484 		ret_cluster_try_push = cluster_try_push(wbp, vp, newEOF, (flags & IO_NOCACHE) ? 0 : PUSH_DELAY, 0, callback, callback_arg, NULL, vm_initiated);
3485 	}
3486 	/*
3487 	 * execute following regardless of writes being deferred or not
3488 	 */
3489 	if (ret_cluster_try_push == 0) {
3490 		/*
3491 		 * no more room in the normal cluster mechanism
3492 		 * so let's switch to the more expansive but expensive
3493 		 * sparse mechanism....
3494 		 */
3495 		sparse_cluster_switch(wbp, vp, newEOF, callback, callback_arg, vm_initiated);
3496 		sparse_cluster_add(wbp, &(wbp->cl_scmap), vp, cl, newEOF, callback, callback_arg, vm_initiated);
3497 
3498 		lck_mtx_unlock(&wbp->cl_lockw);
3499 		return;
3500 	}
3501 start_new_cluster:
3502 	wbp->cl_clusters[wbp->cl_number].b_addr = cl->b_addr;
3503 	wbp->cl_clusters[wbp->cl_number].e_addr = cl->e_addr;
3504 
3505 	wbp->cl_clusters[wbp->cl_number].io_flags = 0;
3506 
3507 	if (flags & IO_NOCACHE) {
3508 		wbp->cl_clusters[wbp->cl_number].io_flags |= CLW_IONOCACHE;
3509 	}
3510 
3511 	if (flags & IO_PASSIVE) {
3512 		wbp->cl_clusters[wbp->cl_number].io_flags |= CLW_IOPASSIVE;
3513 	}
3514 
3515 	wbp->cl_number++;
3516 delay_io:
3517 	lck_mtx_unlock(&wbp->cl_lockw);
3518 	return;
3519 }
3520 
3521 
3522 static int
cluster_write_copy(vnode_t vp,struct uio * uio,u_int32_t io_req_size,off_t oldEOF,off_t newEOF,off_t headOff,off_t tailOff,int flags,int (* callback)(buf_t,void *),void * callback_arg)3523 cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t oldEOF, off_t newEOF, off_t headOff,
3524     off_t tailOff, int flags, int (*callback)(buf_t, void *), void *callback_arg)
3525 {
3526 	upl_page_info_t *pl;
3527 	upl_t            upl;
3528 	vm_offset_t      upl_offset = 0;
3529 	vm_size_t        upl_size;
3530 	off_t            upl_f_offset;
3531 	int              pages_in_upl;
3532 	int              start_offset;
3533 	int              xfer_resid;
3534 	int              io_size;
3535 	int              io_offset;
3536 	int              bytes_to_zero;
3537 	int              bytes_to_move;
3538 	kern_return_t    kret;
3539 	int              retval = 0;
3540 	int              io_resid;
3541 	long long        total_size;
3542 	long long        zero_cnt;
3543 	off_t            zero_off;
3544 	long long        zero_cnt1;
3545 	off_t            zero_off1;
3546 	off_t            write_off = 0;
3547 	int              write_cnt = 0;
3548 	boolean_t        first_pass = FALSE;
3549 	struct cl_extent cl;
3550 	int              bflag;
3551 	u_int            max_io_size;
3552 
3553 	if (uio) {
3554 		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
3555 		    (int)uio->uio_offset, io_req_size, (int)oldEOF, (int)newEOF, 0);
3556 
3557 		io_resid = io_req_size;
3558 	} else {
3559 		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
3560 		    0, 0, (int)oldEOF, (int)newEOF, 0);
3561 
3562 		io_resid = 0;
3563 	}
3564 	if (flags & IO_PASSIVE) {
3565 		bflag = CL_PASSIVE;
3566 	} else {
3567 		bflag = 0;
3568 	}
3569 	if (flags & IO_NOCACHE) {
3570 		bflag |= CL_NOCACHE;
3571 	}
3572 
3573 	if (flags & IO_SKIP_ENCRYPTION) {
3574 		bflag |= CL_ENCRYPTED;
3575 	}
3576 
3577 	zero_cnt  = 0;
3578 	zero_cnt1 = 0;
3579 	zero_off  = 0;
3580 	zero_off1 = 0;
3581 
3582 	max_io_size = cluster_max_io_size(vp->v_mount, CL_WRITE);
3583 
3584 	if (flags & IO_HEADZEROFILL) {
3585 		/*
3586 		 * some filesystems (HFS is one) don't support unallocated holes within a file...
3587 		 * so we zero fill the intervening space between the old EOF and the offset
3588 		 * where the next chunk of real data begins.... ftruncate will also use this
3589 		 * routine to zero fill to the new EOF when growing a file... in this case, the
3590 		 * uio structure will not be provided
3591 		 */
3592 		if (uio) {
3593 			if (headOff < uio->uio_offset) {
3594 				zero_cnt = uio->uio_offset - headOff;
3595 				zero_off = headOff;
3596 			}
3597 		} else if (headOff < newEOF) {
3598 			zero_cnt = newEOF - headOff;
3599 			zero_off = headOff;
3600 		}
3601 	} else {
3602 		if (uio && uio->uio_offset > oldEOF) {
3603 			zero_off = uio->uio_offset & ~PAGE_MASK_64;
3604 
3605 			if (zero_off >= oldEOF) {
3606 				zero_cnt = uio->uio_offset - zero_off;
3607 
3608 				flags |= IO_HEADZEROFILL;
3609 			}
3610 		}
3611 	}
3612 	if (flags & IO_TAILZEROFILL) {
3613 		if (uio) {
3614 			zero_off1 = uio->uio_offset + io_req_size;
3615 
3616 			if (zero_off1 < tailOff) {
3617 				zero_cnt1 = tailOff - zero_off1;
3618 			}
3619 		}
3620 	} else {
3621 		if (uio && newEOF > oldEOF) {
3622 			zero_off1 = uio->uio_offset + io_req_size;
3623 
3624 			if (zero_off1 == newEOF && (zero_off1 & PAGE_MASK_64)) {
3625 				zero_cnt1 = PAGE_SIZE_64 - (zero_off1 & PAGE_MASK_64);
3626 
3627 				flags |= IO_TAILZEROFILL;
3628 			}
3629 		}
3630 	}
3631 	if (zero_cnt == 0 && uio == (struct uio *) 0) {
3632 		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END,
3633 		    retval, 0, 0, 0, 0);
3634 		return 0;
3635 	}
3636 	if (uio) {
3637 		write_off = uio->uio_offset;
3638 		write_cnt = (int)uio_resid(uio);
3639 		/*
3640 		 * delay updating the sequential write info
3641 		 * in the control block until we've obtained
3642 		 * the lock for it
3643 		 */
3644 		first_pass = TRUE;
3645 	}
3646 	while ((total_size = (io_resid + zero_cnt + zero_cnt1)) && retval == 0) {
3647 		/*
3648 		 * for this iteration of the loop, figure out where our starting point is
3649 		 */
3650 		if (zero_cnt) {
3651 			start_offset = (int)(zero_off & PAGE_MASK_64);
3652 			upl_f_offset = zero_off - start_offset;
3653 		} else if (io_resid) {
3654 			start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
3655 			upl_f_offset = uio->uio_offset - start_offset;
3656 		} else {
3657 			start_offset = (int)(zero_off1 & PAGE_MASK_64);
3658 			upl_f_offset = zero_off1 - start_offset;
3659 		}
3660 		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 46)) | DBG_FUNC_NONE,
3661 		    (int)zero_off, (int)zero_cnt, (int)zero_off1, (int)zero_cnt1, 0);
3662 
3663 		if (total_size > max_io_size) {
3664 			total_size = max_io_size;
3665 		}
3666 
3667 		cl.b_addr = (daddr64_t)(upl_f_offset / PAGE_SIZE_64);
3668 
3669 		if (uio && ((flags & (IO_SYNC | IO_HEADZEROFILL | IO_TAILZEROFILL)) == 0)) {
3670 			/*
3671 			 * assumption... total_size <= io_resid
3672 			 * because IO_HEADZEROFILL and IO_TAILZEROFILL not set
3673 			 */
3674 			if ((start_offset + total_size) > max_io_size) {
3675 				total_size = max_io_size - start_offset;
3676 			}
3677 			xfer_resid = (int)total_size;
3678 
3679 			retval = cluster_copy_ubc_data_internal(vp, uio, &xfer_resid, 1, 1);
3680 
3681 			if (retval) {
3682 				break;
3683 			}
3684 
3685 			io_resid    -= (total_size - xfer_resid);
3686 			total_size   = xfer_resid;
3687 			start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
3688 			upl_f_offset = uio->uio_offset - start_offset;
3689 
3690 			if (total_size == 0) {
3691 				if (start_offset) {
3692 					/*
3693 					 * the write did not finish on a page boundary
3694 					 * which will leave upl_f_offset pointing to the
3695 					 * beginning of the last page written instead of
3696 					 * the page beyond it... bump it in this case
3697 					 * so that the cluster code records the last page
3698 					 * written as dirty
3699 					 */
3700 					upl_f_offset += PAGE_SIZE_64;
3701 				}
3702 				upl_size = 0;
3703 
3704 				goto check_cluster;
3705 			}
3706 		}
3707 		/*
3708 		 * compute the size of the upl needed to encompass
3709 		 * the requested write... limit each call to cluster_io
3710 		 * to the maximum UPL size... cluster_io will clip if
3711 		 * this exceeds the maximum io_size for the device,
3712 		 * make sure to account for
3713 		 * a starting offset that's not page aligned
3714 		 */
3715 		upl_size = (start_offset + total_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
3716 
3717 		if (upl_size > max_io_size) {
3718 			upl_size = max_io_size;
3719 		}
3720 
3721 		pages_in_upl = (int)(upl_size / PAGE_SIZE);
3722 		io_size      = (int)(upl_size - start_offset);
3723 
3724 		if ((long long)io_size > total_size) {
3725 			io_size = (int)total_size;
3726 		}
3727 
3728 		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_START, upl_size, io_size, total_size, 0, 0);
3729 
3730 
3731 		/*
3732 		 * Gather the pages from the buffer cache.
3733 		 * The UPL_WILL_MODIFY flag lets the UPL subsystem know
3734 		 * that we intend to modify these pages.
3735 		 */
3736 		kret = ubc_create_upl_kernel(vp,
3737 		    upl_f_offset,
3738 		    (int)upl_size,
3739 		    &upl,
3740 		    &pl,
3741 		    UPL_SET_LITE | ((uio != NULL && (uio->uio_flags & UIO_FLAGS_IS_COMPRESSED_FILE)) ? 0 : UPL_WILL_MODIFY),
3742 		    VM_KERN_MEMORY_FILE);
3743 		if (kret != KERN_SUCCESS) {
3744 			panic("cluster_write_copy: failed to get pagelist");
3745 		}
3746 
3747 		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_END,
3748 		    upl, (int)upl_f_offset, start_offset, 0, 0);
3749 
3750 		if (start_offset && upl_f_offset < oldEOF && !upl_valid_page(pl, 0)) {
3751 			int   read_size;
3752 
3753 			/*
3754 			 * we're starting in the middle of the first page of the upl
3755 			 * and the page isn't currently valid, so we're going to have
3756 			 * to read it in first... this is a synchronous operation
3757 			 */
3758 			read_size = PAGE_SIZE;
3759 
3760 			if ((upl_f_offset + read_size) > oldEOF) {
3761 				read_size = (int)(oldEOF - upl_f_offset);
3762 			}
3763 
3764 			retval = cluster_io(vp, upl, 0, upl_f_offset, read_size,
3765 			    CL_READ | bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
3766 			if (retval) {
3767 				/*
3768 				 * we had an error during the read which causes us to abort
3769 				 * the current cluster_write request... before we do, we need
3770 				 * to release the rest of the pages in the upl without modifying
3771 				 * there state and mark the failed page in error
3772 				 */
3773 				ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
3774 
3775 				if (upl_size > PAGE_SIZE) {
3776 					ubc_upl_abort_range(upl, 0, (upl_size_t)upl_size,
3777 					    UPL_ABORT_FREE_ON_EMPTY);
3778 				}
3779 
3780 				KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
3781 				    upl, 0, 0, retval, 0);
3782 				break;
3783 			}
3784 		}
3785 		if ((start_offset == 0 || upl_size > PAGE_SIZE) && ((start_offset + io_size) & PAGE_MASK)) {
3786 			/*
3787 			 * the last offset we're writing to in this upl does not end on a page
3788 			 * boundary... if it's not beyond the old EOF, then we'll also need to
3789 			 * pre-read this page in if it isn't already valid
3790 			 */
3791 			upl_offset = upl_size - PAGE_SIZE;
3792 
3793 			if ((upl_f_offset + start_offset + io_size) < oldEOF &&
3794 			    !upl_valid_page(pl, (int)(upl_offset / PAGE_SIZE))) {
3795 				int   read_size;
3796 
3797 				read_size = PAGE_SIZE;
3798 
3799 				if ((off_t)(upl_f_offset + upl_offset + read_size) > oldEOF) {
3800 					read_size = (int)(oldEOF - (upl_f_offset + upl_offset));
3801 				}
3802 
3803 				retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, read_size,
3804 				    CL_READ | bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
3805 				if (retval) {
3806 					/*
3807 					 * we had an error during the read which causes us to abort
3808 					 * the current cluster_write request... before we do, we
3809 					 * need to release the rest of the pages in the upl without
3810 					 * modifying there state and mark the failed page in error
3811 					 */
3812 					ubc_upl_abort_range(upl, (upl_offset_t)upl_offset, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
3813 
3814 					if (upl_size > PAGE_SIZE) {
3815 						ubc_upl_abort_range(upl, 0, (upl_size_t)upl_size, UPL_ABORT_FREE_ON_EMPTY);
3816 					}
3817 
3818 					KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
3819 					    upl, 0, 0, retval, 0);
3820 					break;
3821 				}
3822 			}
3823 		}
3824 		xfer_resid = io_size;
3825 		io_offset = start_offset;
3826 
3827 		while (zero_cnt && xfer_resid) {
3828 			if (zero_cnt < (long long)xfer_resid) {
3829 				bytes_to_zero = (int)zero_cnt;
3830 			} else {
3831 				bytes_to_zero = xfer_resid;
3832 			}
3833 
3834 			bytes_to_zero = cluster_zero_range(upl, pl, flags, io_offset, zero_off, upl_f_offset, bytes_to_zero);
3835 
3836 			xfer_resid -= bytes_to_zero;
3837 			zero_cnt   -= bytes_to_zero;
3838 			zero_off   += bytes_to_zero;
3839 			io_offset  += bytes_to_zero;
3840 		}
3841 		if (xfer_resid && io_resid) {
3842 			u_int32_t  io_requested;
3843 
3844 			bytes_to_move = min(io_resid, xfer_resid);
3845 			io_requested = bytes_to_move;
3846 
3847 			retval = cluster_copy_upl_data(uio, upl, io_offset, (int *)&io_requested);
3848 
3849 			if (retval) {
3850 				ubc_upl_abort_range(upl, 0, (upl_size_t)upl_size, UPL_ABORT_FREE_ON_EMPTY);
3851 
3852 				KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
3853 				    upl, 0, 0, retval, 0);
3854 			} else {
3855 				io_resid   -= bytes_to_move;
3856 				xfer_resid -= bytes_to_move;
3857 				io_offset  += bytes_to_move;
3858 			}
3859 		}
3860 		while (xfer_resid && zero_cnt1 && retval == 0) {
3861 			if (zero_cnt1 < (long long)xfer_resid) {
3862 				bytes_to_zero = (int)zero_cnt1;
3863 			} else {
3864 				bytes_to_zero = xfer_resid;
3865 			}
3866 
3867 			bytes_to_zero = cluster_zero_range(upl, pl, flags, io_offset, zero_off1, upl_f_offset, bytes_to_zero);
3868 
3869 			xfer_resid -= bytes_to_zero;
3870 			zero_cnt1  -= bytes_to_zero;
3871 			zero_off1  += bytes_to_zero;
3872 			io_offset  += bytes_to_zero;
3873 		}
3874 		if (retval == 0) {
3875 			int do_zeroing = 1;
3876 
3877 			io_size += start_offset;
3878 
3879 			/* Force more restrictive zeroing behavior only on APFS */
3880 			if ((vnode_tag(vp) == VT_APFS) && (newEOF < oldEOF)) {
3881 				do_zeroing = 0;
3882 			}
3883 
3884 			if (do_zeroing && (upl_f_offset + io_size) >= newEOF && (u_int)io_size < upl_size) {
3885 				/*
3886 				 * if we're extending the file with this write
3887 				 * we'll zero fill the rest of the page so that
3888 				 * if the file gets extended again in such a way as to leave a
3889 				 * hole starting at this EOF, we'll have zero's in the correct spot
3890 				 */
3891 				cluster_zero(upl, io_size, (int)(upl_size - io_size), NULL);
3892 			}
3893 			/*
3894 			 * release the upl now if we hold one since...
3895 			 * 1) pages in it may be present in the sparse cluster map
3896 			 *    and may span 2 separate buckets there... if they do and
3897 			 *    we happen to have to flush a bucket to make room and it intersects
3898 			 *    this upl, a deadlock may result on page BUSY
3899 			 * 2) we're delaying the I/O... from this point forward we're just updating
3900 			 *    the cluster state... no need to hold the pages, so commit them
3901 			 * 3) IO_SYNC is set...
3902 			 *    because we had to ask for a UPL that provides currenty non-present pages, the
3903 			 *    UPL has been automatically set to clear the dirty flags (both software and hardware)
3904 			 *    upon committing it... this is not the behavior we want since it's possible for
3905 			 *    pages currently present as part of a mapped file to be dirtied while the I/O is in flight.
3906 			 *    we'll pick these pages back up later with the correct behavior specified.
3907 			 * 4) we don't want to hold pages busy in a UPL and then block on the cluster lock... if a flush
3908 			 *    of this vnode is in progress, we will deadlock if the pages being flushed intersect the pages
3909 			 *    we hold since the flushing context is holding the cluster lock.
3910 			 */
3911 			ubc_upl_commit_range(upl, 0, (upl_size_t)upl_size,
3912 			    UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
3913 check_cluster:
3914 			/*
3915 			 * calculate the last logical block number
3916 			 * that this delayed I/O encompassed
3917 			 */
3918 			cl.e_addr = (daddr64_t)((upl_f_offset + (off_t)upl_size) / PAGE_SIZE_64);
3919 
3920 			if (flags & IO_SYNC) {
3921 				/*
3922 				 * if the IO_SYNC flag is set than we need to bypass
3923 				 * any clustering and immediately issue the I/O
3924 				 *
3925 				 * we don't hold the lock at this point
3926 				 *
3927 				 * we've already dropped the current upl, so pick it back up with COPYOUT_FROM set
3928 				 * so that we correctly deal with a change in state of the hardware modify bit...
3929 				 * we do this via cluster_push_now... by passing along the IO_SYNC flag, we force
3930 				 * cluster_push_now to wait until all the I/Os have completed... cluster_push_now is also
3931 				 * responsible for generating the correct sized I/O(s)
3932 				 */
3933 				retval = cluster_push_now(vp, &cl, newEOF, flags, callback, callback_arg, FALSE);
3934 			} else {
3935 				boolean_t defer_writes = FALSE;
3936 
3937 				if (vfs_flags(vp->v_mount) & MNT_DEFWRITE) {
3938 					defer_writes = TRUE;
3939 				}
3940 
3941 				cluster_update_state_internal(vp, &cl, flags, defer_writes, &first_pass,
3942 				    write_off, write_cnt, newEOF, callback, callback_arg, FALSE);
3943 			}
3944 		}
3945 	}
3946 	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END, retval, 0, io_resid, 0, 0);
3947 
3948 	return retval;
3949 }
3950 
3951 
3952 
3953 int
cluster_read(vnode_t vp,struct uio * uio,off_t filesize,int xflags)3954 cluster_read(vnode_t vp, struct uio *uio, off_t filesize, int xflags)
3955 {
3956 	return cluster_read_ext(vp, uio, filesize, xflags, NULL, NULL);
3957 }
3958 
3959 
3960 int
cluster_read_ext(vnode_t vp,struct uio * uio,off_t filesize,int xflags,int (* callback)(buf_t,void *),void * callback_arg)3961 cluster_read_ext(vnode_t vp, struct uio *uio, off_t filesize, int xflags, int (*callback)(buf_t, void *), void *callback_arg)
3962 {
3963 	int             retval = 0;
3964 	int             flags;
3965 	user_ssize_t    cur_resid;
3966 	u_int32_t       io_size;
3967 	u_int32_t       read_length = 0;
3968 	int             read_type = IO_COPY;
3969 
3970 	flags = xflags;
3971 
3972 	if (vp->v_flag & VNOCACHE_DATA) {
3973 		flags |= IO_NOCACHE;
3974 	}
3975 	if ((vp->v_flag & VRAOFF) || speculative_reads_disabled) {
3976 		flags |= IO_RAOFF;
3977 	}
3978 
3979 	if (flags & IO_SKIP_ENCRYPTION) {
3980 		flags |= IO_ENCRYPTED;
3981 	}
3982 
3983 	/*
3984 	 * do a read through the cache if one of the following is true....
3985 	 *   NOCACHE is not true
3986 	 *   the uio request doesn't target USERSPACE
3987 	 * Alternatively, if IO_ENCRYPTED is set, then we want to bypass the cache as well.
3988 	 * Reading encrypted data from a CP filesystem should never result in the data touching
3989 	 * the UBC.
3990 	 *
3991 	 * otherwise, find out if we want the direct or contig variant for
3992 	 * the first vector in the uio request
3993 	 */
3994 	if (((flags & IO_NOCACHE) && UIO_SEG_IS_USER_SPACE(uio->uio_segflg)) || (flags & IO_ENCRYPTED)) {
3995 		retval = cluster_io_type(uio, &read_type, &read_length, 0);
3996 	}
3997 
3998 	while ((cur_resid = uio_resid(uio)) && uio->uio_offset < filesize && retval == 0) {
3999 		switch (read_type) {
4000 		case IO_COPY:
4001 			/*
4002 			 * make sure the uio_resid isn't too big...
4003 			 * internally, we want to handle all of the I/O in
4004 			 * chunk sizes that fit in a 32 bit int
4005 			 */
4006 			if (cur_resid > (user_ssize_t)(MAX_IO_REQUEST_SIZE)) {
4007 				io_size = MAX_IO_REQUEST_SIZE;
4008 			} else {
4009 				io_size = (u_int32_t)cur_resid;
4010 			}
4011 
4012 			retval = cluster_read_copy(vp, uio, io_size, filesize, flags, callback, callback_arg);
4013 			break;
4014 
4015 		case IO_DIRECT:
4016 			retval = cluster_read_direct(vp, uio, filesize, &read_type, &read_length, flags, callback, callback_arg);
4017 			break;
4018 
4019 		case IO_CONTIG:
4020 			retval = cluster_read_contig(vp, uio, filesize, &read_type, &read_length, callback, callback_arg, flags);
4021 			break;
4022 
4023 		case IO_UNKNOWN:
4024 			retval = cluster_io_type(uio, &read_type, &read_length, 0);
4025 			break;
4026 		}
4027 	}
4028 	return retval;
4029 }
4030 
4031 
4032 
4033 static void
cluster_read_upl_release(upl_t upl,int start_pg,int last_pg,int take_reference)4034 cluster_read_upl_release(upl_t upl, int start_pg, int last_pg, int take_reference)
4035 {
4036 	int range;
4037 	int abort_flags = UPL_ABORT_FREE_ON_EMPTY;
4038 
4039 	if ((range = last_pg - start_pg)) {
4040 		if (take_reference) {
4041 			abort_flags |= UPL_ABORT_REFERENCE;
4042 		}
4043 
4044 		ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, range * PAGE_SIZE, abort_flags);
4045 	}
4046 }
4047 
4048 
4049 static int
cluster_read_copy(vnode_t vp,struct uio * uio,u_int32_t io_req_size,off_t filesize,int flags,int (* callback)(buf_t,void *),void * callback_arg)4050 cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t filesize, int flags, int (*callback)(buf_t, void *), void *callback_arg)
4051 {
4052 	upl_page_info_t *pl;
4053 	upl_t            upl;
4054 	vm_offset_t      upl_offset;
4055 	u_int32_t        upl_size;
4056 	off_t            upl_f_offset;
4057 	int              start_offset;
4058 	int              start_pg;
4059 	int              last_pg;
4060 	int              uio_last = 0;
4061 	int              pages_in_upl;
4062 	off_t            max_size;
4063 	off_t            last_ioread_offset;
4064 	off_t            last_request_offset;
4065 	kern_return_t    kret;
4066 	int              error  = 0;
4067 	int              retval = 0;
4068 	u_int32_t        size_of_prefetch;
4069 	u_int32_t        xsize;
4070 	u_int32_t        io_size;
4071 	u_int32_t        max_rd_size;
4072 	u_int32_t        max_io_size;
4073 	u_int32_t        max_prefetch;
4074 	u_int            rd_ahead_enabled = 1;
4075 	u_int            prefetch_enabled = 1;
4076 	struct cl_readahead *   rap;
4077 	struct clios            iostate;
4078 	struct cl_extent        extent;
4079 	int              bflag;
4080 	int              take_reference = 1;
4081 	int              policy = IOPOL_DEFAULT;
4082 	boolean_t        iolock_inited = FALSE;
4083 
4084 	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_START,
4085 	    (int)uio->uio_offset, io_req_size, (int)filesize, flags, 0);
4086 
4087 	if (flags & IO_ENCRYPTED) {
4088 		panic("encrypted blocks will hit UBC!");
4089 	}
4090 
4091 	policy = throttle_get_io_policy(NULL);
4092 
4093 	if (policy == THROTTLE_LEVEL_TIER3 || policy == THROTTLE_LEVEL_TIER2 || (flags & IO_NOCACHE)) {
4094 		take_reference = 0;
4095 	}
4096 
4097 	if (flags & IO_PASSIVE) {
4098 		bflag = CL_PASSIVE;
4099 	} else {
4100 		bflag = 0;
4101 	}
4102 
4103 	if (flags & IO_NOCACHE) {
4104 		bflag |= CL_NOCACHE;
4105 	}
4106 
4107 	if (flags & IO_SKIP_ENCRYPTION) {
4108 		bflag |= CL_ENCRYPTED;
4109 	}
4110 
4111 	max_io_size = cluster_max_io_size(vp->v_mount, CL_READ);
4112 	max_prefetch = cluster_max_prefetch(vp, max_io_size, prefetch_max);
4113 	max_rd_size = max_prefetch;
4114 
4115 	last_request_offset = uio->uio_offset + io_req_size;
4116 
4117 	if (last_request_offset > filesize) {
4118 		last_request_offset = filesize;
4119 	}
4120 
4121 	if ((flags & (IO_RAOFF | IO_NOCACHE)) || ((last_request_offset & ~PAGE_MASK_64) == (uio->uio_offset & ~PAGE_MASK_64))) {
4122 		rd_ahead_enabled = 0;
4123 		rap = NULL;
4124 	} else {
4125 		if (cluster_is_throttled(vp)) {
4126 			/*
4127 			 * we're in the throttle window, at the very least
4128 			 * we want to limit the size of the I/O we're about
4129 			 * to issue
4130 			 */
4131 			rd_ahead_enabled = 0;
4132 			prefetch_enabled = 0;
4133 
4134 			max_rd_size = THROTTLE_MAX_IOSIZE;
4135 		}
4136 		if ((rap = cluster_get_rap(vp)) == NULL) {
4137 			rd_ahead_enabled = 0;
4138 		} else {
4139 			extent.b_addr = uio->uio_offset / PAGE_SIZE_64;
4140 			extent.e_addr = (last_request_offset - 1) / PAGE_SIZE_64;
4141 		}
4142 	}
4143 	if (rap != NULL && rap->cl_ralen && (rap->cl_lastr == extent.b_addr || (rap->cl_lastr + 1) == extent.b_addr)) {
4144 		/*
4145 		 * determine if we already have a read-ahead in the pipe courtesy of the
4146 		 * last read systemcall that was issued...
4147 		 * if so, pick up it's extent to determine where we should start
4148 		 * with respect to any read-ahead that might be necessary to
4149 		 * garner all the data needed to complete this read systemcall
4150 		 */
4151 		last_ioread_offset = (rap->cl_maxra * PAGE_SIZE_64) + PAGE_SIZE_64;
4152 
4153 		if (last_ioread_offset < uio->uio_offset) {
4154 			last_ioread_offset = (off_t)0;
4155 		} else if (last_ioread_offset > last_request_offset) {
4156 			last_ioread_offset = last_request_offset;
4157 		}
4158 	} else {
4159 		last_ioread_offset = (off_t)0;
4160 	}
4161 
4162 	while (io_req_size && uio->uio_offset < filesize && retval == 0) {
4163 		max_size = filesize - uio->uio_offset;
4164 		bool leftover_upl_aborted = false;
4165 
4166 		if ((off_t)(io_req_size) < max_size) {
4167 			io_size = io_req_size;
4168 		} else {
4169 			io_size = (u_int32_t)max_size;
4170 		}
4171 
4172 		if (!(flags & IO_NOCACHE)) {
4173 			while (io_size) {
4174 				u_int32_t io_resid;
4175 				u_int32_t io_requested;
4176 
4177 				/*
4178 				 * if we keep finding the pages we need already in the cache, then
4179 				 * don't bother to call cluster_read_prefetch since it costs CPU cycles
4180 				 * to determine that we have all the pages we need... once we miss in
4181 				 * the cache and have issued an I/O, than we'll assume that we're likely
4182 				 * to continue to miss in the cache and it's to our advantage to try and prefetch
4183 				 */
4184 				if (last_request_offset && last_ioread_offset && (size_of_prefetch = (u_int32_t)(last_request_offset - last_ioread_offset))) {
4185 					if ((last_ioread_offset - uio->uio_offset) <= max_rd_size && prefetch_enabled) {
4186 						/*
4187 						 * we've already issued I/O for this request and
4188 						 * there's still work to do and
4189 						 * our prefetch stream is running dry, so issue a
4190 						 * pre-fetch I/O... the I/O latency will overlap
4191 						 * with the copying of the data
4192 						 */
4193 						if (size_of_prefetch > max_rd_size) {
4194 							size_of_prefetch = max_rd_size;
4195 						}
4196 
4197 						size_of_prefetch = cluster_read_prefetch(vp, last_ioread_offset, size_of_prefetch, filesize, callback, callback_arg, bflag);
4198 
4199 						last_ioread_offset += (off_t)(size_of_prefetch * PAGE_SIZE);
4200 
4201 						if (last_ioread_offset > last_request_offset) {
4202 							last_ioread_offset = last_request_offset;
4203 						}
4204 					}
4205 				}
4206 				/*
4207 				 * limit the size of the copy we're about to do so that
4208 				 * we can notice that our I/O pipe is running dry and
4209 				 * get the next I/O issued before it does go dry
4210 				 */
4211 				if (last_ioread_offset && io_size > (max_io_size / 4)) {
4212 					io_resid = (max_io_size / 4);
4213 				} else {
4214 					io_resid = io_size;
4215 				}
4216 
4217 				io_requested = io_resid;
4218 
4219 				retval = cluster_copy_ubc_data_internal(vp, uio, (int *)&io_resid, 0, take_reference);
4220 
4221 				xsize = io_requested - io_resid;
4222 
4223 				io_size -= xsize;
4224 				io_req_size -= xsize;
4225 
4226 				if (retval || io_resid) {
4227 					/*
4228 					 * if we run into a real error or
4229 					 * a page that is not in the cache
4230 					 * we need to leave streaming mode
4231 					 */
4232 					break;
4233 				}
4234 
4235 				if (rd_ahead_enabled && (io_size == 0 || last_ioread_offset == last_request_offset)) {
4236 					/*
4237 					 * we're already finished the I/O for this read request
4238 					 * let's see if we should do a read-ahead
4239 					 */
4240 					cluster_read_ahead(vp, &extent, filesize, rap, callback, callback_arg, bflag);
4241 				}
4242 			}
4243 			if (retval) {
4244 				break;
4245 			}
4246 			if (io_size == 0) {
4247 				if (rap != NULL) {
4248 					if (extent.e_addr < rap->cl_lastr) {
4249 						rap->cl_maxra = 0;
4250 					}
4251 					rap->cl_lastr = extent.e_addr;
4252 				}
4253 				break;
4254 			}
4255 			/*
4256 			 * recompute max_size since cluster_copy_ubc_data_internal
4257 			 * may have advanced uio->uio_offset
4258 			 */
4259 			max_size = filesize - uio->uio_offset;
4260 		}
4261 
4262 		iostate.io_completed = 0;
4263 		iostate.io_issued = 0;
4264 		iostate.io_error = 0;
4265 		iostate.io_wanted = 0;
4266 
4267 		if ((flags & IO_RETURN_ON_THROTTLE)) {
4268 			if (cluster_is_throttled(vp) == THROTTLE_NOW) {
4269 				if (!cluster_io_present_in_BC(vp, uio->uio_offset)) {
4270 					/*
4271 					 * we're in the throttle window and at least 1 I/O
4272 					 * has already been issued by a throttleable thread
4273 					 * in this window, so return with EAGAIN to indicate
4274 					 * to the FS issuing the cluster_read call that it
4275 					 * should now throttle after dropping any locks
4276 					 */
4277 					throttle_info_update_by_mount(vp->v_mount);
4278 
4279 					retval = EAGAIN;
4280 					break;
4281 				}
4282 			}
4283 		}
4284 
4285 		/*
4286 		 * compute the size of the upl needed to encompass
4287 		 * the requested read... limit each call to cluster_io
4288 		 * to the maximum UPL size... cluster_io will clip if
4289 		 * this exceeds the maximum io_size for the device,
4290 		 * make sure to account for
4291 		 * a starting offset that's not page aligned
4292 		 */
4293 		start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
4294 		upl_f_offset = uio->uio_offset - (off_t)start_offset;
4295 
4296 		if (io_size > max_rd_size) {
4297 			io_size = max_rd_size;
4298 		}
4299 
4300 		upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
4301 
4302 		if (flags & IO_NOCACHE) {
4303 			if (upl_size > max_io_size) {
4304 				upl_size = max_io_size;
4305 			}
4306 		} else {
4307 			if (upl_size > max_io_size / 4) {
4308 				upl_size = max_io_size / 4;
4309 				upl_size &= ~PAGE_MASK;
4310 
4311 				if (upl_size == 0) {
4312 					upl_size = PAGE_SIZE;
4313 				}
4314 			}
4315 		}
4316 		pages_in_upl = upl_size / PAGE_SIZE;
4317 
4318 		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_START,
4319 		    upl, (int)upl_f_offset, upl_size, start_offset, 0);
4320 
4321 		kret = ubc_create_upl_kernel(vp,
4322 		    upl_f_offset,
4323 		    upl_size,
4324 		    &upl,
4325 		    &pl,
4326 		    UPL_FILE_IO | UPL_SET_LITE,
4327 		    VM_KERN_MEMORY_FILE);
4328 		if (kret != KERN_SUCCESS) {
4329 			panic("cluster_read_copy: failed to get pagelist");
4330 		}
4331 
4332 		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_END,
4333 		    upl, (int)upl_f_offset, upl_size, start_offset, 0);
4334 
4335 		/*
4336 		 * scan from the beginning of the upl looking for the first
4337 		 * non-valid page.... this will become the first page in
4338 		 * the request we're going to make to 'cluster_io'... if all
4339 		 * of the pages are valid, we won't call through to 'cluster_io'
4340 		 */
4341 		for (start_pg = 0; start_pg < pages_in_upl; start_pg++) {
4342 			if (!upl_valid_page(pl, start_pg)) {
4343 				break;
4344 			}
4345 		}
4346 
4347 		/*
4348 		 * scan from the starting invalid page looking for a valid
4349 		 * page before the end of the upl is reached, if we
4350 		 * find one, then it will be the last page of the request to
4351 		 * 'cluster_io'
4352 		 */
4353 		for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
4354 			if (upl_valid_page(pl, last_pg)) {
4355 				break;
4356 			}
4357 		}
4358 
4359 		if (start_pg < last_pg) {
4360 			/*
4361 			 * we found a range of 'invalid' pages that must be filled
4362 			 * if the last page in this range is the last page of the file
4363 			 * we may have to clip the size of it to keep from reading past
4364 			 * the end of the last physical block associated with the file
4365 			 */
4366 			if (iolock_inited == FALSE) {
4367 				lck_mtx_init(&iostate.io_mtxp, &cl_mtx_grp, LCK_ATTR_NULL);
4368 
4369 				iolock_inited = TRUE;
4370 			}
4371 			upl_offset = start_pg * PAGE_SIZE;
4372 			io_size    = (last_pg - start_pg) * PAGE_SIZE;
4373 
4374 			if ((off_t)(upl_f_offset + upl_offset + io_size) > filesize) {
4375 				io_size = (u_int32_t)(filesize - (upl_f_offset + upl_offset));
4376 			}
4377 
4378 			/*
4379 			 * Find out if this needs verification, we'll have to manage the UPL
4380 			 * diffrently if so. Note that this call only lets us know if
4381 			 * verification is enabled on this mount point, the actual verification
4382 			 * is performed in the File system.
4383 			 */
4384 			size_t verify_block_size = 0;
4385 			if ((VNOP_VERIFY(vp, start_offset, NULL, 0, &verify_block_size, NULL, VNODE_VERIFY_DEFAULT, NULL) == 0) /* && verify_block_size */) {
4386 				for (uio_last = last_pg; uio_last < pages_in_upl; uio_last++) {
4387 					if (!upl_valid_page(pl, uio_last)) {
4388 						break;
4389 					}
4390 				}
4391 				if (uio_last < pages_in_upl) {
4392 					/*
4393 					 * there were some invalid pages beyond the valid pages
4394 					 * that we didn't issue an I/O for, just release them
4395 					 * unchanged now, so that any prefetch/readahed can
4396 					 * include them
4397 					 */
4398 					ubc_upl_abort_range(upl, uio_last * PAGE_SIZE,
4399 					    (pages_in_upl - uio_last) * PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
4400 					leftover_upl_aborted = true;
4401 				}
4402 			}
4403 
4404 			/*
4405 			 * issue an asynchronous read to cluster_io
4406 			 */
4407 
4408 			error = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset,
4409 			    io_size, CL_READ | CL_ASYNC | bflag, (buf_t)NULL, &iostate, callback, callback_arg);
4410 
4411 			if (rap) {
4412 				if (extent.e_addr < rap->cl_maxra) {
4413 					/*
4414 					 * we've just issued a read for a block that should have been
4415 					 * in the cache courtesy of the read-ahead engine... something
4416 					 * has gone wrong with the pipeline, so reset the read-ahead
4417 					 * logic which will cause us to restart from scratch
4418 					 */
4419 					rap->cl_maxra = 0;
4420 				}
4421 			}
4422 		}
4423 		if (error == 0) {
4424 			/*
4425 			 * if the read completed successfully, or there was no I/O request
4426 			 * issued, than copy the data into user land via 'cluster_upl_copy_data'
4427 			 * we'll first add on any 'valid'
4428 			 * pages that were present in the upl when we acquired it.
4429 			 */
4430 			u_int  val_size;
4431 
4432 			if (!leftover_upl_aborted) {
4433 				for (uio_last = last_pg; uio_last < pages_in_upl; uio_last++) {
4434 					if (!upl_valid_page(pl, uio_last)) {
4435 						break;
4436 					}
4437 				}
4438 				if (uio_last < pages_in_upl) {
4439 					/*
4440 					 * there were some invalid pages beyond the valid pages
4441 					 * that we didn't issue an I/O for, just release them
4442 					 * unchanged now, so that any prefetch/readahed can
4443 					 * include them
4444 					 */
4445 					ubc_upl_abort_range(upl, uio_last * PAGE_SIZE,
4446 					    (pages_in_upl - uio_last) * PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
4447 				}
4448 			}
4449 
4450 			/*
4451 			 * compute size to transfer this round,  if io_req_size is
4452 			 * still non-zero after this attempt, we'll loop around and
4453 			 * set up for another I/O.
4454 			 */
4455 			val_size = (uio_last * PAGE_SIZE) - start_offset;
4456 
4457 			if (val_size > max_size) {
4458 				val_size = (u_int)max_size;
4459 			}
4460 
4461 			if (val_size > io_req_size) {
4462 				val_size = io_req_size;
4463 			}
4464 
4465 			if ((uio->uio_offset + val_size) > last_ioread_offset) {
4466 				last_ioread_offset = uio->uio_offset + val_size;
4467 			}
4468 
4469 			if ((size_of_prefetch = (u_int32_t)(last_request_offset - last_ioread_offset)) && prefetch_enabled) {
4470 				if ((last_ioread_offset - (uio->uio_offset + val_size)) <= upl_size) {
4471 					/*
4472 					 * if there's still I/O left to do for this request, and...
4473 					 * we're not in hard throttle mode, and...
4474 					 * we're close to using up the previous prefetch, then issue a
4475 					 * new pre-fetch I/O... the I/O latency will overlap
4476 					 * with the copying of the data
4477 					 */
4478 					if (size_of_prefetch > max_rd_size) {
4479 						size_of_prefetch = max_rd_size;
4480 					}
4481 
4482 					size_of_prefetch = cluster_read_prefetch(vp, last_ioread_offset, size_of_prefetch, filesize, callback, callback_arg, bflag);
4483 
4484 					last_ioread_offset += (off_t)(size_of_prefetch * PAGE_SIZE);
4485 
4486 					if (last_ioread_offset > last_request_offset) {
4487 						last_ioread_offset = last_request_offset;
4488 					}
4489 				}
4490 			} else if ((uio->uio_offset + val_size) == last_request_offset) {
4491 				/*
4492 				 * this transfer will finish this request, so...
4493 				 * let's try to read ahead if we're in
4494 				 * a sequential access pattern and we haven't
4495 				 * explicitly disabled it
4496 				 */
4497 				if (rd_ahead_enabled) {
4498 					cluster_read_ahead(vp, &extent, filesize, rap, callback, callback_arg, bflag);
4499 				}
4500 
4501 				if (rap != NULL) {
4502 					if (extent.e_addr < rap->cl_lastr) {
4503 						rap->cl_maxra = 0;
4504 					}
4505 					rap->cl_lastr = extent.e_addr;
4506 				}
4507 			}
4508 			if (iolock_inited == TRUE) {
4509 				cluster_iostate_wait(&iostate, 0, "cluster_read_copy");
4510 			}
4511 
4512 			if (iostate.io_error) {
4513 				error = iostate.io_error;
4514 			} else {
4515 				u_int32_t io_requested;
4516 
4517 				io_requested = val_size;
4518 
4519 				retval = cluster_copy_upl_data(uio, upl, start_offset, (int *)&io_requested);
4520 
4521 				io_req_size -= (val_size - io_requested);
4522 			}
4523 		} else {
4524 			if (iolock_inited == TRUE) {
4525 				cluster_iostate_wait(&iostate, 0, "cluster_read_copy");
4526 			}
4527 		}
4528 		if (start_pg < last_pg) {
4529 			/*
4530 			 * compute the range of pages that we actually issued an I/O for
4531 			 * and either commit them as valid if the I/O succeeded
4532 			 * or abort them if the I/O failed or we're not supposed to
4533 			 * keep them in the cache
4534 			 */
4535 			io_size = (last_pg - start_pg) * PAGE_SIZE;
4536 
4537 			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START, upl, start_pg * PAGE_SIZE, io_size, error, 0);
4538 
4539 			if (error || (flags & IO_NOCACHE)) {
4540 				ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, io_size,
4541 				    UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
4542 			} else {
4543 				int     commit_flags = UPL_COMMIT_CLEAR_DIRTY | UPL_COMMIT_FREE_ON_EMPTY;
4544 
4545 				if (take_reference) {
4546 					commit_flags |= UPL_COMMIT_INACTIVATE;
4547 				} else {
4548 					commit_flags |= UPL_COMMIT_SPECULATE;
4549 				}
4550 
4551 				ubc_upl_commit_range(upl, start_pg * PAGE_SIZE, io_size, commit_flags);
4552 			}
4553 			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END, upl, start_pg * PAGE_SIZE, io_size, error, 0);
4554 		}
4555 		if ((last_pg - start_pg) < pages_in_upl) {
4556 			/*
4557 			 * the set of pages that we issued an I/O for did not encompass
4558 			 * the entire upl... so just release these without modifying
4559 			 * their state
4560 			 */
4561 			if (error) {
4562 				if (leftover_upl_aborted) {
4563 					ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, (uio_last - start_pg) * PAGE_SIZE,
4564 					    UPL_ABORT_FREE_ON_EMPTY);
4565 				} else {
4566 					ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
4567 				}
4568 			} else {
4569 				KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START,
4570 				    upl, -1, pages_in_upl - (last_pg - start_pg), 0, 0);
4571 
4572 				/*
4573 				 * handle any valid pages at the beginning of
4574 				 * the upl... release these appropriately
4575 				 */
4576 				cluster_read_upl_release(upl, 0, start_pg, take_reference);
4577 
4578 				/*
4579 				 * handle any valid pages immediately after the
4580 				 * pages we issued I/O for... ... release these appropriately
4581 				 */
4582 				cluster_read_upl_release(upl, last_pg, uio_last, take_reference);
4583 
4584 				KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END, upl, -1, -1, 0, 0);
4585 			}
4586 		}
4587 		if (retval == 0) {
4588 			retval = error;
4589 		}
4590 
4591 		if (io_req_size) {
4592 			if (cluster_is_throttled(vp)) {
4593 				/*
4594 				 * we're in the throttle window, at the very least
4595 				 * we want to limit the size of the I/O we're about
4596 				 * to issue
4597 				 */
4598 				rd_ahead_enabled = 0;
4599 				prefetch_enabled = 0;
4600 				max_rd_size = THROTTLE_MAX_IOSIZE;
4601 			} else {
4602 				if (max_rd_size == THROTTLE_MAX_IOSIZE) {
4603 					/*
4604 					 * coming out of throttled state
4605 					 */
4606 					if (policy != THROTTLE_LEVEL_TIER3 && policy != THROTTLE_LEVEL_TIER2) {
4607 						if (rap != NULL) {
4608 							rd_ahead_enabled = 1;
4609 						}
4610 						prefetch_enabled = 1;
4611 					}
4612 					max_rd_size = max_prefetch;
4613 					last_ioread_offset = 0;
4614 				}
4615 			}
4616 		}
4617 	}
4618 	if (iolock_inited == TRUE) {
4619 		/*
4620 		 * cluster_io returned an error after it
4621 		 * had already issued some I/O.  we need
4622 		 * to wait for that I/O to complete before
4623 		 * we can destroy the iostate mutex...
4624 		 * 'retval' already contains the early error
4625 		 * so no need to pick it up from iostate.io_error
4626 		 */
4627 		cluster_iostate_wait(&iostate, 0, "cluster_read_copy");
4628 
4629 		lck_mtx_destroy(&iostate.io_mtxp, &cl_mtx_grp);
4630 	}
4631 	if (rap != NULL) {
4632 		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
4633 		    (int)uio->uio_offset, io_req_size, rap->cl_lastr, retval, 0);
4634 
4635 		lck_mtx_unlock(&rap->cl_lockr);
4636 	} else {
4637 		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
4638 		    (int)uio->uio_offset, io_req_size, 0, retval, 0);
4639 	}
4640 
4641 	return retval;
4642 }
4643 
4644 /*
4645  * We don't want another read/write lock for every vnode in the system
4646  * so we keep a hash of them here.  There should never be very many of
4647  * these around at any point in time.
4648  */
4649 cl_direct_read_lock_t *
cluster_lock_direct_read(vnode_t vp,lck_rw_type_t type)4650 cluster_lock_direct_read(vnode_t vp, lck_rw_type_t type)
4651 {
4652 	struct cl_direct_read_locks *head
4653 	        = &cl_direct_read_locks[(uintptr_t)vp / sizeof(*vp)
4654 	    % CL_DIRECT_READ_LOCK_BUCKETS];
4655 
4656 	struct cl_direct_read_lock *lck, *new_lck = NULL;
4657 
4658 	for (;;) {
4659 		lck_spin_lock(&cl_direct_read_spin_lock);
4660 
4661 		LIST_FOREACH(lck, head, chain) {
4662 			if (lck->vp == vp) {
4663 				++lck->ref_count;
4664 				lck_spin_unlock(&cl_direct_read_spin_lock);
4665 				if (new_lck) {
4666 					// Someone beat us to it, ditch the allocation
4667 					lck_rw_destroy(&new_lck->rw_lock, &cl_mtx_grp);
4668 					kfree_type(cl_direct_read_lock_t, new_lck);
4669 				}
4670 				lck_rw_lock(&lck->rw_lock, type);
4671 				return lck;
4672 			}
4673 		}
4674 
4675 		if (new_lck) {
4676 			// Use the lock we allocated
4677 			LIST_INSERT_HEAD(head, new_lck, chain);
4678 			lck_spin_unlock(&cl_direct_read_spin_lock);
4679 			lck_rw_lock(&new_lck->rw_lock, type);
4680 			return new_lck;
4681 		}
4682 
4683 		lck_spin_unlock(&cl_direct_read_spin_lock);
4684 
4685 		// Allocate a new lock
4686 		new_lck = kalloc_type(cl_direct_read_lock_t, Z_WAITOK);
4687 		lck_rw_init(&new_lck->rw_lock, &cl_mtx_grp, LCK_ATTR_NULL);
4688 		new_lck->vp = vp;
4689 		new_lck->ref_count = 1;
4690 
4691 		// Got to go round again
4692 	}
4693 }
4694 
4695 void
cluster_unlock_direct_read(cl_direct_read_lock_t * lck)4696 cluster_unlock_direct_read(cl_direct_read_lock_t *lck)
4697 {
4698 	lck_rw_done(&lck->rw_lock);
4699 
4700 	lck_spin_lock(&cl_direct_read_spin_lock);
4701 	if (lck->ref_count == 1) {
4702 		LIST_REMOVE(lck, chain);
4703 		lck_spin_unlock(&cl_direct_read_spin_lock);
4704 		lck_rw_destroy(&lck->rw_lock, &cl_mtx_grp);
4705 		kfree_type(cl_direct_read_lock_t, lck);
4706 	} else {
4707 		--lck->ref_count;
4708 		lck_spin_unlock(&cl_direct_read_spin_lock);
4709 	}
4710 }
4711 
4712 static int
cluster_read_direct(vnode_t vp,struct uio * uio,off_t filesize,int * read_type,u_int32_t * read_length,int flags,int (* callback)(buf_t,void *),void * callback_arg)4713 cluster_read_direct(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t *read_length,
4714     int flags, int (*callback)(buf_t, void *), void *callback_arg)
4715 {
4716 	upl_t            upl = NULL;
4717 	upl_page_info_t  *pl;
4718 	off_t            max_io_size;
4719 	vm_offset_t      upl_offset, vector_upl_offset = 0;
4720 	upl_size_t       upl_size = 0, vector_upl_size = 0;
4721 	vm_size_t        upl_needed_size;
4722 	unsigned int     pages_in_pl;
4723 	upl_control_flags_t upl_flags;
4724 	kern_return_t    kret;
4725 	unsigned int     i;
4726 	int              force_data_sync;
4727 	int              retval = 0;
4728 	int              no_zero_fill = 0;
4729 	int              io_flag = 0;
4730 	int              misaligned = 0;
4731 	struct clios     iostate;
4732 	user_addr_t      iov_base;
4733 	u_int32_t        io_req_size;
4734 	u_int32_t        offset_in_file;
4735 	u_int32_t        offset_in_iovbase;
4736 	u_int32_t        io_size;
4737 	u_int32_t        io_min;
4738 	u_int32_t        xsize;
4739 	u_int32_t        devblocksize;
4740 	u_int32_t        mem_alignment_mask;
4741 	u_int32_t        max_upl_size;
4742 	u_int32_t        max_rd_size;
4743 	u_int32_t        max_rd_ahead;
4744 	u_int32_t        max_vector_size;
4745 	boolean_t        io_throttled = FALSE;
4746 
4747 	u_int32_t        vector_upl_iosize = 0;
4748 	int              issueVectorUPL = 0, useVectorUPL = (uio->uio_iovcnt > 1);
4749 	off_t            v_upl_uio_offset = 0;
4750 	int              vector_upl_index = 0;
4751 	upl_t            vector_upl = NULL;
4752 	cl_direct_read_lock_t *lock = NULL;
4753 
4754 	assert(vm_map_page_shift(current_map()) >= PAGE_SHIFT);
4755 
4756 	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_START,
4757 	    (int)uio->uio_offset, (int)filesize, *read_type, *read_length, 0);
4758 
4759 	max_upl_size = cluster_max_io_size(vp->v_mount, CL_READ);
4760 
4761 	max_rd_size = max_upl_size;
4762 
4763 	if (__improbable(os_mul_overflow(max_rd_size, IO_SCALE(vp, 2),
4764 	    &max_rd_ahead) || (max_rd_ahead > overlapping_read_max))) {
4765 		max_rd_ahead = overlapping_read_max;
4766 	}
4767 
4768 	io_flag = CL_COMMIT | CL_READ | CL_ASYNC | CL_NOZERO | CL_DIRECT_IO;
4769 
4770 	if (flags & IO_PASSIVE) {
4771 		io_flag |= CL_PASSIVE;
4772 	}
4773 
4774 	if (flags & IO_ENCRYPTED) {
4775 		io_flag |= CL_RAW_ENCRYPTED;
4776 	}
4777 
4778 	if (flags & IO_NOCACHE) {
4779 		io_flag |= CL_NOCACHE;
4780 	}
4781 
4782 	if (flags & IO_SKIP_ENCRYPTION) {
4783 		io_flag |= CL_ENCRYPTED;
4784 	}
4785 
4786 	iostate.io_completed = 0;
4787 	iostate.io_issued = 0;
4788 	iostate.io_error = 0;
4789 	iostate.io_wanted = 0;
4790 
4791 	lck_mtx_init(&iostate.io_mtxp, &cl_mtx_grp, LCK_ATTR_NULL);
4792 
4793 	devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize;
4794 	mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask;
4795 
4796 	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_NONE,
4797 	    (int)devblocksize, (int)mem_alignment_mask, 0, 0, 0);
4798 
4799 	if (devblocksize == 1) {
4800 		/*
4801 		 * the AFP client advertises a devblocksize of 1
4802 		 * however, its BLOCKMAP routine maps to physical
4803 		 * blocks that are PAGE_SIZE in size...
4804 		 * therefore we can't ask for I/Os that aren't page aligned
4805 		 * or aren't multiples of PAGE_SIZE in size
4806 		 * by setting devblocksize to PAGE_SIZE, we re-instate
4807 		 * the old behavior we had before the mem_alignment_mask
4808 		 * changes went in...
4809 		 */
4810 		devblocksize = PAGE_SIZE;
4811 	}
4812 
4813 	/*
4814 	 * We are going to need this uio for the prefaulting later
4815 	 * especially for the cases where multiple non-contiguous
4816 	 * iovs are passed into this routine.
4817 	 */
4818 	uio_t uio_acct = uio_duplicate(uio);
4819 
4820 next_dread:
4821 	io_req_size = *read_length;
4822 	iov_base = uio_curriovbase(uio);
4823 
4824 	offset_in_file = (u_int32_t)uio->uio_offset & (devblocksize - 1);
4825 	offset_in_iovbase = (u_int32_t)iov_base & mem_alignment_mask;
4826 
4827 	if (vm_map_page_mask(current_map()) < PAGE_MASK) {
4828 		/*
4829 		 * XXX TODO4K
4830 		 * Direct I/O might not work as expected from a 16k kernel space
4831 		 * to a 4k user space because each 4k chunk might point to
4832 		 * a different 16k physical page...
4833 		 * Let's go the "misaligned" way.
4834 		 */
4835 		if (!misaligned) {
4836 			DEBUG4K_VFS("forcing misaligned\n");
4837 		}
4838 		misaligned = 1;
4839 	}
4840 
4841 	if (offset_in_file || offset_in_iovbase) {
4842 		/*
4843 		 * one of the 2 important offsets is misaligned
4844 		 * so fire an I/O through the cache for this entire vector
4845 		 */
4846 		misaligned = 1;
4847 	}
4848 	if (iov_base & (devblocksize - 1)) {
4849 		/*
4850 		 * the offset in memory must be on a device block boundary
4851 		 * so that we can guarantee that we can generate an
4852 		 * I/O that ends on a page boundary in cluster_io
4853 		 */
4854 		misaligned = 1;
4855 	}
4856 
4857 	max_io_size = filesize - uio->uio_offset;
4858 
4859 	/*
4860 	 * The user must request IO in aligned chunks.  If the
4861 	 * offset into the file is bad, or the userland pointer
4862 	 * is non-aligned, then we cannot service the encrypted IO request.
4863 	 */
4864 	if (flags & IO_ENCRYPTED) {
4865 		if (misaligned || (io_req_size & (devblocksize - 1))) {
4866 			retval = EINVAL;
4867 		}
4868 
4869 		max_io_size = roundup(max_io_size, devblocksize);
4870 	}
4871 
4872 	if ((off_t)io_req_size > max_io_size) {
4873 		io_req_size = (u_int32_t)max_io_size;
4874 	}
4875 
4876 	/*
4877 	 * When we get to this point, we know...
4878 	 *  -- the offset into the file is on a devblocksize boundary
4879 	 */
4880 
4881 	while (io_req_size && retval == 0) {
4882 		u_int32_t io_start;
4883 
4884 		if (cluster_is_throttled(vp)) {
4885 			/*
4886 			 * we're in the throttle window, at the very least
4887 			 * we want to limit the size of the I/O we're about
4888 			 * to issue
4889 			 */
4890 			max_rd_size  = THROTTLE_MAX_IOSIZE;
4891 			max_rd_ahead = THROTTLE_MAX_IOSIZE - 1;
4892 			max_vector_size = THROTTLE_MAX_IOSIZE;
4893 		} else {
4894 			max_rd_size  = max_upl_size;
4895 			max_rd_ahead = max_rd_size * IO_SCALE(vp, 2);
4896 			max_vector_size = MAX_VECTOR_UPL_SIZE;
4897 		}
4898 		io_start = io_size = io_req_size;
4899 
4900 		/*
4901 		 * First look for pages already in the cache
4902 		 * and move them to user space.  But only do this
4903 		 * check if we are not retrieving encrypted data directly
4904 		 * from the filesystem;  those blocks should never
4905 		 * be in the UBC.
4906 		 *
4907 		 * cluster_copy_ubc_data returns the resid
4908 		 * in io_size
4909 		 */
4910 		if ((flags & IO_ENCRYPTED) == 0) {
4911 			retval = cluster_copy_ubc_data_internal(vp, uio, (int *)&io_size, 0, 0);
4912 		}
4913 		/*
4914 		 * calculate the number of bytes actually copied
4915 		 * starting size - residual
4916 		 */
4917 		xsize = io_start - io_size;
4918 
4919 		io_req_size -= xsize;
4920 
4921 		if (useVectorUPL && (xsize || (iov_base & PAGE_MASK))) {
4922 			/*
4923 			 * We found something in the cache or we have an iov_base that's not
4924 			 * page-aligned.
4925 			 *
4926 			 * Issue all I/O's that have been collected within this Vectored UPL.
4927 			 */
4928 			if (vector_upl_index) {
4929 				retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
4930 				reset_vector_run_state();
4931 			}
4932 
4933 			if (xsize) {
4934 				useVectorUPL = 0;
4935 			}
4936 
4937 			/*
4938 			 * After this point, if we are using the Vector UPL path and the base is
4939 			 * not page-aligned then the UPL with that base will be the first in the vector UPL.
4940 			 */
4941 		}
4942 
4943 		/*
4944 		 * check to see if we are finished with this request.
4945 		 *
4946 		 * If we satisfied this IO already, then io_req_size will be 0.
4947 		 * Otherwise, see if the IO was mis-aligned and needs to go through
4948 		 * the UBC to deal with the 'tail'.
4949 		 *
4950 		 */
4951 		if (io_req_size == 0 || (misaligned)) {
4952 			/*
4953 			 * see if there's another uio vector to
4954 			 * process that's of type IO_DIRECT
4955 			 *
4956 			 * break out of while loop to get there
4957 			 */
4958 			break;
4959 		}
4960 		/*
4961 		 * assume the request ends on a device block boundary
4962 		 */
4963 		io_min = devblocksize;
4964 
4965 		/*
4966 		 * we can handle I/O's in multiples of the device block size
4967 		 * however, if io_size isn't a multiple of devblocksize we
4968 		 * want to clip it back to the nearest page boundary since
4969 		 * we are going to have to go through cluster_read_copy to
4970 		 * deal with the 'overhang'... by clipping it to a PAGE_SIZE
4971 		 * multiple, we avoid asking the drive for the same physical
4972 		 * blocks twice.. once for the partial page at the end of the
4973 		 * request and a 2nd time for the page we read into the cache
4974 		 * (which overlaps the end of the direct read) in order to
4975 		 * get at the overhang bytes
4976 		 */
4977 		if (io_size & (devblocksize - 1)) {
4978 			assert(!(flags & IO_ENCRYPTED));
4979 			/*
4980 			 * Clip the request to the previous page size boundary
4981 			 * since request does NOT end on a device block boundary
4982 			 */
4983 			io_size &= ~PAGE_MASK;
4984 			io_min = PAGE_SIZE;
4985 		}
4986 		if (retval || io_size < io_min) {
4987 			/*
4988 			 * either an error or we only have the tail left to
4989 			 * complete via the copy path...
4990 			 * we may have already spun some portion of this request
4991 			 * off as async requests... we need to wait for the I/O
4992 			 * to complete before returning
4993 			 */
4994 			goto wait_for_dreads;
4995 		}
4996 
4997 		/*
4998 		 * Don't re-check the UBC data if we are looking for uncached IO
4999 		 * or asking for encrypted blocks.
5000 		 */
5001 		if ((flags & IO_ENCRYPTED) == 0) {
5002 			if ((xsize = io_size) > max_rd_size) {
5003 				xsize = max_rd_size;
5004 			}
5005 
5006 			io_size = 0;
5007 
5008 			if (!lock) {
5009 				/*
5010 				 * We hold a lock here between the time we check the
5011 				 * cache and the time we issue I/O.  This saves us
5012 				 * from having to lock the pages in the cache.  Not
5013 				 * all clients will care about this lock but some
5014 				 * clients may want to guarantee stability between
5015 				 * here and when the I/O is issued in which case they
5016 				 * will take the lock exclusively.
5017 				 */
5018 				lock = cluster_lock_direct_read(vp, LCK_RW_TYPE_SHARED);
5019 			}
5020 
5021 			ubc_range_op(vp, uio->uio_offset, uio->uio_offset + xsize, UPL_ROP_ABSENT, (int *)&io_size);
5022 
5023 			if (io_size == 0) {
5024 				/*
5025 				 * a page must have just come into the cache
5026 				 * since the first page in this range is no
5027 				 * longer absent, go back and re-evaluate
5028 				 */
5029 				continue;
5030 			}
5031 		}
5032 		if ((flags & IO_RETURN_ON_THROTTLE)) {
5033 			if (cluster_is_throttled(vp) == THROTTLE_NOW) {
5034 				if (!cluster_io_present_in_BC(vp, uio->uio_offset)) {
5035 					/*
5036 					 * we're in the throttle window and at least 1 I/O
5037 					 * has already been issued by a throttleable thread
5038 					 * in this window, so return with EAGAIN to indicate
5039 					 * to the FS issuing the cluster_read call that it
5040 					 * should now throttle after dropping any locks
5041 					 */
5042 					throttle_info_update_by_mount(vp->v_mount);
5043 
5044 					io_throttled = TRUE;
5045 					goto wait_for_dreads;
5046 				}
5047 			}
5048 		}
5049 		if (io_size > max_rd_size) {
5050 			io_size = max_rd_size;
5051 		}
5052 
5053 		iov_base = uio_curriovbase(uio);
5054 
5055 		upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK);
5056 		upl_needed_size = (upl_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
5057 
5058 		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_START,
5059 		    (int)upl_offset, upl_needed_size, (int)iov_base, io_size, 0);
5060 
5061 		if (upl_offset == 0 && ((io_size & PAGE_MASK) == 0)) {
5062 			no_zero_fill = 1;
5063 		} else {
5064 			no_zero_fill = 0;
5065 		}
5066 
5067 		vm_map_t map = UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ? current_map() : kernel_map;
5068 		for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) {
5069 			pages_in_pl = 0;
5070 			upl_size = (upl_size_t)upl_needed_size;
5071 			upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
5072 			if (no_zero_fill) {
5073 				upl_flags |= UPL_NOZEROFILL;
5074 			}
5075 			if (force_data_sync) {
5076 				upl_flags |= UPL_FORCE_DATA_SYNC;
5077 			}
5078 
5079 			kret = vm_map_create_upl(map,
5080 			    (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
5081 			    &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, VM_KERN_MEMORY_FILE);
5082 
5083 			if (kret != KERN_SUCCESS) {
5084 				KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
5085 				    (int)upl_offset, upl_size, io_size, kret, 0);
5086 				/*
5087 				 * failed to get pagelist
5088 				 *
5089 				 * we may have already spun some portion of this request
5090 				 * off as async requests... we need to wait for the I/O
5091 				 * to complete before returning
5092 				 */
5093 				goto wait_for_dreads;
5094 			}
5095 			pages_in_pl = upl_size / PAGE_SIZE;
5096 			pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
5097 
5098 			for (i = 0; i < pages_in_pl; i++) {
5099 				if (!upl_page_present(pl, i)) {
5100 					break;
5101 				}
5102 			}
5103 			if (i == pages_in_pl) {
5104 				break;
5105 			}
5106 
5107 			ubc_upl_abort(upl, 0);
5108 		}
5109 		if (force_data_sync >= 3) {
5110 			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
5111 			    (int)upl_offset, upl_size, io_size, kret, 0);
5112 
5113 			goto wait_for_dreads;
5114 		}
5115 		/*
5116 		 * Consider the possibility that upl_size wasn't satisfied.
5117 		 */
5118 		if (upl_size < upl_needed_size) {
5119 			if (upl_size && upl_offset == 0) {
5120 				io_size = upl_size;
5121 			} else {
5122 				io_size = 0;
5123 			}
5124 		}
5125 		if (io_size == 0) {
5126 			ubc_upl_abort(upl, 0);
5127 			goto wait_for_dreads;
5128 		}
5129 		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
5130 		    (int)upl_offset, upl_size, io_size, kret, 0);
5131 
5132 		if (useVectorUPL) {
5133 			vm_offset_t end_off = ((iov_base + io_size) & PAGE_MASK);
5134 			if (end_off) {
5135 				issueVectorUPL = 1;
5136 			}
5137 			/*
5138 			 * After this point, if we are using a vector UPL, then
5139 			 * either all the UPL elements end on a page boundary OR
5140 			 * this UPL is the last element because it does not end
5141 			 * on a page boundary.
5142 			 */
5143 		}
5144 
5145 		/*
5146 		 * request asynchronously so that we can overlap
5147 		 * the preparation of the next I/O
5148 		 * if there are already too many outstanding reads
5149 		 * wait until some have completed before issuing the next read
5150 		 */
5151 		cluster_iostate_wait(&iostate, max_rd_ahead, "cluster_read_direct");
5152 
5153 		if (iostate.io_error) {
5154 			/*
5155 			 * one of the earlier reads we issued ran into a hard error
5156 			 * don't issue any more reads, cleanup the UPL
5157 			 * that was just created but not used, then
5158 			 * go wait for any other reads to complete before
5159 			 * returning the error to the caller
5160 			 */
5161 			ubc_upl_abort(upl, 0);
5162 
5163 			goto wait_for_dreads;
5164 		}
5165 		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_START,
5166 		    upl, (int)upl_offset, (int)uio->uio_offset, io_size, 0);
5167 
5168 		if (!useVectorUPL) {
5169 			if (no_zero_fill) {
5170 				io_flag &= ~CL_PRESERVE;
5171 			} else {
5172 				io_flag |= CL_PRESERVE;
5173 			}
5174 
5175 			retval = cluster_io(vp, upl, upl_offset, uio->uio_offset, io_size, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
5176 		} else {
5177 			if (!vector_upl_index) {
5178 				vector_upl = vector_upl_create(upl_offset);
5179 				v_upl_uio_offset = uio->uio_offset;
5180 				vector_upl_offset = upl_offset;
5181 			}
5182 
5183 			vector_upl_set_subupl(vector_upl, upl, upl_size);
5184 			vector_upl_set_iostate(vector_upl, upl, vector_upl_size, upl_size);
5185 			vector_upl_index++;
5186 			vector_upl_size += upl_size;
5187 			vector_upl_iosize += io_size;
5188 
5189 			if (issueVectorUPL || vector_upl_index == MAX_VECTOR_UPL_ELEMENTS || vector_upl_size >= max_vector_size) {
5190 				retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
5191 				reset_vector_run_state();
5192 			}
5193 		}
5194 
5195 		if (lock) {
5196 			// We don't need to wait for the I/O to complete
5197 			cluster_unlock_direct_read(lock);
5198 			lock = NULL;
5199 		}
5200 
5201 		/*
5202 		 * update the uio structure
5203 		 */
5204 		if ((flags & IO_ENCRYPTED) && (max_io_size < io_size)) {
5205 			uio_update(uio, (user_size_t)max_io_size);
5206 		} else {
5207 			uio_update(uio, (user_size_t)io_size);
5208 		}
5209 
5210 		io_req_size -= io_size;
5211 
5212 		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_END,
5213 		    upl, (int)uio->uio_offset, io_req_size, retval, 0);
5214 	} /* end while */
5215 
5216 	if (retval == 0 && iostate.io_error == 0 && io_req_size == 0 && uio->uio_offset < filesize) {
5217 		retval = cluster_io_type(uio, read_type, read_length, 0);
5218 
5219 		if (retval == 0 && *read_type == IO_DIRECT) {
5220 			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_NONE,
5221 			    (int)uio->uio_offset, (int)filesize, *read_type, *read_length, 0);
5222 
5223 			goto next_dread;
5224 		}
5225 	}
5226 
5227 wait_for_dreads:
5228 
5229 	if (retval == 0 && iostate.io_error == 0 && useVectorUPL && vector_upl_index) {
5230 		retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
5231 		reset_vector_run_state();
5232 	}
5233 
5234 	// We don't need to wait for the I/O to complete
5235 	if (lock) {
5236 		cluster_unlock_direct_read(lock);
5237 	}
5238 
5239 	/*
5240 	 * make sure all async reads that are part of this stream
5241 	 * have completed before we return
5242 	 */
5243 	cluster_iostate_wait(&iostate, 0, "cluster_read_direct");
5244 
5245 	if (iostate.io_error) {
5246 		retval = iostate.io_error;
5247 	}
5248 
5249 	lck_mtx_destroy(&iostate.io_mtxp, &cl_mtx_grp);
5250 
5251 	if (io_throttled == TRUE && retval == 0) {
5252 		retval = EAGAIN;
5253 	}
5254 
5255 	vm_map_offset_t current_page_size, current_page_mask;
5256 	current_page_size = vm_map_page_size(current_map());
5257 	current_page_mask = vm_map_page_mask(current_map());
5258 	if (uio_acct) {
5259 		off_t bytes_to_prefault = 0, bytes_prefaulted = 0;
5260 		user_addr_t curr_iov_base = 0;
5261 		user_addr_t curr_iov_end = 0;
5262 		user_size_t curr_iov_len = 0;
5263 
5264 		bytes_to_prefault = uio_offset(uio) - uio_offset(uio_acct);
5265 
5266 		for (; bytes_prefaulted < bytes_to_prefault;) {
5267 			curr_iov_base = uio_curriovbase(uio_acct);
5268 			curr_iov_len = MIN(uio_curriovlen(uio_acct), bytes_to_prefault - bytes_prefaulted);
5269 			curr_iov_end = curr_iov_base + curr_iov_len;
5270 
5271 			for (; curr_iov_base < curr_iov_end;) {
5272 				/*
5273 				 * This is specifically done for pmap accounting purposes.
5274 				 * vm_pre_fault() will call vm_fault() to enter the page into
5275 				 * the pmap if there isn't _a_ physical page for that VA already.
5276 				 */
5277 				vm_pre_fault(vm_map_trunc_page(curr_iov_base, current_page_mask), VM_PROT_READ);
5278 				curr_iov_base += current_page_size;
5279 				bytes_prefaulted += current_page_size;
5280 			}
5281 			/*
5282 			 * Use update instead of advance so we can see how many iovs we processed.
5283 			 */
5284 			uio_update(uio_acct, curr_iov_len);
5285 		}
5286 		uio_free(uio_acct);
5287 		uio_acct = NULL;
5288 	}
5289 
5290 	if (io_req_size && retval == 0) {
5291 		/*
5292 		 * we couldn't handle the tail of this request in DIRECT mode
5293 		 * so fire it through the copy path
5294 		 */
5295 		if (flags & IO_ENCRYPTED) {
5296 			/*
5297 			 * We cannot fall back to the copy path for encrypted I/O. If this
5298 			 * happens, there is something wrong with the user buffer passed
5299 			 * down.
5300 			 */
5301 			retval = EFAULT;
5302 		} else {
5303 			retval = cluster_read_copy(vp, uio, io_req_size, filesize, flags, callback, callback_arg);
5304 		}
5305 
5306 		*read_type = IO_UNKNOWN;
5307 	}
5308 	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_END,
5309 	    (int)uio->uio_offset, (int)uio_resid(uio), io_req_size, retval, 0);
5310 
5311 	return retval;
5312 }
5313 
5314 
5315 static int
cluster_read_contig(vnode_t vp,struct uio * uio,off_t filesize,int * read_type,u_int32_t * read_length,int (* callback)(buf_t,void *),void * callback_arg,int flags)5316 cluster_read_contig(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t *read_length,
5317     int (*callback)(buf_t, void *), void *callback_arg, int flags)
5318 {
5319 	upl_page_info_t *pl;
5320 	upl_t            upl[MAX_VECTS];
5321 	vm_offset_t      upl_offset;
5322 	addr64_t         dst_paddr = 0;
5323 	user_addr_t      iov_base;
5324 	off_t            max_size;
5325 	upl_size_t       upl_size;
5326 	vm_size_t        upl_needed_size;
5327 	mach_msg_type_number_t  pages_in_pl;
5328 	upl_control_flags_t upl_flags;
5329 	kern_return_t    kret;
5330 	struct clios     iostate;
5331 	int              error = 0;
5332 	int              cur_upl = 0;
5333 	int              num_upl = 0;
5334 	int              n;
5335 	u_int32_t        xsize;
5336 	u_int32_t        io_size;
5337 	u_int32_t        devblocksize;
5338 	u_int32_t        mem_alignment_mask;
5339 	u_int32_t        tail_size = 0;
5340 	int              bflag;
5341 
5342 	if (flags & IO_PASSIVE) {
5343 		bflag = CL_PASSIVE;
5344 	} else {
5345 		bflag = 0;
5346 	}
5347 
5348 	if (flags & IO_NOCACHE) {
5349 		bflag |= CL_NOCACHE;
5350 	}
5351 
5352 	/*
5353 	 * When we enter this routine, we know
5354 	 *  -- the read_length will not exceed the current iov_len
5355 	 *  -- the target address is physically contiguous for read_length
5356 	 */
5357 	cluster_syncup(vp, filesize, callback, callback_arg, PUSH_SYNC);
5358 
5359 	devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize;
5360 	mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask;
5361 
5362 	iostate.io_completed = 0;
5363 	iostate.io_issued = 0;
5364 	iostate.io_error = 0;
5365 	iostate.io_wanted = 0;
5366 
5367 	lck_mtx_init(&iostate.io_mtxp, &cl_mtx_grp, LCK_ATTR_NULL);
5368 
5369 next_cread:
5370 	io_size = *read_length;
5371 
5372 	max_size = filesize - uio->uio_offset;
5373 
5374 	if (io_size > max_size) {
5375 		io_size = (u_int32_t)max_size;
5376 	}
5377 
5378 	iov_base = uio_curriovbase(uio);
5379 
5380 	upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK);
5381 	upl_needed_size = upl_offset + io_size;
5382 
5383 	pages_in_pl = 0;
5384 	upl_size = (upl_size_t)upl_needed_size;
5385 	upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
5386 
5387 
5388 	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 92)) | DBG_FUNC_START,
5389 	    (int)upl_offset, (int)upl_size, (int)iov_base, io_size, 0);
5390 
5391 	vm_map_t map = UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ? current_map() : kernel_map;
5392 	kret = vm_map_get_upl(map,
5393 	    vm_map_trunc_page(iov_base, vm_map_page_mask(map)),
5394 	    &upl_size, &upl[cur_upl], NULL, &pages_in_pl, &upl_flags, VM_KERN_MEMORY_FILE, 0);
5395 
5396 	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 92)) | DBG_FUNC_END,
5397 	    (int)upl_offset, upl_size, io_size, kret, 0);
5398 
5399 	if (kret != KERN_SUCCESS) {
5400 		/*
5401 		 * failed to get pagelist
5402 		 */
5403 		error = EINVAL;
5404 		goto wait_for_creads;
5405 	}
5406 	num_upl++;
5407 
5408 	if (upl_size < upl_needed_size) {
5409 		/*
5410 		 * The upl_size wasn't satisfied.
5411 		 */
5412 		error = EINVAL;
5413 		goto wait_for_creads;
5414 	}
5415 	pl = ubc_upl_pageinfo(upl[cur_upl]);
5416 
5417 	dst_paddr = ((addr64_t)upl_phys_page(pl, 0) << PAGE_SHIFT) + (addr64_t)upl_offset;
5418 
5419 	while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {
5420 		u_int32_t   head_size;
5421 
5422 		head_size = devblocksize - (u_int32_t)(uio->uio_offset & (devblocksize - 1));
5423 
5424 		if (head_size > io_size) {
5425 			head_size = io_size;
5426 		}
5427 
5428 		error = cluster_align_phys_io(vp, uio, dst_paddr, head_size, CL_READ, callback, callback_arg);
5429 
5430 		if (error) {
5431 			goto wait_for_creads;
5432 		}
5433 
5434 		upl_offset += head_size;
5435 		dst_paddr  += head_size;
5436 		io_size    -= head_size;
5437 
5438 		iov_base   += head_size;
5439 	}
5440 	if ((u_int32_t)iov_base & mem_alignment_mask) {
5441 		/*
5442 		 * request doesn't set up on a memory boundary
5443 		 * the underlying DMA engine can handle...
5444 		 * return an error instead of going through
5445 		 * the slow copy path since the intent of this
5446 		 * path is direct I/O to device memory
5447 		 */
5448 		error = EINVAL;
5449 		goto wait_for_creads;
5450 	}
5451 
5452 	tail_size = io_size & (devblocksize - 1);
5453 
5454 	io_size  -= tail_size;
5455 
5456 	while (io_size && error == 0) {
5457 		if (io_size > MAX_IO_CONTIG_SIZE) {
5458 			xsize = MAX_IO_CONTIG_SIZE;
5459 		} else {
5460 			xsize = io_size;
5461 		}
5462 		/*
5463 		 * request asynchronously so that we can overlap
5464 		 * the preparation of the next I/O... we'll do
5465 		 * the commit after all the I/O has completed
5466 		 * since its all issued against the same UPL
5467 		 * if there are already too many outstanding reads
5468 		 * wait until some have completed before issuing the next
5469 		 */
5470 		cluster_iostate_wait(&iostate, MAX_IO_CONTIG_SIZE * IO_SCALE(vp, 2), "cluster_read_contig");
5471 
5472 		if (iostate.io_error) {
5473 			/*
5474 			 * one of the earlier reads we issued ran into a hard error
5475 			 * don't issue any more reads...
5476 			 * go wait for any other reads to complete before
5477 			 * returning the error to the caller
5478 			 */
5479 			goto wait_for_creads;
5480 		}
5481 		error = cluster_io(vp, upl[cur_upl], upl_offset, uio->uio_offset, xsize,
5482 		    CL_READ | CL_NOZERO | CL_DEV_MEMORY | CL_ASYNC | bflag,
5483 		    (buf_t)NULL, &iostate, callback, callback_arg);
5484 		/*
5485 		 * The cluster_io read was issued successfully,
5486 		 * update the uio structure
5487 		 */
5488 		if (error == 0) {
5489 			uio_update(uio, (user_size_t)xsize);
5490 
5491 			dst_paddr  += xsize;
5492 			upl_offset += xsize;
5493 			io_size    -= xsize;
5494 		}
5495 	}
5496 	if (error == 0 && iostate.io_error == 0 && tail_size == 0 && num_upl < MAX_VECTS && uio->uio_offset < filesize) {
5497 		error = cluster_io_type(uio, read_type, read_length, 0);
5498 
5499 		if (error == 0 && *read_type == IO_CONTIG) {
5500 			cur_upl++;
5501 			goto next_cread;
5502 		}
5503 	} else {
5504 		*read_type = IO_UNKNOWN;
5505 	}
5506 
5507 wait_for_creads:
5508 	/*
5509 	 * make sure all async reads that are part of this stream
5510 	 * have completed before we proceed
5511 	 */
5512 	cluster_iostate_wait(&iostate, 0, "cluster_read_contig");
5513 
5514 	if (iostate.io_error) {
5515 		error = iostate.io_error;
5516 	}
5517 
5518 	lck_mtx_destroy(&iostate.io_mtxp, &cl_mtx_grp);
5519 
5520 	if (error == 0 && tail_size) {
5521 		error = cluster_align_phys_io(vp, uio, dst_paddr, tail_size, CL_READ, callback, callback_arg);
5522 	}
5523 
5524 	for (n = 0; n < num_upl; n++) {
5525 		/*
5526 		 * just release our hold on each physically contiguous
5527 		 * region without changing any state
5528 		 */
5529 		ubc_upl_abort(upl[n], 0);
5530 	}
5531 
5532 	return error;
5533 }
5534 
5535 
5536 static int
cluster_io_type(struct uio * uio,int * io_type,u_int32_t * io_length,u_int32_t min_length)5537 cluster_io_type(struct uio *uio, int *io_type, u_int32_t *io_length, u_int32_t min_length)
5538 {
5539 	user_size_t      iov_len;
5540 	user_addr_t      iov_base = 0;
5541 	upl_t            upl;
5542 	upl_size_t       upl_size;
5543 	upl_control_flags_t upl_flags;
5544 	int              retval = 0;
5545 
5546 	/*
5547 	 * skip over any emtpy vectors
5548 	 */
5549 	uio_update(uio, (user_size_t)0);
5550 
5551 	iov_len = uio_curriovlen(uio);
5552 
5553 	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 94)) | DBG_FUNC_START, uio, (int)iov_len, 0, 0, 0);
5554 
5555 	if (iov_len) {
5556 		iov_base = uio_curriovbase(uio);
5557 		/*
5558 		 * make sure the size of the vector isn't too big...
5559 		 * internally, we want to handle all of the I/O in
5560 		 * chunk sizes that fit in a 32 bit int
5561 		 */
5562 		if (iov_len > (user_size_t)MAX_IO_REQUEST_SIZE) {
5563 			upl_size = MAX_IO_REQUEST_SIZE;
5564 		} else {
5565 			upl_size = (u_int32_t)iov_len;
5566 		}
5567 
5568 		upl_flags = UPL_QUERY_OBJECT_TYPE;
5569 
5570 		vm_map_t map = UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ? current_map() : kernel_map;
5571 		if ((vm_map_get_upl(map,
5572 		    vm_map_trunc_page(iov_base, vm_map_page_mask(map)),
5573 		    &upl_size, &upl, NULL, NULL, &upl_flags, VM_KERN_MEMORY_FILE, 0)) != KERN_SUCCESS) {
5574 			/*
5575 			 * the user app must have passed in an invalid address
5576 			 */
5577 			retval = EFAULT;
5578 		}
5579 		if (upl_size == 0) {
5580 			retval = EFAULT;
5581 		}
5582 
5583 		*io_length = upl_size;
5584 
5585 		if (upl_flags & UPL_PHYS_CONTIG) {
5586 			*io_type = IO_CONTIG;
5587 		} else if (iov_len >= min_length) {
5588 			*io_type = IO_DIRECT;
5589 		} else {
5590 			*io_type = IO_COPY;
5591 		}
5592 	} else {
5593 		/*
5594 		 * nothing left to do for this uio
5595 		 */
5596 		*io_length = 0;
5597 		*io_type   = IO_UNKNOWN;
5598 	}
5599 	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 94)) | DBG_FUNC_END, iov_base, *io_type, *io_length, retval, 0);
5600 
5601 	if (*io_type == IO_DIRECT &&
5602 	    vm_map_page_shift(current_map()) < PAGE_SHIFT) {
5603 		/* no direct I/O for sub-page-size address spaces */
5604 		DEBUG4K_VFS("io_type IO_DIRECT -> IO_COPY\n");
5605 		*io_type = IO_COPY;
5606 	}
5607 
5608 	return retval;
5609 }
5610 
5611 
5612 /*
5613  * generate advisory I/O's in the largest chunks possible
5614  * the completed pages will be released into the VM cache
5615  */
5616 int
advisory_read(vnode_t vp,off_t filesize,off_t f_offset,int resid)5617 advisory_read(vnode_t vp, off_t filesize, off_t f_offset, int resid)
5618 {
5619 	return advisory_read_ext(vp, filesize, f_offset, resid, NULL, NULL, CL_PASSIVE);
5620 }
5621 
5622 int
advisory_read_ext(vnode_t vp,off_t filesize,off_t f_offset,int resid,int (* callback)(buf_t,void *),void * callback_arg,int bflag)5623 advisory_read_ext(vnode_t vp, off_t filesize, off_t f_offset, int resid, int (*callback)(buf_t, void *), void *callback_arg, int bflag)
5624 {
5625 	upl_page_info_t *pl;
5626 	upl_t            upl;
5627 	vm_offset_t      upl_offset;
5628 	int              upl_size;
5629 	off_t            upl_f_offset;
5630 	int              start_offset;
5631 	int              start_pg;
5632 	int              last_pg;
5633 	int              pages_in_upl;
5634 	off_t            max_size;
5635 	int              io_size;
5636 	kern_return_t    kret;
5637 	int              retval = 0;
5638 	int              issued_io;
5639 	int              skip_range;
5640 	uint32_t         max_io_size;
5641 
5642 
5643 	if (!UBCINFOEXISTS(vp)) {
5644 		return EINVAL;
5645 	}
5646 
5647 	if (f_offset < 0 || resid < 0) {
5648 		return EINVAL;
5649 	}
5650 
5651 	max_io_size = cluster_max_io_size(vp->v_mount, CL_READ);
5652 
5653 	if (disk_conditioner_mount_is_ssd(vp->v_mount)) {
5654 		if (max_io_size > speculative_prefetch_max_iosize) {
5655 			max_io_size = speculative_prefetch_max_iosize;
5656 		}
5657 	}
5658 
5659 	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_START,
5660 	    (int)f_offset, resid, (int)filesize, 0, 0);
5661 
5662 	while (resid && f_offset < filesize && retval == 0) {
5663 		/*
5664 		 * compute the size of the upl needed to encompass
5665 		 * the requested read... limit each call to cluster_io
5666 		 * to the maximum UPL size... cluster_io will clip if
5667 		 * this exceeds the maximum io_size for the device,
5668 		 * make sure to account for
5669 		 * a starting offset that's not page aligned
5670 		 */
5671 		start_offset = (int)(f_offset & PAGE_MASK_64);
5672 		upl_f_offset = f_offset - (off_t)start_offset;
5673 		max_size     = filesize - f_offset;
5674 
5675 		if (resid < max_size) {
5676 			io_size = resid;
5677 		} else {
5678 			io_size = (int)max_size;
5679 		}
5680 
5681 		upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
5682 		if ((uint32_t)upl_size > max_io_size) {
5683 			upl_size = max_io_size;
5684 		}
5685 
5686 		skip_range = 0;
5687 		/*
5688 		 * return the number of contiguously present pages in the cache
5689 		 * starting at upl_f_offset within the file
5690 		 */
5691 		ubc_range_op(vp, upl_f_offset, upl_f_offset + upl_size, UPL_ROP_PRESENT, &skip_range);
5692 
5693 		if (skip_range) {
5694 			/*
5695 			 * skip over pages already present in the cache
5696 			 */
5697 			io_size = skip_range - start_offset;
5698 
5699 			f_offset += io_size;
5700 			resid    -= io_size;
5701 
5702 			if (skip_range == upl_size) {
5703 				continue;
5704 			}
5705 			/*
5706 			 * have to issue some real I/O
5707 			 * at this point, we know it's starting on a page boundary
5708 			 * because we've skipped over at least the first page in the request
5709 			 */
5710 			start_offset = 0;
5711 			upl_f_offset += skip_range;
5712 			upl_size     -= skip_range;
5713 		}
5714 		pages_in_upl = upl_size / PAGE_SIZE;
5715 
5716 		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) | DBG_FUNC_START,
5717 		    upl, (int)upl_f_offset, upl_size, start_offset, 0);
5718 
5719 		kret = ubc_create_upl_kernel(vp,
5720 		    upl_f_offset,
5721 		    upl_size,
5722 		    &upl,
5723 		    &pl,
5724 		    UPL_RET_ONLY_ABSENT | UPL_SET_LITE,
5725 		    VM_KERN_MEMORY_FILE);
5726 		if (kret != KERN_SUCCESS) {
5727 			return retval;
5728 		}
5729 		issued_io = 0;
5730 
5731 		/*
5732 		 * before we start marching forward, we must make sure we end on
5733 		 * a present page, otherwise we will be working with a freed
5734 		 * upl
5735 		 */
5736 		for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--) {
5737 			if (upl_page_present(pl, last_pg)) {
5738 				break;
5739 			}
5740 		}
5741 		pages_in_upl = last_pg + 1;
5742 
5743 
5744 		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) | DBG_FUNC_END,
5745 		    upl, (int)upl_f_offset, upl_size, start_offset, 0);
5746 
5747 
5748 		for (last_pg = 0; last_pg < pages_in_upl;) {
5749 			/*
5750 			 * scan from the beginning of the upl looking for the first
5751 			 * page that is present.... this will become the first page in
5752 			 * the request we're going to make to 'cluster_io'... if all
5753 			 * of the pages are absent, we won't call through to 'cluster_io'
5754 			 */
5755 			for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
5756 				if (upl_page_present(pl, start_pg)) {
5757 					break;
5758 				}
5759 			}
5760 
5761 			/*
5762 			 * scan from the starting present page looking for an absent
5763 			 * page before the end of the upl is reached, if we
5764 			 * find one, then it will terminate the range of pages being
5765 			 * presented to 'cluster_io'
5766 			 */
5767 			for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
5768 				if (!upl_page_present(pl, last_pg)) {
5769 					break;
5770 				}
5771 			}
5772 
5773 			if (last_pg > start_pg) {
5774 				/*
5775 				 * we found a range of pages that must be filled
5776 				 * if the last page in this range is the last page of the file
5777 				 * we may have to clip the size of it to keep from reading past
5778 				 * the end of the last physical block associated with the file
5779 				 */
5780 				upl_offset = start_pg * PAGE_SIZE;
5781 				io_size    = (last_pg - start_pg) * PAGE_SIZE;
5782 
5783 				if ((off_t)(upl_f_offset + upl_offset + io_size) > filesize) {
5784 					io_size = (int)(filesize - (upl_f_offset + upl_offset));
5785 				}
5786 
5787 				/*
5788 				 * issue an asynchronous read to cluster_io
5789 				 */
5790 				retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size,
5791 				    CL_ASYNC | CL_READ | CL_COMMIT | CL_AGE | bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
5792 
5793 				issued_io = 1;
5794 			}
5795 		}
5796 		if (issued_io == 0) {
5797 			ubc_upl_abort(upl, 0);
5798 		}
5799 
5800 		io_size = upl_size - start_offset;
5801 
5802 		if (io_size > resid) {
5803 			io_size = resid;
5804 		}
5805 		f_offset += io_size;
5806 		resid    -= io_size;
5807 	}
5808 
5809 	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_END,
5810 	    (int)f_offset, resid, retval, 0, 0);
5811 
5812 	return retval;
5813 }
5814 
5815 
5816 int
cluster_push(vnode_t vp,int flags)5817 cluster_push(vnode_t vp, int flags)
5818 {
5819 	return cluster_push_ext(vp, flags, NULL, NULL);
5820 }
5821 
5822 
5823 int
cluster_push_ext(vnode_t vp,int flags,int (* callback)(buf_t,void *),void * callback_arg)5824 cluster_push_ext(vnode_t vp, int flags, int (*callback)(buf_t, void *), void *callback_arg)
5825 {
5826 	return cluster_push_err(vp, flags, callback, callback_arg, NULL);
5827 }
5828 
5829 /* write errors via err, but return the number of clusters written */
5830 extern uint32_t system_inshutdown;
5831 uint32_t cl_sparse_push_error = 0;
5832 int
cluster_push_err(vnode_t vp,int flags,int (* callback)(buf_t,void *),void * callback_arg,int * err)5833 cluster_push_err(vnode_t vp, int flags, int (*callback)(buf_t, void *), void *callback_arg, int *err)
5834 {
5835 	int     retval;
5836 	int     my_sparse_wait = 0;
5837 	struct  cl_writebehind *wbp;
5838 	int     local_err = 0;
5839 
5840 	if (err) {
5841 		*err = 0;
5842 	}
5843 
5844 	if (!UBCINFOEXISTS(vp)) {
5845 		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, kdebug_vnode(vp), flags, 0, -1, 0);
5846 		return 0;
5847 	}
5848 	/* return if deferred write is set */
5849 	if (((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE) && (flags & IO_DEFWRITE)) {
5850 		return 0;
5851 	}
5852 	if ((wbp = cluster_get_wbp(vp, CLW_RETURNLOCKED)) == NULL) {
5853 		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, kdebug_vnode(vp), flags, 0, -2, 0);
5854 		return 0;
5855 	}
5856 	if (!ISSET(flags, IO_SYNC) && wbp->cl_number == 0 && wbp->cl_scmap == NULL) {
5857 		lck_mtx_unlock(&wbp->cl_lockw);
5858 
5859 		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, kdebug_vnode(vp), flags, 0, -3, 0);
5860 		return 0;
5861 	}
5862 	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_START,
5863 	    wbp->cl_scmap, wbp->cl_number, flags, 0, 0);
5864 
5865 	/*
5866 	 * if we have an fsync in progress, we don't want to allow any additional
5867 	 * sync/fsync/close(s) to occur until it finishes.
5868 	 * note that its possible for writes to continue to occur to this file
5869 	 * while we're waiting and also once the fsync starts to clean if we're
5870 	 * in the sparse map case
5871 	 */
5872 	while (wbp->cl_sparse_wait) {
5873 		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_START, kdebug_vnode(vp), 0, 0, 0, 0);
5874 
5875 		msleep((caddr_t)&wbp->cl_sparse_wait, &wbp->cl_lockw, PRIBIO + 1, "cluster_push_ext", NULL);
5876 
5877 		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_END, kdebug_vnode(vp), 0, 0, 0, 0);
5878 	}
5879 	if (flags & IO_SYNC) {
5880 		my_sparse_wait = 1;
5881 		wbp->cl_sparse_wait = 1;
5882 
5883 		/*
5884 		 * this is an fsync (or equivalent)... we must wait for any existing async
5885 		 * cleaning operations to complete before we evaulate the current state
5886 		 * and finish cleaning... this insures that all writes issued before this
5887 		 * fsync actually get cleaned to the disk before this fsync returns
5888 		 */
5889 		while (wbp->cl_sparse_pushes) {
5890 			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 98)) | DBG_FUNC_START, kdebug_vnode(vp), 0, 0, 0, 0);
5891 
5892 			msleep((caddr_t)&wbp->cl_sparse_pushes, &wbp->cl_lockw, PRIBIO + 1, "cluster_push_ext", NULL);
5893 
5894 			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 98)) | DBG_FUNC_END, kdebug_vnode(vp), 0, 0, 0, 0);
5895 		}
5896 	}
5897 	if (wbp->cl_scmap) {
5898 		void    *scmap;
5899 
5900 		if (wbp->cl_sparse_pushes < SPARSE_PUSH_LIMIT) {
5901 			scmap = wbp->cl_scmap;
5902 			wbp->cl_scmap = NULL;
5903 
5904 			wbp->cl_sparse_pushes++;
5905 
5906 			lck_mtx_unlock(&wbp->cl_lockw);
5907 
5908 			retval = sparse_cluster_push(wbp, &scmap, vp, ubc_getsize(vp), PUSH_ALL, flags, callback, callback_arg, FALSE);
5909 
5910 			lck_mtx_lock(&wbp->cl_lockw);
5911 
5912 			wbp->cl_sparse_pushes--;
5913 
5914 			if (retval) {
5915 				if (wbp->cl_scmap != NULL) {
5916 					/*
5917 					 * panic("cluster_push_err: Expected NULL cl_scmap\n");
5918 					 *
5919 					 * This can happen if we get an error from the underlying FS
5920 					 * e.g. ENOSPC, EPERM or EIO etc. We hope that these errors
5921 					 * are transient and the I/Os will succeed at a later point.
5922 					 *
5923 					 * The tricky part here is that a new sparse cluster has been
5924 					 * allocated and tracking a different set of dirty pages. So these
5925 					 * pages are not going to be pushed out with the next sparse_cluster_push.
5926 					 * An explicit msync or file close will, however, push the pages out.
5927 					 *
5928 					 * What if those calls still don't work? And so, during shutdown we keep
5929 					 * trying till we succeed...
5930 					 */
5931 
5932 					if (system_inshutdown) {
5933 						if ((retval == ENOSPC) && (vp->v_mount->mnt_flag & (MNT_LOCAL | MNT_REMOVABLE)) == MNT_LOCAL) {
5934 							os_atomic_inc(&cl_sparse_push_error, relaxed);
5935 						}
5936 					} else {
5937 						vfs_drt_control(&scmap, 0); /* emit stats and free this memory. Dirty pages stay intact. */
5938 						scmap = NULL;
5939 					}
5940 				} else {
5941 					wbp->cl_scmap = scmap;
5942 				}
5943 			}
5944 
5945 			if (wbp->cl_sparse_wait && wbp->cl_sparse_pushes == 0) {
5946 				wakeup((caddr_t)&wbp->cl_sparse_pushes);
5947 			}
5948 		} else {
5949 			retval = sparse_cluster_push(wbp, &(wbp->cl_scmap), vp, ubc_getsize(vp), PUSH_ALL, flags, callback, callback_arg, FALSE);
5950 		}
5951 
5952 		local_err = retval;
5953 
5954 		if (err) {
5955 			*err = retval;
5956 		}
5957 		retval = 1;
5958 	} else {
5959 		retval = cluster_try_push(wbp, vp, ubc_getsize(vp), PUSH_ALL, flags, callback, callback_arg, &local_err, FALSE);
5960 		if (err) {
5961 			*err = local_err;
5962 		}
5963 	}
5964 	lck_mtx_unlock(&wbp->cl_lockw);
5965 
5966 	if (flags & IO_SYNC) {
5967 		(void)vnode_waitforwrites(vp, 0, 0, 0, "cluster_push");
5968 	}
5969 
5970 	if (my_sparse_wait) {
5971 		/*
5972 		 * I'm the owner of the serialization token
5973 		 * clear it and wakeup anyone that is waiting
5974 		 * for me to finish
5975 		 */
5976 		lck_mtx_lock(&wbp->cl_lockw);
5977 
5978 		wbp->cl_sparse_wait = 0;
5979 		wakeup((caddr_t)&wbp->cl_sparse_wait);
5980 
5981 		lck_mtx_unlock(&wbp->cl_lockw);
5982 	}
5983 	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_END,
5984 	    wbp->cl_scmap, wbp->cl_number, retval, local_err, 0);
5985 
5986 	return retval;
5987 }
5988 
5989 
5990 __private_extern__ void
cluster_release(struct ubc_info * ubc)5991 cluster_release(struct ubc_info *ubc)
5992 {
5993 	struct cl_writebehind *wbp;
5994 	struct cl_readahead   *rap;
5995 
5996 	if ((wbp = ubc->cl_wbehind)) {
5997 		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_START, ubc, wbp->cl_scmap, 0, 0, 0);
5998 
5999 		if (wbp->cl_scmap) {
6000 			vfs_drt_control(&(wbp->cl_scmap), 0);
6001 		}
6002 		lck_mtx_destroy(&wbp->cl_lockw, &cl_mtx_grp);
6003 		zfree(cl_wr_zone, wbp);
6004 		ubc->cl_wbehind = NULL;
6005 	} else {
6006 		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_START, ubc, 0, 0, 0, 0);
6007 	}
6008 
6009 	if ((rap = ubc->cl_rahead)) {
6010 		lck_mtx_destroy(&rap->cl_lockr, &cl_mtx_grp);
6011 		zfree(cl_rd_zone, rap);
6012 		ubc->cl_rahead  = NULL;
6013 	}
6014 
6015 	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_END, ubc, rap, wbp, 0, 0);
6016 }
6017 
6018 
6019 static int
cluster_try_push(struct cl_writebehind * wbp,vnode_t vp,off_t EOF,int push_flag,int io_flags,int (* callback)(buf_t,void *),void * callback_arg,int * err,boolean_t vm_initiated)6020 cluster_try_push(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int push_flag, int io_flags, int (*callback)(buf_t, void *), void *callback_arg, int *err, boolean_t vm_initiated)
6021 {
6022 	int cl_index;
6023 	int cl_index1;
6024 	int min_index;
6025 	int cl_len;
6026 	int cl_pushed = 0;
6027 	struct cl_wextent l_clusters[MAX_CLUSTERS];
6028 	u_int  max_cluster_pgcount;
6029 	int error = 0;
6030 
6031 	max_cluster_pgcount = MAX_CLUSTER_SIZE(vp) / PAGE_SIZE;
6032 	/*
6033 	 * the write behind context exists and has
6034 	 * already been locked...
6035 	 */
6036 	if (wbp->cl_number == 0) {
6037 		/*
6038 		 * no clusters to push
6039 		 * return number of empty slots
6040 		 */
6041 		return MAX_CLUSTERS;
6042 	}
6043 
6044 	/*
6045 	 * make a local 'sorted' copy of the clusters
6046 	 * and clear wbp->cl_number so that new clusters can
6047 	 * be developed
6048 	 */
6049 	for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
6050 		for (min_index = -1, cl_index1 = 0; cl_index1 < wbp->cl_number; cl_index1++) {
6051 			if (wbp->cl_clusters[cl_index1].b_addr == wbp->cl_clusters[cl_index1].e_addr) {
6052 				continue;
6053 			}
6054 			if (min_index == -1) {
6055 				min_index = cl_index1;
6056 			} else if (wbp->cl_clusters[cl_index1].b_addr < wbp->cl_clusters[min_index].b_addr) {
6057 				min_index = cl_index1;
6058 			}
6059 		}
6060 		if (min_index == -1) {
6061 			break;
6062 		}
6063 
6064 		l_clusters[cl_index].b_addr = wbp->cl_clusters[min_index].b_addr;
6065 		l_clusters[cl_index].e_addr = wbp->cl_clusters[min_index].e_addr;
6066 		l_clusters[cl_index].io_flags = wbp->cl_clusters[min_index].io_flags;
6067 
6068 		wbp->cl_clusters[min_index].b_addr = wbp->cl_clusters[min_index].e_addr;
6069 	}
6070 	wbp->cl_number = 0;
6071 
6072 	cl_len = cl_index;
6073 
6074 	/* skip switching to the sparse cluster mechanism if on diskimage */
6075 	if (((push_flag & PUSH_DELAY) && cl_len == MAX_CLUSTERS) &&
6076 	    !(vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV)) {
6077 		int   i;
6078 
6079 		/*
6080 		 * determine if we appear to be writing the file sequentially
6081 		 * if not, by returning without having pushed any clusters
6082 		 * we will cause this vnode to be pushed into the sparse cluster mechanism
6083 		 * used for managing more random I/O patterns
6084 		 *
6085 		 * we know that we've got all clusters currently in use and the next write doesn't fit into one of them...
6086 		 * that's why we're in try_push with PUSH_DELAY...
6087 		 *
6088 		 * check to make sure that all the clusters except the last one are 'full'... and that each cluster
6089 		 * is adjacent to the next (i.e. we're looking for sequential writes) they were sorted above
6090 		 * so we can just make a simple pass through, up to, but not including the last one...
6091 		 * note that e_addr is not inclusive, so it will be equal to the b_addr of the next cluster if they
6092 		 * are sequential
6093 		 *
6094 		 * we let the last one be partial as long as it was adjacent to the previous one...
6095 		 * we need to do this to deal with multi-threaded servers that might write an I/O or 2 out
6096 		 * of order... if this occurs at the tail of the last cluster, we don't want to fall into the sparse cluster world...
6097 		 */
6098 		for (i = 0; i < MAX_CLUSTERS - 1; i++) {
6099 			if ((l_clusters[i].e_addr - l_clusters[i].b_addr) != max_cluster_pgcount) {
6100 				goto dont_try;
6101 			}
6102 			if (l_clusters[i].e_addr != l_clusters[i + 1].b_addr) {
6103 				goto dont_try;
6104 			}
6105 		}
6106 	}
6107 	if (vm_initiated == TRUE) {
6108 		lck_mtx_unlock(&wbp->cl_lockw);
6109 	}
6110 
6111 	for (cl_index = 0; cl_index < cl_len; cl_index++) {
6112 		int     flags;
6113 		struct  cl_extent cl;
6114 		int retval;
6115 
6116 		flags = io_flags & (IO_PASSIVE | IO_CLOSE);
6117 
6118 		/*
6119 		 * try to push each cluster in turn...
6120 		 */
6121 		if (l_clusters[cl_index].io_flags & CLW_IONOCACHE) {
6122 			flags |= IO_NOCACHE;
6123 		}
6124 
6125 		if (l_clusters[cl_index].io_flags & CLW_IOPASSIVE) {
6126 			flags |= IO_PASSIVE;
6127 		}
6128 
6129 		if (push_flag & PUSH_SYNC) {
6130 			flags |= IO_SYNC;
6131 		}
6132 
6133 		cl.b_addr = l_clusters[cl_index].b_addr;
6134 		cl.e_addr = l_clusters[cl_index].e_addr;
6135 
6136 		retval = cluster_push_now(vp, &cl, EOF, flags, callback, callback_arg, vm_initiated);
6137 
6138 		if (retval == 0) {
6139 			cl_pushed++;
6140 
6141 			l_clusters[cl_index].b_addr = 0;
6142 			l_clusters[cl_index].e_addr = 0;
6143 		} else if (error == 0) {
6144 			error = retval;
6145 		}
6146 
6147 		if (!(push_flag & PUSH_ALL)) {
6148 			break;
6149 		}
6150 	}
6151 	if (vm_initiated == TRUE) {
6152 		lck_mtx_lock(&wbp->cl_lockw);
6153 	}
6154 
6155 	if (err) {
6156 		*err = error;
6157 	}
6158 
6159 dont_try:
6160 	if (cl_len > cl_pushed) {
6161 		/*
6162 		 * we didn't push all of the clusters, so
6163 		 * lets try to merge them back in to the vnode
6164 		 */
6165 		if ((MAX_CLUSTERS - wbp->cl_number) < (cl_len - cl_pushed)) {
6166 			/*
6167 			 * we picked up some new clusters while we were trying to
6168 			 * push the old ones... this can happen because I've dropped
6169 			 * the vnode lock... the sum of the
6170 			 * leftovers plus the new cluster count exceeds our ability
6171 			 * to represent them, so switch to the sparse cluster mechanism
6172 			 *
6173 			 * collect the active public clusters...
6174 			 */
6175 			sparse_cluster_switch(wbp, vp, EOF, callback, callback_arg, vm_initiated);
6176 
6177 			for (cl_index = 0, cl_index1 = 0; cl_index < cl_len; cl_index++) {
6178 				if (l_clusters[cl_index].b_addr == l_clusters[cl_index].e_addr) {
6179 					continue;
6180 				}
6181 				wbp->cl_clusters[cl_index1].b_addr = l_clusters[cl_index].b_addr;
6182 				wbp->cl_clusters[cl_index1].e_addr = l_clusters[cl_index].e_addr;
6183 				wbp->cl_clusters[cl_index1].io_flags = l_clusters[cl_index].io_flags;
6184 
6185 				cl_index1++;
6186 			}
6187 			/*
6188 			 * update the cluster count
6189 			 */
6190 			wbp->cl_number = cl_index1;
6191 
6192 			/*
6193 			 * and collect the original clusters that were moved into the
6194 			 * local storage for sorting purposes
6195 			 */
6196 			sparse_cluster_switch(wbp, vp, EOF, callback, callback_arg, vm_initiated);
6197 		} else {
6198 			/*
6199 			 * we've got room to merge the leftovers back in
6200 			 * just append them starting at the next 'hole'
6201 			 * represented by wbp->cl_number
6202 			 */
6203 			for (cl_index = 0, cl_index1 = wbp->cl_number; cl_index < cl_len; cl_index++) {
6204 				if (l_clusters[cl_index].b_addr == l_clusters[cl_index].e_addr) {
6205 					continue;
6206 				}
6207 
6208 				wbp->cl_clusters[cl_index1].b_addr = l_clusters[cl_index].b_addr;
6209 				wbp->cl_clusters[cl_index1].e_addr = l_clusters[cl_index].e_addr;
6210 				wbp->cl_clusters[cl_index1].io_flags = l_clusters[cl_index].io_flags;
6211 
6212 				cl_index1++;
6213 			}
6214 			/*
6215 			 * update the cluster count
6216 			 */
6217 			wbp->cl_number = cl_index1;
6218 		}
6219 	}
6220 	return MAX_CLUSTERS - wbp->cl_number;
6221 }
6222 
6223 
6224 
6225 static int
cluster_push_now(vnode_t vp,struct cl_extent * cl,off_t EOF,int flags,int (* callback)(buf_t,void *),void * callback_arg,boolean_t vm_initiated)6226 cluster_push_now(vnode_t vp, struct cl_extent *cl, off_t EOF, int flags,
6227     int (*callback)(buf_t, void *), void *callback_arg, boolean_t vm_initiated)
6228 {
6229 	upl_page_info_t *pl;
6230 	upl_t            upl;
6231 	vm_offset_t      upl_offset;
6232 	int              upl_size;
6233 	off_t            upl_f_offset;
6234 	int              pages_in_upl;
6235 	int              start_pg;
6236 	int              last_pg;
6237 	int              io_size;
6238 	int              io_flags;
6239 	int              upl_flags;
6240 	int              bflag;
6241 	int              size;
6242 	int              error = 0;
6243 	int              retval;
6244 	kern_return_t    kret;
6245 
6246 	if (flags & IO_PASSIVE) {
6247 		bflag = CL_PASSIVE;
6248 	} else {
6249 		bflag = 0;
6250 	}
6251 
6252 	if (flags & IO_SKIP_ENCRYPTION) {
6253 		bflag |= CL_ENCRYPTED;
6254 	}
6255 
6256 	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_START,
6257 	    (int)cl->b_addr, (int)cl->e_addr, (int)EOF, flags, 0);
6258 
6259 	if ((pages_in_upl = (int)(cl->e_addr - cl->b_addr)) == 0) {
6260 		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 0, 0, 0, 0);
6261 
6262 		return 0;
6263 	}
6264 	upl_size = pages_in_upl * PAGE_SIZE;
6265 	upl_f_offset = (off_t)(cl->b_addr * PAGE_SIZE_64);
6266 
6267 	if (upl_f_offset + upl_size >= EOF) {
6268 		if (upl_f_offset >= EOF) {
6269 			/*
6270 			 * must have truncated the file and missed
6271 			 * clearing a dangling cluster (i.e. it's completely
6272 			 * beyond the new EOF
6273 			 */
6274 			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 1, 0, 0, 0);
6275 
6276 			return 0;
6277 		}
6278 		size = (int)(EOF - upl_f_offset);
6279 
6280 		upl_size = (size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
6281 		pages_in_upl = upl_size / PAGE_SIZE;
6282 	} else {
6283 		size = upl_size;
6284 	}
6285 
6286 
6287 	if (vm_initiated) {
6288 		vnode_pageout(vp, NULL, (upl_offset_t)0, upl_f_offset, (upl_size_t)upl_size,
6289 		    UPL_MSYNC | UPL_VNODE_PAGER | UPL_KEEPCACHED, &error);
6290 
6291 		return error;
6292 	}
6293 	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_START, upl_size, size, 0, 0, 0);
6294 
6295 	/*
6296 	 * by asking for UPL_COPYOUT_FROM and UPL_RET_ONLY_DIRTY, we get the following desirable behavior
6297 	 *
6298 	 * - only pages that are currently dirty are returned... these are the ones we need to clean
6299 	 * - the hardware dirty bit is cleared when the page is gathered into the UPL... the software dirty bit is set
6300 	 * - if we have to abort the I/O for some reason, the software dirty bit is left set since we didn't clean the page
6301 	 * - when we commit the page, the software dirty bit is cleared... the hardware dirty bit is untouched so that if
6302 	 *   someone dirties this page while the I/O is in progress, we don't lose track of the new state
6303 	 *
6304 	 * when the I/O completes, we no longer ask for an explicit clear of the DIRTY state (either soft or hard)
6305 	 */
6306 
6307 	if ((vp->v_flag & VNOCACHE_DATA) || (flags & IO_NOCACHE)) {
6308 		upl_flags = UPL_COPYOUT_FROM | UPL_RET_ONLY_DIRTY | UPL_SET_LITE | UPL_WILL_BE_DUMPED;
6309 	} else {
6310 		upl_flags = UPL_COPYOUT_FROM | UPL_RET_ONLY_DIRTY | UPL_SET_LITE;
6311 	}
6312 
6313 	kret = ubc_create_upl_kernel(vp,
6314 	    upl_f_offset,
6315 	    upl_size,
6316 	    &upl,
6317 	    &pl,
6318 	    upl_flags,
6319 	    VM_KERN_MEMORY_FILE);
6320 	if (kret != KERN_SUCCESS) {
6321 		panic("cluster_push: failed to get pagelist");
6322 	}
6323 
6324 	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_END, upl, upl_f_offset, 0, 0, 0);
6325 
6326 	/*
6327 	 * since we only asked for the dirty pages back
6328 	 * it's possible that we may only get a few or even none, so...
6329 	 * before we start marching forward, we must make sure we know
6330 	 * where the last present page is in the UPL, otherwise we could
6331 	 * end up working with a freed upl due to the FREE_ON_EMPTY semantics
6332 	 * employed by commit_range and abort_range.
6333 	 */
6334 	for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--) {
6335 		if (upl_page_present(pl, last_pg)) {
6336 			break;
6337 		}
6338 	}
6339 	pages_in_upl = last_pg + 1;
6340 
6341 	if (pages_in_upl == 0) {
6342 		ubc_upl_abort(upl, 0);
6343 
6344 		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 2, 0, 0, 0);
6345 		return 0;
6346 	}
6347 
6348 	for (last_pg = 0; last_pg < pages_in_upl;) {
6349 		/*
6350 		 * find the next dirty page in the UPL
6351 		 * this will become the first page in the
6352 		 * next I/O to generate
6353 		 */
6354 		for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
6355 			if (upl_dirty_page(pl, start_pg)) {
6356 				break;
6357 			}
6358 			if (upl_page_present(pl, start_pg)) {
6359 				/*
6360 				 * RET_ONLY_DIRTY will return non-dirty 'precious' pages
6361 				 * just release these unchanged since we're not going
6362 				 * to steal them or change their state
6363 				 */
6364 				ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
6365 			}
6366 		}
6367 		if (start_pg >= pages_in_upl) {
6368 			/*
6369 			 * done... no more dirty pages to push
6370 			 */
6371 			break;
6372 		}
6373 		if (start_pg > last_pg) {
6374 			/*
6375 			 * skipped over some non-dirty pages
6376 			 */
6377 			size -= ((start_pg - last_pg) * PAGE_SIZE);
6378 		}
6379 
6380 		/*
6381 		 * find a range of dirty pages to write
6382 		 */
6383 		for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
6384 			if (!upl_dirty_page(pl, last_pg)) {
6385 				break;
6386 			}
6387 		}
6388 		upl_offset = start_pg * PAGE_SIZE;
6389 
6390 		io_size = min(size, (last_pg - start_pg) * PAGE_SIZE);
6391 
6392 		io_flags = CL_THROTTLE | CL_COMMIT | CL_AGE | bflag;
6393 
6394 		if (!(flags & IO_SYNC)) {
6395 			io_flags |= CL_ASYNC;
6396 		}
6397 
6398 		if (flags & IO_CLOSE) {
6399 			io_flags |= CL_CLOSE;
6400 		}
6401 
6402 		if (flags & IO_NOCACHE) {
6403 			io_flags |= CL_NOCACHE;
6404 		}
6405 
6406 		retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size,
6407 		    io_flags, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
6408 
6409 		if (error == 0 && retval) {
6410 			error = retval;
6411 		}
6412 
6413 		size -= io_size;
6414 	}
6415 	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 3, error, 0, 0);
6416 
6417 	return error;
6418 }
6419 
6420 
6421 /*
6422  * sparse_cluster_switch is called with the write behind lock held
6423  */
6424 static int
sparse_cluster_switch(struct cl_writebehind * wbp,vnode_t vp,off_t EOF,int (* callback)(buf_t,void *),void * callback_arg,boolean_t vm_initiated)6425 sparse_cluster_switch(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int (*callback)(buf_t, void *), void *callback_arg, boolean_t vm_initiated)
6426 {
6427 	int     cl_index;
6428 	int     error = 0;
6429 
6430 	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_START, kdebug_vnode(vp), wbp->cl_scmap, wbp->cl_number, 0, 0);
6431 
6432 	for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
6433 		int       flags;
6434 		struct cl_extent cl;
6435 
6436 		for (cl.b_addr = wbp->cl_clusters[cl_index].b_addr; cl.b_addr < wbp->cl_clusters[cl_index].e_addr; cl.b_addr++) {
6437 			if (ubc_page_op(vp, (off_t)(cl.b_addr * PAGE_SIZE_64), 0, NULL, &flags) == KERN_SUCCESS) {
6438 				if (flags & UPL_POP_DIRTY) {
6439 					cl.e_addr = cl.b_addr + 1;
6440 
6441 					error = sparse_cluster_add(wbp, &(wbp->cl_scmap), vp, &cl, EOF, callback, callback_arg, vm_initiated);
6442 
6443 					if (error) {
6444 						break;
6445 					}
6446 				}
6447 			}
6448 		}
6449 	}
6450 	wbp->cl_number -= cl_index;
6451 
6452 	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_END, kdebug_vnode(vp), wbp->cl_scmap, wbp->cl_number, error, 0);
6453 
6454 	return error;
6455 }
6456 
6457 
6458 /*
6459  * sparse_cluster_push must be called with the write-behind lock held if the scmap is
6460  * still associated with the write-behind context... however, if the scmap has been disassociated
6461  * from the write-behind context (the cluster_push case), the wb lock is not held
6462  */
6463 static int
sparse_cluster_push(struct cl_writebehind * wbp,void ** scmap,vnode_t vp,off_t EOF,int push_flag,int io_flags,int (* callback)(buf_t,void *),void * callback_arg,boolean_t vm_initiated)6464 sparse_cluster_push(struct cl_writebehind *wbp, void **scmap, vnode_t vp, off_t EOF, int push_flag,
6465     int io_flags, int (*callback)(buf_t, void *), void *callback_arg, boolean_t vm_initiated)
6466 {
6467 	struct cl_extent cl;
6468 	off_t           offset;
6469 	u_int           length;
6470 	void            *l_scmap;
6471 	int error = 0;
6472 
6473 	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_START, kdebug_vnode(vp), (*scmap), 0, push_flag, 0);
6474 
6475 	if (push_flag & PUSH_ALL) {
6476 		vfs_drt_control(scmap, 1);
6477 	}
6478 
6479 	l_scmap = *scmap;
6480 
6481 	for (;;) {
6482 		int retval;
6483 
6484 		if (vfs_drt_get_cluster(scmap, &offset, &length) != KERN_SUCCESS) {
6485 			/*
6486 			 * Not finding anything to push will return KERN_FAILURE.
6487 			 * Confusing since it isn't really a failure. But that's the
6488 			 * reason we don't set 'error' here like we do below.
6489 			 */
6490 			break;
6491 		}
6492 
6493 		if (vm_initiated == TRUE) {
6494 			lck_mtx_unlock(&wbp->cl_lockw);
6495 		}
6496 
6497 		cl.b_addr = (daddr64_t)(offset / PAGE_SIZE_64);
6498 		cl.e_addr = (daddr64_t)((offset + length) / PAGE_SIZE_64);
6499 
6500 		retval = cluster_push_now(vp, &cl, EOF, io_flags, callback, callback_arg, vm_initiated);
6501 		if (error == 0 && retval) {
6502 			error = retval;
6503 		}
6504 
6505 		if (vm_initiated == TRUE) {
6506 			lck_mtx_lock(&wbp->cl_lockw);
6507 
6508 			if (*scmap != l_scmap) {
6509 				break;
6510 			}
6511 		}
6512 
6513 		if (error) {
6514 			if (vfs_drt_mark_pages(scmap, offset, length, NULL) != KERN_SUCCESS) {
6515 				panic("Failed to restore dirty state on failure");
6516 			}
6517 
6518 			break;
6519 		}
6520 
6521 		if (!(push_flag & PUSH_ALL)) {
6522 			break;
6523 		}
6524 	}
6525 	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_END, kdebug_vnode(vp), (*scmap), error, 0, 0);
6526 
6527 	return error;
6528 }
6529 
6530 
6531 /*
6532  * sparse_cluster_add is called with the write behind lock held
6533  */
6534 static int
sparse_cluster_add(struct cl_writebehind * wbp,void ** scmap,vnode_t vp,struct cl_extent * cl,off_t EOF,int (* callback)(buf_t,void *),void * callback_arg,boolean_t vm_initiated)6535 sparse_cluster_add(struct cl_writebehind *wbp, void **scmap, vnode_t vp, struct cl_extent *cl, off_t EOF,
6536     int (*callback)(buf_t, void *), void *callback_arg, boolean_t vm_initiated)
6537 {
6538 	u_int   new_dirty;
6539 	u_int   length;
6540 	off_t   offset;
6541 	int     error = 0;
6542 	int     push_flag = 0; /* Is this a valid value? */
6543 
6544 	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_START, (*scmap), 0, cl->b_addr, (int)cl->e_addr, 0);
6545 
6546 	offset = (off_t)(cl->b_addr * PAGE_SIZE_64);
6547 	length = ((u_int)(cl->e_addr - cl->b_addr)) * PAGE_SIZE;
6548 
6549 	while (vfs_drt_mark_pages(scmap, offset, length, &new_dirty) != KERN_SUCCESS) {
6550 		/*
6551 		 * no room left in the map
6552 		 * only a partial update was done
6553 		 * push out some pages and try again
6554 		 */
6555 
6556 		if (vfs_get_scmap_push_behavior_internal(scmap, &push_flag)) {
6557 			push_flag = 0;
6558 		}
6559 
6560 		error = sparse_cluster_push(wbp, scmap, vp, EOF, push_flag, 0, callback, callback_arg, vm_initiated);
6561 
6562 		if (error) {
6563 			break;
6564 		}
6565 
6566 		offset += (new_dirty * PAGE_SIZE_64);
6567 		length -= (new_dirty * PAGE_SIZE);
6568 	}
6569 	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_END, kdebug_vnode(vp), (*scmap), error, 0, 0);
6570 
6571 	return error;
6572 }
6573 
6574 
6575 static int
cluster_align_phys_io(vnode_t vp,struct uio * uio,addr64_t usr_paddr,u_int32_t xsize,int flags,int (* callback)(buf_t,void *),void * callback_arg)6576 cluster_align_phys_io(vnode_t vp, struct uio *uio, addr64_t usr_paddr, u_int32_t xsize, int flags, int (*callback)(buf_t, void *), void *callback_arg)
6577 {
6578 	upl_page_info_t  *pl;
6579 	upl_t            upl;
6580 	addr64_t         ubc_paddr;
6581 	kern_return_t    kret;
6582 	int              error = 0;
6583 	int              did_read = 0;
6584 	int              abort_flags;
6585 	int              upl_flags;
6586 	int              bflag;
6587 
6588 	if (flags & IO_PASSIVE) {
6589 		bflag = CL_PASSIVE;
6590 	} else {
6591 		bflag = 0;
6592 	}
6593 
6594 	if (flags & IO_NOCACHE) {
6595 		bflag |= CL_NOCACHE;
6596 	}
6597 
6598 	upl_flags = UPL_SET_LITE;
6599 
6600 	if (!(flags & CL_READ)) {
6601 		/*
6602 		 * "write" operation:  let the UPL subsystem know
6603 		 * that we intend to modify the buffer cache pages
6604 		 * we're gathering.
6605 		 */
6606 		upl_flags |= UPL_WILL_MODIFY;
6607 	} else {
6608 		/*
6609 		 * indicate that there is no need to pull the
6610 		 * mapping for this page... we're only going
6611 		 * to read from it, not modify it.
6612 		 */
6613 		upl_flags |= UPL_FILE_IO;
6614 	}
6615 	kret = ubc_create_upl_kernel(vp,
6616 	    uio->uio_offset & ~PAGE_MASK_64,
6617 	    PAGE_SIZE,
6618 	    &upl,
6619 	    &pl,
6620 	    upl_flags,
6621 	    VM_KERN_MEMORY_FILE);
6622 
6623 	if (kret != KERN_SUCCESS) {
6624 		return EINVAL;
6625 	}
6626 
6627 	if (!upl_valid_page(pl, 0)) {
6628 		/*
6629 		 * issue a synchronous read to cluster_io
6630 		 */
6631 		error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE,
6632 		    CL_READ | bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
6633 		if (error) {
6634 			ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
6635 
6636 			return error;
6637 		}
6638 		did_read = 1;
6639 	}
6640 	ubc_paddr = ((addr64_t)upl_phys_page(pl, 0) << PAGE_SHIFT) + (addr64_t)(uio->uio_offset & PAGE_MASK_64);
6641 
6642 /*
6643  *	NOTE:  There is no prototype for the following in BSD. It, and the definitions
6644  *	of the defines for cppvPsrc, cppvPsnk, cppvFsnk, and cppvFsrc will be found in
6645  *	osfmk/ppc/mappings.h.  They are not included here because there appears to be no
6646  *	way to do so without exporting them to kexts as well.
6647  */
6648 	if (flags & CL_READ) {
6649 //		copypv(ubc_paddr, usr_paddr, xsize, cppvPsrc | cppvPsnk | cppvFsnk);	/* Copy physical to physical and flush the destination */
6650 		copypv(ubc_paddr, usr_paddr, xsize, 2 |        1 |        4);           /* Copy physical to physical and flush the destination */
6651 	} else {
6652 //		copypv(usr_paddr, ubc_paddr, xsize, cppvPsrc | cppvPsnk | cppvFsrc);	/* Copy physical to physical and flush the source */
6653 		copypv(usr_paddr, ubc_paddr, xsize, 2 |        1 |        8);           /* Copy physical to physical and flush the source */
6654 	}
6655 	if (!(flags & CL_READ) || (upl_valid_page(pl, 0) && upl_dirty_page(pl, 0))) {
6656 		/*
6657 		 * issue a synchronous write to cluster_io
6658 		 */
6659 		error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE,
6660 		    bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
6661 	}
6662 	if (error == 0) {
6663 		uio_update(uio, (user_size_t)xsize);
6664 	}
6665 
6666 	if (did_read) {
6667 		abort_flags = UPL_ABORT_FREE_ON_EMPTY;
6668 	} else {
6669 		abort_flags = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;
6670 	}
6671 
6672 	ubc_upl_abort_range(upl, 0, PAGE_SIZE, abort_flags);
6673 
6674 	return error;
6675 }
6676 
6677 int
cluster_copy_upl_data(struct uio * uio,upl_t upl,int upl_offset,int * io_resid)6678 cluster_copy_upl_data(struct uio *uio, upl_t upl, int upl_offset, int *io_resid)
6679 {
6680 	int       pg_offset;
6681 	int       pg_index;
6682 	int       csize;
6683 	int       segflg;
6684 	int       retval = 0;
6685 	int       xsize;
6686 	upl_page_info_t *pl;
6687 	int       dirty_count;
6688 
6689 	xsize = *io_resid;
6690 
6691 	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
6692 	    (int)uio->uio_offset, upl_offset, xsize, 0, 0);
6693 
6694 	segflg = uio->uio_segflg;
6695 
6696 	switch (segflg) {
6697 	case UIO_USERSPACE32:
6698 	case UIO_USERISPACE32:
6699 		uio->uio_segflg = UIO_PHYS_USERSPACE32;
6700 		break;
6701 
6702 	case UIO_USERSPACE:
6703 	case UIO_USERISPACE:
6704 		uio->uio_segflg = UIO_PHYS_USERSPACE;
6705 		break;
6706 
6707 	case UIO_USERSPACE64:
6708 	case UIO_USERISPACE64:
6709 		uio->uio_segflg = UIO_PHYS_USERSPACE64;
6710 		break;
6711 
6712 	case UIO_SYSSPACE:
6713 		uio->uio_segflg = UIO_PHYS_SYSSPACE;
6714 		break;
6715 	}
6716 	pl = ubc_upl_pageinfo(upl);
6717 
6718 	pg_index  = upl_offset / PAGE_SIZE;
6719 	pg_offset = upl_offset & PAGE_MASK;
6720 	csize     = min(PAGE_SIZE - pg_offset, xsize);
6721 
6722 	dirty_count = 0;
6723 	while (xsize && retval == 0) {
6724 		addr64_t  paddr;
6725 
6726 		paddr = ((addr64_t)upl_phys_page(pl, pg_index) << PAGE_SHIFT) + pg_offset;
6727 		if ((uio->uio_rw == UIO_WRITE) && (upl_dirty_page(pl, pg_index) == FALSE)) {
6728 			dirty_count++;
6729 		}
6730 
6731 		retval = uiomove64(paddr, csize, uio);
6732 
6733 		pg_index += 1;
6734 		pg_offset = 0;
6735 		xsize    -= csize;
6736 		csize     = min(PAGE_SIZE, xsize);
6737 	}
6738 	*io_resid = xsize;
6739 
6740 	uio->uio_segflg = segflg;
6741 
6742 	task_update_logical_writes(current_task(), (dirty_count * PAGE_SIZE), TASK_WRITE_DEFERRED, upl_lookup_vnode(upl));
6743 	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
6744 	    (int)uio->uio_offset, xsize, retval, segflg, 0);
6745 
6746 	return retval;
6747 }
6748 
6749 
6750 int
cluster_copy_ubc_data(vnode_t vp,struct uio * uio,int * io_resid,int mark_dirty)6751 cluster_copy_ubc_data(vnode_t vp, struct uio *uio, int *io_resid, int mark_dirty)
6752 {
6753 	return cluster_copy_ubc_data_internal(vp, uio, io_resid, mark_dirty, 1);
6754 }
6755 
6756 
6757 static int
cluster_copy_ubc_data_internal(vnode_t vp,struct uio * uio,int * io_resid,int mark_dirty,int take_reference)6758 cluster_copy_ubc_data_internal(vnode_t vp, struct uio *uio, int *io_resid, int mark_dirty, int take_reference)
6759 {
6760 	int       segflg;
6761 	int       io_size;
6762 	int       xsize;
6763 	int       start_offset;
6764 	int       retval = 0;
6765 	memory_object_control_t  control;
6766 
6767 	io_size = *io_resid;
6768 
6769 	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
6770 	    (int)uio->uio_offset, io_size, mark_dirty, take_reference, 0);
6771 
6772 	control = ubc_getobject(vp, UBC_FLAGS_NONE);
6773 
6774 	if (control == MEMORY_OBJECT_CONTROL_NULL) {
6775 		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
6776 		    (int)uio->uio_offset, io_size, retval, 3, 0);
6777 
6778 		return 0;
6779 	}
6780 	segflg = uio->uio_segflg;
6781 
6782 	switch (segflg) {
6783 	case UIO_USERSPACE32:
6784 	case UIO_USERISPACE32:
6785 		uio->uio_segflg = UIO_PHYS_USERSPACE32;
6786 		break;
6787 
6788 	case UIO_USERSPACE64:
6789 	case UIO_USERISPACE64:
6790 		uio->uio_segflg = UIO_PHYS_USERSPACE64;
6791 		break;
6792 
6793 	case UIO_USERSPACE:
6794 	case UIO_USERISPACE:
6795 		uio->uio_segflg = UIO_PHYS_USERSPACE;
6796 		break;
6797 
6798 	case UIO_SYSSPACE:
6799 		uio->uio_segflg = UIO_PHYS_SYSSPACE;
6800 		break;
6801 	}
6802 
6803 	if ((io_size = *io_resid)) {
6804 		start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
6805 		xsize = (int)uio_resid(uio);
6806 
6807 		retval = memory_object_control_uiomove(control, uio->uio_offset - start_offset, uio,
6808 		    start_offset, io_size, mark_dirty, take_reference);
6809 		xsize -= uio_resid(uio);
6810 		io_size -= xsize;
6811 	}
6812 	uio->uio_segflg = segflg;
6813 	*io_resid       = io_size;
6814 
6815 	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
6816 	    (int)uio->uio_offset, io_size, retval, 0x80000000 | segflg, 0);
6817 
6818 	return retval;
6819 }
6820 
6821 
6822 int
is_file_clean(vnode_t vp,off_t filesize)6823 is_file_clean(vnode_t vp, off_t filesize)
6824 {
6825 	off_t f_offset;
6826 	int   flags;
6827 	int   total_dirty = 0;
6828 
6829 	for (f_offset = 0; f_offset < filesize; f_offset += PAGE_SIZE_64) {
6830 		if (ubc_page_op(vp, f_offset, 0, NULL, &flags) == KERN_SUCCESS) {
6831 			if (flags & UPL_POP_DIRTY) {
6832 				total_dirty++;
6833 			}
6834 		}
6835 	}
6836 	if (total_dirty) {
6837 		return EINVAL;
6838 	}
6839 
6840 	return 0;
6841 }
6842 
6843 
6844 
6845 /*
6846  * Dirty region tracking/clustering mechanism.
6847  *
6848  * This code (vfs_drt_*) provides a mechanism for tracking and clustering
6849  * dirty regions within a larger space (file).  It is primarily intended to
6850  * support clustering in large files with many dirty areas.
6851  *
6852  * The implementation assumes that the dirty regions are pages.
6853  *
6854  * To represent dirty pages within the file, we store bit vectors in a
6855  * variable-size circular hash.
6856  */
6857 
6858 /*
6859  * Bitvector size.  This determines the number of pages we group in a
6860  * single hashtable entry.  Each hashtable entry is aligned to this
6861  * size within the file.
6862  */
6863 #define DRT_BITVECTOR_PAGES             ((1024 * 256) / PAGE_SIZE)
6864 
6865 /*
6866  * File offset handling.
6867  *
6868  * DRT_ADDRESS_MASK is dependent on DRT_BITVECTOR_PAGES;
6869  * the correct formula is  (~((DRT_BITVECTOR_PAGES * PAGE_SIZE) - 1))
6870  */
6871 #define DRT_ADDRESS_MASK                (~((DRT_BITVECTOR_PAGES * PAGE_SIZE) - 1))
6872 #define DRT_ALIGN_ADDRESS(addr)         ((addr) & DRT_ADDRESS_MASK)
6873 
6874 /*
6875  * Hashtable address field handling.
6876  *
6877  * The low-order bits of the hashtable address are used to conserve
6878  * space.
6879  *
6880  * DRT_HASH_COUNT_MASK must be large enough to store the range
6881  * 0-DRT_BITVECTOR_PAGES inclusive, as well as have one value
6882  * to indicate that the bucket is actually unoccupied.
6883  */
6884 #define DRT_HASH_GET_ADDRESS(scm, i)    ((scm)->scm_hashtable[(i)].dhe_control & DRT_ADDRESS_MASK)
6885 #define DRT_HASH_SET_ADDRESS(scm, i, a)                                                                 \
6886 	do {                                                                                            \
6887 	        (scm)->scm_hashtable[(i)].dhe_control =                                                 \
6888 	            ((scm)->scm_hashtable[(i)].dhe_control & ~DRT_ADDRESS_MASK) | DRT_ALIGN_ADDRESS(a); \
6889 	} while (0)
6890 #define DRT_HASH_COUNT_MASK             0x1ff
6891 #define DRT_HASH_GET_COUNT(scm, i)      ((scm)->scm_hashtable[(i)].dhe_control & DRT_HASH_COUNT_MASK)
6892 #define DRT_HASH_SET_COUNT(scm, i, c)                                                                                   \
6893 	do {                                                                                                            \
6894 	        (scm)->scm_hashtable[(i)].dhe_control =                                                                 \
6895 	            ((scm)->scm_hashtable[(i)].dhe_control & ~DRT_HASH_COUNT_MASK) | ((c) & DRT_HASH_COUNT_MASK);       \
6896 	} while (0)
6897 #define DRT_HASH_CLEAR(scm, i)                                                                                          \
6898 	do {                                                                                                            \
6899 	        (scm)->scm_hashtable[(i)].dhe_control =	0;                                                              \
6900 	} while (0)
6901 #define DRT_HASH_VACATE(scm, i)         DRT_HASH_SET_COUNT((scm), (i), DRT_HASH_COUNT_MASK)
6902 #define DRT_HASH_VACANT(scm, i)         (DRT_HASH_GET_COUNT((scm), (i)) == DRT_HASH_COUNT_MASK)
6903 #define DRT_HASH_COPY(oscm, oi, scm, i)                                                                 \
6904 	do {                                                                                            \
6905 	        (scm)->scm_hashtable[(i)].dhe_control = (oscm)->scm_hashtable[(oi)].dhe_control;        \
6906 	        DRT_BITVECTOR_COPY(oscm, oi, scm, i);                                                   \
6907 	} while(0);
6908 
6909 
6910 #if !defined(XNU_TARGET_OS_OSX)
6911 /*
6912  * Hash table moduli.
6913  *
6914  * Since the hashtable entry's size is dependent on the size of
6915  * the bitvector, and since the hashtable size is constrained to
6916  * both being prime and fitting within the desired allocation
6917  * size, these values need to be manually determined.
6918  *
6919  * For DRT_BITVECTOR_SIZE = 64, the entry size is 16 bytes.
6920  *
6921  * The small hashtable allocation is 4096 bytes, so the modulus is 251.
6922  * The large hashtable allocation is 32768 bytes, so the modulus is 2039.
6923  * The xlarge hashtable allocation is 131072 bytes, so the modulus is 8179.
6924  */
6925 
6926 #define DRT_HASH_SMALL_MODULUS  251
6927 #define DRT_HASH_LARGE_MODULUS  2039
6928 #define DRT_HASH_XLARGE_MODULUS  8179
6929 
6930 /*
6931  * Physical memory required before the large hash modulus is permitted.
6932  *
6933  * On small memory systems, the large hash modulus can lead to phsyical
6934  * memory starvation, so we avoid using it there.
6935  */
6936 #define DRT_HASH_LARGE_MEMORY_REQUIRED  (1024LL * 1024LL * 1024LL)      /* 1GiB */
6937 #define DRT_HASH_XLARGE_MEMORY_REQUIRED  (8 * 1024LL * 1024LL * 1024LL)  /* 8GiB */
6938 
6939 #define DRT_SMALL_ALLOCATION    4096    /* 80 bytes spare */
6940 #define DRT_LARGE_ALLOCATION    32768   /* 144 bytes spare */
6941 #define DRT_XLARGE_ALLOCATION    131072  /* 208 bytes spare */
6942 
6943 #else /* XNU_TARGET_OS_OSX */
6944 /*
6945  * Hash table moduli.
6946  *
6947  * Since the hashtable entry's size is dependent on the size of
6948  * the bitvector, and since the hashtable size is constrained to
6949  * both being prime and fitting within the desired allocation
6950  * size, these values need to be manually determined.
6951  *
6952  * For DRT_BITVECTOR_SIZE = 64, the entry size is 16 bytes.
6953  *
6954  * The small hashtable allocation is 16384 bytes, so the modulus is 1019.
6955  * The large hashtable allocation is 131072 bytes, so the modulus is 8179.
6956  * The xlarge hashtable allocation is 524288 bytes, so the modulus is 32749.
6957  */
6958 
6959 #define DRT_HASH_SMALL_MODULUS  1019
6960 #define DRT_HASH_LARGE_MODULUS  8179
6961 #define DRT_HASH_XLARGE_MODULUS  32749
6962 
6963 /*
6964  * Physical memory required before the large hash modulus is permitted.
6965  *
6966  * On small memory systems, the large hash modulus can lead to phsyical
6967  * memory starvation, so we avoid using it there.
6968  */
6969 #define DRT_HASH_LARGE_MEMORY_REQUIRED  (4 * 1024LL * 1024LL * 1024LL)  /* 4GiB */
6970 #define DRT_HASH_XLARGE_MEMORY_REQUIRED  (32 * 1024LL * 1024LL * 1024LL)  /* 32GiB */
6971 
6972 #define DRT_SMALL_ALLOCATION    16384   /* 80 bytes spare */
6973 #define DRT_LARGE_ALLOCATION    131072  /* 208 bytes spare */
6974 #define DRT_XLARGE_ALLOCATION   524288  /* 304 bytes spare */
6975 
6976 #endif /* ! XNU_TARGET_OS_OSX */
6977 
6978 /* *** nothing below here has secret dependencies on DRT_BITVECTOR_PAGES *** */
6979 
6980 /*
6981  * Hashtable entry.
6982  */
6983 struct vfs_drt_hashentry {
6984 	u_int64_t       dhe_control;
6985 /*
6986  * dhe_bitvector was declared as dhe_bitvector[DRT_BITVECTOR_PAGES / 32];
6987  * DRT_BITVECTOR_PAGES is defined as ((1024 * 256) / PAGE_SIZE)
6988  * Since PAGE_SIZE is only known at boot time,
6989  *	-define MAX_DRT_BITVECTOR_PAGES for smallest supported page size (4k)
6990  *	-declare dhe_bitvector array for largest possible length
6991  */
6992 #define MAX_DRT_BITVECTOR_PAGES (1024 * 256)/( 4 * 1024)
6993 	u_int32_t       dhe_bitvector[MAX_DRT_BITVECTOR_PAGES / 32];
6994 };
6995 
6996 /*
6997  * Hashtable bitvector handling.
6998  *
6999  * Bitvector fields are 32 bits long.
7000  */
7001 
7002 #define DRT_HASH_SET_BIT(scm, i, bit)                           \
7003 	(scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] |= (1 << ((bit) % 32))
7004 
7005 #define DRT_HASH_CLEAR_BIT(scm, i, bit)                         \
7006 	(scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] &= ~(1 << ((bit) % 32))
7007 
7008 #define DRT_HASH_TEST_BIT(scm, i, bit)                          \
7009 	((scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] & (1 << ((bit) % 32)))
7010 
7011 #define DRT_BITVECTOR_CLEAR(scm, i)                             \
7012 	bzero(&(scm)->scm_hashtable[(i)].dhe_bitvector[0], (MAX_DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t))
7013 
7014 #define DRT_BITVECTOR_COPY(oscm, oi, scm, i)                    \
7015 	bcopy(&(oscm)->scm_hashtable[(oi)].dhe_bitvector[0],    \
7016 	    &(scm)->scm_hashtable[(i)].dhe_bitvector[0],        \
7017 	    (MAX_DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t))
7018 
7019 /*
7020  * Dirty Region Tracking structure.
7021  *
7022  * The hashtable is allocated entirely inside the DRT structure.
7023  *
7024  * The hash is a simple circular prime modulus arrangement, the structure
7025  * is resized from small to large if it overflows.
7026  */
7027 
7028 struct vfs_drt_clustermap {
7029 	u_int32_t               scm_magic;      /* sanity/detection */
7030 #define DRT_SCM_MAGIC           0x12020003
7031 	u_int32_t               scm_modulus;    /* current ring size */
7032 	u_int32_t               scm_buckets;    /* number of occupied buckets */
7033 	u_int32_t               scm_lastclean;  /* last entry we cleaned */
7034 	u_int32_t               scm_iskips;     /* number of slot skips */
7035 
7036 	struct vfs_drt_hashentry scm_hashtable[0];
7037 };
7038 
7039 
7040 #define DRT_HASH(scm, addr)             ((addr) % (scm)->scm_modulus)
7041 #define DRT_HASH_NEXT(scm, addr)        (((addr) + 1) % (scm)->scm_modulus)
7042 
7043 /*
7044  * Debugging codes and arguments.
7045  */
7046 #define DRT_DEBUG_EMPTYFREE     (FSDBG_CODE(DBG_FSRW, 82)) /* nil */
7047 #define DRT_DEBUG_RETCLUSTER    (FSDBG_CODE(DBG_FSRW, 83)) /* offset, length */
7048 #define DRT_DEBUG_ALLOC         (FSDBG_CODE(DBG_FSRW, 84)) /* copycount */
7049 #define DRT_DEBUG_INSERT        (FSDBG_CODE(DBG_FSRW, 85)) /* offset, iskip */
7050 #define DRT_DEBUG_MARK          (FSDBG_CODE(DBG_FSRW, 86)) /* offset, length,
7051 	                                                    * dirty */
7052                                                            /* 0, setcount */
7053                                                            /* 1 (clean, no map) */
7054                                                            /* 2 (map alloc fail) */
7055                                                            /* 3, resid (partial) */
7056 #define DRT_DEBUG_6             (FSDBG_CODE(DBG_FSRW, 87))
7057 #define DRT_DEBUG_SCMDATA       (FSDBG_CODE(DBG_FSRW, 88)) /* modulus, buckets,
7058 	                                                    * lastclean, iskips */
7059 
7060 
7061 static kern_return_t    vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp);
7062 static kern_return_t    vfs_drt_free_map(struct vfs_drt_clustermap *cmap);
7063 static kern_return_t    vfs_drt_search_index(struct vfs_drt_clustermap *cmap,
7064     u_int64_t offset, int *indexp);
7065 static kern_return_t    vfs_drt_get_index(struct vfs_drt_clustermap **cmapp,
7066     u_int64_t offset,
7067     int *indexp,
7068     int recursed);
7069 static kern_return_t    vfs_drt_do_mark_pages(
7070 	void            **cmapp,
7071 	u_int64_t       offset,
7072 	u_int           length,
7073 	u_int           *setcountp,
7074 	int             dirty);
7075 static void             vfs_drt_trace(
7076 	struct vfs_drt_clustermap *cmap,
7077 	int code,
7078 	int arg1,
7079 	int arg2,
7080 	int arg3,
7081 	int arg4);
7082 
7083 
7084 /*
7085  * Allocate and initialise a sparse cluster map.
7086  *
7087  * Will allocate a new map, resize or compact an existing map.
7088  *
7089  * XXX we should probably have at least one intermediate map size,
7090  * as the 1:16 ratio seems a bit drastic.
7091  */
7092 static kern_return_t
vfs_drt_alloc_map(struct vfs_drt_clustermap ** cmapp)7093 vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp)
7094 {
7095 	struct vfs_drt_clustermap *cmap = NULL, *ocmap = NULL;
7096 	kern_return_t   kret = KERN_SUCCESS;
7097 	u_int64_t       offset = 0;
7098 	u_int32_t       i = 0;
7099 	int             modulus_size = 0, map_size = 0, active_buckets = 0, index = 0, copycount = 0;
7100 
7101 	ocmap = NULL;
7102 	if (cmapp != NULL) {
7103 		ocmap = *cmapp;
7104 	}
7105 
7106 	/*
7107 	 * Decide on the size of the new map.
7108 	 */
7109 	if (ocmap == NULL) {
7110 		modulus_size = DRT_HASH_SMALL_MODULUS;
7111 		map_size = DRT_SMALL_ALLOCATION;
7112 	} else {
7113 		/* count the number of active buckets in the old map */
7114 		active_buckets = 0;
7115 		for (i = 0; i < ocmap->scm_modulus; i++) {
7116 			if (!DRT_HASH_VACANT(ocmap, i) &&
7117 			    (DRT_HASH_GET_COUNT(ocmap, i) != 0)) {
7118 				active_buckets++;
7119 			}
7120 		}
7121 		/*
7122 		 * If we're currently using the small allocation, check to
7123 		 * see whether we should grow to the large one.
7124 		 */
7125 		if (ocmap->scm_modulus == DRT_HASH_SMALL_MODULUS) {
7126 			/*
7127 			 * If the ring is nearly full and we are allowed to
7128 			 * use the large modulus, upgrade.
7129 			 */
7130 			if ((active_buckets > (DRT_HASH_SMALL_MODULUS - 5)) &&
7131 			    (max_mem >= DRT_HASH_LARGE_MEMORY_REQUIRED)) {
7132 				modulus_size = DRT_HASH_LARGE_MODULUS;
7133 				map_size = DRT_LARGE_ALLOCATION;
7134 			} else {
7135 				modulus_size = DRT_HASH_SMALL_MODULUS;
7136 				map_size = DRT_SMALL_ALLOCATION;
7137 			}
7138 		} else if (ocmap->scm_modulus == DRT_HASH_LARGE_MODULUS) {
7139 			if ((active_buckets > (DRT_HASH_LARGE_MODULUS - 5)) &&
7140 			    (max_mem >= DRT_HASH_XLARGE_MEMORY_REQUIRED)) {
7141 				modulus_size = DRT_HASH_XLARGE_MODULUS;
7142 				map_size = DRT_XLARGE_ALLOCATION;
7143 			} else {
7144 				/*
7145 				 * If the ring is completely full and we can't
7146 				 * expand, there's nothing useful for us to do.
7147 				 * Behave as though we had compacted into the new
7148 				 * array and return.
7149 				 */
7150 				return KERN_SUCCESS;
7151 			}
7152 		} else {
7153 			/* already using the xlarge modulus */
7154 			modulus_size = DRT_HASH_XLARGE_MODULUS;
7155 			map_size = DRT_XLARGE_ALLOCATION;
7156 
7157 			/*
7158 			 * If the ring is completely full, there's
7159 			 * nothing useful for us to do.  Behave as
7160 			 * though we had compacted into the new
7161 			 * array and return.
7162 			 */
7163 			if (active_buckets >= DRT_HASH_XLARGE_MODULUS) {
7164 				return KERN_SUCCESS;
7165 			}
7166 		}
7167 	}
7168 
7169 	/*
7170 	 * Allocate and initialise the new map.
7171 	 */
7172 
7173 	kret = kmem_alloc(kernel_map, (vm_offset_t *)&cmap, map_size,
7174 	    KMA_DATA, VM_KERN_MEMORY_FILE);
7175 	if (kret != KERN_SUCCESS) {
7176 		return kret;
7177 	}
7178 	cmap->scm_magic = DRT_SCM_MAGIC;
7179 	cmap->scm_modulus = modulus_size;
7180 	cmap->scm_buckets = 0;
7181 	cmap->scm_lastclean = 0;
7182 	cmap->scm_iskips = 0;
7183 	for (i = 0; i < cmap->scm_modulus; i++) {
7184 		DRT_HASH_CLEAR(cmap, i);
7185 		DRT_HASH_VACATE(cmap, i);
7186 		DRT_BITVECTOR_CLEAR(cmap, i);
7187 	}
7188 
7189 	/*
7190 	 * If there's an old map, re-hash entries from it into the new map.
7191 	 */
7192 	copycount = 0;
7193 	if (ocmap != NULL) {
7194 		for (i = 0; i < ocmap->scm_modulus; i++) {
7195 			/* skip empty buckets */
7196 			if (DRT_HASH_VACANT(ocmap, i) ||
7197 			    (DRT_HASH_GET_COUNT(ocmap, i) == 0)) {
7198 				continue;
7199 			}
7200 			/* get new index */
7201 			offset = DRT_HASH_GET_ADDRESS(ocmap, i);
7202 			kret = vfs_drt_get_index(&cmap, offset, &index, 1);
7203 			if (kret != KERN_SUCCESS) {
7204 				/* XXX need to bail out gracefully here */
7205 				panic("vfs_drt: new cluster map mysteriously too small");
7206 				index = 0;
7207 			}
7208 			/* copy */
7209 			DRT_HASH_COPY(ocmap, i, cmap, index);
7210 			copycount++;
7211 		}
7212 	}
7213 
7214 	/* log what we've done */
7215 	vfs_drt_trace(cmap, DRT_DEBUG_ALLOC, copycount, 0, 0, 0);
7216 
7217 	/*
7218 	 * It's important to ensure that *cmapp always points to
7219 	 * a valid map, so we must overwrite it before freeing
7220 	 * the old map.
7221 	 */
7222 	*cmapp = cmap;
7223 	if (ocmap != NULL) {
7224 		/* emit stats into trace buffer */
7225 		vfs_drt_trace(ocmap, DRT_DEBUG_SCMDATA,
7226 		    ocmap->scm_modulus,
7227 		    ocmap->scm_buckets,
7228 		    ocmap->scm_lastclean,
7229 		    ocmap->scm_iskips);
7230 
7231 		vfs_drt_free_map(ocmap);
7232 	}
7233 	return KERN_SUCCESS;
7234 }
7235 
7236 
7237 /*
7238  * Free a sparse cluster map.
7239  */
7240 static kern_return_t
vfs_drt_free_map(struct vfs_drt_clustermap * cmap)7241 vfs_drt_free_map(struct vfs_drt_clustermap *cmap)
7242 {
7243 	vm_size_t map_size = 0;
7244 
7245 	if (cmap->scm_modulus == DRT_HASH_SMALL_MODULUS) {
7246 		map_size = DRT_SMALL_ALLOCATION;
7247 	} else if (cmap->scm_modulus == DRT_HASH_LARGE_MODULUS) {
7248 		map_size = DRT_LARGE_ALLOCATION;
7249 	} else if (cmap->scm_modulus == DRT_HASH_XLARGE_MODULUS) {
7250 		map_size = DRT_XLARGE_ALLOCATION;
7251 	} else {
7252 		panic("vfs_drt_free_map: Invalid modulus %d", cmap->scm_modulus);
7253 	}
7254 
7255 	kmem_free(kernel_map, (vm_offset_t)cmap, map_size);
7256 	return KERN_SUCCESS;
7257 }
7258 
7259 
7260 /*
7261  * Find the hashtable slot currently occupied by an entry for the supplied offset.
7262  */
7263 static kern_return_t
vfs_drt_search_index(struct vfs_drt_clustermap * cmap,u_int64_t offset,int * indexp)7264 vfs_drt_search_index(struct vfs_drt_clustermap *cmap, u_int64_t offset, int *indexp)
7265 {
7266 	int             index;
7267 	u_int32_t       i;
7268 
7269 	offset = DRT_ALIGN_ADDRESS(offset);
7270 	index = DRT_HASH(cmap, offset);
7271 
7272 	/* traverse the hashtable */
7273 	for (i = 0; i < cmap->scm_modulus; i++) {
7274 		/*
7275 		 * If the slot is vacant, we can stop.
7276 		 */
7277 		if (DRT_HASH_VACANT(cmap, index)) {
7278 			break;
7279 		}
7280 
7281 		/*
7282 		 * If the address matches our offset, we have success.
7283 		 */
7284 		if (DRT_HASH_GET_ADDRESS(cmap, index) == offset) {
7285 			*indexp = index;
7286 			return KERN_SUCCESS;
7287 		}
7288 
7289 		/*
7290 		 * Move to the next slot, try again.
7291 		 */
7292 		index = DRT_HASH_NEXT(cmap, index);
7293 	}
7294 	/*
7295 	 * It's not there.
7296 	 */
7297 	return KERN_FAILURE;
7298 }
7299 
7300 /*
7301  * Find the hashtable slot for the supplied offset.  If we haven't allocated
7302  * one yet, allocate one and populate the address field.  Note that it will
7303  * not have a nonzero page count and thus will still technically be free, so
7304  * in the case where we are called to clean pages, the slot will remain free.
7305  */
7306 static kern_return_t
vfs_drt_get_index(struct vfs_drt_clustermap ** cmapp,u_int64_t offset,int * indexp,int recursed)7307 vfs_drt_get_index(struct vfs_drt_clustermap **cmapp, u_int64_t offset, int *indexp, int recursed)
7308 {
7309 	struct vfs_drt_clustermap *cmap;
7310 	kern_return_t   kret;
7311 	u_int32_t       index;
7312 	u_int32_t       i;
7313 
7314 	cmap = *cmapp;
7315 
7316 	/* look for an existing entry */
7317 	kret = vfs_drt_search_index(cmap, offset, indexp);
7318 	if (kret == KERN_SUCCESS) {
7319 		return kret;
7320 	}
7321 
7322 	/* need to allocate an entry */
7323 	offset = DRT_ALIGN_ADDRESS(offset);
7324 	index = DRT_HASH(cmap, offset);
7325 
7326 	/* scan from the index forwards looking for a vacant slot */
7327 	for (i = 0; i < cmap->scm_modulus; i++) {
7328 		/* slot vacant? */
7329 		if (DRT_HASH_VACANT(cmap, index) || DRT_HASH_GET_COUNT(cmap, index) == 0) {
7330 			cmap->scm_buckets++;
7331 			if (index < cmap->scm_lastclean) {
7332 				cmap->scm_lastclean = index;
7333 			}
7334 			DRT_HASH_SET_ADDRESS(cmap, index, offset);
7335 			DRT_HASH_SET_COUNT(cmap, index, 0);
7336 			DRT_BITVECTOR_CLEAR(cmap, index);
7337 			*indexp = index;
7338 			vfs_drt_trace(cmap, DRT_DEBUG_INSERT, (int)offset, i, 0, 0);
7339 			return KERN_SUCCESS;
7340 		}
7341 		cmap->scm_iskips += i;
7342 		index = DRT_HASH_NEXT(cmap, index);
7343 	}
7344 
7345 	/*
7346 	 * We haven't found a vacant slot, so the map is full.  If we're not
7347 	 * already recursed, try reallocating/compacting it.
7348 	 */
7349 	if (recursed) {
7350 		return KERN_FAILURE;
7351 	}
7352 	kret = vfs_drt_alloc_map(cmapp);
7353 	if (kret == KERN_SUCCESS) {
7354 		/* now try to insert again */
7355 		kret = vfs_drt_get_index(cmapp, offset, indexp, 1);
7356 	}
7357 	return kret;
7358 }
7359 
7360 /*
7361  * Implementation of set dirty/clean.
7362  *
7363  * In the 'clean' case, not finding a map is OK.
7364  */
7365 static kern_return_t
vfs_drt_do_mark_pages(void ** private,u_int64_t offset,u_int length,u_int * setcountp,int dirty)7366 vfs_drt_do_mark_pages(
7367 	void            **private,
7368 	u_int64_t       offset,
7369 	u_int           length,
7370 	u_int           *setcountp,
7371 	int             dirty)
7372 {
7373 	struct vfs_drt_clustermap *cmap, **cmapp;
7374 	kern_return_t   kret;
7375 	int             i, index, pgoff, pgcount, setcount, ecount;
7376 
7377 	cmapp = (struct vfs_drt_clustermap **)private;
7378 	cmap = *cmapp;
7379 
7380 	vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_START, (int)offset, (int)length, dirty, 0);
7381 
7382 	if (setcountp != NULL) {
7383 		*setcountp = 0;
7384 	}
7385 
7386 	/* allocate a cluster map if we don't already have one */
7387 	if (cmap == NULL) {
7388 		/* no cluster map, nothing to clean */
7389 		if (!dirty) {
7390 			vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 1, 0, 0, 0);
7391 			return KERN_SUCCESS;
7392 		}
7393 		kret = vfs_drt_alloc_map(cmapp);
7394 		if (kret != KERN_SUCCESS) {
7395 			vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 2, 0, 0, 0);
7396 			return kret;
7397 		}
7398 	}
7399 	setcount = 0;
7400 
7401 	/*
7402 	 * Iterate over the length of the region.
7403 	 */
7404 	while (length > 0) {
7405 		/*
7406 		 * Get the hashtable index for this offset.
7407 		 *
7408 		 * XXX this will add blank entries if we are clearing a range
7409 		 * that hasn't been dirtied.
7410 		 */
7411 		kret = vfs_drt_get_index(cmapp, offset, &index, 0);
7412 		cmap = *cmapp;  /* may have changed! */
7413 		/* this may be a partial-success return */
7414 		if (kret != KERN_SUCCESS) {
7415 			if (setcountp != NULL) {
7416 				*setcountp = setcount;
7417 			}
7418 			vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 3, (int)length, 0, 0);
7419 
7420 			return kret;
7421 		}
7422 
7423 		/*
7424 		 * Work out how many pages we're modifying in this
7425 		 * hashtable entry.
7426 		 */
7427 		pgoff = (int)((offset - DRT_ALIGN_ADDRESS(offset)) / PAGE_SIZE);
7428 		pgcount = min((length / PAGE_SIZE), (DRT_BITVECTOR_PAGES - pgoff));
7429 
7430 		/*
7431 		 * Iterate over pages, dirty/clearing as we go.
7432 		 */
7433 		ecount = DRT_HASH_GET_COUNT(cmap, index);
7434 		for (i = 0; i < pgcount; i++) {
7435 			if (dirty) {
7436 				if (!DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) {
7437 					if (ecount >= DRT_BITVECTOR_PAGES) {
7438 						panic("ecount >= DRT_BITVECTOR_PAGES, cmap = %p, index = %d, bit = %d", cmap, index, pgoff + i);
7439 					}
7440 					DRT_HASH_SET_BIT(cmap, index, pgoff + i);
7441 					ecount++;
7442 					setcount++;
7443 				}
7444 			} else {
7445 				if (DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) {
7446 					if (ecount <= 0) {
7447 						panic("ecount <= 0, cmap = %p, index = %d, bit = %d", cmap, index, pgoff + i);
7448 					}
7449 					assert(ecount > 0);
7450 					DRT_HASH_CLEAR_BIT(cmap, index, pgoff + i);
7451 					ecount--;
7452 					setcount++;
7453 				}
7454 			}
7455 		}
7456 		DRT_HASH_SET_COUNT(cmap, index, ecount);
7457 
7458 		offset += pgcount * PAGE_SIZE;
7459 		length -= pgcount * PAGE_SIZE;
7460 	}
7461 	if (setcountp != NULL) {
7462 		*setcountp = setcount;
7463 	}
7464 
7465 	vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 0, setcount, 0, 0);
7466 
7467 	return KERN_SUCCESS;
7468 }
7469 
7470 /*
7471  * Mark a set of pages as dirty/clean.
7472  *
7473  * This is a public interface.
7474  *
7475  * cmapp
7476  *	Pointer to storage suitable for holding a pointer.  Note that
7477  *	this must either be NULL or a value set by this function.
7478  *
7479  * size
7480  *	Current file size in bytes.
7481  *
7482  * offset
7483  *	Offset of the first page to be marked as dirty, in bytes.  Must be
7484  *	page-aligned.
7485  *
7486  * length
7487  *	Length of dirty region, in bytes.  Must be a multiple of PAGE_SIZE.
7488  *
7489  * setcountp
7490  *	Number of pages newly marked dirty by this call (optional).
7491  *
7492  * Returns KERN_SUCCESS if all the pages were successfully marked.
7493  */
7494 static kern_return_t
vfs_drt_mark_pages(void ** cmapp,off_t offset,u_int length,u_int * setcountp)7495 vfs_drt_mark_pages(void **cmapp, off_t offset, u_int length, u_int *setcountp)
7496 {
7497 	/* XXX size unused, drop from interface */
7498 	return vfs_drt_do_mark_pages(cmapp, offset, length, setcountp, 1);
7499 }
7500 
7501 #if 0
7502 static kern_return_t
7503 vfs_drt_unmark_pages(void **cmapp, off_t offset, u_int length)
7504 {
7505 	return vfs_drt_do_mark_pages(cmapp, offset, length, NULL, 0);
7506 }
7507 #endif
7508 
7509 /*
7510  * Get a cluster of dirty pages.
7511  *
7512  * This is a public interface.
7513  *
7514  * cmapp
7515  *	Pointer to storage managed by drt_mark_pages.  Note that this must
7516  *	be NULL or a value set by drt_mark_pages.
7517  *
7518  * offsetp
7519  *	Returns the byte offset into the file of the first page in the cluster.
7520  *
7521  * lengthp
7522  *	Returns the length in bytes of the cluster of dirty pages.
7523  *
7524  * Returns success if a cluster was found.  If KERN_FAILURE is returned, there
7525  * are no dirty pages meeting the minmum size criteria.  Private storage will
7526  * be released if there are no more dirty pages left in the map
7527  *
7528  */
7529 static kern_return_t
vfs_drt_get_cluster(void ** cmapp,off_t * offsetp,u_int * lengthp)7530 vfs_drt_get_cluster(void **cmapp, off_t *offsetp, u_int *lengthp)
7531 {
7532 	struct vfs_drt_clustermap *cmap;
7533 	u_int64_t       offset;
7534 	u_int           length;
7535 	u_int32_t       j;
7536 	int             index, i, fs, ls;
7537 
7538 	/* sanity */
7539 	if ((cmapp == NULL) || (*cmapp == NULL)) {
7540 		return KERN_FAILURE;
7541 	}
7542 	cmap = *cmapp;
7543 
7544 	/* walk the hashtable */
7545 	for (offset = 0, j = 0; j < cmap->scm_modulus; offset += (DRT_BITVECTOR_PAGES * PAGE_SIZE), j++) {
7546 		index = DRT_HASH(cmap, offset);
7547 
7548 		if (DRT_HASH_VACANT(cmap, index) || (DRT_HASH_GET_COUNT(cmap, index) == 0)) {
7549 			continue;
7550 		}
7551 
7552 		/* scan the bitfield for a string of bits */
7553 		fs = -1;
7554 
7555 		for (i = 0; i < DRT_BITVECTOR_PAGES; i++) {
7556 			if (DRT_HASH_TEST_BIT(cmap, index, i)) {
7557 				fs = i;
7558 				break;
7559 			}
7560 		}
7561 		if (fs == -1) {
7562 			/*  didn't find any bits set */
7563 			panic("vfs_drt: entry summary count > 0 but no bits set in map, cmap = %p, index = %d, count = %lld",
7564 			    cmap, index, DRT_HASH_GET_COUNT(cmap, index));
7565 		}
7566 		for (ls = 0; i < DRT_BITVECTOR_PAGES; i++, ls++) {
7567 			if (!DRT_HASH_TEST_BIT(cmap, index, i)) {
7568 				break;
7569 			}
7570 		}
7571 
7572 		/* compute offset and length, mark pages clean */
7573 		offset = DRT_HASH_GET_ADDRESS(cmap, index) + (PAGE_SIZE * fs);
7574 		length = ls * PAGE_SIZE;
7575 		vfs_drt_do_mark_pages(cmapp, offset, length, NULL, 0);
7576 		cmap->scm_lastclean = index;
7577 
7578 		/* return successful */
7579 		*offsetp = (off_t)offset;
7580 		*lengthp = length;
7581 
7582 		vfs_drt_trace(cmap, DRT_DEBUG_RETCLUSTER, (int)offset, (int)length, 0, 0);
7583 		return KERN_SUCCESS;
7584 	}
7585 	/*
7586 	 * We didn't find anything... hashtable is empty
7587 	 * emit stats into trace buffer and
7588 	 * then free it
7589 	 */
7590 	vfs_drt_trace(cmap, DRT_DEBUG_SCMDATA,
7591 	    cmap->scm_modulus,
7592 	    cmap->scm_buckets,
7593 	    cmap->scm_lastclean,
7594 	    cmap->scm_iskips);
7595 
7596 	vfs_drt_free_map(cmap);
7597 	*cmapp = NULL;
7598 
7599 	return KERN_FAILURE;
7600 }
7601 
7602 
7603 static kern_return_t
vfs_drt_control(void ** cmapp,int op_type)7604 vfs_drt_control(void **cmapp, int op_type)
7605 {
7606 	struct vfs_drt_clustermap *cmap;
7607 
7608 	/* sanity */
7609 	if ((cmapp == NULL) || (*cmapp == NULL)) {
7610 		return KERN_FAILURE;
7611 	}
7612 	cmap = *cmapp;
7613 
7614 	switch (op_type) {
7615 	case 0:
7616 		/* emit stats into trace buffer */
7617 		vfs_drt_trace(cmap, DRT_DEBUG_SCMDATA,
7618 		    cmap->scm_modulus,
7619 		    cmap->scm_buckets,
7620 		    cmap->scm_lastclean,
7621 		    cmap->scm_iskips);
7622 
7623 		vfs_drt_free_map(cmap);
7624 		*cmapp = NULL;
7625 		break;
7626 
7627 	case 1:
7628 		cmap->scm_lastclean = 0;
7629 		break;
7630 	}
7631 	return KERN_SUCCESS;
7632 }
7633 
7634 
7635 
7636 /*
7637  * Emit a summary of the state of the clustermap into the trace buffer
7638  * along with some caller-provided data.
7639  */
7640 #if KDEBUG
7641 static void
vfs_drt_trace(__unused struct vfs_drt_clustermap * cmap,int code,int arg1,int arg2,int arg3,int arg4)7642 vfs_drt_trace(__unused struct vfs_drt_clustermap *cmap, int code, int arg1, int arg2, int arg3, int arg4)
7643 {
7644 	KERNEL_DEBUG(code, arg1, arg2, arg3, arg4, 0);
7645 }
7646 #else
7647 static void
vfs_drt_trace(__unused struct vfs_drt_clustermap * cmap,__unused int code,__unused int arg1,__unused int arg2,__unused int arg3,__unused int arg4)7648 vfs_drt_trace(__unused struct vfs_drt_clustermap *cmap, __unused int code,
7649     __unused int arg1, __unused int arg2, __unused int arg3,
7650     __unused int arg4)
7651 {
7652 }
7653 #endif
7654 
7655 #if 0
7656 /*
7657  * Perform basic sanity check on the hash entry summary count
7658  * vs. the actual bits set in the entry.
7659  */
7660 static void
7661 vfs_drt_sanity(struct vfs_drt_clustermap *cmap)
7662 {
7663 	int index, i;
7664 	int bits_on;
7665 
7666 	for (index = 0; index < cmap->scm_modulus; index++) {
7667 		if (DRT_HASH_VACANT(cmap, index)) {
7668 			continue;
7669 		}
7670 
7671 		for (bits_on = 0, i = 0; i < DRT_BITVECTOR_PAGES; i++) {
7672 			if (DRT_HASH_TEST_BIT(cmap, index, i)) {
7673 				bits_on++;
7674 			}
7675 		}
7676 		if (bits_on != DRT_HASH_GET_COUNT(cmap, index)) {
7677 			panic("bits_on = %d,  index = %d", bits_on, index);
7678 		}
7679 	}
7680 }
7681 #endif
7682 
7683 /*
7684  * Internal interface only.
7685  */
7686 static kern_return_t
vfs_get_scmap_push_behavior_internal(void ** cmapp,int * push_flag)7687 vfs_get_scmap_push_behavior_internal(void **cmapp, int *push_flag)
7688 {
7689 	struct vfs_drt_clustermap *cmap;
7690 
7691 	/* sanity */
7692 	if ((cmapp == NULL) || (*cmapp == NULL) || (push_flag == NULL)) {
7693 		return KERN_FAILURE;
7694 	}
7695 	cmap = *cmapp;
7696 
7697 	if (cmap->scm_modulus == DRT_HASH_XLARGE_MODULUS) {
7698 		/*
7699 		 * If we have a full xlarge sparse cluster,
7700 		 * we push it out all at once so the cluster
7701 		 * map can be available to absorb more I/Os.
7702 		 * This is done on large memory configs so
7703 		 * the small I/Os don't interfere with the
7704 		 * pro workloads.
7705 		 */
7706 		*push_flag = PUSH_ALL;
7707 	}
7708 	return KERN_SUCCESS;
7709 }
7710