1 /*
2 * Copyright (c) 2000-2020 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29 /*
30 * Copyright (c) 1993
31 * The Regents of the University of California. All rights reserved.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
41 * 3. All advertising materials mentioning features or use of this software
42 * must display the following acknowledgement:
43 * This product includes software developed by the University of
44 * California, Berkeley and its contributors.
45 * 4. Neither the name of the University nor the names of its contributors
46 * may be used to endorse or promote products derived from this software
47 * without specific prior written permission.
48 *
49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59 * SUCH DAMAGE.
60 *
61 * @(#)vfs_cluster.c 8.10 (Berkeley) 3/28/95
62 */
63
64 #include <sys/param.h>
65 #include <sys/proc_internal.h>
66 #include <sys/buf_internal.h>
67 #include <sys/mount_internal.h>
68 #include <sys/vnode_internal.h>
69 #include <sys/trace.h>
70 #include <kern/kalloc.h>
71 #include <sys/time.h>
72 #include <sys/kernel.h>
73 #include <sys/resourcevar.h>
74 #include <miscfs/specfs/specdev.h>
75 #include <sys/uio_internal.h>
76 #include <libkern/libkern.h>
77 #include <machine/machine_routines.h>
78
79 #include <sys/ubc_internal.h>
80 #include <vm/vnode_pager.h>
81
82 #include <mach/mach_types.h>
83 #include <mach/memory_object_types.h>
84 #include <mach/vm_map.h>
85 #include <mach/upl.h>
86 #include <kern/task.h>
87 #include <kern/policy_internal.h>
88
89 #include <vm/vm_kern.h>
90 #include <vm/vm_map.h>
91 #include <vm/vm_pageout.h>
92 #include <vm/vm_fault.h>
93
94 #include <sys/kdebug.h>
95 #include <sys/kdebug_triage.h>
96 #include <libkern/OSAtomic.h>
97
98 #include <sys/sdt.h>
99
100 #include <stdbool.h>
101
102 #include <vfs/vfs_disk_conditioner.h>
103
104 #if 0
105 #undef KERNEL_DEBUG
106 #define KERNEL_DEBUG KERNEL_DEBUG_CONSTANT
107 #endif
108
109
110 #define CL_READ 0x01
111 #define CL_WRITE 0x02
112 #define CL_ASYNC 0x04
113 #define CL_COMMIT 0x08
114 #define CL_PAGEOUT 0x10
115 #define CL_AGE 0x20
116 #define CL_NOZERO 0x40
117 #define CL_PAGEIN 0x80
118 #define CL_DEV_MEMORY 0x100
119 #define CL_PRESERVE 0x200
120 #define CL_THROTTLE 0x400
121 #define CL_KEEPCACHED 0x800
122 #define CL_DIRECT_IO 0x1000
123 #define CL_PASSIVE 0x2000
124 #define CL_IOSTREAMING 0x4000
125 #define CL_CLOSE 0x8000
126 #define CL_ENCRYPTED 0x10000
127 #define CL_RAW_ENCRYPTED 0x20000
128 #define CL_NOCACHE 0x40000
129 #define CL_DIRECT_IO_FSBLKSZ 0x80000
130
131 #define MAX_VECTOR_UPL_SIZE (2 * MAX_UPL_SIZE_BYTES)
132
133 #define CLUSTER_IO_WAITING ((buf_t)1)
134
135 extern upl_t vector_upl_create(vm_offset_t, uint32_t);
136 extern upl_size_t vector_upl_get_size(const upl_t);
137 extern uint32_t vector_upl_max_upls(upl_t);
138 extern boolean_t vector_upl_is_valid(upl_t);
139 extern boolean_t vector_upl_set_subupl(upl_t, upl_t, u_int32_t);
140 extern void vector_upl_set_pagelist(upl_t);
141 extern void vector_upl_set_iostate(upl_t, upl_t, vm_offset_t, u_int32_t);
142
143 struct clios {
144 lck_mtx_t io_mtxp;
145 u_int io_completed; /* amount of io that has currently completed */
146 u_int io_issued; /* amount of io that was successfully issued */
147 int io_error; /* error code of first error encountered */
148 int io_wanted; /* someone is sleeping waiting for a change in state */
149 };
150
151 struct cl_direct_read_lock {
152 LIST_ENTRY(cl_direct_read_lock) chain;
153 int32_t ref_count;
154 vnode_t vp;
155 lck_rw_t rw_lock;
156 };
157
158 #define CL_DIRECT_READ_LOCK_BUCKETS 61
159
160 static LIST_HEAD(cl_direct_read_locks, cl_direct_read_lock)
161 cl_direct_read_locks[CL_DIRECT_READ_LOCK_BUCKETS];
162
163 static LCK_GRP_DECLARE(cl_mtx_grp, "cluster I/O");
164 static LCK_MTX_DECLARE(cl_transaction_mtxp, &cl_mtx_grp);
165 static LCK_SPIN_DECLARE(cl_direct_read_spin_lock, &cl_mtx_grp);
166
167 static ZONE_DEFINE(cl_rd_zone, "cluster_read",
168 sizeof(struct cl_readahead), ZC_ZFREE_CLEARMEM);
169
170 static ZONE_DEFINE(cl_wr_zone, "cluster_write",
171 sizeof(struct cl_writebehind), ZC_ZFREE_CLEARMEM);
172
173 #define IO_UNKNOWN 0
174 #define IO_DIRECT 1
175 #define IO_CONTIG 2
176 #define IO_COPY 3
177
178 #define PUSH_DELAY 0x01
179 #define PUSH_ALL 0x02
180 #define PUSH_SYNC 0x04
181
182
183 static void cluster_EOT(buf_t cbp_head, buf_t cbp_tail, int zero_offset, size_t verify_block_size);
184 static void cluster_wait_IO(buf_t cbp_head, int async);
185 static void cluster_complete_transaction(buf_t *cbp_head, void *callback_arg, int *retval, int flags, int needwait);
186
187 static int cluster_io_type(struct uio *uio, int *io_type, u_int32_t *io_length, u_int32_t min_length);
188
189 static int cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int non_rounded_size,
190 int flags, buf_t real_bp, struct clios *iostate, int (*)(buf_t, void *), void *callback_arg);
191 static int cluster_iodone(buf_t bp, void *callback_arg);
192 static int cluster_ioerror(upl_t upl, int upl_offset, int abort_size, int error, int io_flags, vnode_t vp);
193 static int cluster_is_throttled(vnode_t vp);
194
195 static void cluster_iostate_wait(struct clios *iostate, u_int target, const char *wait_name);
196
197 static void cluster_syncup(vnode_t vp, off_t newEOF, int (*)(buf_t, void *), void *callback_arg, int flags);
198
199 static void cluster_read_upl_release(upl_t upl, int start_pg, int last_pg, int take_reference);
200 static int cluster_copy_ubc_data_internal(vnode_t vp, struct uio *uio, int *io_resid, int mark_dirty, int take_reference);
201
202 static int cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t filesize, int flags,
203 int (*)(buf_t, void *), void *callback_arg) __attribute__((noinline));
204 static int cluster_read_direct(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t *read_length,
205 int flags, int (*)(buf_t, void *), void *callback_arg) __attribute__((noinline));
206 static int cluster_read_contig(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t *read_length,
207 int (*)(buf_t, void *), void *callback_arg, int flags) __attribute__((noinline));
208
209 static int cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t oldEOF, off_t newEOF,
210 off_t headOff, off_t tailOff, int flags, int (*)(buf_t, void *), void *callback_arg) __attribute__((noinline));
211 static int cluster_write_direct(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, int *write_type, u_int32_t *write_length,
212 int flags, int (*callback)(buf_t, void *), void *callback_arg, uint32_t min_io_size) __attribute__((noinline));
213 static int cluster_write_contig(vnode_t vp, struct uio *uio, off_t newEOF,
214 int *write_type, u_int32_t *write_length, int (*)(buf_t, void *), void *callback_arg, int bflag) __attribute__((noinline));
215
216 static void cluster_update_state_internal(vnode_t vp, struct cl_extent *cl, int flags, boolean_t defer_writes, boolean_t *first_pass,
217 off_t write_off, int write_cnt, off_t newEOF, int (*callback)(buf_t, void *), void *callback_arg, boolean_t vm_initiated);
218
219 static int cluster_align_phys_io(vnode_t vp, struct uio *uio, addr64_t usr_paddr, u_int32_t xsize, int flags, int (*)(buf_t, void *), void *callback_arg);
220
221 static int cluster_read_prefetch(vnode_t vp, off_t f_offset, u_int size, off_t filesize, int (*callback)(buf_t, void *), void *callback_arg, int bflag);
222 static void cluster_read_ahead(vnode_t vp, struct cl_extent *extent, off_t filesize, struct cl_readahead *ra,
223 int (*callback)(buf_t, void *), void *callback_arg, int bflag);
224
225 static int cluster_push_now(vnode_t vp, struct cl_extent *, off_t EOF, int flags, int (*)(buf_t, void *), void *callback_arg, boolean_t vm_ioitiated);
226
227 static int cluster_try_push(struct cl_writebehind *, vnode_t vp, off_t EOF, int push_flag, int flags, int (*)(buf_t, void *),
228 void *callback_arg, int *err, boolean_t vm_initiated);
229
230 static int sparse_cluster_switch(struct cl_writebehind *, vnode_t vp, off_t EOF, int (*)(buf_t, void *), void *callback_arg, boolean_t vm_initiated);
231 static int sparse_cluster_push(struct cl_writebehind *, void **cmapp, vnode_t vp, off_t EOF, int push_flag,
232 int io_flags, int (*)(buf_t, void *), void *callback_arg, boolean_t vm_initiated);
233 static int sparse_cluster_add(struct cl_writebehind *, void **cmapp, vnode_t vp, struct cl_extent *, off_t EOF,
234 int (*)(buf_t, void *), void *callback_arg, boolean_t vm_initiated);
235
236 static kern_return_t vfs_drt_mark_pages(void **cmapp, off_t offset, u_int length, u_int *setcountp);
237 static kern_return_t vfs_drt_get_cluster(void **cmapp, off_t *offsetp, u_int *lengthp);
238 static kern_return_t vfs_drt_control(void **cmapp, int op_type);
239 static kern_return_t vfs_get_scmap_push_behavior_internal(void **cmapp, int *push_flag);
240
241
242 /*
243 * For throttled IO to check whether
244 * a block is cached by the boot cache
245 * and thus it can avoid delaying the IO.
246 *
247 * bootcache_contains_block is initially
248 * NULL. The BootCache will set it while
249 * the cache is active and clear it when
250 * the cache is jettisoned.
251 *
252 * Returns 0 if the block is not
253 * contained in the cache, 1 if it is
254 * contained.
255 *
256 * The function pointer remains valid
257 * after the cache has been evicted even
258 * if bootcache_contains_block has been
259 * cleared.
260 *
261 * See rdar://9974130 The new throttling mechanism breaks the boot cache for throttled IOs
262 */
263 int (*bootcache_contains_block)(dev_t device, u_int64_t blkno) = NULL;
264
265
266 /*
267 * limit the internal I/O size so that we
268 * can represent it in a 32 bit int
269 */
270 #define MAX_IO_REQUEST_SIZE (1024 * 1024 * 512)
271 #define MAX_IO_CONTIG_SIZE MAX_UPL_SIZE_BYTES
272 #define MAX_VECTS 16
273 /*
274 * The MIN_DIRECT_WRITE_SIZE governs how much I/O should be issued before we consider
275 * allowing the caller to bypass the buffer cache. For small I/Os (less than 16k),
276 * we have not historically allowed the write to bypass the UBC.
277 */
278 #define MIN_DIRECT_WRITE_SIZE (16384)
279
280 #define WRITE_THROTTLE 6
281 #define WRITE_THROTTLE_SSD 2
282 #define WRITE_BEHIND 1
283 #define WRITE_BEHIND_SSD 1
284
#if !defined(XNU_TARGET_OS_OSX)
#define PREFETCH        1
#define PREFETCH_SSD    1
uint32_t speculative_prefetch_max = (2048 * 1024);               /* maximum bytes in a speculative read-ahead */
uint32_t speculative_prefetch_max_iosize = (512 * 1024);         /* maximum I/O size to use in a speculative read-ahead */
#else /* XNU_TARGET_OS_OSX */
#define PREFETCH        3
#define PREFETCH_SSD    2
uint32_t speculative_prefetch_max = (MAX_UPL_SIZE_BYTES * 3);    /* maximum bytes in a speculative read-ahead */
uint32_t speculative_prefetch_max_iosize = (512 * 1024);         /* maximum I/O size to use in a speculative read-ahead on SSDs*/
#endif /* ! XNU_TARGET_OS_OSX */

/* maximum bytes for read-ahead */
uint32_t prefetch_max = (1024 * 1024 * 1024);
/* maximum bytes for outstanding reads */
uint32_t overlapping_read_max = (1024 * 1024 * 1024);
/* maximum bytes for outstanding writes */
uint32_t overlapping_write_max = (1024 * 1024 * 1024);
303
304 #define IO_SCALE(vp, base) (vp->v_mount->mnt_ioscale * (base))
305 #define MAX_CLUSTER_SIZE(vp) (cluster_max_io_size(vp->v_mount, CL_WRITE))
306
307 int speculative_reads_disabled = 0;
308
309 /*
310 * throttle the number of async writes that
311 * can be outstanding on a single vnode
312 * before we issue a synchronous write
313 */
314 #define THROTTLE_MAXCNT 0
315
316 uint32_t throttle_max_iosize = (128 * 1024);
317
318 #define THROTTLE_MAX_IOSIZE (throttle_max_iosize)
319
320 SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_max_iosize, CTLFLAG_RW | CTLFLAG_LOCKED, &throttle_max_iosize, 0, "");
321
322
323 void
cluster_init(void)324 cluster_init(void)
325 {
326 for (int i = 0; i < CL_DIRECT_READ_LOCK_BUCKETS; ++i) {
327 LIST_INIT(&cl_direct_read_locks[i]);
328 }
329 }
330
331
332 uint32_t
cluster_max_io_size(mount_t mp,int type)333 cluster_max_io_size(mount_t mp, int type)
334 {
335 uint32_t max_io_size;
336 uint32_t segcnt;
337 uint32_t maxcnt;
338
339 switch (type) {
340 case CL_READ:
341 segcnt = mp->mnt_segreadcnt;
342 maxcnt = mp->mnt_maxreadcnt;
343 break;
344 case CL_WRITE:
345 segcnt = mp->mnt_segwritecnt;
346 maxcnt = mp->mnt_maxwritecnt;
347 break;
348 default:
349 segcnt = min(mp->mnt_segreadcnt, mp->mnt_segwritecnt);
350 maxcnt = min(mp->mnt_maxreadcnt, mp->mnt_maxwritecnt);
351 break;
352 }
353 if (segcnt > (MAX_UPL_SIZE_BYTES >> PAGE_SHIFT)) {
354 /*
355 * don't allow a size beyond the max UPL size we can create
356 */
357 segcnt = MAX_UPL_SIZE_BYTES >> PAGE_SHIFT;
358 }
359 max_io_size = min((segcnt * PAGE_SIZE), maxcnt);
360
361 if (max_io_size < MAX_UPL_TRANSFER_BYTES) {
362 /*
363 * don't allow a size smaller than the old fixed limit
364 */
365 max_io_size = MAX_UPL_TRANSFER_BYTES;
366 } else {
367 /*
368 * make sure the size specified is a multiple of PAGE_SIZE
369 */
370 max_io_size &= ~PAGE_MASK;
371 }
372 return max_io_size;
373 }
374
375 /*
376 * Returns max prefetch value. If the value overflows or exceeds the specified
377 * 'prefetch_limit', it will be capped at 'prefetch_limit' value.
378 */
379 static inline uint32_t
cluster_max_prefetch(vnode_t vp,uint32_t max_io_size,uint32_t prefetch_limit)380 cluster_max_prefetch(vnode_t vp, uint32_t max_io_size, uint32_t prefetch_limit)
381 {
382 bool is_ssd = disk_conditioner_mount_is_ssd(vp->v_mount);
383 uint32_t io_scale = IO_SCALE(vp, is_ssd ? PREFETCH_SSD : PREFETCH);
384 uint32_t prefetch = 0;
385
386 if (__improbable(os_mul_overflow(max_io_size, io_scale, &prefetch) ||
387 (prefetch > prefetch_limit))) {
388 prefetch = prefetch_limit;
389 }
390
391 return prefetch;
392 }
393
394 static inline uint32_t
calculate_max_throttle_size(vnode_t vp)395 calculate_max_throttle_size(vnode_t vp)
396 {
397 bool is_ssd = disk_conditioner_mount_is_ssd(vp->v_mount);
398 uint32_t io_scale = IO_SCALE(vp, is_ssd ? 2 : 1);
399
400 return MIN(io_scale * THROTTLE_MAX_IOSIZE, MAX_UPL_TRANSFER_BYTES);
401 }
402
403 static inline uint32_t
calculate_max_throttle_cnt(vnode_t vp)404 calculate_max_throttle_cnt(vnode_t vp)
405 {
406 bool is_ssd = disk_conditioner_mount_is_ssd(vp->v_mount);
407 uint32_t io_scale = IO_SCALE(vp, 1);
408
409 return is_ssd ? MIN(io_scale, 4) : THROTTLE_MAXCNT;
410 }
411
412 #define CLW_ALLOCATE 0x01
413 #define CLW_RETURNLOCKED 0x02
414 #define CLW_IONOCACHE 0x04
415 #define CLW_IOPASSIVE 0x08
416
417 /*
418 * if the read ahead context doesn't yet exist,
419 * allocate and initialize it...
420 * the vnode lock serializes multiple callers
421 * during the actual assignment... first one
422 * to grab the lock wins... the other callers
423 * will release the now unnecessary storage
424 *
425 * once the context is present, try to grab (but don't block on)
426 * the lock associated with it... if someone
427 * else currently owns it, than the read
428 * will run without read-ahead. this allows
429 * multiple readers to run in parallel and
430 * since there's only 1 read ahead context,
431 * there's no real loss in only allowing 1
432 * reader to have read-ahead enabled.
433 */
434 static struct cl_readahead *
cluster_get_rap(vnode_t vp)435 cluster_get_rap(vnode_t vp)
436 {
437 struct ubc_info *ubc;
438 struct cl_readahead *rap;
439
440 ubc = vp->v_ubcinfo;
441
442 if ((rap = ubc->cl_rahead) == NULL) {
443 rap = zalloc_flags(cl_rd_zone, Z_WAITOK | Z_ZERO);
444 rap->cl_lastr = -1;
445 lck_mtx_init(&rap->cl_lockr, &cl_mtx_grp, LCK_ATTR_NULL);
446
447 vnode_lock(vp);
448
449 if (ubc->cl_rahead == NULL) {
450 ubc->cl_rahead = rap;
451 } else {
452 lck_mtx_destroy(&rap->cl_lockr, &cl_mtx_grp);
453 zfree(cl_rd_zone, rap);
454 rap = ubc->cl_rahead;
455 }
456 vnode_unlock(vp);
457 }
458 if (lck_mtx_try_lock(&rap->cl_lockr) == TRUE) {
459 return rap;
460 }
461
462 return (struct cl_readahead *)NULL;
463 }
464
465
466 /*
467 * if the write behind context doesn't yet exist,
468 * and CLW_ALLOCATE is specified, allocate and initialize it...
469 * the vnode lock serializes multiple callers
470 * during the actual assignment... first one
471 * to grab the lock wins... the other callers
472 * will release the now unnecessary storage
473 *
474 * if CLW_RETURNLOCKED is set, grab (blocking if necessary)
475 * the lock associated with the write behind context before
476 * returning
477 */
478
479 static struct cl_writebehind *
cluster_get_wbp(vnode_t vp,int flags)480 cluster_get_wbp(vnode_t vp, int flags)
481 {
482 struct ubc_info *ubc;
483 struct cl_writebehind *wbp;
484
485 ubc = vp->v_ubcinfo;
486
487 if ((wbp = ubc->cl_wbehind) == NULL) {
488 if (!(flags & CLW_ALLOCATE)) {
489 return (struct cl_writebehind *)NULL;
490 }
491
492 wbp = zalloc_flags(cl_wr_zone, Z_WAITOK | Z_ZERO);
493
494 lck_mtx_init(&wbp->cl_lockw, &cl_mtx_grp, LCK_ATTR_NULL);
495
496 vnode_lock(vp);
497
498 if (ubc->cl_wbehind == NULL) {
499 ubc->cl_wbehind = wbp;
500 } else {
501 lck_mtx_destroy(&wbp->cl_lockw, &cl_mtx_grp);
502 zfree(cl_wr_zone, wbp);
503 wbp = ubc->cl_wbehind;
504 }
505 vnode_unlock(vp);
506 }
507 if (flags & CLW_RETURNLOCKED) {
508 lck_mtx_lock(&wbp->cl_lockw);
509 }
510
511 return wbp;
512 }
513
514
515 static void
cluster_syncup(vnode_t vp,off_t newEOF,int (* callback)(buf_t,void *),void * callback_arg,int flags)516 cluster_syncup(vnode_t vp, off_t newEOF, int (*callback)(buf_t, void *), void *callback_arg, int flags)
517 {
518 struct cl_writebehind *wbp;
519
520 if ((wbp = cluster_get_wbp(vp, 0)) != NULL) {
521 if (wbp->cl_number) {
522 lck_mtx_lock(&wbp->cl_lockw);
523
524 cluster_try_push(wbp, vp, newEOF, PUSH_ALL | flags, 0, callback, callback_arg, NULL, FALSE);
525
526 lck_mtx_unlock(&wbp->cl_lockw);
527 }
528 }
529 }
530
531
532 static int
cluster_io_present_in_BC(vnode_t vp,off_t f_offset)533 cluster_io_present_in_BC(vnode_t vp, off_t f_offset)
534 {
535 daddr64_t blkno;
536 size_t io_size;
537 int (*bootcache_check_fn)(dev_t device, u_int64_t blkno) = bootcache_contains_block;
538
539 if (bootcache_check_fn && vp->v_mount && vp->v_mount->mnt_devvp) {
540 if (VNOP_BLOCKMAP(vp, f_offset, PAGE_SIZE, &blkno, &io_size, NULL, VNODE_READ | VNODE_BLOCKMAP_NO_TRACK, NULL)) {
541 return 0;
542 }
543
544 if (io_size == 0) {
545 return 0;
546 }
547
548 if (bootcache_check_fn(vp->v_mount->mnt_devvp->v_rdev, blkno)) {
549 return 1;
550 }
551 }
552 return 0;
553 }
554
555
556 static int
cluster_is_throttled(vnode_t vp)557 cluster_is_throttled(vnode_t vp)
558 {
559 return throttle_io_will_be_throttled(-1, vp->v_mount);
560 }
561
562
563 static void
cluster_iostate_wait(struct clios * iostate,u_int target,const char * wait_name)564 cluster_iostate_wait(struct clios *iostate, u_int target, const char *wait_name)
565 {
566 lck_mtx_lock(&iostate->io_mtxp);
567
568 while ((iostate->io_issued - iostate->io_completed) > target) {
569 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_START,
570 iostate->io_issued, iostate->io_completed, target, 0, 0);
571
572 iostate->io_wanted = 1;
573 msleep((caddr_t)&iostate->io_wanted, &iostate->io_mtxp, PRIBIO + 1, wait_name, NULL);
574
575 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_END,
576 iostate->io_issued, iostate->io_completed, target, 0, 0);
577 }
578 lck_mtx_unlock(&iostate->io_mtxp);
579 }
580
581
/*
 * Release (abort + dump) the portion of the "associated" page-cache UPL
 * that corresponds to a completed transaction of a direct/uncached write.
 *
 * 'upl' is the transaction's UPL (built from the user buffer),
 * 'upl_offset'/'size' delimit the completed transaction within it, and
 * 'f_offset' is the matching file offset. The associated UPL (created in
 * cluster_io) covers the same file range but is page-aligned to f_offset,
 * so when the two UPLs are not mutually aligned a page of the associated
 * UPL can be shared by two adjacent transactions; iostate->io_mtxp guards
 * the per-page "mark" used to hand such a page to whichever transaction
 * completes second.
 */
static void
cluster_handle_associated_upl(struct clios *iostate, upl_t upl,
    upl_offset_t upl_offset, upl_size_t size, off_t f_offset)
{
	if (!size) {
		return;
	}

	upl_t associated_upl = upl_associated_upl(upl);

	if (!associated_upl) {
		return;
	}

	upl_offset_t upl_end = upl_offset + size;
	upl_size_t upl_size = vector_upl_get_size(upl);
	upl_offset_t assoc_upl_offset, assoc_upl_end;
	upl_size_t assoc_upl_size = upl_get_size(associated_upl);

	/* knock off the simple case first -> this transaction covers the entire UPL */
	if ((trunc_page_32(upl_offset) == 0) && (round_page_32(upl_end) == upl_size)) {
		assoc_upl_offset = 0;
		assoc_upl_end = assoc_upl_size;
		goto do_commit;
	} else if ((upl_offset & PAGE_MASK) == (f_offset & PAGE_MASK)) { /* or the UPL's are actually aligned */
		assoc_upl_offset = trunc_page_32(upl_offset);
		assoc_upl_end = round_page_32(upl_offset + size);
		goto do_commit;
	}

	/*
	 * ( See also cluster_io where the associated upl is created )
	 * While we create the upl in one go, we will be dumping the pages in
	 * the upl in "transaction sized chunks" relative to the upl. Except
	 * for the first transaction, the upl_offset will always be page aligned.
	 * and when the upl's are not aligned the associated upl offset will not
	 * be page aligned and so we have to truncate and round up the starting
	 * and the end of the pages in question and see if they are shared with
	 * other transactions or not. If two transactions "share" a page in the
	 * associated upl, the first one to complete "marks" it and skips that
	 * page and the second one will include it in the "commit range"
	 *
	 * As an example, consider the case where 4 transactions are needed (this
	 * is the worst case).
	 *
	 * Transaction for 0-1 (size -> PAGE_SIZE - upl_offset)
	 *
	 * This covers the associated upl from a -> c. a->b is not shared but
	 * b-c is shared with the next transaction so the first one to complete
	 * will only "mark" it.
	 *
	 * Transaction for 1-2 (size -> PAGE_SIZE)
	 *
	 * For transaction 1, assoc_upl_offset would be 0 (corresponding to the
	 * file offset a) and assoc_upl_end would correspond to the file offset c
	 *
	 * (associated_upl - based on f_offset alignment)
	 * 0    a    b    c    d    e     f
	 * <----|----|----|----|----|----|-----|---->
	 *
	 *
	 * (upl - based on user buffer address alignment)
	 *      <__--|----|----|--__>
	 *
	 *      0    1    2    3
	 *
	 * Here the cached upl strictly only
	 * needs to be from b to e (3 pages). However,
	 * we also need to be able the offset
	 * for the associated upl from the f_offset
	 * and upl_offset and so we round down the
	 * cached upl f_offset corresponding to the
	 * upl_offset 0 i.e. we end up locking one
	 * more page than is strictly required. We don't
	 * do the same thing at the end.
	 *
	 * (upl)
	 * <___-|----|---_>
	 *
	 *  0    1    2
	 *
	 * The f_offset < upl_offset condition caters
	 * to writes in the first page of the file
	 * with the upl alignment being as above.
	 * where the subtracting the upl_offset would
	 * would result in the offset going negative
	 *
	 */
	if (f_offset > upl_offset) {
		assoc_upl_offset = (upl_offset_t)(f_offset - trunc_page_64(f_offset - upl_offset));
		assoc_upl_end = assoc_upl_offset + size;
		/*
		 * When upl_offset < PAGE_SIZE (i.e. the first transaction),
		 * the corresponding first page is further back than simply
		 * trunc_page_32(assoc_upl_size). upl_offset is page aligned
		 * for every transaction other than the first one which means
		 * upl_offset & PAGE_MASK will be 0 for every transaction other
		 * than the first one.
		 */
		assoc_upl_offset = trunc_page_32(assoc_upl_offset - (upl_offset & PAGE_MASK));
	} else {
		/* write within the first page of the file (see diagram above) */
		assoc_upl_offset = f_offset;
		assoc_upl_end = assoc_upl_offset + size;
		assoc_upl_offset = trunc_page_32(assoc_upl_offset);
	}
	assoc_upl_end = round_page_32(assoc_upl_end);

	assert(assoc_upl_end <= assoc_upl_size);

	upl_page_info_t *assoc_pl = UPL_GET_INTERNAL_PAGE_LIST(associated_upl);

	/* the page "marks" below are shared with the neighboring transactions */
	lck_mtx_lock_spin(&iostate->io_mtxp);

	int first_pg = assoc_upl_offset >> PAGE_SHIFT;
	if (!(trunc_page_32(upl_offset) == 0) && !upl_page_get_mark(assoc_pl, first_pg)) {
		/*
		 * The first page isn't marked so mark it and let another
		 * transaction completion handle it.
		 */
		upl_page_set_mark(assoc_pl, first_pg, true);
		assoc_upl_offset += PAGE_SIZE;
	}

	int last_pg = trunc_page_32(assoc_upl_end - 1) >> PAGE_SHIFT;
	if (!(round_page_32(upl_end) == upl_size) && !upl_page_get_mark(assoc_pl, last_pg)) {
		/*
		 * The last page isn't marked so mark the page and let another
		 * transaction completion handle it.
		 */
		upl_page_set_mark(assoc_pl, last_pg, true);
		assoc_upl_end -= PAGE_SIZE;
	}

	lck_mtx_unlock(&iostate->io_mtxp);

	/* both boundary pages were handed off -> nothing left for us to release */
	if (assoc_upl_end <= assoc_upl_offset) {
		return;
	}

do_commit:
	size = assoc_upl_end - assoc_upl_offset;

	boolean_t empty;

	/*
	 * We can unlock these pages now and as this is for a
	 * direct/uncached write, we want to dump the pages too.
	 */
	kern_return_t kr = upl_abort_range(associated_upl, assoc_upl_offset, size,
	    UPL_ABORT_DUMP_PAGES, &empty);

	assert(!kr);

	if (!kr && empty) {
		/* last range released: break the association and free the UPL */
		upl_set_associated_upl(upl, NULL);
		upl_deallocate(associated_upl);
	}
}
740
741 static int
cluster_ioerror(upl_t upl,int upl_offset,int abort_size,int error,int io_flags,vnode_t vp)742 cluster_ioerror(upl_t upl, int upl_offset, int abort_size, int error, int io_flags, vnode_t vp)
743 {
744 int upl_abort_code = 0;
745 int page_in = 0;
746 int page_out = 0;
747
748 if ((io_flags & (B_PHYS | B_CACHE)) == (B_PHYS | B_CACHE)) {
749 /*
750 * direct write of any flavor, or a direct read that wasn't aligned
751 */
752 ubc_upl_commit_range(upl, upl_offset, abort_size, UPL_COMMIT_FREE_ON_EMPTY);
753 } else {
754 if (io_flags & B_PAGEIO) {
755 if (io_flags & B_READ) {
756 page_in = 1;
757 } else {
758 page_out = 1;
759 }
760 }
761 if (io_flags & B_CACHE) {
762 /*
763 * leave pages in the cache unchanged on error
764 */
765 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
766 } else if (((io_flags & B_READ) == 0) && ((error != ENXIO) || vnode_isswap(vp))) {
767 /*
768 * transient error on pageout/write path... leave pages unchanged
769 */
770 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
771 } else if (page_in) {
772 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
773 } else {
774 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;
775 }
776
777 ubc_upl_abort_range(upl, upl_offset, abort_size, upl_abort_code);
778 }
779 return upl_abort_code;
780 }
781
782
783 static int
cluster_iodone(buf_t bp,void * callback_arg)784 cluster_iodone(buf_t bp, void *callback_arg)
785 {
786 int b_flags;
787 int error;
788 int total_size;
789 int total_resid;
790 int upl_offset;
791 int zero_offset;
792 int pg_offset = 0;
793 int commit_size = 0;
794 int upl_flags = 0;
795 int transaction_size = 0;
796 upl_t upl;
797 buf_t cbp;
798 buf_t cbp_head;
799 buf_t cbp_next;
800 buf_t real_bp;
801 vnode_t vp;
802 struct clios *iostate;
803 void *verify_ctx;
804 boolean_t transaction_complete = FALSE;
805
806 __IGNORE_WCASTALIGN(cbp_head = (buf_t)(bp->b_trans_head));
807
808 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_START,
809 cbp_head, bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);
810
811 if (cbp_head->b_trans_next || !(cbp_head->b_flags & B_EOT)) {
812 lck_mtx_lock_spin(&cl_transaction_mtxp);
813
814 bp->b_flags |= B_TDONE;
815
816 for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) {
817 /*
818 * all I/O requests that are part of this transaction
819 * have to complete before we can process it
820 */
821 if (!(cbp->b_flags & B_TDONE)) {
822 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
823 cbp_head, cbp, cbp->b_bcount, cbp->b_flags, 0);
824
825 lck_mtx_unlock(&cl_transaction_mtxp);
826
827 return 0;
828 }
829
830 if (cbp->b_trans_next == CLUSTER_IO_WAITING) {
831 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
832 cbp_head, cbp, cbp->b_bcount, cbp->b_flags, 0);
833
834 lck_mtx_unlock(&cl_transaction_mtxp);
835 wakeup(cbp);
836
837 return 0;
838 }
839
840 if (cbp->b_flags & B_EOT) {
841 transaction_complete = TRUE;
842 }
843 }
844 lck_mtx_unlock(&cl_transaction_mtxp);
845
846 if (transaction_complete == FALSE) {
847 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
848 cbp_head, 0, 0, 0, 0);
849 return 0;
850 }
851 }
852 error = 0;
853 total_size = 0;
854 total_resid = 0;
855
856 cbp = cbp_head;
857 vp = cbp->b_vp;
858 upl_offset = cbp->b_uploffset;
859 upl = cbp->b_upl;
860 b_flags = cbp->b_flags;
861 real_bp = cbp->b_real_bp;
862 zero_offset = cbp->b_validend;
863 iostate = (struct clios *)cbp->b_iostate;
864
865 if (real_bp) {
866 real_bp->b_dev = cbp->b_dev;
867 }
868
869 while (cbp) {
870 if ((cbp->b_flags & B_ERROR) && error == 0) {
871 error = cbp->b_error;
872 }
873
874 total_resid += cbp->b_resid;
875 total_size += cbp->b_bcount;
876
877 cbp_next = cbp->b_trans_next;
878
879 if (cbp_next == NULL) {
880 /*
881 * compute the overall size of the transaction
882 * in case we created one that has 'holes' in it
883 * 'total_size' represents the amount of I/O we
884 * did, not the span of the transaction w/r to the UPL
885 */
886 transaction_size = cbp->b_uploffset + cbp->b_bcount - upl_offset;
887 }
888
889 if (cbp != cbp_head) {
890 free_io_buf(cbp);
891 }
892
893 cbp = cbp_next;
894 }
895
896 if (ISSET(b_flags, B_COMMIT_UPL)) {
897 cluster_handle_associated_upl(iostate,
898 cbp_head->b_upl,
899 upl_offset,
900 transaction_size,
901 cbp_head->b_lblkno * cbp_head->b_lblksize);
902 }
903
904 if (error == 0 && total_resid) {
905 error = EIO;
906 }
907
908 if (error == 0) {
909 int (*cliodone_func)(buf_t, void *) = (int (*)(buf_t, void *))(cbp_head->b_cliodone);
910
911 if (cliodone_func != NULL) {
912 cbp_head->b_bcount = transaction_size;
913
914 error = (*cliodone_func)(cbp_head, callback_arg);
915 }
916 }
917 if (zero_offset) {
918 cluster_zero(upl, zero_offset, PAGE_SIZE - (zero_offset & PAGE_MASK), real_bp);
919 }
920
921 verify_ctx = cbp_head->b_attr.ba_verify_ctx;
922 cbp_head->b_attr.ba_verify_ctx = NULL;
923 if (verify_ctx) {
924 vnode_verify_flags_t verify_flags = VNODE_VERIFY_CONTEXT_FREE;
925 caddr_t verify_buf = NULL;
926 off_t start_off = cbp_head->b_lblkno * cbp_head->b_lblksize;
927 size_t verify_length = transaction_size;
928 vm_offset_t vaddr;
929
930 if (!error) {
931 verify_flags |= VNODE_VERIFY_WITH_CONTEXT;
932 error = ubc_upl_map_range(upl, upl_offset, round_page(transaction_size), VM_PROT_DEFAULT, &vaddr); /* Map it in */
933 if (error) {
934 panic("ubc_upl_map_range returned error %d, upl = %p, upl_offset = %d, size = %d",
935 error, upl, (int)upl_offset, (int)round_page(transaction_size));
936 } else {
937 verify_buf = (caddr_t)vaddr;
938 }
939 }
940
941 error = VNOP_VERIFY(vp, start_off, (uint8_t *)verify_buf, verify_length, 0, &verify_ctx, verify_flags, NULL);
942
943 if (verify_buf) {
944 (void)ubc_upl_unmap_range(upl, upl_offset, round_page(transaction_size));
945 verify_buf = NULL;
946 }
947 } else if (cbp_head->b_attr.ba_flags & BA_WILL_VERIFY) {
948 error = EBADMSG;
949 }
950
951 free_io_buf(cbp_head);
952
953 if (iostate) {
954 int need_wakeup = 0;
955
956 /*
957 * someone has issued multiple I/Os asynchrounsly
958 * and is waiting for them to complete (streaming)
959 */
960 lck_mtx_lock_spin(&iostate->io_mtxp);
961
962 if (error && iostate->io_error == 0) {
963 iostate->io_error = error;
964 }
965
966 iostate->io_completed += total_size;
967
968 if (iostate->io_wanted) {
969 /*
970 * someone is waiting for the state of
971 * this io stream to change
972 */
973 iostate->io_wanted = 0;
974 need_wakeup = 1;
975 }
976 lck_mtx_unlock(&iostate->io_mtxp);
977
978 if (need_wakeup) {
979 wakeup((caddr_t)&iostate->io_wanted);
980 }
981 }
982
983 if (b_flags & B_COMMIT_UPL) {
984 pg_offset = upl_offset & PAGE_MASK;
985 commit_size = (pg_offset + transaction_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
986
987 if (error) {
988 upl_set_iodone_error(upl, error);
989
990 upl_flags = cluster_ioerror(upl, upl_offset - pg_offset, commit_size, error, b_flags, vp);
991 } else {
992 upl_flags = UPL_COMMIT_FREE_ON_EMPTY;
993
994 if ((b_flags & B_PHYS) && (b_flags & B_READ)) {
995 upl_flags |= UPL_COMMIT_SET_DIRTY;
996 }
997
998 if (b_flags & B_AGE) {
999 upl_flags |= UPL_COMMIT_INACTIVATE;
1000 }
1001
1002 ubc_upl_commit_range(upl, upl_offset - pg_offset, commit_size, upl_flags);
1003 }
1004 }
1005 if (real_bp) {
1006 if (error) {
1007 real_bp->b_flags |= B_ERROR;
1008 real_bp->b_error = error;
1009 }
1010 real_bp->b_resid = total_resid;
1011
1012 buf_biodone(real_bp);
1013 }
1014 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
1015 upl, upl_offset - pg_offset, commit_size, (error << 24) | upl_flags, 0);
1016
1017 return error;
1018 }
1019
1020
1021 uint32_t
cluster_throttle_io_limit(vnode_t vp,uint32_t * limit)1022 cluster_throttle_io_limit(vnode_t vp, uint32_t *limit)
1023 {
1024 if (cluster_is_throttled(vp)) {
1025 *limit = calculate_max_throttle_size(vp);
1026 return 1;
1027 }
1028 return 0;
1029 }
1030
1031
1032 void
cluster_zero(upl_t upl,upl_offset_t upl_offset,int size,buf_t bp)1033 cluster_zero(upl_t upl, upl_offset_t upl_offset, int size, buf_t bp)
1034 {
1035 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_START,
1036 upl_offset, size, bp, 0, 0);
1037
1038 if (bp == NULL || bp->b_datap == 0) {
1039 upl_page_info_t *pl;
1040 addr64_t zero_addr;
1041
1042 pl = ubc_upl_pageinfo(upl);
1043
1044 if (upl_device_page(pl) == TRUE) {
1045 zero_addr = ((addr64_t)upl_phys_page(pl, 0) << PAGE_SHIFT) + upl_offset;
1046
1047 bzero_phys_nc(zero_addr, size);
1048 } else {
1049 while (size) {
1050 int page_offset;
1051 int page_index;
1052 int zero_cnt;
1053
1054 page_index = upl_offset / PAGE_SIZE;
1055 page_offset = upl_offset & PAGE_MASK;
1056
1057 zero_addr = ((addr64_t)upl_phys_page(pl, page_index) << PAGE_SHIFT) + page_offset;
1058 zero_cnt = min(PAGE_SIZE - page_offset, size);
1059
1060 bzero_phys(zero_addr, zero_cnt);
1061
1062 size -= zero_cnt;
1063 upl_offset += zero_cnt;
1064 }
1065 }
1066 } else {
1067 bzero((caddr_t)((vm_offset_t)bp->b_datap + upl_offset), size);
1068 }
1069
1070 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_END,
1071 upl_offset, size, 0, 0, 0);
1072 }
1073
1074
1075 static void
cluster_EOT(buf_t cbp_head,buf_t cbp_tail,int zero_offset,size_t verify_block_size)1076 cluster_EOT(buf_t cbp_head, buf_t cbp_tail, int zero_offset, size_t verify_block_size)
1077 {
1078 /*
1079 * We will assign a verification context to cbp_head.
1080 * This will be passed back to the filesystem when
1081 * verifying (in cluster_iodone).
1082 */
1083 if (verify_block_size) {
1084 off_t start_off = cbp_head->b_lblkno * cbp_head->b_lblksize;
1085 size_t length;
1086 void *verify_ctx = NULL;
1087 int error = 0;
1088 vnode_t vp = buf_vnode(cbp_head);
1089
1090 if (cbp_head == cbp_tail) {
1091 length = cbp_head->b_bcount;
1092 } else {
1093 length = ((cbp_tail->b_lblkno * cbp_tail->b_lblksize) + cbp_tail->b_bcount) - start_off;
1094 }
1095
1096 /*
1097 * zero_offset is non zero for the transaction containing the EOF
1098 * (if the filesize is not page aligned). In that case we might
1099 * have the transaction size not be page/verify block size aligned
1100 */
1101 if ((zero_offset == 0) &&
1102 ((length < verify_block_size) || (length % verify_block_size)) != 0) {
1103 panic("%s length = %zu, verify_block_size = %zu",
1104 __FUNCTION__, length, verify_block_size);
1105 }
1106
1107 error = VNOP_VERIFY(vp, start_off, NULL, length,
1108 &verify_block_size, &verify_ctx, VNODE_VERIFY_CONTEXT_ALLOC, NULL);
1109
1110 cbp_head->b_attr.ba_verify_ctx = verify_ctx;
1111 } else {
1112 cbp_head->b_attr.ba_verify_ctx = NULL;
1113 }
1114
1115 cbp_head->b_validend = zero_offset;
1116 cbp_tail->b_flags |= B_EOT;
1117 }
1118
/*
 * Wait for every buffer in the transaction chain headed by cbp_head to
 * complete.
 *
 * For a synchronous chain this is just buf_biowait() on each buffer.
 * For an async chain the completion callback does not normally wake
 * anyone, so we advertise that we are waiting by overwriting the tail
 * buffer's b_trans_next with the sentinel CLUSTER_IO_WAITING and sleep
 * until cluster_iodone sees the sentinel and issues the wakeup.  The
 * sentinel handshake is serialized by cl_transaction_mtxp, which
 * cluster_iodone also takes.
 */
static void
cluster_wait_IO(buf_t cbp_head, int async)
{
	buf_t cbp;

	if (async) {
		/*
		 * Async callback completion will not normally generate a
		 * wakeup upon I/O completion. To get woken up, we set
		 * b_trans_next (which is safe for us to modify) on the last
		 * buffer to CLUSTER_IO_WAITING so that cluster_iodone knows
		 * to wake us up when all buffers as part of this transaction
		 * are completed. This is done under the umbrella of
		 * cl_transaction_mtxp which is also taken in cluster_iodone.
		 */
		bool done = true;
		buf_t last = NULL;

		lck_mtx_lock_spin(&cl_transaction_mtxp);

		/* scan the whole chain, remembering the tail in 'last' */
		for (cbp = cbp_head; cbp; last = cbp, cbp = cbp->b_trans_next) {
			if (!ISSET(cbp->b_flags, B_TDONE)) {
				done = false;
			}
		}

		if (!done) {
			/* tag the tail so cluster_iodone knows to wake us */
			last->b_trans_next = CLUSTER_IO_WAITING;

			DTRACE_IO1(wait__start, buf_t, last);
			do {
				/* msleep drops and re-takes cl_transaction_mtxp */
				msleep(last, &cl_transaction_mtxp, PSPIN | (PRIBIO + 1), "cluster_wait_IO", NULL);

				/*
				 * We should only have been woken up if all the
				 * buffers are completed, but just in case...
				 */
				done = true;
				/* the tail's b_trans_next is now the sentinel, so stop there */
				for (cbp = cbp_head; cbp != CLUSTER_IO_WAITING; cbp = cbp->b_trans_next) {
					if (!ISSET(cbp->b_flags, B_TDONE)) {
						done = false;
						break;
					}
				}
			} while (!done);
			DTRACE_IO1(wait__done, buf_t, last);

			/* restore the chain termination we overwrote */
			last->b_trans_next = NULL;
		}

		lck_mtx_unlock(&cl_transaction_mtxp);
	} else { // !async
		for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) {
			buf_biowait(cbp);
		}
	}
}
1176
1177 static void
cluster_complete_transaction(buf_t * cbp_head,void * callback_arg,int * retval,int flags,int needwait)1178 cluster_complete_transaction(buf_t *cbp_head, void *callback_arg, int *retval, int flags, int needwait)
1179 {
1180 buf_t cbp;
1181 int error;
1182 boolean_t isswapout = FALSE;
1183
1184 /*
1185 * cluster_complete_transaction will
1186 * only be called if we've issued a complete chain in synchronous mode
1187 * or, we've already done a cluster_wait_IO on an incomplete chain
1188 */
1189 if (needwait) {
1190 for (cbp = *cbp_head; cbp; cbp = cbp->b_trans_next) {
1191 buf_biowait(cbp);
1192 }
1193 }
1194 /*
1195 * we've already waited on all of the I/Os in this transaction,
1196 * so mark all of the buf_t's in this transaction as B_TDONE
1197 * so that cluster_iodone sees the transaction as completed
1198 */
1199 for (cbp = *cbp_head; cbp; cbp = cbp->b_trans_next) {
1200 cbp->b_flags |= B_TDONE;
1201 }
1202 cbp = *cbp_head;
1203
1204 if ((flags & (CL_ASYNC | CL_PAGEOUT)) == CL_PAGEOUT && vnode_isswap(cbp->b_vp)) {
1205 isswapout = TRUE;
1206 }
1207
1208 error = cluster_iodone(cbp, callback_arg);
1209
1210 if (!(flags & CL_ASYNC) && error && *retval == 0) {
1211 if (((flags & (CL_PAGEOUT | CL_KEEPCACHED)) != CL_PAGEOUT) || (error != ENXIO)) {
1212 *retval = error;
1213 } else if (isswapout == TRUE) {
1214 *retval = error;
1215 }
1216 }
1217 *cbp_head = (buf_t)NULL;
1218 }
1219
1220
1221 static int
cluster_io(vnode_t vp,upl_t upl,vm_offset_t upl_offset,off_t f_offset,int non_rounded_size,int flags,buf_t real_bp,struct clios * iostate,int (* callback)(buf_t,void *),void * callback_arg)1222 cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int non_rounded_size,
1223 int flags, buf_t real_bp, struct clios *iostate, int (*callback)(buf_t, void *), void *callback_arg)
1224 {
1225 buf_t cbp;
1226 u_int size;
1227 u_int io_size;
1228 int io_flags;
1229 int bmap_flags;
1230 int error = 0;
1231 int retval = 0;
1232 buf_t cbp_head = NULL;
1233 buf_t cbp_tail = NULL;
1234 int trans_count = 0;
1235 int max_trans_count;
1236 u_int pg_count;
1237 int pg_offset;
1238 u_int max_iosize;
1239 u_int max_vectors;
1240 int priv;
1241 int zero_offset = 0;
1242 int async_throttle = 0;
1243 mount_t mp;
1244 vm_offset_t upl_end_offset;
1245 boolean_t need_EOT = FALSE;
1246 size_t verify_block_size = 0;
1247
1248 /*
1249 * we currently don't support buffers larger than a page
1250 */
1251 if (real_bp && non_rounded_size > PAGE_SIZE) {
1252 panic("%s(): Called with real buffer of size %d bytes which "
1253 "is greater than the maximum allowed size of "
1254 "%d bytes (the system PAGE_SIZE).\n",
1255 __FUNCTION__, non_rounded_size, PAGE_SIZE);
1256 }
1257
1258 mp = vp->v_mount;
1259
1260 /*
1261 * we don't want to do any funny rounding of the size for IO requests
1262 * coming through the DIRECT or CONTIGUOUS paths... those pages don't
1263 * belong to us... we can't extend (nor do we need to) the I/O to fill
1264 * out a page
1265 */
1266 if (mp->mnt_devblocksize > 1 && !(flags & (CL_DEV_MEMORY | CL_DIRECT_IO))) {
1267 /*
1268 * round the requested size up so that this I/O ends on a
1269 * page boundary in case this is a 'write'... if the filesystem
1270 * has blocks allocated to back the page beyond the EOF, we want to
1271 * make sure to write out the zero's that are sitting beyond the EOF
1272 * so that in case the filesystem doesn't explicitly zero this area
1273 * if a hole is created via a lseek/write beyond the current EOF,
1274 * it will return zeros when it's read back from the disk. If the
1275 * physical allocation doesn't extend for the whole page, we'll
1276 * only write/read from the disk up to the end of this allocation
1277 * via the extent info returned from the VNOP_BLOCKMAP call.
1278 */
1279 pg_offset = upl_offset & PAGE_MASK;
1280
1281 size = (((non_rounded_size + pg_offset) + (PAGE_SIZE - 1)) & ~PAGE_MASK) - pg_offset;
1282 } else {
1283 /*
1284 * anyone advertising a blocksize of 1 byte probably
1285 * can't deal with us rounding up the request size
1286 * AFP is one such filesystem/device
1287 */
1288 size = non_rounded_size;
1289 }
1290 upl_end_offset = upl_offset + size;
1291
1292 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_START, (int)f_offset, size, upl_offset, flags, 0);
1293
1294 /*
1295 * Set the maximum transaction size to the maximum desired number of
1296 * buffers.
1297 */
1298 max_trans_count = 8;
1299 if (flags & CL_DEV_MEMORY) {
1300 max_trans_count = 16;
1301 }
1302
1303 if (flags & CL_READ) {
1304 io_flags = B_READ;
1305 bmap_flags = VNODE_READ;
1306
1307 max_iosize = mp->mnt_maxreadcnt;
1308 max_vectors = mp->mnt_segreadcnt;
1309
1310 if ((flags & CL_PAGEIN) && /* Cluster layer verification will be limited to pagein for now */
1311 !(mp->mnt_kern_flag & MNTK_VIRTUALDEV) &&
1312 (VNOP_VERIFY(vp, f_offset, NULL, 0, &verify_block_size, NULL, VNODE_VERIFY_DEFAULT, NULL) == 0) &&
1313 verify_block_size) {
1314 if (verify_block_size != PAGE_SIZE) {
1315 verify_block_size = 0;
1316 }
1317 if (real_bp && verify_block_size) {
1318 panic("%s(): Called with real buffer and needs verification ",
1319 __FUNCTION__);
1320 }
1321 }
1322 } else {
1323 io_flags = B_WRITE;
1324 bmap_flags = VNODE_WRITE;
1325
1326 max_iosize = mp->mnt_maxwritecnt;
1327 max_vectors = mp->mnt_segwritecnt;
1328 }
1329 if (verify_block_size) {
1330 bmap_flags |= VNODE_CLUSTER_VERIFY;
1331 }
1332 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_NONE, max_iosize, max_vectors, mp->mnt_devblocksize, 0, 0);
1333
1334 /*
1335 * make sure the maximum iosize is a
1336 * multiple of the page size
1337 */
1338 max_iosize &= ~PAGE_MASK;
1339
1340 /*
1341 * Ensure the maximum iosize is sensible.
1342 */
1343 if (!max_iosize) {
1344 max_iosize = PAGE_SIZE;
1345 }
1346
1347 if (flags & CL_THROTTLE) {
1348 if (!(flags & CL_PAGEOUT) && cluster_is_throttled(vp)) {
1349 uint32_t max_throttle_size = calculate_max_throttle_size(vp);
1350
1351 if (max_iosize > max_throttle_size) {
1352 max_iosize = max_throttle_size;
1353 }
1354 async_throttle = calculate_max_throttle_cnt(vp);
1355 } else {
1356 if ((flags & CL_DEV_MEMORY)) {
1357 async_throttle = IO_SCALE(vp, VNODE_ASYNC_THROTTLE);
1358 } else {
1359 u_int max_cluster;
1360 u_int max_cluster_size;
1361 u_int scale;
1362
1363 if (vp->v_mount->mnt_minsaturationbytecount) {
1364 max_cluster_size = vp->v_mount->mnt_minsaturationbytecount;
1365
1366 scale = 1;
1367 } else {
1368 max_cluster_size = MAX_CLUSTER_SIZE(vp);
1369
1370 if (disk_conditioner_mount_is_ssd(vp->v_mount)) {
1371 scale = WRITE_THROTTLE_SSD;
1372 } else {
1373 scale = WRITE_THROTTLE;
1374 }
1375 }
1376 if (max_iosize > max_cluster_size) {
1377 max_cluster = max_cluster_size;
1378 } else {
1379 max_cluster = max_iosize;
1380 }
1381
1382 if (size < max_cluster) {
1383 max_cluster = size;
1384 }
1385
1386 if (flags & CL_CLOSE) {
1387 scale += MAX_CLUSTERS;
1388 }
1389
1390 async_throttle = min(IO_SCALE(vp, VNODE_ASYNC_THROTTLE), ((scale * max_cluster_size) / max_cluster) - 1);
1391 }
1392 }
1393 }
1394 if (flags & CL_AGE) {
1395 io_flags |= B_AGE;
1396 }
1397 if (flags & (CL_PAGEIN | CL_PAGEOUT)) {
1398 io_flags |= B_PAGEIO;
1399 }
1400 if (flags & (CL_IOSTREAMING)) {
1401 io_flags |= B_IOSTREAMING;
1402 }
1403 if (flags & CL_COMMIT) {
1404 io_flags |= B_COMMIT_UPL;
1405 }
1406 if (flags & CL_DIRECT_IO) {
1407 io_flags |= B_PHYS;
1408 }
1409 if (flags & (CL_PRESERVE | CL_KEEPCACHED)) {
1410 io_flags |= B_CACHE;
1411 }
1412 if (flags & CL_PASSIVE) {
1413 io_flags |= B_PASSIVE;
1414 }
1415 if (flags & CL_ENCRYPTED) {
1416 io_flags |= B_ENCRYPTED_IO;
1417 }
1418
1419 if (vp->v_flag & VSYSTEM) {
1420 io_flags |= B_META;
1421 }
1422
1423 if ((flags & CL_READ) && ((upl_offset + non_rounded_size) & PAGE_MASK) && (!(flags & CL_NOZERO))) {
1424 /*
1425 * then we are going to end up
1426 * with a page that we can't complete (the file size wasn't a multiple
1427 * of PAGE_SIZE and we're trying to read to the end of the file
1428 * so we'll go ahead and zero out the portion of the page we can't
1429 * read in from the file
1430 */
1431 zero_offset = (int)(upl_offset + non_rounded_size);
1432 } else if (!ISSET(flags, CL_READ) && ISSET(flags, CL_DIRECT_IO)) {
1433 assert(ISSET(flags, CL_COMMIT));
1434
1435 // For a direct/uncached write, we need to lock pages...
1436 upl_t cached_upl;
1437 off_t cached_upl_f_offset;
1438 int cached_upl_size;
1439
1440 assert(upl_offset < PAGE_SIZE);
1441
1442 /*
1443 *
1444 * f_offset = b
1445 * upl_offset = 8K
1446 *
1447 * (cached_upl - based on f_offset alignment)
1448 * 0 a b c
1449 * <----|----|----|----|----|----|-----|---->
1450 *
1451 *
1452 * (upl - based on user buffer address alignment)
1453 * <__--|----|----|--__>
1454 *
1455 * 0 1x 2x 3x
1456 *
1457 * Here the cached upl strictly only
1458 * needs to be from b to c. However,
1459 * we also need to be able the offset
1460 * for the associated upl from the f_offset
1461 * and upl_offset and so we round down the
1462 * cached upl f_offset coresponding to the
1463 * upl_offset 0 i.e. we end up locking one
1464 * more page than is strictly required.
1465 *
1466 * (upl)
1467 * <___-|----|---_>
1468 *
1469 * 0 1x 2x
1470 *
1471 * The f_offset < upl_offset condition caters
1472 * to writes in the first page of the file
1473 * with the upl alignment being as above.
1474 * where the subtracting the upl_offset would
1475 * would result in the offset going negative
1476 */
1477 if (f_offset >= upl_offset) {
1478 cached_upl_f_offset = trunc_page_64(f_offset - upl_offset);
1479 } else {
1480 cached_upl_f_offset = 0; /* trunc_page_32(upl_offset) */
1481 }
1482 cached_upl_size = round_page_32(f_offset - cached_upl_f_offset + non_rounded_size);
1483
1484
1485 /*
1486 * Create a UPL to lock the pages in the cache whilst the
1487 * write is in progress.
1488 */
1489 ubc_create_upl_kernel(vp, cached_upl_f_offset, cached_upl_size, &cached_upl,
1490 NULL, UPL_SET_LITE, VM_KERN_MEMORY_FILE);
1491
1492 /*
1493 * Attach this UPL to the other UPL so that we can find it
1494 * later.
1495 */
1496 upl_set_associated_upl(upl, cached_upl);
1497 }
1498
1499 while (size) {
1500 daddr64_t blkno;
1501 daddr64_t lblkno;
1502 size_t io_size_tmp;
1503 u_int io_size_wanted;
1504 uint32_t lblksize;
1505
1506 if (size > max_iosize) {
1507 io_size = max_iosize;
1508 } else {
1509 io_size = size;
1510 }
1511
1512 io_size_wanted = io_size;
1513 io_size_tmp = (size_t)io_size;
1514
1515 if ((error = VNOP_BLOCKMAP(vp, f_offset, io_size, &blkno, &io_size_tmp, NULL, bmap_flags, NULL))) {
1516 break;
1517 }
1518
1519 if (io_size_tmp > io_size_wanted) {
1520 io_size = io_size_wanted;
1521 } else {
1522 io_size = (u_int)io_size_tmp;
1523 }
1524
1525 if (real_bp && (real_bp->b_blkno == real_bp->b_lblkno)) {
1526 real_bp->b_blkno = blkno;
1527 }
1528
1529 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 24)) | DBG_FUNC_NONE,
1530 (int)f_offset, (int)(blkno >> 32), (int)blkno, io_size, 0);
1531
1532 if (io_size == 0) {
1533 /*
1534 * vnop_blockmap didn't return an error... however, it did
1535 * return an extent size of 0 which means we can't
1536 * make forward progress on this I/O... a hole in the
1537 * file would be returned as a blkno of -1 with a non-zero io_size
1538 * a real extent is returned with a blkno != -1 and a non-zero io_size
1539 */
1540 error = EINVAL;
1541 break;
1542 }
1543 if (!(flags & CL_READ) && blkno == -1) {
1544 off_t e_offset;
1545 int pageout_flags;
1546
1547 if (upl_get_internal_vectorupl(upl)) {
1548 panic("Vector UPLs should not take this code-path");
1549 }
1550 /*
1551 * we're writing into a 'hole'
1552 */
1553 if (flags & CL_PAGEOUT) {
1554 /*
1555 * if we got here via cluster_pageout
1556 * then just error the request and return
1557 * the 'hole' should already have been covered
1558 */
1559 error = EINVAL;
1560 break;
1561 }
1562 /*
1563 * we can get here if the cluster code happens to
1564 * pick up a page that was dirtied via mmap vs
1565 * a 'write' and the page targets a 'hole'...
1566 * i.e. the writes to the cluster were sparse
1567 * and the file was being written for the first time
1568 *
1569 * we can also get here if the filesystem supports
1570 * 'holes' that are less than PAGE_SIZE.... because
1571 * we can't know if the range in the page that covers
1572 * the 'hole' has been dirtied via an mmap or not,
1573 * we have to assume the worst and try to push the
1574 * entire page to storage.
1575 *
1576 * Try paging out the page individually before
1577 * giving up entirely and dumping it (the pageout
1578 * path will insure that the zero extent accounting
1579 * has been taken care of before we get back into cluster_io)
1580 *
1581 * go direct to vnode_pageout so that we don't have to
1582 * unbusy the page from the UPL... we used to do this
1583 * so that we could call ubc_msync, but that results
1584 * in a potential deadlock if someone else races us to acquire
1585 * that page and wins and in addition needs one of the pages
1586 * we're continuing to hold in the UPL
1587 */
1588 pageout_flags = UPL_MSYNC | UPL_VNODE_PAGER | UPL_NESTED_PAGEOUT;
1589
1590 if (!(flags & CL_ASYNC)) {
1591 pageout_flags |= UPL_IOSYNC;
1592 }
1593 if (!(flags & CL_COMMIT)) {
1594 pageout_flags |= UPL_NOCOMMIT;
1595 }
1596
1597 if (cbp_head) {
1598 buf_t prev_cbp;
1599 uint32_t bytes_in_last_page;
1600
1601 /*
1602 * first we have to wait for the the current outstanding I/Os
1603 * to complete... EOT hasn't been set yet on this transaction
1604 * so the pages won't be released
1605 */
1606 cluster_wait_IO(cbp_head, (flags & CL_ASYNC));
1607
1608 bytes_in_last_page = cbp_head->b_uploffset & PAGE_MASK;
1609 for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) {
1610 bytes_in_last_page += cbp->b_bcount;
1611 }
1612 bytes_in_last_page &= PAGE_MASK;
1613
1614 while (bytes_in_last_page) {
1615 /*
1616 * we've got a transcation that
1617 * includes the page we're about to push out through vnode_pageout...
1618 * find the bp's in the list which intersect this page and either
1619 * remove them entirely from the transaction (there could be multiple bp's), or
1620 * round it's iosize down to the page boundary (there can only be one)...
1621 *
1622 * find the last bp in the list and act on it
1623 */
1624 for (prev_cbp = cbp = cbp_head; cbp->b_trans_next; cbp = cbp->b_trans_next) {
1625 prev_cbp = cbp;
1626 }
1627
1628 if (bytes_in_last_page >= cbp->b_bcount) {
1629 /*
1630 * this buf no longer has any I/O associated with it
1631 */
1632 bytes_in_last_page -= cbp->b_bcount;
1633 cbp->b_bcount = 0;
1634
1635 free_io_buf(cbp);
1636
1637 if (cbp == cbp_head) {
1638 assert(bytes_in_last_page == 0);
1639 /*
1640 * the buf we just freed was the only buf in
1641 * this transaction... so there's no I/O to do
1642 */
1643 cbp_head = NULL;
1644 cbp_tail = NULL;
1645 } else {
1646 /*
1647 * remove the buf we just freed from
1648 * the transaction list
1649 */
1650 prev_cbp->b_trans_next = NULL;
1651 cbp_tail = prev_cbp;
1652 }
1653 } else {
1654 /*
1655 * this is the last bp that has I/O
1656 * intersecting the page of interest
1657 * only some of the I/O is in the intersection
1658 * so clip the size but keep it in the transaction list
1659 */
1660 cbp->b_bcount -= bytes_in_last_page;
1661 cbp_tail = cbp;
1662 bytes_in_last_page = 0;
1663 }
1664 }
1665 if (cbp_head) {
1666 /*
1667 * there was more to the current transaction
1668 * than just the page we are pushing out via vnode_pageout...
1669 * mark it as finished and complete it... we've already
1670 * waited for the I/Os to complete above in the call to cluster_wait_IO
1671 */
1672 cluster_EOT(cbp_head, cbp_tail, 0, 0);
1673
1674 cluster_complete_transaction(&cbp_head, callback_arg, &retval, flags, 0);
1675
1676 trans_count = 0;
1677 }
1678 }
1679 if (vnode_pageout(vp, upl, (upl_offset_t)trunc_page(upl_offset), trunc_page_64(f_offset), PAGE_SIZE, pageout_flags, NULL) != PAGER_SUCCESS) {
1680 error = EINVAL;
1681 }
1682 e_offset = round_page_64(f_offset + 1);
1683 io_size = (u_int)(e_offset - f_offset);
1684
1685 f_offset += io_size;
1686 upl_offset += io_size;
1687
1688 if (size >= io_size) {
1689 size -= io_size;
1690 } else {
1691 size = 0;
1692 }
1693 /*
1694 * keep track of how much of the original request
1695 * that we've actually completed... non_rounded_size
1696 * may go negative due to us rounding the request
1697 * to a page size multiple (i.e. size > non_rounded_size)
1698 */
1699 non_rounded_size -= io_size;
1700
1701 if (non_rounded_size <= 0) {
1702 /*
1703 * we've transferred all of the data in the original
1704 * request, but we were unable to complete the tail
1705 * of the last page because the file didn't have
1706 * an allocation to back that portion... this is ok.
1707 */
1708 size = 0;
1709 }
1710 if (error) {
1711 if (size == 0) {
1712 flags &= ~CL_COMMIT;
1713 }
1714 break;
1715 }
1716 continue;
1717 }
1718
1719 lblksize = CLUSTER_IO_BLOCK_SIZE;
1720 lblkno = (daddr64_t)(f_offset / lblksize);
1721
1722 /*
1723 * we have now figured out how much I/O we can do - this is in 'io_size'
1724 * pg_offset is the starting point in the first page for the I/O
1725 * pg_count is the number of full and partial pages that 'io_size' encompasses
1726 */
1727 pg_offset = upl_offset & PAGE_MASK;
1728
1729 if (flags & CL_DEV_MEMORY) {
1730 /*
1731 * treat physical requests as one 'giant' page
1732 */
1733 pg_count = 1;
1734 } else {
1735 pg_count = (io_size + pg_offset + (PAGE_SIZE - 1)) / PAGE_SIZE;
1736 }
1737
1738 if ((flags & CL_READ) && blkno == -1) {
1739 vm_offset_t commit_offset;
1740 int bytes_to_zero;
1741 int complete_transaction_now = 0;
1742
1743 /*
1744 * if we're reading and blkno == -1, then we've got a
1745 * 'hole' in the file that we need to deal with by zeroing
1746 * out the affected area in the upl
1747 */
1748 if (io_size >= (u_int)non_rounded_size) {
1749 /*
1750 * if this upl contains the EOF and it is not a multiple of PAGE_SIZE
1751 * than 'zero_offset' will be non-zero
1752 * if the 'hole' returned by vnop_blockmap extends all the way to the eof
1753 * (indicated by the io_size finishing off the I/O request for this UPL)
1754 * than we're not going to issue an I/O for the
1755 * last page in this upl... we need to zero both the hole and the tail
1756 * of the page beyond the EOF, since the delayed zero-fill won't kick in
1757 */
1758 bytes_to_zero = non_rounded_size;
1759 if (!(flags & CL_NOZERO)) {
1760 bytes_to_zero = (int)((((upl_offset + io_size) + (PAGE_SIZE - 1)) & ~PAGE_MASK) - upl_offset);
1761 }
1762
1763 zero_offset = 0;
1764 } else {
1765 bytes_to_zero = io_size;
1766 }
1767
1768 pg_count = 0;
1769
1770 cluster_zero(upl, (upl_offset_t)upl_offset, bytes_to_zero, real_bp);
1771
1772 if (cbp_head) {
1773 int pg_resid;
1774
1775 /*
1776 * if there is a current I/O chain pending
1777 * then the first page of the group we just zero'd
1778 * will be handled by the I/O completion if the zero
1779 * fill started in the middle of the page
1780 */
1781 commit_offset = (upl_offset + (PAGE_SIZE - 1)) & ~PAGE_MASK;
1782
1783 pg_resid = (int)(commit_offset - upl_offset);
1784
1785 if (bytes_to_zero >= pg_resid) {
1786 /*
1787 * the last page of the current I/O
1788 * has been completed...
1789 * compute the number of fully zero'd
1790 * pages that are beyond it
1791 * plus the last page if its partial
1792 * and we have no more I/O to issue...
1793 * otherwise a partial page is left
1794 * to begin the next I/O
1795 */
1796 if ((int)io_size >= non_rounded_size) {
1797 pg_count = (bytes_to_zero - pg_resid + (PAGE_SIZE - 1)) / PAGE_SIZE;
1798 } else {
1799 pg_count = (bytes_to_zero - pg_resid) / PAGE_SIZE;
1800 }
1801
1802 complete_transaction_now = 1;
1803 }
1804 } else {
1805 /*
1806 * no pending I/O to deal with
1807 * so, commit all of the fully zero'd pages
1808 * plus the last page if its partial
1809 * and we have no more I/O to issue...
1810 * otherwise a partial page is left
1811 * to begin the next I/O
1812 */
1813 if ((int)io_size >= non_rounded_size) {
1814 pg_count = (pg_offset + bytes_to_zero + (PAGE_SIZE - 1)) / PAGE_SIZE;
1815 } else {
1816 pg_count = (pg_offset + bytes_to_zero) / PAGE_SIZE;
1817 }
1818
1819 commit_offset = upl_offset & ~PAGE_MASK;
1820 }
1821
1822 // Associated UPL is currently only used in the direct write path
1823 assert(!upl_associated_upl(upl));
1824
1825 if ((flags & CL_COMMIT) && pg_count) {
1826 ubc_upl_commit_range(upl, (upl_offset_t)commit_offset,
1827 pg_count * PAGE_SIZE,
1828 UPL_COMMIT_CLEAR_DIRTY | UPL_COMMIT_FREE_ON_EMPTY);
1829 }
1830 upl_offset += io_size;
1831 f_offset += io_size;
1832 size -= io_size;
1833
1834 /*
1835 * keep track of how much of the original request
1836 * that we've actually completed... non_rounded_size
1837 * may go negative due to us rounding the request
1838 * to a page size multiple (i.e. size > non_rounded_size)
1839 */
1840 non_rounded_size -= io_size;
1841
1842 if (non_rounded_size <= 0) {
1843 /*
1844 * we've transferred all of the data in the original
1845 * request, but we were unable to complete the tail
1846 * of the last page because the file didn't have
1847 * an allocation to back that portion... this is ok.
1848 */
1849 size = 0;
1850 }
1851 if (cbp_head && (complete_transaction_now || size == 0)) {
1852 cluster_wait_IO(cbp_head, (flags & CL_ASYNC));
1853
1854 cluster_EOT(cbp_head, cbp_tail, size == 0 ? zero_offset : 0, verify_block_size);
1855
1856 cluster_complete_transaction(&cbp_head, callback_arg, &retval, flags, 0);
1857
1858 trans_count = 0;
1859 }
1860 continue;
1861 }
1862 if (pg_count > max_vectors) {
1863 if (((pg_count - max_vectors) * PAGE_SIZE) > io_size) {
1864 io_size = PAGE_SIZE - pg_offset;
1865 pg_count = 1;
1866 } else {
1867 io_size -= (pg_count - max_vectors) * PAGE_SIZE;
1868 pg_count = max_vectors;
1869 }
1870 }
1871 /*
1872 * If the transaction is going to reach the maximum number of
1873 * desired elements, truncate the i/o to the nearest page so
1874 * that the actual i/o is initiated after this buffer is
1875 * created and added to the i/o chain.
1876 *
1877 * I/O directed to physically contiguous memory
1878 * doesn't have a requirement to make sure we 'fill' a page
1879 */
1880 if (!(flags & CL_DEV_MEMORY) && trans_count >= max_trans_count &&
1881 ((upl_offset + io_size) & PAGE_MASK)) {
1882 vm_offset_t aligned_ofs;
1883
1884 aligned_ofs = (upl_offset + io_size) & ~PAGE_MASK;
1885 /*
1886 * If the io_size does not actually finish off even a
1887 * single page we have to keep adding buffers to the
1888 * transaction despite having reached the desired limit.
1889 *
1890 * Eventually we get here with the page being finished
1891 * off (and exceeded) and then we truncate the size of
1892 * this i/o request so that it is page aligned so that
1893 * we can finally issue the i/o on the transaction.
1894 */
1895 if (aligned_ofs > upl_offset) {
1896 io_size = (u_int)(aligned_ofs - upl_offset);
1897 pg_count--;
1898 }
1899 }
1900
1901 if (!(mp->mnt_kern_flag & MNTK_VIRTUALDEV)) {
1902 /*
1903 * if we're not targeting a virtual device i.e. a disk image
1904 * it's safe to dip into the reserve pool since real devices
1905 * can complete this I/O request without requiring additional
1906 * bufs from the alloc_io_buf pool
1907 */
1908 priv = 1;
1909 } else if ((flags & CL_ASYNC) && !(flags & CL_PAGEOUT) && !cbp_head) {
1910 /*
1911 * Throttle the speculative IO
1912 *
1913 * We can only throttle this if it is the first iobuf
1914 * for the transaction. alloc_io_buf implements
1915 * additional restrictions for diskimages anyway.
1916 */
1917 priv = 0;
1918 } else {
1919 priv = 1;
1920 }
1921
1922 cbp = alloc_io_buf(vp, priv);
1923
1924 if (flags & CL_PAGEOUT) {
1925 u_int i;
1926
1927 /*
1928 * since blocks are in offsets of lblksize (CLUSTER_IO_BLOCK_SIZE), scale
1929 * iteration to (PAGE_SIZE * pg_count) of blks.
1930 */
1931 for (i = 0; i < (PAGE_SIZE * pg_count) / lblksize; i++) {
1932 if (buf_invalblkno(vp, lblkno + i, 0) == EBUSY) {
1933 panic("BUSY bp found in cluster_io");
1934 }
1935 }
1936 }
1937 if (flags & CL_ASYNC) {
1938 if (buf_setcallback(cbp, (void *)cluster_iodone, callback_arg)) {
1939 panic("buf_setcallback failed");
1940 }
1941 }
1942 cbp->b_cliodone = (void *)callback;
1943 cbp->b_flags |= io_flags;
1944 if (flags & CL_NOCACHE) {
1945 cbp->b_attr.ba_flags |= BA_NOCACHE;
1946 }
1947 if (verify_block_size) {
1948 cbp->b_attr.ba_flags |= BA_WILL_VERIFY;
1949 }
1950
1951 cbp->b_lblkno = lblkno;
1952 cbp->b_lblksize = lblksize;
1953 cbp->b_blkno = blkno;
1954 cbp->b_bcount = io_size;
1955
1956 if (buf_setupl(cbp, upl, (uint32_t)upl_offset)) {
1957 panic("buf_setupl failed");
1958 }
1959 #if CONFIG_IOSCHED
1960 upl_set_blkno(upl, upl_offset, io_size, blkno);
1961 #endif
1962 cbp->b_trans_next = (buf_t)NULL;
1963
1964 if ((cbp->b_iostate = (void *)iostate)) {
1965 /*
1966 * caller wants to track the state of this
1967 * io... bump the amount issued against this stream
1968 */
1969 iostate->io_issued += io_size;
1970 }
1971
1972 if (flags & CL_READ) {
1973 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 26)) | DBG_FUNC_NONE,
1974 (int)cbp->b_lblkno, (int)cbp->b_blkno, upl_offset, io_size, 0);
1975 } else {
1976 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 27)) | DBG_FUNC_NONE,
1977 (int)cbp->b_lblkno, (int)cbp->b_blkno, upl_offset, io_size, 0);
1978 }
1979
1980 if (cbp_head) {
1981 cbp_tail->b_trans_next = cbp;
1982 cbp_tail = cbp;
1983 } else {
1984 cbp_head = cbp;
1985 cbp_tail = cbp;
1986
1987 if ((cbp_head->b_real_bp = real_bp)) {
1988 real_bp = (buf_t)NULL;
1989 }
1990 }
1991 *(buf_t *)(&cbp->b_trans_head) = cbp_head;
1992
1993 trans_count++;
1994
1995 upl_offset += io_size;
1996 f_offset += io_size;
1997 size -= io_size;
1998 /*
1999 * keep track of how much of the original request
2000 * that we've actually completed... non_rounded_size
2001 * may go negative due to us rounding the request
2002 * to a page size multiple (i.e. size > non_rounded_size)
2003 */
2004 non_rounded_size -= io_size;
2005
2006 if (non_rounded_size <= 0) {
2007 /*
2008 * we've transferred all of the data in the original
2009 * request, but we were unable to complete the tail
2010 * of the last page because the file didn't have
2011 * an allocation to back that portion... this is ok.
2012 */
2013 size = 0;
2014 }
2015 if (size == 0) {
2016 /*
2017 * we have no more I/O to issue, so go
2018 * finish the final transaction
2019 */
2020 need_EOT = TRUE;
2021 } else if (((flags & CL_DEV_MEMORY) || (upl_offset & PAGE_MASK) == 0) &&
2022 ((flags & CL_ASYNC) || trans_count > max_trans_count)) {
2023 /*
2024 * I/O directed to physically contiguous memory...
2025 * which doesn't have a requirement to make sure we 'fill' a page
2026 * or...
2027 * the current I/O we've prepared fully
2028 * completes the last page in this request
2029 * and ...
2030 * it's either an ASYNC request or
2031 * we've already accumulated more than 8 I/O's into
2032 * this transaction so mark it as complete so that
2033 * it can finish asynchronously or via the cluster_complete_transaction
2034 * below if the request is synchronous
2035 */
2036 need_EOT = TRUE;
2037 }
2038 if (need_EOT == TRUE) {
2039 cluster_EOT(cbp_head, cbp_tail, size == 0 ? zero_offset : 0, verify_block_size);
2040 }
2041
2042 if (flags & CL_THROTTLE) {
2043 (void)vnode_waitforwrites(vp, async_throttle, 0, 0, "cluster_io");
2044 }
2045
2046 if (!(io_flags & B_READ)) {
2047 vnode_startwrite(vp);
2048 }
2049
2050 if (flags & CL_RAW_ENCRYPTED) {
2051 /*
2052 * User requested raw encrypted bytes.
2053 * Twiddle the bit in the ba_flags for the buffer
2054 */
2055 cbp->b_attr.ba_flags |= BA_RAW_ENCRYPTED_IO;
2056 }
2057
2058 (void) VNOP_STRATEGY(cbp);
2059
2060 if (need_EOT == TRUE) {
2061 if (!(flags & CL_ASYNC)) {
2062 cluster_complete_transaction(&cbp_head, callback_arg, &retval, flags, 1);
2063 }
2064
2065 need_EOT = FALSE;
2066 trans_count = 0;
2067 cbp_head = NULL;
2068 }
2069 }
2070 if (error) {
2071 int abort_size;
2072
2073 io_size = 0;
2074
2075 if (cbp_head) {
2076 /*
2077 * Wait until all of the outstanding I/O
2078 * for this partial transaction has completed
2079 */
2080 cluster_wait_IO(cbp_head, (flags & CL_ASYNC));
2081
2082 /*
2083 * Rewind the upl offset to the beginning of the
2084 * transaction.
2085 */
2086 upl_offset = cbp_head->b_uploffset;
2087 }
2088
2089 if (ISSET(flags, CL_COMMIT)) {
2090 cluster_handle_associated_upl(iostate, upl,
2091 (upl_offset_t)upl_offset,
2092 (upl_size_t)(upl_end_offset - upl_offset),
2093 (cbp_head ? (cbp_head->b_lblkno * cbp_head->b_lblksize) : f_offset));
2094 }
2095
2096 // Free all the IO buffers in this transaction
2097 for (cbp = cbp_head; cbp;) {
2098 buf_t cbp_next;
2099
2100 size += cbp->b_bcount;
2101 io_size += cbp->b_bcount;
2102
2103 cbp_next = cbp->b_trans_next;
2104 free_io_buf(cbp);
2105 cbp = cbp_next;
2106 }
2107
2108 if (iostate) {
2109 int need_wakeup = 0;
2110
2111 /*
2112 * update the error condition for this stream
2113 * since we never really issued the io
2114 * just go ahead and adjust it back
2115 */
2116 lck_mtx_lock_spin(&iostate->io_mtxp);
2117
2118 if (iostate->io_error == 0) {
2119 iostate->io_error = error;
2120 }
2121 iostate->io_issued -= io_size;
2122
2123 if (iostate->io_wanted) {
2124 /*
2125 * someone is waiting for the state of
2126 * this io stream to change
2127 */
2128 iostate->io_wanted = 0;
2129 need_wakeup = 1;
2130 }
2131 lck_mtx_unlock(&iostate->io_mtxp);
2132
2133 if (need_wakeup) {
2134 wakeup((caddr_t)&iostate->io_wanted);
2135 }
2136 }
2137
2138 if (flags & CL_COMMIT) {
2139 int upl_flags;
2140
2141 pg_offset = upl_offset & PAGE_MASK;
2142 abort_size = (int)((upl_end_offset - upl_offset + PAGE_MASK) & ~PAGE_MASK);
2143
2144 upl_flags = cluster_ioerror(upl, (int)(upl_offset - pg_offset),
2145 abort_size, error, io_flags, vp);
2146
2147 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 28)) | DBG_FUNC_NONE,
2148 upl, upl_offset - pg_offset, abort_size, (error << 24) | upl_flags, 0);
2149 }
2150 if (retval == 0) {
2151 retval = error;
2152 }
2153 } else if (cbp_head) {
2154 panic("%s(): cbp_head is not NULL.", __FUNCTION__);
2155 }
2156
2157 if (real_bp) {
2158 /*
2159 * can get here if we either encountered an error
2160 * or we completely zero-filled the request and
2161 * no I/O was issued
2162 */
2163 if (error) {
2164 real_bp->b_flags |= B_ERROR;
2165 real_bp->b_error = error;
2166 }
2167 buf_biodone(real_bp);
2168 }
2169 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_END, (int)f_offset, size, upl_offset, retval, 0);
2170
2171 return retval;
2172 }
2173
/*
 * Reset the accumulator state used to batch individual UPLs into a single
 * vectored UPL on the direct I/O paths: the running offset, element index,
 * accumulated iosize/size, and the "time to issue" flag.  Expands to a
 * chained assignment; relies on these locals being in scope at the call site.
 */
#define reset_vector_run_state() \
	issueVectorUPL = vector_upl_offset = vector_upl_index = vector_upl_iosize = vector_upl_size = 0;
2176
2177 static int
vector_cluster_io(vnode_t vp,upl_t vector_upl,vm_offset_t vector_upl_offset,off_t v_upl_uio_offset,int vector_upl_iosize,int io_flag,buf_t real_bp,struct clios * iostate,int (* callback)(buf_t,void *),void * callback_arg)2178 vector_cluster_io(vnode_t vp, upl_t vector_upl, vm_offset_t vector_upl_offset, off_t v_upl_uio_offset, int vector_upl_iosize,
2179 int io_flag, buf_t real_bp, struct clios *iostate, int (*callback)(buf_t, void *), void *callback_arg)
2180 {
2181 vector_upl_set_pagelist(vector_upl);
2182
2183 if (io_flag & CL_READ) {
2184 if (vector_upl_offset == 0 && ((vector_upl_iosize & PAGE_MASK) == 0)) {
2185 io_flag &= ~CL_PRESERVE; /*don't zero fill*/
2186 } else {
2187 io_flag |= CL_PRESERVE; /*zero fill*/
2188 }
2189 }
2190 return cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, real_bp, iostate, callback, callback_arg);
2191 }
2192
2193 static int
cluster_read_prefetch(vnode_t vp,off_t f_offset,u_int size,off_t filesize,int (* callback)(buf_t,void *),void * callback_arg,int bflag)2194 cluster_read_prefetch(vnode_t vp, off_t f_offset, u_int size, off_t filesize, int (*callback)(buf_t, void *), void *callback_arg, int bflag)
2195 {
2196 int pages_in_prefetch;
2197
2198 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_START,
2199 (int)f_offset, size, (int)filesize, 0, 0);
2200
2201 if (f_offset >= filesize) {
2202 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
2203 (int)f_offset, 0, 0, 0, 0);
2204 return 0;
2205 }
2206 if ((off_t)size > (filesize - f_offset)) {
2207 size = (u_int)(filesize - f_offset);
2208 }
2209 pages_in_prefetch = (size + (PAGE_SIZE - 1)) / PAGE_SIZE;
2210
2211 advisory_read_ext(vp, filesize, f_offset, size, callback, callback_arg, bflag);
2212
2213 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
2214 (int)f_offset + size, pages_in_prefetch, 0, 1, 0);
2215
2216 return pages_in_prefetch;
2217 }
2218
2219
2220
2221 static void
cluster_read_ahead(vnode_t vp,struct cl_extent * extent,off_t filesize,struct cl_readahead * rap,int (* callback)(buf_t,void *),void * callback_arg,int bflag)2222 cluster_read_ahead(vnode_t vp, struct cl_extent *extent, off_t filesize, struct cl_readahead *rap, int (*callback)(buf_t, void *), void *callback_arg,
2223 int bflag)
2224 {
2225 daddr64_t r_addr;
2226 off_t f_offset;
2227 int size_of_prefetch;
2228 u_int max_prefetch;
2229
2230
2231 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_START,
2232 (int)extent->b_addr, (int)extent->e_addr, (int)rap->cl_lastr, 0, 0);
2233
2234 if (extent->b_addr == rap->cl_lastr && extent->b_addr == extent->e_addr) {
2235 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
2236 rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 0, 0);
2237 return;
2238 }
2239 if (rap->cl_lastr == -1 || (extent->b_addr != rap->cl_lastr && extent->b_addr != (rap->cl_lastr + 1))) {
2240 rap->cl_ralen = 0;
2241 rap->cl_maxra = 0;
2242
2243 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
2244 rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 1, 0);
2245
2246 return;
2247 }
2248
2249 max_prefetch = cluster_max_prefetch(vp,
2250 cluster_max_io_size(vp->v_mount, CL_READ), speculative_prefetch_max);
2251
2252 if (max_prefetch <= PAGE_SIZE) {
2253 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
2254 rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 6, 0);
2255 return;
2256 }
2257 if (extent->e_addr < rap->cl_maxra && rap->cl_ralen >= 4) {
2258 if ((rap->cl_maxra - extent->e_addr) > (rap->cl_ralen / 4)) {
2259 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
2260 rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 2, 0);
2261 return;
2262 }
2263 }
2264 r_addr = MAX(extent->e_addr, rap->cl_maxra) + 1;
2265 f_offset = (off_t)(r_addr * PAGE_SIZE_64);
2266
2267 size_of_prefetch = 0;
2268
2269 ubc_range_op(vp, f_offset, f_offset + PAGE_SIZE_64, UPL_ROP_PRESENT, &size_of_prefetch);
2270
2271 if (size_of_prefetch) {
2272 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
2273 rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 3, 0);
2274 return;
2275 }
2276 if (f_offset < filesize) {
2277 daddr64_t read_size;
2278
2279 rap->cl_ralen = rap->cl_ralen ? min(max_prefetch / PAGE_SIZE, rap->cl_ralen << 1) : 1;
2280
2281 read_size = (extent->e_addr + 1) - extent->b_addr;
2282
2283 if (read_size > rap->cl_ralen) {
2284 if (read_size > max_prefetch / PAGE_SIZE) {
2285 rap->cl_ralen = max_prefetch / PAGE_SIZE;
2286 } else {
2287 rap->cl_ralen = (int)read_size;
2288 }
2289 }
2290 size_of_prefetch = cluster_read_prefetch(vp, f_offset, rap->cl_ralen * PAGE_SIZE, filesize, callback, callback_arg, bflag);
2291
2292 if (size_of_prefetch) {
2293 rap->cl_maxra = (r_addr + size_of_prefetch) - 1;
2294 }
2295 }
2296 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
2297 rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 4, 0);
2298 }
2299
2300
2301 int
cluster_pageout(vnode_t vp,upl_t upl,upl_offset_t upl_offset,off_t f_offset,int size,off_t filesize,int flags)2302 cluster_pageout(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset,
2303 int size, off_t filesize, int flags)
2304 {
2305 return cluster_pageout_ext(vp, upl, upl_offset, f_offset, size, filesize, flags, NULL, NULL);
2306 }
2307
2308
2309 int
cluster_pageout_ext(vnode_t vp,upl_t upl,upl_offset_t upl_offset,off_t f_offset,int size,off_t filesize,int flags,int (* callback)(buf_t,void *),void * callback_arg)2310 cluster_pageout_ext(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset,
2311 int size, off_t filesize, int flags, int (*callback)(buf_t, void *), void *callback_arg)
2312 {
2313 int io_size;
2314 int rounded_size;
2315 off_t max_size;
2316 int local_flags;
2317
2318 local_flags = CL_PAGEOUT | CL_THROTTLE;
2319
2320 if ((flags & UPL_IOSYNC) == 0) {
2321 local_flags |= CL_ASYNC;
2322 }
2323 if ((flags & UPL_NOCOMMIT) == 0) {
2324 local_flags |= CL_COMMIT;
2325 }
2326 if ((flags & UPL_KEEPCACHED)) {
2327 local_flags |= CL_KEEPCACHED;
2328 }
2329 if (flags & UPL_PAGING_ENCRYPTED) {
2330 local_flags |= CL_ENCRYPTED;
2331 }
2332
2333
2334 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 52)) | DBG_FUNC_NONE,
2335 (int)f_offset, size, (int)filesize, local_flags, 0);
2336
2337 /*
2338 * If they didn't specify any I/O, then we are done...
2339 * we can't issue an abort because we don't know how
2340 * big the upl really is
2341 */
2342 if (size <= 0) {
2343 return EINVAL;
2344 }
2345
2346 if (vp->v_mount->mnt_flag & MNT_RDONLY) {
2347 if (local_flags & CL_COMMIT) {
2348 ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
2349 }
2350 return EROFS;
2351 }
2352 /*
2353 * can't page-in from a negative offset
2354 * or if we're starting beyond the EOF
2355 * or if the file offset isn't page aligned
2356 * or the size requested isn't a multiple of PAGE_SIZE
2357 */
2358 if (f_offset < 0 || f_offset >= filesize ||
2359 (f_offset & PAGE_MASK_64) || (size & PAGE_MASK)) {
2360 if (local_flags & CL_COMMIT) {
2361 ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
2362 }
2363 return EINVAL;
2364 }
2365 max_size = filesize - f_offset;
2366
2367 if (size < max_size) {
2368 io_size = size;
2369 } else {
2370 io_size = (int)max_size;
2371 }
2372
2373 rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
2374
2375 if (size > rounded_size) {
2376 if (local_flags & CL_COMMIT) {
2377 ubc_upl_abort_range(upl, upl_offset + rounded_size, size - rounded_size,
2378 UPL_ABORT_FREE_ON_EMPTY);
2379 }
2380 }
2381 return cluster_io(vp, upl, upl_offset, f_offset, io_size,
2382 local_flags, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
2383 }
2384
2385
2386 int
cluster_pagein(vnode_t vp,upl_t upl,upl_offset_t upl_offset,off_t f_offset,int size,off_t filesize,int flags)2387 cluster_pagein(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset,
2388 int size, off_t filesize, int flags)
2389 {
2390 return cluster_pagein_ext(vp, upl, upl_offset, f_offset, size, filesize, flags, NULL, NULL);
2391 }
2392
2393
2394 int
cluster_pagein_ext(vnode_t vp,upl_t upl,upl_offset_t upl_offset,off_t f_offset,int size,off_t filesize,int flags,int (* callback)(buf_t,void *),void * callback_arg)2395 cluster_pagein_ext(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset,
2396 int size, off_t filesize, int flags, int (*callback)(buf_t, void *), void *callback_arg)
2397 {
2398 u_int io_size;
2399 int rounded_size;
2400 off_t max_size;
2401 int retval;
2402 int local_flags = 0;
2403
2404 if (upl == NULL || size < 0) {
2405 panic("cluster_pagein: NULL upl passed in");
2406 }
2407
2408 if ((flags & UPL_IOSYNC) == 0) {
2409 local_flags |= CL_ASYNC;
2410 }
2411 if ((flags & UPL_NOCOMMIT) == 0) {
2412 local_flags |= CL_COMMIT;
2413 }
2414 if (flags & UPL_IOSTREAMING) {
2415 local_flags |= CL_IOSTREAMING;
2416 }
2417 if (flags & UPL_PAGING_ENCRYPTED) {
2418 local_flags |= CL_ENCRYPTED;
2419 }
2420
2421
2422 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 56)) | DBG_FUNC_NONE,
2423 (int)f_offset, size, (int)filesize, local_flags, 0);
2424
2425 /*
2426 * can't page-in from a negative offset
2427 * or if we're starting beyond the EOF
2428 * or if the file offset isn't page aligned
2429 * or the size requested isn't a multiple of PAGE_SIZE
2430 */
2431 if (f_offset < 0 || f_offset >= filesize ||
2432 (f_offset & PAGE_MASK_64) || (size & PAGE_MASK) || (upl_offset & PAGE_MASK)) {
2433 if (local_flags & CL_COMMIT) {
2434 ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
2435 }
2436
2437 if (f_offset >= filesize) {
2438 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_CLUSTER, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_CL_PGIN_PAST_EOF), 0 /* arg */);
2439 }
2440
2441 return EINVAL;
2442 }
2443 max_size = filesize - f_offset;
2444
2445 if (size < max_size) {
2446 io_size = size;
2447 } else {
2448 io_size = (int)max_size;
2449 }
2450
2451 rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
2452
2453 if (size > rounded_size && (local_flags & CL_COMMIT)) {
2454 ubc_upl_abort_range(upl, upl_offset + rounded_size,
2455 size - rounded_size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
2456 }
2457
2458 retval = cluster_io(vp, upl, upl_offset, f_offset, io_size,
2459 local_flags | CL_READ | CL_PAGEIN, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
2460
2461 return retval;
2462 }
2463
2464
2465 int
cluster_bp(buf_t bp)2466 cluster_bp(buf_t bp)
2467 {
2468 return cluster_bp_ext(bp, NULL, NULL);
2469 }
2470
2471
2472 int
cluster_bp_ext(buf_t bp,int (* callback)(buf_t,void *),void * callback_arg)2473 cluster_bp_ext(buf_t bp, int (*callback)(buf_t, void *), void *callback_arg)
2474 {
2475 off_t f_offset;
2476 int flags;
2477
2478 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 19)) | DBG_FUNC_START,
2479 bp, (int)bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);
2480
2481 if (bp->b_flags & B_READ) {
2482 flags = CL_ASYNC | CL_READ;
2483 } else {
2484 flags = CL_ASYNC;
2485 }
2486 if (bp->b_flags & B_PASSIVE) {
2487 flags |= CL_PASSIVE;
2488 }
2489
2490 f_offset = ubc_blktooff(bp->b_vp, bp->b_lblkno);
2491
2492 return cluster_io(bp->b_vp, bp->b_upl, 0, f_offset, bp->b_bcount, flags, bp, (struct clios *)NULL, callback, callback_arg);
2493 }
2494
2495
2496
2497 int
cluster_write(vnode_t vp,struct uio * uio,off_t oldEOF,off_t newEOF,off_t headOff,off_t tailOff,int xflags)2498 cluster_write(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, off_t headOff, off_t tailOff, int xflags)
2499 {
2500 return cluster_write_ext(vp, uio, oldEOF, newEOF, headOff, tailOff, xflags, NULL, NULL);
2501 }
2502
2503
2504 int
cluster_write_ext(vnode_t vp,struct uio * uio,off_t oldEOF,off_t newEOF,off_t headOff,off_t tailOff,int xflags,int (* callback)(buf_t,void *),void * callback_arg)2505 cluster_write_ext(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, off_t headOff, off_t tailOff,
2506 int xflags, int (*callback)(buf_t, void *), void *callback_arg)
2507 {
2508 user_ssize_t cur_resid;
2509 int retval = 0;
2510 int flags;
2511 int zflags;
2512 int bflag;
2513 int write_type = IO_COPY;
2514 u_int32_t write_length;
2515 uint32_t min_direct_size = MIN_DIRECT_WRITE_SIZE;
2516
2517 flags = xflags;
2518
2519 if (flags & IO_PASSIVE) {
2520 bflag = CL_PASSIVE;
2521 } else {
2522 bflag = 0;
2523 }
2524
2525 if (vp->v_flag & VNOCACHE_DATA) {
2526 flags |= IO_NOCACHE;
2527 bflag |= CL_NOCACHE;
2528 }
2529 if (uio == NULL) {
2530 /*
2531 * no user data...
2532 * this call is being made to zero-fill some range in the file
2533 */
2534 retval = cluster_write_copy(vp, NULL, (u_int32_t)0, oldEOF, newEOF, headOff, tailOff, flags, callback, callback_arg);
2535
2536 return retval;
2537 }
2538 /*
2539 * do a write through the cache if one of the following is true....
2540 * NOCACHE is not true or NODIRECT is true
2541 * the uio request doesn't target USERSPACE
2542 * otherwise, find out if we want the direct or contig variant for
2543 * the first vector in the uio request
2544 */
2545 if (((flags & (IO_NOCACHE | IO_NODIRECT)) == IO_NOCACHE) && UIO_SEG_IS_USER_SPACE(uio->uio_segflg)) {
2546 if (proc_allow_nocache_write_fs_blksize(current_proc())) {
2547 uint32_t fs_bsize = vp->v_mount->mnt_vfsstat.f_bsize;
2548
2549 if (fs_bsize && (fs_bsize < MIN_DIRECT_WRITE_SIZE) &&
2550 ((fs_bsize & (fs_bsize - 1)) == 0)) {
2551 min_direct_size = fs_bsize;
2552 }
2553 }
2554 retval = cluster_io_type(uio, &write_type, &write_length, min_direct_size);
2555 }
2556
2557 if ((flags & (IO_TAILZEROFILL | IO_HEADZEROFILL)) && write_type == IO_DIRECT) {
2558 /*
2559 * must go through the cached variant in this case
2560 */
2561 write_type = IO_COPY;
2562 }
2563
2564 while ((cur_resid = uio_resid(uio)) && uio->uio_offset < newEOF && retval == 0) {
2565 switch (write_type) {
2566 case IO_COPY:
2567 /*
2568 * make sure the uio_resid isn't too big...
2569 * internally, we want to handle all of the I/O in
2570 * chunk sizes that fit in a 32 bit int
2571 */
2572 if (cur_resid > (user_ssize_t)(MAX_IO_REQUEST_SIZE)) {
2573 /*
2574 * we're going to have to call cluster_write_copy
2575 * more than once...
2576 *
2577 * only want the last call to cluster_write_copy to
2578 * have the IO_TAILZEROFILL flag set and only the
2579 * first call should have IO_HEADZEROFILL
2580 */
2581 zflags = flags & ~IO_TAILZEROFILL;
2582 flags &= ~IO_HEADZEROFILL;
2583
2584 write_length = MAX_IO_REQUEST_SIZE;
2585 } else {
2586 /*
2587 * last call to cluster_write_copy
2588 */
2589 zflags = flags;
2590
2591 write_length = (u_int32_t)cur_resid;
2592 }
2593 retval = cluster_write_copy(vp, uio, write_length, oldEOF, newEOF, headOff, tailOff, zflags, callback, callback_arg);
2594 break;
2595
2596 case IO_CONTIG:
2597 zflags = flags & ~(IO_TAILZEROFILL | IO_HEADZEROFILL);
2598
2599 if (flags & IO_HEADZEROFILL) {
2600 /*
2601 * only do this once per request
2602 */
2603 flags &= ~IO_HEADZEROFILL;
2604
2605 retval = cluster_write_copy(vp, (struct uio *)0, (u_int32_t)0, (off_t)0, uio->uio_offset,
2606 headOff, (off_t)0, zflags | IO_HEADZEROFILL | IO_SYNC, callback, callback_arg);
2607 if (retval) {
2608 break;
2609 }
2610 }
2611 retval = cluster_write_contig(vp, uio, newEOF, &write_type, &write_length, callback, callback_arg, bflag);
2612
2613 if (retval == 0 && (flags & IO_TAILZEROFILL) && uio_resid(uio) == 0) {
2614 /*
2615 * we're done with the data from the user specified buffer(s)
2616 * and we've been requested to zero fill at the tail
2617 * treat this as an IO_HEADZEROFILL which doesn't require a uio
2618 * by rearranging the args and passing in IO_HEADZEROFILL
2619 */
2620
2621 /*
2622 * Update the oldEOF to reflect the current EOF. If the UPL page
2623 * to zero-fill is not valid (when F_NOCACHE is set), the
2624 * cluster_write_copy() will perform RMW on the UPL page when
2625 * the oldEOF is not aligned on page boundary due to unaligned
2626 * write.
2627 */
2628 if (uio->uio_offset > oldEOF) {
2629 oldEOF = uio->uio_offset;
2630 }
2631 retval = cluster_write_copy(vp, (struct uio *)0, (u_int32_t)0, (off_t)oldEOF, tailOff, uio->uio_offset,
2632 (off_t)0, zflags | IO_HEADZEROFILL | IO_SYNC, callback, callback_arg);
2633 }
2634 break;
2635
2636 case IO_DIRECT:
2637 /*
2638 * cluster_write_direct is never called with IO_TAILZEROFILL || IO_HEADZEROFILL
2639 */
2640 retval = cluster_write_direct(vp, uio, oldEOF, newEOF, &write_type, &write_length, flags, callback, callback_arg, min_direct_size);
2641 break;
2642
2643 case IO_UNKNOWN:
2644 retval = cluster_io_type(uio, &write_type, &write_length, min_direct_size);
2645 break;
2646 }
2647 /*
2648 * in case we end up calling cluster_write_copy (from cluster_write_direct)
2649 * multiple times to service a multi-vector request that is not aligned properly
2650 * we need to update the oldEOF so that we
2651 * don't zero-fill the head of a page if we've successfully written
2652 * data to that area... 'cluster_write_copy' will zero-fill the head of a
2653 * page that is beyond the oldEOF if the write is unaligned... we only
2654 * want that to happen for the very first page of the cluster_write,
2655 * NOT the first page of each vector making up a multi-vector write.
2656 */
2657 if (uio->uio_offset > oldEOF) {
2658 oldEOF = uio->uio_offset;
2659 }
2660 }
2661 return retval;
2662 }
2663
2664
2665 static int
cluster_write_direct(vnode_t vp,struct uio * uio,off_t oldEOF,off_t newEOF,int * write_type,u_int32_t * write_length,int flags,int (* callback)(buf_t,void *),void * callback_arg,uint32_t min_io_size)2666 cluster_write_direct(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, int *write_type, u_int32_t *write_length,
2667 int flags, int (*callback)(buf_t, void *), void *callback_arg, uint32_t min_io_size)
2668 {
2669 upl_t upl = NULL;
2670 upl_page_info_t *pl;
2671 vm_offset_t upl_offset;
2672 vm_offset_t vector_upl_offset = 0;
2673 u_int32_t io_req_size;
2674 u_int32_t offset_in_file;
2675 u_int32_t offset_in_iovbase;
2676 u_int32_t io_size;
2677 int io_flag = 0;
2678 upl_size_t upl_size = 0, vector_upl_size = 0;
2679 vm_size_t upl_needed_size;
2680 mach_msg_type_number_t pages_in_pl = 0;
2681 upl_control_flags_t upl_flags;
2682 kern_return_t kret = KERN_SUCCESS;
2683 mach_msg_type_number_t i = 0;
2684 int force_data_sync;
2685 int retval = 0;
2686 int first_IO = 1;
2687 struct clios iostate;
2688 user_addr_t iov_base;
2689 u_int32_t mem_alignment_mask;
2690 u_int32_t devblocksize;
2691 u_int32_t max_io_size;
2692 u_int32_t max_upl_size;
2693 u_int32_t max_vector_size;
2694 u_int32_t bytes_outstanding_limit;
2695 boolean_t io_throttled = FALSE;
2696
2697 u_int32_t vector_upl_iosize = 0;
2698 int issueVectorUPL = 0, useVectorUPL = (uio->uio_iovcnt > 1);
2699 off_t v_upl_uio_offset = 0;
2700 int vector_upl_index = 0;
2701 upl_t vector_upl = NULL;
2702
2703 uint32_t io_align_mask;
2704
2705 /*
2706 * When we enter this routine, we know
2707 * -- the resid will not exceed iov_len
2708 */
2709 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_START,
2710 (int)uio->uio_offset, *write_length, (int)newEOF, 0, 0);
2711
2712 assert(vm_map_page_shift(current_map()) >= PAGE_SHIFT);
2713
2714 max_upl_size = cluster_max_io_size(vp->v_mount, CL_WRITE);
2715
2716 io_flag = CL_ASYNC | CL_PRESERVE | CL_COMMIT | CL_THROTTLE | CL_DIRECT_IO;
2717
2718 if (flags & IO_PASSIVE) {
2719 io_flag |= CL_PASSIVE;
2720 }
2721
2722 if (flags & IO_NOCACHE) {
2723 io_flag |= CL_NOCACHE;
2724 }
2725
2726 if (flags & IO_SKIP_ENCRYPTION) {
2727 io_flag |= CL_ENCRYPTED;
2728 }
2729
2730 iostate.io_completed = 0;
2731 iostate.io_issued = 0;
2732 iostate.io_error = 0;
2733 iostate.io_wanted = 0;
2734
2735 lck_mtx_init(&iostate.io_mtxp, &cl_mtx_grp, LCK_ATTR_NULL);
2736
2737 mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask;
2738 devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize;
2739
2740 if (devblocksize == 1) {
2741 /*
2742 * the AFP client advertises a devblocksize of 1
2743 * however, its BLOCKMAP routine maps to physical
2744 * blocks that are PAGE_SIZE in size...
2745 * therefore we can't ask for I/Os that aren't page aligned
2746 * or aren't multiples of PAGE_SIZE in size
2747 * by setting devblocksize to PAGE_SIZE, we re-instate
2748 * the old behavior we had before the mem_alignment_mask
2749 * changes went in...
2750 */
2751 devblocksize = PAGE_SIZE;
2752 }
2753
2754 io_align_mask = PAGE_MASK;
2755 if (min_io_size < MIN_DIRECT_WRITE_SIZE) {
2756 /* The process has opted into fs blocksize direct io writes */
2757 assert((min_io_size & (min_io_size - 1)) == 0);
2758 io_align_mask = min_io_size - 1;
2759 io_flag |= CL_DIRECT_IO_FSBLKSZ;
2760 }
2761
2762 next_dwrite:
2763 io_req_size = *write_length;
2764 iov_base = uio_curriovbase(uio);
2765
2766 offset_in_file = (u_int32_t)(uio->uio_offset & io_align_mask);
2767 offset_in_iovbase = (u_int32_t)(iov_base & mem_alignment_mask);
2768
2769 if (offset_in_file || offset_in_iovbase) {
2770 /*
2771 * one of the 2 important offsets is misaligned
2772 * so fire an I/O through the cache for this entire vector
2773 */
2774 goto wait_for_dwrites;
2775 }
2776 if (iov_base & (devblocksize - 1)) {
2777 /*
2778 * the offset in memory must be on a device block boundary
2779 * so that we can guarantee that we can generate an
2780 * I/O that ends on a page boundary in cluster_io
2781 */
2782 goto wait_for_dwrites;
2783 }
2784
2785 task_update_logical_writes(current_task(), (io_req_size & ~PAGE_MASK), TASK_WRITE_IMMEDIATE, vp);
2786 while ((io_req_size >= PAGE_SIZE || io_req_size >= min_io_size) && uio->uio_offset < newEOF && retval == 0) {
2787 int throttle_type;
2788
2789 if ((throttle_type = cluster_is_throttled(vp))) {
2790 uint32_t max_throttle_size = calculate_max_throttle_size(vp);
2791
2792 /*
2793 * we're in the throttle window, at the very least
2794 * we want to limit the size of the I/O we're about
2795 * to issue
2796 */
2797 if ((flags & IO_RETURN_ON_THROTTLE) && throttle_type == THROTTLE_NOW) {
2798 /*
2799 * we're in the throttle window and at least 1 I/O
2800 * has already been issued by a throttleable thread
2801 * in this window, so return with EAGAIN to indicate
2802 * to the FS issuing the cluster_write call that it
2803 * should now throttle after dropping any locks
2804 */
2805 throttle_info_update_by_mount(vp->v_mount);
2806
2807 io_throttled = TRUE;
2808 goto wait_for_dwrites;
2809 }
2810 max_vector_size = max_throttle_size;
2811 max_io_size = max_throttle_size;
2812 } else {
2813 max_vector_size = MAX_VECTOR_UPL_SIZE;
2814 max_io_size = max_upl_size;
2815 }
2816
2817 if (first_IO) {
2818 cluster_syncup(vp, newEOF, callback, callback_arg, callback ? PUSH_SYNC : 0);
2819 first_IO = 0;
2820 }
2821 io_size = io_req_size & ~io_align_mask;
2822 iov_base = uio_curriovbase(uio);
2823
2824 if (io_size > max_io_size) {
2825 io_size = max_io_size;
2826 }
2827
2828 if (useVectorUPL && (iov_base & PAGE_MASK)) {
2829 /*
2830 * We have an iov_base that's not page-aligned.
2831 * Issue all I/O's that have been collected within
2832 * this Vectored UPL.
2833 */
2834 if (vector_upl_index) {
2835 retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
2836 reset_vector_run_state();
2837 }
2838
2839 /*
2840 * After this point, if we are using the Vector UPL path and the base is
2841 * not page-aligned then the UPL with that base will be the first in the vector UPL.
2842 */
2843 }
2844
2845 upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK);
2846 upl_needed_size = (upl_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
2847
2848 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_START,
2849 (int)upl_offset, upl_needed_size, (int)iov_base, io_size, 0);
2850
2851 vm_map_t map = UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ? current_map() : kernel_map;
2852 for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) {
2853 pages_in_pl = 0;
2854 upl_size = (upl_size_t)upl_needed_size;
2855 upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
2856 UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
2857
2858 kret = vm_map_get_upl(map,
2859 (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
2860 &upl_size,
2861 &upl,
2862 NULL,
2863 &pages_in_pl,
2864 &upl_flags,
2865 VM_KERN_MEMORY_FILE,
2866 force_data_sync);
2867
2868 if (kret != KERN_SUCCESS) {
2869 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
2870 0, 0, 0, kret, 0);
2871 /*
2872 * failed to get pagelist
2873 *
2874 * we may have already spun some portion of this request
2875 * off as async requests... we need to wait for the I/O
2876 * to complete before returning
2877 */
2878 goto wait_for_dwrites;
2879 }
2880 pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
2881 pages_in_pl = upl_size / PAGE_SIZE;
2882
2883 for (i = 0; i < pages_in_pl; i++) {
2884 if (!upl_valid_page(pl, i)) {
2885 break;
2886 }
2887 }
2888 if (i == pages_in_pl) {
2889 break;
2890 }
2891
2892 /*
2893 * didn't get all the pages back that we
2894 * needed... release this upl and try again
2895 */
2896 ubc_upl_abort(upl, 0);
2897 }
2898 if (force_data_sync >= 3) {
2899 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
2900 i, pages_in_pl, upl_size, kret, 0);
2901 /*
2902 * for some reason, we couldn't acquire a hold on all
2903 * the pages needed in the user's address space
2904 *
2905 * we may have already spun some portion of this request
2906 * off as async requests... we need to wait for the I/O
2907 * to complete before returning
2908 */
2909 goto wait_for_dwrites;
2910 }
2911
2912 /*
2913 * Consider the possibility that upl_size wasn't satisfied.
2914 */
2915 if (upl_size < upl_needed_size) {
2916 if (upl_size && upl_offset == 0) {
2917 io_size = upl_size;
2918 } else {
2919 io_size = 0;
2920 }
2921 }
2922 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
2923 (int)upl_offset, upl_size, (int)iov_base, io_size, 0);
2924
2925 if (io_size == 0) {
2926 ubc_upl_abort(upl, 0);
2927 /*
2928 * we may have already spun some portion of this request
2929 * off as async requests... we need to wait for the I/O
2930 * to complete before returning
2931 */
2932 goto wait_for_dwrites;
2933 }
2934
2935 if (useVectorUPL) {
2936 vm_offset_t end_off = ((iov_base + io_size) & PAGE_MASK);
2937 if (end_off) {
2938 issueVectorUPL = 1;
2939 }
2940 /*
2941 * After this point, if we are using a vector UPL, then
2942 * either all the UPL elements end on a page boundary OR
2943 * this UPL is the last element because it does not end
2944 * on a page boundary.
2945 */
2946 }
2947
2948 /*
2949 * we want push out these writes asynchronously so that we can overlap
2950 * the preparation of the next I/O
2951 * if there are already too many outstanding writes
2952 * wait until some complete before issuing the next
2953 */
2954 if (vp->v_mount->mnt_minsaturationbytecount) {
2955 bytes_outstanding_limit = vp->v_mount->mnt_minsaturationbytecount;
2956 } else {
2957 if (__improbable(os_mul_overflow(max_upl_size, IO_SCALE(vp, 2),
2958 &bytes_outstanding_limit) ||
2959 (bytes_outstanding_limit > overlapping_write_max))) {
2960 bytes_outstanding_limit = overlapping_write_max;
2961 }
2962 }
2963
2964 cluster_iostate_wait(&iostate, bytes_outstanding_limit, "cluster_write_direct");
2965
2966 if (iostate.io_error) {
2967 /*
2968 * one of the earlier writes we issued ran into a hard error
2969 * don't issue any more writes, cleanup the UPL
2970 * that was just created but not used, then
2971 * go wait for all writes that are part of this stream
2972 * to complete before returning the error to the caller
2973 */
2974 ubc_upl_abort(upl, 0);
2975
2976 goto wait_for_dwrites;
2977 }
2978
2979 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_START,
2980 (int)upl_offset, (int)uio->uio_offset, io_size, io_flag, 0);
2981
2982 if (!useVectorUPL) {
2983 retval = cluster_io(vp, upl, upl_offset, uio->uio_offset,
2984 io_size, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
2985 } else {
2986 if (!vector_upl_index) {
2987 vector_upl = vector_upl_create(upl_offset, uio->uio_iovcnt);
2988 v_upl_uio_offset = uio->uio_offset;
2989 vector_upl_offset = upl_offset;
2990 }
2991
2992 vector_upl_set_subupl(vector_upl, upl, upl_size);
2993 vector_upl_set_iostate(vector_upl, upl, vector_upl_size, upl_size);
2994 vector_upl_index++;
2995 vector_upl_iosize += io_size;
2996 vector_upl_size += upl_size;
2997
2998 if (issueVectorUPL || vector_upl_index == vector_upl_max_upls(vector_upl) || vector_upl_size >= max_vector_size) {
2999 retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
3000 reset_vector_run_state();
3001 }
3002 }
3003
3004 /*
3005 * update the uio structure to
3006 * reflect the I/O that we just issued
3007 */
3008 uio_update(uio, (user_size_t)io_size);
3009
3010 /*
3011 * in case we end up calling through to cluster_write_copy to finish
3012 * the tail of this request, we need to update the oldEOF so that we
3013 * don't zero-fill the head of a page if we've successfully written
3014 * data to that area... 'cluster_write_copy' will zero-fill the head of a
3015 * page that is beyond the oldEOF if the write is unaligned... we only
3016 * want that to happen for the very first page of the cluster_write,
3017 * NOT the first page of each vector making up a multi-vector write.
3018 */
3019 if (uio->uio_offset > oldEOF) {
3020 oldEOF = uio->uio_offset;
3021 }
3022
3023 io_req_size -= io_size;
3024
3025 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_END,
3026 (int)upl_offset, (int)uio->uio_offset, io_req_size, retval, 0);
3027 } /* end while */
3028
3029 if (retval == 0 && iostate.io_error == 0 && io_req_size == 0) {
3030 retval = cluster_io_type(uio, write_type, write_length, min_io_size);
3031
3032 if (retval == 0 && *write_type == IO_DIRECT) {
3033 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_NONE,
3034 (int)uio->uio_offset, *write_length, (int)newEOF, 0, 0);
3035
3036 goto next_dwrite;
3037 }
3038 }
3039
3040 wait_for_dwrites:
3041
3042 if (retval == 0 && iostate.io_error == 0 && useVectorUPL && vector_upl_index) {
3043 retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
3044 reset_vector_run_state();
3045 }
3046 /*
3047 * make sure all async writes issued as part of this stream
3048 * have completed before we return
3049 */
3050 cluster_iostate_wait(&iostate, 0, "cluster_write_direct");
3051
3052 if (iostate.io_error) {
3053 retval = iostate.io_error;
3054 }
3055
3056 lck_mtx_destroy(&iostate.io_mtxp, &cl_mtx_grp);
3057
3058 if (io_throttled == TRUE && retval == 0) {
3059 retval = EAGAIN;
3060 }
3061
3062 if (io_req_size && retval == 0) {
3063 /*
3064 * we couldn't handle the tail of this request in DIRECT mode
3065 * so fire it through the copy path
3066 *
3067 * note that flags will never have IO_HEADZEROFILL or IO_TAILZEROFILL set
3068 * so we can just pass 0 in for the headOff and tailOff
3069 */
3070 if (uio->uio_offset > oldEOF) {
3071 oldEOF = uio->uio_offset;
3072 }
3073
3074 retval = cluster_write_copy(vp, uio, io_req_size, oldEOF, newEOF, (off_t)0, (off_t)0, flags, callback, callback_arg);
3075
3076 *write_type = IO_UNKNOWN;
3077 }
3078 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_END,
3079 (int)uio->uio_offset, io_req_size, retval, 4, 0);
3080
3081 return retval;
3082 }
3083
3084
/*
 * cluster_write_contig
 *
 * Direct-write path for a source buffer that is physically contiguous
 * (the IO_CONTIG case).  The buffer is wired via vm_map_get_upl() and the
 * data is pushed with CL_DEV_MEMORY | CL_ASYNC cluster_io requests; up to
 * MAX_VECTS contiguous segments are chained (via 'next_cwrite') before
 * control returns to the caller.
 *
 * Any head/tail fragment that is not devblocksize-aligned is handled
 * synchronously through cluster_align_phys_io().
 *
 * Returns 0 on success or an errno-style error.  On every exit path we
 * first wait for all async writes already issued, then release our hold
 * on each UPL acquired.
 */
static int
cluster_write_contig(vnode_t vp, struct uio *uio, off_t newEOF, int *write_type, u_int32_t *write_length,
    int (*callback)(buf_t, void *), void *callback_arg, int bflag)
{
	upl_page_info_t *pl;
	addr64_t src_paddr = 0;
	upl_t upl[MAX_VECTS];           /* one wired UPL per contiguous segment */
	vm_offset_t upl_offset;
	u_int32_t tail_size = 0;        /* sub-devblocksize residue written after the main loop */
	u_int32_t io_size;
	u_int32_t xsize;
	upl_size_t upl_size;
	vm_size_t upl_needed_size;
	mach_msg_type_number_t pages_in_pl;
	upl_control_flags_t upl_flags;
	kern_return_t kret;
	struct clios iostate;           /* tracks issued vs completed async I/O */
	int error = 0;
	int cur_upl = 0;                /* index of the UPL for the segment in flight */
	int num_upl = 0;                /* how many entries of upl[] must be released */
	int n;
	user_addr_t iov_base;
	u_int32_t devblocksize;
	u_int32_t mem_alignment_mask;

	/*
	 * When we enter this routine, we know
	 *  -- the io_req_size will not exceed iov_len
	 *  -- the target address is physically contiguous
	 */
	cluster_syncup(vp, newEOF, callback, callback_arg, callback ? PUSH_SYNC : 0);

	devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize;
	mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask;

	iostate.io_completed = 0;
	iostate.io_issued = 0;
	iostate.io_error = 0;
	iostate.io_wanted = 0;

	lck_mtx_init(&iostate.io_mtxp, &cl_mtx_grp, LCK_ATTR_NULL);

next_cwrite:
	io_size = *write_length;

	iov_base = uio_curriovbase(uio);

	upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK);
	upl_needed_size = upl_offset + io_size;

	pages_in_pl = 0;
	upl_size = (upl_size_t)upl_needed_size;
	upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
	    UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;

	/* wire down the source pages from the user or kernel map as appropriate */
	vm_map_t map = UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ? current_map() : kernel_map;
	kret = vm_map_get_upl(map,
	    vm_map_trunc_page(iov_base, vm_map_page_mask(map)),
	    &upl_size, &upl[cur_upl], NULL, &pages_in_pl, &upl_flags, VM_KERN_MEMORY_FILE, 0);

	if (kret != KERN_SUCCESS) {
		/*
		 * failed to get pagelist
		 */
		error = EINVAL;
		goto wait_for_cwrites;
	}
	num_upl++;

	/*
	 * Consider the possibility that upl_size wasn't satisfied.
	 */
	if (upl_size < upl_needed_size) {
		/*
		 * This is a failure in the physical memory case.
		 */
		error = EINVAL;
		goto wait_for_cwrites;
	}
	pl = ubc_upl_pageinfo(upl[cur_upl]);

	/* physical source address: first page of the UPL plus the intra-page offset */
	src_paddr = ((addr64_t)upl_phys_page(pl, 0) << PAGE_SHIFT) + (addr64_t)upl_offset;

	/*
	 * peel off any head fragment that doesn't start (or fit) on a
	 * devblocksize boundary... each piece goes through the synchronous
	 * read-modify-write helper
	 */
	while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {
		u_int32_t head_size;

		head_size = devblocksize - (u_int32_t)(uio->uio_offset & (devblocksize - 1));

		if (head_size > io_size) {
			head_size = io_size;
		}

		error = cluster_align_phys_io(vp, uio, src_paddr, head_size, 0, callback, callback_arg);

		if (error) {
			goto wait_for_cwrites;
		}

		upl_offset += head_size;
		src_paddr += head_size;
		io_size -= head_size;

		iov_base += head_size;
	}
	if ((u_int32_t)iov_base & mem_alignment_mask) {
		/*
		 * request doesn't set up on a memory boundary
		 * the underlying DMA engine can handle...
		 * return an error instead of going through
		 * the slow copy path since the intent of this
		 * path is direct I/O from device memory
		 */
		error = EINVAL;
		goto wait_for_cwrites;
	}

	/* defer the unaligned tail until after the async writes have drained */
	tail_size = io_size & (devblocksize - 1);
	io_size -= tail_size;

	while (io_size && error == 0) {
		if (io_size > MAX_IO_CONTIG_SIZE) {
			xsize = MAX_IO_CONTIG_SIZE;
		} else {
			xsize = io_size;
		}
		/*
		 * request asynchronously so that we can overlap
		 * the preparation of the next I/O... we'll do
		 * the commit after all the I/O has completed
		 * since its all issued against the same UPL
		 * if there are already too many outstanding writes
		 * wait until some have completed before issuing the next
		 */
		cluster_iostate_wait(&iostate, MAX_IO_CONTIG_SIZE * IO_SCALE(vp, 2), "cluster_write_contig");

		if (iostate.io_error) {
			/*
			 * one of the earlier writes we issued ran into a hard error
			 * don't issue any more writes...
			 * go wait for all writes that are part of this stream
			 * to complete before returning the error to the caller
			 */
			goto wait_for_cwrites;
		}
		/*
		 * issue an asynchronous write to cluster_io
		 */
		error = cluster_io(vp, upl[cur_upl], upl_offset, uio->uio_offset,
		    xsize, CL_DEV_MEMORY | CL_ASYNC | bflag, (buf_t)NULL, (struct clios *)&iostate, callback, callback_arg);

		if (error == 0) {
			/*
			 * The cluster_io write completed successfully,
			 * update the uio structure
			 */
			uio_update(uio, (user_size_t)xsize);

			upl_offset += xsize;
			src_paddr += xsize;
			io_size -= xsize;
		}
	}
	if (error == 0 && iostate.io_error == 0 && tail_size == 0 && num_upl < MAX_VECTS) {
		/* probe the next iovec... if it is another contiguous segment, chain it */
		error = cluster_io_type(uio, write_type, write_length, 0);

		if (error == 0 && *write_type == IO_CONTIG) {
			cur_upl++;
			goto next_cwrite;
		}
	} else {
		*write_type = IO_UNKNOWN;
	}

wait_for_cwrites:
	/*
	 * make sure all async writes that are part of this stream
	 * have completed before we proceed
	 */
	cluster_iostate_wait(&iostate, 0, "cluster_write_contig");

	if (iostate.io_error) {
		error = iostate.io_error;
	}

	lck_mtx_destroy(&iostate.io_mtxp, &cl_mtx_grp);

	if (error == 0 && tail_size) {
		/* flush the deferred sub-devblocksize tail synchronously */
		error = cluster_align_phys_io(vp, uio, src_paddr, tail_size, 0, callback, callback_arg);
	}

	for (n = 0; n < num_upl; n++) {
		/*
		 * just release our hold on each physically contiguous
		 * region without changing any state
		 */
		ubc_upl_abort(upl[n], 0);
	}

	return error;
}
3285
3286
3287 /*
3288 * need to avoid a race between an msync of a range of pages dirtied via mmap
3289 * vs a filesystem such as HFS deciding to write a 'hole' to disk via cluster_write's
3290 * zerofill mechanism before it has seen the VNOP_PAGEOUTs for the pages being msync'd
3291 *
3292 * we should never force-zero-fill pages that are already valid in the cache...
3293 * the entire page contains valid data (either from disk, zero-filled or dirtied
3294 * via an mmap) so we can only do damage by trying to zero-fill
3295 *
3296 */
3297 static int
cluster_zero_range(upl_t upl,upl_page_info_t * pl,int flags,int io_offset,off_t zero_off,off_t upl_f_offset,int bytes_to_zero)3298 cluster_zero_range(upl_t upl, upl_page_info_t *pl, int flags, int io_offset, off_t zero_off, off_t upl_f_offset, int bytes_to_zero)
3299 {
3300 int zero_pg_index;
3301 boolean_t need_cluster_zero = TRUE;
3302
3303 if ((flags & (IO_NOZEROVALID | IO_NOZERODIRTY))) {
3304 bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off & PAGE_MASK_64));
3305 zero_pg_index = (int)((zero_off - upl_f_offset) / PAGE_SIZE_64);
3306
3307 if (upl_valid_page(pl, zero_pg_index)) {
3308 /*
3309 * never force zero valid pages - dirty or clean
3310 * we'll leave these in the UPL for cluster_write_copy to deal with
3311 */
3312 need_cluster_zero = FALSE;
3313 }
3314 }
3315 if (need_cluster_zero == TRUE) {
3316 cluster_zero(upl, io_offset, bytes_to_zero, NULL);
3317 }
3318
3319 return bytes_to_zero;
3320 }
3321
3322
3323 void
cluster_update_state(vnode_t vp,vm_object_offset_t s_offset,vm_object_offset_t e_offset,boolean_t vm_initiated)3324 cluster_update_state(vnode_t vp, vm_object_offset_t s_offset, vm_object_offset_t e_offset, boolean_t vm_initiated)
3325 {
3326 struct cl_extent cl;
3327 boolean_t first_pass = TRUE;
3328
3329 assert(s_offset < e_offset);
3330 assert((s_offset & PAGE_MASK_64) == 0);
3331 assert((e_offset & PAGE_MASK_64) == 0);
3332
3333 cl.b_addr = (daddr64_t)(s_offset / PAGE_SIZE_64);
3334 cl.e_addr = (daddr64_t)(e_offset / PAGE_SIZE_64);
3335
3336 cluster_update_state_internal(vp, &cl, 0, TRUE, &first_pass, s_offset, (int)(e_offset - s_offset),
3337 vp->v_un.vu_ubcinfo->ui_size, NULL, NULL, vm_initiated);
3338 }
3339
3340
/*
 * cluster_update_state_internal
 *
 * Record the dirtied page range 'cl' in the vnode's write-behind state:
 * merge it into an existing delayed-write cluster when possible, start a
 * new cluster when there is room, or switch to the sparse cluster
 * mechanism when the fixed cluster slots are exhausted and none can be
 * pushed.  When defer_writes is FALSE this routine may also push existing
 * clusters to disk, both to make room and to drain a long sequential
 * writer (tracked via cl_seq_written / cl_last_write).
 *
 * 'cl' may be trimmed in place (b_addr/e_addr adjusted) as portions of
 * the write are absorbed into existing clusters.
 *
 * Acquires the writebehind lock via cluster_get_wbp(CLW_RETURNLOCKED)
 * and releases it on every return path.
 */
static void
cluster_update_state_internal(vnode_t vp, struct cl_extent *cl, int flags, boolean_t defer_writes,
    boolean_t *first_pass, off_t write_off, int write_cnt, off_t newEOF,
    int (*callback)(buf_t, void *), void *callback_arg, boolean_t vm_initiated)
{
	struct cl_writebehind *wbp;
	int cl_index;
	int ret_cluster_try_push;
	u_int max_cluster_pgcount;  /* max pages a single cluster may span */


	max_cluster_pgcount = MAX_CLUSTER_SIZE(vp) / PAGE_SIZE;

	/*
	 * take the lock to protect our accesses
	 * of the writebehind and sparse cluster state
	 */
	wbp = cluster_get_wbp(vp, CLW_ALLOCATE | CLW_RETURNLOCKED);

	if (wbp->cl_scmap) {
		if (!(flags & IO_NOCACHE)) {
			/*
			 * we've fallen into the sparse
			 * cluster method of delaying dirty pages
			 */
			sparse_cluster_add(wbp, &(wbp->cl_scmap), vp, cl, newEOF, callback, callback_arg, vm_initiated);

			lck_mtx_unlock(&wbp->cl_lockw);
			return;
		}
		/*
		 * must have done cached writes that fell into
		 * the sparse cluster mechanism... we've switched
		 * to uncached writes on the file, so go ahead
		 * and push whatever's in the sparse map
		 * and switch back to normal clustering
		 */
		wbp->cl_number = 0;

		sparse_cluster_push(wbp, &(wbp->cl_scmap), vp, newEOF, PUSH_ALL, 0, callback, callback_arg, vm_initiated);
		/*
		 * no clusters of either type present at this point
		 * so just go directly to start_new_cluster since
		 * we know we need to delay this I/O since we've
		 * already released the pages back into the cache
		 * to avoid the deadlock with sparse_cluster_push
		 */
		goto start_new_cluster;
	}
	if (*first_pass == TRUE) {
		/*
		 * track sequential-write progress: a write that picks up
		 * exactly where the last one ended extends the running count,
		 * anything else restarts it
		 */
		if (write_off == wbp->cl_last_write) {
			wbp->cl_seq_written += write_cnt;
		} else {
			wbp->cl_seq_written = write_cnt;
		}

		wbp->cl_last_write = write_off + write_cnt;

		*first_pass = FALSE;
	}
	if (wbp->cl_number == 0) {
		/*
		 * no clusters currently present
		 */
		goto start_new_cluster;
	}

	for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
		/*
		 * check each cluster that we currently hold
		 * try to merge some or all of this write into
		 * one or more of the existing clusters... if
		 * any portion of the write remains, start a
		 * new cluster
		 */
		if (cl->b_addr >= wbp->cl_clusters[cl_index].b_addr) {
			/*
			 * the current write starts at or after the current cluster
			 */
			if (cl->e_addr <= (wbp->cl_clusters[cl_index].b_addr + max_cluster_pgcount)) {
				/*
				 * we have a write that fits entirely
				 * within the existing cluster limits
				 */
				if (cl->e_addr > wbp->cl_clusters[cl_index].e_addr) {
					/*
					 * update our idea of where the cluster ends
					 */
					wbp->cl_clusters[cl_index].e_addr = cl->e_addr;
				}
				break;
			}
			if (cl->b_addr < (wbp->cl_clusters[cl_index].b_addr + max_cluster_pgcount)) {
				/*
				 * we have a write that starts in the middle of the current cluster
				 * but extends beyond the cluster's limit... we know this because
				 * of the previous checks
				 * we'll extend the current cluster to the max
				 * and update the b_addr for the current write to reflect that
				 * the head of it was absorbed into this cluster...
				 * note that we'll always have a leftover tail in this case since
				 * full absorbtion would have occurred in the clause above
				 */
				wbp->cl_clusters[cl_index].e_addr = wbp->cl_clusters[cl_index].b_addr + max_cluster_pgcount;

				cl->b_addr = wbp->cl_clusters[cl_index].e_addr;
			}
			/*
			 * we come here for the case where the current write starts
			 * beyond the limit of the existing cluster or we have a leftover
			 * tail after a partial absorbtion
			 *
			 * in either case, we'll check the remaining clusters before
			 * starting a new one
			 */
		} else {
			/*
			 * the current write starts in front of the cluster we're currently considering
			 */
			if ((wbp->cl_clusters[cl_index].e_addr - cl->b_addr) <= max_cluster_pgcount) {
				/*
				 * we can just merge the new request into
				 * this cluster and leave it in the cache
				 * since the resulting cluster is still
				 * less than the maximum allowable size
				 */
				wbp->cl_clusters[cl_index].b_addr = cl->b_addr;

				if (cl->e_addr > wbp->cl_clusters[cl_index].e_addr) {
					/*
					 * the current write completely
					 * envelops the existing cluster and since
					 * each write is limited to at most max_cluster_pgcount pages
					 * we can just use the start and last blocknos of the write
					 * to generate the cluster limits
					 */
					wbp->cl_clusters[cl_index].e_addr = cl->e_addr;
				}
				break;
			}
			/*
			 * if we were to combine this write with the current cluster
			 * we would exceed the cluster size limit.... so,
			 * let's see if there's any overlap of the new I/O with
			 * the cluster we're currently considering... in fact, we'll
			 * stretch the cluster out to it's full limit and see if we
			 * get an intersection with the current write
			 *
			 */
			if (cl->e_addr > wbp->cl_clusters[cl_index].e_addr - max_cluster_pgcount) {
				/*
				 * the current write extends into the proposed cluster
				 * clip the length of the current write after first combining it's
				 * tail with the newly shaped cluster
				 */
				wbp->cl_clusters[cl_index].b_addr = wbp->cl_clusters[cl_index].e_addr - max_cluster_pgcount;

				cl->e_addr = wbp->cl_clusters[cl_index].b_addr;
			}
			/*
			 * if we get here, there was no way to merge
			 * any portion of this write with this cluster
			 * or we could only merge part of it which
			 * will leave a tail...
			 * we'll check the remaining clusters before starting a new one
			 */
		}
	}
	if (cl_index < wbp->cl_number) {
		/*
		 * we found an existing cluster(s) that we
		 * could entirely merge this I/O into
		 */
		goto delay_io;
	}

	if (defer_writes == FALSE &&
	    wbp->cl_number == MAX_CLUSTERS &&
	    wbp->cl_seq_written >= (MAX_CLUSTERS * (max_cluster_pgcount * PAGE_SIZE))) {
		uint32_t n;

		/*
		 * a long sequential writer has filled all the cluster slots...
		 * pick how many clusters to push based on the mount's minimum
		 * saturation byte count, or the SSD/HDD write-behind default
		 */
		if (vp->v_mount->mnt_minsaturationbytecount) {
			n = vp->v_mount->mnt_minsaturationbytecount / MAX_CLUSTER_SIZE(vp);

			if (n > MAX_CLUSTERS) {
				n = MAX_CLUSTERS;
			}
		} else {
			n = 0;
		}

		if (n == 0) {
			if (disk_conditioner_mount_is_ssd(vp->v_mount)) {
				n = WRITE_BEHIND_SSD;
			} else {
				n = WRITE_BEHIND;
			}
		}
		while (n--) {
			cluster_try_push(wbp, vp, newEOF, 0, 0, callback, callback_arg, NULL, vm_initiated);
		}
	}
	if (wbp->cl_number < MAX_CLUSTERS) {
		/*
		 * we didn't find an existing cluster to
		 * merge into, but there's room to start
		 * a new one
		 */
		goto start_new_cluster;
	}
	/*
	 * no exisitng cluster to merge with and no
	 * room to start a new one... we'll try
	 * pushing one of the existing ones... if none of
	 * them are able to be pushed, we'll switch
	 * to the sparse cluster mechanism
	 * cluster_try_push updates cl_number to the
	 * number of remaining clusters... and
	 * returns the number of currently unused clusters
	 */
	ret_cluster_try_push = 0;

	/*
	 * if writes are not deferred, call cluster push immediately
	 */
	if (defer_writes == FALSE) {
		ret_cluster_try_push = cluster_try_push(wbp, vp, newEOF, (flags & IO_NOCACHE) ? 0 : PUSH_DELAY, 0, callback, callback_arg, NULL, vm_initiated);
	}
	/*
	 * execute following regardless of writes being deferred or not
	 */
	if (ret_cluster_try_push == 0) {
		/*
		 * no more room in the normal cluster mechanism
		 * so let's switch to the more expansive but expensive
		 * sparse mechanism....
		 */
		sparse_cluster_switch(wbp, vp, newEOF, callback, callback_arg, vm_initiated);
		sparse_cluster_add(wbp, &(wbp->cl_scmap), vp, cl, newEOF, callback, callback_arg, vm_initiated);

		lck_mtx_unlock(&wbp->cl_lockw);
		return;
	}
start_new_cluster:
	wbp->cl_clusters[wbp->cl_number].b_addr = cl->b_addr;
	wbp->cl_clusters[wbp->cl_number].e_addr = cl->e_addr;

	wbp->cl_clusters[wbp->cl_number].io_flags = 0;

	if (flags & IO_NOCACHE) {
		wbp->cl_clusters[wbp->cl_number].io_flags |= CLW_IONOCACHE;
	}

	if (flags & IO_PASSIVE) {
		wbp->cl_clusters[wbp->cl_number].io_flags |= CLW_IOPASSIVE;
	}

	wbp->cl_number++;
delay_io:
	lck_mtx_unlock(&wbp->cl_lockw);
	return;
}
3603
3604
3605 static int
cluster_write_copy(vnode_t vp,struct uio * uio,u_int32_t io_req_size,off_t oldEOF,off_t newEOF,off_t headOff,off_t tailOff,int flags,int (* callback)(buf_t,void *),void * callback_arg)3606 cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t oldEOF, off_t newEOF, off_t headOff,
3607 off_t tailOff, int flags, int (*callback)(buf_t, void *), void *callback_arg)
3608 {
3609 upl_page_info_t *pl;
3610 upl_t upl;
3611 vm_offset_t upl_offset = 0;
3612 vm_size_t upl_size;
3613 off_t upl_f_offset;
3614 int pages_in_upl;
3615 int start_offset;
3616 int xfer_resid;
3617 int io_size;
3618 int io_offset;
3619 int bytes_to_zero;
3620 int bytes_to_move;
3621 kern_return_t kret;
3622 int retval = 0;
3623 int io_resid;
3624 long long total_size;
3625 long long zero_cnt;
3626 off_t zero_off;
3627 long long zero_cnt1;
3628 off_t zero_off1;
3629 off_t write_off = 0;
3630 int write_cnt = 0;
3631 boolean_t first_pass = FALSE;
3632 struct cl_extent cl;
3633 int bflag;
3634 u_int max_io_size;
3635
3636 if (uio) {
3637 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
3638 (int)uio->uio_offset, io_req_size, (int)oldEOF, (int)newEOF, 0);
3639
3640 io_resid = io_req_size;
3641 } else {
3642 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
3643 0, 0, (int)oldEOF, (int)newEOF, 0);
3644
3645 io_resid = 0;
3646 }
3647 if (flags & IO_PASSIVE) {
3648 bflag = CL_PASSIVE;
3649 } else {
3650 bflag = 0;
3651 }
3652 if (flags & IO_NOCACHE) {
3653 bflag |= CL_NOCACHE;
3654 }
3655
3656 if (flags & IO_SKIP_ENCRYPTION) {
3657 bflag |= CL_ENCRYPTED;
3658 }
3659
3660 zero_cnt = 0;
3661 zero_cnt1 = 0;
3662 zero_off = 0;
3663 zero_off1 = 0;
3664
3665 max_io_size = cluster_max_io_size(vp->v_mount, CL_WRITE);
3666
3667 if (flags & IO_HEADZEROFILL) {
3668 /*
3669 * some filesystems (HFS is one) don't support unallocated holes within a file...
3670 * so we zero fill the intervening space between the old EOF and the offset
3671 * where the next chunk of real data begins.... ftruncate will also use this
3672 * routine to zero fill to the new EOF when growing a file... in this case, the
3673 * uio structure will not be provided
3674 */
3675 if (uio) {
3676 if (headOff < uio->uio_offset) {
3677 zero_cnt = uio->uio_offset - headOff;
3678 zero_off = headOff;
3679 }
3680 } else if (headOff < newEOF) {
3681 zero_cnt = newEOF - headOff;
3682 zero_off = headOff;
3683 }
3684 } else {
3685 if (uio && uio->uio_offset > oldEOF) {
3686 zero_off = uio->uio_offset & ~PAGE_MASK_64;
3687
3688 if (zero_off >= oldEOF) {
3689 zero_cnt = uio->uio_offset - zero_off;
3690
3691 flags |= IO_HEADZEROFILL;
3692 }
3693 }
3694 }
3695 if (flags & IO_TAILZEROFILL) {
3696 if (uio) {
3697 zero_off1 = uio->uio_offset + io_req_size;
3698
3699 if (zero_off1 < tailOff) {
3700 zero_cnt1 = tailOff - zero_off1;
3701 }
3702 }
3703 } else {
3704 if (uio && newEOF > oldEOF) {
3705 zero_off1 = uio->uio_offset + io_req_size;
3706
3707 if (zero_off1 == newEOF && (zero_off1 & PAGE_MASK_64)) {
3708 zero_cnt1 = PAGE_SIZE_64 - (zero_off1 & PAGE_MASK_64);
3709
3710 flags |= IO_TAILZEROFILL;
3711 }
3712 }
3713 }
3714 if (zero_cnt == 0 && uio == (struct uio *) 0) {
3715 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END,
3716 retval, 0, 0, 0, 0);
3717 return 0;
3718 }
3719 if (uio) {
3720 write_off = uio->uio_offset;
3721 write_cnt = (int)uio_resid(uio);
3722 /*
3723 * delay updating the sequential write info
3724 * in the control block until we've obtained
3725 * the lock for it
3726 */
3727 first_pass = TRUE;
3728 }
3729 while ((total_size = (io_resid + zero_cnt + zero_cnt1)) && retval == 0) {
3730 /*
3731 * for this iteration of the loop, figure out where our starting point is
3732 */
3733 if (zero_cnt) {
3734 start_offset = (int)(zero_off & PAGE_MASK_64);
3735 upl_f_offset = zero_off - start_offset;
3736 } else if (io_resid) {
3737 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
3738 upl_f_offset = uio->uio_offset - start_offset;
3739 } else {
3740 start_offset = (int)(zero_off1 & PAGE_MASK_64);
3741 upl_f_offset = zero_off1 - start_offset;
3742 }
3743 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 46)) | DBG_FUNC_NONE,
3744 (int)zero_off, (int)zero_cnt, (int)zero_off1, (int)zero_cnt1, 0);
3745
3746 if (total_size > max_io_size) {
3747 total_size = max_io_size;
3748 }
3749
3750 cl.b_addr = (daddr64_t)(upl_f_offset / PAGE_SIZE_64);
3751
3752 if (uio && ((flags & (IO_SYNC | IO_HEADZEROFILL | IO_TAILZEROFILL)) == 0)) {
3753 /*
3754 * assumption... total_size <= io_resid
3755 * because IO_HEADZEROFILL and IO_TAILZEROFILL not set
3756 */
3757 if ((start_offset + total_size) > max_io_size) {
3758 total_size = max_io_size - start_offset;
3759 }
3760 xfer_resid = (int)total_size;
3761
3762 retval = cluster_copy_ubc_data_internal(vp, uio, &xfer_resid, 1, 1);
3763
3764 if (retval) {
3765 break;
3766 }
3767
3768 io_resid -= (total_size - xfer_resid);
3769 total_size = xfer_resid;
3770 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
3771 upl_f_offset = uio->uio_offset - start_offset;
3772
3773 if (total_size == 0) {
3774 if (start_offset) {
3775 /*
3776 * the write did not finish on a page boundary
3777 * which will leave upl_f_offset pointing to the
3778 * beginning of the last page written instead of
3779 * the page beyond it... bump it in this case
3780 * so that the cluster code records the last page
3781 * written as dirty
3782 */
3783 upl_f_offset += PAGE_SIZE_64;
3784 }
3785 upl_size = 0;
3786
3787 goto check_cluster;
3788 }
3789 }
3790 /*
3791 * compute the size of the upl needed to encompass
3792 * the requested write... limit each call to cluster_io
3793 * to the maximum UPL size... cluster_io will clip if
3794 * this exceeds the maximum io_size for the device,
3795 * make sure to account for
3796 * a starting offset that's not page aligned
3797 */
3798 upl_size = (start_offset + total_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
3799
3800 if (upl_size > max_io_size) {
3801 upl_size = max_io_size;
3802 }
3803
3804 pages_in_upl = (int)(upl_size / PAGE_SIZE);
3805 io_size = (int)(upl_size - start_offset);
3806
3807 if ((long long)io_size > total_size) {
3808 io_size = (int)total_size;
3809 }
3810
3811 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_START, upl_size, io_size, total_size, 0, 0);
3812
3813
3814 /*
3815 * Gather the pages from the buffer cache.
3816 * The UPL_WILL_MODIFY flag lets the UPL subsystem know
3817 * that we intend to modify these pages.
3818 */
3819 kret = ubc_create_upl_kernel(vp,
3820 upl_f_offset,
3821 (int)upl_size,
3822 &upl,
3823 &pl,
3824 UPL_SET_LITE | ((uio != NULL && (uio->uio_flags & UIO_FLAGS_IS_COMPRESSED_FILE)) ? 0 : UPL_WILL_MODIFY),
3825 VM_KERN_MEMORY_FILE);
3826 if (kret != KERN_SUCCESS) {
3827 panic("cluster_write_copy: failed to get pagelist");
3828 }
3829
3830 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_END,
3831 upl, (int)upl_f_offset, start_offset, 0, 0);
3832
3833 if (start_offset && upl_f_offset < oldEOF && !upl_valid_page(pl, 0)) {
3834 int read_size;
3835
3836 /*
3837 * we're starting in the middle of the first page of the upl
3838 * and the page isn't currently valid, so we're going to have
3839 * to read it in first... this is a synchronous operation
3840 */
3841 read_size = PAGE_SIZE;
3842
3843 if ((upl_f_offset + read_size) > oldEOF) {
3844 read_size = (int)(oldEOF - upl_f_offset);
3845 }
3846
3847 retval = cluster_io(vp, upl, 0, upl_f_offset, read_size,
3848 CL_READ | bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
3849 if (retval) {
3850 /*
3851 * we had an error during the read which causes us to abort
3852 * the current cluster_write request... before we do, we need
3853 * to release the rest of the pages in the upl without modifying
3854 * there state and mark the failed page in error
3855 */
3856 ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
3857
3858 if (upl_size > PAGE_SIZE) {
3859 ubc_upl_abort_range(upl, 0, (upl_size_t)upl_size,
3860 UPL_ABORT_FREE_ON_EMPTY);
3861 }
3862
3863 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
3864 upl, 0, 0, retval, 0);
3865 break;
3866 }
3867 }
3868 if ((start_offset == 0 || upl_size > PAGE_SIZE) && ((start_offset + io_size) & PAGE_MASK)) {
3869 /*
3870 * the last offset we're writing to in this upl does not end on a page
3871 * boundary... if it's not beyond the old EOF, then we'll also need to
3872 * pre-read this page in if it isn't already valid
3873 */
3874 upl_offset = upl_size - PAGE_SIZE;
3875
3876 if ((upl_f_offset + start_offset + io_size) < oldEOF &&
3877 !upl_valid_page(pl, (int)(upl_offset / PAGE_SIZE))) {
3878 int read_size;
3879
3880 read_size = PAGE_SIZE;
3881
3882 if ((off_t)(upl_f_offset + upl_offset + read_size) > oldEOF) {
3883 read_size = (int)(oldEOF - (upl_f_offset + upl_offset));
3884 }
3885
3886 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, read_size,
3887 CL_READ | bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
3888 if (retval) {
3889 /*
3890 * we had an error during the read which causes us to abort
3891 * the current cluster_write request... before we do, we
3892 * need to release the rest of the pages in the upl without
				 * modifying their state and mark the failed page in error
3894 */
3895 ubc_upl_abort_range(upl, (upl_offset_t)upl_offset, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
3896
3897 if (upl_size > PAGE_SIZE) {
3898 ubc_upl_abort_range(upl, 0, (upl_size_t)upl_size, UPL_ABORT_FREE_ON_EMPTY);
3899 }
3900
3901 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
3902 upl, 0, 0, retval, 0);
3903 break;
3904 }
3905 }
3906 }
3907 xfer_resid = io_size;
3908 io_offset = start_offset;
3909
3910 while (zero_cnt && xfer_resid) {
3911 if (zero_cnt < (long long)xfer_resid) {
3912 bytes_to_zero = (int)zero_cnt;
3913 } else {
3914 bytes_to_zero = xfer_resid;
3915 }
3916
3917 bytes_to_zero = cluster_zero_range(upl, pl, flags, io_offset, zero_off, upl_f_offset, bytes_to_zero);
3918
3919 xfer_resid -= bytes_to_zero;
3920 zero_cnt -= bytes_to_zero;
3921 zero_off += bytes_to_zero;
3922 io_offset += bytes_to_zero;
3923 }
3924 if (xfer_resid && io_resid) {
3925 u_int32_t io_requested;
3926
3927 bytes_to_move = min(io_resid, xfer_resid);
3928 io_requested = bytes_to_move;
3929
3930 retval = cluster_copy_upl_data(uio, upl, io_offset, (int *)&io_requested);
3931
3932 if (retval) {
3933 ubc_upl_abort_range(upl, 0, (upl_size_t)upl_size, UPL_ABORT_FREE_ON_EMPTY);
3934
3935 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
3936 upl, 0, 0, retval, 0);
3937 } else {
3938 io_resid -= bytes_to_move;
3939 xfer_resid -= bytes_to_move;
3940 io_offset += bytes_to_move;
3941 }
3942 }
3943 while (xfer_resid && zero_cnt1 && retval == 0) {
3944 if (zero_cnt1 < (long long)xfer_resid) {
3945 bytes_to_zero = (int)zero_cnt1;
3946 } else {
3947 bytes_to_zero = xfer_resid;
3948 }
3949
3950 bytes_to_zero = cluster_zero_range(upl, pl, flags, io_offset, zero_off1, upl_f_offset, bytes_to_zero);
3951
3952 xfer_resid -= bytes_to_zero;
3953 zero_cnt1 -= bytes_to_zero;
3954 zero_off1 += bytes_to_zero;
3955 io_offset += bytes_to_zero;
3956 }
3957 if (retval == 0) {
3958 int do_zeroing = 1;
3959
3960 io_size += start_offset;
3961
3962 /* Force more restrictive zeroing behavior only on APFS */
3963 if ((vnode_tag(vp) == VT_APFS) && (newEOF < oldEOF)) {
3964 do_zeroing = 0;
3965 }
3966
3967 if (do_zeroing && (upl_f_offset + io_size) >= newEOF && (u_int)io_size < upl_size) {
3968 /*
3969 * if we're extending the file with this write
3970 * we'll zero fill the rest of the page so that
3971 * if the file gets extended again in such a way as to leave a
3972 * hole starting at this EOF, we'll have zero's in the correct spot
3973 */
3974 cluster_zero(upl, io_size, (int)(upl_size - io_size), NULL);
3975 }
3976 /*
3977 * release the upl now if we hold one since...
3978 * 1) pages in it may be present in the sparse cluster map
3979 * and may span 2 separate buckets there... if they do and
3980 * we happen to have to flush a bucket to make room and it intersects
3981 * this upl, a deadlock may result on page BUSY
3982 * 2) we're delaying the I/O... from this point forward we're just updating
3983 * the cluster state... no need to hold the pages, so commit them
3984 * 3) IO_SYNC is set...
		 * because we had to ask for a UPL that provides currently non-present pages, the
3986 * UPL has been automatically set to clear the dirty flags (both software and hardware)
3987 * upon committing it... this is not the behavior we want since it's possible for
3988 * pages currently present as part of a mapped file to be dirtied while the I/O is in flight.
3989 * we'll pick these pages back up later with the correct behavior specified.
3990 * 4) we don't want to hold pages busy in a UPL and then block on the cluster lock... if a flush
3991 * of this vnode is in progress, we will deadlock if the pages being flushed intersect the pages
3992 * we hold since the flushing context is holding the cluster lock.
3993 */
3994 ubc_upl_commit_range(upl, 0, (upl_size_t)upl_size,
3995 UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
3996 check_cluster:
3997 /*
3998 * calculate the last logical block number
3999 * that this delayed I/O encompassed
4000 */
4001 cl.e_addr = (daddr64_t)((upl_f_offset + (off_t)upl_size) / PAGE_SIZE_64);
4002
4003 if (flags & IO_SYNC) {
4004 /*
			 * if the IO_SYNC flag is set then we need to bypass
4006 * any clustering and immediately issue the I/O
4007 *
4008 * we don't hold the lock at this point
4009 *
4010 * we've already dropped the current upl, so pick it back up with COPYOUT_FROM set
4011 * so that we correctly deal with a change in state of the hardware modify bit...
4012 * we do this via cluster_push_now... by passing along the IO_SYNC flag, we force
4013 * cluster_push_now to wait until all the I/Os have completed... cluster_push_now is also
4014 * responsible for generating the correct sized I/O(s)
4015 */
4016 retval = cluster_push_now(vp, &cl, newEOF, flags, callback, callback_arg, FALSE);
4017 } else {
4018 boolean_t defer_writes = FALSE;
4019
4020 if (vfs_flags(vp->v_mount) & MNT_DEFWRITE) {
4021 defer_writes = TRUE;
4022 }
4023
4024 cluster_update_state_internal(vp, &cl, flags, defer_writes, &first_pass,
4025 write_off, write_cnt, newEOF, callback, callback_arg, FALSE);
4026 }
4027 }
4028 }
4029 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END, retval, 0, io_resid, 0, 0);
4030
4031 return retval;
4032 }
4033
4034
4035
4036 int
cluster_read(vnode_t vp,struct uio * uio,off_t filesize,int xflags)4037 cluster_read(vnode_t vp, struct uio *uio, off_t filesize, int xflags)
4038 {
4039 return cluster_read_ext(vp, uio, filesize, xflags, NULL, NULL);
4040 }
4041
4042
4043 int
cluster_read_ext(vnode_t vp,struct uio * uio,off_t filesize,int xflags,int (* callback)(buf_t,void *),void * callback_arg)4044 cluster_read_ext(vnode_t vp, struct uio *uio, off_t filesize, int xflags, int (*callback)(buf_t, void *), void *callback_arg)
4045 {
4046 int retval = 0;
4047 int flags;
4048 user_ssize_t cur_resid;
4049 u_int32_t io_size;
4050 u_int32_t read_length = 0;
4051 int read_type = IO_COPY;
4052
4053 flags = xflags;
4054
4055 if (vp->v_flag & VNOCACHE_DATA) {
4056 flags |= IO_NOCACHE;
4057 }
4058 if ((vp->v_flag & VRAOFF) || speculative_reads_disabled) {
4059 flags |= IO_RAOFF;
4060 }
4061
4062 if (flags & IO_SKIP_ENCRYPTION) {
4063 flags |= IO_ENCRYPTED;
4064 }
4065
4066 /*
4067 * do a read through the cache if one of the following is true....
4068 * NOCACHE is not true
4069 * the uio request doesn't target USERSPACE
4070 * Alternatively, if IO_ENCRYPTED is set, then we want to bypass the cache as well.
4071 * Reading encrypted data from a CP filesystem should never result in the data touching
4072 * the UBC.
4073 *
4074 * otherwise, find out if we want the direct or contig variant for
4075 * the first vector in the uio request
4076 */
4077 if (((flags & IO_NOCACHE) && UIO_SEG_IS_USER_SPACE(uio->uio_segflg)) || (flags & IO_ENCRYPTED)) {
4078 retval = cluster_io_type(uio, &read_type, &read_length, 0);
4079 }
4080
4081 while ((cur_resid = uio_resid(uio)) && uio->uio_offset < filesize && retval == 0) {
4082 switch (read_type) {
4083 case IO_COPY:
4084 /*
4085 * make sure the uio_resid isn't too big...
4086 * internally, we want to handle all of the I/O in
4087 * chunk sizes that fit in a 32 bit int
4088 */
4089 if (cur_resid > (user_ssize_t)(MAX_IO_REQUEST_SIZE)) {
4090 io_size = MAX_IO_REQUEST_SIZE;
4091 } else {
4092 io_size = (u_int32_t)cur_resid;
4093 }
4094
4095 retval = cluster_read_copy(vp, uio, io_size, filesize, flags, callback, callback_arg);
4096 break;
4097
4098 case IO_DIRECT:
4099 retval = cluster_read_direct(vp, uio, filesize, &read_type, &read_length, flags, callback, callback_arg);
4100 break;
4101
4102 case IO_CONTIG:
4103 retval = cluster_read_contig(vp, uio, filesize, &read_type, &read_length, callback, callback_arg, flags);
4104 break;
4105
4106 case IO_UNKNOWN:
4107 retval = cluster_io_type(uio, &read_type, &read_length, 0);
4108 break;
4109 }
4110 }
4111 return retval;
4112 }
4113
4114
4115
4116 static void
cluster_read_upl_release(upl_t upl,int start_pg,int last_pg,int take_reference)4117 cluster_read_upl_release(upl_t upl, int start_pg, int last_pg, int take_reference)
4118 {
4119 int range;
4120 int abort_flags = UPL_ABORT_FREE_ON_EMPTY;
4121
4122 if ((range = last_pg - start_pg)) {
4123 if (take_reference) {
4124 abort_flags |= UPL_ABORT_REFERENCE;
4125 }
4126
4127 ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, range * PAGE_SIZE, abort_flags);
4128 }
4129 }
4130
4131
/*
 * cluster_read_copy
 *
 * Cached ("copy") read path.  Satisfies as much of the request as
 * possible directly from pages already resident in the UBC; for any
 * non-resident range it builds a UPL over the affected pages, issues
 * an asynchronous read through cluster_io for the invalid portion,
 * and then copies the (now valid) data out to the caller's uio.
 * Along the way it drives the sequential read-ahead / prefetch
 * machinery and reacts to I/O throttling by shrinking I/O sizes and
 * disabling read-ahead while throttled.
 *
 *   vp          - vnode being read
 *   uio         - destination; uio_offset/uio_resid advance as data is copied
 *   io_req_size - number of bytes requested in this call
 *   filesize    - current EOF; the request is clipped to it
 *   flags       - IO_* control flags (IO_NOCACHE, IO_RAOFF, IO_PASSIVE,
 *                 IO_RETURN_ON_THROTTLE, ...)
 *   callback    - optional per-buffer completion callback, with callback_arg
 *
 * Returns 0 on success, EAGAIN when IO_RETURN_ON_THROTTLE cut the
 * request short, or an error from the underlying I/O or the copyout.
 */
static int
cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t filesize, int flags, int (*callback)(buf_t, void *), void *callback_arg)
{
	upl_page_info_t *pl;
	upl_t upl = NULL;
	vm_offset_t upl_offset;
	u_int32_t upl_size;
	off_t upl_f_offset;
	int start_offset;
	int start_pg;
	int last_pg;
	int uio_last = 0;
	int pages_in_upl;
	off_t max_size;
	off_t last_ioread_offset;
	off_t last_request_offset;
	kern_return_t kret;
	int error = 0;
	int retval = 0;
	u_int32_t size_of_prefetch;
	u_int32_t xsize;
	u_int32_t io_size;
	u_int32_t max_rd_size;
	u_int32_t max_io_size;
	u_int32_t max_prefetch;
	u_int rd_ahead_enabled = 1;
	u_int prefetch_enabled = 1;
	struct cl_readahead * rap;
	struct clios iostate;
	struct cl_extent extent;
	int bflag;
	int take_reference = 1;
	int policy = IOPOL_DEFAULT;
	boolean_t iolock_inited = FALSE;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_START,
	    (int)uio->uio_offset, io_req_size, (int)filesize, flags, 0);

	/* encrypted data must never land in the UBC; this path is cached-only */
	if (flags & IO_ENCRYPTED) {
		panic("encrypted blocks will hit UBC!");
	}

	policy = throttle_get_io_policy(NULL);

	/*
	 * low-priority and uncached readers shouldn't take page references
	 * that would age the cache in their favor
	 */
	if (policy == THROTTLE_LEVEL_TIER3 || policy == THROTTLE_LEVEL_TIER2 || (flags & IO_NOCACHE)) {
		take_reference = 0;
	}

	if (flags & IO_PASSIVE) {
		bflag = CL_PASSIVE;
	} else {
		bflag = 0;
	}

	if (flags & IO_NOCACHE) {
		bflag |= CL_NOCACHE;
	}

	if (flags & IO_SKIP_ENCRYPTION) {
		bflag |= CL_ENCRYPTED;
	}

	max_io_size = cluster_max_io_size(vp->v_mount, CL_READ);
	max_prefetch = cluster_max_prefetch(vp, max_io_size, prefetch_max);
	max_rd_size = max_prefetch;

	last_request_offset = uio->uio_offset + io_req_size;

	if (last_request_offset > filesize) {
		last_request_offset = filesize;
	}

	/*
	 * read-ahead is disabled when explicitly requested (IO_RAOFF),
	 * for uncached reads, or when the whole request fits inside a
	 * single page
	 */
	if ((flags & (IO_RAOFF | IO_NOCACHE)) || ((last_request_offset & ~PAGE_MASK_64) == (uio->uio_offset & ~PAGE_MASK_64))) {
		rd_ahead_enabled = 0;
		rap = NULL;
	} else {
		if (cluster_is_throttled(vp)) {
			/*
			 * we're in the throttle window, at the very least
			 * we want to limit the size of the I/O we're about
			 * to issue
			 */
			rd_ahead_enabled = 0;
			prefetch_enabled = 0;

			max_rd_size = calculate_max_throttle_size(vp);
		}
		if ((rap = cluster_get_rap(vp)) == NULL) {
			rd_ahead_enabled = 0;
		} else {
			extent.b_addr = uio->uio_offset / PAGE_SIZE_64;
			extent.e_addr = (last_request_offset - 1) / PAGE_SIZE_64;
		}
	}
	if (rap != NULL && rap->cl_ralen && (rap->cl_lastr == extent.b_addr || (rap->cl_lastr + 1) == extent.b_addr)) {
		/*
		 * determine if we already have a read-ahead in the pipe courtesy of the
		 * last read system call that was issued...
		 * if so, pick up its extent to determine where we should start
		 * with respect to any read-ahead that might be necessary to
		 * garner all the data needed to complete this read system call
		 */
		last_ioread_offset = (rap->cl_maxra * PAGE_SIZE_64) + PAGE_SIZE_64;

		if (last_ioread_offset < uio->uio_offset) {
			last_ioread_offset = (off_t)0;
		} else if (last_ioread_offset > last_request_offset) {
			last_ioread_offset = last_request_offset;
		}
	} else {
		last_ioread_offset = (off_t)0;
	}

	while (io_req_size && uio->uio_offset < filesize && retval == 0) {
		max_size = filesize - uio->uio_offset;
		bool leftover_upl_aborted = false;

		if ((off_t)(io_req_size) < max_size) {
			io_size = io_req_size;
		} else {
			io_size = (u_int32_t)max_size;
		}

		if (!(flags & IO_NOCACHE)) {
			/*
			 * streaming phase: copy straight out of the UBC for as
			 * long as the pages we need are already resident
			 */
			while (io_size) {
				u_int32_t io_resid;
				u_int32_t io_requested;

				/*
				 * if we keep finding the pages we need already in the cache, then
				 * don't bother to call cluster_read_prefetch since it costs CPU cycles
				 * to determine that we have all the pages we need... once we miss in
				 * the cache and have issued an I/O, then we'll assume that we're likely
				 * to continue to miss in the cache and it's to our advantage to try and prefetch
				 */
				if (last_request_offset && last_ioread_offset && (size_of_prefetch = (u_int32_t)(last_request_offset - last_ioread_offset))) {
					if ((last_ioread_offset - uio->uio_offset) <= max_rd_size && prefetch_enabled) {
						/*
						 * we've already issued I/O for this request and
						 * there's still work to do and
						 * our prefetch stream is running dry, so issue a
						 * pre-fetch I/O... the I/O latency will overlap
						 * with the copying of the data
						 */
						if (size_of_prefetch > max_rd_size) {
							size_of_prefetch = max_rd_size;
						}

						size_of_prefetch = cluster_read_prefetch(vp, last_ioread_offset, size_of_prefetch, filesize, callback, callback_arg, bflag);

						last_ioread_offset += (off_t)(size_of_prefetch * PAGE_SIZE);

						if (last_ioread_offset > last_request_offset) {
							last_ioread_offset = last_request_offset;
						}
					}
				}
				/*
				 * limit the size of the copy we're about to do so that
				 * we can notice that our I/O pipe is running dry and
				 * get the next I/O issued before it does go dry
				 */
				if (last_ioread_offset && io_size > (max_io_size / 4)) {
					io_resid = (max_io_size / 4);
				} else {
					io_resid = io_size;
				}

				io_requested = io_resid;

				/* io_resid comes back non-zero if a needed page wasn't resident */
				retval = cluster_copy_ubc_data_internal(vp, uio, (int *)&io_resid, 0, take_reference);

				xsize = io_requested - io_resid;

				io_size -= xsize;
				io_req_size -= xsize;

				if (retval || io_resid) {
					/*
					 * if we run into a real error or
					 * a page that is not in the cache
					 * we need to leave streaming mode
					 */
					break;
				}

				if (rd_ahead_enabled && (io_size == 0 || last_ioread_offset == last_request_offset)) {
					/*
					 * we're already finished the I/O for this read request
					 * let's see if we should do a read-ahead
					 */
					cluster_read_ahead(vp, &extent, filesize, rap, callback, callback_arg, bflag);
				}
			}
			if (retval) {
				break;
			}
			if (io_size == 0) {
				/* whole request satisfied from cache; update read-ahead history */
				if (rap != NULL) {
					if (extent.e_addr < rap->cl_lastr) {
						rap->cl_maxra = 0;
					}
					rap->cl_lastr = extent.e_addr;
				}
				break;
			}
			/*
			 * recompute max_size since cluster_copy_ubc_data_internal
			 * may have advanced uio->uio_offset
			 */
			max_size = filesize - uio->uio_offset;
		}

		iostate.io_completed = 0;
		iostate.io_issued = 0;
		iostate.io_error = 0;
		iostate.io_wanted = 0;

		if ((flags & IO_RETURN_ON_THROTTLE)) {
			if (cluster_is_throttled(vp) == THROTTLE_NOW) {
				if (!cluster_io_present_in_BC(vp, uio->uio_offset)) {
					/*
					 * we're in the throttle window and at least 1 I/O
					 * has already been issued by a throttleable thread
					 * in this window, so return with EAGAIN to indicate
					 * to the FS issuing the cluster_read call that it
					 * should now throttle after dropping any locks
					 */
					throttle_info_update_by_mount(vp->v_mount);

					retval = EAGAIN;
					break;
				}
			}
		}

		/*
		 * compute the size of the upl needed to encompass
		 * the requested read... limit each call to cluster_io
		 * to the maximum UPL size... cluster_io will clip if
		 * this exceeds the maximum io_size for the device,
		 * make sure to account for
		 * a starting offset that's not page aligned
		 */
		start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
		upl_f_offset = uio->uio_offset - (off_t)start_offset;

		if (io_size > max_rd_size) {
			io_size = max_rd_size;
		}

		upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;

		if (flags & IO_NOCACHE) {
			if (upl_size > max_io_size) {
				upl_size = max_io_size;
			}
		} else {
			/*
			 * cached reads use a smaller UPL so the copy and the
			 * next I/O can overlap
			 */
			if (upl_size > max_io_size / 4) {
				upl_size = max_io_size / 4;
				upl_size &= ~PAGE_MASK;

				if (upl_size == 0) {
					upl_size = PAGE_SIZE;
				}
			}
		}
		pages_in_upl = upl_size / PAGE_SIZE;

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_START,
		    upl, (int)upl_f_offset, upl_size, start_offset, 0);

		kret = ubc_create_upl_kernel(vp,
		    upl_f_offset,
		    upl_size,
		    &upl,
		    &pl,
		    UPL_FILE_IO | UPL_SET_LITE,
		    VM_KERN_MEMORY_FILE);
		if (kret != KERN_SUCCESS) {
			panic("cluster_read_copy: failed to get pagelist");
		}

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_END,
		    upl, (int)upl_f_offset, upl_size, start_offset, 0);

		/*
		 * scan from the beginning of the upl looking for the first
		 * non-valid page.... this will become the first page in
		 * the request we're going to make to 'cluster_io'... if all
		 * of the pages are valid, we won't call through to 'cluster_io'
		 */
		for (start_pg = 0; start_pg < pages_in_upl; start_pg++) {
			if (!upl_valid_page(pl, start_pg)) {
				break;
			}
		}

		/*
		 * scan from the starting invalid page looking for a valid
		 * page before the end of the upl is reached, if we
		 * find one, then it will be the last page of the request to
		 * 'cluster_io'
		 */
		for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
			if (upl_valid_page(pl, last_pg)) {
				break;
			}
		}

		if (start_pg < last_pg) {
			/*
			 * we found a range of 'invalid' pages that must be filled
			 * if the last page in this range is the last page of the file
			 * we may have to clip the size of it to keep from reading past
			 * the end of the last physical block associated with the file
			 */
			if (iolock_inited == FALSE) {
				lck_mtx_init(&iostate.io_mtxp, &cl_mtx_grp, LCK_ATTR_NULL);

				iolock_inited = TRUE;
			}
			upl_offset = start_pg * PAGE_SIZE;
			io_size = (last_pg - start_pg) * PAGE_SIZE;

			if ((off_t)(upl_f_offset + upl_offset + io_size) > filesize) {
				io_size = (u_int32_t)(filesize - (upl_f_offset + upl_offset));
			}

			/*
			 * Find out if this needs verification, we'll have to manage the UPL
			 * differently if so. Note that this call only lets us know if
			 * verification is enabled on this mount point, the actual verification
			 * is performed in the File system.
			 *
			 * NOTE(review): the verify_block_size test is compiled out below, so
			 * this branch is taken whenever VNOP_VERIFY succeeds -- confirm this
			 * is intentional
			 */
			size_t verify_block_size = 0;
			if ((VNOP_VERIFY(vp, start_offset, NULL, 0, &verify_block_size, NULL, VNODE_VERIFY_DEFAULT, NULL) == 0) /* && verify_block_size */) {
				for (uio_last = last_pg; uio_last < pages_in_upl; uio_last++) {
					if (!upl_valid_page(pl, uio_last)) {
						break;
					}
				}
				if (uio_last < pages_in_upl) {
					/*
					 * there were some invalid pages beyond the valid pages
					 * that we didn't issue an I/O for, just release them
					 * unchanged now, so that any prefetch/readahead can
					 * include them
					 */
					ubc_upl_abort_range(upl, uio_last * PAGE_SIZE,
					    (pages_in_upl - uio_last) * PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
					leftover_upl_aborted = true;
				}
			}

			/*
			 * issue an asynchronous read to cluster_io
			 */

			error = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset,
			    io_size, CL_READ | CL_ASYNC | bflag, (buf_t)NULL, &iostate, callback, callback_arg);

			if (rap) {
				if (extent.e_addr < rap->cl_maxra) {
					/*
					 * we've just issued a read for a block that should have been
					 * in the cache courtesy of the read-ahead engine... something
					 * has gone wrong with the pipeline, so reset the read-ahead
					 * logic which will cause us to restart from scratch
					 */
					rap->cl_maxra = 0;
				}
			}
		}
		if (error == 0) {
			/*
			 * if the read completed successfully, or there was no I/O request
			 * issued, then copy the data into user land via 'cluster_upl_copy_data'
			 * we'll first add on any 'valid'
			 * pages that were present in the upl when we acquired it.
			 */
			u_int val_size;

			if (!leftover_upl_aborted) {
				for (uio_last = last_pg; uio_last < pages_in_upl; uio_last++) {
					if (!upl_valid_page(pl, uio_last)) {
						break;
					}
				}
				if (uio_last < pages_in_upl) {
					/*
					 * there were some invalid pages beyond the valid pages
					 * that we didn't issue an I/O for, just release them
					 * unchanged now, so that any prefetch/readahead can
					 * include them
					 */
					ubc_upl_abort_range(upl, uio_last * PAGE_SIZE,
					    (pages_in_upl - uio_last) * PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
				}
			}

			/*
			 * compute size to transfer this round, if io_req_size is
			 * still non-zero after this attempt, we'll loop around and
			 * set up for another I/O.
			 */
			val_size = (uio_last * PAGE_SIZE) - start_offset;

			if (val_size > max_size) {
				val_size = (u_int)max_size;
			}

			if (val_size > io_req_size) {
				val_size = io_req_size;
			}

			if ((uio->uio_offset + val_size) > last_ioread_offset) {
				last_ioread_offset = uio->uio_offset + val_size;
			}

			if ((size_of_prefetch = (u_int32_t)(last_request_offset - last_ioread_offset)) && prefetch_enabled) {
				if ((last_ioread_offset - (uio->uio_offset + val_size)) <= upl_size) {
					/*
					 * if there's still I/O left to do for this request, and...
					 * we're not in hard throttle mode, and...
					 * we're close to using up the previous prefetch, then issue a
					 * new pre-fetch I/O... the I/O latency will overlap
					 * with the copying of the data
					 */
					if (size_of_prefetch > max_rd_size) {
						size_of_prefetch = max_rd_size;
					}

					size_of_prefetch = cluster_read_prefetch(vp, last_ioread_offset, size_of_prefetch, filesize, callback, callback_arg, bflag);

					last_ioread_offset += (off_t)(size_of_prefetch * PAGE_SIZE);

					if (last_ioread_offset > last_request_offset) {
						last_ioread_offset = last_request_offset;
					}
				}
			} else if ((uio->uio_offset + val_size) == last_request_offset) {
				/*
				 * this transfer will finish this request, so...
				 * let's try to read ahead if we're in
				 * a sequential access pattern and we haven't
				 * explicitly disabled it
				 */
				if (rd_ahead_enabled) {
					cluster_read_ahead(vp, &extent, filesize, rap, callback, callback_arg, bflag);
				}

				if (rap != NULL) {
					if (extent.e_addr < rap->cl_lastr) {
						rap->cl_maxra = 0;
					}
					rap->cl_lastr = extent.e_addr;
				}
			}
			/* wait for the async read to finish before copying out */
			if (iolock_inited == TRUE) {
				cluster_iostate_wait(&iostate, 0, "cluster_read_copy");
			}

			if (iostate.io_error) {
				error = iostate.io_error;
			} else {
				u_int32_t io_requested;

				io_requested = val_size;

				retval = cluster_copy_upl_data(uio, upl, start_offset, (int *)&io_requested);

				io_req_size -= (val_size - io_requested);
			}
		} else {
			/* drain any I/O we issued before unwinding */
			if (iolock_inited == TRUE) {
				cluster_iostate_wait(&iostate, 0, "cluster_read_copy");
			}
		}
		if (start_pg < last_pg) {
			/*
			 * compute the range of pages that we actually issued an I/O for
			 * and either commit them as valid if the I/O succeeded
			 * or abort them if the I/O failed or we're not supposed to
			 * keep them in the cache
			 */
			io_size = (last_pg - start_pg) * PAGE_SIZE;

			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START, upl, start_pg * PAGE_SIZE, io_size, error, 0);

			if (error || (flags & IO_NOCACHE)) {
				ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, io_size,
				    UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
			} else {
				int commit_flags = UPL_COMMIT_CLEAR_DIRTY | UPL_COMMIT_FREE_ON_EMPTY;

				if (take_reference) {
					commit_flags |= UPL_COMMIT_INACTIVATE;
				} else {
					commit_flags |= UPL_COMMIT_SPECULATE;
				}

				ubc_upl_commit_range(upl, start_pg * PAGE_SIZE, io_size, commit_flags);
			}
			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END, upl, start_pg * PAGE_SIZE, io_size, error, 0);
		}
		if ((last_pg - start_pg) < pages_in_upl) {
			/*
			 * the set of pages that we issued an I/O for did not encompass
			 * the entire upl... so just release these without modifying
			 * their state
			 */
			if (error) {
				if (leftover_upl_aborted) {
					ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, (uio_last - start_pg) * PAGE_SIZE,
					    UPL_ABORT_FREE_ON_EMPTY);
				} else {
					ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
				}
			} else {
				KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START,
				    upl, -1, pages_in_upl - (last_pg - start_pg), 0, 0);

				/*
				 * handle any valid pages at the beginning of
				 * the upl... release these appropriately
				 */
				cluster_read_upl_release(upl, 0, start_pg, take_reference);

				/*
				 * handle any valid pages immediately after the
				 * pages we issued I/O for... ... release these appropriately
				 */
				cluster_read_upl_release(upl, last_pg, uio_last, take_reference);

				KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END, upl, -1, -1, 0, 0);
			}
		}
		if (retval == 0) {
			retval = error;
		}

		if (io_req_size) {
			/*
			 * more work remains... re-evaluate the throttle state
			 * before the next pass
			 */
			uint32_t max_throttle_size = calculate_max_throttle_size(vp);

			if (cluster_is_throttled(vp)) {
				/*
				 * we're in the throttle window, at the very least
				 * we want to limit the size of the I/O we're about
				 * to issue
				 */
				rd_ahead_enabled = 0;
				prefetch_enabled = 0;
				max_rd_size = max_throttle_size;
			} else {
				if (max_rd_size == max_throttle_size) {
					/*
					 * coming out of throttled state
					 */
					if (policy != THROTTLE_LEVEL_TIER3 && policy != THROTTLE_LEVEL_TIER2) {
						if (rap != NULL) {
							rd_ahead_enabled = 1;
						}
						prefetch_enabled = 1;
					}
					max_rd_size = max_prefetch;
					last_ioread_offset = 0;
				}
			}
		}
	}
	if (iolock_inited == TRUE) {
		/*
		 * cluster_io returned an error after it
		 * had already issued some I/O. we need
		 * to wait for that I/O to complete before
		 * we can destroy the iostate mutex...
		 * 'retval' already contains the early error
		 * so no need to pick it up from iostate.io_error
		 */
		cluster_iostate_wait(&iostate, 0, "cluster_read_copy");

		lck_mtx_destroy(&iostate.io_mtxp, &cl_mtx_grp);
	}
	if (rap != NULL) {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
		    (int)uio->uio_offset, io_req_size, rap->cl_lastr, retval, 0);

		lck_mtx_unlock(&rap->cl_lockr);
	} else {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
		    (int)uio->uio_offset, io_req_size, 0, retval, 0);
	}

	return retval;
}
4728
4729 /*
4730 * We don't want another read/write lock for every vnode in the system
4731 * so we keep a hash of them here. There should never be very many of
4732 * these around at any point in time.
4733 */
/*
 * Look up (or create) the per-vnode read/write lock used to serialize
 * direct reads, and take it in 'type' mode.
 *
 * Locks live in a small hash keyed on the vnode pointer; the bucket
 * lists are protected by cl_direct_read_spin_lock.  Because we cannot
 * allocate while holding a spin lock, a lookup miss drops the spin
 * lock, allocates a candidate entry, and retries -- if another thread
 * raced us and inserted one first, the candidate is discarded.
 *
 * Returns the (referenced, rw-locked) entry; release it with
 * cluster_unlock_direct_read().
 */
cl_direct_read_lock_t *
cluster_lock_direct_read(vnode_t vp, lck_rw_type_t type)
{
	struct cl_direct_read_locks *head
	    = &cl_direct_read_locks[(uintptr_t)vp / sizeof(*vp)
	    % CL_DIRECT_READ_LOCK_BUCKETS];

	struct cl_direct_read_lock *lck, *new_lck = NULL;

	for (;;) {
		lck_spin_lock(&cl_direct_read_spin_lock);

		LIST_FOREACH(lck, head, chain) {
			if (lck->vp == vp) {
				++lck->ref_count;
				lck_spin_unlock(&cl_direct_read_spin_lock);
				if (new_lck) {
					// Someone beat us to it, ditch the allocation
					lck_rw_destroy(&new_lck->rw_lock, &cl_mtx_grp);
					kfree_type(cl_direct_read_lock_t, new_lck);
				}
				// rw lock is taken outside the spin lock; it may block
				lck_rw_lock(&lck->rw_lock, type);
				return lck;
			}
		}

		if (new_lck) {
			// Use the lock we allocated
			LIST_INSERT_HEAD(head, new_lck, chain);
			lck_spin_unlock(&cl_direct_read_spin_lock);
			lck_rw_lock(&new_lck->rw_lock, type);
			return new_lck;
		}

		lck_spin_unlock(&cl_direct_read_spin_lock);

		// Allocate a new lock (can't allocate while holding the spin lock)
		new_lck = kalloc_type(cl_direct_read_lock_t, Z_WAITOK);
		lck_rw_init(&new_lck->rw_lock, &cl_mtx_grp, LCK_ATTR_NULL);
		new_lck->vp = vp;
		new_lck->ref_count = 1;

		// Got to go round again
	}
}
4779
4780 void
cluster_unlock_direct_read(cl_direct_read_lock_t * lck)4781 cluster_unlock_direct_read(cl_direct_read_lock_t *lck)
4782 {
4783 lck_rw_done(&lck->rw_lock);
4784
4785 lck_spin_lock(&cl_direct_read_spin_lock);
4786 if (lck->ref_count == 1) {
4787 LIST_REMOVE(lck, chain);
4788 lck_spin_unlock(&cl_direct_read_spin_lock);
4789 lck_rw_destroy(&lck->rw_lock, &cl_mtx_grp);
4790 kfree_type(cl_direct_read_lock_t, lck);
4791 } else {
4792 --lck->ref_count;
4793 lck_spin_unlock(&cl_direct_read_spin_lock);
4794 }
4795 }
4796
4797 static int
cluster_read_direct(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t *read_length,
    int flags, int (*callback)(buf_t, void *), void *callback_arg)
{
	/*
	 * Direct-I/O read path: wire the pages backing the caller's buffer
	 * into UPLs and issue reads from the device straight into user (or
	 * kernel) memory, bypassing the buffer cache.  Misaligned requests
	 * and any partial-block "tail" are redirected through the cached
	 * copy path (cluster_read_copy).  When the uio carries multiple
	 * iovs, the individual UPLs may be gathered into a single "vector
	 * UPL" and issued as one I/O.
	 *
	 * On return, *read_type/*read_length describe the next segment of
	 * the uio (IO_UNKNOWN if this routine fully consumed the request or
	 * fell back to the copy path).
	 */
	upl_t upl = NULL;
	upl_page_info_t *pl;
	off_t max_io_size;
	vm_offset_t upl_offset, vector_upl_offset = 0;
	upl_size_t upl_size = 0, vector_upl_size = 0;
	vm_size_t upl_needed_size;
	unsigned int pages_in_pl;
	upl_control_flags_t upl_flags;
	kern_return_t kret = KERN_SUCCESS;
	unsigned int i;
	int force_data_sync;
	int retval = 0;
	int no_zero_fill = 0;
	int io_flag = 0;
	int misaligned = 0;
	struct clios iostate;		/* shared state for the async I/Os we spin off */
	user_addr_t iov_base;
	u_int32_t io_req_size;
	u_int32_t offset_in_file;
	u_int32_t offset_in_iovbase;
	u_int32_t io_size;
	u_int32_t io_min;
	u_int32_t xsize;
	u_int32_t devblocksize;
	u_int32_t mem_alignment_mask;
	u_int32_t max_upl_size;
	u_int32_t max_rd_size;
	u_int32_t max_rd_ahead;
	u_int32_t max_vector_size;
	boolean_t io_throttled = FALSE;

	u_int32_t vector_upl_iosize = 0;
	int issueVectorUPL = 0, useVectorUPL = (uio->uio_iovcnt > 1);
	off_t v_upl_uio_offset = 0;
	int vector_upl_index = 0;
	upl_t vector_upl = NULL;
	cl_direct_read_lock_t *lock = NULL;

	assert(vm_map_page_shift(current_map()) >= PAGE_SHIFT);

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_START,
	    (int)uio->uio_offset, (int)filesize, *read_type, *read_length, 0);

	max_upl_size = cluster_max_io_size(vp->v_mount, CL_READ);

	max_rd_size = max_upl_size;

	/*
	 * allow IO_SCALE(vp, 2) reads worth of data to be outstanding,
	 * capped at overlapping_read_max (also the fallback if the
	 * multiplication would overflow a u_int32_t)
	 */
	if (__improbable(os_mul_overflow(max_rd_size, IO_SCALE(vp, 2),
	    &max_rd_ahead) || (max_rd_ahead > overlapping_read_max))) {
		max_rd_ahead = overlapping_read_max;
	}

	io_flag = CL_COMMIT | CL_READ | CL_ASYNC | CL_NOZERO | CL_DIRECT_IO;

	if (flags & IO_PASSIVE) {
		io_flag |= CL_PASSIVE;
	}

	if (flags & IO_ENCRYPTED) {
		io_flag |= CL_RAW_ENCRYPTED;
	}

	if (flags & IO_NOCACHE) {
		io_flag |= CL_NOCACHE;
	}

	if (flags & IO_SKIP_ENCRYPTION) {
		io_flag |= CL_ENCRYPTED;
	}

	iostate.io_completed = 0;
	iostate.io_issued = 0;
	iostate.io_error = 0;
	iostate.io_wanted = 0;

	lck_mtx_init(&iostate.io_mtxp, &cl_mtx_grp, LCK_ATTR_NULL);

	devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize;
	mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_NONE,
	    (int)devblocksize, (int)mem_alignment_mask, 0, 0, 0);

	if (devblocksize == 1) {
		/*
		 * the AFP client advertises a devblocksize of 1
		 * however, its BLOCKMAP routine maps to physical
		 * blocks that are PAGE_SIZE in size...
		 * therefore we can't ask for I/Os that aren't page aligned
		 * or aren't multiples of PAGE_SIZE in size
		 * by setting devblocksize to PAGE_SIZE, we re-instate
		 * the old behavior we had before the mem_alignment_mask
		 * changes went in...
		 */
		devblocksize = PAGE_SIZE;
	}

	/*
	 * We are going to need this uio for the prefaulting later
	 * especially for the cases where multiple non-contiguous
	 * iovs are passed into this routine.
	 */
	uio_t uio_acct = uio_duplicate(uio);

next_dread:
	io_req_size = *read_length;
	iov_base = uio_curriovbase(uio);

	offset_in_file = (u_int32_t)uio->uio_offset & (devblocksize - 1);
	offset_in_iovbase = (u_int32_t)iov_base & mem_alignment_mask;

	if (vm_map_page_mask(current_map()) < PAGE_MASK) {
		/*
		 * XXX TODO4K
		 * Direct I/O might not work as expected from a 16k kernel space
		 * to a 4k user space because each 4k chunk might point to
		 * a different 16k physical page...
		 * Let's go the "misaligned" way.
		 */
		if (!misaligned) {
			DEBUG4K_VFS("forcing misaligned\n");
		}
		misaligned = 1;
	}

	if (offset_in_file || offset_in_iovbase) {
		/*
		 * one of the 2 important offsets is misaligned
		 * so fire an I/O through the cache for this entire vector
		 */
		misaligned = 1;
	}
	if (iov_base & (devblocksize - 1)) {
		/*
		 * the offset in memory must be on a device block boundary
		 * so that we can guarantee that we can generate an
		 * I/O that ends on a page boundary in cluster_io
		 */
		misaligned = 1;
	}

	max_io_size = filesize - uio->uio_offset;

	/*
	 * The user must request IO in aligned chunks.  If the
	 * offset into the file is bad, or the userland pointer
	 * is non-aligned, then we cannot service the encrypted IO request.
	 */
	if (flags & IO_ENCRYPTED) {
		if (misaligned || (io_req_size & (devblocksize - 1))) {
			retval = EINVAL;
		}

		max_io_size = roundup(max_io_size, devblocksize);
	}

	if ((off_t)io_req_size > max_io_size) {
		io_req_size = (u_int32_t)max_io_size;
	}

	/*
	 * When we get to this point, we know...
	 *  -- the offset into the file is on a devblocksize boundary
	 */

	while (io_req_size && retval == 0) {
		u_int32_t io_start;

		if (cluster_is_throttled(vp)) {
			uint32_t max_throttle_size = calculate_max_throttle_size(vp);

			/*
			 * we're in the throttle window, at the very least
			 * we want to limit the size of the I/O we're about
			 * to issue
			 */
			max_rd_size  = max_throttle_size;
			max_rd_ahead = max_throttle_size - 1;
			max_vector_size = max_throttle_size;
		} else {
			max_rd_size  = max_upl_size;
			max_rd_ahead = max_rd_size * IO_SCALE(vp, 2);
			max_vector_size = MAX_VECTOR_UPL_SIZE;
		}
		io_start = io_size = io_req_size;

		/*
		 * First look for pages already in the cache
		 * and move them to user space.  But only do this
		 * check if we are not retrieving encrypted data directly
		 * from the filesystem;  those blocks should never
		 * be in the UBC.
		 *
		 * cluster_copy_ubc_data returns the resid
		 * in io_size
		 */
		if ((flags & IO_ENCRYPTED) == 0) {
			retval = cluster_copy_ubc_data_internal(vp, uio, (int *)&io_size, 0, 0);
		}
		/*
		 * calculate the number of bytes actually copied
		 * starting size - residual
		 */
		xsize = io_start - io_size;

		io_req_size -= xsize;

		if (useVectorUPL && (xsize || (iov_base & PAGE_MASK))) {
			/*
			 * We found something in the cache or we have an iov_base that's not
			 * page-aligned.
			 *
			 * Issue all I/O's that have been collected within this Vectored UPL.
			 */
			if (vector_upl_index) {
				retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
				reset_vector_run_state();
			}

			if (xsize) {
				useVectorUPL = 0;
			}

			/*
			 * After this point, if we are using the Vector UPL path and the base is
			 * not page-aligned then the UPL with that base will be the first in the vector UPL.
			 */
		}

		/*
		 * check to see if we are finished with this request.
		 *
		 * If we satisfied this IO already, then io_req_size will be 0.
		 * Otherwise, see if the IO was mis-aligned and needs to go through
		 * the UBC to deal with the 'tail'.
		 *
		 */
		if (io_req_size == 0 || (misaligned)) {
			/*
			 * see if there's another uio vector to
			 * process that's of type IO_DIRECT
			 *
			 * break out of while loop to get there
			 */
			break;
		}
		/*
		 * assume the request ends on a device block boundary
		 */
		io_min = devblocksize;

		/*
		 * we can handle I/O's in multiples of the device block size
		 * however, if io_size isn't a multiple of devblocksize we
		 * want to clip it back to the nearest page boundary since
		 * we are going to have to go through cluster_read_copy to
		 * deal with the 'overhang'... by clipping it to a PAGE_SIZE
		 * multiple, we avoid asking the drive for the same physical
		 * blocks twice.. once for the partial page at the end of the
		 * request and a 2nd time for the page we read into the cache
		 * (which overlaps the end of the direct read) in order to
		 * get at the overhang bytes
		 */
		if (io_size & (devblocksize - 1)) {
			assert(!(flags & IO_ENCRYPTED));
			/*
			 * Clip the request to the previous page size boundary
			 * since request does NOT end on a device block boundary
			 */
			io_size &= ~PAGE_MASK;
			io_min = PAGE_SIZE;
		}
		if (retval || io_size < io_min) {
			/*
			 * either an error or we only have the tail left to
			 * complete via the copy path...
			 * we may have already spun some portion of this request
			 * off as async requests... we need to wait for the I/O
			 * to complete before returning
			 */
			goto wait_for_dreads;
		}

		/*
		 * Don't re-check the UBC data if we are looking for uncached IO
		 * or asking for encrypted blocks.
		 */
		if ((flags & IO_ENCRYPTED) == 0) {
			if ((xsize = io_size) > max_rd_size) {
				xsize = max_rd_size;
			}

			io_size = 0;

			if (!lock) {
				/*
				 * We hold a lock here between the time we check the
				 * cache and the time we issue I/O.  This saves us
				 * from having to lock the pages in the cache.  Not
				 * all clients will care about this lock but some
				 * clients may want to guarantee stability between
				 * here and when the I/O is issued in which case they
				 * will take the lock exclusively.
				 */
				lock = cluster_lock_direct_read(vp, LCK_RW_TYPE_SHARED);
			}

			/* io_size returns the length of the run of absent pages */
			ubc_range_op(vp, uio->uio_offset, uio->uio_offset + xsize, UPL_ROP_ABSENT, (int *)&io_size);

			if (io_size == 0) {
				/*
				 * a page must have just come into the cache
				 * since the first page in this range is no
				 * longer absent, go back and re-evaluate
				 */
				continue;
			}
		}
		if ((flags & IO_RETURN_ON_THROTTLE)) {
			if (cluster_is_throttled(vp) == THROTTLE_NOW) {
				if (!cluster_io_present_in_BC(vp, uio->uio_offset)) {
					/*
					 * we're in the throttle window and at least 1 I/O
					 * has already been issued by a throttleable thread
					 * in this window, so return with EAGAIN to indicate
					 * to the FS issuing the cluster_read call that it
					 * should now throttle after dropping any locks
					 */
					throttle_info_update_by_mount(vp->v_mount);

					io_throttled = TRUE;
					goto wait_for_dreads;
				}
			}
		}
		if (io_size > max_rd_size) {
			io_size = max_rd_size;
		}

		iov_base = uio_curriovbase(uio);

		upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK);
		upl_needed_size = (upl_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_START,
		    (int)upl_offset, upl_needed_size, (int)iov_base, io_size, 0);

		if (upl_offset == 0 && ((io_size & PAGE_MASK) == 0)) {
			no_zero_fill = 1;
		} else {
			no_zero_fill = 0;
		}

		vm_map_t map = UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ? current_map() : kernel_map;
		/*
		 * try up to 3 times to wire the buffer's pages, escalating to
		 * UPL_FORCE_DATA_SYNC on retries; an absent page in the UPL
		 * means the snapshot raced with a pageout, so retry
		 */
		for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) {
			pages_in_pl = 0;
			upl_size = (upl_size_t)upl_needed_size;
			upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
			if (no_zero_fill) {
				upl_flags |= UPL_NOZEROFILL;
			}
			if (force_data_sync) {
				upl_flags |= UPL_FORCE_DATA_SYNC;
			}

			kret = vm_map_create_upl(map,
			    (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
			    &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, VM_KERN_MEMORY_FILE);

			if (kret != KERN_SUCCESS) {
				KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
				    (int)upl_offset, upl_size, io_size, kret, 0);
				/*
				 * failed to get pagelist
				 *
				 * we may have already spun some portion of this request
				 * off as async requests... we need to wait for the I/O
				 * to complete before returning
				 */
				goto wait_for_dreads;
			}
			pages_in_pl = upl_size / PAGE_SIZE;
			pl = UPL_GET_INTERNAL_PAGE_LIST(upl);

			for (i = 0; i < pages_in_pl; i++) {
				if (!upl_page_present(pl, i)) {
					break;
				}
			}
			if (i == pages_in_pl) {
				break;
			}

			ubc_upl_abort(upl, 0);
		}
		if (force_data_sync >= 3) {
			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
			    (int)upl_offset, upl_size, io_size, kret, 0);

			goto wait_for_dreads;
		}
		/*
		 * Consider the possibility that upl_size wasn't satisfied.
		 */
		if (upl_size < upl_needed_size) {
			if (upl_size && upl_offset == 0) {
				io_size = upl_size;
			} else {
				io_size = 0;
			}
		}
		if (io_size == 0) {
			ubc_upl_abort(upl, 0);
			goto wait_for_dreads;
		}
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
		    (int)upl_offset, upl_size, io_size, kret, 0);

		if (useVectorUPL) {
			vm_offset_t end_off = ((iov_base + io_size) & PAGE_MASK);
			if (end_off) {
				issueVectorUPL = 1;
			}
			/*
			 * After this point, if we are using a vector UPL, then
			 * either all the UPL elements end on a page boundary OR
			 * this UPL is the last element because it does not end
			 * on a page boundary.
			 */
		}

		/*
		 * request asynchronously so that we can overlap
		 * the preparation of the next I/O
		 * if there are already too many outstanding reads
		 * wait until some have completed before issuing the next read
		 */
		cluster_iostate_wait(&iostate, max_rd_ahead, "cluster_read_direct");

		if (iostate.io_error) {
			/*
			 * one of the earlier reads we issued ran into a hard error
			 * don't issue any more reads, cleanup the UPL
			 * that was just created but not used, then
			 * go wait for any other reads to complete before
			 * returning the error to the caller
			 */
			ubc_upl_abort(upl, 0);

			goto wait_for_dreads;
		}
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_START,
		    upl, (int)upl_offset, (int)uio->uio_offset, io_size, 0);

		if (!useVectorUPL) {
			if (no_zero_fill) {
				io_flag &= ~CL_PRESERVE;
			} else {
				io_flag |= CL_PRESERVE;
			}

			retval = cluster_io(vp, upl, upl_offset, uio->uio_offset, io_size, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
		} else {
			if (!vector_upl_index) {
				vector_upl = vector_upl_create(upl_offset, uio->uio_iovcnt);
				v_upl_uio_offset = uio->uio_offset;
				vector_upl_offset = upl_offset;
			}

			vector_upl_set_subupl(vector_upl, upl, upl_size);
			vector_upl_set_iostate(vector_upl, upl, vector_upl_size, upl_size);
			vector_upl_index++;
			vector_upl_size += upl_size;
			vector_upl_iosize += io_size;

			if (issueVectorUPL || vector_upl_index == vector_upl_max_upls(vector_upl) || vector_upl_size >= max_vector_size) {
				retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
				reset_vector_run_state();
			}
		}

		if (lock) {
			// We don't need to wait for the I/O to complete
			cluster_unlock_direct_read(lock);
			lock = NULL;
		}

		/*
		 * update the uio structure
		 */
		if ((flags & IO_ENCRYPTED) && (max_io_size < io_size)) {
			uio_update(uio, (user_size_t)max_io_size);
		} else {
			uio_update(uio, (user_size_t)io_size);
		}

		io_req_size -= io_size;

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_END,
		    upl, (int)uio->uio_offset, io_req_size, retval, 0);
	} /* end while */

	if (retval == 0 && iostate.io_error == 0 && io_req_size == 0 && uio->uio_offset < filesize) {
		retval = cluster_io_type(uio, read_type, read_length, 0);

		if (retval == 0 && *read_type == IO_DIRECT) {
			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_NONE,
			    (int)uio->uio_offset, (int)filesize, *read_type, *read_length, 0);

			goto next_dread;
		}
	}

wait_for_dreads:

	if (retval == 0 && iostate.io_error == 0 && useVectorUPL && vector_upl_index) {
		/* flush any partially accumulated vector UPL before we wait */
		retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
		reset_vector_run_state();
	}

	// We don't need to wait for the I/O to complete
	if (lock) {
		cluster_unlock_direct_read(lock);
	}

	/*
	 * make sure all async reads that are part of this stream
	 * have completed before we return
	 */
	cluster_iostate_wait(&iostate, 0, "cluster_read_direct");

	if (iostate.io_error) {
		retval = iostate.io_error;
	}

	lck_mtx_destroy(&iostate.io_mtxp, &cl_mtx_grp);

	if (io_throttled == TRUE && retval == 0) {
		retval = EAGAIN;
	}

	vm_map_offset_t current_page_size, current_page_mask;
	current_page_size = vm_map_page_size(current_map());
	current_page_mask = vm_map_page_mask(current_map());
	if (uio_acct) {
		/*
		 * walk the portion of the request we actually transferred
		 * (difference between the live uio and the duplicate taken
		 * at entry) and pre-fault each page for pmap accounting
		 */
		off_t bytes_to_prefault = 0, bytes_prefaulted = 0;
		user_addr_t curr_iov_base = 0;
		user_addr_t curr_iov_end = 0;
		user_size_t curr_iov_len = 0;

		bytes_to_prefault = uio_offset(uio) - uio_offset(uio_acct);

		for (; bytes_prefaulted < bytes_to_prefault;) {
			curr_iov_base = uio_curriovbase(uio_acct);
			curr_iov_len = MIN(uio_curriovlen(uio_acct), bytes_to_prefault - bytes_prefaulted);
			curr_iov_end = curr_iov_base + curr_iov_len;

			for (; curr_iov_base < curr_iov_end;) {
				/*
				 * This is specifically done for pmap accounting purposes.
				 * vm_pre_fault() will call vm_fault() to enter the page into
				 * the pmap if there isn't _a_ physical page for that VA already.
				 */
				vm_pre_fault(vm_map_trunc_page(curr_iov_base, current_page_mask), VM_PROT_READ);
				curr_iov_base += current_page_size;
				bytes_prefaulted += current_page_size;
			}
			/*
			 * Use update instead of advance so we can see how many iovs we processed.
			 */
			uio_update(uio_acct, curr_iov_len);
		}
		uio_free(uio_acct);
		uio_acct = NULL;
	}

	if (io_req_size && retval == 0) {
		/*
		 * we couldn't handle the tail of this request in DIRECT mode
		 * so fire it through the copy path
		 */
		if (flags & IO_ENCRYPTED) {
			/*
			 * We cannot fall back to the copy path for encrypted I/O. If this
			 * happens, there is something wrong with the user buffer passed
			 * down.
			 */
			retval = EFAULT;
		} else {
			retval = cluster_read_copy(vp, uio, io_req_size, filesize, flags, callback, callback_arg);
		}

		*read_type = IO_UNKNOWN;
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_END,
	    (int)uio->uio_offset, (int)uio_resid(uio), io_req_size, retval, 0);

	return retval;
}
5400
5401
/*
 * Read path for requests whose destination buffer is physically
 * contiguous (IO_CONTIG) -- the I/O is issued with CL_DEV_MEMORY
 * directly into the wired buffer.  Handles a misaligned "head" and
 * "tail" (relative to devblocksize) via cluster_align_phys_io, and
 * may chain up to MAX_VECTS contiguous iovs before waiting for the
 * async reads to drain.
 */
static int
cluster_read_contig(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t *read_length,
    int (*callback)(buf_t, void *), void *callback_arg, int flags)
{
	upl_page_info_t *pl;
	upl_t upl[MAX_VECTS];
	vm_offset_t upl_offset;
	addr64_t dst_paddr = 0;
	user_addr_t iov_base;
	off_t max_size;
	upl_size_t upl_size;
	vm_size_t upl_needed_size;
	mach_msg_type_number_t pages_in_pl;
	upl_control_flags_t upl_flags;
	kern_return_t kret;
	struct clios iostate;
	int error = 0;
	int cur_upl = 0;
	int num_upl = 0;
	int n;
	u_int32_t xsize;
	u_int32_t io_size;
	u_int32_t devblocksize;
	u_int32_t mem_alignment_mask;
	u_int32_t tail_size = 0;
	int bflag;

	if (flags & IO_PASSIVE) {
		bflag = CL_PASSIVE;
	} else {
		bflag = 0;
	}

	if (flags & IO_NOCACHE) {
		bflag |= CL_NOCACHE;
	}

	/*
	 * When we enter this routine, we know
	 *  -- the read_length will not exceed the current iov_len
	 *  -- the target address is physically contiguous for read_length
	 */
	cluster_syncup(vp, filesize, callback, callback_arg, PUSH_SYNC);

	devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize;
	mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask;

	iostate.io_completed = 0;
	iostate.io_issued = 0;
	iostate.io_error = 0;
	iostate.io_wanted = 0;

	lck_mtx_init(&iostate.io_mtxp, &cl_mtx_grp, LCK_ATTR_NULL);

next_cread:
	io_size = *read_length;

	max_size = filesize - uio->uio_offset;

	if (io_size > max_size) {
		io_size = (u_int32_t)max_size;
	}

	iov_base = uio_curriovbase(uio);

	upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK);
	upl_needed_size = upl_offset + io_size;

	pages_in_pl = 0;
	upl_size = (upl_size_t)upl_needed_size;
	upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;


	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 92)) | DBG_FUNC_START,
	    (int)upl_offset, (int)upl_size, (int)iov_base, io_size, 0);

	vm_map_t map = UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ? current_map() : kernel_map;
	kret = vm_map_get_upl(map,
	    vm_map_trunc_page(iov_base, vm_map_page_mask(map)),
	    &upl_size, &upl[cur_upl], NULL, &pages_in_pl, &upl_flags, VM_KERN_MEMORY_FILE, 0);

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 92)) | DBG_FUNC_END,
	    (int)upl_offset, upl_size, io_size, kret, 0);

	if (kret != KERN_SUCCESS) {
		/*
		 * failed to get pagelist
		 */
		error = EINVAL;
		goto wait_for_creads;
	}
	num_upl++;

	if (upl_size < upl_needed_size) {
		/*
		 * The upl_size wasn't satisfied.
		 */
		error = EINVAL;
		goto wait_for_creads;
	}
	pl = ubc_upl_pageinfo(upl[cur_upl]);

	/* physical address of the start of the destination buffer */
	dst_paddr = ((addr64_t)upl_phys_page(pl, 0) << PAGE_SHIFT) + (addr64_t)upl_offset;

	/*
	 * deal with a file offset that isn't devblocksize aligned (or a
	 * residual smaller than one device block) one "head" chunk at a
	 * time through the aligned-physical helper
	 */
	while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {
		u_int32_t head_size;

		head_size = devblocksize - (u_int32_t)(uio->uio_offset & (devblocksize - 1));

		if (head_size > io_size) {
			head_size = io_size;
		}

		error = cluster_align_phys_io(vp, uio, dst_paddr, head_size, CL_READ, callback, callback_arg);

		if (error) {
			goto wait_for_creads;
		}

		upl_offset += head_size;
		dst_paddr += head_size;
		io_size -= head_size;

		iov_base += head_size;
	}
	if ((u_int32_t)iov_base & mem_alignment_mask) {
		/*
		 * request doesn't set up on a memory boundary
		 * the underlying DMA engine can handle...
		 * return an error instead of going through
		 * the slow copy path since the intent of this
		 * path is direct I/O to device memory
		 */
		error = EINVAL;
		goto wait_for_creads;
	}

	/* trailing partial device block is handled after the async I/Os drain */
	tail_size = io_size & (devblocksize - 1);

	io_size -= tail_size;

	while (io_size && error == 0) {
		if (io_size > MAX_IO_CONTIG_SIZE) {
			xsize = MAX_IO_CONTIG_SIZE;
		} else {
			xsize = io_size;
		}
		/*
		 * request asynchronously so that we can overlap
		 * the preparation of the next I/O... we'll do
		 * the commit after all the I/O has completed
		 * since its all issued against the same UPL
		 * if there are already too many outstanding reads
		 * wait until some have completed before issuing the next
		 */
		cluster_iostate_wait(&iostate, MAX_IO_CONTIG_SIZE * IO_SCALE(vp, 2), "cluster_read_contig");

		if (iostate.io_error) {
			/*
			 * one of the earlier reads we issued ran into a hard error
			 * don't issue any more reads...
			 * go wait for any other reads to complete before
			 * returning the error to the caller
			 */
			goto wait_for_creads;
		}
		error = cluster_io(vp, upl[cur_upl], upl_offset, uio->uio_offset, xsize,
		    CL_READ | CL_NOZERO | CL_DEV_MEMORY | CL_ASYNC | bflag,
		    (buf_t)NULL, &iostate, callback, callback_arg);
		/*
		 * The cluster_io read was issued successfully,
		 * update the uio structure
		 */
		if (error == 0) {
			uio_update(uio, (user_size_t)xsize);

			dst_paddr += xsize;
			upl_offset += xsize;
			io_size -= xsize;
		}
	}
	if (error == 0 && iostate.io_error == 0 && tail_size == 0 && num_upl < MAX_VECTS && uio->uio_offset < filesize) {
		/* see if the next uio segment is also physically contiguous */
		error = cluster_io_type(uio, read_type, read_length, 0);

		if (error == 0 && *read_type == IO_CONTIG) {
			cur_upl++;
			goto next_cread;
		}
	} else {
		*read_type = IO_UNKNOWN;
	}

wait_for_creads:
	/*
	 * make sure all async reads that are part of this stream
	 * have completed before we proceed
	 */
	cluster_iostate_wait(&iostate, 0, "cluster_read_contig");

	if (iostate.io_error) {
		error = iostate.io_error;
	}

	lck_mtx_destroy(&iostate.io_mtxp, &cl_mtx_grp);

	if (error == 0 && tail_size) {
		error = cluster_align_phys_io(vp, uio, dst_paddr, tail_size, CL_READ, callback, callback_arg);
	}

	for (n = 0; n < num_upl; n++) {
		/*
		 * just release our hold on each physically contiguous
		 * region without changing any state
		 */
		ubc_upl_abort(upl[n], 0);
	}

	return error;
}
5621
5622
5623 static int
cluster_io_type(struct uio * uio,int * io_type,u_int32_t * io_length,u_int32_t min_length)5624 cluster_io_type(struct uio *uio, int *io_type, u_int32_t *io_length, u_int32_t min_length)
5625 {
5626 user_size_t iov_len;
5627 user_addr_t iov_base = 0;
5628 upl_t upl;
5629 upl_size_t upl_size;
5630 upl_control_flags_t upl_flags;
5631 int retval = 0;
5632
5633 /*
5634 * skip over any emtpy vectors
5635 */
5636 uio_update(uio, (user_size_t)0);
5637
5638 iov_len = uio_curriovlen(uio);
5639
5640 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 94)) | DBG_FUNC_START, uio, (int)iov_len, 0, 0, 0);
5641
5642 if (iov_len) {
5643 iov_base = uio_curriovbase(uio);
5644 /*
5645 * make sure the size of the vector isn't too big...
5646 * internally, we want to handle all of the I/O in
5647 * chunk sizes that fit in a 32 bit int
5648 */
5649 if (iov_len > (user_size_t)MAX_IO_REQUEST_SIZE) {
5650 upl_size = MAX_IO_REQUEST_SIZE;
5651 } else {
5652 upl_size = (u_int32_t)iov_len;
5653 }
5654
5655 upl_flags = UPL_QUERY_OBJECT_TYPE;
5656
5657 vm_map_t map = UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ? current_map() : kernel_map;
5658 if ((vm_map_get_upl(map,
5659 vm_map_trunc_page(iov_base, vm_map_page_mask(map)),
5660 &upl_size, &upl, NULL, NULL, &upl_flags, VM_KERN_MEMORY_FILE, 0)) != KERN_SUCCESS) {
5661 /*
5662 * the user app must have passed in an invalid address
5663 */
5664 retval = EFAULT;
5665 }
5666 if (upl_size == 0) {
5667 retval = EFAULT;
5668 }
5669
5670 *io_length = upl_size;
5671
5672 if (upl_flags & UPL_PHYS_CONTIG) {
5673 *io_type = IO_CONTIG;
5674 } else if (iov_len >= min_length) {
5675 *io_type = IO_DIRECT;
5676 } else {
5677 *io_type = IO_COPY;
5678 }
5679 } else {
5680 /*
5681 * nothing left to do for this uio
5682 */
5683 *io_length = 0;
5684 *io_type = IO_UNKNOWN;
5685 }
5686 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 94)) | DBG_FUNC_END, iov_base, *io_type, *io_length, retval, 0);
5687
5688 if (*io_type == IO_DIRECT &&
5689 vm_map_page_shift(current_map()) < PAGE_SHIFT) {
5690 /* no direct I/O for sub-page-size address spaces */
5691 DEBUG4K_VFS("io_type IO_DIRECT -> IO_COPY\n");
5692 *io_type = IO_COPY;
5693 }
5694
5695 return retval;
5696 }
5697
5698
5699 /*
5700 * generate advisory I/O's in the largest chunks possible
5701 * the completed pages will be released into the VM cache
5702 */
5703 int
advisory_read(vnode_t vp,off_t filesize,off_t f_offset,int resid)5704 advisory_read(vnode_t vp, off_t filesize, off_t f_offset, int resid)
5705 {
5706 return advisory_read_ext(vp, filesize, f_offset, resid, NULL, NULL, CL_PASSIVE);
5707 }
5708
/*
 * Worker for advisory (speculative) read-ahead: walk the byte range
 * [f_offset, f_offset + resid), skip over pages already resident in
 * the UBC, and issue asynchronous reads (CL_ASYNC | CL_COMMIT | CL_AGE)
 * for the absent pages so they land in the cache ahead of demand.
 * Returns 0 or the first error from cluster_io / parameter validation.
 */
int
advisory_read_ext(vnode_t vp, off_t filesize, off_t f_offset, int resid, int (*callback)(buf_t, void *), void *callback_arg, int bflag)
{
	upl_page_info_t *pl;
	upl_t upl = NULL;
	vm_offset_t upl_offset;
	int upl_size;
	off_t upl_f_offset;
	int start_offset;
	int start_pg;
	int last_pg;
	int pages_in_upl;
	off_t max_size;
	int io_size;
	kern_return_t kret;
	int retval = 0;
	int issued_io;
	int skip_range;
	uint32_t max_io_size;


	if (!UBCINFOEXISTS(vp)) {
		return EINVAL;
	}

	if (f_offset < 0 || resid < 0) {
		return EINVAL;
	}

	max_io_size = cluster_max_io_size(vp->v_mount, CL_READ);

	if (disk_conditioner_mount_is_ssd(vp->v_mount)) {
		/* on SSDs, cap advisory I/O at the speculative prefetch limit */
		if (max_io_size > speculative_prefetch_max_iosize) {
			max_io_size = speculative_prefetch_max_iosize;
		}
	}

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_START,
	    (int)f_offset, resid, (int)filesize, 0, 0);

	while (resid && f_offset < filesize && retval == 0) {
		/*
		 * compute the size of the upl needed to encompass
		 * the requested read... limit each call to cluster_io
		 * to the maximum UPL size... cluster_io will clip if
		 * this exceeds the maximum io_size for the device,
		 * make sure to account for
		 * a starting offset that's not page aligned
		 */
		start_offset = (int)(f_offset & PAGE_MASK_64);
		upl_f_offset = f_offset - (off_t)start_offset;
		max_size = filesize - f_offset;

		if (resid < max_size) {
			io_size = resid;
		} else {
			io_size = (int)max_size;
		}

		upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
		if ((uint32_t)upl_size > max_io_size) {
			upl_size = max_io_size;
		}

		skip_range = 0;
		/*
		 * return the number of contiguously present pages in the cache
		 * starting at upl_f_offset within the file
		 */
		ubc_range_op(vp, upl_f_offset, upl_f_offset + upl_size, UPL_ROP_PRESENT, &skip_range);

		if (skip_range) {
			/*
			 * skip over pages already present in the cache
			 */
			io_size = skip_range - start_offset;

			f_offset += io_size;
			resid -= io_size;

			if (skip_range == upl_size) {
				continue;
			}
			/*
			 * have to issue some real I/O
			 * at this point, we know it's starting on a page boundary
			 * because we've skipped over at least the first page in the request
			 */
			start_offset = 0;
			upl_f_offset += skip_range;
			upl_size -= skip_range;
		}
		pages_in_upl = upl_size / PAGE_SIZE;

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) | DBG_FUNC_START,
		    upl, (int)upl_f_offset, upl_size, start_offset, 0);

		/* UPL_RET_ONLY_ABSENT: only pages NOT already in the cache are returned */
		kret = ubc_create_upl_kernel(vp,
		    upl_f_offset,
		    upl_size,
		    &upl,
		    &pl,
		    UPL_RET_ONLY_ABSENT | UPL_SET_LITE,
		    VM_KERN_MEMORY_FILE);
		if (kret != KERN_SUCCESS) {
			return retval;
		}
		issued_io = 0;

		/*
		 * before we start marching forward, we must make sure we end on
		 * a present page, otherwise we will be working with a freed
		 * upl
		 */
		for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--) {
			if (upl_page_present(pl, last_pg)) {
				break;
			}
		}
		pages_in_upl = last_pg + 1;


		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) | DBG_FUNC_END,
		    upl, (int)upl_f_offset, upl_size, start_offset, 0);


		for (last_pg = 0; last_pg < pages_in_upl;) {
			/*
			 * scan from the beginning of the upl looking for the first
			 * page that is present.... this will become the first page in
			 * the request we're going to make to 'cluster_io'... if all
			 * of the pages are absent, we won't call through to 'cluster_io'
			 */
			for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
				if (upl_page_present(pl, start_pg)) {
					break;
				}
			}

			/*
			 * scan from the starting present page looking for an absent
			 * page before the end of the upl is reached, if we
			 * find one, then it will terminate the range of pages being
			 * presented to 'cluster_io'
			 */
			for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
				if (!upl_page_present(pl, last_pg)) {
					break;
				}
			}

			if (last_pg > start_pg) {
				/*
				 * we found a range of pages that must be filled
				 * if the last page in this range is the last page of the file
				 * we may have to clip the size of it to keep from reading past
				 * the end of the last physical block associated with the file
				 */
				upl_offset = start_pg * PAGE_SIZE;
				io_size = (last_pg - start_pg) * PAGE_SIZE;

				if ((off_t)(upl_f_offset + upl_offset + io_size) > filesize) {
					io_size = (int)(filesize - (upl_f_offset + upl_offset));
				}

				/*
				 * issue an asynchronous read to cluster_io
				 */
				retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size,
				    CL_ASYNC | CL_READ | CL_COMMIT | CL_AGE | bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);

				issued_io = 1;
			}
		}
		if (issued_io == 0) {
			ubc_upl_abort(upl, 0);
		}

		io_size = upl_size - start_offset;

		if (io_size > resid) {
			io_size = resid;
		}
		f_offset += io_size;
		resid -= io_size;
	}

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_END,
	    (int)f_offset, resid, retval, 0, 0);

	return retval;
}
5901
5902
5903 int
cluster_push(vnode_t vp,int flags)5904 cluster_push(vnode_t vp, int flags)
5905 {
5906 return cluster_push_ext(vp, flags, NULL, NULL);
5907 }
5908
5909
5910 int
cluster_push_ext(vnode_t vp,int flags,int (* callback)(buf_t,void *),void * callback_arg)5911 cluster_push_ext(vnode_t vp, int flags, int (*callback)(buf_t, void *), void *callback_arg)
5912 {
5913 return cluster_push_err(vp, flags, callback, callback_arg, NULL);
5914 }
5915
5916 /* write errors via err, but return the number of clusters written */
5917 extern uint32_t system_inshutdown;
5918 uint32_t cl_sparse_push_error = 0;
/*
 * cluster_push_err
 *
 * Clean the write-behind state (fixed cluster array and/or sparse cluster
 * map) attached to 'vp', issuing the writes through cluster_try_push /
 * sparse_cluster_push.  Any I/O error is reported through the optional
 * 'err' out-parameter.  Returns 0 when there was nothing to push;
 * otherwise non-zero (1 for the sparse-map path, the free cluster-slot
 * count from cluster_try_push for the fixed-cluster path).
 */
int
cluster_push_err(vnode_t vp, int flags, int (*callback)(buf_t, void *), void *callback_arg, int *err)
{
	int retval;
	int my_sparse_wait = 0;
	struct cl_writebehind *wbp;
	int local_err = 0;

	if (err) {
		*err = 0;
	}

	if (!UBCINFOEXISTS(vp)) {
		/* no ubc_info => no cached pages for this vnode => nothing to push */
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, kdebug_vnode(vp), flags, 0, -1, 0);
		return 0;
	}
	/* return if deferred write is set */
	if (((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE) && (flags & IO_DEFWRITE)) {
		return 0;
	}
	if ((wbp = cluster_get_wbp(vp, CLW_RETURNLOCKED)) == NULL) {
		/* no write-behind context was ever created for this vnode */
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, kdebug_vnode(vp), flags, 0, -2, 0);
		return 0;
	}
	/* note: cluster_get_wbp returned with wbp->cl_lockw held (CLW_RETURNLOCKED) */
	if (!ISSET(flags, IO_SYNC) && wbp->cl_number == 0 && wbp->cl_scmap == NULL) {
		/* nothing pending and the caller doesn't need sync semantics */
		lck_mtx_unlock(&wbp->cl_lockw);

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, kdebug_vnode(vp), flags, 0, -3, 0);
		return 0;
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_START,
	    wbp->cl_scmap, wbp->cl_number, flags, 0, 0);

	/*
	 * if we have an fsync in progress, we don't want to allow any additional
	 * sync/fsync/close(s) to occur until it finishes.
	 * note that it's possible for writes to continue to occur to this file
	 * while we're waiting and also once the fsync starts to clean if we're
	 * in the sparse map case
	 */
	while (wbp->cl_sparse_wait) {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_START, kdebug_vnode(vp), 0, 0, 0, 0);

		/* msleep drops cl_lockw while sleeping and re-takes it on wakeup */
		msleep((caddr_t)&wbp->cl_sparse_wait, &wbp->cl_lockw, PRIBIO + 1, "cluster_push_ext", NULL);

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_END, kdebug_vnode(vp), 0, 0, 0, 0);
	}
	if (flags & IO_SYNC) {
		/* take ownership of the serialization token for this fsync */
		my_sparse_wait = 1;
		wbp->cl_sparse_wait = 1;

		/*
		 * this is an fsync (or equivalent)... we must wait for any existing async
		 * cleaning operations to complete before we evaluate the current state
		 * and finish cleaning... this ensures that all writes issued before this
		 * fsync actually get cleaned to the disk before this fsync returns
		 */
		while (wbp->cl_sparse_pushes) {
			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 98)) | DBG_FUNC_START, kdebug_vnode(vp), 0, 0, 0, 0);

			msleep((caddr_t)&wbp->cl_sparse_pushes, &wbp->cl_lockw, PRIBIO + 1, "cluster_push_ext", NULL);

			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 98)) | DBG_FUNC_END, kdebug_vnode(vp), 0, 0, 0, 0);
		}
	}
	if (wbp->cl_scmap) {
		void *scmap;

		if (wbp->cl_sparse_pushes < SPARSE_PUSH_LIMIT) {
			/*
			 * detach the sparse map so we can push it without holding
			 * cl_lockw across the actual I/O
			 */
			scmap = wbp->cl_scmap;
			wbp->cl_scmap = NULL;

			wbp->cl_sparse_pushes++;

			lck_mtx_unlock(&wbp->cl_lockw);

			retval = sparse_cluster_push(wbp, &scmap, vp, ubc_getsize(vp), PUSH_ALL, flags, callback, callback_arg, FALSE);

			lck_mtx_lock(&wbp->cl_lockw);

			wbp->cl_sparse_pushes--;

			if (retval) {
				if (wbp->cl_scmap != NULL) {
					/*
					 * panic("cluster_push_err: Expected NULL cl_scmap\n");
					 *
					 * This can happen if we get an error from the underlying FS
					 * e.g. ENOSPC, EPERM or EIO etc. We hope that these errors
					 * are transient and the I/Os will succeed at a later point.
					 *
					 * The tricky part here is that a new sparse cluster has been
					 * allocated and tracking a different set of dirty pages. So these
					 * pages are not going to be pushed out with the next sparse_cluster_push.
					 * An explicit msync or file close will, however, push the pages out.
					 *
					 * What if those calls still don't work? And so, during shutdown we keep
					 * trying till we succeed...
					 */

					if (system_inshutdown) {
						if ((retval == ENOSPC) && (vp->v_mount->mnt_flag & (MNT_LOCAL | MNT_REMOVABLE)) == MNT_LOCAL) {
							os_atomic_inc(&cl_sparse_push_error, relaxed);
						}
					} else {
						vfs_drt_control(&scmap, 0); /* emit stats and free this memory. Dirty pages stay intact. */
						scmap = NULL;
					}
				} else {
					/* re-attach the map so the remaining dirty state isn't lost */
					wbp->cl_scmap = scmap;
				}
			}

			if (wbp->cl_sparse_wait && wbp->cl_sparse_pushes == 0) {
				/* last async pusher out... release any waiting fsync */
				wakeup((caddr_t)&wbp->cl_sparse_pushes);
			}
		} else {
			/* too many concurrent detached pushes... push in place under the lock */
			retval = sparse_cluster_push(wbp, &(wbp->cl_scmap), vp, ubc_getsize(vp), PUSH_ALL, flags, callback, callback_arg, FALSE);
		}

		local_err = retval;

		if (err) {
			*err = retval;
		}
		retval = 1;
	} else {
		retval = cluster_try_push(wbp, vp, ubc_getsize(vp), PUSH_ALL, flags, callback, callback_arg, &local_err, FALSE);
		if (err) {
			*err = local_err;
		}
	}
	lck_mtx_unlock(&wbp->cl_lockw);

	if (flags & IO_SYNC) {
		(void)vnode_waitforwrites(vp, 0, 0, 0, "cluster_push");
	}

	if (my_sparse_wait) {
		/*
		 * I'm the owner of the serialization token
		 * clear it and wakeup anyone that is waiting
		 * for me to finish
		 */
		lck_mtx_lock(&wbp->cl_lockw);

		wbp->cl_sparse_wait = 0;
		wakeup((caddr_t)&wbp->cl_sparse_wait);

		lck_mtx_unlock(&wbp->cl_lockw);
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_END,
	    wbp->cl_scmap, wbp->cl_number, retval, local_err, 0);

	return retval;
}
6075
6076
6077 __private_extern__ void
cluster_release(struct ubc_info * ubc)6078 cluster_release(struct ubc_info *ubc)
6079 {
6080 struct cl_writebehind *wbp;
6081 struct cl_readahead *rap;
6082
6083 if ((wbp = ubc->cl_wbehind)) {
6084 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_START, ubc, wbp->cl_scmap, 0, 0, 0);
6085
6086 if (wbp->cl_scmap) {
6087 vfs_drt_control(&(wbp->cl_scmap), 0);
6088 }
6089 lck_mtx_destroy(&wbp->cl_lockw, &cl_mtx_grp);
6090 zfree(cl_wr_zone, wbp);
6091 ubc->cl_wbehind = NULL;
6092 } else {
6093 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_START, ubc, 0, 0, 0, 0);
6094 }
6095
6096 if ((rap = ubc->cl_rahead)) {
6097 lck_mtx_destroy(&rap->cl_lockr, &cl_mtx_grp);
6098 zfree(cl_rd_zone, rap);
6099 ubc->cl_rahead = NULL;
6100 }
6101
6102 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_END, ubc, rap, wbp, 0, 0);
6103 }
6104
6105
/*
 * cluster_try_push
 *
 * Attempt to push (clean) the clusters recorded in wbp->cl_clusters[].
 * Called with wbp->cl_lockw held.  Returns the number of empty cluster
 * slots available on exit (MAX_CLUSTERS - wbp->cl_number); the first
 * I/O error encountered is reported through the optional 'err'
 * out-parameter.
 */
static int
cluster_try_push(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int push_flag, int io_flags, int (*callback)(buf_t, void *), void *callback_arg, int *err, boolean_t vm_initiated)
{
	int cl_index;
	int cl_index1;
	int min_index;
	int cl_len;
	int cl_pushed = 0;
	struct cl_wextent l_clusters[MAX_CLUSTERS];	/* local sorted snapshot of the clusters */
	u_int max_cluster_pgcount;
	int error = 0;

	max_cluster_pgcount = MAX_CLUSTER_SIZE(vp) / PAGE_SIZE;
	/*
	 * the write behind context exists and has
	 * already been locked...
	 */
	if (wbp->cl_number == 0) {
		/*
		 * no clusters to push
		 * return number of empty slots
		 */
		return MAX_CLUSTERS;
	}

	/*
	 * make a local 'sorted' copy of the clusters
	 * and clear wbp->cl_number so that new clusters can
	 * be developed
	 *
	 * selection sort by b_addr; empty clusters (b_addr == e_addr)
	 * are skipped, and each selected source cluster is marked empty
	 * so it isn't chosen twice
	 */
	for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
		for (min_index = -1, cl_index1 = 0; cl_index1 < wbp->cl_number; cl_index1++) {
			if (wbp->cl_clusters[cl_index1].b_addr == wbp->cl_clusters[cl_index1].e_addr) {
				continue;
			}
			if (min_index == -1) {
				min_index = cl_index1;
			} else if (wbp->cl_clusters[cl_index1].b_addr < wbp->cl_clusters[min_index].b_addr) {
				min_index = cl_index1;
			}
		}
		if (min_index == -1) {
			break;
		}

		l_clusters[cl_index].b_addr = wbp->cl_clusters[min_index].b_addr;
		l_clusters[cl_index].e_addr = wbp->cl_clusters[min_index].e_addr;
		l_clusters[cl_index].io_flags = wbp->cl_clusters[min_index].io_flags;

		wbp->cl_clusters[min_index].b_addr = wbp->cl_clusters[min_index].e_addr;
	}
	wbp->cl_number = 0;

	cl_len = cl_index;

	/* skip switching to the sparse cluster mechanism if on diskimage */
	if (((push_flag & PUSH_DELAY) && cl_len == MAX_CLUSTERS) &&
	    !(vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV)) {
		int i;

		/*
		 * determine if we appear to be writing the file sequentially
		 * if not, by returning without having pushed any clusters
		 * we will cause this vnode to be pushed into the sparse cluster mechanism
		 * used for managing more random I/O patterns
		 *
		 * we know that we've got all clusters currently in use and the next write doesn't fit into one of them...
		 * that's why we're in try_push with PUSH_DELAY...
		 *
		 * check to make sure that all the clusters except the last one are 'full'... and that each cluster
		 * is adjacent to the next (i.e. we're looking for sequential writes) they were sorted above
		 * so we can just make a simple pass through, up to, but not including the last one...
		 * note that e_addr is not inclusive, so it will be equal to the b_addr of the next cluster if they
		 * are sequential
		 *
		 * we let the last one be partial as long as it was adjacent to the previous one...
		 * we need to do this to deal with multi-threaded servers that might write an I/O or 2 out
		 * of order... if this occurs at the tail of the last cluster, we don't want to fall into the sparse cluster world...
		 */
		for (i = 0; i < MAX_CLUSTERS - 1; i++) {
			if ((l_clusters[i].e_addr - l_clusters[i].b_addr) != max_cluster_pgcount) {
				goto dont_try;
			}
			if (l_clusters[i].e_addr != l_clusters[i + 1].b_addr) {
				goto dont_try;
			}
		}
	}
	if (vm_initiated == TRUE) {
		/* pageout-initiated: drop the write-behind lock across the pushes */
		lck_mtx_unlock(&wbp->cl_lockw);
	}

	for (cl_index = 0; cl_index < cl_len; cl_index++) {
		int flags;
		struct cl_extent cl;
		int retval;

		flags = io_flags & (IO_PASSIVE | IO_CLOSE);

		/*
		 * try to push each cluster in turn...
		 * per-cluster CLW_* flags recorded at write time translate
		 * into the corresponding IO_* flags for this push
		 */
		if (l_clusters[cl_index].io_flags & CLW_IONOCACHE) {
			flags |= IO_NOCACHE;
		}

		if (l_clusters[cl_index].io_flags & CLW_IOPASSIVE) {
			flags |= IO_PASSIVE;
		}

		if (push_flag & PUSH_SYNC) {
			flags |= IO_SYNC;
		}

		cl.b_addr = l_clusters[cl_index].b_addr;
		cl.e_addr = l_clusters[cl_index].e_addr;

		retval = cluster_push_now(vp, &cl, EOF, flags, callback, callback_arg, vm_initiated);

		if (retval == 0) {
			cl_pushed++;

			/* mark this local cluster empty so it isn't merged back below */
			l_clusters[cl_index].b_addr = 0;
			l_clusters[cl_index].e_addr = 0;
		} else if (error == 0) {
			error = retval;
		}

		if (!(push_flag & PUSH_ALL)) {
			/* caller only wanted one cluster pushed */
			break;
		}
	}
	if (vm_initiated == TRUE) {
		lck_mtx_lock(&wbp->cl_lockw);
	}

	if (err) {
		*err = error;
	}

dont_try:
	if (cl_len > cl_pushed) {
		/*
		 * we didn't push all of the clusters, so
		 * lets try to merge them back in to the vnode
		 */
		if ((MAX_CLUSTERS - wbp->cl_number) < (cl_len - cl_pushed)) {
			/*
			 * we picked up some new clusters while we were trying to
			 * push the old ones... this can happen because I've dropped
			 * the vnode lock... the sum of the
			 * leftovers plus the new cluster count exceeds our ability
			 * to represent them, so switch to the sparse cluster mechanism
			 *
			 * collect the active public clusters...
			 */
			sparse_cluster_switch(wbp, vp, EOF, callback, callback_arg, vm_initiated);

			for (cl_index = 0, cl_index1 = 0; cl_index < cl_len; cl_index++) {
				if (l_clusters[cl_index].b_addr == l_clusters[cl_index].e_addr) {
					continue;
				}
				wbp->cl_clusters[cl_index1].b_addr = l_clusters[cl_index].b_addr;
				wbp->cl_clusters[cl_index1].e_addr = l_clusters[cl_index].e_addr;
				wbp->cl_clusters[cl_index1].io_flags = l_clusters[cl_index].io_flags;

				cl_index1++;
			}
			/*
			 * update the cluster count
			 */
			wbp->cl_number = cl_index1;

			/*
			 * and collect the original clusters that were moved into the
			 * local storage for sorting purposes
			 */
			sparse_cluster_switch(wbp, vp, EOF, callback, callback_arg, vm_initiated);
		} else {
			/*
			 * we've got room to merge the leftovers back in
			 * just append them starting at the next 'hole'
			 * represented by wbp->cl_number
			 */
			for (cl_index = 0, cl_index1 = wbp->cl_number; cl_index < cl_len; cl_index++) {
				if (l_clusters[cl_index].b_addr == l_clusters[cl_index].e_addr) {
					continue;
				}

				wbp->cl_clusters[cl_index1].b_addr = l_clusters[cl_index].b_addr;
				wbp->cl_clusters[cl_index1].e_addr = l_clusters[cl_index].e_addr;
				wbp->cl_clusters[cl_index1].io_flags = l_clusters[cl_index].io_flags;

				cl_index1++;
			}
			/*
			 * update the cluster count
			 */
			wbp->cl_number = cl_index1;
		}
	}
	return MAX_CLUSTERS - wbp->cl_number;
}
6309
6310
6311
/*
 * cluster_push_now
 *
 * Write out the dirty pages covered by the page extent 'cl'
 * ([b_addr, e_addr) in page units), clipping the I/O so it never
 * extends past EOF.  When vm_initiated is TRUE, the work is handed to
 * vnode_pageout instead of being gathered and issued here directly.
 * Returns 0 on success or the first cluster_io error.
 */
static int
cluster_push_now(vnode_t vp, struct cl_extent *cl, off_t EOF, int flags,
    int (*callback)(buf_t, void *), void *callback_arg, boolean_t vm_initiated)
{
	upl_page_info_t *pl;
	upl_t upl;
	vm_offset_t upl_offset;
	int upl_size;
	off_t upl_f_offset;
	int pages_in_upl;
	int start_pg;
	int last_pg;
	int io_size;
	int io_flags;
	int upl_flags;
	int bflag;		/* CL_* flags carried into every cluster_io call */
	int size;		/* bytes left to clean (clipped at EOF) */
	int error = 0;
	int retval;
	kern_return_t kret;

	if (flags & IO_PASSIVE) {
		bflag = CL_PASSIVE;
	} else {
		bflag = 0;
	}

	if (flags & IO_SKIP_ENCRYPTION) {
		bflag |= CL_ENCRYPTED;
	}

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_START,
	    (int)cl->b_addr, (int)cl->e_addr, (int)EOF, flags, 0);

	if ((pages_in_upl = (int)(cl->e_addr - cl->b_addr)) == 0) {
		/* empty extent... nothing to push */
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 0, 0, 0, 0);

		return 0;
	}
	upl_size = pages_in_upl * PAGE_SIZE;
	upl_f_offset = (off_t)(cl->b_addr * PAGE_SIZE_64);

	if (upl_f_offset + upl_size >= EOF) {
		if (upl_f_offset >= EOF) {
			/*
			 * must have truncated the file and missed
			 * clearing a dangling cluster (i.e. it's completely
			 * beyond the new EOF
			 */
			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 1, 0, 0, 0);

			return 0;
		}
		size = (int)(EOF - upl_f_offset);

		/* round the UPL up to a page boundary even though 'size' isn't */
		upl_size = (size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
		pages_in_upl = upl_size / PAGE_SIZE;
	} else {
		size = upl_size;
	}


	if (vm_initiated) {
		/* let the pager do the gather/clean... error comes back via 'error' */
		vnode_pageout(vp, NULL, (upl_offset_t)0, upl_f_offset, (upl_size_t)upl_size,
		    UPL_MSYNC | UPL_VNODE_PAGER | UPL_KEEPCACHED, &error);

		return error;
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_START, upl_size, size, 0, 0, 0);

	/*
	 * by asking for UPL_COPYOUT_FROM and UPL_RET_ONLY_DIRTY, we get the following desirable behavior
	 *
	 * - only pages that are currently dirty are returned... these are the ones we need to clean
	 * - the hardware dirty bit is cleared when the page is gathered into the UPL... the software dirty bit is set
	 * - if we have to abort the I/O for some reason, the software dirty bit is left set since we didn't clean the page
	 * - when we commit the page, the software dirty bit is cleared... the hardware dirty bit is untouched so that if
	 * someone dirties this page while the I/O is in progress, we don't lose track of the new state
	 *
	 * when the I/O completes, we no longer ask for an explicit clear of the DIRTY state (either soft or hard)
	 */

	if ((vp->v_flag & VNOCACHE_DATA) || (flags & IO_NOCACHE)) {
		upl_flags = UPL_COPYOUT_FROM | UPL_RET_ONLY_DIRTY | UPL_SET_LITE | UPL_WILL_BE_DUMPED;
	} else {
		upl_flags = UPL_COPYOUT_FROM | UPL_RET_ONLY_DIRTY | UPL_SET_LITE;
	}

	kret = ubc_create_upl_kernel(vp,
	    upl_f_offset,
	    upl_size,
	    &upl,
	    &pl,
	    upl_flags,
	    VM_KERN_MEMORY_FILE);
	if (kret != KERN_SUCCESS) {
		panic("cluster_push: failed to get pagelist");
	}

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_END, upl, upl_f_offset, 0, 0, 0);

	/*
	 * since we only asked for the dirty pages back
	 * it's possible that we may only get a few or even none, so...
	 * before we start marching forward, we must make sure we know
	 * where the last present page is in the UPL, otherwise we could
	 * end up working with a freed upl due to the FREE_ON_EMPTY semantics
	 * employed by commit_range and abort_range.
	 */
	for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--) {
		if (upl_page_present(pl, last_pg)) {
			break;
		}
	}
	pages_in_upl = last_pg + 1;

	if (pages_in_upl == 0) {
		/* no dirty pages came back at all */
		ubc_upl_abort(upl, 0);

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 2, 0, 0, 0);
		return 0;
	}

	for (last_pg = 0; last_pg < pages_in_upl;) {
		/*
		 * find the next dirty page in the UPL
		 * this will become the first page in the
		 * next I/O to generate
		 */
		for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
			if (upl_dirty_page(pl, start_pg)) {
				break;
			}
			if (upl_page_present(pl, start_pg)) {
				/*
				 * RET_ONLY_DIRTY will return non-dirty 'precious' pages
				 * just release these unchanged since we're not going
				 * to steal them or change their state
				 */
				ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
			}
		}
		if (start_pg >= pages_in_upl) {
			/*
			 * done... no more dirty pages to push
			 */
			break;
		}
		if (start_pg > last_pg) {
			/*
			 * skipped over some non-dirty pages
			 */
			size -= ((start_pg - last_pg) * PAGE_SIZE);
		}

		/*
		 * find a range of dirty pages to write
		 */
		for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
			if (!upl_dirty_page(pl, last_pg)) {
				break;
			}
		}
		upl_offset = start_pg * PAGE_SIZE;

		/* 'size' caps the final I/O so we don't write past EOF */
		io_size = min(size, (last_pg - start_pg) * PAGE_SIZE);

		io_flags = CL_THROTTLE | CL_COMMIT | CL_AGE | bflag;

		if (!(flags & IO_SYNC)) {
			io_flags |= CL_ASYNC;
		}

		if (flags & IO_CLOSE) {
			io_flags |= CL_CLOSE;
		}

		if (flags & IO_NOCACHE) {
			io_flags |= CL_NOCACHE;
		}

		retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size,
		    io_flags, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);

		/* remember the first error but keep pushing the remaining ranges */
		if (error == 0 && retval) {
			error = retval;
		}

		size -= io_size;
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 3, error, 0, 0);

	return error;
}
6506
6507
6508 /*
6509 * sparse_cluster_switch is called with the write behind lock held
6510 */
6511 static int
sparse_cluster_switch(struct cl_writebehind * wbp,vnode_t vp,off_t EOF,int (* callback)(buf_t,void *),void * callback_arg,boolean_t vm_initiated)6512 sparse_cluster_switch(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int (*callback)(buf_t, void *), void *callback_arg, boolean_t vm_initiated)
6513 {
6514 int cl_index;
6515 int error = 0;
6516
6517 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_START, kdebug_vnode(vp), wbp->cl_scmap, wbp->cl_number, 0, 0);
6518
6519 for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
6520 int flags;
6521 struct cl_extent cl;
6522
6523 for (cl.b_addr = wbp->cl_clusters[cl_index].b_addr; cl.b_addr < wbp->cl_clusters[cl_index].e_addr; cl.b_addr++) {
6524 if (ubc_page_op(vp, (off_t)(cl.b_addr * PAGE_SIZE_64), 0, NULL, &flags) == KERN_SUCCESS) {
6525 if (flags & UPL_POP_DIRTY) {
6526 cl.e_addr = cl.b_addr + 1;
6527
6528 error = sparse_cluster_add(wbp, &(wbp->cl_scmap), vp, &cl, EOF, callback, callback_arg, vm_initiated);
6529
6530 if (error) {
6531 break;
6532 }
6533 }
6534 }
6535 }
6536 }
6537 wbp->cl_number -= cl_index;
6538
6539 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_END, kdebug_vnode(vp), wbp->cl_scmap, wbp->cl_number, error, 0);
6540
6541 return error;
6542 }
6543
6544
6545 /*
6546 * sparse_cluster_push must be called with the write-behind lock held if the scmap is
6547 * still associated with the write-behind context... however, if the scmap has been disassociated
6548 * from the write-behind context (the cluster_push case), the wb lock is not held
6549 */
6550 static int
sparse_cluster_push(struct cl_writebehind * wbp,void ** scmap,vnode_t vp,off_t EOF,int push_flag,int io_flags,int (* callback)(buf_t,void *),void * callback_arg,boolean_t vm_initiated)6551 sparse_cluster_push(struct cl_writebehind *wbp, void **scmap, vnode_t vp, off_t EOF, int push_flag,
6552 int io_flags, int (*callback)(buf_t, void *), void *callback_arg, boolean_t vm_initiated)
6553 {
6554 struct cl_extent cl;
6555 off_t offset;
6556 u_int length;
6557 void *l_scmap;
6558 int error = 0;
6559
6560 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_START, kdebug_vnode(vp), (*scmap), 0, push_flag, 0);
6561
6562 if (push_flag & PUSH_ALL) {
6563 vfs_drt_control(scmap, 1);
6564 }
6565
6566 l_scmap = *scmap;
6567
6568 for (;;) {
6569 int retval;
6570
6571 if (vfs_drt_get_cluster(scmap, &offset, &length) != KERN_SUCCESS) {
6572 /*
6573 * Not finding anything to push will return KERN_FAILURE.
6574 * Confusing since it isn't really a failure. But that's the
6575 * reason we don't set 'error' here like we do below.
6576 */
6577 break;
6578 }
6579
6580 if (vm_initiated == TRUE) {
6581 lck_mtx_unlock(&wbp->cl_lockw);
6582 }
6583
6584 cl.b_addr = (daddr64_t)(offset / PAGE_SIZE_64);
6585 cl.e_addr = (daddr64_t)((offset + length) / PAGE_SIZE_64);
6586
6587 retval = cluster_push_now(vp, &cl, EOF, io_flags, callback, callback_arg, vm_initiated);
6588 if (error == 0 && retval) {
6589 error = retval;
6590 }
6591
6592 if (vm_initiated == TRUE) {
6593 lck_mtx_lock(&wbp->cl_lockw);
6594
6595 if (*scmap != l_scmap) {
6596 break;
6597 }
6598 }
6599
6600 if (error) {
6601 if (vfs_drt_mark_pages(scmap, offset, length, NULL) != KERN_SUCCESS) {
6602 panic("Failed to restore dirty state on failure");
6603 }
6604
6605 break;
6606 }
6607
6608 if (!(push_flag & PUSH_ALL)) {
6609 break;
6610 }
6611 }
6612 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_END, kdebug_vnode(vp), (*scmap), error, 0, 0);
6613
6614 return error;
6615 }
6616
6617
6618 /*
6619 * sparse_cluster_add is called with the write behind lock held
6620 */
6621 static int
sparse_cluster_add(struct cl_writebehind * wbp,void ** scmap,vnode_t vp,struct cl_extent * cl,off_t EOF,int (* callback)(buf_t,void *),void * callback_arg,boolean_t vm_initiated)6622 sparse_cluster_add(struct cl_writebehind *wbp, void **scmap, vnode_t vp, struct cl_extent *cl, off_t EOF,
6623 int (*callback)(buf_t, void *), void *callback_arg, boolean_t vm_initiated)
6624 {
6625 u_int new_dirty;
6626 u_int length;
6627 off_t offset;
6628 int error = 0;
6629 int push_flag = 0; /* Is this a valid value? */
6630
6631 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_START, (*scmap), 0, cl->b_addr, (int)cl->e_addr, 0);
6632
6633 offset = (off_t)(cl->b_addr * PAGE_SIZE_64);
6634 length = ((u_int)(cl->e_addr - cl->b_addr)) * PAGE_SIZE;
6635
6636 while (vfs_drt_mark_pages(scmap, offset, length, &new_dirty) != KERN_SUCCESS) {
6637 /*
6638 * no room left in the map
6639 * only a partial update was done
6640 * push out some pages and try again
6641 */
6642
6643 if (vfs_get_scmap_push_behavior_internal(scmap, &push_flag)) {
6644 push_flag = 0;
6645 }
6646
6647 error = sparse_cluster_push(wbp, scmap, vp, EOF, push_flag, 0, callback, callback_arg, vm_initiated);
6648
6649 if (error) {
6650 break;
6651 }
6652
6653 offset += (new_dirty * PAGE_SIZE_64);
6654 length -= (new_dirty * PAGE_SIZE);
6655 }
6656 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_END, kdebug_vnode(vp), (*scmap), error, 0, 0);
6657
6658 return error;
6659 }
6660
6661
/*
 * cluster_align_phys_io
 *
 * Handle a transfer whose user buffer is physically addressed but not
 * page aligned by staging it through the UBC page covering uio_offset:
 * read the page in if it isn't already valid, copy 'xsize' bytes between
 * the user physical address and the page with copypv, then write the
 * page back for writes (or for reads that landed on an already-dirty
 * page).  On success the uio is advanced by xsize.
 */
static int
cluster_align_phys_io(vnode_t vp, struct uio *uio, addr64_t usr_paddr, u_int32_t xsize, int flags, int (*callback)(buf_t, void *), void *callback_arg)
{
	upl_page_info_t *pl;
	upl_t upl;
	addr64_t ubc_paddr;
	kern_return_t kret;
	int error = 0;
	int did_read = 0;
	int abort_flags;
	int upl_flags;
	int bflag;

	if (flags & IO_PASSIVE) {
		bflag = CL_PASSIVE;
	} else {
		bflag = 0;
	}

	if (flags & IO_NOCACHE) {
		bflag |= CL_NOCACHE;
	}

	upl_flags = UPL_SET_LITE;

	if (!(flags & CL_READ)) {
		/*
		 * "write" operation: let the UPL subsystem know
		 * that we intend to modify the buffer cache pages
		 * we're gathering.
		 */
		upl_flags |= UPL_WILL_MODIFY;
	} else {
		/*
		 * indicate that there is no need to pull the
		 * mapping for this page... we're only going
		 * to read from it, not modify it.
		 */
		upl_flags |= UPL_FILE_IO;
	}
	/* gather the single page containing uio_offset */
	kret = ubc_create_upl_kernel(vp,
	    uio->uio_offset & ~PAGE_MASK_64,
	    PAGE_SIZE,
	    &upl,
	    &pl,
	    upl_flags,
	    VM_KERN_MEMORY_FILE);

	if (kret != KERN_SUCCESS) {
		return EINVAL;
	}

	if (!upl_valid_page(pl, 0)) {
		/*
		 * issue a synchronous read to cluster_io
		 */
		error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE,
		    CL_READ | bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
		if (error) {
			ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);

			return error;
		}
		did_read = 1;
	}
	ubc_paddr = ((addr64_t)upl_phys_page(pl, 0) << PAGE_SHIFT) + (addr64_t)(uio->uio_offset & PAGE_MASK_64);

	/*
	 * NOTE: There is no prototype for the following in BSD. It, and the definitions
	 * of the defines for cppvPsrc, cppvPsnk, cppvFsnk, and cppvFsrc will be found in
	 * osfmk/ppc/mappings.h. They are not included here because there appears to be no
	 * way to do so without exporting them to kexts as well.
	 */
	if (flags & CL_READ) {
//		copypv(ubc_paddr, usr_paddr, xsize, cppvPsrc | cppvPsnk | cppvFsnk);	/* Copy physical to physical and flush the destination */
		copypv(ubc_paddr, usr_paddr, xsize, 2 | 1 | 4);         /* Copy physical to physical and flush the destination */
	} else {
//		copypv(usr_paddr, ubc_paddr, xsize, cppvPsrc | cppvPsnk | cppvFsrc);	/* Copy physical to physical and flush the source */
		copypv(usr_paddr, ubc_paddr, xsize, 2 | 1 | 8);         /* Copy physical to physical and flush the source */
	}
	if (!(flags & CL_READ) || (upl_valid_page(pl, 0) && upl_dirty_page(pl, 0))) {
		/*
		 * issue a synchronous write to cluster_io
		 */
		error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE,
		    bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
	}
	if (error == 0) {
		uio_update(uio, (user_size_t)xsize);
	}

	if (did_read) {
		abort_flags = UPL_ABORT_FREE_ON_EMPTY;
	} else {
		/* page wasn't ours to begin with... dump it rather than keep a stale copy */
		abort_flags = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;
	}

	ubc_upl_abort_range(upl, 0, PAGE_SIZE, abort_flags);

	return error;
}
6763
/*
 * cluster_copy_upl_data
 *
 * Copy up to *io_resid bytes between the pages of 'upl' (starting at
 * upl_offset) and 'uio' using physical-address uiomove64.  The uio's
 * segment flag is temporarily switched to the matching UIO_PHYS_*
 * variant for the duration of the copy and restored afterwards.
 * *io_resid is updated to the bytes NOT copied.  On writes, pages that
 * were not already dirty are counted for logical-write accounting.
 */
int
cluster_copy_upl_data(struct uio *uio, upl_t upl, int upl_offset, int *io_resid)
{
	int pg_offset;
	int pg_index;
	int csize;		/* bytes to copy from the current page */
	int segflg;		/* caller's segment flag, restored on exit */
	int retval = 0;
	int xsize;		/* bytes remaining to copy */
	upl_page_info_t *pl;
	int dirty_count;

	xsize = *io_resid;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
	    (int)uio->uio_offset, upl_offset, xsize, 0, 0);

	segflg = uio->uio_segflg;

	switch (segflg) {
	case UIO_USERSPACE32:
	case UIO_USERISPACE32:
		uio->uio_segflg = UIO_PHYS_USERSPACE32;
		break;

	case UIO_USERSPACE:
	case UIO_USERISPACE:
		uio->uio_segflg = UIO_PHYS_USERSPACE;
		break;

	case UIO_USERSPACE64:
	case UIO_USERISPACE64:
		uio->uio_segflg = UIO_PHYS_USERSPACE64;
		break;

	case UIO_SYSSPACE:
		uio->uio_segflg = UIO_PHYS_SYSSPACE;
		break;
	}
	pl = ubc_upl_pageinfo(upl);

	pg_index = upl_offset / PAGE_SIZE;
	pg_offset = upl_offset & PAGE_MASK;
	/* first page may be partial */
	csize = min(PAGE_SIZE - pg_offset, xsize);

	dirty_count = 0;
	while (xsize && retval == 0) {
		addr64_t paddr;

		paddr = ((addr64_t)upl_phys_page(pl, pg_index) << PAGE_SHIFT) + pg_offset;
		if ((uio->uio_rw == UIO_WRITE) && (upl_dirty_page(pl, pg_index) == FALSE)) {
			/* this write dirties a previously-clean page */
			dirty_count++;
		}

		retval = uiomove64(paddr, csize, uio);

		pg_index += 1;
		pg_offset = 0;
		xsize -= csize;
		csize = min(PAGE_SIZE, xsize);
	}
	*io_resid = xsize;

	uio->uio_segflg = segflg;

	if (dirty_count) {
		task_update_logical_writes(current_task(), (dirty_count * PAGE_SIZE), TASK_WRITE_DEFERRED, upl_lookup_vnode(upl));
	}

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
	    (int)uio->uio_offset, xsize, retval, segflg, 0);

	return retval;
}
6838
6839
6840 int
cluster_copy_ubc_data(vnode_t vp,struct uio * uio,int * io_resid,int mark_dirty)6841 cluster_copy_ubc_data(vnode_t vp, struct uio *uio, int *io_resid, int mark_dirty)
6842 {
6843 return cluster_copy_ubc_data_internal(vp, uio, io_resid, mark_dirty, 1);
6844 }
6845
6846
/*
 * cluster_copy_ubc_data_internal
 *
 * Copy up to *io_resid bytes between 'uio' and the file's cached pages
 * via memory_object_control_uiomove.  Returns 0 (with *io_resid
 * untouched) if the vnode has no pager control object.  The uio's
 * segment flag is switched to the matching UIO_PHYS_* variant for the
 * copy and restored on exit.  *io_resid is updated to the bytes NOT
 * copied; write traffic is charged to the task's logical writes.
 */
static int
cluster_copy_ubc_data_internal(vnode_t vp, struct uio *uio, int *io_resid, int mark_dirty, int take_reference)
{
	int segflg;		/* caller's segment flag, restored on exit */
	int io_size;
	int xsize;
	int start_offset;
	int retval = 0;
	memory_object_control_t control;

	io_size = *io_resid;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
	    (int)uio->uio_offset, io_size, mark_dirty, take_reference, 0);

	control = ubc_getobject(vp, UBC_FLAGS_NONE);

	if (control == MEMORY_OBJECT_CONTROL_NULL) {
		/* no pager => nothing cached to copy from/to */
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
		    (int)uio->uio_offset, io_size, retval, 3, 0);

		return 0;
	}
	segflg = uio->uio_segflg;

	switch (segflg) {
	case UIO_USERSPACE32:
	case UIO_USERISPACE32:
		uio->uio_segflg = UIO_PHYS_USERSPACE32;
		break;

	case UIO_USERSPACE64:
	case UIO_USERISPACE64:
		uio->uio_segflg = UIO_PHYS_USERSPACE64;
		break;

	case UIO_USERSPACE:
	case UIO_USERISPACE:
		uio->uio_segflg = UIO_PHYS_USERSPACE;
		break;

	case UIO_SYSSPACE:
		uio->uio_segflg = UIO_PHYS_SYSSPACE;
		break;
	}

	if ((io_size = *io_resid)) {
		start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
		xsize = (int)uio_resid(uio);

		retval = memory_object_control_uiomove(control, uio->uio_offset - start_offset, uio,
		    start_offset, io_size, mark_dirty, take_reference);
		/* xsize becomes the number of bytes actually moved */
		xsize -= uio_resid(uio);

		int num_bytes_copied = xsize;
		if (num_bytes_copied && uio_rw(uio)) {
			task_update_logical_writes(current_task(), num_bytes_copied, TASK_WRITE_DEFERRED, vp);
		}
		io_size -= xsize;
	}
	uio->uio_segflg = segflg;
	*io_resid = io_size;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
	    (int)uio->uio_offset, io_size, retval, 0x80000000 | segflg, 0);

	return retval;
}
6915
6916
6917 int
is_file_clean(vnode_t vp,off_t filesize)6918 is_file_clean(vnode_t vp, off_t filesize)
6919 {
6920 off_t f_offset;
6921 int flags;
6922 int total_dirty = 0;
6923
6924 for (f_offset = 0; f_offset < filesize; f_offset += PAGE_SIZE_64) {
6925 if (ubc_page_op(vp, f_offset, 0, NULL, &flags) == KERN_SUCCESS) {
6926 if (flags & UPL_POP_DIRTY) {
6927 total_dirty++;
6928 }
6929 }
6930 }
6931 if (total_dirty) {
6932 return EINVAL;
6933 }
6934
6935 return 0;
6936 }
6937
6938
6939
6940 /*
6941 * Dirty region tracking/clustering mechanism.
6942 *
6943 * This code (vfs_drt_*) provides a mechanism for tracking and clustering
6944 * dirty regions within a larger space (file). It is primarily intended to
6945 * support clustering in large files with many dirty areas.
6946 *
6947 * The implementation assumes that the dirty regions are pages.
6948 *
6949 * To represent dirty pages within the file, we store bit vectors in a
6950 * variable-size circular hash.
6951 */
6952
6953 /*
6954 * Bitvector size. This determines the number of pages we group in a
6955 * single hashtable entry. Each hashtable entry is aligned to this
6956 * size within the file.
6957 */
6958 #define DRT_BITVECTOR_PAGES ((1024 * 256) / PAGE_SIZE)
6959
6960 /*
6961 * File offset handling.
6962 *
6963 * DRT_ADDRESS_MASK is dependent on DRT_BITVECTOR_PAGES;
6964 * the correct formula is (~((DRT_BITVECTOR_PAGES * PAGE_SIZE) - 1))
6965 */
6966 #define DRT_ADDRESS_MASK (~((DRT_BITVECTOR_PAGES * PAGE_SIZE) - 1))
6967 #define DRT_ALIGN_ADDRESS(addr) ((addr) & DRT_ADDRESS_MASK)
6968
6969 /*
6970 * Hashtable address field handling.
6971 *
6972 * The low-order bits of the hashtable address are used to conserve
6973 * space.
6974 *
6975 * DRT_HASH_COUNT_MASK must be large enough to store the range
6976 * 0-DRT_BITVECTOR_PAGES inclusive, as well as have one value
6977 * to indicate that the bucket is actually unoccupied.
6978 */
6979 #define DRT_HASH_GET_ADDRESS(scm, i) ((scm)->scm_hashtable[(i)].dhe_control & DRT_ADDRESS_MASK)
6980 #define DRT_HASH_SET_ADDRESS(scm, i, a) \
6981 do { \
6982 (scm)->scm_hashtable[(i)].dhe_control = \
6983 ((scm)->scm_hashtable[(i)].dhe_control & ~DRT_ADDRESS_MASK) | DRT_ALIGN_ADDRESS(a); \
6984 } while (0)
6985 #define DRT_HASH_COUNT_MASK 0x1ff
6986 #define DRT_HASH_GET_COUNT(scm, i) ((scm)->scm_hashtable[(i)].dhe_control & DRT_HASH_COUNT_MASK)
6987 #define DRT_HASH_SET_COUNT(scm, i, c) \
6988 do { \
6989 (scm)->scm_hashtable[(i)].dhe_control = \
6990 ((scm)->scm_hashtable[(i)].dhe_control & ~DRT_HASH_COUNT_MASK) | ((c) & DRT_HASH_COUNT_MASK); \
6991 } while (0)
6992 #define DRT_HASH_CLEAR(scm, i) \
6993 do { \
6994 (scm)->scm_hashtable[(i)].dhe_control = 0; \
6995 } while (0)
6996 #define DRT_HASH_VACATE(scm, i) DRT_HASH_SET_COUNT((scm), (i), DRT_HASH_COUNT_MASK)
6997 #define DRT_HASH_VACANT(scm, i) (DRT_HASH_GET_COUNT((scm), (i)) == DRT_HASH_COUNT_MASK)
/*
 * Copy one hash entry (control word + dirty bitvector) from bucket
 * 'oi' of map 'oscm' into bucket 'i' of map 'scm'.
 *
 * Note: the do { ... } while (0) must NOT carry a trailing semicolon;
 * with one, the macro plus the call-site ';' expands to two statements,
 * which breaks use in unbraced if/else bodies.
 */
#define DRT_HASH_COPY(oscm, oi, scm, i) \
	do { \
	        (scm)->scm_hashtable[(i)].dhe_control = (oscm)->scm_hashtable[(oi)].dhe_control; \
	        DRT_BITVECTOR_COPY(oscm, oi, scm, i); \
	} while (0)
7003
7004
7005 #if !defined(XNU_TARGET_OS_OSX)
7006 /*
7007 * Hash table moduli.
7008 *
7009 * Since the hashtable entry's size is dependent on the size of
7010 * the bitvector, and since the hashtable size is constrained to
7011 * both being prime and fitting within the desired allocation
7012 * size, these values need to be manually determined.
7013 *
7014 * For DRT_BITVECTOR_SIZE = 64, the entry size is 16 bytes.
7015 *
7016 * The small hashtable allocation is 4096 bytes, so the modulus is 251.
7017 * The large hashtable allocation is 32768 bytes, so the modulus is 2039.
7018 * The xlarge hashtable allocation is 131072 bytes, so the modulus is 8179.
7019 */
7020
7021 #define DRT_HASH_SMALL_MODULUS 251
7022 #define DRT_HASH_LARGE_MODULUS 2039
7023 #define DRT_HASH_XLARGE_MODULUS 8179
7024
7025 /*
7026 * Physical memory required before the large hash modulus is permitted.
7027 *
 * On small memory systems, the large hash modulus can lead to physical
7029 * memory starvation, so we avoid using it there.
7030 */
7031 #define DRT_HASH_LARGE_MEMORY_REQUIRED (1024LL * 1024LL * 1024LL) /* 1GiB */
7032 #define DRT_HASH_XLARGE_MEMORY_REQUIRED (8 * 1024LL * 1024LL * 1024LL) /* 8GiB */
7033
7034 #define DRT_SMALL_ALLOCATION 4096 /* 80 bytes spare */
7035 #define DRT_LARGE_ALLOCATION 32768 /* 144 bytes spare */
7036 #define DRT_XLARGE_ALLOCATION 131072 /* 208 bytes spare */
7037
7038 #else /* XNU_TARGET_OS_OSX */
7039 /*
7040 * Hash table moduli.
7041 *
7042 * Since the hashtable entry's size is dependent on the size of
7043 * the bitvector, and since the hashtable size is constrained to
7044 * both being prime and fitting within the desired allocation
7045 * size, these values need to be manually determined.
7046 *
7047 * For DRT_BITVECTOR_SIZE = 64, the entry size is 16 bytes.
7048 *
7049 * The small hashtable allocation is 16384 bytes, so the modulus is 1019.
7050 * The large hashtable allocation is 131072 bytes, so the modulus is 8179.
7051 * The xlarge hashtable allocation is 524288 bytes, so the modulus is 32749.
7052 */
7053
7054 #define DRT_HASH_SMALL_MODULUS 1019
7055 #define DRT_HASH_LARGE_MODULUS 8179
7056 #define DRT_HASH_XLARGE_MODULUS 32749
7057
7058 /*
7059 * Physical memory required before the large hash modulus is permitted.
7060 *
 * On small memory systems, the large hash modulus can lead to physical
7062 * memory starvation, so we avoid using it there.
7063 */
7064 #define DRT_HASH_LARGE_MEMORY_REQUIRED (4 * 1024LL * 1024LL * 1024LL) /* 4GiB */
7065 #define DRT_HASH_XLARGE_MEMORY_REQUIRED (32 * 1024LL * 1024LL * 1024LL) /* 32GiB */
7066
7067 #define DRT_SMALL_ALLOCATION 16384 /* 80 bytes spare */
7068 #define DRT_LARGE_ALLOCATION 131072 /* 208 bytes spare */
7069 #define DRT_XLARGE_ALLOCATION 524288 /* 304 bytes spare */
7070
7071 #endif /* ! XNU_TARGET_OS_OSX */
7072
7073 /* *** nothing below here has secret dependencies on DRT_BITVECTOR_PAGES *** */
7074
7075 /*
7076 * Hashtable entry.
7077 */
struct vfs_drt_hashentry {
	/* packed aligned file offset + page count; see DRT_HASH_GET/SET_* macros */
	u_int64_t dhe_control;
	/*
	 * dhe_bitvector was declared as dhe_bitvector[DRT_BITVECTOR_PAGES / 32];
	 * DRT_BITVECTOR_PAGES is defined as ((1024 * 256) / PAGE_SIZE)
	 * Since PAGE_SIZE is only known at boot time,
	 * -define MAX_DRT_BITVECTOR_PAGES for smallest supported page size (4k)
	 * -declare dhe_bitvector array for largest possible length
	 */
#define MAX_DRT_BITVECTOR_PAGES (1024 * 256)/( 4 * 1024)
	/* one bit per page in the entry's range; manipulated via DRT_HASH_*_BIT */
	u_int32_t dhe_bitvector[MAX_DRT_BITVECTOR_PAGES / 32];
};
7090
7091 /*
7092 * Hashtable bitvector handling.
7093 *
7094 * Bitvector fields are 32 bits long.
7095 */
7096
7097 #define DRT_HASH_SET_BIT(scm, i, bit) \
7098 (scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] |= (1 << ((bit) % 32))
7099
7100 #define DRT_HASH_CLEAR_BIT(scm, i, bit) \
7101 (scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] &= ~(1 << ((bit) % 32))
7102
7103 #define DRT_HASH_TEST_BIT(scm, i, bit) \
7104 ((scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] & (1 << ((bit) % 32)))
7105
7106 #define DRT_BITVECTOR_CLEAR(scm, i) \
7107 bzero(&(scm)->scm_hashtable[(i)].dhe_bitvector[0], (MAX_DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t))
7108
7109 #define DRT_BITVECTOR_COPY(oscm, oi, scm, i) \
7110 bcopy(&(oscm)->scm_hashtable[(oi)].dhe_bitvector[0], \
7111 &(scm)->scm_hashtable[(i)].dhe_bitvector[0], \
7112 (MAX_DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t))
7113
7114 /*
7115 * Dirty Region Tracking structure.
7116 *
7117 * The hashtable is allocated entirely inside the DRT structure.
7118 *
7119 * The hash is a simple circular prime modulus arrangement, the structure
7120 * is resized from small to large if it overflows.
7121 */
7122
struct vfs_drt_clustermap {
	u_int32_t scm_magic;            /* sanity/detection */
#define DRT_SCM_MAGIC 0x12020003
	u_int32_t scm_modulus;          /* current ring size */
	u_int32_t scm_buckets;          /* number of occupied buckets */
	u_int32_t scm_lastclean;        /* last entry we cleaned */
	u_int32_t scm_iskips;           /* number of slot skips */

	/* hashtable is allocated inline with the map; sized by scm_modulus */
	struct vfs_drt_hashentry scm_hashtable[0];
};
7133
7134
7135 #define DRT_HASH(scm, addr) ((addr) % (scm)->scm_modulus)
7136 #define DRT_HASH_NEXT(scm, addr) (((addr) + 1) % (scm)->scm_modulus)
7137
7138 /*
7139 * Debugging codes and arguments.
7140 */
7141 #define DRT_DEBUG_EMPTYFREE (FSDBG_CODE(DBG_FSRW, 82)) /* nil */
7142 #define DRT_DEBUG_RETCLUSTER (FSDBG_CODE(DBG_FSRW, 83)) /* offset, length */
7143 #define DRT_DEBUG_ALLOC (FSDBG_CODE(DBG_FSRW, 84)) /* copycount */
7144 #define DRT_DEBUG_INSERT (FSDBG_CODE(DBG_FSRW, 85)) /* offset, iskip */
7145 #define DRT_DEBUG_MARK (FSDBG_CODE(DBG_FSRW, 86)) /* offset, length,
7146 * dirty */
7147 /* 0, setcount */
7148 /* 1 (clean, no map) */
7149 /* 2 (map alloc fail) */
7150 /* 3, resid (partial) */
7151 #define DRT_DEBUG_6 (FSDBG_CODE(DBG_FSRW, 87))
7152 #define DRT_DEBUG_SCMDATA (FSDBG_CODE(DBG_FSRW, 88)) /* modulus, buckets,
7153 * lastclean, iskips */
7154
7155
7156 static kern_return_t vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp);
7157 static kern_return_t vfs_drt_free_map(struct vfs_drt_clustermap *cmap);
7158 static kern_return_t vfs_drt_search_index(struct vfs_drt_clustermap *cmap,
7159 u_int64_t offset, int *indexp);
7160 static kern_return_t vfs_drt_get_index(struct vfs_drt_clustermap **cmapp,
7161 u_int64_t offset,
7162 int *indexp,
7163 int recursed);
7164 static kern_return_t vfs_drt_do_mark_pages(
7165 void **cmapp,
7166 u_int64_t offset,
7167 u_int length,
7168 u_int *setcountp,
7169 int dirty);
7170 static void vfs_drt_trace(
7171 struct vfs_drt_clustermap *cmap,
7172 int code,
7173 int arg1,
7174 int arg2,
7175 int arg3,
7176 int arg4);
7177
7178
7179 /*
7180 * Allocate and initialise a sparse cluster map.
7181 *
7182 * Will allocate a new map, resize or compact an existing map.
7183 *
7184 * XXX we should probably have at least one intermediate map size,
7185 * as the 1:16 ratio seems a bit drastic.
7186 */
static kern_return_t
vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp)
{
	struct vfs_drt_clustermap *cmap = NULL, *ocmap = NULL;
	kern_return_t kret = KERN_SUCCESS;
	u_int64_t offset = 0;
	u_int32_t i = 0;
	int modulus_size = 0, map_size = 0, active_buckets = 0, index = 0, copycount = 0;

	ocmap = NULL;
	if (cmapp != NULL) {
		ocmap = *cmapp;
	}

	/*
	 * Decide on the size of the new map.
	 */
	if (ocmap == NULL) {
		/* fresh allocation: always start with the small table */
		modulus_size = DRT_HASH_SMALL_MODULUS;
		map_size = DRT_SMALL_ALLOCATION;
	} else {
		/* count the number of active buckets in the old map */
		active_buckets = 0;
		for (i = 0; i < ocmap->scm_modulus; i++) {
			if (!DRT_HASH_VACANT(ocmap, i) &&
			    (DRT_HASH_GET_COUNT(ocmap, i) != 0)) {
				active_buckets++;
			}
		}
		/*
		 * If we're currently using the small allocation, check to
		 * see whether we should grow to the large one.
		 */
		if (ocmap->scm_modulus == DRT_HASH_SMALL_MODULUS) {
			/*
			 * If the ring is nearly full and we are allowed to
			 * use the large modulus, upgrade.
			 */
			if ((active_buckets > (DRT_HASH_SMALL_MODULUS - 5)) &&
			    (max_mem >= DRT_HASH_LARGE_MEMORY_REQUIRED)) {
				modulus_size = DRT_HASH_LARGE_MODULUS;
				map_size = DRT_LARGE_ALLOCATION;
			} else {
				/* same size: this is a compaction, not a grow */
				modulus_size = DRT_HASH_SMALL_MODULUS;
				map_size = DRT_SMALL_ALLOCATION;
			}
		} else if (ocmap->scm_modulus == DRT_HASH_LARGE_MODULUS) {
			if ((active_buckets > (DRT_HASH_LARGE_MODULUS - 5)) &&
			    (max_mem >= DRT_HASH_XLARGE_MEMORY_REQUIRED)) {
				modulus_size = DRT_HASH_XLARGE_MODULUS;
				map_size = DRT_XLARGE_ALLOCATION;
			} else {
				/*
				 * If the ring is completely full and we can't
				 * expand, there's nothing useful for us to do.
				 * Behave as though we had compacted into the new
				 * array and return.
				 */
				return KERN_SUCCESS;
			}
		} else {
			/* already using the xlarge modulus */
			modulus_size = DRT_HASH_XLARGE_MODULUS;
			map_size = DRT_XLARGE_ALLOCATION;

			/*
			 * If the ring is completely full, there's
			 * nothing useful for us to do.  Behave as
			 * though we had compacted into the new
			 * array and return.
			 */
			if (active_buckets >= DRT_HASH_XLARGE_MODULUS) {
				return KERN_SUCCESS;
			}
		}
	}

	/*
	 * Allocate and initialise the new map.
	 */

	kret = kmem_alloc(kernel_map, (vm_offset_t *)&cmap, map_size,
	    KMA_DATA, VM_KERN_MEMORY_FILE);
	if (kret != KERN_SUCCESS) {
		return kret;
	}
	cmap->scm_magic = DRT_SCM_MAGIC;
	cmap->scm_modulus = modulus_size;
	cmap->scm_buckets = 0;
	cmap->scm_lastclean = 0;
	cmap->scm_iskips = 0;
	/* every bucket starts cleared and vacant */
	for (i = 0; i < cmap->scm_modulus; i++) {
		DRT_HASH_CLEAR(cmap, i);
		DRT_HASH_VACATE(cmap, i);
		DRT_BITVECTOR_CLEAR(cmap, i);
	}

	/*
	 * If there's an old map, re-hash entries from it into the new map.
	 */
	copycount = 0;
	if (ocmap != NULL) {
		for (i = 0; i < ocmap->scm_modulus; i++) {
			/* skip empty buckets */
			if (DRT_HASH_VACANT(ocmap, i) ||
			    (DRT_HASH_GET_COUNT(ocmap, i) == 0)) {
				continue;
			}
			/* get new index */
			offset = DRT_HASH_GET_ADDRESS(ocmap, i);
			kret = vfs_drt_get_index(&cmap, offset, &index, 1);
			if (kret != KERN_SUCCESS) {
				/* XXX need to bail out gracefully here */
				panic("vfs_drt: new cluster map mysteriously too small");
				index = 0;
			}
			/* copy */
			DRT_HASH_COPY(ocmap, i, cmap, index);
			copycount++;
		}
	}

	/* log what we've done */
	vfs_drt_trace(cmap, DRT_DEBUG_ALLOC, copycount, 0, 0, 0);

	/*
	 * It's important to ensure that *cmapp always points to
	 * a valid map, so we must overwrite it before freeing
	 * the old map.
	 */
	*cmapp = cmap;
	if (ocmap != NULL) {
		/* emit stats into trace buffer */
		vfs_drt_trace(ocmap, DRT_DEBUG_SCMDATA,
		    ocmap->scm_modulus,
		    ocmap->scm_buckets,
		    ocmap->scm_lastclean,
		    ocmap->scm_iskips);

		vfs_drt_free_map(ocmap);
	}
	return KERN_SUCCESS;
}
7330
7331
7332 /*
7333 * Free a sparse cluster map.
7334 */
7335 static kern_return_t
vfs_drt_free_map(struct vfs_drt_clustermap * cmap)7336 vfs_drt_free_map(struct vfs_drt_clustermap *cmap)
7337 {
7338 vm_size_t map_size = 0;
7339
7340 if (cmap->scm_modulus == DRT_HASH_SMALL_MODULUS) {
7341 map_size = DRT_SMALL_ALLOCATION;
7342 } else if (cmap->scm_modulus == DRT_HASH_LARGE_MODULUS) {
7343 map_size = DRT_LARGE_ALLOCATION;
7344 } else if (cmap->scm_modulus == DRT_HASH_XLARGE_MODULUS) {
7345 map_size = DRT_XLARGE_ALLOCATION;
7346 } else {
7347 panic("vfs_drt_free_map: Invalid modulus %d", cmap->scm_modulus);
7348 }
7349
7350 kmem_free(kernel_map, (vm_offset_t)cmap, map_size);
7351 return KERN_SUCCESS;
7352 }
7353
7354
7355 /*
7356 * Find the hashtable slot currently occupied by an entry for the supplied offset.
7357 */
7358 static kern_return_t
vfs_drt_search_index(struct vfs_drt_clustermap * cmap,u_int64_t offset,int * indexp)7359 vfs_drt_search_index(struct vfs_drt_clustermap *cmap, u_int64_t offset, int *indexp)
7360 {
7361 int index;
7362 u_int32_t i;
7363
7364 offset = DRT_ALIGN_ADDRESS(offset);
7365 index = DRT_HASH(cmap, offset);
7366
7367 /* traverse the hashtable */
7368 for (i = 0; i < cmap->scm_modulus; i++) {
7369 /*
7370 * If the slot is vacant, we can stop.
7371 */
7372 if (DRT_HASH_VACANT(cmap, index)) {
7373 break;
7374 }
7375
7376 /*
7377 * If the address matches our offset, we have success.
7378 */
7379 if (DRT_HASH_GET_ADDRESS(cmap, index) == offset) {
7380 *indexp = index;
7381 return KERN_SUCCESS;
7382 }
7383
7384 /*
7385 * Move to the next slot, try again.
7386 */
7387 index = DRT_HASH_NEXT(cmap, index);
7388 }
7389 /*
7390 * It's not there.
7391 */
7392 return KERN_FAILURE;
7393 }
7394
7395 /*
7396 * Find the hashtable slot for the supplied offset. If we haven't allocated
7397 * one yet, allocate one and populate the address field. Note that it will
7398 * not have a nonzero page count and thus will still technically be free, so
7399 * in the case where we are called to clean pages, the slot will remain free.
7400 */
static kern_return_t
vfs_drt_get_index(struct vfs_drt_clustermap **cmapp, u_int64_t offset, int *indexp, int recursed)
{
	struct vfs_drt_clustermap *cmap;
	kern_return_t kret;
	u_int32_t index;
	u_int32_t i;

	cmap = *cmapp;

	/* look for an existing entry */
	kret = vfs_drt_search_index(cmap, offset, indexp);
	if (kret == KERN_SUCCESS) {
		return kret;
	}

	/* need to allocate an entry */
	offset = DRT_ALIGN_ADDRESS(offset);
	index = DRT_HASH(cmap, offset);

	/* scan from the index forwards looking for a vacant slot */
	for (i = 0; i < cmap->scm_modulus; i++) {
		/* slot vacant? (a zero-count slot is reusable too) */
		if (DRT_HASH_VACANT(cmap, index) || DRT_HASH_GET_COUNT(cmap, index) == 0) {
			cmap->scm_buckets++;
			if (index < cmap->scm_lastclean) {
				/* keep the clean-sweep cursor behind new entries */
				cmap->scm_lastclean = index;
			}
			/* claim the slot: set address, zero count and bitvector */
			DRT_HASH_SET_ADDRESS(cmap, index, offset);
			DRT_HASH_SET_COUNT(cmap, index, 0);
			DRT_BITVECTOR_CLEAR(cmap, index);
			*indexp = index;
			vfs_drt_trace(cmap, DRT_DEBUG_INSERT, (int)offset, i, 0, 0);
			return KERN_SUCCESS;
		}
		/* tally collision skips for the stats trace */
		cmap->scm_iskips += i;
		index = DRT_HASH_NEXT(cmap, index);
	}

	/*
	 * We haven't found a vacant slot, so the map is full. If we're not
	 * already recursed, try reallocating/compacting it.
	 */
	if (recursed) {
		return KERN_FAILURE;
	}
	kret = vfs_drt_alloc_map(cmapp);
	if (kret == KERN_SUCCESS) {
		/* now try to insert again (recursed == 1 prevents looping) */
		kret = vfs_drt_get_index(cmapp, offset, indexp, 1);
	}
	return kret;
}
7454
7455 /*
7456 * Implementation of set dirty/clean.
7457 *
7458 * In the 'clean' case, not finding a map is OK.
7459 */
static kern_return_t
vfs_drt_do_mark_pages(
	void            **private,
	u_int64_t       offset,
	u_int           length,
	u_int           *setcountp,
	int             dirty)
{
	struct vfs_drt_clustermap *cmap, **cmapp;
	kern_return_t kret;
	int i, index, pgoff, pgcount, setcount, ecount;

	cmapp = (struct vfs_drt_clustermap **)private;
	cmap = *cmapp;

	vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_START, (int)offset, (int)length, dirty, 0);

	if (setcountp != NULL) {
		*setcountp = 0;
	}

	/* allocate a cluster map if we don't already have one */
	if (cmap == NULL) {
		/* no cluster map, nothing to clean */
		if (!dirty) {
			vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 1, 0, 0, 0);
			return KERN_SUCCESS;
		}
		kret = vfs_drt_alloc_map(cmapp);
		if (kret != KERN_SUCCESS) {
			vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 2, 0, 0, 0);
			return kret;
		}
	}
	setcount = 0;

	/*
	 * Iterate over the length of the region.
	 */
	while (length > 0) {
		/*
		 * Get the hashtable index for this offset.
		 *
		 * XXX this will add blank entries if we are clearing a range
		 * that hasn't been dirtied.
		 */
		kret = vfs_drt_get_index(cmapp, offset, &index, 0);
		cmap = *cmapp;  /* may have changed! */
		/* this may be a partial-success return */
		if (kret != KERN_SUCCESS) {
			/* report how many pages were flipped before we ran out of room */
			if (setcountp != NULL) {
				*setcountp = setcount;
			}
			vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 3, (int)length, 0, 0);

			return kret;
		}

		/*
		 * Work out how many pages we're modifying in this
		 * hashtable entry.
		 */
		pgoff = (int)((offset - DRT_ALIGN_ADDRESS(offset)) / PAGE_SIZE);
		pgcount = min((length / PAGE_SIZE), (DRT_BITVECTOR_PAGES - pgoff));

		/*
		 * Iterate over pages, dirty/clearing as we go.
		 * 'ecount' shadows the entry's page count and is written
		 * back once after the loop.
		 */
		ecount = DRT_HASH_GET_COUNT(cmap, index);
		for (i = 0; i < pgcount; i++) {
			if (dirty) {
				if (!DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) {
					if (ecount >= DRT_BITVECTOR_PAGES) {
						panic("ecount >= DRT_BITVECTOR_PAGES, cmap = %p, index = %d, bit = %d", cmap, index, pgoff + i);
					}
					DRT_HASH_SET_BIT(cmap, index, pgoff + i);
					ecount++;
					setcount++;
				}
			} else {
				if (DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) {
					if (ecount <= 0) {
						panic("ecount <= 0, cmap = %p, index = %d, bit = %d", cmap, index, pgoff + i);
					}
					assert(ecount > 0);
					DRT_HASH_CLEAR_BIT(cmap, index, pgoff + i);
					ecount--;
					setcount++;
				}
			}
		}
		DRT_HASH_SET_COUNT(cmap, index, ecount);

		/* advance to the next hashtable entry's range */
		offset += pgcount * PAGE_SIZE;
		length -= pgcount * PAGE_SIZE;
	}
	if (setcountp != NULL) {
		*setcountp = setcount;
	}

	vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 0, setcount, 0, 0);

	return KERN_SUCCESS;
}
7564
7565 /*
7566 * Mark a set of pages as dirty/clean.
7567 *
7568 * This is a public interface.
7569 *
7570 * cmapp
7571 * Pointer to storage suitable for holding a pointer. Note that
7572 * this must either be NULL or a value set by this function.
7573 *
7574 * size
7575 * Current file size in bytes.
7576 *
7577 * offset
7578 * Offset of the first page to be marked as dirty, in bytes. Must be
7579 * page-aligned.
7580 *
7581 * length
7582 * Length of dirty region, in bytes. Must be a multiple of PAGE_SIZE.
7583 *
7584 * setcountp
7585 * Number of pages newly marked dirty by this call (optional).
7586 *
7587 * Returns KERN_SUCCESS if all the pages were successfully marked.
7588 */
static kern_return_t
vfs_drt_mark_pages(void **cmapp, off_t offset, u_int length, u_int *setcountp)
{
	/* XXX size unused, drop from interface */
	/* dirty == 1: mark the range dirty (see contract in comment above) */
	return vfs_drt_do_mark_pages(cmapp, offset, length, setcountp, 1);
}
7595
7596 #if 0
7597 static kern_return_t
7598 vfs_drt_unmark_pages(void **cmapp, off_t offset, u_int length)
7599 {
7600 return vfs_drt_do_mark_pages(cmapp, offset, length, NULL, 0);
7601 }
7602 #endif
7603
7604 /*
7605 * Get a cluster of dirty pages.
7606 *
7607 * This is a public interface.
7608 *
7609 * cmapp
7610 * Pointer to storage managed by drt_mark_pages. Note that this must
7611 * be NULL or a value set by drt_mark_pages.
7612 *
7613 * offsetp
7614 * Returns the byte offset into the file of the first page in the cluster.
7615 *
7616 * lengthp
7617 * Returns the length in bytes of the cluster of dirty pages.
7618 *
7619 * Returns success if a cluster was found. If KERN_FAILURE is returned, there
 * are no dirty pages meeting the minimum size criteria. Private storage will
7621 * be released if there are no more dirty pages left in the map
7622 *
7623 */
static kern_return_t
vfs_drt_get_cluster(void **cmapp, off_t *offsetp, u_int *lengthp)
{
	struct vfs_drt_clustermap *cmap;
	u_int64_t offset;
	u_int length;
	u_int32_t j;
	int index, i, fs, ls;

	/* sanity */
	if ((cmapp == NULL) || (*cmapp == NULL)) {
		return KERN_FAILURE;
	}
	cmap = *cmapp;

	/* walk the hashtable */
	for (offset = 0, j = 0; j < cmap->scm_modulus; offset += (DRT_BITVECTOR_PAGES * PAGE_SIZE), j++) {
		index = DRT_HASH(cmap, offset);

		/* skip vacant and empty buckets */
		if (DRT_HASH_VACANT(cmap, index) || (DRT_HASH_GET_COUNT(cmap, index) == 0)) {
			continue;
		}

		/* scan the bitfield for a string of bits */
		fs = -1;

		/* fs = position of the first set (dirty) bit */
		for (i = 0; i < DRT_BITVECTOR_PAGES; i++) {
			if (DRT_HASH_TEST_BIT(cmap, index, i)) {
				fs = i;
				break;
			}
		}
		if (fs == -1) {
			/* didn't find any bits set */
			panic("vfs_drt: entry summary count > 0 but no bits set in map, cmap = %p, index = %d, count = %lld",
			    cmap, index, DRT_HASH_GET_COUNT(cmap, index));
		}
		/* ls = length of the run of set bits starting at fs */
		for (ls = 0; i < DRT_BITVECTOR_PAGES; i++, ls++) {
			if (!DRT_HASH_TEST_BIT(cmap, index, i)) {
				break;
			}
		}

		/* compute offset and length, mark pages clean */
		offset = DRT_HASH_GET_ADDRESS(cmap, index) + (PAGE_SIZE * fs);
		length = ls * PAGE_SIZE;
		vfs_drt_do_mark_pages(cmapp, offset, length, NULL, 0);
		cmap->scm_lastclean = index;

		/* return successful */
		*offsetp = (off_t)offset;
		*lengthp = length;

		vfs_drt_trace(cmap, DRT_DEBUG_RETCLUSTER, (int)offset, (int)length, 0, 0);
		return KERN_SUCCESS;
	}
	/*
	 * We didn't find anything... hashtable is empty
	 * emit stats into trace buffer and
	 * then free it
	 */
	vfs_drt_trace(cmap, DRT_DEBUG_SCMDATA,
	    cmap->scm_modulus,
	    cmap->scm_buckets,
	    cmap->scm_lastclean,
	    cmap->scm_iskips);

	vfs_drt_free_map(cmap);
	*cmapp = NULL;

	return KERN_FAILURE;
}
7696
7697
7698 static kern_return_t
vfs_drt_control(void ** cmapp,int op_type)7699 vfs_drt_control(void **cmapp, int op_type)
7700 {
7701 struct vfs_drt_clustermap *cmap;
7702
7703 /* sanity */
7704 if ((cmapp == NULL) || (*cmapp == NULL)) {
7705 return KERN_FAILURE;
7706 }
7707 cmap = *cmapp;
7708
7709 switch (op_type) {
7710 case 0:
7711 /* emit stats into trace buffer */
7712 vfs_drt_trace(cmap, DRT_DEBUG_SCMDATA,
7713 cmap->scm_modulus,
7714 cmap->scm_buckets,
7715 cmap->scm_lastclean,
7716 cmap->scm_iskips);
7717
7718 vfs_drt_free_map(cmap);
7719 *cmapp = NULL;
7720 break;
7721
7722 case 1:
7723 cmap->scm_lastclean = 0;
7724 break;
7725 }
7726 return KERN_SUCCESS;
7727 }
7728
7729
7730
7731 /*
7732 * Emit a summary of the state of the clustermap into the trace buffer
7733 * along with some caller-provided data.
7734 */
#if KDEBUG
static void
vfs_drt_trace(__unused struct vfs_drt_clustermap *cmap, int code, int arg1, int arg2, int arg3, int arg4)
{
	KERNEL_DEBUG(code, arg1, arg2, arg3, arg4, 0);
}
#else
static void
vfs_drt_trace(__unused struct vfs_drt_clustermap *cmap, __unused int code,
    __unused int arg1, __unused int arg2, __unused int arg3,
    __unused int arg4)
{
	/* no-op when kernel tracing is compiled out */
}
#endif
7749
7750 #if 0
7751 /*
7752 * Perform basic sanity check on the hash entry summary count
7753 * vs. the actual bits set in the entry.
7754 */
7755 static void
7756 vfs_drt_sanity(struct vfs_drt_clustermap *cmap)
7757 {
7758 int index, i;
7759 int bits_on;
7760
7761 for (index = 0; index < cmap->scm_modulus; index++) {
7762 if (DRT_HASH_VACANT(cmap, index)) {
7763 continue;
7764 }
7765
7766 for (bits_on = 0, i = 0; i < DRT_BITVECTOR_PAGES; i++) {
7767 if (DRT_HASH_TEST_BIT(cmap, index, i)) {
7768 bits_on++;
7769 }
7770 }
7771 if (bits_on != DRT_HASH_GET_COUNT(cmap, index)) {
7772 panic("bits_on = %d, index = %d", bits_on, index);
7773 }
7774 }
7775 }
7776 #endif
7777
7778 /*
7779 * Internal interface only.
7780 */
7781 static kern_return_t
vfs_get_scmap_push_behavior_internal(void ** cmapp,int * push_flag)7782 vfs_get_scmap_push_behavior_internal(void **cmapp, int *push_flag)
7783 {
7784 struct vfs_drt_clustermap *cmap;
7785
7786 /* sanity */
7787 if ((cmapp == NULL) || (*cmapp == NULL) || (push_flag == NULL)) {
7788 return KERN_FAILURE;
7789 }
7790 cmap = *cmapp;
7791
7792 if (cmap->scm_modulus == DRT_HASH_XLARGE_MODULUS) {
7793 /*
7794 * If we have a full xlarge sparse cluster,
7795 * we push it out all at once so the cluster
7796 * map can be available to absorb more I/Os.
7797 * This is done on large memory configs so
7798 * the small I/Os don't interfere with the
7799 * pro workloads.
7800 */
7801 *push_flag = PUSH_ALL;
7802 }
7803 return KERN_SUCCESS;
7804 }
7805