1 /*
2 * Copyright (c) 2000-2020 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29 /*
30 * Copyright (c) 1993
31 * The Regents of the University of California. All rights reserved.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
41 * 3. All advertising materials mentioning features or use of this software
42 * must display the following acknowledgement:
43 * This product includes software developed by the University of
44 * California, Berkeley and its contributors.
45 * 4. Neither the name of the University nor the names of its contributors
46 * may be used to endorse or promote products derived from this software
47 * without specific prior written permission.
48 *
49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59 * SUCH DAMAGE.
60 *
61 * @(#)vfs_cluster.c 8.10 (Berkeley) 3/28/95
62 */
63
64 #include <sys/param.h>
65 #include <sys/proc_internal.h>
66 #include <sys/buf_internal.h>
67 #include <sys/mount_internal.h>
68 #include <sys/vnode_internal.h>
69 #include <sys/trace.h>
70 #include <kern/kalloc.h>
71 #include <sys/time.h>
72 #include <sys/kernel.h>
73 #include <sys/resourcevar.h>
74 #include <miscfs/specfs/specdev.h>
75 #include <sys/uio_internal.h>
76 #include <libkern/libkern.h>
77 #include <machine/machine_routines.h>
78
79 #include <sys/ubc_internal.h>
80 #include <vm/vnode_pager.h>
81
82 #include <mach/mach_types.h>
83 #include <mach/memory_object_types.h>
84 #include <mach/vm_map.h>
85 #include <mach/upl.h>
86 #include <kern/task.h>
87 #include <kern/policy_internal.h>
88
89 #include <vm/vm_kern.h>
90 #include <vm/vm_map.h>
91 #include <vm/vm_pageout.h>
92 #include <vm/vm_fault.h>
93
94 #include <sys/kdebug.h>
95 #include <sys/kdebug_triage.h>
96 #include <libkern/OSAtomic.h>
97
98 #include <sys/sdt.h>
99
100 #include <stdbool.h>
101
102 #include <vfs/vfs_disk_conditioner.h>
103
104 #if 0
105 #undef KERNEL_DEBUG
106 #define KERNEL_DEBUG KERNEL_DEBUG_CONSTANT
107 #endif
108
109
110 #define CL_READ 0x01
111 #define CL_WRITE 0x02
112 #define CL_ASYNC 0x04
113 #define CL_COMMIT 0x08
114 #define CL_PAGEOUT 0x10
115 #define CL_AGE 0x20
116 #define CL_NOZERO 0x40
117 #define CL_PAGEIN 0x80
118 #define CL_DEV_MEMORY 0x100
119 #define CL_PRESERVE 0x200
120 #define CL_THROTTLE 0x400
121 #define CL_KEEPCACHED 0x800
122 #define CL_DIRECT_IO 0x1000
123 #define CL_PASSIVE 0x2000
124 #define CL_IOSTREAMING 0x4000
125 #define CL_CLOSE 0x8000
126 #define CL_ENCRYPTED 0x10000
127 #define CL_RAW_ENCRYPTED 0x20000
128 #define CL_NOCACHE 0x40000
129
130 #define MAX_VECTOR_UPL_SIZE (2 * MAX_UPL_SIZE_BYTES)
131
132 #define CLUSTER_IO_WAITING ((buf_t)1)
133
134 extern upl_t vector_upl_create(vm_offset_t, uint32_t);
135 extern uint32_t vector_upl_max_upls(upl_t);
136 extern boolean_t vector_upl_is_valid(upl_t);
137 extern boolean_t vector_upl_set_subupl(upl_t, upl_t, u_int32_t);
138 extern void vector_upl_set_pagelist(upl_t);
139 extern void vector_upl_set_iostate(upl_t, upl_t, vm_offset_t, u_int32_t);
140
141 struct clios {
142 lck_mtx_t io_mtxp;
143 u_int io_completed; /* amount of io that has currently completed */
144 u_int io_issued; /* amount of io that was successfully issued */
145 int io_error; /* error code of first error encountered */
146 int io_wanted; /* someone is sleeping waiting for a change in state */
147 };
148
149 struct cl_direct_read_lock {
150 LIST_ENTRY(cl_direct_read_lock) chain;
151 int32_t ref_count;
152 vnode_t vp;
153 lck_rw_t rw_lock;
154 };
155
156 #define CL_DIRECT_READ_LOCK_BUCKETS 61
157
158 static LIST_HEAD(cl_direct_read_locks, cl_direct_read_lock)
159 cl_direct_read_locks[CL_DIRECT_READ_LOCK_BUCKETS];
160
161 static LCK_GRP_DECLARE(cl_mtx_grp, "cluster I/O");
162 static LCK_MTX_DECLARE(cl_transaction_mtxp, &cl_mtx_grp);
163 static LCK_SPIN_DECLARE(cl_direct_read_spin_lock, &cl_mtx_grp);
164
165 static ZONE_DEFINE(cl_rd_zone, "cluster_read",
166 sizeof(struct cl_readahead), ZC_ZFREE_CLEARMEM);
167
168 static ZONE_DEFINE(cl_wr_zone, "cluster_write",
169 sizeof(struct cl_writebehind), ZC_ZFREE_CLEARMEM);
170
171 #define IO_UNKNOWN 0
172 #define IO_DIRECT 1
173 #define IO_CONTIG 2
174 #define IO_COPY 3
175
176 #define PUSH_DELAY 0x01
177 #define PUSH_ALL 0x02
178 #define PUSH_SYNC 0x04
179
180
181 static void cluster_EOT(buf_t cbp_head, buf_t cbp_tail, int zero_offset, size_t verify_block_size);
182 static void cluster_wait_IO(buf_t cbp_head, int async);
183 static void cluster_complete_transaction(buf_t *cbp_head, void *callback_arg, int *retval, int flags, int needwait);
184
185 static int cluster_io_type(struct uio *uio, int *io_type, u_int32_t *io_length, u_int32_t min_length);
186
187 static int cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int non_rounded_size,
188 int flags, buf_t real_bp, struct clios *iostate, int (*)(buf_t, void *), void *callback_arg);
189 static int cluster_iodone(buf_t bp, void *callback_arg);
190 static int cluster_ioerror(upl_t upl, int upl_offset, int abort_size, int error, int io_flags, vnode_t vp);
191 static int cluster_is_throttled(vnode_t vp);
192
193 static void cluster_iostate_wait(struct clios *iostate, u_int target, const char *wait_name);
194
195 static void cluster_syncup(vnode_t vp, off_t newEOF, int (*)(buf_t, void *), void *callback_arg, int flags);
196
197 static void cluster_read_upl_release(upl_t upl, int start_pg, int last_pg, int take_reference);
198 static int cluster_copy_ubc_data_internal(vnode_t vp, struct uio *uio, int *io_resid, int mark_dirty, int take_reference);
199
200 static int cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t filesize, int flags,
201 int (*)(buf_t, void *), void *callback_arg) __attribute__((noinline));
202 static int cluster_read_direct(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t *read_length,
203 int flags, int (*)(buf_t, void *), void *callback_arg) __attribute__((noinline));
204 static int cluster_read_contig(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t *read_length,
205 int (*)(buf_t, void *), void *callback_arg, int flags) __attribute__((noinline));
206
207 static int cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t oldEOF, off_t newEOF,
208 off_t headOff, off_t tailOff, int flags, int (*)(buf_t, void *), void *callback_arg) __attribute__((noinline));
209 static int cluster_write_direct(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF,
210 int *write_type, u_int32_t *write_length, int flags, int (*)(buf_t, void *), void *callback_arg) __attribute__((noinline));
211 static int cluster_write_contig(vnode_t vp, struct uio *uio, off_t newEOF,
212 int *write_type, u_int32_t *write_length, int (*)(buf_t, void *), void *callback_arg, int bflag) __attribute__((noinline));
213
214 static void cluster_update_state_internal(vnode_t vp, struct cl_extent *cl, int flags, boolean_t defer_writes, boolean_t *first_pass,
215 off_t write_off, int write_cnt, off_t newEOF, int (*callback)(buf_t, void *), void *callback_arg, boolean_t vm_initiated);
216
217 static int cluster_align_phys_io(vnode_t vp, struct uio *uio, addr64_t usr_paddr, u_int32_t xsize, int flags, int (*)(buf_t, void *), void *callback_arg);
218
219 static int cluster_read_prefetch(vnode_t vp, off_t f_offset, u_int size, off_t filesize, int (*callback)(buf_t, void *), void *callback_arg, int bflag);
220 static void cluster_read_ahead(vnode_t vp, struct cl_extent *extent, off_t filesize, struct cl_readahead *ra,
221 int (*callback)(buf_t, void *), void *callback_arg, int bflag);
222
223 static int cluster_push_now(vnode_t vp, struct cl_extent *, off_t EOF, int flags, int (*)(buf_t, void *), void *callback_arg, boolean_t vm_ioitiated);
224
225 static int cluster_try_push(struct cl_writebehind *, vnode_t vp, off_t EOF, int push_flag, int flags, int (*)(buf_t, void *),
226 void *callback_arg, int *err, boolean_t vm_initiated);
227
228 static int sparse_cluster_switch(struct cl_writebehind *, vnode_t vp, off_t EOF, int (*)(buf_t, void *), void *callback_arg, boolean_t vm_initiated);
229 static int sparse_cluster_push(struct cl_writebehind *, void **cmapp, vnode_t vp, off_t EOF, int push_flag,
230 int io_flags, int (*)(buf_t, void *), void *callback_arg, boolean_t vm_initiated);
231 static int sparse_cluster_add(struct cl_writebehind *, void **cmapp, vnode_t vp, struct cl_extent *, off_t EOF,
232 int (*)(buf_t, void *), void *callback_arg, boolean_t vm_initiated);
233
234 static kern_return_t vfs_drt_mark_pages(void **cmapp, off_t offset, u_int length, u_int *setcountp);
235 static kern_return_t vfs_drt_get_cluster(void **cmapp, off_t *offsetp, u_int *lengthp);
236 static kern_return_t vfs_drt_control(void **cmapp, int op_type);
237 static kern_return_t vfs_get_scmap_push_behavior_internal(void **cmapp, int *push_flag);
238
239
240 /*
241 * For throttled IO to check whether
242 * a block is cached by the boot cache
243 * and thus it can avoid delaying the IO.
244 *
245 * bootcache_contains_block is initially
246 * NULL. The BootCache will set it while
247 * the cache is active and clear it when
248 * the cache is jettisoned.
249 *
250 * Returns 0 if the block is not
251 * contained in the cache, 1 if it is
252 * contained.
253 *
254 * The function pointer remains valid
255 * after the cache has been evicted even
256 * if bootcache_contains_block has been
257 * cleared.
258 *
259 * See rdar://9974130 The new throttling mechanism breaks the boot cache for throttled IOs
260 */
261 int (*bootcache_contains_block)(dev_t device, u_int64_t blkno) = NULL;
262
263
264 /*
265 * limit the internal I/O size so that we
266 * can represent it in a 32 bit int
267 */
268 #define MAX_IO_REQUEST_SIZE (1024 * 1024 * 512)
269 #define MAX_IO_CONTIG_SIZE MAX_UPL_SIZE_BYTES
270 #define MAX_VECTS 16
271 /*
272 * The MIN_DIRECT_WRITE_SIZE governs how much I/O should be issued before we consider
273 * allowing the caller to bypass the buffer cache. For small I/Os (less than 16k),
274 * we have not historically allowed the write to bypass the UBC.
275 */
276 #define MIN_DIRECT_WRITE_SIZE (16384)
277
278 #define WRITE_THROTTLE 6
279 #define WRITE_THROTTLE_SSD 2
280 #define WRITE_BEHIND 1
281 #define WRITE_BEHIND_SSD 1
282
283 #if !defined(XNU_TARGET_OS_OSX)
284 #define PREFETCH 1
285 #define PREFETCH_SSD 1
uint32_t speculative_prefetch_max = (2048 * 1024);               /* maximum bytes in a speculative read-ahead */
uint32_t speculative_prefetch_max_iosize = (512 * 1024);         /* maximum I/O size to use in a speculative read-ahead */
288 #else /* XNU_TARGET_OS_OSX */
289 #define PREFETCH 3
290 #define PREFETCH_SSD 2
uint32_t speculative_prefetch_max = (MAX_UPL_SIZE_BYTES * 3);    /* maximum bytes in a speculative read-ahead */
uint32_t speculative_prefetch_max_iosize = (512 * 1024);         /* maximum I/O size to use in a speculative read-ahead on SSDs */
293 #endif /* ! XNU_TARGET_OS_OSX */
294
295 /* maximum bytes for read-ahead */
296 uint32_t prefetch_max = (1024 * 1024 * 1024);
297 /* maximum bytes for outstanding reads */
298 uint32_t overlapping_read_max = (1024 * 1024 * 1024);
299 /* maximum bytes for outstanding writes */
300 uint32_t overlapping_write_max = (1024 * 1024 * 1024);
301
302 #define IO_SCALE(vp, base) (vp->v_mount->mnt_ioscale * (base))
303 #define MAX_CLUSTER_SIZE(vp) (cluster_max_io_size(vp->v_mount, CL_WRITE))
304
305 int speculative_reads_disabled = 0;
306
307 /*
308 * throttle the number of async writes that
309 * can be outstanding on a single vnode
310 * before we issue a synchronous write
311 */
312 #define THROTTLE_MAXCNT 0
313
314 uint32_t throttle_max_iosize = (128 * 1024);
315
316 #define THROTTLE_MAX_IOSIZE (throttle_max_iosize)
317
318 SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_max_iosize, CTLFLAG_RW | CTLFLAG_LOCKED, &throttle_max_iosize, 0, "");
319
320
321 void
cluster_init(void)322 cluster_init(void)
323 {
324 for (int i = 0; i < CL_DIRECT_READ_LOCK_BUCKETS; ++i) {
325 LIST_INIT(&cl_direct_read_locks[i]);
326 }
327 }
328
329
330 uint32_t
cluster_max_io_size(mount_t mp,int type)331 cluster_max_io_size(mount_t mp, int type)
332 {
333 uint32_t max_io_size;
334 uint32_t segcnt;
335 uint32_t maxcnt;
336
337 switch (type) {
338 case CL_READ:
339 segcnt = mp->mnt_segreadcnt;
340 maxcnt = mp->mnt_maxreadcnt;
341 break;
342 case CL_WRITE:
343 segcnt = mp->mnt_segwritecnt;
344 maxcnt = mp->mnt_maxwritecnt;
345 break;
346 default:
347 segcnt = min(mp->mnt_segreadcnt, mp->mnt_segwritecnt);
348 maxcnt = min(mp->mnt_maxreadcnt, mp->mnt_maxwritecnt);
349 break;
350 }
351 if (segcnt > (MAX_UPL_SIZE_BYTES >> PAGE_SHIFT)) {
352 /*
353 * don't allow a size beyond the max UPL size we can create
354 */
355 segcnt = MAX_UPL_SIZE_BYTES >> PAGE_SHIFT;
356 }
357 max_io_size = min((segcnt * PAGE_SIZE), maxcnt);
358
359 if (max_io_size < MAX_UPL_TRANSFER_BYTES) {
360 /*
361 * don't allow a size smaller than the old fixed limit
362 */
363 max_io_size = MAX_UPL_TRANSFER_BYTES;
364 } else {
365 /*
366 * make sure the size specified is a multiple of PAGE_SIZE
367 */
368 max_io_size &= ~PAGE_MASK;
369 }
370 return max_io_size;
371 }
372
373 /*
374 * Returns max prefetch value. If the value overflows or exceeds the specified
375 * 'prefetch_limit', it will be capped at 'prefetch_limit' value.
376 */
377 static inline uint32_t
cluster_max_prefetch(vnode_t vp,uint32_t max_io_size,uint32_t prefetch_limit)378 cluster_max_prefetch(vnode_t vp, uint32_t max_io_size, uint32_t prefetch_limit)
379 {
380 bool is_ssd = disk_conditioner_mount_is_ssd(vp->v_mount);
381 uint32_t io_scale = IO_SCALE(vp, is_ssd ? PREFETCH_SSD : PREFETCH);
382 uint32_t prefetch = 0;
383
384 if (__improbable(os_mul_overflow(max_io_size, io_scale, &prefetch) ||
385 (prefetch > prefetch_limit))) {
386 prefetch = prefetch_limit;
387 }
388
389 return prefetch;
390 }
391
392 static inline uint32_t
calculate_max_throttle_size(vnode_t vp)393 calculate_max_throttle_size(vnode_t vp)
394 {
395 bool is_ssd = disk_conditioner_mount_is_ssd(vp->v_mount);
396 uint32_t io_scale = IO_SCALE(vp, is_ssd ? 2 : 1);
397
398 return MIN(io_scale * THROTTLE_MAX_IOSIZE, MAX_UPL_TRANSFER_BYTES);
399 }
400
401 static inline uint32_t
calculate_max_throttle_cnt(vnode_t vp)402 calculate_max_throttle_cnt(vnode_t vp)
403 {
404 bool is_ssd = disk_conditioner_mount_is_ssd(vp->v_mount);
405 uint32_t io_scale = IO_SCALE(vp, 1);
406
407 return is_ssd ? MIN(io_scale, 4) : THROTTLE_MAXCNT;
408 }
409
410 #define CLW_ALLOCATE 0x01
411 #define CLW_RETURNLOCKED 0x02
412 #define CLW_IONOCACHE 0x04
413 #define CLW_IOPASSIVE 0x08
414
415 /*
416 * if the read ahead context doesn't yet exist,
417 * allocate and initialize it...
418 * the vnode lock serializes multiple callers
419 * during the actual assignment... first one
420 * to grab the lock wins... the other callers
421 * will release the now unnecessary storage
422 *
423 * once the context is present, try to grab (but don't block on)
424 * the lock associated with it... if someone
 * else currently owns it, then the read
426 * will run without read-ahead. this allows
427 * multiple readers to run in parallel and
428 * since there's only 1 read ahead context,
429 * there's no real loss in only allowing 1
430 * reader to have read-ahead enabled.
431 */
/*
 * Return the vnode's read-ahead context with its lock held for
 * exclusive (non-blocking) use, or NULL if another reader currently
 * owns it.  If the context does not exist yet, it is allocated here;
 * the vnode lock arbitrates concurrent allocators and losers free
 * their now-redundant copy (see the block comment above).
 */
static struct cl_readahead *
cluster_get_rap(vnode_t vp)
{
	struct ubc_info *ubc;
	struct cl_readahead *rap;

	ubc = vp->v_ubcinfo;

	if ((rap = ubc->cl_rahead) == NULL) {
		/* speculatively allocate and init outside the vnode lock */
		rap = zalloc_flags(cl_rd_zone, Z_WAITOK | Z_ZERO);
		rap->cl_lastr = -1;	/* no previous read recorded yet */
		lck_mtx_init(&rap->cl_lockr, &cl_mtx_grp, LCK_ATTR_NULL);

		vnode_lock(vp);

		if (ubc->cl_rahead == NULL) {
			ubc->cl_rahead = rap;
		} else {
			/* lost the assignment race: discard ours, adopt the winner's */
			lck_mtx_destroy(&rap->cl_lockr, &cl_mtx_grp);
			zfree(cl_rd_zone, rap);
			rap = ubc->cl_rahead;
		}
		vnode_unlock(vp);
	}
	/*
	 * non-blocking grab: if another reader holds the lock, this
	 * caller simply runs without read-ahead
	 */
	if (lck_mtx_try_lock(&rap->cl_lockr) == TRUE) {
		return rap;
	}

	return (struct cl_readahead *)NULL;
}
462
463
464 /*
465 * if the write behind context doesn't yet exist,
466 * and CLW_ALLOCATE is specified, allocate and initialize it...
467 * the vnode lock serializes multiple callers
468 * during the actual assignment... first one
469 * to grab the lock wins... the other callers
470 * will release the now unnecessary storage
471 *
472 * if CLW_RETURNLOCKED is set, grab (blocking if necessary)
473 * the lock associated with the write behind context before
474 * returning
475 */
476
477 static struct cl_writebehind *
cluster_get_wbp(vnode_t vp,int flags)478 cluster_get_wbp(vnode_t vp, int flags)
479 {
480 struct ubc_info *ubc;
481 struct cl_writebehind *wbp;
482
483 ubc = vp->v_ubcinfo;
484
485 if ((wbp = ubc->cl_wbehind) == NULL) {
486 if (!(flags & CLW_ALLOCATE)) {
487 return (struct cl_writebehind *)NULL;
488 }
489
490 wbp = zalloc_flags(cl_wr_zone, Z_WAITOK | Z_ZERO);
491
492 lck_mtx_init(&wbp->cl_lockw, &cl_mtx_grp, LCK_ATTR_NULL);
493
494 vnode_lock(vp);
495
496 if (ubc->cl_wbehind == NULL) {
497 ubc->cl_wbehind = wbp;
498 } else {
499 lck_mtx_destroy(&wbp->cl_lockw, &cl_mtx_grp);
500 zfree(cl_wr_zone, wbp);
501 wbp = ubc->cl_wbehind;
502 }
503 vnode_unlock(vp);
504 }
505 if (flags & CLW_RETURNLOCKED) {
506 lck_mtx_lock(&wbp->cl_lockw);
507 }
508
509 return wbp;
510 }
511
512
513 static void
cluster_syncup(vnode_t vp,off_t newEOF,int (* callback)(buf_t,void *),void * callback_arg,int flags)514 cluster_syncup(vnode_t vp, off_t newEOF, int (*callback)(buf_t, void *), void *callback_arg, int flags)
515 {
516 struct cl_writebehind *wbp;
517
518 if ((wbp = cluster_get_wbp(vp, 0)) != NULL) {
519 if (wbp->cl_number) {
520 lck_mtx_lock(&wbp->cl_lockw);
521
522 cluster_try_push(wbp, vp, newEOF, PUSH_ALL | flags, 0, callback, callback_arg, NULL, FALSE);
523
524 lck_mtx_unlock(&wbp->cl_lockw);
525 }
526 }
527 }
528
529
530 static int
cluster_io_present_in_BC(vnode_t vp,off_t f_offset)531 cluster_io_present_in_BC(vnode_t vp, off_t f_offset)
532 {
533 daddr64_t blkno;
534 size_t io_size;
535 int (*bootcache_check_fn)(dev_t device, u_int64_t blkno) = bootcache_contains_block;
536
537 if (bootcache_check_fn && vp->v_mount && vp->v_mount->mnt_devvp) {
538 if (VNOP_BLOCKMAP(vp, f_offset, PAGE_SIZE, &blkno, &io_size, NULL, VNODE_READ | VNODE_BLOCKMAP_NO_TRACK, NULL)) {
539 return 0;
540 }
541
542 if (io_size == 0) {
543 return 0;
544 }
545
546 if (bootcache_check_fn(vp->v_mount->mnt_devvp->v_rdev, blkno)) {
547 return 1;
548 }
549 }
550 return 0;
551 }
552
553
/*
 * Returns non-zero if I/O against this vnode's mount would currently
 * be throttled; the -1 asks the throttle layer to evaluate the
 * calling context rather than a specific tier.
 */
static int
cluster_is_throttled(vnode_t vp)
{
	return throttle_io_will_be_throttled(-1, vp->v_mount);
}
559
560
561 static void
cluster_iostate_wait(struct clios * iostate,u_int target,const char * wait_name)562 cluster_iostate_wait(struct clios *iostate, u_int target, const char *wait_name)
563 {
564 lck_mtx_lock(&iostate->io_mtxp);
565
566 while ((iostate->io_issued - iostate->io_completed) > target) {
567 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_START,
568 iostate->io_issued, iostate->io_completed, target, 0, 0);
569
570 iostate->io_wanted = 1;
571 msleep((caddr_t)&iostate->io_wanted, &iostate->io_mtxp, PRIBIO + 1, wait_name, NULL);
572
573 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_END,
574 iostate->io_issued, iostate->io_completed, target, 0, 0);
575 }
576 lck_mtx_unlock(&iostate->io_mtxp);
577 }
578
/*
 * Release the portion of the associated UPL covered by a completed
 * transaction of [upl_offset, upl_offset + size) in @upl.  Pages are
 * dumped (direct/uncached write), and the associated UPL is detached
 * and deallocated once fully empty.  When the two UPLs are not
 * page-aligned with each other, the straddling first/last pages are
 * handed off between transactions via per-page mark bits (see the
 * diagram below).
 */
static void
cluster_handle_associated_upl(struct clios *iostate, upl_t upl,
    upl_offset_t upl_offset, upl_size_t size)
{
	if (!size) {
		return;
	}

	upl_t associated_upl = upl_associated_upl(upl);

	if (!associated_upl) {
		return;
	}

#if 0
	printf("1: %d %d\n", upl_offset, upl_offset + size);
#endif

	/*
	 * The associated UPL is page aligned to file offsets whereas the
	 * UPL it's attached to has different alignment requirements. The
	 * upl_offset that we have refers to @upl. The code that follows
	 * has to deal with the first and last pages in this transaction
	 * which might straddle pages in the associated UPL. To keep
	 * track of these pages, we use the mark bits: if the mark bit is
	 * set, we know another transaction has completed its part of that
	 * page and so we can unlock that page here.
	 *
	 * The following illustrates what we have to deal with:
	 *
	 *    MEM         u <------------ 1 PAGE ------------> e
	 *        +-------------+----------------------+-----------------
	 *        |             |######################|#################
	 *        +-------------+----------------------+-----------------
	 *   FILE | <--- a ---> o <------------ 1 PAGE ------------>
	 *
	 * So here we show a write to offset @o.  The data that is to be
	 * written is in a buffer that is not page aligned; it has offset
	 * @a in the page.  The upl that carries the data starts in memory
	 * at @u.  The associated upl starts in the file at offset @o.  A
	 * transaction will always end on a page boundary (like @e above)
	 * except for the very last transaction in the group.  We cannot
	 * unlock the page at @o in the associated upl until both the
	 * transaction ending at @e and the following transaction (that
	 * starts at @e) has completed.
	 */

	/*
	 * We record whether or not the two UPLs are aligned as the mark
	 * bit in the first page of @upl.
	 */
	upl_page_info_t *pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
	bool is_unaligned = upl_page_get_mark(pl, 0);

	if (is_unaligned) {
		upl_page_info_t *assoc_pl = UPL_GET_INTERNAL_PAGE_LIST(associated_upl);

		upl_offset_t upl_end = upl_offset + size;
		assert(upl_end >= PAGE_SIZE);

		upl_size_t assoc_upl_size = upl_get_size(associated_upl);

		/*
		 * In the very first transaction in the group, upl_offset will
		 * not be page aligned, but after that it will be and in that
		 * case we want the preceding page in the associated UPL hence
		 * the minus one.
		 */
		assert(upl_offset);
		if (upl_offset) {
			upl_offset = trunc_page_32(upl_offset - 1);
		}

		/* the mark bits are shared state, guarded by the clios mutex */
		lck_mtx_lock_spin(&iostate->io_mtxp);

		// Look at the first page...
		if (upl_offset
		    && !upl_page_get_mark(assoc_pl, upl_offset >> PAGE_SHIFT)) {
			/*
			 * The first page isn't marked so let another transaction
			 * completion handle it.
			 */
			upl_page_set_mark(assoc_pl, upl_offset >> PAGE_SHIFT, true);
			upl_offset += PAGE_SIZE;
		}

		// And now the last page...

		/*
		 * This needs to be > rather than >= because if it's equal, it
		 * means there's another transaction that is sharing the last
		 * page.
		 */
		if (upl_end > assoc_upl_size) {
			upl_end = assoc_upl_size;
		} else {
			upl_end = trunc_page_32(upl_end);
			const int last_pg = (upl_end >> PAGE_SHIFT) - 1;

			if (!upl_page_get_mark(assoc_pl, last_pg)) {
				/*
				 * The last page isn't marked so mark the page and let another
				 * transaction completion handle it.
				 */
				upl_page_set_mark(assoc_pl, last_pg, true);
				upl_end -= PAGE_SIZE;
			}
		}

		lck_mtx_unlock(&iostate->io_mtxp);

#if 0
		printf("2: %d %d\n", upl_offset, upl_end);
#endif

		/* nothing left for this transaction to release */
		if (upl_end <= upl_offset) {
			return;
		}

		size = upl_end - upl_offset;
	} else {
		/* aligned case: the range must already be page-bounded */
		assert(!(upl_offset & PAGE_MASK));
		assert(!(size & PAGE_MASK));
	}

	boolean_t empty;

	/*
	 * We can unlock these pages now and as this is for a
	 * direct/uncached write, we want to dump the pages too.
	 */
	kern_return_t kr = upl_abort_range(associated_upl, upl_offset, size,
	    UPL_ABORT_DUMP_PAGES, &empty);

	assert(!kr);

	if (!kr && empty) {
		/* last range released: detach and free the associated UPL */
		upl_set_associated_upl(upl, NULL);
		upl_deallocate(associated_upl);
	}
}
720
721 static int
cluster_ioerror(upl_t upl,int upl_offset,int abort_size,int error,int io_flags,vnode_t vp)722 cluster_ioerror(upl_t upl, int upl_offset, int abort_size, int error, int io_flags, vnode_t vp)
723 {
724 int upl_abort_code = 0;
725 int page_in = 0;
726 int page_out = 0;
727
728 if ((io_flags & (B_PHYS | B_CACHE)) == (B_PHYS | B_CACHE)) {
729 /*
730 * direct write of any flavor, or a direct read that wasn't aligned
731 */
732 ubc_upl_commit_range(upl, upl_offset, abort_size, UPL_COMMIT_FREE_ON_EMPTY);
733 } else {
734 if (io_flags & B_PAGEIO) {
735 if (io_flags & B_READ) {
736 page_in = 1;
737 } else {
738 page_out = 1;
739 }
740 }
741 if (io_flags & B_CACHE) {
742 /*
743 * leave pages in the cache unchanged on error
744 */
745 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
746 } else if (((io_flags & B_READ) == 0) && ((error != ENXIO) || vnode_isswap(vp))) {
747 /*
748 * transient error on pageout/write path... leave pages unchanged
749 */
750 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
751 } else if (page_in) {
752 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
753 } else {
754 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;
755 }
756
757 ubc_upl_abort_range(upl, upl_offset, abort_size, upl_abort_code);
758 }
759 return upl_abort_code;
760 }
761
762
/*
 * Completion handler for cluster I/O transactions.
 *
 * A transaction is a chain of buf_t's linked through b_trans_next and
 * terminated by B_EOT; each component buf lands here when its I/O
 * finishes.  Only the last completer (all bufs B_TDONE) processes the
 * transaction: it accumulates size/resid/first-error, frees the
 * component bufs, runs the optional per-I/O completion callback and
 * read-verification context, updates the shared clios state (waking
 * any waiter in cluster_iostate_wait), and commits or aborts the UPL
 * as appropriate.  Returns the first error observed, or 0.
 */
static int
cluster_iodone(buf_t bp, void *callback_arg)
{
	int b_flags;
	int error;
	int total_size;
	int total_resid;
	int upl_offset;
	int zero_offset;
	int pg_offset = 0;
	int commit_size = 0;
	int upl_flags = 0;
	int transaction_size = 0;
	upl_t upl;
	buf_t cbp;
	buf_t cbp_head;
	buf_t cbp_next;
	buf_t real_bp;
	vnode_t vp;
	struct clios *iostate;
	void *verify_ctx;
	boolean_t transaction_complete = FALSE;

	__IGNORE_WCASTALIGN(cbp_head = (buf_t)(bp->b_trans_head));

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_START,
	    cbp_head, bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);

	/*
	 * multi-buf transaction (or the head isn't the terminator):
	 * check under the transaction lock whether we are the final
	 * completer; if not, just mark ourselves done and bail
	 */
	if (cbp_head->b_trans_next || !(cbp_head->b_flags & B_EOT)) {
		lck_mtx_lock_spin(&cl_transaction_mtxp);

		bp->b_flags |= B_TDONE;

		for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) {
			/*
			 * all I/O requests that are part of this transaction
			 * have to complete before we can process it
			 */
			if (!(cbp->b_flags & B_TDONE)) {
				KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
				    cbp_head, cbp, cbp->b_bcount, cbp->b_flags, 0);

				lck_mtx_unlock(&cl_transaction_mtxp);

				return 0;
			}

			/* issuer is still assembling the chain and waits on cbp */
			if (cbp->b_trans_next == CLUSTER_IO_WAITING) {
				KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
				    cbp_head, cbp, cbp->b_bcount, cbp->b_flags, 0);

				lck_mtx_unlock(&cl_transaction_mtxp);
				wakeup(cbp);

				return 0;
			}

			if (cbp->b_flags & B_EOT) {
				transaction_complete = TRUE;
			}
		}
		lck_mtx_unlock(&cl_transaction_mtxp);

		if (transaction_complete == FALSE) {
			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
			    cbp_head, 0, 0, 0, 0);
			return 0;
		}
	}
	/* we are the last completer: process the whole transaction */
	error = 0;
	total_size = 0;
	total_resid = 0;

	cbp = cbp_head;
	vp = cbp->b_vp;
	upl_offset = cbp->b_uploffset;
	upl = cbp->b_upl;
	b_flags = cbp->b_flags;
	real_bp = cbp->b_real_bp;
	zero_offset = cbp->b_validend;
	iostate = (struct clios *)cbp->b_iostate;

	if (real_bp) {
		real_bp->b_dev = cbp->b_dev;
	}

	/* walk the chain: accumulate totals, keep first error, free tail bufs */
	while (cbp) {
		if ((cbp->b_flags & B_ERROR) && error == 0) {
			error = cbp->b_error;
		}

		total_resid += cbp->b_resid;
		total_size += cbp->b_bcount;

		cbp_next = cbp->b_trans_next;

		if (cbp_next == NULL) {
			/*
			 * compute the overall size of the transaction
			 * in case we created one that has 'holes' in it
			 * 'total_size' represents the amount of I/O we
			 * did, not the span of the transaction w/r to the UPL
			 */
			transaction_size = cbp->b_uploffset + cbp->b_bcount - upl_offset;
		}

		/* the head is still needed below; free everything else */
		if (cbp != cbp_head) {
			free_io_buf(cbp);
		}

		cbp = cbp_next;
	}

	if (ISSET(b_flags, B_COMMIT_UPL)) {
		cluster_handle_associated_upl(iostate,
		    cbp_head->b_upl,
		    upl_offset,
		    transaction_size);
	}

	/* residual bytes with no explicit error means the I/O came up short */
	if (error == 0 && total_resid) {
		error = EIO;
	}

	if (error == 0) {
		int (*cliodone_func)(buf_t, void *) = (int (*)(buf_t, void *))(cbp_head->b_cliodone);

		if (cliodone_func != NULL) {
			cbp_head->b_bcount = transaction_size;

			error = (*cliodone_func)(cbp_head, callback_arg);
		}
	}
	/* zero the tail of the last page past valid data (read past EOF) */
	if (zero_offset) {
		cluster_zero(upl, zero_offset, PAGE_SIZE - (zero_offset & PAGE_MASK), real_bp);
	}

	/*
	 * run (and free) any read-verification context attached to the
	 * transaction; VNODE_VERIFY_CONTEXT_FREE ensures the context is
	 * released even when we cannot map the data in because of a
	 * prior error
	 */
	verify_ctx = cbp_head->b_attr.ba_verify_ctx;
	cbp_head->b_attr.ba_verify_ctx = NULL;
	if (verify_ctx) {
		vnode_verify_flags_t verify_flags = VNODE_VERIFY_CONTEXT_FREE;
		caddr_t verify_buf = NULL;
		off_t start_off = cbp_head->b_lblkno * cbp_head->b_lblksize;
		size_t verify_length = transaction_size;
		vm_offset_t vaddr;

		if (!error) {
			verify_flags |= VNODE_VERIFY_WITH_CONTEXT;
			error = ubc_upl_map_range(upl, upl_offset, round_page(transaction_size), VM_PROT_DEFAULT, &vaddr); /* Map it in */
			if (error) {
				panic("ubc_upl_map_range returned error %d, upl = %p, upl_offset = %d, size = %d",
				    error, upl, (int)upl_offset, (int)round_page(transaction_size));
			} else {
				verify_buf = (caddr_t)vaddr;
			}
		}

		error = VNOP_VERIFY(vp, start_off, (uint8_t *)verify_buf, verify_length, 0, &verify_ctx, verify_flags, NULL);

		if (verify_buf) {
			(void)ubc_upl_unmap_range(upl, upl_offset, round_page(transaction_size));
			verify_buf = NULL;
		}
	} else if (cbp_head->b_attr.ba_flags & BA_WILL_VERIFY) {
		/* verification was expected but no context survived to run it */
		error = EBADMSG;
	}

	free_io_buf(cbp_head);

	if (iostate) {
		int need_wakeup = 0;

		/*
		 * someone has issued multiple I/Os asynchronously
		 * and is waiting for them to complete (streaming)
		 */
		lck_mtx_lock_spin(&iostate->io_mtxp);

		/* record only the first error seen on this stream */
		if (error && iostate->io_error == 0) {
			iostate->io_error = error;
		}

		iostate->io_completed += total_size;

		if (iostate->io_wanted) {
			/*
			 * someone is waiting for the state of
			 * this io stream to change
			 */
			iostate->io_wanted = 0;
			need_wakeup = 1;
		}
		lck_mtx_unlock(&iostate->io_mtxp);

		/* wake outside the lock */
		if (need_wakeup) {
			wakeup((caddr_t)&iostate->io_wanted);
		}
	}

	if (b_flags & B_COMMIT_UPL) {
		/* widen the range to page boundaries before commit/abort */
		pg_offset = upl_offset & PAGE_MASK;
		commit_size = (pg_offset + transaction_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;

		if (error) {
			upl_set_iodone_error(upl, error);

			upl_flags = cluster_ioerror(upl, upl_offset - pg_offset, commit_size, error, b_flags, vp);
		} else {
			upl_flags = UPL_COMMIT_FREE_ON_EMPTY;

			if ((b_flags & B_PHYS) && (b_flags & B_READ)) {
				upl_flags |= UPL_COMMIT_SET_DIRTY;
			}

			if (b_flags & B_AGE) {
				upl_flags |= UPL_COMMIT_INACTIVATE;
			}

			ubc_upl_commit_range(upl, upl_offset - pg_offset, commit_size, upl_flags);
		}
	}
	/* propagate completion to the originating buf, if any */
	if (real_bp) {
		if (error) {
			real_bp->b_flags |= B_ERROR;
			real_bp->b_error = error;
		}
		real_bp->b_resid = total_resid;

		buf_biodone(real_bp);
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
	    upl, upl_offset - pg_offset, commit_size, (error << 24) | upl_flags, 0);

	return error;
}
998
999
1000 uint32_t
cluster_throttle_io_limit(vnode_t vp,uint32_t * limit)1001 cluster_throttle_io_limit(vnode_t vp, uint32_t *limit)
1002 {
1003 if (cluster_is_throttled(vp)) {
1004 *limit = calculate_max_throttle_size(vp);
1005 return 1;
1006 }
1007 return 0;
1008 }
1009
1010
1011 void
cluster_zero(upl_t upl,upl_offset_t upl_offset,int size,buf_t bp)1012 cluster_zero(upl_t upl, upl_offset_t upl_offset, int size, buf_t bp)
1013 {
1014 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_START,
1015 upl_offset, size, bp, 0, 0);
1016
1017 if (bp == NULL || bp->b_datap == 0) {
1018 upl_page_info_t *pl;
1019 addr64_t zero_addr;
1020
1021 pl = ubc_upl_pageinfo(upl);
1022
1023 if (upl_device_page(pl) == TRUE) {
1024 zero_addr = ((addr64_t)upl_phys_page(pl, 0) << PAGE_SHIFT) + upl_offset;
1025
1026 bzero_phys_nc(zero_addr, size);
1027 } else {
1028 while (size) {
1029 int page_offset;
1030 int page_index;
1031 int zero_cnt;
1032
1033 page_index = upl_offset / PAGE_SIZE;
1034 page_offset = upl_offset & PAGE_MASK;
1035
1036 zero_addr = ((addr64_t)upl_phys_page(pl, page_index) << PAGE_SHIFT) + page_offset;
1037 zero_cnt = min(PAGE_SIZE - page_offset, size);
1038
1039 bzero_phys(zero_addr, zero_cnt);
1040
1041 size -= zero_cnt;
1042 upl_offset += zero_cnt;
1043 }
1044 }
1045 } else {
1046 bzero((caddr_t)((vm_offset_t)bp->b_datap + upl_offset), size);
1047 }
1048
1049 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_END,
1050 upl_offset, size, 0, 0, 0);
1051 }
1052
1053
1054 static void
cluster_EOT(buf_t cbp_head,buf_t cbp_tail,int zero_offset,size_t verify_block_size)1055 cluster_EOT(buf_t cbp_head, buf_t cbp_tail, int zero_offset, size_t verify_block_size)
1056 {
1057 /*
1058 * We will assign a verification context to cbp_head.
1059 * This will be passed back to the filesystem when
1060 * verifying (in cluster_iodone).
1061 */
1062 if (verify_block_size) {
1063 off_t start_off = cbp_head->b_lblkno * cbp_head->b_lblksize;
1064 size_t length;
1065 void *verify_ctx = NULL;
1066 int error = 0;
1067 vnode_t vp = buf_vnode(cbp_head);
1068
1069 if (cbp_head == cbp_tail) {
1070 length = cbp_head->b_bcount;
1071 } else {
1072 length = ((cbp_tail->b_lblkno * cbp_tail->b_lblksize) + cbp_tail->b_bcount) - start_off;
1073 }
1074
1075 /*
1076 * zero_offset is non zero for the transaction containing the EOF
1077 * (if the filesize is not page aligned). In that case we might
1078 * have the transaction size not be page/verify block size aligned
1079 */
1080 if ((zero_offset == 0) &&
1081 ((length < verify_block_size) || (length % verify_block_size)) != 0) {
1082 panic("%s length = %zu, verify_block_size = %zu",
1083 __FUNCTION__, length, verify_block_size);
1084 }
1085
1086 error = VNOP_VERIFY(vp, start_off, NULL, length,
1087 &verify_block_size, &verify_ctx, VNODE_VERIFY_CONTEXT_ALLOC, NULL);
1088
1089 cbp_head->b_attr.ba_verify_ctx = verify_ctx;
1090 } else {
1091 cbp_head->b_attr.ba_verify_ctx = NULL;
1092 }
1093
1094 cbp_head->b_validend = zero_offset;
1095 cbp_tail->b_flags |= B_EOT;
1096 }
1097
/*
 * Block until every buf_t in the transaction chain headed by 'cbp_head'
 * has completed its I/O.
 *
 * In the synchronous case this is a simple buf_biowait on each buffer.
 * In the async case completion is signalled by cluster_iodone, so we
 * install the CLUSTER_IO_WAITING sentinel on the tail buffer (under
 * cl_transaction_mtxp) and msleep until cluster_iodone wakes us.
 */
static void
cluster_wait_IO(buf_t cbp_head, int async)
{
	buf_t cbp;

	if (async) {
		/*
		 * Async callback completion will not normally generate a
		 * wakeup upon I/O completion. To get woken up, we set
		 * b_trans_next (which is safe for us to modify) on the last
		 * buffer to CLUSTER_IO_WAITING so that cluster_iodone knows
		 * to wake us up when all buffers as part of this transaction
		 * are completed. This is done under the umbrella of
		 * cl_transaction_mtxp which is also taken in cluster_iodone.
		 */
		bool done = true;
		buf_t last = NULL;

		lck_mtx_lock_spin(&cl_transaction_mtxp);

		/*
		 * single pass both locates the tail buffer ('last') and
		 * determines whether every buffer has already completed
		 */
		for (cbp = cbp_head; cbp; last = cbp, cbp = cbp->b_trans_next) {
			if (!ISSET(cbp->b_flags, B_TDONE)) {
				done = false;
			}
		}

		if (!done) {
			/* arm the sentinel so cluster_iodone will wakeup(last) */
			last->b_trans_next = CLUSTER_IO_WAITING;

			DTRACE_IO1(wait__start, buf_t, last);
			do {
				msleep(last, &cl_transaction_mtxp, PSPIN | (PRIBIO + 1), "cluster_wait_IO", NULL);

				/*
				 * We should only have been woken up if all the
				 * buffers are completed, but just in case...
				 */
				done = true;
				/* note: chain now ends at the sentinel, not NULL */
				for (cbp = cbp_head; cbp != CLUSTER_IO_WAITING; cbp = cbp->b_trans_next) {
					if (!ISSET(cbp->b_flags, B_TDONE)) {
						done = false;
						break;
					}
				}
			} while (!done);
			DTRACE_IO1(wait__done, buf_t, last);

			/* restore the chain's normal NULL termination */
			last->b_trans_next = NULL;
		}

		lck_mtx_unlock(&cl_transaction_mtxp);
	} else { // !async
		for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) {
			buf_biowait(cbp);
		}
	}
}
1155
1156 static void
cluster_complete_transaction(buf_t * cbp_head,void * callback_arg,int * retval,int flags,int needwait)1157 cluster_complete_transaction(buf_t *cbp_head, void *callback_arg, int *retval, int flags, int needwait)
1158 {
1159 buf_t cbp;
1160 int error;
1161 boolean_t isswapout = FALSE;
1162
1163 /*
1164 * cluster_complete_transaction will
1165 * only be called if we've issued a complete chain in synchronous mode
1166 * or, we've already done a cluster_wait_IO on an incomplete chain
1167 */
1168 if (needwait) {
1169 for (cbp = *cbp_head; cbp; cbp = cbp->b_trans_next) {
1170 buf_biowait(cbp);
1171 }
1172 }
1173 /*
1174 * we've already waited on all of the I/Os in this transaction,
1175 * so mark all of the buf_t's in this transaction as B_TDONE
1176 * so that cluster_iodone sees the transaction as completed
1177 */
1178 for (cbp = *cbp_head; cbp; cbp = cbp->b_trans_next) {
1179 cbp->b_flags |= B_TDONE;
1180 }
1181 cbp = *cbp_head;
1182
1183 if ((flags & (CL_ASYNC | CL_PAGEOUT)) == CL_PAGEOUT && vnode_isswap(cbp->b_vp)) {
1184 isswapout = TRUE;
1185 }
1186
1187 error = cluster_iodone(cbp, callback_arg);
1188
1189 if (!(flags & CL_ASYNC) && error && *retval == 0) {
1190 if (((flags & (CL_PAGEOUT | CL_KEEPCACHED)) != CL_PAGEOUT) || (error != ENXIO)) {
1191 *retval = error;
1192 } else if (isswapout == TRUE) {
1193 *retval = error;
1194 }
1195 }
1196 *cbp_head = (buf_t)NULL;
1197 }
1198
1199
1200 static int
cluster_io(vnode_t vp,upl_t upl,vm_offset_t upl_offset,off_t f_offset,int non_rounded_size,int flags,buf_t real_bp,struct clios * iostate,int (* callback)(buf_t,void *),void * callback_arg)1201 cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int non_rounded_size,
1202 int flags, buf_t real_bp, struct clios *iostate, int (*callback)(buf_t, void *), void *callback_arg)
1203 {
1204 buf_t cbp;
1205 u_int size;
1206 u_int io_size;
1207 int io_flags;
1208 int bmap_flags;
1209 int error = 0;
1210 int retval = 0;
1211 buf_t cbp_head = NULL;
1212 buf_t cbp_tail = NULL;
1213 int trans_count = 0;
1214 int max_trans_count;
1215 u_int pg_count;
1216 int pg_offset;
1217 u_int max_iosize;
1218 u_int max_vectors;
1219 int priv;
1220 int zero_offset = 0;
1221 int async_throttle = 0;
1222 mount_t mp;
1223 vm_offset_t upl_end_offset;
1224 boolean_t need_EOT = FALSE;
1225 size_t verify_block_size = 0;
1226
1227 /*
1228 * we currently don't support buffers larger than a page
1229 */
1230 if (real_bp && non_rounded_size > PAGE_SIZE) {
1231 panic("%s(): Called with real buffer of size %d bytes which "
1232 "is greater than the maximum allowed size of "
1233 "%d bytes (the system PAGE_SIZE).\n",
1234 __FUNCTION__, non_rounded_size, PAGE_SIZE);
1235 }
1236
1237 mp = vp->v_mount;
1238
1239 /*
1240 * we don't want to do any funny rounding of the size for IO requests
1241 * coming through the DIRECT or CONTIGUOUS paths... those pages don't
1242 * belong to us... we can't extend (nor do we need to) the I/O to fill
1243 * out a page
1244 */
1245 if (mp->mnt_devblocksize > 1 && !(flags & (CL_DEV_MEMORY | CL_DIRECT_IO))) {
1246 /*
1247 * round the requested size up so that this I/O ends on a
1248 * page boundary in case this is a 'write'... if the filesystem
1249 * has blocks allocated to back the page beyond the EOF, we want to
1250 * make sure to write out the zero's that are sitting beyond the EOF
1251 * so that in case the filesystem doesn't explicitly zero this area
1252 * if a hole is created via a lseek/write beyond the current EOF,
1253 * it will return zeros when it's read back from the disk. If the
1254 * physical allocation doesn't extend for the whole page, we'll
1255 * only write/read from the disk up to the end of this allocation
1256 * via the extent info returned from the VNOP_BLOCKMAP call.
1257 */
1258 pg_offset = upl_offset & PAGE_MASK;
1259
1260 size = (((non_rounded_size + pg_offset) + (PAGE_SIZE - 1)) & ~PAGE_MASK) - pg_offset;
1261 } else {
1262 /*
1263 * anyone advertising a blocksize of 1 byte probably
1264 * can't deal with us rounding up the request size
1265 * AFP is one such filesystem/device
1266 */
1267 size = non_rounded_size;
1268 }
1269 upl_end_offset = upl_offset + size;
1270
1271 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_START, (int)f_offset, size, upl_offset, flags, 0);
1272
1273 /*
1274 * Set the maximum transaction size to the maximum desired number of
1275 * buffers.
1276 */
1277 max_trans_count = 8;
1278 if (flags & CL_DEV_MEMORY) {
1279 max_trans_count = 16;
1280 }
1281
1282 if (flags & CL_READ) {
1283 io_flags = B_READ;
1284 bmap_flags = VNODE_READ;
1285
1286 max_iosize = mp->mnt_maxreadcnt;
1287 max_vectors = mp->mnt_segreadcnt;
1288
1289 if ((flags & CL_PAGEIN) && /* Cluster layer verification will be limited to pagein for now */
1290 !(mp->mnt_kern_flag & MNTK_VIRTUALDEV) &&
1291 (VNOP_VERIFY(vp, f_offset, NULL, 0, &verify_block_size, NULL, VNODE_VERIFY_DEFAULT, NULL) == 0) &&
1292 verify_block_size) {
1293 if (verify_block_size != PAGE_SIZE) {
1294 verify_block_size = 0;
1295 }
1296 if (real_bp && verify_block_size) {
1297 panic("%s(): Called with real buffer and needs verification ",
1298 __FUNCTION__);
1299 }
1300 }
1301 } else {
1302 io_flags = B_WRITE;
1303 bmap_flags = VNODE_WRITE;
1304
1305 max_iosize = mp->mnt_maxwritecnt;
1306 max_vectors = mp->mnt_segwritecnt;
1307 }
1308 if (verify_block_size) {
1309 bmap_flags |= VNODE_CLUSTER_VERIFY;
1310 }
1311 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_NONE, max_iosize, max_vectors, mp->mnt_devblocksize, 0, 0);
1312
1313 /*
1314 * make sure the maximum iosize is a
1315 * multiple of the page size
1316 */
1317 max_iosize &= ~PAGE_MASK;
1318
1319 /*
1320 * Ensure the maximum iosize is sensible.
1321 */
1322 if (!max_iosize) {
1323 max_iosize = PAGE_SIZE;
1324 }
1325
1326 if (flags & CL_THROTTLE) {
1327 if (!(flags & CL_PAGEOUT) && cluster_is_throttled(vp)) {
1328 uint32_t max_throttle_size = calculate_max_throttle_size(vp);
1329
1330 if (max_iosize > max_throttle_size) {
1331 max_iosize = max_throttle_size;
1332 }
1333 async_throttle = calculate_max_throttle_cnt(vp);
1334 } else {
1335 if ((flags & CL_DEV_MEMORY)) {
1336 async_throttle = IO_SCALE(vp, VNODE_ASYNC_THROTTLE);
1337 } else {
1338 u_int max_cluster;
1339 u_int max_cluster_size;
1340 u_int scale;
1341
1342 if (vp->v_mount->mnt_minsaturationbytecount) {
1343 max_cluster_size = vp->v_mount->mnt_minsaturationbytecount;
1344
1345 scale = 1;
1346 } else {
1347 max_cluster_size = MAX_CLUSTER_SIZE(vp);
1348
1349 if (disk_conditioner_mount_is_ssd(vp->v_mount)) {
1350 scale = WRITE_THROTTLE_SSD;
1351 } else {
1352 scale = WRITE_THROTTLE;
1353 }
1354 }
1355 if (max_iosize > max_cluster_size) {
1356 max_cluster = max_cluster_size;
1357 } else {
1358 max_cluster = max_iosize;
1359 }
1360
1361 if (size < max_cluster) {
1362 max_cluster = size;
1363 }
1364
1365 if (flags & CL_CLOSE) {
1366 scale += MAX_CLUSTERS;
1367 }
1368
1369 async_throttle = min(IO_SCALE(vp, VNODE_ASYNC_THROTTLE), ((scale * max_cluster_size) / max_cluster) - 1);
1370 }
1371 }
1372 }
1373 if (flags & CL_AGE) {
1374 io_flags |= B_AGE;
1375 }
1376 if (flags & (CL_PAGEIN | CL_PAGEOUT)) {
1377 io_flags |= B_PAGEIO;
1378 }
1379 if (flags & (CL_IOSTREAMING)) {
1380 io_flags |= B_IOSTREAMING;
1381 }
1382 if (flags & CL_COMMIT) {
1383 io_flags |= B_COMMIT_UPL;
1384 }
1385 if (flags & CL_DIRECT_IO) {
1386 io_flags |= B_PHYS;
1387 }
1388 if (flags & (CL_PRESERVE | CL_KEEPCACHED)) {
1389 io_flags |= B_CACHE;
1390 }
1391 if (flags & CL_PASSIVE) {
1392 io_flags |= B_PASSIVE;
1393 }
1394 if (flags & CL_ENCRYPTED) {
1395 io_flags |= B_ENCRYPTED_IO;
1396 }
1397
1398 if (vp->v_flag & VSYSTEM) {
1399 io_flags |= B_META;
1400 }
1401
1402 if ((flags & CL_READ) && ((upl_offset + non_rounded_size) & PAGE_MASK) && (!(flags & CL_NOZERO))) {
1403 /*
1404 * then we are going to end up
1405 * with a page that we can't complete (the file size wasn't a multiple
1406 * of PAGE_SIZE and we're trying to read to the end of the file
1407 * so we'll go ahead and zero out the portion of the page we can't
1408 * read in from the file
1409 */
1410 zero_offset = (int)(upl_offset + non_rounded_size);
1411 } else if (!ISSET(flags, CL_READ) && ISSET(flags, CL_DIRECT_IO)) {
1412 assert(ISSET(flags, CL_COMMIT));
1413
1414 // For a direct/uncached write, we need to lock pages...
1415
1416 upl_t cached_upl;
1417
1418 /*
1419 * Create a UPL to lock the pages in the cache whilst the
1420 * write is in progress.
1421 */
1422 ubc_create_upl_kernel(vp, f_offset, non_rounded_size, &cached_upl,
1423 NULL, UPL_SET_LITE, VM_KERN_MEMORY_FILE);
1424
1425 /*
1426 * Attach this UPL to the other UPL so that we can find it
1427 * later.
1428 */
1429 upl_set_associated_upl(upl, cached_upl);
1430
1431 if (upl_offset & PAGE_MASK) {
1432 /*
1433 * The two UPLs are not aligned, so mark the first page in
1434 * @upl so that cluster_handle_associated_upl can handle
1435 * it accordingly.
1436 */
1437 upl_page_info_t *pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
1438 upl_page_set_mark(pl, 0, true);
1439 }
1440 }
1441
1442 while (size) {
1443 daddr64_t blkno;
1444 daddr64_t lblkno;
1445 size_t io_size_tmp;
1446 u_int io_size_wanted;
1447 uint32_t lblksize;
1448
1449 if (size > max_iosize) {
1450 io_size = max_iosize;
1451 } else {
1452 io_size = size;
1453 }
1454
1455 io_size_wanted = io_size;
1456 io_size_tmp = (size_t)io_size;
1457
1458 if ((error = VNOP_BLOCKMAP(vp, f_offset, io_size, &blkno, &io_size_tmp, NULL, bmap_flags, NULL))) {
1459 break;
1460 }
1461
1462 if (io_size_tmp > io_size_wanted) {
1463 io_size = io_size_wanted;
1464 } else {
1465 io_size = (u_int)io_size_tmp;
1466 }
1467
1468 if (real_bp && (real_bp->b_blkno == real_bp->b_lblkno)) {
1469 real_bp->b_blkno = blkno;
1470 }
1471
1472 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 24)) | DBG_FUNC_NONE,
1473 (int)f_offset, (int)(blkno >> 32), (int)blkno, io_size, 0);
1474
1475 if (io_size == 0) {
1476 /*
1477 * vnop_blockmap didn't return an error... however, it did
1478 * return an extent size of 0 which means we can't
1479 * make forward progress on this I/O... a hole in the
1480 * file would be returned as a blkno of -1 with a non-zero io_size
1481 * a real extent is returned with a blkno != -1 and a non-zero io_size
1482 */
1483 error = EINVAL;
1484 break;
1485 }
1486 if (!(flags & CL_READ) && blkno == -1) {
1487 off_t e_offset;
1488 int pageout_flags;
1489
1490 if (upl_get_internal_vectorupl(upl)) {
1491 panic("Vector UPLs should not take this code-path");
1492 }
1493 /*
1494 * we're writing into a 'hole'
1495 */
1496 if (flags & CL_PAGEOUT) {
1497 /*
1498 * if we got here via cluster_pageout
1499 * then just error the request and return
1500 * the 'hole' should already have been covered
1501 */
1502 error = EINVAL;
1503 break;
1504 }
1505 /*
1506 * we can get here if the cluster code happens to
1507 * pick up a page that was dirtied via mmap vs
1508 * a 'write' and the page targets a 'hole'...
1509 * i.e. the writes to the cluster were sparse
1510 * and the file was being written for the first time
1511 *
1512 * we can also get here if the filesystem supports
1513 * 'holes' that are less than PAGE_SIZE.... because
1514 * we can't know if the range in the page that covers
1515 * the 'hole' has been dirtied via an mmap or not,
1516 * we have to assume the worst and try to push the
1517 * entire page to storage.
1518 *
1519 * Try paging out the page individually before
1520 * giving up entirely and dumping it (the pageout
1521 * path will insure that the zero extent accounting
1522 * has been taken care of before we get back into cluster_io)
1523 *
1524 * go direct to vnode_pageout so that we don't have to
1525 * unbusy the page from the UPL... we used to do this
1526 * so that we could call ubc_msync, but that results
1527 * in a potential deadlock if someone else races us to acquire
1528 * that page and wins and in addition needs one of the pages
1529 * we're continuing to hold in the UPL
1530 */
1531 pageout_flags = UPL_MSYNC | UPL_VNODE_PAGER | UPL_NESTED_PAGEOUT;
1532
1533 if (!(flags & CL_ASYNC)) {
1534 pageout_flags |= UPL_IOSYNC;
1535 }
1536 if (!(flags & CL_COMMIT)) {
1537 pageout_flags |= UPL_NOCOMMIT;
1538 }
1539
1540 if (cbp_head) {
1541 buf_t prev_cbp;
1542 uint32_t bytes_in_last_page;
1543
1544 /*
1545 * first we have to wait for the the current outstanding I/Os
1546 * to complete... EOT hasn't been set yet on this transaction
1547 * so the pages won't be released
1548 */
1549 cluster_wait_IO(cbp_head, (flags & CL_ASYNC));
1550
1551 bytes_in_last_page = cbp_head->b_uploffset & PAGE_MASK;
1552 for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) {
1553 bytes_in_last_page += cbp->b_bcount;
1554 }
1555 bytes_in_last_page &= PAGE_MASK;
1556
1557 while (bytes_in_last_page) {
1558 /*
1559 * we've got a transcation that
1560 * includes the page we're about to push out through vnode_pageout...
1561 * find the bp's in the list which intersect this page and either
1562 * remove them entirely from the transaction (there could be multiple bp's), or
1563 * round it's iosize down to the page boundary (there can only be one)...
1564 *
1565 * find the last bp in the list and act on it
1566 */
1567 for (prev_cbp = cbp = cbp_head; cbp->b_trans_next; cbp = cbp->b_trans_next) {
1568 prev_cbp = cbp;
1569 }
1570
1571 if (bytes_in_last_page >= cbp->b_bcount) {
1572 /*
1573 * this buf no longer has any I/O associated with it
1574 */
1575 bytes_in_last_page -= cbp->b_bcount;
1576 cbp->b_bcount = 0;
1577
1578 free_io_buf(cbp);
1579
1580 if (cbp == cbp_head) {
1581 assert(bytes_in_last_page == 0);
1582 /*
1583 * the buf we just freed was the only buf in
1584 * this transaction... so there's no I/O to do
1585 */
1586 cbp_head = NULL;
1587 cbp_tail = NULL;
1588 } else {
1589 /*
1590 * remove the buf we just freed from
1591 * the transaction list
1592 */
1593 prev_cbp->b_trans_next = NULL;
1594 cbp_tail = prev_cbp;
1595 }
1596 } else {
1597 /*
1598 * this is the last bp that has I/O
1599 * intersecting the page of interest
1600 * only some of the I/O is in the intersection
1601 * so clip the size but keep it in the transaction list
1602 */
1603 cbp->b_bcount -= bytes_in_last_page;
1604 cbp_tail = cbp;
1605 bytes_in_last_page = 0;
1606 }
1607 }
1608 if (cbp_head) {
1609 /*
1610 * there was more to the current transaction
1611 * than just the page we are pushing out via vnode_pageout...
1612 * mark it as finished and complete it... we've already
1613 * waited for the I/Os to complete above in the call to cluster_wait_IO
1614 */
1615 cluster_EOT(cbp_head, cbp_tail, 0, 0);
1616
1617 cluster_complete_transaction(&cbp_head, callback_arg, &retval, flags, 0);
1618
1619 trans_count = 0;
1620 }
1621 }
1622 if (vnode_pageout(vp, upl, (upl_offset_t)trunc_page(upl_offset), trunc_page_64(f_offset), PAGE_SIZE, pageout_flags, NULL) != PAGER_SUCCESS) {
1623 error = EINVAL;
1624 }
1625 e_offset = round_page_64(f_offset + 1);
1626 io_size = (u_int)(e_offset - f_offset);
1627
1628 f_offset += io_size;
1629 upl_offset += io_size;
1630
1631 if (size >= io_size) {
1632 size -= io_size;
1633 } else {
1634 size = 0;
1635 }
1636 /*
1637 * keep track of how much of the original request
1638 * that we've actually completed... non_rounded_size
1639 * may go negative due to us rounding the request
1640 * to a page size multiple (i.e. size > non_rounded_size)
1641 */
1642 non_rounded_size -= io_size;
1643
1644 if (non_rounded_size <= 0) {
1645 /*
1646 * we've transferred all of the data in the original
1647 * request, but we were unable to complete the tail
1648 * of the last page because the file didn't have
1649 * an allocation to back that portion... this is ok.
1650 */
1651 size = 0;
1652 }
1653 if (error) {
1654 if (size == 0) {
1655 flags &= ~CL_COMMIT;
1656 }
1657 break;
1658 }
1659 continue;
1660 }
1661
1662 lblksize = CLUSTER_IO_BLOCK_SIZE;
1663 lblkno = (daddr64_t)(f_offset / lblksize);
1664
1665 /*
1666 * we have now figured out how much I/O we can do - this is in 'io_size'
1667 * pg_offset is the starting point in the first page for the I/O
1668 * pg_count is the number of full and partial pages that 'io_size' encompasses
1669 */
1670 pg_offset = upl_offset & PAGE_MASK;
1671
1672 if (flags & CL_DEV_MEMORY) {
1673 /*
1674 * treat physical requests as one 'giant' page
1675 */
1676 pg_count = 1;
1677 } else {
1678 pg_count = (io_size + pg_offset + (PAGE_SIZE - 1)) / PAGE_SIZE;
1679 }
1680
1681 if ((flags & CL_READ) && blkno == -1) {
1682 vm_offset_t commit_offset;
1683 int bytes_to_zero;
1684 int complete_transaction_now = 0;
1685
1686 /*
1687 * if we're reading and blkno == -1, then we've got a
1688 * 'hole' in the file that we need to deal with by zeroing
1689 * out the affected area in the upl
1690 */
1691 if (io_size >= (u_int)non_rounded_size) {
1692 /*
1693 * if this upl contains the EOF and it is not a multiple of PAGE_SIZE
1694 * than 'zero_offset' will be non-zero
1695 * if the 'hole' returned by vnop_blockmap extends all the way to the eof
1696 * (indicated by the io_size finishing off the I/O request for this UPL)
1697 * than we're not going to issue an I/O for the
1698 * last page in this upl... we need to zero both the hole and the tail
1699 * of the page beyond the EOF, since the delayed zero-fill won't kick in
1700 */
1701 bytes_to_zero = non_rounded_size;
1702 if (!(flags & CL_NOZERO)) {
1703 bytes_to_zero = (int)((((upl_offset + io_size) + (PAGE_SIZE - 1)) & ~PAGE_MASK) - upl_offset);
1704 }
1705
1706 zero_offset = 0;
1707 } else {
1708 bytes_to_zero = io_size;
1709 }
1710
1711 pg_count = 0;
1712
1713 cluster_zero(upl, (upl_offset_t)upl_offset, bytes_to_zero, real_bp);
1714
1715 if (cbp_head) {
1716 int pg_resid;
1717
1718 /*
1719 * if there is a current I/O chain pending
1720 * then the first page of the group we just zero'd
1721 * will be handled by the I/O completion if the zero
1722 * fill started in the middle of the page
1723 */
1724 commit_offset = (upl_offset + (PAGE_SIZE - 1)) & ~PAGE_MASK;
1725
1726 pg_resid = (int)(commit_offset - upl_offset);
1727
1728 if (bytes_to_zero >= pg_resid) {
1729 /*
1730 * the last page of the current I/O
1731 * has been completed...
1732 * compute the number of fully zero'd
1733 * pages that are beyond it
1734 * plus the last page if its partial
1735 * and we have no more I/O to issue...
1736 * otherwise a partial page is left
1737 * to begin the next I/O
1738 */
1739 if ((int)io_size >= non_rounded_size) {
1740 pg_count = (bytes_to_zero - pg_resid + (PAGE_SIZE - 1)) / PAGE_SIZE;
1741 } else {
1742 pg_count = (bytes_to_zero - pg_resid) / PAGE_SIZE;
1743 }
1744
1745 complete_transaction_now = 1;
1746 }
1747 } else {
1748 /*
1749 * no pending I/O to deal with
1750 * so, commit all of the fully zero'd pages
1751 * plus the last page if its partial
1752 * and we have no more I/O to issue...
1753 * otherwise a partial page is left
1754 * to begin the next I/O
1755 */
1756 if ((int)io_size >= non_rounded_size) {
1757 pg_count = (pg_offset + bytes_to_zero + (PAGE_SIZE - 1)) / PAGE_SIZE;
1758 } else {
1759 pg_count = (pg_offset + bytes_to_zero) / PAGE_SIZE;
1760 }
1761
1762 commit_offset = upl_offset & ~PAGE_MASK;
1763 }
1764
1765 // Associated UPL is currently only used in the direct write path
1766 assert(!upl_associated_upl(upl));
1767
1768 if ((flags & CL_COMMIT) && pg_count) {
1769 ubc_upl_commit_range(upl, (upl_offset_t)commit_offset,
1770 pg_count * PAGE_SIZE,
1771 UPL_COMMIT_CLEAR_DIRTY | UPL_COMMIT_FREE_ON_EMPTY);
1772 }
1773 upl_offset += io_size;
1774 f_offset += io_size;
1775 size -= io_size;
1776
1777 /*
1778 * keep track of how much of the original request
1779 * that we've actually completed... non_rounded_size
1780 * may go negative due to us rounding the request
1781 * to a page size multiple (i.e. size > non_rounded_size)
1782 */
1783 non_rounded_size -= io_size;
1784
1785 if (non_rounded_size <= 0) {
1786 /*
1787 * we've transferred all of the data in the original
1788 * request, but we were unable to complete the tail
1789 * of the last page because the file didn't have
1790 * an allocation to back that portion... this is ok.
1791 */
1792 size = 0;
1793 }
1794 if (cbp_head && (complete_transaction_now || size == 0)) {
1795 cluster_wait_IO(cbp_head, (flags & CL_ASYNC));
1796
1797 cluster_EOT(cbp_head, cbp_tail, size == 0 ? zero_offset : 0, verify_block_size);
1798
1799 cluster_complete_transaction(&cbp_head, callback_arg, &retval, flags, 0);
1800
1801 trans_count = 0;
1802 }
1803 continue;
1804 }
1805 if (pg_count > max_vectors) {
1806 if (((pg_count - max_vectors) * PAGE_SIZE) > io_size) {
1807 io_size = PAGE_SIZE - pg_offset;
1808 pg_count = 1;
1809 } else {
1810 io_size -= (pg_count - max_vectors) * PAGE_SIZE;
1811 pg_count = max_vectors;
1812 }
1813 }
1814 /*
1815 * If the transaction is going to reach the maximum number of
1816 * desired elements, truncate the i/o to the nearest page so
1817 * that the actual i/o is initiated after this buffer is
1818 * created and added to the i/o chain.
1819 *
1820 * I/O directed to physically contiguous memory
1821 * doesn't have a requirement to make sure we 'fill' a page
1822 */
1823 if (!(flags & CL_DEV_MEMORY) && trans_count >= max_trans_count &&
1824 ((upl_offset + io_size) & PAGE_MASK)) {
1825 vm_offset_t aligned_ofs;
1826
1827 aligned_ofs = (upl_offset + io_size) & ~PAGE_MASK;
1828 /*
1829 * If the io_size does not actually finish off even a
1830 * single page we have to keep adding buffers to the
1831 * transaction despite having reached the desired limit.
1832 *
1833 * Eventually we get here with the page being finished
1834 * off (and exceeded) and then we truncate the size of
1835 * this i/o request so that it is page aligned so that
1836 * we can finally issue the i/o on the transaction.
1837 */
1838 if (aligned_ofs > upl_offset) {
1839 io_size = (u_int)(aligned_ofs - upl_offset);
1840 pg_count--;
1841 }
1842 }
1843
1844 if (!(mp->mnt_kern_flag & MNTK_VIRTUALDEV)) {
1845 /*
1846 * if we're not targeting a virtual device i.e. a disk image
1847 * it's safe to dip into the reserve pool since real devices
1848 * can complete this I/O request without requiring additional
1849 * bufs from the alloc_io_buf pool
1850 */
1851 priv = 1;
1852 } else if ((flags & CL_ASYNC) && !(flags & CL_PAGEOUT) && !cbp_head) {
1853 /*
1854 * Throttle the speculative IO
1855 *
1856 * We can only throttle this if it is the first iobuf
1857 * for the transaction. alloc_io_buf implements
1858 * additional restrictions for diskimages anyway.
1859 */
1860 priv = 0;
1861 } else {
1862 priv = 1;
1863 }
1864
1865 cbp = alloc_io_buf(vp, priv);
1866
1867 if (flags & CL_PAGEOUT) {
1868 u_int i;
1869
1870 /*
1871 * since blocks are in offsets of lblksize (CLUSTER_IO_BLOCK_SIZE), scale
1872 * iteration to (PAGE_SIZE * pg_count) of blks.
1873 */
1874 for (i = 0; i < (PAGE_SIZE * pg_count) / lblksize; i++) {
1875 if (buf_invalblkno(vp, lblkno + i, 0) == EBUSY) {
1876 panic("BUSY bp found in cluster_io");
1877 }
1878 }
1879 }
1880 if (flags & CL_ASYNC) {
1881 if (buf_setcallback(cbp, (void *)cluster_iodone, callback_arg)) {
1882 panic("buf_setcallback failed");
1883 }
1884 }
1885 cbp->b_cliodone = (void *)callback;
1886 cbp->b_flags |= io_flags;
1887 if (flags & CL_NOCACHE) {
1888 cbp->b_attr.ba_flags |= BA_NOCACHE;
1889 }
1890 if (verify_block_size) {
1891 cbp->b_attr.ba_flags |= BA_WILL_VERIFY;
1892 }
1893
1894 cbp->b_lblkno = lblkno;
1895 cbp->b_lblksize = lblksize;
1896 cbp->b_blkno = blkno;
1897 cbp->b_bcount = io_size;
1898
1899 if (buf_setupl(cbp, upl, (uint32_t)upl_offset)) {
1900 panic("buf_setupl failed");
1901 }
1902 #if CONFIG_IOSCHED
1903 upl_set_blkno(upl, upl_offset, io_size, blkno);
1904 #endif
1905 cbp->b_trans_next = (buf_t)NULL;
1906
1907 if ((cbp->b_iostate = (void *)iostate)) {
1908 /*
1909 * caller wants to track the state of this
1910 * io... bump the amount issued against this stream
1911 */
1912 iostate->io_issued += io_size;
1913 }
1914
1915 if (flags & CL_READ) {
1916 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 26)) | DBG_FUNC_NONE,
1917 (int)cbp->b_lblkno, (int)cbp->b_blkno, upl_offset, io_size, 0);
1918 } else {
1919 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 27)) | DBG_FUNC_NONE,
1920 (int)cbp->b_lblkno, (int)cbp->b_blkno, upl_offset, io_size, 0);
1921 }
1922
1923 if (cbp_head) {
1924 cbp_tail->b_trans_next = cbp;
1925 cbp_tail = cbp;
1926 } else {
1927 cbp_head = cbp;
1928 cbp_tail = cbp;
1929
1930 if ((cbp_head->b_real_bp = real_bp)) {
1931 real_bp = (buf_t)NULL;
1932 }
1933 }
1934 *(buf_t *)(&cbp->b_trans_head) = cbp_head;
1935
1936 trans_count++;
1937
1938 upl_offset += io_size;
1939 f_offset += io_size;
1940 size -= io_size;
1941 /*
1942 * keep track of how much of the original request
1943 * that we've actually completed... non_rounded_size
1944 * may go negative due to us rounding the request
1945 * to a page size multiple (i.e. size > non_rounded_size)
1946 */
1947 non_rounded_size -= io_size;
1948
1949 if (non_rounded_size <= 0) {
1950 /*
1951 * we've transferred all of the data in the original
1952 * request, but we were unable to complete the tail
1953 * of the last page because the file didn't have
1954 * an allocation to back that portion... this is ok.
1955 */
1956 size = 0;
1957 }
1958 if (size == 0) {
1959 /*
1960 * we have no more I/O to issue, so go
1961 * finish the final transaction
1962 */
1963 need_EOT = TRUE;
1964 } else if (((flags & CL_DEV_MEMORY) || (upl_offset & PAGE_MASK) == 0) &&
1965 ((flags & CL_ASYNC) || trans_count > max_trans_count)) {
1966 /*
1967 * I/O directed to physically contiguous memory...
1968 * which doesn't have a requirement to make sure we 'fill' a page
1969 * or...
1970 * the current I/O we've prepared fully
1971 * completes the last page in this request
1972 * and ...
1973 * it's either an ASYNC request or
1974 * we've already accumulated more than 8 I/O's into
1975 * this transaction so mark it as complete so that
1976 * it can finish asynchronously or via the cluster_complete_transaction
1977 * below if the request is synchronous
1978 */
1979 need_EOT = TRUE;
1980 }
1981 if (need_EOT == TRUE) {
1982 cluster_EOT(cbp_head, cbp_tail, size == 0 ? zero_offset : 0, verify_block_size);
1983 }
1984
1985 if (flags & CL_THROTTLE) {
1986 (void)vnode_waitforwrites(vp, async_throttle, 0, 0, "cluster_io");
1987 }
1988
1989 if (!(io_flags & B_READ)) {
1990 vnode_startwrite(vp);
1991 }
1992
1993 if (flags & CL_RAW_ENCRYPTED) {
1994 /*
1995 * User requested raw encrypted bytes.
1996 * Twiddle the bit in the ba_flags for the buffer
1997 */
1998 cbp->b_attr.ba_flags |= BA_RAW_ENCRYPTED_IO;
1999 }
2000
2001 (void) VNOP_STRATEGY(cbp);
2002
2003 if (need_EOT == TRUE) {
2004 if (!(flags & CL_ASYNC)) {
2005 cluster_complete_transaction(&cbp_head, callback_arg, &retval, flags, 1);
2006 }
2007
2008 need_EOT = FALSE;
2009 trans_count = 0;
2010 cbp_head = NULL;
2011 }
2012 }
2013 if (error) {
2014 int abort_size;
2015
2016 io_size = 0;
2017
2018 if (cbp_head) {
2019 /*
2020 * Wait until all of the outstanding I/O
2021 * for this partial transaction has completed
2022 */
2023 cluster_wait_IO(cbp_head, (flags & CL_ASYNC));
2024
2025 /*
2026 * Rewind the upl offset to the beginning of the
2027 * transaction.
2028 */
2029 upl_offset = cbp_head->b_uploffset;
2030 }
2031
2032 if (ISSET(flags, CL_COMMIT)) {
2033 cluster_handle_associated_upl(iostate, upl,
2034 (upl_offset_t)upl_offset,
2035 (upl_size_t)(upl_end_offset - upl_offset));
2036 }
2037
2038 // Free all the IO buffers in this transaction
2039 for (cbp = cbp_head; cbp;) {
2040 buf_t cbp_next;
2041
2042 size += cbp->b_bcount;
2043 io_size += cbp->b_bcount;
2044
2045 cbp_next = cbp->b_trans_next;
2046 free_io_buf(cbp);
2047 cbp = cbp_next;
2048 }
2049
2050 if (iostate) {
2051 int need_wakeup = 0;
2052
2053 /*
2054 * update the error condition for this stream
2055 * since we never really issued the io
2056 * just go ahead and adjust it back
2057 */
2058 lck_mtx_lock_spin(&iostate->io_mtxp);
2059
2060 if (iostate->io_error == 0) {
2061 iostate->io_error = error;
2062 }
2063 iostate->io_issued -= io_size;
2064
2065 if (iostate->io_wanted) {
2066 /*
2067 * someone is waiting for the state of
2068 * this io stream to change
2069 */
2070 iostate->io_wanted = 0;
2071 need_wakeup = 1;
2072 }
2073 lck_mtx_unlock(&iostate->io_mtxp);
2074
2075 if (need_wakeup) {
2076 wakeup((caddr_t)&iostate->io_wanted);
2077 }
2078 }
2079
2080 if (flags & CL_COMMIT) {
2081 int upl_flags;
2082
2083 pg_offset = upl_offset & PAGE_MASK;
2084 abort_size = (int)((upl_end_offset - upl_offset + PAGE_MASK) & ~PAGE_MASK);
2085
2086 upl_flags = cluster_ioerror(upl, (int)(upl_offset - pg_offset),
2087 abort_size, error, io_flags, vp);
2088
2089 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 28)) | DBG_FUNC_NONE,
2090 upl, upl_offset - pg_offset, abort_size, (error << 24) | upl_flags, 0);
2091 }
2092 if (retval == 0) {
2093 retval = error;
2094 }
2095 } else if (cbp_head) {
2096 panic("%s(): cbp_head is not NULL.", __FUNCTION__);
2097 }
2098
2099 if (real_bp) {
2100 /*
2101 * can get here if we either encountered an error
2102 * or we completely zero-filled the request and
2103 * no I/O was issued
2104 */
2105 if (error) {
2106 real_bp->b_flags |= B_ERROR;
2107 real_bp->b_error = error;
2108 }
2109 buf_biodone(real_bp);
2110 }
2111 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_END, (int)f_offset, size, upl_offset, retval, 0);
2112
2113 return retval;
2114 }
2115
/*
 * Reset the bookkeeping used while accumulating sub-UPLs into a single
 * vectored I/O (see vector_cluster_io()).  Wrapped in do/while(0) so the
 * macro expands to exactly one statement: the previous form carried its
 * own trailing semicolon, so call sites written as
 * "reset_vector_run_state();" expanded to two statements, which is
 * unsafe in an unbraced if/else body.
 */
#define reset_vector_run_state()                                                \
	do {                                                                    \
		issueVectorUPL = vector_upl_offset = vector_upl_index =         \
		    vector_upl_iosize = vector_upl_size = 0;                    \
	} while (0)
2118
2119 static int
vector_cluster_io(vnode_t vp,upl_t vector_upl,vm_offset_t vector_upl_offset,off_t v_upl_uio_offset,int vector_upl_iosize,int io_flag,buf_t real_bp,struct clios * iostate,int (* callback)(buf_t,void *),void * callback_arg)2120 vector_cluster_io(vnode_t vp, upl_t vector_upl, vm_offset_t vector_upl_offset, off_t v_upl_uio_offset, int vector_upl_iosize,
2121 int io_flag, buf_t real_bp, struct clios *iostate, int (*callback)(buf_t, void *), void *callback_arg)
2122 {
2123 vector_upl_set_pagelist(vector_upl);
2124
2125 if (io_flag & CL_READ) {
2126 if (vector_upl_offset == 0 && ((vector_upl_iosize & PAGE_MASK) == 0)) {
2127 io_flag &= ~CL_PRESERVE; /*don't zero fill*/
2128 } else {
2129 io_flag |= CL_PRESERVE; /*zero fill*/
2130 }
2131 }
2132 return cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, real_bp, iostate, callback, callback_arg);
2133 }
2134
2135 static int
cluster_read_prefetch(vnode_t vp,off_t f_offset,u_int size,off_t filesize,int (* callback)(buf_t,void *),void * callback_arg,int bflag)2136 cluster_read_prefetch(vnode_t vp, off_t f_offset, u_int size, off_t filesize, int (*callback)(buf_t, void *), void *callback_arg, int bflag)
2137 {
2138 int pages_in_prefetch;
2139
2140 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_START,
2141 (int)f_offset, size, (int)filesize, 0, 0);
2142
2143 if (f_offset >= filesize) {
2144 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
2145 (int)f_offset, 0, 0, 0, 0);
2146 return 0;
2147 }
2148 if ((off_t)size > (filesize - f_offset)) {
2149 size = (u_int)(filesize - f_offset);
2150 }
2151 pages_in_prefetch = (size + (PAGE_SIZE - 1)) / PAGE_SIZE;
2152
2153 advisory_read_ext(vp, filesize, f_offset, size, callback, callback_arg, bflag);
2154
2155 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
2156 (int)f_offset + size, pages_in_prefetch, 0, 1, 0);
2157
2158 return pages_in_prefetch;
2159 }
2160
2161
2162
/*
 * Sequential read-ahead engine: given the extent (in logical pages) of
 * the read currently being serviced, decide whether to issue a
 * speculative prefetch beyond it via cluster_read_prefetch(), and update
 * the per-vnode read-ahead state in 'rap':
 *   cl_lastr  last logical block read (maintained by the caller)
 *   cl_ralen  current read-ahead run length, in pages
 *   cl_maxra  highest page read ahead so far
 * The read-ahead window doubles on each successful extension up to
 * max_prefetch, and is collapsed to zero when the access pattern stops
 * looking sequential.
 */
static void
cluster_read_ahead(vnode_t vp, struct cl_extent *extent, off_t filesize, struct cl_readahead *rap, int (*callback)(buf_t, void *), void *callback_arg,
    int bflag)
{
	daddr64_t r_addr;
	off_t f_offset;
	int size_of_prefetch;
	u_int max_prefetch;


	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_START,
	    (int)extent->b_addr, (int)extent->e_addr, (int)rap->cl_lastr, 0, 0);

	/*
	 * single-page read of the same block we saw last time...
	 * no new information, leave the read-ahead state untouched
	 */
	if (extent->b_addr == rap->cl_lastr && extent->b_addr == extent->e_addr) {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
		    rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 0, 0);
		return;
	}
	/*
	 * this read neither repeats nor immediately follows the last one,
	 * so the stream is not sequential... reset the window and bail
	 */
	if (rap->cl_lastr == -1 || (extent->b_addr != rap->cl_lastr && extent->b_addr != (rap->cl_lastr + 1))) {
		rap->cl_ralen = 0;
		rap->cl_maxra = 0;

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
		    rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 1, 0);

		return;
	}

	max_prefetch = cluster_max_prefetch(vp,
	    cluster_max_io_size(vp->v_mount, CL_READ), speculative_prefetch_max);

	/* a window of one page or less isn't worth prefetching */
	if (max_prefetch <= PAGE_SIZE) {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
		    rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 6, 0);
		return;
	}
	/*
	 * the current read still trails what has already been read ahead...
	 * if more than a quarter of the run remains in front of it, there
	 * is no need to extend the window yet
	 */
	if (extent->e_addr < rap->cl_maxra && rap->cl_ralen >= 4) {
		if ((rap->cl_maxra - extent->e_addr) > (rap->cl_ralen / 4)) {
			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
			    rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 2, 0);
			return;
		}
	}
	/* next prefetch starts one page past the further of the current read end / previous read-ahead high-water mark */
	r_addr = MAX(extent->e_addr, rap->cl_maxra) + 1;
	f_offset = (off_t)(r_addr * PAGE_SIZE_64);

	size_of_prefetch = 0;

	/* probe whether the first target page is already present in the cache */
	ubc_range_op(vp, f_offset, f_offset + PAGE_SIZE_64, UPL_ROP_PRESENT, &size_of_prefetch);

	if (size_of_prefetch) {
		/* page already resident... nothing to prefetch */
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
		    rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 3, 0);
		return;
	}
	if (f_offset < filesize) {
		daddr64_t read_size;

		/* double the run length (starting from 1), capped at max_prefetch pages */
		rap->cl_ralen = rap->cl_ralen ? min(max_prefetch / PAGE_SIZE, rap->cl_ralen << 1) : 1;

		read_size = (extent->e_addr + 1) - extent->b_addr;

		/* never read ahead less than the size of the current read itself */
		if (read_size > rap->cl_ralen) {
			if (read_size > max_prefetch / PAGE_SIZE) {
				rap->cl_ralen = max_prefetch / PAGE_SIZE;
			} else {
				rap->cl_ralen = (int)read_size;
			}
		}
		size_of_prefetch = cluster_read_prefetch(vp, f_offset, rap->cl_ralen * PAGE_SIZE, filesize, callback, callback_arg, bflag);

		if (size_of_prefetch) {
			rap->cl_maxra = (r_addr + size_of_prefetch) - 1;
		}
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
	    rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 4, 0);
}
2241
2242
2243 int
cluster_pageout(vnode_t vp,upl_t upl,upl_offset_t upl_offset,off_t f_offset,int size,off_t filesize,int flags)2244 cluster_pageout(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset,
2245 int size, off_t filesize, int flags)
2246 {
2247 return cluster_pageout_ext(vp, upl, upl_offset, f_offset, size, filesize, flags, NULL, NULL);
2248 }
2249
2250
/*
 * Extended page-out entry point: push 'size' bytes of the upl, starting
 * at 'upl_offset' / file offset 'f_offset', out to the file via
 * cluster_io().
 *
 * 'flags' are UPL_* paging flags from the VM system and are translated
 * into the corresponding CL_* flags.  'callback' (with 'callback_arg')
 * is an optional per-buffer completion callback.
 *
 * Returns 0 or an errno-style value; on a rejected request the upl
 * range is aborted here when commit responsibility (CL_COMMIT) is ours.
 */
int
cluster_pageout_ext(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset,
    int size, off_t filesize, int flags, int (*callback)(buf_t, void *), void *callback_arg)
{
	int io_size;
	int rounded_size;
	off_t max_size;
	int local_flags;

	local_flags = CL_PAGEOUT | CL_THROTTLE;

	/* UPL_IOSYNC forces a synchronous write; otherwise issue it async */
	if ((flags & UPL_IOSYNC) == 0) {
		local_flags |= CL_ASYNC;
	}
	if ((flags & UPL_NOCOMMIT) == 0) {
		local_flags |= CL_COMMIT;
	}
	if ((flags & UPL_KEEPCACHED)) {
		local_flags |= CL_KEEPCACHED;
	}
	if (flags & UPL_PAGING_ENCRYPTED) {
		local_flags |= CL_ENCRYPTED;
	}


	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 52)) | DBG_FUNC_NONE,
	    (int)f_offset, size, (int)filesize, local_flags, 0);

	/*
	 * If they didn't specify any I/O, then we are done...
	 * we can't issue an abort because we don't know how
	 * big the upl really is
	 */
	if (size <= 0) {
		return EINVAL;
	}

	/* refuse to write to a read-only mount */
	if (vp->v_mount->mnt_flag & MNT_RDONLY) {
		if (local_flags & CL_COMMIT) {
			ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
		}
		return EROFS;
	}
	/*
	 * can't page-out to a negative offset
	 * or if we're starting beyond the EOF
	 * or if the file offset isn't page aligned
	 * or the size requested isn't a multiple of PAGE_SIZE
	 */
	if (f_offset < 0 || f_offset >= filesize ||
	    (f_offset & PAGE_MASK_64) || (size & PAGE_MASK)) {
		if (local_flags & CL_COMMIT) {
			ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
		}
		return EINVAL;
	}
	/* clip the I/O so it does not extend past the end of the file */
	max_size = filesize - f_offset;

	if (size < max_size) {
		io_size = size;
	} else {
		io_size = (int)max_size;
	}

	rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;

	/* abort (without error) any whole pages beyond the rounded I/O size */
	if (size > rounded_size) {
		if (local_flags & CL_COMMIT) {
			ubc_upl_abort_range(upl, upl_offset + rounded_size, size - rounded_size,
			    UPL_ABORT_FREE_ON_EMPTY);
		}
	}
	return cluster_io(vp, upl, upl_offset, f_offset, io_size,
	           local_flags, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
}
2326
2327
2328 int
cluster_pagein(vnode_t vp,upl_t upl,upl_offset_t upl_offset,off_t f_offset,int size,off_t filesize,int flags)2329 cluster_pagein(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset,
2330 int size, off_t filesize, int flags)
2331 {
2332 return cluster_pagein_ext(vp, upl, upl_offset, f_offset, size, filesize, flags, NULL, NULL);
2333 }
2334
2335
/*
 * Extended page-in entry point: read 'size' bytes into the upl at
 * 'upl_offset' from the file starting at 'f_offset' via cluster_io().
 *
 * 'flags' are UPL_* paging flags from the VM system and are translated
 * into the corresponding CL_* flags.  'callback' (with 'callback_arg')
 * is an optional per-buffer completion callback.
 *
 * Returns 0 or an errno-style value; on a rejected request the upl
 * range is aborted with UPL_ABORT_ERROR when CL_COMMIT is set.
 */
int
cluster_pagein_ext(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset,
    int size, off_t filesize, int flags, int (*callback)(buf_t, void *), void *callback_arg)
{
	u_int io_size;
	int rounded_size;
	off_t max_size;
	int retval;
	int local_flags = 0;

	/* NOTE(review): the condition also rejects size < 0, not only a NULL upl,
	 * although the panic message mentions only the latter */
	if (upl == NULL || size < 0) {
		panic("cluster_pagein: NULL upl passed in");
	}

	/* UPL_IOSYNC forces a synchronous read; otherwise issue it async */
	if ((flags & UPL_IOSYNC) == 0) {
		local_flags |= CL_ASYNC;
	}
	if ((flags & UPL_NOCOMMIT) == 0) {
		local_flags |= CL_COMMIT;
	}
	if (flags & UPL_IOSTREAMING) {
		local_flags |= CL_IOSTREAMING;
	}
	if (flags & UPL_PAGING_ENCRYPTED) {
		local_flags |= CL_ENCRYPTED;
	}


	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 56)) | DBG_FUNC_NONE,
	    (int)f_offset, size, (int)filesize, local_flags, 0);

	/*
	 * can't page-in from a negative offset
	 * or if we're starting beyond the EOF
	 * or if the file offset isn't page aligned
	 * or the size requested isn't a multiple of PAGE_SIZE
	 */
	if (f_offset < 0 || f_offset >= filesize ||
	    (f_offset & PAGE_MASK_64) || (size & PAGE_MASK) || (upl_offset & PAGE_MASK)) {
		if (local_flags & CL_COMMIT) {
			ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
		}

		/* leave a triage breadcrumb for page-ins attempted at/past EOF */
		if (f_offset >= filesize) {
			ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_CLUSTER, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_CL_PGIN_PAST_EOF), 0 /* arg */);
		}

		return EINVAL;
	}
	/* clip the I/O so it does not extend past the end of the file */
	max_size = filesize - f_offset;

	if (size < max_size) {
		io_size = size;
	} else {
		io_size = (int)max_size;
	}

	rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;

	/* abort (with error) any whole pages beyond the rounded I/O size */
	if (size > rounded_size && (local_flags & CL_COMMIT)) {
		ubc_upl_abort_range(upl, upl_offset + rounded_size,
		    size - rounded_size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
	}

	retval = cluster_io(vp, upl, upl_offset, f_offset, io_size,
	    local_flags | CL_READ | CL_PAGEIN, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);

	return retval;
}
2405
2406
2407 int
cluster_bp(buf_t bp)2408 cluster_bp(buf_t bp)
2409 {
2410 return cluster_bp_ext(bp, NULL, NULL);
2411 }
2412
2413
2414 int
cluster_bp_ext(buf_t bp,int (* callback)(buf_t,void *),void * callback_arg)2415 cluster_bp_ext(buf_t bp, int (*callback)(buf_t, void *), void *callback_arg)
2416 {
2417 off_t f_offset;
2418 int flags;
2419
2420 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 19)) | DBG_FUNC_START,
2421 bp, (int)bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);
2422
2423 if (bp->b_flags & B_READ) {
2424 flags = CL_ASYNC | CL_READ;
2425 } else {
2426 flags = CL_ASYNC;
2427 }
2428 if (bp->b_flags & B_PASSIVE) {
2429 flags |= CL_PASSIVE;
2430 }
2431
2432 f_offset = ubc_blktooff(bp->b_vp, bp->b_lblkno);
2433
2434 return cluster_io(bp->b_vp, bp->b_upl, 0, f_offset, bp->b_bcount, flags, bp, (struct clios *)NULL, callback, callback_arg);
2435 }
2436
2437
2438
2439 int
cluster_write(vnode_t vp,struct uio * uio,off_t oldEOF,off_t newEOF,off_t headOff,off_t tailOff,int xflags)2440 cluster_write(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, off_t headOff, off_t tailOff, int xflags)
2441 {
2442 return cluster_write_ext(vp, uio, oldEOF, newEOF, headOff, tailOff, xflags, NULL, NULL);
2443 }
2444
2445
/*
 * Extended cluster write: move the data described by 'uio' into the file
 * (or zero-fill a range when uio is NULL), dispatching each chunk to the
 * cached, direct or contiguous write path.
 *
 * oldEOF / newEOF    file size before / after this write
 * headOff / tailOff  offsets used for IO_HEADZEROFILL / IO_TAILZEROFILL
 * xflags             IO_* control flags from the caller
 * callback           optional per-buffer completion callback (with arg)
 *
 * Each pass through the loop dispatches on write_type, which may be
 * re-classified per uio vector by cluster_io_type():
 *   IO_COPY     buffered write through the cache (cluster_write_copy)
 *   IO_DIRECT   direct I/O (cluster_write_direct)
 *   IO_CONTIG   write from physically contiguous memory (cluster_write_contig)
 *   IO_UNKNOWN  re-evaluate the next uio vector
 *
 * Returns 0 or an errno-style value from the underlying write path.
 */
int
cluster_write_ext(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, off_t headOff, off_t tailOff,
    int xflags, int (*callback)(buf_t, void *), void *callback_arg)
{
	user_ssize_t cur_resid;
	int retval = 0;
	int flags;
	int zflags;
	int bflag;
	int write_type = IO_COPY;
	u_int32_t write_length;

	flags = xflags;

	if (flags & IO_PASSIVE) {
		bflag = CL_PASSIVE;
	} else {
		bflag = 0;
	}

	/* honor a per-vnode request to bypass the cache */
	if (vp->v_flag & VNOCACHE_DATA) {
		flags |= IO_NOCACHE;
		bflag |= CL_NOCACHE;
	}
	if (uio == NULL) {
		/*
		 * no user data...
		 * this call is being made to zero-fill some range in the file
		 */
		retval = cluster_write_copy(vp, NULL, (u_int32_t)0, oldEOF, newEOF, headOff, tailOff, flags, callback, callback_arg);

		return retval;
	}
	/*
	 * do a write through the cache if one of the following is true....
	 *   NOCACHE is not true or NODIRECT is true
	 *   the uio request doesn't target USERSPACE
	 * otherwise, find out if we want the direct or contig variant for
	 * the first vector in the uio request
	 */
	if (((flags & (IO_NOCACHE | IO_NODIRECT)) == IO_NOCACHE) && UIO_SEG_IS_USER_SPACE(uio->uio_segflg)) {
		retval = cluster_io_type(uio, &write_type, &write_length, MIN_DIRECT_WRITE_SIZE);
	}

	if ((flags & (IO_TAILZEROFILL | IO_HEADZEROFILL)) && write_type == IO_DIRECT) {
		/*
		 * must go through the cached variant in this case
		 */
		write_type = IO_COPY;
	}

	while ((cur_resid = uio_resid(uio)) && uio->uio_offset < newEOF && retval == 0) {
		switch (write_type) {
		case IO_COPY:
			/*
			 * make sure the uio_resid isn't too big...
			 * internally, we want to handle all of the I/O in
			 * chunk sizes that fit in a 32 bit int
			 */
			if (cur_resid > (user_ssize_t)(MAX_IO_REQUEST_SIZE)) {
				/*
				 * we're going to have to call cluster_write_copy
				 * more than once...
				 *
				 * only want the last call to cluster_write_copy to
				 * have the IO_TAILZEROFILL flag set and only the
				 * first call should have IO_HEADZEROFILL
				 */
				zflags = flags & ~IO_TAILZEROFILL;
				flags &= ~IO_HEADZEROFILL;

				write_length = MAX_IO_REQUEST_SIZE;
			} else {
				/*
				 * last call to cluster_write_copy
				 */
				zflags = flags;

				write_length = (u_int32_t)cur_resid;
			}
			retval = cluster_write_copy(vp, uio, write_length, oldEOF, newEOF, headOff, tailOff, zflags, callback, callback_arg);
			break;

		case IO_CONTIG:
			/* head/tail zero-fill is handled here, not by cluster_write_contig */
			zflags = flags & ~(IO_TAILZEROFILL | IO_HEADZEROFILL);

			if (flags & IO_HEADZEROFILL) {
				/*
				 * only do this once per request
				 */
				flags &= ~IO_HEADZEROFILL;

				retval = cluster_write_copy(vp, (struct uio *)0, (u_int32_t)0, (off_t)0, uio->uio_offset,
				    headOff, (off_t)0, zflags | IO_HEADZEROFILL | IO_SYNC, callback, callback_arg);
				if (retval) {
					break;
				}
			}
			retval = cluster_write_contig(vp, uio, newEOF, &write_type, &write_length, callback, callback_arg, bflag);

			if (retval == 0 && (flags & IO_TAILZEROFILL) && uio_resid(uio) == 0) {
				/*
				 * we're done with the data from the user specified buffer(s)
				 * and we've been requested to zero fill at the tail
				 * treat this as an IO_HEADZEROFILL which doesn't require a uio
				 * by rearranging the args and passing in IO_HEADZEROFILL
				 */

				/*
				 * Update the oldEOF to reflect the current EOF. If the UPL page
				 * to zero-fill is not valid (when F_NOCACHE is set), the
				 * cluster_write_copy() will perform RMW on the UPL page when
				 * the oldEOF is not aligned on page boundary due to unaligned
				 * write.
				 */
				if (uio->uio_offset > oldEOF) {
					oldEOF = uio->uio_offset;
				}
				retval = cluster_write_copy(vp, (struct uio *)0, (u_int32_t)0, (off_t)oldEOF, tailOff, uio->uio_offset,
				    (off_t)0, zflags | IO_HEADZEROFILL | IO_SYNC, callback, callback_arg);
			}
			break;

		case IO_DIRECT:
			/*
			 * cluster_write_direct is never called with IO_TAILZEROFILL || IO_HEADZEROFILL
			 */
			retval = cluster_write_direct(vp, uio, oldEOF, newEOF, &write_type, &write_length, flags, callback, callback_arg);
			break;

		case IO_UNKNOWN:
			/* classify the next uio vector to pick the write path */
			retval = cluster_io_type(uio, &write_type, &write_length, MIN_DIRECT_WRITE_SIZE);
			break;
		}
		/*
		 * in case we end up calling cluster_write_copy (from cluster_write_direct)
		 * multiple times to service a multi-vector request that is not aligned properly
		 * we need to update the oldEOF so that we
		 * don't zero-fill the head of a page if we've successfully written
		 * data to that area... 'cluster_write_copy' will zero-fill the head of a
		 * page that is beyond the oldEOF if the write is unaligned... we only
		 * want that to happen for the very first page of the cluster_write,
		 * NOT the first page of each vector making up a multi-vector write.
		 */
		if (uio->uio_offset > oldEOF) {
			oldEOF = uio->uio_offset;
		}
	}
	return retval;
}
2596
2597
2598 static int
cluster_write_direct(vnode_t vp,struct uio * uio,off_t oldEOF,off_t newEOF,int * write_type,u_int32_t * write_length,int flags,int (* callback)(buf_t,void *),void * callback_arg)2599 cluster_write_direct(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, int *write_type, u_int32_t *write_length,
2600 int flags, int (*callback)(buf_t, void *), void *callback_arg)
2601 {
2602 upl_t upl = NULL;
2603 upl_page_info_t *pl;
2604 vm_offset_t upl_offset;
2605 vm_offset_t vector_upl_offset = 0;
2606 u_int32_t io_req_size;
2607 u_int32_t offset_in_file;
2608 u_int32_t offset_in_iovbase;
2609 u_int32_t io_size;
2610 int io_flag = 0;
2611 upl_size_t upl_size = 0, vector_upl_size = 0;
2612 vm_size_t upl_needed_size;
2613 mach_msg_type_number_t pages_in_pl;
2614 upl_control_flags_t upl_flags;
2615 kern_return_t kret;
2616 mach_msg_type_number_t i;
2617 int force_data_sync;
2618 int retval = 0;
2619 int first_IO = 1;
2620 struct clios iostate;
2621 user_addr_t iov_base;
2622 u_int32_t mem_alignment_mask;
2623 u_int32_t devblocksize;
2624 u_int32_t max_io_size;
2625 u_int32_t max_upl_size;
2626 u_int32_t max_vector_size;
2627 u_int32_t bytes_outstanding_limit;
2628 boolean_t io_throttled = FALSE;
2629
2630 u_int32_t vector_upl_iosize = 0;
2631 int issueVectorUPL = 0, useVectorUPL = (uio->uio_iovcnt > 1);
2632 off_t v_upl_uio_offset = 0;
2633 int vector_upl_index = 0;
2634 upl_t vector_upl = NULL;
2635
2636
2637 /*
2638 * When we enter this routine, we know
2639 * -- the resid will not exceed iov_len
2640 */
2641 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_START,
2642 (int)uio->uio_offset, *write_length, (int)newEOF, 0, 0);
2643
2644 assert(vm_map_page_shift(current_map()) >= PAGE_SHIFT);
2645
2646 max_upl_size = cluster_max_io_size(vp->v_mount, CL_WRITE);
2647
2648 io_flag = CL_ASYNC | CL_PRESERVE | CL_COMMIT | CL_THROTTLE | CL_DIRECT_IO;
2649
2650 if (flags & IO_PASSIVE) {
2651 io_flag |= CL_PASSIVE;
2652 }
2653
2654 if (flags & IO_NOCACHE) {
2655 io_flag |= CL_NOCACHE;
2656 }
2657
2658 if (flags & IO_SKIP_ENCRYPTION) {
2659 io_flag |= CL_ENCRYPTED;
2660 }
2661
2662 iostate.io_completed = 0;
2663 iostate.io_issued = 0;
2664 iostate.io_error = 0;
2665 iostate.io_wanted = 0;
2666
2667 lck_mtx_init(&iostate.io_mtxp, &cl_mtx_grp, LCK_ATTR_NULL);
2668
2669 mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask;
2670 devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize;
2671
2672 if (devblocksize == 1) {
2673 /*
2674 * the AFP client advertises a devblocksize of 1
2675 * however, its BLOCKMAP routine maps to physical
2676 * blocks that are PAGE_SIZE in size...
2677 * therefore we can't ask for I/Os that aren't page aligned
2678 * or aren't multiples of PAGE_SIZE in size
2679 * by setting devblocksize to PAGE_SIZE, we re-instate
2680 * the old behavior we had before the mem_alignment_mask
2681 * changes went in...
2682 */
2683 devblocksize = PAGE_SIZE;
2684 }
2685
2686 next_dwrite:
2687 io_req_size = *write_length;
2688 iov_base = uio_curriovbase(uio);
2689
2690 offset_in_file = (u_int32_t)uio->uio_offset & PAGE_MASK;
2691 offset_in_iovbase = (u_int32_t)iov_base & mem_alignment_mask;
2692
2693 if (offset_in_file || offset_in_iovbase) {
2694 /*
2695 * one of the 2 important offsets is misaligned
2696 * so fire an I/O through the cache for this entire vector
2697 */
2698 goto wait_for_dwrites;
2699 }
2700 if (iov_base & (devblocksize - 1)) {
2701 /*
2702 * the offset in memory must be on a device block boundary
2703 * so that we can guarantee that we can generate an
2704 * I/O that ends on a page boundary in cluster_io
2705 */
2706 goto wait_for_dwrites;
2707 }
2708
2709 task_update_logical_writes(current_task(), (io_req_size & ~PAGE_MASK), TASK_WRITE_IMMEDIATE, vp);
2710 while (io_req_size >= PAGE_SIZE && uio->uio_offset < newEOF && retval == 0) {
2711 int throttle_type;
2712
2713 if ((throttle_type = cluster_is_throttled(vp))) {
2714 uint32_t max_throttle_size = calculate_max_throttle_size(vp);
2715
2716 /*
2717 * we're in the throttle window, at the very least
2718 * we want to limit the size of the I/O we're about
2719 * to issue
2720 */
2721 if ((flags & IO_RETURN_ON_THROTTLE) && throttle_type == THROTTLE_NOW) {
2722 /*
2723 * we're in the throttle window and at least 1 I/O
2724 * has already been issued by a throttleable thread
2725 * in this window, so return with EAGAIN to indicate
2726 * to the FS issuing the cluster_write call that it
2727 * should now throttle after dropping any locks
2728 */
2729 throttle_info_update_by_mount(vp->v_mount);
2730
2731 io_throttled = TRUE;
2732 goto wait_for_dwrites;
2733 }
2734 max_vector_size = max_throttle_size;
2735 max_io_size = max_throttle_size;
2736 } else {
2737 max_vector_size = MAX_VECTOR_UPL_SIZE;
2738 max_io_size = max_upl_size;
2739 }
2740
2741 if (first_IO) {
2742 cluster_syncup(vp, newEOF, callback, callback_arg, callback ? PUSH_SYNC : 0);
2743 first_IO = 0;
2744 }
2745 io_size = io_req_size & ~PAGE_MASK;
2746 iov_base = uio_curriovbase(uio);
2747
2748 if (io_size > max_io_size) {
2749 io_size = max_io_size;
2750 }
2751
2752 if (useVectorUPL && (iov_base & PAGE_MASK)) {
2753 /*
2754 * We have an iov_base that's not page-aligned.
2755 * Issue all I/O's that have been collected within
2756 * this Vectored UPL.
2757 */
2758 if (vector_upl_index) {
2759 retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
2760 reset_vector_run_state();
2761 }
2762
2763 /*
2764 * After this point, if we are using the Vector UPL path and the base is
2765 * not page-aligned then the UPL with that base will be the first in the vector UPL.
2766 */
2767 }
2768
2769 upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK);
2770 upl_needed_size = (upl_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
2771
2772 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_START,
2773 (int)upl_offset, upl_needed_size, (int)iov_base, io_size, 0);
2774
2775 vm_map_t map = UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ? current_map() : kernel_map;
2776 for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) {
2777 pages_in_pl = 0;
2778 upl_size = (upl_size_t)upl_needed_size;
2779 upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
2780 UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
2781
2782 kret = vm_map_get_upl(map,
2783 (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
2784 &upl_size,
2785 &upl,
2786 NULL,
2787 &pages_in_pl,
2788 &upl_flags,
2789 VM_KERN_MEMORY_FILE,
2790 force_data_sync);
2791
2792 if (kret != KERN_SUCCESS) {
2793 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
2794 0, 0, 0, kret, 0);
2795 /*
2796 * failed to get pagelist
2797 *
2798 * we may have already spun some portion of this request
2799 * off as async requests... we need to wait for the I/O
2800 * to complete before returning
2801 */
2802 goto wait_for_dwrites;
2803 }
2804 pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
2805 pages_in_pl = upl_size / PAGE_SIZE;
2806
2807 for (i = 0; i < pages_in_pl; i++) {
2808 if (!upl_valid_page(pl, i)) {
2809 break;
2810 }
2811 }
2812 if (i == pages_in_pl) {
2813 break;
2814 }
2815
2816 /*
2817 * didn't get all the pages back that we
2818 * needed... release this upl and try again
2819 */
2820 ubc_upl_abort(upl, 0);
2821 }
2822 if (force_data_sync >= 3) {
2823 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
2824 i, pages_in_pl, upl_size, kret, 0);
2825 /*
2826 * for some reason, we couldn't acquire a hold on all
2827 * the pages needed in the user's address space
2828 *
2829 * we may have already spun some portion of this request
2830 * off as async requests... we need to wait for the I/O
2831 * to complete before returning
2832 */
2833 goto wait_for_dwrites;
2834 }
2835
2836 /*
2837 * Consider the possibility that upl_size wasn't satisfied.
2838 */
2839 if (upl_size < upl_needed_size) {
2840 if (upl_size && upl_offset == 0) {
2841 io_size = upl_size;
2842 } else {
2843 io_size = 0;
2844 }
2845 }
2846 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
2847 (int)upl_offset, upl_size, (int)iov_base, io_size, 0);
2848
2849 if (io_size == 0) {
2850 ubc_upl_abort(upl, 0);
2851 /*
2852 * we may have already spun some portion of this request
2853 * off as async requests... we need to wait for the I/O
2854 * to complete before returning
2855 */
2856 goto wait_for_dwrites;
2857 }
2858
2859 if (useVectorUPL) {
2860 vm_offset_t end_off = ((iov_base + io_size) & PAGE_MASK);
2861 if (end_off) {
2862 issueVectorUPL = 1;
2863 }
2864 /*
2865 * After this point, if we are using a vector UPL, then
2866 * either all the UPL elements end on a page boundary OR
2867 * this UPL is the last element because it does not end
2868 * on a page boundary.
2869 */
2870 }
2871
2872 /*
2873 * we want push out these writes asynchronously so that we can overlap
2874 * the preparation of the next I/O
2875 * if there are already too many outstanding writes
2876 * wait until some complete before issuing the next
2877 */
2878 if (vp->v_mount->mnt_minsaturationbytecount) {
2879 bytes_outstanding_limit = vp->v_mount->mnt_minsaturationbytecount;
2880 } else {
2881 if (__improbable(os_mul_overflow(max_upl_size, IO_SCALE(vp, 2),
2882 &bytes_outstanding_limit) ||
2883 (bytes_outstanding_limit > overlapping_write_max))) {
2884 bytes_outstanding_limit = overlapping_write_max;
2885 }
2886 }
2887
2888 cluster_iostate_wait(&iostate, bytes_outstanding_limit, "cluster_write_direct");
2889
2890 if (iostate.io_error) {
2891 /*
2892 * one of the earlier writes we issued ran into a hard error
2893 * don't issue any more writes, cleanup the UPL
2894 * that was just created but not used, then
2895 * go wait for all writes that are part of this stream
2896 * to complete before returning the error to the caller
2897 */
2898 ubc_upl_abort(upl, 0);
2899
2900 goto wait_for_dwrites;
2901 }
2902
2903 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_START,
2904 (int)upl_offset, (int)uio->uio_offset, io_size, io_flag, 0);
2905
2906 if (!useVectorUPL) {
2907 retval = cluster_io(vp, upl, upl_offset, uio->uio_offset,
2908 io_size, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
2909 } else {
2910 if (!vector_upl_index) {
2911 vector_upl = vector_upl_create(upl_offset, uio->uio_iovcnt);
2912 v_upl_uio_offset = uio->uio_offset;
2913 vector_upl_offset = upl_offset;
2914 }
2915
2916 vector_upl_set_subupl(vector_upl, upl, upl_size);
2917 vector_upl_set_iostate(vector_upl, upl, vector_upl_size, upl_size);
2918 vector_upl_index++;
2919 vector_upl_iosize += io_size;
2920 vector_upl_size += upl_size;
2921
2922 if (issueVectorUPL || vector_upl_index == vector_upl_max_upls(vector_upl) || vector_upl_size >= max_vector_size) {
2923 retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
2924 reset_vector_run_state();
2925 }
2926 }
2927
2928 /*
2929 * update the uio structure to
2930 * reflect the I/O that we just issued
2931 */
2932 uio_update(uio, (user_size_t)io_size);
2933
2934 /*
2935 * in case we end up calling through to cluster_write_copy to finish
2936 * the tail of this request, we need to update the oldEOF so that we
2937 * don't zero-fill the head of a page if we've successfully written
2938 * data to that area... 'cluster_write_copy' will zero-fill the head of a
2939 * page that is beyond the oldEOF if the write is unaligned... we only
2940 * want that to happen for the very first page of the cluster_write,
2941 * NOT the first page of each vector making up a multi-vector write.
2942 */
2943 if (uio->uio_offset > oldEOF) {
2944 oldEOF = uio->uio_offset;
2945 }
2946
2947 io_req_size -= io_size;
2948
2949 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_END,
2950 (int)upl_offset, (int)uio->uio_offset, io_req_size, retval, 0);
2951 } /* end while */
2952
2953 if (retval == 0 && iostate.io_error == 0 && io_req_size == 0) {
2954 retval = cluster_io_type(uio, write_type, write_length, MIN_DIRECT_WRITE_SIZE);
2955
2956 if (retval == 0 && *write_type == IO_DIRECT) {
2957 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_NONE,
2958 (int)uio->uio_offset, *write_length, (int)newEOF, 0, 0);
2959
2960 goto next_dwrite;
2961 }
2962 }
2963
2964 wait_for_dwrites:
2965
2966 if (retval == 0 && iostate.io_error == 0 && useVectorUPL && vector_upl_index) {
2967 retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
2968 reset_vector_run_state();
2969 }
2970 /*
2971 * make sure all async writes issued as part of this stream
2972 * have completed before we return
2973 */
2974 cluster_iostate_wait(&iostate, 0, "cluster_write_direct");
2975
2976 if (iostate.io_error) {
2977 retval = iostate.io_error;
2978 }
2979
2980 lck_mtx_destroy(&iostate.io_mtxp, &cl_mtx_grp);
2981
2982 if (io_throttled == TRUE && retval == 0) {
2983 retval = EAGAIN;
2984 }
2985
2986 if (io_req_size && retval == 0) {
2987 /*
2988 * we couldn't handle the tail of this request in DIRECT mode
2989 * so fire it through the copy path
2990 *
2991 * note that flags will never have IO_HEADZEROFILL or IO_TAILZEROFILL set
2992 * so we can just pass 0 in for the headOff and tailOff
2993 */
2994 if (uio->uio_offset > oldEOF) {
2995 oldEOF = uio->uio_offset;
2996 }
2997
2998 retval = cluster_write_copy(vp, uio, io_req_size, oldEOF, newEOF, (off_t)0, (off_t)0, flags, callback, callback_arg);
2999
3000 *write_type = IO_UNKNOWN;
3001 }
3002 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_END,
3003 (int)uio->uio_offset, io_req_size, retval, 4, 0);
3004
3005 return retval;
3006 }
3007
3008
/*
 * cluster_write_contig
 *
 * Write path used when the source buffer described by 'uio' is known to be
 * physically contiguous memory.  The data is pushed directly to the device
 * via cluster_io() with CL_DEV_MEMORY set, bypassing the buffer cache.
 * Portions of the transfer that are not device-block aligned (an unaligned
 * head, or a sub-block tail) are handled via cluster_align_phys_io().
 *
 * vp            -- vnode being written
 * uio           -- source buffer + file offset; advanced as I/O is issued
 * newEOF        -- file size once this write completes (passed to syncup/io)
 * write_type    -- in/out: classification of the next chunk of the request;
 *                  left as IO_CONTIG to loop here, set to IO_UNKNOWN to make
 *                  the caller re-evaluate
 * write_length  -- in/out: byte length of the current/next contiguous chunk
 * callback/callback_arg -- optional per-buf completion callback
 * bflag         -- CL_* modifier flags (e.g. CL_PASSIVE) OR'd into each I/O
 *
 * Returns 0 on success or an errno-style error.
 */
static int
cluster_write_contig(vnode_t vp, struct uio *uio, off_t newEOF, int *write_type, u_int32_t *write_length,
    int (*callback)(buf_t, void *), void *callback_arg, int bflag)
{
	upl_page_info_t *pl;
	addr64_t src_paddr = 0;
	upl_t upl[MAX_VECTS];		/* one UPL per contiguous chunk, up to MAX_VECTS chunks */
	vm_offset_t upl_offset;
	u_int32_t tail_size = 0;	/* sub-device-block residue written after the async stream */
	u_int32_t io_size;
	u_int32_t xsize;
	upl_size_t upl_size;
	vm_size_t upl_needed_size;
	mach_msg_type_number_t pages_in_pl;
	upl_control_flags_t upl_flags;
	kern_return_t kret;
	struct clios iostate;		/* bookkeeping for the async writes in this stream */
	int error = 0;
	int cur_upl = 0;
	int num_upl = 0;
	int n;
	user_addr_t iov_base;
	u_int32_t devblocksize;
	u_int32_t mem_alignment_mask;

	/*
	 * When we enter this routine, we know
	 *  -- the io_req_size will not exceed iov_len
	 *  -- the target address is physically contiguous
	 */
	cluster_syncup(vp, newEOF, callback, callback_arg, callback ? PUSH_SYNC : 0);

	devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize;
	mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask;

	iostate.io_completed = 0;
	iostate.io_issued = 0;
	iostate.io_error = 0;
	iostate.io_wanted = 0;

	/* protects the iostate counters shared with I/O completion */
	lck_mtx_init(&iostate.io_mtxp, &cl_mtx_grp, LCK_ATTR_NULL);

next_cwrite:
	/* each pass through here wires and writes one contiguous chunk */
	io_size = *write_length;

	iov_base = uio_curriovbase(uio);

	upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK);
	upl_needed_size = upl_offset + io_size;

	pages_in_pl = 0;
	upl_size = (upl_size_t)upl_needed_size;
	upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
	    UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;

	/*
	 * create a UPL describing (and IO-wiring) the source pages in the
	 * originating map -- user map for user I/O, kernel map otherwise
	 */
	vm_map_t map = UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ? current_map() : kernel_map;
	kret = vm_map_get_upl(map,
	    vm_map_trunc_page(iov_base, vm_map_page_mask(map)),
	    &upl_size, &upl[cur_upl], NULL, &pages_in_pl, &upl_flags, VM_KERN_MEMORY_FILE, 0);

	if (kret != KERN_SUCCESS) {
		/*
		 * failed to get pagelist
		 */
		error = EINVAL;
		goto wait_for_cwrites;
	}
	num_upl++;

	/*
	 * Consider the possibility that upl_size wasn't satisfied.
	 */
	if (upl_size < upl_needed_size) {
		/*
		 * This is a failure in the physical memory case.
		 */
		error = EINVAL;
		goto wait_for_cwrites;
	}
	pl = ubc_upl_pageinfo(upl[cur_upl]);

	/* physical address of the start of the source data */
	src_paddr = ((addr64_t)upl_phys_page(pl, 0) << PAGE_SHIFT) + (addr64_t)upl_offset;

	/*
	 * synchronously write out any head portion that doesn't start on a
	 * device block boundary (or a request smaller than one block)
	 */
	while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {
		u_int32_t head_size;

		head_size = devblocksize - (u_int32_t)(uio->uio_offset & (devblocksize - 1));

		if (head_size > io_size) {
			head_size = io_size;
		}

		error = cluster_align_phys_io(vp, uio, src_paddr, head_size, 0, callback, callback_arg);

		if (error) {
			goto wait_for_cwrites;
		}

		upl_offset += head_size;
		src_paddr += head_size;
		io_size -= head_size;

		iov_base += head_size;
	}
	if ((u_int32_t)iov_base & mem_alignment_mask) {
		/*
		 * request doesn't set up on a memory boundary
		 * the underlying DMA engine can handle...
		 * return an error instead of going through
		 * the slow copy path since the intent of this
		 * path is direct I/O from device memory
		 */
		error = EINVAL;
		goto wait_for_cwrites;
	}

	/* defer any sub-device-block tail until all async writes complete */
	tail_size = io_size & (devblocksize - 1);
	io_size -= tail_size;

	while (io_size && error == 0) {
		if (io_size > MAX_IO_CONTIG_SIZE) {
			xsize = MAX_IO_CONTIG_SIZE;
		} else {
			xsize = io_size;
		}
		/*
		 * request asynchronously so that we can overlap
		 * the preparation of the next I/O... we'll do
		 * the commit after all the I/O has completed
		 * since its all issued against the same UPL
		 * if there are already too many outstanding writes
		 * wait until some have completed before issuing the next
		 */
		cluster_iostate_wait(&iostate, MAX_IO_CONTIG_SIZE * IO_SCALE(vp, 2), "cluster_write_contig");

		if (iostate.io_error) {
			/*
			 * one of the earlier writes we issued ran into a hard error
			 * don't issue any more writes...
			 * go wait for all writes that are part of this stream
			 * to complete before returning the error to the caller
			 */
			goto wait_for_cwrites;
		}
		/*
		 * issue an asynchronous write to cluster_io
		 */
		error = cluster_io(vp, upl[cur_upl], upl_offset, uio->uio_offset,
		    xsize, CL_DEV_MEMORY | CL_ASYNC | bflag, (buf_t)NULL, (struct clios *)&iostate, callback, callback_arg);

		if (error == 0) {
			/*
			 * The cluster_io write completed successfully,
			 * update the uio structure
			 */
			uio_update(uio, (user_size_t)xsize);

			upl_offset += xsize;
			src_paddr += xsize;
			io_size -= xsize;
		}
	}
	/*
	 * if this chunk went out cleanly with no residual tail and we still
	 * have UPL slots, see if the next chunk is also contiguous and can
	 * be chained onto this stream... otherwise hand back IO_UNKNOWN
	 */
	if (error == 0 && iostate.io_error == 0 && tail_size == 0 && num_upl < MAX_VECTS) {
		error = cluster_io_type(uio, write_type, write_length, 0);

		if (error == 0 && *write_type == IO_CONTIG) {
			cur_upl++;
			goto next_cwrite;
		}
	} else {
		*write_type = IO_UNKNOWN;
	}

wait_for_cwrites:
	/*
	 * make sure all async writes that are part of this stream
	 * have completed before we proceed
	 */
	cluster_iostate_wait(&iostate, 0, "cluster_write_contig");

	if (iostate.io_error) {
		error = iostate.io_error;
	}

	lck_mtx_destroy(&iostate.io_mtxp, &cl_mtx_grp);

	/* now that the stream is quiet, write the deferred unaligned tail */
	if (error == 0 && tail_size) {
		error = cluster_align_phys_io(vp, uio, src_paddr, tail_size, 0, callback, callback_arg);
	}

	for (n = 0; n < num_upl; n++) {
		/*
		 * just release our hold on each physically contiguous
		 * region without changing any state
		 */
		ubc_upl_abort(upl[n], 0);
	}

	return error;
}
3209
3210
3211 /*
3212 * need to avoid a race between an msync of a range of pages dirtied via mmap
3213 * vs a filesystem such as HFS deciding to write a 'hole' to disk via cluster_write's
3214 * zerofill mechanism before it has seen the VNOP_PAGEOUTs for the pages being msync'd
3215 *
3216 * we should never force-zero-fill pages that are already valid in the cache...
3217 * the entire page contains valid data (either from disk, zero-filled or dirtied
3218 * via an mmap) so we can only do damage by trying to zero-fill
3219 *
3220 */
3221 static int
cluster_zero_range(upl_t upl,upl_page_info_t * pl,int flags,int io_offset,off_t zero_off,off_t upl_f_offset,int bytes_to_zero)3222 cluster_zero_range(upl_t upl, upl_page_info_t *pl, int flags, int io_offset, off_t zero_off, off_t upl_f_offset, int bytes_to_zero)
3223 {
3224 int zero_pg_index;
3225 boolean_t need_cluster_zero = TRUE;
3226
3227 if ((flags & (IO_NOZEROVALID | IO_NOZERODIRTY))) {
3228 bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off & PAGE_MASK_64));
3229 zero_pg_index = (int)((zero_off - upl_f_offset) / PAGE_SIZE_64);
3230
3231 if (upl_valid_page(pl, zero_pg_index)) {
3232 /*
3233 * never force zero valid pages - dirty or clean
3234 * we'll leave these in the UPL for cluster_write_copy to deal with
3235 */
3236 need_cluster_zero = FALSE;
3237 }
3238 }
3239 if (need_cluster_zero == TRUE) {
3240 cluster_zero(upl, io_offset, bytes_to_zero, NULL);
3241 }
3242
3243 return bytes_to_zero;
3244 }
3245
3246
3247 void
cluster_update_state(vnode_t vp,vm_object_offset_t s_offset,vm_object_offset_t e_offset,boolean_t vm_initiated)3248 cluster_update_state(vnode_t vp, vm_object_offset_t s_offset, vm_object_offset_t e_offset, boolean_t vm_initiated)
3249 {
3250 struct cl_extent cl;
3251 boolean_t first_pass = TRUE;
3252
3253 assert(s_offset < e_offset);
3254 assert((s_offset & PAGE_MASK_64) == 0);
3255 assert((e_offset & PAGE_MASK_64) == 0);
3256
3257 cl.b_addr = (daddr64_t)(s_offset / PAGE_SIZE_64);
3258 cl.e_addr = (daddr64_t)(e_offset / PAGE_SIZE_64);
3259
3260 cluster_update_state_internal(vp, &cl, 0, TRUE, &first_pass, s_offset, (int)(e_offset - s_offset),
3261 vp->v_un.vu_ubcinfo->ui_size, NULL, NULL, vm_initiated);
3262 }
3263
3264
/*
 * cluster_update_state_internal
 *
 * Fold the page extent 'cl' into the vnode's write-behind state so the dirty
 * pages can be pushed to disk later.  The extent is either merged into one of
 * the existing delayed-write clusters, recorded in the sparse cluster map, or
 * placed in a newly started cluster; when all cluster slots are occupied and
 * nothing can be merged or pushed, the code switches the vnode over to the
 * sparse cluster mechanism.  Takes and drops the writebehind lock internally.
 *
 * flags         -- IO_* flags from the write path (IO_NOCACHE / IO_PASSIVE
 *                  are recorded on a newly started cluster)
 * defer_writes  -- when TRUE, never push clusters from here; just record
 * first_pass    -- in/out: TRUE on the caller's first trip, used to update
 *                  the sequential-write accounting exactly once per request
 * write_off/write_cnt -- file offset and byte count of the caller's write,
 *                  used for the sequential-write (write-behind) heuristics
 * newEOF        -- file size to pass along to any push we trigger
 * vm_initiated  -- propagated to the push/add paths
 */
static void
cluster_update_state_internal(vnode_t vp, struct cl_extent *cl, int flags, boolean_t defer_writes,
    boolean_t *first_pass, off_t write_off, int write_cnt, off_t newEOF,
    int (*callback)(buf_t, void *), void *callback_arg, boolean_t vm_initiated)
{
	struct cl_writebehind *wbp;
	int cl_index;
	int ret_cluster_try_push;
	u_int max_cluster_pgcount;	/* per-cluster size limit, in pages */


	max_cluster_pgcount = MAX_CLUSTER_SIZE(vp) / PAGE_SIZE;

	/*
	 * take the lock to protect our accesses
	 * of the writebehind and sparse cluster state
	 */
	wbp = cluster_get_wbp(vp, CLW_ALLOCATE | CLW_RETURNLOCKED);

	if (wbp->cl_scmap) {
		if (!(flags & IO_NOCACHE)) {
			/*
			 * we've fallen into the sparse
			 * cluster method of delaying dirty pages
			 */
			sparse_cluster_add(wbp, &(wbp->cl_scmap), vp, cl, newEOF, callback, callback_arg, vm_initiated);

			lck_mtx_unlock(&wbp->cl_lockw);
			return;
		}
		/*
		 * must have done cached writes that fell into
		 * the sparse cluster mechanism... we've switched
		 * to uncached writes on the file, so go ahead
		 * and push whatever's in the sparse map
		 * and switch back to normal clustering
		 */
		wbp->cl_number = 0;

		sparse_cluster_push(wbp, &(wbp->cl_scmap), vp, newEOF, PUSH_ALL, 0, callback, callback_arg, vm_initiated);
		/*
		 * no clusters of either type present at this point
		 * so just go directly to start_new_cluster since
		 * we know we need to delay this I/O since we've
		 * already released the pages back into the cache
		 * to avoid the deadlock with sparse_cluster_push
		 */
		goto start_new_cluster;
	}
	if (*first_pass == TRUE) {
		/*
		 * update the sequential-write accounting exactly once per
		 * request: if this write picks up where the previous one
		 * ended, it extends the sequential run; otherwise the run
		 * restarts at this write
		 */
		if (write_off == wbp->cl_last_write) {
			wbp->cl_seq_written += write_cnt;
		} else {
			wbp->cl_seq_written = write_cnt;
		}

		wbp->cl_last_write = write_off + write_cnt;

		*first_pass = FALSE;
	}
	if (wbp->cl_number == 0) {
		/*
		 * no clusters currently present
		 */
		goto start_new_cluster;
	}

	for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
		/*
		 * check each cluster that we currently hold
		 * try to merge some or all of this write into
		 * one or more of the existing clusters... if
		 * any portion of the write remains, start a
		 * new cluster
		 */
		if (cl->b_addr >= wbp->cl_clusters[cl_index].b_addr) {
			/*
			 * the current write starts at or after the current cluster
			 */
			if (cl->e_addr <= (wbp->cl_clusters[cl_index].b_addr + max_cluster_pgcount)) {
				/*
				 * we have a write that fits entirely
				 * within the existing cluster limits
				 */
				if (cl->e_addr > wbp->cl_clusters[cl_index].e_addr) {
					/*
					 * update our idea of where the cluster ends
					 */
					wbp->cl_clusters[cl_index].e_addr = cl->e_addr;
				}
				break;
			}
			if (cl->b_addr < (wbp->cl_clusters[cl_index].b_addr + max_cluster_pgcount)) {
				/*
				 * we have a write that starts in the middle of the current cluster
				 * but extends beyond the cluster's limit... we know this because
				 * of the previous checks
				 * we'll extend the current cluster to the max
				 * and update the b_addr for the current write to reflect that
				 * the head of it was absorbed into this cluster...
				 * note that we'll always have a leftover tail in this case since
				 * full absorbtion would have occurred in the clause above
				 */
				wbp->cl_clusters[cl_index].e_addr = wbp->cl_clusters[cl_index].b_addr + max_cluster_pgcount;

				cl->b_addr = wbp->cl_clusters[cl_index].e_addr;
			}
			/*
			 * we come here for the case where the current write starts
			 * beyond the limit of the existing cluster or we have a leftover
			 * tail after a partial absorbtion
			 *
			 * in either case, we'll check the remaining clusters before
			 * starting a new one
			 */
		} else {
			/*
			 * the current write starts in front of the cluster we're currently considering
			 */
			if ((wbp->cl_clusters[cl_index].e_addr - cl->b_addr) <= max_cluster_pgcount) {
				/*
				 * we can just merge the new request into
				 * this cluster and leave it in the cache
				 * since the resulting cluster is still
				 * less than the maximum allowable size
				 */
				wbp->cl_clusters[cl_index].b_addr = cl->b_addr;

				if (cl->e_addr > wbp->cl_clusters[cl_index].e_addr) {
					/*
					 * the current write completely
					 * envelops the existing cluster and since
					 * each write is limited to at most max_cluster_pgcount pages
					 * we can just use the start and last blocknos of the write
					 * to generate the cluster limits
					 */
					wbp->cl_clusters[cl_index].e_addr = cl->e_addr;
				}
				break;
			}
			/*
			 * if we were to combine this write with the current cluster
			 * we would exceed the cluster size limit.... so,
			 * let's see if there's any overlap of the new I/O with
			 * the cluster we're currently considering... in fact, we'll
			 * stretch the cluster out to it's full limit and see if we
			 * get an intersection with the current write
			 *
			 */
			if (cl->e_addr > wbp->cl_clusters[cl_index].e_addr - max_cluster_pgcount) {
				/*
				 * the current write extends into the proposed cluster
				 * clip the length of the current write after first combining it's
				 * tail with the newly shaped cluster
				 */
				wbp->cl_clusters[cl_index].b_addr = wbp->cl_clusters[cl_index].e_addr - max_cluster_pgcount;

				cl->e_addr = wbp->cl_clusters[cl_index].b_addr;
			}
			/*
			 * if we get here, there was no way to merge
			 * any portion of this write with this cluster
			 * or we could only merge part of it which
			 * will leave a tail...
			 * we'll check the remaining clusters before starting a new one
			 */
		}
	}
	if (cl_index < wbp->cl_number) {
		/*
		 * we found an existing cluster(s) that we
		 * could entirely merge this I/O into
		 */
		goto delay_io;
	}

	/*
	 * write-behind heuristic: the file is being written heavily and
	 * sequentially (cl_seq_written has covered a full set of max-sized
	 * clusters) with all cluster slots in use, so proactively push some
	 * clusters now... 'n' comes from the mount's min-saturation byte
	 * count when configured, otherwise from the SSD/HDD defaults
	 */
	if (defer_writes == FALSE &&
	    wbp->cl_number == MAX_CLUSTERS &&
	    wbp->cl_seq_written >= (MAX_CLUSTERS * (max_cluster_pgcount * PAGE_SIZE))) {
		uint32_t n;

		if (vp->v_mount->mnt_minsaturationbytecount) {
			n = vp->v_mount->mnt_minsaturationbytecount / MAX_CLUSTER_SIZE(vp);

			if (n > MAX_CLUSTERS) {
				n = MAX_CLUSTERS;
			}
		} else {
			n = 0;
		}

		if (n == 0) {
			if (disk_conditioner_mount_is_ssd(vp->v_mount)) {
				n = WRITE_BEHIND_SSD;
			} else {
				n = WRITE_BEHIND;
			}
		}
		while (n--) {
			cluster_try_push(wbp, vp, newEOF, 0, 0, callback, callback_arg, NULL, vm_initiated);
		}
	}
	if (wbp->cl_number < MAX_CLUSTERS) {
		/*
		 * we didn't find an existing cluster to
		 * merge into, but there's room to start
		 * a new one
		 */
		goto start_new_cluster;
	}
	/*
	 * no exisitng cluster to merge with and no
	 * room to start a new one... we'll try
	 * pushing one of the existing ones... if none of
	 * them are able to be pushed, we'll switch
	 * to the sparse cluster mechanism
	 * cluster_try_push updates cl_number to the
	 * number of remaining clusters... and
	 * returns the number of currently unused clusters
	 */
	ret_cluster_try_push = 0;

	/*
	 * if writes are not deferred, call cluster push immediately
	 */
	if (defer_writes == FALSE) {
		ret_cluster_try_push = cluster_try_push(wbp, vp, newEOF, (flags & IO_NOCACHE) ? 0 : PUSH_DELAY, 0, callback, callback_arg, NULL, vm_initiated);
	}
	/*
	 * execute following regardless of writes being deferred or not
	 */
	if (ret_cluster_try_push == 0) {
		/*
		 * no more room in the normal cluster mechanism
		 * so let's switch to the more expansive but expensive
		 * sparse mechanism....
		 */
		sparse_cluster_switch(wbp, vp, newEOF, callback, callback_arg, vm_initiated);
		sparse_cluster_add(wbp, &(wbp->cl_scmap), vp, cl, newEOF, callback, callback_arg, vm_initiated);

		lck_mtx_unlock(&wbp->cl_lockw);
		return;
	}
start_new_cluster:
	/* claim the next free slot for this extent */
	wbp->cl_clusters[wbp->cl_number].b_addr = cl->b_addr;
	wbp->cl_clusters[wbp->cl_number].e_addr = cl->e_addr;

	wbp->cl_clusters[wbp->cl_number].io_flags = 0;

	if (flags & IO_NOCACHE) {
		wbp->cl_clusters[wbp->cl_number].io_flags |= CLW_IONOCACHE;
	}

	if (flags & IO_PASSIVE) {
		wbp->cl_clusters[wbp->cl_number].io_flags |= CLW_IOPASSIVE;
	}

	wbp->cl_number++;
delay_io:
	lck_mtx_unlock(&wbp->cl_lockw);
	return;
}
3527
3528
3529 static int
cluster_write_copy(vnode_t vp,struct uio * uio,u_int32_t io_req_size,off_t oldEOF,off_t newEOF,off_t headOff,off_t tailOff,int flags,int (* callback)(buf_t,void *),void * callback_arg)3530 cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t oldEOF, off_t newEOF, off_t headOff,
3531 off_t tailOff, int flags, int (*callback)(buf_t, void *), void *callback_arg)
3532 {
3533 upl_page_info_t *pl;
3534 upl_t upl;
3535 vm_offset_t upl_offset = 0;
3536 vm_size_t upl_size;
3537 off_t upl_f_offset;
3538 int pages_in_upl;
3539 int start_offset;
3540 int xfer_resid;
3541 int io_size;
3542 int io_offset;
3543 int bytes_to_zero;
3544 int bytes_to_move;
3545 kern_return_t kret;
3546 int retval = 0;
3547 int io_resid;
3548 long long total_size;
3549 long long zero_cnt;
3550 off_t zero_off;
3551 long long zero_cnt1;
3552 off_t zero_off1;
3553 off_t write_off = 0;
3554 int write_cnt = 0;
3555 boolean_t first_pass = FALSE;
3556 struct cl_extent cl;
3557 int bflag;
3558 u_int max_io_size;
3559
3560 if (uio) {
3561 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
3562 (int)uio->uio_offset, io_req_size, (int)oldEOF, (int)newEOF, 0);
3563
3564 io_resid = io_req_size;
3565 } else {
3566 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
3567 0, 0, (int)oldEOF, (int)newEOF, 0);
3568
3569 io_resid = 0;
3570 }
3571 if (flags & IO_PASSIVE) {
3572 bflag = CL_PASSIVE;
3573 } else {
3574 bflag = 0;
3575 }
3576 if (flags & IO_NOCACHE) {
3577 bflag |= CL_NOCACHE;
3578 }
3579
3580 if (flags & IO_SKIP_ENCRYPTION) {
3581 bflag |= CL_ENCRYPTED;
3582 }
3583
3584 zero_cnt = 0;
3585 zero_cnt1 = 0;
3586 zero_off = 0;
3587 zero_off1 = 0;
3588
3589 max_io_size = cluster_max_io_size(vp->v_mount, CL_WRITE);
3590
3591 if (flags & IO_HEADZEROFILL) {
3592 /*
3593 * some filesystems (HFS is one) don't support unallocated holes within a file...
3594 * so we zero fill the intervening space between the old EOF and the offset
3595 * where the next chunk of real data begins.... ftruncate will also use this
3596 * routine to zero fill to the new EOF when growing a file... in this case, the
3597 * uio structure will not be provided
3598 */
3599 if (uio) {
3600 if (headOff < uio->uio_offset) {
3601 zero_cnt = uio->uio_offset - headOff;
3602 zero_off = headOff;
3603 }
3604 } else if (headOff < newEOF) {
3605 zero_cnt = newEOF - headOff;
3606 zero_off = headOff;
3607 }
3608 } else {
3609 if (uio && uio->uio_offset > oldEOF) {
3610 zero_off = uio->uio_offset & ~PAGE_MASK_64;
3611
3612 if (zero_off >= oldEOF) {
3613 zero_cnt = uio->uio_offset - zero_off;
3614
3615 flags |= IO_HEADZEROFILL;
3616 }
3617 }
3618 }
3619 if (flags & IO_TAILZEROFILL) {
3620 if (uio) {
3621 zero_off1 = uio->uio_offset + io_req_size;
3622
3623 if (zero_off1 < tailOff) {
3624 zero_cnt1 = tailOff - zero_off1;
3625 }
3626 }
3627 } else {
3628 if (uio && newEOF > oldEOF) {
3629 zero_off1 = uio->uio_offset + io_req_size;
3630
3631 if (zero_off1 == newEOF && (zero_off1 & PAGE_MASK_64)) {
3632 zero_cnt1 = PAGE_SIZE_64 - (zero_off1 & PAGE_MASK_64);
3633
3634 flags |= IO_TAILZEROFILL;
3635 }
3636 }
3637 }
3638 if (zero_cnt == 0 && uio == (struct uio *) 0) {
3639 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END,
3640 retval, 0, 0, 0, 0);
3641 return 0;
3642 }
3643 if (uio) {
3644 write_off = uio->uio_offset;
3645 write_cnt = (int)uio_resid(uio);
3646 /*
3647 * delay updating the sequential write info
3648 * in the control block until we've obtained
3649 * the lock for it
3650 */
3651 first_pass = TRUE;
3652 }
3653 while ((total_size = (io_resid + zero_cnt + zero_cnt1)) && retval == 0) {
3654 /*
3655 * for this iteration of the loop, figure out where our starting point is
3656 */
3657 if (zero_cnt) {
3658 start_offset = (int)(zero_off & PAGE_MASK_64);
3659 upl_f_offset = zero_off - start_offset;
3660 } else if (io_resid) {
3661 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
3662 upl_f_offset = uio->uio_offset - start_offset;
3663 } else {
3664 start_offset = (int)(zero_off1 & PAGE_MASK_64);
3665 upl_f_offset = zero_off1 - start_offset;
3666 }
3667 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 46)) | DBG_FUNC_NONE,
3668 (int)zero_off, (int)zero_cnt, (int)zero_off1, (int)zero_cnt1, 0);
3669
3670 if (total_size > max_io_size) {
3671 total_size = max_io_size;
3672 }
3673
3674 cl.b_addr = (daddr64_t)(upl_f_offset / PAGE_SIZE_64);
3675
3676 if (uio && ((flags & (IO_SYNC | IO_HEADZEROFILL | IO_TAILZEROFILL)) == 0)) {
3677 /*
3678 * assumption... total_size <= io_resid
3679 * because IO_HEADZEROFILL and IO_TAILZEROFILL not set
3680 */
3681 if ((start_offset + total_size) > max_io_size) {
3682 total_size = max_io_size - start_offset;
3683 }
3684 xfer_resid = (int)total_size;
3685
3686 retval = cluster_copy_ubc_data_internal(vp, uio, &xfer_resid, 1, 1);
3687
3688 if (retval) {
3689 break;
3690 }
3691
3692 io_resid -= (total_size - xfer_resid);
3693 total_size = xfer_resid;
3694 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
3695 upl_f_offset = uio->uio_offset - start_offset;
3696
3697 if (total_size == 0) {
3698 if (start_offset) {
3699 /*
3700 * the write did not finish on a page boundary
3701 * which will leave upl_f_offset pointing to the
3702 * beginning of the last page written instead of
3703 * the page beyond it... bump it in this case
3704 * so that the cluster code records the last page
3705 * written as dirty
3706 */
3707 upl_f_offset += PAGE_SIZE_64;
3708 }
3709 upl_size = 0;
3710
3711 goto check_cluster;
3712 }
3713 }
3714 /*
3715 * compute the size of the upl needed to encompass
3716 * the requested write... limit each call to cluster_io
3717 * to the maximum UPL size... cluster_io will clip if
3718 * this exceeds the maximum io_size for the device,
3719 * make sure to account for
3720 * a starting offset that's not page aligned
3721 */
3722 upl_size = (start_offset + total_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
3723
3724 if (upl_size > max_io_size) {
3725 upl_size = max_io_size;
3726 }
3727
3728 pages_in_upl = (int)(upl_size / PAGE_SIZE);
3729 io_size = (int)(upl_size - start_offset);
3730
3731 if ((long long)io_size > total_size) {
3732 io_size = (int)total_size;
3733 }
3734
3735 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_START, upl_size, io_size, total_size, 0, 0);
3736
3737
3738 /*
3739 * Gather the pages from the buffer cache.
3740 * The UPL_WILL_MODIFY flag lets the UPL subsystem know
3741 * that we intend to modify these pages.
3742 */
3743 kret = ubc_create_upl_kernel(vp,
3744 upl_f_offset,
3745 (int)upl_size,
3746 &upl,
3747 &pl,
3748 UPL_SET_LITE | ((uio != NULL && (uio->uio_flags & UIO_FLAGS_IS_COMPRESSED_FILE)) ? 0 : UPL_WILL_MODIFY),
3749 VM_KERN_MEMORY_FILE);
3750 if (kret != KERN_SUCCESS) {
3751 panic("cluster_write_copy: failed to get pagelist");
3752 }
3753
3754 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_END,
3755 upl, (int)upl_f_offset, start_offset, 0, 0);
3756
3757 if (start_offset && upl_f_offset < oldEOF && !upl_valid_page(pl, 0)) {
3758 int read_size;
3759
3760 /*
3761 * we're starting in the middle of the first page of the upl
3762 * and the page isn't currently valid, so we're going to have
3763 * to read it in first... this is a synchronous operation
3764 */
3765 read_size = PAGE_SIZE;
3766
3767 if ((upl_f_offset + read_size) > oldEOF) {
3768 read_size = (int)(oldEOF - upl_f_offset);
3769 }
3770
3771 retval = cluster_io(vp, upl, 0, upl_f_offset, read_size,
3772 CL_READ | bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
3773 if (retval) {
3774 /*
3775 * we had an error during the read which causes us to abort
3776 * the current cluster_write request... before we do, we need
3777 * to release the rest of the pages in the upl without modifying
3778 * there state and mark the failed page in error
3779 */
3780 ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
3781
3782 if (upl_size > PAGE_SIZE) {
3783 ubc_upl_abort_range(upl, 0, (upl_size_t)upl_size,
3784 UPL_ABORT_FREE_ON_EMPTY);
3785 }
3786
3787 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
3788 upl, 0, 0, retval, 0);
3789 break;
3790 }
3791 }
3792 if ((start_offset == 0 || upl_size > PAGE_SIZE) && ((start_offset + io_size) & PAGE_MASK)) {
3793 /*
3794 * the last offset we're writing to in this upl does not end on a page
3795 * boundary... if it's not beyond the old EOF, then we'll also need to
3796 * pre-read this page in if it isn't already valid
3797 */
3798 upl_offset = upl_size - PAGE_SIZE;
3799
3800 if ((upl_f_offset + start_offset + io_size) < oldEOF &&
3801 !upl_valid_page(pl, (int)(upl_offset / PAGE_SIZE))) {
3802 int read_size;
3803
3804 read_size = PAGE_SIZE;
3805
3806 if ((off_t)(upl_f_offset + upl_offset + read_size) > oldEOF) {
3807 read_size = (int)(oldEOF - (upl_f_offset + upl_offset));
3808 }
3809
3810 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, read_size,
3811 CL_READ | bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
3812 if (retval) {
3813 /*
3814 * we had an error during the read which causes us to abort
3815 * the current cluster_write request... before we do, we
3816 * need to release the rest of the pages in the upl without
3817 * modifying there state and mark the failed page in error
3818 */
3819 ubc_upl_abort_range(upl, (upl_offset_t)upl_offset, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
3820
3821 if (upl_size > PAGE_SIZE) {
3822 ubc_upl_abort_range(upl, 0, (upl_size_t)upl_size, UPL_ABORT_FREE_ON_EMPTY);
3823 }
3824
3825 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
3826 upl, 0, 0, retval, 0);
3827 break;
3828 }
3829 }
3830 }
3831 xfer_resid = io_size;
3832 io_offset = start_offset;
3833
3834 while (zero_cnt && xfer_resid) {
3835 if (zero_cnt < (long long)xfer_resid) {
3836 bytes_to_zero = (int)zero_cnt;
3837 } else {
3838 bytes_to_zero = xfer_resid;
3839 }
3840
3841 bytes_to_zero = cluster_zero_range(upl, pl, flags, io_offset, zero_off, upl_f_offset, bytes_to_zero);
3842
3843 xfer_resid -= bytes_to_zero;
3844 zero_cnt -= bytes_to_zero;
3845 zero_off += bytes_to_zero;
3846 io_offset += bytes_to_zero;
3847 }
3848 if (xfer_resid && io_resid) {
3849 u_int32_t io_requested;
3850
3851 bytes_to_move = min(io_resid, xfer_resid);
3852 io_requested = bytes_to_move;
3853
3854 retval = cluster_copy_upl_data(uio, upl, io_offset, (int *)&io_requested);
3855
3856 if (retval) {
3857 ubc_upl_abort_range(upl, 0, (upl_size_t)upl_size, UPL_ABORT_FREE_ON_EMPTY);
3858
3859 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
3860 upl, 0, 0, retval, 0);
3861 } else {
3862 io_resid -= bytes_to_move;
3863 xfer_resid -= bytes_to_move;
3864 io_offset += bytes_to_move;
3865 }
3866 }
3867 while (xfer_resid && zero_cnt1 && retval == 0) {
3868 if (zero_cnt1 < (long long)xfer_resid) {
3869 bytes_to_zero = (int)zero_cnt1;
3870 } else {
3871 bytes_to_zero = xfer_resid;
3872 }
3873
3874 bytes_to_zero = cluster_zero_range(upl, pl, flags, io_offset, zero_off1, upl_f_offset, bytes_to_zero);
3875
3876 xfer_resid -= bytes_to_zero;
3877 zero_cnt1 -= bytes_to_zero;
3878 zero_off1 += bytes_to_zero;
3879 io_offset += bytes_to_zero;
3880 }
3881 if (retval == 0) {
3882 int do_zeroing = 1;
3883
3884 io_size += start_offset;
3885
3886 /* Force more restrictive zeroing behavior only on APFS */
3887 if ((vnode_tag(vp) == VT_APFS) && (newEOF < oldEOF)) {
3888 do_zeroing = 0;
3889 }
3890
3891 if (do_zeroing && (upl_f_offset + io_size) >= newEOF && (u_int)io_size < upl_size) {
3892 /*
3893 * if we're extending the file with this write
3894 * we'll zero fill the rest of the page so that
3895 * if the file gets extended again in such a way as to leave a
3896 * hole starting at this EOF, we'll have zero's in the correct spot
3897 */
3898 cluster_zero(upl, io_size, (int)(upl_size - io_size), NULL);
3899 }
3900 /*
3901 * release the upl now if we hold one since...
3902 * 1) pages in it may be present in the sparse cluster map
3903 * and may span 2 separate buckets there... if they do and
3904 * we happen to have to flush a bucket to make room and it intersects
3905 * this upl, a deadlock may result on page BUSY
3906 * 2) we're delaying the I/O... from this point forward we're just updating
3907 * the cluster state... no need to hold the pages, so commit them
3908 * 3) IO_SYNC is set...
3909 * because we had to ask for a UPL that provides currenty non-present pages, the
3910 * UPL has been automatically set to clear the dirty flags (both software and hardware)
3911 * upon committing it... this is not the behavior we want since it's possible for
3912 * pages currently present as part of a mapped file to be dirtied while the I/O is in flight.
3913 * we'll pick these pages back up later with the correct behavior specified.
3914 * 4) we don't want to hold pages busy in a UPL and then block on the cluster lock... if a flush
3915 * of this vnode is in progress, we will deadlock if the pages being flushed intersect the pages
3916 * we hold since the flushing context is holding the cluster lock.
3917 */
3918 ubc_upl_commit_range(upl, 0, (upl_size_t)upl_size,
3919 UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
3920 check_cluster:
3921 /*
3922 * calculate the last logical block number
3923 * that this delayed I/O encompassed
3924 */
3925 cl.e_addr = (daddr64_t)((upl_f_offset + (off_t)upl_size) / PAGE_SIZE_64);
3926
3927 if (flags & IO_SYNC) {
3928 /*
3929 * if the IO_SYNC flag is set than we need to bypass
3930 * any clustering and immediately issue the I/O
3931 *
3932 * we don't hold the lock at this point
3933 *
3934 * we've already dropped the current upl, so pick it back up with COPYOUT_FROM set
3935 * so that we correctly deal with a change in state of the hardware modify bit...
3936 * we do this via cluster_push_now... by passing along the IO_SYNC flag, we force
3937 * cluster_push_now to wait until all the I/Os have completed... cluster_push_now is also
3938 * responsible for generating the correct sized I/O(s)
3939 */
3940 retval = cluster_push_now(vp, &cl, newEOF, flags, callback, callback_arg, FALSE);
3941 } else {
3942 boolean_t defer_writes = FALSE;
3943
3944 if (vfs_flags(vp->v_mount) & MNT_DEFWRITE) {
3945 defer_writes = TRUE;
3946 }
3947
3948 cluster_update_state_internal(vp, &cl, flags, defer_writes, &first_pass,
3949 write_off, write_cnt, newEOF, callback, callback_arg, FALSE);
3950 }
3951 }
3952 }
3953 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END, retval, 0, io_resid, 0, 0);
3954
3955 return retval;
3956 }
3957
3958
3959
3960 int
cluster_read(vnode_t vp,struct uio * uio,off_t filesize,int xflags)3961 cluster_read(vnode_t vp, struct uio *uio, off_t filesize, int xflags)
3962 {
3963 return cluster_read_ext(vp, uio, filesize, xflags, NULL, NULL);
3964 }
3965
3966
3967 int
cluster_read_ext(vnode_t vp,struct uio * uio,off_t filesize,int xflags,int (* callback)(buf_t,void *),void * callback_arg)3968 cluster_read_ext(vnode_t vp, struct uio *uio, off_t filesize, int xflags, int (*callback)(buf_t, void *), void *callback_arg)
3969 {
3970 int retval = 0;
3971 int flags;
3972 user_ssize_t cur_resid;
3973 u_int32_t io_size;
3974 u_int32_t read_length = 0;
3975 int read_type = IO_COPY;
3976
3977 flags = xflags;
3978
3979 if (vp->v_flag & VNOCACHE_DATA) {
3980 flags |= IO_NOCACHE;
3981 }
3982 if ((vp->v_flag & VRAOFF) || speculative_reads_disabled) {
3983 flags |= IO_RAOFF;
3984 }
3985
3986 if (flags & IO_SKIP_ENCRYPTION) {
3987 flags |= IO_ENCRYPTED;
3988 }
3989
3990 /*
3991 * do a read through the cache if one of the following is true....
3992 * NOCACHE is not true
3993 * the uio request doesn't target USERSPACE
3994 * Alternatively, if IO_ENCRYPTED is set, then we want to bypass the cache as well.
3995 * Reading encrypted data from a CP filesystem should never result in the data touching
3996 * the UBC.
3997 *
3998 * otherwise, find out if we want the direct or contig variant for
3999 * the first vector in the uio request
4000 */
4001 if (((flags & IO_NOCACHE) && UIO_SEG_IS_USER_SPACE(uio->uio_segflg)) || (flags & IO_ENCRYPTED)) {
4002 retval = cluster_io_type(uio, &read_type, &read_length, 0);
4003 }
4004
4005 while ((cur_resid = uio_resid(uio)) && uio->uio_offset < filesize && retval == 0) {
4006 switch (read_type) {
4007 case IO_COPY:
4008 /*
4009 * make sure the uio_resid isn't too big...
4010 * internally, we want to handle all of the I/O in
4011 * chunk sizes that fit in a 32 bit int
4012 */
4013 if (cur_resid > (user_ssize_t)(MAX_IO_REQUEST_SIZE)) {
4014 io_size = MAX_IO_REQUEST_SIZE;
4015 } else {
4016 io_size = (u_int32_t)cur_resid;
4017 }
4018
4019 retval = cluster_read_copy(vp, uio, io_size, filesize, flags, callback, callback_arg);
4020 break;
4021
4022 case IO_DIRECT:
4023 retval = cluster_read_direct(vp, uio, filesize, &read_type, &read_length, flags, callback, callback_arg);
4024 break;
4025
4026 case IO_CONTIG:
4027 retval = cluster_read_contig(vp, uio, filesize, &read_type, &read_length, callback, callback_arg, flags);
4028 break;
4029
4030 case IO_UNKNOWN:
4031 retval = cluster_io_type(uio, &read_type, &read_length, 0);
4032 break;
4033 }
4034 }
4035 return retval;
4036 }
4037
4038
4039
4040 static void
cluster_read_upl_release(upl_t upl,int start_pg,int last_pg,int take_reference)4041 cluster_read_upl_release(upl_t upl, int start_pg, int last_pg, int take_reference)
4042 {
4043 int range;
4044 int abort_flags = UPL_ABORT_FREE_ON_EMPTY;
4045
4046 if ((range = last_pg - start_pg)) {
4047 if (take_reference) {
4048 abort_flags |= UPL_ABORT_REFERENCE;
4049 }
4050
4051 ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, range * PAGE_SIZE, abort_flags);
4052 }
4053 }
4054
4055
/*
 * cluster_read_copy
 *
 * Cached read path: satisfy up to 'io_req_size' bytes of the uio from the
 * UBC, creating UPLs and issuing asynchronous cluster I/O for any pages not
 * already resident, while driving the read-ahead/prefetch engine so the I/O
 * pipeline stays ahead of the copy-out.
 *
 * Parameters:
 *   vp            vnode being read
 *   uio           destination; uio_offset/uio_resid are advanced as data
 *                 is copied out
 *   io_req_size   maximum number of bytes to transfer on this call
 *   filesize      current EOF; all reads are clipped to this
 *   flags         IO_* flags (IO_NOCACHE, IO_RAOFF, IO_PASSIVE, ...)
 *   callback      optional per-buffer completion callback
 *   callback_arg  argument passed through to 'callback'
 *
 * Returns 0 on success, EAGAIN when IO_RETURN_ON_THROTTLE applies and the
 * thread should throttle, or an I/O error reported via the iostate.
 */
static int
cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t filesize, int flags, int (*callback)(buf_t, void *), void *callback_arg)
{
	upl_page_info_t *pl;
	upl_t upl;
	vm_offset_t upl_offset;
	u_int32_t upl_size;
	off_t upl_f_offset;
	int start_offset;
	int start_pg;
	int last_pg;
	int uio_last = 0;
	int pages_in_upl;
	off_t max_size;
	off_t last_ioread_offset;       /* high-water mark of issued I/O */
	off_t last_request_offset;      /* end of this request, clipped to EOF */
	kern_return_t kret;
	int error = 0;
	int retval = 0;
	u_int32_t size_of_prefetch;
	u_int32_t xsize;
	u_int32_t io_size;
	u_int32_t max_rd_size;
	u_int32_t max_io_size;
	u_int32_t max_prefetch;
	u_int rd_ahead_enabled = 1;
	u_int prefetch_enabled = 1;
	struct cl_readahead * rap;
	struct clios iostate;
	struct cl_extent extent;
	int bflag;
	int take_reference = 1;
	int policy = IOPOL_DEFAULT;
	boolean_t iolock_inited = FALSE;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_START,
	    (int)uio->uio_offset, io_req_size, (int)filesize, flags, 0);

	/* raw encrypted reads must never go through the UBC - see cluster_read_ext */
	if (flags & IO_ENCRYPTED) {
		panic("encrypted blocks will hit UBC!");
	}

	policy = throttle_get_io_policy(NULL);

	/*
	 * low-priority tiers and NOCACHE reads shouldn't keep pages warm,
	 * so don't take references on the pages we release
	 */
	if (policy == THROTTLE_LEVEL_TIER3 || policy == THROTTLE_LEVEL_TIER2 || (flags & IO_NOCACHE)) {
		take_reference = 0;
	}

	if (flags & IO_PASSIVE) {
		bflag = CL_PASSIVE;
	} else {
		bflag = 0;
	}

	if (flags & IO_NOCACHE) {
		bflag |= CL_NOCACHE;
	}

	if (flags & IO_SKIP_ENCRYPTION) {
		bflag |= CL_ENCRYPTED;
	}

	max_io_size = cluster_max_io_size(vp->v_mount, CL_READ);
	max_prefetch = cluster_max_prefetch(vp, max_io_size, prefetch_max);
	max_rd_size = max_prefetch;

	last_request_offset = uio->uio_offset + io_req_size;

	if (last_request_offset > filesize) {
		last_request_offset = filesize;
	}

	/*
	 * no read-ahead when it's explicitly disabled, for NOCACHE reads, or
	 * when the whole request lies within a single page
	 */
	if ((flags & (IO_RAOFF | IO_NOCACHE)) || ((last_request_offset & ~PAGE_MASK_64) == (uio->uio_offset & ~PAGE_MASK_64))) {
		rd_ahead_enabled = 0;
		rap = NULL;
	} else {
		if (cluster_is_throttled(vp)) {
			/*
			 * we're in the throttle window, at the very least
			 * we want to limit the size of the I/O we're about
			 * to issue
			 */
			rd_ahead_enabled = 0;
			prefetch_enabled = 0;

			max_rd_size = calculate_max_throttle_size(vp);
		}
		if ((rap = cluster_get_rap(vp)) == NULL) {
			rd_ahead_enabled = 0;
		} else {
			/* page-aligned extent of this request, used by the read-ahead engine */
			extent.b_addr = uio->uio_offset / PAGE_SIZE_64;
			extent.e_addr = (last_request_offset - 1) / PAGE_SIZE_64;
		}
	}
	if (rap != NULL && rap->cl_ralen && (rap->cl_lastr == extent.b_addr || (rap->cl_lastr + 1) == extent.b_addr)) {
		/*
		 * determine if we already have a read-ahead in the pipe courtesy of the
		 * last read systemcall that was issued...
		 * if so, pick up its extent to determine where we should start
		 * with respect to any read-ahead that might be necessary to
		 * garner all the data needed to complete this read systemcall
		 */
		last_ioread_offset = (rap->cl_maxra * PAGE_SIZE_64) + PAGE_SIZE_64;

		if (last_ioread_offset < uio->uio_offset) {
			last_ioread_offset = (off_t)0;
		} else if (last_ioread_offset > last_request_offset) {
			last_ioread_offset = last_request_offset;
		}
	} else {
		last_ioread_offset = (off_t)0;
	}

	while (io_req_size && uio->uio_offset < filesize && retval == 0) {
		max_size = filesize - uio->uio_offset;
		bool leftover_upl_aborted = false;

		if ((off_t)(io_req_size) < max_size) {
			io_size = io_req_size;
		} else {
			io_size = (u_int32_t)max_size;
		}

		if (!(flags & IO_NOCACHE)) {
			/*
			 * streaming mode: copy directly out of resident UBC
			 * pages until we hit a page that isn't cached
			 */
			while (io_size) {
				u_int32_t io_resid;
				u_int32_t io_requested;

				/*
				 * if we keep finding the pages we need already in the cache, then
				 * don't bother to call cluster_read_prefetch since it costs CPU cycles
				 * to determine that we have all the pages we need... once we miss in
				 * the cache and have issued an I/O, then we'll assume that we're likely
				 * to continue to miss in the cache and it's to our advantage to try and prefetch
				 */
				if (last_request_offset && last_ioread_offset && (size_of_prefetch = (u_int32_t)(last_request_offset - last_ioread_offset))) {
					if ((last_ioread_offset - uio->uio_offset) <= max_rd_size && prefetch_enabled) {
						/*
						 * we've already issued I/O for this request and
						 * there's still work to do and
						 * our prefetch stream is running dry, so issue a
						 * pre-fetch I/O... the I/O latency will overlap
						 * with the copying of the data
						 */
						if (size_of_prefetch > max_rd_size) {
							size_of_prefetch = max_rd_size;
						}

						size_of_prefetch = cluster_read_prefetch(vp, last_ioread_offset, size_of_prefetch, filesize, callback, callback_arg, bflag);

						last_ioread_offset += (off_t)(size_of_prefetch * PAGE_SIZE);

						if (last_ioread_offset > last_request_offset) {
							last_ioread_offset = last_request_offset;
						}
					}
				}
				/*
				 * limit the size of the copy we're about to do so that
				 * we can notice that our I/O pipe is running dry and
				 * get the next I/O issued before it does go dry
				 */
				if (last_ioread_offset && io_size > (max_io_size / 4)) {
					io_resid = (max_io_size / 4);
				} else {
					io_resid = io_size;
				}

				io_requested = io_resid;

				retval = cluster_copy_ubc_data_internal(vp, uio, (int *)&io_resid, 0, take_reference);

				/* bytes actually copied this pass = requested - residual */
				xsize = io_requested - io_resid;

				io_size -= xsize;
				io_req_size -= xsize;

				if (retval || io_resid) {
					/*
					 * if we run into a real error or
					 * a page that is not in the cache
					 * we need to leave streaming mode
					 */
					break;
				}

				if (rd_ahead_enabled && (io_size == 0 || last_ioread_offset == last_request_offset)) {
					/*
					 * we're already finished the I/O for this read request
					 * let's see if we should do a read-ahead
					 */
					cluster_read_ahead(vp, &extent, filesize, rap, callback, callback_arg, bflag);
				}
			}
			if (retval) {
				break;
			}
			if (io_size == 0) {
				/* whole request satisfied from cache - update read-ahead bookkeeping */
				if (rap != NULL) {
					if (extent.e_addr < rap->cl_lastr) {
						rap->cl_maxra = 0;
					}
					rap->cl_lastr = extent.e_addr;
				}
				break;
			}
			/*
			 * recompute max_size since cluster_copy_ubc_data_internal
			 * may have advanced uio->uio_offset
			 */
			max_size = filesize - uio->uio_offset;
		}

		iostate.io_completed = 0;
		iostate.io_issued = 0;
		iostate.io_error = 0;
		iostate.io_wanted = 0;

		if ((flags & IO_RETURN_ON_THROTTLE)) {
			if (cluster_is_throttled(vp) == THROTTLE_NOW) {
				if (!cluster_io_present_in_BC(vp, uio->uio_offset)) {
					/*
					 * we're in the throttle window and at least 1 I/O
					 * has already been issued by a throttleable thread
					 * in this window, so return with EAGAIN to indicate
					 * to the FS issuing the cluster_read call that it
					 * should now throttle after dropping any locks
					 */
					throttle_info_update_by_mount(vp->v_mount);

					retval = EAGAIN;
					break;
				}
			}
		}

		/*
		 * compute the size of the upl needed to encompass
		 * the requested read... limit each call to cluster_io
		 * to the maximum UPL size... cluster_io will clip if
		 * this exceeds the maximum io_size for the device,
		 * make sure to account for
		 * a starting offset that's not page aligned
		 */
		start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
		upl_f_offset = uio->uio_offset - (off_t)start_offset;

		if (io_size > max_rd_size) {
			io_size = max_rd_size;
		}

		upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;

		if (flags & IO_NOCACHE) {
			if (upl_size > max_io_size) {
				upl_size = max_io_size;
			}
		} else {
			/* cached reads use smaller UPLs to keep the pipeline flowing */
			if (upl_size > max_io_size / 4) {
				upl_size = max_io_size / 4;
				upl_size &= ~PAGE_MASK;

				if (upl_size == 0) {
					upl_size = PAGE_SIZE;
				}
			}
		}
		pages_in_upl = upl_size / PAGE_SIZE;

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_START,
		    upl, (int)upl_f_offset, upl_size, start_offset, 0);

		kret = ubc_create_upl_kernel(vp,
		    upl_f_offset,
		    upl_size,
		    &upl,
		    &pl,
		    UPL_FILE_IO | UPL_SET_LITE,
		    VM_KERN_MEMORY_FILE);
		if (kret != KERN_SUCCESS) {
			panic("cluster_read_copy: failed to get pagelist");
		}

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_END,
		    upl, (int)upl_f_offset, upl_size, start_offset, 0);

		/*
		 * scan from the beginning of the upl looking for the first
		 * non-valid page.... this will become the first page in
		 * the request we're going to make to 'cluster_io'... if all
		 * of the pages are valid, we won't call through to 'cluster_io'
		 */
		for (start_pg = 0; start_pg < pages_in_upl; start_pg++) {
			if (!upl_valid_page(pl, start_pg)) {
				break;
			}
		}

		/*
		 * scan from the starting invalid page looking for a valid
		 * page before the end of the upl is reached, if we
		 * find one, then it will be the last page of the request to
		 * 'cluster_io'
		 */
		for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
			if (upl_valid_page(pl, last_pg)) {
				break;
			}
		}

		if (start_pg < last_pg) {
			/*
			 * we found a range of 'invalid' pages that must be filled
			 * if the last page in this range is the last page of the file
			 * we may have to clip the size of it to keep from reading past
			 * the end of the last physical block associated with the file
			 */
			if (iolock_inited == FALSE) {
				lck_mtx_init(&iostate.io_mtxp, &cl_mtx_grp, LCK_ATTR_NULL);

				iolock_inited = TRUE;
			}
			upl_offset = start_pg * PAGE_SIZE;
			io_size = (last_pg - start_pg) * PAGE_SIZE;

			if ((off_t)(upl_f_offset + upl_offset + io_size) > filesize) {
				io_size = (u_int32_t)(filesize - (upl_f_offset + upl_offset));
			}

			/*
			 * Find out if this needs verification, we'll have to manage the UPL
			 * differently if so. Note that this call only lets us know if
			 * verification is enabled on this mount point, the actual verification
			 * is performed in the File system.
			 *
			 * NOTE(review): the '&& verify_block_size' term is commented out
			 * below, so this path is taken whenever VNOP_VERIFY succeeds,
			 * regardless of the reported block size — confirm this is the
			 * intended behavior and not a leftover from debugging.
			 */
			size_t verify_block_size = 0;
			if ((VNOP_VERIFY(vp, start_offset, NULL, 0, &verify_block_size, NULL, VNODE_VERIFY_DEFAULT, NULL) == 0) /* && verify_block_size */) {
				for (uio_last = last_pg; uio_last < pages_in_upl; uio_last++) {
					if (!upl_valid_page(pl, uio_last)) {
						break;
					}
				}
				if (uio_last < pages_in_upl) {
					/*
					 * there were some invalid pages beyond the valid pages
					 * that we didn't issue an I/O for, just release them
					 * unchanged now, so that any prefetch/readahead can
					 * include them
					 */
					ubc_upl_abort_range(upl, uio_last * PAGE_SIZE,
					    (pages_in_upl - uio_last) * PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
					leftover_upl_aborted = true;
				}
			}

			/*
			 * issue an asynchronous read to cluster_io
			 */

			error = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset,
			    io_size, CL_READ | CL_ASYNC | bflag, (buf_t)NULL, &iostate, callback, callback_arg);

			if (rap) {
				if (extent.e_addr < rap->cl_maxra) {
					/*
					 * we've just issued a read for a block that should have been
					 * in the cache courtesy of the read-ahead engine... something
					 * has gone wrong with the pipeline, so reset the read-ahead
					 * logic which will cause us to restart from scratch
					 */
					rap->cl_maxra = 0;
				}
			}
		}
		if (error == 0) {
			/*
			 * if the read completed successfully, or there was no I/O request
			 * issued, then copy the data into user land via 'cluster_upl_copy_data'
			 * we'll first add on any 'valid'
			 * pages that were present in the upl when we acquired it.
			 */
			u_int val_size;

			if (!leftover_upl_aborted) {
				for (uio_last = last_pg; uio_last < pages_in_upl; uio_last++) {
					if (!upl_valid_page(pl, uio_last)) {
						break;
					}
				}
				if (uio_last < pages_in_upl) {
					/*
					 * there were some invalid pages beyond the valid pages
					 * that we didn't issue an I/O for, just release them
					 * unchanged now, so that any prefetch/readahead can
					 * include them
					 */
					ubc_upl_abort_range(upl, uio_last * PAGE_SIZE,
					    (pages_in_upl - uio_last) * PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
				}
			}

			/*
			 * compute size to transfer this round, if io_req_size is
			 * still non-zero after this attempt, we'll loop around and
			 * set up for another I/O.
			 */
			val_size = (uio_last * PAGE_SIZE) - start_offset;

			if (val_size > max_size) {
				val_size = (u_int)max_size;
			}

			if (val_size > io_req_size) {
				val_size = io_req_size;
			}

			if ((uio->uio_offset + val_size) > last_ioread_offset) {
				last_ioread_offset = uio->uio_offset + val_size;
			}

			if ((size_of_prefetch = (u_int32_t)(last_request_offset - last_ioread_offset)) && prefetch_enabled) {
				if ((last_ioread_offset - (uio->uio_offset + val_size)) <= upl_size) {
					/*
					 * if there's still I/O left to do for this request, and...
					 * we're not in hard throttle mode, and...
					 * we're close to using up the previous prefetch, then issue a
					 * new pre-fetch I/O... the I/O latency will overlap
					 * with the copying of the data
					 */
					if (size_of_prefetch > max_rd_size) {
						size_of_prefetch = max_rd_size;
					}

					size_of_prefetch = cluster_read_prefetch(vp, last_ioread_offset, size_of_prefetch, filesize, callback, callback_arg, bflag);

					last_ioread_offset += (off_t)(size_of_prefetch * PAGE_SIZE);

					if (last_ioread_offset > last_request_offset) {
						last_ioread_offset = last_request_offset;
					}
				}
			} else if ((uio->uio_offset + val_size) == last_request_offset) {
				/*
				 * this transfer will finish this request, so...
				 * let's try to read ahead if we're in
				 * a sequential access pattern and we haven't
				 * explicitly disabled it
				 */
				if (rd_ahead_enabled) {
					cluster_read_ahead(vp, &extent, filesize, rap, callback, callback_arg, bflag);
				}

				if (rap != NULL) {
					if (extent.e_addr < rap->cl_lastr) {
						rap->cl_maxra = 0;
					}
					rap->cl_lastr = extent.e_addr;
				}
			}
			/* wait for the async read to land before copying the pages out */
			if (iolock_inited == TRUE) {
				cluster_iostate_wait(&iostate, 0, "cluster_read_copy");
			}

			if (iostate.io_error) {
				error = iostate.io_error;
			} else {
				u_int32_t io_requested;

				io_requested = val_size;

				retval = cluster_copy_upl_data(uio, upl, start_offset, (int *)&io_requested);

				io_req_size -= (val_size - io_requested);
			}
		} else {
			if (iolock_inited == TRUE) {
				cluster_iostate_wait(&iostate, 0, "cluster_read_copy");
			}
		}
		if (start_pg < last_pg) {
			/*
			 * compute the range of pages that we actually issued an I/O for
			 * and either commit them as valid if the I/O succeeded
			 * or abort them if the I/O failed or we're not supposed to
			 * keep them in the cache
			 */
			io_size = (last_pg - start_pg) * PAGE_SIZE;

			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START, upl, start_pg * PAGE_SIZE, io_size, error, 0);

			if (error || (flags & IO_NOCACHE)) {
				ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, io_size,
				    UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
			} else {
				int commit_flags = UPL_COMMIT_CLEAR_DIRTY | UPL_COMMIT_FREE_ON_EMPTY;

				if (take_reference) {
					commit_flags |= UPL_COMMIT_INACTIVATE;
				} else {
					commit_flags |= UPL_COMMIT_SPECULATE;
				}

				ubc_upl_commit_range(upl, start_pg * PAGE_SIZE, io_size, commit_flags);
			}
			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END, upl, start_pg * PAGE_SIZE, io_size, error, 0);
		}
		if ((last_pg - start_pg) < pages_in_upl) {
			/*
			 * the set of pages that we issued an I/O for did not encompass
			 * the entire upl... so just release these without modifying
			 * their state
			 */
			if (error) {
				if (leftover_upl_aborted) {
					/* the tail was already aborted above; release only up to uio_last */
					ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, (uio_last - start_pg) * PAGE_SIZE,
					    UPL_ABORT_FREE_ON_EMPTY);
				} else {
					ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
				}
			} else {
				KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START,
				    upl, -1, pages_in_upl - (last_pg - start_pg), 0, 0);

				/*
				 * handle any valid pages at the beginning of
				 * the upl... release these appropriately
				 */
				cluster_read_upl_release(upl, 0, start_pg, take_reference);

				/*
				 * handle any valid pages immediately after the
				 * pages we issued I/O for... ... release these appropriately
				 */
				cluster_read_upl_release(upl, last_pg, uio_last, take_reference);

				KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END, upl, -1, -1, 0, 0);
			}
		}
		if (retval == 0) {
			retval = error;
		}

		if (io_req_size) {
			/*
			 * more work remains... adjust throttle-dependent limits
			 * before the next pass
			 */
			uint32_t max_throttle_size = calculate_max_throttle_size(vp);

			if (cluster_is_throttled(vp)) {
				/*
				 * we're in the throttle window, at the very least
				 * we want to limit the size of the I/O we're about
				 * to issue
				 */
				rd_ahead_enabled = 0;
				prefetch_enabled = 0;
				max_rd_size = max_throttle_size;
			} else {
				if (max_rd_size == max_throttle_size) {
					/*
					 * coming out of throttled state
					 */
					if (policy != THROTTLE_LEVEL_TIER3 && policy != THROTTLE_LEVEL_TIER2) {
						if (rap != NULL) {
							rd_ahead_enabled = 1;
						}
						prefetch_enabled = 1;
					}
					max_rd_size = max_prefetch;
					last_ioread_offset = 0;
				}
			}
		}
	}
	if (iolock_inited == TRUE) {
		/*
		 * cluster_io returned an error after it
		 * had already issued some I/O. we need
		 * to wait for that I/O to complete before
		 * we can destroy the iostate mutex...
		 * 'retval' already contains the early error
		 * so no need to pick it up from iostate.io_error
		 */
		cluster_iostate_wait(&iostate, 0, "cluster_read_copy");

		lck_mtx_destroy(&iostate.io_mtxp, &cl_mtx_grp);
	}
	if (rap != NULL) {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
		    (int)uio->uio_offset, io_req_size, rap->cl_lastr, retval, 0);

		lck_mtx_unlock(&rap->cl_lockr);
	} else {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
		    (int)uio->uio_offset, io_req_size, 0, retval, 0);
	}

	return retval;
}
4652
4653 /*
4654 * We don't want another read/write lock for every vnode in the system
4655 * so we keep a hash of them here. There should never be very many of
4656 * these around at any point in time.
4657 */
4658 cl_direct_read_lock_t *
cluster_lock_direct_read(vnode_t vp,lck_rw_type_t type)4659 cluster_lock_direct_read(vnode_t vp, lck_rw_type_t type)
4660 {
4661 struct cl_direct_read_locks *head
4662 = &cl_direct_read_locks[(uintptr_t)vp / sizeof(*vp)
4663 % CL_DIRECT_READ_LOCK_BUCKETS];
4664
4665 struct cl_direct_read_lock *lck, *new_lck = NULL;
4666
4667 for (;;) {
4668 lck_spin_lock(&cl_direct_read_spin_lock);
4669
4670 LIST_FOREACH(lck, head, chain) {
4671 if (lck->vp == vp) {
4672 ++lck->ref_count;
4673 lck_spin_unlock(&cl_direct_read_spin_lock);
4674 if (new_lck) {
4675 // Someone beat us to it, ditch the allocation
4676 lck_rw_destroy(&new_lck->rw_lock, &cl_mtx_grp);
4677 kfree_type(cl_direct_read_lock_t, new_lck);
4678 }
4679 lck_rw_lock(&lck->rw_lock, type);
4680 return lck;
4681 }
4682 }
4683
4684 if (new_lck) {
4685 // Use the lock we allocated
4686 LIST_INSERT_HEAD(head, new_lck, chain);
4687 lck_spin_unlock(&cl_direct_read_spin_lock);
4688 lck_rw_lock(&new_lck->rw_lock, type);
4689 return new_lck;
4690 }
4691
4692 lck_spin_unlock(&cl_direct_read_spin_lock);
4693
4694 // Allocate a new lock
4695 new_lck = kalloc_type(cl_direct_read_lock_t, Z_WAITOK);
4696 lck_rw_init(&new_lck->rw_lock, &cl_mtx_grp, LCK_ATTR_NULL);
4697 new_lck->vp = vp;
4698 new_lck->ref_count = 1;
4699
4700 // Got to go round again
4701 }
4702 }
4703
4704 void
cluster_unlock_direct_read(cl_direct_read_lock_t * lck)4705 cluster_unlock_direct_read(cl_direct_read_lock_t *lck)
4706 {
4707 lck_rw_done(&lck->rw_lock);
4708
4709 lck_spin_lock(&cl_direct_read_spin_lock);
4710 if (lck->ref_count == 1) {
4711 LIST_REMOVE(lck, chain);
4712 lck_spin_unlock(&cl_direct_read_spin_lock);
4713 lck_rw_destroy(&lck->rw_lock, &cl_mtx_grp);
4714 kfree_type(cl_direct_read_lock_t, lck);
4715 } else {
4716 --lck->ref_count;
4717 lck_spin_unlock(&cl_direct_read_spin_lock);
4718 }
4719 }
4720
4721 static int
cluster_read_direct(vnode_t vp,struct uio * uio,off_t filesize,int * read_type,u_int32_t * read_length,int flags,int (* callback)(buf_t,void *),void * callback_arg)4722 cluster_read_direct(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t *read_length,
4723 int flags, int (*callback)(buf_t, void *), void *callback_arg)
4724 {
4725 upl_t upl = NULL;
4726 upl_page_info_t *pl;
4727 off_t max_io_size;
4728 vm_offset_t upl_offset, vector_upl_offset = 0;
4729 upl_size_t upl_size = 0, vector_upl_size = 0;
4730 vm_size_t upl_needed_size;
4731 unsigned int pages_in_pl;
4732 upl_control_flags_t upl_flags;
4733 kern_return_t kret;
4734 unsigned int i;
4735 int force_data_sync;
4736 int retval = 0;
4737 int no_zero_fill = 0;
4738 int io_flag = 0;
4739 int misaligned = 0;
4740 struct clios iostate;
4741 user_addr_t iov_base;
4742 u_int32_t io_req_size;
4743 u_int32_t offset_in_file;
4744 u_int32_t offset_in_iovbase;
4745 u_int32_t io_size;
4746 u_int32_t io_min;
4747 u_int32_t xsize;
4748 u_int32_t devblocksize;
4749 u_int32_t mem_alignment_mask;
4750 u_int32_t max_upl_size;
4751 u_int32_t max_rd_size;
4752 u_int32_t max_rd_ahead;
4753 u_int32_t max_vector_size;
4754 boolean_t io_throttled = FALSE;
4755
4756 u_int32_t vector_upl_iosize = 0;
4757 int issueVectorUPL = 0, useVectorUPL = (uio->uio_iovcnt > 1);
4758 off_t v_upl_uio_offset = 0;
4759 int vector_upl_index = 0;
4760 upl_t vector_upl = NULL;
4761 cl_direct_read_lock_t *lock = NULL;
4762
4763 assert(vm_map_page_shift(current_map()) >= PAGE_SHIFT);
4764
4765 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_START,
4766 (int)uio->uio_offset, (int)filesize, *read_type, *read_length, 0);
4767
4768 max_upl_size = cluster_max_io_size(vp->v_mount, CL_READ);
4769
4770 max_rd_size = max_upl_size;
4771
4772 if (__improbable(os_mul_overflow(max_rd_size, IO_SCALE(vp, 2),
4773 &max_rd_ahead) || (max_rd_ahead > overlapping_read_max))) {
4774 max_rd_ahead = overlapping_read_max;
4775 }
4776
4777 io_flag = CL_COMMIT | CL_READ | CL_ASYNC | CL_NOZERO | CL_DIRECT_IO;
4778
4779 if (flags & IO_PASSIVE) {
4780 io_flag |= CL_PASSIVE;
4781 }
4782
4783 if (flags & IO_ENCRYPTED) {
4784 io_flag |= CL_RAW_ENCRYPTED;
4785 }
4786
4787 if (flags & IO_NOCACHE) {
4788 io_flag |= CL_NOCACHE;
4789 }
4790
4791 if (flags & IO_SKIP_ENCRYPTION) {
4792 io_flag |= CL_ENCRYPTED;
4793 }
4794
4795 iostate.io_completed = 0;
4796 iostate.io_issued = 0;
4797 iostate.io_error = 0;
4798 iostate.io_wanted = 0;
4799
4800 lck_mtx_init(&iostate.io_mtxp, &cl_mtx_grp, LCK_ATTR_NULL);
4801
4802 devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize;
4803 mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask;
4804
4805 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_NONE,
4806 (int)devblocksize, (int)mem_alignment_mask, 0, 0, 0);
4807
4808 if (devblocksize == 1) {
4809 /*
4810 * the AFP client advertises a devblocksize of 1
4811 * however, its BLOCKMAP routine maps to physical
4812 * blocks that are PAGE_SIZE in size...
4813 * therefore we can't ask for I/Os that aren't page aligned
4814 * or aren't multiples of PAGE_SIZE in size
4815 * by setting devblocksize to PAGE_SIZE, we re-instate
4816 * the old behavior we had before the mem_alignment_mask
4817 * changes went in...
4818 */
4819 devblocksize = PAGE_SIZE;
4820 }
4821
4822 /*
4823 * We are going to need this uio for the prefaulting later
4824 * especially for the cases where multiple non-contiguous
4825 * iovs are passed into this routine.
4826 */
4827 uio_t uio_acct = uio_duplicate(uio);
4828
4829 next_dread:
4830 io_req_size = *read_length;
4831 iov_base = uio_curriovbase(uio);
4832
4833 offset_in_file = (u_int32_t)uio->uio_offset & (devblocksize - 1);
4834 offset_in_iovbase = (u_int32_t)iov_base & mem_alignment_mask;
4835
4836 if (vm_map_page_mask(current_map()) < PAGE_MASK) {
4837 /*
4838 * XXX TODO4K
4839 * Direct I/O might not work as expected from a 16k kernel space
4840 * to a 4k user space because each 4k chunk might point to
4841 * a different 16k physical page...
4842 * Let's go the "misaligned" way.
4843 */
4844 if (!misaligned) {
4845 DEBUG4K_VFS("forcing misaligned\n");
4846 }
4847 misaligned = 1;
4848 }
4849
4850 if (offset_in_file || offset_in_iovbase) {
4851 /*
4852 * one of the 2 important offsets is misaligned
4853 * so fire an I/O through the cache for this entire vector
4854 */
4855 misaligned = 1;
4856 }
4857 if (iov_base & (devblocksize - 1)) {
4858 /*
4859 * the offset in memory must be on a device block boundary
4860 * so that we can guarantee that we can generate an
4861 * I/O that ends on a page boundary in cluster_io
4862 */
4863 misaligned = 1;
4864 }
4865
4866 max_io_size = filesize - uio->uio_offset;
4867
4868 /*
4869 * The user must request IO in aligned chunks. If the
4870 * offset into the file is bad, or the userland pointer
4871 * is non-aligned, then we cannot service the encrypted IO request.
4872 */
4873 if (flags & IO_ENCRYPTED) {
4874 if (misaligned || (io_req_size & (devblocksize - 1))) {
4875 retval = EINVAL;
4876 }
4877
4878 max_io_size = roundup(max_io_size, devblocksize);
4879 }
4880
4881 if ((off_t)io_req_size > max_io_size) {
4882 io_req_size = (u_int32_t)max_io_size;
4883 }
4884
4885 /*
4886 * When we get to this point, we know...
4887 * -- the offset into the file is on a devblocksize boundary
4888 */
4889
4890 while (io_req_size && retval == 0) {
4891 u_int32_t io_start;
4892
4893 if (cluster_is_throttled(vp)) {
4894 uint32_t max_throttle_size = calculate_max_throttle_size(vp);
4895
4896 /*
4897 * we're in the throttle window, at the very least
4898 * we want to limit the size of the I/O we're about
4899 * to issue
4900 */
4901 max_rd_size = max_throttle_size;
4902 max_rd_ahead = max_throttle_size - 1;
4903 max_vector_size = max_throttle_size;
4904 } else {
4905 max_rd_size = max_upl_size;
4906 max_rd_ahead = max_rd_size * IO_SCALE(vp, 2);
4907 max_vector_size = MAX_VECTOR_UPL_SIZE;
4908 }
4909 io_start = io_size = io_req_size;
4910
4911 /*
4912 * First look for pages already in the cache
4913 * and move them to user space. But only do this
4914 * check if we are not retrieving encrypted data directly
4915 * from the filesystem; those blocks should never
4916 * be in the UBC.
4917 *
4918 * cluster_copy_ubc_data returns the resid
4919 * in io_size
4920 */
4921 if ((flags & IO_ENCRYPTED) == 0) {
4922 retval = cluster_copy_ubc_data_internal(vp, uio, (int *)&io_size, 0, 0);
4923 }
4924 /*
4925 * calculate the number of bytes actually copied
4926 * starting size - residual
4927 */
4928 xsize = io_start - io_size;
4929
4930 io_req_size -= xsize;
4931
4932 if (useVectorUPL && (xsize || (iov_base & PAGE_MASK))) {
4933 /*
4934 * We found something in the cache or we have an iov_base that's not
4935 * page-aligned.
4936 *
4937 * Issue all I/O's that have been collected within this Vectored UPL.
4938 */
4939 if (vector_upl_index) {
4940 retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
4941 reset_vector_run_state();
4942 }
4943
4944 if (xsize) {
4945 useVectorUPL = 0;
4946 }
4947
4948 /*
4949 * After this point, if we are using the Vector UPL path and the base is
4950 * not page-aligned then the UPL with that base will be the first in the vector UPL.
4951 */
4952 }
4953
4954 /*
4955 * check to see if we are finished with this request.
4956 *
4957 * If we satisfied this IO already, then io_req_size will be 0.
4958 * Otherwise, see if the IO was mis-aligned and needs to go through
4959 * the UBC to deal with the 'tail'.
4960 *
4961 */
4962 if (io_req_size == 0 || (misaligned)) {
4963 /*
4964 * see if there's another uio vector to
4965 * process that's of type IO_DIRECT
4966 *
4967 * break out of while loop to get there
4968 */
4969 break;
4970 }
4971 /*
4972 * assume the request ends on a device block boundary
4973 */
4974 io_min = devblocksize;
4975
4976 /*
4977 * we can handle I/O's in multiples of the device block size
4978 * however, if io_size isn't a multiple of devblocksize we
4979 * want to clip it back to the nearest page boundary since
4980 * we are going to have to go through cluster_read_copy to
4981 * deal with the 'overhang'... by clipping it to a PAGE_SIZE
4982 * multiple, we avoid asking the drive for the same physical
4983 * blocks twice.. once for the partial page at the end of the
4984 * request and a 2nd time for the page we read into the cache
4985 * (which overlaps the end of the direct read) in order to
4986 * get at the overhang bytes
4987 */
4988 if (io_size & (devblocksize - 1)) {
4989 assert(!(flags & IO_ENCRYPTED));
4990 /*
4991 * Clip the request to the previous page size boundary
4992 * since request does NOT end on a device block boundary
4993 */
4994 io_size &= ~PAGE_MASK;
4995 io_min = PAGE_SIZE;
4996 }
4997 if (retval || io_size < io_min) {
4998 /*
4999 * either an error or we only have the tail left to
5000 * complete via the copy path...
5001 * we may have already spun some portion of this request
5002 * off as async requests... we need to wait for the I/O
5003 * to complete before returning
5004 */
5005 goto wait_for_dreads;
5006 }
5007
5008 /*
5009 * Don't re-check the UBC data if we are looking for uncached IO
5010 * or asking for encrypted blocks.
5011 */
5012 if ((flags & IO_ENCRYPTED) == 0) {
5013 if ((xsize = io_size) > max_rd_size) {
5014 xsize = max_rd_size;
5015 }
5016
5017 io_size = 0;
5018
5019 if (!lock) {
5020 /*
5021 * We hold a lock here between the time we check the
5022 * cache and the time we issue I/O. This saves us
5023 * from having to lock the pages in the cache. Not
5024 * all clients will care about this lock but some
5025 * clients may want to guarantee stability between
5026 * here and when the I/O is issued in which case they
5027 * will take the lock exclusively.
5028 */
5029 lock = cluster_lock_direct_read(vp, LCK_RW_TYPE_SHARED);
5030 }
5031
5032 ubc_range_op(vp, uio->uio_offset, uio->uio_offset + xsize, UPL_ROP_ABSENT, (int *)&io_size);
5033
5034 if (io_size == 0) {
5035 /*
5036 * a page must have just come into the cache
5037 * since the first page in this range is no
5038 * longer absent, go back and re-evaluate
5039 */
5040 continue;
5041 }
5042 }
5043 if ((flags & IO_RETURN_ON_THROTTLE)) {
5044 if (cluster_is_throttled(vp) == THROTTLE_NOW) {
5045 if (!cluster_io_present_in_BC(vp, uio->uio_offset)) {
5046 /*
5047 * we're in the throttle window and at least 1 I/O
5048 * has already been issued by a throttleable thread
5049 * in this window, so return with EAGAIN to indicate
5050 * to the FS issuing the cluster_read call that it
5051 * should now throttle after dropping any locks
5052 */
5053 throttle_info_update_by_mount(vp->v_mount);
5054
5055 io_throttled = TRUE;
5056 goto wait_for_dreads;
5057 }
5058 }
5059 }
5060 if (io_size > max_rd_size) {
5061 io_size = max_rd_size;
5062 }
5063
5064 iov_base = uio_curriovbase(uio);
5065
5066 upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK);
5067 upl_needed_size = (upl_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
5068
5069 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_START,
5070 (int)upl_offset, upl_needed_size, (int)iov_base, io_size, 0);
5071
5072 if (upl_offset == 0 && ((io_size & PAGE_MASK) == 0)) {
5073 no_zero_fill = 1;
5074 } else {
5075 no_zero_fill = 0;
5076 }
5077
5078 vm_map_t map = UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ? current_map() : kernel_map;
5079 for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) {
5080 pages_in_pl = 0;
5081 upl_size = (upl_size_t)upl_needed_size;
5082 upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
5083 if (no_zero_fill) {
5084 upl_flags |= UPL_NOZEROFILL;
5085 }
5086 if (force_data_sync) {
5087 upl_flags |= UPL_FORCE_DATA_SYNC;
5088 }
5089
5090 kret = vm_map_create_upl(map,
5091 (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
5092 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, VM_KERN_MEMORY_FILE);
5093
5094 if (kret != KERN_SUCCESS) {
5095 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
5096 (int)upl_offset, upl_size, io_size, kret, 0);
5097 /*
5098 * failed to get pagelist
5099 *
5100 * we may have already spun some portion of this request
5101 * off as async requests... we need to wait for the I/O
5102 * to complete before returning
5103 */
5104 goto wait_for_dreads;
5105 }
5106 pages_in_pl = upl_size / PAGE_SIZE;
5107 pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
5108
5109 for (i = 0; i < pages_in_pl; i++) {
5110 if (!upl_page_present(pl, i)) {
5111 break;
5112 }
5113 }
5114 if (i == pages_in_pl) {
5115 break;
5116 }
5117
5118 ubc_upl_abort(upl, 0);
5119 }
5120 if (force_data_sync >= 3) {
5121 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
5122 (int)upl_offset, upl_size, io_size, kret, 0);
5123
5124 goto wait_for_dreads;
5125 }
5126 /*
5127 * Consider the possibility that upl_size wasn't satisfied.
5128 */
5129 if (upl_size < upl_needed_size) {
5130 if (upl_size && upl_offset == 0) {
5131 io_size = upl_size;
5132 } else {
5133 io_size = 0;
5134 }
5135 }
5136 if (io_size == 0) {
5137 ubc_upl_abort(upl, 0);
5138 goto wait_for_dreads;
5139 }
5140 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
5141 (int)upl_offset, upl_size, io_size, kret, 0);
5142
5143 if (useVectorUPL) {
5144 vm_offset_t end_off = ((iov_base + io_size) & PAGE_MASK);
5145 if (end_off) {
5146 issueVectorUPL = 1;
5147 }
5148 /*
5149 * After this point, if we are using a vector UPL, then
5150 * either all the UPL elements end on a page boundary OR
5151 * this UPL is the last element because it does not end
5152 * on a page boundary.
5153 */
5154 }
5155
5156 /*
5157 * request asynchronously so that we can overlap
5158 * the preparation of the next I/O
5159 * if there are already too many outstanding reads
5160 * wait until some have completed before issuing the next read
5161 */
5162 cluster_iostate_wait(&iostate, max_rd_ahead, "cluster_read_direct");
5163
5164 if (iostate.io_error) {
5165 /*
5166 * one of the earlier reads we issued ran into a hard error
5167 * don't issue any more reads, cleanup the UPL
5168 * that was just created but not used, then
5169 * go wait for any other reads to complete before
5170 * returning the error to the caller
5171 */
5172 ubc_upl_abort(upl, 0);
5173
5174 goto wait_for_dreads;
5175 }
5176 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_START,
5177 upl, (int)upl_offset, (int)uio->uio_offset, io_size, 0);
5178
5179 if (!useVectorUPL) {
5180 if (no_zero_fill) {
5181 io_flag &= ~CL_PRESERVE;
5182 } else {
5183 io_flag |= CL_PRESERVE;
5184 }
5185
5186 retval = cluster_io(vp, upl, upl_offset, uio->uio_offset, io_size, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
5187 } else {
5188 if (!vector_upl_index) {
5189 vector_upl = vector_upl_create(upl_offset, uio->uio_iovcnt);
5190 v_upl_uio_offset = uio->uio_offset;
5191 vector_upl_offset = upl_offset;
5192 }
5193
5194 vector_upl_set_subupl(vector_upl, upl, upl_size);
5195 vector_upl_set_iostate(vector_upl, upl, vector_upl_size, upl_size);
5196 vector_upl_index++;
5197 vector_upl_size += upl_size;
5198 vector_upl_iosize += io_size;
5199
5200 if (issueVectorUPL || vector_upl_index == vector_upl_max_upls(vector_upl) || vector_upl_size >= max_vector_size) {
5201 retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
5202 reset_vector_run_state();
5203 }
5204 }
5205
5206 if (lock) {
5207 // We don't need to wait for the I/O to complete
5208 cluster_unlock_direct_read(lock);
5209 lock = NULL;
5210 }
5211
5212 /*
5213 * update the uio structure
5214 */
5215 if ((flags & IO_ENCRYPTED) && (max_io_size < io_size)) {
5216 uio_update(uio, (user_size_t)max_io_size);
5217 } else {
5218 uio_update(uio, (user_size_t)io_size);
5219 }
5220
5221 io_req_size -= io_size;
5222
5223 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_END,
5224 upl, (int)uio->uio_offset, io_req_size, retval, 0);
5225 } /* end while */
5226
5227 if (retval == 0 && iostate.io_error == 0 && io_req_size == 0 && uio->uio_offset < filesize) {
5228 retval = cluster_io_type(uio, read_type, read_length, 0);
5229
5230 if (retval == 0 && *read_type == IO_DIRECT) {
5231 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_NONE,
5232 (int)uio->uio_offset, (int)filesize, *read_type, *read_length, 0);
5233
5234 goto next_dread;
5235 }
5236 }
5237
5238 wait_for_dreads:
5239
5240 if (retval == 0 && iostate.io_error == 0 && useVectorUPL && vector_upl_index) {
5241 retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
5242 reset_vector_run_state();
5243 }
5244
5245 // We don't need to wait for the I/O to complete
5246 if (lock) {
5247 cluster_unlock_direct_read(lock);
5248 }
5249
5250 /*
5251 * make sure all async reads that are part of this stream
5252 * have completed before we return
5253 */
5254 cluster_iostate_wait(&iostate, 0, "cluster_read_direct");
5255
5256 if (iostate.io_error) {
5257 retval = iostate.io_error;
5258 }
5259
5260 lck_mtx_destroy(&iostate.io_mtxp, &cl_mtx_grp);
5261
5262 if (io_throttled == TRUE && retval == 0) {
5263 retval = EAGAIN;
5264 }
5265
5266 vm_map_offset_t current_page_size, current_page_mask;
5267 current_page_size = vm_map_page_size(current_map());
5268 current_page_mask = vm_map_page_mask(current_map());
5269 if (uio_acct) {
5270 off_t bytes_to_prefault = 0, bytes_prefaulted = 0;
5271 user_addr_t curr_iov_base = 0;
5272 user_addr_t curr_iov_end = 0;
5273 user_size_t curr_iov_len = 0;
5274
5275 bytes_to_prefault = uio_offset(uio) - uio_offset(uio_acct);
5276
5277 for (; bytes_prefaulted < bytes_to_prefault;) {
5278 curr_iov_base = uio_curriovbase(uio_acct);
5279 curr_iov_len = MIN(uio_curriovlen(uio_acct), bytes_to_prefault - bytes_prefaulted);
5280 curr_iov_end = curr_iov_base + curr_iov_len;
5281
5282 for (; curr_iov_base < curr_iov_end;) {
5283 /*
5284 * This is specifically done for pmap accounting purposes.
5285 * vm_pre_fault() will call vm_fault() to enter the page into
5286 * the pmap if there isn't _a_ physical page for that VA already.
5287 */
5288 vm_pre_fault(vm_map_trunc_page(curr_iov_base, current_page_mask), VM_PROT_READ);
5289 curr_iov_base += current_page_size;
5290 bytes_prefaulted += current_page_size;
5291 }
5292 /*
5293 * Use update instead of advance so we can see how many iovs we processed.
5294 */
5295 uio_update(uio_acct, curr_iov_len);
5296 }
5297 uio_free(uio_acct);
5298 uio_acct = NULL;
5299 }
5300
5301 if (io_req_size && retval == 0) {
5302 /*
5303 * we couldn't handle the tail of this request in DIRECT mode
5304 * so fire it through the copy path
5305 */
5306 if (flags & IO_ENCRYPTED) {
5307 /*
5308 * We cannot fall back to the copy path for encrypted I/O. If this
5309 * happens, there is something wrong with the user buffer passed
5310 * down.
5311 */
5312 retval = EFAULT;
5313 } else {
5314 retval = cluster_read_copy(vp, uio, io_req_size, filesize, flags, callback, callback_arg);
5315 }
5316
5317 *read_type = IO_UNKNOWN;
5318 }
5319 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_END,
5320 (int)uio->uio_offset, (int)uio_resid(uio), io_req_size, retval, 0);
5321
5322 return retval;
5323 }
5324
5325
/*
 * Direct read into a physically contiguous target buffer (the device-memory
 * read path).
 *
 * On entry the caller guarantees:
 *   -- *read_length does not exceed the current iov_len
 *   -- the target address is physically contiguous for *read_length bytes
 *
 * Dirty cached data for the file is pushed synchronously first, then the
 * transfer is issued as asynchronous CL_DEV_MEMORY I/O in chunks of at most
 * MAX_IO_CONTIG_SIZE.  Device-block-misaligned head and tail fragments are
 * bounced through cluster_align_phys_io().  Up to MAX_VECTS physically
 * contiguous iovecs are processed per call (via the next_cread loop); the
 * uio is advanced past whatever was transferred.
 *
 * Returns 0 on success or an errno (EINVAL for pagelist/alignment problems,
 * or any error reported by the async I/O stream).
 */
static int
cluster_read_contig(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t *read_length,
    int (*callback)(buf_t, void *), void *callback_arg, int flags)
{
	upl_page_info_t *pl;
	upl_t upl[MAX_VECTS];           /* one UPL per contiguous iovec processed */
	vm_offset_t upl_offset;
	addr64_t dst_paddr = 0;         /* physical address we are copying into */
	user_addr_t iov_base;
	off_t max_size;
	upl_size_t upl_size;
	vm_size_t upl_needed_size;
	mach_msg_type_number_t pages_in_pl;
	upl_control_flags_t upl_flags;
	kern_return_t kret;
	struct clios iostate;           /* tracks the async reads issued below */
	int error = 0;
	int cur_upl = 0;                /* index of the UPL for the current iovec */
	int num_upl = 0;                /* how many UPLs must be aborted on exit */
	int n;
	u_int32_t xsize;
	u_int32_t io_size;
	u_int32_t devblocksize;
	u_int32_t mem_alignment_mask;
	u_int32_t tail_size = 0;        /* sub-devblock remainder, done via copy */
	int bflag;

	/* translate caller I/O policy flags into cluster_io flags */
	if (flags & IO_PASSIVE) {
		bflag = CL_PASSIVE;
	} else {
		bflag = 0;
	}

	if (flags & IO_NOCACHE) {
		bflag |= CL_NOCACHE;
	}

	/*
	 * When we enter this routine, we know
	 * -- the read_length will not exceed the current iov_len
	 * -- the target address is physically contiguous for read_length
	 *
	 * push any dirty cached data synchronously so the device read
	 * observes the latest file contents
	 */
	cluster_syncup(vp, filesize, callback, callback_arg, PUSH_SYNC);

	devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize;
	mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask;

	iostate.io_completed = 0;
	iostate.io_issued = 0;
	iostate.io_error = 0;
	iostate.io_wanted = 0;

	lck_mtx_init(&iostate.io_mtxp, &cl_mtx_grp, LCK_ATTR_NULL);

next_cread:
	io_size = *read_length;

	/* never read past EOF */
	max_size = filesize - uio->uio_offset;

	if (io_size > max_size) {
		io_size = (u_int32_t)max_size;
	}

	iov_base = uio_curriovbase(uio);

	upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK);
	upl_needed_size = upl_offset + io_size;

	pages_in_pl = 0;
	upl_size = (upl_size_t)upl_needed_size;
	upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;


	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 92)) | DBG_FUNC_START,
	    (int)upl_offset, (int)upl_size, (int)iov_base, io_size, 0);

	/* wire down the target buffer; it may live in user or kernel space */
	vm_map_t map = UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ? current_map() : kernel_map;
	kret = vm_map_get_upl(map,
	    vm_map_trunc_page(iov_base, vm_map_page_mask(map)),
	    &upl_size, &upl[cur_upl], NULL, &pages_in_pl, &upl_flags, VM_KERN_MEMORY_FILE, 0);

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 92)) | DBG_FUNC_END,
	    (int)upl_offset, upl_size, io_size, kret, 0);

	if (kret != KERN_SUCCESS) {
		/*
		 * failed to get pagelist
		 */
		error = EINVAL;
		goto wait_for_creads;
	}
	/* from here on, upl[cur_upl] must be aborted on the way out */
	num_upl++;

	if (upl_size < upl_needed_size) {
		/*
		 * The upl_size wasn't satisfied.
		 */
		error = EINVAL;
		goto wait_for_creads;
	}
	pl = ubc_upl_pageinfo(upl[cur_upl]);

	/* physical destination = first page's frame + offset within the page */
	dst_paddr = ((addr64_t)upl_phys_page(pl, 0) << PAGE_SHIFT) + (addr64_t)upl_offset;

	/*
	 * peel off a misaligned "head": copy byte-at-a-time granularity via
	 * cluster_align_phys_io until the file offset lands on a device block
	 * boundary (or the request is consumed / smaller than one block)
	 */
	while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {
		u_int32_t head_size;

		head_size = devblocksize - (u_int32_t)(uio->uio_offset & (devblocksize - 1));

		if (head_size > io_size) {
			head_size = io_size;
		}

		/* advances the uio on success */
		error = cluster_align_phys_io(vp, uio, dst_paddr, head_size, CL_READ, callback, callback_arg);

		if (error) {
			goto wait_for_creads;
		}

		upl_offset += head_size;
		dst_paddr += head_size;
		io_size -= head_size;

		iov_base += head_size;
	}
	if ((u_int32_t)iov_base & mem_alignment_mask) {
		/*
		 * request isn't aligned to a memory boundary
		 * the underlying DMA engine can handle...
		 * return an error instead of going through
		 * the slow copy path since the intent of this
		 * path is direct I/O to device memory
		 */
		error = EINVAL;
		goto wait_for_creads;
	}

	/* defer the sub-devblock remainder; it's copied after the async I/O */
	tail_size = io_size & (devblocksize - 1);

	io_size -= tail_size;

	/* issue the block-aligned middle as async device-memory reads */
	while (io_size && error == 0) {
		if (io_size > MAX_IO_CONTIG_SIZE) {
			xsize = MAX_IO_CONTIG_SIZE;
		} else {
			xsize = io_size;
		}
		/*
		 * request asynchronously so that we can overlap
		 * the preparation of the next I/O... we'll do
		 * the commit after all the I/O has completed
		 * since its all issued against the same UPL
		 * if there are already too many outstanding reads
		 * wait until some have completed before issuing the next
		 */
		cluster_iostate_wait(&iostate, MAX_IO_CONTIG_SIZE * IO_SCALE(vp, 2), "cluster_read_contig");

		if (iostate.io_error) {
			/*
			 * one of the earlier reads we issued ran into a hard error
			 * don't issue any more reads...
			 * go wait for any other reads to complete before
			 * returning the error to the caller
			 */
			goto wait_for_creads;
		}
		error = cluster_io(vp, upl[cur_upl], upl_offset, uio->uio_offset, xsize,
		    CL_READ | CL_NOZERO | CL_DEV_MEMORY | CL_ASYNC | bflag,
		    (buf_t)NULL, &iostate, callback, callback_arg);
		/*
		 * The cluster_io read was issued successfully,
		 * update the uio structure
		 */
		if (error == 0) {
			uio_update(uio, (user_size_t)xsize);

			dst_paddr += xsize;
			upl_offset += xsize;
			io_size -= xsize;
		}
	}
	/*
	 * if everything so far succeeded and there was no tail fragment,
	 * see whether the next iovec is also physically contiguous and,
	 * if so, process it too (up to MAX_VECTS per call)
	 */
	if (error == 0 && iostate.io_error == 0 && tail_size == 0 && num_upl < MAX_VECTS && uio->uio_offset < filesize) {
		error = cluster_io_type(uio, read_type, read_length, 0);

		if (error == 0 && *read_type == IO_CONTIG) {
			cur_upl++;
			goto next_cread;
		}
	} else {
		*read_type = IO_UNKNOWN;
	}

wait_for_creads:
	/*
	 * make sure all async reads that are part of this stream
	 * have completed before we proceed
	 */
	cluster_iostate_wait(&iostate, 0, "cluster_read_contig");

	if (iostate.io_error) {
		error = iostate.io_error;
	}

	lck_mtx_destroy(&iostate.io_mtxp, &cl_mtx_grp);

	/* finish the sub-devblock tail with an aligned bounce copy */
	if (error == 0 && tail_size) {
		error = cluster_align_phys_io(vp, uio, dst_paddr, tail_size, CL_READ, callback, callback_arg);
	}

	for (n = 0; n < num_upl; n++) {
		/*
		 * just release our hold on each physically contiguous
		 * region without changing any state
		 */
		ubc_upl_abort(upl[n], 0);
	}

	return error;
}
5545
5546
5547 static int
cluster_io_type(struct uio * uio,int * io_type,u_int32_t * io_length,u_int32_t min_length)5548 cluster_io_type(struct uio *uio, int *io_type, u_int32_t *io_length, u_int32_t min_length)
5549 {
5550 user_size_t iov_len;
5551 user_addr_t iov_base = 0;
5552 upl_t upl;
5553 upl_size_t upl_size;
5554 upl_control_flags_t upl_flags;
5555 int retval = 0;
5556
5557 /*
5558 * skip over any emtpy vectors
5559 */
5560 uio_update(uio, (user_size_t)0);
5561
5562 iov_len = uio_curriovlen(uio);
5563
5564 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 94)) | DBG_FUNC_START, uio, (int)iov_len, 0, 0, 0);
5565
5566 if (iov_len) {
5567 iov_base = uio_curriovbase(uio);
5568 /*
5569 * make sure the size of the vector isn't too big...
5570 * internally, we want to handle all of the I/O in
5571 * chunk sizes that fit in a 32 bit int
5572 */
5573 if (iov_len > (user_size_t)MAX_IO_REQUEST_SIZE) {
5574 upl_size = MAX_IO_REQUEST_SIZE;
5575 } else {
5576 upl_size = (u_int32_t)iov_len;
5577 }
5578
5579 upl_flags = UPL_QUERY_OBJECT_TYPE;
5580
5581 vm_map_t map = UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ? current_map() : kernel_map;
5582 if ((vm_map_get_upl(map,
5583 vm_map_trunc_page(iov_base, vm_map_page_mask(map)),
5584 &upl_size, &upl, NULL, NULL, &upl_flags, VM_KERN_MEMORY_FILE, 0)) != KERN_SUCCESS) {
5585 /*
5586 * the user app must have passed in an invalid address
5587 */
5588 retval = EFAULT;
5589 }
5590 if (upl_size == 0) {
5591 retval = EFAULT;
5592 }
5593
5594 *io_length = upl_size;
5595
5596 if (upl_flags & UPL_PHYS_CONTIG) {
5597 *io_type = IO_CONTIG;
5598 } else if (iov_len >= min_length) {
5599 *io_type = IO_DIRECT;
5600 } else {
5601 *io_type = IO_COPY;
5602 }
5603 } else {
5604 /*
5605 * nothing left to do for this uio
5606 */
5607 *io_length = 0;
5608 *io_type = IO_UNKNOWN;
5609 }
5610 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 94)) | DBG_FUNC_END, iov_base, *io_type, *io_length, retval, 0);
5611
5612 if (*io_type == IO_DIRECT &&
5613 vm_map_page_shift(current_map()) < PAGE_SHIFT) {
5614 /* no direct I/O for sub-page-size address spaces */
5615 DEBUG4K_VFS("io_type IO_DIRECT -> IO_COPY\n");
5616 *io_type = IO_COPY;
5617 }
5618
5619 return retval;
5620 }
5621
5622
5623 /*
5624 * generate advisory I/O's in the largest chunks possible
5625 * the completed pages will be released into the VM cache
5626 */
/*
 * Convenience wrapper around advisory_read_ext(): issues advisory
 * (speculative) reads for [f_offset, f_offset + resid) with no
 * completion callback and passive (low-priority) I/O behavior.
 */
int
advisory_read(vnode_t vp, off_t filesize, off_t f_offset, int resid)
{
	return advisory_read_ext(vp, filesize, f_offset, resid, NULL, NULL, CL_PASSIVE);
}
5632
/*
 * Generate advisory (speculative read-ahead) I/O for the byte range
 * [f_offset, f_offset + resid) of vp, in the largest chunks possible.
 * Pages already present in the UBC are skipped; absent page runs are
 * faulted in with async CL_READ | CL_COMMIT I/O so completed pages are
 * released straight into the VM cache.
 *
 * bflag is OR'd into the cluster_io flags (e.g. CL_PASSIVE); callback /
 * callback_arg are passed through to cluster_io for each issued buf.
 *
 * Returns 0, EINVAL for bad arguments or a vnode with no ubc info, or
 * the first error reported by cluster_io.
 */
int
advisory_read_ext(vnode_t vp, off_t filesize, off_t f_offset, int resid, int (*callback)(buf_t, void *), void *callback_arg, int bflag)
{
	upl_page_info_t *pl;
	upl_t upl;
	vm_offset_t upl_offset;
	int upl_size;
	off_t upl_f_offset;     /* page-aligned file offset the UPL starts at */
	int start_offset;       /* f_offset's offset within its page */
	int start_pg;
	int last_pg;
	int pages_in_upl;
	off_t max_size;
	int io_size;
	kern_return_t kret;
	int retval = 0;
	int issued_io;
	int skip_range;
	uint32_t max_io_size;


	if (!UBCINFOEXISTS(vp)) {
		return EINVAL;
	}

	if (f_offset < 0 || resid < 0) {
		return EINVAL;
	}

	max_io_size = cluster_max_io_size(vp->v_mount, CL_READ);

	/* on SSDs, cap advisory reads at the speculative prefetch limit */
	if (disk_conditioner_mount_is_ssd(vp->v_mount)) {
		if (max_io_size > speculative_prefetch_max_iosize) {
			max_io_size = speculative_prefetch_max_iosize;
		}
	}

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_START,
	    (int)f_offset, resid, (int)filesize, 0, 0);

	while (resid && f_offset < filesize && retval == 0) {
		/*
		 * compute the size of the upl needed to encompass
		 * the requested read... limit each call to cluster_io
		 * to the maximum UPL size... cluster_io will clip if
		 * this exceeds the maximum io_size for the device,
		 * make sure to account for
		 * a starting offset that's not page aligned
		 */
		start_offset = (int)(f_offset & PAGE_MASK_64);
		upl_f_offset = f_offset - (off_t)start_offset;
		max_size = filesize - f_offset;

		if (resid < max_size) {
			io_size = resid;
		} else {
			io_size = (int)max_size;
		}

		upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
		if ((uint32_t)upl_size > max_io_size) {
			upl_size = max_io_size;
		}

		skip_range = 0;
		/*
		 * return the number of contiguously present pages in the cache
		 * starting at upl_f_offset within the file
		 */
		ubc_range_op(vp, upl_f_offset, upl_f_offset + upl_size, UPL_ROP_PRESENT, &skip_range);

		if (skip_range) {
			/*
			 * skip over pages already present in the cache
			 */
			io_size = skip_range - start_offset;

			f_offset += io_size;
			resid -= io_size;

			if (skip_range == upl_size) {
				continue;
			}
			/*
			 * have to issue some real I/O
			 * at this point, we know it's starting on a page boundary
			 * because we've skipped over at least the first page in the request
			 */
			start_offset = 0;
			upl_f_offset += skip_range;
			upl_size -= skip_range;
		}
		pages_in_upl = upl_size / PAGE_SIZE;

		/*
		 * NOTE(review): 'upl' is traced here before ubc_create_upl_kernel()
		 * assigns it -- on the first iteration this logs an uninitialized
		 * value.  Harmless for a trace argument, but worth confirming.
		 */
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) | DBG_FUNC_START,
		    upl, (int)upl_f_offset, upl_size, start_offset, 0);

		/* grab only the pages that are absent from the cache */
		kret = ubc_create_upl_kernel(vp,
		    upl_f_offset,
		    upl_size,
		    &upl,
		    &pl,
		    UPL_RET_ONLY_ABSENT | UPL_SET_LITE,
		    VM_KERN_MEMORY_FILE);
		if (kret != KERN_SUCCESS) {
			/*
			 * advisory reads are only a hint, so a UPL creation
			 * failure ends the request quietly with the current
			 * (so far successful) retval
			 */
			return retval;
		}
		issued_io = 0;

		/*
		 * before we start marching forward, we must make sure we end on
		 * a present page, otherwise we will be working with a freed
		 * upl
		 */
		for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--) {
			if (upl_page_present(pl, last_pg)) {
				break;
			}
		}
		pages_in_upl = last_pg + 1;


		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) | DBG_FUNC_END,
		    upl, (int)upl_f_offset, upl_size, start_offset, 0);


		for (last_pg = 0; last_pg < pages_in_upl;) {
			/*
			 * scan from the beginning of the upl looking for the first
			 * page that is present.... this will become the first page in
			 * the request we're going to make to 'cluster_io'... if all
			 * of the pages are absent, we won't call through to 'cluster_io'
			 */
			for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
				if (upl_page_present(pl, start_pg)) {
					break;
				}
			}

			/*
			 * scan from the starting present page looking for an absent
			 * page before the end of the upl is reached, if we
			 * find one, then it will terminate the range of pages being
			 * presented to 'cluster_io'
			 */
			for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
				if (!upl_page_present(pl, last_pg)) {
					break;
				}
			}

			if (last_pg > start_pg) {
				/*
				 * we found a range of pages that must be filled
				 * if the last page in this range is the last page of the file
				 * we may have to clip the size of it to keep from reading past
				 * the end of the last physical block associated with the file
				 */
				upl_offset = start_pg * PAGE_SIZE;
				io_size = (last_pg - start_pg) * PAGE_SIZE;

				if ((off_t)(upl_f_offset + upl_offset + io_size) > filesize) {
					io_size = (int)(filesize - (upl_f_offset + upl_offset));
				}

				/*
				 * issue an asynchronous read to cluster_io
				 */
				retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size,
				    CL_ASYNC | CL_READ | CL_COMMIT | CL_AGE | bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);

				issued_io = 1;
			}
		}
		if (issued_io == 0) {
			/* nothing to read -- release the UPL without changing state */
			ubc_upl_abort(upl, 0);
		}

		/* account for the portion of the request this UPL covered */
		io_size = upl_size - start_offset;

		if (io_size > resid) {
			io_size = resid;
		}
		f_offset += io_size;
		resid -= io_size;
	}

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_END,
	    (int)f_offset, resid, retval, 0, 0);

	return retval;
}
5825
5826
/*
 * Push any pending dirty clusters for vp with default behavior
 * (no callback).  Convenience wrapper around cluster_push_ext().
 */
int
cluster_push(vnode_t vp, int flags)
{
	return cluster_push_ext(vp, flags, NULL, NULL);
}
5832
5833
/*
 * Push pending dirty clusters for vp with a per-buf callback.
 * Wrapper around cluster_push_err() that discards the error detail
 * and returns only the cluster count.
 */
int
cluster_push_ext(vnode_t vp, int flags, int (*callback)(buf_t, void *), void *callback_arg)
{
	return cluster_push_err(vp, flags, callback, callback_arg, NULL);
}
5839
5840 /* write errors via err, but return the number of clusters written */
5841 extern uint32_t system_inshutdown;
5842 uint32_t cl_sparse_push_error = 0;
5843 int
cluster_push_err(vnode_t vp,int flags,int (* callback)(buf_t,void *),void * callback_arg,int * err)5844 cluster_push_err(vnode_t vp, int flags, int (*callback)(buf_t, void *), void *callback_arg, int *err)
5845 {
5846 int retval;
5847 int my_sparse_wait = 0;
5848 struct cl_writebehind *wbp;
5849 int local_err = 0;
5850
5851 if (err) {
5852 *err = 0;
5853 }
5854
5855 if (!UBCINFOEXISTS(vp)) {
5856 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, kdebug_vnode(vp), flags, 0, -1, 0);
5857 return 0;
5858 }
5859 /* return if deferred write is set */
5860 if (((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE) && (flags & IO_DEFWRITE)) {
5861 return 0;
5862 }
5863 if ((wbp = cluster_get_wbp(vp, CLW_RETURNLOCKED)) == NULL) {
5864 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, kdebug_vnode(vp), flags, 0, -2, 0);
5865 return 0;
5866 }
5867 if (!ISSET(flags, IO_SYNC) && wbp->cl_number == 0 && wbp->cl_scmap == NULL) {
5868 lck_mtx_unlock(&wbp->cl_lockw);
5869
5870 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, kdebug_vnode(vp), flags, 0, -3, 0);
5871 return 0;
5872 }
5873 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_START,
5874 wbp->cl_scmap, wbp->cl_number, flags, 0, 0);
5875
5876 /*
5877 * if we have an fsync in progress, we don't want to allow any additional
5878 * sync/fsync/close(s) to occur until it finishes.
5879 * note that its possible for writes to continue to occur to this file
5880 * while we're waiting and also once the fsync starts to clean if we're
5881 * in the sparse map case
5882 */
5883 while (wbp->cl_sparse_wait) {
5884 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_START, kdebug_vnode(vp), 0, 0, 0, 0);
5885
5886 msleep((caddr_t)&wbp->cl_sparse_wait, &wbp->cl_lockw, PRIBIO + 1, "cluster_push_ext", NULL);
5887
5888 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_END, kdebug_vnode(vp), 0, 0, 0, 0);
5889 }
5890 if (flags & IO_SYNC) {
5891 my_sparse_wait = 1;
5892 wbp->cl_sparse_wait = 1;
5893
5894 /*
5895 * this is an fsync (or equivalent)... we must wait for any existing async
 * cleaning operations to complete before we evaluate the current state
 * and finish cleaning... this ensures that all writes issued before this
5898 * fsync actually get cleaned to the disk before this fsync returns
5899 */
5900 while (wbp->cl_sparse_pushes) {
5901 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 98)) | DBG_FUNC_START, kdebug_vnode(vp), 0, 0, 0, 0);
5902
5903 msleep((caddr_t)&wbp->cl_sparse_pushes, &wbp->cl_lockw, PRIBIO + 1, "cluster_push_ext", NULL);
5904
5905 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 98)) | DBG_FUNC_END, kdebug_vnode(vp), 0, 0, 0, 0);
5906 }
5907 }
5908 if (wbp->cl_scmap) {
5909 void *scmap;
5910
5911 if (wbp->cl_sparse_pushes < SPARSE_PUSH_LIMIT) {
5912 scmap = wbp->cl_scmap;
5913 wbp->cl_scmap = NULL;
5914
5915 wbp->cl_sparse_pushes++;
5916
5917 lck_mtx_unlock(&wbp->cl_lockw);
5918
5919 retval = sparse_cluster_push(wbp, &scmap, vp, ubc_getsize(vp), PUSH_ALL, flags, callback, callback_arg, FALSE);
5920
5921 lck_mtx_lock(&wbp->cl_lockw);
5922
5923 wbp->cl_sparse_pushes--;
5924
5925 if (retval) {
5926 if (wbp->cl_scmap != NULL) {
5927 /*
5928 * panic("cluster_push_err: Expected NULL cl_scmap\n");
5929 *
5930 * This can happen if we get an error from the underlying FS
5931 * e.g. ENOSPC, EPERM or EIO etc. We hope that these errors
5932 * are transient and the I/Os will succeed at a later point.
5933 *
5934 * The tricky part here is that a new sparse cluster has been
5935 * allocated and tracking a different set of dirty pages. So these
5936 * pages are not going to be pushed out with the next sparse_cluster_push.
5937 * An explicit msync or file close will, however, push the pages out.
5938 *
5939 * What if those calls still don't work? And so, during shutdown we keep
5940 * trying till we succeed...
5941 */
5942
5943 if (system_inshutdown) {
5944 if ((retval == ENOSPC) && (vp->v_mount->mnt_flag & (MNT_LOCAL | MNT_REMOVABLE)) == MNT_LOCAL) {
5945 os_atomic_inc(&cl_sparse_push_error, relaxed);
5946 }
5947 } else {
5948 vfs_drt_control(&scmap, 0); /* emit stats and free this memory. Dirty pages stay intact. */
5949 scmap = NULL;
5950 }
5951 } else {
5952 wbp->cl_scmap = scmap;
5953 }
5954 }
5955
5956 if (wbp->cl_sparse_wait && wbp->cl_sparse_pushes == 0) {
5957 wakeup((caddr_t)&wbp->cl_sparse_pushes);
5958 }
5959 } else {
5960 retval = sparse_cluster_push(wbp, &(wbp->cl_scmap), vp, ubc_getsize(vp), PUSH_ALL, flags, callback, callback_arg, FALSE);
5961 }
5962
5963 local_err = retval;
5964
5965 if (err) {
5966 *err = retval;
5967 }
5968 retval = 1;
5969 } else {
5970 retval = cluster_try_push(wbp, vp, ubc_getsize(vp), PUSH_ALL, flags, callback, callback_arg, &local_err, FALSE);
5971 if (err) {
5972 *err = local_err;
5973 }
5974 }
5975 lck_mtx_unlock(&wbp->cl_lockw);
5976
5977 if (flags & IO_SYNC) {
5978 (void)vnode_waitforwrites(vp, 0, 0, 0, "cluster_push");
5979 }
5980
5981 if (my_sparse_wait) {
5982 /*
5983 * I'm the owner of the serialization token
5984 * clear it and wakeup anyone that is waiting
5985 * for me to finish
5986 */
5987 lck_mtx_lock(&wbp->cl_lockw);
5988
5989 wbp->cl_sparse_wait = 0;
5990 wakeup((caddr_t)&wbp->cl_sparse_wait);
5991
5992 lck_mtx_unlock(&wbp->cl_lockw);
5993 }
5994 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_END,
5995 wbp->cl_scmap, wbp->cl_number, retval, local_err, 0);
5996
5997 return retval;
5998 }
5999
6000
6001 __private_extern__ void
cluster_release(struct ubc_info * ubc)6002 cluster_release(struct ubc_info *ubc)
6003 {
6004 struct cl_writebehind *wbp;
6005 struct cl_readahead *rap;
6006
6007 if ((wbp = ubc->cl_wbehind)) {
6008 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_START, ubc, wbp->cl_scmap, 0, 0, 0);
6009
6010 if (wbp->cl_scmap) {
6011 vfs_drt_control(&(wbp->cl_scmap), 0);
6012 }
6013 lck_mtx_destroy(&wbp->cl_lockw, &cl_mtx_grp);
6014 zfree(cl_wr_zone, wbp);
6015 ubc->cl_wbehind = NULL;
6016 } else {
6017 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_START, ubc, 0, 0, 0, 0);
6018 }
6019
6020 if ((rap = ubc->cl_rahead)) {
6021 lck_mtx_destroy(&rap->cl_lockr, &cl_mtx_grp);
6022 zfree(cl_rd_zone, rap);
6023 ubc->cl_rahead = NULL;
6024 }
6025
6026 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_END, ubc, rap, wbp, 0, 0);
6027 }
6028
6029
/*
 * Push some or all of the clusters tracked in the write-behind context.
 * Called with wbp->cl_lockw held; when vm_initiated is TRUE the lock is
 * dropped around the actual I/O and re-taken afterwards.
 *
 * Returns the number of empty cluster slots left in the context.  The
 * first I/O error encountered is reported through *err (when non-NULL).
 */
static int
cluster_try_push(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int push_flag, int io_flags, int (*callback)(buf_t, void *), void *callback_arg, int *err, boolean_t vm_initiated)
{
	int cl_index;
	int cl_index1;
	int min_index;
	int cl_len;
	int cl_pushed = 0;
	struct cl_wextent l_clusters[MAX_CLUSTERS];
	u_int max_cluster_pgcount;
	int error = 0;

	max_cluster_pgcount = MAX_CLUSTER_SIZE(vp) / PAGE_SIZE;
	/*
	 * the write behind context exists and has
	 * already been locked...
	 */
	if (wbp->cl_number == 0) {
		/*
		 * no clusters to push
		 * return number of empty slots
		 */
		return MAX_CLUSTERS;
	}

	/*
	 * make a local 'sorted' copy of the clusters
	 * and clear wbp->cl_number so that new clusters can
	 * be developed
	 *
	 * selection sort by b_addr: each pass pulls the lowest-addressed
	 * remaining cluster into l_clusters and empties its source slot
	 * (b_addr == e_addr marks a slot as consumed)
	 */
	for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
		for (min_index = -1, cl_index1 = 0; cl_index1 < wbp->cl_number; cl_index1++) {
			if (wbp->cl_clusters[cl_index1].b_addr == wbp->cl_clusters[cl_index1].e_addr) {
				continue;
			}
			if (min_index == -1) {
				min_index = cl_index1;
			} else if (wbp->cl_clusters[cl_index1].b_addr < wbp->cl_clusters[min_index].b_addr) {
				min_index = cl_index1;
			}
		}
		if (min_index == -1) {
			break;
		}

		l_clusters[cl_index].b_addr = wbp->cl_clusters[min_index].b_addr;
		l_clusters[cl_index].e_addr = wbp->cl_clusters[min_index].e_addr;
		l_clusters[cl_index].io_flags = wbp->cl_clusters[min_index].io_flags;

		wbp->cl_clusters[min_index].b_addr = wbp->cl_clusters[min_index].e_addr;
	}
	wbp->cl_number = 0;

	cl_len = cl_index;

	/* skip switching to the sparse cluster mechanism if on diskimage */
	if (((push_flag & PUSH_DELAY) && cl_len == MAX_CLUSTERS) &&
	    !(vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV)) {
		int i;

		/*
		 * determine if we appear to be writing the file sequentially
		 * if not, by returning without having pushed any clusters
		 * we will cause this vnode to be pushed into the sparse cluster mechanism
		 * used for managing more random I/O patterns
		 *
		 * we know that we've got all clusters currently in use and the next write doesn't fit into one of them...
		 * that's why we're in try_push with PUSH_DELAY...
		 *
		 * check to make sure that all the clusters except the last one are 'full'... and that each cluster
		 * is adjacent to the next (i.e. we're looking for sequential writes) they were sorted above
		 * so we can just make a simple pass through, up to, but not including the last one...
		 * note that e_addr is not inclusive, so it will be equal to the b_addr of the next cluster if they
		 * are sequential
		 *
		 * we let the last one be partial as long as it was adjacent to the previous one...
		 * we need to do this to deal with multi-threaded servers that might write an I/O or 2 out
		 * of order... if this occurs at the tail of the last cluster, we don't want to fall into the sparse cluster world...
		 */
		for (i = 0; i < MAX_CLUSTERS - 1; i++) {
			if ((l_clusters[i].e_addr - l_clusters[i].b_addr) != max_cluster_pgcount) {
				goto dont_try;
			}
			if (l_clusters[i].e_addr != l_clusters[i + 1].b_addr) {
				goto dont_try;
			}
		}
	}
	/*
	 * for VM-initiated pushes the write-behind lock is dropped while the
	 * I/O is issued... note that the goto dont_try paths above bypass the
	 * unlock, so the lock is still held on that path (as dont_try requires)
	 */
	if (vm_initiated == TRUE) {
		lck_mtx_unlock(&wbp->cl_lockw);
	}

	for (cl_index = 0; cl_index < cl_len; cl_index++) {
		int flags;
		struct cl_extent cl;
		int retval;

		flags = io_flags & (IO_PASSIVE | IO_CLOSE);

		/*
		 * try to push each cluster in turn...
		 */
		if (l_clusters[cl_index].io_flags & CLW_IONOCACHE) {
			flags |= IO_NOCACHE;
		}

		if (l_clusters[cl_index].io_flags & CLW_IOPASSIVE) {
			flags |= IO_PASSIVE;
		}

		if (push_flag & PUSH_SYNC) {
			flags |= IO_SYNC;
		}

		cl.b_addr = l_clusters[cl_index].b_addr;
		cl.e_addr = l_clusters[cl_index].e_addr;

		retval = cluster_push_now(vp, &cl, EOF, flags, callback, callback_arg, vm_initiated);

		if (retval == 0) {
			cl_pushed++;

			/* mark the local slot consumed so it isn't merged back below */
			l_clusters[cl_index].b_addr = 0;
			l_clusters[cl_index].e_addr = 0;
		} else if (error == 0) {
			error = retval;
		}

		if (!(push_flag & PUSH_ALL)) {
			break;
		}
	}
	if (vm_initiated == TRUE) {
		lck_mtx_lock(&wbp->cl_lockw);
	}

	/*
	 * NOTE(review): the goto dont_try path skips this store, so *err is
	 * left unmodified when the sequential-write check bails out — callers
	 * appear to pre-initialize their local_err; confirm that's relied upon
	 */
	if (err) {
		*err = error;
	}

dont_try:
	if (cl_len > cl_pushed) {
		/*
		 * we didn't push all of the clusters, so
		 * lets try to merge them back in to the vnode
		 */
		if ((MAX_CLUSTERS - wbp->cl_number) < (cl_len - cl_pushed)) {
			/*
			 * we picked up some new clusters while we were trying to
			 * push the old ones... this can happen because I've dropped
			 * the vnode lock... the sum of the
			 * leftovers plus the new cluster count exceeds our ability
			 * to represent them, so switch to the sparse cluster mechanism
			 *
			 * collect the active public clusters...
			 */
			sparse_cluster_switch(wbp, vp, EOF, callback, callback_arg, vm_initiated);

			for (cl_index = 0, cl_index1 = 0; cl_index < cl_len; cl_index++) {
				if (l_clusters[cl_index].b_addr == l_clusters[cl_index].e_addr) {
					continue;
				}
				wbp->cl_clusters[cl_index1].b_addr = l_clusters[cl_index].b_addr;
				wbp->cl_clusters[cl_index1].e_addr = l_clusters[cl_index].e_addr;
				wbp->cl_clusters[cl_index1].io_flags = l_clusters[cl_index].io_flags;

				cl_index1++;
			}
			/*
			 * update the cluster count
			 */
			wbp->cl_number = cl_index1;

			/*
			 * and collect the original clusters that were moved into the
			 * local storage for sorting purposes
			 */
			sparse_cluster_switch(wbp, vp, EOF, callback, callback_arg, vm_initiated);
		} else {
			/*
			 * we've got room to merge the leftovers back in
			 * just append them starting at the next 'hole'
			 * represented by wbp->cl_number
			 */
			for (cl_index = 0, cl_index1 = wbp->cl_number; cl_index < cl_len; cl_index++) {
				if (l_clusters[cl_index].b_addr == l_clusters[cl_index].e_addr) {
					continue;
				}

				wbp->cl_clusters[cl_index1].b_addr = l_clusters[cl_index].b_addr;
				wbp->cl_clusters[cl_index1].e_addr = l_clusters[cl_index].e_addr;
				wbp->cl_clusters[cl_index1].io_flags = l_clusters[cl_index].io_flags;

				cl_index1++;
			}
			/*
			 * update the cluster count
			 */
			wbp->cl_number = cl_index1;
		}
	}
	return MAX_CLUSTERS - wbp->cl_number;
}
6233
6234
6235
/*
 * Clean (write back) the dirty pages covering the page-extent 'cl' of
 * vnode 'vp'.  When vm_initiated is TRUE the work is delegated to
 * vnode_pageout(); otherwise a UPL of only-dirty pages is gathered and
 * written out in runs of contiguous dirty pages via cluster_io().
 *
 * Returns 0 on success or the first I/O error encountered.
 */
static int
cluster_push_now(vnode_t vp, struct cl_extent *cl, off_t EOF, int flags,
    int (*callback)(buf_t, void *), void *callback_arg, boolean_t vm_initiated)
{
	upl_page_info_t *pl;
	upl_t upl;
	vm_offset_t upl_offset;
	int upl_size;
	off_t upl_f_offset;
	int pages_in_upl;
	int start_pg;
	int last_pg;
	int io_size;
	int io_flags;
	int upl_flags;
	int bflag;
	int size;
	int error = 0;
	int retval;
	kern_return_t kret;

	if (flags & IO_PASSIVE) {
		bflag = CL_PASSIVE;
	} else {
		bflag = 0;
	}

	if (flags & IO_SKIP_ENCRYPTION) {
		bflag |= CL_ENCRYPTED;
	}

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_START,
	    (int)cl->b_addr, (int)cl->e_addr, (int)EOF, flags, 0);

	/* empty extent... nothing to clean */
	if ((pages_in_upl = (int)(cl->e_addr - cl->b_addr)) == 0) {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 0, 0, 0, 0);

		return 0;
	}
	upl_size = pages_in_upl * PAGE_SIZE;
	upl_f_offset = (off_t)(cl->b_addr * PAGE_SIZE_64);

	if (upl_f_offset + upl_size >= EOF) {
		if (upl_f_offset >= EOF) {
			/*
			 * must have truncated the file and missed
			 * clearing a dangling cluster (i.e. it's completely
			 * beyond the new EOF
			 */
			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 1, 0, 0, 0);

			return 0;
		}
		/* clip the extent to EOF, re-rounding the UPL to whole pages */
		size = (int)(EOF - upl_f_offset);

		upl_size = (size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
		pages_in_upl = upl_size / PAGE_SIZE;
	} else {
		size = upl_size;
	}


	if (vm_initiated) {
		/* VM-driven msync path: let the pager clean the range */
		vnode_pageout(vp, NULL, (upl_offset_t)0, upl_f_offset, (upl_size_t)upl_size,
		    UPL_MSYNC | UPL_VNODE_PAGER | UPL_KEEPCACHED, &error);

		return error;
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_START, upl_size, size, 0, 0, 0);

	/*
	 * by asking for UPL_COPYOUT_FROM and UPL_RET_ONLY_DIRTY, we get the following desirable behavior
	 *
	 * - only pages that are currently dirty are returned... these are the ones we need to clean
	 * - the hardware dirty bit is cleared when the page is gathered into the UPL... the software dirty bit is set
	 * - if we have to abort the I/O for some reason, the software dirty bit is left set since we didn't clean the page
	 * - when we commit the page, the software dirty bit is cleared... the hardware dirty bit is untouched so that if
	 *   someone dirties this page while the I/O is in progress, we don't lose track of the new state
	 *
	 * when the I/O completes, we no longer ask for an explicit clear of the DIRTY state (either soft or hard)
	 */

	if ((vp->v_flag & VNOCACHE_DATA) || (flags & IO_NOCACHE)) {
		upl_flags = UPL_COPYOUT_FROM | UPL_RET_ONLY_DIRTY | UPL_SET_LITE | UPL_WILL_BE_DUMPED;
	} else {
		upl_flags = UPL_COPYOUT_FROM | UPL_RET_ONLY_DIRTY | UPL_SET_LITE;
	}

	kret = ubc_create_upl_kernel(vp,
	    upl_f_offset,
	    upl_size,
	    &upl,
	    &pl,
	    upl_flags,
	    VM_KERN_MEMORY_FILE);
	if (kret != KERN_SUCCESS) {
		panic("cluster_push: failed to get pagelist");
	}

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_END, upl, upl_f_offset, 0, 0, 0);

	/*
	 * since we only asked for the dirty pages back
	 * it's possible that we may only get a few or even none, so...
	 * before we start marching forward, we must make sure we know
	 * where the last present page is in the UPL, otherwise we could
	 * end up working with a freed upl due to the FREE_ON_EMPTY semantics
	 * employed by commit_range and abort_range.
	 */
	for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--) {
		if (upl_page_present(pl, last_pg)) {
			break;
		}
	}
	pages_in_upl = last_pg + 1;

	if (pages_in_upl == 0) {
		ubc_upl_abort(upl, 0);

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 2, 0, 0, 0);
		return 0;
	}

	for (last_pg = 0; last_pg < pages_in_upl;) {
		/*
		 * find the next dirty page in the UPL
		 * this will become the first page in the
		 * next I/O to generate
		 */
		for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
			if (upl_dirty_page(pl, start_pg)) {
				break;
			}
			if (upl_page_present(pl, start_pg)) {
				/*
				 * RET_ONLY_DIRTY will return non-dirty 'precious' pages
				 * just release these unchanged since we're not going
				 * to steal them or change their state
				 */
				ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
			}
		}
		if (start_pg >= pages_in_upl) {
			/*
			 * done... no more dirty pages to push
			 */
			break;
		}
		if (start_pg > last_pg) {
			/*
			 * skipped over some non-dirty pages
			 */
			size -= ((start_pg - last_pg) * PAGE_SIZE);
		}

		/*
		 * find a range of dirty pages to write
		 */
		for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
			if (!upl_dirty_page(pl, last_pg)) {
				break;
			}
		}
		upl_offset = start_pg * PAGE_SIZE;

		/* 'size' caps the final run at EOF rather than the page boundary */
		io_size = min(size, (last_pg - start_pg) * PAGE_SIZE);

		io_flags = CL_THROTTLE | CL_COMMIT | CL_AGE | bflag;

		if (!(flags & IO_SYNC)) {
			io_flags |= CL_ASYNC;
		}

		if (flags & IO_CLOSE) {
			io_flags |= CL_CLOSE;
		}

		if (flags & IO_NOCACHE) {
			io_flags |= CL_NOCACHE;
		}

		retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size,
		    io_flags, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);

		/* remember the first error but keep pushing the remaining runs */
		if (error == 0 && retval) {
			error = retval;
		}

		size -= io_size;
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 3, error, 0, 0);

	return error;
}
6430
6431
6432 /*
6433 * sparse_cluster_switch is called with the write behind lock held
6434 */
static int
sparse_cluster_switch(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int (*callback)(buf_t, void *), void *callback_arg, boolean_t vm_initiated)
{
	int cl_index;
	int error = 0;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_START, kdebug_vnode(vp), wbp->cl_scmap, wbp->cl_number, 0, 0);

	/*
	 * walk every page of every cluster currently tracked in the
	 * write-behind context; any page still dirty is recorded in the
	 * sparse cluster map (wbp->cl_scmap) one page at a time
	 */
	for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
		int flags;
		struct cl_extent cl;

		for (cl.b_addr = wbp->cl_clusters[cl_index].b_addr; cl.b_addr < wbp->cl_clusters[cl_index].e_addr; cl.b_addr++) {
			if (ubc_page_op(vp, (off_t)(cl.b_addr * PAGE_SIZE_64), 0, NULL, &flags) == KERN_SUCCESS) {
				if (flags & UPL_POP_DIRTY) {
					cl.e_addr = cl.b_addr + 1;

					error = sparse_cluster_add(wbp, &(wbp->cl_scmap), vp, &cl, EOF, callback, callback_arg, vm_initiated);

					if (error) {
						/*
						 * NOTE(review): this break only exits the inner
						 * per-page loop; the outer loop continues with the
						 * next cluster and a later successful add resets
						 * 'error' to 0 — confirm that losing the earlier
						 * error here is the intended policy
						 */
						break;
					}
				}
			}
		}
	}
	/*
	 * the outer loop always runs to completion, so cl_index equals the
	 * original cl_number here and the context ends up with no clusters
	 */
	wbp->cl_number -= cl_index;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_END, kdebug_vnode(vp), wbp->cl_scmap, wbp->cl_number, error, 0);

	return error;
}
6467
6468
6469 /*
6470 * sparse_cluster_push must be called with the write-behind lock held if the scmap is
6471 * still associated with the write-behind context... however, if the scmap has been disassociated
6472 * from the write-behind context (the cluster_push case), the wb lock is not held
6473 */
static int
sparse_cluster_push(struct cl_writebehind *wbp, void **scmap, vnode_t vp, off_t EOF, int push_flag,
    int io_flags, int (*callback)(buf_t, void *), void *callback_arg, boolean_t vm_initiated)
{
	struct cl_extent cl;
	off_t offset;
	u_int length;
	void *l_scmap;
	int error = 0;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_START, kdebug_vnode(vp), (*scmap), 0, push_flag, 0);

	/* PUSH_ALL: tell the dirty-region tracker to return everything it has */
	if (push_flag & PUSH_ALL) {
		vfs_drt_control(scmap, 1);
	}

	/* remember the map we started with so we can detect replacement below */
	l_scmap = *scmap;

	for (;;) {
		int retval;

		if (vfs_drt_get_cluster(scmap, &offset, &length) != KERN_SUCCESS) {
			/*
			 * Not finding anything to push will return KERN_FAILURE.
			 * Confusing since it isn't really a failure. But that's the
			 * reason we don't set 'error' here like we do below.
			 */
			break;
		}

		/*
		 * in the vm_initiated case the write-behind lock is held on
		 * entry and must be dropped around the actual I/O
		 */
		if (vm_initiated == TRUE) {
			lck_mtx_unlock(&wbp->cl_lockw);
		}

		cl.b_addr = (daddr64_t)(offset / PAGE_SIZE_64);
		cl.e_addr = (daddr64_t)((offset + length) / PAGE_SIZE_64);

		retval = cluster_push_now(vp, &cl, EOF, io_flags, callback, callback_arg, vm_initiated);
		if (error == 0 && retval) {
			error = retval;
		}

		if (vm_initiated == TRUE) {
			lck_mtx_lock(&wbp->cl_lockw);

			/*
			 * while the lock was dropped the scmap may have been
			 * swapped out from under us... if so, stop touching it
			 */
			if (*scmap != l_scmap) {
				break;
			}
		}

		if (error) {
			/* put the unwritten pages back in the map so they aren't lost */
			if (vfs_drt_mark_pages(scmap, offset, length, NULL) != KERN_SUCCESS) {
				panic("Failed to restore dirty state on failure");
			}

			break;
		}

		if (!(push_flag & PUSH_ALL)) {
			break;
		}
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_END, kdebug_vnode(vp), (*scmap), error, 0, 0);

	return error;
}
6540
6541
6542 /*
6543 * sparse_cluster_add is called with the write behind lock held
6544 */
static int
sparse_cluster_add(struct cl_writebehind *wbp, void **scmap, vnode_t vp, struct cl_extent *cl, off_t EOF,
    int (*callback)(buf_t, void *), void *callback_arg, boolean_t vm_initiated)
{
	u_int new_dirty;
	u_int length;
	off_t offset;
	int error = 0;
	int push_flag = 0; /* fallback push behavior; overridden below if the scmap requests one */

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_START, (*scmap), 0, cl->b_addr, (int)cl->e_addr, 0);

	/* convert the page extent into a byte offset/length for the tracker */
	offset = (off_t)(cl->b_addr * PAGE_SIZE_64);
	length = ((u_int)(cl->e_addr - cl->b_addr)) * PAGE_SIZE;

	while (vfs_drt_mark_pages(scmap, offset, length, &new_dirty) != KERN_SUCCESS) {
		/*
		 * no room left in the map
		 * only a partial update was done
		 * push out some pages and try again
		 */

		if (vfs_get_scmap_push_behavior_internal(scmap, &push_flag)) {
			push_flag = 0;
		}

		error = sparse_cluster_push(wbp, scmap, vp, EOF, push_flag, 0, callback, callback_arg, vm_initiated);

		if (error) {
			break;
		}

		/* new_dirty pages were recorded before the map filled; skip past them */
		offset += (new_dirty * PAGE_SIZE_64);
		length -= (new_dirty * PAGE_SIZE);
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_END, kdebug_vnode(vp), (*scmap), error, 0, 0);

	return error;
}
6584
6585
/*
 * Transfer 'xsize' bytes between a physically-addressed user buffer
 * (usr_paddr) and the file, for a range that is not page-aligned: the
 * partial page is staged through a single-page UPL of the buffer cache.
 * For writes (and for reads that hit an already-dirty cached page) the
 * page is synchronously written back.  Advances the uio on success.
 */
static int
cluster_align_phys_io(vnode_t vp, struct uio *uio, addr64_t usr_paddr, u_int32_t xsize, int flags, int (*callback)(buf_t, void *), void *callback_arg)
{
	upl_page_info_t *pl;
	upl_t upl;
	addr64_t ubc_paddr;
	kern_return_t kret;
	int error = 0;
	int did_read = 0;
	int abort_flags;
	int upl_flags;
	int bflag;

	if (flags & IO_PASSIVE) {
		bflag = CL_PASSIVE;
	} else {
		bflag = 0;
	}

	if (flags & IO_NOCACHE) {
		bflag |= CL_NOCACHE;
	}

	upl_flags = UPL_SET_LITE;

	if (!(flags & CL_READ)) {
		/*
		 * "write" operation: let the UPL subsystem know
		 * that we intend to modify the buffer cache pages
		 * we're gathering.
		 */
		upl_flags |= UPL_WILL_MODIFY;
	} else {
		/*
		 * indicate that there is no need to pull the
		 * mapping for this page... we're only going
		 * to read from it, not modify it.
		 */
		upl_flags |= UPL_FILE_IO;
	}
	kret = ubc_create_upl_kernel(vp,
	    uio->uio_offset & ~PAGE_MASK_64,
	    PAGE_SIZE,
	    &upl,
	    &pl,
	    upl_flags,
	    VM_KERN_MEMORY_FILE);

	if (kret != KERN_SUCCESS) {
		return EINVAL;
	}

	if (!upl_valid_page(pl, 0)) {
		/*
		 * page not resident... issue a synchronous read to
		 * cluster_io to populate it before we copy into/out of it
		 */
		error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE,
		    CL_READ | bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
		if (error) {
			ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);

			return error;
		}
		did_read = 1;
	}
	ubc_paddr = ((addr64_t)upl_phys_page(pl, 0) << PAGE_SHIFT) + (addr64_t)(uio->uio_offset & PAGE_MASK_64);

	/*
	 * NOTE: There is no prototype for the following in BSD. It, and the definitions
	 * of the defines for cppvPsrc, cppvPsnk, cppvFsnk, and cppvFsrc will be found in
	 * osfmk/ppc/mappings.h. They are not included here because there appears to be no
	 * way to do so without exporting them to kexts as well.
	 */
	if (flags & CL_READ) {
//		copypv(ubc_paddr, usr_paddr, xsize, cppvPsrc | cppvPsnk | cppvFsnk);	/* Copy physical to physical and flush the destination */
		copypv(ubc_paddr, usr_paddr, xsize, 2 | 1 | 4); /* Copy physical to physical and flush the destination */
	} else {
//		copypv(usr_paddr, ubc_paddr, xsize, cppvPsrc | cppvPsnk | cppvFsrc);	/* Copy physical to physical and flush the source */
		copypv(usr_paddr, ubc_paddr, xsize, 2 | 1 | 8); /* Copy physical to physical and flush the source */
	}
	if (!(flags & CL_READ) || (upl_valid_page(pl, 0) && upl_dirty_page(pl, 0))) {
		/*
		 * issue a synchronous write to cluster_io
		 */
		error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE,
		    bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
	}
	if (error == 0) {
		uio_update(uio, (user_size_t)xsize);
	}

	/*
	 * a page we had to read in is worth keeping cached; one we merely
	 * modified in place (write staging) is dumped from the cache
	 */
	if (did_read) {
		abort_flags = UPL_ABORT_FREE_ON_EMPTY;
	} else {
		abort_flags = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;
	}

	ubc_upl_abort_range(upl, 0, PAGE_SIZE, abort_flags);

	return error;
}
6687
6688 int
cluster_copy_upl_data(struct uio * uio,upl_t upl,int upl_offset,int * io_resid)6689 cluster_copy_upl_data(struct uio *uio, upl_t upl, int upl_offset, int *io_resid)
6690 {
6691 int pg_offset;
6692 int pg_index;
6693 int csize;
6694 int segflg;
6695 int retval = 0;
6696 int xsize;
6697 upl_page_info_t *pl;
6698 int dirty_count;
6699
6700 xsize = *io_resid;
6701
6702 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
6703 (int)uio->uio_offset, upl_offset, xsize, 0, 0);
6704
6705 segflg = uio->uio_segflg;
6706
6707 switch (segflg) {
6708 case UIO_USERSPACE32:
6709 case UIO_USERISPACE32:
6710 uio->uio_segflg = UIO_PHYS_USERSPACE32;
6711 break;
6712
6713 case UIO_USERSPACE:
6714 case UIO_USERISPACE:
6715 uio->uio_segflg = UIO_PHYS_USERSPACE;
6716 break;
6717
6718 case UIO_USERSPACE64:
6719 case UIO_USERISPACE64:
6720 uio->uio_segflg = UIO_PHYS_USERSPACE64;
6721 break;
6722
6723 case UIO_SYSSPACE:
6724 uio->uio_segflg = UIO_PHYS_SYSSPACE;
6725 break;
6726 }
6727 pl = ubc_upl_pageinfo(upl);
6728
6729 pg_index = upl_offset / PAGE_SIZE;
6730 pg_offset = upl_offset & PAGE_MASK;
6731 csize = min(PAGE_SIZE - pg_offset, xsize);
6732
6733 dirty_count = 0;
6734 while (xsize && retval == 0) {
6735 addr64_t paddr;
6736
6737 paddr = ((addr64_t)upl_phys_page(pl, pg_index) << PAGE_SHIFT) + pg_offset;
6738 if ((uio->uio_rw == UIO_WRITE) && (upl_dirty_page(pl, pg_index) == FALSE)) {
6739 dirty_count++;
6740 }
6741
6742 retval = uiomove64(paddr, csize, uio);
6743
6744 pg_index += 1;
6745 pg_offset = 0;
6746 xsize -= csize;
6747 csize = min(PAGE_SIZE, xsize);
6748 }
6749 *io_resid = xsize;
6750
6751 uio->uio_segflg = segflg;
6752
6753 task_update_logical_writes(current_task(), (dirty_count * PAGE_SIZE), TASK_WRITE_DEFERRED, upl_lookup_vnode(upl));
6754 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
6755 (int)uio->uio_offset, xsize, retval, segflg, 0);
6756
6757 return retval;
6758 }
6759
6760
6761 int
cluster_copy_ubc_data(vnode_t vp,struct uio * uio,int * io_resid,int mark_dirty)6762 cluster_copy_ubc_data(vnode_t vp, struct uio *uio, int *io_resid, int mark_dirty)
6763 {
6764 return cluster_copy_ubc_data_internal(vp, uio, io_resid, mark_dirty, 1);
6765 }
6766
6767
6768 static int
cluster_copy_ubc_data_internal(vnode_t vp,struct uio * uio,int * io_resid,int mark_dirty,int take_reference)6769 cluster_copy_ubc_data_internal(vnode_t vp, struct uio *uio, int *io_resid, int mark_dirty, int take_reference)
6770 {
6771 int segflg;
6772 int io_size;
6773 int xsize;
6774 int start_offset;
6775 int retval = 0;
6776 memory_object_control_t control;
6777
6778 io_size = *io_resid;
6779
6780 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
6781 (int)uio->uio_offset, io_size, mark_dirty, take_reference, 0);
6782
6783 control = ubc_getobject(vp, UBC_FLAGS_NONE);
6784
6785 if (control == MEMORY_OBJECT_CONTROL_NULL) {
6786 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
6787 (int)uio->uio_offset, io_size, retval, 3, 0);
6788
6789 return 0;
6790 }
6791 segflg = uio->uio_segflg;
6792
6793 switch (segflg) {
6794 case UIO_USERSPACE32:
6795 case UIO_USERISPACE32:
6796 uio->uio_segflg = UIO_PHYS_USERSPACE32;
6797 break;
6798
6799 case UIO_USERSPACE64:
6800 case UIO_USERISPACE64:
6801 uio->uio_segflg = UIO_PHYS_USERSPACE64;
6802 break;
6803
6804 case UIO_USERSPACE:
6805 case UIO_USERISPACE:
6806 uio->uio_segflg = UIO_PHYS_USERSPACE;
6807 break;
6808
6809 case UIO_SYSSPACE:
6810 uio->uio_segflg = UIO_PHYS_SYSSPACE;
6811 break;
6812 }
6813
6814 if ((io_size = *io_resid)) {
6815 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
6816 xsize = (int)uio_resid(uio);
6817
6818 retval = memory_object_control_uiomove(control, uio->uio_offset - start_offset, uio,
6819 start_offset, io_size, mark_dirty, take_reference);
6820 xsize -= uio_resid(uio);
6821 io_size -= xsize;
6822 }
6823 uio->uio_segflg = segflg;
6824 *io_resid = io_size;
6825
6826 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
6827 (int)uio->uio_offset, io_size, retval, 0x80000000 | segflg, 0);
6828
6829 return retval;
6830 }
6831
6832
6833 int
is_file_clean(vnode_t vp,off_t filesize)6834 is_file_clean(vnode_t vp, off_t filesize)
6835 {
6836 off_t f_offset;
6837 int flags;
6838 int total_dirty = 0;
6839
6840 for (f_offset = 0; f_offset < filesize; f_offset += PAGE_SIZE_64) {
6841 if (ubc_page_op(vp, f_offset, 0, NULL, &flags) == KERN_SUCCESS) {
6842 if (flags & UPL_POP_DIRTY) {
6843 total_dirty++;
6844 }
6845 }
6846 }
6847 if (total_dirty) {
6848 return EINVAL;
6849 }
6850
6851 return 0;
6852 }
6853
6854
6855
6856 /*
6857 * Dirty region tracking/clustering mechanism.
6858 *
6859 * This code (vfs_drt_*) provides a mechanism for tracking and clustering
6860 * dirty regions within a larger space (file). It is primarily intended to
6861 * support clustering in large files with many dirty areas.
6862 *
6863 * The implementation assumes that the dirty regions are pages.
6864 *
6865 * To represent dirty pages within the file, we store bit vectors in a
6866 * variable-size circular hash.
6867 */
6868
6869 /*
6870 * Bitvector size. This determines the number of pages we group in a
6871 * single hashtable entry. Each hashtable entry is aligned to this
6872 * size within the file.
6873 */
/* 256KiB of file span per hash entry (e.g. 64 pages at 4K page size) */
#define DRT_BITVECTOR_PAGES             ((1024 * 256) / PAGE_SIZE)

/*
 * File offset handling.
 *
 * DRT_ADDRESS_MASK is dependent on DRT_BITVECTOR_PAGES;
 * the correct formula is (~((DRT_BITVECTOR_PAGES * PAGE_SIZE) - 1))
 */
#define DRT_ADDRESS_MASK                (~((DRT_BITVECTOR_PAGES * PAGE_SIZE) - 1))
#define DRT_ALIGN_ADDRESS(addr)         ((addr) & DRT_ADDRESS_MASK)

/*
 * Hashtable address field handling.
 *
 * The low-order bits of the hashtable address are used to conserve
 * space.
 *
 * DRT_HASH_COUNT_MASK must be large enough to store the range
 * 0-DRT_BITVECTOR_PAGES inclusive, as well as have one value
 * to indicate that the bucket is actually unoccupied.
 */
#define DRT_HASH_GET_ADDRESS(scm, i)    ((scm)->scm_hashtable[(i)].dhe_control & DRT_ADDRESS_MASK)
#define DRT_HASH_SET_ADDRESS(scm, i, a) \
	do { \
		(scm)->scm_hashtable[(i)].dhe_control = \
		    ((scm)->scm_hashtable[(i)].dhe_control & ~DRT_ADDRESS_MASK) | DRT_ALIGN_ADDRESS(a); \
	} while (0)
/* low 9 bits of dhe_control hold the bucket's dirty-page count */
#define DRT_HASH_COUNT_MASK             0x1ff
#define DRT_HASH_GET_COUNT(scm, i)      ((scm)->scm_hashtable[(i)].dhe_control & DRT_HASH_COUNT_MASK)
#define DRT_HASH_SET_COUNT(scm, i, c) \
	do { \
		(scm)->scm_hashtable[(i)].dhe_control = \
		    ((scm)->scm_hashtable[(i)].dhe_control & ~DRT_HASH_COUNT_MASK) | ((c) & DRT_HASH_COUNT_MASK); \
	} while (0)
#define DRT_HASH_CLEAR(scm, i) \
	do { \
		(scm)->scm_hashtable[(i)].dhe_control = 0; \
	} while (0)
/* an all-ones count field is the "bucket unoccupied" sentinel */
#define DRT_HASH_VACATE(scm, i)         DRT_HASH_SET_COUNT((scm), (i), DRT_HASH_COUNT_MASK)
#define DRT_HASH_VACANT(scm, i)         (DRT_HASH_GET_COUNT((scm), (i)) == DRT_HASH_COUNT_MASK)
#define DRT_HASH_COPY(oscm, oi, scm, i) \
	do { \
		(scm)->scm_hashtable[(i)].dhe_control = (oscm)->scm_hashtable[(oi)].dhe_control; \
		DRT_BITVECTOR_COPY(oscm, oi, scm, i); \
	} while(0);
6919
6920
6921 #if !defined(XNU_TARGET_OS_OSX)
6922 /*
6923 * Hash table moduli.
6924 *
6925 * Since the hashtable entry's size is dependent on the size of
6926 * the bitvector, and since the hashtable size is constrained to
6927 * both being prime and fitting within the desired allocation
6928 * size, these values need to be manually determined.
6929 *
6930 * For DRT_BITVECTOR_SIZE = 64, the entry size is 16 bytes.
6931 *
6932 * The small hashtable allocation is 4096 bytes, so the modulus is 251.
6933 * The large hashtable allocation is 32768 bytes, so the modulus is 2039.
6934 * The xlarge hashtable allocation is 131072 bytes, so the modulus is 8179.
6935 */
6936
6937 #define DRT_HASH_SMALL_MODULUS 251
6938 #define DRT_HASH_LARGE_MODULUS 2039
6939 #define DRT_HASH_XLARGE_MODULUS 8179
6940
6941 /*
6942 * Physical memory required before the large hash modulus is permitted.
6943 *
 * On small memory systems, the large hash modulus can lead to physical
6945 * memory starvation, so we avoid using it there.
6946 */
6947 #define DRT_HASH_LARGE_MEMORY_REQUIRED (1024LL * 1024LL * 1024LL) /* 1GiB */
6948 #define DRT_HASH_XLARGE_MEMORY_REQUIRED (8 * 1024LL * 1024LL * 1024LL) /* 8GiB */
6949
6950 #define DRT_SMALL_ALLOCATION 4096 /* 80 bytes spare */
6951 #define DRT_LARGE_ALLOCATION 32768 /* 144 bytes spare */
6952 #define DRT_XLARGE_ALLOCATION 131072 /* 208 bytes spare */
6953
6954 #else /* XNU_TARGET_OS_OSX */
6955 /*
6956 * Hash table moduli.
6957 *
6958 * Since the hashtable entry's size is dependent on the size of
6959 * the bitvector, and since the hashtable size is constrained to
6960 * both being prime and fitting within the desired allocation
6961 * size, these values need to be manually determined.
6962 *
6963 * For DRT_BITVECTOR_SIZE = 64, the entry size is 16 bytes.
6964 *
6965 * The small hashtable allocation is 16384 bytes, so the modulus is 1019.
6966 * The large hashtable allocation is 131072 bytes, so the modulus is 8179.
6967 * The xlarge hashtable allocation is 524288 bytes, so the modulus is 32749.
6968 */
6969
6970 #define DRT_HASH_SMALL_MODULUS 1019
6971 #define DRT_HASH_LARGE_MODULUS 8179
6972 #define DRT_HASH_XLARGE_MODULUS 32749
6973
6974 /*
6975 * Physical memory required before the large hash modulus is permitted.
6976 *
6977 * On small memory systems, the large hash modulus can lead to phsyical
6978 * memory starvation, so we avoid using it there.
6979 */
6980 #define DRT_HASH_LARGE_MEMORY_REQUIRED (4 * 1024LL * 1024LL * 1024LL) /* 4GiB */
6981 #define DRT_HASH_XLARGE_MEMORY_REQUIRED (32 * 1024LL * 1024LL * 1024LL) /* 32GiB */
6982
6983 #define DRT_SMALL_ALLOCATION 16384 /* 80 bytes spare */
6984 #define DRT_LARGE_ALLOCATION 131072 /* 208 bytes spare */
6985 #define DRT_XLARGE_ALLOCATION 524288 /* 304 bytes spare */
6986
6987 #endif /* ! XNU_TARGET_OS_OSX */
6988
6989 /* *** nothing below here has secret dependencies on DRT_BITVECTOR_PAGES *** */
6990
6991 /*
6992 * Hashtable entry.
6993 */
struct vfs_drt_hashentry {
	u_int64_t dhe_control;          /* aligned file offset | page count (see DRT_HASH_* macros) */
	/*
	 * dhe_bitvector was declared as dhe_bitvector[DRT_BITVECTOR_PAGES / 32];
	 * DRT_BITVECTOR_PAGES is defined as ((1024 * 256) / PAGE_SIZE)
	 * Since PAGE_SIZE is only known at boot time,
	 * -define MAX_DRT_BITVECTOR_PAGES for smallest supported page size (4k)
	 * -declare dhe_bitvector array for largest possible length
	 *
	 * The expansion is fully parenthesized so the macro stays safe as an
	 * operand of any surrounding arithmetic (e.g. "x % MAX_DRT_BITVECTOR_PAGES").
	 */
#define MAX_DRT_BITVECTOR_PAGES ((1024 * 256) / (4 * 1024))
	u_int32_t dhe_bitvector[MAX_DRT_BITVECTOR_PAGES / 32];  /* one dirty bit per page */
};
7006
/*
 * Hashtable bitvector handling.
 *
 * Bitvector fields are 32 bits long.
 *
 * The mask constant is unsigned ("1U"): left-shifting a signed 1 into
 * the sign bit (bit % 32 == 31) is undefined behavior in C.
 */

#define DRT_HASH_SET_BIT(scm, i, bit) \
	(scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] |= (1U << ((bit) % 32))

#define DRT_HASH_CLEAR_BIT(scm, i, bit) \
	(scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] &= ~(1U << ((bit) % 32))

#define DRT_HASH_TEST_BIT(scm, i, bit) \
	((scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] & (1U << ((bit) % 32)))

/* Zero the entire (maximum-size) dirty bitvector of entry i. */
#define DRT_BITVECTOR_CLEAR(scm, i) \
	bzero(&(scm)->scm_hashtable[(i)].dhe_bitvector[0], (MAX_DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t))

/* Copy the full dirty bitvector from entry (oscm, oi) to entry (scm, i). */
#define DRT_BITVECTOR_COPY(oscm, oi, scm, i) \
	bcopy(&(oscm)->scm_hashtable[(oi)].dhe_bitvector[0], \
	    &(scm)->scm_hashtable[(i)].dhe_bitvector[0], \
	    (MAX_DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t))
7029
7030 /*
7031 * Dirty Region Tracking structure.
7032 *
7033 * The hashtable is allocated entirely inside the DRT structure.
7034 *
7035 * The hash is a simple circular prime modulus arrangement, the structure
7036 * is resized from small to large if it overflows.
7037 */
7038
7039 struct vfs_drt_clustermap {
7040 u_int32_t scm_magic; /* sanity/detection */
7041 #define DRT_SCM_MAGIC 0x12020003
7042 u_int32_t scm_modulus; /* current ring size */
7043 u_int32_t scm_buckets; /* number of occupied buckets */
7044 u_int32_t scm_lastclean; /* last entry we cleaned */
7045 u_int32_t scm_iskips; /* number of slot skips */
7046
7047 struct vfs_drt_hashentry scm_hashtable[0];
7048 };
7049
7050
/* map a bucket-aligned file offset to its home slot in the ring */
#define DRT_HASH(scm, addr) ((addr) % (scm)->scm_modulus)
/* advance one slot with wraparound (linear probing) */
#define DRT_HASH_NEXT(scm, addr) (((addr) + 1) % (scm)->scm_modulus)

/*
 * Debugging codes and arguments (kdebug trace points emitted by
 * vfs_drt_trace; the comments give the meaning of each argument).
 */
#define DRT_DEBUG_EMPTYFREE (FSDBG_CODE(DBG_FSRW, 82)) /* nil */
#define DRT_DEBUG_RETCLUSTER (FSDBG_CODE(DBG_FSRW, 83)) /* offset, length */
#define DRT_DEBUG_ALLOC (FSDBG_CODE(DBG_FSRW, 84)) /* copycount */
#define DRT_DEBUG_INSERT (FSDBG_CODE(DBG_FSRW, 85)) /* offset, iskip */
#define DRT_DEBUG_MARK (FSDBG_CODE(DBG_FSRW, 86)) /* offset, length,
                                                   * dirty */
/* arg1 on DBG_FUNC_END: 0, setcount */
/* 1 (clean, no map) */
/* 2 (map alloc fail) */
/* 3, resid (partial) */
#define DRT_DEBUG_6 (FSDBG_CODE(DBG_FSRW, 87))
#define DRT_DEBUG_SCMDATA (FSDBG_CODE(DBG_FSRW, 88)) /* modulus, buckets,
                                                      * lastclean, iskips */
7070
7071
7072 static kern_return_t vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp);
7073 static kern_return_t vfs_drt_free_map(struct vfs_drt_clustermap *cmap);
7074 static kern_return_t vfs_drt_search_index(struct vfs_drt_clustermap *cmap,
7075 u_int64_t offset, int *indexp);
7076 static kern_return_t vfs_drt_get_index(struct vfs_drt_clustermap **cmapp,
7077 u_int64_t offset,
7078 int *indexp,
7079 int recursed);
7080 static kern_return_t vfs_drt_do_mark_pages(
7081 void **cmapp,
7082 u_int64_t offset,
7083 u_int length,
7084 u_int *setcountp,
7085 int dirty);
7086 static void vfs_drt_trace(
7087 struct vfs_drt_clustermap *cmap,
7088 int code,
7089 int arg1,
7090 int arg2,
7091 int arg3,
7092 int arg4);
7093
7094
7095 /*
7096 * Allocate and initialise a sparse cluster map.
7097 *
7098 * Will allocate a new map, resize or compact an existing map.
7099 *
7100 * XXX we should probably have at least one intermediate map size,
7101 * as the 1:16 ratio seems a bit drastic.
7102 */
static kern_return_t
vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp)
{
	struct vfs_drt_clustermap *cmap = NULL, *ocmap = NULL;
	kern_return_t kret = KERN_SUCCESS;
	u_int64_t offset = 0;
	u_int32_t i = 0;
	int modulus_size = 0, map_size = 0, active_buckets = 0, index = 0, copycount = 0;

	/* the caller's storage may already hold an old map to grow/compact */
	ocmap = NULL;
	if (cmapp != NULL) {
		ocmap = *cmapp;
	}

	/*
	 * Decide on the size of the new map.
	 */
	if (ocmap == NULL) {
		/* first allocation: always start with the small table */
		modulus_size = DRT_HASH_SMALL_MODULUS;
		map_size = DRT_SMALL_ALLOCATION;
	} else {
		/* count the number of active buckets in the old map */
		active_buckets = 0;
		for (i = 0; i < ocmap->scm_modulus; i++) {
			if (!DRT_HASH_VACANT(ocmap, i) &&
			    (DRT_HASH_GET_COUNT(ocmap, i) != 0)) {
				active_buckets++;
			}
		}
		/*
		 * If we're currently using the small allocation, check to
		 * see whether we should grow to the large one.
		 */
		if (ocmap->scm_modulus == DRT_HASH_SMALL_MODULUS) {
			/*
			 * If the ring is nearly full and we are allowed to
			 * use the large modulus, upgrade.
			 */
			if ((active_buckets > (DRT_HASH_SMALL_MODULUS - 5)) &&
			    (max_mem >= DRT_HASH_LARGE_MEMORY_REQUIRED)) {
				modulus_size = DRT_HASH_LARGE_MODULUS;
				map_size = DRT_LARGE_ALLOCATION;
			} else {
				/* stay small; the reallocation just compacts */
				modulus_size = DRT_HASH_SMALL_MODULUS;
				map_size = DRT_SMALL_ALLOCATION;
			}
		} else if (ocmap->scm_modulus == DRT_HASH_LARGE_MODULUS) {
			if ((active_buckets > (DRT_HASH_LARGE_MODULUS - 5)) &&
			    (max_mem >= DRT_HASH_XLARGE_MEMORY_REQUIRED)) {
				modulus_size = DRT_HASH_XLARGE_MODULUS;
				map_size = DRT_XLARGE_ALLOCATION;
			} else {
				/*
				 * If the ring is completely full and we can't
				 * expand, there's nothing useful for us to do.
				 * Behave as though we had compacted into the new
				 * array and return.
				 */
				return KERN_SUCCESS;
			}
		} else {
			/* already using the xlarge modulus */
			modulus_size = DRT_HASH_XLARGE_MODULUS;
			map_size = DRT_XLARGE_ALLOCATION;

			/*
			 * If the ring is completely full, there's
			 * nothing useful for us to do.  Behave as
			 * though we had compacted into the new
			 * array and return.
			 */
			if (active_buckets >= DRT_HASH_XLARGE_MODULUS) {
				return KERN_SUCCESS;
			}
		}
	}

	/*
	 * Allocate and initialise the new map.
	 */

	kret = kmem_alloc(kernel_map, (vm_offset_t *)&cmap, map_size,
	    KMA_DATA, VM_KERN_MEMORY_FILE);
	if (kret != KERN_SUCCESS) {
		return kret;
	}
	cmap->scm_magic = DRT_SCM_MAGIC;
	cmap->scm_modulus = modulus_size;
	cmap->scm_buckets = 0;
	cmap->scm_lastclean = 0;
	cmap->scm_iskips = 0;
	/* every bucket starts out vacant with an empty bitvector */
	for (i = 0; i < cmap->scm_modulus; i++) {
		DRT_HASH_CLEAR(cmap, i);
		DRT_HASH_VACATE(cmap, i);
		DRT_BITVECTOR_CLEAR(cmap, i);
	}

	/*
	 * If there's an old map, re-hash entries from it into the new map.
	 */
	copycount = 0;
	if (ocmap != NULL) {
		for (i = 0; i < ocmap->scm_modulus; i++) {
			/* skip empty buckets */
			if (DRT_HASH_VACANT(ocmap, i) ||
			    (DRT_HASH_GET_COUNT(ocmap, i) == 0)) {
				continue;
			}
			/* get new index (recursed=1: the new map must not resize again) */
			offset = DRT_HASH_GET_ADDRESS(ocmap, i);
			kret = vfs_drt_get_index(&cmap, offset, &index, 1);
			if (kret != KERN_SUCCESS) {
				/* XXX need to bail out gracefully here */
				panic("vfs_drt: new cluster map mysteriously too small");
				index = 0;
			}
			/* copy */
			DRT_HASH_COPY(ocmap, i, cmap, index);
			copycount++;
		}
	}

	/* log what we've done */
	vfs_drt_trace(cmap, DRT_DEBUG_ALLOC, copycount, 0, 0, 0);

	/*
	 * It's important to ensure that *cmapp always points to
	 * a valid map, so we must overwrite it before freeing
	 * the old map.
	 */
	*cmapp = cmap;
	if (ocmap != NULL) {
		/* emit stats into trace buffer */
		vfs_drt_trace(ocmap, DRT_DEBUG_SCMDATA,
		    ocmap->scm_modulus,
		    ocmap->scm_buckets,
		    ocmap->scm_lastclean,
		    ocmap->scm_iskips);

		vfs_drt_free_map(ocmap);
	}
	return KERN_SUCCESS;
}
7246
7247
7248 /*
7249 * Free a sparse cluster map.
7250 */
7251 static kern_return_t
vfs_drt_free_map(struct vfs_drt_clustermap * cmap)7252 vfs_drt_free_map(struct vfs_drt_clustermap *cmap)
7253 {
7254 vm_size_t map_size = 0;
7255
7256 if (cmap->scm_modulus == DRT_HASH_SMALL_MODULUS) {
7257 map_size = DRT_SMALL_ALLOCATION;
7258 } else if (cmap->scm_modulus == DRT_HASH_LARGE_MODULUS) {
7259 map_size = DRT_LARGE_ALLOCATION;
7260 } else if (cmap->scm_modulus == DRT_HASH_XLARGE_MODULUS) {
7261 map_size = DRT_XLARGE_ALLOCATION;
7262 } else {
7263 panic("vfs_drt_free_map: Invalid modulus %d", cmap->scm_modulus);
7264 }
7265
7266 kmem_free(kernel_map, (vm_offset_t)cmap, map_size);
7267 return KERN_SUCCESS;
7268 }
7269
7270
7271 /*
7272 * Find the hashtable slot currently occupied by an entry for the supplied offset.
7273 */
7274 static kern_return_t
vfs_drt_search_index(struct vfs_drt_clustermap * cmap,u_int64_t offset,int * indexp)7275 vfs_drt_search_index(struct vfs_drt_clustermap *cmap, u_int64_t offset, int *indexp)
7276 {
7277 int index;
7278 u_int32_t i;
7279
7280 offset = DRT_ALIGN_ADDRESS(offset);
7281 index = DRT_HASH(cmap, offset);
7282
7283 /* traverse the hashtable */
7284 for (i = 0; i < cmap->scm_modulus; i++) {
7285 /*
7286 * If the slot is vacant, we can stop.
7287 */
7288 if (DRT_HASH_VACANT(cmap, index)) {
7289 break;
7290 }
7291
7292 /*
7293 * If the address matches our offset, we have success.
7294 */
7295 if (DRT_HASH_GET_ADDRESS(cmap, index) == offset) {
7296 *indexp = index;
7297 return KERN_SUCCESS;
7298 }
7299
7300 /*
7301 * Move to the next slot, try again.
7302 */
7303 index = DRT_HASH_NEXT(cmap, index);
7304 }
7305 /*
7306 * It's not there.
7307 */
7308 return KERN_FAILURE;
7309 }
7310
7311 /*
7312 * Find the hashtable slot for the supplied offset. If we haven't allocated
7313 * one yet, allocate one and populate the address field. Note that it will
7314 * not have a nonzero page count and thus will still technically be free, so
7315 * in the case where we are called to clean pages, the slot will remain free.
7316 */
static kern_return_t
vfs_drt_get_index(struct vfs_drt_clustermap **cmapp, u_int64_t offset, int *indexp, int recursed)
{
	struct vfs_drt_clustermap *cmap;
	kern_return_t kret;
	u_int32_t index;
	u_int32_t i;

	cmap = *cmapp;

	/* look for an existing entry */
	kret = vfs_drt_search_index(cmap, offset, indexp);
	if (kret == KERN_SUCCESS) {
		return kret;
	}

	/* need to allocate an entry */
	offset = DRT_ALIGN_ADDRESS(offset);
	index = DRT_HASH(cmap, offset);

	/* scan from the index forwards looking for a vacant slot */
	for (i = 0; i < cmap->scm_modulus; i++) {
		/*
		 * A slot is reusable if it was never occupied (vacant) or if
		 * its entry has no dirty pages left (count == 0).
		 */
		if (DRT_HASH_VACANT(cmap, index) || DRT_HASH_GET_COUNT(cmap, index) == 0) {
			cmap->scm_buckets++;
			/* pull the clean-scan position back behind the new entry */
			if (index < cmap->scm_lastclean) {
				cmap->scm_lastclean = index;
			}
			/* claim the slot: set key, zero count and bitvector */
			DRT_HASH_SET_ADDRESS(cmap, index, offset);
			DRT_HASH_SET_COUNT(cmap, index, 0);
			DRT_BITVECTOR_CLEAR(cmap, index);
			*indexp = index;
			vfs_drt_trace(cmap, DRT_DEBUG_INSERT, (int)offset, i, 0, 0);
			return KERN_SUCCESS;
		}
		/* record probe-chain skips (statistics only) */
		cmap->scm_iskips += i;
		index = DRT_HASH_NEXT(cmap, index);
	}

	/*
	 * We haven't found a vacant slot, so the map is full.  If we're not
	 * already recursed, try reallocating/compacting it.
	 */
	if (recursed) {
		return KERN_FAILURE;
	}
	kret = vfs_drt_alloc_map(cmapp);
	if (kret == KERN_SUCCESS) {
		/* now try to insert again; recursed=1 prevents a second resize */
		kret = vfs_drt_get_index(cmapp, offset, indexp, 1);
	}
	return kret;
}
7370
7371 /*
7372 * Implementation of set dirty/clean.
7373 *
7374 * In the 'clean' case, not finding a map is OK.
7375 */
static kern_return_t
vfs_drt_do_mark_pages(
	void **private,
	u_int64_t offset,
	u_int length,
	u_int *setcountp,
	int dirty)
{
	struct vfs_drt_clustermap *cmap, **cmapp;
	kern_return_t kret;
	int i, index, pgoff, pgcount, setcount, ecount;

	cmapp = (struct vfs_drt_clustermap **)private;
	cmap = *cmapp;

	vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_START, (int)offset, (int)length, dirty, 0);

	if (setcountp != NULL) {
		*setcountp = 0;
	}

	/* allocate a cluster map if we don't already have one */
	if (cmap == NULL) {
		/* no cluster map, nothing to clean */
		if (!dirty) {
			vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 1, 0, 0, 0);
			return KERN_SUCCESS;
		}
		kret = vfs_drt_alloc_map(cmapp);
		if (kret != KERN_SUCCESS) {
			vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 2, 0, 0, 0);
			return kret;
		}
	}
	/* setcount tracks pages whose dirty state actually changed */
	setcount = 0;

	/*
	 * Iterate over the length of the region, one hashtable entry
	 * (DRT_BITVECTOR_PAGES pages) at a time.
	 */
	while (length > 0) {
		/*
		 * Get the hashtable index for this offset.
		 *
		 * XXX this will add blank entries if we are clearing a range
		 * that hasn't been dirtied.
		 */
		kret = vfs_drt_get_index(cmapp, offset, &index, 0);
		cmap = *cmapp; /* may have changed! */
		/* this may be a partial-success return */
		if (kret != KERN_SUCCESS) {
			/* report how many pages we did manage to flip */
			if (setcountp != NULL) {
				*setcountp = setcount;
			}
			vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 3, (int)length, 0, 0);

			return kret;
		}

		/*
		 * Work out how many pages we're modifying in this
		 * hashtable entry.
		 */
		pgoff = (int)((offset - DRT_ALIGN_ADDRESS(offset)) / PAGE_SIZE);
		pgcount = min((length / PAGE_SIZE), (DRT_BITVECTOR_PAGES - pgoff));

		/*
		 * Iterate over pages, dirty/clearing as we go.  ecount caches
		 * the entry's summary count and is written back once.
		 */
		ecount = DRT_HASH_GET_COUNT(cmap, index);
		for (i = 0; i < pgcount; i++) {
			if (dirty) {
				if (!DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) {
					if (ecount >= DRT_BITVECTOR_PAGES) {
						panic("ecount >= DRT_BITVECTOR_PAGES, cmap = %p, index = %d, bit = %d", cmap, index, pgoff + i);
					}
					DRT_HASH_SET_BIT(cmap, index, pgoff + i);
					ecount++;
					setcount++;
				}
			} else {
				if (DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) {
					if (ecount <= 0) {
						panic("ecount <= 0, cmap = %p, index = %d, bit = %d", cmap, index, pgoff + i);
					}
					assert(ecount > 0);
					DRT_HASH_CLEAR_BIT(cmap, index, pgoff + i);
					ecount--;
					setcount++;
				}
			}
		}
		/* write the updated summary count back into the entry */
		DRT_HASH_SET_COUNT(cmap, index, ecount);

		offset += pgcount * PAGE_SIZE;
		length -= pgcount * PAGE_SIZE;
	}
	if (setcountp != NULL) {
		*setcountp = setcount;
	}

	vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 0, setcount, 0, 0);

	return KERN_SUCCESS;
}
7480
7481 /*
7482 * Mark a set of pages as dirty/clean.
7483 *
7484 * This is a public interface.
7485 *
7486 * cmapp
7487 * Pointer to storage suitable for holding a pointer. Note that
7488 * this must either be NULL or a value set by this function.
7489 *
 *
7493 * offset
7494 * Offset of the first page to be marked as dirty, in bytes. Must be
7495 * page-aligned.
7496 *
7497 * length
7498 * Length of dirty region, in bytes. Must be a multiple of PAGE_SIZE.
7499 *
7500 * setcountp
7501 * Number of pages newly marked dirty by this call (optional).
7502 *
7503 * Returns KERN_SUCCESS if all the pages were successfully marked.
7504 */
static kern_return_t
vfs_drt_mark_pages(void **cmapp, off_t offset, u_int length, u_int *setcountp)
{
	/* thin wrapper: dirty == 1 selects the mark path of the common engine */
	return vfs_drt_do_mark_pages(cmapp, offset, length, setcountp, 1);
}
7511
#if 0
/*
 * Clear the dirty state of pages; compiled out, retained for reference.
 * The clean path is currently exercised only by vfs_drt_get_cluster,
 * which calls vfs_drt_do_mark_pages with dirty == 0 directly.
 */
static kern_return_t
vfs_drt_unmark_pages(void **cmapp, off_t offset, u_int length)
{
	return vfs_drt_do_mark_pages(cmapp, offset, length, NULL, 0);
}
#endif
7519
7520 /*
7521 * Get a cluster of dirty pages.
7522 *
7523 * This is a public interface.
7524 *
7525 * cmapp
7526 * Pointer to storage managed by drt_mark_pages. Note that this must
7527 * be NULL or a value set by drt_mark_pages.
7528 *
7529 * offsetp
7530 * Returns the byte offset into the file of the first page in the cluster.
7531 *
7532 * lengthp
7533 * Returns the length in bytes of the cluster of dirty pages.
7534 *
7535 * Returns success if a cluster was found. If KERN_FAILURE is returned, there
 * are no dirty pages meeting the minimum size criteria. Private storage will
7537 * be released if there are no more dirty pages left in the map
7538 *
7539 */
static kern_return_t
vfs_drt_get_cluster(void **cmapp, off_t *offsetp, u_int *lengthp)
{
	struct vfs_drt_clustermap *cmap;
	u_int64_t offset;
	u_int length;
	u_int32_t j;
	int index, i, fs, ls;

	/* sanity */
	if ((cmapp == NULL) || (*cmapp == NULL)) {
		return KERN_FAILURE;
	}
	cmap = *cmapp;

	/*
	 * Walk the hashtable: hashing successive bucket-aligned offsets
	 * (stride DRT_BITVECTOR_PAGES * PAGE_SIZE) visits each of the
	 * scm_modulus slots exactly once, since the moduli are prime and
	 * the stride is a power of two.
	 */
	for (offset = 0, j = 0; j < cmap->scm_modulus; offset += (DRT_BITVECTOR_PAGES * PAGE_SIZE), j++) {
		index = DRT_HASH(cmap, offset);

		/* skip buckets with no dirty pages */
		if (DRT_HASH_VACANT(cmap, index) || (DRT_HASH_GET_COUNT(cmap, index) == 0)) {
			continue;
		}

		/* scan the bitfield for a string of bits */
		fs = -1;

		/* fs = first dirty page in the bucket */
		for (i = 0; i < DRT_BITVECTOR_PAGES; i++) {
			if (DRT_HASH_TEST_BIT(cmap, index, i)) {
				fs = i;
				break;
			}
		}
		if (fs == -1) {
			/* didn't find any bits set */
			panic("vfs_drt: entry summary count > 0 but no bits set in map, cmap = %p, index = %d, count = %lld",
			    cmap, index, DRT_HASH_GET_COUNT(cmap, index));
		}
		/* ls = length of the run of consecutive dirty pages from fs */
		for (ls = 0; i < DRT_BITVECTOR_PAGES; i++, ls++) {
			if (!DRT_HASH_TEST_BIT(cmap, index, i)) {
				break;
			}
		}

		/* compute offset and length, mark pages clean */
		offset = DRT_HASH_GET_ADDRESS(cmap, index) + (PAGE_SIZE * fs);
		length = ls * PAGE_SIZE;
		vfs_drt_do_mark_pages(cmapp, offset, length, NULL, 0);
		cmap->scm_lastclean = index;

		/* return successful */
		*offsetp = (off_t)offset;
		*lengthp = length;

		vfs_drt_trace(cmap, DRT_DEBUG_RETCLUSTER, (int)offset, (int)length, 0, 0);
		return KERN_SUCCESS;
	}
	/*
	 * We didn't find anything... hashtable is empty
	 * emit stats into trace buffer and
	 * then free it
	 */
	vfs_drt_trace(cmap, DRT_DEBUG_SCMDATA,
	    cmap->scm_modulus,
	    cmap->scm_buckets,
	    cmap->scm_lastclean,
	    cmap->scm_iskips);

	vfs_drt_free_map(cmap);
	*cmapp = NULL;

	return KERN_FAILURE;
}
7612
7613
7614 static kern_return_t
vfs_drt_control(void ** cmapp,int op_type)7615 vfs_drt_control(void **cmapp, int op_type)
7616 {
7617 struct vfs_drt_clustermap *cmap;
7618
7619 /* sanity */
7620 if ((cmapp == NULL) || (*cmapp == NULL)) {
7621 return KERN_FAILURE;
7622 }
7623 cmap = *cmapp;
7624
7625 switch (op_type) {
7626 case 0:
7627 /* emit stats into trace buffer */
7628 vfs_drt_trace(cmap, DRT_DEBUG_SCMDATA,
7629 cmap->scm_modulus,
7630 cmap->scm_buckets,
7631 cmap->scm_lastclean,
7632 cmap->scm_iskips);
7633
7634 vfs_drt_free_map(cmap);
7635 *cmapp = NULL;
7636 break;
7637
7638 case 1:
7639 cmap->scm_lastclean = 0;
7640 break;
7641 }
7642 return KERN_SUCCESS;
7643 }
7644
7645
7646
7647 /*
7648 * Emit a summary of the state of the clustermap into the trace buffer
7649 * along with some caller-provided data.
7650 */
#if KDEBUG
static void
vfs_drt_trace(__unused struct vfs_drt_clustermap *cmap, int code, int arg1, int arg2, int arg3, int arg4)
{
	/* forward the event to the kernel trace facility (kdebug) */
	KERNEL_DEBUG(code, arg1, arg2, arg3, arg4, 0);
}
#else
/* tracing compiled out: accept and discard the arguments */
static void
vfs_drt_trace(__unused struct vfs_drt_clustermap *cmap, __unused int code,
    __unused int arg1, __unused int arg2, __unused int arg3,
    __unused int arg4)
{
}
#endif
7665
#if 0
/*
 * Perform basic sanity check on the hash entry summary count
 * vs. the actual bits set in the entry.  Debug aid; currently
 * compiled out.
 */
static void
vfs_drt_sanity(struct vfs_drt_clustermap *cmap)
{
	int index, i;
	int bits_on;

	for (index = 0; index < cmap->scm_modulus; index++) {
		if (DRT_HASH_VACANT(cmap, index)) {
			continue;
		}

		/* popcount the dirty bitvector for this entry */
		for (bits_on = 0, i = 0; i < DRT_BITVECTOR_PAGES; i++) {
			if (DRT_HASH_TEST_BIT(cmap, index, i)) {
				bits_on++;
			}
		}
		/* the cached summary count must equal the popcount */
		if (bits_on != DRT_HASH_GET_COUNT(cmap, index)) {
			panic("bits_on = %d, index = %d", bits_on, index);
		}
	}
}
#endif
7693
7694 /*
7695 * Internal interface only.
7696 */
7697 static kern_return_t
vfs_get_scmap_push_behavior_internal(void ** cmapp,int * push_flag)7698 vfs_get_scmap_push_behavior_internal(void **cmapp, int *push_flag)
7699 {
7700 struct vfs_drt_clustermap *cmap;
7701
7702 /* sanity */
7703 if ((cmapp == NULL) || (*cmapp == NULL) || (push_flag == NULL)) {
7704 return KERN_FAILURE;
7705 }
7706 cmap = *cmapp;
7707
7708 if (cmap->scm_modulus == DRT_HASH_XLARGE_MODULUS) {
7709 /*
7710 * If we have a full xlarge sparse cluster,
7711 * we push it out all at once so the cluster
7712 * map can be available to absorb more I/Os.
7713 * This is done on large memory configs so
7714 * the small I/Os don't interfere with the
7715 * pro workloads.
7716 */
7717 *push_flag = PUSH_ALL;
7718 }
7719 return KERN_SUCCESS;
7720 }
7721