1 /*
2 * Copyright (c) 2000-2020 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29 /*
30 * Copyright (c) 1993
31 * The Regents of the University of California. All rights reserved.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
41 * 3. All advertising materials mentioning features or use of this software
42 * must display the following acknowledgement:
43 * This product includes software developed by the University of
44 * California, Berkeley and its contributors.
45 * 4. Neither the name of the University nor the names of its contributors
46 * may be used to endorse or promote products derived from this software
47 * without specific prior written permission.
48 *
49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59 * SUCH DAMAGE.
60 *
61 * @(#)vfs_cluster.c 8.10 (Berkeley) 3/28/95
62 */
63
64 #include <sys/param.h>
65 #include <sys/proc_internal.h>
66 #include <sys/buf_internal.h>
67 #include <sys/mount_internal.h>
68 #include <sys/vnode_internal.h>
69 #include <sys/trace.h>
70 #include <kern/kalloc.h>
71 #include <sys/time.h>
72 #include <sys/kernel.h>
73 #include <sys/resourcevar.h>
74 #include <miscfs/specfs/specdev.h>
75 #include <sys/uio_internal.h>
76 #include <libkern/libkern.h>
77 #include <machine/machine_routines.h>
78
79 #include <sys/ubc_internal.h>
80 #include <vm/vnode_pager.h>
81
82 #include <mach/mach_types.h>
83 #include <mach/memory_object_types.h>
84 #include <mach/vm_map.h>
85 #include <mach/upl.h>
86 #include <kern/task.h>
87 #include <kern/policy_internal.h>
88
89 #include <vm/vm_kern.h>
90 #include <vm/vm_map.h>
91 #include <vm/vm_pageout.h>
92 #include <vm/vm_fault.h>
93
94 #include <sys/kdebug.h>
95 #include <sys/kdebug_triage.h>
96 #include <libkern/OSAtomic.h>
97
98 #include <sys/sdt.h>
99
100 #include <stdbool.h>
101
102 #include <vfs/vfs_disk_conditioner.h>
103
104 #if 0
105 #undef KERNEL_DEBUG
106 #define KERNEL_DEBUG KERNEL_DEBUG_CONSTANT
107 #endif
108
109
110 #define CL_READ 0x01
111 #define CL_WRITE 0x02
112 #define CL_ASYNC 0x04
113 #define CL_COMMIT 0x08
114 #define CL_PAGEOUT 0x10
115 #define CL_AGE 0x20
116 #define CL_NOZERO 0x40
117 #define CL_PAGEIN 0x80
118 #define CL_DEV_MEMORY 0x100
119 #define CL_PRESERVE 0x200
120 #define CL_THROTTLE 0x400
121 #define CL_KEEPCACHED 0x800
122 #define CL_DIRECT_IO 0x1000
123 #define CL_PASSIVE 0x2000
124 #define CL_IOSTREAMING 0x4000
125 #define CL_CLOSE 0x8000
126 #define CL_ENCRYPTED 0x10000
127 #define CL_RAW_ENCRYPTED 0x20000
128 #define CL_NOCACHE 0x40000
129
130 #define MAX_VECTOR_UPL_ELEMENTS 8
131 #define MAX_VECTOR_UPL_SIZE (2 * MAX_UPL_SIZE_BYTES)
132
133 #define CLUSTER_IO_WAITING ((buf_t)1)
134
135 extern upl_t vector_upl_create(vm_offset_t);
136 extern boolean_t vector_upl_is_valid(upl_t);
137 extern boolean_t vector_upl_set_subupl(upl_t, upl_t, u_int32_t);
138 extern void vector_upl_set_pagelist(upl_t);
139 extern void vector_upl_set_iostate(upl_t, upl_t, vm_offset_t, u_int32_t);
140
141 struct clios {
142 lck_mtx_t io_mtxp;
143 u_int io_completed; /* amount of io that has currently completed */
144 u_int io_issued; /* amount of io that was successfully issued */
145 int io_error; /* error code of first error encountered */
146 int io_wanted; /* someone is sleeping waiting for a change in state */
147 };
148
149 struct cl_direct_read_lock {
150 LIST_ENTRY(cl_direct_read_lock) chain;
151 int32_t ref_count;
152 vnode_t vp;
153 lck_rw_t rw_lock;
154 };
155
156 #define CL_DIRECT_READ_LOCK_BUCKETS 61
157
158 static LIST_HEAD(cl_direct_read_locks, cl_direct_read_lock)
159 cl_direct_read_locks[CL_DIRECT_READ_LOCK_BUCKETS];
160
161 static LCK_GRP_DECLARE(cl_mtx_grp, "cluster I/O");
162 static LCK_MTX_DECLARE(cl_transaction_mtxp, &cl_mtx_grp);
163 static LCK_SPIN_DECLARE(cl_direct_read_spin_lock, &cl_mtx_grp);
164
165 static ZONE_DEFINE(cl_rd_zone, "cluster_read",
166 sizeof(struct cl_readahead), ZC_ZFREE_CLEARMEM);
167
168 static ZONE_DEFINE(cl_wr_zone, "cluster_write",
169 sizeof(struct cl_writebehind), ZC_ZFREE_CLEARMEM);
170
171 #define IO_UNKNOWN 0
172 #define IO_DIRECT 1
173 #define IO_CONTIG 2
174 #define IO_COPY 3
175
176 #define PUSH_DELAY 0x01
177 #define PUSH_ALL 0x02
178 #define PUSH_SYNC 0x04
179
180
181 static void cluster_EOT(buf_t cbp_head, buf_t cbp_tail, int zero_offset, size_t verify_block_size);
182 static void cluster_wait_IO(buf_t cbp_head, int async);
183 static void cluster_complete_transaction(buf_t *cbp_head, void *callback_arg, int *retval, int flags, int needwait);
184
185 static int cluster_io_type(struct uio *uio, int *io_type, u_int32_t *io_length, u_int32_t min_length);
186
187 static int cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int non_rounded_size,
188 int flags, buf_t real_bp, struct clios *iostate, int (*)(buf_t, void *), void *callback_arg);
189 static int cluster_iodone(buf_t bp, void *callback_arg);
190 static int cluster_ioerror(upl_t upl, int upl_offset, int abort_size, int error, int io_flags, vnode_t vp);
191 static int cluster_is_throttled(vnode_t vp);
192
193 static void cluster_iostate_wait(struct clios *iostate, u_int target, const char *wait_name);
194
195 static void cluster_syncup(vnode_t vp, off_t newEOF, int (*)(buf_t, void *), void *callback_arg, int flags);
196
197 static void cluster_read_upl_release(upl_t upl, int start_pg, int last_pg, int take_reference);
198 static int cluster_copy_ubc_data_internal(vnode_t vp, struct uio *uio, int *io_resid, int mark_dirty, int take_reference);
199
200 static int cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t filesize, int flags,
201 int (*)(buf_t, void *), void *callback_arg) __attribute__((noinline));
202 static int cluster_read_direct(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t *read_length,
203 int flags, int (*)(buf_t, void *), void *callback_arg) __attribute__((noinline));
204 static int cluster_read_contig(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t *read_length,
205 int (*)(buf_t, void *), void *callback_arg, int flags) __attribute__((noinline));
206
207 static int cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t oldEOF, off_t newEOF,
208 off_t headOff, off_t tailOff, int flags, int (*)(buf_t, void *), void *callback_arg) __attribute__((noinline));
209 static int cluster_write_direct(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF,
210 int *write_type, u_int32_t *write_length, int flags, int (*)(buf_t, void *), void *callback_arg) __attribute__((noinline));
211 static int cluster_write_contig(vnode_t vp, struct uio *uio, off_t newEOF,
212 int *write_type, u_int32_t *write_length, int (*)(buf_t, void *), void *callback_arg, int bflag) __attribute__((noinline));
213
214 static void cluster_update_state_internal(vnode_t vp, struct cl_extent *cl, int flags, boolean_t defer_writes, boolean_t *first_pass,
215 off_t write_off, int write_cnt, off_t newEOF, int (*callback)(buf_t, void *), void *callback_arg, boolean_t vm_initiated);
216
217 static int cluster_align_phys_io(vnode_t vp, struct uio *uio, addr64_t usr_paddr, u_int32_t xsize, int flags, int (*)(buf_t, void *), void *callback_arg);
218
219 static int cluster_read_prefetch(vnode_t vp, off_t f_offset, u_int size, off_t filesize, int (*callback)(buf_t, void *), void *callback_arg, int bflag);
220 static void cluster_read_ahead(vnode_t vp, struct cl_extent *extent, off_t filesize, struct cl_readahead *ra,
221 int (*callback)(buf_t, void *), void *callback_arg, int bflag);
222
223 static int cluster_push_now(vnode_t vp, struct cl_extent *, off_t EOF, int flags, int (*)(buf_t, void *), void *callback_arg, boolean_t vm_ioitiated);
224
225 static int cluster_try_push(struct cl_writebehind *, vnode_t vp, off_t EOF, int push_flag, int flags, int (*)(buf_t, void *),
226 void *callback_arg, int *err, boolean_t vm_initiated);
227
228 static int sparse_cluster_switch(struct cl_writebehind *, vnode_t vp, off_t EOF, int (*)(buf_t, void *), void *callback_arg, boolean_t vm_initiated);
229 static int sparse_cluster_push(struct cl_writebehind *, void **cmapp, vnode_t vp, off_t EOF, int push_flag,
230 int io_flags, int (*)(buf_t, void *), void *callback_arg, boolean_t vm_initiated);
231 static int sparse_cluster_add(struct cl_writebehind *, void **cmapp, vnode_t vp, struct cl_extent *, off_t EOF,
232 int (*)(buf_t, void *), void *callback_arg, boolean_t vm_initiated);
233
234 static kern_return_t vfs_drt_mark_pages(void **cmapp, off_t offset, u_int length, u_int *setcountp);
235 static kern_return_t vfs_drt_get_cluster(void **cmapp, off_t *offsetp, u_int *lengthp);
236 static kern_return_t vfs_drt_control(void **cmapp, int op_type);
237 static kern_return_t vfs_get_scmap_push_behavior_internal(void **cmapp, int *push_flag);
238
239
240 /*
241 * For throttled IO to check whether
242 * a block is cached by the boot cache
243 * and thus it can avoid delaying the IO.
244 *
245 * bootcache_contains_block is initially
246 * NULL. The BootCache will set it while
247 * the cache is active and clear it when
248 * the cache is jettisoned.
249 *
250 * Returns 0 if the block is not
251 * contained in the cache, 1 if it is
252 * contained.
253 *
254 * The function pointer remains valid
255 * after the cache has been evicted even
256 * if bootcache_contains_block has been
257 * cleared.
258 *
259 * See rdar://9974130 The new throttling mechanism breaks the boot cache for throttled IOs
260 */
261 int (*bootcache_contains_block)(dev_t device, u_int64_t blkno) = NULL;
262
263
264 /*
265 * limit the internal I/O size so that we
266 * can represent it in a 32 bit int
267 */
268 #define MAX_IO_REQUEST_SIZE (1024 * 1024 * 512)
269 #define MAX_IO_CONTIG_SIZE MAX_UPL_SIZE_BYTES
270 #define MAX_VECTS 16
271 /*
272 * The MIN_DIRECT_WRITE_SIZE governs how much I/O should be issued before we consider
273 * allowing the caller to bypass the buffer cache. For small I/Os (less than 16k),
274 * we have not historically allowed the write to bypass the UBC.
275 */
276 #define MIN_DIRECT_WRITE_SIZE (16384)
277
278 #define WRITE_THROTTLE 6
279 #define WRITE_THROTTLE_SSD 2
280 #define WRITE_BEHIND 1
281 #define WRITE_BEHIND_SSD 1
282
283 #if !defined(XNU_TARGET_OS_OSX)
284 #define PREFETCH 1
285 #define PREFETCH_SSD 1
uint32_t speculative_prefetch_max = (2048 * 1024);               /* maximum bytes in a speculative read-ahead */
uint32_t speculative_prefetch_max_iosize = (512 * 1024);         /* maximum I/O size to use in a speculative read-ahead */
288 #else /* XNU_TARGET_OS_OSX */
289 #define PREFETCH 3
290 #define PREFETCH_SSD 2
uint32_t speculative_prefetch_max = (MAX_UPL_SIZE_BYTES * 3);    /* maximum bytes in a speculative read-ahead */
uint32_t speculative_prefetch_max_iosize = (512 * 1024);         /* maximum I/O size to use in a speculative read-ahead on SSDs*/
293 #endif /* ! XNU_TARGET_OS_OSX */
294
295
296 #define IO_SCALE(vp, base) (vp->v_mount->mnt_ioscale * (base))
297 #define MAX_CLUSTER_SIZE(vp) (cluster_max_io_size(vp->v_mount, CL_WRITE))
298 #define MAX_PREFETCH(vp, size, is_ssd) (size * IO_SCALE(vp, ((is_ssd) ? PREFETCH_SSD : PREFETCH)))
299
300 int speculative_reads_disabled = 0;
301
302 /*
303 * throttle the number of async writes that
304 * can be outstanding on a single vnode
305 * before we issue a synchronous write
306 */
307 #define THROTTLE_MAXCNT 0
308
309 uint32_t throttle_max_iosize = (128 * 1024);
310
311 #define THROTTLE_MAX_IOSIZE (throttle_max_iosize)
312
313 SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_max_iosize, CTLFLAG_RW | CTLFLAG_LOCKED, &throttle_max_iosize, 0, "");
314
315
316 void
cluster_init(void)317 cluster_init(void)
318 {
319 for (int i = 0; i < CL_DIRECT_READ_LOCK_BUCKETS; ++i) {
320 LIST_INIT(&cl_direct_read_locks[i]);
321 }
322 }
323
324
325 uint32_t
cluster_max_io_size(mount_t mp,int type)326 cluster_max_io_size(mount_t mp, int type)
327 {
328 uint32_t max_io_size;
329 uint32_t segcnt;
330 uint32_t maxcnt;
331
332 switch (type) {
333 case CL_READ:
334 segcnt = mp->mnt_segreadcnt;
335 maxcnt = mp->mnt_maxreadcnt;
336 break;
337 case CL_WRITE:
338 segcnt = mp->mnt_segwritecnt;
339 maxcnt = mp->mnt_maxwritecnt;
340 break;
341 default:
342 segcnt = min(mp->mnt_segreadcnt, mp->mnt_segwritecnt);
343 maxcnt = min(mp->mnt_maxreadcnt, mp->mnt_maxwritecnt);
344 break;
345 }
346 if (segcnt > (MAX_UPL_SIZE_BYTES >> PAGE_SHIFT)) {
347 /*
348 * don't allow a size beyond the max UPL size we can create
349 */
350 segcnt = MAX_UPL_SIZE_BYTES >> PAGE_SHIFT;
351 }
352 max_io_size = min((segcnt * PAGE_SIZE), maxcnt);
353
354 if (max_io_size < MAX_UPL_TRANSFER_BYTES) {
355 /*
356 * don't allow a size smaller than the old fixed limit
357 */
358 max_io_size = MAX_UPL_TRANSFER_BYTES;
359 } else {
360 /*
361 * make sure the size specified is a multiple of PAGE_SIZE
362 */
363 max_io_size &= ~PAGE_MASK;
364 }
365 return max_io_size;
366 }
367
368
369
370
371 #define CLW_ALLOCATE 0x01
372 #define CLW_RETURNLOCKED 0x02
373 #define CLW_IONOCACHE 0x04
374 #define CLW_IOPASSIVE 0x08
375
376 /*
377 * if the read ahead context doesn't yet exist,
378 * allocate and initialize it...
379 * the vnode lock serializes multiple callers
380 * during the actual assignment... first one
381 * to grab the lock wins... the other callers
382 * will release the now unnecessary storage
383 *
384 * once the context is present, try to grab (but don't block on)
385 * the lock associated with it... if someone
386 * else currently owns it, than the read
387 * will run without read-ahead. this allows
388 * multiple readers to run in parallel and
389 * since there's only 1 read ahead context,
390 * there's no real loss in only allowing 1
391 * reader to have read-ahead enabled.
392 */
393 static struct cl_readahead *
cluster_get_rap(vnode_t vp)394 cluster_get_rap(vnode_t vp)
395 {
396 struct ubc_info *ubc;
397 struct cl_readahead *rap;
398
399 ubc = vp->v_ubcinfo;
400
401 if ((rap = ubc->cl_rahead) == NULL) {
402 rap = zalloc_flags(cl_rd_zone, Z_WAITOK | Z_ZERO);
403 rap->cl_lastr = -1;
404 lck_mtx_init(&rap->cl_lockr, &cl_mtx_grp, LCK_ATTR_NULL);
405
406 vnode_lock(vp);
407
408 if (ubc->cl_rahead == NULL) {
409 ubc->cl_rahead = rap;
410 } else {
411 lck_mtx_destroy(&rap->cl_lockr, &cl_mtx_grp);
412 zfree(cl_rd_zone, rap);
413 rap = ubc->cl_rahead;
414 }
415 vnode_unlock(vp);
416 }
417 if (lck_mtx_try_lock(&rap->cl_lockr) == TRUE) {
418 return rap;
419 }
420
421 return (struct cl_readahead *)NULL;
422 }
423
424
425 /*
426 * if the write behind context doesn't yet exist,
427 * and CLW_ALLOCATE is specified, allocate and initialize it...
428 * the vnode lock serializes multiple callers
429 * during the actual assignment... first one
430 * to grab the lock wins... the other callers
431 * will release the now unnecessary storage
432 *
433 * if CLW_RETURNLOCKED is set, grab (blocking if necessary)
434 * the lock associated with the write behind context before
435 * returning
436 */
437
438 static struct cl_writebehind *
cluster_get_wbp(vnode_t vp,int flags)439 cluster_get_wbp(vnode_t vp, int flags)
440 {
441 struct ubc_info *ubc;
442 struct cl_writebehind *wbp;
443
444 ubc = vp->v_ubcinfo;
445
446 if ((wbp = ubc->cl_wbehind) == NULL) {
447 if (!(flags & CLW_ALLOCATE)) {
448 return (struct cl_writebehind *)NULL;
449 }
450
451 wbp = zalloc_flags(cl_wr_zone, Z_WAITOK | Z_ZERO);
452
453 lck_mtx_init(&wbp->cl_lockw, &cl_mtx_grp, LCK_ATTR_NULL);
454
455 vnode_lock(vp);
456
457 if (ubc->cl_wbehind == NULL) {
458 ubc->cl_wbehind = wbp;
459 } else {
460 lck_mtx_destroy(&wbp->cl_lockw, &cl_mtx_grp);
461 zfree(cl_wr_zone, wbp);
462 wbp = ubc->cl_wbehind;
463 }
464 vnode_unlock(vp);
465 }
466 if (flags & CLW_RETURNLOCKED) {
467 lck_mtx_lock(&wbp->cl_lockw);
468 }
469
470 return wbp;
471 }
472
473
474 static void
cluster_syncup(vnode_t vp,off_t newEOF,int (* callback)(buf_t,void *),void * callback_arg,int flags)475 cluster_syncup(vnode_t vp, off_t newEOF, int (*callback)(buf_t, void *), void *callback_arg, int flags)
476 {
477 struct cl_writebehind *wbp;
478
479 if ((wbp = cluster_get_wbp(vp, 0)) != NULL) {
480 if (wbp->cl_number) {
481 lck_mtx_lock(&wbp->cl_lockw);
482
483 cluster_try_push(wbp, vp, newEOF, PUSH_ALL | flags, 0, callback, callback_arg, NULL, FALSE);
484
485 lck_mtx_unlock(&wbp->cl_lockw);
486 }
487 }
488 }
489
490
491 static int
cluster_io_present_in_BC(vnode_t vp,off_t f_offset)492 cluster_io_present_in_BC(vnode_t vp, off_t f_offset)
493 {
494 daddr64_t blkno;
495 size_t io_size;
496 int (*bootcache_check_fn)(dev_t device, u_int64_t blkno) = bootcache_contains_block;
497
498 if (bootcache_check_fn && vp->v_mount && vp->v_mount->mnt_devvp) {
499 if (VNOP_BLOCKMAP(vp, f_offset, PAGE_SIZE, &blkno, &io_size, NULL, VNODE_READ | VNODE_BLOCKMAP_NO_TRACK, NULL)) {
500 return 0;
501 }
502
503 if (io_size == 0) {
504 return 0;
505 }
506
507 if (bootcache_check_fn(vp->v_mount->mnt_devvp->v_rdev, blkno)) {
508 return 1;
509 }
510 }
511 return 0;
512 }
513
514
515 static int
cluster_is_throttled(vnode_t vp)516 cluster_is_throttled(vnode_t vp)
517 {
518 return throttle_io_will_be_throttled(-1, vp->v_mount);
519 }
520
521
/*
 * Block until the amount of outstanding I/O in 'iostate'
 * (bytes issued minus bytes completed) drops to 'target' or
 * below.  The wakeup side is cluster_iodone, which clears
 * io_wanted and wakes us after advancing io_completed.
 */
static void
cluster_iostate_wait(struct clios *iostate, u_int target, const char *wait_name)
{
	lck_mtx_lock(&iostate->io_mtxp);

	while ((iostate->io_issued - iostate->io_completed) > target) {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_START,
		    iostate->io_issued, iostate->io_completed, target, 0, 0);

		/* advertise that we're sleeping on this stream's state */
		iostate->io_wanted = 1;
		msleep((caddr_t)&iostate->io_wanted, &iostate->io_mtxp, PRIBIO + 1, wait_name, NULL);

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_END,
		    iostate->io_issued, iostate->io_completed, target, 0, 0);
	}
	lck_mtx_unlock(&iostate->io_mtxp);
}
539
/*
 * Release the portion of the "associated" UPL covered by a just-completed
 * transaction of [upl_offset, upl_offset + size) within 'upl'.  The
 * associated UPL holds the file-aligned pages for a direct/uncached write
 * whose user buffer is not page aligned (see the diagram below).  Because
 * the first and last pages of a transaction can be shared with the
 * neighbouring transactions, a shared page is only released by whichever
 * of the two transactions completes last; the hand-off is tracked with
 * the UPL page mark bits under iostate->io_mtxp.
 */
static void
cluster_handle_associated_upl(struct clios *iostate, upl_t upl,
    upl_offset_t upl_offset, upl_size_t size)
{
	if (!size) {
		return;
	}

	upl_t associated_upl = upl_associated_upl(upl);

	if (!associated_upl) {
		return;
	}

#if 0
	printf("1: %d %d\n", upl_offset, upl_offset + size);
#endif

	/*
	 * The associated UPL is page aligned to file offsets whereas the
	 * UPL it's attached to has different alignment requirements. The
	 * upl_offset that we have refers to @upl. The code that follows
	 * has to deal with the first and last pages in this transaction
	 * which might straddle pages in the associated UPL. To keep
	 * track of these pages, we use the mark bits: if the mark bit is
	 * set, we know another transaction has completed its part of that
	 * page and so we can unlock that page here.
	 *
	 * The following illustrates what we have to deal with:
	 *
	 *    MEM u <------------ 1 PAGE ------------> e
	 *        +-------------+----------------------+-----------------
	 *        |             |######################|#################
	 *        +-------------+----------------------+-----------------
	 *   FILE | <--- a ---> o <------------ 1 PAGE ------------>
	 *
	 * So here we show a write to offset @o.  The data that is to be
	 * written is in a buffer that is not page aligned; it has offset
	 * @a in the page.  The upl that carries the data starts in memory
	 * at @u.  The associated upl starts in the file at offset @o.  A
	 * transaction will always end on a page boundary (like @e above)
	 * except for the very last transaction in the group.  We cannot
	 * unlock the page at @o in the associated upl until both the
	 * transaction ending at @e and the following transaction (that
	 * starts at @e) has completed.
	 */

	/*
	 * We record whether or not the two UPLs are aligned as the mark
	 * bit in the first page of @upl.
	 */
	upl_page_info_t *pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
	bool is_unaligned = upl_page_get_mark(pl, 0);

	if (is_unaligned) {
		upl_page_info_t *assoc_pl = UPL_GET_INTERNAL_PAGE_LIST(associated_upl);

		upl_offset_t upl_end = upl_offset + size;
		assert(upl_end >= PAGE_SIZE);

		upl_size_t assoc_upl_size = upl_get_size(associated_upl);

		/*
		 * In the very first transaction in the group, upl_offset will
		 * not be page aligned, but after that it will be and in that
		 * case we want the preceding page in the associated UPL hence
		 * the minus one.
		 */
		assert(upl_offset);
		if (upl_offset) {
			upl_offset = trunc_page_32(upl_offset - 1);
		}

		/* the mark-bit hand-off must be atomic across transactions */
		lck_mtx_lock_spin(&iostate->io_mtxp);

		// Look at the first page...
		if (upl_offset
		    && !upl_page_get_mark(assoc_pl, upl_offset >> PAGE_SHIFT)) {
			/*
			 * The first page isn't marked so let another transaction
			 * completion handle it.
			 */
			upl_page_set_mark(assoc_pl, upl_offset >> PAGE_SHIFT, true);
			upl_offset += PAGE_SIZE;
		}

		// And now the last page...

		/*
		 * This needs to be > rather than >= because if it's equal, it
		 * means there's another transaction that is sharing the last
		 * page.
		 */
		if (upl_end > assoc_upl_size) {
			upl_end = assoc_upl_size;
		} else {
			upl_end = trunc_page_32(upl_end);
			const int last_pg = (upl_end >> PAGE_SHIFT) - 1;

			if (!upl_page_get_mark(assoc_pl, last_pg)) {
				/*
				 * The last page isn't marked so mark the page and let another
				 * transaction completion handle it.
				 */
				upl_page_set_mark(assoc_pl, last_pg, true);
				upl_end -= PAGE_SIZE;
			}
		}

		lck_mtx_unlock(&iostate->io_mtxp);

#if 0
		printf("2: %d %d\n", upl_offset, upl_end);
#endif

		/* nothing left for us to release once the shared pages are excluded */
		if (upl_end <= upl_offset) {
			return;
		}

		size = upl_end - upl_offset;
	} else {
		assert(!(upl_offset & PAGE_MASK));
		assert(!(size & PAGE_MASK));
	}

	boolean_t empty;

	/*
	 * We can unlock these pages now and as this is for a
	 * direct/uncached write, we want to dump the pages too.
	 */
	kern_return_t kr = upl_abort_range(associated_upl, upl_offset, size,
	    UPL_ABORT_DUMP_PAGES, &empty);

	assert(!kr);

	if (!kr && empty) {
		/* last range gone: detach and destroy the associated UPL */
		upl_set_associated_upl(upl, NULL);
		upl_deallocate(associated_upl);
	}
}
681
682 static int
cluster_ioerror(upl_t upl,int upl_offset,int abort_size,int error,int io_flags,vnode_t vp)683 cluster_ioerror(upl_t upl, int upl_offset, int abort_size, int error, int io_flags, vnode_t vp)
684 {
685 int upl_abort_code = 0;
686 int page_in = 0;
687 int page_out = 0;
688
689 if ((io_flags & (B_PHYS | B_CACHE)) == (B_PHYS | B_CACHE)) {
690 /*
691 * direct write of any flavor, or a direct read that wasn't aligned
692 */
693 ubc_upl_commit_range(upl, upl_offset, abort_size, UPL_COMMIT_FREE_ON_EMPTY);
694 } else {
695 if (io_flags & B_PAGEIO) {
696 if (io_flags & B_READ) {
697 page_in = 1;
698 } else {
699 page_out = 1;
700 }
701 }
702 if (io_flags & B_CACHE) {
703 /*
704 * leave pages in the cache unchanged on error
705 */
706 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
707 } else if (((io_flags & B_READ) == 0) && ((error != ENXIO) || vnode_isswap(vp))) {
708 /*
709 * transient error on pageout/write path... leave pages unchanged
710 */
711 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
712 } else if (page_in) {
713 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
714 } else {
715 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;
716 }
717
718 ubc_upl_abort_range(upl, upl_offset, abort_size, upl_abort_code);
719 }
720 return upl_abort_code;
721 }
722
723
/*
 * Completion handler for one buffer of a cluster I/O transaction.
 *
 * A transaction is a chain of buffers linked through b_trans_next,
 * headed by b_trans_head, with B_EOT marking the final buffer.  Each
 * buffer's completion lands here; only the completion that finds the
 * entire chain done (every buffer B_TDONE, one carrying B_EOT)
 * proceeds to process the transaction:
 *   - aggregate errors and residuals across the chain
 *   - release the associated UPL range (direct writes)
 *   - invoke the optional per-transaction b_cliodone callback
 *   - zero the tail of the last valid page (reads ending mid-page)
 *   - run/free the verification context, if one was attached
 *   - update the iostate and wake any waiter in cluster_iostate_wait
 *   - commit or abort the UPL range, and complete the original bp
 *
 * Returns 0 when deferring to a later completion, otherwise the
 * transaction's aggregated error.
 */
static int
cluster_iodone(buf_t bp, void *callback_arg)
{
	int b_flags;
	int error;
	int total_size;
	int total_resid;
	int upl_offset;
	int zero_offset;
	int pg_offset = 0;
	int commit_size = 0;
	int upl_flags = 0;
	int transaction_size = 0;
	upl_t upl;
	buf_t cbp;
	buf_t cbp_head;
	buf_t cbp_next;
	buf_t real_bp;
	vnode_t vp;
	struct clios *iostate;
	void *verify_ctx;
	boolean_t transaction_complete = FALSE;

	__IGNORE_WCASTALIGN(cbp_head = (buf_t)(bp->b_trans_head));

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_START,
	    cbp_head, bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);

	/*
	 * If this transaction has more than one buffer (or the head isn't
	 * yet flagged as the end), mark this buffer done and scan the
	 * chain; bail out unless every buffer has completed.
	 */
	if (cbp_head->b_trans_next || !(cbp_head->b_flags & B_EOT)) {
		lck_mtx_lock_spin(&cl_transaction_mtxp);

		bp->b_flags |= B_TDONE;

		for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) {
			/*
			 * all I/O requests that are part of this transaction
			 * have to complete before we can process it
			 */
			if (!(cbp->b_flags & B_TDONE)) {
				KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
				    cbp_head, cbp, cbp->b_bcount, cbp->b_flags, 0);

				lck_mtx_unlock(&cl_transaction_mtxp);

				return 0;
			}

			/*
			 * CLUSTER_IO_WAITING is a sentinel: another thread is
			 * blocked in cluster_wait_IO on this buffer; wake it
			 * and let it finish the transaction.
			 */
			if (cbp->b_trans_next == CLUSTER_IO_WAITING) {
				KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
				    cbp_head, cbp, cbp->b_bcount, cbp->b_flags, 0);

				lck_mtx_unlock(&cl_transaction_mtxp);
				wakeup(cbp);

				return 0;
			}

			if (cbp->b_flags & B_EOT) {
				transaction_complete = TRUE;
			}
		}
		lck_mtx_unlock(&cl_transaction_mtxp);

		if (transaction_complete == FALSE) {
			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
			    cbp_head, 0, 0, 0, 0);
			return 0;
		}
	}
	/* we're the completion that gets to process the whole transaction */
	error = 0;
	total_size = 0;
	total_resid = 0;

	cbp = cbp_head;
	vp = cbp->b_vp;
	upl_offset = cbp->b_uploffset;
	upl = cbp->b_upl;
	b_flags = cbp->b_flags;
	real_bp = cbp->b_real_bp;
	zero_offset = cbp->b_validend;
	iostate = (struct clios *)cbp->b_iostate;

	if (real_bp) {
		real_bp->b_dev = cbp->b_dev;
	}

	/* walk the chain: aggregate error/resid/size, free all but the head */
	while (cbp) {
		if ((cbp->b_flags & B_ERROR) && error == 0) {
			error = cbp->b_error;
		}

		total_resid += cbp->b_resid;
		total_size += cbp->b_bcount;

		cbp_next = cbp->b_trans_next;

		if (cbp_next == NULL) {
			/*
			 * compute the overall size of the transaction
			 * in case we created one that has 'holes' in it
			 * 'total_size' represents the amount of I/O we
			 * did, not the span of the transaction w/r to the UPL
			 */
			transaction_size = cbp->b_uploffset + cbp->b_bcount - upl_offset;
		}

		if (cbp != cbp_head) {
			free_io_buf(cbp);
		}

		cbp = cbp_next;
	}

	if (ISSET(b_flags, B_COMMIT_UPL)) {
		cluster_handle_associated_upl(iostate,
		    cbp_head->b_upl,
		    upl_offset,
		    transaction_size);
	}

	/* a leftover residual with no recorded error is still a failure */
	if (error == 0 && total_resid) {
		error = EIO;
	}

	if (error == 0) {
		int (*cliodone_func)(buf_t, void *) = (int (*)(buf_t, void *))(cbp_head->b_cliodone);

		if (cliodone_func != NULL) {
			cbp_head->b_bcount = transaction_size;

			error = (*cliodone_func)(cbp_head, callback_arg);
		}
	}
	if (zero_offset) {
		/* zero the remainder of the page past the valid end of the data */
		cluster_zero(upl, zero_offset, PAGE_SIZE - (zero_offset & PAGE_MASK), real_bp);
	}

	/*
	 * If a verification context was attached (by cluster_EOT), hand
	 * the transaction's data to the filesystem for verification; the
	 * context is always freed (VNODE_VERIFY_CONTEXT_FREE), and the
	 * data is only actually verified when the I/O succeeded.
	 */
	verify_ctx = cbp_head->b_attr.ba_verify_ctx;
	cbp_head->b_attr.ba_verify_ctx = NULL;
	if (verify_ctx) {
		vnode_verify_flags_t verify_flags = VNODE_VERIFY_CONTEXT_FREE;
		caddr_t verify_buf = NULL;
		off_t start_off = cbp_head->b_lblkno * cbp_head->b_lblksize;
		size_t verify_length = transaction_size;
		vm_offset_t vaddr;

		if (!error) {
			verify_flags |= VNODE_VERIFY_WITH_CONTEXT;
			error = ubc_upl_map_range(upl, upl_offset, round_page(transaction_size), VM_PROT_DEFAULT, &vaddr); /* Map it in */
			if (error) {
				panic("ubc_upl_map_range returned error %d, upl = %p, upl_offset = %d, size = %d",
				    error, upl, (int)upl_offset, (int)round_page(transaction_size));
			} else {
				verify_buf = (caddr_t)vaddr;
			}
		}

		error = VNOP_VERIFY(vp, start_off, (uint8_t *)verify_buf, verify_length, 0, &verify_ctx, verify_flags, NULL);

		if (verify_buf) {
			(void)ubc_upl_unmap_range(upl, upl_offset, round_page(transaction_size));
			verify_buf = NULL;
		}
	}

	free_io_buf(cbp_head);

	if (iostate) {
		int need_wakeup = 0;

		/*
		 * someone has issued multiple I/Os asynchronously
		 * and is waiting for them to complete (streaming)
		 */
		lck_mtx_lock_spin(&iostate->io_mtxp);

		if (error && iostate->io_error == 0) {
			iostate->io_error = error;
		}

		iostate->io_completed += total_size;

		if (iostate->io_wanted) {
			/*
			 * someone is waiting for the state of
			 * this io stream to change
			 */
			iostate->io_wanted = 0;
			need_wakeup = 1;
		}
		lck_mtx_unlock(&iostate->io_mtxp);

		if (need_wakeup) {
			wakeup((caddr_t)&iostate->io_wanted);
		}
	}

	if (b_flags & B_COMMIT_UPL) {
		/* commit/abort a page-aligned range covering the transaction */
		pg_offset = upl_offset & PAGE_MASK;
		commit_size = (pg_offset + transaction_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;

		if (error) {
			upl_set_iodone_error(upl, error);

			upl_flags = cluster_ioerror(upl, upl_offset - pg_offset, commit_size, error, b_flags, vp);
		} else {
			upl_flags = UPL_COMMIT_FREE_ON_EMPTY;

			if ((b_flags & B_PHYS) && (b_flags & B_READ)) {
				upl_flags |= UPL_COMMIT_SET_DIRTY;
			}

			if (b_flags & B_AGE) {
				upl_flags |= UPL_COMMIT_INACTIVATE;
			}

			ubc_upl_commit_range(upl, upl_offset - pg_offset, commit_size, upl_flags);
		}
	}
	if (real_bp) {
		/* propagate the outcome to the original buffer and complete it */
		if (error) {
			real_bp->b_flags |= B_ERROR;
			real_bp->b_error = error;
		}
		real_bp->b_resid = total_resid;

		buf_biodone(real_bp);
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
	    upl, upl_offset - pg_offset, commit_size, (error << 24) | upl_flags, 0);

	return error;
}
957
958
959 uint32_t
cluster_throttle_io_limit(vnode_t vp,uint32_t * limit)960 cluster_throttle_io_limit(vnode_t vp, uint32_t *limit)
961 {
962 if (cluster_is_throttled(vp)) {
963 *limit = THROTTLE_MAX_IOSIZE;
964 return 1;
965 }
966 return 0;
967 }
968
969
970 void
cluster_zero(upl_t upl,upl_offset_t upl_offset,int size,buf_t bp)971 cluster_zero(upl_t upl, upl_offset_t upl_offset, int size, buf_t bp)
972 {
973 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_START,
974 upl_offset, size, bp, 0, 0);
975
976 if (bp == NULL || bp->b_datap == 0) {
977 upl_page_info_t *pl;
978 addr64_t zero_addr;
979
980 pl = ubc_upl_pageinfo(upl);
981
982 if (upl_device_page(pl) == TRUE) {
983 zero_addr = ((addr64_t)upl_phys_page(pl, 0) << PAGE_SHIFT) + upl_offset;
984
985 bzero_phys_nc(zero_addr, size);
986 } else {
987 while (size) {
988 int page_offset;
989 int page_index;
990 int zero_cnt;
991
992 page_index = upl_offset / PAGE_SIZE;
993 page_offset = upl_offset & PAGE_MASK;
994
995 zero_addr = ((addr64_t)upl_phys_page(pl, page_index) << PAGE_SHIFT) + page_offset;
996 zero_cnt = min(PAGE_SIZE - page_offset, size);
997
998 bzero_phys(zero_addr, zero_cnt);
999
1000 size -= zero_cnt;
1001 upl_offset += zero_cnt;
1002 }
1003 }
1004 } else {
1005 bzero((caddr_t)((vm_offset_t)bp->b_datap + upl_offset), size);
1006 }
1007
1008 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_END,
1009 upl_offset, size, 0, 0, 0);
1010 }
1011
1012
1013 static void
cluster_EOT(buf_t cbp_head,buf_t cbp_tail,int zero_offset,size_t verify_block_size)1014 cluster_EOT(buf_t cbp_head, buf_t cbp_tail, int zero_offset, size_t verify_block_size)
1015 {
1016 /*
1017 * We will assign a verification context to cbp_head.
1018 * This will be passed back to the filesystem when
1019 * verifying (in cluster_iodone).
1020 */
1021 if (verify_block_size) {
1022 off_t start_off = cbp_head->b_lblkno * cbp_head->b_lblksize;
1023 size_t length;
1024 void *verify_ctx = NULL;
1025 int error = 0;
1026 vnode_t vp = buf_vnode(cbp_head);
1027
1028 if (cbp_head == cbp_tail) {
1029 length = cbp_head->b_bcount;
1030 } else {
1031 length = ((cbp_tail->b_lblkno * cbp_tail->b_lblksize) + cbp_tail->b_bcount) - start_off;
1032 }
1033
1034 /*
1035 * zero_offset is non zero for the transaction containing the EOF
1036 * (if the filesize is not page aligned). In that case we might
1037 * have the transaction size not be page/verify block size aligned
1038 */
1039 if ((zero_offset == 0) &&
1040 ((length < verify_block_size) || (length % verify_block_size)) != 0) {
1041 panic("%s length = %zu, verify_block_size = %zu",
1042 __FUNCTION__, length, verify_block_size);
1043 }
1044
1045 error = VNOP_VERIFY(vp, start_off, NULL, length,
1046 &verify_block_size, &verify_ctx, VNODE_VERIFY_CONTEXT_ALLOC, NULL);
1047
1048 if (!verify_ctx) {
1049 if (!error && verify_block_size) {
1050 /*
1051 * fetch the verify block size again, it is
1052 * possible that the verification was turned off
1053 * in the filesystem between the time it was
1054 * checked last and now.
1055 */
1056 error = VNOP_VERIFY(vp, start_off, NULL, 0, &verify_block_size, NULL, VNODE_VERIFY_DEFAULT, NULL);
1057 }
1058
1059 if (error || verify_block_size) {
1060 panic("No verify context for vp = %p, start_off = %lld, length = %zu, error = %d",
1061 buf_vnode(cbp_head), start_off, length, error);
1062 }
1063 }
1064
1065 cbp_head->b_attr.ba_verify_ctx = verify_ctx;
1066 } else {
1067 cbp_head->b_attr.ba_verify_ctx = NULL;
1068 }
1069
1070 cbp_head->b_validend = zero_offset;
1071 cbp_tail->b_flags |= B_EOT;
1072 }
1073
static void
cluster_wait_IO(buf_t cbp_head, int async)
{
	/*
	 * Block until every buf_t in the transaction chain headed by
	 * cbp_head (linked through b_trans_next) has completed its I/O.
	 *
	 * async: non-zero when the I/Os were issued with completion
	 *        callbacks, in which case completion does not normally
	 *        generate a wakeup and we must request one explicitly.
	 */
	buf_t cbp;

	if (async) {
		/*
		 * Async callback completion will not normally generate a
		 * wakeup upon I/O completion. To get woken up, we set
		 * b_trans_next (which is safe for us to modify) on the last
		 * buffer to CLUSTER_IO_WAITING so that cluster_iodone knows
		 * to wake us up when all buffers as part of this transaction
		 * are completed. This is done under the umbrella of
		 * cl_transaction_mtxp which is also taken in cluster_iodone.
		 */
		bool done = true;
		buf_t last = NULL;

		lck_mtx_lock_spin(&cl_transaction_mtxp);

		/* scan the chain for unfinished buffers; 'last' ends up at the tail */
		for (cbp = cbp_head; cbp; last = cbp, cbp = cbp->b_trans_next) {
			if (!ISSET(cbp->b_flags, B_TDONE)) {
				done = false;
			}
		}

		if (!done) {
			/* sentinel: tells cluster_iodone to wakeup(last) at completion */
			last->b_trans_next = CLUSTER_IO_WAITING;

			DTRACE_IO1(wait__start, buf_t, last);
			do {
				/* mutex is dropped while asleep, re-taken on wakeup */
				msleep(last, &cl_transaction_mtxp, PSPIN | (PRIBIO + 1), "cluster_wait_IO", NULL);

				/*
				 * We should only have been woken up if all the
				 * buffers are completed, but just in case...
				 */
				done = true;
				for (cbp = cbp_head; cbp != CLUSTER_IO_WAITING; cbp = cbp->b_trans_next) {
					if (!ISSET(cbp->b_flags, B_TDONE)) {
						done = false;
						break;
					}
				}
			} while (!done);
			DTRACE_IO1(wait__done, buf_t, last);

			/* restore normal chain termination */
			last->b_trans_next = NULL;
		}

		lck_mtx_unlock(&cl_transaction_mtxp);
	} else { // !async
		/* synchronous I/Os generate biodone wakeups... just wait on each buf */
		for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) {
			buf_biowait(cbp);
		}
	}
}
1131
1132 static void
cluster_complete_transaction(buf_t * cbp_head,void * callback_arg,int * retval,int flags,int needwait)1133 cluster_complete_transaction(buf_t *cbp_head, void *callback_arg, int *retval, int flags, int needwait)
1134 {
1135 buf_t cbp;
1136 int error;
1137 boolean_t isswapout = FALSE;
1138
1139 /*
1140 * cluster_complete_transaction will
1141 * only be called if we've issued a complete chain in synchronous mode
1142 * or, we've already done a cluster_wait_IO on an incomplete chain
1143 */
1144 if (needwait) {
1145 for (cbp = *cbp_head; cbp; cbp = cbp->b_trans_next) {
1146 buf_biowait(cbp);
1147 }
1148 }
1149 /*
1150 * we've already waited on all of the I/Os in this transaction,
1151 * so mark all of the buf_t's in this transaction as B_TDONE
1152 * so that cluster_iodone sees the transaction as completed
1153 */
1154 for (cbp = *cbp_head; cbp; cbp = cbp->b_trans_next) {
1155 cbp->b_flags |= B_TDONE;
1156 }
1157 cbp = *cbp_head;
1158
1159 if ((flags & (CL_ASYNC | CL_PAGEOUT)) == CL_PAGEOUT && vnode_isswap(cbp->b_vp)) {
1160 isswapout = TRUE;
1161 }
1162
1163 error = cluster_iodone(cbp, callback_arg);
1164
1165 if (!(flags & CL_ASYNC) && error && *retval == 0) {
1166 if (((flags & (CL_PAGEOUT | CL_KEEPCACHED)) != CL_PAGEOUT) || (error != ENXIO)) {
1167 *retval = error;
1168 } else if (isswapout == TRUE) {
1169 *retval = error;
1170 }
1171 }
1172 *cbp_head = (buf_t)NULL;
1173 }
1174
1175
1176 static int
cluster_io(vnode_t vp,upl_t upl,vm_offset_t upl_offset,off_t f_offset,int non_rounded_size,int flags,buf_t real_bp,struct clios * iostate,int (* callback)(buf_t,void *),void * callback_arg)1177 cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int non_rounded_size,
1178 int flags, buf_t real_bp, struct clios *iostate, int (*callback)(buf_t, void *), void *callback_arg)
1179 {
1180 buf_t cbp;
1181 u_int size;
1182 u_int io_size;
1183 int io_flags;
1184 int bmap_flags;
1185 int error = 0;
1186 int retval = 0;
1187 buf_t cbp_head = NULL;
1188 buf_t cbp_tail = NULL;
1189 int trans_count = 0;
1190 int max_trans_count;
1191 u_int pg_count;
1192 int pg_offset;
1193 u_int max_iosize;
1194 u_int max_vectors;
1195 int priv;
1196 int zero_offset = 0;
1197 int async_throttle = 0;
1198 mount_t mp;
1199 vm_offset_t upl_end_offset;
1200 boolean_t need_EOT = FALSE;
1201 size_t verify_block_size = 0;
1202
1203 /*
1204 * we currently don't support buffers larger than a page
1205 */
1206 if (real_bp && non_rounded_size > PAGE_SIZE) {
1207 panic("%s(): Called with real buffer of size %d bytes which "
1208 "is greater than the maximum allowed size of "
1209 "%d bytes (the system PAGE_SIZE).\n",
1210 __FUNCTION__, non_rounded_size, PAGE_SIZE);
1211 }
1212
1213 mp = vp->v_mount;
1214
1215 /*
1216 * we don't want to do any funny rounding of the size for IO requests
1217 * coming through the DIRECT or CONTIGUOUS paths... those pages don't
1218 * belong to us... we can't extend (nor do we need to) the I/O to fill
1219 * out a page
1220 */
1221 if (mp->mnt_devblocksize > 1 && !(flags & (CL_DEV_MEMORY | CL_DIRECT_IO))) {
1222 /*
1223 * round the requested size up so that this I/O ends on a
1224 * page boundary in case this is a 'write'... if the filesystem
1225 * has blocks allocated to back the page beyond the EOF, we want to
1226 * make sure to write out the zero's that are sitting beyond the EOF
1227 * so that in case the filesystem doesn't explicitly zero this area
1228 * if a hole is created via a lseek/write beyond the current EOF,
1229 * it will return zeros when it's read back from the disk. If the
1230 * physical allocation doesn't extend for the whole page, we'll
1231 * only write/read from the disk up to the end of this allocation
1232 * via the extent info returned from the VNOP_BLOCKMAP call.
1233 */
1234 pg_offset = upl_offset & PAGE_MASK;
1235
1236 size = (((non_rounded_size + pg_offset) + (PAGE_SIZE - 1)) & ~PAGE_MASK) - pg_offset;
1237 } else {
1238 /*
1239 * anyone advertising a blocksize of 1 byte probably
1240 * can't deal with us rounding up the request size
1241 * AFP is one such filesystem/device
1242 */
1243 size = non_rounded_size;
1244 }
1245 upl_end_offset = upl_offset + size;
1246
1247 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_START, (int)f_offset, size, upl_offset, flags, 0);
1248
1249 /*
1250 * Set the maximum transaction size to the maximum desired number of
1251 * buffers.
1252 */
1253 max_trans_count = 8;
1254 if (flags & CL_DEV_MEMORY) {
1255 max_trans_count = 16;
1256 }
1257
1258 if (flags & CL_READ) {
1259 io_flags = B_READ;
1260 bmap_flags = VNODE_READ;
1261
1262 max_iosize = mp->mnt_maxreadcnt;
1263 max_vectors = mp->mnt_segreadcnt;
1264
1265 if ((flags & CL_PAGEIN) && /* Cluster layer verification will be limited to pagein for now */
1266 !(mp->mnt_kern_flag & MNTK_VIRTUALDEV) &&
1267 (VNOP_VERIFY(vp, f_offset, NULL, 0, &verify_block_size, NULL, VNODE_VERIFY_DEFAULT, NULL) == 0) &&
1268 verify_block_size) {
1269 if (verify_block_size != PAGE_SIZE) {
1270 verify_block_size = 0;
1271 }
1272 if (real_bp && verify_block_size) {
1273 panic("%s(): Called with real buffer and needs verification ",
1274 __FUNCTION__);
1275 }
1276 }
1277 } else {
1278 io_flags = B_WRITE;
1279 bmap_flags = VNODE_WRITE;
1280
1281 max_iosize = mp->mnt_maxwritecnt;
1282 max_vectors = mp->mnt_segwritecnt;
1283 }
1284 if (verify_block_size) {
1285 bmap_flags |= VNODE_CLUSTER_VERIFY;
1286 }
1287 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_NONE, max_iosize, max_vectors, mp->mnt_devblocksize, 0, 0);
1288
1289 /*
1290 * make sure the maximum iosize is a
1291 * multiple of the page size
1292 */
1293 max_iosize &= ~PAGE_MASK;
1294
1295 /*
1296 * Ensure the maximum iosize is sensible.
1297 */
1298 if (!max_iosize) {
1299 max_iosize = PAGE_SIZE;
1300 }
1301
1302 if (flags & CL_THROTTLE) {
1303 if (!(flags & CL_PAGEOUT) && cluster_is_throttled(vp)) {
1304 if (max_iosize > THROTTLE_MAX_IOSIZE) {
1305 max_iosize = THROTTLE_MAX_IOSIZE;
1306 }
1307 async_throttle = THROTTLE_MAXCNT;
1308 } else {
1309 if ((flags & CL_DEV_MEMORY)) {
1310 async_throttle = IO_SCALE(vp, VNODE_ASYNC_THROTTLE);
1311 } else {
1312 u_int max_cluster;
1313 u_int max_cluster_size;
1314 u_int scale;
1315
1316 if (vp->v_mount->mnt_minsaturationbytecount) {
1317 max_cluster_size = vp->v_mount->mnt_minsaturationbytecount;
1318
1319 scale = 1;
1320 } else {
1321 max_cluster_size = MAX_CLUSTER_SIZE(vp);
1322
1323 if (disk_conditioner_mount_is_ssd(vp->v_mount)) {
1324 scale = WRITE_THROTTLE_SSD;
1325 } else {
1326 scale = WRITE_THROTTLE;
1327 }
1328 }
1329 if (max_iosize > max_cluster_size) {
1330 max_cluster = max_cluster_size;
1331 } else {
1332 max_cluster = max_iosize;
1333 }
1334
1335 if (size < max_cluster) {
1336 max_cluster = size;
1337 }
1338
1339 if (flags & CL_CLOSE) {
1340 scale += MAX_CLUSTERS;
1341 }
1342
1343 async_throttle = min(IO_SCALE(vp, VNODE_ASYNC_THROTTLE), ((scale * max_cluster_size) / max_cluster) - 1);
1344 }
1345 }
1346 }
1347 if (flags & CL_AGE) {
1348 io_flags |= B_AGE;
1349 }
1350 if (flags & (CL_PAGEIN | CL_PAGEOUT)) {
1351 io_flags |= B_PAGEIO;
1352 }
1353 if (flags & (CL_IOSTREAMING)) {
1354 io_flags |= B_IOSTREAMING;
1355 }
1356 if (flags & CL_COMMIT) {
1357 io_flags |= B_COMMIT_UPL;
1358 }
1359 if (flags & CL_DIRECT_IO) {
1360 io_flags |= B_PHYS;
1361 }
1362 if (flags & (CL_PRESERVE | CL_KEEPCACHED)) {
1363 io_flags |= B_CACHE;
1364 }
1365 if (flags & CL_PASSIVE) {
1366 io_flags |= B_PASSIVE;
1367 }
1368 if (flags & CL_ENCRYPTED) {
1369 io_flags |= B_ENCRYPTED_IO;
1370 }
1371
1372 if (vp->v_flag & VSYSTEM) {
1373 io_flags |= B_META;
1374 }
1375
1376 if ((flags & CL_READ) && ((upl_offset + non_rounded_size) & PAGE_MASK) && (!(flags & CL_NOZERO))) {
1377 /*
1378 * then we are going to end up
1379 * with a page that we can't complete (the file size wasn't a multiple
1380 * of PAGE_SIZE and we're trying to read to the end of the file
1381 * so we'll go ahead and zero out the portion of the page we can't
1382 * read in from the file
1383 */
1384 zero_offset = (int)(upl_offset + non_rounded_size);
1385 } else if (!ISSET(flags, CL_READ) && ISSET(flags, CL_DIRECT_IO)) {
1386 assert(ISSET(flags, CL_COMMIT));
1387
1388 // For a direct/uncached write, we need to lock pages...
1389
1390 upl_t cached_upl;
1391
1392 /*
1393 * Create a UPL to lock the pages in the cache whilst the
1394 * write is in progress.
1395 */
1396 ubc_create_upl_kernel(vp, f_offset, non_rounded_size, &cached_upl,
1397 NULL, UPL_SET_LITE, VM_KERN_MEMORY_FILE);
1398
1399 /*
1400 * Attach this UPL to the other UPL so that we can find it
1401 * later.
1402 */
1403 upl_set_associated_upl(upl, cached_upl);
1404
1405 if (upl_offset & PAGE_MASK) {
1406 /*
1407 * The two UPLs are not aligned, so mark the first page in
1408 * @upl so that cluster_handle_associated_upl can handle
1409 * it accordingly.
1410 */
1411 upl_page_info_t *pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
1412 upl_page_set_mark(pl, 0, true);
1413 }
1414 }
1415
1416 while (size) {
1417 daddr64_t blkno;
1418 daddr64_t lblkno;
1419 size_t io_size_tmp;
1420 u_int io_size_wanted;
1421 uint32_t lblksize;
1422
1423 if (size > max_iosize) {
1424 io_size = max_iosize;
1425 } else {
1426 io_size = size;
1427 }
1428
1429 io_size_wanted = io_size;
1430 io_size_tmp = (size_t)io_size;
1431
1432 if ((error = VNOP_BLOCKMAP(vp, f_offset, io_size, &blkno, &io_size_tmp, NULL, bmap_flags, NULL))) {
1433 break;
1434 }
1435
1436 if (io_size_tmp > io_size_wanted) {
1437 io_size = io_size_wanted;
1438 } else {
1439 io_size = (u_int)io_size_tmp;
1440 }
1441
1442 if (real_bp && (real_bp->b_blkno == real_bp->b_lblkno)) {
1443 real_bp->b_blkno = blkno;
1444 }
1445
1446 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 24)) | DBG_FUNC_NONE,
1447 (int)f_offset, (int)(blkno >> 32), (int)blkno, io_size, 0);
1448
1449 if (io_size == 0) {
1450 /*
1451 * vnop_blockmap didn't return an error... however, it did
1452 * return an extent size of 0 which means we can't
1453 * make forward progress on this I/O... a hole in the
1454 * file would be returned as a blkno of -1 with a non-zero io_size
1455 * a real extent is returned with a blkno != -1 and a non-zero io_size
1456 */
1457 error = EINVAL;
1458 break;
1459 }
1460 if (!(flags & CL_READ) && blkno == -1) {
1461 off_t e_offset;
1462 int pageout_flags;
1463
1464 if (upl_get_internal_vectorupl(upl)) {
1465 panic("Vector UPLs should not take this code-path");
1466 }
1467 /*
1468 * we're writing into a 'hole'
1469 */
1470 if (flags & CL_PAGEOUT) {
1471 /*
1472 * if we got here via cluster_pageout
1473 * then just error the request and return
1474 * the 'hole' should already have been covered
1475 */
1476 error = EINVAL;
1477 break;
1478 }
1479 /*
1480 * we can get here if the cluster code happens to
1481 * pick up a page that was dirtied via mmap vs
1482 * a 'write' and the page targets a 'hole'...
1483 * i.e. the writes to the cluster were sparse
1484 * and the file was being written for the first time
1485 *
1486 * we can also get here if the filesystem supports
1487 * 'holes' that are less than PAGE_SIZE.... because
1488 * we can't know if the range in the page that covers
1489 * the 'hole' has been dirtied via an mmap or not,
1490 * we have to assume the worst and try to push the
1491 * entire page to storage.
1492 *
1493 * Try paging out the page individually before
1494 * giving up entirely and dumping it (the pageout
1495 * path will insure that the zero extent accounting
1496 * has been taken care of before we get back into cluster_io)
1497 *
1498 * go direct to vnode_pageout so that we don't have to
1499 * unbusy the page from the UPL... we used to do this
1500 * so that we could call ubc_msync, but that results
1501 * in a potential deadlock if someone else races us to acquire
1502 * that page and wins and in addition needs one of the pages
1503 * we're continuing to hold in the UPL
1504 */
1505 pageout_flags = UPL_MSYNC | UPL_VNODE_PAGER | UPL_NESTED_PAGEOUT;
1506
1507 if (!(flags & CL_ASYNC)) {
1508 pageout_flags |= UPL_IOSYNC;
1509 }
1510 if (!(flags & CL_COMMIT)) {
1511 pageout_flags |= UPL_NOCOMMIT;
1512 }
1513
1514 if (cbp_head) {
1515 buf_t prev_cbp;
1516 uint32_t bytes_in_last_page;
1517
1518 /*
				 * first we have to wait for the current outstanding I/Os
1520 * to complete... EOT hasn't been set yet on this transaction
1521 * so the pages won't be released
1522 */
1523 cluster_wait_IO(cbp_head, (flags & CL_ASYNC));
1524
1525 bytes_in_last_page = cbp_head->b_uploffset & PAGE_MASK;
1526 for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) {
1527 bytes_in_last_page += cbp->b_bcount;
1528 }
1529 bytes_in_last_page &= PAGE_MASK;
1530
1531 while (bytes_in_last_page) {
					/*
					 * we've got a transaction that
					 * includes the page we're about to push out through vnode_pageout...
					 * find the bp's in the list which intersect this page and either
					 * remove them entirely from the transaction (there could be multiple bp's), or
					 * round its iosize down to the page boundary (there can only be one)...
					 *
					 * find the last bp in the list and act on it
					 */
1541 for (prev_cbp = cbp = cbp_head; cbp->b_trans_next; cbp = cbp->b_trans_next) {
1542 prev_cbp = cbp;
1543 }
1544
1545 if (bytes_in_last_page >= cbp->b_bcount) {
1546 /*
1547 * this buf no longer has any I/O associated with it
1548 */
1549 bytes_in_last_page -= cbp->b_bcount;
1550 cbp->b_bcount = 0;
1551
1552 free_io_buf(cbp);
1553
1554 if (cbp == cbp_head) {
1555 assert(bytes_in_last_page == 0);
1556 /*
1557 * the buf we just freed was the only buf in
1558 * this transaction... so there's no I/O to do
1559 */
1560 cbp_head = NULL;
1561 cbp_tail = NULL;
1562 } else {
1563 /*
1564 * remove the buf we just freed from
1565 * the transaction list
1566 */
1567 prev_cbp->b_trans_next = NULL;
1568 cbp_tail = prev_cbp;
1569 }
1570 } else {
1571 /*
1572 * this is the last bp that has I/O
1573 * intersecting the page of interest
1574 * only some of the I/O is in the intersection
1575 * so clip the size but keep it in the transaction list
1576 */
1577 cbp->b_bcount -= bytes_in_last_page;
1578 cbp_tail = cbp;
1579 bytes_in_last_page = 0;
1580 }
1581 }
1582 if (cbp_head) {
1583 /*
1584 * there was more to the current transaction
1585 * than just the page we are pushing out via vnode_pageout...
1586 * mark it as finished and complete it... we've already
1587 * waited for the I/Os to complete above in the call to cluster_wait_IO
1588 */
1589 cluster_EOT(cbp_head, cbp_tail, 0, 0);
1590
1591 cluster_complete_transaction(&cbp_head, callback_arg, &retval, flags, 0);
1592
1593 trans_count = 0;
1594 }
1595 }
1596 if (vnode_pageout(vp, upl, (upl_offset_t)trunc_page(upl_offset), trunc_page_64(f_offset), PAGE_SIZE, pageout_flags, NULL) != PAGER_SUCCESS) {
1597 error = EINVAL;
1598 }
1599 e_offset = round_page_64(f_offset + 1);
1600 io_size = (u_int)(e_offset - f_offset);
1601
1602 f_offset += io_size;
1603 upl_offset += io_size;
1604
1605 if (size >= io_size) {
1606 size -= io_size;
1607 } else {
1608 size = 0;
1609 }
1610 /*
1611 * keep track of how much of the original request
1612 * that we've actually completed... non_rounded_size
1613 * may go negative due to us rounding the request
1614 * to a page size multiple (i.e. size > non_rounded_size)
1615 */
1616 non_rounded_size -= io_size;
1617
1618 if (non_rounded_size <= 0) {
1619 /*
1620 * we've transferred all of the data in the original
1621 * request, but we were unable to complete the tail
1622 * of the last page because the file didn't have
1623 * an allocation to back that portion... this is ok.
1624 */
1625 size = 0;
1626 }
1627 if (error) {
1628 if (size == 0) {
1629 flags &= ~CL_COMMIT;
1630 }
1631 break;
1632 }
1633 continue;
1634 }
1635
1636 lblksize = CLUSTER_IO_BLOCK_SIZE;
1637 lblkno = (daddr64_t)(f_offset / lblksize);
1638
1639 /*
1640 * we have now figured out how much I/O we can do - this is in 'io_size'
1641 * pg_offset is the starting point in the first page for the I/O
1642 * pg_count is the number of full and partial pages that 'io_size' encompasses
1643 */
1644 pg_offset = upl_offset & PAGE_MASK;
1645
1646 if (flags & CL_DEV_MEMORY) {
1647 /*
1648 * treat physical requests as one 'giant' page
1649 */
1650 pg_count = 1;
1651 } else {
1652 pg_count = (io_size + pg_offset + (PAGE_SIZE - 1)) / PAGE_SIZE;
1653 }
1654
1655 if ((flags & CL_READ) && blkno == -1) {
1656 vm_offset_t commit_offset;
1657 int bytes_to_zero;
1658 int complete_transaction_now = 0;
1659
1660 /*
1661 * if we're reading and blkno == -1, then we've got a
1662 * 'hole' in the file that we need to deal with by zeroing
1663 * out the affected area in the upl
1664 */
1665 if (io_size >= (u_int)non_rounded_size) {
1666 /*
1667 * if this upl contains the EOF and it is not a multiple of PAGE_SIZE
1668 * than 'zero_offset' will be non-zero
1669 * if the 'hole' returned by vnop_blockmap extends all the way to the eof
1670 * (indicated by the io_size finishing off the I/O request for this UPL)
1671 * than we're not going to issue an I/O for the
1672 * last page in this upl... we need to zero both the hole and the tail
1673 * of the page beyond the EOF, since the delayed zero-fill won't kick in
1674 */
1675 bytes_to_zero = non_rounded_size;
1676 if (!(flags & CL_NOZERO)) {
1677 bytes_to_zero = (int)((((upl_offset + io_size) + (PAGE_SIZE - 1)) & ~PAGE_MASK) - upl_offset);
1678 }
1679
1680 zero_offset = 0;
1681 } else {
1682 bytes_to_zero = io_size;
1683 }
1684
1685 pg_count = 0;
1686
1687 cluster_zero(upl, (upl_offset_t)upl_offset, bytes_to_zero, real_bp);
1688
1689 if (cbp_head) {
1690 int pg_resid;
1691
1692 /*
1693 * if there is a current I/O chain pending
1694 * then the first page of the group we just zero'd
1695 * will be handled by the I/O completion if the zero
1696 * fill started in the middle of the page
1697 */
1698 commit_offset = (upl_offset + (PAGE_SIZE - 1)) & ~PAGE_MASK;
1699
1700 pg_resid = (int)(commit_offset - upl_offset);
1701
1702 if (bytes_to_zero >= pg_resid) {
1703 /*
1704 * the last page of the current I/O
1705 * has been completed...
1706 * compute the number of fully zero'd
1707 * pages that are beyond it
1708 * plus the last page if its partial
1709 * and we have no more I/O to issue...
1710 * otherwise a partial page is left
1711 * to begin the next I/O
1712 */
1713 if ((int)io_size >= non_rounded_size) {
1714 pg_count = (bytes_to_zero - pg_resid + (PAGE_SIZE - 1)) / PAGE_SIZE;
1715 } else {
1716 pg_count = (bytes_to_zero - pg_resid) / PAGE_SIZE;
1717 }
1718
1719 complete_transaction_now = 1;
1720 }
1721 } else {
1722 /*
1723 * no pending I/O to deal with
1724 * so, commit all of the fully zero'd pages
1725 * plus the last page if its partial
1726 * and we have no more I/O to issue...
1727 * otherwise a partial page is left
1728 * to begin the next I/O
1729 */
1730 if ((int)io_size >= non_rounded_size) {
1731 pg_count = (pg_offset + bytes_to_zero + (PAGE_SIZE - 1)) / PAGE_SIZE;
1732 } else {
1733 pg_count = (pg_offset + bytes_to_zero) / PAGE_SIZE;
1734 }
1735
1736 commit_offset = upl_offset & ~PAGE_MASK;
1737 }
1738
1739 // Associated UPL is currently only used in the direct write path
1740 assert(!upl_associated_upl(upl));
1741
1742 if ((flags & CL_COMMIT) && pg_count) {
1743 ubc_upl_commit_range(upl, (upl_offset_t)commit_offset,
1744 pg_count * PAGE_SIZE,
1745 UPL_COMMIT_CLEAR_DIRTY | UPL_COMMIT_FREE_ON_EMPTY);
1746 }
1747 upl_offset += io_size;
1748 f_offset += io_size;
1749 size -= io_size;
1750
1751 /*
1752 * keep track of how much of the original request
1753 * that we've actually completed... non_rounded_size
1754 * may go negative due to us rounding the request
1755 * to a page size multiple (i.e. size > non_rounded_size)
1756 */
1757 non_rounded_size -= io_size;
1758
1759 if (non_rounded_size <= 0) {
1760 /*
1761 * we've transferred all of the data in the original
1762 * request, but we were unable to complete the tail
1763 * of the last page because the file didn't have
1764 * an allocation to back that portion... this is ok.
1765 */
1766 size = 0;
1767 }
1768 if (cbp_head && (complete_transaction_now || size == 0)) {
1769 cluster_wait_IO(cbp_head, (flags & CL_ASYNC));
1770
1771 cluster_EOT(cbp_head, cbp_tail, size == 0 ? zero_offset : 0, verify_block_size);
1772
1773 cluster_complete_transaction(&cbp_head, callback_arg, &retval, flags, 0);
1774
1775 trans_count = 0;
1776 }
1777 continue;
1778 }
1779 if (pg_count > max_vectors) {
1780 if (((pg_count - max_vectors) * PAGE_SIZE) > io_size) {
1781 io_size = PAGE_SIZE - pg_offset;
1782 pg_count = 1;
1783 } else {
1784 io_size -= (pg_count - max_vectors) * PAGE_SIZE;
1785 pg_count = max_vectors;
1786 }
1787 }
1788 /*
1789 * If the transaction is going to reach the maximum number of
1790 * desired elements, truncate the i/o to the nearest page so
1791 * that the actual i/o is initiated after this buffer is
1792 * created and added to the i/o chain.
1793 *
1794 * I/O directed to physically contiguous memory
1795 * doesn't have a requirement to make sure we 'fill' a page
1796 */
1797 if (!(flags & CL_DEV_MEMORY) && trans_count >= max_trans_count &&
1798 ((upl_offset + io_size) & PAGE_MASK)) {
1799 vm_offset_t aligned_ofs;
1800
1801 aligned_ofs = (upl_offset + io_size) & ~PAGE_MASK;
1802 /*
1803 * If the io_size does not actually finish off even a
1804 * single page we have to keep adding buffers to the
1805 * transaction despite having reached the desired limit.
1806 *
1807 * Eventually we get here with the page being finished
1808 * off (and exceeded) and then we truncate the size of
1809 * this i/o request so that it is page aligned so that
1810 * we can finally issue the i/o on the transaction.
1811 */
1812 if (aligned_ofs > upl_offset) {
1813 io_size = (u_int)(aligned_ofs - upl_offset);
1814 pg_count--;
1815 }
1816 }
1817
1818 if (!(mp->mnt_kern_flag & MNTK_VIRTUALDEV)) {
1819 /*
1820 * if we're not targeting a virtual device i.e. a disk image
1821 * it's safe to dip into the reserve pool since real devices
1822 * can complete this I/O request without requiring additional
1823 * bufs from the alloc_io_buf pool
1824 */
1825 priv = 1;
1826 } else if ((flags & CL_ASYNC) && !(flags & CL_PAGEOUT) && !cbp_head) {
1827 /*
1828 * Throttle the speculative IO
1829 *
1830 * We can only throttle this if it is the first iobuf
1831 * for the transaction. alloc_io_buf implements
1832 * additional restrictions for diskimages anyway.
1833 */
1834 priv = 0;
1835 } else {
1836 priv = 1;
1837 }
1838
1839 cbp = alloc_io_buf(vp, priv);
1840
1841 if (flags & CL_PAGEOUT) {
1842 u_int i;
1843
1844 /*
1845 * since blocks are in offsets of lblksize (CLUSTER_IO_BLOCK_SIZE), scale
1846 * iteration to (PAGE_SIZE * pg_count) of blks.
1847 */
1848 for (i = 0; i < (PAGE_SIZE * pg_count) / lblksize; i++) {
1849 if (buf_invalblkno(vp, lblkno + i, 0) == EBUSY) {
1850 panic("BUSY bp found in cluster_io");
1851 }
1852 }
1853 }
1854 if (flags & CL_ASYNC) {
1855 if (buf_setcallback(cbp, (void *)cluster_iodone, callback_arg)) {
1856 panic("buf_setcallback failed");
1857 }
1858 }
1859 cbp->b_cliodone = (void *)callback;
1860 cbp->b_flags |= io_flags;
1861 if (flags & CL_NOCACHE) {
1862 cbp->b_attr.ba_flags |= BA_NOCACHE;
1863 }
1864 if (verify_block_size) {
1865 cbp->b_attr.ba_flags |= BA_WILL_VERIFY;
1866 }
1867
1868 cbp->b_lblkno = lblkno;
1869 cbp->b_lblksize = lblksize;
1870 cbp->b_blkno = blkno;
1871 cbp->b_bcount = io_size;
1872
1873 if (buf_setupl(cbp, upl, (uint32_t)upl_offset)) {
1874 panic("buf_setupl failed");
1875 }
1876 #if CONFIG_IOSCHED
1877 upl_set_blkno(upl, upl_offset, io_size, blkno);
1878 #endif
1879 cbp->b_trans_next = (buf_t)NULL;
1880
1881 if ((cbp->b_iostate = (void *)iostate)) {
1882 /*
1883 * caller wants to track the state of this
1884 * io... bump the amount issued against this stream
1885 */
1886 iostate->io_issued += io_size;
1887 }
1888
1889 if (flags & CL_READ) {
1890 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 26)) | DBG_FUNC_NONE,
1891 (int)cbp->b_lblkno, (int)cbp->b_blkno, upl_offset, io_size, 0);
1892 } else {
1893 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 27)) | DBG_FUNC_NONE,
1894 (int)cbp->b_lblkno, (int)cbp->b_blkno, upl_offset, io_size, 0);
1895 }
1896
1897 if (cbp_head) {
1898 cbp_tail->b_trans_next = cbp;
1899 cbp_tail = cbp;
1900 } else {
1901 cbp_head = cbp;
1902 cbp_tail = cbp;
1903
1904 if ((cbp_head->b_real_bp = real_bp)) {
1905 real_bp = (buf_t)NULL;
1906 }
1907 }
1908 *(buf_t *)(&cbp->b_trans_head) = cbp_head;
1909
1910 trans_count++;
1911
1912 upl_offset += io_size;
1913 f_offset += io_size;
1914 size -= io_size;
1915 /*
1916 * keep track of how much of the original request
1917 * that we've actually completed... non_rounded_size
1918 * may go negative due to us rounding the request
1919 * to a page size multiple (i.e. size > non_rounded_size)
1920 */
1921 non_rounded_size -= io_size;
1922
1923 if (non_rounded_size <= 0) {
1924 /*
1925 * we've transferred all of the data in the original
1926 * request, but we were unable to complete the tail
1927 * of the last page because the file didn't have
1928 * an allocation to back that portion... this is ok.
1929 */
1930 size = 0;
1931 }
1932 if (size == 0) {
1933 /*
1934 * we have no more I/O to issue, so go
1935 * finish the final transaction
1936 */
1937 need_EOT = TRUE;
1938 } else if (((flags & CL_DEV_MEMORY) || (upl_offset & PAGE_MASK) == 0) &&
1939 ((flags & CL_ASYNC) || trans_count > max_trans_count)) {
1940 /*
1941 * I/O directed to physically contiguous memory...
1942 * which doesn't have a requirement to make sure we 'fill' a page
1943 * or...
1944 * the current I/O we've prepared fully
1945 * completes the last page in this request
1946 * and ...
1947 * it's either an ASYNC request or
1948 * we've already accumulated more than 8 I/O's into
1949 * this transaction so mark it as complete so that
1950 * it can finish asynchronously or via the cluster_complete_transaction
1951 * below if the request is synchronous
1952 */
1953 need_EOT = TRUE;
1954 }
1955 if (need_EOT == TRUE) {
1956 cluster_EOT(cbp_head, cbp_tail, size == 0 ? zero_offset : 0, verify_block_size);
1957 }
1958
1959 if (flags & CL_THROTTLE) {
1960 (void)vnode_waitforwrites(vp, async_throttle, 0, 0, "cluster_io");
1961 }
1962
1963 if (!(io_flags & B_READ)) {
1964 vnode_startwrite(vp);
1965 }
1966
1967 if (flags & CL_RAW_ENCRYPTED) {
1968 /*
1969 * User requested raw encrypted bytes.
1970 * Twiddle the bit in the ba_flags for the buffer
1971 */
1972 cbp->b_attr.ba_flags |= BA_RAW_ENCRYPTED_IO;
1973 }
1974
1975 (void) VNOP_STRATEGY(cbp);
1976
1977 if (need_EOT == TRUE) {
1978 if (!(flags & CL_ASYNC)) {
1979 cluster_complete_transaction(&cbp_head, callback_arg, &retval, flags, 1);
1980 }
1981
1982 need_EOT = FALSE;
1983 trans_count = 0;
1984 cbp_head = NULL;
1985 }
1986 }
1987 if (error) {
1988 int abort_size;
1989
1990 io_size = 0;
1991
1992 if (cbp_head) {
1993 /*
1994 * Wait until all of the outstanding I/O
1995 * for this partial transaction has completed
1996 */
1997 cluster_wait_IO(cbp_head, (flags & CL_ASYNC));
1998
1999 /*
2000 * Rewind the upl offset to the beginning of the
2001 * transaction.
2002 */
2003 upl_offset = cbp_head->b_uploffset;
2004 }
2005
2006 if (ISSET(flags, CL_COMMIT)) {
2007 cluster_handle_associated_upl(iostate, upl,
2008 (upl_offset_t)upl_offset,
2009 (upl_size_t)(upl_end_offset - upl_offset));
2010 }
2011
2012 // Free all the IO buffers in this transaction
2013 for (cbp = cbp_head; cbp;) {
2014 buf_t cbp_next;
2015
2016 size += cbp->b_bcount;
2017 io_size += cbp->b_bcount;
2018
2019 cbp_next = cbp->b_trans_next;
2020 free_io_buf(cbp);
2021 cbp = cbp_next;
2022 }
2023
2024 if (iostate) {
2025 int need_wakeup = 0;
2026
2027 /*
2028 * update the error condition for this stream
2029 * since we never really issued the io
2030 * just go ahead and adjust it back
2031 */
2032 lck_mtx_lock_spin(&iostate->io_mtxp);
2033
2034 if (iostate->io_error == 0) {
2035 iostate->io_error = error;
2036 }
2037 iostate->io_issued -= io_size;
2038
2039 if (iostate->io_wanted) {
2040 /*
2041 * someone is waiting for the state of
2042 * this io stream to change
2043 */
2044 iostate->io_wanted = 0;
2045 need_wakeup = 1;
2046 }
2047 lck_mtx_unlock(&iostate->io_mtxp);
2048
2049 if (need_wakeup) {
2050 wakeup((caddr_t)&iostate->io_wanted);
2051 }
2052 }
2053
2054 if (flags & CL_COMMIT) {
2055 int upl_flags;
2056
2057 pg_offset = upl_offset & PAGE_MASK;
2058 abort_size = (int)((upl_end_offset - upl_offset + PAGE_MASK) & ~PAGE_MASK);
2059
2060 upl_flags = cluster_ioerror(upl, (int)(upl_offset - pg_offset),
2061 abort_size, error, io_flags, vp);
2062
2063 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 28)) | DBG_FUNC_NONE,
2064 upl, upl_offset - pg_offset, abort_size, (error << 24) | upl_flags, 0);
2065 }
2066 if (retval == 0) {
2067 retval = error;
2068 }
2069 } else if (cbp_head) {
2070 panic("%s(): cbp_head is not NULL.", __FUNCTION__);
2071 }
2072
2073 if (real_bp) {
2074 /*
2075 * can get here if we either encountered an error
2076 * or we completely zero-filled the request and
2077 * no I/O was issued
2078 */
2079 if (error) {
2080 real_bp->b_flags |= B_ERROR;
2081 real_bp->b_error = error;
2082 }
2083 buf_biodone(real_bp);
2084 }
2085 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_END, (int)f_offset, size, upl_offset, retval, 0);
2086
2087 return retval;
2088 }
2089
/*
 * Reset the accumulation state for a vectored-UPL run: clears the
 * issue flag, the starting offset within the vector UPL, the element
 * index, the accumulated I/O size and the total UPL size in a single
 * chained assignment.  The expansion supplies its own trailing
 * semicolon, so call sites use it as a bare statement.
 */
#define reset_vector_run_state() \
	issueVectorUPL = vector_upl_offset = vector_upl_index = vector_upl_iosize = vector_upl_size = 0;
2092
2093 static int
vector_cluster_io(vnode_t vp,upl_t vector_upl,vm_offset_t vector_upl_offset,off_t v_upl_uio_offset,int vector_upl_iosize,int io_flag,buf_t real_bp,struct clios * iostate,int (* callback)(buf_t,void *),void * callback_arg)2094 vector_cluster_io(vnode_t vp, upl_t vector_upl, vm_offset_t vector_upl_offset, off_t v_upl_uio_offset, int vector_upl_iosize,
2095 int io_flag, buf_t real_bp, struct clios *iostate, int (*callback)(buf_t, void *), void *callback_arg)
2096 {
2097 vector_upl_set_pagelist(vector_upl);
2098
2099 if (io_flag & CL_READ) {
2100 if (vector_upl_offset == 0 && ((vector_upl_iosize & PAGE_MASK) == 0)) {
2101 io_flag &= ~CL_PRESERVE; /*don't zero fill*/
2102 } else {
2103 io_flag |= CL_PRESERVE; /*zero fill*/
2104 }
2105 }
2106 return cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, real_bp, iostate, callback, callback_arg);
2107 }
2108
2109 static int
cluster_read_prefetch(vnode_t vp,off_t f_offset,u_int size,off_t filesize,int (* callback)(buf_t,void *),void * callback_arg,int bflag)2110 cluster_read_prefetch(vnode_t vp, off_t f_offset, u_int size, off_t filesize, int (*callback)(buf_t, void *), void *callback_arg, int bflag)
2111 {
2112 int pages_in_prefetch;
2113
2114 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_START,
2115 (int)f_offset, size, (int)filesize, 0, 0);
2116
2117 if (f_offset >= filesize) {
2118 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
2119 (int)f_offset, 0, 0, 0, 0);
2120 return 0;
2121 }
2122 if ((off_t)size > (filesize - f_offset)) {
2123 size = (u_int)(filesize - f_offset);
2124 }
2125 pages_in_prefetch = (size + (PAGE_SIZE - 1)) / PAGE_SIZE;
2126
2127 advisory_read_ext(vp, filesize, f_offset, size, callback, callback_arg, bflag);
2128
2129 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
2130 (int)f_offset + size, pages_in_prefetch, 0, 1, 0);
2131
2132 return pages_in_prefetch;
2133 }
2134
2135
2136
/*
 * Sequential read-ahead heuristic.  Given the extent just read
 * ([b_addr, e_addr], in page numbers) and the per-stream read-ahead
 * context 'rap' (cl_lastr = last page previously read, cl_ralen =
 * current read-ahead run length in pages, cl_maxra = highest page
 * already prefetched), decide whether to issue a new prefetch and
 * update the context accordingly.  Caller updates rap->cl_lastr.
 */
static void
cluster_read_ahead(vnode_t vp, struct cl_extent *extent, off_t filesize, struct cl_readahead *rap, int (*callback)(buf_t, void *), void *callback_arg,
    int bflag)
{
	daddr64_t r_addr;
	off_t f_offset;
	int size_of_prefetch;
	u_int max_prefetch;


	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_START,
	    (int)extent->b_addr, (int)extent->e_addr, (int)rap->cl_lastr, 0, 0);

	/* single-page re-read of the last page... no new sequential signal */
	if (extent->b_addr == rap->cl_lastr && extent->b_addr == extent->e_addr) {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
		    rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 0, 0);
		return;
	}
	/*
	 * access pattern is not sequential (doesn't start at or just after
	 * the last page read)... reset the read-ahead state and bail
	 */
	if (rap->cl_lastr == -1 || (extent->b_addr != rap->cl_lastr && extent->b_addr != (rap->cl_lastr + 1))) {
		rap->cl_ralen = 0;
		rap->cl_maxra = 0;

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
		    rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 1, 0);

		return;
	}
	max_prefetch = MAX_PREFETCH(vp, cluster_max_io_size(vp->v_mount, CL_READ), disk_conditioner_mount_is_ssd(vp->v_mount));

	/* clamp to the system-wide speculative prefetch ceiling */
	if (max_prefetch > speculative_prefetch_max) {
		max_prefetch = speculative_prefetch_max;
	}

	if (max_prefetch <= PAGE_SIZE) {
		/* prefetch budget too small to be worth issuing */
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
		    rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 6, 0);
		return;
	}
	if (extent->e_addr < rap->cl_maxra && rap->cl_ralen >= 4) {
		/*
		 * the read is still well inside the already-prefetched window;
		 * only top up once we've consumed all but a quarter of the run
		 */
		if ((rap->cl_maxra - extent->e_addr) > (rap->cl_ralen / 4)) {
			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
			    rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 2, 0);
			return;
		}
	}
	/* first page past both the current read and any prior prefetch */
	r_addr = MAX(extent->e_addr, rap->cl_maxra) + 1;
	f_offset = (off_t)(r_addr * PAGE_SIZE_64);

	size_of_prefetch = 0;

	/* skip the prefetch if the first target page is already resident */
	ubc_range_op(vp, f_offset, f_offset + PAGE_SIZE_64, UPL_ROP_PRESENT, &size_of_prefetch);

	if (size_of_prefetch) {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
		    rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 3, 0);
		return;
	}
	if (f_offset < filesize) {
		daddr64_t read_size;

		/* exponentially grow the run length, capped at max_prefetch */
		rap->cl_ralen = rap->cl_ralen ? min(max_prefetch / PAGE_SIZE, rap->cl_ralen << 1) : 1;

		read_size = (extent->e_addr + 1) - extent->b_addr;

		/* never prefetch less than the caller just read (up to the cap) */
		if (read_size > rap->cl_ralen) {
			if (read_size > max_prefetch / PAGE_SIZE) {
				rap->cl_ralen = max_prefetch / PAGE_SIZE;
			} else {
				rap->cl_ralen = (int)read_size;
			}
		}
		size_of_prefetch = cluster_read_prefetch(vp, f_offset, rap->cl_ralen * PAGE_SIZE, filesize, callback, callback_arg, bflag);

		if (size_of_prefetch) {
			rap->cl_maxra = (r_addr + size_of_prefetch) - 1;
		}
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
	    rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 4, 0);
}
2217
2218
/*
 * Page-out entry point used by the VM pager; identical to
 * cluster_pageout_ext() with no I/O-completion callback.
 */
int
cluster_pageout(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset,
    int size, off_t filesize, int flags)
{
	return cluster_pageout_ext(vp, upl, upl_offset, f_offset, size, filesize, flags, NULL, NULL);
}
2225
2226
/*
 * Extended page-out entry point used by the VM pager.
 *
 * Translates the UPL_* request flags into CL_* cluster flags,
 * validates the request, trims the I/O to EOF, aborts any trailing
 * pages of the UPL beyond the rounded I/O size, and issues the write
 * through cluster_io().
 *
 * Returns 0 on success, EINVAL on a malformed request, EROFS for a
 * read-only mount, or the error from cluster_io().
 */
int
cluster_pageout_ext(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset,
    int size, off_t filesize, int flags, int (*callback)(buf_t, void *), void *callback_arg)
{
	int io_size;
	int rounded_size;
	off_t max_size;
	int local_flags;

	local_flags = CL_PAGEOUT | CL_THROTTLE;

	if ((flags & UPL_IOSYNC) == 0) {
		local_flags |= CL_ASYNC;
	}
	if ((flags & UPL_NOCOMMIT) == 0) {
		local_flags |= CL_COMMIT;
	}
	if ((flags & UPL_KEEPCACHED)) {
		local_flags |= CL_KEEPCACHED;
	}
	if (flags & UPL_PAGING_ENCRYPTED) {
		local_flags |= CL_ENCRYPTED;
	}


	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 52)) | DBG_FUNC_NONE,
	    (int)f_offset, size, (int)filesize, local_flags, 0);

	/*
	 * If they didn't specify any I/O, then we are done...
	 * we can't issue an abort because we don't know how
	 * big the upl really is
	 */
	if (size <= 0) {
		return EINVAL;
	}

	if (vp->v_mount->mnt_flag & MNT_RDONLY) {
		if (local_flags & CL_COMMIT) {
			ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
		}
		return EROFS;
	}
	/*
	 * can't page-out to a negative offset
	 * or if we're starting beyond the EOF
	 * or if the file offset isn't page aligned
	 * or the size requested isn't a multiple of PAGE_SIZE
	 */
	if (f_offset < 0 || f_offset >= filesize ||
	    (f_offset & PAGE_MASK_64) || (size & PAGE_MASK)) {
		if (local_flags & CL_COMMIT) {
			ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
		}
		return EINVAL;
	}
	max_size = filesize - f_offset;

	/* trim the I/O so it doesn't extend past EOF */
	if (size < max_size) {
		io_size = size;
	} else {
		io_size = (int)max_size;
	}

	rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;

	/* release any pages of the UPL beyond the rounded I/O size */
	if (size > rounded_size) {
		if (local_flags & CL_COMMIT) {
			ubc_upl_abort_range(upl, upl_offset + rounded_size, size - rounded_size,
			    UPL_ABORT_FREE_ON_EMPTY);
		}
	}
	return cluster_io(vp, upl, upl_offset, f_offset, io_size,
	           local_flags, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
}
2302
2303
/*
 * Page-in entry point used by the VM pager; identical to
 * cluster_pagein_ext() with no I/O-completion callback.
 */
int
cluster_pagein(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset,
    int size, off_t filesize, int flags)
{
	return cluster_pagein_ext(vp, upl, upl_offset, f_offset, size, filesize, flags, NULL, NULL);
}
2310
2311
2312 int
cluster_pagein_ext(vnode_t vp,upl_t upl,upl_offset_t upl_offset,off_t f_offset,int size,off_t filesize,int flags,int (* callback)(buf_t,void *),void * callback_arg)2313 cluster_pagein_ext(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset,
2314 int size, off_t filesize, int flags, int (*callback)(buf_t, void *), void *callback_arg)
2315 {
2316 u_int io_size;
2317 int rounded_size;
2318 off_t max_size;
2319 int retval;
2320 int local_flags = 0;
2321
2322 if (upl == NULL || size < 0) {
2323 panic("cluster_pagein: NULL upl passed in");
2324 }
2325
2326 if ((flags & UPL_IOSYNC) == 0) {
2327 local_flags |= CL_ASYNC;
2328 }
2329 if ((flags & UPL_NOCOMMIT) == 0) {
2330 local_flags |= CL_COMMIT;
2331 }
2332 if (flags & UPL_IOSTREAMING) {
2333 local_flags |= CL_IOSTREAMING;
2334 }
2335 if (flags & UPL_PAGING_ENCRYPTED) {
2336 local_flags |= CL_ENCRYPTED;
2337 }
2338
2339
2340 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 56)) | DBG_FUNC_NONE,
2341 (int)f_offset, size, (int)filesize, local_flags, 0);
2342
2343 /*
2344 * can't page-in from a negative offset
2345 * or if we're starting beyond the EOF
2346 * or if the file offset isn't page aligned
2347 * or the size requested isn't a multiple of PAGE_SIZE
2348 */
2349 if (f_offset < 0 || f_offset >= filesize ||
2350 (f_offset & PAGE_MASK_64) || (size & PAGE_MASK) || (upl_offset & PAGE_MASK)) {
2351 if (local_flags & CL_COMMIT) {
2352 ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
2353 }
2354
2355 if (f_offset >= filesize) {
2356 kernel_triage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_CLUSTER, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_CL_PGIN_PAST_EOF), 0 /* arg */);
2357 }
2358
2359 return EINVAL;
2360 }
2361 max_size = filesize - f_offset;
2362
2363 if (size < max_size) {
2364 io_size = size;
2365 } else {
2366 io_size = (int)max_size;
2367 }
2368
2369 rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
2370
2371 if (size > rounded_size && (local_flags & CL_COMMIT)) {
2372 ubc_upl_abort_range(upl, upl_offset + rounded_size,
2373 size - rounded_size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
2374 }
2375
2376 retval = cluster_io(vp, upl, upl_offset, f_offset, io_size,
2377 local_flags | CL_READ | CL_PAGEIN, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
2378
2379 return retval;
2380 }
2381
2382
/*
 * Issue the I/O described by buffer 'bp' through the cluster layer;
 * identical to cluster_bp_ext() with no I/O-completion callback.
 */
int
cluster_bp(buf_t bp)
{
	return cluster_bp_ext(bp, NULL, NULL);
}
2388
2389
2390 int
cluster_bp_ext(buf_t bp,int (* callback)(buf_t,void *),void * callback_arg)2391 cluster_bp_ext(buf_t bp, int (*callback)(buf_t, void *), void *callback_arg)
2392 {
2393 off_t f_offset;
2394 int flags;
2395
2396 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 19)) | DBG_FUNC_START,
2397 bp, (int)bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);
2398
2399 if (bp->b_flags & B_READ) {
2400 flags = CL_ASYNC | CL_READ;
2401 } else {
2402 flags = CL_ASYNC;
2403 }
2404 if (bp->b_flags & B_PASSIVE) {
2405 flags |= CL_PASSIVE;
2406 }
2407
2408 f_offset = ubc_blktooff(bp->b_vp, bp->b_lblkno);
2409
2410 return cluster_io(bp->b_vp, bp->b_upl, 0, f_offset, bp->b_bcount, flags, bp, (struct clios *)NULL, callback, callback_arg);
2411 }
2412
2413
2414
/*
 * Top-level cluster write entry point; identical to
 * cluster_write_ext() with no I/O-completion callback.
 */
int
cluster_write(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, off_t headOff, off_t tailOff, int xflags)
{
	return cluster_write_ext(vp, uio, oldEOF, newEOF, headOff, tailOff, xflags, NULL, NULL);
}
2420
2421
/*
 * Top-level cluster write dispatcher.
 *
 * Chooses between the cached-copy (IO_COPY), direct (IO_DIRECT) and
 * physically-contiguous (IO_CONTIG) write paths for each vector of the
 * uio, and handles head/tail zero-filling around the user data.
 * A NULL uio means "zero-fill only": the head/tail range is zeroed via
 * cluster_write_copy() and no user data is transferred.
 *
 * oldEOF/newEOF are the file's EOF before and after this write;
 * headOff/tailOff bound the optional zero-fill regions.  Returns 0 or
 * an errno from the underlying write path.
 */
int
cluster_write_ext(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, off_t headOff, off_t tailOff,
    int xflags, int (*callback)(buf_t, void *), void *callback_arg)
{
	user_ssize_t cur_resid;
	int retval = 0;
	int flags;
	int zflags;
	int bflag;
	int write_type = IO_COPY;
	u_int32_t write_length;

	flags = xflags;

	if (flags & IO_PASSIVE) {
		bflag = CL_PASSIVE;
	} else {
		bflag = 0;
	}

	/* vnode marked no-cache... force the uncached path */
	if (vp->v_flag & VNOCACHE_DATA) {
		flags |= IO_NOCACHE;
		bflag |= CL_NOCACHE;
	}
	if (uio == NULL) {
		/*
		 * no user data...
		 * this call is being made to zero-fill some range in the file
		 */
		retval = cluster_write_copy(vp, NULL, (u_int32_t)0, oldEOF, newEOF, headOff, tailOff, flags, callback, callback_arg);

		return retval;
	}
	/*
	 * do a write through the cache if one of the following is true....
	 *   NOCACHE is not true or NODIRECT is true
	 *   the uio request doesn't target USERSPACE
	 * otherwise, find out if we want the direct or contig variant for
	 * the first vector in the uio request
	 */
	if (((flags & (IO_NOCACHE | IO_NODIRECT)) == IO_NOCACHE) && UIO_SEG_IS_USER_SPACE(uio->uio_segflg)) {
		retval = cluster_io_type(uio, &write_type, &write_length, MIN_DIRECT_WRITE_SIZE);
	}

	if ((flags & (IO_TAILZEROFILL | IO_HEADZEROFILL)) && write_type == IO_DIRECT) {
		/*
		 * must go through the cached variant in this case
		 */
		write_type = IO_COPY;
	}

	while ((cur_resid = uio_resid(uio)) && uio->uio_offset < newEOF && retval == 0) {
		switch (write_type) {
		case IO_COPY:
			/*
			 * make sure the uio_resid isn't too big...
			 * internally, we want to handle all of the I/O in
			 * chunk sizes that fit in a 32 bit int
			 */
			if (cur_resid > (user_ssize_t)(MAX_IO_REQUEST_SIZE)) {
				/*
				 * we're going to have to call cluster_write_copy
				 * more than once...
				 *
				 * only want the last call to cluster_write_copy to
				 * have the IO_TAILZEROFILL flag set and only the
				 * first call should have IO_HEADZEROFILL
				 */
				zflags = flags & ~IO_TAILZEROFILL;
				flags &= ~IO_HEADZEROFILL;

				write_length = MAX_IO_REQUEST_SIZE;
			} else {
				/*
				 * last call to cluster_write_copy
				 */
				zflags = flags;

				write_length = (u_int32_t)cur_resid;
			}
			retval = cluster_write_copy(vp, uio, write_length, oldEOF, newEOF, headOff, tailOff, zflags, callback, callback_arg);
			break;

		case IO_CONTIG:
			zflags = flags & ~(IO_TAILZEROFILL | IO_HEADZEROFILL);

			if (flags & IO_HEADZEROFILL) {
				/*
				 * only do this once per request
				 */
				flags &= ~IO_HEADZEROFILL;

				/* zero-fill [headOff, uio_offset) before the contig write */
				retval = cluster_write_copy(vp, (struct uio *)0, (u_int32_t)0, (off_t)0, uio->uio_offset,
				    headOff, (off_t)0, zflags | IO_HEADZEROFILL | IO_SYNC, callback, callback_arg);
				if (retval) {
					break;
				}
			}
			retval = cluster_write_contig(vp, uio, newEOF, &write_type, &write_length, callback, callback_arg, bflag);

			if (retval == 0 && (flags & IO_TAILZEROFILL) && uio_resid(uio) == 0) {
				/*
				 * we're done with the data from the user specified buffer(s)
				 * and we've been requested to zero fill at the tail
				 * treat this as an IO_HEADZEROFILL which doesn't require a uio
				 * by rearranging the args and passing in IO_HEADZEROFILL
				 */

				/*
				 * Update the oldEOF to reflect the current EOF. If the UPL page
				 * to zero-fill is not valid (when F_NOCACHE is set), the
				 * cluster_write_copy() will perform RMW on the UPL page when
				 * the oldEOF is not aligned on page boundary due to unaligned
				 * write.
				 */
				if (uio->uio_offset > oldEOF) {
					oldEOF = uio->uio_offset;
				}
				retval = cluster_write_copy(vp, (struct uio *)0, (u_int32_t)0, (off_t)oldEOF, tailOff, uio->uio_offset,
				    (off_t)0, zflags | IO_HEADZEROFILL | IO_SYNC, callback, callback_arg);
			}
			break;

		case IO_DIRECT:
			/*
			 * cluster_write_direct is never called with IO_TAILZEROFILL || IO_HEADZEROFILL
			 */
			retval = cluster_write_direct(vp, uio, oldEOF, newEOF, &write_type, &write_length, flags, callback, callback_arg);
			break;

		case IO_UNKNOWN:
			/* re-probe the next uio vector to pick a write path */
			retval = cluster_io_type(uio, &write_type, &write_length, MIN_DIRECT_WRITE_SIZE);
			break;
		}
		/*
		 * in case we end up calling cluster_write_copy (from cluster_write_direct)
		 * multiple times to service a multi-vector request that is not aligned properly
		 * we need to update the oldEOF so that we
		 * don't zero-fill the head of a page if we've successfully written
		 * data to that area... 'cluster_write_copy' will zero-fill the head of a
		 * page that is beyond the oldEOF if the write is unaligned... we only
		 * want that to happen for the very first page of the cluster_write,
		 * NOT the first page of each vector making up a multi-vector write.
		 */
		if (uio->uio_offset > oldEOF) {
			oldEOF = uio->uio_offset;
		}
	}
	return retval;
}
2572
2573
/*
 * Direct (uncached) write path.
 *
 * Wires the user's buffer pages via vm_map_get_upl() and issues the
 * I/O straight from them through cluster_io(), bypassing the buffer
 * cache.  Multi-vector uios are batched into a vector UPL when
 * possible.  Both the file offset and the user buffer must be
 * suitably aligned (page / device-block); any misaligned head or
 * unhandled tail of the request falls back to cluster_write_copy().
 *
 * On return *write_type/*write_length describe how the next uio
 * vector should be handled.  Returns 0, EAGAIN when throttled and
 * IO_RETURN_ON_THROTTLE is set, or an errno.
 */
static int
cluster_write_direct(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, int *write_type, u_int32_t *write_length,
    int flags, int (*callback)(buf_t, void *), void *callback_arg)
{
	upl_t upl;
	upl_page_info_t *pl;
	vm_offset_t upl_offset;
	vm_offset_t vector_upl_offset = 0;
	u_int32_t io_req_size;
	u_int32_t offset_in_file;
	u_int32_t offset_in_iovbase;
	u_int32_t io_size;
	int io_flag = 0;
	upl_size_t upl_size, vector_upl_size = 0;
	vm_size_t upl_needed_size;
	mach_msg_type_number_t pages_in_pl;
	upl_control_flags_t upl_flags;
	kern_return_t kret;
	mach_msg_type_number_t i;
	int force_data_sync;
	int retval = 0;
	int first_IO = 1;
	struct clios iostate;
	user_addr_t iov_base;
	u_int32_t mem_alignment_mask;
	u_int32_t devblocksize;
	u_int32_t max_io_size;
	u_int32_t max_upl_size;
	u_int32_t max_vector_size;
	u_int32_t bytes_outstanding_limit;
	boolean_t io_throttled = FALSE;

	u_int32_t vector_upl_iosize = 0;
	int issueVectorUPL = 0, useVectorUPL = (uio->uio_iovcnt > 1);
	off_t v_upl_uio_offset = 0;
	int vector_upl_index = 0;
	upl_t vector_upl = NULL;


	/*
	 * When we enter this routine, we know
	 *  -- the resid will not exceed iov_len
	 */
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_START,
	    (int)uio->uio_offset, *write_length, (int)newEOF, 0, 0);

	assert(vm_map_page_shift(current_map()) >= PAGE_SHIFT);

	max_upl_size = cluster_max_io_size(vp->v_mount, CL_WRITE);

	io_flag = CL_ASYNC | CL_PRESERVE | CL_COMMIT | CL_THROTTLE | CL_DIRECT_IO;

	if (flags & IO_PASSIVE) {
		io_flag |= CL_PASSIVE;
	}

	if (flags & IO_NOCACHE) {
		io_flag |= CL_NOCACHE;
	}

	if (flags & IO_SKIP_ENCRYPTION) {
		io_flag |= CL_ENCRYPTED;
	}

	/* per-stream async I/O accounting shared with the completion path */
	iostate.io_completed = 0;
	iostate.io_issued = 0;
	iostate.io_error = 0;
	iostate.io_wanted = 0;

	lck_mtx_init(&iostate.io_mtxp, &cl_mtx_grp, LCK_ATTR_NULL);

	mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask;
	devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize;

	if (devblocksize == 1) {
		/*
		 * the AFP client advertises a devblocksize of 1
		 * however, its BLOCKMAP routine maps to physical
		 * blocks that are PAGE_SIZE in size...
		 * therefore we can't ask for I/Os that aren't page aligned
		 * or aren't multiples of PAGE_SIZE in size
		 * by setting devblocksize to PAGE_SIZE, we re-instate
		 * the old behavior we had before the mem_alignment_mask
		 * changes went in...
		 */
		devblocksize = PAGE_SIZE;
	}

next_dwrite:
	io_req_size = *write_length;
	iov_base = uio_curriovbase(uio);

	offset_in_file = (u_int32_t)uio->uio_offset & PAGE_MASK;
	offset_in_iovbase = (u_int32_t)iov_base & mem_alignment_mask;

	if (offset_in_file || offset_in_iovbase) {
		/*
		 * one of the 2 important offsets is misaligned
		 * so fire an I/O through the cache for this entire vector
		 */
		goto wait_for_dwrites;
	}
	if (iov_base & (devblocksize - 1)) {
		/*
		 * the offset in memory must be on a device block boundary
		 * so that we can guarantee that we can generate an
		 * I/O that ends on a page boundary in cluster_io
		 */
		goto wait_for_dwrites;
	}

	task_update_logical_writes(current_task(), (io_req_size & ~PAGE_MASK), TASK_WRITE_IMMEDIATE, vp);
	while (io_req_size >= PAGE_SIZE && uio->uio_offset < newEOF && retval == 0) {
		int throttle_type;

		if ((throttle_type = cluster_is_throttled(vp))) {
			/*
			 * we're in the throttle window, at the very least
			 * we want to limit the size of the I/O we're about
			 * to issue
			 */
			if ((flags & IO_RETURN_ON_THROTTLE) && throttle_type == THROTTLE_NOW) {
				/*
				 * we're in the throttle window and at least 1 I/O
				 * has already been issued by a throttleable thread
				 * in this window, so return with EAGAIN to indicate
				 * to the FS issuing the cluster_write call that it
				 * should now throttle after dropping any locks
				 */
				throttle_info_update_by_mount(vp->v_mount);

				io_throttled = TRUE;
				goto wait_for_dwrites;
			}
			max_vector_size = THROTTLE_MAX_IOSIZE;
			max_io_size = THROTTLE_MAX_IOSIZE;
		} else {
			max_vector_size = MAX_VECTOR_UPL_SIZE;
			max_io_size = max_upl_size;
		}

		if (first_IO) {
			/* flush any cached dirty data for this file before going direct */
			cluster_syncup(vp, newEOF, callback, callback_arg, callback ? PUSH_SYNC : 0);
			first_IO = 0;
		}
		io_size = io_req_size & ~PAGE_MASK;
		iov_base = uio_curriovbase(uio);

		if (io_size > max_io_size) {
			io_size = max_io_size;
		}

		if (useVectorUPL && (iov_base & PAGE_MASK)) {
			/*
			 * We have an iov_base that's not page-aligned.
			 * Issue all I/O's that have been collected within
			 * this Vectored UPL.
			 */
			if (vector_upl_index) {
				retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
				reset_vector_run_state();
			}

			/*
			 * After this point, if we are using the Vector UPL path and the base is
			 * not page-aligned then the UPL with that base will be the first in the vector UPL.
			 */
		}

		upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK);
		upl_needed_size = (upl_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_START,
		    (int)upl_offset, upl_needed_size, (int)iov_base, io_size, 0);

		vm_map_t map = UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ? current_map() : kernel_map;
		/*
		 * try up to 3 times to wire the user pages, escalating the
		 * data-sync level each time we fail to get them all valid
		 */
		for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) {
			pages_in_pl = 0;
			upl_size = (upl_size_t)upl_needed_size;
			upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
			    UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;

			kret = vm_map_get_upl(map,
			    (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
			    &upl_size,
			    &upl,
			    NULL,
			    &pages_in_pl,
			    &upl_flags,
			    VM_KERN_MEMORY_FILE,
			    force_data_sync);

			if (kret != KERN_SUCCESS) {
				KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
				    0, 0, 0, kret, 0);
				/*
				 * failed to get pagelist
				 *
				 * we may have already spun some portion of this request
				 * off as async requests... we need to wait for the I/O
				 * to complete before returning
				 */
				goto wait_for_dwrites;
			}
			pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
			pages_in_pl = upl_size / PAGE_SIZE;

			for (i = 0; i < pages_in_pl; i++) {
				if (!upl_valid_page(pl, i)) {
					break;
				}
			}
			if (i == pages_in_pl) {
				/* every page came back valid... we're good to go */
				break;
			}

			/*
			 * didn't get all the pages back that we
			 * needed... release this upl and try again
			 */
			ubc_upl_abort(upl, 0);
		}
		if (force_data_sync >= 3) {
			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
			    i, pages_in_pl, upl_size, kret, 0);
			/*
			 * for some reason, we couldn't acquire a hold on all
			 * the pages needed in the user's address space
			 *
			 * we may have already spun some portion of this request
			 * off as async requests... we need to wait for the I/O
			 * to complete before returning
			 */
			goto wait_for_dwrites;
		}

		/*
		 * Consider the possibility that upl_size wasn't satisfied.
		 */
		if (upl_size < upl_needed_size) {
			if (upl_size && upl_offset == 0) {
				io_size = upl_size;
			} else {
				io_size = 0;
			}
		}
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
		    (int)upl_offset, upl_size, (int)iov_base, io_size, 0);

		if (io_size == 0) {
			ubc_upl_abort(upl, 0);
			/*
			 * we may have already spun some portion of this request
			 * off as async requests... we need to wait for the I/O
			 * to complete before returning
			 */
			goto wait_for_dwrites;
		}

		if (useVectorUPL) {
			vm_offset_t end_off = ((iov_base + io_size) & PAGE_MASK);
			if (end_off) {
				issueVectorUPL = 1;
			}
			/*
			 * After this point, if we are using a vector UPL, then
			 * either all the UPL elements end on a page boundary OR
			 * this UPL is the last element because it does not end
			 * on a page boundary.
			 */
		}

		/*
		 * we want push out these writes asynchronously so that we can overlap
		 * the preparation of the next I/O
		 * if there are already too many outstanding writes
		 * wait until some complete before issuing the next
		 */
		if (vp->v_mount->mnt_minsaturationbytecount) {
			bytes_outstanding_limit = vp->v_mount->mnt_minsaturationbytecount;
		} else {
			bytes_outstanding_limit = max_upl_size * IO_SCALE(vp, 2);
		}

		cluster_iostate_wait(&iostate, bytes_outstanding_limit, "cluster_write_direct");

		if (iostate.io_error) {
			/*
			 * one of the earlier writes we issued ran into a hard error
			 * don't issue any more writes, cleanup the UPL
			 * that was just created but not used, then
			 * go wait for all writes that are part of this stream
			 * to complete before returning the error to the caller
			 */
			ubc_upl_abort(upl, 0);

			goto wait_for_dwrites;
		}

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_START,
		    (int)upl_offset, (int)uio->uio_offset, io_size, io_flag, 0);

		if (!useVectorUPL) {
			retval = cluster_io(vp, upl, upl_offset, uio->uio_offset,
			    io_size, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
		} else {
			/* accumulate this UPL into the current vector run */
			if (!vector_upl_index) {
				vector_upl = vector_upl_create(upl_offset);
				v_upl_uio_offset = uio->uio_offset;
				vector_upl_offset = upl_offset;
			}

			vector_upl_set_subupl(vector_upl, upl, upl_size);
			vector_upl_set_iostate(vector_upl, upl, vector_upl_size, upl_size);
			vector_upl_index++;
			vector_upl_iosize += io_size;
			vector_upl_size += upl_size;

			/* flush the run when full or when this element forced an end */
			if (issueVectorUPL || vector_upl_index == MAX_VECTOR_UPL_ELEMENTS || vector_upl_size >= max_vector_size) {
				retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
				reset_vector_run_state();
			}
		}

		/*
		 * update the uio structure to
		 * reflect the I/O that we just issued
		 */
		uio_update(uio, (user_size_t)io_size);

		/*
		 * in case we end up calling through to cluster_write_copy to finish
		 * the tail of this request, we need to update the oldEOF so that we
		 * don't zero-fill the head of a page if we've successfully written
		 * data to that area... 'cluster_write_copy' will zero-fill the head of a
		 * page that is beyond the oldEOF if the write is unaligned... we only
		 * want that to happen for the very first page of the cluster_write,
		 * NOT the first page of each vector making up a multi-vector write.
		 */
		if (uio->uio_offset > oldEOF) {
			oldEOF = uio->uio_offset;
		}

		io_req_size -= io_size;

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_END,
		    (int)upl_offset, (int)uio->uio_offset, io_req_size, retval, 0);
	} /* end while */

	if (retval == 0 && iostate.io_error == 0 && io_req_size == 0) {
		/* this vector is done... see if the next one is also direct */
		retval = cluster_io_type(uio, write_type, write_length, MIN_DIRECT_WRITE_SIZE);

		if (retval == 0 && *write_type == IO_DIRECT) {
			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_NONE,
			    (int)uio->uio_offset, *write_length, (int)newEOF, 0, 0);

			goto next_dwrite;
		}
	}

wait_for_dwrites:

	/* flush any partially-accumulated vector UPL run */
	if (retval == 0 && iostate.io_error == 0 && useVectorUPL && vector_upl_index) {
		retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
		reset_vector_run_state();
	}
	/*
	 * make sure all async writes issued as part of this stream
	 * have completed before we return
	 */
	cluster_iostate_wait(&iostate, 0, "cluster_write_direct");

	if (iostate.io_error) {
		retval = iostate.io_error;
	}

	lck_mtx_destroy(&iostate.io_mtxp, &cl_mtx_grp);

	if (io_throttled == TRUE && retval == 0) {
		retval = EAGAIN;
	}

	if (io_req_size && retval == 0) {
		/*
		 * we couldn't handle the tail of this request in DIRECT mode
		 * so fire it through the copy path
		 *
		 * note that flags will never have IO_HEADZEROFILL or IO_TAILZEROFILL set
		 * so we can just pass 0 in for the headOff and tailOff
		 */
		if (uio->uio_offset > oldEOF) {
			oldEOF = uio->uio_offset;
		}

		retval = cluster_write_copy(vp, uio, io_req_size, oldEOF, newEOF, (off_t)0, (off_t)0, flags, callback, callback_arg);

		*write_type = IO_UNKNOWN;
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_END,
	    (int)uio->uio_offset, io_req_size, retval, 4, 0);

	return retval;
}
2977
2978
/*
 * cluster_write_contig
 *
 * Write path used when the source buffer is physically contiguous memory
 * (the caller has already established this).  The buffer is wired down via
 * vm_map_get_upl and the data is issued directly to the device with
 * CL_DEV_MEMORY, bypassing the buffer cache.  Head and tail fragments that
 * are not device-block aligned are handled via read-modify-write through
 * cluster_align_phys_io.  Up to MAX_VECTS contiguous segments may be
 * chained through 'next_cwrite' before we wait for all outstanding writes
 * and release the UPLs.
 *
 * Returns 0 on success, or an errno-style error code.
 */
static int
cluster_write_contig(vnode_t vp, struct uio *uio, off_t newEOF, int *write_type, u_int32_t *write_length,
    int (*callback)(buf_t, void *), void *callback_arg, int bflag)
{
	upl_page_info_t *pl;
	addr64_t src_paddr = 0;
	upl_t upl[MAX_VECTS];		/* one UPL per contiguous segment written */
	vm_offset_t upl_offset;
	u_int32_t tail_size = 0;	/* sub-devblock residue, handled after the main loop */
	u_int32_t io_size;
	u_int32_t xsize;
	upl_size_t upl_size;
	vm_size_t upl_needed_size;
	mach_msg_type_number_t pages_in_pl;
	upl_control_flags_t upl_flags;
	kern_return_t kret;
	struct clios iostate;		/* shared with async completions; protected by io_mtxp */
	int error = 0;
	int cur_upl = 0;
	int num_upl = 0;
	int n;
	user_addr_t iov_base;
	u_int32_t devblocksize;
	u_int32_t mem_alignment_mask;

	/*
	 * When we enter this routine, we know
	 *  -- the io_req_size will not exceed iov_len
	 *  -- the target address is physically contiguous
	 */
	cluster_syncup(vp, newEOF, callback, callback_arg, callback ? PUSH_SYNC : 0);

	devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize;
	mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask;

	iostate.io_completed = 0;
	iostate.io_issued = 0;
	iostate.io_error = 0;
	iostate.io_wanted = 0;

	lck_mtx_init(&iostate.io_mtxp, &cl_mtx_grp, LCK_ATTR_NULL);

next_cwrite:
	io_size = *write_length;

	iov_base = uio_curriovbase(uio);

	upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK);
	upl_needed_size = upl_offset + io_size;

	pages_in_pl = 0;
	upl_size = (upl_size_t)upl_needed_size;
	upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
	    UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;

	/*
	 * wire down the pages backing the source buffer
	 */
	vm_map_t map = UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ? current_map() : kernel_map;
	kret = vm_map_get_upl(map,
	    vm_map_trunc_page(iov_base, vm_map_page_mask(map)),
	    &upl_size, &upl[cur_upl], NULL, &pages_in_pl, &upl_flags, VM_KERN_MEMORY_FILE, 0);

	if (kret != KERN_SUCCESS) {
		/*
		 * failed to get pagelist
		 */
		error = EINVAL;
		goto wait_for_cwrites;
	}
	num_upl++;

	/*
	 * Consider the possibility that upl_size wasn't satisfied.
	 */
	if (upl_size < upl_needed_size) {
		/*
		 * This is a failure in the physical memory case.
		 */
		error = EINVAL;
		goto wait_for_cwrites;
	}
	pl = ubc_upl_pageinfo(upl[cur_upl]);

	src_paddr = ((addr64_t)upl_phys_page(pl, 0) << PAGE_SHIFT) + (addr64_t)upl_offset;

	/*
	 * handle a head fragment that doesn't start on a device block
	 * boundary (or a request smaller than one device block) with a
	 * read-modify-write via cluster_align_phys_io
	 */
	while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {
		u_int32_t head_size;

		head_size = devblocksize - (u_int32_t)(uio->uio_offset & (devblocksize - 1));

		if (head_size > io_size) {
			head_size = io_size;
		}

		error = cluster_align_phys_io(vp, uio, src_paddr, head_size, 0, callback, callback_arg);

		if (error) {
			goto wait_for_cwrites;
		}

		upl_offset += head_size;
		src_paddr += head_size;
		io_size -= head_size;

		iov_base += head_size;
	}
	if ((u_int32_t)iov_base & mem_alignment_mask) {
		/*
		 * request doesn't set up on a memory boundary
		 * the underlying DMA engine can handle...
		 * return an error instead of going through
		 * the slow copy path since the intent of this
		 * path is direct I/O from device memory
		 */
		error = EINVAL;
		goto wait_for_cwrites;
	}

	/* defer the sub-devblock tail until after the main loop */
	tail_size = io_size & (devblocksize - 1);
	io_size -= tail_size;

	while (io_size && error == 0) {
		if (io_size > MAX_IO_CONTIG_SIZE) {
			xsize = MAX_IO_CONTIG_SIZE;
		} else {
			xsize = io_size;
		}
		/*
		 * request asynchronously so that we can overlap
		 * the preparation of the next I/O... we'll do
		 * the commit after all the I/O has completed
		 * since it's all issued against the same UPL
		 * if there are already too many outstanding writes
		 * wait until some have completed before issuing the next
		 */
		cluster_iostate_wait(&iostate, MAX_IO_CONTIG_SIZE * IO_SCALE(vp, 2), "cluster_write_contig");

		if (iostate.io_error) {
			/*
			 * one of the earlier writes we issued ran into a hard error
			 * don't issue any more writes...
			 * go wait for all writes that are part of this stream
			 * to complete before returning the error to the caller
			 */
			goto wait_for_cwrites;
		}
		/*
		 * issue an asynchronous write to cluster_io
		 */
		error = cluster_io(vp, upl[cur_upl], upl_offset, uio->uio_offset,
		    xsize, CL_DEV_MEMORY | CL_ASYNC | bflag, (buf_t)NULL, (struct clios *)&iostate, callback, callback_arg);

		if (error == 0) {
			/*
			 * The cluster_io write completed successfully,
			 * update the uio structure
			 */
			uio_update(uio, (user_size_t)xsize);

			upl_offset += xsize;
			src_paddr += xsize;
			io_size -= xsize;
		}
	}
	if (error == 0 && iostate.io_error == 0 && tail_size == 0 && num_upl < MAX_VECTS) {
		/*
		 * see if the next chunk of the request is also contiguous...
		 * if so, loop back and chain another segment onto this stream
		 */
		error = cluster_io_type(uio, write_type, write_length, 0);

		if (error == 0 && *write_type == IO_CONTIG) {
			cur_upl++;
			goto next_cwrite;
		}
	} else {
		*write_type = IO_UNKNOWN;
	}

wait_for_cwrites:
	/*
	 * make sure all async writes that are part of this stream
	 * have completed before we proceed
	 */
	cluster_iostate_wait(&iostate, 0, "cluster_write_contig");

	if (iostate.io_error) {
		error = iostate.io_error;
	}

	lck_mtx_destroy(&iostate.io_mtxp, &cl_mtx_grp);

	if (error == 0 && tail_size) {
		/* read-modify-write the unaligned tail fragment */
		error = cluster_align_phys_io(vp, uio, src_paddr, tail_size, 0, callback, callback_arg);
	}

	for (n = 0; n < num_upl; n++) {
		/*
		 * just release our hold on each physically contiguous
		 * region without changing any state
		 */
		ubc_upl_abort(upl[n], 0);
	}

	return error;
}
3179
3180
3181 /*
3182 * need to avoid a race between an msync of a range of pages dirtied via mmap
3183 * vs a filesystem such as HFS deciding to write a 'hole' to disk via cluster_write's
3184 * zerofill mechanism before it has seen the VNOP_PAGEOUTs for the pages being msync'd
3185 *
3186 * we should never force-zero-fill pages that are already valid in the cache...
3187 * the entire page contains valid data (either from disk, zero-filled or dirtied
3188 * via an mmap) so we can only do damage by trying to zero-fill
3189 *
3190 */
3191 static int
cluster_zero_range(upl_t upl,upl_page_info_t * pl,int flags,int io_offset,off_t zero_off,off_t upl_f_offset,int bytes_to_zero)3192 cluster_zero_range(upl_t upl, upl_page_info_t *pl, int flags, int io_offset, off_t zero_off, off_t upl_f_offset, int bytes_to_zero)
3193 {
3194 int zero_pg_index;
3195 boolean_t need_cluster_zero = TRUE;
3196
3197 if ((flags & (IO_NOZEROVALID | IO_NOZERODIRTY))) {
3198 bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off & PAGE_MASK_64));
3199 zero_pg_index = (int)((zero_off - upl_f_offset) / PAGE_SIZE_64);
3200
3201 if (upl_valid_page(pl, zero_pg_index)) {
3202 /*
3203 * never force zero valid pages - dirty or clean
3204 * we'll leave these in the UPL for cluster_write_copy to deal with
3205 */
3206 need_cluster_zero = FALSE;
3207 }
3208 }
3209 if (need_cluster_zero == TRUE) {
3210 cluster_zero(upl, io_offset, bytes_to_zero, NULL);
3211 }
3212
3213 return bytes_to_zero;
3214 }
3215
3216
3217 void
cluster_update_state(vnode_t vp,vm_object_offset_t s_offset,vm_object_offset_t e_offset,boolean_t vm_initiated)3218 cluster_update_state(vnode_t vp, vm_object_offset_t s_offset, vm_object_offset_t e_offset, boolean_t vm_initiated)
3219 {
3220 struct cl_extent cl;
3221 boolean_t first_pass = TRUE;
3222
3223 assert(s_offset < e_offset);
3224 assert((s_offset & PAGE_MASK_64) == 0);
3225 assert((e_offset & PAGE_MASK_64) == 0);
3226
3227 cl.b_addr = (daddr64_t)(s_offset / PAGE_SIZE_64);
3228 cl.e_addr = (daddr64_t)(e_offset / PAGE_SIZE_64);
3229
3230 cluster_update_state_internal(vp, &cl, 0, TRUE, &first_pass, s_offset, (int)(e_offset - s_offset),
3231 vp->v_un.vu_ubcinfo->ui_size, NULL, NULL, vm_initiated);
3232 }
3233
3234
/*
 * cluster_update_state_internal
 *
 * Merge the freshly-dirtied page extent 'cl' into the vnode's write-behind
 * cluster state: absorb it into an existing cluster where possible, start a
 * new cluster when there's room, push clusters when all slots are full, and
 * fall back to the sparse cluster map when nothing can be pushed.  Called
 * with no locks held; acquires the writebehind lock via cluster_get_wbp and
 * drops it before returning.
 *
 * 'defer_writes'  -- when TRUE, never push clusters from here
 * 'first_pass'    -- in/out; on the first pass we update the sequential-write
 *                    accounting (cl_seq_written / cl_last_write)
 * 'write_off'/'write_cnt' -- byte offset/length of the originating write,
 *                    used for the sequential-write accounting
 */
static void
cluster_update_state_internal(vnode_t vp, struct cl_extent *cl, int flags, boolean_t defer_writes,
    boolean_t *first_pass, off_t write_off, int write_cnt, off_t newEOF,
    int (*callback)(buf_t, void *), void *callback_arg, boolean_t vm_initiated)
{
	struct cl_writebehind *wbp;
	int cl_index;
	int ret_cluster_try_push;
	u_int max_cluster_pgcount;	/* per-cluster size limit, in pages */


	max_cluster_pgcount = MAX_CLUSTER_SIZE(vp) / PAGE_SIZE;

	/*
	 * take the lock to protect our accesses
	 * of the writebehind and sparse cluster state
	 */
	wbp = cluster_get_wbp(vp, CLW_ALLOCATE | CLW_RETURNLOCKED);

	if (wbp->cl_scmap) {
		if (!(flags & IO_NOCACHE)) {
			/*
			 * we've fallen into the sparse
			 * cluster method of delaying dirty pages
			 */
			sparse_cluster_add(wbp, &(wbp->cl_scmap), vp, cl, newEOF, callback, callback_arg, vm_initiated);

			lck_mtx_unlock(&wbp->cl_lockw);
			return;
		}
		/*
		 * must have done cached writes that fell into
		 * the sparse cluster mechanism... we've switched
		 * to uncached writes on the file, so go ahead
		 * and push whatever's in the sparse map
		 * and switch back to normal clustering
		 */
		wbp->cl_number = 0;

		sparse_cluster_push(wbp, &(wbp->cl_scmap), vp, newEOF, PUSH_ALL, 0, callback, callback_arg, vm_initiated);
		/*
		 * no clusters of either type present at this point
		 * so just go directly to start_new_cluster since
		 * we know we need to delay this I/O since we've
		 * already released the pages back into the cache
		 * to avoid the deadlock with sparse_cluster_push
		 */
		goto start_new_cluster;
	}
	if (*first_pass == TRUE) {
		/* track how many bytes have been written strictly sequentially */
		if (write_off == wbp->cl_last_write) {
			wbp->cl_seq_written += write_cnt;
		} else {
			wbp->cl_seq_written = write_cnt;
		}

		wbp->cl_last_write = write_off + write_cnt;

		*first_pass = FALSE;
	}
	if (wbp->cl_number == 0) {
		/*
		 * no clusters currently present
		 */
		goto start_new_cluster;
	}

	for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
		/*
		 * check each cluster that we currently hold
		 * try to merge some or all of this write into
		 * one or more of the existing clusters... if
		 * any portion of the write remains, start a
		 * new cluster
		 */
		if (cl->b_addr >= wbp->cl_clusters[cl_index].b_addr) {
			/*
			 * the current write starts at or after the current cluster
			 */
			if (cl->e_addr <= (wbp->cl_clusters[cl_index].b_addr + max_cluster_pgcount)) {
				/*
				 * we have a write that fits entirely
				 * within the existing cluster limits
				 */
				if (cl->e_addr > wbp->cl_clusters[cl_index].e_addr) {
					/*
					 * update our idea of where the cluster ends
					 */
					wbp->cl_clusters[cl_index].e_addr = cl->e_addr;
				}
				break;
			}
			if (cl->b_addr < (wbp->cl_clusters[cl_index].b_addr + max_cluster_pgcount)) {
				/*
				 * we have a write that starts in the middle of the current cluster
				 * but extends beyond the cluster's limit... we know this because
				 * of the previous checks
				 * we'll extend the current cluster to the max
				 * and update the b_addr for the current write to reflect that
				 * the head of it was absorbed into this cluster...
				 * note that we'll always have a leftover tail in this case since
				 * full absorption would have occurred in the clause above
				 */
				wbp->cl_clusters[cl_index].e_addr = wbp->cl_clusters[cl_index].b_addr + max_cluster_pgcount;

				cl->b_addr = wbp->cl_clusters[cl_index].e_addr;
			}
			/*
			 * we come here for the case where the current write starts
			 * beyond the limit of the existing cluster or we have a leftover
			 * tail after a partial absorption
			 *
			 * in either case, we'll check the remaining clusters before
			 * starting a new one
			 */
		} else {
			/*
			 * the current write starts in front of the cluster we're currently considering
			 */
			if ((wbp->cl_clusters[cl_index].e_addr - cl->b_addr) <= max_cluster_pgcount) {
				/*
				 * we can just merge the new request into
				 * this cluster and leave it in the cache
				 * since the resulting cluster is still
				 * less than the maximum allowable size
				 */
				wbp->cl_clusters[cl_index].b_addr = cl->b_addr;

				if (cl->e_addr > wbp->cl_clusters[cl_index].e_addr) {
					/*
					 * the current write completely
					 * envelops the existing cluster and since
					 * each write is limited to at most max_cluster_pgcount pages
					 * we can just use the start and last blocknos of the write
					 * to generate the cluster limits
					 */
					wbp->cl_clusters[cl_index].e_addr = cl->e_addr;
				}
				break;
			}
			/*
			 * if we were to combine this write with the current cluster
			 * we would exceed the cluster size limit.... so,
			 * let's see if there's any overlap of the new I/O with
			 * the cluster we're currently considering... in fact, we'll
			 * stretch the cluster out to its full limit and see if we
			 * get an intersection with the current write
			 *
			 */
			if (cl->e_addr > wbp->cl_clusters[cl_index].e_addr - max_cluster_pgcount) {
				/*
				 * the current write extends into the proposed cluster
				 * clip the length of the current write after first combining its
				 * tail with the newly shaped cluster
				 */
				wbp->cl_clusters[cl_index].b_addr = wbp->cl_clusters[cl_index].e_addr - max_cluster_pgcount;

				cl->e_addr = wbp->cl_clusters[cl_index].b_addr;
			}
			/*
			 * if we get here, there was no way to merge
			 * any portion of this write with this cluster
			 * or we could only merge part of it which
			 * will leave a tail...
			 * we'll check the remaining clusters before starting a new one
			 */
		}
	}
	if (cl_index < wbp->cl_number) {
		/*
		 * we found an existing cluster(s) that we
		 * could entirely merge this I/O into
		 */
		goto delay_io;
	}

	if (defer_writes == FALSE &&
	    wbp->cl_number == MAX_CLUSTERS &&
	    wbp->cl_seq_written >= (MAX_CLUSTERS * (max_cluster_pgcount * PAGE_SIZE))) {
		uint32_t n;

		/*
		 * the file is being written sequentially and all cluster
		 * slots are full... pick how many clusters to push based
		 * on the mount's saturation byte count or the SSD/HDD
		 * write-behind defaults
		 */
		if (vp->v_mount->mnt_minsaturationbytecount) {
			n = vp->v_mount->mnt_minsaturationbytecount / MAX_CLUSTER_SIZE(vp);

			if (n > MAX_CLUSTERS) {
				n = MAX_CLUSTERS;
			}
		} else {
			n = 0;
		}

		if (n == 0) {
			if (disk_conditioner_mount_is_ssd(vp->v_mount)) {
				n = WRITE_BEHIND_SSD;
			} else {
				n = WRITE_BEHIND;
			}
		}
		while (n--) {
			cluster_try_push(wbp, vp, newEOF, 0, 0, callback, callback_arg, NULL, vm_initiated);
		}
	}
	if (wbp->cl_number < MAX_CLUSTERS) {
		/*
		 * we didn't find an existing cluster to
		 * merge into, but there's room to start
		 * a new one
		 */
		goto start_new_cluster;
	}
	/*
	 * no existing cluster to merge with and no
	 * room to start a new one... we'll try
	 * pushing one of the existing ones... if none of
	 * them are able to be pushed, we'll switch
	 * to the sparse cluster mechanism
	 * cluster_try_push updates cl_number to the
	 * number of remaining clusters... and
	 * returns the number of currently unused clusters
	 */
	ret_cluster_try_push = 0;

	/*
	 * if writes are not deferred, call cluster push immediately
	 */
	if (defer_writes == FALSE) {
		ret_cluster_try_push = cluster_try_push(wbp, vp, newEOF, (flags & IO_NOCACHE) ? 0 : PUSH_DELAY, 0, callback, callback_arg, NULL, vm_initiated);
	}
	/*
	 * execute following regardless of writes being deferred or not
	 */
	if (ret_cluster_try_push == 0) {
		/*
		 * no more room in the normal cluster mechanism
		 * so let's switch to the more expansive but expensive
		 * sparse mechanism....
		 */
		sparse_cluster_switch(wbp, vp, newEOF, callback, callback_arg, vm_initiated);
		sparse_cluster_add(wbp, &(wbp->cl_scmap), vp, cl, newEOF, callback, callback_arg, vm_initiated);

		lck_mtx_unlock(&wbp->cl_lockw);
		return;
	}
start_new_cluster:
	wbp->cl_clusters[wbp->cl_number].b_addr = cl->b_addr;
	wbp->cl_clusters[wbp->cl_number].e_addr = cl->e_addr;

	wbp->cl_clusters[wbp->cl_number].io_flags = 0;

	if (flags & IO_NOCACHE) {
		wbp->cl_clusters[wbp->cl_number].io_flags |= CLW_IONOCACHE;
	}

	if (flags & IO_PASSIVE) {
		wbp->cl_clusters[wbp->cl_number].io_flags |= CLW_IOPASSIVE;
	}

	wbp->cl_number++;
delay_io:
	lck_mtx_unlock(&wbp->cl_lockw);
	return;
}
3497
3498
/*
 * cluster_write_copy
 *
 * Buffered (copy-through-the-cache) write path.  Maps the affected file
 * pages into a UPL, pre-reads any partially-written page that still has
 * valid data on disk, zero-fills the head/tail ranges implied by
 * IO_HEADZEROFILL / IO_TAILZEROFILL (or by writing past the old EOF),
 * copies the user data in, then commits the pages dirty and either issues
 * the I/O immediately (IO_SYNC) or records the dirtied extent in the
 * write-behind cluster state.
 *
 * 'uio' may be NULL when the caller only needs zero-fill (e.g. ftruncate
 * growing a file); in that case io_resid is 0 and only the zero-fill
 * ranges are processed.
 *
 * Returns 0 on success, or an errno-style error code.
 */
static int
cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t oldEOF, off_t newEOF, off_t headOff,
    off_t tailOff, int flags, int (*callback)(buf_t, void *), void *callback_arg)
{
	upl_page_info_t *pl;
	upl_t upl;
	vm_offset_t upl_offset = 0;
	vm_size_t upl_size;
	off_t upl_f_offset;	/* file offset of the first page in the upl */
	int pages_in_upl;
	int start_offset;	/* offset of the write within the first upl page */
	int xfer_resid;
	int io_size;
	int io_offset;
	int bytes_to_zero;
	int bytes_to_move;
	kern_return_t kret;
	int retval = 0;
	int io_resid;		/* user data still to be copied */
	long long total_size;
	long long zero_cnt;	/* head zero-fill: bytes and starting offset */
	off_t zero_off;
	long long zero_cnt1;	/* tail zero-fill: bytes and starting offset */
	off_t zero_off1;
	off_t write_off = 0;
	int write_cnt = 0;
	boolean_t first_pass = FALSE;
	struct cl_extent cl;
	int bflag;
	u_int max_io_size;

	if (uio) {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
		    (int)uio->uio_offset, io_req_size, (int)oldEOF, (int)newEOF, 0);

		io_resid = io_req_size;
	} else {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
		    0, 0, (int)oldEOF, (int)newEOF, 0);

		io_resid = 0;
	}
	if (flags & IO_PASSIVE) {
		bflag = CL_PASSIVE;
	} else {
		bflag = 0;
	}
	if (flags & IO_NOCACHE) {
		bflag |= CL_NOCACHE;
	}

	if (flags & IO_SKIP_ENCRYPTION) {
		bflag |= CL_ENCRYPTED;
	}

	zero_cnt = 0;
	zero_cnt1 = 0;
	zero_off = 0;
	zero_off1 = 0;

	max_io_size = cluster_max_io_size(vp->v_mount, CL_WRITE);

	if (flags & IO_HEADZEROFILL) {
		/*
		 * some filesystems (HFS is one) don't support unallocated holes within a file...
		 * so we zero fill the intervening space between the old EOF and the offset
		 * where the next chunk of real data begins.... ftruncate will also use this
		 * routine to zero fill to the new EOF when growing a file... in this case, the
		 * uio structure will not be provided
		 */
		if (uio) {
			if (headOff < uio->uio_offset) {
				zero_cnt = uio->uio_offset - headOff;
				zero_off = headOff;
			}
		} else if (headOff < newEOF) {
			zero_cnt = newEOF - headOff;
			zero_off = headOff;
		}
	} else {
		if (uio && uio->uio_offset > oldEOF) {
			/*
			 * writing past the old EOF... zero-fill from the start of the
			 * page containing the write offset if that page begins at or
			 * beyond the old EOF
			 */
			zero_off = uio->uio_offset & ~PAGE_MASK_64;

			if (zero_off >= oldEOF) {
				zero_cnt = uio->uio_offset - zero_off;

				flags |= IO_HEADZEROFILL;
			}
		}
	}
	if (flags & IO_TAILZEROFILL) {
		if (uio) {
			zero_off1 = uio->uio_offset + io_req_size;

			if (zero_off1 < tailOff) {
				zero_cnt1 = tailOff - zero_off1;
			}
		}
	} else {
		if (uio && newEOF > oldEOF) {
			/*
			 * extending the file and the write ends exactly at the new
			 * EOF mid-page... zero the remainder of that last page
			 */
			zero_off1 = uio->uio_offset + io_req_size;

			if (zero_off1 == newEOF && (zero_off1 & PAGE_MASK_64)) {
				zero_cnt1 = PAGE_SIZE_64 - (zero_off1 & PAGE_MASK_64);

				flags |= IO_TAILZEROFILL;
			}
		}
	}
	if (zero_cnt == 0 && uio == (struct uio *) 0) {
		/* nothing to zero-fill and no user data to copy */
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END,
		    retval, 0, 0, 0, 0);
		return 0;
	}
	if (uio) {
		write_off = uio->uio_offset;
		write_cnt = (int)uio_resid(uio);
		/*
		 * delay updating the sequential write info
		 * in the control block until we've obtained
		 * the lock for it
		 */
		first_pass = TRUE;
	}
	while ((total_size = (io_resid + zero_cnt + zero_cnt1)) && retval == 0) {
		/*
		 * for this iteration of the loop, figure out where our starting point is
		 */
		if (zero_cnt) {
			start_offset = (int)(zero_off & PAGE_MASK_64);
			upl_f_offset = zero_off - start_offset;
		} else if (io_resid) {
			start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
			upl_f_offset = uio->uio_offset - start_offset;
		} else {
			start_offset = (int)(zero_off1 & PAGE_MASK_64);
			upl_f_offset = zero_off1 - start_offset;
		}
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 46)) | DBG_FUNC_NONE,
		    (int)zero_off, (int)zero_cnt, (int)zero_off1, (int)zero_cnt1, 0);

		if (total_size > max_io_size) {
			total_size = max_io_size;
		}

		cl.b_addr = (daddr64_t)(upl_f_offset / PAGE_SIZE_64);

		if (uio && ((flags & (IO_SYNC | IO_HEADZEROFILL | IO_TAILZEROFILL)) == 0)) {
			/*
			 * assumption... total_size <= io_resid
			 * because IO_HEADZEROFILL and IO_TAILZEROFILL not set
			 */
			if ((start_offset + total_size) > max_io_size) {
				total_size = max_io_size - start_offset;
			}
			xfer_resid = (int)total_size;

			/* fast path: copy directly into already-resident cache pages */
			retval = cluster_copy_ubc_data_internal(vp, uio, &xfer_resid, 1, 1);

			if (retval) {
				break;
			}

			io_resid -= (total_size - xfer_resid);
			total_size = xfer_resid;
			start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
			upl_f_offset = uio->uio_offset - start_offset;

			if (total_size == 0) {
				if (start_offset) {
					/*
					 * the write did not finish on a page boundary
					 * which will leave upl_f_offset pointing to the
					 * beginning of the last page written instead of
					 * the page beyond it... bump it in this case
					 * so that the cluster code records the last page
					 * written as dirty
					 */
					upl_f_offset += PAGE_SIZE_64;
				}
				upl_size = 0;

				goto check_cluster;
			}
		}
		/*
		 * compute the size of the upl needed to encompass
		 * the requested write... limit each call to cluster_io
		 * to the maximum UPL size... cluster_io will clip if
		 * this exceeds the maximum io_size for the device,
		 * make sure to account for
		 * a starting offset that's not page aligned
		 */
		upl_size = (start_offset + total_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;

		if (upl_size > max_io_size) {
			upl_size = max_io_size;
		}

		pages_in_upl = (int)(upl_size / PAGE_SIZE);
		io_size = (int)(upl_size - start_offset);

		if ((long long)io_size > total_size) {
			io_size = (int)total_size;
		}

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_START, upl_size, io_size, total_size, 0, 0);


		/*
		 * Gather the pages from the buffer cache.
		 * The UPL_WILL_MODIFY flag lets the UPL subsystem know
		 * that we intend to modify these pages.
		 */
		kret = ubc_create_upl_kernel(vp,
		    upl_f_offset,
		    (int)upl_size,
		    &upl,
		    &pl,
		    UPL_SET_LITE | ((uio != NULL && (uio->uio_flags & UIO_FLAGS_IS_COMPRESSED_FILE)) ? 0 : UPL_WILL_MODIFY),
		    VM_KERN_MEMORY_FILE);
		if (kret != KERN_SUCCESS) {
			panic("cluster_write_copy: failed to get pagelist");
		}

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_END,
		    upl, (int)upl_f_offset, start_offset, 0, 0);

		if (start_offset && upl_f_offset < oldEOF && !upl_valid_page(pl, 0)) {
			int read_size;

			/*
			 * we're starting in the middle of the first page of the upl
			 * and the page isn't currently valid, so we're going to have
			 * to read it in first... this is a synchronous operation
			 */
			read_size = PAGE_SIZE;

			if ((upl_f_offset + read_size) > oldEOF) {
				read_size = (int)(oldEOF - upl_f_offset);
			}

			retval = cluster_io(vp, upl, 0, upl_f_offset, read_size,
			    CL_READ | bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
			if (retval) {
				/*
				 * we had an error during the read which causes us to abort
				 * the current cluster_write request... before we do, we need
				 * to release the rest of the pages in the upl without modifying
				 * their state and mark the failed page in error
				 */
				ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);

				if (upl_size > PAGE_SIZE) {
					ubc_upl_abort_range(upl, 0, (upl_size_t)upl_size,
					    UPL_ABORT_FREE_ON_EMPTY);
				}

				KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
				    upl, 0, 0, retval, 0);
				break;
			}
		}
		if ((start_offset == 0 || upl_size > PAGE_SIZE) && ((start_offset + io_size) & PAGE_MASK)) {
			/*
			 * the last offset we're writing to in this upl does not end on a page
			 * boundary... if it's not beyond the old EOF, then we'll also need to
			 * pre-read this page in if it isn't already valid
			 */
			upl_offset = upl_size - PAGE_SIZE;

			if ((upl_f_offset + start_offset + io_size) < oldEOF &&
			    !upl_valid_page(pl, (int)(upl_offset / PAGE_SIZE))) {
				int read_size;

				read_size = PAGE_SIZE;

				if ((off_t)(upl_f_offset + upl_offset + read_size) > oldEOF) {
					read_size = (int)(oldEOF - (upl_f_offset + upl_offset));
				}

				retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, read_size,
				    CL_READ | bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
				if (retval) {
					/*
					 * we had an error during the read which causes us to abort
					 * the current cluster_write request... before we do, we
					 * need to release the rest of the pages in the upl without
					 * modifying their state and mark the failed page in error
					 */
					ubc_upl_abort_range(upl, (upl_offset_t)upl_offset, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);

					if (upl_size > PAGE_SIZE) {
						ubc_upl_abort_range(upl, 0, (upl_size_t)upl_size, UPL_ABORT_FREE_ON_EMPTY);
					}

					KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
					    upl, 0, 0, retval, 0);
					break;
				}
			}
		}
		xfer_resid = io_size;
		io_offset = start_offset;

		/* head zero-fill, then user data, then tail zero-fill */
		while (zero_cnt && xfer_resid) {
			if (zero_cnt < (long long)xfer_resid) {
				bytes_to_zero = (int)zero_cnt;
			} else {
				bytes_to_zero = xfer_resid;
			}

			bytes_to_zero = cluster_zero_range(upl, pl, flags, io_offset, zero_off, upl_f_offset, bytes_to_zero);

			xfer_resid -= bytes_to_zero;
			zero_cnt -= bytes_to_zero;
			zero_off += bytes_to_zero;
			io_offset += bytes_to_zero;
		}
		if (xfer_resid && io_resid) {
			u_int32_t io_requested;

			bytes_to_move = min(io_resid, xfer_resid);
			io_requested = bytes_to_move;

			retval = cluster_copy_upl_data(uio, upl, io_offset, (int *)&io_requested);

			if (retval) {
				ubc_upl_abort_range(upl, 0, (upl_size_t)upl_size, UPL_ABORT_FREE_ON_EMPTY);

				KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
				    upl, 0, 0, retval, 0);
			} else {
				io_resid -= bytes_to_move;
				xfer_resid -= bytes_to_move;
				io_offset += bytes_to_move;
			}
		}
		while (xfer_resid && zero_cnt1 && retval == 0) {
			if (zero_cnt1 < (long long)xfer_resid) {
				bytes_to_zero = (int)zero_cnt1;
			} else {
				bytes_to_zero = xfer_resid;
			}

			bytes_to_zero = cluster_zero_range(upl, pl, flags, io_offset, zero_off1, upl_f_offset, bytes_to_zero);

			xfer_resid -= bytes_to_zero;
			zero_cnt1 -= bytes_to_zero;
			zero_off1 += bytes_to_zero;
			io_offset += bytes_to_zero;
		}
		if (retval == 0) {
			int do_zeroing = 1;

			io_size += start_offset;

			/* Force more restrictive zeroing behavior only on APFS */
			if ((vnode_tag(vp) == VT_APFS) && (newEOF < oldEOF)) {
				do_zeroing = 0;
			}

			if (do_zeroing && (upl_f_offset + io_size) >= newEOF && (u_int)io_size < upl_size) {
				/*
				 * if we're extending the file with this write
				 * we'll zero fill the rest of the page so that
				 * if the file gets extended again in such a way as to leave a
				 * hole starting at this EOF, we'll have zero's in the correct spot
				 */
				cluster_zero(upl, io_size, (int)(upl_size - io_size), NULL);
			}
			/*
			 * release the upl now if we hold one since...
			 * 1) pages in it may be present in the sparse cluster map
			 *    and may span 2 separate buckets there... if they do and
			 *    we happen to have to flush a bucket to make room and it intersects
			 *    this upl, a deadlock may result on page BUSY
			 * 2) we're delaying the I/O... from this point forward we're just updating
			 *    the cluster state... no need to hold the pages, so commit them
			 * 3) IO_SYNC is set...
			 *    because we had to ask for a UPL that provides currently non-present pages, the
			 *    UPL has been automatically set to clear the dirty flags (both software and hardware)
			 *    upon committing it... this is not the behavior we want since it's possible for
			 *    pages currently present as part of a mapped file to be dirtied while the I/O is in flight.
			 *    we'll pick these pages back up later with the correct behavior specified.
			 * 4) we don't want to hold pages busy in a UPL and then block on the cluster lock... if a flush
			 *    of this vnode is in progress, we will deadlock if the pages being flushed intersect the pages
			 *    we hold since the flushing context is holding the cluster lock.
			 */
			ubc_upl_commit_range(upl, 0, (upl_size_t)upl_size,
			    UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
check_cluster:
			/*
			 * calculate the last logical block number
			 * that this delayed I/O encompassed
			 */
			cl.e_addr = (daddr64_t)((upl_f_offset + (off_t)upl_size) / PAGE_SIZE_64);

			if (flags & IO_SYNC) {
				/*
				 * if the IO_SYNC flag is set then we need to bypass
				 * any clustering and immediately issue the I/O
				 *
				 * we don't hold the lock at this point
				 *
				 * we've already dropped the current upl, so pick it back up with COPYOUT_FROM set
				 * so that we correctly deal with a change in state of the hardware modify bit...
				 * we do this via cluster_push_now... by passing along the IO_SYNC flag, we force
				 * cluster_push_now to wait until all the I/Os have completed... cluster_push_now is also
				 * responsible for generating the correct sized I/O(s)
				 */
				retval = cluster_push_now(vp, &cl, newEOF, flags, callback, callback_arg, FALSE);
			} else {
				boolean_t defer_writes = FALSE;

				if (vfs_flags(vp->v_mount) & MNT_DEFWRITE) {
					defer_writes = TRUE;
				}

				cluster_update_state_internal(vp, &cl, flags, defer_writes, &first_pass,
				    write_off, write_cnt, newEOF, callback, callback_arg, FALSE);
			}
		}
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END, retval, 0, io_resid, 0, 0);

	return retval;
}
3927
3928
3929
3930 int
cluster_read(vnode_t vp,struct uio * uio,off_t filesize,int xflags)3931 cluster_read(vnode_t vp, struct uio *uio, off_t filesize, int xflags)
3932 {
3933 return cluster_read_ext(vp, uio, filesize, xflags, NULL, NULL);
3934 }
3935
3936
3937 int
cluster_read_ext(vnode_t vp,struct uio * uio,off_t filesize,int xflags,int (* callback)(buf_t,void *),void * callback_arg)3938 cluster_read_ext(vnode_t vp, struct uio *uio, off_t filesize, int xflags, int (*callback)(buf_t, void *), void *callback_arg)
3939 {
3940 int retval = 0;
3941 int flags;
3942 user_ssize_t cur_resid;
3943 u_int32_t io_size;
3944 u_int32_t read_length = 0;
3945 int read_type = IO_COPY;
3946
3947 flags = xflags;
3948
3949 if (vp->v_flag & VNOCACHE_DATA) {
3950 flags |= IO_NOCACHE;
3951 }
3952 if ((vp->v_flag & VRAOFF) || speculative_reads_disabled) {
3953 flags |= IO_RAOFF;
3954 }
3955
3956 if (flags & IO_SKIP_ENCRYPTION) {
3957 flags |= IO_ENCRYPTED;
3958 }
3959
3960 /*
3961 * do a read through the cache if one of the following is true....
3962 * NOCACHE is not true
3963 * the uio request doesn't target USERSPACE
3964 * Alternatively, if IO_ENCRYPTED is set, then we want to bypass the cache as well.
3965 * Reading encrypted data from a CP filesystem should never result in the data touching
3966 * the UBC.
3967 *
3968 * otherwise, find out if we want the direct or contig variant for
3969 * the first vector in the uio request
3970 */
3971 if (((flags & IO_NOCACHE) && UIO_SEG_IS_USER_SPACE(uio->uio_segflg)) || (flags & IO_ENCRYPTED)) {
3972 retval = cluster_io_type(uio, &read_type, &read_length, 0);
3973 }
3974
3975 while ((cur_resid = uio_resid(uio)) && uio->uio_offset < filesize && retval == 0) {
3976 switch (read_type) {
3977 case IO_COPY:
3978 /*
3979 * make sure the uio_resid isn't too big...
3980 * internally, we want to handle all of the I/O in
3981 * chunk sizes that fit in a 32 bit int
3982 */
3983 if (cur_resid > (user_ssize_t)(MAX_IO_REQUEST_SIZE)) {
3984 io_size = MAX_IO_REQUEST_SIZE;
3985 } else {
3986 io_size = (u_int32_t)cur_resid;
3987 }
3988
3989 retval = cluster_read_copy(vp, uio, io_size, filesize, flags, callback, callback_arg);
3990 break;
3991
3992 case IO_DIRECT:
3993 retval = cluster_read_direct(vp, uio, filesize, &read_type, &read_length, flags, callback, callback_arg);
3994 break;
3995
3996 case IO_CONTIG:
3997 retval = cluster_read_contig(vp, uio, filesize, &read_type, &read_length, callback, callback_arg, flags);
3998 break;
3999
4000 case IO_UNKNOWN:
4001 retval = cluster_io_type(uio, &read_type, &read_length, 0);
4002 break;
4003 }
4004 }
4005 return retval;
4006 }
4007
4008
4009
4010 static void
cluster_read_upl_release(upl_t upl,int start_pg,int last_pg,int take_reference)4011 cluster_read_upl_release(upl_t upl, int start_pg, int last_pg, int take_reference)
4012 {
4013 int range;
4014 int abort_flags = UPL_ABORT_FREE_ON_EMPTY;
4015
4016 if ((range = last_pg - start_pg)) {
4017 if (take_reference) {
4018 abort_flags |= UPL_ABORT_REFERENCE;
4019 }
4020
4021 ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, range * PAGE_SIZE, abort_flags);
4022 }
4023 }
4024
4025
/*
 * cluster_read_copy
 *
 * Buffered (cached) read path.  Satisfies as much of the request as
 * possible directly from pages already resident in the UBC; for ranges
 * that are not resident it creates a UPL, issues asynchronous cluster
 * I/O for the invalid pages, and then copies the data out to the
 * caller's uio.  Along the way it drives the read-ahead and prefetch
 * engines for sequential access patterns, and it backs off (smaller
 * I/Os, no prefetch) while the vnode is in a throttle window.
 *
 *   vp           vnode being read
 *   uio          destination; uio_offset/uio_resid are advanced as data is copied
 *   io_req_size  number of bytes requested for this call
 *   filesize     current EOF; the read is clipped to this
 *   flags        IO_* flags (IO_NOCACHE, IO_RAOFF, IO_PASSIVE, IO_RETURN_ON_THROTTLE, ...)
 *   callback     optional per-buffer completion callback (with callback_arg)
 *
 * Returns 0 on success, EAGAIN when IO_RETURN_ON_THROTTLE applies and we
 * are in a throttle window, or an error from cluster_io / the copyout.
 */
static int
cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t filesize, int flags, int (*callback)(buf_t, void *), void *callback_arg)
{
	upl_page_info_t *pl;
	upl_t upl;
	vm_offset_t upl_offset;
	u_int32_t upl_size;
	off_t upl_f_offset;
	int start_offset;
	int start_pg;
	int last_pg;
	int uio_last = 0;
	int pages_in_upl;
	off_t max_size;
	off_t last_ioread_offset;
	off_t last_request_offset;
	kern_return_t kret;
	int error = 0;
	int retval = 0;
	u_int32_t size_of_prefetch;
	u_int32_t xsize;
	u_int32_t io_size;
	u_int32_t max_rd_size;
	u_int32_t max_io_size;
	u_int32_t max_prefetch;
	u_int rd_ahead_enabled = 1;
	u_int prefetch_enabled = 1;
	struct cl_readahead * rap;
	struct clios iostate;
	struct cl_extent extent;
	int bflag;
	int take_reference = 1;
	int policy = IOPOL_DEFAULT;
	boolean_t iolock_inited = FALSE;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_START,
	    (int)uio->uio_offset, io_req_size, (int)filesize, flags, 0);

	/* raw encrypted reads must never come through the cached path */
	if (flags & IO_ENCRYPTED) {
		panic("encrypted blocks will hit UBC!");
	}

	policy = throttle_get_io_policy(NULL);

	/*
	 * take_reference selects how pages are committed below:
	 * 1 -> UPL_COMMIT_INACTIVATE, 0 -> UPL_COMMIT_SPECULATE.
	 * low-priority tiers and uncached reads don't keep a reference.
	 */
	if (policy == THROTTLE_LEVEL_TIER3 || policy == THROTTLE_LEVEL_TIER2 || (flags & IO_NOCACHE)) {
		take_reference = 0;
	}

	if (flags & IO_PASSIVE) {
		bflag = CL_PASSIVE;
	} else {
		bflag = 0;
	}

	if (flags & IO_NOCACHE) {
		bflag |= CL_NOCACHE;
	}

	if (flags & IO_SKIP_ENCRYPTION) {
		bflag |= CL_ENCRYPTED;
	}

	max_io_size = cluster_max_io_size(vp->v_mount, CL_READ);
	max_prefetch = MAX_PREFETCH(vp, max_io_size, disk_conditioner_mount_is_ssd(vp->v_mount));
	max_rd_size = max_prefetch;

	last_request_offset = uio->uio_offset + io_req_size;

	if (last_request_offset > filesize) {
		last_request_offset = filesize;
	}

	/*
	 * read-ahead is skipped for uncached reads, when explicitly
	 * disabled, or when the whole request fits in a single page
	 */
	if ((flags & (IO_RAOFF | IO_NOCACHE)) || ((last_request_offset & ~PAGE_MASK_64) == (uio->uio_offset & ~PAGE_MASK_64))) {
		rd_ahead_enabled = 0;
		rap = NULL;
	} else {
		if (cluster_is_throttled(vp)) {
			/*
			 * we're in the throttle window, at the very least
			 * we want to limit the size of the I/O we're about
			 * to issue
			 */
			rd_ahead_enabled = 0;
			prefetch_enabled = 0;

			max_rd_size = THROTTLE_MAX_IOSIZE;
		}
		if ((rap = cluster_get_rap(vp)) == NULL) {
			rd_ahead_enabled = 0;
		} else {
			/* extent is only initialized when rap != NULL; all later
			 * uses are guarded by a rap != NULL (or short-circuit) check */
			extent.b_addr = uio->uio_offset / PAGE_SIZE_64;
			extent.e_addr = (last_request_offset - 1) / PAGE_SIZE_64;
		}
	}
	if (rap != NULL && rap->cl_ralen && (rap->cl_lastr == extent.b_addr || (rap->cl_lastr + 1) == extent.b_addr)) {
		/*
		 * determine if we already have a read-ahead in the pipe courtesy of the
		 * last read systemcall that was issued...
		 * if so, pick up it's extent to determine where we should start
		 * with respect to any read-ahead that might be necessary to
		 * garner all the data needed to complete this read systemcall
		 */
		last_ioread_offset = (rap->cl_maxra * PAGE_SIZE_64) + PAGE_SIZE_64;

		if (last_ioread_offset < uio->uio_offset) {
			last_ioread_offset = (off_t)0;
		} else if (last_ioread_offset > last_request_offset) {
			last_ioread_offset = last_request_offset;
		}
	} else {
		last_ioread_offset = (off_t)0;
	}

	while (io_req_size && uio->uio_offset < filesize && retval == 0) {
		max_size = filesize - uio->uio_offset;
		bool leftover_upl_aborted = false;

		if ((off_t)(io_req_size) < max_size) {
			io_size = io_req_size;
		} else {
			io_size = (u_int32_t)max_size;
		}

		if (!(flags & IO_NOCACHE)) {
			/* fast path: copy straight out of resident UBC pages */
			while (io_size) {
				u_int32_t io_resid;
				u_int32_t io_requested;

				/*
				 * if we keep finding the pages we need already in the cache, then
				 * don't bother to call cluster_read_prefetch since it costs CPU cycles
				 * to determine that we have all the pages we need... once we miss in
				 * the cache and have issued an I/O, than we'll assume that we're likely
				 * to continue to miss in the cache and it's to our advantage to try and prefetch
				 */
				if (last_request_offset && last_ioread_offset && (size_of_prefetch = (u_int32_t)(last_request_offset - last_ioread_offset))) {
					if ((last_ioread_offset - uio->uio_offset) <= max_rd_size && prefetch_enabled) {
						/*
						 * we've already issued I/O for this request and
						 * there's still work to do and
						 * our prefetch stream is running dry, so issue a
						 * pre-fetch I/O... the I/O latency will overlap
						 * with the copying of the data
						 */
						if (size_of_prefetch > max_rd_size) {
							size_of_prefetch = max_rd_size;
						}

						size_of_prefetch = cluster_read_prefetch(vp, last_ioread_offset, size_of_prefetch, filesize, callback, callback_arg, bflag);

						last_ioread_offset += (off_t)(size_of_prefetch * PAGE_SIZE);

						if (last_ioread_offset > last_request_offset) {
							last_ioread_offset = last_request_offset;
						}
					}
				}
				/*
				 * limit the size of the copy we're about to do so that
				 * we can notice that our I/O pipe is running dry and
				 * get the next I/O issued before it does go dry
				 */
				if (last_ioread_offset && io_size > (max_io_size / 4)) {
					io_resid = (max_io_size / 4);
				} else {
					io_resid = io_size;
				}

				io_requested = io_resid;

				retval = cluster_copy_ubc_data_internal(vp, uio, (int *)&io_resid, 0, take_reference);

				/* bytes actually copied this pass = requested - residual */
				xsize = io_requested - io_resid;

				io_size -= xsize;
				io_req_size -= xsize;

				if (retval || io_resid) {
					/*
					 * if we run into a real error or
					 * a page that is not in the cache
					 * we need to leave streaming mode
					 */
					break;
				}

				if (rd_ahead_enabled && (io_size == 0 || last_ioread_offset == last_request_offset)) {
					/*
					 * we're already finished the I/O for this read request
					 * let's see if we should do a read-ahead
					 */
					cluster_read_ahead(vp, &extent, filesize, rap, callback, callback_arg, bflag);
				}
			}
			if (retval) {
				break;
			}
			if (io_size == 0) {
				/* request fully satisfied from the cache */
				if (rap != NULL) {
					if (extent.e_addr < rap->cl_lastr) {
						rap->cl_maxra = 0;
					}
					rap->cl_lastr = extent.e_addr;
				}
				break;
			}
			/*
			 * recompute max_size since cluster_copy_ubc_data_internal
			 * may have advanced uio->uio_offset
			 */
			max_size = filesize - uio->uio_offset;
		}

		iostate.io_completed = 0;
		iostate.io_issued = 0;
		iostate.io_error = 0;
		iostate.io_wanted = 0;

		if ((flags & IO_RETURN_ON_THROTTLE)) {
			if (cluster_is_throttled(vp) == THROTTLE_NOW) {
				if (!cluster_io_present_in_BC(vp, uio->uio_offset)) {
					/*
					 * we're in the throttle window and at least 1 I/O
					 * has already been issued by a throttleable thread
					 * in this window, so return with EAGAIN to indicate
					 * to the FS issuing the cluster_read call that it
					 * should now throttle after dropping any locks
					 */
					throttle_info_update_by_mount(vp->v_mount);

					retval = EAGAIN;
					break;
				}
			}
		}

		/*
		 * compute the size of the upl needed to encompass
		 * the requested read... limit each call to cluster_io
		 * to the maximum UPL size... cluster_io will clip if
		 * this exceeds the maximum io_size for the device,
		 * make sure to account for
		 * a starting offset that's not page aligned
		 */
		start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
		upl_f_offset = uio->uio_offset - (off_t)start_offset;

		if (io_size > max_rd_size) {
			io_size = max_rd_size;
		}

		upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;

		if (flags & IO_NOCACHE) {
			if (upl_size > max_io_size) {
				upl_size = max_io_size;
			}
		} else {
			/* cached reads use smaller UPLs to keep the copy/IO pipeline overlapped */
			if (upl_size > max_io_size / 4) {
				upl_size = max_io_size / 4;
				upl_size &= ~PAGE_MASK;

				if (upl_size == 0) {
					upl_size = PAGE_SIZE;
				}
			}
		}
		pages_in_upl = upl_size / PAGE_SIZE;

		/*
		 * NOTE(review): 'upl' is logged here before ubc_create_upl_kernel
		 * assigns it -- stale (or uninitialized on the first pass) value;
		 * debug tracing only, not dereferenced
		 */
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_START,
		    upl, (int)upl_f_offset, upl_size, start_offset, 0);

		kret = ubc_create_upl_kernel(vp,
		    upl_f_offset,
		    upl_size,
		    &upl,
		    &pl,
		    UPL_FILE_IO | UPL_SET_LITE,
		    VM_KERN_MEMORY_FILE);
		if (kret != KERN_SUCCESS) {
			panic("cluster_read_copy: failed to get pagelist");
		}

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_END,
		    upl, (int)upl_f_offset, upl_size, start_offset, 0);

		/*
		 * scan from the beginning of the upl looking for the first
		 * non-valid page.... this will become the first page in
		 * the request we're going to make to 'cluster_io'... if all
		 * of the pages are valid, we won't call through to 'cluster_io'
		 */
		for (start_pg = 0; start_pg < pages_in_upl; start_pg++) {
			if (!upl_valid_page(pl, start_pg)) {
				break;
			}
		}

		/*
		 * scan from the starting invalid page looking for a valid
		 * page before the end of the upl is reached, if we
		 * find one, then it will be the last page of the request to
		 * 'cluster_io'
		 */
		for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
			if (upl_valid_page(pl, last_pg)) {
				break;
			}
		}

		if (start_pg < last_pg) {
			/*
			 * we found a range of 'invalid' pages that must be filled
			 * if the last page in this range is the last page of the file
			 * we may have to clip the size of it to keep from reading past
			 * the end of the last physical block associated with the file
			 */
			if (iolock_inited == FALSE) {
				lck_mtx_init(&iostate.io_mtxp, &cl_mtx_grp, LCK_ATTR_NULL);

				iolock_inited = TRUE;
			}
			upl_offset = start_pg * PAGE_SIZE;
			io_size = (last_pg - start_pg) * PAGE_SIZE;

			if ((off_t)(upl_f_offset + upl_offset + io_size) > filesize) {
				io_size = (u_int32_t)(filesize - (upl_f_offset + upl_offset));
			}

			/*
			 * Find out if this needs verification, we'll have to manage the UPL
			 * diffrently if so. Note that this call only lets us know if
			 * verification is enabled on this mount point, the actual verification
			 * is performed in the File system.
			 *
			 * NOTE(review): the 'verify_block_size' half of the condition is
			 * deliberately commented out below, so the leftover pages are
			 * released early for any FS whose VNOP_VERIFY returns success.
			 */
			size_t verify_block_size = 0;
			if ((VNOP_VERIFY(vp, start_offset, NULL, 0, &verify_block_size, NULL, VNODE_VERIFY_DEFAULT, NULL) == 0) /* && verify_block_size */) {
				for (uio_last = last_pg; uio_last < pages_in_upl; uio_last++) {
					if (!upl_valid_page(pl, uio_last)) {
						break;
					}
				}
				if (uio_last < pages_in_upl) {
					/*
					 * there were some invalid pages beyond the valid pages
					 * that we didn't issue an I/O for, just release them
					 * unchanged now, so that any prefetch/readahed can
					 * include them
					 */
					ubc_upl_abort_range(upl, uio_last * PAGE_SIZE,
					    (pages_in_upl - uio_last) * PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
					leftover_upl_aborted = true;
				}
			}

			/*
			 * issue an asynchronous read to cluster_io
			 */

			error = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset,
			    io_size, CL_READ | CL_ASYNC | bflag, (buf_t)NULL, &iostate, callback, callback_arg);

			if (rap) {
				if (extent.e_addr < rap->cl_maxra) {
					/*
					 * we've just issued a read for a block that should have been
					 * in the cache courtesy of the read-ahead engine... something
					 * has gone wrong with the pipeline, so reset the read-ahead
					 * logic which will cause us to restart from scratch
					 */
					rap->cl_maxra = 0;
				}
			}
		}
		if (error == 0) {
			/*
			 * if the read completed successfully, or there was no I/O request
			 * issued, than copy the data into user land via 'cluster_upl_copy_data'
			 * we'll first add on any 'valid'
			 * pages that were present in the upl when we acquired it.
			 */
			u_int val_size;

			if (!leftover_upl_aborted) {
				for (uio_last = last_pg; uio_last < pages_in_upl; uio_last++) {
					if (!upl_valid_page(pl, uio_last)) {
						break;
					}
				}
				if (uio_last < pages_in_upl) {
					/*
					 * there were some invalid pages beyond the valid pages
					 * that we didn't issue an I/O for, just release them
					 * unchanged now, so that any prefetch/readahed can
					 * include them
					 */
					ubc_upl_abort_range(upl, uio_last * PAGE_SIZE,
					    (pages_in_upl - uio_last) * PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
				}
			}

			/*
			 * compute size to transfer this round, if io_req_size is
			 * still non-zero after this attempt, we'll loop around and
			 * set up for another I/O.
			 */
			val_size = (uio_last * PAGE_SIZE) - start_offset;

			if (val_size > max_size) {
				val_size = (u_int)max_size;
			}

			if (val_size > io_req_size) {
				val_size = io_req_size;
			}

			if ((uio->uio_offset + val_size) > last_ioread_offset) {
				last_ioread_offset = uio->uio_offset + val_size;
			}

			if ((size_of_prefetch = (u_int32_t)(last_request_offset - last_ioread_offset)) && prefetch_enabled) {
				if ((last_ioread_offset - (uio->uio_offset + val_size)) <= upl_size) {
					/*
					 * if there's still I/O left to do for this request, and...
					 * we're not in hard throttle mode, and...
					 * we're close to using up the previous prefetch, then issue a
					 * new pre-fetch I/O... the I/O latency will overlap
					 * with the copying of the data
					 */
					if (size_of_prefetch > max_rd_size) {
						size_of_prefetch = max_rd_size;
					}

					size_of_prefetch = cluster_read_prefetch(vp, last_ioread_offset, size_of_prefetch, filesize, callback, callback_arg, bflag);

					last_ioread_offset += (off_t)(size_of_prefetch * PAGE_SIZE);

					if (last_ioread_offset > last_request_offset) {
						last_ioread_offset = last_request_offset;
					}
				}
			} else if ((uio->uio_offset + val_size) == last_request_offset) {
				/*
				 * this transfer will finish this request, so...
				 * let's try to read ahead if we're in
				 * a sequential access pattern and we haven't
				 * explicitly disabled it
				 */
				if (rd_ahead_enabled) {
					cluster_read_ahead(vp, &extent, filesize, rap, callback, callback_arg, bflag);
				}

				if (rap != NULL) {
					if (extent.e_addr < rap->cl_lastr) {
						rap->cl_maxra = 0;
					}
					rap->cl_lastr = extent.e_addr;
				}
			}
			/* wait for the async read issued above before copying out */
			if (iolock_inited == TRUE) {
				cluster_iostate_wait(&iostate, 0, "cluster_read_copy");
			}

			if (iostate.io_error) {
				error = iostate.io_error;
			} else {
				u_int32_t io_requested;

				io_requested = val_size;

				retval = cluster_copy_upl_data(uio, upl, start_offset, (int *)&io_requested);

				io_req_size -= (val_size - io_requested);
			}
		} else {
			/* cluster_io failed; still must drain any I/O it issued */
			if (iolock_inited == TRUE) {
				cluster_iostate_wait(&iostate, 0, "cluster_read_copy");
			}
		}
		if (start_pg < last_pg) {
			/*
			 * compute the range of pages that we actually issued an I/O for
			 * and either commit them as valid if the I/O succeeded
			 * or abort them if the I/O failed or we're not supposed to
			 * keep them in the cache
			 */
			io_size = (last_pg - start_pg) * PAGE_SIZE;

			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START, upl, start_pg * PAGE_SIZE, io_size, error, 0);

			if (error || (flags & IO_NOCACHE)) {
				ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, io_size,
				    UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
			} else {
				int commit_flags = UPL_COMMIT_CLEAR_DIRTY | UPL_COMMIT_FREE_ON_EMPTY;

				if (take_reference) {
					commit_flags |= UPL_COMMIT_INACTIVATE;
				} else {
					commit_flags |= UPL_COMMIT_SPECULATE;
				}

				ubc_upl_commit_range(upl, start_pg * PAGE_SIZE, io_size, commit_flags);
			}
			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END, upl, start_pg * PAGE_SIZE, io_size, error, 0);
		}
		if ((last_pg - start_pg) < pages_in_upl) {
			/*
			 * the set of pages that we issued an I/O for did not encompass
			 * the entire upl... so just release these without modifying
			 * their state
			 */
			if (error) {
				if (leftover_upl_aborted) {
					/* tail pages were already aborted above; only drop the head range */
					ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, (uio_last - start_pg) * PAGE_SIZE,
					    UPL_ABORT_FREE_ON_EMPTY);
				} else {
					ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
				}
			} else {
				KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START,
				    upl, -1, pages_in_upl - (last_pg - start_pg), 0, 0);

				/*
				 * handle any valid pages at the beginning of
				 * the upl... release these appropriately
				 */
				cluster_read_upl_release(upl, 0, start_pg, take_reference);

				/*
				 * handle any valid pages immediately after the
				 * pages we issued I/O for... ... release these appropriately
				 */
				cluster_read_upl_release(upl, last_pg, uio_last, take_reference);

				KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END, upl, -1, -1, 0, 0);
			}
		}
		if (retval == 0) {
			retval = error;
		}

		if (io_req_size) {
			/* more to do... re-evaluate throttle state before the next pass */
			if (cluster_is_throttled(vp)) {
				/*
				 * we're in the throttle window, at the very least
				 * we want to limit the size of the I/O we're about
				 * to issue
				 */
				rd_ahead_enabled = 0;
				prefetch_enabled = 0;
				max_rd_size = THROTTLE_MAX_IOSIZE;
			} else {
				if (max_rd_size == THROTTLE_MAX_IOSIZE) {
					/*
					 * coming out of throttled state
					 */
					if (policy != THROTTLE_LEVEL_TIER3 && policy != THROTTLE_LEVEL_TIER2) {
						if (rap != NULL) {
							rd_ahead_enabled = 1;
						}
						prefetch_enabled = 1;
					}
					max_rd_size = max_prefetch;
					last_ioread_offset = 0;
				}
			}
		}
	}
	if (iolock_inited == TRUE) {
		/*
		 * cluster_io returned an error after it
		 * had already issued some I/O. we need
		 * to wait for that I/O to complete before
		 * we can destroy the iostate mutex...
		 * 'retval' already contains the early error
		 * so no need to pick it up from iostate.io_error
		 */
		cluster_iostate_wait(&iostate, 0, "cluster_read_copy");

		lck_mtx_destroy(&iostate.io_mtxp, &cl_mtx_grp);
	}
	if (rap != NULL) {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
		    (int)uio->uio_offset, io_req_size, rap->cl_lastr, retval, 0);

		/* cluster_get_rap returned with cl_lockr held; drop it now */
		lck_mtx_unlock(&rap->cl_lockr);
	} else {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
		    (int)uio->uio_offset, io_req_size, 0, retval, 0);
	}

	return retval;
}
4620
4621 /*
4622 * We don't want another read/write lock for every vnode in the system
4623 * so we keep a hash of them here. There should never be very many of
4624 * these around at any point in time.
4625 */
4626 cl_direct_read_lock_t *
cluster_lock_direct_read(vnode_t vp,lck_rw_type_t type)4627 cluster_lock_direct_read(vnode_t vp, lck_rw_type_t type)
4628 {
4629 struct cl_direct_read_locks *head
4630 = &cl_direct_read_locks[(uintptr_t)vp / sizeof(*vp)
4631 % CL_DIRECT_READ_LOCK_BUCKETS];
4632
4633 struct cl_direct_read_lock *lck, *new_lck = NULL;
4634
4635 for (;;) {
4636 lck_spin_lock(&cl_direct_read_spin_lock);
4637
4638 LIST_FOREACH(lck, head, chain) {
4639 if (lck->vp == vp) {
4640 ++lck->ref_count;
4641 lck_spin_unlock(&cl_direct_read_spin_lock);
4642 if (new_lck) {
4643 // Someone beat us to it, ditch the allocation
4644 lck_rw_destroy(&new_lck->rw_lock, &cl_mtx_grp);
4645 kfree_type(cl_direct_read_lock_t, new_lck);
4646 }
4647 lck_rw_lock(&lck->rw_lock, type);
4648 return lck;
4649 }
4650 }
4651
4652 if (new_lck) {
4653 // Use the lock we allocated
4654 LIST_INSERT_HEAD(head, new_lck, chain);
4655 lck_spin_unlock(&cl_direct_read_spin_lock);
4656 lck_rw_lock(&new_lck->rw_lock, type);
4657 return new_lck;
4658 }
4659
4660 lck_spin_unlock(&cl_direct_read_spin_lock);
4661
4662 // Allocate a new lock
4663 new_lck = kalloc_type(cl_direct_read_lock_t, Z_WAITOK);
4664 lck_rw_init(&new_lck->rw_lock, &cl_mtx_grp, LCK_ATTR_NULL);
4665 new_lck->vp = vp;
4666 new_lck->ref_count = 1;
4667
4668 // Got to go round again
4669 }
4670 }
4671
4672 void
cluster_unlock_direct_read(cl_direct_read_lock_t * lck)4673 cluster_unlock_direct_read(cl_direct_read_lock_t *lck)
4674 {
4675 lck_rw_done(&lck->rw_lock);
4676
4677 lck_spin_lock(&cl_direct_read_spin_lock);
4678 if (lck->ref_count == 1) {
4679 LIST_REMOVE(lck, chain);
4680 lck_spin_unlock(&cl_direct_read_spin_lock);
4681 lck_rw_destroy(&lck->rw_lock, &cl_mtx_grp);
4682 kfree_type(cl_direct_read_lock_t, lck);
4683 } else {
4684 --lck->ref_count;
4685 lck_spin_unlock(&cl_direct_read_spin_lock);
4686 }
4687 }
4688
4689 static int
cluster_read_direct(vnode_t vp,struct uio * uio,off_t filesize,int * read_type,u_int32_t * read_length,int flags,int (* callback)(buf_t,void *),void * callback_arg)4690 cluster_read_direct(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t *read_length,
4691 int flags, int (*callback)(buf_t, void *), void *callback_arg)
4692 {
4693 upl_t upl;
4694 upl_page_info_t *pl;
4695 off_t max_io_size;
4696 vm_offset_t upl_offset, vector_upl_offset = 0;
4697 upl_size_t upl_size, vector_upl_size = 0;
4698 vm_size_t upl_needed_size;
4699 unsigned int pages_in_pl;
4700 upl_control_flags_t upl_flags;
4701 kern_return_t kret;
4702 unsigned int i;
4703 int force_data_sync;
4704 int retval = 0;
4705 int no_zero_fill = 0;
4706 int io_flag = 0;
4707 int misaligned = 0;
4708 struct clios iostate;
4709 user_addr_t iov_base;
4710 u_int32_t io_req_size;
4711 u_int32_t offset_in_file;
4712 u_int32_t offset_in_iovbase;
4713 u_int32_t io_size;
4714 u_int32_t io_min;
4715 u_int32_t xsize;
4716 u_int32_t devblocksize;
4717 u_int32_t mem_alignment_mask;
4718 u_int32_t max_upl_size;
4719 u_int32_t max_rd_size;
4720 u_int32_t max_rd_ahead;
4721 u_int32_t max_vector_size;
4722 boolean_t io_throttled = FALSE;
4723
4724 u_int32_t vector_upl_iosize = 0;
4725 int issueVectorUPL = 0, useVectorUPL = (uio->uio_iovcnt > 1);
4726 off_t v_upl_uio_offset = 0;
4727 int vector_upl_index = 0;
4728 upl_t vector_upl = NULL;
4729 cl_direct_read_lock_t *lock = NULL;
4730
4731 assert(vm_map_page_shift(current_map()) >= PAGE_SHIFT);
4732
4733 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_START,
4734 (int)uio->uio_offset, (int)filesize, *read_type, *read_length, 0);
4735
4736 max_upl_size = cluster_max_io_size(vp->v_mount, CL_READ);
4737
4738 max_rd_size = max_upl_size;
4739 max_rd_ahead = max_rd_size * IO_SCALE(vp, 2);
4740
4741 io_flag = CL_COMMIT | CL_READ | CL_ASYNC | CL_NOZERO | CL_DIRECT_IO;
4742
4743 if (flags & IO_PASSIVE) {
4744 io_flag |= CL_PASSIVE;
4745 }
4746
4747 if (flags & IO_ENCRYPTED) {
4748 io_flag |= CL_RAW_ENCRYPTED;
4749 }
4750
4751 if (flags & IO_NOCACHE) {
4752 io_flag |= CL_NOCACHE;
4753 }
4754
4755 if (flags & IO_SKIP_ENCRYPTION) {
4756 io_flag |= CL_ENCRYPTED;
4757 }
4758
4759 iostate.io_completed = 0;
4760 iostate.io_issued = 0;
4761 iostate.io_error = 0;
4762 iostate.io_wanted = 0;
4763
4764 lck_mtx_init(&iostate.io_mtxp, &cl_mtx_grp, LCK_ATTR_NULL);
4765
4766 devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize;
4767 mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask;
4768
4769 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_NONE,
4770 (int)devblocksize, (int)mem_alignment_mask, 0, 0, 0);
4771
4772 if (devblocksize == 1) {
4773 /*
4774 * the AFP client advertises a devblocksize of 1
4775 * however, its BLOCKMAP routine maps to physical
4776 * blocks that are PAGE_SIZE in size...
4777 * therefore we can't ask for I/Os that aren't page aligned
4778 * or aren't multiples of PAGE_SIZE in size
4779 * by setting devblocksize to PAGE_SIZE, we re-instate
4780 * the old behavior we had before the mem_alignment_mask
4781 * changes went in...
4782 */
4783 devblocksize = PAGE_SIZE;
4784 }
4785
4786 /*
4787 * We are going to need this uio for the prefaulting later
4788 * especially for the cases where multiple non-contiguous
4789 * iovs are passed into this routine.
4790 */
4791 uio_t uio_acct = uio_duplicate(uio);
4792
4793 next_dread:
4794 io_req_size = *read_length;
4795 iov_base = uio_curriovbase(uio);
4796
4797 offset_in_file = (u_int32_t)uio->uio_offset & (devblocksize - 1);
4798 offset_in_iovbase = (u_int32_t)iov_base & mem_alignment_mask;
4799
4800 if (vm_map_page_mask(current_map()) < PAGE_MASK) {
4801 /*
4802 * XXX TODO4K
4803 * Direct I/O might not work as expected from a 16k kernel space
4804 * to a 4k user space because each 4k chunk might point to
4805 * a different 16k physical page...
4806 * Let's go the "misaligned" way.
4807 */
4808 if (!misaligned) {
4809 DEBUG4K_VFS("forcing misaligned\n");
4810 }
4811 misaligned = 1;
4812 }
4813
4814 if (offset_in_file || offset_in_iovbase) {
4815 /*
4816 * one of the 2 important offsets is misaligned
4817 * so fire an I/O through the cache for this entire vector
4818 */
4819 misaligned = 1;
4820 }
4821 if (iov_base & (devblocksize - 1)) {
4822 /*
4823 * the offset in memory must be on a device block boundary
4824 * so that we can guarantee that we can generate an
4825 * I/O that ends on a page boundary in cluster_io
4826 */
4827 misaligned = 1;
4828 }
4829
4830 max_io_size = filesize - uio->uio_offset;
4831
4832 /*
4833 * The user must request IO in aligned chunks. If the
4834 * offset into the file is bad, or the userland pointer
4835 * is non-aligned, then we cannot service the encrypted IO request.
4836 */
4837 if (flags & IO_ENCRYPTED) {
4838 if (misaligned || (io_req_size & (devblocksize - 1))) {
4839 retval = EINVAL;
4840 }
4841
4842 max_io_size = roundup(max_io_size, devblocksize);
4843 }
4844
4845 if ((off_t)io_req_size > max_io_size) {
4846 io_req_size = (u_int32_t)max_io_size;
4847 }
4848
4849 /*
4850 * When we get to this point, we know...
4851 * -- the offset into the file is on a devblocksize boundary
4852 */
4853
4854 while (io_req_size && retval == 0) {
4855 u_int32_t io_start;
4856
4857 if (cluster_is_throttled(vp)) {
4858 /*
4859 * we're in the throttle window, at the very least
4860 * we want to limit the size of the I/O we're about
4861 * to issue
4862 */
4863 max_rd_size = THROTTLE_MAX_IOSIZE;
4864 max_rd_ahead = THROTTLE_MAX_IOSIZE - 1;
4865 max_vector_size = THROTTLE_MAX_IOSIZE;
4866 } else {
4867 max_rd_size = max_upl_size;
4868 max_rd_ahead = max_rd_size * IO_SCALE(vp, 2);
4869 max_vector_size = MAX_VECTOR_UPL_SIZE;
4870 }
4871 io_start = io_size = io_req_size;
4872
4873 /*
4874 * First look for pages already in the cache
4875 * and move them to user space. But only do this
4876 * check if we are not retrieving encrypted data directly
4877 * from the filesystem; those blocks should never
4878 * be in the UBC.
4879 *
4880 * cluster_copy_ubc_data returns the resid
4881 * in io_size
4882 */
4883 if ((flags & IO_ENCRYPTED) == 0) {
4884 retval = cluster_copy_ubc_data_internal(vp, uio, (int *)&io_size, 0, 0);
4885 }
4886 /*
4887 * calculate the number of bytes actually copied
4888 * starting size - residual
4889 */
4890 xsize = io_start - io_size;
4891
4892 io_req_size -= xsize;
4893
4894 if (useVectorUPL && (xsize || (iov_base & PAGE_MASK))) {
4895 /*
4896 * We found something in the cache or we have an iov_base that's not
4897 * page-aligned.
4898 *
4899 * Issue all I/O's that have been collected within this Vectored UPL.
4900 */
4901 if (vector_upl_index) {
4902 retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
4903 reset_vector_run_state();
4904 }
4905
4906 if (xsize) {
4907 useVectorUPL = 0;
4908 }
4909
4910 /*
4911 * After this point, if we are using the Vector UPL path and the base is
4912 * not page-aligned then the UPL with that base will be the first in the vector UPL.
4913 */
4914 }
4915
4916 /*
4917 * check to see if we are finished with this request.
4918 *
4919 * If we satisfied this IO already, then io_req_size will be 0.
4920 * Otherwise, see if the IO was mis-aligned and needs to go through
4921 * the UBC to deal with the 'tail'.
4922 *
4923 */
4924 if (io_req_size == 0 || (misaligned)) {
4925 /*
4926 * see if there's another uio vector to
4927 * process that's of type IO_DIRECT
4928 *
4929 * break out of while loop to get there
4930 */
4931 break;
4932 }
4933 /*
4934 * assume the request ends on a device block boundary
4935 */
4936 io_min = devblocksize;
4937
4938 /*
4939 * we can handle I/O's in multiples of the device block size
4940 * however, if io_size isn't a multiple of devblocksize we
4941 * want to clip it back to the nearest page boundary since
4942 * we are going to have to go through cluster_read_copy to
4943 * deal with the 'overhang'... by clipping it to a PAGE_SIZE
4944 * multiple, we avoid asking the drive for the same physical
4945 * blocks twice.. once for the partial page at the end of the
4946 * request and a 2nd time for the page we read into the cache
4947 * (which overlaps the end of the direct read) in order to
4948 * get at the overhang bytes
4949 */
4950 if (io_size & (devblocksize - 1)) {
4951 assert(!(flags & IO_ENCRYPTED));
4952 /*
4953 * Clip the request to the previous page size boundary
4954 * since request does NOT end on a device block boundary
4955 */
4956 io_size &= ~PAGE_MASK;
4957 io_min = PAGE_SIZE;
4958 }
4959 if (retval || io_size < io_min) {
4960 /*
4961 * either an error or we only have the tail left to
4962 * complete via the copy path...
4963 * we may have already spun some portion of this request
4964 * off as async requests... we need to wait for the I/O
4965 * to complete before returning
4966 */
4967 goto wait_for_dreads;
4968 }
4969
4970 /*
4971 * Don't re-check the UBC data if we are looking for uncached IO
4972 * or asking for encrypted blocks.
4973 */
4974 if ((flags & IO_ENCRYPTED) == 0) {
4975 if ((xsize = io_size) > max_rd_size) {
4976 xsize = max_rd_size;
4977 }
4978
4979 io_size = 0;
4980
4981 if (!lock) {
4982 /*
4983 * We hold a lock here between the time we check the
4984 * cache and the time we issue I/O. This saves us
4985 * from having to lock the pages in the cache. Not
4986 * all clients will care about this lock but some
4987 * clients may want to guarantee stability between
4988 * here and when the I/O is issued in which case they
4989 * will take the lock exclusively.
4990 */
4991 lock = cluster_lock_direct_read(vp, LCK_RW_TYPE_SHARED);
4992 }
4993
4994 ubc_range_op(vp, uio->uio_offset, uio->uio_offset + xsize, UPL_ROP_ABSENT, (int *)&io_size);
4995
4996 if (io_size == 0) {
4997 /*
4998 * a page must have just come into the cache
4999 * since the first page in this range is no
5000 * longer absent, go back and re-evaluate
5001 */
5002 continue;
5003 }
5004 }
5005 if ((flags & IO_RETURN_ON_THROTTLE)) {
5006 if (cluster_is_throttled(vp) == THROTTLE_NOW) {
5007 if (!cluster_io_present_in_BC(vp, uio->uio_offset)) {
5008 /*
5009 * we're in the throttle window and at least 1 I/O
5010 * has already been issued by a throttleable thread
5011 * in this window, so return with EAGAIN to indicate
5012 * to the FS issuing the cluster_read call that it
5013 * should now throttle after dropping any locks
5014 */
5015 throttle_info_update_by_mount(vp->v_mount);
5016
5017 io_throttled = TRUE;
5018 goto wait_for_dreads;
5019 }
5020 }
5021 }
5022 if (io_size > max_rd_size) {
5023 io_size = max_rd_size;
5024 }
5025
5026 iov_base = uio_curriovbase(uio);
5027
5028 upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK);
5029 upl_needed_size = (upl_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
5030
5031 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_START,
5032 (int)upl_offset, upl_needed_size, (int)iov_base, io_size, 0);
5033
5034 if (upl_offset == 0 && ((io_size & PAGE_MASK) == 0)) {
5035 no_zero_fill = 1;
5036 } else {
5037 no_zero_fill = 0;
5038 }
5039
5040 vm_map_t map = UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ? current_map() : kernel_map;
5041 for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) {
5042 pages_in_pl = 0;
5043 upl_size = (upl_size_t)upl_needed_size;
5044 upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
5045 if (no_zero_fill) {
5046 upl_flags |= UPL_NOZEROFILL;
5047 }
5048 if (force_data_sync) {
5049 upl_flags |= UPL_FORCE_DATA_SYNC;
5050 }
5051
5052 kret = vm_map_create_upl(map,
5053 (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
5054 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, VM_KERN_MEMORY_FILE);
5055
5056 if (kret != KERN_SUCCESS) {
5057 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
5058 (int)upl_offset, upl_size, io_size, kret, 0);
5059 /*
5060 * failed to get pagelist
5061 *
5062 * we may have already spun some portion of this request
5063 * off as async requests... we need to wait for the I/O
5064 * to complete before returning
5065 */
5066 goto wait_for_dreads;
5067 }
5068 pages_in_pl = upl_size / PAGE_SIZE;
5069 pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
5070
5071 for (i = 0; i < pages_in_pl; i++) {
5072 if (!upl_page_present(pl, i)) {
5073 break;
5074 }
5075 }
5076 if (i == pages_in_pl) {
5077 break;
5078 }
5079
5080 ubc_upl_abort(upl, 0);
5081 }
5082 if (force_data_sync >= 3) {
5083 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
5084 (int)upl_offset, upl_size, io_size, kret, 0);
5085
5086 goto wait_for_dreads;
5087 }
5088 /*
5089 * Consider the possibility that upl_size wasn't satisfied.
5090 */
5091 if (upl_size < upl_needed_size) {
5092 if (upl_size && upl_offset == 0) {
5093 io_size = upl_size;
5094 } else {
5095 io_size = 0;
5096 }
5097 }
5098 if (io_size == 0) {
5099 ubc_upl_abort(upl, 0);
5100 goto wait_for_dreads;
5101 }
5102 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
5103 (int)upl_offset, upl_size, io_size, kret, 0);
5104
5105 if (useVectorUPL) {
5106 vm_offset_t end_off = ((iov_base + io_size) & PAGE_MASK);
5107 if (end_off) {
5108 issueVectorUPL = 1;
5109 }
5110 /*
5111 * After this point, if we are using a vector UPL, then
5112 * either all the UPL elements end on a page boundary OR
5113 * this UPL is the last element because it does not end
5114 * on a page boundary.
5115 */
5116 }
5117
5118 /*
5119 * request asynchronously so that we can overlap
5120 * the preparation of the next I/O
5121 * if there are already too many outstanding reads
5122 * wait until some have completed before issuing the next read
5123 */
5124 cluster_iostate_wait(&iostate, max_rd_ahead, "cluster_read_direct");
5125
5126 if (iostate.io_error) {
5127 /*
5128 * one of the earlier reads we issued ran into a hard error
5129 * don't issue any more reads, cleanup the UPL
5130 * that was just created but not used, then
5131 * go wait for any other reads to complete before
5132 * returning the error to the caller
5133 */
5134 ubc_upl_abort(upl, 0);
5135
5136 goto wait_for_dreads;
5137 }
5138 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_START,
5139 upl, (int)upl_offset, (int)uio->uio_offset, io_size, 0);
5140
5141 if (!useVectorUPL) {
5142 if (no_zero_fill) {
5143 io_flag &= ~CL_PRESERVE;
5144 } else {
5145 io_flag |= CL_PRESERVE;
5146 }
5147
5148 retval = cluster_io(vp, upl, upl_offset, uio->uio_offset, io_size, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
5149 } else {
5150 if (!vector_upl_index) {
5151 vector_upl = vector_upl_create(upl_offset);
5152 v_upl_uio_offset = uio->uio_offset;
5153 vector_upl_offset = upl_offset;
5154 }
5155
5156 vector_upl_set_subupl(vector_upl, upl, upl_size);
5157 vector_upl_set_iostate(vector_upl, upl, vector_upl_size, upl_size);
5158 vector_upl_index++;
5159 vector_upl_size += upl_size;
5160 vector_upl_iosize += io_size;
5161
5162 if (issueVectorUPL || vector_upl_index == MAX_VECTOR_UPL_ELEMENTS || vector_upl_size >= max_vector_size) {
5163 retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
5164 reset_vector_run_state();
5165 }
5166 }
5167
5168 if (lock) {
5169 // We don't need to wait for the I/O to complete
5170 cluster_unlock_direct_read(lock);
5171 lock = NULL;
5172 }
5173
5174 /*
5175 * update the uio structure
5176 */
5177 if ((flags & IO_ENCRYPTED) && (max_io_size < io_size)) {
5178 uio_update(uio, (user_size_t)max_io_size);
5179 } else {
5180 uio_update(uio, (user_size_t)io_size);
5181 }
5182
5183 io_req_size -= io_size;
5184
5185 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_END,
5186 upl, (int)uio->uio_offset, io_req_size, retval, 0);
5187 } /* end while */
5188
5189 if (retval == 0 && iostate.io_error == 0 && io_req_size == 0 && uio->uio_offset < filesize) {
5190 retval = cluster_io_type(uio, read_type, read_length, 0);
5191
5192 if (retval == 0 && *read_type == IO_DIRECT) {
5193 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_NONE,
5194 (int)uio->uio_offset, (int)filesize, *read_type, *read_length, 0);
5195
5196 goto next_dread;
5197 }
5198 }
5199
5200 wait_for_dreads:
5201
5202 if (retval == 0 && iostate.io_error == 0 && useVectorUPL && vector_upl_index) {
5203 retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
5204 reset_vector_run_state();
5205 }
5206
5207 // We don't need to wait for the I/O to complete
5208 if (lock) {
5209 cluster_unlock_direct_read(lock);
5210 }
5211
5212 /*
5213 * make sure all async reads that are part of this stream
5214 * have completed before we return
5215 */
5216 cluster_iostate_wait(&iostate, 0, "cluster_read_direct");
5217
5218 if (iostate.io_error) {
5219 retval = iostate.io_error;
5220 }
5221
5222 lck_mtx_destroy(&iostate.io_mtxp, &cl_mtx_grp);
5223
5224 if (io_throttled == TRUE && retval == 0) {
5225 retval = EAGAIN;
5226 }
5227
5228 vm_map_offset_t current_page_size, current_page_mask;
5229 current_page_size = vm_map_page_size(current_map());
5230 current_page_mask = vm_map_page_mask(current_map());
5231 if (uio_acct) {
5232 off_t bytes_to_prefault = 0, bytes_prefaulted = 0;
5233 user_addr_t curr_iov_base = 0;
5234 user_addr_t curr_iov_end = 0;
5235 user_size_t curr_iov_len = 0;
5236
5237 bytes_to_prefault = uio_offset(uio) - uio_offset(uio_acct);
5238
5239 for (; bytes_prefaulted < bytes_to_prefault;) {
5240 curr_iov_base = uio_curriovbase(uio_acct);
5241 curr_iov_len = MIN(uio_curriovlen(uio_acct), bytes_to_prefault - bytes_prefaulted);
5242 curr_iov_end = curr_iov_base + curr_iov_len;
5243
5244 for (; curr_iov_base < curr_iov_end;) {
5245 /*
5246 * This is specifically done for pmap accounting purposes.
5247 * vm_pre_fault() will call vm_fault() to enter the page into
5248 * the pmap if there isn't _a_ physical page for that VA already.
5249 */
5250 vm_pre_fault(vm_map_trunc_page(curr_iov_base, current_page_mask), VM_PROT_READ);
5251 curr_iov_base += current_page_size;
5252 bytes_prefaulted += current_page_size;
5253 }
5254 /*
5255 * Use update instead of advance so we can see how many iovs we processed.
5256 */
5257 uio_update(uio_acct, curr_iov_len);
5258 }
5259 uio_free(uio_acct);
5260 uio_acct = NULL;
5261 }
5262
5263 if (io_req_size && retval == 0) {
5264 /*
5265 * we couldn't handle the tail of this request in DIRECT mode
5266 * so fire it through the copy path
5267 */
5268 if (flags & IO_ENCRYPTED) {
5269 /*
5270 * We cannot fall back to the copy path for encrypted I/O. If this
5271 * happens, there is something wrong with the user buffer passed
5272 * down.
5273 */
5274 retval = EFAULT;
5275 } else {
5276 retval = cluster_read_copy(vp, uio, io_req_size, filesize, flags, callback, callback_arg);
5277 }
5278
5279 *read_type = IO_UNKNOWN;
5280 }
5281 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_END,
5282 (int)uio->uio_offset, (int)uio_resid(uio), io_req_size, retval, 0);
5283
5284 return retval;
5285 }
5286
5287
/*
 * cluster_read_contig
 *
 * Service a direct read whose destination memory is physically
 * contiguous for *read_length bytes (the caller determined this via
 * cluster_io_type() returning IO_CONTIG).  I/O is issued with
 * CL_DEV_MEMORY directly against the wired target pages; head and
 * tail fragments that do not line up with the device block size are
 * moved with cluster_align_phys_io().  Up to MAX_VECTS physically
 * contiguous iovecs may be consumed in a single call.
 *
 * Returns 0 on success, EINVAL for pagelist/alignment failures, or
 * an errno propagated from cluster_io()/cluster_align_phys_io().
 */
static int
cluster_read_contig(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t *read_length,
    int (*callback)(buf_t, void *), void *callback_arg, int flags)
{
	upl_page_info_t *pl;
	upl_t upl[MAX_VECTS];
	vm_offset_t upl_offset;
	addr64_t dst_paddr = 0;
	user_addr_t iov_base;
	off_t max_size;
	upl_size_t upl_size;
	vm_size_t upl_needed_size;
	mach_msg_type_number_t pages_in_pl;
	upl_control_flags_t upl_flags;
	kern_return_t kret;
	struct clios iostate;
	int error = 0;
	int cur_upl = 0;
	int num_upl = 0;
	int n;
	u_int32_t xsize;
	u_int32_t io_size;
	u_int32_t devblocksize;
	u_int32_t mem_alignment_mask;
	u_int32_t tail_size = 0;
	int bflag;

	if (flags & IO_PASSIVE) {
		bflag = CL_PASSIVE;
	} else {
		bflag = 0;
	}

	if (flags & IO_NOCACHE) {
		bflag |= CL_NOCACHE;
	}

	/*
	 * When we enter this routine, we know
	 *  -- the read_length will not exceed the current iov_len
	 *  -- the target address is physically contiguous for read_length
	 */
	cluster_syncup(vp, filesize, callback, callback_arg, PUSH_SYNC);

	devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize;
	mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask;

	/* shared async-read completion state for all I/Os in this stream */
	iostate.io_completed = 0;
	iostate.io_issued = 0;
	iostate.io_error = 0;
	iostate.io_wanted = 0;

	lck_mtx_init(&iostate.io_mtxp, &cl_mtx_grp, LCK_ATTR_NULL);

next_cread:
	io_size = *read_length;

	max_size = filesize - uio->uio_offset;

	/* never read past EOF */
	if (io_size > max_size) {
		io_size = (u_int32_t)max_size;
	}

	iov_base = uio_curriovbase(uio);

	upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK);
	upl_needed_size = upl_offset + io_size;

	pages_in_pl = 0;
	upl_size = (upl_size_t)upl_needed_size;
	upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;


	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 92)) | DBG_FUNC_START,
	    (int)upl_offset, (int)upl_size, (int)iov_base, io_size, 0);

	/* wire the user (or kernel) buffer pages so the device can target them */
	vm_map_t map = UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ? current_map() : kernel_map;
	kret = vm_map_get_upl(map,
	    vm_map_trunc_page(iov_base, vm_map_page_mask(map)),
	    &upl_size, &upl[cur_upl], NULL, &pages_in_pl, &upl_flags, VM_KERN_MEMORY_FILE, 0);

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 92)) | DBG_FUNC_END,
	    (int)upl_offset, upl_size, io_size, kret, 0);

	if (kret != KERN_SUCCESS) {
		/*
		 * failed to get pagelist
		 */
		error = EINVAL;
		goto wait_for_creads;
	}
	num_upl++;

	if (upl_size < upl_needed_size) {
		/*
		 * The upl_size wasn't satisfied.
		 */
		error = EINVAL;
		goto wait_for_creads;
	}
	pl = ubc_upl_pageinfo(upl[cur_upl]);

	/* physical address corresponding to the start of the target buffer */
	dst_paddr = ((addr64_t)upl_phys_page(pl, 0) << PAGE_SHIFT) + (addr64_t)upl_offset;

	while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {
		u_int32_t head_size;

		head_size = devblocksize - (u_int32_t)(uio->uio_offset & (devblocksize - 1));

		if (head_size > io_size) {
			head_size = io_size;
		}

		/* transfer the sub-devblock head via the alignment path */
		error = cluster_align_phys_io(vp, uio, dst_paddr, head_size, CL_READ, callback, callback_arg);

		if (error) {
			goto wait_for_creads;
		}

		upl_offset += head_size;
		dst_paddr += head_size;
		io_size -= head_size;

		iov_base += head_size;
	}
	if ((u_int32_t)iov_base & mem_alignment_mask) {
		/*
		 * request doesn't set up on a memory boundary
		 * the underlying DMA engine can handle...
		 * return an error instead of going through
		 * the slow copy path since the intent of this
		 * path is direct I/O to device memory
		 */
		error = EINVAL;
		goto wait_for_creads;
	}

	/* the sub-devblock tail is handled after the main loop completes */
	tail_size = io_size & (devblocksize - 1);

	io_size -= tail_size;

	while (io_size && error == 0) {
		if (io_size > MAX_IO_CONTIG_SIZE) {
			xsize = MAX_IO_CONTIG_SIZE;
		} else {
			xsize = io_size;
		}
		/*
		 * request asynchronously so that we can overlap
		 * the preparation of the next I/O... we'll do
		 * the commit after all the I/O has completed
		 * since its all issued against the same UPL
		 * if there are already too many outstanding reads
		 * wait until some have completed before issuing the next
		 */
		cluster_iostate_wait(&iostate, MAX_IO_CONTIG_SIZE * IO_SCALE(vp, 2), "cluster_read_contig");

		if (iostate.io_error) {
			/*
			 * one of the earlier reads we issued ran into a hard error
			 * don't issue any more reads...
			 * go wait for any other reads to complete before
			 * returning the error to the caller
			 */
			goto wait_for_creads;
		}
		error = cluster_io(vp, upl[cur_upl], upl_offset, uio->uio_offset, xsize,
		    CL_READ | CL_NOZERO | CL_DEV_MEMORY | CL_ASYNC | bflag,
		    (buf_t)NULL, &iostate, callback, callback_arg);
		/*
		 * The cluster_io read was issued successfully,
		 * update the uio structure
		 */
		if (error == 0) {
			uio_update(uio, (user_size_t)xsize);

			dst_paddr += xsize;
			upl_offset += xsize;
			io_size -= xsize;
		}
	}
	if (error == 0 && iostate.io_error == 0 && tail_size == 0 && num_upl < MAX_VECTS && uio->uio_offset < filesize) {
		/* peek at the next segment; keep going if it's also contiguous */
		error = cluster_io_type(uio, read_type, read_length, 0);

		if (error == 0 && *read_type == IO_CONTIG) {
			cur_upl++;
			goto next_cread;
		}
	} else {
		*read_type = IO_UNKNOWN;
	}

wait_for_creads:
	/*
	 * make sure all async reads that are part of this stream
	 * have completed before we proceed
	 */
	cluster_iostate_wait(&iostate, 0, "cluster_read_contig");

	if (iostate.io_error) {
		error = iostate.io_error;
	}

	lck_mtx_destroy(&iostate.io_mtxp, &cl_mtx_grp);

	if (error == 0 && tail_size) {
		/* now that the bulk I/O is done, move the unaligned tail */
		error = cluster_align_phys_io(vp, uio, dst_paddr, tail_size, CL_READ, callback, callback_arg);
	}

	for (n = 0; n < num_upl; n++) {
		/*
		 * just release our hold on each physically contiguous
		 * region without changing any state
		 */
		ubc_upl_abort(upl[n], 0);
	}

	return error;
}
5507
5508
5509 static int
cluster_io_type(struct uio * uio,int * io_type,u_int32_t * io_length,u_int32_t min_length)5510 cluster_io_type(struct uio *uio, int *io_type, u_int32_t *io_length, u_int32_t min_length)
5511 {
5512 user_size_t iov_len;
5513 user_addr_t iov_base = 0;
5514 upl_t upl;
5515 upl_size_t upl_size;
5516 upl_control_flags_t upl_flags;
5517 int retval = 0;
5518
5519 /*
5520 * skip over any emtpy vectors
5521 */
5522 uio_update(uio, (user_size_t)0);
5523
5524 iov_len = uio_curriovlen(uio);
5525
5526 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 94)) | DBG_FUNC_START, uio, (int)iov_len, 0, 0, 0);
5527
5528 if (iov_len) {
5529 iov_base = uio_curriovbase(uio);
5530 /*
5531 * make sure the size of the vector isn't too big...
5532 * internally, we want to handle all of the I/O in
5533 * chunk sizes that fit in a 32 bit int
5534 */
5535 if (iov_len > (user_size_t)MAX_IO_REQUEST_SIZE) {
5536 upl_size = MAX_IO_REQUEST_SIZE;
5537 } else {
5538 upl_size = (u_int32_t)iov_len;
5539 }
5540
5541 upl_flags = UPL_QUERY_OBJECT_TYPE;
5542
5543 vm_map_t map = UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ? current_map() : kernel_map;
5544 if ((vm_map_get_upl(map,
5545 vm_map_trunc_page(iov_base, vm_map_page_mask(map)),
5546 &upl_size, &upl, NULL, NULL, &upl_flags, VM_KERN_MEMORY_FILE, 0)) != KERN_SUCCESS) {
5547 /*
5548 * the user app must have passed in an invalid address
5549 */
5550 retval = EFAULT;
5551 }
5552 if (upl_size == 0) {
5553 retval = EFAULT;
5554 }
5555
5556 *io_length = upl_size;
5557
5558 if (upl_flags & UPL_PHYS_CONTIG) {
5559 *io_type = IO_CONTIG;
5560 } else if (iov_len >= min_length) {
5561 *io_type = IO_DIRECT;
5562 } else {
5563 *io_type = IO_COPY;
5564 }
5565 } else {
5566 /*
5567 * nothing left to do for this uio
5568 */
5569 *io_length = 0;
5570 *io_type = IO_UNKNOWN;
5571 }
5572 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 94)) | DBG_FUNC_END, iov_base, *io_type, *io_length, retval, 0);
5573
5574 if (*io_type == IO_DIRECT &&
5575 vm_map_page_shift(current_map()) < PAGE_SHIFT) {
5576 /* no direct I/O for sub-page-size address spaces */
5577 DEBUG4K_VFS("io_type IO_DIRECT -> IO_COPY\n");
5578 *io_type = IO_COPY;
5579 }
5580
5581 return retval;
5582 }
5583
5584
5585 /*
5586 * generate advisory I/O's in the largest chunks possible
5587 * the completed pages will be released into the VM cache
5588 */
/*
 * advisory_read
 *
 * Issue advisory (speculative) read-ahead for the byte range
 * [f_offset, f_offset + resid) of 'vp'.  Convenience wrapper for
 * advisory_read_ext() with no completion callback and CL_PASSIVE
 * passed as the buffer flag.
 */
int
advisory_read(vnode_t vp, off_t filesize, off_t f_offset, int resid)
{
	return advisory_read_ext(vp, filesize, f_offset, resid, NULL, NULL, CL_PASSIVE);
}
5594
5595 int
advisory_read_ext(vnode_t vp,off_t filesize,off_t f_offset,int resid,int (* callback)(buf_t,void *),void * callback_arg,int bflag)5596 advisory_read_ext(vnode_t vp, off_t filesize, off_t f_offset, int resid, int (*callback)(buf_t, void *), void *callback_arg, int bflag)
5597 {
5598 upl_page_info_t *pl;
5599 upl_t upl;
5600 vm_offset_t upl_offset;
5601 int upl_size;
5602 off_t upl_f_offset;
5603 int start_offset;
5604 int start_pg;
5605 int last_pg;
5606 int pages_in_upl;
5607 off_t max_size;
5608 int io_size;
5609 kern_return_t kret;
5610 int retval = 0;
5611 int issued_io;
5612 int skip_range;
5613 uint32_t max_io_size;
5614
5615
5616 if (!UBCINFOEXISTS(vp)) {
5617 return EINVAL;
5618 }
5619
5620 if (f_offset < 0 || resid < 0) {
5621 return EINVAL;
5622 }
5623
5624 max_io_size = cluster_max_io_size(vp->v_mount, CL_READ);
5625
5626 if (disk_conditioner_mount_is_ssd(vp->v_mount)) {
5627 if (max_io_size > speculative_prefetch_max_iosize) {
5628 max_io_size = speculative_prefetch_max_iosize;
5629 }
5630 }
5631
5632 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_START,
5633 (int)f_offset, resid, (int)filesize, 0, 0);
5634
5635 while (resid && f_offset < filesize && retval == 0) {
5636 /*
5637 * compute the size of the upl needed to encompass
5638 * the requested read... limit each call to cluster_io
5639 * to the maximum UPL size... cluster_io will clip if
5640 * this exceeds the maximum io_size for the device,
5641 * make sure to account for
5642 * a starting offset that's not page aligned
5643 */
5644 start_offset = (int)(f_offset & PAGE_MASK_64);
5645 upl_f_offset = f_offset - (off_t)start_offset;
5646 max_size = filesize - f_offset;
5647
5648 if (resid < max_size) {
5649 io_size = resid;
5650 } else {
5651 io_size = (int)max_size;
5652 }
5653
5654 upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
5655 if ((uint32_t)upl_size > max_io_size) {
5656 upl_size = max_io_size;
5657 }
5658
5659 skip_range = 0;
5660 /*
5661 * return the number of contiguously present pages in the cache
5662 * starting at upl_f_offset within the file
5663 */
5664 ubc_range_op(vp, upl_f_offset, upl_f_offset + upl_size, UPL_ROP_PRESENT, &skip_range);
5665
5666 if (skip_range) {
5667 /*
5668 * skip over pages already present in the cache
5669 */
5670 io_size = skip_range - start_offset;
5671
5672 f_offset += io_size;
5673 resid -= io_size;
5674
5675 if (skip_range == upl_size) {
5676 continue;
5677 }
5678 /*
5679 * have to issue some real I/O
5680 * at this point, we know it's starting on a page boundary
5681 * because we've skipped over at least the first page in the request
5682 */
5683 start_offset = 0;
5684 upl_f_offset += skip_range;
5685 upl_size -= skip_range;
5686 }
5687 pages_in_upl = upl_size / PAGE_SIZE;
5688
5689 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) | DBG_FUNC_START,
5690 upl, (int)upl_f_offset, upl_size, start_offset, 0);
5691
5692 kret = ubc_create_upl_kernel(vp,
5693 upl_f_offset,
5694 upl_size,
5695 &upl,
5696 &pl,
5697 UPL_RET_ONLY_ABSENT | UPL_SET_LITE,
5698 VM_KERN_MEMORY_FILE);
5699 if (kret != KERN_SUCCESS) {
5700 return retval;
5701 }
5702 issued_io = 0;
5703
5704 /*
5705 * before we start marching forward, we must make sure we end on
5706 * a present page, otherwise we will be working with a freed
5707 * upl
5708 */
5709 for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--) {
5710 if (upl_page_present(pl, last_pg)) {
5711 break;
5712 }
5713 }
5714 pages_in_upl = last_pg + 1;
5715
5716
5717 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) | DBG_FUNC_END,
5718 upl, (int)upl_f_offset, upl_size, start_offset, 0);
5719
5720
5721 for (last_pg = 0; last_pg < pages_in_upl;) {
5722 /*
5723 * scan from the beginning of the upl looking for the first
5724 * page that is present.... this will become the first page in
5725 * the request we're going to make to 'cluster_io'... if all
5726 * of the pages are absent, we won't call through to 'cluster_io'
5727 */
5728 for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
5729 if (upl_page_present(pl, start_pg)) {
5730 break;
5731 }
5732 }
5733
5734 /*
5735 * scan from the starting present page looking for an absent
5736 * page before the end of the upl is reached, if we
5737 * find one, then it will terminate the range of pages being
5738 * presented to 'cluster_io'
5739 */
5740 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
5741 if (!upl_page_present(pl, last_pg)) {
5742 break;
5743 }
5744 }
5745
5746 if (last_pg > start_pg) {
5747 /*
5748 * we found a range of pages that must be filled
5749 * if the last page in this range is the last page of the file
5750 * we may have to clip the size of it to keep from reading past
5751 * the end of the last physical block associated with the file
5752 */
5753 upl_offset = start_pg * PAGE_SIZE;
5754 io_size = (last_pg - start_pg) * PAGE_SIZE;
5755
5756 if ((off_t)(upl_f_offset + upl_offset + io_size) > filesize) {
5757 io_size = (int)(filesize - (upl_f_offset + upl_offset));
5758 }
5759
5760 /*
5761 * issue an asynchronous read to cluster_io
5762 */
5763 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size,
5764 CL_ASYNC | CL_READ | CL_COMMIT | CL_AGE | bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
5765
5766 issued_io = 1;
5767 }
5768 }
5769 if (issued_io == 0) {
5770 ubc_upl_abort(upl, 0);
5771 }
5772
5773 io_size = upl_size - start_offset;
5774
5775 if (io_size > resid) {
5776 io_size = resid;
5777 }
5778 f_offset += io_size;
5779 resid -= io_size;
5780 }
5781
5782 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_END,
5783 (int)f_offset, resid, retval, 0, 0);
5784
5785 return retval;
5786 }
5787
5788
/*
 * cluster_push
 *
 * Push any pending write-behind state for 'vp' to disk.
 * Equivalent to cluster_push_ext() with no per-buffer callback.
 */
int
cluster_push(vnode_t vp, int flags)
{
	return cluster_push_ext(vp, flags, NULL, NULL);
}
5794
5795
/*
 * cluster_push_ext
 *
 * Same as cluster_push_err(), but discards the per-push error detail
 * (passes a NULL 'err' out-parameter).
 */
int
cluster_push_ext(vnode_t vp, int flags, int (*callback)(buf_t, void *), void *callback_arg)
{
	return cluster_push_err(vp, flags, callback, callback_arg, NULL);
}
5801
5802 /* write errors via err, but return the number of clusters written */
5803 extern uint32_t system_inshutdown;
5804 uint32_t cl_sparse_push_error = 0;
/*
 * cluster_push_err
 *
 * Flush the write-behind state (delayed-write clusters and/or the
 * sparse dirty-region map) for 'vp'.  Any I/O error is reported via
 * the optional 'err' out-parameter; the return value is the number
 * of clusters pushed (the sparse-map path reports 1).
 *
 * IO_SYNC in 'flags' gives fsync semantics: this thread takes the
 * cl_sparse_wait serialization token, waits for in-flight sparse
 * pushes to drain before pushing, and waits for outstanding writes
 * on the vnode before releasing the token.
 */
int
cluster_push_err(vnode_t vp, int flags, int (*callback)(buf_t, void *), void *callback_arg, int *err)
{
	int retval;
	int my_sparse_wait = 0;
	struct cl_writebehind *wbp;
	int local_err = 0;

	if (err) {
		*err = 0;
	}

	if (!UBCINFOEXISTS(vp)) {
		/* no ubc_info: nothing to push */
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, kdebug_vnode(vp), flags, 0, -1, 0);
		return 0;
	}
	/* return if deferred write is set */
	if (((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE) && (flags & IO_DEFWRITE)) {
		return 0;
	}
	if ((wbp = cluster_get_wbp(vp, CLW_RETURNLOCKED)) == NULL) {
		/* no write-behind context was ever allocated for this vnode */
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, kdebug_vnode(vp), flags, 0, -2, 0);
		return 0;
	}
	if (!ISSET(flags, IO_SYNC) && wbp->cl_number == 0 && wbp->cl_scmap == NULL) {
		/* async request and nothing is dirty... return with cl_lockw dropped */
		lck_mtx_unlock(&wbp->cl_lockw);

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, kdebug_vnode(vp), flags, 0, -3, 0);
		return 0;
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_START,
	    wbp->cl_scmap, wbp->cl_number, flags, 0, 0);

	/*
	 * if we have an fsync in progress, we don't want to allow any additional
	 * sync/fsync/close(s) to occur until it finishes.
	 * note that its possible for writes to continue to occur to this file
	 * while we're waiting and also once the fsync starts to clean if we're
	 * in the sparse map case
	 */
	while (wbp->cl_sparse_wait) {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_START, kdebug_vnode(vp), 0, 0, 0, 0);

		msleep((caddr_t)&wbp->cl_sparse_wait, &wbp->cl_lockw, PRIBIO + 1, "cluster_push_ext", NULL);

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_END, kdebug_vnode(vp), 0, 0, 0, 0);
	}
	if (flags & IO_SYNC) {
		/* take ownership of the serialization token */
		my_sparse_wait = 1;
		wbp->cl_sparse_wait = 1;

		/*
		 * this is an fsync (or equivalent)... we must wait for any existing async
		 * cleaning operations to complete before we evaluate the current state
		 * and finish cleaning... this ensures that all writes issued before this
		 * fsync actually get cleaned to the disk before this fsync returns
		 */
		while (wbp->cl_sparse_pushes) {
			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 98)) | DBG_FUNC_START, kdebug_vnode(vp), 0, 0, 0, 0);

			msleep((caddr_t)&wbp->cl_sparse_pushes, &wbp->cl_lockw, PRIBIO + 1, "cluster_push_ext", NULL);

			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 98)) | DBG_FUNC_END, kdebug_vnode(vp), 0, 0, 0, 0);
		}
	}
	if (wbp->cl_scmap) {
		void *scmap;

		if (wbp->cl_sparse_pushes < SPARSE_PUSH_LIMIT) {
			/*
			 * detach the sparse map and push it with cl_lockw
			 * dropped, so writers aren't blocked for the duration
			 */
			scmap = wbp->cl_scmap;
			wbp->cl_scmap = NULL;

			wbp->cl_sparse_pushes++;

			lck_mtx_unlock(&wbp->cl_lockw);

			retval = sparse_cluster_push(wbp, &scmap, vp, ubc_getsize(vp), PUSH_ALL, flags, callback, callback_arg, FALSE);

			lck_mtx_lock(&wbp->cl_lockw);

			wbp->cl_sparse_pushes--;

			if (retval) {
				if (wbp->cl_scmap != NULL) {
					/*
					 * panic("cluster_push_err: Expected NULL cl_scmap\n");
					 *
					 * This can happen if we get an error from the underlying FS
					 * e.g. ENOSPC, EPERM or EIO etc. We hope that these errors
					 * are transient and the I/Os will succeed at a later point.
					 *
					 * The tricky part here is that a new sparse cluster has been
					 * allocated and tracking a different set of dirty pages. So these
					 * pages are not going to be pushed out with the next sparse_cluster_push.
					 * An explicit msync or file close will, however, push the pages out.
					 *
					 * What if those calls still don't work? And so, during shutdown we keep
					 * trying till we succeed...
					 */

					if (system_inshutdown) {
						if ((retval == ENOSPC) && (vp->v_mount->mnt_flag & (MNT_LOCAL | MNT_REMOVABLE)) == MNT_LOCAL) {
							os_atomic_inc(&cl_sparse_push_error, relaxed);
						}
					} else {
						vfs_drt_control(&scmap, 0); /* emit stats and free this memory. Dirty pages stay intact. */
						scmap = NULL;
					}
				} else {
					/* re-attach the map so its dirty ranges can be retried later */
					wbp->cl_scmap = scmap;
				}
			}

			if (wbp->cl_sparse_wait && wbp->cl_sparse_pushes == 0) {
				wakeup((caddr_t)&wbp->cl_sparse_pushes);
			}
		} else {
			/* too many concurrent detached pushes; push in place under the lock */
			retval = sparse_cluster_push(wbp, &(wbp->cl_scmap), vp, ubc_getsize(vp), PUSH_ALL, flags, callback, callback_arg, FALSE);
		}

		local_err = retval;

		if (err) {
			*err = retval;
		}
		/* sparse path reports one cluster pushed */
		retval = 1;
	} else {
		retval = cluster_try_push(wbp, vp, ubc_getsize(vp), PUSH_ALL, flags, callback, callback_arg, &local_err, FALSE);
		if (err) {
			*err = local_err;
		}
	}
	lck_mtx_unlock(&wbp->cl_lockw);

	if (flags & IO_SYNC) {
		(void)vnode_waitforwrites(vp, 0, 0, 0, "cluster_push");
	}

	if (my_sparse_wait) {
		/*
		 * I'm the owner of the serialization token
		 * clear it and wakeup anyone that is waiting
		 * for me to finish
		 */
		lck_mtx_lock(&wbp->cl_lockw);

		wbp->cl_sparse_wait = 0;
		wakeup((caddr_t)&wbp->cl_sparse_wait);

		lck_mtx_unlock(&wbp->cl_lockw);
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_END,
	    wbp->cl_scmap, wbp->cl_number, retval, local_err, 0);

	return retval;
}
5961
5962
5963 __private_extern__ void
cluster_release(struct ubc_info * ubc)5964 cluster_release(struct ubc_info *ubc)
5965 {
5966 struct cl_writebehind *wbp;
5967 struct cl_readahead *rap;
5968
5969 if ((wbp = ubc->cl_wbehind)) {
5970 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_START, ubc, wbp->cl_scmap, 0, 0, 0);
5971
5972 if (wbp->cl_scmap) {
5973 vfs_drt_control(&(wbp->cl_scmap), 0);
5974 }
5975 lck_mtx_destroy(&wbp->cl_lockw, &cl_mtx_grp);
5976 zfree(cl_wr_zone, wbp);
5977 ubc->cl_wbehind = NULL;
5978 } else {
5979 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_START, ubc, 0, 0, 0, 0);
5980 }
5981
5982 if ((rap = ubc->cl_rahead)) {
5983 lck_mtx_destroy(&rap->cl_lockr, &cl_mtx_grp);
5984 zfree(cl_rd_zone, rap);
5985 ubc->cl_rahead = NULL;
5986 }
5987
5988 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_END, ubc, rap, wbp, 0, 0);
5989 }
5990
5991
/*
 * cluster_try_push
 *
 * Attempt to write out the delayed-write clusters attached to the
 * write-behind context.  Entered with wbp->cl_lockw held; the lock is
 * dropped around the actual I/O only when vm_initiated is TRUE.
 *
 * push_flag: PUSH_ALL pushes every cluster, otherwise only the first
 *            dirty one; PUSH_DELAY may decline to push a fully
 *            sequential set (see the long comment below); PUSH_SYNC
 *            makes each push synchronous.
 * err:       if non-NULL, receives the first I/O error seen by the push
 *            loop.  NOTE(review): on the 'dont_try' path *err is left
 *            untouched — callers appear to pre-initialize the variable
 *            they pass in; confirm against call sites.
 *
 * Returns the number of empty cluster slots (MAX_CLUSTERS - wbp->cl_number).
 */
static int
cluster_try_push(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int push_flag, int io_flags, int (*callback)(buf_t, void *), void *callback_arg, int *err, boolean_t vm_initiated)
{
	int cl_index;
	int cl_index1;
	int min_index;
	int cl_len;
	int cl_pushed = 0;
	struct cl_wextent l_clusters[MAX_CLUSTERS];
	u_int max_cluster_pgcount;
	int error = 0;

	max_cluster_pgcount = MAX_CLUSTER_SIZE(vp) / PAGE_SIZE;
	/*
	 * the write behind context exists and has
	 * already been locked...
	 */
	if (wbp->cl_number == 0) {
		/*
		 * no clusters to push
		 * return number of empty slots
		 */
		return MAX_CLUSTERS;
	}

	/*
	 * make a local 'sorted' copy of the clusters
	 * and clear wbp->cl_number so that new clusters can
	 * be developed
	 *
	 * (selection sort: repeatedly pick the non-empty cluster with the
	 * lowest b_addr, then mark its source slot empty)
	 */
	for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
		for (min_index = -1, cl_index1 = 0; cl_index1 < wbp->cl_number; cl_index1++) {
			if (wbp->cl_clusters[cl_index1].b_addr == wbp->cl_clusters[cl_index1].e_addr) {
				continue; /* empty cluster... skip it */
			}
			if (min_index == -1) {
				min_index = cl_index1;
			} else if (wbp->cl_clusters[cl_index1].b_addr < wbp->cl_clusters[min_index].b_addr) {
				min_index = cl_index1;
			}
		}
		if (min_index == -1) {
			break;
		}

		l_clusters[cl_index].b_addr = wbp->cl_clusters[min_index].b_addr;
		l_clusters[cl_index].e_addr = wbp->cl_clusters[min_index].e_addr;
		l_clusters[cl_index].io_flags = wbp->cl_clusters[min_index].io_flags;

		/* mark the source slot empty so it isn't selected again */
		wbp->cl_clusters[min_index].b_addr = wbp->cl_clusters[min_index].e_addr;
	}
	wbp->cl_number = 0;

	cl_len = cl_index;

	/* skip switching to the sparse cluster mechanism if on diskimage */
	if (((push_flag & PUSH_DELAY) && cl_len == MAX_CLUSTERS) &&
	    !(vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV)) {
		int i;

		/*
		 * determine if we appear to be writing the file sequentially
		 * if not, by returning without having pushed any clusters
		 * we will cause this vnode to be pushed into the sparse cluster mechanism
		 * used for managing more random I/O patterns
		 *
		 * we know that we've got all clusters currently in use and the next write doesn't fit into one of them...
		 * that's why we're in try_push with PUSH_DELAY...
		 *
		 * check to make sure that all the clusters except the last one are 'full'... and that each cluster
		 * is adjacent to the next (i.e. we're looking for sequential writes) they were sorted above
		 * so we can just make a simple pass through, up to, but not including the last one...
		 * note that e_addr is not inclusive, so it will be equal to the b_addr of the next cluster if they
		 * are sequential
		 *
		 * we let the last one be partial as long as it was adjacent to the previous one...
		 * we need to do this to deal with multi-threaded servers that might write an I/O or 2 out
		 * of order... if this occurs at the tail of the last cluster, we don't want to fall into the sparse cluster world...
		 */
		for (i = 0; i < MAX_CLUSTERS - 1; i++) {
			if ((l_clusters[i].e_addr - l_clusters[i].b_addr) != max_cluster_pgcount) {
				goto dont_try;
			}
			if (l_clusters[i].e_addr != l_clusters[i + 1].b_addr) {
				goto dont_try;
			}
		}
	}
	if (vm_initiated == TRUE) {
		/* drop the write-behind lock across the actual page-outs */
		lck_mtx_unlock(&wbp->cl_lockw);
	}

	for (cl_index = 0; cl_index < cl_len; cl_index++) {
		int flags;
		struct cl_extent cl;
		int retval;

		flags = io_flags & (IO_PASSIVE | IO_CLOSE);

		/*
		 * try to push each cluster in turn...
		 */
		if (l_clusters[cl_index].io_flags & CLW_IONOCACHE) {
			flags |= IO_NOCACHE;
		}

		if (l_clusters[cl_index].io_flags & CLW_IOPASSIVE) {
			flags |= IO_PASSIVE;
		}

		if (push_flag & PUSH_SYNC) {
			flags |= IO_SYNC;
		}

		cl.b_addr = l_clusters[cl_index].b_addr;
		cl.e_addr = l_clusters[cl_index].e_addr;

		retval = cluster_push_now(vp, &cl, EOF, flags, callback, callback_arg, vm_initiated);

		if (retval == 0) {
			cl_pushed++;

			/* mark the local slot empty so the merge-back below skips it */
			l_clusters[cl_index].b_addr = 0;
			l_clusters[cl_index].e_addr = 0;
		} else if (error == 0) {
			/* remember only the first error */
			error = retval;
		}

		if (!(push_flag & PUSH_ALL)) {
			break;
		}
	}
	if (vm_initiated == TRUE) {
		lck_mtx_lock(&wbp->cl_lockw);
	}

	if (err) {
		*err = error;
	}

dont_try:
	if (cl_len > cl_pushed) {
		/*
		 * we didn't push all of the clusters, so
		 * lets try to merge them back in to the vnode
		 */
		if ((MAX_CLUSTERS - wbp->cl_number) < (cl_len - cl_pushed)) {
			/*
			 * we picked up some new clusters while we were trying to
			 * push the old ones... this can happen because I've dropped
			 * the vnode lock... the sum of the
			 * leftovers plus the new cluster count exceeds our ability
			 * to represent them, so switch to the sparse cluster mechanism
			 *
			 * collect the active public clusters...
			 */
			sparse_cluster_switch(wbp, vp, EOF, callback, callback_arg, vm_initiated);

			for (cl_index = 0, cl_index1 = 0; cl_index < cl_len; cl_index++) {
				if (l_clusters[cl_index].b_addr == l_clusters[cl_index].e_addr) {
					continue; /* empty (already pushed) */
				}
				wbp->cl_clusters[cl_index1].b_addr = l_clusters[cl_index].b_addr;
				wbp->cl_clusters[cl_index1].e_addr = l_clusters[cl_index].e_addr;
				wbp->cl_clusters[cl_index1].io_flags = l_clusters[cl_index].io_flags;

				cl_index1++;
			}
			/*
			 * update the cluster count
			 */
			wbp->cl_number = cl_index1;

			/*
			 * and collect the original clusters that were moved into the
			 * local storage for sorting purposes
			 */
			sparse_cluster_switch(wbp, vp, EOF, callback, callback_arg, vm_initiated);
		} else {
			/*
			 * we've got room to merge the leftovers back in
			 * just append them starting at the next 'hole'
			 * represented by wbp->cl_number
			 */
			for (cl_index = 0, cl_index1 = wbp->cl_number; cl_index < cl_len; cl_index++) {
				if (l_clusters[cl_index].b_addr == l_clusters[cl_index].e_addr) {
					continue;
				}

				wbp->cl_clusters[cl_index1].b_addr = l_clusters[cl_index].b_addr;
				wbp->cl_clusters[cl_index1].e_addr = l_clusters[cl_index].e_addr;
				wbp->cl_clusters[cl_index1].io_flags = l_clusters[cl_index].io_flags;

				cl_index1++;
			}
			/*
			 * update the cluster count
			 */
			wbp->cl_number = cl_index1;
		}
	}
	return MAX_CLUSTERS - wbp->cl_number;
}
6195
6196
6197
/*
 * cluster_push_now
 *
 * Push the dirty pages covered by the extent 'cl' (page numbers; e_addr
 * is exclusive) out to disk.  If vm_initiated, the work is delegated to
 * vnode_pageout(); otherwise a UPL of the dirty pages is gathered and
 * runs of dirty pages are issued through cluster_io().
 *
 * Returns 0 on success or the first I/O error encountered.
 */
static int
cluster_push_now(vnode_t vp, struct cl_extent *cl, off_t EOF, int flags,
    int (*callback)(buf_t, void *), void *callback_arg, boolean_t vm_initiated)
{
	upl_page_info_t *pl;
	upl_t upl;
	vm_offset_t upl_offset;
	int upl_size;
	off_t upl_f_offset;
	int pages_in_upl;
	int start_pg;
	int last_pg;
	int io_size;
	int io_flags;
	int upl_flags;
	int bflag;
	int size;
	int error = 0;
	int retval;
	kern_return_t kret;

	if (flags & IO_PASSIVE) {
		bflag = CL_PASSIVE;
	} else {
		bflag = 0;
	}

	if (flags & IO_SKIP_ENCRYPTION) {
		bflag |= CL_ENCRYPTED;
	}

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_START,
	    (int)cl->b_addr, (int)cl->e_addr, (int)EOF, flags, 0);

	/* empty extent... nothing to push */
	if ((pages_in_upl = (int)(cl->e_addr - cl->b_addr)) == 0) {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 0, 0, 0, 0);

		return 0;
	}
	upl_size = pages_in_upl * PAGE_SIZE;
	upl_f_offset = (off_t)(cl->b_addr * PAGE_SIZE_64);

	if (upl_f_offset + upl_size >= EOF) {
		if (upl_f_offset >= EOF) {
			/*
			 * must have truncated the file and missed
			 * clearing a dangling cluster (i.e. it's completely
			 * beyond the new EOF
			 */
			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 1, 0, 0, 0);

			return 0;
		}
		/* clip the push to EOF, rounded up to a page boundary */
		size = (int)(EOF - upl_f_offset);

		upl_size = (size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
		pages_in_upl = upl_size / PAGE_SIZE;
	} else {
		size = upl_size;
	}


	if (vm_initiated) {
		/* VM-driven push: let the pager do the work and return its error */
		vnode_pageout(vp, NULL, (upl_offset_t)0, upl_f_offset, (upl_size_t)upl_size,
		    UPL_MSYNC | UPL_VNODE_PAGER | UPL_KEEPCACHED, &error);

		return error;
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_START, upl_size, size, 0, 0, 0);

	/*
	 * by asking for UPL_COPYOUT_FROM and UPL_RET_ONLY_DIRTY, we get the following desirable behavior
	 *
	 * - only pages that are currently dirty are returned... these are the ones we need to clean
	 * - the hardware dirty bit is cleared when the page is gathered into the UPL... the software dirty bit is set
	 * - if we have to abort the I/O for some reason, the software dirty bit is left set since we didn't clean the page
	 * - when we commit the page, the software dirty bit is cleared... the hardware dirty bit is untouched so that if
	 *   someone dirties this page while the I/O is in progress, we don't lose track of the new state
	 *
	 * when the I/O completes, we no longer ask for an explicit clear of the DIRTY state (either soft or hard)
	 */

	if ((vp->v_flag & VNOCACHE_DATA) || (flags & IO_NOCACHE)) {
		upl_flags = UPL_COPYOUT_FROM | UPL_RET_ONLY_DIRTY | UPL_SET_LITE | UPL_WILL_BE_DUMPED;
	} else {
		upl_flags = UPL_COPYOUT_FROM | UPL_RET_ONLY_DIRTY | UPL_SET_LITE;
	}

	kret = ubc_create_upl_kernel(vp,
	    upl_f_offset,
	    upl_size,
	    &upl,
	    &pl,
	    upl_flags,
	    VM_KERN_MEMORY_FILE);
	if (kret != KERN_SUCCESS) {
		panic("cluster_push: failed to get pagelist");
	}

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_END, upl, upl_f_offset, 0, 0, 0);

	/*
	 * since we only asked for the dirty pages back
	 * it's possible that we may only get a few or even none, so...
	 * before we start marching forward, we must make sure we know
	 * where the last present page is in the UPL, otherwise we could
	 * end up working with a freed upl due to the FREE_ON_EMPTY semantics
	 * employed by commit_range and abort_range.
	 */
	for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--) {
		if (upl_page_present(pl, last_pg)) {
			break;
		}
	}
	pages_in_upl = last_pg + 1;

	if (pages_in_upl == 0) {
		/* no pages came back at all... release the (empty) UPL */
		ubc_upl_abort(upl, 0);

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 2, 0, 0, 0);
		return 0;
	}

	for (last_pg = 0; last_pg < pages_in_upl;) {
		/*
		 * find the next dirty page in the UPL
		 * this will become the first page in the
		 * next I/O to generate
		 */
		for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
			if (upl_dirty_page(pl, start_pg)) {
				break;
			}
			if (upl_page_present(pl, start_pg)) {
				/*
				 * RET_ONLY_DIRTY will return non-dirty 'precious' pages
				 * just release these unchanged since we're not going
				 * to steal them or change their state
				 */
				ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
			}
		}
		if (start_pg >= pages_in_upl) {
			/*
			 * done... no more dirty pages to push
			 */
			break;
		}
		if (start_pg > last_pg) {
			/*
			 * skipped over some non-dirty pages
			 */
			size -= ((start_pg - last_pg) * PAGE_SIZE);
		}

		/*
		 * find a range of dirty pages to write
		 */
		for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
			if (!upl_dirty_page(pl, last_pg)) {
				break;
			}
		}
		upl_offset = start_pg * PAGE_SIZE;

		/* last run may be clipped by the EOF-limited 'size' */
		io_size = min(size, (last_pg - start_pg) * PAGE_SIZE);

		io_flags = CL_THROTTLE | CL_COMMIT | CL_AGE | bflag;

		if (!(flags & IO_SYNC)) {
			io_flags |= CL_ASYNC;
		}

		if (flags & IO_CLOSE) {
			io_flags |= CL_CLOSE;
		}

		if (flags & IO_NOCACHE) {
			io_flags |= CL_NOCACHE;
		}

		retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size,
		    io_flags, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);

		/* keep the first error but continue issuing the remaining runs */
		if (error == 0 && retval) {
			error = retval;
		}

		size -= io_size;
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 3, error, 0, 0);

	return error;
}
6392
6393
6394 /*
6395 * sparse_cluster_switch is called with the write behind lock held
6396 */
static int
sparse_cluster_switch(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int (*callback)(buf_t, void *), void *callback_arg, boolean_t vm_initiated)
{
	int cl_index;
	int error = 0;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_START, kdebug_vnode(vp), wbp->cl_scmap, wbp->cl_number, 0, 0);

	/*
	 * migrate each fixed cluster into the sparse (dirty-region-tracking)
	 * map: walk the cluster a page at a time and add only the pages that
	 * are actually dirty according to ubc_page_op()
	 */
	for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
		int flags;
		struct cl_extent cl;

		for (cl.b_addr = wbp->cl_clusters[cl_index].b_addr; cl.b_addr < wbp->cl_clusters[cl_index].e_addr; cl.b_addr++) {
			if (ubc_page_op(vp, (off_t)(cl.b_addr * PAGE_SIZE_64), 0, NULL, &flags) == KERN_SUCCESS) {
				if (flags & UPL_POP_DIRTY) {
					/* single-page extent (e_addr is exclusive) */
					cl.e_addr = cl.b_addr + 1;

					error = sparse_cluster_add(wbp, &(wbp->cl_scmap), vp, &cl, EOF, callback, callback_arg, vm_initiated);

					if (error) {
						break;
					}
				}
			}
		}
	}
	/*
	 * drop the clusters that were fully migrated; on error, cl_index is
	 * the index of the cluster that failed, so the remainder stay behind
	 */
	wbp->cl_number -= cl_index;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_END, kdebug_vnode(vp), wbp->cl_scmap, wbp->cl_number, error, 0);

	return error;
}
6429
6430
6431 /*
6432 * sparse_cluster_push must be called with the write-behind lock held if the scmap is
6433 * still associated with the write-behind context... however, if the scmap has been disassociated
6434 * from the write-behind context (the cluster_push case), the wb lock is not held
6435 */
static int
sparse_cluster_push(struct cl_writebehind *wbp, void **scmap, vnode_t vp, off_t EOF, int push_flag,
    int io_flags, int (*callback)(buf_t, void *), void *callback_arg, boolean_t vm_initiated)
{
	struct cl_extent cl;
	off_t offset;
	u_int length;
	void *l_scmap;
	int error = 0;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_START, kdebug_vnode(vp), (*scmap), 0, push_flag, 0);

	if (push_flag & PUSH_ALL) {
		/* tell the DRT to hand back every cluster, not just "ripe" ones */
		vfs_drt_control(scmap, 1);
	}

	/* remember the map we started with so we can detect replacement below */
	l_scmap = *scmap;

	for (;;) {
		int retval;

		if (vfs_drt_get_cluster(scmap, &offset, &length) != KERN_SUCCESS) {
			/*
			 * Not finding anything to push will return KERN_FAILURE.
			 * Confusing since it isn't really a failure. But that's the
			 * reason we don't set 'error' here like we do below.
			 */
			break;
		}

		if (vm_initiated == TRUE) {
			/* drop the write-behind lock across the page-out */
			lck_mtx_unlock(&wbp->cl_lockw);
		}

		/* convert the byte range handed back by the DRT to page numbers */
		cl.b_addr = (daddr64_t)(offset / PAGE_SIZE_64);
		cl.e_addr = (daddr64_t)((offset + length) / PAGE_SIZE_64);

		retval = cluster_push_now(vp, &cl, EOF, io_flags, callback, callback_arg, vm_initiated);
		if (error == 0 && retval) {
			error = retval;
		}

		if (vm_initiated == TRUE) {
			lck_mtx_lock(&wbp->cl_lockw);

			if (*scmap != l_scmap) {
				/* the map was replaced while the lock was dropped... stop */
				break;
			}
		}

		if (error) {
			/* re-mark the range dirty so the pages aren't lost track of */
			if (vfs_drt_mark_pages(scmap, offset, length, NULL) != KERN_SUCCESS) {
				panic("Failed to restore dirty state on failure");
			}

			break;
		}

		if (!(push_flag & PUSH_ALL)) {
			break;
		}
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_END, kdebug_vnode(vp), (*scmap), error, 0, 0);

	return error;
}
6502
6503
6504 /*
6505 * sparse_cluster_add is called with the write behind lock held
6506 */
static int
sparse_cluster_add(struct cl_writebehind *wbp, void **scmap, vnode_t vp, struct cl_extent *cl, off_t EOF,
    int (*callback)(buf_t, void *), void *callback_arg, boolean_t vm_initiated)
{
	u_int new_dirty;
	u_int length;
	off_t offset;
	int error = 0;
	int push_flag = 0; /* default; replaced below unless the scmap query fails */

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_START, (*scmap), 0, cl->b_addr, (int)cl->e_addr, 0);

	/* convert the page extent (e_addr exclusive) to a byte range */
	offset = (off_t)(cl->b_addr * PAGE_SIZE_64);
	length = ((u_int)(cl->e_addr - cl->b_addr)) * PAGE_SIZE;

	while (vfs_drt_mark_pages(scmap, offset, length, &new_dirty) != KERN_SUCCESS) {
		/*
		 * no room left in the map
		 * only a partial update was done
		 * push out some pages and try again
		 */

		if (vfs_get_scmap_push_behavior_internal(scmap, &push_flag)) {
			push_flag = 0;
		}

		error = sparse_cluster_push(wbp, scmap, vp, EOF, push_flag, 0, callback, callback_arg, vm_initiated);

		if (error) {
			break;
		}

		/* new_dirty pages were recorded before the map filled... retry the rest */
		offset += (new_dirty * PAGE_SIZE_64);
		length -= (new_dirty * PAGE_SIZE);
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_END, kdebug_vnode(vp), (*scmap), error, 0, 0);

	return error;
}
6546
6547
/*
 * cluster_align_phys_io
 *
 * Handle an unaligned head/tail fragment of a physically-addressed
 * transfer: bring the containing page into the cache (reading it if not
 * already valid), copy 'xsize' bytes between the user's physical buffer
 * and the cached page with copypv(), and write the page back when this
 * is a write (or when a read hit an already-dirty cached page).
 */
static int
cluster_align_phys_io(vnode_t vp, struct uio *uio, addr64_t usr_paddr, u_int32_t xsize, int flags, int (*callback)(buf_t, void *), void *callback_arg)
{
	upl_page_info_t *pl;
	upl_t upl;
	addr64_t ubc_paddr;
	kern_return_t kret;
	int error = 0;
	int did_read = 0;
	int abort_flags;
	int upl_flags;
	int bflag;

	if (flags & IO_PASSIVE) {
		bflag = CL_PASSIVE;
	} else {
		bflag = 0;
	}

	if (flags & IO_NOCACHE) {
		bflag |= CL_NOCACHE;
	}

	upl_flags = UPL_SET_LITE;

	if (!(flags & CL_READ)) {
		/*
		 * "write" operation:  let the UPL subsystem know
		 * that we intend to modify the buffer cache pages
		 * we're gathering.
		 */
		upl_flags |= UPL_WILL_MODIFY;
	} else {
		/*
		 * indicate that there is no need to pull the
		 * mapping for this page... we're only going
		 * to read from it, not modify it.
		 */
		upl_flags |= UPL_FILE_IO;
	}
	/* gather the single page containing uio_offset */
	kret = ubc_create_upl_kernel(vp,
	    uio->uio_offset & ~PAGE_MASK_64,
	    PAGE_SIZE,
	    &upl,
	    &pl,
	    upl_flags,
	    VM_KERN_MEMORY_FILE);

	if (kret != KERN_SUCCESS) {
		return EINVAL;
	}

	if (!upl_valid_page(pl, 0)) {
		/*
		 * issue a synchronous read to cluster_io
		 */
		error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE,
		    CL_READ | bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
		if (error) {
			ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);

			return error;
		}
		did_read = 1;
	}
	/* physical address of the byte within the cached page */
	ubc_paddr = ((addr64_t)upl_phys_page(pl, 0) << PAGE_SHIFT) + (addr64_t)(uio->uio_offset & PAGE_MASK_64);

	/*
	 * NOTE:  There is no prototype for the following in BSD. It, and the definitions
	 * of the defines for cppvPsrc, cppvPsnk, cppvFsnk, and cppvFsrc will be found in
	 * osfmk/ppc/mappings.h.  They are not included here because there appears to be no
	 * way to do so without exporting them to kexts as well.
	 */
	if (flags & CL_READ) {
//		copypv(ubc_paddr, usr_paddr, xsize, cppvPsrc | cppvPsnk | cppvFsnk);	/* Copy physical to physical and flush the destination */
		copypv(ubc_paddr, usr_paddr, xsize, 2 | 1 | 4);         /* Copy physical to physical and flush the destination */
	} else {
//		copypv(usr_paddr, ubc_paddr, xsize, cppvPsrc | cppvPsnk | cppvFsrc);	/* Copy physical to physical and flush the source */
		copypv(usr_paddr, ubc_paddr, xsize, 2 | 1 | 8);         /* Copy physical to physical and flush the source */
	}
	if (!(flags & CL_READ) || (upl_valid_page(pl, 0) && upl_dirty_page(pl, 0))) {
		/*
		 * issue a synchronous write to cluster_io
		 */
		error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE,
		    bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
	}
	if (error == 0) {
		/* advance the uio past the bytes we just moved */
		uio_update(uio, (user_size_t)xsize);
	}

	if (did_read) {
		abort_flags = UPL_ABORT_FREE_ON_EMPTY;
	} else {
		/* page was already cached... dump our reference without altering it */
		abort_flags = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;
	}

	ubc_upl_abort_range(upl, 0, PAGE_SIZE, abort_flags);

	return error;
}
6649
6650 int
cluster_copy_upl_data(struct uio * uio,upl_t upl,int upl_offset,int * io_resid)6651 cluster_copy_upl_data(struct uio *uio, upl_t upl, int upl_offset, int *io_resid)
6652 {
6653 int pg_offset;
6654 int pg_index;
6655 int csize;
6656 int segflg;
6657 int retval = 0;
6658 int xsize;
6659 upl_page_info_t *pl;
6660 int dirty_count;
6661
6662 xsize = *io_resid;
6663
6664 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
6665 (int)uio->uio_offset, upl_offset, xsize, 0, 0);
6666
6667 segflg = uio->uio_segflg;
6668
6669 switch (segflg) {
6670 case UIO_USERSPACE32:
6671 case UIO_USERISPACE32:
6672 uio->uio_segflg = UIO_PHYS_USERSPACE32;
6673 break;
6674
6675 case UIO_USERSPACE:
6676 case UIO_USERISPACE:
6677 uio->uio_segflg = UIO_PHYS_USERSPACE;
6678 break;
6679
6680 case UIO_USERSPACE64:
6681 case UIO_USERISPACE64:
6682 uio->uio_segflg = UIO_PHYS_USERSPACE64;
6683 break;
6684
6685 case UIO_SYSSPACE:
6686 uio->uio_segflg = UIO_PHYS_SYSSPACE;
6687 break;
6688 }
6689 pl = ubc_upl_pageinfo(upl);
6690
6691 pg_index = upl_offset / PAGE_SIZE;
6692 pg_offset = upl_offset & PAGE_MASK;
6693 csize = min(PAGE_SIZE - pg_offset, xsize);
6694
6695 dirty_count = 0;
6696 while (xsize && retval == 0) {
6697 addr64_t paddr;
6698
6699 paddr = ((addr64_t)upl_phys_page(pl, pg_index) << PAGE_SHIFT) + pg_offset;
6700 if ((uio->uio_rw == UIO_WRITE) && (upl_dirty_page(pl, pg_index) == FALSE)) {
6701 dirty_count++;
6702 }
6703
6704 retval = uiomove64(paddr, csize, uio);
6705
6706 pg_index += 1;
6707 pg_offset = 0;
6708 xsize -= csize;
6709 csize = min(PAGE_SIZE, xsize);
6710 }
6711 *io_resid = xsize;
6712
6713 uio->uio_segflg = segflg;
6714
6715 task_update_logical_writes(current_task(), (dirty_count * PAGE_SIZE), TASK_WRITE_DEFERRED, upl_lookup_vnode(upl));
6716 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
6717 (int)uio->uio_offset, xsize, retval, segflg, 0);
6718
6719 return retval;
6720 }
6721
6722
/*
 * cluster_copy_ubc_data
 *
 * Public wrapper around cluster_copy_ubc_data_internal() that always
 * takes a reference (take_reference == 1) on the pages touched.
 */
int
cluster_copy_ubc_data(vnode_t vp, struct uio *uio, int *io_resid, int mark_dirty)
{
	return cluster_copy_ubc_data_internal(vp, uio, io_resid, mark_dirty, 1);
}
6728
6729
/*
 * cluster_copy_ubc_data_internal
 *
 * Copy between the uio and the vnode's resident UBC pages via
 * memory_object_control_uiomove().  On return *io_resid holds the bytes
 * NOT transferred.  Returns 0 (treated as "nothing moved") when the
 * vnode has no memory object control.
 */
static int
cluster_copy_ubc_data_internal(vnode_t vp, struct uio *uio, int *io_resid, int mark_dirty, int take_reference)
{
	int segflg;
	int io_size;
	int xsize;
	int start_offset;
	int retval = 0;
	memory_object_control_t control;

	io_size = *io_resid;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
	    (int)uio->uio_offset, io_size, mark_dirty, take_reference, 0);

	control = ubc_getobject(vp, UBC_FLAGS_NONE);

	if (control == MEMORY_OBJECT_CONTROL_NULL) {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
		    (int)uio->uio_offset, io_size, retval, 3, 0);

		return 0;
	}
	/* switch the uio to the matching physical segment flag; restored below */
	segflg = uio->uio_segflg;

	switch (segflg) {
	case UIO_USERSPACE32:
	case UIO_USERISPACE32:
		uio->uio_segflg = UIO_PHYS_USERSPACE32;
		break;

	case UIO_USERSPACE64:
	case UIO_USERISPACE64:
		uio->uio_segflg = UIO_PHYS_USERSPACE64;
		break;

	case UIO_USERSPACE:
	case UIO_USERISPACE:
		uio->uio_segflg = UIO_PHYS_USERSPACE;
		break;

	case UIO_SYSSPACE:
		uio->uio_segflg = UIO_PHYS_SYSSPACE;
		break;
	}

	/* NOTE(review): io_size was already loaded from *io_resid above; this
	 * re-load looks redundant — confirm nothing modifies *io_resid between */
	if ((io_size = *io_resid)) {
		start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
		xsize = (int)uio_resid(uio);

		retval = memory_object_control_uiomove(control, uio->uio_offset - start_offset, uio,
		    start_offset, io_size, mark_dirty, take_reference);
		/* xsize becomes the number of bytes actually moved */
		xsize -= uio_resid(uio);
		io_size -= xsize;
	}
	uio->uio_segflg = segflg;
	*io_resid = io_size;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
	    (int)uio->uio_offset, io_size, retval, 0x80000000 | segflg, 0);

	return retval;
}
6793
6794
6795 int
is_file_clean(vnode_t vp,off_t filesize)6796 is_file_clean(vnode_t vp, off_t filesize)
6797 {
6798 off_t f_offset;
6799 int flags;
6800 int total_dirty = 0;
6801
6802 for (f_offset = 0; f_offset < filesize; f_offset += PAGE_SIZE_64) {
6803 if (ubc_page_op(vp, f_offset, 0, NULL, &flags) == KERN_SUCCESS) {
6804 if (flags & UPL_POP_DIRTY) {
6805 total_dirty++;
6806 }
6807 }
6808 }
6809 if (total_dirty) {
6810 return EINVAL;
6811 }
6812
6813 return 0;
6814 }
6815
6816
6817
6818 /*
6819 * Dirty region tracking/clustering mechanism.
6820 *
6821 * This code (vfs_drt_*) provides a mechanism for tracking and clustering
6822 * dirty regions within a larger space (file). It is primarily intended to
6823 * support clustering in large files with many dirty areas.
6824 *
6825 * The implementation assumes that the dirty regions are pages.
6826 *
6827 * To represent dirty pages within the file, we store bit vectors in a
6828 * variable-size circular hash.
6829 */
6830
6831 /*
6832 * Bitvector size. This determines the number of pages we group in a
6833 * single hashtable entry. Each hashtable entry is aligned to this
6834 * size within the file.
6835 */
/* number of pages covered by one hashtable entry's bitvector (256KiB span) */
#define DRT_BITVECTOR_PAGES             ((1024 * 256) / PAGE_SIZE)

/*
 * File offset handling.
 *
 * DRT_ADDRESS_MASK is dependent on DRT_BITVECTOR_PAGES;
 * the correct formula is  (~((DRT_BITVECTOR_PAGES * PAGE_SIZE) - 1))
 */
#define DRT_ADDRESS_MASK                (~((DRT_BITVECTOR_PAGES * PAGE_SIZE) - 1))
/* round a file offset down to the 256KiB span covered by one entry */
#define DRT_ALIGN_ADDRESS(addr)         ((addr) & DRT_ADDRESS_MASK)
6846
6847 /*
6848 * Hashtable address field handling.
6849 *
6850 * The low-order bits of the hashtable address are used to conserve
6851 * space.
6852 *
6853 * DRT_HASH_COUNT_MASK must be large enough to store the range
6854 * 0-DRT_BITVECTOR_PAGES inclusive, as well as have one value
6855 * to indicate that the bucket is actually unoccupied.
6856 */
/* aligned file offset stored in an entry's control word */
#define DRT_HASH_GET_ADDRESS(scm, i)    ((scm)->scm_hashtable[(i)].dhe_control & DRT_ADDRESS_MASK)
#define DRT_HASH_SET_ADDRESS(scm, i, a)                                                                 \
	do {                                                                                            \
	        (scm)->scm_hashtable[(i)].dhe_control =                                                 \
	            ((scm)->scm_hashtable[(i)].dhe_control & ~DRT_ADDRESS_MASK) | DRT_ALIGN_ADDRESS(a); \
	} while (0)
/* low-order bits of the control word hold the dirty-page count */
#define DRT_HASH_COUNT_MASK             0x1ff
#define DRT_HASH_GET_COUNT(scm, i)      ((scm)->scm_hashtable[(i)].dhe_control & DRT_HASH_COUNT_MASK)
#define DRT_HASH_SET_COUNT(scm, i, c)                                                                                  \
	do {                                                                                                           \
	        (scm)->scm_hashtable[(i)].dhe_control =                                                                \
	            ((scm)->scm_hashtable[(i)].dhe_control & ~DRT_HASH_COUNT_MASK) | ((c) & DRT_HASH_COUNT_MASK);      \
	} while (0)
#define DRT_HASH_CLEAR(scm, i)                                                                                         \
	do {                                                                                                           \
	        (scm)->scm_hashtable[(i)].dhe_control = 0;                                                             \
	} while (0)
/* a count of DRT_HASH_COUNT_MASK is the sentinel for an unoccupied bucket */
#define DRT_HASH_VACATE(scm, i)         DRT_HASH_SET_COUNT((scm), (i), DRT_HASH_COUNT_MASK)
#define DRT_HASH_VACANT(scm, i)         (DRT_HASH_GET_COUNT((scm), (i)) == DRT_HASH_COUNT_MASK)
/*
 * Copy one hashtable entry (control word + dirty bitvector) from slot
 * 'oi' of map 'oscm' into slot 'i' of map 'scm'.
 *
 * Fixed: the macro body previously ended with "while(0);" — the trailing
 * semicolon inside the definition defeats the do/while(0) idiom, making
 * "DRT_HASH_COPY(...);" expand to two statements and breaking use in a
 * brace-less if/else.  The semicolon now belongs to the caller.
 */
#define DRT_HASH_COPY(oscm, oi, scm, i)                                                                 \
	do {                                                                                            \
	        (scm)->scm_hashtable[(i)].dhe_control = (oscm)->scm_hashtable[(oi)].dhe_control;       \
	        DRT_BITVECTOR_COPY(oscm, oi, scm, i);                                                  \
	} while (0)
6881
6882
6883 #if !defined(XNU_TARGET_OS_OSX)
6884 /*
6885 * Hash table moduli.
6886 *
6887 * Since the hashtable entry's size is dependent on the size of
6888 * the bitvector, and since the hashtable size is constrained to
6889 * both being prime and fitting within the desired allocation
6890 * size, these values need to be manually determined.
6891 *
6892 * For DRT_BITVECTOR_SIZE = 64, the entry size is 16 bytes.
6893 *
6894 * The small hashtable allocation is 4096 bytes, so the modulus is 251.
6895 * The large hashtable allocation is 32768 bytes, so the modulus is 2039.
6896 * The xlarge hashtable allocation is 131072 bytes, so the modulus is 8179.
6897 */
6898
6899 #define DRT_HASH_SMALL_MODULUS 251
6900 #define DRT_HASH_LARGE_MODULUS 2039
6901 #define DRT_HASH_XLARGE_MODULUS 8179
6902
6903 /*
6904 * Physical memory required before the large hash modulus is permitted.
6905 *
6906 * On small memory systems, the large hash modulus can lead to phsyical
6907 * memory starvation, so we avoid using it there.
6908 */
6909 #define DRT_HASH_LARGE_MEMORY_REQUIRED (1024LL * 1024LL * 1024LL) /* 1GiB */
6910 #define DRT_HASH_XLARGE_MEMORY_REQUIRED (8 * 1024LL * 1024LL * 1024LL) /* 8GiB */
6911
6912 #define DRT_SMALL_ALLOCATION 4096 /* 80 bytes spare */
6913 #define DRT_LARGE_ALLOCATION 32768 /* 144 bytes spare */
6914 #define DRT_XLARGE_ALLOCATION 131072 /* 208 bytes spare */
6915
6916 #else /* XNU_TARGET_OS_OSX */
6917 /*
6918 * Hash table moduli.
6919 *
6920 * Since the hashtable entry's size is dependent on the size of
6921 * the bitvector, and since the hashtable size is constrained to
6922 * both being prime and fitting within the desired allocation
6923 * size, these values need to be manually determined.
6924 *
6925 * For DRT_BITVECTOR_SIZE = 64, the entry size is 16 bytes.
6926 *
6927 * The small hashtable allocation is 16384 bytes, so the modulus is 1019.
6928 * The large hashtable allocation is 131072 bytes, so the modulus is 8179.
6929 * The xlarge hashtable allocation is 524288 bytes, so the modulus is 32749.
6930 */
6931
6932 #define DRT_HASH_SMALL_MODULUS 1019
6933 #define DRT_HASH_LARGE_MODULUS 8179
6934 #define DRT_HASH_XLARGE_MODULUS 32749
6935
6936 /*
6937 * Physical memory required before the large hash modulus is permitted.
6938 *
 * On small memory systems, the large hash modulus can lead to physical
6940 * memory starvation, so we avoid using it there.
6941 */
6942 #define DRT_HASH_LARGE_MEMORY_REQUIRED (4 * 1024LL * 1024LL * 1024LL) /* 4GiB */
6943 #define DRT_HASH_XLARGE_MEMORY_REQUIRED (32 * 1024LL * 1024LL * 1024LL) /* 32GiB */
6944
6945 #define DRT_SMALL_ALLOCATION 16384 /* 80 bytes spare */
6946 #define DRT_LARGE_ALLOCATION 131072 /* 208 bytes spare */
6947 #define DRT_XLARGE_ALLOCATION 524288 /* 304 bytes spare */
6948
6949 #endif /* ! XNU_TARGET_OS_OSX */
6950
6951 /* *** nothing below here has secret dependencies on DRT_BITVECTOR_PAGES *** */
6952
6953 /*
6954 * Hashtable entry.
6955 */
struct vfs_drt_hashentry {
	/*
	 * Packed control word for this entry: holds the entry's aligned
	 * base address and its dirty-page count, accessed only through
	 * the DRT_HASH_* accessor macros (DRT_HASH_GET_ADDRESS,
	 * DRT_HASH_SET_COUNT, ...).
	 */
	u_int64_t dhe_control;
/*
 * dhe_bitvector was declared as dhe_bitvector[DRT_BITVECTOR_PAGES / 32];
 * DRT_BITVECTOR_PAGES is defined as ((1024 * 256) / PAGE_SIZE)
 * Since PAGE_SIZE is only known at boot time,
 * -define MAX_DRT_BITVECTOR_PAGES for smallest supported page size (4k)
 * -declare dhe_bitvector array for largest possible length
 */
#define MAX_DRT_BITVECTOR_PAGES (1024 * 256)/( 4 * 1024)
	/* one bit per page in this entry's 256KiB span; set => page dirty */
	u_int32_t dhe_bitvector[MAX_DRT_BITVECTOR_PAGES / 32];
};
6968
6969 /*
6970 * Hashtable bitvector handling.
6971 *
6972 * Bitvector fields are 32 bits long.
6973 */
6974
/*
 * The mask constant is unsigned (1U): left-shifting a signed 1 into
 * bit 31 (when (bit) % 32 == 31) is undefined behavior in C.
 */
#define DRT_HASH_SET_BIT(scm, i, bit) \
	(scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] |= (1U << ((bit) % 32))

#define DRT_HASH_CLEAR_BIT(scm, i, bit) \
	(scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] &= ~(1U << ((bit) % 32))

#define DRT_HASH_TEST_BIT(scm, i, bit) \
	((scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] & (1U << ((bit) % 32)))

/* Zero every word of entry (i)'s bitvector (full max-size array). */
#define DRT_BITVECTOR_CLEAR(scm, i) \
	bzero(&(scm)->scm_hashtable[(i)].dhe_bitvector[0], (MAX_DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t))

/* Copy the whole bitvector from entry (oi) of (oscm) to entry (i) of (scm). */
#define DRT_BITVECTOR_COPY(oscm, oi, scm, i) \
	bcopy(&(oscm)->scm_hashtable[(oi)].dhe_bitvector[0], \
	    &(scm)->scm_hashtable[(i)].dhe_bitvector[0], \
	    (MAX_DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t))
6991
6992 /*
6993 * Dirty Region Tracking structure.
6994 *
6995 * The hashtable is allocated entirely inside the DRT structure.
6996 *
6997 * The hash is a simple circular prime modulus arrangement, the structure
6998 * is resized from small to large if it overflows.
6999 */
7000
struct vfs_drt_clustermap {
	u_int32_t scm_magic;            /* sanity/detection */
#define DRT_SCM_MAGIC 0x12020003
	u_int32_t scm_modulus;          /* current ring size */
	u_int32_t scm_buckets;          /* number of occupied buckets */
	u_int32_t scm_lastclean;        /* last entry we cleaned */
	u_int32_t scm_iskips;           /* number of slot skips */

	/*
	 * Hashtable proper, allocated in the same kmem block as the
	 * header; actual length is scm_modulus entries.
	 */
	struct vfs_drt_hashentry scm_hashtable[0];
};


/* Map an aligned address to its home bucket. */
#define DRT_HASH(scm, addr) ((addr) % (scm)->scm_modulus)
/* Advance one slot around the circular table (linear probing). */
#define DRT_HASH_NEXT(scm, addr) (((addr) + 1) % (scm)->scm_modulus)
7015
7016 /*
7017 * Debugging codes and arguments.
7018 */
7019 #define DRT_DEBUG_EMPTYFREE (FSDBG_CODE(DBG_FSRW, 82)) /* nil */
7020 #define DRT_DEBUG_RETCLUSTER (FSDBG_CODE(DBG_FSRW, 83)) /* offset, length */
7021 #define DRT_DEBUG_ALLOC (FSDBG_CODE(DBG_FSRW, 84)) /* copycount */
7022 #define DRT_DEBUG_INSERT (FSDBG_CODE(DBG_FSRW, 85)) /* offset, iskip */
7023 #define DRT_DEBUG_MARK (FSDBG_CODE(DBG_FSRW, 86)) /* offset, length,
7024 * dirty */
7025 /* 0, setcount */
7026 /* 1 (clean, no map) */
7027 /* 2 (map alloc fail) */
7028 /* 3, resid (partial) */
7029 #define DRT_DEBUG_6 (FSDBG_CODE(DBG_FSRW, 87))
7030 #define DRT_DEBUG_SCMDATA (FSDBG_CODE(DBG_FSRW, 88)) /* modulus, buckets,
7031 * lastclean, iskips */
7032
7033
7034 static kern_return_t vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp);
7035 static kern_return_t vfs_drt_free_map(struct vfs_drt_clustermap *cmap);
7036 static kern_return_t vfs_drt_search_index(struct vfs_drt_clustermap *cmap,
7037 u_int64_t offset, int *indexp);
7038 static kern_return_t vfs_drt_get_index(struct vfs_drt_clustermap **cmapp,
7039 u_int64_t offset,
7040 int *indexp,
7041 int recursed);
7042 static kern_return_t vfs_drt_do_mark_pages(
7043 void **cmapp,
7044 u_int64_t offset,
7045 u_int length,
7046 u_int *setcountp,
7047 int dirty);
7048 static void vfs_drt_trace(
7049 struct vfs_drt_clustermap *cmap,
7050 int code,
7051 int arg1,
7052 int arg2,
7053 int arg3,
7054 int arg4);
7055
7056
7057 /*
7058 * Allocate and initialise a sparse cluster map.
7059 *
7060 * Will allocate a new map, resize or compact an existing map.
7061 *
7062 * XXX we should probably have at least one intermediate map size,
7063 * as the 1:16 ratio seems a bit drastic.
7064 */
static kern_return_t
vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp)
{
	struct vfs_drt_clustermap *cmap = NULL, *ocmap = NULL;
	kern_return_t kret = KERN_SUCCESS;
	u_int64_t offset = 0;
	u_int32_t i = 0;
	int modulus_size = 0, map_size = 0, active_buckets = 0, index = 0, copycount = 0;

	/* capture the old map (if any) so we can rehash its entries below */
	ocmap = NULL;
	if (cmapp != NULL) {
		ocmap = *cmapp;
	}

	/*
	 * Decide on the size of the new map.
	 */
	if (ocmap == NULL) {
		/* first allocation: start with the smallest table */
		modulus_size = DRT_HASH_SMALL_MODULUS;
		map_size = DRT_SMALL_ALLOCATION;
	} else {
		/* count the number of active buckets in the old map */
		active_buckets = 0;
		for (i = 0; i < ocmap->scm_modulus; i++) {
			if (!DRT_HASH_VACANT(ocmap, i) &&
			    (DRT_HASH_GET_COUNT(ocmap, i) != 0)) {
				active_buckets++;
			}
		}
		/*
		 * If we're currently using the small allocation, check to
		 * see whether we should grow to the large one.
		 */
		if (ocmap->scm_modulus == DRT_HASH_SMALL_MODULUS) {
			/*
			 * If the ring is nearly full and we are allowed to
			 * use the large modulus, upgrade.
			 */
			if ((active_buckets > (DRT_HASH_SMALL_MODULUS - 5)) &&
			    (max_mem >= DRT_HASH_LARGE_MEMORY_REQUIRED)) {
				modulus_size = DRT_HASH_LARGE_MODULUS;
				map_size = DRT_LARGE_ALLOCATION;
			} else {
				/* not full enough, or not enough memory: stay small */
				modulus_size = DRT_HASH_SMALL_MODULUS;
				map_size = DRT_SMALL_ALLOCATION;
			}
		} else if (ocmap->scm_modulus == DRT_HASH_LARGE_MODULUS) {
			/* large -> xlarge upgrade, subject to the memory floor */
			if ((active_buckets > (DRT_HASH_LARGE_MODULUS - 5)) &&
			    (max_mem >= DRT_HASH_XLARGE_MEMORY_REQUIRED)) {
				modulus_size = DRT_HASH_XLARGE_MODULUS;
				map_size = DRT_XLARGE_ALLOCATION;
			} else {
				/*
				 * If the ring is completely full and we can't
				 * expand, there's nothing useful for us to do.
				 * Behave as though we had compacted into the new
				 * array and return.
				 */
				return KERN_SUCCESS;
			}
		} else {
			/* already using the xlarge modulus; compact in place */
			modulus_size = DRT_HASH_XLARGE_MODULUS;
			map_size = DRT_XLARGE_ALLOCATION;

			/*
			 * If the ring is completely full, there's
			 * nothing useful for us to do.  Behave as
			 * though we had compacted into the new
			 * array and return.
			 */
			if (active_buckets >= DRT_HASH_XLARGE_MODULUS) {
				return KERN_SUCCESS;
			}
		}
	}

	/*
	 * Allocate and initialise the new map.
	 */

	kret = kmem_alloc(kernel_map, (vm_offset_t *)&cmap, map_size,
	    KMA_DATA, VM_KERN_MEMORY_FILE);
	if (kret != KERN_SUCCESS) {
		return kret;
	}
	cmap->scm_magic = DRT_SCM_MAGIC;
	cmap->scm_modulus = modulus_size;
	cmap->scm_buckets = 0;
	cmap->scm_lastclean = 0;
	cmap->scm_iskips = 0;
	/* mark every bucket vacant with a cleared bitvector */
	for (i = 0; i < cmap->scm_modulus; i++) {
		DRT_HASH_CLEAR(cmap, i);
		DRT_HASH_VACATE(cmap, i);
		DRT_BITVECTOR_CLEAR(cmap, i);
	}

	/*
	 * If there's an old map, re-hash entries from it into the new map.
	 */
	copycount = 0;
	if (ocmap != NULL) {
		for (i = 0; i < ocmap->scm_modulus; i++) {
			/* skip empty buckets */
			if (DRT_HASH_VACANT(ocmap, i) ||
			    (DRT_HASH_GET_COUNT(ocmap, i) == 0)) {
				continue;
			}
			/* get new index */
			offset = DRT_HASH_GET_ADDRESS(ocmap, i);
			kret = vfs_drt_get_index(&cmap, offset, &index, 1);
			if (kret != KERN_SUCCESS) {
				/* XXX need to bail out gracefully here */
				panic("vfs_drt: new cluster map mysteriously too small");
				/* not reached: panic() is fatal; assignment is defensive */
				index = 0;
			}
			/* copy */
			DRT_HASH_COPY(ocmap, i, cmap, index);
			copycount++;
		}
	}

	/* log what we've done */
	vfs_drt_trace(cmap, DRT_DEBUG_ALLOC, copycount, 0, 0, 0);

	/*
	 * It's important to ensure that *cmapp always points to
	 * a valid map, so we must overwrite it before freeing
	 * the old map.
	 */
	*cmapp = cmap;
	if (ocmap != NULL) {
		/* emit stats into trace buffer */
		vfs_drt_trace(ocmap, DRT_DEBUG_SCMDATA,
		    ocmap->scm_modulus,
		    ocmap->scm_buckets,
		    ocmap->scm_lastclean,
		    ocmap->scm_iskips);

		vfs_drt_free_map(ocmap);
	}
	return KERN_SUCCESS;
}
7208
7209
7210 /*
7211 * Free a sparse cluster map.
7212 */
7213 static kern_return_t
vfs_drt_free_map(struct vfs_drt_clustermap * cmap)7214 vfs_drt_free_map(struct vfs_drt_clustermap *cmap)
7215 {
7216 vm_size_t map_size = 0;
7217
7218 if (cmap->scm_modulus == DRT_HASH_SMALL_MODULUS) {
7219 map_size = DRT_SMALL_ALLOCATION;
7220 } else if (cmap->scm_modulus == DRT_HASH_LARGE_MODULUS) {
7221 map_size = DRT_LARGE_ALLOCATION;
7222 } else if (cmap->scm_modulus == DRT_HASH_XLARGE_MODULUS) {
7223 map_size = DRT_XLARGE_ALLOCATION;
7224 } else {
7225 panic("vfs_drt_free_map: Invalid modulus %d", cmap->scm_modulus);
7226 }
7227
7228 kmem_free(kernel_map, (vm_offset_t)cmap, map_size);
7229 return KERN_SUCCESS;
7230 }
7231
7232
7233 /*
7234 * Find the hashtable slot currently occupied by an entry for the supplied offset.
7235 */
7236 static kern_return_t
vfs_drt_search_index(struct vfs_drt_clustermap * cmap,u_int64_t offset,int * indexp)7237 vfs_drt_search_index(struct vfs_drt_clustermap *cmap, u_int64_t offset, int *indexp)
7238 {
7239 int index;
7240 u_int32_t i;
7241
7242 offset = DRT_ALIGN_ADDRESS(offset);
7243 index = DRT_HASH(cmap, offset);
7244
7245 /* traverse the hashtable */
7246 for (i = 0; i < cmap->scm_modulus; i++) {
7247 /*
7248 * If the slot is vacant, we can stop.
7249 */
7250 if (DRT_HASH_VACANT(cmap, index)) {
7251 break;
7252 }
7253
7254 /*
7255 * If the address matches our offset, we have success.
7256 */
7257 if (DRT_HASH_GET_ADDRESS(cmap, index) == offset) {
7258 *indexp = index;
7259 return KERN_SUCCESS;
7260 }
7261
7262 /*
7263 * Move to the next slot, try again.
7264 */
7265 index = DRT_HASH_NEXT(cmap, index);
7266 }
7267 /*
7268 * It's not there.
7269 */
7270 return KERN_FAILURE;
7271 }
7272
7273 /*
7274 * Find the hashtable slot for the supplied offset. If we haven't allocated
7275 * one yet, allocate one and populate the address field. Note that it will
7276 * not have a nonzero page count and thus will still technically be free, so
7277 * in the case where we are called to clean pages, the slot will remain free.
7278 */
static kern_return_t
vfs_drt_get_index(struct vfs_drt_clustermap **cmapp, u_int64_t offset, int *indexp, int recursed)
{
	struct vfs_drt_clustermap *cmap;
	kern_return_t kret;
	u_int32_t index;
	u_int32_t i;

	cmap = *cmapp;

	/* look for an existing entry */
	kret = vfs_drt_search_index(cmap, offset, indexp);
	if (kret == KERN_SUCCESS) {
		return kret;
	}

	/* need to allocate an entry */
	offset = DRT_ALIGN_ADDRESS(offset);
	index = DRT_HASH(cmap, offset);

	/* scan from the index forwards looking for a vacant slot */
	for (i = 0; i < cmap->scm_modulus; i++) {
		/* slot vacant, or allocated but holding no dirty pages (reusable)? */
		if (DRT_HASH_VACANT(cmap, index) || DRT_HASH_GET_COUNT(cmap, index) == 0) {
			cmap->scm_buckets++;
			/* pull lastclean back so the clean scan revisits this slot */
			if (index < cmap->scm_lastclean) {
				cmap->scm_lastclean = index;
			}
			DRT_HASH_SET_ADDRESS(cmap, index, offset);
			DRT_HASH_SET_COUNT(cmap, index, 0);
			DRT_BITVECTOR_CLEAR(cmap, index);
			*indexp = index;
			vfs_drt_trace(cmap, DRT_DEBUG_INSERT, (int)offset, i, 0, 0);
			return KERN_SUCCESS;
		}
		/* accumulate probe-distance statistics for this map */
		cmap->scm_iskips += i;
		index = DRT_HASH_NEXT(cmap, index);
	}

	/*
	 * We haven't found a vacant slot, so the map is full.  If we're not
	 * already recursed, try reallocating/compacting it.
	 */
	if (recursed) {
		return KERN_FAILURE;
	}
	kret = vfs_drt_alloc_map(cmapp);
	if (kret == KERN_SUCCESS) {
		/* now try to insert again; recursed=1 prevents a second resize */
		kret = vfs_drt_get_index(cmapp, offset, indexp, 1);
	}
	return kret;
}
7332
7333 /*
7334 * Implementation of set dirty/clean.
7335 *
7336 * In the 'clean' case, not finding a map is OK.
7337 */
static kern_return_t
vfs_drt_do_mark_pages(
	void **private,
	u_int64_t offset,
	u_int length,
	u_int *setcountp,
	int dirty)
{
	struct vfs_drt_clustermap *cmap, **cmapp;
	kern_return_t kret;
	int i, index, pgoff, pgcount, setcount, ecount;

	cmapp = (struct vfs_drt_clustermap **)private;
	cmap = *cmapp;

	vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_START, (int)offset, (int)length, dirty, 0);

	if (setcountp != NULL) {
		*setcountp = 0;
	}

	/* allocate a cluster map if we don't already have one */
	if (cmap == NULL) {
		/* no cluster map, nothing to clean */
		if (!dirty) {
			vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 1, 0, 0, 0);
			return KERN_SUCCESS;
		}
		kret = vfs_drt_alloc_map(cmapp);
		if (kret != KERN_SUCCESS) {
			vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 2, 0, 0, 0);
			return kret;
		}
	}
	setcount = 0;

	/*
	 * Iterate over the length of the region.
	 */
	while (length > 0) {
		/*
		 * Get the hashtable index for this offset.
		 *
		 * XXX this will add blank entries if we are clearing a range
		 * that hasn't been dirtied.
		 */
		kret = vfs_drt_get_index(cmapp, offset, &index, 0);
		cmap = *cmapp;  /* may have changed! */
		/* this may be a partial-success return */
		if (kret != KERN_SUCCESS) {
			/* report how many pages we managed before the failure */
			if (setcountp != NULL) {
				*setcountp = setcount;
			}
			vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 3, (int)length, 0, 0);

			return kret;
		}

		/*
		 * Work out how many pages we're modifying in this
		 * hashtable entry.
		 */
		pgoff = (int)((offset - DRT_ALIGN_ADDRESS(offset)) / PAGE_SIZE);
		pgcount = min((length / PAGE_SIZE), (DRT_BITVECTOR_PAGES - pgoff));

		/*
		 * Iterate over pages, dirty/clearing as we go.
		 * ecount mirrors the entry's dirty-page count so the summary
		 * stays consistent with the bitvector; setcount counts only
		 * bits actually flipped by this call.
		 */
		ecount = DRT_HASH_GET_COUNT(cmap, index);
		for (i = 0; i < pgcount; i++) {
			if (dirty) {
				if (!DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) {
					if (ecount >= DRT_BITVECTOR_PAGES) {
						panic("ecount >= DRT_BITVECTOR_PAGES, cmap = %p, index = %d, bit = %d", cmap, index, pgoff + i);
					}
					DRT_HASH_SET_BIT(cmap, index, pgoff + i);
					ecount++;
					setcount++;
				}
			} else {
				if (DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) {
					if (ecount <= 0) {
						panic("ecount <= 0, cmap = %p, index = %d, bit = %d", cmap, index, pgoff + i);
					}
					assert(ecount > 0);
					DRT_HASH_CLEAR_BIT(cmap, index, pgoff + i);
					ecount--;
					setcount++;
				}
			}
		}
		/* write the updated summary count back to the entry */
		DRT_HASH_SET_COUNT(cmap, index, ecount);

		/* advance to the pages covered by the next hash entry */
		offset += pgcount * PAGE_SIZE;
		length -= pgcount * PAGE_SIZE;
	}
	if (setcountp != NULL) {
		*setcountp = setcount;
	}

	vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 0, setcount, 0, 0);

	return KERN_SUCCESS;
}
7442
7443 /*
7444 * Mark a set of pages as dirty/clean.
7445 *
7446 * This is a public interface.
7447 *
7448 * cmapp
7449 * Pointer to storage suitable for holding a pointer. Note that
7450 * this must either be NULL or a value set by this function.
7451 *
7452 * size
7453 * Current file size in bytes.
7454 *
7455 * offset
7456 * Offset of the first page to be marked as dirty, in bytes. Must be
7457 * page-aligned.
7458 *
7459 * length
7460 * Length of dirty region, in bytes. Must be a multiple of PAGE_SIZE.
7461 *
7462 * setcountp
7463 * Number of pages newly marked dirty by this call (optional).
7464 *
7465 * Returns KERN_SUCCESS if all the pages were successfully marked.
7466 */
7467 static kern_return_t
vfs_drt_mark_pages(void ** cmapp,off_t offset,u_int length,u_int * setcountp)7468 vfs_drt_mark_pages(void **cmapp, off_t offset, u_int length, u_int *setcountp)
7469 {
7470 /* XXX size unused, drop from interface */
7471 return vfs_drt_do_mark_pages(cmapp, offset, length, setcountp, 1);
7472 }
7473
#if 0
/*
 * Counterpart to vfs_drt_mark_pages: mark a range of pages clean
 * (dirty = 0).  Currently compiled out; retained for reference.
 */
static kern_return_t
vfs_drt_unmark_pages(void **cmapp, off_t offset, u_int length)
{
	return vfs_drt_do_mark_pages(cmapp, offset, length, NULL, 0);
}
#endif
7481
7482 /*
7483 * Get a cluster of dirty pages.
7484 *
7485 * This is a public interface.
7486 *
7487 * cmapp
7488 * Pointer to storage managed by drt_mark_pages. Note that this must
7489 * be NULL or a value set by drt_mark_pages.
7490 *
7491 * offsetp
7492 * Returns the byte offset into the file of the first page in the cluster.
7493 *
7494 * lengthp
7495 * Returns the length in bytes of the cluster of dirty pages.
7496 *
7497 * Returns success if a cluster was found. If KERN_FAILURE is returned, there
 * are no dirty pages meeting the minimum size criteria. Private storage will
 * be released if there are no more dirty pages left in the map.
7500 *
7501 */
static kern_return_t
vfs_drt_get_cluster(void **cmapp, off_t *offsetp, u_int *lengthp)
{
	struct vfs_drt_clustermap *cmap;
	u_int64_t offset;
	u_int length;
	u_int32_t j;
	int index, i, fs, ls;

	/* sanity */
	if ((cmapp == NULL) || (*cmapp == NULL)) {
		return KERN_FAILURE;
	}
	cmap = *cmapp;

	/*
	 * Walk the hashtable: offset advances by one entry's span each
	 * iteration, so DRT_HASH() strides through the table; since the
	 * modulus is prime, this should touch every bucket across
	 * scm_modulus iterations.
	 */
	for (offset = 0, j = 0; j < cmap->scm_modulus; offset += (DRT_BITVECTOR_PAGES * PAGE_SIZE), j++) {
		index = DRT_HASH(cmap, offset);

		/* skip vacant buckets and buckets with no dirty pages */
		if (DRT_HASH_VACANT(cmap, index) || (DRT_HASH_GET_COUNT(cmap, index) == 0)) {
			continue;
		}

		/* scan the bitfield for a string of bits */
		fs = -1;

		/* fs: index of the first dirty page in this entry */
		for (i = 0; i < DRT_BITVECTOR_PAGES; i++) {
			if (DRT_HASH_TEST_BIT(cmap, index, i)) {
				fs = i;
				break;
			}
		}
		if (fs == -1) {
			/* didn't find any bits set */
			panic("vfs_drt: entry summary count > 0 but no bits set in map, cmap = %p, index = %d, count = %lld",
			    cmap, index, DRT_HASH_GET_COUNT(cmap, index));
		}
		/* ls: length of the contiguous run of dirty pages starting at fs */
		for (ls = 0; i < DRT_BITVECTOR_PAGES; i++, ls++) {
			if (!DRT_HASH_TEST_BIT(cmap, index, i)) {
				break;
			}
		}

		/* compute offset and length, mark pages clean */
		offset = DRT_HASH_GET_ADDRESS(cmap, index) + (PAGE_SIZE * fs);
		length = ls * PAGE_SIZE;
		vfs_drt_do_mark_pages(cmapp, offset, length, NULL, 0);
		cmap->scm_lastclean = index;

		/* return successful */
		*offsetp = (off_t)offset;
		*lengthp = length;

		vfs_drt_trace(cmap, DRT_DEBUG_RETCLUSTER, (int)offset, (int)length, 0, 0);
		return KERN_SUCCESS;
	}
	/*
	 * We didn't find anything... hashtable is empty
	 * emit stats into trace buffer and
	 * then free it
	 */
	vfs_drt_trace(cmap, DRT_DEBUG_SCMDATA,
	    cmap->scm_modulus,
	    cmap->scm_buckets,
	    cmap->scm_lastclean,
	    cmap->scm_iskips);

	vfs_drt_free_map(cmap);
	*cmapp = NULL;

	return KERN_FAILURE;
}
7574
7575
7576 static kern_return_t
vfs_drt_control(void ** cmapp,int op_type)7577 vfs_drt_control(void **cmapp, int op_type)
7578 {
7579 struct vfs_drt_clustermap *cmap;
7580
7581 /* sanity */
7582 if ((cmapp == NULL) || (*cmapp == NULL)) {
7583 return KERN_FAILURE;
7584 }
7585 cmap = *cmapp;
7586
7587 switch (op_type) {
7588 case 0:
7589 /* emit stats into trace buffer */
7590 vfs_drt_trace(cmap, DRT_DEBUG_SCMDATA,
7591 cmap->scm_modulus,
7592 cmap->scm_buckets,
7593 cmap->scm_lastclean,
7594 cmap->scm_iskips);
7595
7596 vfs_drt_free_map(cmap);
7597 *cmapp = NULL;
7598 break;
7599
7600 case 1:
7601 cmap->scm_lastclean = 0;
7602 break;
7603 }
7604 return KERN_SUCCESS;
7605 }
7606
7607
7608
7609 /*
7610 * Emit a summary of the state of the clustermap into the trace buffer
7611 * along with some caller-provided data.
7612 */
#if KDEBUG
/* Forward the event code and arguments to the kernel trace facility. */
static void
vfs_drt_trace(__unused struct vfs_drt_clustermap *cmap, int code, int arg1, int arg2, int arg3, int arg4)
{
	KERNEL_DEBUG(code, arg1, arg2, arg3, arg4, 0);
}
#else
/* Tracing compiled out: keep the call sites, discard the arguments. */
static void
vfs_drt_trace(__unused struct vfs_drt_clustermap *cmap, __unused int code,
    __unused int arg1, __unused int arg2, __unused int arg3,
    __unused int arg4)
{
}
#endif
7627
#if 0
/*
 * Perform basic sanity check on the hash entry summary count
 * vs. the actual bits set in the entry.
 *
 * Currently compiled out.
 * NOTE(review): if re-enabled, 'index' (int) is compared against the
 * u_int32_t scm_modulus, and 'bits_on' (int) against the wider result
 * of DRT_HASH_GET_COUNT() -- expect sign/width compare warnings.
 */
static void
vfs_drt_sanity(struct vfs_drt_clustermap *cmap)
{
	int index, i;
	int bits_on;

	for (index = 0; index < cmap->scm_modulus; index++) {
		if (DRT_HASH_VACANT(cmap, index)) {
			continue;
		}

		/* count the bits actually set in this entry's bitvector */
		for (bits_on = 0, i = 0; i < DRT_BITVECTOR_PAGES; i++) {
			if (DRT_HASH_TEST_BIT(cmap, index, i)) {
				bits_on++;
			}
		}
		/* the summary count must agree with the bitvector */
		if (bits_on != DRT_HASH_GET_COUNT(cmap, index)) {
			panic("bits_on = %d, index = %d", bits_on, index);
		}
	}
}
#endif
7655
7656 /*
7657 * Internal interface only.
7658 */
7659 static kern_return_t
vfs_get_scmap_push_behavior_internal(void ** cmapp,int * push_flag)7660 vfs_get_scmap_push_behavior_internal(void **cmapp, int *push_flag)
7661 {
7662 struct vfs_drt_clustermap *cmap;
7663
7664 /* sanity */
7665 if ((cmapp == NULL) || (*cmapp == NULL) || (push_flag == NULL)) {
7666 return KERN_FAILURE;
7667 }
7668 cmap = *cmapp;
7669
7670 if (cmap->scm_modulus == DRT_HASH_XLARGE_MODULUS) {
7671 /*
7672 * If we have a full xlarge sparse cluster,
7673 * we push it out all at once so the cluster
7674 * map can be available to absorb more I/Os.
7675 * This is done on large memory configs so
7676 * the small I/Os don't interfere with the
7677 * pro workloads.
7678 */
7679 *push_flag = PUSH_ALL;
7680 }
7681 return KERN_SUCCESS;
7682 }
7683