1 /*
2 * Copyright (c) 2000-2020 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29 /*
30 * Copyright (c) 1993
31 * The Regents of the University of California. All rights reserved.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
41 * 3. All advertising materials mentioning features or use of this software
42 * must display the following acknowledgement:
43 * This product includes software developed by the University of
44 * California, Berkeley and its contributors.
45 * 4. Neither the name of the University nor the names of its contributors
46 * may be used to endorse or promote products derived from this software
47 * without specific prior written permission.
48 *
49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59 * SUCH DAMAGE.
60 *
61 * @(#)vfs_cluster.c 8.10 (Berkeley) 3/28/95
62 */
63
64 #include <sys/param.h>
65 #include <sys/proc_internal.h>
66 #include <sys/buf_internal.h>
67 #include <sys/mount_internal.h>
68 #include <sys/vnode_internal.h>
69 #include <sys/trace.h>
70 #include <kern/kalloc.h>
71 #include <sys/time.h>
72 #include <sys/kernel.h>
73 #include <sys/resourcevar.h>
74 #include <miscfs/specfs/specdev.h>
75 #include <sys/uio_internal.h>
76 #include <libkern/libkern.h>
77 #include <machine/machine_routines.h>
78 #include <machine/smp.h>
79
80 #include <sys/ubc_internal.h>
81 #include <vm/vnode_pager.h>
82 #include <vm/vm_upl.h>
83
84 #include <mach/mach_types.h>
85 #include <mach/memory_object_types.h>
86 #include <mach/vm_map.h>
87 #include <mach/upl.h>
88 #include <mach/thread_info.h>
89 #include <kern/task.h>
90 #include <kern/policy_internal.h>
91 #include <kern/thread.h>
92
93 #include <vm/vm_kern_xnu.h>
94 #include <vm/vm_map_xnu.h>
95 #include <vm/vm_pageout_xnu.h>
96 #include <vm/vm_fault.h>
97 #include <vm/vm_ubc.h>
98
99 #include <sys/kdebug.h>
100 #include <sys/kdebug_triage.h>
101 #include <libkern/OSAtomic.h>
102
103 #include <sys/sdt.h>
104
105 #include <stdbool.h>
106
107 #include <vfs/vfs_disk_conditioner.h>
108
109 #if 0
110 #undef KERNEL_DEBUG
111 #define KERNEL_DEBUG KERNEL_DEBUG_CONSTANT
112 #endif
113
114
115 #define CL_READ 0x01
116 #define CL_WRITE 0x02
117 #define CL_ASYNC 0x04
118 #define CL_COMMIT 0x08
119 #define CL_PAGEOUT 0x10
120 #define CL_AGE 0x20
121 #define CL_NOZERO 0x40
122 #define CL_PAGEIN 0x80
123 #define CL_DEV_MEMORY 0x100
124 #define CL_PRESERVE 0x200
125 #define CL_THROTTLE 0x400
126 #define CL_KEEPCACHED 0x800
127 #define CL_DIRECT_IO 0x1000
128 #define CL_PASSIVE 0x2000
129 #define CL_IOSTREAMING 0x4000
130 #define CL_CLOSE 0x8000
131 #define CL_ENCRYPTED 0x10000
132 #define CL_RAW_ENCRYPTED 0x20000
133 #define CL_NOCACHE 0x40000
134 #define CL_DIRECT_IO_FSBLKSZ 0x80000
135
136 #define MAX_VECTOR_UPL_SIZE (2 * MAX_UPL_SIZE_BYTES)
137
138 #define CLUSTER_IO_WAITING ((buf_t)1)
139
140 extern void vector_upl_set_iostate(upl_t, upl_t, vm_offset_t, upl_size_t);
141
142 struct clios {
143 lck_mtx_t io_mtxp;
144 u_int io_completed; /* amount of io that has currently completed */
145 u_int io_issued; /* amount of io that was successfully issued */
146 int io_error; /* error code of first error encountered */
147 int io_wanted; /* someone is sleeping waiting for a change in state */
148 };
149
150 struct cl_direct_read_lock {
151 LIST_ENTRY(cl_direct_read_lock) chain;
152 int32_t ref_count;
153 vnode_t vp;
154 lck_rw_t rw_lock;
155 };
156
157 #define CL_DIRECT_READ_LOCK_BUCKETS 61
158
159 static LIST_HEAD(cl_direct_read_locks, cl_direct_read_lock)
160 cl_direct_read_locks[CL_DIRECT_READ_LOCK_BUCKETS];
161
162 static LCK_GRP_DECLARE(cl_mtx_grp, "cluster I/O");
163 static LCK_MTX_DECLARE(cl_transaction_mtxp, &cl_mtx_grp);
164 static LCK_SPIN_DECLARE(cl_direct_read_spin_lock, &cl_mtx_grp);
165
166 static ZONE_DEFINE(cl_rd_zone, "cluster_read",
167 sizeof(struct cl_readahead), ZC_ZFREE_CLEARMEM);
168
169 static ZONE_DEFINE(cl_wr_zone, "cluster_write",
170 sizeof(struct cl_writebehind), ZC_ZFREE_CLEARMEM);
171
172 #define IO_UNKNOWN 0
173 #define IO_DIRECT 1
174 #define IO_CONTIG 2
175 #define IO_COPY 3
176
177 #define PUSH_DELAY 0x01
178 #define PUSH_ALL 0x02
179 #define PUSH_SYNC 0x04
180
181
182 static void cluster_EOT(buf_t cbp_head, buf_t cbp_tail, int zero_offset, size_t verify_block_size);
183 static void cluster_wait_IO(buf_t cbp_head, int async);
184 static void cluster_complete_transaction(buf_t *cbp_head, void *callback_arg, int *retval, int flags, int needwait);
185
186 static int cluster_io_type(struct uio *uio, int *io_type, u_int32_t *io_length, u_int32_t min_length);
187
188 static int cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int non_rounded_size,
189 int flags, buf_t real_bp, struct clios *iostate, int (*)(buf_t, void *), void *callback_arg);
190 static void cluster_iodone_verify_continue(void);
191 static int cluster_iodone(buf_t bp, void *callback_arg);
192 static int cluster_iodone_finish(buf_t cbp_head, void *callback_arg);
193 static int cluster_ioerror(upl_t upl, int upl_offset, int abort_size, int error, int io_flags, vnode_t vp);
194 static int cluster_is_throttled(vnode_t vp);
195
196 static void cluster_iostate_wait(struct clios *iostate, u_int target, const char *wait_name);
197
198 static void cluster_syncup(vnode_t vp, off_t newEOF, int (*)(buf_t, void *), void *callback_arg, int flags);
199
200 static int cluster_handle_split_pagein(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset,
201 u_int io_size, int rounded_size, int local_flags, int (*callback)(buf_t, void *), void *callback_arg);
202
203 static void cluster_read_upl_release(upl_t upl, int start_pg, int last_pg, int take_reference);
204 static int cluster_copy_ubc_data_internal(vnode_t vp, struct uio *uio, int *io_resid, int mark_dirty, int take_reference);
205
206 static int cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t filesize, int flags,
207 int (*)(buf_t, void *), void *callback_arg) __attribute__((noinline));
208 static int cluster_read_direct(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t *read_length,
209 int flags, int (*)(buf_t, void *), void *callback_arg) __attribute__((noinline));
210 static int cluster_read_contig(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t *read_length,
211 int (*)(buf_t, void *), void *callback_arg, int flags) __attribute__((noinline));
212
213 static int cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t oldEOF, off_t newEOF,
214 off_t headOff, off_t tailOff, int flags, int (*)(buf_t, void *), void *callback_arg) __attribute__((noinline));
215 static int cluster_write_direct(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, int *write_type, u_int32_t *write_length,
216 int flags, int (*callback)(buf_t, void *), void *callback_arg, uint32_t min_io_size) __attribute__((noinline));
217 static int cluster_write_contig(vnode_t vp, struct uio *uio, off_t newEOF,
218 int *write_type, u_int32_t *write_length, int (*)(buf_t, void *), void *callback_arg, int bflag) __attribute__((noinline));
219
220 static void cluster_update_state_internal(vnode_t vp, struct cl_extent *cl, int flags, boolean_t defer_writes, boolean_t *first_pass,
221 off_t write_off, int write_cnt, off_t newEOF, int (*callback)(buf_t, void *), void *callback_arg, boolean_t vm_initiated);
222
223 static int cluster_align_phys_io(vnode_t vp, struct uio *uio, addr64_t usr_paddr, u_int32_t xsize, int flags, int (*)(buf_t, void *), void *callback_arg);
224
225 static int cluster_read_prefetch(vnode_t vp, off_t f_offset, u_int size, off_t filesize, int (*callback)(buf_t, void *), void *callback_arg, int bflag);
226 static void cluster_read_ahead(vnode_t vp, struct cl_extent *extent, off_t filesize, struct cl_readahead *ra,
227 int (*callback)(buf_t, void *), void *callback_arg, int bflag);
228
229 static int cluster_push_now(vnode_t vp, struct cl_extent *, off_t EOF, int flags, int (*)(buf_t, void *), void *callback_arg, boolean_t vm_initiated);
230
231 static int cluster_try_push(struct cl_writebehind *, vnode_t vp, off_t EOF, int push_flag, int flags, int (*)(buf_t, void *),
232 void *callback_arg, int *err, boolean_t vm_initiated);
233
234 static int sparse_cluster_switch(struct cl_writebehind *, vnode_t vp, off_t EOF, int (*)(buf_t, void *), void *callback_arg, boolean_t vm_initiated);
235 static int sparse_cluster_push(struct cl_writebehind *, void **cmapp, vnode_t vp, off_t EOF, int push_flag,
236 int io_flags, int (*)(buf_t, void *), void *callback_arg, boolean_t vm_initiated);
237 static int sparse_cluster_add(struct cl_writebehind *, void **cmapp, vnode_t vp, struct cl_extent *, off_t EOF,
238 int (*)(buf_t, void *), void *callback_arg, boolean_t vm_initiated);
239
240 static kern_return_t vfs_drt_mark_pages(void **cmapp, off_t offset, u_int length, u_int *setcountp);
241 static kern_return_t vfs_drt_get_cluster(void **cmapp, off_t *offsetp, u_int *lengthp);
242 static kern_return_t vfs_drt_control(void **cmapp, int op_type);
243 static kern_return_t vfs_get_scmap_push_behavior_internal(void **cmapp, int *push_flag);
244
245
246 /*
247 * For throttled IO to check whether
248 * a block is cached by the boot cache
249 * and thus it can avoid delaying the IO.
250 *
251 * bootcache_contains_block is initially
252 * NULL. The BootCache will set it while
253 * the cache is active and clear it when
254 * the cache is jettisoned.
255 *
256 * Returns 0 if the block is not
257 * contained in the cache, 1 if it is
258 * contained.
259 *
260 * The function pointer remains valid
261 * after the cache has been evicted even
262 * if bootcache_contains_block has been
263 * cleared.
264 *
265 * See rdar://9974130 The new throttling mechanism breaks the boot cache for throttled IOs
266 */
267 int (*bootcache_contains_block)(dev_t device, u_int64_t blkno) = NULL;
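/*
 * A minimal usage sketch for the hook above (illustrative only; the real
 * caller is cluster_io_present_in_BC() further down, and 'dev'/'blkno'
 * here are just stand-ins for the device and block of interest).
 * Since the BootCache may clear the pointer at any time, sample it into
 * a local before calling through it:
 *
 *	int (*check)(dev_t, u_int64_t) = bootcache_contains_block;
 *
 *	if (check != NULL && check(dev, blkno)) {
 *		// the block is in the boot cache; no need to delay this IO
 *	}
 */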
268
269
270 /*
271 * limit the internal I/O size so that we
272 * can represent it in a 32 bit int
273 */
274 #define MAX_IO_REQUEST_SIZE (1024 * 1024 * 512)
275 #define MAX_IO_CONTIG_SIZE MAX_UPL_SIZE_BYTES
276 #define MAX_VECTS 16
277 /*
278 * The MIN_DIRECT_WRITE_SIZE governs how much I/O should be issued before we consider
279 * allowing the caller to bypass the buffer cache. For small I/Os (less than 16k),
280 * we have not historically allowed the write to bypass the UBC.
281 */
282 #define MIN_DIRECT_WRITE_SIZE (16384)
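/*
 * Rough reading of the threshold above (hypothetical request sizes; the
 * actual decision also depends on the other checks made on the direct
 * write path in cluster_write_direct):
 *
 *	a  4KB uncached write  -> below MIN_DIRECT_WRITE_SIZE, serviced
 *	                          through the UBC copy path
 *	a 64KB uncached write  -> large enough to be considered for the
 *	                          direct (buffer-cache-bypassing) path
 */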
283
284 #define WRITE_THROTTLE 6
285 #define WRITE_THROTTLE_SSD 2
286 #define WRITE_BEHIND 1
287 #define WRITE_BEHIND_SSD 1
288
289 #if !defined(XNU_TARGET_OS_OSX)
290 #define PREFETCH 1
291 #define PREFETCH_SSD 1
292 uint32_t speculative_prefetch_max = (2048 * 1024); /* maximum bytes in a speculative read-ahead */
293 uint32_t speculative_prefetch_max_iosize = (512 * 1024); /* maximum I/O size to use in a speculative read-ahead */
294 #else /* XNU_TARGET_OS_OSX */
295 #define PREFETCH 3
296 #define PREFETCH_SSD 2
297 uint32_t speculative_prefetch_max = (MAX_UPL_SIZE_BYTES * 3); /* maximum bytes in a speculative read-ahead */
298 uint32_t speculative_prefetch_max_iosize = (512 * 1024); /* maximum I/O size to use in a speculative read-ahead on SSDs */
299 #endif /* ! XNU_TARGET_OS_OSX */
300
301 /* maximum bytes for read-ahead */
302 uint32_t prefetch_max = (1024 * 1024 * 1024);
303 /* maximum bytes for outstanding reads */
304 uint32_t overlapping_read_max = (1024 * 1024 * 1024);
305 /* maximum bytes for outstanding writes */
306 uint32_t overlapping_write_max = (1024 * 1024 * 1024);
307
308 #define IO_SCALE(vp, base) (vp->v_mount->mnt_ioscale * (base))
309 #define MAX_CLUSTER_SIZE(vp) (cluster_max_io_size(vp->v_mount, CL_WRITE))
310
311 int speculative_reads_disabled = 0;
312
313 /*
314 * throttle the number of async writes that
315 * can be outstanding on a single vnode
316 * before we issue a synchronous write
317 */
318 #define THROTTLE_MAXCNT 0
319
320 uint32_t throttle_max_iosize = (128 * 1024);
321
322 #define THROTTLE_MAX_IOSIZE (throttle_max_iosize)
323
324 SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_max_iosize, CTLFLAG_RW | CTLFLAG_LOCKED, &throttle_max_iosize, 0, "");
325
326 uint32_t split_pgin = 1;
327 uint32_t split_all_pgin = 1;
328 uint32_t split_all_pgin_equal = 0;
329 uint32_t split_pgin_headio = 0;
330
331 SYSCTL_INT(_kern, OID_AUTO, split_pagein_io, CTLFLAG_RW | CTLFLAG_LOCKED, &split_pgin, 0, "");
332 #if DEVELOPMENT || DEBUG
333 SYSCTL_INT(_kern, OID_AUTO, split_pagein_io_all, CTLFLAG_RW | CTLFLAG_LOCKED, &split_all_pgin, 0, "");
334 SYSCTL_INT(_kern, OID_AUTO, split_pagein_io_equal, CTLFLAG_RW | CTLFLAG_LOCKED, &split_all_pgin_equal, 0, "");
335 SYSCTL_INT(_kern, OID_AUTO, split_pagein_do_headio, CTLFLAG_RW | CTLFLAG_LOCKED, &split_pgin_headio, 0, "");
336 #endif
337
338 struct verify_buf {
339 TAILQ_ENTRY(verify_buf) vb_entry;
340 buf_t vb_cbp;
341 void* vb_callback_arg;
342 int32_t vb_whichq;
343 };
344
345 TAILQ_HEAD(, verify_buf) verify_free_head;
346 TAILQ_HEAD(, verify_buf) verify_work_head;
347
348 #define MAX_VERIFY_THREADS 4
349 #define MAX_REQUESTS_PER_THREAD 2
350
351 static struct verify_buf verify_bufs[MAX_VERIFY_THREADS * MAX_REQUESTS_PER_THREAD];
352 /*
353 * Each thread needs to check if the item at the head of the queue has a UPL
354 * pointer that any of the threads are currently operating on.
355 * Slot 0 is for the io completion thread to do the request inline if there are no free
356 * queue slots.
357 */
358 static int verify_in_flight = 0;
359
360 #if defined(XNU_TARGET_OS_IOS) || defined(XNU_TARGET_OS_XR)
361 #define NUM_DEFAULT_THREADS 2
362 #elif defined(XNU_TARGET_OS_OSX)
363 #define NUM_DEFAULT_THREADS 4
364 #else
365 #define NUM_DEFAULT_THREADS 0
366 #endif
367
368 static TUNABLE(uint32_t, num_verify_threads, "num_verify_threads", NUM_DEFAULT_THREADS);
369 static uint32_t cluster_verify_threads = 0; /* will be launched as needed up to num_verify_threads */
370
371 #if __AMP__
372 static TUNABLE(uint32_t, ecore_verify_threads, "ecore_verify_threads", false);
373 #endif /* __AMP__ */
374
375 static void
376 cluster_verify_init(void)
377 {
378 TAILQ_INIT(&verify_free_head);
379 TAILQ_INIT(&verify_work_head);
380
381 if (num_verify_threads > MAX_VERIFY_THREADS) {
382 num_verify_threads = MAX_VERIFY_THREADS;
383 }
384
385 for (int i = 0; i < num_verify_threads * MAX_REQUESTS_PER_THREAD; i++) {
386 TAILQ_INSERT_TAIL(&verify_free_head, &verify_bufs[i], vb_entry);
387 }
388 }
389
390 void
391 cluster_init(void)
392 {
393 for (int i = 0; i < CL_DIRECT_READ_LOCK_BUCKETS; ++i) {
394 LIST_INIT(&cl_direct_read_locks[i]);
395 }
396
397 cluster_verify_init();
398 }
399
400 uint32_t
401 cluster_max_io_size(mount_t mp, int type)
402 {
403 uint32_t max_io_size;
404 uint32_t segcnt;
405 uint32_t maxcnt;
406
407 switch (type) {
408 case CL_READ:
409 segcnt = mp->mnt_segreadcnt;
410 maxcnt = mp->mnt_maxreadcnt;
411 break;
412 case CL_WRITE:
413 segcnt = mp->mnt_segwritecnt;
414 maxcnt = mp->mnt_maxwritecnt;
415 break;
416 default:
417 segcnt = min(mp->mnt_segreadcnt, mp->mnt_segwritecnt);
418 maxcnt = min(mp->mnt_maxreadcnt, mp->mnt_maxwritecnt);
419 break;
420 }
421 if (segcnt > (MAX_UPL_SIZE_BYTES >> PAGE_SHIFT)) {
422 /*
423 * don't allow a size beyond the max UPL size we can create
424 */
425 segcnt = MAX_UPL_SIZE_BYTES >> PAGE_SHIFT;
426 }
427 max_io_size = min((segcnt * PAGE_SIZE), maxcnt);
428
429 if (max_io_size < MAX_UPL_TRANSFER_BYTES) {
430 /*
431 * don't allow a size smaller than the old fixed limit
432 */
433 max_io_size = MAX_UPL_TRANSFER_BYTES;
434 } else {
435 /*
436 * make sure the size specified is a multiple of PAGE_SIZE
437 */
438 max_io_size &= ~PAGE_MASK;
439 }
440 return max_io_size;
441 }
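/*
 * Worked example for cluster_max_io_size() above (hypothetical mount
 * values, 4KB pages): with mnt_segreadcnt = 256 and mnt_maxreadcnt = 2MB,
 * a CL_READ caller gets min(256 * PAGE_SIZE, 2MB) = 1MB; the result is
 * then raised to at least MAX_UPL_TRANSFER_BYTES if it came out smaller,
 * or truncated to a multiple of PAGE_SIZE otherwise.
 */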
442
443 /*
444 * Returns the max prefetch value. If the value overflows or exceeds the specified
445 * 'prefetch_limit', it is capped at 'prefetch_limit'.
446 */
447 static inline uint32_t
448 cluster_max_prefetch(vnode_t vp, uint32_t max_io_size, uint32_t prefetch_limit)
449 {
450 bool is_ssd = disk_conditioner_mount_is_ssd(vp->v_mount);
451 uint32_t io_scale = IO_SCALE(vp, is_ssd ? PREFETCH_SSD : PREFETCH);
452 uint32_t prefetch = 0;
453
454 if (__improbable(os_mul_overflow(max_io_size, io_scale, &prefetch) ||
455 (prefetch > prefetch_limit))) {
456 prefetch = prefetch_limit;
457 }
458
459 return prefetch;
460 }
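/*
 * Worked example for cluster_max_prefetch() above (hypothetical values):
 * with max_io_size = 1MB and an io_scale of 3, prefetch = 3MB; if the
 * multiplication had overflowed, or if the product exceeded the caller's
 * 'prefetch_limit', the value would be capped at 'prefetch_limit' instead.
 */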
461
462 static inline uint32_t
463 calculate_max_throttle_size(vnode_t vp)
464 {
465 bool is_ssd = disk_conditioner_mount_is_ssd(vp->v_mount);
466 uint32_t io_scale = IO_SCALE(vp, is_ssd ? 2 : 1);
467
468 return MIN(io_scale * THROTTLE_MAX_IOSIZE, MAX_UPL_TRANSFER_BYTES);
469 }
470
471 static inline uint32_t
472 calculate_max_throttle_cnt(vnode_t vp)
473 {
474 bool is_ssd = disk_conditioner_mount_is_ssd(vp->v_mount);
475 uint32_t io_scale = IO_SCALE(vp, 1);
476
477 return is_ssd ? MIN(io_scale, 4) : THROTTLE_MAXCNT;
478 }
479
480 #define CLW_ALLOCATE 0x01
481 #define CLW_RETURNLOCKED 0x02
482 #define CLW_IONOCACHE 0x04
483 #define CLW_IOPASSIVE 0x08
484
485 /*
486 * if the read ahead context doesn't yet exist,
487 * allocate and initialize it...
488 * the vnode lock serializes multiple callers
489 * during the actual assignment... first one
490 * to grab the lock wins... the other callers
491 * will release the now unnecessary storage
492 *
493 * once the context is present, try to grab (but don't block on)
494 * the lock associated with it... if someone
495 * else currently owns it, then the read
496 * will run without read-ahead. this allows
497 * multiple readers to run in parallel and
498 * since there's only 1 read ahead context,
499 * there's no real loss in only allowing 1
500 * reader to have read-ahead enabled.
501 */
502 static struct cl_readahead *
503 cluster_get_rap(vnode_t vp)
504 {
505 struct ubc_info *ubc;
506 struct cl_readahead *rap;
507
508 ubc = vp->v_ubcinfo;
509
510 if ((rap = ubc->cl_rahead) == NULL) {
511 rap = zalloc_flags(cl_rd_zone, Z_WAITOK | Z_ZERO);
512 rap->cl_lastr = -1;
513 lck_mtx_init(&rap->cl_lockr, &cl_mtx_grp, LCK_ATTR_NULL);
514
515 vnode_lock(vp);
516
517 if (ubc->cl_rahead == NULL) {
518 ubc->cl_rahead = rap;
519 } else {
520 lck_mtx_destroy(&rap->cl_lockr, &cl_mtx_grp);
521 zfree(cl_rd_zone, rap);
522 rap = ubc->cl_rahead;
523 }
524 vnode_unlock(vp);
525 }
526 if (lck_mtx_try_lock(&rap->cl_lockr) == TRUE) {
527 return rap;
528 }
529
530 return (struct cl_readahead *)NULL;
531 }
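/*
 * Sketch of the expected calling pattern (an illustration of the try-lock
 * semantics above, not a verbatim copy of any caller): a reader that gets
 * a non-NULL context back owns cl_lockr and must drop it when it is done
 * with the read-ahead state.
 *
 *	struct cl_readahead *rap = cluster_get_rap(vp);
 *
 *	if (rap != NULL) {
 *		// consult/update rap->cl_lastr, issue read-ahead, etc.
 *		lck_mtx_unlock(&rap->cl_lockr);
 *	} else {
 *		// someone else owns the context; read without read-ahead
 *	}
 */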
532
533
534 /*
535 * if the write behind context doesn't yet exist,
536 * and CLW_ALLOCATE is specified, allocate and initialize it...
537 * the vnode lock serializes multiple callers
538 * during the actual assignment... first one
539 * to grab the lock wins... the other callers
540 * will release the now unnecessary storage
541 *
542 * if CLW_RETURNLOCKED is set, grab (blocking if necessary)
543 * the lock associated with the write behind context before
544 * returning
545 */
546
547 static struct cl_writebehind *
548 cluster_get_wbp(vnode_t vp, int flags)
549 {
550 struct ubc_info *ubc;
551 struct cl_writebehind *wbp;
552
553 ubc = vp->v_ubcinfo;
554
555 if ((wbp = ubc->cl_wbehind) == NULL) {
556 if (!(flags & CLW_ALLOCATE)) {
557 return (struct cl_writebehind *)NULL;
558 }
559
560 wbp = zalloc_flags(cl_wr_zone, Z_WAITOK | Z_ZERO);
561
562 lck_mtx_init(&wbp->cl_lockw, &cl_mtx_grp, LCK_ATTR_NULL);
563
564 vnode_lock(vp);
565
566 if (ubc->cl_wbehind == NULL) {
567 ubc->cl_wbehind = wbp;
568 } else {
569 lck_mtx_destroy(&wbp->cl_lockw, &cl_mtx_grp);
570 zfree(cl_wr_zone, wbp);
571 wbp = ubc->cl_wbehind;
572 }
573 vnode_unlock(vp);
574 }
575 if (flags & CLW_RETURNLOCKED) {
576 lck_mtx_lock(&wbp->cl_lockw);
577 }
578
579 return wbp;
580 }
581
582
583 static void
584 cluster_syncup(vnode_t vp, off_t newEOF, int (*callback)(buf_t, void *), void *callback_arg, int flags)
585 {
586 struct cl_writebehind *wbp;
587
588 if ((wbp = cluster_get_wbp(vp, 0)) != NULL) {
589 if (wbp->cl_number) {
590 lck_mtx_lock(&wbp->cl_lockw);
591
592 cluster_try_push(wbp, vp, newEOF, PUSH_ALL | flags, 0, callback, callback_arg, NULL, FALSE);
593
594 lck_mtx_unlock(&wbp->cl_lockw);
595 }
596 }
597 }
598
599
600 static int
601 cluster_io_present_in_BC(vnode_t vp, off_t f_offset)
602 {
603 daddr64_t blkno;
604 size_t io_size;
605 int (*bootcache_check_fn)(dev_t device, u_int64_t blkno) = bootcache_contains_block;
606
607 if (bootcache_check_fn && vp->v_mount && vp->v_mount->mnt_devvp) {
608 if (VNOP_BLOCKMAP(vp, f_offset, PAGE_SIZE, &blkno, &io_size, NULL, VNODE_READ | VNODE_BLOCKMAP_NO_TRACK, NULL)) {
609 return 0;
610 }
611
612 if (io_size == 0) {
613 return 0;
614 }
615
616 if (bootcache_check_fn(vp->v_mount->mnt_devvp->v_rdev, blkno)) {
617 return 1;
618 }
619 }
620 return 0;
621 }
622
623
624 static int
625 cluster_is_throttled(vnode_t vp)
626 {
627 return throttle_io_will_be_throttled(-1, vp->v_mount);
628 }
629
630
631 static void
632 cluster_iostate_wait(struct clios *iostate, u_int target, const char *wait_name)
633 {
634 lck_mtx_lock(&iostate->io_mtxp);
635
636 while ((iostate->io_issued - iostate->io_completed) > target) {
637 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_START,
638 iostate->io_issued, iostate->io_completed, target, 0, 0);
639
640 iostate->io_wanted = 1;
641 msleep((caddr_t)&iostate->io_wanted, &iostate->io_mtxp, PRIBIO + 1, wait_name, NULL);
642
643 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_END,
644 iostate->io_issued, iostate->io_completed, target, 0, 0);
645 }
646 lck_mtx_unlock(&iostate->io_mtxp);
647 }
648
649
650 static void
651 cluster_handle_associated_upl(struct clios *iostate, upl_t upl,
652 upl_offset_t upl_offset, upl_size_t size, off_t f_offset)
653 {
654 if (!size) {
655 return;
656 }
657
658 upl_t associated_upl = upl_associated_upl(upl);
659
660 if (!associated_upl) {
661 return;
662 }
663
664 /*
665 * The associated upl functions as a "range lock" for the file.
666 *
667 * The associated upl is created and attached to the upl in
668 * cluster_io when the direct io write is being started. Since the
669 * upl may be released in parts, the corresponding associated upl
670 * has to be released in parts as well.
671 *
672 * We have the f_offset, upl_offset and size, and from that we figure
673 * out the associated upl offset and length we are interested in.
674 */
675 upl_offset_t assoc_upl_offset, assoc_upl_end;
676
677 /* ALIGNED UPL's */
678 if ((upl_offset & PAGE_MASK) == (f_offset & PAGE_MASK)) {
679 assoc_upl_offset = trunc_page_32(upl_offset);
680 assoc_upl_end = round_page_32(upl_offset + size);
681 goto do_commit;
682 }
683
684 /*
685 * HANDLE UNALIGNED UPLS
686 *
687 * ( See also cluster_io where the associated upl is created )
688 * While we create the upl in one go, we will be dumping the pages in
689 * the upl in "transaction sized chunks" relative to the upl. Except
690 * for the first transaction, the upl_offset will always be page aligned,
691 * and when the upl's are not aligned the associated upl offset will not
692 * be page aligned, so we have to truncate and round up the start and
693 * the end of the pages in question and see if they are shared with
694 * other transactions or not. If two transactions "share" a page in the
695 * associated upl, the first one to complete "marks" it and skips that
696 * page, and the second one will include it in the "commit range".
697 *
698 * As an example, consider the case where 4 transactions are needed (this
699 * is the worst case).
700 *
701 * Transaction for 0-1 (size -> PAGE_SIZE - upl_offset)
702 *
703 * This covers the associated upl from a -> c. a->b is not shared but
704 * b-c is shared with the next transaction so the first one to complete
705 * will only "mark" it.
706 *
707 * Transaction for 1-2 (size -> PAGE_SIZE)
708 *
709 * For transaction 1, assoc_upl_offset would be 0 (corresponding to the
710 * file offset a or b depending on what file offset the upl_offset
711 * corresponds to) and assoc_upl_end would correspond to the file
712 * offset c.
713 *
714 * (associated_upl - based on f_offset alignment)
715 * 0 a b c d e f
716 * <----|----|----|----|----|----|-----|---->
717 *
718 *
719 * (upl - based on user buffer address alignment)
720 * <__--|----|----|--__>
721 *
722 * 0 1 2 3
723 *
724 */
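/*
 * Numeric sketch of the computation below (hypothetical values, 4KB
 * pages): for a middle transaction with f_offset = 0x2800, size = 0x1000
 * and an associated upl starting at file offset 0x1000,
 *
 *	assoc_upl_offset = trunc_page(0x2800 - 0x1000)          = 0x1000
 *	assoc_upl_end    = round_page(0x2800 + 0x1000) - 0x1000 = 0x3000
 *
 * and the pages at either end of that range may be shared with the
 * neighbouring transactions, which is resolved via the mark bits further
 * down.
 */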
725 upl_size_t assoc_upl_size = upl_get_size(associated_upl);
726 #if 0
727 /* knock off the simple case first -> this transaction covers the entire UPL */
728 upl_offset_t upl_end = round_page_32(upl_offset + size);
729 upl_size_t upl_size = vector_upl_get_size(upl);
730
731 if ((trunc_page_32(upl_offset) == 0) && (upl_end == upl_size)) {
732 assoc_upl_offset = 0;
733 assoc_upl_end = assoc_upl_size;
734 goto do_commit;
735 }
736 #endif
737 off_t assoc_upl_start_f_offset = upl_adjusted_offset(associated_upl, PAGE_MASK);
738
739 assoc_upl_offset = (upl_offset_t)trunc_page_64(f_offset - assoc_upl_start_f_offset);
740 assoc_upl_end = round_page_64(f_offset + size) - assoc_upl_start_f_offset;
741
742 /*
743 * We can only sanity check the offset returned by upl_adjusted_offset
744 * for the first transaction for this UPL i.e. when (upl_offset < PAGE_SIZE)
745 */
746 assertf((upl_offset >= PAGE_SIZE) || ((assoc_upl_start_f_offset == trunc_page_64(f_offset)) && (assoc_upl_offset == 0)),
747 "upl_offset = %d, f_offset = %lld, size = %d, start_f_offset = %lld, assoc_upl_offset = %d",
748 upl_offset, f_offset, size, assoc_upl_start_f_offset, assoc_upl_offset);
749
750 assertf((upl_offset == assoc_upl_offset) || (upl_offset > assoc_upl_offset && ((upl_offset - assoc_upl_offset) <= PAGE_SIZE)) ||
751 (assoc_upl_offset > upl_offset && ((assoc_upl_offset - upl_offset) <= PAGE_SIZE)),
752 "abs(upl_offset - assoc_upl_offset) > PAGE_SIZE : "
753 "upl_offset = %d, f_offset = %lld, size = %d, start_f_offset = %lld, assoc_upl_offset = %d",
754 upl_offset, f_offset, size, assoc_upl_start_f_offset, assoc_upl_offset);
755
756 assertf(assoc_upl_end <= assoc_upl_size,
757 "upl_offset = %d, f_offset = %lld, size = %d, start_f_offset = %lld, assoc_upl_size = %d, assoc_upl_offset = %d, assoc_upl_end = %d",
758 upl_offset, f_offset, size, assoc_upl_start_f_offset, assoc_upl_size, assoc_upl_offset, assoc_upl_end);
759
760 assertf((assoc_upl_size > PAGE_SIZE) || (assoc_upl_offset == 0 && assoc_upl_end == PAGE_SIZE),
761 "upl_offset = %d, f_offset = %lld, size = %d, start_f_offset = %lld, assoc_upl_size = %d, assoc_upl_offset = %d, assoc_upl_end = %d",
762 upl_offset, f_offset, size, assoc_upl_start_f_offset, assoc_upl_size, assoc_upl_offset, assoc_upl_end);
763
764 if (assoc_upl_size == PAGE_SIZE) {
765 assoc_upl_offset = 0;
766 assoc_upl_end = PAGE_SIZE;
767 goto do_commit;
768 }
769
770 /*
771 * We have to check if the first and last pages of the associated UPL
772 * range could potentially be shared with other transactions and if the
773 * "sharing transactions" are both done. The first one sets the mark bit
774 * and the second one checks it and, if set, includes that page in the
775 * pages to be "freed".
776 */
777 bool check_first_pg = (assoc_upl_offset != 0) || ((f_offset + size) < (assoc_upl_start_f_offset + PAGE_SIZE));
778 bool check_last_pg = (assoc_upl_end != assoc_upl_size) || (f_offset > ((assoc_upl_start_f_offset + assoc_upl_size) - PAGE_SIZE));
779
780 if (check_first_pg || check_last_pg) {
781 int first_pg = assoc_upl_offset >> PAGE_SHIFT;
782 int last_pg = trunc_page_32(assoc_upl_end - 1) >> PAGE_SHIFT;
783 upl_page_info_t *assoc_pl = UPL_GET_INTERNAL_PAGE_LIST(associated_upl);
784
785 lck_mtx_lock_spin(&iostate->io_mtxp);
786 if (check_first_pg && !upl_page_get_mark(assoc_pl, first_pg)) {
787 /*
788 * The first page isn't marked so mark the page and let another
789 * transaction completion handle it.
790 */
791 upl_page_set_mark(assoc_pl, first_pg, true);
792 assoc_upl_offset += PAGE_SIZE;
793 }
794 if (check_last_pg && !upl_page_get_mark(assoc_pl, last_pg)) {
795 /*
796 * The last page isn't marked so mark the page and let another
797 * transaction completion handle it.
798 */
799 upl_page_set_mark(assoc_pl, last_pg, true);
800 assoc_upl_end -= PAGE_SIZE;
801 }
802 lck_mtx_unlock(&iostate->io_mtxp);
803 }
804
805 if (assoc_upl_end <= assoc_upl_offset) {
806 return;
807 }
808
809 do_commit:
810 size = assoc_upl_end - assoc_upl_offset;
811
812 boolean_t empty;
813
814 /*
815 * We can unlock these pages now and as this is for a
816 * direct/uncached write, we want to dump the pages too.
817 */
818 kern_return_t kr = upl_abort_range(associated_upl, assoc_upl_offset, size,
819 UPL_ABORT_DUMP_PAGES, &empty);
820
821 assert(!kr);
822
823 if (!kr && empty) {
824 upl_set_associated_upl(upl, NULL);
825 upl_deallocate(associated_upl);
826 }
827 }
828
829 static void
830 cluster_iodone_verify_continue(void)
831 {
832 lck_mtx_lock_spin(&cl_transaction_mtxp);
833 for (;;) {
834 struct verify_buf *vb = TAILQ_FIRST(&verify_work_head);
835
836 if (!vb) {
837 assert_wait(&verify_work_head, (THREAD_UNINT));
838 break;
839 }
840 buf_t cbp = vb->vb_cbp;
841 void* callback_arg = vb->vb_callback_arg;
842
843 TAILQ_REMOVE(&verify_work_head, vb, vb_entry);
844 vb->vb_cbp = NULL;
845 vb->vb_callback_arg = NULL;
846 vb->vb_whichq = 0;
847 TAILQ_INSERT_TAIL(&verify_free_head, vb, vb_entry);
848 lck_mtx_unlock(&cl_transaction_mtxp);
849
850 (void)cluster_iodone_finish(cbp, callback_arg);
851 cbp = NULL;
852 lck_mtx_lock_spin(&cl_transaction_mtxp);
853 }
854 lck_mtx_unlock(&cl_transaction_mtxp);
855 thread_block((thread_continue_t)cluster_iodone_verify_continue);
856 /* NOT REACHED */
857 }
858
859 static void
860 cluster_verify_thread(void)
861 {
862 thread_t self = current_thread();
863
864 thread_set_thread_name(self, "cluster_verify_thread");
865 #if __AMP__
866 if (ecore_verify_threads) {
867 kern_return_t kr = thread_soft_bind_cluster_type(self, 'E');
868 if (kr != KERN_SUCCESS) {
869 printf("%s: WARN: failed to bind thread to cluster type; does the hardware topology match expectations?\n", __FUNCTION__);
870 }
871 }
872 #endif /* __AMP__ */
873 #if !defined(__x86_64__)
874 thread_group_join_io_storage();
875 #endif /* !defined(__x86_64__) */
876 cluster_iodone_verify_continue();
877 /* NOT REACHED */
878 }
879
880 static bool
881 enqueue_buf_for_verify(buf_t cbp, void *callback_arg)
882 {
883 struct verify_buf *vb;
884
885 vb = TAILQ_FIRST(&verify_free_head);
886 if (vb) {
887 TAILQ_REMOVE(&verify_free_head, vb, vb_entry);
888 vb->vb_cbp = cbp;
889 vb->vb_callback_arg = callback_arg;
890 vb->vb_whichq = 1;
891 TAILQ_INSERT_TAIL(&verify_work_head, vb, vb_entry);
892 return true;
893 } else {
894 return false;
895 }
896 }
897
898 static int
899 cluster_handle_verification(buf_t cbp_head, vnode_t vp, upl_t upl, int upl_offset, int transaction_size, int error)
900 {
901 off_t start_off = cbp_head->b_clfoffset;
902 void *verify_ctx = cbp_head->b_attr.ba_un.verify_ctx;
903 caddr_t verify_buf = NULL;
904 uint32_t verify_length = transaction_size;
905 vnode_verify_flags_t verify_flags = VNODE_VERIFY_CONTEXT_FREE;
906 int verify_error = EAGAIN;
907
908 assert(cbp_head->b_attr.ba_flags & BA_WILL_VERIFY);
909
910 cbp_head->b_attr.ba_un.verify_ctx = NULL;
911 if (error) {
912 goto free_context;
913 }
914
915 /*
916 * If we don't have a precomputed hash, we make a single call to both
917 * verify and free the context. If we have a precomputed hash, then we
918 * make two separate calls - one to verify the hash and the second one to
919 * free. If the filesystem returns EAGAIN we fall back to the non
920 * precomputed hash case.
921 */
922 if (cbp_head->b_attr.ba_verify_type && cbp_head->b_attr.ba_flags & BA_VERIFY_VALID) {
923 verify_buf = (caddr_t)buf_verifyptr_with_size(cbp_head, transaction_size, &verify_length);
924 verify_flags = VNODE_VERIFY_WITH_CONTEXT | VNODE_VERIFY_PRECOMPUTED;
925
926 if (verify_buf && verify_length) {
927 verify_error = VNOP_VERIFY(vp, start_off, (uint8_t *)verify_buf, verify_length,
928 NULL, &verify_ctx, verify_flags, NULL, NULL);
929 } else {
930 verify_error = EAGAIN;
931 }
932
933 verify_buf = NULL;
934 verify_length = transaction_size;
935 verify_flags = VNODE_VERIFY_CONTEXT_FREE;
936 }
937
938 if (verify_error != EAGAIN) {
939 error = verify_error;
940 } else {
941 vm_offset_t vaddr;
942
943 /*
944 * Map it in.
945 *
946 * ubc_upl_map_range unfortunately cannot handle concurrent map
947 * requests for the same UPL and returns failures when it can't
948 * map. The map exclusive mechanism enforces mutual exclusion
949 * for concurrent requests.
950 */
951 verify_error = 0;
952 os_atomic_inc(&verify_in_flight, relaxed);
953 upl_set_map_exclusive(upl);
954 error = ubc_upl_map_range(upl, upl_offset, round_page(transaction_size), VM_PROT_DEFAULT, &vaddr);
955 if (error) {
956 upl_clear_map_exclusive(upl);
957 printf("ubc_upl_map_range returned error %d upl = %p, upl_offset = %d, size = %d",
958 error, upl, (int)upl_offset, (int)round_page(transaction_size));
959 error = EIO;
960 if (os_atomic_dec_orig(&verify_in_flight, relaxed) == 0) {
961 panic("verify_in_flight underflow");
962 }
963 } else {
964 verify_buf = (caddr_t)vaddr;
965 verify_flags |= VNODE_VERIFY_WITH_CONTEXT;
966 }
967 }
968
969 free_context:
970 verify_error = VNOP_VERIFY(vp, start_off, (uint8_t *)verify_buf, verify_length,
971 NULL, &verify_ctx, verify_flags, NULL, NULL);
972 if (!error) {
973 error = verify_error;
974 }
975
976 if (verify_buf) {
977 (void)ubc_upl_unmap_range(upl, upl_offset, round_page(transaction_size));
978 upl_clear_map_exclusive(upl);
979 verify_buf = NULL;
980 if (os_atomic_dec_orig(&verify_in_flight, relaxed) == 0) {
981 panic("verify_in_flight underflow");
982 }
983 }
984
985 return error;
986 }
987
988 static int
989 cluster_ioerror(upl_t upl, int upl_offset, int abort_size, int error, int io_flags, vnode_t vp)
990 {
991 int upl_abort_code = 0;
992 int page_in = 0;
993 int page_out = 0;
994
995 if ((io_flags & (B_PHYS | B_CACHE)) == (B_PHYS | B_CACHE)) {
996 /*
997 * direct write of any flavor, or a direct read that wasn't aligned
998 */
999 ubc_upl_commit_range(upl, upl_offset, abort_size, UPL_COMMIT_FREE_ON_EMPTY);
1000 } else {
1001 if (io_flags & B_PAGEIO) {
1002 if (io_flags & B_READ) {
1003 page_in = 1;
1004 } else {
1005 page_out = 1;
1006 }
1007 }
1008 if (io_flags & B_CACHE) {
1009 /*
1010 * leave pages in the cache unchanged on error
1011 */
1012 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
1013 } else if (((io_flags & B_READ) == 0) && ((error != ENXIO) || vnode_isswap(vp))) {
1014 /*
1015 * transient error on pageout/write path... leave pages unchanged
1016 */
1017 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
1018 } else if (page_in) {
1019 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
1020 } else {
1021 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY /* | UPL_ABORT_DUMP_PAGES */;
1022 }
1023
1024 ubc_upl_abort_range(upl, upl_offset, abort_size, upl_abort_code);
1025 }
1026 return upl_abort_code;
1027 }
1028
1029
1030 static int
1031 cluster_iodone(buf_t bp, void *callback_arg)
1032 {
1033 buf_t cbp;
1034 buf_t cbp_head;
1035 int error = 0;
1036 boolean_t transaction_complete = FALSE;
1037 bool async;
1038
1039 __IGNORE_WCASTALIGN(cbp_head = (buf_t)(bp->b_trans_head));
1040
1041 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_START,
1042 cbp_head, bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);
1043
1044 async = cluster_verify_threads &&
1045 (os_atomic_load(&cbp_head->b_attr.ba_flags, acquire) & BA_ASYNC_VERIFY);
1046
1047 assert(!async || cbp_head->b_attr.ba_un.verify_ctx);
1048
1049 if (cbp_head->b_trans_next || !(cbp_head->b_flags & B_EOT)) {
1050 lck_mtx_lock_spin(&cl_transaction_mtxp);
1051
1052 bp->b_flags |= B_TDONE;
1053
1054 for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) {
1055 /*
1056 * all I/O requests that are part of this transaction
1057 * have to complete before we can process it
1058 */
1059 if (!(cbp->b_flags & B_TDONE)) {
1060 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
1061 cbp_head, cbp, cbp->b_bcount, cbp->b_flags, 0);
1062
1063 lck_mtx_unlock(&cl_transaction_mtxp);
1064
1065 return 0;
1066 }
1067
1068 if (cbp->b_trans_next == CLUSTER_IO_WAITING) {
1069 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
1070 cbp_head, cbp, cbp->b_bcount, cbp->b_flags, 0);
1071
1072 lck_mtx_unlock(&cl_transaction_mtxp);
1073 wakeup(cbp);
1074
1075 return 0;
1076 }
1077
1078 if (cbp->b_flags & B_EOT) {
1079 transaction_complete = TRUE;
1080
1081 if (async) {
1082 async = enqueue_buf_for_verify(cbp_head, callback_arg);
1083 }
1084 }
1085 }
1086 lck_mtx_unlock(&cl_transaction_mtxp);
1087
1088 if (transaction_complete == FALSE) {
1089 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
1090 cbp_head, 0, 0, 0, 0);
1091 return 0;
1092 }
1093 } else if (async) {
1094 lck_mtx_lock_spin(&cl_transaction_mtxp);
1095 async = enqueue_buf_for_verify(cbp_head, callback_arg);
1096 lck_mtx_unlock(&cl_transaction_mtxp);
1097 }
1098
1099 if (async) {
1100 wakeup(&verify_work_head);
1101 } else {
1102 error = cluster_iodone_finish(cbp_head, callback_arg);
1103 }
1104
1105 return error;
1106 }
1107
1108 static int
1109 cluster_iodone_finish(buf_t cbp_head, void *callback_arg)
1110 {
1111 int b_flags;
1112 int error;
1113 int total_size;
1114 int total_resid;
1115 int upl_offset;
1116 int zero_offset;
1117 int pg_offset = 0;
1118 int commit_size = 0;
1119 int upl_flags = 0;
1120 int transaction_size = 0;
1121 upl_t upl;
1122 buf_t cbp;
1123 buf_t cbp_next;
1124 buf_t real_bp;
1125 vnode_t vp;
1126 struct clios *iostate;
1127
1128 error = 0;
1129 total_size = 0;
1130 total_resid = 0;
1131
1132 cbp = cbp_head;
1133 vp = cbp->b_vp;
1134 upl_offset = cbp->b_uploffset;
1135 upl = cbp->b_upl;
1136 b_flags = cbp->b_flags;
1137 real_bp = cbp->b_real_bp;
1138 zero_offset = cbp->b_validend;
1139 iostate = (struct clios *)cbp->b_iostate;
1140
1141 if (real_bp) {
1142 real_bp->b_dev = cbp->b_dev;
1143 }
1144
1145 while (cbp) {
1146 if ((cbp->b_flags & B_ERROR) && error == 0) {
1147 error = cbp->b_error;
1148 }
1149
1150 total_resid += cbp->b_resid;
1151 total_size += cbp->b_bcount;
1152
1153 cbp_next = cbp->b_trans_next;
1154
1155 if (cbp_next == NULL) {
1156 /*
1157 * compute the overall size of the transaction
1158 * in case we created one that has 'holes' in it
1159 * 'total_size' represents the amount of I/O we
1160 * did, not the span of the transaction w/r to the UPL
1161 */
1162 transaction_size = cbp->b_uploffset + cbp->b_bcount - upl_offset;
1163 }
1164
1165 cbp = cbp_next;
1166 }
1167
1168 if (ISSET(b_flags, B_COMMIT_UPL)) {
1169 cluster_handle_associated_upl(iostate,
1170 cbp_head->b_upl,
1171 upl_offset,
1172 transaction_size,
1173 cbp_head->b_clfoffset);
1174 }
1175
1176 if (error == 0 && total_resid) {
1177 error = EIO;
1178 }
1179
1180 if (error == 0) {
1181 int (*cliodone_func)(buf_t, void *) = (int (*)(buf_t, void *))(cbp_head->b_cliodone);
1182
1183 if (cliodone_func != NULL) {
1184 cbp_head->b_bcount = transaction_size;
1185
1186 error = (*cliodone_func)(cbp_head, callback_arg);
1187 }
1188 }
1189 if (zero_offset) {
1190 cluster_zero(upl, zero_offset, PAGE_SIZE - (zero_offset & PAGE_MASK), real_bp);
1191 }
1192
1193 if (cbp_head->b_attr.ba_un.verify_ctx) {
1194 error = cluster_handle_verification(cbp_head, vp, upl, upl_offset, transaction_size, error);
1195 } else if (cbp_head->b_attr.ba_flags & BA_WILL_VERIFY) {
1196 error = EBADMSG;
1197 }
1198
1199 if (iostate) {
1200 int need_wakeup = 0;
1201
1202 /*
1203 * someone has issued multiple I/Os asynchronously
1204 * and is waiting for them to complete (streaming)
1205 */
1206 lck_mtx_lock_spin(&iostate->io_mtxp);
1207
1208 if (error && iostate->io_error == 0) {
1209 iostate->io_error = error;
1210 }
1211
1212 iostate->io_completed += total_size;
1213
1214 if (iostate->io_wanted) {
1215 /*
1216 * someone is waiting for the state of
1217 * this io stream to change
1218 */
1219 iostate->io_wanted = 0;
1220 need_wakeup = 1;
1221 }
1222 lck_mtx_unlock(&iostate->io_mtxp);
1223
1224 if (need_wakeup) {
1225 wakeup((caddr_t)&iostate->io_wanted);
1226 }
1227 }
1228
1229 if (b_flags & B_COMMIT_UPL) {
1230 pg_offset = upl_offset & PAGE_MASK;
1231 commit_size = (pg_offset + transaction_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
1232
1233 if (error) {
1234 upl_set_iodone_error(upl, error);
1235
1236 upl_flags = cluster_ioerror(upl, upl_offset - pg_offset, commit_size, error, b_flags, vp);
1237 } else {
1238 upl_flags = UPL_COMMIT_FREE_ON_EMPTY;
1239
1240 if ((b_flags & B_PHYS) && (b_flags & B_READ)) {
1241 upl_flags |= UPL_COMMIT_SET_DIRTY;
1242 }
1243
1244 if (b_flags & B_AGE) {
1245 upl_flags |= UPL_COMMIT_INACTIVATE;
1246 }
1247
1248 ubc_upl_commit_range(upl, upl_offset - pg_offset, commit_size, upl_flags);
1249 }
1250 }
1251
1252 cbp = cbp_head->b_trans_next;
1253 while (cbp) {
1254 cbp_next = cbp->b_trans_next;
1255
1256 if (cbp != cbp_head) {
1257 free_io_buf(cbp);
1258 }
1259
1260 cbp = cbp_next;
1261 }
1262 free_io_buf(cbp_head);
1263
1264 if (real_bp) {
1265 if (error) {
1266 real_bp->b_flags |= B_ERROR;
1267 real_bp->b_error = error;
1268 }
1269 real_bp->b_resid = total_resid;
1270
1271 buf_biodone(real_bp);
1272 }
1273 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
1274 upl, upl_offset - pg_offset, commit_size, (error << 24) | upl_flags, 0);
1275
1276 return error;
1277 }
1278
1279
1280 uint32_t
1281 cluster_throttle_io_limit(vnode_t vp, uint32_t *limit)
1282 {
1283 if (cluster_is_throttled(vp)) {
1284 *limit = calculate_max_throttle_size(vp);
1285 return 1;
1286 }
1287 return 0;
1288 }
1289
1290
1291 void
1292 cluster_zero(upl_t upl, upl_offset_t upl_offset, int size, buf_t bp)
1293 {
1294 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_START,
1295 upl_offset, size, bp, 0, 0);
1296
1297 if (bp == NULL || bp->b_datap == 0) {
1298 upl_page_info_t *pl;
1299 addr64_t zero_addr;
1300
1301 pl = ubc_upl_pageinfo(upl);
1302
1303 if (upl_device_page(pl) == TRUE) {
1304 zero_addr = ((addr64_t)upl_phys_page(pl, 0) << PAGE_SHIFT) + upl_offset;
1305
1306 bzero_phys_nc(zero_addr, size);
1307 } else {
1308 while (size) {
1309 int page_offset;
1310 int page_index;
1311 int zero_cnt;
1312
1313 page_index = upl_offset / PAGE_SIZE;
1314 page_offset = upl_offset & PAGE_MASK;
1315
1316 zero_addr = ((addr64_t)upl_phys_page(pl, page_index) << PAGE_SHIFT) + page_offset;
1317 zero_cnt = min(PAGE_SIZE - page_offset, size);
1318
1319 bzero_phys(zero_addr, zero_cnt);
1320
1321 size -= zero_cnt;
1322 upl_offset += zero_cnt;
1323 }
1324 }
1325 } else {
1326 bzero((caddr_t)((vm_offset_t)bp->b_datap + upl_offset), size);
1327 }
1328
1329 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_END,
1330 upl_offset, size, 0, 0, 0);
1331 }
1332
1333
1334 static void
1335 cluster_EOT(buf_t cbp_head, buf_t cbp_tail, int zero_offset, size_t verify_block_size)
1336 {
1337 /*
1338 * We will assign a verification context to cbp_head.
1339 * This will be passed back to the filesystem when
1340 * verifying (in cluster_iodone).
1341 */
1342 if (verify_block_size) {
1343 off_t start_off = cbp_head->b_clfoffset;
1344 size_t length;
1345 void *verify_ctx = NULL;
1346 int error = 0;
1347 vnode_t vp = buf_vnode(cbp_head);
1348
1349 if (cbp_head == cbp_tail) {
1350 length = cbp_head->b_bcount;
1351 } else {
1352 length = (cbp_tail->b_clfoffset + cbp_tail->b_bcount) - start_off;
1353 }
1354
1355 /*
1356 * zero_offset is non-zero for the transaction containing the EOF
1357 * (if the file size is not page aligned). In that case the
1358 * transaction size might not be page/verify-block-size aligned
1359 */
1360 if ((zero_offset == 0) &&
1361 ((length < verify_block_size) || (length % verify_block_size)) != 0) {
1362 panic("%s length = %zu, verify_block_size = %zu",
1363 __FUNCTION__, length, verify_block_size);
1364 }
1365
1366 error = VNOP_VERIFY(vp, start_off, NULL, length,
1367 &verify_block_size, &verify_ctx, VNODE_VERIFY_CONTEXT_ALLOC, NULL, NULL);
1368
1369 assert(!(error && verify_ctx));
1370
1371 if (verify_ctx) {
1372 if (num_verify_threads && (os_atomic_load(&cluster_verify_threads, relaxed) == 0)) {
1373 if (os_atomic_inc_orig(&cluster_verify_threads, relaxed) == 0) {
1374 thread_t thread;
1375 int i;
1376
1377 for (i = 0; i < num_verify_threads && i < MAX_VERIFY_THREADS; i++) {
1378 kernel_thread_start((thread_continue_t)cluster_verify_thread, NULL, &thread);
1379 thread_deallocate(thread);
1380 }
1381 os_atomic_store(&cluster_verify_threads, i, relaxed);
1382 } else {
1383 os_atomic_dec(&cluster_verify_threads, relaxed);
1384 }
1385 }
1386 cbp_head->b_attr.ba_un.verify_ctx = verify_ctx;
1387 /*
1388 * At least one thread is busy (at the time we
1389 * checked), so we can let it get queued for
1390 * async processing. It's fine if we occasionally get
1391 * this wrong.
1392 */
1393 if (os_atomic_load(&verify_in_flight, relaxed)) {
1394 /* This flag and the setting of ba_un.verify_ctx need to be ordered */
1395 os_atomic_or(&cbp_head->b_attr.ba_flags, BA_ASYNC_VERIFY, release);
1396 }
1397 }
1398 } else {
1399 cbp_head->b_attr.ba_un.verify_ctx = NULL;
1400 }
1401
1402 cbp_head->b_validend = zero_offset;
1403 cbp_tail->b_flags |= B_EOT;
1404 }
1405
1406 static void
1407 cluster_wait_IO(buf_t cbp_head, int async)
1408 {
1409 buf_t cbp;
1410
1411 if (async) {
1412 /*
1413 * Async callback completion will not normally generate a
1414 * wakeup upon I/O completion. To get woken up, we set
1415 * b_trans_next (which is safe for us to modify) on the last
1416 * buffer to CLUSTER_IO_WAITING so that cluster_iodone knows
1417 * to wake us up when all buffers as part of this transaction
1418 * are completed. This is done under the umbrella of
1419 * cl_transaction_mtxp which is also taken in cluster_iodone.
1420 */
1421 bool done = true;
1422 buf_t last = NULL;
1423
1424 lck_mtx_lock_spin(&cl_transaction_mtxp);
1425
1426 for (cbp = cbp_head; cbp; last = cbp, cbp = cbp->b_trans_next) {
1427 if (!ISSET(cbp->b_flags, B_TDONE)) {
1428 done = false;
1429 }
1430 }
1431
1432 if (!done) {
1433 last->b_trans_next = CLUSTER_IO_WAITING;
1434
1435 DTRACE_IO1(wait__start, buf_t, last);
1436 do {
1437 msleep(last, &cl_transaction_mtxp, PSPIN | (PRIBIO + 1), "cluster_wait_IO", NULL);
1438
1439 /*
1440 * We should only have been woken up if all the
1441 * buffers are completed, but just in case...
1442 */
1443 done = true;
1444 for (cbp = cbp_head; cbp != CLUSTER_IO_WAITING; cbp = cbp->b_trans_next) {
1445 if (!ISSET(cbp->b_flags, B_TDONE)) {
1446 done = false;
1447 break;
1448 }
1449 }
1450 } while (!done);
1451 DTRACE_IO1(wait__done, buf_t, last);
1452
1453 last->b_trans_next = NULL;
1454 }
1455
1456 lck_mtx_unlock(&cl_transaction_mtxp);
1457 } else { // !async
1458 for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) {
1459 buf_biowait(cbp);
1460 }
1461 }
1462 }
1463
1464 static void
1465 cluster_complete_transaction(buf_t *cbp_head, void *callback_arg, int *retval, int flags, int needwait)
1466 {
1467 buf_t cbp;
1468 int error;
1469 boolean_t isswapout = FALSE;
1470
1471 /*
1472 * cluster_complete_transaction will
1473 * only be called if we've issued a complete chain in synchronous mode
1474 * or, we've already done a cluster_wait_IO on an incomplete chain
1475 */
1476 if (needwait) {
1477 for (cbp = *cbp_head; cbp; cbp = cbp->b_trans_next) {
1478 buf_biowait(cbp);
1479 }
1480 }
1481 /*
1482 * we've already waited on all of the I/Os in this transaction,
1483 * so mark all of the buf_t's in this transaction as B_TDONE
1484 * so that cluster_iodone sees the transaction as completed
1485 */
1486 for (cbp = *cbp_head; cbp; cbp = cbp->b_trans_next) {
1487 cbp->b_flags |= B_TDONE;
1488 cbp->b_attr.ba_flags &= ~BA_ASYNC_VERIFY;
1489 }
1490 cbp = *cbp_head;
1491
1492 if ((flags & (CL_ASYNC | CL_PAGEOUT)) == CL_PAGEOUT && vnode_isswap(cbp->b_vp)) {
1493 isswapout = TRUE;
1494 }
1495
1496 error = cluster_iodone(cbp, callback_arg);
1497
1498 if (!(flags & CL_ASYNC) && error && *retval == 0) {
1499 if (((flags & (CL_PAGEOUT | CL_KEEPCACHED)) != CL_PAGEOUT) || (error != ENXIO)) {
1500 *retval = error;
1501 } else if (isswapout == TRUE) {
1502 *retval = error;
1503 }
1504 }
1505 *cbp_head = (buf_t)NULL;
1506 }
1507
1508 uint64_t cluster_direct_write_wired = 0;
1509
1510 static int
1511 cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int non_rounded_size,
1512 int flags, buf_t real_bp, struct clios *iostate, int (*callback)(buf_t, void *), void *callback_arg)
1513 {
1514 buf_t cbp;
1515 u_int size;
1516 u_int io_size;
1517 int io_flags;
1518 int bmap_flags;
1519 int error = 0;
1520 int retval = 0;
1521 buf_t cbp_head = NULL;
1522 buf_t cbp_tail = NULL;
1523 int trans_count = 0;
1524 int max_trans_count;
1525 u_int pg_count;
1526 int pg_offset;
1527 u_int max_iosize;
1528 u_int max_vectors;
1529 int priv;
1530 int zero_offset = 0;
1531 int async_throttle = 0;
1532 mount_t mp;
1533 size_t verify_block_size = 0;
1534 vm_offset_t upl_end_offset;
1535 vnode_verify_kind_t verify_kind = VK_HASH_NONE;
1536 boolean_t need_EOT = FALSE;
1537
1538 /*
1539 * we currently don't support buffers larger than a page
1540 */
1541 if (real_bp && non_rounded_size > PAGE_SIZE) {
1542 panic("%s(): Called with real buffer of size %d bytes which "
1543 "is greater than the maximum allowed size of "
1544 "%d bytes (the system PAGE_SIZE).\n",
1545 __FUNCTION__, non_rounded_size, PAGE_SIZE);
1546 }
1547
1548 mp = vp->v_mount;
1549
1550 /*
1551 * we don't want to do any funny rounding of the size for IO requests
1552 * coming through the DIRECT or CONTIGUOUS paths... those pages don't
1553 * belong to us... we can't extend (nor do we need to) the I/O to fill
1554 * out a page
1555 */
1556 if (mp->mnt_devblocksize > 1 && !(flags & (CL_DEV_MEMORY | CL_DIRECT_IO))) {
1557 /*
1558 * round the requested size up so that this I/O ends on a
1559 * page boundary in case this is a 'write'... if the filesystem
1560 * has blocks allocated to back the page beyond the EOF, we want to
1561 * make sure to write out the zero's that are sitting beyond the EOF
1562 * so that in case the filesystem doesn't explicitly zero this area
1563 * if a hole is created via a lseek/write beyond the current EOF,
1564 * it will return zeros when it's read back from the disk. If the
1565 * physical allocation doesn't extend for the whole page, we'll
1566 * only write/read from the disk up to the end of this allocation
1567 * via the extent info returned from the VNOP_BLOCKMAP call.
1568 */
1569 pg_offset = upl_offset & PAGE_MASK;
1570
1571 size = (((non_rounded_size + pg_offset) + (PAGE_SIZE - 1)) & ~PAGE_MASK) - pg_offset;
1572 } else {
1573 /*
1574 * anyone advertising a blocksize of 1 byte probably
1575 * can't deal with us rounding up the request size;
1576 * AFP is one such filesystem/device.
1577 */
1578 size = non_rounded_size;
1579 }
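/*
 * Rounding sketch for the page-aligned branch above (hypothetical values,
 * 4KB pages): with upl_offset = 0x600 and non_rounded_size = 0x1a50,
 * pg_offset is 0x600 and
 *
 *	size = ((0x1a50 + 0x600 + 0xfff) & ~0xfff) - 0x600 = 0x3000 - 0x600 = 0x2a00
 *
 * i.e. the I/O is extended so that it ends on a page boundary relative to
 * the UPL.
 */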
1580 upl_end_offset = upl_offset + size;
1581
1582 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_START, (int)f_offset, size, upl_offset, flags, 0);
1583
1584 /*
1585 * Set the maximum transaction size to the maximum desired number of
1586 * buffers.
1587 */
1588 max_trans_count = 8;
1589 if (flags & CL_DEV_MEMORY) {
1590 max_trans_count = 16;
1591 }
1592
1593 if (flags & CL_READ) {
1594 io_flags = B_READ;
1595 bmap_flags = VNODE_READ;
1596
1597 max_iosize = mp->mnt_maxreadcnt;
1598 max_vectors = mp->mnt_segreadcnt;
1599
1600 /* See if we can do cluster verification (pageins and aligned reads) */
1601 if ((flags & CL_PAGEIN || cluster_verify_threads) &&
1602 !(mp->mnt_kern_flag & MNTK_VIRTUALDEV) &&
1603 (VNOP_VERIFY(vp, f_offset, NULL, 0, &verify_block_size, NULL, VNODE_VERIFY_DEFAULT, NULL, &verify_kind) == 0) &&
1604 verify_block_size) {
1605 if (verify_block_size != PAGE_SIZE) {
1606 verify_block_size = 0;
1607 }
1608 if (real_bp && verify_block_size) {
1609 panic("%s(): Called with real buffer and needs verification ",
1610 __FUNCTION__);
1611 }
1612 /*
1613 * For reads, only allow cluster verification if f_offset
1614 * and upl_offset are both page aligned. Additionally, for direct reads,
1615 * require that the length of the read also be page aligned.
1616 * If they are not page aligned, leave it to the filesystem to do verification.
1617 * Strictly speaking, the alignments need to be for verify_block_size
1618 * but since the only verify_block_size that is currently supported
1619 * is page size, we check against page alignment.
1620 */
1621 if (verify_block_size && !(flags & CL_PAGEIN) &&
1622 ((f_offset & PAGE_MASK) || (upl_offset & PAGE_MASK) ||
1623 ((flags & CL_DIRECT_IO) && (non_rounded_size & PAGE_MASK)))) {
1624 verify_block_size = 0;
1625 verify_kind = VK_HASH_NONE;
1626 }
1627 if (verify_block_size && verify_kind && !upl_has_fs_verify_info(upl)) {
1628 upl_set_fs_verify_info(upl,
1629 (upl_adjusted_size(upl, PAGE_MASK) / mp->mnt_devblocksize) * get_num_bytes_for_verify_kind(verify_kind));
1630 }
1631 }
1632 } else {
1633 io_flags = B_WRITE;
1634 bmap_flags = VNODE_WRITE;
1635
1636 max_iosize = mp->mnt_maxwritecnt;
1637 max_vectors = mp->mnt_segwritecnt;
1638 }
1639 if (verify_block_size) {
1640 bmap_flags |= VNODE_CLUSTER_VERIFY;
1641 }
1642 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_NONE, max_iosize, max_vectors, mp->mnt_devblocksize, 0, 0);
1643
1644 /*
1645 * make sure the maximum iosize is a
1646 * multiple of the page size
1647 */
1648 max_iosize &= ~PAGE_MASK;
1649
1650 /*
1651 * Ensure the maximum iosize is sensible.
1652 */
1653 if (!max_iosize) {
1654 max_iosize = PAGE_SIZE;
1655 }
1656
1657 if (flags & CL_THROTTLE) {
1658 if (!(flags & CL_PAGEOUT) && cluster_is_throttled(vp)) {
1659 uint32_t max_throttle_size = calculate_max_throttle_size(vp);
1660
1661 if (max_iosize > max_throttle_size) {
1662 max_iosize = max_throttle_size;
1663 }
1664 async_throttle = calculate_max_throttle_cnt(vp);
1665 } else {
1666 if ((flags & CL_DEV_MEMORY)) {
1667 async_throttle = IO_SCALE(vp, VNODE_ASYNC_THROTTLE);
1668 } else {
1669 u_int max_cluster;
1670 u_int max_cluster_size;
1671 u_int scale;
1672
1673 if (vp->v_mount->mnt_minsaturationbytecount) {
1674 max_cluster_size = vp->v_mount->mnt_minsaturationbytecount;
1675
1676 scale = 1;
1677 } else {
1678 max_cluster_size = MAX_CLUSTER_SIZE(vp);
1679
1680 if (disk_conditioner_mount_is_ssd(vp->v_mount)) {
1681 scale = WRITE_THROTTLE_SSD;
1682 } else {
1683 scale = WRITE_THROTTLE;
1684 }
1685 }
1686 if (max_iosize > max_cluster_size) {
1687 max_cluster = max_cluster_size;
1688 } else {
1689 max_cluster = max_iosize;
1690 }
1691
1692 if (size < max_cluster) {
1693 max_cluster = size;
1694 }
1695
1696 if (flags & CL_CLOSE) {
1697 scale += MAX_CLUSTERS;
1698 }
1699
1700 async_throttle = min(IO_SCALE(vp, VNODE_ASYNC_THROTTLE), ((scale * max_cluster_size) / max_cluster) - 1);
1701 }
1702 }
1703 }
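/*
 * Illustrative async_throttle arithmetic (values assumed for the example, not
 * mandated anywhere): if max_cluster_size were 1MB, max_cluster worked out to
 * 256KB and scale to 4, then ((scale * max_cluster_size) / max_cluster) - 1 =
 * (4 * 1MB) / 256KB - 1 = 15, and async_throttle is the smaller of that and
 * IO_SCALE(vp, VNODE_ASYNC_THROTTLE).
 */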
1704 if (flags & CL_AGE) {
1705 io_flags |= B_AGE;
1706 }
1707 if (flags & (CL_PAGEIN | CL_PAGEOUT)) {
1708 io_flags |= B_PAGEIO;
1709 }
1710 if (flags & (CL_IOSTREAMING)) {
1711 io_flags |= B_IOSTREAMING;
1712 }
1713 if (flags & CL_COMMIT) {
1714 io_flags |= B_COMMIT_UPL;
1715 }
1716 if (flags & CL_DIRECT_IO) {
1717 io_flags |= B_PHYS;
1718 }
1719 if (flags & (CL_PRESERVE | CL_KEEPCACHED)) {
1720 io_flags |= B_CACHE;
1721 }
1722 if (flags & CL_PASSIVE) {
1723 io_flags |= B_PASSIVE;
1724 }
1725 if (flags & CL_ENCRYPTED) {
1726 io_flags |= B_ENCRYPTED_IO;
1727 }
1728
1729 if (vp->v_flag & VSYSTEM) {
1730 io_flags |= B_META;
1731 }
1732
1733 if ((flags & CL_READ) && ((upl_offset + non_rounded_size) & PAGE_MASK) && (!(flags & CL_NOZERO))) {
1734 /*
1735 * we are going to end up
1736 * with a page that we can't complete (the file size wasn't a multiple
1737 * of PAGE_SIZE and we're trying to read to the end of the file)...
1738 * so we'll go ahead and zero out the portion of the page we can't
1739 * read in from the file
1740 */
1741 zero_offset = (int)(upl_offset + non_rounded_size);
1742 } else if (!ISSET(flags, CL_READ) && ISSET(flags, CL_DIRECT_IO)) {
1743 assert(ISSET(flags, CL_COMMIT));
1744
1745 // For a direct/uncached write, we need to lock pages...
1746 upl_t cached_upl = NULL;
1747 upl_page_info_t *cached_pl;
1748
1749 assert(upl_offset < PAGE_SIZE);
1750
1751 /*
1752 *
1753 * f_offset = b
1754 * upl_offset = 8K
1755 *
1756 * (cached_upl - based on f_offset alignment)
1757 * 0 a b c
1758 * <----|----|----|----|----|----|-----|---->
1759 *
1760 *
1761 * (upl - based on user buffer address alignment)
1762 * <__--|----|----|--__>
1763 *
1764 * 0 1x 2x 3x
1765 *
1766 */
1767 const off_t cached_upl_f_offset = trunc_page_64(f_offset);
1768 const int cached_upl_size = round_page_32((f_offset - cached_upl_f_offset) + non_rounded_size);
1769 int num_retries = 0;
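/*
 * Example of the cached UPL geometry (illustrative values, 4K pages assumed):
 * for f_offset = 0x12345 and non_rounded_size = 0x3000,
 * cached_upl_f_offset = trunc_page_64(0x12345) = 0x12000 and
 * cached_upl_size = round_page_32(0x345 + 0x3000) = round_page_32(0x3345) = 0x4000,
 * i.e. the cached UPL spans every page the unaligned write touches.
 */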
1770
1771 /*
1772 * Create a UPL to lock the pages in the cache whilst the
1773 * write is in progress.
1774 */
1775 create_cached_upl:
1776 ubc_create_upl_kernel(vp, cached_upl_f_offset, cached_upl_size, &cached_upl,
1777 &cached_pl, UPL_SET_LITE | UPL_WILL_MODIFY, VM_KERN_MEMORY_FILE);
1778 if (cached_upl && upl_has_wired_pages(cached_upl)) {
1779 /*
1780 * Pages in this UPL would contain stale data after our direct write
1781 * (which is intended to overwrite these pages on disk). The UPL is
1782 * just holding these pages "busy" to synchronize with any other I/O
1783 * or mmap() access and we have to dump these pages when the direct
1784 * write is done.
1785 * But we can't do that for wired pages, so let's release this UPL
1786 * and fall back to the "cached" path.
1787 */
1788 // printf("******* FBDP %s:%d vp %p offset 0x%llx size 0x%llx - switching from direct to cached write\n", __FUNCTION__, __LINE__, vp, cached_upl_f_offset, (uint64_t)cached_upl_size);
1789 ubc_upl_abort_range(cached_upl, 0, cached_upl_size, UPL_ABORT_FREE_ON_EMPTY);
1790 cached_upl = NULL;
1791 cached_pl = NULL;
1792 cluster_direct_write_wired++;
1793 return ENOTSUP;
1794 }
1795
1796 /*
1797 * If we are not overwriting the first and last pages completely
1798 * we need to write them out first if they are dirty. These pages
1799 * will be discarded after the write completes so we might lose
1800 * the writes for the parts that are not overwritten.
1801 */
1802 bool first_page_needs_sync = false;
1803 bool last_page_needs_sync = false;
1804
1805 if (cached_upl && (cached_upl_f_offset < f_offset) && upl_dirty_page(cached_pl, 0)) {
1806 first_page_needs_sync = true;
1807 }
1808
1809 if (cached_upl && (cached_upl_f_offset + cached_upl_size) > (f_offset + non_rounded_size)) {
1810 int last_page = (cached_upl_size / PAGE_SIZE) - 1;
1811
1812 if ((last_page != 0 || !first_page_needs_sync) && upl_dirty_page(cached_pl, last_page)) {
1813 last_page_needs_sync = true;
1814 }
1815 }
1816
1817 if (first_page_needs_sync || last_page_needs_sync) {
1818 ubc_upl_abort_range(cached_upl, 0, cached_upl_size, UPL_ABORT_FREE_ON_EMPTY);
1819 cached_upl = NULL;
1820 cached_pl = NULL;
1821 if (first_page_needs_sync) {
1822 ubc_msync(vp, cached_upl_f_offset, cached_upl_f_offset + PAGE_SIZE, NULL, UBC_PUSHALL | UBC_INVALIDATE | UBC_SYNC);
1823 }
1824 if (last_page_needs_sync) {
1825 off_t cached_upl_end_offset = cached_upl_f_offset + cached_upl_size;
1826
1827 ubc_msync(vp, cached_upl_end_offset - PAGE_SIZE, cached_upl_end_offset, NULL, UBC_PUSHALL | UBC_INVALIDATE | UBC_SYNC);
1828 }
1829 if (++num_retries < 16) {
1830 goto create_cached_upl;
1831 }
1832 printf("%s : Number of retries for syncing first or last page reached %d\n", __FUNCTION__, num_retries);
1833 assertf(num_retries < 16, "%s : Number of retries for syncing first or last page reached %d\n", __FUNCTION__, num_retries);
1834 }
1835
1836 /*
1837 * Attach this UPL to the other UPL so that we can find it
1838 * later.
1839 */
1840 upl_set_associated_upl(upl, cached_upl);
1841 assertf(!cached_upl ||
1842 (upl_adjusted_offset(cached_upl, PAGE_MASK) == cached_upl_f_offset),
1843 "upl_adjusted_offset(cached_upl, PAGE_MASK) = %lld, cached_upl_f_offset = %lld",
1844 upl_adjusted_offset(cached_upl, PAGE_MASK), cached_upl_f_offset);
1845 }
1846
1847 while (size) {
1848 daddr64_t blkno;
1849 daddr64_t lblkno;
1850 size_t io_size_tmp;
1851 u_int io_size_wanted;
1852
1853 if (size > max_iosize) {
1854 io_size = max_iosize;
1855 } else {
1856 io_size = size;
1857 }
1858
1859 io_size_wanted = io_size;
1860 io_size_tmp = (size_t)io_size;
1861
1862 if ((error = VNOP_BLOCKMAP(vp, f_offset, io_size, &blkno, &io_size_tmp, NULL, bmap_flags, NULL))) {
1863 break;
1864 }
1865
1866 if (io_size_tmp > io_size_wanted) {
1867 io_size = io_size_wanted;
1868 } else {
1869 io_size = (u_int)io_size_tmp;
1870 }
1871
1872 if (real_bp && (real_bp->b_blkno == real_bp->b_lblkno)) {
1873 real_bp->b_blkno = blkno;
1874 }
1875
1876 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 24)) | DBG_FUNC_NONE,
1877 (int)f_offset, (int)(blkno >> 32), (int)blkno, io_size, 0);
1878
1879 if (io_size == 0) {
1880 /*
1881 * vnop_blockmap didn't return an error... however, it did
1882 * return an extent size of 0 which means we can't
1883 * make forward progress on this I/O... a hole in the
1884 * file would be returned as a blkno of -1 with a non-zero io_size...
1885 * a real extent is returned with a blkno != -1 and a non-zero io_size
1886 */
1887 error = EINVAL;
1888 break;
1889 }
1890 if (!(flags & CL_READ) && blkno == -1) {
1891 off_t e_offset;
1892 int pageout_flags;
1893
1894 if (upl_get_internal_vectorupl(upl)) {
1895 panic("Vector UPLs should not take this code-path");
1896 }
1897 /*
1898 * we're writing into a 'hole'
1899 */
1900 if (flags & CL_PAGEOUT) {
1901 /*
1902 * if we got here via cluster_pageout
1903 * then just error the request and return
1904 * the 'hole' should already have been covered
1905 */
1906 error = EINVAL;
1907 break;
1908 }
1909 /*
1910 * we can get here if the cluster code happens to
1911 * pick up a page that was dirtied via mmap vs
1912 * a 'write' and the page targets a 'hole'...
1913 * i.e. the writes to the cluster were sparse
1914 * and the file was being written for the first time
1915 *
1916 * we can also get here if the filesystem supports
1917 * 'holes' that are less than PAGE_SIZE.... because
1918 * we can't know if the range in the page that covers
1919 * the 'hole' has been dirtied via an mmap or not,
1920 * we have to assume the worst and try to push the
1921 * entire page to storage.
1922 *
1923 * Try paging out the page individually before
1924 * giving up entirely and dumping it (the pageout
1925 * path will ensure that the zero extent accounting
1926 * has been taken care of before we get back into cluster_io)
1927 *
1928 * go direct to vnode_pageout so that we don't have to
1929 * unbusy the page from the UPL... we used to do this
1930 * so that we could call ubc_msync, but that results
1931 * in a potential deadlock if someone else races us to acquire
1932 * that page and wins and in addition needs one of the pages
1933 * we're continuing to hold in the UPL
1934 */
1935 pageout_flags = UPL_MSYNC | UPL_VNODE_PAGER | UPL_NESTED_PAGEOUT;
1936
1937 if (!(flags & CL_ASYNC)) {
1938 pageout_flags |= UPL_IOSYNC;
1939 }
1940 if (!(flags & CL_COMMIT)) {
1941 pageout_flags |= UPL_NOCOMMIT;
1942 }
1943
1944 if (cbp_head) {
1945 buf_t prev_cbp;
1946 uint32_t bytes_in_last_page;
1947
1948 /*
1949 * first we have to wait for the current outstanding I/Os
1950 * to complete... EOT hasn't been set yet on this transaction
1951 * so the pages won't be released
1952 */
1953 cluster_wait_IO(cbp_head, (flags & CL_ASYNC));
1954
1955 bytes_in_last_page = cbp_head->b_uploffset & PAGE_MASK;
1956 for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) {
1957 bytes_in_last_page += cbp->b_bcount;
1958 }
1959 bytes_in_last_page &= PAGE_MASK;
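/*
 * Example of the computation above (illustrative values): if the transaction
 * starts at b_uploffset = 0x1e00, then (0x1e00 & PAGE_MASK) = 0xe00; adding
 * 0x2400 bytes worth of bufs gives 0x3200, and 0x3200 & PAGE_MASK = 0x200,
 * i.e. 0x200 bytes of the transaction spill into the page we are about to
 * push out via vnode_pageout and must be trimmed from the transaction below.
 */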
1960
1961 while (bytes_in_last_page) {
1962 /*
1963 * we've got a transaction that
1964 * includes the page we're about to push out through vnode_pageout...
1965 * find the bp's in the list which intersect this page and either
1966 * remove them entirely from the transaction (there could be multiple bp's), or
1967 * round its iosize down to the page boundary (there can only be one)...
1968 *
1969 * find the last bp in the list and act on it
1970 */
1971 for (prev_cbp = cbp = cbp_head; cbp->b_trans_next; cbp = cbp->b_trans_next) {
1972 prev_cbp = cbp;
1973 }
1974
1975 if (bytes_in_last_page >= cbp->b_bcount) {
1976 /*
1977 * this buf no longer has any I/O associated with it
1978 */
1979 bytes_in_last_page -= cbp->b_bcount;
1980 cbp->b_bcount = 0;
1981
1982 free_io_buf(cbp);
1983
1984 if (cbp == cbp_head) {
1985 assert(bytes_in_last_page == 0);
1986 /*
1987 * the buf we just freed was the only buf in
1988 * this transaction... so there's no I/O to do
1989 */
1990 cbp_head = NULL;
1991 cbp_tail = NULL;
1992 } else {
1993 /*
1994 * remove the buf we just freed from
1995 * the transaction list
1996 */
1997 prev_cbp->b_trans_next = NULL;
1998 cbp_tail = prev_cbp;
1999 }
2000 } else {
2001 /*
2002 * this is the last bp that has I/O
2003 * intersecting the page of interest
2004 * only some of the I/O is in the intersection
2005 * so clip the size but keep it in the transaction list
2006 */
2007 cbp->b_bcount -= bytes_in_last_page;
2008 cbp_tail = cbp;
2009 bytes_in_last_page = 0;
2010 }
2011 }
2012 if (cbp_head) {
2013 /*
2014 * there was more to the current transaction
2015 * than just the page we are pushing out via vnode_pageout...
2016 * mark it as finished and complete it... we've already
2017 * waited for the I/Os to complete above in the call to cluster_wait_IO
2018 */
2019 cluster_EOT(cbp_head, cbp_tail, 0, 0);
2020
2021 cluster_complete_transaction(&cbp_head, callback_arg, &retval, flags, 0);
2022
2023 trans_count = 0;
2024 }
2025 }
2026 if (vnode_pageout(vp, upl, (upl_offset_t)trunc_page(upl_offset), trunc_page_64(f_offset), PAGE_SIZE, pageout_flags, NULL) != PAGER_SUCCESS) {
2027 error = EINVAL;
2028 }
2029 e_offset = round_page_64(f_offset + 1);
2030 io_size = (u_int)(e_offset - f_offset);
2031
2032 f_offset += io_size;
2033 upl_offset += io_size;
2034
2035 if (size >= io_size) {
2036 size -= io_size;
2037 } else {
2038 size = 0;
2039 }
2040 /*
2041 * keep track of how much of the original request
2042 * that we've actually completed... non_rounded_size
2043 * may go negative due to us rounding the request
2044 * to a page size multiple (i.e. size > non_rounded_size)
2045 */
2046 non_rounded_size -= io_size;
2047
2048 if (non_rounded_size <= 0) {
2049 /*
2050 * we've transferred all of the data in the original
2051 * request, but we were unable to complete the tail
2052 * of the last page because the file didn't have
2053 * an allocation to back that portion... this is ok.
2054 */
2055 size = 0;
2056 }
2057 if (error) {
2058 if (size == 0) {
2059 flags &= ~CL_COMMIT;
2060 }
2061 break;
2062 }
2063 continue;
2064 }
2065
2066 lblkno = (daddr64_t)(f_offset / CLUSTER_IO_BLOCK_SIZE);
2067
2068 /*
2069 * we have now figured out how much I/O we can do - this is in 'io_size'
2070 * pg_offset is the starting point in the first page for the I/O
2071 * pg_count is the number of full and partial pages that 'io_size' encompasses
2072 */
2073 pg_offset = upl_offset & PAGE_MASK;
2074
2075 if (flags & CL_DEV_MEMORY) {
2076 /*
2077 * treat physical requests as one 'giant' page
2078 */
2079 pg_count = 1;
2080 } else {
2081 pg_count = (io_size + pg_offset + (PAGE_SIZE - 1)) / PAGE_SIZE;
2082 }
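/*
 * Example (illustrative values): io_size = 0x2600 starting at pg_offset = 0x200
 * spans (0x2600 + 0x200 + 0xfff) / PAGE_SIZE = 3 full or partial pages, so
 * pg_count = 3 unless CL_DEV_MEMORY forces the single-"page" treatment above.
 */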
2083
2084 if ((flags & CL_READ) && blkno == -1) {
2085 vm_offset_t commit_offset;
2086 int bytes_to_zero;
2087 int complete_transaction_now = 0;
2088
2089 /*
2090 * if we're reading and blkno == -1, then we've got a
2091 * 'hole' in the file that we need to deal with by zeroing
2092 * out the affected area in the upl
2093 */
2094 if (io_size >= (u_int)non_rounded_size) {
2095 /*
2096 * if this upl contains the EOF and it is not a multiple of PAGE_SIZE
2097 * then 'zero_offset' will be non-zero
2098 * if the 'hole' returned by vnop_blockmap extends all the way to the eof
2099 * (indicated by the io_size finishing off the I/O request for this UPL)
2100 * then we're not going to issue an I/O for the
2101 * last page in this upl... we need to zero both the hole and the tail
2102 * of the page beyond the EOF, since the delayed zero-fill won't kick in
2103 */
2104 bytes_to_zero = non_rounded_size;
2105 if (!(flags & CL_NOZERO)) {
2106 bytes_to_zero = (int)((((upl_offset + io_size) + (PAGE_SIZE - 1)) & ~PAGE_MASK) - upl_offset);
2107 }
2108
2109 zero_offset = 0;
2110 } else {
2111 bytes_to_zero = io_size;
2112 }
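/*
 * Example of bytes_to_zero (illustrative values): if upl_offset = 0x2200,
 * io_size = 0xd00 and this finishes the request (io_size >= non_rounded_size),
 * then without CL_NOZERO we zero ((0x2200 + 0xd00 + 0xfff) & ~PAGE_MASK) - 0x2200
 * = 0x3000 - 0x2200 = 0xe00 bytes, covering both the hole and the tail of the
 * final page beyond the EOF.
 */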
2113
2114 pg_count = 0;
2115
2116 cluster_zero(upl, (upl_offset_t)upl_offset, bytes_to_zero, real_bp);
2117
2118 if (cbp_head) {
2119 int pg_resid;
2120
2121 /*
2122 * if there is a current I/O chain pending
2123 * then the first page of the group we just zero'd
2124 * will be handled by the I/O completion if the zero
2125 * fill started in the middle of the page
2126 */
2127 commit_offset = (upl_offset + (PAGE_SIZE - 1)) & ~PAGE_MASK;
2128
2129 pg_resid = (int)(commit_offset - upl_offset);
2130
2131 if (bytes_to_zero >= pg_resid) {
2132 /*
2133 * the last page of the current I/O
2134 * has been completed...
2135 * compute the number of fully zero'd
2136 * pages that are beyond it
2137 * plus the last page if it's partial
2138 * and we have no more I/O to issue...
2139 * otherwise a partial page is left
2140 * to begin the next I/O
2141 */
2142 if ((int)io_size >= non_rounded_size) {
2143 pg_count = (bytes_to_zero - pg_resid + (PAGE_SIZE - 1)) / PAGE_SIZE;
2144 } else {
2145 pg_count = (bytes_to_zero - pg_resid) / PAGE_SIZE;
2146 }
2147
2148 complete_transaction_now = 1;
2149 }
2150 } else {
2151 /*
2152 * no pending I/O to deal with
2153 * so, commit all of the fully zero'd pages
2154 * plus the last page if it's partial
2155 * and we have no more I/O to issue...
2156 * otherwise a partial page is left
2157 * to begin the next I/O
2158 */
2159 if ((int)io_size >= non_rounded_size) {
2160 pg_count = (pg_offset + bytes_to_zero + (PAGE_SIZE - 1)) / PAGE_SIZE;
2161 } else {
2162 pg_count = (pg_offset + bytes_to_zero) / PAGE_SIZE;
2163 }
2164
2165 commit_offset = upl_offset & ~PAGE_MASK;
2166 }
2167
2168 // Associated UPL is currently only used in the direct write path
2169 assert(!upl_associated_upl(upl));
2170
2171 if ((flags & CL_COMMIT) && pg_count) {
2172 ubc_upl_commit_range(upl, (upl_offset_t)commit_offset,
2173 pg_count * PAGE_SIZE,
2174 UPL_COMMIT_CLEAR_DIRTY | UPL_COMMIT_FREE_ON_EMPTY);
2175 }
2176 upl_offset += io_size;
2177 f_offset += io_size;
2178 size -= io_size;
2179
2180 /*
2181 * keep track of how much of the original request
2182 * that we've actually completed... non_rounded_size
2183 * may go negative due to us rounding the request
2184 * to a page size multiple (i.e. size > non_rounded_size)
2185 */
2186 non_rounded_size -= io_size;
2187
2188 if (non_rounded_size <= 0) {
2189 /*
2190 * we've transferred all of the data in the original
2191 * request, but we were unable to complete the tail
2192 * of the last page because the file didn't have
2193 * an allocation to back that portion... this is ok.
2194 */
2195 size = 0;
2196 }
2197 if (cbp_head && (complete_transaction_now || size == 0)) {
2198 cluster_wait_IO(cbp_head, (flags & CL_ASYNC));
2199
2200 cluster_EOT(cbp_head, cbp_tail, size == 0 ? zero_offset : 0, verify_block_size);
2201
2202 cluster_complete_transaction(&cbp_head, callback_arg, &retval, flags, 0);
2203
2204 trans_count = 0;
2205 }
2206 continue;
2207 }
2208 if (pg_count > max_vectors) {
2209 if (((pg_count - max_vectors) * PAGE_SIZE) > io_size) {
2210 io_size = PAGE_SIZE - pg_offset;
2211 pg_count = 1;
2212 } else {
2213 io_size -= (pg_count - max_vectors) * PAGE_SIZE;
2214 pg_count = max_vectors;
2215 }
2216 }
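/*
 * Example of the clipping above (illustrative values): with max_vectors = 16,
 * pg_count = 18 and io_size = 0x11e00, the excess (18 - 16) * PAGE_SIZE = 0x2000
 * is smaller than io_size, so io_size shrinks to 0xfe00 and pg_count to 16;
 * only if the excess exceeded io_size would we fall back to a single
 * (PAGE_SIZE - pg_offset) sized I/O.
 */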
2217 /*
2218 * If the transaction is going to reach the maximum number of
2219 * desired elements, truncate the i/o to the nearest page so
2220 * that the actual i/o is initiated after this buffer is
2221 * created and added to the i/o chain.
2222 *
2223 * I/O directed to physically contiguous memory
2224 * doesn't have a requirement to make sure we 'fill' a page
2225 */
2226 if (!(flags & CL_DEV_MEMORY) && trans_count >= max_trans_count &&
2227 ((upl_offset + io_size) & PAGE_MASK)) {
2228 vm_offset_t aligned_ofs;
2229
2230 aligned_ofs = (upl_offset + io_size) & ~PAGE_MASK;
2231 /*
2232 * If the io_size does not actually finish off even a
2233 * single page we have to keep adding buffers to the
2234 * transaction despite having reached the desired limit.
2235 *
2236 * Eventually we get here with the page being finished
2237 * off (and exceeded) and then we truncate the size of
2238 * this i/o request so that it is page aligned so that
2239 * we can finally issue the i/o on the transaction.
2240 */
2241 if (aligned_ofs > upl_offset) {
2242 io_size = (u_int)(aligned_ofs - upl_offset);
2243 pg_count--;
2244 }
2245 }
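/*
 * Example (illustrative values): with upl_offset = 0x5200 and io_size = 0x1c00,
 * aligned_ofs = (0x5200 + 0x1c00) & ~PAGE_MASK = 0x6000, so io_size is cut to
 * 0x6000 - 0x5200 = 0xe00 and the transaction can be issued on a page boundary.
 */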
2246
2247 if (!(mp->mnt_kern_flag & MNTK_VIRTUALDEV)) {
2248 /*
2249 * if we're not targeting a virtual device i.e. a disk image
2250 * it's safe to dip into the reserve pool since real devices
2251 * can complete this I/O request without requiring additional
2252 * bufs from the alloc_io_buf pool
2253 */
2254 priv = 1;
2255 } else if ((flags & CL_ASYNC) && !(flags & CL_PAGEOUT) && !cbp_head) {
2256 /*
2257 * Throttle the speculative IO
2258 *
2259 * We can only throttle this if it is the first iobuf
2260 * for the transaction. alloc_io_buf implements
2261 * additional restrictions for diskimages anyway.
2262 */
2263 priv = 0;
2264 } else {
2265 priv = 1;
2266 }
2267
2268 cbp = alloc_io_buf(vp, priv);
2269
2270 if (flags & CL_PAGEOUT) {
2271 u_int i;
2272
2273 /*
2274 * since blocks are in offsets of CLUSTER_IO_BLOCK_SIZE, scale
2275 * iteration to (PAGE_SIZE * pg_count) of blks.
2276 */
2277 for (i = 0; i < (PAGE_SIZE * pg_count) / CLUSTER_IO_BLOCK_SIZE; i++) {
2278 if (buf_invalblkno(vp, lblkno + i, 0) == EBUSY) {
2279 panic("BUSY bp found in cluster_io");
2280 }
2281 }
2282 }
2283 if (flags & CL_ASYNC) {
2284 if (buf_setcallback(cbp, (void *)cluster_iodone, callback_arg)) {
2285 panic("buf_setcallback failed");
2286 }
2287 }
2288 cbp->b_cliodone = (void *)callback;
2289 cbp->b_flags |= io_flags;
2290 if (flags & CL_NOCACHE) {
2291 cbp->b_attr.ba_flags |= BA_NOCACHE;
2292 }
2293 if (verify_block_size) {
2294 cbp->b_attr.ba_flags |= BA_WILL_VERIFY;
2295 if (verify_kind) {
2296 cbp->b_attr.ba_verify_type = verify_kind;
2297 }
2298 }
2299
2300 cbp->b_lblkno = lblkno;
2301 cbp->b_clfoffset = f_offset;
2302 cbp->b_blkno = blkno;
2303 cbp->b_bcount = io_size;
2304
2305 if (buf_setupl(cbp, upl, (uint32_t)upl_offset)) {
2306 panic("buf_setupl failed");
2307 }
2308 #if CONFIG_IOSCHED
2309 upl_set_blkno(upl, upl_offset, io_size, blkno);
2310 #endif
2311 cbp->b_trans_next = (buf_t)NULL;
2312
2313 if ((cbp->b_iostate = (void *)iostate)) {
2314 /*
2315 * caller wants to track the state of this
2316 * io... bump the amount issued against this stream
2317 */
2318 iostate->io_issued += io_size;
2319 }
2320
2321 if (flags & CL_READ) {
2322 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 26)) | DBG_FUNC_NONE,
2323 (int)cbp->b_lblkno, (int)cbp->b_blkno, upl_offset, io_size, 0);
2324 } else {
2325 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 27)) | DBG_FUNC_NONE,
2326 (int)cbp->b_lblkno, (int)cbp->b_blkno, upl_offset, io_size, 0);
2327 }
2328
2329 if (cbp_head) {
2330 cbp_tail->b_trans_next = cbp;
2331 cbp_tail = cbp;
2332 } else {
2333 cbp_head = cbp;
2334 cbp_tail = cbp;
2335
2336 if ((cbp_head->b_real_bp = real_bp)) {
2337 real_bp = (buf_t)NULL;
2338 }
2339 }
2340 *(buf_t *)(&cbp->b_trans_head) = cbp_head;
2341
2342 trans_count++;
2343
2344 upl_offset += io_size;
2345 f_offset += io_size;
2346 size -= io_size;
2347 /*
2348 * keep track of how much of the original request
2349 * that we've actually completed... non_rounded_size
2350 * may go negative due to us rounding the request
2351 * to a page size multiple (i.e. size > non_rounded_size)
2352 */
2353 non_rounded_size -= io_size;
2354
2355 if (non_rounded_size <= 0) {
2356 /*
2357 * we've transferred all of the data in the original
2358 * request, but we were unable to complete the tail
2359 * of the last page because the file didn't have
2360 * an allocation to back that portion... this is ok.
2361 */
2362 size = 0;
2363 }
2364 if (size == 0) {
2365 /*
2366 * we have no more I/O to issue, so go
2367 * finish the final transaction
2368 */
2369 need_EOT = TRUE;
2370 } else if (((flags & CL_DEV_MEMORY) || (upl_offset & PAGE_MASK) == 0) &&
2371 ((flags & CL_ASYNC) || trans_count > max_trans_count)) {
2372 /*
2373 * I/O directed to physically contiguous memory...
2374 * which doesn't have a requirement to make sure we 'fill' a page
2375 * or...
2376 * the current I/O we've prepared fully
2377 * completes the last page in this request
2378 * and ...
2379 * it's either an ASYNC request or
2380 * we've already accumulated more than max_trans_count I/O's into
2381 * this transaction so mark it as complete so that
2382 * it can finish asynchronously or via the cluster_complete_transaction
2383 * below if the request is synchronous
2384 */
2385 need_EOT = TRUE;
2386 }
2387 if (need_EOT == TRUE) {
2388 cluster_EOT(cbp_head, cbp_tail, size == 0 ? zero_offset : 0, verify_block_size);
2389 }
2390
2391 if (flags & CL_THROTTLE) {
2392 (void)vnode_waitforwrites(vp, async_throttle, 0, 0, "cluster_io");
2393 }
2394
2395 if (!(io_flags & B_READ)) {
2396 vnode_startwrite(vp);
2397 }
2398
2399 if (flags & CL_RAW_ENCRYPTED) {
2400 /*
2401 * User requested raw encrypted bytes.
2402 * Twiddle the bit in the ba_flags for the buffer
2403 */
2404 cbp->b_attr.ba_flags |= BA_RAW_ENCRYPTED_IO;
2405 }
2406
2407 (void) VNOP_STRATEGY(cbp);
2408
2409 if (need_EOT == TRUE) {
2410 if (!(flags & CL_ASYNC)) {
2411 cluster_complete_transaction(&cbp_head, callback_arg, &retval, flags, 1);
2412 }
2413
2414 need_EOT = FALSE;
2415 trans_count = 0;
2416 cbp_head = NULL;
2417 }
2418 }
2419 if (error) {
2420 int abort_size;
2421
2422 io_size = 0;
2423
2424 if (cbp_head) {
2425 /*
2426 * Wait until all of the outstanding I/O
2427 * for this partial transaction has completed
2428 */
2429 cluster_wait_IO(cbp_head, (flags & CL_ASYNC));
2430
2431 /*
2432 * Rewind the upl offset to the beginning of the
2433 * transaction.
2434 */
2435 upl_offset = cbp_head->b_uploffset;
2436 }
2437
2438 if (ISSET(flags, CL_COMMIT)) {
2439 cluster_handle_associated_upl(iostate, upl,
2440 (upl_offset_t)upl_offset,
2441 (upl_size_t)(upl_end_offset - upl_offset),
2442 cbp_head ? cbp_head->b_clfoffset : f_offset);
2443 }
2444
2445 // Free all the IO buffers in this transaction
2446 for (cbp = cbp_head; cbp;) {
2447 buf_t cbp_next;
2448
2449 size += cbp->b_bcount;
2450 io_size += cbp->b_bcount;
2451
2452 cbp_next = cbp->b_trans_next;
2453 free_io_buf(cbp);
2454 cbp = cbp_next;
2455 }
2456
2457 if (iostate) {
2458 int need_wakeup = 0;
2459
2460 /*
2461 * update the error condition for this stream
2462 * since we never really issued the io
2463 * just go ahead and adjust it back
2464 */
2465 lck_mtx_lock_spin(&iostate->io_mtxp);
2466
2467 if (iostate->io_error == 0) {
2468 iostate->io_error = error;
2469 }
2470 iostate->io_issued -= io_size;
2471
2472 if (iostate->io_wanted) {
2473 /*
2474 * someone is waiting for the state of
2475 * this io stream to change
2476 */
2477 iostate->io_wanted = 0;
2478 need_wakeup = 1;
2479 }
2480 lck_mtx_unlock(&iostate->io_mtxp);
2481
2482 if (need_wakeup) {
2483 wakeup((caddr_t)&iostate->io_wanted);
2484 }
2485 }
2486
2487 if (flags & CL_COMMIT) {
2488 int upl_flags;
2489
2490 pg_offset = upl_offset & PAGE_MASK;
2491 abort_size = (int)((upl_end_offset - upl_offset + PAGE_MASK) & ~PAGE_MASK);
2492
2493 upl_flags = cluster_ioerror(upl, (int)(upl_offset - pg_offset),
2494 abort_size, error, io_flags, vp);
2495
2496 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 28)) | DBG_FUNC_NONE,
2497 upl, upl_offset - pg_offset, abort_size, (error << 24) | upl_flags, 0);
2498 }
2499 if (retval == 0) {
2500 retval = error;
2501 }
2502 } else if (cbp_head) {
2503 panic("%s(): cbp_head is not NULL.", __FUNCTION__);
2504 }
2505
2506 if (real_bp) {
2507 /*
2508 * can get here if we either encountered an error
2509 * or we completely zero-filled the request and
2510 * no I/O was issued
2511 */
2512 if (error) {
2513 real_bp->b_flags |= B_ERROR;
2514 real_bp->b_error = error;
2515 }
2516 buf_biodone(real_bp);
2517 }
2518 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_END, (int)f_offset, size, upl_offset, retval, 0);
2519
2520 return retval;
2521 }
2522
2523 #define reset_vector_run_state() \
2524 issueVectorUPL = vector_upl_offset = vector_upl_index = vector_upl_iosize = vector_upl_size = 0;
2525
2526 static int
2527 vector_cluster_io(vnode_t vp, upl_t vector_upl, vm_offset_t vector_upl_offset, off_t v_upl_uio_offset, int vector_upl_iosize,
2528 int io_flag, buf_t real_bp, struct clios *iostate, int (*callback)(buf_t, void *), void *callback_arg)
2529 {
2530 vector_upl_set_pagelist(vector_upl);
2531
2532 if (io_flag & CL_READ) {
2533 if (vector_upl_offset == 0 && ((vector_upl_iosize & PAGE_MASK) == 0)) {
2534 io_flag &= ~CL_PRESERVE; /*don't zero fill*/
2535 } else {
2536 io_flag |= CL_PRESERVE; /*zero fill*/
2537 }
2538 }
2539 return cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, real_bp, iostate, callback, callback_arg);
2540 }
2541
2542 static int
2543 cluster_read_prefetch(vnode_t vp, off_t f_offset, u_int size, off_t filesize, int (*callback)(buf_t, void *), void *callback_arg, int bflag)
2544 {
2545 int pages_in_prefetch;
2546
2547 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_START,
2548 (int)f_offset, size, (int)filesize, 0, 0);
2549
2550 if (f_offset >= filesize) {
2551 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
2552 (int)f_offset, 0, 0, 0, 0);
2553 return 0;
2554 }
2555 if ((off_t)size > (filesize - f_offset)) {
2556 size = (u_int)(filesize - f_offset);
2557 }
2558 pages_in_prefetch = (size + (PAGE_SIZE - 1)) / PAGE_SIZE;
2559
2560 advisory_read_ext(vp, filesize, f_offset, size, callback, callback_arg, bflag);
2561
2562 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
2563 (int)f_offset + size, pages_in_prefetch, 0, 1, 0);
2564
2565 return pages_in_prefetch;
2566 }
2567
2568
2569
2570 static void
2571 cluster_read_ahead(vnode_t vp, struct cl_extent *extent, off_t filesize, struct cl_readahead *rap, int (*callback)(buf_t, void *), void *callback_arg,
2572 int bflag)
2573 {
2574 daddr64_t r_addr;
2575 off_t f_offset;
2576 int size_of_prefetch;
2577 u_int max_prefetch;
2578
2579
2580 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_START,
2581 (int)extent->b_addr, (int)extent->e_addr, (int)rap->cl_lastr, 0, 0);
2582
2583 if (extent->b_addr == rap->cl_lastr && extent->b_addr == extent->e_addr) {
2584 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
2585 rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 0, 0);
2586 return;
2587 }
2588 if (rap->cl_lastr == -1 || (extent->b_addr != rap->cl_lastr && extent->b_addr != (rap->cl_lastr + 1))) {
2589 rap->cl_ralen = 0;
2590 rap->cl_maxra = 0;
2591
2592 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
2593 rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 1, 0);
2594
2595 return;
2596 }
2597
2598 max_prefetch = cluster_max_prefetch(vp,
2599 cluster_max_io_size(vp->v_mount, CL_READ), speculative_prefetch_max);
2600
2601 if (max_prefetch <= PAGE_SIZE) {
2602 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
2603 rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 6, 0);
2604 return;
2605 }
2606 if (extent->e_addr < rap->cl_maxra && rap->cl_ralen >= 4) {
2607 if ((rap->cl_maxra - extent->e_addr) > (rap->cl_ralen / 4)) {
2608 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
2609 rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 2, 0);
2610 return;
2611 }
2612 }
2613 r_addr = MAX(extent->e_addr, rap->cl_maxra) + 1;
2614 f_offset = (off_t)(r_addr * PAGE_SIZE_64);
2615
2616 size_of_prefetch = 0;
2617
2618 ubc_range_op(vp, f_offset, f_offset + PAGE_SIZE_64, UPL_ROP_PRESENT, &size_of_prefetch);
2619
2620 if (size_of_prefetch) {
2621 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
2622 rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 3, 0);
2623 return;
2624 }
2625 if (f_offset < filesize) {
2626 daddr64_t read_size;
2627
2628 rap->cl_ralen = rap->cl_ralen ? min(max_prefetch / PAGE_SIZE, rap->cl_ralen << 1) : 1;
2629
2630 read_size = (extent->e_addr + 1) - extent->b_addr;
2631
2632 if (read_size > rap->cl_ralen) {
2633 if (read_size > max_prefetch / PAGE_SIZE) {
2634 rap->cl_ralen = max_prefetch / PAGE_SIZE;
2635 } else {
2636 rap->cl_ralen = (int)read_size;
2637 }
2638 }
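/*
 * Example of the read-ahead window growth above (illustrative values):
 * a previous cl_ralen of 4 pages doubles to 8 on the next sequential hit;
 * it is then raised to the size of the triggering read if that is larger,
 * and in all cases clamped to max_prefetch / PAGE_SIZE.
 */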
2639 size_of_prefetch = cluster_read_prefetch(vp, f_offset, rap->cl_ralen * PAGE_SIZE, filesize, callback, callback_arg, bflag);
2640
2641 if (size_of_prefetch) {
2642 rap->cl_maxra = (r_addr + size_of_prefetch) - 1;
2643 }
2644 }
2645 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
2646 rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 4, 0);
2647 }
2648
2649
2650 int
2651 cluster_pageout(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset,
2652 int size, off_t filesize, int flags)
2653 {
2654 return cluster_pageout_ext(vp, upl, upl_offset, f_offset, size, filesize, flags, NULL, NULL);
2655 }
2656
2657
2658 int
2659 cluster_pageout_ext(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset,
2660 int size, off_t filesize, int flags, int (*callback)(buf_t, void *), void *callback_arg)
2661 {
2662 int io_size;
2663 int rounded_size;
2664 off_t max_size;
2665 int local_flags;
2666
2667 local_flags = CL_PAGEOUT | CL_THROTTLE;
2668
2669 if ((flags & UPL_IOSYNC) == 0) {
2670 local_flags |= CL_ASYNC;
2671 }
2672 if ((flags & UPL_NOCOMMIT) == 0) {
2673 local_flags |= CL_COMMIT;
2674 }
2675 if ((flags & UPL_KEEPCACHED)) {
2676 local_flags |= CL_KEEPCACHED;
2677 }
2678 if (flags & UPL_PAGING_ENCRYPTED) {
2679 local_flags |= CL_ENCRYPTED;
2680 }
2681
2682
2683 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 52)) | DBG_FUNC_NONE,
2684 (int)f_offset, size, (int)filesize, local_flags, 0);
2685
2686 /*
2687 * If they didn't specify any I/O, then we are done...
2688 * we can't issue an abort because we don't know how
2689 * big the upl really is
2690 */
2691 if (size <= 0) {
2692 return EINVAL;
2693 }
2694
2695 if (vp->v_mount->mnt_flag & MNT_RDONLY) {
2696 if (local_flags & CL_COMMIT) {
2697 ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
2698 }
2699 return EROFS;
2700 }
2701 /*
2702 * can't page-out from a negative offset
2703 * or if we're starting beyond the EOF
2704 * or if the file offset isn't page aligned
2705 * or the size requested isn't a multiple of PAGE_SIZE
2706 */
2707 if (f_offset < 0 || f_offset >= filesize ||
2708 (f_offset & PAGE_MASK_64) || (size & PAGE_MASK)) {
2709 if (local_flags & CL_COMMIT) {
2710 ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
2711 }
2712 return EINVAL;
2713 }
2714 max_size = filesize - f_offset;
2715
2716 if (size < max_size) {
2717 io_size = size;
2718 } else {
2719 io_size = (int)max_size;
2720 }
2721
2722 rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
2723
2724 if (size > rounded_size) {
2725 if (local_flags & CL_COMMIT) {
2726 ubc_upl_abort_range(upl, upl_offset + rounded_size, size - rounded_size,
2727 UPL_ABORT_FREE_ON_EMPTY);
2728 }
2729 }
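/*
 * Example of the trim above (illustrative values): paging out 0x8000 bytes when
 * only 0x5800 bytes remain before EOF gives io_size = 0x5800 and
 * rounded_size = 0x6000; the trailing 0x2000 bytes of the UPL beyond the
 * rounded size are aborted rather than written.
 */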
2730 return cluster_io(vp, upl, upl_offset, f_offset, io_size,
2731 local_flags, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
2732 }
2733
2734
2735 int
2736 cluster_pagein(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset,
2737 int size, off_t filesize, int flags)
2738 {
2739 return cluster_pagein_ext(vp, upl, upl_offset, f_offset, size, filesize, flags, NULL, NULL);
2740 }
2741
2742 #define SPLIT_PAGEIN_MAX_IOSIZE 32768
2743
2744 /*
2745 * Do a big pagein request as multiple I/Os - the first I/O will be
2746 * SPLIT_PAGEIN_MAX_IOSIZE (32K) in size and will include the page that caused
2747 * the fault; i/o will then be initiated for the remainder.
2748 */
2749 static int
2750 cluster_handle_split_pagein(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset,
2751 u_int io_size, int rounded_size, int local_flags, int (*callback)(buf_t, void *), void *callback_arg)
2752 {
2753 upl_page_info_t *pl = ubc_upl_pageinfo(upl);
2754 const off_t start_f_offset = f_offset;
2755 const upl_offset_t start_upl_offset = upl_offset;
2756 const int start_pg = upl_offset >> PAGE_SHIFT;
2757 const int last_pg = ((upl_offset + rounded_size) >> PAGE_SHIFT) - 1;
2758 u_int split_io_size = SPLIT_PAGEIN_MAX_IOSIZE;
2759 u_int head_io_size = 0;
2760 int retval = 0;
2761 int error = 0;
2762 int pg;
2763
2764 assert(SPLIT_PAGEIN_MAX_IOSIZE >= (2 * PAGE_SIZE));
2765
2766 for (pg = start_pg; (pg <= last_pg) && !(upl_page_is_needed(pl, pg)); pg++) {
2767 ;
2768 }
2769
2770 /*
2771 * The global variables affecting behaviour
2772 * split_all_pgin -> Split pageins even if we don't find the needed page.
2773 * split_pgin_headio -> for a pagein in which there is a head calculated,
2774 * do the head i/o or not.
2775 *
2776 * split_all_pgin_equal -> split the entire big request into equal sized small i/os of 32K.
2777 *
2778 * Whichever way the i/o is split, the i/o for the needed page always happens first and then we decide
2779 * whether we have to do i/o for the head and then whether we need to issue equal sized i/os.
2780 *
2781 * By default we are set up to do only the i/o for the needed page, followed by an "unsplit" tail.
2782 */
2783 if ((pg > start_pg) && (pg <= last_pg)) {
2784 head_io_size = ((pg - start_pg) * PAGE_SIZE);
2785
2786 if (head_io_size < SPLIT_PAGEIN_MAX_IOSIZE) {
2787 head_io_size = 0;
2788 } else if (!split_all_pgin) {
2789 goto out;
2790 } else if ((rounded_size - head_io_size) <= SPLIT_PAGEIN_MAX_IOSIZE) {
2791 head_io_size = (rounded_size - SPLIT_PAGEIN_MAX_IOSIZE);
2792 } else {
2793 head_io_size &= ~(SPLIT_PAGEIN_MAX_IOSIZE - 1);
2794 }
2795
2796 assertf(io_size > head_io_size, "io_size is %d, head_io_size = %d", io_size, head_io_size);
2797
2798 if (head_io_size) {
2799 upl_offset += head_io_size;
2800 f_offset += head_io_size;
2801 io_size -= head_io_size;
2802
2803 if (!split_pgin_headio) {
2804 if (local_flags & CL_COMMIT) {
2805 ubc_upl_abort_range(upl, start_upl_offset, head_io_size,
2806 UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
2807 }
2808 head_io_size = 0;
2809 }
2810
2811 split_io_size = MIN(SPLIT_PAGEIN_MAX_IOSIZE, io_size);
2812 }
2813
2814 assertf(io_size >= split_io_size, "io_size is %d, split_io_size = %d", io_size, split_io_size);
2815 } else if ((pg > last_pg) && !split_all_pgin) {
2816 goto out;
2817 }
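/*
 * Example with the defaults described above (illustrative values): if the
 * needed page is the first page of a 256K request, the loop above leaves
 * pg == start_pg, no head is carved out, the first cluster_io below covers the
 * initial 32K (including the needed page), and the remaining 224K is issued
 * as a single tail I/O by the while loop that follows.
 */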
2818
2819 /* This is the 32K i/o for the "needed" page */
2820 retval = cluster_io(vp, upl, upl_offset, f_offset, split_io_size,
2821 local_flags | CL_READ | CL_PAGEIN, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
2822
2823 io_size -= split_io_size;
2824
2825 if (io_size) {
2826 upl_offset += split_io_size;
2827 f_offset += split_io_size;
2828 } else if (head_io_size) {
2829 io_size = head_io_size;
2830 head_io_size = 0;
2831 upl_offset = start_upl_offset;
2832 f_offset = start_f_offset;
2833 }
2834
2835 while (io_size) {
2836 if (split_all_pgin_equal && (io_size > SPLIT_PAGEIN_MAX_IOSIZE)) {
2837 split_io_size = SPLIT_PAGEIN_MAX_IOSIZE;
2838 } else {
2839 split_io_size = io_size;
2840 }
2841
2842 assertf(io_size >= split_io_size, "io_size is %d, split_io_size = %d", io_size, split_io_size);
2843
2844 /* We have to issue this i/o anyway even if we get an error from any of the previous ones */
2845 error = cluster_io(vp, upl, upl_offset, f_offset, split_io_size,
2846 local_flags | CL_READ | CL_PAGEIN, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
2847 if (!retval) {
2848 retval = error;
2849 }
2850
2851 io_size -= split_io_size;
2852
2853 if ((io_size == 0) && head_io_size) {
2854 io_size = head_io_size;
2855 head_io_size = 0;
2856 upl_offset = start_upl_offset;
2857 f_offset = start_f_offset;
2858 } else if (io_size) {
2859 upl_offset += split_io_size;
2860 f_offset += split_io_size;
2861 }
2862 }
2863
2864 return retval;
2865 out:
2866 return cluster_io(vp, upl, upl_offset, f_offset, io_size,
2867 local_flags | CL_READ | CL_PAGEIN, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
2868 }
2869
2870 int
2871 cluster_pagein_ext(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset,
2872 int size, off_t filesize, int flags, int (*callback)(buf_t, void *), void *callback_arg)
2873 {
2874 u_int io_size;
2875 int rounded_size;
2876 off_t max_size;
2877 int retval;
2878 int local_flags = 0;
2879
2880 if (upl == NULL || size < 0) {
2881 panic("cluster_pagein: NULL upl passed in");
2882 }
2883
2884 if ((flags & UPL_IOSYNC) == 0) {
2885 local_flags |= CL_ASYNC;
2886 }
2887 if ((flags & UPL_NOCOMMIT) == 0) {
2888 local_flags |= CL_COMMIT;
2889 }
2890 if (flags & UPL_IOSTREAMING) {
2891 local_flags |= CL_IOSTREAMING;
2892 }
2893 if (flags & UPL_PAGING_ENCRYPTED) {
2894 local_flags |= CL_ENCRYPTED;
2895 }
2896
2897
2898 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 56)) | DBG_FUNC_NONE,
2899 (int)f_offset, size, (int)filesize, local_flags, 0);
2900
2901 /*
2902 * can't page-in from a negative offset
2903 * or if we're starting beyond the EOF
2904 * or if the file offset isn't page aligned
2905 * or the size requested isn't a multiple of PAGE_SIZE
2906 */
2907 if (f_offset < 0 || f_offset >= filesize ||
2908 (f_offset & PAGE_MASK_64) || (size & PAGE_MASK) || (upl_offset & PAGE_MASK)) {
2909 if (local_flags & CL_COMMIT) {
2910 ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
2911 }
2912
2913 if (f_offset >= filesize) {
2914 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_CLUSTER, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_CL_PGIN_PAST_EOF), 0 /* arg */);
2915 }
2916
2917 return EINVAL;
2918 }
2919 max_size = filesize - f_offset;
2920
2921 if (size < max_size) {
2922 io_size = size;
2923 } else {
2924 io_size = (int)max_size;
2925 }
2926
2927 rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
2928
2929 if (size > rounded_size && (local_flags & CL_COMMIT)) {
2930 ubc_upl_abort_range(upl, upl_offset + rounded_size,
2931 size - rounded_size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
2932 }
2933
2934 if ((io_size > SPLIT_PAGEIN_MAX_IOSIZE) && vnode_isonssd(vp) && split_pgin) {
2935 return cluster_handle_split_pagein(vp, upl, upl_offset, f_offset, io_size,
2936 rounded_size, local_flags, callback, callback_arg);
2937 }
2938
2939 retval = cluster_io(vp, upl, upl_offset, f_offset, io_size,
2940 local_flags | CL_READ | CL_PAGEIN, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
2941
2942 return retval;
2943 }
2944
2945
2946 int
2947 cluster_bp(buf_t bp)
2948 {
2949 return cluster_bp_ext(bp, NULL, NULL);
2950 }
2951
2952
2953 int
2954 cluster_bp_ext(buf_t bp, int (*callback)(buf_t, void *), void *callback_arg)
2955 {
2956 off_t f_offset;
2957 int flags;
2958
2959 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 19)) | DBG_FUNC_START,
2960 bp, (int)bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);
2961
2962 if (bp->b_flags & B_READ) {
2963 flags = CL_ASYNC | CL_READ;
2964 } else {
2965 flags = CL_ASYNC;
2966 }
2967 if (bp->b_flags & B_PASSIVE) {
2968 flags |= CL_PASSIVE;
2969 }
2970
2971 f_offset = ubc_blktooff(bp->b_vp, bp->b_lblkno);
2972
2973 return cluster_io(bp->b_vp, bp->b_upl, 0, f_offset, bp->b_bcount, flags, bp, (struct clios *)NULL, callback, callback_arg);
2974 }
2975
2976
2977
2978 int
2979 cluster_write(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, off_t headOff, off_t tailOff, int xflags)
2980 {
2981 return cluster_write_ext(vp, uio, oldEOF, newEOF, headOff, tailOff, xflags, NULL, NULL);
2982 }
2983
2984
2985 int
2986 cluster_write_ext(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, off_t headOff, off_t tailOff,
2987 int xflags, int (*callback)(buf_t, void *), void *callback_arg)
2988 {
2989 user_ssize_t cur_resid;
2990 int retval = 0;
2991 int flags;
2992 int zflags;
2993 int bflag;
2994 int write_type = IO_COPY;
2995 u_int32_t write_length = 0, saved_write_length;
2996 uint32_t min_direct_size = MIN_DIRECT_WRITE_SIZE;
2997
2998 flags = xflags;
2999
3000 if (flags & IO_PASSIVE) {
3001 bflag = CL_PASSIVE;
3002 } else {
3003 bflag = 0;
3004 }
3005
3006 if (vp->v_flag & VNOCACHE_DATA) {
3007 flags |= IO_NOCACHE;
3008 bflag |= CL_NOCACHE;
3009 }
3010 if (uio == NULL) {
3011 /*
3012 * no user data...
3013 * this call is being made to zero-fill some range in the file
3014 */
3015 retval = cluster_write_copy(vp, NULL, (u_int32_t)0, oldEOF, newEOF, headOff, tailOff, flags, callback, callback_arg);
3016
3017 return retval;
3018 }
3019 /*
3020 * do a write through the cache if one of the following is true....
3021 * NOCACHE is not true or NODIRECT is true
3022 * the uio request doesn't target USERSPACE
3023 * otherwise, find out if we want the direct or contig variant for
3024 * the first vector in the uio request
3025 */
3026 if (((flags & (IO_NOCACHE | IO_NODIRECT)) == IO_NOCACHE) && UIO_SEG_IS_USER_SPACE(uio->uio_segflg)) {
3027 if (flags & IO_NOCACHE_SWRITE) {
3028 uint32_t fs_bsize = vp->v_mount->mnt_vfsstat.f_bsize;
3029
3030 if (fs_bsize && (fs_bsize < MIN_DIRECT_WRITE_SIZE) &&
3031 ((fs_bsize & (fs_bsize - 1)) == 0)) {
3032 min_direct_size = fs_bsize;
3033 }
3034 }
3035 retval = cluster_io_type(uio, &write_type, &write_length, min_direct_size);
3036 }
3037
3038 if ((flags & (IO_TAILZEROFILL | IO_HEADZEROFILL)) && write_type == IO_DIRECT) {
3039 /*
3040 * must go through the cached variant in this case
3041 */
3042 write_type = IO_COPY;
3043 }
3044
3045 while ((cur_resid = uio_resid(uio)) && uio->uio_offset < newEOF && retval == 0) {
3046 switch (write_type) {
3047 case IO_COPY:
3048 /*
3049 * make sure the uio_resid isn't too big...
3050 * internally, we want to handle all of the I/O in
3051 * chunk sizes that fit in a 32 bit int
3052 */
3053 if (cur_resid > (user_ssize_t)(MAX_IO_REQUEST_SIZE)) {
3054 /*
3055 * we're going to have to call cluster_write_copy
3056 * more than once...
3057 *
3058 * only want the last call to cluster_write_copy to
3059 * have the IO_TAILZEROFILL flag set and only the
3060 * first call should have IO_HEADZEROFILL
3061 */
3062 zflags = flags & ~IO_TAILZEROFILL;
3063 flags &= ~IO_HEADZEROFILL;
3064
3065 write_length = MAX_IO_REQUEST_SIZE;
3066 } else {
3067 /*
3068 * last call to cluster_write_copy
3069 */
3070 zflags = flags;
3071
3072 write_length = (u_int32_t)cur_resid;
3073 }
3074 retval = cluster_write_copy(vp, uio, write_length, oldEOF, newEOF, headOff, tailOff, zflags, callback, callback_arg);
3075 break;
3076
3077 case IO_CONTIG:
3078 zflags = flags & ~(IO_TAILZEROFILL | IO_HEADZEROFILL);
3079
3080 if (flags & IO_HEADZEROFILL) {
3081 /*
3082 * only do this once per request
3083 */
3084 flags &= ~IO_HEADZEROFILL;
3085
3086 retval = cluster_write_copy(vp, (struct uio *)0, (u_int32_t)0, (off_t)0, uio->uio_offset,
3087 headOff, (off_t)0, zflags | IO_HEADZEROFILL | IO_SYNC, callback, callback_arg);
3088 if (retval) {
3089 break;
3090 }
3091 }
3092 retval = cluster_write_contig(vp, uio, newEOF, &write_type, &write_length, callback, callback_arg, bflag);
3093
3094 if (retval == 0 && (flags & IO_TAILZEROFILL) && uio_resid(uio) == 0) {
3095 /*
3096 * we're done with the data from the user specified buffer(s)
3097 * and we've been requested to zero fill at the tail
3098 * treat this as an IO_HEADZEROFILL which doesn't require a uio
3099 * by rearranging the args and passing in IO_HEADZEROFILL
3100 */
3101
3102 /*
3103 * Update the oldEOF to reflect the current EOF. If the UPL page
3104 * to zero-fill is not valid (when F_NOCACHE is set), the
3105 * cluster_write_copy() will perform RMW on the UPL page when
3106 * the oldEOF is not aligned on page boundary due to unaligned
3107 * write.
3108 */
3109 if (uio->uio_offset > oldEOF) {
3110 oldEOF = uio->uio_offset;
3111 }
3112 retval = cluster_write_copy(vp, (struct uio *)0, (u_int32_t)0, (off_t)oldEOF, tailOff, uio->uio_offset,
3113 (off_t)0, zflags | IO_HEADZEROFILL | IO_SYNC, callback, callback_arg);
3114 }
3115 break;
3116
3117 case IO_DIRECT:
3118 /*
3119 * cluster_write_direct is never called with IO_TAILZEROFILL || IO_HEADZEROFILL
3120 */
3121 saved_write_length = write_length;
3122 retval = cluster_write_direct(vp, uio, oldEOF, newEOF, &write_type, &write_length, flags, callback, callback_arg, min_direct_size);
3123 if (retval == ENOTSUP) {
3124 /* direct I/O didn't work; retry with cached I/O */
3125 // printf("******* FBDP %s:%d ENOTSUP cnt %d resid 0x%llx offset 0x%llx write_length 0x%x -> 0x%x\n", __FUNCTION__, __LINE__, uio_iovcnt(uio), (uint64_t) uio_resid(uio), uio_offset(uio), write_length, saved_write_length);
3126 write_length = saved_write_length;
3127 write_type = IO_COPY;
3128 retval = 0;
3129 }
3130 break;
3131
3132 case IO_UNKNOWN:
3133 retval = cluster_io_type(uio, &write_type, &write_length, min_direct_size);
3134 break;
3135 }
3136 /*
3137 * in case we end up calling cluster_write_copy (from cluster_write_direct)
3138 * multiple times to service a multi-vector request that is not aligned properly
3139 * we need to update the oldEOF so that we
3140 * don't zero-fill the head of a page if we've successfully written
3141 * data to that area... 'cluster_write_copy' will zero-fill the head of a
3142 * page that is beyond the oldEOF if the write is unaligned... we only
3143 * want that to happen for the very first page of the cluster_write,
3144 * NOT the first page of each vector making up a multi-vector write.
3145 */
3146 if (uio->uio_offset > oldEOF) {
3147 oldEOF = uio->uio_offset;
3148 }
3149 }
3150 return retval;
3151 }
3152
3153
3154 static int
3155 cluster_write_direct(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, int *write_type, u_int32_t *write_length,
3156 int flags, int (*callback)(buf_t, void *), void *callback_arg, uint32_t min_io_size)
3157 {
3158 upl_t upl = NULL;
3159 upl_page_info_t *pl;
3160 vm_offset_t upl_offset;
3161 vm_offset_t vector_upl_offset = 0;
3162 u_int32_t io_req_size;
3163 u_int32_t offset_in_file;
3164 u_int32_t offset_in_iovbase;
3165 u_int32_t io_size;
3166 int io_flag = 0;
3167 upl_size_t upl_size = 0, vector_upl_size = 0;
3168 vm_size_t upl_needed_size;
3169 mach_msg_type_number_t pages_in_pl = 0;
3170 upl_control_flags_t upl_flags;
3171 kern_return_t kret = KERN_SUCCESS;
3172 mach_msg_type_number_t i = 0;
3173 int force_data_sync;
3174 int retval = 0;
3175 int first_IO = 1;
3176 struct clios iostate;
3177 user_addr_t iov_base;
3178 u_int32_t mem_alignment_mask;
3179 u_int32_t devblocksize;
3180 u_int32_t max_io_size;
3181 u_int32_t max_upl_size;
3182 u_int32_t max_vector_size;
3183 u_int32_t bytes_outstanding_limit;
3184 boolean_t io_throttled = FALSE;
3185
3186 u_int32_t vector_upl_iosize = 0;
3187 int issueVectorUPL = 0, useVectorUPL = (uio->uio_iovcnt > 1);
3188 off_t v_upl_uio_offset = 0;
3189 int vector_upl_index = 0;
3190 upl_t vector_upl = NULL;
3191 uio_t snapshot_uio = NULL;
3192
3193 uint32_t io_align_mask;
3194
3195 /*
3196 * When we enter this routine, we know
3197 * -- the resid will not exceed iov_len
3198 */
3199 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_START,
3200 (int)uio->uio_offset, *write_length, (int)newEOF, 0, 0);
3201
3202 assert(vm_map_page_shift(current_map()) >= PAGE_SHIFT);
3203
3204 max_upl_size = cluster_max_io_size(vp->v_mount, CL_WRITE);
3205
3206 io_flag = CL_ASYNC | CL_PRESERVE | CL_COMMIT | CL_THROTTLE | CL_DIRECT_IO;
3207
3208 if (flags & IO_PASSIVE) {
3209 io_flag |= CL_PASSIVE;
3210 }
3211
3212 if (flags & IO_NOCACHE) {
3213 io_flag |= CL_NOCACHE;
3214 }
3215
3216 if (flags & IO_SKIP_ENCRYPTION) {
3217 io_flag |= CL_ENCRYPTED;
3218 }
3219
3220 iostate.io_completed = 0;
3221 iostate.io_issued = 0;
3222 iostate.io_error = 0;
3223 iostate.io_wanted = 0;
3224
3225 lck_mtx_init(&iostate.io_mtxp, &cl_mtx_grp, LCK_ATTR_NULL);
3226
3227 mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask;
3228 devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize;
3229
3230 if (devblocksize == 1) {
3231 /*
3232 * the AFP client advertises a devblocksize of 1
3233 * however, its BLOCKMAP routine maps to physical
3234 * blocks that are PAGE_SIZE in size...
3235 * therefore we can't ask for I/Os that aren't page aligned
3236 * or aren't multiples of PAGE_SIZE in size
3237 * by setting devblocksize to PAGE_SIZE, we re-instate
3238 * the old behavior we had before the mem_alignment_mask
3239 * changes went in...
3240 */
3241 devblocksize = PAGE_SIZE;
3242 }
3243
3244 io_align_mask = PAGE_MASK;
3245 if (min_io_size < MIN_DIRECT_WRITE_SIZE) {
3246 /* The process has opted into fs blocksize direct io writes */
3247 assert((min_io_size & (min_io_size - 1)) == 0);
3248 io_align_mask = min_io_size - 1;
3249 io_flag |= CL_DIRECT_IO_FSBLKSZ;
3250 }
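/*
 * Example (illustrative values, assuming MIN_DIRECT_WRITE_SIZE is larger than
 * the filesystem block size): with an fs block size of 1024, min_io_size = 1024
 * gives io_align_mask = 0x3ff, so the file offset and length of a direct write
 * only need 1K alignment instead of the usual page alignment.
 */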
3251
3252 if (uio_iovcnt(uio) > 1) {
3253 /* vector uio -> take a snapshot so we can rollback if needed */
3254 if (snapshot_uio) {
3255 uio_free(snapshot_uio);
3256 snapshot_uio = NULL;
3257 }
3258 snapshot_uio = uio_duplicate(uio);
3259 }
3260
3261 next_dwrite:
3262 io_req_size = *write_length;
3263 iov_base = uio_curriovbase(uio);
3264
3265 offset_in_file = (u_int32_t)(uio->uio_offset & io_align_mask);
3266 offset_in_iovbase = (u_int32_t)(iov_base & mem_alignment_mask);
3267
3268 if (offset_in_file || offset_in_iovbase) {
3269 /*
3270 * one of the 2 important offsets is misaligned
3271 * so fire an I/O through the cache for this entire vector
3272 */
3273 goto wait_for_dwrites;
3274 }
3275 if (iov_base & (devblocksize - 1)) {
3276 /*
3277 * the offset in memory must be on a device block boundary
3278 * so that we can guarantee that we can generate an
3279 * I/O that ends on a page boundary in cluster_io
3280 */
3281 goto wait_for_dwrites;
3282 }
3283
3284 task_update_logical_writes(current_task(), (io_req_size & ~PAGE_MASK), TASK_WRITE_IMMEDIATE, vp);
3285 while ((io_req_size >= PAGE_SIZE || io_req_size >= min_io_size) && uio->uio_offset < newEOF && retval == 0) {
3286 int throttle_type;
3287
3288 if ((throttle_type = cluster_is_throttled(vp))) {
3289 uint32_t max_throttle_size = calculate_max_throttle_size(vp);
3290
3291 /*
3292 * we're in the throttle window, at the very least
3293 * we want to limit the size of the I/O we're about
3294 * to issue
3295 */
3296 if ((flags & IO_RETURN_ON_THROTTLE) && throttle_type == THROTTLE_NOW) {
3297 /*
3298 * we're in the throttle window and at least 1 I/O
3299 * has already been issued by a throttleable thread
3300 * in this window, so return with EAGAIN to indicate
3301 * to the FS issuing the cluster_write call that it
3302 * should now throttle after dropping any locks
3303 */
3304 throttle_info_update_by_mount(vp->v_mount);
3305
3306 io_throttled = TRUE;
3307 goto wait_for_dwrites;
3308 }
3309 max_vector_size = max_throttle_size;
3310 max_io_size = max_throttle_size;
3311 } else {
3312 max_vector_size = MAX_VECTOR_UPL_SIZE;
3313 max_io_size = max_upl_size;
3314 }
3315
3316 if (first_IO) {
3317 cluster_syncup(vp, newEOF, callback, callback_arg, callback ? PUSH_SYNC : 0);
3318 first_IO = 0;
3319 }
3320 io_size = io_req_size & ~io_align_mask;
3321 iov_base = uio_curriovbase(uio);
3322
3323 if (io_size > max_io_size) {
3324 io_size = max_io_size;
3325 }
3326
3327 if (useVectorUPL && (iov_base & PAGE_MASK)) {
3328 /*
3329 * We have an iov_base that's not page-aligned.
3330 * Issue all I/O's that have been collected within
3331 * this Vectored UPL.
3332 */
3333 if (vector_upl_index) {
3334 retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
3335 if (retval == ENOTSUP) {
3336 goto enotsup;
3337 }
3338 reset_vector_run_state();
3339 }
3340
3341 /*
3342 * After this point, if we are using the Vector UPL path and the base is
3343 * not page-aligned then the UPL with that base will be the first in the vector UPL.
3344 */
3345 }
3346
3347 upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK);
3348 upl_needed_size = (upl_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
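/*
 * Example of the UPL sizing above (illustrative values): for
 * iov_base = 0x7f8a14001200 and io_size = 0x8000, upl_offset = 0x200 and
 * upl_needed_size = (0x200 + 0x8000 + 0xfff) & ~PAGE_MASK = 0x9000, enough to
 * wire every page the user buffer touches.
 */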
3349
3350 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_START,
3351 (int)upl_offset, upl_needed_size, (int)iov_base, io_size, 0);
3352
3353 vm_map_t map = UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ? current_map() : kernel_map;
3354 for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) {
3355 pages_in_pl = 0;
3356 upl_size = (upl_size_t)upl_needed_size;
3357 upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
3358 UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
3359
3360 kret = vm_map_get_upl(map,
3361 #if HAS_MTE || HAS_MTE_EMULATION_SHIMS
3362 vm_memtag_canonicalize(map, (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK))),
3363 #else /* HAS_MTE || HAS_MTE_EMULATION_SHIMS */
3364 (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
3365 #endif /* HAS_MTE || HAS_MTE_EMULATION_SHIMS */
3366 &upl_size,
3367 &upl,
3368 NULL,
3369 &pages_in_pl,
3370 &upl_flags,
3371 VM_KERN_MEMORY_FILE,
3372 force_data_sync);
3373
3374 if (kret != KERN_SUCCESS) {
3375 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
3376 0, 0, 0, kret, 0);
3377 /*
3378 * failed to get pagelist
3379 *
3380 * we may have already spun some portion of this request
3381 * off as async requests... we need to wait for the I/O
3382 * to complete before returning
3383 */
3384 goto wait_for_dwrites;
3385 }
3386 pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
3387 pages_in_pl = upl_size / PAGE_SIZE;
3388
3389 for (i = 0; i < pages_in_pl; i++) {
3390 if (!upl_valid_page(pl, i)) {
3391 break;
3392 }
3393 }
3394 if (i == pages_in_pl) {
3395 break;
3396 }
3397
3398 /*
3399 * didn't get all the pages back that we
3400 * needed... release this upl and try again
3401 */
3402 ubc_upl_abort(upl, 0);
3403 }
3404 if (force_data_sync >= 3) {
3405 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
3406 i, pages_in_pl, upl_size, kret, 0);
3407 /*
3408 * for some reason, we couldn't acquire a hold on all
3409 * the pages needed in the user's address space
3410 *
3411 * we may have already spun some portion of this request
3412 * off as async requests... we need to wait for the I/O
3413 * to complete before returning
3414 */
3415 goto wait_for_dwrites;
3416 }
3417
3418 /*
3419 * Consider the possibility that upl_size wasn't satisfied.
3420 */
3421 if (upl_size < upl_needed_size) {
3422 if (upl_size && upl_offset == 0) {
3423 io_size = upl_size;
3424 } else {
3425 io_size = 0;
3426 }
3427 }
3428 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
3429 (int)upl_offset, upl_size, (int)iov_base, io_size, 0);
3430
3431 if (io_size == 0) {
3432 ubc_upl_abort(upl, 0);
3433 upl = NULL;
3434 /*
3435 * we may have already spun some portion of this request
3436 * off as async requests... we need to wait for the I/O
3437 * to complete before returning
3438 */
3439 goto wait_for_dwrites;
3440 }
3441
3442 if (useVectorUPL) {
3443 vm_offset_t end_off = ((iov_base + io_size) & PAGE_MASK);
3444 if (end_off) {
3445 issueVectorUPL = 1;
3446 }
3447 /*
3448 * After this point, if we are using a vector UPL, then
3449 * either all the UPL elements end on a page boundary OR
3450 * this UPL is the last element because it does not end
3451 * on a page boundary.
3452 */
3453 }
3454
3455 /*
3456 * we want to push out these writes asynchronously so that we can overlap
3457 * the preparation of the next I/O
3458 * if there are already too many outstanding writes
3459 * wait until some complete before issuing the next
3460 */
3461 if (vp->v_mount->mnt_minsaturationbytecount) {
3462 bytes_outstanding_limit = vp->v_mount->mnt_minsaturationbytecount;
3463 } else {
3464 if (__improbable(os_mul_overflow(max_upl_size, IO_SCALE(vp, 2),
3465 &bytes_outstanding_limit) ||
3466 (bytes_outstanding_limit > overlapping_write_max))) {
3467 bytes_outstanding_limit = overlapping_write_max;
3468 }
3469 }
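/*
 * e.g. with max_upl_size == 1 MB and IO_SCALE(vp, 2) evaluating to 2
 * (hypothetical values), up to 2 MB of direct writes may be left in
 * flight before we block below, subject to the overlapping_write_max
 * cap; a mount that sets mnt_minsaturationbytecount overrides this
 * limit outright.
 */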
3470
3471 cluster_iostate_wait(&iostate, bytes_outstanding_limit, "cluster_write_direct");
3472
3473 if (iostate.io_error) {
3474 /*
3475 * one of the earlier writes we issued ran into a hard error
3476 * don't issue any more writes, cleanup the UPL
3477 * that was just created but not used, then
3478 * go wait for all writes that are part of this stream
3479 * to complete before returning the error to the caller
3480 */
3481 ubc_upl_abort(upl, 0);
3482 upl = NULL;
3483
3484 goto wait_for_dwrites;
3485 }
3486
3487 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_START,
3488 (int)upl_offset, (int)uio->uio_offset, io_size, io_flag, 0);
3489
3490 if (!useVectorUPL) {
3491 retval = cluster_io(vp, upl, upl_offset, uio->uio_offset,
3492 io_size, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
3493 } else {
3494 if (!vector_upl_index) {
3495 vector_upl = vector_upl_create(upl_offset, uio->uio_iovcnt);
3496 v_upl_uio_offset = uio->uio_offset;
3497 vector_upl_offset = upl_offset;
3498 }
3499
3500 vector_upl_set_subupl(vector_upl, upl, upl_size);
3501 vector_upl_set_iostate(vector_upl, upl, vector_upl_size, upl_size);
3502 vector_upl_index++;
3503 vector_upl_iosize += io_size;
3504 vector_upl_size += upl_size;
3505
3506 if (issueVectorUPL || vector_upl_index == vector_upl_max_upls(vector_upl) || vector_upl_size >= max_vector_size) {
3507 retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
3508 if (retval != ENOTSUP) {
3509 reset_vector_run_state();
3510 }
3511 }
3512 }
3513 if (retval == ENOTSUP) {
3514 enotsup:
3515 /*
3516 * Can't do direct I/O. Try again with cached I/O.
3517 */
3518 // printf("******* FBDP %s:%d ENOTSUP io_size 0%x resid 0x%llx\n", __FUNCTION__, __LINE__, io_size, uio_resid(uio));
3519 io_size = 0;
3520 if (snapshot_uio) {
3521 int restore_error;
3522
3523 /*
3524 * We've been collecting UPLs for this vector UPL and
3525 * moving the uio along. We need to undo that so that
3526 * the I/O can continue where it actually stopped...
3527 */
3528 restore_error = uio_restore(uio, snapshot_uio);
3529 assert(!restore_error);
3530 uio_free(snapshot_uio);
3531 snapshot_uio = NULL;
3532 }
3533 if (vector_upl_index) {
3534 ubc_upl_abort(vector_upl, 0);
3535 vector_upl = NULL;
3536 } else {
3537 ubc_upl_abort(upl, 0);
3538 upl = NULL;
3539 }
3540 goto wait_for_dwrites;
3541 }
3542
3543 /*
3544 * update the uio structure to
3545 * reflect the I/O that we just issued
3546 */
3547 uio_update(uio, (user_size_t)io_size);
3548
3549 /*
3550 * in case we end up calling through to cluster_write_copy to finish
3551 * the tail of this request, we need to update the oldEOF so that we
3552 * don't zero-fill the head of a page if we've successfully written
3553 * data to that area... 'cluster_write_copy' will zero-fill the head of a
3554 * page that is beyond the oldEOF if the write is unaligned... we only
3555 * want that to happen for the very first page of the cluster_write,
3556 * NOT the first page of each vector making up a multi-vector write.
3557 */
3558 if (uio->uio_offset > oldEOF) {
3559 oldEOF = uio->uio_offset;
3560 }
3561
3562 io_req_size -= io_size;
3563
3564 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_END,
3565 (int)upl_offset, (int)uio->uio_offset, io_req_size, retval, 0);
3566 } /* end while */
3567
3568 if (retval == 0 && iostate.io_error == 0 && io_req_size == 0) {
3569 retval = cluster_io_type(uio, write_type, write_length, min_io_size);
3570
3571 if (retval == 0 && *write_type == IO_DIRECT) {
3572 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_NONE,
3573 (int)uio->uio_offset, *write_length, (int)newEOF, 0, 0);
3574
3575 goto next_dwrite;
3576 }
3577 }
3578
3579 wait_for_dwrites:
3580
3581 if (retval == 0 && iostate.io_error == 0 && useVectorUPL && vector_upl_index) {
3582 retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
3583 reset_vector_run_state();
3584 }
3585 /*
3586 * make sure all async writes issued as part of this stream
3587 * have completed before we return
3588 */
3589 cluster_iostate_wait(&iostate, 0, "cluster_write_direct");
3590
3591 if (iostate.io_error) {
3592 retval = iostate.io_error;
3593 }
3594
3595 lck_mtx_destroy(&iostate.io_mtxp, &cl_mtx_grp);
3596
3597 if (io_throttled == TRUE && retval == 0) {
3598 retval = EAGAIN;
3599 }
3600
3601 if (io_req_size && retval == 0) {
3602 /*
3603 * we couldn't handle the tail of this request in DIRECT mode
3604 * so fire it through the copy path
3605 *
3606 * note that flags will never have IO_HEADZEROFILL or IO_TAILZEROFILL set
3607 * so we can just pass 0 in for the headOff and tailOff
3608 */
3609 if (uio->uio_offset > oldEOF) {
3610 oldEOF = uio->uio_offset;
3611 }
3612
3613 retval = cluster_write_copy(vp, uio, io_req_size, oldEOF, newEOF, (off_t)0, (off_t)0, flags, callback, callback_arg);
3614
3615 *write_type = IO_UNKNOWN;
3616 }
3617
3618 if (snapshot_uio) {
3619 uio_free(snapshot_uio);
3620 snapshot_uio = NULL;
3621 }
3622
3623 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_END,
3624 (int)uio->uio_offset, io_req_size, retval, 4, 0);
3625
3626 return retval;
3627 }
3628
3629
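/*
 * cluster_write_contig handles the IO_CONTIG case: the source of the
 * write is a physically contiguous region (typically wired or device
 * memory identified via cluster_io_type).  Roughly:
 *   - any head fragment up to the next device-block boundary is written
 *     synchronously via cluster_align_phys_io
 *   - the device-block-aligned middle is issued asynchronously in chunks
 *     of at most MAX_IO_CONTIG_SIZE with CL_DEV_MEMORY set
 *   - any tail fragment smaller than devblocksize is handled by
 *     cluster_align_phys_io after the async I/Os have drained
 */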
3630 static int
3631 cluster_write_contig(vnode_t vp, struct uio *uio, off_t newEOF, int *write_type, u_int32_t *write_length,
3632 int (*callback)(buf_t, void *), void *callback_arg, int bflag)
3633 {
3634 upl_page_info_t *pl;
3635 addr64_t src_paddr = 0;
3636 upl_t upl[MAX_VECTS];
3637 vm_offset_t upl_offset;
3638 u_int32_t tail_size = 0;
3639 u_int32_t io_size;
3640 u_int32_t xsize;
3641 upl_size_t upl_size;
3642 vm_size_t upl_needed_size;
3643 mach_msg_type_number_t pages_in_pl;
3644 upl_control_flags_t upl_flags;
3645 kern_return_t kret;
3646 struct clios iostate;
3647 int error = 0;
3648 int cur_upl = 0;
3649 int num_upl = 0;
3650 int n;
3651 user_addr_t iov_base;
3652 u_int32_t devblocksize;
3653 u_int32_t mem_alignment_mask;
3654
3655 /*
3656 * When we enter this routine, we know
3657 * -- the io_req_size will not exceed iov_len
3658 * -- the target address is physically contiguous
3659 */
3660 cluster_syncup(vp, newEOF, callback, callback_arg, callback ? PUSH_SYNC : 0);
3661
3662 devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize;
3663 mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask;
3664
3665 iostate.io_completed = 0;
3666 iostate.io_issued = 0;
3667 iostate.io_error = 0;
3668 iostate.io_wanted = 0;
3669
3670 lck_mtx_init(&iostate.io_mtxp, &cl_mtx_grp, LCK_ATTR_NULL);
3671
3672 next_cwrite:
3673 io_size = *write_length;
3674
3675 iov_base = uio_curriovbase(uio);
3676
3677 upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK);
3678 upl_needed_size = upl_offset + io_size;
3679
3680 pages_in_pl = 0;
3681 upl_size = (upl_size_t)upl_needed_size;
3682 upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
3683 UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
3684
3685 vm_map_t map = UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ? current_map() : kernel_map;
3686 kret = vm_map_get_upl(map,
3687 #if HAS_MTE || HAS_MTE_EMULATION_SHIMS
3688 vm_memtag_canonicalize(map, vm_map_trunc_page(iov_base, vm_map_page_mask(map))),
3689 #else /* HAS_MTE || HAS_MTE_EMULATION_SHIMS */
3690 vm_map_trunc_page(iov_base, vm_map_page_mask(map)),
3691 #endif /* HAS_MTE || HAS_MTE_EMULATION_SHIMS */
3692 &upl_size, &upl[cur_upl], NULL, &pages_in_pl, &upl_flags, VM_KERN_MEMORY_FILE, 0);
3693
3694 if (kret != KERN_SUCCESS) {
3695 /*
3696 * failed to get pagelist
3697 */
3698 error = EINVAL;
3699 goto wait_for_cwrites;
3700 }
3701 num_upl++;
3702
3703 if (!(upl_flags & UPL_PHYS_CONTIG)) {
3704 /*
3705 * The created UPL needs to have the UPL_PHYS_CONTIG flag.
3706 */
3707 error = EINVAL;
3708 goto wait_for_cwrites;
3709 }
3710
3711 /*
3712 * Consider the possibility that upl_size wasn't satisfied.
3713 */
3714 if (upl_size < upl_needed_size) {
3715 /*
3716 * This is a failure in the physical memory case.
3717 */
3718 error = EINVAL;
3719 goto wait_for_cwrites;
3720 }
3721 pl = ubc_upl_pageinfo(upl[cur_upl]);
3722
3723 src_paddr = ((addr64_t)upl_phys_page(pl, 0) << PAGE_SHIFT) + (addr64_t)upl_offset;
3724
3725 while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {
3726 u_int32_t head_size;
3727
3728 head_size = devblocksize - (u_int32_t)(uio->uio_offset & (devblocksize - 1));
3729
3730 if (head_size > io_size) {
3731 head_size = io_size;
3732 }
3733
3734 error = cluster_align_phys_io(vp, uio, src_paddr, head_size, 0, callback, callback_arg);
3735
3736 if (error) {
3737 goto wait_for_cwrites;
3738 }
3739
3740 upl_offset += head_size;
3741 src_paddr += head_size;
3742 io_size -= head_size;
3743
3744 iov_base += head_size;
3745 }
3746 if ((u_int32_t)iov_base & mem_alignment_mask) {
3747 /*
3748 * the request isn't aligned on a memory boundary
3749 * that the underlying DMA engine can handle...
3750 * return an error instead of going through
3751 * the slow copy path since the intent of this
3752 * path is direct I/O from device memory
3753 */
3754 error = EINVAL;
3755 goto wait_for_cwrites;
3756 }
3757
3758 tail_size = io_size & (devblocksize - 1);
3759 io_size -= tail_size;
3760
3761 while (io_size && error == 0) {
3762 if (io_size > MAX_IO_CONTIG_SIZE) {
3763 xsize = MAX_IO_CONTIG_SIZE;
3764 } else {
3765 xsize = io_size;
3766 }
3767 /*
3768 * issue the request asynchronously so that we can overlap
3769 * the preparation of the next I/O... we'll do
3770 * the commit after all the I/O has completed
3771 * since it's all issued against the same UPL
3772 * if there are already too many outstanding writes
3773 * wait until some have completed before issuing the next
3774 */
3775 cluster_iostate_wait(&iostate, MAX_IO_CONTIG_SIZE * IO_SCALE(vp, 2), "cluster_write_contig");
3776
3777 if (iostate.io_error) {
3778 /*
3779 * one of the earlier writes we issued ran into a hard error
3780 * don't issue any more writes...
3781 * go wait for all writes that are part of this stream
3782 * to complete before returning the error to the caller
3783 */
3784 goto wait_for_cwrites;
3785 }
3786 /*
3787 * issue an asynchronous write to cluster_io
3788 */
3789 error = cluster_io(vp, upl[cur_upl], upl_offset, uio->uio_offset,
3790 xsize, CL_DEV_MEMORY | CL_ASYNC | bflag, (buf_t)NULL, (struct clios *)&iostate, callback, callback_arg);
3791
3792 if (error == 0) {
3793 /*
3794 * The cluster_io write completed successfully,
3795 * update the uio structure
3796 */
3797 uio_update(uio, (user_size_t)xsize);
3798
3799 upl_offset += xsize;
3800 src_paddr += xsize;
3801 io_size -= xsize;
3802 }
3803 }
3804 if (error == 0 && iostate.io_error == 0 && tail_size == 0 && num_upl < MAX_VECTS) {
3805 error = cluster_io_type(uio, write_type, write_length, 0);
3806
3807 if (error == 0 && *write_type == IO_CONTIG) {
3808 cur_upl++;
3809 goto next_cwrite;
3810 }
3811 } else {
3812 *write_type = IO_UNKNOWN;
3813 }
3814
3815 wait_for_cwrites:
3816 /*
3817 * make sure all async writes that are part of this stream
3818 * have completed before we proceed
3819 */
3820 cluster_iostate_wait(&iostate, 0, "cluster_write_contig");
3821
3822 if (iostate.io_error) {
3823 error = iostate.io_error;
3824 }
3825
3826 lck_mtx_destroy(&iostate.io_mtxp, &cl_mtx_grp);
3827
3828 if (error == 0 && tail_size) {
3829 error = cluster_align_phys_io(vp, uio, src_paddr, tail_size, 0, callback, callback_arg);
3830 }
3831
3832 for (n = 0; n < num_upl; n++) {
3833 /*
3834 * just release our hold on each physically contiguous
3835 * region without changing any state
3836 */
3837 ubc_upl_abort(upl[n], 0);
3838 }
3839
3840 return error;
3841 }
3842
3843
3844 /*
3845 * need to avoid a race between an msync of a range of pages dirtied via mmap
3846 * vs a filesystem such as HFS deciding to write a 'hole' to disk via cluster_write's
3847 * zerofill mechanism before it has seen the VNOP_PAGEOUTs for the pages being msync'd
3848 *
3849 * we should never force-zero-fill pages that are already valid in the cache...
3850 * the entire page contains valid data (either from disk, zero-filled or dirtied
3851 * via an mmap) so we can only do damage by trying to zero-fill
3852 *
3853 */
3854 static int
3855 cluster_zero_range(upl_t upl, upl_page_info_t *pl, int flags, int io_offset, off_t zero_off, off_t upl_f_offset, int bytes_to_zero)
3856 {
3857 int zero_pg_index;
3858 boolean_t need_cluster_zero = TRUE;
3859
3860 if ((flags & (IO_NOZEROVALID | IO_NOZERODIRTY))) {
3861 bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off & PAGE_MASK_64));
3862 zero_pg_index = (int)((zero_off - upl_f_offset) / PAGE_SIZE_64);
3863
3864 if (upl_valid_page(pl, zero_pg_index)) {
3865 /*
3866 * never force zero valid pages - dirty or clean
3867 * we'll leave these in the UPL for cluster_write_copy to deal with
3868 */
3869 need_cluster_zero = FALSE;
3870 }
3871 }
3872 if (need_cluster_zero == TRUE) {
3873 cluster_zero(upl, io_offset, bytes_to_zero, NULL);
3874 }
3875
3876 return bytes_to_zero;
3877 }
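/*
 * Example of the clamping above (hypothetical values, 4 KB pages):
 * with zero_off sitting 0x1e00 bytes past the page-aligned upl_f_offset
 * and bytes_to_zero == 0x600, IO_NOZEROVALID/IO_NOZERODIRTY limits the
 * zeroing to PAGE_SIZE - 0xe00 == 0x200 bytes so it never crosses into
 * the next page, and zero_pg_index == 1 selects that page's
 * upl_valid_page() state to decide whether to zero at all.
 */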
3878
3879
3880 void
3881 cluster_update_state(vnode_t vp, vm_object_offset_t s_offset, vm_object_offset_t e_offset, boolean_t vm_initiated)
3882 {
3883 struct cl_extent cl;
3884 boolean_t first_pass = TRUE;
3885
3886 assert(s_offset < e_offset);
3887 assert((s_offset & PAGE_MASK_64) == 0);
3888 assert((e_offset & PAGE_MASK_64) == 0);
3889
3890 cl.b_addr = (daddr64_t)(s_offset / PAGE_SIZE_64);
3891 cl.e_addr = (daddr64_t)(e_offset / PAGE_SIZE_64);
3892
3893 cluster_update_state_internal(vp, &cl, 0, TRUE, &first_pass, s_offset, (int)(e_offset - s_offset),
3894 vp->v_un.vu_ubcinfo->ui_size, NULL, NULL, vm_initiated);
3895 }
3896
3897
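/*
 * A worked example of the merge logic below (hypothetical values,
 * assuming 4 KB pages and MAX_CLUSTER_SIZE(vp) == 1 MB, so
 * max_cluster_pgcount == 256): an existing cluster covering file pages
 * [1000, 1100) absorbs a new write for pages [1080, 1200) entirely and
 * grows to [1000, 1200); a write for pages [1100, 1400) only has its
 * head absorbed: the cluster is stretched to its limit [1000, 1256)
 * and the write is re-described as [1256, 1400) before the remaining
 * clusters are examined.
 */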
3898 static void
3899 cluster_update_state_internal(vnode_t vp, struct cl_extent *cl, int flags, boolean_t defer_writes,
3900 boolean_t *first_pass, off_t write_off, int write_cnt, off_t newEOF,
3901 int (*callback)(buf_t, void *), void *callback_arg, boolean_t vm_initiated)
3902 {
3903 struct cl_writebehind *wbp;
3904 int cl_index;
3905 int ret_cluster_try_push;
3906 u_int max_cluster_pgcount;
3907
3908
3909 max_cluster_pgcount = MAX_CLUSTER_SIZE(vp) / PAGE_SIZE;
3910
3911 /*
3912 * take the lock to protect our accesses
3913 * of the writebehind and sparse cluster state
3914 */
3915 wbp = cluster_get_wbp(vp, CLW_ALLOCATE | CLW_RETURNLOCKED);
3916
3917 if (wbp->cl_scmap) {
3918 if (!(flags & IO_NOCACHE)) {
3919 /*
3920 * we've fallen into the sparse
3921 * cluster method of delaying dirty pages
3922 */
3923 sparse_cluster_add(wbp, &(wbp->cl_scmap), vp, cl, newEOF, callback, callback_arg, vm_initiated);
3924
3925 lck_mtx_unlock(&wbp->cl_lockw);
3926 return;
3927 }
3928 /*
3929 * must have done cached writes that fell into
3930 * the sparse cluster mechanism... we've switched
3931 * to uncached writes on the file, so go ahead
3932 * and push whatever's in the sparse map
3933 * and switch back to normal clustering
3934 */
3935 wbp->cl_number = 0;
3936
3937 sparse_cluster_push(wbp, &(wbp->cl_scmap), vp, newEOF, PUSH_ALL, 0, callback, callback_arg, vm_initiated);
3938 /*
3939 * no clusters of either type present at this point
3940 * so just go directly to start_new_cluster since
3941 * we know we need to delay this I/O since we've
3942 * already released the pages back into the cache
3943 * to avoid the deadlock with sparse_cluster_push
3944 */
3945 goto start_new_cluster;
3946 }
3947 if (*first_pass == TRUE) {
3948 if (write_off == wbp->cl_last_write) {
3949 wbp->cl_seq_written += write_cnt;
3950 } else {
3951 wbp->cl_seq_written = write_cnt;
3952 }
3953
3954 wbp->cl_last_write = write_off + write_cnt;
3955
3956 *first_pass = FALSE;
3957 }
3958 if (wbp->cl_number == 0) {
3959 /*
3960 * no clusters currently present
3961 */
3962 goto start_new_cluster;
3963 }
3964
3965 for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
3966 /*
3967 * check each cluster that we currently hold
3968 * try to merge some or all of this write into
3969 * one or more of the existing clusters... if
3970 * any portion of the write remains, start a
3971 * new cluster
3972 */
3973 if (cl->b_addr >= wbp->cl_clusters[cl_index].b_addr) {
3974 /*
3975 * the current write starts at or after the current cluster
3976 */
3977 if (cl->e_addr <= (wbp->cl_clusters[cl_index].b_addr + max_cluster_pgcount)) {
3978 /*
3979 * we have a write that fits entirely
3980 * within the existing cluster limits
3981 */
3982 if (cl->e_addr > wbp->cl_clusters[cl_index].e_addr) {
3983 /*
3984 * update our idea of where the cluster ends
3985 */
3986 wbp->cl_clusters[cl_index].e_addr = cl->e_addr;
3987 }
3988 break;
3989 }
3990 if (cl->b_addr < (wbp->cl_clusters[cl_index].b_addr + max_cluster_pgcount)) {
3991 /*
3992 * we have a write that starts in the middle of the current cluster
3993 * but extends beyond the cluster's limit... we know this because
3994 * of the previous checks
3995 * we'll extend the current cluster to the max
3996 * and update the b_addr for the current write to reflect that
3997 * the head of it was absorbed into this cluster...
3998 * note that we'll always have a leftover tail in this case since
3999 * full absorption would have occurred in the clause above
4000 */
4001 wbp->cl_clusters[cl_index].e_addr = wbp->cl_clusters[cl_index].b_addr + max_cluster_pgcount;
4002
4003 cl->b_addr = wbp->cl_clusters[cl_index].e_addr;
4004 }
4005 /*
4006 * we come here for the case where the current write starts
4007 * beyond the limit of the existing cluster or we have a leftover
4008 * tail after a partial absorption
4009 *
4010 * in either case, we'll check the remaining clusters before
4011 * starting a new one
4012 */
4013 } else {
4014 /*
4015 * the current write starts in front of the cluster we're currently considering
4016 */
4017 if ((wbp->cl_clusters[cl_index].e_addr - cl->b_addr) <= max_cluster_pgcount) {
4018 /*
4019 * we can just merge the new request into
4020 * this cluster and leave it in the cache
4021 * since the resulting cluster is still
4022 * less than the maximum allowable size
4023 */
4024 wbp->cl_clusters[cl_index].b_addr = cl->b_addr;
4025
4026 if (cl->e_addr > wbp->cl_clusters[cl_index].e_addr) {
4027 /*
4028 * the current write completely
4029 * envelops the existing cluster and since
4030 * each write is limited to at most max_cluster_pgcount pages
4031 * we can just use the start and last blocknos of the write
4032 * to generate the cluster limits
4033 */
4034 wbp->cl_clusters[cl_index].e_addr = cl->e_addr;
4035 }
4036 break;
4037 }
4038 /*
4039 * if we were to combine this write with the current cluster
4040 * we would exceed the cluster size limit.... so,
4041 * let's see if there's any overlap of the new I/O with
4042 * the cluster we're currently considering... in fact, we'll
4043 * stretch the cluster out to its full limit and see if we
4044 * get an intersection with the current write
4045 *
4046 */
4047 if (cl->e_addr > wbp->cl_clusters[cl_index].e_addr - max_cluster_pgcount) {
4048 /*
4049 * the current write extends into the proposed cluster
4050 * clip the length of the current write after first combining its
4051 * tail with the newly shaped cluster
4052 */
4053 wbp->cl_clusters[cl_index].b_addr = wbp->cl_clusters[cl_index].e_addr - max_cluster_pgcount;
4054
4055 cl->e_addr = wbp->cl_clusters[cl_index].b_addr;
4056 }
4057 /*
4058 * if we get here, there was no way to merge
4059 * any portion of this write with this cluster
4060 * or we could only merge part of it which
4061 * will leave a tail...
4062 * we'll check the remaining clusters before starting a new one
4063 */
4064 }
4065 }
4066 if (cl_index < wbp->cl_number) {
4067 /*
4068 * we found an existing cluster(s) that we
4069 * could entirely merge this I/O into
4070 */
4071 goto delay_io;
4072 }
4073
4074 if (defer_writes == FALSE &&
4075 wbp->cl_number == MAX_CLUSTERS &&
4076 wbp->cl_seq_written >= (MAX_CLUSTERS * (max_cluster_pgcount * PAGE_SIZE))) {
4077 uint32_t n;
4078
4079 if (vp->v_mount->mnt_minsaturationbytecount) {
4080 n = vp->v_mount->mnt_minsaturationbytecount / MAX_CLUSTER_SIZE(vp);
4081
4082 if (n > MAX_CLUSTERS) {
4083 n = MAX_CLUSTERS;
4084 }
4085 } else {
4086 n = 0;
4087 }
4088
4089 if (n == 0) {
4090 if (disk_conditioner_mount_is_ssd(vp->v_mount)) {
4091 n = WRITE_BEHIND_SSD;
4092 } else {
4093 n = WRITE_BEHIND;
4094 }
4095 }
4096 while (n--) {
4097 cluster_try_push(wbp, vp, newEOF, 0, 0, callback, callback_arg, NULL, vm_initiated);
4098 }
4099 }
4100 if (wbp->cl_number < MAX_CLUSTERS) {
4101 /*
4102 * we didn't find an existing cluster to
4103 * merge into, but there's room to start
4104 * a new one
4105 */
4106 goto start_new_cluster;
4107 }
4108 /*
4109 * no existing cluster to merge with and no
4110 * room to start a new one... we'll try
4111 * pushing one of the existing ones... if none of
4112 * them are able to be pushed, we'll switch
4113 * to the sparse cluster mechanism
4114 * cluster_try_push updates cl_number to the
4115 * number of remaining clusters... and
4116 * returns the number of currently unused clusters
4117 */
4118 ret_cluster_try_push = 0;
4119
4120 /*
4121 * if writes are not deferred, call cluster push immediately
4122 */
4123 if (defer_writes == FALSE) {
4124 ret_cluster_try_push = cluster_try_push(wbp, vp, newEOF, (flags & IO_NOCACHE) ? 0 : PUSH_DELAY, 0, callback, callback_arg, NULL, vm_initiated);
4125 }
4126 /*
4127 * execute following regardless of writes being deferred or not
4128 */
4129 if (ret_cluster_try_push == 0) {
4130 /*
4131 * no more room in the normal cluster mechanism
4132 * so let's switch to the more expansive but expensive
4133 * sparse mechanism....
4134 */
4135 sparse_cluster_switch(wbp, vp, newEOF, callback, callback_arg, vm_initiated);
4136 sparse_cluster_add(wbp, &(wbp->cl_scmap), vp, cl, newEOF, callback, callback_arg, vm_initiated);
4137
4138 lck_mtx_unlock(&wbp->cl_lockw);
4139 return;
4140 }
4141 start_new_cluster:
4142 wbp->cl_clusters[wbp->cl_number].b_addr = cl->b_addr;
4143 wbp->cl_clusters[wbp->cl_number].e_addr = cl->e_addr;
4144
4145 wbp->cl_clusters[wbp->cl_number].io_flags = 0;
4146
4147 if (flags & IO_NOCACHE) {
4148 wbp->cl_clusters[wbp->cl_number].io_flags |= CLW_IONOCACHE;
4149 }
4150
4151 if (flags & IO_PASSIVE) {
4152 wbp->cl_clusters[wbp->cl_number].io_flags |= CLW_IOPASSIVE;
4153 }
4154
4155 wbp->cl_number++;
4156 delay_io:
4157 lck_mtx_unlock(&wbp->cl_lockw);
4158 return;
4159 }
4160
4161
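/*
 * Example of the head/tail zero-fill setup below (hypothetical offsets,
 * 4 KB pages): growing a file whose oldEOF is 0x1200 with a 0x100-byte
 * write at uio_offset 0x2800 (newEOF 0x2900) produces zero_off == 0x2000
 * and zero_cnt == 0x800, zeroing the head of the write's first page
 * since that page lies entirely beyond the old EOF, and zero_off1 ==
 * 0x2900 with zero_cnt1 == 0x700, zeroing the remainder of the last
 * page beyond the new EOF.
 */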
4162 static int
4163 cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t oldEOF, off_t newEOF, off_t headOff,
4164 off_t tailOff, int flags, int (*callback)(buf_t, void *), void *callback_arg)
4165 {
4166 upl_page_info_t *pl;
4167 upl_t upl;
4168 vm_offset_t upl_offset = 0;
4169 vm_size_t upl_size;
4170 off_t upl_f_offset;
4171 int pages_in_upl;
4172 int start_offset;
4173 int xfer_resid;
4174 int io_size;
4175 int io_offset;
4176 int bytes_to_zero;
4177 int bytes_to_move;
4178 kern_return_t kret;
4179 int retval = 0;
4180 int io_resid;
4181 long long total_size;
4182 long long zero_cnt;
4183 off_t zero_off;
4184 long long zero_cnt1;
4185 off_t zero_off1;
4186 off_t write_off = 0;
4187 int write_cnt = 0;
4188 boolean_t first_pass = FALSE;
4189 struct cl_extent cl;
4190 int bflag;
4191 u_int max_io_size;
4192
4193 if (uio) {
4194 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
4195 (int)uio->uio_offset, io_req_size, (int)oldEOF, (int)newEOF, 0);
4196
4197 io_resid = io_req_size;
4198 } else {
4199 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
4200 0, 0, (int)oldEOF, (int)newEOF, 0);
4201
4202 io_resid = 0;
4203 }
4204 if (flags & IO_PASSIVE) {
4205 bflag = CL_PASSIVE;
4206 } else {
4207 bflag = 0;
4208 }
4209 if (flags & IO_NOCACHE) {
4210 bflag |= CL_NOCACHE;
4211 }
4212
4213 if (flags & IO_SKIP_ENCRYPTION) {
4214 bflag |= CL_ENCRYPTED;
4215 }
4216
4217 zero_cnt = 0;
4218 zero_cnt1 = 0;
4219 zero_off = 0;
4220 zero_off1 = 0;
4221
4222 max_io_size = cluster_max_io_size(vp->v_mount, CL_WRITE);
4223
4224 if (flags & IO_HEADZEROFILL) {
4225 /*
4226 * some filesystems (HFS is one) don't support unallocated holes within a file...
4227 * so we zero fill the intervening space between the old EOF and the offset
4228 * where the next chunk of real data begins.... ftruncate will also use this
4229 * routine to zero fill to the new EOF when growing a file... in this case, the
4230 * uio structure will not be provided
4231 */
4232 if (uio) {
4233 if (headOff < uio->uio_offset) {
4234 zero_cnt = uio->uio_offset - headOff;
4235 zero_off = headOff;
4236 }
4237 } else if (headOff < newEOF) {
4238 zero_cnt = newEOF - headOff;
4239 zero_off = headOff;
4240 }
4241 } else {
4242 if (uio && uio->uio_offset > oldEOF) {
4243 zero_off = uio->uio_offset & ~PAGE_MASK_64;
4244
4245 if (zero_off >= oldEOF) {
4246 zero_cnt = uio->uio_offset - zero_off;
4247
4248 flags |= IO_HEADZEROFILL;
4249 }
4250 }
4251 }
4252 if (flags & IO_TAILZEROFILL) {
4253 if (uio) {
4254 zero_off1 = uio->uio_offset + io_req_size;
4255
4256 if (zero_off1 < tailOff) {
4257 zero_cnt1 = tailOff - zero_off1;
4258 }
4259 }
4260 } else {
4261 if (uio && newEOF > oldEOF) {
4262 zero_off1 = uio->uio_offset + io_req_size;
4263
4264 if (zero_off1 == newEOF && (zero_off1 & PAGE_MASK_64)) {
4265 zero_cnt1 = PAGE_SIZE_64 - (zero_off1 & PAGE_MASK_64);
4266
4267 flags |= IO_TAILZEROFILL;
4268 }
4269 }
4270 }
4271 if (zero_cnt == 0 && uio == (struct uio *) 0) {
4272 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END,
4273 retval, 0, 0, 0, 0);
4274 return 0;
4275 }
4276 if (uio) {
4277 write_off = uio->uio_offset;
4278 write_cnt = (int)uio_resid(uio);
4279 /*
4280 * delay updating the sequential write info
4281 * in the control block until we've obtained
4282 * the lock for it
4283 */
4284 first_pass = TRUE;
4285 }
4286 while ((total_size = (io_resid + zero_cnt + zero_cnt1)) && retval == 0) {
4287 /*
4288 * for this iteration of the loop, figure out where our starting point is
4289 */
4290 if (zero_cnt) {
4291 start_offset = (int)(zero_off & PAGE_MASK_64);
4292 upl_f_offset = zero_off - start_offset;
4293 } else if (io_resid) {
4294 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
4295 upl_f_offset = uio->uio_offset - start_offset;
4296 } else {
4297 start_offset = (int)(zero_off1 & PAGE_MASK_64);
4298 upl_f_offset = zero_off1 - start_offset;
4299 }
4300 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 46)) | DBG_FUNC_NONE,
4301 (int)zero_off, (int)zero_cnt, (int)zero_off1, (int)zero_cnt1, 0);
4302
4303 if (total_size > max_io_size) {
4304 total_size = max_io_size;
4305 }
4306
4307 cl.b_addr = (daddr64_t)(upl_f_offset / PAGE_SIZE_64);
4308
4309 if (uio && ((flags & (IO_SYNC | IO_HEADZEROFILL | IO_TAILZEROFILL)) == 0)) {
4310 /*
4311 * assumption... total_size <= io_resid
4312 * because IO_HEADZEROFILL and IO_TAILZEROFILL not set
4313 */
4314 if ((start_offset + total_size) > max_io_size) {
4315 total_size = max_io_size - start_offset;
4316 }
4317 xfer_resid = (int)total_size;
4318
4319 retval = cluster_copy_ubc_data_internal(vp, uio, &xfer_resid, 1, 1);
4320
4321 if (retval) {
4322 break;
4323 }
4324
4325 io_resid -= (total_size - xfer_resid);
4326 total_size = xfer_resid;
4327 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
4328 upl_f_offset = uio->uio_offset - start_offset;
4329
4330 if (total_size == 0) {
4331 if (start_offset) {
4332 /*
4333 * the write did not finish on a page boundary
4334 * which will leave upl_f_offset pointing to the
4335 * beginning of the last page written instead of
4336 * the page beyond it... bump it in this case
4337 * so that the cluster code records the last page
4338 * written as dirty
4339 */
4340 upl_f_offset += PAGE_SIZE_64;
4341 }
4342 upl_size = 0;
4343
4344 goto check_cluster;
4345 }
4346 }
4347 /*
4348 * compute the size of the upl needed to encompass
4349 * the requested write... limit each call to cluster_io
4350 * to the maximum UPL size... cluster_io will clip if
4351 * this exceeds the maximum io_size for the device,
4352 * make sure to account for
4353 * a starting offset that's not page aligned
4354 */
4355 upl_size = (start_offset + total_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
4356
4357 if (upl_size > max_io_size) {
4358 upl_size = max_io_size;
4359 }
4360
4361 pages_in_upl = (int)(upl_size / PAGE_SIZE);
4362 io_size = (int)(upl_size - start_offset);
4363
4364 if ((long long)io_size > total_size) {
4365 io_size = (int)total_size;
4366 }
4367
4368 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_START, upl_size, io_size, total_size, 0, 0);
4369
4370
4371 /*
4372 * Gather the pages from the buffer cache.
4373 * The UPL_WILL_MODIFY flag lets the UPL subsystem know
4374 * that we intend to modify these pages.
4375 */
4376 kret = ubc_create_upl_kernel(vp,
4377 upl_f_offset,
4378 (int)upl_size,
4379 &upl,
4380 &pl,
4381 UPL_SET_LITE | ((uio != NULL && (uio->uio_flags & UIO_FLAGS_IS_COMPRESSED_FILE)) ? 0 : UPL_WILL_MODIFY),
4382 VM_KERN_MEMORY_FILE);
4383 if (kret != KERN_SUCCESS) {
4384 panic("cluster_write_copy: failed to get pagelist");
4385 }
4386
4387 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_END,
4388 upl, (int)upl_f_offset, start_offset, 0, 0);
4389
4390 if (start_offset && upl_f_offset < oldEOF && !upl_valid_page(pl, 0)) {
4391 int read_size;
4392
4393 /*
4394 * we're starting in the middle of the first page of the upl
4395 * and the page isn't currently valid, so we're going to have
4396 * to read it in first... this is a synchronous operation
4397 */
4398 read_size = PAGE_SIZE;
4399
4400 if ((upl_f_offset + read_size) > oldEOF) {
4401 read_size = (int)(oldEOF - upl_f_offset);
4402 }
4403
4404 retval = cluster_io(vp, upl, 0, upl_f_offset, read_size,
4405 CL_READ | bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
4406 if (retval) {
4407 /*
4408 * we had an error during the read which causes us to abort
4409 * the current cluster_write request... before we do, we need
4410 * to release the rest of the pages in the upl without modifying
4411 * their state and mark the failed page in error
4412 */
4413 ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
4414
4415 if (upl_size > PAGE_SIZE) {
4416 ubc_upl_abort_range(upl, 0, (upl_size_t)upl_size,
4417 UPL_ABORT_FREE_ON_EMPTY);
4418 }
4419
4420 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
4421 upl, 0, 0, retval, 0);
4422 break;
4423 }
4424 }
4425 if ((start_offset == 0 || upl_size > PAGE_SIZE) && ((start_offset + io_size) & PAGE_MASK)) {
4426 /*
4427 * the last offset we're writing to in this upl does not end on a page
4428 * boundary... if it's not beyond the old EOF, then we'll also need to
4429 * pre-read this page in if it isn't already valid
4430 */
4431 upl_offset = upl_size - PAGE_SIZE;
4432
4433 if ((upl_f_offset + start_offset + io_size) < oldEOF &&
4434 !upl_valid_page(pl, (int)(upl_offset / PAGE_SIZE))) {
4435 int read_size;
4436
4437 read_size = PAGE_SIZE;
4438
4439 if ((off_t)(upl_f_offset + upl_offset + read_size) > oldEOF) {
4440 read_size = (int)(oldEOF - (upl_f_offset + upl_offset));
4441 }
4442
4443 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, read_size,
4444 CL_READ | bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
4445 if (retval) {
4446 /*
4447 * we had an error during the read which causes us to abort
4448 * the current cluster_write request... before we do, we
4449 * need to release the rest of the pages in the upl without
4450 * modifying their state and mark the failed page in error
4451 */
4452 ubc_upl_abort_range(upl, (upl_offset_t)upl_offset, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
4453
4454 if (upl_size > PAGE_SIZE) {
4455 ubc_upl_abort_range(upl, 0, (upl_size_t)upl_size, UPL_ABORT_FREE_ON_EMPTY);
4456 }
4457
4458 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
4459 upl, 0, 0, retval, 0);
4460 break;
4461 }
4462 }
4463 }
4464 xfer_resid = io_size;
4465 io_offset = start_offset;
4466
4467 while (zero_cnt && xfer_resid) {
4468 if (zero_cnt < (long long)xfer_resid) {
4469 bytes_to_zero = (int)zero_cnt;
4470 } else {
4471 bytes_to_zero = xfer_resid;
4472 }
4473
4474 bytes_to_zero = cluster_zero_range(upl, pl, flags, io_offset, zero_off, upl_f_offset, bytes_to_zero);
4475
4476 xfer_resid -= bytes_to_zero;
4477 zero_cnt -= bytes_to_zero;
4478 zero_off += bytes_to_zero;
4479 io_offset += bytes_to_zero;
4480 }
4481 if (xfer_resid && io_resid) {
4482 u_int32_t io_requested;
4483
4484 bytes_to_move = min(io_resid, xfer_resid);
4485 io_requested = bytes_to_move;
4486
4487 retval = cluster_copy_upl_data(uio, upl, io_offset, (int *)&io_requested);
4488
4489 if (retval) {
4490 ubc_upl_abort_range(upl, 0, (upl_size_t)upl_size, UPL_ABORT_FREE_ON_EMPTY);
4491
4492 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
4493 upl, 0, 0, retval, 0);
4494 } else {
4495 io_resid -= bytes_to_move;
4496 xfer_resid -= bytes_to_move;
4497 io_offset += bytes_to_move;
4498 }
4499 }
4500 while (xfer_resid && zero_cnt1 && retval == 0) {
4501 if (zero_cnt1 < (long long)xfer_resid) {
4502 bytes_to_zero = (int)zero_cnt1;
4503 } else {
4504 bytes_to_zero = xfer_resid;
4505 }
4506
4507 bytes_to_zero = cluster_zero_range(upl, pl, flags, io_offset, zero_off1, upl_f_offset, bytes_to_zero);
4508
4509 xfer_resid -= bytes_to_zero;
4510 zero_cnt1 -= bytes_to_zero;
4511 zero_off1 += bytes_to_zero;
4512 io_offset += bytes_to_zero;
4513 }
4514 if (retval == 0) {
4515 int do_zeroing = 1;
4516
4517 io_size += start_offset;
4518
4519 /* Force more restrictive zeroing behavior only on APFS */
4520 if ((vnode_tag(vp) == VT_APFS) && (newEOF < oldEOF)) {
4521 do_zeroing = 0;
4522 }
4523
4524 if (do_zeroing && (upl_f_offset + io_size) >= newEOF && (u_int)io_size < upl_size) {
4525 /*
4526 * if we're extending the file with this write
4527 * we'll zero fill the rest of the page so that
4528 * if the file gets extended again in such a way as to leave a
4529 * hole starting at this EOF, we'll have zeros in the correct spot
4530 */
4531 cluster_zero(upl, io_size, (int)(upl_size - io_size), NULL);
4532 }
4533 /*
4534 * release the upl now if we hold one since...
4535 * 1) pages in it may be present in the sparse cluster map
4536 * and may span 2 separate buckets there... if they do and
4537 * we happen to have to flush a bucket to make room and it intersects
4538 * this upl, a deadlock may result on page BUSY
4539 * 2) we're delaying the I/O... from this point forward we're just updating
4540 * the cluster state... no need to hold the pages, so commit them
4541 * 3) IO_SYNC is set...
4542 * because we had to ask for a UPL that provides currently non-present pages, the
4543 * UPL has been automatically set to clear the dirty flags (both software and hardware)
4544 * upon committing it... this is not the behavior we want since it's possible for
4545 * pages currently present as part of a mapped file to be dirtied while the I/O is in flight.
4546 * we'll pick these pages back up later with the correct behavior specified.
4547 * 4) we don't want to hold pages busy in a UPL and then block on the cluster lock... if a flush
4548 * of this vnode is in progress, we will deadlock if the pages being flushed intersect the pages
4549 * we hold since the flushing context is holding the cluster lock.
4550 */
4551 ubc_upl_commit_range(upl, 0, (upl_size_t)upl_size,
4552 UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
4553 check_cluster:
4554 /*
4555 * calculate the last logical block number
4556 * that this delayed I/O encompassed
4557 */
4558 cl.e_addr = (daddr64_t)((upl_f_offset + (off_t)upl_size) / PAGE_SIZE_64);
4559
4560 if (flags & IO_SYNC) {
4561 /*
4562 * if the IO_SYNC flag is set then we need to bypass
4563 * any clustering and immediately issue the I/O
4564 *
4565 * we don't hold the lock at this point
4566 *
4567 * we've already dropped the current upl, so pick it back up with COPYOUT_FROM set
4568 * so that we correctly deal with a change in state of the hardware modify bit...
4569 * we do this via cluster_push_now... by passing along the IO_SYNC flag, we force
4570 * cluster_push_now to wait until all the I/Os have completed... cluster_push_now is also
4571 * responsible for generating the correct sized I/O(s)
4572 */
4573 retval = cluster_push_now(vp, &cl, newEOF, flags, callback, callback_arg, FALSE);
4574 } else {
4575 boolean_t defer_writes = FALSE;
4576
4577 if (vfs_flags(vp->v_mount) & MNT_DEFWRITE) {
4578 defer_writes = TRUE;
4579 }
4580
4581 cluster_update_state_internal(vp, &cl, flags, defer_writes, &first_pass,
4582 write_off, write_cnt, newEOF, callback, callback_arg, FALSE);
4583 }
4584 }
4585 }
4586 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END, retval, 0, io_resid, 0, 0);
4587
4588 return retval;
4589 }
4590
4591
4592
4593 int
4594 cluster_read(vnode_t vp, struct uio *uio, off_t filesize, int xflags)
4595 {
4596 return cluster_read_ext(vp, uio, filesize, xflags, NULL, NULL);
4597 }
4598
4599
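/*
 * cluster_read_ext is normally reached from a filesystem's VNOP_READ
 * handler; a minimal, purely illustrative caller would look like:
 *
 *	error = cluster_read_ext(vp, uio, file_size, io_flags, NULL, NULL);
 *
 * where file_size and io_flags are the caller's values.  The loop below
 * then dispatches each chunk of the request: IO_COPY goes through the
 * buffer cache via cluster_read_copy, IO_DIRECT and IO_CONTIG take the
 * uncached paths, and IO_UNKNOWN re-evaluates the request with
 * cluster_io_type.
 */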
4600 int
4601 cluster_read_ext(vnode_t vp, struct uio *uio, off_t filesize, int xflags, int (*callback)(buf_t, void *), void *callback_arg)
4602 {
4603 int retval = 0;
4604 int flags;
4605 user_ssize_t cur_resid;
4606 u_int32_t io_size;
4607 u_int32_t read_length = 0;
4608 int read_type = IO_COPY;
4609 bool check_io_type;
4610
4611 flags = xflags;
4612
4613 if (vp->v_flag & VNOCACHE_DATA) {
4614 flags |= IO_NOCACHE;
4615 }
4616 if ((vp->v_flag & VRAOFF) || speculative_reads_disabled) {
4617 flags |= IO_RAOFF;
4618 }
4619
4620 if (flags & IO_SKIP_ENCRYPTION) {
4621 flags |= IO_ENCRYPTED;
4622 }
4623
4624 /*
4625 * do a read through the cache if one of the following is true....
4626 * NOCACHE is not true
4627 * the uio request doesn't target USERSPACE (unless IO_NOCACHE_SYSSPACE is also set)
4628 * Alternatively, if IO_ENCRYPTED is set, then we want to bypass the cache as well.
4629 * Reading encrypted data from a CP filesystem should never result in the data touching
4630 * the UBC.
4631 *
4632 * otherwise, find out if we want the direct or contig variant for
4633 * the first vector in the uio request
4634 */
4635 check_io_type = false;
4636 if (flags & IO_NOCACHE) {
4637 if (UIO_SEG_IS_USER_SPACE(uio->uio_segflg)) {
4638 /*
4639 * no-cache to user-space: ok to consider IO_DIRECT.
4640 */
4641 check_io_type = true;
4642 } else if (uio->uio_segflg == UIO_SYSSPACE &&
4643 (flags & IO_NOCACHE_SYSSPACE)) {
4644 /*
4645 * no-cache to kernel-space but w/ IO_NOCACHE_SYSSPACE:
4646 * ok to consider IO_DIRECT.
4647 * The caller should make sure to target kernel buffer
4648 * that is backed by regular anonymous memory (i.e.
4649 * not backed by the kernel object or an external
4650 * memory manager like device memory or a file).
4651 */
4652 check_io_type = true;
4653 }
4654 } else if (flags & IO_ENCRYPTED) {
4655 check_io_type = true;
4656 }
4657 if (check_io_type) {
4658 retval = cluster_io_type(uio, &read_type, &read_length, 0);
4659 }
4660
4661 while ((cur_resid = uio_resid(uio)) && uio->uio_offset < filesize && retval == 0) {
4662 switch (read_type) {
4663 case IO_COPY:
4664 /*
4665 * make sure the uio_resid isn't too big...
4666 * internally, we want to handle all of the I/O in
4667 * chunk sizes that fit in a 32 bit int
4668 */
4669 if (cur_resid > (user_ssize_t)(MAX_IO_REQUEST_SIZE)) {
4670 io_size = MAX_IO_REQUEST_SIZE;
4671 } else {
4672 io_size = (u_int32_t)cur_resid;
4673 }
4674
4675 retval = cluster_read_copy(vp, uio, io_size, filesize, flags, callback, callback_arg);
4676 break;
4677
4678 case IO_DIRECT:
4679 retval = cluster_read_direct(vp, uio, filesize, &read_type, &read_length, flags, callback, callback_arg);
4680 break;
4681
4682 case IO_CONTIG:
4683 retval = cluster_read_contig(vp, uio, filesize, &read_type, &read_length, callback, callback_arg, flags);
4684 break;
4685
4686 case IO_UNKNOWN:
4687 retval = cluster_io_type(uio, &read_type, &read_length, 0);
4688 break;
4689 }
4690 }
4691 return retval;
4692 }
4693
4694
4695
4696 static void
4697 cluster_read_upl_release(upl_t upl, int start_pg, int last_pg, int take_reference)
4698 {
4699 int range;
4700 int abort_flags = UPL_ABORT_FREE_ON_EMPTY;
4701
4702 if ((range = last_pg - start_pg)) {
4703 if (take_reference) {
4704 abort_flags |= UPL_ABORT_REFERENCE;
4705 }
4706
4707 ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, range * PAGE_SIZE, abort_flags);
4708 }
4709 }
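/*
 * e.g. cluster_read_upl_release(upl, 2, 5, 1) aborts pages [2, 5) of the
 * upl (three pages) with UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_REFERENCE,
 * leaving their contents intact while asking the VM layer to treat them
 * as recently referenced.
 */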
4710
4711
4712 static int
4713 cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t filesize, int flags, int (*callback)(buf_t, void *), void *callback_arg)
4714 {
4715 upl_page_info_t *pl;
4716 upl_t upl = NULL;
4717 vm_offset_t upl_offset;
4718 u_int32_t upl_size;
4719 off_t upl_f_offset;
4720 int start_offset;
4721 int start_pg;
4722 int last_pg;
4723 int uio_last = 0;
4724 int pages_in_upl;
4725 off_t max_size;
4726 off_t last_ioread_offset;
4727 off_t last_request_offset;
4728 kern_return_t kret;
4729 int error = 0;
4730 int retval = 0;
4731 u_int32_t size_of_prefetch;
4732 u_int32_t xsize;
4733 u_int32_t io_size;
4734 u_int32_t max_rd_size;
4735 u_int32_t max_io_size;
4736 u_int32_t max_prefetch;
4737 u_int rd_ahead_enabled = 1;
4738 u_int prefetch_enabled = 1;
4739 struct cl_readahead * rap;
4740 struct clios iostate;
4741 struct cl_extent extent;
4742 int bflag;
4743 int take_reference = 1;
4744 int policy = IOPOL_DEFAULT;
4745 boolean_t iolock_inited = FALSE;
4746
4747 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_START,
4748 (int)uio->uio_offset, io_req_size, (int)filesize, flags, 0);
4749
4750 if (flags & IO_ENCRYPTED) {
4751 panic("encrypted blocks will hit UBC!");
4752 }
4753
4754 policy = throttle_get_io_policy(NULL);
4755
4756 if (policy == THROTTLE_LEVEL_TIER3 || policy == THROTTLE_LEVEL_TIER2 || (flags & IO_NOCACHE)) {
4757 take_reference = 0;
4758 }
4759
4760 if (flags & IO_PASSIVE) {
4761 bflag = CL_PASSIVE;
4762 } else {
4763 bflag = 0;
4764 }
4765
4766 if (flags & IO_NOCACHE) {
4767 bflag |= CL_NOCACHE;
4768 }
4769
4770 if (flags & IO_SKIP_ENCRYPTION) {
4771 bflag |= CL_ENCRYPTED;
4772 }
4773
4774 max_io_size = cluster_max_io_size(vp->v_mount, CL_READ);
4775 max_prefetch = cluster_max_prefetch(vp, max_io_size, prefetch_max);
4776 max_rd_size = max_prefetch;
4777
4778 last_request_offset = uio->uio_offset + io_req_size;
4779
4780 if (last_request_offset > filesize) {
4781 last_request_offset = filesize;
4782 }
4783
4784 if ((flags & (IO_RAOFF | IO_NOCACHE)) || ((last_request_offset & ~PAGE_MASK_64) == (uio->uio_offset & ~PAGE_MASK_64))) {
4785 rd_ahead_enabled = 0;
4786 rap = NULL;
4787 } else {
4788 if (cluster_is_throttled(vp)) {
4789 /*
4790 * we're in the throttle window, at the very least
4791 * we want to limit the size of the I/O we're about
4792 * to issue
4793 */
4794 rd_ahead_enabled = 0;
4795 prefetch_enabled = 0;
4796
4797 max_rd_size = calculate_max_throttle_size(vp);
4798 }
4799 if ((rap = cluster_get_rap(vp)) == NULL) {
4800 rd_ahead_enabled = 0;
4801 } else {
4802 extent.b_addr = uio->uio_offset / PAGE_SIZE_64;
4803 extent.e_addr = (last_request_offset - 1) / PAGE_SIZE_64;
4804 }
4805 }
4806 if (rap != NULL && rap->cl_ralen && (rap->cl_lastr == extent.b_addr || (rap->cl_lastr + 1) == extent.b_addr)) {
4807 /*
4808 * determine if we already have a read-ahead in the pipe courtesy of the
4809 * last read system call that was issued...
4810 * if so, pick up its extent to determine where we should start
4811 * with respect to any read-ahead that might be necessary to
4812 * garner all the data needed to complete this read system call
4813 */
4814 last_ioread_offset = (rap->cl_maxra * PAGE_SIZE_64) + PAGE_SIZE_64;
4815
4816 if (last_ioread_offset < uio->uio_offset) {
4817 last_ioread_offset = (off_t)0;
4818 } else if (last_ioread_offset > last_request_offset) {
4819 last_ioread_offset = last_request_offset;
4820 }
4821 } else {
4822 last_ioread_offset = (off_t)0;
4823 }
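/*
 * Illustration (hypothetical values, 4 KB pages): if the previous read
 * left rap->cl_maxra == 99, read-ahead has already covered through the
 * end of page 99, so last_ioread_offset starts at page 100; a new
 * sequential read beginning at page 98 can then copy from the cache and
 * only needs to prefetch beyond that point.
 */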
4824
4825 while (io_req_size && uio->uio_offset < filesize && retval == 0) {
4826 max_size = filesize - uio->uio_offset;
4827 bool leftover_upl_aborted = false;
4828
4829 if ((off_t)(io_req_size) < max_size) {
4830 io_size = io_req_size;
4831 } else {
4832 io_size = (u_int32_t)max_size;
4833 }
4834
4835 if (!(flags & IO_NOCACHE)) {
4836 while (io_size) {
4837 u_int32_t io_resid;
4838 u_int32_t io_requested;
4839
4840 /*
4841 * if we keep finding the pages we need already in the cache, then
4842 * don't bother to call cluster_read_prefetch since it costs CPU cycles
4843 * to determine that we have all the pages we need... once we miss in
4844 * the cache and have issued an I/O, then we'll assume that we're likely
4845 * to continue to miss in the cache and it's to our advantage to try and prefetch
4846 */
4847 if (last_request_offset && last_ioread_offset && (size_of_prefetch = (u_int32_t)(last_request_offset - last_ioread_offset))) {
4848 if ((last_ioread_offset - uio->uio_offset) <= max_rd_size && prefetch_enabled) {
4849 /*
4850 * we've already issued I/O for this request and
4851 * there's still work to do and
4852 * our prefetch stream is running dry, so issue a
4853 * pre-fetch I/O... the I/O latency will overlap
4854 * with the copying of the data
4855 */
4856 if (size_of_prefetch > max_rd_size) {
4857 size_of_prefetch = max_rd_size;
4858 }
4859
4860 size_of_prefetch = cluster_read_prefetch(vp, last_ioread_offset, size_of_prefetch, filesize, callback, callback_arg, bflag);
4861
4862 last_ioread_offset += (off_t)(size_of_prefetch * PAGE_SIZE);
4863
4864 if (last_ioread_offset > last_request_offset) {
4865 last_ioread_offset = last_request_offset;
4866 }
4867 }
4868 }
4869 /*
4870 * limit the size of the copy we're about to do so that
4871 * we can notice that our I/O pipe is running dry and
4872 * get the next I/O issued before it does go dry
4873 */
4874 if (last_ioread_offset && io_size > (max_io_size / 4)) {
4875 io_resid = (max_io_size / 4);
4876 } else {
4877 io_resid = io_size;
4878 }
4879
4880 io_requested = io_resid;
4881
4882 retval = cluster_copy_ubc_data_internal(vp, uio, (int *)&io_resid, 0, take_reference);
4883
4884 xsize = io_requested - io_resid;
4885
4886 io_size -= xsize;
4887 io_req_size -= xsize;
4888
4889 if (retval || io_resid) {
4890 /*
4891 * if we run into a real error or
4892 * a page that is not in the cache
4893 * we need to leave streaming mode
4894 */
4895 break;
4896 }
4897
4898 if (rd_ahead_enabled && (io_size == 0 || last_ioread_offset == last_request_offset)) {
4899 /*
4900 * we've already finished the I/O for this read request
4901 * let's see if we should do a read-ahead
4902 */
4903 cluster_read_ahead(vp, &extent, filesize, rap, callback, callback_arg, bflag);
4904 }
4905 }
4906 if (retval) {
4907 break;
4908 }
4909 if (io_size == 0) {
4910 if (rap != NULL) {
4911 if (extent.e_addr < rap->cl_lastr) {
4912 rap->cl_maxra = 0;
4913 }
4914 rap->cl_lastr = extent.e_addr;
4915 }
4916 break;
4917 }
4918 /*
4919 * recompute max_size since cluster_copy_ubc_data_internal
4920 * may have advanced uio->uio_offset
4921 */
4922 max_size = filesize - uio->uio_offset;
4923 }
4924
4925 iostate.io_completed = 0;
4926 iostate.io_issued = 0;
4927 iostate.io_error = 0;
4928 iostate.io_wanted = 0;
4929
4930 if ((flags & IO_RETURN_ON_THROTTLE)) {
4931 if (cluster_is_throttled(vp) == THROTTLE_NOW) {
4932 if (!cluster_io_present_in_BC(vp, uio->uio_offset)) {
4933 /*
4934 * we're in the throttle window and at least 1 I/O
4935 * has already been issued by a throttleable thread
4936 * in this window, so return with EAGAIN to indicate
4937 * to the FS issuing the cluster_read call that it
4938 * should now throttle after dropping any locks
4939 */
4940 throttle_info_update_by_mount(vp->v_mount);
4941
4942 retval = EAGAIN;
4943 break;
4944 }
4945 }
4946 }
4947
4948 /*
4949 * compute the size of the upl needed to encompass
4950 * the requested read... limit each call to cluster_io
4951 * to the maximum UPL size... cluster_io will clip if
4952 * this exceeds the maximum io_size for the device,
4953 * make sure to account for
4954 * a starting offset that's not page aligned
4955 */
4956 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
4957 upl_f_offset = uio->uio_offset - (off_t)start_offset;
4958
4959 if (io_size > max_rd_size) {
4960 io_size = max_rd_size;
4961 }
4962
4963 upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
4964
4965 if (flags & IO_NOCACHE) {
4966 if (upl_size > max_io_size) {
4967 upl_size = max_io_size;
4968 }
4969 } else {
4970 if (upl_size > max_io_size / 4) {
4971 upl_size = max_io_size / 4;
4972 upl_size &= ~PAGE_MASK;
4973
4974 if (upl_size == 0) {
4975 upl_size = PAGE_SIZE;
4976 }
4977 }
4978 }
4979 pages_in_upl = upl_size / PAGE_SIZE;
4980
4981 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_START,
4982 upl, (int)upl_f_offset, upl_size, start_offset, 0);
4983
4984 kret = ubc_create_upl_kernel(vp,
4985 upl_f_offset,
4986 upl_size,
4987 &upl,
4988 &pl,
4989 UPL_FILE_IO | UPL_SET_LITE,
4990 VM_KERN_MEMORY_FILE);
4991 if (kret != KERN_SUCCESS) {
4992 panic("cluster_read_copy: failed to get pagelist");
4993 }
4994
4995 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_END,
4996 upl, (int)upl_f_offset, upl_size, start_offset, 0);
4997
4998 /*
4999 * scan from the beginning of the upl looking for the first
5000 * non-valid page.... this will become the first page in
5001 * the request we're going to make to 'cluster_io'... if all
5002 * of the pages are valid, we won't call through to 'cluster_io'
5003 */
5004 for (start_pg = 0; start_pg < pages_in_upl; start_pg++) {
5005 if (!upl_valid_page(pl, start_pg)) {
5006 break;
5007 }
5008 }
5009
5010 /*
5011 * scan from the starting invalid page looking for a valid
5012 * page before the end of the upl is reached, if we
5013 * find one, then it will be the last page of the request to
5014 * 'cluster_io'
5015 */
5016 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
5017 if (upl_valid_page(pl, last_pg)) {
5018 break;
5019 }
5020 }
5021
5022 if (start_pg < last_pg) {
5023 /*
5024 * we found a range of 'invalid' pages that must be filled
5025 * if the last page in this range is the last page of the file
5026 * we may have to clip the size of it to keep from reading past
5027 * the end of the last physical block associated with the file
5028 */
5029 if (iolock_inited == FALSE) {
5030 lck_mtx_init(&iostate.io_mtxp, &cl_mtx_grp, LCK_ATTR_NULL);
5031
5032 iolock_inited = TRUE;
5033 }
5034 upl_offset = start_pg * PAGE_SIZE;
5035 io_size = (last_pg - start_pg) * PAGE_SIZE;
5036
5037 if ((off_t)(upl_f_offset + upl_offset + io_size) > filesize) {
5038 io_size = (u_int32_t)(filesize - (upl_f_offset + upl_offset));
5039 }
5040
5041 /*
5042 * Find out if this needs verification, we'll have to manage the UPL
5043 * differently if so. Note that this call only lets us know if
5044 * verification is enabled on this mount point, the actual verification
5045 * is performed in the File system.
5046 */
5047 size_t verify_block_size = 0;
5048 if ((VNOP_VERIFY(vp, start_offset, NULL, 0, &verify_block_size, NULL, VNODE_VERIFY_DEFAULT, NULL, NULL) == 0) /* && verify_block_size */) {
5049 for (uio_last = last_pg; uio_last < pages_in_upl; uio_last++) {
5050 if (!upl_valid_page(pl, uio_last)) {
5051 break;
5052 }
5053 }
5054 if (uio_last < pages_in_upl) {
5055 /*
5056 * there were some invalid pages beyond the valid pages
5057 * that we didn't issue an I/O for, just release them
5058 * unchanged now, so that any prefetch/readahead can
5059 * include them
5060 */
5061 ubc_upl_abort_range(upl, uio_last * PAGE_SIZE,
5062 (pages_in_upl - uio_last) * PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
5063 leftover_upl_aborted = true;
5064 }
5065 }
5066
5067 /*
5068 * issue an asynchronous read to cluster_io
5069 */
5070
5071 error = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset,
5072 io_size, CL_READ | CL_ASYNC | bflag, (buf_t)NULL, &iostate, callback, callback_arg);
5073
5074 if (rap) {
5075 if (extent.e_addr < rap->cl_maxra) {
5076 /*
5077 * we've just issued a read for a block that should have been
5078 * in the cache courtesy of the read-ahead engine... something
5079 * has gone wrong with the pipeline, so reset the read-ahead
5080 * logic which will cause us to restart from scratch
5081 */
5082 rap->cl_maxra = 0;
5083 }
5084 }
5085 }
5086 if (error == 0) {
5087 /*
5088 * if the read completed successfully, or there was no I/O request
5089 * issued, then copy the data into user land via 'cluster_copy_upl_data'...
5090 * we'll first add on any 'valid'
5091 * pages that were present in the upl when we acquired it.
5092 */
5093 u_int val_size;
5094
5095 if (!leftover_upl_aborted) {
5096 for (uio_last = last_pg; uio_last < pages_in_upl; uio_last++) {
5097 if (!upl_valid_page(pl, uio_last)) {
5098 break;
5099 }
5100 }
5101 if (uio_last < pages_in_upl) {
5102 /*
5103 * there were some invalid pages beyond the valid pages
5104 * that we didn't issue an I/O for, just release them
5105 * unchanged now, so that any prefetch/readahead can
5106 * include them
5107 */
5108 ubc_upl_abort_range(upl, uio_last * PAGE_SIZE,
5109 (pages_in_upl - uio_last) * PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
5110 }
5111 }
5112
5113 /*
5114 * compute size to transfer this round... if io_req_size is
5115 * still non-zero after this attempt, we'll loop around and
5116 * set up for another I/O.
5117 */
5118 val_size = (uio_last * PAGE_SIZE) - start_offset;
5119
5120 if (val_size > max_size) {
5121 val_size = (u_int)max_size;
5122 }
5123
5124 if (val_size > io_req_size) {
5125 val_size = io_req_size;
5126 }
5127
5128 if ((uio->uio_offset + val_size) > last_ioread_offset) {
5129 last_ioread_offset = uio->uio_offset + val_size;
5130 }
5131
5132 if ((size_of_prefetch = (u_int32_t)(last_request_offset - last_ioread_offset)) && prefetch_enabled) {
5133 if ((last_ioread_offset - (uio->uio_offset + val_size)) <= upl_size) {
5134 /*
5135 * if there's still I/O left to do for this request, and...
5136 * we're not in hard throttle mode, and...
5137 * we're close to using up the previous prefetch, then issue a
5138 * new pre-fetch I/O... the I/O latency will overlap
5139 * with the copying of the data
5140 */
5141 if (size_of_prefetch > max_rd_size) {
5142 size_of_prefetch = max_rd_size;
5143 }
5144
5145 size_of_prefetch = cluster_read_prefetch(vp, last_ioread_offset, size_of_prefetch, filesize, callback, callback_arg, bflag);
5146
5147 last_ioread_offset += (off_t)(size_of_prefetch * PAGE_SIZE);
5148
5149 if (last_ioread_offset > last_request_offset) {
5150 last_ioread_offset = last_request_offset;
5151 }
5152 }
5153 } else if ((uio->uio_offset + val_size) == last_request_offset) {
5154 /*
5155 * this transfer will finish this request, so...
5156 * let's try to read ahead if we're in
5157 * a sequential access pattern and we haven't
5158 * explicitly disabled it
5159 */
5160 if (rd_ahead_enabled) {
5161 cluster_read_ahead(vp, &extent, filesize, rap, callback, callback_arg, bflag);
5162 }
5163
5164 if (rap != NULL) {
5165 if (extent.e_addr < rap->cl_lastr) {
5166 rap->cl_maxra = 0;
5167 }
5168 rap->cl_lastr = extent.e_addr;
5169 }
5170 }
5171 if (iolock_inited == TRUE) {
5172 cluster_iostate_wait(&iostate, 0, "cluster_read_copy");
5173 }
5174
5175 if (iostate.io_error) {
5176 error = iostate.io_error;
5177 } else {
5178 u_int32_t io_requested;
5179
5180 io_requested = val_size;
5181
5182 retval = cluster_copy_upl_data(uio, upl, start_offset, (int *)&io_requested);
5183
5184 io_req_size -= (val_size - io_requested);
5185 }
5186 } else {
5187 if (iolock_inited == TRUE) {
5188 cluster_iostate_wait(&iostate, 0, "cluster_read_copy");
5189 }
5190 }
5191 if (start_pg < last_pg) {
5192 /*
5193 * compute the range of pages that we actually issued an I/O for
5194 * and either commit them as valid if the I/O succeeded
5195 * or abort them if the I/O failed or we're not supposed to
5196 * keep them in the cache
5197 */
5198 io_size = (last_pg - start_pg) * PAGE_SIZE;
5199
5200 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START, upl, start_pg * PAGE_SIZE, io_size, error, 0);
5201
5202 if (error || (flags & IO_NOCACHE)) {
5203 ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, io_size,
5204 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
5205 } else {
5206 int commit_flags = UPL_COMMIT_CLEAR_DIRTY | UPL_COMMIT_FREE_ON_EMPTY;
5207
5208 if (take_reference) {
5209 commit_flags |= UPL_COMMIT_INACTIVATE;
5210 } else {
5211 commit_flags |= UPL_COMMIT_SPECULATE;
5212 }
5213
5214 ubc_upl_commit_range(upl, start_pg * PAGE_SIZE, io_size, commit_flags);
5215 }
5216 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END, upl, start_pg * PAGE_SIZE, io_size, error, 0);
5217 }
5218 if ((last_pg - start_pg) < pages_in_upl) {
5219 /*
5220 * the set of pages that we issued an I/O for did not encompass
5221 * the entire upl... so just release these without modifying
5222 * their state
5223 */
5224 if (error) {
5225 if (leftover_upl_aborted) {
5226 ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, (uio_last - start_pg) * PAGE_SIZE,
5227 UPL_ABORT_FREE_ON_EMPTY);
5228 } else {
5229 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
5230 }
5231 } else {
5232 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START,
5233 upl, -1, pages_in_upl - (last_pg - start_pg), 0, 0);
5234
5235 /*
5236 * handle any valid pages at the beginning of
5237 * the upl... release these appropriately
5238 */
5239 cluster_read_upl_release(upl, 0, start_pg, take_reference);
5240
5241 /*
5242 * handle any valid pages immediately after the
5243 * pages we issued I/O for... release these appropriately
5244 */
5245 cluster_read_upl_release(upl, last_pg, uio_last, take_reference);
5246
5247 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END, upl, -1, -1, 0, 0);
5248 }
5249 }
5250 if (retval == 0) {
5251 retval = error;
5252 }
5253
5254 if (io_req_size) {
5255 uint32_t max_throttle_size = calculate_max_throttle_size(vp);
5256
5257 if (cluster_is_throttled(vp)) {
5258 /*
5259 * we're in the throttle window, at the very least
5260 * we want to limit the size of the I/O we're about
5261 * to issue
5262 */
5263 rd_ahead_enabled = 0;
5264 prefetch_enabled = 0;
5265 max_rd_size = max_throttle_size;
5266 } else {
5267 if (max_rd_size == max_throttle_size) {
5268 /*
5269 * coming out of throttled state
5270 */
5271 if (policy != THROTTLE_LEVEL_TIER3 && policy != THROTTLE_LEVEL_TIER2) {
5272 if (rap != NULL) {
5273 rd_ahead_enabled = 1;
5274 }
5275 prefetch_enabled = 1;
5276 }
5277 max_rd_size = max_prefetch;
5278 last_ioread_offset = 0;
5279 }
5280 }
5281 }
5282 }
5283 if (iolock_inited == TRUE) {
5284 /*
5285 * cluster_io returned an error after it
5286 * had already issued some I/O. we need
5287 * to wait for that I/O to complete before
5288 * we can destroy the iostate mutex...
5289 * 'retval' already contains the early error
5290 * so no need to pick it up from iostate.io_error
5291 */
5292 cluster_iostate_wait(&iostate, 0, "cluster_read_copy");
5293
5294 lck_mtx_destroy(&iostate.io_mtxp, &cl_mtx_grp);
5295 }
5296 if (rap != NULL) {
5297 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
5298 (int)uio->uio_offset, io_req_size, rap->cl_lastr, retval, 0);
5299
5300 lck_mtx_unlock(&rap->cl_lockr);
5301 } else {
5302 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
5303 (int)uio->uio_offset, io_req_size, 0, retval, 0);
5304 }
5305
5306 return retval;
5307 }
5308
5309 /*
5310 * We don't want another read/write lock for every vnode in the system
5311 * so we keep a hash of them here. There should never be very many of
5312 * these around at any point in time.
5313 */
5314 cl_direct_read_lock_t *
5315 cluster_lock_direct_read(vnode_t vp, lck_rw_type_t type)
5316 {
5317 struct cl_direct_read_locks *head
5318 = &cl_direct_read_locks[(uintptr_t)vp / sizeof(*vp)
5319 % CL_DIRECT_READ_LOCK_BUCKETS];
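/*
 * Descriptive note: the bucket index is the vnode pointer scaled down by
 * sizeof(struct vnode) and taken modulo CL_DIRECT_READ_LOCK_BUCKETS, so
 * repeated lookups for the same vnode always land on the same list head.
 */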
5320
5321 struct cl_direct_read_lock *lck, *new_lck = NULL;
5322
5323 for (;;) {
5324 lck_spin_lock(&cl_direct_read_spin_lock);
5325
5326 LIST_FOREACH(lck, head, chain) {
5327 if (lck->vp == vp) {
5328 ++lck->ref_count;
5329 lck_spin_unlock(&cl_direct_read_spin_lock);
5330 if (new_lck) {
5331 // Someone beat us to it, ditch the allocation
5332 lck_rw_destroy(&new_lck->rw_lock, &cl_mtx_grp);
5333 kfree_type(cl_direct_read_lock_t, new_lck);
5334 }
5335 lck_rw_lock(&lck->rw_lock, type);
5336 return lck;
5337 }
5338 }
5339
5340 if (new_lck) {
5341 // Use the lock we allocated
5342 LIST_INSERT_HEAD(head, new_lck, chain);
5343 lck_spin_unlock(&cl_direct_read_spin_lock);
5344 lck_rw_lock(&new_lck->rw_lock, type);
5345 return new_lck;
5346 }
5347
5348 lck_spin_unlock(&cl_direct_read_spin_lock);
5349
5350 // Allocate a new lock
5351 new_lck = kalloc_type(cl_direct_read_lock_t, Z_WAITOK);
5352 lck_rw_init(&new_lck->rw_lock, &cl_mtx_grp, LCK_ATTR_NULL);
5353 new_lck->vp = vp;
5354 new_lck->ref_count = 1;
5355
5356 // Got to go round again
5357 }
5358 }
5359
5360 void
5361 cluster_unlock_direct_read(cl_direct_read_lock_t *lck)
5362 {
5363 lck_rw_done(&lck->rw_lock);
5364
5365 lck_spin_lock(&cl_direct_read_spin_lock);
5366 if (lck->ref_count == 1) {
5367 LIST_REMOVE(lck, chain);
5368 lck_spin_unlock(&cl_direct_read_spin_lock);
5369 lck_rw_destroy(&lck->rw_lock, &cl_mtx_grp);
5370 kfree_type(cl_direct_read_lock_t, lck);
5371 } else {
5372 --lck->ref_count;
5373 lck_spin_unlock(&cl_direct_read_spin_lock);
5374 }
5375 }
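/*
 * Illustrative usage sketch (descriptive only; cluster_read_direct below is
 * the real client of this pair):
 *
 *     cl_direct_read_lock_t *lock;
 *
 *     lock = cluster_lock_direct_read(vp, LCK_RW_TYPE_SHARED);
 *     // check the UBC and issue the direct I/O against 'vp'
 *     cluster_unlock_direct_read(lock);
 *
 * Clients that need to guarantee stability of the cached pages between the
 * cache check and the I/O take the rw lock exclusively instead (see the
 * comment in cluster_read_direct).
 */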
5376
5377 static int
5378 cluster_read_direct(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t *read_length,
5379 int flags, int (*callback)(buf_t, void *), void *callback_arg)
5380 {
5381 upl_t upl = NULL;
5382 upl_page_info_t *pl;
5383 off_t max_io_size;
5384 size_t verify_block_size = 0;
5385 vm_offset_t upl_offset, vector_upl_offset = 0;
5386 upl_size_t upl_size = 0, vector_upl_size = 0;
5387 vm_size_t upl_needed_size;
5388 unsigned int pages_in_pl;
5389 upl_control_flags_t upl_flags;
5390 kern_return_t kret = KERN_SUCCESS;
5391 unsigned int i;
5392 int force_data_sync;
5393 int retval = 0;
5394 int no_zero_fill = 0;
5395 int io_flag = 0;
5396 int misaligned = 0;
5397 struct clios iostate;
5398 user_addr_t iov_base;
5399 u_int32_t io_req_size;
5400 u_int32_t offset_in_file;
5401 u_int32_t offset_in_iovbase;
5402 u_int32_t io_size;
5403 u_int32_t io_min;
5404 u_int32_t xsize;
5405 u_int32_t devblocksize;
5406 u_int32_t mem_alignment_mask;
5407 u_int32_t max_upl_size;
5408 u_int32_t max_rd_size;
5409 u_int32_t max_rd_ahead;
5410 u_int32_t max_vector_size;
5411 boolean_t io_throttled = FALSE;
5412
5413 u_int32_t vector_upl_iosize = 0;
5414 int issueVectorUPL = 0, useVectorUPL = (uio->uio_iovcnt > 1);
5415 off_t v_upl_uio_offset = 0;
5416 int vector_upl_index = 0;
5417 upl_t vector_upl = NULL;
5418 cl_direct_read_lock_t *lock = NULL;
5419 uint32_t verify_mask = 0;
5420
5421 assert(vm_map_page_shift(current_map()) >= PAGE_SHIFT);
5422
5423 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_START,
5424 (int)uio->uio_offset, (int)filesize, *read_type, *read_length, 0);
5425
5426 max_upl_size = cluster_max_io_size(vp->v_mount, CL_READ);
5427
5428 max_rd_size = max_upl_size;
5429
5430 if (__improbable(os_mul_overflow(max_rd_size, IO_SCALE(vp, 2),
5431 &max_rd_ahead) || (max_rd_ahead > overlapping_read_max))) {
5432 max_rd_ahead = overlapping_read_max;
5433 }
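/*
 * Descriptive note: max_rd_ahead starts as max_rd_size scaled by
 * IO_SCALE(vp, 2); the os_mul_overflow() check keeps the 32-bit multiply
 * from wrapping, and in either the overflow or the too-large case the
 * value is clamped to overlapping_read_max.
 */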
5434
5435 io_flag = CL_COMMIT | CL_READ | CL_ASYNC | CL_NOZERO | CL_DIRECT_IO;
5436
5437 if (flags & IO_PASSIVE) {
5438 io_flag |= CL_PASSIVE;
5439 }
5440
5441 if (flags & IO_ENCRYPTED) {
5442 io_flag |= CL_RAW_ENCRYPTED;
5443 }
5444
5445 if (flags & IO_NOCACHE) {
5446 io_flag |= CL_NOCACHE;
5447 }
5448
5449 if (flags & IO_SKIP_ENCRYPTION) {
5450 io_flag |= CL_ENCRYPTED;
5451 }
5452
5453 iostate.io_completed = 0;
5454 iostate.io_issued = 0;
5455 iostate.io_error = 0;
5456 iostate.io_wanted = 0;
5457
5458 lck_mtx_init(&iostate.io_mtxp, &cl_mtx_grp, LCK_ATTR_NULL);
5459
5460 devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize;
5461 mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask;
5462
5463 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_NONE,
5464 (int)devblocksize, (int)mem_alignment_mask, 0, 0, 0);
5465
5466 if (devblocksize == 1) {
5467 /*
5468 * the AFP client advertises a devblocksize of 1
5469 * however, its BLOCKMAP routine maps to physical
5470 * blocks that are PAGE_SIZE in size...
5471 * therefore we can't ask for I/Os that aren't page aligned
5472 * or aren't multiples of PAGE_SIZE in size...
5473 * by setting devblocksize to PAGE_SIZE, we reinstate
5474 * the old behavior we had before the mem_alignment_mask
5475 * changes went in...
5476 */
5477 devblocksize = PAGE_SIZE;
5478 }
5479
5480 /*
5481 * We are going to need this uio for the prefaulting later
5482 * especially for the cases where multiple non-contiguous
5483 * iovs are passed into this routine.
5484 *
5485 * Note that we only want to prefault for direct IOs to userspace buffers,
5486 * not kernel buffers.
5487 */
5488 uio_t uio_acct = NULL;
5489 if (uio->uio_segflg != UIO_SYSSPACE) {
5490 uio_acct = uio_duplicate(uio);
5491 }
5492
5493 retval = VNOP_VERIFY(vp, 0, NULL, 0, &verify_block_size, NULL, VNODE_VERIFY_DEFAULT, NULL, NULL);
5494 if (retval) {
5495 verify_block_size = 0;
5496 } else if (verify_block_size) {
5497 assert((verify_block_size & (verify_block_size - 1)) == 0);
5498 verify_mask = verify_block_size - 1;
5499 }
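/*
 * Illustrative note (the block size is an assumption for the example only):
 * verify_block_size is asserted to be a power of two, so with a 4 KiB
 * verification block verify_mask is 0xfff and (offset & verify_mask) is
 * non-zero exactly when the offset is not 4 KiB aligned; the check further
 * below applies this mask to both the offset and the residual length and
 * falls back to the cached path when either is misaligned.
 */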
5500
5501 next_dread:
5502 io_req_size = *read_length;
5503 iov_base = uio_curriovbase(uio);
5504
5505 offset_in_file = (u_int32_t)uio->uio_offset & (devblocksize - 1);
5506 offset_in_iovbase = (u_int32_t)iov_base & mem_alignment_mask;
5507
5508 if (vm_map_page_mask(current_map()) < PAGE_MASK) {
5509 /*
5510 * XXX TODO4K
5511 * Direct I/O might not work as expected from a 16k kernel space
5512 * to a 4k user space because each 4k chunk might point to
5513 * a different 16k physical page...
5514 * Let's go the "misaligned" way.
5515 */
5516 if (!misaligned) {
5517 DEBUG4K_VFS("forcing misaligned\n");
5518 }
5519 misaligned = 1;
5520 }
5521
5522 if (offset_in_file || offset_in_iovbase) {
5523 /*
5524 * one of the 2 important offsets is misaligned
5525 * so fire an I/O through the cache for this entire vector
5526 */
5527 misaligned = 1;
5528 }
5529 if (iov_base & (devblocksize - 1)) {
5530 /*
5531 * the offset in memory must be on a device block boundary
5532 * so that we can guarantee that we can generate an
5533 * I/O that ends on a page boundary in cluster_io
5534 */
5535 misaligned = 1;
5536 }
5537
5538 if (verify_block_size && !misaligned && ((uio_offset(uio) & verify_mask) || (uio_resid(uio) & verify_mask))) {
5539 /*
5540 * If the offset is not aligned to the verification block size
5541 * or the size is not aligned to the verification block size,
5542 * we simply send this through the cached i/o path as that is
5543 * what the Filesystem will end up doing anyway i.e. it will
5544 * read all the remaining data in order to verify it and then
5545 * discard the data it has read.
5546 */
5547 misaligned = 1;
5548 }
5549
5550 max_io_size = filesize - uio->uio_offset;
5551
5552 /*
5553 * The user must request IO in aligned chunks. If the
5554 * offset into the file is bad, or the userland pointer
5555 * is non-aligned, then we cannot service the encrypted IO request.
5556 */
5557 if (flags & IO_ENCRYPTED) {
5558 if (misaligned || (io_req_size & (devblocksize - 1))) {
5559 retval = EINVAL;
5560 }
5561
5562 max_io_size = roundup(max_io_size, devblocksize);
5563 }
5564
5565 if ((off_t)io_req_size > max_io_size) {
5566 io_req_size = (u_int32_t)max_io_size;
5567 }
5568
5569 /*
5570 * When we get to this point, we know...
5571 * -- the offset into the file is on a devblocksize boundary
5572 */
5573
5574 while (io_req_size && retval == 0) {
5575 u_int32_t io_start;
5576
5577 if (cluster_is_throttled(vp)) {
5578 uint32_t max_throttle_size = calculate_max_throttle_size(vp);
5579
5580 /*
5581 * we're in the throttle window, at the very least
5582 * we want to limit the size of the I/O we're about
5583 * to issue
5584 */
5585 max_rd_size = max_throttle_size;
5586 max_rd_ahead = max_throttle_size - 1;
5587 max_vector_size = max_throttle_size;
5588 } else {
5589 max_rd_size = max_upl_size;
5590 max_rd_ahead = max_rd_size * IO_SCALE(vp, 2);
5591 max_vector_size = MAX_VECTOR_UPL_SIZE;
5592 }
5593 io_start = io_size = io_req_size;
5594
5595 /*
5596 * First look for pages already in the cache
5597 * and move them to user space. But only do this
5598 * check if we are not retrieving encrypted data directly
5599 * from the filesystem; those blocks should never
5600 * be in the UBC.
5601 *
5602 * cluster_copy_ubc_data returns the resid
5603 * in io_size
5604 */
5605 if ((flags & IO_ENCRYPTED) == 0) {
5606 retval = cluster_copy_ubc_data_internal(vp, uio, (int *)&io_size, 0, 0);
5607 }
5608 /*
5609 * calculate the number of bytes actually copied
5610 * starting size - residual
5611 */
5612 xsize = io_start - io_size;
5613
5614 io_req_size -= xsize;
5615
5616 if (useVectorUPL && (xsize || (iov_base & PAGE_MASK))) {
5617 /*
5618 * We found something in the cache or we have an iov_base that's not
5619 * page-aligned.
5620 *
5621 * Issue all I/O's that have been collected within this Vectored UPL.
5622 */
5623 if (vector_upl_index) {
5624 retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
5625 reset_vector_run_state();
5626 }
5627
5628 if (xsize) {
5629 useVectorUPL = 0;
5630 }
5631
5632 /*
5633 * After this point, if we are using the Vector UPL path and the base is
5634 * not page-aligned then the UPL with that base will be the first in the vector UPL.
5635 */
5636 }
5637
5638 /*
5639 * check to see if we are finished with this request.
5640 *
5641 * If we satisfied this IO already, then io_req_size will be 0.
5642 * Otherwise, see if the IO was mis-aligned and needs to go through
5643 * the UBC to deal with the 'tail'.
5644 *
5645 */
5646 if (io_req_size == 0 || (misaligned)) {
5647 /*
5648 * see if there's another uio vector to
5649 * process that's of type IO_DIRECT
5650 *
5651 * break out of while loop to get there
5652 */
5653 break;
5654 }
5655 /*
5656 * assume the request ends on a device block boundary
5657 */
5658 io_min = devblocksize;
5659
5660 /*
5661 * we can handle I/O's in multiples of the device block size
5662 * however, if io_size isn't a multiple of devblocksize we
5663 * want to clip it back to the nearest page boundary since
5664 * we are going to have to go through cluster_read_copy to
5665 * deal with the 'overhang'... by clipping it to a PAGE_SIZE
5666 * multiple, we avoid asking the drive for the same physical
5667 * blocks twice... once for the partial page at the end of the
5668 * request and a 2nd time for the page we read into the cache
5669 * (which overlaps the end of the direct read) in order to
5670 * get at the overhang bytes
5671 */
5672 if (io_size & (devblocksize - 1)) {
5673 assert(!(flags & IO_ENCRYPTED));
5674 /*
5675 * Clip the request to the previous page size boundary
5676 * since request does NOT end on a device block boundary
5677 */
5678 io_size &= ~PAGE_MASK;
5679 io_min = PAGE_SIZE;
5680 }
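/*
 * Worked example (illustrative numbers, assuming 4 KiB pages and a 512-byte
 * devblocksize): an io_size of 10000 is not a devblocksize multiple, so it
 * is clipped to 8192 here and io_min becomes PAGE_SIZE; the 1808-byte
 * remainder eventually drops out of this loop and is finished by
 * cluster_read_copy after wait_for_dreads.
 */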
5681 if (retval || io_size < io_min) {
5682 /*
5683 * either an error or we only have the tail left to
5684 * complete via the copy path...
5685 * we may have already spun some portion of this request
5686 * off as async requests... we need to wait for the I/O
5687 * to complete before returning
5688 */
5689 goto wait_for_dreads;
5690 }
5691
5692 /*
5693 * Don't re-check the UBC data if we are looking for uncached IO
5694 * or asking for encrypted blocks.
5695 */
5696 if ((flags & IO_ENCRYPTED) == 0) {
5697 if ((xsize = io_size) > max_rd_size) {
5698 xsize = max_rd_size;
5699 }
5700
5701 io_size = 0;
5702
5703 if (!lock) {
5704 /*
5705 * We hold a lock here between the time we check the
5706 * cache and the time we issue I/O. This saves us
5707 * from having to lock the pages in the cache. Not
5708 * all clients will care about this lock but some
5709 * clients may want to guarantee stability between
5710 * here and when the I/O is issued in which case they
5711 * will take the lock exclusively.
5712 */
5713 lock = cluster_lock_direct_read(vp, LCK_RW_TYPE_SHARED);
5714 }
5715
5716 ubc_range_op(vp, uio->uio_offset, uio->uio_offset + xsize, UPL_ROP_ABSENT, (int *)&io_size);
5717
5718 if (io_size == 0) {
5719 /*
5720 * a page must have just come into the cache
5721 * since the first page in this range is no
5722 * longer absent, go back and re-evaluate
5723 */
5724 continue;
5725 }
5726 }
5727 if ((flags & IO_RETURN_ON_THROTTLE)) {
5728 if (cluster_is_throttled(vp) == THROTTLE_NOW) {
5729 if (!cluster_io_present_in_BC(vp, uio->uio_offset)) {
5730 /*
5731 * we're in the throttle window and at least 1 I/O
5732 * has already been issued by a throttleable thread
5733 * in this window, so return with EAGAIN to indicate
5734 * to the FS issuing the cluster_read call that it
5735 * should now throttle after dropping any locks
5736 */
5737 throttle_info_update_by_mount(vp->v_mount);
5738
5739 io_throttled = TRUE;
5740 goto wait_for_dreads;
5741 }
5742 }
5743 }
5744 if (io_size > max_rd_size) {
5745 io_size = max_rd_size;
5746 }
5747
5748 iov_base = uio_curriovbase(uio);
5749
5750 upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK);
5751 upl_needed_size = (upl_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
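/*
 * Illustrative example (assumes 4 KiB pages, values not from the source):
 * if iov_base ends in 0x200 and io_size is 8192, then upl_offset is 0x200
 * and upl_needed_size rounds 0x200 + 8192 up to 12288, i.e. the three
 * pages the user buffer actually spans.
 */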
5752
5753 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_START,
5754 (int)upl_offset, upl_needed_size, (int)iov_base, io_size, 0);
5755
5756 if (upl_offset == 0 && ((io_size & PAGE_MASK) == 0)) {
5757 no_zero_fill = 1;
5758 } else {
5759 no_zero_fill = 0;
5760 }
5761
5762 vm_map_t map = UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ? current_map() : kernel_map;
5763 for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) {
5764 pages_in_pl = 0;
5765 upl_size = (upl_size_t)upl_needed_size;
5766 upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
5767 if (no_zero_fill) {
5768 upl_flags |= UPL_NOZEROFILL;
5769 }
5770 if (force_data_sync) {
5771 upl_flags |= UPL_FORCE_DATA_SYNC;
5772 }
5773
5774 kret = vm_map_create_upl(map,
5775 (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
5776 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, VM_KERN_MEMORY_FILE);
5777
5778 if (kret != KERN_SUCCESS) {
5779 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
5780 (int)upl_offset, upl_size, io_size, kret, 0);
5781 /*
5782 * failed to get pagelist
5783 *
5784 * we may have already spun some portion of this request
5785 * off as async requests... we need to wait for the I/O
5786 * to complete before returning
5787 */
5788 goto wait_for_dreads;
5789 }
5790 pages_in_pl = upl_size / PAGE_SIZE;
5791 pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
5792
5793 for (i = 0; i < pages_in_pl; i++) {
5794 if (!upl_page_present(pl, i)) {
5795 break;
5796 }
5797 }
5798 if (i == pages_in_pl) {
5799 break;
5800 }
5801
5802 ubc_upl_abort(upl, 0);
5803 }
5804 if (force_data_sync >= 3) {
5805 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
5806 (int)upl_offset, upl_size, io_size, kret, 0);
5807
5808 goto wait_for_dreads;
5809 }
5810 /*
5811 * Consider the possibility that upl_size wasn't satisfied.
5812 */
5813 if (upl_size < upl_needed_size) {
5814 if (upl_size && upl_offset == 0) {
5815 io_size = upl_size;
5816 } else {
5817 io_size = 0;
5818 }
5819 }
5820 if (io_size == 0) {
5821 ubc_upl_abort(upl, 0);
5822 goto wait_for_dreads;
5823 }
5824 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
5825 (int)upl_offset, upl_size, io_size, kret, 0);
5826
5827 if (useVectorUPL) {
5828 vm_offset_t end_off = ((iov_base + io_size) & PAGE_MASK);
5829 if (end_off) {
5830 issueVectorUPL = 1;
5831 }
5832 /*
5833 * After this point, if we are using a vector UPL, then
5834 * either all the UPL elements end on a page boundary OR
5835 * this UPL is the last element because it does not end
5836 * on a page boundary.
5837 */
5838 }
5839
5840 /*
5841 * request asynchronously so that we can overlap
5842 * the preparation of the next I/O
5843 * if there are already too many outstanding reads
5844 * wait until some have completed before issuing the next read
5845 */
5846 cluster_iostate_wait(&iostate, max_rd_ahead, "cluster_read_direct");
5847
5848 if (iostate.io_error) {
5849 /*
5850 * one of the earlier reads we issued ran into a hard error
5851 * don't issue any more reads, cleanup the UPL
5852 * that was just created but not used, then
5853 * go wait for any other reads to complete before
5854 * returning the error to the caller
5855 */
5856 ubc_upl_abort(upl, 0);
5857
5858 goto wait_for_dreads;
5859 }
5860 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_START,
5861 upl, (int)upl_offset, (int)uio->uio_offset, io_size, 0);
5862
5863 if (!useVectorUPL) {
5864 if (no_zero_fill) {
5865 io_flag &= ~CL_PRESERVE;
5866 } else {
5867 io_flag |= CL_PRESERVE;
5868 }
5869
5870 retval = cluster_io(vp, upl, upl_offset, uio->uio_offset, io_size, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
5871 } else {
5872 if (!vector_upl_index) {
5873 vector_upl = vector_upl_create(upl_offset, uio->uio_iovcnt);
5874 v_upl_uio_offset = uio->uio_offset;
5875 vector_upl_offset = upl_offset;
5876 }
5877
5878 vector_upl_set_subupl(vector_upl, upl, upl_size);
5879 vector_upl_set_iostate(vector_upl, upl, vector_upl_size, upl_size);
5880 vector_upl_index++;
5881 vector_upl_size += upl_size;
5882 vector_upl_iosize += io_size;
5883
5884 if (issueVectorUPL || vector_upl_index == vector_upl_max_upls(vector_upl) || vector_upl_size >= max_vector_size) {
5885 retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
5886 reset_vector_run_state();
5887 }
5888 }
5889
5890 if (lock) {
5891 // We don't need to wait for the I/O to complete
5892 cluster_unlock_direct_read(lock);
5893 lock = NULL;
5894 }
5895
5896 /*
5897 * update the uio structure
5898 */
5899 if ((flags & IO_ENCRYPTED) && (max_io_size < io_size)) {
5900 uio_update(uio, (user_size_t)max_io_size);
5901 } else {
5902 uio_update(uio, (user_size_t)io_size);
5903 }
5904
5905 io_req_size -= io_size;
5906
5907 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_END,
5908 upl, (int)uio->uio_offset, io_req_size, retval, 0);
5909 } /* end while */
5910
5911 if (retval == 0 && iostate.io_error == 0 && io_req_size == 0 && uio->uio_offset < filesize) {
5912 retval = cluster_io_type(uio, read_type, read_length, 0);
5913
5914 if (retval == 0 && *read_type == IO_DIRECT) {
5915 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_NONE,
5916 (int)uio->uio_offset, (int)filesize, *read_type, *read_length, 0);
5917
5918 goto next_dread;
5919 }
5920 }
5921
5922 wait_for_dreads:
5923
5924 if (retval == 0 && iostate.io_error == 0 && useVectorUPL && vector_upl_index) {
5925 retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
5926 reset_vector_run_state();
5927 }
5928
5929 // We don't need to wait for the I/O to complete
5930 if (lock) {
5931 cluster_unlock_direct_read(lock);
5932 }
5933
5934 /*
5935 * make sure all async reads that are part of this stream
5936 * have completed before we return
5937 */
5938 cluster_iostate_wait(&iostate, 0, "cluster_read_direct");
5939
5940 if (iostate.io_error) {
5941 retval = iostate.io_error;
5942 }
5943
5944 lck_mtx_destroy(&iostate.io_mtxp, &cl_mtx_grp);
5945
5946 if (io_throttled == TRUE && retval == 0) {
5947 retval = EAGAIN;
5948 }
5949
5950 vm_map_offset_t current_page_size, current_page_mask;
5951 current_page_size = vm_map_page_size(current_map());
5952 current_page_mask = vm_map_page_mask(current_map());
5953 if (uio_acct) {
5954 assert(uio_acct->uio_segflg != UIO_SYSSPACE);
5955 off_t bytes_to_prefault = 0, bytes_prefaulted = 0;
5956 user_addr_t curr_iov_base = 0;
5957 user_addr_t curr_iov_end = 0;
5958 user_size_t curr_iov_len = 0;
5959
5960 bytes_to_prefault = uio_offset(uio) - uio_offset(uio_acct);
5961
5962 for (; bytes_prefaulted < bytes_to_prefault;) {
5963 curr_iov_base = uio_curriovbase(uio_acct);
5964 curr_iov_len = MIN(uio_curriovlen(uio_acct), bytes_to_prefault - bytes_prefaulted);
5965 curr_iov_end = curr_iov_base + curr_iov_len;
5966
5967 for (; curr_iov_base < curr_iov_end;) {
5968 /*
5969 * This is specifically done for pmap accounting purposes.
5970 * vm_pre_fault() will call vm_fault() to enter the page into
5971 * the pmap if there isn't _a_ physical page for that VA already.
5972 */
5973 vm_pre_fault(vm_map_trunc_page(curr_iov_base, current_page_mask), VM_PROT_READ);
5974 curr_iov_base += current_page_size;
5975 bytes_prefaulted += current_page_size;
5976 }
5977 /*
5978 * Use update instead of advance so we can see how many iovs we processed.
5979 */
5980 uio_update(uio_acct, curr_iov_len);
5981 }
5982 uio_free(uio_acct);
5983 uio_acct = NULL;
5984 }
5985
5986 if (io_req_size && retval == 0) {
5987 /*
5988 * we couldn't handle the tail of this request in DIRECT mode
5989 * so fire it through the copy path
5990 */
5991 if (flags & IO_ENCRYPTED) {
5992 /*
5993 * We cannot fall back to the copy path for encrypted I/O. If this
5994 * happens, there is something wrong with the user buffer passed
5995 * down.
5996 */
5997 retval = EFAULT;
5998 } else {
5999 retval = cluster_read_copy(vp, uio, io_req_size, filesize, flags, callback, callback_arg);
6000 }
6001
6002 *read_type = IO_UNKNOWN;
6003 }
6004 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_END,
6005 (int)uio->uio_offset, (int)uio_resid(uio), io_req_size, retval, 0);
6006
6007 return retval;
6008 }
6009
6010
6011 static int
6012 cluster_read_contig(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t *read_length,
6013 int (*callback)(buf_t, void *), void *callback_arg, int flags)
6014 {
6015 upl_page_info_t *pl;
6016 upl_t upl[MAX_VECTS];
6017 vm_offset_t upl_offset;
6018 addr64_t dst_paddr = 0;
6019 user_addr_t iov_base;
6020 off_t max_size;
6021 upl_size_t upl_size;
6022 vm_size_t upl_needed_size;
6023 mach_msg_type_number_t pages_in_pl;
6024 upl_control_flags_t upl_flags;
6025 kern_return_t kret;
6026 struct clios iostate;
6027 int error = 0;
6028 int cur_upl = 0;
6029 int num_upl = 0;
6030 int n;
6031 u_int32_t xsize;
6032 u_int32_t io_size;
6033 u_int32_t devblocksize;
6034 u_int32_t mem_alignment_mask;
6035 u_int32_t tail_size = 0;
6036 int bflag;
6037
6038 if (flags & IO_PASSIVE) {
6039 bflag = CL_PASSIVE;
6040 } else {
6041 bflag = 0;
6042 }
6043
6044 if (flags & IO_NOCACHE) {
6045 bflag |= CL_NOCACHE;
6046 }
6047
6048 /*
6049 * When we enter this routine, we know
6050 * -- the read_length will not exceed the current iov_len
6051 * -- the target address is physically contiguous for read_length
6052 */
6053 cluster_syncup(vp, filesize, callback, callback_arg, PUSH_SYNC);
6054
6055 devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize;
6056 mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask;
6057
6058 iostate.io_completed = 0;
6059 iostate.io_issued = 0;
6060 iostate.io_error = 0;
6061 iostate.io_wanted = 0;
6062
6063 lck_mtx_init(&iostate.io_mtxp, &cl_mtx_grp, LCK_ATTR_NULL);
6064
6065 next_cread:
6066 io_size = *read_length;
6067
6068 max_size = filesize - uio->uio_offset;
6069
6070 if (io_size > max_size) {
6071 io_size = (u_int32_t)max_size;
6072 }
6073
6074 iov_base = uio_curriovbase(uio);
6075
6076 upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK);
6077 upl_needed_size = upl_offset + io_size;
6078
6079 pages_in_pl = 0;
6080 upl_size = (upl_size_t)upl_needed_size;
6081 upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
6082
6083
6084 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 92)) | DBG_FUNC_START,
6085 (int)upl_offset, (int)upl_size, (int)iov_base, io_size, 0);
6086
6087 vm_map_t map = UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ? current_map() : kernel_map;
6088 kret = vm_map_get_upl(map,
6089 #if HAS_MTE || HAS_MTE_EMULATION_SHIMS
6090 vm_memtag_canonicalize(map, vm_map_trunc_page(iov_base, vm_map_page_mask(map))),
6091 #else /* HAS_MTE || HAS_MTE_EMULATION_SHIMS */
6092 vm_map_trunc_page(iov_base, vm_map_page_mask(map)),
6093 #endif /* HAS_MTE || HAS_MTE_EMULATION_SHIMS */
6094 &upl_size, &upl[cur_upl], NULL, &pages_in_pl, &upl_flags, VM_KERN_MEMORY_FILE, 0);
6095
6096 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 92)) | DBG_FUNC_END,
6097 (int)upl_offset, upl_size, io_size, kret, 0);
6098
6099 if (kret != KERN_SUCCESS) {
6100 /*
6101 * failed to get pagelist
6102 */
6103 error = EINVAL;
6104 goto wait_for_creads;
6105 }
6106 num_upl++;
6107
6108 if (!(upl_flags & UPL_PHYS_CONTIG)) {
6109 /*
6110 * The created UPL needs to have the UPL_PHYS_CONTIG flag.
6111 */
6112 error = EINVAL;
6113 goto wait_for_creads;
6114 }
6115
6116 if (upl_size < upl_needed_size) {
6117 /*
6118 * The upl_size wasn't satisfied.
6119 */
6120 error = EINVAL;
6121 goto wait_for_creads;
6122 }
6123 pl = ubc_upl_pageinfo(upl[cur_upl]);
6124
6125 dst_paddr = ((addr64_t)upl_phys_page(pl, 0) << PAGE_SHIFT) + (addr64_t)upl_offset;
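/*
 * Descriptive note: upl_phys_page(pl, 0) is the physical page number of the
 * first page in the UPL; since UPL_PHYS_CONTIG was verified above, shifting
 * it by PAGE_SHIFT and adding the sub-page upl_offset yields the physical
 * byte address at which this contiguous transfer starts.
 */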
6126
6127 while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {
6128 u_int32_t head_size;
6129
6130 head_size = devblocksize - (u_int32_t)(uio->uio_offset & (devblocksize - 1));
6131
6132 if (head_size > io_size) {
6133 head_size = io_size;
6134 }
6135
6136 error = cluster_align_phys_io(vp, uio, dst_paddr, head_size, CL_READ, callback, callback_arg);
6137
6138 if (error) {
6139 goto wait_for_creads;
6140 }
6141
6142 upl_offset += head_size;
6143 dst_paddr += head_size;
6144 io_size -= head_size;
6145
6146 iov_base += head_size;
6147 }
6148 if ((u_int32_t)iov_base & mem_alignment_mask) {
6149 /*
6150 * the request doesn't start on a memory boundary
6151 * that the underlying DMA engine can handle...
6152 * return an error instead of going through
6153 * the slow copy path since the intent of this
6154 * path is direct I/O to device memory
6155 */
6156 error = EINVAL;
6157 goto wait_for_creads;
6158 }
6159
6160 tail_size = io_size & (devblocksize - 1);
6161
6162 io_size -= tail_size;
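/*
 * Worked example (illustrative values, assuming a 512-byte devblocksize):
 * if the starting offset sits 100 bytes into a device block and io_size is
 * 1000, the head loop above first moves 412 bytes through
 * cluster_align_phys_io to reach a block boundary; of the remaining 588
 * bytes, tail_size becomes 76, leaving a 512-byte aligned middle for the
 * async cluster_io loop below, with the tail finished by another
 * cluster_align_phys_io call after the reads complete.
 */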
6163
6164 while (io_size && error == 0) {
6165 if (io_size > MAX_IO_CONTIG_SIZE) {
6166 xsize = MAX_IO_CONTIG_SIZE;
6167 } else {
6168 xsize = io_size;
6169 }
6170 /*
6171 * request asynchronously so that we can overlap
6172 * the preparation of the next I/O... we'll do
6173 * the commit after all the I/O has completed
6174 * since it's all issued against the same UPL...
6175 * if there are already too many outstanding reads,
6176 * wait until some have completed before issuing the next
6177 */
6178 cluster_iostate_wait(&iostate, MAX_IO_CONTIG_SIZE * IO_SCALE(vp, 2), "cluster_read_contig");
6179
6180 if (iostate.io_error) {
6181 /*
6182 * one of the earlier reads we issued ran into a hard error
6183 * don't issue any more reads...
6184 * go wait for any other reads to complete before
6185 * returning the error to the caller
6186 */
6187 goto wait_for_creads;
6188 }
6189 error = cluster_io(vp, upl[cur_upl], upl_offset, uio->uio_offset, xsize,
6190 CL_READ | CL_NOZERO | CL_DEV_MEMORY | CL_ASYNC | bflag,
6191 (buf_t)NULL, &iostate, callback, callback_arg);
6192 /*
6193 * The cluster_io read was issued successfully,
6194 * update the uio structure
6195 */
6196 if (error == 0) {
6197 uio_update(uio, (user_size_t)xsize);
6198
6199 dst_paddr += xsize;
6200 upl_offset += xsize;
6201 io_size -= xsize;
6202 }
6203 }
6204 if (error == 0 && iostate.io_error == 0 && tail_size == 0 && num_upl < MAX_VECTS && uio->uio_offset < filesize) {
6205 error = cluster_io_type(uio, read_type, read_length, 0);
6206
6207 if (error == 0 && *read_type == IO_CONTIG) {
6208 cur_upl++;
6209 goto next_cread;
6210 }
6211 } else {
6212 *read_type = IO_UNKNOWN;
6213 }
6214
6215 wait_for_creads:
6216 /*
6217 * make sure all async reads that are part of this stream
6218 * have completed before we proceed
6219 */
6220 cluster_iostate_wait(&iostate, 0, "cluster_read_contig");
6221
6222 if (iostate.io_error) {
6223 error = iostate.io_error;
6224 }
6225
6226 lck_mtx_destroy(&iostate.io_mtxp, &cl_mtx_grp);
6227
6228 if (error == 0 && tail_size) {
6229 error = cluster_align_phys_io(vp, uio, dst_paddr, tail_size, CL_READ, callback, callback_arg);
6230 }
6231
6232 for (n = 0; n < num_upl; n++) {
6233 /*
6234 * just release our hold on each physically contiguous
6235 * region without changing any state
6236 */
6237 ubc_upl_abort(upl[n], 0);
6238 }
6239
6240 return error;
6241 }
6242
6243
6244 static int
6245 cluster_io_type(struct uio *uio, int *io_type, u_int32_t *io_length, u_int32_t min_length)
6246 {
6247 user_size_t iov_len;
6248 user_addr_t iov_base = 0;
6249 upl_t upl;
6250 upl_size_t upl_size;
6251 upl_control_flags_t upl_flags;
6252 int retval = 0;
6253
6254 /*
6255 * skip over any empty vectors
6256 */
6257 uio_update(uio, (user_size_t)0);
6258
6259 iov_len = MIN(uio_curriovlen(uio), uio_resid(uio));
6260
6261 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 94)) | DBG_FUNC_START, uio, (int)iov_len, 0, 0, 0);
6262
6263 if (iov_len) {
6264 iov_base = uio_curriovbase(uio);
6265 /*
6266 * make sure the size of the vector isn't too big...
6267 * internally, we want to handle all of the I/O in
6268 * chunk sizes that fit in a 32 bit int
6269 */
6270 if (iov_len > (user_size_t)MAX_IO_REQUEST_SIZE) {
6271 upl_size = MAX_IO_REQUEST_SIZE;
6272 } else {
6273 upl_size = (u_int32_t)iov_len;
6274 }
6275
6276 upl_flags = UPL_QUERY_OBJECT_TYPE;
6277
6278 vm_map_t map = UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ? current_map() : kernel_map;
6279 if ((vm_map_get_upl(map,
6280 #if HAS_MTE || HAS_MTE_EMULATION_SHIMS
6281 vm_memtag_canonicalize(map, vm_map_trunc_page(iov_base, vm_map_page_mask(map))),
6282 #else /* HAS_MTE || HAS_MTE_EMULATION_SHIMS */
6283 vm_map_trunc_page(iov_base, vm_map_page_mask(map)),
6284 #endif /* HAS_MTE || HAS_MTE_EMULATION_SHIMS */
6285 &upl_size, &upl, NULL, NULL, &upl_flags, VM_KERN_MEMORY_FILE, 0)) != KERN_SUCCESS) {
6286 /*
6287 * the user app must have passed in an invalid address
6288 */
6289 retval = EFAULT;
6290 }
6291 if (upl_size == 0) {
6292 retval = EFAULT;
6293 }
6294
6295 *io_length = upl_size;
6296
6297 if (upl_flags & UPL_PHYS_CONTIG) {
6298 *io_type = IO_CONTIG;
6299 } else if (iov_len >= min_length) {
6300 *io_type = IO_DIRECT;
6301 } else {
6302 *io_type = IO_COPY;
6303 }
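/*
 * Summary of the classification above (descriptive note only): a physically
 * contiguous buffer goes to the contig path, a non-contiguous buffer of at
 * least min_length goes to the direct path, and anything shorter falls back
 * to the copy (cached) path.
 */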
6304 } else {
6305 /*
6306 * nothing left to do for this uio
6307 */
6308 *io_length = 0;
6309 *io_type = IO_UNKNOWN;
6310 }
6311 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 94)) | DBG_FUNC_END, iov_base, *io_type, *io_length, retval, 0);
6312
6313 if (*io_type == IO_DIRECT &&
6314 vm_map_page_shift(current_map()) < PAGE_SHIFT) {
6315 /* no direct I/O for sub-page-size address spaces */
6316 DEBUG4K_VFS("io_type IO_DIRECT -> IO_COPY\n");
6317 *io_type = IO_COPY;
6318 }
6319
6320 return retval;
6321 }
6322
6323
6324 /*
6325 * generate advisory I/O's in the largest chunks possible
6326 * the completed pages will be released into the VM cache
6327 */
6328 int
6329 advisory_read(vnode_t vp, off_t filesize, off_t f_offset, int resid)
6330 {
6331 return advisory_read_ext(vp, filesize, f_offset, resid, NULL, NULL, CL_PASSIVE);
6332 }
6333
6334 int
6335 advisory_read_ext(vnode_t vp, off_t filesize, off_t f_offset, int resid, int (*callback)(buf_t, void *), void *callback_arg, int bflag)
6336 {
6337 upl_page_info_t *pl;
6338 upl_t upl = NULL;
6339 vm_offset_t upl_offset;
6340 int upl_size;
6341 off_t upl_f_offset;
6342 int start_offset;
6343 int start_pg;
6344 int last_pg;
6345 int pages_in_upl;
6346 off_t max_size;
6347 int io_size;
6348 kern_return_t kret;
6349 int retval = 0;
6350 int issued_io;
6351 int skip_range;
6352 uint32_t max_io_size;
6353
6354
6355 if (!UBCINFOEXISTS(vp)) {
6356 return EINVAL;
6357 }
6358
6359 if (f_offset < 0 || resid < 0) {
6360 return EINVAL;
6361 }
6362
6363 max_io_size = cluster_max_io_size(vp->v_mount, CL_READ);
6364
6365 if (disk_conditioner_mount_is_ssd(vp->v_mount)) {
6366 if (max_io_size > speculative_prefetch_max_iosize) {
6367 max_io_size = speculative_prefetch_max_iosize;
6368 }
6369 }
6370
6371 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_START,
6372 (int)f_offset, resid, (int)filesize, 0, 0);
6373
6374 while (resid && f_offset < filesize && retval == 0) {
6375 /*
6376 * compute the size of the upl needed to encompass
6377 * the requested read... limit each call to cluster_io
6378 * to the maximum UPL size... cluster_io will clip if
6379 * this exceeds the maximum io_size for the device,
6380 * make sure to account for
6381 * a starting offset that's not page aligned
6382 */
6383 start_offset = (int)(f_offset & PAGE_MASK_64);
6384 upl_f_offset = f_offset - (off_t)start_offset;
6385 max_size = filesize - f_offset;
6386
6387 if (resid < max_size) {
6388 io_size = resid;
6389 } else {
6390 io_size = (int)max_size;
6391 }
6392
6393 upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
6394 if ((uint32_t)upl_size > max_io_size) {
6395 upl_size = max_io_size;
6396 }
6397
6398 skip_range = 0;
6399 /*
6400 * return the number of contiguously present pages in the cache
6401 * starting at upl_f_offset within the file
6402 */
6403 ubc_range_op(vp, upl_f_offset, upl_f_offset + upl_size, UPL_ROP_PRESENT, &skip_range);
6404
6405 if (skip_range) {
6406 /*
6407 * skip over pages already present in the cache
6408 */
6409 io_size = skip_range - start_offset;
6410
6411 f_offset += io_size;
6412 resid -= io_size;
6413
6414 if (skip_range == upl_size) {
6415 continue;
6416 }
6417 /*
6418 * have to issue some real I/O
6419 * at this point, we know it's starting on a page boundary
6420 * because we've skipped over at least the first page in the request
6421 */
6422 start_offset = 0;
6423 upl_f_offset += skip_range;
6424 upl_size -= skip_range;
6425 }
6426 pages_in_upl = upl_size / PAGE_SIZE;
6427
6428 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) | DBG_FUNC_START,
6429 upl, (int)upl_f_offset, upl_size, start_offset, 0);
6430
6431 kret = ubc_create_upl_kernel(vp,
6432 upl_f_offset,
6433 upl_size,
6434 &upl,
6435 &pl,
6436 UPL_RET_ONLY_ABSENT | UPL_SET_LITE,
6437 VM_KERN_MEMORY_FILE);
6438 if (kret != KERN_SUCCESS) {
6439 return retval;
6440 }
6441 issued_io = 0;
6442
6443 /*
6444 * before we start marching forward, we must make sure we end on
6445 * a present page, otherwise we will be working with a freed
6446 * upl
6447 */
6448 for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--) {
6449 if (upl_page_present(pl, last_pg)) {
6450 break;
6451 }
6452 }
6453 pages_in_upl = last_pg + 1;
6454
6455
6456 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) | DBG_FUNC_END,
6457 upl, (int)upl_f_offset, upl_size, start_offset, 0);
6458
6459
6460 for (last_pg = 0; last_pg < pages_in_upl;) {
6461 /*
6462 * scan from the beginning of the upl looking for the first
6463 * page that is present... this will become the first page in
6464 * the request we're going to make to 'cluster_io'... if all
6465 * of the pages are absent, we won't call through to 'cluster_io'
6466 */
6467 for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
6468 if (upl_page_present(pl, start_pg)) {
6469 break;
6470 }
6471 }
6472
6473 /*
6474 * scan from the starting present page looking for an absent
6475 * page before the end of the upl is reached, if we
6476 * find one, then it will terminate the range of pages being
6477 * presented to 'cluster_io'
6478 */
6479 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
6480 if (!upl_page_present(pl, last_pg)) {
6481 break;
6482 }
6483 }
6484
6485 if (last_pg > start_pg) {
6486 /*
6487 * we found a range of pages that must be filled
6488 * if the last page in this range is the last page of the file
6489 * we may have to clip the size of it to keep from reading past
6490 * the end of the last physical block associated with the file
6491 */
6492 upl_offset = start_pg * PAGE_SIZE;
6493 io_size = (last_pg - start_pg) * PAGE_SIZE;
6494
6495 if ((off_t)(upl_f_offset + upl_offset + io_size) > filesize) {
6496 io_size = (int)(filesize - (upl_f_offset + upl_offset));
6497 }
6498
6499 /*
6500 * issue an asynchronous read to cluster_io
6501 */
6502 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size,
6503 CL_ASYNC | CL_READ | CL_COMMIT | CL_AGE | bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
6504
6505 issued_io = 1;
6506 }
6507 }
6508 if (issued_io == 0) {
6509 ubc_upl_abort(upl, 0);
6510 }
6511
6512 io_size = upl_size - start_offset;
6513
6514 if (io_size > resid) {
6515 io_size = resid;
6516 }
6517 f_offset += io_size;
6518 resid -= io_size;
6519 }
6520
6521 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_END,
6522 (int)f_offset, resid, retval, 0, 0);
6523
6524 return retval;
6525 }
6526
6527
6528 int
6529 cluster_push(vnode_t vp, int flags)
6530 {
6531 return cluster_push_ext(vp, flags, NULL, NULL);
6532 }
6533
6534
6535 int
6536 cluster_push_ext(vnode_t vp, int flags, int (*callback)(buf_t, void *), void *callback_arg)
6537 {
6538 return cluster_push_err(vp, flags, callback, callback_arg, NULL);
6539 }
6540
6541 /* write errors via err, but return the number of clusters written */
6542 extern uint32_t system_inshutdown;
6543 uint32_t cl_sparse_push_error = 0;
6544 int
6545 cluster_push_err(vnode_t vp, int flags, int (*callback)(buf_t, void *), void *callback_arg, int *err)
6546 {
6547 int retval;
6548 int my_sparse_wait = 0;
6549 struct cl_writebehind *wbp;
6550 int local_err = 0;
6551
6552 if (err) {
6553 *err = 0;
6554 }
6555
6556 if (!UBCINFOEXISTS(vp)) {
6557 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, kdebug_vnode(vp), flags, 0, -1, 0);
6558 return 0;
6559 }
6560 /* return if deferred write is set */
6561 if (((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE) && (flags & IO_DEFWRITE)) {
6562 return 0;
6563 }
6564 if ((wbp = cluster_get_wbp(vp, CLW_RETURNLOCKED)) == NULL) {
6565 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, kdebug_vnode(vp), flags, 0, -2, 0);
6566 return 0;
6567 }
6568 if (!ISSET(flags, IO_SYNC) && wbp->cl_number == 0 && wbp->cl_scmap == NULL) {
6569 lck_mtx_unlock(&wbp->cl_lockw);
6570
6571 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, kdebug_vnode(vp), flags, 0, -3, 0);
6572 return 0;
6573 }
6574 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_START,
6575 wbp->cl_scmap, wbp->cl_number, flags, 0, 0);
6576
6577 /*
6578 * if we have an fsync in progress, we don't want to allow any additional
6579 * sync/fsync/close(s) to occur until it finishes.
6580 * note that it's possible for writes to continue to occur to this file
6581 * while we're waiting, and also once the fsync starts to clean, if we're
6582 * in the sparse map case
6583 */
6584 while (wbp->cl_sparse_wait) {
6585 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_START, kdebug_vnode(vp), 0, 0, 0, 0);
6586
6587 msleep((caddr_t)&wbp->cl_sparse_wait, &wbp->cl_lockw, PRIBIO + 1, "cluster_push_ext", NULL);
6588
6589 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_END, kdebug_vnode(vp), 0, 0, 0, 0);
6590 }
6591 if (flags & IO_SYNC) {
6592 my_sparse_wait = 1;
6593 wbp->cl_sparse_wait = 1;
6594
6595 /*
6596 * this is an fsync (or equivalent)... we must wait for any existing async
6597 * cleaning operations to complete before we evaluate the current state
6598 * and finish cleaning... this ensures that all writes issued before this
6599 * fsync actually get cleaned to the disk before this fsync returns
6600 */
6601 while (wbp->cl_sparse_pushes) {
6602 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 98)) | DBG_FUNC_START, kdebug_vnode(vp), 0, 0, 0, 0);
6603
6604 msleep((caddr_t)&wbp->cl_sparse_pushes, &wbp->cl_lockw, PRIBIO + 1, "cluster_push_ext", NULL);
6605
6606 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 98)) | DBG_FUNC_END, kdebug_vnode(vp), 0, 0, 0, 0);
6607 }
6608 }
6609 if (wbp->cl_scmap) {
6610 void *scmap;
6611
6612 if (wbp->cl_sparse_pushes < SPARSE_PUSH_LIMIT) {
6613 scmap = wbp->cl_scmap;
6614 wbp->cl_scmap = NULL;
6615
6616 wbp->cl_sparse_pushes++;
6617
6618 lck_mtx_unlock(&wbp->cl_lockw);
6619
6620 retval = sparse_cluster_push(wbp, &scmap, vp, ubc_getsize(vp), PUSH_ALL, flags, callback, callback_arg, FALSE);
6621
6622 lck_mtx_lock(&wbp->cl_lockw);
6623
6624 wbp->cl_sparse_pushes--;
6625
6626 if (retval) {
6627 if (wbp->cl_scmap != NULL) {
6628 /*
6629 * panic("cluster_push_err: Expected NULL cl_scmap\n");
6630 *
6631 * This can happen if we get an error from the underlying FS
6632 * e.g. ENOSPC, EPERM or EIO etc. We hope that these errors
6633 * are transient and the I/Os will succeed at a later point.
6634 *
6635 * The tricky part here is that a new sparse cluster has been
6636 * allocated and tracking a different set of dirty pages. So these
6637 * pages are not going to be pushed out with the next sparse_cluster_push.
6638 * An explicit msync or file close will, however, push the pages out.
6639 *
6640 * What if those calls still don't work? In that case, during shutdown we keep
6641 * trying till we succeed...
6642 */
6643
6644 if (system_inshutdown) {
6645 if ((retval == ENOSPC) && (vp->v_mount->mnt_flag & (MNT_LOCAL | MNT_REMOVABLE)) == MNT_LOCAL) {
6646 os_atomic_inc(&cl_sparse_push_error, relaxed);
6647 }
6648 } else {
6649 vfs_drt_control(&scmap, 0); /* emit stats and free this memory. Dirty pages stay intact. */
6650 scmap = NULL;
6651 }
6652 } else {
6653 wbp->cl_scmap = scmap;
6654 }
6655 }
6656
6657 if (wbp->cl_sparse_wait && wbp->cl_sparse_pushes == 0) {
6658 wakeup((caddr_t)&wbp->cl_sparse_pushes);
6659 }
6660 } else {
6661 retval = sparse_cluster_push(wbp, &(wbp->cl_scmap), vp, ubc_getsize(vp), PUSH_ALL, flags, callback, callback_arg, FALSE);
6662 }
6663
6664 local_err = retval;
6665
6666 if (err) {
6667 *err = retval;
6668 }
6669 retval = 1;
6670 } else {
6671 retval = cluster_try_push(wbp, vp, ubc_getsize(vp), PUSH_ALL, flags, callback, callback_arg, &local_err, FALSE);
6672 if (err) {
6673 *err = local_err;
6674 }
6675 }
6676 lck_mtx_unlock(&wbp->cl_lockw);
6677
6678 if (flags & IO_SYNC) {
6679 (void)vnode_waitforwrites(vp, 0, 0, 0, "cluster_push");
6680 }
6681
6682 if (my_sparse_wait) {
6683 /*
6684 * I'm the owner of the serialization token
6685 * clear it and wakeup anyone that is waiting
6686 * for me to finish
6687 */
6688 lck_mtx_lock(&wbp->cl_lockw);
6689
6690 wbp->cl_sparse_wait = 0;
6691 wakeup((caddr_t)&wbp->cl_sparse_wait);
6692
6693 lck_mtx_unlock(&wbp->cl_lockw);
6694 }
6695 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_END,
6696 wbp->cl_scmap, wbp->cl_number, retval, local_err, 0);
6697
6698 return retval;
6699 }
6700
6701
6702 __private_extern__ void
6703 cluster_release(struct ubc_info *ubc)
6704 {
6705 struct cl_writebehind *wbp;
6706 struct cl_readahead *rap;
6707
6708 if ((wbp = ubc->cl_wbehind)) {
6709 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_START, ubc, wbp->cl_scmap, 0, 0, 0);
6710
6711 if (wbp->cl_scmap) {
6712 vfs_drt_control(&(wbp->cl_scmap), 0);
6713 }
6714 lck_mtx_destroy(&wbp->cl_lockw, &cl_mtx_grp);
6715 zfree(cl_wr_zone, wbp);
6716 ubc->cl_wbehind = NULL;
6717 } else {
6718 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_START, ubc, 0, 0, 0, 0);
6719 }
6720
6721 if ((rap = ubc->cl_rahead)) {
6722 lck_mtx_destroy(&rap->cl_lockr, &cl_mtx_grp);
6723 zfree(cl_rd_zone, rap);
6724 ubc->cl_rahead = NULL;
6725 }
6726
6727 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_END, ubc, rap, wbp, 0, 0);
6728 }
6729
6730
6731 static int
6732 cluster_try_push(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int push_flag, int io_flags, int (*callback)(buf_t, void *), void *callback_arg, int *err, boolean_t vm_initiated)
6733 {
6734 int cl_index;
6735 int cl_index1;
6736 int min_index;
6737 int cl_len;
6738 int cl_pushed = 0;
6739 struct cl_wextent l_clusters[MAX_CLUSTERS];
6740 u_int max_cluster_pgcount;
6741 int error = 0;
6742
6743 max_cluster_pgcount = MAX_CLUSTER_SIZE(vp) / PAGE_SIZE;
6744 /*
6745 * the write behind context exists and has
6746 * already been locked...
6747 */
6748 if (wbp->cl_number == 0) {
6749 /*
6750 * no clusters to push
6751 * return number of empty slots
6752 */
6753 return MAX_CLUSTERS;
6754 }
6755
6756 /*
6757 * make a local 'sorted' copy of the clusters
6758 * and clear wbp->cl_number so that new clusters can
6759 * be developed
6760 */
6761 for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
6762 for (min_index = -1, cl_index1 = 0; cl_index1 < wbp->cl_number; cl_index1++) {
6763 if (wbp->cl_clusters[cl_index1].b_addr == wbp->cl_clusters[cl_index1].e_addr) {
6764 continue;
6765 }
6766 if (min_index == -1) {
6767 min_index = cl_index1;
6768 } else if (wbp->cl_clusters[cl_index1].b_addr < wbp->cl_clusters[min_index].b_addr) {
6769 min_index = cl_index1;
6770 }
6771 }
6772 if (min_index == -1) {
6773 break;
6774 }
6775
6776 l_clusters[cl_index].b_addr = wbp->cl_clusters[min_index].b_addr;
6777 l_clusters[cl_index].e_addr = wbp->cl_clusters[min_index].e_addr;
6778 l_clusters[cl_index].io_flags = wbp->cl_clusters[min_index].io_flags;
6779
6780 wbp->cl_clusters[min_index].b_addr = wbp->cl_clusters[min_index].e_addr;
6781 }
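/*
 * Descriptive note: the loop above is effectively a selection sort by
 * b_addr... each pass finds the remaining cluster with the lowest starting
 * page, copies it into l_clusters[], and empties the original slot by
 * setting b_addr equal to e_addr, so the local copy ends up in ascending
 * file order.
 */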
6782 wbp->cl_number = 0;
6783
6784 cl_len = cl_index;
6785
6786 /* skip switching to the sparse cluster mechanism if on diskimage */
6787 if (((push_flag & PUSH_DELAY) && cl_len == MAX_CLUSTERS) &&
6788 !(vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV)) {
6789 int i;
6790
6791 /*
6792 * determine if we appear to be writing the file sequentially
6793 * if not, by returning without having pushed any clusters
6794 * we will cause this vnode to be pushed into the sparse cluster mechanism
6795 * used for managing more random I/O patterns
6796 *
6797 * we know that we've got all clusters currently in use and the next write doesn't fit into one of them...
6798 * that's why we're in try_push with PUSH_DELAY...
6799 *
6800 * check to make sure that all the clusters except the last one are 'full'... and that each cluster
6801 * is adjacent to the next (i.e. we're looking for sequential writes)... they were sorted above,
6802 * so we can just make a simple pass through, up to, but not including the last one...
6803 * note that e_addr is not inclusive, so it will be equal to the b_addr of the next cluster if they
6804 * are sequential
6805 *
6806 * we let the last one be partial as long as it was adjacent to the previous one...
6807 * we need to do this to deal with multi-threaded servers that might write an I/O or 2 out
6808 * of order... if this occurs at the tail of the last cluster, we don't want to fall into the sparse cluster world...
6809 */
6810 for (i = 0; i < MAX_CLUSTERS - 1; i++) {
6811 if ((l_clusters[i].e_addr - l_clusters[i].b_addr) != max_cluster_pgcount) {
6812 goto dont_try;
6813 }
6814 if (l_clusters[i].e_addr != l_clusters[i + 1].b_addr) {
6815 goto dont_try;
6816 }
6817 }
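/*
 * Worked example (illustrative page numbers, not from the source): with
 * max_cluster_pgcount == 32, clusters covering pages [0,32), [32,64) and
 * [64,96) pass both tests above (every interior cluster is full and each
 * e_addr equals the next b_addr), so the push proceeds... a gap or a short
 * interior cluster jumps to dont_try instead, which, per the comment above,
 * steers the vnode toward the sparse cluster mechanism.
 */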
6818 }
6819 if (vm_initiated == TRUE) {
6820 lck_mtx_unlock(&wbp->cl_lockw);
6821 }
6822
6823 for (cl_index = 0; cl_index < cl_len; cl_index++) {
6824 int flags;
6825 struct cl_extent cl;
6826 int retval;
6827
6828 flags = io_flags & (IO_PASSIVE | IO_CLOSE);
6829
6830 /*
6831 * try to push each cluster in turn...
6832 */
6833 if (l_clusters[cl_index].io_flags & CLW_IONOCACHE) {
6834 flags |= IO_NOCACHE;
6835 }
6836
6837 if (l_clusters[cl_index].io_flags & CLW_IOPASSIVE) {
6838 flags |= IO_PASSIVE;
6839 }
6840
6841 if (push_flag & PUSH_SYNC) {
6842 flags |= IO_SYNC;
6843 }
6844
6845 cl.b_addr = l_clusters[cl_index].b_addr;
6846 cl.e_addr = l_clusters[cl_index].e_addr;
6847
6848 retval = cluster_push_now(vp, &cl, EOF, flags, callback, callback_arg, vm_initiated);
6849
6850 if (retval == 0) {
6851 cl_pushed++;
6852
6853 l_clusters[cl_index].b_addr = 0;
6854 l_clusters[cl_index].e_addr = 0;
6855 } else if (error == 0) {
6856 error = retval;
6857 }
6858
6859 if (!(push_flag & PUSH_ALL)) {
6860 break;
6861 }
6862 }
6863 if (vm_initiated == TRUE) {
6864 lck_mtx_lock(&wbp->cl_lockw);
6865 }
6866
6867 if (err) {
6868 *err = error;
6869 }
6870
6871 dont_try:
6872 if (cl_len > cl_pushed) {
6873 /*
6874 * we didn't push all of the clusters, so
6875 * lets try to merge them back in to the vnode
6876 */
6877 if ((MAX_CLUSTERS - wbp->cl_number) < (cl_len - cl_pushed)) {
6878 /*
6879 * we picked up some new clusters while we were trying to
6880 * push the old ones... this can happen because I've dropped
6881 * the vnode lock... the sum of the
6882 * leftovers plus the new cluster count exceeds our ability
6883 * to represent them, so switch to the sparse cluster mechanism
6884 *
6885 * collect the active public clusters...
6886 */
6887 sparse_cluster_switch(wbp, vp, EOF, callback, callback_arg, vm_initiated);
6888
6889 for (cl_index = 0, cl_index1 = 0; cl_index < cl_len; cl_index++) {
6890 if (l_clusters[cl_index].b_addr == l_clusters[cl_index].e_addr) {
6891 continue;
6892 }
6893 wbp->cl_clusters[cl_index1].b_addr = l_clusters[cl_index].b_addr;
6894 wbp->cl_clusters[cl_index1].e_addr = l_clusters[cl_index].e_addr;
6895 wbp->cl_clusters[cl_index1].io_flags = l_clusters[cl_index].io_flags;
6896
6897 cl_index1++;
6898 }
6899 /*
6900 * update the cluster count
6901 */
6902 wbp->cl_number = cl_index1;
6903
6904 /*
6905 * and collect the original clusters that were moved into the
6906 * local storage for sorting purposes
6907 */
6908 sparse_cluster_switch(wbp, vp, EOF, callback, callback_arg, vm_initiated);
6909 } else {
6910 /*
6911 * we've got room to merge the leftovers back in
6912 * just append them starting at the next 'hole'
6913 * represented by wbp->cl_number
6914 */
6915 for (cl_index = 0, cl_index1 = wbp->cl_number; cl_index < cl_len; cl_index++) {
6916 if (l_clusters[cl_index].b_addr == l_clusters[cl_index].e_addr) {
6917 continue;
6918 }
6919
6920 wbp->cl_clusters[cl_index1].b_addr = l_clusters[cl_index].b_addr;
6921 wbp->cl_clusters[cl_index1].e_addr = l_clusters[cl_index].e_addr;
6922 wbp->cl_clusters[cl_index1].io_flags = l_clusters[cl_index].io_flags;
6923
6924 cl_index1++;
6925 }
6926 /*
6927 * update the cluster count
6928 */
6929 wbp->cl_number = cl_index1;
6930 }
6931 }
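/* return the number of free slots remaining in the write-behind cluster array */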
6932 return MAX_CLUSTERS - wbp->cl_number;
6933 }
6934
6935
6936
6937 static int
6938 cluster_push_now(vnode_t vp, struct cl_extent *cl, off_t EOF, int flags,
6939 int (*callback)(buf_t, void *), void *callback_arg, boolean_t vm_initiated)
6940 {
6941 upl_page_info_t *pl;
6942 upl_t upl;
6943 vm_offset_t upl_offset;
6944 int upl_size;
6945 off_t upl_f_offset;
6946 int pages_in_upl;
6947 int start_pg;
6948 int last_pg;
6949 int io_size;
6950 int io_flags;
6951 int upl_flags;
6952 int bflag;
6953 int size;
6954 int error = 0;
6955 int retval;
6956 kern_return_t kret;
6957
6958 if (flags & IO_PASSIVE) {
6959 bflag = CL_PASSIVE;
6960 } else {
6961 bflag = 0;
6962 }
6963
6964 if (flags & IO_SKIP_ENCRYPTION) {
6965 bflag |= CL_ENCRYPTED;
6966 }
6967
6968 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_START,
6969 (int)cl->b_addr, (int)cl->e_addr, (int)EOF, flags, 0);
6970
6971 if ((pages_in_upl = (int)(cl->e_addr - cl->b_addr)) == 0) {
6972 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 0, 0, 0, 0);
6973
6974 return 0;
6975 }
6976 upl_size = pages_in_upl * PAGE_SIZE;
6977 upl_f_offset = (off_t)(cl->b_addr * PAGE_SIZE_64);
6978
6979 if (upl_f_offset + upl_size >= EOF) {
6980 if (upl_f_offset >= EOF) {
6981 /*
6982 * must have truncated the file and missed
6983 * clearing a dangling cluster (i.e. it's completely
6984 * beyond the new EOF)
6985 */
6986 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 1, 0, 0, 0);
6987
6988 return 0;
6989 }
6990 size = (int)(EOF - upl_f_offset);
6991
6992 upl_size = (size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
6993 pages_in_upl = upl_size / PAGE_SIZE;
6994 } else {
6995 size = upl_size;
6996 }
6997
6998
6999 if (vm_initiated) {
7000 vnode_pageout(vp, NULL, (upl_offset_t)0, upl_f_offset, (upl_size_t)upl_size,
7001 UPL_MSYNC | UPL_VNODE_PAGER | UPL_KEEPCACHED, &error);
7002
7003 return error;
7004 }
7005 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_START, upl_size, size, 0, 0, 0);
7006
7007 /*
7008 * by asking for UPL_COPYOUT_FROM and UPL_RET_ONLY_DIRTY, we get the following desirable behavior
7009 *
7010 * - only pages that are currently dirty are returned... these are the ones we need to clean
7011 * - the hardware dirty bit is cleared when the page is gathered into the UPL... the software dirty bit is set
7012 * - if we have to abort the I/O for some reason, the software dirty bit is left set since we didn't clean the page
7013 * - when we commit the page, the software dirty bit is cleared... the hardware dirty bit is untouched so that if
7014 * someone dirties this page while the I/O is in progress, we don't lose track of the new state
7015 *
7016 * when the I/O completes, we no longer ask for an explicit clear of the DIRTY state (either soft or hard)
7017 */
7018
7019 if ((vp->v_flag & VNOCACHE_DATA) || (flags & IO_NOCACHE)) {
7020 upl_flags = UPL_COPYOUT_FROM | UPL_RET_ONLY_DIRTY | UPL_SET_LITE | UPL_WILL_BE_DUMPED;
7021 } else {
7022 upl_flags = UPL_COPYOUT_FROM | UPL_RET_ONLY_DIRTY | UPL_SET_LITE;
7023 }
7024
7025 kret = ubc_create_upl_kernel(vp,
7026 upl_f_offset,
7027 upl_size,
7028 &upl,
7029 &pl,
7030 upl_flags,
7031 VM_KERN_MEMORY_FILE);
7032 if (kret != KERN_SUCCESS) {
7033 panic("cluster_push: failed to get pagelist");
7034 }
7035
7036 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_END, upl, upl_f_offset, 0, 0, 0);
7037
7038 /*
7039 * since we only asked for the dirty pages back
7040 * it's possible that we may only get a few or even none, so...
7041 * before we start marching forward, we must make sure we know
7042 * where the last present page is in the UPL, otherwise we could
7043 * end up working with a freed upl due to the FREE_ON_EMPTY semantics
7044 * employed by commit_range and abort_range.
7045 */
7046 for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--) {
7047 if (upl_page_present(pl, last_pg)) {
7048 break;
7049 }
7050 }
7051 pages_in_upl = last_pg + 1;
7052
7053 if (pages_in_upl == 0) {
7054 ubc_upl_abort(upl, 0);
7055
7056 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 2, 0, 0, 0);
7057 return 0;
7058 }
7059
7060 for (last_pg = 0; last_pg < pages_in_upl;) {
7061 /*
7062 * find the next dirty page in the UPL
7063 * this will become the first page in the
7064 * next I/O to generate
7065 */
7066 for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
7067 if (upl_dirty_page(pl, start_pg)) {
7068 break;
7069 }
7070 if (upl_page_present(pl, start_pg)) {
7071 /*
7072 * RET_ONLY_DIRTY will return non-dirty 'precious' pages
7073 * just release these unchanged since we're not going
7074 * to steal them or change their state
7075 */
7076 ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
7077 }
7078 }
7079 if (start_pg >= pages_in_upl) {
7080 /*
7081 * done... no more dirty pages to push
7082 */
7083 break;
7084 }
7085 if (start_pg > last_pg) {
7086 /*
7087 * skipped over some non-dirty pages
7088 */
7089 size -= ((start_pg - last_pg) * PAGE_SIZE);
7090 }
7091
7092 /*
7093 * find a range of dirty pages to write
7094 */
7095 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
7096 if (!upl_dirty_page(pl, last_pg)) {
7097 break;
7098 }
7099 }
7100 upl_offset = start_pg * PAGE_SIZE;
7101
7102 io_size = min(size, (last_pg - start_pg) * PAGE_SIZE);
7103
7104 io_flags = CL_THROTTLE | CL_COMMIT | CL_AGE | bflag;
7105
7106 if (!(flags & IO_SYNC)) {
7107 io_flags |= CL_ASYNC;
7108 }
7109
7110 if (flags & IO_CLOSE) {
7111 io_flags |= CL_CLOSE;
7112 }
7113
7114 if (flags & IO_NOCACHE) {
7115 io_flags |= CL_NOCACHE;
7116 }
7117
7118 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size,
7119 io_flags, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
7120
7121 if (error == 0 && retval) {
7122 error = retval;
7123 }
7124
7125 size -= io_size;
7126 }
7127 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 3, error, 0, 0);
7128
7129 return error;
7130 }
7131
7132
7133 /*
7134 * sparse_cluster_switch is called with the write behind lock held
7135 */
7136 static int
7137 sparse_cluster_switch(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int (*callback)(buf_t, void *), void *callback_arg, boolean_t vm_initiated)
7138 {
7139 int cl_index;
7140 int error = 0;
7141
7142 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_START, kdebug_vnode(vp), wbp->cl_scmap, wbp->cl_number, 0, 0);
7143
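/*
 * walk each legacy write-behind cluster and re-mark any page that is
 * still dirty in the ubc into the sparse cluster map, so no dirty
 * data is lost when switching mechanisms
 */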
7144 for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
7145 int flags;
7146 struct cl_extent cl;
7147
7148 for (cl.b_addr = wbp->cl_clusters[cl_index].b_addr; cl.b_addr < wbp->cl_clusters[cl_index].e_addr; cl.b_addr++) {
7149 if (ubc_page_op(vp, (off_t)(cl.b_addr * PAGE_SIZE_64), 0, NULL, &flags) == KERN_SUCCESS) {
7150 if (flags & UPL_POP_DIRTY) {
7151 cl.e_addr = cl.b_addr + 1;
7152
7153 error = sparse_cluster_add(wbp, &(wbp->cl_scmap), vp, &cl, EOF, callback, callback_arg, vm_initiated);
7154
7155 if (error) {
7156 break;
7157 }
7158 }
7159 }
7160 }
7161 }
7162 wbp->cl_number -= cl_index;
7163
7164 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_END, kdebug_vnode(vp), wbp->cl_scmap, wbp->cl_number, error, 0);
7165
7166 return error;
7167 }
7168
7169
7170 /*
7171 * sparse_cluster_push must be called with the write-behind lock held if the scmap is
7172 * still associated with the write-behind context... however, if the scmap has been disassociated
7173 * from the write-behind context (the cluster_push case), the wb lock is not held
7174 */
7175 static int
7176 sparse_cluster_push(struct cl_writebehind *wbp, void **scmap, vnode_t vp, off_t EOF, int push_flag,
7177 int io_flags, int (*callback)(buf_t, void *), void *callback_arg, boolean_t vm_initiated)
7178 {
7179 struct cl_extent cl;
7180 off_t offset;
7181 u_int length;
7182 void *l_scmap;
7183 int error = 0;
7184
7185 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_START, kdebug_vnode(vp), (*scmap), 0, push_flag, 0);
7186
7187 if (push_flag & PUSH_ALL) {
7188 vfs_drt_control(scmap, 1);
7189 }
7190
7191 l_scmap = *scmap;
7192
7193 for (;;) {
7194 int retval;
7195
7196 if (vfs_drt_get_cluster(scmap, &offset, &length) != KERN_SUCCESS) {
7197 /*
7198 * vfs_drt_get_cluster returns KERN_FAILURE when it finds nothing
7199 * left to push. That isn't really a failure, which is why we
7200 * don't set 'error' here like we do below.
7201 */
7202 break;
7203 }
7204
7205 if (vm_initiated == TRUE) {
7206 lck_mtx_unlock(&wbp->cl_lockw);
7207 }
7208
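/*
 * the offset and length returned by vfs_drt_get_cluster are
 * page-aligned, so these conversions to page indices are exact
 */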
7209 cl.b_addr = (daddr64_t)(offset / PAGE_SIZE_64);
7210 cl.e_addr = (daddr64_t)((offset + length) / PAGE_SIZE_64);
7211
7212 retval = cluster_push_now(vp, &cl, EOF, io_flags, callback, callback_arg, vm_initiated);
7213 if (error == 0 && retval) {
7214 error = retval;
7215 }
7216
7217 if (vm_initiated == TRUE) {
7218 lck_mtx_lock(&wbp->cl_lockw);
7219
7220 if (*scmap != l_scmap) {
7221 break;
7222 }
7223 }
7224
7225 if (error) {
7226 if (vfs_drt_mark_pages(scmap, offset, length, NULL) != KERN_SUCCESS) {
7227 panic("Failed to restore dirty state on failure");
7228 }
7229
7230 break;
7231 }
7232
7233 if (!(push_flag & PUSH_ALL)) {
7234 break;
7235 }
7236 }
7237 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_END, kdebug_vnode(vp), (*scmap), error, 0, 0);
7238
7239 return error;
7240 }
7241
7242
7243 /*
7244 * sparse_cluster_add is called with the write behind lock held
7245 */
7246 static int
7247 sparse_cluster_add(struct cl_writebehind *wbp, void **scmap, vnode_t vp, struct cl_extent *cl, off_t EOF,
7248 int (*callback)(buf_t, void *), void *callback_arg, boolean_t vm_initiated)
7249 {
7250 u_int new_dirty;
7251 u_int length;
7252 off_t offset;
7253 int error = 0;
7254 int push_flag = 0; /* default push behavior; may be upgraded to PUSH_ALL below */
7255
7256 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_START, (*scmap), 0, cl->b_addr, (int)cl->e_addr, 0);
7257
7258 offset = (off_t)(cl->b_addr * PAGE_SIZE_64);
7259 length = ((u_int)(cl->e_addr - cl->b_addr)) * PAGE_SIZE;
7260
7261 while (vfs_drt_mark_pages(scmap, offset, length, &new_dirty) != KERN_SUCCESS) {
7262 /*
7263 * no room left in the map
7264 * only a partial update was done
7265 * push out some pages and try again
7266 */
7267
7268 if (vfs_get_scmap_push_behavior_internal(scmap, &push_flag)) {
7269 push_flag = 0;
7270 }
7271
7272 error = sparse_cluster_push(wbp, scmap, vp, EOF, push_flag, 0, callback, callback_arg, vm_initiated);
7273
7274 if (error) {
7275 break;
7276 }
7277
7278 offset += (new_dirty * PAGE_SIZE_64);
7279 length -= (new_dirty * PAGE_SIZE);
7280 }
7281 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_END, kdebug_vnode(vp), (*scmap), error, 0, 0);
7282
7283 return error;
7284 }
7285
7286
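/*
 * copy 'xsize' bytes between the caller's physical buffer (usr_paddr) and
 * the single file page containing uio->uio_offset: the page is gathered
 * into a one-page UPL (and read in synchronously if it isn't already
 * valid), the data is moved with copypv, and the page is written back
 * out if this is a write (or if the cached page was already dirty)
 */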
7287 static int
7288 cluster_align_phys_io(vnode_t vp, struct uio *uio, addr64_t usr_paddr, u_int32_t xsize, int flags, int (*callback)(buf_t, void *), void *callback_arg)
7289 {
7290 upl_page_info_t *pl;
7291 upl_t upl;
7292 addr64_t ubc_paddr;
7293 kern_return_t kret;
7294 int error = 0;
7295 int did_read = 0;
7296 int abort_flags;
7297 int upl_flags;
7298 int bflag;
7299
7300 if (flags & IO_PASSIVE) {
7301 bflag = CL_PASSIVE;
7302 } else {
7303 bflag = 0;
7304 }
7305
7306 if (flags & IO_NOCACHE) {
7307 bflag |= CL_NOCACHE;
7308 }
7309
7310 upl_flags = UPL_SET_LITE;
7311
7312 if (!(flags & CL_READ)) {
7313 /*
7314 * "write" operation: let the UPL subsystem know
7315 * that we intend to modify the buffer cache pages
7316 * we're gathering.
7317 */
7318 upl_flags |= UPL_WILL_MODIFY;
7319 } else {
7320 /*
7321 * indicate that there is no need to pull the
7322 * mapping for this page... we're only going
7323 * to read from it, not modify it.
7324 */
7325 upl_flags |= UPL_FILE_IO;
7326 }
7327 kret = ubc_create_upl_kernel(vp,
7328 uio->uio_offset & ~PAGE_MASK_64,
7329 PAGE_SIZE,
7330 &upl,
7331 &pl,
7332 upl_flags,
7333 VM_KERN_MEMORY_FILE);
7334
7335 if (kret != KERN_SUCCESS) {
7336 return EINVAL;
7337 }
7338
7339 if (!upl_valid_page(pl, 0)) {
7340 /*
7341 * issue a synchronous read to cluster_io
7342 */
7343 error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE,
7344 CL_READ | bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
7345 if (error) {
7346 ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
7347
7348 return error;
7349 }
7350 did_read = 1;
7351 }
7352 ubc_paddr = ((addr64_t)upl_phys_page(pl, 0) << PAGE_SHIFT) + (addr64_t)(uio->uio_offset & PAGE_MASK_64);
7353
7354 /*
7355 * NOTE: There is no prototype for the following in BSD. It, and the definitions
7356 * of the defines for cppvPsrc, cppvPsnk, cppvFsnk, and cppvFsrc will be found in
7357 * osfmk/ppc/mappings.h. They are not included here because there appears to be no
7358 * way to do so without exporting them to kexts as well.
7359 */
7360 if (flags & CL_READ) {
7361 // copypv(ubc_paddr, usr_paddr, xsize, cppvPsrc | cppvPsnk | cppvFsnk); /* Copy physical to physical and flush the destination */
7362 copypv(ubc_paddr, usr_paddr, xsize, 2 | 1 | 4); /* Copy physical to physical and flush the destination */
7363 } else {
7364 // copypv(usr_paddr, ubc_paddr, xsize, cppvPsrc | cppvPsnk | cppvFsrc); /* Copy physical to physical and flush the source */
7365 copypv(usr_paddr, ubc_paddr, xsize, 2 | 1 | 8); /* Copy physical to physical and flush the source */
7366 }
7367 if (!(flags & CL_READ) || (upl_valid_page(pl, 0) && upl_dirty_page(pl, 0))) {
7368 /*
7369 * issue a synchronous write to cluster_io
7370 */
7371 error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE,
7372 bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
7373 }
7374 if (error == 0) {
7375 uio_update(uio, (user_size_t)xsize);
7376 }
7377
7378 if (did_read) {
7379 abort_flags = UPL_ABORT_FREE_ON_EMPTY;
7380 } else {
7381 abort_flags = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;
7382 }
7383
7384 ubc_upl_abort_range(upl, 0, PAGE_SIZE, abort_flags);
7385
7386 return error;
7387 }
7388
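/*
 * copy data between the uio and the pages of the given UPL, addressing
 * the pages by physical address; on return *io_resid holds the bytes
 * not copied, and pages newly dirtied by a write are charged to the
 * task's deferred logical writes
 */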
7389 int
7390 cluster_copy_upl_data(struct uio *uio, upl_t upl, int upl_offset, int *io_resid)
7391 {
7392 int pg_offset;
7393 int pg_index;
7394 int csize;
7395 int segflg;
7396 int retval = 0;
7397 int xsize;
7398 upl_page_info_t *pl;
7399 int dirty_count;
7400
7401 xsize = *io_resid;
7402
7403 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
7404 (int)uio->uio_offset, upl_offset, xsize, 0, 0);
7405
7406 segflg = uio->uio_segflg;
7407
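/*
 * temporarily switch the uio to the corresponding physical-address
 * segment type so uiomove64 can copy by physical page... the original
 * segflg is restored once the copy loop is done
 */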
7408 switch (segflg) {
7409 case UIO_USERSPACE32:
7410 case UIO_USERISPACE32:
7411 uio->uio_segflg = UIO_PHYS_USERSPACE32;
7412 break;
7413
7414 case UIO_USERSPACE:
7415 case UIO_USERISPACE:
7416 uio->uio_segflg = UIO_PHYS_USERSPACE;
7417 break;
7418
7419 case UIO_USERSPACE64:
7420 case UIO_USERISPACE64:
7421 uio->uio_segflg = UIO_PHYS_USERSPACE64;
7422 break;
7423
7424 case UIO_SYSSPACE:
7425 uio->uio_segflg = UIO_PHYS_SYSSPACE;
7426 break;
7427 }
7428 pl = ubc_upl_pageinfo(upl);
7429
7430 pg_index = upl_offset / PAGE_SIZE;
7431 pg_offset = upl_offset & PAGE_MASK;
7432 csize = min(PAGE_SIZE - pg_offset, xsize);
7433
7434 dirty_count = 0;
7435 while (xsize && retval == 0) {
7436 addr64_t paddr;
7437 ppnum_t pn = upl_phys_page(pl, pg_index);
7438
7439 paddr = ((addr64_t)pn << PAGE_SHIFT) + pg_offset;
7440 if ((uio->uio_rw == UIO_WRITE) && (upl_dirty_page(pl, pg_index) == FALSE)) {
7441 dirty_count++;
7442 }
7443
7444 /* such physical pages should never be restricted pages */
7445 if (pmap_is_page_restricted(pn)) {
7446 panic("%s: cannot uiomove64 into a restricted page", __func__);
7447 }
7448
7449 retval = uiomove64(paddr, csize, uio);
7450
7451 pg_index += 1;
7452 pg_offset = 0;
7453 xsize -= csize;
7454 csize = min(PAGE_SIZE, xsize);
7455 }
7456 *io_resid = xsize;
7457
7458 uio->uio_segflg = segflg;
7459
7460 if (dirty_count) {
7461 task_update_logical_writes(current_task(), (dirty_count * PAGE_SIZE), TASK_WRITE_DEFERRED, upl_lookup_vnode(upl));
7462 }
7463
7464 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
7465 (int)uio->uio_offset, xsize, retval, segflg, 0);
7466
7467 return retval;
7468 }
7469
7470
7471 int
7472 cluster_copy_ubc_data(vnode_t vp, struct uio *uio, int *io_resid, int mark_dirty)
7473 {
7474 return cluster_copy_ubc_data_internal(vp, uio, io_resid, mark_dirty, 1);
7475 }
7476
7477
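/*
 * copy between the uio and pages already resident in the ubc via
 * memory_object_control_uiomove; *io_resid is reduced by the bytes
 * actually moved... 'mark_dirty' leaves the touched pages dirty (the
 * write case) and 'take_reference' is passed straight through to
 * memory_object_control_uiomove
 */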
7478 static int
7479 cluster_copy_ubc_data_internal(vnode_t vp, struct uio *uio, int *io_resid, int mark_dirty, int take_reference)
7480 {
7481 int segflg;
7482 int io_size;
7483 int xsize;
7484 int start_offset;
7485 int retval = 0;
7486 memory_object_control_t control;
7487
7488 io_size = *io_resid;
7489
7490 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
7491 (int)uio->uio_offset, io_size, mark_dirty, take_reference, 0);
7492
7493 control = ubc_getobject(vp, UBC_FLAGS_NONE);
7494
7495 if (control == MEMORY_OBJECT_CONTROL_NULL) {
7496 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
7497 (int)uio->uio_offset, io_size, retval, 3, 0);
7498
7499 return 0;
7500 }
7501 segflg = uio->uio_segflg;
7502
7503 switch (segflg) {
7504 case UIO_USERSPACE32:
7505 case UIO_USERISPACE32:
7506 uio->uio_segflg = UIO_PHYS_USERSPACE32;
7507 break;
7508
7509 case UIO_USERSPACE64:
7510 case UIO_USERISPACE64:
7511 uio->uio_segflg = UIO_PHYS_USERSPACE64;
7512 break;
7513
7514 case UIO_USERSPACE:
7515 case UIO_USERISPACE:
7516 uio->uio_segflg = UIO_PHYS_USERSPACE;
7517 break;
7518
7519 case UIO_SYSSPACE:
7520 uio->uio_segflg = UIO_PHYS_SYSSPACE;
7521 break;
7522 }
7523
7524 if ((io_size = *io_resid)) {
7525 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
7526 xsize = (int)uio_resid(uio);
7527
7528 retval = memory_object_control_uiomove(control, uio->uio_offset - start_offset, uio,
7529 start_offset, io_size, mark_dirty, take_reference);
7530 xsize -= uio_resid(uio);
7531
7532 int num_bytes_copied = xsize;
7533 if (num_bytes_copied && uio_rw(uio)) {
7534 task_update_logical_writes(current_task(), num_bytes_copied, TASK_WRITE_DEFERRED, vp);
7535 }
7536 io_size -= xsize;
7537 }
7538 uio->uio_segflg = segflg;
7539 *io_resid = io_size;
7540
7541 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
7542 (int)uio->uio_offset, io_size, retval, 0x80000000 | segflg, 0);
7543
7544 return retval;
7545 }
7546
7547
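/*
 * scan every page of the file in the ubc... returns 0 if no cached
 * page is dirty, EINVAL otherwise
 */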
7548 int
7549 is_file_clean(vnode_t vp, off_t filesize)
7550 {
7551 off_t f_offset;
7552 int flags;
7553 int total_dirty = 0;
7554
7555 for (f_offset = 0; f_offset < filesize; f_offset += PAGE_SIZE_64) {
7556 if (ubc_page_op(vp, f_offset, 0, NULL, &flags) == KERN_SUCCESS) {
7557 if (flags & UPL_POP_DIRTY) {
7558 total_dirty++;
7559 }
7560 }
7561 }
7562 if (total_dirty) {
7563 return EINVAL;
7564 }
7565
7566 return 0;
7567 }
7568
7569
7570
7571 /*
7572 * Dirty region tracking/clustering mechanism.
7573 *
7574 * This code (vfs_drt_*) provides a mechanism for tracking and clustering
7575 * dirty regions within a larger space (file). It is primarily intended to
7576 * support clustering in large files with many dirty areas.
7577 *
7578 * The implementation assumes that the dirty regions are pages.
7579 *
7580 * To represent dirty pages within the file, we store bit vectors in a
7581 * variable-size circular hash.
7582 */
7583
7584 /*
7585 * Bitvector size. This determines the number of pages we group in a
7586 * single hashtable entry. Each hashtable entry is aligned to this
7587 * size within the file.
7588 */
7589 #define DRT_BITVECTOR_PAGES ((1024 * 256) / PAGE_SIZE)
7590
7591 /*
7592 * File offset handling.
7593 *
7594 * DRT_ADDRESS_MASK is dependent on DRT_BITVECTOR_PAGES;
7595 * the correct formula is (~((DRT_BITVECTOR_PAGES * PAGE_SIZE) - 1))
7596 */
7597 #define DRT_ADDRESS_MASK (~((DRT_BITVECTOR_PAGES * PAGE_SIZE) - 1))
7598 #define DRT_ALIGN_ADDRESS(addr) ((addr) & DRT_ADDRESS_MASK)
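
/*
 * Worked example (assuming a 4 KiB PAGE_SIZE): DRT_BITVECTOR_PAGES is 64,
 * so each hashtable entry covers 64 * 4 KiB = 256 KiB of the file.
 * DRT_ALIGN_ADDRESS(0x43000) yields 0x40000, and the page at 0x43000 is
 * tracked by bit 3 of that entry's bitvector.
 */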
7599
7600 /*
7601 * Hashtable address field handling.
7602 *
7603 * The low-order bits of the hashtable address are used to conserve
7604 * space.
7605 *
7606 * DRT_HASH_COUNT_MASK must be large enough to store the range
7607 * 0-DRT_BITVECTOR_PAGES inclusive, as well as have one value
7608 * to indicate that the bucket is actually unoccupied.
7609 */
7610 #define DRT_HASH_GET_ADDRESS(scm, i) ((scm)->scm_hashtable[(i)].dhe_control & DRT_ADDRESS_MASK)
7611 #define DRT_HASH_SET_ADDRESS(scm, i, a) \
7612 do { \
7613 (scm)->scm_hashtable[(i)].dhe_control = \
7614 ((scm)->scm_hashtable[(i)].dhe_control & ~DRT_ADDRESS_MASK) | DRT_ALIGN_ADDRESS(a); \
7615 } while (0)
7616 #define DRT_HASH_COUNT_MASK 0x1ff
7617 #define DRT_HASH_GET_COUNT(scm, i) ((scm)->scm_hashtable[(i)].dhe_control & DRT_HASH_COUNT_MASK)
7618 #define DRT_HASH_SET_COUNT(scm, i, c) \
7619 do { \
7620 (scm)->scm_hashtable[(i)].dhe_control = \
7621 ((scm)->scm_hashtable[(i)].dhe_control & ~DRT_HASH_COUNT_MASK) | ((c) & DRT_HASH_COUNT_MASK); \
7622 } while (0)
7623 #define DRT_HASH_CLEAR(scm, i) \
7624 do { \
7625 (scm)->scm_hashtable[(i)].dhe_control = 0; \
7626 } while (0)
7627 #define DRT_HASH_VACATE(scm, i) DRT_HASH_SET_COUNT((scm), (i), DRT_HASH_COUNT_MASK)
7628 #define DRT_HASH_VACANT(scm, i) (DRT_HASH_GET_COUNT((scm), (i)) == DRT_HASH_COUNT_MASK)
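/*
 * a count equal to DRT_HASH_COUNT_MASK is the vacancy sentinel; real
 * counts never exceed DRT_BITVECTOR_PAGES (at most 64 with 4 KiB pages),
 * so the sentinel cannot collide with a legitimate count
 */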
7629 #define DRT_HASH_COPY(oscm, oi, scm, i) \
7630 do { \
7631 (scm)->scm_hashtable[(i)].dhe_control = (oscm)->scm_hashtable[(oi)].dhe_control; \
7632 DRT_BITVECTOR_COPY(oscm, oi, scm, i); \
7633 } while (0)
7634
7635
7636 #if !defined(XNU_TARGET_OS_OSX)
7637 /*
7638 * Hash table moduli.
7639 *
7640 * Since the hashtable entry's size is dependent on the size of
7641 * the bitvector, and since the hashtable size is constrained to
7642 * both being prime and fitting within the desired allocation
7643 * size, these values need to be manually determined.
7644 *
7645 * For DRT_BITVECTOR_SIZE = 64, the entry size is 16 bytes.
7646 *
7647 * The small hashtable allocation is 4096 bytes, so the modulus is 251.
7648 * The large hashtable allocation is 32768 bytes, so the modulus is 2039.
7649 * The xlarge hashtable allocation is 131072 bytes, so the modulus is 8179.
7650 */
7651
7652 #define DRT_HASH_SMALL_MODULUS 251
7653 #define DRT_HASH_LARGE_MODULUS 2039
7654 #define DRT_HASH_XLARGE_MODULUS 8179
7655
7656 /*
7657 * Physical memory required before the large hash modulus is permitted.
7658 *
7659 * On small memory systems, the large hash modulus can lead to physical
7660 * memory starvation, so we avoid using it there.
7661 */
7662 #define DRT_HASH_LARGE_MEMORY_REQUIRED (1024LL * 1024LL * 1024LL) /* 1GiB */
7663 #define DRT_HASH_XLARGE_MEMORY_REQUIRED (8 * 1024LL * 1024LL * 1024LL) /* 8GiB */
7664
7665 #define DRT_SMALL_ALLOCATION 4096 /* 80 bytes spare */
7666 #define DRT_LARGE_ALLOCATION 32768 /* 144 bytes spare */
7667 #define DRT_XLARGE_ALLOCATION 131072 /* 208 bytes spare */
7668
7669 #else /* XNU_TARGET_OS_OSX */
7670 /*
7671 * Hash table moduli.
7672 *
7673 * Since the hashtable entry's size is dependent on the size of
7674 * the bitvector, and since the hashtable size is constrained to
7675 * both being prime and fitting within the desired allocation
7676 * size, these values need to be manually determined.
7677 *
7678 * For DRT_BITVECTOR_SIZE = 64, the entry size is 16 bytes.
7679 *
7680 * The small hashtable allocation is 16384 bytes, so the modulus is 1019.
7681 * The large hashtable allocation is 131072 bytes, so the modulus is 8179.
7682 * The xlarge hashtable allocation is 524288 bytes, so the modulus is 32749.
7683 */
7684
7685 #define DRT_HASH_SMALL_MODULUS 1019
7686 #define DRT_HASH_LARGE_MODULUS 8179
7687 #define DRT_HASH_XLARGE_MODULUS 32749
7688
7689 /*
7690 * Physical memory required before the large hash modulus is permitted.
7691 *
7692 * On small memory systems, the large hash modulus can lead to physical
7693 * memory starvation, so we avoid using it there.
7694 */
7695 #define DRT_HASH_LARGE_MEMORY_REQUIRED (4 * 1024LL * 1024LL * 1024LL) /* 4GiB */
7696 #define DRT_HASH_XLARGE_MEMORY_REQUIRED (32 * 1024LL * 1024LL * 1024LL) /* 32GiB */
7697
7698 #define DRT_SMALL_ALLOCATION 16384 /* 80 bytes spare */
7699 #define DRT_LARGE_ALLOCATION 131072 /* 208 bytes spare */
7700 #define DRT_XLARGE_ALLOCATION 524288 /* 304 bytes spare */
7701
7702 #endif /* ! XNU_TARGET_OS_OSX */
7703
7704 /* *** nothing below here has secret dependencies on DRT_BITVECTOR_PAGES *** */
7705
7706 /*
7707 * Hashtable entry.
7708 */
7709 struct vfs_drt_hashentry {
7710 u_int64_t dhe_control;
7711 /*
7712 * dhe_bitvector was declared as dhe_bitvector[DRT_BITVECTOR_PAGES / 32];
7713 * DRT_BITVECTOR_PAGES is defined as ((1024 * 256) / PAGE_SIZE)
7714 * Since PAGE_SIZE is only known at boot time,
7715 * -define MAX_DRT_BITVECTOR_PAGES for smallest supported page size (4k)
7716 * -declare dhe_bitvector array for largest possible length
7717 */
7718 #define MAX_DRT_BITVECTOR_PAGES ((1024 * 256) / (4 * 1024))
7719 u_int32_t dhe_bitvector[MAX_DRT_BITVECTOR_PAGES / 32];
7720 };
7721
7722 /*
7723 * Hashtable bitvector handling.
7724 *
7725 * Bitvector fields are 32 bits long.
7726 */
7727
7728 #define DRT_HASH_SET_BIT(scm, i, bit) \
7729 (scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] |= (1 << ((bit) % 32))
7730
7731 #define DRT_HASH_CLEAR_BIT(scm, i, bit) \
7732 (scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] &= ~(1 << ((bit) % 32))
7733
7734 #define DRT_HASH_TEST_BIT(scm, i, bit) \
7735 ((scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] & (1 << ((bit) % 32)))
7736
7737 #define DRT_BITVECTOR_CLEAR(scm, i) \
7738 bzero(&(scm)->scm_hashtable[(i)].dhe_bitvector[0], (MAX_DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t))
7739
7740 #define DRT_BITVECTOR_COPY(oscm, oi, scm, i) \
7741 bcopy(&(oscm)->scm_hashtable[(oi)].dhe_bitvector[0], \
7742 &(scm)->scm_hashtable[(i)].dhe_bitvector[0], \
7743 (MAX_DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t))
7744
7745 /*
7746 * Dirty Region Tracking structure.
7747 *
7748 * The hashtable is allocated entirely inside the DRT structure.
7749 *
7750 * The hash is a simple circular prime modulus arrangement, the structure
7751 * is resized from small to large if it overflows.
7752 */
7753
7754 struct vfs_drt_clustermap {
7755 u_int32_t scm_magic; /* sanity/detection */
7756 #define DRT_SCM_MAGIC 0x12020003
7757 u_int32_t scm_modulus; /* current ring size */
7758 u_int32_t scm_buckets; /* number of occupied buckets */
7759 u_int32_t scm_lastclean; /* last entry we cleaned */
7760 u_int32_t scm_iskips; /* number of slot skips */
7761
7762 struct vfs_drt_hashentry scm_hashtable[0];
7763 };
7764
7765
7766 #define DRT_HASH(scm, addr) ((addr) % (scm)->scm_modulus)
7767 #define DRT_HASH_NEXT(scm, addr) (((addr) + 1) % (scm)->scm_modulus)
7768
7769 /*
7770 * Debugging codes and arguments.
7771 */
7772 #define DRT_DEBUG_EMPTYFREE (FSDBG_CODE(DBG_FSRW, 82)) /* nil */
7773 #define DRT_DEBUG_RETCLUSTER (FSDBG_CODE(DBG_FSRW, 83)) /* offset, length */
7774 #define DRT_DEBUG_ALLOC (FSDBG_CODE(DBG_FSRW, 84)) /* copycount */
7775 #define DRT_DEBUG_INSERT (FSDBG_CODE(DBG_FSRW, 85)) /* offset, iskip */
7776 #define DRT_DEBUG_MARK (FSDBG_CODE(DBG_FSRW, 86)) /* offset, length,
7777 * dirty */
7778 /* 0, setcount */
7779 /* 1 (clean, no map) */
7780 /* 2 (map alloc fail) */
7781 /* 3, resid (partial) */
7782 #define DRT_DEBUG_6 (FSDBG_CODE(DBG_FSRW, 87))
7783 #define DRT_DEBUG_SCMDATA (FSDBG_CODE(DBG_FSRW, 88)) /* modulus, buckets,
7784 * lastclean, iskips */
7785
7786
7787 static kern_return_t vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp);
7788 static kern_return_t vfs_drt_free_map(struct vfs_drt_clustermap *cmap);
7789 static kern_return_t vfs_drt_search_index(struct vfs_drt_clustermap *cmap,
7790 u_int64_t offset, int *indexp);
7791 static kern_return_t vfs_drt_get_index(struct vfs_drt_clustermap **cmapp,
7792 u_int64_t offset,
7793 int *indexp,
7794 int recursed);
7795 static kern_return_t vfs_drt_do_mark_pages(
7796 void **cmapp,
7797 u_int64_t offset,
7798 u_int length,
7799 u_int *setcountp,
7800 int dirty);
7801 static void vfs_drt_trace(
7802 struct vfs_drt_clustermap *cmap,
7803 int code,
7804 int arg1,
7805 int arg2,
7806 int arg3,
7807 int arg4);
7808
7809
7810 /*
7811 * Allocate and initialise a sparse cluster map.
7812 *
7813 * Will allocate a new map, resize or compact an existing map.
7814 *
7815 * XXX we should probably have at least one intermediate map size,
7816 * as the jumps between allocation sizes seem a bit drastic.
7817 */
7818 static kern_return_t
7819 vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp)
7820 {
7821 struct vfs_drt_clustermap *cmap = NULL, *ocmap = NULL;
7822 kern_return_t kret = KERN_SUCCESS;
7823 u_int64_t offset = 0;
7824 u_int32_t i = 0;
7825 int modulus_size = 0, map_size = 0, active_buckets = 0, index = 0, copycount = 0;
7826
7827 ocmap = NULL;
7828 if (cmapp != NULL) {
7829 ocmap = *cmapp;
7830 }
7831
7832 /*
7833 * Decide on the size of the new map.
7834 */
7835 if (ocmap == NULL) {
7836 modulus_size = DRT_HASH_SMALL_MODULUS;
7837 map_size = DRT_SMALL_ALLOCATION;
7838 } else {
7839 /* count the number of active buckets in the old map */
7840 active_buckets = 0;
7841 for (i = 0; i < ocmap->scm_modulus; i++) {
7842 if (!DRT_HASH_VACANT(ocmap, i) &&
7843 (DRT_HASH_GET_COUNT(ocmap, i) != 0)) {
7844 active_buckets++;
7845 }
7846 }
7847 /*
7848 * If we're currently using the small allocation, check to
7849 * see whether we should grow to the large one.
7850 */
7851 if (ocmap->scm_modulus == DRT_HASH_SMALL_MODULUS) {
7852 /*
7853 * If the ring is nearly full and we are allowed to
7854 * use the large modulus, upgrade.
7855 */
7856 if ((active_buckets > (DRT_HASH_SMALL_MODULUS - 5)) &&
7857 (max_mem >= DRT_HASH_LARGE_MEMORY_REQUIRED)) {
7858 modulus_size = DRT_HASH_LARGE_MODULUS;
7859 map_size = DRT_LARGE_ALLOCATION;
7860 } else {
7861 modulus_size = DRT_HASH_SMALL_MODULUS;
7862 map_size = DRT_SMALL_ALLOCATION;
7863 }
7864 } else if (ocmap->scm_modulus == DRT_HASH_LARGE_MODULUS) {
7865 if ((active_buckets > (DRT_HASH_LARGE_MODULUS - 5)) &&
7866 (max_mem >= DRT_HASH_XLARGE_MEMORY_REQUIRED)) {
7867 modulus_size = DRT_HASH_XLARGE_MODULUS;
7868 map_size = DRT_XLARGE_ALLOCATION;
7869 } else {
7870 /*
7871 * If the ring is completely full and we can't
7872 * expand, there's nothing useful for us to do.
7873 * Behave as though we had compacted into the new
7874 * array and return.
7875 */
7876 return KERN_SUCCESS;
7877 }
7878 } else {
7879 /* already using the xlarge modulus */
7880 modulus_size = DRT_HASH_XLARGE_MODULUS;
7881 map_size = DRT_XLARGE_ALLOCATION;
7882
7883 /*
7884 * If the ring is completely full, there's
7885 * nothing useful for us to do. Behave as
7886 * though we had compacted into the new
7887 * array and return.
7888 */
7889 if (active_buckets >= DRT_HASH_XLARGE_MODULUS) {
7890 return KERN_SUCCESS;
7891 }
7892 }
7893 }
7894
7895 /*
7896 * Allocate and initialise the new map.
7897 */
7898
7899 kret = kmem_alloc(kernel_map, (vm_offset_t *)&cmap, map_size,
7900 KMA_DATA, VM_KERN_MEMORY_FILE);
7901 if (kret != KERN_SUCCESS) {
7902 return kret;
7903 }
7904 cmap->scm_magic = DRT_SCM_MAGIC;
7905 cmap->scm_modulus = modulus_size;
7906 cmap->scm_buckets = 0;
7907 cmap->scm_lastclean = 0;
7908 cmap->scm_iskips = 0;
7909 for (i = 0; i < cmap->scm_modulus; i++) {
7910 DRT_HASH_CLEAR(cmap, i);
7911 DRT_HASH_VACATE(cmap, i);
7912 DRT_BITVECTOR_CLEAR(cmap, i);
7913 }
7914
7915 /*
7916 * If there's an old map, re-hash entries from it into the new map.
7917 */
7918 copycount = 0;
7919 if (ocmap != NULL) {
7920 for (i = 0; i < ocmap->scm_modulus; i++) {
7921 /* skip empty buckets */
7922 if (DRT_HASH_VACANT(ocmap, i) ||
7923 (DRT_HASH_GET_COUNT(ocmap, i) == 0)) {
7924 continue;
7925 }
7926 /* get new index */
7927 offset = DRT_HASH_GET_ADDRESS(ocmap, i);
7928 kret = vfs_drt_get_index(&cmap, offset, &index, 1);
7929 if (kret != KERN_SUCCESS) {
7930 /* XXX need to bail out gracefully here */
7931 panic("vfs_drt: new cluster map mysteriously too small");
7932 index = 0;
7933 }
7934 /* copy */
7935 DRT_HASH_COPY(ocmap, i, cmap, index);
7936 copycount++;
7937 }
7938 }
7939
7940 /* log what we've done */
7941 vfs_drt_trace(cmap, DRT_DEBUG_ALLOC, copycount, 0, 0, 0);
7942
7943 /*
7944 * It's important to ensure that *cmapp always points to
7945 * a valid map, so we must overwrite it before freeing
7946 * the old map.
7947 */
7948 *cmapp = cmap;
7949 if (ocmap != NULL) {
7950 /* emit stats into trace buffer */
7951 vfs_drt_trace(ocmap, DRT_DEBUG_SCMDATA,
7952 ocmap->scm_modulus,
7953 ocmap->scm_buckets,
7954 ocmap->scm_lastclean,
7955 ocmap->scm_iskips);
7956
7957 vfs_drt_free_map(ocmap);
7958 }
7959 return KERN_SUCCESS;
7960 }
7961
7962
7963 /*
7964 * Free a sparse cluster map.
7965 */
7966 static kern_return_t
7967 vfs_drt_free_map(struct vfs_drt_clustermap *cmap)
7968 {
7969 vm_size_t map_size = 0;
7970
7971 if (cmap->scm_modulus == DRT_HASH_SMALL_MODULUS) {
7972 map_size = DRT_SMALL_ALLOCATION;
7973 } else if (cmap->scm_modulus == DRT_HASH_LARGE_MODULUS) {
7974 map_size = DRT_LARGE_ALLOCATION;
7975 } else if (cmap->scm_modulus == DRT_HASH_XLARGE_MODULUS) {
7976 map_size = DRT_XLARGE_ALLOCATION;
7977 } else {
7978 panic("vfs_drt_free_map: Invalid modulus %d", cmap->scm_modulus);
7979 }
7980
7981 kmem_free(kernel_map, (vm_offset_t)cmap, map_size);
7982 return KERN_SUCCESS;
7983 }
7984
7985
7986 /*
7987 * Find the hashtable slot currently occupied by an entry for the supplied offset.
7988 */
7989 static kern_return_t
7990 vfs_drt_search_index(struct vfs_drt_clustermap *cmap, u_int64_t offset, int *indexp)
7991 {
7992 int index;
7993 u_int32_t i;
7994
7995 offset = DRT_ALIGN_ADDRESS(offset);
7996 index = DRT_HASH(cmap, offset);
7997
7998 /* traverse the hashtable */
7999 for (i = 0; i < cmap->scm_modulus; i++) {
8000 /*
8001 * If the slot is vacant, we can stop.
8002 */
8003 if (DRT_HASH_VACANT(cmap, index)) {
8004 break;
8005 }
8006
8007 /*
8008 * If the address matches our offset, we have success.
8009 */
8010 if (DRT_HASH_GET_ADDRESS(cmap, index) == offset) {
8011 *indexp = index;
8012 return KERN_SUCCESS;
8013 }
8014
8015 /*
8016 * Move to the next slot, try again.
8017 */
8018 index = DRT_HASH_NEXT(cmap, index);
8019 }
8020 /*
8021 * It's not there.
8022 */
8023 return KERN_FAILURE;
8024 }
8025
8026 /*
8027 * Find the hashtable slot for the supplied offset. If we haven't allocated
8028 * one yet, allocate one and populate the address field. Note that its page
8029 * count will still be zero, so it remains technically free; thus, when we are
8030 * called to clean pages, the slot will remain free.
8031 */
8032 static kern_return_t
8033 vfs_drt_get_index(struct vfs_drt_clustermap **cmapp, u_int64_t offset, int *indexp, int recursed)
8034 {
8035 struct vfs_drt_clustermap *cmap;
8036 kern_return_t kret;
8037 u_int32_t index;
8038 u_int32_t i;
8039
8040 cmap = *cmapp;
8041
8042 /* look for an existing entry */
8043 kret = vfs_drt_search_index(cmap, offset, indexp);
8044 if (kret == KERN_SUCCESS) {
8045 return kret;
8046 }
8047
8048 /* need to allocate an entry */
8049 offset = DRT_ALIGN_ADDRESS(offset);
8050 index = DRT_HASH(cmap, offset);
8051
8052 /* scan from the index forwards looking for a vacant slot */
8053 for (i = 0; i < cmap->scm_modulus; i++) {
8054 /* slot vacant? */
8055 if (DRT_HASH_VACANT(cmap, index) || DRT_HASH_GET_COUNT(cmap, index) == 0) {
8056 cmap->scm_buckets++;
8057 if (index < cmap->scm_lastclean) {
8058 cmap->scm_lastclean = index;
8059 }
8060 DRT_HASH_SET_ADDRESS(cmap, index, offset);
8061 DRT_HASH_SET_COUNT(cmap, index, 0);
8062 DRT_BITVECTOR_CLEAR(cmap, index);
8063 *indexp = index;
8064 vfs_drt_trace(cmap, DRT_DEBUG_INSERT, (int)offset, i, 0, 0);
8065 return KERN_SUCCESS;
8066 }
8067 cmap->scm_iskips += i;
8068 index = DRT_HASH_NEXT(cmap, index);
8069 }
8070
8071 /*
8072 * We haven't found a vacant slot, so the map is full. If we're not
8073 * already recursed, try reallocating/compacting it.
8074 */
8075 if (recursed) {
8076 return KERN_FAILURE;
8077 }
8078 kret = vfs_drt_alloc_map(cmapp);
8079 if (kret == KERN_SUCCESS) {
8080 /* now try to insert again */
8081 kret = vfs_drt_get_index(cmapp, offset, indexp, 1);
8082 }
8083 return kret;
8084 }
8085
8086 /*
8087 * Implementation of set dirty/clean.
8088 *
8089 * In the 'clean' case, not finding a map is OK.
8090 */
8091 static kern_return_t
8092 vfs_drt_do_mark_pages(
8093 void **private,
8094 u_int64_t offset,
8095 u_int length,
8096 u_int *setcountp,
8097 int dirty)
8098 {
8099 struct vfs_drt_clustermap *cmap, **cmapp;
8100 kern_return_t kret;
8101 int i, index, pgoff, pgcount, setcount, ecount;
8102
8103 cmapp = (struct vfs_drt_clustermap **)private;
8104 cmap = *cmapp;
8105
8106 vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_START, (int)offset, (int)length, dirty, 0);
8107
8108 if (setcountp != NULL) {
8109 *setcountp = 0;
8110 }
8111
8112 /* allocate a cluster map if we don't already have one */
8113 if (cmap == NULL) {
8114 /* no cluster map, nothing to clean */
8115 if (!dirty) {
8116 vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 1, 0, 0, 0);
8117 return KERN_SUCCESS;
8118 }
8119 kret = vfs_drt_alloc_map(cmapp);
8120 if (kret != KERN_SUCCESS) {
8121 vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 2, 0, 0, 0);
8122 return kret;
8123 }
8124 }
8125 setcount = 0;
8126
8127 /*
8128 * Iterate over the length of the region.
8129 */
8130 while (length > 0) {
8131 /*
8132 * Get the hashtable index for this offset.
8133 *
8134 * XXX this will add blank entries if we are clearing a range
8135 * that hasn't been dirtied.
8136 */
8137 kret = vfs_drt_get_index(cmapp, offset, &index, 0);
8138 cmap = *cmapp; /* may have changed! */
8139 /* this may be a partial-success return */
8140 if (kret != KERN_SUCCESS) {
8141 if (setcountp != NULL) {
8142 *setcountp = setcount;
8143 }
8144 vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 3, (int)length, 0, 0);
8145
8146 return kret;
8147 }
8148
8149 /*
8150 * Work out how many pages we're modifying in this
8151 * hashtable entry.
8152 */
8153 pgoff = (int)((offset - DRT_ALIGN_ADDRESS(offset)) / PAGE_SIZE);
8154 pgcount = min((length / PAGE_SIZE), (DRT_BITVECTOR_PAGES - pgoff));
8155
8156 /*
8157 * Iterate over pages, dirty/clearing as we go.
8158 */
8159 ecount = DRT_HASH_GET_COUNT(cmap, index);
8160 for (i = 0; i < pgcount; i++) {
8161 if (dirty) {
8162 if (!DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) {
8163 if (ecount >= DRT_BITVECTOR_PAGES) {
8164 panic("ecount >= DRT_BITVECTOR_PAGES, cmap = %p, index = %d, bit = %d", cmap, index, pgoff + i);
8165 }
8166 DRT_HASH_SET_BIT(cmap, index, pgoff + i);
8167 ecount++;
8168 setcount++;
8169 }
8170 } else {
8171 if (DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) {
8172 if (ecount <= 0) {
8173 panic("ecount <= 0, cmap = %p, index = %d, bit = %d", cmap, index, pgoff + i);
8174 }
8175 assert(ecount > 0);
8176 DRT_HASH_CLEAR_BIT(cmap, index, pgoff + i);
8177 ecount--;
8178 setcount++;
8179 }
8180 }
8181 }
8182 DRT_HASH_SET_COUNT(cmap, index, ecount);
8183
8184 offset += pgcount * PAGE_SIZE;
8185 length -= pgcount * PAGE_SIZE;
8186 }
8187 if (setcountp != NULL) {
8188 *setcountp = setcount;
8189 }
8190
8191 vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 0, setcount, 0, 0);
8192
8193 return KERN_SUCCESS;
8194 }
8195
8196 /*
8197 * Mark a set of pages as dirty/clean.
8198 *
8199 * This is a public interface.
8200 *
8201 * cmapp
8202 * Pointer to storage suitable for holding a pointer. Note that
8203 * this must either be NULL or a value set by this function.
8204 *
8208 * offset
8209 * Offset of the first page to be marked as dirty, in bytes. Must be
8210 * page-aligned.
8211 *
8212 * length
8213 * Length of dirty region, in bytes. Must be a multiple of PAGE_SIZE.
8214 *
8215 * setcountp
8216 * Number of pages newly marked dirty by this call (optional).
8217 *
8218 * Returns KERN_SUCCESS if all the pages were successfully marked.
8219 */
8220 static kern_return_t
8221 vfs_drt_mark_pages(void **cmapp, off_t offset, u_int length, u_int *setcountp)
8222 {
8223 /* XXX size unused, drop from interface */
8224 return vfs_drt_do_mark_pages(cmapp, offset, length, setcountp, 1);
8225 }
8226
8227 #if 0
8228 static kern_return_t
8229 vfs_drt_unmark_pages(void **cmapp, off_t offset, u_int length)
8230 {
8231 return vfs_drt_do_mark_pages(cmapp, offset, length, NULL, 0);
8232 }
8233 #endif
8234
8235 /*
8236 * Get a cluster of dirty pages.
8237 *
8238 * This is a public interface.
8239 *
8240 * cmapp
8241 * Pointer to storage managed by drt_mark_pages. Note that this must
8242 * be NULL or a value set by drt_mark_pages.
8243 *
8244 * offsetp
8245 * Returns the byte offset into the file of the first page in the cluster.
8246 *
8247 * lengthp
8248 * Returns the length in bytes of the cluster of dirty pages.
8249 *
8250 * Returns success if a cluster was found. If KERN_FAILURE is returned, there
8251 * are no dirty pages meeting the minimum size criteria. Private storage will
8252 * be released if there are no more dirty pages left in the map.
8253 *
8254 */
8255 static kern_return_t
8256 vfs_drt_get_cluster(void **cmapp, off_t *offsetp, u_int *lengthp)
8257 {
8258 struct vfs_drt_clustermap *cmap;
8259 u_int64_t offset;
8260 u_int length;
8261 u_int32_t j;
8262 int index, i, fs, ls;
8263
8264 /* sanity */
8265 if ((cmapp == NULL) || (*cmapp == NULL)) {
8266 return KERN_FAILURE;
8267 }
8268 cmap = *cmapp;
8269
8270 /* walk the hashtable */
8271 for (offset = 0, j = 0; j < cmap->scm_modulus; offset += (DRT_BITVECTOR_PAGES * PAGE_SIZE), j++) {
8272 index = DRT_HASH(cmap, offset);
8273
8274 if (DRT_HASH_VACANT(cmap, index) || (DRT_HASH_GET_COUNT(cmap, index) == 0)) {
8275 continue;
8276 }
8277
8278 /* scan the bitfield for a string of bits */
8279 fs = -1;
8280
8281 for (i = 0; i < DRT_BITVECTOR_PAGES; i++) {
8282 if (DRT_HASH_TEST_BIT(cmap, index, i)) {
8283 fs = i;
8284 break;
8285 }
8286 }
8287 if (fs == -1) {
8288 /* didn't find any bits set */
8289 panic("vfs_drt: entry summary count > 0 but no bits set in map, cmap = %p, index = %d, count = %lld",
8290 cmap, index, DRT_HASH_GET_COUNT(cmap, index));
8291 }
8292 for (ls = 0; i < DRT_BITVECTOR_PAGES; i++, ls++) {
8293 if (!DRT_HASH_TEST_BIT(cmap, index, i)) {
8294 break;
8295 }
8296 }
8297
8298 /* compute offset and length, mark pages clean */
8299 offset = DRT_HASH_GET_ADDRESS(cmap, index) + (PAGE_SIZE * fs);
8300 length = ls * PAGE_SIZE;
8301 vfs_drt_do_mark_pages(cmapp, offset, length, NULL, 0);
8302 cmap->scm_lastclean = index;
8303
8304 /* return successful */
8305 *offsetp = (off_t)offset;
8306 *lengthp = length;
8307
8308 vfs_drt_trace(cmap, DRT_DEBUG_RETCLUSTER, (int)offset, (int)length, 0, 0);
8309 return KERN_SUCCESS;
8310 }
8311 /*
8312 * We didn't find anything... hashtable is empty
8313 * emit stats into trace buffer and
8314 * then free it
8315 */
8316 vfs_drt_trace(cmap, DRT_DEBUG_SCMDATA,
8317 cmap->scm_modulus,
8318 cmap->scm_buckets,
8319 cmap->scm_lastclean,
8320 cmap->scm_iskips);
8321
8322 vfs_drt_free_map(cmap);
8323 *cmapp = NULL;
8324
8325 return KERN_FAILURE;
8326 }
8327
8328
8329 static kern_return_t
8330 vfs_drt_control(void **cmapp, int op_type)
8331 {
8332 struct vfs_drt_clustermap *cmap;
8333
8334 /* sanity */
8335 if ((cmapp == NULL) || (*cmapp == NULL)) {
8336 return KERN_FAILURE;
8337 }
8338 cmap = *cmapp;
8339
8340 switch (op_type) {
8341 case 0:
8342 /* emit stats into trace buffer */
8343 vfs_drt_trace(cmap, DRT_DEBUG_SCMDATA,
8344 cmap->scm_modulus,
8345 cmap->scm_buckets,
8346 cmap->scm_lastclean,
8347 cmap->scm_iskips);
8348
8349 vfs_drt_free_map(cmap);
8350 *cmapp = NULL;
8351 break;
8352
8353 case 1:
8354 cmap->scm_lastclean = 0;
8355 break;
8356 }
8357 return KERN_SUCCESS;
8358 }
8359
8360
8361
8362 /*
8363 * Emit a summary of the state of the clustermap into the trace buffer
8364 * along with some caller-provided data.
8365 */
8366 #if KDEBUG
8367 static void
8368 vfs_drt_trace(__unused struct vfs_drt_clustermap *cmap, int code, int arg1, int arg2, int arg3, int arg4)
8369 {
8370 KERNEL_DEBUG(code, arg1, arg2, arg3, arg4, 0);
8371 }
8372 #else
8373 static void
8374 vfs_drt_trace(__unused struct vfs_drt_clustermap *cmap, __unused int code,
8375 __unused int arg1, __unused int arg2, __unused int arg3,
8376 __unused int arg4)
8377 {
8378 }
8379 #endif
8380
8381 #if 0
8382 /*
8383 * Perform basic sanity check on the hash entry summary count
8384 * vs. the actual bits set in the entry.
8385 */
8386 static void
8387 vfs_drt_sanity(struct vfs_drt_clustermap *cmap)
8388 {
8389 int index, i;
8390 int bits_on;
8391
8392 for (index = 0; index < cmap->scm_modulus; index++) {
8393 if (DRT_HASH_VACANT(cmap, index)) {
8394 continue;
8395 }
8396
8397 for (bits_on = 0, i = 0; i < DRT_BITVECTOR_PAGES; i++) {
8398 if (DRT_HASH_TEST_BIT(cmap, index, i)) {
8399 bits_on++;
8400 }
8401 }
8402 if (bits_on != DRT_HASH_GET_COUNT(cmap, index)) {
8403 panic("bits_on = %d, index = %d", bits_on, index);
8404 }
8405 }
8406 }
8407 #endif
8408
8409 /*
8410 * Internal interface only.
8411 */
8412 static kern_return_t
8413 vfs_get_scmap_push_behavior_internal(void **cmapp, int *push_flag)
8414 {
8415 struct vfs_drt_clustermap *cmap;
8416
8417 /* sanity */
8418 if ((cmapp == NULL) || (*cmapp == NULL) || (push_flag == NULL)) {
8419 return KERN_FAILURE;
8420 }
8421 cmap = *cmapp;
8422
8423 if (cmap->scm_modulus == DRT_HASH_XLARGE_MODULUS) {
8424 /*
8425 * If we have a full xlarge sparse cluster,
8426 * we push it out all at once so the cluster
8427 * map can be available to absorb more I/Os.
8428 * This is done on large memory configs so
8429 * the small I/Os don't interfere with the
8430 * pro workloads.
8431 */
8432 *push_flag = PUSH_ALL;
8433 }
8434 return KERN_SUCCESS;
8435 }
8436