1 /*
2 * Copyright (c) 2000-2019 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29 /*
30 * Copyright (c) 1989, 1993, 1995
31 * The Regents of the University of California. All rights reserved.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
41 * 3. All advertising materials mentioning features or use of this software
42 * must display the following acknowledgement:
43 * This product includes software developed by the University of
44 * California, Berkeley and its contributors.
45 * 4. Neither the name of the University nor the names of its contributors
46 * may be used to endorse or promote products derived from this software
47 * without specific prior written permission.
48 *
49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59 * SUCH DAMAGE.
60 *
61 * @(#)spec_vnops.c 8.14 (Berkeley) 5/21/95
62 */
63
64 #include <sys/param.h>
65 #include <sys/proc_internal.h>
66 #include <sys/kauth.h>
67 #include <sys/systm.h>
68 #include <sys/kernel.h>
69 #include <sys/conf.h>
70 #include <sys/buf_internal.h>
71 #include <sys/mount_internal.h>
72 #include <sys/vnode_internal.h>
73 #include <sys/file_internal.h>
74 #include <sys/namei.h>
75 #include <sys/stat.h>
76 #include <sys/errno.h>
77 #include <sys/ioctl.h>
78 #include <sys/file.h>
79 #include <sys/user.h>
80 #include <sys/malloc.h>
81 #include <sys/disk.h>
82 #include <sys/uio_internal.h>
83 #include <sys/resource.h>
84 #include <machine/machine_routines.h>
85 #include <miscfs/specfs/specdev.h>
86 #include <vfs/vfs_support.h>
87 #include <vfs/vfs_disk_conditioner.h>
88
89 #include <kern/assert.h>
90 #include <kern/task.h>
91 #include <kern/sched_prim.h>
92 #include <kern/thread.h>
93 #include <kern/policy_internal.h>
94 #include <kern/timer_call.h>
95 #include <kern/waitq.h>
96
97 #include <pexpert/pexpert.h>
98
99 #include <sys/kdebug.h>
100 #include <libkern/section_keywords.h>
101
102 #if CONFIG_IO_COMPRESSION_STATS
103 #include <vfs/vfs_io_compression_stats.h>
104 #endif /* CONFIG_IO_COMPRESSION_STATS */
105
106 /* XXX following three prototypes should be in a header file somewhere */
107 extern dev_t chrtoblk(dev_t dev);
108 extern boolean_t iskmemdev(dev_t dev);
109 extern int bpfkqfilter(dev_t dev, struct knote *kn);
110 extern int ptsd_kqfilter(dev_t, struct knote *);
111 extern int ptmx_kqfilter(dev_t, struct knote *);
112 #if CONFIG_PHYS_WRITE_ACCT
113 uint64_t kernel_pm_writes; // to track the sync writes occurring during power management transitions
114 #endif /* CONFIG_PHYS_WRITE_ACCT */
115
116
117 struct vnode *speclisth[SPECHSZ];
118
119 /* symbolic sleep message strings for devices */
120 char devopn[] = "devopn";
121 char devio[] = "devio";
122 char devwait[] = "devwait";
123 char devin[] = "devin";
124 char devout[] = "devout";
125 char devioc[] = "devioc";
126 char devcls[] = "devcls";
127
#define VOPFUNC int (*)(void *)

/*
 * Vnode operations vector for special files (character/block devices).
 * spec_vnodeop_p is the runtime dispatch vector; it is paired with the
 * entry table below through spec_vnodeop_opv_desc (registered with the
 * VFS layer elsewhere).  Operations that make no sense on a device node
 * map to the generic err_* / nop_* stubs.
 */
int(**spec_vnodeop_p)(void *);
const struct vnodeopv_entry_desc spec_vnodeop_entries[] = {
	{ .opve_op = &vnop_default_desc, .opve_impl = (VOPFUNC)(void (*)(void))vn_default_error },
	{ .opve_op = &vnop_lookup_desc, .opve_impl = (VOPFUNC)spec_lookup },             /* lookup */
	{ .opve_op = &vnop_create_desc, .opve_impl = (VOPFUNC)err_create },              /* create */
	{ .opve_op = &vnop_mknod_desc, .opve_impl = (VOPFUNC)err_mknod },                /* mknod */
	{ .opve_op = &vnop_open_desc, .opve_impl = (VOPFUNC)spec_open },                 /* open */
	{ .opve_op = &vnop_close_desc, .opve_impl = (VOPFUNC)spec_close },               /* close */
	{ .opve_op = &vnop_access_desc, .opve_impl = (VOPFUNC)spec_access },             /* access */
	{ .opve_op = &vnop_getattr_desc, .opve_impl = (VOPFUNC)spec_getattr },           /* getattr */
	{ .opve_op = &vnop_setattr_desc, .opve_impl = (VOPFUNC)spec_setattr },           /* setattr */
	{ .opve_op = &vnop_read_desc, .opve_impl = (VOPFUNC)spec_read },                 /* read */
	{ .opve_op = &vnop_write_desc, .opve_impl = (VOPFUNC)spec_write },               /* write */
	{ .opve_op = &vnop_ioctl_desc, .opve_impl = (VOPFUNC)spec_ioctl },               /* ioctl */
	{ .opve_op = &vnop_select_desc, .opve_impl = (VOPFUNC)spec_select },             /* select */
	{ .opve_op = &vnop_revoke_desc, .opve_impl = (VOPFUNC)nop_revoke },              /* revoke */
	{ .opve_op = &vnop_mmap_desc, .opve_impl = (VOPFUNC)err_mmap },                  /* mmap */
	{ .opve_op = &vnop_fsync_desc, .opve_impl = (VOPFUNC)spec_fsync },               /* fsync */
	{ .opve_op = &vnop_remove_desc, .opve_impl = (VOPFUNC)err_remove },              /* remove */
	{ .opve_op = &vnop_link_desc, .opve_impl = (VOPFUNC)err_link },                  /* link */
	{ .opve_op = &vnop_rename_desc, .opve_impl = (VOPFUNC)err_rename },              /* rename */
	{ .opve_op = &vnop_mkdir_desc, .opve_impl = (VOPFUNC)err_mkdir },                /* mkdir */
	{ .opve_op = &vnop_rmdir_desc, .opve_impl = (VOPFUNC)err_rmdir },                /* rmdir */
	{ .opve_op = &vnop_symlink_desc, .opve_impl = (VOPFUNC)err_symlink },            /* symlink */
	{ .opve_op = &vnop_readdir_desc, .opve_impl = (VOPFUNC)err_readdir },            /* readdir */
	{ .opve_op = &vnop_readlink_desc, .opve_impl = (VOPFUNC)err_readlink },          /* readlink */
	{ .opve_op = &vnop_inactive_desc, .opve_impl = (VOPFUNC)nop_inactive },          /* inactive */
	{ .opve_op = &vnop_reclaim_desc, .opve_impl = (VOPFUNC)nop_reclaim },            /* reclaim */
	{ .opve_op = &vnop_strategy_desc, .opve_impl = (VOPFUNC)spec_strategy },         /* strategy */
	{ .opve_op = &vnop_pathconf_desc, .opve_impl = (VOPFUNC)spec_pathconf },         /* pathconf */
	{ .opve_op = &vnop_advlock_desc, .opve_impl = (VOPFUNC)err_advlock },            /* advlock */
	{ .opve_op = &vnop_bwrite_desc, .opve_impl = (VOPFUNC)spec_bwrite },             /* bwrite */
	{ .opve_op = &vnop_pagein_desc, .opve_impl = (VOPFUNC)err_pagein },              /* Pagein */
	{ .opve_op = &vnop_pageout_desc, .opve_impl = (VOPFUNC)err_pageout },            /* Pageout */
	{ .opve_op = &vnop_copyfile_desc, .opve_impl = (VOPFUNC)err_copyfile },          /* Copyfile */
	{ .opve_op = &vnop_blktooff_desc, .opve_impl = (VOPFUNC)spec_blktooff },         /* blktooff */
	{ .opve_op = &vnop_offtoblk_desc, .opve_impl = (VOPFUNC)spec_offtoblk },         /* offtoblk */
	{ .opve_op = &vnop_blockmap_desc, .opve_impl = (VOPFUNC)spec_blockmap },         /* blockmap */
	{ .opve_op = (struct vnodeop_desc*)NULL, .opve_impl = (int (*)(void *))NULL }    /* terminator */
};
const struct vnodeopv_desc spec_vnodeop_opv_desc =
{ .opv_desc_vector_p = &spec_vnodeop_p, .opv_desc_ops = spec_vnodeop_entries };
172
173
174 static void set_blocksize(vnode_t, dev_t);
175
176 #define LOWPRI_TIER1_WINDOW_MSECS 25
177 #define LOWPRI_TIER2_WINDOW_MSECS 100
178 #define LOWPRI_TIER3_WINDOW_MSECS 500
179
180 #define LOWPRI_TIER1_IO_PERIOD_MSECS 40
181 #define LOWPRI_TIER2_IO_PERIOD_MSECS 85
182 #define LOWPRI_TIER3_IO_PERIOD_MSECS 200
183
184 #define LOWPRI_TIER1_IO_PERIOD_SSD_MSECS 5
185 #define LOWPRI_TIER2_IO_PERIOD_SSD_MSECS 15
186 #define LOWPRI_TIER3_IO_PERIOD_SSD_MSECS 25
187
188
/*
 * Per-tier throttle windows and minimum I/O periods, in milliseconds,
 * indexed by throttle level (element 0 is left zero for the unthrottled
 * level).  The *_ssd_* table supplies shorter periods for solid-state
 * media.  All entries are runtime-tunable via the debug.lowpri_throttle_*
 * sysctls declared later in this file.
 */
int throttle_windows_msecs[THROTTLE_LEVEL_END + 1] = {
	0,
	LOWPRI_TIER1_WINDOW_MSECS,
	LOWPRI_TIER2_WINDOW_MSECS,
	LOWPRI_TIER3_WINDOW_MSECS,
};

int throttle_io_period_msecs[THROTTLE_LEVEL_END + 1] = {
	0,
	LOWPRI_TIER1_IO_PERIOD_MSECS,
	LOWPRI_TIER2_IO_PERIOD_MSECS,
	LOWPRI_TIER3_IO_PERIOD_MSECS,
};

int throttle_io_period_ssd_msecs[THROTTLE_LEVEL_END + 1] = {
	0,
	LOWPRI_TIER1_IO_PERIOD_SSD_MSECS,
	LOWPRI_TIER2_IO_PERIOD_SSD_MSECS,
	LOWPRI_TIER3_IO_PERIOD_SSD_MSECS,
};


/* Per-level counters; maintained by code outside this chunk — TODO confirm users. */
int throttled_count[THROTTLE_LEVEL_END + 1];
212
/*
 * Per-device I/O throttling state.  Statically-sized array instances live
 * in _throttle_io_info[] below; dynamically allocated instances (marked by
 * throttle_alloc) are reference-counted via throttle_refcnt and freed by
 * throttle_info_rel() when the last reference drops.
 */
struct _throttle_io_info_t {
	lck_mtx_t throttle_lock;                /* guards the mutable state below */

	struct timeval throttle_last_write_timestamp;   /* set on each VCHR disk write (see spec_write) */
	struct timeval throttle_min_timer_deadline;
	struct timeval throttle_window_start_timestamp[THROTTLE_LEVEL_END + 1]; /* window starts at both the beginning and completion of an I/O */
	struct timeval throttle_last_IO_timestamp[THROTTLE_LEVEL_END + 1];
	pid_t throttle_last_IO_pid[THROTTLE_LEVEL_END + 1];
	struct timeval throttle_start_IO_period_timestamp[THROTTLE_LEVEL_END + 1];
	int32_t throttle_inflight_count[THROTTLE_LEVEL_END + 1];

	TAILQ_HEAD(, uthread) throttle_uthlist[THROTTLE_LEVEL_END + 1]; /* Lists of throttled uthreads */
	int throttle_next_wake_level;

	thread_call_t throttle_timer_call;      /* deferred wakeup timer (see throttle_timer_start) */
	int32_t throttle_timer_ref;
	int32_t throttle_timer_active;

	int32_t throttle_io_count;
	int32_t throttle_io_count_begin;        /* snapshot of throttle_io_count at period start */
	int *throttle_io_periods;               /* points at one of the throttle_io_period*_msecs tables */
	uint32_t throttle_io_period_num;
	
	int32_t throttle_refcnt;                /* reference count (atomic inc/dec) */
	int32_t throttle_alloc;                 /* non-zero: heap-allocated, freed on last ref */
	int32_t throttle_disabled;
	int32_t throttle_is_fusion_with_priority;
};
241
242 struct _throttle_io_info_t _throttle_io_info[LOWPRI_MAX_NUM_DEV];
243
244
245 int lowpri_throttle_enabled = 1;
246
247
248 static void throttle_info_end_io_internal(struct _throttle_io_info_t *info, int throttle_level);
249 static int throttle_info_update_internal(struct _throttle_io_info_t *info, uthread_t ut, int flags, boolean_t isssd, boolean_t inflight, struct bufattr *bap);
250 static int throttle_get_thread_throttle_level(uthread_t ut);
251 static int throttle_get_thread_throttle_level_internal(uthread_t ut, int io_tier);
252 void throttle_info_mount_reset_period(mount_t mp, int isssd);
253
254 /*
255 * Trivial lookup routine that always fails.
256 */
257 int
spec_lookup(struct vnop_lookup_args * ap)258 spec_lookup(struct vnop_lookup_args *ap)
259 {
260 *ap->a_vpp = NULL;
261 return ENOTDIR;
262 }
263
264 static void
set_blocksize(struct vnode * vp,dev_t dev)265 set_blocksize(struct vnode *vp, dev_t dev)
266 {
267 int (*size)(dev_t);
268 int rsize;
269
270 if ((major(dev) < nblkdev) && (size = bdevsw[major(dev)].d_psize)) {
271 rsize = (*size)(dev);
272 if (rsize <= 0) { /* did size fail? */
273 vp->v_specsize = DEV_BSIZE;
274 } else {
275 vp->v_specsize = rsize;
276 }
277 } else {
278 vp->v_specsize = DEV_BSIZE;
279 }
280 }
281
282 void
set_fsblocksize(struct vnode * vp)283 set_fsblocksize(struct vnode *vp)
284 {
285 if (vp->v_type == VBLK) {
286 dev_t dev = (dev_t)vp->v_rdev;
287 int maj = major(dev);
288
289 if ((u_int)maj >= (u_int)nblkdev) {
290 return;
291 }
292
293 vnode_lock(vp);
294 set_blocksize(vp, dev);
295 vnode_unlock(vp);
296 }
297 }
298
299
300 /*
301 * Open a special file.
302 */
303 int
spec_open(struct vnop_open_args * ap)304 spec_open(struct vnop_open_args *ap)
305 {
306 struct proc *p = vfs_context_proc(ap->a_context);
307 kauth_cred_t cred = vfs_context_ucred(ap->a_context);
308 struct vnode *vp = ap->a_vp;
309 dev_t bdev, dev = (dev_t)vp->v_rdev;
310 int maj = major(dev);
311 int error;
312
313 /*
314 * Don't allow open if fs is mounted -nodev.
315 */
316 if (vp->v_mount && (vp->v_mount->mnt_flag & MNT_NODEV)) {
317 return ENXIO;
318 }
319
320 switch (vp->v_type) {
321 case VCHR:
322 if ((u_int)maj >= (u_int)nchrdev) {
323 return ENXIO;
324 }
325 if (cred != FSCRED && (ap->a_mode & FWRITE)) {
326 #if 0
327 /*
328 * When running in very secure mode, do not allow
329 * opens for writing of any disk character devices.
330 */
331 if (securelevel >= 2 && isdisk(dev, VCHR)) {
332 return EPERM;
333 }
334 #endif
335
336 /* Never allow writing to /dev/mem or /dev/kmem */
337 if (iskmemdev(dev)) {
338 return EPERM;
339 }
340 /*
341 * When running in secure mode, do not allow opens for
342 * writing of character devices whose corresponding block
343 * devices are currently mounted.
344 */
345 if (securelevel >= 1) {
346 if ((bdev = chrtoblk(dev)) != NODEV && check_mountedon(bdev, VBLK, &error)) {
347 return error;
348 }
349 }
350 }
351
352 devsw_lock(dev, S_IFCHR);
353 error = (*cdevsw[maj].d_open)(dev, ap->a_mode, S_IFCHR, p);
354
355 if (error == 0) {
356 vp->v_specinfo->si_opencount++;
357 }
358
359 devsw_unlock(dev, S_IFCHR);
360
361 if (error == 0 && cdevsw[maj].d_type == D_DISK && !vp->v_un.vu_specinfo->si_initted) {
362 int isssd = 0;
363 uint64_t throttle_mask = 0;
364 uint32_t devbsdunit = 0;
365
366 if (VNOP_IOCTL(vp, DKIOCGETTHROTTLEMASK, (caddr_t)&throttle_mask, 0, NULL) == 0) {
367 if (throttle_mask != 0 &&
368 VNOP_IOCTL(vp, DKIOCISSOLIDSTATE, (caddr_t)&isssd, 0, ap->a_context) == 0) {
369 /*
370 * as a reasonable approximation, only use the lowest bit of the mask
371 * to generate a disk unit number
372 */
373 devbsdunit = num_trailing_0(throttle_mask);
374
375 vnode_lock(vp);
376
377 vp->v_un.vu_specinfo->si_isssd = isssd ? 1 : 0;
378 vp->v_un.vu_specinfo->si_devbsdunit = devbsdunit;
379 vp->v_un.vu_specinfo->si_throttle_mask = throttle_mask;
380 vp->v_un.vu_specinfo->si_throttleable = 1;
381 vp->v_un.vu_specinfo->si_initted = 1;
382
383 vnode_unlock(vp);
384 }
385 }
386 if (vp->v_un.vu_specinfo->si_initted == 0) {
387 vnode_lock(vp);
388 vp->v_un.vu_specinfo->si_initted = 1;
389 vnode_unlock(vp);
390 }
391 }
392 return error;
393
394 case VBLK:
395 if ((u_int)maj >= (u_int)nblkdev) {
396 return ENXIO;
397 }
398 /*
399 * When running in very secure mode, do not allow
400 * opens for writing of any disk block devices.
401 */
402 if (securelevel >= 2 && cred != FSCRED &&
403 (ap->a_mode & FWRITE) && bdevsw[maj].d_type == D_DISK) {
404 return EPERM;
405 }
406 /*
407 * Do not allow opens of block devices that are
408 * currently mounted.
409 */
410 if ((error = vfs_mountedon(vp))) {
411 return error;
412 }
413
414 devsw_lock(dev, S_IFBLK);
415 error = (*bdevsw[maj].d_open)(dev, ap->a_mode, S_IFBLK, p);
416 if (!error) {
417 vp->v_specinfo->si_opencount++;
418 }
419 devsw_unlock(dev, S_IFBLK);
420
421 if (!error) {
422 u_int64_t blkcnt;
423 u_int32_t blksize;
424 int setsize = 0;
425 u_int32_t size512 = 512;
426
427
428 if (!VNOP_IOCTL(vp, DKIOCGETBLOCKSIZE, (caddr_t)&blksize, 0, ap->a_context)) {
429 /* Switch to 512 byte sectors (temporarily) */
430
431 if (!VNOP_IOCTL(vp, DKIOCSETBLOCKSIZE, (caddr_t)&size512, FWRITE, ap->a_context)) {
432 /* Get the number of 512 byte physical blocks. */
433 if (!VNOP_IOCTL(vp, DKIOCGETBLOCKCOUNT, (caddr_t)&blkcnt, 0, ap->a_context)) {
434 setsize = 1;
435 }
436 }
437 /* If it doesn't set back, we can't recover */
438 if (VNOP_IOCTL(vp, DKIOCSETBLOCKSIZE, (caddr_t)&blksize, FWRITE, ap->a_context)) {
439 error = ENXIO;
440 }
441 }
442
443
444 vnode_lock(vp);
445 set_blocksize(vp, dev);
446
447 /*
448 * Cache the size in bytes of the block device for later
449 * use by spec_write().
450 */
451 if (setsize) {
452 vp->v_specdevsize = blkcnt * (u_int64_t)size512;
453 } else {
454 vp->v_specdevsize = (u_int64_t)0; /* Default: Can't get */
455 }
456 vnode_unlock(vp);
457 }
458 return error;
459 default:
460 panic("spec_open type");
461 }
462 return 0;
463 }
464
465 /*
466 * Vnode op for read
467 */
468 int
spec_read(struct vnop_read_args * ap)469 spec_read(struct vnop_read_args *ap)
470 {
471 struct vnode *vp = ap->a_vp;
472 struct uio *uio = ap->a_uio;
473 struct buf *bp;
474 daddr64_t bn, nextbn;
475 long bscale;
476 int devBlockSize = 0;
477 size_t bsize, n, on;
478 int error = 0;
479 dev_t dev;
480
481 #if DIAGNOSTIC
482 if (uio->uio_rw != UIO_READ) {
483 panic("spec_read mode");
484 }
485 if (UIO_SEG_IS_USER_SPACE(uio->uio_segflg)) {
486 panic("spec_read proc");
487 }
488 #endif
489 if (uio_resid(uio) == 0) {
490 return 0;
491 }
492
493 switch (vp->v_type) {
494 case VCHR:
495 {
496 struct _throttle_io_info_t *throttle_info = NULL;
497 int thread_throttle_level;
498 uint64_t blkno = 0;
499 uint32_t iolen = 0;
500 int ddisk = 0;
501 int ktrace_code = DKIO_READ;
502 devBlockSize = vp->v_specsize;
503 uintptr_t our_id = 0;
504
505 if (cdevsw[major(vp->v_rdev)].d_type == D_DISK) {
506 ddisk = 1;
507 }
508
509 if (ddisk && vp->v_un.vu_specinfo->si_throttleable) {
510 throttle_info = &_throttle_io_info[vp->v_un.vu_specinfo->si_devbsdunit];
511 thread_throttle_level = throttle_info_update_internal(throttle_info, NULL, 0, vp->v_un.vu_specinfo->si_isssd, TRUE, NULL);
512 }
513
514 if (kdebug_enable && ddisk) {
515 if (devBlockSize == 0) {
516 devBlockSize = 512; // default sector size
517 }
518
519 if (uio_offset(uio) && devBlockSize) {
520 blkno = ((uint64_t) uio_offset(uio) / ((uint64_t)devBlockSize));
521 }
522 iolen = (int) uio_resid(uio);
523 our_id = (uintptr_t)thread_tid(current_thread());
524 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_COMMON,
525 (FSDBG_CODE(DBG_DKRW, ktrace_code)) | DBG_FUNC_NONE, our_id,
526 vp->v_rdev, blkno, iolen, 0);
527 }
528
529 error = (*cdevsw[major(vp->v_rdev)].d_read)
530 (vp->v_rdev, uio, ap->a_ioflag);
531
532
533 if (kdebug_enable && ddisk) {
534 uint32_t residual = (uint32_t)uio_resid(uio);
535 ktrace_code |= DKIO_DONE;
536 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_COMMON,
537 (FSDBG_CODE(DBG_DKRW, ktrace_code)) | DBG_FUNC_NONE, our_id,
538 (uintptr_t)VM_KERNEL_ADDRPERM(vp), residual, error, 0);
539 }
540
541 if (throttle_info) {
542 throttle_info_end_io_internal(throttle_info, thread_throttle_level);
543 }
544
545 return error;
546 }
547
548 case VBLK:
549 if (uio->uio_offset < 0) {
550 return EINVAL;
551 }
552
553 dev = vp->v_rdev;
554
555 devBlockSize = vp->v_specsize;
556
557 if (devBlockSize > PAGE_SIZE) {
558 return EINVAL;
559 }
560
561 bscale = PAGE_SIZE / devBlockSize;
562 bsize = bscale * devBlockSize;
563
564 do {
565 on = uio->uio_offset % bsize;
566
567 bn = (daddr64_t)((uio->uio_offset / devBlockSize) & ~(bscale - 1));
568
569 if (vp->v_speclastr + bscale == bn) {
570 nextbn = bn + bscale;
571 error = buf_breadn(vp, bn, (int)bsize, &nextbn,
572 (int *)&bsize, 1, NOCRED, &bp);
573 } else {
574 error = buf_bread(vp, bn, (int)bsize, NOCRED, &bp);
575 }
576
577 vnode_lock(vp);
578 vp->v_speclastr = bn;
579 vnode_unlock(vp);
580
581 n = bsize - buf_resid(bp);
582 if ((on > n) || error) {
583 if (!error) {
584 error = EINVAL;
585 }
586 buf_brelse(bp);
587 return error;
588 }
589 n = MIN((n - on), (size_t)uio_resid(uio));
590
591 error = uiomove((char *)buf_dataptr(bp) + on, (int)n, uio);
592 if (n + on == bsize) {
593 buf_markaged(bp);
594 }
595 buf_brelse(bp);
596 } while (error == 0 && uio_resid(uio) > 0 && n != 0);
597 return error;
598
599 default:
600 panic("spec_read type");
601 }
602 /* NOTREACHED */
603
604 return 0;
605 }
606
607 /*
608 * Vnode op for write
609 */
610 int
spec_write(struct vnop_write_args * ap)611 spec_write(struct vnop_write_args *ap)
612 {
613 struct vnode *vp = ap->a_vp;
614 struct uio *uio = ap->a_uio;
615 struct buf *bp;
616 daddr64_t bn;
617 int blkmask, bscale;
618 int io_sync;
619 int devBlockSize = 0;
620 size_t bsize, n, on;
621 int error = 0;
622 dev_t dev;
623
624 #if DIAGNOSTIC
625 if (uio->uio_rw != UIO_WRITE) {
626 panic("spec_write mode");
627 }
628 if (UIO_SEG_IS_USER_SPACE(uio->uio_segflg)) {
629 panic("spec_write proc");
630 }
631 #endif
632
633 switch (vp->v_type) {
634 case VCHR:
635 {
636 struct _throttle_io_info_t *throttle_info = NULL;
637 int thread_throttle_level;
638 dev = vp->v_rdev;
639 devBlockSize = vp->v_specsize;
640 uint32_t iolen = 0;
641 uint64_t blkno = 0;
642 int ddisk = 0;
643 int ktrace_code = 0; // write is implied; read must be OR'd in.
644 uintptr_t our_id = 0;
645
646 if (cdevsw[major(dev)].d_type == D_DISK) {
647 ddisk = 1;
648 }
649
650 if (ddisk && vp->v_un.vu_specinfo->si_throttleable) {
651 throttle_info = &_throttle_io_info[vp->v_un.vu_specinfo->si_devbsdunit];
652
653 thread_throttle_level = throttle_info_update_internal(throttle_info, NULL, 0, vp->v_un.vu_specinfo->si_isssd, TRUE, NULL);
654
655 microuptime(&throttle_info->throttle_last_write_timestamp);
656 }
657
658 if (kdebug_enable && ddisk) {
659 if (devBlockSize == 0) {
660 devBlockSize = 512; // default sector size
661 }
662 if ((uio_offset(uio) != 0) && devBlockSize) {
663 blkno = ((uint64_t)uio_offset(uio)) / ((uint64_t)devBlockSize);
664 }
665 iolen = (int)uio_resid(uio);
666 our_id = (uintptr_t)thread_tid(current_thread());
667 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_COMMON,
668 (FSDBG_CODE(DBG_DKRW, ktrace_code)) | DBG_FUNC_NONE, our_id,
669 vp->v_rdev, blkno, iolen, 0);
670 }
671 error = (*cdevsw[major(vp->v_rdev)].d_write)
672 (vp->v_rdev, uio, ap->a_ioflag);
673
674 if (kdebug_enable && ddisk) {
675 //emit the I/O completion
676 uint32_t residual = (uint32_t)uio_resid(uio);
677 ktrace_code |= DKIO_DONE;
678 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_COMMON,
679 (FSDBG_CODE(DBG_DKRW, ktrace_code)) | DBG_FUNC_NONE, our_id,
680 (uintptr_t)VM_KERNEL_ADDRPERM(vp), residual, error, 0);
681 }
682
683 if (throttle_info) {
684 throttle_info_end_io_internal(throttle_info, thread_throttle_level);
685 }
686
687 return error;
688 }
689
690 case VBLK:
691 if (uio_resid(uio) == 0) {
692 return 0;
693 }
694 if (uio->uio_offset < 0) {
695 return EINVAL;
696 }
697
698 io_sync = (ap->a_ioflag & IO_SYNC);
699
700 dev = (vp->v_rdev);
701
702 devBlockSize = vp->v_specsize;
703 if (devBlockSize > PAGE_SIZE) {
704 return EINVAL;
705 }
706
707 bscale = PAGE_SIZE / devBlockSize;
708 blkmask = bscale - 1;
709 bsize = bscale * devBlockSize;
710
711
712 do {
713 bn = (daddr64_t)((uio->uio_offset / devBlockSize) & ~blkmask);
714 on = uio->uio_offset % bsize;
715
716 n = MIN((bsize - on), (size_t)uio_resid(uio));
717
718 /*
719 * Use buf_getblk() as an optimization IFF:
720 *
721 * 1) We are reading exactly a block on a block
722 * aligned boundary
723 * 2) We know the size of the device from spec_open
724 * 3) The read doesn't span the end of the device
725 *
726 * Otherwise, we fall back on buf_bread().
727 */
728 if (n == bsize &&
729 vp->v_specdevsize != (u_int64_t)0 &&
730 (uio->uio_offset + (u_int64_t)n) > vp->v_specdevsize) {
731 /* reduce the size of the read to what is there */
732 n = (uio->uio_offset + (u_int64_t)n) - vp->v_specdevsize;
733 }
734
735 if (n == bsize) {
736 bp = buf_getblk(vp, bn, (int)bsize, 0, 0, BLK_WRITE);
737 } else {
738 error = (int)buf_bread(vp, bn, (int)bsize, NOCRED, &bp);
739 }
740
741 /* Translate downstream error for upstream, if needed */
742 if (!error) {
743 error = (int)buf_error(bp);
744 }
745 if (error) {
746 buf_brelse(bp);
747 return error;
748 }
749 n = MIN(n, bsize - buf_resid(bp));
750
751 error = uiomove((char *)buf_dataptr(bp) + on, (int)n, uio);
752 if (error) {
753 buf_brelse(bp);
754 return error;
755 }
756 buf_markaged(bp);
757
758 if (io_sync) {
759 error = buf_bwrite(bp);
760 } else {
761 if ((n + on) == bsize) {
762 error = buf_bawrite(bp);
763 } else {
764 error = buf_bdwrite(bp);
765 }
766 }
767 } while (error == 0 && uio_resid(uio) > 0 && n != 0);
768 return error;
769
770 default:
771 panic("spec_write type");
772 }
773 /* NOTREACHED */
774
775 return 0;
776 }
777
778 /*
779 * Device ioctl operation.
780 */
781 int
spec_ioctl(struct vnop_ioctl_args * ap)782 spec_ioctl(struct vnop_ioctl_args *ap)
783 {
784 proc_t p = vfs_context_proc(ap->a_context);
785 dev_t dev = ap->a_vp->v_rdev;
786 int retval = 0;
787
788 KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_IOCTL, 0) | DBG_FUNC_START,
789 dev, ap->a_command, ap->a_fflag, ap->a_vp->v_type, 0);
790
791 switch (ap->a_vp->v_type) {
792 case VCHR:
793 retval = (*cdevsw[major(dev)].d_ioctl)(dev, ap->a_command, ap->a_data,
794 ap->a_fflag, p);
795 break;
796
797 case VBLK:
798 retval = (*bdevsw[major(dev)].d_ioctl)(dev, ap->a_command, ap->a_data, ap->a_fflag, p);
799 if (!retval && ap->a_command == DKIOCSETBLOCKSIZE) {
800 ap->a_vp->v_specsize = *(uint32_t *)ap->a_data;
801 }
802 break;
803
804 default:
805 panic("spec_ioctl");
806 /* NOTREACHED */
807 }
808 KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_IOCTL, 0) | DBG_FUNC_END,
809 dev, ap->a_command, ap->a_fflag, retval, 0);
810
811 return retval;
812 }
813
814 int
spec_select(struct vnop_select_args * ap)815 spec_select(struct vnop_select_args *ap)
816 {
817 proc_t p = vfs_context_proc(ap->a_context);
818 dev_t dev;
819
820 switch (ap->a_vp->v_type) {
821 default:
822 return 1; /* XXX */
823
824 case VCHR:
825 dev = ap->a_vp->v_rdev;
826 return (*cdevsw[major(dev)].d_select)(dev, ap->a_which, ap->a_wql, p);
827 }
828 }
829
/*
 * Attach a knote to a character special device.
 *
 * Attach order: BPF devices get first refusal (NETWORKING builds); then,
 * after a bounds check on the major number, pseudo-terminal slaves and
 * masters get their dedicated filters; TTY-class drivers that support
 * kqueue use the TTY filter; everything else falls through to the generic
 * spec filter.  Returns via knote_set_error / the chosen filter's attach.
 */
int
spec_kqfilter(vnode_t vp, struct knote *kn, struct kevent_qos_s *kev)
{
	dev_t dev;

	assert(vnode_ischr(vp));

	dev = vnode_specrdev(vp);

#if NETWORKING
	/*
	 * Try a bpf device, as defined in bsd/net/bpf.c
	 * If it doesn't error out the attach, then it
	 * claimed it. Otherwise, fall through and try
	 * other attaches.
	 */
	int32_t tmp_flags = kn->kn_flags;
	int64_t tmp_sdata = kn->kn_sdata;
	int res;

	res = bpfkqfilter(dev, kn);
	if ((kn->kn_flags & EV_ERROR) == 0) {
		return res;
	}
	/* bpf declined: restore the fields it may have scribbled on. */
	kn->kn_flags = tmp_flags;
	kn->kn_sdata = tmp_sdata;
#endif

	if (major(dev) >= nchrdev) {
		knote_set_error(kn, ENXIO);
		return 0;
	}

	kn->kn_vnode_kqok = !!(cdevsw_flags[major(dev)] & CDEVSW_SELECT_KQUEUE);
	kn->kn_vnode_use_ofst = !!(cdevsw_flags[major(dev)] & CDEVSW_USE_OFFSET);

	if (cdevsw_flags[major(dev)] & CDEVSW_IS_PTS) {
		kn->kn_filtid = EVFILTID_PTSD;
		return ptsd_kqfilter(dev, kn);
	} else if (cdevsw_flags[major(dev)] & CDEVSW_IS_PTC) {
		kn->kn_filtid = EVFILTID_PTMX;
		return ptmx_kqfilter(dev, kn);
	} else if (cdevsw[major(dev)].d_type == D_TTY && kn->kn_vnode_kqok) {
		/*
		 * TTYs from drivers that use struct ttys use their own filter
		 * routines.  The PTC driver doesn't use the tty for character
		 * counts, so it must go through the select fallback.
		 */
		kn->kn_filtid = EVFILTID_TTY;
	} else {
		/* Try to attach to other char special devices */
		kn->kn_filtid = EVFILTID_SPEC;
	}

	return knote_fops(kn)->f_attach(kn, kev);
}
886
887 /*
888 * Synch buffers associated with a block device
889 */
890 int
spec_fsync_internal(vnode_t vp,int waitfor,__unused vfs_context_t context)891 spec_fsync_internal(vnode_t vp, int waitfor, __unused vfs_context_t context)
892 {
893 if (vp->v_type == VCHR) {
894 return 0;
895 }
896 /*
897 * Flush all dirty buffers associated with a block device.
898 */
899 buf_flushdirtyblks(vp, (waitfor == MNT_WAIT || waitfor == MNT_DWAIT), 0, "spec_fsync");
900
901 return 0;
902 }
903
/* VNOP_FSYNC entry point: thin wrapper around spec_fsync_internal(). */
int
spec_fsync(struct vnop_fsync_args *ap)
{
	return spec_fsync_internal(ap->a_vp, ap->a_waitfor, ap->a_context);
}
909
910
911 /*
912 * Just call the device strategy routine
913 */
914 void throttle_init(void);
915
916
917 #if 0
918 #define DEBUG_ALLOC_THROTTLE_INFO(format, debug_info, args...) \
919 do { \
920 if ((debug_info)->alloc) \
921 printf("%s: "format, __FUNCTION__, ## args); \
922 } while(0)
923
924 #else
925 #define DEBUG_ALLOC_THROTTLE_INFO(format, debug_info, args...)
926 #endif
927
928
929 SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_tier1_window_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &throttle_windows_msecs[THROTTLE_LEVEL_TIER1], 0, "");
930 SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_tier2_window_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &throttle_windows_msecs[THROTTLE_LEVEL_TIER2], 0, "");
931 SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_tier3_window_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &throttle_windows_msecs[THROTTLE_LEVEL_TIER3], 0, "");
932
933 SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_tier1_io_period_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &throttle_io_period_msecs[THROTTLE_LEVEL_TIER1], 0, "");
934 SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_tier2_io_period_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &throttle_io_period_msecs[THROTTLE_LEVEL_TIER2], 0, "");
935 SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_tier3_io_period_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &throttle_io_period_msecs[THROTTLE_LEVEL_TIER3], 0, "");
936
937 SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_tier1_io_period_ssd_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &throttle_io_period_ssd_msecs[THROTTLE_LEVEL_TIER1], 0, "");
938 SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_tier2_io_period_ssd_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &throttle_io_period_ssd_msecs[THROTTLE_LEVEL_TIER2], 0, "");
939 SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_tier3_io_period_ssd_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &throttle_io_period_ssd_msecs[THROTTLE_LEVEL_TIER3], 0, "");
940
941 SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_enabled, CTLFLAG_RW | CTLFLAG_LOCKED, &lowpri_throttle_enabled, 0, "");
942
943
944 static LCK_GRP_DECLARE(throttle_lock_grp, "throttle I/O");
945
946
947 /*
948 * throttled I/O helper function
949 * convert the index of the lowest set bit to a device index
950 */
951 int
num_trailing_0(uint64_t n)952 num_trailing_0(uint64_t n)
953 {
954 /*
955 * since in most cases the number of trailing 0s is very small,
956 * we simply counting sequentially from the lowest bit
957 */
958 if (n == 0) {
959 return sizeof(n) * 8;
960 }
961 int count = 0;
962 while (!ISSET(n, 1)) {
963 n >>= 1;
964 ++count;
965 }
966 return count;
967 }
968
969
970 /*
971 * Release the reference and if the item was allocated and this is the last
972 * reference then free it.
973 *
974 * This routine always returns the old value.
975 */
976 static int
throttle_info_rel(struct _throttle_io_info_t * info)977 throttle_info_rel(struct _throttle_io_info_t *info)
978 {
979 SInt32 oldValue = OSDecrementAtomic(&info->throttle_refcnt);
980
981 DEBUG_ALLOC_THROTTLE_INFO("refcnt = %d info = %p\n",
982 info, (int)(oldValue - 1), info );
983
984 /* The reference count just went negative, very bad */
985 if (oldValue == 0) {
986 panic("throttle info ref cnt went negative!");
987 }
988
989 /*
990 * Once reference count is zero, no one else should be able to take a
991 * reference
992 */
993 if ((oldValue == 1) && (info->throttle_alloc)) {
994 DEBUG_ALLOC_THROTTLE_INFO("Freeing info = %p\n", info);
995
996 lck_mtx_destroy(&info->throttle_lock, &throttle_lock_grp);
997 kfree_type(struct _throttle_io_info_t, info);
998 }
999 return oldValue;
1000 }
1001
1002
1003 /*
1004 * Just take a reference on the throttle info structure.
1005 *
1006 * This routine always returns the old value.
1007 */
1008 static SInt32
throttle_info_ref(struct _throttle_io_info_t * info)1009 throttle_info_ref(struct _throttle_io_info_t *info)
1010 {
1011 SInt32 oldValue = OSIncrementAtomic(&info->throttle_refcnt);
1012
1013 DEBUG_ALLOC_THROTTLE_INFO("refcnt = %d info = %p\n",
1014 info, (int)(oldValue - 1), info );
1015 /* Allocated items should never have a reference of zero */
1016 if (info->throttle_alloc && (oldValue == 0)) {
1017 panic("Taking a reference without calling create throttle info!");
1018 }
1019
1020 return oldValue;
1021 }
1022
1023 /*
1024 * on entry the throttle_lock is held...
1025 * this function is responsible for taking
1026 * and dropping the reference on the info
1027 * structure which will keep it from going
1028 * away while the timer is running if it
1029 * happens to have been dynamically allocated by
1030 * a network fileystem kext which is now trying
1031 * to free it
1032 */
1033 static uint32_t
throttle_timer_start(struct _throttle_io_info_t * info,boolean_t update_io_count,int wakelevel)1034 throttle_timer_start(struct _throttle_io_info_t *info, boolean_t update_io_count, int wakelevel)
1035 {
1036 struct timeval elapsed;
1037 struct timeval now;
1038 struct timeval period;
1039 uint64_t elapsed_msecs;
1040 int throttle_level;
1041 int level;
1042 int msecs;
1043 boolean_t throttled = FALSE;
1044 boolean_t need_timer = FALSE;
1045
1046 microuptime(&now);
1047
1048 if (update_io_count == TRUE) {
1049 info->throttle_io_count_begin = info->throttle_io_count;
1050 info->throttle_io_period_num++;
1051
1052 while (wakelevel >= THROTTLE_LEVEL_THROTTLED) {
1053 info->throttle_start_IO_period_timestamp[wakelevel--] = now;
1054 }
1055
1056 info->throttle_min_timer_deadline = now;
1057
1058 msecs = info->throttle_io_periods[THROTTLE_LEVEL_THROTTLED];
1059 period.tv_sec = msecs / 1000;
1060 period.tv_usec = (msecs % 1000) * 1000;
1061
1062 timevaladd(&info->throttle_min_timer_deadline, &period);
1063 }
1064 for (throttle_level = THROTTLE_LEVEL_START; throttle_level < THROTTLE_LEVEL_END; throttle_level++) {
1065 elapsed = now;
1066 timevalsub(&elapsed, &info->throttle_window_start_timestamp[throttle_level]);
1067 elapsed_msecs = (uint64_t)elapsed.tv_sec * (uint64_t)1000 + (elapsed.tv_usec / 1000);
1068
1069 for (level = throttle_level + 1; level <= THROTTLE_LEVEL_END; level++) {
1070 if (!TAILQ_EMPTY(&info->throttle_uthlist[level])) {
1071 if (elapsed_msecs < (uint64_t)throttle_windows_msecs[level] || info->throttle_inflight_count[throttle_level]) {
1072 /*
1073 * we had an I/O occur at a higher priority tier within
1074 * this tier's throttle window
1075 */
1076 throttled = TRUE;
1077 }
1078 /*
1079 * we assume that the windows are the same or longer
1080 * as we drop through the throttling tiers... thus
1081 * we can stop looking once we run into a tier with
1082 * threads to schedule regardless of whether it's
1083 * still in its throttling window or not
1084 */
1085 break;
1086 }
1087 }
1088 if (throttled == TRUE) {
1089 break;
1090 }
1091 }
1092 if (throttled == TRUE) {
1093 uint64_t deadline = 0;
1094 struct timeval target;
1095 struct timeval min_target;
1096
1097 /*
1098 * we've got at least one tier still in a throttled window
1099 * so we need a timer running... compute the next deadline
1100 * and schedule it
1101 */
1102 for (level = throttle_level + 1; level <= THROTTLE_LEVEL_END; level++) {
1103 if (TAILQ_EMPTY(&info->throttle_uthlist[level])) {
1104 continue;
1105 }
1106
1107 target = info->throttle_start_IO_period_timestamp[level];
1108
1109 msecs = info->throttle_io_periods[level];
1110 period.tv_sec = msecs / 1000;
1111 period.tv_usec = (msecs % 1000) * 1000;
1112
1113 timevaladd(&target, &period);
1114
1115 if (need_timer == FALSE || timevalcmp(&target, &min_target, <)) {
1116 min_target = target;
1117 need_timer = TRUE;
1118 }
1119 }
1120 if (timevalcmp(&info->throttle_min_timer_deadline, &now, >)) {
1121 if (timevalcmp(&info->throttle_min_timer_deadline, &min_target, >)) {
1122 min_target = info->throttle_min_timer_deadline;
1123 }
1124 }
1125
1126 if (info->throttle_timer_active) {
1127 if (thread_call_cancel(info->throttle_timer_call) == FALSE) {
1128 /*
1129 * couldn't kill the timer because it's already
1130 * been dispatched, so don't try to start a new
1131 * one... once we drop the lock, the timer will
1132 * proceed and eventually re-run this function
1133 */
1134 need_timer = FALSE;
1135 } else {
1136 info->throttle_timer_active = 0;
1137 }
1138 }
1139 if (need_timer == TRUE) {
1140 /*
1141 * This is defined as an int (32-bit) rather than a 64-bit
1142 * value because it would need a really big period in the
1143 * order of ~500 days to overflow this. So, we let this be
1144 * 32-bit which allows us to use the clock_interval_to_deadline()
1145 * routine.
1146 */
1147 int target_msecs;
1148
1149 if (info->throttle_timer_ref == 0) {
1150 /*
1151 * take a reference for the timer
1152 */
1153 throttle_info_ref(info);
1154
1155 info->throttle_timer_ref = 1;
1156 }
1157 elapsed = min_target;
1158 timevalsub(&elapsed, &now);
1159 target_msecs = (int)(elapsed.tv_sec * 1000 + elapsed.tv_usec / 1000);
1160
1161 if (target_msecs <= 0) {
1162 /*
1163 * we may have computed a deadline slightly in the past
1164 * due to various factors... if so, just set the timer
1165 * to go off in the near future (we don't need to be precise)
1166 */
1167 target_msecs = 1;
1168 }
1169 clock_interval_to_deadline(target_msecs, 1000000, &deadline);
1170
1171 thread_call_enter_delayed(info->throttle_timer_call, deadline);
1172 info->throttle_timer_active = 1;
1173 }
1174 }
1175 return throttle_level;
1176 }
1177
1178
/*
 * Timer callback (runs via thread_call): close out expired I/O periods,
 * wake at most one thread from the next eligible wake level, rearm the
 * timer through throttle_timer_start(), and release all threads queued
 * at tiers that are no longer inside a throttle window.  Drops the
 * timer's info reference when the timer does not get rearmed.
 */
static void
throttle_timer(struct _throttle_io_info_t *info, __unused thread_call_param_t p)
{
	uthread_t ut, utlist;
	struct timeval elapsed;
	struct timeval now;
	uint64_t elapsed_msecs;
	int throttle_level;
	int level;
	int wake_level;
	caddr_t wake_address = NULL;
	boolean_t update_io_count = FALSE;
	boolean_t need_wakeup = FALSE;
	boolean_t need_release = FALSE;

	ut = NULL;
	lck_mtx_lock(&info->throttle_lock);

	info->throttle_timer_active = 0;
	microuptime(&now);

	elapsed = now;
	timevalsub(&elapsed, &info->throttle_start_IO_period_timestamp[THROTTLE_LEVEL_THROTTLED]);
	elapsed_msecs = (uint64_t)elapsed.tv_sec * (uint64_t)1000 + (elapsed.tv_usec / 1000);

	if (elapsed_msecs >= (uint64_t)info->throttle_io_periods[THROTTLE_LEVEL_THROTTLED]) {
		/*
		 * at least the base throttled period has elapsed... rotate
		 * through the tiers starting at throttle_next_wake_level so
		 * each tier gets a fair chance at the next wakeup
		 */
		wake_level = info->throttle_next_wake_level;

		for (level = THROTTLE_LEVEL_START; level < THROTTLE_LEVEL_END; level++) {
			elapsed = now;
			timevalsub(&elapsed, &info->throttle_start_IO_period_timestamp[wake_level]);
			elapsed_msecs = (uint64_t)elapsed.tv_sec * (uint64_t)1000 + (elapsed.tv_usec / 1000);

			if (elapsed_msecs >= (uint64_t)info->throttle_io_periods[wake_level] && !TAILQ_EMPTY(&info->throttle_uthlist[wake_level])) {
				/*
				 * we're closing out the current IO period...
				 * if we have a waiting thread, wake it up
				 * after we have reset the I/O window info
				 */
				need_wakeup = TRUE;
				update_io_count = TRUE;

				info->throttle_next_wake_level = wake_level - 1;

				if (info->throttle_next_wake_level == THROTTLE_LEVEL_START) {
					/* wrap the rotation back to the lowest-priority tier */
					info->throttle_next_wake_level = THROTTLE_LEVEL_END;
				}

				break;
			}
			wake_level--;

			if (wake_level == THROTTLE_LEVEL_START) {
				wake_level = THROTTLE_LEVEL_END;
			}
		}
	}
	if (need_wakeup == TRUE) {
		if (!TAILQ_EMPTY(&info->throttle_uthlist[wake_level])) {
			/* dequeue the first waiter; the actual wakeup() happens
			 * after the timer state has been rearmed below */
			ut = (uthread_t)TAILQ_FIRST(&info->throttle_uthlist[wake_level]);
			TAILQ_REMOVE(&info->throttle_uthlist[wake_level], ut, uu_throttlelist);
			ut->uu_on_throttlelist = THROTTLE_LEVEL_NONE;
			ut->uu_is_throttled = false;

			wake_address = (caddr_t)&ut->uu_on_throttlelist;
		}
	} else {
		wake_level = THROTTLE_LEVEL_START;
	}

	throttle_level = throttle_timer_start(info, update_io_count, wake_level);

	if (wake_address != NULL) {
		wakeup(wake_address);
	}

	/* release every thread queued at a tier no longer being throttled */
	for (level = THROTTLE_LEVEL_THROTTLED; level <= throttle_level; level++) {
		TAILQ_FOREACH_SAFE(ut, &info->throttle_uthlist[level], uu_throttlelist, utlist) {
			TAILQ_REMOVE(&info->throttle_uthlist[level], ut, uu_throttlelist);
			ut->uu_on_throttlelist = THROTTLE_LEVEL_NONE;
			ut->uu_is_throttled = false;

			wakeup(&ut->uu_on_throttlelist);
		}
	}
	if (info->throttle_timer_active == 0 && info->throttle_timer_ref) {
		/* timer was not rearmed: give back the timer's reference
		 * (outside the lock, since the release may free the info) */
		info->throttle_timer_ref = 0;
		need_release = TRUE;
	}
	lck_mtx_unlock(&info->throttle_lock);

	if (need_release == TRUE) {
		throttle_info_rel(info);
	}
}
1274
1275
/*
 * Queue 'ut' on the throttle list for tier 'mylevel' and (re)arm the
 * throttle timer if this tier's list was previously empty.
 * Called with info->throttle_lock held.
 *
 * Returns the level reported by throttle_timer_start(); a return of
 * THROTTLE_LEVEL_END means no throttle window is active and the thread
 * has already been taken back off the list.
 */
static int
throttle_add_to_list(struct _throttle_io_info_t *info, uthread_t ut, int mylevel, boolean_t insert_tail)
{
	boolean_t start_timer = FALSE;
	int level = THROTTLE_LEVEL_START;

	if (TAILQ_EMPTY(&info->throttle_uthlist[mylevel])) {
		/* first waiter at this tier: restart its I/O period from the last I/O */
		info->throttle_start_IO_period_timestamp[mylevel] = info->throttle_last_IO_timestamp[mylevel];
		start_timer = TRUE;
	}

	if (insert_tail == TRUE) {
		TAILQ_INSERT_TAIL(&info->throttle_uthlist[mylevel], ut, uu_throttlelist);
	} else {
		TAILQ_INSERT_HEAD(&info->throttle_uthlist[mylevel], ut, uu_throttlelist);
	}

	ut->uu_on_throttlelist = (int8_t)mylevel;

	if (start_timer == TRUE) {
		/* we may need to start or rearm the timer */
		level = throttle_timer_start(info, FALSE, THROTTLE_LEVEL_START);

		if (level == THROTTLE_LEVEL_END) {
			/* no active window after all: undo the enqueue */
			if (ut->uu_on_throttlelist >= THROTTLE_LEVEL_THROTTLED) {
				TAILQ_REMOVE(&info->throttle_uthlist[ut->uu_on_throttlelist], ut, uu_throttlelist);

				ut->uu_on_throttlelist = THROTTLE_LEVEL_NONE;
			}
		}
	}
	return level;
}
1309
/*
 * Initialize the global per-tier throttle window sizes, applying
 * device-tree and boot-arg overrides on top of the compiled defaults.
 */
static void
throttle_init_throttle_window(void)
{
	int throttle_window_size;

	/*
	 * The hierarchy of throttle window values is as follows:
	 * - Global defaults
	 * - Device tree properties
	 * - Boot-args
	 * All values are specified in msecs.
	 */

#if (XNU_TARGET_OS_OSX && __arm64__)
	/*
	 * IO Tier EDT overrides are meant for
	 * some arm platforms but not for
	 * macs.
	 */
#else /* (XNU_TARGET_OS_OSX && __arm64__) */
	/* Override global values with device-tree properties */
	if (PE_get_default("kern.io_throttle_window_tier1", &throttle_window_size, sizeof(throttle_window_size))) {
		throttle_windows_msecs[THROTTLE_LEVEL_TIER1] = throttle_window_size;
	}

	if (PE_get_default("kern.io_throttle_window_tier2", &throttle_window_size, sizeof(throttle_window_size))) {
		throttle_windows_msecs[THROTTLE_LEVEL_TIER2] = throttle_window_size;
	}

	if (PE_get_default("kern.io_throttle_window_tier3", &throttle_window_size, sizeof(throttle_window_size))) {
		throttle_windows_msecs[THROTTLE_LEVEL_TIER3] = throttle_window_size;
	}
#endif /* (XNU_TARGET_OS_OSX && __arm64__) */

	/* Override with boot-args (these win over device-tree values) */
	if (PE_parse_boot_argn("io_throttle_window_tier1", &throttle_window_size, sizeof(throttle_window_size))) {
		throttle_windows_msecs[THROTTLE_LEVEL_TIER1] = throttle_window_size;
	}

	if (PE_parse_boot_argn("io_throttle_window_tier2", &throttle_window_size, sizeof(throttle_window_size))) {
		throttle_windows_msecs[THROTTLE_LEVEL_TIER2] = throttle_window_size;
	}

	if (PE_parse_boot_argn("io_throttle_window_tier3", &throttle_window_size, sizeof(throttle_window_size))) {
		throttle_windows_msecs[THROTTLE_LEVEL_TIER3] = throttle_window_size;
	}
}
1357
/*
 * Point 'info' at the appropriate per-tier I/O period table (SSD vs
 * non-SSD) and apply device-tree / boot-arg overrides.
 *
 * NOTE(review): the overrides write through info->throttle_io_periods,
 * which aliases one of the two shared global arrays — so an override
 * appears to affect every info using that table, not just this one;
 * confirm this is intended.
 */
static void
throttle_init_throttle_period(struct _throttle_io_info_t *info, boolean_t isssd)
{
	int throttle_period_size;

	/*
	 * The hierarchy of throttle period values is as follows:
	 * - Global defaults
	 * - Device tree properties
	 * - Boot-args
	 * All values are specified in msecs.
	 */

	/* Assign global defaults */
	if ((isssd == TRUE) && (info->throttle_is_fusion_with_priority == 0)) {
		info->throttle_io_periods = &throttle_io_period_ssd_msecs[0];
	} else {
		info->throttle_io_periods = &throttle_io_period_msecs[0];
	}

#if (XNU_TARGET_OS_OSX && __arm64__)
	/*
	 * IO Tier EDT overrides are meant for
	 * some arm platforms but not for
	 * macs.
	 */
#else /* (XNU_TARGET_OS_OSX && __arm64__) */
	/* Override global values with device-tree properties */
	if (PE_get_default("kern.io_throttle_period_tier1", &throttle_period_size, sizeof(throttle_period_size))) {
		info->throttle_io_periods[THROTTLE_LEVEL_TIER1] = throttle_period_size;
	}

	if (PE_get_default("kern.io_throttle_period_tier2", &throttle_period_size, sizeof(throttle_period_size))) {
		info->throttle_io_periods[THROTTLE_LEVEL_TIER2] = throttle_period_size;
	}

	if (PE_get_default("kern.io_throttle_period_tier3", &throttle_period_size, sizeof(throttle_period_size))) {
		info->throttle_io_periods[THROTTLE_LEVEL_TIER3] = throttle_period_size;
	}
#endif /* (XNU_TARGET_OS_OSX && __arm64__) */

	/* Override with boot-args (these win over device-tree values) */
	if (PE_parse_boot_argn("io_throttle_period_tier1", &throttle_period_size, sizeof(throttle_period_size))) {
		info->throttle_io_periods[THROTTLE_LEVEL_TIER1] = throttle_period_size;
	}

	if (PE_parse_boot_argn("io_throttle_period_tier2", &throttle_period_size, sizeof(throttle_period_size))) {
		info->throttle_io_periods[THROTTLE_LEVEL_TIER2] = throttle_period_size;
	}

	if (PE_parse_boot_argn("io_throttle_period_tier3", &throttle_period_size, sizeof(throttle_period_size))) {
		info->throttle_io_periods[THROTTLE_LEVEL_TIER3] = throttle_period_size;
	}
}
1412
#if CONFIG_IOSCHED
extern void vm_io_reprioritize_init(void);
/* I/O scheduling on by default; overridable via the "iosched" boot-arg in throttle_init() */
int iosched_enabled = 1;
#endif
1417
/*
 * Initialize the static per-device throttle info array: locks, timer
 * thread calls, and per-tier wait queues.  Also honors the "iosched"
 * boot-arg and kicks off the VM I/O reprioritization machinery when
 * I/O scheduling is enabled.
 */
void
throttle_init(void)
{
	struct _throttle_io_info_t *info;
	int i;
	int level;
#if CONFIG_IOSCHED
	int iosched;
#endif

	/* Update throttle parameters based on device tree configuration */
	throttle_init_throttle_window();

	for (i = 0; i < LOWPRI_MAX_NUM_DEV; i++) {
		info = &_throttle_io_info[i];

		lck_mtx_init(&info->throttle_lock, &throttle_lock_grp, LCK_ATTR_NULL);
		info->throttle_timer_call = thread_call_allocate((thread_call_func_t)throttle_timer, (thread_call_param_t)info);

		for (level = 0; level <= THROTTLE_LEVEL_END; level++) {
			TAILQ_INIT(&info->throttle_uthlist[level]);
			info->throttle_last_IO_pid[level] = 0;
			info->throttle_inflight_count[level] = 0;
		}
		info->throttle_next_wake_level = THROTTLE_LEVEL_END;
		info->throttle_disabled = 0;
		info->throttle_is_fusion_with_priority = 0;
	}
#if CONFIG_IOSCHED
	if (PE_parse_boot_argn("iosched", &iosched, sizeof(iosched))) {
		iosched_enabled = iosched;
	}
	if (iosched_enabled) {
		/* Initialize I/O Reprioritization mechanism */
		vm_io_reprioritize_init();
	}
#endif
}
1456
1457 void
sys_override_io_throttle(boolean_t enable_override)1458 sys_override_io_throttle(boolean_t enable_override)
1459 {
1460 if (enable_override) {
1461 lowpri_throttle_enabled = 0;
1462 } else {
1463 lowpri_throttle_enabled = 1;
1464 }
1465 }
1466
/* count of threads woken early because a rethrottle changed their tier */
int rethrottle_wakeups = 0;

/*
 * the uu_rethrottle_lock is used to synchronize this function
 * with "throttle_lowpri_io" which is where a throttled thread
 * will block... that function will grab this lock before beginning
 * its decision making process concerning the need to block, and
 * hold it through the assert_wait. When that thread is awakened
 * for any reason (timer or rethrottle), it will reacquire the
 * uu_rethrottle_lock before determining if it really is ok for
 * it to now run. This is the point at which the thread could
 * enter a different throttling queue and reblock or return from
 * the throttle w/o having waited out its entire throttle if
 * the rethrottle has now moved it out of any currently
 * active throttle window.
 *
 *
 * NOTES:
 * 1 - This may be called with the task lock held.
 * 2 - This may be called with preemption and interrupts disabled
 *     in the kqueue wakeup path so we can't take the throttle_lock which is a mutex
 * 3 - This cannot safely dereference uu_throttle_info, as it may
 *     get deallocated out from under us
 */

void
rethrottle_thread(uthread_t ut)
{
	/*
	 * If uthread doesn't have throttle state, then there's no chance
	 * of it needing a rethrottle.
	 */
	if (ut->uu_throttle_info == NULL) {
		return;
	}

	/* spinlock + interrupts off: safe from the kqueue wakeup path (see NOTES) */
	boolean_t s = ml_set_interrupts_enabled(FALSE);
	lck_spin_lock(&ut->uu_rethrottle_lock);

	if (!ut->uu_is_throttled) {
		/* not blocked yet: leave a marker for throttle_lowpri_io to notice */
		ut->uu_was_rethrottled = true;
	} else {
		int my_new_level = throttle_get_thread_throttle_level(ut);

		if (my_new_level != ut->uu_on_throttlelist) {
			/*
			 * ut is currently blocked (as indicated by
			 * ut->uu_is_throttled == true)
			 * and we're changing its throttle level, so
			 * we need to wake it up.
			 */
			ut->uu_is_throttled = false;
			wakeup(&ut->uu_on_throttlelist);

			rethrottle_wakeups++;
			KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_FSRW, 102)),
			    uthread_tid(ut), ut->uu_on_throttlelist, my_new_level, 0, 0);
		}
	}
	lck_spin_unlock(&ut->uu_rethrottle_lock);
	ml_set_interrupts_enabled(s);
}
1529
1530
1531 /*
1532 * KPI routine
1533 *
1534 * Create and take a reference on a throttle info structure and return a
1535 * pointer for the file system to use when calling throttle_info_update.
1536 * Calling file system must have a matching release for every create.
1537 */
1538 void *
throttle_info_create(void)1539 throttle_info_create(void)
1540 {
1541 struct _throttle_io_info_t *info;
1542 int level;
1543
1544 info = kalloc_type(struct _throttle_io_info_t,
1545 Z_ZERO | Z_WAITOK | Z_NOFAIL);
1546 /* Mark that this one was allocated and needs to be freed */
1547 DEBUG_ALLOC_THROTTLE_INFO("Creating info = %p\n", info, info );
1548 info->throttle_alloc = TRUE;
1549
1550 lck_mtx_init(&info->throttle_lock, &throttle_lock_grp, LCK_ATTR_NULL);
1551 info->throttle_timer_call = thread_call_allocate((thread_call_func_t)throttle_timer, (thread_call_param_t)info);
1552
1553 for (level = 0; level <= THROTTLE_LEVEL_END; level++) {
1554 TAILQ_INIT(&info->throttle_uthlist[level]);
1555 }
1556 info->throttle_next_wake_level = THROTTLE_LEVEL_END;
1557
1558 /* Take a reference */
1559 OSIncrementAtomic(&info->throttle_refcnt);
1560 return info;
1561 }
1562
1563 /*
1564 * KPI routine
1565 *
1566 * Release the throttle info pointer if all the reference are gone. Should be
1567 * called to release reference taken by throttle_info_create
1568 */
1569 void
throttle_info_release(void * throttle_info)1570 throttle_info_release(void *throttle_info)
1571 {
1572 DEBUG_ALLOC_THROTTLE_INFO("Releaseing info = %p\n",
1573 (struct _throttle_io_info_t *)throttle_info,
1574 (struct _throttle_io_info_t *)throttle_info);
1575 if (throttle_info) { /* Just to be careful */
1576 throttle_info_rel(throttle_info);
1577 }
1578 }
1579
1580 /*
1581 * KPI routine
1582 *
1583 * File Systems that create an info structure, need to call this routine in
1584 * their mount routine (used by cluster code). File Systems that call this in
1585 * their mount routines must call throttle_info_mount_rel in their unmount
1586 * routines.
1587 */
1588 void
throttle_info_mount_ref(mount_t mp,void * throttle_info)1589 throttle_info_mount_ref(mount_t mp, void *throttle_info)
1590 {
1591 if ((throttle_info == NULL) || (mp == NULL)) {
1592 return;
1593 }
1594 throttle_info_ref(throttle_info);
1595
1596 /*
1597 * We already have a reference release it before adding the new one
1598 */
1599 if (mp->mnt_throttle_info) {
1600 throttle_info_rel(mp->mnt_throttle_info);
1601 }
1602 mp->mnt_throttle_info = throttle_info;
1603 }
1604
1605 /*
1606 * Private KPI routine
1607 *
1608 * return a handle for accessing throttle_info given a throttle_mask. The
1609 * handle must be released by throttle_info_rel_by_mask
1610 */
1611 int
throttle_info_ref_by_mask(uint64_t throttle_mask,throttle_info_handle_t * throttle_info_handle)1612 throttle_info_ref_by_mask(uint64_t throttle_mask, throttle_info_handle_t *throttle_info_handle)
1613 {
1614 int dev_index;
1615 struct _throttle_io_info_t *info;
1616
1617 /*
1618 * The 'throttle_mask' is not expected to be 0 otherwise num_trailing_0()
1619 * would return value of 64 and this will cause '_throttle_io_info' to
1620 * go out of bounds as '_throttle_io_info' is only LOWPRI_MAX_NUM_DEV (64)
1621 * elements long.
1622 */
1623 if (throttle_info_handle == NULL || throttle_mask == 0) {
1624 return EINVAL;
1625 }
1626
1627 dev_index = num_trailing_0(throttle_mask);
1628 info = &_throttle_io_info[dev_index];
1629 throttle_info_ref(info);
1630 *(struct _throttle_io_info_t**)throttle_info_handle = info;
1631
1632 return 0;
1633 }
1634
1635 /*
1636 * Private KPI routine
1637 *
1638 * release the handle obtained by throttle_info_ref_by_mask
1639 */
1640 void
throttle_info_rel_by_mask(throttle_info_handle_t throttle_info_handle)1641 throttle_info_rel_by_mask(throttle_info_handle_t throttle_info_handle)
1642 {
1643 /*
1644 * for now the handle is just a pointer to _throttle_io_info_t
1645 */
1646 throttle_info_rel((struct _throttle_io_info_t*)throttle_info_handle);
1647 }
1648
1649 /*
1650 * KPI routine
1651 *
1652 * File Systems that throttle_info_mount_ref, must call this routine in their
1653 * umount routine.
1654 */
1655 void
throttle_info_mount_rel(mount_t mp)1656 throttle_info_mount_rel(mount_t mp)
1657 {
1658 if (mp->mnt_throttle_info) {
1659 throttle_info_rel(mp->mnt_throttle_info);
1660 }
1661 mp->mnt_throttle_info = NULL;
1662 }
1663
1664 /*
1665 * Reset throttling periods for the given mount point
1666 *
1667 * private interface used by disk conditioner to reset
1668 * throttling periods when 'is_ssd' status changes
1669 */
1670 void
throttle_info_mount_reset_period(mount_t mp,int isssd)1671 throttle_info_mount_reset_period(mount_t mp, int isssd)
1672 {
1673 struct _throttle_io_info_t *info;
1674
1675 if (mp == NULL) {
1676 info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];
1677 } else if (mp->mnt_throttle_info == NULL) {
1678 info = &_throttle_io_info[mp->mnt_devbsdunit];
1679 } else {
1680 info = mp->mnt_throttle_info;
1681 }
1682
1683 throttle_init_throttle_period(info, isssd);
1684 }
1685
1686 void
throttle_info_get_last_io_time(mount_t mp,struct timeval * tv)1687 throttle_info_get_last_io_time(mount_t mp, struct timeval *tv)
1688 {
1689 struct _throttle_io_info_t *info;
1690
1691 if (mp == NULL) {
1692 info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];
1693 } else if (mp->mnt_throttle_info == NULL) {
1694 info = &_throttle_io_info[mp->mnt_devbsdunit];
1695 } else {
1696 info = mp->mnt_throttle_info;
1697 }
1698
1699 *tv = info->throttle_last_write_timestamp;
1700 }
1701
1702 void
update_last_io_time(mount_t mp)1703 update_last_io_time(mount_t mp)
1704 {
1705 struct _throttle_io_info_t *info;
1706
1707 if (mp == NULL) {
1708 info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];
1709 } else if (mp->mnt_throttle_info == NULL) {
1710 info = &_throttle_io_info[mp->mnt_devbsdunit];
1711 } else {
1712 info = mp->mnt_throttle_info;
1713 }
1714
1715 microuptime(&info->throttle_last_write_timestamp);
1716 if (mp != NULL) {
1717 mp->mnt_last_write_completed_timestamp = info->throttle_last_write_timestamp;
1718 }
1719 }
1720
1721 int
throttle_get_io_policy(uthread_t * ut)1722 throttle_get_io_policy(uthread_t *ut)
1723 {
1724 if (ut != NULL) {
1725 *ut = current_uthread();
1726 }
1727
1728 return proc_get_effective_thread_policy(current_thread(), TASK_POLICY_IO);
1729 }
1730
1731 int
throttle_get_passive_io_policy(uthread_t * ut)1732 throttle_get_passive_io_policy(uthread_t *ut)
1733 {
1734 if (ut != NULL) {
1735 *ut = current_uthread();
1736 }
1737
1738 return proc_get_effective_thread_policy(current_thread(), TASK_POLICY_PASSIVE_IO);
1739 }
1740
1741
/*
 * Resolve the throttle level for 'ut', or for the current uthread when
 * ut == NULL.
 */
static int
throttle_get_thread_throttle_level(uthread_t ut)
{
	/*
	 * If the caller passed NULL, have throttle_get_io_policy() fill in
	 * the current uthread via &ut; otherwise pass NULL for the
	 * out-parameter so the caller's 'ut' is left untouched.
	 */
	uthread_t *ut_p = (ut == NULL) ? &ut : NULL;
	int io_tier = throttle_get_io_policy(ut_p);

	return throttle_get_thread_throttle_level_internal(ut, io_tier);
}
1750
1751 /*
1752 * Return a throttle level given an existing I/O tier (such as returned by throttle_get_io_policy)
1753 */
1754 static int
throttle_get_thread_throttle_level_internal(uthread_t ut,int io_tier)1755 throttle_get_thread_throttle_level_internal(uthread_t ut, int io_tier)
1756 {
1757 int thread_throttle_level = io_tier;
1758 int user_idle_level;
1759
1760 assert(ut != NULL);
1761
1762 /* Bootcache misses should always be throttled */
1763 if (ut->uu_throttle_bc) {
1764 thread_throttle_level = THROTTLE_LEVEL_TIER3;
1765 }
1766
1767 /*
1768 * Issue tier3 I/O as tier2 when the user is idle
1769 * to allow maintenance tasks to make more progress.
1770 *
1771 * Assume any positive idle level is enough... for now it's
1772 * only ever 0 or 128 but this is not defined anywhere.
1773 */
1774 if (thread_throttle_level >= THROTTLE_LEVEL_TIER3) {
1775 user_idle_level = timer_get_user_idle_level();
1776 if (user_idle_level > 0) {
1777 thread_throttle_level--;
1778 }
1779 }
1780
1781 return thread_throttle_level;
1782 }
1783
1784 /*
1785 * I/O will be throttled if either of the following are true:
1786 * - Higher tiers have in-flight I/O
1787 * - The time delta since the last start/completion of a higher tier is within the throttle window interval
1788 *
1789 * In-flight I/O is bookended by throttle_info_update_internal/throttle_info_end_io_internal
1790 */
1791 static int
throttle_io_will_be_throttled_internal(void * throttle_info,int * mylevel,int * throttling_level)1792 throttle_io_will_be_throttled_internal(void * throttle_info, int * mylevel, int * throttling_level)
1793 {
1794 struct _throttle_io_info_t *info = throttle_info;
1795 struct timeval elapsed;
1796 struct timeval now;
1797 uint64_t elapsed_msecs;
1798 int thread_throttle_level;
1799 int throttle_level;
1800
1801 if ((thread_throttle_level = throttle_get_thread_throttle_level(NULL)) < THROTTLE_LEVEL_THROTTLED) {
1802 return THROTTLE_DISENGAGED;
1803 }
1804
1805 microuptime(&now);
1806
1807 for (throttle_level = THROTTLE_LEVEL_START; throttle_level < thread_throttle_level; throttle_level++) {
1808 if (info->throttle_inflight_count[throttle_level]) {
1809 break;
1810 }
1811 elapsed = now;
1812 timevalsub(&elapsed, &info->throttle_window_start_timestamp[throttle_level]);
1813 elapsed_msecs = (uint64_t)elapsed.tv_sec * (uint64_t)1000 + (elapsed.tv_usec / 1000);
1814
1815 if (elapsed_msecs < (uint64_t)throttle_windows_msecs[thread_throttle_level]) {
1816 break;
1817 }
1818 }
1819 if (throttle_level >= thread_throttle_level) {
1820 /*
1821 * we're beyond all of the throttle windows
1822 * that affect the throttle level of this thread,
1823 * so go ahead and treat as normal I/O
1824 */
1825 return THROTTLE_DISENGAGED;
1826 }
1827 if (mylevel) {
1828 *mylevel = thread_throttle_level;
1829 }
1830 if (throttling_level) {
1831 *throttling_level = throttle_level;
1832 }
1833
1834 if (info->throttle_io_count != info->throttle_io_count_begin) {
1835 /*
1836 * we've already issued at least one throttleable I/O
1837 * in the current I/O window, so avoid issuing another one
1838 */
1839 return THROTTLE_NOW;
1840 }
1841 /*
1842 * we're in the throttle window, so
1843 * cut the I/O size back
1844 */
1845 return THROTTLE_ENGAGED;
1846 }
1847
1848 /*
1849 * If we have a mount point and it has a throttle info pointer then
1850 * use it to do the check, otherwise use the device unit number to find
1851 * the correct throttle info array element.
1852 */
1853 int
throttle_io_will_be_throttled(__unused int lowpri_window_msecs,mount_t mp)1854 throttle_io_will_be_throttled(__unused int lowpri_window_msecs, mount_t mp)
1855 {
1856 struct _throttle_io_info_t *info;
1857
1858 /*
1859 * Should we just return zero if no mount point
1860 */
1861 if (mp == NULL) {
1862 info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];
1863 } else if (mp->mnt_throttle_info == NULL) {
1864 info = &_throttle_io_info[mp->mnt_devbsdunit];
1865 } else {
1866 info = mp->mnt_throttle_info;
1867 }
1868
1869 if (info->throttle_is_fusion_with_priority) {
1870 uthread_t ut = current_uthread();
1871 if (ut->uu_lowpri_window == 0) {
1872 return THROTTLE_DISENGAGED;
1873 }
1874 }
1875
1876 if (info->throttle_disabled) {
1877 return THROTTLE_DISENGAGED;
1878 } else {
1879 return throttle_io_will_be_throttled_internal(info, NULL, NULL);
1880 }
1881 }
1882
1883 /*
1884 * Routine to increment I/O throttling counters maintained in the proc
1885 */
1886
1887 static void
throttle_update_proc_stats(pid_t throttling_pid,int count)1888 throttle_update_proc_stats(pid_t throttling_pid, int count)
1889 {
1890 proc_t throttling_proc;
1891 proc_t throttled_proc = current_proc();
1892
1893 /* The throttled_proc is always the current proc; so we are not concerned with refs */
1894 OSAddAtomic64(count, &(throttled_proc->was_throttled));
1895
1896 /* The throttling pid might have exited by now */
1897 throttling_proc = proc_find(throttling_pid);
1898 if (throttling_proc != PROC_NULL) {
1899 OSAddAtomic64(count, &(throttling_proc->did_throttle));
1900 proc_rele(throttling_proc);
1901 }
1902 }
1903
/*
 * Block until woken up by the throttle timer or by a rethrottle call.
 * As long as we hold the throttle_lock while querying the throttle tier, we're
 * safe against seeing an old throttle tier after a rethrottle.
 *
 * sleep_amount is the number of throttle I/O periods to wait for at most
 * (0 means "don't sleep, just tear down the window").  Returns the number
 * of times this thread actually blocked.
 */
uint32_t
throttle_lowpri_io(int sleep_amount)
{
	uthread_t ut;
	struct _throttle_io_info_t *info;
	int throttle_type = 0;
	int mylevel = 0;
	int throttling_level = THROTTLE_LEVEL_NONE;
	int sleep_cnt = 0;
	uint32_t throttle_io_period_num = 0;
	boolean_t insert_tail = TRUE;
	boolean_t s;

	ut = current_uthread();

	/* No throttle window open on this thread: nothing to do. */
	if (ut->uu_lowpri_window == 0) {
		return 0;
	}

	info = ut->uu_throttle_info;

	if (info == NULL) {
		/* Window flagged but no info attached; just clear the state. */
		ut->uu_throttle_bc = false;
		ut->uu_lowpri_window = 0;
		return 0;
	}
	lck_mtx_lock(&info->throttle_lock);
	assert(ut->uu_on_throttlelist < THROTTLE_LEVEL_THROTTLED);

	if (sleep_amount == 0) {
		goto done;
	}

	/* A one-period sleep is only honored when boot-cache throttled. */
	if (sleep_amount == 1 && !ut->uu_throttle_bc) {
		sleep_amount = 0;
	}

	/* Snapshot the period counter so we can bound how long we sleep. */
	throttle_io_period_num = info->throttle_io_period_num;

	ut->uu_was_rethrottled = false;

	while ((throttle_type = throttle_io_will_be_throttled_internal(info, &mylevel, &throttling_level))) {
		if (throttle_type == THROTTLE_ENGAGED) {
			if (sleep_amount == 0) {
				break;
			}
			/* counter wrapped or reset: stop waiting */
			if (info->throttle_io_period_num < throttle_io_period_num) {
				break;
			}
			/* waited at least sleep_amount periods: stop */
			if ((info->throttle_io_period_num - throttle_io_period_num) >= (uint32_t)sleep_amount) {
				break;
			}
		}
		/*
		 * keep the same position in the list if "rethrottle_thread" changes our throttle level and
		 * then puts us back to the original level before we get a chance to run
		 */
		if (ut->uu_on_throttlelist >= THROTTLE_LEVEL_THROTTLED && ut->uu_on_throttlelist != mylevel) {
			/*
			 * must have been awakened via "rethrottle_thread" (the timer pulls us off the list)
			 * and we've changed our throttling level, so pull ourselves off of the appropriate list
			 * and make sure we get put on the tail of the new list since we're starting anew w/r to
			 * the throttling engine
			 */
			TAILQ_REMOVE(&info->throttle_uthlist[ut->uu_on_throttlelist], ut, uu_throttlelist);
			ut->uu_on_throttlelist = THROTTLE_LEVEL_NONE;
			insert_tail = TRUE;
		}
		if (ut->uu_on_throttlelist < THROTTLE_LEVEL_THROTTLED) {
			if (throttle_add_to_list(info, ut, mylevel, insert_tail) == THROTTLE_LEVEL_END) {
				goto done;
			}
		}
		assert(throttling_level >= THROTTLE_LEVEL_START && throttling_level <= THROTTLE_LEVEL_END);

		/* interrupts disabled + spin lock taken before checking the rethrottle flag */
		s = ml_set_interrupts_enabled(FALSE);
		lck_spin_lock(&ut->uu_rethrottle_lock);

		/*
		 * this is the critical section w/r to our interaction
		 * with "rethrottle_thread"
		 */
		if (ut->uu_was_rethrottled) {
			/* a rethrottle raced us: release and re-evaluate the tier */
			lck_spin_unlock(&ut->uu_rethrottle_lock);
			ml_set_interrupts_enabled(s);
			lck_mtx_yield(&info->throttle_lock);

			KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_FSRW, 103)),
			    uthread_tid(ut), ut->uu_on_throttlelist, 0, 0, 0);

			ut->uu_was_rethrottled = false;
			continue;
		}
		KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_THROTTLE, PROCESS_THROTTLED)) | DBG_FUNC_NONE,
		    info->throttle_last_IO_pid[throttling_level], throttling_level, proc_selfpid(), mylevel, 0);

		if (sleep_cnt == 0) {
			/* first sleep of this window: open the tracing interval */
			KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_START,
			    throttle_windows_msecs[mylevel], info->throttle_io_periods[mylevel], info->throttle_io_count, 0, 0);
			throttled_count[mylevel]++;
		}
		ut->uu_wmesg = "throttle_lowpri_io";

		assert_wait((caddr_t)&ut->uu_on_throttlelist, THREAD_UNINT);

		ut->uu_is_throttled = true;
		lck_spin_unlock(&ut->uu_rethrottle_lock);
		ml_set_interrupts_enabled(s);

		lck_mtx_unlock(&info->throttle_lock);

		thread_block(THREAD_CONTINUE_NULL);

		ut->uu_wmesg = NULL;

		ut->uu_is_throttled = false;
		ut->uu_was_rethrottled = false;

		lck_mtx_lock(&info->throttle_lock);

		sleep_cnt++;

		if (sleep_amount == 0) {
			insert_tail = FALSE;
		} else if (info->throttle_io_period_num < throttle_io_period_num ||
		    (info->throttle_io_period_num - throttle_io_period_num) >= (uint32_t)sleep_amount) {
			/* waited long enough: keep our place in line from here on */
			insert_tail = FALSE;
			sleep_amount = 0;
		}
	}
done:
	/* make sure we're off any throttle list before dropping the lock */
	if (ut->uu_on_throttlelist >= THROTTLE_LEVEL_THROTTLED) {
		TAILQ_REMOVE(&info->throttle_uthlist[ut->uu_on_throttlelist], ut, uu_throttlelist);
		ut->uu_on_throttlelist = THROTTLE_LEVEL_NONE;
	}
	lck_mtx_unlock(&info->throttle_lock);

	if (sleep_cnt) {
		KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_END,
		    throttle_windows_msecs[mylevel], info->throttle_io_periods[mylevel], info->throttle_io_count, 0, 0);
		/*
		 * We update the stats for the last pid which opened a throttle window for the throttled thread.
		 * This might not be completely accurate since the multiple throttles seen by the lower tier pid
		 * might have been caused by various higher prio pids. However, updating these stats accurately
		 * means doing a proc_find while holding the throttle lock which leads to deadlock.
		 */
		throttle_update_proc_stats(info->throttle_last_IO_pid[throttling_level], sleep_cnt);
	}

	/* tear down this thread's throttle window and drop the info reference */
	ut->uu_throttle_info = NULL;
	ut->uu_throttle_bc = false;
	ut->uu_lowpri_window = 0;

	throttle_info_rel(info);

	return sleep_cnt;
}
2066
2067 /*
2068 * returns TRUE if the throttle_lowpri_io called with the same sleep_amount would've slept
2069 * This function mimics the most of the throttle_lowpri_io checks but without actual sleeping
2070 */
2071 int
throttle_lowpri_io_will_be_throttled(int sleep_amount)2072 throttle_lowpri_io_will_be_throttled(int sleep_amount)
2073 {
2074 if (sleep_amount == 0) {
2075 return FALSE;
2076 }
2077
2078 uthread_t ut = current_uthread();
2079 if (ut->uu_lowpri_window == 0) {
2080 return FALSE;
2081 }
2082
2083 struct _throttle_io_info_t *info = ut->uu_throttle_info;
2084 if (info == NULL) {
2085 return FALSE;
2086 }
2087
2088 lck_mtx_lock(&info->throttle_lock);
2089 assert(ut->uu_on_throttlelist < THROTTLE_LEVEL_THROTTLED);
2090
2091 if (sleep_amount == 1 && !ut->uu_throttle_bc) {
2092 sleep_amount = 0;
2093 }
2094
2095 int result = FALSE;
2096
2097 int throttle_type = throttle_io_will_be_throttled_internal(info, NULL, NULL);
2098 if (throttle_type > THROTTLE_DISENGAGED) {
2099 result = TRUE;
2100 if ((throttle_type == THROTTLE_ENGAGED) && (sleep_amount == 0)) {
2101 result = FALSE;
2102 }
2103 }
2104
2105 lck_mtx_unlock(&info->throttle_lock);
2106
2107 return result;
2108 }
2109
2110
2111 /*
2112 * KPI routine
2113 *
2114 * set a kernel thread's IO policy. policy can be:
2115 * IOPOL_NORMAL, IOPOL_THROTTLE, IOPOL_PASSIVE, IOPOL_UTILITY, IOPOL_STANDARD
2116 *
2117 * explanations about these policies are in the man page of setiopolicy_np
2118 */
2119 void
throttle_set_thread_io_policy(int policy)2120 throttle_set_thread_io_policy(int policy)
2121 {
2122 proc_set_thread_policy(current_thread(), TASK_POLICY_INTERNAL, TASK_POLICY_IOPOL, policy);
2123 }
2124
2125 int
throttle_get_thread_effective_io_policy()2126 throttle_get_thread_effective_io_policy()
2127 {
2128 return proc_get_effective_thread_policy(current_thread(), TASK_POLICY_IO);
2129 }
2130
2131 int
throttle_thread_io_tier_above_metadata(void)2132 throttle_thread_io_tier_above_metadata(void)
2133 {
2134 return throttle_get_thread_effective_io_policy() < IOSCHED_METADATA_TIER;
2135 }
2136
2137 void
throttle_info_reset_window(uthread_t ut)2138 throttle_info_reset_window(uthread_t ut)
2139 {
2140 struct _throttle_io_info_t *info;
2141
2142 if (ut == NULL) {
2143 ut = current_uthread();
2144 }
2145
2146 if ((info = ut->uu_throttle_info)) {
2147 throttle_info_rel(info);
2148
2149 ut->uu_throttle_info = NULL;
2150 ut->uu_lowpri_window = 0;
2151 ut->uu_throttle_bc = false;
2152 }
2153 }
2154
2155 static
2156 void
throttle_info_set_initial_window(uthread_t ut,struct _throttle_io_info_t * info,boolean_t BC_throttle,boolean_t isssd)2157 throttle_info_set_initial_window(uthread_t ut, struct _throttle_io_info_t *info, boolean_t BC_throttle, boolean_t isssd)
2158 {
2159 if (lowpri_throttle_enabled == 0 || info->throttle_disabled) {
2160 return;
2161 }
2162
2163 if (info->throttle_io_periods == 0) {
2164 throttle_init_throttle_period(info, isssd);
2165 }
2166 if (ut->uu_throttle_info == NULL) {
2167 ut->uu_throttle_info = info;
2168 throttle_info_ref(info);
2169 DEBUG_ALLOC_THROTTLE_INFO("updating info = %p\n", info, info );
2170
2171 ut->uu_lowpri_window = 1;
2172 ut->uu_throttle_bc = BC_throttle;
2173 }
2174 }
2175
2176 /*
2177 * Update inflight IO count and throttling window
2178 * Should be called when an IO is done
2179 *
2180 * Only affects IO that was sent through spec_strategy
2181 */
2182 void
throttle_info_end_io(buf_t bp)2183 throttle_info_end_io(buf_t bp)
2184 {
2185 mount_t mp;
2186 struct bufattr *bap;
2187 struct _throttle_io_info_t *info;
2188 int io_tier;
2189
2190 bap = &bp->b_attr;
2191 if (!ISSET(bap->ba_flags, BA_STRATEGY_TRACKED_IO)) {
2192 return;
2193 }
2194 CLR(bap->ba_flags, BA_STRATEGY_TRACKED_IO);
2195
2196 mp = buf_vnode(bp)->v_mount;
2197 if (mp != NULL) {
2198 info = &_throttle_io_info[mp->mnt_devbsdunit];
2199 } else {
2200 info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];
2201 }
2202
2203 io_tier = GET_BUFATTR_IO_TIER(bap);
2204 if (ISSET(bap->ba_flags, BA_IO_TIER_UPGRADE)) {
2205 io_tier--;
2206 }
2207
2208 throttle_info_end_io_internal(info, io_tier);
2209 }
2210
/*
 * Decrement inflight count initially incremented by throttle_info_update_internal
 *
 * Also refreshes the throttle window start timestamp for the level, so the
 * window is measured from the completion of the last I/O at that tier.
 */
static
void
throttle_info_end_io_internal(struct _throttle_io_info_t *info, int throttle_level)
{
	/* THROTTLE_LEVEL_NONE means no inflight count was ever taken */
	if (throttle_level == THROTTLE_LEVEL_NONE) {
		return;
	}

	microuptime(&info->throttle_window_start_timestamp[throttle_level]);
	OSDecrementAtomic(&info->throttle_inflight_count[throttle_level]);
	assert(info->throttle_inflight_count[throttle_level] >= 0);
}
2226
/*
 * If inflight is TRUE and bap is NULL then the caller is responsible for calling
 * throttle_info_end_io_internal to avoid leaking in-flight I/O.
 *
 * Core throttling bookkeeping done for each I/O: records which pid last
 * issued I/O at the thread's tier, either takes an inflight count (to be
 * dropped by throttle_info_end_io_internal) or refreshes the window start
 * timestamp, and opens a throttle window on the issuing thread if it is
 * at a throttled tier.  Returns the effective throttle level used.
 */
static
int
throttle_info_update_internal(struct _throttle_io_info_t *info, uthread_t ut, int flags, boolean_t isssd, boolean_t inflight, struct bufattr *bap)
{
	int thread_throttle_level;

	if (lowpri_throttle_enabled == 0 || info->throttle_disabled) {
		return THROTTLE_LEVEL_NONE;
	}

	if (ut == NULL) {
		ut = current_uthread();
	}

	/* for tracked (inflight) buffers, trust the tier stamped on the buffer */
	if (bap && inflight && !ut->uu_throttle_bc) {
		thread_throttle_level = GET_BUFATTR_IO_TIER(bap);
		if (ISSET(bap->ba_flags, BA_IO_TIER_UPGRADE)) {
			thread_throttle_level--;
		}
	} else {
		thread_throttle_level = throttle_get_thread_throttle_level(ut);
	}

	if (thread_throttle_level != THROTTLE_LEVEL_NONE) {
		if (!ISSET(flags, B_PASSIVE)) {
			/* passive I/O doesn't open throttle windows for lower tiers */
			info->throttle_last_IO_pid[thread_throttle_level] = proc_selfpid();
			if (inflight && !ut->uu_throttle_bc) {
				/* count stays up until throttle_info_end_io_internal */
				if (NULL != bap) {
					SET(bap->ba_flags, BA_STRATEGY_TRACKED_IO);
				}
				OSIncrementAtomic(&info->throttle_inflight_count[thread_throttle_level]);
			} else {
				microuptime(&info->throttle_window_start_timestamp[thread_throttle_level]);
			}
			KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_THROTTLE, OPEN_THROTTLE_WINDOW)) | DBG_FUNC_NONE,
			    proc_getpid(current_proc()), thread_throttle_level, 0, 0, 0);
		}
		microuptime(&info->throttle_last_IO_timestamp[thread_throttle_level]);
	}


	if (thread_throttle_level >= THROTTLE_LEVEL_THROTTLED) {
		/*
		 * I'd really like to do the IOSleep here, but
		 * we may be holding all kinds of filesystem related locks
		 * and the pages for this I/O marked 'busy'...
		 * we don't want to cause a normal task to block on
		 * one of these locks while we're throttling a task marked
		 * for low priority I/O... we'll mark the uthread and
		 * do the delay just before we return from the system
		 * call that triggered this I/O or from vnode_pagein
		 */
		OSAddAtomic(1, &info->throttle_io_count);

		throttle_info_set_initial_window(ut, info, FALSE, isssd);
	}

	return thread_throttle_level;
}
2290
2291 void *
throttle_info_update_by_mount(mount_t mp)2292 throttle_info_update_by_mount(mount_t mp)
2293 {
2294 struct _throttle_io_info_t *info;
2295 uthread_t ut;
2296 boolean_t isssd = FALSE;
2297
2298 ut = current_uthread();
2299
2300 if (mp != NULL) {
2301 if (disk_conditioner_mount_is_ssd(mp)) {
2302 isssd = TRUE;
2303 }
2304 info = &_throttle_io_info[mp->mnt_devbsdunit];
2305 } else {
2306 info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];
2307 }
2308
2309 if (!ut->uu_lowpri_window) {
2310 throttle_info_set_initial_window(ut, info, FALSE, isssd);
2311 }
2312
2313 return info;
2314 }
2315
2316
2317 /*
2318 * KPI routine
2319 *
2320 * this is usually called before every I/O, used for throttled I/O
2321 * book keeping. This routine has low overhead and does not sleep
2322 */
2323 void
throttle_info_update(void * throttle_info,int flags)2324 throttle_info_update(void *throttle_info, int flags)
2325 {
2326 if (throttle_info) {
2327 throttle_info_update_internal(throttle_info, NULL, flags, FALSE, FALSE, NULL);
2328 }
2329 }
2330
/*
 * KPI routine
 *
 * this is usually called before every I/O, used for throttled I/O
 * book keeping. This routine has low overhead and does not sleep
 */
void
throttle_info_update_by_mask(void *throttle_info_handle, int flags)
{
	/*
	 * Only the lowest bit of the throttle mask is currently used, so the
	 * handle is identical to the throttle info itself.  If the handle
	 * ever grows into a set of infos, walk the set and call
	 * throttle_info_update on each member.
	 */
	throttle_info_update(throttle_info_handle, flags);
}
2350 /*
2351 * KPI routine
2352 *
2353 * This routine marks the throttle info as disabled. Used for mount points which
2354 * support I/O scheduling.
2355 */
2356
2357 void
throttle_info_disable_throttle(int devno,boolean_t isfusion)2358 throttle_info_disable_throttle(int devno, boolean_t isfusion)
2359 {
2360 struct _throttle_io_info_t *info;
2361
2362 if (devno < 0 || devno >= LOWPRI_MAX_NUM_DEV) {
2363 panic("Illegal devno (%d) passed into throttle_info_disable_throttle()", devno);
2364 }
2365
2366 info = &_throttle_io_info[devno];
2367 // don't disable software throttling on devices that are part of a fusion device
2368 // and override the software throttle periods to use HDD periods
2369 if (isfusion) {
2370 info->throttle_is_fusion_with_priority = isfusion;
2371 throttle_init_throttle_period(info, FALSE);
2372 }
2373 info->throttle_disabled = !info->throttle_is_fusion_with_priority;
2374 return;
2375 }
2376
2377
/*
 * KPI routine (private)
 * Called to determine if this IO is being throttled to this level so that it can be treated specially
 *
 * Maps the caller's I/O policy to a throttle tier, then scans every tier
 * above it: if any such tier has inflight I/O, or issued I/O recently
 * enough that its throttle window is still open, the I/O is throttled.
 */
int
throttle_info_io_will_be_throttled(void * throttle_info, int policy)
{
	struct _throttle_io_info_t *info = throttle_info;
	struct timeval elapsed;
	uint64_t elapsed_msecs;
	int throttle_level;
	int thread_throttle_level;

	switch (policy) {
	case IOPOL_THROTTLE:
		thread_throttle_level = THROTTLE_LEVEL_TIER3;
		break;
	case IOPOL_UTILITY:
		thread_throttle_level = THROTTLE_LEVEL_TIER2;
		break;
	case IOPOL_STANDARD:
		thread_throttle_level = THROTTLE_LEVEL_TIER1;
		break;
	default:
		thread_throttle_level = THROTTLE_LEVEL_TIER0;
		break;
	}
	for (throttle_level = THROTTLE_LEVEL_START; throttle_level < thread_throttle_level; throttle_level++) {
		/* a higher-priority tier with I/O in flight throttles us immediately */
		if (info->throttle_inflight_count[throttle_level]) {
			break;
		}

		microuptime(&elapsed);
		timevalsub(&elapsed, &info->throttle_window_start_timestamp[throttle_level]);
		elapsed_msecs = (uint64_t)elapsed.tv_sec * (uint64_t)1000 + (elapsed.tv_usec / 1000);

		/*
		 * NOTE: the window length is indexed by the *caller's* tier
		 * (thread_throttle_level), not by the tier being scanned.
		 */
		if (elapsed_msecs < (uint64_t)throttle_windows_msecs[thread_throttle_level]) {
			break;
		}
	}
	if (throttle_level >= thread_throttle_level) {
		/*
		 * we're beyond all of the throttle windows
		 * so go ahead and treat as normal I/O
		 */
		return THROTTLE_DISENGAGED;
	}
	/*
	 * we're in the throttle window
	 */
	return THROTTLE_ENGAGED;
}
2430
2431 int
throttle_lowpri_window(void)2432 throttle_lowpri_window(void)
2433 {
2434 return current_uthread()->uu_lowpri_window;
2435 }
2436
2437
2438 #if CONFIG_IOSCHED
2439 int upl_get_cached_tier(void *);
2440 #endif
2441
2442 #if CONFIG_PHYS_WRITE_ACCT
2443 extern thread_t pm_sync_thread;
2444 #endif /* CONFIG_PHYS_WRITE_ACCT */
2445
/*
 * Strategy entry point for special (device) files.
 *
 * Classifies and tags the buffer (I/O tier, passive, metadata, tier
 * upgrade), records trace/stat information, performs throttle
 * bookkeeping against the mount's throttle info, then calls the block
 * device's d_strategy routine.  The strategy return value may carry
 * BootCache hints which adjust the thread's throttle window.
 */
int
spec_strategy(struct vnop_strategy_args *ap)
{
	buf_t bp;
	int bflags;
	int io_tier;
	int passive;
	dev_t bdev;
	uthread_t ut;
	mount_t mp;
	struct bufattr *bap;
	int strategy_ret;
	struct _throttle_io_info_t *throttle_info;
	boolean_t isssd = FALSE;
	boolean_t inflight = FALSE;
	boolean_t upgrade = FALSE;
	int code = 0;

#if CONFIG_DELAY_IDLE_SLEEP
	proc_t curproc = current_proc();
#endif /* CONFIG_DELAY_IDLE_SLEEP */

	bp = ap->a_bp;
	bdev = buf_device(bp);
	mp = buf_vnode(bp)->v_mount;
	bap = &bp->b_attr;

#if CONFIG_PHYS_WRITE_ACCT
	/* account writes issued by the power-management sync thread */
	if (current_thread() == pm_sync_thread) {
		OSAddAtomic64(buf_count(bp), (SInt64 *)&(kernel_pm_writes));
	}
#endif /* CONFIG_PHYS_WRITE_ACCT */

#if CONFIG_IOSCHED
	if (bp->b_flags & B_CLUSTER) {
		/* cluster I/O: prefer the tier cached on the UPL, if any */
		io_tier = upl_get_cached_tier(bp->b_upl);

		if (io_tier == -1) {
			io_tier = throttle_get_io_policy(&ut);
		}
#if DEVELOPMENT || DEBUG
		else {
			int my_io_tier = throttle_get_io_policy(&ut);

			if (io_tier != my_io_tier) {
				KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_THROTTLE, IO_TIER_UPL_MISMATCH)) | DBG_FUNC_NONE, buf_kernel_addrperm_addr(bp), my_io_tier, io_tier, 0, 0);
			}
		}
#endif
	} else {
		io_tier = throttle_get_io_policy(&ut);
	}
#else
	io_tier = throttle_get_io_policy(&ut);
#endif
	passive = throttle_get_passive_io_policy(&ut);

	/*
	 * Mark if the I/O was upgraded by throttle_get_thread_throttle_level
	 * while preserving the original issued tier (throttle_get_io_policy
	 * does not return upgraded tiers)
	 */
	if (mp && io_tier > throttle_get_thread_throttle_level_internal(ut, io_tier)) {
#if CONFIG_IOSCHED
		if (!(mp->mnt_ioflags & MNT_IOFLAGS_IOSCHED_SUPPORTED)) {
			upgrade = TRUE;
		}
#else /* CONFIG_IOSCHED */
		upgrade = TRUE;
#endif /* CONFIG_IOSCHED */
	}

	if (bp->b_flags & B_META) {
		bap->ba_flags |= BA_META;
	}

#if CONFIG_IOSCHED
	/*
	 * For metadata reads, ceil the I/O tier to IOSCHED_METADATA_EXPEDITED_TIER if they are expedited, otherwise
	 * ceil it to IOSCHED_METADATA_TIER. Mark them passive if the I/O tier was upgraded.
	 * For metadata writes, set the I/O tier to IOSCHED_METADATA_EXPEDITED_TIER if they are expedited. Otherwise
	 * set it to IOSCHED_METADATA_TIER. In addition, mark them as passive.
	 */
	if (bap->ba_flags & BA_META) {
		if ((mp && (mp->mnt_ioflags & MNT_IOFLAGS_IOSCHED_SUPPORTED)) || (bap->ba_flags & BA_IO_SCHEDULED)) {
			if (bp->b_flags & B_READ) {
				if ((bap->ba_flags & BA_EXPEDITED_META_IO) && (io_tier > IOSCHED_METADATA_EXPEDITED_TIER)) {
					io_tier = IOSCHED_METADATA_EXPEDITED_TIER;
					passive = 1;
				} else if (io_tier > IOSCHED_METADATA_TIER) {
					io_tier = IOSCHED_METADATA_TIER;
					passive = 1;
				}
			} else {
				if (bap->ba_flags & BA_EXPEDITED_META_IO) {
					io_tier = IOSCHED_METADATA_EXPEDITED_TIER;
				} else {
					io_tier = IOSCHED_METADATA_TIER;
				}
				passive = 1;
			}
		}
	}
#endif /* CONFIG_IOSCHED */

	SET_BUFATTR_IO_TIER(bap, io_tier);

	if (passive) {
		bp->b_flags |= B_PASSIVE;
		bap->ba_flags |= BA_PASSIVE;
	}

#if CONFIG_DELAY_IDLE_SLEEP
	if ((curproc != NULL) && ((curproc->p_flag & P_DELAYIDLESLEEP) == P_DELAYIDLESLEEP)) {
		bap->ba_flags |= BA_DELAYIDLESLEEP;
	}
#endif /* CONFIG_DELAY_IDLE_SLEEP */

	bflags = bp->b_flags;

	/* synchronous writes get the quick-complete hint */
	if (((bflags & B_READ) == 0) && ((bflags & B_ASYNC) == 0)) {
		bufattr_markquickcomplete(bap);
	}

	/* build the DKIO trace code from the buffer's attributes */
	if (bflags & B_READ) {
		code |= DKIO_READ;
	}
	if (bflags & B_ASYNC) {
		code |= DKIO_ASYNC;
	}

	if (bap->ba_flags & BA_META) {
		code |= DKIO_META;
	} else if (bflags & B_PAGEIO) {
		code |= DKIO_PAGING;
	}

	if (io_tier != 0) {
		code |= DKIO_THROTTLE;
	}

	code |= ((io_tier << DKIO_TIER_SHIFT) & DKIO_TIER_MASK);

	if (bflags & B_PASSIVE) {
		code |= DKIO_PASSIVE;
	}

	if (bap->ba_flags & BA_NOCACHE) {
		code |= DKIO_NOCACHE;
	}

	if (upgrade) {
		code |= DKIO_TIER_UPGRADE;
		SET(bap->ba_flags, BA_IO_TIER_UPGRADE);
	}

	if (kdebug_enable) {
		KERNEL_DEBUG_CONSTANT_IST(KDEBUG_COMMON, FSDBG_CODE(DBG_DKRW, code) | DBG_FUNC_NONE,
		    buf_kernel_addrperm_addr(bp), bdev, buf_blkno(bp), buf_count(bp), 0);
	}

#if CONFIG_IO_COMPRESSION_STATS
	// Do not run IO Compression Stats when a privilege thread is active
	if (!is_vm_privileged() && !is_external_pageout_thread()) {
		io_compression_stats(bp);
	}
#endif /* CONFIG_IO_COMPRESSION_STATS */
	thread_update_io_stats(current_thread(), buf_count(bp), code);

	if (mp != NULL) {
		if (disk_conditioner_mount_is_ssd(mp)) {
			isssd = TRUE;
		}
		/*
		 * Partially initialized mounts don't have a final devbsdunit and should not be tracked.
		 * Verify that devbsdunit is initialized (non-zero) or that 0 is the correct initialized value
		 * (mnt_throttle_mask is initialized and num_trailing_0 would be 0)
		 */
		if (mp->mnt_devbsdunit || (mp->mnt_throttle_mask != LOWPRI_MAX_NUM_DEV - 1 && mp->mnt_throttle_mask & 0x1)) {
			inflight = TRUE;
		}
		throttle_info = &_throttle_io_info[mp->mnt_devbsdunit];
	} else {
		throttle_info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];
	}

	throttle_info_update_internal(throttle_info, ut, bflags, isssd, inflight, bap);

	if ((bflags & B_READ) == 0) {
		microuptime(&throttle_info->throttle_last_write_timestamp);

		if (mp) {
			mp->mnt_last_write_issued_timestamp = throttle_info->throttle_last_write_timestamp;
			INCR_PENDING_IO(buf_count(bp), mp->mnt_pending_write_size);
		}
	} else if (mp) {
		INCR_PENDING_IO(buf_count(bp), mp->mnt_pending_read_size);
	}
	/*
	 * The BootCache may give us special information about
	 * the IO, so it returns special values that we check
	 * for here.
	 *
	 * IO_SATISFIED_BY_CACHE
	 * The read has been satisfied by the boot cache. Don't
	 * throttle the thread unnecessarily.
	 *
	 * IO_SHOULD_BE_THROTTLED
	 * The boot cache is playing back a playlist and this IO
	 * cut through. Throttle it so we're not cutting through
	 * the boot cache too often.
	 *
	 * Note that typical strategy routines are defined with
	 * a void return so we'll get garbage here. In the
	 * unlikely case the garbage matches our special return
	 * value, it's not a big deal since we're only adjusting
	 * the throttling delay.
	 */
#define IO_SATISFIED_BY_CACHE  ((int)0xcafefeed)
#define IO_SHOULD_BE_THROTTLED ((int)0xcafebeef)
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wcast-function-type"

	typedef int strategy_fcn_ret_t(struct buf *bp);

	strategy_ret = (*(strategy_fcn_ret_t*)bdevsw[major(bdev)].d_strategy)(bp);

#pragma clang diagnostic pop

	// disk conditioner needs to track when this I/O actually starts
	// which means track it after `strategy` which may include delays
	// from inflight I/Os
	microuptime(&bp->b_timestamp_tv);

	if (IO_SATISFIED_BY_CACHE == strategy_ret) {
		/*
		 * If this was a throttled IO satisfied by the boot cache,
		 * don't delay the thread.
		 */
		throttle_info_reset_window(ut);
	} else if (IO_SHOULD_BE_THROTTLED == strategy_ret) {
		/*
		 * If the boot cache indicates this IO should be throttled,
		 * delay the thread.
		 */
		throttle_info_set_initial_window(ut, throttle_info, TRUE, isssd);
	}
	return 0;
}
2695
2696
/*
 * Block mapping is not meaningful for special files; always fails
 * with ENOTSUP.
 */
int
spec_blockmap(__unused struct vnop_blockmap_args *ap)
{
	return ENOTSUP;
}
2705
2706
/*
 * Device close routine
 *
 * For character devices, handles the controlling-terminal special case
 * (dropping the session's tty reference on last close) and calls the
 * cdevsw close on last reference or revoke.  For block devices, flushes
 * and invalidates buffers before calling the bdevsw close on last close.
 */
int
spec_close(struct vnop_close_args *ap)
{
	struct vnode *vp = ap->a_vp;
	dev_t dev = vp->v_rdev;
	int error = 0;
	int flags = ap->a_fflag;
	struct proc *p = vfs_context_proc(ap->a_context);
	struct session *sessp;
	struct pgrp *pg;

	switch (vp->v_type) {
	case VCHR:
		/*
		 * Hack: a tty device that is a controlling terminal
		 * has a reference from the session structure.
		 * We cannot easily tell that a character device is
		 * a controlling terminal, unless it is the closing
		 * process' controlling terminal. In that case,
		 * if the reference count is 1 (this is the very
		 * last close)
		 */
		pg = proc_pgrp(p, &sessp);
		devsw_lock(dev, S_IFCHR);
		if (sessp != SESSION_NULL) {
			if (vp == sessp->s_ttyvp && vcount(vp) == 1) {
				struct tty *tp = TTY_NULL;

				/* drop the devsw lock before taking the session lock */
				devsw_unlock(dev, S_IFCHR);
				session_lock(sessp);
				if (vp == sessp->s_ttyvp) {
					/* re-checked under the session lock */
					tp = session_clear_tty_locked(sessp);
				}
				session_unlock(sessp);

				if (tp != TTY_NULL) {
					ttyfree(tp);
				}
				devsw_lock(dev, S_IFCHR);
			}
		}
		pgrp_rele(pg);

		if (--vp->v_specinfo->si_opencount < 0) {
			panic("negative open count (c, %u, %u)", major(dev), minor(dev));
		}

		/*
		 * close on last reference or on vnode revoke call
		 */
		if (vcount(vp) == 0 || (flags & IO_REVOKE) != 0) {
			error = cdevsw[major(dev)].d_close(dev, flags, S_IFCHR, p);
		}

		devsw_unlock(dev, S_IFCHR);
		break;

	case VBLK:
		/*
		 * If there is more than one outstanding open, don't
		 * send the close to the device.
		 */
		devsw_lock(dev, S_IFBLK);
		if (vcount(vp) > 1) {
			vp->v_specinfo->si_opencount--;
			devsw_unlock(dev, S_IFBLK);
			return 0;
		}
		devsw_unlock(dev, S_IFBLK);

		/*
		 * On last close of a block device (that isn't mounted)
		 * we must invalidate any in core blocks, so that
		 * we can, for instance, change floppy disks.
		 */
		if ((error = spec_fsync_internal(vp, MNT_WAIT, ap->a_context))) {
			return error;
		}

		error = buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0);
		if (error) {
			return error;
		}

		devsw_lock(dev, S_IFBLK);

		if (--vp->v_specinfo->si_opencount < 0) {
			panic("negative open count (b, %u, %u)", major(dev), minor(dev));
		}

		/* vcount may have changed while the lock was dropped; re-check */
		if (vcount(vp) == 0) {
			error = bdevsw[major(dev)].d_close(dev, flags, S_IFBLK, p);
		}

		devsw_unlock(dev, S_IFBLK);
		break;

	default:
		panic("spec_close: not special");
		return EBADF;
	}

	return error;
}
2814
2815 /*
2816 * Return POSIX pathconf information applicable to special devices.
2817 */
2818 int
spec_pathconf(struct vnop_pathconf_args * ap)2819 spec_pathconf(struct vnop_pathconf_args *ap)
2820 {
2821 switch (ap->a_name) {
2822 case _PC_LINK_MAX:
2823 *ap->a_retval = LINK_MAX;
2824 return 0;
2825 case _PC_MAX_CANON:
2826 *ap->a_retval = MAX_CANON;
2827 return 0;
2828 case _PC_MAX_INPUT:
2829 *ap->a_retval = MAX_INPUT;
2830 return 0;
2831 case _PC_PIPE_BUF:
2832 *ap->a_retval = PIPE_BUF;
2833 return 0;
2834 case _PC_CHOWN_RESTRICTED:
2835 *ap->a_retval = 200112; /* _POSIX_CHOWN_RESTRICTED */
2836 return 0;
2837 case _PC_VDISABLE:
2838 *ap->a_retval = _POSIX_VDISABLE;
2839 return 0;
2840 default:
2841 return EINVAL;
2842 }
2843 /* NOTREACHED */
2844 }
2845
/*
 * Special device failed operation
 *
 * Stub vnode operation installed where an operation is invalid for
 * special files; unconditionally returns EBADF.
 */
int
spec_ebadf(__unused void *dummy)
{
	return EBADF;
}
2854
2855 /* Blktooff derives file offset from logical block number */
2856 int
spec_blktooff(struct vnop_blktooff_args * ap)2857 spec_blktooff(struct vnop_blktooff_args *ap)
2858 {
2859 struct vnode *vp = ap->a_vp;
2860
2861 switch (vp->v_type) {
2862 case VCHR:
2863 *ap->a_offset = (off_t)-1; /* failure */
2864 return ENOTSUP;
2865
2866 case VBLK:
2867 printf("spec_blktooff: not implemented for VBLK\n");
2868 *ap->a_offset = (off_t)-1; /* failure */
2869 return ENOTSUP;
2870
2871 default:
2872 panic("spec_blktooff type");
2873 }
2874 /* NOTREACHED */
2875
2876 return 0;
2877 }
2878
2879 /* Offtoblk derives logical block number from file offset */
2880 int
spec_offtoblk(struct vnop_offtoblk_args * ap)2881 spec_offtoblk(struct vnop_offtoblk_args *ap)
2882 {
2883 struct vnode *vp = ap->a_vp;
2884
2885 switch (vp->v_type) {
2886 case VCHR:
2887 *ap->a_lblkno = (daddr64_t)-1; /* failure */
2888 return ENOTSUP;
2889
2890 case VBLK:
2891 printf("spec_offtoblk: not implemented for VBLK\n");
2892 *ap->a_lblkno = (daddr64_t)-1; /* failure */
2893 return ENOTSUP;
2894
2895 default:
2896 panic("spec_offtoblk type");
2897 }
2898 /* NOTREACHED */
2899
2900 return 0;
2901 }
2902
2903 static int filt_specattach(struct knote *kn, struct kevent_qos_s *kev);
2904 static void filt_specdetach(struct knote *kn);
2905 static int filt_specevent(struct knote *kn, long hint);
2906 static int filt_spectouch(struct knote *kn, struct kevent_qos_s *kev);
2907 static int filt_specprocess(struct knote *kn, struct kevent_qos_s *kev);
2908
/* Filter ops for knotes attached to special (device) vnodes. */
SECURITY_READ_ONLY_EARLY(struct filterops) spec_filtops = {
	.f_isfd = 1,            /* operates on file descriptors */
	.f_attach = filt_specattach,
	.f_detach = filt_specdetach,
	.f_event = filt_specevent,
	.f_touch = filt_spectouch,
	.f_process = filt_specprocess,
};
2917
/* Mark the knote EOF + ONESHOT so it fires once more and then goes away. */
static void
filt_spec_make_eof(struct knote *kn)
{
	/*
	 * The spec filter might touch kn_flags from f_event
	 * without holding "the primitive lock", so make it atomic.
	 */
	os_atomic_or(&kn->kn_flags, EV_EOF | EV_ONESHOT, relaxed);
}
2927
/*
 * Common path for attach/touch/process on spec knotes.
 *
 * Calls the device's select vector (VNOP_SELECT) with the thread's
 * select-set temporarily replaced by a record marker, so a driver
 * selrecord() lands in the supplied block and attaches the knote to the
 * driver's selinfo.  The select return value is interpreted as the
 * amount of available data and compared against the knote's low
 * watermark to decide FILTER_ACTIVE.
 */
static int
filt_spec_common(struct knote *kn, struct kevent_qos_s *kev, bool attach)
{
	uthread_t uth = current_uthread();
	vfs_context_t ctx = vfs_context_current();
	vnode_t vp = (vnode_t)fp_get_data(kn->kn_fp);
	__block bool selrecorded = false;
	struct select_set *old_wqs;
	int64_t data = 0;
	int ret, selret;

	/* already at EOF (e.g. revoked): just deliver */
	if (kn->kn_flags & EV_EOF) {
		ret = FILTER_ACTIVE;
		goto out;
	}

	/* non-attach paths must re-validate the vnode before use */
	if (!attach && vnode_getwithvid(vp, vnode_vid(vp)) != 0) {
		filt_spec_make_eof(kn);
		ret = FILTER_ACTIVE;
		goto out;
	}

	/* invoked by the driver's selrecord() during VNOP_SELECT below */
	selspec_record_hook_t cb = ^(struct selinfo *si) {
		selspec_attach(kn, si);
		selrecorded = true;
	};

	/* swap in the record marker for the duration of the select call */
	old_wqs = uth->uu_selset;
	uth->uu_selset = SELSPEC_RECORD_MARKER;
	selret = VNOP_SELECT(vp, knote_get_seltype(kn), 0, cb, ctx);
	uth->uu_selset = old_wqs;

	if (!attach) {
		vnode_put(vp);
	}

	if (!selrecorded && selret == 0) {
		/*
		 * The device indicated that there's no data to read,
		 * but didn't call `selrecord`.
		 *
		 * Nothing will be notified of changes to this vnode,
		 * so return an error back to user space on attach,
		 * or pretend the knote disappeared for other cases,
		 * to make it clear that the knote is not attached.
		 */
		if (attach) {
			knote_set_error(kn, ENODEV);
			return 0;
		}

		filt_spec_make_eof(kn);
		ret = FILTER_ACTIVE;
		goto out;
	}

	if (kn->kn_vnode_use_ofst) {
		/* report data available past the descriptor's current offset */
		if (kn->kn_fp->fp_glob->fg_offset >= (uint32_t)selret) {
			data = 0;
		} else {
			data = ((uint32_t)selret) - kn->kn_fp->fp_glob->fg_offset;
		}
	} else {
		data = selret;
	}

	if (data >= knote_low_watermark(kn)) {
		ret = FILTER_ACTIVE;
	} else {
		ret = 0;
	}
out:
	if (ret) {
		knote_fill_kevent(kn, kev, data);
	}
	return ret;
}
3005
3006 static int
filt_specattach(struct knote * kn,__unused struct kevent_qos_s * kev)3007 filt_specattach(struct knote *kn, __unused struct kevent_qos_s *kev)
3008 {
3009 vnode_t vp = (vnode_t)fp_get_data(kn->kn_fp); /* Already have iocount, and vnode is alive */
3010 dev_t dev;
3011
3012 assert(vnode_ischr(vp));
3013
3014 dev = vnode_specrdev(vp);
3015
3016 /*
3017 * For a few special kinds of devices, we can attach knotes with
3018 * no restrictions because their "select" vectors return the amount
3019 * of data available. Others require an explicit NOTE_LOWAT with
3020 * data of 1, indicating that the caller doesn't care about actual
3021 * data counts, just an indication that the device has data.
3022 */
3023 if (!kn->kn_vnode_kqok &&
3024 ((kn->kn_sfflags & NOTE_LOWAT) == 0 || kn->kn_sdata != 1)) {
3025 knote_set_error(kn, EINVAL);
3026 return 0;
3027 }
3028
3029 return filt_spec_common(kn, kev, true);
3030 }
3031
static void
filt_specdetach(struct knote *kn)
{
	/* undo the selspec_attach done from filt_spec_common's record hook */
	selspec_detach(kn);
}
3037
3038 static int
filt_specevent(struct knote * kn,long hint)3039 filt_specevent(struct knote *kn, long hint)
3040 {
3041 /* knote_post() will have cleared it for us */
3042 assert(kn->kn_hook == NULL);
3043
3044 /* called by selwakeup with the selspec_lock lock held */
3045 if (hint & NOTE_REVOKE) {
3046 filt_spec_make_eof(kn);
3047 }
3048 return FILTER_ACTIVE;
3049 }
3050
3051 static int
filt_spectouch(struct knote * kn,struct kevent_qos_s * kev)3052 filt_spectouch(struct knote *kn, struct kevent_qos_s *kev)
3053 {
3054 kn->kn_sdata = kev->data;
3055 kn->kn_sfflags = kev->fflags;
3056
3057 return filt_spec_common(kn, kev, false);
3058 }
3059
static int
filt_specprocess(struct knote *kn, struct kevent_qos_s *kev)
{
	/* f_process: same evaluation as touch/attach, non-attach path */
	return filt_spec_common(kn, kev, false);
}
3065