xref: /xnu-8792.61.2/bsd/miscfs/specfs/spec_vnops.c (revision 42e220869062b56f8d7d0726fd4c88954f87902c)
1 /*
2  * Copyright (c) 2000-2019 Apple Computer, Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29 /*
30  * Copyright (c) 1989, 1993, 1995
31  *	The Regents of the University of California.  All rights reserved.
32  *
33  * Redistribution and use in source and binary forms, with or without
34  * modification, are permitted provided that the following conditions
35  * are met:
36  * 1. Redistributions of source code must retain the above copyright
37  *    notice, this list of conditions and the following disclaimer.
38  * 2. Redistributions in binary form must reproduce the above copyright
39  *    notice, this list of conditions and the following disclaimer in the
40  *    documentation and/or other materials provided with the distribution.
41  * 3. All advertising materials mentioning features or use of this software
42  *    must display the following acknowledgement:
43  *	This product includes software developed by the University of
44  *	California, Berkeley and its contributors.
45  * 4. Neither the name of the University nor the names of its contributors
46  *    may be used to endorse or promote products derived from this software
47  *    without specific prior written permission.
48  *
49  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59  * SUCH DAMAGE.
60  *
61  *	@(#)spec_vnops.c	8.14 (Berkeley) 5/21/95
62  */
63 
64 #include <sys/param.h>
65 #include <sys/proc_internal.h>
66 #include <sys/kauth.h>
67 #include <sys/systm.h>
68 #include <sys/kernel.h>
69 #include <sys/conf.h>
70 #include <sys/buf_internal.h>
71 #include <sys/mount_internal.h>
72 #include <sys/vnode_internal.h>
73 #include <sys/file_internal.h>
74 #include <sys/namei.h>
75 #include <sys/stat.h>
76 #include <sys/errno.h>
77 #include <sys/ioctl.h>
78 #include <sys/file.h>
79 #include <sys/user.h>
80 #include <sys/malloc.h>
81 #include <sys/disk.h>
82 #include <sys/uio_internal.h>
83 #include <sys/resource.h>
84 #include <machine/machine_routines.h>
85 #include <miscfs/specfs/specdev.h>
86 #include <vfs/vfs_support.h>
87 #include <vfs/vfs_disk_conditioner.h>
88 
89 #include <kern/assert.h>
90 #include <kern/task.h>
91 #include <kern/sched_prim.h>
92 #include <kern/thread.h>
93 #include <kern/policy_internal.h>
94 #include <kern/timer_call.h>
95 #include <kern/waitq.h>
96 
97 #include <pexpert/pexpert.h>
98 
99 #include <sys/kdebug.h>
100 #include <libkern/section_keywords.h>
101 
102 #if CONFIG_IO_COMPRESSION_STATS
103 #include <vfs/vfs_io_compression_stats.h>
104 #endif /* CONFIG_IO_COMPRESSION_STATS */
105 
/* XXX following three prototypes should be in a header file somewhere */
extern dev_t    chrtoblk(dev_t dev);
extern boolean_t        iskmemdev(dev_t dev);
extern int bpfkqfilter(dev_t dev, struct knote *kn);
extern int ptsd_kqfilter(dev_t, struct knote *);
extern int ptmx_kqfilter(dev_t, struct knote *);
#if CONFIG_PHYS_WRITE_ACCT
uint64_t kernel_pm_writes;    // to track the sync writes occurring during power management transitions
#endif /* CONFIG_PHYS_WRITE_ACCT */


/* Hash chains of special vnodes (SPECHSZ buckets; presumably keyed by device number — see specdev.h) */
struct vnode *speclisth[SPECHSZ];

/* symbolic sleep message strings for devices */
char    devopn[] = "devopn";
char    devio[] = "devio";
char    devwait[] = "devwait";
char    devin[] = "devin";
char    devout[] = "devout";
char    devioc[] = "devioc";
char    devcls[] = "devcls";
127 
#define VOPFUNC int (*)(void *)

/*
 * Vnode operation vector for special files (character and block device
 * nodes).  spec_vnodeop_p is the slot the VFS layer fills in from this
 * descriptor table (presumably at VFS initialization — the registration
 * happens outside this file).  Directory-flavored operations (create,
 * mknod, rename, readdir, ...) are wired to err_* stubs since they are
 * meaningless on a device node; nop_* entries succeed without doing work.
 */
int(**spec_vnodeop_p)(void *);
const struct vnodeopv_entry_desc spec_vnodeop_entries[] = {
	{ .opve_op = &vnop_default_desc, .opve_impl = (VOPFUNC)(void (*)(void))vn_default_error },
	{ .opve_op = &vnop_lookup_desc, .opve_impl = (VOPFUNC)spec_lookup },            /* lookup */
	{ .opve_op = &vnop_create_desc, .opve_impl = (VOPFUNC)err_create },             /* create */
	{ .opve_op = &vnop_mknod_desc, .opve_impl = (VOPFUNC)err_mknod },               /* mknod */
	{ .opve_op = &vnop_open_desc, .opve_impl = (VOPFUNC)spec_open },                        /* open */
	{ .opve_op = &vnop_close_desc, .opve_impl = (VOPFUNC)spec_close },              /* close */
	{ .opve_op = &vnop_access_desc, .opve_impl = (VOPFUNC)spec_access },            /* access */
	{ .opve_op = &vnop_getattr_desc, .opve_impl = (VOPFUNC)spec_getattr },          /* getattr */
	{ .opve_op = &vnop_setattr_desc, .opve_impl = (VOPFUNC)spec_setattr },          /* setattr */
	{ .opve_op = &vnop_read_desc, .opve_impl = (VOPFUNC)spec_read },                        /* read */
	{ .opve_op = &vnop_write_desc, .opve_impl = (VOPFUNC)spec_write },              /* write */
	{ .opve_op = &vnop_ioctl_desc, .opve_impl = (VOPFUNC)spec_ioctl },              /* ioctl */
	{ .opve_op = &vnop_select_desc, .opve_impl = (VOPFUNC)spec_select },            /* select */
	{ .opve_op = &vnop_revoke_desc, .opve_impl = (VOPFUNC)nop_revoke },             /* revoke */
	{ .opve_op = &vnop_mmap_desc, .opve_impl = (VOPFUNC)err_mmap },                 /* mmap */
	{ .opve_op = &vnop_fsync_desc, .opve_impl = (VOPFUNC)spec_fsync },              /* fsync */
	{ .opve_op = &vnop_remove_desc, .opve_impl = (VOPFUNC)err_remove },             /* remove */
	{ .opve_op = &vnop_link_desc, .opve_impl = (VOPFUNC)err_link },                 /* link */
	{ .opve_op = &vnop_rename_desc, .opve_impl = (VOPFUNC)err_rename },             /* rename */
	{ .opve_op = &vnop_mkdir_desc, .opve_impl = (VOPFUNC)err_mkdir },               /* mkdir */
	{ .opve_op = &vnop_rmdir_desc, .opve_impl = (VOPFUNC)err_rmdir },               /* rmdir */
	{ .opve_op = &vnop_symlink_desc, .opve_impl = (VOPFUNC)err_symlink },           /* symlink */
	{ .opve_op = &vnop_readdir_desc, .opve_impl = (VOPFUNC)err_readdir },           /* readdir */
	{ .opve_op = &vnop_readlink_desc, .opve_impl = (VOPFUNC)err_readlink },         /* readlink */
	{ .opve_op = &vnop_inactive_desc, .opve_impl = (VOPFUNC)nop_inactive },         /* inactive */
	{ .opve_op = &vnop_reclaim_desc, .opve_impl = (VOPFUNC)nop_reclaim },           /* reclaim */
	{ .opve_op = &vnop_strategy_desc, .opve_impl = (VOPFUNC)spec_strategy },                /* strategy */
	{ .opve_op = &vnop_pathconf_desc, .opve_impl = (VOPFUNC)spec_pathconf },                /* pathconf */
	{ .opve_op = &vnop_advlock_desc, .opve_impl = (VOPFUNC)err_advlock },           /* advlock */
	{ .opve_op = &vnop_bwrite_desc, .opve_impl = (VOPFUNC)spec_bwrite },            /* bwrite */
	{ .opve_op = &vnop_pagein_desc, .opve_impl = (VOPFUNC)err_pagein },             /* Pagein */
	{ .opve_op = &vnop_pageout_desc, .opve_impl = (VOPFUNC)err_pageout },           /* Pageout */
	{ .opve_op = &vnop_copyfile_desc, .opve_impl = (VOPFUNC)err_copyfile },         /* Copyfile */
	{ .opve_op = &vnop_blktooff_desc, .opve_impl = (VOPFUNC)spec_blktooff },                /* blktooff */
	{ .opve_op = &vnop_offtoblk_desc, .opve_impl = (VOPFUNC)spec_offtoblk },                /* offtoblk */
	{ .opve_op = &vnop_blockmap_desc, .opve_impl = (VOPFUNC)spec_blockmap },                /* blockmap */
	{ .opve_op = (struct vnodeop_desc*)NULL, .opve_impl = (int (*)(void *))NULL }
};
const struct vnodeopv_desc spec_vnodeop_opv_desc =
{ .opv_desc_vector_p = &spec_vnodeop_p, .opv_desc_ops = spec_vnodeop_entries };
172 
173 
static void set_blocksize(vnode_t, dev_t);

/*
 * I/O throttling tuning constants, per throttle tier.
 * "window" is the span (msecs) over which throttled activity is observed;
 * "io period" is the pacing interval applied to throttled I/Os, with a
 * shorter schedule for SSDs.  All are runtime-tunable via the
 * debug.lowpri_throttle_* sysctls declared further below.
 */
#define LOWPRI_TIER1_WINDOW_MSECS         25
#define LOWPRI_TIER2_WINDOW_MSECS         100
#define LOWPRI_TIER3_WINDOW_MSECS         500

#define LOWPRI_TIER1_IO_PERIOD_MSECS      40
#define LOWPRI_TIER2_IO_PERIOD_MSECS      85
#define LOWPRI_TIER3_IO_PERIOD_MSECS      200

#define LOWPRI_TIER1_IO_PERIOD_SSD_MSECS  5
#define LOWPRI_TIER2_IO_PERIOD_SSD_MSECS  15
#define LOWPRI_TIER3_IO_PERIOD_SSD_MSECS  25


/* Indexed by throttle level; slot 0 (THROTTLE_LEVEL_NONE, presumably) is unthrottled. */
int     throttle_windows_msecs[THROTTLE_LEVEL_END + 1] = {
	0,
	LOWPRI_TIER1_WINDOW_MSECS,
	LOWPRI_TIER2_WINDOW_MSECS,
	LOWPRI_TIER3_WINDOW_MSECS,
};

int     throttle_io_period_msecs[THROTTLE_LEVEL_END + 1] = {
	0,
	LOWPRI_TIER1_IO_PERIOD_MSECS,
	LOWPRI_TIER2_IO_PERIOD_MSECS,
	LOWPRI_TIER3_IO_PERIOD_MSECS,
};

int     throttle_io_period_ssd_msecs[THROTTLE_LEVEL_END + 1] = {
	0,
	LOWPRI_TIER1_IO_PERIOD_SSD_MSECS,
	LOWPRI_TIER2_IO_PERIOD_SSD_MSECS,
	LOWPRI_TIER3_IO_PERIOD_SSD_MSECS,
};


/* Count of currently-throttled threads per tier (usage not visible in this chunk). */
int     throttled_count[THROTTLE_LEVEL_END + 1];

/*
 * Per-device throttling state.  One instance exists per "disk unit"
 * (see _throttle_io_info[] below, indexed by si_devbsdunit).
 */
struct _throttle_io_info_t {
	lck_mtx_t       throttle_lock;

	struct timeval  throttle_last_write_timestamp;
	struct timeval  throttle_min_timer_deadline;
	struct timeval  throttle_window_start_timestamp[THROTTLE_LEVEL_END + 1]; /* window starts at both the beginning and completion of an I/O */
	struct timeval  throttle_last_IO_timestamp[THROTTLE_LEVEL_END + 1];
	pid_t           throttle_last_IO_pid[THROTTLE_LEVEL_END + 1];
	struct timeval  throttle_start_IO_period_timestamp[THROTTLE_LEVEL_END + 1];
	int32_t throttle_inflight_count[THROTTLE_LEVEL_END + 1];

	TAILQ_HEAD(, uthread) throttle_uthlist[THROTTLE_LEVEL_END + 1];         /* Lists of throttled uthreads */
	int             throttle_next_wake_level;

	thread_call_t   throttle_timer_call;
	int32_t throttle_timer_ref;
	int32_t throttle_timer_active;

	int32_t throttle_io_count;
	int32_t throttle_io_count_begin;
	int    *throttle_io_periods;   /* points at one of the period tables above (HDD vs SSD) — confirm in throttle setup code */
	uint32_t throttle_io_period_num;

	int32_t throttle_refcnt;       /* managed atomically; see throttle_info_rel() */
	int32_t throttle_alloc;
	int32_t throttle_disabled;
	int32_t throttle_is_fusion_with_priority;
};

/* Static pool of throttle-info slots, indexed by si_devbsdunit (bounded by LOWPRI_MAX_NUM_DEV). */
struct _throttle_io_info_t _throttle_io_info[LOWPRI_MAX_NUM_DEV];


/* Master switch for the low-priority I/O throttle (debug.lowpri_throttle_enabled sysctl). */
int     lowpri_throttle_enabled = 1;


static void throttle_info_end_io_internal(struct _throttle_io_info_t *info, int throttle_level);
static int throttle_info_update_internal(struct _throttle_io_info_t *info, uthread_t ut, int flags, boolean_t isssd, boolean_t inflight, struct bufattr *bap);
static int throttle_get_thread_throttle_level(uthread_t ut);
static int throttle_get_thread_throttle_level_internal(uthread_t ut, int io_tier);
void throttle_info_mount_reset_period(mount_t mp, int isssd);
253 
254 /*
255  * Trivial lookup routine that always fails.
256  */
257 int
spec_lookup(struct vnop_lookup_args * ap)258 spec_lookup(struct vnop_lookup_args *ap)
259 {
260 	*ap->a_vpp = NULL;
261 	return ENOTDIR;
262 }
263 
264 static void
set_blocksize(struct vnode * vp,dev_t dev)265 set_blocksize(struct vnode *vp, dev_t dev)
266 {
267 	int (*size)(dev_t);
268 	int rsize;
269 
270 	if ((major(dev) < nblkdev) && (size = bdevsw[major(dev)].d_psize)) {
271 		rsize = (*size)(dev);
272 		if (rsize <= 0) { /* did size fail? */
273 			vp->v_specsize = DEV_BSIZE;
274 		} else {
275 			vp->v_specsize = rsize;
276 		}
277 	} else {
278 		vp->v_specsize = DEV_BSIZE;
279 	}
280 }
281 
282 void
set_fsblocksize(struct vnode * vp)283 set_fsblocksize(struct vnode *vp)
284 {
285 	if (vp->v_type == VBLK) {
286 		dev_t dev = (dev_t)vp->v_rdev;
287 		int maj = major(dev);
288 
289 		if ((u_int)maj >= (u_int)nblkdev) {
290 			return;
291 		}
292 
293 		vnode_lock(vp);
294 		set_blocksize(vp, dev);
295 		vnode_unlock(vp);
296 	}
297 }
298 
299 
/*
 * Open a special file.
 *
 * VCHR: enforces security policy (no writes to /dev/mem|kmem; with
 *       securelevel >= 1, no writes to a char device whose block twin is
 *       mounted), calls the character driver's d_open under the devsw
 *       lock, and for D_DISK devices lazily initializes the per-device
 *       I/O-throttling state (si_devbsdunit / si_isssd / si_throttleable).
 * VBLK: refuses opens of mounted devices, calls the block driver's
 *       d_open, then probes block size/count via ioctls and caches the
 *       device size in v_specdevsize for spec_write()'s use.
 *
 * Returns 0 on success or an errno value.
 */
int
spec_open(struct vnop_open_args *ap)
{
	struct proc *p = vfs_context_proc(ap->a_context);
	kauth_cred_t cred = vfs_context_ucred(ap->a_context);
	struct vnode *vp = ap->a_vp;
	dev_t bdev, dev = (dev_t)vp->v_rdev;
	int maj = major(dev);
	int error;

	/*
	 * Don't allow open if fs is mounted -nodev.
	 */
	if (vp->v_mount && (vp->v_mount->mnt_flag & MNT_NODEV)) {
		return ENXIO;
	}

	switch (vp->v_type) {
	case VCHR:
		if ((u_int)maj >= (u_int)nchrdev) {
			return ENXIO;
		}
		/* FSCRED (filesystem-internal credential) bypasses the write-policy checks. */
		if (cred != FSCRED && (ap->a_mode & FWRITE)) {
#if 0
			/*
			 * When running in very secure mode, do not allow
			 * opens for writing of any disk character devices.
			 */
			if (securelevel >= 2 && isdisk(dev, VCHR)) {
				return EPERM;
			}
#endif

			/* Never allow writing to /dev/mem or /dev/kmem */
			if (iskmemdev(dev)) {
				return EPERM;
			}
			/*
			 * When running in secure mode, do not allow opens for
			 * writing of character devices whose corresponding block
			 * devices are currently mounted.
			 */
			if (securelevel >= 1) {
				if ((bdev = chrtoblk(dev)) != NODEV && check_mountedon(bdev, VBLK, &error)) {
					return error;
				}
			}
		}

		/* Driver open and open-count bump are atomic w.r.t. the devsw lock. */
		devsw_lock(dev, S_IFCHR);
		error = (*cdevsw[maj].d_open)(dev, ap->a_mode, S_IFCHR, p);

		if (error == 0) {
			vp->v_specinfo->si_opencount++;
		}

		devsw_unlock(dev, S_IFCHR);

		/* First successful open of a disk char device: set up throttling info. */
		if (error == 0 && cdevsw[maj].d_type == D_DISK && !vp->v_un.vu_specinfo->si_initted) {
			int     isssd = 0;
			uint64_t throttle_mask = 0;
			uint32_t devbsdunit = 0;

			if (VNOP_IOCTL(vp, DKIOCGETTHROTTLEMASK, (caddr_t)&throttle_mask, 0, NULL) == 0) {
				if (throttle_mask != 0 &&
				    VNOP_IOCTL(vp, DKIOCISSOLIDSTATE, (caddr_t)&isssd, 0, ap->a_context) == 0) {
					/*
					 * as a reasonable approximation, only use the lowest bit of the mask
					 * to generate a disk unit number
					 */
					devbsdunit = num_trailing_0(throttle_mask);

					vnode_lock(vp);

					vp->v_un.vu_specinfo->si_isssd = isssd ? 1 : 0;
					vp->v_un.vu_specinfo->si_devbsdunit = devbsdunit;
					vp->v_un.vu_specinfo->si_throttle_mask = throttle_mask;
					vp->v_un.vu_specinfo->si_throttleable = 1;
					/* si_initted last, so readers that see it set also see the fields above. */
					vp->v_un.vu_specinfo->si_initted = 1;

					vnode_unlock(vp);
				}
			}
			/* Ioctls unsupported or mask empty: mark initted anyway so we don't retry. */
			if (vp->v_un.vu_specinfo->si_initted == 0) {
				vnode_lock(vp);
				vp->v_un.vu_specinfo->si_initted = 1;
				vnode_unlock(vp);
			}
		}
		return error;

	case VBLK:
		if ((u_int)maj >= (u_int)nblkdev) {
			return ENXIO;
		}
		/*
		 * When running in very secure mode, do not allow
		 * opens for writing of any disk block devices.
		 */
		if (securelevel >= 2 && cred != FSCRED &&
		    (ap->a_mode & FWRITE) && bdevsw[maj].d_type == D_DISK) {
			return EPERM;
		}
		/*
		 * Do not allow opens of block devices that are
		 * currently mounted.
		 */
		if ((error = vfs_mountedon(vp))) {
			return error;
		}

		devsw_lock(dev, S_IFBLK);
		error = (*bdevsw[maj].d_open)(dev, ap->a_mode, S_IFBLK, p);
		if (!error) {
			vp->v_specinfo->si_opencount++;
		}
		devsw_unlock(dev, S_IFBLK);

		if (!error) {
			u_int64_t blkcnt;
			u_int32_t blksize;
			int setsize = 0;
			u_int32_t size512 = 512;


			if (!VNOP_IOCTL(vp, DKIOCGETBLOCKSIZE, (caddr_t)&blksize, 0, ap->a_context)) {
				/* Switch to 512 byte sectors (temporarily) */

				if (!VNOP_IOCTL(vp, DKIOCSETBLOCKSIZE, (caddr_t)&size512, FWRITE, ap->a_context)) {
					/* Get the number of 512 byte physical blocks. */
					if (!VNOP_IOCTL(vp, DKIOCGETBLOCKCOUNT, (caddr_t)&blkcnt, 0, ap->a_context)) {
						setsize = 1;
					}
				}
				/* If it doesn't set back, we can't recover */
				if (VNOP_IOCTL(vp, DKIOCSETBLOCKSIZE, (caddr_t)&blksize, FWRITE, ap->a_context)) {
					error = ENXIO;
				}
			}


			vnode_lock(vp);
			set_blocksize(vp, dev);

			/*
			 * Cache the size in bytes of the block device for later
			 * use by spec_write().
			 */
			if (setsize) {
				vp->v_specdevsize = blkcnt * (u_int64_t)size512;
			} else {
				vp->v_specdevsize = (u_int64_t)0; /* Default: Can't get */
			}
			vnode_unlock(vp);
		}
		return error;
	default:
		panic("spec_open type");
	}
	return 0;
}
464 
/*
 * Vnode op for read
 *
 * VCHR: hands the uio straight to the character driver's d_read entry,
 *       bracketing the call with I/O-throttling accounting and kdebug
 *       trace events when the device is a disk (D_DISK).
 * VBLK: reads through the buffer cache in PAGE_SIZE chunks aligned to
 *       the device block size, issuing one-chunk read-ahead when access
 *       looks sequential (tracked via v_speclastr).
 *
 * Returns 0 on success or an errno value.
 */
int
spec_read(struct vnop_read_args *ap)
{
	struct vnode *vp = ap->a_vp;
	struct uio *uio = ap->a_uio;
	struct buf *bp;
	daddr64_t bn, nextbn;
	long bscale;
	int devBlockSize = 0;
	size_t bsize, n, on;
	int error = 0;
	dev_t dev;

#if DIAGNOSTIC
	if (uio->uio_rw != UIO_READ) {
		panic("spec_read mode");
	}
	if (UIO_SEG_IS_USER_SPACE(uio->uio_segflg)) {
		panic("spec_read proc");
	}
#endif
	/* Zero-length read: trivially complete. */
	if (uio_resid(uio) == 0) {
		return 0;
	}

	switch (vp->v_type) {
	case VCHR:
	{
		struct _throttle_io_info_t *throttle_info = NULL;
		int thread_throttle_level;
		uint64_t blkno = 0;
		uint32_t iolen = 0;
		int ddisk = 0;
		int ktrace_code = DKIO_READ;
		devBlockSize = vp->v_specsize;
		uintptr_t our_id = 0;

		if (cdevsw[major(vp->v_rdev)].d_type == D_DISK) {
			ddisk = 1;
		}

		/*
		 * For throttleable disk devices (state set up in spec_open),
		 * mark the start of an in-flight I/O; paired with the
		 * throttle_info_end_io_internal() call after the driver returns.
		 */
		if (ddisk && vp->v_un.vu_specinfo->si_throttleable) {
			throttle_info = &_throttle_io_info[vp->v_un.vu_specinfo->si_devbsdunit];
			thread_throttle_level = throttle_info_update_internal(throttle_info, NULL, 0, vp->v_un.vu_specinfo->si_isssd, TRUE, NULL);
		}

		/* Emit the I/O-start trace event. */
		if (kdebug_enable && ddisk) {
			if (devBlockSize == 0) {
				devBlockSize = 512;  // default sector size
			}

			if (uio_offset(uio) && devBlockSize) {
				blkno = ((uint64_t) uio_offset(uio) / ((uint64_t)devBlockSize));
			}
			iolen = (int) uio_resid(uio);
			our_id = (uintptr_t)thread_tid(current_thread());
			KERNEL_DEBUG_CONSTANT_IST(KDEBUG_COMMON,
			    (FSDBG_CODE(DBG_DKRW, ktrace_code)) | DBG_FUNC_NONE, our_id,
			    vp->v_rdev, blkno, iolen, 0);
		}

		/* The driver does the actual transfer; uio is advanced in place. */
		error = (*cdevsw[major(vp->v_rdev)].d_read)
		    (vp->v_rdev, uio, ap->a_ioflag);


		/* Emit the matching I/O-completion trace event. */
		if (kdebug_enable && ddisk) {
			uint32_t residual = (uint32_t)uio_resid(uio);
			ktrace_code |= DKIO_DONE;
			KERNEL_DEBUG_CONSTANT_IST(KDEBUG_COMMON,
			    (FSDBG_CODE(DBG_DKRW, ktrace_code)) | DBG_FUNC_NONE, our_id,
			    (uintptr_t)VM_KERNEL_ADDRPERM(vp), residual, error, 0);
		}

		if (throttle_info) {
			throttle_info_end_io_internal(throttle_info, thread_throttle_level);
		}

		return error;
	}

	case VBLK:
		if (uio->uio_offset < 0) {
			return EINVAL;
		}

		dev = vp->v_rdev;

		devBlockSize = vp->v_specsize;

		if (devBlockSize > PAGE_SIZE) {
			return EINVAL;
		}

		/* bsize is one page's worth of device blocks. */
		bscale = PAGE_SIZE / devBlockSize;
		bsize = bscale * devBlockSize;

		do {
			on = uio->uio_offset % bsize;

			bn = (daddr64_t)((uio->uio_offset / devBlockSize) & ~(bscale - 1));

			/*
			 * This chunk immediately follows the previously read
			 * one: looks sequential, so read ahead one chunk.
			 */
			if (vp->v_speclastr + bscale == bn) {
				nextbn = bn + bscale;
				error = buf_breadn(vp, bn, (int)bsize, &nextbn,
				    (int *)&bsize, 1, NOCRED, &bp);
			} else {
				error = buf_bread(vp, bn, (int)bsize, NOCRED, &bp);
			}

			vnode_lock(vp);
			vp->v_speclastr = bn;
			vnode_unlock(vp);

			/* n = valid bytes actually present in the buffer. */
			n = bsize - buf_resid(bp);
			if ((on > n) || error) {
				if (!error) {
					error = EINVAL;
				}
				buf_brelse(bp);
				return error;
			}
			n = MIN((n  - on), (size_t)uio_resid(uio));

			error = uiomove((char *)buf_dataptr(bp) + on, (int)n, uio);
			if (n + on == bsize) {
				/* Entire chunk consumed: age it so the cache recycles it sooner. */
				buf_markaged(bp);
			}
			buf_brelse(bp);
		} while (error == 0 && uio_resid(uio) > 0 && n != 0);
		return error;

	default:
		panic("spec_read type");
	}
	/* NOTREACHED */

	return 0;
}
606 
/*
 * Vnode op for write
 *
 * VCHR: hands the uio straight to the character driver's d_write entry,
 *       bracketing the call with I/O-throttling accounting and kdebug
 *       trace events when the device is a disk (D_DISK).
 * VBLK: writes through the buffer cache in PAGE_SIZE chunks; a full,
 *       aligned chunk uses buf_getblk() (no read-modify-write), partial
 *       chunks go through buf_bread() first.  IO_SYNC forces synchronous
 *       buf_bwrite(); otherwise full chunks are written async and
 *       partial chunks delayed.
 *
 * Returns 0 on success or an errno value.
 */
int
spec_write(struct vnop_write_args *ap)
{
	struct vnode *vp = ap->a_vp;
	struct uio *uio = ap->a_uio;
	struct buf *bp;
	daddr64_t bn;
	int blkmask, bscale;
	int io_sync;
	int devBlockSize = 0;
	size_t bsize, n, on;
	int error = 0;
	dev_t dev;

#if DIAGNOSTIC
	if (uio->uio_rw != UIO_WRITE) {
		panic("spec_write mode");
	}
	if (UIO_SEG_IS_USER_SPACE(uio->uio_segflg)) {
		panic("spec_write proc");
	}
#endif

	switch (vp->v_type) {
	case VCHR:
	{
		struct _throttle_io_info_t *throttle_info = NULL;
		int thread_throttle_level;
		dev = vp->v_rdev;
		devBlockSize = vp->v_specsize;
		uint32_t iolen = 0;
		uint64_t blkno = 0;
		int ddisk = 0;
		int ktrace_code = 0;  // write is implied; read must be OR'd in.
		uintptr_t our_id = 0;

		if (cdevsw[major(dev)].d_type == D_DISK) {
			ddisk = 1;
		}

		/*
		 * For throttleable disk devices (state set up in spec_open),
		 * mark the start of an in-flight I/O and note the write time;
		 * paired with throttle_info_end_io_internal() below.
		 */
		if (ddisk && vp->v_un.vu_specinfo->si_throttleable) {
			throttle_info = &_throttle_io_info[vp->v_un.vu_specinfo->si_devbsdunit];

			thread_throttle_level = throttle_info_update_internal(throttle_info, NULL, 0, vp->v_un.vu_specinfo->si_isssd, TRUE, NULL);

			microuptime(&throttle_info->throttle_last_write_timestamp);
		}

		/* Emit the I/O-start trace event. */
		if (kdebug_enable && ddisk) {
			if (devBlockSize == 0) {
				devBlockSize = 512; // default sector size
			}
			if ((uio_offset(uio) != 0) && devBlockSize) {
				blkno = ((uint64_t)uio_offset(uio)) / ((uint64_t)devBlockSize);
			}
			iolen = (int)uio_resid(uio);
			our_id = (uintptr_t)thread_tid(current_thread());
			KERNEL_DEBUG_CONSTANT_IST(KDEBUG_COMMON,
			    (FSDBG_CODE(DBG_DKRW, ktrace_code)) | DBG_FUNC_NONE, our_id,
			    vp->v_rdev, blkno, iolen, 0);
		}
		/* The driver does the actual transfer; uio is advanced in place. */
		error = (*cdevsw[major(vp->v_rdev)].d_write)
		    (vp->v_rdev, uio, ap->a_ioflag);

		if (kdebug_enable && ddisk) {
			//emit the I/O completion
			uint32_t residual = (uint32_t)uio_resid(uio);
			ktrace_code |= DKIO_DONE;
			KERNEL_DEBUG_CONSTANT_IST(KDEBUG_COMMON,
			    (FSDBG_CODE(DBG_DKRW, ktrace_code)) | DBG_FUNC_NONE, our_id,
			    (uintptr_t)VM_KERNEL_ADDRPERM(vp), residual, error, 0);
		}

		if (throttle_info) {
			throttle_info_end_io_internal(throttle_info, thread_throttle_level);
		}

		return error;
	}

	case VBLK:
		if (uio_resid(uio) == 0) {
			return 0;
		}
		if (uio->uio_offset < 0) {
			return EINVAL;
		}

		io_sync = (ap->a_ioflag & IO_SYNC);

		dev = (vp->v_rdev);

		devBlockSize = vp->v_specsize;
		if (devBlockSize > PAGE_SIZE) {
			return EINVAL;
		}

		/* bsize is one page's worth of device blocks. */
		bscale = PAGE_SIZE / devBlockSize;
		blkmask = bscale - 1;
		bsize = bscale * devBlockSize;


		do {
			bn = (daddr64_t)((uio->uio_offset / devBlockSize) & ~blkmask);
			on = uio->uio_offset % bsize;

			n = MIN((bsize - on), (size_t)uio_resid(uio));

			/*
			 * Use buf_getblk() as an optimization IFF:
			 *
			 * 1)	We are reading exactly a block on a block
			 *	aligned boundary
			 * 2)	We know the size of the device from spec_open
			 * 3)	The read doesn't span the end of the device
			 *
			 * Otherwise, we fall back on buf_bread().
			 */
			if (n == bsize &&
			    vp->v_specdevsize != (u_int64_t)0 &&
			    (uio->uio_offset + (u_int64_t)n) > vp->v_specdevsize) {
				/* reduce the size of the read to what is there */
				/*
				 * NOTE(review): this computes the overshoot past
				 * end-of-device, not the bytes remaining before it
				 * (which would be specdevsize - offset).  Looks
				 * suspicious but matches shipped behavior — confirm
				 * against upstream before changing.
				 */
				n = (uio->uio_offset + (u_int64_t)n) - vp->v_specdevsize;
			}

			if (n == bsize) {
				/* Full aligned chunk: no need to read existing contents first. */
				bp = buf_getblk(vp, bn, (int)bsize, 0, 0, BLK_WRITE);
			} else {
				error = (int)buf_bread(vp, bn, (int)bsize, NOCRED, &bp);
			}

			/* Translate downstream error for upstream, if needed */
			if (!error) {
				error = (int)buf_error(bp);
			}
			if (error) {
				buf_brelse(bp);
				return error;
			}
			n = MIN(n, bsize - buf_resid(bp));

			error = uiomove((char *)buf_dataptr(bp) + on, (int)n, uio);
			if (error) {
				buf_brelse(bp);
				return error;
			}
			/* Age the buffer so the cache recycles it sooner. */
			buf_markaged(bp);

			if (io_sync) {
				error = buf_bwrite(bp);
			} else {
				if ((n + on) == bsize) {
					/* Chunk fully written: push it out asynchronously. */
					error = buf_bawrite(bp);
				} else {
					/* Partial chunk: delay the write in case more arrives. */
					error = buf_bdwrite(bp);
				}
			}
		} while (error == 0 && uio_resid(uio) > 0 && n != 0);
		return error;

	default:
		panic("spec_write type");
	}
	/* NOTREACHED */

	return 0;
}
777 
778 /*
779  * Device ioctl operation.
780  */
781 int
spec_ioctl(struct vnop_ioctl_args * ap)782 spec_ioctl(struct vnop_ioctl_args *ap)
783 {
784 	proc_t p = vfs_context_proc(ap->a_context);
785 	dev_t dev = ap->a_vp->v_rdev;
786 	int     retval = 0;
787 
788 	KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_IOCTL, 0) | DBG_FUNC_START,
789 	    dev, ap->a_command, ap->a_fflag, ap->a_vp->v_type, 0);
790 
791 	switch (ap->a_vp->v_type) {
792 	case VCHR:
793 		retval = (*cdevsw[major(dev)].d_ioctl)(dev, ap->a_command, ap->a_data,
794 		    ap->a_fflag, p);
795 		break;
796 
797 	case VBLK:
798 		retval = (*bdevsw[major(dev)].d_ioctl)(dev, ap->a_command, ap->a_data, ap->a_fflag, p);
799 		if (!retval && ap->a_command == DKIOCSETBLOCKSIZE) {
800 			ap->a_vp->v_specsize = *(uint32_t *)ap->a_data;
801 		}
802 		break;
803 
804 	default:
805 		panic("spec_ioctl");
806 		/* NOTREACHED */
807 	}
808 	KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_IOCTL, 0) | DBG_FUNC_END,
809 	    dev, ap->a_command, ap->a_fflag, retval, 0);
810 
811 	return retval;
812 }
813 
814 int
spec_select(struct vnop_select_args * ap)815 spec_select(struct vnop_select_args *ap)
816 {
817 	proc_t p = vfs_context_proc(ap->a_context);
818 	dev_t dev;
819 
820 	switch (ap->a_vp->v_type) {
821 	default:
822 		return 1;             /* XXX */
823 
824 	case VCHR:
825 		dev = ap->a_vp->v_rdev;
826 		return (*cdevsw[major(dev)].d_select)(dev, ap->a_which, ap->a_wql, p);
827 	}
828 }
829 
/*
 * Attach a knote to a character special vnode.
 *
 * Routing order: bpf devices first (networking builds), then pts/ptmx
 * pseudo-terminals, then generic TTYs, and finally the generic spec
 * filter as a fallback.  Returns the value of the chosen filter's
 * attach routine, or 0 with the knote error set for a bad device.
 */
int
spec_kqfilter(vnode_t vp, struct knote *kn, struct kevent_qos_s *kev)
{
	dev_t dev;

	assert(vnode_ischr(vp));

	dev = vnode_specrdev(vp);

#if NETWORKING
	/*
	 * Try a bpf device, as defined in bsd/net/bpf.c
	 * If it doesn't error out the attach, then it
	 * claimed it. Otherwise, fall through and try
	 * other attaches.
	 */
	int32_t tmp_flags = kn->kn_flags;
	int64_t tmp_sdata = kn->kn_sdata;
	int res;

	res = bpfkqfilter(dev, kn);
	if ((kn->kn_flags & EV_ERROR) == 0) {
		return res;
	}
	/* bpf declined: restore the knote fields it may have modified. */
	kn->kn_flags = tmp_flags;
	kn->kn_sdata = tmp_sdata;
#endif

	if (major(dev) >= nchrdev) {
		knote_set_error(kn, ENXIO);
		return 0;
	}

	/* Record whether the driver supports kqueue natively and uses offsets. */
	kn->kn_vnode_kqok = !!(cdevsw_flags[major(dev)] & CDEVSW_SELECT_KQUEUE);
	kn->kn_vnode_use_ofst = !!(cdevsw_flags[major(dev)] & CDEVSW_USE_OFFSET);

	if (cdevsw_flags[major(dev)] & CDEVSW_IS_PTS) {
		kn->kn_filtid = EVFILTID_PTSD;
		return ptsd_kqfilter(dev, kn);
	} else if (cdevsw_flags[major(dev)] & CDEVSW_IS_PTC) {
		kn->kn_filtid = EVFILTID_PTMX;
		return ptmx_kqfilter(dev, kn);
	} else if (cdevsw[major(dev)].d_type == D_TTY && kn->kn_vnode_kqok) {
		/*
		 * TTYs from drivers that use struct ttys use their own filter
		 * routines.  The PTC driver doesn't use the tty for character
		 * counts, so it must go through the select fallback.
		 */
		kn->kn_filtid = EVFILTID_TTY;
	} else {
		/* Try to attach to other char special devices */
		kn->kn_filtid = EVFILTID_SPEC;
	}

	return knote_fops(kn)->f_attach(kn, kev);
}
886 
887 /*
888  * Synch buffers associated with a block device
889  */
890 int
spec_fsync_internal(vnode_t vp,int waitfor,__unused vfs_context_t context)891 spec_fsync_internal(vnode_t vp, int waitfor, __unused vfs_context_t context)
892 {
893 	if (vp->v_type == VCHR) {
894 		return 0;
895 	}
896 	/*
897 	 * Flush all dirty buffers associated with a block device.
898 	 */
899 	buf_flushdirtyblks(vp, (waitfor == MNT_WAIT || waitfor == MNT_DWAIT), 0, "spec_fsync");
900 
901 	return 0;
902 }
903 
/*
 * VNOP_FSYNC entry point: thin wrapper that forwards the vnode,
 * wait mode, and context to spec_fsync_internal().
 */
int
spec_fsync(struct vnop_fsync_args *ap)
{
	return spec_fsync_internal(ap->a_vp, ap->a_waitfor, ap->a_context);
}
909 
910 
/*
 * I/O throttling support.
 *
 * NOTE(review): the comment originally here ("Just call the device
 * strategy routine") describes spec_strategy() further down in this
 * file; the throttling machinery was inserted in between.
 */
void throttle_init(void);


/* Debug tracing for throttle-info allocation; compiled out by default. */
#if 0
#define DEBUG_ALLOC_THROTTLE_INFO(format, debug_info, args...)  \
	do {                                                    \
	       if ((debug_info)->alloc)                           \
	       printf("%s: "format, __FUNCTION__, ## args);     \
       } while(0)

#else
#define DEBUG_ALLOC_THROTTLE_INFO(format, debug_info, args...)
#endif


/* Runtime tunables exposing the per-tier throttle tables defined above. */
SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_tier1_window_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &throttle_windows_msecs[THROTTLE_LEVEL_TIER1], 0, "");
SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_tier2_window_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &throttle_windows_msecs[THROTTLE_LEVEL_TIER2], 0, "");
SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_tier3_window_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &throttle_windows_msecs[THROTTLE_LEVEL_TIER3], 0, "");

SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_tier1_io_period_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &throttle_io_period_msecs[THROTTLE_LEVEL_TIER1], 0, "");
SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_tier2_io_period_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &throttle_io_period_msecs[THROTTLE_LEVEL_TIER2], 0, "");
SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_tier3_io_period_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &throttle_io_period_msecs[THROTTLE_LEVEL_TIER3], 0, "");

SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_tier1_io_period_ssd_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &throttle_io_period_ssd_msecs[THROTTLE_LEVEL_TIER1], 0, "");
SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_tier2_io_period_ssd_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &throttle_io_period_ssd_msecs[THROTTLE_LEVEL_TIER2], 0, "");
SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_tier3_io_period_ssd_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &throttle_io_period_ssd_msecs[THROTTLE_LEVEL_TIER3], 0, "");

SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_enabled, CTLFLAG_RW | CTLFLAG_LOCKED, &lowpri_throttle_enabled, 0, "");


/* Lock group for the per-device throttle_lock mutexes. */
static LCK_GRP_DECLARE(throttle_lock_grp, "throttle I/O");
946 
947 /*
948  * throttled I/O helper function
949  * convert the index of the lowest set bit to a device index
950  */
951 int
num_trailing_0(uint64_t n)952 num_trailing_0(uint64_t n)
953 {
954 	/*
955 	 * since in most cases the number of trailing 0s is very small,
956 	 * we simply counting sequentially from the lowest bit
957 	 */
958 	if (n == 0) {
959 		return sizeof(n) * 8;
960 	}
961 	int count = 0;
962 	while (!ISSET(n, 1)) {
963 		n >>= 1;
964 		++count;
965 	}
966 	return count;
967 }
968 
969 
970 /*
971  * Release the reference and if the item was allocated and this is the last
972  * reference then free it.
973  *
974  * This routine always returns the old value.
975  */
976 static int
throttle_info_rel(struct _throttle_io_info_t * info)977 throttle_info_rel(struct _throttle_io_info_t *info)
978 {
979 	SInt32 oldValue = OSDecrementAtomic(&info->throttle_refcnt);
980 
981 	DEBUG_ALLOC_THROTTLE_INFO("refcnt = %d info = %p\n",
982 	    info, (int)(oldValue - 1), info );
983 
984 	/* The reference count just went negative, very bad */
985 	if (oldValue == 0) {
986 		panic("throttle info ref cnt went negative!");
987 	}
988 
989 	/*
990 	 * Once reference count is zero, no one else should be able to take a
991 	 * reference
992 	 */
993 	if ((oldValue == 1) && (info->throttle_alloc)) {
994 		DEBUG_ALLOC_THROTTLE_INFO("Freeing info = %p\n", info);
995 
996 		lck_mtx_destroy(&info->throttle_lock, &throttle_lock_grp);
997 		kfree_type(struct _throttle_io_info_t, info);
998 	}
999 	return oldValue;
1000 }
1001 
1002 
1003 /*
1004  * Just take a reference on the throttle info structure.
1005  *
1006  * This routine always returns the old value.
1007  */
1008 static SInt32
throttle_info_ref(struct _throttle_io_info_t * info)1009 throttle_info_ref(struct _throttle_io_info_t *info)
1010 {
1011 	SInt32 oldValue = OSIncrementAtomic(&info->throttle_refcnt);
1012 
1013 	DEBUG_ALLOC_THROTTLE_INFO("refcnt = %d info = %p\n",
1014 	    info, (int)(oldValue - 1), info );
1015 	/* Allocated items should never have a reference of zero */
1016 	if (info->throttle_alloc && (oldValue == 0)) {
1017 		panic("Taking a reference without calling create throttle info!");
1018 	}
1019 
1020 	return oldValue;
1021 }
1022 
1023 /*
1024  * on entry the throttle_lock is held...
1025  * this function is responsible for taking
1026  * and dropping the reference on the info
1027  * structure which will keep it from going
1028  * away while the timer is running if it
1029  * happens to have been dynamically allocated by
1030  * a network fileystem kext which is now trying
1031  * to free it
1032  */
1033 static uint32_t
throttle_timer_start(struct _throttle_io_info_t * info,boolean_t update_io_count,int wakelevel)1034 throttle_timer_start(struct _throttle_io_info_t *info, boolean_t update_io_count, int wakelevel)
1035 {
1036 	struct timeval  elapsed;
1037 	struct timeval  now;
1038 	struct timeval  period;
1039 	uint64_t        elapsed_msecs;
1040 	int             throttle_level;
1041 	int             level;
1042 	int             msecs;
1043 	boolean_t       throttled = FALSE;
1044 	boolean_t       need_timer = FALSE;
1045 
1046 	microuptime(&now);
1047 
1048 	if (update_io_count == TRUE) {
1049 		info->throttle_io_count_begin = info->throttle_io_count;
1050 		info->throttle_io_period_num++;
1051 
1052 		while (wakelevel >= THROTTLE_LEVEL_THROTTLED) {
1053 			info->throttle_start_IO_period_timestamp[wakelevel--] = now;
1054 		}
1055 
1056 		info->throttle_min_timer_deadline = now;
1057 
1058 		msecs = info->throttle_io_periods[THROTTLE_LEVEL_THROTTLED];
1059 		period.tv_sec = msecs / 1000;
1060 		period.tv_usec = (msecs % 1000) * 1000;
1061 
1062 		timevaladd(&info->throttle_min_timer_deadline, &period);
1063 	}
1064 	for (throttle_level = THROTTLE_LEVEL_START; throttle_level < THROTTLE_LEVEL_END; throttle_level++) {
1065 		elapsed = now;
1066 		timevalsub(&elapsed, &info->throttle_window_start_timestamp[throttle_level]);
1067 		elapsed_msecs = (uint64_t)elapsed.tv_sec * (uint64_t)1000 + (elapsed.tv_usec / 1000);
1068 
1069 		for (level = throttle_level + 1; level <= THROTTLE_LEVEL_END; level++) {
1070 			if (!TAILQ_EMPTY(&info->throttle_uthlist[level])) {
1071 				if (elapsed_msecs < (uint64_t)throttle_windows_msecs[level] || info->throttle_inflight_count[throttle_level]) {
1072 					/*
1073 					 * we had an I/O occur at a higher priority tier within
1074 					 * this tier's throttle window
1075 					 */
1076 					throttled = TRUE;
1077 				}
1078 				/*
1079 				 * we assume that the windows are the same or longer
1080 				 * as we drop through the throttling tiers...  thus
1081 				 * we can stop looking once we run into a tier with
1082 				 * threads to schedule regardless of whether it's
1083 				 * still in its throttling window or not
1084 				 */
1085 				break;
1086 			}
1087 		}
1088 		if (throttled == TRUE) {
1089 			break;
1090 		}
1091 	}
1092 	if (throttled == TRUE) {
1093 		uint64_t        deadline = 0;
1094 		struct timeval  target;
1095 		struct timeval  min_target;
1096 
1097 		/*
1098 		 * we've got at least one tier still in a throttled window
1099 		 * so we need a timer running... compute the next deadline
1100 		 * and schedule it
1101 		 */
1102 		for (level = throttle_level + 1; level <= THROTTLE_LEVEL_END; level++) {
1103 			if (TAILQ_EMPTY(&info->throttle_uthlist[level])) {
1104 				continue;
1105 			}
1106 
1107 			target = info->throttle_start_IO_period_timestamp[level];
1108 
1109 			msecs = info->throttle_io_periods[level];
1110 			period.tv_sec = msecs / 1000;
1111 			period.tv_usec = (msecs % 1000) * 1000;
1112 
1113 			timevaladd(&target, &period);
1114 
1115 			if (need_timer == FALSE || timevalcmp(&target, &min_target, <)) {
1116 				min_target = target;
1117 				need_timer = TRUE;
1118 			}
1119 		}
1120 		if (timevalcmp(&info->throttle_min_timer_deadline, &now, >)) {
1121 			if (timevalcmp(&info->throttle_min_timer_deadline, &min_target, >)) {
1122 				min_target = info->throttle_min_timer_deadline;
1123 			}
1124 		}
1125 
1126 		if (info->throttle_timer_active) {
1127 			if (thread_call_cancel(info->throttle_timer_call) == FALSE) {
1128 				/*
1129 				 * couldn't kill the timer because it's already
1130 				 * been dispatched, so don't try to start a new
1131 				 * one... once we drop the lock, the timer will
1132 				 * proceed and eventually re-run this function
1133 				 */
1134 				need_timer = FALSE;
1135 			} else {
1136 				info->throttle_timer_active = 0;
1137 			}
1138 		}
1139 		if (need_timer == TRUE) {
1140 			/*
1141 			 * This is defined as an int (32-bit) rather than a 64-bit
1142 			 * value because it would need a really big period in the
1143 			 * order of ~500 days to overflow this. So, we let this be
1144 			 * 32-bit which allows us to use the clock_interval_to_deadline()
1145 			 * routine.
1146 			 */
1147 			int     target_msecs;
1148 
1149 			if (info->throttle_timer_ref == 0) {
1150 				/*
1151 				 * take a reference for the timer
1152 				 */
1153 				throttle_info_ref(info);
1154 
1155 				info->throttle_timer_ref = 1;
1156 			}
1157 			elapsed = min_target;
1158 			timevalsub(&elapsed, &now);
1159 			target_msecs = (int)(elapsed.tv_sec * 1000 + elapsed.tv_usec / 1000);
1160 
1161 			if (target_msecs <= 0) {
1162 				/*
1163 				 * we may have computed a deadline slightly in the past
1164 				 * due to various factors... if so, just set the timer
1165 				 * to go off in the near future (we don't need to be precise)
1166 				 */
1167 				target_msecs = 1;
1168 			}
1169 			clock_interval_to_deadline(target_msecs, 1000000, &deadline);
1170 
1171 			thread_call_enter_delayed(info->throttle_timer_call, deadline);
1172 			info->throttle_timer_active = 1;
1173 		}
1174 	}
1175 	return throttle_level;
1176 }
1177 
1178 
/*
 * Thread-call callback for the throttle timer (armed by
 * throttle_timer_start).  Closes out the current I/O period when it has
 * expired, wakes at most one waiter at the next tier in round-robin
 * order, re-evaluates/re-arms the timer, and then drains every waiter
 * whose tier is no longer throttled.  Drops the timer's reference on
 * 'info' when the timer is left unarmed.
 */
static void
throttle_timer(struct _throttle_io_info_t *info, __unused thread_call_param_t p)
{
	uthread_t       ut, utlist;
	struct timeval  elapsed;
	struct timeval  now;
	uint64_t        elapsed_msecs;
	int             throttle_level;
	int             level;
	int             wake_level;
	caddr_t         wake_address = NULL;
	boolean_t       update_io_count = FALSE;
	boolean_t       need_wakeup = FALSE;
	boolean_t       need_release = FALSE;

	ut = NULL;
	lck_mtx_lock(&info->throttle_lock);

	info->throttle_timer_active = 0;
	microuptime(&now);

	elapsed = now;
	timevalsub(&elapsed, &info->throttle_start_IO_period_timestamp[THROTTLE_LEVEL_THROTTLED]);
	elapsed_msecs = (uint64_t)elapsed.tv_sec * (uint64_t)1000 + (elapsed.tv_usec / 1000);

	if (elapsed_msecs >= (uint64_t)info->throttle_io_periods[THROTTLE_LEVEL_THROTTLED]) {
		/*
		 * the base I/O period has expired... rotate through the
		 * tiers (starting from where we left off last time) looking
		 * for one with an expired period and a thread to wake
		 */
		wake_level = info->throttle_next_wake_level;

		for (level = THROTTLE_LEVEL_START; level < THROTTLE_LEVEL_END; level++) {
			elapsed = now;
			timevalsub(&elapsed, &info->throttle_start_IO_period_timestamp[wake_level]);
			elapsed_msecs = (uint64_t)elapsed.tv_sec * (uint64_t)1000 + (elapsed.tv_usec / 1000);

			if (elapsed_msecs >= (uint64_t)info->throttle_io_periods[wake_level] && !TAILQ_EMPTY(&info->throttle_uthlist[wake_level])) {
				/*
				 * we're closing out the current IO period...
				 * if we have a waiting thread, wake it up
				 * after we have reset the I/O window info
				 */
				need_wakeup = TRUE;
				update_io_count = TRUE;

				info->throttle_next_wake_level = wake_level - 1;

				if (info->throttle_next_wake_level == THROTTLE_LEVEL_START) {
					info->throttle_next_wake_level = THROTTLE_LEVEL_END;
				}

				break;
			}
			wake_level--;

			if (wake_level == THROTTLE_LEVEL_START) {
				wake_level = THROTTLE_LEVEL_END;
			}
		}
	}
	if (need_wakeup == TRUE) {
		/* dequeue the first waiter at the chosen tier; wake it after the timer is rearmed */
		if (!TAILQ_EMPTY(&info->throttle_uthlist[wake_level])) {
			ut = (uthread_t)TAILQ_FIRST(&info->throttle_uthlist[wake_level]);
			TAILQ_REMOVE(&info->throttle_uthlist[wake_level], ut, uu_throttlelist);
			ut->uu_on_throttlelist = THROTTLE_LEVEL_NONE;
			ut->uu_is_throttled = false;

			wake_address = (caddr_t)&ut->uu_on_throttlelist;
		}
	} else {
		wake_level = THROTTLE_LEVEL_START;
	}

	throttle_level = throttle_timer_start(info, update_io_count, wake_level);

	if (wake_address != NULL) {
		wakeup(wake_address);
	}

	/* release every waiter at a tier that is no longer being throttled */
	for (level = THROTTLE_LEVEL_THROTTLED; level <= throttle_level; level++) {
		TAILQ_FOREACH_SAFE(ut, &info->throttle_uthlist[level], uu_throttlelist, utlist) {
			TAILQ_REMOVE(&info->throttle_uthlist[level], ut, uu_throttlelist);
			ut->uu_on_throttlelist = THROTTLE_LEVEL_NONE;
			ut->uu_is_throttled = false;

			wakeup(&ut->uu_on_throttlelist);
		}
	}
	if (info->throttle_timer_active == 0 && info->throttle_timer_ref) {
		/* timer was not rearmed: drop its reference (outside the lock) */
		info->throttle_timer_ref = 0;
		need_release = TRUE;
	}
	lck_mtx_unlock(&info->throttle_lock);

	if (need_release == TRUE) {
		throttle_info_rel(info);
	}
}
1274 
1275 
/*
 * Queue 'ut' on the throttle list for tier 'mylevel', optionally at the
 * tail.  When this is the first waiter at that tier, the tier's I/O
 * period timestamp is reset and the throttle timer is (re)evaluated.
 *
 * NOTE(review): appears to require info->throttle_lock to be held — it
 * calls throttle_timer_start(), whose contract says the lock is held on
 * entry; confirm at call sites.
 *
 * Returns the value from throttle_timer_start() when the timer was
 * evaluated (THROTTLE_LEVEL_END means no throttling is in effect and
 * 'ut' was already removed again here), otherwise THROTTLE_LEVEL_START.
 */
static int
throttle_add_to_list(struct _throttle_io_info_t *info, uthread_t ut, int mylevel, boolean_t insert_tail)
{
	boolean_t start_timer = FALSE;
	int level = THROTTLE_LEVEL_START;

	if (TAILQ_EMPTY(&info->throttle_uthlist[mylevel])) {
		/* first waiter at this tier: restart its I/O period from the last I/O */
		info->throttle_start_IO_period_timestamp[mylevel] = info->throttle_last_IO_timestamp[mylevel];
		start_timer = TRUE;
	}

	if (insert_tail == TRUE) {
		TAILQ_INSERT_TAIL(&info->throttle_uthlist[mylevel], ut, uu_throttlelist);
	} else {
		TAILQ_INSERT_HEAD(&info->throttle_uthlist[mylevel], ut, uu_throttlelist);
	}

	ut->uu_on_throttlelist = (int8_t)mylevel;

	if (start_timer == TRUE) {
		/* we may need to start or rearm the timer */
		level = throttle_timer_start(info, FALSE, THROTTLE_LEVEL_START);

		if (level == THROTTLE_LEVEL_END) {
			/* nothing is throttled after all: undo the enqueue */
			if (ut->uu_on_throttlelist >= THROTTLE_LEVEL_THROTTLED) {
				TAILQ_REMOVE(&info->throttle_uthlist[ut->uu_on_throttlelist], ut, uu_throttlelist);

				ut->uu_on_throttlelist = THROTTLE_LEVEL_NONE;
			}
		}
	}
	return level;
}
1309 
1310 static void
throttle_init_throttle_window(void)1311 throttle_init_throttle_window(void)
1312 {
1313 	int throttle_window_size;
1314 
1315 	/*
1316 	 * The hierarchy of throttle window values is as follows:
1317 	 * - Global defaults
1318 	 * - Device tree properties
1319 	 * - Boot-args
1320 	 * All values are specified in msecs.
1321 	 */
1322 
1323 #if (XNU_TARGET_OS_OSX && __arm64__)
1324 	/*
1325 	 * IO Tier EDT overrides are meant for
1326 	 * some arm platforms but not for
1327 	 * macs.
1328 	 */
1329 #else /* (XNU_TARGET_OS_OSX && __arm64__) */
1330 	/* Override global values with device-tree properties */
1331 	if (PE_get_default("kern.io_throttle_window_tier1", &throttle_window_size, sizeof(throttle_window_size))) {
1332 		throttle_windows_msecs[THROTTLE_LEVEL_TIER1] = throttle_window_size;
1333 	}
1334 
1335 	if (PE_get_default("kern.io_throttle_window_tier2", &throttle_window_size, sizeof(throttle_window_size))) {
1336 		throttle_windows_msecs[THROTTLE_LEVEL_TIER2] = throttle_window_size;
1337 	}
1338 
1339 	if (PE_get_default("kern.io_throttle_window_tier3", &throttle_window_size, sizeof(throttle_window_size))) {
1340 		throttle_windows_msecs[THROTTLE_LEVEL_TIER3] = throttle_window_size;
1341 	}
1342 #endif /* (XNU_TARGET_OS_OSX && __arm64__) */
1343 
1344 	/* Override with boot-args */
1345 	if (PE_parse_boot_argn("io_throttle_window_tier1", &throttle_window_size, sizeof(throttle_window_size))) {
1346 		throttle_windows_msecs[THROTTLE_LEVEL_TIER1] = throttle_window_size;
1347 	}
1348 
1349 	if (PE_parse_boot_argn("io_throttle_window_tier2", &throttle_window_size, sizeof(throttle_window_size))) {
1350 		throttle_windows_msecs[THROTTLE_LEVEL_TIER2] = throttle_window_size;
1351 	}
1352 
1353 	if (PE_parse_boot_argn("io_throttle_window_tier3", &throttle_window_size, sizeof(throttle_window_size))) {
1354 		throttle_windows_msecs[THROTTLE_LEVEL_TIER3] = throttle_window_size;
1355 	}
1356 }
1357 
1358 static void
throttle_init_throttle_period(struct _throttle_io_info_t * info,boolean_t isssd)1359 throttle_init_throttle_period(struct _throttle_io_info_t *info, boolean_t isssd)
1360 {
1361 	int throttle_period_size;
1362 
1363 	/*
1364 	 * The hierarchy of throttle period values is as follows:
1365 	 * - Global defaults
1366 	 * - Device tree properties
1367 	 * - Boot-args
1368 	 * All values are specified in msecs.
1369 	 */
1370 
1371 	/* Assign global defaults */
1372 	if ((isssd == TRUE) && (info->throttle_is_fusion_with_priority == 0)) {
1373 		info->throttle_io_periods = &throttle_io_period_ssd_msecs[0];
1374 	} else {
1375 		info->throttle_io_periods = &throttle_io_period_msecs[0];
1376 	}
1377 
1378 #if (XNU_TARGET_OS_OSX && __arm64__)
1379 	/*
1380 	 * IO Tier EDT overrides are meant for
1381 	 * some arm platforms but not for
1382 	 * macs.
1383 	 */
1384 #else /* (XNU_TARGET_OS_OSX && __arm64__) */
1385 	/* Override global values with device-tree properties */
1386 	if (PE_get_default("kern.io_throttle_period_tier1", &throttle_period_size, sizeof(throttle_period_size))) {
1387 		info->throttle_io_periods[THROTTLE_LEVEL_TIER1] = throttle_period_size;
1388 	}
1389 
1390 	if (PE_get_default("kern.io_throttle_period_tier2", &throttle_period_size, sizeof(throttle_period_size))) {
1391 		info->throttle_io_periods[THROTTLE_LEVEL_TIER2] = throttle_period_size;
1392 	}
1393 
1394 	if (PE_get_default("kern.io_throttle_period_tier3", &throttle_period_size, sizeof(throttle_period_size))) {
1395 		info->throttle_io_periods[THROTTLE_LEVEL_TIER3] = throttle_period_size;
1396 	}
1397 #endif /* (XNU_TARGET_OS_OSX && __arm64__) */
1398 
1399 	/* Override with boot-args */
1400 	if (PE_parse_boot_argn("io_throttle_period_tier1", &throttle_period_size, sizeof(throttle_period_size))) {
1401 		info->throttle_io_periods[THROTTLE_LEVEL_TIER1] = throttle_period_size;
1402 	}
1403 
1404 	if (PE_parse_boot_argn("io_throttle_period_tier2", &throttle_period_size, sizeof(throttle_period_size))) {
1405 		info->throttle_io_periods[THROTTLE_LEVEL_TIER2] = throttle_period_size;
1406 	}
1407 
1408 	if (PE_parse_boot_argn("io_throttle_period_tier3", &throttle_period_size, sizeof(throttle_period_size))) {
1409 		info->throttle_io_periods[THROTTLE_LEVEL_TIER3] = throttle_period_size;
1410 	}
1411 }
1412 
1413 #if CONFIG_IOSCHED
1414 extern  void vm_io_reprioritize_init(void);
1415 int     iosched_enabled = 1;
1416 #endif
1417 
1418 void
throttle_init(void)1419 throttle_init(void)
1420 {
1421 	struct _throttle_io_info_t *info;
1422 	int     i;
1423 	int     level;
1424 #if CONFIG_IOSCHED
1425 	int     iosched;
1426 #endif
1427 
1428 	/* Update throttle parameters based on device tree configuration */
1429 	throttle_init_throttle_window();
1430 
1431 	for (i = 0; i < LOWPRI_MAX_NUM_DEV; i++) {
1432 		info = &_throttle_io_info[i];
1433 
1434 		lck_mtx_init(&info->throttle_lock, &throttle_lock_grp, LCK_ATTR_NULL);
1435 		info->throttle_timer_call = thread_call_allocate((thread_call_func_t)throttle_timer, (thread_call_param_t)info);
1436 
1437 		for (level = 0; level <= THROTTLE_LEVEL_END; level++) {
1438 			TAILQ_INIT(&info->throttle_uthlist[level]);
1439 			info->throttle_last_IO_pid[level] = 0;
1440 			info->throttle_inflight_count[level] = 0;
1441 		}
1442 		info->throttle_next_wake_level = THROTTLE_LEVEL_END;
1443 		info->throttle_disabled = 0;
1444 		info->throttle_is_fusion_with_priority = 0;
1445 	}
1446 #if CONFIG_IOSCHED
1447 	if (PE_parse_boot_argn("iosched", &iosched, sizeof(iosched))) {
1448 		iosched_enabled = iosched;
1449 	}
1450 	if (iosched_enabled) {
1451 		/* Initialize I/O Reprioritization mechanism */
1452 		vm_io_reprioritize_init();
1453 	}
1454 #endif
1455 }
1456 
1457 void
sys_override_io_throttle(boolean_t enable_override)1458 sys_override_io_throttle(boolean_t enable_override)
1459 {
1460 	if (enable_override) {
1461 		lowpri_throttle_enabled = 0;
1462 	} else {
1463 		lowpri_throttle_enabled = 1;
1464 	}
1465 }
1466 
/* number of throttled threads woken early because a rethrottle changed their level */
int rethrottle_wakeups = 0;

/*
 * the uu_rethrottle_lock is used to synchronize this function
 * with "throttle_lowpri_io" which is where a throttled thread
 * will block... that function will grab this lock before beginning
 * its decision making process concerning the need to block, and
 * hold it through the assert_wait.  When that thread is awakened
 * for any reason (timer or rethrottle), it will reacquire the
 * uu_rethrottle_lock before determining if it really is ok for
 * it to now run.  This is the point at which the thread could
 * enter a different throttling queue and reblock or return from
 * the throttle w/o having waited out its entire throttle if
 * the rethrottle has now moved it out of any currently
 * active throttle window.
 *
 *
 * NOTES:
 * 1 - This may be called with the task lock held.
 * 2 - This may be called with preemption and interrupts disabled
 *     in the kqueue wakeup path so we can't take the throttle_lock which is a mutex
 * 3 - This cannot safely dereference uu_throttle_info, as it may
 *     get deallocated out from under us
 */

void
rethrottle_thread(uthread_t ut)
{
	/*
	 * If uthread doesn't have throttle state, then there's no chance
	 * of it needing a rethrottle.
	 */
	if (ut->uu_throttle_info == NULL) {
		return;
	}

	/* interrupts must be disabled before taking the rethrottle spinlock (see note 2) */
	boolean_t s = ml_set_interrupts_enabled(FALSE);
	lck_spin_lock(&ut->uu_rethrottle_lock);

	if (!ut->uu_is_throttled) {
		/* not blocked right now: leave a marker so it re-checks before blocking */
		ut->uu_was_rethrottled = true;
	} else {
		int my_new_level = throttle_get_thread_throttle_level(ut);

		if (my_new_level != ut->uu_on_throttlelist) {
			/*
			 * ut is currently blocked (as indicated by
			 * ut->uu_is_throttled == true)
			 * and we're changing its throttle level, so
			 * we need to wake it up.
			 */
			ut->uu_is_throttled = false;
			wakeup(&ut->uu_on_throttlelist);

			rethrottle_wakeups++;
			KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_FSRW, 102)),
			    uthread_tid(ut), ut->uu_on_throttlelist, my_new_level, 0, 0);
		}
	}
	lck_spin_unlock(&ut->uu_rethrottle_lock);
	ml_set_interrupts_enabled(s);
}
1529 
1530 
1531 /*
1532  * KPI routine
1533  *
1534  * Create and take a reference on a throttle info structure and return a
1535  * pointer for the file system to use when calling throttle_info_update.
1536  * Calling file system must have a matching release for every create.
1537  */
1538 void *
throttle_info_create(void)1539 throttle_info_create(void)
1540 {
1541 	struct _throttle_io_info_t *info;
1542 	int     level;
1543 
1544 	info = kalloc_type(struct _throttle_io_info_t,
1545 	    Z_ZERO | Z_WAITOK | Z_NOFAIL);
1546 	/* Mark that this one was allocated and needs to be freed */
1547 	DEBUG_ALLOC_THROTTLE_INFO("Creating info = %p\n", info, info );
1548 	info->throttle_alloc = TRUE;
1549 
1550 	lck_mtx_init(&info->throttle_lock, &throttle_lock_grp, LCK_ATTR_NULL);
1551 	info->throttle_timer_call = thread_call_allocate((thread_call_func_t)throttle_timer, (thread_call_param_t)info);
1552 
1553 	for (level = 0; level <= THROTTLE_LEVEL_END; level++) {
1554 		TAILQ_INIT(&info->throttle_uthlist[level]);
1555 	}
1556 	info->throttle_next_wake_level = THROTTLE_LEVEL_END;
1557 
1558 	/* Take a reference */
1559 	OSIncrementAtomic(&info->throttle_refcnt);
1560 	return info;
1561 }
1562 
1563 /*
1564  * KPI routine
1565  *
1566  * Release the throttle info pointer if all the reference are gone. Should be
1567  * called to release reference taken by throttle_info_create
1568  */
1569 void
throttle_info_release(void * throttle_info)1570 throttle_info_release(void *throttle_info)
1571 {
1572 	DEBUG_ALLOC_THROTTLE_INFO("Releaseing info = %p\n",
1573 	    (struct _throttle_io_info_t *)throttle_info,
1574 	    (struct _throttle_io_info_t *)throttle_info);
1575 	if (throttle_info) { /* Just to be careful */
1576 		throttle_info_rel(throttle_info);
1577 	}
1578 }
1579 
1580 /*
1581  * KPI routine
1582  *
1583  * File Systems that create an info structure, need to call this routine in
1584  * their mount routine (used by cluster code). File Systems that call this in
1585  * their mount routines must call throttle_info_mount_rel in their unmount
1586  * routines.
1587  */
1588 void
throttle_info_mount_ref(mount_t mp,void * throttle_info)1589 throttle_info_mount_ref(mount_t mp, void *throttle_info)
1590 {
1591 	if ((throttle_info == NULL) || (mp == NULL)) {
1592 		return;
1593 	}
1594 	throttle_info_ref(throttle_info);
1595 
1596 	/*
1597 	 * We already have a reference release it before adding the new one
1598 	 */
1599 	if (mp->mnt_throttle_info) {
1600 		throttle_info_rel(mp->mnt_throttle_info);
1601 	}
1602 	mp->mnt_throttle_info = throttle_info;
1603 }
1604 
1605 /*
1606  * Private KPI routine
1607  *
1608  * return a handle for accessing throttle_info given a throttle_mask.  The
1609  * handle must be released by throttle_info_rel_by_mask
1610  */
1611 int
throttle_info_ref_by_mask(uint64_t throttle_mask,throttle_info_handle_t * throttle_info_handle)1612 throttle_info_ref_by_mask(uint64_t throttle_mask, throttle_info_handle_t *throttle_info_handle)
1613 {
1614 	int     dev_index;
1615 	struct _throttle_io_info_t *info;
1616 
1617 	/*
1618 	 * The 'throttle_mask' is not expected to be 0 otherwise num_trailing_0()
1619 	 * would return value of 64 and this will cause '_throttle_io_info' to
1620 	 * go out of bounds as '_throttle_io_info' is only LOWPRI_MAX_NUM_DEV (64)
1621 	 * elements long.
1622 	 */
1623 	if (throttle_info_handle == NULL || throttle_mask == 0) {
1624 		return EINVAL;
1625 	}
1626 
1627 	dev_index = num_trailing_0(throttle_mask);
1628 	info = &_throttle_io_info[dev_index];
1629 	throttle_info_ref(info);
1630 	*(struct _throttle_io_info_t**)throttle_info_handle = info;
1631 
1632 	return 0;
1633 }
1634 
1635 /*
1636  * Private KPI routine
1637  *
1638  * release the handle obtained by throttle_info_ref_by_mask
1639  */
1640 void
throttle_info_rel_by_mask(throttle_info_handle_t throttle_info_handle)1641 throttle_info_rel_by_mask(throttle_info_handle_t throttle_info_handle)
1642 {
1643 	/*
1644 	 * for now the handle is just a pointer to _throttle_io_info_t
1645 	 */
1646 	throttle_info_rel((struct _throttle_io_info_t*)throttle_info_handle);
1647 }
1648 
1649 /*
1650  * KPI routine
1651  *
1652  * File Systems that throttle_info_mount_ref, must call this routine in their
1653  * umount routine.
1654  */
1655 void
throttle_info_mount_rel(mount_t mp)1656 throttle_info_mount_rel(mount_t mp)
1657 {
1658 	if (mp->mnt_throttle_info) {
1659 		throttle_info_rel(mp->mnt_throttle_info);
1660 	}
1661 	mp->mnt_throttle_info = NULL;
1662 }
1663 
1664 /*
1665  * Reset throttling periods for the given mount point
1666  *
1667  * private interface used by disk conditioner to reset
1668  * throttling periods when 'is_ssd' status changes
1669  */
1670 void
throttle_info_mount_reset_period(mount_t mp,int isssd)1671 throttle_info_mount_reset_period(mount_t mp, int isssd)
1672 {
1673 	struct _throttle_io_info_t *info;
1674 
1675 	if (mp == NULL) {
1676 		info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];
1677 	} else if (mp->mnt_throttle_info == NULL) {
1678 		info = &_throttle_io_info[mp->mnt_devbsdunit];
1679 	} else {
1680 		info = mp->mnt_throttle_info;
1681 	}
1682 
1683 	throttle_init_throttle_period(info, isssd);
1684 }
1685 
1686 void
throttle_info_get_last_io_time(mount_t mp,struct timeval * tv)1687 throttle_info_get_last_io_time(mount_t mp, struct timeval *tv)
1688 {
1689 	struct _throttle_io_info_t *info;
1690 
1691 	if (mp == NULL) {
1692 		info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];
1693 	} else if (mp->mnt_throttle_info == NULL) {
1694 		info = &_throttle_io_info[mp->mnt_devbsdunit];
1695 	} else {
1696 		info = mp->mnt_throttle_info;
1697 	}
1698 
1699 	*tv = info->throttle_last_write_timestamp;
1700 }
1701 
1702 void
update_last_io_time(mount_t mp)1703 update_last_io_time(mount_t mp)
1704 {
1705 	struct _throttle_io_info_t *info;
1706 
1707 	if (mp == NULL) {
1708 		info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];
1709 	} else if (mp->mnt_throttle_info == NULL) {
1710 		info = &_throttle_io_info[mp->mnt_devbsdunit];
1711 	} else {
1712 		info = mp->mnt_throttle_info;
1713 	}
1714 
1715 	microuptime(&info->throttle_last_write_timestamp);
1716 	if (mp != NULL) {
1717 		mp->mnt_last_write_completed_timestamp = info->throttle_last_write_timestamp;
1718 	}
1719 }
1720 
/*
 * Return the current thread's effective I/O policy (tier).
 * When 'ut' is non-NULL, also hand back the current uthread.
 */
int
throttle_get_io_policy(uthread_t *ut)
{
	if (ut != NULL) {
		*ut = current_uthread();
	}

	return proc_get_effective_thread_policy(current_thread(), TASK_POLICY_IO);
}
1730 
/*
 * Return the current thread's effective passive-I/O policy.
 * When 'ut' is non-NULL, also hand back the current uthread.
 */
int
throttle_get_passive_io_policy(uthread_t *ut)
{
	if (ut != NULL) {
		*ut = current_uthread();
	}

	return proc_get_effective_thread_policy(current_thread(), TASK_POLICY_PASSIVE_IO);
}
1740 
1741 
/*
 * Compute the throttle level for 'ut' — or for the current thread when
 * 'ut' is NULL — based on the effective I/O policy.
 */
static int
throttle_get_thread_throttle_level(uthread_t ut)
{
	/*
	 * When the caller passed no uthread, hand &ut to
	 * throttle_get_io_policy() so it fills 'ut' in with the current
	 * uthread; when one was supplied, pass NULL so it stays untouched.
	 */
	uthread_t *ut_p = (ut == NULL) ? &ut : NULL;
	int io_tier = throttle_get_io_policy(ut_p);

	return throttle_get_thread_throttle_level_internal(ut, io_tier);
}
1750 
1751 /*
1752  * Return a throttle level given an existing I/O tier (such as returned by throttle_get_io_policy)
1753  */
1754 static int
throttle_get_thread_throttle_level_internal(uthread_t ut,int io_tier)1755 throttle_get_thread_throttle_level_internal(uthread_t ut, int io_tier)
1756 {
1757 	int thread_throttle_level = io_tier;
1758 	int user_idle_level;
1759 
1760 	assert(ut != NULL);
1761 
1762 	/* Bootcache misses should always be throttled */
1763 	if (ut->uu_throttle_bc) {
1764 		thread_throttle_level = THROTTLE_LEVEL_TIER3;
1765 	}
1766 
1767 	/*
1768 	 * Issue tier3 I/O as tier2 when the user is idle
1769 	 * to allow maintenance tasks to make more progress.
1770 	 *
1771 	 * Assume any positive idle level is enough... for now it's
1772 	 * only ever 0 or 128 but this is not defined anywhere.
1773 	 */
1774 	if (thread_throttle_level >= THROTTLE_LEVEL_TIER3) {
1775 		user_idle_level = timer_get_user_idle_level();
1776 		if (user_idle_level > 0) {
1777 			thread_throttle_level--;
1778 		}
1779 	}
1780 
1781 	return thread_throttle_level;
1782 }
1783 
1784 /*
1785  * I/O will be throttled if either of the following are true:
1786  *   - Higher tiers have in-flight I/O
1787  *   - The time delta since the last start/completion of a higher tier is within the throttle window interval
1788  *
1789  * In-flight I/O is bookended by throttle_info_update_internal/throttle_info_end_io_internal
1790  */
1791 static int
throttle_io_will_be_throttled_internal(void * throttle_info,int * mylevel,int * throttling_level)1792 throttle_io_will_be_throttled_internal(void * throttle_info, int * mylevel, int * throttling_level)
1793 {
1794 	struct _throttle_io_info_t *info = throttle_info;
1795 	struct timeval elapsed;
1796 	struct timeval now;
1797 	uint64_t elapsed_msecs;
1798 	int     thread_throttle_level;
1799 	int     throttle_level;
1800 
1801 	if ((thread_throttle_level = throttle_get_thread_throttle_level(NULL)) < THROTTLE_LEVEL_THROTTLED) {
1802 		return THROTTLE_DISENGAGED;
1803 	}
1804 
1805 	microuptime(&now);
1806 
1807 	for (throttle_level = THROTTLE_LEVEL_START; throttle_level < thread_throttle_level; throttle_level++) {
1808 		if (info->throttle_inflight_count[throttle_level]) {
1809 			break;
1810 		}
1811 		elapsed = now;
1812 		timevalsub(&elapsed, &info->throttle_window_start_timestamp[throttle_level]);
1813 		elapsed_msecs = (uint64_t)elapsed.tv_sec * (uint64_t)1000 + (elapsed.tv_usec / 1000);
1814 
1815 		if (elapsed_msecs < (uint64_t)throttle_windows_msecs[thread_throttle_level]) {
1816 			break;
1817 		}
1818 	}
1819 	if (throttle_level >= thread_throttle_level) {
1820 		/*
1821 		 * we're beyond all of the throttle windows
1822 		 * that affect the throttle level of this thread,
1823 		 * so go ahead and treat as normal I/O
1824 		 */
1825 		return THROTTLE_DISENGAGED;
1826 	}
1827 	if (mylevel) {
1828 		*mylevel = thread_throttle_level;
1829 	}
1830 	if (throttling_level) {
1831 		*throttling_level = throttle_level;
1832 	}
1833 
1834 	if (info->throttle_io_count != info->throttle_io_count_begin) {
1835 		/*
1836 		 * we've already issued at least one throttleable I/O
1837 		 * in the current I/O window, so avoid issuing another one
1838 		 */
1839 		return THROTTLE_NOW;
1840 	}
1841 	/*
1842 	 * we're in the throttle window, so
1843 	 * cut the I/O size back
1844 	 */
1845 	return THROTTLE_ENGAGED;
1846 }
1847 
1848 /*
1849  * If we have a mount point and it has a throttle info pointer then
1850  * use it to do the check, otherwise use the device unit number to find
1851  * the correct throttle info array element.
1852  */
1853 int
throttle_io_will_be_throttled(__unused int lowpri_window_msecs,mount_t mp)1854 throttle_io_will_be_throttled(__unused int lowpri_window_msecs, mount_t mp)
1855 {
1856 	struct _throttle_io_info_t      *info;
1857 
1858 	/*
1859 	 * Should we just return zero if no mount point
1860 	 */
1861 	if (mp == NULL) {
1862 		info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];
1863 	} else if (mp->mnt_throttle_info == NULL) {
1864 		info = &_throttle_io_info[mp->mnt_devbsdunit];
1865 	} else {
1866 		info = mp->mnt_throttle_info;
1867 	}
1868 
1869 	if (info->throttle_is_fusion_with_priority) {
1870 		uthread_t ut = current_uthread();
1871 		if (ut->uu_lowpri_window == 0) {
1872 			return THROTTLE_DISENGAGED;
1873 		}
1874 	}
1875 
1876 	if (info->throttle_disabled) {
1877 		return THROTTLE_DISENGAGED;
1878 	} else {
1879 		return throttle_io_will_be_throttled_internal(info, NULL, NULL);
1880 	}
1881 }
1882 
1883 /*
1884  * Routine to increment I/O throttling counters maintained in the proc
1885  */
1886 
1887 static void
throttle_update_proc_stats(pid_t throttling_pid,int count)1888 throttle_update_proc_stats(pid_t throttling_pid, int count)
1889 {
1890 	proc_t throttling_proc;
1891 	proc_t throttled_proc = current_proc();
1892 
1893 	/* The throttled_proc is always the current proc; so we are not concerned with refs */
1894 	OSAddAtomic64(count, &(throttled_proc->was_throttled));
1895 
1896 	/* The throttling pid might have exited by now */
1897 	throttling_proc = proc_find(throttling_pid);
1898 	if (throttling_proc != PROC_NULL) {
1899 		OSAddAtomic64(count, &(throttling_proc->did_throttle));
1900 		proc_rele(throttling_proc);
1901 	}
1902 }
1903 
1904 /*
1905  * Block until woken up by the throttle timer or by a rethrottle call.
1906  * As long as we hold the throttle_lock while querying the throttle tier, we're
1907  * safe against seeing an old throttle tier after a rethrottle.
1908  */
/*
 * Returns the number of times this thread blocked waiting for the
 * throttle window to clear (0 when no window was open or no wait
 * was required).
 */
uint32_t
throttle_lowpri_io(int sleep_amount)
{
	uthread_t ut;
	struct _throttle_io_info_t *info;
	int     throttle_type = 0;
	int     mylevel = 0;
	int     throttling_level = THROTTLE_LEVEL_NONE;
	int     sleep_cnt = 0;
	uint32_t  throttle_io_period_num = 0;
	boolean_t insert_tail = TRUE;
	boolean_t s;

	ut = current_uthread();

	/* no open throttle window: nothing to do */
	if (ut->uu_lowpri_window == 0) {
		return 0;
	}

	info = ut->uu_throttle_info;

	if (info == NULL) {
		/* window open but no throttle info: just clear the window state */
		ut->uu_throttle_bc = false;
		ut->uu_lowpri_window = 0;
		return 0;
	}
	lck_mtx_lock(&info->throttle_lock);
	assert(ut->uu_on_throttlelist < THROTTLE_LEVEL_THROTTLED);

	if (sleep_amount == 0) {
		goto done;
	}

	/*
	 * a sleep_amount of 1 is only honored when the window was opened by
	 * the BootCache (uu_throttle_bc); otherwise it degrades to 0
	 * -- NOTE(review): inferred from this check, confirm against callers
	 */
	if (sleep_amount == 1 && !ut->uu_throttle_bc) {
		sleep_amount = 0;
	}

	/* remember the I/O period we started in so we can bound the wait below */
	throttle_io_period_num = info->throttle_io_period_num;

	ut->uu_was_rethrottled = false;

	while ((throttle_type = throttle_io_will_be_throttled_internal(info, &mylevel, &throttling_level))) {
		if (throttle_type == THROTTLE_ENGAGED) {
			if (sleep_amount == 0) {
				break;
			}
			if (info->throttle_io_period_num < throttle_io_period_num) {
				break;
			}
			/* stop once sleep_amount I/O periods have elapsed since entry */
			if ((info->throttle_io_period_num - throttle_io_period_num) >= (uint32_t)sleep_amount) {
				break;
			}
		}
		/*
		 * keep the same position in the list if "rethrottle_thread" changes our throttle level  and
		 * then puts us back to the original level before we get a chance to run
		 */
		if (ut->uu_on_throttlelist >= THROTTLE_LEVEL_THROTTLED && ut->uu_on_throttlelist != mylevel) {
			/*
			 * must have been awakened via "rethrottle_thread" (the timer pulls us off the list)
			 * and we've changed our throttling level, so pull ourselves off of the appropriate list
			 * and make sure we get put on the tail of the new list since we're starting anew w/r to
			 * the throttling engine
			 */
			TAILQ_REMOVE(&info->throttle_uthlist[ut->uu_on_throttlelist], ut, uu_throttlelist);
			ut->uu_on_throttlelist = THROTTLE_LEVEL_NONE;
			insert_tail = TRUE;
		}
		if (ut->uu_on_throttlelist < THROTTLE_LEVEL_THROTTLED) {
			if (throttle_add_to_list(info, ut, mylevel, insert_tail) == THROTTLE_LEVEL_END) {
				goto done;
			}
		}
		assert(throttling_level >= THROTTLE_LEVEL_START && throttling_level <= THROTTLE_LEVEL_END);

		/* disable interrupts before taking the rethrottle spinlock */
		s = ml_set_interrupts_enabled(FALSE);
		lck_spin_lock(&ut->uu_rethrottle_lock);

		/*
		 * this is the critical section w/r to our interaction
		 * with "rethrottle_thread"
		 */
		if (ut->uu_was_rethrottled) {
			lck_spin_unlock(&ut->uu_rethrottle_lock);
			ml_set_interrupts_enabled(s);
			lck_mtx_yield(&info->throttle_lock);

			KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_FSRW, 103)),
			    uthread_tid(ut), ut->uu_on_throttlelist, 0, 0, 0);

			/* re-evaluate the throttle level from the top of the loop */
			ut->uu_was_rethrottled = false;
			continue;
		}
		KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_THROTTLE, PROCESS_THROTTLED)) | DBG_FUNC_NONE,
		    info->throttle_last_IO_pid[throttling_level], throttling_level, proc_selfpid(), mylevel, 0);

		if (sleep_cnt == 0) {
			KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_START,
			    throttle_windows_msecs[mylevel], info->throttle_io_periods[mylevel], info->throttle_io_count, 0, 0);
			throttled_count[mylevel]++;
		}
		ut->uu_wmesg = "throttle_lowpri_io";

		assert_wait((caddr_t)&ut->uu_on_throttlelist, THREAD_UNINT);

		/* mark throttled while still holding the spinlock, then drop locks and block */
		ut->uu_is_throttled = true;
		lck_spin_unlock(&ut->uu_rethrottle_lock);
		ml_set_interrupts_enabled(s);

		lck_mtx_unlock(&info->throttle_lock);

		thread_block(THREAD_CONTINUE_NULL);

		ut->uu_wmesg = NULL;

		ut->uu_is_throttled = false;
		ut->uu_was_rethrottled = false;

		lck_mtx_lock(&info->throttle_lock);

		sleep_cnt++;

		if (sleep_amount == 0) {
			insert_tail = FALSE;
		} else if (info->throttle_io_period_num < throttle_io_period_num ||
		    (info->throttle_io_period_num - throttle_io_period_num) >= (uint32_t)sleep_amount) {
			/* waited long enough; any further passes go to the list head */
			insert_tail = FALSE;
			sleep_amount = 0;
		}
	}
done:
	if (ut->uu_on_throttlelist >= THROTTLE_LEVEL_THROTTLED) {
		TAILQ_REMOVE(&info->throttle_uthlist[ut->uu_on_throttlelist], ut, uu_throttlelist);
		ut->uu_on_throttlelist = THROTTLE_LEVEL_NONE;
	}
	lck_mtx_unlock(&info->throttle_lock);

	if (sleep_cnt) {
		KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_END,
		    throttle_windows_msecs[mylevel], info->throttle_io_periods[mylevel], info->throttle_io_count, 0, 0);
		/*
		 * We update the stats for the last pid which opened a throttle window for the throttled thread.
		 * This might not be completely accurate since the multiple throttles seen by the lower tier pid
		 * might have been caused by various higher prio pids. However, updating these stats accurately
		 * means doing a proc_find while holding the throttle lock which leads to deadlock.
		 */
		throttle_update_proc_stats(info->throttle_last_IO_pid[throttling_level], sleep_cnt);
	}

	/* close the window and release the ref taken in throttle_info_set_initial_window */
	ut->uu_throttle_info = NULL;
	ut->uu_throttle_bc = false;
	ut->uu_lowpri_window = 0;

	throttle_info_rel(info);

	return sleep_cnt;
}
2066 
2067 /*
 *  Returns TRUE if throttle_lowpri_io() called with the same sleep_amount would have slept.
 *  This function mimics most of the throttle_lowpri_io() checks, but without actually sleeping.
2070  */
2071 int
throttle_lowpri_io_will_be_throttled(int sleep_amount)2072 throttle_lowpri_io_will_be_throttled(int sleep_amount)
2073 {
2074 	if (sleep_amount == 0) {
2075 		return FALSE;
2076 	}
2077 
2078 	uthread_t ut = current_uthread();
2079 	if (ut->uu_lowpri_window == 0) {
2080 		return FALSE;
2081 	}
2082 
2083 	struct _throttle_io_info_t *info = ut->uu_throttle_info;
2084 	if (info == NULL) {
2085 		return FALSE;
2086 	}
2087 
2088 	lck_mtx_lock(&info->throttle_lock);
2089 	assert(ut->uu_on_throttlelist < THROTTLE_LEVEL_THROTTLED);
2090 
2091 	if (sleep_amount == 1 && !ut->uu_throttle_bc) {
2092 		sleep_amount = 0;
2093 	}
2094 
2095 	int result = FALSE;
2096 
2097 	int throttle_type = throttle_io_will_be_throttled_internal(info, NULL, NULL);
2098 	if (throttle_type > THROTTLE_DISENGAGED) {
2099 		result = TRUE;
2100 		if ((throttle_type == THROTTLE_ENGAGED) && (sleep_amount == 0)) {
2101 			result = FALSE;
2102 		}
2103 	}
2104 
2105 	lck_mtx_unlock(&info->throttle_lock);
2106 
2107 	return result;
2108 }
2109 
2110 
2111 /*
2112  * KPI routine
2113  *
2114  * set a kernel thread's IO policy.  policy can be:
2115  * IOPOL_NORMAL, IOPOL_THROTTLE, IOPOL_PASSIVE, IOPOL_UTILITY, IOPOL_STANDARD
2116  *
2117  * explanations about these policies are in the man page of setiopolicy_np
2118  */
void
throttle_set_thread_io_policy(int policy)
{
	/* apply the given IOPOL_* value to the calling thread via the task policy engine */
	proc_set_thread_policy(current_thread(), TASK_POLICY_INTERNAL, TASK_POLICY_IOPOL, policy);
}
2124 
2125 int
throttle_get_thread_effective_io_policy()2126 throttle_get_thread_effective_io_policy()
2127 {
2128 	return proc_get_effective_thread_policy(current_thread(), TASK_POLICY_IO);
2129 }
2130 
int
throttle_thread_io_tier_above_metadata(void)
{
	/*
	 * non-zero when the thread's effective I/O tier is numerically lower
	 * (i.e. higher priority) than the metadata tier
	 */
	return throttle_get_thread_effective_io_policy() < IOSCHED_METADATA_TIER;
}
2136 
2137 void
throttle_info_reset_window(uthread_t ut)2138 throttle_info_reset_window(uthread_t ut)
2139 {
2140 	struct _throttle_io_info_t *info;
2141 
2142 	if (ut == NULL) {
2143 		ut = current_uthread();
2144 	}
2145 
2146 	if ((info = ut->uu_throttle_info)) {
2147 		throttle_info_rel(info);
2148 
2149 		ut->uu_throttle_info = NULL;
2150 		ut->uu_lowpri_window = 0;
2151 		ut->uu_throttle_bc = false;
2152 	}
2153 }
2154 
2155 static
2156 void
throttle_info_set_initial_window(uthread_t ut,struct _throttle_io_info_t * info,boolean_t BC_throttle,boolean_t isssd)2157 throttle_info_set_initial_window(uthread_t ut, struct _throttle_io_info_t *info, boolean_t BC_throttle, boolean_t isssd)
2158 {
2159 	if (lowpri_throttle_enabled == 0 || info->throttle_disabled) {
2160 		return;
2161 	}
2162 
2163 	if (info->throttle_io_periods == 0) {
2164 		throttle_init_throttle_period(info, isssd);
2165 	}
2166 	if (ut->uu_throttle_info == NULL) {
2167 		ut->uu_throttle_info = info;
2168 		throttle_info_ref(info);
2169 		DEBUG_ALLOC_THROTTLE_INFO("updating info = %p\n", info, info );
2170 
2171 		ut->uu_lowpri_window = 1;
2172 		ut->uu_throttle_bc = BC_throttle;
2173 	}
2174 }
2175 
2176 /*
2177  * Update inflight IO count and throttling window
2178  * Should be called when an IO is done
2179  *
2180  * Only affects IO that was sent through spec_strategy
2181  */
2182 void
throttle_info_end_io(buf_t bp)2183 throttle_info_end_io(buf_t bp)
2184 {
2185 	mount_t mp;
2186 	struct bufattr *bap;
2187 	struct _throttle_io_info_t *info;
2188 	int io_tier;
2189 
2190 	bap = &bp->b_attr;
2191 	if (!ISSET(bap->ba_flags, BA_STRATEGY_TRACKED_IO)) {
2192 		return;
2193 	}
2194 	CLR(bap->ba_flags, BA_STRATEGY_TRACKED_IO);
2195 
2196 	mp = buf_vnode(bp)->v_mount;
2197 	if (mp != NULL) {
2198 		info = &_throttle_io_info[mp->mnt_devbsdunit];
2199 	} else {
2200 		info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];
2201 	}
2202 
2203 	io_tier = GET_BUFATTR_IO_TIER(bap);
2204 	if (ISSET(bap->ba_flags, BA_IO_TIER_UPGRADE)) {
2205 		io_tier--;
2206 	}
2207 
2208 	throttle_info_end_io_internal(info, io_tier);
2209 }
2210 
2211 /*
2212  * Decrement inflight count initially incremented by throttle_info_update_internal
2213  */
static
void
throttle_info_end_io_internal(struct _throttle_io_info_t *info, int throttle_level)
{
	/* untracked I/O: nothing was counted at issue time */
	if (throttle_level == THROTTLE_LEVEL_NONE) {
		return;
	}

	/* completion restarts the throttle window for this tier */
	microuptime(&info->throttle_window_start_timestamp[throttle_level]);
	/* balances the OSIncrementAtomic in throttle_info_update_internal */
	OSDecrementAtomic(&info->throttle_inflight_count[throttle_level]);
	assert(info->throttle_inflight_count[throttle_level] >= 0);
}
2226 
2227 /*
2228  * If inflight is TRUE and bap is NULL then the caller is responsible for calling
2229  * throttle_info_end_io_internal to avoid leaking in-flight I/O.
2230  */
/*
 * Record an I/O against "info" at the appropriate throttle tier and,
 * for throttled tiers, open a throttle window on the issuing uthread.
 * Returns the tier the I/O was accounted at (THROTTLE_LEVEL_NONE when
 * throttling is disabled).
 */
static
int
throttle_info_update_internal(struct _throttle_io_info_t *info, uthread_t ut, int flags, boolean_t isssd, boolean_t inflight, struct bufattr *bap)
{
	int     thread_throttle_level;

	if (lowpri_throttle_enabled == 0 || info->throttle_disabled) {
		return THROTTLE_LEVEL_NONE;
	}

	if (ut == NULL) {
		ut = current_uthread();
	}

	/*
	 * For tracked (inflight) I/O with buffer attributes, account at the
	 * buffer's issued tier (backing out any tier upgrade); otherwise use
	 * the thread's current throttle level.
	 */
	if (bap && inflight && !ut->uu_throttle_bc) {
		thread_throttle_level = GET_BUFATTR_IO_TIER(bap);
		if (ISSET(bap->ba_flags, BA_IO_TIER_UPGRADE)) {
			thread_throttle_level--;
		}
	} else {
		thread_throttle_level = throttle_get_thread_throttle_level(ut);
	}

	if (thread_throttle_level != THROTTLE_LEVEL_NONE) {
		if (!ISSET(flags, B_PASSIVE)) {
			info->throttle_last_IO_pid[thread_throttle_level] = proc_selfpid();
			if (inflight && !ut->uu_throttle_bc) {
				/*
				 * tag the buffer so throttle_info_end_io can find and
				 * decrement this inflight count at completion
				 */
				if (NULL != bap) {
					SET(bap->ba_flags, BA_STRATEGY_TRACKED_IO);
				}
				OSIncrementAtomic(&info->throttle_inflight_count[thread_throttle_level]);
			} else {
				/* untracked I/O: just restart this tier's throttle window */
				microuptime(&info->throttle_window_start_timestamp[thread_throttle_level]);
			}
			KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_THROTTLE, OPEN_THROTTLE_WINDOW)) | DBG_FUNC_NONE,
			    proc_getpid(current_proc()), thread_throttle_level, 0, 0, 0);
		}
		microuptime(&info->throttle_last_IO_timestamp[thread_throttle_level]);
	}


	if (thread_throttle_level >= THROTTLE_LEVEL_THROTTLED) {
		/*
		 * I'd really like to do the IOSleep here, but
		 * we may be holding all kinds of filesystem related locks
		 * and the pages for this I/O marked 'busy'...
		 * we don't want to cause a normal task to block on
		 * one of these locks while we're throttling a task marked
		 * for low priority I/O... we'll mark the uthread and
		 * do the delay just before we return from the system
		 * call that triggered this I/O or from vnode_pagein
		 */
		OSAddAtomic(1, &info->throttle_io_count);

		throttle_info_set_initial_window(ut, info, FALSE, isssd);
	}

	return thread_throttle_level;
}
2290 
2291 void *
throttle_info_update_by_mount(mount_t mp)2292 throttle_info_update_by_mount(mount_t mp)
2293 {
2294 	struct _throttle_io_info_t *info;
2295 	uthread_t ut;
2296 	boolean_t isssd = FALSE;
2297 
2298 	ut = current_uthread();
2299 
2300 	if (mp != NULL) {
2301 		if (disk_conditioner_mount_is_ssd(mp)) {
2302 			isssd = TRUE;
2303 		}
2304 		info = &_throttle_io_info[mp->mnt_devbsdunit];
2305 	} else {
2306 		info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];
2307 	}
2308 
2309 	if (!ut->uu_lowpri_window) {
2310 		throttle_info_set_initial_window(ut, info, FALSE, isssd);
2311 	}
2312 
2313 	return info;
2314 }
2315 
2316 
2317 /*
2318  * KPI routine
2319  *
2320  * this is usually called before every I/O, used for throttled I/O
2321  * book keeping.  This routine has low overhead and does not sleep
2322  */
2323 void
throttle_info_update(void * throttle_info,int flags)2324 throttle_info_update(void *throttle_info, int flags)
2325 {
2326 	if (throttle_info) {
2327 		throttle_info_update_internal(throttle_info, NULL, flags, FALSE, FALSE, NULL);
2328 	}
2329 }
2330 
2331 /*
2332  * KPI routine
2333  *
2334  * this is usually called before every I/O, used for throttled I/O
2335  * book keeping.  This routine has low overhead and does not sleep
2336  */
void
throttle_info_update_by_mask(void *throttle_info_handle, int flags)
{
	/*
	 * for now only the lowest bit of the throttle mask is used, so the
	 * handle and the throttle_info are one and the same.  Should the
	 * handle ever carry a set of throttle infos, this will need to
	 * iterate over them, calling throttle_info_update on each.
	 */
	throttle_info_update(throttle_info_handle, flags);
}
2350 /*
2351  * KPI routine
2352  *
2353  * This routine marks the throttle info as disabled. Used for mount points which
2354  * support I/O scheduling.
2355  */
2356 
2357 void
throttle_info_disable_throttle(int devno,boolean_t isfusion)2358 throttle_info_disable_throttle(int devno, boolean_t isfusion)
2359 {
2360 	struct _throttle_io_info_t *info;
2361 
2362 	if (devno < 0 || devno >= LOWPRI_MAX_NUM_DEV) {
2363 		panic("Illegal devno (%d) passed into throttle_info_disable_throttle()", devno);
2364 	}
2365 
2366 	info = &_throttle_io_info[devno];
2367 	// don't disable software throttling on devices that are part of a fusion device
2368 	// and override the software throttle periods to use HDD periods
2369 	if (isfusion) {
2370 		info->throttle_is_fusion_with_priority = isfusion;
2371 		throttle_init_throttle_period(info, FALSE);
2372 	}
2373 	info->throttle_disabled = !info->throttle_is_fusion_with_priority;
2374 	return;
2375 }
2376 
2377 
2378 /*
2379  * KPI routine (private)
2380  * Called to determine if this IO is being throttled to this level so that it can be treated specially
2381  */
2382 int
throttle_info_io_will_be_throttled(void * throttle_info,int policy)2383 throttle_info_io_will_be_throttled(void * throttle_info, int policy)
2384 {
2385 	struct _throttle_io_info_t *info = throttle_info;
2386 	struct timeval elapsed;
2387 	uint64_t elapsed_msecs;
2388 	int     throttle_level;
2389 	int     thread_throttle_level;
2390 
2391 	switch (policy) {
2392 	case IOPOL_THROTTLE:
2393 		thread_throttle_level = THROTTLE_LEVEL_TIER3;
2394 		break;
2395 	case IOPOL_UTILITY:
2396 		thread_throttle_level = THROTTLE_LEVEL_TIER2;
2397 		break;
2398 	case IOPOL_STANDARD:
2399 		thread_throttle_level = THROTTLE_LEVEL_TIER1;
2400 		break;
2401 	default:
2402 		thread_throttle_level = THROTTLE_LEVEL_TIER0;
2403 		break;
2404 	}
2405 	for (throttle_level = THROTTLE_LEVEL_START; throttle_level < thread_throttle_level; throttle_level++) {
2406 		if (info->throttle_inflight_count[throttle_level]) {
2407 			break;
2408 		}
2409 
2410 		microuptime(&elapsed);
2411 		timevalsub(&elapsed, &info->throttle_window_start_timestamp[throttle_level]);
2412 		elapsed_msecs = (uint64_t)elapsed.tv_sec * (uint64_t)1000 + (elapsed.tv_usec / 1000);
2413 
2414 		if (elapsed_msecs < (uint64_t)throttle_windows_msecs[thread_throttle_level]) {
2415 			break;
2416 		}
2417 	}
2418 	if (throttle_level >= thread_throttle_level) {
2419 		/*
2420 		 * we're beyond all of the throttle windows
2421 		 * so go ahead and treat as normal I/O
2422 		 */
2423 		return THROTTLE_DISENGAGED;
2424 	}
2425 	/*
2426 	 * we're in the throttle window
2427 	 */
2428 	return THROTTLE_ENGAGED;
2429 }
2430 
int
throttle_lowpri_window(void)
{
	/* non-zero when the calling uthread has an open throttle window */
	return current_uthread()->uu_lowpri_window;
}
2436 
2437 
2438 #if CONFIG_IOSCHED
2439 int upl_get_cached_tier(void *);
2440 #endif
2441 
2442 #if CONFIG_PHYS_WRITE_ACCT
2443 extern thread_t pm_sync_thread;
2444 #endif /* CONFIG_PHYS_WRITE_ACCT */
2445 
/*
 * VNOP_STRATEGY entry point for special devices: computes the I/O tier and
 * passive state for the buffer, records kdebug/throttle/pending-I/O
 * accounting, hands the buffer to the block device's d_strategy routine,
 * and finally applies any BootCache hint returned by that routine.
 * Always returns 0.
 */
int
spec_strategy(struct vnop_strategy_args *ap)
{
	buf_t   bp;
	int     bflags;
	int     io_tier;
	int     passive;
	dev_t   bdev;
	uthread_t ut;
	mount_t mp;
	struct  bufattr *bap;
	int     strategy_ret;
	struct _throttle_io_info_t *throttle_info;
	boolean_t isssd = FALSE;
	boolean_t inflight = FALSE;
	boolean_t upgrade = FALSE;
	int code = 0;

#if CONFIG_DELAY_IDLE_SLEEP
	proc_t curproc = current_proc();
#endif /* CONFIG_DELAY_IDLE_SLEEP */

	bp = ap->a_bp;
	bdev = buf_device(bp);
	mp = buf_vnode(bp)->v_mount;
	bap = &bp->b_attr;

#if CONFIG_PHYS_WRITE_ACCT
	/* account I/O issued by the power-management sync thread */
	if (current_thread() == pm_sync_thread) {
		OSAddAtomic64(buf_count(bp), (SInt64 *)&(kernel_pm_writes));
	}
#endif /* CONFIG_PHYS_WRITE_ACCT */

#if CONFIG_IOSCHED
	/* cluster I/O may carry a tier cached on the UPL; fall back to the thread policy */
	if (bp->b_flags & B_CLUSTER) {
		io_tier = upl_get_cached_tier(bp->b_upl);

		if (io_tier == -1) {
			io_tier = throttle_get_io_policy(&ut);
		}
#if DEVELOPMENT || DEBUG
		else {
			int my_io_tier = throttle_get_io_policy(&ut);

			if (io_tier != my_io_tier) {
				KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_THROTTLE, IO_TIER_UPL_MISMATCH)) | DBG_FUNC_NONE, buf_kernel_addrperm_addr(bp), my_io_tier, io_tier, 0, 0);
			}
		}
#endif
	} else {
		io_tier = throttle_get_io_policy(&ut);
	}
#else
	io_tier = throttle_get_io_policy(&ut);
#endif
	passive = throttle_get_passive_io_policy(&ut);

	/*
	 * Mark if the I/O was upgraded by throttle_get_thread_throttle_level
	 * while preserving the original issued tier (throttle_get_io_policy
	 * does not return upgraded tiers)
	 */
	if (mp && io_tier > throttle_get_thread_throttle_level_internal(ut, io_tier)) {
#if CONFIG_IOSCHED
		if (!(mp->mnt_ioflags & MNT_IOFLAGS_IOSCHED_SUPPORTED)) {
			upgrade = TRUE;
		}
#else /* CONFIG_IOSCHED */
		upgrade = TRUE;
#endif /* CONFIG_IOSCHED */
	}

	if (bp->b_flags & B_META) {
		bap->ba_flags |= BA_META;
	}

#if CONFIG_IOSCHED
	/*
	 * For metadata reads, ceil the I/O tier to IOSCHED_METADATA_EXPEDITED_TIER if they are expedited, otherwise
	 * ceil it to IOSCHED_METADATA_TIER. Mark them passive if the I/O tier was upgraded.
	 * For metadata writes, set the I/O tier to IOSCHED_METADATA_EXPEDITED_TIER if they are expedited. Otherwise
	 * set it to IOSCHED_METADATA_TIER. In addition, mark them as passive.
	 */
	if (bap->ba_flags & BA_META) {
		if ((mp && (mp->mnt_ioflags & MNT_IOFLAGS_IOSCHED_SUPPORTED)) || (bap->ba_flags & BA_IO_SCHEDULED)) {
			if (bp->b_flags & B_READ) {
				if ((bap->ba_flags & BA_EXPEDITED_META_IO) && (io_tier > IOSCHED_METADATA_EXPEDITED_TIER)) {
					io_tier = IOSCHED_METADATA_EXPEDITED_TIER;
					passive = 1;
				} else if (io_tier > IOSCHED_METADATA_TIER) {
					io_tier = IOSCHED_METADATA_TIER;
					passive = 1;
				}
			} else {
				if (bap->ba_flags & BA_EXPEDITED_META_IO) {
					io_tier = IOSCHED_METADATA_EXPEDITED_TIER;
				} else {
					io_tier = IOSCHED_METADATA_TIER;
				}
				passive = 1;
			}
		}
	}
#endif /* CONFIG_IOSCHED */

	SET_BUFATTR_IO_TIER(bap, io_tier);

	if (passive) {
		bp->b_flags |= B_PASSIVE;
		bap->ba_flags |= BA_PASSIVE;
	}

#if CONFIG_DELAY_IDLE_SLEEP
	if ((curproc != NULL) && ((curproc->p_flag & P_DELAYIDLESLEEP) == P_DELAYIDLESLEEP)) {
		bap->ba_flags |= BA_DELAYIDLESLEEP;
	}
#endif /* CONFIG_DELAY_IDLE_SLEEP */

	bflags = bp->b_flags;

	/* synchronous writes are marked for quick completion */
	if (((bflags & B_READ) == 0) && ((bflags & B_ASYNC) == 0)) {
		bufattr_markquickcomplete(bap);
	}

	/* build the DKIO_* kdebug code describing this I/O */
	if (bflags & B_READ) {
		code |= DKIO_READ;
	}
	if (bflags & B_ASYNC) {
		code |= DKIO_ASYNC;
	}

	if (bap->ba_flags & BA_META) {
		code |= DKIO_META;
	} else if (bflags & B_PAGEIO) {
		code |= DKIO_PAGING;
	}

	if (io_tier != 0) {
		code |= DKIO_THROTTLE;
	}

	code |= ((io_tier << DKIO_TIER_SHIFT) & DKIO_TIER_MASK);

	if (bflags & B_PASSIVE) {
		code |= DKIO_PASSIVE;
	}

	if (bap->ba_flags & BA_NOCACHE) {
		code |= DKIO_NOCACHE;
	}

	if (upgrade) {
		code |= DKIO_TIER_UPGRADE;
		SET(bap->ba_flags, BA_IO_TIER_UPGRADE);
	}

	if (kdebug_enable) {
		KERNEL_DEBUG_CONSTANT_IST(KDEBUG_COMMON, FSDBG_CODE(DBG_DKRW, code) | DBG_FUNC_NONE,
		    buf_kernel_addrperm_addr(bp), bdev, buf_blkno(bp), buf_count(bp), 0);
	}

#if CONFIG_IO_COMPRESSION_STATS
	// Do not run IO Compression Stats when a privilege thread is active
	if (!is_vm_privileged() && !is_external_pageout_thread()) {
		io_compression_stats(bp);
	}
#endif /* CONFIG_IO_COMPRESSION_STATS */
	thread_update_io_stats(current_thread(), buf_count(bp), code);

	if (mp != NULL) {
		if (disk_conditioner_mount_is_ssd(mp)) {
			isssd = TRUE;
		}
		/*
		 * Partially initialized mounts don't have a final devbsdunit and should not be tracked.
		 * Verify that devbsdunit is initialized (non-zero) or that 0 is the correct initialized value
		 * (mnt_throttle_mask is initialized and num_trailing_0 would be 0)
		 */
		if (mp->mnt_devbsdunit || (mp->mnt_throttle_mask != LOWPRI_MAX_NUM_DEV - 1 && mp->mnt_throttle_mask & 0x1)) {
			inflight = TRUE;
		}
		throttle_info = &_throttle_io_info[mp->mnt_devbsdunit];
	} else {
		throttle_info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];
	}

	throttle_info_update_internal(throttle_info, ut, bflags, isssd, inflight, bap);

	if ((bflags & B_READ) == 0) {
		microuptime(&throttle_info->throttle_last_write_timestamp);

		if (mp) {
			mp->mnt_last_write_issued_timestamp = throttle_info->throttle_last_write_timestamp;
			INCR_PENDING_IO(buf_count(bp), mp->mnt_pending_write_size);
		}
	} else if (mp) {
		INCR_PENDING_IO(buf_count(bp), mp->mnt_pending_read_size);
	}
	/*
	 * The BootCache may give us special information about
	 * the IO, so it returns special values that we check
	 * for here.
	 *
	 * IO_SATISFIED_BY_CACHE
	 * The read has been satisfied by the boot cache. Don't
	 * throttle the thread unnecessarily.
	 *
	 * IO_SHOULD_BE_THROTTLED
	 * The boot cache is playing back a playlist and this IO
	 * cut through. Throttle it so we're not cutting through
	 * the boot cache too often.
	 *
	 * Note that typical strategy routines are defined with
	 * a void return so we'll get garbage here. In the
	 * unlikely case the garbage matches our special return
	 * value, it's not a big deal since we're only adjusting
	 * the throttling delay.
	 */
#define IO_SATISFIED_BY_CACHE  ((int)0xcafefeed)
#define IO_SHOULD_BE_THROTTLED ((int)0xcafebeef)
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wcast-function-type"

	typedef int strategy_fcn_ret_t(struct buf *bp);

	strategy_ret = (*(strategy_fcn_ret_t*)bdevsw[major(bdev)].d_strategy)(bp);

#pragma clang diagnostic pop

	// disk conditioner needs to track when this I/O actually starts
	// which means track it after `strategy` which may include delays
	// from inflight I/Os
	microuptime(&bp->b_timestamp_tv);

	if (IO_SATISFIED_BY_CACHE == strategy_ret) {
		/*
		 * If this was a throttled IO satisfied by the boot cache,
		 * don't delay the thread.
		 */
		throttle_info_reset_window(ut);
	} else if (IO_SHOULD_BE_THROTTLED == strategy_ret) {
		/*
		 * If the boot cache indicates this IO should be throttled,
		 * delay the thread.
		 */
		throttle_info_set_initial_window(ut, throttle_info, TRUE, isssd);
	}
	return 0;
}
2695 
2696 
2697 /*
 * Block mapping is not supported for special files; this always fails with ENOTSUP.
2699  */
int
spec_blockmap(__unused struct vnop_blockmap_args *ap)
{
	/* block mapping has no meaning for special files */
	return ENOTSUP;
}
2705 
2706 
2707 /*
2708  * Device close routine
2709  */
/*
 * VNOP_CLOSE for special files: drops the vnode's device open count and,
 * on the last reference (or an explicit revoke for character devices),
 * invokes the driver's d_close entry point.  Block devices additionally
 * get flushed and their cached blocks invalidated before the final close.
 */
int
spec_close(struct vnop_close_args *ap)
{
	struct vnode *vp = ap->a_vp;
	dev_t dev = vp->v_rdev;
	int error = 0;
	int flags = ap->a_fflag;
	struct proc *p = vfs_context_proc(ap->a_context);
	struct session *sessp;
	struct pgrp *pg;

	switch (vp->v_type) {
	case VCHR:
		/*
		 * Hack: a tty device that is a controlling terminal
		 * has a reference from the session structure.
		 * We cannot easily tell that a character device is
		 * a controlling terminal, unless it is the closing
		 * process' controlling terminal.  In that case,
		 * if the reference count is 1 (this is the very
		 * last close)
		 */
		pg = proc_pgrp(p, &sessp);
		devsw_lock(dev, S_IFCHR);
		if (sessp != SESSION_NULL) {
			if (vp == sessp->s_ttyvp && vcount(vp) == 1) {
				struct tty *tp = TTY_NULL;

				/* drop the devsw lock while manipulating session state */
				devsw_unlock(dev, S_IFCHR);
				session_lock(sessp);
				/* recheck under the session lock: s_ttyvp may have changed */
				if (vp == sessp->s_ttyvp) {
					tp = session_clear_tty_locked(sessp);
				}
				session_unlock(sessp);

				if (tp != TTY_NULL) {
					ttyfree(tp);
				}
				devsw_lock(dev, S_IFCHR);
			}
		}
		pgrp_rele(pg);

		if (--vp->v_specinfo->si_opencount < 0) {
			panic("negative open count (c, %u, %u)", major(dev), minor(dev));
		}

		/*
		 * close on last reference or on vnode revoke call
		 */
		if (vcount(vp) == 0 || (flags & IO_REVOKE) != 0) {
			error = cdevsw[major(dev)].d_close(dev, flags, S_IFCHR, p);
		}

		devsw_unlock(dev, S_IFCHR);
		break;

	case VBLK:
		/*
		 * If there is more than one outstanding open, don't
		 * send the close to the device.
		 */
		devsw_lock(dev, S_IFBLK);
		if (vcount(vp) > 1) {
			vp->v_specinfo->si_opencount--;
			devsw_unlock(dev, S_IFBLK);
			return 0;
		}
		devsw_unlock(dev, S_IFBLK);

		/*
		 * On last close of a block device (that isn't mounted)
		 * we must invalidate any in core blocks, so that
		 * we can, for instance, change floppy disks.
		 */
		if ((error = spec_fsync_internal(vp, MNT_WAIT, ap->a_context))) {
			return error;
		}

		error = buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0);
		if (error) {
			return error;
		}

		devsw_lock(dev, S_IFBLK);

		if (--vp->v_specinfo->si_opencount < 0) {
			panic("negative open count (b, %u, %u)", major(dev), minor(dev));
		}

		if (vcount(vp) == 0) {
			error = bdevsw[major(dev)].d_close(dev, flags, S_IFBLK, p);
		}

		devsw_unlock(dev, S_IFBLK);
		break;

	default:
		panic("spec_close: not special");
		return EBADF;
	}

	return error;
}
2814 
2815 /*
2816  * Return POSIX pathconf information applicable to special devices.
2817  */
2818 int
spec_pathconf(struct vnop_pathconf_args * ap)2819 spec_pathconf(struct vnop_pathconf_args *ap)
2820 {
2821 	switch (ap->a_name) {
2822 	case _PC_LINK_MAX:
2823 		*ap->a_retval = LINK_MAX;
2824 		return 0;
2825 	case _PC_MAX_CANON:
2826 		*ap->a_retval = MAX_CANON;
2827 		return 0;
2828 	case _PC_MAX_INPUT:
2829 		*ap->a_retval = MAX_INPUT;
2830 		return 0;
2831 	case _PC_PIPE_BUF:
2832 		*ap->a_retval = PIPE_BUF;
2833 		return 0;
2834 	case _PC_CHOWN_RESTRICTED:
2835 		*ap->a_retval = 200112;         /* _POSIX_CHOWN_RESTRICTED */
2836 		return 0;
2837 	case _PC_VDISABLE:
2838 		*ap->a_retval = _POSIX_VDISABLE;
2839 		return 0;
2840 	default:
2841 		return EINVAL;
2842 	}
2843 	/* NOTREACHED */
2844 }
2845 
/*
 * Special device failed operation: catch-all entry that always
 * fails with EBADF, regardless of its (ignored) argument.
 */
int
spec_ebadf(void *dummy)
{
	(void)dummy;    /* parameter intentionally unused */
	return EBADF;
}
2854 
2855 /* Blktooff derives file offset from logical block number */
2856 int
spec_blktooff(struct vnop_blktooff_args * ap)2857 spec_blktooff(struct vnop_blktooff_args *ap)
2858 {
2859 	struct vnode *vp = ap->a_vp;
2860 
2861 	switch (vp->v_type) {
2862 	case VCHR:
2863 		*ap->a_offset = (off_t)-1; /* failure */
2864 		return ENOTSUP;
2865 
2866 	case VBLK:
2867 		printf("spec_blktooff: not implemented for VBLK\n");
2868 		*ap->a_offset = (off_t)-1; /* failure */
2869 		return ENOTSUP;
2870 
2871 	default:
2872 		panic("spec_blktooff type");
2873 	}
2874 	/* NOTREACHED */
2875 
2876 	return 0;
2877 }
2878 
2879 /* Offtoblk derives logical block number from file offset */
2880 int
spec_offtoblk(struct vnop_offtoblk_args * ap)2881 spec_offtoblk(struct vnop_offtoblk_args *ap)
2882 {
2883 	struct vnode *vp = ap->a_vp;
2884 
2885 	switch (vp->v_type) {
2886 	case VCHR:
2887 		*ap->a_lblkno = (daddr64_t)-1; /* failure */
2888 		return ENOTSUP;
2889 
2890 	case VBLK:
2891 		printf("spec_offtoblk: not implemented for VBLK\n");
2892 		*ap->a_lblkno = (daddr64_t)-1; /* failure */
2893 		return ENOTSUP;
2894 
2895 	default:
2896 		panic("spec_offtoblk type");
2897 	}
2898 	/* NOTREACHED */
2899 
2900 	return 0;
2901 }
2902 
/*
 * kqueue filter routines for special device vnodes; wired into
 * spec_filtops below.
 */
static int filt_specattach(struct knote *kn, struct kevent_qos_s *kev);
static void filt_specdetach(struct knote *kn);
static int filt_specevent(struct knote *kn, long hint);
static int filt_spectouch(struct knote *kn, struct kevent_qos_s *kev);
static int filt_specprocess(struct knote *kn, struct kevent_qos_s *kev);
2908 
/*
 * Filter operations vector for knotes attached to special device
 * vnodes (f_isfd: the knote identifier is a file descriptor).
 */
SECURITY_READ_ONLY_EARLY(struct filterops) spec_filtops = {
	.f_isfd    = 1,
	.f_attach  = filt_specattach,
	.f_detach  = filt_specdetach,
	.f_event   = filt_specevent,
	.f_touch   = filt_spectouch,
	.f_process = filt_specprocess,
};
2917 
/*
 * Mark the knote at EOF and one-shot so it is delivered once and
 * then discarded (used for revoked/recycled vnodes).
 */
static void
filt_spec_make_eof(struct knote *kn)
{
	/*
	 * The spec filter might touch kn_flags from f_event
	 * without holding "the primitive lock", so make it atomic.
	 */
	os_atomic_or(&kn->kn_flags, EV_EOF | EV_ONESHOT, relaxed);
}
2927 
/*
 * Common worker for the spec filter attach/touch/process routines.
 *
 * Polls the device through its select entry point (VNOP_SELECT) to
 * decide whether the knote should fire, while arranging for any
 * selrecord() done by the driver to attach the knote to the driver's
 * selinfo (via the selspec_record_hook block), so future selwakeup()s
 * will post to this knote.
 *
 * Returns FILTER_ACTIVE when the event should be delivered (filling
 * in *kev), 0 otherwise.  On an attach that cannot be recorded,
 * returns 0 with ENODEV set on the knote instead.
 */
static int
filt_spec_common(struct knote *kn, struct kevent_qos_s *kev, bool attach)
{
	uthread_t uth = current_uthread();
	vfs_context_t ctx = vfs_context_current();
	vnode_t vp = (vnode_t)fp_get_data(kn->kn_fp);
	__block bool selrecorded = false;
	struct select_set *old_wqs;
	int64_t data = 0;
	int ret, selret;

	if (kn->kn_flags & EV_EOF) {
		/* already marked EOF (e.g. by filt_spec_make_eof); just deliver */
		ret = FILTER_ACTIVE;
		goto out;
	}

	/* attach already holds an iocount; otherwise take one (vid-checked) */
	if (!attach && vnode_getwithvid(vp, vnode_vid(vp)) != 0) {
		/* vnode was recycled out from under us; fake EOF */
		filt_spec_make_eof(kn);
		ret = FILTER_ACTIVE;
		goto out;
	}

	/* invoked by the driver's selrecord() to hook the knote to its selinfo */
	selspec_record_hook_t cb = ^(struct selinfo *si) {
		selspec_attach(kn, si);
		selrecorded = true;
	};

	/*
	 * Swap in the record marker so selrecord() calls made during
	 * VNOP_SELECT invoke `cb` rather than a regular select set,
	 * then restore the thread's previous select set.
	 */
	old_wqs = uth->uu_selset;
	uth->uu_selset = SELSPEC_RECORD_MARKER;
	selret = VNOP_SELECT(vp, knote_get_seltype(kn), 0, cb, ctx);
	uth->uu_selset = old_wqs;

	if (!attach) {
		/* pair with the vnode_getwithvid() above */
		vnode_put(vp);
	}

	if (!selrecorded && selret == 0) {
		/*
		 * The device indicated that there's no data to read,
		 * but didn't call `selrecord`.
		 *
		 * Nothing will be notified of changes to this vnode,
		 * so return an error back to user space on attach,
		 * or pretend the knote disappeared for other cases,
		 * to make it clear that the knote is not attached.
		 */
		if (attach) {
			knote_set_error(kn, ENODEV);
			return 0;
		}

		filt_spec_make_eof(kn);
		ret = FILTER_ACTIVE;
		goto out;
	}

	if (kn->kn_vnode_use_ofst) {
		/*
		 * Treat selret as an absolute size and report the bytes
		 * remaining past the descriptor's current offset.
		 * NOTE(review): fg_offset (off_t) is compared against a
		 * uint32_t-truncated selret — presumably selret fits in
		 * 32 bits for such devices; confirm with callers.
		 */
		if (kn->kn_fp->fp_glob->fg_offset >= (uint32_t)selret) {
			data = 0;
		} else {
			data = ((uint32_t)selret) - kn->kn_fp->fp_glob->fg_offset;
		}
	} else {
		data = selret;
	}

	/* fire only once the available data reaches the low watermark */
	if (data >= knote_low_watermark(kn)) {
		ret = FILTER_ACTIVE;
	} else {
		ret = 0;
	}
out:
	if (ret) {
		knote_fill_kevent(kn, kev, data);
	}
	return ret;
}
3005 
3006 static int
filt_specattach(struct knote * kn,__unused struct kevent_qos_s * kev)3007 filt_specattach(struct knote *kn, __unused struct kevent_qos_s *kev)
3008 {
3009 	vnode_t vp = (vnode_t)fp_get_data(kn->kn_fp); /* Already have iocount, and vnode is alive */
3010 	dev_t dev;
3011 
3012 	assert(vnode_ischr(vp));
3013 
3014 	dev = vnode_specrdev(vp);
3015 
3016 	/*
3017 	 * For a few special kinds of devices, we can attach knotes with
3018 	 * no restrictions because their "select" vectors return the amount
3019 	 * of data available.  Others require an explicit NOTE_LOWAT with
3020 	 * data of 1, indicating that the caller doesn't care about actual
3021 	 * data counts, just an indication that the device has data.
3022 	 */
3023 	if (!kn->kn_vnode_kqok &&
3024 	    ((kn->kn_sfflags & NOTE_LOWAT) == 0 || kn->kn_sdata != 1)) {
3025 		knote_set_error(kn, EINVAL);
3026 		return 0;
3027 	}
3028 
3029 	return filt_spec_common(kn, kev, true);
3030 }
3031 
/*
 * kqueue detach routine: unhook the knote from whatever selinfo it
 * was attached to by filt_spec_common()'s record hook.
 */
static void
filt_specdetach(struct knote *kn)
{
	selspec_detach(kn);
}
3037 
3038 static int
filt_specevent(struct knote * kn,long hint)3039 filt_specevent(struct knote *kn, long hint)
3040 {
3041 	/* knote_post() will have cleared it for us */
3042 	assert(kn->kn_hook == NULL);
3043 
3044 	/* called by selwakeup with the selspec_lock lock held */
3045 	if (hint & NOTE_REVOKE) {
3046 		filt_spec_make_eof(kn);
3047 	}
3048 	return FILTER_ACTIVE;
3049 }
3050 
3051 static int
filt_spectouch(struct knote * kn,struct kevent_qos_s * kev)3052 filt_spectouch(struct knote *kn, struct kevent_qos_s *kev)
3053 {
3054 	kn->kn_sdata = kev->data;
3055 	kn->kn_sfflags = kev->fflags;
3056 
3057 	return filt_spec_common(kn, kev, false);
3058 }
3059 
/*
 * kqueue process routine: re-poll the device through the common path
 * and fill in *kev when the knote should be delivered.
 */
static int
filt_specprocess(struct knote *kn, struct kevent_qos_s *kev)
{
	return filt_spec_common(kn, kev, false);
}
3065