xref: /xnu-12377.41.6/bsd/miscfs/specfs/spec_vnops.c (revision bbb1b6f9e71b8cdde6e5cd6f4841f207dee3d828)
1 /*
2  * Copyright (c) 2000-2019 Apple Computer, Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29 /*
30  * Copyright (c) 1989, 1993, 1995
31  *	The Regents of the University of California.  All rights reserved.
32  *
33  * Redistribution and use in source and binary forms, with or without
34  * modification, are permitted provided that the following conditions
35  * are met:
36  * 1. Redistributions of source code must retain the above copyright
37  *    notice, this list of conditions and the following disclaimer.
38  * 2. Redistributions in binary form must reproduce the above copyright
39  *    notice, this list of conditions and the following disclaimer in the
40  *    documentation and/or other materials provided with the distribution.
41  * 3. All advertising materials mentioning features or use of this software
42  *    must display the following acknowledgement:
43  *	This product includes software developed by the University of
44  *	California, Berkeley and its contributors.
45  * 4. Neither the name of the University nor the names of its contributors
46  *    may be used to endorse or promote products derived from this software
47  *    without specific prior written permission.
48  *
49  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59  * SUCH DAMAGE.
60  *
61  *	@(#)spec_vnops.c	8.14 (Berkeley) 5/21/95
62  */
63 
64 #include <sys/param.h>
65 #include <sys/proc_internal.h>
66 #include <sys/kauth.h>
67 #include <sys/systm.h>
68 #include <sys/kernel.h>
69 #include <sys/conf.h>
70 #include <sys/buf_internal.h>
71 #include <sys/mount_internal.h>
72 #include <sys/vnode_internal.h>
73 #include <sys/file_internal.h>
74 #include <sys/namei.h>
75 #include <sys/stat.h>
76 #include <sys/errno.h>
77 #include <sys/ioctl.h>
78 #include <sys/file.h>
79 #include <sys/user.h>
80 #include <sys/malloc.h>
81 #include <sys/disk.h>
82 #include <sys/uio_internal.h>
83 #include <sys/resource.h>
84 #include <machine/machine_routines.h>
85 #include <miscfs/specfs/specdev.h>
86 #include <vfs/vfs_support.h>
87 #include <vfs/vfs_disk_conditioner.h>
88 
89 #include <kern/assert.h>
90 #include <kern/task.h>
91 #include <kern/sched_prim.h>
92 #include <kern/thread.h>
93 #include <kern/policy_internal.h>
94 #include <kern/timer_call.h>
95 #include <kern/waitq.h>
96 
97 #include <pexpert/pexpert.h>
98 
99 #include <sys/kdebug.h>
100 #include <libkern/section_keywords.h>
101 
102 #if CONFIG_IO_COMPRESSION_STATS
103 #include <vfs/vfs_io_compression_stats.h>
104 #endif /* CONFIG_IO_COMPRESSION_STATS */
105 
106 #if CONFIG_IOSCHED
107 #include <vm/vm_pageout_xnu.h>
108 #include <vm/vm_object_xnu.h>
109 #endif /* CONFIG_IOSCHED */
110 
111 /* XXX the following prototypes should be in a header file somewhere */
112 extern dev_t    chrtoblk(dev_t dev);
113 extern boolean_t        iskmemdev(dev_t dev);
114 extern int bpfkqfilter(dev_t dev, struct knote *kn);
115 extern int ptsd_kqfilter(dev_t, struct knote *);
116 extern int ptmx_kqfilter(dev_t, struct knote *);
117 #if CONFIG_PHYS_WRITE_ACCT
118 uint64_t kernel_pm_writes;    // to track the sync writes occurring during power management transitions
119 #endif /* CONFIG_PHYS_WRITE_ACCT */
120 
121 
122 struct vnode *speclisth[SPECHSZ];
123 
124 /* symbolic sleep message strings for devices */
125 char    devopn[] = "devopn";
126 char    devio[] = "devio";
127 char    devwait[] = "devwait";
128 char    devin[] = "devin";
129 char    devout[] = "devout";
130 char    devioc[] = "devioc";
131 char    devcls[] = "devcls";
132 
133 #define VOPFUNC int (*)(void *)
134 
/*
 * Vnode operation vector for special-device vnodes.
 *
 * spec_vnodeop_entries pairs each vnop descriptor with its implementation:
 * the spec_* routines where an operation is handled here, and err_* or
 * nop_* stubs for the rest.  Anything not listed falls back to
 * vn_default_error via the vnop_default_desc entry.  The table ends with an
 * all-NULL entry.  spec_vnodeop_opv_desc ties the table to the
 * spec_vnodeop_p vector pointer.
 */
int(**spec_vnodeop_p)(void *);
const struct vnodeopv_entry_desc spec_vnodeop_entries[] = {
	{ .opve_op = &vnop_default_desc, .opve_impl = (VOPFUNC)(void (*)(void))vn_default_error },
	{ .opve_op = &vnop_lookup_desc, .opve_impl = (VOPFUNC)spec_lookup },            /* lookup */
	{ .opve_op = &vnop_create_desc, .opve_impl = (VOPFUNC)err_create },             /* create */
	{ .opve_op = &vnop_mknod_desc, .opve_impl = (VOPFUNC)err_mknod },               /* mknod */
	{ .opve_op = &vnop_open_desc, .opve_impl = (VOPFUNC)spec_open },                        /* open */
	{ .opve_op = &vnop_close_desc, .opve_impl = (VOPFUNC)spec_close },              /* close */
	{ .opve_op = &vnop_access_desc, .opve_impl = (VOPFUNC)spec_access },            /* access */
	{ .opve_op = &vnop_getattr_desc, .opve_impl = (VOPFUNC)spec_getattr },          /* getattr */
	{ .opve_op = &vnop_setattr_desc, .opve_impl = (VOPFUNC)spec_setattr },          /* setattr */
	{ .opve_op = &vnop_read_desc, .opve_impl = (VOPFUNC)spec_read },                        /* read */
	{ .opve_op = &vnop_write_desc, .opve_impl = (VOPFUNC)spec_write },              /* write */
	{ .opve_op = &vnop_ioctl_desc, .opve_impl = (VOPFUNC)spec_ioctl },              /* ioctl */
	{ .opve_op = &vnop_select_desc, .opve_impl = (VOPFUNC)spec_select },            /* select */
	{ .opve_op = &vnop_revoke_desc, .opve_impl = (VOPFUNC)nop_revoke },             /* revoke */
	{ .opve_op = &vnop_mmap_desc, .opve_impl = (VOPFUNC)err_mmap },                 /* mmap */
	{ .opve_op = &vnop_fsync_desc, .opve_impl = (VOPFUNC)spec_fsync },              /* fsync */
	{ .opve_op = &vnop_remove_desc, .opve_impl = (VOPFUNC)err_remove },             /* remove */
	{ .opve_op = &vnop_link_desc, .opve_impl = (VOPFUNC)err_link },                 /* link */
	{ .opve_op = &vnop_rename_desc, .opve_impl = (VOPFUNC)err_rename },             /* rename */
	{ .opve_op = &vnop_mkdir_desc, .opve_impl = (VOPFUNC)err_mkdir },               /* mkdir */
	{ .opve_op = &vnop_rmdir_desc, .opve_impl = (VOPFUNC)err_rmdir },               /* rmdir */
	{ .opve_op = &vnop_symlink_desc, .opve_impl = (VOPFUNC)err_symlink },           /* symlink */
	{ .opve_op = &vnop_readdir_desc, .opve_impl = (VOPFUNC)err_readdir },           /* readdir */
	{ .opve_op = &vnop_readlink_desc, .opve_impl = (VOPFUNC)err_readlink },         /* readlink */
	{ .opve_op = &vnop_inactive_desc, .opve_impl = (VOPFUNC)nop_inactive },         /* inactive */
	{ .opve_op = &vnop_reclaim_desc, .opve_impl = (VOPFUNC)nop_reclaim },           /* reclaim */
	{ .opve_op = &vnop_strategy_desc, .opve_impl = (VOPFUNC)spec_strategy },                /* strategy */
	{ .opve_op = &vnop_pathconf_desc, .opve_impl = (VOPFUNC)spec_pathconf },                /* pathconf */
	{ .opve_op = &vnop_advlock_desc, .opve_impl = (VOPFUNC)err_advlock },           /* advlock */
	{ .opve_op = &vnop_bwrite_desc, .opve_impl = (VOPFUNC)spec_bwrite },            /* bwrite */
	{ .opve_op = &vnop_pagein_desc, .opve_impl = (VOPFUNC)err_pagein },             /* Pagein */
	{ .opve_op = &vnop_pageout_desc, .opve_impl = (VOPFUNC)err_pageout },           /* Pageout */
	{ .opve_op = &vnop_copyfile_desc, .opve_impl = (VOPFUNC)err_copyfile },         /* Copyfile */
	{ .opve_op = &vnop_blktooff_desc, .opve_impl = (VOPFUNC)spec_blktooff },                /* blktooff */
	{ .opve_op = &vnop_offtoblk_desc, .opve_impl = (VOPFUNC)spec_offtoblk },                /* offtoblk */
	{ .opve_op = &vnop_blockmap_desc, .opve_impl = (VOPFUNC)spec_blockmap },                /* blockmap */
	{ .opve_op = (struct vnodeop_desc*)NULL, .opve_impl = (int (*)(void *))NULL }
};
const struct vnodeopv_desc spec_vnodeop_opv_desc =
{ .opv_desc_vector_p = &spec_vnodeop_p, .opv_desc_ops = spec_vnodeop_entries };
177 
178 
179 static void set_blocksize(vnode_t, dev_t);
180 
181 #define LOWPRI_TIER1_WINDOW_MSECS         25
182 #define LOWPRI_TIER2_WINDOW_MSECS         100
183 #define LOWPRI_TIER3_WINDOW_MSECS         500
184 
185 #define LOWPRI_TIER1_IO_PERIOD_MSECS      40
186 #define LOWPRI_TIER2_IO_PERIOD_MSECS      85
187 #define LOWPRI_TIER3_IO_PERIOD_MSECS      200
188 
189 #define LOWPRI_TIER1_IO_PERIOD_SSD_MSECS  5
190 #define LOWPRI_TIER2_IO_PERIOD_SSD_MSECS  15
191 #define LOWPRI_TIER3_IO_PERIOD_SSD_MSECS  25
192 
193 
/*
 * Per-level throttle tunables, in milliseconds.  Each array has
 * THROTTLE_LEVEL_END + 1 slots; slot 0 is always 0 and the following slots
 * hold the TIER1..TIER3 defaults defined above.
 * NOTE(review): presumably the array index is the THROTTLE_LEVEL_* value --
 * confirm against the THROTTLE_LEVEL_* definitions.
 */

/* Throttle window length per level. */
int     throttle_windows_msecs[THROTTLE_LEVEL_END + 1] = {
	0,
	LOWPRI_TIER1_WINDOW_MSECS,
	LOWPRI_TIER2_WINDOW_MSECS,
	LOWPRI_TIER3_WINDOW_MSECS,
};

/* I/O period per level (default, non-SSD values). */
int     throttle_io_period_msecs[THROTTLE_LEVEL_END + 1] = {
	0,
	LOWPRI_TIER1_IO_PERIOD_MSECS,
	LOWPRI_TIER2_IO_PERIOD_MSECS,
	LOWPRI_TIER3_IO_PERIOD_MSECS,
};

/* I/O period per level, SSD values. */
int     throttle_io_period_ssd_msecs[THROTTLE_LEVEL_END + 1] = {
	0,
	LOWPRI_TIER1_IO_PERIOD_SSD_MSECS,
	LOWPRI_TIER2_IO_PERIOD_SSD_MSECS,
	LOWPRI_TIER3_IO_PERIOD_SSD_MSECS,
};
214 
215 
216 int     throttled_count[THROTTLE_LEVEL_END + 1];
217 
/*
 * I/O throttling state.  Instances live in the _throttle_io_info[] array
 * (LOWPRI_MAX_NUM_DEV entries); spec_read() and spec_write() pick the entry
 * for a throttleable disk by indexing that array with si_devbsdunit.
 * Most arrays inside are sized THROTTLE_LEVEL_END + 1, i.e. one slot per
 * throttle level.
 */
struct _throttle_io_info_t {
	lck_mtx_t       throttle_lock;

	/* stamped with microuptime() by spec_write() on throttleable disk character devices */
	struct timeval  throttle_last_write_timestamp;
	struct timeval  throttle_min_timer_deadline;
	struct timeval  throttle_window_start_timestamp[THROTTLE_LEVEL_END + 1]; /* window starts at both the beginning and completion of an I/O */
	struct timeval  throttle_last_IO_timestamp[THROTTLE_LEVEL_END + 1];
	pid_t           throttle_last_IO_pid[THROTTLE_LEVEL_END + 1];
	struct timeval  throttle_start_IO_period_timestamp[THROTTLE_LEVEL_END + 1];
	int32_t throttle_inflight_count[THROTTLE_LEVEL_END + 1];

	TAILQ_HEAD(, uthread) throttle_uthlist[THROTTLE_LEVEL_END + 1];         /* Lists of throttled uthreads */
	int             throttle_next_wake_level;

	thread_call_t   throttle_timer_call;
	int32_t throttle_timer_ref;
	int32_t throttle_timer_active;

	int32_t throttle_io_count;
	int32_t throttle_io_count_begin;
	/* NOTE(review): presumably points at one of the throttle_io_period*_msecs tables -- confirm */
	int    *throttle_io_periods;
	uint32_t throttle_io_period_num;

	int32_t throttle_refcnt;
	int32_t throttle_alloc;
	int32_t throttle_disabled;
	int32_t throttle_is_fusion_with_priority;
};
246 
247 struct _throttle_io_info_t _throttle_io_info[LOWPRI_MAX_NUM_DEV];
248 
249 
250 int     lowpri_throttle_enabled = 1;
251 
252 static int spec_close_internal(struct vnode *vp, dev_t dev, int flags, vfs_context_t ctx);
253 
254 static void throttle_info_end_io_internal(struct _throttle_io_info_t *info, int throttle_level);
255 static int throttle_info_update_internal(struct _throttle_io_info_t *info, uthread_t ut, int flags, boolean_t isssd, boolean_t inflight, struct bufattr *bap);
256 static int throttle_get_thread_throttle_level(uthread_t ut);
257 static int throttle_get_thread_throttle_level_internal(uthread_t ut, int io_tier);
258 void throttle_info_mount_reset_period(mount_t mp, int isssd);
259 
260 /*
261  * Trivial lookup routine that always fails.
262  */
263 int
spec_lookup(struct vnop_lookup_args * ap)264 spec_lookup(struct vnop_lookup_args *ap)
265 {
266 	*ap->a_vpp = NULL;
267 	return ENOTDIR;
268 }
269 
270 static void
set_blocksize(struct vnode * vp,dev_t dev)271 set_blocksize(struct vnode *vp, dev_t dev)
272 {
273 	int (*size)(dev_t);
274 	int rsize;
275 
276 	if ((major(dev) < nblkdev) && (size = bdevsw[major(dev)].d_psize)) {
277 		rsize = (*size)(dev);
278 		if (rsize <= 0) { /* did size fail? */
279 			vp->v_specsize = DEV_BSIZE;
280 		} else {
281 			vp->v_specsize = rsize;
282 		}
283 	} else {
284 		vp->v_specsize = DEV_BSIZE;
285 	}
286 }
287 
288 void
set_fsblocksize(struct vnode * vp)289 set_fsblocksize(struct vnode *vp)
290 {
291 	if (vp->v_type == VBLK) {
292 		dev_t dev = (dev_t)vp->v_rdev;
293 		int maj = major(dev);
294 
295 		if ((u_int)maj >= (u_int)nblkdev) {
296 			return;
297 		}
298 
299 		vnode_lock(vp);
300 		set_blocksize(vp, dev);
301 		vnode_unlock(vp);
302 	}
303 }
304 
305 static void
spec_init_bsdunit(vnode_t vp,vfs_context_t ctx,const char * caller)306 spec_init_bsdunit(vnode_t vp, vfs_context_t ctx, const char* caller)
307 {
308 	int     isssd = 0;
309 	uint64_t throttle_mask = 0;
310 	uint32_t devbsdunit = 0;
311 
312 	if (VNOP_IOCTL(vp, DKIOCISSOLIDSTATE, (caddr_t)&isssd, 0, ctx)) {
313 		isssd = 0;
314 	}
315 	if (VNOP_IOCTL(vp, DKIOCGETTHROTTLEMASK, (caddr_t)&throttle_mask, 0, NULL)) {
316 		throttle_mask = 0;
317 	}
318 
319 	if (throttle_mask != 0) {
320 		/*
321 		 * as a reasonable approximation, only use the lowest bit of the mask
322 		 * to generate a disk unit number
323 		 */
324 		devbsdunit = num_trailing_0(throttle_mask);
325 	} else {
326 		devbsdunit = 0;
327 	}
328 
329 	if (vp->v_un.vu_specinfo->si_initted == 0) {
330 		vnode_lock(vp);
331 		if (vp->v_un.vu_specinfo->si_initted == 0) {
332 			vp->v_un.vu_specinfo->si_isssd = isssd ? 1 : 0;
333 			vp->v_un.vu_specinfo->si_devbsdunit = devbsdunit;
334 			vp->v_un.vu_specinfo->si_throttle_mask = throttle_mask;
335 			vp->v_un.vu_specinfo->si_throttleable = 1;
336 			vp->v_un.vu_specinfo->si_initted = 1;
337 		}
338 		vnode_unlock(vp);
339 		printf("%s : si_devbsdunit initialized to (%d), throttle_mask is (0x%llx), isssd is (%d)\n",
340 		    caller, vp->v_un.vu_specinfo->si_devbsdunit,
341 		    vp->v_un.vu_specinfo->si_throttle_mask,
342 		    vp->v_un.vu_specinfo->si_isssd);
343 	}
344 }
345 
346 #define SPEC_INIT_BSDUNIT(vp, ctx) spec_init_bsdunit((vp), (ctx), __FUNCTION__)
347 
/*
 * Open a special file.
 *
 * Fails with ENXIO when the vnode's mount carries MNT_NODEV or when the
 * device major number is outside the relevant switch table.
 *
 * VCHR: for write opens by anything other than FSCRED, refuses the kmem
 * devices (EPERM) and, at securelevel >= 1, character devices whose block
 * counterpart is mounted.  Then calls the driver's d_open under
 * devsw_lock, bumping si_opencount on success, and for D_DISK devices
 * performs the one-time throttle-info probe.
 *
 * VBLK: refuses write opens of disks at securelevel >= 2 (EPERM) and opens
 * of mounted devices, calls d_open under devsw_lock, then caches the block
 * size and the device size in bytes on the vnode.
 *
 * Returns 0 or an errno value; panics for any other vnode type.
 */
int
spec_open(struct vnop_open_args *ap)
{
	struct proc *p = vfs_context_proc(ap->a_context);
	kauth_cred_t cred = vfs_context_ucred(ap->a_context);
	struct vnode *vp = ap->a_vp;
	dev_t bdev, dev = (dev_t)vp->v_rdev;
	int maj = major(dev);
	int error;

	/*
	 * Don't allow open if fs is mounted -nodev.
	 */
	if (vp->v_mount && (vp->v_mount->mnt_flag & MNT_NODEV)) {
		return ENXIO;
	}

	switch (vp->v_type) {
	case VCHR:
		if ((u_int)maj >= (u_int)nchrdev) {
			return ENXIO;
		}
		if (cred != FSCRED && (ap->a_mode & FWRITE)) {
#if 0
			/*
			 * When running in very secure mode, do not allow
			 * opens for writing of any disk character devices.
			 */
			if (securelevel >= 2 && isdisk(dev, VCHR)) {
				return EPERM;
			}
#endif

			/* Never allow writing to /dev/mem or /dev/kmem */
			if (iskmemdev(dev)) {
				return EPERM;
			}
			/*
			 * When running in secure mode, do not allow opens for
			 * writing of character devices whose corresponding block
			 * devices are currently mounted.
			 */
			if (securelevel >= 1) {
				if ((bdev = chrtoblk(dev)) != NODEV && check_mountedon(bdev, VBLK, &error)) {
					return error;
				}
			}
		}

		/* the driver open and the open-count update happen under the devsw lock */
		devsw_lock(dev, S_IFCHR);
		error = (*cdevsw[maj].d_open)(dev, ap->a_mode, S_IFCHR, p);

		if (error == 0) {
			vp->v_specinfo->si_opencount++;
		}

		devsw_unlock(dev, S_IFCHR);

		/* first successful open of a disk: probe its throttling parameters */
		if (error == 0 && cdevsw[maj].d_type == D_DISK && !vp->v_un.vu_specinfo->si_initted) {
			int     isssd = 0;
			uint64_t throttle_mask = 0;
			uint32_t devbsdunit = 0;

			/*
			 * The device is only marked throttleable when it reports a
			 * non-zero throttle mask and the solid-state query succeeds.
			 */
			if (VNOP_IOCTL(vp, DKIOCGETTHROTTLEMASK, (caddr_t)&throttle_mask, 0, NULL) == 0) {
				if (throttle_mask != 0 &&
				    VNOP_IOCTL(vp, DKIOCISSOLIDSTATE, (caddr_t)&isssd, 0, ap->a_context) == 0) {
					/*
					 * as a reasonable approximation, only use the lowest bit of the mask
					 * to generate a disk unit number
					 */
					devbsdunit = num_trailing_0(throttle_mask);

					vnode_lock(vp);

					vp->v_un.vu_specinfo->si_isssd = isssd ? 1 : 0;
					vp->v_un.vu_specinfo->si_devbsdunit = devbsdunit;
					vp->v_un.vu_specinfo->si_throttle_mask = throttle_mask;
					vp->v_un.vu_specinfo->si_throttleable = 1;
					vp->v_un.vu_specinfo->si_initted = 1;

					vnode_unlock(vp);
				}
			}
			/*
			 * Probe did not complete: still set si_initted so that the
			 * !si_initted test above skips the probe on later opens.
			 */
			if (vp->v_un.vu_specinfo->si_initted == 0) {
				vnode_lock(vp);
				vp->v_un.vu_specinfo->si_initted = 1;
				vnode_unlock(vp);
			}
		}
		return error;

	case VBLK:
		if ((u_int)maj >= (u_int)nblkdev) {
			return ENXIO;
		}
		/*
		 * When running in very secure mode, do not allow
		 * opens for writing of any disk block devices.
		 */
		if (securelevel >= 2 && cred != FSCRED &&
		    (ap->a_mode & FWRITE) && bdevsw[maj].d_type == D_DISK) {
			return EPERM;
		}
		/*
		 * Do not allow opens of block devices that are
		 * currently mounted.
		 */
		if ((error = vfs_mountedon(vp))) {
			return error;
		}

		/* the driver open and the open-count update happen under the devsw lock */
		devsw_lock(dev, S_IFBLK);
		error = (*bdevsw[maj].d_open)(dev, ap->a_mode, S_IFBLK, p);
		if (!error) {
			vp->v_specinfo->si_opencount++;
		}
		devsw_unlock(dev, S_IFBLK);

		if (!error) {
			u_int64_t blkcnt;       /* only meaningful when setsize != 0 */
			u_int32_t blksize;
			int setsize = 0;
			u_int32_t size512 = 512;

			if (bdevsw[maj].d_type == D_DISK && !vp->v_un.vu_specinfo->si_initted) {
				SPEC_INIT_BSDUNIT(vp, ap->a_context);
			}

			/*
			 * Determine the device size in 512-byte units: remember the
			 * current block size, switch to 512, read the block count,
			 * then restore the original block size.
			 */
			if (!VNOP_IOCTL(vp, DKIOCGETBLOCKSIZE, (caddr_t)&blksize, 0, ap->a_context)) {
				/* Switch to 512 byte sectors (temporarily) */

				if (!VNOP_IOCTL(vp, DKIOCSETBLOCKSIZE, (caddr_t)&size512, FWRITE, ap->a_context)) {
					/* Get the number of 512 byte physical blocks. */
					if (!VNOP_IOCTL(vp, DKIOCGETBLOCKCOUNT, (caddr_t)&blkcnt, 0, ap->a_context)) {
						setsize = 1;
					}
				}
				/* If it doesn't set back, we can't recover */
				if (VNOP_IOCTL(vp, DKIOCSETBLOCKSIZE, (caddr_t)&blksize, FWRITE, ap->a_context)) {
					/* Perform an explicit close on the block device, as the device is already open */
					spec_close_internal(vp, dev, ap->a_mode, ap->a_context);

					error = ENXIO;
				}
			}

			/*
			 * NOTE(review): this also runs after the failed-restore path
			 * above has closed the device, so set_blocksize() may call the
			 * driver's d_psize on a closed device -- confirm this is intended.
			 */
			vnode_lock(vp);
			set_blocksize(vp, dev);

			/*
			 * Cache the size in bytes of the block device for later
			 * use by spec_write().
			 */
			if (setsize) {
				vp->v_specdevsize = blkcnt * (u_int64_t)size512;
			} else {
				vp->v_specdevsize = (u_int64_t)0; /* Default: Can't get */
			}
			vnode_unlock(vp);
		}
		return error;
	default:
		panic("spec_open type");
	}
	return 0;
}
518 
519 /*
520  * Vnode op for read
521  */
522 int
spec_read(struct vnop_read_args * ap)523 spec_read(struct vnop_read_args *ap)
524 {
525 	struct vnode *vp = ap->a_vp;
526 	struct uio *uio = ap->a_uio;
527 	struct buf *bp;
528 	daddr64_t bn, nextbn;
529 	long bscale;
530 	int devBlockSize = 0;
531 	size_t bsize, n, on;
532 	int error = 0;
533 	dev_t dev;
534 
535 #if DIAGNOSTIC
536 	if (uio->uio_rw != UIO_READ) {
537 		panic("spec_read mode");
538 	}
539 	if (UIO_SEG_IS_USER_SPACE(uio->uio_segflg)) {
540 		panic("spec_read proc");
541 	}
542 #endif
543 	if (uio_resid(uio) == 0) {
544 		return 0;
545 	}
546 
547 	switch (vp->v_type) {
548 	case VCHR:
549 	{
550 		struct _throttle_io_info_t *throttle_info = NULL;
551 		int thread_throttle_level;
552 		uint64_t blkno = 0;
553 		uint32_t iolen = 0;
554 		int ddisk = 0;
555 		int ktrace_code = DKIO_READ;
556 		devBlockSize = vp->v_specsize;
557 		uintptr_t our_id = 0;
558 
559 		if (cdevsw[major(vp->v_rdev)].d_type == D_DISK) {
560 			ddisk = 1;
561 		}
562 
563 		if (ddisk && vp->v_un.vu_specinfo->si_throttleable) {
564 			throttle_info = &_throttle_io_info[vp->v_un.vu_specinfo->si_devbsdunit];
565 			thread_throttle_level = throttle_info_update_internal(throttle_info, NULL, 0, vp->v_un.vu_specinfo->si_isssd, TRUE, NULL);
566 		}
567 
568 		if (kdebug_enable && ddisk) {
569 			if (devBlockSize == 0) {
570 				devBlockSize = 512;  // default sector size
571 			}
572 
573 			if (uio_offset(uio) && devBlockSize) {
574 				blkno = ((uint64_t) uio_offset(uio) / ((uint64_t)devBlockSize));
575 			}
576 			iolen = (int) uio_resid(uio);
577 			our_id = (uintptr_t)thread_tid(current_thread());
578 			KERNEL_DEBUG_CONSTANT_IST(KDEBUG_COMMON,
579 			    (FSDBG_CODE(DBG_DKRW, ktrace_code)) | DBG_FUNC_NONE, our_id,
580 			    vp->v_rdev, blkno, iolen, 0);
581 		}
582 
583 		error = (*cdevsw[major(vp->v_rdev)].d_read)
584 		    (vp->v_rdev, uio, ap->a_ioflag);
585 
586 
587 		if (kdebug_enable && ddisk) {
588 			uint32_t residual = (uint32_t)uio_resid(uio);
589 			ktrace_code |= DKIO_DONE;
590 			KERNEL_DEBUG_CONSTANT_IST(KDEBUG_COMMON,
591 			    (FSDBG_CODE(DBG_DKRW, ktrace_code)) | DBG_FUNC_NONE, our_id,
592 			    (uintptr_t)VM_KERNEL_ADDRPERM(vp), residual, error, 0);
593 		}
594 
595 		if (throttle_info) {
596 			throttle_info_end_io_internal(throttle_info, thread_throttle_level);
597 		}
598 
599 		return error;
600 	}
601 
602 	case VBLK:
603 		if (uio->uio_offset < 0) {
604 			return EINVAL;
605 		}
606 
607 		dev = vp->v_rdev;
608 
609 		devBlockSize = vp->v_specsize;
610 
611 		if (devBlockSize > PAGE_SIZE) {
612 			return EINVAL;
613 		}
614 
615 		bscale = PAGE_SIZE / devBlockSize;
616 		bsize = bscale * devBlockSize;
617 
618 		do {
619 			on = uio->uio_offset % bsize;
620 
621 			bn = (daddr64_t)((uio->uio_offset / devBlockSize) & ~(bscale - 1));
622 
623 			if (vp->v_speclastr + bscale == bn) {
624 				nextbn = bn + bscale;
625 				error = buf_breadn(vp, bn, (int)bsize, &nextbn,
626 				    (int *)&bsize, 1, NOCRED, &bp);
627 			} else {
628 				error = buf_bread(vp, bn, (int)bsize, NOCRED, &bp);
629 			}
630 
631 			vnode_lock(vp);
632 			vp->v_speclastr = bn;
633 			vnode_unlock(vp);
634 
635 			n = bsize - buf_resid(bp);
636 			if ((on > n) || error) {
637 				if (!error) {
638 					error = EINVAL;
639 				}
640 				buf_brelse(bp);
641 				return error;
642 			}
643 			n = MIN((n  - on), (size_t)uio_resid(uio));
644 
645 			error = uiomove((char *)buf_dataptr(bp) + on, (int)n, uio);
646 			if (n + on == bsize) {
647 				buf_markaged(bp);
648 			}
649 			buf_brelse(bp);
650 		} while (error == 0 && uio_resid(uio) > 0 && n != 0);
651 		return error;
652 
653 	default:
654 		panic("spec_read type");
655 	}
656 	/* NOTREACHED */
657 
658 	return 0;
659 }
660 
661 /*
662  * Vnode op for write
663  */
664 int
spec_write(struct vnop_write_args * ap)665 spec_write(struct vnop_write_args *ap)
666 {
667 	struct vnode *vp = ap->a_vp;
668 	struct uio *uio = ap->a_uio;
669 	struct buf *bp;
670 	daddr64_t bn;
671 	int blkmask, bscale;
672 	int io_sync;
673 	int devBlockSize = 0;
674 	size_t bsize, n, on;
675 	int error = 0;
676 	dev_t dev;
677 
678 #if DIAGNOSTIC
679 	if (uio->uio_rw != UIO_WRITE) {
680 		panic("spec_write mode");
681 	}
682 	if (UIO_SEG_IS_USER_SPACE(uio->uio_segflg)) {
683 		panic("spec_write proc");
684 	}
685 #endif
686 
687 	switch (vp->v_type) {
688 	case VCHR:
689 	{
690 		struct _throttle_io_info_t *throttle_info = NULL;
691 		int thread_throttle_level;
692 		dev = vp->v_rdev;
693 		devBlockSize = vp->v_specsize;
694 		uint32_t iolen = 0;
695 		uint64_t blkno = 0;
696 		int ddisk = 0;
697 		int ktrace_code = 0;  // write is implied; read must be OR'd in.
698 		uintptr_t our_id = 0;
699 
700 		if (cdevsw[major(dev)].d_type == D_DISK) {
701 			ddisk = 1;
702 		}
703 
704 		if (ddisk && vp->v_un.vu_specinfo->si_throttleable) {
705 			throttle_info = &_throttle_io_info[vp->v_un.vu_specinfo->si_devbsdunit];
706 
707 			thread_throttle_level = throttle_info_update_internal(throttle_info, NULL, 0, vp->v_un.vu_specinfo->si_isssd, TRUE, NULL);
708 
709 			microuptime(&throttle_info->throttle_last_write_timestamp);
710 		}
711 
712 		if (kdebug_enable && ddisk) {
713 			if (devBlockSize == 0) {
714 				devBlockSize = 512; // default sector size
715 			}
716 			if ((uio_offset(uio) != 0) && devBlockSize) {
717 				blkno = ((uint64_t)uio_offset(uio)) / ((uint64_t)devBlockSize);
718 			}
719 			iolen = (int)uio_resid(uio);
720 			our_id = (uintptr_t)thread_tid(current_thread());
721 			KERNEL_DEBUG_CONSTANT_IST(KDEBUG_COMMON,
722 			    (FSDBG_CODE(DBG_DKRW, ktrace_code)) | DBG_FUNC_NONE, our_id,
723 			    vp->v_rdev, blkno, iolen, 0);
724 		}
725 		error = (*cdevsw[major(vp->v_rdev)].d_write)
726 		    (vp->v_rdev, uio, ap->a_ioflag);
727 
728 		if (kdebug_enable && ddisk) {
729 			//emit the I/O completion
730 			uint32_t residual = (uint32_t)uio_resid(uio);
731 			ktrace_code |= DKIO_DONE;
732 			KERNEL_DEBUG_CONSTANT_IST(KDEBUG_COMMON,
733 			    (FSDBG_CODE(DBG_DKRW, ktrace_code)) | DBG_FUNC_NONE, our_id,
734 			    (uintptr_t)VM_KERNEL_ADDRPERM(vp), residual, error, 0);
735 		}
736 
737 		if (throttle_info) {
738 			throttle_info_end_io_internal(throttle_info, thread_throttle_level);
739 		}
740 
741 		return error;
742 	}
743 
744 	case VBLK:
745 		if (uio_resid(uio) == 0) {
746 			return 0;
747 		}
748 		if (uio->uio_offset < 0) {
749 			return EINVAL;
750 		}
751 
752 		io_sync = (ap->a_ioflag & IO_SYNC);
753 
754 		dev = (vp->v_rdev);
755 
756 		devBlockSize = vp->v_specsize;
757 		if (devBlockSize > PAGE_SIZE) {
758 			return EINVAL;
759 		}
760 
761 		bscale = PAGE_SIZE / devBlockSize;
762 		blkmask = bscale - 1;
763 		bsize = bscale * devBlockSize;
764 
765 
766 		do {
767 			bn = (daddr64_t)((uio->uio_offset / devBlockSize) & ~blkmask);
768 			on = uio->uio_offset % bsize;
769 
770 			n = MIN((bsize - on), (size_t)uio_resid(uio));
771 
772 			/*
773 			 * Use buf_getblk() as an optimization IFF:
774 			 *
775 			 * 1)	We are reading exactly a block on a block
776 			 *	aligned boundary
777 			 * 2)	We know the size of the device from spec_open
778 			 * 3)	The read doesn't span the end of the device
779 			 *
780 			 * Otherwise, we fall back on buf_bread().
781 			 */
782 			if (n == bsize &&
783 			    vp->v_specdevsize != (u_int64_t)0 &&
784 			    (uio->uio_offset + (u_int64_t)n) > vp->v_specdevsize) {
785 				/* reduce the size of the read to what is there */
786 				n = (uio->uio_offset + (u_int64_t)n) - vp->v_specdevsize;
787 			}
788 
789 			if (n == bsize) {
790 				bp = buf_getblk(vp, bn, (int)bsize, 0, 0, BLK_WRITE);
791 			} else {
792 				error = (int)buf_bread(vp, bn, (int)bsize, NOCRED, &bp);
793 			}
794 
795 			/* Translate downstream error for upstream, if needed */
796 			if (!error) {
797 				error = (int)buf_error(bp);
798 			}
799 			if (error) {
800 				buf_brelse(bp);
801 				return error;
802 			}
803 			n = MIN(n, bsize - buf_resid(bp));
804 
805 			error = uiomove((char *)buf_dataptr(bp) + on, (int)n, uio);
806 			if (error) {
807 				buf_brelse(bp);
808 				return error;
809 			}
810 			buf_markaged(bp);
811 
812 			if (io_sync) {
813 				error = buf_bwrite(bp);
814 			} else {
815 				if ((n + on) == bsize) {
816 					error = buf_bawrite(bp);
817 				} else {
818 					error = buf_bdwrite(bp);
819 				}
820 			}
821 		} while (error == 0 && uio_resid(uio) > 0 && n != 0);
822 		return error;
823 
824 	default:
825 		panic("spec_write type");
826 	}
827 	/* NOTREACHED */
828 
829 	return 0;
830 }
831 
832 /*
833  * Device ioctl operation.
834  */
835 int
spec_ioctl(struct vnop_ioctl_args * ap)836 spec_ioctl(struct vnop_ioctl_args *ap)
837 {
838 	proc_t p = vfs_context_proc(ap->a_context);
839 	dev_t dev = ap->a_vp->v_rdev;
840 	int     retval = 0;
841 
842 	KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_IOCTL, 0) | DBG_FUNC_START,
843 	    dev, ap->a_command, ap->a_fflag, ap->a_vp->v_type, 0);
844 
845 	switch (ap->a_vp->v_type) {
846 	case VCHR:
847 		retval = (*cdevsw[major(dev)].d_ioctl)(dev, ap->a_command, ap->a_data,
848 		    ap->a_fflag, p);
849 		break;
850 
851 	case VBLK:
852 		retval = (*bdevsw[major(dev)].d_ioctl)(dev, ap->a_command, ap->a_data, ap->a_fflag, p);
853 		if (!retval && ap->a_command == DKIOCSETBLOCKSIZE) {
854 			ap->a_vp->v_specsize = *(uint32_t *)ap->a_data;
855 		}
856 		break;
857 
858 	default:
859 		panic("spec_ioctl");
860 		/* NOTREACHED */
861 	}
862 	KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_IOCTL, 0) | DBG_FUNC_END,
863 	    dev, ap->a_command, ap->a_fflag, retval, 0);
864 
865 	return retval;
866 }
867 
868 int
spec_select(struct vnop_select_args * ap)869 spec_select(struct vnop_select_args *ap)
870 {
871 	proc_t p = vfs_context_proc(ap->a_context);
872 	dev_t dev;
873 
874 	switch (ap->a_vp->v_type) {
875 	default:
876 		return 1;             /* XXX */
877 
878 	case VCHR:
879 		dev = ap->a_vp->v_rdev;
880 		return (*cdevsw[major(dev)].d_select)(dev, ap->a_which, ap->a_wql, p);
881 	}
882 }
883 
884 int
spec_kqfilter(vnode_t vp,struct knote * kn,struct kevent_qos_s * kev)885 spec_kqfilter(vnode_t vp, struct knote *kn, struct kevent_qos_s *kev)
886 {
887 	dev_t dev;
888 
889 	assert(vnode_ischr(vp));
890 
891 	dev = vnode_specrdev(vp);
892 
893 #if NETWORKING
894 	/*
895 	 * Try a bpf device, as defined in bsd/net/bpf.c
896 	 * If it doesn't error out the attach, then it
897 	 * claimed it. Otherwise, fall through and try
898 	 * other attaches.
899 	 */
900 	int32_t tmp_flags = kn->kn_flags;
901 	int64_t tmp_sdata = kn->kn_sdata;
902 	int res;
903 
904 	res = bpfkqfilter(dev, kn);
905 	if ((kn->kn_flags & EV_ERROR) == 0) {
906 		return res;
907 	}
908 	kn->kn_flags = tmp_flags;
909 	kn->kn_sdata = tmp_sdata;
910 #endif
911 
912 	if (major(dev) >= nchrdev) {
913 		knote_set_error(kn, ENXIO);
914 		return 0;
915 	}
916 
917 	kn->kn_vnode_kqok = !!(cdevsw_flags[major(dev)] & CDEVSW_SELECT_KQUEUE);
918 	kn->kn_vnode_use_ofst = !!(cdevsw_flags[major(dev)] & CDEVSW_USE_OFFSET);
919 
920 	if (cdevsw_flags[major(dev)] & CDEVSW_IS_PTS) {
921 		kn->kn_filtid = EVFILTID_PTSD;
922 		return ptsd_kqfilter(dev, kn);
923 	} else if (cdevsw_flags[major(dev)] & CDEVSW_IS_PTC) {
924 		kn->kn_filtid = EVFILTID_PTMX;
925 		return ptmx_kqfilter(dev, kn);
926 	} else if (cdevsw[major(dev)].d_type == D_TTY && kn->kn_vnode_kqok) {
927 		/*
928 		 * TTYs from drivers that use struct ttys use their own filter
929 		 * routines.  The PTC driver doesn't use the tty for character
930 		 * counts, so it must go through the select fallback.
931 		 */
932 		kn->kn_filtid = EVFILTID_TTY;
933 	} else {
934 		/* Try to attach to other char special devices */
935 		kn->kn_filtid = EVFILTID_SPEC;
936 	}
937 
938 	return knote_fops(kn)->f_attach(kn, kev);
939 }
940 
941 /*
942  * Synch buffers associated with a block device
943  */
944 int
spec_fsync_internal(vnode_t vp,int waitfor,__unused vfs_context_t context)945 spec_fsync_internal(vnode_t vp, int waitfor, __unused vfs_context_t context)
946 {
947 	if (vp->v_type == VCHR) {
948 		return 0;
949 	}
950 	/*
951 	 * Flush all dirty buffers associated with a block device.
952 	 */
953 	buf_flushdirtyblks(vp, (waitfor == MNT_WAIT || waitfor == MNT_DWAIT), 0, "spec_fsync");
954 
955 	return 0;
956 }
957 
958 int
spec_fsync(struct vnop_fsync_args * ap)959 spec_fsync(struct vnop_fsync_args *ap)
960 {
961 	return spec_fsync_internal(ap->a_vp, ap->a_waitfor, ap->a_context);
962 }
963 
964 
/*
 * Forward declaration of throttle_init(), which is defined further down in
 * this file.
 *
 * NOTE(review): the comment that used to sit here read "Just call the device
 * strategy routine"; it does not describe this prototype and presumably
 * belongs with the strategy routine elsewhere in the file - confirm.
 */
void throttle_init(void);


/*
 * Optional tracing for throttle info allocation and reference counting.
 * The tracing variant is compiled out (#if 0), so every use of
 * DEBUG_ALLOC_THROTTLE_INFO() in this file expands to nothing.
 *
 * NOTE(review): the disabled variant tests (debug_info)->alloc, while the
 * rest of this file uses the member name throttle_alloc; confirm the field
 * name before enabling it.
 */
#if 0
#define DEBUG_ALLOC_THROTTLE_INFO(format, debug_info, args...)  \
	do {                                                    \
	       if ((debug_info)->alloc)                           \
	       printf("%s: "format, __FUNCTION__, ## args);     \
       } while(0)

#else
#define DEBUG_ALLOC_THROTTLE_INFO(format, debug_info, args...)
#endif


/*
 * Read-write integer sysctls under the _debug node for the throttle
 * tunables: one per tier (TIER1..TIER3) for the throttle window, the I/O
 * period and the SSD I/O period tables (all named *_msecs), plus the global
 * lowpri_throttle_enabled switch.
 */
SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_tier1_window_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &throttle_windows_msecs[THROTTLE_LEVEL_TIER1], 0, "");
SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_tier2_window_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &throttle_windows_msecs[THROTTLE_LEVEL_TIER2], 0, "");
SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_tier3_window_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &throttle_windows_msecs[THROTTLE_LEVEL_TIER3], 0, "");

SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_tier1_io_period_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &throttle_io_period_msecs[THROTTLE_LEVEL_TIER1], 0, "");
SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_tier2_io_period_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &throttle_io_period_msecs[THROTTLE_LEVEL_TIER2], 0, "");
SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_tier3_io_period_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &throttle_io_period_msecs[THROTTLE_LEVEL_TIER3], 0, "");

SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_tier1_io_period_ssd_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &throttle_io_period_ssd_msecs[THROTTLE_LEVEL_TIER1], 0, "");
SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_tier2_io_period_ssd_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &throttle_io_period_ssd_msecs[THROTTLE_LEVEL_TIER2], 0, "");
SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_tier3_io_period_ssd_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &throttle_io_period_ssd_msecs[THROTTLE_LEVEL_TIER3], 0, "");

SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_enabled, CTLFLAG_RW | CTLFLAG_LOCKED, &lowpri_throttle_enabled, 0, "");


/* Lock group used for every throttle_lock mutex initialised in this file. */
static LCK_GRP_DECLARE(throttle_lock_grp, "throttle I/O");
999 
1000 
/*
 * throttled I/O helper function
 *
 * Return the index of the lowest set bit of 'n', i.e. the number of zero
 * bits below it.  Callers use this to turn a throttle mask into a device
 * index.  When no bit is set the width of the argument in bits (64) is
 * returned.
 */
int
num_trailing_0(uint64_t n)
{
	int zeros;

	if (n == 0) {
		return (int)(sizeof(n) * 8);
	}

	/*
	 * The lowest set bit is almost always very close to bit 0, so a
	 * simple linear scan from the bottom is good enough here.
	 */
	for (zeros = 0; (n & 1) == 0; n >>= 1) {
		zeros++;
	}
	return zeros;
}
1022 
1023 
1024 /*
1025  * Release the reference and if the item was allocated and this is the last
1026  * reference then free it.
1027  *
1028  * This routine always returns the old value.
1029  */
1030 static int
throttle_info_rel(struct _throttle_io_info_t * info)1031 throttle_info_rel(struct _throttle_io_info_t *info)
1032 {
1033 	SInt32 oldValue = OSDecrementAtomic(&info->throttle_refcnt);
1034 
1035 	DEBUG_ALLOC_THROTTLE_INFO("refcnt = %d info = %p\n",
1036 	    info, (int)(oldValue - 1), info );
1037 
1038 	/* The reference count just went negative, very bad */
1039 	if (oldValue == 0) {
1040 		panic("throttle info ref cnt went negative!");
1041 	}
1042 
1043 	/*
1044 	 * Once reference count is zero, no one else should be able to take a
1045 	 * reference
1046 	 */
1047 	if ((oldValue == 1) && (info->throttle_alloc)) {
1048 		DEBUG_ALLOC_THROTTLE_INFO("Freeing info = %p\n", info);
1049 
1050 		lck_mtx_destroy(&info->throttle_lock, &throttle_lock_grp);
1051 		kfree_type(struct _throttle_io_info_t, info);
1052 	}
1053 	return oldValue;
1054 }
1055 
1056 
1057 /*
1058  * Just take a reference on the throttle info structure.
1059  *
1060  * This routine always returns the old value.
1061  */
1062 static SInt32
throttle_info_ref(struct _throttle_io_info_t * info)1063 throttle_info_ref(struct _throttle_io_info_t *info)
1064 {
1065 	SInt32 oldValue = OSIncrementAtomic(&info->throttle_refcnt);
1066 
1067 	DEBUG_ALLOC_THROTTLE_INFO("refcnt = %d info = %p\n",
1068 	    info, (int)(oldValue - 1), info );
1069 	/* Allocated items should never have a reference of zero */
1070 	if (info->throttle_alloc && (oldValue == 0)) {
1071 		panic("Taking a reference without calling create throttle info!");
1072 	}
1073 
1074 	return oldValue;
1075 }
1076 
/*
 * Decide whether any waiting tier is still being throttled and, if so,
 * (re)arm the throttle timer for the earliest relevant deadline.
 *
 * on entry the throttle_lock is held...
 * this function is responsible for taking
 * the reference on the info structure which
 * will keep it from going away while the
 * timer is running if it happens to have been
 * dynamically allocated by a network filesystem
 * kext which is now trying to free it (the
 * matching release is done by throttle_timer()
 * once no timer is active)
 *
 * Parameters:
 *   info             throttle state to evaluate
 *   update_io_count  when TRUE, open a new I/O period: snapshot the I/O
 *                    count, bump the period number, restamp the period
 *                    start times and set the minimum timer deadline
 *   wakelevel        only used when update_io_count is TRUE: the period
 *                    start timestamp of every level from wakelevel down to
 *                    THROTTLE_LEVEL_THROTTLED is set to the current time
 *
 * Returns the throttle level at which the scan stopped; this equals
 * THROTTLE_LEVEL_END when no waiting tier was found to be throttled.
 */
static uint32_t
throttle_timer_start(struct _throttle_io_info_t *info, boolean_t update_io_count, int wakelevel)
{
	struct timeval  elapsed;
	struct timeval  now;
	struct timeval  period;
	uint64_t        elapsed_msecs;
	int             throttle_level;
	int             level;
	int             msecs;
	boolean_t       throttled = FALSE;
	boolean_t       need_timer = FALSE;

	microuptime(&now);

	if (update_io_count == TRUE) {
		info->throttle_io_count_begin = info->throttle_io_count;
		info->throttle_io_period_num++;

		/* restart the I/O period of wakelevel and every level below it */
		while (wakelevel >= THROTTLE_LEVEL_THROTTLED) {
			info->throttle_start_IO_period_timestamp[wakelevel--] = now;
		}

		/* the timer must not fire before now + the THROTTLED-level period */
		info->throttle_min_timer_deadline = now;

		msecs = info->throttle_io_periods[THROTTLE_LEVEL_THROTTLED];
		period.tv_sec = msecs / 1000;
		period.tv_usec = (msecs % 1000) * 1000;

		timevaladd(&info->throttle_min_timer_deadline, &period);
	}
	/*
	 * For each tier, look at the first higher-numbered tier that has
	 * threads waiting and check whether it is still held back by this
	 * tier (window not yet expired, or I/O in flight at this tier).
	 */
	for (throttle_level = THROTTLE_LEVEL_START; throttle_level < THROTTLE_LEVEL_END; throttle_level++) {
		elapsed = now;
		timevalsub(&elapsed, &info->throttle_window_start_timestamp[throttle_level]);
		elapsed_msecs = (uint64_t)elapsed.tv_sec * (uint64_t)1000 + (elapsed.tv_usec / 1000);

		for (level = throttle_level + 1; level <= THROTTLE_LEVEL_END; level++) {
			if (!TAILQ_EMPTY(&info->throttle_uthlist[level])) {
				if (elapsed_msecs < (uint64_t)throttle_windows_msecs[level] || info->throttle_inflight_count[throttle_level]) {
					/*
					 * we had an I/O occur at a higher priority tier within
					 * this tier's throttle window
					 */
					throttled = TRUE;
				}
				/*
				 * we assume that the windows are the same or longer
				 * as we drop through the throttling tiers...  thus
				 * we can stop looking once we run into a tier with
				 * threads to schedule regardless of whether it's
				 * still in its throttling window or not
				 */
				break;
			}
		}
		if (throttled == TRUE) {
			break;
		}
	}
	if (throttled == TRUE) {
		uint64_t        deadline = 0;
		struct timeval  target;
		struct timeval  min_target;

		/*
		 * we've got at least one tier still in a throttled window
		 * so we need a timer running... compute the next deadline
		 * and schedule it
		 */
		for (level = throttle_level + 1; level <= THROTTLE_LEVEL_END; level++) {
			if (TAILQ_EMPTY(&info->throttle_uthlist[level])) {
				continue;
			}

			/* this tier's deadline is its period start plus its period */
			target = info->throttle_start_IO_period_timestamp[level];

			msecs = info->throttle_io_periods[level];
			period.tv_sec = msecs / 1000;
			period.tv_usec = (msecs % 1000) * 1000;

			timevaladd(&target, &period);

			/* keep the earliest deadline among the waiting tiers */
			if (need_timer == FALSE || timevalcmp(&target, &min_target, <)) {
				min_target = target;
				need_timer = TRUE;
			}
		}
		/*
		 * never fire earlier than the minimum deadline, as long as that
		 * deadline is still in the future
		 */
		if (timevalcmp(&info->throttle_min_timer_deadline, &now, >)) {
			if (timevalcmp(&info->throttle_min_timer_deadline, &min_target, >)) {
				min_target = info->throttle_min_timer_deadline;
			}
		}

		if (info->throttle_timer_active) {
			if (thread_call_cancel(info->throttle_timer_call) == FALSE) {
				/*
				 * couldn't kill the timer because it's already
				 * been dispatched, so don't try to start a new
				 * one... once we drop the lock, the timer will
				 * proceed and eventually re-run this function
				 */
				need_timer = FALSE;
			} else {
				info->throttle_timer_active = 0;
			}
		}
		if (need_timer == TRUE) {
			/*
			 * This is defined as an int (32-bit) rather than a 64-bit
			 * value because it would need a really big period in the
			 * order of ~500 days to overflow this. So, we let this be
			 * 32-bit which allows us to use the clock_interval_to_deadline()
			 * routine.
			 */
			int     target_msecs;

			if (info->throttle_timer_ref == 0) {
				/*
				 * take a reference for the timer
				 */
				throttle_info_ref(info);

				info->throttle_timer_ref = 1;
			}
			/* convert the absolute target into a delay from now, in msecs */
			elapsed = min_target;
			timevalsub(&elapsed, &now);
			target_msecs = (int)(elapsed.tv_sec * 1000 + elapsed.tv_usec / 1000);

			if (target_msecs <= 0) {
				/*
				 * we may have computed a deadline slightly in the past
				 * due to various factors... if so, just set the timer
				 * to go off in the near future (we don't need to be precise)
				 */
				target_msecs = 1;
			}
			clock_interval_to_deadline(target_msecs, 1000000, &deadline);

			thread_call_enter_delayed(info->throttle_timer_call, deadline);
			info->throttle_timer_active = 1;
		}
	}
	return throttle_level;
}
1231 
1232 
/*
 * Thread-call handler for the throttle timer of one throttle info.
 *
 * Under the throttle_lock it:
 *   - marks the timer as no longer active;
 *   - if the THROTTLE_LEVEL_THROTTLED I/O period has elapsed, scans the
 *     levels starting at throttle_next_wake_level for one whose own period
 *     has elapsed and that has a waiting thread, and picks the first thread
 *     on that list to wake;
 *   - calls throttle_timer_start() to open a new period (if a thread was
 *     picked) and to re-arm the timer if it is still needed;
 *   - wakes every thread queued on the levels from THROTTLE_LEVEL_THROTTLED
 *     up to the level returned by throttle_timer_start().
 *
 * If no timer ended up armed and the timer held a reference on 'info',
 * that reference is dropped after the lock has been released.
 *
 * The second parameter is unused.
 */
static void
throttle_timer(struct _throttle_io_info_t *info, __unused thread_call_param_t p)
{
	uthread_t       ut, utlist;
	struct timeval  elapsed;
	struct timeval  now;
	uint64_t        elapsed_msecs;
	int             throttle_level;
	int             level;
	int             wake_level;
	caddr_t         wake_address = NULL;
	boolean_t       update_io_count = FALSE;
	boolean_t       need_wakeup = FALSE;
	boolean_t       need_release = FALSE;

	ut = NULL;
	lck_mtx_lock(&info->throttle_lock);

	info->throttle_timer_active = 0;
	microuptime(&now);

	/* time since the THROTTLE_LEVEL_THROTTLED I/O period was started */
	elapsed = now;
	timevalsub(&elapsed, &info->throttle_start_IO_period_timestamp[THROTTLE_LEVEL_THROTTLED]);
	elapsed_msecs = (uint64_t)elapsed.tv_sec * (uint64_t)1000 + (elapsed.tv_usec / 1000);

	if (elapsed_msecs >= (uint64_t)info->throttle_io_periods[THROTTLE_LEVEL_THROTTLED]) {
		wake_level = info->throttle_next_wake_level;

		/*
		 * visit the levels in descending order starting at the saved
		 * next-wake level, wrapping back to THROTTLE_LEVEL_END instead
		 * of ever landing on THROTTLE_LEVEL_START
		 */
		for (level = THROTTLE_LEVEL_START; level < THROTTLE_LEVEL_END; level++) {
			elapsed = now;
			timevalsub(&elapsed, &info->throttle_start_IO_period_timestamp[wake_level]);
			elapsed_msecs = (uint64_t)elapsed.tv_sec * (uint64_t)1000 + (elapsed.tv_usec / 1000);

			if (elapsed_msecs >= (uint64_t)info->throttle_io_periods[wake_level] && !TAILQ_EMPTY(&info->throttle_uthlist[wake_level])) {
				/*
				 * we're closing out the current IO period...
				 * if we have a waiting thread, wake it up
				 * after we have reset the I/O window info
				 */
				need_wakeup = TRUE;
				update_io_count = TRUE;

				/* the next timer run starts its scan one level lower */
				info->throttle_next_wake_level = wake_level - 1;

				if (info->throttle_next_wake_level == THROTTLE_LEVEL_START) {
					info->throttle_next_wake_level = THROTTLE_LEVEL_END;
				}

				break;
			}
			wake_level--;

			if (wake_level == THROTTLE_LEVEL_START) {
				wake_level = THROTTLE_LEVEL_END;
			}
		}
	}
	if (need_wakeup == TRUE) {
		if (!TAILQ_EMPTY(&info->throttle_uthlist[wake_level])) {
			/* dequeue the first waiter now; the wakeup is issued below */
			ut = (uthread_t)TAILQ_FIRST(&info->throttle_uthlist[wake_level]);
			TAILQ_REMOVE(&info->throttle_uthlist[wake_level], ut, uu_throttlelist);
			ut->uu_on_throttlelist = THROTTLE_LEVEL_NONE;
			ut->uu_is_throttled = false;

			wake_address = (caddr_t)&ut->uu_on_throttlelist;
		}
	} else {
		/* nothing to wake: no period start timestamps get reset below */
		wake_level = THROTTLE_LEVEL_START;
	}

	throttle_level = throttle_timer_start(info, update_io_count, wake_level);

	if (wake_address != NULL) {
		wakeup(wake_address);
	}

	/*
	 * release every thread queued at or below the level returned by
	 * throttle_timer_start() (presumably those tiers are no longer
	 * inside a throttle window - confirm against throttle_timer_start)
	 */
	for (level = THROTTLE_LEVEL_THROTTLED; level <= throttle_level; level++) {
		TAILQ_FOREACH_SAFE(ut, &info->throttle_uthlist[level], uu_throttlelist, utlist) {
			TAILQ_REMOVE(&info->throttle_uthlist[level], ut, uu_throttlelist);
			ut->uu_on_throttlelist = THROTTLE_LEVEL_NONE;
			ut->uu_is_throttled = false;

			wakeup(&ut->uu_on_throttlelist);
		}
	}
	/* the timer was not re-armed, so give back the reference it held */
	if (info->throttle_timer_active == 0 && info->throttle_timer_ref) {
		info->throttle_timer_ref = 0;
		need_release = TRUE;
	}
	lck_mtx_unlock(&info->throttle_lock);

	/* done outside the lock: this may free 'info' */
	if (need_release == TRUE) {
		throttle_info_rel(info);
	}
}
1328 
1329 
1330 static int
throttle_add_to_list(struct _throttle_io_info_t * info,uthread_t ut,int mylevel,boolean_t insert_tail)1331 throttle_add_to_list(struct _throttle_io_info_t *info, uthread_t ut, int mylevel, boolean_t insert_tail)
1332 {
1333 	boolean_t start_timer = FALSE;
1334 	int level = THROTTLE_LEVEL_START;
1335 
1336 	if (TAILQ_EMPTY(&info->throttle_uthlist[mylevel])) {
1337 		info->throttle_start_IO_period_timestamp[mylevel] = info->throttle_last_IO_timestamp[mylevel];
1338 		start_timer = TRUE;
1339 	}
1340 
1341 	if (insert_tail == TRUE) {
1342 		TAILQ_INSERT_TAIL(&info->throttle_uthlist[mylevel], ut, uu_throttlelist);
1343 	} else {
1344 		TAILQ_INSERT_HEAD(&info->throttle_uthlist[mylevel], ut, uu_throttlelist);
1345 	}
1346 
1347 	ut->uu_on_throttlelist = (int8_t)mylevel;
1348 
1349 	if (start_timer == TRUE) {
1350 		/* we may need to start or rearm the timer */
1351 		level = throttle_timer_start(info, FALSE, THROTTLE_LEVEL_START);
1352 
1353 		if (level == THROTTLE_LEVEL_END) {
1354 			if (ut->uu_on_throttlelist >= THROTTLE_LEVEL_THROTTLED) {
1355 				TAILQ_REMOVE(&info->throttle_uthlist[ut->uu_on_throttlelist], ut, uu_throttlelist);
1356 
1357 				ut->uu_on_throttlelist = THROTTLE_LEVEL_NONE;
1358 			}
1359 		}
1360 	}
1361 	return level;
1362 }
1363 
1364 static void
throttle_init_throttle_window(void)1365 throttle_init_throttle_window(void)
1366 {
1367 	int throttle_window_size;
1368 
1369 	/*
1370 	 * The hierarchy of throttle window values is as follows:
1371 	 * - Global defaults
1372 	 * - Device tree properties
1373 	 * - Boot-args
1374 	 * All values are specified in msecs.
1375 	 */
1376 
1377 #if (XNU_TARGET_OS_OSX && __arm64__)
1378 	/*
1379 	 * IO Tier EDT overrides are meant for
1380 	 * some arm platforms but not for
1381 	 * macs.
1382 	 */
1383 #else /* (XNU_TARGET_OS_OSX && __arm64__) */
1384 	/* Override global values with device-tree properties */
1385 	if (PE_get_default("kern.io_throttle_window_tier1", &throttle_window_size, sizeof(throttle_window_size))) {
1386 		throttle_windows_msecs[THROTTLE_LEVEL_TIER1] = throttle_window_size;
1387 	}
1388 
1389 	if (PE_get_default("kern.io_throttle_window_tier2", &throttle_window_size, sizeof(throttle_window_size))) {
1390 		throttle_windows_msecs[THROTTLE_LEVEL_TIER2] = throttle_window_size;
1391 	}
1392 
1393 	if (PE_get_default("kern.io_throttle_window_tier3", &throttle_window_size, sizeof(throttle_window_size))) {
1394 		throttle_windows_msecs[THROTTLE_LEVEL_TIER3] = throttle_window_size;
1395 	}
1396 #endif /* (XNU_TARGET_OS_OSX && __arm64__) */
1397 
1398 	/* Override with boot-args */
1399 	if (PE_parse_boot_argn("io_throttle_window_tier1", &throttle_window_size, sizeof(throttle_window_size))) {
1400 		throttle_windows_msecs[THROTTLE_LEVEL_TIER1] = throttle_window_size;
1401 	}
1402 
1403 	if (PE_parse_boot_argn("io_throttle_window_tier2", &throttle_window_size, sizeof(throttle_window_size))) {
1404 		throttle_windows_msecs[THROTTLE_LEVEL_TIER2] = throttle_window_size;
1405 	}
1406 
1407 	if (PE_parse_boot_argn("io_throttle_window_tier3", &throttle_window_size, sizeof(throttle_window_size))) {
1408 		throttle_windows_msecs[THROTTLE_LEVEL_TIER3] = throttle_window_size;
1409 	}
1410 }
1411 
1412 static void
throttle_init_throttle_period(struct _throttle_io_info_t * info,boolean_t isssd)1413 throttle_init_throttle_period(struct _throttle_io_info_t *info, boolean_t isssd)
1414 {
1415 	int throttle_period_size;
1416 
1417 	/*
1418 	 * The hierarchy of throttle period values is as follows:
1419 	 * - Global defaults
1420 	 * - Device tree properties
1421 	 * - Boot-args
1422 	 * All values are specified in msecs.
1423 	 */
1424 
1425 	/* Assign global defaults */
1426 	if ((isssd == TRUE) && (info->throttle_is_fusion_with_priority == 0)) {
1427 		info->throttle_io_periods = &throttle_io_period_ssd_msecs[0];
1428 	} else {
1429 		info->throttle_io_periods = &throttle_io_period_msecs[0];
1430 	}
1431 
1432 #if (XNU_TARGET_OS_OSX && __arm64__)
1433 	/*
1434 	 * IO Tier EDT overrides are meant for
1435 	 * some arm platforms but not for
1436 	 * macs.
1437 	 */
1438 #else /* (XNU_TARGET_OS_OSX && __arm64__) */
1439 	/* Override global values with device-tree properties */
1440 	if (PE_get_default("kern.io_throttle_period_tier1", &throttle_period_size, sizeof(throttle_period_size))) {
1441 		info->throttle_io_periods[THROTTLE_LEVEL_TIER1] = throttle_period_size;
1442 	}
1443 
1444 	if (PE_get_default("kern.io_throttle_period_tier2", &throttle_period_size, sizeof(throttle_period_size))) {
1445 		info->throttle_io_periods[THROTTLE_LEVEL_TIER2] = throttle_period_size;
1446 	}
1447 
1448 	if (PE_get_default("kern.io_throttle_period_tier3", &throttle_period_size, sizeof(throttle_period_size))) {
1449 		info->throttle_io_periods[THROTTLE_LEVEL_TIER3] = throttle_period_size;
1450 	}
1451 #endif /* (XNU_TARGET_OS_OSX && __arm64__) */
1452 
1453 	/* Override with boot-args */
1454 	if (PE_parse_boot_argn("io_throttle_period_tier1", &throttle_period_size, sizeof(throttle_period_size))) {
1455 		info->throttle_io_periods[THROTTLE_LEVEL_TIER1] = throttle_period_size;
1456 	}
1457 
1458 	if (PE_parse_boot_argn("io_throttle_period_tier2", &throttle_period_size, sizeof(throttle_period_size))) {
1459 		info->throttle_io_periods[THROTTLE_LEVEL_TIER2] = throttle_period_size;
1460 	}
1461 
1462 	if (PE_parse_boot_argn("io_throttle_period_tier3", &throttle_period_size, sizeof(throttle_period_size))) {
1463 		info->throttle_io_periods[THROTTLE_LEVEL_TIER3] = throttle_period_size;
1464 	}
1465 }
1466 
1467 #if CONFIG_IOSCHED
1468 int     iosched_enabled = 1;
1469 #endif
1470 
1471 void
throttle_init(void)1472 throttle_init(void)
1473 {
1474 	struct _throttle_io_info_t *info;
1475 	int     i;
1476 	int     level;
1477 #if CONFIG_IOSCHED
1478 	int     iosched;
1479 #endif
1480 
1481 	/* Update throttle parameters based on device tree configuration */
1482 	throttle_init_throttle_window();
1483 
1484 	for (i = 0; i < LOWPRI_MAX_NUM_DEV; i++) {
1485 		info = &_throttle_io_info[i];
1486 
1487 		lck_mtx_init(&info->throttle_lock, &throttle_lock_grp, LCK_ATTR_NULL);
1488 		info->throttle_timer_call = thread_call_allocate((thread_call_func_t)throttle_timer, (thread_call_param_t)info);
1489 
1490 		for (level = 0; level <= THROTTLE_LEVEL_END; level++) {
1491 			TAILQ_INIT(&info->throttle_uthlist[level]);
1492 			info->throttle_last_IO_pid[level] = 0;
1493 			info->throttle_inflight_count[level] = 0;
1494 		}
1495 		info->throttle_next_wake_level = THROTTLE_LEVEL_END;
1496 		info->throttle_disabled = 0;
1497 		info->throttle_is_fusion_with_priority = 0;
1498 	}
1499 #if CONFIG_IOSCHED
1500 	if (PE_parse_boot_argn("iosched", &iosched, sizeof(iosched))) {
1501 		iosched_enabled = iosched;
1502 	}
1503 	if (iosched_enabled) {
1504 		/* Initialize I/O Reprioritization mechanism */
1505 		vm_io_reprioritize_init();
1506 	}
1507 #endif
1508 }
1509 
1510 void
sys_override_io_throttle(boolean_t enable_override)1511 sys_override_io_throttle(boolean_t enable_override)
1512 {
1513 	if (enable_override) {
1514 		lowpri_throttle_enabled = 0;
1515 	} else {
1516 		lowpri_throttle_enabled = 1;
1517 	}
1518 }
1519 
1520 int rethrottle_wakeups = 0;
1521 
1522 /*
1523  * the uu_rethrottle_lock is used to synchronize this function
1524  * with "throttle_lowpri_io" which is where a throttled thread
1525  * will block... that function will grab this lock before beginning
1526  * it's decision making process concerning the need to block, and
1527  * hold it through the assert_wait.  When that thread is awakened
1528  * for any reason (timer or rethrottle), it will reacquire the
1529  * uu_rethrottle_lock before determining if it really is ok for
1530  * it to now run.  This is the point at which the thread could
1531  * enter a different throttling queue and reblock or return from
1532  * the throttle w/o having waited out it's entire throttle if
1533  * the rethrottle has now moved it out of any currently
1534  * active throttle window.
1535  *
1536  *
1537  * NOTES:
1538  * 1 - This may be called with the task lock held.
1539  * 2 - This may be called with preemption and interrupts disabled
1540  *     in the kqueue wakeup path so we can't take the throttle_lock which is a mutex
1541  * 3 - This cannot safely dereference uu_throttle_info, as it may
1542  *     get deallocated out from under us
1543  */
1544 
1545 void
rethrottle_thread(uthread_t ut)1546 rethrottle_thread(uthread_t ut)
1547 {
1548 	/*
1549 	 * If uthread doesn't have throttle state, then there's no chance
1550 	 * of it needing a rethrottle.
1551 	 */
1552 	if (ut->uu_throttle_info == NULL) {
1553 		return;
1554 	}
1555 
1556 	boolean_t s = ml_set_interrupts_enabled(FALSE);
1557 	lck_spin_lock(&ut->uu_rethrottle_lock);
1558 
1559 	if (!ut->uu_is_throttled) {
1560 		ut->uu_was_rethrottled = true;
1561 	} else {
1562 		int my_new_level = throttle_get_thread_throttle_level(ut);
1563 
1564 		if (my_new_level != ut->uu_on_throttlelist) {
1565 			/*
1566 			 * ut is currently blocked (as indicated by
1567 			 * ut->uu_is_throttled == true)
1568 			 * and we're changing it's throttle level, so
1569 			 * we need to wake it up.
1570 			 */
1571 			ut->uu_is_throttled = false;
1572 			wakeup(&ut->uu_on_throttlelist);
1573 
1574 			rethrottle_wakeups++;
1575 			KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_FSRW, 102)),
1576 			    uthread_tid(ut), ut->uu_on_throttlelist, my_new_level, 0, 0);
1577 		}
1578 	}
1579 	lck_spin_unlock(&ut->uu_rethrottle_lock);
1580 	ml_set_interrupts_enabled(s);
1581 }
1582 
1583 
1584 /*
1585  * KPI routine
1586  *
1587  * Create and take a reference on a throttle info structure and return a
1588  * pointer for the file system to use when calling throttle_info_update.
1589  * Calling file system must have a matching release for every create.
1590  */
1591 void *
throttle_info_create(void)1592 throttle_info_create(void)
1593 {
1594 	struct _throttle_io_info_t *info;
1595 	int     level;
1596 
1597 	info = kalloc_type(struct _throttle_io_info_t,
1598 	    Z_ZERO | Z_WAITOK | Z_NOFAIL);
1599 	/* Mark that this one was allocated and needs to be freed */
1600 	DEBUG_ALLOC_THROTTLE_INFO("Creating info = %p\n", info, info );
1601 	info->throttle_alloc = TRUE;
1602 
1603 	lck_mtx_init(&info->throttle_lock, &throttle_lock_grp, LCK_ATTR_NULL);
1604 	info->throttle_timer_call = thread_call_allocate((thread_call_func_t)throttle_timer, (thread_call_param_t)info);
1605 
1606 	for (level = 0; level <= THROTTLE_LEVEL_END; level++) {
1607 		TAILQ_INIT(&info->throttle_uthlist[level]);
1608 	}
1609 	info->throttle_next_wake_level = THROTTLE_LEVEL_END;
1610 
1611 	/* Take a reference */
1612 	OSIncrementAtomic(&info->throttle_refcnt);
1613 	return info;
1614 }
1615 
1616 /*
1617  * KPI routine
1618  *
1619  * Release the throttle info pointer if all the reference are gone. Should be
1620  * called to release reference taken by throttle_info_create
1621  */
1622 void
throttle_info_release(void * throttle_info)1623 throttle_info_release(void *throttle_info)
1624 {
1625 	DEBUG_ALLOC_THROTTLE_INFO("Releaseing info = %p\n",
1626 	    (struct _throttle_io_info_t *)throttle_info,
1627 	    (struct _throttle_io_info_t *)throttle_info);
1628 	if (throttle_info) { /* Just to be careful */
1629 		throttle_info_rel(throttle_info);
1630 	}
1631 }
1632 
1633 /*
1634  * KPI routine
1635  *
1636  * File Systems that create an info structure, need to call this routine in
1637  * their mount routine (used by cluster code). File Systems that call this in
1638  * their mount routines must call throttle_info_mount_rel in their unmount
1639  * routines.
1640  */
1641 void
throttle_info_mount_ref(mount_t mp,void * throttle_info)1642 throttle_info_mount_ref(mount_t mp, void *throttle_info)
1643 {
1644 	if ((throttle_info == NULL) || (mp == NULL)) {
1645 		return;
1646 	}
1647 	throttle_info_ref(throttle_info);
1648 
1649 	/*
1650 	 * We already have a reference release it before adding the new one
1651 	 */
1652 	if (mp->mnt_throttle_info) {
1653 		throttle_info_rel(mp->mnt_throttle_info);
1654 	}
1655 	mp->mnt_throttle_info = throttle_info;
1656 }
1657 
1658 /*
1659  * Private KPI routine
1660  *
1661  * return a handle for accessing throttle_info given a throttle_mask.  The
1662  * handle must be released by throttle_info_rel_by_mask
1663  */
1664 int
throttle_info_ref_by_mask(uint64_t throttle_mask,throttle_info_handle_t * throttle_info_handle)1665 throttle_info_ref_by_mask(uint64_t throttle_mask, throttle_info_handle_t *throttle_info_handle)
1666 {
1667 	int     dev_index;
1668 	struct _throttle_io_info_t *info;
1669 
1670 	/*
1671 	 * The 'throttle_mask' is not expected to be 0 otherwise num_trailing_0()
1672 	 * would return value of 64 and this will cause '_throttle_io_info' to
1673 	 * go out of bounds as '_throttle_io_info' is only LOWPRI_MAX_NUM_DEV (64)
1674 	 * elements long.
1675 	 */
1676 	if (throttle_info_handle == NULL || throttle_mask == 0) {
1677 		return EINVAL;
1678 	}
1679 
1680 	dev_index = num_trailing_0(throttle_mask);
1681 	info = &_throttle_io_info[dev_index];
1682 	throttle_info_ref(info);
1683 	*(struct _throttle_io_info_t**)throttle_info_handle = info;
1684 
1685 	return 0;
1686 }
1687 
1688 /*
1689  * Private KPI routine
1690  *
1691  * release the handle obtained by throttle_info_ref_by_mask
1692  */
1693 void
throttle_info_rel_by_mask(throttle_info_handle_t throttle_info_handle)1694 throttle_info_rel_by_mask(throttle_info_handle_t throttle_info_handle)
1695 {
1696 	/*
1697 	 * for now the handle is just a pointer to _throttle_io_info_t
1698 	 */
1699 	throttle_info_rel((struct _throttle_io_info_t*)throttle_info_handle);
1700 }
1701 
1702 /*
1703  * KPI routine
1704  *
1705  * File Systems that throttle_info_mount_ref, must call this routine in their
1706  * umount routine.
1707  */
1708 void
throttle_info_mount_rel(mount_t mp)1709 throttle_info_mount_rel(mount_t mp)
1710 {
1711 	if (mp->mnt_throttle_info) {
1712 		throttle_info_rel(mp->mnt_throttle_info);
1713 	}
1714 	mp->mnt_throttle_info = NULL;
1715 }
1716 
1717 /*
1718  * Reset throttling periods for the given mount point
1719  *
1720  * private interface used by disk conditioner to reset
1721  * throttling periods when 'is_ssd' status changes
1722  */
1723 void
throttle_info_mount_reset_period(mount_t mp,int isssd)1724 throttle_info_mount_reset_period(mount_t mp, int isssd)
1725 {
1726 	struct _throttle_io_info_t *info;
1727 
1728 	if (mp == NULL) {
1729 		info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];
1730 	} else if (mp->mnt_throttle_info == NULL) {
1731 		info = &_throttle_io_info[mp->mnt_devbsdunit];
1732 	} else {
1733 		info = mp->mnt_throttle_info;
1734 	}
1735 
1736 	throttle_init_throttle_period(info, isssd);
1737 }
1738 
1739 void
throttle_info_get_last_io_time(mount_t mp,struct timeval * tv)1740 throttle_info_get_last_io_time(mount_t mp, struct timeval *tv)
1741 {
1742 	struct _throttle_io_info_t *info;
1743 
1744 	if (mp == NULL) {
1745 		info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];
1746 	} else if (mp->mnt_throttle_info == NULL) {
1747 		info = &_throttle_io_info[mp->mnt_devbsdunit];
1748 	} else {
1749 		info = mp->mnt_throttle_info;
1750 	}
1751 
1752 	*tv = info->throttle_last_write_timestamp;
1753 }
1754 
1755 void
update_last_io_time(mount_t mp)1756 update_last_io_time(mount_t mp)
1757 {
1758 	struct _throttle_io_info_t *info;
1759 
1760 	if (mp == NULL) {
1761 		info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];
1762 	} else if (mp->mnt_throttle_info == NULL) {
1763 		info = &_throttle_io_info[mp->mnt_devbsdunit];
1764 	} else {
1765 		info = mp->mnt_throttle_info;
1766 	}
1767 
1768 	microuptime(&info->throttle_last_write_timestamp);
1769 	if (mp != NULL) {
1770 		mp->mnt_last_write_completed_timestamp = info->throttle_last_write_timestamp;
1771 	}
1772 }
1773 
1774 int
throttle_get_io_policy(uthread_t * ut)1775 throttle_get_io_policy(uthread_t *ut)
1776 {
1777 	if (ut != NULL) {
1778 		*ut = current_uthread();
1779 	}
1780 
1781 	return proc_get_effective_thread_policy(current_thread(), TASK_POLICY_IO);
1782 }
1783 
1784 int
throttle_get_passive_io_policy(uthread_t * ut)1785 throttle_get_passive_io_policy(uthread_t *ut)
1786 {
1787 	if (ut != NULL) {
1788 		*ut = current_uthread();
1789 	}
1790 
1791 	return proc_get_effective_thread_policy(current_thread(), TASK_POLICY_PASSIVE_IO);
1792 }
1793 
1794 
1795 static int
throttle_get_thread_throttle_level(uthread_t ut)1796 throttle_get_thread_throttle_level(uthread_t ut)
1797 {
1798 	uthread_t *ut_p = (ut == NULL) ? &ut : NULL;
1799 	int io_tier = throttle_get_io_policy(ut_p);
1800 
1801 	return throttle_get_thread_throttle_level_internal(ut, io_tier);
1802 }
1803 
1804 /*
1805  * Return a throttle level given an existing I/O tier (such as returned by throttle_get_io_policy)
1806  */
1807 static int
throttle_get_thread_throttle_level_internal(uthread_t ut,int io_tier)1808 throttle_get_thread_throttle_level_internal(uthread_t ut, int io_tier)
1809 {
1810 	int thread_throttle_level = io_tier;
1811 	int user_idle_level;
1812 
1813 	assert(ut != NULL);
1814 
1815 	/* Bootcache misses should always be throttled */
1816 	if (ut->uu_throttle_bc) {
1817 		thread_throttle_level = THROTTLE_LEVEL_TIER3;
1818 	}
1819 
1820 	/*
1821 	 * Issue tier3 I/O as tier2 when the user is idle
1822 	 * to allow maintenance tasks to make more progress.
1823 	 *
1824 	 * Assume any positive idle level is enough... for now it's
1825 	 * only ever 0 or 128 but this is not defined anywhere.
1826 	 */
1827 	if (thread_throttle_level >= THROTTLE_LEVEL_TIER3) {
1828 		user_idle_level = timer_get_user_idle_level();
1829 		if (user_idle_level > 0) {
1830 			thread_throttle_level--;
1831 		}
1832 	}
1833 
1834 	return thread_throttle_level;
1835 }
1836 
1837 /*
1838  * I/O will be throttled if either of the following are true:
1839  *   - Higher tiers have in-flight I/O
1840  *   - The time delta since the last start/completion of a higher tier is within the throttle window interval
1841  *
1842  * In-flight I/O is bookended by throttle_info_update_internal/throttle_info_end_io_internal
1843  */
1844 static int
throttle_io_will_be_throttled_internal(void * throttle_info,int * mylevel,int * throttling_level)1845 throttle_io_will_be_throttled_internal(void * throttle_info, int * mylevel, int * throttling_level)
1846 {
1847 	struct _throttle_io_info_t *info = throttle_info;
1848 	struct timeval elapsed;
1849 	struct timeval now;
1850 	uint64_t elapsed_msecs;
1851 	int     thread_throttle_level;
1852 	int     throttle_level;
1853 
1854 	if ((thread_throttle_level = throttle_get_thread_throttle_level(NULL)) < THROTTLE_LEVEL_THROTTLED) {
1855 		return THROTTLE_DISENGAGED;
1856 	}
1857 
1858 	microuptime(&now);
1859 
1860 	for (throttle_level = THROTTLE_LEVEL_START; throttle_level < thread_throttle_level; throttle_level++) {
1861 		if (info->throttle_inflight_count[throttle_level]) {
1862 			break;
1863 		}
1864 		elapsed = now;
1865 		timevalsub(&elapsed, &info->throttle_window_start_timestamp[throttle_level]);
1866 		elapsed_msecs = (uint64_t)elapsed.tv_sec * (uint64_t)1000 + (elapsed.tv_usec / 1000);
1867 
1868 		if (elapsed_msecs < (uint64_t)throttle_windows_msecs[thread_throttle_level]) {
1869 			break;
1870 		}
1871 	}
1872 	if (throttle_level >= thread_throttle_level) {
1873 		/*
1874 		 * we're beyond all of the throttle windows
1875 		 * that affect the throttle level of this thread,
1876 		 * so go ahead and treat as normal I/O
1877 		 */
1878 		return THROTTLE_DISENGAGED;
1879 	}
1880 	if (mylevel) {
1881 		*mylevel = thread_throttle_level;
1882 	}
1883 	if (throttling_level) {
1884 		*throttling_level = throttle_level;
1885 	}
1886 
1887 	if (info->throttle_io_count != info->throttle_io_count_begin) {
1888 		/*
1889 		 * we've already issued at least one throttleable I/O
1890 		 * in the current I/O window, so avoid issuing another one
1891 		 */
1892 		return THROTTLE_NOW;
1893 	}
1894 	/*
1895 	 * we're in the throttle window, so
1896 	 * cut the I/O size back
1897 	 */
1898 	return THROTTLE_ENGAGED;
1899 }
1900 
1901 /*
1902  * If we have a mount point and it has a throttle info pointer then
1903  * use it to do the check, otherwise use the device unit number to find
1904  * the correct throttle info array element.
1905  */
1906 int
throttle_io_will_be_throttled(__unused int lowpri_window_msecs,mount_t mp)1907 throttle_io_will_be_throttled(__unused int lowpri_window_msecs, mount_t mp)
1908 {
1909 	struct _throttle_io_info_t      *info;
1910 
1911 	/*
1912 	 * Should we just return zero if no mount point
1913 	 */
1914 	if (mp == NULL) {
1915 		info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];
1916 	} else if (mp->mnt_throttle_info == NULL) {
1917 		info = &_throttle_io_info[mp->mnt_devbsdunit];
1918 	} else {
1919 		info = mp->mnt_throttle_info;
1920 	}
1921 
1922 	if (info->throttle_is_fusion_with_priority) {
1923 		uthread_t ut = current_uthread();
1924 		if (ut->uu_lowpri_window == 0) {
1925 			return THROTTLE_DISENGAGED;
1926 		}
1927 	}
1928 
1929 	if (info->throttle_disabled) {
1930 		return THROTTLE_DISENGAGED;
1931 	} else {
1932 		return throttle_io_will_be_throttled_internal(info, NULL, NULL);
1933 	}
1934 }
1935 
1936 /*
1937  * Routine to increment I/O throttling counters maintained in the proc
1938  */
1939 
1940 static void
throttle_update_proc_stats(pid_t throttling_pid,int count)1941 throttle_update_proc_stats(pid_t throttling_pid, int count)
1942 {
1943 	proc_t throttling_proc;
1944 	proc_t throttled_proc = current_proc();
1945 
1946 	/* The throttled_proc is always the current proc; so we are not concerned with refs */
1947 	OSAddAtomic64(count, &(throttled_proc->was_throttled));
1948 
1949 	/* The throttling pid might have exited by now */
1950 	throttling_proc = proc_find(throttling_pid);
1951 	if (throttling_proc != PROC_NULL) {
1952 		OSAddAtomic64(count, &(throttling_proc->did_throttle));
1953 		proc_rele(throttling_proc);
1954 	}
1955 }
1956 
1957 /*
1958  * Block until woken up by the throttle timer or by a rethrottle call.
1959  * As long as we hold the throttle_lock while querying the throttle tier, we're
1960  * safe against seeing an old throttle tier after a rethrottle.
1961  */
1962 uint32_t
throttle_lowpri_io(int sleep_amount)1963 throttle_lowpri_io(int sleep_amount)
1964 {
1965 	uthread_t ut;
1966 	struct _throttle_io_info_t *info;
1967 	int     throttle_type = 0;
1968 	int     mylevel = 0;
1969 	int     throttling_level = THROTTLE_LEVEL_NONE;
1970 	int     sleep_cnt = 0;
1971 	uint32_t  throttle_io_period_num = 0;
1972 	boolean_t insert_tail = TRUE;
1973 	boolean_t s;
1974 
1975 	ut = current_uthread();
1976 
1977 	if (ut->uu_lowpri_window == 0) {
1978 		return 0;
1979 	}
1980 	if (current_thread_in_kernel_fault()) {
1981 		/* do not throttle kernel faults */
1982 		return 0;
1983 	}
1984 
1985 	info = ut->uu_throttle_info;
1986 
1987 	if (info == NULL) {
1988 		ut->uu_throttle_bc = false;
1989 		ut->uu_lowpri_window = 0;
1990 		return 0;
1991 	}
1992 	lck_mtx_lock(&info->throttle_lock);
1993 	assert(ut->uu_on_throttlelist < THROTTLE_LEVEL_THROTTLED);
1994 
1995 	if (sleep_amount == 0) {
1996 		goto done;
1997 	}
1998 
1999 	if (sleep_amount == 1 && !ut->uu_throttle_bc) {
2000 		sleep_amount = 0;
2001 	}
2002 
2003 	throttle_io_period_num = info->throttle_io_period_num;
2004 
2005 	ut->uu_was_rethrottled = false;
2006 
2007 	while ((throttle_type = throttle_io_will_be_throttled_internal(info, &mylevel, &throttling_level))) {
2008 		if (throttle_type == THROTTLE_ENGAGED) {
2009 			if (sleep_amount == 0) {
2010 				break;
2011 			}
2012 			if (info->throttle_io_period_num < throttle_io_period_num) {
2013 				break;
2014 			}
2015 			if ((info->throttle_io_period_num - throttle_io_period_num) >= (uint32_t)sleep_amount) {
2016 				break;
2017 			}
2018 		}
2019 		/*
2020 		 * keep the same position in the list if "rethrottle_thread" changes our throttle level  and
2021 		 * then puts us back to the original level before we get a chance to run
2022 		 */
2023 		if (ut->uu_on_throttlelist >= THROTTLE_LEVEL_THROTTLED && ut->uu_on_throttlelist != mylevel) {
2024 			/*
2025 			 * must have been awakened via "rethrottle_thread" (the timer pulls us off the list)
2026 			 * and we've changed our throttling level, so pull ourselves off of the appropriate list
2027 			 * and make sure we get put on the tail of the new list since we're starting anew w/r to
2028 			 * the throttling engine
2029 			 */
2030 			TAILQ_REMOVE(&info->throttle_uthlist[ut->uu_on_throttlelist], ut, uu_throttlelist);
2031 			ut->uu_on_throttlelist = THROTTLE_LEVEL_NONE;
2032 			insert_tail = TRUE;
2033 		}
2034 		if (ut->uu_on_throttlelist < THROTTLE_LEVEL_THROTTLED) {
2035 			if (throttle_add_to_list(info, ut, mylevel, insert_tail) == THROTTLE_LEVEL_END) {
2036 				goto done;
2037 			}
2038 		}
2039 		assert(throttling_level >= THROTTLE_LEVEL_START && throttling_level <= THROTTLE_LEVEL_END);
2040 
2041 		s = ml_set_interrupts_enabled(FALSE);
2042 		lck_spin_lock(&ut->uu_rethrottle_lock);
2043 
2044 		/*
2045 		 * this is the critical section w/r to our interaction
2046 		 * with "rethrottle_thread"
2047 		 */
2048 		if (ut->uu_was_rethrottled) {
2049 			lck_spin_unlock(&ut->uu_rethrottle_lock);
2050 			ml_set_interrupts_enabled(s);
2051 			lck_mtx_yield(&info->throttle_lock);
2052 
2053 			KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_FSRW, 103)),
2054 			    uthread_tid(ut), ut->uu_on_throttlelist, 0, 0, 0);
2055 
2056 			ut->uu_was_rethrottled = false;
2057 			continue;
2058 		}
2059 		KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_THROTTLE, PROCESS_THROTTLED)) | DBG_FUNC_NONE,
2060 		    info->throttle_last_IO_pid[throttling_level], throttling_level, proc_selfpid(), mylevel, 0);
2061 
2062 		if (sleep_cnt == 0) {
2063 			KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_START,
2064 			    throttle_windows_msecs[mylevel], info->throttle_io_periods[mylevel], info->throttle_io_count, 0, 0);
2065 			throttled_count[mylevel]++;
2066 		}
2067 		ut->uu_wmesg = "throttle_lowpri_io";
2068 
2069 		assert_wait((caddr_t)&ut->uu_on_throttlelist, THREAD_UNINT);
2070 
2071 		ut->uu_is_throttled = true;
2072 		lck_spin_unlock(&ut->uu_rethrottle_lock);
2073 		ml_set_interrupts_enabled(s);
2074 
2075 		lck_mtx_unlock(&info->throttle_lock);
2076 
2077 		thread_block(THREAD_CONTINUE_NULL);
2078 
2079 		ut->uu_wmesg = NULL;
2080 
2081 		ut->uu_is_throttled = false;
2082 		ut->uu_was_rethrottled = false;
2083 
2084 		lck_mtx_lock(&info->throttle_lock);
2085 
2086 		sleep_cnt++;
2087 
2088 		if (sleep_amount == 0) {
2089 			insert_tail = FALSE;
2090 		} else if (info->throttle_io_period_num < throttle_io_period_num ||
2091 		    (info->throttle_io_period_num - throttle_io_period_num) >= (uint32_t)sleep_amount) {
2092 			insert_tail = FALSE;
2093 			sleep_amount = 0;
2094 		}
2095 	}
2096 done:
2097 	if (ut->uu_on_throttlelist >= THROTTLE_LEVEL_THROTTLED) {
2098 		TAILQ_REMOVE(&info->throttle_uthlist[ut->uu_on_throttlelist], ut, uu_throttlelist);
2099 		ut->uu_on_throttlelist = THROTTLE_LEVEL_NONE;
2100 	}
2101 	lck_mtx_unlock(&info->throttle_lock);
2102 
2103 	if (sleep_cnt) {
2104 		KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_END,
2105 		    throttle_windows_msecs[mylevel], info->throttle_io_periods[mylevel], info->throttle_io_count, 0, 0);
2106 		/*
2107 		 * We update the stats for the last pid which opened a throttle window for the throttled thread.
2108 		 * This might not be completely accurate since the multiple throttles seen by the lower tier pid
2109 		 * might have been caused by various higher prio pids. However, updating these stats accurately
2110 		 * means doing a proc_find while holding the throttle lock which leads to deadlock.
2111 		 */
2112 		throttle_update_proc_stats(info->throttle_last_IO_pid[throttling_level], sleep_cnt);
2113 	}
2114 
2115 	ut->uu_throttle_info = NULL;
2116 	ut->uu_throttle_bc = false;
2117 	ut->uu_lowpri_window = 0;
2118 
2119 	throttle_info_rel(info);
2120 
2121 	return sleep_cnt;
2122 }
2123 
2124 /*
2125  *  returns TRUE if the throttle_lowpri_io called with the same sleep_amount would've slept
2126  *  This function mimics the most of the throttle_lowpri_io checks but without actual sleeping
2127  */
2128 int
throttle_lowpri_io_will_be_throttled(int sleep_amount)2129 throttle_lowpri_io_will_be_throttled(int sleep_amount)
2130 {
2131 	if (sleep_amount == 0) {
2132 		return FALSE;
2133 	}
2134 
2135 	uthread_t ut = current_uthread();
2136 	if (ut->uu_lowpri_window == 0) {
2137 		return FALSE;
2138 	}
2139 
2140 	struct _throttle_io_info_t *info = ut->uu_throttle_info;
2141 	if (info == NULL) {
2142 		return FALSE;
2143 	}
2144 
2145 	lck_mtx_lock(&info->throttle_lock);
2146 	assert(ut->uu_on_throttlelist < THROTTLE_LEVEL_THROTTLED);
2147 
2148 	if (sleep_amount == 1 && !ut->uu_throttle_bc) {
2149 		sleep_amount = 0;
2150 	}
2151 
2152 	int result = FALSE;
2153 
2154 	int throttle_type = throttle_io_will_be_throttled_internal(info, NULL, NULL);
2155 	if (throttle_type > THROTTLE_DISENGAGED) {
2156 		result = TRUE;
2157 		if ((throttle_type == THROTTLE_ENGAGED) && (sleep_amount == 0)) {
2158 			result = FALSE;
2159 		}
2160 	}
2161 
2162 	lck_mtx_unlock(&info->throttle_lock);
2163 
2164 	return result;
2165 }
2166 
2167 
2168 /*
2169  * KPI routine
2170  *
2171  * set a kernel thread's IO policy.  policy can be:
2172  * IOPOL_NORMAL, IOPOL_THROTTLE, IOPOL_PASSIVE, IOPOL_UTILITY, IOPOL_STANDARD
2173  *
2174  * explanations about these policies are in the man page of setiopolicy_np
2175  */
2176 void
throttle_set_thread_io_policy(int policy)2177 throttle_set_thread_io_policy(int policy)
2178 {
2179 	proc_set_thread_policy(current_thread(), TASK_POLICY_INTERNAL, TASK_POLICY_IOPOL, policy);
2180 }
2181 
2182 int
throttle_get_thread_effective_io_policy()2183 throttle_get_thread_effective_io_policy()
2184 {
2185 	return proc_get_effective_thread_policy(current_thread(), TASK_POLICY_IO);
2186 }
2187 
2188 int
throttle_thread_io_tier_above_metadata(void)2189 throttle_thread_io_tier_above_metadata(void)
2190 {
2191 	return throttle_get_thread_effective_io_policy() < IOSCHED_METADATA_TIER;
2192 }
2193 
2194 void
throttle_info_reset_window(uthread_t ut)2195 throttle_info_reset_window(uthread_t ut)
2196 {
2197 	struct _throttle_io_info_t *info;
2198 
2199 	if (ut == NULL) {
2200 		ut = current_uthread();
2201 	}
2202 
2203 	if ((info = ut->uu_throttle_info)) {
2204 		throttle_info_rel(info);
2205 
2206 		ut->uu_throttle_info = NULL;
2207 		ut->uu_lowpri_window = 0;
2208 		ut->uu_throttle_bc = false;
2209 	}
2210 }
2211 
2212 static
2213 void
throttle_info_set_initial_window(uthread_t ut,struct _throttle_io_info_t * info,boolean_t BC_throttle,boolean_t isssd)2214 throttle_info_set_initial_window(uthread_t ut, struct _throttle_io_info_t *info, boolean_t BC_throttle, boolean_t isssd)
2215 {
2216 	if (lowpri_throttle_enabled == 0 || info->throttle_disabled) {
2217 		return;
2218 	}
2219 
2220 	if (info->throttle_io_periods == 0) {
2221 		throttle_init_throttle_period(info, isssd);
2222 	}
2223 	if (ut->uu_throttle_info == NULL) {
2224 		ut->uu_throttle_info = info;
2225 		throttle_info_ref(info);
2226 		DEBUG_ALLOC_THROTTLE_INFO("updating info = %p\n", info, info );
2227 
2228 		ut->uu_lowpri_window = 1;
2229 		ut->uu_throttle_bc = BC_throttle;
2230 	}
2231 }
2232 
2233 /*
2234  * Update inflight IO count and throttling window
2235  * Should be called when an IO is done
2236  *
2237  * Only affects IO that was sent through spec_strategy
2238  */
2239 void
throttle_info_end_io(buf_t bp)2240 throttle_info_end_io(buf_t bp)
2241 {
2242 	vnode_t vp;
2243 	mount_t mp;
2244 	struct bufattr *bap;
2245 	struct _throttle_io_info_t *info;
2246 	int io_tier;
2247 
2248 	bap = &bp->b_attr;
2249 	if (!ISSET(bap->ba_flags, BA_STRATEGY_TRACKED_IO)) {
2250 		return;
2251 	}
2252 	CLR(bap->ba_flags, BA_STRATEGY_TRACKED_IO);
2253 
2254 	vp = buf_vnode(bp);
2255 	mp = vp->v_mount;
2256 
2257 	if (vp && (vp->v_type == VBLK || vp->v_type == VCHR)) {
2258 		info = &_throttle_io_info[vp->v_un.vu_specinfo->si_devbsdunit];
2259 	} else if (mp != NULL) {
2260 		info = &_throttle_io_info[mp->mnt_devbsdunit];
2261 	} else {
2262 		info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];
2263 	}
2264 
2265 	io_tier = GET_BUFATTR_IO_TIER(bap);
2266 	if (ISSET(bap->ba_flags, BA_IO_TIER_UPGRADE)) {
2267 		io_tier--;
2268 	}
2269 
2270 	throttle_info_end_io_internal(info, io_tier);
2271 }
2272 
2273 /*
2274  * Decrement inflight count initially incremented by throttle_info_update_internal
2275  */
2276 static
2277 void
throttle_info_end_io_internal(struct _throttle_io_info_t * info,int throttle_level)2278 throttle_info_end_io_internal(struct _throttle_io_info_t *info, int throttle_level)
2279 {
2280 	if (throttle_level == THROTTLE_LEVEL_NONE) {
2281 		return;
2282 	}
2283 
2284 	microuptime(&info->throttle_window_start_timestamp[throttle_level]);
2285 	OSDecrementAtomic(&info->throttle_inflight_count[throttle_level]);
2286 	assert(info->throttle_inflight_count[throttle_level] >= 0);
2287 }
2288 
2289 /*
2290  * If inflight is TRUE and bap is NULL then the caller is responsible for calling
2291  * throttle_info_end_io_internal to avoid leaking in-flight I/O.
2292  */
2293 static
2294 int
throttle_info_update_internal(struct _throttle_io_info_t * info,uthread_t ut,int flags,boolean_t isssd,boolean_t inflight,struct bufattr * bap)2295 throttle_info_update_internal(struct _throttle_io_info_t *info, uthread_t ut, int flags, boolean_t isssd, boolean_t inflight, struct bufattr *bap)
2296 {
2297 	int     thread_throttle_level;
2298 
2299 	if (lowpri_throttle_enabled == 0 || info->throttle_disabled) {
2300 		return THROTTLE_LEVEL_NONE;
2301 	}
2302 
2303 	if (ut == NULL) {
2304 		ut = current_uthread();
2305 	}
2306 
2307 	if (bap && inflight && !ut->uu_throttle_bc) {
2308 		thread_throttle_level = GET_BUFATTR_IO_TIER(bap);
2309 		if (ISSET(bap->ba_flags, BA_IO_TIER_UPGRADE)) {
2310 			thread_throttle_level--;
2311 		}
2312 	} else {
2313 		thread_throttle_level = throttle_get_thread_throttle_level(ut);
2314 	}
2315 
2316 	if (thread_throttle_level != THROTTLE_LEVEL_NONE) {
2317 		if (!ISSET(flags, B_PASSIVE)) {
2318 			info->throttle_last_IO_pid[thread_throttle_level] = proc_selfpid();
2319 			if (inflight && !ut->uu_throttle_bc) {
2320 				if (NULL != bap) {
2321 					SET(bap->ba_flags, BA_STRATEGY_TRACKED_IO);
2322 				}
2323 				OSIncrementAtomic(&info->throttle_inflight_count[thread_throttle_level]);
2324 			} else {
2325 				microuptime(&info->throttle_window_start_timestamp[thread_throttle_level]);
2326 			}
2327 			KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_THROTTLE, OPEN_THROTTLE_WINDOW)) | DBG_FUNC_NONE,
2328 			    proc_getpid(current_proc()), thread_throttle_level, 0, 0, 0);
2329 		}
2330 		microuptime(&info->throttle_last_IO_timestamp[thread_throttle_level]);
2331 	}
2332 
2333 
2334 	if (thread_throttle_level >= THROTTLE_LEVEL_THROTTLED) {
2335 		/*
2336 		 * I'd really like to do the IOSleep here, but
2337 		 * we may be holding all kinds of filesystem related locks
2338 		 * and the pages for this I/O marked 'busy'...
2339 		 * we don't want to cause a normal task to block on
2340 		 * one of these locks while we're throttling a task marked
2341 		 * for low priority I/O... we'll mark the uthread and
2342 		 * do the delay just before we return from the system
2343 		 * call that triggered this I/O or from vnode_pagein
2344 		 */
2345 		OSAddAtomic(1, &info->throttle_io_count);
2346 
2347 		throttle_info_set_initial_window(ut, info, FALSE, isssd);
2348 	}
2349 
2350 	return thread_throttle_level;
2351 }
2352 
2353 void *
throttle_info_update_by_mount(mount_t mp)2354 throttle_info_update_by_mount(mount_t mp)
2355 {
2356 	struct _throttle_io_info_t *info;
2357 	uthread_t ut;
2358 	boolean_t isssd = FALSE;
2359 
2360 	ut = current_uthread();
2361 
2362 	if (mp != NULL) {
2363 		if (disk_conditioner_mount_is_ssd(mp)) {
2364 			isssd = TRUE;
2365 		}
2366 		info = &_throttle_io_info[mp->mnt_devbsdunit];
2367 	} else {
2368 		info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];
2369 	}
2370 
2371 	if (!ut->uu_lowpri_window) {
2372 		throttle_info_set_initial_window(ut, info, FALSE, isssd);
2373 	}
2374 
2375 	return info;
2376 }
2377 
2378 
2379 /*
2380  * KPI routine
2381  *
2382  * this is usually called before every I/O, used for throttled I/O
2383  * book keeping.  This routine has low overhead and does not sleep
2384  */
2385 void
throttle_info_update(void * throttle_info,int flags)2386 throttle_info_update(void *throttle_info, int flags)
2387 {
2388 	if (throttle_info) {
2389 		throttle_info_update_internal(throttle_info, NULL, flags, FALSE, FALSE, NULL);
2390 	}
2391 }
2392 
2393 /*
2394  * KPI routine (private)
2395  *
2396  * similar to throttle_info_update() but takes an additional argument to
2397  * indicate if the backing device type is SSD or not.
2398  */
2399 void
throttle_info_update_with_type(void * throttle_info,int flags,boolean_t isssd)2400 throttle_info_update_with_type(void *throttle_info, int flags, boolean_t isssd)
2401 {
2402 	if (throttle_info) {
2403 		throttle_info_update_internal(throttle_info, NULL, flags, isssd, FALSE, NULL);
2404 	}
2405 }
2406 
/*
 * KPI routine
 *
 * this is usually called before every I/O, used for throttled I/O
 * book keeping.  This routine has low overhead and does not sleep
 */
void
throttle_info_update_by_mask(void *throttle_info_handle, int flags)
{
	/*
	 * for now we only use the lowest bit of the throttle mask, so the
	 * handle is the same as the throttle_info.  Later if we store a
	 * set of throttle infos in the handle, we will want to loop through
	 * them and call throttle_info_update in a loop
	 */
	throttle_info_update(throttle_info_handle, flags);
}
2426 /*
2427  * KPI routine
2428  *
2429  * This routine marks the throttle info as disabled. Used for mount points which
2430  * support I/O scheduling.
2431  */
2432 
2433 void
throttle_info_disable_throttle(int devno,boolean_t isfusion)2434 throttle_info_disable_throttle(int devno, boolean_t isfusion)
2435 {
2436 	struct _throttle_io_info_t *info;
2437 
2438 	if (devno < 0 || devno >= LOWPRI_MAX_NUM_DEV) {
2439 		panic("Illegal devno (%d) passed into throttle_info_disable_throttle()", devno);
2440 	}
2441 
2442 	info = &_throttle_io_info[devno];
2443 	// don't disable software throttling on devices that are part of a fusion device
2444 	// and override the software throttle periods to use HDD periods
2445 	if (isfusion) {
2446 		info->throttle_is_fusion_with_priority = isfusion;
2447 		throttle_init_throttle_period(info, FALSE);
2448 	}
2449 	info->throttle_disabled = !info->throttle_is_fusion_with_priority;
2450 	return;
2451 }
2452 
2453 
2454 /*
2455  * KPI routine (private)
2456  * Called to determine if this IO is being throttled to this level so that it can be treated specially
2457  */
2458 int
throttle_info_io_will_be_throttled(void * throttle_info,int policy)2459 throttle_info_io_will_be_throttled(void * throttle_info, int policy)
2460 {
2461 	struct _throttle_io_info_t *info = throttle_info;
2462 	struct timeval elapsed;
2463 	uint64_t elapsed_msecs;
2464 	int     throttle_level;
2465 	int     thread_throttle_level;
2466 
2467 	switch (policy) {
2468 	case IOPOL_THROTTLE:
2469 		thread_throttle_level = THROTTLE_LEVEL_TIER3;
2470 		break;
2471 	case IOPOL_UTILITY:
2472 		thread_throttle_level = THROTTLE_LEVEL_TIER2;
2473 		break;
2474 	case IOPOL_STANDARD:
2475 		thread_throttle_level = THROTTLE_LEVEL_TIER1;
2476 		break;
2477 	default:
2478 		thread_throttle_level = THROTTLE_LEVEL_TIER0;
2479 		break;
2480 	}
2481 	for (throttle_level = THROTTLE_LEVEL_START; throttle_level < thread_throttle_level; throttle_level++) {
2482 		if (info->throttle_inflight_count[throttle_level]) {
2483 			break;
2484 		}
2485 
2486 		microuptime(&elapsed);
2487 		timevalsub(&elapsed, &info->throttle_window_start_timestamp[throttle_level]);
2488 		elapsed_msecs = (uint64_t)elapsed.tv_sec * (uint64_t)1000 + (elapsed.tv_usec / 1000);
2489 
2490 		if (elapsed_msecs < (uint64_t)throttle_windows_msecs[thread_throttle_level]) {
2491 			break;
2492 		}
2493 	}
2494 	if (throttle_level >= thread_throttle_level) {
2495 		/*
2496 		 * we're beyond all of the throttle windows
2497 		 * so go ahead and treat as normal I/O
2498 		 */
2499 		return THROTTLE_DISENGAGED;
2500 	}
2501 	/*
2502 	 * we're in the throttle window
2503 	 */
2504 	return THROTTLE_ENGAGED;
2505 }
2506 
2507 int
throttle_lowpri_window(void)2508 throttle_lowpri_window(void)
2509 {
2510 	return current_uthread()->uu_lowpri_window;
2511 }
2512 
2513 #if CONFIG_PHYS_WRITE_ACCT
2514 extern thread_t pm_sync_thread;
2515 #endif /* CONFIG_PHYS_WRITE_ACCT */
2516 
2517 int
spec_strategy(struct vnop_strategy_args * ap)2518 spec_strategy(struct vnop_strategy_args *ap)
2519 {
2520 	buf_t   bp;
2521 	int     bflags;
2522 	int     io_tier;
2523 	int     passive;
2524 	dev_t   bdev;
2525 	uthread_t ut;
2526 	vnode_t vp;
2527 	mount_t mp;
2528 	struct  bufattr *bap;
2529 	int     strategy_ret;
2530 	struct _throttle_io_info_t *throttle_info;
2531 	boolean_t isssd = FALSE;
2532 	boolean_t inflight = FALSE;
2533 	boolean_t upgrade = FALSE;
2534 	int code = 0;
2535 
2536 #if CONFIG_DELAY_IDLE_SLEEP
2537 	proc_t curproc = current_proc();
2538 #endif /* CONFIG_DELAY_IDLE_SLEEP */
2539 
2540 	bp = ap->a_bp;
2541 	bdev = buf_device(bp);
2542 	vp = buf_vnode(bp);
2543 	mp = vp ? vp->v_mount : NULL;
2544 	bap = &bp->b_attr;
2545 
2546 #if CONFIG_PHYS_WRITE_ACCT
2547 	if (current_thread() == pm_sync_thread) {
2548 		OSAddAtomic64(buf_count(bp), (SInt64 *)&(kernel_pm_writes));
2549 	}
2550 #endif /* CONFIG_PHYS_WRITE_ACCT */
2551 
2552 #if CONFIG_IOSCHED
2553 	if (bp->b_flags & B_CLUSTER) {
2554 		io_tier = upl_get_cached_tier(bp->b_upl);
2555 
2556 		if (io_tier == -1) {
2557 			io_tier = throttle_get_io_policy(&ut);
2558 		}
2559 #if DEVELOPMENT || DEBUG
2560 		else {
2561 			int my_io_tier = throttle_get_io_policy(&ut);
2562 
2563 			if (io_tier != my_io_tier) {
2564 				KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_THROTTLE, IO_TIER_UPL_MISMATCH)) | DBG_FUNC_NONE, buf_kernel_addrperm_addr(bp), my_io_tier, io_tier, 0, 0);
2565 			}
2566 		}
2567 #endif
2568 	} else {
2569 		io_tier = throttle_get_io_policy(&ut);
2570 	}
2571 #else
2572 	io_tier = throttle_get_io_policy(&ut);
2573 #endif
2574 	passive = throttle_get_passive_io_policy(&ut);
2575 
2576 	/*
2577 	 * Mark if the I/O was upgraded by throttle_get_thread_throttle_level
2578 	 * while preserving the original issued tier (throttle_get_io_policy
2579 	 * does not return upgraded tiers)
2580 	 */
2581 	if (mp && io_tier > throttle_get_thread_throttle_level_internal(ut, io_tier)) {
2582 #if CONFIG_IOSCHED
2583 		if (!(mp->mnt_ioflags & MNT_IOFLAGS_IOSCHED_SUPPORTED)) {
2584 			upgrade = TRUE;
2585 		}
2586 #else /* CONFIG_IOSCHED */
2587 		upgrade = TRUE;
2588 #endif /* CONFIG_IOSCHED */
2589 	}
2590 
2591 	if (bp->b_flags & B_META) {
2592 		bap->ba_flags |= BA_META;
2593 	}
2594 
2595 #if CONFIG_IOSCHED
2596 	/*
2597 	 * For metadata reads, ceil the I/O tier to IOSCHED_METADATA_EXPEDITED_TIER if they are expedited, otherwise
2598 	 * ceil it to IOSCHED_METADATA_TIER. Mark them passive if the I/O tier was upgraded.
2599 	 * For metadata writes, set the I/O tier to IOSCHED_METADATA_EXPEDITED_TIER if they are expedited. Otherwise
2600 	 * set it to IOSCHED_METADATA_TIER. In addition, mark them as passive.
2601 	 */
2602 	if (bap->ba_flags & BA_META) {
2603 		if ((mp && (mp->mnt_ioflags & MNT_IOFLAGS_IOSCHED_SUPPORTED)) || (bap->ba_flags & BA_IO_SCHEDULED)) {
2604 			if (bp->b_flags & B_READ) {
2605 				if ((bap->ba_flags & BA_EXPEDITED_META_IO) && (io_tier > IOSCHED_METADATA_EXPEDITED_TIER)) {
2606 					io_tier = IOSCHED_METADATA_EXPEDITED_TIER;
2607 					passive = 1;
2608 				} else if (io_tier > IOSCHED_METADATA_TIER) {
2609 					io_tier = IOSCHED_METADATA_TIER;
2610 					passive = 1;
2611 				}
2612 			} else {
2613 				if (bap->ba_flags & BA_EXPEDITED_META_IO) {
2614 					io_tier = IOSCHED_METADATA_EXPEDITED_TIER;
2615 				} else {
2616 					io_tier = IOSCHED_METADATA_TIER;
2617 				}
2618 				passive = 1;
2619 			}
2620 		}
2621 	}
2622 #endif /* CONFIG_IOSCHED */
2623 
2624 	SET_BUFATTR_IO_TIER(bap, io_tier);
2625 
2626 	if (passive) {
2627 		bp->b_flags |= B_PASSIVE;
2628 		bap->ba_flags |= BA_PASSIVE;
2629 	}
2630 
2631 #if CONFIG_DELAY_IDLE_SLEEP
2632 	if ((curproc != NULL) && ((curproc->p_flag & P_DELAYIDLESLEEP) == P_DELAYIDLESLEEP)) {
2633 		bap->ba_flags |= BA_DELAYIDLESLEEP;
2634 	}
2635 #endif /* CONFIG_DELAY_IDLE_SLEEP */
2636 
2637 	bflags = bp->b_flags;
2638 
2639 	if (((bflags & B_READ) == 0) && ((bflags & B_ASYNC) == 0)) {
2640 		bufattr_markquickcomplete(bap);
2641 	}
2642 
2643 	if (bflags & B_READ) {
2644 		code |= DKIO_READ;
2645 	}
2646 	if (bflags & B_ASYNC) {
2647 		code |= DKIO_ASYNC;
2648 	}
2649 
2650 	if (bap->ba_flags & BA_META) {
2651 		code |= DKIO_META;
2652 	} else if (bflags & B_PAGEIO) {
2653 		code |= DKIO_PAGING;
2654 	}
2655 
2656 	if (io_tier != 0) {
2657 		code |= DKIO_THROTTLE;
2658 	}
2659 
2660 	code |= ((io_tier << DKIO_TIER_SHIFT) & DKIO_TIER_MASK);
2661 
2662 	if (bflags & B_PASSIVE) {
2663 		code |= DKIO_PASSIVE;
2664 	}
2665 
2666 	if (bap->ba_flags & BA_NOCACHE) {
2667 		code |= DKIO_NOCACHE;
2668 	}
2669 
2670 	if (upgrade) {
2671 		code |= DKIO_TIER_UPGRADE;
2672 		SET(bap->ba_flags, BA_IO_TIER_UPGRADE);
2673 	}
2674 
2675 	if (kdebug_enable) {
2676 		KERNEL_DEBUG_CONSTANT_IST(KDEBUG_COMMON, FSDBG_CODE(DBG_DKRW, code) | DBG_FUNC_NONE,
2677 		    buf_kernel_addrperm_addr(bp), bdev, buf_blkno(bp), buf_count(bp), 0);
2678 	}
2679 
2680 #if CONFIG_IO_COMPRESSION_STATS
2681 	// Do not run IO Compression Stats when a privilege thread is active
2682 	if (!is_vm_privileged() && !is_external_pageout_thread()) {
2683 		io_compression_stats(bp);
2684 	}
2685 #endif /* CONFIG_IO_COMPRESSION_STATS */
2686 	thread_update_io_stats(current_thread(), buf_count(bp), code);
2687 
2688 	if (vp && (vp->v_type == VBLK || vp->v_type == VCHR)) {
2689 		if (!vp->v_un.vu_specinfo->si_initted) {
2690 			SPEC_INIT_BSDUNIT(vp, vfs_context_current());
2691 		}
2692 		if (vp->v_un.vu_specinfo->si_devbsdunit > (LOWPRI_MAX_NUM_DEV - 1)) {
2693 			panic("Invalid value (%d) for si_devbsdunit for vnode %p",
2694 			    vp->v_un.vu_specinfo->si_devbsdunit, vp);
2695 		}
2696 		if (vp->v_un.vu_specinfo->si_isssd > 1) {
2697 			panic("Invalid value (%d) for si_isssd for vnode %p",
2698 			    vp->v_un.vu_specinfo->si_isssd, vp);
2699 		}
2700 		throttle_info = &_throttle_io_info[vp->v_un.vu_specinfo->si_devbsdunit];
2701 		isssd = vp->v_un.vu_specinfo->si_isssd;
2702 	} else if (mp != NULL) {
2703 		if (disk_conditioner_mount_is_ssd(mp)) {
2704 			isssd = TRUE;
2705 		}
2706 		/*
2707 		 * Partially initialized mounts don't have a final devbsdunit and should not be tracked.
2708 		 * Verify that devbsdunit is initialized (non-zero) or that 0 is the correct initialized value
2709 		 * (mnt_throttle_mask is initialized and num_trailing_0 would be 0)
2710 		 */
2711 		if (mp->mnt_devbsdunit || (mp->mnt_throttle_mask != LOWPRI_MAX_NUM_DEV - 1 && mp->mnt_throttle_mask & 0x1)) {
2712 			inflight = TRUE;
2713 		}
2714 		throttle_info = &_throttle_io_info[mp->mnt_devbsdunit];
2715 	} else {
2716 		throttle_info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];
2717 	}
2718 
2719 	throttle_info_update_internal(throttle_info, ut, bflags, isssd, inflight, bap);
2720 
2721 	if ((bflags & B_READ) == 0) {
2722 		microuptime(&throttle_info->throttle_last_write_timestamp);
2723 
2724 		if (!(vp && (vp->v_type == VBLK || vp->v_type == VCHR)) && mp) {
2725 			mp->mnt_last_write_issued_timestamp = throttle_info->throttle_last_write_timestamp;
2726 			INCR_PENDING_IO(buf_count(bp), mp->mnt_pending_write_size);
2727 		}
2728 	} else if (!(vp && (vp->v_type == VBLK || vp->v_type == VCHR)) && mp) {
2729 		INCR_PENDING_IO(buf_count(bp), mp->mnt_pending_read_size);
2730 	}
2731 	/*
2732 	 * The BootCache may give us special information about
2733 	 * the IO, so it returns special values that we check
2734 	 * for here.
2735 	 *
2736 	 * IO_SATISFIED_BY_CACHE
2737 	 * The read has been satisfied by the boot cache. Don't
2738 	 * throttle the thread unnecessarily.
2739 	 *
2740 	 * IO_SHOULD_BE_THROTTLED
2741 	 * The boot cache is playing back a playlist and this IO
2742 	 * cut through. Throttle it so we're not cutting through
2743 	 * the boot cache too often.
2744 	 *
2745 	 * Note that typical strategy routines are defined with
2746 	 * a void return so we'll get garbage here. In the
2747 	 * unlikely case the garbage matches our special return
2748 	 * value, it's not a big deal since we're only adjusting
2749 	 * the throttling delay.
2750 	 */
2751 #define IO_SATISFIED_BY_CACHE  ((int)0xcafefeed)
2752 #define IO_SHOULD_BE_THROTTLED ((int)0xcafebeef)
2753 #pragma clang diagnostic push
2754 #pragma clang diagnostic ignored "-Wcast-function-type"
2755 
2756 	typedef int strategy_fcn_ret_t(struct buf *bp);
2757 
2758 	strategy_ret = (*(strategy_fcn_ret_t*)bdevsw[major(bdev)].d_strategy)(bp);
2759 
2760 #pragma clang diagnostic pop
2761 
2762 	// disk conditioner needs to track when this I/O actually starts
2763 	// which means track it after `strategy` which may include delays
2764 	// from inflight I/Os
2765 	microuptime(&bp->b_timestamp_tv);
2766 
2767 	if (IO_SATISFIED_BY_CACHE == strategy_ret) {
2768 		/*
2769 		 * If this was a throttled IO satisfied by the boot cache,
2770 		 * don't delay the thread.
2771 		 */
2772 		throttle_info_reset_window(ut);
2773 	} else if (IO_SHOULD_BE_THROTTLED == strategy_ret) {
2774 		/*
2775 		 * If the boot cache indicates this IO should be throttled,
2776 		 * delay the thread.
2777 		 */
2778 		throttle_info_set_initial_window(ut, throttle_info, TRUE, isssd);
2779 	}
2780 	return 0;
2781 }
2782 
2783 
2784 /*
2785  * This is a noop, simply returning what one has been given.
2786  */
2787 int
spec_blockmap(__unused struct vnop_blockmap_args * ap)2788 spec_blockmap(__unused struct vnop_blockmap_args *ap)
2789 {
2790 	return ENOTSUP;
2791 }
2792 
2793 static int
spec_close_internal(struct vnode * vp,dev_t dev,int flags,vfs_context_t ctx)2794 spec_close_internal(struct vnode *vp, dev_t dev, int flags, vfs_context_t ctx)
2795 {
2796 	int error = 0;
2797 	struct proc *p = vfs_context_proc(ctx);
2798 	struct session *sessp;
2799 	struct pgrp *pg;
2800 
2801 	switch (vp->v_type) {
2802 	case VCHR:
2803 		/*
2804 		 * Hack: a tty device that is a controlling terminal
2805 		 * has a reference from the session structure.
2806 		 * We cannot easily tell that a character device is
2807 		 * a controlling terminal, unless it is the closing
2808 		 * process' controlling terminal.  In that case,
2809 		 * if the reference count is 1 (this is the very
2810 		 * last close)
2811 		 */
2812 		pg = proc_pgrp(p, &sessp);
2813 		devsw_lock(dev, S_IFCHR);
2814 		if (sessp != SESSION_NULL) {
2815 			if (vp == sessp->s_ttyvp && vcount(vp) == 1) {
2816 				struct tty *tp = TTY_NULL;
2817 
2818 				devsw_unlock(dev, S_IFCHR);
2819 				session_lock(sessp);
2820 				if (vp == sessp->s_ttyvp) {
2821 					tp = session_clear_tty_locked(sessp);
2822 				}
2823 				session_unlock(sessp);
2824 
2825 				if (tp != TTY_NULL) {
2826 					ttyfree(tp);
2827 				}
2828 				devsw_lock(dev, S_IFCHR);
2829 			}
2830 		}
2831 		pgrp_rele(pg);
2832 
2833 		if (--vp->v_specinfo->si_opencount < 0) {
2834 			panic("negative open count (c, %u, %u)", major(dev), minor(dev));
2835 		}
2836 
2837 		/*
2838 		 * close on last reference or on vnode revoke call
2839 		 */
2840 		if (vcount(vp) == 0 || (flags & IO_REVOKE) != 0) {
2841 			error = cdevsw[major(dev)].d_close(dev, flags, S_IFCHR, p);
2842 		}
2843 
2844 		devsw_unlock(dev, S_IFCHR);
2845 		break;
2846 
2847 	case VBLK:
2848 		/*
2849 		 * If there is more than one outstanding open, don't
2850 		 * send the close to the device.
2851 		 */
2852 		devsw_lock(dev, S_IFBLK);
2853 		if (vcount(vp) > 1) {
2854 			vp->v_specinfo->si_opencount--;
2855 			devsw_unlock(dev, S_IFBLK);
2856 			return 0;
2857 		}
2858 		devsw_unlock(dev, S_IFBLK);
2859 
2860 		/*
2861 		 * On last close of a block device (that isn't mounted)
2862 		 * we must invalidate any in core blocks, so that
2863 		 * we can, for instance, change floppy disks.
2864 		 */
2865 		if ((error = spec_fsync_internal(vp, MNT_WAIT, ctx))) {
2866 			return error;
2867 		}
2868 
2869 		error = buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0);
2870 		if (error) {
2871 			return error;
2872 		}
2873 
2874 		devsw_lock(dev, S_IFBLK);
2875 
2876 		if (--vp->v_specinfo->si_opencount < 0) {
2877 			panic("negative open count (b, %u, %u)", major(dev), minor(dev));
2878 		}
2879 
2880 		if (vcount(vp) == 0) {
2881 			error = bdevsw[major(dev)].d_close(dev, flags, S_IFBLK, p);
2882 		}
2883 
2884 		devsw_unlock(dev, S_IFBLK);
2885 		break;
2886 
2887 	default:
2888 		panic("spec_close: not special");
2889 		return EBADF;
2890 	}
2891 
2892 	return error;
2893 }
2894 
2895 /*
2896  * Device close routine
2897  */
2898 int
spec_close(struct vnop_close_args * ap)2899 spec_close(struct vnop_close_args *ap)
2900 {
2901 	return spec_close_internal(ap->a_vp, ap->a_vp->v_rdev, ap->a_fflag, ap->a_context);
2902 }
2903 
2904 /*
2905  * Return POSIX pathconf information applicable to special devices.
2906  */
2907 int
spec_pathconf(struct vnop_pathconf_args * ap)2908 spec_pathconf(struct vnop_pathconf_args *ap)
2909 {
2910 	switch (ap->a_name) {
2911 	case _PC_LINK_MAX:
2912 		*ap->a_retval = LINK_MAX;
2913 		return 0;
2914 	case _PC_MAX_CANON:
2915 		*ap->a_retval = MAX_CANON;
2916 		return 0;
2917 	case _PC_MAX_INPUT:
2918 		*ap->a_retval = MAX_INPUT;
2919 		return 0;
2920 	case _PC_PIPE_BUF:
2921 		*ap->a_retval = PIPE_BUF;
2922 		return 0;
2923 	case _PC_CHOWN_RESTRICTED:
2924 		*ap->a_retval = 200112;         /* _POSIX_CHOWN_RESTRICTED */
2925 		return 0;
2926 	case _PC_VDISABLE:
2927 		*ap->a_retval = _POSIX_VDISABLE;
2928 		return 0;
2929 	default:
2930 		return EINVAL;
2931 	}
2932 	/* NOTREACHED */
2933 }
2934 
2935 /*
2936  * Special device failed operation
2937  */
2938 int
spec_ebadf(__unused void * dummy)2939 spec_ebadf(__unused void *dummy)
2940 {
2941 	return EBADF;
2942 }
2943 
2944 /* Blktooff derives file offset from logical block number */
2945 int
spec_blktooff(struct vnop_blktooff_args * ap)2946 spec_blktooff(struct vnop_blktooff_args *ap)
2947 {
2948 	struct vnode *vp = ap->a_vp;
2949 
2950 	switch (vp->v_type) {
2951 	case VCHR:
2952 		*ap->a_offset = (off_t)-1; /* failure */
2953 		return ENOTSUP;
2954 
2955 	case VBLK:
2956 		printf("spec_blktooff: not implemented for VBLK\n");
2957 		*ap->a_offset = (off_t)-1; /* failure */
2958 		return ENOTSUP;
2959 
2960 	default:
2961 		panic("spec_blktooff type");
2962 	}
2963 	/* NOTREACHED */
2964 
2965 	return 0;
2966 }
2967 
2968 /* Offtoblk derives logical block number from file offset */
2969 int
spec_offtoblk(struct vnop_offtoblk_args * ap)2970 spec_offtoblk(struct vnop_offtoblk_args *ap)
2971 {
2972 	struct vnode *vp = ap->a_vp;
2973 
2974 	switch (vp->v_type) {
2975 	case VCHR:
2976 		*ap->a_lblkno = (daddr64_t)-1; /* failure */
2977 		return ENOTSUP;
2978 
2979 	case VBLK:
2980 		printf("spec_offtoblk: not implemented for VBLK\n");
2981 		*ap->a_lblkno = (daddr64_t)-1; /* failure */
2982 		return ENOTSUP;
2983 
2984 	default:
2985 		panic("spec_offtoblk type");
2986 	}
2987 	/* NOTREACHED */
2988 
2989 	return 0;
2990 }
2991 
2992 static int filt_specattach(struct knote *kn, struct kevent_qos_s *kev);
2993 static void filt_specdetach(struct knote *kn);
2994 static int filt_specevent(struct knote *kn, long hint);
2995 static int filt_spectouch(struct knote *kn, struct kevent_qos_s *kev);
2996 static int filt_specprocess(struct knote *kn, struct kevent_qos_s *kev);
2997 
2998 SECURITY_READ_ONLY_EARLY(struct filterops) spec_filtops = {
2999 	.f_isfd    = 1,
3000 	.f_attach  = filt_specattach,
3001 	.f_detach  = filt_specdetach,
3002 	.f_event   = filt_specevent,
3003 	.f_touch   = filt_spectouch,
3004 	.f_process = filt_specprocess,
3005 };
3006 
3007 static void
filt_spec_make_eof(struct knote * kn)3008 filt_spec_make_eof(struct knote *kn)
3009 {
3010 	/*
3011 	 * The spec filter might touch kn_flags from f_event
3012 	 * without holding "the primitive lock", so make it atomic.
3013 	 */
3014 	os_atomic_or(&kn->kn_flags, EV_EOF | EV_ONESHOT, relaxed);
3015 }
3016 
3017 static int
filt_spec_common(struct knote * kn,struct kevent_qos_s * kev,bool attach)3018 filt_spec_common(struct knote *kn, struct kevent_qos_s *kev, bool attach)
3019 {
3020 	uthread_t uth = current_uthread();
3021 	vfs_context_t ctx = vfs_context_current();
3022 	vnode_t vp = (vnode_t)fp_get_data(kn->kn_fp);
3023 	__block bool selrecorded = false;
3024 	struct select_set *old_wqs;
3025 	int64_t data = 0;
3026 	int ret, selret;
3027 
3028 	if (kn->kn_flags & EV_EOF) {
3029 		ret = FILTER_ACTIVE;
3030 		goto out;
3031 	}
3032 
3033 	if (!attach && vnode_getwithvid(vp, vnode_vid(vp)) != 0) {
3034 		filt_spec_make_eof(kn);
3035 		ret = FILTER_ACTIVE;
3036 		goto out;
3037 	}
3038 
3039 	selspec_record_hook_t cb = ^(struct selinfo *si) {
3040 		selspec_attach(kn, si);
3041 		selrecorded = true;
3042 	};
3043 
3044 	old_wqs = uth->uu_selset;
3045 	uth->uu_selset = SELSPEC_RECORD_MARKER;
3046 	selret = VNOP_SELECT(vp, knote_get_seltype(kn), 0, cb, ctx);
3047 	uth->uu_selset = old_wqs;
3048 
3049 	if (!attach) {
3050 		vnode_put(vp);
3051 	}
3052 
3053 	if (!selrecorded && selret == 0) {
3054 		/*
3055 		 * The device indicated that there's no data to read,
3056 		 * but didn't call `selrecord`.
3057 		 *
3058 		 * Nothing will be notified of changes to this vnode,
3059 		 * so return an error back to user space on attach,
3060 		 * or pretend the knote disappeared for other cases,
3061 		 * to make it clear that the knote is not attached.
3062 		 */
3063 		if (attach) {
3064 			knote_set_error(kn, ENODEV);
3065 			return 0;
3066 		}
3067 
3068 		filt_spec_make_eof(kn);
3069 		ret = FILTER_ACTIVE;
3070 		goto out;
3071 	}
3072 
3073 	if (kn->kn_vnode_use_ofst) {
3074 		if (kn->kn_fp->fp_glob->fg_offset >= (uint32_t)selret) {
3075 			data = 0;
3076 		} else {
3077 			data = ((uint32_t)selret) - kn->kn_fp->fp_glob->fg_offset;
3078 		}
3079 	} else {
3080 		data = selret;
3081 	}
3082 
3083 	if (data >= knote_low_watermark(kn)) {
3084 		ret = FILTER_ACTIVE;
3085 	} else {
3086 		ret = 0;
3087 	}
3088 out:
3089 	if (ret) {
3090 		knote_fill_kevent(kn, kev, data);
3091 	}
3092 	return ret;
3093 }
3094 
3095 static int
filt_specattach(struct knote * kn,__unused struct kevent_qos_s * kev)3096 filt_specattach(struct knote *kn, __unused struct kevent_qos_s *kev)
3097 {
3098 	vnode_t vp = (vnode_t)fp_get_data(kn->kn_fp); /* Already have iocount, and vnode is alive */
3099 	dev_t dev;
3100 
3101 	assert(vnode_ischr(vp));
3102 
3103 	dev = vnode_specrdev(vp);
3104 
3105 	/*
3106 	 * For a few special kinds of devices, we can attach knotes with
3107 	 * no restrictions because their "select" vectors return the amount
3108 	 * of data available.  Others require an explicit NOTE_LOWAT with
3109 	 * data of 1, indicating that the caller doesn't care about actual
3110 	 * data counts, just an indication that the device has data.
3111 	 */
3112 	if (!kn->kn_vnode_kqok &&
3113 	    ((kn->kn_sfflags & NOTE_LOWAT) == 0 || kn->kn_sdata != 1)) {
3114 		knote_set_error(kn, EINVAL);
3115 		return 0;
3116 	}
3117 
3118 	return filt_spec_common(kn, kev, true);
3119 }
3120 
/*
 * f_detach callback: undo the selspec attachment of this knote.
 */
static void
filt_specdetach(struct knote *kn)
{
	selspec_detach(kn);
}
3126 
3127 static int
filt_specevent(struct knote * kn,long hint)3128 filt_specevent(struct knote *kn, long hint)
3129 {
3130 	/* Due to selwakeup_internal() on SI_SELSPEC */
3131 	assert(KNOTE_IS_AUTODETACHED(kn));
3132 	knote_kn_hook_set_raw(kn, NULL);
3133 
3134 	/* called by selwakeup with the selspec_lock lock held */
3135 	if (hint & NOTE_REVOKE) {
3136 		filt_spec_make_eof(kn);
3137 	}
3138 	return FILTER_ACTIVE;
3139 }
3140 
3141 static int
filt_spectouch(struct knote * kn,struct kevent_qos_s * kev)3142 filt_spectouch(struct knote *kn, struct kevent_qos_s *kev)
3143 {
3144 	kn->kn_sdata = kev->data;
3145 	kn->kn_sfflags = kev->fflags;
3146 
3147 	return filt_spec_common(kn, kev, false);
3148 }
3149 
/*
 * f_process callback: re-evaluate the knote in non-attach mode.
 */
static int
filt_specprocess(struct knote *kn, struct kevent_qos_s *kev)
{
	return filt_spec_common(kn, kev, false);
}
3155