xref: /xnu-8792.61.2/bsd/vfs/vfs_disk_conditioner.c (revision 42e220869062b56f8d7d0726fd4c88954f87902c)
1 /*
2  * Copyright (c) 2016-2020 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 #include <sys/fsctl.h>
30 #include <stdbool.h>
31 #include <sys/time.h>
32 #include <sys/buf.h>
33 #include <sys/mount_internal.h>
34 #include <sys/vnode_internal.h>
35 #include <sys/buf_internal.h>
36 
37 #include <kern/kalloc.h>
38 
39 #include <sys/kauth.h>
40 #include <IOKit/IOBSD.h>
41 
42 #include <vfs/vfs_disk_conditioner.h>
43 
// entitlement a task must hold (in addition to being super-user) to change conditioner settings
#define DISK_CONDITIONER_SET_ENTITLEMENT "com.apple.private.dmc.set"

// number of total device blocks for a mount:
// (filesystem block count * filesystem block size) / device block size.
// The argument is parenthesized so any pointer expression expands correctly;
// note that it is evaluated more than once, so it must be free of side effects.
#define BLK_MAX(mp) ((((mp)->mnt_vfsstat.f_blocks) * ((mp)->mnt_vfsstat.f_bsize)) / ((mp)->mnt_devblocksize))

// approx. time to spin up an idle HDD
#define DISK_SPINUP_SEC (8)

// idle period until assumed disk spin down
#define DISK_IDLE_SEC (10 * 60)
54 
/*
 * Snapshot of the mount_t I/O limit fields taken when conditioner state is
 * first attached to a mount, so they can be put back when the conditioner is
 * disabled or the mount goes away.
 */
struct saved_mount_fields {
	uint32_t        mnt_maxreadcnt;         /* Max. byte count for read */
	uint32_t        mnt_maxwritecnt;        /* Max. byte count for write */
	uint32_t        mnt_segreadcnt;         /* Max. segment count for read */
	uint32_t        mnt_segwritecnt;        /* Max. segment count for write */
	uint32_t        mnt_ioqueue_depth;      /* the maximum number of commands a device can accept */
	uint32_t        mnt_ioscale;            /* scale the various throttles/limits imposed on the amount of I/O in flight */
};
63 
64 struct _disk_conditioner_info_t {
65 	disk_conditioner_info dcinfo; // all the original data from fsctl
66 	struct saved_mount_fields mnt_fields; // fields to restore in mount_t when conditioner is disabled
67 
68 	daddr64_t last_blkno; // approx. last transfered block for simulating seek times
69 	struct timeval last_io_timestamp; // the last time an I/O completed
70 };
71 
72 void disk_conditioner_delay(buf_t, int, int, uint64_t);
73 void disk_conditioner_unmount(mount_t mp);
74 
75 extern void throttle_info_mount_reset_period(mount_t, int isssd);
76 
/*
 * Map a distance ratio onto a seek-time weight using y = (x - 1)^3 + 1.
 *
 * For inputs in [0, 1] the result is in [0, 1] and rises quickly from 0, which
 * weights smaller block differences higher and so acts as a kind of minimum
 * latency. A cubic is used as an approximation because no math library is
 * available here.
 *
 * scale: distance ratio (block difference / total blocks).
 * Returns (scale - 1)^3 + 1.
 */
static double
weighted_scale_factor(double scale)
{
	double x_m1 = scale - 1;
	return x_m1 * x_m1 * x_m1 + 1;
}
86 
87 void
disk_conditioner_delay(buf_t bp,int extents,int total_size,uint64_t already_elapsed_usec)88 disk_conditioner_delay(buf_t bp, int extents, int total_size, uint64_t already_elapsed_usec)
89 {
90 	mount_t mp;
91 	uint64_t delay_usec;
92 	daddr64_t blkdiff;
93 	daddr64_t last_blkno;
94 	double access_time_scale;
95 	struct _disk_conditioner_info_t *internal_info = NULL;
96 	disk_conditioner_info *info = NULL;
97 	struct timeval elapsed;
98 	struct timeval start;
99 	vnode_t vp;
100 
101 	vp = buf_vnode(bp);
102 	if (!vp) {
103 		return;
104 	}
105 
106 	mp = vp->v_mount;
107 	if (!mp) {
108 		return;
109 	}
110 
111 	internal_info = mp->mnt_disk_conditioner_info;
112 	if (!internal_info || !internal_info->dcinfo.enabled) {
113 		return;
114 	}
115 	info = &(internal_info->dcinfo);
116 
117 	if (!info->is_ssd) {
118 		// calculate approximate seek time based on difference in block number
119 		last_blkno = internal_info->last_blkno;
120 		blkdiff = bp->b_blkno > last_blkno ? bp->b_blkno - last_blkno : last_blkno - bp->b_blkno;
121 		internal_info->last_blkno = bp->b_blkno + bp->b_bcount;
122 	} else {
123 		blkdiff = BLK_MAX(mp);
124 	}
125 
126 	// scale access time by (distance in blocks from previous I/O / maximum blocks)
127 	access_time_scale = weighted_scale_factor((double)blkdiff / (double)BLK_MAX(mp));
128 	if (__builtin_isnan(access_time_scale)) {
129 		return;
130 	}
131 	// most cases should pass in extents==1 for optimal delay calculation, otherwise just multiply delay by extents
132 	double temp = (((double)extents * (double)info->access_time_usec) * access_time_scale);
133 	if (temp <= 0) {
134 		delay_usec = 0;
135 	} else if (temp >= (double)(18446744073709549568ULL)) { /* highest 64-bit unsigned integer representable as a double */
136 		delay_usec = UINT64_MAX;
137 	} else {
138 		delay_usec = (uint64_t)temp;
139 	}
140 
141 	if (info->read_throughput_mbps && (bp->b_flags & B_READ)) {
142 		delay_usec += (uint64_t)(total_size / ((double)(info->read_throughput_mbps * 1024 * 1024 / 8) / USEC_PER_SEC));
143 	} else if (info->write_throughput_mbps && !(bp->b_flags & B_READ)) {
144 		delay_usec += (uint64_t)(total_size / ((double)(info->write_throughput_mbps * 1024 * 1024 / 8) / USEC_PER_SEC));
145 	}
146 
147 	// try simulating disk spinup based on time since last I/O
148 	if (!info->is_ssd) {
149 		microuptime(&elapsed);
150 		timevalsub(&elapsed, &internal_info->last_io_timestamp);
151 		// avoid this delay right after boot (assuming last_io_timestamp is 0 and disk is already spinning)
152 		if (elapsed.tv_sec > DISK_IDLE_SEC && internal_info->last_io_timestamp.tv_sec != 0) {
153 			delay_usec += DISK_SPINUP_SEC * USEC_PER_SEC;
154 		}
155 	}
156 
157 	if (delay_usec <= already_elapsed_usec) {
158 		microuptime(&internal_info->last_io_timestamp);
159 		return;
160 	}
161 
162 	delay_usec -= already_elapsed_usec;
163 
164 	while (delay_usec) {
165 		microuptime(&start);
166 		assert(delay_usec <= INT_MAX);
167 		delay((int)delay_usec);
168 		microuptime(&elapsed);
169 		timevalsub(&elapsed, &start);
170 		if (elapsed.tv_sec * USEC_PER_SEC < delay_usec) {
171 			delay_usec -= elapsed.tv_sec * USEC_PER_SEC;
172 		} else {
173 			break;
174 		}
175 		if ((uint64_t)elapsed.tv_usec < delay_usec) {
176 			delay_usec -= elapsed.tv_usec;
177 		} else {
178 			break;
179 		}
180 	}
181 
182 	microuptime(&internal_info->last_io_timestamp);
183 }
184 
185 int
disk_conditioner_get_info(mount_t mp,disk_conditioner_info * uinfo)186 disk_conditioner_get_info(mount_t mp, disk_conditioner_info *uinfo)
187 {
188 	struct _disk_conditioner_info_t *info;
189 
190 	if (!mp) {
191 		return EINVAL;
192 	}
193 
194 	info = mp->mnt_disk_conditioner_info;
195 
196 	if (info) {
197 		memcpy(uinfo, &(info->dcinfo), sizeof(disk_conditioner_info));
198 	}
199 
200 	return 0;
201 }
202 
203 static inline void
disk_conditioner_restore_mount_fields(mount_t mp,struct saved_mount_fields * mnt_fields)204 disk_conditioner_restore_mount_fields(mount_t mp, struct saved_mount_fields *mnt_fields)
205 {
206 	mp->mnt_maxreadcnt = mnt_fields->mnt_maxreadcnt;
207 	mp->mnt_maxwritecnt = mnt_fields->mnt_maxwritecnt;
208 	mp->mnt_segreadcnt = mnt_fields->mnt_segreadcnt;
209 	mp->mnt_segwritecnt = mnt_fields->mnt_segwritecnt;
210 	mp->mnt_ioqueue_depth = mnt_fields->mnt_ioqueue_depth;
211 	mp->mnt_ioscale = mnt_fields->mnt_ioscale;
212 }
213 
214 int
disk_conditioner_set_info(mount_t mp,disk_conditioner_info * uinfo)215 disk_conditioner_set_info(mount_t mp, disk_conditioner_info *uinfo)
216 {
217 	struct _disk_conditioner_info_t *internal_info;
218 	disk_conditioner_info *info;
219 	struct saved_mount_fields *mnt_fields;
220 
221 	if (!kauth_cred_issuser(kauth_cred_get()) || !IOCurrentTaskHasEntitlement(DISK_CONDITIONER_SET_ENTITLEMENT)) {
222 		return EPERM;
223 	}
224 
225 	if (!mp) {
226 		return EINVAL;
227 	}
228 
229 	mount_lock(mp);
230 
231 	internal_info = mp->mnt_disk_conditioner_info;
232 	if (!internal_info) {
233 		internal_info = kalloc_type(struct _disk_conditioner_info_t,
234 		    Z_WAITOK | Z_ZERO);
235 		mp->mnt_disk_conditioner_info = internal_info;
236 		mnt_fields = &(internal_info->mnt_fields);
237 
238 		/* save mount_t fields for restoration later */
239 		mnt_fields->mnt_maxreadcnt = mp->mnt_maxreadcnt;
240 		mnt_fields->mnt_maxwritecnt = mp->mnt_maxwritecnt;
241 		mnt_fields->mnt_segreadcnt = mp->mnt_segreadcnt;
242 		mnt_fields->mnt_segwritecnt = mp->mnt_segwritecnt;
243 		mnt_fields->mnt_ioqueue_depth = mp->mnt_ioqueue_depth;
244 		mnt_fields->mnt_ioscale = mp->mnt_ioscale;
245 	}
246 
247 	info = &(internal_info->dcinfo);
248 	mnt_fields = &(internal_info->mnt_fields);
249 
250 	if (!uinfo->enabled && info->enabled) {
251 		/* disk conditioner is being disabled when already enabled */
252 		disk_conditioner_restore_mount_fields(mp, mnt_fields);
253 	}
254 
255 	memcpy(info, uinfo, sizeof(disk_conditioner_info));
256 
257 	/* scale back based on hardware advertised limits */
258 	if (uinfo->ioqueue_depth == 0 || uinfo->ioqueue_depth > mnt_fields->mnt_ioqueue_depth) {
259 		info->ioqueue_depth = mnt_fields->mnt_ioqueue_depth;
260 	}
261 	if (uinfo->maxreadcnt == 0 || uinfo->maxreadcnt > mnt_fields->mnt_maxreadcnt) {
262 		info->maxreadcnt = mnt_fields->mnt_maxreadcnt;
263 	}
264 	if (uinfo->maxwritecnt == 0 || uinfo->maxwritecnt > mnt_fields->mnt_maxwritecnt) {
265 		info->maxwritecnt = mnt_fields->mnt_maxwritecnt;
266 	}
267 	if (uinfo->segreadcnt == 0 || uinfo->segreadcnt > mnt_fields->mnt_segreadcnt) {
268 		info->segreadcnt = mnt_fields->mnt_segreadcnt;
269 	}
270 	if (uinfo->segwritecnt == 0 || uinfo->segwritecnt > mnt_fields->mnt_segwritecnt) {
271 		info->segwritecnt = mnt_fields->mnt_segwritecnt;
272 	}
273 
274 	if (uinfo->enabled) {
275 		mp->mnt_maxreadcnt = info->maxreadcnt;
276 		mp->mnt_maxwritecnt = info->maxwritecnt;
277 		mp->mnt_segreadcnt = info->segreadcnt;
278 		mp->mnt_segwritecnt = info->segwritecnt;
279 		mp->mnt_ioqueue_depth = info->ioqueue_depth;
280 		mp->mnt_ioscale = MNT_IOSCALE(info->ioqueue_depth);
281 	}
282 
283 	mount_unlock(mp);
284 
285 	microuptime(&internal_info->last_io_timestamp);
286 
287 	// make sure throttling picks up the new periods
288 	throttle_info_mount_reset_period(mp, info->is_ssd);
289 
290 	return 0;
291 }
292 
293 void
disk_conditioner_unmount(mount_t mp)294 disk_conditioner_unmount(mount_t mp)
295 {
296 	struct _disk_conditioner_info_t *internal_info = mp->mnt_disk_conditioner_info;
297 
298 	if (!internal_info) {
299 		return;
300 	}
301 
302 	if (internal_info->dcinfo.enabled) {
303 		disk_conditioner_restore_mount_fields(mp, &(internal_info->mnt_fields));
304 	}
305 	mp->mnt_disk_conditioner_info = NULL;
306 	kfree_type(struct _disk_conditioner_info_t, internal_info);
307 }
308 
309 boolean_t
disk_conditioner_mount_is_ssd(mount_t mp)310 disk_conditioner_mount_is_ssd(mount_t mp)
311 {
312 	struct _disk_conditioner_info_t *internal_info = mp->mnt_disk_conditioner_info;
313 
314 	if (!internal_info || !internal_info->dcinfo.enabled) {
315 		if (mp->mnt_kern_flag & MNTK_SSD) {
316 			return TRUE;
317 		}
318 		return FALSE;
319 	}
320 
321 	return internal_info->dcinfo.is_ssd;
322 }
323