xref: /xnu-11417.121.6/bsd/netinet/in_tclass.c (revision a1e26a70f38d1d7daa7b49b258e2f8538ad81650)
1 /*
2  * Copyright (c) 2009-2024 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 #include <sys/systm.h>
30 #include <sys/kernel.h>
31 #include <sys/types.h>
32 #include <sys/filedesc.h>
33 #include <sys/file_internal.h>
34 #include <sys/proc.h>
35 #include <sys/socket.h>
36 #include <sys/socketvar.h>
37 #include <sys/errno.h>
38 #include <sys/protosw.h>
39 #include <sys/domain.h>
40 #include <sys/mbuf.h>
41 #include <sys/queue.h>
42 #include <sys/sysctl.h>
43 #include <sys/sysproto.h>
44 
45 #include <net/if.h>
46 #include <net/if_var.h>
47 #include <net/route.h>
48 
49 #include <netinet/in.h>
50 #include <netinet/in_var.h>
51 #include <netinet/in_pcb.h>
52 #include <netinet/ip.h>
53 #include <netinet/ip_var.h>
54 #include <netinet/ip6.h>
55 #include <netinet6/ip6_var.h>
56 #include <netinet/udp.h>
57 #include <netinet/udp_var.h>
58 #include <netinet/tcp.h>
59 #include <netinet/tcp_var.h>
60 #include <netinet/tcp_cc.h>
61 #include <netinet/in_tclass.h>
62 
63 #include <os/log.h>
64 
65 static_assert(_SO_TC_MAX == SO_TC_STATS_MAX);
66 
67 /*
68  * The size is one more than the max because DSCP start at zero
69  */
70 #define DSCP_ARRAY_SIZE (_MAX_DSCP + 1)
71 
72 struct net_qos_dscp_map {
73 	uint8_t        sotc_to_dscp[SO_TC_MAX];
74 	uint8_t        netsvctype_to_dscp[_NET_SERVICE_TYPE_COUNT];
75 };
76 
77 struct dcsp_msc_map {
78 	uint8_t                 dscp;
79 	mbuf_svc_class_t        msc;
80 };
81 static inline int so_throttle_best_effort(struct socket *, struct ifnet *);
82 static void set_dscp_to_wifi_ac_map(const struct dcsp_msc_map *__indexable, int);
83 static errno_t dscp_msc_map_from_netsvctype_dscp_map(struct netsvctype_dscp_map *__counted_by(count) map,
84     size_t count, struct dcsp_msc_map *__counted_by(DSCP_ARRAY_SIZE));
85 
86 SYSCTL_NODE(_net, OID_AUTO, qos,
87     CTLFLAG_RW | CTLFLAG_LOCKED, 0, "QoS");
88 
89 static int sysctl_default_netsvctype_to_dscp_map SYSCTL_HANDLER_ARGS;
90 SYSCTL_PROC(_net_qos, OID_AUTO, default_netsvctype_to_dscp_map,
91     CTLTYPE_STRUCT | CTLFLAG_RW | CTLFLAG_LOCKED,
92     0, 0, sysctl_default_netsvctype_to_dscp_map, "S", "");
93 
94 static int sysctl_dscp_to_wifi_ac_map SYSCTL_HANDLER_ARGS;
95 SYSCTL_PROC(_net_qos, OID_AUTO, dscp_to_wifi_ac_map,
96     CTLTYPE_STRUCT | CTLFLAG_RW | CTLFLAG_LOCKED,
97     0, 0, sysctl_dscp_to_wifi_ac_map, "S", "");
98 
99 static int sysctl_reset_dscp_to_wifi_ac_map SYSCTL_HANDLER_ARGS;
100 SYSCTL_PROC(_net_qos, OID_AUTO, reset_dscp_to_wifi_ac_map,
101     CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED,
102     0, 0, sysctl_reset_dscp_to_wifi_ac_map, "I", "");
103 
104 int net_qos_verbose = 0;
105 SYSCTL_INT(_net_qos, OID_AUTO, verbose,
106     CTLFLAG_RW | CTLFLAG_LOCKED, &net_qos_verbose, 0, "");
107 
108 /*
109  * Fastlane QoS policy:
110  * By Default allow all apps to get traffic class to DSCP mapping
111  */
112 SYSCTL_NODE(_net_qos, OID_AUTO, policy,
113     CTLFLAG_RW | CTLFLAG_LOCKED, 0, "");
114 
115 int net_qos_policy_restricted = 0;
116 SYSCTL_INT(_net_qos_policy, OID_AUTO, restricted,
117     CTLFLAG_RW | CTLFLAG_LOCKED, &net_qos_policy_restricted, 0, "");
118 
119 int net_qos_policy_restrict_avapps = 0;
120 SYSCTL_INT(_net_qos_policy, OID_AUTO, restrict_avapps,
121     CTLFLAG_RW | CTLFLAG_LOCKED, &net_qos_policy_restrict_avapps, 0, "");
122 
123 int net_qos_policy_wifi_enabled = 0;
124 SYSCTL_INT(_net_qos_policy, OID_AUTO, wifi_enabled,
125     CTLFLAG_RW | CTLFLAG_LOCKED, &net_qos_policy_wifi_enabled, 0, "");
126 
127 int net_qos_policy_capable_enabled = 0;
128 SYSCTL_INT(_net_qos_policy, OID_AUTO, capable_enabled,
129     CTLFLAG_RW | CTLFLAG_LOCKED, &net_qos_policy_capable_enabled, 0, "");
130 
131 /*
132  * Socket traffic class from network service type
133  */
134 const int sotc_by_netservicetype[_NET_SERVICE_TYPE_COUNT] = {
135 	SO_TC_BE,       /* NET_SERVICE_TYPE_BE */
136 	SO_TC_BK,       /* NET_SERVICE_TYPE_BK */
137 	SO_TC_VI,       /* NET_SERVICE_TYPE_SIG */
138 	SO_TC_VI,       /* NET_SERVICE_TYPE_VI */
139 	SO_TC_VO,       /* NET_SERVICE_TYPE_VO */
140 	SO_TC_RV,       /* NET_SERVICE_TYPE_RV */
141 	SO_TC_AV,       /* NET_SERVICE_TYPE_AV */
142 	SO_TC_OAM,      /* NET_SERVICE_TYPE_OAM */
143 	SO_TC_RD        /* NET_SERVICE_TYPE_RD */
144 };
145 
146 /*
147  * DSCP mappings for QoS Fastlane as based on network service types
148  */
149 static const
150 struct netsvctype_dscp_map fastlane_netsvctype_dscp_map[_NET_SERVICE_TYPE_COUNT] = {
151 	{ .netsvctype = NET_SERVICE_TYPE_BE, .dscp = _DSCP_DF },
152 	{ .netsvctype = NET_SERVICE_TYPE_BK, .dscp = _DSCP_AF11 },
153 	{ .netsvctype = NET_SERVICE_TYPE_SIG, .dscp = _DSCP_CS3 },
154 	{ .netsvctype = NET_SERVICE_TYPE_VI, .dscp = _DSCP_AF41 },
155 	{ .netsvctype = NET_SERVICE_TYPE_VO, .dscp = _DSCP_EF },
156 	{ .netsvctype = NET_SERVICE_TYPE_RV, .dscp = _DSCP_CS4 },
157 	{ .netsvctype = NET_SERVICE_TYPE_AV, .dscp = _DSCP_AF31 },
158 	{ .netsvctype = NET_SERVICE_TYPE_OAM, .dscp = _DSCP_CS2 },
159 	{ .netsvctype = NET_SERVICE_TYPE_RD, .dscp = _DSCP_AF21 },
160 };
161 
162 /*
163  * DSCP mappings for QoS RFC4594 as based on network service types
164  */
165 static const
166 struct netsvctype_dscp_map rfc4594_netsvctype_dscp_map[_NET_SERVICE_TYPE_COUNT] = {
167 	{ .netsvctype = NET_SERVICE_TYPE_BE, .dscp = _DSCP_DF },
168 	{ .netsvctype = NET_SERVICE_TYPE_BK, .dscp = _DSCP_CS1 },
169 	{ .netsvctype = NET_SERVICE_TYPE_SIG, .dscp = _DSCP_CS5 },
170 	{ .netsvctype = NET_SERVICE_TYPE_VI, .dscp = _DSCP_AF41 },
171 	{ .netsvctype = NET_SERVICE_TYPE_VO, .dscp = _DSCP_EF },
172 	{ .netsvctype = NET_SERVICE_TYPE_RV, .dscp = _DSCP_CS4 },
173 	{ .netsvctype = NET_SERVICE_TYPE_AV, .dscp = _DSCP_AF31 },
174 	{ .netsvctype = NET_SERVICE_TYPE_OAM, .dscp = _DSCP_CS2 },
175 	{ .netsvctype = NET_SERVICE_TYPE_RD, .dscp = _DSCP_AF21 },
176 };
177 
178 static struct net_qos_dscp_map fastlane_net_qos_dscp_map;
179 static struct net_qos_dscp_map rfc4594_net_qos_dscp_map;
180 #if (DEBUG || DEVELOPMENT)
181 static struct net_qos_dscp_map custom_net_qos_dscp_map;
182 #endif /* (DEBUG || DEVELOPMENT) */
183 
184 
185 /*
186  * The DSCP to UP mapping (via mbuf service class) for WiFi follows is the mapping
187  * that implemented at the 802.11 driver level when the mbuf service class is
188  * MBUF_SC_BE.
189  *
190  * This clashes with the recommended mapping documented by the IETF document
191  * draft-szigeti-tsvwg-ieee-802-11e-01.txt but we keep the mapping to maintain
192  * binary compatibility. Applications should use the network service type socket
193  * option instead to select L2 QoS marking instead of IP_TOS or IPV6_TCLASS.
194  */
195 static const struct dcsp_msc_map default_dscp_to_wifi_ac_map[] = {
196 	{ .dscp = _DSCP_DF, .msc = MBUF_SC_BE },        /* RFC 2474 Standard */
197 	{ .dscp = 1, .msc = MBUF_SC_BE },               /*  */
198 	{ .dscp = 2, .msc = MBUF_SC_BE },               /*  */
199 	{ .dscp = 3, .msc = MBUF_SC_BE },               /*  */
200 	{ .dscp = 4, .msc = MBUF_SC_BE },               /*  */
201 	{ .dscp = 5, .msc = MBUF_SC_BE },               /*  */
202 	{ .dscp = 6, .msc = MBUF_SC_BE },               /*  */
203 	{ .dscp = 7, .msc = MBUF_SC_BE },               /*  */
204 
205 	{ .dscp = _DSCP_CS1, .msc = MBUF_SC_BK },       /* RFC 3662 Low-Priority Data */
206 	{ .dscp = 9, .msc = MBUF_SC_BK },               /*  */
207 	{ .dscp = _DSCP_AF11, .msc = MBUF_SC_BK },      /* RFC 2597 High-Throughput Data */
208 	{ .dscp = 11, .msc = MBUF_SC_BK },              /*  */
209 	{ .dscp = _DSCP_AF12, .msc = MBUF_SC_BK },      /* RFC 2597 High-Throughput Data */
210 	{ .dscp = 13, .msc = MBUF_SC_BK },              /*  */
211 	{ .dscp = _DSCP_AF13, .msc = MBUF_SC_BK },      /* RFC 2597 High-Throughput Data */
212 	{ .dscp = 15, .msc = MBUF_SC_BK },              /*  */
213 
214 	{ .dscp = _DSCP_CS2, .msc = MBUF_SC_BK },       /* RFC 4594 OAM */
215 	{ .dscp = 17, .msc = MBUF_SC_BK },              /*  */
216 	{ .dscp = _DSCP_AF21, .msc = MBUF_SC_BK },      /* RFC 2597 Low-Latency Data */
217 	{ .dscp = 19, .msc = MBUF_SC_BK },              /*  */
218 	{ .dscp = _DSCP_AF22, .msc = MBUF_SC_BK },      /* RFC 2597 Low-Latency Data */
219 	{ .dscp = 21, .msc = MBUF_SC_BK },              /*  */
220 	{ .dscp = _DSCP_AF23, .msc = MBUF_SC_BK },      /* RFC 2597 Low-Latency Data */
221 	{ .dscp = 23, .msc = MBUF_SC_BK },              /*  */
222 
223 	{ .dscp = _DSCP_CS3, .msc = MBUF_SC_BE },       /* RFC 2474 Broadcast Video */
224 	{ .dscp = 25, .msc = MBUF_SC_BE },              /*  */
225 	{ .dscp = _DSCP_AF31, .msc = MBUF_SC_BE },      /* RFC 2597 Multimedia Streaming */
226 	{ .dscp = 27, .msc = MBUF_SC_BE },              /*  */
227 	{ .dscp = _DSCP_AF32, .msc = MBUF_SC_BE },      /* RFC 2597 Multimedia Streaming */
228 	{ .dscp = 29, .msc = MBUF_SC_BE },              /*  */
229 	{ .dscp = _DSCP_AF33, .msc = MBUF_SC_BE },      /* RFC 2597 Multimedia Streaming */
230 	{ .dscp = 31, .msc = MBUF_SC_BE },              /*  */
231 
232 	{ .dscp = _DSCP_CS4, .msc = MBUF_SC_VI },       /* RFC 2474 Real-Time Interactive */
233 	{ .dscp = 33, .msc = MBUF_SC_VI },              /*  */
234 	{ .dscp = _DSCP_AF41, .msc = MBUF_SC_VI },      /* RFC 2597 Multimedia Conferencing */
235 	{ .dscp = 35, .msc = MBUF_SC_VI },              /*  */
236 	{ .dscp = _DSCP_AF42, .msc = MBUF_SC_VI },      /* RFC 2597 Multimedia Conferencing */
237 	{ .dscp = 37, .msc = MBUF_SC_VI },              /*  */
238 	{ .dscp = _DSCP_AF43, .msc = MBUF_SC_VI },      /* RFC 2597 Multimedia Conferencing */
239 	{ .dscp = 39, .msc = MBUF_SC_VI },              /*  */
240 
241 	{ .dscp = _DSCP_CS5, .msc = MBUF_SC_VI },       /* RFC 2474 Signaling */
242 	{ .dscp = 41, .msc = MBUF_SC_VI },              /*  */
243 	{ .dscp = 42, .msc = MBUF_SC_VI },              /*  */
244 	{ .dscp = 43, .msc = MBUF_SC_VI },              /*  */
245 	{ .dscp = _DSCP_VA, .msc = MBUF_SC_VI },        /* RFC 5865 VOICE-ADMIT */
246 	{ .dscp = 45, .msc = MBUF_SC_VI },              /*  */
247 	{ .dscp = _DSCP_EF, .msc = MBUF_SC_VI },        /* RFC 3246 Telephony */
248 	{ .dscp = 47, .msc = MBUF_SC_VI },              /*  */
249 
250 	{ .dscp = _DSCP_CS6, .msc = MBUF_SC_VO },       /* Wi-Fi WMM Certification: Chariot */
251 	{ .dscp = 49, .msc = MBUF_SC_VO },              /*  */
252 	{ .dscp = 50, .msc = MBUF_SC_VO },              /*  */
253 	{ .dscp = 51, .msc = MBUF_SC_VO },              /*  */
254 	{ .dscp = 52, .msc = MBUF_SC_VO },              /* Wi-Fi WMM Certification: Sigma */
255 	{ .dscp = 53, .msc = MBUF_SC_VO },              /*  */
256 	{ .dscp = 54, .msc = MBUF_SC_VO },              /*  */
257 	{ .dscp = 55, .msc = MBUF_SC_VO },              /*  */
258 
259 	{ .dscp = _DSCP_CS7, .msc = MBUF_SC_VO },       /* Wi-Fi WMM Certification: Chariot */
260 	{ .dscp = 57, .msc = MBUF_SC_VO },              /*  */
261 	{ .dscp = 58, .msc = MBUF_SC_VO },              /*  */
262 	{ .dscp = 59, .msc = MBUF_SC_VO },              /*  */
263 	{ .dscp = 60, .msc = MBUF_SC_VO },              /*  */
264 	{ .dscp = 61, .msc = MBUF_SC_VO },              /*  */
265 	{ .dscp = 62, .msc = MBUF_SC_VO },              /*  */
266 	{ .dscp = 63, .msc = MBUF_SC_VO },              /*  */
267 
268 	{ .dscp = 255, .msc = MBUF_SC_UNSPEC }          /* invalid DSCP to mark last entry */
269 };
270 
271 mbuf_svc_class_t wifi_dscp_to_msc_array[DSCP_ARRAY_SIZE];
272 
273 /*
274  * If there is no foreground activity on the interface for bg_switch_time
275  * seconds, the background connections can switch to foreground TCP
276  * congestion control.
277  */
278 #define TCP_BG_SWITCH_TIME 2 /* seconds */
279 
280 #if (DEVELOPMENT || DEBUG)
281 
282 static LCK_GRP_DECLARE(tclass_lck_grp, "tclass");
283 static LCK_MTX_DECLARE(tclass_lock, &tclass_lck_grp);
284 
285 static int tfp_count = 0;
286 
287 static TAILQ_HEAD(, tclass_for_proc) tfp_head =
288     TAILQ_HEAD_INITIALIZER(tfp_head);
289 
290 struct tclass_for_proc {
291 	TAILQ_ENTRY(tclass_for_proc)    tfp_link;
292 	int             tfp_class;
293 	pid_t           tfp_pid;
294 	char            tfp_pname[(2 * MAXCOMLEN) + 1];
295 	uint32_t        tfp_qos_mode;
296 };
297 
/* Forward declarations for the DEVELOPMENT/DEBUG traffic class overrides. */
static int get_pid_tclass(struct so_tcdbg *so_tcdbg);
static int get_pname_tclass(struct so_tcdbg *so_tcdbg);
static int set_pid_tclass(struct so_tcdbg *so_tcdbg);
static int set_pname_tclass(struct so_tcdbg *so_tcdbg);
static int flush_pid_tclass(struct so_tcdbg *so_tcdbg);
static int purge_tclass_for_proc(void);
static int flush_tclass_for_proc(void);
static void set_tclass_for_curr_proc(struct socket *so);
306 
307 /*
308  * Must be called with tclass_lock held
309  */
310 static struct tclass_for_proc *
find_tfp_by_pid(pid_t pid)311 find_tfp_by_pid(pid_t pid)
312 {
313 	struct tclass_for_proc *tfp;
314 
315 	TAILQ_FOREACH(tfp, &tfp_head, tfp_link) {
316 		if (tfp->tfp_pid == pid) {
317 			break;
318 		}
319 	}
320 	return tfp;
321 }
322 
323 /*
324  * Must be called with tclass_lock held
325  */
326 static struct tclass_for_proc *
find_tfp_by_pname(const char * pname)327 find_tfp_by_pname(const char *pname)
328 {
329 	struct tclass_for_proc *tfp;
330 
331 	TAILQ_FOREACH(tfp, &tfp_head, tfp_link) {
332 		if (strlcmp(tfp->tfp_pname, pname,
333 		    sizeof(tfp->tfp_pname)) == 0) {
334 			break;
335 		}
336 	}
337 	return tfp;
338 }
339 
340 __private_extern__ void
set_tclass_for_curr_proc(struct socket * so)341 set_tclass_for_curr_proc(struct socket *so)
342 {
343 	struct tclass_for_proc *tfp = NULL;
344 	proc_t p = current_proc();      /* Not ref counted */
345 	pid_t pid = proc_pid(p);
346 	const char *__null_terminated pname = proc_best_name(p);
347 
348 	lck_mtx_lock(&tclass_lock);
349 
350 	TAILQ_FOREACH(tfp, &tfp_head, tfp_link) {
351 		if ((tfp->tfp_pid == pid) || (tfp->tfp_pid == -1 &&
352 		    strlcmp(tfp->tfp_pname, pname,
353 		    sizeof(tfp->tfp_pname)) == 0)) {
354 			if (tfp->tfp_class != SO_TC_UNSPEC) {
355 				so->so_traffic_class = (uint16_t)tfp->tfp_class;
356 			}
357 
358 			if (tfp->tfp_qos_mode == QOS_MODE_MARKING_POLICY_ENABLE) {
359 				so->so_flags1 |= SOF1_QOSMARKING_ALLOWED;
360 			} else if (tfp->tfp_qos_mode == QOS_MODE_MARKING_POLICY_DISABLE) {
361 				so->so_flags1 &= ~SOF1_QOSMARKING_ALLOWED;
362 			}
363 			break;
364 		}
365 	}
366 
367 	lck_mtx_unlock(&tclass_lock);
368 }
369 
370 /*
371  * Purge entries with PIDs of exited processes
372  */
373 int
purge_tclass_for_proc(void)374 purge_tclass_for_proc(void)
375 {
376 	int error = 0;
377 	struct tclass_for_proc *tfp, *tvar;
378 
379 	lck_mtx_lock(&tclass_lock);
380 
381 	TAILQ_FOREACH_SAFE(tfp, &tfp_head, tfp_link, tvar) {
382 		proc_t p;
383 
384 		if (tfp->tfp_pid == -1) {
385 			continue;
386 		}
387 		if ((p = proc_find(tfp->tfp_pid)) == NULL) {
388 			tfp_count--;
389 			TAILQ_REMOVE(&tfp_head, tfp, tfp_link);
390 
391 			kfree_type(struct tclass_for_proc, tfp);
392 		} else {
393 			proc_rele(p);
394 		}
395 	}
396 
397 	lck_mtx_unlock(&tclass_lock);
398 
399 	return error;
400 }
401 
402 /*
403  * Remove one entry
404  * Must be called with tclass_lock held
405  */
406 static void
free_tclass_for_proc(struct tclass_for_proc * tfp)407 free_tclass_for_proc(struct tclass_for_proc *tfp)
408 {
409 	if (tfp == NULL) {
410 		return;
411 	}
412 	tfp_count--;
413 	TAILQ_REMOVE(&tfp_head, tfp, tfp_link);
414 	kfree_type(struct tclass_for_proc, tfp);
415 }
416 
417 /*
418  * Remove all entries
419  */
420 int
flush_tclass_for_proc(void)421 flush_tclass_for_proc(void)
422 {
423 	int error = 0;
424 	struct tclass_for_proc *tfp, *tvar;
425 
426 	lck_mtx_lock(&tclass_lock);
427 
428 	TAILQ_FOREACH_SAFE(tfp, &tfp_head, tfp_link, tvar) {
429 		free_tclass_for_proc(tfp);
430 	}
431 
432 	lck_mtx_unlock(&tclass_lock);
433 
434 	return error;
435 }
436 
437 /*
438  * Must be called with tclass_lock held
439  */
440 static struct tclass_for_proc *
alloc_tclass_for_proc(pid_t pid,const char * __sized_by (pnamelen)pname,size_t pnamelen)441 alloc_tclass_for_proc(pid_t pid, const char *__sized_by(pnamelen) pname, size_t pnamelen)
442 {
443 	struct tclass_for_proc *tfp;
444 
445 	if (pid == -1 && pname == NULL) {
446 		return NULL;
447 	}
448 
449 	tfp = kalloc_type(struct tclass_for_proc, Z_NOWAIT | Z_ZERO);
450 	if (tfp == NULL) {
451 		return NULL;
452 	}
453 
454 	tfp->tfp_pid = pid;
455 	/*
456 	 * Add per pid entries before per proc name so we can find
457 	 * a specific instance of a process before the general name base entry.
458 	 */
459 	if (pid != -1) {
460 		TAILQ_INSERT_HEAD(&tfp_head, tfp, tfp_link);
461 	} else {
462 		if (pname != NULL) {
463 			strbufcpy(tfp->tfp_pname, sizeof(tfp->tfp_pname),
464 			    pname, pnamelen);
465 		} else {
466 			tfp->tfp_pname[0] = '\0';
467 		}
468 		TAILQ_INSERT_TAIL(&tfp_head, tfp, tfp_link);
469 	}
470 
471 	tfp_count++;
472 
473 	return tfp;
474 }
475 
476 /*
477  * SO_TC_UNSPEC for tclass means to remove the entry
478  */
479 int
set_pid_tclass(struct so_tcdbg * so_tcdbg)480 set_pid_tclass(struct so_tcdbg *so_tcdbg)
481 {
482 	int error = EINVAL;
483 	proc_t p = NULL;
484 	struct tclass_for_proc *tfp;
485 	pid_t pid = so_tcdbg->so_tcdbg_pid;
486 	int tclass = so_tcdbg->so_tcdbg_tclass;
487 	int netsvctype = so_tcdbg->so_tcdbg_netsvctype;
488 	uint8_t ecn_val = so_tcdbg->so_tcdbg_ecn_val;
489 
490 	p = proc_find(pid);
491 	if (p == NULL) {
492 		printf("%s proc_find(%d) failed\n", __func__, pid);
493 		goto done;
494 	}
495 
496 	/* Need a tfp */
497 	lck_mtx_lock(&tclass_lock);
498 
499 	tfp = find_tfp_by_pid(pid);
500 	if (tfp == NULL) {
501 		tfp = alloc_tclass_for_proc(pid, NULL, 0);
502 		if (tfp == NULL) {
503 			error = ENOBUFS;
504 			goto done_unlock;
505 		}
506 	}
507 	tfp->tfp_class = tclass;
508 	tfp->tfp_qos_mode = so_tcdbg->so_tcbbg_qos_mode;
509 
510 	if (tfp != NULL) {
511 		struct fileproc *fp;
512 		proc_fdlock(p);
513 		fdt_foreach(fp, p) {
514 			struct socket *so;
515 
516 			if (FILEGLOB_DTYPE(fp->fp_glob) != DTYPE_SOCKET) {
517 				continue;
518 			}
519 
520 			so = (struct socket *)fp_get_data(fp);
521 			if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
522 				continue;
523 			}
524 
525 			socket_lock(so, 1);
526 			if (tfp->tfp_qos_mode == QOS_MODE_MARKING_POLICY_ENABLE) {
527 				so->so_flags1 |= SOF1_QOSMARKING_ALLOWED;
528 			} else if (tfp->tfp_qos_mode == QOS_MODE_MARKING_POLICY_DISABLE) {
529 				so->so_flags1 &= ~SOF1_QOSMARKING_ALLOWED;
530 			}
531 
532 			struct inpcb *inp = so ? sotoinpcb(so) : NULL;
533 			struct tcpcb *tp = inp ? intotcpcb(inp) : NULL;
534 
535 			if (tp != NULL) {
536 				if (ecn_val == IPTOS_ECN_ECT1 || ecn_val == IPTOS_ECN_ECT0) {
537 					tp->ecn_flags |= (ecn_val == IPTOS_ECN_ECT1) ?
538 					    TE_FORCE_ECT1 : TE_FORCE_ECT0;
539 				} else {
540 					tp->ecn_flags &= ~(TE_FORCE_ECT1 | TE_FORCE_ECT0);
541 				}
542 			}
543 			socket_unlock(so, 1);
544 
545 			if (netsvctype != _NET_SERVICE_TYPE_UNSPEC) {
546 				error = sock_setsockopt(so, SOL_SOCKET,
547 				    SO_NET_SERVICE_TYPE, &netsvctype, sizeof(int));
548 			}
549 			if (tclass != SO_TC_UNSPEC) {
550 				error = sock_setsockopt(so, SOL_SOCKET,
551 				    SO_TRAFFIC_CLASS, &tclass, sizeof(int));
552 			}
553 		}
554 
555 		proc_fdunlock(p);
556 	}
557 
558 	error = 0;
559 done_unlock:
560 	lck_mtx_unlock(&tclass_lock);
561 done:
562 	if (p != NULL) {
563 		proc_rele(p);
564 	}
565 
566 	return error;
567 }
568 
569 int
set_pname_tclass(struct so_tcdbg * so_tcdbg)570 set_pname_tclass(struct so_tcdbg *so_tcdbg)
571 {
572 	int error = EINVAL;
573 	struct tclass_for_proc *tfp;
574 
575 	lck_mtx_lock(&tclass_lock);
576 
577 	tfp = find_tfp_by_pname(__unsafe_null_terminated_from_indexable(so_tcdbg->so_tcdbg_pname));
578 	if (tfp == NULL) {
579 		tfp = alloc_tclass_for_proc(-1, so_tcdbg->so_tcdbg_pname,
580 		    sizeof(so_tcdbg->so_tcdbg_pname));
581 		if (tfp == NULL) {
582 			lck_mtx_unlock(&tclass_lock);
583 			error = ENOBUFS;
584 			goto done;
585 		}
586 	}
587 	tfp->tfp_class = so_tcdbg->so_tcdbg_tclass;
588 	tfp->tfp_qos_mode = so_tcdbg->so_tcbbg_qos_mode;
589 
590 	lck_mtx_unlock(&tclass_lock);
591 
592 	error = 0;
593 done:
594 
595 	return error;
596 }
597 
598 static int
flush_pid_tclass(struct so_tcdbg * so_tcdbg)599 flush_pid_tclass(struct so_tcdbg *so_tcdbg)
600 {
601 	pid_t pid = so_tcdbg->so_tcdbg_pid;
602 	int tclass = so_tcdbg->so_tcdbg_tclass;
603 	struct fileproc *fp;
604 	proc_t p;
605 	int error;
606 
607 	p = proc_find(pid);
608 	if (p == PROC_NULL) {
609 		printf("%s proc_find(%d) failed\n", __func__, pid);
610 		return EINVAL;
611 	}
612 
613 	proc_fdlock(p);
614 
615 	fdt_foreach(fp, p) {
616 		struct socket *so;
617 
618 		if (FILEGLOB_DTYPE(fp->fp_glob) != DTYPE_SOCKET) {
619 			continue;
620 		}
621 
622 		so = (struct socket *)fp_get_data(fp);
623 		error = sock_setsockopt(so, SOL_SOCKET, SO_FLUSH, &tclass,
624 		    sizeof(tclass));
625 		if (error != 0) {
626 			printf("%s: setsockopt(SO_FLUSH) (so=0x%llx, fd=%d, "
627 			    "tclass=%d) failed %d\n", __func__,
628 			    (uint64_t)VM_KERNEL_ADDRPERM(so), fdt_foreach_fd(), tclass,
629 			    error);
630 		}
631 	}
632 
633 	proc_fdunlock(p);
634 
635 	proc_rele(p);
636 	return 0;
637 }
638 
639 int
get_pid_tclass(struct so_tcdbg * so_tcdbg)640 get_pid_tclass(struct so_tcdbg *so_tcdbg)
641 {
642 	int error = EINVAL;
643 	proc_t p = NULL;
644 	struct tclass_for_proc *tfp;
645 	pid_t pid = so_tcdbg->so_tcdbg_pid;
646 
647 	so_tcdbg->so_tcdbg_tclass = SO_TC_UNSPEC; /* Means not set */
648 
649 	p = proc_find(pid);
650 	if (p == NULL) {
651 		printf("%s proc_find(%d) failed\n", __func__, pid);
652 		goto done;
653 	}
654 
655 	/* Need a tfp */
656 	lck_mtx_lock(&tclass_lock);
657 
658 	tfp = find_tfp_by_pid(pid);
659 	if (tfp != NULL) {
660 		so_tcdbg->so_tcdbg_tclass = tfp->tfp_class;
661 		so_tcdbg->so_tcbbg_qos_mode = tfp->tfp_qos_mode;
662 		error = 0;
663 	}
664 	lck_mtx_unlock(&tclass_lock);
665 done:
666 	if (p != NULL) {
667 		proc_rele(p);
668 	}
669 
670 	return error;
671 }
672 
673 int
get_pname_tclass(struct so_tcdbg * so_tcdbg)674 get_pname_tclass(struct so_tcdbg *so_tcdbg)
675 {
676 	int error = EINVAL;
677 	struct tclass_for_proc *tfp;
678 
679 	so_tcdbg->so_tcdbg_tclass = SO_TC_UNSPEC; /* Means not set */
680 
681 	/* Need a tfp */
682 	lck_mtx_lock(&tclass_lock);
683 
684 	tfp = find_tfp_by_pname(__unsafe_null_terminated_from_indexable(so_tcdbg->so_tcdbg_pname));
685 	if (tfp != NULL) {
686 		so_tcdbg->so_tcdbg_tclass = tfp->tfp_class;
687 		so_tcdbg->so_tcbbg_qos_mode = tfp->tfp_qos_mode;
688 		error = 0;
689 	}
690 	lck_mtx_unlock(&tclass_lock);
691 
692 	return error;
693 }
694 
695 static int
delete_tclass_for_pid_pname(struct so_tcdbg * so_tcdbg)696 delete_tclass_for_pid_pname(struct so_tcdbg *so_tcdbg)
697 {
698 	int error = EINVAL;
699 	pid_t pid = so_tcdbg->so_tcdbg_pid;
700 	struct tclass_for_proc *tfp = NULL;
701 
702 	lck_mtx_lock(&tclass_lock);
703 
704 	if (pid != -1) {
705 		tfp = find_tfp_by_pid(pid);
706 	} else {
707 		tfp = find_tfp_by_pname(__unsafe_null_terminated_from_indexable(so_tcdbg->so_tcdbg_pname));
708 	}
709 
710 	if (tfp != NULL) {
711 		free_tclass_for_proc(tfp);
712 		error = 0;
713 	}
714 
715 	lck_mtx_unlock(&tclass_lock);
716 
717 	return error;
718 }
719 
720 /*
721  * Setting options requires privileges
722  */
723 __private_extern__ int
so_set_tcdbg(struct socket * so,struct so_tcdbg * so_tcdbg)724 so_set_tcdbg(struct socket *so, struct so_tcdbg *so_tcdbg)
725 {
726 	int error = 0;
727 
728 	if ((so->so_state & SS_PRIV) == 0) {
729 		return EPERM;
730 	}
731 
732 	socket_unlock(so, 0);
733 
734 	switch (so_tcdbg->so_tcdbg_cmd) {
735 	case SO_TCDBG_PID:
736 		error = set_pid_tclass(so_tcdbg);
737 		break;
738 
739 	case SO_TCDBG_PNAME:
740 		error = set_pname_tclass(so_tcdbg);
741 		break;
742 
743 	case SO_TCDBG_PURGE:
744 		error = purge_tclass_for_proc();
745 		break;
746 
747 	case SO_TCDBG_FLUSH:
748 		error = flush_tclass_for_proc();
749 		break;
750 
751 	case SO_TCDBG_DELETE:
752 		error = delete_tclass_for_pid_pname(so_tcdbg);
753 		break;
754 
755 	case SO_TCDBG_TCFLUSH_PID:
756 		error = flush_pid_tclass(so_tcdbg);
757 		break;
758 
759 	default:
760 		error = EINVAL;
761 		break;
762 	}
763 
764 	socket_lock(so, 0);
765 
766 	return error;
767 }
768 
769 /*
770  * Not required to be privileged to get
771  */
772 __private_extern__ int
sogetopt_tcdbg(struct socket * so,struct sockopt * sopt)773 sogetopt_tcdbg(struct socket *so, struct sockopt *sopt)
774 {
775 	int error = 0;
776 	struct so_tcdbg so_tcdbg;
777 	void *buf = NULL;
778 	size_t len = sopt->sopt_valsize;
779 
780 	error = sooptcopyin(sopt, &so_tcdbg, sizeof(struct so_tcdbg),
781 	    sizeof(struct so_tcdbg));
782 	if (error != 0) {
783 		return error;
784 	}
785 
786 	sopt->sopt_valsize = len;
787 
788 	socket_unlock(so, 0);
789 
790 	switch (so_tcdbg.so_tcdbg_cmd) {
791 	case SO_TCDBG_PID:
792 		error = get_pid_tclass(&so_tcdbg);
793 		break;
794 
795 	case SO_TCDBG_PNAME:
796 		error = get_pname_tclass(&so_tcdbg);
797 		break;
798 
799 	case SO_TCDBG_COUNT:
800 		lck_mtx_lock(&tclass_lock);
801 		so_tcdbg.so_tcdbg_count = tfp_count;
802 		lck_mtx_unlock(&tclass_lock);
803 		break;
804 
805 	case SO_TCDBG_LIST: {
806 		struct tclass_for_proc *tfp;
807 		int n, alloc_count;
808 		struct so_tcdbg *ptr;
809 
810 		lck_mtx_lock(&tclass_lock);
811 		if ((alloc_count = tfp_count) == 0) {
812 			lck_mtx_unlock(&tclass_lock);
813 			error = EINVAL;
814 			break;
815 		}
816 		len = alloc_count * sizeof(struct so_tcdbg);
817 		lck_mtx_unlock(&tclass_lock);
818 
819 		buf = kalloc_data(len, Z_WAITOK | Z_ZERO);
820 		if (buf == NULL) {
821 			error = ENOBUFS;
822 			break;
823 		}
824 
825 		lck_mtx_lock(&tclass_lock);
826 		n = 0;
827 		ptr = (struct so_tcdbg *)buf;
828 		TAILQ_FOREACH(tfp, &tfp_head, tfp_link) {
829 			if (++n > alloc_count) {
830 				break;
831 			}
832 			if (tfp->tfp_pid != -1) {
833 				ptr->so_tcdbg_cmd = SO_TCDBG_PID;
834 				ptr->so_tcdbg_pid = tfp->tfp_pid;
835 			} else {
836 				ptr->so_tcdbg_cmd = SO_TCDBG_PNAME;
837 				ptr->so_tcdbg_pid = -1;
838 				strbufcpy(ptr->so_tcdbg_pname,
839 				    tfp->tfp_pname);
840 			}
841 			ptr->so_tcdbg_tclass = tfp->tfp_class;
842 			ptr->so_tcbbg_qos_mode = tfp->tfp_qos_mode;
843 			ptr++;
844 		}
845 
846 		lck_mtx_unlock(&tclass_lock);
847 	}
848 	break;
849 
850 	default:
851 		error = EINVAL;
852 		break;
853 	}
854 
855 	socket_lock(so, 0);
856 
857 	if (error == 0) {
858 		if (buf == NULL) {
859 			error = sooptcopyout(sopt, &so_tcdbg,
860 			    sizeof(struct so_tcdbg));
861 		} else {
862 			error = sooptcopyout(sopt, buf, len);
863 			kfree_data(buf, len);
864 		}
865 	}
866 	return error;
867 }
868 
869 #endif /* (DEVELOPMENT || DEBUG) */
870 
871 int
so_get_netsvc_marking_level(struct socket * so)872 so_get_netsvc_marking_level(struct socket *so)
873 {
874 	int marking_level = NETSVC_MRKNG_UNKNOWN;
875 	struct ifnet *ifp = NULL;
876 
877 	switch (SOCK_DOM(so)) {
878 	case PF_INET: {
879 		struct inpcb *inp = sotoinpcb(so);
880 
881 		if (inp != NULL) {
882 			ifp = inp->inp_last_outifp;
883 		}
884 		break;
885 	}
886 	case PF_INET6: {
887 		struct in6pcb *in6p = sotoin6pcb(so);
888 
889 		if (in6p != NULL) {
890 			ifp = in6p->in6p_last_outifp;
891 		}
892 		break;
893 	}
894 	default:
895 		break;
896 	}
897 	if (ifp != NULL) {
898 		if ((ifp->if_eflags & IFEF_QOSMARKING_ENABLED) != 0) {
899 			if ((so->so_flags1 & SOF1_QOSMARKING_ALLOWED)) {
900 				marking_level = NETSVC_MRKNG_LVL_L3L2_ALL;
901 			} else {
902 				marking_level = NETSVC_MRKNG_LVL_L3L2_BK;
903 			}
904 		} else {
905 			marking_level = NETSVC_MRKNG_LVL_L2;
906 		}
907 	}
908 	return marking_level;
909 }
910 
911 __private_extern__ int
so_set_traffic_class(struct socket * so,int optval)912 so_set_traffic_class(struct socket *so, int optval)
913 {
914 	int error = 0;
915 
916 	if (optval < SO_TC_BE || optval > SO_TC_CTL) {
917 		error = EINVAL;
918 	} else {
919 		switch (optval) {
920 		case _SO_TC_BK:
921 			optval = SO_TC_BK;
922 			break;
923 		case _SO_TC_VI:
924 			optval = SO_TC_VI;
925 			break;
926 		case _SO_TC_VO:
927 			optval = SO_TC_VO;
928 			break;
929 		default:
930 			if (!SO_VALID_TC(optval)) {
931 				error = EINVAL;
932 			}
933 			break;
934 		}
935 
936 		if (error == 0) {
937 			int oldval = so->so_traffic_class;
938 
939 			VERIFY(SO_VALID_TC(optval));
940 			so->so_traffic_class = (uint16_t)optval;
941 
942 			if ((SOCK_DOM(so) == PF_INET ||
943 			    SOCK_DOM(so) == PF_INET6) &&
944 			    SOCK_TYPE(so) == SOCK_STREAM) {
945 				set_tcp_stream_priority(so);
946 			}
947 
948 			if ((SOCK_DOM(so) == PF_INET ||
949 			    SOCK_DOM(so) == PF_INET6) &&
950 			    optval != oldval && (optval == SO_TC_BK_SYS ||
951 			    oldval == SO_TC_BK_SYS)) {
952 				/*
953 				 * If the app switches from BK_SYS to something
954 				 * else, resume the socket if it was suspended.
955 				 */
956 				if (oldval == SO_TC_BK_SYS) {
957 					inp_reset_fc_state(so->so_pcb);
958 				}
959 
960 				SOTHROTTLELOG("throttle[%d]: so 0x%llx "
961 				    "[%d,%d] opportunistic %s\n", so->last_pid,
962 				    (uint64_t)VM_KERNEL_ADDRPERM(so),
963 				    SOCK_DOM(so), SOCK_TYPE(so),
964 				    (optval == SO_TC_BK_SYS) ? "ON" : "OFF");
965 			}
966 		}
967 	}
968 	return error;
969 }
970 
971 __private_extern__ int
so_set_net_service_type(struct socket * so,int netsvctype)972 so_set_net_service_type(struct socket *so, int netsvctype)
973 {
974 	int sotc;
975 	int error;
976 
977 	if (!IS_VALID_NET_SERVICE_TYPE(netsvctype)) {
978 		return EINVAL;
979 	}
980 
981 	sotc = sotc_by_netservicetype[netsvctype];
982 	error = so_set_traffic_class(so, sotc);
983 	if (error != 0) {
984 		return error;
985 	}
986 	so->so_netsvctype = (int8_t)netsvctype;
987 	so->so_flags1 |= SOF1_TC_NET_SERV_TYPE;
988 
989 	return 0;
990 }
991 
992 __private_extern__ void
so_set_default_traffic_class(struct socket * so)993 so_set_default_traffic_class(struct socket *so)
994 {
995 	so->so_traffic_class = SO_TC_BE;
996 
997 	if ((SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6)) {
998 		if (net_qos_policy_restricted == 0) {
999 			so->so_flags1 |= SOF1_QOSMARKING_ALLOWED;
1000 		}
1001 #if (DEVELOPMENT || DEBUG)
1002 		if (tfp_count > 0) {
1003 			set_tclass_for_curr_proc(so);
1004 		}
1005 #endif /* (DEVELOPMENT || DEBUG) */
1006 	}
1007 }
1008 
1009 __private_extern__ int
so_set_opportunistic(struct socket * so,int optval)1010 so_set_opportunistic(struct socket *so, int optval)
1011 {
1012 	return so_set_traffic_class(so, (optval == 0) ?
1013 	           SO_TC_BE : SO_TC_BK_SYS);
1014 }
1015 
1016 __private_extern__ int
so_get_opportunistic(struct socket * so)1017 so_get_opportunistic(struct socket *so)
1018 {
1019 	return so->so_traffic_class == SO_TC_BK_SYS;
1020 }
1021 
1022 __private_extern__ int
so_tc_from_control(struct mbuf * control,int * out_netsvctype)1023 so_tc_from_control(struct mbuf *control, int *out_netsvctype)
1024 {
1025 	struct cmsghdr *cm;
1026 	int sotc = SO_TC_UNSPEC;
1027 
1028 	*out_netsvctype = _NET_SERVICE_TYPE_UNSPEC;
1029 
1030 	for (cm = M_FIRST_CMSGHDR(control);
1031 	    is_cmsg_valid(control, cm);
1032 	    cm = M_NXT_CMSGHDR(control, cm)) {
1033 		int val;
1034 
1035 		if (cm->cmsg_level != SOL_SOCKET ||
1036 		    cm->cmsg_len != CMSG_LEN(sizeof(int))) {
1037 			continue;
1038 		}
1039 		val = *(int *)(void *)CMSG_DATA(cm);
1040 		/*
1041 		 * The first valid option wins
1042 		 */
1043 		switch (cm->cmsg_type) {
1044 		case SO_TRAFFIC_CLASS:
1045 			if (SO_VALID_TC(val)) {
1046 				sotc = val;
1047 				return sotc;
1048 				/* NOT REACHED */
1049 			} else if (val < SO_TC_NET_SERVICE_OFFSET) {
1050 				break;
1051 			}
1052 			/*
1053 			 * Handle the case SO_NET_SERVICE_TYPE values are
1054 			 * passed using SO_TRAFFIC_CLASS
1055 			 */
1056 			val = val - SO_TC_NET_SERVICE_OFFSET;
1057 			OS_FALLTHROUGH;
1058 		case SO_NET_SERVICE_TYPE:
1059 			if (!IS_VALID_NET_SERVICE_TYPE(val)) {
1060 				break;
1061 			}
1062 			*out_netsvctype = val;
1063 			sotc = sotc_by_netservicetype[val];
1064 			return sotc;
1065 		/* NOT REACHED */
1066 		default:
1067 			break;
1068 		}
1069 	}
1070 
1071 	return sotc;
1072 }
1073 
1074 __private_extern__ int
so_tos_from_control(struct mbuf * control)1075 so_tos_from_control(struct mbuf *control)
1076 {
1077 	struct cmsghdr *cm;
1078 	int tos = IPTOS_UNSPEC;
1079 
1080 	for (cm = M_FIRST_CMSGHDR(control);
1081 	    is_cmsg_valid(control, cm);
1082 	    cm = M_NXT_CMSGHDR(control, cm)) {
1083 		if (cm->cmsg_len != CMSG_LEN(sizeof(int))) {
1084 			continue;
1085 		}
1086 
1087 		if ((cm->cmsg_level == IPPROTO_IP &&
1088 		    cm->cmsg_type == IP_TOS) ||
1089 		    (cm->cmsg_level == IPPROTO_IPV6 &&
1090 		    cm->cmsg_type == IPV6_TCLASS)) {
1091 			tos = *(int *)(void *)CMSG_DATA(cm) & IPTOS_MASK;
1092 			/* The first valid option wins */
1093 			break;
1094 		}
1095 	}
1096 
1097 	return tos;
1098 }
1099 
1100 /*
1101  * There is no traffic class for input packet
1102  */
1103 __private_extern__ void
so_recv_data_stat(struct socket * so,struct mbuf * m,size_t off)1104 so_recv_data_stat(struct socket *so, struct mbuf *m, size_t off)
1105 {
1106 	so->so_tc_stats[SO_STATS_DATA].rxpackets += 1;
1107 	so->so_tc_stats[SO_STATS_DATA].rxbytes +=
1108 	    ((m->m_flags & M_PKTHDR) ? m->m_pkthdr.len : 0) + off;
1109 }
1110 
1111 __private_extern__ void
so_inc_recv_data_stat(struct socket * so,size_t pkts,size_t bytes)1112 so_inc_recv_data_stat(struct socket *so, size_t pkts, size_t bytes)
1113 {
1114 	so->so_tc_stats[SO_STATS_DATA].rxpackets += pkts;
1115 	so->so_tc_stats[SO_STATS_DATA].rxbytes += bytes;
1116 }
1117 
1118 static inline int
so_throttle_best_effort(struct socket * so,struct ifnet * ifp)1119 so_throttle_best_effort(struct socket *so, struct ifnet *ifp)
1120 {
1121 	uint32_t uptime = (uint32_t)net_uptime();
1122 	return soissrcbesteffort(so) &&
1123 	       net_io_policy_throttle_best_effort == 1 &&
1124 	       ifp->if_rt_sendts > 0 &&
1125 	       (int)(uptime - ifp->if_rt_sendts) <= TCP_BG_SWITCH_TIME;
1126 }
1127 
1128 __private_extern__ void
set_tcp_stream_priority(struct socket * so)1129 set_tcp_stream_priority(struct socket *so)
1130 {
1131 	struct inpcb *inp = sotoinpcb(so);
1132 	struct tcpcb *tp = intotcpcb(inp);
1133 	struct ifnet *outifp;
1134 	u_char old_cc = tp->tcp_cc_index;
1135 	int recvbg = IS_TCP_RECV_BG(so);
1136 	bool is_local = false, fg_active = false;
1137 	uint32_t uptime;
1138 
1139 	VERIFY((SOCK_CHECK_DOM(so, PF_INET) ||
1140 	    SOCK_CHECK_DOM(so, PF_INET6)) &&
1141 	    SOCK_CHECK_TYPE(so, SOCK_STREAM) &&
1142 	    SOCK_CHECK_PROTO(so, IPPROTO_TCP));
1143 
1144 	/* Return if the socket is in a terminal state */
1145 	if (inp->inp_state == INPCB_STATE_DEAD) {
1146 		return;
1147 	}
1148 
1149 	outifp = inp->inp_last_outifp;
1150 	uptime = (uint32_t)net_uptime();
1151 
1152 	/*
1153 	 * If the socket was marked as a background socket or if the
1154 	 * traffic class is set to background with traffic class socket
1155 	 * option then make both send and recv side of the stream to be
1156 	 * background. The variable sotcdb which can be set with sysctl
1157 	 * is used to disable these settings for testing.
1158 	 */
1159 	if (outifp == NULL || (outifp->if_flags & IFF_LOOPBACK)) {
1160 		is_local = true;
1161 	}
1162 
1163 	/* Check if there has been recent foreground activity */
1164 	if (outifp != NULL) {
1165 		/*
1166 		 * If the traffic source is background, check if
1167 		 * there is recent foreground activity which should
1168 		 * continue to keep the traffic source as background.
1169 		 * Otherwise, we can switch the traffic source to
1170 		 * foreground.
1171 		 */
1172 		if (soissrcbackground(so) && outifp->if_fg_sendts > 0 &&
1173 		    (int)(uptime - outifp->if_fg_sendts) <= TCP_BG_SWITCH_TIME) {
1174 			fg_active = true;
1175 		}
1176 
1177 		/*
1178 		 * The traffic source is best-effort -- check if
1179 		 * the policy to throttle best effort is enabled
1180 		 * and there was realtime activity on this
1181 		 * interface recently. If this is true, enable
1182 		 * algorithms that respond to increased latency
1183 		 * on best-effort traffic.
1184 		 */
1185 		if (so_throttle_best_effort(so, outifp)) {
1186 			fg_active = true;
1187 		}
1188 	}
1189 
1190 	/*
1191 	 * System initiated background traffic like cloud uploads should
1192 	 * always use background delay sensitive algorithms. This will
1193 	 * make the stream more responsive to other streams on the user's
1194 	 * network and it will minimize latency induced.
1195 	 */
1196 	if (fg_active || IS_SO_TC_BACKGROUNDSYSTEM(so->so_traffic_class)) {
1197 		/*
1198 		 * If the interface that the connection is using is
1199 		 * loopback, do not use background congestion
1200 		 * control algorithm.
1201 		 *
1202 		 * If there has been recent foreground activity or if there
1203 		 * was an indication that a real time foreground application
1204 		 * is going to use networking (net_io_policy_throttled),
1205 		 * switch the background and best effort streams to use background
1206 		 * congestion control algorithm.
1207 		 */
1208 		if ((sotcdb & SOTCDB_NO_SENDTCPBG) != 0 || is_local) {
1209 			if (old_cc == TCP_CC_ALGO_BACKGROUND_INDEX) {
1210 				tcp_set_foreground_cc(so);
1211 			}
1212 		} else {
1213 			if (old_cc != TCP_CC_ALGO_BACKGROUND_INDEX) {
1214 				tcp_set_background_cc(so);
1215 			}
1216 		}
1217 
1218 		/* Set receive side background flags */
1219 		if ((sotcdb & SOTCDB_NO_RECVTCPBG) != 0 || is_local) {
1220 			tcp_clear_recv_bg(so);
1221 		} else {
1222 			tcp_set_recv_bg(so);
1223 		}
1224 	} else {
1225 		/*
1226 		 * If there is no recent foreground activity, even the
1227 		 * background flows can use foreground congestion controller.
1228 		 */
1229 		tcp_clear_recv_bg(so);
1230 		if (old_cc == TCP_CC_ALGO_BACKGROUND_INDEX) {
1231 			tcp_set_foreground_cc(so);
1232 		}
1233 	}
1234 
1235 	if (old_cc != tp->tcp_cc_index || recvbg != IS_TCP_RECV_BG(so)) {
1236 		SOTHROTTLELOG("throttle[%d]: so 0x%llx [%d,%d] TCP %s send; "
1237 		    "%s recv\n", so->last_pid,
1238 		    (uint64_t)VM_KERNEL_ADDRPERM(so),
1239 		    SOCK_DOM(so), SOCK_TYPE(so),
1240 		    (tp->tcp_cc_index == TCP_CC_ALGO_BACKGROUND_INDEX) ?
1241 		    "background" : "foreground",
1242 		    IS_TCP_RECV_BG(so) ? "background" : "foreground");
1243 	}
1244 }
1245 
1246 /*
1247  * Set traffic class to an IPv4 or IPv6 packet
1248  * - mark the mbuf
1249  * - set the DSCP code following the WMM mapping
1250  */
1251 __private_extern__ void
set_packet_service_class(struct mbuf * m,struct socket * so,int sotc,uint32_t flags)1252 set_packet_service_class(struct mbuf *m, struct socket *so,
1253     int sotc, uint32_t flags)
1254 {
1255 	mbuf_svc_class_t msc = MBUF_SC_BE;         /* Best effort by default */
1256 	struct inpcb *inp = sotoinpcb(so); /* in6pcb and inpcb are the same */
1257 
1258 	if (!(m->m_flags & M_PKTHDR)) {
1259 		return;
1260 	}
1261 
1262 	/*
1263 	 * Here is the precedence:
1264 	 * 1) TRAFFIC_MGT_SO_BACKGROUND trumps all
1265 	 * 2) Traffic class passed via ancillary data to sendmsdg(2)
1266 	 * 3) Traffic class socket option last
1267 	 */
1268 	if (sotc != SO_TC_UNSPEC) {
1269 		VERIFY(SO_VALID_TC(sotc));
1270 		msc = so_tc2msc(sotc);
1271 		/* Assert because tc must have been valid */
1272 		VERIFY(MBUF_VALID_SC(msc));
1273 	}
1274 
1275 	/*
1276 	 * If TRAFFIC_MGT_SO_BACKGROUND is set or policy to throttle
1277 	 * best effort is set, depress the priority.
1278 	 */
1279 	if (!IS_MBUF_SC_BACKGROUND(msc) && soisthrottled(so)) {
1280 		msc = MBUF_SC_BK;
1281 	}
1282 
1283 	if (IS_MBUF_SC_BESTEFFORT(msc) && inp->inp_last_outifp != NULL &&
1284 	    so_throttle_best_effort(so, inp->inp_last_outifp)) {
1285 		msc = MBUF_SC_BK;
1286 	}
1287 
1288 	if (soissrcbackground(so)) {
1289 		m->m_pkthdr.pkt_flags |= PKTF_SO_BACKGROUND;
1290 	}
1291 
1292 	if (soissrcrealtime(so) || IS_MBUF_SC_REALTIME(msc)) {
1293 		m->m_pkthdr.pkt_flags |= PKTF_SO_REALTIME;
1294 	}
1295 	/*
1296 	 * Set the traffic class in the mbuf packet header svc field
1297 	 */
1298 	if (sotcdb & SOTCDB_NO_MTC) {
1299 		goto no_mbtc;
1300 	}
1301 
1302 	/*
1303 	 * Elevate service class if the packet is a pure TCP ACK.
1304 	 * We can do this only when the flow is not a background
1305 	 * flow and the outgoing interface supports
1306 	 * transmit-start model.
1307 	 */
1308 	if (!IS_MBUF_SC_BACKGROUND(msc) &&
1309 	    (flags & (PKT_SCF_TCP_ACK | PKT_SCF_TCP_SYN)) != 0) {
1310 		msc = MBUF_SC_CTL;
1311 	}
1312 
1313 	(void) m_set_service_class(m, msc);
1314 
1315 	/*
1316 	 * Set the privileged traffic auxiliary flag if applicable,
1317 	 * or clear it.
1318 	 */
1319 	if (!(sotcdb & SOTCDB_NO_PRIVILEGED) && soisprivilegedtraffic(so) &&
1320 	    msc != MBUF_SC_UNSPEC) {
1321 		m->m_pkthdr.pkt_flags |= PKTF_PRIO_PRIVILEGED;
1322 	} else {
1323 		m->m_pkthdr.pkt_flags &= ~PKTF_PRIO_PRIVILEGED;
1324 	}
1325 
1326 no_mbtc:
1327 	/*
1328 	 * For TCP with background traffic class switch CC algo based on sysctl
1329 	 */
1330 	if (so->so_type == SOCK_STREAM) {
1331 		set_tcp_stream_priority(so);
1332 	}
1333 }
1334 
1335 __private_extern__ mbuf_svc_class_t
so_tc2msc(int tc)1336 so_tc2msc(int tc)
1337 {
1338 	mbuf_svc_class_t msc;
1339 
1340 	switch (tc) {
1341 	case SO_TC_BK_SYS:
1342 		msc = MBUF_SC_BK_SYS;
1343 		break;
1344 	case SO_TC_BK:
1345 	case _SO_TC_BK:
1346 		msc = MBUF_SC_BK;
1347 		break;
1348 	case SO_TC_BE:
1349 		msc = MBUF_SC_BE;
1350 		break;
1351 	case SO_TC_RD:
1352 		msc = MBUF_SC_RD;
1353 		break;
1354 	case SO_TC_OAM:
1355 		msc = MBUF_SC_OAM;
1356 		break;
1357 	case SO_TC_AV:
1358 		msc = MBUF_SC_AV;
1359 		break;
1360 	case SO_TC_RV:
1361 		msc = MBUF_SC_RV;
1362 		break;
1363 	case SO_TC_VI:
1364 	case _SO_TC_VI:
1365 		msc = MBUF_SC_VI;
1366 		break;
1367 	case SO_TC_NETSVC_SIG:
1368 		msc = MBUF_SC_SIG;
1369 		break;
1370 	case SO_TC_VO:
1371 	case _SO_TC_VO:
1372 		msc = MBUF_SC_VO;
1373 		break;
1374 	case SO_TC_CTL:
1375 		msc = MBUF_SC_CTL;
1376 		break;
1377 	case SO_TC_ALL:
1378 	default:
1379 		msc = MBUF_SC_UNSPEC;
1380 		break;
1381 	}
1382 
1383 	return msc;
1384 }
1385 
1386 __private_extern__ int
so_svc2tc(mbuf_svc_class_t svc)1387 so_svc2tc(mbuf_svc_class_t svc)
1388 {
1389 	switch (svc) {
1390 	case MBUF_SC_BK_SYS:
1391 		return SO_TC_BK_SYS;
1392 	case MBUF_SC_BK:
1393 		return SO_TC_BK;
1394 	case MBUF_SC_BE:
1395 		return SO_TC_BE;
1396 	case MBUF_SC_RD:
1397 		return SO_TC_RD;
1398 	case MBUF_SC_OAM:
1399 		return SO_TC_OAM;
1400 	case MBUF_SC_AV:
1401 		return SO_TC_AV;
1402 	case MBUF_SC_RV:
1403 		return SO_TC_RV;
1404 	case MBUF_SC_VI:
1405 		return SO_TC_VI;
1406 	case MBUF_SC_SIG:
1407 		return SO_TC_NETSVC_SIG;
1408 	case MBUF_SC_VO:
1409 		return SO_TC_VO;
1410 	case MBUF_SC_CTL:
1411 		return SO_TC_CTL;
1412 	case MBUF_SC_UNSPEC:
1413 	default:
1414 		return SO_TC_BE;
1415 	}
1416 }
1417 
1418 static size_t
sotc_index(int sotc)1419 sotc_index(int sotc)
1420 {
1421 	switch (sotc) {
1422 	case SO_TC_BK_SYS:
1423 		return SOTCIX_BK_SYS;
1424 	case _SO_TC_BK:
1425 	case SO_TC_BK:
1426 		return SOTCIX_BK;
1427 
1428 	case SO_TC_BE:
1429 		return SOTCIX_BE;
1430 	case SO_TC_RD:
1431 		return SOTCIX_RD;
1432 	case SO_TC_OAM:
1433 		return SOTCIX_OAM;
1434 
1435 	case SO_TC_AV:
1436 		return SOTCIX_AV;
1437 	case SO_TC_RV:
1438 		return SOTCIX_RV;
1439 	case _SO_TC_VI:
1440 	case SO_TC_VI:
1441 		return SOTCIX_VI;
1442 
1443 	case _SO_TC_VO:
1444 	case SO_TC_VO:
1445 		return SOTCIX_VO;
1446 	case SO_TC_CTL:
1447 		return SOTCIX_CTL;
1448 
1449 	default:
1450 		break;
1451 	}
1452 	/*
1453 	 * Unknown traffic class value
1454 	 */
1455 	return SIZE_T_MAX;
1456 }
1457 
1458 uint8_t
fastlane_sc_to_dscp(uint32_t svc_class)1459 fastlane_sc_to_dscp(uint32_t svc_class)
1460 {
1461 	uint8_t dscp = _DSCP_DF;
1462 
1463 	switch (svc_class) {
1464 	case MBUF_SC_BK_SYS:
1465 	case MBUF_SC_BK:
1466 		dscp = _DSCP_AF11;
1467 		break;
1468 
1469 	case MBUF_SC_BE:
1470 		dscp = _DSCP_DF;
1471 		break;
1472 	case MBUF_SC_RD:
1473 		dscp = _DSCP_AF21;
1474 		break;
1475 	case MBUF_SC_OAM:
1476 		dscp = _DSCP_CS2;
1477 		break;
1478 
1479 	case MBUF_SC_AV:
1480 		dscp = _DSCP_AF31;
1481 		break;
1482 	case MBUF_SC_RV:
1483 		dscp = _DSCP_CS4;
1484 		break;
1485 	case MBUF_SC_VI:
1486 		dscp = _DSCP_AF41;
1487 		break;
1488 	case MBUF_SC_SIG:
1489 		dscp = _DSCP_CS3;
1490 		break;
1491 
1492 	case MBUF_SC_VO:
1493 		dscp = _DSCP_EF;
1494 		break;
1495 	case MBUF_SC_CTL:
1496 		dscp = _DSCP_DF;
1497 		break;
1498 	default:
1499 		dscp = _DSCP_DF;
1500 		break;
1501 	}
1502 
1503 	return dscp;
1504 }
1505 
1506 uint8_t
rfc4594_sc_to_dscp(uint32_t svc_class)1507 rfc4594_sc_to_dscp(uint32_t svc_class)
1508 {
1509 	uint8_t dscp = _DSCP_DF;
1510 
1511 	switch (svc_class) {
1512 	case MBUF_SC_BK_SYS:            /* Low-Priority Data */
1513 	case MBUF_SC_BK:
1514 		dscp = _DSCP_CS1;
1515 		break;
1516 
1517 	case MBUF_SC_BE:                /* Standard */
1518 		dscp = _DSCP_DF;
1519 		break;
1520 	case MBUF_SC_RD:                /* Low-Latency Data */
1521 		dscp = _DSCP_AF21;
1522 		break;
1523 
1524 	/* SVC_CLASS Not Defined:  High-Throughput Data */
1525 
1526 	case MBUF_SC_OAM:               /* OAM */
1527 		dscp = _DSCP_CS2;
1528 		break;
1529 
1530 	/* SVC_CLASS Not Defined:  Broadcast Video */
1531 
1532 	case MBUF_SC_AV:                /* Multimedia Streaming */
1533 		dscp = _DSCP_AF31;
1534 		break;
1535 	case MBUF_SC_RV:                /* Real-Time Interactive */
1536 		dscp = _DSCP_CS4;
1537 		break;
1538 	case MBUF_SC_VI:                /* Multimedia Conferencing */
1539 		dscp = _DSCP_AF41;
1540 		break;
1541 	case MBUF_SC_SIG:               /* Signaling */
1542 		dscp = _DSCP_CS5;
1543 		break;
1544 
1545 	case MBUF_SC_VO:                /* Telephony */
1546 		dscp = _DSCP_EF;
1547 		break;
1548 	case MBUF_SC_CTL:               /* Network Control*/
1549 		dscp = _DSCP_CS6;
1550 		break;
1551 	default:
1552 		dscp = _DSCP_DF;
1553 		break;
1554 	}
1555 
1556 	return dscp;
1557 }
1558 
1559 mbuf_traffic_class_t
rfc4594_dscp_to_tc(uint8_t dscp)1560 rfc4594_dscp_to_tc(uint8_t dscp)
1561 {
1562 	mbuf_traffic_class_t tc = MBUF_TC_BE;
1563 
1564 	switch (dscp) {
1565 	case _DSCP_CS1:
1566 		tc = MBUF_TC_BK;
1567 		break;
1568 	case _DSCP_DF:
1569 	case _DSCP_AF21:
1570 	case _DSCP_CS2:
1571 		tc = MBUF_TC_BE;
1572 		break;
1573 	case _DSCP_AF31:
1574 	case _DSCP_CS4:
1575 	case _DSCP_AF41:
1576 	case _DSCP_CS5:
1577 		tc = MBUF_TC_VI;
1578 		break;
1579 	case _DSCP_EF:
1580 	case _DSCP_CS6:
1581 		tc = MBUF_TC_VO;
1582 		break;
1583 	default:
1584 		tc = MBUF_TC_BE;
1585 		break;
1586 	}
1587 
1588 	return tc;
1589 }
1590 
1591 /*
1592  * Pass NULL ifp for default map
1593  */
1594 static errno_t
set_netsvctype_dscp_map(struct net_qos_dscp_map * net_qos_dscp_map,const struct netsvctype_dscp_map * __counted_by (_NET_SERVICE_TYPE_COUNT)netsvctype_dscp_map)1595 set_netsvctype_dscp_map(struct net_qos_dscp_map *net_qos_dscp_map,
1596     const struct netsvctype_dscp_map *__counted_by(_NET_SERVICE_TYPE_COUNT) netsvctype_dscp_map)
1597 {
1598 	size_t i;
1599 	int netsvctype;
1600 
1601 	VERIFY(netsvctype_dscp_map != NULL);
1602 	/*
1603 	 * Do not accept more that max number of distinct DSCPs
1604 	 */
1605 	if (net_qos_dscp_map == NULL) {
1606 		return EINVAL;
1607 	}
1608 
1609 	/*
1610 	 * Validate input parameters
1611 	 */
1612 	for (i = 0; i < _NET_SERVICE_TYPE_COUNT; i++) {
1613 		if (!IS_VALID_NET_SERVICE_TYPE(netsvctype_dscp_map[i].netsvctype)) {
1614 			return EINVAL;
1615 		}
1616 		if (netsvctype_dscp_map[i].dscp > _MAX_DSCP) {
1617 			return EINVAL;
1618 		}
1619 	}
1620 
1621 	for (i = 0; i < _NET_SERVICE_TYPE_COUNT; i++) {
1622 		netsvctype = netsvctype_dscp_map[i].netsvctype;
1623 
1624 		net_qos_dscp_map->netsvctype_to_dscp[netsvctype] =
1625 		    netsvctype_dscp_map[i].dscp;
1626 	}
1627 	for (netsvctype = 0; netsvctype < _NET_SERVICE_TYPE_COUNT; netsvctype++) {
1628 		switch (netsvctype) {
1629 		case NET_SERVICE_TYPE_BE:
1630 		case NET_SERVICE_TYPE_BK:
1631 		case NET_SERVICE_TYPE_VI:
1632 		case NET_SERVICE_TYPE_VO:
1633 		case NET_SERVICE_TYPE_RV:
1634 		case NET_SERVICE_TYPE_AV:
1635 		case NET_SERVICE_TYPE_OAM:
1636 		case NET_SERVICE_TYPE_RD: {
1637 			size_t sotcix;
1638 
1639 			sotcix = sotc_index(sotc_by_netservicetype[netsvctype]);
1640 			if (sotcix != SIZE_T_MAX) {
1641 				net_qos_dscp_map->sotc_to_dscp[sotcix]  =
1642 				    netsvctype_dscp_map[netsvctype].dscp;
1643 			}
1644 			break;
1645 		}
1646 		case  NET_SERVICE_TYPE_SIG:
1647 			/* Signaling does not have its own traffic class */
1648 			break;
1649 		default:
1650 			/* We should not be here */
1651 			ASSERT(0);
1652 		}
1653 	}
1654 	if (net_qos_dscp_map == &fastlane_net_qos_dscp_map) {
1655 		/* Network control socket traffic class is always best effort for fastlane*/
1656 		net_qos_dscp_map->sotc_to_dscp[SOTCIX_CTL] = _DSCP_DF;
1657 	} else {
1658 		net_qos_dscp_map->sotc_to_dscp[SOTCIX_CTL] = _DSCP_CS6;
1659 	}
1660 
1661 	/* Background system socket traffic class DSCP same as background */
1662 	net_qos_dscp_map->sotc_to_dscp[SOTCIX_BK_SYS] =
1663 	    net_qos_dscp_map->sotc_to_dscp[SOTCIX_BK];
1664 
1665 	return 0;
1666 }
1667 
1668 static size_t
get_netsvctype_dscp_map(struct netsvctype_dscp_map * __counted_by (_NET_SERVICE_TYPE_COUNT)netsvctype_dscp_map)1669 get_netsvctype_dscp_map(struct netsvctype_dscp_map *__counted_by(_NET_SERVICE_TYPE_COUNT) netsvctype_dscp_map)
1670 {
1671 	struct net_qos_dscp_map *net_qos_dscp_map;
1672 	int i;
1673 
1674 	net_qos_dscp_map = &fastlane_net_qos_dscp_map;
1675 
1676 	for (i = 0; i < _NET_SERVICE_TYPE_COUNT; i++) {
1677 		netsvctype_dscp_map[i].netsvctype = i;
1678 		netsvctype_dscp_map[i].dscp = net_qos_dscp_map->netsvctype_to_dscp[i];
1679 	}
1680 
1681 	return i * sizeof(struct netsvctype_dscp_map);
1682 }
1683 
1684 void
net_qos_map_init()1685 net_qos_map_init()
1686 {
1687 	errno_t error;
1688 
1689 	error = set_netsvctype_dscp_map(&fastlane_net_qos_dscp_map,
1690 	    fastlane_netsvctype_dscp_map);
1691 	ASSERT(error == 0);
1692 
1693 	error = set_netsvctype_dscp_map(&rfc4594_net_qos_dscp_map,
1694 	    rfc4594_netsvctype_dscp_map);
1695 	ASSERT(error == 0);
1696 
1697 #if (DEBUG || DEVELOPMENT)
1698 	error = set_netsvctype_dscp_map(&custom_net_qos_dscp_map,
1699 	    rfc4594_netsvctype_dscp_map);
1700 	ASSERT(error == 0);
1701 
1702 #endif /* (DEBUG || DEVELOPMENT) */
1703 
1704 	set_dscp_to_wifi_ac_map(default_dscp_to_wifi_ac_map, 1);
1705 }
1706 
1707 int
1708 sysctl_default_netsvctype_to_dscp_map SYSCTL_HANDLER_ARGS
1709 {
1710 #pragma unused(oidp, arg1, arg2)
1711 	int error = 0;
1712 
1713 	if (req->oldptr == USER_ADDR_NULL) {
1714 		req->oldidx =
1715 		    _NET_SERVICE_TYPE_COUNT * sizeof(struct netsvctype_dscp_map);
1716 	} else if (req->oldlen > 0) {
1717 		struct netsvctype_dscp_map netsvctype_dscp_map[_NET_SERVICE_TYPE_COUNT] = {};
1718 		size_t len;
1719 
1720 		len = get_netsvctype_dscp_map(netsvctype_dscp_map);
1721 
1722 		error = SYSCTL_OUT(req, netsvctype_dscp_map,
1723 		    MIN(len, req->oldlen));
1724 		if (error != 0) {
1725 			goto done;
1726 		}
1727 	}
1728 
1729 	if (req->newptr != USER_ADDR_NULL) {
1730 		error = EPERM;
1731 	}
1732 done:
1733 	return error;
1734 }
1735 
1736 __private_extern__ errno_t
set_packet_qos(struct mbuf * m,struct ifnet * ifp,boolean_t qos_allowed,int sotc,int netsvctype,uint8_t * dscp_inout)1737 set_packet_qos(struct mbuf *m, struct ifnet *ifp, boolean_t qos_allowed,
1738     int sotc, int netsvctype, uint8_t *dscp_inout)
1739 {
1740 	if (ifp == NULL || dscp_inout == NULL) {
1741 		return EINVAL;
1742 	}
1743 
1744 	if ((ifp->if_eflags & IFEF_QOSMARKING_ENABLED) != 0 &&
1745 	    ifp->if_qosmarking_mode != IFRTYPE_QOSMARKING_MODE_NONE) {
1746 		uint8_t dscp;
1747 		const struct net_qos_dscp_map *net_qos_dscp_map = NULL;
1748 
1749 		switch (ifp->if_qosmarking_mode) {
1750 		case IFRTYPE_QOSMARKING_FASTLANE:
1751 			net_qos_dscp_map = &fastlane_net_qos_dscp_map;
1752 			break;
1753 		case IFRTYPE_QOSMARKING_RFC4594:
1754 			net_qos_dscp_map = &rfc4594_net_qos_dscp_map;
1755 			break;
1756 #if (DEBUG || DEVELOPMENT)
1757 		case IFRTYPE_QOSMARKING_CUSTOM:
1758 			net_qos_dscp_map = &custom_net_qos_dscp_map;
1759 			break;
1760 #endif /* (DEBUG || DEVELOPMENT) */
1761 		default:
1762 			panic("invalid QoS marking type");
1763 			/* NOTREACHED */
1764 		}
1765 
1766 		/*
1767 		 * When on a Fastlane network, IP_TOS/IPV6_TCLASS are no-ops
1768 		 */
1769 		dscp = _DSCP_DF;
1770 
1771 		/*
1772 		 * For DSCP use the network service type is specified, otherwise
1773 		 * use the socket traffic class
1774 		 *
1775 		 * When not whitelisted by the policy, set DSCP only for best
1776 		 * effort and background, and set the mbuf service class to
1777 		 * best effort as well so the packet will be queued and
1778 		 * scheduled at a lower priority.
1779 		 * We still want to prioritize control traffic on the interface
1780 		 * so we do not change the mbuf service class for SO_TC_CTL
1781 		 */
1782 		if (IS_VALID_NET_SERVICE_TYPE(netsvctype) &&
1783 		    netsvctype != NET_SERVICE_TYPE_BE) {
1784 			dscp = net_qos_dscp_map->netsvctype_to_dscp[netsvctype];
1785 
1786 			if (qos_allowed == FALSE &&
1787 			    netsvctype != NET_SERVICE_TYPE_BE &&
1788 			    netsvctype != NET_SERVICE_TYPE_BK) {
1789 				dscp = _DSCP_DF;
1790 				if (sotc != SO_TC_CTL) {
1791 					m_set_service_class(m, MBUF_SC_BE);
1792 				}
1793 			}
1794 		} else if (sotc != SO_TC_UNSPEC) {
1795 			size_t sotcix = sotc_index(sotc);
1796 			if (sotcix != SIZE_T_MAX) {
1797 				dscp = net_qos_dscp_map->sotc_to_dscp[sotcix];
1798 
1799 				if (qos_allowed == FALSE && sotc != SO_TC_BE &&
1800 				    sotc != SO_TC_BK && sotc != SO_TC_BK_SYS &&
1801 				    sotc != SO_TC_CTL) {
1802 					dscp = _DSCP_DF;
1803 					if (sotc != SO_TC_CTL) {
1804 						m_set_service_class(m, MBUF_SC_BE);
1805 					}
1806 				}
1807 			}
1808 		}
1809 		if (net_qos_verbose != 0) {
1810 			printf("%s qos_allowed %d sotc %u netsvctype %u dscp %u\n",
1811 			    __func__, qos_allowed, sotc, netsvctype, dscp);
1812 		}
1813 
1814 		if (*dscp_inout != dscp) {
1815 			*dscp_inout = dscp;
1816 		}
1817 	} else if (*dscp_inout != _DSCP_DF && IFNET_IS_WIFI_INFRA(ifp)) {
1818 		mbuf_svc_class_t msc = m_get_service_class(m);
1819 
1820 		/*
1821 		 * For WiFi infra, when the mbuf service class is best effort
1822 		 * and the DSCP is not default, set the service class based
1823 		 * on DSCP
1824 		 */
1825 		if (msc == MBUF_SC_BE) {
1826 			msc = wifi_dscp_to_msc_array[*dscp_inout];
1827 
1828 			if (msc != MBUF_SC_BE) {
1829 				m_set_service_class(m, msc);
1830 
1831 				if (net_qos_verbose != 0) {
1832 					printf("%s set msc %u for dscp %u\n",
1833 					    __func__, msc, *dscp_inout);
1834 				}
1835 			}
1836 		}
1837 	}
1838 
1839 	return 0;
1840 }
1841 
1842 static void
set_dscp_to_wifi_ac_map(const struct dcsp_msc_map * __indexable map,int clear)1843 set_dscp_to_wifi_ac_map(const struct dcsp_msc_map *__indexable map, int clear)
1844 {
1845 	int i;
1846 
1847 	if (clear) {
1848 		bzero(wifi_dscp_to_msc_array, sizeof(wifi_dscp_to_msc_array));
1849 	}
1850 
1851 	for (i = 0; i < DSCP_ARRAY_SIZE; i++) {
1852 		const struct dcsp_msc_map *elem = map + i;
1853 
1854 		if (elem->dscp > _MAX_DSCP || elem->msc == MBUF_SC_UNSPEC) {
1855 			break;
1856 		}
1857 		switch (elem->msc) {
1858 		case MBUF_SC_BK_SYS:
1859 		case MBUF_SC_BK:
1860 			wifi_dscp_to_msc_array[elem->dscp] = MBUF_SC_BK;
1861 			break;
1862 		default:
1863 		case MBUF_SC_BE:
1864 		case MBUF_SC_RD:
1865 		case MBUF_SC_OAM:
1866 			wifi_dscp_to_msc_array[elem->dscp] = MBUF_SC_BE;
1867 			break;
1868 		case MBUF_SC_AV:
1869 		case MBUF_SC_RV:
1870 		case MBUF_SC_VI:
1871 			wifi_dscp_to_msc_array[elem->dscp] = MBUF_SC_VI;
1872 			break;
1873 		case MBUF_SC_VO:
1874 		case MBUF_SC_CTL:
1875 			wifi_dscp_to_msc_array[elem->dscp] = MBUF_SC_VO;
1876 			break;
1877 		}
1878 	}
1879 }
1880 
1881 static errno_t
dscp_msc_map_from_netsvctype_dscp_map(struct netsvctype_dscp_map * __counted_by (count)netsvctype_dscp_map,size_t count,struct dcsp_msc_map * __counted_by (DSCP_ARRAY_SIZE)dcsp_msc_map)1882 dscp_msc_map_from_netsvctype_dscp_map(struct netsvctype_dscp_map *__counted_by(count) netsvctype_dscp_map,
1883     size_t count, struct dcsp_msc_map *__counted_by(DSCP_ARRAY_SIZE) dcsp_msc_map)
1884 {
1885 	errno_t error = 0;
1886 	uint32_t i;
1887 
1888 	/*
1889 	 * Validate input parameters
1890 	 */
1891 	for (i = 0; i < count; i++) {
1892 		if (!SO_VALID_TC(netsvctype_dscp_map[i].netsvctype)) {
1893 			error = EINVAL;
1894 			goto done;
1895 		}
1896 		if (netsvctype_dscp_map[i].dscp > _MAX_DSCP) {
1897 			error = EINVAL;
1898 			goto done;
1899 		}
1900 	}
1901 
1902 	bzero(dcsp_msc_map, DSCP_ARRAY_SIZE * sizeof(struct dcsp_msc_map));
1903 
1904 	for (i = 0; i < count; i++) {
1905 		dcsp_msc_map[i].dscp = netsvctype_dscp_map[i].dscp;
1906 		dcsp_msc_map[i].msc = so_tc2msc(netsvctype_dscp_map[i].netsvctype);
1907 	}
1908 done:
1909 	return error;
1910 }
1911 
1912 int
1913 sysctl_dscp_to_wifi_ac_map SYSCTL_HANDLER_ARGS
1914 {
1915 #pragma unused(oidp, arg1, arg2)
1916 	int error = 0;
1917 	size_t len = DSCP_ARRAY_SIZE * sizeof(struct netsvctype_dscp_map);
1918 	struct netsvctype_dscp_map netsvctype_dscp_map[DSCP_ARRAY_SIZE] = {};
1919 	struct dcsp_msc_map dcsp_msc_map[DSCP_ARRAY_SIZE];
1920 	size_t count;
1921 
1922 	if (req->oldptr == USER_ADDR_NULL) {
1923 		req->oldidx = len;
1924 	} else if (req->oldlen > 0) {
1925 		uint8_t i;
1926 
1927 		for (i = 0; i < DSCP_ARRAY_SIZE; i++) {
1928 			netsvctype_dscp_map[i].dscp = i;
1929 			netsvctype_dscp_map[i].netsvctype =
1930 			    so_svc2tc(wifi_dscp_to_msc_array[i]);
1931 		}
1932 		error = SYSCTL_OUT(req, netsvctype_dscp_map,
1933 		    MIN(len, req->oldlen));
1934 		if (error != 0) {
1935 			goto done;
1936 		}
1937 	}
1938 
1939 	if (req->newptr == USER_ADDR_NULL) {
1940 		goto done;
1941 	}
1942 
1943 	error = proc_suser(current_proc());
1944 	if (error != 0) {
1945 		goto done;
1946 	}
1947 
1948 	/*
1949 	 * Check input length
1950 	 */
1951 	if (req->newlen > len) {
1952 		error = EINVAL;
1953 		goto done;
1954 	}
1955 	/*
1956 	 * Cap the number of entries to copy from input buffer
1957 	 */
1958 	if (len > req->newlen) {
1959 		len = req->newlen;
1960 	}
1961 	error = SYSCTL_IN(req, netsvctype_dscp_map, len);
1962 	if (error != 0) {
1963 		goto done;
1964 	}
1965 	count = len / sizeof(struct netsvctype_dscp_map);
1966 	bzero(dcsp_msc_map, sizeof(dcsp_msc_map));
1967 	error = dscp_msc_map_from_netsvctype_dscp_map(netsvctype_dscp_map, count,
1968 	    dcsp_msc_map);
1969 	if (error != 0) {
1970 		goto done;
1971 	}
1972 	set_dscp_to_wifi_ac_map(dcsp_msc_map, 0);
1973 done:
1974 	return error;
1975 }
1976 
1977 int
1978 sysctl_reset_dscp_to_wifi_ac_map SYSCTL_HANDLER_ARGS
1979 {
1980 #pragma unused(oidp, arg1, arg2)
1981 	int error = 0;
1982 	int val = 0;
1983 
1984 	error = sysctl_handle_int(oidp, &val, 0, req);
1985 	if (error || !req->newptr) {
1986 		return error;
1987 	}
1988 	if (req->newptr == USER_ADDR_NULL) {
1989 		return 0;
1990 	}
1991 	error = proc_suser(current_proc());
1992 	if (error != 0) {
1993 		return error;
1994 	}
1995 
1996 	set_dscp_to_wifi_ac_map(default_dscp_to_wifi_ac_map, 1);
1997 
1998 	return 0;
1999 }
2000 
2001 /*
2002  * Returns whether a large upload or download transfer should be marked as
2003  * BK service type for network activity. This is a system level
2004  * hint/suggestion to classify application traffic based on statistics
2005  * collected from the current network attachment
2006  *
2007  * Returns 1 for BK and 0 for default
2008  */
2009 
2010 int
net_qos_guideline(struct proc * p,struct net_qos_guideline_args * arg,int * retval)2011 net_qos_guideline(struct proc *p, struct net_qos_guideline_args *arg,
2012     int *retval)
2013 {
2014 #pragma unused(p)
2015 #define RETURN_USE_BK   1
2016 #define RETURN_USE_DEFAULT      0
2017 	struct net_qos_param qos_arg;
2018 	struct ifnet *ipv4_primary, *ipv6_primary;
2019 	int err = 0;
2020 
2021 	if (arg->param == USER_ADDR_NULL || retval == NULL ||
2022 	    arg->param_len != sizeof(qos_arg)) {
2023 		return EINVAL;
2024 	}
2025 	err = copyin(arg->param, (caddr_t) &qos_arg, sizeof(qos_arg));
2026 	if (err != 0) {
2027 		return err;
2028 	}
2029 
2030 	*retval = RETURN_USE_DEFAULT;
2031 	ipv4_primary = ifindex2ifnet[get_primary_ifscope(AF_INET)];
2032 	ipv6_primary = ifindex2ifnet[get_primary_ifscope(AF_INET6)];
2033 
2034 	/*
2035 	 * If either of the interfaces is in Low Internet mode, enable
2036 	 * background delay based algorithms on this transfer
2037 	 */
2038 	if (qos_arg.nq_uplink) {
2039 		if ((ipv4_primary != NULL &&
2040 		    (ipv4_primary->if_xflags & IFXF_LOW_INTERNET_UL)) ||
2041 		    (ipv6_primary != NULL &&
2042 		    (ipv6_primary->if_xflags & IFXF_LOW_INTERNET_UL))) {
2043 			*retval = RETURN_USE_BK;
2044 			return 0;
2045 		}
2046 	} else {
2047 		if ((ipv4_primary != NULL &&
2048 		    (ipv4_primary->if_xflags & IFXF_LOW_INTERNET_DL)) ||
2049 		    (ipv6_primary != NULL &&
2050 		    (ipv6_primary->if_xflags & IFXF_LOW_INTERNET_DL))) {
2051 			*retval = RETURN_USE_BK;
2052 			return 0;
2053 		}
2054 	}
2055 
2056 	/*
2057 	 * Some times IPv4 and IPv6 primary interfaces can be different.
2058 	 * In this case, if either of them is non-cellular, we should mark
2059 	 * the transfer as BK as it can potentially get used based on
2060 	 * the host name resolution
2061 	 */
2062 	if (ipv4_primary != NULL && IFNET_IS_EXPENSIVE(ipv4_primary) &&
2063 	    ipv6_primary != NULL && IFNET_IS_EXPENSIVE(ipv6_primary)) {
2064 		if (qos_arg.nq_use_expensive) {
2065 			return 0;
2066 		} else {
2067 			*retval = RETURN_USE_BK;
2068 			return 0;
2069 		}
2070 	}
2071 	if (ipv4_primary != NULL && IFNET_IS_CONSTRAINED(ipv4_primary) &&
2072 	    ipv6_primary != NULL && IFNET_IS_CONSTRAINED(ipv6_primary)) {
2073 		if (qos_arg.nq_use_constrained) {
2074 			return 0;
2075 		} else {
2076 			*retval = RETURN_USE_BK;
2077 			return 0;
2078 		}
2079 	}
2080 	if (qos_arg.nq_transfer_size >= 5 * 1024 * 1024) {
2081 		*retval = RETURN_USE_BK;
2082 		return 0;
2083 	}
2084 
2085 
2086 #undef  RETURN_USE_BK
2087 #undef  RETURN_USE_DEFAULT
2088 	return 0;
2089 }
2090 
2091 #if (DEBUG || DEVELOPMENT)
2092 /*
2093  * Customizable QoS mapping table
2094  * By default it uses the mapping table for RFC 4594
2095  *
2096  * Notes:
2097  *   BK_SYS is the same as BK
2098  *   CTL cannot be changed and is always _DSCP_CS6
2099  */
/* Parent node for the customizable QoS mapping */
SYSCTL_NODE(_net_qos, OID_AUTO, custom,
    CTLFLAG_RW | CTLFLAG_LOCKED, 0, "");

/* Sub-node holding one entry per network service type */
SYSCTL_NODE(_net_qos_custom, OID_AUTO, netsvctype_to_dscp,
    CTLFLAG_RW | CTLFLAG_LOCKED, 0, "");

/*
 * One integer entry per network service type, all declared CTLFLAG_WR and
 * all served by the same handler; the NET_SERVICE_TYPE_* constant given as
 * the handler argument identifies which entry is being accessed.
 */
static int sysctl_net_qos_custom_netsvctype_to_dscp SYSCTL_HANDLER_ARGS;
SYSCTL_PROC(_net_qos_custom_netsvctype_to_dscp, OID_AUTO, be,
    CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED,
    0, NET_SERVICE_TYPE_BE, sysctl_net_qos_custom_netsvctype_to_dscp, "I", "");
SYSCTL_PROC(_net_qos_custom_netsvctype_to_dscp, OID_AUTO, bk,
    CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED,
    0, NET_SERVICE_TYPE_BK, sysctl_net_qos_custom_netsvctype_to_dscp, "I", "");
SYSCTL_PROC(_net_qos_custom_netsvctype_to_dscp, OID_AUTO, sig,
    CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED,
    0, NET_SERVICE_TYPE_SIG, sysctl_net_qos_custom_netsvctype_to_dscp, "I", "");
SYSCTL_PROC(_net_qos_custom_netsvctype_to_dscp, OID_AUTO, vi,
    CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED,
    0, NET_SERVICE_TYPE_VI, sysctl_net_qos_custom_netsvctype_to_dscp, "I", "");
SYSCTL_PROC(_net_qos_custom_netsvctype_to_dscp, OID_AUTO, vo,
    CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED,
    0, NET_SERVICE_TYPE_VO, sysctl_net_qos_custom_netsvctype_to_dscp, "I", "");
SYSCTL_PROC(_net_qos_custom_netsvctype_to_dscp, OID_AUTO, rv,
    CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED,
    0, NET_SERVICE_TYPE_RV, sysctl_net_qos_custom_netsvctype_to_dscp, "I", "");
SYSCTL_PROC(_net_qos_custom_netsvctype_to_dscp, OID_AUTO, av,
    CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED,
    0, NET_SERVICE_TYPE_AV, sysctl_net_qos_custom_netsvctype_to_dscp, "I", "");
SYSCTL_PROC(_net_qos_custom_netsvctype_to_dscp, OID_AUTO, oam,
    CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED,
    0, NET_SERVICE_TYPE_OAM, sysctl_net_qos_custom_netsvctype_to_dscp, "I", "");
SYSCTL_PROC(_net_qos_custom_netsvctype_to_dscp, OID_AUTO, rd,
    CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED,
    0, NET_SERVICE_TYPE_RD, sysctl_net_qos_custom_netsvctype_to_dscp, "I", "");

2134 
2135 static int sysctl_net_qos_custom_reset SYSCTL_HANDLER_ARGS;
2136 SYSCTL_PROC(_net_qos_custom, OID_AUTO, reset,
2137     CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED,
2138     0, 0, sysctl_net_qos_custom_reset, "I", "");
2139 
2140 int
2141 sysctl_net_qos_custom_netsvctype_to_dscp SYSCTL_HANDLER_ARGS
2142 {
2143 #pragma unused(arg1)
2144 	int error = 0;
2145 
2146 	switch (arg2) {
2147 	case NET_SERVICE_TYPE_BE:
2148 	case NET_SERVICE_TYPE_BK:
2149 	case NET_SERVICE_TYPE_SIG:
2150 	case NET_SERVICE_TYPE_VI:
2151 	case NET_SERVICE_TYPE_VO:
2152 	case NET_SERVICE_TYPE_RV:
2153 	case NET_SERVICE_TYPE_AV:
2154 	case NET_SERVICE_TYPE_OAM:
2155 	case NET_SERVICE_TYPE_RD:
2156 		break;
2157 	default:
2158 		os_log(OS_LOG_DEFAULT, "%s: unexpected netsvctype %d",
2159 		    __func__, arg2);
2160 		return EINVAL;
2161 	}
2162 
2163 	int val = custom_net_qos_dscp_map.netsvctype_to_dscp[arg2];
2164 	error = sysctl_handle_int(oidp, &val, 0, req);
2165 	if (error != 0 || req->newptr == USER_ADDR_NULL) {
2166 		return error;
2167 	}
2168 	if (req->newptr == USER_ADDR_NULL) {
2169 		return 0;
2170 	}
2171 	error = proc_suser(current_proc());
2172 	if (error != 0) {
2173 		return error;
2174 	}
2175 	if (val < 0 || val > _MAX_DSCP) {
2176 		os_log(OS_LOG_DEFAULT, "%s: unexpected DSCP %d",
2177 		    __func__, val);
2178 		return EINVAL;
2179 	}
2180 
2181 	struct netsvctype_dscp_map netsvctype_dscp_map[_NET_SERVICE_TYPE_COUNT] = {};
2182 
2183 	for (int i = 0; i < _NET_SERVICE_TYPE_COUNT; i++) {
2184 		netsvctype_dscp_map[i].netsvctype = i;
2185 		netsvctype_dscp_map[i].dscp = custom_net_qos_dscp_map.netsvctype_to_dscp[i];
2186 	}
2187 	netsvctype_dscp_map[arg2].dscp = (uint8_t) val;
2188 
2189 	error = set_netsvctype_dscp_map(&custom_net_qos_dscp_map,
2190 	    netsvctype_dscp_map);
2191 
2192 	return 0;
2193 }
2194 
2195 int
2196 sysctl_net_qos_custom_reset SYSCTL_HANDLER_ARGS
2197 {
2198 #pragma unused(arg1, arg2)
2199 	int error = 0;
2200 	int val = 0;
2201 
2202 	error = sysctl_handle_int(oidp, &val, 0, req);
2203 	if (error || !req->newptr) {
2204 		return error;
2205 	}
2206 	if (req->newptr == USER_ADDR_NULL) {
2207 		return 0;
2208 	}
2209 	error = proc_suser(current_proc());
2210 	if (error != 0) {
2211 		return error;
2212 	}
2213 
2214 	error = set_netsvctype_dscp_map(&custom_net_qos_dscp_map,
2215 	    rfc4594_netsvctype_dscp_map);
2216 
2217 	return error;
2218 }
2219 
2220 uint8_t
custom_sc_to_dscp(uint32_t svc_class)2221 custom_sc_to_dscp(uint32_t svc_class)
2222 {
2223 	uint8_t dscp = _DSCP_DF;
2224 
2225 	switch (svc_class) {
2226 	case MBUF_SC_BK_SYS:
2227 		dscp = custom_net_qos_dscp_map.sotc_to_dscp[SOTCIX_BK_SYS];
2228 		break;
2229 	case MBUF_SC_BK:
2230 		dscp = custom_net_qos_dscp_map.sotc_to_dscp[SOTCIX_BK];
2231 		break;
2232 
2233 	case MBUF_SC_BE:
2234 		dscp = custom_net_qos_dscp_map.sotc_to_dscp[SOTCIX_BE];
2235 		break;
2236 	case MBUF_SC_RD:
2237 		dscp = custom_net_qos_dscp_map.sotc_to_dscp[SOTCIX_RD];
2238 		break;
2239 	case MBUF_SC_OAM:
2240 		dscp = custom_net_qos_dscp_map.sotc_to_dscp[SOTCIX_OAM];
2241 		break;
2242 
2243 	case MBUF_SC_AV:
2244 		dscp = custom_net_qos_dscp_map.sotc_to_dscp[SOTCIX_AV];
2245 		break;
2246 	case MBUF_SC_RV:
2247 		dscp = custom_net_qos_dscp_map.sotc_to_dscp[SOTCIX_RV];
2248 		break;
2249 	case MBUF_SC_VI:
2250 		dscp = custom_net_qos_dscp_map.sotc_to_dscp[SOTCIX_VI];
2251 		break;
2252 	case MBUF_SC_SIG:
2253 		dscp = custom_net_qos_dscp_map.netsvctype_to_dscp[NET_SERVICE_TYPE_SIG];
2254 		break;
2255 
2256 	case MBUF_SC_VO:
2257 		dscp = custom_net_qos_dscp_map.sotc_to_dscp[SOTCIX_VO];
2258 		break;
2259 	case MBUF_SC_CTL:
2260 		dscp = custom_net_qos_dscp_map.sotc_to_dscp[SOTCIX_CTL];
2261 		break;
2262 	default:
2263 		break;
2264 	}
2265 	return dscp;
2266 }
2267 #endif /* (DEBUG || DEVELOPMENT) */
2268