xref: /xnu-12377.81.4/bsd/net/flowadv.c (revision 043036a2b3718f7f0be807e2870f8f47d3fa0796)
1 /*
2  * Copyright (c) 2012-2021 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 /*
30  * Flow Control and Feedback Advisory
31  *
32  * Each mbuf that is being sent out through an interface is tagged with a
33  * unique 32-bit ID which will help to identify all the packets that belong
34  * to a particular flow at the interface layer.  Packets carrying such ID
35  * would need to be marked with PKTF_FLOW_ID.  Normally, this ID is computed
36  * by the module that generates the flow.  There are 3 kinds of flow sources
37  * that are currently recognized:
38  *
39  *	a. INPCB (INET/INET6 Protocol Control Block).  When a socket is
40  *	   connected, the flow hash for the socket is computed and stored in
41  *	   the PCB.  Further transmissions on the socket will cause the hash
42  *	   value to be carried within the mbuf as the flow ID.
43  *
44  *	b. Interface.  When an interface is attached, the flow hash for the
45  *	   interface is computed and stored in the ifnet.  This value is
46  *	   normally ignored for most network drivers, except for those that
47  *	   reside atop another driver, e.g. a virtual interface performing
48  *	   encapsulation/encryption on the original packet and sending the
49  *	   newly-generated packet to another interface.  Such interface needs
50  *	   to associate all generated packets with the interface flow hash
51  *	   value as the flow ID.
52  *
53  *	c. PF (Packet Filter).  When a packet goes through PF and it is not
54  *	   already associated with a flow ID, PF will compute a flow hash and
55  *	   store it in the packet as flow ID.  When the packet is associated
56  *	   with a PF state, the state record will have the flow ID stored
57  *	   within, in order to avoid recalculating the flow hash.  Although PF
58  *	   is capable of generating flow IDs, it does not participate in flow
59  *	   advisory, and therefore packets whose IDs are computed by PF will
60  *	   not have their PKTF_FLOW_ADV packet flag set.
61  *
62  * Activation of flow advisory mechanism is done by setting the PKTF_FLOW_ADV
63  * packet flag; because a flow ID is required, the mechanism will not take
64  * place unless PKTF_FLOW_ID is set as well.  The packet must also carry one
65  * of the flow source types FLOWSRC_{INPCB,IFNET} in order to identify where
66  * the flow advisory notification should be delivered to.  As noted above,
67  * FLOWSRC_PF does not participate in this mechanism.
68  *
69  * The classq module configured on the interface is responsible for exerting
70  * flow control to the upper layers.  This occurs when the number of packets
71  * queued for a flow reaches a limit.  The module generating the flow will
72  * cease transmission until further flow advisory notice, and the flow will
73  * be inserted into the classq's flow control list.
74  *
75  * When packets are dequeued from the classq and the number of packets for
76  * a flow goes below a limit, the classq will transfer its flow control list
77  * to the global fadv_list.  This will then trigger the flow advisory thread
78  * to run, which will cause the flow source modules to be notified that data
79  * can now be generated for those previously flow-controlled flows.
80  */
81 
82 #include <sys/param.h>
83 #include <sys/systm.h>
84 #include <sys/kernel.h>
85 #include <sys/mcache.h> /* for VERIFY() */
86 #include <sys/mbuf.h>
87 #include <sys/proc_internal.h>
88 #include <sys/socketvar.h>
89 
90 #include <kern/assert.h>
91 #include <kern/thread.h>
92 #include <kern/locks.h>
93 #include <kern/zalloc.h>
94 
95 #include <netinet/in_pcb.h>
96 #include <net/flowadv.h>
97 #if SKYWALK
98 #include <skywalk/os_channel.h>
99 #endif /* SKYWALK */
100 
101 /* Lock group and attribute for fadv_lock */
102 static LCK_GRP_DECLARE(fadv_lock_grp, "fadv_lock");
103 static LCK_MTX_DECLARE(fadv_lock, &fadv_lock_grp);
104 
105 /* protected by fadv_lock */
106 static STAILQ_HEAD(fadv_head, flowadv_fcentry) fadv_list =
107     STAILQ_HEAD_INITIALIZER(fadv_list);
108 static thread_t fadv_thread = THREAD_NULL;
109 static uint32_t fadv_active;
110 
111 #define FADV_CACHE_NAME  "flowadv"              /* cache name */
112 
113 static int flowadv_thread_cont(int);
114 static void flowadv_thread_func(void *, wait_result_t);
115 
116 void
flowadv_init(void)117 flowadv_init(void)
118 {
119 	if (kernel_thread_start(flowadv_thread_func, NULL, &fadv_thread) !=
120 	    KERN_SUCCESS) {
121 		panic("%s: couldn't create flow event advisory thread",
122 		    __func__);
123 		/* NOTREACHED */
124 	}
125 	thread_deallocate(fadv_thread);
126 }
127 
128 struct flowadv_fcentry *
flowadv_alloc_entry(int how)129 flowadv_alloc_entry(int how)
130 {
131 	return kalloc_type(struct flowadv_fcentry, how | Z_ZERO);
132 }
133 
134 void
flowadv_free_entry(struct flowadv_fcentry * fce)135 flowadv_free_entry(struct flowadv_fcentry *fce)
136 {
137 	kfree_type(struct flowadv_fcentry, fce);
138 }
139 
140 void
flowadv_add(struct flowadv_fclist * fcl)141 flowadv_add(struct flowadv_fclist *fcl)
142 {
143 	if (STAILQ_EMPTY(fcl)) {
144 		return;
145 	}
146 
147 	lck_mtx_lock_spin(&fadv_lock);
148 
149 	STAILQ_CONCAT(&fadv_list, fcl);
150 	VERIFY(!STAILQ_EMPTY(&fadv_list));
151 
152 	if (!fadv_active && fadv_thread != THREAD_NULL) {
153 		wakeup_one((caddr_t)&fadv_list);
154 	}
155 
156 	lck_mtx_unlock(&fadv_lock);
157 }
158 
159 void
flowadv_add_entry(struct flowadv_fcentry * fce)160 flowadv_add_entry(struct flowadv_fcentry *fce)
161 {
162 	lck_mtx_lock_spin(&fadv_lock);
163 	STAILQ_INSERT_HEAD(&fadv_list, fce, fce_link);
164 	VERIFY(!STAILQ_EMPTY(&fadv_list));
165 
166 	if (!fadv_active && fadv_thread != THREAD_NULL) {
167 		wakeup_one((caddr_t)&fadv_list);
168 	}
169 
170 	lck_mtx_unlock(&fadv_lock);
171 }
172 
173 static int
flowadv_thread_cont(int err)174 flowadv_thread_cont(int err)
175 {
176 #pragma unused(err)
177 	for (;;) {
178 		LCK_MTX_ASSERT(&fadv_lock, LCK_MTX_ASSERT_OWNED);
179 		while (STAILQ_EMPTY(&fadv_list)) {
180 			VERIFY(!fadv_active);
181 			(void) msleep0(&fadv_list, &fadv_lock, (PSOCK | PSPIN),
182 			    "flowadv_cont", 0, flowadv_thread_cont);
183 			/* NOTREACHED */
184 		}
185 
186 		fadv_active = 1;
187 		for (;;) {
188 			struct flowadv_fcentry *fce;
189 
190 			VERIFY(!STAILQ_EMPTY(&fadv_list));
191 			fce = STAILQ_FIRST(&fadv_list);
192 			STAILQ_REMOVE(&fadv_list, fce,
193 			    flowadv_fcentry, fce_link);
194 			STAILQ_NEXT(fce, fce_link) = NULL;
195 
196 			lck_mtx_unlock(&fadv_lock);
197 
198 			if (fce->fce_event_type == FCE_EVENT_TYPE_CONGESTION_EXPERIENCED) {
199 				switch (fce->fce_flowsrc_type) {
200 				case FLOWSRC_CHANNEL:
201 					kern_channel_flowadv_report_congestion_event(fce,
202 					    fce->fce_congestion_cnt, fce->l4s_ce_cnt,
203 					    fce->fce_pkts_since_last_report);
204 					break;
205 				case FLOWSRC_INPCB:
206 				case FLOWSRC_IFNET:
207 				case FLOWSRC_PF:
208 				default:
209 					break;
210 				}
211 
212 				goto next;
213 			}
214 
215 			switch (fce->fce_flowsrc_type) {
216 			case FLOWSRC_INPCB:
217 				inp_flowadv(fce->fce_flowid);
218 				break;
219 
220 			case FLOWSRC_IFNET:
221 #if SKYWALK
222 				/*
223 				 * when using the flowID allocator, IPSec
224 				 * driver uses the "pkt_flowid" field in mbuf
225 				 * packet header for the globally unique flowID
226 				 * and the "pkt_mpriv_srcid" field carries the
227 				 * interface flow control id (if_flowhash).
228 				 * For IPSec flows, it is the IPSec driver
229 				 * network interface which is flow controlled,
230 				 * instead of the IPSec SA flow.
231 				 */
232 				ifnet_flowadv(fce->fce_flowsrc_token);
233 #else /* !SKYWALK */
234 				ifnet_flowadv(fce->fce_flowid);
235 #endif /* !SKYWALK */
236 				break;
237 
238 #if SKYWALK
239 			case FLOWSRC_CHANNEL:
240 				kern_channel_flowadv_clear(fce);
241 				break;
242 #endif /* SKYWALK */
243 
244 			case FLOWSRC_PF:
245 			default:
246 				break;
247 			}
248 next:
249 			flowadv_free_entry(fce);
250 			lck_mtx_lock_spin(&fadv_lock);
251 
252 			/* if there's no pending request, we're done */
253 			if (STAILQ_EMPTY(&fadv_list)) {
254 				break;
255 			}
256 		}
257 		fadv_active = 0;
258 	}
259 }
260 
261 __dead2
262 static void
flowadv_thread_func(void * v,wait_result_t w)263 flowadv_thread_func(void *v, wait_result_t w)
264 {
265 #pragma unused(v, w)
266 	lck_mtx_lock(&fadv_lock);
267 	(void) msleep0(&fadv_list, &fadv_lock, (PSOCK | PSPIN),
268 	    "flowadv", 0, flowadv_thread_cont);
269 	/*
270 	 * msleep0() shouldn't have returned as PCATCH was not set;
271 	 * therefore assert in this case.
272 	 */
273 	lck_mtx_unlock(&fadv_lock);
274 	VERIFY(0);
275 }
276