xref: /xnu-8020.140.41/bsd/netinet/mptcp_subr.c (revision 27b03b360a988dfd3dfdf34262bb0042026747cc)
1 /*
2  * Copyright (c) 2012-2021 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 #include <kern/locks.h>
30 #include <kern/policy_internal.h>
31 #include <kern/zalloc.h>
32 
33 #include <mach/sdt.h>
34 
35 #include <sys/domain.h>
36 #include <sys/kdebug.h>
37 #include <sys/kern_control.h>
38 #include <sys/kernel.h>
39 #include <sys/mbuf.h>
40 #include <sys/mcache.h>
41 #include <sys/param.h>
42 #include <sys/proc.h>
43 #include <sys/protosw.h>
44 #include <sys/resourcevar.h>
45 #include <sys/socket.h>
46 #include <sys/socketvar.h>
47 #include <sys/sysctl.h>
48 #include <sys/syslog.h>
49 #include <sys/systm.h>
50 
51 #include <net/content_filter.h>
52 #include <net/if.h>
53 #include <net/if_var.h>
54 #include <netinet/in.h>
55 #include <netinet/in_pcb.h>
56 #include <netinet/in_var.h>
57 #include <netinet/tcp.h>
58 #include <netinet/tcp_cache.h>
59 #include <netinet/tcp_fsm.h>
60 #include <netinet/tcp_seq.h>
61 #include <netinet/tcp_var.h>
62 #include <netinet/mptcp_var.h>
63 #include <netinet/mptcp.h>
64 #include <netinet/mptcp_opt.h>
65 #include <netinet/mptcp_seq.h>
66 #include <netinet/mptcp_timer.h>
67 #include <libkern/crypto/sha1.h>
68 #include <libkern/crypto/sha2.h>
69 #include <netinet6/in6_pcb.h>
70 #include <netinet6/ip6protosw.h>
71 #include <dev/random/randomdev.h>
72 
73 /*
74  * Notes on MPTCP implementation.
75  *
76  * MPTCP is implemented as <SOCK_STREAM,IPPROTO_TCP> protocol in PF_MULTIPATH
77  * communication domain.  The structure mtcbinfo describes the MPTCP instance
78  * of a Multipath protocol in that domain.  It is used to keep track of all
79  * MPTCP PCB instances in the system, and is protected by the global lock
80  * mppi_lock.
81  *
82  * An MPTCP socket is opened by calling socket(PF_MULTIPATH, SOCK_STREAM,
83  * IPPROTO_TCP).  Upon success, a Multipath PCB gets allocated and along with
84  * it comes an MPTCP Session and an MPTCP PCB.  All three structures are
85  * allocated from the same memory block, and each structure has a pointer
86  * to the adjacent ones.  The layout is defined by the mpp_mtp structure.
87  * The socket lock (mpp_lock) is used to protect accesses to the Multipath
88  * PCB (mppcb) as well as the MPTCP Session (mptses).
89  *
90  * The MPTCP Session is an MPTCP-specific extension to the Multipath PCB;
91  *
92  * A functioning MPTCP Session consists of one or more subflow sockets.  Each
93  * subflow socket is essentially a regular PF_INET/PF_INET6 TCP socket, and is
94  * represented by the mptsub structure.  Because each subflow requires access
95  * to the MPTCP Session, the MPTCP socket's so_usecount is bumped up for each
96  * subflow.  This gets decremented prior to the subflow's destruction.
97  *
98  * To handle events (read, write, control) from the subflows, we do direct
99  * upcalls into the specific function.
100  *
101  * The whole MPTCP connection is protected by a single lock, the MPTCP socket's
102  * lock. Incoming data on a subflow also ends up taking this single lock. To
103  * achieve the latter, tcp_lock/unlock has been changed to rather use the lock
104  * of the MPTCP-socket.
105  *
106  * An MPTCP socket will be destroyed when its so_usecount drops to zero; this
107  * work is done by the MPTCP garbage collector which is invoked on demand by
108  * the PF_MULTIPATH garbage collector.  This process will take place once all
109  * of the subflows have been destroyed.
110  */
111 
112 static void mptcp_attach_to_subf(struct socket *, struct mptcb *, uint8_t);
113 static void mptcp_detach_mptcb_from_subf(struct mptcb *, struct socket *);
114 
115 static uint32_t mptcp_gc(struct mppcbinfo *);
116 static int mptcp_subflow_soreceive(struct socket *, struct sockaddr **,
117     struct uio *, struct mbuf **, struct mbuf **, int *);
118 static int mptcp_subflow_sosend(struct socket *, struct sockaddr *,
119     struct uio *, struct mbuf *, struct mbuf *, int);
120 static void mptcp_subflow_wupcall(struct socket *, void *, int);
121 static void mptcp_subflow_eupcall1(struct socket *so, void *arg, long events);
122 static void mptcp_update_last_owner(struct socket *so, struct socket *mp_so);
123 static void mptcp_drop_tfo_data(struct mptses *, struct mptsub *);
124 
125 static void mptcp_subflow_abort(struct mptsub *, int);
126 
127 static void mptcp_send_dfin(struct socket *so);
128 static void mptcp_set_cellicon(struct mptses *mpte, struct mptsub *mpts);
129 static int mptcp_freeq(struct mptcb *mp_tp);
130 
131 /*
132  * Possible return values for subflow event handlers.  Note that success
133  * values must be greater or equal than MPTS_EVRET_OK.  Values less than that
134  * indicate errors or actions which require immediate attention; they will
135  * prevent the rest of the handlers from processing their respective events
136  * until the next round of events processing.
137  */
138 typedef enum {
139 	MPTS_EVRET_DELETE               = 1,    /* delete this subflow */
140 	MPTS_EVRET_OK                   = 2,    /* OK */
141 	MPTS_EVRET_CONNECT_PENDING      = 3,    /* resume pended connects */
142 	MPTS_EVRET_DISCONNECT_FALLBACK  = 4,    /* abort all but preferred */
143 } ev_ret_t;
144 
145 static ev_ret_t mptcp_subflow_propagate_ev(struct mptses *, struct mptsub *, long *, long);
146 static ev_ret_t mptcp_subflow_nosrcaddr_ev(struct mptses *, struct mptsub *, long *, long);
147 static ev_ret_t mptcp_subflow_failover_ev(struct mptses *, struct mptsub *, long *, long);
148 static ev_ret_t mptcp_subflow_ifdenied_ev(struct mptses *, struct mptsub *, long *, long);
149 static ev_ret_t mptcp_subflow_connected_ev(struct mptses *, struct mptsub *, long *, long);
150 static ev_ret_t mptcp_subflow_disconnected_ev(struct mptses *, struct mptsub *, long *, long);
151 static ev_ret_t mptcp_subflow_mpstatus_ev(struct mptses *, struct mptsub *, long *, long);
152 static ev_ret_t mptcp_subflow_mustrst_ev(struct mptses *, struct mptsub *, long *, long);
153 static ev_ret_t mptcp_subflow_mpcantrcvmore_ev(struct mptses *, struct mptsub *, long *, long);
154 static ev_ret_t mptcp_subflow_mpsuberror_ev(struct mptses *, struct mptsub *, long *, long);
155 static ev_ret_t mptcp_subflow_adaptive_rtimo_ev(struct mptses *, struct mptsub *, long *, long);
156 static ev_ret_t mptcp_subflow_adaptive_wtimo_ev(struct mptses *, struct mptsub *, long *, long);
157 
158 static void mptcp_do_sha1(mptcp_key_t *, char *);
159 static void mptcp_do_sha256(mptcp_key_t *, char *);
160 
161 static void mptcp_init_local_parms(struct mptses *, struct sockaddr *);
162 
163 static ZONE_DEFINE_TYPE(mptsub_zone, "mptsub", struct mptsub, ZC_ZFREE_CLEARMEM);
164 static ZONE_DEFINE_TYPE(mptopt_zone, "mptopt", struct mptopt, ZC_ZFREE_CLEARMEM);
165 static ZONE_DEFINE(mpt_subauth_zone, "mptauth",
166     sizeof(struct mptcp_subf_auth_entry), ZC_NONE);
167 
168 struct mppcbinfo mtcbinfo;
169 
170 SYSCTL_DECL(_net_inet);
171 
172 SYSCTL_NODE(_net_inet, OID_AUTO, mptcp, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "MPTCP");
173 
174 uint32_t mptcp_dbg_area = 31;           /* more noise if greater than 1 */
175 SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, dbg_area, CTLFLAG_RW | CTLFLAG_LOCKED,
176     &mptcp_dbg_area, 0, "MPTCP debug area");
177 
178 uint32_t mptcp_dbg_level = 1;
179 SYSCTL_INT(_net_inet_mptcp, OID_AUTO, dbg_level, CTLFLAG_RW | CTLFLAG_LOCKED,
180     &mptcp_dbg_level, 0, "MPTCP debug level");
181 
182 SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, pcbcount, CTLFLAG_RD | CTLFLAG_LOCKED,
183     &mtcbinfo.mppi_count, 0, "Number of active PCBs");
184 
185 
186 static int mptcp_alternate_port = 0;
187 SYSCTL_INT(_net_inet_mptcp, OID_AUTO, alternate_port, CTLFLAG_RW | CTLFLAG_LOCKED,
188     &mptcp_alternate_port, 0, "Set alternate port for MPTCP connections");
189 
190 static struct protosw mptcp_subflow_protosw;
191 static struct pr_usrreqs mptcp_subflow_usrreqs;
192 static struct ip6protosw mptcp_subflow_protosw6;
193 static struct pr_usrreqs mptcp_subflow_usrreqs6;
194 
195 static uint8_t  mptcp_create_subflows_scheduled;
196 
197 typedef struct mptcp_subflow_event_entry {
198 	long        sofilt_hint_mask;
199 	ev_ret_t    (*sofilt_hint_ev_hdlr)(
200 		struct mptses *mpte,
201 		struct mptsub *mpts,
202 		long *p_mpsofilt_hint,
203 		long event);
204 } mptsub_ev_entry_t;
205 
206 /* Using Symptoms Advisory to detect poor WiFi or poor Cell */
207 static kern_ctl_ref mptcp_kern_ctrl_ref = NULL;
208 static uint32_t mptcp_kern_skt_inuse = 0;
209 static uint32_t mptcp_kern_skt_unit;
210 static symptoms_advisory_t mptcp_advisory;
211 
212 uint32_t mptcp_cellicon_refcount = 0;
213 
214 /*
215  * XXX The order of the event handlers below is really
216  * really important. Think twice before changing it.
217  */
218 static mptsub_ev_entry_t mpsub_ev_entry_tbl[] = {
219 	{
220 		.sofilt_hint_mask = SO_FILT_HINT_MP_SUB_ERROR,
221 		.sofilt_hint_ev_hdlr = mptcp_subflow_mpsuberror_ev,
222 	},
223 	{
224 		.sofilt_hint_mask = SO_FILT_HINT_MPCANTRCVMORE,
225 		.sofilt_hint_ev_hdlr =  mptcp_subflow_mpcantrcvmore_ev,
226 	},
227 	{
228 		.sofilt_hint_mask = SO_FILT_HINT_MPFAILOVER,
229 		.sofilt_hint_ev_hdlr = mptcp_subflow_failover_ev,
230 	},
231 	{
232 		.sofilt_hint_mask = SO_FILT_HINT_CONNRESET,
233 		.sofilt_hint_ev_hdlr = mptcp_subflow_propagate_ev,
234 	},
235 	{
236 		.sofilt_hint_mask = SO_FILT_HINT_MUSTRST,
237 		.sofilt_hint_ev_hdlr = mptcp_subflow_mustrst_ev,
238 	},
239 	{
240 		.sofilt_hint_mask = SO_FILT_HINT_CANTRCVMORE,
241 		.sofilt_hint_ev_hdlr = mptcp_subflow_propagate_ev,
242 	},
243 	{
244 		.sofilt_hint_mask = SO_FILT_HINT_TIMEOUT,
245 		.sofilt_hint_ev_hdlr = mptcp_subflow_propagate_ev,
246 	},
247 	{
248 		.sofilt_hint_mask = SO_FILT_HINT_NOSRCADDR,
249 		.sofilt_hint_ev_hdlr = mptcp_subflow_nosrcaddr_ev,
250 	},
251 	{
252 		.sofilt_hint_mask = SO_FILT_HINT_IFDENIED,
253 		.sofilt_hint_ev_hdlr = mptcp_subflow_ifdenied_ev,
254 	},
255 	{
256 		.sofilt_hint_mask = SO_FILT_HINT_CONNECTED,
257 		.sofilt_hint_ev_hdlr = mptcp_subflow_connected_ev,
258 	},
259 	{
260 		.sofilt_hint_mask = SO_FILT_HINT_MPSTATUS,
261 		.sofilt_hint_ev_hdlr = mptcp_subflow_mpstatus_ev,
262 	},
263 	{
264 		.sofilt_hint_mask = SO_FILT_HINT_DISCONNECTED,
265 		.sofilt_hint_ev_hdlr = mptcp_subflow_disconnected_ev,
266 	},
267 	{
268 		.sofilt_hint_mask = SO_FILT_HINT_ADAPTIVE_RTIMO,
269 		.sofilt_hint_ev_hdlr = mptcp_subflow_adaptive_rtimo_ev,
270 	},
271 	{
272 		.sofilt_hint_mask = SO_FILT_HINT_ADAPTIVE_WTIMO,
273 		.sofilt_hint_ev_hdlr = mptcp_subflow_adaptive_wtimo_ev,
274 	},
275 };
276 
277 os_log_t mptcp_log_handle;
278 
279 /*
280  * Protocol pr_init callback.
281  */
/*
 * One-time MPTCP initialization (pr_init of the PF_MULTIPATH protosw):
 * clones the TCP protosw/usrreqs for IPv4 and IPv6 so subflow sockets can
 * override receive/send/oob, poisons the socket-filter heads (filters must
 * never attach to the subflow protosw), and sets up the global MPTCP
 * pcbinfo (zone, lock, GC and timer hooks).
 */
282 void
mptcp_init(struct protosw * pp,struct domain * dp)283 mptcp_init(struct protosw *pp, struct domain *dp)
284 {
285 #pragma unused(dp)
286 	static int mptcp_initialized = 0;
287 	struct protosw *prp;
288 	struct ip6protosw *prp6;
289 
	/* pr_init must run on an attached but not-yet-initialized protosw */
290 	VERIFY((pp->pr_flags & (PR_INITIALIZED | PR_ATTACHED)) == PR_ATTACHED);
291 
292 	/* do this only once */
293 	if (mptcp_initialized) {
294 		return;
295 	}
296 	mptcp_initialized = 1;
297 
	/* Assume WiFi is fine until the Symptoms framework tells us otherwise */
298 	mptcp_advisory.sa_wifi_status = SYMPTOMS_ADVISORY_WIFI_OK;
299 
300 	/*
301 	 * Since PF_MULTIPATH gets initialized after PF_INET/INET6,
302 	 * we must be able to find IPPROTO_TCP entries for both.
303 	 */
	/*
	 * Clone the IPv4 TCP protosw/usrreqs, then override the entry points
	 * that subflow sockets must intercept (receive, send, out-of-band).
	 */
304 	prp = pffindproto_locked(PF_INET, IPPROTO_TCP, SOCK_STREAM);
305 	VERIFY(prp != NULL);
306 	bcopy(prp, &mptcp_subflow_protosw, sizeof(*prp));
307 	bcopy(prp->pr_usrreqs, &mptcp_subflow_usrreqs,
308 	    sizeof(mptcp_subflow_usrreqs));
	/* The clone is not on the domain's protosw list; clear the linkage */
309 	mptcp_subflow_protosw.pr_entry.tqe_next = NULL;
310 	mptcp_subflow_protosw.pr_entry.tqe_prev = NULL;
311 	mptcp_subflow_protosw.pr_usrreqs = &mptcp_subflow_usrreqs;
312 	mptcp_subflow_usrreqs.pru_soreceive = mptcp_subflow_soreceive;
313 	mptcp_subflow_usrreqs.pru_sosend = mptcp_subflow_sosend;
314 	mptcp_subflow_usrreqs.pru_rcvoob = pru_rcvoob_notsupp;
315 	/*
316 	 * Socket filters shouldn't attach/detach to/from this protosw
317 	 * since pr_protosw is to be used instead, which points to the
318 	 * real protocol; if they do, it is a bug and we should panic.
319 	 */
320 	mptcp_subflow_protosw.pr_filter_head.tqh_first =
321 	    (struct socket_filter *)(uintptr_t)0xdeadbeefdeadbeef;
322 	mptcp_subflow_protosw.pr_filter_head.tqh_last =
323 	    (struct socket_filter **)(uintptr_t)0xdeadbeefdeadbeef;
324 
	/* Same setup for the IPv6 flavor of the subflow protosw */
325 	prp6 = (struct ip6protosw *)pffindproto_locked(PF_INET6,
326 	    IPPROTO_TCP, SOCK_STREAM);
327 	VERIFY(prp6 != NULL);
328 	bcopy(prp6, &mptcp_subflow_protosw6, sizeof(*prp6));
329 	bcopy(prp6->pr_usrreqs, &mptcp_subflow_usrreqs6,
330 	    sizeof(mptcp_subflow_usrreqs6));
331 	mptcp_subflow_protosw6.pr_entry.tqe_next = NULL;
332 	mptcp_subflow_protosw6.pr_entry.tqe_prev = NULL;
333 	mptcp_subflow_protosw6.pr_usrreqs = &mptcp_subflow_usrreqs6;
334 	mptcp_subflow_usrreqs6.pru_soreceive = mptcp_subflow_soreceive;
335 	mptcp_subflow_usrreqs6.pru_sosend = mptcp_subflow_sosend;
336 	mptcp_subflow_usrreqs6.pru_rcvoob = pru_rcvoob_notsupp;
337 	/*
338 	 * Socket filters shouldn't attach/detach to/from this protosw
339 	 * since pr_protosw is to be used instead, which points to the
340 	 * real protocol; if they do, it is a bug and we should panic.
341 	 */
342 	mptcp_subflow_protosw6.pr_filter_head.tqh_first =
343 	    (struct socket_filter *)(uintptr_t)0xdeadbeefdeadbeef;
344 	mptcp_subflow_protosw6.pr_filter_head.tqh_last =
345 	    (struct socket_filter **)(uintptr_t)0xdeadbeefdeadbeef;
346 
	/*
	 * Global MPTCP pcbinfo: allocation zone for the combined mpp_mtp
	 * block, the mppi lock, and the GC/timer callbacks.
	 */
347 	bzero(&mtcbinfo, sizeof(mtcbinfo));
348 	TAILQ_INIT(&mtcbinfo.mppi_pcbs);
349 	mtcbinfo.mppi_zone = zone_create("mptc", sizeof(struct mpp_mtp), ZC_NONE);
350 
351 	mtcbinfo.mppi_lock_grp = lck_grp_alloc_init("mppcb", LCK_GRP_ATTR_NULL);
352 	lck_attr_setdefault(&mtcbinfo.mppi_lock_attr);
353 	lck_mtx_init(&mtcbinfo.mppi_lock, mtcbinfo.mppi_lock_grp,
354 	    &mtcbinfo.mppi_lock_attr);
355 
356 	mtcbinfo.mppi_gc = mptcp_gc;
357 	mtcbinfo.mppi_timer = mptcp_timer;
358 
359 	/* attach to MP domain for garbage collection to take place */
360 	mp_pcbinfo_attach(&mtcbinfo);
361 
362 	mptcp_log_handle = os_log_create("com.apple.xnu.net.mptcp", "mptcp");
363 }
364 
365 int
mptcpstats_get_index_by_ifindex(struct mptcp_itf_stats * stats,u_short ifindex,boolean_t create)366 mptcpstats_get_index_by_ifindex(struct mptcp_itf_stats *stats, u_short ifindex, boolean_t create)
367 {
368 	int i, index = -1;
369 
370 	for (i = 0; i < MPTCP_ITFSTATS_SIZE; i++) {
371 		if (create && stats[i].ifindex == IFSCOPE_NONE) {
372 			if (index < 0) {
373 				index = i;
374 			}
375 			continue;
376 		}
377 
378 		if (stats[i].ifindex == ifindex) {
379 			index = i;
380 			return index;
381 		}
382 	}
383 
384 	if (index != -1) {
385 		stats[index].ifindex = ifindex;
386 	}
387 
388 	return index;
389 }
390 
391 static int
mptcpstats_get_index(struct mptcp_itf_stats * stats,const struct mptsub * mpts)392 mptcpstats_get_index(struct mptcp_itf_stats *stats, const struct mptsub *mpts)
393 {
394 	const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
395 	int index;
396 
397 	if (ifp == NULL) {
398 		os_log_error(mptcp_log_handle, "%s - %lx: no ifp on subflow, state %u flags %#x\n",
399 		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpts->mpts_mpte),
400 		    sototcpcb(mpts->mpts_socket)->t_state, mpts->mpts_flags);
401 		return -1;
402 	}
403 
404 	index = mptcpstats_get_index_by_ifindex(stats, ifp->if_index, true);
405 
406 	if (index != -1) {
407 		if (stats[index].is_expensive == 0) {
408 			stats[index].is_expensive = IFNET_IS_CELLULAR(ifp);
409 		}
410 	}
411 
412 	return index;
413 }
414 
415 void
mptcpstats_inc_switch(struct mptses * mpte,const struct mptsub * mpts)416 mptcpstats_inc_switch(struct mptses *mpte, const struct mptsub *mpts)
417 {
418 	int index;
419 
420 	tcpstat.tcps_mp_switches++;
421 	mpte->mpte_subflow_switches++;
422 
423 	index = mptcpstats_get_index(mpte->mpte_itfstats, mpts);
424 
425 	if (index != -1) {
426 		mpte->mpte_itfstats[index].switches++;
427 	}
428 }
429 
430 /*
431  * Flushes all recorded socket options from an MP socket.
432  */
433 static void
mptcp_flush_sopts(struct mptses * mpte)434 mptcp_flush_sopts(struct mptses *mpte)
435 {
436 	struct mptopt *mpo, *tmpo;
437 
438 	TAILQ_FOREACH_SAFE(mpo, &mpte->mpte_sopts, mpo_entry, tmpo) {
439 		mptcp_sopt_remove(mpte, mpo);
440 		mptcp_sopt_free(mpo);
441 	}
442 	VERIFY(TAILQ_EMPTY(&mpte->mpte_sopts));
443 }
444 
445 /*
446  * Create an MPTCP session, called as a result of opening a MPTCP socket.
447  */
448 int
mptcp_session_create(struct mppcb * mpp)449 mptcp_session_create(struct mppcb *mpp)
450 {
451 	struct mppcbinfo *mppi;
452 	struct mptses *mpte;
453 	struct mptcb *mp_tp;
454 
455 	VERIFY(mpp != NULL);
456 	mppi = mpp->mpp_pcbinfo;
457 	VERIFY(mppi != NULL);
458 
	/*
	 * mppcb, mptses and mptcb are co-allocated in one mpp_mtp block
	 * (see file-top comment); derive the adjacent session and PCB
	 * from the mppcb pointer.
	 */
459 	__IGNORE_WCASTALIGN(mpte = &((struct mpp_mtp *)mpp)->mpp_ses);
460 	__IGNORE_WCASTALIGN(mp_tp = &((struct mpp_mtp *)mpp)->mtcb);
461 
462 	/* MPTCP Multipath PCB Extension */
463 	bzero(mpte, sizeof(*mpte));
464 	VERIFY(mpp->mpp_pcbe == NULL);
	/* Cross-link mppcb <-> session <-> MPTCP PCB */
465 	mpp->mpp_pcbe = mpte;
466 	mpte->mpte_mppcb = mpp;
467 	mpte->mpte_mptcb = mp_tp;
468 
469 	TAILQ_INIT(&mpte->mpte_sopts);
470 	TAILQ_INIT(&mpte->mpte_subflows);
471 	mpte->mpte_associd = SAE_ASSOCID_ANY;
472 	mpte->mpte_connid_last = SAE_CONNID_ANY;
473 
474 	mptcp_init_urgency_timer(mpte);
475 
	/* Start with the embedded itfinfo array; it may be grown later */
476 	mpte->mpte_itfinfo = &mpte->_mpte_itfinfo[0];
477 	mpte->mpte_itfinfo_size = MPTE_ITFINFO_SIZE;
478 
	/* Honor the sysctl'ed alternate port only if it fits in 16 bits */
479 	if (mptcp_alternate_port > 0 && mptcp_alternate_port < UINT16_MAX) {
480 		mpte->mpte_alternate_port = htons((uint16_t)mptcp_alternate_port);
481 	}
482 
483 	mpte->mpte_last_cellicon_set = tcp_now;
484 
485 	/* MPTCP Protocol Control Block */
486 	bzero(mp_tp, sizeof(*mp_tp));
487 	mp_tp->mpt_mpte = mpte;
488 	mp_tp->mpt_state = MPTCPS_CLOSED;
489 
490 	DTRACE_MPTCP1(session__create, struct mppcb *, mpp);
491 
	/* Always succeeds */
492 	return 0;
493 }
494 
495 struct sockaddr *
mptcp_get_session_dst(struct mptses * mpte,boolean_t ipv6,boolean_t ipv4)496 mptcp_get_session_dst(struct mptses *mpte, boolean_t ipv6, boolean_t ipv4)
497 {
498 	if (ipv6 && mpte->mpte_sub_dst_v6.sin6_family == AF_INET6) {
499 		return (struct sockaddr *)&mpte->mpte_sub_dst_v6;
500 	}
501 
502 	if (ipv4 && mpte->mpte_sub_dst_v4.sin_family == AF_INET) {
503 		return (struct sockaddr *)&mpte->mpte_sub_dst_v4;
504 	}
505 
506 	/* The interface has neither IPv4 nor IPv6 routes. Give our best guess,
507 	 * meaning we prefer IPv6 over IPv4.
508 	 */
509 	if (mpte->mpte_sub_dst_v6.sin6_family == AF_INET6) {
510 		return (struct sockaddr *)&mpte->mpte_sub_dst_v6;
511 	}
512 
513 	if (mpte->mpte_sub_dst_v4.sin_family == AF_INET) {
514 		return (struct sockaddr *)&mpte->mpte_sub_dst_v4;
515 	}
516 
517 	/* We don't yet have a unicast IP */
518 	return NULL;
519 }
520 
521 static void
mptcpstats_get_bytes(struct mptses * mpte,boolean_t initial_cell,uint64_t * cellbytes,uint64_t * allbytes)522 mptcpstats_get_bytes(struct mptses *mpte, boolean_t initial_cell,
523     uint64_t *cellbytes, uint64_t *allbytes)
524 {
525 	int64_t mycellbytes = 0;
526 	uint64_t myallbytes = 0;
527 	int i;
528 
529 	for (i = 0; i < MPTCP_ITFSTATS_SIZE; i++) {
530 		if (mpte->mpte_itfstats[i].is_expensive) {
531 			mycellbytes += mpte->mpte_itfstats[i].mpis_txbytes;
532 			mycellbytes += mpte->mpte_itfstats[i].mpis_rxbytes;
533 		}
534 
535 		myallbytes += mpte->mpte_itfstats[i].mpis_txbytes;
536 		myallbytes += mpte->mpte_itfstats[i].mpis_rxbytes;
537 	}
538 
539 	if (initial_cell) {
540 		mycellbytes -= mpte->mpte_init_txbytes;
541 		mycellbytes -= mpte->mpte_init_rxbytes;
542 	}
543 
544 	if (mycellbytes < 0) {
545 		os_log_error(mptcp_log_handle, "%s - %lx: cellbytes is %lld\n",
546 		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mycellbytes);
547 		*cellbytes = 0;
548 		*allbytes = 0;
549 	} else {
550 		*cellbytes = mycellbytes;
551 		*allbytes = myallbytes;
552 	}
553 }
554 
/*
 * Record end-of-session statistics for this MPTCP connection (called from
 * mptcp_session_destroy()).  Depending on the configured service type
 * (handover / interactive / aggregate) and whether the app is first-party
 * (MPTE_FIRSTPARTY), this bumps the matching attempt/success counters in
 * tcpstat and accounts the cellular vs. total byte usage.
 *
 * NOTE(review): service types without a case label here (e.g.
 * MPTCP_SVCTYPE_PURE_HANDOVER, used elsewhere in this file) get no
 * per-type accounting -- confirm that is intentional.
 */
555 static void
mptcpstats_session_wrapup(struct mptses * mpte)556 mptcpstats_session_wrapup(struct mptses *mpte)
557 {
	/* TRUE when the initial subflow was established over cellular */
558 	boolean_t cell = mpte->mpte_initial_cell;
559 
560 	switch (mpte->mpte_svctype) {
561 	case MPTCP_SVCTYPE_HANDOVER:
562 		if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
563 			tcpstat.tcps_mptcp_fp_handover_attempt++;
564 
565 			if (cell && mpte->mpte_handshake_success) {
566 				tcpstat.tcps_mptcp_fp_handover_success_cell++;
567 
568 				if (mpte->mpte_used_wifi) {
569 					tcpstat.tcps_mptcp_handover_wifi_from_cell++;
570 				}
571 			} else if (mpte->mpte_handshake_success) {
572 				tcpstat.tcps_mptcp_fp_handover_success_wifi++;
573 
574 				if (mpte->mpte_used_cell) {
575 					tcpstat.tcps_mptcp_handover_cell_from_wifi++;
576 				}
577 			}
578 		} else {
579 			tcpstat.tcps_mptcp_handover_attempt++;
580 
581 			if (cell && mpte->mpte_handshake_success) {
582 				tcpstat.tcps_mptcp_handover_success_cell++;
583 
584 				if (mpte->mpte_used_wifi) {
585 					tcpstat.tcps_mptcp_handover_wifi_from_cell++;
586 				}
587 			} else if (mpte->mpte_handshake_success) {
588 				tcpstat.tcps_mptcp_handover_success_wifi++;
589 
590 				if (mpte->mpte_used_cell) {
591 					tcpstat.tcps_mptcp_handover_cell_from_wifi++;
592 				}
593 			}
594 		}
595 
		/* Byte accounting only makes sense for established connections */
596 		if (mpte->mpte_handshake_success) {
597 			uint64_t cellbytes;
598 			uint64_t allbytes;
599 
600 			mptcpstats_get_bytes(mpte, cell, &cellbytes, &allbytes);
601 
602 			tcpstat.tcps_mptcp_handover_cell_bytes += cellbytes;
603 			tcpstat.tcps_mptcp_handover_all_bytes += allbytes;
604 		}
605 		break;
606 	case MPTCP_SVCTYPE_INTERACTIVE:
607 		if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
608 			tcpstat.tcps_mptcp_fp_interactive_attempt++;
609 
610 			if (mpte->mpte_handshake_success) {
611 				tcpstat.tcps_mptcp_fp_interactive_success++;
612 
613 				if (!cell && mpte->mpte_used_cell) {
614 					tcpstat.tcps_mptcp_interactive_cell_from_wifi++;
615 				}
616 			}
617 		} else {
618 			tcpstat.tcps_mptcp_interactive_attempt++;
619 
620 			if (mpte->mpte_handshake_success) {
621 				tcpstat.tcps_mptcp_interactive_success++;
622 
623 				if (!cell && mpte->mpte_used_cell) {
624 					tcpstat.tcps_mptcp_interactive_cell_from_wifi++;
625 				}
626 			}
627 		}
628 
629 		if (mpte->mpte_handshake_success) {
630 			uint64_t cellbytes;
631 			uint64_t allbytes;
632 
633 			mptcpstats_get_bytes(mpte, cell, &cellbytes, &allbytes);
634 
635 			tcpstat.tcps_mptcp_interactive_cell_bytes += cellbytes;
636 			tcpstat.tcps_mptcp_interactive_all_bytes += allbytes;
637 		}
638 		break;
639 	case MPTCP_SVCTYPE_AGGREGATE:
640 		if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
641 			tcpstat.tcps_mptcp_fp_aggregate_attempt++;
642 
643 			if (mpte->mpte_handshake_success) {
644 				tcpstat.tcps_mptcp_fp_aggregate_success++;
645 			}
646 		} else {
647 			tcpstat.tcps_mptcp_aggregate_attempt++;
648 
649 			if (mpte->mpte_handshake_success) {
650 				tcpstat.tcps_mptcp_aggregate_success++;
651 			}
652 		}
653 
654 		if (mpte->mpte_handshake_success) {
655 			uint64_t cellbytes;
656 			uint64_t allbytes;
657 
658 			mptcpstats_get_bytes(mpte, cell, &cellbytes, &allbytes);
659 
660 			tcpstat.tcps_mptcp_aggregate_cell_bytes += cellbytes;
661 			tcpstat.tcps_mptcp_aggregate_all_bytes += allbytes;
662 		}
663 		break;
664 	}
665 
	/* Started on cell but ended up using WiFi at some point */
666 	if (cell && mpte->mpte_handshake_success && mpte->mpte_used_wifi) {
667 		tcpstat.tcps_mptcp_back_to_wifi++;
668 	}
669 
670 	if (mpte->mpte_triggered_cell) {
671 		tcpstat.tcps_mptcp_triggered_cell++;
672 	}
673 }
674 
675 /*
676  * Destroy an MPTCP session.
677  */
/*
 * Tear down an MPTCP session once the last subflow is gone: record final
 * statistics, drop any remaining cell-icon references, flush cached socket
 * options, release a grown itfinfo array, and free the reassembly and
 * reinject queues.
 */
678 static void
mptcp_session_destroy(struct mptses * mpte)679 mptcp_session_destroy(struct mptses *mpte)
680 {
681 	struct mptcb *mp_tp = mpte->mpte_mptcb;
682 
683 	VERIFY(mp_tp != NULL);
	/* All subflows must already be gone by the time we get here */
684 	VERIFY(TAILQ_EMPTY(&mpte->mpte_subflows) && mpte->mpte_numflows == 0);
685 
686 	mptcpstats_session_wrapup(mpte);
687 	mptcp_unset_cellicon(mpte, NULL, mpte->mpte_cellicon_increments);
688 	mptcp_flush_sopts(mpte);
689 
	/* itfinfo was only heap-allocated if it outgrew the embedded array */
690 	if (mpte->mpte_itfinfo_size > MPTE_ITFINFO_SIZE) {
691 		kfree_data(mpte->mpte_itfinfo,
692 		    sizeof(*mpte->mpte_itfinfo) * mpte->mpte_itfinfo_size);
693 	}
694 	mpte->mpte_itfinfo = NULL;
695 
696 	mptcp_freeq(mp_tp);
697 	m_freem_list(mpte->mpte_reinjectq);
698 
699 	os_log(mptcp_log_handle, "%s - %lx: Destroying session\n",
700 	    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
701 }
702 
703 boolean_t
mptcp_ok_to_create_subflows(struct mptcb * mp_tp)704 mptcp_ok_to_create_subflows(struct mptcb *mp_tp)
705 {
706 	return mp_tp->mpt_state >= MPTCPS_ESTABLISHED &&
707 	       mp_tp->mpt_state < MPTCPS_FIN_WAIT_1 &&
708 	       !(mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP);
709 }
710 
/*
 * Synthesize a NAT64 IPv6 address by embedding the IPv4 address 'addrv4'
 * into the IPv6 prefix already stored in 'addr', at the byte offset
 * dictated by the prefix length 'len' (per RFC 6052: bits 64..71, i.e.
 * byte 8, are skipped, which is why the IPv4 suffix resumes at byte 9
 * for prefixes shorter than /96).
 *
 * Returns 0 on success, -1 when 'addrv4' is not a translatable unicast
 * address; panics on an unsupported prefix length.
 */
711 static int
mptcp_synthesize_nat64(struct in6_addr * addr,uint32_t len,const struct in_addr * addrv4)712 mptcp_synthesize_nat64(struct in6_addr *addr, uint32_t len,
713     const struct in_addr *addrv4)
714 {
	/* The NAT64 well-known prefix 64:ff9b::/96 (RFC 6052, section 2.1) */
715 	static const struct in6_addr well_known_prefix = {
716 		.__u6_addr.__u6_addr8 = {0x00, 0x64, 0xff, 0x9b, 0x00, 0x00,
717 			                 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
718 			                 0x00, 0x00, 0x00, 0x00},
719 	};
720 	const char *ptrv4 = (const char *)addrv4;
721 	char *ptr = (char *)addr;
722 
	/* Reject IPv4 addresses that must never be translated */
723 	if (IN_ZERONET(ntohl(addrv4->s_addr)) || // 0.0.0.0/8 Source hosts on local network
724 	    IN_LOOPBACK(ntohl(addrv4->s_addr)) || // 127.0.0.0/8 Loopback
725 	    IN_LINKLOCAL(ntohl(addrv4->s_addr)) || // 169.254.0.0/16 Link Local
726 	    IN_DS_LITE(ntohl(addrv4->s_addr)) || // 192.0.0.0/29 DS-Lite
727 	    IN_6TO4_RELAY_ANYCAST(ntohl(addrv4->s_addr)) || // 192.88.99.0/24 6to4 Relay Anycast
728 	    IN_MULTICAST(ntohl(addrv4->s_addr)) || // 224.0.0.0/4 Multicast
729 	    INADDR_BROADCAST == addrv4->s_addr) { // 255.255.255.255/32 Limited Broadcast
730 		return -1;
731 	}
732 
733 	/* Check for the well-known prefix */
	/* RFC 6052, section 3.1: no private/shared space behind 64:ff9b::/96 */
734 	if (len == NAT64_PREFIX_LEN_96 &&
735 	    IN6_ARE_ADDR_EQUAL(addr, &well_known_prefix)) {
736 		if (IN_PRIVATE(ntohl(addrv4->s_addr)) || // 10.0.0.0/8, 172.16.0.0/12, 192.168.0.0/16 Private-Use
737 		    IN_SHARED_ADDRESS_SPACE(ntohl(addrv4->s_addr))) { // 100.64.0.0/10 Shared Address Space
738 			return -1;
739 		}
740 	}
741 
	/* Embed the 4 IPv4 bytes after the prefix (offsets per RFC 6052) */
742 	switch (len) {
743 	case NAT64_PREFIX_LEN_96:
744 		memcpy(ptr + 12, ptrv4, 4);
745 		break;
746 	case NAT64_PREFIX_LEN_64:
747 		memcpy(ptr + 9, ptrv4, 4);
748 		break;
749 	case NAT64_PREFIX_LEN_56:
750 		memcpy(ptr + 7, ptrv4, 1);
751 		memcpy(ptr + 9, ptrv4 + 1, 3);
752 		break;
753 	case NAT64_PREFIX_LEN_48:
754 		memcpy(ptr + 6, ptrv4, 2);
755 		memcpy(ptr + 9, ptrv4 + 2, 2);
756 		break;
757 	case NAT64_PREFIX_LEN_40:
758 		memcpy(ptr + 5, ptrv4, 3);
759 		memcpy(ptr + 9, ptrv4 + 3, 1);
760 		break;
761 	case NAT64_PREFIX_LEN_32:
762 		memcpy(ptr + 4, ptrv4, 4);
763 		break;
764 	default:
765 		panic("NAT64-prefix len is wrong: %u", len);
766 	}
767 
768 	return 0;
769 }
770 
771 static void
mptcp_trigger_cell_bringup(struct mptses * mpte)772 mptcp_trigger_cell_bringup(struct mptses *mpte)
773 {
774 	struct socket *mp_so = mptetoso(mpte);
775 
776 	if (!uuid_is_null(mpsotomppcb(mp_so)->necp_client_uuid)) {
777 		uuid_string_t uuidstr;
778 		int err;
779 
780 		socket_unlock(mp_so, 0);
781 		err = necp_client_assert_bb_radio_manager(mpsotomppcb(mp_so)->necp_client_uuid,
782 		    TRUE);
783 		socket_lock(mp_so, 0);
784 
785 		if (err == 0) {
786 			mpte->mpte_triggered_cell = 1;
787 		}
788 
789 		uuid_unparse_upper(mpsotomppcb(mp_so)->necp_client_uuid, uuidstr);
790 		os_log_info(mptcp_log_handle, "%s - %lx: asked irat to bringup cell for uuid %s, err %d\n",
791 		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), uuidstr, err);
792 	} else {
793 		os_log_info(mptcp_log_handle, "%s - %lx: UUID is already null\n",
794 		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
795 	}
796 }
797 
798 static boolean_t
mptcp_subflow_disconnecting(struct mptsub * mpts)799 mptcp_subflow_disconnecting(struct mptsub *mpts)
800 {
801 	if (mpts->mpts_socket->so_state & SS_ISDISCONNECTED) {
802 		return true;
803 	}
804 
805 	if (mpts->mpts_flags & (MPTSF_DISCONNECTING | MPTSF_DISCONNECTED | MPTSF_CLOSE_REQD)) {
806 		return true;
807 	}
808 
809 	if (sototcpcb(mpts->mpts_socket)->t_state == TCPS_CLOSED) {
810 		return true;
811 	}
812 
813 	return false;
814 }
815 
816 /*
817  * In Handover mode, only create cell subflow if
818  * - Symptoms marked WiFi as weak:
819  *   Here, if we are sending data, then we can check the RTO-state. That is a
820  *   stronger signal of WiFi quality than the Symptoms indicator.
821  *   If however we are not sending any data, the only thing we can do is guess
822  *   and thus bring up Cell.
823  *
824  * - Symptoms marked WiFi as unknown:
825  *   In this state we don't know what the situation is and thus remain
826  *   conservative, only bringing up cell if there are retransmissions going on.
827  */
828 static boolean_t
mptcp_handover_use_cellular(struct mptses * mpte,struct tcpcb * tp)829 mptcp_handover_use_cellular(struct mptses *mpte, struct tcpcb *tp)
830 {
831 	int unusable_state = mptcp_is_wifi_unusable_for_session(mpte);
832 
833 	if (unusable_state == 0) {
834 		/* WiFi is good - don't use cell */
835 		return false;
836 	}
837 
838 	if (unusable_state == -1) {
839 		/*
840 		 * We are in unknown state, only use Cell if we have confirmed
841 		 * that WiFi is bad.
842 		 */
843 		if (mptetoso(mpte)->so_snd.sb_cc != 0 && tp->t_rxtshift >= mptcp_fail_thresh * 2) {
844 			return true;
845 		} else {
846 			return false;
847 		}
848 	}
849 
850 	if (unusable_state == 1) {
851 		/*
852 		 * WiFi is confirmed to be bad from Symptoms-Framework.
853 		 * If we are sending data, check the RTOs.
854 		 * Otherwise, be pessimistic and use Cell.
855 		 */
856 		if (mptetoso(mpte)->so_snd.sb_cc != 0) {
857 			if (tp->t_rxtshift >= mptcp_fail_thresh * 2) {
858 				return true;
859 			} else {
860 				return false;
861 			}
862 		} else {
863 			return true;
864 		}
865 	}
866 
867 	return false;
868 }
869 
/*
 * Walk the session's interface list and create a subflow on every
 * eligible interface that does not already carry one.  Service-type
 * specific policy (handover / pure-handover / target-based) decides
 * whether a cellular subflow is wanted; if cell is wanted but no
 * cellular interface is currently viable, a cell-bringup is triggered.
 *
 * Called with the MPTCP-level socket lock held.
 */
void
mptcp_check_subflows_and_add(struct mptses *mpte)
{
	struct mptcb *mp_tp = mpte->mpte_mptcb;
	boolean_t cellular_viable = FALSE;
	boolean_t want_cellular = TRUE;
	uint32_t i;

	if (!mptcp_ok_to_create_subflows(mp_tp)) {
		os_log_debug(mptcp_log_handle, "%s - %lx: not a good time for subflows, state %u flags %#x",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mp_tp->mpt_state, mp_tp->mpt_flags);
		return;
	}

	/* Just to see if we have an IP-address available */
	if (mptcp_get_session_dst(mpte, false, false) == NULL) {
		return;
	}

	for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
		boolean_t need_to_ask_symptoms = FALSE, found = FALSE;
		struct mpt_itf_info *info;
		struct sockaddr_in6 nat64pre;
		struct sockaddr *dst;
		struct mptsub *mpts;
		struct ifnet *ifp;
		uint32_t ifindex;

		info = &mpte->mpte_itfinfo[i];

		ifindex = info->ifindex;
		if (ifindex == IFSCOPE_NONE) {
			continue;
		}

		os_log(mptcp_log_handle, "%s - %lx: itf %u no support %u hasv4 %u has v6 %u hasnat64 %u\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), info->ifindex, info->no_mptcp_support,
		    info->has_v4_conn, info->has_v6_conn, info->has_nat64_conn);

		if (info->no_mptcp_support) {
			continue;
		}

		/* Resolve the ifindex to an ifnet under the ifnet-head lock. */
		ifnet_head_lock_shared();
		ifp = ifindex2ifnet[ifindex];
		ifnet_head_done();

		if (ifp == NULL) {
			continue;
		}

		if (IFNET_IS_CELLULAR(ifp)) {
			cellular_viable = TRUE;

			/*
			 * Handover-style service-types only consider cell
			 * once Wi-Fi has been reported unusable.
			 */
			if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER ||
			    mpte->mpte_svctype == MPTCP_SVCTYPE_PURE_HANDOVER) {
				if (!mptcp_is_wifi_unusable_for_session(mpte)) {
					continue;
				}
			}
		}

		/* Does an existing subflow already cover this interface? */
		TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
			const struct ifnet *subifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
			struct tcpcb *tp = sototcpcb(mpts->mpts_socket);

			if (subifp == NULL) {
				continue;
			}

			/*
			 * If there is at least one functioning subflow on WiFi
			 * and we are checking for the cell interface, then
			 * we always need to ask symptoms for permission as
			 * cell is triggered even if WiFi is available.
			 */
			if (!IFNET_IS_CELLULAR(subifp) &&
			    !mptcp_subflow_disconnecting(mpts) &&
			    IFNET_IS_CELLULAR(ifp)) {
				need_to_ask_symptoms = TRUE;
			}

			if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER || mpte->mpte_svctype == MPTCP_SVCTYPE_PURE_HANDOVER) {
				os_log(mptcp_log_handle,
				    "%s - %lx: %s: cell %u wifi-state %d flags %#x rxt %u first-party %u sb_cc %u ifindex %u this %u rtt %u rttvar %u rto %u\n",
				    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
				    mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER ? "handover" : "pure-handover",
				    IFNET_IS_CELLULAR(subifp),
				    mptcp_is_wifi_unusable_for_session(mpte),
				    mpts->mpts_flags,
				    tp->t_rxtshift,
				    !!(mpte->mpte_flags & MPTE_FIRSTPARTY),
				    mptetoso(mpte)->so_snd.sb_cc,
				    ifindex, subifp->if_index,
				    tp->t_srtt >> TCP_RTT_SHIFT,
				    tp->t_rttvar >> TCP_RTTVAR_SHIFT,
				    tp->t_rxtcur);

				if (!IFNET_IS_CELLULAR(subifp) &&
				    !mptcp_subflow_disconnecting(mpts) &&
				    (mpts->mpts_flags & MPTSF_CONNECTED) &&
				    !mptcp_handover_use_cellular(mpte, tp)) {
					found = TRUE;

					/* We found a proper subflow on WiFi - no need for cell */
					want_cellular = FALSE;
					break;
				}
			} else if (mpte->mpte_svctype == MPTCP_SVCTYPE_TARGET_BASED) {
				uint64_t time_now = mach_continuous_time();

				os_log(mptcp_log_handle,
				    "%s - %lx: target-based: %llu now %llu unusable? %d cell %u sostat %#x mpts_flags %#x tcp-state %u\n",
				    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpte->mpte_time_target,
				    time_now, mptcp_is_wifi_unusable_for_session(mpte),
				    IFNET_IS_CELLULAR(subifp), mpts->mpts_socket->so_state,
				    mpts->mpts_flags, sototcpcb(mpts->mpts_socket)->t_state);

				/*
				 * Before the target time (or while Wi-Fi is
				 * usable), a live Wi-Fi subflow is sufficient.
				 */
				if (!IFNET_IS_CELLULAR(subifp) &&
				    !mptcp_subflow_disconnecting(mpts) &&
				    (mpte->mpte_time_target == 0 ||
				    (int64_t)(mpte->mpte_time_target - time_now) > 0 ||
				    !mptcp_is_wifi_unusable_for_session(mpte))) {
					found = TRUE;

					want_cellular = FALSE;
					break;
				}
			}

			if (subifp->if_index == ifindex &&
			    !mptcp_subflow_disconnecting(mpts)) {
				/*
				 * We found a subflow on this interface.
				 * No need to create a new one.
				 */
				found = TRUE;
				break;
			}
		}

		if (found) {
			continue;
		}

		/*
		 * Third-party apps without an explicit access-grant must get
		 * permission from Symptoms before bringing up cell.
		 */
		if (need_to_ask_symptoms &&
		    !(mpte->mpte_flags & MPTE_FIRSTPARTY) &&
		    !(mpte->mpte_flags & MPTE_ACCESS_GRANTED) &&
		    mptcp_developer_mode == 0) {
			mptcp_ask_symptoms(mpte);
			return;
		}

		dst = mptcp_get_session_dst(mpte, info->has_v6_conn, info->has_v4_conn);

		/*
		 * IPv4 destination on an interface without v4 connectivity but
		 * with NAT64: synthesize an IPv6 address from the NAT64 prefix.
		 */
		if (dst->sa_family == AF_INET &&
		    !info->has_v4_conn && info->has_nat64_conn) {
			struct ipv6_prefix nat64prefixes[NAT64_MAX_NUM_PREFIXES];
			int error, j;

			bzero(&nat64pre, sizeof(struct sockaddr_in6));

			error = ifnet_get_nat64prefix(ifp, nat64prefixes);
			if (error) {
				os_log_error(mptcp_log_handle, "%s - %lx: no NAT64-prefix on itf %s, error %d\n",
				    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), ifp->if_name, error);
				continue;
			}

			/* Pick the first prefix with a non-zero length. */
			for (j = 0; j < NAT64_MAX_NUM_PREFIXES; j++) {
				if (nat64prefixes[j].prefix_len != 0) {
					break;
				}
			}

			VERIFY(j < NAT64_MAX_NUM_PREFIXES);

			error = mptcp_synthesize_nat64(&nat64prefixes[j].ipv6_prefix,
			    nat64prefixes[j].prefix_len,
			    &((struct sockaddr_in *)(void *)dst)->sin_addr);
			if (error != 0) {
				os_log_error(mptcp_log_handle, "%s - %lx: cannot synthesize this addr\n",
				    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
				continue;
			}

			memcpy(&nat64pre.sin6_addr,
			    &nat64prefixes[j].ipv6_prefix,
			    sizeof(nat64pre.sin6_addr));
			nat64pre.sin6_len = sizeof(struct sockaddr_in6);
			nat64pre.sin6_family = AF_INET6;
			nat64pre.sin6_port = ((struct sockaddr_in *)(void *)dst)->sin_port;
			nat64pre.sin6_flowinfo = 0;
			nat64pre.sin6_scope_id = 0;

			dst = (struct sockaddr *)&nat64pre;
		}

		/* Skip interfaces that cannot reach the destination's family. */
		if (dst->sa_family == AF_INET && !info->has_v4_conn) {
			continue;
		}
		if (dst->sa_family == AF_INET6 && !info->has_v6_conn) {
			continue;
		}

		mptcp_subflow_add(mpte, NULL, dst, ifindex, NULL);
	}

	if (!cellular_viable && want_cellular) {
		/* Trigger Cell Bringup */
		mptcp_trigger_cell_bringup(mpte);
	}
}
1083 
1084 static void
mptcp_remove_cell_subflows(struct mptses * mpte)1085 mptcp_remove_cell_subflows(struct mptses *mpte)
1086 {
1087 	struct mptsub *mpts, *tmpts;
1088 
1089 	TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
1090 		const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
1091 
1092 		if (ifp == NULL || !IFNET_IS_CELLULAR(ifp)) {
1093 			continue;
1094 		}
1095 
1096 		os_log(mptcp_log_handle, "%s - %lx: removing cell subflow\n",
1097 		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
1098 
1099 		soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
1100 	}
1101 
1102 	return;
1103 }
1104 
1105 static void
mptcp_remove_wifi_subflows(struct mptses * mpte)1106 mptcp_remove_wifi_subflows(struct mptses *mpte)
1107 {
1108 	struct mptsub *mpts, *tmpts;
1109 
1110 	TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
1111 		const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
1112 
1113 		if (ifp == NULL || IFNET_IS_CELLULAR(ifp)) {
1114 			continue;
1115 		}
1116 
1117 		os_log(mptcp_log_handle, "%s - %lx: removing wifi subflow\n",
1118 		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
1119 
1120 		soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
1121 	}
1122 
1123 	return;
1124 }
1125 
/*
 * For the pure-handover service-type: keep traffic on one interface-class
 * at a time.  A healthy Wi-Fi subflow removes all cellular subflows; with
 * Wi-Fi unusable and only cellular working, the Wi-Fi subflows are removed
 * instead.
 */
static void
mptcp_pure_handover_subflows_remove(struct mptses *mpte)
{
	int wifi_unusable = mptcp_is_wifi_unusable_for_session(mpte);
	boolean_t found_working_wifi_subflow = false;
	boolean_t found_working_cell_subflow = false;

	struct mptsub *mpts;

	/*
	 * Look for a subflow that is on a non-cellular interface in connected
	 * state.
	 *
	 * In that case, remove all cellular subflows.
	 *
	 * If however there is no working subflow on Wi-Fi, Wi-Fi subflows are
	 * only removed when a working cellular subflow exists (see the checks
	 * after the loop).
	 */
	TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
		const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
		struct socket *so;
		struct tcpcb *tp;

		if (ifp == NULL) {
			continue;
		}

		so = mpts->mpts_socket;
		tp = sototcpcb(so);

		/* Only fully established, non-disconnecting subflows count. */
		if (!(mpts->mpts_flags & MPTSF_CONNECTED) ||
		    tp->t_state != TCPS_ESTABLISHED ||
		    mptcp_subflow_disconnecting(mpts)) {
			continue;
		}

		if (IFNET_IS_CELLULAR(ifp)) {
			found_working_cell_subflow = true;
		} else {
			os_log_debug(mptcp_log_handle, "%s - %lx: rxt %u sb_cc %u unusable %d\n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), tp->t_rxtshift, mptetoso(mpte)->so_snd.sb_cc, wifi_unusable);
			if (!mptcp_handover_use_cellular(mpte, tp)) {
				found_working_wifi_subflow = true;
			}
		}
	}

	/*
	 * Couldn't find a working subflow, let's not remove those on a cellular
	 * interface.
	 */
	os_log_debug(mptcp_log_handle, "%s - %lx: Found Wi-Fi: %u Found Cellular %u",
	    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
	    found_working_wifi_subflow, found_working_cell_subflow);
	if (!found_working_wifi_subflow && wifi_unusable) {
		if (found_working_cell_subflow) {
			mptcp_remove_wifi_subflows(mpte);
		}
		return;
	}

	mptcp_remove_cell_subflows(mpte);
}
1188 
/*
 * For the handover service-type: if at least one non-cellular subflow is
 * established and healthy (per mptcp_handover_use_cellular()), tear down
 * all cellular subflows.
 */
static void
mptcp_handover_subflows_remove(struct mptses *mpte)
{
	int wifi_unusable = mptcp_is_wifi_unusable_for_session(mpte);
	boolean_t found_working_subflow = false;
	struct mptsub *mpts;

	/*
	 * Look for a subflow that is on a non-cellular interface
	 * and actually works (aka, no retransmission timeout).
	 */
	TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
		const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
		struct socket *so;
		struct tcpcb *tp;

		if (ifp == NULL || IFNET_IS_CELLULAR(ifp)) {
			continue;
		}

		so = mpts->mpts_socket;
		tp = sototcpcb(so);

		/* Only fully established subflows count. */
		if (!(mpts->mpts_flags & MPTSF_CONNECTED) ||
		    tp->t_state != TCPS_ESTABLISHED) {
			continue;
		}

		os_log_debug(mptcp_log_handle, "%s - %lx: rxt %u sb_cc %u unusable %d\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), tp->t_rxtshift, mptetoso(mpte)->so_snd.sb_cc, wifi_unusable);

		if (!mptcp_handover_use_cellular(mpte, tp)) {
			found_working_subflow = true;
			break;
		}
	}

	/*
	 * Couldn't find a working subflow, let's not remove those on a cellular
	 * interface.
	 */
	if (!found_working_subflow) {
		return;
	}

	mptcp_remove_cell_subflows(mpte);
}
1236 
/*
 * For the target-based service-type: once the target time has passed and
 * Wi-Fi is unusable, keep the cellular subflows.  Otherwise a single
 * connected Wi-Fi subflow suffices to remove all cellular ones.
 */
static void
mptcp_targetbased_subflows_remove(struct mptses *mpte)
{
	uint64_t time_now = mach_continuous_time();
	struct mptsub *mpts;

	if (mpte->mpte_time_target != 0 &&
	    (int64_t)(mpte->mpte_time_target - time_now) <= 0 &&
	    mptcp_is_wifi_unusable_for_session(mpte)) {
		/* WiFi is bad and the target time has passed - don't remove any subflows */
		return;
	}

	TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
		const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;

		if (ifp == NULL || IFNET_IS_CELLULAR(ifp)) {
			continue;
		}

		/* We have a functioning subflow on WiFi. No need for cell! */
		if (mpts->mpts_flags & MPTSF_CONNECTED &&
		    !mptcp_subflow_disconnecting(mpts)) {
			mptcp_remove_cell_subflows(mpte);
			break;
		}
	}
}
1265 
1266 /*
1267  * Based on the MPTCP Service-type and the state of the subflows, we
1268  * will destroy subflows here.
1269  */
1270 void
mptcp_check_subflows_and_remove(struct mptses * mpte)1271 mptcp_check_subflows_and_remove(struct mptses *mpte)
1272 {
1273 	if (!mptcp_ok_to_create_subflows(mpte->mpte_mptcb)) {
1274 		return;
1275 	}
1276 
1277 	socket_lock_assert_owned(mptetoso(mpte));
1278 
1279 	if (mpte->mpte_svctype == MPTCP_SVCTYPE_PURE_HANDOVER) {
1280 		mptcp_pure_handover_subflows_remove(mpte);
1281 	}
1282 
1283 	if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER) {
1284 		mptcp_handover_subflows_remove(mpte);
1285 	}
1286 
1287 	if (mpte->mpte_svctype == MPTCP_SVCTYPE_TARGET_BASED) {
1288 		mptcp_targetbased_subflows_remove(mpte);
1289 	}
1290 }
1291 
/*
 * Tear down subflows whose interface is no longer in the session's
 * interface list for the subflow's address family, or that were flagged
 * for closure via MPTSF_CLOSE_REQD (e.g., by the NECP callback).
 */
static void
mptcp_remove_subflows(struct mptses *mpte)
{
	struct mptsub *mpts, *tmpts;

	if (!mptcp_ok_to_create_subflows(mpte->mpte_mptcb)) {
		return;
	}

	TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
		const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
		boolean_t found = false;
		uint32_t ifindex;
		uint32_t i;

		/* Explicit close request takes precedence over list matching. */
		if (mpts->mpts_flags & MPTSF_CLOSE_REQD) {
			mpts->mpts_flags &= ~MPTSF_CLOSE_REQD;

			os_log(mptcp_log_handle, "%s - %lx: itf %u close_reqd last itf %d\n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpts->mpts_ifscope,
			    ifp ? ifp->if_index : -1);
			soevent(mpts->mpts_socket,
			    SO_FILT_HINT_LOCKED | SO_FILT_HINT_NOSRCADDR);

			continue;
		}

		/* Neither an output interface nor a scope - nothing to match. */
		if (ifp == NULL && mpts->mpts_ifscope == IFSCOPE_NONE) {
			continue;
		}

		/* Prefer the actual output interface over the scoped one. */
		if (ifp) {
			ifindex = ifp->if_index;
		} else {
			ifindex = mpts->mpts_ifscope;
		}

		/* Is this interface still usable for the subflow's family? */
		for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
			if (mpte->mpte_itfinfo[i].ifindex == IFSCOPE_NONE) {
				continue;
			}

			if (mpte->mpte_itfinfo[i].ifindex == ifindex) {
				if (mpts->mpts_dst.sa_family == AF_INET6 &&
				    (mpte->mpte_itfinfo[i].has_v6_conn || mpte->mpte_itfinfo[i].has_nat64_conn)) {
					found = true;
					break;
				}

				if (mpts->mpts_dst.sa_family == AF_INET &&
				    mpte->mpte_itfinfo[i].has_v4_conn) {
					found = true;
					break;
				}
			}
		}

		if (!found) {
			os_log(mptcp_log_handle, "%s - %lx: itf %u killing %#x\n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
			    ifindex, mpts->mpts_flags);

			soevent(mpts->mpts_socket,
			    SO_FILT_HINT_LOCKED | SO_FILT_HINT_NOSRCADDR);
		}
	}
}
1359 
/*
 * Deferred worker (armed via timeout() in mptcp_sched_create_subflows)
 * that walks all MPTCP connections and runs subflow creation/removal for
 * those flagged with MPP_CREATE_SUBFLOWS.
 */
static void
mptcp_create_subflows(__unused void *arg)
{
	struct mppcb *mpp;

	/*
	 * Start with clearing, because we might be processing connections
	 * while a new event comes in.
	 */
	if (OSTestAndClear(0x01, &mptcp_create_subflows_scheduled)) {
		os_log_error(mptcp_log_handle, "%s: bit was already cleared!\n", __func__);
	}

	/* Iterate over all MPTCP connections */

	lck_mtx_lock(&mtcbinfo.mppi_lock);

	TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
		struct socket *mp_so = mpp->mpp_socket;
		struct mptses *mpte = mpp->mpp_pcbe;

		if (!(mpp->mpp_flags & MPP_CREATE_SUBFLOWS)) {
			continue;
		}

		socket_lock(mp_so, 1);
		/* The scheduler took a use-count hold; it must still be there. */
		VERIFY(mp_so->so_usecount > 0);

		mpp->mpp_flags &= ~MPP_CREATE_SUBFLOWS;

		mptcp_check_subflows_and_add(mpte);
		mptcp_remove_subflows(mpte);

		mp_so->so_usecount--; /* See mptcp_sched_create_subflows */
		socket_unlock(mp_so, 1);
	}

	lck_mtx_unlock(&mtcbinfo.mppi_lock);
}
1399 
1400 /*
1401  * We need this because we are coming from an NECP-event. This event gets posted
1402  * while holding NECP-locks. The creation of the subflow however leads us back
1403  * into NECP (e.g., to add the necp_cb and also from tcp_connect).
1404  * So, we would deadlock there as we already hold the NECP-lock.
1405  *
1406  * So, let's schedule this separately. It also gives NECP the chance to make
1407  * progress, without having to wait for MPTCP to finish its subflow creation.
1408  */
void
mptcp_sched_create_subflows(struct mptses *mpte)
{
	struct mppcb *mpp = mpte->mpte_mppcb;
	struct mptcb *mp_tp = mpte->mpte_mptcb;
	struct socket *mp_so = mpp->mpp_socket;

	if (!mptcp_ok_to_create_subflows(mp_tp)) {
		os_log_debug(mptcp_log_handle, "%s - %lx: not a good time for subflows, state %u flags %#x",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mp_tp->mpt_state, mp_tp->mpt_flags);
		return;
	}

	/* Flag the connection; the hold is dropped in mptcp_create_subflows(). */
	if (!(mpp->mpp_flags & MPP_CREATE_SUBFLOWS)) {
		mp_so->so_usecount++; /* To prevent it from being free'd in-between */
		mpp->mpp_flags |= MPP_CREATE_SUBFLOWS;
	}

	/* Only one pending timeout at a time, shared across all connections. */
	if (OSTestAndSet(0x01, &mptcp_create_subflows_scheduled)) {
		return;
	}

	/* Do the call in 100ms to allow NECP to schedule it on all sockets */
	timeout(mptcp_create_subflows, NULL, hz / 10);
}
1434 
1435 /*
1436  * Allocate an MPTCP socket option structure.
1437  */
1438 struct mptopt *
mptcp_sopt_alloc(zalloc_flags_t how)1439 mptcp_sopt_alloc(zalloc_flags_t how)
1440 {
1441 	return zalloc_flags(mptopt_zone, how | Z_ZERO);
1442 }
1443 
1444 /*
1445  * Free an MPTCP socket option structure.
1446  */
1447 void
mptcp_sopt_free(struct mptopt * mpo)1448 mptcp_sopt_free(struct mptopt *mpo)
1449 {
1450 	VERIFY(!(mpo->mpo_flags & MPOF_ATTACHED));
1451 
1452 	zfree(mptopt_zone, mpo);
1453 }
1454 
1455 /*
1456  * Add a socket option to the MPTCP socket option list.
1457  */
1458 void
mptcp_sopt_insert(struct mptses * mpte,struct mptopt * mpo)1459 mptcp_sopt_insert(struct mptses *mpte, struct mptopt *mpo)
1460 {
1461 	socket_lock_assert_owned(mptetoso(mpte));
1462 	mpo->mpo_flags |= MPOF_ATTACHED;
1463 	TAILQ_INSERT_TAIL(&mpte->mpte_sopts, mpo, mpo_entry);
1464 }
1465 
1466 /*
1467  * Remove a socket option from the MPTCP socket option list.
1468  */
1469 void
mptcp_sopt_remove(struct mptses * mpte,struct mptopt * mpo)1470 mptcp_sopt_remove(struct mptses *mpte, struct mptopt *mpo)
1471 {
1472 	socket_lock_assert_owned(mptetoso(mpte));
1473 	VERIFY(mpo->mpo_flags & MPOF_ATTACHED);
1474 	mpo->mpo_flags &= ~MPOF_ATTACHED;
1475 	TAILQ_REMOVE(&mpte->mpte_sopts, mpo, mpo_entry);
1476 }
1477 
1478 /*
1479  * Search for an existing <sopt_level,sopt_name> socket option.
1480  */
1481 struct mptopt *
mptcp_sopt_find(struct mptses * mpte,struct sockopt * sopt)1482 mptcp_sopt_find(struct mptses *mpte, struct sockopt *sopt)
1483 {
1484 	struct mptopt *mpo;
1485 
1486 	socket_lock_assert_owned(mptetoso(mpte));
1487 
1488 	TAILQ_FOREACH(mpo, &mpte->mpte_sopts, mpo_entry) {
1489 		if (mpo->mpo_level == sopt->sopt_level &&
1490 		    mpo->mpo_name == sopt->sopt_name) {
1491 			break;
1492 		}
1493 	}
1494 	return mpo;
1495 }
1496 
1497 /*
1498  * Allocate a MPTCP subflow structure.
1499  */
1500 static struct mptsub *
mptcp_subflow_alloc(void)1501 mptcp_subflow_alloc(void)
1502 {
1503 	return zalloc_flags(mptsub_zone, Z_WAITOK | Z_ZERO);
1504 }
1505 
1506 /*
1507  * Deallocate a subflow structure, called when all of the references held
1508  * on it have been released.  This implies that the subflow has been deleted.
1509  */
static void
mptcp_subflow_free(struct mptsub *mpts)
{
	/* Caller must have dropped all references and detached the subflow. */
	VERIFY(mpts->mpts_refcnt == 0);
	VERIFY(!(mpts->mpts_flags & MPTSF_ATTACHED));
	VERIFY(mpts->mpts_mpte == NULL);
	VERIFY(mpts->mpts_socket == NULL);

	/* Release the source address, if one was set on this subflow. */
	free_sockaddr(mpts->mpts_src);

	zfree(mptsub_zone, mpts);
}
1522 
/*
 * Take an additional reference on a subflow structure; panics on
 * refcount wraparound.
 */
static void
mptcp_subflow_addref(struct mptsub *mpts)
{
	if (++mpts->mpts_refcnt == 0) {
		panic("%s: mpts %p wraparound refcnt", __func__, mpts);
		/* NOTREACHED */
	}
}
1531 
/*
 * Drop a reference on a subflow structure; frees it when the last
 * reference goes away.
 */
static void
mptcp_subflow_remref(struct mptsub *mpts)
{
	if (mpts->mpts_refcnt == 0) {
		panic("%s: mpts %p negative refcnt", __func__, mpts);
		/* NOTREACHED */
	}
	if (--mpts->mpts_refcnt > 0) {
		return;
	}

	/* callee will unlock and destroy lock */
	mptcp_subflow_free(mpts);
}
1546 
/*
 * Link an already-created subflow socket to the MPTCP session: mark it as
 * a subflow at the socket/TCP level, insert it into the session's subflow
 * list and take the list- and socket-references on it.
 */
static void
mptcp_subflow_attach(struct mptses *mpte, struct mptsub *mpts, struct socket *so)
{
	struct socket *mp_so = mpte->mpte_mppcb->mpp_socket;
	struct tcpcb *tp = sototcpcb(so);

	/*
	 * From this moment on, the subflow is linked to the MPTCP-connection.
	 * Locking,... happens now at the MPTCP-layer
	 */
	tp->t_mptcb = mpte->mpte_mptcb;
	so->so_flags |= SOF_MP_SUBFLOW;
	mp_so->so_usecount++;

	/*
	 * Insert the subflow into the list, and associate the MPTCP PCB
	 * as well as the subflow socket.  From this point on, removing
	 * the subflow needs to be done via mptcp_subflow_del().
	 */
	TAILQ_INSERT_TAIL(&mpte->mpte_subflows, mpts, mpts_entry);
	mpte->mpte_numflows++;

	atomic_bitset_32(&mpts->mpts_flags, MPTSF_ATTACHED);
	mpts->mpts_mpte = mpte;
	mpts->mpts_socket = so;
	tp->t_mpsub = mpts;
	mptcp_subflow_addref(mpts);     /* for being in MPTCP subflow list */
	mptcp_subflow_addref(mpts);     /* for subflow socket */
}
1576 
/*
 * NECP callback on a subflow's inpcb: when the flow becomes non-viable
 * (or the interface enters low-power mode), flag the subflow for closure
 * and schedule the subflow-maintenance worker to act on it.
 */
static void
mptcp_subflow_necp_cb(void *handle, __unused int action,
    __unused uint32_t interface_index,
    uint32_t necp_flags, bool *viable)
{
	boolean_t low_power = !!(necp_flags & NECP_CLIENT_RESULT_FLAG_INTERFACE_LOW_POWER);
	struct inpcb *inp = (struct inpcb *)handle;
	struct socket *so = inp->inp_socket;
	struct mptsub *mpts;
	struct mptses *mpte;

	/* A low-power interface is treated like a non-viable flow. */
	if (low_power) {
		action = NECP_CLIENT_CBACTION_NONVIABLE;
	}

	if (action != NECP_CLIENT_CBACTION_NONVIABLE) {
		return;
	}

	/*
	 * The socket is being garbage-collected. There is nothing to be done
	 * here.
	 */
	if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) == WNT_STOPUSING) {
		return;
	}

	socket_lock(so, 1);

	/* Check again after we acquired the lock. */
	if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) {
		goto out;
	}

	mpte = tptomptp(sototcpcb(so))->mpt_mpte;
	mpts = sototcpcb(so)->t_mpsub;

	os_log_debug(mptcp_log_handle, "%s - %lx: Subflow on itf %u became non-viable, power %u",
	    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpts->mpts_ifscope, low_power);

	/* Ask mptcp_remove_subflows() to close this subflow on its next run. */
	mpts->mpts_flags |= MPTSF_CLOSE_REQD;

	mptcp_sched_create_subflows(mpte);

	/*
	 * For the handover/target-based service-types, report the connection
	 * as still viable - another subflow can take over.
	 */
	if ((mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER ||
	    mpte->mpte_svctype == MPTCP_SVCTYPE_PURE_HANDOVER ||
	    mpte->mpte_svctype == MPTCP_SVCTYPE_TARGET_BASED) &&
	    viable != NULL) {
		*viable = 1;
	}

out:
	socket_unlock(so, 1);
}
1631 
1632 /*
1633  * Create an MPTCP subflow socket.
1634  */
1635 static int
mptcp_subflow_socreate(struct mptses * mpte,struct mptsub * mpts,int dom,struct socket ** so)1636 mptcp_subflow_socreate(struct mptses *mpte, struct mptsub *mpts, int dom,
1637     struct socket **so)
1638 {
1639 	lck_mtx_t *subflow_mtx;
1640 	struct mptopt smpo, *mpo, *tmpo;
1641 	struct proc *p;
1642 	struct socket *mp_so;
1643 	struct mppcb *mpp;
1644 	int error;
1645 
1646 	*so = NULL;
1647 
1648 	mp_so = mptetoso(mpte);
1649 	mpp = mpsotomppcb(mp_so);
1650 
1651 	p = proc_find(mp_so->last_pid);
1652 	if (p == PROC_NULL) {
1653 		os_log_error(mptcp_log_handle, "%s - %lx: Couldn't find proc for pid %u\n",
1654 		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mp_so->last_pid);
1655 
1656 		mptcp_subflow_free(mpts);
1657 		return ESRCH;
1658 	}
1659 
1660 	/*
1661 	 * Create the subflow socket (multipath subflow, non-blocking.)
1662 	 *
1663 	 * This will cause SOF_MP_SUBFLOW socket flag to be set on the subflow
1664 	 * socket; it will be cleared when the socket is peeled off or closed.
1665 	 * It also indicates to the underlying TCP to handle MPTCP options.
1666 	 * A multipath subflow socket implies SS_NOFDREF state.
1667 	 */
1668 
1669 	/*
1670 	 * Unlock, because tcp_usr_attach ends up in in_pcballoc, which takes
1671 	 * the ipi-lock. We cannot hold the socket-lock at that point.
1672 	 */
1673 	socket_unlock(mp_so, 0);
1674 	error = socreate_internal(dom, so, SOCK_STREAM, IPPROTO_TCP, p,
1675 	    SOCF_MPTCP, PROC_NULL);
1676 	socket_lock(mp_so, 0);
1677 	if (error) {
1678 		os_log_error(mptcp_log_handle, "%s - %lx: unable to create subflow socket error %d\n",
1679 		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error);
1680 
1681 		proc_rele(p);
1682 
1683 		mptcp_subflow_free(mpts);
1684 		return error;
1685 	}
1686 
1687 	/*
1688 	 * We need to protect the setting of SOF_MP_SUBFLOW with a lock, because
1689 	 * this marks the moment of lock-switch from the TCP-lock to the MPTCP-lock.
1690 	 * Which is why we also need to get the lock with pr_getlock, as after
1691 	 * setting the flag, socket_unlock will work on the MPTCP-level lock.
1692 	 */
1693 	subflow_mtx = ((*so)->so_proto->pr_getlock)(*so, 0);
1694 	lck_mtx_lock(subflow_mtx);
1695 
1696 	/*
1697 	 * Must be the first thing we do, to make sure all pointers for this
1698 	 * subflow are set.
1699 	 */
1700 	mptcp_subflow_attach(mpte, mpts, *so);
1701 
1702 	/*
1703 	 * A multipath subflow socket is used internally in the kernel,
1704 	 * therefore it does not have a file desciptor associated by
1705 	 * default.
1706 	 */
1707 	(*so)->so_state |= SS_NOFDREF;
1708 
1709 	lck_mtx_unlock(subflow_mtx);
1710 
1711 	/* prevent the socket buffers from being compressed */
1712 	(*so)->so_rcv.sb_flags |= SB_NOCOMPRESS;
1713 	(*so)->so_snd.sb_flags |= SB_NOCOMPRESS;
1714 
1715 	/* Inherit preconnect and TFO data flags */
1716 	if (mp_so->so_flags1 & SOF1_PRECONNECT_DATA) {
1717 		(*so)->so_flags1 |= SOF1_PRECONNECT_DATA;
1718 	}
1719 	if (mp_so->so_flags1 & SOF1_DATA_IDEMPOTENT) {
1720 		(*so)->so_flags1 |= SOF1_DATA_IDEMPOTENT;
1721 	}
1722 	if (mp_so->so_flags1 & SOF1_DATA_AUTHENTICATED) {
1723 		(*so)->so_flags1 |= SOF1_DATA_AUTHENTICATED;
1724 	}
1725 
1726 	/* Inherit uuid and create the related flow. */
1727 	if (!uuid_is_null(mpp->necp_client_uuid)) {
1728 		struct mptcb *mp_tp = mpte->mpte_mptcb;
1729 
1730 		sotoinpcb(*so)->necp_cb = mptcp_subflow_necp_cb;
1731 
1732 		/*
1733 		 * A note on the unlock: With MPTCP, we do multiple times a
1734 		 * necp_client_register_socket_flow. This is problematic,
1735 		 * because now the lock-ordering guarantee (first necp-locks,
1736 		 * then socket-locks) is no more respected. So, we need to
1737 		 * unlock here.
1738 		 */
1739 		socket_unlock(mp_so, 0);
1740 		error = necp_client_register_socket_flow(mp_so->last_pid,
1741 		    mpp->necp_client_uuid, sotoinpcb(*so));
1742 		socket_lock(mp_so, 0);
1743 
1744 		if (error) {
1745 			os_log_error(mptcp_log_handle, "%s - %lx: necp_client_register_socket_flow failed with error %d\n",
1746 			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error);
1747 
1748 			goto out_err;
1749 		}
1750 
1751 		/* Possible state-change during the unlock above */
1752 		if (mp_tp->mpt_state >= MPTCPS_TIME_WAIT ||
1753 		    (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP)) {
1754 			os_log_error(mptcp_log_handle, "%s - %lx: state changed during unlock: %u flags %#x\n",
1755 			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
1756 			    mp_tp->mpt_state, mp_tp->mpt_flags);
1757 
1758 			error = EINVAL;
1759 			goto out_err;
1760 		}
1761 
1762 		uuid_copy(sotoinpcb(*so)->necp_client_uuid, mpp->necp_client_uuid);
1763 	}
1764 
1765 	if (mpp->inp_necp_attributes.inp_domain != NULL) {
1766 		size_t string_size = strlen(mpp->inp_necp_attributes.inp_domain);
1767 		sotoinpcb(*so)->inp_necp_attributes.inp_domain = kalloc_data(string_size + 1, Z_WAITOK | Z_ZERO);
1768 
1769 		if (sotoinpcb(*so)->inp_necp_attributes.inp_domain) {
1770 			memcpy(sotoinpcb(*so)->inp_necp_attributes.inp_domain, mpp->inp_necp_attributes.inp_domain, string_size + 1);
1771 		}
1772 	}
1773 	if (mpp->inp_necp_attributes.inp_account != NULL) {
1774 		size_t string_size = strlen(mpp->inp_necp_attributes.inp_account);
1775 		sotoinpcb(*so)->inp_necp_attributes.inp_account = kalloc_data(string_size + 1, Z_WAITOK | Z_ZERO);
1776 
1777 		if (sotoinpcb(*so)->inp_necp_attributes.inp_account) {
1778 			memcpy(sotoinpcb(*so)->inp_necp_attributes.inp_account, mpp->inp_necp_attributes.inp_account, string_size + 1);
1779 		}
1780 	}
1781 
1782 	if (mpp->inp_necp_attributes.inp_domain_owner != NULL) {
1783 		size_t string_size = strlen(mpp->inp_necp_attributes.inp_domain_owner);
1784 		sotoinpcb(*so)->inp_necp_attributes.inp_domain_owner = kalloc_data(string_size + 1, Z_WAITOK | Z_ZERO);
1785 
1786 		if (sotoinpcb(*so)->inp_necp_attributes.inp_domain_owner) {
1787 			memcpy(sotoinpcb(*so)->inp_necp_attributes.inp_domain_owner, mpp->inp_necp_attributes.inp_domain_owner, string_size + 1);
1788 		}
1789 	}
1790 
1791 	if (mpp->inp_necp_attributes.inp_tracker_domain != NULL) {
1792 		size_t string_size = strlen(mpp->inp_necp_attributes.inp_tracker_domain);
1793 		sotoinpcb(*so)->inp_necp_attributes.inp_tracker_domain = kalloc_data(string_size + 1, Z_WAITOK | Z_ZERO);
1794 
1795 		if (sotoinpcb(*so)->inp_necp_attributes.inp_tracker_domain) {
1796 			memcpy(sotoinpcb(*so)->inp_necp_attributes.inp_tracker_domain, mpp->inp_necp_attributes.inp_tracker_domain, string_size + 1);
1797 		}
1798 	}
1799 
1800 	/* Needs to happen prior to the delegation! */
1801 	(*so)->last_pid = mp_so->last_pid;
1802 
1803 	if (mp_so->so_flags & SOF_DELEGATED) {
1804 		if (mpte->mpte_epid) {
1805 			error = so_set_effective_pid(*so, mpte->mpte_epid, p, false);
1806 			if (error) {
1807 				os_log_error(mptcp_log_handle, "%s - %lx: so_set_effective_pid failed with error %d\n",
1808 				    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error);
1809 				goto out_err;
1810 			}
1811 		}
1812 		if (!uuid_is_null(mpte->mpte_euuid)) {
1813 			error = so_set_effective_uuid(*so, mpte->mpte_euuid, p, false);
1814 			if (error) {
1815 				os_log_error(mptcp_log_handle, "%s - %lx: so_set_effective_uuid failed with error %d\n",
1816 				    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error);
1817 				goto out_err;
1818 			}
1819 		}
1820 	}
1821 
1822 	/* inherit the other socket options */
1823 	bzero(&smpo, sizeof(smpo));
1824 	smpo.mpo_flags |= MPOF_SUBFLOW_OK;
1825 	smpo.mpo_level = SOL_SOCKET;
1826 	smpo.mpo_intval = 1;
1827 
1828 	/* disable SIGPIPE */
1829 	smpo.mpo_name = SO_NOSIGPIPE;
1830 	if ((error = mptcp_subflow_sosetopt(mpte, mpts, &smpo)) != 0) {
1831 		goto out_err;
1832 	}
1833 
1834 	/* find out if the subflow's source address goes away */
1835 	smpo.mpo_name = SO_NOADDRERR;
1836 	if ((error = mptcp_subflow_sosetopt(mpte, mpts, &smpo)) != 0) {
1837 		goto out_err;
1838 	}
1839 
1840 	if (mpte->mpte_mptcb->mpt_state >= MPTCPS_ESTABLISHED) {
1841 		/*
1842 		 * On secondary subflows we might need to set the cell-fallback
1843 		 * flag (see conditions in mptcp_subflow_sosetopt).
1844 		 */
1845 		smpo.mpo_level = SOL_SOCKET;
1846 		smpo.mpo_name = SO_MARK_CELLFALLBACK;
1847 		smpo.mpo_intval = 1;
1848 		if ((error = mptcp_subflow_sosetopt(mpte, mpts, &smpo)) != 0) {
1849 			goto out_err;
1850 		}
1851 	}
1852 
1853 	/* replay setsockopt(2) on the subflow sockets for eligible options */
1854 	TAILQ_FOREACH_SAFE(mpo, &mpte->mpte_sopts, mpo_entry, tmpo) {
1855 		int interim;
1856 
1857 		if (!(mpo->mpo_flags & MPOF_SUBFLOW_OK)) {
1858 			continue;
1859 		}
1860 
1861 		/*
1862 		 * Skip those that are handled internally; these options
1863 		 * should not have been recorded and marked with the
1864 		 * MPOF_SUBFLOW_OK by mptcp_setopt(), but just in case.
1865 		 */
1866 		if (mpo->mpo_level == SOL_SOCKET &&
1867 		    (mpo->mpo_name == SO_NOSIGPIPE ||
1868 		    mpo->mpo_name == SO_NOADDRERR ||
1869 		    mpo->mpo_name == SO_KEEPALIVE)) {
1870 			continue;
1871 		}
1872 
1873 		interim = (mpo->mpo_flags & MPOF_INTERIM);
1874 		if (mptcp_subflow_sosetopt(mpte, mpts, mpo) != 0 && interim) {
1875 			os_log_error(mptcp_log_handle, "%s - %lx: sopt %s val %d interim record removed\n",
1876 			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
1877 			    mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name),
1878 			    mpo->mpo_intval);
1879 			mptcp_sopt_remove(mpte, mpo);
1880 			mptcp_sopt_free(mpo);
1881 			continue;
1882 		}
1883 	}
1884 
1885 	/*
1886 	 * We need to receive everything that the subflow socket has,
1887 	 * so use a customized socket receive function.  We will undo
1888 	 * this when the socket is peeled off or closed.
1889 	 */
1890 	switch (dom) {
1891 	case PF_INET:
1892 		(*so)->so_proto = &mptcp_subflow_protosw;
1893 		break;
1894 	case PF_INET6:
1895 		(*so)->so_proto = (struct protosw *)&mptcp_subflow_protosw6;
1896 		break;
1897 	default:
1898 		VERIFY(0);
1899 		/* NOTREACHED */
1900 	}
1901 
1902 	proc_rele(p);
1903 
1904 	DTRACE_MPTCP3(subflow__create, struct mptses *, mpte,
1905 	    int, dom, int, error);
1906 
1907 	return 0;
1908 
1909 out_err:
1910 	mptcp_subflow_abort(mpts, error);
1911 
1912 	proc_rele(p);
1913 
1914 	return error;
1915 }
1916 
1917 /*
1918  * Close an MPTCP subflow socket.
1919  *
1920  * Note that this may be called on an embryonic subflow, and the only
1921  * thing that is guaranteed valid is the protocol-user request.
1922  */
1923 static void
mptcp_subflow_soclose(struct mptsub * mpts)1924 mptcp_subflow_soclose(struct mptsub *mpts)
1925 {
1926 	struct socket *so = mpts->mpts_socket;
1927 
1928 	if (mpts->mpts_flags & MPTSF_CLOSED) {
1929 		return;
1930 	}
1931 
1932 	VERIFY(so != NULL);
1933 	VERIFY(so->so_flags & SOF_MP_SUBFLOW);
1934 	VERIFY((so->so_state & (SS_NBIO | SS_NOFDREF)) == (SS_NBIO | SS_NOFDREF));
1935 
1936 	DTRACE_MPTCP5(subflow__close, struct mptsub *, mpts,
1937 	    struct socket *, so,
1938 	    struct sockbuf *, &so->so_rcv,
1939 	    struct sockbuf *, &so->so_snd,
1940 	    struct mptses *, mpts->mpts_mpte);
1941 
1942 	mpts->mpts_flags |= MPTSF_CLOSED;
1943 
1944 	if (so->so_retaincnt == 0) {
1945 		soclose_locked(so);
1946 
1947 		return;
1948 	} else {
1949 		VERIFY(so->so_usecount > 0);
1950 		so->so_usecount--;
1951 	}
1952 
1953 	return;
1954 }
1955 
1956 /*
1957  * Connect an MPTCP subflow socket.
1958  *
1959  * Note that in the pending connect case, the subflow socket may have been
1960  * bound to an interface and/or a source IP address which may no longer be
1961  * around by the time this routine is called; in that case the connect attempt
1962  * will most likely fail.
1963  */
static int
mptcp_subflow_soconnectx(struct mptses *mpte, struct mptsub *mpts)
{
	char dbuf[MAX_IPv6_STR_LEN];    /* printable destination address for logging */
	struct socket *mp_so, *so;
	struct mptcb *mp_tp;
	struct sockaddr *dst;
	struct proc *p;
	int af, error, dport;

	mp_so = mptetoso(mpte);
	mp_tp = mpte->mpte_mptcb;
	so = mpts->mpts_socket;
	af = mpts->mpts_dst.sa_family;
	dst = &mpts->mpts_dst;

	/* Caller must have marked the subflow CONNECTING (and not yet CONNECTED) */
	VERIFY((mpts->mpts_flags & (MPTSF_CONNECTING | MPTSF_CONNECTED)) == MPTSF_CONNECTING);
	VERIFY(mpts->mpts_socket != NULL);
	VERIFY(af == AF_INET || af == AF_INET6);

	/* Render the destination address/port for the log line below */
	if (af == AF_INET) {
		inet_ntop(af, &SIN(dst)->sin_addr.s_addr, dbuf, sizeof(dbuf));
		dport = ntohs(SIN(dst)->sin_port);
	} else {
		inet_ntop(af, &SIN6(dst)->sin6_addr, dbuf, sizeof(dbuf));
		dport = ntohs(SIN6(dst)->sin6_port);
	}

	os_log(mptcp_log_handle,
	    "%s - %lx: ifindex %u dst %s:%d pended %u\n", __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
	    mpts->mpts_ifscope, dbuf, dport, !!(mpts->mpts_flags & MPTSF_CONNECT_PENDING));

	/*
	 * Attribute the connect to the process that owns the MPTCP socket;
	 * the reference is released before returning (proc_rele below).
	 */
	p = proc_find(mp_so->last_pid);
	if (p == PROC_NULL) {
		os_log_error(mptcp_log_handle, "%s - %lx: Couldn't find proc for pid %u\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mp_so->last_pid);

		return ESRCH;
	}

	/* This connect attempt is no longer pending, whatever its outcome */
	mpts->mpts_flags &= ~MPTSF_CONNECT_PENDING;

	mptcp_attach_to_subf(so, mpte->mpte_mptcb, mpte->mpte_addrid_last);

	/* connect the subflow socket */
	error = soconnectxlocked(so, mpts->mpts_src, &mpts->mpts_dst,
	    p, mpts->mpts_ifscope,
	    mpte->mpte_associd, NULL, 0, NULL, 0, NULL, NULL);

	/* Record the subflow's initial send sequence number */
	mpts->mpts_iss = sototcpcb(so)->iss;

	/* See tcp_connect_complete */
	if (mp_tp->mpt_state < MPTCPS_ESTABLISHED &&
	    (mp_so->so_flags1 & SOF1_PRECONNECT_DATA)) {
		mp_tp->mpt_sndwnd = sototcpcb(so)->snd_wnd;
	}

	/* Allocate a unique address id per subflow, skipping the 0 value */
	mpte->mpte_addrid_last++;
	if (mpte->mpte_addrid_last == 0) {
		mpte->mpte_addrid_last++;
	}

	proc_rele(p);

	DTRACE_MPTCP3(subflow__connect, struct mptses *, mpte,
	    struct mptsub *, mpts, int, error);
	if (error) {
		os_log_error(mptcp_log_handle, "%s - %lx: connectx failed with error %d ifscope %u\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error, mpts->mpts_ifscope);
	}

	return error;
}
2038 
/*
 * Stamp the DSS mapping (dsn/rseq/dlen, plus DATA_FIN flag) shifted by
 * 'off' bytes onto mbuf 'm', splitting the mbuf when it extends past the
 * right edge of the mapping.  Returns 0 on success; returns -1 (after
 * flagging the subflow for reset via SO_FILT_HINT_MUSTRST) when the peer
 * supplied an inconsistent second mapping or the split failed.
 */
static int
mptcp_adj_rmap(struct socket *so, struct mbuf *m, int off, uint64_t dsn,
    uint32_t rseq, uint16_t dlen, uint8_t dfin)
{
	struct mptsub *mpts = sototcpcb(so)->t_mpsub;

	/* Empty packet: nothing to map */
	if (m_pktlen(m) == 0) {
		return 0;
	}

	/* No packet header: no mapping metadata to adjust */
	if (!(m->m_flags & M_PKTHDR)) {
		return 0;
	}

	if (m->m_pkthdr.pkt_flags & PKTF_MPTCP) {
		/*
		 * The mbuf already carries a mapping.  With a non-zero offset,
		 * that pre-existing mapping must agree with the one being
		 * applied; a mismatch means the peer sent a conflicting second
		 * mapping, so reset the subflow.
		 */
		if (off && (dsn != m->m_pkthdr.mp_dsn ||
		    rseq != m->m_pkthdr.mp_rseq ||
		    dlen != m->m_pkthdr.mp_rlen ||
		    dfin != !!(m->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN))) {
			os_log_error(mptcp_log_handle, "%s - %lx: Received incorrect second mapping: DSN: %u - %u , SSN: %u - %u, DLEN: %u - %u, DFIN: %u - %u\n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpts->mpts_mpte),
			    (uint32_t)dsn, (uint32_t)m->m_pkthdr.mp_dsn,
			    rseq, m->m_pkthdr.mp_rseq,
			    dlen, m->m_pkthdr.mp_rlen,
			    dfin, !!(m->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN));

			soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
			return -1;
		}
	}

	/* If mbuf is beyond right edge of the mapping, we need to split */
	if (m_pktlen(m) > dlen - dfin - off) {
		struct mbuf *new = m_split(m, dlen - dfin - off, M_DONTWAIT);
		if (new == NULL) {
			os_log_error(mptcp_log_handle, "%s - %lx: m_split failed dlen %u dfin %u off %d pktlen %d, killing subflow %d",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpts->mpts_mpte),
			    dlen, dfin, off, m_pktlen(m),
			    mpts->mpts_connid);

			soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
			return -1;
		}

		/* Re-link the tail and keep the receive buffer accounting right */
		m->m_next = new;
		sballoc(&so->so_rcv, new);
		/* Undo, as sballoc will add to it as well */
		so->so_rcv.sb_cc -= new->m_len;

		if (so->so_rcv.sb_mbtail == m) {
			so->so_rcv.sb_mbtail = new;
		}
	}

	/* Stamp the offset-adjusted mapping onto this mbuf */
	m->m_pkthdr.pkt_flags |= PKTF_MPTCP;
	m->m_pkthdr.mp_dsn = dsn + off;
	m->m_pkthdr.mp_rseq = rseq + off;
	VERIFY(m_pktlen(m) < UINT16_MAX);
	m->m_pkthdr.mp_rlen = (uint16_t)m_pktlen(m);

	/* Only put the DATA_FIN-flag on the last mbuf of this mapping */
	if (dfin) {
		if (m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen < dsn + dlen - dfin) {
			m->m_pkthdr.pkt_flags &= ~PKTF_MPTCP_DFIN;
		} else {
			m->m_pkthdr.pkt_flags |= PKTF_MPTCP_DFIN;
		}
	}


	/* A valid mapping was received: the subflow is fully established */
	mpts->mpts_flags |= MPTSF_FULLY_ESTABLISHED;

	return 0;
}
2113 
2114 /*
2115  * MPTCP subflow socket receive routine, derived from soreceive().
2116  */
/*
 * Drain every complete DSS mapping from the subflow's receive buffer into
 * the mbuf chain returned via *mp0.  Never blocks (MSG_DONTWAIT|MSG_NBIO
 * are forced); returns EWOULDBLOCK when a mapping is incomplete, EIO when
 * the mapping state is unrecoverable (subflow flagged for reset).
 */
static int
mptcp_subflow_soreceive(struct socket *so, struct sockaddr **psa,
    struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{
#pragma unused(uio)
	struct socket *mp_so;
	struct mptses *mpte;
	struct mptcb *mp_tp;
	int flags, error = 0;
	struct mbuf *m, **mp = mp0;

	mpte = tptomptp(sototcpcb(so))->mpt_mpte;
	mp_so = mptetoso(mpte);
	mp_tp = mpte->mpte_mptcb;

	VERIFY(so->so_proto->pr_flags & PR_CONNREQUIRED);

#ifdef MORE_LOCKING_DEBUG
	if (so->so_usecount == 1) {
		panic("%s: so=%x no other reference on socket", __func__, so);
		/* NOTREACHED */
	}
#endif
	/*
	 * We return all that is there in the subflow's socket receive buffer
	 * to the MPTCP layer, so we require that the caller passes in the
	 * expected parameters.
	 */
	if (mp == NULL || controlp != NULL) {
		return EINVAL;
	}

	*mp = NULL;
	if (psa != NULL) {
		*psa = NULL;
	}
	if (flagsp != NULL) {
		flags = *flagsp & ~MSG_EOR;
	} else {
		flags = 0;
	}

	/* None of the soreceive() modifiers below are supported here */
	if (flags & (MSG_PEEK | MSG_OOB | MSG_NEEDSA | MSG_WAITALL | MSG_WAITSTREAM)) {
		return EOPNOTSUPP;
	}

	/* This routine must never sleep; force the non-blocking path */
	flags |= (MSG_DONTWAIT | MSG_NBIO);

	/*
	 * If a recv attempt is made on a previously-accepted socket
	 * that has been marked as inactive (disconnected), reject
	 * the request.
	 */
	if (so->so_flags & SOF_DEFUNCT) {
		struct sockbuf *sb = &so->so_rcv;

		error = ENOTCONN;
		/*
		 * This socket should have been disconnected and flushed
		 * prior to being returned from sodefunct(); there should
		 * be no data on its receive list, so panic otherwise.
		 */
		if (so->so_state & SS_DEFUNCT) {
			sb_empty_assert(sb, __func__);
		}
		return error;
	}

	/*
	 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
	 * and if so just return to the caller.  This could happen when
	 * soreceive() is called by a socket upcall function during the
	 * time the socket is freed.  The socket buffer would have been
	 * locked across the upcall, therefore we cannot put this thread
	 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
	 * we may livelock), because the lock on the socket buffer will
	 * only be released when the upcall routine returns to its caller.
	 * Because the socket has been officially closed, there can be
	 * no further read on it.
	 *
	 * A multipath subflow socket would have its SS_NOFDREF set by
	 * default, so check for SOF_MP_SUBFLOW socket flag; when the
	 * socket is closed for real, SOF_MP_SUBFLOW would be cleared.
	 */
	if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
	    (SS_NOFDREF | SS_CANTRCVMORE) && !(so->so_flags & SOF_MP_SUBFLOW)) {
		return 0;
	}

	/*
	 * For consistency with soreceive() semantics, we need to obey
	 * SB_LOCK in case some other code path has locked the buffer.
	 */
	error = sblock(&so->so_rcv, 0);
	if (error != 0) {
		return error;
	}

	m = so->so_rcv.sb_mb;
	if (m == NULL) {
		/*
		 * Panic if we notice inconsistencies in the socket's
		 * receive list; both sb_mb and sb_cc should correctly
		 * reflect the contents of the list, otherwise we may
		 * end up with false positives during select() or poll()
		 * which could put the application in a bad state.
		 */
		SB_MB_CHECK(&so->so_rcv);

		if (so->so_error != 0) {
			error = so->so_error;
			so->so_error = 0;
			goto release;
		}

		if (so->so_state & SS_CANTRCVMORE) {
			goto release;
		}

		if (!(so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING))) {
			error = ENOTCONN;
			goto release;
		}

		/*
		 * MSG_DONTWAIT is implicitly defined and this routine will
		 * never block, so return EWOULDBLOCK when there is nothing.
		 */
		error = EWOULDBLOCK;
		goto release;
	}

	mptcp_update_last_owner(so, mp_so);

	SBLASTRECORDCHK(&so->so_rcv, "mptcp_subflow_soreceive 1");
	SBLASTMBUFCHK(&so->so_rcv, "mptcp_subflow_soreceive 1");

	/* One iteration per DSS mapping (or per mbuf on the fallback path) */
	while (m != NULL) {
		int dlen = 0, error_out = 0, off = 0;
		uint8_t dfin = 0;
		struct mbuf *start = m;     /* first mbuf of this mapping, for csum */
		uint64_t dsn;
		uint32_t sseq;
		uint16_t orig_dlen;
		uint16_t csum;

		VERIFY(m->m_nextpkt == NULL);

		if (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) {
fallback:
			/* Just move mbuf to MPTCP-level */

			sbfree(&so->so_rcv, m);

			if (mp != NULL) {
				*mp = m;
				mp = &m->m_next;
				so->so_rcv.sb_mb = m = m->m_next;
				*mp = NULL;
			}

			/* Keep the sockbuf's last-record pointer coherent */
			if (m != NULL) {
				so->so_rcv.sb_lastrecord = m;
			} else {
				SB_EMPTY_FIXUP(&so->so_rcv);
			}

			continue;
		} else if (!(m->m_flags & M_PKTHDR) || !(m->m_pkthdr.pkt_flags & PKTF_MPTCP)) {
			struct mptsub *mpts = sototcpcb(so)->t_mpsub;
			boolean_t found_mapping = false;
			int parsed_length = 0;
			struct mbuf *m_iter;

			/*
			 * No MPTCP-option in the header. Either fallback or
			 * wait for additional mappings.
			 */
			if (!(mpts->mpts_flags & MPTSF_FULLY_ESTABLISHED)) {
				/* data arrived without a DSS option mapping */

				/* initial subflow can fallback right after SYN handshake */
				if (mpts->mpts_flags & MPTSF_INITIAL_SUB) {
					mptcp_notify_mpfail(so);

					goto fallback;
				} else {
					/* a secondary subflow must carry DSS; reset it */
					os_log_error(mptcp_log_handle, "%s - %lx: No DSS on secondary subflow. Killing %d\n",
					    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
					    mpts->mpts_connid);
					soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);

					error = EIO;
					*mp0 = NULL;
					goto release;
				}
			}

			/* Thus, let's look for an mbuf with the mapping */
			m_iter = m->m_next;
			parsed_length = m->m_len;
			while (m_iter != NULL && parsed_length < UINT16_MAX) {
				if (!(m_iter->m_flags & M_PKTHDR) || !(m_iter->m_pkthdr.pkt_flags & PKTF_MPTCP)) {
					parsed_length += m_iter->m_len;
					m_iter = m_iter->m_next;
					continue;
				}

				found_mapping = true;

				/* Found an mbuf with a DSS-mapping */
				orig_dlen = dlen = m_iter->m_pkthdr.mp_rlen;
				dsn = m_iter->m_pkthdr.mp_dsn;
				sseq = m_iter->m_pkthdr.mp_rseq;
				csum = m_iter->m_pkthdr.mp_csum;

				/* DATA_FIN consumes one byte of the mapping's length */
				if (m_iter->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN) {
					dfin = 1;
					dlen--;
				}

				break;
			}

			if (!found_mapping && parsed_length < UINT16_MAX) {
				/* Mapping not yet present, we can wait! */
				if (*mp0 == NULL) {
					error = EWOULDBLOCK;
				}
				goto release;
			} else if (!found_mapping && parsed_length >= UINT16_MAX) {
				os_log_error(mptcp_log_handle, "%s - %lx: Received more than 64KB without DSS mapping. Killing %d\n",
				    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
				    mpts->mpts_connid);
				/* Received 64KB without DSS-mapping. We should kill the subflow */
				soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);

				error = EIO;
				*mp0 = NULL;
				goto release;
			}
		} else {
			/* This mbuf itself carries the DSS mapping */
			orig_dlen = dlen = m->m_pkthdr.mp_rlen;
			dsn = m->m_pkthdr.mp_dsn;
			sseq = m->m_pkthdr.mp_rseq;
			csum = m->m_pkthdr.mp_csum;

			/* DATA_FIN consumes one byte of the mapping's length */
			if (m->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN) {
				dfin = 1;
				dlen--;
			}
		}

		/*
		 * Check if the full mapping is now present
		 */
		if ((int)so->so_rcv.sb_cc < dlen) {
			if (*mp0 == NULL) {
				error = EWOULDBLOCK;
			}
			goto release;
		}

		/* Now, get the full mapping */
		off = 0;
		while (dlen > 0) {
			if (mptcp_adj_rmap(so, m, off, dsn, sseq, orig_dlen, dfin)) {
				error_out = 1;
				error = EIO;
				dlen = 0;
				*mp0 = NULL;
				break;
			}

			/* Advance past this mbuf and hand it to the MPTCP layer */
			dlen -= m->m_len;
			off += m->m_len;
			sbfree(&so->so_rcv, m);

			if (mp != NULL) {
				*mp = m;
				mp = &m->m_next;
				so->so_rcv.sb_mb = m = m->m_next;
				*mp = NULL;
			}

			ASSERT(dlen == 0 || m);
			if (dlen != 0 && m == NULL) {
				/* "try" to gracefully recover on customer builds */
				error_out = 1;
				error = EIO;
				dlen  = 0;

				*mp0 = NULL;

				SB_EMPTY_FIXUP(&so->so_rcv);
				soevent(so, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);

				break;
			}
		}

		VERIFY(dlen == 0);

		/* Keep the sockbuf's last-record pointer coherent */
		if (m != NULL) {
			so->so_rcv.sb_lastrecord = m;
		} else {
			SB_EMPTY_FIXUP(&so->so_rcv);
		}

		if (error_out) {
			goto release;
		}

		/* Verify the DSS checksum over the whole mapping just consumed */
		if (mptcp_validate_csum(sototcpcb(so), start, dsn, sseq, orig_dlen, csum, dfin)) {
			error = EIO;
			*mp0 = NULL;
			goto release;
		}

		SBLASTRECORDCHK(&so->so_rcv, "mptcp_subflow_soreceive 2");
		SBLASTMBUFCHK(&so->so_rcv, "mptcp_subflow_soreceive 2");
	}

	DTRACE_MPTCP3(subflow__receive, struct socket *, so,
	    struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd);

	if (flagsp != NULL) {
		*flagsp |= flags;
	}

release:
	sbunlock(&so->so_rcv, TRUE);

	return error;
}
2452 
2453 /*
2454  * MPTCP subflow socket send routine, derived from sosend().
2455  */
static int
mptcp_subflow_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags)
{
	struct socket *mp_so = mptetoso(tptomptp(sototcpcb(so))->mpt_mpte);
	boolean_t en_tracing = FALSE, proc_held = FALSE;
	struct proc *p = current_proc();
	int en_tracing_val;
	int sblocked = 1; /* Pretend as if it is already locked, so we won't relock it */
	int error;

	/*
	 * The MPTCP layer only ever hands us a pre-built mbuf chain in 'top';
	 * all other sosend() parameters must be unused.
	 */
	VERIFY(control == NULL);
	VERIFY(addr == NULL);
	VERIFY(uio == NULL);
	VERIFY(flags == 0);
	VERIFY((so->so_flags & SOF_CONTENT_FILTER) == 0);

	VERIFY(top->m_pkthdr.len > 0 && top->m_pkthdr.len <= UINT16_MAX);
	VERIFY(top->m_pkthdr.pkt_flags & PKTF_MPTCP);

	/*
	 * trace if tracing & network (vs. unix) sockets & and
	 * non-loopback
	 */
	if (ENTR_SHOULDTRACE &&
	    (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
		struct inpcb *inp = sotoinpcb(so);
		if (inp->inp_last_outifp != NULL &&
		    !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
			en_tracing = TRUE;
			en_tracing_val = top->m_pkthdr.len;
			KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_START,
			    (unsigned long)VM_KERNEL_ADDRPERM(so),
			    ((so->so_state & SS_NBIO) ? kEnTrFlagNonBlocking : 0),
			    (int64_t)en_tracing_val);
		}
	}

	mptcp_update_last_owner(so, mp_so);

	/*
	 * Attribute the send to the MPTCP socket's owning process; if it
	 * differs from the current one, take a reference (released below).
	 */
	if (mp_so->last_pid != proc_pid(p)) {
		p = proc_find(mp_so->last_pid);
		if (p == PROC_NULL) {
			p = current_proc();
		} else {
			proc_held = TRUE;
		}
	}

#if NECP
	inp_update_necp_policy(sotoinpcb(so), NULL, NULL, 0);
#endif /* NECP */

	error = sosendcheck(so, NULL, top->m_pkthdr.len, 0, 1, 0, &sblocked);
	if (error) {
		goto out;
	}

	/* pru_send consumes 'top' regardless of outcome; don't free it twice */
	error = (*so->so_proto->pr_usrreqs->pru_send)(so, 0, top, NULL, NULL, p);
	top = NULL;

out:
	/* Only reached with a non-NULL 'top' if sosendcheck() failed */
	if (top != NULL) {
		m_freem(top);
	}

	if (proc_held) {
		proc_rele(p);
	}

	soclearfastopen(so);

	if (en_tracing) {
		KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_END,
		    (unsigned long)VM_KERNEL_ADDRPERM(so),
		    ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
		    (int64_t)en_tracing_val);
	}

	return error;
}
2537 
2538 /*
2539  * Establish an initial MPTCP connection (if first subflow and not yet
2540  * connected), or add a subflow to an existing MPTCP connection.
2541  */
int
mptcp_subflow_add(struct mptses *mpte, struct sockaddr *src,
    struct sockaddr *dst, uint32_t ifscope, sae_connid_t *pcid)
{
	struct socket *mp_so, *so = NULL;
	struct mptcb *mp_tp;
	struct mptsub *mpts = NULL;
	int af, error = 0;

	mp_so = mptetoso(mpte);
	mp_tp = mpte->mpte_mptcb;

	socket_lock_assert_owned(mp_so);

	if (mp_tp->mpt_state >= MPTCPS_CLOSE_WAIT) {
		/* If the remote end sends Data FIN, refuse subflow adds */
		os_log_error(mptcp_log_handle, "%s - %lx: state %u\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mp_tp->mpt_state);
		error = ENOTCONN;
		goto out_err;
	}

	/* Enforce the per-connection subflow cap */
	if (mpte->mpte_numflows > MPTCP_MAX_NUM_SUBFLOWS) {
		error = EOVERFLOW;
		goto out_err;
	}

	mpts = mptcp_subflow_alloc();
	if (mpts == NULL) {
		os_log_error(mptcp_log_handle, "%s - %lx: malloc subflow failed\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
		error = ENOMEM;
		goto out_err;
	}

	/* Optional source address: validate family/length, then copy */
	if (src) {
		if (src->sa_family != AF_INET && src->sa_family != AF_INET6) {
			error = EAFNOSUPPORT;
			goto out_err;
		}

		if (src->sa_family == AF_INET &&
		    src->sa_len != sizeof(struct sockaddr_in)) {
			error = EINVAL;
			goto out_err;
		}

		if (src->sa_family == AF_INET6 &&
		    src->sa_len != sizeof(struct sockaddr_in6)) {
			error = EINVAL;
			goto out_err;
		}

		mpts->mpts_src = (struct sockaddr *)alloc_sockaddr(src->sa_len,
		    Z_WAITOK | Z_NOFAIL);

		bcopy(src, mpts->mpts_src, src->sa_len);
	}

	/* Mandatory destination address: same validation */
	if (dst->sa_family != AF_INET && dst->sa_family != AF_INET6) {
		error = EAFNOSUPPORT;
		goto out_err;
	}

	if (dst->sa_family == AF_INET &&
	    dst->sa_len != sizeof(mpts->__mpts_dst_v4)) {
		error = EINVAL;
		goto out_err;
	}

	if (dst->sa_family == AF_INET6 &&
	    dst->sa_len != sizeof(mpts->__mpts_dst_v6)) {
		error = EINVAL;
		goto out_err;
	}

	memcpy(&mpts->mpts_u_dst, dst, dst->sa_len);

	af = mpts->mpts_dst.sa_family;

	/* Reject an interface scope that exceeds the current if_index */
	ifnet_head_lock_shared();
	if ((ifscope > (unsigned)if_index)) {
		ifnet_head_done();
		error = ENXIO;
		goto out_err;
	}
	ifnet_head_done();

	mpts->mpts_ifscope = ifscope;

	/* create the subflow socket */
	if ((error = mptcp_subflow_socreate(mpte, mpts, af, &so)) != 0) {
		/*
		 * Returning (error) and not cleaning up, because up to here
		 * all we did is creating mpts.
		 *
		 * And the contract is that the call to mptcp_subflow_socreate,
		 * moves ownership of mpts to mptcp_subflow_socreate.
		 */
		return error;
	}

	/*
	 * We may be called from within the kernel. Still need to account this
	 * one to the real app.
	 */
	mptcp_update_last_owner(mpts->mpts_socket, mp_so);

	/*
	 * Increment the counter, while avoiding 0 (SAE_CONNID_ANY) and
	 * -1 (SAE_CONNID_ALL).
	 */
	mpte->mpte_connid_last++;
	if (mpte->mpte_connid_last == SAE_CONNID_ALL ||
	    mpte->mpte_connid_last == SAE_CONNID_ANY) {
		mpte->mpte_connid_last++;
	}

	mpts->mpts_connid = mpte->mpte_connid_last;

	mpts->mpts_rel_seq = 1;

	/* Allocate a unique address id per subflow */
	mpte->mpte_addrid_last++;
	if (mpte->mpte_addrid_last == 0) {
		mpte->mpte_addrid_last++;
	}

	/* register for subflow socket read/write events */
	sock_setupcalls_locked(so, NULL, NULL, mptcp_subflow_wupcall, mpts, 1);

	/* Register for subflow socket control events */
	sock_catchevents_locked(so, mptcp_subflow_eupcall1, mpts,
	    SO_FILT_HINT_CONNRESET | SO_FILT_HINT_CANTRCVMORE |
	    SO_FILT_HINT_TIMEOUT | SO_FILT_HINT_NOSRCADDR |
	    SO_FILT_HINT_IFDENIED | SO_FILT_HINT_CONNECTED |
	    SO_FILT_HINT_DISCONNECTED | SO_FILT_HINT_MPFAILOVER |
	    SO_FILT_HINT_MPSTATUS | SO_FILT_HINT_MUSTRST |
	    SO_FILT_HINT_MPCANTRCVMORE | SO_FILT_HINT_ADAPTIVE_RTIMO |
	    SO_FILT_HINT_ADAPTIVE_WTIMO | SO_FILT_HINT_MP_SUB_ERROR);

	/* sanity check */
	VERIFY(!(mpts->mpts_flags &
	    (MPTSF_CONNECTING | MPTSF_CONNECTED | MPTSF_CONNECT_PENDING)));

	/*
	 * Indicate to the TCP subflow whether or not it should establish
	 * the initial MPTCP connection, or join an existing one.  Fill
	 * in the connection request structure with additional info needed
	 * by the underlying TCP (to be used in the TCP options, etc.)
	 */
	if (mp_tp->mpt_state < MPTCPS_ESTABLISHED && mpte->mpte_numflows == 1) {
		mpts->mpts_flags |= MPTSF_INITIAL_SUB;

		if (mp_tp->mpt_state == MPTCPS_CLOSED) {
			mptcp_init_local_parms(mpte, dst);
		}
		soisconnecting(mp_so);

		/* If fastopen is requested, set state in mpts */
		if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
			mpts->mpts_flags |= MPTSF_TFO_REQD;
		}
	} else {
		/* Joins must wait until the connection is ready for them */
		if (!(mp_tp->mpt_flags & MPTCPF_JOIN_READY)) {
			mpts->mpts_flags |= MPTSF_CONNECT_PENDING;
		}
	}

	mpts->mpts_flags |= MPTSF_CONNECTING;

	/* connect right away if first attempt, or if join can be done now */
	if (!(mpts->mpts_flags & MPTSF_CONNECT_PENDING)) {
		error = mptcp_subflow_soconnectx(mpte, mpts);
	}

	if (error) {
		goto out_err_close;
	}

	if (pcid) {
		*pcid = mpts->mpts_connid;
	}

	return 0;

out_err_close:
	/* Socket exists by now: abort tears down subflow and socket */
	mptcp_subflow_abort(mpts, error);

	return error;

out_err:
	/* Pre-socreate failure: only the bare mpts needs freeing */
	if (mpts) {
		mptcp_subflow_free(mpts);
	}

	return error;
}
2740 
2741 void
mptcpstats_update(struct mptcp_itf_stats * stats,const struct mptsub * mpts)2742 mptcpstats_update(struct mptcp_itf_stats *stats, const struct mptsub *mpts)
2743 {
2744 	int index = mptcpstats_get_index(stats, mpts);
2745 
2746 	if (index != -1) {
2747 		struct inpcb *inp = sotoinpcb(mpts->mpts_socket);
2748 
2749 		stats[index].mpis_txbytes += inp->inp_stat->txbytes;
2750 		stats[index].mpis_rxbytes += inp->inp_stat->rxbytes;
2751 
2752 		stats[index].mpis_wifi_txbytes += inp->inp_wstat->txbytes;
2753 		stats[index].mpis_wifi_rxbytes += inp->inp_wstat->rxbytes;
2754 
2755 		stats[index].mpis_wired_txbytes += inp->inp_Wstat->txbytes;
2756 		stats[index].mpis_wired_rxbytes += inp->inp_Wstat->rxbytes;
2757 
2758 		stats[index].mpis_cell_txbytes += inp->inp_cstat->txbytes;
2759 		stats[index].mpis_cell_rxbytes += inp->inp_cstat->rxbytes;
2760 	}
2761 }
2762 
2763 /*
2764  * Delete/remove a subflow from an MPTCP.  The underlying subflow socket
2765  * will no longer be accessible after a subflow is deleted, thus this
2766  * should occur only after the subflow socket has been disconnected.
2767  */
void
mptcp_subflow_del(struct mptses *mpte, struct mptsub *mpts)
{
	struct socket *mp_so = mptetoso(mpte);
	struct socket *so = mpts->mpts_socket;
	struct tcpcb *tp = sototcpcb(so);

	socket_lock_assert_owned(mp_so);
	VERIFY(mpts->mpts_mpte == mpte);
	VERIFY(mpts->mpts_flags & MPTSF_ATTACHED);
	VERIFY(mpte->mpte_numflows != 0);
	VERIFY(mp_so->so_usecount > 0);

	/* Fold this subflow's traffic into the per-interface stats first */
	mptcpstats_update(mpte->mpte_itfstats, mpts);

	mptcp_unset_cellicon(mpte, mpts, 1);

	/* Remember the departing subflow's byte counts on the session */
	mpte->mpte_init_rxbytes = sotoinpcb(so)->inp_stat->rxbytes;
	mpte->mpte_init_txbytes = sotoinpcb(so)->inp_stat->txbytes;

	/* Unlink from the session's subflow list */
	atomic_bitclear_32(&mpts->mpts_flags, MPTSF_ATTACHED);
	TAILQ_REMOVE(&mpte->mpte_subflows, mpts, mpts_entry);
	mpte->mpte_numflows--;
	if (mpte->mpte_active_sub == mpts) {
		mpte->mpte_active_sub = NULL;
	}

	/*
	 * Drop references held by this subflow socket; there
	 * will be no further upcalls made from this point.
	 */
	sock_setupcalls_locked(so, NULL, NULL, NULL, NULL, 0);
	sock_catchevents_locked(so, NULL, NULL, 0);

	mptcp_detach_mptcb_from_subf(mpte->mpte_mptcb, so);

	mp_so->so_usecount--;           /* for subflow socket */
	mpts->mpts_mpte = NULL;
	mpts->mpts_socket = NULL;

	mptcp_subflow_remref(mpts);             /* for MPTCP subflow list */
	mptcp_subflow_remref(mpts);             /* for subflow socket */

	/* Sever the TCP side's back-pointers into MPTCP state */
	so->so_flags &= ~SOF_MP_SUBFLOW;
	tp->t_mptcb = NULL;
	tp->t_mpsub = NULL;
}
2815 
2816 void
mptcp_subflow_shutdown(struct mptses * mpte,struct mptsub * mpts)2817 mptcp_subflow_shutdown(struct mptses *mpte, struct mptsub *mpts)
2818 {
2819 	struct socket *so = mpts->mpts_socket;
2820 	struct mptcb *mp_tp = mpte->mpte_mptcb;
2821 	int send_dfin = 0;
2822 
2823 	if (mp_tp->mpt_state > MPTCPS_CLOSE_WAIT) {
2824 		send_dfin = 1;
2825 	}
2826 
2827 	if (!(so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) &&
2828 	    (so->so_state & SS_ISCONNECTED)) {
2829 		mptcplog((LOG_DEBUG, "MPTCP subflow shutdown %s: cid %d fin %d\n",
2830 		    __func__, mpts->mpts_connid, send_dfin),
2831 		    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
2832 
2833 		if (send_dfin) {
2834 			mptcp_send_dfin(so);
2835 		}
2836 		soshutdownlock(so, SHUT_WR);
2837 	}
2838 }
2839 
2840 static void
mptcp_subflow_abort(struct mptsub * mpts,int error)2841 mptcp_subflow_abort(struct mptsub *mpts, int error)
2842 {
2843 	struct socket *so = mpts->mpts_socket;
2844 	struct tcpcb *tp = sototcpcb(so);
2845 
2846 	if (mpts->mpts_flags & MPTSF_DISCONNECTED) {
2847 		return;
2848 	}
2849 
2850 	mptcplog((LOG_DEBUG, "%s aborting connection state %u\n", __func__, tp->t_state),
2851 	    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
2852 
2853 	if (tp->t_state != TCPS_CLOSED) {
2854 		tcp_drop(tp, error);
2855 	}
2856 
2857 	mptcp_subflow_eupcall1(so, mpts, SO_FILT_HINT_DISCONNECTED);
2858 }
2859 
2860 /*
2861  * Disconnect a subflow socket.
2862  */
2863 void
mptcp_subflow_disconnect(struct mptses * mpte,struct mptsub * mpts)2864 mptcp_subflow_disconnect(struct mptses *mpte, struct mptsub *mpts)
2865 {
2866 	struct socket *so, *mp_so;
2867 	struct mptcb *mp_tp;
2868 	int send_dfin = 0;
2869 
2870 	so = mpts->mpts_socket;
2871 	mp_tp = mpte->mpte_mptcb;
2872 	mp_so = mptetoso(mpte);
2873 
2874 	socket_lock_assert_owned(mp_so);
2875 
2876 	if (mpts->mpts_flags & (MPTSF_DISCONNECTING | MPTSF_DISCONNECTED)) {
2877 		return;
2878 	}
2879 
2880 	mptcp_unset_cellicon(mpte, mpts, 1);
2881 
2882 	mpts->mpts_flags |= MPTSF_DISCONNECTING;
2883 
2884 	if (mp_tp->mpt_state > MPTCPS_CLOSE_WAIT) {
2885 		send_dfin = 1;
2886 	}
2887 
2888 	if (mp_so->so_flags & SOF_DEFUNCT) {
2889 		errno_t ret;
2890 
2891 		ret = sosetdefunct(NULL, so, SHUTDOWN_SOCKET_LEVEL_DISCONNECT_ALL, TRUE);
2892 		if (ret == 0) {
2893 			ret = sodefunct(NULL, so, SHUTDOWN_SOCKET_LEVEL_DISCONNECT_ALL);
2894 
2895 			if (ret != 0) {
2896 				os_log_error(mptcp_log_handle, "%s - %lx: sodefunct failed with %d\n",
2897 				    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), ret);
2898 			}
2899 		} else {
2900 			os_log_error(mptcp_log_handle, "%s - %lx: sosetdefunct failed with %d\n",
2901 			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), ret);
2902 		}
2903 	}
2904 
2905 	if (!(so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) &&
2906 	    (so->so_state & SS_ISCONNECTED)) {
2907 		mptcplog((LOG_DEBUG, "%s: cid %d fin %d\n",
2908 		    __func__, mpts->mpts_connid, send_dfin),
2909 		    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
2910 
2911 		if (send_dfin) {
2912 			mptcp_send_dfin(so);
2913 		}
2914 
2915 		(void) soshutdownlock(so, SHUT_RD);
2916 		(void) soshutdownlock(so, SHUT_WR);
2917 		(void) sodisconnectlocked(so);
2918 	}
2919 
2920 	/*
2921 	 * Generate a disconnect event for this subflow socket, in case
2922 	 * the lower layer doesn't do it; this is needed because the
2923 	 * subflow socket deletion relies on it.
2924 	 */
2925 	mptcp_subflow_eupcall1(so, mpts, SO_FILT_HINT_DISCONNECTED);
2926 }
2927 
/*
 * Subflow socket input.
 *
 * Pulls whatever the subflow socket has buffered and hands it to the
 * MPTCP data-level reassembly via mptcp_input().  The caller holds the
 * MPTCP socket lock (see mptcp_handle_input()).
 */
static void
mptcp_subflow_input(struct mptses *mpte, struct mptsub *mpts)
{
	struct socket *mp_so = mptetoso(mpte);
	struct mbuf *m = NULL;
	struct socket *so;
	int error, wakeup = 0;

	/* Guard against re-entering the input path for this MPTCP PCB. */
	VERIFY(!(mpte->mpte_mppcb->mpp_flags & MPP_INSIDE_INPUT));
	mpte->mpte_mppcb->mpp_flags |= MPP_INSIDE_INPUT;

	DTRACE_MPTCP2(subflow__input, struct mptses *, mpte,
	    struct mptsub *, mpts);

	/* No data can come off a subflow that never connected. */
	if (!(mpts->mpts_flags & MPTSF_CONNECTED)) {
		goto out;
	}

	so = mpts->mpts_socket;

	error = sock_receive_internal(so, NULL, &m, 0, NULL);
	if (error != 0 && error != EWOULDBLOCK) {
		os_log_error(mptcp_log_handle, "%s - %lx: cid %d error %d\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpts->mpts_connid, error);
		if (error == ENODATA) {
			/*
			 * Don't ignore ENODATA so as to discover
			 * nasty middleboxes.
			 */
			mp_so->so_error = ENODATA;

			/* Wake the application so it sees the error. */
			wakeup = 1;
			goto out;
		}
	} else if (error == 0) {
		mptcplog((LOG_DEBUG, "%s: cid %d \n", __func__, mpts->mpts_connid),
		    MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_VERBOSE);
	}

	/* In fallback, make sure to accept data on all but one subflow */
	if (m && (mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
	    !(mpts->mpts_flags & MPTSF_ACTIVE)) {
		mptcplog((LOG_DEBUG, "%s: degraded and got data on non-active flow\n",
		    __func__), MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_VERBOSE);
		m_freem(m);
		goto out;
	}

	if (m != NULL) {
		/* Track cell vs. wifi usage for the cellular status icon. */
		if (IFNET_IS_CELLULAR(sotoinpcb(so)->inp_last_outifp)) {
			mptcp_set_cellicon(mpte, mpts);

			mpte->mpte_used_cell = 1;
		} else {
			/*
			 * If during the past MPTCP_CELLICON_TOGGLE_RATE seconds we didn't
			 * explicitly set the cellicon, then we unset it again.
			 */
			if (TSTMP_LT(mpte->mpte_last_cellicon_set + MPTCP_CELLICON_TOGGLE_RATE, tcp_now)) {
				mptcp_unset_cellicon(mpte, NULL, 1);
			}

			mpte->mpte_used_wifi = 1;
		}

		/* Hand the received chain to MPTCP-level reassembly. */
		mptcp_input(mpte, m);
	}

out:
	if (wakeup) {
		mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_RWAKEUP;
	}

	mptcp_handle_deferred_upcalls(mpte->mpte_mppcb, MPP_INSIDE_INPUT);
}
3006 
/*
 * Drive input processing for every live subflow of the MPTCP session
 * that 'so' belongs to.  Defers to the workloop when upcalls cannot run
 * right now.  Caller holds the MPTCP socket lock.
 */
void
mptcp_handle_input(struct socket *so)
{
	struct mptsub *mpts, *tmpts;
	struct mptses *mpte;

	/* Only subflow sockets are handled here. */
	if (!(so->so_flags & SOF_MP_SUBFLOW)) {
		return;
	}

	mpts = sototcpcb(so)->t_mpsub;
	mpte = mpts->mpts_mpte;

	socket_lock_assert_owned(mptetoso(mpte));

	/* When upcalls must be deferred, just record that a read wakeup
	 * is pending — unless input handling is already in progress. */
	if (mptcp_should_defer_upcall(mpte->mpte_mppcb)) {
		if (!(mpte->mpte_mppcb->mpp_flags & MPP_INPUT_HANDLE)) {
			mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_RWAKEUP;
		}
		return;
	}

	mpte->mpte_mppcb->mpp_flags |= MPP_INPUT_HANDLE;
	TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
		if (mpts->mpts_socket->so_usecount == 0) {
			/* Will be removed soon by tcp_garbage_collect */
			continue;
		}

		/* Pin both the subflow and its socket across the input call
		 * so neither can be reclaimed underneath us. */
		mptcp_subflow_addref(mpts);
		mpts->mpts_socket->so_usecount++;

		mptcp_subflow_input(mpte, mpts);

		mptcp_subflow_remref(mpts);             /* ours */

		VERIFY(mpts->mpts_socket->so_usecount != 0);
		mpts->mpts_socket->so_usecount--;
	}

	mptcp_handle_deferred_upcalls(mpte->mpte_mppcb, MPP_INPUT_HANDLE);
}
3049 
3050 /*
3051  * Subflow socket write upcall.
3052  *
3053  * Called when the associated subflow socket posted a read event.
3054  */
3055 static void
mptcp_subflow_wupcall(struct socket * so,void * arg,int waitf)3056 mptcp_subflow_wupcall(struct socket *so, void *arg, int waitf)
3057 {
3058 #pragma unused(so, waitf)
3059 	struct mptsub *mpts = arg;
3060 	struct mptses *mpte = mpts->mpts_mpte;
3061 
3062 	VERIFY(mpte != NULL);
3063 
3064 	if (mptcp_should_defer_upcall(mpte->mpte_mppcb)) {
3065 		if (!(mpte->mpte_mppcb->mpp_flags & MPP_WUPCALL)) {
3066 			mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_WWAKEUP;
3067 		}
3068 		return;
3069 	}
3070 
3071 	mptcp_output(mpte);
3072 }
3073 
3074 static boolean_t
mptcp_search_seq_in_sub(struct mbuf * m,struct socket * so)3075 mptcp_search_seq_in_sub(struct mbuf *m, struct socket *so)
3076 {
3077 	struct mbuf *so_m = so->so_snd.sb_mb;
3078 	uint64_t dsn = m->m_pkthdr.mp_dsn;
3079 
3080 	while (so_m) {
3081 		VERIFY(so_m->m_flags & M_PKTHDR);
3082 		VERIFY(so_m->m_pkthdr.pkt_flags & PKTF_MPTCP);
3083 
3084 		/* Part of the segment is covered, don't reinject here */
3085 		if (so_m->m_pkthdr.mp_dsn <= dsn &&
3086 		    so_m->m_pkthdr.mp_dsn + so_m->m_pkthdr.mp_rlen > dsn) {
3087 			return TRUE;
3088 		}
3089 
3090 		so_m = so_m->m_next;
3091 	}
3092 
3093 	return FALSE;
3094 }
3095 
/*
 * Subflow socket output.
 *
 * Called for sending data from MPTCP to the underlying subflow socket.
 *
 * Selects the next chunk to transmit — reinjected data takes precedence
 * over fresh data from the MPTCP send buffer — copies it, stamps every
 * copy with the DSN mapping (and the DSS checksum when negotiated), and
 * pushes the chain down via pru_sosend.  Returns 0 on success or an
 * errno; EWOULDBLOCK caused by a TFO request is absorbed here.
 */
int
mptcp_subflow_output(struct mptses *mpte, struct mptsub *mpts, int flags)
{
	struct mptcb *mp_tp = mpte->mpte_mptcb;
	struct mbuf *sb_mb, *m, *mpt_mbuf = NULL, *head = NULL, *tail = NULL;
	struct socket *mp_so, *so;
	struct tcpcb *tp;
	uint64_t mpt_dsn = 0, off = 0;
	int sb_cc = 0, error = 0, wakeup = 0;
	uint16_t dss_csum;
	uint16_t tot_sent = 0;
	boolean_t reinjected = FALSE;

	mp_so = mptetoso(mpte);
	so = mpts->mpts_socket;
	tp = sototcpcb(so);

	socket_lock_assert_owned(mp_so);

	/* Guard against re-entering the output path for this MPTCP PCB. */
	VERIFY(!(mpte->mpte_mppcb->mpp_flags & MPP_INSIDE_OUTPUT));
	mpte->mpte_mppcb->mpp_flags |= MPP_INSIDE_OUTPUT;

	VERIFY(!INP_WAIT_FOR_IF_FEEDBACK(sotoinpcb(so)));
	VERIFY((mpts->mpts_flags & MPTSF_MP_CAPABLE) ||
	    (mpts->mpts_flags & MPTSF_MP_DEGRADED) ||
	    (mpts->mpts_flags & MPTSF_TFO_REQD));
	VERIFY(mptcp_subflow_cwnd_space(mpts->mpts_socket) > 0);

	mptcplog((LOG_DEBUG, "%s mpts_flags %#x, mpte_flags %#x cwnd_space %u\n",
	    __func__, mpts->mpts_flags, mpte->mpte_flags,
	    mptcp_subflow_cwnd_space(so)),
	    MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
	DTRACE_MPTCP2(subflow__output, struct mptses *, mpte,
	    struct mptsub *, mpts);

	/* Remove Addr Option is not sent reliably as per I-D */
	if (mpte->mpte_flags & MPTE_SND_REM_ADDR) {
		tp->t_rem_aid = mpte->mpte_lost_aid;
		tp->t_mpflags |= TMPF_SND_REM_ADDR;
		mpte->mpte_flags &= ~MPTE_SND_REM_ADDR;
	}

	/*
	 * The mbuf chains containing the metadata (as well as pointing to
	 * the user data sitting at the MPTCP output queue) would then be
	 * sent down to the subflow socket.
	 *
	 * Some notes on data sequencing:
	 *
	 *   a. Each mbuf must be a M_PKTHDR.
	 *   b. MPTCP metadata is stored in the mptcp_pktinfo structure
	 *	in the mbuf pkthdr structure.
	 *   c. Each mbuf containing the MPTCP metadata must have its
	 *	pkt_flags marked with the PKTF_MPTCP flag.
	 */

	/* Reinjected data takes precedence over the regular send buffer. */
	if (mpte->mpte_reinjectq) {
		sb_mb = mpte->mpte_reinjectq;
	} else {
		sb_mb = mp_so->so_snd.sb_mb;
	}

	if (sb_mb == NULL) {
		os_log_error(mptcp_log_handle, "%s - %lx: No data in MPTCP-sendbuffer! smax %u snxt %u suna %u state %u flags %#x\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
		    (uint32_t)mp_tp->mpt_sndmax, (uint32_t)mp_tp->mpt_sndnxt,
		    (uint32_t)mp_tp->mpt_snduna, mp_tp->mpt_state, mp_so->so_flags1);

		/* Fix it to prevent looping */
		if (MPTCP_SEQ_LT(mp_tp->mpt_sndnxt, mp_tp->mpt_snduna)) {
			mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;
		}
		goto out;
	}

	VERIFY(sb_mb->m_pkthdr.pkt_flags & PKTF_MPTCP);

	/* Zero-length head on a not-yet-connected subflow with preconnect
	 * data pending means this is a TFO request. */
	if (sb_mb->m_pkthdr.mp_rlen == 0 &&
	    !(so->so_state & SS_ISCONNECTED) &&
	    (so->so_flags1 & SOF1_PRECONNECT_DATA)) {
		tp->t_mpflags |= TMPF_TFO_REQUEST;

		/* Opting to call pru_send as no mbuf at subflow level */
		error = (*so->so_proto->pr_usrreqs->pru_send)(so, 0, NULL, NULL,
		    NULL, current_proc());

		goto done_sending;
	}

	mpt_dsn = sb_mb->m_pkthdr.mp_dsn;

	/* First, drop acknowledged data */
	if (MPTCP_SEQ_LT(mpt_dsn, mp_tp->mpt_snduna)) {
		os_log_error(mptcp_log_handle, "%s - %lx: dropping data, should have been done earlier "
		    "dsn %u suna %u reinject? %u\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), (uint32_t)mpt_dsn,
		    (uint32_t)mp_tp->mpt_snduna, !!mpte->mpte_reinjectq);
		if (mpte->mpte_reinjectq) {
			mptcp_clean_reinjectq(mpte);
		} else {
			uint64_t len = 0;
			len = mp_tp->mpt_snduna - mpt_dsn;
			sbdrop(&mp_so->so_snd, (int)len);
			wakeup = 1;
		}
	}

	/* Check again because of above sbdrop */
	if (mp_so->so_snd.sb_mb == NULL && mpte->mpte_reinjectq == NULL) {
		os_log_error(mptcp_log_handle, "%s - $%lx: send-buffer is empty\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
		goto out;
	}

	/*
	 * In degraded mode, we don't receive data acks, so force free
	 * mbufs less than snd_nxt
	 */
	if ((mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
	    (mp_tp->mpt_flags & MPTCPF_POST_FALLBACK_SYNC) &&
	    mp_so->so_snd.sb_mb) {
		mpt_dsn = mp_so->so_snd.sb_mb->m_pkthdr.mp_dsn;
		if (MPTCP_SEQ_LT(mpt_dsn, mp_tp->mpt_snduna)) {
			uint64_t len = 0;
			len = mp_tp->mpt_snduna - mpt_dsn;
			sbdrop(&mp_so->so_snd, (int)len);
			wakeup = 1;

			os_log_error(mptcp_log_handle, "%s - %lx: dropping data in degraded mode, should have been done earlier dsn %u sndnxt %u suna %u\n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
			    (uint32_t)mpt_dsn, (uint32_t)mp_tp->mpt_sndnxt, (uint32_t)mp_tp->mpt_snduna);
		}
	}

	/* First transmission after fallback: flag both levels so the
	 * sequence state gets resynchronized. */
	if ((mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
	    !(mp_tp->mpt_flags & MPTCPF_POST_FALLBACK_SYNC)) {
		mp_tp->mpt_flags |= MPTCPF_POST_FALLBACK_SYNC;
		so->so_flags1 |= SOF1_POST_FALLBACK_SYNC;
	}

	/*
	 * Adjust the top level notion of next byte used for retransmissions
	 * and sending FINs.
	 */
	if (MPTCP_SEQ_LT(mp_tp->mpt_sndnxt, mp_tp->mpt_snduna)) {
		mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;
	}

	/* Now determine the offset from which to start transmitting data */
	if (mpte->mpte_reinjectq) {
		sb_mb = mpte->mpte_reinjectq;
	} else {
dont_reinject:
		sb_mb = mp_so->so_snd.sb_mb;
	}
	if (sb_mb == NULL) {
		os_log_error(mptcp_log_handle, "%s - %lx: send-buffer is still empty\n", __func__,
		    (unsigned long)VM_KERNEL_ADDRPERM(mpte));
		goto out;
	}

	if (sb_mb == mpte->mpte_reinjectq) {
		sb_cc = sb_mb->m_pkthdr.mp_rlen;
		off = 0;

		/* If this subflow's send buffer still covers the sequence
		 * range, prefer fresh data over reinjecting here. */
		if (mptcp_search_seq_in_sub(sb_mb, so)) {
			if (mptcp_can_send_more(mp_tp, TRUE)) {
				goto dont_reinject;
			}

			error = ECANCELED;
			goto out;
		}

		reinjected = TRUE;
	} else if (flags & MPTCP_SUBOUT_PROBING) {
		sb_cc = sb_mb->m_pkthdr.mp_rlen;
		off = 0;
	} else {
		sb_cc = min(mp_so->so_snd.sb_cc, mp_tp->mpt_sndwnd);

		/*
		 * With TFO, there might be no data at all, thus still go into this
		 * code-path here.
		 */
		if ((mp_so->so_flags1 & SOF1_PRECONNECT_DATA) ||
		    MPTCP_SEQ_LT(mp_tp->mpt_sndnxt, mp_tp->mpt_sndmax)) {
			off = mp_tp->mpt_sndnxt - mp_tp->mpt_snduna;
			sb_cc -= off;
		} else {
			os_log_error(mptcp_log_handle, "%s - %lx: this should not happen: sndnxt %u sndmax %u\n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), (uint32_t)mp_tp->mpt_sndnxt,
			    (uint32_t)mp_tp->mpt_sndmax);

			goto out;
		}
	}

	/* Clamp to the subflow's congestion-window space. */
	sb_cc = min(sb_cc, mptcp_subflow_cwnd_space(so));
	if (sb_cc <= 0) {
		os_log_error(mptcp_log_handle, "%s - %lx: sb_cc is %d, mp_so->sb_cc %u, sndwnd %u,sndnxt %u sndmax %u cwnd %u\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), sb_cc, mp_so->so_snd.sb_cc, mp_tp->mpt_sndwnd,
		    (uint32_t)mp_tp->mpt_sndnxt, (uint32_t)mp_tp->mpt_sndmax,
		    mptcp_subflow_cwnd_space(so));
	}

	/* tot_sent is 16-bit; one mapping cannot exceed UINT16_MAX bytes. */
	sb_cc = min(sb_cc, UINT16_MAX);

	/*
	 * Create a DSN mapping for the data we are about to send. It all
	 * has the same mapping.
	 */
	if (reinjected) {
		mpt_dsn = sb_mb->m_pkthdr.mp_dsn;
	} else {
		mpt_dsn = mp_tp->mpt_snduna + off;
	}

	/* Skip over mappings that lie entirely before the send offset. */
	mpt_mbuf = sb_mb;
	while (mpt_mbuf && reinjected == FALSE &&
	    (mpt_mbuf->m_pkthdr.mp_rlen == 0 ||
	    mpt_mbuf->m_pkthdr.mp_rlen <= (uint32_t)off)) {
		off -= mpt_mbuf->m_pkthdr.mp_rlen;
		mpt_mbuf = mpt_mbuf->m_next;
	}
	if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
		mptcplog((LOG_DEBUG, "%s: %u snduna = %u sndnxt = %u probe %d\n",
		    __func__, mpts->mpts_connid, (uint32_t)mp_tp->mpt_snduna, (uint32_t)mp_tp->mpt_sndnxt,
		    mpts->mpts_probecnt),
		    MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
	}

	VERIFY((mpt_mbuf == NULL) || (mpt_mbuf->m_pkthdr.pkt_flags & PKTF_MPTCP));

	head = tail = NULL;

	/* Copy up to sb_cc bytes, tagging each copy with the mapping. */
	while (tot_sent < sb_cc) {
		int32_t mlen;

		mlen = mpt_mbuf->m_len;
		mlen -= off;
		mlen = MIN(mlen, sb_cc - tot_sent);

		if (mlen < 0) {
			os_log_error(mptcp_log_handle, "%s - %lx: mlen %d mp_rlen %u off %u sb_cc %u tot_sent %u\n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mlen, mpt_mbuf->m_pkthdr.mp_rlen,
			    (uint32_t)off, sb_cc, tot_sent);
			goto out;
		}

		if (mlen == 0) {
			goto next;
		}

		m = m_copym_mode(mpt_mbuf, (int)off, mlen, M_DONTWAIT,
		    M_COPYM_MUST_COPY_HDR);
		if (m == NULL) {
			os_log_error(mptcp_log_handle, "%s - %lx: m_copym_mode failed\n", __func__,
			    (unsigned long)VM_KERNEL_ADDRPERM(mpte));
			error = ENOBUFS;
			break;
		}

		/* Create a DSN mapping for the data (m_copym does it) */
		VERIFY(m->m_flags & M_PKTHDR);
		VERIFY(m->m_next == NULL);

		m->m_pkthdr.pkt_flags |= PKTF_MPTCP;
		m->m_pkthdr.pkt_flags &= ~PKTF_MPSO;
		m->m_pkthdr.mp_dsn = mpt_dsn;
		m->m_pkthdr.mp_rseq = mpts->mpts_rel_seq;
		m->m_pkthdr.len = mlen;

		if (head == NULL) {
			head = tail = m;
		} else {
			tail->m_next = m;
			tail = m;
		}

		tot_sent += mlen;
		off = 0;
next:
		mpt_mbuf = mpt_mbuf->m_next;
	}

	/* Partially-sent reinjected mapping: trim what went out and shift
	 * the remaining mapping; a fully-sent one is dequeued and freed. */
	if (reinjected) {
		if (sb_cc < sb_mb->m_pkthdr.mp_rlen) {
			struct mbuf *n = sb_mb;

			while (n) {
				n->m_pkthdr.mp_dsn += sb_cc;
				n->m_pkthdr.mp_rlen -= sb_cc;
				n = n->m_next;
			}
			m_adj(sb_mb, sb_cc);
		} else {
			mpte->mpte_reinjectq = sb_mb->m_nextpkt;
			m_freem(sb_mb);
		}
	}

	mptcplog((LOG_DEBUG, "%s: Queued dsn %u ssn %u len %u on sub %u\n",
	    __func__, (uint32_t)mpt_dsn, mpts->mpts_rel_seq,
	    tot_sent, mpts->mpts_connid), MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);

	/* Compute the DSS checksum over the whole mapping, if negotiated. */
	if (head && (mp_tp->mpt_flags & MPTCPF_CHECKSUM)) {
		dss_csum = mptcp_output_csum(head, mpt_dsn, mpts->mpts_rel_seq,
		    tot_sent);
	}

	/* Now, let's update rel-seq and the data-level length */
	mpts->mpts_rel_seq += tot_sent;
	m = head;
	while (m) {
		if (mp_tp->mpt_flags & MPTCPF_CHECKSUM) {
			m->m_pkthdr.mp_csum = dss_csum;
		}
		m->m_pkthdr.mp_rlen = tot_sent;
		m = m->m_next;
	}

	if (head != NULL) {
		if ((mpts->mpts_flags & MPTSF_TFO_REQD) &&
		    (tp->t_tfo_stats == 0)) {
			tp->t_mpflags |= TMPF_TFO_REQUEST;
		}

		/* Ownership of 'head' passes to the subflow; clear it so the
		 * error path below does not double-free. */
		error = so->so_proto->pr_usrreqs->pru_sosend(so, NULL, NULL, head, NULL, 0);
		head = NULL;
	}

done_sending:
	if (error == 0 ||
	    (error == EWOULDBLOCK && (tp->t_mpflags & TMPF_TFO_REQUEST))) {
		uint64_t new_sndnxt = mp_tp->mpt_sndnxt + tot_sent;

		if (mpts->mpts_probesoon && mpts->mpts_maxseg && tot_sent) {
			tcpstat.tcps_mp_num_probes++;
			if ((uint32_t)tot_sent < mpts->mpts_maxseg) {
				mpts->mpts_probecnt += 1;
			} else {
				mpts->mpts_probecnt +=
				    tot_sent / mpts->mpts_maxseg;
			}
		}

		if (!reinjected && !(flags & MPTCP_SUBOUT_PROBING)) {
			if (MPTCP_DATASEQ_HIGH32(new_sndnxt) >
			    MPTCP_DATASEQ_HIGH32(mp_tp->mpt_sndnxt)) {
				mp_tp->mpt_flags |= MPTCPF_SND_64BITDSN;
			}
			mp_tp->mpt_sndnxt = new_sndnxt;
		}

		mptcp_cancel_timer(mp_tp, MPTT_REXMT);

		/* Must be here as mptcp_can_send_more() checks for this */
		soclearfastopen(mp_so);

		if ((mpts->mpts_flags & MPTSF_MP_DEGRADED) ||
		    (mpts->mpts_probesoon != 0)) {
			mptcplog((LOG_DEBUG, "%s %u degraded %u wrote %d %d probe %d probedelta %d\n",
			    __func__, mpts->mpts_connid,
			    !!(mpts->mpts_flags & MPTSF_MP_DEGRADED),
			    tot_sent, (int) sb_cc, mpts->mpts_probecnt,
			    (tcp_now - mpts->mpts_probesoon)),
			    MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
		}

		/* Track cell vs. wifi usage for the cellular status icon. */
		if (IFNET_IS_CELLULAR(sotoinpcb(so)->inp_last_outifp)) {
			mptcp_set_cellicon(mpte, mpts);

			mpte->mpte_used_cell = 1;
		} else {
			/*
			 * If during the past MPTCP_CELLICON_TOGGLE_RATE seconds we didn't
			 * explicitly set the cellicon, then we unset it again.
			 */
			if (TSTMP_LT(mpte->mpte_last_cellicon_set + MPTCP_CELLICON_TOGGLE_RATE, tcp_now)) {
				mptcp_unset_cellicon(mpte, NULL, 1);
			}

			mpte->mpte_used_wifi = 1;
		}

		/*
		 * Don't propagate EWOULDBLOCK - it's already taken care of
		 * in mptcp_usr_send for TFO.
		 */
		error = 0;
	} else {
		/* We need to revert our change to mpts_rel_seq */
		mpts->mpts_rel_seq -= tot_sent;

		os_log_error(mptcp_log_handle, "%s - %lx: %u error %d len %d subflags %#x sostate %#x soerror %u hiwat %u lowat %u\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpts->mpts_connid, error, tot_sent, so->so_flags, so->so_state, so->so_error, so->so_snd.sb_hiwat, so->so_snd.sb_lowat);
	}
out:

	if (head != NULL) {
		m_freem(head);
	}

	if (wakeup) {
		mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_WWAKEUP;
	}

	mptcp_handle_deferred_upcalls(mpte->mpte_mppcb, MPP_INSIDE_OUTPUT);
	return error;
}
3512 
3513 static void
mptcp_add_reinjectq(struct mptses * mpte,struct mbuf * m)3514 mptcp_add_reinjectq(struct mptses *mpte, struct mbuf *m)
3515 {
3516 	struct mbuf *n, *prev = NULL;
3517 
3518 	mptcplog((LOG_DEBUG, "%s reinjecting dsn %u dlen %u rseq %u\n",
3519 	    __func__, (uint32_t)m->m_pkthdr.mp_dsn, m->m_pkthdr.mp_rlen,
3520 	    m->m_pkthdr.mp_rseq),
3521 	    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
3522 
3523 	n = mpte->mpte_reinjectq;
3524 
3525 	/* First, look for an mbuf n, whose data-sequence-number is bigger or
3526 	 * equal than m's sequence number.
3527 	 */
3528 	while (n) {
3529 		if (MPTCP_SEQ_GEQ(n->m_pkthdr.mp_dsn, m->m_pkthdr.mp_dsn)) {
3530 			break;
3531 		}
3532 
3533 		prev = n;
3534 
3535 		n = n->m_nextpkt;
3536 	}
3537 
3538 	if (n) {
3539 		/* m is already fully covered by the next mbuf in the queue */
3540 		if (n->m_pkthdr.mp_dsn == m->m_pkthdr.mp_dsn &&
3541 		    n->m_pkthdr.mp_rlen >= m->m_pkthdr.mp_rlen) {
3542 			os_log(mptcp_log_handle, "%s - %lx: dsn %u dlen %u rseq %u fully covered with len %u\n",
3543 			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
3544 			    (uint32_t)m->m_pkthdr.mp_dsn, m->m_pkthdr.mp_rlen,
3545 			    m->m_pkthdr.mp_rseq, n->m_pkthdr.mp_rlen);
3546 			goto dont_queue;
3547 		}
3548 
3549 		/* m is covering the next mbuf entirely, thus we remove this guy */
3550 		if (m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen >= n->m_pkthdr.mp_dsn + n->m_pkthdr.mp_rlen) {
3551 			struct mbuf *tmp = n->m_nextpkt;
3552 
3553 			os_log(mptcp_log_handle, "%s - %lx: m (dsn %u len %u) is covering existing mbuf (dsn %u len %u)\n",
3554 			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
3555 			    (uint32_t)m->m_pkthdr.mp_dsn, m->m_pkthdr.mp_rlen,
3556 			    (uint32_t)n->m_pkthdr.mp_dsn, n->m_pkthdr.mp_rlen);
3557 
3558 			m->m_nextpkt = NULL;
3559 			if (prev == NULL) {
3560 				mpte->mpte_reinjectq = tmp;
3561 			} else {
3562 				prev->m_nextpkt = tmp;
3563 			}
3564 
3565 			m_freem(n);
3566 			n = tmp;
3567 		}
3568 	}
3569 
3570 	if (prev) {
3571 		/* m is already fully covered by the previous mbuf in the queue */
3572 		if (prev->m_pkthdr.mp_dsn + prev->m_pkthdr.mp_rlen >= m->m_pkthdr.mp_dsn + m->m_pkthdr.len) {
3573 			os_log(mptcp_log_handle, "%s - %lx: prev (dsn %u len %u) covers us (dsn %u len %u)\n",
3574 			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
3575 			    (uint32_t)prev->m_pkthdr.mp_dsn, prev->m_pkthdr.mp_rlen,
3576 			    (uint32_t)m->m_pkthdr.mp_dsn, m->m_pkthdr.mp_rlen);
3577 			goto dont_queue;
3578 		}
3579 	}
3580 
3581 	if (prev == NULL) {
3582 		mpte->mpte_reinjectq = m;
3583 	} else {
3584 		prev->m_nextpkt = m;
3585 	}
3586 
3587 	m->m_nextpkt = n;
3588 
3589 	return;
3590 
3591 dont_queue:
3592 	m_freem(m);
3593 	return;
3594 }
3595 
3596 static struct mbuf *
mptcp_lookup_dsn(struct mptses * mpte,uint64_t dsn)3597 mptcp_lookup_dsn(struct mptses *mpte, uint64_t dsn)
3598 {
3599 	struct socket *mp_so = mptetoso(mpte);
3600 	struct mbuf *m;
3601 
3602 	m = mp_so->so_snd.sb_mb;
3603 
3604 	while (m) {
3605 		/* If this segment covers what we are looking for, return it. */
3606 		if (MPTCP_SEQ_LEQ(m->m_pkthdr.mp_dsn, dsn) &&
3607 		    MPTCP_SEQ_GT(m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen, dsn)) {
3608 			break;
3609 		}
3610 
3611 
3612 		/* Segment is no more in the queue */
3613 		if (MPTCP_SEQ_GT(m->m_pkthdr.mp_dsn, dsn)) {
3614 			return NULL;
3615 		}
3616 
3617 		m = m->m_next;
3618 	}
3619 
3620 	return m;
3621 }
3622 
3623 static struct mbuf *
mptcp_copy_mbuf_list(struct mptses * mpte,struct mbuf * m,int len)3624 mptcp_copy_mbuf_list(struct mptses *mpte, struct mbuf *m, int len)
3625 {
3626 	struct mbuf *top = NULL, *tail = NULL;
3627 	uint64_t dsn;
3628 	uint32_t dlen, rseq;
3629 
3630 	dsn = m->m_pkthdr.mp_dsn;
3631 	dlen = m->m_pkthdr.mp_rlen;
3632 	rseq = m->m_pkthdr.mp_rseq;
3633 
3634 	while (len > 0) {
3635 		struct mbuf *n;
3636 
3637 		VERIFY((m->m_flags & M_PKTHDR) && (m->m_pkthdr.pkt_flags & PKTF_MPTCP));
3638 
3639 		n = m_copym_mode(m, 0, m->m_len, M_DONTWAIT, M_COPYM_MUST_COPY_HDR);
3640 		if (n == NULL) {
3641 			os_log_error(mptcp_log_handle, "%s - %lx: m_copym_mode returned NULL\n",
3642 			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
3643 			goto err;
3644 		}
3645 
3646 		VERIFY(n->m_flags & M_PKTHDR);
3647 		VERIFY(n->m_next == NULL);
3648 		VERIFY(n->m_pkthdr.mp_dsn == dsn);
3649 		VERIFY(n->m_pkthdr.mp_rlen == dlen);
3650 		VERIFY(n->m_pkthdr.mp_rseq == rseq);
3651 		VERIFY(n->m_len == m->m_len);
3652 
3653 		n->m_pkthdr.pkt_flags |= (PKTF_MPSO | PKTF_MPTCP);
3654 
3655 		if (top == NULL) {
3656 			top = n;
3657 		}
3658 
3659 		if (tail != NULL) {
3660 			tail->m_next = n;
3661 		}
3662 
3663 		tail = n;
3664 
3665 		len -= m->m_len;
3666 		m = m->m_next;
3667 	}
3668 
3669 	return top;
3670 
3671 err:
3672 	if (top) {
3673 		m_freem(top);
3674 	}
3675 
3676 	return NULL;
3677 }
3678 
/*
 * Walk the subflow's send buffer and queue a copy of every DSN mapping
 * that has not yet been fully acknowledged at the data level, so the
 * data can be retransmitted on another subflow.
 */
static void
mptcp_reinject_mbufs(struct socket *so)
{
	struct tcpcb *tp = sototcpcb(so);
	struct mptsub *mpts = tp->t_mpsub;
	struct mptcb *mp_tp = tptomptp(tp);
	struct mptses *mpte = mp_tp->mpt_mpte;
	struct sockbuf *sb = &so->so_snd;
	struct mbuf *m;

	m = sb->sb_mb;
	while (m) {
		/* 'orig' remembers the mapping head; 'm' may be redirected to
		 * the MPTCP-level segment or a copy below. */
		struct mbuf *n = m->m_next, *orig = m;
		bool set_reinject_flag = false;

		mptcplog((LOG_DEBUG, "%s working on suna %u relseq %u iss %u len %u pktflags %#x\n",
		    __func__, tp->snd_una, m->m_pkthdr.mp_rseq, mpts->mpts_iss,
		    m->m_pkthdr.mp_rlen, m->m_pkthdr.pkt_flags),
		    MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);

		VERIFY((m->m_flags & M_PKTHDR) && (m->m_pkthdr.pkt_flags & PKTF_MPTCP));

		/* Already queued for reinjection on an earlier pass. */
		if (m->m_pkthdr.pkt_flags & PKTF_MPTCP_REINJ) {
			goto next;
		}

		/* Has it all already been acknowledged at the data-level? */
		if (MPTCP_SEQ_GEQ(mp_tp->mpt_snduna, m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen)) {
			goto next;
		}

		/* Part of this has already been acknowledged - lookup in the
		 * MPTCP-socket for the segment.
		 */
		if (SEQ_GT(tp->snd_una - mpts->mpts_iss, m->m_pkthdr.mp_rseq)) {
			m = mptcp_lookup_dsn(mpte, m->m_pkthdr.mp_dsn);
			if (m == NULL) {
				goto next;
			}
		}

		/* Copy the mbuf with headers (aka, DSN-numbers) */
		m = mptcp_copy_mbuf_list(mpte, m, m->m_pkthdr.mp_rlen);
		if (m == NULL) {
			/* Allocation failure — give up for now. */
			break;
		}

		VERIFY(m->m_nextpkt == NULL);

		/* Now, add to the reinject-queue, eliminating overlapping
		 * segments
		 */
		mptcp_add_reinjectq(mpte, m);

		set_reinject_flag = true;
		orig->m_pkthdr.pkt_flags |= PKTF_MPTCP_REINJ;

next:
		/* mp_rlen can cover multiple mbufs, so advance to the end of it. */
		while (n) {
			VERIFY((n->m_flags & M_PKTHDR) && (n->m_pkthdr.pkt_flags & PKTF_MPTCP));

			if (n->m_pkthdr.mp_dsn != orig->m_pkthdr.mp_dsn) {
				break;
			}

			/* Mark the rest of the mapping as queued, too. */
			if (set_reinject_flag) {
				n->m_pkthdr.pkt_flags |= PKTF_MPTCP_REINJ;
			}
			n = n->m_next;
		}

		m = n;
	}
}
3754 
3755 void
mptcp_clean_reinjectq(struct mptses * mpte)3756 mptcp_clean_reinjectq(struct mptses *mpte)
3757 {
3758 	struct mptcb *mp_tp = mpte->mpte_mptcb;
3759 
3760 	socket_lock_assert_owned(mptetoso(mpte));
3761 
3762 	while (mpte->mpte_reinjectq) {
3763 		struct mbuf *m = mpte->mpte_reinjectq;
3764 
3765 		if (MPTCP_SEQ_GEQ(m->m_pkthdr.mp_dsn, mp_tp->mpt_snduna) ||
3766 		    MPTCP_SEQ_GT(m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen, mp_tp->mpt_snduna)) {
3767 			break;
3768 		}
3769 
3770 		mpte->mpte_reinjectq = m->m_nextpkt;
3771 		m->m_nextpkt = NULL;
3772 		m_freem(m);
3773 	}
3774 }
3775 
3776 /*
3777  * Subflow socket control event upcall.
3778  */
3779 static void
mptcp_subflow_eupcall1(struct socket * so,void * arg,long events)3780 mptcp_subflow_eupcall1(struct socket *so, void *arg, long events)
3781 {
3782 #pragma unused(so)
3783 	struct mptsub *mpts = arg;
3784 	struct mptses *mpte = mpts->mpts_mpte;
3785 
3786 	socket_lock_assert_owned(mptetoso(mpte));
3787 
3788 	if ((mpts->mpts_evctl & events) == events) {
3789 		return;
3790 	}
3791 
3792 	mpts->mpts_evctl |= events;
3793 
3794 	if (mptcp_should_defer_upcall(mpte->mpte_mppcb)) {
3795 		mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_WORKLOOP;
3796 		return;
3797 	}
3798 
3799 	mptcp_subflow_workloop(mpte);
3800 }
3801 
/*
 * Subflow socket control events.
 *
 * Called for handling events related to the underlying subflow socket.
 * Dispatches each pending event bit in mpts_evctl through the handler
 * table mpsub_ev_entry_tbl and returns the accumulated verdict.
 */
static ev_ret_t
mptcp_subflow_events(struct mptses *mpte, struct mptsub *mpts,
    long *p_mpsofilt_hint)
{
	ev_ret_t ret = MPTS_EVRET_OK;
	int i, mpsub_ev_entry_count = sizeof(mpsub_ev_entry_tbl) /
	    sizeof(mpsub_ev_entry_tbl[0]);

	/* bail if there's nothing to process */
	if (!mpts->mpts_evctl) {
		return ret;
	}

	/* Any of these fatal-looking events also triggers a failover
	 * attempt onto another subflow. */
	if (mpts->mpts_evctl & (SO_FILT_HINT_CONNRESET | SO_FILT_HINT_MUSTRST |
	    SO_FILT_HINT_CANTSENDMORE | SO_FILT_HINT_TIMEOUT |
	    SO_FILT_HINT_NOSRCADDR | SO_FILT_HINT_IFDENIED |
	    SO_FILT_HINT_DISCONNECTED)) {
		mpts->mpts_evctl |= SO_FILT_HINT_MPFAILOVER;
	}

	DTRACE_MPTCP3(subflow__events, struct mptses *, mpte,
	    struct mptsub *, mpts, uint32_t, mpts->mpts_evctl);

	/*
	 * Process all the socket filter hints and reset the hint
	 * once it is handled
	 */
	for (i = 0; i < mpsub_ev_entry_count && mpts->mpts_evctl; i++) {
		/*
		 * Always execute the DISCONNECTED event, because it will wakeup
		 * the app.
		 */
		if ((mpts->mpts_evctl & mpsub_ev_entry_tbl[i].sofilt_hint_mask) &&
		    (ret >= MPTS_EVRET_OK ||
		    mpsub_ev_entry_tbl[i].sofilt_hint_mask == SO_FILT_HINT_DISCONNECTED)) {
			mpts->mpts_evctl &= ~mpsub_ev_entry_tbl[i].sofilt_hint_mask;
			ev_ret_t error =
			    mpsub_ev_entry_tbl[i].sofilt_hint_ev_hdlr(mpte, mpts, p_mpsofilt_hint, mpsub_ev_entry_tbl[i].sofilt_hint_mask);
			/* Keep the largest non-error verdict; a verdict below
			 * MPTS_EVRET_OK is sticky and (per the gate above)
			 * suppresses all further handlers except DISCONNECTED.
			 */
			ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
		}
	}

	return ret;
}
3851 
3852 static ev_ret_t
mptcp_subflow_propagate_ev(struct mptses * mpte,struct mptsub * mpts,long * p_mpsofilt_hint,long event)3853 mptcp_subflow_propagate_ev(struct mptses *mpte, struct mptsub *mpts,
3854     long *p_mpsofilt_hint, long event)
3855 {
3856 	struct socket *mp_so, *so;
3857 	struct mptcb *mp_tp;
3858 
3859 	mp_so = mptetoso(mpte);
3860 	mp_tp = mpte->mpte_mptcb;
3861 	so = mpts->mpts_socket;
3862 
3863 	/*
3864 	 * We got an event for this subflow that might need to be propagated,
3865 	 * based on the state of the MPTCP connection.
3866 	 */
3867 	if (mp_tp->mpt_state < MPTCPS_ESTABLISHED ||
3868 	    (!(mp_tp->mpt_flags & MPTCPF_JOIN_READY) && !(mpts->mpts_flags & MPTSF_MP_READY)) ||
3869 	    ((mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) && (mpts->mpts_flags & MPTSF_ACTIVE))) {
3870 		mp_so->so_error = so->so_error;
3871 		*p_mpsofilt_hint |= event;
3872 	}
3873 
3874 	return MPTS_EVRET_OK;
3875 }
3876 
3877 /*
3878  * Handle SO_FILT_HINT_NOSRCADDR subflow socket event.
3879  */
3880 static ev_ret_t
mptcp_subflow_nosrcaddr_ev(struct mptses * mpte,struct mptsub * mpts,long * p_mpsofilt_hint,long event)3881 mptcp_subflow_nosrcaddr_ev(struct mptses *mpte, struct mptsub *mpts,
3882     long *p_mpsofilt_hint, long event)
3883 {
3884 #pragma unused(p_mpsofilt_hint, event)
3885 	struct socket *mp_so;
3886 	struct tcpcb *tp;
3887 
3888 	mp_so = mptetoso(mpte);
3889 	tp = intotcpcb(sotoinpcb(mpts->mpts_socket));
3890 
3891 	/*
3892 	 * This overwrites any previous mpte_lost_aid to avoid storing
3893 	 * too much state when the typical case has only two subflows.
3894 	 */
3895 	mpte->mpte_flags |= MPTE_SND_REM_ADDR;
3896 	mpte->mpte_lost_aid = tp->t_local_aid;
3897 
3898 	mptcplog((LOG_DEBUG, "%s cid %d\n", __func__, mpts->mpts_connid),
3899 	    MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
3900 
3901 	/*
3902 	 * The subflow connection has lost its source address.
3903 	 */
3904 	mptcp_subflow_abort(mpts, EADDRNOTAVAIL);
3905 
3906 	if (mp_so->so_flags & SOF_NOADDRAVAIL) {
3907 		mptcp_subflow_propagate_ev(mpte, mpts, p_mpsofilt_hint, event);
3908 	}
3909 
3910 	return MPTS_EVRET_DELETE;
3911 }
3912 
3913 static ev_ret_t
mptcp_subflow_mpsuberror_ev(struct mptses * mpte,struct mptsub * mpts,long * p_mpsofilt_hint,long event)3914 mptcp_subflow_mpsuberror_ev(struct mptses *mpte, struct mptsub *mpts,
3915     long *p_mpsofilt_hint, long event)
3916 {
3917 #pragma unused(event, p_mpsofilt_hint)
3918 	struct socket *so, *mp_so;
3919 
3920 	so = mpts->mpts_socket;
3921 
3922 	if (so->so_error != ENODATA) {
3923 		return MPTS_EVRET_OK;
3924 	}
3925 
3926 
3927 	mp_so = mptetoso(mpte);
3928 
3929 	mp_so->so_error = ENODATA;
3930 
3931 	sorwakeup(mp_so);
3932 	sowwakeup(mp_so);
3933 
3934 	return MPTS_EVRET_OK;
3935 }
3936 
3937 
3938 /*
3939  * Handle SO_FILT_HINT_MPCANTRCVMORE subflow socket event that
3940  * indicates that the remote side sent a Data FIN
3941  */
3942 static ev_ret_t
mptcp_subflow_mpcantrcvmore_ev(struct mptses * mpte,struct mptsub * mpts,long * p_mpsofilt_hint,long event)3943 mptcp_subflow_mpcantrcvmore_ev(struct mptses *mpte, struct mptsub *mpts,
3944     long *p_mpsofilt_hint, long event)
3945 {
3946 #pragma unused(event)
3947 	struct mptcb *mp_tp = mpte->mpte_mptcb;
3948 
3949 	mptcplog((LOG_DEBUG, "%s: cid %d\n", __func__, mpts->mpts_connid),
3950 	    MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
3951 
3952 	/*
3953 	 * We got a Data FIN for the MPTCP connection.
3954 	 * The FIN may arrive with data. The data is handed up to the
3955 	 * mptcp socket and the user is notified so that it may close
3956 	 * the socket if needed.
3957 	 */
3958 	if (mp_tp->mpt_state == MPTCPS_CLOSE_WAIT) {
3959 		*p_mpsofilt_hint |= SO_FILT_HINT_CANTRCVMORE;
3960 	}
3961 
3962 	return MPTS_EVRET_OK; /* keep the subflow socket around */
3963 }
3964 
3965 /*
3966  * Handle SO_FILT_HINT_MPFAILOVER subflow socket event
3967  */
static ev_ret_t
mptcp_subflow_failover_ev(struct mptses *mpte, struct mptsub *mpts,
    long *p_mpsofilt_hint, long event)
{
#pragma unused(event, p_mpsofilt_hint)
	struct mptsub *mpts_alt = NULL;
	struct socket *alt_so = NULL;
	struct socket *mp_so;
	int altpath_exists = 0;

	mp_so = mptetoso(mpte);
	os_log_info(mptcp_log_handle, "%s - %lx\n", __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));

	/* Queue this subflow's unacked data for retransmission elsewhere. */
	mptcp_reinject_mbufs(mpts->mpts_socket);

	/* Ask the scheduler for the best subflow to move traffic onto. */
	mpts_alt = mptcp_get_subflow(mpte, NULL);

	/* If there is no alternate eligible subflow, ignore the failover hint. */
	if (mpts_alt == NULL || mpts_alt == mpts) {
		os_log(mptcp_log_handle, "%s - %lx no alternate path\n", __func__,
		    (unsigned long)VM_KERNEL_ADDRPERM(mpte));

		goto done;
	}

	altpath_exists = 1;
	alt_so = mpts_alt->mpts_socket;
	if (mpts_alt->mpts_flags & MPTSF_FAILINGOVER) {
		/* All data acknowledged and no RTT spike */
		if (alt_so->so_snd.sb_cc == 0 && mptcp_no_rto_spike(alt_so)) {
			/* The candidate recovered; let it carry traffic again. */
			mpts_alt->mpts_flags &= ~MPTSF_FAILINGOVER;
		} else {
			/* no alternate path available */
			altpath_exists = 0;
		}
	}

	if (altpath_exists) {
		/* Hand the active role over to the alternate subflow. */
		mpts_alt->mpts_flags |= MPTSF_ACTIVE;

		mpte->mpte_active_sub = mpts_alt;
		mpts->mpts_flags |= MPTSF_FAILINGOVER;
		mpts->mpts_flags &= ~MPTSF_ACTIVE;

		os_log_info(mptcp_log_handle, "%s - %lx: switched from %d to %d\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpts->mpts_connid, mpts_alt->mpts_connid);

		mptcpstats_inc_switch(mpte, mpts);

		sowwakeup(alt_so);
	} else {
		mptcplog((LOG_DEBUG, "%s: no alt cid = %d\n", __func__,
		    mpts->mpts_connid),
		    MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
		/*
		 * NOTE: the 'done' label sits inside this else-branch on
		 * purpose — the no-alternate early exit above jumps here so
		 * the try-failover flag is always cleared.
		 */
done:
		mpts->mpts_socket->so_flags &= ~SOF_MP_TRYFAILOVER;
	}

	return MPTS_EVRET_OK;
}
4028 
4029 /*
4030  * Handle SO_FILT_HINT_IFDENIED subflow socket event.
4031  */
4032 static ev_ret_t
mptcp_subflow_ifdenied_ev(struct mptses * mpte,struct mptsub * mpts,long * p_mpsofilt_hint,long event)4033 mptcp_subflow_ifdenied_ev(struct mptses *mpte, struct mptsub *mpts,
4034     long *p_mpsofilt_hint, long event)
4035 {
4036 	mptcplog((LOG_DEBUG, "%s: cid %d\n", __func__,
4037 	    mpts->mpts_connid), MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
4038 
4039 	/*
4040 	 * The subflow connection cannot use the outgoing interface, let's
4041 	 * close this subflow.
4042 	 */
4043 	mptcp_subflow_abort(mpts, EPERM);
4044 
4045 	mptcp_subflow_propagate_ev(mpte, mpts, p_mpsofilt_hint, event);
4046 
4047 	return MPTS_EVRET_DELETE;
4048 }
4049 
4050 /*
4051  * https://tools.ietf.org/html/rfc6052#section-2
4052  * https://tools.ietf.org/html/rfc6147#section-5.2
4053  */
4054 static boolean_t
mptcp_desynthesize_ipv6_addr(struct mptses * mpte,const struct in6_addr * addr,const struct ipv6_prefix * prefix,struct in_addr * addrv4)4055 mptcp_desynthesize_ipv6_addr(struct mptses *mpte, const struct in6_addr *addr,
4056     const struct ipv6_prefix *prefix,
4057     struct in_addr *addrv4)
4058 {
4059 	char buf[MAX_IPv4_STR_LEN];
4060 	char *ptrv4 = (char *)addrv4;
4061 	const char *ptr = (const char *)addr;
4062 
4063 	if (memcmp(addr, &prefix->ipv6_prefix, prefix->prefix_len) != 0) {
4064 		return false;
4065 	}
4066 
4067 	switch (prefix->prefix_len) {
4068 	case NAT64_PREFIX_LEN_96:
4069 		memcpy(ptrv4, ptr + 12, 4);
4070 		break;
4071 	case NAT64_PREFIX_LEN_64:
4072 		memcpy(ptrv4, ptr + 9, 4);
4073 		break;
4074 	case NAT64_PREFIX_LEN_56:
4075 		memcpy(ptrv4, ptr + 7, 1);
4076 		memcpy(ptrv4 + 1, ptr + 9, 3);
4077 		break;
4078 	case NAT64_PREFIX_LEN_48:
4079 		memcpy(ptrv4, ptr + 6, 2);
4080 		memcpy(ptrv4 + 2, ptr + 9, 2);
4081 		break;
4082 	case NAT64_PREFIX_LEN_40:
4083 		memcpy(ptrv4, ptr + 5, 3);
4084 		memcpy(ptrv4 + 3, ptr + 9, 1);
4085 		break;
4086 	case NAT64_PREFIX_LEN_32:
4087 		memcpy(ptrv4, ptr + 4, 4);
4088 		break;
4089 	default:
4090 		panic("NAT64-prefix len is wrong: %u",
4091 		    prefix->prefix_len);
4092 	}
4093 
4094 	os_log_info(mptcp_log_handle, "%s - %lx: desynthesized to %s\n", __func__,
4095 	    (unsigned long)VM_KERNEL_ADDRPERM(mpte),
4096 	    inet_ntop(AF_INET, (void *)addrv4, buf, sizeof(buf)));
4097 
4098 	return true;
4099 }
4100 
/*
 * After an IPv6 subflow connected, check whether its destination is a
 * NAT64-synthesized address and, if so, record the recovered IPv4
 * destination for use by future subflows.
 */
static void
mptcp_handle_ipv6_connection(struct mptses *mpte, const struct mptsub *mpts)
{
	struct ipv6_prefix nat64prefixes[NAT64_MAX_NUM_PREFIXES];
	struct socket *so = mpts->mpts_socket;
	struct ifnet *ifp;
	int j;

	/* Subflow IPs will be steered directly by the server - no need to
	 * desynthesize.
	 */
	if (mpte->mpte_flags & MPTE_UNICAST_IP) {
		return;
	}

	ifp = sotoinpcb(so)->inp_last_outifp;

	/* No NAT64 prefixes on the egress interface: nothing to do. */
	if (ifnet_get_nat64prefix(ifp, nat64prefixes) == ENOENT) {
		return;
	}

	/* Try each configured prefix until one matches the destination. */
	for (j = 0; j < NAT64_MAX_NUM_PREFIXES; j++) {
		int success;

		/* Empty slot in the prefix table. */
		if (nat64prefixes[j].prefix_len == 0) {
			continue;
		}

		success = mptcp_desynthesize_ipv6_addr(mpte,
		    &mpte->__mpte_dst_v6.sin6_addr,
		    &nat64prefixes[j],
		    &mpte->mpte_sub_dst_v4.sin_addr);
		if (success) {
			/* Fill in the rest of the recovered IPv4 destination. */
			mpte->mpte_sub_dst_v4.sin_len = sizeof(mpte->mpte_sub_dst_v4);
			mpte->mpte_sub_dst_v4.sin_family = AF_INET;
			mpte->mpte_sub_dst_v4.sin_port = mpte->__mpte_dst_v6.sin6_port;

			/*
			 * We connected to a NAT64'ed address. Let's remove it
			 * from the potential IPs to use. Whenever we are back on
			 * that network and need to connect, we can synthesize again.
			 *
			 * Otherwise, on different IPv6 networks we will attempt
			 * to connect to that NAT64 address...
			 */
			memset(&mpte->mpte_sub_dst_v6, 0, sizeof(mpte->mpte_sub_dst_v6));
			break;
		}
	}
}
4151 
4152 static void
mptcp_try_alternate_port(struct mptses * mpte,struct mptsub * mpts)4153 mptcp_try_alternate_port(struct mptses *mpte, struct mptsub *mpts)
4154 {
4155 	struct inpcb *inp;
4156 
4157 	if (!mptcp_ok_to_create_subflows(mpte->mpte_mptcb)) {
4158 		return;
4159 	}
4160 
4161 	inp = sotoinpcb(mpts->mpts_socket);
4162 	if (inp == NULL) {
4163 		return;
4164 	}
4165 
4166 	/* Should we try the alternate port? */
4167 	if (mpte->mpte_alternate_port &&
4168 	    inp->inp_fport != mpte->mpte_alternate_port) {
4169 		union sockaddr_in_4_6 dst;
4170 		struct sockaddr_in *dst_in = (struct sockaddr_in *)&dst;
4171 
4172 		memcpy(&dst, &mpts->mpts_dst, mpts->mpts_dst.sa_len);
4173 
4174 		dst_in->sin_port = mpte->mpte_alternate_port;
4175 
4176 		mptcp_subflow_add(mpte, NULL, (struct sockaddr *)&dst,
4177 		    mpts->mpts_ifscope, NULL);
4178 	} else { /* Else, we tried all we could, mark this interface as non-MPTCP */
4179 		unsigned int i;
4180 
4181 		if (inp->inp_last_outifp == NULL) {
4182 			return;
4183 		}
4184 
4185 		for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
4186 			struct mpt_itf_info *info =  &mpte->mpte_itfinfo[i];
4187 
4188 			if (inp->inp_last_outifp->if_index == info->ifindex) {
4189 				info->no_mptcp_support = 1;
4190 				break;
4191 			}
4192 		}
4193 	}
4194 }
4195 
4196 /*
4197  * Handle SO_FILT_HINT_CONNECTED subflow socket event.
4198  */
static ev_ret_t
mptcp_subflow_connected_ev(struct mptses *mpte, struct mptsub *mpts,
    long *p_mpsofilt_hint, long event)
{
#pragma unused(event, p_mpsofilt_hint)
	struct socket *mp_so, *so;
	struct inpcb *inp;
	struct tcpcb *tp;
	struct mptcb *mp_tp;
	int af;
	boolean_t mpok = FALSE;

	mp_so = mptetoso(mpte);
	mp_tp = mpte->mpte_mptcb;
	so = mpts->mpts_socket;
	tp = sototcpcb(so);
	af = mpts->mpts_dst.sa_family;

	/* Already processed a connected event for this subflow. */
	if (mpts->mpts_flags & MPTSF_CONNECTED) {
		return MPTS_EVRET_OK;
	}

	/* The subflow was torn down before the connected event arrived. */
	if ((mpts->mpts_flags & MPTSF_DISCONNECTED) ||
	    (mpts->mpts_flags & MPTSF_DISCONNECTING)) {
		if (!(so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) &&
		    (so->so_state & SS_ISCONNECTED)) {
			mptcplog((LOG_DEBUG, "%s: cid %d disconnect before tcp connect\n",
			    __func__, mpts->mpts_connid),
			    MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
			(void) soshutdownlock(so, SHUT_RD);
			(void) soshutdownlock(so, SHUT_WR);
			(void) sodisconnectlocked(so);
		}
		return MPTS_EVRET_OK;
	}

	/*
	 * The subflow connection has been connected.  Find out whether it
	 * is connected as a regular TCP or as a MPTCP subflow.  The idea is:
	 *
	 *   a. If MPTCP connection is not yet established, then this must be
	 *	the first subflow connection.  If MPTCP failed to negotiate,
	 *	fallback to regular TCP by degrading this subflow.
	 *
	 *   b. If MPTCP connection has been established, then this must be
	 *	one of the subsequent subflow connections. If MPTCP failed
	 *	to negotiate, disconnect the connection.
	 *
	 * Right now, we simply unblock any waiters at the MPTCP socket layer
	 * if the MPTCP connection has not been established.
	 */

	if (so->so_state & SS_ISDISCONNECTED) {
		/*
		 * With MPTCP joins, a connection is connected at the subflow
		 * level, but the 4th ACK from the server elevates the MPTCP
		 * subflow to connected state. So there is a small window
		 * where the subflow could get disconnected before the
		 * connected event is processed.
		 */
		return MPTS_EVRET_OK;
	}

	/* Discard any TFO data that is no longer needed. */
	if (mpts->mpts_flags & MPTSF_TFO_REQD) {
		mptcp_drop_tfo_data(mpte, mpts);
	}

	mpts->mpts_flags &= ~(MPTSF_CONNECTING | MPTSF_TFO_REQD);
	mpts->mpts_flags |= MPTSF_CONNECTED;

	/* TCP-level negotiation decided whether this subflow is MPTCP. */
	if (tp->t_mpflags & TMPF_MPTCP_TRUE) {
		mpts->mpts_flags |= MPTSF_MP_CAPABLE;
	}

	tp->t_mpflags &= ~TMPF_TFO_REQUEST;

	/* get/verify the outbound interface */
	inp = sotoinpcb(so);

	mpts->mpts_maxseg = tp->t_maxseg;

	mptcplog((LOG_DEBUG, "%s: cid %d outif %s is %s\n", __func__, mpts->mpts_connid,
	    ((inp->inp_last_outifp != NULL) ? inp->inp_last_outifp->if_xname : "NULL"),
	    ((mpts->mpts_flags & MPTSF_MP_CAPABLE) ? "MPTCP capable" : "a regular TCP")),
	    (MPTCP_SOCKET_DBG | MPTCP_EVENTS_DBG), MPTCP_LOGLVL_LOG);

	mpok = (mpts->mpts_flags & MPTSF_MP_CAPABLE);

	if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
		/* First subflow: the MPTCP connection becomes established. */
		mp_tp->mpt_state = MPTCPS_ESTABLISHED;
		mpte->mpte_associd = mpts->mpts_connid;
		DTRACE_MPTCP2(state__change,
		    struct mptcb *, mp_tp,
		    uint32_t, 0 /* event */);

		/* Record the local address the first subflow bound to. */
		if (SOCK_DOM(so) == AF_INET) {
			in_getsockaddr_s(so, &mpte->__mpte_src_v4);
		} else {
			in6_getsockaddr_s(so, &mpte->__mpte_src_v6);
		}

		mpts->mpts_flags |= MPTSF_ACTIVE;

		/* case (a) above */
		if (!mpok) {
			/* MPTCP negotiation failed: fall back to plain TCP. */
			tcpstat.tcps_mpcap_fallback++;

			tp->t_mpflags |= TMPF_INFIN_SENT;
			mptcp_notify_mpfail(so);
		} else {
			/* Cellular subflows may need the backup-path flag. */
			if (IFNET_IS_CELLULAR(inp->inp_last_outifp) &&
			    mptcp_subflows_need_backup_flag(mpte)) {
				tp->t_mpflags |= (TMPF_BACKUP_PATH | TMPF_SND_MPPRIO);
			} else {
				mpts->mpts_flags |= MPTSF_PREFERRED;
			}
			mpts->mpts_flags |= MPTSF_MPCAP_CTRSET;
			mpte->mpte_nummpcapflows++;

			/* Possibly desynthesize a NAT64 destination. */
			if (SOCK_DOM(so) == AF_INET6) {
				mptcp_handle_ipv6_connection(mpte, mpts);
			}

			mptcp_check_subflows_and_add(mpte);

			if (IFNET_IS_CELLULAR(inp->inp_last_outifp)) {
				mpte->mpte_initial_cell = 1;
			}

			mpte->mpte_handshake_success = 1;
		}

		/* Seed the MPTCP-level send window from the subflow. */
		mp_tp->mpt_sndwnd = tp->snd_wnd;
		mp_tp->mpt_sndwl1 = mp_tp->mpt_rcvnxt;
		mp_tp->mpt_sndwl2 = mp_tp->mpt_snduna;
		soisconnected(mp_so);
	} else if (mpok) {
		/*
		 * case (b) above
		 * In case of additional flows, the MPTCP socket is not
		 * MPTSF_MP_CAPABLE until an ACK is received from server
		 * for 3-way handshake.  TCP would have guaranteed that this
		 * is an MPTCP subflow.
		 */
		if (IFNET_IS_CELLULAR(inp->inp_last_outifp) &&
		    !(tp->t_mpflags & TMPF_BACKUP_PATH) &&
		    mptcp_subflows_need_backup_flag(mpte)) {
			tp->t_mpflags |= (TMPF_BACKUP_PATH | TMPF_SND_MPPRIO);
			mpts->mpts_flags &= ~MPTSF_PREFERRED;
		} else {
			mpts->mpts_flags |= MPTSF_PREFERRED;
		}

		mpts->mpts_flags |= MPTSF_MPCAP_CTRSET;
		mpte->mpte_nummpcapflows++;

		mpts->mpts_rel_seq = 1;

		mptcp_check_subflows_and_remove(mpte);
	} else {
		/* Join failed: maybe retry on the alternate port. */
		mptcp_try_alternate_port(mpte, mpts);

		tcpstat.tcps_join_fallback++;
		if (IFNET_IS_CELLULAR(inp->inp_last_outifp)) {
			tcpstat.tcps_mptcp_cell_proxy++;
		} else {
			tcpstat.tcps_mptcp_wifi_proxy++;
		}

		/* This subflow is useless for MPTCP: reset it. */
		soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);

		return MPTS_EVRET_OK;
	}

	/* This call, just to "book" an entry in the stats-table for this ifindex */
	mptcpstats_get_index(mpte->mpte_itfstats, mpts);

	mptcp_output(mpte);

	return MPTS_EVRET_OK; /* keep the subflow socket around */
}
4380 
4381 /*
4382  * Handle SO_FILT_HINT_DISCONNECTED subflow socket event.
4383  */
static ev_ret_t
mptcp_subflow_disconnected_ev(struct mptses *mpte, struct mptsub *mpts,
    long *p_mpsofilt_hint, long event)
{
#pragma unused(event, p_mpsofilt_hint)
	struct socket *mp_so, *so;
	struct mptcb *mp_tp;

	mp_so = mptetoso(mpte);
	mp_tp = mpte->mpte_mptcb;
	so = mpts->mpts_socket;

	mptcplog((LOG_DEBUG, "%s: cid %d, so_err %d, mpt_state %u fallback %u active %u flags %#x\n",
	    __func__, mpts->mpts_connid, so->so_error, mp_tp->mpt_state,
	    !!(mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP),
	    !!(mpts->mpts_flags & MPTSF_ACTIVE), sototcpcb(so)->t_mpflags),
	    MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);

	/* Only process the disconnect once per subflow. */
	if (mpts->mpts_flags & MPTSF_DISCONNECTED) {
		return MPTS_EVRET_DELETE;
	}

	mpts->mpts_flags |= MPTSF_DISCONNECTED;

	/* The subflow connection has been disconnected. */

	if (mpts->mpts_flags & MPTSF_MPCAP_CTRSET) {
		/* Undo the MP-capable accounting done at connect time. */
		mpte->mpte_nummpcapflows--;
		if (mpte->mpte_active_sub == mpts) {
			mpte->mpte_active_sub = NULL;
			mptcplog((LOG_DEBUG, "%s: resetting active subflow \n",
			    __func__), MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
		}
		mpts->mpts_flags &= ~MPTSF_MPCAP_CTRSET;
	} else {
		/*
		 * A secondary subflow died before ever connecting: it may
		 * be worth retrying on the alternate port.
		 */
		if (so->so_flags & SOF_MP_SEC_SUBFLOW &&
		    !(mpts->mpts_flags & MPTSF_CONNECTED)) {
			mptcp_try_alternate_port(mpte, mpts);
		}
	}

	/*
	 * Losing this subflow kills the whole MPTCP connection when the
	 * connection never established, or when we had fallen back to TCP
	 * on the active subflow.
	 */
	if (mp_tp->mpt_state < MPTCPS_ESTABLISHED ||
	    ((mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) && (mpts->mpts_flags & MPTSF_ACTIVE))) {
		mptcp_drop(mpte, mp_tp, so->so_error);
	}

	/*
	 * Clear flags that are used by getconninfo to return state.
	 * Retain like MPTSF_DELETEOK for internal purposes.
	 */
	mpts->mpts_flags &= ~(MPTSF_CONNECTING | MPTSF_CONNECT_PENDING |
	    MPTSF_CONNECTED | MPTSF_DISCONNECTING | MPTSF_PREFERRED |
	    MPTSF_MP_CAPABLE | MPTSF_MP_READY | MPTSF_MP_DEGRADED | MPTSF_ACTIVE);

	return MPTS_EVRET_DELETE;
}
4440 
4441 /*
4442  * Handle SO_FILT_HINT_MPSTATUS subflow socket event
4443  */
static ev_ret_t
mptcp_subflow_mpstatus_ev(struct mptses *mpte, struct mptsub *mpts,
    long *p_mpsofilt_hint, long event)
{
#pragma unused(event, p_mpsofilt_hint)
	ev_ret_t ret = MPTS_EVRET_OK;
	struct socket *mp_so, *so;
	struct mptcb *mp_tp;

	mp_so = mptetoso(mpte);
	mp_tp = mpte->mpte_mptcb;
	so = mpts->mpts_socket;
	struct inpcb *inp = sotoinpcb(so);
	struct tcpcb *tp = intotcpcb(inp);

	/* Mirror the TCP-level MPTCP negotiation state into subflow flags. */
	if (sototcpcb(so)->t_mpflags & TMPF_MPTCP_TRUE) {
		mpts->mpts_flags |= MPTSF_MP_CAPABLE;
	} else {
		mpts->mpts_flags &= ~MPTSF_MP_CAPABLE;
	}

	if (sototcpcb(so)->t_mpflags & TMPF_TCP_FALLBACK) {
		/* Already marked degraded: nothing more to record. */
		if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
			goto done;
		}
		mpts->mpts_flags |= MPTSF_MP_DEGRADED;
	} else {
		mpts->mpts_flags &= ~MPTSF_MP_DEGRADED;
	}

	if (sototcpcb(so)->t_mpflags & TMPF_MPTCP_READY) {
		mpts->mpts_flags |= MPTSF_MP_READY;
	} else {
		mpts->mpts_flags &= ~MPTSF_MP_READY;
	}

	if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
		/* First fallback on this subflow degrades the whole session. */
		mp_tp->mpt_flags |= MPTCPF_FALLBACK_TO_TCP;
		mp_tp->mpt_flags &= ~MPTCPF_JOIN_READY;
		tcp_cache_update_mptcp_version(tp, FALSE);
	}

	if (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) {
		ret = MPTS_EVRET_DISCONNECT_FALLBACK;

		/* After fallback, cross-subflow reinjection no longer applies. */
		m_freem_list(mpte->mpte_reinjectq);
		mpte->mpte_reinjectq = NULL;
	} else if (mpts->mpts_flags & MPTSF_MP_READY) {
		mp_tp->mpt_flags |= MPTCPF_JOIN_READY;
		ret = MPTS_EVRET_CONNECT_PENDING;
	}

done:
	return ret;
}
4499 
4500 /*
4501  * Handle SO_FILT_HINT_MUSTRST subflow socket event
4502  */
static ev_ret_t
mptcp_subflow_mustrst_ev(struct mptses *mpte, struct mptsub *mpts,
    long *p_mpsofilt_hint, long event)
{
#pragma unused(event)
	struct socket *mp_so, *so;
	struct mptcb *mp_tp;
	boolean_t is_fastclose;

	mp_so = mptetoso(mpte);
	mp_tp = mpte->mpte_mptcb;
	so = mpts->mpts_socket;

	/* We got an invalid option or a fast close */
	struct inpcb *inp = sotoinpcb(so);
	struct tcpcb *tp = NULL;

	tp = intotcpcb(inp);
	so->so_error = ECONNABORTED;

	is_fastclose = !!(tp->t_mpflags & TMPF_FASTCLOSERCV);

	/* Request a TCP RST on this subflow. */
	tp->t_mpflags |= TMPF_RESET;

	if (tp->t_state != TCPS_CLOSED) {
		/* Actively send the RST ourselves while the PCB still exists. */
		struct tcptemp *t_template = tcp_maketemplate(tp);

		if (t_template) {
			struct tcp_respond_args tra;

			bzero(&tra, sizeof(tra));
			/* Scope the response to the bound interface, if any. */
			if (inp->inp_flags & INP_BOUND_IF) {
				tra.ifscope = inp->inp_boundifp->if_index;
			} else {
				tra.ifscope = IFSCOPE_NONE;
			}
			tra.awdl_unrestricted = 1;

			tcp_respond(tp, t_template->tt_ipgen,
			    &t_template->tt_t, (struct mbuf *)NULL,
			    tp->rcv_nxt, tp->snd_una, TH_RST, &tra);
			(void) m_free(dtom(t_template));
		}
	}

	if (!(mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) && is_fastclose) {
		/*
		 * An MP_FASTCLOSE was received on a true MPTCP connection:
		 * the whole connection is reset, so abort every other
		 * subflow and surface ECONNRESET to the application.
		 */
		struct mptsub *iter, *tmp;

		*p_mpsofilt_hint |= SO_FILT_HINT_CONNRESET;

		mp_so->so_error = ECONNRESET;

		TAILQ_FOREACH_SAFE(iter, &mpte->mpte_subflows, mpts_entry, tmp) {
			if (iter == mpts) {
				continue;
			}
			mptcp_subflow_abort(iter, ECONNABORTED);
		}

		/*
		 * mptcp_drop is being called after processing the events, to fully
		 * close the MPTCP connection
		 */
		mptcp_drop(mpte, mp_tp, mp_so->so_error);
	}

	mptcp_subflow_abort(mpts, ECONNABORTED);

	/* Speed up garbage collection of this connection. */
	if (mp_tp->mpt_gc_ticks == MPT_GC_TICKS) {
		mp_tp->mpt_gc_ticks = MPT_GC_TICKS_FAST;
	}

	return MPTS_EVRET_DELETE;
}
4577 
4578 static ev_ret_t
mptcp_subflow_adaptive_rtimo_ev(struct mptses * mpte,struct mptsub * mpts,long * p_mpsofilt_hint,long event)4579 mptcp_subflow_adaptive_rtimo_ev(struct mptses *mpte, struct mptsub *mpts,
4580     long *p_mpsofilt_hint, long event)
4581 {
4582 #pragma unused(event)
4583 	bool found_active = false;
4584 
4585 	mpts->mpts_flags |= MPTSF_READ_STALL;
4586 
4587 	TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
4588 		struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
4589 
4590 		if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
4591 		    TCPS_HAVERCVDFIN2(tp->t_state)) {
4592 			continue;
4593 		}
4594 
4595 		if (!(mpts->mpts_flags & MPTSF_READ_STALL)) {
4596 			found_active = true;
4597 			break;
4598 		}
4599 	}
4600 
4601 	if (!found_active) {
4602 		*p_mpsofilt_hint |= SO_FILT_HINT_ADAPTIVE_RTIMO;
4603 	}
4604 
4605 	return MPTS_EVRET_OK;
4606 }
4607 
4608 static ev_ret_t
mptcp_subflow_adaptive_wtimo_ev(struct mptses * mpte,struct mptsub * mpts,long * p_mpsofilt_hint,long event)4609 mptcp_subflow_adaptive_wtimo_ev(struct mptses *mpte, struct mptsub *mpts,
4610     long *p_mpsofilt_hint, long event)
4611 {
4612 #pragma unused(event)
4613 	bool found_active = false;
4614 
4615 	mpts->mpts_flags |= MPTSF_WRITE_STALL;
4616 
4617 	TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
4618 		struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
4619 
4620 		if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
4621 		    tp->t_state > TCPS_CLOSE_WAIT) {
4622 			continue;
4623 		}
4624 
4625 		if (!(mpts->mpts_flags & MPTSF_WRITE_STALL)) {
4626 			found_active = true;
4627 			break;
4628 		}
4629 	}
4630 
4631 	if (!found_active) {
4632 		*p_mpsofilt_hint |= SO_FILT_HINT_ADAPTIVE_WTIMO;
4633 	}
4634 
4635 	return MPTS_EVRET_OK;
4636 }
4637 
4638 /*
4639  * Issues SOPT_SET on an MPTCP subflow socket; socket must already be locked,
4640  * caller must ensure that the option can be issued on subflow sockets, via
4641  * MPOF_SUBFLOW_OK flag.
4642  */
int
mptcp_subflow_sosetopt(struct mptses *mpte, struct mptsub *mpts, struct mptopt *mpo)
{
	struct socket *mp_so, *so;
	struct sockopt sopt;
	int error;

	VERIFY(mpo->mpo_flags & MPOF_SUBFLOW_OK);

	mp_so = mptetoso(mpte);
	so = mpts->mpts_socket;

	socket_lock_assert_owned(mp_so);

	/*
	 * SO_MARK_CELLFALLBACK on an established connection gets special
	 * treatment: only actually set it when this subflow really is a
	 * cell-fallback (and the app is not first-party).
	 */
	if (mpte->mpte_mptcb->mpt_state >= MPTCPS_ESTABLISHED &&
	    mpo->mpo_level == SOL_SOCKET &&
	    mpo->mpo_name == SO_MARK_CELLFALLBACK) {
		struct ifnet *ifp = ifindex2ifnet[mpts->mpts_ifscope];

		mptcplog((LOG_DEBUG, "%s Setting CELL_FALLBACK, mpte_flags %#x, svctype %u wifi unusable %d lastcell? %d boundcell? %d\n",
		    __func__, mpte->mpte_flags, mpte->mpte_svctype, mptcp_is_wifi_unusable_for_session(mpte),
		    sotoinpcb(so)->inp_last_outifp ? IFNET_IS_CELLULAR(sotoinpcb(so)->inp_last_outifp) : -1,
		    mpts->mpts_ifscope != IFSCOPE_NONE && ifp ? IFNET_IS_CELLULAR(ifp) : -1),
		    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);

		/*
		 * When we open a new subflow, mark it as cell fallback, if
		 * this subflow goes over cell.
		 *
		 * (except for first-party apps)
		 */

		if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
			return 0;
		}

		/* Output interface known and not cellular: no fallback. */
		if (sotoinpcb(so)->inp_last_outifp &&
		    !IFNET_IS_CELLULAR(sotoinpcb(so)->inp_last_outifp)) {
			return 0;
		}

		/*
		 * This here is an OR, because if the app is not binding to the
		 * interface, then it definitely is not a cell-fallback
		 * connection.
		 */
		if (mpts->mpts_ifscope == IFSCOPE_NONE || ifp == NULL ||
		    !IFNET_IS_CELLULAR(ifp)) {
			return 0;
		}
	}

	/* The option is being applied now; it is no longer pending. */
	mpo->mpo_flags &= ~MPOF_INTERIM;

	/* Build a kernel-sourced SOPT_SET request for an int-sized value. */
	bzero(&sopt, sizeof(sopt));
	sopt.sopt_dir = SOPT_SET;
	sopt.sopt_level = mpo->mpo_level;
	sopt.sopt_name = mpo->mpo_name;
	sopt.sopt_val = CAST_USER_ADDR_T(&mpo->mpo_intval);
	sopt.sopt_valsize = sizeof(int);
	sopt.sopt_p = kernproc;

	error = sosetoptlock(so, &sopt, 0);
	if (error) {
		os_log_error(mptcp_log_handle, "%s - %lx: sopt %s "
		    "val %d set error %d\n", __func__,
		    (unsigned long)VM_KERNEL_ADDRPERM(mpte),
		    mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name),
		    mpo->mpo_intval, error);
	}
	return error;
}
4715 
4716 /*
4717  * Issues SOPT_GET on an MPTCP subflow socket; socket must already be locked,
4718  * caller must ensure that the option can be issued on subflow sockets, via
4719  * MPOF_SUBFLOW_OK flag.
4720  */
4721 int
mptcp_subflow_sogetopt(struct mptses * mpte,struct socket * so,struct mptopt * mpo)4722 mptcp_subflow_sogetopt(struct mptses *mpte, struct socket *so,
4723     struct mptopt *mpo)
4724 {
4725 	struct socket *mp_so;
4726 	struct sockopt sopt;
4727 	int error;
4728 
4729 	VERIFY(mpo->mpo_flags & MPOF_SUBFLOW_OK);
4730 	mp_so = mptetoso(mpte);
4731 
4732 	socket_lock_assert_owned(mp_so);
4733 
4734 	bzero(&sopt, sizeof(sopt));
4735 	sopt.sopt_dir = SOPT_GET;
4736 	sopt.sopt_level = mpo->mpo_level;
4737 	sopt.sopt_name = mpo->mpo_name;
4738 	sopt.sopt_val = CAST_USER_ADDR_T(&mpo->mpo_intval);
4739 	sopt.sopt_valsize = sizeof(int);
4740 	sopt.sopt_p = kernproc;
4741 
4742 	error = sogetoptlock(so, &sopt, 0);     /* already locked */
4743 	if (error) {
4744 		os_log_error(mptcp_log_handle,
4745 		    "%s - %lx: sopt %s get error %d\n",
4746 		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
4747 		    mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name), error);
4748 	}
4749 	return error;
4750 }
4751 
4752 
4753 /*
4754  * MPTCP garbage collector.
4755  *
4756  * This routine is called by the MP domain on-demand, periodic callout,
4757  * which is triggered when a MPTCP socket is closed.  The callout will
4758  * repeat as long as this routine returns a non-zero value.
4759  */
static uint32_t
mptcp_gc(struct mppcbinfo *mppi)
{
	struct mppcb *mpp, *tmpp;
	/* number of PCBs still alive; non-zero re-arms the callout */
	uint32_t active = 0;

	LCK_MTX_ASSERT(&mppi->mppi_lock, LCK_MTX_ASSERT_OWNED);

	TAILQ_FOREACH_SAFE(mpp, &mppi->mppi_pcbs, mpp_entry, tmpp) {
		struct socket *mp_so;
		struct mptses *mpte;
		struct mptcb *mp_tp;

		mp_so = mpp->mpp_socket;
		mpte = mptompte(mpp);
		mp_tp = mpte->mpte_mptcb;

		/* Busy PCB: revisit on the next GC pass. */
		if (!mpp_try_lock(mpp)) {
			active++;
			continue;
		}

		VERIFY(mpp->mpp_flags & MPP_ATTACHED);

		/* check again under the lock */
		if (mp_so->so_usecount > 0) {
			boolean_t wakeup = FALSE;
			struct mptsub *mpts, *tmpts;

			/* Count down the grace period once closing began. */
			if (mp_tp->mpt_state >= MPTCPS_FIN_WAIT_1) {
				if (mp_tp->mpt_gc_ticks > 0) {
					mp_tp->mpt_gc_ticks--;
				}
				if (mp_tp->mpt_gc_ticks == 0) {
					wakeup = TRUE;
				}
			}
			/* Grace period over: force DISCONNECTED on subflows. */
			if (wakeup) {
				TAILQ_FOREACH_SAFE(mpts,
				    &mpte->mpte_subflows, mpts_entry, tmpts) {
					mptcp_subflow_eupcall1(mpts->mpts_socket,
					    mpts, SO_FILT_HINT_DISCONNECTED);
				}
			}
			socket_unlock(mp_so, 0);
			active++;
			continue;
		}

		if (mpp->mpp_state != MPPCB_STATE_DEAD) {
			panic("%s - %lx: skipped state "
			    "[u=%d,r=%d,s=%d]\n", __func__,
			    (unsigned long)VM_KERNEL_ADDRPERM(mpte),
			    mp_so->so_usecount, mp_so->so_retaincnt,
			    mpp->mpp_state);
		}

		if (mp_tp->mpt_state == MPTCPS_TIME_WAIT) {
			mptcp_close(mpte, mp_tp);
		}

		/* No references left: tear everything down. */
		mptcp_session_destroy(mpte);

		DTRACE_MPTCP4(dispose, struct socket *, mp_so,
		    struct sockbuf *, &mp_so->so_rcv,
		    struct sockbuf *, &mp_so->so_snd,
		    struct mppcb *, mpp);

		mptcp_pcbdispose(mpp);
		sodealloc(mp_so);
	}

	return active;
}
4834 
4835 /*
4836  * Drop a MPTCP connection, reporting the specified error.
4837  */
4838 struct mptses *
mptcp_drop(struct mptses * mpte,struct mptcb * mp_tp,u_short errno)4839 mptcp_drop(struct mptses *mpte, struct mptcb *mp_tp, u_short errno)
4840 {
4841 	struct socket *mp_so = mptetoso(mpte);
4842 
4843 	VERIFY(mpte->mpte_mptcb == mp_tp);
4844 
4845 	socket_lock_assert_owned(mp_so);
4846 
4847 	DTRACE_MPTCP2(state__change, struct mptcb *, mp_tp,
4848 	    uint32_t, 0 /* event */);
4849 
4850 	if (errno == ETIMEDOUT && mp_tp->mpt_softerror != 0) {
4851 		errno = mp_tp->mpt_softerror;
4852 	}
4853 	mp_so->so_error = errno;
4854 
4855 	return mptcp_close(mpte, mp_tp);
4856 }
4857 
4858 /*
4859  * Close a MPTCP control block.
4860  */
4861 struct mptses *
mptcp_close(struct mptses * mpte,struct mptcb * mp_tp)4862 mptcp_close(struct mptses *mpte, struct mptcb *mp_tp)
4863 {
4864 	struct mptsub *mpts = NULL, *tmpts = NULL;
4865 	struct socket *mp_so = mptetoso(mpte);
4866 
4867 	socket_lock_assert_owned(mp_so);
4868 	VERIFY(mpte->mpte_mptcb == mp_tp);
4869 
4870 	mp_tp->mpt_state = MPTCPS_TERMINATE;
4871 
4872 	mptcp_freeq(mp_tp);
4873 
4874 	soisdisconnected(mp_so);
4875 
4876 	/* Clean up all subflows */
4877 	TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
4878 		mptcp_subflow_disconnect(mpte, mpts);
4879 	}
4880 
4881 	return NULL;
4882 }
4883 
/* Post a DISCONNECTED event on the given socket (lock already held). */
void
mptcp_notify_close(struct socket *so)
{
	soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_DISCONNECTED));
}
4889 
/*
 * MPTCP workloop.
 *
 * Drains pending events on every subflow and reacts to the aggregate
 * outcome (delete subflow, pending joins, fallback to TCP).  Re-entrant
 * invocations are coalesced via MPTE_IN_WORKLOOP/MPTE_WORKLOOP_RELAUNCH:
 * a caller who finds the loop already running just requests a relaunch.
 */
void
mptcp_subflow_workloop(struct mptses *mpte)
{
	boolean_t connect_pending = FALSE, disconnect_fallback = FALSE;
	long mpsofilt_hint_mask = SO_FILT_HINT_LOCKED;
	struct mptsub *mpts, *tmpts;
	struct socket *mp_so;

	mp_so = mptetoso(mpte);

	socket_lock_assert_owned(mp_so);

	/* Already running: flag a relaunch and let that instance loop again. */
	if (mpte->mpte_flags & MPTE_IN_WORKLOOP) {
		mpte->mpte_flags |= MPTE_WORKLOOP_RELAUNCH;
		return;
	}
	mpte->mpte_flags |= MPTE_IN_WORKLOOP;

relaunch:
	mpte->mpte_flags &= ~MPTE_WORKLOOP_RELAUNCH;

	TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
		ev_ret_t ret;

		if (mpts->mpts_socket->so_usecount == 0) {
			/* Will be removed soon by tcp_garbage_collect */
			continue;
		}

		/* Hold the subflow and its socket across event processing. */
		mptcp_subflow_addref(mpts);
		mpts->mpts_socket->so_usecount++;

		ret = mptcp_subflow_events(mpte, mpts, &mpsofilt_hint_mask);

		/*
		 * If MPTCP socket is closed, disconnect all subflows.
		 * This will generate a disconnect event which will
		 * be handled during the next iteration, causing a
		 * non-zero error to be returned above.
		 */
		if (mp_so->so_flags & SOF_PCBCLEARING) {
			mptcp_subflow_disconnect(mpte, mpts);
		}

		switch (ret) {
		case MPTS_EVRET_OK:
			/* nothing to do */
			break;
		case MPTS_EVRET_DELETE:
			mptcp_subflow_soclose(mpts);
			break;
		case MPTS_EVRET_CONNECT_PENDING:
			connect_pending = TRUE;
			break;
		case MPTS_EVRET_DISCONNECT_FALLBACK:
			disconnect_fallback = TRUE;
			break;
		default:
			mptcplog((LOG_DEBUG,
			    "MPTCP Socket: %s: mptcp_subflow_events "
			    "returned invalid value: %d\n", __func__,
			    ret),
			    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
			break;
		}
		mptcp_subflow_remref(mpts);             /* ours */

		VERIFY(mpts->mpts_socket->so_usecount != 0);
		mpts->mpts_socket->so_usecount--;
	}

	/* Propagate any accumulated filter hints up to the MP socket. */
	if (mpsofilt_hint_mask != SO_FILT_HINT_LOCKED) {
		VERIFY(mpsofilt_hint_mask & SO_FILT_HINT_LOCKED);

		if (mpsofilt_hint_mask & SO_FILT_HINT_CANTRCVMORE) {
			mp_so->so_state |= SS_CANTRCVMORE;
			sorwakeup(mp_so);
		}

		soevent(mp_so, mpsofilt_hint_mask);
	}

	if (!connect_pending && !disconnect_fallback) {
		goto exit;
	}

	/* Second pass: apply fallback / pending-join decisions per subflow. */
	TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
		if (disconnect_fallback) {
			struct socket *so = NULL;
			struct inpcb *inp = NULL;
			struct tcpcb *tp = NULL;

			if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
				continue;
			}

			mpts->mpts_flags |= MPTSF_MP_DEGRADED;

			if (mpts->mpts_flags & (MPTSF_DISCONNECTING |
			    MPTSF_DISCONNECTED)) {
				continue;
			}

			so = mpts->mpts_socket;

			/*
			 * The MPTCP connection has degraded to a fallback
			 * mode, so there is no point in keeping this subflow
			 * regardless of its MPTCP-readiness state, unless it
			 * is the primary one which we use for fallback.  This
			 * assumes that the subflow used for fallback is the
			 * ACTIVE one.
			 */

			inp = sotoinpcb(so);
			tp = intotcpcb(inp);
			tp->t_mpflags &=
			    ~(TMPF_MPTCP_READY | TMPF_MPTCP_TRUE);
			tp->t_mpflags |= TMPF_TCP_FALLBACK;

			soevent(so, SO_FILT_HINT_MUSTRST);
		} else if (connect_pending) {
			/*
			 * The MPTCP connection has progressed to a state
			 * where it supports full multipath semantics; allow
			 * additional joins to be attempted for all subflows
			 * that are in the PENDING state.
			 */
			if (mpts->mpts_flags & MPTSF_CONNECT_PENDING) {
				int error = mptcp_subflow_soconnectx(mpte, mpts);

				if (error) {
					mptcp_subflow_abort(mpts, error);
				}
			}
		}
	}

exit:
	/* Someone asked for another pass while we were busy: loop again. */
	if (mpte->mpte_flags & MPTE_WORKLOOP_RELAUNCH) {
		goto relaunch;
	}

	mpte->mpte_flags &= ~MPTE_IN_WORKLOOP;
}
5038 
/*
 * Protocol pr_lock callback.
 */
int
mptcp_lock(struct socket *mp_so, int refcount, void *lr)
{
	struct mppcb *mpp = mpsotomppcb(mp_so);
	void *lr_saved;

	/* Capture the caller's return address for lock-history debugging. */
	if (lr == NULL) {
		lr_saved = __builtin_return_address(0);
	} else {
		lr_saved = lr;
	}

	if (mpp == NULL) {
		panic("%s: so=%p NO PCB! lr=%p lrh= %s", __func__,
		    mp_so, lr_saved, solockhistory_nr(mp_so));
		/* NOTREACHED */
	}
	mpp_lock(mpp);

	if (mp_so->so_usecount < 0) {
		panic("%s: so=%p so_pcb=%p lr=%p ref=%x lrh= %s", __func__,
		    mp_so, mp_so->so_pcb, lr_saved, mp_so->so_usecount,
		    solockhistory_nr(mp_so));
		/* NOTREACHED */
	}
	/* Optionally take a use-count reference along with the lock. */
	if (refcount != 0) {
		mp_so->so_usecount++;
		mpp->mpp_inside++;
	}
	/* Record this acquisition in the socket's lock-history ring. */
	mp_so->lock_lr[mp_so->next_lock_lr] = lr_saved;
	mp_so->next_lock_lr = (mp_so->next_lock_lr + 1) % SO_LCKDBG_MAX;

	return 0;
}
5076 
/*
 * Protocol pr_unlock callback.
 */
int
mptcp_unlock(struct socket *mp_so, int refcount, void *lr)
{
	struct mppcb *mpp = mpsotomppcb(mp_so);
	void *lr_saved;

	/* Capture the caller's return address for lock-history debugging. */
	if (lr == NULL) {
		lr_saved = __builtin_return_address(0);
	} else {
		lr_saved = lr;
	}

	if (mpp == NULL) {
		panic("%s: so=%p NO PCB usecount=%x lr=%p lrh= %s", __func__,
		    mp_so, mp_so->so_usecount, lr_saved,
		    solockhistory_nr(mp_so));
		/* NOTREACHED */
	}
	socket_lock_assert_owned(mp_so);

	/* Drop the use-count reference taken by the matching lock call. */
	if (refcount != 0) {
		mp_so->so_usecount--;
		mpp->mpp_inside--;
	}

	/* Both counters must stay non-negative; underflow means imbalance. */
	if (mp_so->so_usecount < 0) {
		panic("%s: so=%p usecount=%x lrh= %s", __func__,
		    mp_so, mp_so->so_usecount, solockhistory_nr(mp_so));
		/* NOTREACHED */
	}
	if (mpp->mpp_inside < 0) {
		panic("%s: mpp=%p inside=%x lrh= %s", __func__,
		    mpp, mpp->mpp_inside, solockhistory_nr(mp_so));
		/* NOTREACHED */
	}
	/* Record this release in the socket's unlock-history ring. */
	mp_so->unlock_lr[mp_so->next_unlock_lr] = lr_saved;
	mp_so->next_unlock_lr = (mp_so->next_unlock_lr + 1) % SO_LCKDBG_MAX;
	mpp_unlock(mpp);

	return 0;
}
5121 
5122 /*
5123  * Protocol pr_getlock callback.
5124  */
5125 lck_mtx_t *
mptcp_getlock(struct socket * mp_so,int flags)5126 mptcp_getlock(struct socket *mp_so, int flags)
5127 {
5128 	struct mppcb *mpp = mpsotomppcb(mp_so);
5129 
5130 	if (mpp == NULL) {
5131 		panic("%s: so=%p NULL so_pcb %s", __func__, mp_so,
5132 		    solockhistory_nr(mp_so));
5133 		/* NOTREACHED */
5134 	}
5135 	if (mp_so->so_usecount < 0) {
5136 		panic("%s: so=%p usecount=%x lrh= %s", __func__,
5137 		    mp_so, mp_so->so_usecount, solockhistory_nr(mp_so));
5138 		/* NOTREACHED */
5139 	}
5140 	return mpp_getlock(mpp, flags);
5141 }
5142 
5143 /*
5144  * MPTCP Join support
5145  */
5146 
5147 static void
mptcp_attach_to_subf(struct socket * so,struct mptcb * mp_tp,uint8_t addr_id)5148 mptcp_attach_to_subf(struct socket *so, struct mptcb *mp_tp, uint8_t addr_id)
5149 {
5150 	struct tcpcb *tp = sototcpcb(so);
5151 	struct mptcp_subf_auth_entry *sauth_entry;
5152 
5153 	/*
5154 	 * The address ID of the first flow is implicitly 0.
5155 	 */
5156 	if (mp_tp->mpt_state == MPTCPS_CLOSED) {
5157 		tp->t_local_aid = 0;
5158 	} else {
5159 		tp->t_local_aid = addr_id;
5160 		tp->t_mpflags |= (TMPF_PREESTABLISHED | TMPF_JOINED_FLOW);
5161 		so->so_flags |= SOF_MP_SEC_SUBFLOW;
5162 	}
5163 	sauth_entry = zalloc(mpt_subauth_zone);
5164 	sauth_entry->msae_laddr_id = tp->t_local_aid;
5165 	sauth_entry->msae_raddr_id = 0;
5166 	sauth_entry->msae_raddr_rand = 0;
5167 try_again:
5168 	sauth_entry->msae_laddr_rand = RandomULong();
5169 	if (sauth_entry->msae_laddr_rand == 0) {
5170 		goto try_again;
5171 	}
5172 	LIST_INSERT_HEAD(&mp_tp->mpt_subauth_list, sauth_entry, msae_next);
5173 }
5174 
5175 static void
mptcp_detach_mptcb_from_subf(struct mptcb * mp_tp,struct socket * so)5176 mptcp_detach_mptcb_from_subf(struct mptcb *mp_tp, struct socket *so)
5177 {
5178 	struct mptcp_subf_auth_entry *sauth_entry;
5179 	struct tcpcb *tp = NULL;
5180 	int found = 0;
5181 
5182 	tp = sototcpcb(so);
5183 	if (tp == NULL) {
5184 		return;
5185 	}
5186 
5187 	LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
5188 		if (sauth_entry->msae_laddr_id == tp->t_local_aid) {
5189 			found = 1;
5190 			break;
5191 		}
5192 	}
5193 	if (found) {
5194 		LIST_REMOVE(sauth_entry, msae_next);
5195 	}
5196 
5197 	if (found) {
5198 		zfree(mpt_subauth_zone, sauth_entry);
5199 	}
5200 }
5201 
5202 void
mptcp_get_rands(mptcp_addr_id addr_id,struct mptcb * mp_tp,u_int32_t * lrand,u_int32_t * rrand)5203 mptcp_get_rands(mptcp_addr_id addr_id, struct mptcb *mp_tp, u_int32_t *lrand,
5204     u_int32_t *rrand)
5205 {
5206 	struct mptcp_subf_auth_entry *sauth_entry;
5207 
5208 	LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
5209 		if (sauth_entry->msae_laddr_id == addr_id) {
5210 			if (lrand) {
5211 				*lrand = sauth_entry->msae_laddr_rand;
5212 			}
5213 			if (rrand) {
5214 				*rrand = sauth_entry->msae_raddr_rand;
5215 			}
5216 			break;
5217 		}
5218 	}
5219 }
5220 
/*
 * Record the peer's address ID and random value (learned from the
 * MP_JOIN handshake) in the auth entry matching the local address ID.
 * Mismatching duplicates are logged and ignored.
 */
void
mptcp_set_raddr_rand(mptcp_addr_id laddr_id, struct mptcb *mp_tp,
    mptcp_addr_id raddr_id, u_int32_t raddr_rand)
{
	struct mptcp_subf_auth_entry *sauth_entry;

	LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
		if (sauth_entry->msae_laddr_id == laddr_id) {
			/* A different remote ID was already recorded: bail. */
			if ((sauth_entry->msae_raddr_id != 0) &&
			    (sauth_entry->msae_raddr_id != raddr_id)) {
				os_log_error(mptcp_log_handle, "%s - %lx: mismatched"
				    " address ids %d %d \n", __func__, (unsigned long)VM_KERNEL_ADDRPERM(mp_tp->mpt_mpte),
				    raddr_id, sauth_entry->msae_raddr_id);
				return;
			}
			sauth_entry->msae_raddr_id = raddr_id;
			/* Duplicate SYN/ACK with a different random: ignore. */
			if ((sauth_entry->msae_raddr_rand != 0) &&
			    (sauth_entry->msae_raddr_rand != raddr_rand)) {
				os_log_error(mptcp_log_handle, "%s - %lx: "
				    "dup SYN_ACK %d %d \n",
				    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mp_tp->mpt_mpte),
				    raddr_rand, sauth_entry->msae_raddr_rand);
				return;
			}
			sauth_entry->msae_raddr_rand = raddr_rand;
			return;
		}
	}
}
5250 
5251 /*
5252  * SHA-256 support for MPTCP
5253  */
5254 
5255 static void
mptcp_do_sha256(mptcp_key_t * key,char * sha_digest)5256 mptcp_do_sha256(mptcp_key_t *key, char *sha_digest)
5257 {
5258 	const unsigned char *sha2_base;
5259 	int sha2_size;
5260 
5261 	sha2_base = (const unsigned char *) key;
5262 	sha2_size = sizeof(mptcp_key_t);
5263 
5264 	SHA256_CTX sha_ctx;
5265 	SHA256_Init(&sha_ctx);
5266 	SHA256_Update(&sha_ctx, sha2_base, sha2_size);
5267 	SHA256_Final(sha_digest, &sha_ctx);
5268 }
5269 
/*
 * HMAC-SHA256 over msg, keyed with the concatenation of key1 and key2
 * (RFC 2104 construction).  digest must hold SHA256_DIGEST_LENGTH bytes.
 */
void
mptcp_hmac_sha256(mptcp_key_t key1, mptcp_key_t key2,
    u_char *msg, uint16_t msg_len, u_char *digest)
{
	SHA256_CTX sha_ctx;
	mptcp_key_t key_ipad[8] = {0}; /* key XOR'd with inner pad */
	mptcp_key_t key_opad[8] = {0}; /* key XOR'd with outer pad */
	int i;

	bzero(digest, SHA256_DIGEST_LENGTH);

	/* Set up the Key for HMAC */
	key_ipad[0] = key1;
	key_ipad[1] = key2;

	key_opad[0] = key1;
	key_opad[1] = key2;

	/* Key is shorter than the 512-bit block, so no need to hash it first */

	/* Compute SHA256(Key XOR opad || SHA256(Key XOR ipad || msg)) */

	for (i = 0; i < 8; i++) {
		key_ipad[i] ^= 0x3636363636363636;
		key_opad[i] ^= 0x5c5c5c5c5c5c5c5c;
	}

	/* Perform inner SHA256 */
	SHA256_Init(&sha_ctx);
	SHA256_Update(&sha_ctx, (unsigned char *)key_ipad, sizeof(key_ipad));
	SHA256_Update(&sha_ctx, msg, msg_len);
	SHA256_Final(digest, &sha_ctx);

	/* Perform outer SHA256 */
	SHA256_Init(&sha_ctx);
	SHA256_Update(&sha_ctx, (unsigned char *)key_opad, sizeof(key_opad));
	SHA256_Update(&sha_ctx, (unsigned char *)digest, SHA256_DIGEST_LENGTH);
	SHA256_Final(digest, &sha_ctx);
}
5309 
5310 /*
5311  * SHA1 support for MPTCP
5312  */
5313 
5314 static void
mptcp_do_sha1(mptcp_key_t * key,char * sha_digest)5315 mptcp_do_sha1(mptcp_key_t *key, char *sha_digest)
5316 {
5317 	SHA1_CTX sha1ctxt;
5318 	const unsigned char *sha1_base;
5319 	int sha1_size;
5320 
5321 	sha1_base = (const unsigned char *) key;
5322 	sha1_size = sizeof(mptcp_key_t);
5323 	SHA1Init(&sha1ctxt);
5324 	SHA1Update(&sha1ctxt, sha1_base, sha1_size);
5325 	SHA1Final(sha_digest, &sha1ctxt);
5326 }
5327 
/*
 * HMAC-SHA1 over (rand1 || rand2), keyed with the concatenation of key1
 * and key2 (RFC 2104 construction).  digest must hold SHA1_RESULTLEN bytes.
 */
void
mptcp_hmac_sha1(mptcp_key_t key1, mptcp_key_t key2,
    u_int32_t rand1, u_int32_t rand2, u_char *digest)
{
	SHA1_CTX  sha1ctxt;
	mptcp_key_t key_ipad[8] = {0}; /* key XOR'd with inner pad */
	mptcp_key_t key_opad[8] = {0}; /* key XOR'd with outer pad */
	u_int32_t data[2];
	int i;

	bzero(digest, SHA1_RESULTLEN);

	/* Set up the Key for HMAC */
	key_ipad[0] = key1;
	key_ipad[1] = key2;

	key_opad[0] = key1;
	key_opad[1] = key2;

	/* Set up the message for HMAC */
	data[0] = rand1;
	data[1] = rand2;

	/* Key is shorter than the 512-bit block, so no need to hash it first */

	/* Compute SHA1(Key XOR opad, SHA1(Key XOR ipad, data)) */

	for (i = 0; i < 8; i++) {
		key_ipad[i] ^= 0x3636363636363636;
		key_opad[i] ^= 0x5c5c5c5c5c5c5c5c;
	}

	/* Perform inner SHA1 */
	SHA1Init(&sha1ctxt);
	SHA1Update(&sha1ctxt, (unsigned char *)key_ipad, sizeof(key_ipad));
	SHA1Update(&sha1ctxt, (unsigned char *)data, sizeof(data));
	SHA1Final(digest, &sha1ctxt);

	/* Perform outer SHA1 */
	SHA1Init(&sha1ctxt);
	SHA1Update(&sha1ctxt, (unsigned char *)key_opad, sizeof(key_opad));
	SHA1Update(&sha1ctxt, (unsigned char *)digest, SHA1_RESULTLEN);
	SHA1Final(digest, &sha1ctxt);
}
5372 
5373 /*
5374  * corresponds to MAC-B = MAC (Key=(Key-B+Key-A), Msg=(R-B+R-A))
5375  * corresponds to MAC-A = MAC (Key=(Key-A+Key-B), Msg=(R-A+R-B))
5376  */
5377 void
mptcp_get_mpjoin_hmac(mptcp_addr_id aid,struct mptcb * mp_tp,u_char * digest,uint8_t digest_len)5378 mptcp_get_mpjoin_hmac(mptcp_addr_id aid, struct mptcb *mp_tp, u_char *digest, uint8_t digest_len)
5379 {
5380 	uint32_t lrand, rrand;
5381 
5382 	lrand = rrand = 0;
5383 	mptcp_get_rands(aid, mp_tp, &lrand, &rrand);
5384 
5385 	u_char full_digest[MAX(SHA1_RESULTLEN, SHA256_DIGEST_LENGTH)] = {0};
5386 	if (mp_tp->mpt_version == MPTCP_VERSION_0) {
5387 		mptcp_hmac_sha1(mp_tp->mpt_localkey, mp_tp->mpt_remotekey, lrand, rrand, full_digest);
5388 	} else {
5389 		uint32_t data[2];
5390 		data[0] = lrand;
5391 		data[1] = rrand;
5392 		mptcp_hmac_sha256(mp_tp->mpt_localkey, mp_tp->mpt_remotekey, (u_char*)data, 8, full_digest);
5393 	}
5394 	bcopy(full_digest, digest, digest_len);
5395 }
5396 
5397 /*
5398  * Authentication data generation
5399  */
5400 static void
mptcp_generate_token(char * sha_digest,int sha_digest_len,caddr_t token,int token_len)5401 mptcp_generate_token(char *sha_digest, int sha_digest_len, caddr_t token,
5402     int token_len)
5403 {
5404 	VERIFY(token_len == sizeof(u_int32_t));
5405 	VERIFY(sha_digest_len == SHA1_RESULTLEN ||
5406 	    sha_digest_len == SHA256_DIGEST_LENGTH);
5407 
5408 	/* Most significant 32 bits of the SHA1/SHA256 hash */
5409 	bcopy(sha_digest, token, sizeof(u_int32_t));
5410 	return;
5411 }
5412 
5413 static void
mptcp_generate_idsn(char * sha_digest,int sha_digest_len,caddr_t idsn,int idsn_len,uint8_t mp_version)5414 mptcp_generate_idsn(char *sha_digest, int sha_digest_len, caddr_t idsn,
5415     int idsn_len, uint8_t mp_version)
5416 {
5417 	VERIFY(idsn_len == sizeof(u_int64_t));
5418 	VERIFY(sha_digest_len == SHA1_RESULTLEN ||
5419 	    sha_digest_len == SHA256_DIGEST_LENGTH);
5420 	VERIFY(mp_version == MPTCP_VERSION_0 || mp_version == MPTCP_VERSION_1);
5421 
5422 	/*
5423 	 * Least significant 64 bits of the hash
5424 	 */
5425 
5426 	if (mp_version == MPTCP_VERSION_0) {
5427 		idsn[7] = sha_digest[12];
5428 		idsn[6] = sha_digest[13];
5429 		idsn[5] = sha_digest[14];
5430 		idsn[4] = sha_digest[15];
5431 		idsn[3] = sha_digest[16];
5432 		idsn[2] = sha_digest[17];
5433 		idsn[1] = sha_digest[18];
5434 		idsn[0] = sha_digest[19];
5435 	} else {
5436 		idsn[7] = sha_digest[24];
5437 		idsn[6] = sha_digest[25];
5438 		idsn[5] = sha_digest[26];
5439 		idsn[4] = sha_digest[27];
5440 		idsn[3] = sha_digest[28];
5441 		idsn[2] = sha_digest[29];
5442 		idsn[1] = sha_digest[30];
5443 		idsn[0] = sha_digest[31];
5444 	}
5445 	return;
5446 }
5447 
/* Initialize per-connection properties on a freshly set-up mptcb. */
static void
mptcp_conn_properties(struct mptcb *mp_tp)
{
	/* Set DSS checksum flag (governed by the mptcp_dss_csum sysctl) */
	if (mptcp_dss_csum) {
		mp_tp->mpt_flags |= MPTCPF_CHECKSUM;
	}

	/* Set up receive window */
	mp_tp->mpt_rcvwnd = mptcp_sbspace(mp_tp);

	/* Set up gc ticks: grace period before mptcp_gc forces teardown */
	mp_tp->mpt_gc_ticks = MPT_GC_TICKS;
}
5462 
/*
 * Initialize the local side of a new MPTCP connection: pick the protocol
 * version, generate the local key, and derive the local token and IDSN
 * from the version-appropriate hash of that key.
 */
static void
mptcp_init_local_parms(struct mptses *mpte, struct sockaddr* dst)
{
	struct mptcb *mp_tp = mpte->mpte_mptcb;
	char key_digest[MAX(SHA1_RESULTLEN, SHA256_DIGEST_LENGTH)];
	uint16_t digest_len;

	/*
	 * Version selection: explicit per-session overrides first, then
	 * the per-destination heuristic cache.
	 */
	if (mpte->mpte_flags & MPTE_FORCE_V0 || !mptcp_enable_v1) {
		mp_tp->mpt_version = MPTCP_VERSION_0;
	} else if (mpte->mpte_flags & MPTE_FORCE_V1 && mptcp_enable_v1) {
		mp_tp->mpt_version = MPTCP_VERSION_1;
	} else {
		mp_tp->mpt_version = tcp_cache_get_mptcp_version(dst);
	}
	VERIFY(mp_tp->mpt_version == MPTCP_VERSION_0 ||
	    mp_tp->mpt_version == MPTCP_VERSION_1);

	/* Random 64-bit local key; hashed with SHA1 (v0) or SHA256 (v1) */
	read_frandom(&mp_tp->mpt_localkey, sizeof(mp_tp->mpt_localkey));
	if (mp_tp->mpt_version == MPTCP_VERSION_0) {
		digest_len = SHA1_RESULTLEN;
		mptcp_do_sha1(&mp_tp->mpt_localkey, key_digest);
	} else {
		digest_len = SHA256_DIGEST_LENGTH;
		mptcp_do_sha256(&mp_tp->mpt_localkey, key_digest);
	}

	mptcp_generate_token(key_digest, digest_len,
	    (caddr_t)&mp_tp->mpt_localtoken, sizeof(mp_tp->mpt_localtoken));
	mptcp_generate_idsn(key_digest, digest_len,
	    (caddr_t)&mp_tp->mpt_local_idsn, sizeof(u_int64_t), mp_tp->mpt_version);
	/* The subflow SYN is also first MPTCP byte */
	mp_tp->mpt_snduna = mp_tp->mpt_sndmax = mp_tp->mpt_local_idsn + 1;
	mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;

	mptcp_conn_properties(mp_tp);
}
5499 
5500 int
mptcp_init_remote_parms(struct mptcb * mp_tp)5501 mptcp_init_remote_parms(struct mptcb *mp_tp)
5502 {
5503 	/* Setup local and remote tokens and Initial DSNs */
5504 	char remote_digest[MAX(SHA1_RESULTLEN, SHA256_DIGEST_LENGTH)];
5505 	uint16_t digest_len;
5506 
5507 	if (mp_tp->mpt_version == MPTCP_VERSION_0) {
5508 		digest_len = SHA1_RESULTLEN;
5509 		mptcp_do_sha1(&mp_tp->mpt_remotekey, remote_digest);
5510 	} else if (mp_tp->mpt_version == MPTCP_VERSION_1) {
5511 		digest_len = SHA256_DIGEST_LENGTH;
5512 		mptcp_do_sha256(&mp_tp->mpt_remotekey, remote_digest);
5513 	} else {
5514 		return -1;
5515 	}
5516 
5517 	mptcp_generate_token(remote_digest, digest_len,
5518 	    (caddr_t)&mp_tp->mpt_remotetoken, sizeof(mp_tp->mpt_remotetoken));
5519 	mptcp_generate_idsn(remote_digest, digest_len,
5520 	    (caddr_t)&mp_tp->mpt_remote_idsn, sizeof(u_int64_t), mp_tp->mpt_version);
5521 	mp_tp->mpt_rcvnxt = mp_tp->mpt_remote_idsn + 1;
5522 	mp_tp->mpt_rcvadv = mp_tp->mpt_rcvnxt + mp_tp->mpt_rcvwnd;
5523 	return 0;
5524 }
5525 
5526 static void
mptcp_send_dfin(struct socket * so)5527 mptcp_send_dfin(struct socket *so)
5528 {
5529 	struct tcpcb *tp = NULL;
5530 	struct inpcb *inp = NULL;
5531 
5532 	inp = sotoinpcb(so);
5533 	if (!inp) {
5534 		return;
5535 	}
5536 
5537 	tp = intotcpcb(inp);
5538 	if (!tp) {
5539 		return;
5540 	}
5541 
5542 	if (!(tp->t_mpflags & TMPF_RESET)) {
5543 		tp->t_mpflags |= TMPF_SEND_DFIN;
5544 	}
5545 }
5546 
/*
 * Data Sequence Mapping routines
 */

/*
 * Stamp every packet in the mbuf chain with a DSN mapping starting at
 * the current mpt_sndmax, advancing mpt_sndmax by each packet's length.
 */
void
mptcp_insert_dsn(struct mppcb *mpp, struct mbuf *m)
{
	struct mptcb *mp_tp;

	if (m == NULL) {
		return;
	}

	/* mptcb is embedded in the mpp_mtp container alongside the mppcb */
	__IGNORE_WCASTALIGN(mp_tp = &((struct mpp_mtp *)mpp)->mtcb);

	while (m) {
		VERIFY(m->m_flags & M_PKTHDR);
		m->m_pkthdr.pkt_flags |= (PKTF_MPTCP | PKTF_MPSO);
		m->m_pkthdr.mp_dsn = mp_tp->mpt_sndmax;
		/* mp_rlen is 16-bit; the mapping length must fit */
		VERIFY(m_pktlen(m) >= 0 && m_pktlen(m) < UINT16_MAX);
		m->m_pkthdr.mp_rlen = (uint16_t)m_pktlen(m);
		mp_tp->mpt_sndmax += m_pktlen(m);
		m = m->m_next;
	}
}
5571 
/*
 * In fallback-to-TCP mode there are no DATA_ACKs; infer the MPTCP-level
 * acknowledgment from the subflow-level bytes being dropped (len) and
 * the DSN mappings stamped on the send-queue mbufs, then advance
 * mpt_snduna accordingly.
 */
void
mptcp_fallback_sbdrop(struct socket *so, struct mbuf *m, int len)
{
	struct mptcb *mp_tp = tptomptp(sototcpcb(so));
	uint64_t data_ack;
	uint64_t dsn;

	VERIFY(len >= 0);

	if (!m || len == 0) {
		return;
	}

	/*
	 * Walk the acked prefix of the chain; optimistically assume each
	 * visited mapping is fully acked (dsn + rlen).
	 */
	while (m && len > 0) {
		VERIFY(m->m_flags & M_PKTHDR);
		VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);

		data_ack = m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen;
		dsn = m->m_pkthdr.mp_dsn;

		len -= m->m_len;
		m = m->m_next;
	}

	if (m && len == 0) {
		/*
		 * If there is one more mbuf in the chain, it automatically means
		 * that up to m->mp_dsn has been ack'ed.
		 *
		 * This means, we actually correct data_ack back down (compared
		 * to what we set inside the loop - dsn + data_len). Because in
		 * the loop we are "optimistic" and assume that the full mapping
		 * will be acked. If that's not the case and we get out of the
		 * loop with m != NULL, it means only up to m->mp_dsn has been
		 * really acked.
		 */
		data_ack = m->m_pkthdr.mp_dsn;
	}

	if (len < 0) {
		/*
		 * If len is negative, meaning we acked in the middle of an mbuf,
		 * only up to this mbuf's data-sequence number has been acked
		 * at the MPTCP-level.
		 */
		data_ack = dsn;
	}

	mptcplog((LOG_DEBUG, "%s inferred ack up to %u\n", __func__, (uint32_t)data_ack),
	    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);

	/* We can have data in the subflow's send-queue that is being acked,
	 * while the DATA_ACK has already advanced. Thus, we should check whether
	 * or not the DATA_ACK is actually new here.
	 */
	if (MPTCP_SEQ_LEQ(data_ack, mp_tp->mpt_sndmax) &&
	    MPTCP_SEQ_GEQ(data_ack, mp_tp->mpt_snduna)) {
		mptcp_data_ack_rcvd(mp_tp, sototcpcb(so), data_ack);
	}
}
5632 
/*
 * Adjust the DSN mappings of the mbufs that remain after 'len' bytes are
 * dropped from the socket buffer, so the residual data keeps a correct
 * mapping.  Handles TFO rewinds, where mappings must be preserved even
 * on a subflow socket.
 */
void
mptcp_preproc_sbdrop(struct socket *so, struct mbuf *m, unsigned int len)
{
	int rewinding = 0;

	/* TFO makes things complicated. */
	if (so->so_flags1 & SOF1_TFO_REWIND) {
		rewinding = 1;
		so->so_flags1 &= ~SOF1_TFO_REWIND;
	}

	/* Only adjust on the MPTCP socket itself, or during a TFO rewind. */
	while (m && (!(so->so_flags & SOF_MP_SUBFLOW) || rewinding)) {
		u_int32_t sub_len;
		VERIFY(m->m_flags & M_PKTHDR);
		VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);

		sub_len = m->m_pkthdr.mp_rlen;

		if (sub_len < len) {
			/* Whole mapping consumed: zero it and keep walking. */
			m->m_pkthdr.mp_dsn += sub_len;
			if (!(m->m_pkthdr.pkt_flags & PKTF_MPSO)) {
				m->m_pkthdr.mp_rseq += sub_len;
			}
			m->m_pkthdr.mp_rlen = 0;
			len -= sub_len;
		} else {
			/* sub_len >= len */
			/* During a rewind the sequence numbers stay put. */
			if (rewinding == 0) {
				m->m_pkthdr.mp_dsn += len;
			}
			if (!(m->m_pkthdr.pkt_flags & PKTF_MPSO)) {
				if (rewinding == 0) {
					m->m_pkthdr.mp_rseq += len;
				}
			}
			mptcplog((LOG_DEBUG, "%s: dsn %u ssn %u len %d %d\n",
			    __func__, (u_int32_t)m->m_pkthdr.mp_dsn,
			    m->m_pkthdr.mp_rseq, m->m_pkthdr.mp_rlen, len),
			    MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
			m->m_pkthdr.mp_rlen -= len;
			break;
		}
		m = m->m_next;
	}

	if (so->so_flags & SOF_MP_SUBFLOW &&
	    !(sototcpcb(so)->t_mpflags & TMPF_TFO_REQUEST) &&
	    !(sototcpcb(so)->t_mpflags & TMPF_RCVD_DACK)) {
		/*
		 * Received an ack without receiving a DATA_ACK.
		 * Need to fallback to regular TCP (or destroy this subflow).
		 */
		sototcpcb(so)->t_mpflags |= TMPF_INFIN_SENT;
		mptcp_notify_mpfail(so);
	}
}
5689 
5690 /* Obtain the DSN mapping stored in the mbuf */
5691 void
mptcp_output_getm_dsnmap32(struct socket * so,int off,uint32_t * dsn,uint32_t * relseq,uint16_t * data_len,uint16_t * dss_csum)5692 mptcp_output_getm_dsnmap32(struct socket *so, int off,
5693     uint32_t *dsn, uint32_t *relseq, uint16_t *data_len, uint16_t *dss_csum)
5694 {
5695 	u_int64_t dsn64;
5696 
5697 	mptcp_output_getm_dsnmap64(so, off, &dsn64, relseq, data_len, dss_csum);
5698 	*dsn = (u_int32_t)MPTCP_DATASEQ_LOW32(dsn64);
5699 }
5700 
/*
 * Locate the DSN mapping covering subflow send-buffer offset 'off' and
 * return its DSN, relative subflow sequence, length and DSS checksum.
 * A defunct socket with an empty send buffer yields an all-zero mapping.
 */
void
mptcp_output_getm_dsnmap64(struct socket *so, int off, uint64_t *dsn,
    uint32_t *relseq, uint16_t *data_len,
    uint16_t *dss_csum)
{
	struct mbuf *m = so->so_snd.sb_mb;
	int off_orig = off;

	VERIFY(off >= 0);

	if (m == NULL && (so->so_flags & SOF_DEFUNCT)) {
		*dsn = 0;
		*relseq = 0;
		*data_len = 0;
		*dss_csum = 0;
		return;
	}

	/*
	 * In the subflow socket, the DSN sequencing can be discontiguous,
	 * but the subflow sequence mapping is contiguous. Use the subflow
	 * sequence property to find the right mbuf and corresponding dsn
	 * mapping.
	 */

	while (m) {
		VERIFY(m->m_flags & M_PKTHDR);
		VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);

		if (off >= m->m_len) {
			off -= m->m_len;
			m = m->m_next;
		} else {
			break;
		}
	}

	/*
	 * NOTE(review): m is dereferenced unconditionally below — callers
	 * presumably guarantee off lies within the send buffer, so the walk
	 * cannot exhaust the chain; confirm against call sites.
	 */
	VERIFY(off >= 0);
	VERIFY(m->m_pkthdr.mp_rlen <= UINT16_MAX);

	*dsn = m->m_pkthdr.mp_dsn;
	*relseq = m->m_pkthdr.mp_rseq;
	*data_len = m->m_pkthdr.mp_rlen;
	*dss_csum = m->m_pkthdr.mp_csum;

	mptcplog((LOG_DEBUG, "%s: dsn %u ssn %u data_len %d off %d off_orig %d\n",
	    __func__, (u_int32_t)(*dsn), *relseq, *data_len, off, off_orig),
	    MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
}
5750 
5751 void
mptcp_output_getm_data_level_details(struct socket * so,int off,uint16_t * data_len,uint16_t * dss_csum)5752 mptcp_output_getm_data_level_details(struct socket *so, int off, uint16_t *data_len, uint16_t *dss_csum)
5753 {
5754 	uint64_t dsn;
5755 	uint32_t relseq;
5756 
5757 	mptcp_output_getm_dsnmap64(so, off, &dsn, &relseq, data_len, dss_csum);
5758 }
5759 
/*
 * Note that this is called only from tcp_input() via mptcp_input_preproc()
 * tcp_input() may trim data after the dsn mapping is inserted into the mbuf.
 * When it trims data tcp_input calls m_adj() which does not remove the
 * m_pkthdr even if the m_len becomes 0 as a result of trimming the mbuf.
 * The dsn map insertion cannot be delayed after trim, because data can be in
 * the reassembly queue for a while and the DSN option info in tp will be
 * overwritten for every new packet received.
 * The dsn map will be adjusted just prior to appending to subflow sockbuf
 * with mptcp_adj_rmap()
 */
void
mptcp_insert_rmap(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th)
{
	VERIFY(m->m_flags & M_PKTHDR);
	VERIFY(!(m->m_pkthdr.pkt_flags & PKTF_MPTCP));

	/* Copy the parsed DSS mapping from the tcpcb into the mbuf header. */
	if (tp->t_mpflags & TMPF_EMBED_DSN) {
		m->m_pkthdr.mp_dsn = tp->t_rcv_map.mpt_dsn;
		m->m_pkthdr.mp_rseq = tp->t_rcv_map.mpt_sseq;
		m->m_pkthdr.mp_rlen = tp->t_rcv_map.mpt_len;
		m->m_pkthdr.mp_csum = tp->t_rcv_map.mpt_csum;
		if (tp->t_rcv_map.mpt_dfin) {
			m->m_pkthdr.pkt_flags |= PKTF_MPTCP_DFIN;
		}

		m->m_pkthdr.pkt_flags |= PKTF_MPTCP;

		/* Mapping consumed; acknowledge it at the MPTCP level. */
		tp->t_mpflags &= ~TMPF_EMBED_DSN;
		tp->t_mpflags |= TMPF_MPTCP_ACKNOW;
	} else if (tp->t_mpflags & TMPF_TCP_FALLBACK) {
		/* In fallback mode a TCP FIN doubles as the DATA_FIN. */
		if (th->th_flags & TH_FIN) {
			m->m_pkthdr.pkt_flags |= PKTF_MPTCP_DFIN;
		}
	}
}
5796 
5797 /*
5798  * Following routines help with failure detection and failover of data
5799  * transfer from one subflow to another.
5800  */
5801 void
mptcp_act_on_txfail(struct socket * so)5802 mptcp_act_on_txfail(struct socket *so)
5803 {
5804 	struct tcpcb *tp = NULL;
5805 	struct inpcb *inp = sotoinpcb(so);
5806 
5807 	if (inp == NULL) {
5808 		return;
5809 	}
5810 
5811 	tp = intotcpcb(inp);
5812 	if (tp == NULL) {
5813 		return;
5814 	}
5815 
5816 	if (so->so_flags & SOF_MP_TRYFAILOVER) {
5817 		return;
5818 	}
5819 
5820 	so->so_flags |= SOF_MP_TRYFAILOVER;
5821 	soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPFAILOVER));
5822 }
5823 
5824 /*
5825  * Support for MP_FAIL option
5826  */
5827 int
mptcp_get_map_for_dsn(struct socket * so,uint64_t dsn_fail,uint32_t * tcp_seq)5828 mptcp_get_map_for_dsn(struct socket *so, uint64_t dsn_fail, uint32_t *tcp_seq)
5829 {
5830 	struct mbuf *m = so->so_snd.sb_mb;
5831 	uint16_t datalen;
5832 	uint64_t dsn;
5833 	int off = 0;
5834 
5835 	if (m == NULL) {
5836 		return -1;
5837 	}
5838 
5839 	while (m != NULL) {
5840 		VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);
5841 		VERIFY(m->m_flags & M_PKTHDR);
5842 		dsn = m->m_pkthdr.mp_dsn;
5843 		datalen = m->m_pkthdr.mp_rlen;
5844 		if (MPTCP_SEQ_LEQ(dsn, dsn_fail) &&
5845 		    (MPTCP_SEQ_GEQ(dsn + datalen, dsn_fail))) {
5846 			off = (int)(dsn_fail - dsn);
5847 			*tcp_seq = m->m_pkthdr.mp_rseq + off;
5848 			return 0;
5849 		}
5850 
5851 		m = m->m_next;
5852 	}
5853 
5854 	/*
5855 	 * If there was no mbuf data and a fallback to TCP occurred, there's
5856 	 * not much else to do.
5857 	 */
5858 
5859 	os_log_error(mptcp_log_handle, "%s: %llu not found \n", __func__, dsn_fail);
5860 	return -1;
5861 }
5862 
5863 /*
5864  * Support for sending contiguous MPTCP bytes in subflow
5865  * Also for preventing sending data with ACK in 3-way handshake
5866  */
int32_t
mptcp_adj_sendlen(struct socket *so, int32_t off)
{
	struct tcpcb *tp = sototcpcb(so);
	struct mptsub *mpts = tp->t_mpsub;
	uint64_t mdss_dsn;
	uint32_t mdss_subflow_seq;
	int mdss_subflow_off;
	uint16_t mdss_data_len;
	uint16_t dss_csum;

	/* A defunct socket's send buffer may be empty: no mapping to consult */
	if (so->so_snd.sb_mb == NULL && (so->so_flags & SOF_DEFUNCT)) {
		return 0;
	}

	/* Look up the DSN mapping covering offset 'off' in the send buffer */
	mptcp_output_getm_dsnmap64(so, off, &mdss_dsn, &mdss_subflow_seq,
	    &mdss_data_len, &dss_csum);

	/*
	 * We need to compute how much of the mapping still remains.
	 * So, we compute the offset in the send-buffer of the dss-sub-seq.
	 */
	mdss_subflow_off = (mdss_subflow_seq + mpts->mpts_iss) - tp->snd_una;

	/*
	 * When TFO is used, we are sending the mpts->mpts_iss although the relative
	 * seq has been set to 1 (while it should be 0).
	 */
	if (tp->t_mpflags & TMPF_TFO_REQUEST) {
		mdss_subflow_off--;
	}

	VERIFY(off >= mdss_subflow_off);

	/* Bytes of this mapping remaining at and after 'off' */
	return mdss_data_len - (off - mdss_subflow_off);
}
5903 
5904 static uint32_t
mptcp_get_maxseg(struct mptses * mpte)5905 mptcp_get_maxseg(struct mptses *mpte)
5906 {
5907 	struct mptsub *mpts;
5908 	uint32_t maxseg = 0;
5909 
5910 	TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
5911 		struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
5912 
5913 		if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
5914 		    TCPS_HAVERCVDFIN2(tp->t_state)) {
5915 			continue;
5916 		}
5917 
5918 		if (tp->t_maxseg > maxseg) {
5919 			maxseg = tp->t_maxseg;
5920 		}
5921 	}
5922 
5923 	return maxseg;
5924 }
5925 
5926 static uint8_t
mptcp_get_rcvscale(struct mptses * mpte)5927 mptcp_get_rcvscale(struct mptses *mpte)
5928 {
5929 	struct mptsub *mpts;
5930 	uint8_t rcvscale = UINT8_MAX;
5931 
5932 	TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
5933 		struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
5934 
5935 		if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
5936 		    TCPS_HAVERCVDFIN2(tp->t_state)) {
5937 			continue;
5938 		}
5939 
5940 		if (tp->rcv_scale < rcvscale) {
5941 			rcvscale = tp->rcv_scale;
5942 		}
5943 	}
5944 
5945 	return rcvscale;
5946 }
5947 
5948 /* Similar to tcp_sbrcv_reserve */
static void
mptcp_sbrcv_reserve(struct mptcb *mp_tp, struct sockbuf *sbrcv,
    u_int32_t newsize, u_int32_t idealsize)
{
	/*
	 * Use the smallest advertised window scale across all subflows;
	 * UINT8_MAX means there is no usable subflow, so no resize.
	 */
	uint8_t rcvscale = mptcp_get_rcvscale(mp_tp->mpt_mpte);

	if (rcvscale == UINT8_MAX) {
		return;
	}

	/* newsize should not exceed max */
	newsize = min(newsize, tcp_autorcvbuf_max);

	/* The receive window scale negotiated at the
	 * beginning of the connection will also set a
	 * limit on the socket buffer size
	 */
	newsize = min(newsize, TCP_MAXWIN << rcvscale);

	/* Set new socket buffer size */
	if (newsize > sbrcv->sb_hiwat &&
	    (sbreserve(sbrcv, newsize) == 1)) {
		sbrcv->sb_idealsize = min(max(sbrcv->sb_idealsize,
		    (idealsize != 0) ? idealsize : newsize), tcp_autorcvbuf_max);

		/* Again check the limit set by the advertised
		 * window scale
		 */
		sbrcv->sb_idealsize = min(sbrcv->sb_idealsize,
		    TCP_MAXWIN << rcvscale);
	}
}
5981 
/*
 * Grow the MPTCP-level receive buffer based on the aggregate of the
 * subflows' receive buffers.
 */
void
mptcp_sbrcv_grow(struct mptcb *mp_tp)
{
	struct mptses *mpte = mp_tp->mpt_mpte;
	struct socket *mp_so = mpte->mpte_mppcb->mpp_socket;
	struct sockbuf *sbrcv = &mp_so->so_rcv;
	uint32_t hiwat_sum = 0;
	uint32_t ideal_sum = 0;
	struct mptsub *mpts;

	/*
	 * Do not grow the receive socket buffer if
	 * - auto resizing is disabled, globally or on this socket
	 * - the high water mark already reached the maximum
	 * - the stream is in background and receive side is being
	 * throttled
	 * - if there are segments in reassembly queue indicating loss,
	 * do not need to increase recv window during recovery as more
	 * data is not going to be sent. A duplicate ack sent during
	 * recovery should not change the receive window
	 */
	if (tcp_do_autorcvbuf == 0 ||
	    (sbrcv->sb_flags & SB_AUTOSIZE) == 0 ||
	    tcp_cansbgrow(sbrcv) == 0 ||
	    sbrcv->sb_hiwat >= tcp_autorcvbuf_max ||
	    (mp_so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ||
	    !LIST_EMPTY(&mp_tp->mpt_segq)) {
		/* Can not resize the socket buffer, just return */
		return;
	}

	/*
	 * Ideally, we want the rbuf to be (sum_i {bw_i} * rtt_max * 2)
	 *
	 * But, for this we first need accurate receiver-RTT estimations, which
	 * we currently don't have.
	 *
	 * Let's use a dummy algorithm for now, just taking the sum of all
	 * subflow's receive-buffers. It's too low, but that's all we can get
	 * for now.
	 */

	TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
		hiwat_sum += mpts->mpts_socket->so_rcv.sb_hiwat;
		ideal_sum += mpts->mpts_socket->so_rcv.sb_idealsize;
	}

	/* Resize the MPTCP-level buffer towards the summed targets */
	mptcp_sbrcv_reserve(mp_tp, sbrcv, hiwat_sum, ideal_sum);
}
6031 
6032 /*
6033  * Determine if we can grow the recieve socket buffer to avoid sending
6034  * a zero window update to the peer. We allow even socket buffers that
6035  * have fixed size (set by the application) to grow if the resource
6036  * constraints are met. They will also be trimmed after the application
6037  * reads data.
6038  *
6039  * Similar to tcp_sbrcv_grow_rwin
6040  */
static void
mptcp_sbrcv_grow_rwin(struct mptcb *mp_tp, struct sockbuf *sb)
{
	struct socket *mp_so = mp_tp->mpt_mpte->mpte_mppcb->mpp_socket;
	/* Grow in increments of 16 max-sized segments */
	u_int32_t rcvbufinc = mptcp_get_maxseg(mp_tp->mpt_mpte) << 4;
	u_int32_t rcvbuf = sb->sb_hiwat;

	/* Background-throttled streams must not grow their buffer */
	if (tcp_recv_bg == 1 || IS_TCP_RECV_BG(mp_so)) {
		return;
	}

	/*
	 * Grow only when auto-sizing is on, resources allow it, free space
	 * has shrunk below one increment, and the buffer is still within
	 * both the global cap and the ideal size plus one increment.
	 */
	if (tcp_do_autorcvbuf == 1 &&
	    tcp_cansbgrow(sb) &&
	    /* Diff to tcp_sbrcv_grow_rwin */
	    (mp_so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) == 0 &&
	    (rcvbuf - sb->sb_cc) < rcvbufinc &&
	    rcvbuf < tcp_autorcvbuf_max &&
	    (sb->sb_idealsize > 0 &&
	    sb->sb_hiwat <= (sb->sb_idealsize + rcvbufinc))) {
		sbreserve(sb, min((sb->sb_hiwat + rcvbufinc), tcp_autorcvbuf_max));
	}
}
6063 
6064 /* Similar to tcp_sbspace */
int32_t
mptcp_sbspace(struct mptcb *mp_tp)
{
	struct sockbuf *sb = &mp_tp->mpt_mpte->mpte_mppcb->mpp_socket->so_rcv;
	uint32_t rcvbuf;
	int32_t space;
	int32_t pending = 0;

	socket_lock_assert_owned(mptetoso(mp_tp->mpt_mpte));

	/* Opportunistically grow the buffer to avoid a zero-window update */
	mptcp_sbrcv_grow_rwin(mp_tp, sb);

	/* hiwat might have changed */
	rcvbuf = sb->sb_hiwat;

	/* Free space is the minimum of byte space and mbuf accounting space */
	space =  ((int32_t) imin((rcvbuf - sb->sb_cc),
	    (sb->sb_mbmax - sb->sb_mbcnt)));
	if (space < 0) {
		space = 0;
	}

#if CONTENT_FILTER
	/* Compensate for data being processed by content filters */
	pending = cfil_sock_data_space(sb);
#endif /* CONTENT_FILTER */
	if (pending > space) {
		space = 0;
	} else {
		space -= pending;
	}

	return space;
}
6098 
6099 /*
6100  * Support Fallback to Regular TCP
6101  */
6102 void
mptcp_notify_mpready(struct socket * so)6103 mptcp_notify_mpready(struct socket *so)
6104 {
6105 	struct tcpcb *tp = NULL;
6106 
6107 	if (so == NULL) {
6108 		return;
6109 	}
6110 
6111 	tp = intotcpcb(sotoinpcb(so));
6112 
6113 	if (tp == NULL) {
6114 		return;
6115 	}
6116 
6117 	DTRACE_MPTCP4(multipath__ready, struct socket *, so,
6118 	    struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd,
6119 	    struct tcpcb *, tp);
6120 
6121 	if (!(tp->t_mpflags & TMPF_MPTCP_TRUE)) {
6122 		return;
6123 	}
6124 
6125 	if (tp->t_mpflags & TMPF_MPTCP_READY) {
6126 		return;
6127 	}
6128 
6129 	tp->t_mpflags &= ~TMPF_TCP_FALLBACK;
6130 	tp->t_mpflags |= TMPF_MPTCP_READY;
6131 
6132 	soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPSTATUS));
6133 }
6134 
6135 void
mptcp_notify_mpfail(struct socket * so)6136 mptcp_notify_mpfail(struct socket *so)
6137 {
6138 	struct tcpcb *tp = NULL;
6139 
6140 	if (so == NULL) {
6141 		return;
6142 	}
6143 
6144 	tp = intotcpcb(sotoinpcb(so));
6145 
6146 	if (tp == NULL) {
6147 		return;
6148 	}
6149 
6150 	DTRACE_MPTCP4(multipath__failed, struct socket *, so,
6151 	    struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd,
6152 	    struct tcpcb *, tp);
6153 
6154 	if (tp->t_mpflags & TMPF_TCP_FALLBACK) {
6155 		return;
6156 	}
6157 
6158 	tp->t_mpflags &= ~(TMPF_MPTCP_READY | TMPF_MPTCP_TRUE);
6159 	tp->t_mpflags |= TMPF_TCP_FALLBACK;
6160 
6161 	soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPSTATUS));
6162 }
6163 
6164 /*
6165  * Keepalive helper function
6166  */
6167 boolean_t
mptcp_ok_to_keepalive(struct mptcb * mp_tp)6168 mptcp_ok_to_keepalive(struct mptcb *mp_tp)
6169 {
6170 	boolean_t ret = 1;
6171 
6172 	socket_lock_assert_owned(mptetoso(mp_tp->mpt_mpte));
6173 
6174 	if (mp_tp->mpt_state >= MPTCPS_CLOSE_WAIT) {
6175 		ret = 0;
6176 	}
6177 	return ret;
6178 }
6179 
6180 /*
6181  * MPTCP t_maxseg adjustment function
6182  */
int
mptcp_adj_mss(struct tcpcb *tp, boolean_t mtudisc)
{
	int mss_lower = 0;
	struct mptcb *mp_tp = tptomptp(tp);

	/*
	 * Space consumed by the most common MPTCP option (DSS + ACK).
	 * Note both branches currently add 2: either for the DSS checksum
	 * or for 32-bit alignment padding plus EOL.
	 */
#define MPTCP_COMPUTE_LEN {                             \
	mss_lower = sizeof (struct mptcp_dss_ack_opt);  \
	if (mp_tp->mpt_flags & MPTCPF_CHECKSUM)         \
	        mss_lower += 2;                         \
	else                                            \
	/* adjust to 32-bit boundary + EOL */   \
	        mss_lower += 2;                         \
}
	/* Not an MPTCP subflow: no adjustment needed */
	if (mp_tp == NULL) {
		return 0;
	}

	socket_lock_assert_owned(mptetoso(mp_tp->mpt_mpte));

	/*
	 * For the first subflow and subsequent subflows, adjust mss for
	 * most common MPTCP option size, for case where tcp_mss is called
	 * during option processing and MTU discovery.
	 */
	if (!mtudisc) {
		if (tp->t_mpflags & TMPF_MPTCP_TRUE &&
		    !(tp->t_mpflags & TMPF_JOINED_FLOW)) {
			MPTCP_COMPUTE_LEN;
		}

		if (tp->t_mpflags & TMPF_PREESTABLISHED &&
		    tp->t_mpflags & TMPF_SENT_JOIN) {
			MPTCP_COMPUTE_LEN;
		}
	} else {
		if (tp->t_mpflags & TMPF_MPTCP_TRUE) {
			MPTCP_COMPUTE_LEN;
		}
	}

	return mss_lower;
}
6226 
6227 /*
6228  * Update the pid, upid, uuid of the subflow so, based on parent so
6229  */
void
mptcp_update_last_owner(struct socket *so, struct socket *mp_so)
{
	/* Propagate owner identity only when pid/upid actually changed */
	if (so->last_pid != mp_so->last_pid ||
	    so->last_upid != mp_so->last_upid) {
		so->last_upid = mp_so->last_upid;
		so->last_pid = mp_so->last_pid;
		uuid_copy(so->last_uuid, mp_so->last_uuid);
	}
	/* Re-evaluate socket policy with the (possibly new) owner */
	so_update_policy(so);
}
6241 
/*
 * Fill one mptcp_flow_t record from a subflow socket, for the
 * net.inet.mptcp.pcblist sysctl.  Caller holds the MP socket lock.
 */
static void
fill_mptcp_subflow(struct socket *so, mptcp_flow_t *flow, struct mptsub *mpts)
{
	struct inpcb *inp;

	tcp_getconninfo(so, &flow->flow_ci);
	inp = sotoinpcb(so);
	/* Record endpoints in the subflow's address family */
	if ((inp->inp_vflag & INP_IPV6) != 0) {
		flow->flow_src.ss_family = AF_INET6;
		flow->flow_dst.ss_family = AF_INET6;
		flow->flow_src.ss_len = sizeof(struct sockaddr_in6);
		flow->flow_dst.ss_len = sizeof(struct sockaddr_in6);
		SIN6(&flow->flow_src)->sin6_port = inp->in6p_lport;
		SIN6(&flow->flow_dst)->sin6_port = inp->in6p_fport;
		SIN6(&flow->flow_src)->sin6_addr = inp->in6p_laddr;
		SIN6(&flow->flow_dst)->sin6_addr = inp->in6p_faddr;
	} else if ((inp->inp_vflag & INP_IPV4) != 0) {
		flow->flow_src.ss_family = AF_INET;
		flow->flow_dst.ss_family = AF_INET;
		flow->flow_src.ss_len = sizeof(struct sockaddr_in);
		flow->flow_dst.ss_len = sizeof(struct sockaddr_in);
		SIN(&flow->flow_src)->sin_port = inp->inp_lport;
		SIN(&flow->flow_dst)->sin_port = inp->inp_fport;
		SIN(&flow->flow_src)->sin_addr = inp->inp_laddr;
		SIN(&flow->flow_dst)->sin_addr = inp->inp_faddr;
	}
	/* Subflow-level bookkeeping exported to userland */
	flow->flow_len = sizeof(*flow);
	flow->flow_tcpci_offset = offsetof(mptcp_flow_t, flow_ci);
	flow->flow_flags = mpts->mpts_flags;
	flow->flow_cid = mpts->mpts_connid;
	flow->flow_relseq = mpts->mpts_rel_seq;
	flow->flow_soerror = mpts->mpts_socket->so_error;
	flow->flow_probecnt = mpts->mpts_probecnt;
}
6276 
/*
 * sysctl handler dumping every MPTCP connection as a conninfo_mptcp_t
 * followed by one mptcp_flow_t per subflow.
 */
static int
mptcp_pcblist SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
	int error = 0, f;
	size_t len;
	struct mppcb *mpp;
	struct mptses *mpte;
	struct mptcb *mp_tp;
	struct mptsub *mpts;
	struct socket *so;
	conninfo_mptcp_t mptcpci;
	mptcp_flow_t *flows = NULL;

	/* Read-only sysctl: reject writes */
	if (req->newptr != USER_ADDR_NULL) {
		return EPERM;
	}

	lck_mtx_lock(&mtcbinfo.mppi_lock);
	if (req->oldptr == USER_ADDR_NULL) {
		/* Size probe: over-estimate with headroom and 4 flows per PCB */
		size_t n = mtcbinfo.mppi_count;
		lck_mtx_unlock(&mtcbinfo.mppi_lock);
		req->oldidx = (n + n / 8) * sizeof(conninfo_mptcp_t) +
		    4 * (n + n / 8)  * sizeof(mptcp_flow_t);
		return 0;
	}
	TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
		flows = NULL;
		socket_lock(mpp->mpp_socket, 1);
		VERIFY(mpp->mpp_flags & MPP_ATTACHED);
		mpte = mptompte(mpp);

		socket_lock_assert_owned(mptetoso(mpte));
		mp_tp = mpte->mpte_mptcb;

		/* Snapshot the MPTCP-level connection state */
		bzero(&mptcpci, sizeof(mptcpci));
		mptcpci.mptcpci_state = mp_tp->mpt_state;
		mptcpci.mptcpci_flags = mp_tp->mpt_flags;
		mptcpci.mptcpci_ltoken = mp_tp->mpt_localtoken;
		mptcpci.mptcpci_rtoken = mp_tp->mpt_remotetoken;
		mptcpci.mptcpci_notsent_lowat = mp_tp->mpt_notsent_lowat;
		mptcpci.mptcpci_snduna = mp_tp->mpt_snduna;
		mptcpci.mptcpci_sndnxt = mp_tp->mpt_sndnxt;
		mptcpci.mptcpci_sndmax = mp_tp->mpt_sndmax;
		mptcpci.mptcpci_lidsn = mp_tp->mpt_local_idsn;
		mptcpci.mptcpci_sndwnd = mp_tp->mpt_sndwnd;
		mptcpci.mptcpci_rcvnxt = mp_tp->mpt_rcvnxt;
		mptcpci.mptcpci_rcvatmark = mp_tp->mpt_rcvnxt;
		mptcpci.mptcpci_ridsn = mp_tp->mpt_remote_idsn;
		mptcpci.mptcpci_rcvwnd = mp_tp->mpt_rcvwnd;

		mptcpci.mptcpci_nflows = mpte->mpte_numflows;
		mptcpci.mptcpci_mpte_flags = mpte->mpte_flags;
		mptcpci.mptcpci_mpte_addrid = mpte->mpte_addrid_last;
		mptcpci.mptcpci_flow_offset =
		    offsetof(conninfo_mptcp_t, mptcpci_flows);

		len = sizeof(*flows) * mpte->mpte_numflows;
		if (mpte->mpte_numflows != 0) {
			flows = kalloc_data(len, Z_WAITOK | Z_ZERO);
			if (flows == NULL) {
				/*
				 * NOTE(review): breaks out without setting
				 * error, so the caller sees a truncated but
				 * "successful" result — confirm intended.
				 */
				socket_unlock(mpp->mpp_socket, 1);
				break;
			}
			/* mptcpci embeds one flow; emit the header minus it */
			mptcpci.mptcpci_len = sizeof(mptcpci) +
			    sizeof(*flows) * (mptcpci.mptcpci_nflows - 1);
			error = SYSCTL_OUT(req, &mptcpci,
			    sizeof(mptcpci) - sizeof(mptcp_flow_t));
		} else {
			mptcpci.mptcpci_len = sizeof(mptcpci);
			error = SYSCTL_OUT(req, &mptcpci, sizeof(mptcpci));
		}
		if (error) {
			socket_unlock(mpp->mpp_socket, 1);
			kfree_data(flows, len);
			break;
		}
		f = 0;
		TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
			so = mpts->mpts_socket;
			fill_mptcp_subflow(so, &flows[f], mpts);
			f++;
		}
		socket_unlock(mpp->mpp_socket, 1);
		if (flows) {
			/* Copy out the per-subflow records */
			error = SYSCTL_OUT(req, flows, len);
			kfree_data(flows, len);
			if (error) {
				break;
			}
		}
	}
	lck_mtx_unlock(&mtcbinfo.mppi_lock);

	return error;
}
6373 
/* sysctl net.inet.mptcp.pcblist: read-only dump of active MPTCP connections */
SYSCTL_PROC(_net_inet_mptcp, OID_AUTO, pcblist, CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, mptcp_pcblist, "S,conninfo_mptcp_t",
    "List of active MPTCP connections");
6377 
6378 /*
6379  * Set notsent lowat mark on the MPTCB
6380  */
6381 int
mptcp_set_notsent_lowat(struct mptses * mpte,int optval)6382 mptcp_set_notsent_lowat(struct mptses *mpte, int optval)
6383 {
6384 	struct mptcb *mp_tp = NULL;
6385 	int error = 0;
6386 
6387 	if (mpte->mpte_mppcb->mpp_flags & MPP_ATTACHED) {
6388 		mp_tp = mpte->mpte_mptcb;
6389 	}
6390 
6391 	if (mp_tp) {
6392 		mp_tp->mpt_notsent_lowat = optval;
6393 	} else {
6394 		error = EINVAL;
6395 	}
6396 
6397 	return error;
6398 }
6399 
6400 u_int32_t
mptcp_get_notsent_lowat(struct mptses * mpte)6401 mptcp_get_notsent_lowat(struct mptses *mpte)
6402 {
6403 	struct mptcb *mp_tp = NULL;
6404 
6405 	if (mpte->mpte_mppcb->mpp_flags & MPP_ATTACHED) {
6406 		mp_tp = mpte->mpte_mptcb;
6407 	}
6408 
6409 	if (mp_tp) {
6410 		return mp_tp->mpt_notsent_lowat;
6411 	} else {
6412 		return 0;
6413 	}
6414 }
6415 
/*
 * Return 1 when the amount of unsent data is at or below the
 * notsent-lowat mark (socket should appear writable), 0 otherwise.
 */
int
mptcp_notsent_lowat_check(struct socket *so)
{
	struct mptses *mpte;
	struct mppcb *mpp;
	struct mptcb *mp_tp;
	struct mptsub *mpts;

	int notsent = 0;

	mpp = mpsotomppcb(so);
	if (mpp == NULL || mpp->mpp_state == MPPCB_STATE_DEAD) {
		return 0;
	}

	mpte = mptompte(mpp);
	socket_lock_assert_owned(mptetoso(mpte));
	mp_tp = mpte->mpte_mptcb;

	/* Total bytes queued at MPTCP level */
	notsent = so->so_snd.sb_cc;

	/* In-flight bytes (sndnxt - snduna) do not count against lowat */
	if ((notsent == 0) ||
	    ((notsent - (mp_tp->mpt_sndnxt - mp_tp->mpt_snduna)) <=
	    mp_tp->mpt_notsent_lowat)) {
		mptcplog((LOG_DEBUG, "MPTCP Sender: "
		    "lowat %d notsent %d actual %llu \n",
		    mp_tp->mpt_notsent_lowat, notsent,
		    notsent - (mp_tp->mpt_sndnxt - mp_tp->mpt_snduna)),
		    MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
		return 1;
	}

	/* When Nagle's algorithm is not disabled, it is better
	 * to wakeup the client even before there is atleast one
	 * maxseg of data to write.
	 */
	TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
		int retval = 0;
		if (mpts->mpts_flags & MPTSF_ACTIVE) {
			struct socket *subf_so = mpts->mpts_socket;
			struct tcpcb *tp = intotcpcb(sotoinpcb(subf_so));

			notsent = so->so_snd.sb_cc -
			    (tp->snd_nxt - tp->snd_una);

			if ((tp->t_flags & TF_NODELAY) == 0 &&
			    notsent > 0 && (notsent <= (int)tp->t_maxseg)) {
				retval = 1;
			}
			mptcplog((LOG_DEBUG, "MPTCP Sender: lowat %d notsent %d"
			    " nodelay false \n",
			    mp_tp->mpt_notsent_lowat, notsent),
			    MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
			/* First active subflow decides the outcome */
			return retval;
		}
	}
	return 0;
}
6474 
/*
 * Kernel-control connect handler for the Symptoms advisory socket.
 * Records the client's unit so later ctl_enqueuedata() calls can
 * reach it.  Only a single client is expected.
 */
static errno_t
mptcp_symptoms_ctl_connect(kern_ctl_ref kctlref, struct sockaddr_ctl *sac,
    void **unitinfo)
{
#pragma unused(kctlref, sac, unitinfo)

	/* A second concurrent client is unexpected — log it */
	if (OSIncrementAtomic(&mptcp_kern_skt_inuse) > 0) {
		os_log_error(mptcp_log_handle, "%s: MPTCP kernel-control socket for Symptoms already open!", __func__);
	}

	mptcp_kern_skt_unit = sac->sc_unit;

	return 0;
}
6489 
/*
 * Symptoms granted a "use app" answer for this executable UUID: give
 * every matching MPTCP connection a one-shot opportunity to add or
 * remove subflows under the granted access.
 */
static void
mptcp_allow_uuid(uuid_t uuid, int32_t rssi)
{
	struct mppcb *mpp;

	/* Iterate over all MPTCP connections */

	lck_mtx_lock(&mtcbinfo.mppi_lock);

	TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
		struct socket *mp_so = mpp->mpp_socket;
		struct mptses *mpte = mpp->mpp_pcbe;

		socket_lock(mp_so, 1);

		/*
		 * Match the effective UUID for delegated sockets, the last
		 * owner's UUID otherwise (uuid_compare == 0 means equal).
		 */
		if (mp_so->so_flags & SOF_DELEGATED &&
		    uuid_compare(uuid, mp_so->e_uuid)) {
			goto next;
		} else if (!(mp_so->so_flags & SOF_DELEGATED) &&
		    uuid_compare(uuid, mp_so->last_uuid)) {
			goto next;
		}

		os_log(mptcp_log_handle, "%s - %lx: Got allowance for useApp with rssi %d\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), rssi);

		mpte->mpte_flags |= MPTE_ACCESS_GRANTED;

		/* Strong Wi-Fi signal: prohibit cell for this evaluation */
		if (rssi > MPTCP_TARGET_BASED_RSSI_THRESHOLD) {
			mpte->mpte_flags |= MPTE_CELL_PROHIBITED;
		}

		mptcp_check_subflows_and_add(mpte);
		mptcp_remove_subflows(mpte);

		/* The grant is consumed immediately; clear the one-shot flags */
		mpte->mpte_flags &= ~(MPTE_ACCESS_GRANTED | MPTE_CELL_PROHIBITED);

next:
		socket_unlock(mp_so, 1);
	}

	lck_mtx_unlock(&mtcbinfo.mppi_lock);
}
6533 
6534 static void
mptcp_wifi_status_changed(void)6535 mptcp_wifi_status_changed(void)
6536 {
6537 	struct mppcb *mpp;
6538 
6539 	/* Iterate over all MPTCP connections */
6540 
6541 	lck_mtx_lock(&mtcbinfo.mppi_lock);
6542 
6543 	TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
6544 		struct socket *mp_so = mpp->mpp_socket;
6545 		struct mptses *mpte = mpp->mpp_pcbe;
6546 
6547 		socket_lock(mp_so, 1);
6548 
6549 		/* Only handover- and urgency-mode are purely driven by Symptom's Wi-Fi status */
6550 		if (mpte->mpte_svctype != MPTCP_SVCTYPE_HANDOVER &&
6551 		    mpte->mpte_svctype != MPTCP_SVCTYPE_PURE_HANDOVER &&
6552 		    mpte->mpte_svctype != MPTCP_SVCTYPE_TARGET_BASED) {
6553 			goto next;
6554 		}
6555 
6556 		mptcp_check_subflows_and_add(mpte);
6557 		mptcp_check_subflows_and_remove(mpte);
6558 
6559 next:
6560 		socket_unlock(mp_so, 1);
6561 	}
6562 
6563 	lck_mtx_unlock(&mtcbinfo.mppi_lock);
6564 }
6565 
/*
 * Search state shared by the proc-iterator filter/callout pair that
 * maps an executable UUID to its proc_t (see mptcp_find_proc()).
 */
struct mptcp_uuid_search_info {
	uuid_t target_uuid;      /* executable UUID being searched for */
	proc_t found_proc;       /* matching proc, or PROC_NULL */
	boolean_t is_proc_found; /* set by the filter once a match is seen */
};
6571 
6572 static int
mptcp_find_proc_filter(proc_t p,void * arg)6573 mptcp_find_proc_filter(proc_t p, void *arg)
6574 {
6575 	struct mptcp_uuid_search_info *info = (struct mptcp_uuid_search_info *)arg;
6576 	int found;
6577 
6578 	if (info->is_proc_found) {
6579 		return 0;
6580 	}
6581 
6582 	/*
6583 	 * uuid_compare returns 0 if the uuids are matching, but the proc-filter
6584 	 * expects != 0 for a matching filter.
6585 	 */
6586 	found = uuid_compare(proc_executableuuid_addr(p), info->target_uuid) == 0;
6587 	if (found) {
6588 		info->is_proc_found = true;
6589 	}
6590 
6591 	return found;
6592 }
6593 
6594 static int
mptcp_find_proc_callout(proc_t p,void * arg)6595 mptcp_find_proc_callout(proc_t p, void * arg)
6596 {
6597 	struct mptcp_uuid_search_info *info = (struct mptcp_uuid_search_info *)arg;
6598 
6599 	if (uuid_compare(proc_executableuuid_addr(p), info->target_uuid) == 0) {
6600 		info->found_proc = p;
6601 		return PROC_CLAIMED_DONE;
6602 	}
6603 
6604 	return PROC_RETURNED;
6605 }
6606 
6607 static proc_t
mptcp_find_proc(const uuid_t uuid)6608 mptcp_find_proc(const uuid_t uuid)
6609 {
6610 	struct mptcp_uuid_search_info info;
6611 
6612 	uuid_copy(info.target_uuid, uuid);
6613 	info.found_proc = PROC_NULL;
6614 	info.is_proc_found = false;
6615 
6616 	proc_iterate(PROC_ALLPROCLIST, mptcp_find_proc_callout, &info,
6617 	    mptcp_find_proc_filter, &info);
6618 
6619 	return info.found_proc;
6620 }
6621 
/*
 * Ask the Symptoms daemon (via the kernel-control socket) whether the
 * process owning this MPTCP session may use the network ("useApp").
 * Resolves the owning proc — the effective owner for delegated sockets,
 * otherwise the last owner — and sends its UUID and coarse task
 * priority to Symptoms.
 */
void
mptcp_ask_symptoms(struct mptses *mpte)
{
	struct mptcp_symptoms_ask_uuid ask;
	struct socket *mp_so;
	struct proc *p = PROC_NULL;
	int pid, prio, err;

	/* No Symptoms client has connected yet */
	if (mptcp_kern_skt_unit == 0) {
		os_log_error(mptcp_log_handle, "%s - %lx: skt_unit is still 0\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
		return;
	}

	mp_so = mptetoso(mpte);

	if (mp_so->so_flags & SOF_DELEGATED) {
		if (mpte->mpte_epid != 0) {
			p = proc_find(mpte->mpte_epid);
			if (p != PROC_NULL) {
				/* We found a pid, check its UUID */
				if (uuid_compare(mp_so->e_uuid, proc_executableuuid_addr(p))) {
					/* It's not the same - we need to look for the real proc */
					proc_rele(p);
					p = PROC_NULL;
				}
			}
		}

		if (p == PROC_NULL) {
			p = mptcp_find_proc(mp_so->e_uuid);
			if (p == PROC_NULL) {
				uuid_string_t uuid_string;
				uuid_unparse(mp_so->e_uuid, uuid_string);

				os_log_error(mptcp_log_handle, "%s - %lx: Couldn't find proc for uuid %s\n",
				    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), uuid_string);

				return;
			}
			mpte->mpte_epid = proc_pid(p);
		}

		pid = mpte->mpte_epid;
		uuid_copy(ask.uuid, mp_so->e_uuid);
	} else {
		pid = mp_so->last_pid;

		p = proc_find(pid);
		if (p == PROC_NULL) {
			os_log_error(mptcp_log_handle, "%s - %lx: Couldn't find proc for pid %u\n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), pid);
			return;
		}

		uuid_copy(ask.uuid, mp_so->last_uuid);
	}


	ask.cmd = MPTCP_SYMPTOMS_ASK_UUID;

	/* Map the task role to the coarse Symptoms priority classes */
	prio = proc_get_effective_task_policy(proc_task(p), TASK_POLICY_ROLE);

	if (prio == TASK_BACKGROUND_APPLICATION || prio == TASK_NONUI_APPLICATION ||
	    prio == TASK_DARWINBG_APPLICATION) {
		ask.priority = MPTCP_SYMPTOMS_BACKGROUND;
	} else if (prio == TASK_FOREGROUND_APPLICATION) {
		ask.priority = MPTCP_SYMPTOMS_FOREGROUND;
	} else {
		ask.priority = MPTCP_SYMPTOMS_UNKNOWN;
	}

	err = ctl_enqueuedata(mptcp_kern_ctrl_ref, mptcp_kern_skt_unit,
	    &ask, sizeof(ask), CTL_DATA_EOR);

	os_log(mptcp_log_handle, "%s - %lx: asked symptoms about pid %u, taskprio %u, prio %u, err %d\n",
	    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), pid, prio, ask.priority, err);


	proc_rele(p);
}
6703 
6704 static errno_t
mptcp_symptoms_ctl_disconnect(kern_ctl_ref kctlref,u_int32_t kcunit,void * unitinfo)6705 mptcp_symptoms_ctl_disconnect(kern_ctl_ref kctlref, u_int32_t kcunit,
6706     void *unitinfo)
6707 {
6708 #pragma unused(kctlref, kcunit, unitinfo)
6709 
6710 	OSDecrementAtomic(&mptcp_kern_skt_inuse);
6711 
6712 	return 0;
6713 }
6714 
6715 static errno_t
mptcp_symptoms_ctl_send(kern_ctl_ref kctlref,u_int32_t kcunit,void * unitinfo,mbuf_t m,int flags)6716 mptcp_symptoms_ctl_send(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo,
6717     mbuf_t m, int flags)
6718 {
6719 #pragma unused(kctlref, unitinfo, flags)
6720 	symptoms_advisory_t *sa = NULL;
6721 
6722 	if (kcunit != mptcp_kern_skt_unit) {
6723 		os_log_error(mptcp_log_handle, "%s: kcunit %u is different from expected one %u\n",
6724 		    __func__, kcunit, mptcp_kern_skt_unit);
6725 	}
6726 
6727 	if (mbuf_pkthdr_len(m) < sizeof(*sa)) {
6728 		mbuf_freem(m);
6729 		return EINVAL;
6730 	}
6731 
6732 	if (mbuf_len(m) < sizeof(*sa)) {
6733 		os_log_error(mptcp_log_handle, "%s: mbuf is %lu but need %lu\n",
6734 		    __func__, mbuf_len(m), sizeof(*sa));
6735 		mbuf_freem(m);
6736 		return EINVAL;
6737 	}
6738 
6739 	sa = mbuf_data(m);
6740 
6741 	if (sa->sa_nwk_status != SYMPTOMS_ADVISORY_USEAPP) {
6742 		os_log(mptcp_log_handle, "%s: wifi new,old: %d,%d, cell new, old: %d,%d\n", __func__,
6743 		    sa->sa_wifi_status, mptcp_advisory.sa_wifi_status,
6744 		    sa->sa_cell_status, mptcp_advisory.sa_cell_status);
6745 
6746 		if (sa->sa_wifi_status != mptcp_advisory.sa_wifi_status) {
6747 			mptcp_advisory.sa_wifi_status = sa->sa_wifi_status;
6748 			mptcp_wifi_status_changed();
6749 		}
6750 	} else {
6751 		struct mptcp_symptoms_answer answer;
6752 		errno_t err;
6753 
6754 		/* We temporarily allow different sizes for ease of submission */
6755 		if (mbuf_len(m) != sizeof(uuid_t) + sizeof(*sa) &&
6756 		    mbuf_len(m) != sizeof(answer)) {
6757 			os_log_error(mptcp_log_handle, "%s: mbuf is %lu but need %lu or %lu\n",
6758 			    __func__, mbuf_len(m), sizeof(uuid_t) + sizeof(*sa),
6759 			    sizeof(answer));
6760 			mbuf_free(m);
6761 			return EINVAL;
6762 		}
6763 
6764 		memset(&answer, 0, sizeof(answer));
6765 
6766 		err = mbuf_copydata(m, 0, mbuf_len(m), &answer);
6767 		if (err) {
6768 			os_log_error(mptcp_log_handle, "%s: mbuf_copydata returned %d\n", __func__, err);
6769 			mbuf_free(m);
6770 			return err;
6771 		}
6772 
6773 		mptcp_allow_uuid(answer.uuid, answer.rssi);
6774 	}
6775 
6776 	mbuf_freem(m);
6777 	return 0;
6778 }
6779 
6780 void
mptcp_control_register(void)6781 mptcp_control_register(void)
6782 {
6783 	/* Set up the advisory control socket */
6784 	struct kern_ctl_reg mptcp_kern_ctl;
6785 
6786 	bzero(&mptcp_kern_ctl, sizeof(mptcp_kern_ctl));
6787 	strlcpy(mptcp_kern_ctl.ctl_name, MPTCP_KERN_CTL_NAME,
6788 	    sizeof(mptcp_kern_ctl.ctl_name));
6789 	mptcp_kern_ctl.ctl_connect = mptcp_symptoms_ctl_connect;
6790 	mptcp_kern_ctl.ctl_disconnect = mptcp_symptoms_ctl_disconnect;
6791 	mptcp_kern_ctl.ctl_send = mptcp_symptoms_ctl_send;
6792 	mptcp_kern_ctl.ctl_flags = CTL_FLAG_PRIVILEGED;
6793 
6794 	(void)ctl_register(&mptcp_kern_ctl, &mptcp_kern_ctrl_ref);
6795 }
6796 
6797 /*
6798  * Three return-values:
6799  * 1  : WiFi is bad
6800  * 0  : WiFi is good
6801  * -1 : WiFi-state is unknown
6802  */
int
mptcp_is_wifi_unusable_for_session(struct mptses *mpte)
{
	if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
		/* First-party apps trust Symptoms whenever a status exists */
		if (mpte->mpte_svctype != MPTCP_SVCTYPE_HANDOVER &&
		    mptcp_advisory.sa_wifi_status) {
			return symptoms_is_wifi_lossy() ? 1 : 0;
		}

		/*
		 * If it's a first-party app and we don't have any info
		 * about the Wi-Fi state, let's be pessimistic.
		 */
		return -1;
	} else {
		if (symptoms_is_wifi_lossy()) {
			return 1;
		}

		/*
		 * If we are target-based (meaning, we allow to be more lax on
		 * the "unusable" target. We only *know* about the state once
		 * we got the allowance from Symptoms (MPTE_ACCESS_GRANTED).
		 *
		 * If RSSI is not bad enough, MPTE_CELL_PROHIBITED will then
		 * be set.
		 *
		 * In any other case (while in target-mode), consider WiFi bad
		 * and we are going to ask for allowance from Symptoms anyway.
		 */
		if (mpte->mpte_svctype == MPTCP_SVCTYPE_TARGET_BASED) {
			if (mpte->mpte_flags & MPTE_ACCESS_GRANTED &&
			    mpte->mpte_flags & MPTE_CELL_PROHIBITED) {
				return 0;
			}

			return 1;
		}

		return 0;
	}
}
6845 
6846 boolean_t
symptoms_is_wifi_lossy(void)6847 symptoms_is_wifi_lossy(void)
6848 {
6849 	return (mptcp_advisory.sa_wifi_status & SYMPTOMS_ADVISORY_WIFI_OK) ? false : true;
6850 }
6851 
/* If TFO data is successfully acked, it must be dropped from the mptcp so */
static void
mptcp_drop_tfo_data(struct mptses *mpte, struct mptsub *mpts)
{
	struct socket *mp_so = mptetoso(mpte);
	struct socket *so = mpts->mpts_socket;
	struct tcpcb *tp = intotcpcb(sotoinpcb(so));
	struct mptcb *mp_tp = mpte->mpte_mptcb;

	/* If data was sent with SYN, rewind state */
	if (tp->t_tfo_stats & TFO_S_SYN_DATA_ACKED) {
		/* Bytes still unacked at the MPTCP level */
		u_int64_t mp_droplen = mp_tp->mpt_sndnxt - mp_tp->mpt_snduna;
		/*
		 * Bytes of SYN payload the subflow got acked; the -1
		 * accounts for the sequence number the SYN itself consumes.
		 */
		unsigned int tcp_droplen = tp->snd_una - tp->iss - 1;

		VERIFY(mp_droplen <= (UINT_MAX));
		VERIFY(mp_droplen >= tcp_droplen);

		mpts->mpts_flags &= ~MPTSF_TFO_REQD;
		mpts->mpts_iss += tcp_droplen;
		tp->t_mpflags &= ~TMPF_TFO_REQUEST;

		if (mp_droplen > tcp_droplen) {
			/* handle partial TCP ack */
			mp_so->so_flags1 |= SOF1_TFO_REWIND;
			mp_tp->mpt_sndnxt = mp_tp->mpt_snduna + (mp_droplen - tcp_droplen);
			/* only the TCP-acked prefix may be dropped from the send buffer */
			mp_droplen = tcp_droplen;
		} else {
			/* all data on SYN was acked */
			mpts->mpts_rel_seq = 1;
			mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;
		}
		mp_tp->mpt_sndmax -= tcp_droplen;

		if (mp_droplen != 0) {
			VERIFY(mp_so->so_snd.sb_mb != NULL);
			sbdrop(&mp_so->so_snd, (int)mp_droplen);
		}
	}
}
6891 
6892 int
mptcp_freeq(struct mptcb * mp_tp)6893 mptcp_freeq(struct mptcb *mp_tp)
6894 {
6895 	struct tseg_qent *q;
6896 	int rv = 0;
6897 	int count = 0;
6898 
6899 	while ((q = LIST_FIRST(&mp_tp->mpt_segq)) != NULL) {
6900 		LIST_REMOVE(q, tqe_q);
6901 		m_freem(q->tqe_m);
6902 		zfree(tcp_reass_zone, q);
6903 		count++;
6904 		rv = 1;
6905 	}
6906 	mp_tp->mpt_reassqlen = 0;
6907 
6908 	if (count > 0) {
6909 		OSAddAtomic(-count, &mptcp_reass_total_qlen);
6910 	}
6911 
6912 	return rv;
6913 }
6914 
6915 static int
mptcp_post_event(u_int32_t event_code,int value)6916 mptcp_post_event(u_int32_t event_code, int value)
6917 {
6918 	struct kev_mptcp_data event_data;
6919 	struct kev_msg ev_msg;
6920 
6921 	memset(&ev_msg, 0, sizeof(ev_msg));
6922 
6923 	ev_msg.vendor_code      = KEV_VENDOR_APPLE;
6924 	ev_msg.kev_class        = KEV_NETWORK_CLASS;
6925 	ev_msg.kev_subclass     = KEV_MPTCP_SUBCLASS;
6926 	ev_msg.event_code       = event_code;
6927 
6928 	event_data.value = value;
6929 
6930 	ev_msg.dv[0].data_ptr    = &event_data;
6931 	ev_msg.dv[0].data_length = sizeof(event_data);
6932 
6933 	return kev_post_msg(&ev_msg);
6934 }
6935 
/*
 * Account this subflow's cell usage against the global cell-icon
 * refcount and, when this is the first active user, post the kernel
 * event that turns the icon on. Per-session usage is tracked in
 * mpte_cellicon_increments so it can be unwound at session teardown.
 */
static void
mptcp_set_cellicon(struct mptses *mpte, struct mptsub *mpts)
{
	struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
	int error;

	/* First-party apps (Siri) don't flip the cellicon */
	if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
		return;
	}

	/* Subflow is disappearing - don't set it on this one */
	if (mpts->mpts_flags & (MPTSF_DISCONNECTING | MPTSF_DISCONNECTED)) {
		return;
	}

	/* Fallen back connections are not triggering the cellicon */
	if (mpte->mpte_mptcb->mpt_flags & MPTCPF_FALLBACK_TO_TCP) {
		return;
	}

	/* Remember the last time we set the cellicon. Needed for debouncing */
	mpte->mpte_last_cellicon_set = tcp_now;

	/* Re-arm the toggle timer so the icon is periodically re-evaluated */
	tp->t_timer[TCPT_CELLICON] = OFFSET_FROM_START(tp, MPTCP_CELLICON_TOGGLE_RATE);
	tcp_sched_timers(tp);

	if (mpts->mpts_flags & MPTSF_CELLICON_SET &&
	    mpte->mpte_cellicon_increments != 0) {
		if (mptcp_cellicon_refcount == 0) {
			/* Inconsistent: session holds increments but global count is zero */
			os_log_error(mptcp_log_handle, "%s - %lx: Cell should be set (count is %u), but it's zero!\n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpte->mpte_cellicon_increments);

			/* Continue, so that the icon gets set... */
		} else {
			/*
			 * In this case, the cellicon is already set. No need to bump it
			 * even higher
			 */

			return;
		}
	}

	/* When tearing down this subflow, we need to decrement the
	 * reference counter
	 */
	mpts->mpts_flags |= MPTSF_CELLICON_SET;

	/* This counter, so that when a session gets destroyed we decrement
	 * the reference counter by whatever is left
	 */
	mpte->mpte_cellicon_increments++;

	/* OSIncrementAtomic returns the previous value: non-zero means
	 * another flow already turned the icon on.
	 */
	if (OSIncrementAtomic(&mptcp_cellicon_refcount)) {
		/* If cellicon is already set, get out of here! */
		return;
	}

	error = mptcp_post_event(KEV_MPTCP_CELLUSE, 1);

	if (error) {
		os_log_error(mptcp_log_handle, "%s - %lx: Setting cellicon failed with %d\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error);
	} else {
		os_log(mptcp_log_handle, "%s - %lx: successfully set the cellicon\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
	}
}
7005 
7006 void
mptcp_clear_cellicon(void)7007 mptcp_clear_cellicon(void)
7008 {
7009 	int error = mptcp_post_event(KEV_MPTCP_CELLUSE, 0);
7010 
7011 	if (error) {
7012 		os_log_error(mptcp_log_handle, "%s: Unsetting cellicon failed with %d\n",
7013 		    __func__, error);
7014 	} else {
7015 		os_log(mptcp_log_handle, "%s: successfully unset the cellicon\n",
7016 		    __func__);
7017 	}
7018 }
7019 
7020 /*
7021  * Returns true if the icon has been flipped to WiFi.
7022  */
7023 static boolean_t
__mptcp_unset_cellicon(uint32_t val)7024 __mptcp_unset_cellicon(uint32_t val)
7025 {
7026 	VERIFY(val < INT32_MAX);
7027 	if (OSAddAtomic((int32_t)-val, &mptcp_cellicon_refcount) != 1) {
7028 		return false;
7029 	}
7030 
7031 	mptcp_clear_cellicon();
7032 
7033 	return true;
7034 }
7035 
/*
 * Release 'val' cell-icon references held by this session and, when a
 * subflow is given, clear its per-subflow MPTSF_CELLICON_SET marker.
 * If the global refcount drains to zero the icon is turned off via
 * __mptcp_unset_cellicon().
 */
void
mptcp_unset_cellicon(struct mptses *mpte, struct mptsub *mpts, uint32_t val)
{
	/* First-party apps (Siri) don't flip the cellicon */
	if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
		return;
	}

	if (mpte->mpte_cellicon_increments == 0) {
		/* This flow never used cell - get out of here! */
		return;
	}

	if (mptcp_cellicon_refcount == 0) {
		/* Inconsistent: session holds increments but global count is zero */
		os_log_error(mptcp_log_handle, "%s - %lx: Cell is off, but should be at least %u\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpte->mpte_cellicon_increments);

		return;
	}

	if (mpts) {
		/* Only subflows that actually set the icon may release it */
		if (!(mpts->mpts_flags & MPTSF_CELLICON_SET)) {
			return;
		}

		mpts->mpts_flags &= ~MPTSF_CELLICON_SET;
	}

	/* Clamp so we never release more than this session holds */
	if (mpte->mpte_cellicon_increments < val) {
		os_log_error(mptcp_log_handle, "%s - %lx: Increments is %u but want to dec by %u.\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpte->mpte_cellicon_increments, val);
		val = mpte->mpte_cellicon_increments;
	}

	mpte->mpte_cellicon_increments -= val;

	/* Returns true only when the global refcount just reached zero */
	if (__mptcp_unset_cellicon(val) == false) {
		return;
	}

	/* All flows are gone - our counter should be at zero too! */
	if (mpte->mpte_cellicon_increments != 0) {
		os_log_error(mptcp_log_handle, "%s - %lx: Inconsistent state! Cell refcount is zero but increments are at %u\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpte->mpte_cellicon_increments);
	}
}
7082 
7083 void
mptcp_reset_rexmit_state(struct tcpcb * tp)7084 mptcp_reset_rexmit_state(struct tcpcb *tp)
7085 {
7086 	struct mptsub *mpts;
7087 	struct inpcb *inp;
7088 	struct socket *so;
7089 
7090 	inp = tp->t_inpcb;
7091 	if (inp == NULL) {
7092 		return;
7093 	}
7094 
7095 	so = inp->inp_socket;
7096 	if (so == NULL) {
7097 		return;
7098 	}
7099 
7100 	if (!(so->so_flags & SOF_MP_SUBFLOW)) {
7101 		return;
7102 	}
7103 
7104 	mpts = tp->t_mpsub;
7105 
7106 	mpts->mpts_flags &= ~MPTSF_WRITE_STALL;
7107 	so->so_flags &= ~SOF_MP_TRYFAILOVER;
7108 }
7109 
7110 void
mptcp_reset_keepalive(struct tcpcb * tp)7111 mptcp_reset_keepalive(struct tcpcb *tp)
7112 {
7113 	struct mptsub *mpts = tp->t_mpsub;
7114 
7115 	mpts->mpts_flags &= ~MPTSF_READ_STALL;
7116 }
7117