xref: /xnu-11215.81.4/bsd/netinet/mptcp_subr.c (revision d4514f0bc1d3f944c22d92e68b646ac3fb40d452)
1 /*
2  * Copyright (c) 2012-2021 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 #include <kern/locks.h>
30 #include <kern/policy_internal.h>
31 #include <kern/zalloc.h>
32 
33 #include <mach/sdt.h>
34 
35 #include <sys/domain.h>
36 #include <sys/kdebug.h>
37 #include <sys/kern_control.h>
38 #include <sys/kernel.h>
39 #include <sys/mbuf.h>
40 #include <sys/mcache.h>
41 #include <sys/param.h>
42 #include <sys/proc.h>
43 #include <sys/protosw.h>
44 #include <sys/resourcevar.h>
45 #include <sys/socket.h>
46 #include <sys/socketvar.h>
47 #include <sys/sysctl.h>
48 #include <sys/syslog.h>
49 #include <sys/systm.h>
50 
51 #include <net/content_filter.h>
52 #include <net/if.h>
53 #include <net/if_var.h>
54 #include <netinet/in.h>
55 #include <netinet/in_pcb.h>
56 #include <netinet/in_var.h>
57 #include <netinet/tcp.h>
58 #include <netinet/tcp_cache.h>
59 #include <netinet/tcp_fsm.h>
60 #include <netinet/tcp_seq.h>
61 #include <netinet/tcp_var.h>
62 #include <netinet/mptcp_var.h>
63 #include <netinet/mptcp.h>
64 #include <netinet/mptcp_opt.h>
65 #include <netinet/mptcp_seq.h>
66 #include <netinet/mptcp_timer.h>
67 #include <libkern/crypto/sha1.h>
68 #include <libkern/crypto/sha2.h>
69 #include <netinet6/in6_pcb.h>
70 #include <netinet6/ip6protosw.h>
71 #include <dev/random/randomdev.h>
72 #include <net/sockaddr_utils.h>
73 
74 /*
75  * Notes on MPTCP implementation.
76  *
77  * MPTCP is implemented as <SOCK_STREAM,IPPROTO_TCP> protocol in PF_MULTIPATH
78  * communication domain.  The structure mtcbinfo describes the MPTCP instance
79  * of a Multipath protocol in that domain.  It is used to keep track of all
80  * MPTCP PCB instances in the system, and is protected by the global lock
81  * mppi_lock.
82  *
83  * An MPTCP socket is opened by calling socket(PF_MULTIPATH, SOCK_STREAM,
84  * IPPROTO_TCP).  Upon success, a Multipath PCB gets allocated and along with
85  * it comes an MPTCP Session and an MPTCP PCB.  All three structures are
86  * allocated from the same memory block, and each structure has a pointer
87  * to the adjacent ones.  The layout is defined by the mpp_mtp structure.
88  * The socket lock (mpp_lock) is used to protect accesses to the Multipath
89  * PCB (mppcb) as well as the MPTCP Session (mptses).
90  *
 * The MPTCP Session is an MPTCP-specific extension to the Multipath PCB.
92  *
93  * A functioning MPTCP Session consists of one or more subflow sockets.  Each
94  * subflow socket is essentially a regular PF_INET/PF_INET6 TCP socket, and is
95  * represented by the mptsub structure.  Because each subflow requires access
96  * to the MPTCP Session, the MPTCP socket's so_usecount is bumped up for each
97  * subflow.  This gets decremented prior to the subflow's destruction.
98  *
99  * To handle events (read, write, control) from the subflows, we do direct
100  * upcalls into the specific function.
101  *
102  * The whole MPTCP connection is protected by a single lock, the MPTCP socket's
103  * lock. Incoming data on a subflow also ends up taking this single lock. To
104  * achieve the latter, tcp_lock/unlock has been changed to rather use the lock
105  * of the MPTCP-socket.
106  *
107  * An MPTCP socket will be destroyed when its so_usecount drops to zero; this
108  * work is done by the MPTCP garbage collector which is invoked on demand by
109  * the PF_MULTIPATH garbage collector.  This process will take place once all
110  * of the subflows have been destroyed.
111  */
112 
113 static void mptcp_subflow_abort(struct mptsub *, int);
114 
115 static void mptcp_send_dfin(struct socket *so);
116 static void mptcp_set_cellicon(struct mptses *mpte, struct mptsub *mpts);
117 static int mptcp_freeq(struct mptcb *mp_tp);
118 
119 /*
120  * Possible return values for subflow event handlers.  Note that success
121  * values must be greater or equal than MPTS_EVRET_OK.  Values less than that
122  * indicate errors or actions which require immediate attention; they will
123  * prevent the rest of the handlers from processing their respective events
124  * until the next round of events processing.
125  */
126 typedef enum {
127 	MPTS_EVRET_DELETE               = 1,    /* delete this subflow */
128 	MPTS_EVRET_OK                   = 2,    /* OK */
129 	MPTS_EVRET_CONNECT_PENDING      = 3,    /* resume pended connects */
130 	MPTS_EVRET_DISCONNECT_FALLBACK  = 4,    /* abort all but preferred */
131 } ev_ret_t;
132 
133 static void mptcp_do_sha1(mptcp_key_t *, char *);
134 static void mptcp_do_sha256(mptcp_key_t *, char *);
135 
136 static void mptcp_init_local_parms(struct mptses *, struct sockaddr *);
137 
138 static KALLOC_TYPE_DEFINE(mptsub_zone, struct mptsub, NET_KT_DEFAULT);
139 static KALLOC_TYPE_DEFINE(mptopt_zone, struct mptopt, NET_KT_DEFAULT);
140 static KALLOC_TYPE_DEFINE(mpt_subauth_zone, struct mptcp_subf_auth_entry,
141     NET_KT_DEFAULT);
142 
143 struct mppcbinfo mtcbinfo;
144 
145 SYSCTL_DECL(_net_inet);
146 
147 SYSCTL_NODE(_net_inet, OID_AUTO, mptcp, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "MPTCP");
148 
149 SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, pcbcount, CTLFLAG_RD | CTLFLAG_LOCKED,
150     &mtcbinfo.mppi_count, 0, "Number of active PCBs");
151 
152 
153 static int mptcp_alternate_port = 0;
154 SYSCTL_INT(_net_inet_mptcp, OID_AUTO, alternate_port, CTLFLAG_RW | CTLFLAG_LOCKED,
155     &mptcp_alternate_port, 0, "Set alternate port for MPTCP connections");
156 
157 static struct protosw mptcp_subflow_protosw;
158 static struct pr_usrreqs mptcp_subflow_usrreqs;
159 static struct ip6protosw mptcp_subflow_protosw6;
160 static struct pr_usrreqs mptcp_subflow_usrreqs6;
161 
162 static uint8_t  mptcp_create_subflows_scheduled;
163 
164 /* Using Symptoms Advisory to detect poor WiFi or poor Cell */
165 static kern_ctl_ref mptcp_kern_ctrl_ref = NULL;
166 static uint32_t mptcp_kern_skt_inuse = 0;
167 static uint32_t mptcp_kern_skt_unit;
168 static symptoms_advisory_t mptcp_advisory;
169 
170 uint32_t mptcp_cellicon_refcount = 0;
171 
172 os_log_t mptcp_log_handle;
173 
174 int
mptcpstats_get_index_by_ifindex(struct mptcp_itf_stats * stats,u_short ifindex,boolean_t create)175 mptcpstats_get_index_by_ifindex(struct mptcp_itf_stats *stats, u_short ifindex, boolean_t create)
176 {
177 	int i, index = -1;
178 
179 	for (i = 0; i < MPTCP_ITFSTATS_SIZE; i++) {
180 		if (create && stats[i].ifindex == IFSCOPE_NONE) {
181 			if (index < 0) {
182 				index = i;
183 			}
184 			continue;
185 		}
186 
187 		if (stats[i].ifindex == ifindex) {
188 			index = i;
189 			return index;
190 		}
191 	}
192 
193 	if (index != -1) {
194 		stats[index].ifindex = ifindex;
195 	}
196 
197 	return index;
198 }
199 
200 static int
mptcpstats_get_index(struct mptcp_itf_stats * stats,const struct mptsub * mpts)201 mptcpstats_get_index(struct mptcp_itf_stats *stats, const struct mptsub *mpts)
202 {
203 	const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
204 	int index;
205 
206 	if (ifp == NULL) {
207 		os_log_error(mptcp_log_handle, "%s - %lx: no ifp on subflow, state %u flags %#x\n",
208 		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpts->mpts_mpte),
209 		    sototcpcb(mpts->mpts_socket)->t_state, mpts->mpts_flags);
210 		return -1;
211 	}
212 
213 	index = mptcpstats_get_index_by_ifindex(stats, ifp->if_index, true);
214 
215 	if (index != -1) {
216 		if (stats[index].is_expensive == 0) {
217 			stats[index].is_expensive = IFNET_IS_CELLULAR(ifp);
218 		}
219 	}
220 
221 	return index;
222 }
223 
224 void
mptcpstats_inc_switch(struct mptses * mpte,const struct mptsub * mpts)225 mptcpstats_inc_switch(struct mptses *mpte, const struct mptsub *mpts)
226 {
227 	int index;
228 
229 	tcpstat.tcps_mp_switches++;
230 	mpte->mpte_subflow_switches++;
231 
232 	index = mptcpstats_get_index(mpte->mpte_itfstats, mpts);
233 
234 	if (index != -1) {
235 		mpte->mpte_itfstats[index].switches++;
236 	}
237 }
238 
239 /*
240  * Flushes all recorded socket options from an MP socket.
241  */
242 static void
mptcp_flush_sopts(struct mptses * mpte)243 mptcp_flush_sopts(struct mptses *mpte)
244 {
245 	struct mptopt *mpo, *tmpo;
246 
247 	TAILQ_FOREACH_SAFE(mpo, &mpte->mpte_sopts, mpo_entry, tmpo) {
248 		mptcp_sopt_remove(mpte, mpo);
249 		mptcp_sopt_free(mpo);
250 	}
251 	VERIFY(TAILQ_EMPTY(&mpte->mpte_sopts));
252 }
253 
/*
 * Create an MPTCP session, called as a result of opening a MPTCP socket.
 *
 * The Multipath PCB (mppcb), the MPTCP session (mptses) and the MPTCP
 * PCB (mptcb) live in a single mpp_mtp allocation; this routine wires
 * the three structures together and initializes the session to the
 * CLOSED state.  Returns 0 (it cannot fail).
 */
int
mptcp_session_create(struct mppcb *mpp)
{
	struct mpp_mtp *mtp;
	struct mppcbinfo *mppi;
	struct mptses *mpte;
	struct mptcb *mp_tp;

	VERIFY(mpp != NULL);
	mppi = mpp->mpp_pcbinfo;
	VERIFY(mppi != NULL);

	/* Recover the enclosing combined allocation from the mppcb. */
	mtp = __container_of(mpp, struct mpp_mtp, mpp);
	mpte = &mtp->mpp_ses;
	mp_tp = &mtp->mtcb;

	/* MPTCP Multipath PCB Extension */
	bzero(mpte, sizeof(*mpte));
	VERIFY(mpp->mpp_pcbe == NULL);
	mpp->mpp_pcbe = mpte;
	mpte->mpte_mppcb = mpp;
	mpte->mpte_mptcb = mp_tp;

	TAILQ_INIT(&mpte->mpte_sopts);
	TAILQ_INIT(&mpte->mpte_subflows);
	mpte->mpte_associd = SAE_ASSOCID_ANY;
	mpte->mpte_connid_last = SAE_CONNID_ANY;

	mptcp_init_urgency_timer(mpte);

	/* Start with the embedded itfinfo array; mptcp_session_destroy()
	 * only frees it if it was later grown beyond MPTE_ITFINFO_SIZE. */
	mpte->mpte_itfinfo = &mpte->_mpte_itfinfo[0];
	mpte->mpte_itfinfo_size = MPTE_ITFINFO_SIZE;

	/* Only accept an alternate port that fits into 16 bits. */
	if (mptcp_alternate_port > 0 && mptcp_alternate_port < UINT16_MAX) {
		mpte->mpte_alternate_port = htons((uint16_t)mptcp_alternate_port);
	}

	mpte->mpte_last_cellicon_set = tcp_now;

	/* MPTCP Protocol Control Block */
	bzero(mp_tp, sizeof(*mp_tp));
	mp_tp->mpt_mpte = mpte;
	mp_tp->mpt_state = MPTCPS_CLOSED;

	DTRACE_MPTCP1(session__create, struct mppcb *, mpp);

	return 0;
}
305 
306 struct sockaddr *
mptcp_get_session_dst(struct mptses * mpte,boolean_t ipv6,boolean_t ipv4)307 mptcp_get_session_dst(struct mptses *mpte, boolean_t ipv6, boolean_t ipv4)
308 {
309 	if (ipv6 && mpte->mpte_sub_dst_v6.sin6_family == AF_INET6) {
310 		return SA(&mpte->mpte_sub_dst_v6);
311 	}
312 
313 	if (ipv4 && mpte->mpte_sub_dst_v4.sin_family == AF_INET) {
314 		return SA(&mpte->mpte_sub_dst_v4);
315 	}
316 
317 	/* The interface has neither IPv4 nor IPv6 routes. Give our best guess,
318 	 * meaning we prefer IPv6 over IPv4.
319 	 */
320 	if (mpte->mpte_sub_dst_v6.sin6_family == AF_INET6) {
321 		return SA(&mpte->mpte_sub_dst_v6);
322 	}
323 
324 	if (mpte->mpte_sub_dst_v4.sin_family == AF_INET) {
325 		return SA(&mpte->mpte_sub_dst_v4);
326 	}
327 
328 	/* We don't yet have a unicast IP */
329 	return NULL;
330 }
331 
332 static void
mptcpstats_get_bytes(struct mptses * mpte,boolean_t initial_cell,uint64_t * cellbytes,uint64_t * allbytes)333 mptcpstats_get_bytes(struct mptses *mpte, boolean_t initial_cell,
334     uint64_t *cellbytes, uint64_t *allbytes)
335 {
336 	int64_t mycellbytes = 0;
337 	uint64_t myallbytes = 0;
338 	int i;
339 
340 	for (i = 0; i < MPTCP_ITFSTATS_SIZE; i++) {
341 		if (mpte->mpte_itfstats[i].is_expensive) {
342 			mycellbytes += mpte->mpte_itfstats[i].mpis_txbytes;
343 			mycellbytes += mpte->mpte_itfstats[i].mpis_rxbytes;
344 		}
345 
346 		myallbytes += mpte->mpte_itfstats[i].mpis_txbytes;
347 		myallbytes += mpte->mpte_itfstats[i].mpis_rxbytes;
348 	}
349 
350 	if (initial_cell) {
351 		mycellbytes -= mpte->mpte_init_txbytes;
352 		mycellbytes -= mpte->mpte_init_rxbytes;
353 	}
354 
355 	if (mycellbytes < 0) {
356 		os_log_error(mptcp_log_handle, "%s - %lx: cellbytes is %lld\n",
357 		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mycellbytes);
358 		*cellbytes = 0;
359 		*allbytes = 0;
360 	} else {
361 		*cellbytes = mycellbytes;
362 		*allbytes = myallbytes;
363 	}
364 }
365 
/*
 * Record end-of-session statistics into tcpstat, keyed by the session's
 * service type (handover / interactive / aggregate).  First-party
 * sessions are counted in separate "fp" counters.  Byte counters are
 * only accumulated for sessions whose MPTCP handshake succeeded.
 */
static void
mptcpstats_session_wrapup(struct mptses *mpte)
{
	/* Whether the initial subflow was established over cellular. */
	boolean_t cell = mpte->mpte_initial_cell;

	switch (mpte->mpte_svctype) {
	case MPTCP_SVCTYPE_HANDOVER:
		if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
			tcpstat.tcps_mptcp_fp_handover_attempt++;

			if (cell && mpte->mpte_handshake_success) {
				tcpstat.tcps_mptcp_fp_handover_success_cell++;

				/* Started on cell but also used WiFi later. */
				if (mpte->mpte_used_wifi) {
					tcpstat.tcps_mptcp_handover_wifi_from_cell++;
				}
			} else if (mpte->mpte_handshake_success) {
				tcpstat.tcps_mptcp_fp_handover_success_wifi++;

				/* Started on WiFi but had to use cell later. */
				if (mpte->mpte_used_cell) {
					tcpstat.tcps_mptcp_handover_cell_from_wifi++;
				}
			}
		} else {
			tcpstat.tcps_mptcp_handover_attempt++;

			if (cell && mpte->mpte_handshake_success) {
				tcpstat.tcps_mptcp_handover_success_cell++;

				if (mpte->mpte_used_wifi) {
					tcpstat.tcps_mptcp_handover_wifi_from_cell++;
				}
			} else if (mpte->mpte_handshake_success) {
				tcpstat.tcps_mptcp_handover_success_wifi++;

				if (mpte->mpte_used_cell) {
					tcpstat.tcps_mptcp_handover_cell_from_wifi++;
				}
			}
		}

		if (mpte->mpte_handshake_success) {
			uint64_t cellbytes;
			uint64_t allbytes;

			mptcpstats_get_bytes(mpte, cell, &cellbytes, &allbytes);

			tcpstat.tcps_mptcp_handover_cell_bytes += cellbytes;
			tcpstat.tcps_mptcp_handover_all_bytes += allbytes;
		}
		break;
	case MPTCP_SVCTYPE_INTERACTIVE:
		if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
			tcpstat.tcps_mptcp_fp_interactive_attempt++;

			if (mpte->mpte_handshake_success) {
				tcpstat.tcps_mptcp_fp_interactive_success++;

				/* Fell over to cell although we started on WiFi. */
				if (!cell && mpte->mpte_used_cell) {
					tcpstat.tcps_mptcp_interactive_cell_from_wifi++;
				}
			}
		} else {
			tcpstat.tcps_mptcp_interactive_attempt++;

			if (mpte->mpte_handshake_success) {
				tcpstat.tcps_mptcp_interactive_success++;

				if (!cell && mpte->mpte_used_cell) {
					tcpstat.tcps_mptcp_interactive_cell_from_wifi++;
				}
			}
		}

		if (mpte->mpte_handshake_success) {
			uint64_t cellbytes;
			uint64_t allbytes;

			mptcpstats_get_bytes(mpte, cell, &cellbytes, &allbytes);

			tcpstat.tcps_mptcp_interactive_cell_bytes += cellbytes;
			tcpstat.tcps_mptcp_interactive_all_bytes += allbytes;
		}
		break;
	case MPTCP_SVCTYPE_AGGREGATE:
		if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
			tcpstat.tcps_mptcp_fp_aggregate_attempt++;

			if (mpte->mpte_handshake_success) {
				tcpstat.tcps_mptcp_fp_aggregate_success++;
			}
		} else {
			tcpstat.tcps_mptcp_aggregate_attempt++;

			if (mpte->mpte_handshake_success) {
				tcpstat.tcps_mptcp_aggregate_success++;
			}
		}

		if (mpte->mpte_handshake_success) {
			uint64_t cellbytes;
			uint64_t allbytes;

			mptcpstats_get_bytes(mpte, cell, &cellbytes, &allbytes);

			tcpstat.tcps_mptcp_aggregate_cell_bytes += cellbytes;
			tcpstat.tcps_mptcp_aggregate_all_bytes += allbytes;
		}
		break;
		/* NOTE: other service types intentionally record nothing here. */
	}

	/* Session started on cell but managed to get back onto WiFi. */
	if (cell && mpte->mpte_handshake_success && mpte->mpte_used_wifi) {
		tcpstat.tcps_mptcp_back_to_wifi++;
	}

	if (mpte->mpte_triggered_cell) {
		tcpstat.tcps_mptcp_triggered_cell++;
	}
}
485 
/*
 * Destroy an MPTCP session.
 *
 * Called once all subflows are gone; records final statistics, releases
 * cell-icon references, and frees all session-owned memory (socket
 * options, grown itfinfo array, MPTCP segment queue, reinjection queue).
 */
static void
mptcp_session_destroy(struct mptses *mpte)
{
	struct mptcb *mp_tp = mpte->mpte_mptcb;

	VERIFY(mp_tp != NULL);
	/* All subflows must already have been torn down at this point. */
	VERIFY(TAILQ_EMPTY(&mpte->mpte_subflows) && mpte->mpte_numflows == 0);

	/* Final accounting before the session state disappears. */
	mptcpstats_session_wrapup(mpte);
	/* Drop every cell-icon reference this session still holds. */
	mptcp_unset_cellicon(mpte, NULL, mpte->mpte_cellicon_increments);
	mptcp_flush_sopts(mpte);

	/* The itfinfo array is heap-allocated only once it outgrew the
	 * embedded storage (see mptcp_session_create()). */
	if (mpte->mpte_itfinfo_size > MPTE_ITFINFO_SIZE) {
		kfree_data(mpte->mpte_itfinfo,
		    sizeof(*mpte->mpte_itfinfo) * mpte->mpte_itfinfo_size);
	}
	mpte->mpte_itfinfo = NULL;

	/* Free queued MPTCP-level segments and any reinjection mbufs. */
	mptcp_freeq(mp_tp);
	m_freem_list(mpte->mpte_reinjectq);

	os_log(mptcp_log_handle, "%s - %lx: Destroying session\n",
	    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
}
513 
514 boolean_t
mptcp_ok_to_create_subflows(struct mptcb * mp_tp)515 mptcp_ok_to_create_subflows(struct mptcb *mp_tp)
516 {
517 	return mp_tp->mpt_state >= MPTCPS_ESTABLISHED &&
518 	       mp_tp->mpt_state < MPTCPS_FIN_WAIT_1 &&
519 	       !(mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP);
520 }
521 
/*
 * Synthesize an IPv6 address by embedding the IPv4 address 'addrv4'
 * into the NAT64 prefix already stored in 'addr', following the
 * RFC 6052 address formats for the given prefix length 'len' (in bits).
 *
 * Returns 0 on success, -1 when the IPv4 address belongs to a range
 * that must not be translated (loopback, link-local, multicast, ...),
 * or when using the well-known prefix with a non-global address.
 * Panics on an unsupported prefix length.
 */
static int
mptcp_synthesize_nat64(struct in6_addr *addr, uint32_t len,
    const struct in_addr *addrv4)
{
	static const struct in6_addr well_known_prefix = {
		.__u6_addr.__u6_addr8 = {0x00, 0x64, 0xff, 0x9b, 0x00, 0x00,
			                 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
			                 0x00, 0x00, 0x00, 0x00},
	};
	const char *ptrv4 = (const char *)addrv4;
	char *ptr = (char *)addr;

	/* Addresses from these ranges are never valid NAT64 sources. */
	if (IN_ZERONET(ntohl(addrv4->s_addr)) || // 0.0.0.0/8 Source hosts on local network
	    IN_LOOPBACK(ntohl(addrv4->s_addr)) || // 127.0.0.0/8 Loopback
	    IN_LINKLOCAL(ntohl(addrv4->s_addr)) || // 169.254.0.0/16 Link Local
	    IN_DS_LITE(ntohl(addrv4->s_addr)) || // 192.0.0.0/29 DS-Lite
	    IN_6TO4_RELAY_ANYCAST(ntohl(addrv4->s_addr)) || // 192.88.99.0/24 6to4 Relay Anycast
	    IN_MULTICAST(ntohl(addrv4->s_addr)) || // 224.0.0.0/4 Multicast
	    INADDR_BROADCAST == addrv4->s_addr) { // 255.255.255.255/32 Limited Broadcast
		return -1;
	}

	/* Check for the well-known prefix (64:ff9b::/96, RFC 6052 sec. 3.1):
	 * it must only be used with globally routable IPv4 addresses. */
	if (len == NAT64_PREFIX_LEN_96 &&
	    IN6_ARE_ADDR_EQUAL(addr, &well_known_prefix)) {
		if (IN_PRIVATE(ntohl(addrv4->s_addr)) || // 10.0.0.0/8, 172.16.0.0/12, 192.168.0.0/16 Private-Use
		    IN_SHARED_ADDRESS_SPACE(ntohl(addrv4->s_addr))) { // 100.64.0.0/10 Shared Address Space
			return -1;
		}
	}

	/*
	 * Embed the 4 IPv4 bytes after the prefix.  For prefixes shorter
	 * than 96 bits the copy skips byte 8, which RFC 6052 reserves as
	 * the zero "u" octet, splitting the IPv4 address around it.
	 */
	switch (len) {
	case NAT64_PREFIX_LEN_96:
		memcpy(ptr + 12, ptrv4, 4);
		break;
	case NAT64_PREFIX_LEN_64:
		memcpy(ptr + 9, ptrv4, 4);
		break;
	case NAT64_PREFIX_LEN_56:
		memcpy(ptr + 7, ptrv4, 1);
		memcpy(ptr + 9, ptrv4 + 1, 3);
		break;
	case NAT64_PREFIX_LEN_48:
		memcpy(ptr + 6, ptrv4, 2);
		memcpy(ptr + 9, ptrv4 + 2, 2);
		break;
	case NAT64_PREFIX_LEN_40:
		memcpy(ptr + 5, ptrv4, 3);
		memcpy(ptr + 9, ptrv4 + 3, 1);
		break;
	case NAT64_PREFIX_LEN_32:
		memcpy(ptr + 4, ptrv4, 4);
		break;
	default:
		panic("NAT64-prefix len is wrong: %u", len);
	}

	return 0;
}
581 
/*
 * Ask the baseband radio manager (via the session's NECP client) to
 * bring up the cellular interface on behalf of this MPTCP connection.
 * Records in mpte_triggered_cell whether the request succeeded, for
 * later accounting in mptcpstats_session_wrapup().
 */
static void
mptcp_trigger_cell_bringup(struct mptses *mpte)
{
	struct socket *mp_so = mptetoso(mpte);

	if (!uuid_is_null(mpsotomppcb(mp_so)->necp_client_uuid)) {
		uuid_string_t uuidstr;
		int err;

		/*
		 * The MPTCP socket lock is dropped around the NECP call.
		 * NOTE(review): presumably because the call may block or
		 * acquire other locks — the socket state may change while
		 * unlocked, callers must tolerate that.
		 */
		socket_unlock(mp_so, 0);
		err = necp_client_assert_bb_radio_manager(mpsotomppcb(mp_so)->necp_client_uuid,
		    TRUE);
		socket_lock(mp_so, 0);

		if (err == 0) {
			/* Remembered so wrapup can count triggered bring-ups. */
			mpte->mpte_triggered_cell = 1;
		}

		uuid_unparse_upper(mpsotomppcb(mp_so)->necp_client_uuid, uuidstr);
		os_log_info(mptcp_log_handle, "%s - %lx: asked irat to bringup cell for uuid %s, err %d\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), uuidstr, err);
	} else {
		os_log_info(mptcp_log_handle, "%s - %lx: UUID is already null\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
	}
}
608 
609 static boolean_t
mptcp_subflow_disconnecting(struct mptsub * mpts)610 mptcp_subflow_disconnecting(struct mptsub *mpts)
611 {
612 	if (mpts->mpts_socket->so_state & SS_ISDISCONNECTED) {
613 		return true;
614 	}
615 
616 	if (mpts->mpts_flags & (MPTSF_DISCONNECTING | MPTSF_DISCONNECTED | MPTSF_CLOSE_REQD)) {
617 		return true;
618 	}
619 
620 	if (sototcpcb(mpts->mpts_socket)->t_state == TCPS_CLOSED) {
621 		return true;
622 	}
623 
624 	return false;
625 }
626 
627 /*
628  * In Handover mode, only create cell subflow if
629  * - Symptoms marked WiFi as weak:
630  *   Here, if we are sending data, then we can check the RTO-state. That is a
631  *   stronger signal of WiFi quality than the Symptoms indicator.
632  *   If however we are not sending any data, the only thing we can do is guess
633  *   and thus bring up Cell.
634  *
635  * - Symptoms marked WiFi as unknown:
636  *   In this state we don't know what the situation is and thus remain
637  *   conservative, only bringing up cell if there are retransmissions going on.
638  */
639 static boolean_t
mptcp_handover_use_cellular(struct mptses * mpte,struct tcpcb * tp)640 mptcp_handover_use_cellular(struct mptses *mpte, struct tcpcb *tp)
641 {
642 	mptcp_wifi_quality_t wifi_quality = mptcp_wifi_quality_for_session(mpte);
643 
644 	if (wifi_quality == MPTCP_WIFI_QUALITY_GOOD) {
645 		/* WiFi is good - don't use cell */
646 		return false;
647 	}
648 
649 	if (wifi_quality == MPTCP_WIFI_QUALITY_UNSURE) {
650 		/*
651 		 * We are in unknown state, only use Cell if we have confirmed
652 		 * that WiFi is bad.
653 		 */
654 		if (mptetoso(mpte)->so_snd.sb_cc != 0 && tp->t_rxtshift >= mptcp_fail_thresh * 2) {
655 			return true;
656 		} else {
657 			return false;
658 		}
659 	}
660 
661 	if (wifi_quality == MPTCP_WIFI_QUALITY_BAD) {
662 		/*
663 		 * WiFi is confirmed to be bad from Symptoms-Framework.
664 		 * If we are sending data, check the RTOs.
665 		 * Otherwise, be pessimistic and use Cell.
666 		 */
667 		if (mptetoso(mpte)->so_snd.sb_cc != 0) {
668 			if (tp->t_rxtshift >= mptcp_fail_thresh * 2) {
669 				return true;
670 			} else {
671 				return false;
672 			}
673 		} else {
674 			return true;
675 		}
676 	}
677 
678 	return false;
679 }
680 
/*
 * Walk the session's list of usable interfaces and create a subflow on
 * every interface that does not already carry a healthy one.  Service
 * type (handover / pure-handover / target-based) decides whether an
 * existing WiFi subflow makes a cellular one unnecessary.  If cellular
 * is wanted but no cellular interface is available, ask for it to be
 * brought up.
 */
void
mptcp_check_subflows_and_add(struct mptses *mpte)
{
	struct mptcb *mp_tp = mpte->mpte_mptcb;
	boolean_t cellular_viable = FALSE;
	boolean_t want_cellular = TRUE;
	uint32_t i;

	if (!mptcp_ok_to_create_subflows(mp_tp)) {
		os_log_debug(mptcp_log_handle, "%s - %lx: not a good time for subflows, state %u flags %#x",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mp_tp->mpt_state, mp_tp->mpt_flags);
		return;
	}

	/* Just to see if we have an IP-address available; this also
	 * guarantees the unconditional dst dereference further down. */
	if (mptcp_get_session_dst(mpte, false, false) == NULL) {
		return;
	}

	for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
		boolean_t need_to_ask_symptoms = FALSE, found = FALSE;
		struct mpt_itf_info *info;
		struct sockaddr_in6 nat64pre;
		struct sockaddr *dst;
		struct mptsub *mpts;
		struct ifnet *ifp;
		uint32_t ifindex;

		info = &mpte->mpte_itfinfo[i];

		ifindex = info->ifindex;
		if (ifindex == IFSCOPE_NONE) {
			continue;
		}

		os_log(mptcp_log_handle, "%s - %lx: itf %u no support %u hasv4 %u has v6 %u hasnat64 %u\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), info->ifindex, info->no_mptcp_support,
		    info->has_v4_conn, info->has_v6_conn, info->has_nat64_conn);

		if (info->no_mptcp_support) {
			continue;
		}

		/* Resolve ifindex to an ifnet under the ifnet head lock. */
		ifnet_head_lock_shared();
		ifp = ifindex2ifnet[ifindex];
		ifnet_head_done();

		if (ifp == NULL) {
			continue;
		}

		if (IFNET_IS_CELLULAR(ifp)) {
			cellular_viable = TRUE;

			/* In handover modes, don't bring up cell while WiFi
			 * quality is still good. */
			if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER ||
			    mpte->mpte_svctype == MPTCP_SVCTYPE_PURE_HANDOVER) {
				if (mptcp_wifi_quality_for_session(mpte) == MPTCP_WIFI_QUALITY_GOOD) {
					continue;
				}
			}
		}

		/* Check the existing subflows against this interface. */
		TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
			const struct ifnet *subifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
			struct tcpcb *tp = sototcpcb(mpts->mpts_socket);

			if (subifp == NULL) {
				continue;
			}

			/*
			 * If there is at least one functioning subflow on WiFi
			 * and we are checking for the cell interface, then
			 * we always need to ask symptoms for permission as
			 * cell is triggered even if WiFi is available.
			 */
			if (!IFNET_IS_CELLULAR(subifp) &&
			    !mptcp_subflow_disconnecting(mpts) &&
			    IFNET_IS_CELLULAR(ifp)) {
				need_to_ask_symptoms = TRUE;
			}

			if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER || mpte->mpte_svctype == MPTCP_SVCTYPE_PURE_HANDOVER) {
				os_log(mptcp_log_handle,
				    "%s - %lx: %s: cell %u wifi-state %d flags %#x rxt %u first-party %u sb_cc %u ifindex %u this %u rtt %u rttvar %u rto %u\n",
				    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
				    mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER ? "handover" : "pure-handover",
				    IFNET_IS_CELLULAR(subifp),
				    mptcp_wifi_quality_for_session(mpte),
				    mpts->mpts_flags,
				    tp->t_rxtshift,
				    !!(mpte->mpte_flags & MPTE_FIRSTPARTY),
				    mptetoso(mpte)->so_snd.sb_cc,
				    ifindex, subifp->if_index,
				    tp->t_srtt >> TCP_RTT_SHIFT,
				    tp->t_rttvar >> TCP_RTTVAR_SHIFT,
				    tp->t_rxtcur);

				if (!IFNET_IS_CELLULAR(subifp) &&
				    !mptcp_subflow_disconnecting(mpts) &&
				    (mpts->mpts_flags & MPTSF_CONNECTED) &&
				    !mptcp_handover_use_cellular(mpte, tp)) {
					found = TRUE;

					/* We found a proper subflow on WiFi - no need for cell */
					want_cellular = FALSE;
					break;
				}
			} else if (mpte->mpte_svctype == MPTCP_SVCTYPE_TARGET_BASED) {
				uint64_t time_now = mach_continuous_time();

				os_log(mptcp_log_handle,
				    "%s - %lx: target-based: %llu now %llu wifi quality %d cell %u sostat %#x mpts_flags %#x tcp-state %u\n",
				    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpte->mpte_time_target,
				    time_now, mptcp_wifi_quality_for_session(mpte),
				    IFNET_IS_CELLULAR(subifp), mpts->mpts_socket->so_state,
				    mpts->mpts_flags, sototcpcb(mpts->mpts_socket)->t_state);

				/* WiFi subflow suffices while we are before the
				 * target time or WiFi quality stays good. */
				if (!IFNET_IS_CELLULAR(subifp) &&
				    !mptcp_subflow_disconnecting(mpts) &&
				    (mpte->mpte_time_target == 0 ||
				    (int64_t)(mpte->mpte_time_target - time_now) > 0 ||
				    mptcp_wifi_quality_for_session(mpte) == MPTCP_WIFI_QUALITY_GOOD)) {
					found = TRUE;

					want_cellular = FALSE;
					break;
				}
			}

			if (subifp->if_index == ifindex &&
			    !mptcp_subflow_disconnecting(mpts)) {
				/*
				 * We found a subflow on this interface.
				 * No need to create a new one.
				 */
				found = TRUE;
				break;
			}
		}

		if (found) {
			continue;
		}

		/* Non-first-party apps without a grant need Symptoms'
		 * permission before cell may be used. */
		if (need_to_ask_symptoms &&
		    !(mpte->mpte_flags & MPTE_FIRSTPARTY) &&
		    !(mpte->mpte_flags & MPTE_ACCESS_GRANTED) &&
		    mptcp_developer_mode == 0) {
			mptcp_ask_symptoms(mpte);
			return;
		}

		dst = mptcp_get_session_dst(mpte, info->has_v6_conn, info->has_v4_conn);

		/* v4 destination on a v6-only interface with NAT64:
		 * synthesize an IPv6 destination from the NAT64 prefix. */
		if (dst->sa_family == AF_INET &&
		    !info->has_v4_conn && info->has_nat64_conn) {
			struct ipv6_prefix nat64prefixes[NAT64_MAX_NUM_PREFIXES];
			int error, j;

			SOCKADDR_ZERO(&nat64pre, sizeof(struct sockaddr_in6));

			error = ifnet_get_nat64prefix(ifp, nat64prefixes);
			if (error) {
				os_log_error(mptcp_log_handle, "%s - %lx: no NAT64-prefix on itf %s, error %d\n",
				    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), ifp->if_name, error);
				continue;
			}

			/* Use the first prefix with a non-zero length. */
			for (j = 0; j < NAT64_MAX_NUM_PREFIXES; j++) {
				if (nat64prefixes[j].prefix_len != 0) {
					break;
				}
			}

			VERIFY(j < NAT64_MAX_NUM_PREFIXES);

			error = mptcp_synthesize_nat64(&nat64prefixes[j].ipv6_prefix,
			    nat64prefixes[j].prefix_len,
			    &SIN(dst)->sin_addr);
			if (error != 0) {
				os_log_error(mptcp_log_handle, "%s - %lx: cannot synthesize this addr\n",
				    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
				continue;
			}

			memcpy(&nat64pre.sin6_addr,
			    &nat64prefixes[j].ipv6_prefix,
			    sizeof(nat64pre.sin6_addr));
			nat64pre.sin6_len = sizeof(struct sockaddr_in6);
			nat64pre.sin6_family = AF_INET6;
			nat64pre.sin6_port = SIN(dst)->sin_port;
			nat64pre.sin6_flowinfo = 0;
			nat64pre.sin6_scope_id = 0;

			dst = SA(&nat64pre);
		}

		/* Skip when the interface cannot reach the chosen family. */
		if (dst->sa_family == AF_INET && !info->has_v4_conn) {
			continue;
		}
		if (dst->sa_family == AF_INET6 && !info->has_v6_conn) {
			continue;
		}

		mptcp_subflow_add(mpte, NULL, dst, ifindex, NULL);
	}

	if (!cellular_viable && want_cellular) {
		/* Trigger Cell Bringup */
		mptcp_trigger_cell_bringup(mpte);
	}
}
894 
895 static void
mptcp_remove_cell_subflows(struct mptses * mpte)896 mptcp_remove_cell_subflows(struct mptses *mpte)
897 {
898 	struct mptsub *mpts, *tmpts;
899 
900 	TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
901 		const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
902 
903 		if (ifp == NULL || !IFNET_IS_CELLULAR(ifp)) {
904 			continue;
905 		}
906 
907 		os_log(mptcp_log_handle, "%s - %lx: removing cell subflow\n",
908 		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
909 
910 		soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
911 	}
912 
913 	return;
914 }
915 
916 static void
mptcp_remove_wifi_subflows(struct mptses * mpte)917 mptcp_remove_wifi_subflows(struct mptses *mpte)
918 {
919 	struct mptsub *mpts, *tmpts;
920 
921 	TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
922 		const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
923 
924 		if (ifp == NULL || IFNET_IS_CELLULAR(ifp)) {
925 			continue;
926 		}
927 
928 		os_log(mptcp_log_handle, "%s - %lx: removing wifi subflow\n",
929 		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
930 
931 		soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
932 	}
933 
934 	return;
935 }
936 
/*
 * Pure-handover policy: keep at most one "side" of the connection.
 * When a working WiFi subflow exists (or WiFi quality is good), remove
 * all cellular subflows; when only cellular works and WiFi quality is
 * not good, remove the WiFi subflows instead.
 */
static void
mptcp_pure_handover_subflows_remove(struct mptses *mpte)
{
	mptcp_wifi_quality_t wifi_quality = mptcp_wifi_quality_for_session(mpte);
	boolean_t found_working_wifi_subflow = false;
	boolean_t found_working_cell_subflow = false;

	struct mptsub *mpts;

	/*
	 * Look for a subflow that is on a non-cellular interface in connected
	 * state.
	 *
	 * In that case, remove all cellular subflows.
	 *
	 * If however there is no working WiFi subflow (and WiFi quality is
	 * not good), keep the cellular subflows - and if one of them works,
	 * remove the WiFi subflows instead.
	 */
	TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
		const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
		struct socket *so;
		struct tcpcb *tp;

		if (ifp == NULL) {
			continue;
		}

		so = mpts->mpts_socket;
		tp = sototcpcb(so);

		/* Only fully established, non-disconnecting subflows count
		 * as "working". */
		if (!(mpts->mpts_flags & MPTSF_CONNECTED) ||
		    tp->t_state != TCPS_ESTABLISHED ||
		    mptcp_subflow_disconnecting(mpts)) {
			continue;
		}

		if (IFNET_IS_CELLULAR(ifp)) {
			found_working_cell_subflow = true;
		} else {
			os_log_debug(mptcp_log_handle, "%s - %lx: rxt %u sb_cc %u wifi quality %d\n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), tp->t_rxtshift, mptetoso(mpte)->so_snd.sb_cc, wifi_quality);
			/* A WiFi subflow only counts when the handover logic
			 * would not prefer cell over it. */
			if (!mptcp_handover_use_cellular(mpte, tp)) {
				found_working_wifi_subflow = true;
			}
		}
	}

	/*
	 * Couldn't find a working subflow, let's not remove those on a cellular
	 * interface.
	 */
	os_log_debug(mptcp_log_handle, "%s - %lx: Found Wi-Fi: %u Found Cellular %u",
	    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
	    found_working_wifi_subflow, found_working_cell_subflow);
	if (!found_working_wifi_subflow && wifi_quality != MPTCP_WIFI_QUALITY_GOOD) {
		if (found_working_cell_subflow) {
			mptcp_remove_wifi_subflows(mpte);
		}
		return;
	}

	mptcp_remove_cell_subflows(mpte);
}
999 
/*
 * For the handover service type: if at least one established non-cellular
 * subflow is working well enough that the handover logic would not use
 * cellular, tear down all cellular subflows.
 */
static void
mptcp_handover_subflows_remove(struct mptses *mpte)
{
	mptcp_wifi_quality_t wifi_quality = mptcp_wifi_quality_for_session(mpte);
	boolean_t found_working_subflow = false;
	struct mptsub *mpts;

	/*
	 * Look for a subflow that is on a non-cellular interface
	 * and actually works (aka, no retransmission timeout).
	 */
	TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
		const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
		struct socket *so;
		struct tcpcb *tp;

		/* Skip subflows without an interface or on cellular */
		if (ifp == NULL || IFNET_IS_CELLULAR(ifp)) {
			continue;
		}

		so = mpts->mpts_socket;
		tp = sototcpcb(so);

		/* Only fully established subflows are considered */
		if (!(mpts->mpts_flags & MPTSF_CONNECTED) ||
		    tp->t_state != TCPS_ESTABLISHED) {
			continue;
		}

		os_log_debug(mptcp_log_handle, "%s - %lx: rxt %u sb_cc %u wifi quality %d\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), tp->t_rxtshift, mptetoso(mpte)->so_snd.sb_cc, wifi_quality);

		if (!mptcp_handover_use_cellular(mpte, tp)) {
			found_working_subflow = true;
			break;
		}
	}

	/*
	 * Couldn't find a working subflow, let's not remove those on a cellular
	 * interface.
	 */
	if (!found_working_subflow) {
		return;
	}

	mptcp_remove_cell_subflows(mpte);
}
1047 
/*
 * For the target-based service type: once there is a functioning
 * non-cellular subflow (and the target-time/Wi-Fi-quality check permits),
 * tear down all cellular subflows.
 */
static void
mptcp_targetbased_subflows_remove(struct mptses *mpte)
{
	uint64_t time_now = mach_continuous_time();
	struct mptsub *mpts;

	if (mpte->mpte_time_target != 0 &&
	    (int64_t)(mpte->mpte_time_target - time_now) <= 0 &&
	    mptcp_wifi_quality_for_session(mpte) != MPTCP_WIFI_QUALITY_GOOD) {
		/*
		 * WiFi is bad and the target time has been reached (or
		 * passed) - don't remove any subflows.
		 */
		return;
	}

	TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
		const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;

		/* Only a non-cellular subflow can justify dropping cell */
		if (ifp == NULL || IFNET_IS_CELLULAR(ifp)) {
			continue;
		}

		/* We have a functioning subflow on WiFi. No need for cell! */
		if (mpts->mpts_flags & MPTSF_CONNECTED &&
		    !mptcp_subflow_disconnecting(mpts)) {
			mptcp_remove_cell_subflows(mpte);
			break;
		}
	}
}
1076 
1077 /*
1078  * Based on the MPTCP Service-type and the state of the subflows, we
1079  * will destroy subflows here.
1080  */
1081 void
mptcp_check_subflows_and_remove(struct mptses * mpte)1082 mptcp_check_subflows_and_remove(struct mptses *mpte)
1083 {
1084 	if (!mptcp_ok_to_create_subflows(mpte->mpte_mptcb)) {
1085 		return;
1086 	}
1087 
1088 	socket_lock_assert_owned(mptetoso(mpte));
1089 
1090 	if (mpte->mpte_svctype == MPTCP_SVCTYPE_PURE_HANDOVER) {
1091 		mptcp_pure_handover_subflows_remove(mpte);
1092 	}
1093 
1094 	if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER) {
1095 		mptcp_handover_subflows_remove(mpte);
1096 	}
1097 
1098 	if (mpte->mpte_svctype == MPTCP_SVCTYPE_TARGET_BASED) {
1099 		mptcp_targetbased_subflows_remove(mpte);
1100 	}
1101 }
1102 
/*
 * Walk all subflows and reset (via SO_FILT_HINT_NOSRCADDR) those that were
 * explicitly flagged for closure (MPTSF_CLOSE_REQD) or whose
 * interface/address-family combination no longer appears in the session's
 * interface-info list.
 */
static void
mptcp_remove_subflows(struct mptses *mpte)
{
	struct mptsub *mpts, *tmpts;

	if (!mptcp_ok_to_create_subflows(mpte->mpte_mptcb)) {
		return;
	}

	TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
		const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
		boolean_t found = false;
		uint32_t ifindex;
		uint32_t i;

		/* NECP asked for this subflow to go away (see mptcp_subflow_necp_cb) */
		if (mpts->mpts_flags & MPTSF_CLOSE_REQD) {
			mpts->mpts_flags &= ~MPTSF_CLOSE_REQD;

			os_log(mptcp_log_handle, "%s - %lx: itf %u close_reqd last itf %d\n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpts->mpts_ifscope,
			    ifp ? ifp->if_index : -1);
			soevent(mpts->mpts_socket,
			    SO_FILT_HINT_LOCKED | SO_FILT_HINT_NOSRCADDR);

			continue;
		}

		/* No interface information at all - nothing to match against */
		if (ifp == NULL && mpts->mpts_ifscope == IFSCOPE_NONE) {
			continue;
		}

		/* Prefer the actual outgoing interface over the bound scope */
		if (ifp) {
			ifindex = ifp->if_index;
		} else {
			ifindex = mpts->mpts_ifscope;
		}

		/* Does the interface-info list still cover this subflow? */
		for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
			if (mpte->mpte_itfinfo[i].ifindex == IFSCOPE_NONE) {
				continue;
			}

			if (mpte->mpte_itfinfo[i].ifindex == ifindex) {
				/* v6 destinations also match via NAT64 connectivity */
				if (mpts->mpts_dst.sa_family == AF_INET6 &&
				    (mpte->mpte_itfinfo[i].has_v6_conn || mpte->mpte_itfinfo[i].has_nat64_conn)) {
					found = true;
					break;
				}

				if (mpts->mpts_dst.sa_family == AF_INET &&
				    mpte->mpte_itfinfo[i].has_v4_conn) {
					found = true;
					break;
				}
			}
		}

		if (!found) {
			os_log(mptcp_log_handle, "%s - %lx: itf %u killing %#x\n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
			    ifindex, mpts->mpts_flags);

			soevent(mpts->mpts_socket,
			    SO_FILT_HINT_LOCKED | SO_FILT_HINT_NOSRCADDR);
		}
	}
}
1170 
/*
 * Deferred worker (scheduled via timeout() from mptcp_sched_create_subflows)
 * that walks all MPTCP sessions and performs subflow creation/removal for
 * those that requested it via MPP_CREATE_SUBFLOWS.
 */
static void
mptcp_create_subflows(__unused void *arg)
{
	struct mppcb *mpp;

	/*
	 * Start with clearing, because we might be processing connections
	 * while a new event comes in.
	 */
	if (OSTestAndClear(0x01, &mptcp_create_subflows_scheduled)) {
		os_log_error(mptcp_log_handle, "%s: bit was already cleared!\n", __func__);
	}

	/* Iterate over all MPTCP connections */

	lck_mtx_lock(&mtcbinfo.mppi_lock);

	TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
		struct socket *mp_so = mpp->mpp_socket;
		struct mptses *mpte = mpp->mpp_pcbe;

		socket_lock(mp_so, 1);
		/* Only sessions that asked for maintenance and have itf-info */
		if (!(mpp->mpp_flags & MPP_CREATE_SUBFLOWS) ||
		    !(mpte->mpte_flags & MPTE_ITFINFO_INIT)) {
			socket_unlock(mp_so, 1);
			continue;
		}

		/* The scheduler took a use-count on our behalf */
		VERIFY(mp_so->so_usecount > 0);

		mpp->mpp_flags &= ~MPP_CREATE_SUBFLOWS;

		mptcp_check_subflows_and_add(mpte);
		mptcp_remove_subflows(mpte);

		mp_so->so_usecount--; /* See mptcp_sched_create_subflows */
		socket_unlock(mp_so, 1);
	}

	lck_mtx_unlock(&mtcbinfo.mppi_lock);
}
1212 
1213 /*
1214  * We need this because we are coming from an NECP-event. This event gets posted
1215  * while holding NECP-locks. The creation of the subflow however leads us back
1216  * into NECP (e.g., to add the necp_cb and also from tcp_connect).
1217  * So, we would deadlock there as we already hold the NECP-lock.
1218  *
1219  * So, let's schedule this separately. It also gives NECP the chance to make
1220  * progress, without having to wait for MPTCP to finish its subflow creation.
1221  */
void
mptcp_sched_create_subflows(struct mptses *mpte)
{
	struct mppcb *mpp = mpte->mpte_mppcb;
	struct mptcb *mp_tp = mpte->mpte_mptcb;
	struct socket *mp_so = mpp->mpp_socket;

	/* Bail out if the MPTCP connection state does not allow subflows */
	if (!mptcp_ok_to_create_subflows(mp_tp)) {
		os_log_debug(mptcp_log_handle, "%s - %lx: not a good time for subflows, state %u flags %#x",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mp_tp->mpt_state, mp_tp->mpt_flags);
		return;
	}

	/* Mark this session for the worker; the worker drops the use-count */
	if (!(mpp->mpp_flags & MPP_CREATE_SUBFLOWS)) {
		mp_so->so_usecount++; /* To prevent it from being free'd in-between */
		mpp->mpp_flags |= MPP_CREATE_SUBFLOWS;
	}

	/* Worker already scheduled - the pending run picks this session up */
	if (OSTestAndSet(0x01, &mptcp_create_subflows_scheduled)) {
		return;
	}

	/* Do the call in 100ms to allow NECP to schedule it on all sockets */
	timeout(mptcp_create_subflows, NULL, hz / 10);
}
1247 
1248 /*
1249  * Allocate an MPTCP socket option structure.
1250  */
1251 struct mptopt *
mptcp_sopt_alloc(void)1252 mptcp_sopt_alloc(void)
1253 {
1254 	return zalloc_flags(mptopt_zone, Z_WAITOK | Z_ZERO);
1255 }
1256 
1257 /*
1258  * Free an MPTCP socket option structure.
1259  */
1260 void
mptcp_sopt_free(struct mptopt * mpo)1261 mptcp_sopt_free(struct mptopt *mpo)
1262 {
1263 	VERIFY(!(mpo->mpo_flags & MPOF_ATTACHED));
1264 
1265 	zfree(mptopt_zone, mpo);
1266 }
1267 
1268 /*
1269  * Add a socket option to the MPTCP socket option list.
1270  */
1271 void
mptcp_sopt_insert(struct mptses * mpte,struct mptopt * mpo)1272 mptcp_sopt_insert(struct mptses *mpte, struct mptopt *mpo)
1273 {
1274 	socket_lock_assert_owned(mptetoso(mpte));
1275 	mpo->mpo_flags |= MPOF_ATTACHED;
1276 	TAILQ_INSERT_TAIL(&mpte->mpte_sopts, mpo, mpo_entry);
1277 }
1278 
1279 /*
1280  * Remove a socket option from the MPTCP socket option list.
1281  */
1282 void
mptcp_sopt_remove(struct mptses * mpte,struct mptopt * mpo)1283 mptcp_sopt_remove(struct mptses *mpte, struct mptopt *mpo)
1284 {
1285 	socket_lock_assert_owned(mptetoso(mpte));
1286 	VERIFY(mpo->mpo_flags & MPOF_ATTACHED);
1287 	mpo->mpo_flags &= ~MPOF_ATTACHED;
1288 	TAILQ_REMOVE(&mpte->mpte_sopts, mpo, mpo_entry);
1289 }
1290 
1291 /*
1292  * Search for an existing <sopt_level,sopt_name> socket option.
1293  */
1294 struct mptopt *
mptcp_sopt_find(struct mptses * mpte,struct sockopt * sopt)1295 mptcp_sopt_find(struct mptses *mpte, struct sockopt *sopt)
1296 {
1297 	struct mptopt *mpo;
1298 
1299 	socket_lock_assert_owned(mptetoso(mpte));
1300 
1301 	TAILQ_FOREACH(mpo, &mpte->mpte_sopts, mpo_entry) {
1302 		if (mpo->mpo_level == sopt->sopt_level &&
1303 		    mpo->mpo_name == sopt->sopt_name) {
1304 			break;
1305 		}
1306 	}
1307 	return mpo;
1308 }
1309 
1310 /*
1311  * Allocate a MPTCP subflow structure.
1312  */
1313 static struct mptsub *
mptcp_subflow_alloc(void)1314 mptcp_subflow_alloc(void)
1315 {
1316 	return zalloc_flags(mptsub_zone, Z_WAITOK | Z_ZERO);
1317 }
1318 
1319 /*
1320  * Deallocate a subflow structure, called when all of the references held
1321  * on it have been released.  This implies that the subflow has been deleted.
1322  */
static void
mptcp_subflow_free(struct mptsub *mpts)
{
	/* Must be fully unreferenced and detached at this point */
	VERIFY(mpts->mpts_refcnt == 0);
	VERIFY(mpts->mpts_mpte == NULL);
	VERIFY(mpts->mpts_socket == NULL);

	/* Release the (optional) source address before the structure itself */
	free_sockaddr(mpts->mpts_src);

	zfree(mptsub_zone, mpts);
}
1334 
/*
 * Take a reference on the subflow; panics on refcount wraparound.
 */
static void
mptcp_subflow_addref(struct mptsub *mpts)
{
	if (++mpts->mpts_refcnt == 0) {
		panic("%s: mpts %p wraparound refcnt", __func__, mpts);
		/* NOTREACHED */
	}
}
1343 
/*
 * Drop a reference on the subflow; frees the structure when the last
 * reference is released.
 */
static void
mptcp_subflow_remref(struct mptsub *mpts)
{
	if (mpts->mpts_refcnt == 0) {
		panic("%s: mpts %p negative refcnt", __func__, mpts);
		/* NOTREACHED */
	}
	if (--mpts->mpts_refcnt > 0) {
		return;
	}

	/* callee will unlock and destroy lock */
	mptcp_subflow_free(mpts);
}
1358 
/*
 * Link a freshly created subflow socket to its MPTCP session: switch the
 * subflow over to MPTCP-level locking and insert it into the session's
 * subflow list.
 */
static void
mptcp_subflow_attach(struct mptses *mpte, struct mptsub *mpts, struct socket *so)
{
	struct socket *mp_so = mpte->mpte_mppcb->mpp_socket;
	struct tcpcb *tp = sototcpcb(so);

	/*
	 * From this moment on, the subflow is linked to the MPTCP-connection.
	 * Locking,... happens now at the MPTCP-layer
	 */
	tp->t_mptcb = mpte->mpte_mptcb;
	so->so_flags |= SOF_MP_SUBFLOW;
	mp_so->so_usecount++;

	/*
	 * Insert the subflow into the list, and associate the MPTCP PCB
	 * as well as the subflow socket.  From this point on, removing
	 * the subflow needs to be done via mptcp_subflow_del().
	 */
	TAILQ_INSERT_TAIL(&mpte->mpte_subflows, mpts, mpts_entry);
	mpte->mpte_numflows++;

	mpts->mpts_mpte = mpte;
	mpts->mpts_socket = so;
	tp->t_mpsub = mpts;
	mptcp_subflow_addref(mpts);     /* for being in MPTCP subflow list */
	mptcp_subflow_addref(mpts);     /* for subflow socket */
}
1387 
/*
 * NECP callback for subflow sockets.  Invoked when NECP re-evaluates the
 * flow; if the subflow became non-viable (or its interface entered low-power
 * mode), flag it for closure and schedule the subflow-maintenance worker.
 */
static void
mptcp_subflow_necp_cb(void *handle, __unused int action,
    __unused uint32_t interface_index,
    uint32_t necp_flags, bool *viable)
{
	boolean_t low_power = !!(necp_flags & NECP_CLIENT_RESULT_FLAG_INTERFACE_LOW_POWER);
	struct inpcb *inp = (struct inpcb *)handle;
	struct socket *so = inp->inp_socket;
	struct mptsub *mpts;
	struct mptses *mpte;

	/* Treat a low-power interface the same as a non-viable flow */
	if (low_power) {
		action = NECP_CLIENT_CBACTION_NONVIABLE;
	}

	if (action != NECP_CLIENT_CBACTION_NONVIABLE) {
		return;
	}

	/*
	 * The socket is being garbage-collected. There is nothing to be done
	 * here.
	 */
	if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) == WNT_STOPUSING) {
		return;
	}

	socket_lock(so, 1);

	/* Check again after we acquired the lock. */
	if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) {
		goto out;
	}

	mpte = tptomptp(sototcpcb(so))->mpt_mpte;
	mpts = sototcpcb(so)->t_mpsub;

	os_log_debug(mptcp_log_handle, "%s - %lx: Subflow on itf %u became non-viable, power %u",
	    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpts->mpts_ifscope, low_power);

	/* mptcp_remove_subflows() will reset this subflow on its next run */
	mpts->mpts_flags |= MPTSF_CLOSE_REQD;

	mptcp_sched_create_subflows(mpte);

	/* For handover/target-based sessions, report the flow back as viable */
	if ((mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER ||
	    mpte->mpte_svctype == MPTCP_SVCTYPE_PURE_HANDOVER ||
	    mpte->mpte_svctype == MPTCP_SVCTYPE_TARGET_BASED) &&
	    viable != NULL) {
		*viable = 1;
	}

out:
	socket_unlock(so, 1);
}
1442 
1443 /*
1444  * Create an MPTCP subflow socket.
1445  */
1446 static int
mptcp_subflow_socreate(struct mptses * mpte,struct mptsub * mpts,int dom,struct socket ** so)1447 mptcp_subflow_socreate(struct mptses *mpte, struct mptsub *mpts, int dom,
1448     struct socket **so)
1449 {
1450 	lck_mtx_t *subflow_mtx;
1451 	struct mptopt smpo, *mpo, *tmpo;
1452 	struct proc *p;
1453 	struct socket *mp_so;
1454 	struct mppcb *mpp;
1455 	int error;
1456 
1457 	*so = NULL;
1458 
1459 	mp_so = mptetoso(mpte);
1460 	mpp = mpsotomppcb(mp_so);
1461 
1462 	p = proc_find(mp_so->last_pid);
1463 	if (p == PROC_NULL) {
1464 		os_log_error(mptcp_log_handle, "%s - %lx: Couldn't find proc for pid %u\n",
1465 		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mp_so->last_pid);
1466 
1467 		mptcp_subflow_free(mpts);
1468 		return ESRCH;
1469 	}
1470 
1471 	/*
1472 	 * Create the subflow socket (multipath subflow, non-blocking.)
1473 	 *
1474 	 * This will cause SOF_MP_SUBFLOW socket flag to be set on the subflow
1475 	 * socket; it will be cleared when the socket is peeled off or closed.
1476 	 * It also indicates to the underlying TCP to handle MPTCP options.
1477 	 * A multipath subflow socket implies SS_NOFDREF state.
1478 	 */
1479 
1480 	/*
1481 	 * Unlock, because tcp_usr_attach ends up in in_pcballoc, which takes
1482 	 * the ipi-lock. We cannot hold the socket-lock at that point.
1483 	 */
1484 	socket_unlock(mp_so, 0);
1485 	error = socreate_internal(dom, so, SOCK_STREAM, IPPROTO_TCP, p,
1486 	    SOCF_MPTCP, PROC_NULL);
1487 	socket_lock(mp_so, 0);
1488 	if (error) {
1489 		os_log_error(mptcp_log_handle, "%s - %lx: unable to create subflow socket error %d\n",
1490 		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error);
1491 
1492 		proc_rele(p);
1493 
1494 		mptcp_subflow_free(mpts);
1495 		return error;
1496 	}
1497 
1498 	/*
1499 	 * We need to protect the setting of SOF_MP_SUBFLOW with a lock, because
1500 	 * this marks the moment of lock-switch from the TCP-lock to the MPTCP-lock.
1501 	 * Which is why we also need to get the lock with pr_getlock, as after
1502 	 * setting the flag, socket_unlock will work on the MPTCP-level lock.
1503 	 */
1504 	subflow_mtx = ((*so)->so_proto->pr_getlock)(*so, 0);
1505 	lck_mtx_lock(subflow_mtx);
1506 
1507 	/*
1508 	 * Must be the first thing we do, to make sure all pointers for this
1509 	 * subflow are set.
1510 	 */
1511 	mptcp_subflow_attach(mpte, mpts, *so);
1512 
1513 	/*
1514 	 * A multipath subflow socket is used internally in the kernel,
1515 	 * therefore it does not have a file desciptor associated by
1516 	 * default.
1517 	 */
1518 	(*so)->so_state |= SS_NOFDREF;
1519 
1520 	lck_mtx_unlock(subflow_mtx);
1521 
1522 	/* prevent the socket buffers from being compressed */
1523 	(*so)->so_rcv.sb_flags |= SB_NOCOMPRESS;
1524 	(*so)->so_snd.sb_flags |= SB_NOCOMPRESS;
1525 
1526 	/* Inherit preconnect and TFO data flags */
1527 	if (mp_so->so_flags1 & SOF1_PRECONNECT_DATA) {
1528 		(*so)->so_flags1 |= SOF1_PRECONNECT_DATA;
1529 	}
1530 	if (mp_so->so_flags1 & SOF1_DATA_IDEMPOTENT) {
1531 		(*so)->so_flags1 |= SOF1_DATA_IDEMPOTENT;
1532 	}
1533 	if (mp_so->so_flags1 & SOF1_DATA_AUTHENTICATED) {
1534 		(*so)->so_flags1 |= SOF1_DATA_AUTHENTICATED;
1535 	}
1536 
1537 	/* Inherit uuid and create the related flow. */
1538 	if (!uuid_is_null(mpp->necp_client_uuid)) {
1539 		struct mptcb *mp_tp = mpte->mpte_mptcb;
1540 
1541 		sotoinpcb(*so)->necp_cb = mptcp_subflow_necp_cb;
1542 
1543 		/*
1544 		 * A note on the unlock: With MPTCP, we do multiple times a
1545 		 * necp_client_register_socket_flow. This is problematic,
1546 		 * because now the lock-ordering guarantee (first necp-locks,
1547 		 * then socket-locks) is no more respected. So, we need to
1548 		 * unlock here.
1549 		 */
1550 		socket_unlock(mp_so, 0);
1551 		error = necp_client_register_socket_flow(mp_so->last_pid,
1552 		    mpp->necp_client_uuid, sotoinpcb(*so));
1553 		socket_lock(mp_so, 0);
1554 
1555 		if (error) {
1556 			os_log_error(mptcp_log_handle, "%s - %lx: necp_client_register_socket_flow failed with error %d\n",
1557 			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error);
1558 
1559 			goto out_err;
1560 		}
1561 
1562 		/* Possible state-change during the unlock above */
1563 		if (mp_tp->mpt_state >= MPTCPS_TIME_WAIT ||
1564 		    (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP)) {
1565 			os_log_error(mptcp_log_handle, "%s - %lx: state changed during unlock: %u flags %#x\n",
1566 			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
1567 			    mp_tp->mpt_state, mp_tp->mpt_flags);
1568 
1569 			error = EINVAL;
1570 			goto out_err;
1571 		}
1572 
1573 		uuid_copy(sotoinpcb(*so)->necp_client_uuid, mpp->necp_client_uuid);
1574 	}
1575 
1576 	if (mpp->inp_necp_attributes.inp_domain != NULL) {
1577 		size_t string_size = strlen(mpp->inp_necp_attributes.inp_domain);
1578 		sotoinpcb(*so)->inp_necp_attributes.inp_domain = kalloc_data(string_size + 1, Z_WAITOK | Z_ZERO);
1579 
1580 		if (sotoinpcb(*so)->inp_necp_attributes.inp_domain) {
1581 			memcpy(sotoinpcb(*so)->inp_necp_attributes.inp_domain, mpp->inp_necp_attributes.inp_domain, string_size + 1);
1582 		}
1583 	}
1584 	if (mpp->inp_necp_attributes.inp_account != NULL) {
1585 		size_t string_size = strlen(mpp->inp_necp_attributes.inp_account);
1586 		sotoinpcb(*so)->inp_necp_attributes.inp_account = kalloc_data(string_size + 1, Z_WAITOK | Z_ZERO);
1587 
1588 		if (sotoinpcb(*so)->inp_necp_attributes.inp_account) {
1589 			memcpy(sotoinpcb(*so)->inp_necp_attributes.inp_account, mpp->inp_necp_attributes.inp_account, string_size + 1);
1590 		}
1591 	}
1592 
1593 	if (mpp->inp_necp_attributes.inp_domain_owner != NULL) {
1594 		size_t string_size = strlen(mpp->inp_necp_attributes.inp_domain_owner);
1595 		sotoinpcb(*so)->inp_necp_attributes.inp_domain_owner = kalloc_data(string_size + 1, Z_WAITOK | Z_ZERO);
1596 
1597 		if (sotoinpcb(*so)->inp_necp_attributes.inp_domain_owner) {
1598 			memcpy(sotoinpcb(*so)->inp_necp_attributes.inp_domain_owner, mpp->inp_necp_attributes.inp_domain_owner, string_size + 1);
1599 		}
1600 	}
1601 
1602 	if (mpp->inp_necp_attributes.inp_tracker_domain != NULL) {
1603 		size_t string_size = strlen(mpp->inp_necp_attributes.inp_tracker_domain);
1604 		sotoinpcb(*so)->inp_necp_attributes.inp_tracker_domain = kalloc_data(string_size + 1, Z_WAITOK | Z_ZERO);
1605 
1606 		if (sotoinpcb(*so)->inp_necp_attributes.inp_tracker_domain) {
1607 			memcpy(sotoinpcb(*so)->inp_necp_attributes.inp_tracker_domain, mpp->inp_necp_attributes.inp_tracker_domain, string_size + 1);
1608 		}
1609 	}
1610 
1611 	/* Needs to happen prior to the delegation! */
1612 	(*so)->last_pid = mp_so->last_pid;
1613 
1614 	if (mp_so->so_flags & SOF_DELEGATED) {
1615 		if (mpte->mpte_epid) {
1616 			error = so_set_effective_pid(*so, mpte->mpte_epid, p, false);
1617 			if (error) {
1618 				os_log_error(mptcp_log_handle, "%s - %lx: so_set_effective_pid failed with error %d\n",
1619 				    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error);
1620 				goto out_err;
1621 			}
1622 		}
1623 		if (!uuid_is_null(mpte->mpte_euuid)) {
1624 			error = so_set_effective_uuid(*so, mpte->mpte_euuid, p, false);
1625 			if (error) {
1626 				os_log_error(mptcp_log_handle, "%s - %lx: so_set_effective_uuid failed with error %d\n",
1627 				    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error);
1628 				goto out_err;
1629 			}
1630 		}
1631 	}
1632 
1633 	/* inherit the other socket options */
1634 	bzero(&smpo, sizeof(smpo));
1635 	smpo.mpo_flags |= MPOF_SUBFLOW_OK;
1636 	smpo.mpo_level = SOL_SOCKET;
1637 	smpo.mpo_intval = 1;
1638 
1639 	/* disable SIGPIPE */
1640 	smpo.mpo_name = SO_NOSIGPIPE;
1641 	if ((error = mptcp_subflow_sosetopt(mpte, mpts, &smpo)) != 0) {
1642 		goto out_err;
1643 	}
1644 
1645 	/* find out if the subflow's source address goes away */
1646 	smpo.mpo_name = SO_NOADDRERR;
1647 	if ((error = mptcp_subflow_sosetopt(mpte, mpts, &smpo)) != 0) {
1648 		goto out_err;
1649 	}
1650 
1651 	if (mpte->mpte_mptcb->mpt_state >= MPTCPS_ESTABLISHED) {
1652 		/*
1653 		 * On secondary subflows we might need to set the cell-fallback
1654 		 * flag (see conditions in mptcp_subflow_sosetopt).
1655 		 */
1656 		smpo.mpo_level = SOL_SOCKET;
1657 		smpo.mpo_name = SO_MARK_CELLFALLBACK;
1658 		smpo.mpo_intval = 1;
1659 		if ((error = mptcp_subflow_sosetopt(mpte, mpts, &smpo)) != 0) {
1660 			goto out_err;
1661 		}
1662 	}
1663 
1664 	/* replay setsockopt(2) on the subflow sockets for eligible options */
1665 	TAILQ_FOREACH_SAFE(mpo, &mpte->mpte_sopts, mpo_entry, tmpo) {
1666 		int interim;
1667 
1668 		if (!(mpo->mpo_flags & MPOF_SUBFLOW_OK)) {
1669 			continue;
1670 		}
1671 
1672 		/*
1673 		 * Skip those that are handled internally; these options
1674 		 * should not have been recorded and marked with the
1675 		 * MPOF_SUBFLOW_OK by mptcp_setopt(), but just in case.
1676 		 */
1677 		if (mpo->mpo_level == SOL_SOCKET &&
1678 		    (mpo->mpo_name == SO_NOSIGPIPE ||
1679 		    mpo->mpo_name == SO_NOADDRERR ||
1680 		    mpo->mpo_name == SO_KEEPALIVE)) {
1681 			continue;
1682 		}
1683 
1684 		interim = (mpo->mpo_flags & MPOF_INTERIM);
1685 		if (mptcp_subflow_sosetopt(mpte, mpts, mpo) != 0 && interim) {
1686 			os_log_error(mptcp_log_handle, "%s - %lx: sopt %s val %d interim record removed\n",
1687 			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
1688 			    mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name),
1689 			    mpo->mpo_intval);
1690 			mptcp_sopt_remove(mpte, mpo);
1691 			mptcp_sopt_free(mpo);
1692 			continue;
1693 		}
1694 	}
1695 
1696 	/*
1697 	 * We need to receive everything that the subflow socket has,
1698 	 * so use a customized socket receive function.  We will undo
1699 	 * this when the socket is peeled off or closed.
1700 	 */
1701 	switch (dom) {
1702 	case PF_INET:
1703 		(*so)->so_proto = &mptcp_subflow_protosw;
1704 		break;
1705 	case PF_INET6:
1706 		(*so)->so_proto = (struct protosw *)&mptcp_subflow_protosw6;
1707 		break;
1708 	default:
1709 		VERIFY(0);
1710 		/* NOTREACHED */
1711 	}
1712 
1713 	proc_rele(p);
1714 
1715 	DTRACE_MPTCP3(subflow__create, struct mptses *, mpte,
1716 	    int, dom, int, error);
1717 
1718 	return 0;
1719 
1720 out_err:
1721 	mptcp_subflow_abort(mpts, error);
1722 
1723 	proc_rele(p);
1724 
1725 	return error;
1726 }
1727 
1728 /*
1729  * Close an MPTCP subflow socket.
1730  *
1731  * Note that this may be called on an embryonic subflow, and the only
1732  * thing that is guaranteed valid is the protocol-user request.
1733  */
1734 static void
mptcp_subflow_soclose(struct mptsub * mpts)1735 mptcp_subflow_soclose(struct mptsub *mpts)
1736 {
1737 	struct socket *so = mpts->mpts_socket;
1738 
1739 	if (mpts->mpts_flags & MPTSF_CLOSED) {
1740 		return;
1741 	}
1742 
1743 	VERIFY(so != NULL);
1744 	VERIFY(so->so_flags & SOF_MP_SUBFLOW);
1745 	VERIFY((so->so_state & (SS_NBIO | SS_NOFDREF)) == (SS_NBIO | SS_NOFDREF));
1746 
1747 	DTRACE_MPTCP5(subflow__close, struct mptsub *, mpts,
1748 	    struct socket *, so,
1749 	    struct sockbuf *, &so->so_rcv,
1750 	    struct sockbuf *, &so->so_snd,
1751 	    struct mptses *, mpts->mpts_mpte);
1752 
1753 	mpts->mpts_flags |= MPTSF_CLOSED;
1754 
1755 	if (so->so_retaincnt == 0) {
1756 		soclose_locked(so);
1757 
1758 		return;
1759 	} else {
1760 		VERIFY(so->so_usecount > 0);
1761 		so->so_usecount--;
1762 	}
1763 
1764 	return;
1765 }
1766 
1767 static void
mptcp_attach_to_subf(struct socket * so,struct mptcb * mp_tp,uint8_t addr_id)1768 mptcp_attach_to_subf(struct socket *so, struct mptcb *mp_tp, uint8_t addr_id)
1769 {
1770 	struct tcpcb *tp = sototcpcb(so);
1771 	struct mptcp_subf_auth_entry *sauth_entry;
1772 
1773 	/*
1774 	 * The address ID of the first flow is implicitly 0.
1775 	 */
1776 	if (mp_tp->mpt_state == MPTCPS_CLOSED) {
1777 		tp->t_local_aid = 0;
1778 	} else {
1779 		tp->t_local_aid = addr_id;
1780 		tp->t_mpflags |= (TMPF_PREESTABLISHED | TMPF_JOINED_FLOW);
1781 		so->so_flags |= SOF_MP_SEC_SUBFLOW;
1782 	}
1783 	sauth_entry = zalloc(mpt_subauth_zone);
1784 	sauth_entry->msae_laddr_id = tp->t_local_aid;
1785 	sauth_entry->msae_raddr_id = 0;
1786 	sauth_entry->msae_raddr_rand = 0;
1787 try_again:
1788 	sauth_entry->msae_laddr_rand = RandomULong();
1789 	if (sauth_entry->msae_laddr_rand == 0) {
1790 		goto try_again;
1791 	}
1792 	LIST_INSERT_HEAD(&mp_tp->mpt_subauth_list, sauth_entry, msae_next);
1793 }
1794 
1795 static void
mptcp_detach_mptcb_from_subf(struct mptcb * mp_tp,struct socket * so)1796 mptcp_detach_mptcb_from_subf(struct mptcb *mp_tp, struct socket *so)
1797 {
1798 	struct mptcp_subf_auth_entry *sauth_entry;
1799 	struct tcpcb *tp = NULL;
1800 	int found = 0;
1801 
1802 	tp = sototcpcb(so);
1803 	if (tp == NULL) {
1804 		return;
1805 	}
1806 
1807 	LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
1808 		if (sauth_entry->msae_laddr_id == tp->t_local_aid) {
1809 			found = 1;
1810 			break;
1811 		}
1812 	}
1813 	if (found) {
1814 		LIST_REMOVE(sauth_entry, msae_next);
1815 	}
1816 
1817 	if (found) {
1818 		zfree(mpt_subauth_zone, sauth_entry);
1819 	}
1820 }
1821 
1822 /*
1823  * Connect an MPTCP subflow socket.
1824  *
1825  * Note that in the pending connect case, the subflow socket may have been
1826  * bound to an interface and/or a source IP address which may no longer be
1827  * around by the time this routine is called; in that case the connect attempt
1828  * will most likely fail.
1829  */
static int
mptcp_subflow_soconnectx(struct mptses *mpte, struct mptsub *mpts)
{
	char dbuf[MAX_IPv6_STR_LEN];
	struct socket *mp_so, *so;
	struct mptcb *mp_tp;
	struct sockaddr *dst;
	struct proc *p;
	int af, error, dport;

	mp_so = mptetoso(mpte);
	mp_tp = mpte->mpte_mptcb;
	so = mpts->mpts_socket;
	af = mpts->mpts_dst.sa_family;
	dst = &mpts->mpts_dst;

	/* Must be mid-connect: CONNECTING set, CONNECTED not yet */
	VERIFY((mpts->mpts_flags & (MPTSF_CONNECTING | MPTSF_CONNECTED)) == MPTSF_CONNECTING);
	VERIFY(mpts->mpts_socket != NULL);
	VERIFY(af == AF_INET || af == AF_INET6);

	/* Stringify the destination for the log statement below */
	if (af == AF_INET) {
		inet_ntop(af, &SIN(dst)->sin_addr.s_addr, dbuf, sizeof(dbuf));
		dport = ntohs(SIN(dst)->sin_port);
	} else {
		inet_ntop(af, &SIN6(dst)->sin6_addr, dbuf, sizeof(dbuf));
		dport = ntohs(SIN6(dst)->sin6_port);
	}

	os_log(mptcp_log_handle,
	    "%s - %lx: ifindex %u dst %s:%d pended %u\n", __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
	    mpts->mpts_ifscope, dbuf, dport, !!(mpts->mpts_flags & MPTSF_CONNECT_PENDING));

	p = proc_find(mp_so->last_pid);
	if (p == PROC_NULL) {
		os_log_error(mptcp_log_handle, "%s - %lx: Couldn't find proc for pid %u\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mp_so->last_pid);

		return ESRCH;
	}

	mpts->mpts_flags &= ~MPTSF_CONNECT_PENDING;

	/* Set up MPTCP auth state (local address ID and random) */
	mptcp_attach_to_subf(so, mpte->mpte_mptcb, mpte->mpte_addrid_last);

	/* connect the subflow socket */
	error = soconnectxlocked(so, mpts->mpts_src, &mpts->mpts_dst,
	    p, mpts->mpts_ifscope,
	    mpte->mpte_associd, NULL, 0, NULL, 0, NULL, NULL);

	mpts->mpts_iss = sototcpcb(so)->iss;

	/* See tcp_connect_complete */
	if (mp_tp->mpt_state < MPTCPS_ESTABLISHED &&
	    (mp_so->so_flags1 & SOF1_PRECONNECT_DATA)) {
		mp_tp->mpt_sndwnd = sototcpcb(so)->snd_wnd;
	}

	/* Allocate a unique address id per subflow */
	mpte->mpte_addrid_last++;
	if (mpte->mpte_addrid_last == 0) {
		/* skip 0 - it is implicitly used by the first subflow */
		mpte->mpte_addrid_last++;
	}

	proc_rele(p);

	DTRACE_MPTCP3(subflow__connect, struct mptses *, mpte,
	    struct mptsub *, mpts, int, error);
	if (error) {
		os_log_error(mptcp_log_handle, "%s - %lx: connectx failed with error %d ifscope %u\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error, mpts->mpts_ifscope);
	}

	return error;
}
1904 
/*
 * Adjust the DSS mapping on mbuf `m`, which sits `off` bytes into the mapping
 * described by (dsn, rseq, dlen, dfin).  If the mbuf extends beyond the
 * mapping's right edge it is split so that `m` covers exactly the remainder
 * of the mapping.  Returns 0 on success; -1 when the subflow must be reset
 * (an SO_FILT_HINT_MUSTRST event has already been posted in that case).
 */
static int
mptcp_adj_rmap(struct socket *so, struct mbuf *m, int off, uint64_t dsn,
    uint32_t rseq, uint16_t dlen, uint8_t dfin)
{
	struct mptsub *mpts = sototcpcb(so)->t_mpsub;

	/* Zero-length mbufs carry no data to map */
	if (m_pktlen(m) == 0) {
		return 0;
	}

	/* Only packet headers can carry a DSS mapping */
	if (!(m->m_flags & M_PKTHDR)) {
		return 0;
	}

	if (m->m_pkthdr.pkt_flags & PKTF_MPTCP) {
		/*
		 * The mbuf already carries a mapping.  While in the middle of
		 * consuming a mapping (off != 0), the peer must not present a
		 * different mapping for the same data; treat a mismatch as a
		 * protocol violation and reset the subflow.
		 */
		if (off && (dsn != m->m_pkthdr.mp_dsn ||
		    rseq != m->m_pkthdr.mp_rseq ||
		    dlen != m->m_pkthdr.mp_rlen ||
		    dfin != !!(m->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN))) {
			os_log_error(mptcp_log_handle, "%s - %lx: Received incorrect second mapping: DSN: %u - %u , SSN: %u - %u, DLEN: %u - %u, DFIN: %u - %u\n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpts->mpts_mpte),
			    (uint32_t)dsn, (uint32_t)m->m_pkthdr.mp_dsn,
			    rseq, m->m_pkthdr.mp_rseq,
			    dlen, m->m_pkthdr.mp_rlen,
			    dfin, !!(m->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN));

			soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
			return -1;
		}
	}

	/* If mbuf is beyond right edge of the mapping, we need to split */
	if (m_pktlen(m) > dlen - dfin - off) {
		struct mbuf *new = m_split(m, dlen - dfin - off, M_DONTWAIT);
		if (new == NULL) {
			os_log_error(mptcp_log_handle, "%s - %lx: m_split failed dlen %u dfin %u off %d pktlen %d, killing subflow %d",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpts->mpts_mpte),
			    dlen, dfin, off, m_pktlen(m),
			    mpts->mpts_connid);

			soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
			return -1;
		}

		/* Keep the split-off remainder chained and accounted for */
		m->m_next = new;
		sballoc(&so->so_rcv, new);
		/* Undo, as sballoc will add to it as well */
		so->so_rcv.sb_cc -= new->m_len;

		if (so->so_rcv.sb_mbtail == m) {
			so->so_rcv.sb_mbtail = new;
		}
	}

	/* Stamp the offset-adjusted mapping onto this mbuf */
	m->m_pkthdr.pkt_flags |= PKTF_MPTCP;
	m->m_pkthdr.mp_dsn = dsn + off;
	m->m_pkthdr.mp_rseq = rseq + off;
	VERIFY(m_pktlen(m) < UINT16_MAX);
	m->m_pkthdr.mp_rlen = (uint16_t)m_pktlen(m);

	/* Only put the DATA_FIN-flag on the last mbuf of this mapping */
	if (dfin) {
		if (m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen < dsn + dlen - dfin) {
			m->m_pkthdr.pkt_flags &= ~PKTF_MPTCP_DFIN;
		} else {
			m->m_pkthdr.pkt_flags |= PKTF_MPTCP_DFIN;
		}
	}


	/* Data with a valid mapping arrived: the subflow is fully established */
	mpts->mpts_flags |= MPTSF_FULLY_ESTABLISHED;

	return 0;
}
1979 
1980 /*
1981  * Update the pid, upid, uuid of the subflow so, based on parent so
1982  */
1983 static void
mptcp_update_last_owner(struct socket * so,struct socket * mp_so)1984 mptcp_update_last_owner(struct socket *so, struct socket *mp_so)
1985 {
1986 	if (so->last_pid != mp_so->last_pid ||
1987 	    so->last_upid != mp_so->last_upid) {
1988 		so->last_upid = mp_so->last_upid;
1989 		so->last_pid = mp_so->last_pid;
1990 		uuid_copy(so->last_uuid, mp_so->last_uuid);
1991 	}
1992 	so_update_policy(so);
1993 }
1994 
1995 /*
1996  * MPTCP subflow socket receive routine, derived from soreceive().
1997  */
1998 static int
mptcp_subflow_soreceive(struct socket * so,struct sockaddr ** psa,struct uio * uio,struct mbuf ** mp0,struct mbuf ** controlp,int * flagsp)1999 mptcp_subflow_soreceive(struct socket *so, struct sockaddr **psa,
2000     struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
2001 {
2002 #pragma unused(uio)
2003 	struct socket *mp_so;
2004 	struct mptses *mpte;
2005 	struct mptcb *mp_tp;
2006 	int flags, error = 0;
2007 	struct mbuf *m, **mp = mp0;
2008 	struct tcpcb *tp = sototcpcb(so);
2009 
2010 	mpte = tptomptp(sototcpcb(so))->mpt_mpte;
2011 	mp_so = mptetoso(mpte);
2012 	mp_tp = mpte->mpte_mptcb;
2013 
2014 	VERIFY(so->so_proto->pr_flags & PR_CONNREQUIRED);
2015 
2016 #ifdef MORE_LOCKING_DEBUG
2017 	if (so->so_usecount == 1) {
2018 		panic("%s: so=%x no other reference on socket", __func__, so);
2019 		/* NOTREACHED */
2020 	}
2021 #endif
2022 	/*
2023 	 * We return all that is there in the subflow's socket receive buffer
2024 	 * to the MPTCP layer, so we require that the caller passes in the
2025 	 * expected parameters.
2026 	 */
2027 	if (mp == NULL || controlp != NULL) {
2028 		return EINVAL;
2029 	}
2030 
2031 	*mp = NULL;
2032 	if (psa != NULL) {
2033 		*psa = NULL;
2034 	}
2035 	if (flagsp != NULL) {
2036 		flags = *flagsp & ~MSG_EOR;
2037 	} else {
2038 		flags = 0;
2039 	}
2040 
2041 	if (flags & (MSG_PEEK | MSG_OOB | MSG_NEEDSA | MSG_WAITALL | MSG_WAITSTREAM)) {
2042 		return EOPNOTSUPP;
2043 	}
2044 
2045 	flags |= (MSG_DONTWAIT | MSG_NBIO);
2046 
2047 	/*
2048 	 * If a recv attempt is made on a previously-accepted socket
2049 	 * that has been marked as inactive (disconnected), reject
2050 	 * the request.
2051 	 */
2052 	if (so->so_flags & SOF_DEFUNCT) {
2053 		struct sockbuf *sb = &so->so_rcv;
2054 
2055 		error = ENOTCONN;
2056 		/*
2057 		 * This socket should have been disconnected and flushed
2058 		 * prior to being returned from sodefunct(); there should
2059 		 * be no data on its receive list, so panic otherwise.
2060 		 */
2061 		if (so->so_state & SS_DEFUNCT) {
2062 			sb_empty_assert(sb, __func__);
2063 		}
2064 		return error;
2065 	}
2066 
2067 	/*
2068 	 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
2069 	 * and if so just return to the caller.  This could happen when
2070 	 * soreceive() is called by a socket upcall function during the
2071 	 * time the socket is freed.  The socket buffer would have been
2072 	 * locked across the upcall, therefore we cannot put this thread
2073 	 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
2074 	 * we may livelock), because the lock on the socket buffer will
2075 	 * only be released when the upcall routine returns to its caller.
2076 	 * Because the socket has been officially closed, there can be
2077 	 * no further read on it.
2078 	 *
2079 	 * A multipath subflow socket would have its SS_NOFDREF set by
2080 	 * default, so check for SOF_MP_SUBFLOW socket flag; when the
2081 	 * socket is closed for real, SOF_MP_SUBFLOW would be cleared.
2082 	 */
2083 	if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
2084 	    (SS_NOFDREF | SS_CANTRCVMORE) && !(so->so_flags & SOF_MP_SUBFLOW)) {
2085 		return 0;
2086 	}
2087 
2088 	/*
2089 	 * For consistency with soreceive() semantics, we need to obey
2090 	 * SB_LOCK in case some other code path has locked the buffer.
2091 	 */
2092 	error = sblock(&so->so_rcv, 0);
2093 	if (error != 0) {
2094 		return error;
2095 	}
2096 
2097 	m = so->so_rcv.sb_mb;
2098 	if (m == NULL) {
2099 		/*
2100 		 * Panic if we notice inconsistencies in the socket's
2101 		 * receive list; both sb_mb and sb_cc should correctly
2102 		 * reflect the contents of the list, otherwise we may
2103 		 * end up with false positives during select() or poll()
2104 		 * which could put the application in a bad state.
2105 		 */
2106 		SB_MB_CHECK(&so->so_rcv);
2107 
2108 		if (so->so_error != 0) {
2109 			error = so->so_error;
2110 			so->so_error = 0;
2111 			goto release;
2112 		}
2113 
2114 		if (so->so_state & SS_CANTRCVMORE) {
2115 			goto release;
2116 		}
2117 
2118 		if (!(so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING))) {
2119 			error = ENOTCONN;
2120 			goto release;
2121 		}
2122 
2123 		/*
2124 		 * MSG_DONTWAIT is implicitly defined and this routine will
2125 		 * never block, so return EWOULDBLOCK when there is nothing.
2126 		 */
2127 		error = EWOULDBLOCK;
2128 		goto release;
2129 	}
2130 
2131 	mptcp_update_last_owner(so, mp_so);
2132 
2133 	SBLASTRECORDCHK(&so->so_rcv, "mptcp_subflow_soreceive 1");
2134 	SBLASTMBUFCHK(&so->so_rcv, "mptcp_subflow_soreceive 1");
2135 
2136 	while (m != NULL) {
2137 		int dlen = 0, error_out = 0, off = 0;
2138 		uint8_t dfin = 0;
2139 		struct mbuf *start = m;
2140 		uint64_t dsn;
2141 		uint32_t sseq;
2142 		uint16_t orig_dlen;
2143 		uint16_t csum;
2144 
2145 		VERIFY(m->m_nextpkt == NULL);
2146 
2147 		if (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) {
2148 fallback:
2149 			/* Just move mbuf to MPTCP-level */
2150 
2151 			sbfree(&so->so_rcv, m);
2152 
2153 			if (mp != NULL) {
2154 				*mp = m;
2155 				mp = &m->m_next;
2156 				so->so_rcv.sb_mb = m = m->m_next;
2157 				*mp = NULL;
2158 			}
2159 
2160 			if (m != NULL) {
2161 				so->so_rcv.sb_lastrecord = m;
2162 			} else {
2163 				SB_EMPTY_FIXUP(&so->so_rcv);
2164 			}
2165 
2166 			continue;
2167 		} else if (!(m->m_flags & M_PKTHDR) || !(m->m_pkthdr.pkt_flags & PKTF_MPTCP)) {
2168 			struct mptsub *mpts = sototcpcb(so)->t_mpsub;
2169 			boolean_t found_mapping = false;
2170 			int parsed_length = 0;
2171 			struct mbuf *m_iter;
2172 
2173 			/*
2174 			 * No MPTCP-option in the header. Either fallback or
2175 			 * wait for additional mappings.
2176 			 */
2177 			if (!(mpts->mpts_flags & MPTSF_FULLY_ESTABLISHED)) {
2178 				/* data arrived without a DSS option mapping */
2179 
2180 				/* initial subflow can fallback right after SYN handshake */
2181 				if (mpts->mpts_flags & MPTSF_INITIAL_SUB) {
2182 					mptcp_notify_mpfail(so);
2183 
2184 					goto fallback;
2185 				} else {
2186 					os_log_error(mptcp_log_handle, "%s - %lx: No DSS on secondary subflow. Killing %d\n",
2187 					    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
2188 					    mpts->mpts_connid);
2189 					soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
2190 
2191 					error = EIO;
2192 					*mp0 = NULL;
2193 					goto release;
2194 				}
2195 			}
2196 
2197 			/* Thus, let's look for an mbuf with the mapping */
2198 			m_iter = m->m_next;
2199 			parsed_length = m->m_len;
2200 			while (m_iter != NULL && parsed_length < UINT16_MAX) {
2201 				if (!(m_iter->m_flags & M_PKTHDR) || !(m_iter->m_pkthdr.pkt_flags & PKTF_MPTCP)) {
2202 					parsed_length += m_iter->m_len;
2203 					m_iter = m_iter->m_next;
2204 					continue;
2205 				}
2206 
2207 				found_mapping = true;
2208 
2209 				/* Found an mbuf with a DSS-mapping */
2210 				orig_dlen = dlen = m_iter->m_pkthdr.mp_rlen;
2211 				dsn = m_iter->m_pkthdr.mp_dsn;
2212 				sseq = m_iter->m_pkthdr.mp_rseq;
2213 				csum = m_iter->m_pkthdr.mp_csum;
2214 
2215 				if (m_iter->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN) {
2216 					dfin = 1;
2217 					dlen--;
2218 				}
2219 
2220 				break;
2221 			}
2222 
2223 			if (!found_mapping && parsed_length < UINT16_MAX) {
2224 				/* Mapping not yet present, we can wait! */
2225 				if (*mp0 == NULL) {
2226 					error = EWOULDBLOCK;
2227 				}
2228 				goto release;
2229 			} else if (!found_mapping && parsed_length >= UINT16_MAX) {
2230 				os_log_error(mptcp_log_handle, "%s - %lx: Received more than 64KB without DSS mapping. Killing %d\n",
2231 				    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
2232 				    mpts->mpts_connid);
2233 				/* Received 64KB without DSS-mapping. We should kill the subflow */
2234 				soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
2235 
2236 				error = EIO;
2237 				*mp0 = NULL;
2238 				goto release;
2239 			}
2240 		} else {
2241 			orig_dlen = dlen = m->m_pkthdr.mp_rlen;
2242 			dsn = m->m_pkthdr.mp_dsn;
2243 			sseq = m->m_pkthdr.mp_rseq;
2244 			csum = m->m_pkthdr.mp_csum;
2245 
2246 			if (m->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN) {
2247 				dfin = 1;
2248 				dlen--;
2249 			}
2250 		}
2251 
2252 		/* Now, see if we need to remove previous packets */
2253 		if (SEQ_GT(sseq + tp->irs, tp->rcv_nxt - so->so_rcv.sb_cc)) {
2254 			/* Ok, there is data in there that we don't need - let's throw it away! */
2255 			int totrim = (int)sseq + tp->irs - (tp->rcv_nxt - so->so_rcv.sb_cc);
2256 
2257 			sbdrop(&so->so_rcv, totrim);
2258 
2259 			m = so->so_rcv.sb_mb;
2260 		}
2261 
2262 		/*
2263 		 * Check if the full mapping is now present
2264 		 */
2265 		if ((int)so->so_rcv.sb_cc < dlen) {
2266 			if (*mp0 == NULL) {
2267 				error = EWOULDBLOCK;
2268 			}
2269 			goto release;
2270 		}
2271 
2272 		/* Now, get the full mapping */
2273 		off = 0;
2274 		while (dlen > 0) {
2275 			if (mptcp_adj_rmap(so, m, off, dsn, sseq, orig_dlen, dfin)) {
2276 				error_out = 1;
2277 				error = EIO;
2278 				dlen = 0;
2279 				*mp0 = NULL;
2280 				break;
2281 			}
2282 
2283 			dlen -= m->m_len;
2284 			off += m->m_len;
2285 			sbfree(&so->so_rcv, m);
2286 
2287 			if (mp != NULL) {
2288 				*mp = m;
2289 				mp = &m->m_next;
2290 				so->so_rcv.sb_mb = m = m->m_next;
2291 				*mp = NULL;
2292 			}
2293 
2294 			ASSERT(dlen == 0 || m);
2295 			if (dlen != 0 && m == NULL) {
2296 				/* "try" to gracefully recover on customer builds */
2297 				error_out = 1;
2298 				error = EIO;
2299 				dlen  = 0;
2300 
2301 				*mp0 = NULL;
2302 
2303 				SB_EMPTY_FIXUP(&so->so_rcv);
2304 				soevent(so, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
2305 
2306 				break;
2307 			}
2308 		}
2309 
2310 		ASSERT(dlen == 0);
2311 		if (dlen != 0) {
2312 			/* "try" to gracefully recover on customer builds */
2313 			error_out = 1;
2314 			error = EIO;
2315 			dlen = 0;
2316 
2317 			*mp0 = NULL;
2318 
2319 			SB_EMPTY_FIXUP(&so->so_rcv);
2320 			soevent(so, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
2321 		}
2322 
2323 		if (m != NULL) {
2324 			so->so_rcv.sb_lastrecord = m;
2325 		} else {
2326 			SB_EMPTY_FIXUP(&so->so_rcv);
2327 		}
2328 
2329 		if (error_out) {
2330 			goto release;
2331 		}
2332 
2333 		if (mptcp_validate_csum(sototcpcb(so), start, dsn, sseq, orig_dlen, csum, dfin)) {
2334 			error = EIO;
2335 			*mp0 = NULL;
2336 			goto release;
2337 		}
2338 
2339 		SBLASTRECORDCHK(&so->so_rcv, "mptcp_subflow_soreceive 2");
2340 		SBLASTMBUFCHK(&so->so_rcv, "mptcp_subflow_soreceive 2");
2341 	}
2342 
2343 	DTRACE_MPTCP3(subflow__receive, struct socket *, so,
2344 	    struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd);
2345 
2346 	if (flagsp != NULL) {
2347 		*flagsp |= flags;
2348 	}
2349 
2350 release:
2351 	sbunlock(&so->so_rcv, TRUE);
2352 
2353 	return error;
2354 }
2355 
2356 /*
2357  * MPTCP subflow socket send routine, derived from sosend().
2358  */
2359 static int
mptcp_subflow_sosend(struct socket * so,struct sockaddr * addr,struct uio * uio,struct mbuf * top,struct mbuf * control,int flags)2360 mptcp_subflow_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
2361     struct mbuf *top, struct mbuf *control, int flags)
2362 {
2363 	struct socket *mp_so = mptetoso(tptomptp(sototcpcb(so))->mpt_mpte);
2364 	boolean_t en_tracing = FALSE, proc_held = FALSE;
2365 	struct proc *p = current_proc();
2366 	int en_tracing_val;
2367 	int sblocked = 1; /* Pretend as if it is already locked, so we won't relock it */
2368 	int error;
2369 
2370 	VERIFY(control == NULL);
2371 	VERIFY(addr == NULL);
2372 	VERIFY(uio == NULL);
2373 	VERIFY(flags == 0);
2374 	VERIFY((so->so_flags & SOF_CONTENT_FILTER) == 0);
2375 
2376 	VERIFY(top->m_pkthdr.len > 0 && top->m_pkthdr.len <= UINT16_MAX);
2377 	VERIFY(top->m_pkthdr.pkt_flags & PKTF_MPTCP);
2378 
2379 	/*
2380 	 * trace if tracing & network (vs. unix) sockets & and
2381 	 * non-loopback
2382 	 */
2383 	if (ENTR_SHOULDTRACE &&
2384 	    (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
2385 		struct inpcb *inp = sotoinpcb(so);
2386 		if (inp->inp_last_outifp != NULL &&
2387 		    !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
2388 			en_tracing = TRUE;
2389 			en_tracing_val = top->m_pkthdr.len;
2390 			KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_START,
2391 			    (unsigned long)VM_KERNEL_ADDRPERM(so),
2392 			    ((so->so_state & SS_NBIO) ? kEnTrFlagNonBlocking : 0),
2393 			    (int64_t)en_tracing_val);
2394 		}
2395 	}
2396 
2397 	mptcp_update_last_owner(so, mp_so);
2398 
2399 	if (mp_so->last_pid != proc_pid(p)) {
2400 		p = proc_find(mp_so->last_pid);
2401 		if (p == PROC_NULL) {
2402 			p = current_proc();
2403 		} else {
2404 			proc_held = TRUE;
2405 		}
2406 	}
2407 
2408 #if NECP
2409 	inp_update_necp_policy(sotoinpcb(so), NULL, NULL, 0);
2410 #endif /* NECP */
2411 
2412 	error = sosendcheck(so, NULL, top->m_pkthdr.len, 0, 1, 0, &sblocked);
2413 	if (error) {
2414 		goto out;
2415 	}
2416 
2417 	error = (*so->so_proto->pr_usrreqs->pru_send)(so, 0, top, NULL, NULL, p);
2418 	top = NULL;
2419 
2420 out:
2421 	if (top != NULL) {
2422 		m_freem(top);
2423 	}
2424 
2425 	if (proc_held) {
2426 		proc_rele(p);
2427 	}
2428 
2429 	soclearfastopen(so);
2430 
2431 	if (en_tracing) {
2432 		KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_END,
2433 		    (unsigned long)VM_KERNEL_ADDRPERM(so),
2434 		    ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
2435 		    (int64_t)en_tracing_val);
2436 	}
2437 
2438 	return error;
2439 }
2440 
2441 /*
2442  * Subflow socket write upcall.
2443  *
2444  * Called when the associated subflow socket posted a read event.
2445  */
2446 static void
mptcp_subflow_wupcall(struct socket * so,void * arg,int waitf)2447 mptcp_subflow_wupcall(struct socket *so, void *arg, int waitf)
2448 {
2449 #pragma unused(so, waitf)
2450 	struct mptsub *mpts = arg;
2451 	struct mptses *mpte = mpts->mpts_mpte;
2452 
2453 	VERIFY(mpte != NULL);
2454 
2455 	if (mptcp_should_defer_upcall(mpte->mpte_mppcb)) {
2456 		if (!(mpte->mpte_mppcb->mpp_flags & MPP_WUPCALL)) {
2457 			mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_WWAKEUP;
2458 		}
2459 		return;
2460 	}
2461 
2462 	mptcp_output(mpte);
2463 }
2464 
2465 /*
2466  * Subflow socket control event upcall.
2467  */
2468 static void
mptcp_subflow_eupcall1(struct socket * so,void * arg,uint32_t events)2469 mptcp_subflow_eupcall1(struct socket *so, void *arg, uint32_t events)
2470 {
2471 #pragma unused(so)
2472 	struct mptsub *mpts = arg;
2473 	struct mptses *mpte = mpts->mpts_mpte;
2474 
2475 	socket_lock_assert_owned(mptetoso(mpte));
2476 
2477 	if ((mpts->mpts_evctl & events) == events) {
2478 		return;
2479 	}
2480 
2481 	mpts->mpts_evctl |= events;
2482 
2483 	if (mptcp_should_defer_upcall(mpte->mpte_mppcb)) {
2484 		mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_WORKLOOP;
2485 		return;
2486 	}
2487 
2488 	mptcp_subflow_workloop(mpte);
2489 }
2490 
2491 /*
2492  * Establish an initial MPTCP connection (if first subflow and not yet
2493  * connected), or add a subflow to an existing MPTCP connection.
2494  */
2495 int
mptcp_subflow_add(struct mptses * mpte,struct sockaddr * src,struct sockaddr * dst,uint32_t ifscope,sae_connid_t * pcid)2496 mptcp_subflow_add(struct mptses *mpte, struct sockaddr *src,
2497     struct sockaddr *dst, uint32_t ifscope, sae_connid_t *pcid)
2498 {
2499 	struct socket *mp_so, *so = NULL;
2500 	struct mptcb *mp_tp;
2501 	struct mptsub *mpts = NULL;
2502 	int af, error = 0;
2503 
2504 	mp_so = mptetoso(mpte);
2505 	mp_tp = mpte->mpte_mptcb;
2506 
2507 	socket_lock_assert_owned(mp_so);
2508 
2509 	if (mp_tp->mpt_state >= MPTCPS_CLOSE_WAIT) {
2510 		/* If the remote end sends Data FIN, refuse subflow adds */
2511 		os_log_error(mptcp_log_handle, "%s - %lx: state %u\n",
2512 		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mp_tp->mpt_state);
2513 		error = ENOTCONN;
2514 		goto out_err;
2515 	}
2516 
2517 	if (mpte->mpte_numflows > MPTCP_MAX_NUM_SUBFLOWS) {
2518 		error = EOVERFLOW;
2519 		goto out_err;
2520 	}
2521 
2522 	mpts = mptcp_subflow_alloc();
2523 	if (mpts == NULL) {
2524 		os_log_error(mptcp_log_handle, "%s - %lx: malloc subflow failed\n",
2525 		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
2526 		error = ENOMEM;
2527 		goto out_err;
2528 	}
2529 
2530 	if (src) {
2531 		if (src->sa_family != AF_INET && src->sa_family != AF_INET6) {
2532 			error = EAFNOSUPPORT;
2533 			goto out_err;
2534 		}
2535 
2536 		if (src->sa_family == AF_INET &&
2537 		    src->sa_len != sizeof(struct sockaddr_in)) {
2538 			error = EINVAL;
2539 			goto out_err;
2540 		}
2541 
2542 		if (src->sa_family == AF_INET6 &&
2543 		    src->sa_len != sizeof(struct sockaddr_in6)) {
2544 			error = EINVAL;
2545 			goto out_err;
2546 		}
2547 
2548 		mpts->mpts_src = SA(alloc_sockaddr(src->sa_len, Z_WAITOK | Z_NOFAIL));
2549 
2550 		SOCKADDR_COPY(src, mpts->mpts_src, src->sa_len);
2551 	}
2552 
2553 	if (dst->sa_family != AF_INET && dst->sa_family != AF_INET6) {
2554 		error = EAFNOSUPPORT;
2555 		goto out_err;
2556 	}
2557 
2558 	if (dst->sa_family == AF_INET &&
2559 	    dst->sa_len != sizeof(mpts->__mpts_dst_v4)) {
2560 		error = EINVAL;
2561 		goto out_err;
2562 	}
2563 
2564 	if (dst->sa_family == AF_INET6 &&
2565 	    dst->sa_len != sizeof(mpts->__mpts_dst_v6)) {
2566 		error = EINVAL;
2567 		goto out_err;
2568 	}
2569 
2570 	SOCKADDR_COPY(dst, &mpts->mpts_dst, dst->sa_len);
2571 
2572 	af = mpts->mpts_dst.sa_family;
2573 
2574 	ifnet_head_lock_shared();
2575 	if ((ifscope > (unsigned)if_index)) {
2576 		ifnet_head_done();
2577 		error = ENXIO;
2578 		goto out_err;
2579 	}
2580 	ifnet_head_done();
2581 
2582 	mpts->mpts_ifscope = ifscope;
2583 
2584 	/* create the subflow socket */
2585 	if ((error = mptcp_subflow_socreate(mpte, mpts, af, &so)) != 0) {
2586 		/*
2587 		 * Returning (error) and not cleaning up, because up to here
2588 		 * all we did is creating mpts.
2589 		 *
2590 		 * And the contract is that the call to mptcp_subflow_socreate,
2591 		 * moves ownership of mpts to mptcp_subflow_socreate.
2592 		 */
2593 		return error;
2594 	}
2595 
2596 	/*
2597 	 * We may be called from within the kernel. Still need to account this
2598 	 * one to the real app.
2599 	 */
2600 	mptcp_update_last_owner(mpts->mpts_socket, mp_so);
2601 
2602 	/*
2603 	 * Increment the counter, while avoiding 0 (SAE_CONNID_ANY) and
2604 	 * -1 (SAE_CONNID_ALL).
2605 	 */
2606 	mpte->mpte_connid_last++;
2607 	if (mpte->mpte_connid_last == SAE_CONNID_ALL ||
2608 	    mpte->mpte_connid_last == SAE_CONNID_ANY) {
2609 		mpte->mpte_connid_last++;
2610 	}
2611 
2612 	mpts->mpts_connid = mpte->mpte_connid_last;
2613 
2614 	mpts->mpts_rel_seq = 1;
2615 
2616 	/* Allocate a unique address id per subflow */
2617 	mpte->mpte_addrid_last++;
2618 	if (mpte->mpte_addrid_last == 0) {
2619 		mpte->mpte_addrid_last++;
2620 	}
2621 
2622 	/* register for subflow socket read/write events */
2623 	sock_setupcalls_locked(so, NULL, NULL, mptcp_subflow_wupcall, mpts, 1);
2624 
2625 	/* Register for subflow socket control events */
2626 	sock_catchevents_locked(so, mptcp_subflow_eupcall1, mpts,
2627 	    SO_FILT_HINT_CONNRESET | SO_FILT_HINT_CANTRCVMORE |
2628 	    SO_FILT_HINT_TIMEOUT | SO_FILT_HINT_NOSRCADDR |
2629 	    SO_FILT_HINT_IFDENIED | SO_FILT_HINT_CONNECTED |
2630 	    SO_FILT_HINT_DISCONNECTED | SO_FILT_HINT_MPFAILOVER |
2631 	    SO_FILT_HINT_MPSTATUS | SO_FILT_HINT_MUSTRST |
2632 	    SO_FILT_HINT_MPCANTRCVMORE | SO_FILT_HINT_ADAPTIVE_RTIMO |
2633 	    SO_FILT_HINT_ADAPTIVE_WTIMO | SO_FILT_HINT_MP_SUB_ERROR);
2634 
2635 	/* sanity check */
2636 	VERIFY(!(mpts->mpts_flags &
2637 	    (MPTSF_CONNECTING | MPTSF_CONNECTED | MPTSF_CONNECT_PENDING)));
2638 
2639 	/*
2640 	 * Indicate to the TCP subflow whether or not it should establish
2641 	 * the initial MPTCP connection, or join an existing one.  Fill
2642 	 * in the connection request structure with additional info needed
2643 	 * by the underlying TCP (to be used in the TCP options, etc.)
2644 	 */
2645 	if (mp_tp->mpt_state < MPTCPS_ESTABLISHED && mpte->mpte_numflows == 1) {
2646 		mpts->mpts_flags |= MPTSF_INITIAL_SUB;
2647 
2648 		if (mp_tp->mpt_state == MPTCPS_CLOSED) {
2649 			mptcp_init_local_parms(mpte, dst);
2650 		}
2651 		soisconnecting(mp_so);
2652 
2653 		/* If fastopen is requested, set state in mpts */
2654 		if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
2655 			mpts->mpts_flags |= MPTSF_TFO_REQD;
2656 		}
2657 	} else {
2658 		if (!(mp_tp->mpt_flags & MPTCPF_JOIN_READY)) {
2659 			mpts->mpts_flags |= MPTSF_CONNECT_PENDING;
2660 		}
2661 	}
2662 
2663 	mpts->mpts_flags |= MPTSF_CONNECTING;
2664 
2665 	/* connect right away if first attempt, or if join can be done now */
2666 	if (!(mpts->mpts_flags & MPTSF_CONNECT_PENDING)) {
2667 		error = mptcp_subflow_soconnectx(mpte, mpts);
2668 	}
2669 
2670 	if (error) {
2671 		goto out_err_close;
2672 	}
2673 
2674 	if (pcid) {
2675 		*pcid = mpts->mpts_connid;
2676 	}
2677 
2678 	return 0;
2679 
2680 out_err_close:
2681 	mptcp_subflow_abort(mpts, error);
2682 
2683 	return error;
2684 
2685 out_err:
2686 	if (mpts) {
2687 		mptcp_subflow_free(mpts);
2688 	}
2689 
2690 	return error;
2691 }
2692 
2693 void
mptcpstats_update(struct mptcp_itf_stats * stats,const struct mptsub * mpts)2694 mptcpstats_update(struct mptcp_itf_stats *stats, const struct mptsub *mpts)
2695 {
2696 	int index = mptcpstats_get_index(stats, mpts);
2697 
2698 	if (index != -1) {
2699 		struct inpcb *inp = sotoinpcb(mpts->mpts_socket);
2700 
2701 		stats[index].mpis_txbytes += inp->inp_stat->txbytes;
2702 		stats[index].mpis_rxbytes += inp->inp_stat->rxbytes;
2703 
2704 		stats[index].mpis_wifi_txbytes += inp->inp_wstat->txbytes;
2705 		stats[index].mpis_wifi_rxbytes += inp->inp_wstat->rxbytes;
2706 
2707 		stats[index].mpis_wired_txbytes += inp->inp_Wstat->txbytes;
2708 		stats[index].mpis_wired_rxbytes += inp->inp_Wstat->rxbytes;
2709 
2710 		stats[index].mpis_cell_txbytes += inp->inp_cstat->txbytes;
2711 		stats[index].mpis_cell_rxbytes += inp->inp_cstat->rxbytes;
2712 	}
2713 }
2714 
2715 /*
2716  * Delete/remove a subflow from an MPTCP.  The underlying subflow socket
2717  * will no longer be accessible after a subflow is deleted, thus this
2718  * should occur only after the subflow socket has been disconnected.
2719  */
2720 void
mptcp_subflow_del(struct mptses * mpte,struct mptsub * mpts)2721 mptcp_subflow_del(struct mptses *mpte, struct mptsub *mpts)
2722 {
2723 	struct socket *mp_so = mptetoso(mpte);
2724 	struct socket *so = mpts->mpts_socket;
2725 	struct tcpcb *tp = sototcpcb(so);
2726 
2727 	socket_lock_assert_owned(mp_so);
2728 	VERIFY(mpts->mpts_mpte == mpte);
2729 	VERIFY(mpte->mpte_numflows != 0);
2730 	VERIFY(mp_so->so_usecount > 0);
2731 
2732 	mptcpstats_update(mpte->mpte_itfstats, mpts);
2733 
2734 	mptcp_unset_cellicon(mpte, mpts, 1);
2735 
2736 	mpte->mpte_init_rxbytes = sotoinpcb(so)->inp_stat->rxbytes;
2737 	mpte->mpte_init_txbytes = sotoinpcb(so)->inp_stat->txbytes;
2738 
2739 	TAILQ_REMOVE(&mpte->mpte_subflows, mpts, mpts_entry);
2740 	mpte->mpte_numflows--;
2741 	if (mpte->mpte_active_sub == mpts) {
2742 		mpte->mpte_active_sub = NULL;
2743 	}
2744 
2745 	/*
2746 	 * Drop references held by this subflow socket; there
2747 	 * will be no further upcalls made from this point.
2748 	 */
2749 	sock_setupcalls_locked(so, NULL, NULL, NULL, NULL, 0);
2750 	sock_catchevents_locked(so, NULL, NULL, 0);
2751 
2752 	mptcp_detach_mptcb_from_subf(mpte->mpte_mptcb, so);
2753 
2754 	mp_so->so_usecount--;           /* for subflow socket */
2755 	mpts->mpts_mpte = NULL;
2756 	mpts->mpts_socket = NULL;
2757 
2758 	mptcp_subflow_remref(mpts);             /* for MPTCP subflow list */
2759 	mptcp_subflow_remref(mpts);             /* for subflow socket */
2760 
2761 	so->so_flags &= ~SOF_MP_SUBFLOW;
2762 	tp->t_mptcb = NULL;
2763 	tp->t_mpsub = NULL;
2764 }
2765 
2766 void
mptcp_subflow_shutdown(struct mptses * mpte,struct mptsub * mpts)2767 mptcp_subflow_shutdown(struct mptses *mpte, struct mptsub *mpts)
2768 {
2769 	struct socket *so = mpts->mpts_socket;
2770 	struct mptcb *mp_tp = mpte->mpte_mptcb;
2771 	int send_dfin = 0;
2772 
2773 	if (mp_tp->mpt_state > MPTCPS_CLOSE_WAIT) {
2774 		send_dfin = 1;
2775 	}
2776 
2777 	if (!(so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) &&
2778 	    (so->so_state & SS_ISCONNECTED)) {
2779 		if (send_dfin) {
2780 			mptcp_send_dfin(so);
2781 		}
2782 		soshutdownlock(so, SHUT_WR);
2783 	}
2784 }
2785 
2786 static void
mptcp_subflow_abort(struct mptsub * mpts,int error)2787 mptcp_subflow_abort(struct mptsub *mpts, int error)
2788 {
2789 	struct socket *so = mpts->mpts_socket;
2790 	struct tcpcb *tp = sototcpcb(so);
2791 
2792 	if (mpts->mpts_flags & MPTSF_DISCONNECTED) {
2793 		return;
2794 	}
2795 
2796 	if (tp->t_state != TCPS_CLOSED) {
2797 		tcp_drop(tp, error);
2798 	}
2799 
2800 	mptcp_subflow_eupcall1(so, mpts, SO_FILT_HINT_DISCONNECTED);
2801 }
2802 
2803 /*
2804  * Disconnect a subflow socket.
2805  */
2806 void
mptcp_subflow_disconnect(struct mptses * mpte,struct mptsub * mpts)2807 mptcp_subflow_disconnect(struct mptses *mpte, struct mptsub *mpts)
2808 {
2809 	struct socket *so, *mp_so;
2810 	struct mptcb *mp_tp;
2811 	int send_dfin = 0;
2812 
2813 	so = mpts->mpts_socket;
2814 	mp_tp = mpte->mpte_mptcb;
2815 	mp_so = mptetoso(mpte);
2816 
2817 	socket_lock_assert_owned(mp_so);
2818 
2819 	if (mpts->mpts_flags & (MPTSF_DISCONNECTING | MPTSF_DISCONNECTED)) {
2820 		return;
2821 	}
2822 
2823 	mptcp_unset_cellicon(mpte, mpts, 1);
2824 
2825 	mpts->mpts_flags |= MPTSF_DISCONNECTING;
2826 
2827 	if (mp_tp->mpt_state > MPTCPS_CLOSE_WAIT) {
2828 		send_dfin = 1;
2829 	}
2830 
2831 	if (mp_so->so_flags & SOF_DEFUNCT) {
2832 		errno_t ret;
2833 
2834 		ret = sosetdefunct(NULL, so, SHUTDOWN_SOCKET_LEVEL_DISCONNECT_ALL, TRUE);
2835 		if (ret == 0) {
2836 			ret = sodefunct(NULL, so, SHUTDOWN_SOCKET_LEVEL_DISCONNECT_ALL);
2837 
2838 			if (ret != 0) {
2839 				os_log_error(mptcp_log_handle, "%s - %lx: sodefunct failed with %d\n",
2840 				    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), ret);
2841 			}
2842 		} else {
2843 			os_log_error(mptcp_log_handle, "%s - %lx: sosetdefunct failed with %d\n",
2844 			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), ret);
2845 		}
2846 	}
2847 
2848 	if (!(so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) &&
2849 	    (so->so_state & SS_ISCONNECTED)) {
2850 		if (send_dfin) {
2851 			mptcp_send_dfin(so);
2852 		}
2853 
2854 		(void) soshutdownlock(so, SHUT_RD);
2855 		(void) soshutdownlock(so, SHUT_WR);
2856 		(void) sodisconnectlocked(so);
2857 	}
2858 
2859 	/*
2860 	 * Generate a disconnect event for this subflow socket, in case
2861 	 * the lower layer doesn't do it; this is needed because the
2862 	 * subflow socket deletion relies on it.
2863 	 */
2864 	mptcp_subflow_eupcall1(so, mpts, SO_FILT_HINT_DISCONNECTED);
2865 }
2866 
2867 /*
2868  * Subflow socket input.
2869  */
2870 static void
mptcp_subflow_input(struct mptses * mpte,struct mptsub * mpts)2871 mptcp_subflow_input(struct mptses *mpte, struct mptsub *mpts)
2872 {
2873 	struct socket *mp_so = mptetoso(mpte);
2874 	struct mbuf *m = NULL;
2875 	struct socket *so;
2876 	int error, wakeup = 0;
2877 
2878 	VERIFY(!(mpte->mpte_mppcb->mpp_flags & MPP_INSIDE_INPUT));
2879 	mpte->mpte_mppcb->mpp_flags |= MPP_INSIDE_INPUT;
2880 
2881 	DTRACE_MPTCP2(subflow__input, struct mptses *, mpte,
2882 	    struct mptsub *, mpts);
2883 
2884 	if (!(mpts->mpts_flags & MPTSF_CONNECTED)) {
2885 		goto out;
2886 	}
2887 
2888 	so = mpts->mpts_socket;
2889 
2890 	error = sock_receive_internal(so, NULL, &m, 0, NULL);
2891 	if (error != 0 && error != EWOULDBLOCK) {
2892 		os_log_error(mptcp_log_handle, "%s - %lx: cid %d error %d\n",
2893 		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpts->mpts_connid, error);
2894 		if (error == ENODATA) {
2895 			/*
2896 			 * Don't ignore ENODATA so as to discover
2897 			 * nasty middleboxes.
2898 			 */
2899 			mp_so->so_error = ENODATA;
2900 
2901 			wakeup = 1;
2902 			goto out;
2903 		}
2904 	}
2905 
2906 	/* In fallback, make sure to accept data on all but one subflow */
2907 	if (m && (mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
2908 	    !(mpts->mpts_flags & MPTSF_ACTIVE)) {
2909 		m_freem(m);
2910 		goto out;
2911 	}
2912 
2913 	if (m != NULL) {
2914 		if (IFNET_IS_CELLULAR(sotoinpcb(so)->inp_last_outifp)) {
2915 			mptcp_set_cellicon(mpte, mpts);
2916 
2917 			mpte->mpte_used_cell = 1;
2918 		} else {
2919 			/*
2920 			 * If during the past MPTCP_CELLICON_TOGGLE_RATE seconds we didn't
2921 			 * explicitly set the cellicon, then we unset it again.
2922 			 */
2923 			if (TSTMP_LT(mpte->mpte_last_cellicon_set + MPTCP_CELLICON_TOGGLE_RATE, tcp_now)) {
2924 				mptcp_unset_cellicon(mpte, NULL, 1);
2925 			}
2926 
2927 			mpte->mpte_used_wifi = 1;
2928 		}
2929 
2930 		mptcp_input(mpte, m);
2931 	}
2932 
2933 out:
2934 	if (wakeup) {
2935 		mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_RWAKEUP;
2936 	}
2937 
2938 	mptcp_handle_deferred_upcalls(mpte->mpte_mppcb, MPP_INSIDE_INPUT);
2939 }
2940 
/*
 * Entry point for input events on a subflow socket.
 *
 * Called with the MPTCP socket lock owned.  If upcalls are currently being
 * deferred, only record that a read-wakeup is needed; otherwise walk all
 * subflows and drain their receive buffers through mptcp_subflow_input().
 */
void
mptcp_handle_input(struct socket *so)
{
	struct mptsub *mpts, *tmpts;
	struct mptses *mpte;

	/* Ignore sockets that are not MPTCP subflows. */
	if (!(so->so_flags & SOF_MP_SUBFLOW)) {
		return;
	}

	mpts = sototcpcb(so)->t_mpsub;
	mpte = mpts->mpts_mpte;

	socket_lock_assert_owned(mptetoso(mpte));

	if (mptcp_should_defer_upcall(mpte->mpte_mppcb)) {
		/* Not already inside the input handler: remember to wake the reader. */
		if (!(mpte->mpte_mppcb->mpp_flags & MPP_INPUT_HANDLE)) {
			mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_RWAKEUP;
		}
		return;
	}

	mpte->mpte_mppcb->mpp_flags |= MPP_INPUT_HANDLE;
	TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
		if (mpts->mpts_socket->so_usecount == 0) {
			/* Will be removed soon by tcp_garbage_collect */
			continue;
		}

		/* Hold both the subflow and its socket across the drain. */
		mptcp_subflow_addref(mpts);
		mpts->mpts_socket->so_usecount++;

		mptcp_subflow_input(mpte, mpts);

		mptcp_subflow_remref(mpts);             /* ours */

		VERIFY(mpts->mpts_socket->so_usecount != 0);
		mpts->mpts_socket->so_usecount--;
	}

	mptcp_handle_deferred_upcalls(mpte->mpte_mppcb, MPP_INPUT_HANDLE);
}
2983 
2984 static boolean_t
mptcp_search_seq_in_sub(struct mbuf * m,struct socket * so)2985 mptcp_search_seq_in_sub(struct mbuf *m, struct socket *so)
2986 {
2987 	struct mbuf *so_m = so->so_snd.sb_mb;
2988 	uint64_t dsn = m->m_pkthdr.mp_dsn;
2989 
2990 	while (so_m) {
2991 		VERIFY(so_m->m_flags & M_PKTHDR);
2992 		VERIFY(so_m->m_pkthdr.pkt_flags & PKTF_MPTCP);
2993 
2994 		/* Part of the segment is covered, don't reinject here */
2995 		if (so_m->m_pkthdr.mp_dsn <= dsn &&
2996 		    so_m->m_pkthdr.mp_dsn + so_m->m_pkthdr.mp_rlen > dsn) {
2997 			return TRUE;
2998 		}
2999 
3000 		so_m = so_m->m_next;
3001 	}
3002 
3003 	return FALSE;
3004 }
3005 
3006 /*
3007  * Subflow socket output.
3008  *
3009  * Called for sending data from MPTCP to the underlying subflow socket.
3010  */
3011 int
mptcp_subflow_output(struct mptses * mpte,struct mptsub * mpts,int flags)3012 mptcp_subflow_output(struct mptses *mpte, struct mptsub *mpts, int flags)
3013 {
3014 	struct mptcb *mp_tp = mpte->mpte_mptcb;
3015 	struct mbuf *sb_mb, *m, *mpt_mbuf = NULL, *head = NULL, *tail = NULL;
3016 	struct socket *mp_so, *so;
3017 	struct tcpcb *tp;
3018 	uint64_t mpt_dsn = 0, off = 0;
3019 	int sb_cc = 0, error = 0, wakeup = 0;
3020 	uint16_t dss_csum;
3021 	uint16_t tot_sent = 0;
3022 	boolean_t reinjected = FALSE;
3023 
3024 	mp_so = mptetoso(mpte);
3025 	so = mpts->mpts_socket;
3026 	tp = sototcpcb(so);
3027 
3028 	socket_lock_assert_owned(mp_so);
3029 
3030 	VERIFY(!(mpte->mpte_mppcb->mpp_flags & MPP_INSIDE_OUTPUT));
3031 	mpte->mpte_mppcb->mpp_flags |= MPP_INSIDE_OUTPUT;
3032 
3033 	VERIFY(!INP_WAIT_FOR_IF_FEEDBACK(sotoinpcb(so)));
3034 	VERIFY((mpts->mpts_flags & MPTSF_MP_CAPABLE) ||
3035 	    (mpts->mpts_flags & MPTSF_MP_DEGRADED) ||
3036 	    (mpts->mpts_flags & MPTSF_TFO_REQD));
3037 	VERIFY(mptcp_subflow_cwnd_space(mpts->mpts_socket) > 0);
3038 
3039 	DTRACE_MPTCP2(subflow__output, struct mptses *, mpte,
3040 	    struct mptsub *, mpts);
3041 
3042 	/* Remove Addr Option is not sent reliably as per I-D */
3043 	if (mpte->mpte_flags & MPTE_SND_REM_ADDR) {
3044 		tp->t_rem_aid = mpte->mpte_lost_aid;
3045 		tp->t_mpflags |= TMPF_SND_REM_ADDR;
3046 		mpte->mpte_flags &= ~MPTE_SND_REM_ADDR;
3047 	}
3048 
3049 	/*
3050 	 * The mbuf chains containing the metadata (as well as pointing to
3051 	 * the user data sitting at the MPTCP output queue) would then be
3052 	 * sent down to the subflow socket.
3053 	 *
3054 	 * Some notes on data sequencing:
3055 	 *
3056 	 *   a. Each mbuf must be a M_PKTHDR.
3057 	 *   b. MPTCP metadata is stored in the mptcp_pktinfo structure
3058 	 *	in the mbuf pkthdr structure.
3059 	 *   c. Each mbuf containing the MPTCP metadata must have its
3060 	 *	pkt_flags marked with the PKTF_MPTCP flag.
3061 	 */
3062 
3063 	if (mpte->mpte_reinjectq) {
3064 		sb_mb = mpte->mpte_reinjectq;
3065 	} else {
3066 		sb_mb = mp_so->so_snd.sb_mb;
3067 	}
3068 
3069 	if (sb_mb == NULL) {
3070 		os_log_error(mptcp_log_handle, "%s - %lx: No data in MPTCP-sendbuffer! smax %u snxt %u suna %u state %u flags %#x\n",
3071 		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
3072 		    (uint32_t)mp_tp->mpt_sndmax, (uint32_t)mp_tp->mpt_sndnxt,
3073 		    (uint32_t)mp_tp->mpt_snduna, mp_tp->mpt_state, mp_so->so_flags1);
3074 
3075 		/* Fix it to prevent looping */
3076 		if (MPTCP_SEQ_LT(mp_tp->mpt_sndnxt, mp_tp->mpt_snduna)) {
3077 			mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;
3078 		}
3079 		goto out;
3080 	}
3081 
3082 	VERIFY(sb_mb->m_pkthdr.pkt_flags & PKTF_MPTCP);
3083 
3084 	if (sb_mb->m_pkthdr.mp_rlen == 0 &&
3085 	    !(so->so_state & SS_ISCONNECTED) &&
3086 	    (so->so_flags1 & SOF1_PRECONNECT_DATA)) {
3087 		tp->t_mpflags |= TMPF_TFO_REQUEST;
3088 
3089 		/* Opting to call pru_send as no mbuf at subflow level */
3090 		error = (*so->so_proto->pr_usrreqs->pru_send)(so, 0, NULL, NULL,
3091 		    NULL, current_proc());
3092 
3093 		goto done_sending;
3094 	}
3095 
3096 	mpt_dsn = sb_mb->m_pkthdr.mp_dsn;
3097 
3098 	/* First, drop acknowledged data */
3099 	if (MPTCP_SEQ_LT(mpt_dsn, mp_tp->mpt_snduna)) {
3100 		os_log_error(mptcp_log_handle, "%s - %lx: dropping data, should have been done earlier "
3101 		    "dsn %u suna %u reinject? %u\n",
3102 		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), (uint32_t)mpt_dsn,
3103 		    (uint32_t)mp_tp->mpt_snduna, !!mpte->mpte_reinjectq);
3104 		if (mpte->mpte_reinjectq) {
3105 			mptcp_clean_reinjectq(mpte);
3106 		} else {
3107 			uint64_t len = 0;
3108 			len = mp_tp->mpt_snduna - mpt_dsn;
3109 			sbdrop(&mp_so->so_snd, (int)len);
3110 			wakeup = 1;
3111 		}
3112 	}
3113 
3114 	/* Check again because of above sbdrop */
3115 	if (mp_so->so_snd.sb_mb == NULL && mpte->mpte_reinjectq == NULL) {
3116 		os_log_error(mptcp_log_handle, "%s - $%lx: send-buffer is empty\n",
3117 		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
3118 		goto out;
3119 	}
3120 
3121 	/*
3122 	 * In degraded mode, we don't receive data acks, so force free
3123 	 * mbufs less than snd_nxt
3124 	 */
3125 	if ((mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
3126 	    (mp_tp->mpt_flags & MPTCPF_POST_FALLBACK_SYNC) &&
3127 	    mp_so->so_snd.sb_mb) {
3128 		mpt_dsn = mp_so->so_snd.sb_mb->m_pkthdr.mp_dsn;
3129 		if (MPTCP_SEQ_LT(mpt_dsn, mp_tp->mpt_snduna)) {
3130 			uint64_t len = 0;
3131 			len = mp_tp->mpt_snduna - mpt_dsn;
3132 			sbdrop(&mp_so->so_snd, (int)len);
3133 			wakeup = 1;
3134 
3135 			os_log_error(mptcp_log_handle, "%s - %lx: dropping data in degraded mode, should have been done earlier dsn %u sndnxt %u suna %u\n",
3136 			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
3137 			    (uint32_t)mpt_dsn, (uint32_t)mp_tp->mpt_sndnxt, (uint32_t)mp_tp->mpt_snduna);
3138 		}
3139 	}
3140 
3141 	if ((mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
3142 	    !(mp_tp->mpt_flags & MPTCPF_POST_FALLBACK_SYNC)) {
3143 		mp_tp->mpt_flags |= MPTCPF_POST_FALLBACK_SYNC;
3144 		so->so_flags1 |= SOF1_POST_FALLBACK_SYNC;
3145 	}
3146 
3147 	/*
3148 	 * Adjust the top level notion of next byte used for retransmissions
3149 	 * and sending FINs.
3150 	 */
3151 	if (MPTCP_SEQ_LT(mp_tp->mpt_sndnxt, mp_tp->mpt_snduna)) {
3152 		mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;
3153 	}
3154 
3155 	/* Now determine the offset from which to start transmitting data */
3156 	if (mpte->mpte_reinjectq) {
3157 		sb_mb = mpte->mpte_reinjectq;
3158 	} else {
3159 dont_reinject:
3160 		sb_mb = mp_so->so_snd.sb_mb;
3161 	}
3162 	if (sb_mb == NULL) {
3163 		os_log_error(mptcp_log_handle, "%s - %lx: send-buffer is still empty\n", __func__,
3164 		    (unsigned long)VM_KERNEL_ADDRPERM(mpte));
3165 		goto out;
3166 	}
3167 
3168 	if (sb_mb == mpte->mpte_reinjectq) {
3169 		sb_cc = sb_mb->m_pkthdr.mp_rlen;
3170 		off = 0;
3171 
3172 		if (mptcp_search_seq_in_sub(sb_mb, so)) {
3173 			if (mptcp_can_send_more(mp_tp, TRUE)) {
3174 				goto dont_reinject;
3175 			}
3176 
3177 			error = ECANCELED;
3178 			goto out;
3179 		}
3180 
3181 		reinjected = TRUE;
3182 	} else if (flags & MPTCP_SUBOUT_PROBING) {
3183 		sb_cc = sb_mb->m_pkthdr.mp_rlen;
3184 		off = 0;
3185 	} else {
3186 		sb_cc = min(mp_so->so_snd.sb_cc, mp_tp->mpt_sndwnd);
3187 
3188 		/*
3189 		 * With TFO, there might be no data at all, thus still go into this
3190 		 * code-path here.
3191 		 */
3192 		if ((mp_so->so_flags1 & SOF1_PRECONNECT_DATA) ||
3193 		    MPTCP_SEQ_LT(mp_tp->mpt_sndnxt, mp_tp->mpt_sndmax)) {
3194 			off = mp_tp->mpt_sndnxt - mp_tp->mpt_snduna;
3195 			sb_cc -= off;
3196 		} else {
3197 			os_log_error(mptcp_log_handle, "%s - %lx: this should not happen: sndnxt %u sndmax %u\n",
3198 			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), (uint32_t)mp_tp->mpt_sndnxt,
3199 			    (uint32_t)mp_tp->mpt_sndmax);
3200 
3201 			goto out;
3202 		}
3203 	}
3204 
3205 	sb_cc = min(sb_cc, mptcp_subflow_cwnd_space(so));
3206 	if (sb_cc <= 0) {
3207 		os_log_error(mptcp_log_handle, "%s - %lx: sb_cc is %d, mp_so->sb_cc %u, sndwnd %u,sndnxt %u sndmax %u cwnd %u\n",
3208 		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), sb_cc, mp_so->so_snd.sb_cc, mp_tp->mpt_sndwnd,
3209 		    (uint32_t)mp_tp->mpt_sndnxt, (uint32_t)mp_tp->mpt_sndmax,
3210 		    mptcp_subflow_cwnd_space(so));
3211 	}
3212 
3213 	sb_cc = min(sb_cc, UINT16_MAX);
3214 
3215 	/*
3216 	 * Create a DSN mapping for the data we are about to send. It all
3217 	 * has the same mapping.
3218 	 */
3219 	if (reinjected) {
3220 		mpt_dsn = sb_mb->m_pkthdr.mp_dsn;
3221 	} else {
3222 		mpt_dsn = mp_tp->mpt_snduna + off;
3223 	}
3224 
3225 	mpt_mbuf = sb_mb;
3226 	while (mpt_mbuf && reinjected == FALSE &&
3227 	    (mpt_mbuf->m_pkthdr.mp_rlen == 0 ||
3228 	    mpt_mbuf->m_pkthdr.mp_rlen <= (uint32_t)off)) {
3229 		off -= mpt_mbuf->m_pkthdr.mp_rlen;
3230 		mpt_mbuf = mpt_mbuf->m_next;
3231 	}
3232 	VERIFY((mpt_mbuf == NULL) || (mpt_mbuf->m_pkthdr.pkt_flags & PKTF_MPTCP));
3233 
3234 	head = tail = NULL;
3235 
3236 	while (tot_sent < sb_cc) {
3237 		int32_t mlen;
3238 
3239 		mlen = mpt_mbuf->m_len;
3240 		mlen -= off;
3241 		mlen = MIN(mlen, sb_cc - tot_sent);
3242 
3243 		if (mlen < 0) {
3244 			os_log_error(mptcp_log_handle, "%s - %lx: mlen %d mp_rlen %u off %u sb_cc %u tot_sent %u\n",
3245 			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mlen, mpt_mbuf->m_pkthdr.mp_rlen,
3246 			    (uint32_t)off, sb_cc, tot_sent);
3247 			goto out;
3248 		}
3249 
3250 		if (mlen == 0) {
3251 			goto next;
3252 		}
3253 
3254 		m = m_copym_mode(mpt_mbuf, (int)off, mlen, M_DONTWAIT, NULL, NULL,
3255 		    M_COPYM_MUST_COPY_HDR);
3256 		if (m == NULL) {
3257 			os_log_error(mptcp_log_handle, "%s - %lx: m_copym_mode failed\n", __func__,
3258 			    (unsigned long)VM_KERNEL_ADDRPERM(mpte));
3259 			error = ENOBUFS;
3260 			break;
3261 		}
3262 
3263 		/* Create a DSN mapping for the data (m_copym does it) */
3264 		VERIFY(m->m_flags & M_PKTHDR);
3265 		VERIFY(m->m_next == NULL);
3266 
3267 		m->m_pkthdr.pkt_flags |= PKTF_MPTCP;
3268 		m->m_pkthdr.pkt_flags &= ~PKTF_MPSO;
3269 		m->m_pkthdr.mp_dsn = mpt_dsn;
3270 		m->m_pkthdr.mp_rseq = mpts->mpts_rel_seq;
3271 		m->m_pkthdr.len = mlen;
3272 
3273 		if (head == NULL) {
3274 			head = tail = m;
3275 		} else {
3276 			tail->m_next = m;
3277 			tail = m;
3278 		}
3279 
3280 		tot_sent += mlen;
3281 		off = 0;
3282 next:
3283 		mpt_mbuf = mpt_mbuf->m_next;
3284 	}
3285 
3286 	if (reinjected) {
3287 		if (sb_cc < sb_mb->m_pkthdr.mp_rlen) {
3288 			struct mbuf *n = sb_mb;
3289 
3290 			while (n) {
3291 				n->m_pkthdr.mp_dsn += sb_cc;
3292 				n->m_pkthdr.mp_rlen -= sb_cc;
3293 				n = n->m_next;
3294 			}
3295 			m_adj(sb_mb, sb_cc);
3296 		} else {
3297 			mpte->mpte_reinjectq = sb_mb->m_nextpkt;
3298 			m_freem(sb_mb);
3299 		}
3300 	}
3301 
3302 	if (head && (mp_tp->mpt_flags & MPTCPF_CHECKSUM)) {
3303 		dss_csum = mptcp_output_csum(head, mpt_dsn, mpts->mpts_rel_seq,
3304 		    tot_sent);
3305 	}
3306 
3307 	/* Now, let's update rel-seq and the data-level length */
3308 	mpts->mpts_rel_seq += tot_sent;
3309 	m = head;
3310 	while (m) {
3311 		if (mp_tp->mpt_flags & MPTCPF_CHECKSUM) {
3312 			m->m_pkthdr.mp_csum = dss_csum;
3313 		}
3314 		m->m_pkthdr.mp_rlen = tot_sent;
3315 		m = m->m_next;
3316 	}
3317 
3318 	if (head != NULL) {
3319 		if ((mpts->mpts_flags & MPTSF_TFO_REQD) &&
3320 		    (tp->t_tfo_stats == 0)) {
3321 			tp->t_mpflags |= TMPF_TFO_REQUEST;
3322 		}
3323 
3324 		error = so->so_proto->pr_usrreqs->pru_sosend(so, NULL, NULL, head, NULL, 0);
3325 		head = NULL;
3326 	}
3327 
3328 done_sending:
3329 	if (error == 0 ||
3330 	    (error == EWOULDBLOCK && (tp->t_mpflags & TMPF_TFO_REQUEST))) {
3331 		uint64_t new_sndnxt = mp_tp->mpt_sndnxt + tot_sent;
3332 
3333 		if (mpts->mpts_probesoon && mpts->mpts_maxseg && tot_sent) {
3334 			tcpstat.tcps_mp_num_probes++;
3335 			if ((uint32_t)tot_sent < mpts->mpts_maxseg) {
3336 				mpts->mpts_probecnt += 1;
3337 			} else {
3338 				mpts->mpts_probecnt +=
3339 				    tot_sent / mpts->mpts_maxseg;
3340 			}
3341 		}
3342 
3343 		if (!reinjected && !(flags & MPTCP_SUBOUT_PROBING)) {
3344 			if (MPTCP_DATASEQ_HIGH32(new_sndnxt) >
3345 			    MPTCP_DATASEQ_HIGH32(mp_tp->mpt_sndnxt)) {
3346 				mp_tp->mpt_flags |= MPTCPF_SND_64BITDSN;
3347 			}
3348 			mp_tp->mpt_sndnxt = new_sndnxt;
3349 		}
3350 
3351 		mptcp_cancel_timer(mp_tp, MPTT_REXMT);
3352 
3353 		/* Must be here as mptcp_can_send_more() checks for this */
3354 		soclearfastopen(mp_so);
3355 
3356 		if (IFNET_IS_CELLULAR(sotoinpcb(so)->inp_last_outifp)) {
3357 			mptcp_set_cellicon(mpte, mpts);
3358 
3359 			mpte->mpte_used_cell = 1;
3360 		} else {
3361 			/*
3362 			 * If during the past MPTCP_CELLICON_TOGGLE_RATE seconds we didn't
3363 			 * explicitly set the cellicon, then we unset it again.
3364 			 */
3365 			if (TSTMP_LT(mpte->mpte_last_cellicon_set + MPTCP_CELLICON_TOGGLE_RATE, tcp_now)) {
3366 				mptcp_unset_cellicon(mpte, NULL, 1);
3367 			}
3368 
3369 			mpte->mpte_used_wifi = 1;
3370 		}
3371 
3372 		/*
3373 		 * Don't propagate EWOULDBLOCK - it's already taken care of
3374 		 * in mptcp_usr_send for TFO.
3375 		 */
3376 		error = 0;
3377 	} else {
3378 		/* We need to revert our change to mpts_rel_seq */
3379 		mpts->mpts_rel_seq -= tot_sent;
3380 
3381 		os_log_error(mptcp_log_handle, "%s - %lx: %u error %d len %d subflags %#x sostate %#x soerror %u hiwat %u lowat %u\n",
3382 		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpts->mpts_connid, error, tot_sent, so->so_flags, so->so_state, so->so_error, so->so_snd.sb_hiwat, so->so_snd.sb_lowat);
3383 	}
3384 out:
3385 
3386 	if (head != NULL) {
3387 		m_freem(head);
3388 	}
3389 
3390 	if (wakeup) {
3391 		mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_WWAKEUP;
3392 	}
3393 
3394 	mptcp_handle_deferred_upcalls(mpte->mpte_mppcb, MPP_INSIDE_OUTPUT);
3395 	return error;
3396 }
3397 
/*
 * Insert the mbuf chain `m' (one DSN mapping) into the session's reinject
 * queue, which is kept sorted by data-sequence number.  A segment that is
 * entirely covered by what is already queued is dropped; a queued segment
 * entirely covered by `m' is removed.  Takes ownership of `m' in all cases
 * (it is either queued or freed).
 */
static void
mptcp_add_reinjectq(struct mptses *mpte, struct mbuf *m)
{
	struct mbuf *n, *prev = NULL;

	n = mpte->mpte_reinjectq;

	/* First, look for an mbuf n, whose data-sequence-number is bigger or
	 * equal than m's sequence number.
	 */
	while (n) {
		if (MPTCP_SEQ_GEQ(n->m_pkthdr.mp_dsn, m->m_pkthdr.mp_dsn)) {
			break;
		}

		prev = n;

		n = n->m_nextpkt;
	}

	if (n) {
		/* m is already fully covered by the next mbuf in the queue */
		if (n->m_pkthdr.mp_dsn == m->m_pkthdr.mp_dsn &&
		    n->m_pkthdr.mp_rlen >= m->m_pkthdr.mp_rlen) {
			os_log(mptcp_log_handle, "%s - %lx: dsn %u dlen %u rseq %u fully covered with len %u\n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
			    (uint32_t)m->m_pkthdr.mp_dsn, m->m_pkthdr.mp_rlen,
			    m->m_pkthdr.mp_rseq, n->m_pkthdr.mp_rlen);
			goto dont_queue;
		}

		/* m is covering the next mbuf entirely, thus we remove this guy */
		if (m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen >= n->m_pkthdr.mp_dsn + n->m_pkthdr.mp_rlen) {
			struct mbuf *tmp = n->m_nextpkt;

			os_log(mptcp_log_handle, "%s - %lx: m (dsn %u len %u) is covering existing mbuf (dsn %u len %u)\n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
			    (uint32_t)m->m_pkthdr.mp_dsn, m->m_pkthdr.mp_rlen,
			    (uint32_t)n->m_pkthdr.mp_dsn, n->m_pkthdr.mp_rlen);

			m->m_nextpkt = NULL;
			if (prev == NULL) {
				mpte->mpte_reinjectq = tmp;
			} else {
				prev->m_nextpkt = tmp;
			}

			m_freem(n);
			n = tmp;
		}
	}

	if (prev) {
		/* m is already fully covered by the previous mbuf in the queue */
		/* NOTE(review): right-hand side uses m_pkthdr.len while the rest
		 * of this function compares mp_rlen - confirm the asymmetry is
		 * intentional before changing it.
		 */
		if (prev->m_pkthdr.mp_dsn + prev->m_pkthdr.mp_rlen >= m->m_pkthdr.mp_dsn + m->m_pkthdr.len) {
			os_log(mptcp_log_handle, "%s - %lx: prev (dsn %u len %u) covers us (dsn %u len %u)\n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
			    (uint32_t)prev->m_pkthdr.mp_dsn, prev->m_pkthdr.mp_rlen,
			    (uint32_t)m->m_pkthdr.mp_dsn, m->m_pkthdr.mp_rlen);
			goto dont_queue;
		}
	}

	/* Link m between prev and n (either may be absent). */
	if (prev == NULL) {
		mpte->mpte_reinjectq = m;
	} else {
		prev->m_nextpkt = m;
	}

	m->m_nextpkt = n;

	return;

dont_queue:
	m_freem(m);
	return;
}
3475 
3476 static struct mbuf *
mptcp_lookup_dsn(struct mptses * mpte,uint64_t dsn)3477 mptcp_lookup_dsn(struct mptses *mpte, uint64_t dsn)
3478 {
3479 	struct socket *mp_so = mptetoso(mpte);
3480 	struct mbuf *m;
3481 
3482 	m = mp_so->so_snd.sb_mb;
3483 
3484 	while (m) {
3485 		/* If this segment covers what we are looking for, return it. */
3486 		if (MPTCP_SEQ_LEQ(m->m_pkthdr.mp_dsn, dsn) &&
3487 		    MPTCP_SEQ_GT(m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen, dsn)) {
3488 			break;
3489 		}
3490 
3491 
3492 		/* Segment is no more in the queue */
3493 		if (MPTCP_SEQ_GT(m->m_pkthdr.mp_dsn, dsn)) {
3494 			return NULL;
3495 		}
3496 
3497 		m = m->m_next;
3498 	}
3499 
3500 	return m;
3501 }
3502 
3503 static struct mbuf *
mptcp_copy_mbuf_list(struct mptses * mpte,struct mbuf * m,int len)3504 mptcp_copy_mbuf_list(struct mptses *mpte, struct mbuf *m, int len)
3505 {
3506 	struct mbuf *top = NULL, *tail = NULL;
3507 	uint64_t dsn;
3508 	uint32_t dlen, rseq;
3509 
3510 	dsn = m->m_pkthdr.mp_dsn;
3511 	dlen = m->m_pkthdr.mp_rlen;
3512 	rseq = m->m_pkthdr.mp_rseq;
3513 
3514 	while (len > 0) {
3515 		struct mbuf *n;
3516 
3517 		VERIFY((m->m_flags & M_PKTHDR) && (m->m_pkthdr.pkt_flags & PKTF_MPTCP));
3518 
3519 		n = m_copym_mode(m, 0, m->m_len, M_DONTWAIT, NULL, NULL, M_COPYM_MUST_COPY_HDR);
3520 		if (n == NULL) {
3521 			os_log_error(mptcp_log_handle, "%s - %lx: m_copym_mode returned NULL\n",
3522 			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
3523 			goto err;
3524 		}
3525 
3526 		VERIFY(n->m_flags & M_PKTHDR);
3527 		VERIFY(n->m_next == NULL);
3528 		VERIFY(n->m_pkthdr.mp_dsn == dsn);
3529 		VERIFY(n->m_pkthdr.mp_rlen == dlen);
3530 		VERIFY(n->m_pkthdr.mp_rseq == rseq);
3531 		VERIFY(n->m_len == m->m_len);
3532 
3533 		n->m_pkthdr.pkt_flags |= (PKTF_MPSO | PKTF_MPTCP);
3534 
3535 		if (top == NULL) {
3536 			top = n;
3537 		}
3538 
3539 		if (tail != NULL) {
3540 			tail->m_next = n;
3541 		}
3542 
3543 		tail = n;
3544 
3545 		len -= m->m_len;
3546 		m = m->m_next;
3547 	}
3548 
3549 	return top;
3550 
3551 err:
3552 	if (top) {
3553 		m_freem(top);
3554 	}
3555 
3556 	return NULL;
3557 }
3558 
/*
 * Walk the subflow's send buffer and copy every DSN mapping that has not
 * yet been acknowledged at the MPTCP data level into the session's
 * reinject queue, so another subflow can retransmit it.  Mappings already
 * queued for reinjection (PKTF_MPTCP_REINJ) are skipped.
 */
static void
mptcp_reinject_mbufs(struct socket *so)
{
	struct tcpcb *tp = sototcpcb(so);
	struct mptsub *mpts = tp->t_mpsub;
	struct mptcb *mp_tp = tptomptp(tp);
	struct mptses *mpte = mp_tp->mpt_mpte;
	struct sockbuf *sb = &so->so_snd;
	struct mbuf *m;

	m = sb->sb_mb;
	while (m) {
		/* `orig' remembers the head of this mapping; `m' may be
		 * re-pointed below (lookup/copy) before we queue it.
		 */
		struct mbuf *n = m->m_next, *orig = m;
		bool set_reinject_flag = false;

		VERIFY((m->m_flags & M_PKTHDR) && (m->m_pkthdr.pkt_flags & PKTF_MPTCP));

		/* Already queued for reinjection on a previous pass. */
		if (m->m_pkthdr.pkt_flags & PKTF_MPTCP_REINJ) {
			goto next;
		}

		/* Has it all already been acknowledged at the data-level? */
		if (MPTCP_SEQ_GEQ(mp_tp->mpt_snduna, m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen)) {
			goto next;
		}

		/* Part of this has already been acknowledged - lookup in the
		 * MPTCP-socket for the segment.
		 */
		if (SEQ_GT(tp->snd_una - mpts->mpts_iss, m->m_pkthdr.mp_rseq)) {
			m = mptcp_lookup_dsn(mpte, m->m_pkthdr.mp_dsn);
			if (m == NULL) {
				goto next;
			}
		}

		/* Copy the mbuf with headers (aka, DSN-numbers) */
		m = mptcp_copy_mbuf_list(mpte, m, m->m_pkthdr.mp_rlen);
		if (m == NULL) {
			/* Copy failed (no buffers) - stop reinjecting for now. */
			break;
		}

		VERIFY(m->m_nextpkt == NULL);

		/* Now, add to the reinject-queue, eliminating overlapping
		 * segments
		 */
		mptcp_add_reinjectq(mpte, m);

		set_reinject_flag = true;
		orig->m_pkthdr.pkt_flags |= PKTF_MPTCP_REINJ;

next:
		/* mp_rlen can cover multiple mbufs, so advance to the end of it. */
		while (n) {
			VERIFY((n->m_flags & M_PKTHDR) && (n->m_pkthdr.pkt_flags & PKTF_MPTCP));

			if (n->m_pkthdr.mp_dsn != orig->m_pkthdr.mp_dsn) {
				break;
			}

			/* Mark the whole mapping as reinjected, not just its head. */
			if (set_reinject_flag) {
				n->m_pkthdr.pkt_flags |= PKTF_MPTCP_REINJ;
			}
			n = n->m_next;
		}

		m = n;
	}
}
3629 
3630 void
mptcp_clean_reinjectq(struct mptses * mpte)3631 mptcp_clean_reinjectq(struct mptses *mpte)
3632 {
3633 	struct mptcb *mp_tp = mpte->mpte_mptcb;
3634 
3635 	socket_lock_assert_owned(mptetoso(mpte));
3636 
3637 	while (mpte->mpte_reinjectq) {
3638 		struct mbuf *m = mpte->mpte_reinjectq;
3639 
3640 		if (MPTCP_SEQ_GEQ(m->m_pkthdr.mp_dsn, mp_tp->mpt_snduna) ||
3641 		    MPTCP_SEQ_GT(m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen, mp_tp->mpt_snduna)) {
3642 			break;
3643 		}
3644 
3645 		mpte->mpte_reinjectq = m->m_nextpkt;
3646 		m->m_nextpkt = NULL;
3647 		m_freem(m);
3648 	}
3649 }
3650 
3651 static ev_ret_t
mptcp_subflow_propagate_ev(struct mptses * mpte,struct mptsub * mpts,uint32_t * p_mpsofilt_hint,uint32_t event)3652 mptcp_subflow_propagate_ev(struct mptses *mpte, struct mptsub *mpts,
3653     uint32_t *p_mpsofilt_hint, uint32_t event)
3654 {
3655 	struct socket *mp_so, *so;
3656 	struct mptcb *mp_tp;
3657 
3658 	mp_so = mptetoso(mpte);
3659 	mp_tp = mpte->mpte_mptcb;
3660 	so = mpts->mpts_socket;
3661 
3662 	/*
3663 	 * We got an event for this subflow that might need to be propagated,
3664 	 * based on the state of the MPTCP connection.
3665 	 */
3666 	if (mp_tp->mpt_state < MPTCPS_ESTABLISHED ||
3667 	    (!(mp_tp->mpt_flags & MPTCPF_JOIN_READY) && !(mpts->mpts_flags & MPTSF_MP_READY)) ||
3668 	    ((mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) && (mpts->mpts_flags & MPTSF_ACTIVE))) {
3669 		mp_so->so_error = so->so_error;
3670 		*p_mpsofilt_hint |= event;
3671 	}
3672 
3673 	return MPTS_EVRET_OK;
3674 }
3675 
3676 /*
3677  * Handle SO_FILT_HINT_NOSRCADDR subflow socket event.
3678  */
3679 static ev_ret_t
mptcp_subflow_nosrcaddr_ev(struct mptses * mpte,struct mptsub * mpts,uint32_t * p_mpsofilt_hint,uint32_t event)3680 mptcp_subflow_nosrcaddr_ev(struct mptses *mpte, struct mptsub *mpts,
3681     uint32_t *p_mpsofilt_hint, uint32_t event)
3682 {
3683 	struct socket *mp_so;
3684 	struct tcpcb *tp;
3685 
3686 	mp_so = mptetoso(mpte);
3687 	tp = intotcpcb(sotoinpcb(mpts->mpts_socket));
3688 
3689 	/*
3690 	 * This overwrites any previous mpte_lost_aid to avoid storing
3691 	 * too much state when the typical case has only two subflows.
3692 	 */
3693 	mpte->mpte_flags |= MPTE_SND_REM_ADDR;
3694 	mpte->mpte_lost_aid = tp->t_local_aid;
3695 
3696 	/*
3697 	 * The subflow connection has lost its source address.
3698 	 */
3699 	mptcp_subflow_abort(mpts, EADDRNOTAVAIL);
3700 
3701 	if (mp_so->so_flags & SOF_NOADDRAVAIL) {
3702 		mptcp_subflow_propagate_ev(mpte, mpts, p_mpsofilt_hint, event);
3703 	}
3704 
3705 	return MPTS_EVRET_DELETE;
3706 }
3707 
3708 static ev_ret_t
mptcp_subflow_mpsuberror_ev(struct mptses * mpte,struct mptsub * mpts,uint32_t * p_mpsofilt_hint,uint32_t event)3709 mptcp_subflow_mpsuberror_ev(struct mptses *mpte, struct mptsub *mpts,
3710     uint32_t *p_mpsofilt_hint, uint32_t event)
3711 {
3712 #pragma unused(event, p_mpsofilt_hint)
3713 	struct socket *so, *mp_so;
3714 
3715 	so = mpts->mpts_socket;
3716 
3717 	if (so->so_error != ENODATA) {
3718 		return MPTS_EVRET_OK;
3719 	}
3720 
3721 
3722 	mp_so = mptetoso(mpte);
3723 
3724 	mp_so->so_error = ENODATA;
3725 
3726 	sorwakeup(mp_so);
3727 	sowwakeup(mp_so);
3728 
3729 	return MPTS_EVRET_OK;
3730 }
3731 
3732 
3733 /*
3734  * Handle SO_FILT_HINT_MPCANTRCVMORE subflow socket event that
3735  * indicates that the remote side sent a Data FIN
3736  */
3737 static ev_ret_t
mptcp_subflow_mpcantrcvmore_ev(struct mptses * mpte,struct mptsub * mpts,uint32_t * p_mpsofilt_hint,uint32_t event)3738 mptcp_subflow_mpcantrcvmore_ev(struct mptses *mpte, struct mptsub *mpts,
3739     uint32_t *p_mpsofilt_hint, uint32_t event)
3740 {
3741 #pragma unused(event, mpts)
3742 	struct mptcb *mp_tp = mpte->mpte_mptcb;
3743 
3744 	/*
3745 	 * We got a Data FIN for the MPTCP connection.
3746 	 * The FIN may arrive with data. The data is handed up to the
3747 	 * mptcp socket and the user is notified so that it may close
3748 	 * the socket if needed.
3749 	 */
3750 	if (mp_tp->mpt_state == MPTCPS_CLOSE_WAIT) {
3751 		*p_mpsofilt_hint |= SO_FILT_HINT_CANTRCVMORE;
3752 	}
3753 
3754 	return MPTS_EVRET_OK; /* keep the subflow socket around */
3755 }
3756 
3757 /*
3758  * Handle SO_FILT_HINT_MPFAILOVER subflow socket event
3759  */
3760 static ev_ret_t
mptcp_subflow_failover_ev(struct mptses * mpte,struct mptsub * mpts,uint32_t * p_mpsofilt_hint,uint32_t event)3761 mptcp_subflow_failover_ev(struct mptses *mpte, struct mptsub *mpts,
3762     uint32_t *p_mpsofilt_hint, uint32_t event)
3763 {
3764 #pragma unused(event, p_mpsofilt_hint)
3765 	struct mptsub *mpts_alt = NULL;
3766 	struct socket *alt_so = NULL;
3767 	struct socket *mp_so;
3768 	int altpath_exists = 0;
3769 
3770 	mp_so = mptetoso(mpte);
3771 	os_log_info(mptcp_log_handle, "%s - %lx\n", __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
3772 
3773 	mptcp_reinject_mbufs(mpts->mpts_socket);
3774 
3775 	mpts_alt = mptcp_get_subflow(mpte, NULL);
3776 
3777 	/* If there is no alternate eligible subflow, ignore the failover hint. */
3778 	if (mpts_alt == NULL || mpts_alt == mpts) {
3779 		os_log(mptcp_log_handle, "%s - %lx no alternate path\n", __func__,
3780 		    (unsigned long)VM_KERNEL_ADDRPERM(mpte));
3781 
3782 		goto done;
3783 	}
3784 
3785 	altpath_exists = 1;
3786 	alt_so = mpts_alt->mpts_socket;
3787 	if (mpts_alt->mpts_flags & MPTSF_FAILINGOVER) {
3788 		/* All data acknowledged and no RTT spike */
3789 		if (alt_so->so_snd.sb_cc == 0 && mptcp_no_rto_spike(alt_so)) {
3790 			mpts_alt->mpts_flags &= ~MPTSF_FAILINGOVER;
3791 		} else {
3792 			/* no alternate path available */
3793 			altpath_exists = 0;
3794 		}
3795 	}
3796 
3797 	if (altpath_exists) {
3798 		mpts_alt->mpts_flags |= MPTSF_ACTIVE;
3799 
3800 		mpte->mpte_active_sub = mpts_alt;
3801 		mpts->mpts_flags |= MPTSF_FAILINGOVER;
3802 		mpts->mpts_flags &= ~MPTSF_ACTIVE;
3803 
3804 		os_log_info(mptcp_log_handle, "%s - %lx: switched from %d to %d\n",
3805 		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpts->mpts_connid, mpts_alt->mpts_connid);
3806 
3807 		mptcpstats_inc_switch(mpte, mpts);
3808 
3809 		sowwakeup(alt_so);
3810 	} else {
3811 done:
3812 		mpts->mpts_socket->so_flags &= ~SOF_MP_TRYFAILOVER;
3813 	}
3814 
3815 	return MPTS_EVRET_OK;
3816 }
3817 
3818 /*
3819  * Handle SO_FILT_HINT_IFDENIED subflow socket event.
3820  */
3821 static ev_ret_t
mptcp_subflow_ifdenied_ev(struct mptses * mpte,struct mptsub * mpts,uint32_t * p_mpsofilt_hint,uint32_t event)3822 mptcp_subflow_ifdenied_ev(struct mptses *mpte, struct mptsub *mpts,
3823     uint32_t *p_mpsofilt_hint, uint32_t event)
3824 {
3825 	/*
3826 	 * The subflow connection cannot use the outgoing interface, let's
3827 	 * close this subflow.
3828 	 */
3829 	mptcp_subflow_abort(mpts, EPERM);
3830 
3831 	mptcp_subflow_propagate_ev(mpte, mpts, p_mpsofilt_hint, event);
3832 
3833 	return MPTS_EVRET_DELETE;
3834 }
3835 
3836 /*
3837  * https://tools.ietf.org/html/rfc6052#section-2
3838  * https://tools.ietf.org/html/rfc6147#section-5.2
3839  */
3840 static boolean_t
mptcp_desynthesize_ipv6_addr(struct mptses * mpte,const struct in6_addr * addr,const struct ipv6_prefix * prefix,struct in_addr * addrv4)3841 mptcp_desynthesize_ipv6_addr(struct mptses *mpte, const struct in6_addr *addr,
3842     const struct ipv6_prefix *prefix,
3843     struct in_addr *addrv4)
3844 {
3845 	char buf[MAX_IPv4_STR_LEN];
3846 	char *ptrv4 = (char *)addrv4;
3847 	const char *ptr = (const char *)addr;
3848 
3849 	if (memcmp(addr, &prefix->ipv6_prefix, prefix->prefix_len) != 0) {
3850 		return false;
3851 	}
3852 
3853 	switch (prefix->prefix_len) {
3854 	case NAT64_PREFIX_LEN_96:
3855 		memcpy(ptrv4, ptr + 12, 4);
3856 		break;
3857 	case NAT64_PREFIX_LEN_64:
3858 		memcpy(ptrv4, ptr + 9, 4);
3859 		break;
3860 	case NAT64_PREFIX_LEN_56:
3861 		memcpy(ptrv4, ptr + 7, 1);
3862 		memcpy(ptrv4 + 1, ptr + 9, 3);
3863 		break;
3864 	case NAT64_PREFIX_LEN_48:
3865 		memcpy(ptrv4, ptr + 6, 2);
3866 		memcpy(ptrv4 + 2, ptr + 9, 2);
3867 		break;
3868 	case NAT64_PREFIX_LEN_40:
3869 		memcpy(ptrv4, ptr + 5, 3);
3870 		memcpy(ptrv4 + 3, ptr + 9, 1);
3871 		break;
3872 	case NAT64_PREFIX_LEN_32:
3873 		memcpy(ptrv4, ptr + 4, 4);
3874 		break;
3875 	default:
3876 		panic("NAT64-prefix len is wrong: %u",
3877 		    prefix->prefix_len);
3878 	}
3879 
3880 	os_log_info(mptcp_log_handle, "%s - %lx: desynthesized to %s\n", __func__,
3881 	    (unsigned long)VM_KERNEL_ADDRPERM(mpte),
3882 	    inet_ntop(AF_INET, (void *)addrv4, buf, sizeof(buf)));
3883 
3884 	return true;
3885 }
3886 
3887 static void
mptcp_handle_ipv6_connection(struct mptses * mpte,const struct mptsub * mpts)3888 mptcp_handle_ipv6_connection(struct mptses *mpte, const struct mptsub *mpts)
3889 {
3890 	struct ipv6_prefix nat64prefixes[NAT64_MAX_NUM_PREFIXES];
3891 	struct socket *so = mpts->mpts_socket;
3892 	struct ifnet *ifp;
3893 	int j;
3894 
3895 	/* Subflow IPs will be steered directly by the server - no need to
3896 	 * desynthesize.
3897 	 */
3898 	if (mpte->mpte_flags & MPTE_UNICAST_IP) {
3899 		return;
3900 	}
3901 
3902 	ifp = sotoinpcb(so)->inp_last_outifp;
3903 
3904 	if (ifnet_get_nat64prefix(ifp, nat64prefixes) == ENOENT) {
3905 		return;
3906 	}
3907 
3908 	for (j = 0; j < NAT64_MAX_NUM_PREFIXES; j++) {
3909 		int success;
3910 
3911 		if (nat64prefixes[j].prefix_len == 0) {
3912 			continue;
3913 		}
3914 
3915 		success = mptcp_desynthesize_ipv6_addr(mpte,
3916 		    &mpte->__mpte_dst_v6.sin6_addr,
3917 		    &nat64prefixes[j],
3918 		    &mpte->mpte_sub_dst_v4.sin_addr);
3919 		if (success) {
3920 			mpte->mpte_sub_dst_v4.sin_len = sizeof(mpte->mpte_sub_dst_v4);
3921 			mpte->mpte_sub_dst_v4.sin_family = AF_INET;
3922 			mpte->mpte_sub_dst_v4.sin_port = mpte->__mpte_dst_v6.sin6_port;
3923 
3924 			/*
3925 			 * We connected to a NAT64'ed address. Let's remove it
3926 			 * from the potential IPs to use. Whenever we are back on
3927 			 * that network and need to connect, we can synthesize again.
3928 			 *
3929 			 * Otherwise, on different IPv6 networks we will attempt
3930 			 * to connect to that NAT64 address...
3931 			 */
3932 			memset(&mpte->mpte_sub_dst_v6, 0, sizeof(mpte->mpte_sub_dst_v6));
3933 			break;
3934 		}
3935 	}
3936 }
3937 
3938 static void
mptcp_try_alternate_port(struct mptses * mpte,struct mptsub * mpts)3939 mptcp_try_alternate_port(struct mptses *mpte, struct mptsub *mpts)
3940 {
3941 	struct inpcb *inp;
3942 
3943 	if (!mptcp_ok_to_create_subflows(mpte->mpte_mptcb)) {
3944 		return;
3945 	}
3946 
3947 	inp = sotoinpcb(mpts->mpts_socket);
3948 	if (inp == NULL) {
3949 		return;
3950 	}
3951 
3952 	/* Should we try the alternate port? */
3953 	if (mpte->mpte_alternate_port &&
3954 	    inp->inp_fport != mpte->mpte_alternate_port) {
3955 		union sockaddr_in_4_6 dst;
3956 		struct sockaddr_in *dst_in = SIN(&dst);
3957 
3958 		SOCKADDR_COPY(&mpts->mpts_dst, &dst, mpts->mpts_dst.sa_len);
3959 
3960 		dst_in->sin_port = mpte->mpte_alternate_port;
3961 
3962 		mptcp_subflow_add(mpte, NULL, SA(&dst), mpts->mpts_ifscope, NULL);
3963 	} else { /* Else, we tried all we could, mark this interface as non-MPTCP */
3964 		unsigned int i;
3965 
3966 		if (inp->inp_last_outifp == NULL) {
3967 			return;
3968 		}
3969 
3970 		for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
3971 			struct mpt_itf_info *info =  &mpte->mpte_itfinfo[i];
3972 
3973 			if (inp->inp_last_outifp->if_index == info->ifindex) {
3974 				info->no_mptcp_support = 1;
3975 				break;
3976 			}
3977 		}
3978 	}
3979 }
3980 
/*
 * If TFO data is successfully acked, it must be dropped from the mptcp so.
 *
 * Data sent along with the TFO SYN sits both in the subflow's send path and
 * in the MPTCP socket's send buffer.  Once the peer acknowledges (part of)
 * it at the TCP level, rewind the MPTCP-level send state and drop the
 * acknowledged bytes from the MPTCP send buffer so they are not sent twice.
 */
static void
mptcp_drop_tfo_data(struct mptses *mpte, struct mptsub *mpts)
{
	struct socket *mp_so = mptetoso(mpte);
	struct socket *so = mpts->mpts_socket;
	struct tcpcb *tp = intotcpcb(sotoinpcb(so));
	struct mptcb *mp_tp = mpte->mpte_mptcb;

	/* If data was sent with SYN, rewind state */
	if (tp->t_tfo_stats & TFO_S_SYN_DATA_ACKED) {
		/* Bytes outstanding at the MPTCP level */
		u_int64_t mp_droplen = mp_tp->mpt_sndnxt - mp_tp->mpt_snduna;
		/* TCP-level SYN-data bytes the peer acked (excluding the SYN) */
		unsigned int tcp_droplen = tp->snd_una - tp->iss - 1;

		VERIFY(mp_droplen <= (UINT_MAX));
		VERIFY(mp_droplen >= tcp_droplen);

		mpts->mpts_flags &= ~MPTSF_TFO_REQD;
		mpts->mpts_iss += tcp_droplen;
		tp->t_mpflags &= ~TMPF_TFO_REQUEST;

		if (mp_droplen > tcp_droplen) {
			/* handle partial TCP ack */
			mp_so->so_flags1 |= SOF1_TFO_REWIND;
			mp_tp->mpt_sndnxt = mp_tp->mpt_snduna + (mp_droplen - tcp_droplen);
			mp_droplen = tcp_droplen;
		} else {
			/* all data on SYN was acked */
			mpts->mpts_rel_seq = 1;
			mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;
		}
		mp_tp->mpt_sndmax -= tcp_droplen;

		/* Drop the acked bytes from the MPTCP-level send buffer */
		if (mp_droplen != 0) {
			VERIFY(mp_so->so_snd.sb_mb != NULL);
			sbdrop(&mp_so->so_snd, (int)mp_droplen);
		}
	}
}
4020 
/*
 * Handle SO_FILT_HINT_CONNECTED subflow socket event.
 *
 * Decides, once a subflow completes its TCP handshake, whether it joined
 * as a real MPTCP subflow or must be treated as plain TCP (fallback for
 * the initial subflow, abort/retry for additional subflows).
 */
static ev_ret_t
mptcp_subflow_connected_ev(struct mptses *mpte, struct mptsub *mpts,
    uint32_t *p_mpsofilt_hint, uint32_t event)
{
#pragma unused(event, p_mpsofilt_hint)
	struct socket *mp_so, *so;
	struct inpcb *inp;
	struct tcpcb *tp;
	struct mptcb *mp_tp;
	int af;
	boolean_t mpok = FALSE;

	mp_so = mptetoso(mpte);
	mp_tp = mpte->mpte_mptcb;
	so = mpts->mpts_socket;
	tp = sototcpcb(so);
	af = mpts->mpts_dst.sa_family;

	/* Ignore duplicate CONNECTED events. */
	if (mpts->mpts_flags & MPTSF_CONNECTED) {
		return MPTS_EVRET_OK;
	}

	/* A subflow that is already going away cannot become connected. */
	if ((mpts->mpts_flags & MPTSF_DISCONNECTED) ||
	    (mpts->mpts_flags & MPTSF_DISCONNECTING)) {
		return MPTS_EVRET_OK;
	}

	/*
	 * The subflow connection has been connected.  Find out whether it
	 * is connected as a regular TCP or as a MPTCP subflow.  The idea is:
	 *
	 *   a. If MPTCP connection is not yet established, then this must be
	 *	the first subflow connection.  If MPTCP failed to negotiate,
	 *	fallback to regular TCP by degrading this subflow.
	 *
	 *   b. If MPTCP connection has been established, then this must be
	 *	one of the subsequent subflow connections. If MPTCP failed
	 *	to negotiate, disconnect the connection.
	 *
	 * Right now, we simply unblock any waiters at the MPTCP socket layer
	 * if the MPTCP connection has not been established.
	 */

	if (so->so_state & SS_ISDISCONNECTED) {
		/*
		 * With MPTCP joins, a connection is connected at the subflow
		 * level, but the 4th ACK from the server elevates the MPTCP
		 * subflow to connected state. So there is a small window
		 * where the subflow could get disconnected before the
		 * connected event is processed.
		 */
		return MPTS_EVRET_OK;
	}

	/* TFO data acknowledged with the SYN must be rewound first. */
	if (mpts->mpts_flags & MPTSF_TFO_REQD) {
		mptcp_drop_tfo_data(mpte, mpts);
	}

	mpts->mpts_flags &= ~(MPTSF_CONNECTING | MPTSF_TFO_REQD);
	mpts->mpts_flags |= MPTSF_CONNECTED;

	if (tp->t_mpflags & TMPF_MPTCP_TRUE) {
		mpts->mpts_flags |= MPTSF_MP_CAPABLE;
	}

	tp->t_mpflags &= ~TMPF_TFO_REQUEST;

	/* get/verify the outbound interface */
	inp = sotoinpcb(so);

	mpts->mpts_maxseg = tp->t_maxseg;

	/* Did the peer successfully negotiate MPTCP on this subflow? */
	mpok = (mpts->mpts_flags & MPTSF_MP_CAPABLE);

	if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
		/* Case (a): this is the initial subflow. */
		mp_tp->mpt_state = MPTCPS_ESTABLISHED;
		mpte->mpte_associd = mpts->mpts_connid;
		DTRACE_MPTCP2(state__change,
		    struct mptcb *, mp_tp,
		    uint32_t, 0 /* event */);

		/* Record the local address the initial subflow bound to. */
		if (SOCK_DOM(so) == AF_INET) {
			in_getsockaddr_s(so, &mpte->__mpte_src_v4);
		} else {
			in6_getsockaddr_s(so, &mpte->__mpte_src_v6);
		}

		mpts->mpts_flags |= MPTSF_ACTIVE;

		/* case (a) above */
		if (!mpok) {
			/* Peer refused MPTCP: degrade to plain TCP. */
			tcpstat.tcps_mpcap_fallback++;

			tp->t_mpflags |= TMPF_INFIN_SENT;
			mptcp_notify_mpfail(so);
		} else {
			/* Cell subflows may need to be advertised as backup. */
			if (IFNET_IS_CELLULAR(inp->inp_last_outifp) &&
			    mptcp_subflows_need_backup_flag(mpte)) {
				tp->t_mpflags |= (TMPF_BACKUP_PATH | TMPF_SND_MPPRIO);
			} else {
				mpts->mpts_flags |= MPTSF_PREFERRED;
			}
			mpts->mpts_flags |= MPTSF_MPCAP_CTRSET;
			mpte->mpte_nummpcapflows++;

			/* Recover a NAT64-embedded IPv4 destination, if any. */
			if (SOCK_DOM(so) == AF_INET6) {
				mptcp_handle_ipv6_connection(mpte, mpts);
			}

			mptcp_check_subflows_and_add(mpte);

			if (IFNET_IS_CELLULAR(inp->inp_last_outifp)) {
				mpte->mpte_initial_cell = 1;
			}

			mpte->mpte_handshake_success = 1;
		}

		/* Seed the MPTCP-level send window from the subflow. */
		mp_tp->mpt_sndwnd = tp->snd_wnd;
		mp_tp->mpt_sndwl1 = mp_tp->mpt_rcvnxt;
		mp_tp->mpt_sndwl2 = mp_tp->mpt_snduna;
		soisconnected(mp_so);
	} else if (mpok) {
		/*
		 * case (b) above
		 * In case of additional flows, the MPTCP socket is not
		 * MPTSF_MP_CAPABLE until an ACK is received from server
		 * for 3-way handshake.  TCP would have guaranteed that this
		 * is an MPTCP subflow.
		 */
		if (IFNET_IS_CELLULAR(inp->inp_last_outifp) &&
		    !(tp->t_mpflags & TMPF_BACKUP_PATH) &&
		    mptcp_subflows_need_backup_flag(mpte)) {
			tp->t_mpflags |= (TMPF_BACKUP_PATH | TMPF_SND_MPPRIO);
			mpts->mpts_flags &= ~MPTSF_PREFERRED;
		} else {
			mpts->mpts_flags |= MPTSF_PREFERRED;
		}

		mpts->mpts_flags |= MPTSF_MPCAP_CTRSET;
		mpte->mpte_nummpcapflows++;

		mpts->mpts_rel_seq = 1;

		mptcp_check_subflows_and_remove(mpte);
	} else {
		/* Additional subflow failed to join: retry, then kill it. */
		mptcp_try_alternate_port(mpte, mpts);

		tcpstat.tcps_join_fallback++;
		if (IFNET_IS_CELLULAR(inp->inp_last_outifp)) {
			tcpstat.tcps_mptcp_cell_proxy++;
		} else {
			tcpstat.tcps_mptcp_wifi_proxy++;
		}

		soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);

		return MPTS_EVRET_OK;
	}

	/* This call, just to "book" an entry in the stats-table for this ifindex */
	mptcpstats_get_index(mpte->mpte_itfstats, mpts);

	mptcp_output(mpte);

	return MPTS_EVRET_OK; /* keep the subflow socket around */
}
4191 
/*
 * Handle SO_FILT_HINT_DISCONNECTED subflow socket event.
 *
 * Performs the final bookkeeping for a subflow that went away and tells
 * the workloop (via MPTS_EVRET_DELETE) to close the subflow socket.
 */
static ev_ret_t
mptcp_subflow_disconnected_ev(struct mptses *mpte, struct mptsub *mpts,
    uint32_t *p_mpsofilt_hint, uint32_t event)
{
#pragma unused(event, p_mpsofilt_hint)
	struct socket *mp_so, *so;
	struct mptcb *mp_tp;

	mp_so = mptetoso(mpte);
	mp_tp = mpte->mpte_mptcb;
	so = mpts->mpts_socket;

	/* Already processed: just ask for deletion again. */
	if (mpts->mpts_flags & MPTSF_DISCONNECTED) {
		return MPTS_EVRET_DELETE;
	}

	mpts->mpts_flags |= MPTSF_DISCONNECTED;

	/* The subflow connection has been disconnected. */

	if (mpts->mpts_flags & MPTSF_MPCAP_CTRSET) {
		/* Undo the MP_CAPABLE accounting done at connect time. */
		mpte->mpte_nummpcapflows--;
		if (mpte->mpte_active_sub == mpts) {
			mpte->mpte_active_sub = NULL;
		}
		mpts->mpts_flags &= ~MPTSF_MPCAP_CTRSET;
	} else {
		/* A secondary subflow died before connecting: maybe retry. */
		if (so->so_flags & SOF_MP_SEC_SUBFLOW &&
		    !(mpts->mpts_flags & MPTSF_CONNECTED)) {
			mptcp_try_alternate_port(mpte, mpts);
		}
	}

	/*
	 * Losing the initial subflow before establishment, or losing the
	 * active subflow after a fallback to TCP, kills the whole session.
	 */
	if (mp_tp->mpt_state < MPTCPS_ESTABLISHED ||
	    ((mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) && (mpts->mpts_flags & MPTSF_ACTIVE))) {
		mptcp_drop(mpte, mp_tp, so->so_error);
	}

	/*
	 * Clear flags that are used by getconninfo to return state.
	 * Retain like MPTSF_DELETEOK for internal purposes.
	 */
	mpts->mpts_flags &= ~(MPTSF_CONNECTING | MPTSF_CONNECT_PENDING |
	    MPTSF_CONNECTED | MPTSF_DISCONNECTING | MPTSF_PREFERRED |
	    MPTSF_MP_CAPABLE | MPTSF_MP_READY | MPTSF_MP_DEGRADED | MPTSF_ACTIVE);

	return MPTS_EVRET_DELETE;
}
4243 
4244 /*
4245  * Handle SO_FILT_HINT_MPSTATUS subflow socket event
4246  */
4247 static ev_ret_t
mptcp_subflow_mpstatus_ev(struct mptses * mpte,struct mptsub * mpts,uint32_t * p_mpsofilt_hint,uint32_t event)4248 mptcp_subflow_mpstatus_ev(struct mptses *mpte, struct mptsub *mpts,
4249     uint32_t *p_mpsofilt_hint, uint32_t event)
4250 {
4251 #pragma unused(event, p_mpsofilt_hint)
4252 	ev_ret_t ret = MPTS_EVRET_OK;
4253 	struct socket *mp_so, *so;
4254 	struct mptcb *mp_tp;
4255 
4256 	mp_so = mptetoso(mpte);
4257 	mp_tp = mpte->mpte_mptcb;
4258 	so = mpts->mpts_socket;
4259 	struct inpcb *inp = sotoinpcb(so);
4260 	struct tcpcb *tp = intotcpcb(inp);
4261 
4262 	if (sototcpcb(so)->t_mpflags & TMPF_MPTCP_TRUE) {
4263 		mpts->mpts_flags |= MPTSF_MP_CAPABLE;
4264 	} else {
4265 		mpts->mpts_flags &= ~MPTSF_MP_CAPABLE;
4266 	}
4267 
4268 	if (sototcpcb(so)->t_mpflags & TMPF_TCP_FALLBACK) {
4269 		if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
4270 			goto done;
4271 		}
4272 		mpts->mpts_flags |= MPTSF_MP_DEGRADED;
4273 	} else {
4274 		mpts->mpts_flags &= ~MPTSF_MP_DEGRADED;
4275 	}
4276 
4277 	if (sototcpcb(so)->t_mpflags & TMPF_MPTCP_READY) {
4278 		mpts->mpts_flags |= MPTSF_MP_READY;
4279 	} else {
4280 		mpts->mpts_flags &= ~MPTSF_MP_READY;
4281 	}
4282 
4283 	if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
4284 		mp_tp->mpt_flags |= MPTCPF_FALLBACK_TO_TCP;
4285 		mp_tp->mpt_flags &= ~MPTCPF_JOIN_READY;
4286 		tcp_cache_update_mptcp_version(tp, FALSE);
4287 	}
4288 
4289 	if (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) {
4290 		ret = MPTS_EVRET_DISCONNECT_FALLBACK;
4291 
4292 		m_freem_list(mpte->mpte_reinjectq);
4293 		mpte->mpte_reinjectq = NULL;
4294 	} else if (mpts->mpts_flags & MPTSF_MP_READY) {
4295 		mp_tp->mpt_flags |= MPTCPF_JOIN_READY;
4296 		ret = MPTS_EVRET_CONNECT_PENDING;
4297 	}
4298 
4299 done:
4300 	return ret;
4301 }
4302 
/*
 * Handle SO_FILT_HINT_MUSTRST subflow socket event.
 *
 * The subflow must be reset (invalid MPTCP option received, or the peer
 * fast-closed the connection).  Sends an explicit RST on the subflow,
 * aborts it, and - on a fastclose of a non-fallback connection - tears
 * down all other subflows and the MPTCP session itself.
 */
static ev_ret_t
mptcp_subflow_mustrst_ev(struct mptses *mpte, struct mptsub *mpts,
    uint32_t *p_mpsofilt_hint, uint32_t event)
{
#pragma unused(event)
	struct socket *mp_so, *so;
	struct mptcb *mp_tp;
	boolean_t is_fastclose;

	mp_so = mptetoso(mpte);
	mp_tp = mpte->mpte_mptcb;
	so = mpts->mpts_socket;

	/* We got an invalid option or a fast close */
	struct inpcb *inp = sotoinpcb(so);
	struct tcpcb *tp = NULL;

	tp = intotcpcb(inp);
	so->so_error = ECONNABORTED;

	/* Remember whether the peer initiated an MPTCP fastclose. */
	is_fastclose = !!(tp->t_mpflags & TMPF_FASTCLOSERCV);

	tp->t_mpflags |= TMPF_RESET;

	if (tp->t_state != TCPS_CLOSED) {
		/* Craft and send an explicit RST segment on this subflow. */
		struct mbuf *m;
		struct tcptemp *t_template = tcp_maketemplate(tp, &m);

		if (t_template) {
			struct tcp_respond_args tra;

			bzero(&tra, sizeof(tra));
			/* Scope the RST to the bound interface, if any. */
			if (inp->inp_flags & INP_BOUND_IF) {
				tra.ifscope = inp->inp_boundifp->if_index;
			} else {
				tra.ifscope = IFSCOPE_NONE;
			}
			tra.awdl_unrestricted = 1;

			tcp_respond(tp, t_template->tt_ipgen,
			    &t_template->tt_t, (struct mbuf *)NULL,
			    tp->rcv_nxt, tp->snd_una, TH_RST, &tra);
			(void) m_free(m);
		}
	}

	if (!(mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) && is_fastclose) {
		/* Fastclose kills the whole MPTCP connection. */
		struct mptsub *iter, *tmp;

		*p_mpsofilt_hint |= SO_FILT_HINT_CONNRESET;

		mp_so->so_error = ECONNRESET;

		/* Abort every other subflow of this session. */
		TAILQ_FOREACH_SAFE(iter, &mpte->mpte_subflows, mpts_entry, tmp) {
			if (iter == mpts) {
				continue;
			}
			mptcp_subflow_abort(iter, ECONNABORTED);
		}

		/*
		 * mptcp_drop is being called after processing the events, to fully
		 * close the MPTCP connection
		 */
		mptcp_drop(mpte, mp_tp, mp_so->so_error);
	}

	mptcp_subflow_abort(mpts, ECONNABORTED);

	/* Speed up garbage collection of this connection. */
	if (mp_tp->mpt_gc_ticks == MPT_GC_TICKS) {
		mp_tp->mpt_gc_ticks = MPT_GC_TICKS_FAST;
	}

	return MPTS_EVRET_DELETE;
}
4381 
4382 static ev_ret_t
mptcp_subflow_adaptive_rtimo_ev(struct mptses * mpte,struct mptsub * mpts,uint32_t * p_mpsofilt_hint,uint32_t event)4383 mptcp_subflow_adaptive_rtimo_ev(struct mptses *mpte, struct mptsub *mpts,
4384     uint32_t *p_mpsofilt_hint, uint32_t event)
4385 {
4386 #pragma unused(event)
4387 	bool found_active = false;
4388 
4389 	mpts->mpts_flags |= MPTSF_READ_STALL;
4390 
4391 	TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
4392 		struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
4393 
4394 		if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
4395 		    TCPS_HAVERCVDFIN2(tp->t_state)) {
4396 			continue;
4397 		}
4398 
4399 		if (!(mpts->mpts_flags & MPTSF_READ_STALL)) {
4400 			found_active = true;
4401 			break;
4402 		}
4403 	}
4404 
4405 	if (!found_active) {
4406 		*p_mpsofilt_hint |= SO_FILT_HINT_ADAPTIVE_RTIMO;
4407 	}
4408 
4409 	return MPTS_EVRET_OK;
4410 }
4411 
4412 static ev_ret_t
mptcp_subflow_adaptive_wtimo_ev(struct mptses * mpte,struct mptsub * mpts,uint32_t * p_mpsofilt_hint,uint32_t event)4413 mptcp_subflow_adaptive_wtimo_ev(struct mptses *mpte, struct mptsub *mpts,
4414     uint32_t *p_mpsofilt_hint, uint32_t event)
4415 {
4416 #pragma unused(event)
4417 	bool found_active = false;
4418 
4419 	mpts->mpts_flags |= MPTSF_WRITE_STALL;
4420 
4421 	TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
4422 		struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
4423 
4424 		if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
4425 		    tp->t_state > TCPS_CLOSE_WAIT) {
4426 			continue;
4427 		}
4428 
4429 		if (!(mpts->mpts_flags & MPTSF_WRITE_STALL)) {
4430 			found_active = true;
4431 			break;
4432 		}
4433 	}
4434 
4435 	if (!found_active) {
4436 		*p_mpsofilt_hint |= SO_FILT_HINT_ADAPTIVE_WTIMO;
4437 	}
4438 
4439 	return MPTS_EVRET_OK;
4440 }
4441 
/*
 * Issues SOPT_SET on an MPTCP subflow socket; socket must already be locked,
 * caller must ensure that the option can be issued on subflow sockets, via
 * MPOF_SUBFLOW_OK flag.
 *
 * Returns 0 on success (or when the option is deliberately skipped for
 * this subflow), otherwise the error from sosetoptlock().
 */
int
mptcp_subflow_sosetopt(struct mptses *mpte, struct mptsub *mpts, struct mptopt *mpo)
{
	struct socket *mp_so, *so;
	struct sockopt sopt;
	int error;

	VERIFY(mpo->mpo_flags & MPOF_SUBFLOW_OK);

	mp_so = mptetoso(mpte);
	so = mpts->mpts_socket;

	/* Don't try to apply an IP or IPv6 option on an IPv6 or IP socket */
	if (mpo->mpo_level == IPPROTO_IP && SOCK_CHECK_DOM(so, PF_INET6)) {
		return 0;
	}
	if (mpo->mpo_level == IPPROTO_IPV6 && SOCK_CHECK_DOM(so, PF_INET)) {
		return 0;
	}

	socket_lock_assert_owned(mp_so);

	if (mpte->mpte_mptcb->mpt_state >= MPTCPS_ESTABLISHED &&
	    mpo->mpo_level == SOL_SOCKET &&
	    mpo->mpo_name == SO_MARK_CELLFALLBACK) {
		struct ifnet *ifp = ifindex2ifnet[mpts->mpts_ifscope];

		/*
		 * When we open a new subflow, mark it as cell fallback, if
		 * this subflow goes over cell.
		 *
		 * (except for first-party apps)
		 */

		if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
			return 0;
		}

		/* Not a cell-fallback subflow: skip the option. */
		if (sotoinpcb(so)->inp_last_outifp &&
		    !IFNET_IS_CELLULAR(sotoinpcb(so)->inp_last_outifp)) {
			return 0;
		}

		/*
		 * This here is an OR, because if the app is not binding to the
		 * interface, then it definitely is not a cell-fallback
		 * connection.
		 */
		if (mpts->mpts_ifscope == IFSCOPE_NONE || ifp == NULL ||
		    !IFNET_IS_CELLULAR(ifp)) {
			return 0;
		}
	}

	/* The option is being applied now; it is no longer pending. */
	mpo->mpo_flags &= ~MPOF_INTERIM;

	bzero(&sopt, sizeof(sopt));
	sopt.sopt_dir = SOPT_SET;
	sopt.sopt_level = mpo->mpo_level;
	sopt.sopt_name = mpo->mpo_name;
	sopt.sopt_val = CAST_USER_ADDR_T(&mpo->mpo_intval);
	sopt.sopt_valsize = sizeof(int);
	sopt.sopt_p = kernproc;

	error = sosetoptlock(so, &sopt, 0);
	if (error) {
		os_log_error(mptcp_log_handle, "%s - %lx: sopt %s "
		    "val %d set error %d\n", __func__,
		    (unsigned long)VM_KERNEL_ADDRPERM(mpte),
		    mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name),
		    mpo->mpo_intval, error);
	}
	return error;
}
4521 
4522 /*
4523  * Issues SOPT_GET on an MPTCP subflow socket; socket must already be locked,
4524  * caller must ensure that the option can be issued on subflow sockets, via
4525  * MPOF_SUBFLOW_OK flag.
4526  */
4527 int
mptcp_subflow_sogetopt(struct mptses * mpte,struct socket * so,struct mptopt * mpo)4528 mptcp_subflow_sogetopt(struct mptses *mpte, struct socket *so,
4529     struct mptopt *mpo)
4530 {
4531 	struct socket *mp_so;
4532 	struct sockopt sopt;
4533 	int error;
4534 
4535 	VERIFY(mpo->mpo_flags & MPOF_SUBFLOW_OK);
4536 	mp_so = mptetoso(mpte);
4537 
4538 	socket_lock_assert_owned(mp_so);
4539 
4540 	bzero(&sopt, sizeof(sopt));
4541 	sopt.sopt_dir = SOPT_GET;
4542 	sopt.sopt_level = mpo->mpo_level;
4543 	sopt.sopt_name = mpo->mpo_name;
4544 	sopt.sopt_val = CAST_USER_ADDR_T(&mpo->mpo_intval);
4545 	sopt.sopt_valsize = sizeof(int);
4546 	sopt.sopt_p = kernproc;
4547 
4548 	error = sogetoptlock(so, &sopt, 0);     /* already locked */
4549 	if (error) {
4550 		os_log_error(mptcp_log_handle,
4551 		    "%s - %lx: sopt %s get error %d\n",
4552 		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
4553 		    mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name), error);
4554 	}
4555 	return error;
4556 }
4557 
4558 
/*
 * MPTCP garbage collector.
 *
 * This routine is called by the MP domain on-demand, periodic callout,
 * which is triggered when a MPTCP socket is closed.  The callout will
 * repeat as long as this routine returns a non-zero value.
 */
static uint32_t
mptcp_gc(struct mppcbinfo *mppi)
{
	struct mppcb *mpp, *tmpp;
	uint32_t active = 0;	/* PCBs still alive; non-zero re-arms the callout */

	LCK_MTX_ASSERT(&mppi->mppi_lock, LCK_MTX_ASSERT_OWNED);

	TAILQ_FOREACH_SAFE(mpp, &mppi->mppi_pcbs, mpp_entry, tmpp) {
		struct socket *mp_so;
		struct mptses *mpte;
		struct mptcb *mp_tp;

		mp_so = mpp->mpp_socket;
		mpte = mptompte(mpp);
		mp_tp = mpte->mpte_mptcb;

		/* Busy PCB: revisit on the next callout run. */
		if (!mpp_try_lock(mpp)) {
			active++;
			continue;
		}

		VERIFY(mpp->mpp_flags & MPP_ATTACHED);

		/* check again under the lock */
		if (mp_so->so_usecount > 0) {
			boolean_t wakeup = FALSE;
			struct mptsub *mpts, *tmpts;

			/* Count down the grace period for closing connections. */
			if (mp_tp->mpt_state >= MPTCPS_FIN_WAIT_1) {
				if (mp_tp->mpt_gc_ticks > 0) {
					mp_tp->mpt_gc_ticks--;
				}
				if (mp_tp->mpt_gc_ticks == 0) {
					wakeup = TRUE;
				}
			}
			/* Grace period expired: nudge subflows to disconnect. */
			if (wakeup) {
				TAILQ_FOREACH_SAFE(mpts,
				    &mpte->mpte_subflows, mpts_entry, tmpts) {
					mptcp_subflow_eupcall1(mpts->mpts_socket,
					    mpts, SO_FILT_HINT_DISCONNECTED);
				}
			}
			socket_unlock(mp_so, 0);
			active++;
			continue;
		}

		if (mpp->mpp_state != MPPCB_STATE_DEAD) {
			panic("%s - %lx: skipped state "
			    "[u=%d,r=%d,s=%d]\n", __func__,
			    (unsigned long)VM_KERNEL_ADDRPERM(mpte),
			    mp_so->so_usecount, mp_so->so_retaincnt,
			    mpp->mpp_state);
		}

		if (mp_tp->mpt_state == MPTCPS_TIME_WAIT) {
			mptcp_close(mpte, mp_tp);
		}

		/* No references left: free the session and its socket. */
		mptcp_session_destroy(mpte);

		DTRACE_MPTCP4(dispose, struct socket *, mp_so,
		    struct sockbuf *, &mp_so->so_rcv,
		    struct sockbuf *, &mp_so->so_snd,
		    struct mppcb *, mpp);

		mptcp_pcbdispose(mpp);
		sodealloc(mp_so);
	}

	return active;
}
4640 
4641 /*
4642  * Drop a MPTCP connection, reporting the specified error.
4643  */
4644 struct mptses *
mptcp_drop(struct mptses * mpte,struct mptcb * mp_tp,u_short errno)4645 mptcp_drop(struct mptses *mpte, struct mptcb *mp_tp, u_short errno)
4646 {
4647 	struct socket *mp_so = mptetoso(mpte);
4648 
4649 	VERIFY(mpte->mpte_mptcb == mp_tp);
4650 
4651 	socket_lock_assert_owned(mp_so);
4652 
4653 	DTRACE_MPTCP2(state__change, struct mptcb *, mp_tp,
4654 	    uint32_t, 0 /* event */);
4655 
4656 	if (errno == ETIMEDOUT && mp_tp->mpt_softerror != 0) {
4657 		errno = mp_tp->mpt_softerror;
4658 	}
4659 	mp_so->so_error = errno;
4660 
4661 	return mptcp_close(mpte, mp_tp);
4662 }
4663 
4664 /*
4665  * Close a MPTCP control block.
4666  */
4667 struct mptses *
mptcp_close(struct mptses * mpte,struct mptcb * mp_tp)4668 mptcp_close(struct mptses *mpte, struct mptcb *mp_tp)
4669 {
4670 	struct mptsub *mpts = NULL, *tmpts = NULL;
4671 	struct socket *mp_so = mptetoso(mpte);
4672 
4673 	socket_lock_assert_owned(mp_so);
4674 	VERIFY(mpte->mpte_mptcb == mp_tp);
4675 
4676 	mp_tp->mpt_state = MPTCPS_TERMINATE;
4677 
4678 	mptcp_freeq(mp_tp);
4679 
4680 	soisdisconnected(mp_so);
4681 
4682 	/* Clean up all subflows */
4683 	TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
4684 		mptcp_subflow_disconnect(mpte, mpts);
4685 	}
4686 
4687 	return NULL;
4688 }
4689 
4690 void
mptcp_notify_close(struct socket * so)4691 mptcp_notify_close(struct socket *so)
4692 {
4693 	soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_DISCONNECTED));
4694 }
4695 
/*
 * One entry of the subflow event dispatch table: maps a socket-filter
 * hint bit to the handler that processes it for a subflow.
 */
typedef struct mptcp_subflow_event_entry {
	uint32_t    sofilt_hint_mask;	/* SO_FILT_HINT_* bit handled by this entry */
	ev_ret_t    (*sofilt_hint_ev_hdlr)(	/* handler for that event */
		struct mptses *mpte,
		struct mptsub *mpts,
		uint32_t *p_mpsofilt_hint,
		uint32_t event);
} mptsub_ev_entry_t;
4704 
/*
 * XXX The order of the event handlers below is really
 * really important. Think twice before changing it.
 *
 * mptcp_subflow_events() walks this table front to back, so entries
 * earlier in the table are processed before later ones when multiple
 * hints are pending on the same subflow.
 */
static mptsub_ev_entry_t mpsub_ev_entry_tbl[] = {
	{
		.sofilt_hint_mask = SO_FILT_HINT_MP_SUB_ERROR,
		.sofilt_hint_ev_hdlr = mptcp_subflow_mpsuberror_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_MPCANTRCVMORE,
		.sofilt_hint_ev_hdlr =  mptcp_subflow_mpcantrcvmore_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_MPFAILOVER,
		.sofilt_hint_ev_hdlr = mptcp_subflow_failover_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_CONNRESET,
		.sofilt_hint_ev_hdlr = mptcp_subflow_propagate_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_MUSTRST,
		.sofilt_hint_ev_hdlr = mptcp_subflow_mustrst_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_CANTRCVMORE,
		.sofilt_hint_ev_hdlr = mptcp_subflow_propagate_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_TIMEOUT,
		.sofilt_hint_ev_hdlr = mptcp_subflow_propagate_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_NOSRCADDR,
		.sofilt_hint_ev_hdlr = mptcp_subflow_nosrcaddr_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_IFDENIED,
		.sofilt_hint_ev_hdlr = mptcp_subflow_ifdenied_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_CONNECTED,
		.sofilt_hint_ev_hdlr = mptcp_subflow_connected_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_MPSTATUS,
		.sofilt_hint_ev_hdlr = mptcp_subflow_mpstatus_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_DISCONNECTED,
		.sofilt_hint_ev_hdlr = mptcp_subflow_disconnected_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_ADAPTIVE_RTIMO,
		.sofilt_hint_ev_hdlr = mptcp_subflow_adaptive_rtimo_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_ADAPTIVE_WTIMO,
		.sofilt_hint_ev_hdlr = mptcp_subflow_adaptive_wtimo_ev,
	},
};
4767 
/*
 * Subflow socket control events.
 *
 * Called for handling events related to the underlying subflow socket.
 *
 * Walks mpsub_ev_entry_tbl in order, clearing each pending hint from
 * mpts_evctl before invoking its handler.  Returns the "strongest"
 * handler verdict; a negative verdict short-circuits further handlers
 * except for DISCONNECTED, which always runs so the app gets woken up.
 */
static ev_ret_t
mptcp_subflow_events(struct mptses *mpte, struct mptsub *mpts,
    uint32_t *p_mpsofilt_hint)
{
	ev_ret_t ret = MPTS_EVRET_OK;
	int i, mpsub_ev_entry_count = sizeof(mpsub_ev_entry_tbl) /
	    sizeof(mpsub_ev_entry_tbl[0]);

	/* bail if there's nothing to process */
	if (!mpts->mpts_evctl) {
		return ret;
	}

	/* Any event that can kill the subflow also triggers failover. */
	if (mpts->mpts_evctl & (SO_FILT_HINT_CONNRESET | SO_FILT_HINT_MUSTRST |
	    SO_FILT_HINT_CANTSENDMORE | SO_FILT_HINT_TIMEOUT |
	    SO_FILT_HINT_NOSRCADDR | SO_FILT_HINT_IFDENIED |
	    SO_FILT_HINT_DISCONNECTED)) {
		mpts->mpts_evctl |= SO_FILT_HINT_MPFAILOVER;
	}

	DTRACE_MPTCP3(subflow__events, struct mptses *, mpte,
	    struct mptsub *, mpts, uint32_t, mpts->mpts_evctl);

	/*
	 * Process all the socket filter hints and reset the hint
	 * once it is handled
	 */
	for (i = 0; i < mpsub_ev_entry_count && mpts->mpts_evctl; i++) {
		/*
		 * Always execute the DISCONNECTED event, because it will wakeup
		 * the app.
		 */
		if ((mpts->mpts_evctl & mpsub_ev_entry_tbl[i].sofilt_hint_mask) &&
		    (ret >= MPTS_EVRET_OK ||
		    mpsub_ev_entry_tbl[i].sofilt_hint_mask == SO_FILT_HINT_DISCONNECTED)) {
			mpts->mpts_evctl &= ~mpsub_ev_entry_tbl[i].sofilt_hint_mask;
			ev_ret_t error =
			    mpsub_ev_entry_tbl[i].sofilt_hint_ev_hdlr(mpte, mpts, p_mpsofilt_hint, mpsub_ev_entry_tbl[i].sofilt_hint_mask);
			/* Fold verdicts: keep the max of non-error results, or the error. */
			ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
		}
	}

	return ret;
}
4817 
/*
 * MPTCP workloop.
 *
 * Processes pending events on every subflow of the session, then acts
 * on the aggregate outcome: propagate hints to the MPTCP socket, tear
 * down subflows on a fallback to TCP, or kick off pending joins.
 * Re-entrant calls are coalesced via MPTE_IN_WORKLOOP/WORKLOOP_RELAUNCH.
 */
void
mptcp_subflow_workloop(struct mptses *mpte)
{
	boolean_t connect_pending = FALSE, disconnect_fallback = FALSE;
	uint32_t mpsofilt_hint_mask = SO_FILT_HINT_LOCKED;
	struct mptsub *mpts, *tmpts;
	struct socket *mp_so;

	mp_so = mptetoso(mpte);

	socket_lock_assert_owned(mp_so);

	/* Already running: ask the active invocation to go around again. */
	if (mpte->mpte_flags & MPTE_IN_WORKLOOP) {
		mpte->mpte_flags |= MPTE_WORKLOOP_RELAUNCH;
		return;
	}
	mpte->mpte_flags |= MPTE_IN_WORKLOOP;

relaunch:
	mpte->mpte_flags &= ~MPTE_WORKLOOP_RELAUNCH;

	TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
		ev_ret_t ret;

		if (mpts->mpts_socket->so_usecount == 0) {
			/* Will be removed soon by tcp_garbage_collect */
			continue;
		}

		/* Hold the subflow and its socket across event processing. */
		mptcp_subflow_addref(mpts);
		mpts->mpts_socket->so_usecount++;

		ret = mptcp_subflow_events(mpte, mpts, &mpsofilt_hint_mask);

		/*
		 * If MPTCP socket is closed, disconnect all subflows.
		 * This will generate a disconnect event which will
		 * be handled during the next iteration, causing a
		 * non-zero error to be returned above.
		 */
		if (mp_so->so_flags & SOF_PCBCLEARING) {
			mptcp_subflow_disconnect(mpte, mpts);
		}

		switch (ret) {
		case MPTS_EVRET_OK:
			/* nothing to do */
			break;
		case MPTS_EVRET_DELETE:
			mptcp_subflow_soclose(mpts);
			break;
		case MPTS_EVRET_CONNECT_PENDING:
			connect_pending = TRUE;
			break;
		case MPTS_EVRET_DISCONNECT_FALLBACK:
			disconnect_fallback = TRUE;
			break;
		default:
			break;
		}
		mptcp_subflow_remref(mpts);             /* ours */

		VERIFY(mpts->mpts_socket->so_usecount != 0);
		mpts->mpts_socket->so_usecount--;
	}

	/* Forward accumulated hints to the MPTCP-level socket. */
	if (mpsofilt_hint_mask != SO_FILT_HINT_LOCKED) {
		VERIFY(mpsofilt_hint_mask & SO_FILT_HINT_LOCKED);

		if (mpsofilt_hint_mask & SO_FILT_HINT_CANTRCVMORE) {
			mp_so->so_state |= SS_CANTRCVMORE;
			sorwakeup(mp_so);
		}

		soevent(mp_so, mpsofilt_hint_mask);
	}

	if (!connect_pending && !disconnect_fallback) {
		goto exit;
	}

	/* Second pass: apply fallback teardown or pending joins. */
	TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
		if (disconnect_fallback) {
			struct socket *so = NULL;
			struct inpcb *inp = NULL;
			struct tcpcb *tp = NULL;

			if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
				continue;
			}

			mpts->mpts_flags |= MPTSF_MP_DEGRADED;

			if (mpts->mpts_flags & (MPTSF_DISCONNECTING |
			    MPTSF_DISCONNECTED)) {
				continue;
			}

			so = mpts->mpts_socket;

			/*
			 * The MPTCP connection has degraded to a fallback
			 * mode, so there is no point in keeping this subflow
			 * regardless of its MPTCP-readiness state, unless it
			 * is the primary one which we use for fallback.  This
			 * assumes that the subflow used for fallback is the
			 * ACTIVE one.
			 */

			inp = sotoinpcb(so);
			tp = intotcpcb(inp);
			tp->t_mpflags &=
			    ~(TMPF_MPTCP_READY | TMPF_MPTCP_TRUE);
			tp->t_mpflags |= TMPF_TCP_FALLBACK;

			soevent(so, SO_FILT_HINT_MUSTRST);
		} else if (connect_pending) {
			/*
			 * The MPTCP connection has progressed to a state
			 * where it supports full multipath semantics; allow
			 * additional joins to be attempted for all subflows
			 * that are in the PENDING state.
			 */
			if (mpts->mpts_flags & MPTSF_CONNECT_PENDING) {
				int error = mptcp_subflow_soconnectx(mpte, mpts);

				if (error) {
					mptcp_subflow_abort(mpts, error);
				}
			}
		}
	}

exit:
	/* Another caller asked for a relaunch while we were busy. */
	if (mpte->mpte_flags & MPTE_WORKLOOP_RELAUNCH) {
		goto relaunch;
	}

	mpte->mpte_flags &= ~MPTE_IN_WORKLOOP;
}
4961 
4962 /*
4963  * Protocol pr_lock callback.
4964  */
4965 int
mptcp_lock(struct socket * mp_so,int refcount,void * lr)4966 mptcp_lock(struct socket *mp_so, int refcount, void *lr)
4967 {
4968 	struct mppcb *mpp = mpsotomppcb(mp_so);
4969 	void *lr_saved;
4970 
4971 	if (lr == NULL) {
4972 		lr_saved = __builtin_return_address(0);
4973 	} else {
4974 		lr_saved = lr;
4975 	}
4976 
4977 	if (mpp == NULL) {
4978 		panic("%s: so=%p NO PCB! lr=%p lrh= %s", __func__,
4979 		    mp_so, lr_saved, solockhistory_nr(mp_so));
4980 		/* NOTREACHED */
4981 	}
4982 	mpp_lock(mpp);
4983 
4984 	if (mp_so->so_usecount < 0) {
4985 		panic("%s: so=%p so_pcb=%p lr=%p ref=%x lrh= %s", __func__,
4986 		    mp_so, mp_so->so_pcb, lr_saved, mp_so->so_usecount,
4987 		    solockhistory_nr(mp_so));
4988 		/* NOTREACHED */
4989 	}
4990 	if (refcount != 0) {
4991 		mp_so->so_usecount++;
4992 		mpp->mpp_inside++;
4993 	}
4994 	mp_so->lock_lr[mp_so->next_lock_lr] = lr_saved;
4995 	mp_so->next_lock_lr = (mp_so->next_lock_lr + 1) % SO_LCKDBG_MAX;
4996 
4997 	return 0;
4998 }
4999 
5000 /*
5001  * Protocol pr_unlock callback.
5002  */
5003 int
mptcp_unlock(struct socket * mp_so,int refcount,void * lr)5004 mptcp_unlock(struct socket *mp_so, int refcount, void *lr)
5005 {
5006 	struct mppcb *mpp = mpsotomppcb(mp_so);
5007 	void *lr_saved;
5008 
5009 	if (lr == NULL) {
5010 		lr_saved = __builtin_return_address(0);
5011 	} else {
5012 		lr_saved = lr;
5013 	}
5014 
5015 	if (mpp == NULL) {
5016 		panic("%s: so=%p NO PCB usecount=%x lr=%p lrh= %s", __func__,
5017 		    mp_so, mp_so->so_usecount, lr_saved,
5018 		    solockhistory_nr(mp_so));
5019 		/* NOTREACHED */
5020 	}
5021 	socket_lock_assert_owned(mp_so);
5022 
5023 	if (refcount != 0) {
5024 		mp_so->so_usecount--;
5025 		mpp->mpp_inside--;
5026 	}
5027 
5028 	if (mp_so->so_usecount < 0) {
5029 		panic("%s: so=%p usecount=%x lrh= %s", __func__,
5030 		    mp_so, mp_so->so_usecount, solockhistory_nr(mp_so));
5031 		/* NOTREACHED */
5032 	}
5033 	if (mpp->mpp_inside < 0) {
5034 		panic("%s: mpp=%p inside=%x lrh= %s", __func__,
5035 		    mpp, mpp->mpp_inside, solockhistory_nr(mp_so));
5036 		/* NOTREACHED */
5037 	}
5038 	mp_so->unlock_lr[mp_so->next_unlock_lr] = lr_saved;
5039 	mp_so->next_unlock_lr = (mp_so->next_unlock_lr + 1) % SO_LCKDBG_MAX;
5040 	mpp_unlock(mpp);
5041 
5042 	return 0;
5043 }
5044 
5045 /*
5046  * Protocol pr_getlock callback.
5047  */
5048 lck_mtx_t *
mptcp_getlock(struct socket * mp_so,int flags)5049 mptcp_getlock(struct socket *mp_so, int flags)
5050 {
5051 	struct mppcb *mpp = mpsotomppcb(mp_so);
5052 
5053 	if (mpp == NULL) {
5054 		panic("%s: so=%p NULL so_pcb %s", __func__, mp_so,
5055 		    solockhistory_nr(mp_so));
5056 		/* NOTREACHED */
5057 	}
5058 	if (mp_so->so_usecount < 0) {
5059 		panic("%s: so=%p usecount=%x lrh= %s", __func__,
5060 		    mp_so, mp_so->so_usecount, solockhistory_nr(mp_so));
5061 		/* NOTREACHED */
5062 	}
5063 	return mpp_getlock(mpp, flags);
5064 }
5065 
5066 void
mptcp_get_rands(mptcp_addr_id addr_id,struct mptcb * mp_tp,u_int32_t * lrand,u_int32_t * rrand)5067 mptcp_get_rands(mptcp_addr_id addr_id, struct mptcb *mp_tp, u_int32_t *lrand,
5068     u_int32_t *rrand)
5069 {
5070 	struct mptcp_subf_auth_entry *sauth_entry;
5071 
5072 	LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
5073 		if (sauth_entry->msae_laddr_id == addr_id) {
5074 			if (lrand) {
5075 				*lrand = sauth_entry->msae_laddr_rand;
5076 			}
5077 			if (rrand) {
5078 				*rrand = sauth_entry->msae_raddr_rand;
5079 			}
5080 			break;
5081 		}
5082 	}
5083 }
5084 
5085 void
mptcp_set_raddr_rand(mptcp_addr_id laddr_id,struct mptcb * mp_tp,mptcp_addr_id raddr_id,u_int32_t raddr_rand)5086 mptcp_set_raddr_rand(mptcp_addr_id laddr_id, struct mptcb *mp_tp,
5087     mptcp_addr_id raddr_id, u_int32_t raddr_rand)
5088 {
5089 	struct mptcp_subf_auth_entry *sauth_entry;
5090 
5091 	LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
5092 		if (sauth_entry->msae_laddr_id == laddr_id) {
5093 			if ((sauth_entry->msae_raddr_id != 0) &&
5094 			    (sauth_entry->msae_raddr_id != raddr_id)) {
5095 				os_log_error(mptcp_log_handle, "%s - %lx: mismatched"
5096 				    " address ids %d %d \n", __func__, (unsigned long)VM_KERNEL_ADDRPERM(mp_tp->mpt_mpte),
5097 				    raddr_id, sauth_entry->msae_raddr_id);
5098 				return;
5099 			}
5100 			sauth_entry->msae_raddr_id = raddr_id;
5101 			if ((sauth_entry->msae_raddr_rand != 0) &&
5102 			    (sauth_entry->msae_raddr_rand != raddr_rand)) {
5103 				os_log_error(mptcp_log_handle, "%s - %lx: "
5104 				    "dup SYN_ACK %d %d \n",
5105 				    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mp_tp->mpt_mpte),
5106 				    raddr_rand, sauth_entry->msae_raddr_rand);
5107 				return;
5108 			}
5109 			sauth_entry->msae_raddr_rand = raddr_rand;
5110 			return;
5111 		}
5112 	}
5113 }
5114 
5115 /*
5116  * SHA-256 support for MPTCP
5117  */
5118 
5119 static void
mptcp_do_sha256(mptcp_key_t * key,char * sha_digest)5120 mptcp_do_sha256(mptcp_key_t *key, char *sha_digest)
5121 {
5122 	const unsigned char *sha2_base;
5123 	int sha2_size;
5124 
5125 	sha2_base = (const unsigned char *) key;
5126 	sha2_size = sizeof(mptcp_key_t);
5127 
5128 	SHA256_CTX sha_ctx;
5129 	SHA256_Init(&sha_ctx);
5130 	SHA256_Update(&sha_ctx, sha2_base, sha2_size);
5131 	SHA256_Final(sha_digest, &sha_ctx);
5132 }
5133 
/*
 * HMAC-SHA256 over 'msg' keyed with the concatenation of key1 and key2
 * (MPTCPv1, RFC 8684).  'digest' must hold SHA256_DIGEST_LENGTH bytes;
 * it is used both for the inner hash output and the final result.
 */
void
mptcp_hmac_sha256(mptcp_key_t key1, mptcp_key_t key2,
    u_char *msg, uint16_t msg_len, u_char *digest)
{
	SHA256_CTX sha_ctx;
	mptcp_key_t key_ipad[8] = {0}; /* key XOR'd with inner pad */
	mptcp_key_t key_opad[8] = {0}; /* key XOR'd with outer pad */
	int i;

	bzero(digest, SHA256_DIGEST_LENGTH);

	/* Set up the Key for HMAC; remaining words stay zero (zero-padded key) */
	key_ipad[0] = key1;
	key_ipad[1] = key2;

	key_opad[0] = key1;
	key_opad[1] = key2;

	/* Key is 512 block length, so no need to compute hash */

	/* Compute SHA256(Key XOR opad, SHA256(Key XOR ipad, data)) */

	for (i = 0; i < 8; i++) {
		key_ipad[i] ^= 0x3636363636363636;
		key_opad[i] ^= 0x5c5c5c5c5c5c5c5c;
	}

	/* Perform inner SHA256 */
	SHA256_Init(&sha_ctx);
	SHA256_Update(&sha_ctx, (unsigned char *)key_ipad, sizeof(key_ipad));
	SHA256_Update(&sha_ctx, msg, msg_len);
	SHA256_Final(digest, &sha_ctx);

	/* Perform outer SHA256 over the inner digest */
	SHA256_Init(&sha_ctx);
	SHA256_Update(&sha_ctx, (unsigned char *)key_opad, sizeof(key_opad));
	SHA256_Update(&sha_ctx, (unsigned char *)digest, SHA256_DIGEST_LENGTH);
	SHA256_Final(digest, &sha_ctx);
}
5173 
5174 /*
5175  * SHA1 support for MPTCP
5176  */
5177 
5178 static void
mptcp_do_sha1(mptcp_key_t * key,char * sha_digest)5179 mptcp_do_sha1(mptcp_key_t *key, char *sha_digest)
5180 {
5181 	SHA1_CTX sha1ctxt;
5182 	const unsigned char *sha1_base;
5183 	int sha1_size;
5184 
5185 	sha1_base = (const unsigned char *) key;
5186 	sha1_size = sizeof(mptcp_key_t);
5187 	SHA1Init(&sha1ctxt);
5188 	SHA1Update(&sha1ctxt, sha1_base, sha1_size);
5189 	SHA1Final(sha_digest, &sha1ctxt);
5190 }
5191 
/*
 * HMAC-SHA1 over the two 32-bit random values, keyed with the
 * concatenation of key1 and key2 (MPTCPv0, RFC 6824).  'digest' must
 * hold SHA1_RESULTLEN bytes; it is used both for the inner hash output
 * and the final result.
 */
void
mptcp_hmac_sha1(mptcp_key_t key1, mptcp_key_t key2,
    u_int32_t rand1, u_int32_t rand2, u_char *digest)
{
	SHA1_CTX  sha1ctxt;
	mptcp_key_t key_ipad[8] = {0}; /* key XOR'd with inner pad */
	mptcp_key_t key_opad[8] = {0}; /* key XOR'd with outer pad */
	u_int32_t data[2];
	int i;

	bzero(digest, SHA1_RESULTLEN);

	/* Set up the Key for HMAC; remaining words stay zero (zero-padded key) */
	key_ipad[0] = key1;
	key_ipad[1] = key2;

	key_opad[0] = key1;
	key_opad[1] = key2;

	/* Set up the message for HMAC */
	data[0] = rand1;
	data[1] = rand2;

	/* Key is 512 block length, so no need to compute hash */

	/* Compute SHA1(Key XOR opad, SHA1(Key XOR ipad, data)) */

	for (i = 0; i < 8; i++) {
		key_ipad[i] ^= 0x3636363636363636;
		key_opad[i] ^= 0x5c5c5c5c5c5c5c5c;
	}

	/* Perform inner SHA1 */
	SHA1Init(&sha1ctxt);
	SHA1Update(&sha1ctxt, (unsigned char *)key_ipad, sizeof(key_ipad));
	SHA1Update(&sha1ctxt, (unsigned char *)data, sizeof(data));
	SHA1Final(digest, &sha1ctxt);

	/* Perform outer SHA1 over the inner digest */
	SHA1Init(&sha1ctxt);
	SHA1Update(&sha1ctxt, (unsigned char *)key_opad, sizeof(key_opad));
	SHA1Update(&sha1ctxt, (unsigned char *)digest, SHA1_RESULTLEN);
	SHA1Final(digest, &sha1ctxt);
}
5236 
5237 /*
5238  * corresponds to MAC-B = MAC (Key=(Key-B+Key-A), Msg=(R-B+R-A))
5239  * corresponds to MAC-A = MAC (Key=(Key-A+Key-B), Msg=(R-A+R-B))
5240  */
5241 void
mptcp_get_mpjoin_hmac(mptcp_addr_id aid,struct mptcb * mp_tp,u_char * digest,uint8_t digest_len)5242 mptcp_get_mpjoin_hmac(mptcp_addr_id aid, struct mptcb *mp_tp, u_char *digest, uint8_t digest_len)
5243 {
5244 	uint32_t lrand, rrand;
5245 
5246 	lrand = rrand = 0;
5247 	mptcp_get_rands(aid, mp_tp, &lrand, &rrand);
5248 
5249 	u_char full_digest[MAX(SHA1_RESULTLEN, SHA256_DIGEST_LENGTH)] = {0};
5250 	if (mp_tp->mpt_version == MPTCP_VERSION_0) {
5251 		mptcp_hmac_sha1(mp_tp->mpt_localkey, mp_tp->mpt_remotekey, lrand, rrand, full_digest);
5252 	} else {
5253 		uint32_t data[2];
5254 		data[0] = lrand;
5255 		data[1] = rrand;
5256 		mptcp_hmac_sha256(mp_tp->mpt_localkey, mp_tp->mpt_remotekey, (u_char*)data, 8, full_digest);
5257 	}
5258 	bcopy(full_digest, digest, digest_len);
5259 }
5260 
5261 /*
5262  * Authentication data generation
5263  */
5264 static void
mptcp_generate_token(char * sha_digest,int sha_digest_len,caddr_t token,int token_len)5265 mptcp_generate_token(char *sha_digest, int sha_digest_len, caddr_t token,
5266     int token_len)
5267 {
5268 	VERIFY(token_len == sizeof(u_int32_t));
5269 	VERIFY(sha_digest_len == SHA1_RESULTLEN ||
5270 	    sha_digest_len == SHA256_DIGEST_LENGTH);
5271 
5272 	/* Most significant 32 bits of the SHA1/SHA256 hash */
5273 	bcopy(sha_digest, token, sizeof(u_int32_t));
5274 	return;
5275 }
5276 
5277 static void
mptcp_generate_idsn(char * sha_digest,int sha_digest_len,caddr_t idsn,int idsn_len,uint8_t mp_version)5278 mptcp_generate_idsn(char *sha_digest, int sha_digest_len, caddr_t idsn,
5279     int idsn_len, uint8_t mp_version)
5280 {
5281 	VERIFY(idsn_len == sizeof(u_int64_t));
5282 	VERIFY(sha_digest_len == SHA1_RESULTLEN ||
5283 	    sha_digest_len == SHA256_DIGEST_LENGTH);
5284 	VERIFY(mp_version == MPTCP_VERSION_0 || mp_version == MPTCP_VERSION_1);
5285 
5286 	/*
5287 	 * Least significant 64 bits of the hash
5288 	 */
5289 
5290 	if (mp_version == MPTCP_VERSION_0) {
5291 		idsn[7] = sha_digest[12];
5292 		idsn[6] = sha_digest[13];
5293 		idsn[5] = sha_digest[14];
5294 		idsn[4] = sha_digest[15];
5295 		idsn[3] = sha_digest[16];
5296 		idsn[2] = sha_digest[17];
5297 		idsn[1] = sha_digest[18];
5298 		idsn[0] = sha_digest[19];
5299 	} else {
5300 		idsn[7] = sha_digest[24];
5301 		idsn[6] = sha_digest[25];
5302 		idsn[5] = sha_digest[26];
5303 		idsn[4] = sha_digest[27];
5304 		idsn[3] = sha_digest[28];
5305 		idsn[2] = sha_digest[29];
5306 		idsn[1] = sha_digest[30];
5307 		idsn[0] = sha_digest[31];
5308 	}
5309 	return;
5310 }
5311 
/*
 * Initialize per-connection MPTCP properties: DSS checksum usage,
 * the initial receive window, and the garbage-collection timeout.
 */
static void
mptcp_conn_properties(struct mptcb *mp_tp)
{
	/* Set DSS checksum flag (global sysctl-controlled policy) */
	if (mptcp_dss_csum) {
		mp_tp->mpt_flags |= MPTCPF_CHECKSUM;
	}

	/* Set up receive window from the current MPTCP socket-buffer space */
	mp_tp->mpt_rcvwnd = mptcp_sbspace(mp_tp);

	/* Set up gc ticks */
	mp_tp->mpt_gc_ticks = MPT_GC_TICKS;
}
5326 
/*
 * Initialize the local side of an MPTCP connection: pick the protocol
 * version, generate the local key, and derive the local token and
 * initial data sequence number from its hash.  Must run before the
 * first subflow SYN, since the IDSN anchors the send sequence space.
 */
static void
mptcp_init_local_parms(struct mptses *mpte, struct sockaddr* dst)
{
	struct mptcb *mp_tp = mpte->mpte_mptcb;
	char key_digest[MAX(SHA1_RESULTLEN, SHA256_DIGEST_LENGTH)];
	uint16_t digest_len;

	/* Version: explicit overrides first, else the per-destination cache. */
	if (mpte->mpte_flags & MPTE_FORCE_V0 || !mptcp_enable_v1) {
		mp_tp->mpt_version = MPTCP_VERSION_0;
	} else if (mpte->mpte_flags & MPTE_FORCE_V1 && mptcp_enable_v1) {
		mp_tp->mpt_version = MPTCP_VERSION_1;
	} else {
		mp_tp->mpt_version = tcp_cache_get_mptcp_version(dst);
	}
	VERIFY(mp_tp->mpt_version == MPTCP_VERSION_0 ||
	    mp_tp->mpt_version == MPTCP_VERSION_1);

	/* Random 64-bit local key; v0 derives via SHA-1, v1 via SHA-256. */
	read_frandom(&mp_tp->mpt_localkey, sizeof(mp_tp->mpt_localkey));
	if (mp_tp->mpt_version == MPTCP_VERSION_0) {
		digest_len = SHA1_RESULTLEN;
		mptcp_do_sha1(&mp_tp->mpt_localkey, key_digest);
	} else {
		digest_len = SHA256_DIGEST_LENGTH;
		mptcp_do_sha256(&mp_tp->mpt_localkey, key_digest);
	}

	mptcp_generate_token(key_digest, digest_len,
	    (caddr_t)&mp_tp->mpt_localtoken, sizeof(mp_tp->mpt_localtoken));
	mptcp_generate_idsn(key_digest, digest_len,
	    (caddr_t)&mp_tp->mpt_local_idsn, sizeof(u_int64_t), mp_tp->mpt_version);
	/* The subflow SYN is also first MPTCP byte */
	mp_tp->mpt_snduna = mp_tp->mpt_sndmax = mp_tp->mpt_local_idsn + 1;
	mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;

	mptcp_conn_properties(mp_tp);
}
5363 
5364 int
mptcp_init_remote_parms(struct mptcb * mp_tp)5365 mptcp_init_remote_parms(struct mptcb *mp_tp)
5366 {
5367 	/* Setup local and remote tokens and Initial DSNs */
5368 	char remote_digest[MAX(SHA1_RESULTLEN, SHA256_DIGEST_LENGTH)];
5369 	uint16_t digest_len;
5370 
5371 	if (mp_tp->mpt_version == MPTCP_VERSION_0) {
5372 		digest_len = SHA1_RESULTLEN;
5373 		mptcp_do_sha1(&mp_tp->mpt_remotekey, remote_digest);
5374 	} else if (mp_tp->mpt_version == MPTCP_VERSION_1) {
5375 		digest_len = SHA256_DIGEST_LENGTH;
5376 		mptcp_do_sha256(&mp_tp->mpt_remotekey, remote_digest);
5377 	} else {
5378 		return -1;
5379 	}
5380 
5381 	mptcp_generate_token(remote_digest, digest_len,
5382 	    (caddr_t)&mp_tp->mpt_remotetoken, sizeof(mp_tp->mpt_remotetoken));
5383 	mptcp_generate_idsn(remote_digest, digest_len,
5384 	    (caddr_t)&mp_tp->mpt_remote_idsn, sizeof(u_int64_t), mp_tp->mpt_version);
5385 	mp_tp->mpt_rcvnxt = mp_tp->mpt_remote_idsn + 1;
5386 	mp_tp->mpt_rcvadv = mp_tp->mpt_rcvnxt + mp_tp->mpt_rcvwnd;
5387 	return 0;
5388 }
5389 
5390 static void
mptcp_send_dfin(struct socket * so)5391 mptcp_send_dfin(struct socket *so)
5392 {
5393 	struct tcpcb *tp = NULL;
5394 	struct inpcb *inp = NULL;
5395 
5396 	inp = sotoinpcb(so);
5397 	if (!inp) {
5398 		return;
5399 	}
5400 
5401 	tp = intotcpcb(inp);
5402 	if (!tp) {
5403 		return;
5404 	}
5405 
5406 	if (!(tp->t_mpflags & TMPF_RESET)) {
5407 		tp->t_mpflags |= TMPF_SEND_DFIN;
5408 	}
5409 }
5410 
5411 /*
5412  * Data Sequence Mapping routines
5413  */
5414 void
mptcp_insert_dsn(struct mppcb * mpp,struct mbuf * m)5415 mptcp_insert_dsn(struct mppcb *mpp, struct mbuf *m)
5416 {
5417 	struct mptcb *mp_tp;
5418 
5419 	if (m == NULL) {
5420 		return;
5421 	}
5422 
5423 	mp_tp = &__container_of(mpp, struct mpp_mtp, mpp)->mtcb;
5424 
5425 	while (m) {
5426 		VERIFY(m->m_flags & M_PKTHDR);
5427 		m->m_pkthdr.pkt_flags |= (PKTF_MPTCP | PKTF_MPSO);
5428 		m->m_pkthdr.mp_dsn = mp_tp->mpt_sndmax;
5429 		VERIFY(m_pktlen(m) >= 0 && m_pktlen(m) < UINT16_MAX);
5430 		m->m_pkthdr.mp_rlen = (uint16_t)m_pktlen(m);
5431 		mp_tp->mpt_sndmax += m_pktlen(m);
5432 		m = m->m_next;
5433 	}
5434 }
5435 
/*
 * After falling back to plain TCP, infer the MPTCP-level DATA_ACK from
 * the 'len' bytes being dropped from the subflow send buffer and feed
 * it to mptcp_data_ack_rcvd() if it advances the MPTCP send window.
 */
void
mptcp_fallback_sbdrop(struct socket *so, struct mbuf *m, int len)
{
	struct mptcb *mp_tp = tptomptp(sototcpcb(so));
	uint64_t data_ack;
	uint64_t dsn;

	VERIFY(len >= 0);

	if (!m || len == 0) {
		return;
	}

	/*
	 * Walk the acked span; the guard above ensures at least one
	 * iteration, so data_ack/dsn are always initialized.  data_ack is
	 * optimistically set to the end of each traversed mapping.
	 */
	while (m && len > 0) {
		VERIFY(m->m_flags & M_PKTHDR);
		VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);

		data_ack = m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen;
		dsn = m->m_pkthdr.mp_dsn;

		len -= m->m_len;
		m = m->m_next;
	}

	if (m && len == 0) {
		/*
		 * If there is one more mbuf in the chain, it automatically means
		 * that up to m->mp_dsn has been ack'ed.
		 *
		 * This means, we actually correct data_ack back down (compared
		 * to what we set inside the loop - dsn + data_len). Because in
		 * the loop we are "optimistic" and assume that the full mapping
		 * will be acked. If that's not the case and we get out of the
		 * loop with m != NULL, it means only up to m->mp_dsn has been
		 * really acked.
		 */
		data_ack = m->m_pkthdr.mp_dsn;
	}

	if (len < 0) {
		/*
		 * If len is negative, meaning we acked in the middle of an mbuf,
		 * only up to this mbuf's data-sequence number has been acked
		 * at the MPTCP-level.
		 */
		data_ack = dsn;
	}

	/* We can have data in the subflow's send-queue that is being acked,
	 * while the DATA_ACK has already advanced. Thus, we should check whether
	 * or not the DATA_ACK is actually new here.
	 */
	if (MPTCP_SEQ_LEQ(data_ack, mp_tp->mpt_sndmax) &&
	    MPTCP_SEQ_GEQ(data_ack, mp_tp->mpt_snduna)) {
		mptcp_data_ack_rcvd(mp_tp, sototcpcb(so), data_ack);
	}
}
5493 
/*
 * Adjust the DSN mappings on the send-buffer mbuf chain before 'len'
 * bytes are dropped, so the remaining data keeps consistent mappings.
 * Only runs the adjustment for non-subflow sockets, or for subflows
 * performing a TFO rewind (where the mappings must not advance again).
 */
void
mptcp_preproc_sbdrop(struct socket *so, struct mbuf *m, unsigned int len)
{
	int rewinding = 0;

	/* TFO makes things complicated. */
	if (so->so_flags1 & SOF1_TFO_REWIND) {
		rewinding = 1;
		so->so_flags1 &= ~SOF1_TFO_REWIND;
	}

	while (m && (!(so->so_flags & SOF_MP_SUBFLOW) || rewinding)) {
		u_int32_t sub_len;
		VERIFY(m->m_flags & M_PKTHDR);
		VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);

		sub_len = m->m_pkthdr.mp_rlen;

		if (sub_len < len) {
			/* Whole mapping consumed; zero it and keep walking. */
			m->m_pkthdr.mp_dsn += sub_len;
			if (!(m->m_pkthdr.pkt_flags & PKTF_MPSO)) {
				m->m_pkthdr.mp_rseq += sub_len;
			}
			m->m_pkthdr.mp_rlen = 0;
			len -= sub_len;
		} else {
			/* sub_len >= len */
			/* When rewinding, sequence numbers must stay put. */
			if (rewinding == 0) {
				m->m_pkthdr.mp_dsn += len;
			}
			if (!(m->m_pkthdr.pkt_flags & PKTF_MPSO)) {
				if (rewinding == 0) {
					m->m_pkthdr.mp_rseq += len;
				}
			}
			m->m_pkthdr.mp_rlen -= len;
			break;
		}
		m = m->m_next;
	}

	if (so->so_flags & SOF_MP_SUBFLOW &&
	    !(sototcpcb(so)->t_mpflags & TMPF_TFO_REQUEST) &&
	    !(sototcpcb(so)->t_mpflags & TMPF_RCVD_DACK)) {
		/*
		 * Received an ack without receiving a DATA_ACK.
		 * Need to fallback to regular TCP (or destroy this subflow).
		 */
		sototcpcb(so)->t_mpflags |= TMPF_INFIN_SENT;
		mptcp_notify_mpfail(so);
	}
}
5546 
5547 /* Obtain the DSN mapping stored in the mbuf */
5548 void
mptcp_output_getm_dsnmap32(struct socket * so,int off,uint32_t * dsn,uint32_t * relseq,uint16_t * data_len,uint16_t * dss_csum)5549 mptcp_output_getm_dsnmap32(struct socket *so, int off,
5550     uint32_t *dsn, uint32_t *relseq, uint16_t *data_len, uint16_t *dss_csum)
5551 {
5552 	u_int64_t dsn64;
5553 
5554 	mptcp_output_getm_dsnmap64(so, off, &dsn64, relseq, data_len, dss_csum);
5555 	*dsn = (u_int32_t)MPTCP_DATASEQ_LOW32(dsn64);
5556 }
5557 
/*
 * Look up the DSN mapping covering byte offset 'off' in the subflow
 * send buffer and return its DSN, relative subflow sequence, mapping
 * length and DSS checksum.  A defunct socket with an empty buffer
 * yields an all-zero mapping.
 */
void
mptcp_output_getm_dsnmap64(struct socket *so, int off, uint64_t *dsn,
    uint32_t *relseq, uint16_t *data_len,
    uint16_t *dss_csum)
{
	struct mbuf *m = so->so_snd.sb_mb;

	VERIFY(off >= 0);

	if (m == NULL && (so->so_flags & SOF_DEFUNCT)) {
		*dsn = 0;
		*relseq = 0;
		*data_len = 0;
		*dss_csum = 0;
		return;
	}

	/*
	 * In the subflow socket, the DSN sequencing can be discontiguous,
	 * but the subflow sequence mapping is contiguous. Use the subflow
	 * sequence property to find the right mbuf and corresponding dsn
	 * mapping.
	 */

	while (m) {
		VERIFY(m->m_flags & M_PKTHDR);
		VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);

		if (off >= m->m_len) {
			off -= m->m_len;
			m = m->m_next;
		} else {
			break;
		}
	}

	/* 'off' must land within the chain; m is non-NULL here or we panic. */
	VERIFY(off >= 0);
	VERIFY(m->m_pkthdr.mp_rlen <= UINT16_MAX);

	*dsn = m->m_pkthdr.mp_dsn;
	*relseq = m->m_pkthdr.mp_rseq;
	*data_len = m->m_pkthdr.mp_rlen;
	*dss_csum = m->m_pkthdr.mp_csum;
}
5602 
5603 void
mptcp_output_getm_data_level_details(struct socket * so,int off,uint16_t * data_len,uint16_t * dss_csum)5604 mptcp_output_getm_data_level_details(struct socket *so, int off, uint16_t *data_len, uint16_t *dss_csum)
5605 {
5606 	uint64_t dsn;
5607 	uint32_t relseq;
5608 
5609 	mptcp_output_getm_dsnmap64(so, off, &dsn, &relseq, data_len, dss_csum);
5610 }
5611 
5612 /*
5613  * Note that this is called only from tcp_input() via mptcp_input_preproc()
5614  * tcp_input() may trim data after the dsn mapping is inserted into the mbuf.
5615  * When it trims data tcp_input calls m_adj() which does not remove the
5616  * m_pkthdr even if the m_len becomes 0 as a result of trimming the mbuf.
5617  * The dsn map insertion cannot be delayed after trim, because data can be in
5618  * the reassembly queue for a while and the DSN option info in tp will be
5619  * overwritten for every new packet received.
5620  * The dsn map will be adjusted just prior to appending to subflow sockbuf
5621  * with mptcp_adj_rmap()
5622  */
5623 void
mptcp_insert_rmap(struct tcpcb * tp,struct mbuf * m,struct tcphdr * th)5624 mptcp_insert_rmap(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th)
5625 {
5626 	VERIFY(m->m_flags & M_PKTHDR);
5627 	VERIFY(!(m->m_pkthdr.pkt_flags & PKTF_MPTCP));
5628 
5629 	if (tp->t_mpflags & TMPF_EMBED_DSN) {
5630 		m->m_pkthdr.mp_dsn = tp->t_rcv_map.mpt_dsn;
5631 		m->m_pkthdr.mp_rseq = tp->t_rcv_map.mpt_sseq;
5632 		m->m_pkthdr.mp_rlen = tp->t_rcv_map.mpt_len;
5633 		m->m_pkthdr.mp_csum = tp->t_rcv_map.mpt_csum;
5634 		if (tp->t_rcv_map.mpt_dfin) {
5635 			m->m_pkthdr.pkt_flags |= PKTF_MPTCP_DFIN;
5636 		}
5637 
5638 		m->m_pkthdr.pkt_flags |= PKTF_MPTCP;
5639 
5640 		tp->t_mpflags &= ~TMPF_EMBED_DSN;
5641 		tp->t_mpflags |= TMPF_MPTCP_ACKNOW;
5642 	} else if (tp->t_mpflags & TMPF_TCP_FALLBACK) {
5643 		if (th->th_flags & TH_FIN) {
5644 			m->m_pkthdr.pkt_flags |= PKTF_MPTCP_DFIN;
5645 		}
5646 	}
5647 }
5648 
5649 /*
5650  * Following routines help with failure detection and failover of data
5651  * transfer from one subflow to another.
5652  */
5653 void
mptcp_act_on_txfail(struct socket * so)5654 mptcp_act_on_txfail(struct socket *so)
5655 {
5656 	struct tcpcb *tp = NULL;
5657 	struct inpcb *inp = sotoinpcb(so);
5658 
5659 	if (inp == NULL) {
5660 		return;
5661 	}
5662 
5663 	tp = intotcpcb(inp);
5664 	if (tp == NULL) {
5665 		return;
5666 	}
5667 
5668 	if (so->so_flags & SOF_MP_TRYFAILOVER) {
5669 		return;
5670 	}
5671 
5672 	so->so_flags |= SOF_MP_TRYFAILOVER;
5673 	soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPFAILOVER));
5674 }
5675 
5676 /*
5677  * Support for MP_FAIL option
5678  */
5679 int
mptcp_get_map_for_dsn(struct socket * so,uint64_t dsn_fail,uint32_t * tcp_seq)5680 mptcp_get_map_for_dsn(struct socket *so, uint64_t dsn_fail, uint32_t *tcp_seq)
5681 {
5682 	struct mbuf *m = so->so_snd.sb_mb;
5683 	uint16_t datalen;
5684 	uint64_t dsn;
5685 	int off = 0;
5686 
5687 	if (m == NULL) {
5688 		return -1;
5689 	}
5690 
5691 	while (m != NULL) {
5692 		VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);
5693 		VERIFY(m->m_flags & M_PKTHDR);
5694 		dsn = m->m_pkthdr.mp_dsn;
5695 		datalen = m->m_pkthdr.mp_rlen;
5696 		if (MPTCP_SEQ_LEQ(dsn, dsn_fail) &&
5697 		    (MPTCP_SEQ_GEQ(dsn + datalen, dsn_fail))) {
5698 			off = (int)(dsn_fail - dsn);
5699 			*tcp_seq = m->m_pkthdr.mp_rseq + off;
5700 			return 0;
5701 		}
5702 
5703 		m = m->m_next;
5704 	}
5705 
5706 	/*
5707 	 * If there was no mbuf data and a fallback to TCP occurred, there's
5708 	 * not much else to do.
5709 	 */
5710 
5711 	os_log_error(mptcp_log_handle, "%s: %llu not found \n", __func__, dsn_fail);
5712 	return -1;
5713 }
5714 
5715 /*
5716  * Support for sending contiguous MPTCP bytes in subflow
5717  * Also for preventing sending data with ACK in 3-way handshake
5718  */
5719 int32_t
mptcp_adj_sendlen(struct socket * so,int32_t off)5720 mptcp_adj_sendlen(struct socket *so, int32_t off)
5721 {
5722 	struct tcpcb *tp = sototcpcb(so);
5723 	struct mptsub *mpts = tp->t_mpsub;
5724 	uint64_t mdss_dsn;
5725 	uint32_t mdss_subflow_seq;
5726 	int mdss_subflow_off;
5727 	uint16_t mdss_data_len;
5728 	uint16_t dss_csum;
5729 
5730 	if (so->so_snd.sb_mb == NULL && (so->so_flags & SOF_DEFUNCT)) {
5731 		return 0;
5732 	}
5733 
5734 	mptcp_output_getm_dsnmap64(so, off, &mdss_dsn, &mdss_subflow_seq,
5735 	    &mdss_data_len, &dss_csum);
5736 
5737 	/*
5738 	 * We need to compute how much of the mapping still remains.
5739 	 * So, we compute the offset in the send-buffer of the dss-sub-seq.
5740 	 */
5741 	mdss_subflow_off = (mdss_subflow_seq + mpts->mpts_iss) - tp->snd_una;
5742 
5743 	/*
5744 	 * When TFO is used, we are sending the mpts->mpts_iss although the relative
5745 	 * seq has been set to 1 (while it should be 0).
5746 	 */
5747 	if (tp->t_mpflags & TMPF_TFO_REQUEST) {
5748 		mdss_subflow_off--;
5749 	}
5750 
5751 	VERIFY(off >= mdss_subflow_off);
5752 
5753 	return mdss_data_len - (off - mdss_subflow_off);
5754 }
5755 
5756 static uint32_t
mptcp_get_maxseg(struct mptses * mpte)5757 mptcp_get_maxseg(struct mptses *mpte)
5758 {
5759 	struct mptsub *mpts;
5760 	uint32_t maxseg = 0;
5761 
5762 	TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
5763 		struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
5764 
5765 		if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
5766 		    TCPS_HAVERCVDFIN2(tp->t_state)) {
5767 			continue;
5768 		}
5769 
5770 		if (tp->t_maxseg > maxseg) {
5771 			maxseg = tp->t_maxseg;
5772 		}
5773 	}
5774 
5775 	return maxseg;
5776 }
5777 
5778 static uint8_t
mptcp_get_rcvscale(struct mptses * mpte)5779 mptcp_get_rcvscale(struct mptses *mpte)
5780 {
5781 	struct mptsub *mpts;
5782 	uint8_t rcvscale = UINT8_MAX;
5783 
5784 	TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
5785 		struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
5786 
5787 		if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
5788 		    TCPS_HAVERCVDFIN2(tp->t_state)) {
5789 			continue;
5790 		}
5791 
5792 		if (tp->rcv_scale < rcvscale) {
5793 			rcvscale = tp->rcv_scale;
5794 		}
5795 	}
5796 
5797 	return rcvscale;
5798 }
5799 
/* Similar to tcp_sbrcv_reserve */
/*
 * Grow the MPTCP receive socket buffer toward 'newsize', clamped by
 * the autorcvbuf maximum and the negotiated window scale, and update
 * the ideal size.  Does nothing when no subflow is established
 * (rcvscale == UINT8_MAX sentinel from mptcp_get_rcvscale()).
 */
static void
mptcp_sbrcv_reserve(struct mptcb *mp_tp, struct sockbuf *sbrcv,
    u_int32_t newsize, u_int32_t idealsize)
{
	uint8_t rcvscale = mptcp_get_rcvscale(mp_tp->mpt_mpte);

	if (rcvscale == UINT8_MAX) {
		return;
	}

	/* newsize should not exceed max */
	newsize = min(newsize, tcp_autorcvbuf_max);

	/* The receive window scale negotiated at the
	 * beginning of the connection will also set a
	 * limit on the socket buffer size
	 */
	newsize = min(newsize, TCP_MAXWIN << rcvscale);

	/* Set new socket buffer size */
	if (newsize > sbrcv->sb_hiwat &&
	    (sbreserve(sbrcv, newsize) == 1)) {
		sbrcv->sb_idealsize = min(max(sbrcv->sb_idealsize,
		    (idealsize != 0) ? idealsize : newsize), tcp_autorcvbuf_max);

		/* Again check the limit set by the advertised
		 * window scale
		 */
		sbrcv->sb_idealsize = min(sbrcv->sb_idealsize,
		    TCP_MAXWIN << rcvscale);
	}
}
5833 
/*
 * Auto-resize the MPTCP-level receive buffer by summing the subflows'
 * receive-buffer high-water and ideal sizes, then reserving that much
 * via mptcp_sbrcv_reserve().
 */
void
mptcp_sbrcv_grow(struct mptcb *mp_tp)
{
	struct mptses *mpte = mp_tp->mpt_mpte;
	struct socket *mp_so = mpte->mpte_mppcb->mpp_socket;
	struct sockbuf *sbrcv = &mp_so->so_rcv;
	uint32_t hiwat_sum = 0;
	uint32_t ideal_sum = 0;
	struct mptsub *mpts;

	/*
	 * Do not grow the receive socket buffer if
	 * - auto resizing is disabled, globally or on this socket
	 * - the high water mark already reached the maximum
	 * - the stream is in background and receive side is being
	 * throttled
	 * - if there are segments in reassembly queue indicating loss,
	 * do not need to increase recv window during recovery as more
	 * data is not going to be sent. A duplicate ack sent during
	 * recovery should not change the receive window
	 */
	if (tcp_do_autorcvbuf == 0 ||
	    (sbrcv->sb_flags & SB_AUTOSIZE) == 0 ||
	    sbrcv->sb_hiwat >= tcp_autorcvbuf_max ||
	    (mp_so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ||
	    !LIST_EMPTY(&mp_tp->mpt_segq)) {
		/* Can not resize the socket buffer, just return */
		return;
	}

	/*
	 * Ideally, we want the rbuf to be (sum_i {bw_i} * rtt_max * 2)
	 *
	 * But, for this we first need accurate receiver-RTT estimations, which
	 * we currently don't have.
	 *
	 * Let's use a dummy algorithm for now, just taking the sum of all
	 * subflow's receive-buffers. It's too low, but that's all we can get
	 * for now.
	 */

	TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
		hiwat_sum += mpts->mpts_socket->so_rcv.sb_hiwat;
		ideal_sum += mpts->mpts_socket->so_rcv.sb_idealsize;
	}

	mptcp_sbrcv_reserve(mp_tp, sbrcv, hiwat_sum, ideal_sum);
}
5882 
5883 /*
5884  * Determine if we can grow the recieve socket buffer to avoid sending
5885  * a zero window update to the peer. We allow even socket buffers that
5886  * have fixed size (set by the application) to grow if the resource
5887  * constraints are met. They will also be trimmed after the application
5888  * reads data.
5889  *
5890  * Similar to tcp_sbrcv_grow_rwin
5891  */
5892 static void
mptcp_sbrcv_grow_rwin(struct mptcb * mp_tp,struct sockbuf * sb)5893 mptcp_sbrcv_grow_rwin(struct mptcb *mp_tp, struct sockbuf *sb)
5894 {
5895 	struct socket *mp_so = mp_tp->mpt_mpte->mpte_mppcb->mpp_socket;
5896 	u_int32_t rcvbufinc = mptcp_get_maxseg(mp_tp->mpt_mpte) << 4;
5897 	u_int32_t rcvbuf = sb->sb_hiwat;
5898 
5899 	if (tcp_recv_bg == 1 || IS_TCP_RECV_BG(mp_so)) {
5900 		return;
5901 	}
5902 
5903 	if (tcp_do_autorcvbuf == 1 &&
5904 	    /* Diff to tcp_sbrcv_grow_rwin */
5905 	    (mp_so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) == 0 &&
5906 	    (rcvbuf - sb->sb_cc) < rcvbufinc &&
5907 	    rcvbuf < tcp_autorcvbuf_max &&
5908 	    (sb->sb_idealsize > 0 &&
5909 	    sb->sb_hiwat <= (sb->sb_idealsize + rcvbufinc))) {
5910 		sbreserve(sb, min((sb->sb_hiwat + rcvbufinc), tcp_autorcvbuf_max));
5911 	}
5912 }
5913 
/* Similar to tcp_sbspace */
/*
 * Return the space available in the MPTCP receive buffer (used as the
 * advertised receive window), after opportunistically growing the
 * buffer and compensating for data held by content filters.  Never
 * negative.
 */
int32_t
mptcp_sbspace(struct mptcb *mp_tp)
{
	struct sockbuf *sb = &mp_tp->mpt_mpte->mpte_mppcb->mpp_socket->so_rcv;
	uint32_t rcvbuf;
	int32_t space;
	int32_t pending = 0;

	socket_lock_assert_owned(mptetoso(mp_tp->mpt_mpte));

	mptcp_sbrcv_grow_rwin(mp_tp, sb);

	/* hiwat might have changed */
	rcvbuf = sb->sb_hiwat;

	/* Bounded by both byte count and mbuf-storage headroom. */
	space =  ((int32_t) imin((rcvbuf - sb->sb_cc),
	    (sb->sb_mbmax - sb->sb_mbcnt)));
	if (space < 0) {
		space = 0;
	}

#if CONTENT_FILTER
	/* Compensate for data being processed by content filters */
	pending = cfil_sock_data_space(sb);
#endif /* CONTENT_FILTER */
	if (pending > space) {
		space = 0;
	} else {
		space -= pending;
	}

	return space;
}
5948 
5949 /*
5950  * Support Fallback to Regular TCP
5951  */
5952 void
mptcp_notify_mpready(struct socket * so)5953 mptcp_notify_mpready(struct socket *so)
5954 {
5955 	struct tcpcb *tp = NULL;
5956 
5957 	if (so == NULL) {
5958 		return;
5959 	}
5960 
5961 	tp = intotcpcb(sotoinpcb(so));
5962 
5963 	if (tp == NULL) {
5964 		return;
5965 	}
5966 
5967 	DTRACE_MPTCP4(multipath__ready, struct socket *, so,
5968 	    struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd,
5969 	    struct tcpcb *, tp);
5970 
5971 	if (!(tp->t_mpflags & TMPF_MPTCP_TRUE)) {
5972 		return;
5973 	}
5974 
5975 	if (tp->t_mpflags & TMPF_MPTCP_READY) {
5976 		return;
5977 	}
5978 
5979 	tp->t_mpflags &= ~TMPF_TCP_FALLBACK;
5980 	tp->t_mpflags |= TMPF_MPTCP_READY;
5981 
5982 	soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPSTATUS));
5983 }
5984 
5985 void
mptcp_notify_mpfail(struct socket * so)5986 mptcp_notify_mpfail(struct socket *so)
5987 {
5988 	struct tcpcb *tp = NULL;
5989 
5990 	if (so == NULL) {
5991 		return;
5992 	}
5993 
5994 	tp = intotcpcb(sotoinpcb(so));
5995 
5996 	if (tp == NULL) {
5997 		return;
5998 	}
5999 
6000 	DTRACE_MPTCP4(multipath__failed, struct socket *, so,
6001 	    struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd,
6002 	    struct tcpcb *, tp);
6003 
6004 	if (tp->t_mpflags & TMPF_TCP_FALLBACK) {
6005 		return;
6006 	}
6007 
6008 	tp->t_mpflags &= ~(TMPF_MPTCP_READY | TMPF_MPTCP_TRUE);
6009 	tp->t_mpflags |= TMPF_TCP_FALLBACK;
6010 
6011 	soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPSTATUS));
6012 }
6013 
6014 /*
6015  * Keepalive helper function
6016  */
6017 boolean_t
mptcp_ok_to_keepalive(struct mptcb * mp_tp)6018 mptcp_ok_to_keepalive(struct mptcb *mp_tp)
6019 {
6020 	boolean_t ret = 1;
6021 
6022 	socket_lock_assert_owned(mptetoso(mp_tp->mpt_mpte));
6023 
6024 	if (mp_tp->mpt_state >= MPTCPS_CLOSE_WAIT) {
6025 		ret = 0;
6026 	}
6027 	return ret;
6028 }
6029 
6030 /*
6031  * MPTCP t_maxseg adjustment function
6032  */
/*
 * Return the number of bytes to shave off t_maxseg so the most common
 * MPTCP option (DSS+ACK) fits into every segment.  Returns 0 when the
 * subflow has no MPTCP PCB.  With mtudisc set, the reduction applies
 * whenever MPTCP is established; without it, also during the pre-
 * established JOIN handshake.
 */
int
mptcp_adj_mss(struct tcpcb *tp, boolean_t mtudisc)
{
	int mss_lower = 0;
	struct mptcb *mp_tp = tptomptp(tp);

/* Both branches currently add 2 bytes: either the DSS checksum field,
 * or padding to a 32-bit boundary plus EOL. The if/else is kept so the
 * two cases stay independently tunable. */
#define MPTCP_COMPUTE_LEN {                             \
	mss_lower = sizeof (struct mptcp_dss_ack_opt);  \
	if (mp_tp->mpt_flags & MPTCPF_CHECKSUM)         \
	        mss_lower += 2;                         \
	else                                            \
	/* adjust to 32-bit boundary + EOL */   \
	        mss_lower += 2;                         \
}
	if (mp_tp == NULL) {
		return 0;
	}

	socket_lock_assert_owned(mptetoso(mp_tp->mpt_mpte));

	/*
	 * For the first subflow and subsequent subflows, adjust mss for
	 * most common MPTCP option size, for case where tcp_mss is called
	 * during option processing and MTU discovery.
	 */
	if (!mtudisc) {
		if (tp->t_mpflags & TMPF_MPTCP_TRUE &&
		    !(tp->t_mpflags & TMPF_JOINED_FLOW)) {
			MPTCP_COMPUTE_LEN;
		}

		if (tp->t_mpflags & TMPF_PREESTABLISHED &&
		    tp->t_mpflags & TMPF_SENT_JOIN) {
			MPTCP_COMPUTE_LEN;
		}
	} else {
		if (tp->t_mpflags & TMPF_MPTCP_TRUE) {
			MPTCP_COMPUTE_LEN;
		}
	}

	return mss_lower;
}
6076 
/*
 * Populate one mptcp_flow_t entry (for the net.inet.mptcp.pcblist
 * sysctl) from a subflow socket: TCP conninfo, local/remote endpoint
 * addresses, and per-subflow MPTCP state.
 */
static void
fill_mptcp_subflow(struct socket *so, mptcp_flow_t *flow, struct mptsub *mpts)
{
	struct inpcb *inp;

	tcp_getconninfo(so, &flow->flow_ci);
	inp = sotoinpcb(so);
	if ((inp->inp_vflag & INP_IPV6) != 0) {
		flow->flow_src.ss_family = AF_INET6;
		flow->flow_dst.ss_family = AF_INET6;
		flow->flow_src.ss_len = sizeof(struct sockaddr_in6);
		flow->flow_dst.ss_len = sizeof(struct sockaddr_in6);
		SIN6(&flow->flow_src)->sin6_port = inp->in6p_lport;
		SIN6(&flow->flow_dst)->sin6_port = inp->in6p_fport;
		SIN6(&flow->flow_src)->sin6_addr = inp->in6p_laddr;
		SIN6(&flow->flow_dst)->sin6_addr = inp->in6p_faddr;
	} else if ((inp->inp_vflag & INP_IPV4) != 0) {
		flow->flow_src.ss_family = AF_INET;
		flow->flow_dst.ss_family = AF_INET;
		flow->flow_src.ss_len = sizeof(struct sockaddr_in);
		flow->flow_dst.ss_len = sizeof(struct sockaddr_in);
		SIN(&flow->flow_src)->sin_port = inp->inp_lport;
		SIN(&flow->flow_dst)->sin_port = inp->inp_fport;
		SIN(&flow->flow_src)->sin_addr = inp->inp_laddr;
		SIN(&flow->flow_dst)->sin_addr = inp->inp_faddr;
	}
	/* flow_len/flow_tcpci_offset let userspace walk variable records */
	flow->flow_len = sizeof(*flow);
	flow->flow_tcpci_offset = offsetof(mptcp_flow_t, flow_ci);
	flow->flow_flags = mpts->mpts_flags;
	flow->flow_cid = mpts->mpts_connid;
	flow->flow_relseq = mpts->mpts_rel_seq;
	flow->flow_soerror = mpts->mpts_socket->so_error;
	flow->flow_probecnt = mpts->mpts_probecnt;
}
6111 
/*
 * sysctl handler for net.inet.mptcp.pcblist: copy one conninfo_mptcp_t
 * record per MPTCP connection to userspace, each followed by one
 * mptcp_flow_t per subflow.  Read-only; holds mppi_lock across the walk
 * and each connection's socket lock while snapshotting it.
 */
static int
mptcp_pcblist SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
	int error = 0, f;
	size_t len;
	struct mppcb *mpp;
	struct mptses *mpte;
	struct mptcb *mp_tp;
	struct mptsub *mpts;
	struct socket *so;
	conninfo_mptcp_t mptcpci;
	mptcp_flow_t *flows = NULL;

	/* This sysctl is read-only */
	if (req->newptr != USER_ADDR_NULL) {
		return EPERM;
	}

	lck_mtx_lock(&mtcbinfo.mppi_lock);
	if (req->oldptr == USER_ADDR_NULL) {
		/* Size probe: estimate with ~12% headroom and assume up to
		 * 4 flows per connection so the caller's buffer is generous. */
		size_t n = mtcbinfo.mppi_count;
		lck_mtx_unlock(&mtcbinfo.mppi_lock);
		req->oldidx = (n + n / 8) * sizeof(conninfo_mptcp_t) +
		    4 * (n + n / 8)  * sizeof(mptcp_flow_t);
		return 0;
	}
	TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
		flows = NULL;
		socket_lock(mpp->mpp_socket, 1);
		VERIFY(mpp->mpp_flags & MPP_ATTACHED);
		mpte = mptompte(mpp);

		socket_lock_assert_owned(mptetoso(mpte));
		mp_tp = mpte->mpte_mptcb;

		/* Snapshot MPTCP-level state under the socket lock */
		bzero(&mptcpci, sizeof(mptcpci));
		mptcpci.mptcpci_state = mp_tp->mpt_state;
		mptcpci.mptcpci_flags = mp_tp->mpt_flags;
		mptcpci.mptcpci_ltoken = mp_tp->mpt_localtoken;
		mptcpci.mptcpci_rtoken = mp_tp->mpt_remotetoken;
		mptcpci.mptcpci_notsent_lowat = mp_tp->mpt_notsent_lowat;
		mptcpci.mptcpci_snduna = mp_tp->mpt_snduna;
		mptcpci.mptcpci_sndnxt = mp_tp->mpt_sndnxt;
		mptcpci.mptcpci_sndmax = mp_tp->mpt_sndmax;
		mptcpci.mptcpci_lidsn = mp_tp->mpt_local_idsn;
		mptcpci.mptcpci_sndwnd = mp_tp->mpt_sndwnd;
		mptcpci.mptcpci_rcvnxt = mp_tp->mpt_rcvnxt;
		mptcpci.mptcpci_rcvatmark = mp_tp->mpt_rcvnxt;
		mptcpci.mptcpci_ridsn = mp_tp->mpt_remote_idsn;
		mptcpci.mptcpci_rcvwnd = mp_tp->mpt_rcvwnd;

		mptcpci.mptcpci_nflows = mpte->mpte_numflows;
		mptcpci.mptcpci_mpte_flags = mpte->mpte_flags;
		mptcpci.mptcpci_mpte_addrid = mpte->mpte_addrid_last;
		mptcpci.mptcpci_flow_offset =
		    offsetof(conninfo_mptcp_t, mptcpci_flows);

		len = sizeof(*flows) * mpte->mpte_numflows;
		if (mpte->mpte_numflows != 0) {
			flows = kalloc_data(len, Z_WAITOK | Z_ZERO);
			if (flows == NULL) {
				/* Allocation failed: stop the walk; error
				 * stays at its last value. */
				socket_unlock(mpp->mpp_socket, 1);
				break;
			}
			/* The struct already embeds one flow slot, hence -1 */
			mptcpci.mptcpci_len = sizeof(mptcpci) +
			    sizeof(*flows) * (mptcpci.mptcpci_nflows - 1);
			error = SYSCTL_OUT(req, &mptcpci,
			    sizeof(mptcpci) - sizeof(mptcp_flow_t));
		} else {
			mptcpci.mptcpci_len = sizeof(mptcpci);
			error = SYSCTL_OUT(req, &mptcpci, sizeof(mptcpci));
		}
		if (error) {
			socket_unlock(mpp->mpp_socket, 1);
			/* kfree_data() tolerates flows == NULL */
			kfree_data(flows, len);
			break;
		}
		f = 0;
		TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
			so = mpts->mpts_socket;
			fill_mptcp_subflow(so, &flows[f], mpts);
			f++;
		}
		socket_unlock(mpp->mpp_socket, 1);
		if (flows) {
			error = SYSCTL_OUT(req, flows, len);
			kfree_data(flows, len);
			if (error) {
				break;
			}
		}
	}
	lck_mtx_unlock(&mtcbinfo.mppi_lock);

	return error;
}
6208 
/* Registers net.inet.mptcp.pcblist (read-only) backed by mptcp_pcblist */
SYSCTL_PROC(_net_inet_mptcp, OID_AUTO, pcblist, CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, mptcp_pcblist, "S,conninfo_mptcp_t",
    "List of active MPTCP connections");
6212 
6213 /*
6214  * Set notsent lowat mark on the MPTCB
6215  */
6216 int
mptcp_set_notsent_lowat(struct mptses * mpte,int optval)6217 mptcp_set_notsent_lowat(struct mptses *mpte, int optval)
6218 {
6219 	struct mptcb *mp_tp = NULL;
6220 	int error = 0;
6221 
6222 	if (mpte->mpte_mppcb->mpp_flags & MPP_ATTACHED) {
6223 		mp_tp = mpte->mpte_mptcb;
6224 	}
6225 
6226 	if (mp_tp) {
6227 		mp_tp->mpt_notsent_lowat = optval;
6228 	} else {
6229 		error = EINVAL;
6230 	}
6231 
6232 	return error;
6233 }
6234 
6235 u_int32_t
mptcp_get_notsent_lowat(struct mptses * mpte)6236 mptcp_get_notsent_lowat(struct mptses *mpte)
6237 {
6238 	struct mptcb *mp_tp = NULL;
6239 
6240 	if (mpte->mpte_mppcb->mpp_flags & MPP_ATTACHED) {
6241 		mp_tp = mpte->mpte_mptcb;
6242 	}
6243 
6244 	if (mp_tp) {
6245 		return mp_tp->mpt_notsent_lowat;
6246 	} else {
6247 		return 0;
6248 	}
6249 }
6250 
/*
 * Return 1 when the MPTCP socket should be reported writable under the
 * NOTSENT_LOWAT policy: either nothing is queued, the unsent backlog is
 * at or below the configured low-water mark, or (Nagle enabled) the
 * active subflow has less than one maxseg of unsent data.
 */
int
mptcp_notsent_lowat_check(struct socket *so)
{
	struct mptses *mpte;
	struct mppcb *mpp;
	struct mptcb *mp_tp;
	struct mptsub *mpts;

	int notsent = 0;

	mpp = mpsotomppcb(so);
	if (mpp == NULL || mpp->mpp_state == MPPCB_STATE_DEAD) {
		return 0;
	}

	mpte = mptompte(mpp);
	socket_lock_assert_owned(mptetoso(mpte));
	mp_tp = mpte->mpte_mptcb;

	notsent = so->so_snd.sb_cc;

	/* sb_cc minus the in-flight span (sndnxt - snduna) is what has
	 * not been handed to any subflow yet. */
	if ((notsent == 0) ||
	    ((notsent - (mp_tp->mpt_sndnxt - mp_tp->mpt_snduna)) <=
	    mp_tp->mpt_notsent_lowat)) {
		return 1;
	}

	/* When Nagle's algorithm is not disabled, it is better
	 * to wakeup the client even before there is at least one
	 * maxseg of data to write.
	 */
	TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
		int retval = 0;
		/* Only the first ACTIVE subflow is consulted; the loop
		 * returns from inside its body. */
		if (mpts->mpts_flags & MPTSF_ACTIVE) {
			struct socket *subf_so = mpts->mpts_socket;
			struct tcpcb *tp = intotcpcb(sotoinpcb(subf_so));

			notsent = so->so_snd.sb_cc -
			    (tp->snd_nxt - tp->snd_una);

			if ((tp->t_flags & TF_NODELAY) == 0 &&
			    notsent > 0 && (notsent <= (int)tp->t_maxseg)) {
				retval = 1;
			}
			return retval;
		}
	}
	return 0;
}
6300 
6301 static errno_t
mptcp_symptoms_ctl_connect(kern_ctl_ref kctlref,struct sockaddr_ctl * sac,void ** unitinfo)6302 mptcp_symptoms_ctl_connect(kern_ctl_ref kctlref, struct sockaddr_ctl *sac,
6303     void **unitinfo)
6304 {
6305 #pragma unused(kctlref, sac, unitinfo)
6306 
6307 	if (OSIncrementAtomic(&mptcp_kern_skt_inuse) > 0) {
6308 		os_log_error(mptcp_log_handle, "%s: MPTCP kernel-control socket for Symptoms already open!", __func__);
6309 	}
6310 
6311 	mptcp_kern_skt_unit = sac->sc_unit;
6312 
6313 	return 0;
6314 }
6315 
/*
 * Symptoms granted cell access to the app identified by 'uuid': briefly
 * mark every matching MPTCP session with MPTE_ACCESS_GRANTED (and
 * MPTE_CELL_PROHIBITED when RSSI is above the target threshold), let it
 * re-evaluate its subflows, then clear the transient flags again.
 */
static void
mptcp_allow_uuid(uuid_t uuid, int32_t rssi)
{
	struct mppcb *mpp;

	/* Iterate over all MPTCP connections */

	lck_mtx_lock(&mtcbinfo.mppi_lock);

	TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
		struct socket *mp_so = mpp->mpp_socket;
		struct mptses *mpte = mpp->mpp_pcbe;

		socket_lock(mp_so, 1);

		/* Match on the effective UUID for delegated sockets, on the
		 * last UUID otherwise; uuid_compare() != 0 means no match. */
		if (mp_so->so_flags & SOF_DELEGATED &&
		    uuid_compare(uuid, mp_so->e_uuid)) {
			goto next;
		} else if (!(mp_so->so_flags & SOF_DELEGATED) &&
		    uuid_compare(uuid, mp_so->last_uuid)) {
			goto next;
		}

		os_log(mptcp_log_handle, "%s - %lx: Got allowance for useApp with rssi %d\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), rssi);

		mpte->mpte_flags |= MPTE_ACCESS_GRANTED;

		if (rssi > MPTCP_TARGET_BASED_RSSI_THRESHOLD) {
			mpte->mpte_flags |= MPTE_CELL_PROHIBITED;
		}

		mptcp_check_subflows_and_add(mpte);
		mptcp_remove_subflows(mpte);

		/* The grant only applies to this one evaluation pass */
		mpte->mpte_flags &= ~(MPTE_ACCESS_GRANTED | MPTE_CELL_PROHIBITED);

next:
		socket_unlock(mp_so, 1);
	}

	lck_mtx_unlock(&mtcbinfo.mppi_lock);
}
6359 
6360 static void
mptcp_wifi_status_changed(void)6361 mptcp_wifi_status_changed(void)
6362 {
6363 	struct mppcb *mpp;
6364 
6365 	/* Iterate over all MPTCP connections */
6366 
6367 	lck_mtx_lock(&mtcbinfo.mppi_lock);
6368 
6369 	TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
6370 		struct socket *mp_so = mpp->mpp_socket;
6371 		struct mptses *mpte = mpp->mpp_pcbe;
6372 
6373 		socket_lock(mp_so, 1);
6374 
6375 		/* Only handover- and urgency-mode are purely driven by Symptom's Wi-Fi status */
6376 		if (mpte->mpte_svctype != MPTCP_SVCTYPE_HANDOVER &&
6377 		    mpte->mpte_svctype != MPTCP_SVCTYPE_PURE_HANDOVER &&
6378 		    mpte->mpte_svctype != MPTCP_SVCTYPE_TARGET_BASED) {
6379 			goto next;
6380 		}
6381 
6382 		mptcp_check_subflows_and_add(mpte);
6383 		mptcp_check_subflows_and_remove(mpte);
6384 
6385 next:
6386 		socket_unlock(mp_so, 1);
6387 	}
6388 
6389 	lck_mtx_unlock(&mtcbinfo.mppi_lock);
6390 }
6391 
/* Shared state for the proc_iterate() filter/callout pair below */
struct mptcp_uuid_search_info {
	uuid_t target_uuid;     /* executable UUID being searched for */
	proc_t found_proc;      /* matching proc, or PROC_NULL */
	boolean_t is_proc_found; /* set once the filter claimed a match */
};
6397 
6398 static int
mptcp_find_proc_filter(proc_t p,void * arg)6399 mptcp_find_proc_filter(proc_t p, void *arg)
6400 {
6401 	struct mptcp_uuid_search_info *info = (struct mptcp_uuid_search_info *)arg;
6402 	int found;
6403 
6404 	if (info->is_proc_found) {
6405 		return 0;
6406 	}
6407 
6408 	/*
6409 	 * uuid_compare returns 0 if the uuids are matching, but the proc-filter
6410 	 * expects != 0 for a matching filter.
6411 	 */
6412 	found = uuid_compare(proc_executableuuid_addr(p), info->target_uuid) == 0;
6413 	if (found) {
6414 		info->is_proc_found = true;
6415 	}
6416 
6417 	return found;
6418 }
6419 
6420 static int
mptcp_find_proc_callout(proc_t p,void * arg)6421 mptcp_find_proc_callout(proc_t p, void * arg)
6422 {
6423 	struct mptcp_uuid_search_info *info = (struct mptcp_uuid_search_info *)arg;
6424 
6425 	if (uuid_compare(proc_executableuuid_addr(p), info->target_uuid) == 0) {
6426 		info->found_proc = p;
6427 		return PROC_CLAIMED_DONE;
6428 	}
6429 
6430 	return PROC_RETURNED;
6431 }
6432 
6433 static proc_t
mptcp_find_proc(const uuid_t uuid)6434 mptcp_find_proc(const uuid_t uuid)
6435 {
6436 	struct mptcp_uuid_search_info info;
6437 
6438 	uuid_copy(info.target_uuid, uuid);
6439 	info.found_proc = PROC_NULL;
6440 	info.is_proc_found = false;
6441 
6442 	proc_iterate(PROC_ALLPROCLIST, mptcp_find_proc_callout, &info,
6443 	    mptcp_find_proc_filter, &info);
6444 
6445 	return info.found_proc;
6446 }
6447 
/*
 * Ask the Symptoms daemon (via the kernel-control socket) whether the
 * app owning this MPTCP session may use the cell interface.  Resolves
 * the owning proc (effective UUID for delegated sockets, last_pid
 * otherwise), classifies its task priority, and enqueues an
 * MPTCP_SYMPTOMS_ASK_UUID request.  Any proc_find()/mptcp_find_proc()
 * reference taken here is released before returning.
 */
void
mptcp_ask_symptoms(struct mptses *mpte)
{
	struct mptcp_symptoms_ask_uuid ask;
	struct socket *mp_so;
	struct proc *p = PROC_NULL;
	int pid, prio, err;

	/* No Symptoms control socket connected yet - nobody to ask */
	if (mptcp_kern_skt_unit == 0) {
		os_log_error(mptcp_log_handle, "%s - %lx: skt_unit is still 0\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
		return;
	}

	mp_so = mptetoso(mpte);

	if (mp_so->so_flags & SOF_DELEGATED) {
		if (mpte->mpte_epid != 0) {
			p = proc_find(mpte->mpte_epid);
			if (p != PROC_NULL) {
				/* We found a pid, check its UUID */
				if (uuid_compare(mp_so->e_uuid, proc_executableuuid_addr(p))) {
					/* It's not the same - we need to look for the real proc */
					proc_rele(p);
					p = PROC_NULL;
				}
			}
		}

		if (p == PROC_NULL) {
			/* Fall back to a UUID-based search over all procs */
			p = mptcp_find_proc(mp_so->e_uuid);
			if (p == PROC_NULL) {
				uuid_string_t uuid_string;
				uuid_unparse(mp_so->e_uuid, uuid_string);

				os_log_error(mptcp_log_handle, "%s - %lx: Couldn't find proc for uuid %s\n",
				    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), uuid_string);

				return;
			}
			/* Cache the resolved pid for the next lookup */
			mpte->mpte_epid = proc_pid(p);
		}

		pid = mpte->mpte_epid;
		uuid_copy(ask.uuid, mp_so->e_uuid);
	} else {
		pid = mp_so->last_pid;

		p = proc_find(pid);
		if (p == PROC_NULL) {
			os_log_error(mptcp_log_handle, "%s - %lx: Couldn't find proc for pid %u\n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), pid);
			return;
		}

		uuid_copy(ask.uuid, mp_so->last_uuid);
	}


	ask.cmd = MPTCP_SYMPTOMS_ASK_UUID;

	/* Map the task's role onto the coarse Symptoms priority classes */
	prio = proc_get_effective_task_policy(proc_task(p), TASK_POLICY_ROLE);

	if (prio == TASK_BACKGROUND_APPLICATION || prio == TASK_NONUI_APPLICATION ||
	    prio == TASK_DARWINBG_APPLICATION) {
		ask.priority = MPTCP_SYMPTOMS_BACKGROUND;
	} else if (prio == TASK_FOREGROUND_APPLICATION) {
		ask.priority = MPTCP_SYMPTOMS_FOREGROUND;
	} else {
		ask.priority = MPTCP_SYMPTOMS_UNKNOWN;
	}

	err = ctl_enqueuedata(mptcp_kern_ctrl_ref, mptcp_kern_skt_unit,
	    &ask, sizeof(ask), CTL_DATA_EOR);

	os_log(mptcp_log_handle, "%s - %lx: asked symptoms about pid %u, taskprio %u, prio %u, err %d\n",
	    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), pid, prio, ask.priority, err);


	proc_rele(p);
}
6529 
static errno_t
mptcp_symptoms_ctl_disconnect(kern_ctl_ref kctlref, u_int32_t kcunit,
    void *unitinfo)
{
#pragma unused(kctlref, kcunit, unitinfo)

	/* Balance the increment done in mptcp_symptoms_ctl_connect() */
	OSDecrementAtomic(&mptcp_kern_skt_inuse);

	return 0;
}
6540 
6541 static errno_t
mptcp_symptoms_ctl_send(kern_ctl_ref kctlref,u_int32_t kcunit,void * unitinfo,mbuf_t m,int flags)6542 mptcp_symptoms_ctl_send(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo,
6543     mbuf_t m, int flags)
6544 {
6545 #pragma unused(kctlref, unitinfo, flags)
6546 	symptoms_advisory_t *sa = NULL;
6547 
6548 	if (kcunit != mptcp_kern_skt_unit) {
6549 		os_log_error(mptcp_log_handle, "%s: kcunit %u is different from expected one %u\n",
6550 		    __func__, kcunit, mptcp_kern_skt_unit);
6551 	}
6552 
6553 	if (mbuf_pkthdr_len(m) < sizeof(*sa)) {
6554 		mbuf_freem(m);
6555 		return EINVAL;
6556 	}
6557 
6558 	if (mbuf_len(m) < sizeof(*sa)) {
6559 		os_log_error(mptcp_log_handle, "%s: mbuf is %lu but need %lu\n",
6560 		    __func__, mbuf_len(m), sizeof(*sa));
6561 		mbuf_freem(m);
6562 		return EINVAL;
6563 	}
6564 
6565 	sa = mbuf_data(m);
6566 
6567 	if (sa->sa_nwk_status != SYMPTOMS_ADVISORY_USEAPP) {
6568 		os_log(mptcp_log_handle, "%s: wifi new,old: %d,%d, cell new, old: %d,%d\n", __func__,
6569 		    sa->sa_wifi_status, mptcp_advisory.sa_wifi_status,
6570 		    sa->sa_cell_status, mptcp_advisory.sa_cell_status);
6571 
6572 		if (sa->sa_wifi_status != mptcp_advisory.sa_wifi_status) {
6573 			mptcp_advisory.sa_wifi_status = sa->sa_wifi_status;
6574 			mptcp_wifi_status_changed();
6575 		}
6576 	} else {
6577 		struct mptcp_symptoms_answer answer;
6578 		errno_t err;
6579 
6580 		/* We temporarily allow different sizes for ease of submission */
6581 		if (mbuf_len(m) != sizeof(uuid_t) + sizeof(*sa) &&
6582 		    mbuf_len(m) != sizeof(answer)) {
6583 			os_log_error(mptcp_log_handle, "%s: mbuf is %lu but need %lu or %lu\n",
6584 			    __func__, mbuf_len(m), sizeof(uuid_t) + sizeof(*sa),
6585 			    sizeof(answer));
6586 			mbuf_free(m);
6587 			return EINVAL;
6588 		}
6589 
6590 		memset(&answer, 0, sizeof(answer));
6591 
6592 		err = mbuf_copydata(m, 0, mbuf_len(m), &answer);
6593 		if (err) {
6594 			os_log_error(mptcp_log_handle, "%s: mbuf_copydata returned %d\n", __func__, err);
6595 			mbuf_free(m);
6596 			return err;
6597 		}
6598 
6599 		mptcp_allow_uuid(answer.uuid, answer.rssi);
6600 	}
6601 
6602 	mbuf_freem(m);
6603 	return 0;
6604 }
6605 
6606 void
mptcp_control_register(void)6607 mptcp_control_register(void)
6608 {
6609 	/* Set up the advisory control socket */
6610 	struct kern_ctl_reg mptcp_kern_ctl;
6611 
6612 	bzero(&mptcp_kern_ctl, sizeof(mptcp_kern_ctl));
6613 	strlcpy(mptcp_kern_ctl.ctl_name, MPTCP_KERN_CTL_NAME,
6614 	    sizeof(mptcp_kern_ctl.ctl_name));
6615 	mptcp_kern_ctl.ctl_connect = mptcp_symptoms_ctl_connect;
6616 	mptcp_kern_ctl.ctl_disconnect = mptcp_symptoms_ctl_disconnect;
6617 	mptcp_kern_ctl.ctl_send = mptcp_symptoms_ctl_send;
6618 	mptcp_kern_ctl.ctl_flags = CTL_FLAG_PRIVILEGED;
6619 
6620 	(void)ctl_register(&mptcp_kern_ctl, &mptcp_kern_ctrl_ref);
6621 }
6622 
/*
 * Classify the current Wi-Fi quality for this MPTCP session, combining
 * the Symptoms advisory with the session's entitlement (first-party vs
 * not) and service type.
 */
mptcp_wifi_quality_t
mptcp_wifi_quality_for_session(struct mptses *mpte)
{
	if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
		/* Trust the advisory only when Symptoms reported anything and
		 * the session is not in plain handover mode. */
		if (mpte->mpte_svctype != MPTCP_SVCTYPE_HANDOVER &&
		    mptcp_advisory.sa_wifi_status) {
			return symptoms_is_wifi_lossy() ? MPTCP_WIFI_QUALITY_BAD : MPTCP_WIFI_QUALITY_GOOD;
		}

		/*
		 * If it's a first-party app and we don't have any info
		 * about the Wi-Fi state, let's be pessimistic.
		 */
		return MPTCP_WIFI_QUALITY_UNSURE;
	} else {
		if (symptoms_is_wifi_lossy()) {
			return MPTCP_WIFI_QUALITY_BAD;
		}

		/*
		 * If we are target-based (meaning, we allow to be more lax on
		 * the when wifi is considered bad), we only *know* about the state once
		 * we got the allowance from Symptoms (MPTE_ACCESS_GRANTED).
		 *
		 * If RSSI is not bad enough, MPTE_CELL_PROHIBITED will then
		 * be set.
		 *
		 * In any other case (while in target-mode), consider WiFi bad
		 * and we are going to ask for allowance from Symptoms anyway.
		 */
		if (mpte->mpte_svctype == MPTCP_SVCTYPE_TARGET_BASED) {
			if (mpte->mpte_flags & MPTE_ACCESS_GRANTED &&
			    mpte->mpte_flags & MPTE_CELL_PROHIBITED) {
				return MPTCP_WIFI_QUALITY_GOOD;
			}

			return MPTCP_WIFI_QUALITY_BAD;
		}

		return MPTCP_WIFI_QUALITY_GOOD;
	}
}
6665 
6666 boolean_t
symptoms_is_wifi_lossy(void)6667 symptoms_is_wifi_lossy(void)
6668 {
6669 	return (mptcp_advisory.sa_wifi_status & SYMPTOMS_ADVISORY_WIFI_OK) ? false : true;
6670 }
6671 
6672 int
mptcp_freeq(struct mptcb * mp_tp)6673 mptcp_freeq(struct mptcb *mp_tp)
6674 {
6675 	struct tseg_qent *q;
6676 	int rv = 0;
6677 	int count = 0;
6678 
6679 	while ((q = LIST_FIRST(&mp_tp->mpt_segq)) != NULL) {
6680 		LIST_REMOVE(q, tqe_q);
6681 		m_freem(q->tqe_m);
6682 		zfree(tcp_reass_zone, q);
6683 		count++;
6684 		rv = 1;
6685 	}
6686 	mp_tp->mpt_reassqlen = 0;
6687 
6688 	if (count > 0) {
6689 		OSAddAtomic(-count, &mptcp_reass_total_qlen);
6690 	}
6691 
6692 	return rv;
6693 }
6694 
6695 static int
mptcp_post_event(u_int32_t event_code,int value)6696 mptcp_post_event(u_int32_t event_code, int value)
6697 {
6698 	struct kev_mptcp_data event_data;
6699 	struct kev_msg ev_msg;
6700 
6701 	memset(&ev_msg, 0, sizeof(ev_msg));
6702 
6703 	ev_msg.vendor_code      = KEV_VENDOR_APPLE;
6704 	ev_msg.kev_class        = KEV_NETWORK_CLASS;
6705 	ev_msg.kev_subclass     = KEV_MPTCP_SUBCLASS;
6706 	ev_msg.event_code       = event_code;
6707 
6708 	event_data.value = value;
6709 
6710 	ev_msg.dv[0].data_ptr    = &event_data;
6711 	ev_msg.dv[0].data_length = sizeof(event_data);
6712 
6713 	return kev_post_msg(&ev_msg);
6714 }
6715 
/*
 * Flip the status-bar cell-in-use icon on for this subflow.  Maintains
 * two counters: the per-session mpte_cellicon_increments and the global
 * mptcp_cellicon_refcount; the userspace event is posted only on the
 * global counter's 0 -> 1 transition.  Also arms the per-subflow
 * TCPT_CELLICON timer used to debounce icon toggling.
 */
static void
mptcp_set_cellicon(struct mptses *mpte, struct mptsub *mpts)
{
	struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
	int error;

	/* First-party apps (Siri) don't flip the cellicon */
	if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
		return;
	}

	/* Subflow is disappearing - don't set it on this one */
	if (mpts->mpts_flags & (MPTSF_DISCONNECTING | MPTSF_DISCONNECTED)) {
		return;
	}

	/* Fallen back connections are not triggering the cellicon */
	if (mpte->mpte_mptcb->mpt_flags & MPTCPF_FALLBACK_TO_TCP) {
		return;
	}

	/* Remember the last time we set the cellicon. Needed for debouncing */
	mpte->mpte_last_cellicon_set = tcp_now;

	tp->t_timer[TCPT_CELLICON] = OFFSET_FROM_START(tp, MPTCP_CELLICON_TOGGLE_RATE);
	tcp_sched_timers(tp);

	if (mpts->mpts_flags & MPTSF_CELLICON_SET &&
	    mpte->mpte_cellicon_increments != 0) {
		if (mptcp_cellicon_refcount == 0) {
			os_log_error(mptcp_log_handle, "%s - %lx: Cell should be set (count is %u), but it's zero!\n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpte->mpte_cellicon_increments);

			/* Continue, so that the icon gets set... */
		} else {
			/*
			 * In this case, the cellicon is already set. No need to bump it
			 * even higher
			 */

			return;
		}
	}

	/* When tearing down this subflow, we need to decrement the
	 * reference counter
	 */
	mpts->mpts_flags |= MPTSF_CELLICON_SET;

	/* This counter, so that when a session gets destroyed we decrement
	 * the reference counter by whatever is left
	 */
	mpte->mpte_cellicon_increments++;

	if (OSIncrementAtomic(&mptcp_cellicon_refcount)) {
		/* If cellicon is already set, get out of here! */
		return;
	}

	error = mptcp_post_event(KEV_MPTCP_CELLUSE, 1);

	if (error) {
		os_log_error(mptcp_log_handle, "%s - %lx: Setting cellicon failed with %d\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error);
	} else {
		os_log(mptcp_log_handle, "%s - %lx: successfully set the cellicon\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
	}
}
6785 
6786 void
mptcp_clear_cellicon(void)6787 mptcp_clear_cellicon(void)
6788 {
6789 	int error = mptcp_post_event(KEV_MPTCP_CELLUSE, 0);
6790 
6791 	if (error) {
6792 		os_log_error(mptcp_log_handle, "%s: Unsetting cellicon failed with %d\n",
6793 		    __func__, error);
6794 	} else {
6795 		os_log(mptcp_log_handle, "%s: successfully unset the cellicon\n",
6796 		    __func__);
6797 	}
6798 }
6799 
6800 /*
6801  * Returns true if the icon has been flipped to WiFi.
6802  */
static boolean_t
__mptcp_unset_cellicon(uint32_t val)
{
	VERIFY(val < INT32_MAX);
	/* OSAddAtomic returns the counter's PREVIOUS value.  The icon is
	 * cleared only when that previous value was exactly 1.
	 * NOTE(review): when val > 1 and the previous value equals val, the
	 * refcount reaches zero but the icon is not cleared here - confirm
	 * whether the intended comparison is against 'val' rather than 1. */
	if (OSAddAtomic((int32_t)-val, &mptcp_cellicon_refcount) != 1) {
		return false;
	}

	mptcp_clear_cellicon();

	return true;
}
6815 
/*
 * Drop 'val' cell-icon references held by this session (optionally tied
 * to one subflow 'mpts').  Clamps val to the session's outstanding
 * increments and clears the global icon when the refcount hits zero.
 */
void
mptcp_unset_cellicon(struct mptses *mpte, struct mptsub *mpts, uint32_t val)
{
	/* First-party apps (Siri) don't flip the cellicon */
	if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
		return;
	}

	if (mpte->mpte_cellicon_increments == 0) {
		/* This flow never used cell - get out of here! */
		return;
	}

	if (mptcp_cellicon_refcount == 0) {
		os_log_error(mptcp_log_handle, "%s - %lx: Cell is off, but should be at least %u\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpte->mpte_cellicon_increments);

		return;
	}

	if (mpts) {
		/* Per-subflow call: only meaningful if this subflow holds a ref */
		if (!(mpts->mpts_flags & MPTSF_CELLICON_SET)) {
			return;
		}

		mpts->mpts_flags &= ~MPTSF_CELLICON_SET;
	}

	/* Never decrement by more than this session ever incremented */
	if (mpte->mpte_cellicon_increments < val) {
		os_log_error(mptcp_log_handle, "%s - %lx: Increments is %u but want to dec by %u.\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpte->mpte_cellicon_increments, val);
		val = mpte->mpte_cellicon_increments;
	}

	mpte->mpte_cellicon_increments -= val;

	if (__mptcp_unset_cellicon(val) == false) {
		return;
	}

	/* All flows are gone - our counter should be at zero too! */
	if (mpte->mpte_cellicon_increments != 0) {
		os_log_error(mptcp_log_handle, "%s - %lx: Inconsistent state! Cell refcount is zero but increments are at %u\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpte->mpte_cellicon_increments);
	}
}
6862 
6863 void
mptcp_reset_rexmit_state(struct tcpcb * tp)6864 mptcp_reset_rexmit_state(struct tcpcb *tp)
6865 {
6866 	struct mptsub *mpts;
6867 	struct inpcb *inp;
6868 	struct socket *so;
6869 
6870 	inp = tp->t_inpcb;
6871 	if (inp == NULL) {
6872 		return;
6873 	}
6874 
6875 	so = inp->inp_socket;
6876 	if (so == NULL) {
6877 		return;
6878 	}
6879 
6880 	if (!(so->so_flags & SOF_MP_SUBFLOW)) {
6881 		return;
6882 	}
6883 
6884 	mpts = tp->t_mpsub;
6885 
6886 	mpts->mpts_flags &= ~MPTSF_WRITE_STALL;
6887 	so->so_flags &= ~SOF_MP_TRYFAILOVER;
6888 }
6889 
/* The subflow's keepalive succeeded - it is no longer read-stalled. */
void
mptcp_reset_keepalive(struct tcpcb *tp)
{
	struct mptsub *mpts = tp->t_mpsub;

	mpts->mpts_flags &= ~MPTSF_READ_STALL;
}
6897 
/* mppi_alloc callback: allocate a zeroed, combined mppcb+mptcb block
 * and hand back the embedded mppcb (Z_NOFAIL - cannot return NULL). */
static struct mppcb *
mtcp_alloc(void)
{
	return &kalloc_type(struct mpp_mtp, Z_WAITOK | Z_ZERO | Z_NOFAIL)->mpp;
}
6903 
/* mppi_free callback: recover the enclosing mpp_mtp from the embedded
 * mppcb allocated by mtcp_alloc() and free it. */
static void
mtcp_free(struct mppcb *mpp)
{
	struct mpp_mtp *mtp = __container_of(mpp, struct mpp_mtp, mpp);

	kfree_type(struct mpp_mtp, mtp);
}
6911 
6912 /*
6913  * Protocol pr_init callback.
6914  */
void
mptcp_init(struct protosw *pp, struct domain *dp)
{
#pragma unused(dp)
	static int mptcp_initialized = 0;
	struct protosw *prp;
	struct ip6protosw *prp6;

	VERIFY((pp->pr_flags & (PR_INITIALIZED | PR_ATTACHED)) == PR_ATTACHED);

	/* do this only once */
	if (mptcp_initialized) {
		return;
	}
	mptcp_initialized = 1;

	/* Start optimistic: Wi-Fi is OK until Symptoms says otherwise */
	mptcp_advisory.sa_wifi_status = SYMPTOMS_ADVISORY_WIFI_OK;

	/*
	 * Since PF_MULTIPATH gets initialized after PF_INET/INET6,
	 * we must be able to find IPPROTO_TCP entries for both.
	 */
	prp = pffindproto_locked(PF_INET, IPPROTO_TCP, SOCK_STREAM);
	VERIFY(prp != NULL);
	/* Clone the TCP protosw and override the subflow entry points */
	bcopy(prp, &mptcp_subflow_protosw, sizeof(*prp));
	bcopy(prp->pr_usrreqs, &mptcp_subflow_usrreqs,
	    sizeof(mptcp_subflow_usrreqs));
	mptcp_subflow_protosw.pr_entry.tqe_next = NULL;
	mptcp_subflow_protosw.pr_entry.tqe_prev = NULL;
	mptcp_subflow_protosw.pr_usrreqs = &mptcp_subflow_usrreqs;
	mptcp_subflow_usrreqs.pru_soreceive = mptcp_subflow_soreceive;
	mptcp_subflow_usrreqs.pru_sosend = mptcp_subflow_sosend;
	mptcp_subflow_usrreqs.pru_rcvoob = pru_rcvoob_notsupp;
	/*
	 * Socket filters shouldn't attach/detach to/from this protosw
	 * since pr_protosw is to be used instead, which points to the
	 * real protocol; if they do, it is a bug and we should panic.
	 */
	mptcp_subflow_protosw.pr_filter_head.tqh_first =
	    (struct socket_filter *)(uintptr_t)0xdeadbeefdeadbeef;
	mptcp_subflow_protosw.pr_filter_head.tqh_last =
	    (struct socket_filter **)(uintptr_t)0xdeadbeefdeadbeef;

	/* Same cloning for the IPv6 flavor of TCP */
	prp6 = (struct ip6protosw *)pffindproto_locked(PF_INET6,
	    IPPROTO_TCP, SOCK_STREAM);
	VERIFY(prp6 != NULL);
	bcopy(prp6, &mptcp_subflow_protosw6, sizeof(*prp6));
	bcopy(prp6->pr_usrreqs, &mptcp_subflow_usrreqs6,
	    sizeof(mptcp_subflow_usrreqs6));
	mptcp_subflow_protosw6.pr_entry.tqe_next = NULL;
	mptcp_subflow_protosw6.pr_entry.tqe_prev = NULL;
	mptcp_subflow_protosw6.pr_usrreqs = &mptcp_subflow_usrreqs6;
	mptcp_subflow_usrreqs6.pru_soreceive = mptcp_subflow_soreceive;
	mptcp_subflow_usrreqs6.pru_sosend = mptcp_subflow_sosend;
	mptcp_subflow_usrreqs6.pru_rcvoob = pru_rcvoob_notsupp;
	/*
	 * Socket filters shouldn't attach/detach to/from this protosw
	 * since pr_protosw is to be used instead, which points to the
	 * real protocol; if they do, it is a bug and we should panic.
	 */
	mptcp_subflow_protosw6.pr_filter_head.tqh_first =
	    (struct socket_filter *)(uintptr_t)0xdeadbeefdeadbeef;
	mptcp_subflow_protosw6.pr_filter_head.tqh_last =
	    (struct socket_filter **)(uintptr_t)0xdeadbeefdeadbeef;

	/* Set up the global MPTCP PCB info (alloc/free, lock, GC, timer) */
	bzero(&mtcbinfo, sizeof(mtcbinfo));
	TAILQ_INIT(&mtcbinfo.mppi_pcbs);
	mtcbinfo.mppi_alloc = mtcp_alloc;
	mtcbinfo.mppi_free  = mtcp_free;

	mtcbinfo.mppi_lock_grp = lck_grp_alloc_init("mppcb", LCK_GRP_ATTR_NULL);
	lck_attr_setdefault(&mtcbinfo.mppi_lock_attr);
	lck_mtx_init(&mtcbinfo.mppi_lock, mtcbinfo.mppi_lock_grp,
	    &mtcbinfo.mppi_lock_attr);

	mtcbinfo.mppi_gc = mptcp_gc;
	mtcbinfo.mppi_timer = mptcp_timer;

	/* attach to MP domain for garbage collection to take place */
	mp_pcbinfo_attach(&mtcbinfo);

	mptcp_log_handle = os_log_create("com.apple.xnu.net.mptcp", "mptcp");
}
6998