xref: /xnu-8792.41.9/bsd/netinet/mptcp_subr.c (revision 5c2921b07a2480ab43ec66f5b9e41cb872bc554f)
1 /*
2  * Copyright (c) 2012-2021 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 #include <kern/locks.h>
30 #include <kern/policy_internal.h>
31 #include <kern/zalloc.h>
32 
33 #include <mach/sdt.h>
34 
35 #include <sys/domain.h>
36 #include <sys/kdebug.h>
37 #include <sys/kern_control.h>
38 #include <sys/kernel.h>
39 #include <sys/mbuf.h>
40 #include <sys/mcache.h>
41 #include <sys/param.h>
42 #include <sys/proc.h>
43 #include <sys/protosw.h>
44 #include <sys/resourcevar.h>
45 #include <sys/socket.h>
46 #include <sys/socketvar.h>
47 #include <sys/sysctl.h>
48 #include <sys/syslog.h>
49 #include <sys/systm.h>
50 
51 #include <net/content_filter.h>
52 #include <net/if.h>
53 #include <net/if_var.h>
54 #include <netinet/in.h>
55 #include <netinet/in_pcb.h>
56 #include <netinet/in_var.h>
57 #include <netinet/tcp.h>
58 #include <netinet/tcp_cache.h>
59 #include <netinet/tcp_fsm.h>
60 #include <netinet/tcp_seq.h>
61 #include <netinet/tcp_var.h>
62 #include <netinet/mptcp_var.h>
63 #include <netinet/mptcp.h>
64 #include <netinet/mptcp_opt.h>
65 #include <netinet/mptcp_seq.h>
66 #include <netinet/mptcp_timer.h>
67 #include <libkern/crypto/sha1.h>
68 #include <libkern/crypto/sha2.h>
69 #include <netinet6/in6_pcb.h>
70 #include <netinet6/ip6protosw.h>
71 #include <dev/random/randomdev.h>
72 
73 /*
74  * Notes on MPTCP implementation.
75  *
76  * MPTCP is implemented as <SOCK_STREAM,IPPROTO_TCP> protocol in PF_MULTIPATH
77  * communication domain.  The structure mtcbinfo describes the MPTCP instance
78  * of a Multipath protocol in that domain.  It is used to keep track of all
79  * MPTCP PCB instances in the system, and is protected by the global lock
80  * mppi_lock.
81  *
82  * An MPTCP socket is opened by calling socket(PF_MULTIPATH, SOCK_STREAM,
83  * IPPROTO_TCP).  Upon success, a Multipath PCB gets allocated and along with
84  * it comes an MPTCP Session and an MPTCP PCB.  All three structures are
85  * allocated from the same memory block, and each structure has a pointer
86  * to the adjacent ones.  The layout is defined by the mpp_mtp structure.
87  * The socket lock (mpp_lock) is used to protect accesses to the Multipath
88  * PCB (mppcb) as well as the MPTCP Session (mptses).
89  *
90  * The MPTCP Session is an MPTCP-specific extension to the Multipath PCB;
91  *
92  * A functioning MPTCP Session consists of one or more subflow sockets.  Each
93  * subflow socket is essentially a regular PF_INET/PF_INET6 TCP socket, and is
94  * represented by the mptsub structure.  Because each subflow requires access
95  * to the MPTCP Session, the MPTCP socket's so_usecount is bumped up for each
96  * subflow.  This gets decremented prior to the subflow's destruction.
97  *
98  * To handle events (read, write, control) from the subflows, we do direct
99  * upcalls into the specific function.
100  *
101  * The whole MPTCP connection is protected by a single lock, the MPTCP socket's
102  * lock. Incoming data on a subflow also ends up taking this single lock. To
103  * achieve the latter, tcp_lock/unlock has been changed to rather use the lock
104  * of the MPTCP-socket.
105  *
106  * An MPTCP socket will be destroyed when its so_usecount drops to zero; this
107  * work is done by the MPTCP garbage collector which is invoked on demand by
108  * the PF_MULTIPATH garbage collector.  This process will take place once all
109  * of the subflows have been destroyed.
110  */
111 
112 static void mptcp_subflow_abort(struct mptsub *, int);
113 
114 static void mptcp_send_dfin(struct socket *so);
115 static void mptcp_set_cellicon(struct mptses *mpte, struct mptsub *mpts);
116 static int mptcp_freeq(struct mptcb *mp_tp);
117 
118 /*
119  * Possible return values for subflow event handlers.  Note that success
120  * values must be greater or equal than MPTS_EVRET_OK.  Values less than that
121  * indicate errors or actions which require immediate attention; they will
122  * prevent the rest of the handlers from processing their respective events
123  * until the next round of events processing.
124  */
125 typedef enum {
126 	MPTS_EVRET_DELETE               = 1,    /* delete this subflow */
127 	MPTS_EVRET_OK                   = 2,    /* OK */
128 	MPTS_EVRET_CONNECT_PENDING      = 3,    /* resume pended connects */
129 	MPTS_EVRET_DISCONNECT_FALLBACK  = 4,    /* abort all but preferred */
130 } ev_ret_t;
131 
132 static void mptcp_do_sha1(mptcp_key_t *, char *);
133 static void mptcp_do_sha256(mptcp_key_t *, char *);
134 
135 static void mptcp_init_local_parms(struct mptses *, struct sockaddr *);
136 
137 static ZONE_DEFINE_TYPE(mptsub_zone, "mptsub", struct mptsub, ZC_ZFREE_CLEARMEM);
138 static ZONE_DEFINE_TYPE(mptopt_zone, "mptopt", struct mptopt, ZC_ZFREE_CLEARMEM);
139 static ZONE_DEFINE(mpt_subauth_zone, "mptauth",
140     sizeof(struct mptcp_subf_auth_entry), ZC_NONE);
141 
142 struct mppcbinfo mtcbinfo;
143 
144 SYSCTL_DECL(_net_inet);
145 
146 SYSCTL_NODE(_net_inet, OID_AUTO, mptcp, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "MPTCP");
147 
148 SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, pcbcount, CTLFLAG_RD | CTLFLAG_LOCKED,
149     &mtcbinfo.mppi_count, 0, "Number of active PCBs");
150 
151 
152 static int mptcp_alternate_port = 0;
153 SYSCTL_INT(_net_inet_mptcp, OID_AUTO, alternate_port, CTLFLAG_RW | CTLFLAG_LOCKED,
154     &mptcp_alternate_port, 0, "Set alternate port for MPTCP connections");
155 
156 static struct protosw mptcp_subflow_protosw;
157 static struct pr_usrreqs mptcp_subflow_usrreqs;
158 static struct ip6protosw mptcp_subflow_protosw6;
159 static struct pr_usrreqs mptcp_subflow_usrreqs6;
160 
161 static uint8_t  mptcp_create_subflows_scheduled;
162 
163 /* Using Symptoms Advisory to detect poor WiFi or poor Cell */
164 static kern_ctl_ref mptcp_kern_ctrl_ref = NULL;
165 static uint32_t mptcp_kern_skt_inuse = 0;
166 static uint32_t mptcp_kern_skt_unit;
167 static symptoms_advisory_t mptcp_advisory;
168 
169 uint32_t mptcp_cellicon_refcount = 0;
170 
171 os_log_t mptcp_log_handle;
172 
173 int
mptcpstats_get_index_by_ifindex(struct mptcp_itf_stats * stats,u_short ifindex,boolean_t create)174 mptcpstats_get_index_by_ifindex(struct mptcp_itf_stats *stats, u_short ifindex, boolean_t create)
175 {
176 	int i, index = -1;
177 
178 	for (i = 0; i < MPTCP_ITFSTATS_SIZE; i++) {
179 		if (create && stats[i].ifindex == IFSCOPE_NONE) {
180 			if (index < 0) {
181 				index = i;
182 			}
183 			continue;
184 		}
185 
186 		if (stats[i].ifindex == ifindex) {
187 			index = i;
188 			return index;
189 		}
190 	}
191 
192 	if (index != -1) {
193 		stats[index].ifindex = ifindex;
194 	}
195 
196 	return index;
197 }
198 
199 static int
mptcpstats_get_index(struct mptcp_itf_stats * stats,const struct mptsub * mpts)200 mptcpstats_get_index(struct mptcp_itf_stats *stats, const struct mptsub *mpts)
201 {
202 	const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
203 	int index;
204 
205 	if (ifp == NULL) {
206 		os_log_error(mptcp_log_handle, "%s - %lx: no ifp on subflow, state %u flags %#x\n",
207 		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpts->mpts_mpte),
208 		    sototcpcb(mpts->mpts_socket)->t_state, mpts->mpts_flags);
209 		return -1;
210 	}
211 
212 	index = mptcpstats_get_index_by_ifindex(stats, ifp->if_index, true);
213 
214 	if (index != -1) {
215 		if (stats[index].is_expensive == 0) {
216 			stats[index].is_expensive = IFNET_IS_CELLULAR(ifp);
217 		}
218 	}
219 
220 	return index;
221 }
222 
223 void
mptcpstats_inc_switch(struct mptses * mpte,const struct mptsub * mpts)224 mptcpstats_inc_switch(struct mptses *mpte, const struct mptsub *mpts)
225 {
226 	int index;
227 
228 	tcpstat.tcps_mp_switches++;
229 	mpte->mpte_subflow_switches++;
230 
231 	index = mptcpstats_get_index(mpte->mpte_itfstats, mpts);
232 
233 	if (index != -1) {
234 		mpte->mpte_itfstats[index].switches++;
235 	}
236 }
237 
238 /*
239  * Flushes all recorded socket options from an MP socket.
240  */
241 static void
mptcp_flush_sopts(struct mptses * mpte)242 mptcp_flush_sopts(struct mptses *mpte)
243 {
244 	struct mptopt *mpo, *tmpo;
245 
246 	TAILQ_FOREACH_SAFE(mpo, &mpte->mpte_sopts, mpo_entry, tmpo) {
247 		mptcp_sopt_remove(mpte, mpo);
248 		mptcp_sopt_free(mpo);
249 	}
250 	VERIFY(TAILQ_EMPTY(&mpte->mpte_sopts));
251 }
252 
253 /*
254  * Create an MPTCP session, called as a result of opening a MPTCP socket.
255  */
256 int
mptcp_session_create(struct mppcb * mpp)257 mptcp_session_create(struct mppcb *mpp)
258 {
259 	struct mppcbinfo *mppi;
260 	struct mptses *mpte;
261 	struct mptcb *mp_tp;
262 
263 	VERIFY(mpp != NULL);
264 	mppi = mpp->mpp_pcbinfo;
265 	VERIFY(mppi != NULL);
266 
267 	__IGNORE_WCASTALIGN(mpte = &((struct mpp_mtp *)mpp)->mpp_ses);
268 	__IGNORE_WCASTALIGN(mp_tp = &((struct mpp_mtp *)mpp)->mtcb);
269 
270 	/* MPTCP Multipath PCB Extension */
271 	bzero(mpte, sizeof(*mpte));
272 	VERIFY(mpp->mpp_pcbe == NULL);
273 	mpp->mpp_pcbe = mpte;
274 	mpte->mpte_mppcb = mpp;
275 	mpte->mpte_mptcb = mp_tp;
276 
277 	TAILQ_INIT(&mpte->mpte_sopts);
278 	TAILQ_INIT(&mpte->mpte_subflows);
279 	mpte->mpte_associd = SAE_ASSOCID_ANY;
280 	mpte->mpte_connid_last = SAE_CONNID_ANY;
281 
282 	mptcp_init_urgency_timer(mpte);
283 
284 	mpte->mpte_itfinfo = &mpte->_mpte_itfinfo[0];
285 	mpte->mpte_itfinfo_size = MPTE_ITFINFO_SIZE;
286 
287 	if (mptcp_alternate_port > 0 && mptcp_alternate_port < UINT16_MAX) {
288 		mpte->mpte_alternate_port = htons((uint16_t)mptcp_alternate_port);
289 	}
290 
291 	mpte->mpte_last_cellicon_set = tcp_now;
292 
293 	/* MPTCP Protocol Control Block */
294 	bzero(mp_tp, sizeof(*mp_tp));
295 	mp_tp->mpt_mpte = mpte;
296 	mp_tp->mpt_state = MPTCPS_CLOSED;
297 
298 	DTRACE_MPTCP1(session__create, struct mppcb *, mpp);
299 
300 	return 0;
301 }
302 
303 struct sockaddr *
mptcp_get_session_dst(struct mptses * mpte,boolean_t ipv6,boolean_t ipv4)304 mptcp_get_session_dst(struct mptses *mpte, boolean_t ipv6, boolean_t ipv4)
305 {
306 	if (ipv6 && mpte->mpte_sub_dst_v6.sin6_family == AF_INET6) {
307 		return (struct sockaddr *)&mpte->mpte_sub_dst_v6;
308 	}
309 
310 	if (ipv4 && mpte->mpte_sub_dst_v4.sin_family == AF_INET) {
311 		return (struct sockaddr *)&mpte->mpte_sub_dst_v4;
312 	}
313 
314 	/* The interface has neither IPv4 nor IPv6 routes. Give our best guess,
315 	 * meaning we prefer IPv6 over IPv4.
316 	 */
317 	if (mpte->mpte_sub_dst_v6.sin6_family == AF_INET6) {
318 		return (struct sockaddr *)&mpte->mpte_sub_dst_v6;
319 	}
320 
321 	if (mpte->mpte_sub_dst_v4.sin_family == AF_INET) {
322 		return (struct sockaddr *)&mpte->mpte_sub_dst_v4;
323 	}
324 
325 	/* We don't yet have a unicast IP */
326 	return NULL;
327 }
328 
329 static void
mptcpstats_get_bytes(struct mptses * mpte,boolean_t initial_cell,uint64_t * cellbytes,uint64_t * allbytes)330 mptcpstats_get_bytes(struct mptses *mpte, boolean_t initial_cell,
331     uint64_t *cellbytes, uint64_t *allbytes)
332 {
333 	int64_t mycellbytes = 0;
334 	uint64_t myallbytes = 0;
335 	int i;
336 
337 	for (i = 0; i < MPTCP_ITFSTATS_SIZE; i++) {
338 		if (mpte->mpte_itfstats[i].is_expensive) {
339 			mycellbytes += mpte->mpte_itfstats[i].mpis_txbytes;
340 			mycellbytes += mpte->mpte_itfstats[i].mpis_rxbytes;
341 		}
342 
343 		myallbytes += mpte->mpte_itfstats[i].mpis_txbytes;
344 		myallbytes += mpte->mpte_itfstats[i].mpis_rxbytes;
345 	}
346 
347 	if (initial_cell) {
348 		mycellbytes -= mpte->mpte_init_txbytes;
349 		mycellbytes -= mpte->mpte_init_rxbytes;
350 	}
351 
352 	if (mycellbytes < 0) {
353 		os_log_error(mptcp_log_handle, "%s - %lx: cellbytes is %lld\n",
354 		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mycellbytes);
355 		*cellbytes = 0;
356 		*allbytes = 0;
357 	} else {
358 		*cellbytes = mycellbytes;
359 		*allbytes = myallbytes;
360 	}
361 }
362 
/*
 * Account end-of-session statistics into the global tcpstat counters,
 * keyed by the session's service type (handover / interactive /
 * aggregate) and by whether the connection was first-party.  Called once
 * from mptcp_session_destroy().
 *
 * For each service type the same pattern applies: count the attempt,
 * count success per initial interface (cell vs. wifi), count cross-
 * interface usage, and - on success - add the transferred byte totals.
 * Service types not listed in the switch (e.g. pure-handover,
 * target-based) are intentionally not byte-accounted here.
 */
static void
mptcpstats_session_wrapup(struct mptses *mpte)
{
	/* Whether the initial (first) subflow ran over cellular. */
	boolean_t cell = mpte->mpte_initial_cell;

	switch (mpte->mpte_svctype) {
	case MPTCP_SVCTYPE_HANDOVER:
		if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
			tcpstat.tcps_mptcp_fp_handover_attempt++;

			if (cell && mpte->mpte_handshake_success) {
				tcpstat.tcps_mptcp_fp_handover_success_cell++;

				/* Started on cell but later used WiFi. */
				if (mpte->mpte_used_wifi) {
					tcpstat.tcps_mptcp_handover_wifi_from_cell++;
				}
			} else if (mpte->mpte_handshake_success) {
				tcpstat.tcps_mptcp_fp_handover_success_wifi++;

				/* Started on WiFi but later used cell. */
				if (mpte->mpte_used_cell) {
					tcpstat.tcps_mptcp_handover_cell_from_wifi++;
				}
			}
		} else {
			/* Same accounting for non-first-party sessions. */
			tcpstat.tcps_mptcp_handover_attempt++;

			if (cell && mpte->mpte_handshake_success) {
				tcpstat.tcps_mptcp_handover_success_cell++;

				if (mpte->mpte_used_wifi) {
					tcpstat.tcps_mptcp_handover_wifi_from_cell++;
				}
			} else if (mpte->mpte_handshake_success) {
				tcpstat.tcps_mptcp_handover_success_wifi++;

				if (mpte->mpte_used_cell) {
					tcpstat.tcps_mptcp_handover_cell_from_wifi++;
				}
			}
		}

		/* Byte totals are only meaningful for successful handshakes. */
		if (mpte->mpte_handshake_success) {
			uint64_t cellbytes;
			uint64_t allbytes;

			mptcpstats_get_bytes(mpte, cell, &cellbytes, &allbytes);

			tcpstat.tcps_mptcp_handover_cell_bytes += cellbytes;
			tcpstat.tcps_mptcp_handover_all_bytes += allbytes;
		}
		break;
	case MPTCP_SVCTYPE_INTERACTIVE:
		if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
			tcpstat.tcps_mptcp_fp_interactive_attempt++;

			if (mpte->mpte_handshake_success) {
				tcpstat.tcps_mptcp_fp_interactive_success++;

				/* Started on WiFi but also used cell. */
				if (!cell && mpte->mpte_used_cell) {
					tcpstat.tcps_mptcp_interactive_cell_from_wifi++;
				}
			}
		} else {
			tcpstat.tcps_mptcp_interactive_attempt++;

			if (mpte->mpte_handshake_success) {
				tcpstat.tcps_mptcp_interactive_success++;

				if (!cell && mpte->mpte_used_cell) {
					tcpstat.tcps_mptcp_interactive_cell_from_wifi++;
				}
			}
		}

		if (mpte->mpte_handshake_success) {
			uint64_t cellbytes;
			uint64_t allbytes;

			mptcpstats_get_bytes(mpte, cell, &cellbytes, &allbytes);

			tcpstat.tcps_mptcp_interactive_cell_bytes += cellbytes;
			tcpstat.tcps_mptcp_interactive_all_bytes += allbytes;
		}
		break;
	case MPTCP_SVCTYPE_AGGREGATE:
		if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
			tcpstat.tcps_mptcp_fp_aggregate_attempt++;

			if (mpte->mpte_handshake_success) {
				tcpstat.tcps_mptcp_fp_aggregate_success++;
			}
		} else {
			tcpstat.tcps_mptcp_aggregate_attempt++;

			if (mpte->mpte_handshake_success) {
				tcpstat.tcps_mptcp_aggregate_success++;
			}
		}

		if (mpte->mpte_handshake_success) {
			uint64_t cellbytes;
			uint64_t allbytes;

			mptcpstats_get_bytes(mpte, cell, &cellbytes, &allbytes);

			tcpstat.tcps_mptcp_aggregate_cell_bytes += cellbytes;
			tcpstat.tcps_mptcp_aggregate_all_bytes += allbytes;
		}
		break;
	}

	/* Session started on cell and managed to move (back) to WiFi. */
	if (cell && mpte->mpte_handshake_success && mpte->mpte_used_wifi) {
		tcpstat.tcps_mptcp_back_to_wifi++;
	}

	/* Session explicitly asked for cellular bring-up at some point. */
	if (mpte->mpte_triggered_cell) {
		tcpstat.tcps_mptcp_triggered_cell++;
	}
}
482 
483 /*
484  * Destroy an MPTCP session.
485  */
486 static void
mptcp_session_destroy(struct mptses * mpte)487 mptcp_session_destroy(struct mptses *mpte)
488 {
489 	struct mptcb *mp_tp = mpte->mpte_mptcb;
490 
491 	VERIFY(mp_tp != NULL);
492 	VERIFY(TAILQ_EMPTY(&mpte->mpte_subflows) && mpte->mpte_numflows == 0);
493 
494 	mptcpstats_session_wrapup(mpte);
495 	mptcp_unset_cellicon(mpte, NULL, mpte->mpte_cellicon_increments);
496 	mptcp_flush_sopts(mpte);
497 
498 	if (mpte->mpte_itfinfo_size > MPTE_ITFINFO_SIZE) {
499 		kfree_data(mpte->mpte_itfinfo,
500 		    sizeof(*mpte->mpte_itfinfo) * mpte->mpte_itfinfo_size);
501 	}
502 	mpte->mpte_itfinfo = NULL;
503 
504 	mptcp_freeq(mp_tp);
505 	m_freem_list(mpte->mpte_reinjectq);
506 
507 	os_log(mptcp_log_handle, "%s - %lx: Destroying session\n",
508 	    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
509 }
510 
511 boolean_t
mptcp_ok_to_create_subflows(struct mptcb * mp_tp)512 mptcp_ok_to_create_subflows(struct mptcb *mp_tp)
513 {
514 	return mp_tp->mpt_state >= MPTCPS_ESTABLISHED &&
515 	       mp_tp->mpt_state < MPTCPS_FIN_WAIT_1 &&
516 	       !(mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP);
517 }
518 
/*
 * Synthesize a NAT64 IPv6 address: embed the IPv4 address 'addrv4' into
 * the IPv6 prefix already stored in 'addr', whose length is 'len' bits.
 * The embedding offsets follow RFC 6052 section 2.2; bits 64-71 (byte 8)
 * are the reserved "u" octet and must remain zero, which is why prefixes
 * shorter than /96 split the IPv4 bytes around offset 8.
 *
 * Returns 0 on success, -1 when 'addrv4' is in a range that must not be
 * represented through NAT64.  Panics on an unsupported prefix length
 * (caller is expected to pass only the RFC-defined lengths).
 */
static int
mptcp_synthesize_nat64(struct in6_addr *addr, uint32_t len,
    const struct in_addr *addrv4)
{
	static const struct in6_addr well_known_prefix = {
		.__u6_addr.__u6_addr8 = {0x00, 0x64, 0xff, 0x9b, 0x00, 0x00,
			                 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
			                 0x00, 0x00, 0x00, 0x00},
	};
	const char *ptrv4 = (const char *)addrv4;
	char *ptr = (char *)addr;

	/* Reject IPv4 ranges that can never be reached through NAT64. */
	if (IN_ZERONET(ntohl(addrv4->s_addr)) || // 0.0.0.0/8 Source hosts on local network
	    IN_LOOPBACK(ntohl(addrv4->s_addr)) || // 127.0.0.0/8 Loopback
	    IN_LINKLOCAL(ntohl(addrv4->s_addr)) || // 169.254.0.0/16 Link Local
	    IN_DS_LITE(ntohl(addrv4->s_addr)) || // 192.0.0.0/29 DS-Lite
	    IN_6TO4_RELAY_ANYCAST(ntohl(addrv4->s_addr)) || // 192.88.99.0/24 6to4 Relay Anycast
	    IN_MULTICAST(ntohl(addrv4->s_addr)) || // 224.0.0.0/4 Multicast
	    INADDR_BROADCAST == addrv4->s_addr) { // 255.255.255.255/32 Limited Broadcast
		return -1;
	}

	/* Check for the well-known prefix */
	if (len == NAT64_PREFIX_LEN_96 &&
	    IN6_ARE_ADDR_EQUAL(addr, &well_known_prefix)) {
		/* The well-known prefix 64:ff9b::/96 additionally excludes
		 * non-global IPv4 addresses (RFC 6052 section 3.1). */
		if (IN_PRIVATE(ntohl(addrv4->s_addr)) || // 10.0.0.0/8, 172.16.0.0/12, 192.168.0.0/16 Private-Use
		    IN_SHARED_ADDRESS_SPACE(ntohl(addrv4->s_addr))) { // 100.64.0.0/10 Shared Address Space
			return -1;
		}
	}

	/* Copy the four IPv4 bytes to the position dictated by the prefix
	 * length, skipping the reserved byte 8 where applicable. */
	switch (len) {
	case NAT64_PREFIX_LEN_96:
		memcpy(ptr + 12, ptrv4, 4);
		break;
	case NAT64_PREFIX_LEN_64:
		memcpy(ptr + 9, ptrv4, 4);
		break;
	case NAT64_PREFIX_LEN_56:
		memcpy(ptr + 7, ptrv4, 1);
		memcpy(ptr + 9, ptrv4 + 1, 3);
		break;
	case NAT64_PREFIX_LEN_48:
		memcpy(ptr + 6, ptrv4, 2);
		memcpy(ptr + 9, ptrv4 + 2, 2);
		break;
	case NAT64_PREFIX_LEN_40:
		memcpy(ptr + 5, ptrv4, 3);
		memcpy(ptr + 9, ptrv4 + 3, 1);
		break;
	case NAT64_PREFIX_LEN_32:
		memcpy(ptr + 4, ptrv4, 4);
		break;
	default:
		panic("NAT64-prefix len is wrong: %u", len);
	}

	return 0;
}
578 
/*
 * Ask the baseband radio manager (via NECP) to bring up the cellular
 * interface for this session, identified by its NECP client UUID.  On
 * success, remember that this session triggered cellular bring-up so
 * mptcpstats_session_wrapup() can account it.
 */
static void
mptcp_trigger_cell_bringup(struct mptses *mpte)
{
	struct socket *mp_so = mptetoso(mpte);

	if (!uuid_is_null(mpsotomppcb(mp_so)->necp_client_uuid)) {
		uuid_string_t uuidstr;
		int err;

		/*
		 * Drop the MPTCP-socket lock across the NECP call.
		 * NOTE(review): socket state may change while unlocked;
		 * nothing below relies on state read before the unlock.
		 */
		socket_unlock(mp_so, 0);
		err = necp_client_assert_bb_radio_manager(mpsotomppcb(mp_so)->necp_client_uuid,
		    TRUE);
		socket_lock(mp_so, 0);

		/* Only record the trigger if the assertion succeeded. */
		if (err == 0) {
			mpte->mpte_triggered_cell = 1;
		}

		uuid_unparse_upper(mpsotomppcb(mp_so)->necp_client_uuid, uuidstr);
		os_log_info(mptcp_log_handle, "%s - %lx: asked irat to bringup cell for uuid %s, err %d\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), uuidstr, err);
	} else {
		os_log_info(mptcp_log_handle, "%s - %lx: UUID is already null\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
	}
}
605 
606 static boolean_t
mptcp_subflow_disconnecting(struct mptsub * mpts)607 mptcp_subflow_disconnecting(struct mptsub *mpts)
608 {
609 	if (mpts->mpts_socket->so_state & SS_ISDISCONNECTED) {
610 		return true;
611 	}
612 
613 	if (mpts->mpts_flags & (MPTSF_DISCONNECTING | MPTSF_DISCONNECTED | MPTSF_CLOSE_REQD)) {
614 		return true;
615 	}
616 
617 	if (sototcpcb(mpts->mpts_socket)->t_state == TCPS_CLOSED) {
618 		return true;
619 	}
620 
621 	return false;
622 }
623 
624 /*
625  * In Handover mode, only create cell subflow if
626  * - Symptoms marked WiFi as weak:
627  *   Here, if we are sending data, then we can check the RTO-state. That is a
628  *   stronger signal of WiFi quality than the Symptoms indicator.
629  *   If however we are not sending any data, the only thing we can do is guess
630  *   and thus bring up Cell.
631  *
632  * - Symptoms marked WiFi as unknown:
633  *   In this state we don't know what the situation is and thus remain
634  *   conservative, only bringing up cell if there are retransmissions going on.
635  */
636 static boolean_t
mptcp_handover_use_cellular(struct mptses * mpte,struct tcpcb * tp)637 mptcp_handover_use_cellular(struct mptses *mpte, struct tcpcb *tp)
638 {
639 	mptcp_wifi_quality_t wifi_quality = mptcp_wifi_quality_for_session(mpte);
640 
641 	if (wifi_quality == MPTCP_WIFI_QUALITY_GOOD) {
642 		/* WiFi is good - don't use cell */
643 		return false;
644 	}
645 
646 	if (wifi_quality == MPTCP_WIFI_QUALITY_UNSURE) {
647 		/*
648 		 * We are in unknown state, only use Cell if we have confirmed
649 		 * that WiFi is bad.
650 		 */
651 		if (mptetoso(mpte)->so_snd.sb_cc != 0 && tp->t_rxtshift >= mptcp_fail_thresh * 2) {
652 			return true;
653 		} else {
654 			return false;
655 		}
656 	}
657 
658 	if (wifi_quality == MPTCP_WIFI_QUALITY_BAD) {
659 		/*
660 		 * WiFi is confirmed to be bad from Symptoms-Framework.
661 		 * If we are sending data, check the RTOs.
662 		 * Otherwise, be pessimistic and use Cell.
663 		 */
664 		if (mptetoso(mpte)->so_snd.sb_cc != 0) {
665 			if (tp->t_rxtshift >= mptcp_fail_thresh * 2) {
666 				return true;
667 			} else {
668 				return false;
669 			}
670 		} else {
671 			return true;
672 		}
673 	}
674 
675 	return false;
676 }
677 
/*
 * Walk the session's known interfaces and create a subflow on every
 * interface that should carry one but doesn't yet.  The decision per
 * interface depends on the service type (handover, pure-handover,
 * target-based, ...), the Symptoms Wi-Fi quality assessment, and the
 * state of existing subflows.  If cellular is wanted but no cellular
 * interface is currently viable, ask for cellular bring-up at the end.
 *
 * Called with the MPTCP-socket lock held (the whole connection is
 * protected by that single lock - see the file-level comment).
 */
void
mptcp_check_subflows_and_add(struct mptses *mpte)
{
	struct mptcb *mp_tp = mpte->mpte_mptcb;
	boolean_t cellular_viable = FALSE;
	boolean_t want_cellular = TRUE;
	uint32_t i;

	/* Connection must be established, not closing, and not fallen back. */
	if (!mptcp_ok_to_create_subflows(mp_tp)) {
		os_log_debug(mptcp_log_handle, "%s - %lx: not a good time for subflows, state %u flags %#x",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mp_tp->mpt_state, mp_tp->mpt_flags);
		return;
	}

	/* Just to see if we have an IP-address available */
	if (mptcp_get_session_dst(mpte, false, false) == NULL) {
		return;
	}

	for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
		boolean_t need_to_ask_symptoms = FALSE, found = FALSE;
		struct mpt_itf_info *info;
		struct sockaddr_in6 nat64pre;
		struct sockaddr *dst;
		struct mptsub *mpts;
		struct ifnet *ifp;
		uint32_t ifindex;

		info = &mpte->mpte_itfinfo[i];

		ifindex = info->ifindex;
		if (ifindex == IFSCOPE_NONE) {
			continue;
		}

		os_log(mptcp_log_handle, "%s - %lx: itf %u no support %u hasv4 %u has v6 %u hasnat64 %u\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), info->ifindex, info->no_mptcp_support,
		    info->has_v4_conn, info->has_v6_conn, info->has_nat64_conn);

		if (info->no_mptcp_support) {
			continue;
		}

		/* Resolve ifindex -> ifnet under the interface-head lock. */
		ifnet_head_lock_shared();
		ifp = ifindex2ifnet[ifindex];
		ifnet_head_done();

		if (ifp == NULL) {
			continue;
		}

		if (IFNET_IS_CELLULAR(ifp)) {
			cellular_viable = TRUE;

			/* In handover modes, don't even consider cellular
			 * while Symptoms reports WiFi as good. */
			if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER ||
			    mpte->mpte_svctype == MPTCP_SVCTYPE_PURE_HANDOVER) {
				if (mptcp_wifi_quality_for_session(mpte) == MPTCP_WIFI_QUALITY_GOOD) {
					continue;
				}
			}
		}

		/* Check existing subflows: is this interface already
		 * covered, or does an existing subflow make a new one on
		 * this interface unnecessary? */
		TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
			const struct ifnet *subifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
			struct tcpcb *tp = sototcpcb(mpts->mpts_socket);

			if (subifp == NULL) {
				continue;
			}

			/*
			 * If there is at least one functioning subflow on WiFi
			 * and we are checking for the cell interface, then
			 * we always need to ask symptoms for permission as
			 * cell is triggered even if WiFi is available.
			 */
			if (!IFNET_IS_CELLULAR(subifp) &&
			    !mptcp_subflow_disconnecting(mpts) &&
			    IFNET_IS_CELLULAR(ifp)) {
				need_to_ask_symptoms = TRUE;
			}

			if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER || mpte->mpte_svctype == MPTCP_SVCTYPE_PURE_HANDOVER) {
				os_log(mptcp_log_handle,
				    "%s - %lx: %s: cell %u wifi-state %d flags %#x rxt %u first-party %u sb_cc %u ifindex %u this %u rtt %u rttvar %u rto %u\n",
				    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
				    mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER ? "handover" : "pure-handover",
				    IFNET_IS_CELLULAR(subifp),
				    mptcp_wifi_quality_for_session(mpte),
				    mpts->mpts_flags,
				    tp->t_rxtshift,
				    !!(mpte->mpte_flags & MPTE_FIRSTPARTY),
				    mptetoso(mpte)->so_snd.sb_cc,
				    ifindex, subifp->if_index,
				    tp->t_srtt >> TCP_RTT_SHIFT,
				    tp->t_rttvar >> TCP_RTTVAR_SHIFT,
				    tp->t_rxtcur);

				if (!IFNET_IS_CELLULAR(subifp) &&
				    !mptcp_subflow_disconnecting(mpts) &&
				    (mpts->mpts_flags & MPTSF_CONNECTED) &&
				    !mptcp_handover_use_cellular(mpte, tp)) {
					found = TRUE;

					/* We found a proper subflow on WiFi - no need for cell */
					want_cellular = FALSE;
					break;
				}
			} else if (mpte->mpte_svctype == MPTCP_SVCTYPE_TARGET_BASED) {
				uint64_t time_now = mach_continuous_time();

				os_log(mptcp_log_handle,
				    "%s - %lx: target-based: %llu now %llu wifi quality %d cell %u sostat %#x mpts_flags %#x tcp-state %u\n",
				    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpte->mpte_time_target,
				    time_now, mptcp_wifi_quality_for_session(mpte),
				    IFNET_IS_CELLULAR(subifp), mpts->mpts_socket->so_state,
				    mpts->mpts_flags, sototcpcb(mpts->mpts_socket)->t_state);

				/* Wi-Fi subflow suffices while the target
				 * time has not passed or WiFi is good. */
				if (!IFNET_IS_CELLULAR(subifp) &&
				    !mptcp_subflow_disconnecting(mpts) &&
				    (mpte->mpte_time_target == 0 ||
				    (int64_t)(mpte->mpte_time_target - time_now) > 0 ||
				    mptcp_wifi_quality_for_session(mpte) == MPTCP_WIFI_QUALITY_GOOD)) {
					found = TRUE;

					want_cellular = FALSE;
					break;
				}
			}

			if (subifp->if_index == ifindex &&
			    !mptcp_subflow_disconnecting(mpts)) {
				/*
				 * We found a subflow on this interface.
				 * No need to create a new one.
				 */
				found = TRUE;
				break;
			}
		}

		if (found) {
			continue;
		}

		/* Third-party apps without a grant must get Symptoms'
		 * permission before a cell subflow is brought up. */
		if (need_to_ask_symptoms &&
		    !(mpte->mpte_flags & MPTE_FIRSTPARTY) &&
		    !(mpte->mpte_flags & MPTE_ACCESS_GRANTED) &&
		    mptcp_developer_mode == 0) {
			mptcp_ask_symptoms(mpte);
			return;
		}

		/* Non-NULL: the earlier best-guess lookup succeeded. */
		dst = mptcp_get_session_dst(mpte, info->has_v6_conn, info->has_v4_conn);

		/* IPv4 destination on a v6-only (NAT64) interface: synthesize
		 * an IPv6 destination from the interface's NAT64 prefix. */
		if (dst->sa_family == AF_INET &&
		    !info->has_v4_conn && info->has_nat64_conn) {
			struct ipv6_prefix nat64prefixes[NAT64_MAX_NUM_PREFIXES];
			int error, j;

			bzero(&nat64pre, sizeof(struct sockaddr_in6));

			error = ifnet_get_nat64prefix(ifp, nat64prefixes);
			if (error) {
				os_log_error(mptcp_log_handle, "%s - %lx: no NAT64-prefix on itf %s, error %d\n",
				    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), ifp->if_name, error);
				continue;
			}

			/* Use the first prefix with a non-zero length. */
			for (j = 0; j < NAT64_MAX_NUM_PREFIXES; j++) {
				if (nat64prefixes[j].prefix_len != 0) {
					break;
				}
			}

			VERIFY(j < NAT64_MAX_NUM_PREFIXES);

			error = mptcp_synthesize_nat64(&nat64prefixes[j].ipv6_prefix,
			    nat64prefixes[j].prefix_len,
			    &((struct sockaddr_in *)(void *)dst)->sin_addr);
			if (error != 0) {
				os_log_error(mptcp_log_handle, "%s - %lx: cannot synthesize this addr\n",
				    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
				continue;
			}

			memcpy(&nat64pre.sin6_addr,
			    &nat64prefixes[j].ipv6_prefix,
			    sizeof(nat64pre.sin6_addr));
			nat64pre.sin6_len = sizeof(struct sockaddr_in6);
			nat64pre.sin6_family = AF_INET6;
			nat64pre.sin6_port = ((struct sockaddr_in *)(void *)dst)->sin_port;
			nat64pre.sin6_flowinfo = 0;
			nat64pre.sin6_scope_id = 0;

			dst = (struct sockaddr *)&nat64pre;
		}

		/* Skip if the interface can't actually reach dst's family. */
		if (dst->sa_family == AF_INET && !info->has_v4_conn) {
			continue;
		}
		if (dst->sa_family == AF_INET6 && !info->has_v6_conn) {
			continue;
		}

		mptcp_subflow_add(mpte, NULL, dst, ifindex, NULL);
	}

	if (!cellular_viable && want_cellular) {
		/* Trigger Cell Bringup */
		mptcp_trigger_cell_bringup(mpte);
	}
}
891 
892 static void
mptcp_remove_cell_subflows(struct mptses * mpte)893 mptcp_remove_cell_subflows(struct mptses *mpte)
894 {
895 	struct mptsub *mpts, *tmpts;
896 
897 	TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
898 		const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
899 
900 		if (ifp == NULL || !IFNET_IS_CELLULAR(ifp)) {
901 			continue;
902 		}
903 
904 		os_log(mptcp_log_handle, "%s - %lx: removing cell subflow\n",
905 		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
906 
907 		soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
908 	}
909 
910 	return;
911 }
912 
913 static void
mptcp_remove_wifi_subflows(struct mptses * mpte)914 mptcp_remove_wifi_subflows(struct mptses *mpte)
915 {
916 	struct mptsub *mpts, *tmpts;
917 
918 	TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
919 		const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
920 
921 		if (ifp == NULL || IFNET_IS_CELLULAR(ifp)) {
922 			continue;
923 		}
924 
925 		os_log(mptcp_log_handle, "%s - %lx: removing wifi subflow\n",
926 		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
927 
928 		soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
929 	}
930 
931 	return;
932 }
933 
/*
 * Pure-handover policy: keep traffic on exactly one kind of interface.
 * Scan all established subflows; if a working Wi-Fi subflow exists (or
 * Symptoms says Wi-Fi is good), tear down the cellular subflows.  If no
 * Wi-Fi subflow works and a cellular one does, tear down the Wi-Fi
 * subflows instead.  If nothing works, remove nothing.
 */
static void
mptcp_pure_handover_subflows_remove(struct mptses *mpte)
{
	mptcp_wifi_quality_t wifi_quality = mptcp_wifi_quality_for_session(mpte);
	boolean_t found_working_wifi_subflow = false;
	boolean_t found_working_cell_subflow = false;

	struct mptsub *mpts;

	/*
	 * Look for a subflow that is on a non-cellular interface in connected
	 * state.
	 *
	 * In that case, remove all cellular subflows.
	 *
	 * If however there is no connected subflow
	 */
	TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
		const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
		struct socket *so;
		struct tcpcb *tp;

		/* No interface yet - can't classify this subflow. */
		if (ifp == NULL) {
			continue;
		}

		so = mpts->mpts_socket;
		tp = sototcpcb(so);

		/* Only fully established, non-closing subflows count. */
		if (!(mpts->mpts_flags & MPTSF_CONNECTED) ||
		    tp->t_state != TCPS_ESTABLISHED ||
		    mptcp_subflow_disconnecting(mpts)) {
			continue;
		}

		if (IFNET_IS_CELLULAR(ifp)) {
			found_working_cell_subflow = true;
		} else {
			os_log_debug(mptcp_log_handle, "%s - %lx: rxt %u sb_cc %u wifi quality %d\n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), tp->t_rxtshift, mptetoso(mpte)->so_snd.sb_cc, wifi_quality);
			/* Wi-Fi only counts as working if the handover
			 * policy would not prefer cellular right now. */
			if (!mptcp_handover_use_cellular(mpte, tp)) {
				found_working_wifi_subflow = true;
			}
		}
	}

	/*
	 * Couldn't find a working subflow, let's not remove those on a cellular
	 * interface.
	 */
	os_log_debug(mptcp_log_handle, "%s - %lx: Found Wi-Fi: %u Found Cellular %u",
	    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
	    found_working_wifi_subflow, found_working_cell_subflow);
	if (!found_working_wifi_subflow && wifi_quality != MPTCP_WIFI_QUALITY_GOOD) {
		/* Cellular carries the connection: drop Wi-Fi instead. */
		if (found_working_cell_subflow) {
			mptcp_remove_wifi_subflows(mpte);
		}
		return;
	}

	/* Wi-Fi is usable (or reported good) - cellular is not needed. */
	mptcp_remove_cell_subflows(mpte);
}
996 
/*
 * Handover policy: cellular subflows exist only as backup.  If at least
 * one non-cellular subflow is established and working, all cellular
 * subflows are torn down.
 */
static void
mptcp_handover_subflows_remove(struct mptses *mpte)
{
	mptcp_wifi_quality_t wifi_quality = mptcp_wifi_quality_for_session(mpte);
	boolean_t found_working_subflow = false;
	struct mptsub *mpts;

	/*
	 * Look for a subflow that is on a non-cellular interface
	 * and actually works (aka, no retransmission timeout).
	 */
	TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
		const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
		struct socket *so;
		struct tcpcb *tp;

		/* Only non-cellular subflows with a known interface qualify */
		if (ifp == NULL || IFNET_IS_CELLULAR(ifp)) {
			continue;
		}

		so = mpts->mpts_socket;
		tp = sototcpcb(so);

		/* Must be fully established at both the MPTCP and TCP level */
		if (!(mpts->mpts_flags & MPTSF_CONNECTED) ||
		    tp->t_state != TCPS_ESTABLISHED) {
			continue;
		}

		os_log_debug(mptcp_log_handle, "%s - %lx: rxt %u sb_cc %u wifi quality %d\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), tp->t_rxtshift, mptetoso(mpte)->so_snd.sb_cc, wifi_quality);

		/* "Working" means the handover heuristic would not use cell */
		if (!mptcp_handover_use_cellular(mpte, tp)) {
			found_working_subflow = true;
			break;
		}
	}

	/*
	 * Couldn't find a working subflow, let's not remove those on a cellular
	 * interface.
	 */
	if (!found_working_subflow) {
		return;
	}

	mptcp_remove_cell_subflows(mpte);
}
1044 
/*
 * Target-based policy: cellular usage is governed by a caller-provided
 * point in time (mpte_time_target, in mach_continuous_time units).
 * Cellular subflows are removed once a working Wi-Fi subflow exists,
 * unless the target condition below forbids it.
 */
static void
mptcp_targetbased_subflows_remove(struct mptses *mpte)
{
	uint64_t time_now = mach_continuous_time();
	struct mptsub *mpts;

	if (mpte->mpte_time_target != 0 &&
	    (int64_t)(mpte->mpte_time_target - time_now) <= 0 &&
	    mptcp_wifi_quality_for_session(mpte) != MPTCP_WIFI_QUALITY_GOOD) {
		/*
		 * WiFi is bad and we are below the target - don't remove any
		 * subflows.
		 * NOTE(review): the comparison fires once time_now has
		 * reached/passed mpte_time_target, which reads as "at or past
		 * the target" rather than "below" - confirm the intended
		 * semantics of mpte_time_target against its setter.
		 */
		return;
	}

	TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
		const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;

		/* Skip cellular subflows and those not yet routed */
		if (ifp == NULL || IFNET_IS_CELLULAR(ifp)) {
			continue;
		}

		/* We have a functioning subflow on WiFi. No need for cell! */
		if (mpts->mpts_flags & MPTSF_CONNECTED &&
		    !mptcp_subflow_disconnecting(mpts)) {
			mptcp_remove_cell_subflows(mpte);
			break;
		}
	}
}
1073 
1074 /*
1075  * Based on the MPTCP Service-type and the state of the subflows, we
1076  * will destroy subflows here.
1077  */
1078 void
mptcp_check_subflows_and_remove(struct mptses * mpte)1079 mptcp_check_subflows_and_remove(struct mptses *mpte)
1080 {
1081 	if (!mptcp_ok_to_create_subflows(mpte->mpte_mptcb)) {
1082 		return;
1083 	}
1084 
1085 	socket_lock_assert_owned(mptetoso(mpte));
1086 
1087 	if (mpte->mpte_svctype == MPTCP_SVCTYPE_PURE_HANDOVER) {
1088 		mptcp_pure_handover_subflows_remove(mpte);
1089 	}
1090 
1091 	if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER) {
1092 		mptcp_handover_subflows_remove(mpte);
1093 	}
1094 
1095 	if (mpte->mpte_svctype == MPTCP_SVCTYPE_TARGET_BASED) {
1096 		mptcp_targetbased_subflows_remove(mpte);
1097 	}
1098 }
1099 
/*
 * Reap subflows that no longer match the session's usable-interface
 * list (mpte_itfinfo), or that have been explicitly flagged for close
 * (MPTSF_CLOSE_REQD, set e.g. by the NECP callback).  Reaping is done
 * by posting a NOSRCADDR event on the subflow socket.
 */
static void
mptcp_remove_subflows(struct mptses *mpte)
{
	struct mptsub *mpts, *tmpts;

	if (!mptcp_ok_to_create_subflows(mpte->mpte_mptcb)) {
		return;
	}

	TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
		const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
		boolean_t found = false;
		uint32_t ifindex;
		uint32_t i;

		/* Explicit close request takes precedence over interface matching */
		if (mpts->mpts_flags & MPTSF_CLOSE_REQD) {
			mpts->mpts_flags &= ~MPTSF_CLOSE_REQD;

			os_log(mptcp_log_handle, "%s - %lx: itf %u close_reqd last itf %d\n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpts->mpts_ifscope,
			    ifp ? ifp->if_index : -1);
			soevent(mpts->mpts_socket,
			    SO_FILT_HINT_LOCKED | SO_FILT_HINT_NOSRCADDR);

			continue;
		}

		/* No interface information at all - nothing to match against */
		if (ifp == NULL && mpts->mpts_ifscope == IFSCOPE_NONE) {
			continue;
		}

		/* Prefer the actual output interface over the bound scope */
		if (ifp) {
			ifindex = ifp->if_index;
		} else {
			ifindex = mpts->mpts_ifscope;
		}

		/*
		 * Keep the subflow only if its interface still appears in the
		 * session's interface list with connectivity matching the
		 * subflow's address family (v6 also accepts NAT64).
		 */
		for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
			if (mpte->mpte_itfinfo[i].ifindex == IFSCOPE_NONE) {
				continue;
			}

			if (mpte->mpte_itfinfo[i].ifindex == ifindex) {
				if (mpts->mpts_dst.sa_family == AF_INET6 &&
				    (mpte->mpte_itfinfo[i].has_v6_conn || mpte->mpte_itfinfo[i].has_nat64_conn)) {
					found = true;
					break;
				}

				if (mpts->mpts_dst.sa_family == AF_INET &&
				    mpte->mpte_itfinfo[i].has_v4_conn) {
					found = true;
					break;
				}
			}
		}

		if (!found) {
			os_log(mptcp_log_handle, "%s - %lx: itf %u killing %#x\n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
			    ifindex, mpts->mpts_flags);

			soevent(mpts->mpts_socket,
			    SO_FILT_HINT_LOCKED | SO_FILT_HINT_NOSRCADDR);
		}
	}
}
1167 
/*
 * Deferred worker, armed via timeout() from mptcp_sched_create_subflows().
 * Walks all MPTCP sessions and, for those flagged MPP_CREATE_SUBFLOWS,
 * creates missing subflows and reaps stale ones.
 */
static void
mptcp_create_subflows(__unused void *arg)
{
	struct mppcb *mpp;

	/*
	 * Start with clearing, because we might be processing connections
	 * while a new event comes in.
	 */
	if (OSTestAndClear(0x01, &mptcp_create_subflows_scheduled)) {
		os_log_error(mptcp_log_handle, "%s: bit was already cleared!\n", __func__);
	}

	/* Iterate over all MPTCP connections */

	lck_mtx_lock(&mtcbinfo.mppi_lock);

	TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
		struct socket *mp_so = mpp->mpp_socket;
		struct mptses *mpte = mpp->mpp_pcbe;

		if (!(mpp->mpp_flags & MPP_CREATE_SUBFLOWS)) {
			continue;
		}

		socket_lock(mp_so, 1);
		/* mptcp_sched_create_subflows() took a use count on the socket */
		VERIFY(mp_so->so_usecount > 0);

		mpp->mpp_flags &= ~MPP_CREATE_SUBFLOWS;

		mptcp_check_subflows_and_add(mpte);
		mptcp_remove_subflows(mpte);

		mp_so->so_usecount--; /* See mptcp_sched_create_subflows */
		socket_unlock(mp_so, 1);
	}

	lck_mtx_unlock(&mtcbinfo.mppi_lock);
}
1207 
1208 /*
1209  * We need this because we are coming from an NECP-event. This event gets posted
1210  * while holding NECP-locks. The creation of the subflow however leads us back
1211  * into NECP (e.g., to add the necp_cb and also from tcp_connect).
1212  * So, we would deadlock there as we already hold the NECP-lock.
1213  *
1214  * So, let's schedule this separately. It also gives NECP the chance to make
1215  * progress, without having to wait for MPTCP to finish its subflow creation.
1216  */
void
mptcp_sched_create_subflows(struct mptses *mpte)
{
	struct mppcb *mpp = mpte->mpte_mppcb;
	struct mptcb *mp_tp = mpte->mpte_mptcb;
	struct socket *mp_so = mpp->mpp_socket;

	/* Bail if the session is in no state to take new subflows */
	if (!mptcp_ok_to_create_subflows(mp_tp)) {
		os_log_debug(mptcp_log_handle, "%s - %lx: not a good time for subflows, state %u flags %#x",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mp_tp->mpt_state, mp_tp->mpt_flags);
		return;
	}

	/* Flag this session; the use count is dropped in mptcp_create_subflows() */
	if (!(mpp->mpp_flags & MPP_CREATE_SUBFLOWS)) {
		mp_so->so_usecount++; /* To prevent it from being free'd in-between */
		mpp->mpp_flags |= MPP_CREATE_SUBFLOWS;
	}

	/* Already scheduled - the pending run will pick this session up */
	if (OSTestAndSet(0x01, &mptcp_create_subflows_scheduled)) {
		return;
	}

	/* Do the call in 100ms to allow NECP to schedule it on all sockets */
	timeout(mptcp_create_subflows, NULL, hz / 10);
}
1242 
1243 /*
1244  * Allocate an MPTCP socket option structure.
1245  */
1246 struct mptopt *
mptcp_sopt_alloc(zalloc_flags_t how)1247 mptcp_sopt_alloc(zalloc_flags_t how)
1248 {
1249 	return zalloc_flags(mptopt_zone, how | Z_ZERO);
1250 }
1251 
/*
 * Free an MPTCP socket option structure.
 *
 * The option must already have been unlinked from any session list
 * (MPOF_ATTACHED clear); freeing an attached option is a bug.
 */
void
mptcp_sopt_free(struct mptopt *mpo)
{
	VERIFY(!(mpo->mpo_flags & MPOF_ATTACHED));

	zfree(mptopt_zone, mpo);
}
1262 
1263 /*
1264  * Add a socket option to the MPTCP socket option list.
1265  */
1266 void
mptcp_sopt_insert(struct mptses * mpte,struct mptopt * mpo)1267 mptcp_sopt_insert(struct mptses *mpte, struct mptopt *mpo)
1268 {
1269 	socket_lock_assert_owned(mptetoso(mpte));
1270 	mpo->mpo_flags |= MPOF_ATTACHED;
1271 	TAILQ_INSERT_TAIL(&mpte->mpte_sopts, mpo, mpo_entry);
1272 }
1273 
1274 /*
1275  * Remove a socket option from the MPTCP socket option list.
1276  */
1277 void
mptcp_sopt_remove(struct mptses * mpte,struct mptopt * mpo)1278 mptcp_sopt_remove(struct mptses *mpte, struct mptopt *mpo)
1279 {
1280 	socket_lock_assert_owned(mptetoso(mpte));
1281 	VERIFY(mpo->mpo_flags & MPOF_ATTACHED);
1282 	mpo->mpo_flags &= ~MPOF_ATTACHED;
1283 	TAILQ_REMOVE(&mpte->mpte_sopts, mpo, mpo_entry);
1284 }
1285 
1286 /*
1287  * Search for an existing <sopt_level,sopt_name> socket option.
1288  */
1289 struct mptopt *
mptcp_sopt_find(struct mptses * mpte,struct sockopt * sopt)1290 mptcp_sopt_find(struct mptses *mpte, struct sockopt *sopt)
1291 {
1292 	struct mptopt *mpo;
1293 
1294 	socket_lock_assert_owned(mptetoso(mpte));
1295 
1296 	TAILQ_FOREACH(mpo, &mpte->mpte_sopts, mpo_entry) {
1297 		if (mpo->mpo_level == sopt->sopt_level &&
1298 		    mpo->mpo_name == sopt->sopt_name) {
1299 			break;
1300 		}
1301 	}
1302 	return mpo;
1303 }
1304 
1305 /*
1306  * Allocate a MPTCP subflow structure.
1307  */
1308 static struct mptsub *
mptcp_subflow_alloc(void)1309 mptcp_subflow_alloc(void)
1310 {
1311 	return zalloc_flags(mptsub_zone, Z_WAITOK | Z_ZERO);
1312 }
1313 
/*
 * Deallocate a subflow structure, called when all of the references held
 * on it have been released.  This implies that the subflow has been deleted.
 * The subflow must already be detached from its session and socket.
 */
static void
mptcp_subflow_free(struct mptsub *mpts)
{
	VERIFY(mpts->mpts_refcnt == 0);
	VERIFY(!(mpts->mpts_flags & MPTSF_ATTACHED));
	VERIFY(mpts->mpts_mpte == NULL);
	VERIFY(mpts->mpts_socket == NULL);

	/* Drop the cached source address, if any, then the structure itself */
	free_sockaddr(mpts->mpts_src);

	zfree(mptsub_zone, mpts);
}
1330 
/*
 * Take a reference on a subflow structure.  Panics on refcount
 * wraparound, since wrapping back to zero would permit a premature
 * free while references are still outstanding.
 */
static void
mptcp_subflow_addref(struct mptsub *mpts)
{
	if (++mpts->mpts_refcnt == 0) {
		panic("%s: mpts %p wraparound refcnt", __func__, mpts);
		/* NOTREACHED */
	}
}
1339 
1340 static void
mptcp_subflow_remref(struct mptsub * mpts)1341 mptcp_subflow_remref(struct mptsub *mpts)
1342 {
1343 	if (mpts->mpts_refcnt == 0) {
1344 		panic("%s: mpts %p negative refcnt", __func__, mpts);
1345 		/* NOTREACHED */
1346 	}
1347 	if (--mpts->mpts_refcnt > 0) {
1348 		return;
1349 	}
1350 
1351 	/* callee will unlock and destroy lock */
1352 	mptcp_subflow_free(mpts);
1353 }
1354 
/*
 * Link a freshly created subflow socket into its MPTCP session: wire up
 * the TCP PCB, put the subflow on the session list, and take the two
 * initial references (list membership and subflow socket).
 */
static void
mptcp_subflow_attach(struct mptses *mpte, struct mptsub *mpts, struct socket *so)
{
	struct socket *mp_so = mpte->mpte_mppcb->mpp_socket;
	struct tcpcb *tp = sototcpcb(so);

	/*
	 * From this moment on, the subflow is linked to the MPTCP-connection.
	 * Locking,... happens now at the MPTCP-layer
	 */
	tp->t_mptcb = mpte->mpte_mptcb;
	so->so_flags |= SOF_MP_SUBFLOW;
	mp_so->so_usecount++;   /* the subflow holds a use on the MP socket */

	/*
	 * Insert the subflow into the list, and associate the MPTCP PCB
	 * as well as the subflow socket.  From this point on, removing
	 * the subflow needs to be done via mptcp_subflow_del().
	 */
	TAILQ_INSERT_TAIL(&mpte->mpte_subflows, mpts, mpts_entry);
	mpte->mpte_numflows++;

	atomic_bitset_32(&mpts->mpts_flags, MPTSF_ATTACHED);
	mpts->mpts_mpte = mpte;
	mpts->mpts_socket = so;
	tp->t_mpsub = mpts;
	mptcp_subflow_addref(mpts);     /* for being in MPTCP subflow list */
	mptcp_subflow_addref(mpts);     /* for subflow socket */
}
1384 
/*
 * NECP callback for a subflow's inpcb.  When the subflow's path becomes
 * non-viable (or its interface enters low-power mode), flag the subflow
 * for closure and schedule the subflow-maintenance worker, which will
 * reap it and create replacements.
 */
static void
mptcp_subflow_necp_cb(void *handle, __unused int action,
    __unused uint32_t interface_index,
    uint32_t necp_flags, bool *viable)
{
	boolean_t low_power = !!(necp_flags & NECP_CLIENT_RESULT_FLAG_INTERFACE_LOW_POWER);
	struct inpcb *inp = (struct inpcb *)handle;
	struct socket *so = inp->inp_socket;
	struct mptsub *mpts;
	struct mptses *mpte;

	/* A low-power interface is treated like a non-viable one */
	if (low_power) {
		action = NECP_CLIENT_CBACTION_NONVIABLE;
	}

	/* Only non-viability events are of interest here */
	if (action != NECP_CLIENT_CBACTION_NONVIABLE) {
		return;
	}

	/*
	 * The socket is being garbage-collected. There is nothing to be done
	 * here.
	 */
	if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) == WNT_STOPUSING) {
		return;
	}

	socket_lock(so, 1);

	/* Check again after we acquired the lock. */
	if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) {
		goto out;
	}

	mpte = tptomptp(sototcpcb(so))->mpt_mpte;
	mpts = sototcpcb(so)->t_mpsub;

	os_log_debug(mptcp_log_handle, "%s - %lx: Subflow on itf %u became non-viable, power %u",
	    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpts->mpts_ifscope, low_power);

	/* Ask mptcp_remove_subflows() to close this subflow */
	mpts->mpts_flags |= MPTSF_CLOSE_REQD;

	mptcp_sched_create_subflows(mpte);

	/*
	 * For handover/target-based service types, report the flow as still
	 * viable.  NOTE(review): presumably this keeps NECP from tearing the
	 * flow down before MPTCP has migrated the traffic - confirm against
	 * the NECP client callback contract.
	 */
	if ((mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER ||
	    mpte->mpte_svctype == MPTCP_SVCTYPE_PURE_HANDOVER ||
	    mpte->mpte_svctype == MPTCP_SVCTYPE_TARGET_BASED) &&
	    viable != NULL) {
		*viable = 1;
	}

out:
	socket_unlock(so, 1);
}
1439 
/*
 * Create an MPTCP subflow socket.
 *
 * Allocates a TCP socket for the subflow on behalf of the process that
 * owns the MP socket, attaches it to the session (mptcp_subflow_attach),
 * registers it with NECP, copies the NECP string attributes, inherits
 * delegation identity and eligible socket options, and finally switches
 * the socket to the MPTCP subflow protosw.  On success *so holds the
 * new socket and 0 is returned; on failure the subflow is aborted/freed
 * and an errno value is returned.
 */
static int
mptcp_subflow_socreate(struct mptses *mpte, struct mptsub *mpts, int dom,
    struct socket **so)
{
	lck_mtx_t *subflow_mtx;
	struct mptopt smpo, *mpo, *tmpo;
	struct proc *p;
	struct socket *mp_so;
	struct mppcb *mpp;
	int error;

	*so = NULL;

	mp_so = mptetoso(mpte);
	mpp = mpsotomppcb(mp_so);

	/* The subflow is created under the identity of the MP socket's owner */
	p = proc_find(mp_so->last_pid);
	if (p == PROC_NULL) {
		os_log_error(mptcp_log_handle, "%s - %lx: Couldn't find proc for pid %u\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mp_so->last_pid);

		mptcp_subflow_free(mpts);
		return ESRCH;
	}

	/*
	 * Create the subflow socket (multipath subflow, non-blocking.)
	 *
	 * This will cause SOF_MP_SUBFLOW socket flag to be set on the subflow
	 * socket; it will be cleared when the socket is peeled off or closed.
	 * It also indicates to the underlying TCP to handle MPTCP options.
	 * A multipath subflow socket implies SS_NOFDREF state.
	 */

	/*
	 * Unlock, because tcp_usr_attach ends up in in_pcballoc, which takes
	 * the ipi-lock. We cannot hold the socket-lock at that point.
	 */
	socket_unlock(mp_so, 0);
	error = socreate_internal(dom, so, SOCK_STREAM, IPPROTO_TCP, p,
	    SOCF_MPTCP, PROC_NULL);
	socket_lock(mp_so, 0);
	if (error) {
		os_log_error(mptcp_log_handle, "%s - %lx: unable to create subflow socket error %d\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error);

		proc_rele(p);

		mptcp_subflow_free(mpts);
		return error;
	}

	/*
	 * We need to protect the setting of SOF_MP_SUBFLOW with a lock, because
	 * this marks the moment of lock-switch from the TCP-lock to the MPTCP-lock.
	 * Which is why we also need to get the lock with pr_getlock, as after
	 * setting the flag, socket_unlock will work on the MPTCP-level lock.
	 */
	subflow_mtx = ((*so)->so_proto->pr_getlock)(*so, 0);
	lck_mtx_lock(subflow_mtx);

	/*
	 * Must be the first thing we do, to make sure all pointers for this
	 * subflow are set.
	 */
	mptcp_subflow_attach(mpte, mpts, *so);

	/*
	 * A multipath subflow socket is used internally in the kernel,
	 * therefore it does not have a file desciptor associated by
	 * default.
	 */
	(*so)->so_state |= SS_NOFDREF;

	lck_mtx_unlock(subflow_mtx);

	/* prevent the socket buffers from being compressed */
	(*so)->so_rcv.sb_flags |= SB_NOCOMPRESS;
	(*so)->so_snd.sb_flags |= SB_NOCOMPRESS;

	/* Inherit preconnect and TFO data flags */
	if (mp_so->so_flags1 & SOF1_PRECONNECT_DATA) {
		(*so)->so_flags1 |= SOF1_PRECONNECT_DATA;
	}
	if (mp_so->so_flags1 & SOF1_DATA_IDEMPOTENT) {
		(*so)->so_flags1 |= SOF1_DATA_IDEMPOTENT;
	}
	if (mp_so->so_flags1 & SOF1_DATA_AUTHENTICATED) {
		(*so)->so_flags1 |= SOF1_DATA_AUTHENTICATED;
	}

	/* Inherit uuid and create the related flow. */
	if (!uuid_is_null(mpp->necp_client_uuid)) {
		struct mptcb *mp_tp = mpte->mpte_mptcb;

		sotoinpcb(*so)->necp_cb = mptcp_subflow_necp_cb;

		/*
		 * A note on the unlock: With MPTCP, we do multiple times a
		 * necp_client_register_socket_flow. This is problematic,
		 * because now the lock-ordering guarantee (first necp-locks,
		 * then socket-locks) is no more respected. So, we need to
		 * unlock here.
		 */
		socket_unlock(mp_so, 0);
		error = necp_client_register_socket_flow(mp_so->last_pid,
		    mpp->necp_client_uuid, sotoinpcb(*so));
		socket_lock(mp_so, 0);

		if (error) {
			os_log_error(mptcp_log_handle, "%s - %lx: necp_client_register_socket_flow failed with error %d\n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error);

			goto out_err;
		}

		/* Possible state-change during the unlock above */
		if (mp_tp->mpt_state >= MPTCPS_TIME_WAIT ||
		    (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP)) {
			os_log_error(mptcp_log_handle, "%s - %lx: state changed during unlock: %u flags %#x\n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
			    mp_tp->mpt_state, mp_tp->mpt_flags);

			error = EINVAL;
			goto out_err;
		}

		uuid_copy(sotoinpcb(*so)->necp_client_uuid, mpp->necp_client_uuid);
	}

	/*
	 * Duplicate the MP socket's NECP string attributes onto the subflow
	 * inpcb (including the trailing NUL).  Allocation failure leaves the
	 * attribute NULL - this is best-effort, not an error.
	 */
	if (mpp->inp_necp_attributes.inp_domain != NULL) {
		size_t string_size = strlen(mpp->inp_necp_attributes.inp_domain);
		sotoinpcb(*so)->inp_necp_attributes.inp_domain = kalloc_data(string_size + 1, Z_WAITOK | Z_ZERO);

		if (sotoinpcb(*so)->inp_necp_attributes.inp_domain) {
			memcpy(sotoinpcb(*so)->inp_necp_attributes.inp_domain, mpp->inp_necp_attributes.inp_domain, string_size + 1);
		}
	}
	if (mpp->inp_necp_attributes.inp_account != NULL) {
		size_t string_size = strlen(mpp->inp_necp_attributes.inp_account);
		sotoinpcb(*so)->inp_necp_attributes.inp_account = kalloc_data(string_size + 1, Z_WAITOK | Z_ZERO);

		if (sotoinpcb(*so)->inp_necp_attributes.inp_account) {
			memcpy(sotoinpcb(*so)->inp_necp_attributes.inp_account, mpp->inp_necp_attributes.inp_account, string_size + 1);
		}
	}

	if (mpp->inp_necp_attributes.inp_domain_owner != NULL) {
		size_t string_size = strlen(mpp->inp_necp_attributes.inp_domain_owner);
		sotoinpcb(*so)->inp_necp_attributes.inp_domain_owner = kalloc_data(string_size + 1, Z_WAITOK | Z_ZERO);

		if (sotoinpcb(*so)->inp_necp_attributes.inp_domain_owner) {
			memcpy(sotoinpcb(*so)->inp_necp_attributes.inp_domain_owner, mpp->inp_necp_attributes.inp_domain_owner, string_size + 1);
		}
	}

	if (mpp->inp_necp_attributes.inp_tracker_domain != NULL) {
		size_t string_size = strlen(mpp->inp_necp_attributes.inp_tracker_domain);
		sotoinpcb(*so)->inp_necp_attributes.inp_tracker_domain = kalloc_data(string_size + 1, Z_WAITOK | Z_ZERO);

		if (sotoinpcb(*so)->inp_necp_attributes.inp_tracker_domain) {
			memcpy(sotoinpcb(*so)->inp_necp_attributes.inp_tracker_domain, mpp->inp_necp_attributes.inp_tracker_domain, string_size + 1);
		}
	}

	/* Needs to happen prior to the delegation! */
	(*so)->last_pid = mp_so->last_pid;

	/* Propagate delegated identity (effective pid/uuid) to the subflow */
	if (mp_so->so_flags & SOF_DELEGATED) {
		if (mpte->mpte_epid) {
			error = so_set_effective_pid(*so, mpte->mpte_epid, p, false);
			if (error) {
				os_log_error(mptcp_log_handle, "%s - %lx: so_set_effective_pid failed with error %d\n",
				    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error);
				goto out_err;
			}
		}
		if (!uuid_is_null(mpte->mpte_euuid)) {
			error = so_set_effective_uuid(*so, mpte->mpte_euuid, p, false);
			if (error) {
				os_log_error(mptcp_log_handle, "%s - %lx: so_set_effective_uuid failed with error %d\n",
				    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error);
				goto out_err;
			}
		}
	}

	/* inherit the other socket options */
	bzero(&smpo, sizeof(smpo));
	smpo.mpo_flags |= MPOF_SUBFLOW_OK;
	smpo.mpo_level = SOL_SOCKET;
	smpo.mpo_intval = 1;

	/* disable SIGPIPE */
	smpo.mpo_name = SO_NOSIGPIPE;
	if ((error = mptcp_subflow_sosetopt(mpte, mpts, &smpo)) != 0) {
		goto out_err;
	}

	/* find out if the subflow's source address goes away */
	smpo.mpo_name = SO_NOADDRERR;
	if ((error = mptcp_subflow_sosetopt(mpte, mpts, &smpo)) != 0) {
		goto out_err;
	}

	if (mpte->mpte_mptcb->mpt_state >= MPTCPS_ESTABLISHED) {
		/*
		 * On secondary subflows we might need to set the cell-fallback
		 * flag (see conditions in mptcp_subflow_sosetopt).
		 */
		smpo.mpo_level = SOL_SOCKET;
		smpo.mpo_name = SO_MARK_CELLFALLBACK;
		smpo.mpo_intval = 1;
		if ((error = mptcp_subflow_sosetopt(mpte, mpts, &smpo)) != 0) {
			goto out_err;
		}
	}

	/* replay setsockopt(2) on the subflow sockets for eligible options */
	TAILQ_FOREACH_SAFE(mpo, &mpte->mpte_sopts, mpo_entry, tmpo) {
		int interim;

		if (!(mpo->mpo_flags & MPOF_SUBFLOW_OK)) {
			continue;
		}

		/*
		 * Skip those that are handled internally; these options
		 * should not have been recorded and marked with the
		 * MPOF_SUBFLOW_OK by mptcp_setopt(), but just in case.
		 */
		if (mpo->mpo_level == SOL_SOCKET &&
		    (mpo->mpo_name == SO_NOSIGPIPE ||
		    mpo->mpo_name == SO_NOADDRERR ||
		    mpo->mpo_name == SO_KEEPALIVE)) {
			continue;
		}

		/* A failed interim option is dropped rather than kept stale */
		interim = (mpo->mpo_flags & MPOF_INTERIM);
		if (mptcp_subflow_sosetopt(mpte, mpts, mpo) != 0 && interim) {
			os_log_error(mptcp_log_handle, "%s - %lx: sopt %s val %d interim record removed\n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
			    mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name),
			    mpo->mpo_intval);
			mptcp_sopt_remove(mpte, mpo);
			mptcp_sopt_free(mpo);
			continue;
		}
	}

	/*
	 * We need to receive everything that the subflow socket has,
	 * so use a customized socket receive function.  We will undo
	 * this when the socket is peeled off or closed.
	 */
	switch (dom) {
	case PF_INET:
		(*so)->so_proto = &mptcp_subflow_protosw;
		break;
	case PF_INET6:
		(*so)->so_proto = (struct protosw *)&mptcp_subflow_protosw6;
		break;
	default:
		VERIFY(0);
		/* NOTREACHED */
	}

	proc_rele(p);

	DTRACE_MPTCP3(subflow__create, struct mptses *, mpte,
	    int, dom, int, error);

	return 0;

out_err:
	mptcp_subflow_abort(mpts, error);

	proc_rele(p);

	return error;
}
1724 
1725 /*
1726  * Close an MPTCP subflow socket.
1727  *
1728  * Note that this may be called on an embryonic subflow, and the only
1729  * thing that is guaranteed valid is the protocol-user request.
1730  */
1731 static void
mptcp_subflow_soclose(struct mptsub * mpts)1732 mptcp_subflow_soclose(struct mptsub *mpts)
1733 {
1734 	struct socket *so = mpts->mpts_socket;
1735 
1736 	if (mpts->mpts_flags & MPTSF_CLOSED) {
1737 		return;
1738 	}
1739 
1740 	VERIFY(so != NULL);
1741 	VERIFY(so->so_flags & SOF_MP_SUBFLOW);
1742 	VERIFY((so->so_state & (SS_NBIO | SS_NOFDREF)) == (SS_NBIO | SS_NOFDREF));
1743 
1744 	DTRACE_MPTCP5(subflow__close, struct mptsub *, mpts,
1745 	    struct socket *, so,
1746 	    struct sockbuf *, &so->so_rcv,
1747 	    struct sockbuf *, &so->so_snd,
1748 	    struct mptses *, mpts->mpts_mpte);
1749 
1750 	mpts->mpts_flags |= MPTSF_CLOSED;
1751 
1752 	if (so->so_retaincnt == 0) {
1753 		soclose_locked(so);
1754 
1755 		return;
1756 	} else {
1757 		VERIFY(so->so_usecount > 0);
1758 		so->so_usecount--;
1759 	}
1760 
1761 	return;
1762 }
1763 
1764 static void
mptcp_attach_to_subf(struct socket * so,struct mptcb * mp_tp,uint8_t addr_id)1765 mptcp_attach_to_subf(struct socket *so, struct mptcb *mp_tp, uint8_t addr_id)
1766 {
1767 	struct tcpcb *tp = sototcpcb(so);
1768 	struct mptcp_subf_auth_entry *sauth_entry;
1769 
1770 	/*
1771 	 * The address ID of the first flow is implicitly 0.
1772 	 */
1773 	if (mp_tp->mpt_state == MPTCPS_CLOSED) {
1774 		tp->t_local_aid = 0;
1775 	} else {
1776 		tp->t_local_aid = addr_id;
1777 		tp->t_mpflags |= (TMPF_PREESTABLISHED | TMPF_JOINED_FLOW);
1778 		so->so_flags |= SOF_MP_SEC_SUBFLOW;
1779 	}
1780 	sauth_entry = zalloc(mpt_subauth_zone);
1781 	sauth_entry->msae_laddr_id = tp->t_local_aid;
1782 	sauth_entry->msae_raddr_id = 0;
1783 	sauth_entry->msae_raddr_rand = 0;
1784 try_again:
1785 	sauth_entry->msae_laddr_rand = RandomULong();
1786 	if (sauth_entry->msae_laddr_rand == 0) {
1787 		goto try_again;
1788 	}
1789 	LIST_INSERT_HEAD(&mp_tp->mpt_subauth_list, sauth_entry, msae_next);
1790 }
1791 
1792 static void
mptcp_detach_mptcb_from_subf(struct mptcb * mp_tp,struct socket * so)1793 mptcp_detach_mptcb_from_subf(struct mptcb *mp_tp, struct socket *so)
1794 {
1795 	struct mptcp_subf_auth_entry *sauth_entry;
1796 	struct tcpcb *tp = NULL;
1797 	int found = 0;
1798 
1799 	tp = sototcpcb(so);
1800 	if (tp == NULL) {
1801 		return;
1802 	}
1803 
1804 	LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
1805 		if (sauth_entry->msae_laddr_id == tp->t_local_aid) {
1806 			found = 1;
1807 			break;
1808 		}
1809 	}
1810 	if (found) {
1811 		LIST_REMOVE(sauth_entry, msae_next);
1812 	}
1813 
1814 	if (found) {
1815 		zfree(mpt_subauth_zone, sauth_entry);
1816 	}
1817 }
1818 
/*
 * Connect an MPTCP subflow socket.
 *
 * Note that in the pending connect case, the subflow socket may have been
 * bound to an interface and/or a source IP address which may no longer be
 * around by the time this routine is called; in that case the connect attempt
 * will most likely fail.
 *
 * Sets up the subflow's MP_JOIN state, issues the connect, records the
 * subflow's ISS, and advances the session's address-ID counter.
 * Returns 0 or an errno value from proc lookup / soconnectxlocked().
 */
static int
mptcp_subflow_soconnectx(struct mptses *mpte, struct mptsub *mpts)
{
	char dbuf[MAX_IPv6_STR_LEN];
	struct socket *mp_so, *so;
	struct mptcb *mp_tp;
	struct sockaddr *dst;
	struct proc *p;
	int af, error, dport;

	mp_so = mptetoso(mpte);
	mp_tp = mpte->mpte_mptcb;
	so = mpts->mpts_socket;
	af = mpts->mpts_dst.sa_family;
	dst = &mpts->mpts_dst;

	VERIFY((mpts->mpts_flags & (MPTSF_CONNECTING | MPTSF_CONNECTED)) == MPTSF_CONNECTING);
	VERIFY(mpts->mpts_socket != NULL);
	VERIFY(af == AF_INET || af == AF_INET6);

	/* Render the destination address/port for logging */
	if (af == AF_INET) {
		inet_ntop(af, &SIN(dst)->sin_addr.s_addr, dbuf, sizeof(dbuf));
		dport = ntohs(SIN(dst)->sin_port);
	} else {
		inet_ntop(af, &SIN6(dst)->sin6_addr, dbuf, sizeof(dbuf));
		dport = ntohs(SIN6(dst)->sin6_port);
	}

	os_log(mptcp_log_handle,
	    "%s - %lx: ifindex %u dst %s:%d pended %u\n", __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
	    mpts->mpts_ifscope, dbuf, dport, !!(mpts->mpts_flags & MPTSF_CONNECT_PENDING));

	/* Connect on behalf of the MP socket's owning process */
	p = proc_find(mp_so->last_pid);
	if (p == PROC_NULL) {
		os_log_error(mptcp_log_handle, "%s - %lx: Couldn't find proc for pid %u\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mp_so->last_pid);

		return ESRCH;
	}

	mpts->mpts_flags &= ~MPTSF_CONNECT_PENDING;

	/* Assign the address ID and auth state before the SYN goes out */
	mptcp_attach_to_subf(so, mpte->mpte_mptcb, mpte->mpte_addrid_last);

	/* connect the subflow socket */
	error = soconnectxlocked(so, mpts->mpts_src, &mpts->mpts_dst,
	    p, mpts->mpts_ifscope,
	    mpte->mpte_associd, NULL, 0, NULL, 0, NULL, NULL);

	/* Remember this subflow's initial send sequence number */
	mpts->mpts_iss = sototcpcb(so)->iss;

	/* See tcp_connect_complete */
	if (mp_tp->mpt_state < MPTCPS_ESTABLISHED &&
	    (mp_so->so_flags1 & SOF1_PRECONNECT_DATA)) {
		mp_tp->mpt_sndwnd = sototcpcb(so)->snd_wnd;
	}

	/* Allocate a unique address id per subflow */
	mpte->mpte_addrid_last++;
	if (mpte->mpte_addrid_last == 0) {
		/* skip 0 on wrap: ID 0 is implicitly the first flow's */
		mpte->mpte_addrid_last++;
	}

	proc_rele(p);

	DTRACE_MPTCP3(subflow__connect, struct mptses *, mpte,
	    struct mptsub *, mpts, int, error);
	if (error) {
		os_log_error(mptcp_log_handle, "%s - %lx: connectx failed with error %d ifscope %u\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error, mpts->mpts_ifscope);
	}

	return error;
}
1901 
/*
 * Stamp (and, if needed, split) a received mbuf so that it carries exactly
 * the portion of the DSS mapping it covers.
 *
 * 'off' is how far into the mapping (dsn/rseq/dlen) this mbuf starts; the
 * mbuf's pkthdr is rewritten to the offset-adjusted DSN/relative-seq and to
 * its own length.  If the mbuf extends past the right edge of the mapping it
 * is split at the edge and the remainder is re-accounted in so_rcv.
 *
 * Returns 0 on success, -1 when the mapping is inconsistent or the split
 * failed; in the -1 case an RST event has already been posted on the
 * subflow via soevent().
 */
static int
mptcp_adj_rmap(struct socket *so, struct mbuf *m, int off, uint64_t dsn,
    uint32_t rseq, uint16_t dlen, uint8_t dfin)
{
	struct mptsub *mpts = sototcpcb(so)->t_mpsub;

	/* Empty mbuf: nothing to map */
	if (m_pktlen(m) == 0) {
		return 0;
	}

	/* No pkthdr, so no place to store a mapping */
	if (!(m->m_flags & M_PKTHDR)) {
		return 0;
	}

	if (m->m_pkthdr.pkt_flags & PKTF_MPTCP) {
		/*
		 * A continuation mbuf (off != 0) that already carries a
		 * mapping must agree with the mapping being applied;
		 * otherwise the peer sent overlapping, inconsistent DSS
		 * options and the subflow must be reset.
		 */
		if (off && (dsn != m->m_pkthdr.mp_dsn ||
		    rseq != m->m_pkthdr.mp_rseq ||
		    dlen != m->m_pkthdr.mp_rlen ||
		    dfin != !!(m->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN))) {
			os_log_error(mptcp_log_handle, "%s - %lx: Received incorrect second mapping: DSN: %u - %u , SSN: %u - %u, DLEN: %u - %u, DFIN: %u - %u\n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpts->mpts_mpte),
			    (uint32_t)dsn, (uint32_t)m->m_pkthdr.mp_dsn,
			    rseq, m->m_pkthdr.mp_rseq,
			    dlen, m->m_pkthdr.mp_rlen,
			    dfin, !!(m->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN));

			soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
			return -1;
		}
	}

	/* If mbuf is beyond right edge of the mapping, we need to split */
	if (m_pktlen(m) > dlen - dfin - off) {
		struct mbuf *new = m_split(m, dlen - dfin - off, M_DONTWAIT);
		if (new == NULL) {
			os_log_error(mptcp_log_handle, "%s - %lx: m_split failed dlen %u dfin %u off %d pktlen %d, killing subflow %d",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpts->mpts_mpte),
			    dlen, dfin, off, m_pktlen(m),
			    mpts->mpts_connid);

			soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
			return -1;
		}

		/*
		 * Re-chain the split-off remainder and restore the receive
		 * buffer accounting that m_split does not know about.
		 */
		m->m_next = new;
		sballoc(&so->so_rcv, new);
		/* Undo, as sballoc will add to it as well */
		so->so_rcv.sb_cc -= new->m_len;

		if (so->so_rcv.sb_mbtail == m) {
			so->so_rcv.sb_mbtail = new;
		}
	}

	/* Stamp this mbuf with its offset-adjusted share of the mapping */
	m->m_pkthdr.pkt_flags |= PKTF_MPTCP;
	m->m_pkthdr.mp_dsn = dsn + off;
	m->m_pkthdr.mp_rseq = rseq + off;
	VERIFY(m_pktlen(m) < UINT16_MAX);
	m->m_pkthdr.mp_rlen = (uint16_t)m_pktlen(m);

	/* Only put the DATA_FIN-flag on the last mbuf of this mapping */
	if (dfin) {
		if (m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen < dsn + dlen - dfin) {
			m->m_pkthdr.pkt_flags &= ~PKTF_MPTCP_DFIN;
		} else {
			m->m_pkthdr.pkt_flags |= PKTF_MPTCP_DFIN;
		}
	}


	/* A valid mapping was seen, the subflow is past the handshake phase */
	mpts->mpts_flags |= MPTSF_FULLY_ESTABLISHED;

	return 0;
}
1976 
1977 /*
1978  * Update the pid, upid, uuid of the subflow so, based on parent so
1979  */
1980 static void
mptcp_update_last_owner(struct socket * so,struct socket * mp_so)1981 mptcp_update_last_owner(struct socket *so, struct socket *mp_so)
1982 {
1983 	if (so->last_pid != mp_so->last_pid ||
1984 	    so->last_upid != mp_so->last_upid) {
1985 		so->last_upid = mp_so->last_upid;
1986 		so->last_pid = mp_so->last_pid;
1987 		uuid_copy(so->last_uuid, mp_so->last_uuid);
1988 	}
1989 	so_update_policy(so);
1990 }
1991 
/*
 * MPTCP subflow socket receive routine, derived from soreceive().
 *
 * Drains complete DSS mappings from the subflow's receive buffer into *mp0
 * for consumption by the MPTCP layer.  Never blocks (MSG_DONTWAIT|MSG_NBIO
 * are forced); returns EWOULDBLOCK when a mapping is not yet complete.
 * In fallback mode, mbufs are passed up verbatim without mapping checks.
 */
static int
mptcp_subflow_soreceive(struct socket *so, struct sockaddr **psa,
    struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{
#pragma unused(uio)
	struct socket *mp_so;
	struct mptses *mpte;
	struct mptcb *mp_tp;
	int flags, error = 0;
	struct mbuf *m, **mp = mp0;

	mpte = tptomptp(sototcpcb(so))->mpt_mpte;
	mp_so = mptetoso(mpte);
	mp_tp = mpte->mpte_mptcb;

	VERIFY(so->so_proto->pr_flags & PR_CONNREQUIRED);

#ifdef MORE_LOCKING_DEBUG
	if (so->so_usecount == 1) {
		panic("%s: so=%x no other reference on socket", __func__, so);
		/* NOTREACHED */
	}
#endif
	/*
	 * We return all that is there in the subflow's socket receive buffer
	 * to the MPTCP layer, so we require that the caller passes in the
	 * expected parameters.
	 */
	if (mp == NULL || controlp != NULL) {
		return EINVAL;
	}

	*mp = NULL;
	if (psa != NULL) {
		*psa = NULL;
	}
	if (flagsp != NULL) {
		flags = *flagsp & ~MSG_EOR;
	} else {
		flags = 0;
	}

	if (flags & (MSG_PEEK | MSG_OOB | MSG_NEEDSA | MSG_WAITALL | MSG_WAITSTREAM)) {
		return EOPNOTSUPP;
	}

	/* This routine must never sleep; force non-blocking semantics */
	flags |= (MSG_DONTWAIT | MSG_NBIO);

	/*
	 * If a recv attempt is made on a previously-accepted socket
	 * that has been marked as inactive (disconnected), reject
	 * the request.
	 */
	if (so->so_flags & SOF_DEFUNCT) {
		struct sockbuf *sb = &so->so_rcv;

		error = ENOTCONN;
		/*
		 * This socket should have been disconnected and flushed
		 * prior to being returned from sodefunct(); there should
		 * be no data on its receive list, so panic otherwise.
		 */
		if (so->so_state & SS_DEFUNCT) {
			sb_empty_assert(sb, __func__);
		}
		return error;
	}

	/*
	 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
	 * and if so just return to the caller.  This could happen when
	 * soreceive() is called by a socket upcall function during the
	 * time the socket is freed.  The socket buffer would have been
	 * locked across the upcall, therefore we cannot put this thread
	 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
	 * we may livelock), because the lock on the socket buffer will
	 * only be released when the upcall routine returns to its caller.
	 * Because the socket has been officially closed, there can be
	 * no further read on it.
	 *
	 * A multipath subflow socket would have its SS_NOFDREF set by
	 * default, so check for SOF_MP_SUBFLOW socket flag; when the
	 * socket is closed for real, SOF_MP_SUBFLOW would be cleared.
	 */
	if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
	    (SS_NOFDREF | SS_CANTRCVMORE) && !(so->so_flags & SOF_MP_SUBFLOW)) {
		return 0;
	}

	/*
	 * For consistency with soreceive() semantics, we need to obey
	 * SB_LOCK in case some other code path has locked the buffer.
	 */
	error = sblock(&so->so_rcv, 0);
	if (error != 0) {
		return error;
	}

	m = so->so_rcv.sb_mb;
	if (m == NULL) {
		/*
		 * Panic if we notice inconsistencies in the socket's
		 * receive list; both sb_mb and sb_cc should correctly
		 * reflect the contents of the list, otherwise we may
		 * end up with false positives during select() or poll()
		 * which could put the application in a bad state.
		 */
		SB_MB_CHECK(&so->so_rcv);

		if (so->so_error != 0) {
			error = so->so_error;
			so->so_error = 0;
			goto release;
		}

		if (so->so_state & SS_CANTRCVMORE) {
			goto release;
		}

		if (!(so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING))) {
			error = ENOTCONN;
			goto release;
		}

		/*
		 * MSG_DONTWAIT is implicitly defined and this routine will
		 * never block, so return EWOULDBLOCK when there is nothing.
		 */
		error = EWOULDBLOCK;
		goto release;
	}

	mptcp_update_last_owner(so, mp_so);

	SBLASTRECORDCHK(&so->so_rcv, "mptcp_subflow_soreceive 1");
	SBLASTMBUFCHK(&so->so_rcv, "mptcp_subflow_soreceive 1");

	/* Process one DSS mapping (or one fallback mbuf) per iteration */
	while (m != NULL) {
		int dlen = 0, error_out = 0, off = 0;
		uint8_t dfin = 0;
		struct mbuf *start = m;
		uint64_t dsn;
		uint32_t sseq;
		uint16_t orig_dlen;
		uint16_t csum;

		VERIFY(m->m_nextpkt == NULL);

		if (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) {
fallback:
			/* Just move mbuf to MPTCP-level */

			sbfree(&so->so_rcv, m);

			if (mp != NULL) {
				*mp = m;
				mp = &m->m_next;
				so->so_rcv.sb_mb = m = m->m_next;
				*mp = NULL;
			}

			if (m != NULL) {
				so->so_rcv.sb_lastrecord = m;
			} else {
				SB_EMPTY_FIXUP(&so->so_rcv);
			}

			continue;
		} else if (!(m->m_flags & M_PKTHDR) || !(m->m_pkthdr.pkt_flags & PKTF_MPTCP)) {
			struct mptsub *mpts = sototcpcb(so)->t_mpsub;
			boolean_t found_mapping = false;
			int parsed_length = 0;
			struct mbuf *m_iter;

			/*
			 * No MPTCP-option in the header. Either fallback or
			 * wait for additional mappings.
			 */
			if (!(mpts->mpts_flags & MPTSF_FULLY_ESTABLISHED)) {
				/* data arrived without a DSS option mapping */

				/* initial subflow can fallback right after SYN handshake */
				if (mpts->mpts_flags & MPTSF_INITIAL_SUB) {
					mptcp_notify_mpfail(so);

					goto fallback;
				} else {
					os_log_error(mptcp_log_handle, "%s - %lx: No DSS on secondary subflow. Killing %d\n",
					    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
					    mpts->mpts_connid);
					soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);

					error = EIO;
					*mp0 = NULL;
					goto release;
				}
			}

			/* Thus, let's look for an mbuf with the mapping */
			m_iter = m->m_next;
			parsed_length = m->m_len;
			while (m_iter != NULL && parsed_length < UINT16_MAX) {
				if (!(m_iter->m_flags & M_PKTHDR) || !(m_iter->m_pkthdr.pkt_flags & PKTF_MPTCP)) {
					parsed_length += m_iter->m_len;
					m_iter = m_iter->m_next;
					continue;
				}

				found_mapping = true;

				/* Found an mbuf with a DSS-mapping */
				orig_dlen = dlen = m_iter->m_pkthdr.mp_rlen;
				dsn = m_iter->m_pkthdr.mp_dsn;
				sseq = m_iter->m_pkthdr.mp_rseq;
				csum = m_iter->m_pkthdr.mp_csum;

				/* DATA_FIN consumes one byte of the mapping's DSN space */
				if (m_iter->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN) {
					dfin = 1;
					dlen--;
				}

				break;
			}

			if (!found_mapping && parsed_length < UINT16_MAX) {
				/* Mapping not yet present, we can wait! */
				if (*mp0 == NULL) {
					error = EWOULDBLOCK;
				}
				goto release;
			} else if (!found_mapping && parsed_length >= UINT16_MAX) {
				os_log_error(mptcp_log_handle, "%s - %lx: Received more than 64KB without DSS mapping. Killing %d\n",
				    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
				    mpts->mpts_connid);
				/* Received 64KB without DSS-mapping. We should kill the subflow */
				soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);

				error = EIO;
				*mp0 = NULL;
				goto release;
			}
		} else {
			/* The leading mbuf itself carries the DSS-mapping */
			orig_dlen = dlen = m->m_pkthdr.mp_rlen;
			dsn = m->m_pkthdr.mp_dsn;
			sseq = m->m_pkthdr.mp_rseq;
			csum = m->m_pkthdr.mp_csum;

			/* DATA_FIN consumes one byte of the mapping's DSN space */
			if (m->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN) {
				dfin = 1;
				dlen--;
			}
		}

		/*
		 * Check if the full mapping is now present
		 */
		if ((int)so->so_rcv.sb_cc < dlen) {
			if (*mp0 == NULL) {
				error = EWOULDBLOCK;
			}
			goto release;
		}

		/* Now, get the full mapping */
		off = 0;
		while (dlen > 0) {
			/* Stamp/split each mbuf covering a slice of the mapping */
			if (mptcp_adj_rmap(so, m, off, dsn, sseq, orig_dlen, dfin)) {
				error_out = 1;
				error = EIO;
				dlen = 0;
				*mp0 = NULL;
				break;
			}

			dlen -= m->m_len;
			off += m->m_len;
			sbfree(&so->so_rcv, m);

			/* Unlink from so_rcv and append to the caller's chain */
			if (mp != NULL) {
				*mp = m;
				mp = &m->m_next;
				so->so_rcv.sb_mb = m = m->m_next;
				*mp = NULL;
			}

			ASSERT(dlen == 0 || m);
			if (dlen != 0 && m == NULL) {
				/* "try" to gracefully recover on customer builds */
				error_out = 1;
				error = EIO;
				dlen  = 0;

				*mp0 = NULL;

				SB_EMPTY_FIXUP(&so->so_rcv);
				soevent(so, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);

				break;
			}
		}

		VERIFY(dlen == 0);

		if (m != NULL) {
			so->so_rcv.sb_lastrecord = m;
		} else {
			SB_EMPTY_FIXUP(&so->so_rcv);
		}

		if (error_out) {
			goto release;
		}

		/* Verify the DSS checksum over the completed mapping */
		if (mptcp_validate_csum(sototcpcb(so), start, dsn, sseq, orig_dlen, csum, dfin)) {
			error = EIO;
			*mp0 = NULL;
			goto release;
		}

		SBLASTRECORDCHK(&so->so_rcv, "mptcp_subflow_soreceive 2");
		SBLASTMBUFCHK(&so->so_rcv, "mptcp_subflow_soreceive 2");
	}

	DTRACE_MPTCP3(subflow__receive, struct socket *, so,
	    struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd);

	if (flagsp != NULL) {
		*flagsp |= flags;
	}

release:
	sbunlock(&so->so_rcv, TRUE);

	return error;
}
2330 
/*
 * MPTCP subflow socket send routine, derived from sosend().
 *
 * Pushes a single, fully-formed MPTCP chain ('top', already stamped with
 * PKTF_MPTCP and bounded to UINT16_MAX bytes) down to the subflow's TCP.
 * The caller must hold the socket lock; no uio/control/address paths are
 * supported here.  Takes ownership of 'top' (freed on error).
 */
static int
mptcp_subflow_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags)
{
	struct socket *mp_so = mptetoso(tptomptp(sototcpcb(so))->mpt_mpte);
	boolean_t en_tracing = FALSE, proc_held = FALSE;
	struct proc *p = current_proc();
	int en_tracing_val;
	int sblocked = 1; /* Pretend as if it is already locked, so we won't relock it */
	int error;

	/* This path only supports mbuf-chain sends with no extras */
	VERIFY(control == NULL);
	VERIFY(addr == NULL);
	VERIFY(uio == NULL);
	VERIFY(flags == 0);
	VERIFY((so->so_flags & SOF_CONTENT_FILTER) == 0);

	VERIFY(top->m_pkthdr.len > 0 && top->m_pkthdr.len <= UINT16_MAX);
	VERIFY(top->m_pkthdr.pkt_flags & PKTF_MPTCP);

	/*
	 * trace if tracing & network (vs. unix) sockets & and
	 * non-loopback
	 */
	if (ENTR_SHOULDTRACE &&
	    (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
		struct inpcb *inp = sotoinpcb(so);
		if (inp->inp_last_outifp != NULL &&
		    !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
			en_tracing = TRUE;
			en_tracing_val = top->m_pkthdr.len;
			KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_START,
			    (unsigned long)VM_KERNEL_ADDRPERM(so),
			    ((so->so_state & SS_NBIO) ? kEnTrFlagNonBlocking : 0),
			    (int64_t)en_tracing_val);
		}
	}

	mptcp_update_last_owner(so, mp_so);

	/*
	 * Attribute the send to the app that owns the meta-socket; take a
	 * proc reference if that is not the current process.
	 */
	if (mp_so->last_pid != proc_pid(p)) {
		p = proc_find(mp_so->last_pid);
		if (p == PROC_NULL) {
			p = current_proc();
		} else {
			proc_held = TRUE;
		}
	}

#if NECP
	inp_update_necp_policy(sotoinpcb(so), NULL, NULL, 0);
#endif /* NECP */

	error = sosendcheck(so, NULL, top->m_pkthdr.len, 0, 1, 0, &sblocked);
	if (error) {
		goto out;
	}

	/* Hand the chain to TCP; pru_send consumes 'top' even on error */
	error = (*so->so_proto->pr_usrreqs->pru_send)(so, 0, top, NULL, NULL, p);
	top = NULL;

out:
	/* Only reached with top != NULL when sosendcheck failed */
	if (top != NULL) {
		m_freem(top);
	}

	if (proc_held) {
		proc_rele(p);
	}

	soclearfastopen(so);

	if (en_tracing) {
		KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_END,
		    (unsigned long)VM_KERNEL_ADDRPERM(so),
		    ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
		    (int64_t)en_tracing_val);
	}

	return error;
}
2415 
2416 /*
2417  * Subflow socket write upcall.
2418  *
2419  * Called when the associated subflow socket posted a read event.
2420  */
2421 static void
mptcp_subflow_wupcall(struct socket * so,void * arg,int waitf)2422 mptcp_subflow_wupcall(struct socket *so, void *arg, int waitf)
2423 {
2424 #pragma unused(so, waitf)
2425 	struct mptsub *mpts = arg;
2426 	struct mptses *mpte = mpts->mpts_mpte;
2427 
2428 	VERIFY(mpte != NULL);
2429 
2430 	if (mptcp_should_defer_upcall(mpte->mpte_mppcb)) {
2431 		if (!(mpte->mpte_mppcb->mpp_flags & MPP_WUPCALL)) {
2432 			mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_WWAKEUP;
2433 		}
2434 		return;
2435 	}
2436 
2437 	mptcp_output(mpte);
2438 }
2439 
2440 /*
2441  * Subflow socket control event upcall.
2442  */
2443 static void
mptcp_subflow_eupcall1(struct socket * so,void * arg,uint32_t events)2444 mptcp_subflow_eupcall1(struct socket *so, void *arg, uint32_t events)
2445 {
2446 #pragma unused(so)
2447 	struct mptsub *mpts = arg;
2448 	struct mptses *mpte = mpts->mpts_mpte;
2449 
2450 	socket_lock_assert_owned(mptetoso(mpte));
2451 
2452 	if ((mpts->mpts_evctl & events) == events) {
2453 		return;
2454 	}
2455 
2456 	mpts->mpts_evctl |= events;
2457 
2458 	if (mptcp_should_defer_upcall(mpte->mpte_mppcb)) {
2459 		mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_WORKLOOP;
2460 		return;
2461 	}
2462 
2463 	mptcp_subflow_workloop(mpte);
2464 }
2465 
/*
 * Establish an initial MPTCP connection (if first subflow and not yet
 * connected), or add a subflow to an existing MPTCP connection.
 *
 * 'src' may be NULL (unspecified local address); 'dst' is required and must
 * be AF_INET/AF_INET6 with a matching sa_len.  On success the new subflow's
 * connection id is returned through 'pcid' (if non-NULL).
 *
 * Ownership note: once mptcp_subflow_socreate() has been called, it owns
 * 'mpts'; before that point, error paths free it here (out_err), and after
 * a failed connect the subflow is aborted (out_err_close).
 */
int
mptcp_subflow_add(struct mptses *mpte, struct sockaddr *src,
    struct sockaddr *dst, uint32_t ifscope, sae_connid_t *pcid)
{
	struct socket *mp_so, *so = NULL;
	struct mptcb *mp_tp;
	struct mptsub *mpts = NULL;
	int af, error = 0;

	mp_so = mptetoso(mpte);
	mp_tp = mpte->mpte_mptcb;

	socket_lock_assert_owned(mp_so);

	if (mp_tp->mpt_state >= MPTCPS_CLOSE_WAIT) {
		/* If the remote end sends Data FIN, refuse subflow adds */
		os_log_error(mptcp_log_handle, "%s - %lx: state %u\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mp_tp->mpt_state);
		error = ENOTCONN;
		goto out_err;
	}

	/* Cap the number of concurrent subflows per connection */
	if (mpte->mpte_numflows > MPTCP_MAX_NUM_SUBFLOWS) {
		error = EOVERFLOW;
		goto out_err;
	}

	mpts = mptcp_subflow_alloc();
	if (mpts == NULL) {
		os_log_error(mptcp_log_handle, "%s - %lx: malloc subflow failed\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
		error = ENOMEM;
		goto out_err;
	}

	/* Validate and copy the (optional) source address */
	if (src) {
		if (src->sa_family != AF_INET && src->sa_family != AF_INET6) {
			error = EAFNOSUPPORT;
			goto out_err;
		}

		if (src->sa_family == AF_INET &&
		    src->sa_len != sizeof(struct sockaddr_in)) {
			error = EINVAL;
			goto out_err;
		}

		if (src->sa_family == AF_INET6 &&
		    src->sa_len != sizeof(struct sockaddr_in6)) {
			error = EINVAL;
			goto out_err;
		}

		mpts->mpts_src = (struct sockaddr *)alloc_sockaddr(src->sa_len,
		    Z_WAITOK | Z_NOFAIL);

		bcopy(src, mpts->mpts_src, src->sa_len);
	}

	/* Validate and copy the (mandatory) destination address */
	if (dst->sa_family != AF_INET && dst->sa_family != AF_INET6) {
		error = EAFNOSUPPORT;
		goto out_err;
	}

	if (dst->sa_family == AF_INET &&
	    dst->sa_len != sizeof(mpts->__mpts_dst_v4)) {
		error = EINVAL;
		goto out_err;
	}

	if (dst->sa_family == AF_INET6 &&
	    dst->sa_len != sizeof(mpts->__mpts_dst_v6)) {
		error = EINVAL;
		goto out_err;
	}

	memcpy(&mpts->mpts_u_dst, dst, dst->sa_len);

	af = mpts->mpts_dst.sa_family;

	/* Reject interface scopes beyond the current interface table */
	ifnet_head_lock_shared();
	if ((ifscope > (unsigned)if_index)) {
		ifnet_head_done();
		error = ENXIO;
		goto out_err;
	}
	ifnet_head_done();

	mpts->mpts_ifscope = ifscope;

	/* create the subflow socket */
	if ((error = mptcp_subflow_socreate(mpte, mpts, af, &so)) != 0) {
		/*
		 * Returning (error) and not cleaning up, because up to here
		 * all we did is creating mpts.
		 *
		 * And the contract is that the call to mptcp_subflow_socreate,
		 * moves ownership of mpts to mptcp_subflow_socreate.
		 */
		return error;
	}

	/*
	 * We may be called from within the kernel. Still need to account this
	 * one to the real app.
	 */
	mptcp_update_last_owner(mpts->mpts_socket, mp_so);

	/*
	 * Increment the counter, while avoiding 0 (SAE_CONNID_ANY) and
	 * -1 (SAE_CONNID_ALL).
	 */
	mpte->mpte_connid_last++;
	if (mpte->mpte_connid_last == SAE_CONNID_ALL ||
	    mpte->mpte_connid_last == SAE_CONNID_ANY) {
		mpte->mpte_connid_last++;
	}

	mpts->mpts_connid = mpte->mpte_connid_last;

	mpts->mpts_rel_seq = 1;

	/* Allocate a unique address id per subflow */
	mpte->mpte_addrid_last++;
	if (mpte->mpte_addrid_last == 0) {
		mpte->mpte_addrid_last++;
	}

	/* register for subflow socket read/write events */
	sock_setupcalls_locked(so, NULL, NULL, mptcp_subflow_wupcall, mpts, 1);

	/* Register for subflow socket control events */
	sock_catchevents_locked(so, mptcp_subflow_eupcall1, mpts,
	    SO_FILT_HINT_CONNRESET | SO_FILT_HINT_CANTRCVMORE |
	    SO_FILT_HINT_TIMEOUT | SO_FILT_HINT_NOSRCADDR |
	    SO_FILT_HINT_IFDENIED | SO_FILT_HINT_CONNECTED |
	    SO_FILT_HINT_DISCONNECTED | SO_FILT_HINT_MPFAILOVER |
	    SO_FILT_HINT_MPSTATUS | SO_FILT_HINT_MUSTRST |
	    SO_FILT_HINT_MPCANTRCVMORE | SO_FILT_HINT_ADAPTIVE_RTIMO |
	    SO_FILT_HINT_ADAPTIVE_WTIMO | SO_FILT_HINT_MP_SUB_ERROR);

	/* sanity check */
	VERIFY(!(mpts->mpts_flags &
	    (MPTSF_CONNECTING | MPTSF_CONNECTED | MPTSF_CONNECT_PENDING)));

	/*
	 * Indicate to the TCP subflow whether or not it should establish
	 * the initial MPTCP connection, or join an existing one.  Fill
	 * in the connection request structure with additional info needed
	 * by the underlying TCP (to be used in the TCP options, etc.)
	 */
	if (mp_tp->mpt_state < MPTCPS_ESTABLISHED && mpte->mpte_numflows == 1) {
		mpts->mpts_flags |= MPTSF_INITIAL_SUB;

		if (mp_tp->mpt_state == MPTCPS_CLOSED) {
			mptcp_init_local_parms(mpte, dst);
		}
		soisconnecting(mp_so);

		/* If fastopen is requested, set state in mpts */
		if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
			mpts->mpts_flags |= MPTSF_TFO_REQD;
		}
	} else {
		/* Joins must wait until the peer is ready for MP_JOIN */
		if (!(mp_tp->mpt_flags & MPTCPF_JOIN_READY)) {
			mpts->mpts_flags |= MPTSF_CONNECT_PENDING;
		}
	}

	mpts->mpts_flags |= MPTSF_CONNECTING;

	/* connect right away if first attempt, or if join can be done now */
	if (!(mpts->mpts_flags & MPTSF_CONNECT_PENDING)) {
		error = mptcp_subflow_soconnectx(mpte, mpts);
	}

	if (error) {
		goto out_err_close;
	}

	if (pcid) {
		*pcid = mpts->mpts_connid;
	}

	return 0;

out_err_close:
	/* Socket exists: tear the subflow down through the abort path */
	mptcp_subflow_abort(mpts, error);

	return error;

out_err:
	/* No socket yet: we still own mpts and must free it ourselves */
	if (mpts) {
		mptcp_subflow_free(mpts);
	}

	return error;
}
2668 
2669 void
mptcpstats_update(struct mptcp_itf_stats * stats,const struct mptsub * mpts)2670 mptcpstats_update(struct mptcp_itf_stats *stats, const struct mptsub *mpts)
2671 {
2672 	int index = mptcpstats_get_index(stats, mpts);
2673 
2674 	if (index != -1) {
2675 		struct inpcb *inp = sotoinpcb(mpts->mpts_socket);
2676 
2677 		stats[index].mpis_txbytes += inp->inp_stat->txbytes;
2678 		stats[index].mpis_rxbytes += inp->inp_stat->rxbytes;
2679 
2680 		stats[index].mpis_wifi_txbytes += inp->inp_wstat->txbytes;
2681 		stats[index].mpis_wifi_rxbytes += inp->inp_wstat->rxbytes;
2682 
2683 		stats[index].mpis_wired_txbytes += inp->inp_Wstat->txbytes;
2684 		stats[index].mpis_wired_rxbytes += inp->inp_Wstat->rxbytes;
2685 
2686 		stats[index].mpis_cell_txbytes += inp->inp_cstat->txbytes;
2687 		stats[index].mpis_cell_rxbytes += inp->inp_cstat->rxbytes;
2688 	}
2689 }
2690 
/*
 * Delete/remove a subflow from an MPTCP.  The underlying subflow socket
 * will no longer be accessible after a subflow is deleted, thus this
 * should occur only after the subflow socket has been disconnected.
 */
void
mptcp_subflow_del(struct mptses *mpte, struct mptsub *mpts)
{
	struct socket *mp_so = mptetoso(mpte);
	struct socket *so = mpts->mpts_socket;
	struct tcpcb *tp = sototcpcb(so);

	socket_lock_assert_owned(mp_so);
	VERIFY(mpts->mpts_mpte == mpte);
	VERIFY(mpts->mpts_flags & MPTSF_ATTACHED);
	VERIFY(mpte->mpte_numflows != 0);
	VERIFY(mp_so->so_usecount > 0);

	/* Fold the subflow's final byte counters into the session stats */
	mptcpstats_update(mpte->mpte_itfstats, mpts);

	mptcp_unset_cellicon(mpte, mpts, 1);

	/* Remember the subflow's totals so the session baseline survives it */
	mpte->mpte_init_rxbytes = sotoinpcb(so)->inp_stat->rxbytes;
	mpte->mpte_init_txbytes = sotoinpcb(so)->inp_stat->txbytes;

	/* Detach from the session's subflow list */
	atomic_bitclear_32(&mpts->mpts_flags, MPTSF_ATTACHED);
	TAILQ_REMOVE(&mpte->mpte_subflows, mpts, mpts_entry);
	mpte->mpte_numflows--;
	if (mpte->mpte_active_sub == mpts) {
		mpte->mpte_active_sub = NULL;
	}

	/*
	 * Drop references held by this subflow socket; there
	 * will be no further upcalls made from this point.
	 */
	sock_setupcalls_locked(so, NULL, NULL, NULL, NULL, 0);
	sock_catchevents_locked(so, NULL, NULL, 0);

	mptcp_detach_mptcb_from_subf(mpte->mpte_mptcb, so);

	mp_so->so_usecount--;           /* for subflow socket */
	mpts->mpts_mpte = NULL;
	mpts->mpts_socket = NULL;

	mptcp_subflow_remref(mpts);             /* for MPTCP subflow list */
	mptcp_subflow_remref(mpts);             /* for subflow socket */

	/* Sever the remaining TCP-side back-pointers to MPTCP state */
	so->so_flags &= ~SOF_MP_SUBFLOW;
	tp->t_mptcb = NULL;
	tp->t_mpsub = NULL;
}
2743 
2744 void
mptcp_subflow_shutdown(struct mptses * mpte,struct mptsub * mpts)2745 mptcp_subflow_shutdown(struct mptses *mpte, struct mptsub *mpts)
2746 {
2747 	struct socket *so = mpts->mpts_socket;
2748 	struct mptcb *mp_tp = mpte->mpte_mptcb;
2749 	int send_dfin = 0;
2750 
2751 	if (mp_tp->mpt_state > MPTCPS_CLOSE_WAIT) {
2752 		send_dfin = 1;
2753 	}
2754 
2755 	if (!(so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) &&
2756 	    (so->so_state & SS_ISCONNECTED)) {
2757 		if (send_dfin) {
2758 			mptcp_send_dfin(so);
2759 		}
2760 		soshutdownlock(so, SHUT_WR);
2761 	}
2762 }
2763 
2764 static void
mptcp_subflow_abort(struct mptsub * mpts,int error)2765 mptcp_subflow_abort(struct mptsub *mpts, int error)
2766 {
2767 	struct socket *so = mpts->mpts_socket;
2768 	struct tcpcb *tp = sototcpcb(so);
2769 
2770 	if (mpts->mpts_flags & MPTSF_DISCONNECTED) {
2771 		return;
2772 	}
2773 
2774 	if (tp->t_state != TCPS_CLOSED) {
2775 		tcp_drop(tp, error);
2776 	}
2777 
2778 	mptcp_subflow_eupcall1(so, mpts, SO_FILT_HINT_DISCONNECTED);
2779 }
2780 
/*
 * Disconnect a subflow socket.
 *
 * Marks the subflow as disconnecting, defuncts it if the meta-socket is
 * already defunct, otherwise performs an orderly shutdown (with DATA_FIN
 * when the connection-level state requires one), and always posts a
 * disconnect event so deletion can proceed.
 */
void
mptcp_subflow_disconnect(struct mptses *mpte, struct mptsub *mpts)
{
	struct socket *so, *mp_so;
	struct mptcb *mp_tp;
	int send_dfin = 0;

	so = mpts->mpts_socket;
	mp_tp = mpte->mpte_mptcb;
	mp_so = mptetoso(mpte);

	socket_lock_assert_owned(mp_so);

	/* Already on its way out; don't disconnect twice */
	if (mpts->mpts_flags & (MPTSF_DISCONNECTING | MPTSF_DISCONNECTED)) {
		return;
	}

	mptcp_unset_cellicon(mpte, mpts, 1);

	mpts->mpts_flags |= MPTSF_DISCONNECTING;

	/* Past CLOSE_WAIT the connection-level close needs a DATA_FIN */
	if (mp_tp->mpt_state > MPTCPS_CLOSE_WAIT) {
		send_dfin = 1;
	}

	/* A defunct meta-socket defuncts its subflows too */
	if (mp_so->so_flags & SOF_DEFUNCT) {
		errno_t ret;

		ret = sosetdefunct(NULL, so, SHUTDOWN_SOCKET_LEVEL_DISCONNECT_ALL, TRUE);
		if (ret == 0) {
			ret = sodefunct(NULL, so, SHUTDOWN_SOCKET_LEVEL_DISCONNECT_ALL);

			if (ret != 0) {
				os_log_error(mptcp_log_handle, "%s - %lx: sodefunct failed with %d\n",
				    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), ret);
			}
		} else {
			os_log_error(mptcp_log_handle, "%s - %lx: sosetdefunct failed with %d\n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), ret);
		}
	}

	/* Orderly shutdown for a still-connected subflow */
	if (!(so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) &&
	    (so->so_state & SS_ISCONNECTED)) {
		if (send_dfin) {
			mptcp_send_dfin(so);
		}

		(void) soshutdownlock(so, SHUT_RD);
		(void) soshutdownlock(so, SHUT_WR);
		(void) sodisconnectlocked(so);
	}

	/*
	 * Generate a disconnect event for this subflow socket, in case
	 * the lower layer doesn't do it; this is needed because the
	 * subflow socket deletion relies on it.
	 */
	mptcp_subflow_eupcall1(so, mpts, SO_FILT_HINT_DISCONNECTED);
}
2844 
/*
 * Subflow socket input.
 *
 * Drains whatever the subflow's receive path has buffered and feeds it to
 * mptcp_input(), maintaining the cell icon and wifi/cell usage accounting
 * along the way.  Runs with MPP_INSIDE_INPUT set so that nested upcalls
 * are deferred until we are done.
 */
static void
mptcp_subflow_input(struct mptses *mpte, struct mptsub *mpts)
{
	struct socket *mp_so = mptetoso(mpte);
	struct mbuf *m = NULL;
	struct socket *so;
	int error, wakeup = 0;

	VERIFY(!(mpte->mpte_mppcb->mpp_flags & MPP_INSIDE_INPUT));
	mpte->mpte_mppcb->mpp_flags |= MPP_INSIDE_INPUT;

	DTRACE_MPTCP2(subflow__input, struct mptses *, mpte,
	    struct mptsub *, mpts);

	/* Nothing to read before the subflow is connected */
	if (!(mpts->mpts_flags & MPTSF_CONNECTED)) {
		goto out;
	}

	so = mpts->mpts_socket;

	error = sock_receive_internal(so, NULL, &m, 0, NULL);
	if (error != 0 && error != EWOULDBLOCK) {
		os_log_error(mptcp_log_handle, "%s - %lx: cid %d error %d\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpts->mpts_connid, error);
		if (error == ENODATA) {
			/*
			 * Don't ignore ENODATA so as to discover
			 * nasty middleboxes.
			 */
			mp_so->so_error = ENODATA;

			wakeup = 1;
			goto out;
		}
	}

	/* In fallback, make sure to accept data on all but one subflow */
	if (m && (mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
	    !(mpts->mpts_flags & MPTSF_ACTIVE)) {
		m_freem(m);
		goto out;
	}

	if (m != NULL) {
		/* Track whether traffic is flowing over cell or wifi */
		if (IFNET_IS_CELLULAR(sotoinpcb(so)->inp_last_outifp)) {
			mptcp_set_cellicon(mpte, mpts);

			mpte->mpte_used_cell = 1;
		} else {
			/*
			 * If during the past MPTCP_CELLICON_TOGGLE_RATE seconds we didn't
			 * explicitly set the cellicon, then we unset it again.
			 */
			if (TSTMP_LT(mpte->mpte_last_cellicon_set + MPTCP_CELLICON_TOGGLE_RATE, tcp_now)) {
				mptcp_unset_cellicon(mpte, NULL, 1);
			}

			mpte->mpte_used_wifi = 1;
		}

		/* Hand the received chain to the MPTCP reassembly layer */
		mptcp_input(mpte, m);
	}

out:
	if (wakeup) {
		mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_RWAKEUP;
	}

	/* Replay any upcalls that were deferred while we were inside input */
	mptcp_handle_deferred_upcalls(mpte->mpte_mppcb, MPP_INSIDE_INPUT);
}
2918 
/*
 * Entry point for subflow read events: runs subflow input for every
 * subflow of the session owning 'so', or defers the work (via
 * MPP_SHOULD_RWAKEUP) when upcalls may not run right now.
 */
void
mptcp_handle_input(struct socket *so)
{
	struct mptsub *mpts, *tmpts;
	struct mptses *mpte;

	/* Only subflow sockets are handled here */
	if (!(so->so_flags & SOF_MP_SUBFLOW)) {
		return;
	}

	mpts = sototcpcb(so)->t_mpsub;
	mpte = mpts->mpts_mpte;

	socket_lock_assert_owned(mptetoso(mpte));

	if (mptcp_should_defer_upcall(mpte->mpte_mppcb)) {
		if (!(mpte->mpte_mppcb->mpp_flags & MPP_INPUT_HANDLE)) {
			mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_RWAKEUP;
		}
		return;
	}

	mpte->mpte_mppcb->mpp_flags |= MPP_INPUT_HANDLE;
	TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
		if (mpts->mpts_socket->so_usecount == 0) {
			/* Will be removed soon by tcp_garbage_collect */
			continue;
		}

		/* Keep both the subflow and its socket alive across input */
		mptcp_subflow_addref(mpts);
		mpts->mpts_socket->so_usecount++;

		mptcp_subflow_input(mpte, mpts);

		mptcp_subflow_remref(mpts);             /* ours */

		VERIFY(mpts->mpts_socket->so_usecount != 0);
		mpts->mpts_socket->so_usecount--;
	}

	/* Replay upcalls that were deferred while MPP_INPUT_HANDLE was set */
	mptcp_handle_deferred_upcalls(mpte->mpte_mppcb, MPP_INPUT_HANDLE);
}
2961 
2962 static boolean_t
mptcp_search_seq_in_sub(struct mbuf * m,struct socket * so)2963 mptcp_search_seq_in_sub(struct mbuf *m, struct socket *so)
2964 {
2965 	struct mbuf *so_m = so->so_snd.sb_mb;
2966 	uint64_t dsn = m->m_pkthdr.mp_dsn;
2967 
2968 	while (so_m) {
2969 		VERIFY(so_m->m_flags & M_PKTHDR);
2970 		VERIFY(so_m->m_pkthdr.pkt_flags & PKTF_MPTCP);
2971 
2972 		/* Part of the segment is covered, don't reinject here */
2973 		if (so_m->m_pkthdr.mp_dsn <= dsn &&
2974 		    so_m->m_pkthdr.mp_dsn + so_m->m_pkthdr.mp_rlen > dsn) {
2975 			return TRUE;
2976 		}
2977 
2978 		so_m = so_m->m_next;
2979 	}
2980 
2981 	return FALSE;
2982 }
2983 
2984 /*
2985  * Subflow socket output.
2986  *
2987  * Called for sending data from MPTCP to the underlying subflow socket.
2988  */
2989 int
mptcp_subflow_output(struct mptses * mpte,struct mptsub * mpts,int flags)2990 mptcp_subflow_output(struct mptses *mpte, struct mptsub *mpts, int flags)
2991 {
2992 	struct mptcb *mp_tp = mpte->mpte_mptcb;
2993 	struct mbuf *sb_mb, *m, *mpt_mbuf = NULL, *head = NULL, *tail = NULL;
2994 	struct socket *mp_so, *so;
2995 	struct tcpcb *tp;
2996 	uint64_t mpt_dsn = 0, off = 0;
2997 	int sb_cc = 0, error = 0, wakeup = 0;
2998 	uint16_t dss_csum;
2999 	uint16_t tot_sent = 0;
3000 	boolean_t reinjected = FALSE;
3001 
3002 	mp_so = mptetoso(mpte);
3003 	so = mpts->mpts_socket;
3004 	tp = sototcpcb(so);
3005 
3006 	socket_lock_assert_owned(mp_so);
3007 
3008 	VERIFY(!(mpte->mpte_mppcb->mpp_flags & MPP_INSIDE_OUTPUT));
3009 	mpte->mpte_mppcb->mpp_flags |= MPP_INSIDE_OUTPUT;
3010 
3011 	VERIFY(!INP_WAIT_FOR_IF_FEEDBACK(sotoinpcb(so)));
3012 	VERIFY((mpts->mpts_flags & MPTSF_MP_CAPABLE) ||
3013 	    (mpts->mpts_flags & MPTSF_MP_DEGRADED) ||
3014 	    (mpts->mpts_flags & MPTSF_TFO_REQD));
3015 	VERIFY(mptcp_subflow_cwnd_space(mpts->mpts_socket) > 0);
3016 
3017 	DTRACE_MPTCP2(subflow__output, struct mptses *, mpte,
3018 	    struct mptsub *, mpts);
3019 
3020 	/* Remove Addr Option is not sent reliably as per I-D */
3021 	if (mpte->mpte_flags & MPTE_SND_REM_ADDR) {
3022 		tp->t_rem_aid = mpte->mpte_lost_aid;
3023 		tp->t_mpflags |= TMPF_SND_REM_ADDR;
3024 		mpte->mpte_flags &= ~MPTE_SND_REM_ADDR;
3025 	}
3026 
3027 	/*
3028 	 * The mbuf chains containing the metadata (as well as pointing to
3029 	 * the user data sitting at the MPTCP output queue) would then be
3030 	 * sent down to the subflow socket.
3031 	 *
3032 	 * Some notes on data sequencing:
3033 	 *
3034 	 *   a. Each mbuf must be a M_PKTHDR.
3035 	 *   b. MPTCP metadata is stored in the mptcp_pktinfo structure
3036 	 *	in the mbuf pkthdr structure.
3037 	 *   c. Each mbuf containing the MPTCP metadata must have its
3038 	 *	pkt_flags marked with the PKTF_MPTCP flag.
3039 	 */
3040 
3041 	if (mpte->mpte_reinjectq) {
3042 		sb_mb = mpte->mpte_reinjectq;
3043 	} else {
3044 		sb_mb = mp_so->so_snd.sb_mb;
3045 	}
3046 
3047 	if (sb_mb == NULL) {
3048 		os_log_error(mptcp_log_handle, "%s - %lx: No data in MPTCP-sendbuffer! smax %u snxt %u suna %u state %u flags %#x\n",
3049 		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
3050 		    (uint32_t)mp_tp->mpt_sndmax, (uint32_t)mp_tp->mpt_sndnxt,
3051 		    (uint32_t)mp_tp->mpt_snduna, mp_tp->mpt_state, mp_so->so_flags1);
3052 
3053 		/* Fix it to prevent looping */
3054 		if (MPTCP_SEQ_LT(mp_tp->mpt_sndnxt, mp_tp->mpt_snduna)) {
3055 			mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;
3056 		}
3057 		goto out;
3058 	}
3059 
3060 	VERIFY(sb_mb->m_pkthdr.pkt_flags & PKTF_MPTCP);
3061 
3062 	if (sb_mb->m_pkthdr.mp_rlen == 0 &&
3063 	    !(so->so_state & SS_ISCONNECTED) &&
3064 	    (so->so_flags1 & SOF1_PRECONNECT_DATA)) {
3065 		tp->t_mpflags |= TMPF_TFO_REQUEST;
3066 
3067 		/* Opting to call pru_send as no mbuf at subflow level */
3068 		error = (*so->so_proto->pr_usrreqs->pru_send)(so, 0, NULL, NULL,
3069 		    NULL, current_proc());
3070 
3071 		goto done_sending;
3072 	}
3073 
3074 	mpt_dsn = sb_mb->m_pkthdr.mp_dsn;
3075 
3076 	/* First, drop acknowledged data */
3077 	if (MPTCP_SEQ_LT(mpt_dsn, mp_tp->mpt_snduna)) {
3078 		os_log_error(mptcp_log_handle, "%s - %lx: dropping data, should have been done earlier "
3079 		    "dsn %u suna %u reinject? %u\n",
3080 		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), (uint32_t)mpt_dsn,
3081 		    (uint32_t)mp_tp->mpt_snduna, !!mpte->mpte_reinjectq);
3082 		if (mpte->mpte_reinjectq) {
3083 			mptcp_clean_reinjectq(mpte);
3084 		} else {
3085 			uint64_t len = 0;
3086 			len = mp_tp->mpt_snduna - mpt_dsn;
3087 			sbdrop(&mp_so->so_snd, (int)len);
3088 			wakeup = 1;
3089 		}
3090 	}
3091 
3092 	/* Check again because of above sbdrop */
3093 	if (mp_so->so_snd.sb_mb == NULL && mpte->mpte_reinjectq == NULL) {
3094 		os_log_error(mptcp_log_handle, "%s - $%lx: send-buffer is empty\n",
3095 		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
3096 		goto out;
3097 	}
3098 
3099 	/*
3100 	 * In degraded mode, we don't receive data acks, so force free
3101 	 * mbufs less than snd_nxt
3102 	 */
3103 	if ((mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
3104 	    (mp_tp->mpt_flags & MPTCPF_POST_FALLBACK_SYNC) &&
3105 	    mp_so->so_snd.sb_mb) {
3106 		mpt_dsn = mp_so->so_snd.sb_mb->m_pkthdr.mp_dsn;
3107 		if (MPTCP_SEQ_LT(mpt_dsn, mp_tp->mpt_snduna)) {
3108 			uint64_t len = 0;
3109 			len = mp_tp->mpt_snduna - mpt_dsn;
3110 			sbdrop(&mp_so->so_snd, (int)len);
3111 			wakeup = 1;
3112 
3113 			os_log_error(mptcp_log_handle, "%s - %lx: dropping data in degraded mode, should have been done earlier dsn %u sndnxt %u suna %u\n",
3114 			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
3115 			    (uint32_t)mpt_dsn, (uint32_t)mp_tp->mpt_sndnxt, (uint32_t)mp_tp->mpt_snduna);
3116 		}
3117 	}
3118 
3119 	if ((mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
3120 	    !(mp_tp->mpt_flags & MPTCPF_POST_FALLBACK_SYNC)) {
3121 		mp_tp->mpt_flags |= MPTCPF_POST_FALLBACK_SYNC;
3122 		so->so_flags1 |= SOF1_POST_FALLBACK_SYNC;
3123 	}
3124 
3125 	/*
3126 	 * Adjust the top level notion of next byte used for retransmissions
3127 	 * and sending FINs.
3128 	 */
3129 	if (MPTCP_SEQ_LT(mp_tp->mpt_sndnxt, mp_tp->mpt_snduna)) {
3130 		mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;
3131 	}
3132 
3133 	/* Now determine the offset from which to start transmitting data */
3134 	if (mpte->mpte_reinjectq) {
3135 		sb_mb = mpte->mpte_reinjectq;
3136 	} else {
3137 dont_reinject:
3138 		sb_mb = mp_so->so_snd.sb_mb;
3139 	}
3140 	if (sb_mb == NULL) {
3141 		os_log_error(mptcp_log_handle, "%s - %lx: send-buffer is still empty\n", __func__,
3142 		    (unsigned long)VM_KERNEL_ADDRPERM(mpte));
3143 		goto out;
3144 	}
3145 
3146 	if (sb_mb == mpte->mpte_reinjectq) {
3147 		sb_cc = sb_mb->m_pkthdr.mp_rlen;
3148 		off = 0;
3149 
3150 		if (mptcp_search_seq_in_sub(sb_mb, so)) {
3151 			if (mptcp_can_send_more(mp_tp, TRUE)) {
3152 				goto dont_reinject;
3153 			}
3154 
3155 			error = ECANCELED;
3156 			goto out;
3157 		}
3158 
3159 		reinjected = TRUE;
3160 	} else if (flags & MPTCP_SUBOUT_PROBING) {
3161 		sb_cc = sb_mb->m_pkthdr.mp_rlen;
3162 		off = 0;
3163 	} else {
3164 		sb_cc = min(mp_so->so_snd.sb_cc, mp_tp->mpt_sndwnd);
3165 
3166 		/*
3167 		 * With TFO, there might be no data at all, thus still go into this
3168 		 * code-path here.
3169 		 */
3170 		if ((mp_so->so_flags1 & SOF1_PRECONNECT_DATA) ||
3171 		    MPTCP_SEQ_LT(mp_tp->mpt_sndnxt, mp_tp->mpt_sndmax)) {
3172 			off = mp_tp->mpt_sndnxt - mp_tp->mpt_snduna;
3173 			sb_cc -= off;
3174 		} else {
3175 			os_log_error(mptcp_log_handle, "%s - %lx: this should not happen: sndnxt %u sndmax %u\n",
3176 			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), (uint32_t)mp_tp->mpt_sndnxt,
3177 			    (uint32_t)mp_tp->mpt_sndmax);
3178 
3179 			goto out;
3180 		}
3181 	}
3182 
3183 	sb_cc = min(sb_cc, mptcp_subflow_cwnd_space(so));
3184 	if (sb_cc <= 0) {
3185 		os_log_error(mptcp_log_handle, "%s - %lx: sb_cc is %d, mp_so->sb_cc %u, sndwnd %u,sndnxt %u sndmax %u cwnd %u\n",
3186 		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), sb_cc, mp_so->so_snd.sb_cc, mp_tp->mpt_sndwnd,
3187 		    (uint32_t)mp_tp->mpt_sndnxt, (uint32_t)mp_tp->mpt_sndmax,
3188 		    mptcp_subflow_cwnd_space(so));
3189 	}
3190 
3191 	sb_cc = min(sb_cc, UINT16_MAX);
3192 
3193 	/*
3194 	 * Create a DSN mapping for the data we are about to send. It all
3195 	 * has the same mapping.
3196 	 */
3197 	if (reinjected) {
3198 		mpt_dsn = sb_mb->m_pkthdr.mp_dsn;
3199 	} else {
3200 		mpt_dsn = mp_tp->mpt_snduna + off;
3201 	}
3202 
3203 	mpt_mbuf = sb_mb;
3204 	while (mpt_mbuf && reinjected == FALSE &&
3205 	    (mpt_mbuf->m_pkthdr.mp_rlen == 0 ||
3206 	    mpt_mbuf->m_pkthdr.mp_rlen <= (uint32_t)off)) {
3207 		off -= mpt_mbuf->m_pkthdr.mp_rlen;
3208 		mpt_mbuf = mpt_mbuf->m_next;
3209 	}
3210 	VERIFY((mpt_mbuf == NULL) || (mpt_mbuf->m_pkthdr.pkt_flags & PKTF_MPTCP));
3211 
3212 	head = tail = NULL;
3213 
3214 	while (tot_sent < sb_cc) {
3215 		int32_t mlen;
3216 
3217 		mlen = mpt_mbuf->m_len;
3218 		mlen -= off;
3219 		mlen = MIN(mlen, sb_cc - tot_sent);
3220 
3221 		if (mlen < 0) {
3222 			os_log_error(mptcp_log_handle, "%s - %lx: mlen %d mp_rlen %u off %u sb_cc %u tot_sent %u\n",
3223 			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mlen, mpt_mbuf->m_pkthdr.mp_rlen,
3224 			    (uint32_t)off, sb_cc, tot_sent);
3225 			goto out;
3226 		}
3227 
3228 		if (mlen == 0) {
3229 			goto next;
3230 		}
3231 
3232 		m = m_copym_mode(mpt_mbuf, (int)off, mlen, M_DONTWAIT,
3233 		    M_COPYM_MUST_COPY_HDR);
3234 		if (m == NULL) {
3235 			os_log_error(mptcp_log_handle, "%s - %lx: m_copym_mode failed\n", __func__,
3236 			    (unsigned long)VM_KERNEL_ADDRPERM(mpte));
3237 			error = ENOBUFS;
3238 			break;
3239 		}
3240 
3241 		/* Create a DSN mapping for the data (m_copym does it) */
3242 		VERIFY(m->m_flags & M_PKTHDR);
3243 		VERIFY(m->m_next == NULL);
3244 
3245 		m->m_pkthdr.pkt_flags |= PKTF_MPTCP;
3246 		m->m_pkthdr.pkt_flags &= ~PKTF_MPSO;
3247 		m->m_pkthdr.mp_dsn = mpt_dsn;
3248 		m->m_pkthdr.mp_rseq = mpts->mpts_rel_seq;
3249 		m->m_pkthdr.len = mlen;
3250 
3251 		if (head == NULL) {
3252 			head = tail = m;
3253 		} else {
3254 			tail->m_next = m;
3255 			tail = m;
3256 		}
3257 
3258 		tot_sent += mlen;
3259 		off = 0;
3260 next:
3261 		mpt_mbuf = mpt_mbuf->m_next;
3262 	}
3263 
3264 	if (reinjected) {
3265 		if (sb_cc < sb_mb->m_pkthdr.mp_rlen) {
3266 			struct mbuf *n = sb_mb;
3267 
3268 			while (n) {
3269 				n->m_pkthdr.mp_dsn += sb_cc;
3270 				n->m_pkthdr.mp_rlen -= sb_cc;
3271 				n = n->m_next;
3272 			}
3273 			m_adj(sb_mb, sb_cc);
3274 		} else {
3275 			mpte->mpte_reinjectq = sb_mb->m_nextpkt;
3276 			m_freem(sb_mb);
3277 		}
3278 	}
3279 
3280 	if (head && (mp_tp->mpt_flags & MPTCPF_CHECKSUM)) {
3281 		dss_csum = mptcp_output_csum(head, mpt_dsn, mpts->mpts_rel_seq,
3282 		    tot_sent);
3283 	}
3284 
3285 	/* Now, let's update rel-seq and the data-level length */
3286 	mpts->mpts_rel_seq += tot_sent;
3287 	m = head;
3288 	while (m) {
3289 		if (mp_tp->mpt_flags & MPTCPF_CHECKSUM) {
3290 			m->m_pkthdr.mp_csum = dss_csum;
3291 		}
3292 		m->m_pkthdr.mp_rlen = tot_sent;
3293 		m = m->m_next;
3294 	}
3295 
3296 	if (head != NULL) {
3297 		if ((mpts->mpts_flags & MPTSF_TFO_REQD) &&
3298 		    (tp->t_tfo_stats == 0)) {
3299 			tp->t_mpflags |= TMPF_TFO_REQUEST;
3300 		}
3301 
3302 		error = so->so_proto->pr_usrreqs->pru_sosend(so, NULL, NULL, head, NULL, 0);
3303 		head = NULL;
3304 	}
3305 
3306 done_sending:
3307 	if (error == 0 ||
3308 	    (error == EWOULDBLOCK && (tp->t_mpflags & TMPF_TFO_REQUEST))) {
3309 		uint64_t new_sndnxt = mp_tp->mpt_sndnxt + tot_sent;
3310 
3311 		if (mpts->mpts_probesoon && mpts->mpts_maxseg && tot_sent) {
3312 			tcpstat.tcps_mp_num_probes++;
3313 			if ((uint32_t)tot_sent < mpts->mpts_maxseg) {
3314 				mpts->mpts_probecnt += 1;
3315 			} else {
3316 				mpts->mpts_probecnt +=
3317 				    tot_sent / mpts->mpts_maxseg;
3318 			}
3319 		}
3320 
3321 		if (!reinjected && !(flags & MPTCP_SUBOUT_PROBING)) {
3322 			if (MPTCP_DATASEQ_HIGH32(new_sndnxt) >
3323 			    MPTCP_DATASEQ_HIGH32(mp_tp->mpt_sndnxt)) {
3324 				mp_tp->mpt_flags |= MPTCPF_SND_64BITDSN;
3325 			}
3326 			mp_tp->mpt_sndnxt = new_sndnxt;
3327 		}
3328 
3329 		mptcp_cancel_timer(mp_tp, MPTT_REXMT);
3330 
3331 		/* Must be here as mptcp_can_send_more() checks for this */
3332 		soclearfastopen(mp_so);
3333 
3334 		if (IFNET_IS_CELLULAR(sotoinpcb(so)->inp_last_outifp)) {
3335 			mptcp_set_cellicon(mpte, mpts);
3336 
3337 			mpte->mpte_used_cell = 1;
3338 		} else {
3339 			/*
3340 			 * If during the past MPTCP_CELLICON_TOGGLE_RATE seconds we didn't
3341 			 * explicitly set the cellicon, then we unset it again.
3342 			 */
3343 			if (TSTMP_LT(mpte->mpte_last_cellicon_set + MPTCP_CELLICON_TOGGLE_RATE, tcp_now)) {
3344 				mptcp_unset_cellicon(mpte, NULL, 1);
3345 			}
3346 
3347 			mpte->mpte_used_wifi = 1;
3348 		}
3349 
3350 		/*
3351 		 * Don't propagate EWOULDBLOCK - it's already taken care of
3352 		 * in mptcp_usr_send for TFO.
3353 		 */
3354 		error = 0;
3355 	} else {
3356 		/* We need to revert our change to mpts_rel_seq */
3357 		mpts->mpts_rel_seq -= tot_sent;
3358 
3359 		os_log_error(mptcp_log_handle, "%s - %lx: %u error %d len %d subflags %#x sostate %#x soerror %u hiwat %u lowat %u\n",
3360 		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpts->mpts_connid, error, tot_sent, so->so_flags, so->so_state, so->so_error, so->so_snd.sb_hiwat, so->so_snd.sb_lowat);
3361 	}
3362 out:
3363 
3364 	if (head != NULL) {
3365 		m_freem(head);
3366 	}
3367 
3368 	if (wakeup) {
3369 		mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_WWAKEUP;
3370 	}
3371 
3372 	mptcp_handle_deferred_upcalls(mpte->mpte_mppcb, MPP_INSIDE_OUTPUT);
3373 	return error;
3374 }
3375 
3376 static void
mptcp_add_reinjectq(struct mptses * mpte,struct mbuf * m)3377 mptcp_add_reinjectq(struct mptses *mpte, struct mbuf *m)
3378 {
3379 	struct mbuf *n, *prev = NULL;
3380 
3381 	n = mpte->mpte_reinjectq;
3382 
3383 	/* First, look for an mbuf n, whose data-sequence-number is bigger or
3384 	 * equal than m's sequence number.
3385 	 */
3386 	while (n) {
3387 		if (MPTCP_SEQ_GEQ(n->m_pkthdr.mp_dsn, m->m_pkthdr.mp_dsn)) {
3388 			break;
3389 		}
3390 
3391 		prev = n;
3392 
3393 		n = n->m_nextpkt;
3394 	}
3395 
3396 	if (n) {
3397 		/* m is already fully covered by the next mbuf in the queue */
3398 		if (n->m_pkthdr.mp_dsn == m->m_pkthdr.mp_dsn &&
3399 		    n->m_pkthdr.mp_rlen >= m->m_pkthdr.mp_rlen) {
3400 			os_log(mptcp_log_handle, "%s - %lx: dsn %u dlen %u rseq %u fully covered with len %u\n",
3401 			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
3402 			    (uint32_t)m->m_pkthdr.mp_dsn, m->m_pkthdr.mp_rlen,
3403 			    m->m_pkthdr.mp_rseq, n->m_pkthdr.mp_rlen);
3404 			goto dont_queue;
3405 		}
3406 
3407 		/* m is covering the next mbuf entirely, thus we remove this guy */
3408 		if (m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen >= n->m_pkthdr.mp_dsn + n->m_pkthdr.mp_rlen) {
3409 			struct mbuf *tmp = n->m_nextpkt;
3410 
3411 			os_log(mptcp_log_handle, "%s - %lx: m (dsn %u len %u) is covering existing mbuf (dsn %u len %u)\n",
3412 			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
3413 			    (uint32_t)m->m_pkthdr.mp_dsn, m->m_pkthdr.mp_rlen,
3414 			    (uint32_t)n->m_pkthdr.mp_dsn, n->m_pkthdr.mp_rlen);
3415 
3416 			m->m_nextpkt = NULL;
3417 			if (prev == NULL) {
3418 				mpte->mpte_reinjectq = tmp;
3419 			} else {
3420 				prev->m_nextpkt = tmp;
3421 			}
3422 
3423 			m_freem(n);
3424 			n = tmp;
3425 		}
3426 	}
3427 
3428 	if (prev) {
3429 		/* m is already fully covered by the previous mbuf in the queue */
3430 		if (prev->m_pkthdr.mp_dsn + prev->m_pkthdr.mp_rlen >= m->m_pkthdr.mp_dsn + m->m_pkthdr.len) {
3431 			os_log(mptcp_log_handle, "%s - %lx: prev (dsn %u len %u) covers us (dsn %u len %u)\n",
3432 			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
3433 			    (uint32_t)prev->m_pkthdr.mp_dsn, prev->m_pkthdr.mp_rlen,
3434 			    (uint32_t)m->m_pkthdr.mp_dsn, m->m_pkthdr.mp_rlen);
3435 			goto dont_queue;
3436 		}
3437 	}
3438 
3439 	if (prev == NULL) {
3440 		mpte->mpte_reinjectq = m;
3441 	} else {
3442 		prev->m_nextpkt = m;
3443 	}
3444 
3445 	m->m_nextpkt = n;
3446 
3447 	return;
3448 
3449 dont_queue:
3450 	m_freem(m);
3451 	return;
3452 }
3453 
3454 static struct mbuf *
mptcp_lookup_dsn(struct mptses * mpte,uint64_t dsn)3455 mptcp_lookup_dsn(struct mptses *mpte, uint64_t dsn)
3456 {
3457 	struct socket *mp_so = mptetoso(mpte);
3458 	struct mbuf *m;
3459 
3460 	m = mp_so->so_snd.sb_mb;
3461 
3462 	while (m) {
3463 		/* If this segment covers what we are looking for, return it. */
3464 		if (MPTCP_SEQ_LEQ(m->m_pkthdr.mp_dsn, dsn) &&
3465 		    MPTCP_SEQ_GT(m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen, dsn)) {
3466 			break;
3467 		}
3468 
3469 
3470 		/* Segment is no more in the queue */
3471 		if (MPTCP_SEQ_GT(m->m_pkthdr.mp_dsn, dsn)) {
3472 			return NULL;
3473 		}
3474 
3475 		m = m->m_next;
3476 	}
3477 
3478 	return m;
3479 }
3480 
3481 static struct mbuf *
mptcp_copy_mbuf_list(struct mptses * mpte,struct mbuf * m,int len)3482 mptcp_copy_mbuf_list(struct mptses *mpte, struct mbuf *m, int len)
3483 {
3484 	struct mbuf *top = NULL, *tail = NULL;
3485 	uint64_t dsn;
3486 	uint32_t dlen, rseq;
3487 
3488 	dsn = m->m_pkthdr.mp_dsn;
3489 	dlen = m->m_pkthdr.mp_rlen;
3490 	rseq = m->m_pkthdr.mp_rseq;
3491 
3492 	while (len > 0) {
3493 		struct mbuf *n;
3494 
3495 		VERIFY((m->m_flags & M_PKTHDR) && (m->m_pkthdr.pkt_flags & PKTF_MPTCP));
3496 
3497 		n = m_copym_mode(m, 0, m->m_len, M_DONTWAIT, M_COPYM_MUST_COPY_HDR);
3498 		if (n == NULL) {
3499 			os_log_error(mptcp_log_handle, "%s - %lx: m_copym_mode returned NULL\n",
3500 			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
3501 			goto err;
3502 		}
3503 
3504 		VERIFY(n->m_flags & M_PKTHDR);
3505 		VERIFY(n->m_next == NULL);
3506 		VERIFY(n->m_pkthdr.mp_dsn == dsn);
3507 		VERIFY(n->m_pkthdr.mp_rlen == dlen);
3508 		VERIFY(n->m_pkthdr.mp_rseq == rseq);
3509 		VERIFY(n->m_len == m->m_len);
3510 
3511 		n->m_pkthdr.pkt_flags |= (PKTF_MPSO | PKTF_MPTCP);
3512 
3513 		if (top == NULL) {
3514 			top = n;
3515 		}
3516 
3517 		if (tail != NULL) {
3518 			tail->m_next = n;
3519 		}
3520 
3521 		tail = n;
3522 
3523 		len -= m->m_len;
3524 		m = m->m_next;
3525 	}
3526 
3527 	return top;
3528 
3529 err:
3530 	if (top) {
3531 		m_freem(top);
3532 	}
3533 
3534 	return NULL;
3535 }
3536 
/*
 * Walk the subflow's send-buffer and put every segment that has not yet
 * been acknowledged at the MPTCP data-level onto the session's reinject
 * queue, so it can be retransmitted over another subflow.  Segments that
 * were already queued (PKTF_MPTCP_REINJ) are skipped.
 */
static void
mptcp_reinject_mbufs(struct socket *so)
{
	struct tcpcb *tp = sototcpcb(so);
	struct mptsub *mpts = tp->t_mpsub;
	struct mptcb *mp_tp = tptomptp(tp);
	struct mptses *mpte = mp_tp->mpt_mpte;
	struct sockbuf *sb = &so->so_snd;
	struct mbuf *m;

	m = sb->sb_mb;
	while (m) {
		/* orig remembers the first mbuf of this mapping; n the next
		 * one, as m itself may be re-pointed below. */
		struct mbuf *n = m->m_next, *orig = m;
		bool set_reinject_flag = false;

		VERIFY((m->m_flags & M_PKTHDR) && (m->m_pkthdr.pkt_flags & PKTF_MPTCP));

		/* Already queued for reinjection earlier. */
		if (m->m_pkthdr.pkt_flags & PKTF_MPTCP_REINJ) {
			goto next;
		}

		/* Has it all already been acknowledged at the data-level? */
		if (MPTCP_SEQ_GEQ(mp_tp->mpt_snduna, m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen)) {
			goto next;
		}

		/* Part of this has already been acknowledged - lookup in the
		 * MPTCP-socket for the segment.
		 */
		if (SEQ_GT(tp->snd_una - mpts->mpts_iss, m->m_pkthdr.mp_rseq)) {
			/* m now points at the MPTCP-level segment (not the
			 * subflow mbuf); orig keeps the subflow position. */
			m = mptcp_lookup_dsn(mpte, m->m_pkthdr.mp_dsn);
			if (m == NULL) {
				goto next;
			}
		}

		/* Copy the mbuf with headers (aka, DSN-numbers) */
		m = mptcp_copy_mbuf_list(mpte, m, m->m_pkthdr.mp_rlen);
		if (m == NULL) {
			/* Allocation failure - give up, retry on a later call. */
			break;
		}

		VERIFY(m->m_nextpkt == NULL);

		/* Now, add to the reinject-queue, eliminating overlapping
		 * segments
		 */
		mptcp_add_reinjectq(mpte, m);

		set_reinject_flag = true;
		orig->m_pkthdr.pkt_flags |= PKTF_MPTCP_REINJ;

next:
		/* mp_rlen can cover multiple mbufs, so advance to the end of it. */
		while (n) {
			VERIFY((n->m_flags & M_PKTHDR) && (n->m_pkthdr.pkt_flags & PKTF_MPTCP));

			/* Different DSN means we reached the next mapping. */
			if (n->m_pkthdr.mp_dsn != orig->m_pkthdr.mp_dsn) {
				break;
			}

			/* Mark the whole mapping, not just its first mbuf. */
			if (set_reinject_flag) {
				n->m_pkthdr.pkt_flags |= PKTF_MPTCP_REINJ;
			}
			n = n->m_next;
		}

		m = n;
	}
}
3607 
3608 void
mptcp_clean_reinjectq(struct mptses * mpte)3609 mptcp_clean_reinjectq(struct mptses *mpte)
3610 {
3611 	struct mptcb *mp_tp = mpte->mpte_mptcb;
3612 
3613 	socket_lock_assert_owned(mptetoso(mpte));
3614 
3615 	while (mpte->mpte_reinjectq) {
3616 		struct mbuf *m = mpte->mpte_reinjectq;
3617 
3618 		if (MPTCP_SEQ_GEQ(m->m_pkthdr.mp_dsn, mp_tp->mpt_snduna) ||
3619 		    MPTCP_SEQ_GT(m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen, mp_tp->mpt_snduna)) {
3620 			break;
3621 		}
3622 
3623 		mpte->mpte_reinjectq = m->m_nextpkt;
3624 		m->m_nextpkt = NULL;
3625 		m_freem(m);
3626 	}
3627 }
3628 
3629 static ev_ret_t
mptcp_subflow_propagate_ev(struct mptses * mpte,struct mptsub * mpts,uint32_t * p_mpsofilt_hint,uint32_t event)3630 mptcp_subflow_propagate_ev(struct mptses *mpte, struct mptsub *mpts,
3631     uint32_t *p_mpsofilt_hint, uint32_t event)
3632 {
3633 	struct socket *mp_so, *so;
3634 	struct mptcb *mp_tp;
3635 
3636 	mp_so = mptetoso(mpte);
3637 	mp_tp = mpte->mpte_mptcb;
3638 	so = mpts->mpts_socket;
3639 
3640 	/*
3641 	 * We got an event for this subflow that might need to be propagated,
3642 	 * based on the state of the MPTCP connection.
3643 	 */
3644 	if (mp_tp->mpt_state < MPTCPS_ESTABLISHED ||
3645 	    (!(mp_tp->mpt_flags & MPTCPF_JOIN_READY) && !(mpts->mpts_flags & MPTSF_MP_READY)) ||
3646 	    ((mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) && (mpts->mpts_flags & MPTSF_ACTIVE))) {
3647 		mp_so->so_error = so->so_error;
3648 		*p_mpsofilt_hint |= event;
3649 	}
3650 
3651 	return MPTS_EVRET_OK;
3652 }
3653 
3654 /*
3655  * Handle SO_FILT_HINT_NOSRCADDR subflow socket event.
3656  */
3657 static ev_ret_t
mptcp_subflow_nosrcaddr_ev(struct mptses * mpte,struct mptsub * mpts,uint32_t * p_mpsofilt_hint,uint32_t event)3658 mptcp_subflow_nosrcaddr_ev(struct mptses *mpte, struct mptsub *mpts,
3659     uint32_t *p_mpsofilt_hint, uint32_t event)
3660 {
3661 	struct socket *mp_so;
3662 	struct tcpcb *tp;
3663 
3664 	mp_so = mptetoso(mpte);
3665 	tp = intotcpcb(sotoinpcb(mpts->mpts_socket));
3666 
3667 	/*
3668 	 * This overwrites any previous mpte_lost_aid to avoid storing
3669 	 * too much state when the typical case has only two subflows.
3670 	 */
3671 	mpte->mpte_flags |= MPTE_SND_REM_ADDR;
3672 	mpte->mpte_lost_aid = tp->t_local_aid;
3673 
3674 	/*
3675 	 * The subflow connection has lost its source address.
3676 	 */
3677 	mptcp_subflow_abort(mpts, EADDRNOTAVAIL);
3678 
3679 	if (mp_so->so_flags & SOF_NOADDRAVAIL) {
3680 		mptcp_subflow_propagate_ev(mpte, mpts, p_mpsofilt_hint, event);
3681 	}
3682 
3683 	return MPTS_EVRET_DELETE;
3684 }
3685 
3686 static ev_ret_t
mptcp_subflow_mpsuberror_ev(struct mptses * mpte,struct mptsub * mpts,uint32_t * p_mpsofilt_hint,uint32_t event)3687 mptcp_subflow_mpsuberror_ev(struct mptses *mpte, struct mptsub *mpts,
3688     uint32_t *p_mpsofilt_hint, uint32_t event)
3689 {
3690 #pragma unused(event, p_mpsofilt_hint)
3691 	struct socket *so, *mp_so;
3692 
3693 	so = mpts->mpts_socket;
3694 
3695 	if (so->so_error != ENODATA) {
3696 		return MPTS_EVRET_OK;
3697 	}
3698 
3699 
3700 	mp_so = mptetoso(mpte);
3701 
3702 	mp_so->so_error = ENODATA;
3703 
3704 	sorwakeup(mp_so);
3705 	sowwakeup(mp_so);
3706 
3707 	return MPTS_EVRET_OK;
3708 }
3709 
3710 
3711 /*
3712  * Handle SO_FILT_HINT_MPCANTRCVMORE subflow socket event that
3713  * indicates that the remote side sent a Data FIN
3714  */
3715 static ev_ret_t
mptcp_subflow_mpcantrcvmore_ev(struct mptses * mpte,struct mptsub * mpts,uint32_t * p_mpsofilt_hint,uint32_t event)3716 mptcp_subflow_mpcantrcvmore_ev(struct mptses *mpte, struct mptsub *mpts,
3717     uint32_t *p_mpsofilt_hint, uint32_t event)
3718 {
3719 #pragma unused(event, mpts)
3720 	struct mptcb *mp_tp = mpte->mpte_mptcb;
3721 
3722 	/*
3723 	 * We got a Data FIN for the MPTCP connection.
3724 	 * The FIN may arrive with data. The data is handed up to the
3725 	 * mptcp socket and the user is notified so that it may close
3726 	 * the socket if needed.
3727 	 */
3728 	if (mp_tp->mpt_state == MPTCPS_CLOSE_WAIT) {
3729 		*p_mpsofilt_hint |= SO_FILT_HINT_CANTRCVMORE;
3730 	}
3731 
3732 	return MPTS_EVRET_OK; /* keep the subflow socket around */
3733 }
3734 
3735 /*
3736  * Handle SO_FILT_HINT_MPFAILOVER subflow socket event
3737  */
static ev_ret_t
mptcp_subflow_failover_ev(struct mptses *mpte, struct mptsub *mpts,
    uint32_t *p_mpsofilt_hint, uint32_t event)
{
#pragma unused(event, p_mpsofilt_hint)
	struct mptsub *mpts_alt = NULL;
	struct socket *alt_so = NULL;
	struct socket *mp_so;
	int altpath_exists = 0;

	mp_so = mptetoso(mpte);
	os_log_info(mptcp_log_handle, "%s - %lx\n", __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));

	/* Queue this subflow's unacknowledged data for resend elsewhere. */
	mptcp_reinject_mbufs(mpts->mpts_socket);

	mpts_alt = mptcp_get_subflow(mpte, NULL);

	/* If there is no alternate eligible subflow, ignore the failover hint. */
	if (mpts_alt == NULL || mpts_alt == mpts) {
		os_log(mptcp_log_handle, "%s - %lx no alternate path\n", __func__,
		    (unsigned long)VM_KERNEL_ADDRPERM(mpte));

		/* Jumps to the 'done' label in the else-branch below. */
		goto done;
	}

	altpath_exists = 1;
	alt_so = mpts_alt->mpts_socket;
	if (mpts_alt->mpts_flags & MPTSF_FAILINGOVER) {
		/* All data acknowledged and no RTT spike */
		if (alt_so->so_snd.sb_cc == 0 && mptcp_no_rto_spike(alt_so)) {
			/* The candidate recovered - it may be used again. */
			mpts_alt->mpts_flags &= ~MPTSF_FAILINGOVER;
		} else {
			/* no alternate path available */
			altpath_exists = 0;
		}
	}

	if (altpath_exists) {
		/* Switch the active subflow to the alternate one. */
		mpts_alt->mpts_flags |= MPTSF_ACTIVE;

		mpte->mpte_active_sub = mpts_alt;
		mpts->mpts_flags |= MPTSF_FAILINGOVER;
		mpts->mpts_flags &= ~MPTSF_ACTIVE;

		os_log_info(mptcp_log_handle, "%s - %lx: switched from %d to %d\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpts->mpts_connid, mpts_alt->mpts_connid);

		mptcpstats_inc_switch(mpte, mpts);

		sowwakeup(alt_so);
	} else {
done:
		/* No usable alternate: stop trying to fail over. */
		mpts->mpts_socket->so_flags &= ~SOF_MP_TRYFAILOVER;
	}

	return MPTS_EVRET_OK;
}
3795 
3796 /*
3797  * Handle SO_FILT_HINT_IFDENIED subflow socket event.
3798  */
3799 static ev_ret_t
mptcp_subflow_ifdenied_ev(struct mptses * mpte,struct mptsub * mpts,uint32_t * p_mpsofilt_hint,uint32_t event)3800 mptcp_subflow_ifdenied_ev(struct mptses *mpte, struct mptsub *mpts,
3801     uint32_t *p_mpsofilt_hint, uint32_t event)
3802 {
3803 	/*
3804 	 * The subflow connection cannot use the outgoing interface, let's
3805 	 * close this subflow.
3806 	 */
3807 	mptcp_subflow_abort(mpts, EPERM);
3808 
3809 	mptcp_subflow_propagate_ev(mpte, mpts, p_mpsofilt_hint, event);
3810 
3811 	return MPTS_EVRET_DELETE;
3812 }
3813 
3814 /*
3815  * https://tools.ietf.org/html/rfc6052#section-2
3816  * https://tools.ietf.org/html/rfc6147#section-5.2
3817  */
/*
 * Extract the IPv4 address embedded in a NAT64-synthesized IPv6 address
 * (RFC 6052 section 2.2 layout).  prefix_len is a byte count (one of the
 * NAT64_PREFIX_LEN_* values); bits 64-71 (byte index 8) of the IPv6
 * address are reserved, which is why the copies below skip index 8.
 * Returns true and fills *addrv4 on a prefix match, false otherwise.
 */
static boolean_t
mptcp_desynthesize_ipv6_addr(struct mptses *mpte, const struct in6_addr *addr,
    const struct ipv6_prefix *prefix,
    struct in_addr *addrv4)
{
	char buf[MAX_IPv4_STR_LEN];
	char *ptrv4 = (char *)addrv4;
	const char *ptr = (const char *)addr;

	/* Address must start with this NAT64 prefix. */
	if (memcmp(addr, &prefix->ipv6_prefix, prefix->prefix_len) != 0) {
		return false;
	}

	switch (prefix->prefix_len) {
	case NAT64_PREFIX_LEN_96:
		/* v4 address occupies bytes 12-15 */
		memcpy(ptrv4, ptr + 12, 4);
		break;
	case NAT64_PREFIX_LEN_64:
		/* v4 address occupies bytes 9-12 (byte 8 is reserved) */
		memcpy(ptrv4, ptr + 9, 4);
		break;
	case NAT64_PREFIX_LEN_56:
		/* v4 address split across byte 7 and bytes 9-11 */
		memcpy(ptrv4, ptr + 7, 1);
		memcpy(ptrv4 + 1, ptr + 9, 3);
		break;
	case NAT64_PREFIX_LEN_48:
		/* v4 address split across bytes 6-7 and bytes 9-10 */
		memcpy(ptrv4, ptr + 6, 2);
		memcpy(ptrv4 + 2, ptr + 9, 2);
		break;
	case NAT64_PREFIX_LEN_40:
		/* v4 address split across bytes 5-7 and byte 9 */
		memcpy(ptrv4, ptr + 5, 3);
		memcpy(ptrv4 + 3, ptr + 9, 1);
		break;
	case NAT64_PREFIX_LEN_32:
		/* v4 address occupies bytes 4-7 */
		memcpy(ptrv4, ptr + 4, 4);
		break;
	default:
		panic("NAT64-prefix len is wrong: %u",
		    prefix->prefix_len);
	}

	os_log_info(mptcp_log_handle, "%s - %lx: desynthesized to %s\n", __func__,
	    (unsigned long)VM_KERNEL_ADDRPERM(mpte),
	    inet_ntop(AF_INET, (void *)addrv4, buf, sizeof(buf)));

	return true;
}
3864 
3865 static void
mptcp_handle_ipv6_connection(struct mptses * mpte,const struct mptsub * mpts)3866 mptcp_handle_ipv6_connection(struct mptses *mpte, const struct mptsub *mpts)
3867 {
3868 	struct ipv6_prefix nat64prefixes[NAT64_MAX_NUM_PREFIXES];
3869 	struct socket *so = mpts->mpts_socket;
3870 	struct ifnet *ifp;
3871 	int j;
3872 
3873 	/* Subflow IPs will be steered directly by the server - no need to
3874 	 * desynthesize.
3875 	 */
3876 	if (mpte->mpte_flags & MPTE_UNICAST_IP) {
3877 		return;
3878 	}
3879 
3880 	ifp = sotoinpcb(so)->inp_last_outifp;
3881 
3882 	if (ifnet_get_nat64prefix(ifp, nat64prefixes) == ENOENT) {
3883 		return;
3884 	}
3885 
3886 	for (j = 0; j < NAT64_MAX_NUM_PREFIXES; j++) {
3887 		int success;
3888 
3889 		if (nat64prefixes[j].prefix_len == 0) {
3890 			continue;
3891 		}
3892 
3893 		success = mptcp_desynthesize_ipv6_addr(mpte,
3894 		    &mpte->__mpte_dst_v6.sin6_addr,
3895 		    &nat64prefixes[j],
3896 		    &mpte->mpte_sub_dst_v4.sin_addr);
3897 		if (success) {
3898 			mpte->mpte_sub_dst_v4.sin_len = sizeof(mpte->mpte_sub_dst_v4);
3899 			mpte->mpte_sub_dst_v4.sin_family = AF_INET;
3900 			mpte->mpte_sub_dst_v4.sin_port = mpte->__mpte_dst_v6.sin6_port;
3901 
3902 			/*
3903 			 * We connected to a NAT64'ed address. Let's remove it
3904 			 * from the potential IPs to use. Whenever we are back on
3905 			 * that network and need to connect, we can synthesize again.
3906 			 *
3907 			 * Otherwise, on different IPv6 networks we will attempt
3908 			 * to connect to that NAT64 address...
3909 			 */
3910 			memset(&mpte->mpte_sub_dst_v6, 0, sizeof(mpte->mpte_sub_dst_v6));
3911 			break;
3912 		}
3913 	}
3914 }
3915 
/*
 * A subflow attempt on this destination failed to do MPTCP.  Either retry
 * the connection on the configured alternate port (if we have not tried it
 * yet), or mark the egress interface as not supporting MPTCP so no further
 * subflows are attempted over it.
 */
static void
mptcp_try_alternate_port(struct mptses *mpte, struct mptsub *mpts)
{
	struct inpcb *inp;

	if (!mptcp_ok_to_create_subflows(mpte->mpte_mptcb)) {
		return;
	}

	inp = sotoinpcb(mpts->mpts_socket);
	if (inp == NULL) {
		return;
	}

	/* Should we try the alternate port? */
	if (mpte->mpte_alternate_port &&
	    inp->inp_fport != mpte->mpte_alternate_port) {
		union sockaddr_in_4_6 dst;
		struct sockaddr_in *dst_in = (struct sockaddr_in *)&dst;

		/* Copy the failed subflow's destination, patch only the port.
		 * sin_port lines up with sin6_port in the union, so the cast
		 * works for both address families. */
		memcpy(&dst, &mpts->mpts_dst, mpts->mpts_dst.sa_len);

		dst_in->sin_port = mpte->mpte_alternate_port;

		mptcp_subflow_add(mpte, NULL, (struct sockaddr *)&dst,
		    mpts->mpts_ifscope, NULL);
	} else { /* Else, we tried all we could, mark this interface as non-MPTCP */
		unsigned int i;

		if (inp->inp_last_outifp == NULL) {
			return;
		}

		/* Find the interface-info slot for the egress interface */
		for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
			struct mpt_itf_info *info =  &mpte->mpte_itfinfo[i];

			if (inp->inp_last_outifp->if_index == info->ifindex) {
				info->no_mptcp_support = 1;
				break;
			}
		}
	}
}
3959 
/* If TFO data is successfully acked, it must be dropped from the mptcp so */
static void
mptcp_drop_tfo_data(struct mptses *mpte, struct mptsub *mpts)
{
	struct socket *mp_so = mptetoso(mpte);
	struct socket *so = mpts->mpts_socket;
	struct tcpcb *tp = intotcpcb(sotoinpcb(so));
	struct mptcb *mp_tp = mpte->mpte_mptcb;

	/* If data was sent with SYN, rewind state */
	if (tp->t_tfo_stats & TFO_S_SYN_DATA_ACKED) {
		/* Bytes sent at the MPTCP level that are not yet acked */
		u_int64_t mp_droplen = mp_tp->mpt_sndnxt - mp_tp->mpt_snduna;
		/* TCP-level bytes acked on the SYN; -1 discounts the SYN
		 * itself, which consumes one sequence number */
		unsigned int tcp_droplen = tp->snd_una - tp->iss - 1;

		VERIFY(mp_droplen <= (UINT_MAX));
		VERIFY(mp_droplen >= tcp_droplen);

		mpts->mpts_flags &= ~MPTSF_TFO_REQD;
		mpts->mpts_iss += tcp_droplen;
		tp->t_mpflags &= ~TMPF_TFO_REQUEST;

		if (mp_droplen > tcp_droplen) {
			/* handle partial TCP ack */
			mp_so->so_flags1 |= SOF1_TFO_REWIND;
			mp_tp->mpt_sndnxt = mp_tp->mpt_snduna + (mp_droplen - tcp_droplen);
			mp_droplen = tcp_droplen;
		} else {
			/* all data on SYN was acked */
			mpts->mpts_rel_seq = 1;
			mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;
		}
		mp_tp->mpt_sndmax -= tcp_droplen;

		/* Remove the acked bytes from the MPTCP-level send buffer */
		if (mp_droplen != 0) {
			VERIFY(mp_so->so_snd.sb_mb != NULL);
			sbdrop(&mp_so->so_snd, (int)mp_droplen);
		}
	}
}
3999 
/*
 * Handle SO_FILT_HINT_CONNECTED subflow socket event.
 *
 * Decides whether the freshly connected subflow negotiated MPTCP or is a
 * plain-TCP fallback, and updates MPTCP-level state accordingly.
 */
static ev_ret_t
mptcp_subflow_connected_ev(struct mptses *mpte, struct mptsub *mpts,
    uint32_t *p_mpsofilt_hint, uint32_t event)
{
#pragma unused(event, p_mpsofilt_hint)
	struct socket *mp_so, *so;
	struct inpcb *inp;
	struct tcpcb *tp;
	struct mptcb *mp_tp;
	int af;
	boolean_t mpok = FALSE;

	mp_so = mptetoso(mpte);
	mp_tp = mpte->mpte_mptcb;
	so = mpts->mpts_socket;
	tp = sototcpcb(so);
	af = mpts->mpts_dst.sa_family;

	/* Process the connected event only once */
	if (mpts->mpts_flags & MPTSF_CONNECTED) {
		return MPTS_EVRET_OK;
	}

	/* Subflow is already on its way down - nothing to establish */
	if ((mpts->mpts_flags & MPTSF_DISCONNECTED) ||
	    (mpts->mpts_flags & MPTSF_DISCONNECTING)) {
		return MPTS_EVRET_OK;
	}

	/*
	 * The subflow connection has been connected.  Find out whether it
	 * is connected as a regular TCP or as a MPTCP subflow.  The idea is:
	 *
	 *   a. If MPTCP connection is not yet established, then this must be
	 *	the first subflow connection.  If MPTCP failed to negotiate,
	 *	fallback to regular TCP by degrading this subflow.
	 *
	 *   b. If MPTCP connection has been established, then this must be
	 *	one of the subsequent subflow connections. If MPTCP failed
	 *	to negotiate, disconnect the connection.
	 *
	 * Right now, we simply unblock any waiters at the MPTCP socket layer
	 * if the MPTCP connection has not been established.
	 */

	if (so->so_state & SS_ISDISCONNECTED) {
		/*
		 * With MPTCP joins, a connection is connected at the subflow
		 * level, but the 4th ACK from the server elevates the MPTCP
		 * subflow to connected state. So there is a small window
		 * where the subflow could get disconnected before the
		 * connected event is processed.
		 */
		return MPTS_EVRET_OK;
	}

	/* Any TFO data acked on the SYN must be dropped from the MPTCP sb */
	if (mpts->mpts_flags & MPTSF_TFO_REQD) {
		mptcp_drop_tfo_data(mpte, mpts);
	}

	mpts->mpts_flags &= ~(MPTSF_CONNECTING | MPTSF_TFO_REQD);
	mpts->mpts_flags |= MPTSF_CONNECTED;

	if (tp->t_mpflags & TMPF_MPTCP_TRUE) {
		mpts->mpts_flags |= MPTSF_MP_CAPABLE;
	}

	tp->t_mpflags &= ~TMPF_TFO_REQUEST;

	/* get/verify the outbound interface */
	inp = sotoinpcb(so);

	mpts->mpts_maxseg = tp->t_maxseg;

	mpok = (mpts->mpts_flags & MPTSF_MP_CAPABLE);

	if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
		/* First subflow: the MPTCP connection becomes established */
		mp_tp->mpt_state = MPTCPS_ESTABLISHED;
		mpte->mpte_associd = mpts->mpts_connid;
		DTRACE_MPTCP2(state__change,
		    struct mptcb *, mp_tp,
		    uint32_t, 0 /* event */);

		/* Record the local source address of the first subflow */
		if (SOCK_DOM(so) == AF_INET) {
			in_getsockaddr_s(so, &mpte->__mpte_src_v4);
		} else {
			in6_getsockaddr_s(so, &mpte->__mpte_src_v6);
		}

		mpts->mpts_flags |= MPTSF_ACTIVE;

		/* case (a) above */
		if (!mpok) {
			/* Peer did not negotiate MPTCP - fall back to TCP */
			tcpstat.tcps_mpcap_fallback++;

			tp->t_mpflags |= TMPF_INFIN_SENT;
			mptcp_notify_mpfail(so);
		} else {
			if (IFNET_IS_CELLULAR(inp->inp_last_outifp) &&
			    mptcp_subflows_need_backup_flag(mpte)) {
				tp->t_mpflags |= (TMPF_BACKUP_PATH | TMPF_SND_MPPRIO);
			} else {
				mpts->mpts_flags |= MPTSF_PREFERRED;
			}
			mpts->mpts_flags |= MPTSF_MPCAP_CTRSET;
			mpte->mpte_nummpcapflows++;

			/* Undo NAT64 synthesis so IPv4 joins can be tried */
			if (SOCK_DOM(so) == AF_INET6) {
				mptcp_handle_ipv6_connection(mpte, mpts);
			}

			mptcp_check_subflows_and_add(mpte);

			if (IFNET_IS_CELLULAR(inp->inp_last_outifp)) {
				mpte->mpte_initial_cell = 1;
			}

			mpte->mpte_handshake_success = 1;
		}

		/* Seed the MPTCP-level send window from the subflow's */
		mp_tp->mpt_sndwnd = tp->snd_wnd;
		mp_tp->mpt_sndwl1 = mp_tp->mpt_rcvnxt;
		mp_tp->mpt_sndwl2 = mp_tp->mpt_snduna;
		soisconnected(mp_so);
	} else if (mpok) {
		/*
		 * case (b) above
		 * In case of additional flows, the MPTCP socket is not
		 * MPTSF_MP_CAPABLE until an ACK is received from server
		 * for 3-way handshake.  TCP would have guaranteed that this
		 * is an MPTCP subflow.
		 */
		if (IFNET_IS_CELLULAR(inp->inp_last_outifp) &&
		    !(tp->t_mpflags & TMPF_BACKUP_PATH) &&
		    mptcp_subflows_need_backup_flag(mpte)) {
			tp->t_mpflags |= (TMPF_BACKUP_PATH | TMPF_SND_MPPRIO);
			mpts->mpts_flags &= ~MPTSF_PREFERRED;
		} else {
			mpts->mpts_flags |= MPTSF_PREFERRED;
		}

		mpts->mpts_flags |= MPTSF_MPCAP_CTRSET;
		mpte->mpte_nummpcapflows++;

		mpts->mpts_rel_seq = 1;

		mptcp_check_subflows_and_remove(mpte);
	} else {
		/* Join failed: retry on the alternate port, then RST this
		 * subflow so it gets torn down */
		mptcp_try_alternate_port(mpte, mpts);

		tcpstat.tcps_join_fallback++;
		if (IFNET_IS_CELLULAR(inp->inp_last_outifp)) {
			tcpstat.tcps_mptcp_cell_proxy++;
		} else {
			tcpstat.tcps_mptcp_wifi_proxy++;
		}

		soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);

		return MPTS_EVRET_OK;
	}

	/* This call, just to "book" an entry in the stats-table for this ifindex */
	mptcpstats_get_index(mpte->mpte_itfstats, mpts);

	mptcp_output(mpte);

	return MPTS_EVRET_OK; /* keep the subflow socket around */
}
4170 
/*
 * Handle SO_FILT_HINT_DISCONNECTED subflow socket event.
 *
 * Marks the subflow disconnected, adjusts the capable-flow accounting,
 * and drops the whole MPTCP connection when appropriate.  Always returns
 * MPTS_EVRET_DELETE so the workloop closes the subflow socket.
 */
static ev_ret_t
mptcp_subflow_disconnected_ev(struct mptses *mpte, struct mptsub *mpts,
    uint32_t *p_mpsofilt_hint, uint32_t event)
{
#pragma unused(event, p_mpsofilt_hint)
	struct socket *mp_so, *so;
	struct mptcb *mp_tp;

	mp_so = mptetoso(mpte);
	mp_tp = mpte->mpte_mptcb;
	so = mpts->mpts_socket;

	/* Only process the disconnect once */
	if (mpts->mpts_flags & MPTSF_DISCONNECTED) {
		return MPTS_EVRET_DELETE;
	}

	mpts->mpts_flags |= MPTSF_DISCONNECTED;

	/* The subflow connection has been disconnected. */

	if (mpts->mpts_flags & MPTSF_MPCAP_CTRSET) {
		mpte->mpte_nummpcapflows--;
		if (mpte->mpte_active_sub == mpts) {
			mpte->mpte_active_sub = NULL;
		}
		mpts->mpts_flags &= ~MPTSF_MPCAP_CTRSET;
	} else {
		/* A secondary subflow that never connected: try the
		 * alternate port before writing off this interface */
		if (so->so_flags & SOF_MP_SEC_SUBFLOW &&
		    !(mpts->mpts_flags & MPTSF_CONNECTED)) {
			mptcp_try_alternate_port(mpte, mpts);
		}
	}

	/* Drop the whole connection if it never got established, or if we
	 * fell back to TCP on this (active) subflow */
	if (mp_tp->mpt_state < MPTCPS_ESTABLISHED ||
	    ((mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) && (mpts->mpts_flags & MPTSF_ACTIVE))) {
		mptcp_drop(mpte, mp_tp, so->so_error);
	}

	/*
	 * Clear flags that are used by getconninfo to return state.
	 * Retain like MPTSF_DELETEOK for internal purposes.
	 */
	mpts->mpts_flags &= ~(MPTSF_CONNECTING | MPTSF_CONNECT_PENDING |
	    MPTSF_CONNECTED | MPTSF_DISCONNECTING | MPTSF_PREFERRED |
	    MPTSF_MP_CAPABLE | MPTSF_MP_READY | MPTSF_MP_DEGRADED | MPTSF_ACTIVE);

	return MPTS_EVRET_DELETE;
}
4222 
/*
 * Handle SO_FILT_HINT_MPSTATUS subflow socket event
 *
 * Mirrors the subflow TCP's MPTCP negotiation status (capable / degraded /
 * ready) into the subflow flags, and escalates to a connection-wide
 * fallback (MPTS_EVRET_DISCONNECT_FALLBACK) or join-ready decision
 * (MPTS_EVRET_CONNECT_PENDING).
 */
static ev_ret_t
mptcp_subflow_mpstatus_ev(struct mptses *mpte, struct mptsub *mpts,
    uint32_t *p_mpsofilt_hint, uint32_t event)
{
#pragma unused(event, p_mpsofilt_hint)
	ev_ret_t ret = MPTS_EVRET_OK;
	struct socket *mp_so, *so;
	struct mptcb *mp_tp;

	mp_so = mptetoso(mpte);
	mp_tp = mpte->mpte_mptcb;
	so = mpts->mpts_socket;
	struct inpcb *inp = sotoinpcb(so);
	struct tcpcb *tp = intotcpcb(inp);

	/* Mirror the TCP-level negotiation state into the subflow flags */
	if (sototcpcb(so)->t_mpflags & TMPF_MPTCP_TRUE) {
		mpts->mpts_flags |= MPTSF_MP_CAPABLE;
	} else {
		mpts->mpts_flags &= ~MPTSF_MP_CAPABLE;
	}

	if (sototcpcb(so)->t_mpflags & TMPF_TCP_FALLBACK) {
		/* Degrade only once; further events are no-ops */
		if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
			goto done;
		}
		mpts->mpts_flags |= MPTSF_MP_DEGRADED;
	} else {
		mpts->mpts_flags &= ~MPTSF_MP_DEGRADED;
	}

	if (sototcpcb(so)->t_mpflags & TMPF_MPTCP_READY) {
		mpts->mpts_flags |= MPTSF_MP_READY;
	} else {
		mpts->mpts_flags &= ~MPTSF_MP_READY;
	}

	if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
		mp_tp->mpt_flags |= MPTCPF_FALLBACK_TO_TCP;
		mp_tp->mpt_flags &= ~MPTCPF_JOIN_READY;
		/* Presumably records the fallback in the per-destination
		 * heuristics cache so a different MPTCP version is tried
		 * next time - see tcp_cache. TODO confirm */
		tcp_cache_update_mptcp_version(tp, FALSE);
	}

	if (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) {
		ret = MPTS_EVRET_DISCONNECT_FALLBACK;

		/* Reinjected data is meaningless once we fell back to TCP */
		m_freem_list(mpte->mpte_reinjectq);
		mpte->mpte_reinjectq = NULL;
	} else if (mpts->mpts_flags & MPTSF_MP_READY) {
		mp_tp->mpt_flags |= MPTCPF_JOIN_READY;
		ret = MPTS_EVRET_CONNECT_PENDING;
	}

done:
	return ret;
}
4281 
/*
 * Handle SO_FILT_HINT_MUSTRST subflow socket event
 *
 * Sends a RST on the subflow and aborts it.  If the peer fast-closed the
 * MPTCP connection (and no fallback to TCP happened), every other subflow
 * is aborted and the whole MPTCP connection is dropped as well.
 */
static ev_ret_t
mptcp_subflow_mustrst_ev(struct mptses *mpte, struct mptsub *mpts,
    uint32_t *p_mpsofilt_hint, uint32_t event)
{
#pragma unused(event)
	struct socket *mp_so, *so;
	struct mptcb *mp_tp;
	boolean_t is_fastclose;

	mp_so = mptetoso(mpte);
	mp_tp = mpte->mpte_mptcb;
	so = mpts->mpts_socket;

	/* We got an invalid option or a fast close */
	struct inpcb *inp = sotoinpcb(so);
	struct tcpcb *tp = NULL;

	tp = intotcpcb(inp);
	so->so_error = ECONNABORTED;

	is_fastclose = !!(tp->t_mpflags & TMPF_FASTCLOSERCV);

	tp->t_mpflags |= TMPF_RESET;

	/* Actively send a RST to the peer while the subflow is not closed */
	if (tp->t_state != TCPS_CLOSED) {
		struct tcptemp *t_template = tcp_maketemplate(tp);

		if (t_template) {
			struct tcp_respond_args tra;

			bzero(&tra, sizeof(tra));
			/* Scope the RST to the bound interface, if any */
			if (inp->inp_flags & INP_BOUND_IF) {
				tra.ifscope = inp->inp_boundifp->if_index;
			} else {
				tra.ifscope = IFSCOPE_NONE;
			}
			tra.awdl_unrestricted = 1;

			tcp_respond(tp, t_template->tt_ipgen,
			    &t_template->tt_t, (struct mbuf *)NULL,
			    tp->rcv_nxt, tp->snd_una, TH_RST, &tra);
			(void) m_free(dtom(t_template));
		}
	}

	if (!(mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) && is_fastclose) {
		struct mptsub *iter, *tmp;

		*p_mpsofilt_hint |= SO_FILT_HINT_CONNRESET;

		mp_so->so_error = ECONNRESET;

		/* Fast close takes down every other subflow too */
		TAILQ_FOREACH_SAFE(iter, &mpte->mpte_subflows, mpts_entry, tmp) {
			if (iter == mpts) {
				continue;
			}
			mptcp_subflow_abort(iter, ECONNABORTED);
		}

		/*
		 * mptcp_drop is being called after processing the events, to fully
		 * close the MPTCP connection
		 */
		mptcp_drop(mpte, mp_tp, mp_so->so_error);
	}

	mptcp_subflow_abort(mpts, ECONNABORTED);

	/* Speed up garbage collection of this connection */
	if (mp_tp->mpt_gc_ticks == MPT_GC_TICKS) {
		mp_tp->mpt_gc_ticks = MPT_GC_TICKS_FAST;
	}

	return MPTS_EVRET_DELETE;
}
4359 
4360 static ev_ret_t
mptcp_subflow_adaptive_rtimo_ev(struct mptses * mpte,struct mptsub * mpts,uint32_t * p_mpsofilt_hint,uint32_t event)4361 mptcp_subflow_adaptive_rtimo_ev(struct mptses *mpte, struct mptsub *mpts,
4362     uint32_t *p_mpsofilt_hint, uint32_t event)
4363 {
4364 #pragma unused(event)
4365 	bool found_active = false;
4366 
4367 	mpts->mpts_flags |= MPTSF_READ_STALL;
4368 
4369 	TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
4370 		struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
4371 
4372 		if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
4373 		    TCPS_HAVERCVDFIN2(tp->t_state)) {
4374 			continue;
4375 		}
4376 
4377 		if (!(mpts->mpts_flags & MPTSF_READ_STALL)) {
4378 			found_active = true;
4379 			break;
4380 		}
4381 	}
4382 
4383 	if (!found_active) {
4384 		*p_mpsofilt_hint |= SO_FILT_HINT_ADAPTIVE_RTIMO;
4385 	}
4386 
4387 	return MPTS_EVRET_OK;
4388 }
4389 
4390 static ev_ret_t
mptcp_subflow_adaptive_wtimo_ev(struct mptses * mpte,struct mptsub * mpts,uint32_t * p_mpsofilt_hint,uint32_t event)4391 mptcp_subflow_adaptive_wtimo_ev(struct mptses *mpte, struct mptsub *mpts,
4392     uint32_t *p_mpsofilt_hint, uint32_t event)
4393 {
4394 #pragma unused(event)
4395 	bool found_active = false;
4396 
4397 	mpts->mpts_flags |= MPTSF_WRITE_STALL;
4398 
4399 	TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
4400 		struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
4401 
4402 		if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
4403 		    tp->t_state > TCPS_CLOSE_WAIT) {
4404 			continue;
4405 		}
4406 
4407 		if (!(mpts->mpts_flags & MPTSF_WRITE_STALL)) {
4408 			found_active = true;
4409 			break;
4410 		}
4411 	}
4412 
4413 	if (!found_active) {
4414 		*p_mpsofilt_hint |= SO_FILT_HINT_ADAPTIVE_WTIMO;
4415 	}
4416 
4417 	return MPTS_EVRET_OK;
4418 }
4419 
/*
 * Issues SOPT_SET on an MPTCP subflow socket; socket must already be locked,
 * caller must ensure that the option can be issued on subflow sockets, via
 * MPOF_SUBFLOW_OK flag.
 *
 * Returns 0 on success (including when the option is deliberately skipped),
 * otherwise the error from sosetoptlock().
 */
int
mptcp_subflow_sosetopt(struct mptses *mpte, struct mptsub *mpts, struct mptopt *mpo)
{
	struct socket *mp_so, *so;
	struct sockopt sopt;
	int error;

	VERIFY(mpo->mpo_flags & MPOF_SUBFLOW_OK);

	mp_so = mptetoso(mpte);
	so = mpts->mpts_socket;

	socket_lock_assert_owned(mp_so);

	if (mpte->mpte_mptcb->mpt_state >= MPTCPS_ESTABLISHED &&
	    mpo->mpo_level == SOL_SOCKET &&
	    mpo->mpo_name == SO_MARK_CELLFALLBACK) {
		struct ifnet *ifp = ifindex2ifnet[mpts->mpts_ifscope];

		/*
		 * When we open a new subflow, mark it as cell fallback, if
		 * this subflow goes over cell.
		 *
		 * (except for first-party apps)
		 */

		if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
			return 0;
		}

		/* Egress interface is known and not cellular - not a fallback */
		if (sotoinpcb(so)->inp_last_outifp &&
		    !IFNET_IS_CELLULAR(sotoinpcb(so)->inp_last_outifp)) {
			return 0;
		}

		/*
		 * This here is an OR, because if the app is not binding to the
		 * interface, then it definitely is not a cell-fallback
		 * connection.
		 */
		if (mpts->mpts_ifscope == IFSCOPE_NONE || ifp == NULL ||
		    !IFNET_IS_CELLULAR(ifp)) {
			return 0;
		}
	}

	/* The option is being applied now, so it is no longer pending */
	mpo->mpo_flags &= ~MPOF_INTERIM;

	bzero(&sopt, sizeof(sopt));
	sopt.sopt_dir = SOPT_SET;
	sopt.sopt_level = mpo->mpo_level;
	sopt.sopt_name = mpo->mpo_name;
	sopt.sopt_val = CAST_USER_ADDR_T(&mpo->mpo_intval);
	sopt.sopt_valsize = sizeof(int);
	sopt.sopt_p = kernproc;

	error = sosetoptlock(so, &sopt, 0);
	if (error) {
		os_log_error(mptcp_log_handle, "%s - %lx: sopt %s "
		    "val %d set error %d\n", __func__,
		    (unsigned long)VM_KERNEL_ADDRPERM(mpte),
		    mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name),
		    mpo->mpo_intval, error);
	}
	return error;
}
4491 
4492 /*
4493  * Issues SOPT_GET on an MPTCP subflow socket; socket must already be locked,
4494  * caller must ensure that the option can be issued on subflow sockets, via
4495  * MPOF_SUBFLOW_OK flag.
4496  */
4497 int
mptcp_subflow_sogetopt(struct mptses * mpte,struct socket * so,struct mptopt * mpo)4498 mptcp_subflow_sogetopt(struct mptses *mpte, struct socket *so,
4499     struct mptopt *mpo)
4500 {
4501 	struct socket *mp_so;
4502 	struct sockopt sopt;
4503 	int error;
4504 
4505 	VERIFY(mpo->mpo_flags & MPOF_SUBFLOW_OK);
4506 	mp_so = mptetoso(mpte);
4507 
4508 	socket_lock_assert_owned(mp_so);
4509 
4510 	bzero(&sopt, sizeof(sopt));
4511 	sopt.sopt_dir = SOPT_GET;
4512 	sopt.sopt_level = mpo->mpo_level;
4513 	sopt.sopt_name = mpo->mpo_name;
4514 	sopt.sopt_val = CAST_USER_ADDR_T(&mpo->mpo_intval);
4515 	sopt.sopt_valsize = sizeof(int);
4516 	sopt.sopt_p = kernproc;
4517 
4518 	error = sogetoptlock(so, &sopt, 0);     /* already locked */
4519 	if (error) {
4520 		os_log_error(mptcp_log_handle,
4521 		    "%s - %lx: sopt %s get error %d\n",
4522 		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
4523 		    mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name), error);
4524 	}
4525 	return error;
4526 }
4527 
4528 
/*
 * MPTCP garbage collector.
 *
 * This routine is called by the MP domain on-demand, periodic callout,
 * which is triggered when a MPTCP socket is closed.  The callout will
 * repeat as long as this routine returns a non-zero value.
 */
static uint32_t
mptcp_gc(struct mppcbinfo *mppi)
{
	struct mppcb *mpp, *tmpp;
	uint32_t active = 0;    /* PCBs that still need another GC pass */

	LCK_MTX_ASSERT(&mppi->mppi_lock, LCK_MTX_ASSERT_OWNED);

	TAILQ_FOREACH_SAFE(mpp, &mppi->mppi_pcbs, mpp_entry, tmpp) {
		struct socket *mp_so;
		struct mptses *mpte;
		struct mptcb *mp_tp;

		mp_so = mpp->mpp_socket;
		mpte = mptompte(mpp);
		mp_tp = mpte->mpte_mptcb;

		/* Could not lock this PCB right now - retry on the next pass */
		if (!mpp_try_lock(mpp)) {
			active++;
			continue;
		}

		VERIFY(mpp->mpp_flags & MPP_ATTACHED);

		/* check again under the lock */
		if (mp_so->so_usecount > 0) {
			boolean_t wakeup = FALSE;
			struct mptsub *mpts, *tmpts;

			/*
			 * Count down the grace period; when it reaches zero,
			 * nudge all subflows with a DISCONNECTED event so
			 * the remaining references get released.
			 */
			if (mp_tp->mpt_state >= MPTCPS_FIN_WAIT_1) {
				if (mp_tp->mpt_gc_ticks > 0) {
					mp_tp->mpt_gc_ticks--;
				}
				if (mp_tp->mpt_gc_ticks == 0) {
					wakeup = TRUE;
				}
			}
			if (wakeup) {
				TAILQ_FOREACH_SAFE(mpts,
				    &mpte->mpte_subflows, mpts_entry, tmpts) {
					mptcp_subflow_eupcall1(mpts->mpts_socket,
					    mpts, SO_FILT_HINT_DISCONNECTED);
				}
			}
			socket_unlock(mp_so, 0);
			active++;
			continue;
		}

		if (mpp->mpp_state != MPPCB_STATE_DEAD) {
			panic("%s - %lx: skipped state "
			    "[u=%d,r=%d,s=%d]\n", __func__,
			    (unsigned long)VM_KERNEL_ADDRPERM(mpte),
			    mp_so->so_usecount, mp_so->so_retaincnt,
			    mpp->mpp_state);
		}

		if (mp_tp->mpt_state == MPTCPS_TIME_WAIT) {
			mptcp_close(mpte, mp_tp);
		}

		/* No references left - tear the session down for good */
		mptcp_session_destroy(mpte);

		DTRACE_MPTCP4(dispose, struct socket *, mp_so,
		    struct sockbuf *, &mp_so->so_rcv,
		    struct sockbuf *, &mp_so->so_snd,
		    struct mppcb *, mpp);

		mptcp_pcbdispose(mpp);
		sodealloc(mp_so);
	}

	return active;
}
4610 
4611 /*
4612  * Drop a MPTCP connection, reporting the specified error.
4613  */
4614 struct mptses *
mptcp_drop(struct mptses * mpte,struct mptcb * mp_tp,u_short errno)4615 mptcp_drop(struct mptses *mpte, struct mptcb *mp_tp, u_short errno)
4616 {
4617 	struct socket *mp_so = mptetoso(mpte);
4618 
4619 	VERIFY(mpte->mpte_mptcb == mp_tp);
4620 
4621 	socket_lock_assert_owned(mp_so);
4622 
4623 	DTRACE_MPTCP2(state__change, struct mptcb *, mp_tp,
4624 	    uint32_t, 0 /* event */);
4625 
4626 	if (errno == ETIMEDOUT && mp_tp->mpt_softerror != 0) {
4627 		errno = mp_tp->mpt_softerror;
4628 	}
4629 	mp_so->so_error = errno;
4630 
4631 	return mptcp_close(mpte, mp_tp);
4632 }
4633 
4634 /*
4635  * Close a MPTCP control block.
4636  */
4637 struct mptses *
mptcp_close(struct mptses * mpte,struct mptcb * mp_tp)4638 mptcp_close(struct mptses *mpte, struct mptcb *mp_tp)
4639 {
4640 	struct mptsub *mpts = NULL, *tmpts = NULL;
4641 	struct socket *mp_so = mptetoso(mpte);
4642 
4643 	socket_lock_assert_owned(mp_so);
4644 	VERIFY(mpte->mpte_mptcb == mp_tp);
4645 
4646 	mp_tp->mpt_state = MPTCPS_TERMINATE;
4647 
4648 	mptcp_freeq(mp_tp);
4649 
4650 	soisdisconnected(mp_so);
4651 
4652 	/* Clean up all subflows */
4653 	TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
4654 		mptcp_subflow_disconnect(mpte, mpts);
4655 	}
4656 
4657 	return NULL;
4658 }
4659 
4660 void
mptcp_notify_close(struct socket * so)4661 mptcp_notify_close(struct socket *so)
4662 {
4663 	soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_DISCONNECTED));
4664 }
4665 
/*
 * Maps one subflow socket-filter hint bit to the handler that processes
 * it.  A handler consumes the event and may add hints to *p_mpsofilt_hint,
 * which is later propagated to the MPTCP-level socket.
 */
typedef struct mptcp_subflow_event_entry {
	uint32_t    sofilt_hint_mask;   /* the SO_FILT_HINT_* bit handled by this entry */
	ev_ret_t    (*sofilt_hint_ev_hdlr)(
		struct mptses *mpte,
		struct mptsub *mpts,
		uint32_t *p_mpsofilt_hint,
		uint32_t event);
} mptsub_ev_entry_t;
4674 
/*
 * XXX The order of the event handlers below is really
 * really important. Think twice before changing it.
 *
 * mptcp_subflow_events() walks this table front to back; note that the
 * error/reset style entries (sub-error, can't-rcv-more, failover,
 * reset) appear ahead of CONNECTED/MPSTATUS/DISCONNECTED.
 */
static mptsub_ev_entry_t mpsub_ev_entry_tbl[] = {
	{
		.sofilt_hint_mask = SO_FILT_HINT_MP_SUB_ERROR,
		.sofilt_hint_ev_hdlr = mptcp_subflow_mpsuberror_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_MPCANTRCVMORE,
		.sofilt_hint_ev_hdlr =  mptcp_subflow_mpcantrcvmore_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_MPFAILOVER,
		.sofilt_hint_ev_hdlr = mptcp_subflow_failover_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_CONNRESET,
		.sofilt_hint_ev_hdlr = mptcp_subflow_propagate_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_MUSTRST,
		.sofilt_hint_ev_hdlr = mptcp_subflow_mustrst_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_CANTRCVMORE,
		.sofilt_hint_ev_hdlr = mptcp_subflow_propagate_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_TIMEOUT,
		.sofilt_hint_ev_hdlr = mptcp_subflow_propagate_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_NOSRCADDR,
		.sofilt_hint_ev_hdlr = mptcp_subflow_nosrcaddr_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_IFDENIED,
		.sofilt_hint_ev_hdlr = mptcp_subflow_ifdenied_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_CONNECTED,
		.sofilt_hint_ev_hdlr = mptcp_subflow_connected_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_MPSTATUS,
		.sofilt_hint_ev_hdlr = mptcp_subflow_mpstatus_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_DISCONNECTED,
		.sofilt_hint_ev_hdlr = mptcp_subflow_disconnected_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_ADAPTIVE_RTIMO,
		.sofilt_hint_ev_hdlr = mptcp_subflow_adaptive_rtimo_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_ADAPTIVE_WTIMO,
		.sofilt_hint_ev_hdlr = mptcp_subflow_adaptive_wtimo_ev,
	},
};
4737 
/*
 * Subflow socket control events.
 *
 * Called for handling events related to the underlying subflow socket.
 * Walks mpsub_ev_entry_tbl in order, clearing each pending hint bit from
 * mpts_evctl and invoking its handler; the most severe handler verdict
 * (MAX over non-error results, or the first error) is returned, except
 * that DISCONNECTED is always processed so the app gets woken up.
 */
static ev_ret_t
mptcp_subflow_events(struct mptses *mpte, struct mptsub *mpts,
    uint32_t *p_mpsofilt_hint)
{
	ev_ret_t ret = MPTS_EVRET_OK;
	int i, mpsub_ev_entry_count = sizeof(mpsub_ev_entry_tbl) /
	    sizeof(mpsub_ev_entry_tbl[0]);

	/* bail if there's nothing to process */
	if (!mpts->mpts_evctl) {
		return ret;
	}

	/* Any failure-type event also triggers a failover evaluation */
	if (mpts->mpts_evctl & (SO_FILT_HINT_CONNRESET | SO_FILT_HINT_MUSTRST |
	    SO_FILT_HINT_CANTSENDMORE | SO_FILT_HINT_TIMEOUT |
	    SO_FILT_HINT_NOSRCADDR | SO_FILT_HINT_IFDENIED |
	    SO_FILT_HINT_DISCONNECTED)) {
		mpts->mpts_evctl |= SO_FILT_HINT_MPFAILOVER;
	}

	DTRACE_MPTCP3(subflow__events, struct mptses *, mpte,
	    struct mptsub *, mpts, uint32_t, mpts->mpts_evctl);

	/*
	 * Process all the socket filter hints and reset the hint
	 * once it is handled
	 */
	for (i = 0; i < mpsub_ev_entry_count && mpts->mpts_evctl; i++) {
		/*
		 * Always execute the DISCONNECTED event, because it will wakeup
		 * the app.
		 */
		if ((mpts->mpts_evctl & mpsub_ev_entry_tbl[i].sofilt_hint_mask) &&
		    (ret >= MPTS_EVRET_OK ||
		    mpsub_ev_entry_tbl[i].sofilt_hint_mask == SO_FILT_HINT_DISCONNECTED)) {
			mpts->mpts_evctl &= ~mpsub_ev_entry_tbl[i].sofilt_hint_mask;
			ev_ret_t error =
			    mpsub_ev_entry_tbl[i].sofilt_hint_ev_hdlr(mpte, mpts, p_mpsofilt_hint, mpsub_ev_entry_tbl[i].sofilt_hint_mask);
			ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
		}
	}

	return ret;
}
4787 
/*
 * MPTCP workloop.
 *
 * Drains and dispatches the pending socket-filter events of every subflow,
 * then applies the aggregated outcome (fallback to TCP, or pending joins).
 * Reentrant invocations are coalesced via MPTE_IN_WORKLOOP and
 * MPTE_WORKLOOP_RELAUNCH.
 */
void
mptcp_subflow_workloop(struct mptses *mpte)
{
	boolean_t connect_pending = FALSE, disconnect_fallback = FALSE;
	uint32_t mpsofilt_hint_mask = SO_FILT_HINT_LOCKED;
	struct mptsub *mpts, *tmpts;
	struct socket *mp_so;

	mp_so = mptetoso(mpte);

	socket_lock_assert_owned(mp_so);

	/* Nested call: flag a relaunch and let the outer run redo the work */
	if (mpte->mpte_flags & MPTE_IN_WORKLOOP) {
		mpte->mpte_flags |= MPTE_WORKLOOP_RELAUNCH;
		return;
	}
	mpte->mpte_flags |= MPTE_IN_WORKLOOP;

relaunch:
	mpte->mpte_flags &= ~MPTE_WORKLOOP_RELAUNCH;

	TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
		ev_ret_t ret;

		if (mpts->mpts_socket->so_usecount == 0) {
			/* Will be removed soon by tcp_garbage_collect */
			continue;
		}

		/* Hold the subflow and its socket across event processing */
		mptcp_subflow_addref(mpts);
		mpts->mpts_socket->so_usecount++;

		ret = mptcp_subflow_events(mpte, mpts, &mpsofilt_hint_mask);

		/*
		 * If MPTCP socket is closed, disconnect all subflows.
		 * This will generate a disconnect event which will
		 * be handled during the next iteration, causing a
		 * non-zero error to be returned above.
		 */
		if (mp_so->so_flags & SOF_PCBCLEARING) {
			mptcp_subflow_disconnect(mpte, mpts);
		}

		switch (ret) {
		case MPTS_EVRET_OK:
			/* nothing to do */
			break;
		case MPTS_EVRET_DELETE:
			mptcp_subflow_soclose(mpts);
			break;
		case MPTS_EVRET_CONNECT_PENDING:
			connect_pending = TRUE;
			break;
		case MPTS_EVRET_DISCONNECT_FALLBACK:
			disconnect_fallback = TRUE;
			break;
		default:
			break;
		}
		mptcp_subflow_remref(mpts);             /* ours */

		VERIFY(mpts->mpts_socket->so_usecount != 0);
		mpts->mpts_socket->so_usecount--;
	}

	/* Propagate any accumulated event hints to the MPTCP socket */
	if (mpsofilt_hint_mask != SO_FILT_HINT_LOCKED) {
		VERIFY(mpsofilt_hint_mask & SO_FILT_HINT_LOCKED);

		if (mpsofilt_hint_mask & SO_FILT_HINT_CANTRCVMORE) {
			mp_so->so_state |= SS_CANTRCVMORE;
			sorwakeup(mp_so);
		}

		soevent(mp_so, mpsofilt_hint_mask);
	}

	if (!connect_pending && !disconnect_fallback) {
		goto exit;
	}

	TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
		if (disconnect_fallback) {
			struct socket *so = NULL;
			struct inpcb *inp = NULL;
			struct tcpcb *tp = NULL;

			if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
				continue;
			}

			mpts->mpts_flags |= MPTSF_MP_DEGRADED;

			if (mpts->mpts_flags & (MPTSF_DISCONNECTING |
			    MPTSF_DISCONNECTED)) {
				continue;
			}

			so = mpts->mpts_socket;

			/*
			 * The MPTCP connection has degraded to a fallback
			 * mode, so there is no point in keeping this subflow
			 * regardless of its MPTCP-readiness state, unless it
			 * is the primary one which we use for fallback.  This
			 * assumes that the subflow used for fallback is the
			 * ACTIVE one.
			 */

			inp = sotoinpcb(so);
			tp = intotcpcb(inp);
			tp->t_mpflags &=
			    ~(TMPF_MPTCP_READY | TMPF_MPTCP_TRUE);
			tp->t_mpflags |= TMPF_TCP_FALLBACK;

			soevent(so, SO_FILT_HINT_MUSTRST);
		} else if (connect_pending) {
			/*
			 * The MPTCP connection has progressed to a state
			 * where it supports full multipath semantics; allow
			 * additional joins to be attempted for all subflows
			 * that are in the PENDING state.
			 */
			if (mpts->mpts_flags & MPTSF_CONNECT_PENDING) {
				int error = mptcp_subflow_soconnectx(mpte, mpts);

				if (error) {
					mptcp_subflow_abort(mpts, error);
				}
			}
		}
	}

exit:
	/* Rerun if another workloop request arrived while we were busy */
	if (mpte->mpte_flags & MPTE_WORKLOOP_RELAUNCH) {
		goto relaunch;
	}

	mpte->mpte_flags &= ~MPTE_IN_WORKLOOP;
}
4931 
/*
 * Protocol pr_lock callback.
 *
 * Locks the MPTCP PCB, optionally taking a usecount reference, and
 * records the caller's return address in the lock-debug history.
 * Always returns 0.
 */
int
mptcp_lock(struct socket *mp_so, int refcount, void *lr)
{
	struct mppcb *mpp = mpsotomppcb(mp_so);
	void *lr_saved;

	/* Capture the caller's return address if none was supplied */
	if (lr == NULL) {
		lr_saved = __builtin_return_address(0);
	} else {
		lr_saved = lr;
	}

	if (mpp == NULL) {
		panic("%s: so=%p NO PCB! lr=%p lrh= %s", __func__,
		    mp_so, lr_saved, solockhistory_nr(mp_so));
		/* NOTREACHED */
	}
	mpp_lock(mpp);

	if (mp_so->so_usecount < 0) {
		panic("%s: so=%p so_pcb=%p lr=%p ref=%x lrh= %s", __func__,
		    mp_so, mp_so->so_pcb, lr_saved, mp_so->so_usecount,
		    solockhistory_nr(mp_so));
		/* NOTREACHED */
	}
	if (refcount != 0) {
		mp_so->so_usecount++;
		mpp->mpp_inside++;
	}
	/* Record this acquisition in the lock-debug history ring */
	mp_so->lock_lr[mp_so->next_lock_lr] = lr_saved;
	mp_so->next_lock_lr = (mp_so->next_lock_lr + 1) % SO_LCKDBG_MAX;

	return 0;
}
4969 
4970 /*
4971  * Protocol pr_unlock callback.
4972  */
int
mptcp_unlock(struct socket *mp_so, int refcount, void *lr)
{
	struct mppcb *mpp = mpsotomppcb(mp_so);
	void *lr_saved;

	/* Use the caller's return address when no unlock-requestor PC given */
	if (lr == NULL) {
		lr_saved = __builtin_return_address(0);
	} else {
		lr_saved = lr;
	}

	/* A socket without a PCB can no longer be unlocked; fatal */
	if (mpp == NULL) {
		panic("%s: so=%p NO PCB usecount=%x lr=%p lrh= %s", __func__,
		    mp_so, mp_so->so_usecount, lr_saved,
		    solockhistory_nr(mp_so));
		/* NOTREACHED */
	}
	socket_lock_assert_owned(mp_so);

	/* Drop the use-count reference when requested by the caller */
	if (refcount != 0) {
		mp_so->so_usecount--;
		mpp->mpp_inside--;
	}

	/* Both counters must stay non-negative; going below zero means
	 * an unbalanced lock/unlock pair somewhere */
	if (mp_so->so_usecount < 0) {
		panic("%s: so=%p usecount=%x lrh= %s", __func__,
		    mp_so, mp_so->so_usecount, solockhistory_nr(mp_so));
		/* NOTREACHED */
	}
	if (mpp->mpp_inside < 0) {
		panic("%s: mpp=%p inside=%x lrh= %s", __func__,
		    mpp, mpp->mpp_inside, solockhistory_nr(mp_so));
		/* NOTREACHED */
	}
	/* Record this release in the socket's unlock-debugging ring */
	mp_so->unlock_lr[mp_so->next_unlock_lr] = lr_saved;
	mp_so->next_unlock_lr = (mp_so->next_unlock_lr + 1) % SO_LCKDBG_MAX;
	mpp_unlock(mpp);

	return 0;
}
5014 
5015 /*
5016  * Protocol pr_getlock callback.
5017  */
lck_mtx_t *
mptcp_getlock(struct socket *mp_so, int flags)
{
	struct mppcb *mpp = mpsotomppcb(mp_so);

	/* Same sanity checks as in mptcp_lock()/mptcp_unlock() */
	if (mpp == NULL) {
		panic("%s: so=%p NULL so_pcb %s", __func__, mp_so,
		    solockhistory_nr(mp_so));
		/* NOTREACHED */
	}
	if (mp_so->so_usecount < 0) {
		panic("%s: so=%p usecount=%x lrh= %s", __func__,
		    mp_so, mp_so->so_usecount, solockhistory_nr(mp_so));
		/* NOTREACHED */
	}
	/* Hand back the underlying mutex of the MPTCP PCB */
	return mpp_getlock(mpp, flags);
}
5035 
5036 void
mptcp_get_rands(mptcp_addr_id addr_id,struct mptcb * mp_tp,u_int32_t * lrand,u_int32_t * rrand)5037 mptcp_get_rands(mptcp_addr_id addr_id, struct mptcb *mp_tp, u_int32_t *lrand,
5038     u_int32_t *rrand)
5039 {
5040 	struct mptcp_subf_auth_entry *sauth_entry;
5041 
5042 	LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
5043 		if (sauth_entry->msae_laddr_id == addr_id) {
5044 			if (lrand) {
5045 				*lrand = sauth_entry->msae_laddr_rand;
5046 			}
5047 			if (rrand) {
5048 				*rrand = sauth_entry->msae_raddr_rand;
5049 			}
5050 			break;
5051 		}
5052 	}
5053 }
5054 
void
mptcp_set_raddr_rand(mptcp_addr_id laddr_id, struct mptcb *mp_tp,
    mptcp_addr_id raddr_id, u_int32_t raddr_rand)
{
	struct mptcp_subf_auth_entry *sauth_entry;

	/*
	 * Record the remote address id and remote random value in the
	 * auth entry matching laddr_id.  Conflicting values already
	 * stored are logged and left untouched.
	 */
	LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
		if (sauth_entry->msae_laddr_id == laddr_id) {
			/* A different, already-set raddr_id is inconsistent;
			 * keep the existing value */
			if ((sauth_entry->msae_raddr_id != 0) &&
			    (sauth_entry->msae_raddr_id != raddr_id)) {
				os_log_error(mptcp_log_handle, "%s - %lx: mismatched"
				    " address ids %d %d \n", __func__, (unsigned long)VM_KERNEL_ADDRPERM(mp_tp->mpt_mpte),
				    raddr_id, sauth_entry->msae_raddr_id);
				return;
			}
			sauth_entry->msae_raddr_id = raddr_id;
			/* A differing random value means a duplicate SYN/ACK;
			 * keep the first value seen */
			if ((sauth_entry->msae_raddr_rand != 0) &&
			    (sauth_entry->msae_raddr_rand != raddr_rand)) {
				os_log_error(mptcp_log_handle, "%s - %lx: "
				    "dup SYN_ACK %d %d \n",
				    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mp_tp->mpt_mpte),
				    raddr_rand, sauth_entry->msae_raddr_rand);
				return;
			}
			sauth_entry->msae_raddr_rand = raddr_rand;
			return;
		}
	}
}
5084 
5085 /*
5086  * SHA-256 support for MPTCP
5087  */
5088 
5089 static void
mptcp_do_sha256(mptcp_key_t * key,char * sha_digest)5090 mptcp_do_sha256(mptcp_key_t *key, char *sha_digest)
5091 {
5092 	const unsigned char *sha2_base;
5093 	int sha2_size;
5094 
5095 	sha2_base = (const unsigned char *) key;
5096 	sha2_size = sizeof(mptcp_key_t);
5097 
5098 	SHA256_CTX sha_ctx;
5099 	SHA256_Init(&sha_ctx);
5100 	SHA256_Update(&sha_ctx, sha2_base, sha2_size);
5101 	SHA256_Final(sha_digest, &sha_ctx);
5102 }
5103 
void
mptcp_hmac_sha256(mptcp_key_t key1, mptcp_key_t key2,
    u_char *msg, uint16_t msg_len, u_char *digest)
{
	SHA256_CTX sha_ctx;
	mptcp_key_t key_ipad[8] = {0}; /* key XOR'd with inner pad */
	mptcp_key_t key_opad[8] = {0}; /* key XOR'd with outer pad */
	int i;

	/*
	 * HMAC-SHA256 over msg, keyed with the concatenation key1||key2
	 * (standard RFC 2104 construction).  digest must provide
	 * SHA256_DIGEST_LENGTH bytes.
	 */
	bzero(digest, SHA256_DIGEST_LENGTH);

	/* Set up the Key for HMAC */
	key_ipad[0] = key1;
	key_ipad[1] = key2;

	key_opad[0] = key1;
	key_opad[1] = key2;

	/* Key (128 bits) is shorter than the 512-bit block length,
	 * so no need to hash it down first */

	/* Compute SHA256(Key XOR opad, SHA256(Key XOR ipad, msg)) */

	for (i = 0; i < 8; i++) {
		key_ipad[i] ^= 0x3636363636363636;
		key_opad[i] ^= 0x5c5c5c5c5c5c5c5c;
	}

	/* Perform inner SHA256 */
	SHA256_Init(&sha_ctx);
	SHA256_Update(&sha_ctx, (unsigned char *)key_ipad, sizeof(key_ipad));
	SHA256_Update(&sha_ctx, msg, msg_len);
	SHA256_Final(digest, &sha_ctx);

	/* Perform outer SHA256 (over the inner digest, in place) */
	SHA256_Init(&sha_ctx);
	SHA256_Update(&sha_ctx, (unsigned char *)key_opad, sizeof(key_opad));
	SHA256_Update(&sha_ctx, (unsigned char *)digest, SHA256_DIGEST_LENGTH);
	SHA256_Final(digest, &sha_ctx);
}
5143 
5144 /*
5145  * SHA1 support for MPTCP
5146  */
5147 
5148 static void
mptcp_do_sha1(mptcp_key_t * key,char * sha_digest)5149 mptcp_do_sha1(mptcp_key_t *key, char *sha_digest)
5150 {
5151 	SHA1_CTX sha1ctxt;
5152 	const unsigned char *sha1_base;
5153 	int sha1_size;
5154 
5155 	sha1_base = (const unsigned char *) key;
5156 	sha1_size = sizeof(mptcp_key_t);
5157 	SHA1Init(&sha1ctxt);
5158 	SHA1Update(&sha1ctxt, sha1_base, sha1_size);
5159 	SHA1Final(sha_digest, &sha1ctxt);
5160 }
5161 
void
mptcp_hmac_sha1(mptcp_key_t key1, mptcp_key_t key2,
    u_int32_t rand1, u_int32_t rand2, u_char *digest)
{
	SHA1_CTX  sha1ctxt;
	mptcp_key_t key_ipad[8] = {0}; /* key XOR'd with inner pad */
	mptcp_key_t key_opad[8] = {0}; /* key XOR'd with outer pad */
	u_int32_t data[2];
	int i;

	/*
	 * HMAC-SHA1 keyed with key1||key2 over the 8-byte message
	 * rand1||rand2 (RFC 2104 construction).  digest must provide
	 * SHA1_RESULTLEN bytes.
	 */
	bzero(digest, SHA1_RESULTLEN);

	/* Set up the Key for HMAC */
	key_ipad[0] = key1;
	key_ipad[1] = key2;

	key_opad[0] = key1;
	key_opad[1] = key2;

	/* Set up the message for HMAC */
	data[0] = rand1;
	data[1] = rand2;

	/* Key (128 bits) is shorter than the 512-bit block length,
	 * so no need to compute hash of the key first */

	/* Compute SHA1(Key XOR opad, SHA1(Key XOR ipad, data)) */

	for (i = 0; i < 8; i++) {
		key_ipad[i] ^= 0x3636363636363636;
		key_opad[i] ^= 0x5c5c5c5c5c5c5c5c;
	}

	/* Perform inner SHA1 */
	SHA1Init(&sha1ctxt);
	SHA1Update(&sha1ctxt, (unsigned char *)key_ipad, sizeof(key_ipad));
	SHA1Update(&sha1ctxt, (unsigned char *)data, sizeof(data));
	SHA1Final(digest, &sha1ctxt);

	/* Perform outer SHA1 (over the inner digest, in place) */
	SHA1Init(&sha1ctxt);
	SHA1Update(&sha1ctxt, (unsigned char *)key_opad, sizeof(key_opad));
	SHA1Update(&sha1ctxt, (unsigned char *)digest, SHA1_RESULTLEN);
	SHA1Final(digest, &sha1ctxt);
}
5206 
5207 /*
5208  * corresponds to MAC-B = MAC (Key=(Key-B+Key-A), Msg=(R-B+R-A))
5209  * corresponds to MAC-A = MAC (Key=(Key-A+Key-B), Msg=(R-A+R-B))
5210  */
void
mptcp_get_mpjoin_hmac(mptcp_addr_id aid, struct mptcb *mp_tp, u_char *digest, uint8_t digest_len)
{
	uint32_t lrand, rrand;

	/* Fetch the local/remote randoms recorded for this address id */
	lrand = rrand = 0;
	mptcp_get_rands(aid, mp_tp, &lrand, &rrand);

	/* v0 uses HMAC-SHA1; later versions use HMAC-SHA256.  The caller
	 * receives only the leading digest_len bytes of the result. */
	u_char full_digest[MAX(SHA1_RESULTLEN, SHA256_DIGEST_LENGTH)] = {0};
	if (mp_tp->mpt_version == MPTCP_VERSION_0) {
		mptcp_hmac_sha1(mp_tp->mpt_localkey, mp_tp->mpt_remotekey, lrand, rrand, full_digest);
	} else {
		/* Message is lrand (4 bytes) followed by rrand (4 bytes) */
		uint32_t data[2];
		data[0] = lrand;
		data[1] = rrand;
		mptcp_hmac_sha256(mp_tp->mpt_localkey, mp_tp->mpt_remotekey, (u_char*)data, 8, full_digest);
	}
	bcopy(full_digest, digest, digest_len);
}
5230 
5231 /*
5232  * Authentication data generation
5233  */
5234 static void
mptcp_generate_token(char * sha_digest,int sha_digest_len,caddr_t token,int token_len)5235 mptcp_generate_token(char *sha_digest, int sha_digest_len, caddr_t token,
5236     int token_len)
5237 {
5238 	VERIFY(token_len == sizeof(u_int32_t));
5239 	VERIFY(sha_digest_len == SHA1_RESULTLEN ||
5240 	    sha_digest_len == SHA256_DIGEST_LENGTH);
5241 
5242 	/* Most significant 32 bits of the SHA1/SHA256 hash */
5243 	bcopy(sha_digest, token, sizeof(u_int32_t));
5244 	return;
5245 }
5246 
5247 static void
mptcp_generate_idsn(char * sha_digest,int sha_digest_len,caddr_t idsn,int idsn_len,uint8_t mp_version)5248 mptcp_generate_idsn(char *sha_digest, int sha_digest_len, caddr_t idsn,
5249     int idsn_len, uint8_t mp_version)
5250 {
5251 	VERIFY(idsn_len == sizeof(u_int64_t));
5252 	VERIFY(sha_digest_len == SHA1_RESULTLEN ||
5253 	    sha_digest_len == SHA256_DIGEST_LENGTH);
5254 	VERIFY(mp_version == MPTCP_VERSION_0 || mp_version == MPTCP_VERSION_1);
5255 
5256 	/*
5257 	 * Least significant 64 bits of the hash
5258 	 */
5259 
5260 	if (mp_version == MPTCP_VERSION_0) {
5261 		idsn[7] = sha_digest[12];
5262 		idsn[6] = sha_digest[13];
5263 		idsn[5] = sha_digest[14];
5264 		idsn[4] = sha_digest[15];
5265 		idsn[3] = sha_digest[16];
5266 		idsn[2] = sha_digest[17];
5267 		idsn[1] = sha_digest[18];
5268 		idsn[0] = sha_digest[19];
5269 	} else {
5270 		idsn[7] = sha_digest[24];
5271 		idsn[6] = sha_digest[25];
5272 		idsn[5] = sha_digest[26];
5273 		idsn[4] = sha_digest[27];
5274 		idsn[3] = sha_digest[28];
5275 		idsn[2] = sha_digest[29];
5276 		idsn[1] = sha_digest[30];
5277 		idsn[0] = sha_digest[31];
5278 	}
5279 	return;
5280 }
5281 
static void
mptcp_conn_properties(struct mptcb *mp_tp)
{
	/* Set DSS checksum flag when the global toggle is enabled */
	if (mptcp_dss_csum) {
		mp_tp->mpt_flags |= MPTCPF_CHECKSUM;
	}

	/* Set up receive window from the current MPTCP-level space */
	mp_tp->mpt_rcvwnd = mptcp_sbspace(mp_tp);

	/* Set up gc ticks */
	mp_tp->mpt_gc_ticks = MPT_GC_TICKS;
}
5296 
static void
mptcp_init_local_parms(struct mptses *mpte, struct sockaddr* dst)
{
	struct mptcb *mp_tp = mpte->mpte_mptcb;
	char key_digest[MAX(SHA1_RESULTLEN, SHA256_DIGEST_LENGTH)];
	uint16_t digest_len;

	/*
	 * Pick the MPTCP version: explicit forcing via session flags
	 * wins, otherwise ask tcp_cache_get_mptcp_version() for this
	 * destination.
	 */
	if (mpte->mpte_flags & MPTE_FORCE_V0 || !mptcp_enable_v1) {
		mp_tp->mpt_version = MPTCP_VERSION_0;
	} else if (mpte->mpte_flags & MPTE_FORCE_V1 && mptcp_enable_v1) {
		mp_tp->mpt_version = MPTCP_VERSION_1;
	} else {
		mp_tp->mpt_version = tcp_cache_get_mptcp_version(dst);
	}
	VERIFY(mp_tp->mpt_version == MPTCP_VERSION_0 ||
	    mp_tp->mpt_version == MPTCP_VERSION_1);

	/* Local key is random; token and IDSN are derived from its hash
	 * (SHA-1 for v0, SHA-256 for v1) */
	read_frandom(&mp_tp->mpt_localkey, sizeof(mp_tp->mpt_localkey));
	if (mp_tp->mpt_version == MPTCP_VERSION_0) {
		digest_len = SHA1_RESULTLEN;
		mptcp_do_sha1(&mp_tp->mpt_localkey, key_digest);
	} else {
		digest_len = SHA256_DIGEST_LENGTH;
		mptcp_do_sha256(&mp_tp->mpt_localkey, key_digest);
	}

	mptcp_generate_token(key_digest, digest_len,
	    (caddr_t)&mp_tp->mpt_localtoken, sizeof(mp_tp->mpt_localtoken));
	mptcp_generate_idsn(key_digest, digest_len,
	    (caddr_t)&mp_tp->mpt_local_idsn, sizeof(u_int64_t), mp_tp->mpt_version);
	/* The subflow SYN is also first MPTCP byte */
	mp_tp->mpt_snduna = mp_tp->mpt_sndmax = mp_tp->mpt_local_idsn + 1;
	mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;

	mptcp_conn_properties(mp_tp);
}
5333 
int
mptcp_init_remote_parms(struct mptcb *mp_tp)
{
	/* Setup local and remote tokens and Initial DSNs */
	char remote_digest[MAX(SHA1_RESULTLEN, SHA256_DIGEST_LENGTH)];
	uint16_t digest_len;

	/* Hash the peer's key: SHA-1 for v0, SHA-256 for v1 */
	if (mp_tp->mpt_version == MPTCP_VERSION_0) {
		digest_len = SHA1_RESULTLEN;
		mptcp_do_sha1(&mp_tp->mpt_remotekey, remote_digest);
	} else if (mp_tp->mpt_version == MPTCP_VERSION_1) {
		digest_len = SHA256_DIGEST_LENGTH;
		mptcp_do_sha256(&mp_tp->mpt_remotekey, remote_digest);
	} else {
		/* Unknown version: report failure to the caller */
		return -1;
	}

	/* Remote token and IDSN are derived from the peer-key hash */
	mptcp_generate_token(remote_digest, digest_len,
	    (caddr_t)&mp_tp->mpt_remotetoken, sizeof(mp_tp->mpt_remotetoken));
	mptcp_generate_idsn(remote_digest, digest_len,
	    (caddr_t)&mp_tp->mpt_remote_idsn, sizeof(u_int64_t), mp_tp->mpt_version);
	/* First expected data byte follows the remote IDSN */
	mp_tp->mpt_rcvnxt = mp_tp->mpt_remote_idsn + 1;
	mp_tp->mpt_rcvadv = mp_tp->mpt_rcvnxt + mp_tp->mpt_rcvwnd;
	return 0;
}
5359 
5360 static void
mptcp_send_dfin(struct socket * so)5361 mptcp_send_dfin(struct socket *so)
5362 {
5363 	struct tcpcb *tp = NULL;
5364 	struct inpcb *inp = NULL;
5365 
5366 	inp = sotoinpcb(so);
5367 	if (!inp) {
5368 		return;
5369 	}
5370 
5371 	tp = intotcpcb(inp);
5372 	if (!tp) {
5373 		return;
5374 	}
5375 
5376 	if (!(tp->t_mpflags & TMPF_RESET)) {
5377 		tp->t_mpflags |= TMPF_SEND_DFIN;
5378 	}
5379 }
5380 
5381 /*
5382  * Data Sequence Mapping routines
5383  */
5384 void
mptcp_insert_dsn(struct mppcb * mpp,struct mbuf * m)5385 mptcp_insert_dsn(struct mppcb *mpp, struct mbuf *m)
5386 {
5387 	struct mptcb *mp_tp;
5388 
5389 	if (m == NULL) {
5390 		return;
5391 	}
5392 
5393 	__IGNORE_WCASTALIGN(mp_tp = &((struct mpp_mtp *)mpp)->mtcb);
5394 
5395 	while (m) {
5396 		VERIFY(m->m_flags & M_PKTHDR);
5397 		m->m_pkthdr.pkt_flags |= (PKTF_MPTCP | PKTF_MPSO);
5398 		m->m_pkthdr.mp_dsn = mp_tp->mpt_sndmax;
5399 		VERIFY(m_pktlen(m) >= 0 && m_pktlen(m) < UINT16_MAX);
5400 		m->m_pkthdr.mp_rlen = (uint16_t)m_pktlen(m);
5401 		mp_tp->mpt_sndmax += m_pktlen(m);
5402 		m = m->m_next;
5403 	}
5404 }
5405 
void
mptcp_fallback_sbdrop(struct socket *so, struct mbuf *m, int len)
{
	struct mptcb *mp_tp = tptomptp(sototcpcb(so));
	uint64_t data_ack;
	uint64_t dsn;

	VERIFY(len >= 0);

	if (!m || len == 0) {
		return;
	}

	/*
	 * Derive the implied MPTCP-level DATA_ACK from the subflow bytes
	 * being dropped from the send buffer (in fallback mode there is
	 * no explicit DATA_ACK).  Walk the chain covering the acked range.
	 */
	while (m && len > 0) {
		VERIFY(m->m_flags & M_PKTHDR);
		VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);

		/* Optimistically assume the whole mapping got acked;
		 * corrected below if we stopped short */
		data_ack = m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen;
		dsn = m->m_pkthdr.mp_dsn;

		len -= m->m_len;
		m = m->m_next;
	}

	if (m && len == 0) {
		/*
		 * If there is one more mbuf in the chain, it automatically means
		 * that up to m->mp_dsn has been ack'ed.
		 *
		 * This means, we actually correct data_ack back down (compared
		 * to what we set inside the loop - dsn + data_len). Because in
		 * the loop we are "optimistic" and assume that the full mapping
		 * will be acked. If that's not the case and we get out of the
		 * loop with m != NULL, it means only up to m->mp_dsn has been
		 * really acked.
		 */
		data_ack = m->m_pkthdr.mp_dsn;
	}

	if (len < 0) {
		/*
		 * If len is negative, meaning we acked in the middle of an mbuf,
		 * only up to this mbuf's data-sequence number has been acked
		 * at the MPTCP-level.
		 */
		data_ack = dsn;
	}

	/* We can have data in the subflow's send-queue that is being acked,
	 * while the DATA_ACK has already advanced. Thus, we should check whether
	 * or not the DATA_ACK is actually new here.
	 */
	if (MPTCP_SEQ_LEQ(data_ack, mp_tp->mpt_sndmax) &&
	    MPTCP_SEQ_GEQ(data_ack, mp_tp->mpt_snduna)) {
		mptcp_data_ack_rcvd(mp_tp, sototcpcb(so), data_ack);
	}
}
5463 
void
mptcp_preproc_sbdrop(struct socket *so, struct mbuf *m, unsigned int len)
{
	int rewinding = 0;

	/* TFO makes things complicated. */
	if (so->so_flags1 & SOF1_TFO_REWIND) {
		rewinding = 1;
		so->so_flags1 &= ~SOF1_TFO_REWIND;
	}

	/* Advance each mbuf's DSN mapping past the len bytes being dropped.
	 * Normally runs only for non-subflow sockets; for subflows only
	 * during a TFO rewind. */
	while (m && (!(so->so_flags & SOF_MP_SUBFLOW) || rewinding)) {
		u_int32_t sub_len;
		VERIFY(m->m_flags & M_PKTHDR);
		VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);

		sub_len = m->m_pkthdr.mp_rlen;

		if (sub_len < len) {
			/* This mapping is consumed entirely */
			m->m_pkthdr.mp_dsn += sub_len;
			if (!(m->m_pkthdr.pkt_flags & PKTF_MPSO)) {
				m->m_pkthdr.mp_rseq += sub_len;
			}
			m->m_pkthdr.mp_rlen = 0;
			len -= sub_len;
		} else {
			/* sub_len >= len */
			if (rewinding == 0) {
				m->m_pkthdr.mp_dsn += len;
			}
			if (!(m->m_pkthdr.pkt_flags & PKTF_MPSO)) {
				if (rewinding == 0) {
					m->m_pkthdr.mp_rseq += len;
				}
			}
			m->m_pkthdr.mp_rlen -= len;
			break;
		}
		m = m->m_next;
	}

	if (so->so_flags & SOF_MP_SUBFLOW &&
	    !(sototcpcb(so)->t_mpflags & TMPF_TFO_REQUEST) &&
	    !(sototcpcb(so)->t_mpflags & TMPF_RCVD_DACK)) {
		/*
		 * Received an ack without receiving a DATA_ACK.
		 * Need to fallback to regular TCP (or destroy this subflow).
		 */
		sototcpcb(so)->t_mpflags |= TMPF_INFIN_SENT;
		mptcp_notify_mpfail(so);
	}
}
5516 
5517 /* Obtain the DSN mapping stored in the mbuf */
5518 void
mptcp_output_getm_dsnmap32(struct socket * so,int off,uint32_t * dsn,uint32_t * relseq,uint16_t * data_len,uint16_t * dss_csum)5519 mptcp_output_getm_dsnmap32(struct socket *so, int off,
5520     uint32_t *dsn, uint32_t *relseq, uint16_t *data_len, uint16_t *dss_csum)
5521 {
5522 	u_int64_t dsn64;
5523 
5524 	mptcp_output_getm_dsnmap64(so, off, &dsn64, relseq, data_len, dss_csum);
5525 	*dsn = (u_int32_t)MPTCP_DATASEQ_LOW32(dsn64);
5526 }
5527 
5528 void
mptcp_output_getm_dsnmap64(struct socket * so,int off,uint64_t * dsn,uint32_t * relseq,uint16_t * data_len,uint16_t * dss_csum)5529 mptcp_output_getm_dsnmap64(struct socket *so, int off, uint64_t *dsn,
5530     uint32_t *relseq, uint16_t *data_len,
5531     uint16_t *dss_csum)
5532 {
5533 	struct mbuf *m = so->so_snd.sb_mb;
5534 
5535 	VERIFY(off >= 0);
5536 
5537 	if (m == NULL && (so->so_flags & SOF_DEFUNCT)) {
5538 		*dsn = 0;
5539 		*relseq = 0;
5540 		*data_len = 0;
5541 		*dss_csum = 0;
5542 		return;
5543 	}
5544 
5545 	/*
5546 	 * In the subflow socket, the DSN sequencing can be discontiguous,
5547 	 * but the subflow sequence mapping is contiguous. Use the subflow
5548 	 * sequence property to find the right mbuf and corresponding dsn
5549 	 * mapping.
5550 	 */
5551 
5552 	while (m) {
5553 		VERIFY(m->m_flags & M_PKTHDR);
5554 		VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);
5555 
5556 		if (off >= m->m_len) {
5557 			off -= m->m_len;
5558 			m = m->m_next;
5559 		} else {
5560 			break;
5561 		}
5562 	}
5563 
5564 	VERIFY(off >= 0);
5565 	VERIFY(m->m_pkthdr.mp_rlen <= UINT16_MAX);
5566 
5567 	*dsn = m->m_pkthdr.mp_dsn;
5568 	*relseq = m->m_pkthdr.mp_rseq;
5569 	*data_len = m->m_pkthdr.mp_rlen;
5570 	*dss_csum = m->m_pkthdr.mp_csum;
5571 }
5572 
5573 void
mptcp_output_getm_data_level_details(struct socket * so,int off,uint16_t * data_len,uint16_t * dss_csum)5574 mptcp_output_getm_data_level_details(struct socket *so, int off, uint16_t *data_len, uint16_t *dss_csum)
5575 {
5576 	uint64_t dsn;
5577 	uint32_t relseq;
5578 
5579 	mptcp_output_getm_dsnmap64(so, off, &dsn, &relseq, data_len, dss_csum);
5580 }
5581 
5582 /*
5583  * Note that this is called only from tcp_input() via mptcp_input_preproc()
5584  * tcp_input() may trim data after the dsn mapping is inserted into the mbuf.
5585  * When it trims data tcp_input calls m_adj() which does not remove the
5586  * m_pkthdr even if the m_len becomes 0 as a result of trimming the mbuf.
5587  * The dsn map insertion cannot be delayed after trim, because data can be in
5588  * the reassembly queue for a while and the DSN option info in tp will be
5589  * overwritten for every new packet received.
5590  * The dsn map will be adjusted just prior to appending to subflow sockbuf
5591  * with mptcp_adj_rmap()
5592  */
void
mptcp_insert_rmap(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th)
{
	VERIFY(m->m_flags & M_PKTHDR);
	VERIFY(!(m->m_pkthdr.pkt_flags & PKTF_MPTCP));

	if (tp->t_mpflags & TMPF_EMBED_DSN) {
		/* Copy the DSS mapping parsed from the TCP options (stored
		 * in t_rcv_map) into the mbuf's packet header */
		m->m_pkthdr.mp_dsn = tp->t_rcv_map.mpt_dsn;
		m->m_pkthdr.mp_rseq = tp->t_rcv_map.mpt_sseq;
		m->m_pkthdr.mp_rlen = tp->t_rcv_map.mpt_len;
		m->m_pkthdr.mp_csum = tp->t_rcv_map.mpt_csum;
		if (tp->t_rcv_map.mpt_dfin) {
			m->m_pkthdr.pkt_flags |= PKTF_MPTCP_DFIN;
		}

		m->m_pkthdr.pkt_flags |= PKTF_MPTCP;

		/* Mapping consumed; schedule an MPTCP-level ACK */
		tp->t_mpflags &= ~TMPF_EMBED_DSN;
		tp->t_mpflags |= TMPF_MPTCP_ACKNOW;
	} else if (tp->t_mpflags & TMPF_TCP_FALLBACK) {
		/* In fallback mode a plain TCP FIN doubles as the DATA_FIN */
		if (th->th_flags & TH_FIN) {
			m->m_pkthdr.pkt_flags |= PKTF_MPTCP_DFIN;
		}
	}
}
5618 
5619 /*
5620  * Following routines help with failure detection and failover of data
5621  * transfer from one subflow to another.
5622  */
5623 void
mptcp_act_on_txfail(struct socket * so)5624 mptcp_act_on_txfail(struct socket *so)
5625 {
5626 	struct tcpcb *tp = NULL;
5627 	struct inpcb *inp = sotoinpcb(so);
5628 
5629 	if (inp == NULL) {
5630 		return;
5631 	}
5632 
5633 	tp = intotcpcb(inp);
5634 	if (tp == NULL) {
5635 		return;
5636 	}
5637 
5638 	if (so->so_flags & SOF_MP_TRYFAILOVER) {
5639 		return;
5640 	}
5641 
5642 	so->so_flags |= SOF_MP_TRYFAILOVER;
5643 	soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPFAILOVER));
5644 }
5645 
5646 /*
5647  * Support for MP_FAIL option
5648  */
int
mptcp_get_map_for_dsn(struct socket *so, uint64_t dsn_fail, uint32_t *tcp_seq)
{
	struct mbuf *m = so->so_snd.sb_mb;
	uint16_t datalen;
	uint64_t dsn;
	int off = 0;

	if (m == NULL) {
		return -1;
	}

	/* Find the mapping containing dsn_fail and translate it into the
	 * corresponding subflow (TCP-level) sequence number */
	while (m != NULL) {
		VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);
		VERIFY(m->m_flags & M_PKTHDR);
		dsn = m->m_pkthdr.mp_dsn;
		datalen = m->m_pkthdr.mp_rlen;
		if (MPTCP_SEQ_LEQ(dsn, dsn_fail) &&
		    (MPTCP_SEQ_GEQ(dsn + datalen, dsn_fail))) {
			off = (int)(dsn_fail - dsn);
			*tcp_seq = m->m_pkthdr.mp_rseq + off;
			return 0;
		}

		m = m->m_next;
	}

	/*
	 * If there was no mbuf data and a fallback to TCP occurred, there's
	 * not much else to do.
	 */

	os_log_error(mptcp_log_handle, "%s: %llu not found \n", __func__, dsn_fail);
	return -1;
}
5684 
5685 /*
5686  * Support for sending contiguous MPTCP bytes in subflow
5687  * Also for preventing sending data with ACK in 3-way handshake
5688  */
int32_t
mptcp_adj_sendlen(struct socket *so, int32_t off)
{
	struct tcpcb *tp = sototcpcb(so);
	struct mptsub *mpts = tp->t_mpsub;
	uint64_t mdss_dsn;
	uint32_t mdss_subflow_seq;
	int mdss_subflow_off;
	uint16_t mdss_data_len;
	uint16_t dss_csum;

	if (so->so_snd.sb_mb == NULL && (so->so_flags & SOF_DEFUNCT)) {
		return 0;
	}

	/* Look up the DSS mapping covering byte 'off' of the send buffer */
	mptcp_output_getm_dsnmap64(so, off, &mdss_dsn, &mdss_subflow_seq,
	    &mdss_data_len, &dss_csum);

	/*
	 * We need to compute how much of the mapping still remains.
	 * So, we compute the offset in the send-buffer of the dss-sub-seq.
	 */
	mdss_subflow_off = (mdss_subflow_seq + mpts->mpts_iss) - tp->snd_una;

	/*
	 * When TFO is used, we are sending the mpts->mpts_iss although the relative
	 * seq has been set to 1 (while it should be 0).
	 */
	if (tp->t_mpflags & TMPF_TFO_REQUEST) {
		mdss_subflow_off--;
	}

	VERIFY(off >= mdss_subflow_off);

	/* Bytes of the mapping that remain from 'off' onward */
	return mdss_data_len - (off - mdss_subflow_off);
}
5725 
5726 static uint32_t
mptcp_get_maxseg(struct mptses * mpte)5727 mptcp_get_maxseg(struct mptses *mpte)
5728 {
5729 	struct mptsub *mpts;
5730 	uint32_t maxseg = 0;
5731 
5732 	TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
5733 		struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
5734 
5735 		if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
5736 		    TCPS_HAVERCVDFIN2(tp->t_state)) {
5737 			continue;
5738 		}
5739 
5740 		if (tp->t_maxseg > maxseg) {
5741 			maxseg = tp->t_maxseg;
5742 		}
5743 	}
5744 
5745 	return maxseg;
5746 }
5747 
5748 static uint8_t
mptcp_get_rcvscale(struct mptses * mpte)5749 mptcp_get_rcvscale(struct mptses *mpte)
5750 {
5751 	struct mptsub *mpts;
5752 	uint8_t rcvscale = UINT8_MAX;
5753 
5754 	TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
5755 		struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
5756 
5757 		if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
5758 		    TCPS_HAVERCVDFIN2(tp->t_state)) {
5759 			continue;
5760 		}
5761 
5762 		if (tp->rcv_scale < rcvscale) {
5763 			rcvscale = tp->rcv_scale;
5764 		}
5765 	}
5766 
5767 	return rcvscale;
5768 }
5769 
5770 /* Similar to tcp_sbrcv_reserve */
static void
mptcp_sbrcv_reserve(struct mptcb *mp_tp, struct sockbuf *sbrcv,
    u_int32_t newsize, u_int32_t idealsize)
{
	uint8_t rcvscale = mptcp_get_rcvscale(mp_tp->mpt_mpte);

	/* No usable subflow window scale; leave the buffer unchanged */
	if (rcvscale == UINT8_MAX) {
		return;
	}

	/* newsize should not exceed max */
	newsize = min(newsize, tcp_autorcvbuf_max);

	/* The receive window scale negotiated at the
	 * beginning of the connection will also set a
	 * limit on the socket buffer size
	 */
	newsize = min(newsize, TCP_MAXWIN << rcvscale);

	/* Set new socket buffer size */
	if (newsize > sbrcv->sb_hiwat &&
	    (sbreserve(sbrcv, newsize) == 1)) {
		sbrcv->sb_idealsize = min(max(sbrcv->sb_idealsize,
		    (idealsize != 0) ? idealsize : newsize), tcp_autorcvbuf_max);

		/* Again check the limit set by the advertised
		 * window scale
		 */
		sbrcv->sb_idealsize = min(sbrcv->sb_idealsize,
		    TCP_MAXWIN << rcvscale);
	}
}
5803 
void
mptcp_sbrcv_grow(struct mptcb *mp_tp)
{
	/* Grow the MPTCP-level receive buffer to track the subflows' buffers */
	struct mptses *mpte = mp_tp->mpt_mpte;
	struct socket *mp_so = mpte->mpte_mppcb->mpp_socket;
	struct sockbuf *sbrcv = &mp_so->so_rcv;
	uint32_t hiwat_sum = 0;
	uint32_t ideal_sum = 0;
	struct mptsub *mpts;

	/*
	 * Do not grow the receive socket buffer if
	 * - auto resizing is disabled, globally or on this socket
	 * - the high water mark already reached the maximum
	 * - the stream is in background and receive side is being
	 * throttled
	 * - if there are segments in reassembly queue indicating loss,
	 * do not need to increase recv window during recovery as more
	 * data is not going to be sent. A duplicate ack sent during
	 * recovery should not change the receive window
	 */
	if (tcp_do_autorcvbuf == 0 ||
	    (sbrcv->sb_flags & SB_AUTOSIZE) == 0 ||
	    tcp_cansbgrow(sbrcv) == 0 ||
	    sbrcv->sb_hiwat >= tcp_autorcvbuf_max ||
	    (mp_so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ||
	    !LIST_EMPTY(&mp_tp->mpt_segq)) {
		/* Can not resize the socket buffer, just return */
		return;
	}

	/*
	 * Ideally, we want the rbuf to be (sum_i {bw_i} * rtt_max * 2)
	 *
	 * But, for this we first need accurate receiver-RTT estimations, which
	 * we currently don't have.
	 *
	 * Let's use a dummy algorithm for now, just taking the sum of all
	 * subflow's receive-buffers. It's too low, but that's all we can get
	 * for now.
	 */

	TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
		hiwat_sum += mpts->mpts_socket->so_rcv.sb_hiwat;
		ideal_sum += mpts->mpts_socket->so_rcv.sb_idealsize;
	}

	mptcp_sbrcv_reserve(mp_tp, sbrcv, hiwat_sum, ideal_sum);
}
5853 
5854 /*
 * Determine if we can grow the receive socket buffer to avoid sending
5856  * a zero window update to the peer. We allow even socket buffers that
5857  * have fixed size (set by the application) to grow if the resource
5858  * constraints are met. They will also be trimmed after the application
5859  * reads data.
5860  *
5861  * Similar to tcp_sbrcv_grow_rwin
5862  */
static void
mptcp_sbrcv_grow_rwin(struct mptcb *mp_tp, struct sockbuf *sb)
{
	struct socket *mp_so = mp_tp->mpt_mpte->mpte_mppcb->mpp_socket;
	/* Grow increment: 16x the largest established-subflow MSS */
	u_int32_t rcvbufinc = mptcp_get_maxseg(mp_tp->mpt_mpte) << 4;
	u_int32_t rcvbuf = sb->sb_hiwat;

	/* Background (throttled) receivers do not grow the buffer */
	if (tcp_recv_bg == 1 || IS_TCP_RECV_BG(mp_so)) {
		return;
	}

	/* Grow only when the remaining space is below the increment and
	 * all resource limits still allow it */
	if (tcp_do_autorcvbuf == 1 &&
	    tcp_cansbgrow(sb) &&
	    /* Diff to tcp_sbrcv_grow_rwin */
	    (mp_so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) == 0 &&
	    (rcvbuf - sb->sb_cc) < rcvbufinc &&
	    rcvbuf < tcp_autorcvbuf_max &&
	    (sb->sb_idealsize > 0 &&
	    sb->sb_hiwat <= (sb->sb_idealsize + rcvbufinc))) {
		sbreserve(sb, min((sb->sb_hiwat + rcvbufinc), tcp_autorcvbuf_max));
	}
}
5885 
5886 /* Similar to tcp_sbspace */
int32_t
mptcp_sbspace(struct mptcb *mp_tp)
{
	struct sockbuf *sb = &mp_tp->mpt_mpte->mpte_mppcb->mpp_socket->so_rcv;
	uint32_t rcvbuf;
	int32_t space;
	int32_t pending = 0;

	socket_lock_assert_owned(mptetoso(mp_tp->mpt_mpte));

	/* Opportunistically grow the buffer before computing the space */
	mptcp_sbrcv_grow_rwin(mp_tp, sb);

	/* hiwat might have changed */
	rcvbuf = sb->sb_hiwat;

	/* Free space is bounded by both the byte and the mbuf limits */
	space =  ((int32_t) imin((rcvbuf - sb->sb_cc),
	    (sb->sb_mbmax - sb->sb_mbcnt)));
	if (space < 0) {
		space = 0;
	}

#if CONTENT_FILTER
	/* Compensate for data being processed by content filters */
	pending = cfil_sock_data_space(sb);
#endif /* CONTENT_FILTER */
	if (pending > space) {
		space = 0;
	} else {
		space -= pending;
	}

	return space;
}
5920 
5921 /*
5922  * Support Fallback to Regular TCP
5923  */
5924 void
mptcp_notify_mpready(struct socket * so)5925 mptcp_notify_mpready(struct socket *so)
5926 {
5927 	struct tcpcb *tp = NULL;
5928 
5929 	if (so == NULL) {
5930 		return;
5931 	}
5932 
5933 	tp = intotcpcb(sotoinpcb(so));
5934 
5935 	if (tp == NULL) {
5936 		return;
5937 	}
5938 
5939 	DTRACE_MPTCP4(multipath__ready, struct socket *, so,
5940 	    struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd,
5941 	    struct tcpcb *, tp);
5942 
5943 	if (!(tp->t_mpflags & TMPF_MPTCP_TRUE)) {
5944 		return;
5945 	}
5946 
5947 	if (tp->t_mpflags & TMPF_MPTCP_READY) {
5948 		return;
5949 	}
5950 
5951 	tp->t_mpflags &= ~TMPF_TCP_FALLBACK;
5952 	tp->t_mpflags |= TMPF_MPTCP_READY;
5953 
5954 	soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPSTATUS));
5955 }
5956 
5957 void
mptcp_notify_mpfail(struct socket * so)5958 mptcp_notify_mpfail(struct socket *so)
5959 {
5960 	struct tcpcb *tp = NULL;
5961 
5962 	if (so == NULL) {
5963 		return;
5964 	}
5965 
5966 	tp = intotcpcb(sotoinpcb(so));
5967 
5968 	if (tp == NULL) {
5969 		return;
5970 	}
5971 
5972 	DTRACE_MPTCP4(multipath__failed, struct socket *, so,
5973 	    struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd,
5974 	    struct tcpcb *, tp);
5975 
5976 	if (tp->t_mpflags & TMPF_TCP_FALLBACK) {
5977 		return;
5978 	}
5979 
5980 	tp->t_mpflags &= ~(TMPF_MPTCP_READY | TMPF_MPTCP_TRUE);
5981 	tp->t_mpflags |= TMPF_TCP_FALLBACK;
5982 
5983 	soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPSTATUS));
5984 }
5985 
5986 /*
5987  * Keepalive helper function
5988  */
5989 boolean_t
mptcp_ok_to_keepalive(struct mptcb * mp_tp)5990 mptcp_ok_to_keepalive(struct mptcb *mp_tp)
5991 {
5992 	boolean_t ret = 1;
5993 
5994 	socket_lock_assert_owned(mptetoso(mp_tp->mpt_mpte));
5995 
5996 	if (mp_tp->mpt_state >= MPTCPS_CLOSE_WAIT) {
5997 		ret = 0;
5998 	}
5999 	return ret;
6000 }
6001 
/*
 * MPTCP t_maxseg adjustment function
 *
 * Returns the number of bytes by which a subflow's MSS must be lowered
 * to leave room for MPTCP options on data segments, or 0 when the
 * tcpcb has no MPTCP PCB attached. @mtudisc selects the MTU-discovery
 * variant of the check.
 */
int
mptcp_adj_mss(struct tcpcb *tp, boolean_t mtudisc)
{
	int mss_lower = 0;
	struct mptcb *mp_tp = tptomptp(tp);

	/*
	 * Both branches add 2 on top of the DSS/ACK option: either for the
	 * 16-bit DSS checksum or, without checksums, as padding to a 32-bit
	 * boundary + EOL. The macro is kept (rather than a helper) because
	 * it reads mp_tp from the enclosing scope.
	 */
#define MPTCP_COMPUTE_LEN {                             \
	mss_lower = sizeof (struct mptcp_dss_ack_opt);  \
	if (mp_tp->mpt_flags & MPTCPF_CHECKSUM)         \
	        mss_lower += 2;                         \
	else                                            \
	/* adjust to 32-bit boundary + EOL */   \
	        mss_lower += 2;                         \
}
	if (mp_tp == NULL) {
		return 0;
	}

	socket_lock_assert_owned(mptetoso(mp_tp->mpt_mpte));

	/*
	 * For the first subflow and subsequent subflows, adjust mss for
	 * most common MPTCP option size, for case where tcp_mss is called
	 * during option processing and MTU discovery.
	 */
	if (!mtudisc) {
		/* Established initial flow (joins handled below) */
		if (tp->t_mpflags & TMPF_MPTCP_TRUE &&
		    !(tp->t_mpflags & TMPF_JOINED_FLOW)) {
			MPTCP_COMPUTE_LEN;
		}

		/* Join in progress on this subflow */
		if (tp->t_mpflags & TMPF_PREESTABLISHED &&
		    tp->t_mpflags & TMPF_SENT_JOIN) {
			MPTCP_COMPUTE_LEN;
		}
	} else {
		if (tp->t_mpflags & TMPF_MPTCP_TRUE) {
			MPTCP_COMPUTE_LEN;
		}
	}

	return mss_lower;
}
6048 
/*
 * Fill one mptcp_flow_t record (for the pcblist sysctl) from a subflow
 * socket and its mptsub. Endpoint addresses/ports are copied straight
 * from the subflow's inpcb; only IPv4 and IPv6 subflows populate them.
 */
static void
fill_mptcp_subflow(struct socket *so, mptcp_flow_t *flow, struct mptsub *mpts)
{
	struct inpcb *inp;

	tcp_getconninfo(so, &flow->flow_ci);
	inp = sotoinpcb(so);
	if ((inp->inp_vflag & INP_IPV6) != 0) {
		flow->flow_src.ss_family = AF_INET6;
		flow->flow_dst.ss_family = AF_INET6;
		flow->flow_src.ss_len = sizeof(struct sockaddr_in6);
		flow->flow_dst.ss_len = sizeof(struct sockaddr_in6);
		SIN6(&flow->flow_src)->sin6_port = inp->in6p_lport;
		SIN6(&flow->flow_dst)->sin6_port = inp->in6p_fport;
		SIN6(&flow->flow_src)->sin6_addr = inp->in6p_laddr;
		SIN6(&flow->flow_dst)->sin6_addr = inp->in6p_faddr;
	} else if ((inp->inp_vflag & INP_IPV4) != 0) {
		flow->flow_src.ss_family = AF_INET;
		flow->flow_dst.ss_family = AF_INET;
		flow->flow_src.ss_len = sizeof(struct sockaddr_in);
		flow->flow_dst.ss_len = sizeof(struct sockaddr_in);
		SIN(&flow->flow_src)->sin_port = inp->inp_lport;
		SIN(&flow->flow_dst)->sin_port = inp->inp_fport;
		SIN(&flow->flow_src)->sin_addr = inp->inp_laddr;
		SIN(&flow->flow_dst)->sin_addr = inp->inp_faddr;
	}
	/* Per-subflow bookkeeping copied from the mptsub */
	flow->flow_len = sizeof(*flow);
	flow->flow_tcpci_offset = offsetof(mptcp_flow_t, flow_ci);
	flow->flow_flags = mpts->mpts_flags;
	flow->flow_cid = mpts->mpts_connid;
	flow->flow_relseq = mpts->mpts_rel_seq;
	flow->flow_soerror = mpts->mpts_socket->so_error;
	flow->flow_probecnt = mpts->mpts_probecnt;
}
6083 
/*
 * sysctl handler for net.inet.mptcp.pcblist.
 *
 * Walks every MPTCP PCB under the global mppi_lock and emits one
 * conninfo_mptcp_t per session followed by one mptcp_flow_t per
 * subflow. Read-only: any write attempt is rejected with EPERM.
 */
static int
mptcp_pcblist SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
	int error = 0, f;
	size_t len;
	struct mppcb *mpp;
	struct mptses *mpte;
	struct mptcb *mp_tp;
	struct mptsub *mpts;
	struct socket *so;
	conninfo_mptcp_t mptcpci;
	mptcp_flow_t *flows = NULL;

	if (req->newptr != USER_ADDR_NULL) {
		return EPERM;
	}

	lck_mtx_lock(&mtcbinfo.mppi_lock);
	if (req->oldptr == USER_ADDR_NULL) {
		size_t n = mtcbinfo.mppi_count;
		lck_mtx_unlock(&mtcbinfo.mppi_lock);
		/*
		 * Size probe: over-estimate by n/8 extra sessions and assume
		 * four flows each, so the caller's buffer survives churn
		 * between this probe and the actual copy-out.
		 */
		req->oldidx = (n + n / 8) * sizeof(conninfo_mptcp_t) +
		    4 * (n + n / 8)  * sizeof(mptcp_flow_t);
		return 0;
	}
	TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
		flows = NULL;
		socket_lock(mpp->mpp_socket, 1);
		VERIFY(mpp->mpp_flags & MPP_ATTACHED);
		mpte = mptompte(mpp);

		socket_lock_assert_owned(mptetoso(mpte));
		mp_tp = mpte->mpte_mptcb;

		/* Snapshot the session-level state while the socket is locked */
		bzero(&mptcpci, sizeof(mptcpci));
		mptcpci.mptcpci_state = mp_tp->mpt_state;
		mptcpci.mptcpci_flags = mp_tp->mpt_flags;
		mptcpci.mptcpci_ltoken = mp_tp->mpt_localtoken;
		mptcpci.mptcpci_rtoken = mp_tp->mpt_remotetoken;
		mptcpci.mptcpci_notsent_lowat = mp_tp->mpt_notsent_lowat;
		mptcpci.mptcpci_snduna = mp_tp->mpt_snduna;
		mptcpci.mptcpci_sndnxt = mp_tp->mpt_sndnxt;
		mptcpci.mptcpci_sndmax = mp_tp->mpt_sndmax;
		mptcpci.mptcpci_lidsn = mp_tp->mpt_local_idsn;
		mptcpci.mptcpci_sndwnd = mp_tp->mpt_sndwnd;
		mptcpci.mptcpci_rcvnxt = mp_tp->mpt_rcvnxt;
		mptcpci.mptcpci_rcvatmark = mp_tp->mpt_rcvnxt;
		mptcpci.mptcpci_ridsn = mp_tp->mpt_remote_idsn;
		mptcpci.mptcpci_rcvwnd = mp_tp->mpt_rcvwnd;

		mptcpci.mptcpci_nflows = mpte->mpte_numflows;
		mptcpci.mptcpci_mpte_flags = mpte->mpte_flags;
		mptcpci.mptcpci_mpte_addrid = mpte->mpte_addrid_last;
		mptcpci.mptcpci_flow_offset =
		    offsetof(conninfo_mptcp_t, mptcpci_flows);

		len = sizeof(*flows) * mpte->mpte_numflows;
		if (mpte->mpte_numflows != 0) {
			flows = kalloc_data(len, Z_WAITOK | Z_ZERO);
			if (flows == NULL) {
				socket_unlock(mpp->mpp_socket, 1);
				break;
			}
			/*
			 * conninfo_mptcp_t already embeds room for one flow;
			 * account only for the additional ones, and copy the
			 * header out without that embedded flow.
			 */
			mptcpci.mptcpci_len = sizeof(mptcpci) +
			    sizeof(*flows) * (mptcpci.mptcpci_nflows - 1);
			error = SYSCTL_OUT(req, &mptcpci,
			    sizeof(mptcpci) - sizeof(mptcp_flow_t));
		} else {
			mptcpci.mptcpci_len = sizeof(mptcpci);
			error = SYSCTL_OUT(req, &mptcpci, sizeof(mptcpci));
		}
		if (error) {
			socket_unlock(mpp->mpp_socket, 1);
			/* kfree_data on NULL/0 is harmless in the 0-flow case */
			kfree_data(flows, len);
			break;
		}
		/* Fill one record per subflow while still holding the lock */
		f = 0;
		TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
			so = mpts->mpts_socket;
			fill_mptcp_subflow(so, &flows[f], mpts);
			f++;
		}
		socket_unlock(mpp->mpp_socket, 1);
		/* Copy out the flow array after dropping the socket lock */
		if (flows) {
			error = SYSCTL_OUT(req, flows, len);
			kfree_data(flows, len);
			if (error) {
				break;
			}
		}
	}
	lck_mtx_unlock(&mtcbinfo.mppi_lock);

	return error;
}
6180 
/* net.inet.mptcp.pcblist: read-only dump of all MPTCP sessions and flows */
SYSCTL_PROC(_net_inet_mptcp, OID_AUTO, pcblist, CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, mptcp_pcblist, "S,conninfo_mptcp_t",
    "List of active MPTCP connections");
6184 
6185 /*
6186  * Set notsent lowat mark on the MPTCB
6187  */
6188 int
mptcp_set_notsent_lowat(struct mptses * mpte,int optval)6189 mptcp_set_notsent_lowat(struct mptses *mpte, int optval)
6190 {
6191 	struct mptcb *mp_tp = NULL;
6192 	int error = 0;
6193 
6194 	if (mpte->mpte_mppcb->mpp_flags & MPP_ATTACHED) {
6195 		mp_tp = mpte->mpte_mptcb;
6196 	}
6197 
6198 	if (mp_tp) {
6199 		mp_tp->mpt_notsent_lowat = optval;
6200 	} else {
6201 		error = EINVAL;
6202 	}
6203 
6204 	return error;
6205 }
6206 
6207 u_int32_t
mptcp_get_notsent_lowat(struct mptses * mpte)6208 mptcp_get_notsent_lowat(struct mptses *mpte)
6209 {
6210 	struct mptcb *mp_tp = NULL;
6211 
6212 	if (mpte->mpte_mppcb->mpp_flags & MPP_ATTACHED) {
6213 		mp_tp = mpte->mpte_mptcb;
6214 	}
6215 
6216 	if (mp_tp) {
6217 		return mp_tp->mpt_notsent_lowat;
6218 	} else {
6219 		return 0;
6220 	}
6221 }
6222 
/*
 * Returns 1 when the amount of unsent data on the MPTCP meta-socket is
 * at or below the notsent-lowat mark (i.e. the socket should be
 * reported writable), 0 otherwise.
 */
int
mptcp_notsent_lowat_check(struct socket *so)
{
	struct mptses *mpte;
	struct mppcb *mpp;
	struct mptcb *mp_tp;
	struct mptsub *mpts;

	int notsent = 0;

	mpp = mpsotomppcb(so);
	if (mpp == NULL || mpp->mpp_state == MPPCB_STATE_DEAD) {
		return 0;
	}

	mpte = mptompte(mpp);
	socket_lock_assert_owned(mptetoso(mpte));
	mp_tp = mpte->mpte_mptcb;

	notsent = so->so_snd.sb_cc;

	/*
	 * Writable if the send buffer is empty, or if what remains after
	 * subtracting data already handed to subflows (sndnxt - snduna)
	 * fits under the lowat mark.
	 */
	if ((notsent == 0) ||
	    ((notsent - (mp_tp->mpt_sndnxt - mp_tp->mpt_snduna)) <=
	    mp_tp->mpt_notsent_lowat)) {
		return 1;
	}

	/* When Nagle's algorithm is not disabled, it is better
	 * to wakeup the client even before there is atleast one
	 * maxseg of data to write.
	 */
	TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
		int retval = 0;
		/* Only the first active subflow is consulted */
		if (mpts->mpts_flags & MPTSF_ACTIVE) {
			struct socket *subf_so = mpts->mpts_socket;
			struct tcpcb *tp = intotcpcb(sotoinpcb(subf_so));

			/* Unsent data relative to this subflow's progress */
			notsent = so->so_snd.sb_cc -
			    (tp->snd_nxt - tp->snd_una);

			if ((tp->t_flags & TF_NODELAY) == 0 &&
			    notsent > 0 && (notsent <= (int)tp->t_maxseg)) {
				retval = 1;
			}
			return retval;
		}
	}
	return 0;
}
6272 
6273 static errno_t
mptcp_symptoms_ctl_connect(kern_ctl_ref kctlref,struct sockaddr_ctl * sac,void ** unitinfo)6274 mptcp_symptoms_ctl_connect(kern_ctl_ref kctlref, struct sockaddr_ctl *sac,
6275     void **unitinfo)
6276 {
6277 #pragma unused(kctlref, sac, unitinfo)
6278 
6279 	if (OSIncrementAtomic(&mptcp_kern_skt_inuse) > 0) {
6280 		os_log_error(mptcp_log_handle, "%s: MPTCP kernel-control socket for Symptoms already open!", __func__);
6281 	}
6282 
6283 	mptcp_kern_skt_unit = sac->sc_unit;
6284 
6285 	return 0;
6286 }
6287 
/*
 * Symptoms granted cell access for the app identified by @uuid: mark
 * every MPTCP session belonging to that app and re-evaluate its
 * subflows. @rssi above the target-based threshold additionally marks
 * cell as prohibited (presumably because Wi-Fi is strong enough —
 * verify against MPTCP_TARGET_BASED_RSSI_THRESHOLD's definition).
 */
static void
mptcp_allow_uuid(uuid_t uuid, int32_t rssi)
{
	struct mppcb *mpp;

	/* Iterate over all MPTCP connections */

	lck_mtx_lock(&mtcbinfo.mppi_lock);

	TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
		struct socket *mp_so = mpp->mpp_socket;
		struct mptses *mpte = mpp->mpp_pcbe;

		socket_lock(mp_so, 1);

		/* Match on the effective UUID for delegated sockets,
		 * otherwise on the last-known owner UUID */
		if (mp_so->so_flags & SOF_DELEGATED &&
		    uuid_compare(uuid, mp_so->e_uuid)) {
			goto next;
		} else if (!(mp_so->so_flags & SOF_DELEGATED) &&
		    uuid_compare(uuid, mp_so->last_uuid)) {
			goto next;
		}

		os_log(mptcp_log_handle, "%s - %lx: Got allowance for useApp with rssi %d\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), rssi);

		mpte->mpte_flags |= MPTE_ACCESS_GRANTED;

		if (rssi > MPTCP_TARGET_BASED_RSSI_THRESHOLD) {
			mpte->mpte_flags |= MPTE_CELL_PROHIBITED;
		}

		mptcp_check_subflows_and_add(mpte);
		mptcp_remove_subflows(mpte);

		/* Flags only steer the checks above; clear them right away */
		mpte->mpte_flags &= ~(MPTE_ACCESS_GRANTED | MPTE_CELL_PROHIBITED);

next:
		socket_unlock(mp_so, 1);
	}

	lck_mtx_unlock(&mtcbinfo.mppi_lock);
}
6331 
6332 static void
mptcp_wifi_status_changed(void)6333 mptcp_wifi_status_changed(void)
6334 {
6335 	struct mppcb *mpp;
6336 
6337 	/* Iterate over all MPTCP connections */
6338 
6339 	lck_mtx_lock(&mtcbinfo.mppi_lock);
6340 
6341 	TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
6342 		struct socket *mp_so = mpp->mpp_socket;
6343 		struct mptses *mpte = mpp->mpp_pcbe;
6344 
6345 		socket_lock(mp_so, 1);
6346 
6347 		/* Only handover- and urgency-mode are purely driven by Symptom's Wi-Fi status */
6348 		if (mpte->mpte_svctype != MPTCP_SVCTYPE_HANDOVER &&
6349 		    mpte->mpte_svctype != MPTCP_SVCTYPE_PURE_HANDOVER &&
6350 		    mpte->mpte_svctype != MPTCP_SVCTYPE_TARGET_BASED) {
6351 			goto next;
6352 		}
6353 
6354 		mptcp_check_subflows_and_add(mpte);
6355 		mptcp_check_subflows_and_remove(mpte);
6356 
6357 next:
6358 		socket_unlock(mp_so, 1);
6359 	}
6360 
6361 	lck_mtx_unlock(&mtcbinfo.mppi_lock);
6362 }
6363 
/* State shared between mptcp_find_proc_filter/callout during proc_iterate */
struct mptcp_uuid_search_info {
	uuid_t target_uuid;      /* executable UUID being searched for */
	proc_t found_proc;       /* matching proc, or PROC_NULL */
	boolean_t is_proc_found; /* set once the filter has seen a match */
};
6369 
6370 static int
mptcp_find_proc_filter(proc_t p,void * arg)6371 mptcp_find_proc_filter(proc_t p, void *arg)
6372 {
6373 	struct mptcp_uuid_search_info *info = (struct mptcp_uuid_search_info *)arg;
6374 	int found;
6375 
6376 	if (info->is_proc_found) {
6377 		return 0;
6378 	}
6379 
6380 	/*
6381 	 * uuid_compare returns 0 if the uuids are matching, but the proc-filter
6382 	 * expects != 0 for a matching filter.
6383 	 */
6384 	found = uuid_compare(proc_executableuuid_addr(p), info->target_uuid) == 0;
6385 	if (found) {
6386 		info->is_proc_found = true;
6387 	}
6388 
6389 	return found;
6390 }
6391 
6392 static int
mptcp_find_proc_callout(proc_t p,void * arg)6393 mptcp_find_proc_callout(proc_t p, void * arg)
6394 {
6395 	struct mptcp_uuid_search_info *info = (struct mptcp_uuid_search_info *)arg;
6396 
6397 	if (uuid_compare(proc_executableuuid_addr(p), info->target_uuid) == 0) {
6398 		info->found_proc = p;
6399 		return PROC_CLAIMED_DONE;
6400 	}
6401 
6402 	return PROC_RETURNED;
6403 }
6404 
6405 static proc_t
mptcp_find_proc(const uuid_t uuid)6406 mptcp_find_proc(const uuid_t uuid)
6407 {
6408 	struct mptcp_uuid_search_info info;
6409 
6410 	uuid_copy(info.target_uuid, uuid);
6411 	info.found_proc = PROC_NULL;
6412 	info.is_proc_found = false;
6413 
6414 	proc_iterate(PROC_ALLPROCLIST, mptcp_find_proc_callout, &info,
6415 	    mptcp_find_proc_filter, &info);
6416 
6417 	return info.found_proc;
6418 }
6419 
/*
 * Ask the Symptoms daemon, over the kernel-control socket, whether this
 * session's owning application may use cell. The request carries the
 * app's executable UUID plus a coarse task-priority class; the answer
 * arrives asynchronously via mptcp_symptoms_ctl_send().
 */
void
mptcp_ask_symptoms(struct mptses *mpte)
{
	struct mptcp_symptoms_ask_uuid ask;
	struct socket *mp_so;
	struct proc *p = PROC_NULL;
	int pid, prio, err;

	/* No Symptoms client has connected yet - nobody to ask */
	if (mptcp_kern_skt_unit == 0) {
		os_log_error(mptcp_log_handle, "%s - %lx: skt_unit is still 0\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
		return;
	}

	mp_so = mptetoso(mpte);

	if (mp_so->so_flags & SOF_DELEGATED) {
		/* Try the cached effective pid first */
		if (mpte->mpte_epid != 0) {
			p = proc_find(mpte->mpte_epid);
			if (p != PROC_NULL) {
				/* We found a pid, check its UUID */
				if (uuid_compare(mp_so->e_uuid, proc_executableuuid_addr(p))) {
					/* It's not the same - we need to look for the real proc */
					proc_rele(p);
					p = PROC_NULL;
				}
			}
		}

		/* Fall back to scanning the proc list for the delegated UUID */
		if (p == PROC_NULL) {
			p = mptcp_find_proc(mp_so->e_uuid);
			if (p == PROC_NULL) {
				uuid_string_t uuid_string;
				uuid_unparse(mp_so->e_uuid, uuid_string);

				os_log_error(mptcp_log_handle, "%s - %lx: Couldn't find proc for uuid %s\n",
				    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), uuid_string);

				return;
			}
			mpte->mpte_epid = proc_pid(p);
		}

		pid = mpte->mpte_epid;
		uuid_copy(ask.uuid, mp_so->e_uuid);
	} else {
		/* Non-delegated: use the socket's last known owner */
		pid = mp_so->last_pid;

		p = proc_find(pid);
		if (p == PROC_NULL) {
			os_log_error(mptcp_log_handle, "%s - %lx: Couldn't find proc for pid %u\n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), pid);
			return;
		}

		uuid_copy(ask.uuid, mp_so->last_uuid);
	}


	ask.cmd = MPTCP_SYMPTOMS_ASK_UUID;

	/* Map the task role onto the coarse Symptoms priority classes */
	prio = proc_get_effective_task_policy(proc_task(p), TASK_POLICY_ROLE);

	if (prio == TASK_BACKGROUND_APPLICATION || prio == TASK_NONUI_APPLICATION ||
	    prio == TASK_DARWINBG_APPLICATION) {
		ask.priority = MPTCP_SYMPTOMS_BACKGROUND;
	} else if (prio == TASK_FOREGROUND_APPLICATION) {
		ask.priority = MPTCP_SYMPTOMS_FOREGROUND;
	} else {
		ask.priority = MPTCP_SYMPTOMS_UNKNOWN;
	}

	err = ctl_enqueuedata(mptcp_kern_ctrl_ref, mptcp_kern_skt_unit,
	    &ask, sizeof(ask), CTL_DATA_EOR);

	os_log(mptcp_log_handle, "%s - %lx: asked symptoms about pid %u, taskprio %u, prio %u, err %d\n",
	    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), pid, prio, ask.priority, err);


	/* Drop the reference taken by proc_find()/mptcp_find_proc() above */
	proc_rele(p);
}
6501 
6502 static errno_t
mptcp_symptoms_ctl_disconnect(kern_ctl_ref kctlref,u_int32_t kcunit,void * unitinfo)6503 mptcp_symptoms_ctl_disconnect(kern_ctl_ref kctlref, u_int32_t kcunit,
6504     void *unitinfo)
6505 {
6506 #pragma unused(kctlref, kcunit, unitinfo)
6507 
6508 	OSDecrementAtomic(&mptcp_kern_skt_inuse);
6509 
6510 	return 0;
6511 }
6512 
6513 static errno_t
mptcp_symptoms_ctl_send(kern_ctl_ref kctlref,u_int32_t kcunit,void * unitinfo,mbuf_t m,int flags)6514 mptcp_symptoms_ctl_send(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo,
6515     mbuf_t m, int flags)
6516 {
6517 #pragma unused(kctlref, unitinfo, flags)
6518 	symptoms_advisory_t *sa = NULL;
6519 
6520 	if (kcunit != mptcp_kern_skt_unit) {
6521 		os_log_error(mptcp_log_handle, "%s: kcunit %u is different from expected one %u\n",
6522 		    __func__, kcunit, mptcp_kern_skt_unit);
6523 	}
6524 
6525 	if (mbuf_pkthdr_len(m) < sizeof(*sa)) {
6526 		mbuf_freem(m);
6527 		return EINVAL;
6528 	}
6529 
6530 	if (mbuf_len(m) < sizeof(*sa)) {
6531 		os_log_error(mptcp_log_handle, "%s: mbuf is %lu but need %lu\n",
6532 		    __func__, mbuf_len(m), sizeof(*sa));
6533 		mbuf_freem(m);
6534 		return EINVAL;
6535 	}
6536 
6537 	sa = mbuf_data(m);
6538 
6539 	if (sa->sa_nwk_status != SYMPTOMS_ADVISORY_USEAPP) {
6540 		os_log(mptcp_log_handle, "%s: wifi new,old: %d,%d, cell new, old: %d,%d\n", __func__,
6541 		    sa->sa_wifi_status, mptcp_advisory.sa_wifi_status,
6542 		    sa->sa_cell_status, mptcp_advisory.sa_cell_status);
6543 
6544 		if (sa->sa_wifi_status != mptcp_advisory.sa_wifi_status) {
6545 			mptcp_advisory.sa_wifi_status = sa->sa_wifi_status;
6546 			mptcp_wifi_status_changed();
6547 		}
6548 	} else {
6549 		struct mptcp_symptoms_answer answer;
6550 		errno_t err;
6551 
6552 		/* We temporarily allow different sizes for ease of submission */
6553 		if (mbuf_len(m) != sizeof(uuid_t) + sizeof(*sa) &&
6554 		    mbuf_len(m) != sizeof(answer)) {
6555 			os_log_error(mptcp_log_handle, "%s: mbuf is %lu but need %lu or %lu\n",
6556 			    __func__, mbuf_len(m), sizeof(uuid_t) + sizeof(*sa),
6557 			    sizeof(answer));
6558 			mbuf_free(m);
6559 			return EINVAL;
6560 		}
6561 
6562 		memset(&answer, 0, sizeof(answer));
6563 
6564 		err = mbuf_copydata(m, 0, mbuf_len(m), &answer);
6565 		if (err) {
6566 			os_log_error(mptcp_log_handle, "%s: mbuf_copydata returned %d\n", __func__, err);
6567 			mbuf_free(m);
6568 			return err;
6569 		}
6570 
6571 		mptcp_allow_uuid(answer.uuid, answer.rssi);
6572 	}
6573 
6574 	mbuf_freem(m);
6575 	return 0;
6576 }
6577 
6578 void
mptcp_control_register(void)6579 mptcp_control_register(void)
6580 {
6581 	/* Set up the advisory control socket */
6582 	struct kern_ctl_reg mptcp_kern_ctl;
6583 
6584 	bzero(&mptcp_kern_ctl, sizeof(mptcp_kern_ctl));
6585 	strlcpy(mptcp_kern_ctl.ctl_name, MPTCP_KERN_CTL_NAME,
6586 	    sizeof(mptcp_kern_ctl.ctl_name));
6587 	mptcp_kern_ctl.ctl_connect = mptcp_symptoms_ctl_connect;
6588 	mptcp_kern_ctl.ctl_disconnect = mptcp_symptoms_ctl_disconnect;
6589 	mptcp_kern_ctl.ctl_send = mptcp_symptoms_ctl_send;
6590 	mptcp_kern_ctl.ctl_flags = CTL_FLAG_PRIVILEGED;
6591 
6592 	(void)ctl_register(&mptcp_kern_ctl, &mptcp_kern_ctrl_ref);
6593 }
6594 
/*
 * Classify the current Wi-Fi quality for this session, combining the
 * Symptoms advisory state with the session's first-party status and
 * service type. Returns GOOD, BAD, or UNSURE.
 */
mptcp_wifi_quality_t
mptcp_wifi_quality_for_session(struct mptses *mpte)
{
	if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
		/* With a valid advisory, trust it directly */
		if (mpte->mpte_svctype != MPTCP_SVCTYPE_HANDOVER &&
		    mptcp_advisory.sa_wifi_status) {
			return symptoms_is_wifi_lossy() ? MPTCP_WIFI_QUALITY_BAD : MPTCP_WIFI_QUALITY_GOOD;
		}

		/*
		 * If it's a first-party app and we don't have any info
		 * about the Wi-Fi state, let's be pessimistic.
		 */
		return MPTCP_WIFI_QUALITY_UNSURE;
	} else {
		if (symptoms_is_wifi_lossy()) {
			return MPTCP_WIFI_QUALITY_BAD;
		}

		/*
		 * If we are target-based (meaning, we allow to be more lax on
		 * the when wifi is considered bad), we only *know* about the state once
		 * we got the allowance from Symptoms (MPTE_ACCESS_GRANTED).
		 *
		 * If RSSI is not bad enough, MPTE_CELL_PROHIBITED will then
		 * be set.
		 *
		 * In any other case (while in target-mode), consider WiFi bad
		 * and we are going to ask for allowance from Symptoms anyway.
		 */
		if (mpte->mpte_svctype == MPTCP_SVCTYPE_TARGET_BASED) {
			if (mpte->mpte_flags & MPTE_ACCESS_GRANTED &&
			    mpte->mpte_flags & MPTE_CELL_PROHIBITED) {
				return MPTCP_WIFI_QUALITY_GOOD;
			}

			return MPTCP_WIFI_QUALITY_BAD;
		}

		return MPTCP_WIFI_QUALITY_GOOD;
	}
}
6637 
6638 boolean_t
symptoms_is_wifi_lossy(void)6639 symptoms_is_wifi_lossy(void)
6640 {
6641 	return (mptcp_advisory.sa_wifi_status & SYMPTOMS_ADVISORY_WIFI_OK) ? false : true;
6642 }
6643 
6644 int
mptcp_freeq(struct mptcb * mp_tp)6645 mptcp_freeq(struct mptcb *mp_tp)
6646 {
6647 	struct tseg_qent *q;
6648 	int rv = 0;
6649 	int count = 0;
6650 
6651 	while ((q = LIST_FIRST(&mp_tp->mpt_segq)) != NULL) {
6652 		LIST_REMOVE(q, tqe_q);
6653 		m_freem(q->tqe_m);
6654 		zfree(tcp_reass_zone, q);
6655 		count++;
6656 		rv = 1;
6657 	}
6658 	mp_tp->mpt_reassqlen = 0;
6659 
6660 	if (count > 0) {
6661 		OSAddAtomic(-count, &mptcp_reass_total_qlen);
6662 	}
6663 
6664 	return rv;
6665 }
6666 
6667 static int
mptcp_post_event(u_int32_t event_code,int value)6668 mptcp_post_event(u_int32_t event_code, int value)
6669 {
6670 	struct kev_mptcp_data event_data;
6671 	struct kev_msg ev_msg;
6672 
6673 	memset(&ev_msg, 0, sizeof(ev_msg));
6674 
6675 	ev_msg.vendor_code      = KEV_VENDOR_APPLE;
6676 	ev_msg.kev_class        = KEV_NETWORK_CLASS;
6677 	ev_msg.kev_subclass     = KEV_MPTCP_SUBCLASS;
6678 	ev_msg.event_code       = event_code;
6679 
6680 	event_data.value = value;
6681 
6682 	ev_msg.dv[0].data_ptr    = &event_data;
6683 	ev_msg.dv[0].data_length = sizeof(event_data);
6684 
6685 	return kev_post_msg(&ev_msg);
6686 }
6687 
/*
 * Account a cell subflow towards the global cellicon refcount and post
 * the "cell in use" event when the icon transitions from off to on.
 * Per-session increments are tracked in mpte_cellicon_increments so
 * they can be undone when the session goes away.
 */
static void
mptcp_set_cellicon(struct mptses *mpte, struct mptsub *mpts)
{
	struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
	int error;

	/* First-party apps (Siri) don't flip the cellicon */
	if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
		return;
	}

	/* Subflow is disappearing - don't set it on this one */
	if (mpts->mpts_flags & (MPTSF_DISCONNECTING | MPTSF_DISCONNECTED)) {
		return;
	}

	/* Fallen back connections are not triggering the cellicon */
	if (mpte->mpte_mptcb->mpt_flags & MPTCPF_FALLBACK_TO_TCP) {
		return;
	}

	/* Remember the last time we set the cellicon. Needed for debouncing */
	mpte->mpte_last_cellicon_set = tcp_now;

	/* Arm the per-subflow timer that re-evaluates the icon later */
	tp->t_timer[TCPT_CELLICON] = OFFSET_FROM_START(tp, MPTCP_CELLICON_TOGGLE_RATE);
	tcp_sched_timers(tp);

	if (mpts->mpts_flags & MPTSF_CELLICON_SET &&
	    mpte->mpte_cellicon_increments != 0) {
		if (mptcp_cellicon_refcount == 0) {
			os_log_error(mptcp_log_handle, "%s - %lx: Cell should be set (count is %u), but it's zero!\n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpte->mpte_cellicon_increments);

			/* Continue, so that the icon gets set... */
		} else {
			/*
			 * In this case, the cellicon is already set. No need to bump it
			 * even higher
			 */

			return;
		}
	}

	/* When tearing down this subflow, we need to decrement the
	 * reference counter
	 */
	mpts->mpts_flags |= MPTSF_CELLICON_SET;

	/* This counter, so that when a session gets destroyed we decrement
	 * the reference counter by whatever is left
	 */
	mpte->mpte_cellicon_increments++;

	if (OSIncrementAtomic(&mptcp_cellicon_refcount)) {
		/* If cellicon is already set, get out of here! */
		return;
	}

	/* Refcount went 0 -> 1: actually turn the icon on */
	error = mptcp_post_event(KEV_MPTCP_CELLUSE, 1);

	if (error) {
		os_log_error(mptcp_log_handle, "%s - %lx: Setting cellicon failed with %d\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error);
	} else {
		os_log(mptcp_log_handle, "%s - %lx: successfully set the cellicon\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
	}
}
6757 
6758 void
mptcp_clear_cellicon(void)6759 mptcp_clear_cellicon(void)
6760 {
6761 	int error = mptcp_post_event(KEV_MPTCP_CELLUSE, 0);
6762 
6763 	if (error) {
6764 		os_log_error(mptcp_log_handle, "%s: Unsetting cellicon failed with %d\n",
6765 		    __func__, error);
6766 	} else {
6767 		os_log(mptcp_log_handle, "%s: successfully unset the cellicon\n",
6768 		    __func__);
6769 	}
6770 }
6771 
/*
 * Returns true if the icon has been flipped to WiFi.
 */
static boolean_t
__mptcp_unset_cellicon(uint32_t val)
{
	VERIFY(val < INT32_MAX);
	/*
	 * OSAddAtomic returns the counter's previous value; the icon is
	 * cleared only by the caller that observed it at exactly 1.
	 * NOTE(review): if val > 1 and the old count equals val, the count
	 * reaches zero without clearing the icon — presumably callers
	 * guarantee the last drop is always by 1; verify against callers.
	 */
	if (OSAddAtomic((int32_t)-val, &mptcp_cellicon_refcount) != 1) {
		return false;
	}

	mptcp_clear_cellicon();

	return true;
}
6787 
/*
 * Undo @val cellicon increments attributed to @mpte (and, when @mpts is
 * non-NULL, to that specific subflow). Clears the icon through
 * __mptcp_unset_cellicon() when this was the last global reference.
 */
void
mptcp_unset_cellicon(struct mptses *mpte, struct mptsub *mpts, uint32_t val)
{
	/* First-party apps (Siri) don't flip the cellicon */
	if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
		return;
	}

	if (mpte->mpte_cellicon_increments == 0) {
		/* This flow never used cell - get out of here! */
		return;
	}

	/* Global and per-session counters disagree; log and bail */
	if (mptcp_cellicon_refcount == 0) {
		os_log_error(mptcp_log_handle, "%s - %lx: Cell is off, but should be at least %u\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpte->mpte_cellicon_increments);

		return;
	}

	if (mpts) {
		/* Only decrement on behalf of a subflow that incremented */
		if (!(mpts->mpts_flags & MPTSF_CELLICON_SET)) {
			return;
		}

		mpts->mpts_flags &= ~MPTSF_CELLICON_SET;
	}

	/* Never decrement by more than this session contributed */
	if (mpte->mpte_cellicon_increments < val) {
		os_log_error(mptcp_log_handle, "%s - %lx: Increments is %u but want to dec by %u.\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpte->mpte_cellicon_increments, val);
		val = mpte->mpte_cellicon_increments;
	}

	mpte->mpte_cellicon_increments -= val;

	if (__mptcp_unset_cellicon(val) == false) {
		return;
	}

	/* All flows are gone - our counter should be at zero too! */
	if (mpte->mpte_cellicon_increments != 0) {
		os_log_error(mptcp_log_handle, "%s - %lx: Inconsistent state! Cell refcount is zero but increments are at %u\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpte->mpte_cellicon_increments);
	}
}
6834 
6835 void
mptcp_reset_rexmit_state(struct tcpcb * tp)6836 mptcp_reset_rexmit_state(struct tcpcb *tp)
6837 {
6838 	struct mptsub *mpts;
6839 	struct inpcb *inp;
6840 	struct socket *so;
6841 
6842 	inp = tp->t_inpcb;
6843 	if (inp == NULL) {
6844 		return;
6845 	}
6846 
6847 	so = inp->inp_socket;
6848 	if (so == NULL) {
6849 		return;
6850 	}
6851 
6852 	if (!(so->so_flags & SOF_MP_SUBFLOW)) {
6853 		return;
6854 	}
6855 
6856 	mpts = tp->t_mpsub;
6857 
6858 	mpts->mpts_flags &= ~MPTSF_WRITE_STALL;
6859 	so->so_flags &= ~SOF_MP_TRYFAILOVER;
6860 }
6861 
6862 void
mptcp_reset_keepalive(struct tcpcb * tp)6863 mptcp_reset_keepalive(struct tcpcb *tp)
6864 {
6865 	struct mptsub *mpts = tp->t_mpsub;
6866 
6867 	mpts->mpts_flags &= ~MPTSF_READ_STALL;
6868 }
6869 
/*
 * Protocol pr_init callback.
 *
 * One-time MPTCP initialization: clones the IPv4/IPv6 TCP protosw
 * entries into subflow-specific copies (with overridden usrreqs), sets
 * up the MPTCP PCB info (zone, lock, GC/timer hooks), and attaches it
 * to the MP domain.
 */
void
mptcp_init(struct protosw *pp, struct domain *dp)
{
#pragma unused(dp)
	static int mptcp_initialized = 0;
	struct protosw *prp;
	struct ip6protosw *prp6;

	VERIFY((pp->pr_flags & (PR_INITIALIZED | PR_ATTACHED)) == PR_ATTACHED);

	/* do this only once */
	if (mptcp_initialized) {
		return;
	}
	mptcp_initialized = 1;

	/* Start out assuming Wi-Fi is fine until Symptoms says otherwise */
	mptcp_advisory.sa_wifi_status = SYMPTOMS_ADVISORY_WIFI_OK;

	/*
	 * Since PF_MULTIPATH gets initialized after PF_INET/INET6,
	 * we must be able to find IPPROTO_TCP entries for both.
	 */
	prp = pffindproto_locked(PF_INET, IPPROTO_TCP, SOCK_STREAM);
	VERIFY(prp != NULL);
	/* Clone the TCP protosw, then override the subflow entry points */
	bcopy(prp, &mptcp_subflow_protosw, sizeof(*prp));
	bcopy(prp->pr_usrreqs, &mptcp_subflow_usrreqs,
	    sizeof(mptcp_subflow_usrreqs));
	mptcp_subflow_protosw.pr_entry.tqe_next = NULL;
	mptcp_subflow_protosw.pr_entry.tqe_prev = NULL;
	mptcp_subflow_protosw.pr_usrreqs = &mptcp_subflow_usrreqs;
	mptcp_subflow_usrreqs.pru_soreceive = mptcp_subflow_soreceive;
	mptcp_subflow_usrreqs.pru_sosend = mptcp_subflow_sosend;
	mptcp_subflow_usrreqs.pru_rcvoob = pru_rcvoob_notsupp;
	/*
	 * Socket filters shouldn't attach/detach to/from this protosw
	 * since pr_protosw is to be used instead, which points to the
	 * real protocol; if they do, it is a bug and we should panic.
	 */
	mptcp_subflow_protosw.pr_filter_head.tqh_first =
	    (struct socket_filter *)(uintptr_t)0xdeadbeefdeadbeef;
	mptcp_subflow_protosw.pr_filter_head.tqh_last =
	    (struct socket_filter **)(uintptr_t)0xdeadbeefdeadbeef;

	/* Same cloning dance for the IPv6 TCP protosw */
	prp6 = (struct ip6protosw *)pffindproto_locked(PF_INET6,
	    IPPROTO_TCP, SOCK_STREAM);
	VERIFY(prp6 != NULL);
	bcopy(prp6, &mptcp_subflow_protosw6, sizeof(*prp6));
	bcopy(prp6->pr_usrreqs, &mptcp_subflow_usrreqs6,
	    sizeof(mptcp_subflow_usrreqs6));
	mptcp_subflow_protosw6.pr_entry.tqe_next = NULL;
	mptcp_subflow_protosw6.pr_entry.tqe_prev = NULL;
	mptcp_subflow_protosw6.pr_usrreqs = &mptcp_subflow_usrreqs6;
	mptcp_subflow_usrreqs6.pru_soreceive = mptcp_subflow_soreceive;
	mptcp_subflow_usrreqs6.pru_sosend = mptcp_subflow_sosend;
	mptcp_subflow_usrreqs6.pru_rcvoob = pru_rcvoob_notsupp;
	/*
	 * Socket filters shouldn't attach/detach to/from this protosw
	 * since pr_protosw is to be used instead, which points to the
	 * real protocol; if they do, it is a bug and we should panic.
	 */
	mptcp_subflow_protosw6.pr_filter_head.tqh_first =
	    (struct socket_filter *)(uintptr_t)0xdeadbeefdeadbeef;
	mptcp_subflow_protosw6.pr_filter_head.tqh_last =
	    (struct socket_filter **)(uintptr_t)0xdeadbeefdeadbeef;

	/* Set up the MPTCP PCB info: allocation zone, lock, GC and timer */
	bzero(&mtcbinfo, sizeof(mtcbinfo));
	TAILQ_INIT(&mtcbinfo.mppi_pcbs);
	mtcbinfo.mppi_zone = zone_create("mptc", sizeof(struct mpp_mtp), ZC_NONE);

	mtcbinfo.mppi_lock_grp = lck_grp_alloc_init("mppcb", LCK_GRP_ATTR_NULL);
	lck_attr_setdefault(&mtcbinfo.mppi_lock_attr);
	lck_mtx_init(&mtcbinfo.mppi_lock, mtcbinfo.mppi_lock_grp,
	    &mtcbinfo.mppi_lock_attr);

	mtcbinfo.mppi_gc = mptcp_gc;
	mtcbinfo.mppi_timer = mptcp_timer;

	/* attach to MP domain for garbage collection to take place */
	mp_pcbinfo_attach(&mtcbinfo);
}
6955