xref: /xnu-10002.61.3/bsd/netinet/mptcp_subr.c (revision 0f4c859e951fba394238ab619495c4e1d54d0f34)
1 /*
2  * Copyright (c) 2012-2021 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 #include <kern/locks.h>
30 #include <kern/policy_internal.h>
31 #include <kern/zalloc.h>
32 
33 #include <mach/sdt.h>
34 
35 #include <sys/domain.h>
36 #include <sys/kdebug.h>
37 #include <sys/kern_control.h>
38 #include <sys/kernel.h>
39 #include <sys/mbuf.h>
40 #include <sys/mcache.h>
41 #include <sys/param.h>
42 #include <sys/proc.h>
43 #include <sys/protosw.h>
44 #include <sys/resourcevar.h>
45 #include <sys/socket.h>
46 #include <sys/socketvar.h>
47 #include <sys/sysctl.h>
48 #include <sys/syslog.h>
49 #include <sys/systm.h>
50 
51 #include <net/content_filter.h>
52 #include <net/if.h>
53 #include <net/if_var.h>
54 #include <netinet/in.h>
55 #include <netinet/in_pcb.h>
56 #include <netinet/in_var.h>
57 #include <netinet/tcp.h>
58 #include <netinet/tcp_cache.h>
59 #include <netinet/tcp_fsm.h>
60 #include <netinet/tcp_seq.h>
61 #include <netinet/tcp_var.h>
62 #include <netinet/mptcp_var.h>
63 #include <netinet/mptcp.h>
64 #include <netinet/mptcp_opt.h>
65 #include <netinet/mptcp_seq.h>
66 #include <netinet/mptcp_timer.h>
67 #include <libkern/crypto/sha1.h>
68 #include <libkern/crypto/sha2.h>
69 #include <netinet6/in6_pcb.h>
70 #include <netinet6/ip6protosw.h>
71 #include <dev/random/randomdev.h>
72 
73 /*
74  * Notes on MPTCP implementation.
75  *
76  * MPTCP is implemented as <SOCK_STREAM,IPPROTO_TCP> protocol in PF_MULTIPATH
77  * communication domain.  The structure mtcbinfo describes the MPTCP instance
78  * of a Multipath protocol in that domain.  It is used to keep track of all
79  * MPTCP PCB instances in the system, and is protected by the global lock
80  * mppi_lock.
81  *
82  * An MPTCP socket is opened by calling socket(PF_MULTIPATH, SOCK_STREAM,
83  * IPPROTO_TCP).  Upon success, a Multipath PCB gets allocated and along with
84  * it comes an MPTCP Session and an MPTCP PCB.  All three structures are
85  * allocated from the same memory block, and each structure has a pointer
86  * to the adjacent ones.  The layout is defined by the mpp_mtp structure.
87  * The socket lock (mpp_lock) is used to protect accesses to the Multipath
88  * PCB (mppcb) as well as the MPTCP Session (mptses).
89  *
90  * The MPTCP Session is an MPTCP-specific extension to the Multipath PCB;
91  *
92  * A functioning MPTCP Session consists of one or more subflow sockets.  Each
93  * subflow socket is essentially a regular PF_INET/PF_INET6 TCP socket, and is
94  * represented by the mptsub structure.  Because each subflow requires access
95  * to the MPTCP Session, the MPTCP socket's so_usecount is bumped up for each
96  * subflow.  This gets decremented prior to the subflow's destruction.
97  *
98  * To handle events (read, write, control) from the subflows, we do direct
99  * upcalls into the specific function.
100  *
101  * The whole MPTCP connection is protected by a single lock, the MPTCP socket's
102  * lock. Incoming data on a subflow also ends up taking this single lock. To
103  * achieve the latter, tcp_lock/unlock has been changed to rather use the lock
104  * of the MPTCP-socket.
105  *
106  * An MPTCP socket will be destroyed when its so_usecount drops to zero; this
107  * work is done by the MPTCP garbage collector which is invoked on demand by
108  * the PF_MULTIPATH garbage collector.  This process will take place once all
109  * of the subflows have been destroyed.
110  */
111 
112 static void mptcp_subflow_abort(struct mptsub *, int);
113 
114 static void mptcp_send_dfin(struct socket *so);
115 static void mptcp_set_cellicon(struct mptses *mpte, struct mptsub *mpts);
116 static int mptcp_freeq(struct mptcb *mp_tp);
117 
/*
 * Possible return values for subflow event handlers.  Note that success
 * values must be greater or equal than MPTS_EVRET_OK.  Values less than that
 * indicate errors or actions which require immediate attention; they will
 * prevent the rest of the handlers from processing their respective events
 * until the next round of events processing.
 */
typedef enum {
	MPTS_EVRET_DELETE               = 1,    /* delete this subflow */
	MPTS_EVRET_OK                   = 2,    /* OK - proceed to the next handler */
	MPTS_EVRET_CONNECT_PENDING      = 3,    /* resume pended connects */
	MPTS_EVRET_DISCONNECT_FALLBACK  = 4,    /* abort all but preferred */
} ev_ret_t;
131 
132 static void mptcp_do_sha1(mptcp_key_t *, char *);
133 static void mptcp_do_sha256(mptcp_key_t *, char *);
134 
135 static void mptcp_init_local_parms(struct mptses *, struct sockaddr *);
136 
137 static KALLOC_TYPE_DEFINE(mptsub_zone, struct mptsub, NET_KT_DEFAULT);
138 static KALLOC_TYPE_DEFINE(mptopt_zone, struct mptopt, NET_KT_DEFAULT);
139 static KALLOC_TYPE_DEFINE(mpt_subauth_zone, struct mptcp_subf_auth_entry,
140     NET_KT_DEFAULT);
141 
/* Global PCB info for all MPTCP Multipath PCBs (protected by mppi_lock) */
struct mppcbinfo mtcbinfo;

SYSCTL_DECL(_net_inet);

SYSCTL_NODE(_net_inet, OID_AUTO, mptcp, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "MPTCP");

/* net.inet.mptcp.pcbcount (read-only): number of active Multipath PCBs */
SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, pcbcount, CTLFLAG_RD | CTLFLAG_LOCKED,
    &mtcbinfo.mppi_count, 0, "Number of active PCBs");


/*
 * net.inet.mptcp.alternate_port: when set to a valid port (1..65534),
 * mptcp_session_create() stores it (network byte order) as the session's
 * alternate destination port.
 */
static int mptcp_alternate_port = 0;
SYSCTL_INT(_net_inet_mptcp, OID_AUTO, alternate_port, CTLFLAG_RW | CTLFLAG_LOCKED,
    &mptcp_alternate_port, 0, "Set alternate port for MPTCP connections");

/* Protosw/usrreqs templates used for MPTCP subflow sockets (v4 and v6) */
static struct protosw mptcp_subflow_protosw;
static struct pr_usrreqs mptcp_subflow_usrreqs;
static struct ip6protosw mptcp_subflow_protosw6;
static struct pr_usrreqs mptcp_subflow_usrreqs6;

static uint8_t  mptcp_create_subflows_scheduled;

/* Using Symptoms Advisory to detect poor WiFi or poor Cell */
static kern_ctl_ref mptcp_kern_ctrl_ref = NULL;
static uint32_t mptcp_kern_skt_inuse = 0;
static uint32_t mptcp_kern_skt_unit;
static symptoms_advisory_t mptcp_advisory;

/* System-wide count of cell-icon assertions taken by MPTCP sessions */
uint32_t mptcp_cellicon_refcount = 0;

os_log_t mptcp_log_handle;
173 int
mptcpstats_get_index_by_ifindex(struct mptcp_itf_stats * stats,u_short ifindex,boolean_t create)174 mptcpstats_get_index_by_ifindex(struct mptcp_itf_stats *stats, u_short ifindex, boolean_t create)
175 {
176 	int i, index = -1;
177 
178 	for (i = 0; i < MPTCP_ITFSTATS_SIZE; i++) {
179 		if (create && stats[i].ifindex == IFSCOPE_NONE) {
180 			if (index < 0) {
181 				index = i;
182 			}
183 			continue;
184 		}
185 
186 		if (stats[i].ifindex == ifindex) {
187 			index = i;
188 			return index;
189 		}
190 	}
191 
192 	if (index != -1) {
193 		stats[index].ifindex = ifindex;
194 	}
195 
196 	return index;
197 }
198 
199 static int
mptcpstats_get_index(struct mptcp_itf_stats * stats,const struct mptsub * mpts)200 mptcpstats_get_index(struct mptcp_itf_stats *stats, const struct mptsub *mpts)
201 {
202 	const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
203 	int index;
204 
205 	if (ifp == NULL) {
206 		os_log_error(mptcp_log_handle, "%s - %lx: no ifp on subflow, state %u flags %#x\n",
207 		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpts->mpts_mpte),
208 		    sototcpcb(mpts->mpts_socket)->t_state, mpts->mpts_flags);
209 		return -1;
210 	}
211 
212 	index = mptcpstats_get_index_by_ifindex(stats, ifp->if_index, true);
213 
214 	if (index != -1) {
215 		if (stats[index].is_expensive == 0) {
216 			stats[index].is_expensive = IFNET_IS_CELLULAR(ifp);
217 		}
218 	}
219 
220 	return index;
221 }
222 
223 void
mptcpstats_inc_switch(struct mptses * mpte,const struct mptsub * mpts)224 mptcpstats_inc_switch(struct mptses *mpte, const struct mptsub *mpts)
225 {
226 	int index;
227 
228 	tcpstat.tcps_mp_switches++;
229 	mpte->mpte_subflow_switches++;
230 
231 	index = mptcpstats_get_index(mpte->mpte_itfstats, mpts);
232 
233 	if (index != -1) {
234 		mpte->mpte_itfstats[index].switches++;
235 	}
236 }
237 
238 /*
239  * Flushes all recorded socket options from an MP socket.
240  */
241 static void
mptcp_flush_sopts(struct mptses * mpte)242 mptcp_flush_sopts(struct mptses *mpte)
243 {
244 	struct mptopt *mpo, *tmpo;
245 
246 	TAILQ_FOREACH_SAFE(mpo, &mpte->mpte_sopts, mpo_entry, tmpo) {
247 		mptcp_sopt_remove(mpte, mpo);
248 		mptcp_sopt_free(mpo);
249 	}
250 	VERIFY(TAILQ_EMPTY(&mpte->mpte_sopts));
251 }
252 
/*
 * Create an MPTCP session, called as a result of opening a MPTCP socket.
 *
 * The Multipath PCB (mppcb), the MPTCP session (mptses) and the MPTCP PCB
 * (mptcb) all live in one mpp_mtp allocation (see the layout notes at the
 * top of this file); here we locate the session/PCB portions from the
 * given mppcb and zero-initialize them.  Always returns 0.
 */
int
mptcp_session_create(struct mppcb *mpp)
{
	struct mpp_mtp *mtp;
	struct mppcbinfo *mppi;
	struct mptses *mpte;
	struct mptcb *mp_tp;

	VERIFY(mpp != NULL);
	mppi = mpp->mpp_pcbinfo;
	VERIFY(mppi != NULL);

	/* Recover the containing mpp_mtp from the embedded mppcb */
	mtp = __container_of(mpp, struct mpp_mtp, mpp);
	mpte = &mtp->mpp_ses;
	mp_tp = &mtp->mtcb;

	/* MPTCP Multipath PCB Extension */
	bzero(mpte, sizeof(*mpte));
	VERIFY(mpp->mpp_pcbe == NULL);
	mpp->mpp_pcbe = mpte;
	mpte->mpte_mppcb = mpp;
	mpte->mpte_mptcb = mp_tp;

	TAILQ_INIT(&mpte->mpte_sopts);
	TAILQ_INIT(&mpte->mpte_subflows);
	mpte->mpte_associd = SAE_ASSOCID_ANY;
	mpte->mpte_connid_last = SAE_CONNID_ANY;

	mptcp_init_urgency_timer(mpte);

	/* Start out with the statically embedded interface-info array */
	mpte->mpte_itfinfo = &mpte->_mpte_itfinfo[0];
	mpte->mpte_itfinfo_size = MPTE_ITFINFO_SIZE;

	/* Only honor the sysctl when it holds a valid TCP port number */
	if (mptcp_alternate_port > 0 && mptcp_alternate_port < UINT16_MAX) {
		mpte->mpte_alternate_port = htons((uint16_t)mptcp_alternate_port);
	}

	mpte->mpte_last_cellicon_set = tcp_now;

	/* MPTCP Protocol Control Block */
	bzero(mp_tp, sizeof(*mp_tp));
	mp_tp->mpt_mpte = mpte;
	mp_tp->mpt_state = MPTCPS_CLOSED;

	DTRACE_MPTCP1(session__create, struct mppcb *, mpp);

	return 0;
}
304 
305 struct sockaddr *
mptcp_get_session_dst(struct mptses * mpte,boolean_t ipv6,boolean_t ipv4)306 mptcp_get_session_dst(struct mptses *mpte, boolean_t ipv6, boolean_t ipv4)
307 {
308 	if (ipv6 && mpte->mpte_sub_dst_v6.sin6_family == AF_INET6) {
309 		return (struct sockaddr *)&mpte->mpte_sub_dst_v6;
310 	}
311 
312 	if (ipv4 && mpte->mpte_sub_dst_v4.sin_family == AF_INET) {
313 		return (struct sockaddr *)&mpte->mpte_sub_dst_v4;
314 	}
315 
316 	/* The interface has neither IPv4 nor IPv6 routes. Give our best guess,
317 	 * meaning we prefer IPv6 over IPv4.
318 	 */
319 	if (mpte->mpte_sub_dst_v6.sin6_family == AF_INET6) {
320 		return (struct sockaddr *)&mpte->mpte_sub_dst_v6;
321 	}
322 
323 	if (mpte->mpte_sub_dst_v4.sin_family == AF_INET) {
324 		return (struct sockaddr *)&mpte->mpte_sub_dst_v4;
325 	}
326 
327 	/* We don't yet have a unicast IP */
328 	return NULL;
329 }
330 
331 static void
mptcpstats_get_bytes(struct mptses * mpte,boolean_t initial_cell,uint64_t * cellbytes,uint64_t * allbytes)332 mptcpstats_get_bytes(struct mptses *mpte, boolean_t initial_cell,
333     uint64_t *cellbytes, uint64_t *allbytes)
334 {
335 	int64_t mycellbytes = 0;
336 	uint64_t myallbytes = 0;
337 	int i;
338 
339 	for (i = 0; i < MPTCP_ITFSTATS_SIZE; i++) {
340 		if (mpte->mpte_itfstats[i].is_expensive) {
341 			mycellbytes += mpte->mpte_itfstats[i].mpis_txbytes;
342 			mycellbytes += mpte->mpte_itfstats[i].mpis_rxbytes;
343 		}
344 
345 		myallbytes += mpte->mpte_itfstats[i].mpis_txbytes;
346 		myallbytes += mpte->mpte_itfstats[i].mpis_rxbytes;
347 	}
348 
349 	if (initial_cell) {
350 		mycellbytes -= mpte->mpte_init_txbytes;
351 		mycellbytes -= mpte->mpte_init_rxbytes;
352 	}
353 
354 	if (mycellbytes < 0) {
355 		os_log_error(mptcp_log_handle, "%s - %lx: cellbytes is %lld\n",
356 		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mycellbytes);
357 		*cellbytes = 0;
358 		*allbytes = 0;
359 	} else {
360 		*cellbytes = mycellbytes;
361 		*allbytes = myallbytes;
362 	}
363 }
364 
365 static void
mptcpstats_session_wrapup(struct mptses * mpte)366 mptcpstats_session_wrapup(struct mptses *mpte)
367 {
368 	boolean_t cell = mpte->mpte_initial_cell;
369 
370 	switch (mpte->mpte_svctype) {
371 	case MPTCP_SVCTYPE_HANDOVER:
372 		if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
373 			tcpstat.tcps_mptcp_fp_handover_attempt++;
374 
375 			if (cell && mpte->mpte_handshake_success) {
376 				tcpstat.tcps_mptcp_fp_handover_success_cell++;
377 
378 				if (mpte->mpte_used_wifi) {
379 					tcpstat.tcps_mptcp_handover_wifi_from_cell++;
380 				}
381 			} else if (mpte->mpte_handshake_success) {
382 				tcpstat.tcps_mptcp_fp_handover_success_wifi++;
383 
384 				if (mpte->mpte_used_cell) {
385 					tcpstat.tcps_mptcp_handover_cell_from_wifi++;
386 				}
387 			}
388 		} else {
389 			tcpstat.tcps_mptcp_handover_attempt++;
390 
391 			if (cell && mpte->mpte_handshake_success) {
392 				tcpstat.tcps_mptcp_handover_success_cell++;
393 
394 				if (mpte->mpte_used_wifi) {
395 					tcpstat.tcps_mptcp_handover_wifi_from_cell++;
396 				}
397 			} else if (mpte->mpte_handshake_success) {
398 				tcpstat.tcps_mptcp_handover_success_wifi++;
399 
400 				if (mpte->mpte_used_cell) {
401 					tcpstat.tcps_mptcp_handover_cell_from_wifi++;
402 				}
403 			}
404 		}
405 
406 		if (mpte->mpte_handshake_success) {
407 			uint64_t cellbytes;
408 			uint64_t allbytes;
409 
410 			mptcpstats_get_bytes(mpte, cell, &cellbytes, &allbytes);
411 
412 			tcpstat.tcps_mptcp_handover_cell_bytes += cellbytes;
413 			tcpstat.tcps_mptcp_handover_all_bytes += allbytes;
414 		}
415 		break;
416 	case MPTCP_SVCTYPE_INTERACTIVE:
417 		if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
418 			tcpstat.tcps_mptcp_fp_interactive_attempt++;
419 
420 			if (mpte->mpte_handshake_success) {
421 				tcpstat.tcps_mptcp_fp_interactive_success++;
422 
423 				if (!cell && mpte->mpte_used_cell) {
424 					tcpstat.tcps_mptcp_interactive_cell_from_wifi++;
425 				}
426 			}
427 		} else {
428 			tcpstat.tcps_mptcp_interactive_attempt++;
429 
430 			if (mpte->mpte_handshake_success) {
431 				tcpstat.tcps_mptcp_interactive_success++;
432 
433 				if (!cell && mpte->mpte_used_cell) {
434 					tcpstat.tcps_mptcp_interactive_cell_from_wifi++;
435 				}
436 			}
437 		}
438 
439 		if (mpte->mpte_handshake_success) {
440 			uint64_t cellbytes;
441 			uint64_t allbytes;
442 
443 			mptcpstats_get_bytes(mpte, cell, &cellbytes, &allbytes);
444 
445 			tcpstat.tcps_mptcp_interactive_cell_bytes += cellbytes;
446 			tcpstat.tcps_mptcp_interactive_all_bytes += allbytes;
447 		}
448 		break;
449 	case MPTCP_SVCTYPE_AGGREGATE:
450 		if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
451 			tcpstat.tcps_mptcp_fp_aggregate_attempt++;
452 
453 			if (mpte->mpte_handshake_success) {
454 				tcpstat.tcps_mptcp_fp_aggregate_success++;
455 			}
456 		} else {
457 			tcpstat.tcps_mptcp_aggregate_attempt++;
458 
459 			if (mpte->mpte_handshake_success) {
460 				tcpstat.tcps_mptcp_aggregate_success++;
461 			}
462 		}
463 
464 		if (mpte->mpte_handshake_success) {
465 			uint64_t cellbytes;
466 			uint64_t allbytes;
467 
468 			mptcpstats_get_bytes(mpte, cell, &cellbytes, &allbytes);
469 
470 			tcpstat.tcps_mptcp_aggregate_cell_bytes += cellbytes;
471 			tcpstat.tcps_mptcp_aggregate_all_bytes += allbytes;
472 		}
473 		break;
474 	}
475 
476 	if (cell && mpte->mpte_handshake_success && mpte->mpte_used_wifi) {
477 		tcpstat.tcps_mptcp_back_to_wifi++;
478 	}
479 
480 	if (mpte->mpte_triggered_cell) {
481 		tcpstat.tcps_mptcp_triggered_cell++;
482 	}
483 }
484 
/*
 * Destroy an MPTCP session.
 *
 * Invoked once all subflows are gone (the VERIFY below enforces an empty
 * subflow list).  Records final statistics, drops any cell-icon increments
 * this session still holds, flushes recorded socket options, and frees the
 * session-owned memory (grown itfinfo array, receive queue, reinject queue).
 */
static void
mptcp_session_destroy(struct mptses *mpte)
{
	struct mptcb *mp_tp = mpte->mpte_mptcb;

	VERIFY(mp_tp != NULL);
	VERIFY(TAILQ_EMPTY(&mpte->mpte_subflows) && mpte->mpte_numflows == 0);

	/* Account stats before any state is torn down */
	mptcpstats_session_wrapup(mpte);
	mptcp_unset_cellicon(mpte, NULL, mpte->mpte_cellicon_increments);
	mptcp_flush_sopts(mpte);

	/*
	 * The itfinfo array is only heap-allocated when it outgrew the
	 * statically embedded one (see mptcp_session_create()).
	 */
	if (mpte->mpte_itfinfo_size > MPTE_ITFINFO_SIZE) {
		kfree_data(mpte->mpte_itfinfo,
		    sizeof(*mpte->mpte_itfinfo) * mpte->mpte_itfinfo_size);
	}
	mpte->mpte_itfinfo = NULL;

	mptcp_freeq(mp_tp);
	m_freem_list(mpte->mpte_reinjectq);

	os_log(mptcp_log_handle, "%s - %lx: Destroying session\n",
	    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
}
512 
513 boolean_t
mptcp_ok_to_create_subflows(struct mptcb * mp_tp)514 mptcp_ok_to_create_subflows(struct mptcb *mp_tp)
515 {
516 	return mp_tp->mpt_state >= MPTCPS_ESTABLISHED &&
517 	       mp_tp->mpt_state < MPTCPS_FIN_WAIT_1 &&
518 	       !(mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP);
519 }
520 
521 static int
mptcp_synthesize_nat64(struct in6_addr * addr,uint32_t len,const struct in_addr * addrv4)522 mptcp_synthesize_nat64(struct in6_addr *addr, uint32_t len,
523     const struct in_addr *addrv4)
524 {
525 	static const struct in6_addr well_known_prefix = {
526 		.__u6_addr.__u6_addr8 = {0x00, 0x64, 0xff, 0x9b, 0x00, 0x00,
527 			                 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
528 			                 0x00, 0x00, 0x00, 0x00},
529 	};
530 	const char *ptrv4 = (const char *)addrv4;
531 	char *ptr = (char *)addr;
532 
533 	if (IN_ZERONET(ntohl(addrv4->s_addr)) || // 0.0.0.0/8 Source hosts on local network
534 	    IN_LOOPBACK(ntohl(addrv4->s_addr)) || // 127.0.0.0/8 Loopback
535 	    IN_LINKLOCAL(ntohl(addrv4->s_addr)) || // 169.254.0.0/16 Link Local
536 	    IN_DS_LITE(ntohl(addrv4->s_addr)) || // 192.0.0.0/29 DS-Lite
537 	    IN_6TO4_RELAY_ANYCAST(ntohl(addrv4->s_addr)) || // 192.88.99.0/24 6to4 Relay Anycast
538 	    IN_MULTICAST(ntohl(addrv4->s_addr)) || // 224.0.0.0/4 Multicast
539 	    INADDR_BROADCAST == addrv4->s_addr) { // 255.255.255.255/32 Limited Broadcast
540 		return -1;
541 	}
542 
543 	/* Check for the well-known prefix */
544 	if (len == NAT64_PREFIX_LEN_96 &&
545 	    IN6_ARE_ADDR_EQUAL(addr, &well_known_prefix)) {
546 		if (IN_PRIVATE(ntohl(addrv4->s_addr)) || // 10.0.0.0/8, 172.16.0.0/12, 192.168.0.0/16 Private-Use
547 		    IN_SHARED_ADDRESS_SPACE(ntohl(addrv4->s_addr))) { // 100.64.0.0/10 Shared Address Space
548 			return -1;
549 		}
550 	}
551 
552 	switch (len) {
553 	case NAT64_PREFIX_LEN_96:
554 		memcpy(ptr + 12, ptrv4, 4);
555 		break;
556 	case NAT64_PREFIX_LEN_64:
557 		memcpy(ptr + 9, ptrv4, 4);
558 		break;
559 	case NAT64_PREFIX_LEN_56:
560 		memcpy(ptr + 7, ptrv4, 1);
561 		memcpy(ptr + 9, ptrv4 + 1, 3);
562 		break;
563 	case NAT64_PREFIX_LEN_48:
564 		memcpy(ptr + 6, ptrv4, 2);
565 		memcpy(ptr + 9, ptrv4 + 2, 2);
566 		break;
567 	case NAT64_PREFIX_LEN_40:
568 		memcpy(ptr + 5, ptrv4, 3);
569 		memcpy(ptr + 9, ptrv4 + 3, 1);
570 		break;
571 	case NAT64_PREFIX_LEN_32:
572 		memcpy(ptr + 4, ptrv4, 4);
573 		break;
574 	default:
575 		panic("NAT64-prefix len is wrong: %u", len);
576 	}
577 
578 	return 0;
579 }
580 
581 static void
mptcp_trigger_cell_bringup(struct mptses * mpte)582 mptcp_trigger_cell_bringup(struct mptses *mpte)
583 {
584 	struct socket *mp_so = mptetoso(mpte);
585 
586 	if (!uuid_is_null(mpsotomppcb(mp_so)->necp_client_uuid)) {
587 		uuid_string_t uuidstr;
588 		int err;
589 
590 		socket_unlock(mp_so, 0);
591 		err = necp_client_assert_bb_radio_manager(mpsotomppcb(mp_so)->necp_client_uuid,
592 		    TRUE);
593 		socket_lock(mp_so, 0);
594 
595 		if (err == 0) {
596 			mpte->mpte_triggered_cell = 1;
597 		}
598 
599 		uuid_unparse_upper(mpsotomppcb(mp_so)->necp_client_uuid, uuidstr);
600 		os_log_info(mptcp_log_handle, "%s - %lx: asked irat to bringup cell for uuid %s, err %d\n",
601 		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), uuidstr, err);
602 	} else {
603 		os_log_info(mptcp_log_handle, "%s - %lx: UUID is already null\n",
604 		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
605 	}
606 }
607 
608 static boolean_t
mptcp_subflow_disconnecting(struct mptsub * mpts)609 mptcp_subflow_disconnecting(struct mptsub *mpts)
610 {
611 	if (mpts->mpts_socket->so_state & SS_ISDISCONNECTED) {
612 		return true;
613 	}
614 
615 	if (mpts->mpts_flags & (MPTSF_DISCONNECTING | MPTSF_DISCONNECTED | MPTSF_CLOSE_REQD)) {
616 		return true;
617 	}
618 
619 	if (sototcpcb(mpts->mpts_socket)->t_state == TCPS_CLOSED) {
620 		return true;
621 	}
622 
623 	return false;
624 }
625 
626 /*
627  * In Handover mode, only create cell subflow if
628  * - Symptoms marked WiFi as weak:
629  *   Here, if we are sending data, then we can check the RTO-state. That is a
630  *   stronger signal of WiFi quality than the Symptoms indicator.
631  *   If however we are not sending any data, the only thing we can do is guess
632  *   and thus bring up Cell.
633  *
634  * - Symptoms marked WiFi as unknown:
635  *   In this state we don't know what the situation is and thus remain
636  *   conservative, only bringing up cell if there are retransmissions going on.
637  */
638 static boolean_t
mptcp_handover_use_cellular(struct mptses * mpte,struct tcpcb * tp)639 mptcp_handover_use_cellular(struct mptses *mpte, struct tcpcb *tp)
640 {
641 	mptcp_wifi_quality_t wifi_quality = mptcp_wifi_quality_for_session(mpte);
642 
643 	if (wifi_quality == MPTCP_WIFI_QUALITY_GOOD) {
644 		/* WiFi is good - don't use cell */
645 		return false;
646 	}
647 
648 	if (wifi_quality == MPTCP_WIFI_QUALITY_UNSURE) {
649 		/*
650 		 * We are in unknown state, only use Cell if we have confirmed
651 		 * that WiFi is bad.
652 		 */
653 		if (mptetoso(mpte)->so_snd.sb_cc != 0 && tp->t_rxtshift >= mptcp_fail_thresh * 2) {
654 			return true;
655 		} else {
656 			return false;
657 		}
658 	}
659 
660 	if (wifi_quality == MPTCP_WIFI_QUALITY_BAD) {
661 		/*
662 		 * WiFi is confirmed to be bad from Symptoms-Framework.
663 		 * If we are sending data, check the RTOs.
664 		 * Otherwise, be pessimistic and use Cell.
665 		 */
666 		if (mptetoso(mpte)->so_snd.sb_cc != 0) {
667 			if (tp->t_rxtshift >= mptcp_fail_thresh * 2) {
668 				return true;
669 			} else {
670 				return false;
671 			}
672 		} else {
673 			return true;
674 		}
675 	}
676 
677 	return false;
678 }
679 
/*
 * Walk the session's interface-info list and create any subflow that should
 * exist but does not yet.
 *
 * For every usable interface: skip it if a (non-disconnecting) subflow
 * already runs on it, or - for handover/target-based service types - if an
 * acceptable WiFi subflow already covers the connection.  When the
 * destination is IPv4-only but the interface only offers NAT64, the
 * destination address is synthesized first.  If no cellular interface was
 * usable but cell is wanted, ask NECP to bring up the cellular interface.
 * Caller holds the MPTCP socket lock (this accesses mpte/mp_tp throughout).
 */
void
mptcp_check_subflows_and_add(struct mptses *mpte)
{
	struct mptcb *mp_tp = mpte->mpte_mptcb;
	boolean_t cellular_viable = FALSE;
	boolean_t want_cellular = TRUE;
	uint32_t i;

	if (!mptcp_ok_to_create_subflows(mp_tp)) {
		os_log_debug(mptcp_log_handle, "%s - %lx: not a good time for subflows, state %u flags %#x",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mp_tp->mpt_state, mp_tp->mpt_flags);
		return;
	}

	/* Just to see if we have an IP-address available */
	if (mptcp_get_session_dst(mpte, false, false) == NULL) {
		return;
	}

	for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
		boolean_t need_to_ask_symptoms = FALSE, found = FALSE;
		struct mpt_itf_info *info;
		struct sockaddr_in6 nat64pre;
		struct sockaddr *dst;
		struct mptsub *mpts;
		struct ifnet *ifp;
		uint32_t ifindex;

		info = &mpte->mpte_itfinfo[i];

		ifindex = info->ifindex;
		if (ifindex == IFSCOPE_NONE) {
			continue;
		}

		os_log(mptcp_log_handle, "%s - %lx: itf %u no support %u hasv4 %u has v6 %u hasnat64 %u\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), info->ifindex, info->no_mptcp_support,
		    info->has_v4_conn, info->has_v6_conn, info->has_nat64_conn);

		if (info->no_mptcp_support) {
			continue;
		}

		/* Resolve the ifindex to an ifnet under the global list lock */
		ifnet_head_lock_shared();
		ifp = ifindex2ifnet[ifindex];
		ifnet_head_done();

		if (ifp == NULL) {
			continue;
		}

		if (IFNET_IS_CELLULAR(ifp)) {
			cellular_viable = TRUE;

			/* In handover modes, don't even consider cell while WiFi is good */
			if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER ||
			    mpte->mpte_svctype == MPTCP_SVCTYPE_PURE_HANDOVER) {
				if (mptcp_wifi_quality_for_session(mpte) == MPTCP_WIFI_QUALITY_GOOD) {
					continue;
				}
			}
		}

		/* Check the existing subflows before creating one on this itf */
		TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
			const struct ifnet *subifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
			struct tcpcb *tp = sototcpcb(mpts->mpts_socket);

			if (subifp == NULL) {
				continue;
			}

			/*
			 * If there is at least one functioning subflow on WiFi
			 * and we are checking for the cell interface, then
			 * we always need to ask symptoms for permission as
			 * cell is triggered even if WiFi is available.
			 */
			if (!IFNET_IS_CELLULAR(subifp) &&
			    !mptcp_subflow_disconnecting(mpts) &&
			    IFNET_IS_CELLULAR(ifp)) {
				need_to_ask_symptoms = TRUE;
			}

			if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER || mpte->mpte_svctype == MPTCP_SVCTYPE_PURE_HANDOVER) {
				os_log(mptcp_log_handle,
				    "%s - %lx: %s: cell %u wifi-state %d flags %#x rxt %u first-party %u sb_cc %u ifindex %u this %u rtt %u rttvar %u rto %u\n",
				    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
				    mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER ? "handover" : "pure-handover",
				    IFNET_IS_CELLULAR(subifp),
				    mptcp_wifi_quality_for_session(mpte),
				    mpts->mpts_flags,
				    tp->t_rxtshift,
				    !!(mpte->mpte_flags & MPTE_FIRSTPARTY),
				    mptetoso(mpte)->so_snd.sb_cc,
				    ifindex, subifp->if_index,
				    tp->t_srtt >> TCP_RTT_SHIFT,
				    tp->t_rttvar >> TCP_RTTVAR_SHIFT,
				    tp->t_rxtcur);

				if (!IFNET_IS_CELLULAR(subifp) &&
				    !mptcp_subflow_disconnecting(mpts) &&
				    (mpts->mpts_flags & MPTSF_CONNECTED) &&
				    !mptcp_handover_use_cellular(mpte, tp)) {
					found = TRUE;

					/* We found a proper subflow on WiFi - no need for cell */
					want_cellular = FALSE;
					break;
				}
			} else if (mpte->mpte_svctype == MPTCP_SVCTYPE_TARGET_BASED) {
				uint64_t time_now = mach_continuous_time();

				os_log(mptcp_log_handle,
				    "%s - %lx: target-based: %llu now %llu wifi quality %d cell %u sostat %#x mpts_flags %#x tcp-state %u\n",
				    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpte->mpte_time_target,
				    time_now, mptcp_wifi_quality_for_session(mpte),
				    IFNET_IS_CELLULAR(subifp), mpts->mpts_socket->so_state,
				    mpts->mpts_flags, sototcpcb(mpts->mpts_socket)->t_state);

				/*
				 * A WiFi subflow suffices while the target time has
				 * not been reached (or is unset), or WiFi is good.
				 */
				if (!IFNET_IS_CELLULAR(subifp) &&
				    !mptcp_subflow_disconnecting(mpts) &&
				    (mpte->mpte_time_target == 0 ||
				    (int64_t)(mpte->mpte_time_target - time_now) > 0 ||
				    mptcp_wifi_quality_for_session(mpte) == MPTCP_WIFI_QUALITY_GOOD)) {
					found = TRUE;

					want_cellular = FALSE;
					break;
				}
			}

			if (subifp->if_index == ifindex &&
			    !mptcp_subflow_disconnecting(mpts)) {
				/*
				 * We found a subflow on this interface.
				 * No need to create a new one.
				 */
				found = TRUE;
				break;
			}
		}

		if (found) {
			continue;
		}

		/*
		 * Non-first-party apps without an explicit grant must get
		 * permission from Symptoms before bringing up cell.
		 */
		if (need_to_ask_symptoms &&
		    !(mpte->mpte_flags & MPTE_FIRSTPARTY) &&
		    !(mpte->mpte_flags & MPTE_ACCESS_GRANTED) &&
		    mptcp_developer_mode == 0) {
			mptcp_ask_symptoms(mpte);
			return;
		}

		dst = mptcp_get_session_dst(mpte, info->has_v6_conn, info->has_v4_conn);

		/* IPv4 destination over a NAT64-only interface: synthesize */
		if (dst->sa_family == AF_INET &&
		    !info->has_v4_conn && info->has_nat64_conn) {
			struct ipv6_prefix nat64prefixes[NAT64_MAX_NUM_PREFIXES];
			int error, j;

			bzero(&nat64pre, sizeof(struct sockaddr_in6));

			error = ifnet_get_nat64prefix(ifp, nat64prefixes);
			if (error) {
				os_log_error(mptcp_log_handle, "%s - %lx: no NAT64-prefix on itf %s, error %d\n",
				    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), ifp->if_name, error);
				continue;
			}

			/* Use the first prefix with a non-zero length */
			for (j = 0; j < NAT64_MAX_NUM_PREFIXES; j++) {
				if (nat64prefixes[j].prefix_len != 0) {
					break;
				}
			}

			VERIFY(j < NAT64_MAX_NUM_PREFIXES);

			error = mptcp_synthesize_nat64(&nat64prefixes[j].ipv6_prefix,
			    nat64prefixes[j].prefix_len,
			    &((struct sockaddr_in *)(void *)dst)->sin_addr);
			if (error != 0) {
				os_log_error(mptcp_log_handle, "%s - %lx: cannot synthesize this addr\n",
				    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
				continue;
			}

			memcpy(&nat64pre.sin6_addr,
			    &nat64prefixes[j].ipv6_prefix,
			    sizeof(nat64pre.sin6_addr));
			nat64pre.sin6_len = sizeof(struct sockaddr_in6);
			nat64pre.sin6_family = AF_INET6;
			nat64pre.sin6_port = ((struct sockaddr_in *)(void *)dst)->sin_port;
			nat64pre.sin6_flowinfo = 0;
			nat64pre.sin6_scope_id = 0;

			dst = (struct sockaddr *)&nat64pre;
		}

		/* Interface cannot carry this address family - skip it */
		if (dst->sa_family == AF_INET && !info->has_v4_conn) {
			continue;
		}
		if (dst->sa_family == AF_INET6 && !info->has_v6_conn) {
			continue;
		}

		mptcp_subflow_add(mpte, NULL, dst, ifindex, NULL);
	}

	if (!cellular_viable && want_cellular) {
		/* Trigger Cell Bringup */
		mptcp_trigger_cell_bringup(mpte);
	}
}
893 
894 static void
mptcp_remove_cell_subflows(struct mptses * mpte)895 mptcp_remove_cell_subflows(struct mptses *mpte)
896 {
897 	struct mptsub *mpts, *tmpts;
898 
899 	TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
900 		const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
901 
902 		if (ifp == NULL || !IFNET_IS_CELLULAR(ifp)) {
903 			continue;
904 		}
905 
906 		os_log(mptcp_log_handle, "%s - %lx: removing cell subflow\n",
907 		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
908 
909 		soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
910 	}
911 
912 	return;
913 }
914 
915 static void
mptcp_remove_wifi_subflows(struct mptses * mpte)916 mptcp_remove_wifi_subflows(struct mptses *mpte)
917 {
918 	struct mptsub *mpts, *tmpts;
919 
920 	TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
921 		const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
922 
923 		if (ifp == NULL || IFNET_IS_CELLULAR(ifp)) {
924 			continue;
925 		}
926 
927 		os_log(mptcp_log_handle, "%s - %lx: removing wifi subflow\n",
928 		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
929 
930 		soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
931 	}
932 
933 	return;
934 }
935 
/*
 * Pure-handover mode: keep exactly one "side" of the connection alive.
 * If a working WiFi subflow exists (or WiFi is good), remove all cellular
 * subflows; otherwise, if only cell is working, remove the WiFi subflows.
 */
static void
mptcp_pure_handover_subflows_remove(struct mptses *mpte)
{
	mptcp_wifi_quality_t wifi_quality = mptcp_wifi_quality_for_session(mpte);
	boolean_t found_working_wifi_subflow = false;
	boolean_t found_working_cell_subflow = false;

	struct mptsub *mpts;

	/*
	 * Look for a subflow that is on a non-cellular interface in connected
	 * state.
	 *
	 * In that case, remove all cellular subflows.
	 *
	 * If however there is no working WiFi subflow, keep the cellular ones
	 * (and, when cell is carrying the connection, remove WiFi instead).
	 */
	TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
		const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
		struct socket *so;
		struct tcpcb *tp;

		if (ifp == NULL) {
			continue;
		}

		so = mpts->mpts_socket;
		tp = sototcpcb(so);

		/* Only fully established, non-disconnecting subflows count */
		if (!(mpts->mpts_flags & MPTSF_CONNECTED) ||
		    tp->t_state != TCPS_ESTABLISHED ||
		    mptcp_subflow_disconnecting(mpts)) {
			continue;
		}

		if (IFNET_IS_CELLULAR(ifp)) {
			found_working_cell_subflow = true;
		} else {
			os_log_debug(mptcp_log_handle, "%s - %lx: rxt %u sb_cc %u wifi quality %d\n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), tp->t_rxtshift, mptetoso(mpte)->so_snd.sb_cc, wifi_quality);
			/* A WiFi subflow only counts as working if handover
			 * logic would not push traffic to cell anyway */
			if (!mptcp_handover_use_cellular(mpte, tp)) {
				found_working_wifi_subflow = true;
			}
		}
	}

	/*
	 * Couldn't find a working subflow, let's not remove those on a cellular
	 * interface.
	 */
	os_log_debug(mptcp_log_handle, "%s - %lx: Found Wi-Fi: %u Found Cellular %u",
	    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
	    found_working_wifi_subflow, found_working_cell_subflow);
	if (!found_working_wifi_subflow && wifi_quality != MPTCP_WIFI_QUALITY_GOOD) {
		if (found_working_cell_subflow) {
			mptcp_remove_wifi_subflows(mpte);
		}
		return;
	}

	mptcp_remove_cell_subflows(mpte);
}
998 
999 static void
mptcp_handover_subflows_remove(struct mptses * mpte)1000 mptcp_handover_subflows_remove(struct mptses *mpte)
1001 {
1002 	mptcp_wifi_quality_t wifi_quality = mptcp_wifi_quality_for_session(mpte);
1003 	boolean_t found_working_subflow = false;
1004 	struct mptsub *mpts;
1005 
1006 	/*
1007 	 * Look for a subflow that is on a non-cellular interface
1008 	 * and actually works (aka, no retransmission timeout).
1009 	 */
1010 	TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
1011 		const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
1012 		struct socket *so;
1013 		struct tcpcb *tp;
1014 
1015 		if (ifp == NULL || IFNET_IS_CELLULAR(ifp)) {
1016 			continue;
1017 		}
1018 
1019 		so = mpts->mpts_socket;
1020 		tp = sototcpcb(so);
1021 
1022 		if (!(mpts->mpts_flags & MPTSF_CONNECTED) ||
1023 		    tp->t_state != TCPS_ESTABLISHED) {
1024 			continue;
1025 		}
1026 
1027 		os_log_debug(mptcp_log_handle, "%s - %lx: rxt %u sb_cc %u wifi quality %d\n",
1028 		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), tp->t_rxtshift, mptetoso(mpte)->so_snd.sb_cc, wifi_quality);
1029 
1030 		if (!mptcp_handover_use_cellular(mpte, tp)) {
1031 			found_working_subflow = true;
1032 			break;
1033 		}
1034 	}
1035 
1036 	/*
1037 	 * Couldn't find a working subflow, let's not remove those on a cellular
1038 	 * interface.
1039 	 */
1040 	if (!found_working_subflow) {
1041 		return;
1042 	}
1043 
1044 	mptcp_remove_cell_subflows(mpte);
1045 }
1046 
static void
mptcp_targetbased_subflows_remove(struct mptses *mpte)
{
	uint64_t time_now = mach_continuous_time();
	struct mptsub *mpts;

	/*
	 * NOTE(review): `(int64_t)(target - now) <= 0` is true once the
	 * target time has been reached or passed; the original inline
	 * comment said "below the target", which reads as the opposite -
	 * confirm intended semantics against mptcp_targetbased_subflows_add().
	 * Either way: when the deadline condition holds and Wi-Fi quality
	 * is not good, keep all subflows (including cellular) alive.
	 */
	if (mpte->mpte_time_target != 0 &&
	    (int64_t)(mpte->mpte_time_target - time_now) <= 0 &&
	    mptcp_wifi_quality_for_session(mpte) != MPTCP_WIFI_QUALITY_GOOD) {
		/* WiFi is bad - don't remove any subflows */
		return;
	}

	TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
		const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;

		/* Only non-cellular subflows qualify as "functioning WiFi" */
		if (ifp == NULL || IFNET_IS_CELLULAR(ifp)) {
			continue;
		}

		/* We have a functioning subflow on WiFi. No need for cell! */
		if (mpts->mpts_flags & MPTSF_CONNECTED &&
		    !mptcp_subflow_disconnecting(mpts)) {
			mptcp_remove_cell_subflows(mpte);
			break;
		}
	}
}
1075 
1076 /*
1077  * Based on the MPTCP Service-type and the state of the subflows, we
1078  * will destroy subflows here.
1079  */
1080 void
mptcp_check_subflows_and_remove(struct mptses * mpte)1081 mptcp_check_subflows_and_remove(struct mptses *mpte)
1082 {
1083 	if (!mptcp_ok_to_create_subflows(mpte->mpte_mptcb)) {
1084 		return;
1085 	}
1086 
1087 	socket_lock_assert_owned(mptetoso(mpte));
1088 
1089 	if (mpte->mpte_svctype == MPTCP_SVCTYPE_PURE_HANDOVER) {
1090 		mptcp_pure_handover_subflows_remove(mpte);
1091 	}
1092 
1093 	if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER) {
1094 		mptcp_handover_subflows_remove(mpte);
1095 	}
1096 
1097 	if (mpte->mpte_svctype == MPTCP_SVCTYPE_TARGET_BASED) {
1098 		mptcp_targetbased_subflows_remove(mpte);
1099 	}
1100 }
1101 
/*
 * Remove subflows that either requested closure (MPTSF_CLOSE_REQD) or whose
 * interface no longer offers a usable address family for the subflow's
 * destination, according to the session's interface-info table.
 */
static void
mptcp_remove_subflows(struct mptses *mpte)
{
	struct mptsub *mpts, *tmpts;

	if (!mptcp_ok_to_create_subflows(mpte->mpte_mptcb)) {
		return;
	}

	TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
		const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
		boolean_t found = false;
		uint32_t ifindex;
		uint32_t i;

		/* Explicit close request (e.g., from the NECP callback) */
		if (mpts->mpts_flags & MPTSF_CLOSE_REQD) {
			mpts->mpts_flags &= ~MPTSF_CLOSE_REQD;

			os_log(mptcp_log_handle, "%s - %lx: itf %u close_reqd last itf %d\n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpts->mpts_ifscope,
			    ifp ? ifp->if_index : -1);
			soevent(mpts->mpts_socket,
			    SO_FILT_HINT_LOCKED | SO_FILT_HINT_NOSRCADDR);

			continue;
		}

		/* Unscoped subflow without an outbound interface: nothing to match */
		if (ifp == NULL && mpts->mpts_ifscope == IFSCOPE_NONE) {
			continue;
		}

		/* Prefer the actual outbound interface over the configured scope */
		if (ifp) {
			ifindex = ifp->if_index;
		} else {
			ifindex = mpts->mpts_ifscope;
		}

		/*
		 * Check whether the interface-info table still advertises
		 * connectivity for this subflow's destination family on
		 * that interface (v6 also counts via NAT64).
		 */
		for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
			if (mpte->mpte_itfinfo[i].ifindex == IFSCOPE_NONE) {
				continue;
			}

			if (mpte->mpte_itfinfo[i].ifindex == ifindex) {
				if (mpts->mpts_dst.sa_family == AF_INET6 &&
				    (mpte->mpte_itfinfo[i].has_v6_conn || mpte->mpte_itfinfo[i].has_nat64_conn)) {
					found = true;
					break;
				}

				if (mpts->mpts_dst.sa_family == AF_INET &&
				    mpte->mpte_itfinfo[i].has_v4_conn) {
					found = true;
					break;
				}
			}
		}

		/* Connectivity gone - signal source-address loss to kill the subflow */
		if (!found) {
			os_log(mptcp_log_handle, "%s - %lx: itf %u killing %#x\n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
			    ifindex, mpts->mpts_flags);

			soevent(mpts->mpts_socket,
			    SO_FILT_HINT_LOCKED | SO_FILT_HINT_NOSRCADDR);
		}
	}
}
1169 
/*
 * Deferred worker (scheduled via timeout() from mptcp_sched_create_subflows)
 * that walks all MPTCP PCBs and performs the pending subflow add/remove
 * passes for sessions flagged with MPP_CREATE_SUBFLOWS.
 */
static void
mptcp_create_subflows(__unused void *arg)
{
	struct mppcb *mpp;

	/*
	 * Start with clearing, because we might be processing connections
	 * while a new event comes in.
	 */
	if (OSTestAndClear(0x01, &mptcp_create_subflows_scheduled)) {
		os_log_error(mptcp_log_handle, "%s: bit was already cleared!\n", __func__);
	}

	/* Iterate over all MPTCP connections */

	lck_mtx_lock(&mtcbinfo.mppi_lock);

	TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
		struct socket *mp_so = mpp->mpp_socket;
		struct mptses *mpte = mpp->mpp_pcbe;

		/* Skip sessions with no pending request or no interface info yet */
		socket_lock(mp_so, 1);
		if (!(mpp->mpp_flags & MPP_CREATE_SUBFLOWS) ||
		    !(mpte->mpte_flags & MPTE_ITFINFO_INIT)) {
			socket_unlock(mp_so, 1);
			continue;
		}

		VERIFY(mp_so->so_usecount > 0);

		mpp->mpp_flags &= ~MPP_CREATE_SUBFLOWS;

		mptcp_check_subflows_and_add(mpte);
		mptcp_remove_subflows(mpte);

		mp_so->so_usecount--; /* See mptcp_sched_create_subflows */
		socket_unlock(mp_so, 1);
	}

	lck_mtx_unlock(&mtcbinfo.mppi_lock);
}
1211 
1212 /*
1213  * We need this because we are coming from an NECP-event. This event gets posted
1214  * while holding NECP-locks. The creation of the subflow however leads us back
1215  * into NECP (e.g., to add the necp_cb and also from tcp_connect).
1216  * So, we would deadlock there as we already hold the NECP-lock.
1217  *
1218  * So, let's schedule this separately. It also gives NECP the chance to make
1219  * progress, without having to wait for MPTCP to finish its subflow creation.
1220  */
void
mptcp_sched_create_subflows(struct mptses *mpte)
{
	struct mppcb *mpp = mpte->mpte_mppcb;
	struct mptcb *mp_tp = mpte->mpte_mptcb;
	struct socket *mp_so = mpp->mpp_socket;

	/* Session is closing or has fallen back - no new subflows */
	if (!mptcp_ok_to_create_subflows(mp_tp)) {
		os_log_debug(mptcp_log_handle, "%s - %lx: not a good time for subflows, state %u flags %#x",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mp_tp->mpt_state, mp_tp->mpt_flags);
		return;
	}

	/*
	 * Mark this session as pending; the usecount keeps the socket alive
	 * until mptcp_create_subflows() drops it after processing.
	 */
	if (!(mpp->mpp_flags & MPP_CREATE_SUBFLOWS)) {
		mp_so->so_usecount++; /* To prevent it from being free'd in-between */
		mpp->mpp_flags |= MPP_CREATE_SUBFLOWS;
	}

	/* Worker already scheduled - the pending flag above is enough */
	if (OSTestAndSet(0x01, &mptcp_create_subflows_scheduled)) {
		return;
	}

	/* Do the call in 100ms to allow NECP to schedule it on all sockets */
	timeout(mptcp_create_subflows, NULL, hz / 10);
}
1246 
1247 /*
1248  * Allocate an MPTCP socket option structure.
1249  */
1250 struct mptopt *
mptcp_sopt_alloc(zalloc_flags_t how)1251 mptcp_sopt_alloc(zalloc_flags_t how)
1252 {
1253 	return zalloc_flags(mptopt_zone, how | Z_ZERO);
1254 }
1255 
1256 /*
1257  * Free an MPTCP socket option structure.
1258  */
void
mptcp_sopt_free(struct mptopt *mpo)
{
	/* Caller must have detached it from the list first (mptcp_sopt_remove) */
	VERIFY(!(mpo->mpo_flags & MPOF_ATTACHED));

	zfree(mptopt_zone, mpo);
}
1266 
1267 /*
1268  * Add a socket option to the MPTCP socket option list.
1269  */
void
mptcp_sopt_insert(struct mptses *mpte, struct mptopt *mpo)
{
	/* List is protected by the MPTCP socket lock */
	socket_lock_assert_owned(mptetoso(mpte));
	mpo->mpo_flags |= MPOF_ATTACHED;
	TAILQ_INSERT_TAIL(&mpte->mpte_sopts, mpo, mpo_entry);
}
1277 
1278 /*
1279  * Remove a socket option from the MPTCP socket option list.
1280  */
void
mptcp_sopt_remove(struct mptses *mpte, struct mptopt *mpo)
{
	/* List is protected by the MPTCP socket lock */
	socket_lock_assert_owned(mptetoso(mpte));
	VERIFY(mpo->mpo_flags & MPOF_ATTACHED);
	mpo->mpo_flags &= ~MPOF_ATTACHED;
	TAILQ_REMOVE(&mpte->mpte_sopts, mpo, mpo_entry);
}
1289 
1290 /*
1291  * Search for an existing <sopt_level,sopt_name> socket option.
1292  */
1293 struct mptopt *
mptcp_sopt_find(struct mptses * mpte,struct sockopt * sopt)1294 mptcp_sopt_find(struct mptses *mpte, struct sockopt *sopt)
1295 {
1296 	struct mptopt *mpo;
1297 
1298 	socket_lock_assert_owned(mptetoso(mpte));
1299 
1300 	TAILQ_FOREACH(mpo, &mpte->mpte_sopts, mpo_entry) {
1301 		if (mpo->mpo_level == sopt->sopt_level &&
1302 		    mpo->mpo_name == sopt->sopt_name) {
1303 			break;
1304 		}
1305 	}
1306 	return mpo;
1307 }
1308 
1309 /*
1310  * Allocate a MPTCP subflow structure.
1311  */
1312 static struct mptsub *
mptcp_subflow_alloc(void)1313 mptcp_subflow_alloc(void)
1314 {
1315 	return zalloc_flags(mptsub_zone, Z_WAITOK | Z_ZERO);
1316 }
1317 
1318 /*
1319  * Deallocate a subflow structure, called when all of the references held
1320  * on it have been released.  This implies that the subflow has been deleted.
1321  */
static void
mptcp_subflow_free(struct mptsub *mpts)
{
	/* Last reference gone, and the subflow must be fully detached */
	VERIFY(mpts->mpts_refcnt == 0);
	VERIFY(mpts->mpts_mpte == NULL);
	VERIFY(mpts->mpts_socket == NULL);

	/* Source address was allocated by the bind path, if any */
	free_sockaddr(mpts->mpts_src);

	zfree(mptsub_zone, mpts);
}
1333 
/* Take an additional reference on the subflow; panics on counter wrap. */
static void
mptcp_subflow_addref(struct mptsub *mpts)
{
	if (++mpts->mpts_refcnt == 0) {
		panic("%s: mpts %p wraparound refcnt", __func__, mpts);
		/* NOTREACHED */
	}
}
1342 
/* Drop a reference on the subflow; frees it when the count hits zero. */
static void
mptcp_subflow_remref(struct mptsub *mpts)
{
	/* Releasing with a zero count indicates a ref-counting bug */
	if (mpts->mpts_refcnt == 0) {
		panic("%s: mpts %p negative refcnt", __func__, mpts);
		/* NOTREACHED */
	}
	if (--mpts->mpts_refcnt > 0) {
		return;
	}

	/* callee will unlock and destroy lock */
	mptcp_subflow_free(mpts);
}
1357 
static void
mptcp_subflow_attach(struct mptses *mpte, struct mptsub *mpts, struct socket *so)
{
	struct socket *mp_so = mpte->mpte_mppcb->mpp_socket;
	struct tcpcb *tp = sototcpcb(so);

	/*
	 * From this moment on, the subflow is linked to the MPTCP-connection.
	 * Locking,... happens now at the MPTCP-layer
	 */
	tp->t_mptcb = mpte->mpte_mptcb;
	so->so_flags |= SOF_MP_SUBFLOW;
	mp_so->so_usecount++;

	/*
	 * Insert the subflow into the list, and associate the MPTCP PCB
	 * as well as the subflow socket.  From this point on, removing
	 * the subflow needs to be done via mptcp_subflow_del().
	 */
	TAILQ_INSERT_TAIL(&mpte->mpte_subflows, mpts, mpts_entry);
	mpte->mpte_numflows++;

	mpts->mpts_mpte = mpte;
	mpts->mpts_socket = so;
	tp->t_mpsub = mpts;
	mptcp_subflow_addref(mpts);     /* for being in MPTCP subflow list */
	mptcp_subflow_addref(mpts);     /* for subflow socket */
}
1386 
/*
 * NECP viability callback for a subflow's inpcb. When the subflow's path
 * becomes non-viable (or the interface enters low-power mode), flag the
 * subflow for closure and schedule the subflow-creation worker to find a
 * replacement path.
 */
static void
mptcp_subflow_necp_cb(void *handle, __unused int action,
    __unused uint32_t interface_index,
    uint32_t necp_flags, bool *viable)
{
	boolean_t low_power = !!(necp_flags & NECP_CLIENT_RESULT_FLAG_INTERFACE_LOW_POWER);
	struct inpcb *inp = (struct inpcb *)handle;
	struct socket *so = inp->inp_socket;
	struct mptsub *mpts;
	struct mptses *mpte;

	/* Low-power interfaces are treated the same as a non-viable path */
	if (low_power) {
		action = NECP_CLIENT_CBACTION_NONVIABLE;
	}

	if (action != NECP_CLIENT_CBACTION_NONVIABLE) {
		return;
	}

	/*
	 * The socket is being garbage-collected. There is nothing to be done
	 * here.
	 */
	if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) == WNT_STOPUSING) {
		return;
	}

	socket_lock(so, 1);

	/* Check again after we acquired the lock. */
	if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) {
		goto out;
	}

	mpte = tptomptp(sototcpcb(so))->mpt_mpte;
	mpts = sototcpcb(so)->t_mpsub;

	os_log_debug(mptcp_log_handle, "%s - %lx: Subflow on itf %u became non-viable, power %u",
	    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpts->mpts_ifscope, low_power);

	/* Actual teardown happens later, in mptcp_remove_subflows() */
	mpts->mpts_flags |= MPTSF_CLOSE_REQD;

	mptcp_sched_create_subflows(mpte);

	/*
	 * For handover/target-based sessions the MPTCP connection as a whole
	 * stays viable - another subflow can take over.
	 */
	if ((mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER ||
	    mpte->mpte_svctype == MPTCP_SVCTYPE_PURE_HANDOVER ||
	    mpte->mpte_svctype == MPTCP_SVCTYPE_TARGET_BASED) &&
	    viable != NULL) {
		*viable = 1;
	}

out:
	socket_unlock(so, 1);
}
1441 
1442 /*
1443  * Create an MPTCP subflow socket.
1444  */
static int
mptcp_subflow_socreate(struct mptses *mpte, struct mptsub *mpts, int dom,
    struct socket **so)
{
	lck_mtx_t *subflow_mtx;
	struct mptopt smpo, *mpo, *tmpo;
	struct proc *p;
	struct socket *mp_so;
	struct mppcb *mpp;
	int error;

	*so = NULL;

	mp_so = mptetoso(mpte);
	mpp = mpsotomppcb(mp_so);

	/* Credentials for the new socket come from the owning process */
	p = proc_find(mp_so->last_pid);
	if (p == PROC_NULL) {
		os_log_error(mptcp_log_handle, "%s - %lx: Couldn't find proc for pid %u\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mp_so->last_pid);

		mptcp_subflow_free(mpts);
		return ESRCH;
	}

	/*
	 * Create the subflow socket (multipath subflow, non-blocking.)
	 *
	 * This will cause SOF_MP_SUBFLOW socket flag to be set on the subflow
	 * socket; it will be cleared when the socket is peeled off or closed.
	 * It also indicates to the underlying TCP to handle MPTCP options.
	 * A multipath subflow socket implies SS_NOFDREF state.
	 */

	/*
	 * Unlock, because tcp_usr_attach ends up in in_pcballoc, which takes
	 * the ipi-lock. We cannot hold the socket-lock at that point.
	 */
	socket_unlock(mp_so, 0);
	error = socreate_internal(dom, so, SOCK_STREAM, IPPROTO_TCP, p,
	    SOCF_MPTCP, PROC_NULL);
	socket_lock(mp_so, 0);
	if (error) {
		os_log_error(mptcp_log_handle, "%s - %lx: unable to create subflow socket error %d\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error);

		proc_rele(p);

		mptcp_subflow_free(mpts);
		return error;
	}

	/*
	 * We need to protect the setting of SOF_MP_SUBFLOW with a lock, because
	 * this marks the moment of lock-switch from the TCP-lock to the MPTCP-lock.
	 * Which is why we also need to get the lock with pr_getlock, as after
	 * setting the flag, socket_unlock will work on the MPTCP-level lock.
	 */
	subflow_mtx = ((*so)->so_proto->pr_getlock)(*so, 0);
	lck_mtx_lock(subflow_mtx);

	/*
	 * Must be the first thing we do, to make sure all pointers for this
	 * subflow are set.
	 */
	mptcp_subflow_attach(mpte, mpts, *so);

	/*
	 * A multipath subflow socket is used internally in the kernel,
	 * therefore it does not have a file descriptor associated by
	 * default.
	 */
	(*so)->so_state |= SS_NOFDREF;

	lck_mtx_unlock(subflow_mtx);

	/* prevent the socket buffers from being compressed */
	(*so)->so_rcv.sb_flags |= SB_NOCOMPRESS;
	(*so)->so_snd.sb_flags |= SB_NOCOMPRESS;

	/* Inherit preconnect and TFO data flags */
	if (mp_so->so_flags1 & SOF1_PRECONNECT_DATA) {
		(*so)->so_flags1 |= SOF1_PRECONNECT_DATA;
	}
	if (mp_so->so_flags1 & SOF1_DATA_IDEMPOTENT) {
		(*so)->so_flags1 |= SOF1_DATA_IDEMPOTENT;
	}
	if (mp_so->so_flags1 & SOF1_DATA_AUTHENTICATED) {
		(*so)->so_flags1 |= SOF1_DATA_AUTHENTICATED;
	}

	/* Inherit uuid and create the related flow. */
	if (!uuid_is_null(mpp->necp_client_uuid)) {
		struct mptcb *mp_tp = mpte->mpte_mptcb;

		sotoinpcb(*so)->necp_cb = mptcp_subflow_necp_cb;

		/*
		 * A note on the unlock: With MPTCP, we do multiple times a
		 * necp_client_register_socket_flow. This is problematic,
		 * because now the lock-ordering guarantee (first necp-locks,
		 * then socket-locks) is no more respected. So, we need to
		 * unlock here.
		 */
		socket_unlock(mp_so, 0);
		error = necp_client_register_socket_flow(mp_so->last_pid,
		    mpp->necp_client_uuid, sotoinpcb(*so));
		socket_lock(mp_so, 0);

		if (error) {
			os_log_error(mptcp_log_handle, "%s - %lx: necp_client_register_socket_flow failed with error %d\n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error);

			goto out_err;
		}

		/* Possible state-change during the unlock above */
		if (mp_tp->mpt_state >= MPTCPS_TIME_WAIT ||
		    (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP)) {
			os_log_error(mptcp_log_handle, "%s - %lx: state changed during unlock: %u flags %#x\n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
			    mp_tp->mpt_state, mp_tp->mpt_flags);

			error = EINVAL;
			goto out_err;
		}

		uuid_copy(sotoinpcb(*so)->necp_client_uuid, mpp->necp_client_uuid);
	}

	/*
	 * Copy the parent's NECP attribute strings (domain, account, domain
	 * owner, tracker domain) onto the subflow's inpcb. A failed
	 * allocation silently leaves the attribute unset.
	 */
	if (mpp->inp_necp_attributes.inp_domain != NULL) {
		size_t string_size = strlen(mpp->inp_necp_attributes.inp_domain);
		sotoinpcb(*so)->inp_necp_attributes.inp_domain = kalloc_data(string_size + 1, Z_WAITOK | Z_ZERO);

		if (sotoinpcb(*so)->inp_necp_attributes.inp_domain) {
			memcpy(sotoinpcb(*so)->inp_necp_attributes.inp_domain, mpp->inp_necp_attributes.inp_domain, string_size + 1);
		}
	}
	if (mpp->inp_necp_attributes.inp_account != NULL) {
		size_t string_size = strlen(mpp->inp_necp_attributes.inp_account);
		sotoinpcb(*so)->inp_necp_attributes.inp_account = kalloc_data(string_size + 1, Z_WAITOK | Z_ZERO);

		if (sotoinpcb(*so)->inp_necp_attributes.inp_account) {
			memcpy(sotoinpcb(*so)->inp_necp_attributes.inp_account, mpp->inp_necp_attributes.inp_account, string_size + 1);
		}
	}

	if (mpp->inp_necp_attributes.inp_domain_owner != NULL) {
		size_t string_size = strlen(mpp->inp_necp_attributes.inp_domain_owner);
		sotoinpcb(*so)->inp_necp_attributes.inp_domain_owner = kalloc_data(string_size + 1, Z_WAITOK | Z_ZERO);

		if (sotoinpcb(*so)->inp_necp_attributes.inp_domain_owner) {
			memcpy(sotoinpcb(*so)->inp_necp_attributes.inp_domain_owner, mpp->inp_necp_attributes.inp_domain_owner, string_size + 1);
		}
	}

	if (mpp->inp_necp_attributes.inp_tracker_domain != NULL) {
		size_t string_size = strlen(mpp->inp_necp_attributes.inp_tracker_domain);
		sotoinpcb(*so)->inp_necp_attributes.inp_tracker_domain = kalloc_data(string_size + 1, Z_WAITOK | Z_ZERO);

		if (sotoinpcb(*so)->inp_necp_attributes.inp_tracker_domain) {
			memcpy(sotoinpcb(*so)->inp_necp_attributes.inp_tracker_domain, mpp->inp_necp_attributes.inp_tracker_domain, string_size + 1);
		}
	}

	/* Needs to happen prior to the delegation! */
	(*so)->last_pid = mp_so->last_pid;

	if (mp_so->so_flags & SOF_DELEGATED) {
		if (mpte->mpte_epid) {
			error = so_set_effective_pid(*so, mpte->mpte_epid, p, false);
			if (error) {
				os_log_error(mptcp_log_handle, "%s - %lx: so_set_effective_pid failed with error %d\n",
				    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error);
				goto out_err;
			}
		}
		if (!uuid_is_null(mpte->mpte_euuid)) {
			error = so_set_effective_uuid(*so, mpte->mpte_euuid, p, false);
			if (error) {
				os_log_error(mptcp_log_handle, "%s - %lx: so_set_effective_uuid failed with error %d\n",
				    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error);
				goto out_err;
			}
		}
	}

	/* inherit the other socket options */
	bzero(&smpo, sizeof(smpo));
	smpo.mpo_flags |= MPOF_SUBFLOW_OK;
	smpo.mpo_level = SOL_SOCKET;
	smpo.mpo_intval = 1;

	/* disable SIGPIPE */
	smpo.mpo_name = SO_NOSIGPIPE;
	if ((error = mptcp_subflow_sosetopt(mpte, mpts, &smpo)) != 0) {
		goto out_err;
	}

	/* find out if the subflow's source address goes away */
	smpo.mpo_name = SO_NOADDRERR;
	if ((error = mptcp_subflow_sosetopt(mpte, mpts, &smpo)) != 0) {
		goto out_err;
	}

	if (mpte->mpte_mptcb->mpt_state >= MPTCPS_ESTABLISHED) {
		/*
		 * On secondary subflows we might need to set the cell-fallback
		 * flag (see conditions in mptcp_subflow_sosetopt).
		 */
		smpo.mpo_level = SOL_SOCKET;
		smpo.mpo_name = SO_MARK_CELLFALLBACK;
		smpo.mpo_intval = 1;
		if ((error = mptcp_subflow_sosetopt(mpte, mpts, &smpo)) != 0) {
			goto out_err;
		}
	}

	/* replay setsockopt(2) on the subflow sockets for eligible options */
	TAILQ_FOREACH_SAFE(mpo, &mpte->mpte_sopts, mpo_entry, tmpo) {
		int interim;

		if (!(mpo->mpo_flags & MPOF_SUBFLOW_OK)) {
			continue;
		}

		/*
		 * Skip those that are handled internally; these options
		 * should not have been recorded and marked with the
		 * MPOF_SUBFLOW_OK by mptcp_setopt(), but just in case.
		 */
		if (mpo->mpo_level == SOL_SOCKET &&
		    (mpo->mpo_name == SO_NOSIGPIPE ||
		    mpo->mpo_name == SO_NOADDRERR ||
		    mpo->mpo_name == SO_KEEPALIVE)) {
			continue;
		}

		interim = (mpo->mpo_flags & MPOF_INTERIM);
		if (mptcp_subflow_sosetopt(mpte, mpts, mpo) != 0 && interim) {
			os_log_error(mptcp_log_handle, "%s - %lx: sopt %s val %d interim record removed\n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
			    mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name),
			    mpo->mpo_intval);
			mptcp_sopt_remove(mpte, mpo);
			mptcp_sopt_free(mpo);
			continue;
		}
	}

	/*
	 * We need to receive everything that the subflow socket has,
	 * so use a customized socket receive function.  We will undo
	 * this when the socket is peeled off or closed.
	 */
	switch (dom) {
	case PF_INET:
		(*so)->so_proto = &mptcp_subflow_protosw;
		break;
	case PF_INET6:
		(*so)->so_proto = (struct protosw *)&mptcp_subflow_protosw6;
		break;
	default:
		VERIFY(0);
		/* NOTREACHED */
	}

	proc_rele(p);

	DTRACE_MPTCP3(subflow__create, struct mptses *, mpte,
	    int, dom, int, error);

	return 0;

out_err:
	/* Aborting the subflow also releases the two refs taken in attach */
	mptcp_subflow_abort(mpts, error);

	proc_rele(p);

	return error;
}
1726 
1727 /*
1728  * Close an MPTCP subflow socket.
1729  *
1730  * Note that this may be called on an embryonic subflow, and the only
1731  * thing that is guaranteed valid is the protocol-user request.
1732  */
1733 static void
mptcp_subflow_soclose(struct mptsub * mpts)1734 mptcp_subflow_soclose(struct mptsub *mpts)
1735 {
1736 	struct socket *so = mpts->mpts_socket;
1737 
1738 	if (mpts->mpts_flags & MPTSF_CLOSED) {
1739 		return;
1740 	}
1741 
1742 	VERIFY(so != NULL);
1743 	VERIFY(so->so_flags & SOF_MP_SUBFLOW);
1744 	VERIFY((so->so_state & (SS_NBIO | SS_NOFDREF)) == (SS_NBIO | SS_NOFDREF));
1745 
1746 	DTRACE_MPTCP5(subflow__close, struct mptsub *, mpts,
1747 	    struct socket *, so,
1748 	    struct sockbuf *, &so->so_rcv,
1749 	    struct sockbuf *, &so->so_snd,
1750 	    struct mptses *, mpts->mpts_mpte);
1751 
1752 	mpts->mpts_flags |= MPTSF_CLOSED;
1753 
1754 	if (so->so_retaincnt == 0) {
1755 		soclose_locked(so);
1756 
1757 		return;
1758 	} else {
1759 		VERIFY(so->so_usecount > 0);
1760 		so->so_usecount--;
1761 	}
1762 
1763 	return;
1764 }
1765 
1766 static void
mptcp_attach_to_subf(struct socket * so,struct mptcb * mp_tp,uint8_t addr_id)1767 mptcp_attach_to_subf(struct socket *so, struct mptcb *mp_tp, uint8_t addr_id)
1768 {
1769 	struct tcpcb *tp = sototcpcb(so);
1770 	struct mptcp_subf_auth_entry *sauth_entry;
1771 
1772 	/*
1773 	 * The address ID of the first flow is implicitly 0.
1774 	 */
1775 	if (mp_tp->mpt_state == MPTCPS_CLOSED) {
1776 		tp->t_local_aid = 0;
1777 	} else {
1778 		tp->t_local_aid = addr_id;
1779 		tp->t_mpflags |= (TMPF_PREESTABLISHED | TMPF_JOINED_FLOW);
1780 		so->so_flags |= SOF_MP_SEC_SUBFLOW;
1781 	}
1782 	sauth_entry = zalloc(mpt_subauth_zone);
1783 	sauth_entry->msae_laddr_id = tp->t_local_aid;
1784 	sauth_entry->msae_raddr_id = 0;
1785 	sauth_entry->msae_raddr_rand = 0;
1786 try_again:
1787 	sauth_entry->msae_laddr_rand = RandomULong();
1788 	if (sauth_entry->msae_laddr_rand == 0) {
1789 		goto try_again;
1790 	}
1791 	LIST_INSERT_HEAD(&mp_tp->mpt_subauth_list, sauth_entry, msae_next);
1792 }
1793 
1794 static void
mptcp_detach_mptcb_from_subf(struct mptcb * mp_tp,struct socket * so)1795 mptcp_detach_mptcb_from_subf(struct mptcb *mp_tp, struct socket *so)
1796 {
1797 	struct mptcp_subf_auth_entry *sauth_entry;
1798 	struct tcpcb *tp = NULL;
1799 	int found = 0;
1800 
1801 	tp = sototcpcb(so);
1802 	if (tp == NULL) {
1803 		return;
1804 	}
1805 
1806 	LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
1807 		if (sauth_entry->msae_laddr_id == tp->t_local_aid) {
1808 			found = 1;
1809 			break;
1810 		}
1811 	}
1812 	if (found) {
1813 		LIST_REMOVE(sauth_entry, msae_next);
1814 	}
1815 
1816 	if (found) {
1817 		zfree(mpt_subauth_zone, sauth_entry);
1818 	}
1819 }
1820 
1821 /*
1822  * Connect an MPTCP subflow socket.
1823  *
1824  * Note that in the pending connect case, the subflow socket may have been
1825  * bound to an interface and/or a source IP address which may no longer be
1826  * around by the time this routine is called; in that case the connect attempt
1827  * will most likely fail.
1828  */
static int
mptcp_subflow_soconnectx(struct mptses *mpte, struct mptsub *mpts)
{
	char dbuf[MAX_IPv6_STR_LEN];
	struct socket *mp_so, *so;
	struct mptcb *mp_tp;
	struct sockaddr *dst;
	struct proc *p;
	int af, error, dport;

	mp_so = mptetoso(mpte);
	mp_tp = mpte->mpte_mptcb;
	so = mpts->mpts_socket;
	af = mpts->mpts_dst.sa_family;
	dst = &mpts->mpts_dst;

	/* Must be mid-connect: CONNECTING set, CONNECTED not yet */
	VERIFY((mpts->mpts_flags & (MPTSF_CONNECTING | MPTSF_CONNECTED)) == MPTSF_CONNECTING);
	VERIFY(mpts->mpts_socket != NULL);
	VERIFY(af == AF_INET || af == AF_INET6);

	/* Render destination address/port for logging */
	if (af == AF_INET) {
		inet_ntop(af, &SIN(dst)->sin_addr.s_addr, dbuf, sizeof(dbuf));
		dport = ntohs(SIN(dst)->sin_port);
	} else {
		inet_ntop(af, &SIN6(dst)->sin6_addr, dbuf, sizeof(dbuf));
		dport = ntohs(SIN6(dst)->sin6_port);
	}

	os_log(mptcp_log_handle,
	    "%s - %lx: ifindex %u dst %s:%d pended %u\n", __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
	    mpts->mpts_ifscope, dbuf, dport, !!(mpts->mpts_flags & MPTSF_CONNECT_PENDING));

	p = proc_find(mp_so->last_pid);
	if (p == PROC_NULL) {
		os_log_error(mptcp_log_handle, "%s - %lx: Couldn't find proc for pid %u\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mp_so->last_pid);

		return ESRCH;
	}

	mpts->mpts_flags &= ~MPTSF_CONNECT_PENDING;

	/* Assign the address ID and set up the subflow auth entry */
	mptcp_attach_to_subf(so, mpte->mpte_mptcb, mpte->mpte_addrid_last);

	/* connect the subflow socket */
	error = soconnectxlocked(so, mpts->mpts_src, &mpts->mpts_dst,
	    p, mpts->mpts_ifscope,
	    mpte->mpte_associd, NULL, 0, NULL, 0, NULL, NULL);

	mpts->mpts_iss = sototcpcb(so)->iss;

	/* See tcp_connect_complete */
	if (mp_tp->mpt_state < MPTCPS_ESTABLISHED &&
	    (mp_so->so_flags1 & SOF1_PRECONNECT_DATA)) {
		mp_tp->mpt_sndwnd = sototcpcb(so)->snd_wnd;
	}

	/* Allocate a unique address id per subflow (0 is reserved) */
	mpte->mpte_addrid_last++;
	if (mpte->mpte_addrid_last == 0) {
		mpte->mpte_addrid_last++;
	}

	proc_rele(p);

	DTRACE_MPTCP3(subflow__connect, struct mptses *, mpte,
	    struct mptsub *, mpts, int, error);
	if (error) {
		os_log_error(mptcp_log_handle, "%s - %lx: connectx failed with error %d ifscope %u\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error, mpts->mpts_ifscope);
	}

	return error;
}
1903 
/*
 * Adjust/validate the DSS mapping on an mbuf that is being moved from the
 * subflow receive buffer up to the MPTCP layer.
 *
 * so   - subflow socket whose receive buffer holds 'm'
 * m    - mbuf (head of a chain) being re-mapped
 * off  - offset of 'm' into the mapping described by dsn/rseq/dlen
 * dsn  - data sequence number of the mapping's first byte
 * rseq - subflow-relative sequence number of the mapping's first byte
 * dlen - total mapping length (includes the DATA_FIN octet when dfin != 0)
 * dfin - 1 if the mapping carries a DATA_FIN, 0 otherwise
 *
 * Returns 0 on success, -1 after posting SO_FILT_HINT_MUSTRST on the
 * subflow (inconsistent second mapping, or m_split() failure).
 */
static int
mptcp_adj_rmap(struct socket *so, struct mbuf *m, int off, uint64_t dsn,
    uint32_t rseq, uint16_t dlen, uint8_t dfin)
{
	struct mptsub *mpts = sototcpcb(so)->t_mpsub;

	/* Nothing to map on an empty packet */
	if (m_pktlen(m) == 0) {
		return 0;
	}

	/* Only pkthdr mbufs carry MPTCP mapping metadata */
	if (!(m->m_flags & M_PKTHDR)) {
		return 0;
	}

	if (m->m_pkthdr.pkt_flags & PKTF_MPTCP) {
		/*
		 * The mbuf already carries a mapping.  If we are in the middle
		 * of a mapping (off != 0), the peer must not have changed any
		 * of its parameters; a mismatch means a broken/malicious peer,
		 * so reset the subflow.
		 */
		if (off && (dsn != m->m_pkthdr.mp_dsn ||
		    rseq != m->m_pkthdr.mp_rseq ||
		    dlen != m->m_pkthdr.mp_rlen ||
		    dfin != !!(m->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN))) {
			os_log_error(mptcp_log_handle, "%s - %lx: Received incorrect second mapping: DSN: %u - %u , SSN: %u - %u, DLEN: %u - %u, DFIN: %u - %u\n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpts->mpts_mpte),
			    (uint32_t)dsn, (uint32_t)m->m_pkthdr.mp_dsn,
			    rseq, m->m_pkthdr.mp_rseq,
			    dlen, m->m_pkthdr.mp_rlen,
			    dfin, !!(m->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN));

			soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
			return -1;
		}
	}

	/* If mbuf is beyond right edge of the mapping, we need to split */
	if (m_pktlen(m) > dlen - dfin - off) {
		struct mbuf *new = m_split(m, dlen - dfin - off, M_DONTWAIT);
		if (new == NULL) {
			os_log_error(mptcp_log_handle, "%s - %lx: m_split failed dlen %u dfin %u off %d pktlen %d, killing subflow %d",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpts->mpts_mpte),
			    dlen, dfin, off, m_pktlen(m),
			    mpts->mpts_connid);

			soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
			return -1;
		}

		/* Re-link the tail and keep the sockbuf accounting consistent */
		m->m_next = new;
		sballoc(&so->so_rcv, new);
		/* Undo, as sballoc will add to it as well */
		so->so_rcv.sb_cc -= new->m_len;

		if (so->so_rcv.sb_mbtail == m) {
			so->so_rcv.sb_mbtail = new;
		}
	}

	/* Stamp this mbuf with its slice of the mapping */
	m->m_pkthdr.pkt_flags |= PKTF_MPTCP;
	m->m_pkthdr.mp_dsn = dsn + off;
	m->m_pkthdr.mp_rseq = rseq + off;
	VERIFY(m_pktlen(m) < UINT16_MAX);
	m->m_pkthdr.mp_rlen = (uint16_t)m_pktlen(m);

	/* Only put the DATA_FIN-flag on the last mbuf of this mapping */
	if (dfin) {
		if (m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen < dsn + dlen - dfin) {
			m->m_pkthdr.pkt_flags &= ~PKTF_MPTCP_DFIN;
		} else {
			m->m_pkthdr.pkt_flags |= PKTF_MPTCP_DFIN;
		}
	}


	/* Receiving a valid DSS mapping proves the peer speaks MPTCP */
	mpts->mpts_flags |= MPTSF_FULLY_ESTABLISHED;

	return 0;
}
1978 
1979 /*
1980  * Update the pid, upid, uuid of the subflow so, based on parent so
1981  */
1982 static void
mptcp_update_last_owner(struct socket * so,struct socket * mp_so)1983 mptcp_update_last_owner(struct socket *so, struct socket *mp_so)
1984 {
1985 	if (so->last_pid != mp_so->last_pid ||
1986 	    so->last_upid != mp_so->last_upid) {
1987 		so->last_upid = mp_so->last_upid;
1988 		so->last_pid = mp_so->last_pid;
1989 		uuid_copy(so->last_uuid, mp_so->last_uuid);
1990 	}
1991 	so_update_policy(so);
1992 }
1993 
/*
 * MPTCP subflow socket receive routine, derived from soreceive().
 *
 * Drains the subflow's receive buffer and hands complete, DSS-mapped
 * records up to the MPTCP layer via *mp0.  The caller must pass a non-NULL
 * mp0 and a NULL controlp; uio is unused.  Returns 0 on success,
 * EWOULDBLOCK when a mapping is incomplete and we must wait for more data,
 * EIO when the subflow has to be reset, or another errno.
 */
static int
mptcp_subflow_soreceive(struct socket *so, struct sockaddr **psa,
    struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{
#pragma unused(uio)
	struct socket *mp_so;
	struct mptses *mpte;
	struct mptcb *mp_tp;
	int flags, error = 0;
	struct mbuf *m, **mp = mp0;
	struct tcpcb *tp = sototcpcb(so);

	mpte = tptomptp(sototcpcb(so))->mpt_mpte;
	mp_so = mptetoso(mpte);
	mp_tp = mpte->mpte_mptcb;

	VERIFY(so->so_proto->pr_flags & PR_CONNREQUIRED);

#ifdef MORE_LOCKING_DEBUG
	if (so->so_usecount == 1) {
		panic("%s: so=%x no other reference on socket", __func__, so);
		/* NOTREACHED */
	}
#endif
	/*
	 * We return all that is there in the subflow's socket receive buffer
	 * to the MPTCP layer, so we require that the caller passes in the
	 * expected parameters.
	 */
	if (mp == NULL || controlp != NULL) {
		return EINVAL;
	}

	*mp = NULL;
	if (psa != NULL) {
		*psa = NULL;
	}
	if (flagsp != NULL) {
		flags = *flagsp & ~MSG_EOR;
	} else {
		flags = 0;
	}

	/* None of the blocking/peeking soreceive() modes are supported here */
	if (flags & (MSG_PEEK | MSG_OOB | MSG_NEEDSA | MSG_WAITALL | MSG_WAITSTREAM)) {
		return EOPNOTSUPP;
	}

	/* This routine never blocks (see EWOULDBLOCK handling below) */
	flags |= (MSG_DONTWAIT | MSG_NBIO);

	/*
	 * If a recv attempt is made on a previously-accepted socket
	 * that has been marked as inactive (disconnected), reject
	 * the request.
	 */
	if (so->so_flags & SOF_DEFUNCT) {
		struct sockbuf *sb = &so->so_rcv;

		error = ENOTCONN;
		/*
		 * This socket should have been disconnected and flushed
		 * prior to being returned from sodefunct(); there should
		 * be no data on its receive list, so panic otherwise.
		 */
		if (so->so_state & SS_DEFUNCT) {
			sb_empty_assert(sb, __func__);
		}
		return error;
	}

	/*
	 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
	 * and if so just return to the caller.  This could happen when
	 * soreceive() is called by a socket upcall function during the
	 * time the socket is freed.  The socket buffer would have been
	 * locked across the upcall, therefore we cannot put this thread
	 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
	 * we may livelock), because the lock on the socket buffer will
	 * only be released when the upcall routine returns to its caller.
	 * Because the socket has been officially closed, there can be
	 * no further read on it.
	 *
	 * A multipath subflow socket would have its SS_NOFDREF set by
	 * default, so check for SOF_MP_SUBFLOW socket flag; when the
	 * socket is closed for real, SOF_MP_SUBFLOW would be cleared.
	 */
	if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
	    (SS_NOFDREF | SS_CANTRCVMORE) && !(so->so_flags & SOF_MP_SUBFLOW)) {
		return 0;
	}

	/*
	 * For consistency with soreceive() semantics, we need to obey
	 * SB_LOCK in case some other code path has locked the buffer.
	 */
	error = sblock(&so->so_rcv, 0);
	if (error != 0) {
		return error;
	}

	m = so->so_rcv.sb_mb;
	if (m == NULL) {
		/*
		 * Panic if we notice inconsistencies in the socket's
		 * receive list; both sb_mb and sb_cc should correctly
		 * reflect the contents of the list, otherwise we may
		 * end up with false positives during select() or poll()
		 * which could put the application in a bad state.
		 */
		SB_MB_CHECK(&so->so_rcv);

		if (so->so_error != 0) {
			error = so->so_error;
			so->so_error = 0;
			goto release;
		}

		if (so->so_state & SS_CANTRCVMORE) {
			goto release;
		}

		if (!(so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING))) {
			error = ENOTCONN;
			goto release;
		}

		/*
		 * MSG_DONTWAIT is implicitly defined and this routine will
		 * never block, so return EWOULDBLOCK when there is nothing.
		 */
		error = EWOULDBLOCK;
		goto release;
	}

	mptcp_update_last_owner(so, mp_so);

	SBLASTRECORDCHK(&so->so_rcv, "mptcp_subflow_soreceive 1");
	SBLASTMBUFCHK(&so->so_rcv, "mptcp_subflow_soreceive 1");

	/* Main drain loop: one iteration per DSS mapping (or per mbuf in fallback) */
	while (m != NULL) {
		int dlen = 0, error_out = 0, off = 0;
		uint8_t dfin = 0;
		struct mbuf *start = m;
		uint64_t dsn;
		uint32_t sseq;
		uint16_t orig_dlen;
		uint16_t csum;

		VERIFY(m->m_nextpkt == NULL);

		if (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) {
fallback:
			/* Just move mbuf to MPTCP-level */

			sbfree(&so->so_rcv, m);

			if (mp != NULL) {
				*mp = m;
				mp = &m->m_next;
				so->so_rcv.sb_mb = m = m->m_next;
				*mp = NULL;
			}

			if (m != NULL) {
				so->so_rcv.sb_lastrecord = m;
			} else {
				SB_EMPTY_FIXUP(&so->so_rcv);
			}

			continue;
		} else if (!(m->m_flags & M_PKTHDR) || !(m->m_pkthdr.pkt_flags & PKTF_MPTCP)) {
			struct mptsub *mpts = sototcpcb(so)->t_mpsub;
			boolean_t found_mapping = false;
			int parsed_length = 0;
			struct mbuf *m_iter;

			/*
			 * No MPTCP-option in the header. Either fallback or
			 * wait for additional mappings.
			 */
			if (!(mpts->mpts_flags & MPTSF_FULLY_ESTABLISHED)) {
				/* data arrived without a DSS option mapping */

				/* initial subflow can fallback right after SYN handshake */
				if (mpts->mpts_flags & MPTSF_INITIAL_SUB) {
					mptcp_notify_mpfail(so);

					goto fallback;
				} else {
					os_log_error(mptcp_log_handle, "%s - %lx: No DSS on secondary subflow. Killing %d\n",
					    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
					    mpts->mpts_connid);
					soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);

					error = EIO;
					*mp0 = NULL;
					goto release;
				}
			}

			/* Thus, let's look for an mbuf with the mapping */
			m_iter = m->m_next;
			parsed_length = m->m_len;
			while (m_iter != NULL && parsed_length < UINT16_MAX) {
				if (!(m_iter->m_flags & M_PKTHDR) || !(m_iter->m_pkthdr.pkt_flags & PKTF_MPTCP)) {
					parsed_length += m_iter->m_len;
					m_iter = m_iter->m_next;
					continue;
				}

				found_mapping = true;

				/* Found an mbuf with a DSS-mapping */
				orig_dlen = dlen = m_iter->m_pkthdr.mp_rlen;
				dsn = m_iter->m_pkthdr.mp_dsn;
				sseq = m_iter->m_pkthdr.mp_rseq;
				csum = m_iter->m_pkthdr.mp_csum;

				/* dlen excludes the DATA_FIN octet below */
				if (m_iter->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN) {
					dfin = 1;
					dlen--;
				}

				break;
			}

			if (!found_mapping && parsed_length < UINT16_MAX) {
				/* Mapping not yet present, we can wait! */
				if (*mp0 == NULL) {
					error = EWOULDBLOCK;
				}
				goto release;
			} else if (!found_mapping && parsed_length >= UINT16_MAX) {
				os_log_error(mptcp_log_handle, "%s - %lx: Received more than 64KB without DSS mapping. Killing %d\n",
				    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
				    mpts->mpts_connid);
				/* Received 64KB without DSS-mapping. We should kill the subflow */
				soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);

				error = EIO;
				*mp0 = NULL;
				goto release;
			}
		} else {
			/* The head mbuf itself carries the DSS mapping */
			orig_dlen = dlen = m->m_pkthdr.mp_rlen;
			dsn = m->m_pkthdr.mp_dsn;
			sseq = m->m_pkthdr.mp_rseq;
			csum = m->m_pkthdr.mp_csum;

			if (m->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN) {
				dfin = 1;
				dlen--;
			}
		}

		/* Now, see if we need to remove previous packets */
		if (SEQ_GT(sseq + tp->irs, tp->rcv_nxt - so->so_rcv.sb_cc)) {
			/* Ok, there is data in there that we don't need - let's throw it away! */
			int totrim = (int)sseq + tp->irs - (tp->rcv_nxt - so->so_rcv.sb_cc);

			sbdrop(&so->so_rcv, totrim);

			m = so->so_rcv.sb_mb;
		}

		/*
		 * Check if the full mapping is now present
		 */
		if ((int)so->so_rcv.sb_cc < dlen) {
			if (*mp0 == NULL) {
				error = EWOULDBLOCK;
			}
			goto release;
		}

		/* Now, get the full mapping */
		off = 0;
		while (dlen > 0) {
			/* Stamp each mbuf with its slice of the mapping before handing it up */
			if (mptcp_adj_rmap(so, m, off, dsn, sseq, orig_dlen, dfin)) {
				error_out = 1;
				error = EIO;
				dlen = 0;
				*mp0 = NULL;
				break;
			}

			dlen -= m->m_len;
			off += m->m_len;
			sbfree(&so->so_rcv, m);

			if (mp != NULL) {
				*mp = m;
				mp = &m->m_next;
				so->so_rcv.sb_mb = m = m->m_next;
				*mp = NULL;
			}

			ASSERT(dlen == 0 || m);
			if (dlen != 0 && m == NULL) {
				/* "try" to gracefully recover on customer builds */
				error_out = 1;
				error = EIO;
				dlen  = 0;

				*mp0 = NULL;

				SB_EMPTY_FIXUP(&so->so_rcv);
				soevent(so, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);

				break;
			}
		}

		VERIFY(dlen == 0);

		if (m != NULL) {
			so->so_rcv.sb_lastrecord = m;
		} else {
			SB_EMPTY_FIXUP(&so->so_rcv);
		}

		if (error_out) {
			goto release;
		}

		/* Verify the DSS checksum over the whole mapping just consumed */
		if (mptcp_validate_csum(sototcpcb(so), start, dsn, sseq, orig_dlen, csum, dfin)) {
			error = EIO;
			*mp0 = NULL;
			goto release;
		}

		SBLASTRECORDCHK(&so->so_rcv, "mptcp_subflow_soreceive 2");
		SBLASTMBUFCHK(&so->so_rcv, "mptcp_subflow_soreceive 2");
	}

	DTRACE_MPTCP3(subflow__receive, struct socket *, so,
	    struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd);

	if (flagsp != NULL) {
		*flagsp |= flags;
	}

release:
	sbunlock(&so->so_rcv, TRUE);

	return error;
}
2343 
/*
 * MPTCP subflow socket send routine, derived from sosend().
 *
 * Pushes one pre-built, DSS-stamped mbuf chain (top) down to the subflow's
 * TCP.  addr/uio/control/flags must be NULL/0; top's length must be in
 * (0, UINT16_MAX].  Consumes top on both success and failure.  Returns 0
 * or an errno from sosendcheck()/pru_send.
 */
static int
mptcp_subflow_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags)
{
	struct socket *mp_so = mptetoso(tptomptp(sototcpcb(so))->mpt_mpte);
	boolean_t en_tracing = FALSE, proc_held = FALSE;
	struct proc *p = current_proc();
	int en_tracing_val;
	int sblocked = 1; /* Pretend as if it is already locked, so we won't relock it */
	int error;

	/* This path only supports sending a pre-built MPTCP-mapped chain */
	VERIFY(control == NULL);
	VERIFY(addr == NULL);
	VERIFY(uio == NULL);
	VERIFY(flags == 0);
	VERIFY((so->so_flags & SOF_CONTENT_FILTER) == 0);

	VERIFY(top->m_pkthdr.len > 0 && top->m_pkthdr.len <= UINT16_MAX);
	VERIFY(top->m_pkthdr.pkt_flags & PKTF_MPTCP);

	/*
	 * trace if tracing & network (vs. unix) sockets & and
	 * non-loopback
	 */
	if (ENTR_SHOULDTRACE &&
	    (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
		struct inpcb *inp = sotoinpcb(so);
		if (inp->inp_last_outifp != NULL &&
		    !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
			en_tracing = TRUE;
			en_tracing_val = top->m_pkthdr.len;
			KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_START,
			    (unsigned long)VM_KERNEL_ADDRPERM(so),
			    ((so->so_state & SS_NBIO) ? kEnTrFlagNonBlocking : 0),
			    (int64_t)en_tracing_val);
		}
	}

	mptcp_update_last_owner(so, mp_so);

	/*
	 * Charge the send to the process owning the MPTCP socket; take a
	 * proc reference if it is not the current process.
	 */
	if (mp_so->last_pid != proc_pid(p)) {
		p = proc_find(mp_so->last_pid);
		if (p == PROC_NULL) {
			p = current_proc();
		} else {
			proc_held = TRUE;
		}
	}

#if NECP
	inp_update_necp_policy(sotoinpcb(so), NULL, NULL, 0);
#endif /* NECP */

	error = sosendcheck(so, NULL, top->m_pkthdr.len, 0, 1, 0, &sblocked);
	if (error) {
		goto out;
	}

	/* pru_send takes ownership of top on success and failure alike */
	error = (*so->so_proto->pr_usrreqs->pru_send)(so, 0, top, NULL, NULL, p);
	top = NULL;

out:
	/* Only reached with top != NULL when sosendcheck() failed */
	if (top != NULL) {
		m_freem(top);
	}

	if (proc_held) {
		proc_rele(p);
	}

	soclearfastopen(so);

	if (en_tracing) {
		KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_END,
		    (unsigned long)VM_KERNEL_ADDRPERM(so),
		    ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
		    (int64_t)en_tracing_val);
	}

	return error;
}
2428 
2429 /*
2430  * Subflow socket write upcall.
2431  *
2432  * Called when the associated subflow socket posted a read event.
2433  */
2434 static void
mptcp_subflow_wupcall(struct socket * so,void * arg,int waitf)2435 mptcp_subflow_wupcall(struct socket *so, void *arg, int waitf)
2436 {
2437 #pragma unused(so, waitf)
2438 	struct mptsub *mpts = arg;
2439 	struct mptses *mpte = mpts->mpts_mpte;
2440 
2441 	VERIFY(mpte != NULL);
2442 
2443 	if (mptcp_should_defer_upcall(mpte->mpte_mppcb)) {
2444 		if (!(mpte->mpte_mppcb->mpp_flags & MPP_WUPCALL)) {
2445 			mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_WWAKEUP;
2446 		}
2447 		return;
2448 	}
2449 
2450 	mptcp_output(mpte);
2451 }
2452 
2453 /*
2454  * Subflow socket control event upcall.
2455  */
2456 static void
mptcp_subflow_eupcall1(struct socket * so,void * arg,uint32_t events)2457 mptcp_subflow_eupcall1(struct socket *so, void *arg, uint32_t events)
2458 {
2459 #pragma unused(so)
2460 	struct mptsub *mpts = arg;
2461 	struct mptses *mpte = mpts->mpts_mpte;
2462 
2463 	socket_lock_assert_owned(mptetoso(mpte));
2464 
2465 	if ((mpts->mpts_evctl & events) == events) {
2466 		return;
2467 	}
2468 
2469 	mpts->mpts_evctl |= events;
2470 
2471 	if (mptcp_should_defer_upcall(mpte->mpte_mppcb)) {
2472 		mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_WORKLOOP;
2473 		return;
2474 	}
2475 
2476 	mptcp_subflow_workloop(mpte);
2477 }
2478 
/*
 * Establish an initial MPTCP connection (if first subflow and not yet
 * connected), or add a subflow to an existing MPTCP connection.
 *
 * mpte    - MPTCP session (mp_so lock must be held by the caller)
 * src     - optional local address; validated and copied if non-NULL
 * dst     - required destination address (AF_INET or AF_INET6)
 * ifscope - interface scope for the subflow (0 = unscoped)
 * pcid    - optional out-param receiving the new subflow's connection id
 *
 * Returns 0 on success or an errno.  Ownership note: once
 * mptcp_subflow_socreate() has been called, it owns mpts; before that,
 * failure paths free mpts here.
 */
int
mptcp_subflow_add(struct mptses *mpte, struct sockaddr *src,
    struct sockaddr *dst, uint32_t ifscope, sae_connid_t *pcid)
{
	struct socket *mp_so, *so = NULL;
	struct mptcb *mp_tp;
	struct mptsub *mpts = NULL;
	int af, error = 0;

	mp_so = mptetoso(mpte);
	mp_tp = mpte->mpte_mptcb;

	socket_lock_assert_owned(mp_so);

	if (mp_tp->mpt_state >= MPTCPS_CLOSE_WAIT) {
		/* If the remote end sends Data FIN, refuse subflow adds */
		os_log_error(mptcp_log_handle, "%s - %lx: state %u\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mp_tp->mpt_state);
		error = ENOTCONN;
		goto out_err;
	}

	if (mpte->mpte_numflows > MPTCP_MAX_NUM_SUBFLOWS) {
		error = EOVERFLOW;
		goto out_err;
	}

	mpts = mptcp_subflow_alloc();
	if (mpts == NULL) {
		os_log_error(mptcp_log_handle, "%s - %lx: malloc subflow failed\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
		error = ENOMEM;
		goto out_err;
	}

	/* Validate and stash the optional source address */
	if (src) {
		if (src->sa_family != AF_INET && src->sa_family != AF_INET6) {
			error = EAFNOSUPPORT;
			goto out_err;
		}

		if (src->sa_family == AF_INET &&
		    src->sa_len != sizeof(struct sockaddr_in)) {
			error = EINVAL;
			goto out_err;
		}

		if (src->sa_family == AF_INET6 &&
		    src->sa_len != sizeof(struct sockaddr_in6)) {
			error = EINVAL;
			goto out_err;
		}

		mpts->mpts_src = (struct sockaddr *)alloc_sockaddr(src->sa_len,
		    Z_WAITOK | Z_NOFAIL);

		bcopy(src, mpts->mpts_src, src->sa_len);
	}

	/* Validate and stash the (mandatory) destination address */
	if (dst->sa_family != AF_INET && dst->sa_family != AF_INET6) {
		error = EAFNOSUPPORT;
		goto out_err;
	}

	if (dst->sa_family == AF_INET &&
	    dst->sa_len != sizeof(mpts->__mpts_dst_v4)) {
		error = EINVAL;
		goto out_err;
	}

	if (dst->sa_family == AF_INET6 &&
	    dst->sa_len != sizeof(mpts->__mpts_dst_v6)) {
		error = EINVAL;
		goto out_err;
	}

	memcpy(&mpts->mpts_u_dst, dst, dst->sa_len);

	af = mpts->mpts_dst.sa_family;

	/* Reject an out-of-range interface scope */
	ifnet_head_lock_shared();
	if ((ifscope > (unsigned)if_index)) {
		ifnet_head_done();
		error = ENXIO;
		goto out_err;
	}
	ifnet_head_done();

	mpts->mpts_ifscope = ifscope;

	/* create the subflow socket */
	if ((error = mptcp_subflow_socreate(mpte, mpts, af, &so)) != 0) {
		/*
		 * Returning (error) and not cleaning up, because up to here
		 * all we did is creating mpts.
		 *
		 * And the contract is that the call to mptcp_subflow_socreate,
		 * moves ownership of mpts to mptcp_subflow_socreate.
		 */
		return error;
	}

	/*
	 * We may be called from within the kernel. Still need to account this
	 * one to the real app.
	 */
	mptcp_update_last_owner(mpts->mpts_socket, mp_so);

	/*
	 * Increment the counter, while avoiding 0 (SAE_CONNID_ANY) and
	 * -1 (SAE_CONNID_ALL).
	 */
	mpte->mpte_connid_last++;
	if (mpte->mpte_connid_last == SAE_CONNID_ALL ||
	    mpte->mpte_connid_last == SAE_CONNID_ANY) {
		mpte->mpte_connid_last++;
	}

	mpts->mpts_connid = mpte->mpte_connid_last;

	mpts->mpts_rel_seq = 1;

	/* Allocate a unique address id per subflow */
	mpte->mpte_addrid_last++;
	if (mpte->mpte_addrid_last == 0) {
		mpte->mpte_addrid_last++;
	}

	/* register for subflow socket read/write events */
	sock_setupcalls_locked(so, NULL, NULL, mptcp_subflow_wupcall, mpts, 1);

	/* Register for subflow socket control events */
	sock_catchevents_locked(so, mptcp_subflow_eupcall1, mpts,
	    SO_FILT_HINT_CONNRESET | SO_FILT_HINT_CANTRCVMORE |
	    SO_FILT_HINT_TIMEOUT | SO_FILT_HINT_NOSRCADDR |
	    SO_FILT_HINT_IFDENIED | SO_FILT_HINT_CONNECTED |
	    SO_FILT_HINT_DISCONNECTED | SO_FILT_HINT_MPFAILOVER |
	    SO_FILT_HINT_MPSTATUS | SO_FILT_HINT_MUSTRST |
	    SO_FILT_HINT_MPCANTRCVMORE | SO_FILT_HINT_ADAPTIVE_RTIMO |
	    SO_FILT_HINT_ADAPTIVE_WTIMO | SO_FILT_HINT_MP_SUB_ERROR);

	/* sanity check */
	VERIFY(!(mpts->mpts_flags &
	    (MPTSF_CONNECTING | MPTSF_CONNECTED | MPTSF_CONNECT_PENDING)));

	/*
	 * Indicate to the TCP subflow whether or not it should establish
	 * the initial MPTCP connection, or join an existing one.  Fill
	 * in the connection request structure with additional info needed
	 * by the underlying TCP (to be used in the TCP options, etc.)
	 */
	if (mp_tp->mpt_state < MPTCPS_ESTABLISHED && mpte->mpte_numflows == 1) {
		mpts->mpts_flags |= MPTSF_INITIAL_SUB;

		if (mp_tp->mpt_state == MPTCPS_CLOSED) {
			mptcp_init_local_parms(mpte, dst);
		}
		soisconnecting(mp_so);

		/* If fastopen is requested, set state in mpts */
		if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
			mpts->mpts_flags |= MPTSF_TFO_REQD;
		}
	} else {
		/* Joins must wait until the peer is known to accept them */
		if (!(mp_tp->mpt_flags & MPTCPF_JOIN_READY)) {
			mpts->mpts_flags |= MPTSF_CONNECT_PENDING;
		}
	}

	mpts->mpts_flags |= MPTSF_CONNECTING;

	/* connect right away if first attempt, or if join can be done now */
	if (!(mpts->mpts_flags & MPTSF_CONNECT_PENDING)) {
		error = mptcp_subflow_soconnectx(mpte, mpts);
	}

	if (error) {
		goto out_err_close;
	}

	if (pcid) {
		*pcid = mpts->mpts_connid;
	}

	return 0;

out_err_close:
	/* Subflow socket exists at this point; abort tears it down */
	mptcp_subflow_abort(mpts, error);

	return error;

out_err:
	/* Pre-socreate failure: only mpts itself needs freeing */
	if (mpts) {
		mptcp_subflow_free(mpts);
	}

	return error;
}
2681 
2682 void
mptcpstats_update(struct mptcp_itf_stats * stats,const struct mptsub * mpts)2683 mptcpstats_update(struct mptcp_itf_stats *stats, const struct mptsub *mpts)
2684 {
2685 	int index = mptcpstats_get_index(stats, mpts);
2686 
2687 	if (index != -1) {
2688 		struct inpcb *inp = sotoinpcb(mpts->mpts_socket);
2689 
2690 		stats[index].mpis_txbytes += inp->inp_stat->txbytes;
2691 		stats[index].mpis_rxbytes += inp->inp_stat->rxbytes;
2692 
2693 		stats[index].mpis_wifi_txbytes += inp->inp_wstat->txbytes;
2694 		stats[index].mpis_wifi_rxbytes += inp->inp_wstat->rxbytes;
2695 
2696 		stats[index].mpis_wired_txbytes += inp->inp_Wstat->txbytes;
2697 		stats[index].mpis_wired_rxbytes += inp->inp_Wstat->rxbytes;
2698 
2699 		stats[index].mpis_cell_txbytes += inp->inp_cstat->txbytes;
2700 		stats[index].mpis_cell_rxbytes += inp->inp_cstat->rxbytes;
2701 	}
2702 }
2703 
/*
 * Delete/remove a subflow from an MPTCP.  The underlying subflow socket
 * will no longer be accessible after a subflow is deleted, thus this
 * should occur only after the subflow socket has been disconnected.
 */
void
mptcp_subflow_del(struct mptses *mpte, struct mptsub *mpts)
{
	struct socket *mp_so = mptetoso(mpte);
	struct socket *so = mpts->mpts_socket;
	struct tcpcb *tp = sototcpcb(so);

	socket_lock_assert_owned(mp_so);
	VERIFY(mpts->mpts_mpte == mpte);
	VERIFY(mpte->mpte_numflows != 0);
	VERIFY(mp_so->so_usecount > 0);

	/* Fold this subflow's traffic counters into the per-interface stats */
	mptcpstats_update(mpte->mpte_itfstats, mpts);

	mptcp_unset_cellicon(mpte, mpts, 1);

	/*
	 * Snapshot the subflow's byte counters on the session.
	 * NOTE(review): presumably used as a baseline for later accounting
	 * of re-created subflows — confirm against the readers of
	 * mpte_init_rxbytes/mpte_init_txbytes.
	 */
	mpte->mpte_init_rxbytes = sotoinpcb(so)->inp_stat->rxbytes;
	mpte->mpte_init_txbytes = sotoinpcb(so)->inp_stat->txbytes;

	/* Unlink from the session's subflow list */
	TAILQ_REMOVE(&mpte->mpte_subflows, mpts, mpts_entry);
	mpte->mpte_numflows--;
	if (mpte->mpte_active_sub == mpts) {
		mpte->mpte_active_sub = NULL;
	}

	/*
	 * Drop references held by this subflow socket; there
	 * will be no further upcalls made from this point.
	 */
	sock_setupcalls_locked(so, NULL, NULL, NULL, NULL, 0);
	sock_catchevents_locked(so, NULL, NULL, 0);

	mptcp_detach_mptcb_from_subf(mpte->mpte_mptcb, so);

	mp_so->so_usecount--;           /* for subflow socket */
	mpts->mpts_mpte = NULL;
	mpts->mpts_socket = NULL;

	mptcp_subflow_remref(mpts);             /* for MPTCP subflow list */
	mptcp_subflow_remref(mpts);             /* for subflow socket */

	/* Sever the TCP-side back-pointers last */
	so->so_flags &= ~SOF_MP_SUBFLOW;
	tp->t_mptcb = NULL;
	tp->t_mpsub = NULL;
}
2754 
2755 void
mptcp_subflow_shutdown(struct mptses * mpte,struct mptsub * mpts)2756 mptcp_subflow_shutdown(struct mptses *mpte, struct mptsub *mpts)
2757 {
2758 	struct socket *so = mpts->mpts_socket;
2759 	struct mptcb *mp_tp = mpte->mpte_mptcb;
2760 	int send_dfin = 0;
2761 
2762 	if (mp_tp->mpt_state > MPTCPS_CLOSE_WAIT) {
2763 		send_dfin = 1;
2764 	}
2765 
2766 	if (!(so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) &&
2767 	    (so->so_state & SS_ISCONNECTED)) {
2768 		if (send_dfin) {
2769 			mptcp_send_dfin(so);
2770 		}
2771 		soshutdownlock(so, SHUT_WR);
2772 	}
2773 }
2774 
2775 static void
mptcp_subflow_abort(struct mptsub * mpts,int error)2776 mptcp_subflow_abort(struct mptsub *mpts, int error)
2777 {
2778 	struct socket *so = mpts->mpts_socket;
2779 	struct tcpcb *tp = sototcpcb(so);
2780 
2781 	if (mpts->mpts_flags & MPTSF_DISCONNECTED) {
2782 		return;
2783 	}
2784 
2785 	if (tp->t_state != TCPS_CLOSED) {
2786 		tcp_drop(tp, error);
2787 	}
2788 
2789 	mptcp_subflow_eupcall1(so, mpts, SO_FILT_HINT_DISCONNECTED);
2790 }
2791 
/*
 * Disconnect a subflow socket.
 *
 * Marks the subflow as disconnecting, defuncts it if the meta-socket is
 * defunct, otherwise performs an orderly shutdown (with DATA_FIN when the
 * MPTCP connection is past CLOSE_WAIT), and always posts a disconnect
 * event so the subflow gets reaped.
 */
void
mptcp_subflow_disconnect(struct mptses *mpte, struct mptsub *mpts)
{
	struct socket *so, *mp_so;
	struct mptcb *mp_tp;
	int send_dfin = 0;

	so = mpts->mpts_socket;
	mp_tp = mpte->mpte_mptcb;
	mp_so = mptetoso(mpte);

	socket_lock_assert_owned(mp_so);

	/* Already on its way out; nothing to do */
	if (mpts->mpts_flags & (MPTSF_DISCONNECTING | MPTSF_DISCONNECTED)) {
		return;
	}

	mptcp_unset_cellicon(mpte, mpts, 1);

	mpts->mpts_flags |= MPTSF_DISCONNECTING;

	/* Past CLOSE_WAIT the MPTCP connection itself is closing */
	if (mp_tp->mpt_state > MPTCPS_CLOSE_WAIT) {
		send_dfin = 1;
	}

	/* If the meta-socket went defunct, take the subflow down with it */
	if (mp_so->so_flags & SOF_DEFUNCT) {
		errno_t ret;

		ret = sosetdefunct(NULL, so, SHUTDOWN_SOCKET_LEVEL_DISCONNECT_ALL, TRUE);
		if (ret == 0) {
			ret = sodefunct(NULL, so, SHUTDOWN_SOCKET_LEVEL_DISCONNECT_ALL);

			if (ret != 0) {
				os_log_error(mptcp_log_handle, "%s - %lx: sodefunct failed with %d\n",
				    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), ret);
			}
		} else {
			os_log_error(mptcp_log_handle, "%s - %lx: sosetdefunct failed with %d\n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), ret);
		}
	}

	/* Orderly shutdown for a connected, not-yet-disconnecting subflow */
	if (!(so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) &&
	    (so->so_state & SS_ISCONNECTED)) {
		if (send_dfin) {
			mptcp_send_dfin(so);
		}

		(void) soshutdownlock(so, SHUT_RD);
		(void) soshutdownlock(so, SHUT_WR);
		(void) sodisconnectlocked(so);
	}

	/*
	 * Generate a disconnect event for this subflow socket, in case
	 * the lower layer doesn't do it; this is needed because the
	 * subflow socket deletion relies on it.
	 */
	mptcp_subflow_eupcall1(so, mpts, SO_FILT_HINT_DISCONNECTED);
}
2855 
/*
 * Subflow socket input.
 *
 * Pulls whatever complete mappings the subflow's receive path yields
 * (via sock_receive_internal, which ends up in mptcp_subflow_soreceive)
 * and feeds them to mptcp_input().  Also maintains the cellular-icon
 * state depending on which interface delivered the data.
 */
static void
mptcp_subflow_input(struct mptses *mpte, struct mptsub *mpts)
{
	struct socket *mp_so = mptetoso(mpte);
	struct mbuf *m = NULL;
	struct socket *so;
	int error, wakeup = 0;

	/* Guard against re-entry; cleared via the deferred-upcall handler below */
	VERIFY(!(mpte->mpte_mppcb->mpp_flags & MPP_INSIDE_INPUT));
	mpte->mpte_mppcb->mpp_flags |= MPP_INSIDE_INPUT;

	DTRACE_MPTCP2(subflow__input, struct mptses *, mpte,
	    struct mptsub *, mpts);

	if (!(mpts->mpts_flags & MPTSF_CONNECTED)) {
		goto out;
	}

	so = mpts->mpts_socket;

	error = sock_receive_internal(so, NULL, &m, 0, NULL);
	if (error != 0 && error != EWOULDBLOCK) {
		os_log_error(mptcp_log_handle, "%s - %lx: cid %d error %d\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpts->mpts_connid, error);
		if (error == ENODATA) {
			/*
			 * Don't ignore ENODATA so as to discover
			 * nasty middleboxes.
			 */
			mp_so->so_error = ENODATA;

			wakeup = 1;
			goto out;
		}
	}

	/* In fallback, make sure to accept data on all but one subflow */
	if (m && (mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
	    !(mpts->mpts_flags & MPTSF_ACTIVE)) {
		m_freem(m);
		goto out;
	}

	if (m != NULL) {
		/* Track which interface type is actually carrying traffic */
		if (IFNET_IS_CELLULAR(sotoinpcb(so)->inp_last_outifp)) {
			mptcp_set_cellicon(mpte, mpts);

			mpte->mpte_used_cell = 1;
		} else {
			/*
			 * If during the past MPTCP_CELLICON_TOGGLE_RATE seconds we didn't
			 * explicitly set the cellicon, then we unset it again.
			 */
			if (TSTMP_LT(mpte->mpte_last_cellicon_set + MPTCP_CELLICON_TOGGLE_RATE, tcp_now)) {
				mptcp_unset_cellicon(mpte, NULL, 1);
			}

			mpte->mpte_used_wifi = 1;
		}

		mptcp_input(mpte, m);
	}

out:
	if (wakeup) {
		mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_RWAKEUP;
	}

	/* Clears MPP_INSIDE_INPUT and runs any upcalls deferred meanwhile */
	mptcp_handle_deferred_upcalls(mpte->mpte_mppcb, MPP_INSIDE_INPUT);
}
2929 
/*
 * Entry point for subflow read events: runs mptcp_subflow_input() over all
 * subflows of the MPTCP session owning @so, or defers the work if upcalls
 * are currently being held back. Called with the MPTCP socket lock held.
 */
void
mptcp_handle_input(struct socket *so)
{
	struct mptsub *mpts, *tmpts;
	struct mptses *mpte;

	/* Only MPTCP subflow sockets carry MPTCP-level input */
	if (!(so->so_flags & SOF_MP_SUBFLOW)) {
		return;
	}

	mpts = sototcpcb(so)->t_mpsub;
	mpte = mpts->mpts_mpte;

	socket_lock_assert_owned(mptetoso(mpte));

	/*
	 * If upcalls are deferred, just record that a read-wakeup is pending
	 * (unless input handling is already running further up the stack)
	 * and let the deferred-upcall handler take care of it.
	 */
	if (mptcp_should_defer_upcall(mpte->mpte_mppcb)) {
		if (!(mpte->mpte_mppcb->mpp_flags & MPP_INPUT_HANDLE)) {
			mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_RWAKEUP;
		}
		return;
	}

	mpte->mpte_mppcb->mpp_flags |= MPP_INPUT_HANDLE;
	TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
		if (mpts->mpts_socket->so_usecount == 0) {
			/* Will be removed soon by tcp_garbage_collect */
			continue;
		}

		/* Hold both the subflow and its socket across the input call */
		mptcp_subflow_addref(mpts);
		mpts->mpts_socket->so_usecount++;

		mptcp_subflow_input(mpte, mpts);

		mptcp_subflow_remref(mpts);             /* ours */

		VERIFY(mpts->mpts_socket->so_usecount != 0);
		mpts->mpts_socket->so_usecount--;
	}

	mptcp_handle_deferred_upcalls(mpte->mpte_mppcb, MPP_INPUT_HANDLE);
}
2972 
2973 static boolean_t
mptcp_search_seq_in_sub(struct mbuf * m,struct socket * so)2974 mptcp_search_seq_in_sub(struct mbuf *m, struct socket *so)
2975 {
2976 	struct mbuf *so_m = so->so_snd.sb_mb;
2977 	uint64_t dsn = m->m_pkthdr.mp_dsn;
2978 
2979 	while (so_m) {
2980 		VERIFY(so_m->m_flags & M_PKTHDR);
2981 		VERIFY(so_m->m_pkthdr.pkt_flags & PKTF_MPTCP);
2982 
2983 		/* Part of the segment is covered, don't reinject here */
2984 		if (so_m->m_pkthdr.mp_dsn <= dsn &&
2985 		    so_m->m_pkthdr.mp_dsn + so_m->m_pkthdr.mp_rlen > dsn) {
2986 			return TRUE;
2987 		}
2988 
2989 		so_m = so_m->m_next;
2990 	}
2991 
2992 	return FALSE;
2993 }
2994 
2995 /*
2996  * Subflow socket output.
2997  *
2998  * Called for sending data from MPTCP to the underlying subflow socket.
2999  */
3000 int
mptcp_subflow_output(struct mptses * mpte,struct mptsub * mpts,int flags)3001 mptcp_subflow_output(struct mptses *mpte, struct mptsub *mpts, int flags)
3002 {
3003 	struct mptcb *mp_tp = mpte->mpte_mptcb;
3004 	struct mbuf *sb_mb, *m, *mpt_mbuf = NULL, *head = NULL, *tail = NULL;
3005 	struct socket *mp_so, *so;
3006 	struct tcpcb *tp;
3007 	uint64_t mpt_dsn = 0, off = 0;
3008 	int sb_cc = 0, error = 0, wakeup = 0;
3009 	uint16_t dss_csum;
3010 	uint16_t tot_sent = 0;
3011 	boolean_t reinjected = FALSE;
3012 
3013 	mp_so = mptetoso(mpte);
3014 	so = mpts->mpts_socket;
3015 	tp = sototcpcb(so);
3016 
3017 	socket_lock_assert_owned(mp_so);
3018 
3019 	VERIFY(!(mpte->mpte_mppcb->mpp_flags & MPP_INSIDE_OUTPUT));
3020 	mpte->mpte_mppcb->mpp_flags |= MPP_INSIDE_OUTPUT;
3021 
3022 	VERIFY(!INP_WAIT_FOR_IF_FEEDBACK(sotoinpcb(so)));
3023 	VERIFY((mpts->mpts_flags & MPTSF_MP_CAPABLE) ||
3024 	    (mpts->mpts_flags & MPTSF_MP_DEGRADED) ||
3025 	    (mpts->mpts_flags & MPTSF_TFO_REQD));
3026 	VERIFY(mptcp_subflow_cwnd_space(mpts->mpts_socket) > 0);
3027 
3028 	DTRACE_MPTCP2(subflow__output, struct mptses *, mpte,
3029 	    struct mptsub *, mpts);
3030 
3031 	/* Remove Addr Option is not sent reliably as per I-D */
3032 	if (mpte->mpte_flags & MPTE_SND_REM_ADDR) {
3033 		tp->t_rem_aid = mpte->mpte_lost_aid;
3034 		tp->t_mpflags |= TMPF_SND_REM_ADDR;
3035 		mpte->mpte_flags &= ~MPTE_SND_REM_ADDR;
3036 	}
3037 
3038 	/*
3039 	 * The mbuf chains containing the metadata (as well as pointing to
3040 	 * the user data sitting at the MPTCP output queue) would then be
3041 	 * sent down to the subflow socket.
3042 	 *
3043 	 * Some notes on data sequencing:
3044 	 *
3045 	 *   a. Each mbuf must be a M_PKTHDR.
3046 	 *   b. MPTCP metadata is stored in the mptcp_pktinfo structure
3047 	 *	in the mbuf pkthdr structure.
3048 	 *   c. Each mbuf containing the MPTCP metadata must have its
3049 	 *	pkt_flags marked with the PKTF_MPTCP flag.
3050 	 */
3051 
3052 	if (mpte->mpte_reinjectq) {
3053 		sb_mb = mpte->mpte_reinjectq;
3054 	} else {
3055 		sb_mb = mp_so->so_snd.sb_mb;
3056 	}
3057 
3058 	if (sb_mb == NULL) {
3059 		os_log_error(mptcp_log_handle, "%s - %lx: No data in MPTCP-sendbuffer! smax %u snxt %u suna %u state %u flags %#x\n",
3060 		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
3061 		    (uint32_t)mp_tp->mpt_sndmax, (uint32_t)mp_tp->mpt_sndnxt,
3062 		    (uint32_t)mp_tp->mpt_snduna, mp_tp->mpt_state, mp_so->so_flags1);
3063 
3064 		/* Fix it to prevent looping */
3065 		if (MPTCP_SEQ_LT(mp_tp->mpt_sndnxt, mp_tp->mpt_snduna)) {
3066 			mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;
3067 		}
3068 		goto out;
3069 	}
3070 
3071 	VERIFY(sb_mb->m_pkthdr.pkt_flags & PKTF_MPTCP);
3072 
3073 	if (sb_mb->m_pkthdr.mp_rlen == 0 &&
3074 	    !(so->so_state & SS_ISCONNECTED) &&
3075 	    (so->so_flags1 & SOF1_PRECONNECT_DATA)) {
3076 		tp->t_mpflags |= TMPF_TFO_REQUEST;
3077 
3078 		/* Opting to call pru_send as no mbuf at subflow level */
3079 		error = (*so->so_proto->pr_usrreqs->pru_send)(so, 0, NULL, NULL,
3080 		    NULL, current_proc());
3081 
3082 		goto done_sending;
3083 	}
3084 
3085 	mpt_dsn = sb_mb->m_pkthdr.mp_dsn;
3086 
3087 	/* First, drop acknowledged data */
3088 	if (MPTCP_SEQ_LT(mpt_dsn, mp_tp->mpt_snduna)) {
3089 		os_log_error(mptcp_log_handle, "%s - %lx: dropping data, should have been done earlier "
3090 		    "dsn %u suna %u reinject? %u\n",
3091 		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), (uint32_t)mpt_dsn,
3092 		    (uint32_t)mp_tp->mpt_snduna, !!mpte->mpte_reinjectq);
3093 		if (mpte->mpte_reinjectq) {
3094 			mptcp_clean_reinjectq(mpte);
3095 		} else {
3096 			uint64_t len = 0;
3097 			len = mp_tp->mpt_snduna - mpt_dsn;
3098 			sbdrop(&mp_so->so_snd, (int)len);
3099 			wakeup = 1;
3100 		}
3101 	}
3102 
3103 	/* Check again because of above sbdrop */
3104 	if (mp_so->so_snd.sb_mb == NULL && mpte->mpte_reinjectq == NULL) {
3105 		os_log_error(mptcp_log_handle, "%s - $%lx: send-buffer is empty\n",
3106 		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
3107 		goto out;
3108 	}
3109 
3110 	/*
3111 	 * In degraded mode, we don't receive data acks, so force free
3112 	 * mbufs less than snd_nxt
3113 	 */
3114 	if ((mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
3115 	    (mp_tp->mpt_flags & MPTCPF_POST_FALLBACK_SYNC) &&
3116 	    mp_so->so_snd.sb_mb) {
3117 		mpt_dsn = mp_so->so_snd.sb_mb->m_pkthdr.mp_dsn;
3118 		if (MPTCP_SEQ_LT(mpt_dsn, mp_tp->mpt_snduna)) {
3119 			uint64_t len = 0;
3120 			len = mp_tp->mpt_snduna - mpt_dsn;
3121 			sbdrop(&mp_so->so_snd, (int)len);
3122 			wakeup = 1;
3123 
3124 			os_log_error(mptcp_log_handle, "%s - %lx: dropping data in degraded mode, should have been done earlier dsn %u sndnxt %u suna %u\n",
3125 			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
3126 			    (uint32_t)mpt_dsn, (uint32_t)mp_tp->mpt_sndnxt, (uint32_t)mp_tp->mpt_snduna);
3127 		}
3128 	}
3129 
3130 	if ((mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
3131 	    !(mp_tp->mpt_flags & MPTCPF_POST_FALLBACK_SYNC)) {
3132 		mp_tp->mpt_flags |= MPTCPF_POST_FALLBACK_SYNC;
3133 		so->so_flags1 |= SOF1_POST_FALLBACK_SYNC;
3134 	}
3135 
3136 	/*
3137 	 * Adjust the top level notion of next byte used for retransmissions
3138 	 * and sending FINs.
3139 	 */
3140 	if (MPTCP_SEQ_LT(mp_tp->mpt_sndnxt, mp_tp->mpt_snduna)) {
3141 		mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;
3142 	}
3143 
3144 	/* Now determine the offset from which to start transmitting data */
3145 	if (mpte->mpte_reinjectq) {
3146 		sb_mb = mpte->mpte_reinjectq;
3147 	} else {
3148 dont_reinject:
3149 		sb_mb = mp_so->so_snd.sb_mb;
3150 	}
3151 	if (sb_mb == NULL) {
3152 		os_log_error(mptcp_log_handle, "%s - %lx: send-buffer is still empty\n", __func__,
3153 		    (unsigned long)VM_KERNEL_ADDRPERM(mpte));
3154 		goto out;
3155 	}
3156 
3157 	if (sb_mb == mpte->mpte_reinjectq) {
3158 		sb_cc = sb_mb->m_pkthdr.mp_rlen;
3159 		off = 0;
3160 
3161 		if (mptcp_search_seq_in_sub(sb_mb, so)) {
3162 			if (mptcp_can_send_more(mp_tp, TRUE)) {
3163 				goto dont_reinject;
3164 			}
3165 
3166 			error = ECANCELED;
3167 			goto out;
3168 		}
3169 
3170 		reinjected = TRUE;
3171 	} else if (flags & MPTCP_SUBOUT_PROBING) {
3172 		sb_cc = sb_mb->m_pkthdr.mp_rlen;
3173 		off = 0;
3174 	} else {
3175 		sb_cc = min(mp_so->so_snd.sb_cc, mp_tp->mpt_sndwnd);
3176 
3177 		/*
3178 		 * With TFO, there might be no data at all, thus still go into this
3179 		 * code-path here.
3180 		 */
3181 		if ((mp_so->so_flags1 & SOF1_PRECONNECT_DATA) ||
3182 		    MPTCP_SEQ_LT(mp_tp->mpt_sndnxt, mp_tp->mpt_sndmax)) {
3183 			off = mp_tp->mpt_sndnxt - mp_tp->mpt_snduna;
3184 			sb_cc -= off;
3185 		} else {
3186 			os_log_error(mptcp_log_handle, "%s - %lx: this should not happen: sndnxt %u sndmax %u\n",
3187 			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), (uint32_t)mp_tp->mpt_sndnxt,
3188 			    (uint32_t)mp_tp->mpt_sndmax);
3189 
3190 			goto out;
3191 		}
3192 	}
3193 
3194 	sb_cc = min(sb_cc, mptcp_subflow_cwnd_space(so));
3195 	if (sb_cc <= 0) {
3196 		os_log_error(mptcp_log_handle, "%s - %lx: sb_cc is %d, mp_so->sb_cc %u, sndwnd %u,sndnxt %u sndmax %u cwnd %u\n",
3197 		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), sb_cc, mp_so->so_snd.sb_cc, mp_tp->mpt_sndwnd,
3198 		    (uint32_t)mp_tp->mpt_sndnxt, (uint32_t)mp_tp->mpt_sndmax,
3199 		    mptcp_subflow_cwnd_space(so));
3200 	}
3201 
3202 	sb_cc = min(sb_cc, UINT16_MAX);
3203 
3204 	/*
3205 	 * Create a DSN mapping for the data we are about to send. It all
3206 	 * has the same mapping.
3207 	 */
3208 	if (reinjected) {
3209 		mpt_dsn = sb_mb->m_pkthdr.mp_dsn;
3210 	} else {
3211 		mpt_dsn = mp_tp->mpt_snduna + off;
3212 	}
3213 
3214 	mpt_mbuf = sb_mb;
3215 	while (mpt_mbuf && reinjected == FALSE &&
3216 	    (mpt_mbuf->m_pkthdr.mp_rlen == 0 ||
3217 	    mpt_mbuf->m_pkthdr.mp_rlen <= (uint32_t)off)) {
3218 		off -= mpt_mbuf->m_pkthdr.mp_rlen;
3219 		mpt_mbuf = mpt_mbuf->m_next;
3220 	}
3221 	VERIFY((mpt_mbuf == NULL) || (mpt_mbuf->m_pkthdr.pkt_flags & PKTF_MPTCP));
3222 
3223 	head = tail = NULL;
3224 
3225 	while (tot_sent < sb_cc) {
3226 		int32_t mlen;
3227 
3228 		mlen = mpt_mbuf->m_len;
3229 		mlen -= off;
3230 		mlen = MIN(mlen, sb_cc - tot_sent);
3231 
3232 		if (mlen < 0) {
3233 			os_log_error(mptcp_log_handle, "%s - %lx: mlen %d mp_rlen %u off %u sb_cc %u tot_sent %u\n",
3234 			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mlen, mpt_mbuf->m_pkthdr.mp_rlen,
3235 			    (uint32_t)off, sb_cc, tot_sent);
3236 			goto out;
3237 		}
3238 
3239 		if (mlen == 0) {
3240 			goto next;
3241 		}
3242 
3243 		m = m_copym_mode(mpt_mbuf, (int)off, mlen, M_DONTWAIT, NULL, NULL,
3244 		    M_COPYM_MUST_COPY_HDR);
3245 		if (m == NULL) {
3246 			os_log_error(mptcp_log_handle, "%s - %lx: m_copym_mode failed\n", __func__,
3247 			    (unsigned long)VM_KERNEL_ADDRPERM(mpte));
3248 			error = ENOBUFS;
3249 			break;
3250 		}
3251 
3252 		/* Create a DSN mapping for the data (m_copym does it) */
3253 		VERIFY(m->m_flags & M_PKTHDR);
3254 		VERIFY(m->m_next == NULL);
3255 
3256 		m->m_pkthdr.pkt_flags |= PKTF_MPTCP;
3257 		m->m_pkthdr.pkt_flags &= ~PKTF_MPSO;
3258 		m->m_pkthdr.mp_dsn = mpt_dsn;
3259 		m->m_pkthdr.mp_rseq = mpts->mpts_rel_seq;
3260 		m->m_pkthdr.len = mlen;
3261 
3262 		if (head == NULL) {
3263 			head = tail = m;
3264 		} else {
3265 			tail->m_next = m;
3266 			tail = m;
3267 		}
3268 
3269 		tot_sent += mlen;
3270 		off = 0;
3271 next:
3272 		mpt_mbuf = mpt_mbuf->m_next;
3273 	}
3274 
3275 	if (reinjected) {
3276 		if (sb_cc < sb_mb->m_pkthdr.mp_rlen) {
3277 			struct mbuf *n = sb_mb;
3278 
3279 			while (n) {
3280 				n->m_pkthdr.mp_dsn += sb_cc;
3281 				n->m_pkthdr.mp_rlen -= sb_cc;
3282 				n = n->m_next;
3283 			}
3284 			m_adj(sb_mb, sb_cc);
3285 		} else {
3286 			mpte->mpte_reinjectq = sb_mb->m_nextpkt;
3287 			m_freem(sb_mb);
3288 		}
3289 	}
3290 
3291 	if (head && (mp_tp->mpt_flags & MPTCPF_CHECKSUM)) {
3292 		dss_csum = mptcp_output_csum(head, mpt_dsn, mpts->mpts_rel_seq,
3293 		    tot_sent);
3294 	}
3295 
3296 	/* Now, let's update rel-seq and the data-level length */
3297 	mpts->mpts_rel_seq += tot_sent;
3298 	m = head;
3299 	while (m) {
3300 		if (mp_tp->mpt_flags & MPTCPF_CHECKSUM) {
3301 			m->m_pkthdr.mp_csum = dss_csum;
3302 		}
3303 		m->m_pkthdr.mp_rlen = tot_sent;
3304 		m = m->m_next;
3305 	}
3306 
3307 	if (head != NULL) {
3308 		if ((mpts->mpts_flags & MPTSF_TFO_REQD) &&
3309 		    (tp->t_tfo_stats == 0)) {
3310 			tp->t_mpflags |= TMPF_TFO_REQUEST;
3311 		}
3312 
3313 		error = so->so_proto->pr_usrreqs->pru_sosend(so, NULL, NULL, head, NULL, 0);
3314 		head = NULL;
3315 	}
3316 
3317 done_sending:
3318 	if (error == 0 ||
3319 	    (error == EWOULDBLOCK && (tp->t_mpflags & TMPF_TFO_REQUEST))) {
3320 		uint64_t new_sndnxt = mp_tp->mpt_sndnxt + tot_sent;
3321 
3322 		if (mpts->mpts_probesoon && mpts->mpts_maxseg && tot_sent) {
3323 			tcpstat.tcps_mp_num_probes++;
3324 			if ((uint32_t)tot_sent < mpts->mpts_maxseg) {
3325 				mpts->mpts_probecnt += 1;
3326 			} else {
3327 				mpts->mpts_probecnt +=
3328 				    tot_sent / mpts->mpts_maxseg;
3329 			}
3330 		}
3331 
3332 		if (!reinjected && !(flags & MPTCP_SUBOUT_PROBING)) {
3333 			if (MPTCP_DATASEQ_HIGH32(new_sndnxt) >
3334 			    MPTCP_DATASEQ_HIGH32(mp_tp->mpt_sndnxt)) {
3335 				mp_tp->mpt_flags |= MPTCPF_SND_64BITDSN;
3336 			}
3337 			mp_tp->mpt_sndnxt = new_sndnxt;
3338 		}
3339 
3340 		mptcp_cancel_timer(mp_tp, MPTT_REXMT);
3341 
3342 		/* Must be here as mptcp_can_send_more() checks for this */
3343 		soclearfastopen(mp_so);
3344 
3345 		if (IFNET_IS_CELLULAR(sotoinpcb(so)->inp_last_outifp)) {
3346 			mptcp_set_cellicon(mpte, mpts);
3347 
3348 			mpte->mpte_used_cell = 1;
3349 		} else {
3350 			/*
3351 			 * If during the past MPTCP_CELLICON_TOGGLE_RATE seconds we didn't
3352 			 * explicitly set the cellicon, then we unset it again.
3353 			 */
3354 			if (TSTMP_LT(mpte->mpte_last_cellicon_set + MPTCP_CELLICON_TOGGLE_RATE, tcp_now)) {
3355 				mptcp_unset_cellicon(mpte, NULL, 1);
3356 			}
3357 
3358 			mpte->mpte_used_wifi = 1;
3359 		}
3360 
3361 		/*
3362 		 * Don't propagate EWOULDBLOCK - it's already taken care of
3363 		 * in mptcp_usr_send for TFO.
3364 		 */
3365 		error = 0;
3366 	} else {
3367 		/* We need to revert our change to mpts_rel_seq */
3368 		mpts->mpts_rel_seq -= tot_sent;
3369 
3370 		os_log_error(mptcp_log_handle, "%s - %lx: %u error %d len %d subflags %#x sostate %#x soerror %u hiwat %u lowat %u\n",
3371 		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpts->mpts_connid, error, tot_sent, so->so_flags, so->so_state, so->so_error, so->so_snd.sb_hiwat, so->so_snd.sb_lowat);
3372 	}
3373 out:
3374 
3375 	if (head != NULL) {
3376 		m_freem(head);
3377 	}
3378 
3379 	if (wakeup) {
3380 		mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_WWAKEUP;
3381 	}
3382 
3383 	mptcp_handle_deferred_upcalls(mpte->mpte_mppcb, MPP_INSIDE_OUTPUT);
3384 	return error;
3385 }
3386 
3387 static void
mptcp_add_reinjectq(struct mptses * mpte,struct mbuf * m)3388 mptcp_add_reinjectq(struct mptses *mpte, struct mbuf *m)
3389 {
3390 	struct mbuf *n, *prev = NULL;
3391 
3392 	n = mpte->mpte_reinjectq;
3393 
3394 	/* First, look for an mbuf n, whose data-sequence-number is bigger or
3395 	 * equal than m's sequence number.
3396 	 */
3397 	while (n) {
3398 		if (MPTCP_SEQ_GEQ(n->m_pkthdr.mp_dsn, m->m_pkthdr.mp_dsn)) {
3399 			break;
3400 		}
3401 
3402 		prev = n;
3403 
3404 		n = n->m_nextpkt;
3405 	}
3406 
3407 	if (n) {
3408 		/* m is already fully covered by the next mbuf in the queue */
3409 		if (n->m_pkthdr.mp_dsn == m->m_pkthdr.mp_dsn &&
3410 		    n->m_pkthdr.mp_rlen >= m->m_pkthdr.mp_rlen) {
3411 			os_log(mptcp_log_handle, "%s - %lx: dsn %u dlen %u rseq %u fully covered with len %u\n",
3412 			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
3413 			    (uint32_t)m->m_pkthdr.mp_dsn, m->m_pkthdr.mp_rlen,
3414 			    m->m_pkthdr.mp_rseq, n->m_pkthdr.mp_rlen);
3415 			goto dont_queue;
3416 		}
3417 
3418 		/* m is covering the next mbuf entirely, thus we remove this guy */
3419 		if (m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen >= n->m_pkthdr.mp_dsn + n->m_pkthdr.mp_rlen) {
3420 			struct mbuf *tmp = n->m_nextpkt;
3421 
3422 			os_log(mptcp_log_handle, "%s - %lx: m (dsn %u len %u) is covering existing mbuf (dsn %u len %u)\n",
3423 			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
3424 			    (uint32_t)m->m_pkthdr.mp_dsn, m->m_pkthdr.mp_rlen,
3425 			    (uint32_t)n->m_pkthdr.mp_dsn, n->m_pkthdr.mp_rlen);
3426 
3427 			m->m_nextpkt = NULL;
3428 			if (prev == NULL) {
3429 				mpte->mpte_reinjectq = tmp;
3430 			} else {
3431 				prev->m_nextpkt = tmp;
3432 			}
3433 
3434 			m_freem(n);
3435 			n = tmp;
3436 		}
3437 	}
3438 
3439 	if (prev) {
3440 		/* m is already fully covered by the previous mbuf in the queue */
3441 		if (prev->m_pkthdr.mp_dsn + prev->m_pkthdr.mp_rlen >= m->m_pkthdr.mp_dsn + m->m_pkthdr.len) {
3442 			os_log(mptcp_log_handle, "%s - %lx: prev (dsn %u len %u) covers us (dsn %u len %u)\n",
3443 			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
3444 			    (uint32_t)prev->m_pkthdr.mp_dsn, prev->m_pkthdr.mp_rlen,
3445 			    (uint32_t)m->m_pkthdr.mp_dsn, m->m_pkthdr.mp_rlen);
3446 			goto dont_queue;
3447 		}
3448 	}
3449 
3450 	if (prev == NULL) {
3451 		mpte->mpte_reinjectq = m;
3452 	} else {
3453 		prev->m_nextpkt = m;
3454 	}
3455 
3456 	m->m_nextpkt = n;
3457 
3458 	return;
3459 
3460 dont_queue:
3461 	m_freem(m);
3462 	return;
3463 }
3464 
3465 static struct mbuf *
mptcp_lookup_dsn(struct mptses * mpte,uint64_t dsn)3466 mptcp_lookup_dsn(struct mptses *mpte, uint64_t dsn)
3467 {
3468 	struct socket *mp_so = mptetoso(mpte);
3469 	struct mbuf *m;
3470 
3471 	m = mp_so->so_snd.sb_mb;
3472 
3473 	while (m) {
3474 		/* If this segment covers what we are looking for, return it. */
3475 		if (MPTCP_SEQ_LEQ(m->m_pkthdr.mp_dsn, dsn) &&
3476 		    MPTCP_SEQ_GT(m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen, dsn)) {
3477 			break;
3478 		}
3479 
3480 
3481 		/* Segment is no more in the queue */
3482 		if (MPTCP_SEQ_GT(m->m_pkthdr.mp_dsn, dsn)) {
3483 			return NULL;
3484 		}
3485 
3486 		m = m->m_next;
3487 	}
3488 
3489 	return m;
3490 }
3491 
3492 static struct mbuf *
mptcp_copy_mbuf_list(struct mptses * mpte,struct mbuf * m,int len)3493 mptcp_copy_mbuf_list(struct mptses *mpte, struct mbuf *m, int len)
3494 {
3495 	struct mbuf *top = NULL, *tail = NULL;
3496 	uint64_t dsn;
3497 	uint32_t dlen, rseq;
3498 
3499 	dsn = m->m_pkthdr.mp_dsn;
3500 	dlen = m->m_pkthdr.mp_rlen;
3501 	rseq = m->m_pkthdr.mp_rseq;
3502 
3503 	while (len > 0) {
3504 		struct mbuf *n;
3505 
3506 		VERIFY((m->m_flags & M_PKTHDR) && (m->m_pkthdr.pkt_flags & PKTF_MPTCP));
3507 
3508 		n = m_copym_mode(m, 0, m->m_len, M_DONTWAIT, NULL, NULL, M_COPYM_MUST_COPY_HDR);
3509 		if (n == NULL) {
3510 			os_log_error(mptcp_log_handle, "%s - %lx: m_copym_mode returned NULL\n",
3511 			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
3512 			goto err;
3513 		}
3514 
3515 		VERIFY(n->m_flags & M_PKTHDR);
3516 		VERIFY(n->m_next == NULL);
3517 		VERIFY(n->m_pkthdr.mp_dsn == dsn);
3518 		VERIFY(n->m_pkthdr.mp_rlen == dlen);
3519 		VERIFY(n->m_pkthdr.mp_rseq == rseq);
3520 		VERIFY(n->m_len == m->m_len);
3521 
3522 		n->m_pkthdr.pkt_flags |= (PKTF_MPSO | PKTF_MPTCP);
3523 
3524 		if (top == NULL) {
3525 			top = n;
3526 		}
3527 
3528 		if (tail != NULL) {
3529 			tail->m_next = n;
3530 		}
3531 
3532 		tail = n;
3533 
3534 		len -= m->m_len;
3535 		m = m->m_next;
3536 	}
3537 
3538 	return top;
3539 
3540 err:
3541 	if (top) {
3542 		m_freem(top);
3543 	}
3544 
3545 	return NULL;
3546 }
3547 
/*
 * Walk the subflow's send buffer and queue every not-yet-fully-acked DSS
 * mapping on the MPTCP session's reinject queue, so another subflow can
 * retransmit the data. Mbufs already queued are marked PKTF_MPTCP_REINJ
 * to avoid reinjecting them twice.
 */
static void
mptcp_reinject_mbufs(struct socket *so)
{
	struct tcpcb *tp = sototcpcb(so);
	struct mptsub *mpts = tp->t_mpsub;
	struct mptcb *mp_tp = tptomptp(tp);
	struct mptses *mpte = mp_tp->mpt_mpte;
	struct sockbuf *sb = &so->so_snd;
	struct mbuf *m;

	m = sb->sb_mb;
	while (m) {
		/* orig stays on the subflow mbuf; m may be redirected below */
		struct mbuf *n = m->m_next, *orig = m;
		bool set_reinject_flag = false;

		VERIFY((m->m_flags & M_PKTHDR) && (m->m_pkthdr.pkt_flags & PKTF_MPTCP));

		/* Already on the reinject queue — skip */
		if (m->m_pkthdr.pkt_flags & PKTF_MPTCP_REINJ) {
			goto next;
		}

		/* Has it all already been acknowledged at the data-level? */
		if (MPTCP_SEQ_GEQ(mp_tp->mpt_snduna, m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen)) {
			goto next;
		}

		/* Part of this has already been acknowledged - lookup in the
		 * MPTCP-socket for the segment.
		 */
		if (SEQ_GT(tp->snd_una - mpts->mpts_iss, m->m_pkthdr.mp_rseq)) {
			/* m now points into the MPTCP-level send buffer */
			m = mptcp_lookup_dsn(mpte, m->m_pkthdr.mp_dsn);
			if (m == NULL) {
				goto next;
			}
		}

		/* Copy the mbuf with headers (aka, DSN-numbers) */
		m = mptcp_copy_mbuf_list(mpte, m, m->m_pkthdr.mp_rlen);
		if (m == NULL) {
			/* Allocation failure — stop; a later call can retry */
			break;
		}

		VERIFY(m->m_nextpkt == NULL);

		/* Now, add to the reinject-queue, eliminating overlapping
		 * segments
		 */
		mptcp_add_reinjectq(mpte, m);

		set_reinject_flag = true;
		orig->m_pkthdr.pkt_flags |= PKTF_MPTCP_REINJ;

next:
		/* mp_rlen can cover multiple mbufs, so advance to the end of it. */
		while (n) {
			VERIFY((n->m_flags & M_PKTHDR) && (n->m_pkthdr.pkt_flags & PKTF_MPTCP));

			/* A different mp_dsn starts the next mapping */
			if (n->m_pkthdr.mp_dsn != orig->m_pkthdr.mp_dsn) {
				break;
			}

			if (set_reinject_flag) {
				n->m_pkthdr.pkt_flags |= PKTF_MPTCP_REINJ;
			}
			n = n->m_next;
		}

		m = n;
	}
}
3618 
3619 void
mptcp_clean_reinjectq(struct mptses * mpte)3620 mptcp_clean_reinjectq(struct mptses *mpte)
3621 {
3622 	struct mptcb *mp_tp = mpte->mpte_mptcb;
3623 
3624 	socket_lock_assert_owned(mptetoso(mpte));
3625 
3626 	while (mpte->mpte_reinjectq) {
3627 		struct mbuf *m = mpte->mpte_reinjectq;
3628 
3629 		if (MPTCP_SEQ_GEQ(m->m_pkthdr.mp_dsn, mp_tp->mpt_snduna) ||
3630 		    MPTCP_SEQ_GT(m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen, mp_tp->mpt_snduna)) {
3631 			break;
3632 		}
3633 
3634 		mpte->mpte_reinjectq = m->m_nextpkt;
3635 		m->m_nextpkt = NULL;
3636 		m_freem(m);
3637 	}
3638 }
3639 
3640 static ev_ret_t
mptcp_subflow_propagate_ev(struct mptses * mpte,struct mptsub * mpts,uint32_t * p_mpsofilt_hint,uint32_t event)3641 mptcp_subflow_propagate_ev(struct mptses *mpte, struct mptsub *mpts,
3642     uint32_t *p_mpsofilt_hint, uint32_t event)
3643 {
3644 	struct socket *mp_so, *so;
3645 	struct mptcb *mp_tp;
3646 
3647 	mp_so = mptetoso(mpte);
3648 	mp_tp = mpte->mpte_mptcb;
3649 	so = mpts->mpts_socket;
3650 
3651 	/*
3652 	 * We got an event for this subflow that might need to be propagated,
3653 	 * based on the state of the MPTCP connection.
3654 	 */
3655 	if (mp_tp->mpt_state < MPTCPS_ESTABLISHED ||
3656 	    (!(mp_tp->mpt_flags & MPTCPF_JOIN_READY) && !(mpts->mpts_flags & MPTSF_MP_READY)) ||
3657 	    ((mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) && (mpts->mpts_flags & MPTSF_ACTIVE))) {
3658 		mp_so->so_error = so->so_error;
3659 		*p_mpsofilt_hint |= event;
3660 	}
3661 
3662 	return MPTS_EVRET_OK;
3663 }
3664 
3665 /*
3666  * Handle SO_FILT_HINT_NOSRCADDR subflow socket event.
3667  */
3668 static ev_ret_t
mptcp_subflow_nosrcaddr_ev(struct mptses * mpte,struct mptsub * mpts,uint32_t * p_mpsofilt_hint,uint32_t event)3669 mptcp_subflow_nosrcaddr_ev(struct mptses *mpte, struct mptsub *mpts,
3670     uint32_t *p_mpsofilt_hint, uint32_t event)
3671 {
3672 	struct socket *mp_so;
3673 	struct tcpcb *tp;
3674 
3675 	mp_so = mptetoso(mpte);
3676 	tp = intotcpcb(sotoinpcb(mpts->mpts_socket));
3677 
3678 	/*
3679 	 * This overwrites any previous mpte_lost_aid to avoid storing
3680 	 * too much state when the typical case has only two subflows.
3681 	 */
3682 	mpte->mpte_flags |= MPTE_SND_REM_ADDR;
3683 	mpte->mpte_lost_aid = tp->t_local_aid;
3684 
3685 	/*
3686 	 * The subflow connection has lost its source address.
3687 	 */
3688 	mptcp_subflow_abort(mpts, EADDRNOTAVAIL);
3689 
3690 	if (mp_so->so_flags & SOF_NOADDRAVAIL) {
3691 		mptcp_subflow_propagate_ev(mpte, mpts, p_mpsofilt_hint, event);
3692 	}
3693 
3694 	return MPTS_EVRET_DELETE;
3695 }
3696 
3697 static ev_ret_t
mptcp_subflow_mpsuberror_ev(struct mptses * mpte,struct mptsub * mpts,uint32_t * p_mpsofilt_hint,uint32_t event)3698 mptcp_subflow_mpsuberror_ev(struct mptses *mpte, struct mptsub *mpts,
3699     uint32_t *p_mpsofilt_hint, uint32_t event)
3700 {
3701 #pragma unused(event, p_mpsofilt_hint)
3702 	struct socket *so, *mp_so;
3703 
3704 	so = mpts->mpts_socket;
3705 
3706 	if (so->so_error != ENODATA) {
3707 		return MPTS_EVRET_OK;
3708 	}
3709 
3710 
3711 	mp_so = mptetoso(mpte);
3712 
3713 	mp_so->so_error = ENODATA;
3714 
3715 	sorwakeup(mp_so);
3716 	sowwakeup(mp_so);
3717 
3718 	return MPTS_EVRET_OK;
3719 }
3720 
3721 
3722 /*
3723  * Handle SO_FILT_HINT_MPCANTRCVMORE subflow socket event that
3724  * indicates that the remote side sent a Data FIN
3725  */
3726 static ev_ret_t
mptcp_subflow_mpcantrcvmore_ev(struct mptses * mpte,struct mptsub * mpts,uint32_t * p_mpsofilt_hint,uint32_t event)3727 mptcp_subflow_mpcantrcvmore_ev(struct mptses *mpte, struct mptsub *mpts,
3728     uint32_t *p_mpsofilt_hint, uint32_t event)
3729 {
3730 #pragma unused(event, mpts)
3731 	struct mptcb *mp_tp = mpte->mpte_mptcb;
3732 
3733 	/*
3734 	 * We got a Data FIN for the MPTCP connection.
3735 	 * The FIN may arrive with data. The data is handed up to the
3736 	 * mptcp socket and the user is notified so that it may close
3737 	 * the socket if needed.
3738 	 */
3739 	if (mp_tp->mpt_state == MPTCPS_CLOSE_WAIT) {
3740 		*p_mpsofilt_hint |= SO_FILT_HINT_CANTRCVMORE;
3741 	}
3742 
3743 	return MPTS_EVRET_OK; /* keep the subflow socket around */
3744 }
3745 
3746 /*
3747  * Handle SO_FILT_HINT_MPFAILOVER subflow socket event
3748  */
3749 static ev_ret_t
mptcp_subflow_failover_ev(struct mptses * mpte,struct mptsub * mpts,uint32_t * p_mpsofilt_hint,uint32_t event)3750 mptcp_subflow_failover_ev(struct mptses *mpte, struct mptsub *mpts,
3751     uint32_t *p_mpsofilt_hint, uint32_t event)
3752 {
3753 #pragma unused(event, p_mpsofilt_hint)
3754 	struct mptsub *mpts_alt = NULL;
3755 	struct socket *alt_so = NULL;
3756 	struct socket *mp_so;
3757 	int altpath_exists = 0;
3758 
3759 	mp_so = mptetoso(mpte);
3760 	os_log_info(mptcp_log_handle, "%s - %lx\n", __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
3761 
3762 	mptcp_reinject_mbufs(mpts->mpts_socket);
3763 
3764 	mpts_alt = mptcp_get_subflow(mpte, NULL);
3765 
3766 	/* If there is no alternate eligible subflow, ignore the failover hint. */
3767 	if (mpts_alt == NULL || mpts_alt == mpts) {
3768 		os_log(mptcp_log_handle, "%s - %lx no alternate path\n", __func__,
3769 		    (unsigned long)VM_KERNEL_ADDRPERM(mpte));
3770 
3771 		goto done;
3772 	}
3773 
3774 	altpath_exists = 1;
3775 	alt_so = mpts_alt->mpts_socket;
3776 	if (mpts_alt->mpts_flags & MPTSF_FAILINGOVER) {
3777 		/* All data acknowledged and no RTT spike */
3778 		if (alt_so->so_snd.sb_cc == 0 && mptcp_no_rto_spike(alt_so)) {
3779 			mpts_alt->mpts_flags &= ~MPTSF_FAILINGOVER;
3780 		} else {
3781 			/* no alternate path available */
3782 			altpath_exists = 0;
3783 		}
3784 	}
3785 
3786 	if (altpath_exists) {
3787 		mpts_alt->mpts_flags |= MPTSF_ACTIVE;
3788 
3789 		mpte->mpte_active_sub = mpts_alt;
3790 		mpts->mpts_flags |= MPTSF_FAILINGOVER;
3791 		mpts->mpts_flags &= ~MPTSF_ACTIVE;
3792 
3793 		os_log_info(mptcp_log_handle, "%s - %lx: switched from %d to %d\n",
3794 		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpts->mpts_connid, mpts_alt->mpts_connid);
3795 
3796 		mptcpstats_inc_switch(mpte, mpts);
3797 
3798 		sowwakeup(alt_so);
3799 	} else {
3800 done:
3801 		mpts->mpts_socket->so_flags &= ~SOF_MP_TRYFAILOVER;
3802 	}
3803 
3804 	return MPTS_EVRET_OK;
3805 }
3806 
3807 /*
3808  * Handle SO_FILT_HINT_IFDENIED subflow socket event.
3809  */
3810 static ev_ret_t
mptcp_subflow_ifdenied_ev(struct mptses * mpte,struct mptsub * mpts,uint32_t * p_mpsofilt_hint,uint32_t event)3811 mptcp_subflow_ifdenied_ev(struct mptses *mpte, struct mptsub *mpts,
3812     uint32_t *p_mpsofilt_hint, uint32_t event)
3813 {
3814 	/*
3815 	 * The subflow connection cannot use the outgoing interface, let's
3816 	 * close this subflow.
3817 	 */
3818 	mptcp_subflow_abort(mpts, EPERM);
3819 
3820 	mptcp_subflow_propagate_ev(mpte, mpts, p_mpsofilt_hint, event);
3821 
3822 	return MPTS_EVRET_DELETE;
3823 }
3824 
3825 /*
3826  * https://tools.ietf.org/html/rfc6052#section-2
3827  * https://tools.ietf.org/html/rfc6147#section-5.2
3828  */
3829 static boolean_t
mptcp_desynthesize_ipv6_addr(struct mptses * mpte,const struct in6_addr * addr,const struct ipv6_prefix * prefix,struct in_addr * addrv4)3830 mptcp_desynthesize_ipv6_addr(struct mptses *mpte, const struct in6_addr *addr,
3831     const struct ipv6_prefix *prefix,
3832     struct in_addr *addrv4)
3833 {
3834 	char buf[MAX_IPv4_STR_LEN];
3835 	char *ptrv4 = (char *)addrv4;
3836 	const char *ptr = (const char *)addr;
3837 
3838 	if (memcmp(addr, &prefix->ipv6_prefix, prefix->prefix_len) != 0) {
3839 		return false;
3840 	}
3841 
3842 	switch (prefix->prefix_len) {
3843 	case NAT64_PREFIX_LEN_96:
3844 		memcpy(ptrv4, ptr + 12, 4);
3845 		break;
3846 	case NAT64_PREFIX_LEN_64:
3847 		memcpy(ptrv4, ptr + 9, 4);
3848 		break;
3849 	case NAT64_PREFIX_LEN_56:
3850 		memcpy(ptrv4, ptr + 7, 1);
3851 		memcpy(ptrv4 + 1, ptr + 9, 3);
3852 		break;
3853 	case NAT64_PREFIX_LEN_48:
3854 		memcpy(ptrv4, ptr + 6, 2);
3855 		memcpy(ptrv4 + 2, ptr + 9, 2);
3856 		break;
3857 	case NAT64_PREFIX_LEN_40:
3858 		memcpy(ptrv4, ptr + 5, 3);
3859 		memcpy(ptrv4 + 3, ptr + 9, 1);
3860 		break;
3861 	case NAT64_PREFIX_LEN_32:
3862 		memcpy(ptrv4, ptr + 4, 4);
3863 		break;
3864 	default:
3865 		panic("NAT64-prefix len is wrong: %u",
3866 		    prefix->prefix_len);
3867 	}
3868 
3869 	os_log_info(mptcp_log_handle, "%s - %lx: desynthesized to %s\n", __func__,
3870 	    (unsigned long)VM_KERNEL_ADDRPERM(mpte),
3871 	    inet_ntop(AF_INET, (void *)addrv4, buf, sizeof(buf)));
3872 
3873 	return true;
3874 }
3875 
3876 static void
mptcp_handle_ipv6_connection(struct mptses * mpte,const struct mptsub * mpts)3877 mptcp_handle_ipv6_connection(struct mptses *mpte, const struct mptsub *mpts)
3878 {
3879 	struct ipv6_prefix nat64prefixes[NAT64_MAX_NUM_PREFIXES];
3880 	struct socket *so = mpts->mpts_socket;
3881 	struct ifnet *ifp;
3882 	int j;
3883 
3884 	/* Subflow IPs will be steered directly by the server - no need to
3885 	 * desynthesize.
3886 	 */
3887 	if (mpte->mpte_flags & MPTE_UNICAST_IP) {
3888 		return;
3889 	}
3890 
3891 	ifp = sotoinpcb(so)->inp_last_outifp;
3892 
3893 	if (ifnet_get_nat64prefix(ifp, nat64prefixes) == ENOENT) {
3894 		return;
3895 	}
3896 
3897 	for (j = 0; j < NAT64_MAX_NUM_PREFIXES; j++) {
3898 		int success;
3899 
3900 		if (nat64prefixes[j].prefix_len == 0) {
3901 			continue;
3902 		}
3903 
3904 		success = mptcp_desynthesize_ipv6_addr(mpte,
3905 		    &mpte->__mpte_dst_v6.sin6_addr,
3906 		    &nat64prefixes[j],
3907 		    &mpte->mpte_sub_dst_v4.sin_addr);
3908 		if (success) {
3909 			mpte->mpte_sub_dst_v4.sin_len = sizeof(mpte->mpte_sub_dst_v4);
3910 			mpte->mpte_sub_dst_v4.sin_family = AF_INET;
3911 			mpte->mpte_sub_dst_v4.sin_port = mpte->__mpte_dst_v6.sin6_port;
3912 
3913 			/*
3914 			 * We connected to a NAT64'ed address. Let's remove it
3915 			 * from the potential IPs to use. Whenever we are back on
3916 			 * that network and need to connect, we can synthesize again.
3917 			 *
3918 			 * Otherwise, on different IPv6 networks we will attempt
3919 			 * to connect to that NAT64 address...
3920 			 */
3921 			memset(&mpte->mpte_sub_dst_v6, 0, sizeof(mpte->mpte_sub_dst_v6));
3922 			break;
3923 		}
3924 	}
3925 }
3926 
3927 static void
mptcp_try_alternate_port(struct mptses * mpte,struct mptsub * mpts)3928 mptcp_try_alternate_port(struct mptses *mpte, struct mptsub *mpts)
3929 {
3930 	struct inpcb *inp;
3931 
3932 	if (!mptcp_ok_to_create_subflows(mpte->mpte_mptcb)) {
3933 		return;
3934 	}
3935 
3936 	inp = sotoinpcb(mpts->mpts_socket);
3937 	if (inp == NULL) {
3938 		return;
3939 	}
3940 
3941 	/* Should we try the alternate port? */
3942 	if (mpte->mpte_alternate_port &&
3943 	    inp->inp_fport != mpte->mpte_alternate_port) {
3944 		union sockaddr_in_4_6 dst;
3945 		struct sockaddr_in *dst_in = (struct sockaddr_in *)&dst;
3946 
3947 		memcpy(&dst, &mpts->mpts_dst, mpts->mpts_dst.sa_len);
3948 
3949 		dst_in->sin_port = mpte->mpte_alternate_port;
3950 
3951 		mptcp_subflow_add(mpte, NULL, (struct sockaddr *)&dst,
3952 		    mpts->mpts_ifscope, NULL);
3953 	} else { /* Else, we tried all we could, mark this interface as non-MPTCP */
3954 		unsigned int i;
3955 
3956 		if (inp->inp_last_outifp == NULL) {
3957 			return;
3958 		}
3959 
3960 		for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
3961 			struct mpt_itf_info *info =  &mpte->mpte_itfinfo[i];
3962 
3963 			if (inp->inp_last_outifp->if_index == info->ifindex) {
3964 				info->no_mptcp_support = 1;
3965 				break;
3966 			}
3967 		}
3968 	}
3969 }
3970 
/*
 * If TFO data is successfully acked, it must be dropped from the MPTCP
 * send buffer: the bytes sent on the SYN are already at the peer, so the
 * MPTCP-level send state is rewound to avoid retransmitting them.
 */
static void
mptcp_drop_tfo_data(struct mptses *mpte, struct mptsub *mpts)
{
	struct socket *mp_so = mptetoso(mpte);
	struct socket *so = mpts->mpts_socket;
	struct tcpcb *tp = intotcpcb(sotoinpcb(so));
	struct mptcb *mp_tp = mpte->mpte_mptcb;

	/* If data was sent with SYN, rewind state */
	if (tp->t_tfo_stats & TFO_S_SYN_DATA_ACKED) {
		/* Bytes the MPTCP level considers outstanding */
		u_int64_t mp_droplen = mp_tp->mpt_sndnxt - mp_tp->mpt_snduna;
		/* Bytes of SYN-data the peer acked (the -1 excludes the SYN itself) */
		unsigned int tcp_droplen = tp->snd_una - tp->iss - 1;

		VERIFY(mp_droplen <= (UINT_MAX));
		VERIFY(mp_droplen >= tcp_droplen);

		mpts->mpts_flags &= ~MPTSF_TFO_REQD;
		mpts->mpts_iss += tcp_droplen;
		tp->t_mpflags &= ~TMPF_TFO_REQUEST;

		if (mp_droplen > tcp_droplen) {
			/* handle partial TCP ack */
			mp_so->so_flags1 |= SOF1_TFO_REWIND;
			mp_tp->mpt_sndnxt = mp_tp->mpt_snduna + (mp_droplen - tcp_droplen);
			mp_droplen = tcp_droplen;
		} else {
			/* all data on SYN was acked */
			mpts->mpts_rel_seq = 1;
			mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;
		}
		mp_tp->mpt_sndmax -= tcp_droplen;

		/* Drop the acked SYN-data from the MPTCP socket's send buffer */
		if (mp_droplen != 0) {
			VERIFY(mp_so->so_snd.sb_mb != NULL);
			sbdrop(&mp_so->so_snd, (int)mp_droplen);
		}
	}
}
4010 
4011 /*
4012  * Handle SO_FILT_HINT_CONNECTED subflow socket event.
4013  */
4014 static ev_ret_t
mptcp_subflow_connected_ev(struct mptses * mpte,struct mptsub * mpts,uint32_t * p_mpsofilt_hint,uint32_t event)4015 mptcp_subflow_connected_ev(struct mptses *mpte, struct mptsub *mpts,
4016     uint32_t *p_mpsofilt_hint, uint32_t event)
4017 {
4018 #pragma unused(event, p_mpsofilt_hint)
4019 	struct socket *mp_so, *so;
4020 	struct inpcb *inp;
4021 	struct tcpcb *tp;
4022 	struct mptcb *mp_tp;
4023 	int af;
4024 	boolean_t mpok = FALSE;
4025 
4026 	mp_so = mptetoso(mpte);
4027 	mp_tp = mpte->mpte_mptcb;
4028 	so = mpts->mpts_socket;
4029 	tp = sototcpcb(so);
4030 	af = mpts->mpts_dst.sa_family;
4031 
4032 	if (mpts->mpts_flags & MPTSF_CONNECTED) {
4033 		return MPTS_EVRET_OK;
4034 	}
4035 
4036 	if ((mpts->mpts_flags & MPTSF_DISCONNECTED) ||
4037 	    (mpts->mpts_flags & MPTSF_DISCONNECTING)) {
4038 		return MPTS_EVRET_OK;
4039 	}
4040 
4041 	/*
4042 	 * The subflow connection has been connected.  Find out whether it
4043 	 * is connected as a regular TCP or as a MPTCP subflow.  The idea is:
4044 	 *
4045 	 *   a. If MPTCP connection is not yet established, then this must be
4046 	 *	the first subflow connection.  If MPTCP failed to negotiate,
4047 	 *	fallback to regular TCP by degrading this subflow.
4048 	 *
4049 	 *   b. If MPTCP connection has been established, then this must be
4050 	 *	one of the subsequent subflow connections. If MPTCP failed
4051 	 *	to negotiate, disconnect the connection.
4052 	 *
4053 	 * Right now, we simply unblock any waiters at the MPTCP socket layer
4054 	 * if the MPTCP connection has not been established.
4055 	 */
4056 
4057 	if (so->so_state & SS_ISDISCONNECTED) {
4058 		/*
4059 		 * With MPTCP joins, a connection is connected at the subflow
4060 		 * level, but the 4th ACK from the server elevates the MPTCP
4061 		 * subflow to connected state. So there is a small window
4062 		 * where the subflow could get disconnected before the
4063 		 * connected event is processed.
4064 		 */
4065 		return MPTS_EVRET_OK;
4066 	}
4067 
4068 	if (mpts->mpts_flags & MPTSF_TFO_REQD) {
4069 		mptcp_drop_tfo_data(mpte, mpts);
4070 	}
4071 
4072 	mpts->mpts_flags &= ~(MPTSF_CONNECTING | MPTSF_TFO_REQD);
4073 	mpts->mpts_flags |= MPTSF_CONNECTED;
4074 
4075 	if (tp->t_mpflags & TMPF_MPTCP_TRUE) {
4076 		mpts->mpts_flags |= MPTSF_MP_CAPABLE;
4077 	}
4078 
4079 	tp->t_mpflags &= ~TMPF_TFO_REQUEST;
4080 
4081 	/* get/verify the outbound interface */
4082 	inp = sotoinpcb(so);
4083 
4084 	mpts->mpts_maxseg = tp->t_maxseg;
4085 
4086 	mpok = (mpts->mpts_flags & MPTSF_MP_CAPABLE);
4087 
4088 	if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
4089 		mp_tp->mpt_state = MPTCPS_ESTABLISHED;
4090 		mpte->mpte_associd = mpts->mpts_connid;
4091 		DTRACE_MPTCP2(state__change,
4092 		    struct mptcb *, mp_tp,
4093 		    uint32_t, 0 /* event */);
4094 
4095 		if (SOCK_DOM(so) == AF_INET) {
4096 			in_getsockaddr_s(so, &mpte->__mpte_src_v4);
4097 		} else {
4098 			in6_getsockaddr_s(so, &mpte->__mpte_src_v6);
4099 		}
4100 
4101 		mpts->mpts_flags |= MPTSF_ACTIVE;
4102 
4103 		/* case (a) above */
4104 		if (!mpok) {
4105 			tcpstat.tcps_mpcap_fallback++;
4106 
4107 			tp->t_mpflags |= TMPF_INFIN_SENT;
4108 			mptcp_notify_mpfail(so);
4109 		} else {
4110 			if (IFNET_IS_CELLULAR(inp->inp_last_outifp) &&
4111 			    mptcp_subflows_need_backup_flag(mpte)) {
4112 				tp->t_mpflags |= (TMPF_BACKUP_PATH | TMPF_SND_MPPRIO);
4113 			} else {
4114 				mpts->mpts_flags |= MPTSF_PREFERRED;
4115 			}
4116 			mpts->mpts_flags |= MPTSF_MPCAP_CTRSET;
4117 			mpte->mpte_nummpcapflows++;
4118 
4119 			if (SOCK_DOM(so) == AF_INET6) {
4120 				mptcp_handle_ipv6_connection(mpte, mpts);
4121 			}
4122 
4123 			mptcp_check_subflows_and_add(mpte);
4124 
4125 			if (IFNET_IS_CELLULAR(inp->inp_last_outifp)) {
4126 				mpte->mpte_initial_cell = 1;
4127 			}
4128 
4129 			mpte->mpte_handshake_success = 1;
4130 		}
4131 
4132 		mp_tp->mpt_sndwnd = tp->snd_wnd;
4133 		mp_tp->mpt_sndwl1 = mp_tp->mpt_rcvnxt;
4134 		mp_tp->mpt_sndwl2 = mp_tp->mpt_snduna;
4135 		soisconnected(mp_so);
4136 	} else if (mpok) {
4137 		/*
4138 		 * case (b) above
4139 		 * In case of additional flows, the MPTCP socket is not
4140 		 * MPTSF_MP_CAPABLE until an ACK is received from server
4141 		 * for 3-way handshake.  TCP would have guaranteed that this
4142 		 * is an MPTCP subflow.
4143 		 */
4144 		if (IFNET_IS_CELLULAR(inp->inp_last_outifp) &&
4145 		    !(tp->t_mpflags & TMPF_BACKUP_PATH) &&
4146 		    mptcp_subflows_need_backup_flag(mpte)) {
4147 			tp->t_mpflags |= (TMPF_BACKUP_PATH | TMPF_SND_MPPRIO);
4148 			mpts->mpts_flags &= ~MPTSF_PREFERRED;
4149 		} else {
4150 			mpts->mpts_flags |= MPTSF_PREFERRED;
4151 		}
4152 
4153 		mpts->mpts_flags |= MPTSF_MPCAP_CTRSET;
4154 		mpte->mpte_nummpcapflows++;
4155 
4156 		mpts->mpts_rel_seq = 1;
4157 
4158 		mptcp_check_subflows_and_remove(mpte);
4159 	} else {
4160 		mptcp_try_alternate_port(mpte, mpts);
4161 
4162 		tcpstat.tcps_join_fallback++;
4163 		if (IFNET_IS_CELLULAR(inp->inp_last_outifp)) {
4164 			tcpstat.tcps_mptcp_cell_proxy++;
4165 		} else {
4166 			tcpstat.tcps_mptcp_wifi_proxy++;
4167 		}
4168 
4169 		soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
4170 
4171 		return MPTS_EVRET_OK;
4172 	}
4173 
4174 	/* This call, just to "book" an entry in the stats-table for this ifindex */
4175 	mptcpstats_get_index(mpte->mpte_itfstats, mpts);
4176 
4177 	mptcp_output(mpte);
4178 
4179 	return MPTS_EVRET_OK; /* keep the subflow socket around */
4180 }
4181 
4182 /*
4183  * Handle SO_FILT_HINT_DISCONNECTED subflow socket event.
4184  */
4185 static ev_ret_t
mptcp_subflow_disconnected_ev(struct mptses * mpte,struct mptsub * mpts,uint32_t * p_mpsofilt_hint,uint32_t event)4186 mptcp_subflow_disconnected_ev(struct mptses *mpte, struct mptsub *mpts,
4187     uint32_t *p_mpsofilt_hint, uint32_t event)
4188 {
4189 #pragma unused(event, p_mpsofilt_hint)
4190 	struct socket *mp_so, *so;
4191 	struct mptcb *mp_tp;
4192 
4193 	mp_so = mptetoso(mpte);
4194 	mp_tp = mpte->mpte_mptcb;
4195 	so = mpts->mpts_socket;
4196 
4197 	if (mpts->mpts_flags & MPTSF_DISCONNECTED) {
4198 		return MPTS_EVRET_DELETE;
4199 	}
4200 
4201 	mpts->mpts_flags |= MPTSF_DISCONNECTED;
4202 
4203 	/* The subflow connection has been disconnected. */
4204 
4205 	if (mpts->mpts_flags & MPTSF_MPCAP_CTRSET) {
4206 		mpte->mpte_nummpcapflows--;
4207 		if (mpte->mpte_active_sub == mpts) {
4208 			mpte->mpte_active_sub = NULL;
4209 		}
4210 		mpts->mpts_flags &= ~MPTSF_MPCAP_CTRSET;
4211 	} else {
4212 		if (so->so_flags & SOF_MP_SEC_SUBFLOW &&
4213 		    !(mpts->mpts_flags & MPTSF_CONNECTED)) {
4214 			mptcp_try_alternate_port(mpte, mpts);
4215 		}
4216 	}
4217 
4218 	if (mp_tp->mpt_state < MPTCPS_ESTABLISHED ||
4219 	    ((mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) && (mpts->mpts_flags & MPTSF_ACTIVE))) {
4220 		mptcp_drop(mpte, mp_tp, so->so_error);
4221 	}
4222 
4223 	/*
4224 	 * Clear flags that are used by getconninfo to return state.
4225 	 * Retain like MPTSF_DELETEOK for internal purposes.
4226 	 */
4227 	mpts->mpts_flags &= ~(MPTSF_CONNECTING | MPTSF_CONNECT_PENDING |
4228 	    MPTSF_CONNECTED | MPTSF_DISCONNECTING | MPTSF_PREFERRED |
4229 	    MPTSF_MP_CAPABLE | MPTSF_MP_READY | MPTSF_MP_DEGRADED | MPTSF_ACTIVE);
4230 
4231 	return MPTS_EVRET_DELETE;
4232 }
4233 
4234 /*
4235  * Handle SO_FILT_HINT_MPSTATUS subflow socket event
4236  */
4237 static ev_ret_t
mptcp_subflow_mpstatus_ev(struct mptses * mpte,struct mptsub * mpts,uint32_t * p_mpsofilt_hint,uint32_t event)4238 mptcp_subflow_mpstatus_ev(struct mptses *mpte, struct mptsub *mpts,
4239     uint32_t *p_mpsofilt_hint, uint32_t event)
4240 {
4241 #pragma unused(event, p_mpsofilt_hint)
4242 	ev_ret_t ret = MPTS_EVRET_OK;
4243 	struct socket *mp_so, *so;
4244 	struct mptcb *mp_tp;
4245 
4246 	mp_so = mptetoso(mpte);
4247 	mp_tp = mpte->mpte_mptcb;
4248 	so = mpts->mpts_socket;
4249 	struct inpcb *inp = sotoinpcb(so);
4250 	struct tcpcb *tp = intotcpcb(inp);
4251 
4252 	if (sototcpcb(so)->t_mpflags & TMPF_MPTCP_TRUE) {
4253 		mpts->mpts_flags |= MPTSF_MP_CAPABLE;
4254 	} else {
4255 		mpts->mpts_flags &= ~MPTSF_MP_CAPABLE;
4256 	}
4257 
4258 	if (sototcpcb(so)->t_mpflags & TMPF_TCP_FALLBACK) {
4259 		if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
4260 			goto done;
4261 		}
4262 		mpts->mpts_flags |= MPTSF_MP_DEGRADED;
4263 	} else {
4264 		mpts->mpts_flags &= ~MPTSF_MP_DEGRADED;
4265 	}
4266 
4267 	if (sototcpcb(so)->t_mpflags & TMPF_MPTCP_READY) {
4268 		mpts->mpts_flags |= MPTSF_MP_READY;
4269 	} else {
4270 		mpts->mpts_flags &= ~MPTSF_MP_READY;
4271 	}
4272 
4273 	if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
4274 		mp_tp->mpt_flags |= MPTCPF_FALLBACK_TO_TCP;
4275 		mp_tp->mpt_flags &= ~MPTCPF_JOIN_READY;
4276 		tcp_cache_update_mptcp_version(tp, FALSE);
4277 	}
4278 
4279 	if (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) {
4280 		ret = MPTS_EVRET_DISCONNECT_FALLBACK;
4281 
4282 		m_freem_list(mpte->mpte_reinjectq);
4283 		mpte->mpte_reinjectq = NULL;
4284 	} else if (mpts->mpts_flags & MPTSF_MP_READY) {
4285 		mp_tp->mpt_flags |= MPTCPF_JOIN_READY;
4286 		ret = MPTS_EVRET_CONNECT_PENDING;
4287 	}
4288 
4289 done:
4290 	return ret;
4291 }
4292 
4293 /*
4294  * Handle SO_FILT_HINT_MUSTRST subflow socket event
4295  */
4296 static ev_ret_t
mptcp_subflow_mustrst_ev(struct mptses * mpte,struct mptsub * mpts,uint32_t * p_mpsofilt_hint,uint32_t event)4297 mptcp_subflow_mustrst_ev(struct mptses *mpte, struct mptsub *mpts,
4298     uint32_t *p_mpsofilt_hint, uint32_t event)
4299 {
4300 #pragma unused(event)
4301 	struct socket *mp_so, *so;
4302 	struct mptcb *mp_tp;
4303 	boolean_t is_fastclose;
4304 
4305 	mp_so = mptetoso(mpte);
4306 	mp_tp = mpte->mpte_mptcb;
4307 	so = mpts->mpts_socket;
4308 
4309 	/* We got an invalid option or a fast close */
4310 	struct inpcb *inp = sotoinpcb(so);
4311 	struct tcpcb *tp = NULL;
4312 
4313 	tp = intotcpcb(inp);
4314 	so->so_error = ECONNABORTED;
4315 
4316 	is_fastclose = !!(tp->t_mpflags & TMPF_FASTCLOSERCV);
4317 
4318 	tp->t_mpflags |= TMPF_RESET;
4319 
4320 	if (tp->t_state != TCPS_CLOSED) {
4321 		struct mbuf *m;
4322 		struct tcptemp *t_template = tcp_maketemplate(tp, &m);
4323 
4324 		if (t_template) {
4325 			struct tcp_respond_args tra;
4326 
4327 			bzero(&tra, sizeof(tra));
4328 			if (inp->inp_flags & INP_BOUND_IF) {
4329 				tra.ifscope = inp->inp_boundifp->if_index;
4330 			} else {
4331 				tra.ifscope = IFSCOPE_NONE;
4332 			}
4333 			tra.awdl_unrestricted = 1;
4334 
4335 			tcp_respond(tp, t_template->tt_ipgen,
4336 			    &t_template->tt_t, (struct mbuf *)NULL,
4337 			    tp->rcv_nxt, tp->snd_una, TH_RST, &tra);
4338 			(void) m_free(m);
4339 		}
4340 	}
4341 
4342 	if (!(mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) && is_fastclose) {
4343 		struct mptsub *iter, *tmp;
4344 
4345 		*p_mpsofilt_hint |= SO_FILT_HINT_CONNRESET;
4346 
4347 		mp_so->so_error = ECONNRESET;
4348 
4349 		TAILQ_FOREACH_SAFE(iter, &mpte->mpte_subflows, mpts_entry, tmp) {
4350 			if (iter == mpts) {
4351 				continue;
4352 			}
4353 			mptcp_subflow_abort(iter, ECONNABORTED);
4354 		}
4355 
4356 		/*
4357 		 * mptcp_drop is being called after processing the events, to fully
4358 		 * close the MPTCP connection
4359 		 */
4360 		mptcp_drop(mpte, mp_tp, mp_so->so_error);
4361 	}
4362 
4363 	mptcp_subflow_abort(mpts, ECONNABORTED);
4364 
4365 	if (mp_tp->mpt_gc_ticks == MPT_GC_TICKS) {
4366 		mp_tp->mpt_gc_ticks = MPT_GC_TICKS_FAST;
4367 	}
4368 
4369 	return MPTS_EVRET_DELETE;
4370 }
4371 
4372 static ev_ret_t
mptcp_subflow_adaptive_rtimo_ev(struct mptses * mpte,struct mptsub * mpts,uint32_t * p_mpsofilt_hint,uint32_t event)4373 mptcp_subflow_adaptive_rtimo_ev(struct mptses *mpte, struct mptsub *mpts,
4374     uint32_t *p_mpsofilt_hint, uint32_t event)
4375 {
4376 #pragma unused(event)
4377 	bool found_active = false;
4378 
4379 	mpts->mpts_flags |= MPTSF_READ_STALL;
4380 
4381 	TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
4382 		struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
4383 
4384 		if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
4385 		    TCPS_HAVERCVDFIN2(tp->t_state)) {
4386 			continue;
4387 		}
4388 
4389 		if (!(mpts->mpts_flags & MPTSF_READ_STALL)) {
4390 			found_active = true;
4391 			break;
4392 		}
4393 	}
4394 
4395 	if (!found_active) {
4396 		*p_mpsofilt_hint |= SO_FILT_HINT_ADAPTIVE_RTIMO;
4397 	}
4398 
4399 	return MPTS_EVRET_OK;
4400 }
4401 
4402 static ev_ret_t
mptcp_subflow_adaptive_wtimo_ev(struct mptses * mpte,struct mptsub * mpts,uint32_t * p_mpsofilt_hint,uint32_t event)4403 mptcp_subflow_adaptive_wtimo_ev(struct mptses *mpte, struct mptsub *mpts,
4404     uint32_t *p_mpsofilt_hint, uint32_t event)
4405 {
4406 #pragma unused(event)
4407 	bool found_active = false;
4408 
4409 	mpts->mpts_flags |= MPTSF_WRITE_STALL;
4410 
4411 	TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
4412 		struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
4413 
4414 		if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
4415 		    tp->t_state > TCPS_CLOSE_WAIT) {
4416 			continue;
4417 		}
4418 
4419 		if (!(mpts->mpts_flags & MPTSF_WRITE_STALL)) {
4420 			found_active = true;
4421 			break;
4422 		}
4423 	}
4424 
4425 	if (!found_active) {
4426 		*p_mpsofilt_hint |= SO_FILT_HINT_ADAPTIVE_WTIMO;
4427 	}
4428 
4429 	return MPTS_EVRET_OK;
4430 }
4431 
4432 /*
4433  * Issues SOPT_SET on an MPTCP subflow socket; socket must already be locked,
4434  * caller must ensure that the option can be issued on subflow sockets, via
4435  * MPOF_SUBFLOW_OK flag.
4436  */
4437 int
mptcp_subflow_sosetopt(struct mptses * mpte,struct mptsub * mpts,struct mptopt * mpo)4438 mptcp_subflow_sosetopt(struct mptses *mpte, struct mptsub *mpts, struct mptopt *mpo)
4439 {
4440 	struct socket *mp_so, *so;
4441 	struct sockopt sopt;
4442 	int error;
4443 
4444 	VERIFY(mpo->mpo_flags & MPOF_SUBFLOW_OK);
4445 
4446 	mp_so = mptetoso(mpte);
4447 	so = mpts->mpts_socket;
4448 
4449 	socket_lock_assert_owned(mp_so);
4450 
4451 	if (mpte->mpte_mptcb->mpt_state >= MPTCPS_ESTABLISHED &&
4452 	    mpo->mpo_level == SOL_SOCKET &&
4453 	    mpo->mpo_name == SO_MARK_CELLFALLBACK) {
4454 		struct ifnet *ifp = ifindex2ifnet[mpts->mpts_ifscope];
4455 
4456 		/*
4457 		 * When we open a new subflow, mark it as cell fallback, if
4458 		 * this subflow goes over cell.
4459 		 *
4460 		 * (except for first-party apps)
4461 		 */
4462 
4463 		if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
4464 			return 0;
4465 		}
4466 
4467 		if (sotoinpcb(so)->inp_last_outifp &&
4468 		    !IFNET_IS_CELLULAR(sotoinpcb(so)->inp_last_outifp)) {
4469 			return 0;
4470 		}
4471 
4472 		/*
4473 		 * This here is an OR, because if the app is not binding to the
4474 		 * interface, then it definitely is not a cell-fallback
4475 		 * connection.
4476 		 */
4477 		if (mpts->mpts_ifscope == IFSCOPE_NONE || ifp == NULL ||
4478 		    !IFNET_IS_CELLULAR(ifp)) {
4479 			return 0;
4480 		}
4481 	}
4482 
4483 	mpo->mpo_flags &= ~MPOF_INTERIM;
4484 
4485 	bzero(&sopt, sizeof(sopt));
4486 	sopt.sopt_dir = SOPT_SET;
4487 	sopt.sopt_level = mpo->mpo_level;
4488 	sopt.sopt_name = mpo->mpo_name;
4489 	sopt.sopt_val = CAST_USER_ADDR_T(&mpo->mpo_intval);
4490 	sopt.sopt_valsize = sizeof(int);
4491 	sopt.sopt_p = kernproc;
4492 
4493 	error = sosetoptlock(so, &sopt, 0);
4494 	if (error) {
4495 		os_log_error(mptcp_log_handle, "%s - %lx: sopt %s "
4496 		    "val %d set error %d\n", __func__,
4497 		    (unsigned long)VM_KERNEL_ADDRPERM(mpte),
4498 		    mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name),
4499 		    mpo->mpo_intval, error);
4500 	}
4501 	return error;
4502 }
4503 
4504 /*
4505  * Issues SOPT_GET on an MPTCP subflow socket; socket must already be locked,
4506  * caller must ensure that the option can be issued on subflow sockets, via
4507  * MPOF_SUBFLOW_OK flag.
4508  */
4509 int
mptcp_subflow_sogetopt(struct mptses * mpte,struct socket * so,struct mptopt * mpo)4510 mptcp_subflow_sogetopt(struct mptses *mpte, struct socket *so,
4511     struct mptopt *mpo)
4512 {
4513 	struct socket *mp_so;
4514 	struct sockopt sopt;
4515 	int error;
4516 
4517 	VERIFY(mpo->mpo_flags & MPOF_SUBFLOW_OK);
4518 	mp_so = mptetoso(mpte);
4519 
4520 	socket_lock_assert_owned(mp_so);
4521 
4522 	bzero(&sopt, sizeof(sopt));
4523 	sopt.sopt_dir = SOPT_GET;
4524 	sopt.sopt_level = mpo->mpo_level;
4525 	sopt.sopt_name = mpo->mpo_name;
4526 	sopt.sopt_val = CAST_USER_ADDR_T(&mpo->mpo_intval);
4527 	sopt.sopt_valsize = sizeof(int);
4528 	sopt.sopt_p = kernproc;
4529 
4530 	error = sogetoptlock(so, &sopt, 0);     /* already locked */
4531 	if (error) {
4532 		os_log_error(mptcp_log_handle,
4533 		    "%s - %lx: sopt %s get error %d\n",
4534 		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
4535 		    mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name), error);
4536 	}
4537 	return error;
4538 }
4539 
4540 
4541 /*
4542  * MPTCP garbage collector.
4543  *
4544  * This routine is called by the MP domain on-demand, periodic callout,
4545  * which is triggered when a MPTCP socket is closed.  The callout will
4546  * repeat as long as this routine returns a non-zero value.
4547  */
4548 static uint32_t
mptcp_gc(struct mppcbinfo * mppi)4549 mptcp_gc(struct mppcbinfo *mppi)
4550 {
4551 	struct mppcb *mpp, *tmpp;
4552 	uint32_t active = 0;
4553 
4554 	LCK_MTX_ASSERT(&mppi->mppi_lock, LCK_MTX_ASSERT_OWNED);
4555 
4556 	TAILQ_FOREACH_SAFE(mpp, &mppi->mppi_pcbs, mpp_entry, tmpp) {
4557 		struct socket *mp_so;
4558 		struct mptses *mpte;
4559 		struct mptcb *mp_tp;
4560 
4561 		mp_so = mpp->mpp_socket;
4562 		mpte = mptompte(mpp);
4563 		mp_tp = mpte->mpte_mptcb;
4564 
4565 		if (!mpp_try_lock(mpp)) {
4566 			active++;
4567 			continue;
4568 		}
4569 
4570 		VERIFY(mpp->mpp_flags & MPP_ATTACHED);
4571 
4572 		/* check again under the lock */
4573 		if (mp_so->so_usecount > 0) {
4574 			boolean_t wakeup = FALSE;
4575 			struct mptsub *mpts, *tmpts;
4576 
4577 			if (mp_tp->mpt_state >= MPTCPS_FIN_WAIT_1) {
4578 				if (mp_tp->mpt_gc_ticks > 0) {
4579 					mp_tp->mpt_gc_ticks--;
4580 				}
4581 				if (mp_tp->mpt_gc_ticks == 0) {
4582 					wakeup = TRUE;
4583 				}
4584 			}
4585 			if (wakeup) {
4586 				TAILQ_FOREACH_SAFE(mpts,
4587 				    &mpte->mpte_subflows, mpts_entry, tmpts) {
4588 					mptcp_subflow_eupcall1(mpts->mpts_socket,
4589 					    mpts, SO_FILT_HINT_DISCONNECTED);
4590 				}
4591 			}
4592 			socket_unlock(mp_so, 0);
4593 			active++;
4594 			continue;
4595 		}
4596 
4597 		if (mpp->mpp_state != MPPCB_STATE_DEAD) {
4598 			panic("%s - %lx: skipped state "
4599 			    "[u=%d,r=%d,s=%d]\n", __func__,
4600 			    (unsigned long)VM_KERNEL_ADDRPERM(mpte),
4601 			    mp_so->so_usecount, mp_so->so_retaincnt,
4602 			    mpp->mpp_state);
4603 		}
4604 
4605 		if (mp_tp->mpt_state == MPTCPS_TIME_WAIT) {
4606 			mptcp_close(mpte, mp_tp);
4607 		}
4608 
4609 		mptcp_session_destroy(mpte);
4610 
4611 		DTRACE_MPTCP4(dispose, struct socket *, mp_so,
4612 		    struct sockbuf *, &mp_so->so_rcv,
4613 		    struct sockbuf *, &mp_so->so_snd,
4614 		    struct mppcb *, mpp);
4615 
4616 		mptcp_pcbdispose(mpp);
4617 		sodealloc(mp_so);
4618 	}
4619 
4620 	return active;
4621 }
4622 
4623 /*
4624  * Drop a MPTCP connection, reporting the specified error.
4625  */
4626 struct mptses *
mptcp_drop(struct mptses * mpte,struct mptcb * mp_tp,u_short errno)4627 mptcp_drop(struct mptses *mpte, struct mptcb *mp_tp, u_short errno)
4628 {
4629 	struct socket *mp_so = mptetoso(mpte);
4630 
4631 	VERIFY(mpte->mpte_mptcb == mp_tp);
4632 
4633 	socket_lock_assert_owned(mp_so);
4634 
4635 	DTRACE_MPTCP2(state__change, struct mptcb *, mp_tp,
4636 	    uint32_t, 0 /* event */);
4637 
4638 	if (errno == ETIMEDOUT && mp_tp->mpt_softerror != 0) {
4639 		errno = mp_tp->mpt_softerror;
4640 	}
4641 	mp_so->so_error = errno;
4642 
4643 	return mptcp_close(mpte, mp_tp);
4644 }
4645 
4646 /*
4647  * Close a MPTCP control block.
4648  */
4649 struct mptses *
mptcp_close(struct mptses * mpte,struct mptcb * mp_tp)4650 mptcp_close(struct mptses *mpte, struct mptcb *mp_tp)
4651 {
4652 	struct mptsub *mpts = NULL, *tmpts = NULL;
4653 	struct socket *mp_so = mptetoso(mpte);
4654 
4655 	socket_lock_assert_owned(mp_so);
4656 	VERIFY(mpte->mpte_mptcb == mp_tp);
4657 
4658 	mp_tp->mpt_state = MPTCPS_TERMINATE;
4659 
4660 	mptcp_freeq(mp_tp);
4661 
4662 	soisdisconnected(mp_so);
4663 
4664 	/* Clean up all subflows */
4665 	TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
4666 		mptcp_subflow_disconnect(mpte, mpts);
4667 	}
4668 
4669 	return NULL;
4670 }
4671 
4672 void
mptcp_notify_close(struct socket * so)4673 mptcp_notify_close(struct socket *so)
4674 {
4675 	soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_DISCONNECTED));
4676 }
4677 
/*
 * One entry of the subflow-event dispatch table: an event mask and the
 * handler invoked by mptcp_subflow_events() when the mask is pending.
 */
typedef struct mptcp_subflow_event_entry {
	uint32_t    sofilt_hint_mask;
	ev_ret_t    (*sofilt_hint_ev_hdlr)(
		struct mptses *mpte,
		struct mptsub *mpts,
		uint32_t *p_mpsofilt_hint,
		uint32_t event);
} mptsub_ev_entry_t;

/*
 * XXX The order of the event handlers below is really
 * really important. Think twice before changing it.
 *
 * mptcp_subflow_events() walks this table front-to-back and stops
 * dispatching non-DISCONNECTED events once a handler returns a negative
 * (error) value, so fatal events are deliberately placed first and
 * DISCONNECTED near the end.
 */
static mptsub_ev_entry_t mpsub_ev_entry_tbl[] = {
	{
		.sofilt_hint_mask = SO_FILT_HINT_MP_SUB_ERROR,
		.sofilt_hint_ev_hdlr = mptcp_subflow_mpsuberror_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_MPCANTRCVMORE,
		.sofilt_hint_ev_hdlr =  mptcp_subflow_mpcantrcvmore_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_MPFAILOVER,
		.sofilt_hint_ev_hdlr = mptcp_subflow_failover_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_CONNRESET,
		.sofilt_hint_ev_hdlr = mptcp_subflow_propagate_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_MUSTRST,
		.sofilt_hint_ev_hdlr = mptcp_subflow_mustrst_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_CANTRCVMORE,
		.sofilt_hint_ev_hdlr = mptcp_subflow_propagate_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_TIMEOUT,
		.sofilt_hint_ev_hdlr = mptcp_subflow_propagate_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_NOSRCADDR,
		.sofilt_hint_ev_hdlr = mptcp_subflow_nosrcaddr_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_IFDENIED,
		.sofilt_hint_ev_hdlr = mptcp_subflow_ifdenied_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_CONNECTED,
		.sofilt_hint_ev_hdlr = mptcp_subflow_connected_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_MPSTATUS,
		.sofilt_hint_ev_hdlr = mptcp_subflow_mpstatus_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_DISCONNECTED,
		.sofilt_hint_ev_hdlr = mptcp_subflow_disconnected_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_ADAPTIVE_RTIMO,
		.sofilt_hint_ev_hdlr = mptcp_subflow_adaptive_rtimo_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_ADAPTIVE_WTIMO,
		.sofilt_hint_ev_hdlr = mptcp_subflow_adaptive_wtimo_ev,
	},
};
4749 
4750 /*
4751  * Subflow socket control events.
4752  *
4753  * Called for handling events related to the underlying subflow socket.
4754  */
4755 static ev_ret_t
mptcp_subflow_events(struct mptses * mpte,struct mptsub * mpts,uint32_t * p_mpsofilt_hint)4756 mptcp_subflow_events(struct mptses *mpte, struct mptsub *mpts,
4757     uint32_t *p_mpsofilt_hint)
4758 {
4759 	ev_ret_t ret = MPTS_EVRET_OK;
4760 	int i, mpsub_ev_entry_count = sizeof(mpsub_ev_entry_tbl) /
4761 	    sizeof(mpsub_ev_entry_tbl[0]);
4762 
4763 	/* bail if there's nothing to process */
4764 	if (!mpts->mpts_evctl) {
4765 		return ret;
4766 	}
4767 
4768 	if (mpts->mpts_evctl & (SO_FILT_HINT_CONNRESET | SO_FILT_HINT_MUSTRST |
4769 	    SO_FILT_HINT_CANTSENDMORE | SO_FILT_HINT_TIMEOUT |
4770 	    SO_FILT_HINT_NOSRCADDR | SO_FILT_HINT_IFDENIED |
4771 	    SO_FILT_HINT_DISCONNECTED)) {
4772 		mpts->mpts_evctl |= SO_FILT_HINT_MPFAILOVER;
4773 	}
4774 
4775 	DTRACE_MPTCP3(subflow__events, struct mptses *, mpte,
4776 	    struct mptsub *, mpts, uint32_t, mpts->mpts_evctl);
4777 
4778 	/*
4779 	 * Process all the socket filter hints and reset the hint
4780 	 * once it is handled
4781 	 */
4782 	for (i = 0; i < mpsub_ev_entry_count && mpts->mpts_evctl; i++) {
4783 		/*
4784 		 * Always execute the DISCONNECTED event, because it will wakeup
4785 		 * the app.
4786 		 */
4787 		if ((mpts->mpts_evctl & mpsub_ev_entry_tbl[i].sofilt_hint_mask) &&
4788 		    (ret >= MPTS_EVRET_OK ||
4789 		    mpsub_ev_entry_tbl[i].sofilt_hint_mask == SO_FILT_HINT_DISCONNECTED)) {
4790 			mpts->mpts_evctl &= ~mpsub_ev_entry_tbl[i].sofilt_hint_mask;
4791 			ev_ret_t error =
4792 			    mpsub_ev_entry_tbl[i].sofilt_hint_ev_hdlr(mpte, mpts, p_mpsofilt_hint, mpsub_ev_entry_tbl[i].sofilt_hint_mask);
4793 			ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
4794 		}
4795 	}
4796 
4797 	return ret;
4798 }
4799 
4800 /*
4801  * MPTCP workloop.
4802  */
4803 void
mptcp_subflow_workloop(struct mptses * mpte)4804 mptcp_subflow_workloop(struct mptses *mpte)
4805 {
4806 	boolean_t connect_pending = FALSE, disconnect_fallback = FALSE;
4807 	uint32_t mpsofilt_hint_mask = SO_FILT_HINT_LOCKED;
4808 	struct mptsub *mpts, *tmpts;
4809 	struct socket *mp_so;
4810 
4811 	mp_so = mptetoso(mpte);
4812 
4813 	socket_lock_assert_owned(mp_so);
4814 
4815 	if (mpte->mpte_flags & MPTE_IN_WORKLOOP) {
4816 		mpte->mpte_flags |= MPTE_WORKLOOP_RELAUNCH;
4817 		return;
4818 	}
4819 	mpte->mpte_flags |= MPTE_IN_WORKLOOP;
4820 
4821 relaunch:
4822 	mpte->mpte_flags &= ~MPTE_WORKLOOP_RELAUNCH;
4823 
4824 	TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
4825 		ev_ret_t ret;
4826 
4827 		if (mpts->mpts_socket->so_usecount == 0) {
4828 			/* Will be removed soon by tcp_garbage_collect */
4829 			continue;
4830 		}
4831 
4832 		mptcp_subflow_addref(mpts);
4833 		mpts->mpts_socket->so_usecount++;
4834 
4835 		ret = mptcp_subflow_events(mpte, mpts, &mpsofilt_hint_mask);
4836 
4837 		/*
4838 		 * If MPTCP socket is closed, disconnect all subflows.
4839 		 * This will generate a disconnect event which will
4840 		 * be handled during the next iteration, causing a
4841 		 * non-zero error to be returned above.
4842 		 */
4843 		if (mp_so->so_flags & SOF_PCBCLEARING) {
4844 			mptcp_subflow_disconnect(mpte, mpts);
4845 		}
4846 
4847 		switch (ret) {
4848 		case MPTS_EVRET_OK:
4849 			/* nothing to do */
4850 			break;
4851 		case MPTS_EVRET_DELETE:
4852 			mptcp_subflow_soclose(mpts);
4853 			break;
4854 		case MPTS_EVRET_CONNECT_PENDING:
4855 			connect_pending = TRUE;
4856 			break;
4857 		case MPTS_EVRET_DISCONNECT_FALLBACK:
4858 			disconnect_fallback = TRUE;
4859 			break;
4860 		default:
4861 			break;
4862 		}
4863 		mptcp_subflow_remref(mpts);             /* ours */
4864 
4865 		VERIFY(mpts->mpts_socket->so_usecount != 0);
4866 		mpts->mpts_socket->so_usecount--;
4867 	}
4868 
4869 	if (mpsofilt_hint_mask != SO_FILT_HINT_LOCKED) {
4870 		VERIFY(mpsofilt_hint_mask & SO_FILT_HINT_LOCKED);
4871 
4872 		if (mpsofilt_hint_mask & SO_FILT_HINT_CANTRCVMORE) {
4873 			mp_so->so_state |= SS_CANTRCVMORE;
4874 			sorwakeup(mp_so);
4875 		}
4876 
4877 		soevent(mp_so, mpsofilt_hint_mask);
4878 	}
4879 
4880 	if (!connect_pending && !disconnect_fallback) {
4881 		goto exit;
4882 	}
4883 
4884 	TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
4885 		if (disconnect_fallback) {
4886 			struct socket *so = NULL;
4887 			struct inpcb *inp = NULL;
4888 			struct tcpcb *tp = NULL;
4889 
4890 			if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
4891 				continue;
4892 			}
4893 
4894 			mpts->mpts_flags |= MPTSF_MP_DEGRADED;
4895 
4896 			if (mpts->mpts_flags & (MPTSF_DISCONNECTING |
4897 			    MPTSF_DISCONNECTED)) {
4898 				continue;
4899 			}
4900 
4901 			so = mpts->mpts_socket;
4902 
4903 			/*
4904 			 * The MPTCP connection has degraded to a fallback
4905 			 * mode, so there is no point in keeping this subflow
4906 			 * regardless of its MPTCP-readiness state, unless it
4907 			 * is the primary one which we use for fallback.  This
4908 			 * assumes that the subflow used for fallback is the
4909 			 * ACTIVE one.
4910 			 */
4911 
4912 			inp = sotoinpcb(so);
4913 			tp = intotcpcb(inp);
4914 			tp->t_mpflags &=
4915 			    ~(TMPF_MPTCP_READY | TMPF_MPTCP_TRUE);
4916 			tp->t_mpflags |= TMPF_TCP_FALLBACK;
4917 
4918 			soevent(so, SO_FILT_HINT_MUSTRST);
4919 		} else if (connect_pending) {
4920 			/*
4921 			 * The MPTCP connection has progressed to a state
4922 			 * where it supports full multipath semantics; allow
4923 			 * additional joins to be attempted for all subflows
4924 			 * that are in the PENDING state.
4925 			 */
4926 			if (mpts->mpts_flags & MPTSF_CONNECT_PENDING) {
4927 				int error = mptcp_subflow_soconnectx(mpte, mpts);
4928 
4929 				if (error) {
4930 					mptcp_subflow_abort(mpts, error);
4931 				}
4932 			}
4933 		}
4934 	}
4935 
4936 exit:
4937 	if (mpte->mpte_flags & MPTE_WORKLOOP_RELAUNCH) {
4938 		goto relaunch;
4939 	}
4940 
4941 	mpte->mpte_flags &= ~MPTE_IN_WORKLOOP;
4942 }
4943 
4944 /*
4945  * Protocol pr_lock callback.
4946  */
4947 int
mptcp_lock(struct socket * mp_so,int refcount,void * lr)4948 mptcp_lock(struct socket *mp_so, int refcount, void *lr)
4949 {
4950 	struct mppcb *mpp = mpsotomppcb(mp_so);
4951 	void *lr_saved;
4952 
4953 	if (lr == NULL) {
4954 		lr_saved = __builtin_return_address(0);
4955 	} else {
4956 		lr_saved = lr;
4957 	}
4958 
4959 	if (mpp == NULL) {
4960 		panic("%s: so=%p NO PCB! lr=%p lrh= %s", __func__,
4961 		    mp_so, lr_saved, solockhistory_nr(mp_so));
4962 		/* NOTREACHED */
4963 	}
4964 	mpp_lock(mpp);
4965 
4966 	if (mp_so->so_usecount < 0) {
4967 		panic("%s: so=%p so_pcb=%p lr=%p ref=%x lrh= %s", __func__,
4968 		    mp_so, mp_so->so_pcb, lr_saved, mp_so->so_usecount,
4969 		    solockhistory_nr(mp_so));
4970 		/* NOTREACHED */
4971 	}
4972 	if (refcount != 0) {
4973 		mp_so->so_usecount++;
4974 		mpp->mpp_inside++;
4975 	}
4976 	mp_so->lock_lr[mp_so->next_lock_lr] = lr_saved;
4977 	mp_so->next_lock_lr = (mp_so->next_lock_lr + 1) % SO_LCKDBG_MAX;
4978 
4979 	return 0;
4980 }
4981 
4982 /*
4983  * Protocol pr_unlock callback.
4984  */
4985 int
mptcp_unlock(struct socket * mp_so,int refcount,void * lr)4986 mptcp_unlock(struct socket *mp_so, int refcount, void *lr)
4987 {
4988 	struct mppcb *mpp = mpsotomppcb(mp_so);
4989 	void *lr_saved;
4990 
4991 	if (lr == NULL) {
4992 		lr_saved = __builtin_return_address(0);
4993 	} else {
4994 		lr_saved = lr;
4995 	}
4996 
4997 	if (mpp == NULL) {
4998 		panic("%s: so=%p NO PCB usecount=%x lr=%p lrh= %s", __func__,
4999 		    mp_so, mp_so->so_usecount, lr_saved,
5000 		    solockhistory_nr(mp_so));
5001 		/* NOTREACHED */
5002 	}
5003 	socket_lock_assert_owned(mp_so);
5004 
5005 	if (refcount != 0) {
5006 		mp_so->so_usecount--;
5007 		mpp->mpp_inside--;
5008 	}
5009 
5010 	if (mp_so->so_usecount < 0) {
5011 		panic("%s: so=%p usecount=%x lrh= %s", __func__,
5012 		    mp_so, mp_so->so_usecount, solockhistory_nr(mp_so));
5013 		/* NOTREACHED */
5014 	}
5015 	if (mpp->mpp_inside < 0) {
5016 		panic("%s: mpp=%p inside=%x lrh= %s", __func__,
5017 		    mpp, mpp->mpp_inside, solockhistory_nr(mp_so));
5018 		/* NOTREACHED */
5019 	}
5020 	mp_so->unlock_lr[mp_so->next_unlock_lr] = lr_saved;
5021 	mp_so->next_unlock_lr = (mp_so->next_unlock_lr + 1) % SO_LCKDBG_MAX;
5022 	mpp_unlock(mpp);
5023 
5024 	return 0;
5025 }
5026 
5027 /*
5028  * Protocol pr_getlock callback.
5029  */
lck_mtx_t *
mptcp_getlock(struct socket *mp_so, int flags)
{
	struct mppcb *mpp = mpsotomppcb(mp_so);

	/* A socket without its MP PCB can no longer be locked sanely. */
	if (mpp == NULL) {
		panic("%s: so=%p NULL so_pcb %s", __func__, mp_so,
		    solockhistory_nr(mp_so));
		/* NOTREACHED */
	}
	/* Negative use count indicates a refcounting bug somewhere. */
	if (mp_so->so_usecount < 0) {
		panic("%s: so=%p usecount=%x lrh= %s", __func__,
		    mp_so, mp_so->so_usecount, solockhistory_nr(mp_so));
		/* NOTREACHED */
	}
	/* Hand back the per-PCB mutex used by the socket layer. */
	return mpp_getlock(mpp, flags);
}
5047 
5048 void
mptcp_get_rands(mptcp_addr_id addr_id,struct mptcb * mp_tp,u_int32_t * lrand,u_int32_t * rrand)5049 mptcp_get_rands(mptcp_addr_id addr_id, struct mptcb *mp_tp, u_int32_t *lrand,
5050     u_int32_t *rrand)
5051 {
5052 	struct mptcp_subf_auth_entry *sauth_entry;
5053 
5054 	LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
5055 		if (sauth_entry->msae_laddr_id == addr_id) {
5056 			if (lrand) {
5057 				*lrand = sauth_entry->msae_laddr_rand;
5058 			}
5059 			if (rrand) {
5060 				*rrand = sauth_entry->msae_raddr_rand;
5061 			}
5062 			break;
5063 		}
5064 	}
5065 }
5066 
5067 void
mptcp_set_raddr_rand(mptcp_addr_id laddr_id,struct mptcb * mp_tp,mptcp_addr_id raddr_id,u_int32_t raddr_rand)5068 mptcp_set_raddr_rand(mptcp_addr_id laddr_id, struct mptcb *mp_tp,
5069     mptcp_addr_id raddr_id, u_int32_t raddr_rand)
5070 {
5071 	struct mptcp_subf_auth_entry *sauth_entry;
5072 
5073 	LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
5074 		if (sauth_entry->msae_laddr_id == laddr_id) {
5075 			if ((sauth_entry->msae_raddr_id != 0) &&
5076 			    (sauth_entry->msae_raddr_id != raddr_id)) {
5077 				os_log_error(mptcp_log_handle, "%s - %lx: mismatched"
5078 				    " address ids %d %d \n", __func__, (unsigned long)VM_KERNEL_ADDRPERM(mp_tp->mpt_mpte),
5079 				    raddr_id, sauth_entry->msae_raddr_id);
5080 				return;
5081 			}
5082 			sauth_entry->msae_raddr_id = raddr_id;
5083 			if ((sauth_entry->msae_raddr_rand != 0) &&
5084 			    (sauth_entry->msae_raddr_rand != raddr_rand)) {
5085 				os_log_error(mptcp_log_handle, "%s - %lx: "
5086 				    "dup SYN_ACK %d %d \n",
5087 				    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mp_tp->mpt_mpte),
5088 				    raddr_rand, sauth_entry->msae_raddr_rand);
5089 				return;
5090 			}
5091 			sauth_entry->msae_raddr_rand = raddr_rand;
5092 			return;
5093 		}
5094 	}
5095 }
5096 
5097 /*
5098  * SHA-256 support for MPTCP
5099  */
5100 
5101 static void
mptcp_do_sha256(mptcp_key_t * key,char * sha_digest)5102 mptcp_do_sha256(mptcp_key_t *key, char *sha_digest)
5103 {
5104 	const unsigned char *sha2_base;
5105 	int sha2_size;
5106 
5107 	sha2_base = (const unsigned char *) key;
5108 	sha2_size = sizeof(mptcp_key_t);
5109 
5110 	SHA256_CTX sha_ctx;
5111 	SHA256_Init(&sha_ctx);
5112 	SHA256_Update(&sha_ctx, sha2_base, sha2_size);
5113 	SHA256_Final(sha_digest, &sha_ctx);
5114 }
5115 
5116 void
mptcp_hmac_sha256(mptcp_key_t key1,mptcp_key_t key2,u_char * msg,uint16_t msg_len,u_char * digest)5117 mptcp_hmac_sha256(mptcp_key_t key1, mptcp_key_t key2,
5118     u_char *msg, uint16_t msg_len, u_char *digest)
5119 {
5120 	SHA256_CTX sha_ctx;
5121 	mptcp_key_t key_ipad[8] = {0}; /* key XOR'd with inner pad */
5122 	mptcp_key_t key_opad[8] = {0}; /* key XOR'd with outer pad */
5123 	int i;
5124 
5125 	bzero(digest, SHA256_DIGEST_LENGTH);
5126 
5127 	/* Set up the Key for HMAC */
5128 	key_ipad[0] = key1;
5129 	key_ipad[1] = key2;
5130 
5131 	key_opad[0] = key1;
5132 	key_opad[1] = key2;
5133 
5134 	/* Key is 512 block length, so no need to compute hash */
5135 
5136 	/* Compute SHA1(Key XOR opad, SHA1(Key XOR ipad, data)) */
5137 
5138 	for (i = 0; i < 8; i++) {
5139 		key_ipad[i] ^= 0x3636363636363636;
5140 		key_opad[i] ^= 0x5c5c5c5c5c5c5c5c;
5141 	}
5142 
5143 	/* Perform inner SHA256 */
5144 	SHA256_Init(&sha_ctx);
5145 	SHA256_Update(&sha_ctx, (unsigned char *)key_ipad, sizeof(key_ipad));
5146 	SHA256_Update(&sha_ctx, msg, msg_len);
5147 	SHA256_Final(digest, &sha_ctx);
5148 
5149 	/* Perform outer SHA256 */
5150 	SHA256_Init(&sha_ctx);
5151 	SHA256_Update(&sha_ctx, (unsigned char *)key_opad, sizeof(key_opad));
5152 	SHA256_Update(&sha_ctx, (unsigned char *)digest, SHA256_DIGEST_LENGTH);
5153 	SHA256_Final(digest, &sha_ctx);
5154 }
5155 
5156 /*
5157  * SHA1 support for MPTCP
5158  */
5159 
5160 static void
mptcp_do_sha1(mptcp_key_t * key,char * sha_digest)5161 mptcp_do_sha1(mptcp_key_t *key, char *sha_digest)
5162 {
5163 	SHA1_CTX sha1ctxt;
5164 	const unsigned char *sha1_base;
5165 	int sha1_size;
5166 
5167 	sha1_base = (const unsigned char *) key;
5168 	sha1_size = sizeof(mptcp_key_t);
5169 	SHA1Init(&sha1ctxt);
5170 	SHA1Update(&sha1ctxt, sha1_base, sha1_size);
5171 	SHA1Final(sha_digest, &sha1ctxt);
5172 }
5173 
5174 void
mptcp_hmac_sha1(mptcp_key_t key1,mptcp_key_t key2,u_int32_t rand1,u_int32_t rand2,u_char * digest)5175 mptcp_hmac_sha1(mptcp_key_t key1, mptcp_key_t key2,
5176     u_int32_t rand1, u_int32_t rand2, u_char *digest)
5177 {
5178 	SHA1_CTX  sha1ctxt;
5179 	mptcp_key_t key_ipad[8] = {0}; /* key XOR'd with inner pad */
5180 	mptcp_key_t key_opad[8] = {0}; /* key XOR'd with outer pad */
5181 	u_int32_t data[2];
5182 	int i;
5183 
5184 	bzero(digest, SHA1_RESULTLEN);
5185 
5186 	/* Set up the Key for HMAC */
5187 	key_ipad[0] = key1;
5188 	key_ipad[1] = key2;
5189 
5190 	key_opad[0] = key1;
5191 	key_opad[1] = key2;
5192 
5193 	/* Set up the message for HMAC */
5194 	data[0] = rand1;
5195 	data[1] = rand2;
5196 
5197 	/* Key is 512 block length, so no need to compute hash */
5198 
5199 	/* Compute SHA1(Key XOR opad, SHA1(Key XOR ipad, data)) */
5200 
5201 	for (i = 0; i < 8; i++) {
5202 		key_ipad[i] ^= 0x3636363636363636;
5203 		key_opad[i] ^= 0x5c5c5c5c5c5c5c5c;
5204 	}
5205 
5206 	/* Perform inner SHA1 */
5207 	SHA1Init(&sha1ctxt);
5208 	SHA1Update(&sha1ctxt, (unsigned char *)key_ipad, sizeof(key_ipad));
5209 	SHA1Update(&sha1ctxt, (unsigned char *)data, sizeof(data));
5210 	SHA1Final(digest, &sha1ctxt);
5211 
5212 	/* Perform outer SHA1 */
5213 	SHA1Init(&sha1ctxt);
5214 	SHA1Update(&sha1ctxt, (unsigned char *)key_opad, sizeof(key_opad));
5215 	SHA1Update(&sha1ctxt, (unsigned char *)digest, SHA1_RESULTLEN);
5216 	SHA1Final(digest, &sha1ctxt);
5217 }
5218 
5219 /*
5220  * corresponds to MAC-B = MAC (Key=(Key-B+Key-A), Msg=(R-B+R-A))
5221  * corresponds to MAC-A = MAC (Key=(Key-A+Key-B), Msg=(R-A+R-B))
5222  */
5223 void
mptcp_get_mpjoin_hmac(mptcp_addr_id aid,struct mptcb * mp_tp,u_char * digest,uint8_t digest_len)5224 mptcp_get_mpjoin_hmac(mptcp_addr_id aid, struct mptcb *mp_tp, u_char *digest, uint8_t digest_len)
5225 {
5226 	uint32_t lrand, rrand;
5227 
5228 	lrand = rrand = 0;
5229 	mptcp_get_rands(aid, mp_tp, &lrand, &rrand);
5230 
5231 	u_char full_digest[MAX(SHA1_RESULTLEN, SHA256_DIGEST_LENGTH)] = {0};
5232 	if (mp_tp->mpt_version == MPTCP_VERSION_0) {
5233 		mptcp_hmac_sha1(mp_tp->mpt_localkey, mp_tp->mpt_remotekey, lrand, rrand, full_digest);
5234 	} else {
5235 		uint32_t data[2];
5236 		data[0] = lrand;
5237 		data[1] = rrand;
5238 		mptcp_hmac_sha256(mp_tp->mpt_localkey, mp_tp->mpt_remotekey, (u_char*)data, 8, full_digest);
5239 	}
5240 	bcopy(full_digest, digest, digest_len);
5241 }
5242 
5243 /*
5244  * Authentication data generation
5245  */
5246 static void
mptcp_generate_token(char * sha_digest,int sha_digest_len,caddr_t token,int token_len)5247 mptcp_generate_token(char *sha_digest, int sha_digest_len, caddr_t token,
5248     int token_len)
5249 {
5250 	VERIFY(token_len == sizeof(u_int32_t));
5251 	VERIFY(sha_digest_len == SHA1_RESULTLEN ||
5252 	    sha_digest_len == SHA256_DIGEST_LENGTH);
5253 
5254 	/* Most significant 32 bits of the SHA1/SHA256 hash */
5255 	bcopy(sha_digest, token, sizeof(u_int32_t));
5256 	return;
5257 }
5258 
5259 static void
mptcp_generate_idsn(char * sha_digest,int sha_digest_len,caddr_t idsn,int idsn_len,uint8_t mp_version)5260 mptcp_generate_idsn(char *sha_digest, int sha_digest_len, caddr_t idsn,
5261     int idsn_len, uint8_t mp_version)
5262 {
5263 	VERIFY(idsn_len == sizeof(u_int64_t));
5264 	VERIFY(sha_digest_len == SHA1_RESULTLEN ||
5265 	    sha_digest_len == SHA256_DIGEST_LENGTH);
5266 	VERIFY(mp_version == MPTCP_VERSION_0 || mp_version == MPTCP_VERSION_1);
5267 
5268 	/*
5269 	 * Least significant 64 bits of the hash
5270 	 */
5271 
5272 	if (mp_version == MPTCP_VERSION_0) {
5273 		idsn[7] = sha_digest[12];
5274 		idsn[6] = sha_digest[13];
5275 		idsn[5] = sha_digest[14];
5276 		idsn[4] = sha_digest[15];
5277 		idsn[3] = sha_digest[16];
5278 		idsn[2] = sha_digest[17];
5279 		idsn[1] = sha_digest[18];
5280 		idsn[0] = sha_digest[19];
5281 	} else {
5282 		idsn[7] = sha_digest[24];
5283 		idsn[6] = sha_digest[25];
5284 		idsn[5] = sha_digest[26];
5285 		idsn[4] = sha_digest[27];
5286 		idsn[3] = sha_digest[28];
5287 		idsn[2] = sha_digest[29];
5288 		idsn[1] = sha_digest[30];
5289 		idsn[0] = sha_digest[31];
5290 	}
5291 	return;
5292 }
5293 
5294 static void
mptcp_conn_properties(struct mptcb * mp_tp)5295 mptcp_conn_properties(struct mptcb *mp_tp)
5296 {
5297 	/* Set DSS checksum flag */
5298 	if (mptcp_dss_csum) {
5299 		mp_tp->mpt_flags |= MPTCPF_CHECKSUM;
5300 	}
5301 
5302 	/* Set up receive window */
5303 	mp_tp->mpt_rcvwnd = mptcp_sbspace(mp_tp);
5304 
5305 	/* Set up gc ticks */
5306 	mp_tp->mpt_gc_ticks = MPT_GC_TICKS;
5307 }
5308 
5309 static void
mptcp_init_local_parms(struct mptses * mpte,struct sockaddr * dst)5310 mptcp_init_local_parms(struct mptses *mpte, struct sockaddr* dst)
5311 {
5312 	struct mptcb *mp_tp = mpte->mpte_mptcb;
5313 	char key_digest[MAX(SHA1_RESULTLEN, SHA256_DIGEST_LENGTH)];
5314 	uint16_t digest_len;
5315 
5316 	if (mpte->mpte_flags & MPTE_FORCE_V0 || !mptcp_enable_v1) {
5317 		mp_tp->mpt_version = MPTCP_VERSION_0;
5318 	} else if (mpte->mpte_flags & MPTE_FORCE_V1 && mptcp_enable_v1) {
5319 		mp_tp->mpt_version = MPTCP_VERSION_1;
5320 	} else {
5321 		mp_tp->mpt_version = tcp_cache_get_mptcp_version(dst);
5322 	}
5323 	VERIFY(mp_tp->mpt_version == MPTCP_VERSION_0 ||
5324 	    mp_tp->mpt_version == MPTCP_VERSION_1);
5325 
5326 	read_frandom(&mp_tp->mpt_localkey, sizeof(mp_tp->mpt_localkey));
5327 	if (mp_tp->mpt_version == MPTCP_VERSION_0) {
5328 		digest_len = SHA1_RESULTLEN;
5329 		mptcp_do_sha1(&mp_tp->mpt_localkey, key_digest);
5330 	} else {
5331 		digest_len = SHA256_DIGEST_LENGTH;
5332 		mptcp_do_sha256(&mp_tp->mpt_localkey, key_digest);
5333 	}
5334 
5335 	mptcp_generate_token(key_digest, digest_len,
5336 	    (caddr_t)&mp_tp->mpt_localtoken, sizeof(mp_tp->mpt_localtoken));
5337 	mptcp_generate_idsn(key_digest, digest_len,
5338 	    (caddr_t)&mp_tp->mpt_local_idsn, sizeof(u_int64_t), mp_tp->mpt_version);
5339 	/* The subflow SYN is also first MPTCP byte */
5340 	mp_tp->mpt_snduna = mp_tp->mpt_sndmax = mp_tp->mpt_local_idsn + 1;
5341 	mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;
5342 
5343 	mptcp_conn_properties(mp_tp);
5344 }
5345 
5346 int
mptcp_init_remote_parms(struct mptcb * mp_tp)5347 mptcp_init_remote_parms(struct mptcb *mp_tp)
5348 {
5349 	/* Setup local and remote tokens and Initial DSNs */
5350 	char remote_digest[MAX(SHA1_RESULTLEN, SHA256_DIGEST_LENGTH)];
5351 	uint16_t digest_len;
5352 
5353 	if (mp_tp->mpt_version == MPTCP_VERSION_0) {
5354 		digest_len = SHA1_RESULTLEN;
5355 		mptcp_do_sha1(&mp_tp->mpt_remotekey, remote_digest);
5356 	} else if (mp_tp->mpt_version == MPTCP_VERSION_1) {
5357 		digest_len = SHA256_DIGEST_LENGTH;
5358 		mptcp_do_sha256(&mp_tp->mpt_remotekey, remote_digest);
5359 	} else {
5360 		return -1;
5361 	}
5362 
5363 	mptcp_generate_token(remote_digest, digest_len,
5364 	    (caddr_t)&mp_tp->mpt_remotetoken, sizeof(mp_tp->mpt_remotetoken));
5365 	mptcp_generate_idsn(remote_digest, digest_len,
5366 	    (caddr_t)&mp_tp->mpt_remote_idsn, sizeof(u_int64_t), mp_tp->mpt_version);
5367 	mp_tp->mpt_rcvnxt = mp_tp->mpt_remote_idsn + 1;
5368 	mp_tp->mpt_rcvadv = mp_tp->mpt_rcvnxt + mp_tp->mpt_rcvwnd;
5369 	return 0;
5370 }
5371 
5372 static void
mptcp_send_dfin(struct socket * so)5373 mptcp_send_dfin(struct socket *so)
5374 {
5375 	struct tcpcb *tp = NULL;
5376 	struct inpcb *inp = NULL;
5377 
5378 	inp = sotoinpcb(so);
5379 	if (!inp) {
5380 		return;
5381 	}
5382 
5383 	tp = intotcpcb(inp);
5384 	if (!tp) {
5385 		return;
5386 	}
5387 
5388 	if (!(tp->t_mpflags & TMPF_RESET)) {
5389 		tp->t_mpflags |= TMPF_SEND_DFIN;
5390 	}
5391 }
5392 
5393 /*
5394  * Data Sequence Mapping routines
5395  */
5396 void
mptcp_insert_dsn(struct mppcb * mpp,struct mbuf * m)5397 mptcp_insert_dsn(struct mppcb *mpp, struct mbuf *m)
5398 {
5399 	struct mptcb *mp_tp;
5400 
5401 	if (m == NULL) {
5402 		return;
5403 	}
5404 
5405 	mp_tp = &__container_of(mpp, struct mpp_mtp, mpp)->mtcb;
5406 
5407 	while (m) {
5408 		VERIFY(m->m_flags & M_PKTHDR);
5409 		m->m_pkthdr.pkt_flags |= (PKTF_MPTCP | PKTF_MPSO);
5410 		m->m_pkthdr.mp_dsn = mp_tp->mpt_sndmax;
5411 		VERIFY(m_pktlen(m) >= 0 && m_pktlen(m) < UINT16_MAX);
5412 		m->m_pkthdr.mp_rlen = (uint16_t)m_pktlen(m);
5413 		mp_tp->mpt_sndmax += m_pktlen(m);
5414 		m = m->m_next;
5415 	}
5416 }
5417 
/*
 * In fallback mode, subflow-level acks directly imply MPTCP-level acks.
 * Walk the send-queue mappings covered by the len bytes being dropped and
 * derive the corresponding DATA_ACK to feed to mptcp_data_ack_rcvd().
 */
void
mptcp_fallback_sbdrop(struct socket *so, struct mbuf *m, int len)
{
	struct mptcb *mp_tp = tptomptp(sototcpcb(so));
	uint64_t data_ack;
	uint64_t dsn;

	VERIFY(len >= 0);

	if (!m || len == 0) {
		return;
	}

	/*
	 * Optimistically assume each mapping we pass through gets fully
	 * acked; the two post-loop corrections below fix this up.
	 */
	while (m && len > 0) {
		VERIFY(m->m_flags & M_PKTHDR);
		VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);

		data_ack = m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen;
		dsn = m->m_pkthdr.mp_dsn;

		len -= m->m_len;
		m = m->m_next;
	}

	if (m && len == 0) {
		/*
		 * If there is one more mbuf in the chain, it automatically means
		 * that up to m->mp_dsn has been ack'ed.
		 *
		 * This means, we actually correct data_ack back down (compared
		 * to what we set inside the loop - dsn + data_len). Because in
		 * the loop we are "optimistic" and assume that the full mapping
		 * will be acked. If that's not the case and we get out of the
		 * loop with m != NULL, it means only up to m->mp_dsn has been
		 * really acked.
		 */
		data_ack = m->m_pkthdr.mp_dsn;
	}

	if (len < 0) {
		/*
		 * If len is negative, meaning we acked in the middle of an mbuf,
		 * only up to this mbuf's data-sequence number has been acked
		 * at the MPTCP-level.
		 */
		data_ack = dsn;
	}

	/* We can have data in the subflow's send-queue that is being acked,
	 * while the DATA_ACK has already advanced. Thus, we should check whether
	 * or not the DATA_ACK is actually new here.
	 */
	if (MPTCP_SEQ_LEQ(data_ack, mp_tp->mpt_sndmax) &&
	    MPTCP_SEQ_GEQ(data_ack, mp_tp->mpt_snduna)) {
		mptcp_data_ack_rcvd(mp_tp, sototcpcb(so), data_ack);
	}
}
5475 
/*
 * Adjust the DSN mappings of the mbufs about to be dropped from the
 * send-buffer, so the remaining data keeps consistent dsn/rseq/rlen
 * values.  Handles the TFO-rewind case, where already-sent data must be
 * retransmitted and the sequence numbers must NOT advance.
 */
void
mptcp_preproc_sbdrop(struct socket *so, struct mbuf *m, unsigned int len)
{
	int rewinding = 0;

	/* TFO makes things complicated. */
	if (so->so_flags1 & SOF1_TFO_REWIND) {
		rewinding = 1;
		so->so_flags1 &= ~SOF1_TFO_REWIND;
	}

	/* Only walk mappings on the MPTCP-level socket or when rewinding. */
	while (m && (!(so->so_flags & SOF_MP_SUBFLOW) || rewinding)) {
		u_int32_t sub_len;
		VERIFY(m->m_flags & M_PKTHDR);
		VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);

		sub_len = m->m_pkthdr.mp_rlen;

		if (sub_len < len) {
			/* This mapping is fully consumed; empty it out. */
			m->m_pkthdr.mp_dsn += sub_len;
			if (!(m->m_pkthdr.pkt_flags & PKTF_MPSO)) {
				m->m_pkthdr.mp_rseq += sub_len;
			}
			m->m_pkthdr.mp_rlen = 0;
			len -= sub_len;
		} else {
			/* sub_len >= len */
			/* When rewinding, keep dsn/rseq so data is resent. */
			if (rewinding == 0) {
				m->m_pkthdr.mp_dsn += len;
			}
			if (!(m->m_pkthdr.pkt_flags & PKTF_MPSO)) {
				if (rewinding == 0) {
					m->m_pkthdr.mp_rseq += len;
				}
			}
			m->m_pkthdr.mp_rlen -= len;
			break;
		}
		m = m->m_next;
	}

	if (so->so_flags & SOF_MP_SUBFLOW &&
	    !(sototcpcb(so)->t_mpflags & TMPF_TFO_REQUEST) &&
	    !(sototcpcb(so)->t_mpflags & TMPF_RCVD_DACK)) {
		/*
		 * Received an ack without receiving a DATA_ACK.
		 * Need to fallback to regular TCP (or destroy this subflow).
		 */
		sototcpcb(so)->t_mpflags |= TMPF_INFIN_SENT;
		mptcp_notify_mpfail(so);
	}
}
5528 
5529 /* Obtain the DSN mapping stored in the mbuf */
5530 void
mptcp_output_getm_dsnmap32(struct socket * so,int off,uint32_t * dsn,uint32_t * relseq,uint16_t * data_len,uint16_t * dss_csum)5531 mptcp_output_getm_dsnmap32(struct socket *so, int off,
5532     uint32_t *dsn, uint32_t *relseq, uint16_t *data_len, uint16_t *dss_csum)
5533 {
5534 	u_int64_t dsn64;
5535 
5536 	mptcp_output_getm_dsnmap64(so, off, &dsn64, relseq, data_len, dss_csum);
5537 	*dsn = (u_int32_t)MPTCP_DATASEQ_LOW32(dsn64);
5538 }
5539 
/*
 * Find the DSN mapping covering byte offset off in the subflow's
 * send-buffer and return its dsn/relseq/len/csum fields.  A defunct
 * socket with an empty send-buffer yields an all-zero mapping.
 */
void
mptcp_output_getm_dsnmap64(struct socket *so, int off, uint64_t *dsn,
    uint32_t *relseq, uint16_t *data_len,
    uint16_t *dss_csum)
{
	struct mbuf *m = so->so_snd.sb_mb;

	VERIFY(off >= 0);

	if (m == NULL && (so->so_flags & SOF_DEFUNCT)) {
		*dsn = 0;
		*relseq = 0;
		*data_len = 0;
		*dss_csum = 0;
		return;
	}

	/*
	 * In the subflow socket, the DSN sequencing can be discontiguous,
	 * but the subflow sequence mapping is contiguous. Use the subflow
	 * sequence property to find the right mbuf and corresponding dsn
	 * mapping.
	 */

	/*
	 * NOTE(review): assumes off always falls within the send-buffer,
	 * so the walk ends with m != NULL — confirm callers guarantee it.
	 */
	while (m) {
		VERIFY(m->m_flags & M_PKTHDR);
		VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);

		if (off >= m->m_len) {
			off -= m->m_len;
			m = m->m_next;
		} else {
			break;
		}
	}

	VERIFY(off >= 0);
	VERIFY(m->m_pkthdr.mp_rlen <= UINT16_MAX);

	*dsn = m->m_pkthdr.mp_dsn;
	*relseq = m->m_pkthdr.mp_rseq;
	*data_len = m->m_pkthdr.mp_rlen;
	*dss_csum = m->m_pkthdr.mp_csum;
}
5584 
5585 void
mptcp_output_getm_data_level_details(struct socket * so,int off,uint16_t * data_len,uint16_t * dss_csum)5586 mptcp_output_getm_data_level_details(struct socket *so, int off, uint16_t *data_len, uint16_t *dss_csum)
5587 {
5588 	uint64_t dsn;
5589 	uint32_t relseq;
5590 
5591 	mptcp_output_getm_dsnmap64(so, off, &dsn, &relseq, data_len, dss_csum);
5592 }
5593 
5594 /*
5595  * Note that this is called only from tcp_input() via mptcp_input_preproc()
5596  * tcp_input() may trim data after the dsn mapping is inserted into the mbuf.
5597  * When it trims data tcp_input calls m_adj() which does not remove the
5598  * m_pkthdr even if the m_len becomes 0 as a result of trimming the mbuf.
5599  * The dsn map insertion cannot be delayed after trim, because data can be in
5600  * the reassembly queue for a while and the DSN option info in tp will be
5601  * overwritten for every new packet received.
5602  * The dsn map will be adjusted just prior to appending to subflow sockbuf
5603  * with mptcp_adj_rmap()
5604  */
5605 void
mptcp_insert_rmap(struct tcpcb * tp,struct mbuf * m,struct tcphdr * th)5606 mptcp_insert_rmap(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th)
5607 {
5608 	VERIFY(m->m_flags & M_PKTHDR);
5609 	VERIFY(!(m->m_pkthdr.pkt_flags & PKTF_MPTCP));
5610 
5611 	if (tp->t_mpflags & TMPF_EMBED_DSN) {
5612 		m->m_pkthdr.mp_dsn = tp->t_rcv_map.mpt_dsn;
5613 		m->m_pkthdr.mp_rseq = tp->t_rcv_map.mpt_sseq;
5614 		m->m_pkthdr.mp_rlen = tp->t_rcv_map.mpt_len;
5615 		m->m_pkthdr.mp_csum = tp->t_rcv_map.mpt_csum;
5616 		if (tp->t_rcv_map.mpt_dfin) {
5617 			m->m_pkthdr.pkt_flags |= PKTF_MPTCP_DFIN;
5618 		}
5619 
5620 		m->m_pkthdr.pkt_flags |= PKTF_MPTCP;
5621 
5622 		tp->t_mpflags &= ~TMPF_EMBED_DSN;
5623 		tp->t_mpflags |= TMPF_MPTCP_ACKNOW;
5624 	} else if (tp->t_mpflags & TMPF_TCP_FALLBACK) {
5625 		if (th->th_flags & TH_FIN) {
5626 			m->m_pkthdr.pkt_flags |= PKTF_MPTCP_DFIN;
5627 		}
5628 	}
5629 }
5630 
5631 /*
5632  * Following routines help with failure detection and failover of data
5633  * transfer from one subflow to another.
5634  */
5635 void
mptcp_act_on_txfail(struct socket * so)5636 mptcp_act_on_txfail(struct socket *so)
5637 {
5638 	struct tcpcb *tp = NULL;
5639 	struct inpcb *inp = sotoinpcb(so);
5640 
5641 	if (inp == NULL) {
5642 		return;
5643 	}
5644 
5645 	tp = intotcpcb(inp);
5646 	if (tp == NULL) {
5647 		return;
5648 	}
5649 
5650 	if (so->so_flags & SOF_MP_TRYFAILOVER) {
5651 		return;
5652 	}
5653 
5654 	so->so_flags |= SOF_MP_TRYFAILOVER;
5655 	soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPFAILOVER));
5656 }
5657 
5658 /*
5659  * Support for MP_FAIL option
5660  */
5661 int
mptcp_get_map_for_dsn(struct socket * so,uint64_t dsn_fail,uint32_t * tcp_seq)5662 mptcp_get_map_for_dsn(struct socket *so, uint64_t dsn_fail, uint32_t *tcp_seq)
5663 {
5664 	struct mbuf *m = so->so_snd.sb_mb;
5665 	uint16_t datalen;
5666 	uint64_t dsn;
5667 	int off = 0;
5668 
5669 	if (m == NULL) {
5670 		return -1;
5671 	}
5672 
5673 	while (m != NULL) {
5674 		VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);
5675 		VERIFY(m->m_flags & M_PKTHDR);
5676 		dsn = m->m_pkthdr.mp_dsn;
5677 		datalen = m->m_pkthdr.mp_rlen;
5678 		if (MPTCP_SEQ_LEQ(dsn, dsn_fail) &&
5679 		    (MPTCP_SEQ_GEQ(dsn + datalen, dsn_fail))) {
5680 			off = (int)(dsn_fail - dsn);
5681 			*tcp_seq = m->m_pkthdr.mp_rseq + off;
5682 			return 0;
5683 		}
5684 
5685 		m = m->m_next;
5686 	}
5687 
5688 	/*
5689 	 * If there was no mbuf data and a fallback to TCP occurred, there's
5690 	 * not much else to do.
5691 	 */
5692 
5693 	os_log_error(mptcp_log_handle, "%s: %llu not found \n", __func__, dsn_fail);
5694 	return -1;
5695 }
5696 
5697 /*
5698  * Support for sending contiguous MPTCP bytes in subflow
5699  * Also for preventing sending data with ACK in 3-way handshake
5700  */
/*
 * Support for sending contiguous MPTCP bytes in subflow
 * Also for preventing sending data with ACK in 3-way handshake
 */
int32_t
mptcp_adj_sendlen(struct socket *so, int32_t off)
{
	struct tcpcb *tp = sototcpcb(so);
	struct mptsub *mpts = tp->t_mpsub;
	uint64_t mdss_dsn;
	uint32_t mdss_subflow_seq;
	int mdss_subflow_off;
	uint16_t mdss_data_len;
	uint16_t dss_csum;

	/* Nothing left to send on a defunct subflow. */
	if (so->so_snd.sb_mb == NULL && (so->so_flags & SOF_DEFUNCT)) {
		return 0;
	}

	/* Locate the DSN mapping that covers byte off of the send-buffer. */
	mptcp_output_getm_dsnmap64(so, off, &mdss_dsn, &mdss_subflow_seq,
	    &mdss_data_len, &dss_csum);

	/*
	 * We need to compute how much of the mapping still remains.
	 * So, we compute the offset in the send-buffer of the dss-sub-seq.
	 */
	mdss_subflow_off = (mdss_subflow_seq + mpts->mpts_iss) - tp->snd_una;

	/*
	 * When TFO is used, we are sending the mpts->mpts_iss although the relative
	 * seq has been set to 1 (while it should be 0).
	 */
	if (tp->t_mpflags & TMPF_TFO_REQUEST) {
		mdss_subflow_off--;
	}

	VERIFY(off >= mdss_subflow_off);

	/* Remaining bytes of this mapping starting at off. */
	return mdss_data_len - (off - mdss_subflow_off);
}
5737 
5738 static uint32_t
mptcp_get_maxseg(struct mptses * mpte)5739 mptcp_get_maxseg(struct mptses *mpte)
5740 {
5741 	struct mptsub *mpts;
5742 	uint32_t maxseg = 0;
5743 
5744 	TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
5745 		struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
5746 
5747 		if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
5748 		    TCPS_HAVERCVDFIN2(tp->t_state)) {
5749 			continue;
5750 		}
5751 
5752 		if (tp->t_maxseg > maxseg) {
5753 			maxseg = tp->t_maxseg;
5754 		}
5755 	}
5756 
5757 	return maxseg;
5758 }
5759 
5760 static uint8_t
mptcp_get_rcvscale(struct mptses * mpte)5761 mptcp_get_rcvscale(struct mptses *mpte)
5762 {
5763 	struct mptsub *mpts;
5764 	uint8_t rcvscale = UINT8_MAX;
5765 
5766 	TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
5767 		struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
5768 
5769 		if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
5770 		    TCPS_HAVERCVDFIN2(tp->t_state)) {
5771 			continue;
5772 		}
5773 
5774 		if (tp->rcv_scale < rcvscale) {
5775 			rcvscale = tp->rcv_scale;
5776 		}
5777 	}
5778 
5779 	return rcvscale;
5780 }
5781 
5782 /* Similar to tcp_sbrcv_reserve */
5783 static void
mptcp_sbrcv_reserve(struct mptcb * mp_tp,struct sockbuf * sbrcv,u_int32_t newsize,u_int32_t idealsize)5784 mptcp_sbrcv_reserve(struct mptcb *mp_tp, struct sockbuf *sbrcv,
5785     u_int32_t newsize, u_int32_t idealsize)
5786 {
5787 	uint8_t rcvscale = mptcp_get_rcvscale(mp_tp->mpt_mpte);
5788 
5789 	if (rcvscale == UINT8_MAX) {
5790 		return;
5791 	}
5792 
5793 	/* newsize should not exceed max */
5794 	newsize = min(newsize, tcp_autorcvbuf_max);
5795 
5796 	/* The receive window scale negotiated at the
5797 	 * beginning of the connection will also set a
5798 	 * limit on the socket buffer size
5799 	 */
5800 	newsize = min(newsize, TCP_MAXWIN << rcvscale);
5801 
5802 	/* Set new socket buffer size */
5803 	if (newsize > sbrcv->sb_hiwat &&
5804 	    (sbreserve(sbrcv, newsize) == 1)) {
5805 		sbrcv->sb_idealsize = min(max(sbrcv->sb_idealsize,
5806 		    (idealsize != 0) ? idealsize : newsize), tcp_autorcvbuf_max);
5807 
5808 		/* Again check the limit set by the advertised
5809 		 * window scale
5810 		 */
5811 		sbrcv->sb_idealsize = min(sbrcv->sb_idealsize,
5812 		    TCP_MAXWIN << rcvscale);
5813 	}
5814 }
5815 
/*
 * Auto-grow the MPTCP-level receive buffer toward the sum of all
 * subflow receive buffers (a conservative stand-in for bandwidth*RTT).
 */
void
mptcp_sbrcv_grow(struct mptcb *mp_tp)
{
	struct mptses *mpte = mp_tp->mpt_mpte;
	struct socket *mp_so = mpte->mpte_mppcb->mpp_socket;
	struct sockbuf *sbrcv = &mp_so->so_rcv;
	uint32_t hiwat_sum = 0;
	uint32_t ideal_sum = 0;
	struct mptsub *mpts;

	/*
	 * Do not grow the receive socket buffer if
	 * - auto resizing is disabled, globally or on this socket
	 * - the high water mark already reached the maximum
	 * - the stream is in background and receive side is being
	 * throttled
	 * - if there are segments in reassembly queue indicating loss,
	 * do not need to increase recv window during recovery as more
	 * data is not going to be sent. A duplicate ack sent during
	 * recovery should not change the receive window
	 */
	if (tcp_do_autorcvbuf == 0 ||
	    (sbrcv->sb_flags & SB_AUTOSIZE) == 0 ||
	    tcp_cansbgrow(sbrcv) == 0 ||
	    sbrcv->sb_hiwat >= tcp_autorcvbuf_max ||
	    (mp_so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ||
	    !LIST_EMPTY(&mp_tp->mpt_segq)) {
		/* Can not resize the socket buffer, just return */
		return;
	}

	/*
	 * Ideally, we want the rbuf to be (sum_i {bw_i} * rtt_max * 2)
	 *
	 * But, for this we first need accurate receiver-RTT estimations, which
	 * we currently don't have.
	 *
	 * Let's use a dummy algorithm for now, just taking the sum of all
	 * subflow's receive-buffers. It's too low, but that's all we can get
	 * for now.
	 */

	TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
		hiwat_sum += mpts->mpts_socket->so_rcv.sb_hiwat;
		ideal_sum += mpts->mpts_socket->so_rcv.sb_idealsize;
	}

	mptcp_sbrcv_reserve(mp_tp, sbrcv, hiwat_sum, ideal_sum);
}
5865 
5866 /*
5867  * Determine if we can grow the recieve socket buffer to avoid sending
5868  * a zero window update to the peer. We allow even socket buffers that
5869  * have fixed size (set by the application) to grow if the resource
5870  * constraints are met. They will also be trimmed after the application
5871  * reads data.
5872  *
5873  * Similar to tcp_sbrcv_grow_rwin
5874  */
5875 static void
mptcp_sbrcv_grow_rwin(struct mptcb * mp_tp,struct sockbuf * sb)5876 mptcp_sbrcv_grow_rwin(struct mptcb *mp_tp, struct sockbuf *sb)
5877 {
5878 	struct socket *mp_so = mp_tp->mpt_mpte->mpte_mppcb->mpp_socket;
5879 	u_int32_t rcvbufinc = mptcp_get_maxseg(mp_tp->mpt_mpte) << 4;
5880 	u_int32_t rcvbuf = sb->sb_hiwat;
5881 
5882 	if (tcp_recv_bg == 1 || IS_TCP_RECV_BG(mp_so)) {
5883 		return;
5884 	}
5885 
5886 	if (tcp_do_autorcvbuf == 1 &&
5887 	    tcp_cansbgrow(sb) &&
5888 	    /* Diff to tcp_sbrcv_grow_rwin */
5889 	    (mp_so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) == 0 &&
5890 	    (rcvbuf - sb->sb_cc) < rcvbufinc &&
5891 	    rcvbuf < tcp_autorcvbuf_max &&
5892 	    (sb->sb_idealsize > 0 &&
5893 	    sb->sb_hiwat <= (sb->sb_idealsize + rcvbufinc))) {
5894 		sbreserve(sb, min((sb->sb_hiwat + rcvbufinc), tcp_autorcvbuf_max));
5895 	}
5896 }
5897 
5898 /* Similar to tcp_sbspace */
/* Similar to tcp_sbspace */
int32_t
mptcp_sbspace(struct mptcb *mp_tp)
{
	struct sockbuf *sb = &mp_tp->mpt_mpte->mpte_mppcb->mpp_socket->so_rcv;
	uint32_t rcvbuf;
	int32_t space;
	int32_t pending = 0;

	socket_lock_assert_owned(mptetoso(mp_tp->mpt_mpte));

	/* Opportunistically grow the buffer to avoid a zero window. */
	mptcp_sbrcv_grow_rwin(mp_tp, sb);

	/* hiwat might have changed */
	rcvbuf = sb->sb_hiwat;

	/* Space is bounded by both byte count and mbuf accounting. */
	space =  ((int32_t) imin((rcvbuf - sb->sb_cc),
	    (sb->sb_mbmax - sb->sb_mbcnt)));
	if (space < 0) {
		space = 0;
	}

#if CONTENT_FILTER
	/* Compensate for data being processed by content filters */
	pending = cfil_sock_data_space(sb);
#endif /* CONTENT_FILTER */
	if (pending > space) {
		space = 0;
	} else {
		space -= pending;
	}

	return space;
}
5932 
5933 /*
5934  * Support Fallback to Regular TCP
5935  */
5936 void
mptcp_notify_mpready(struct socket * so)5937 mptcp_notify_mpready(struct socket *so)
5938 {
5939 	struct tcpcb *tp = NULL;
5940 
5941 	if (so == NULL) {
5942 		return;
5943 	}
5944 
5945 	tp = intotcpcb(sotoinpcb(so));
5946 
5947 	if (tp == NULL) {
5948 		return;
5949 	}
5950 
5951 	DTRACE_MPTCP4(multipath__ready, struct socket *, so,
5952 	    struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd,
5953 	    struct tcpcb *, tp);
5954 
5955 	if (!(tp->t_mpflags & TMPF_MPTCP_TRUE)) {
5956 		return;
5957 	}
5958 
5959 	if (tp->t_mpflags & TMPF_MPTCP_READY) {
5960 		return;
5961 	}
5962 
5963 	tp->t_mpflags &= ~TMPF_TCP_FALLBACK;
5964 	tp->t_mpflags |= TMPF_MPTCP_READY;
5965 
5966 	soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPSTATUS));
5967 }
5968 
5969 void
mptcp_notify_mpfail(struct socket * so)5970 mptcp_notify_mpfail(struct socket *so)
5971 {
5972 	struct tcpcb *tp = NULL;
5973 
5974 	if (so == NULL) {
5975 		return;
5976 	}
5977 
5978 	tp = intotcpcb(sotoinpcb(so));
5979 
5980 	if (tp == NULL) {
5981 		return;
5982 	}
5983 
5984 	DTRACE_MPTCP4(multipath__failed, struct socket *, so,
5985 	    struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd,
5986 	    struct tcpcb *, tp);
5987 
5988 	if (tp->t_mpflags & TMPF_TCP_FALLBACK) {
5989 		return;
5990 	}
5991 
5992 	tp->t_mpflags &= ~(TMPF_MPTCP_READY | TMPF_MPTCP_TRUE);
5993 	tp->t_mpflags |= TMPF_TCP_FALLBACK;
5994 
5995 	soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPSTATUS));
5996 }
5997 
5998 /*
5999  * Keepalive helper function
6000  */
6001 boolean_t
mptcp_ok_to_keepalive(struct mptcb * mp_tp)6002 mptcp_ok_to_keepalive(struct mptcb *mp_tp)
6003 {
6004 	boolean_t ret = 1;
6005 
6006 	socket_lock_assert_owned(mptetoso(mp_tp->mpt_mpte));
6007 
6008 	if (mp_tp->mpt_state >= MPTCPS_CLOSE_WAIT) {
6009 		ret = 0;
6010 	}
6011 	return ret;
6012 }
6013 
/*
 * MPTCP t_maxseg adjustment function
 *
 * Returns the number of bytes by which the subflow's effective MSS must
 * be lowered to leave room for the most common MPTCP option (a DSS+ACK
 * option).  Returns 0 when the tcpcb has no MPTCP session attached.
 * @mtudisc distinguishes the MTU-discovery path from option processing.
 */
int
mptcp_adj_mss(struct tcpcb *tp, boolean_t mtudisc)
{
	int mss_lower = 0;
	struct mptcb *mp_tp = tptomptp(tp);

/*
 * Both branches add 2 bytes: either the DSS checksum field, or padding
 * to a 32-bit boundary plus EOL -- the cost is the same either way.
 */
#define MPTCP_COMPUTE_LEN {                             \
	mss_lower = sizeof (struct mptcp_dss_ack_opt);  \
	if (mp_tp->mpt_flags & MPTCPF_CHECKSUM)         \
	        mss_lower += 2;                         \
	else                                            \
	/* adjust to 32-bit boundary + EOL */   \
	        mss_lower += 2;                         \
}
	if (mp_tp == NULL) {
		return 0;
	}

	socket_lock_assert_owned(mptetoso(mp_tp->mpt_mpte));

	/*
	 * For the first subflow and subsequent subflows, adjust mss for
	 * most common MPTCP option size, for case where tcp_mss is called
	 * during option processing and MTU discovery.
	 */
	if (!mtudisc) {
		if (tp->t_mpflags & TMPF_MPTCP_TRUE &&
		    !(tp->t_mpflags & TMPF_JOINED_FLOW)) {
			MPTCP_COMPUTE_LEN;
		}

		if (tp->t_mpflags & TMPF_PREESTABLISHED &&
		    tp->t_mpflags & TMPF_SENT_JOIN) {
			MPTCP_COMPUTE_LEN;
		}
	} else {
		if (tp->t_mpflags & TMPF_MPTCP_TRUE) {
			MPTCP_COMPUTE_LEN;
		}
	}

	return mss_lower;
}
6060 
/*
 * Fill one mptcp_flow_t record for the pcblist sysctl from a subflow:
 * addresses/ports from the subflow's inpcb, the TCP conninfo, and the
 * mptsub bookkeeping (flags, connid, relative sequence, probe count).
 */
static void
fill_mptcp_subflow(struct socket *so, mptcp_flow_t *flow, struct mptsub *mpts)
{
	struct inpcb *inp;

	tcp_getconninfo(so, &flow->flow_ci);
	inp = sotoinpcb(so);
	if ((inp->inp_vflag & INP_IPV6) != 0) {
		flow->flow_src.ss_family = AF_INET6;
		flow->flow_dst.ss_family = AF_INET6;
		flow->flow_src.ss_len = sizeof(struct sockaddr_in6);
		flow->flow_dst.ss_len = sizeof(struct sockaddr_in6);
		SIN6(&flow->flow_src)->sin6_port = inp->in6p_lport;
		SIN6(&flow->flow_dst)->sin6_port = inp->in6p_fport;
		SIN6(&flow->flow_src)->sin6_addr = inp->in6p_laddr;
		SIN6(&flow->flow_dst)->sin6_addr = inp->in6p_faddr;
	} else if ((inp->inp_vflag & INP_IPV4) != 0) {
		flow->flow_src.ss_family = AF_INET;
		flow->flow_dst.ss_family = AF_INET;
		flow->flow_src.ss_len = sizeof(struct sockaddr_in);
		flow->flow_dst.ss_len = sizeof(struct sockaddr_in);
		SIN(&flow->flow_src)->sin_port = inp->inp_lport;
		SIN(&flow->flow_dst)->sin_port = inp->inp_fport;
		SIN(&flow->flow_src)->sin_addr = inp->inp_laddr;
		SIN(&flow->flow_dst)->sin_addr = inp->inp_faddr;
	}
	flow->flow_len = sizeof(*flow);
	flow->flow_tcpci_offset = offsetof(mptcp_flow_t, flow_ci);
	flow->flow_flags = mpts->mpts_flags;
	flow->flow_cid = mpts->mpts_connid;
	flow->flow_relseq = mpts->mpts_rel_seq;
	flow->flow_soerror = mpts->mpts_socket->so_error;
	flow->flow_probecnt = mpts->mpts_probecnt;
}
6095 
/*
 * sysctl handler for net.inet.mptcp.pcblist: dump every MPTCP session
 * as a variable-length conninfo_mptcp_t record followed by one
 * mptcp_flow_t per subflow.  Read-only; writes are rejected.
 */
static int
mptcp_pcblist SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
	int error = 0, f;
	size_t len;
	struct mppcb *mpp;
	struct mptses *mpte;
	struct mptcb *mp_tp;
	struct mptsub *mpts;
	struct socket *so;
	conninfo_mptcp_t mptcpci;
	mptcp_flow_t *flows = NULL;

	/* This sysctl cannot be written to */
	if (req->newptr != USER_ADDR_NULL) {
		return EPERM;
	}

	lck_mtx_lock(&mtcbinfo.mppi_lock);
	if (req->oldptr == USER_ADDR_NULL) {
		/*
		 * Size probe: estimate the needed buffer with ~12% headroom
		 * (n + n/8) and an assumed 4 subflows per connection.
		 */
		size_t n = mtcbinfo.mppi_count;
		lck_mtx_unlock(&mtcbinfo.mppi_lock);
		req->oldidx = (n + n / 8) * sizeof(conninfo_mptcp_t) +
		    4 * (n + n / 8)  * sizeof(mptcp_flow_t);
		return 0;
	}
	/* Walk all sessions; mppi_lock keeps the list stable */
	TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
		flows = NULL;
		socket_lock(mpp->mpp_socket, 1);
		VERIFY(mpp->mpp_flags & MPP_ATTACHED);
		mpte = mptompte(mpp);

		socket_lock_assert_owned(mptetoso(mpte));
		mp_tp = mpte->mpte_mptcb;

		/* Snapshot the MPTCP-level state under the socket lock */
		bzero(&mptcpci, sizeof(mptcpci));
		mptcpci.mptcpci_state = mp_tp->mpt_state;
		mptcpci.mptcpci_flags = mp_tp->mpt_flags;
		mptcpci.mptcpci_ltoken = mp_tp->mpt_localtoken;
		mptcpci.mptcpci_rtoken = mp_tp->mpt_remotetoken;
		mptcpci.mptcpci_notsent_lowat = mp_tp->mpt_notsent_lowat;
		mptcpci.mptcpci_snduna = mp_tp->mpt_snduna;
		mptcpci.mptcpci_sndnxt = mp_tp->mpt_sndnxt;
		mptcpci.mptcpci_sndmax = mp_tp->mpt_sndmax;
		mptcpci.mptcpci_lidsn = mp_tp->mpt_local_idsn;
		mptcpci.mptcpci_sndwnd = mp_tp->mpt_sndwnd;
		mptcpci.mptcpci_rcvnxt = mp_tp->mpt_rcvnxt;
		mptcpci.mptcpci_rcvatmark = mp_tp->mpt_rcvnxt;
		mptcpci.mptcpci_ridsn = mp_tp->mpt_remote_idsn;
		mptcpci.mptcpci_rcvwnd = mp_tp->mpt_rcvwnd;

		mptcpci.mptcpci_nflows = mpte->mpte_numflows;
		mptcpci.mptcpci_mpte_flags = mpte->mpte_flags;
		mptcpci.mptcpci_mpte_addrid = mpte->mpte_addrid_last;
		mptcpci.mptcpci_flow_offset =
		    offsetof(conninfo_mptcp_t, mptcpci_flows);

		len = sizeof(*flows) * mpte->mpte_numflows;
		if (mpte->mpte_numflows != 0) {
			flows = kalloc_data(len, Z_WAITOK | Z_ZERO);
			if (flows == NULL) {
				socket_unlock(mpp->mpp_socket, 1);
				break;
			}
			/*
			 * The record embeds one mptcp_flow_t, so copy the
			 * header without it; the flows follow separately.
			 */
			mptcpci.mptcpci_len = sizeof(mptcpci) +
			    sizeof(*flows) * (mptcpci.mptcpci_nflows - 1);
			error = SYSCTL_OUT(req, &mptcpci,
			    sizeof(mptcpci) - sizeof(mptcp_flow_t));
		} else {
			mptcpci.mptcpci_len = sizeof(mptcpci);
			error = SYSCTL_OUT(req, &mptcpci, sizeof(mptcpci));
		}
		if (error) {
			socket_unlock(mpp->mpp_socket, 1);
			kfree_data(flows, len);
			break;
		}
		f = 0;
		TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
			so = mpts->mpts_socket;
			fill_mptcp_subflow(so, &flows[f], mpts);
			f++;
		}
		socket_unlock(mpp->mpp_socket, 1);
		/* Copy the subflow array out after dropping the socket lock */
		if (flows) {
			error = SYSCTL_OUT(req, flows, len);
			kfree_data(flows, len);
			if (error) {
				break;
			}
		}
	}
	lck_mtx_unlock(&mtcbinfo.mppi_lock);

	return error;
}
6192 
/* net.inet.mptcp.pcblist: read-only dump of all active MPTCP sessions */
SYSCTL_PROC(_net_inet_mptcp, OID_AUTO, pcblist, CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, mptcp_pcblist, "S,conninfo_mptcp_t",
    "List of active MPTCP connections");
6196 
6197 /*
6198  * Set notsent lowat mark on the MPTCB
6199  */
6200 int
mptcp_set_notsent_lowat(struct mptses * mpte,int optval)6201 mptcp_set_notsent_lowat(struct mptses *mpte, int optval)
6202 {
6203 	struct mptcb *mp_tp = NULL;
6204 	int error = 0;
6205 
6206 	if (mpte->mpte_mppcb->mpp_flags & MPP_ATTACHED) {
6207 		mp_tp = mpte->mpte_mptcb;
6208 	}
6209 
6210 	if (mp_tp) {
6211 		mp_tp->mpt_notsent_lowat = optval;
6212 	} else {
6213 		error = EINVAL;
6214 	}
6215 
6216 	return error;
6217 }
6218 
6219 u_int32_t
mptcp_get_notsent_lowat(struct mptses * mpte)6220 mptcp_get_notsent_lowat(struct mptses *mpte)
6221 {
6222 	struct mptcb *mp_tp = NULL;
6223 
6224 	if (mpte->mpte_mppcb->mpp_flags & MPP_ATTACHED) {
6225 		mp_tp = mpte->mpte_mptcb;
6226 	}
6227 
6228 	if (mp_tp) {
6229 		return mp_tp->mpt_notsent_lowat;
6230 	} else {
6231 		return 0;
6232 	}
6233 }
6234 
/*
 * Returns 1 when the MPTCP socket is considered writable under the
 * notsent-lowat policy (unsent data below the configured mark), else 0.
 */
int
mptcp_notsent_lowat_check(struct socket *so)
{
	struct mptses *mpte;
	struct mppcb *mpp;
	struct mptcb *mp_tp;
	struct mptsub *mpts;

	int notsent = 0;

	mpp = mpsotomppcb(so);
	if (mpp == NULL || mpp->mpp_state == MPPCB_STATE_DEAD) {
		return 0;
	}

	mpte = mptompte(mpp);
	socket_lock_assert_owned(mptetoso(mpte));
	mp_tp = mpte->mpte_mptcb;

	notsent = so->so_snd.sb_cc;

	/*
	 * Writable when the buffer is empty, or when the not-yet-sent
	 * portion (buffered minus in-flight) is below the lowat mark.
	 */
	if ((notsent == 0) ||
	    ((notsent - (mp_tp->mpt_sndnxt - mp_tp->mpt_snduna)) <=
	    mp_tp->mpt_notsent_lowat)) {
		return 1;
	}

	/* When Nagle's algorithm is not disabled, it is better
	 * to wakeup the client even before there is at least one
	 * maxseg of data to write.
	 */
	TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
		int retval = 0;
		if (mpts->mpts_flags & MPTSF_ACTIVE) {
			struct socket *subf_so = mpts->mpts_socket;
			struct tcpcb *tp = intotcpcb(sotoinpcb(subf_so));

			/* Unsent data as seen by the active subflow */
			notsent = so->so_snd.sb_cc -
			    (tp->snd_nxt - tp->snd_una);

			if ((tp->t_flags & TF_NODELAY) == 0 &&
			    notsent > 0 && (notsent <= (int)tp->t_maxseg)) {
				retval = 1;
			}
			/* Only the first active subflow is consulted */
			return retval;
		}
	}
	return 0;
}
6284 
6285 static errno_t
mptcp_symptoms_ctl_connect(kern_ctl_ref kctlref,struct sockaddr_ctl * sac,void ** unitinfo)6286 mptcp_symptoms_ctl_connect(kern_ctl_ref kctlref, struct sockaddr_ctl *sac,
6287     void **unitinfo)
6288 {
6289 #pragma unused(kctlref, sac, unitinfo)
6290 
6291 	if (OSIncrementAtomic(&mptcp_kern_skt_inuse) > 0) {
6292 		os_log_error(mptcp_log_handle, "%s: MPTCP kernel-control socket for Symptoms already open!", __func__);
6293 	}
6294 
6295 	mptcp_kern_skt_unit = sac->sc_unit;
6296 
6297 	return 0;
6298 }
6299 
/*
 * Symptoms granted cell access for the app identified by @uuid.  Walk
 * every MPTCP session owned by that app and let it (temporarily) add
 * cell subflows; @rssi additionally gates cell use for target-based
 * sessions.
 */
static void
mptcp_allow_uuid(uuid_t uuid, int32_t rssi)
{
	struct mppcb *mpp;

	/* Iterate over all MPTCP connections */

	lck_mtx_lock(&mtcbinfo.mppi_lock);

	TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
		struct socket *mp_so = mpp->mpp_socket;
		struct mptses *mpte = mpp->mpp_pcbe;

		socket_lock(mp_so, 1);

		/* Match on the effective UUID (delegated) or the last UUID */
		if (mp_so->so_flags & SOF_DELEGATED &&
		    uuid_compare(uuid, mp_so->e_uuid)) {
			goto next;
		} else if (!(mp_so->so_flags & SOF_DELEGATED) &&
		    uuid_compare(uuid, mp_so->last_uuid)) {
			goto next;
		}

		os_log(mptcp_log_handle, "%s - %lx: Got allowance for useApp with rssi %d\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), rssi);

		mpte->mpte_flags |= MPTE_ACCESS_GRANTED;

		/* Strong Wi-Fi RSSI: keep the session off cell */
		if (rssi > MPTCP_TARGET_BASED_RSSI_THRESHOLD) {
			mpte->mpte_flags |= MPTE_CELL_PROHIBITED;
		}

		mptcp_check_subflows_and_add(mpte);
		mptcp_remove_subflows(mpte);

		/* Grant applies only for the duration of this pass */
		mpte->mpte_flags &= ~(MPTE_ACCESS_GRANTED | MPTE_CELL_PROHIBITED);

next:
		socket_unlock(mp_so, 1);
	}

	lck_mtx_unlock(&mtcbinfo.mppi_lock);
}
6343 
6344 static void
mptcp_wifi_status_changed(void)6345 mptcp_wifi_status_changed(void)
6346 {
6347 	struct mppcb *mpp;
6348 
6349 	/* Iterate over all MPTCP connections */
6350 
6351 	lck_mtx_lock(&mtcbinfo.mppi_lock);
6352 
6353 	TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
6354 		struct socket *mp_so = mpp->mpp_socket;
6355 		struct mptses *mpte = mpp->mpp_pcbe;
6356 
6357 		socket_lock(mp_so, 1);
6358 
6359 		/* Only handover- and urgency-mode are purely driven by Symptom's Wi-Fi status */
6360 		if (mpte->mpte_svctype != MPTCP_SVCTYPE_HANDOVER &&
6361 		    mpte->mpte_svctype != MPTCP_SVCTYPE_PURE_HANDOVER &&
6362 		    mpte->mpte_svctype != MPTCP_SVCTYPE_TARGET_BASED) {
6363 			goto next;
6364 		}
6365 
6366 		mptcp_check_subflows_and_add(mpte);
6367 		mptcp_check_subflows_and_remove(mpte);
6368 
6369 next:
6370 		socket_unlock(mp_so, 1);
6371 	}
6372 
6373 	lck_mtx_unlock(&mtcbinfo.mppi_lock);
6374 }
6375 
/*
 * Context threaded through proc_iterate() when resolving an executable
 * UUID to its proc (see mptcp_find_proc and its filter/callout).
 */
struct mptcp_uuid_search_info {
	uuid_t target_uuid;      /* executable UUID being searched for */
	proc_t found_proc;       /* matching proc, or PROC_NULL */
	boolean_t is_proc_found; /* set once the filter claimed a match */
};
6381 
6382 static int
mptcp_find_proc_filter(proc_t p,void * arg)6383 mptcp_find_proc_filter(proc_t p, void *arg)
6384 {
6385 	struct mptcp_uuid_search_info *info = (struct mptcp_uuid_search_info *)arg;
6386 	int found;
6387 
6388 	if (info->is_proc_found) {
6389 		return 0;
6390 	}
6391 
6392 	/*
6393 	 * uuid_compare returns 0 if the uuids are matching, but the proc-filter
6394 	 * expects != 0 for a matching filter.
6395 	 */
6396 	found = uuid_compare(proc_executableuuid_addr(p), info->target_uuid) == 0;
6397 	if (found) {
6398 		info->is_proc_found = true;
6399 	}
6400 
6401 	return found;
6402 }
6403 
6404 static int
mptcp_find_proc_callout(proc_t p,void * arg)6405 mptcp_find_proc_callout(proc_t p, void * arg)
6406 {
6407 	struct mptcp_uuid_search_info *info = (struct mptcp_uuid_search_info *)arg;
6408 
6409 	if (uuid_compare(proc_executableuuid_addr(p), info->target_uuid) == 0) {
6410 		info->found_proc = p;
6411 		return PROC_CLAIMED_DONE;
6412 	}
6413 
6414 	return PROC_RETURNED;
6415 }
6416 
6417 static proc_t
mptcp_find_proc(const uuid_t uuid)6418 mptcp_find_proc(const uuid_t uuid)
6419 {
6420 	struct mptcp_uuid_search_info info;
6421 
6422 	uuid_copy(info.target_uuid, uuid);
6423 	info.found_proc = PROC_NULL;
6424 	info.is_proc_found = false;
6425 
6426 	proc_iterate(PROC_ALLPROCLIST, mptcp_find_proc_callout, &info,
6427 	    mptcp_find_proc_filter, &info);
6428 
6429 	return info.found_proc;
6430 }
6431 
/*
 * Ask the Symptoms daemon (via the kernel-control socket) whether this
 * session's owning app may use cell.  Resolves the owning proc (either
 * the delegated app or the last user of the socket), classifies its
 * task priority, and enqueues an MPTCP_SYMPTOMS_ASK_UUID request.
 */
void
mptcp_ask_symptoms(struct mptses *mpte)
{
	struct mptcp_symptoms_ask_uuid ask;
	struct socket *mp_so;
	struct proc *p = PROC_NULL;
	int pid, prio, err;

	/* No Symptoms client connected yet - nobody to ask */
	if (mptcp_kern_skt_unit == 0) {
		os_log_error(mptcp_log_handle, "%s - %lx: skt_unit is still 0\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
		return;
	}

	mp_so = mptetoso(mpte);

	if (mp_so->so_flags & SOF_DELEGATED) {
		if (mpte->mpte_epid != 0) {
			p = proc_find(mpte->mpte_epid);
			if (p != PROC_NULL) {
				/* We found a pid, check its UUID */
				if (uuid_compare(mp_so->e_uuid, proc_executableuuid_addr(p))) {
					/* It's not the same - we need to look for the real proc */
					proc_rele(p);
					p = PROC_NULL;
				}
			}
		}

		if (p == PROC_NULL) {
			/* Fall back to a UUID-based search over all procs */
			p = mptcp_find_proc(mp_so->e_uuid);
			if (p == PROC_NULL) {
				uuid_string_t uuid_string;
				uuid_unparse(mp_so->e_uuid, uuid_string);

				os_log_error(mptcp_log_handle, "%s - %lx: Couldn't find proc for uuid %s\n",
				    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), uuid_string);

				return;
			}
			mpte->mpte_epid = proc_pid(p);
		}

		pid = mpte->mpte_epid;
		uuid_copy(ask.uuid, mp_so->e_uuid);
	} else {
		/* Non-delegated socket: ask about the last known user */
		pid = mp_so->last_pid;

		p = proc_find(pid);
		if (p == PROC_NULL) {
			os_log_error(mptcp_log_handle, "%s - %lx: Couldn't find proc for pid %u\n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), pid);
			return;
		}

		uuid_copy(ask.uuid, mp_so->last_uuid);
	}


	ask.cmd = MPTCP_SYMPTOMS_ASK_UUID;

	/* Map the task's role onto the coarse Symptoms priority classes */
	prio = proc_get_effective_task_policy(proc_task(p), TASK_POLICY_ROLE);

	if (prio == TASK_BACKGROUND_APPLICATION || prio == TASK_NONUI_APPLICATION ||
	    prio == TASK_DARWINBG_APPLICATION) {
		ask.priority = MPTCP_SYMPTOMS_BACKGROUND;
	} else if (prio == TASK_FOREGROUND_APPLICATION) {
		ask.priority = MPTCP_SYMPTOMS_FOREGROUND;
	} else {
		ask.priority = MPTCP_SYMPTOMS_UNKNOWN;
	}

	err = ctl_enqueuedata(mptcp_kern_ctrl_ref, mptcp_kern_skt_unit,
	    &ask, sizeof(ask), CTL_DATA_EOR);

	os_log(mptcp_log_handle, "%s - %lx: asked symptoms about pid %u, taskprio %u, prio %u, err %d\n",
	    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), pid, prio, ask.priority, err);


	/* Drop the reference taken by proc_find()/mptcp_find_proc() */
	proc_rele(p);
}
6513 
/*
 * Disconnect callback of the Symptoms kernel-control socket: balance
 * the in-use count bumped in mptcp_symptoms_ctl_connect().
 */
static errno_t
mptcp_symptoms_ctl_disconnect(kern_ctl_ref kctlref, u_int32_t kcunit,
    void *unitinfo)
{
#pragma unused(kctlref, kcunit, unitinfo)

	OSDecrementAtomic(&mptcp_kern_skt_inuse);

	return 0;
}
6524 
6525 static errno_t
mptcp_symptoms_ctl_send(kern_ctl_ref kctlref,u_int32_t kcunit,void * unitinfo,mbuf_t m,int flags)6526 mptcp_symptoms_ctl_send(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo,
6527     mbuf_t m, int flags)
6528 {
6529 #pragma unused(kctlref, unitinfo, flags)
6530 	symptoms_advisory_t *sa = NULL;
6531 
6532 	if (kcunit != mptcp_kern_skt_unit) {
6533 		os_log_error(mptcp_log_handle, "%s: kcunit %u is different from expected one %u\n",
6534 		    __func__, kcunit, mptcp_kern_skt_unit);
6535 	}
6536 
6537 	if (mbuf_pkthdr_len(m) < sizeof(*sa)) {
6538 		mbuf_freem(m);
6539 		return EINVAL;
6540 	}
6541 
6542 	if (mbuf_len(m) < sizeof(*sa)) {
6543 		os_log_error(mptcp_log_handle, "%s: mbuf is %lu but need %lu\n",
6544 		    __func__, mbuf_len(m), sizeof(*sa));
6545 		mbuf_freem(m);
6546 		return EINVAL;
6547 	}
6548 
6549 	sa = mbuf_data(m);
6550 
6551 	if (sa->sa_nwk_status != SYMPTOMS_ADVISORY_USEAPP) {
6552 		os_log(mptcp_log_handle, "%s: wifi new,old: %d,%d, cell new, old: %d,%d\n", __func__,
6553 		    sa->sa_wifi_status, mptcp_advisory.sa_wifi_status,
6554 		    sa->sa_cell_status, mptcp_advisory.sa_cell_status);
6555 
6556 		if (sa->sa_wifi_status != mptcp_advisory.sa_wifi_status) {
6557 			mptcp_advisory.sa_wifi_status = sa->sa_wifi_status;
6558 			mptcp_wifi_status_changed();
6559 		}
6560 	} else {
6561 		struct mptcp_symptoms_answer answer;
6562 		errno_t err;
6563 
6564 		/* We temporarily allow different sizes for ease of submission */
6565 		if (mbuf_len(m) != sizeof(uuid_t) + sizeof(*sa) &&
6566 		    mbuf_len(m) != sizeof(answer)) {
6567 			os_log_error(mptcp_log_handle, "%s: mbuf is %lu but need %lu or %lu\n",
6568 			    __func__, mbuf_len(m), sizeof(uuid_t) + sizeof(*sa),
6569 			    sizeof(answer));
6570 			mbuf_free(m);
6571 			return EINVAL;
6572 		}
6573 
6574 		memset(&answer, 0, sizeof(answer));
6575 
6576 		err = mbuf_copydata(m, 0, mbuf_len(m), &answer);
6577 		if (err) {
6578 			os_log_error(mptcp_log_handle, "%s: mbuf_copydata returned %d\n", __func__, err);
6579 			mbuf_free(m);
6580 			return err;
6581 		}
6582 
6583 		mptcp_allow_uuid(answer.uuid, answer.rssi);
6584 	}
6585 
6586 	mbuf_freem(m);
6587 	return 0;
6588 }
6589 
6590 void
mptcp_control_register(void)6591 mptcp_control_register(void)
6592 {
6593 	/* Set up the advisory control socket */
6594 	struct kern_ctl_reg mptcp_kern_ctl;
6595 
6596 	bzero(&mptcp_kern_ctl, sizeof(mptcp_kern_ctl));
6597 	strlcpy(mptcp_kern_ctl.ctl_name, MPTCP_KERN_CTL_NAME,
6598 	    sizeof(mptcp_kern_ctl.ctl_name));
6599 	mptcp_kern_ctl.ctl_connect = mptcp_symptoms_ctl_connect;
6600 	mptcp_kern_ctl.ctl_disconnect = mptcp_symptoms_ctl_disconnect;
6601 	mptcp_kern_ctl.ctl_send = mptcp_symptoms_ctl_send;
6602 	mptcp_kern_ctl.ctl_flags = CTL_FLAG_PRIVILEGED;
6603 
6604 	(void)ctl_register(&mptcp_kern_ctl, &mptcp_kern_ctrl_ref);
6605 }
6606 
/*
 * Classify the current Wi-Fi quality for this session based on the
 * Symptoms advisory, the session's service type, and whether the app
 * is first-party.
 */
mptcp_wifi_quality_t
mptcp_wifi_quality_for_session(struct mptses *mpte)
{
	if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
		if (mpte->mpte_svctype != MPTCP_SVCTYPE_HANDOVER &&
		    mptcp_advisory.sa_wifi_status) {
			return symptoms_is_wifi_lossy() ? MPTCP_WIFI_QUALITY_BAD : MPTCP_WIFI_QUALITY_GOOD;
		}

		/*
		 * If it's a first-party app and we don't have any info
		 * about the Wi-Fi state, let's be pessimistic.
		 */
		return MPTCP_WIFI_QUALITY_UNSURE;
	} else {
		if (symptoms_is_wifi_lossy()) {
			return MPTCP_WIFI_QUALITY_BAD;
		}

		/*
		 * If we are target-based (meaning, we allow to be more lax on
		 * the when wifi is considered bad), we only *know* about the state once
		 * we got the allowance from Symptoms (MPTE_ACCESS_GRANTED).
		 *
		 * If RSSI is not bad enough, MPTE_CELL_PROHIBITED will then
		 * be set.
		 *
		 * In any other case (while in target-mode), consider WiFi bad
		 * and we are going to ask for allowance from Symptoms anyway.
		 */
		if (mpte->mpte_svctype == MPTCP_SVCTYPE_TARGET_BASED) {
			if (mpte->mpte_flags & MPTE_ACCESS_GRANTED &&
			    mpte->mpte_flags & MPTE_CELL_PROHIBITED) {
				return MPTCP_WIFI_QUALITY_GOOD;
			}

			return MPTCP_WIFI_QUALITY_BAD;
		}

		return MPTCP_WIFI_QUALITY_GOOD;
	}
}
6649 
6650 boolean_t
symptoms_is_wifi_lossy(void)6651 symptoms_is_wifi_lossy(void)
6652 {
6653 	return (mptcp_advisory.sa_wifi_status & SYMPTOMS_ADVISORY_WIFI_OK) ? false : true;
6654 }
6655 
/*
 * Drain the MPTCP reassembly queue, freeing every queued segment.
 * Returns 1 if at least one segment was flushed, 0 otherwise.
 */
int
mptcp_freeq(struct mptcb *mp_tp)
{
	struct tseg_qent *q;
	int rv = 0;
	int count = 0;

	while ((q = LIST_FIRST(&mp_tp->mpt_segq)) != NULL) {
		LIST_REMOVE(q, tqe_q);
		m_freem(q->tqe_m);
		zfree(tcp_reass_zone, q);
		count++;
		rv = 1;
	}
	mp_tp->mpt_reassqlen = 0;

	/* Keep the global reassembly-queue length counter in sync */
	if (count > 0) {
		OSAddAtomic(-count, &mptcp_reass_total_qlen);
	}

	return rv;
}
6678 
6679 static int
mptcp_post_event(u_int32_t event_code,int value)6680 mptcp_post_event(u_int32_t event_code, int value)
6681 {
6682 	struct kev_mptcp_data event_data;
6683 	struct kev_msg ev_msg;
6684 
6685 	memset(&ev_msg, 0, sizeof(ev_msg));
6686 
6687 	ev_msg.vendor_code      = KEV_VENDOR_APPLE;
6688 	ev_msg.kev_class        = KEV_NETWORK_CLASS;
6689 	ev_msg.kev_subclass     = KEV_MPTCP_SUBCLASS;
6690 	ev_msg.event_code       = event_code;
6691 
6692 	event_data.value = value;
6693 
6694 	ev_msg.dv[0].data_ptr    = &event_data;
6695 	ev_msg.dv[0].data_length = sizeof(event_data);
6696 
6697 	return kev_post_msg(&ev_msg);
6698 }
6699 
/*
 * Mark cell as in use for this subflow and, if this is the first user
 * system-wide, post the event that lights up the cell icon.  Uses a
 * global refcount plus a per-session increment count so teardown can
 * release exactly what was taken.
 */
static void
mptcp_set_cellicon(struct mptses *mpte, struct mptsub *mpts)
{
	struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
	int error;

	/* First-party apps (Siri) don't flip the cellicon */
	if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
		return;
	}

	/* Subflow is disappearing - don't set it on this one */
	if (mpts->mpts_flags & (MPTSF_DISCONNECTING | MPTSF_DISCONNECTED)) {
		return;
	}

	/* Fallen back connections are not triggering the cellicon */
	if (mpte->mpte_mptcb->mpt_flags & MPTCPF_FALLBACK_TO_TCP) {
		return;
	}

	/* Remember the last time we set the cellicon. Needed for debouncing */
	mpte->mpte_last_cellicon_set = tcp_now;

	/* Schedule the toggle-rate timer on the subflow's tcpcb */
	tp->t_timer[TCPT_CELLICON] = OFFSET_FROM_START(tp, MPTCP_CELLICON_TOGGLE_RATE);
	tcp_sched_timers(tp);

	if (mpts->mpts_flags & MPTSF_CELLICON_SET &&
	    mpte->mpte_cellicon_increments != 0) {
		if (mptcp_cellicon_refcount == 0) {
			os_log_error(mptcp_log_handle, "%s - %lx: Cell should be set (count is %u), but it's zero!\n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpte->mpte_cellicon_increments);

			/* Continue, so that the icon gets set... */
		} else {
			/*
			 * In this case, the cellicon is already set. No need to bump it
			 * even higher
			 */

			return;
		}
	}

	/* When tearing down this subflow, we need to decrement the
	 * reference counter
	 */
	mpts->mpts_flags |= MPTSF_CELLICON_SET;

	/* This counter, so that when a session gets destroyed we decrement
	 * the reference counter by whatever is left
	 */
	mpte->mpte_cellicon_increments++;

	if (OSIncrementAtomic(&mptcp_cellicon_refcount)) {
		/* If cellicon is already set, get out of here! */
		return;
	}

	/* We took the count from 0 to 1: actually light up the icon */
	error = mptcp_post_event(KEV_MPTCP_CELLUSE, 1);

	if (error) {
		os_log_error(mptcp_log_handle, "%s - %lx: Setting cellicon failed with %d\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error);
	} else {
		os_log(mptcp_log_handle, "%s - %lx: successfully set the cellicon\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
	}
}
6769 
6770 void
mptcp_clear_cellicon(void)6771 mptcp_clear_cellicon(void)
6772 {
6773 	int error = mptcp_post_event(KEV_MPTCP_CELLUSE, 0);
6774 
6775 	if (error) {
6776 		os_log_error(mptcp_log_handle, "%s: Unsetting cellicon failed with %d\n",
6777 		    __func__, error);
6778 	} else {
6779 		os_log(mptcp_log_handle, "%s: successfully unset the cellicon\n",
6780 		    __func__);
6781 	}
6782 }
6783 
/*
 * Returns true if the icon has been flipped to WiFi.
 */
static boolean_t
__mptcp_unset_cellicon(uint32_t val)
{
	VERIFY(val < INT32_MAX);
	/*
	 * OSAddAtomic returns the pre-addition value: only the caller that
	 * observed the refcount at exactly 1 clears the icon.
	 */
	if (OSAddAtomic((int32_t)-val, &mptcp_cellicon_refcount) != 1) {
		return false;
	}

	mptcp_clear_cellicon();

	return true;
}
6799 
/*
 * Release @val of this session's cell-icon references (optionally tied
 * to subflow @mpts, which may be NULL) and clear the icon when the
 * global refcount drains to zero.  Inconsistent counter states are
 * logged rather than trusted.
 */
void
mptcp_unset_cellicon(struct mptses *mpte, struct mptsub *mpts, uint32_t val)
{
	/* First-party apps (Siri) don't flip the cellicon */
	if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
		return;
	}

	if (mpte->mpte_cellicon_increments == 0) {
		/* This flow never used cell - get out of here! */
		return;
	}

	if (mptcp_cellicon_refcount == 0) {
		os_log_error(mptcp_log_handle, "%s - %lx: Cell is off, but should be at least %u\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpte->mpte_cellicon_increments);

		return;
	}

	if (mpts) {
		/* Only subflows that took a reference may release one */
		if (!(mpts->mpts_flags & MPTSF_CELLICON_SET)) {
			return;
		}

		mpts->mpts_flags &= ~MPTSF_CELLICON_SET;
	}

	/* Never release more than this session ever took */
	if (mpte->mpte_cellicon_increments < val) {
		os_log_error(mptcp_log_handle, "%s - %lx: Increments is %u but want to dec by %u.\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpte->mpte_cellicon_increments, val);
		val = mpte->mpte_cellicon_increments;
	}

	mpte->mpte_cellicon_increments -= val;

	if (__mptcp_unset_cellicon(val) == false) {
		return;
	}

	/* All flows are gone - our counter should be at zero too! */
	if (mpte->mpte_cellicon_increments != 0) {
		os_log_error(mptcp_log_handle, "%s - %lx: Inconsistent state! Cell refcount is zero but increments are at %u\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpte->mpte_cellicon_increments);
	}
}
6846 
6847 void
mptcp_reset_rexmit_state(struct tcpcb * tp)6848 mptcp_reset_rexmit_state(struct tcpcb *tp)
6849 {
6850 	struct mptsub *mpts;
6851 	struct inpcb *inp;
6852 	struct socket *so;
6853 
6854 	inp = tp->t_inpcb;
6855 	if (inp == NULL) {
6856 		return;
6857 	}
6858 
6859 	so = inp->inp_socket;
6860 	if (so == NULL) {
6861 		return;
6862 	}
6863 
6864 	if (!(so->so_flags & SOF_MP_SUBFLOW)) {
6865 		return;
6866 	}
6867 
6868 	mpts = tp->t_mpsub;
6869 
6870 	mpts->mpts_flags &= ~MPTSF_WRITE_STALL;
6871 	so->so_flags &= ~SOF_MP_TRYFAILOVER;
6872 }
6873 
6874 void
mptcp_reset_keepalive(struct tcpcb * tp)6875 mptcp_reset_keepalive(struct tcpcb *tp)
6876 {
6877 	struct mptsub *mpts = tp->t_mpsub;
6878 
6879 	mpts->mpts_flags &= ~MPTSF_READ_STALL;
6880 }
6881 
/* mppi_alloc callback: hand out the mppcb embedded in a struct mpp_mtp. */
static struct mppcb *
mtcp_alloc(void)
{
	return &kalloc_type(struct mpp_mtp, Z_WAITOK | Z_ZERO | Z_NOFAIL)->mpp;
}
6887 
/* mppi_free callback: free the enclosing mpp_mtp of an mppcb from mtcp_alloc(). */
static void
mtcp_free(struct mppcb *mpp)
{
	struct mpp_mtp *mtp = __container_of(mpp, struct mpp_mtp, mpp);

	kfree_type(struct mpp_mtp, mtp);
}
6895 
/*
 * Protocol pr_init callback.
 *
 * One-shot initialization of the MPTCP layer: clones the TCP protosw
 * entries for IPv4/IPv6 so subflows can use MPTCP-specific usrreqs,
 * and sets up the mtcbinfo PCB registry (alloc/free, lock, GC/timer).
 */
void
mptcp_init(struct protosw *pp, struct domain *dp)
{
#pragma unused(dp)
	static int mptcp_initialized = 0;
	struct protosw *prp;
	struct ip6protosw *prp6;

	VERIFY((pp->pr_flags & (PR_INITIALIZED | PR_ATTACHED)) == PR_ATTACHED);

	/* do this only once */
	if (mptcp_initialized) {
		return;
	}
	mptcp_initialized = 1;

	/* Until Symptoms says otherwise, assume Wi-Fi is fine */
	mptcp_advisory.sa_wifi_status = SYMPTOMS_ADVISORY_WIFI_OK;

	/*
	 * Since PF_MULTIPATH gets initialized after PF_INET/INET6,
	 * we must be able to find IPPROTO_TCP entries for both.
	 */
	prp = pffindproto_locked(PF_INET, IPPROTO_TCP, SOCK_STREAM);
	VERIFY(prp != NULL);
	bcopy(prp, &mptcp_subflow_protosw, sizeof(*prp));
	bcopy(prp->pr_usrreqs, &mptcp_subflow_usrreqs,
	    sizeof(mptcp_subflow_usrreqs));
	mptcp_subflow_protosw.pr_entry.tqe_next = NULL;
	mptcp_subflow_protosw.pr_entry.tqe_prev = NULL;
	mptcp_subflow_protosw.pr_usrreqs = &mptcp_subflow_usrreqs;
	/* Override receive/send/oob paths with the MPTCP subflow variants */
	mptcp_subflow_usrreqs.pru_soreceive = mptcp_subflow_soreceive;
	mptcp_subflow_usrreqs.pru_sosend = mptcp_subflow_sosend;
	mptcp_subflow_usrreqs.pru_rcvoob = pru_rcvoob_notsupp;
	/*
	 * Socket filters shouldn't attach/detach to/from this protosw
	 * since pr_protosw is to be used instead, which points to the
	 * real protocol; if they do, it is a bug and we should panic.
	 */
	mptcp_subflow_protosw.pr_filter_head.tqh_first =
	    (struct socket_filter *)(uintptr_t)0xdeadbeefdeadbeef;
	mptcp_subflow_protosw.pr_filter_head.tqh_last =
	    (struct socket_filter **)(uintptr_t)0xdeadbeefdeadbeef;

	/* Same cloning dance for the IPv6 TCP protosw */
	prp6 = (struct ip6protosw *)pffindproto_locked(PF_INET6,
	    IPPROTO_TCP, SOCK_STREAM);
	VERIFY(prp6 != NULL);
	bcopy(prp6, &mptcp_subflow_protosw6, sizeof(*prp6));
	bcopy(prp6->pr_usrreqs, &mptcp_subflow_usrreqs6,
	    sizeof(mptcp_subflow_usrreqs6));
	mptcp_subflow_protosw6.pr_entry.tqe_next = NULL;
	mptcp_subflow_protosw6.pr_entry.tqe_prev = NULL;
	mptcp_subflow_protosw6.pr_usrreqs = &mptcp_subflow_usrreqs6;
	mptcp_subflow_usrreqs6.pru_soreceive = mptcp_subflow_soreceive;
	mptcp_subflow_usrreqs6.pru_sosend = mptcp_subflow_sosend;
	mptcp_subflow_usrreqs6.pru_rcvoob = pru_rcvoob_notsupp;
	/*
	 * Socket filters shouldn't attach/detach to/from this protosw
	 * since pr_protosw is to be used instead, which points to the
	 * real protocol; if they do, it is a bug and we should panic.
	 */
	mptcp_subflow_protosw6.pr_filter_head.tqh_first =
	    (struct socket_filter *)(uintptr_t)0xdeadbeefdeadbeef;
	mptcp_subflow_protosw6.pr_filter_head.tqh_last =
	    (struct socket_filter **)(uintptr_t)0xdeadbeefdeadbeef;

	/* Initialize the MPTCP PCB registry */
	bzero(&mtcbinfo, sizeof(mtcbinfo));
	TAILQ_INIT(&mtcbinfo.mppi_pcbs);
	mtcbinfo.mppi_alloc = mtcp_alloc;
	mtcbinfo.mppi_free  = mtcp_free;

	mtcbinfo.mppi_lock_grp = lck_grp_alloc_init("mppcb", LCK_GRP_ATTR_NULL);
	lck_attr_setdefault(&mtcbinfo.mppi_lock_attr);
	lck_mtx_init(&mtcbinfo.mppi_lock, mtcbinfo.mppi_lock_grp,
	    &mtcbinfo.mppi_lock_attr);

	mtcbinfo.mppi_gc = mptcp_gc;
	mtcbinfo.mppi_timer = mptcp_timer;

	/* attach to MP domain for garbage collection to take place */
	mp_pcbinfo_attach(&mtcbinfo);

	mptcp_log_handle = os_log_create("com.apple.xnu.net.mptcp", "mptcp");
}
6982