xref: /xnu-12377.1.9/bsd/netinet/mptcp_subr.c (revision f6217f891ac0bb64f3d375211650a4c1ff8ca1ea)
1 /*
2  * Copyright (c) 2012-2021 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 #include <kern/locks.h>
30 #include <kern/policy_internal.h>
31 #include <kern/zalloc.h>
32 
33 #include <mach/sdt.h>
34 
35 #include <sys/domain.h>
36 #include <sys/kdebug.h>
37 #include <sys/kern_control.h>
38 #include <sys/kernel.h>
39 #include <sys/mbuf.h>
40 #include <sys/mcache.h>
41 #include <sys/param.h>
42 #include <sys/proc.h>
43 #include <sys/protosw.h>
44 #include <sys/resourcevar.h>
45 #include <sys/socket.h>
46 #include <sys/socketvar.h>
47 #include <sys/sysctl.h>
48 #include <sys/syslog.h>
49 #include <sys/systm.h>
50 
51 #include <net/content_filter.h>
52 #include <net/if.h>
53 #include <net/if_var.h>
54 #include <netinet/in.h>
55 #include <netinet/in_pcb.h>
56 #include <netinet/in_var.h>
57 #include <netinet/tcp.h>
58 #include <netinet/tcp_cache.h>
59 #include <netinet/tcp_fsm.h>
60 #include <netinet/tcp_seq.h>
61 #include <netinet/tcp_var.h>
62 #include <netinet/mptcp_var.h>
63 #include <netinet/mptcp.h>
64 #include <netinet/mptcp_opt.h>
65 #include <netinet/mptcp_seq.h>
66 #include <netinet/mptcp_timer.h>
67 #include <libkern/crypto/sha1.h>
68 #include <libkern/crypto/sha2.h>
69 #include <netinet6/in6_pcb.h>
70 #include <netinet6/ip6protosw.h>
71 #include <dev/random/randomdev.h>
72 #include <net/sockaddr_utils.h>
73 
74 /*
75  * Notes on MPTCP implementation.
76  *
77  * MPTCP is implemented as <SOCK_STREAM,IPPROTO_TCP> protocol in PF_MULTIPATH
78  * communication domain.  The structure mtcbinfo describes the MPTCP instance
79  * of a Multipath protocol in that domain.  It is used to keep track of all
80  * MPTCP PCB instances in the system, and is protected by the global lock
81  * mppi_lock.
82  *
83  * An MPTCP socket is opened by calling socket(PF_MULTIPATH, SOCK_STREAM,
84  * IPPROTO_TCP).  Upon success, a Multipath PCB gets allocated and along with
85  * it comes an MPTCP Session and an MPTCP PCB.  All three structures are
86  * allocated from the same memory block, and each structure has a pointer
87  * to the adjacent ones.  The layout is defined by the mpp_mtp structure.
88  * The socket lock (mpp_lock) is used to protect accesses to the Multipath
89  * PCB (mppcb) as well as the MPTCP Session (mptses).
90  *
91  * The MPTCP Session is an MPTCP-specific extension to the Multipath PCB;
92  *
93  * A functioning MPTCP Session consists of one or more subflow sockets.  Each
94  * subflow socket is essentially a regular PF_INET/PF_INET6 TCP socket, and is
95  * represented by the mptsub structure.  Because each subflow requires access
96  * to the MPTCP Session, the MPTCP socket's so_usecount is bumped up for each
97  * subflow.  This gets decremented prior to the subflow's destruction.
98  *
99  * To handle events (read, write, control) from the subflows, we do direct
100  * upcalls into the specific function.
101  *
102  * The whole MPTCP connection is protected by a single lock, the MPTCP socket's
103  * lock. Incoming data on a subflow also ends up taking this single lock. To
104  * achieve the latter, tcp_lock/unlock has been changed to rather use the lock
105  * of the MPTCP-socket.
106  *
107  * An MPTCP socket will be destroyed when its so_usecount drops to zero; this
108  * work is done by the MPTCP garbage collector which is invoked on demand by
109  * the PF_MULTIPATH garbage collector.  This process will take place once all
110  * of the subflows have been destroyed.
111  */
112 
113 static void mptcp_subflow_abort(struct mptsub *, int);
114 
115 static void mptcp_send_dfin(struct socket *so);
116 static void mptcp_set_cellicon(struct mptses *mpte, struct mptsub *mpts);
117 static int mptcp_freeq(struct mptcb *mp_tp);
118 
119 /*
120  * Possible return values for subflow event handlers.  Note that success
121  * values must be greater or equal than MPTS_EVRET_OK.  Values less than that
122  * indicate errors or actions which require immediate attention; they will
123  * prevent the rest of the handlers from processing their respective events
124  * until the next round of events processing.
125  */
126 typedef enum {
127 	MPTS_EVRET_DELETE               = 1,    /* delete this subflow */
128 	MPTS_EVRET_OK                   = 2,    /* OK */
129 	MPTS_EVRET_CONNECT_PENDING      = 3,    /* resume pended connects */
130 	MPTS_EVRET_DISCONNECT_FALLBACK  = 4,    /* abort all but preferred */
131 } ev_ret_t;
132 
133 static void mptcp_do_sha1(mptcp_key_t *, char sha_digest[SHA1_RESULTLEN]);
134 static void mptcp_do_sha256(mptcp_key_t *, char sha_digest[SHA256_DIGEST_LENGTH]);
135 
136 static void mptcp_init_local_parms(struct mptses *, struct sockaddr *);
137 
138 static KALLOC_TYPE_DEFINE(mptsub_zone, struct mptsub, NET_KT_DEFAULT);
139 static KALLOC_TYPE_DEFINE(mptopt_zone, struct mptopt, NET_KT_DEFAULT);
140 static KALLOC_TYPE_DEFINE(mpt_subauth_zone, struct mptcp_subf_auth_entry,
141     NET_KT_DEFAULT);
142 
143 struct mppcbinfo mtcbinfo;
144 
145 SYSCTL_DECL(_net_inet);
146 
147 SYSCTL_NODE(_net_inet, OID_AUTO, mptcp, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "MPTCP");
148 
149 SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, pcbcount, CTLFLAG_RD | CTLFLAG_LOCKED,
150     &mtcbinfo.mppi_count, 0, "Number of active PCBs");
151 
152 
153 static int mptcp_alternate_port = 0;
154 SYSCTL_INT(_net_inet_mptcp, OID_AUTO, alternate_port, CTLFLAG_RW | CTLFLAG_LOCKED,
155     &mptcp_alternate_port, 0, "Set alternate port for MPTCP connections");
156 
157 static struct protosw mptcp_subflow_protosw;
158 static struct pr_usrreqs mptcp_subflow_usrreqs;
159 static struct ip6protosw mptcp_subflow_protosw6;
160 static struct pr_usrreqs mptcp_subflow_usrreqs6;
161 
162 static uint8_t  mptcp_create_subflows_scheduled;
163 
164 /* Using Symptoms Advisory to detect poor WiFi or poor Cell */
165 static kern_ctl_ref mptcp_kern_ctrl_ref = NULL;
166 static uint32_t mptcp_kern_skt_inuse = 0;
167 static uint32_t mptcp_kern_skt_unit;
168 static symptoms_advisory_t mptcp_advisory;
169 
170 uint32_t mptcp_cellicon_refcount = 0;
171 
172 os_log_t mptcp_log_handle;
173 
174 int
mptcpstats_get_index_by_ifindex(struct mptcp_itf_stats * stats __counted_by (stats_count),uint16_t stats_count,u_short ifindex,boolean_t create)175 mptcpstats_get_index_by_ifindex(struct mptcp_itf_stats *stats __counted_by(stats_count), uint16_t stats_count, u_short ifindex, boolean_t create)
176 {
177 	int i, index = -1;
178 
179 	VERIFY(stats_count <= MPTCP_ITFSTATS_SIZE);
180 
181 	for (i = 0; i < MPTCP_ITFSTATS_SIZE; i++) {
182 		if (create && stats[i].ifindex == IFSCOPE_NONE) {
183 			if (index < 0) {
184 				index = i;
185 			}
186 			continue;
187 		}
188 
189 		if (stats[i].ifindex == ifindex) {
190 			index = i;
191 			return index;
192 		}
193 	}
194 
195 	if (index != -1) {
196 		stats[index].ifindex = ifindex;
197 	}
198 
199 	return index;
200 }
201 
202 static int
mptcpstats_get_index(struct mptcp_itf_stats * stats __counted_by (stats_count),uint16_t stats_count,const struct mptsub * mpts)203 mptcpstats_get_index(struct mptcp_itf_stats *stats __counted_by(stats_count), uint16_t stats_count, const struct mptsub *mpts)
204 {
205 	const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
206 	int index;
207 
208 	VERIFY(stats_count <= MPTCP_ITFSTATS_SIZE);
209 
210 	if (ifp == NULL) {
211 		os_log_error(mptcp_log_handle, "%s - %lx: no ifp on subflow, state %u flags %#x\n",
212 		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpts->mpts_mpte),
213 		    sototcpcb(mpts->mpts_socket)->t_state, mpts->mpts_flags);
214 		return -1;
215 	}
216 
217 	index = mptcpstats_get_index_by_ifindex(stats, MPTCP_ITFSTATS_SIZE, ifp->if_index, true);
218 
219 	if (index != -1) {
220 		if (stats[index].is_expensive == 0) {
221 			stats[index].is_expensive = IFNET_IS_CELLULAR(ifp);
222 		}
223 	}
224 
225 	return index;
226 }
227 
228 void
mptcpstats_inc_switch(struct mptses * mpte,const struct mptsub * mpts)229 mptcpstats_inc_switch(struct mptses *mpte, const struct mptsub *mpts)
230 {
231 	int index;
232 
233 	tcpstat.tcps_mp_switches++;
234 	mpte->mpte_subflow_switches++;
235 
236 	index = mptcpstats_get_index(mpte->mpte_itfstats, MPTCP_ITFSTATS_SIZE, mpts);
237 
238 	if (index != -1) {
239 		mpte->mpte_itfstats[index].switches++;
240 	}
241 }
242 
243 /*
244  * Flushes all recorded socket options from an MP socket.
245  */
246 static void
mptcp_flush_sopts(struct mptses * mpte)247 mptcp_flush_sopts(struct mptses *mpte)
248 {
249 	struct mptopt *mpo, *tmpo;
250 
251 	TAILQ_FOREACH_SAFE(mpo, &mpte->mpte_sopts, mpo_entry, tmpo) {
252 		mptcp_sopt_remove(mpte, mpo);
253 		mptcp_sopt_free(mpo);
254 	}
255 	VERIFY(TAILQ_EMPTY(&mpte->mpte_sopts));
256 }
257 
258 /*
259  * Create an MPTCP session, called as a result of opening a MPTCP socket.
260  */
int
mptcp_session_create(struct mppcb *mpp)
{
	struct mpp_mtp *mtp;
	struct mppcbinfo *mppi;
	struct mptses *mpte;
	struct mptcb *mp_tp;

	VERIFY(mpp != NULL);
	mppi = mpp->mpp_pcbinfo;
	VERIFY(mppi != NULL);

	/*
	 * The Multipath PCB, the MPTCP session and the MPTCP PCB live in a
	 * single allocation (struct mpp_mtp); recover the container from mpp.
	 */
	mtp = __container_of(mpp, struct mpp_mtp, mpp);
	mpte = &mtp->mpp_ses;
	mp_tp = &mtp->mtcb;

	/* MPTCP Multipath PCB Extension: zero and cross-link with the mppcb */
	bzero(mpte, sizeof(*mpte));
	VERIFY(mpp->mpp_pcbe == NULL);
	mpp->mpp_pcbe = mpte;
	mpte->mpte_mppcb = mpp;
	mpte->mpte_mptcb = mp_tp;

	TAILQ_INIT(&mpte->mpte_sopts);
	TAILQ_INIT(&mpte->mpte_subflows);
	mpte->mpte_associd = SAE_ASSOCID_ANY;
	mpte->mpte_connid_last = SAE_CONNID_ANY;

	mptcp_init_urgency_timer(mpte);

	/* Start with the statically-embedded interface-info array */
	mpte->mpte_itfinfo = &mpte->_mpte_itfinfo[0];
	mpte->mpte_itfinfo_size = MPTE_ITFINFO_SIZE;

	/* Apply the sysctl-configured alternate port only if it is a valid port */
	if (mptcp_alternate_port > 0 && mptcp_alternate_port < UINT16_MAX) {
		mpte->mpte_alternate_port = htons((uint16_t)mptcp_alternate_port);
	}

	mpte->mpte_last_cellicon_set = tcp_now;

	/* MPTCP Protocol Control Block: zeroed, back-linked, starts CLOSED */
	bzero(mp_tp, sizeof(*mp_tp));
	mp_tp->mpt_mpte = mpte;
	mp_tp->mpt_state = MPTCPS_CLOSED;

	DTRACE_MPTCP1(session__create, struct mppcb *, mpp);

	/* Always succeeds; all state is embedded, nothing is allocated here */
	return 0;
}
309 
310 struct sockaddr *
mptcp_get_session_dst(struct mptses * mpte,boolean_t ipv6,boolean_t ipv4)311 mptcp_get_session_dst(struct mptses *mpte, boolean_t ipv6, boolean_t ipv4)
312 {
313 	if (ipv6 && mpte->mpte_sub_dst_v6.sin6_family == AF_INET6) {
314 		return SA(&mpte->mpte_sub_dst_v6);
315 	}
316 
317 	if (ipv4 && mpte->mpte_sub_dst_v4.sin_family == AF_INET) {
318 		return SA(&mpte->mpte_sub_dst_v4);
319 	}
320 
321 	/* The interface has neither IPv4 nor IPv6 routes. Give our best guess,
322 	 * meaning we prefer IPv6 over IPv4.
323 	 */
324 	if (mpte->mpte_sub_dst_v6.sin6_family == AF_INET6) {
325 		return SA(&mpte->mpte_sub_dst_v6);
326 	}
327 
328 	if (mpte->mpte_sub_dst_v4.sin_family == AF_INET) {
329 		return SA(&mpte->mpte_sub_dst_v4);
330 	}
331 
332 	/* We don't yet have a unicast IP */
333 	return NULL;
334 }
335 
336 static void
mptcpstats_get_bytes(struct mptses * mpte,boolean_t initial_cell,uint64_t * cellbytes,uint64_t * allbytes)337 mptcpstats_get_bytes(struct mptses *mpte, boolean_t initial_cell,
338     uint64_t *cellbytes, uint64_t *allbytes)
339 {
340 	int64_t mycellbytes = 0;
341 	uint64_t myallbytes = 0;
342 	int i;
343 
344 	for (i = 0; i < MPTCP_ITFSTATS_SIZE; i++) {
345 		if (mpte->mpte_itfstats[i].is_expensive) {
346 			mycellbytes += mpte->mpte_itfstats[i].mpis_txbytes;
347 			mycellbytes += mpte->mpte_itfstats[i].mpis_rxbytes;
348 		}
349 
350 		myallbytes += mpte->mpte_itfstats[i].mpis_txbytes;
351 		myallbytes += mpte->mpte_itfstats[i].mpis_rxbytes;
352 	}
353 
354 	if (initial_cell) {
355 		mycellbytes -= mpte->mpte_init_txbytes;
356 		mycellbytes -= mpte->mpte_init_rxbytes;
357 	}
358 
359 	if (mycellbytes < 0) {
360 		os_log_error(mptcp_log_handle, "%s - %lx: cellbytes is %lld\n",
361 		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mycellbytes);
362 		*cellbytes = 0;
363 		*allbytes = 0;
364 	} else {
365 		*cellbytes = mycellbytes;
366 		*allbytes = myallbytes;
367 	}
368 }
369 
/*
 * Record end-of-session statistics into the global tcpstat counters,
 * keyed by service type (handover / interactive / aggregate) and by
 * whether the session is first-party and started on cellular.
 * Called once, from session destruction.
 */
static void
mptcpstats_session_wrapup(struct mptses *mpte)
{
	/* Whether the initial subflow was established over cellular */
	boolean_t cell = mpte->mpte_initial_cell;

	switch (mpte->mpte_svctype) {
	case MPTCP_SVCTYPE_HANDOVER:
		/* First-party and third-party sessions feed separate counters */
		if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
			tcpstat.tcps_mptcp_fp_handover_attempt++;

			if (cell && mpte->mpte_handshake_success) {
				tcpstat.tcps_mptcp_fp_handover_success_cell++;

				/* Started on cell but WiFi got used later */
				if (mpte->mpte_used_wifi) {
					tcpstat.tcps_mptcp_handover_wifi_from_cell++;
				}
			} else if (mpte->mpte_handshake_success) {
				tcpstat.tcps_mptcp_fp_handover_success_wifi++;

				/* Started on WiFi but cell got used later */
				if (mpte->mpte_used_cell) {
					tcpstat.tcps_mptcp_handover_cell_from_wifi++;
				}
			}
		} else {
			tcpstat.tcps_mptcp_handover_attempt++;

			if (cell && mpte->mpte_handshake_success) {
				tcpstat.tcps_mptcp_handover_success_cell++;

				if (mpte->mpte_used_wifi) {
					tcpstat.tcps_mptcp_handover_wifi_from_cell++;
				}
			} else if (mpte->mpte_handshake_success) {
				tcpstat.tcps_mptcp_handover_success_wifi++;

				if (mpte->mpte_used_cell) {
					tcpstat.tcps_mptcp_handover_cell_from_wifi++;
				}
			}
		}

		/* Byte counters are only meaningful after a successful handshake */
		if (mpte->mpte_handshake_success) {
			uint64_t cellbytes;
			uint64_t allbytes;

			mptcpstats_get_bytes(mpte, cell, &cellbytes, &allbytes);

			tcpstat.tcps_mptcp_handover_cell_bytes += cellbytes;
			tcpstat.tcps_mptcp_handover_all_bytes += allbytes;
		}
		break;
	case MPTCP_SVCTYPE_INTERACTIVE:
		if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
			tcpstat.tcps_mptcp_fp_interactive_attempt++;

			if (mpte->mpte_handshake_success) {
				tcpstat.tcps_mptcp_fp_interactive_success++;

				/* Started on WiFi but had to pull in cell */
				if (!cell && mpte->mpte_used_cell) {
					tcpstat.tcps_mptcp_interactive_cell_from_wifi++;
				}
			}
		} else {
			tcpstat.tcps_mptcp_interactive_attempt++;

			if (mpte->mpte_handshake_success) {
				tcpstat.tcps_mptcp_interactive_success++;

				if (!cell && mpte->mpte_used_cell) {
					tcpstat.tcps_mptcp_interactive_cell_from_wifi++;
				}
			}
		}

		if (mpte->mpte_handshake_success) {
			uint64_t cellbytes;
			uint64_t allbytes;

			mptcpstats_get_bytes(mpte, cell, &cellbytes, &allbytes);

			tcpstat.tcps_mptcp_interactive_cell_bytes += cellbytes;
			tcpstat.tcps_mptcp_interactive_all_bytes += allbytes;
		}
		break;
	case MPTCP_SVCTYPE_AGGREGATE:
		if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
			tcpstat.tcps_mptcp_fp_aggregate_attempt++;

			if (mpte->mpte_handshake_success) {
				tcpstat.tcps_mptcp_fp_aggregate_success++;
			}
		} else {
			tcpstat.tcps_mptcp_aggregate_attempt++;

			if (mpte->mpte_handshake_success) {
				tcpstat.tcps_mptcp_aggregate_success++;
			}
		}

		if (mpte->mpte_handshake_success) {
			uint64_t cellbytes;
			uint64_t allbytes;

			mptcpstats_get_bytes(mpte, cell, &cellbytes, &allbytes);

			tcpstat.tcps_mptcp_aggregate_cell_bytes += cellbytes;
			tcpstat.tcps_mptcp_aggregate_all_bytes += allbytes;
		}
		break;
	}
	/* Other service types (e.g. pure-handover, target-based) intentionally
	 * record no per-type counters here. */

	/* Session that started on cell eventually made it back onto WiFi */
	if (cell && mpte->mpte_handshake_success && mpte->mpte_used_wifi) {
		tcpstat.tcps_mptcp_back_to_wifi++;
	}

	if (mpte->mpte_triggered_cell) {
		tcpstat.tcps_mptcp_triggered_cell++;
	}
}
489 
490 /*
491  * Destroy an MPTCP session.
492  */
static void
mptcp_session_destroy(struct mptses *mpte)
{
	struct mptcb *mp_tp = mpte->mpte_mptcb;

	VERIFY(mp_tp != NULL);
	/* All subflows must already have been torn down at this point */
	VERIFY(TAILQ_EMPTY(&mpte->mpte_subflows) && mpte->mpte_numflows == 0);

	/* Record final statistics before the state is released */
	mptcpstats_session_wrapup(mpte);
	/* Drop all cell-icon references still held by this session */
	mptcp_unset_cellicon(mpte, NULL, mpte->mpte_cellicon_increments);
	mptcp_flush_sopts(mpte);

	/* The itfinfo array is heap-allocated only when it outgrew the
	 * embedded MPTE_ITFINFO_SIZE array; only then must it be freed. */
	if (mpte->mpte_itfinfo_size > MPTE_ITFINFO_SIZE) {
		kfree_data_counted_by(mpte->mpte_itfinfo, mpte->mpte_itfinfo_size);
	}
	mpte->mpte_itfinfo = NULL;
	mpte->mpte_itfinfo_size = 0;

	/* Release the MPTCP-level segment queue and any queued reinject mbufs */
	mptcp_freeq(mp_tp);
	m_freem_list(mpte->mpte_reinjectq);

	os_log(mptcp_log_handle, "%s - %lx: Destroying session\n",
	    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
}
517 
518 boolean_t
mptcp_ok_to_create_subflows(struct mptcb * mp_tp)519 mptcp_ok_to_create_subflows(struct mptcb *mp_tp)
520 {
521 	return mp_tp->mpt_state >= MPTCPS_ESTABLISHED &&
522 	       mp_tp->mpt_state < MPTCPS_FIN_WAIT_1 &&
523 	       !(mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP);
524 }
525 
526 static int
mptcp_synthesize_nat64(struct in6_addr * addr0,uint32_t len,const struct in_addr * addrv4_0)527 mptcp_synthesize_nat64(struct in6_addr *addr0, uint32_t len,
528     const struct in_addr *addrv4_0)
529 {
530 	static const struct in6_addr well_known_prefix = {
531 		.__u6_addr.__u6_addr8 = {0x00, 0x64, 0xff, 0x9b, 0x00, 0x00,
532 			                 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
533 			                 0x00, 0x00, 0x00, 0x00},
534 	};
535 	struct in6_addr *addr = addr0;
536 	char *ptr = (char *)addr;
537 	const struct in_addr *addrv4 = addrv4_0;
538 	const char *ptrv4 = (const char *)addrv4;
539 
540 	if (IN_ZERONET(ntohl(addrv4->s_addr)) || // 0.0.0.0/8 Source hosts on local network
541 	    IN_LOOPBACK(ntohl(addrv4->s_addr)) || // 127.0.0.0/8 Loopback
542 	    IN_LINKLOCAL(ntohl(addrv4->s_addr)) || // 169.254.0.0/16 Link Local
543 	    IN_DS_LITE(ntohl(addrv4->s_addr)) || // 192.0.0.0/29 DS-Lite
544 	    IN_6TO4_RELAY_ANYCAST(ntohl(addrv4->s_addr)) || // 192.88.99.0/24 6to4 Relay Anycast
545 	    IN_MULTICAST(ntohl(addrv4->s_addr)) || // 224.0.0.0/4 Multicast
546 	    INADDR_BROADCAST == addrv4->s_addr) { // 255.255.255.255/32 Limited Broadcast
547 		return -1;
548 	}
549 
550 	/* Check for the well-known prefix */
551 	if (len == NAT64_PREFIX_LEN_96 &&
552 	    IN6_ARE_ADDR_EQUAL(addr, &well_known_prefix)) {
553 		if (IN_PRIVATE(ntohl(addrv4->s_addr)) || // 10.0.0.0/8, 172.16.0.0/12, 192.168.0.0/16 Private-Use
554 		    IN_SHARED_ADDRESS_SPACE(ntohl(addrv4->s_addr))) { // 100.64.0.0/10 Shared Address Space
555 			return -1;
556 		}
557 	}
558 
559 	switch (len) {
560 	case NAT64_PREFIX_LEN_96:
561 		memcpy(ptr + 12, ptrv4, 4);
562 		break;
563 	case NAT64_PREFIX_LEN_64:
564 		memcpy(ptr + 9, ptrv4, 4);
565 		break;
566 	case NAT64_PREFIX_LEN_56:
567 		memcpy(ptr + 7, ptrv4, 1);
568 		memcpy(ptr + 9, ptrv4 + 1, 3);
569 		break;
570 	case NAT64_PREFIX_LEN_48:
571 		memcpy(ptr + 6, ptrv4, 2);
572 		memcpy(ptr + 9, ptrv4 + 2, 2);
573 		break;
574 	case NAT64_PREFIX_LEN_40:
575 		memcpy(ptr + 5, ptrv4, 3);
576 		memcpy(ptr + 9, ptrv4 + 3, 1);
577 		break;
578 	case NAT64_PREFIX_LEN_32:
579 		memcpy(ptr + 4, ptrv4, 4);
580 		break;
581 	default:
582 		panic("NAT64-prefix len is wrong: %u", len);
583 	}
584 
585 	return 0;
586 }
587 
588 static void
mptcp_trigger_cell_bringup(struct mptses * mpte)589 mptcp_trigger_cell_bringup(struct mptses *mpte)
590 {
591 	struct socket *mp_so = mptetoso(mpte);
592 
593 	if (!uuid_is_null(mpsotomppcb(mp_so)->necp_client_uuid)) {
594 		uuid_string_t uuidstr;
595 		int err;
596 
597 		socket_unlock(mp_so, 0);
598 		err = necp_client_assert_bb_radio_manager(mpsotomppcb(mp_so)->necp_client_uuid,
599 		    TRUE);
600 		socket_lock(mp_so, 0);
601 
602 		if (err == 0) {
603 			mpte->mpte_triggered_cell = 1;
604 		}
605 
606 		uuid_unparse_upper(mpsotomppcb(mp_so)->necp_client_uuid, uuidstr);
607 		os_log_info(mptcp_log_handle, "%s - %lx: asked irat to bringup cell for uuid %s, err %d\n",
608 		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), uuidstr, err);
609 	} else {
610 		os_log_info(mptcp_log_handle, "%s - %lx: UUID is already null\n",
611 		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
612 	}
613 }
614 
615 static boolean_t
mptcp_subflow_disconnecting(struct mptsub * mpts)616 mptcp_subflow_disconnecting(struct mptsub *mpts)
617 {
618 	if (mpts->mpts_socket->so_state & SS_ISDISCONNECTED) {
619 		return true;
620 	}
621 
622 	if (mpts->mpts_flags & (MPTSF_DISCONNECTING | MPTSF_DISCONNECTED | MPTSF_CLOSE_REQD)) {
623 		return true;
624 	}
625 
626 	if (sototcpcb(mpts->mpts_socket)->t_state == TCPS_CLOSED) {
627 		return true;
628 	}
629 
630 	return false;
631 }
632 
633 /*
634  * In Handover mode, only create cell subflow if
635  * - Symptoms marked WiFi as weak:
636  *   Here, if we are sending data, then we can check the RTO-state. That is a
637  *   stronger signal of WiFi quality than the Symptoms indicator.
638  *   If however we are not sending any data, the only thing we can do is guess
639  *   and thus bring up Cell.
640  *
641  * - Symptoms marked WiFi as unknown:
642  *   In this state we don't know what the situation is and thus remain
643  *   conservative, only bringing up cell if there are retransmissions going on.
644  */
645 static boolean_t
mptcp_handover_use_cellular(struct mptses * mpte,struct tcpcb * tp)646 mptcp_handover_use_cellular(struct mptses *mpte, struct tcpcb *tp)
647 {
648 	mptcp_wifi_quality_t wifi_quality = mptcp_wifi_quality_for_session(mpte);
649 
650 	if (wifi_quality == MPTCP_WIFI_QUALITY_GOOD) {
651 		/* WiFi is good - don't use cell */
652 		return false;
653 	}
654 
655 	if (wifi_quality == MPTCP_WIFI_QUALITY_UNSURE) {
656 		/*
657 		 * We are in unknown state, only use Cell if we have confirmed
658 		 * that WiFi is bad.
659 		 */
660 		if (mptetoso(mpte)->so_snd.sb_cc != 0 && tp->t_rxtshift >= mptcp_fail_thresh * 2) {
661 			return true;
662 		} else {
663 			return false;
664 		}
665 	}
666 
667 	if (wifi_quality == MPTCP_WIFI_QUALITY_BAD) {
668 		/*
669 		 * WiFi is confirmed to be bad from Symptoms-Framework.
670 		 * If we are sending data, check the RTOs.
671 		 * Otherwise, be pessimistic and use Cell.
672 		 */
673 		if (mptetoso(mpte)->so_snd.sb_cc != 0) {
674 			if (tp->t_rxtshift >= mptcp_fail_thresh * 2) {
675 				return true;
676 			} else {
677 				return false;
678 			}
679 		} else {
680 			return true;
681 		}
682 	}
683 
684 	return false;
685 }
686 
/*
 * Walk the session's known-interface list and create a subflow on every
 * usable interface that does not already carry a live subflow, honoring
 * the service type's policy (handover / pure-handover / target-based).
 * May instead ask the Symptoms framework for permission, and triggers
 * cellular bringup when cell is wanted but no cell interface is viable.
 */
void
mptcp_check_subflows_and_add(struct mptses *mpte)
{
	struct mptcb *mp_tp = mpte->mpte_mptcb;
	boolean_t cellular_viable = FALSE;
	boolean_t want_cellular = TRUE;
	uint32_t i;

	/* Only on an established, non-closing, non-fallback connection */
	if (!mptcp_ok_to_create_subflows(mp_tp)) {
		os_log_debug(mptcp_log_handle, "%s - %lx: not a good time for subflows, state %u flags %#x",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mp_tp->mpt_state, mp_tp->mpt_flags);
		return;
	}

	/* Just to see if we have an IP-address available */
	if (mptcp_get_session_dst(mpte, false, false) == NULL) {
		return;
	}

	for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
		boolean_t need_to_ask_symptoms = FALSE, found = FALSE;
		struct mpt_itf_info *info;
		struct sockaddr_in6 nat64pre;
		struct sockaddr *dst;
		struct mptsub *mpts;
		struct ifnet *ifp;
		uint32_t ifindex;

		info = &mpte->mpte_itfinfo[i];

		/* Unused slot */
		ifindex = info->ifindex;
		if (ifindex == IFSCOPE_NONE) {
			continue;
		}

		os_log(mptcp_log_handle, "%s - %lx: itf %u no support %u hasv4 %u has v6 %u hasnat64 %u\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), info->ifindex, info->no_mptcp_support,
		    info->has_v4_conn, info->has_v6_conn, info->has_nat64_conn);

		if (info->no_mptcp_support) {
			continue;
		}

		/* Resolve ifindex to an ifnet under the interface-list lock */
		ifnet_head_lock_shared();
		ifp = ifindex2ifnet[ifindex];
		ifnet_head_done();

		if (ifp == NULL) {
			continue;
		}

		if (IFNET_IS_CELLULAR(ifp)) {
			cellular_viable = TRUE;

			/* Handover modes never add cell while WiFi is good */
			if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER ||
			    mpte->mpte_svctype == MPTCP_SVCTYPE_PURE_HANDOVER) {
				if (mptcp_wifi_quality_for_session(mpte) == MPTCP_WIFI_QUALITY_GOOD) {
					continue;
				}
			}
		}

		/* Check the existing subflows before creating a new one */
		TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
			const struct ifnet *subifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
			struct tcpcb *tp = sototcpcb(mpts->mpts_socket);

			if (subifp == NULL) {
				continue;
			}

			/*
			 * If there is at least one functioning subflow on WiFi
			 * and we are checking for the cell interface, then
			 * we always need to ask symptoms for permission as
			 * cell is triggered even if WiFi is available.
			 */
			if (!IFNET_IS_CELLULAR(subifp) &&
			    !mptcp_subflow_disconnecting(mpts) &&
			    IFNET_IS_CELLULAR(ifp)) {
				need_to_ask_symptoms = TRUE;
			}

			if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER || mpte->mpte_svctype == MPTCP_SVCTYPE_PURE_HANDOVER) {
				os_log(mptcp_log_handle,
				    "%s - %lx: %s: cell %u wifi-state %d flags %#x rxt %u first-party %u sb_cc %u ifindex %u this %u rtt %u rttvar %u rto %u\n",
				    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
				    mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER ? "handover" : "pure-handover",
				    IFNET_IS_CELLULAR(subifp),
				    mptcp_wifi_quality_for_session(mpte),
				    mpts->mpts_flags,
				    tp->t_rxtshift,
				    !!(mpte->mpte_flags & MPTE_FIRSTPARTY),
				    mptetoso(mpte)->so_snd.sb_cc,
				    ifindex, subifp->if_index,
				    tp->t_srtt >> TCP_RTT_SHIFT,
				    tp->t_rttvar >> TCP_RTTVAR_SHIFT,
				    tp->t_rxtcur);

				if (!IFNET_IS_CELLULAR(subifp) &&
				    !mptcp_subflow_disconnecting(mpts) &&
				    (mpts->mpts_flags & MPTSF_CONNECTED) &&
				    !mptcp_handover_use_cellular(mpte, tp)) {
					found = TRUE;

					/* We found a proper subflow on WiFi - no need for cell */
					want_cellular = FALSE;
					break;
				}
			} else if (mpte->mpte_svctype == MPTCP_SVCTYPE_TARGET_BASED) {
				uint64_t time_now = mach_continuous_time();

				os_log(mptcp_log_handle,
				    "%s - %lx: target-based: %llu now %llu wifi quality %d cell %u sostat %#x mpts_flags %#x tcp-state %u\n",
				    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpte->mpte_time_target,
				    time_now, mptcp_wifi_quality_for_session(mpte),
				    IFNET_IS_CELLULAR(subifp), mpts->mpts_socket->so_state,
				    mpts->mpts_flags, sototcpcb(mpts->mpts_socket)->t_state);

				/* A WiFi subflow suffices while the target time has
				 * not been reached (or WiFi is good) */
				if (!IFNET_IS_CELLULAR(subifp) &&
				    !mptcp_subflow_disconnecting(mpts) &&
				    (mpte->mpte_time_target == 0 ||
				    (int64_t)(mpte->mpte_time_target - time_now) > 0 ||
				    mptcp_wifi_quality_for_session(mpte) == MPTCP_WIFI_QUALITY_GOOD)) {
					found = TRUE;

					want_cellular = FALSE;
					break;
				}
			}

			if (subifp->if_index == ifindex &&
			    !mptcp_subflow_disconnecting(mpts)) {
				/*
				 * We found a subflow on this interface.
				 * No need to create a new one.
				 */
				found = TRUE;
				break;
			}
		}

		if (found) {
			continue;
		}

		/* Third-party apps without a grant must go through Symptoms */
		if (need_to_ask_symptoms &&
		    !(mpte->mpte_flags & MPTE_FIRSTPARTY) &&
		    !(mpte->mpte_flags & MPTE_ACCESS_GRANTED) &&
		    mptcp_developer_mode == 0) {
			mptcp_ask_symptoms(mpte);
			return;
		}

		/* Non-NULL here: the earlier session-dst check guarantees at
		 * least one destination is known */
		dst = mptcp_get_session_dst(mpte, info->has_v6_conn, info->has_v4_conn);

		/* IPv4 destination on a v6-only interface with NAT64: synthesize */
		if (dst->sa_family == AF_INET &&
		    !info->has_v4_conn && info->has_nat64_conn) {
			struct ipv6_prefix nat64prefixes[NAT64_MAX_NUM_PREFIXES];
			int error, j;

			SOCKADDR_ZERO(&nat64pre, sizeof(struct sockaddr_in6));

			error = ifnet_get_nat64prefix(ifp, nat64prefixes);
			if (error) {
				os_log_error(mptcp_log_handle, "%s - %lx: no NAT64-prefix on itf %s, error %d\n",
				    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), ifp->if_name, error);
				continue;
			}

			/* Use the first prefix with a non-zero length */
			for (j = 0; j < NAT64_MAX_NUM_PREFIXES; j++) {
				if (nat64prefixes[j].prefix_len != 0) {
					break;
				}
			}

			VERIFY(j < NAT64_MAX_NUM_PREFIXES);

			error = mptcp_synthesize_nat64(&nat64prefixes[j].ipv6_prefix,
			    nat64prefixes[j].prefix_len,
			    &SIN(dst)->sin_addr);
			if (error != 0) {
				os_log_error(mptcp_log_handle, "%s - %lx: cannot synthesize this addr\n",
				    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
				continue;
			}

			/* Build the synthesized IPv6 destination on the stack */
			memcpy(&nat64pre.sin6_addr,
			    &nat64prefixes[j].ipv6_prefix,
			    sizeof(nat64pre.sin6_addr));
			nat64pre.sin6_len = sizeof(struct sockaddr_in6);
			nat64pre.sin6_family = AF_INET6;
			nat64pre.sin6_port = SIN(dst)->sin_port;
			nat64pre.sin6_flowinfo = 0;
			nat64pre.sin6_scope_id = 0;

			dst = SA(&nat64pre);
		}

		/* The interface must be able to reach the chosen family */
		if (dst->sa_family == AF_INET && !info->has_v4_conn) {
			continue;
		}
		if (dst->sa_family == AF_INET6 && !info->has_v6_conn) {
			continue;
		}

		mptcp_subflow_add(mpte, NULL, dst, ifindex, NULL);
	}

	if (!cellular_viable && want_cellular) {
		/* Trigger Cell Bringup */
		mptcp_trigger_cell_bringup(mpte);
	}
}
900 
901 static void
mptcp_remove_cell_subflows(struct mptses * mpte)902 mptcp_remove_cell_subflows(struct mptses *mpte)
903 {
904 	struct mptsub *mpts, *tmpts;
905 
906 	TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
907 		const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
908 
909 		if (ifp == NULL || !IFNET_IS_CELLULAR(ifp)) {
910 			continue;
911 		}
912 
913 		os_log(mptcp_log_handle, "%s - %lx: removing cell subflow\n",
914 		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
915 
916 		soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
917 	}
918 
919 	return;
920 }
921 
922 static void
mptcp_remove_wifi_subflows(struct mptses * mpte)923 mptcp_remove_wifi_subflows(struct mptses *mpte)
924 {
925 	struct mptsub *mpts, *tmpts;
926 
927 	TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
928 		const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
929 
930 		if (ifp == NULL || IFNET_IS_CELLULAR(ifp)) {
931 			continue;
932 		}
933 
934 		os_log(mptcp_log_handle, "%s - %lx: removing wifi subflow\n",
935 		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
936 
937 		soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
938 	}
939 
940 	return;
941 }
942 
/*
 * Subflow pruning for the PURE_HANDOVER service type.
 *
 * Classifies every established, non-disconnecting subflow as a working
 * cellular or working Wi-Fi subflow; a non-cellular subflow only counts
 * as "working" if the handover policy (mptcp_handover_use_cellular)
 * would not divert its traffic to cell.  Then:
 *  - working Wi-Fi found, or Wi-Fi quality is good: remove cell subflows;
 *  - otherwise, if a working cell subflow exists: remove Wi-Fi subflows;
 *  - otherwise: remove nothing.
 */
static void
mptcp_pure_handover_subflows_remove(struct mptses *mpte)
{
	mptcp_wifi_quality_t wifi_quality = mptcp_wifi_quality_for_session(mpte);
	boolean_t found_working_wifi_subflow = false;
	boolean_t found_working_cell_subflow = false;

	struct mptsub *mpts;

	/*
	 * Look for a subflow that is on a non-cellular interface in connected
	 * state.
	 *
	 * In that case, remove all cellular subflows.
	 *
	 * If however there is no working non-cellular subflow, fall back to
	 * the cellular side (see the decision at the bottom).
	 */
	TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
		const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
		struct socket *so;
		struct tcpcb *tp;

		/* Not bound to an interface yet - cannot classify */
		if (ifp == NULL) {
			continue;
		}

		so = mpts->mpts_socket;
		tp = sototcpcb(so);

		/* Only fully established, non-disconnecting subflows count */
		if (!(mpts->mpts_flags & MPTSF_CONNECTED) ||
		    tp->t_state != TCPS_ESTABLISHED ||
		    mptcp_subflow_disconnecting(mpts)) {
			continue;
		}

		if (IFNET_IS_CELLULAR(ifp)) {
			found_working_cell_subflow = true;
		} else {
			os_log_debug(mptcp_log_handle, "%s - %lx: rxt %u sb_cc %u wifi quality %d\n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), tp->t_rxtshift, mptetoso(mpte)->so_snd.sb_cc, wifi_quality);
			/* Wi-Fi only counts as working if policy wouldn't use cell */
			if (!mptcp_handover_use_cellular(mpte, tp)) {
				found_working_wifi_subflow = true;
			}
		}
	}

	/*
	 * Couldn't find a working subflow, let's not remove those on a cellular
	 * interface.
	 */
	os_log_debug(mptcp_log_handle, "%s - %lx: Found Wi-Fi: %u Found Cellular %u",
	    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
	    found_working_wifi_subflow, found_working_cell_subflow);
	if (!found_working_wifi_subflow && wifi_quality != MPTCP_WIFI_QUALITY_GOOD) {
		/* Wi-Fi unusable: keep cell; drop Wi-Fi only if cell carries us */
		if (found_working_cell_subflow) {
			mptcp_remove_wifi_subflows(mpte);
		}
		return;
	}

	mptcp_remove_cell_subflows(mpte);
}
1005 
/*
 * Subflow pruning for the HANDOVER service type.
 *
 * If at least one established subflow on a non-cellular interface is
 * working well enough that the handover policy would not use cellular,
 * all cellular subflows are removed.  Otherwise nothing is removed.
 */
static void
mptcp_handover_subflows_remove(struct mptses *mpte)
{
	mptcp_wifi_quality_t wifi_quality = mptcp_wifi_quality_for_session(mpte);
	boolean_t found_working_subflow = false;
	struct mptsub *mpts;

	/*
	 * Look for a subflow that is on a non-cellular interface
	 * and actually works (aka, no retransmission timeout).
	 */
	TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
		const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
		struct socket *so;
		struct tcpcb *tp;

		/* Skip unbound and cellular subflows */
		if (ifp == NULL || IFNET_IS_CELLULAR(ifp)) {
			continue;
		}

		so = mpts->mpts_socket;
		tp = sototcpcb(so);

		/* Only fully established subflows are candidates */
		if (!(mpts->mpts_flags & MPTSF_CONNECTED) ||
		    tp->t_state != TCPS_ESTABLISHED) {
			continue;
		}

		os_log_debug(mptcp_log_handle, "%s - %lx: rxt %u sb_cc %u wifi quality %d\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), tp->t_rxtshift, mptetoso(mpte)->so_snd.sb_cc, wifi_quality);

		/* One good non-cell subflow is enough - stop searching */
		if (!mptcp_handover_use_cellular(mpte, tp)) {
			found_working_subflow = true;
			break;
		}
	}

	/*
	 * Couldn't find a working subflow, let's not remove those on a cellular
	 * interface.
	 */
	if (!found_working_subflow) {
		return;
	}

	mptcp_remove_cell_subflows(mpte);
}
1053 
/*
 * Subflow pruning for the TARGET_BASED service type.
 *
 * NOTE(review): the guard fires when mpte_time_target is set, the
 * target time has been reached or passed ((int64_t)(target - now) <= 0),
 * and Wi-Fi quality is not good - in that case no subflow is removed.
 * The inline comment says "below the target"; confirm intended polarity
 * against the scheduler's use of mpte_time_target.
 *
 * Otherwise, the first connected non-disconnecting Wi-Fi subflow found
 * triggers removal of all cellular subflows.
 */
static void
mptcp_targetbased_subflows_remove(struct mptses *mpte)
{
	uint64_t time_now = mach_continuous_time();
	struct mptsub *mpts;

	if (mpte->mpte_time_target != 0 &&
	    (int64_t)(mpte->mpte_time_target - time_now) <= 0 &&
	    mptcp_wifi_quality_for_session(mpte) != MPTCP_WIFI_QUALITY_GOOD) {
		/* WiFi is bad and we are below the target - don't remove any subflows */
		return;
	}

	TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
		const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;

		/* Skip unbound and cellular subflows */
		if (ifp == NULL || IFNET_IS_CELLULAR(ifp)) {
			continue;
		}

		/* We have a functioning subflow on WiFi. No need for cell! */
		if (mpts->mpts_flags & MPTSF_CONNECTED &&
		    !mptcp_subflow_disconnecting(mpts)) {
			mptcp_remove_cell_subflows(mpte);
			break;
		}
	}
}
1082 
1083 /*
1084  * Based on the MPTCP Service-type and the state of the subflows, we
1085  * will destroy subflows here.
1086  */
1087 void
mptcp_check_subflows_and_remove(struct mptses * mpte)1088 mptcp_check_subflows_and_remove(struct mptses *mpte)
1089 {
1090 	if (!mptcp_ok_to_create_subflows(mpte->mpte_mptcb)) {
1091 		return;
1092 	}
1093 
1094 	socket_lock_assert_owned(mptetoso(mpte));
1095 
1096 	if (mpte->mpte_svctype == MPTCP_SVCTYPE_PURE_HANDOVER) {
1097 		mptcp_pure_handover_subflows_remove(mpte);
1098 	}
1099 
1100 	if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER) {
1101 		mptcp_handover_subflows_remove(mpte);
1102 	}
1103 
1104 	if (mpte->mpte_svctype == MPTCP_SVCTYPE_TARGET_BASED) {
1105 		mptcp_targetbased_subflows_remove(mpte);
1106 	}
1107 }
1108 
/*
 * Kill subflows that no longer match a usable interface.
 *
 * A subflow is torn down (by posting a NOSRCADDR socket event) when:
 *  - NECP flagged it for closure (MPTSF_CLOSE_REQD), or
 *  - the interface it is bound to no longer appears in the session's
 *    interface-info array with connectivity for the subflow's
 *    destination address family.
 */
static void
mptcp_remove_subflows(struct mptses *mpte)
{
	struct mptsub *mpts, *tmpts;

	if (!mptcp_ok_to_create_subflows(mpte->mpte_mptcb)) {
		return;
	}

	TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
		const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
		boolean_t found = false;
		uint32_t ifindex;
		uint32_t i;

		/* NECP asked for this subflow's closure - honor it and move on */
		if (mpts->mpts_flags & MPTSF_CLOSE_REQD) {
			mpts->mpts_flags &= ~MPTSF_CLOSE_REQD;

			os_log(mptcp_log_handle, "%s - %lx: itf %u close_reqd last itf %d\n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpts->mpts_ifscope,
			    ifp ? ifp->if_index : -1);
			soevent(mpts->mpts_socket,
			    SO_FILT_HINT_LOCKED | SO_FILT_HINT_NOSRCADDR);

			continue;
		}

		/* Not bound to anything - nothing to match against */
		if (ifp == NULL && mpts->mpts_ifscope == IFSCOPE_NONE) {
			continue;
		}

		/* Prefer the actual output interface; fall back to the scope */
		if (ifp) {
			ifindex = ifp->if_index;
		} else {
			ifindex = mpts->mpts_ifscope;
		}

		/* Does the itfinfo list still offer this interface + family? */
		for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
			if (mpte->mpte_itfinfo[i].ifindex == IFSCOPE_NONE) {
				continue;
			}

			if (mpte->mpte_itfinfo[i].ifindex == ifindex) {
				/* v6 destinations are also reachable through NAT64 */
				if (mpts->mpts_dst.sa_family == AF_INET6 &&
				    (mpte->mpte_itfinfo[i].has_v6_conn || mpte->mpte_itfinfo[i].has_nat64_conn)) {
					found = true;
					break;
				}

				if (mpts->mpts_dst.sa_family == AF_INET &&
				    mpte->mpte_itfinfo[i].has_v4_conn) {
					found = true;
					break;
				}
			}
		}

		if (!found) {
			os_log(mptcp_log_handle, "%s - %lx: itf %u killing %#x\n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
			    ifindex, mpts->mpts_flags);

			soevent(mpts->mpts_socket,
			    SO_FILT_HINT_LOCKED | SO_FILT_HINT_NOSRCADDR);
		}
	}
}
1176 
/*
 * Deferred worker, scheduled via timeout() by mptcp_sched_create_subflows().
 *
 * Walks every MPTCP connection and, for those flagged with
 * MPP_CREATE_SUBFLOWS (and with interface info initialized), creates
 * missing subflows and removes stale ones.  The per-socket use-count
 * taken by mptcp_sched_create_subflows() is dropped here.
 */
static void
mptcp_create_subflows(__unused void *arg)
{
	struct mppcb *mpp;

	/*
	 * Start with clearing, because we might be processing connections
	 * while a new event comes in.
	 */
	if (OSTestAndClear(0x01, &mptcp_create_subflows_scheduled)) {
		os_log_error(mptcp_log_handle, "%s: bit was already cleared!\n", __func__);
	}

	/* Iterate over all MPTCP connections */

	lck_mtx_lock(&mtcbinfo.mppi_lock);

	TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
		struct socket *mp_so = mpp->mpp_socket;
		struct mptses *mpte = mpp->mpp_pcbe;

		/* Skip sessions with no pending request or no itf info yet */
		socket_lock(mp_so, 1);
		if (!(mpp->mpp_flags & MPP_CREATE_SUBFLOWS) ||
		    !(mpte->mpte_flags & MPTE_ITFINFO_INIT)) {
			socket_unlock(mp_so, 1);
			continue;
		}

		/* The scheduler's extra use-count keeps the socket alive */
		VERIFY(mp_so->so_usecount > 0);

		mpp->mpp_flags &= ~MPP_CREATE_SUBFLOWS;

		mptcp_check_subflows_and_add(mpte);
		mptcp_remove_subflows(mpte);

		mp_so->so_usecount--; /* See mptcp_sched_create_subflows */
		socket_unlock(mp_so, 1);
	}

	lck_mtx_unlock(&mtcbinfo.mppi_lock);
}
1218 
1219 /*
1220  * We need this because we are coming from an NECP-event. This event gets posted
1221  * while holding NECP-locks. The creation of the subflow however leads us back
1222  * into NECP (e.g., to add the necp_cb and also from tcp_connect).
1223  * So, we would deadlock there as we already hold the NECP-lock.
1224  *
1225  * So, let's schedule this separately. It also gives NECP the chance to make
1226  * progress, without having to wait for MPTCP to finish its subflow creation.
1227  */
1228 void
mptcp_sched_create_subflows(struct mptses * mpte)1229 mptcp_sched_create_subflows(struct mptses *mpte)
1230 {
1231 	struct mppcb *mpp = mpte->mpte_mppcb;
1232 	struct mptcb *mp_tp = mpte->mpte_mptcb;
1233 	struct socket *mp_so = mpp->mpp_socket;
1234 
1235 	if (!mptcp_ok_to_create_subflows(mp_tp)) {
1236 		os_log_debug(mptcp_log_handle, "%s - %lx: not a good time for subflows, state %u flags %#x",
1237 		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mp_tp->mpt_state, mp_tp->mpt_flags);
1238 		return;
1239 	}
1240 
1241 	if (!(mpp->mpp_flags & MPP_CREATE_SUBFLOWS)) {
1242 		mp_so->so_usecount++; /* To prevent it from being free'd in-between */
1243 		mpp->mpp_flags |= MPP_CREATE_SUBFLOWS;
1244 	}
1245 
1246 	if (OSTestAndSet(0x01, &mptcp_create_subflows_scheduled)) {
1247 		return;
1248 	}
1249 
1250 	/* Do the call in 100ms to allow NECP to schedule it on all sockets */
1251 	timeout(mptcp_create_subflows, NULL, hz / 10);
1252 }
1253 
1254 /*
1255  * Allocate an MPTCP socket option structure.
1256  */
1257 struct mptopt *
mptcp_sopt_alloc(void)1258 mptcp_sopt_alloc(void)
1259 {
1260 	return zalloc_flags(mptopt_zone, Z_WAITOK | Z_ZERO);
1261 }
1262 
1263 /*
1264  * Free an MPTCP socket option structure.
1265  */
1266 void
mptcp_sopt_free(struct mptopt * mpo)1267 mptcp_sopt_free(struct mptopt *mpo)
1268 {
1269 	VERIFY(!(mpo->mpo_flags & MPOF_ATTACHED));
1270 
1271 	zfree(mptopt_zone, mpo);
1272 }
1273 
1274 /*
1275  * Add a socket option to the MPTCP socket option list.
1276  */
1277 void
mptcp_sopt_insert(struct mptses * mpte,struct mptopt * mpo)1278 mptcp_sopt_insert(struct mptses *mpte, struct mptopt *mpo)
1279 {
1280 	socket_lock_assert_owned(mptetoso(mpte));
1281 	mpo->mpo_flags |= MPOF_ATTACHED;
1282 	TAILQ_INSERT_TAIL(&mpte->mpte_sopts, mpo, mpo_entry);
1283 }
1284 
1285 /*
1286  * Remove a socket option from the MPTCP socket option list.
1287  */
1288 void
mptcp_sopt_remove(struct mptses * mpte,struct mptopt * mpo)1289 mptcp_sopt_remove(struct mptses *mpte, struct mptopt *mpo)
1290 {
1291 	socket_lock_assert_owned(mptetoso(mpte));
1292 	VERIFY(mpo->mpo_flags & MPOF_ATTACHED);
1293 	mpo->mpo_flags &= ~MPOF_ATTACHED;
1294 	TAILQ_REMOVE(&mpte->mpte_sopts, mpo, mpo_entry);
1295 }
1296 
1297 /*
1298  * Search for an existing <sopt_level,sopt_name> socket option.
1299  */
1300 struct mptopt *
mptcp_sopt_find(struct mptses * mpte,struct sockopt * sopt)1301 mptcp_sopt_find(struct mptses *mpte, struct sockopt *sopt)
1302 {
1303 	struct mptopt *mpo;
1304 
1305 	socket_lock_assert_owned(mptetoso(mpte));
1306 
1307 	TAILQ_FOREACH(mpo, &mpte->mpte_sopts, mpo_entry) {
1308 		if (mpo->mpo_level == sopt->sopt_level &&
1309 		    mpo->mpo_name == sopt->sopt_name) {
1310 			break;
1311 		}
1312 	}
1313 	return mpo;
1314 }
1315 
1316 /*
1317  * Allocate a MPTCP subflow structure.
1318  */
1319 static struct mptsub *
mptcp_subflow_alloc(void)1320 mptcp_subflow_alloc(void)
1321 {
1322 	return zalloc_flags(mptsub_zone, Z_WAITOK | Z_ZERO);
1323 }
1324 
1325 /*
1326  * Deallocate a subflow structure, called when all of the references held
1327  * on it have been released.  This implies that the subflow has been deleted.
1328  */
1329 static void
mptcp_subflow_free(struct mptsub * mpts)1330 mptcp_subflow_free(struct mptsub *mpts)
1331 {
1332 	VERIFY(mpts->mpts_refcnt == 0);
1333 	VERIFY(mpts->mpts_mpte == NULL);
1334 	VERIFY(mpts->mpts_socket == NULL);
1335 
1336 	free_sockaddr(mpts->mpts_src);
1337 
1338 	zfree(mptsub_zone, mpts);
1339 }
1340 
/*
 * Take a reference on the subflow; panics on refcount wraparound.
 */
static void
mptcp_subflow_addref(struct mptsub *mpts)
{
	if (++mpts->mpts_refcnt == 0) {
		panic("%s: mpts %p wraparound refcnt", __func__, mpts);
		/* NOTREACHED */
	}
}
1349 
/*
 * Drop a reference on the subflow; the last reference frees it.
 * Panics on refcount underflow.
 */
static void
mptcp_subflow_remref(struct mptsub *mpts)
{
	if (mpts->mpts_refcnt == 0) {
		panic("%s: mpts %p negative refcnt", __func__, mpts);
		/* NOTREACHED */
	}
	if (--mpts->mpts_refcnt > 0) {
		return;
	}

	/* Last reference dropped - mptcp_subflow_free() destroys the structure */
	mptcp_subflow_free(mpts);
}
1364 
/*
 * Link a freshly created subflow socket into the MPTCP session.
 *
 * Wires the tcpcb to the mptcb, marks the socket as an MPTCP subflow,
 * bumps the MP socket's use-count, inserts the subflow into the
 * session's list, and takes two subflow references (list membership
 * and the subflow socket's back-pointer).
 */
static void
mptcp_subflow_attach(struct mptses *mpte, struct mptsub *mpts, struct socket *so)
{
	struct socket *mp_so = mpte->mpte_mppcb->mpp_socket;
	struct tcpcb *tp = sototcpcb(so);

	/*
	 * From this moment on, the subflow is linked to the MPTCP-connection.
	 * Locking,... happens now at the MPTCP-layer
	 */
	tp->t_mptcb = mpte->mpte_mptcb;
	so->so_flags |= SOF_MP_SUBFLOW;
	mp_so->so_usecount++;

	/*
	 * Insert the subflow into the list, and associate the MPTCP PCB
	 * as well as the subflow socket.  From this point on, removing
	 * the subflow needs to be done via mptcp_subflow_del().
	 */
	TAILQ_INSERT_TAIL(&mpte->mpte_subflows, mpts, mpts_entry);
	mpte->mpte_numflows++;

	mpts->mpts_mpte = mpte;
	mpts->mpts_socket = so;
	tp->t_mpsub = mpts;
	mptcp_subflow_addref(mpts);     /* for being in MPTCP subflow list */
	mptcp_subflow_addref(mpts);     /* for subflow socket */
}
1393 
/*
 * NECP callback for subflow sockets.
 *
 * Called by NECP when a subflow's path changes state.  A low-power
 * interface is treated the same as a non-viable one.  Non-viable
 * subflows are flagged for closure (MPTSF_CLOSE_REQD) and the
 * subflow-management worker is scheduled; actual removal happens in
 * mptcp_remove_subflows().  For handover/target-based sessions the
 * client is reported as still viable, since MPTCP can migrate traffic
 * to another subflow.
 */
static void
mptcp_subflow_necp_cb(void *handle, __unused int action,
    __unused uint32_t interface_index,
    uint32_t necp_flags, bool *viable)
{
	boolean_t low_power = !!(necp_flags & NECP_CLIENT_RESULT_FLAG_INTERFACE_LOW_POWER);
	struct inpcb *inp = (struct inpcb *)handle;
	struct socket *so = inp->inp_socket;
	struct mptsub *mpts;
	struct mptses *mpte;

	if (low_power) {
		action = NECP_CLIENT_CBACTION_NONVIABLE;
	}

	/* Only non-viability is acted upon here */
	if (action != NECP_CLIENT_CBACTION_NONVIABLE) {
		return;
	}

	/*
	 * The socket is being garbage-collected. There is nothing to be done
	 * here.
	 */
	if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) == WNT_STOPUSING) {
		return;
	}

	socket_lock(so, 1);

	/* Check again after we acquired the lock. */
	if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) {
		goto out;
	}

	mpte = tptomptp(sototcpcb(so))->mpt_mpte;
	mpts = sototcpcb(so)->t_mpsub;

	os_log_debug(mptcp_log_handle, "%s - %lx: Subflow on itf %u became non-viable, power %u",
	    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpts->mpts_ifscope, low_power);

	mpts->mpts_flags |= MPTSF_CLOSE_REQD;

	mptcp_sched_create_subflows(mpte);

	/* Session can fail over to another path - report client as viable */
	if ((mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER ||
	    mpte->mpte_svctype == MPTCP_SVCTYPE_PURE_HANDOVER ||
	    mpte->mpte_svctype == MPTCP_SVCTYPE_TARGET_BASED) &&
	    viable != NULL) {
		*viable = 1;
	}

out:
	socket_unlock(so, 1);
}
1448 
1449 /*
1450  * Create an MPTCP subflow socket.
1451  */
1452 static int
mptcp_subflow_socreate(struct mptses * mpte,struct mptsub * mpts,int dom,struct socket ** so)1453 mptcp_subflow_socreate(struct mptses *mpte, struct mptsub *mpts, int dom,
1454     struct socket **so)
1455 {
1456 	lck_mtx_t *subflow_mtx;
1457 	struct mptopt smpo, *mpo, *tmpo;
1458 	struct proc *p;
1459 	struct socket *mp_so;
1460 	struct mppcb *mpp;
1461 	int error;
1462 
1463 	*so = NULL;
1464 
1465 	mp_so = mptetoso(mpte);
1466 	mpp = mpsotomppcb(mp_so);
1467 
1468 	p = proc_find(mp_so->last_pid);
1469 	if (p == PROC_NULL) {
1470 		os_log_error(mptcp_log_handle, "%s - %lx: Couldn't find proc for pid %u\n",
1471 		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mp_so->last_pid);
1472 
1473 		mptcp_subflow_free(mpts);
1474 		return ESRCH;
1475 	}
1476 
1477 	/*
1478 	 * Create the subflow socket (multipath subflow, non-blocking.)
1479 	 *
1480 	 * This will cause SOF_MP_SUBFLOW socket flag to be set on the subflow
1481 	 * socket; it will be cleared when the socket is peeled off or closed.
1482 	 * It also indicates to the underlying TCP to handle MPTCP options.
1483 	 * A multipath subflow socket implies SS_NOFDREF state.
1484 	 */
1485 
1486 	/*
1487 	 * Unlock, because tcp_usr_attach ends up in in_pcballoc, which takes
1488 	 * the ipi-lock. We cannot hold the socket-lock at that point.
1489 	 */
1490 	socket_unlock(mp_so, 0);
1491 	error = socreate_internal(dom, so, SOCK_STREAM, IPPROTO_TCP, p,
1492 	    SOCF_MPTCP, PROC_NULL);
1493 	socket_lock(mp_so, 0);
1494 	if (error) {
1495 		os_log_error(mptcp_log_handle, "%s - %lx: unable to create subflow socket error %d\n",
1496 		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error);
1497 
1498 		proc_rele(p);
1499 
1500 		mptcp_subflow_free(mpts);
1501 		return error;
1502 	}
1503 
1504 	/*
1505 	 * We need to protect the setting of SOF_MP_SUBFLOW with a lock, because
1506 	 * this marks the moment of lock-switch from the TCP-lock to the MPTCP-lock.
1507 	 * Which is why we also need to get the lock with pr_getlock, as after
1508 	 * setting the flag, socket_unlock will work on the MPTCP-level lock.
1509 	 */
1510 	subflow_mtx = ((*so)->so_proto->pr_getlock)(*so, 0);
1511 	lck_mtx_lock(subflow_mtx);
1512 
1513 	/*
1514 	 * Must be the first thing we do, to make sure all pointers for this
1515 	 * subflow are set.
1516 	 */
1517 	mptcp_subflow_attach(mpte, mpts, *so);
1518 
1519 	/*
1520 	 * A multipath subflow socket is used internally in the kernel,
1521 	 * therefore it does not have a file desciptor associated by
1522 	 * default.
1523 	 */
1524 	(*so)->so_state |= SS_NOFDREF;
1525 
1526 	lck_mtx_unlock(subflow_mtx);
1527 
1528 	/* prevent the socket buffers from being compressed */
1529 	(*so)->so_rcv.sb_flags |= SB_NOCOMPRESS;
1530 	(*so)->so_snd.sb_flags |= SB_NOCOMPRESS;
1531 
1532 	/* Inherit preconnect and TFO data flags */
1533 	if (mp_so->so_flags1 & SOF1_PRECONNECT_DATA) {
1534 		(*so)->so_flags1 |= SOF1_PRECONNECT_DATA;
1535 	}
1536 	if (mp_so->so_flags1 & SOF1_DATA_IDEMPOTENT) {
1537 		(*so)->so_flags1 |= SOF1_DATA_IDEMPOTENT;
1538 	}
1539 	if (mp_so->so_flags1 & SOF1_DATA_AUTHENTICATED) {
1540 		(*so)->so_flags1 |= SOF1_DATA_AUTHENTICATED;
1541 	}
1542 
1543 	/* Inherit uuid and create the related flow. */
1544 	if (!uuid_is_null(mpp->necp_client_uuid)) {
1545 		struct mptcb *mp_tp = mpte->mpte_mptcb;
1546 
1547 		sotoinpcb(*so)->necp_cb = mptcp_subflow_necp_cb;
1548 
1549 		/*
1550 		 * A note on the unlock: With MPTCP, we do multiple times a
1551 		 * necp_client_register_socket_flow. This is problematic,
1552 		 * because now the lock-ordering guarantee (first necp-locks,
1553 		 * then socket-locks) is no more respected. So, we need to
1554 		 * unlock here.
1555 		 */
1556 		socket_unlock(mp_so, 0);
1557 		error = necp_client_register_socket_flow(mp_so->last_pid,
1558 		    mpp->necp_client_uuid, sotoinpcb(*so));
1559 		socket_lock(mp_so, 0);
1560 
1561 		if (error) {
1562 			os_log_error(mptcp_log_handle, "%s - %lx: necp_client_register_socket_flow failed with error %d\n",
1563 			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error);
1564 
1565 			goto out_err;
1566 		}
1567 
1568 		/* Possible state-change during the unlock above */
1569 		if (mp_tp->mpt_state >= MPTCPS_TIME_WAIT ||
1570 		    (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP)) {
1571 			os_log_error(mptcp_log_handle, "%s - %lx: state changed during unlock: %u flags %#x\n",
1572 			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
1573 			    mp_tp->mpt_state, mp_tp->mpt_flags);
1574 
1575 			error = EINVAL;
1576 			goto out_err;
1577 		}
1578 
1579 		uuid_copy(sotoinpcb(*so)->necp_client_uuid, mpp->necp_client_uuid);
1580 	}
1581 
1582 	if (mpp->inp_necp_attributes.inp_domain != NULL) {
1583 		char *buffer = NULL;
1584 		size_t string_size = strlen(mpp->inp_necp_attributes.inp_domain);
1585 		buffer = kalloc_data(string_size + 1, Z_WAITOK | Z_ZERO);
1586 		if (buffer != NULL) {
1587 			sotoinpcb(*so)->inp_necp_attributes.inp_domain = strlcpy_ret(buffer, mpp->inp_necp_attributes.inp_domain, string_size + 1);
1588 		} else {
1589 			sotoinpcb(*so)->inp_necp_attributes.inp_domain = NULL;
1590 		}
1591 	}
1592 	if (mpp->inp_necp_attributes.inp_account != NULL) {
1593 		char *buffer = NULL;
1594 		size_t string_size = strlen(mpp->inp_necp_attributes.inp_account);
1595 		buffer = kalloc_data(string_size + 1, Z_WAITOK | Z_ZERO);
1596 		if (buffer != NULL) {
1597 			sotoinpcb(*so)->inp_necp_attributes.inp_account = strlcpy_ret(buffer, mpp->inp_necp_attributes.inp_account, string_size + 1);
1598 		} else {
1599 			sotoinpcb(*so)->inp_necp_attributes.inp_account = NULL;
1600 		}
1601 	}
1602 
1603 	if (mpp->inp_necp_attributes.inp_domain_owner != NULL) {
1604 		char *buffer = NULL;
1605 		size_t string_size = strlen(mpp->inp_necp_attributes.inp_domain_owner);
1606 		buffer = kalloc_data(string_size + 1, Z_WAITOK | Z_ZERO);
1607 		if (buffer != NULL) {
1608 			sotoinpcb(*so)->inp_necp_attributes.inp_domain_owner = strlcpy_ret(buffer, mpp->inp_necp_attributes.inp_domain_owner, string_size + 1);
1609 		} else {
1610 			sotoinpcb(*so)->inp_necp_attributes.inp_domain_owner = NULL;
1611 		}
1612 	}
1613 
1614 	if (mpp->inp_necp_attributes.inp_tracker_domain != NULL) {
1615 		char *buffer = NULL;
1616 		size_t string_size = strlen(mpp->inp_necp_attributes.inp_tracker_domain);
1617 		buffer = kalloc_data(string_size + 1, Z_WAITOK | Z_ZERO);
1618 		if (buffer != NULL) {
1619 			sotoinpcb(*so)->inp_necp_attributes.inp_tracker_domain = strlcpy_ret(buffer, mpp->inp_necp_attributes.inp_tracker_domain, string_size + 1);
1620 		} else {
1621 			sotoinpcb(*so)->inp_necp_attributes.inp_tracker_domain = NULL;
1622 		}
1623 	}
1624 
1625 	/* Needs to happen prior to the delegation! */
1626 	(*so)->last_pid = mp_so->last_pid;
1627 
1628 	if (mp_so->so_flags & SOF_DELEGATED) {
1629 		if (mpte->mpte_epid) {
1630 			error = so_set_effective_pid(*so, mpte->mpte_epid, p, false);
1631 			if (error) {
1632 				os_log_error(mptcp_log_handle, "%s - %lx: so_set_effective_pid failed with error %d\n",
1633 				    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error);
1634 				goto out_err;
1635 			}
1636 		}
1637 		if (!uuid_is_null(mpte->mpte_euuid)) {
1638 			error = so_set_effective_uuid(*so, mpte->mpte_euuid, p, false);
1639 			if (error) {
1640 				os_log_error(mptcp_log_handle, "%s - %lx: so_set_effective_uuid failed with error %d\n",
1641 				    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error);
1642 				goto out_err;
1643 			}
1644 		}
1645 	}
1646 
1647 	/* inherit the other socket options */
1648 	bzero(&smpo, sizeof(smpo));
1649 	smpo.mpo_flags |= MPOF_SUBFLOW_OK;
1650 	smpo.mpo_level = SOL_SOCKET;
1651 	smpo.mpo_intval = 1;
1652 
1653 	/* disable SIGPIPE */
1654 	smpo.mpo_name = SO_NOSIGPIPE;
1655 	if ((error = mptcp_subflow_sosetopt(mpte, mpts, &smpo)) != 0) {
1656 		goto out_err;
1657 	}
1658 
1659 	/* find out if the subflow's source address goes away */
1660 	smpo.mpo_name = SO_NOADDRERR;
1661 	if ((error = mptcp_subflow_sosetopt(mpte, mpts, &smpo)) != 0) {
1662 		goto out_err;
1663 	}
1664 
1665 	if (mpte->mpte_mptcb->mpt_state >= MPTCPS_ESTABLISHED) {
1666 		/*
1667 		 * On secondary subflows we might need to set the cell-fallback
1668 		 * flag (see conditions in mptcp_subflow_sosetopt).
1669 		 */
1670 		smpo.mpo_level = SOL_SOCKET;
1671 		smpo.mpo_name = SO_MARK_CELLFALLBACK;
1672 		smpo.mpo_intval = 1;
1673 		if ((error = mptcp_subflow_sosetopt(mpte, mpts, &smpo)) != 0) {
1674 			goto out_err;
1675 		}
1676 	}
1677 
1678 	/* replay setsockopt(2) on the subflow sockets for eligible options */
1679 	TAILQ_FOREACH_SAFE(mpo, &mpte->mpte_sopts, mpo_entry, tmpo) {
1680 		int interim;
1681 
1682 		if (!(mpo->mpo_flags & MPOF_SUBFLOW_OK)) {
1683 			continue;
1684 		}
1685 
1686 		/*
1687 		 * Skip those that are handled internally; these options
1688 		 * should not have been recorded and marked with the
1689 		 * MPOF_SUBFLOW_OK by mptcp_setopt(), but just in case.
1690 		 */
1691 		if (mpo->mpo_level == SOL_SOCKET &&
1692 		    (mpo->mpo_name == SO_NOSIGPIPE ||
1693 		    mpo->mpo_name == SO_NOADDRERR ||
1694 		    mpo->mpo_name == SO_KEEPALIVE)) {
1695 			continue;
1696 		}
1697 
1698 		interim = (mpo->mpo_flags & MPOF_INTERIM);
1699 		if (mptcp_subflow_sosetopt(mpte, mpts, mpo) != 0 && interim) {
1700 			os_log_error(mptcp_log_handle, "%s - %lx: sopt %s val %d interim record removed\n",
1701 			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
1702 			    mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name),
1703 			    mpo->mpo_intval);
1704 			mptcp_sopt_remove(mpte, mpo);
1705 			mptcp_sopt_free(mpo);
1706 			continue;
1707 		}
1708 	}
1709 
1710 	/*
1711 	 * We need to receive everything that the subflow socket has,
1712 	 * so use a customized socket receive function.  We will undo
1713 	 * this when the socket is peeled off or closed.
1714 	 */
1715 	switch (dom) {
1716 	case PF_INET:
1717 		(*so)->so_proto = &mptcp_subflow_protosw;
1718 		break;
1719 	case PF_INET6:
1720 		(*so)->so_proto = (struct protosw *)&mptcp_subflow_protosw6;
1721 		break;
1722 	default:
1723 		VERIFY(0);
1724 		/* NOTREACHED */
1725 	}
1726 
1727 	proc_rele(p);
1728 
1729 	DTRACE_MPTCP3(subflow__create, struct mptses *, mpte,
1730 	    int, dom, int, error);
1731 
1732 	return 0;
1733 
1734 out_err:
1735 	mptcp_subflow_abort(mpts, error);
1736 
1737 	proc_rele(p);
1738 
1739 	return error;
1740 }
1741 
1742 /*
1743  * Close an MPTCP subflow socket.
1744  *
1745  * Note that this may be called on an embryonic subflow, and the only
1746  * thing that is guaranteed valid is the protocol-user request.
1747  */
1748 static void
mptcp_subflow_soclose(struct mptsub * mpts)1749 mptcp_subflow_soclose(struct mptsub *mpts)
1750 {
1751 	struct socket *so = mpts->mpts_socket;
1752 
1753 	if (mpts->mpts_flags & MPTSF_CLOSED) {
1754 		return;
1755 	}
1756 
1757 	VERIFY(so != NULL);
1758 	VERIFY(so->so_flags & SOF_MP_SUBFLOW);
1759 	VERIFY((so->so_state & (SS_NBIO | SS_NOFDREF)) == (SS_NBIO | SS_NOFDREF));
1760 
1761 	DTRACE_MPTCP5(subflow__close, struct mptsub *, mpts,
1762 	    struct socket *, so,
1763 	    struct sockbuf *, &so->so_rcv,
1764 	    struct sockbuf *, &so->so_snd,
1765 	    struct mptses *, mpts->mpts_mpte);
1766 
1767 	mpts->mpts_flags |= MPTSF_CLOSED;
1768 
1769 	if (so->so_retaincnt == 0) {
1770 		soclose_locked(so);
1771 
1772 		return;
1773 	} else {
1774 		VERIFY(so->so_usecount > 0);
1775 		so->so_usecount--;
1776 	}
1777 
1778 	return;
1779 }
1780 
1781 static void
mptcp_attach_to_subf(struct socket * so,struct mptcb * mp_tp,uint8_t addr_id)1782 mptcp_attach_to_subf(struct socket *so, struct mptcb *mp_tp, uint8_t addr_id)
1783 {
1784 	struct tcpcb *tp = sototcpcb(so);
1785 	struct mptcp_subf_auth_entry *sauth_entry;
1786 
1787 	/*
1788 	 * The address ID of the first flow is implicitly 0.
1789 	 */
1790 	if (mp_tp->mpt_state == MPTCPS_CLOSED) {
1791 		tp->t_local_aid = 0;
1792 	} else {
1793 		tp->t_local_aid = addr_id;
1794 		tp->t_mpflags |= (TMPF_PREESTABLISHED | TMPF_JOINED_FLOW);
1795 		so->so_flags |= SOF_MP_SEC_SUBFLOW;
1796 	}
1797 	sauth_entry = zalloc(mpt_subauth_zone);
1798 	sauth_entry->msae_laddr_id = tp->t_local_aid;
1799 	sauth_entry->msae_raddr_id = 0;
1800 	sauth_entry->msae_raddr_rand = 0;
1801 try_again:
1802 	sauth_entry->msae_laddr_rand = RandomULong();
1803 	if (sauth_entry->msae_laddr_rand == 0) {
1804 		goto try_again;
1805 	}
1806 	LIST_INSERT_HEAD(&mp_tp->mpt_subauth_list, sauth_entry, msae_next);
1807 }
1808 
1809 static void
mptcp_detach_mptcb_from_subf(struct mptcb * mp_tp,struct socket * so)1810 mptcp_detach_mptcb_from_subf(struct mptcb *mp_tp, struct socket *so)
1811 {
1812 	struct mptcp_subf_auth_entry *sauth_entry;
1813 	struct tcpcb *tp = NULL;
1814 	int found = 0;
1815 
1816 	tp = sototcpcb(so);
1817 	if (tp == NULL) {
1818 		return;
1819 	}
1820 
1821 	LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
1822 		if (sauth_entry->msae_laddr_id == tp->t_local_aid) {
1823 			found = 1;
1824 			break;
1825 		}
1826 	}
1827 	if (found) {
1828 		LIST_REMOVE(sauth_entry, msae_next);
1829 	}
1830 
1831 	if (found) {
1832 		zfree(mpt_subauth_zone, sauth_entry);
1833 	}
1834 }
1835 
1836 /*
1837  * Connect an MPTCP subflow socket.
1838  *
1839  * Note that in the pending connect case, the subflow socket may have been
1840  * bound to an interface and/or a source IP address which may no longer be
1841  * around by the time this routine is called; in that case the connect attempt
1842  * will most likely fail.
1843  */
1844 static int
mptcp_subflow_soconnectx(struct mptses * mpte,struct mptsub * mpts)1845 mptcp_subflow_soconnectx(struct mptses *mpte, struct mptsub *mpts)
1846 {
1847 	char dbuf[MAX_IPv6_STR_LEN];
1848 	struct socket *mp_so, *so;
1849 	struct mptcb *mp_tp;
1850 	struct sockaddr *dst;
1851 	struct proc *p;
1852 	int af, error, dport;
1853 
1854 	mp_so = mptetoso(mpte);
1855 	mp_tp = mpte->mpte_mptcb;
1856 	so = mpts->mpts_socket;
1857 	af = mpts->mpts_dst.sa_family;
1858 	dst = &mpts->mpts_dst;
1859 
1860 	VERIFY((mpts->mpts_flags & (MPTSF_CONNECTING | MPTSF_CONNECTED)) == MPTSF_CONNECTING);
1861 	VERIFY(mpts->mpts_socket != NULL);
1862 	VERIFY(af == AF_INET || af == AF_INET6);
1863 
1864 	if (af == AF_INET) {
1865 		inet_ntop(af, &SIN(dst)->sin_addr.s_addr, dbuf, sizeof(dbuf));
1866 		dport = ntohs(SIN(dst)->sin_port);
1867 	} else {
1868 		inet_ntop(af, &SIN6(dst)->sin6_addr, dbuf, sizeof(dbuf));
1869 		dport = ntohs(SIN6(dst)->sin6_port);
1870 	}
1871 
1872 	os_log(mptcp_log_handle,
1873 	    "%s - %lx: ifindex %u dst %s:%d pended %u\n", __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
1874 	    mpts->mpts_ifscope, dbuf, dport, !!(mpts->mpts_flags & MPTSF_CONNECT_PENDING));
1875 
1876 	p = proc_find(mp_so->last_pid);
1877 	if (p == PROC_NULL) {
1878 		os_log_error(mptcp_log_handle, "%s - %lx: Couldn't find proc for pid %u\n",
1879 		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mp_so->last_pid);
1880 
1881 		return ESRCH;
1882 	}
1883 
1884 	mpts->mpts_flags &= ~MPTSF_CONNECT_PENDING;
1885 
1886 	mptcp_attach_to_subf(so, mpte->mpte_mptcb, mpte->mpte_addrid_last);
1887 
1888 	/* connect the subflow socket */
1889 	error = soconnectxlocked(so, mpts->mpts_src, &mpts->mpts_dst,
1890 	    p, mpts->mpts_ifscope,
1891 	    mpte->mpte_associd, NULL, 0, NULL, 0, NULL, NULL);
1892 
1893 	mpts->mpts_iss = sototcpcb(so)->iss;
1894 
1895 	/* See tcp_connect_complete */
1896 	if (mp_tp->mpt_state < MPTCPS_ESTABLISHED &&
1897 	    (mp_so->so_flags1 & SOF1_PRECONNECT_DATA)) {
1898 		mp_tp->mpt_sndwnd = sototcpcb(so)->snd_wnd;
1899 	}
1900 
1901 	/* Allocate a unique address id per subflow */
1902 	mpte->mpte_addrid_last++;
1903 	if (mpte->mpte_addrid_last == 0) {
1904 		mpte->mpte_addrid_last++;
1905 	}
1906 
1907 	proc_rele(p);
1908 
1909 	DTRACE_MPTCP3(subflow__connect, struct mptses *, mpte,
1910 	    struct mptsub *, mpts, int, error);
1911 	if (error) {
1912 		os_log_error(mptcp_log_handle, "%s - %lx: connectx failed with error %d ifscope %u\n",
1913 		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error, mpts->mpts_ifscope);
1914 	}
1915 
1916 	return error;
1917 }
1918 
/*
 * Stamp the receive-side DSS mapping onto mbuf 'm', which sits 'off'
 * bytes into the mapping described by (dsn, rseq, dlen, dfin).
 *
 * If 'm' extends past the right edge of the mapping it is split so that
 * the PKTF_MPTCP metadata written here covers exactly the bytes 'm'
 * holds.  Returns 0 on success; returns -1 after scheduling a RST on
 * the subflow when an inconsistent second mapping is seen or the split
 * fails.
 */
static int
mptcp_adj_rmap(struct socket *so, struct mbuf *m, int off, uint64_t dsn,
    uint32_t rseq, uint16_t dlen, uint8_t dfin)
{
	struct mptsub *mpts = sototcpcb(so)->t_mpsub;

	/* Nothing to map for an empty chain */
	if (m_pktlen(m) == 0) {
		return 0;
	}

	if (!(m->m_flags & M_PKTHDR)) {
		return 0;
	}

	if (m->m_pkthdr.pkt_flags & PKTF_MPTCP) {
		/*
		 * The mbuf already carries a mapping; if we are in the middle
		 * of consuming a mapping (off != 0) it must agree with ours,
		 * otherwise the peer sent overlapping/contradicting mappings.
		 */
		if (off && (dsn != m->m_pkthdr.mp_dsn ||
		    rseq != m->m_pkthdr.mp_rseq ||
		    dlen != m->m_pkthdr.mp_rlen ||
		    dfin != !!(m->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN))) {
			os_log_error(mptcp_log_handle, "%s - %lx: Received incorrect second mapping: DSN: %u - %u , SSN: %u - %u, DLEN: %u - %u, DFIN: %u - %u\n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpts->mpts_mpte),
			    (uint32_t)dsn, (uint32_t)m->m_pkthdr.mp_dsn,
			    rseq, m->m_pkthdr.mp_rseq,
			    dlen, m->m_pkthdr.mp_rlen,
			    dfin, !!(m->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN));

			soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
			return -1;
		}
	}

	/* If mbuf is beyond right edge of the mapping, we need to split */
	if (m_pktlen(m) > dlen - dfin - off) {
		struct mbuf *new = m_split(m, dlen - dfin - off, M_DONTWAIT);
		if (new == NULL) {
			os_log_error(mptcp_log_handle, "%s - %lx: m_split failed dlen %u dfin %u off %d pktlen %d, killing subflow %d",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpts->mpts_mpte),
			    dlen, dfin, off, m_pktlen(m),
			    mpts->mpts_connid);

			soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
			return -1;
		}

		/* Re-link the tail and fix up sockbuf accounting for it */
		m->m_next = new;
		sballoc(&so->so_rcv, new);
		/* Undo, as sballoc will add to it as well */
		so->so_rcv.sb_cc -= new->m_len;

		if (so->so_rcv.sb_mbtail == m) {
			so->so_rcv.sb_mbtail = new;
		}
	}

	/* Write the (offset-adjusted) mapping metadata onto this mbuf */
	m->m_pkthdr.pkt_flags |= PKTF_MPTCP;
	m->m_pkthdr.mp_dsn = dsn + off;
	m->m_pkthdr.mp_rseq = rseq + off;
	VERIFY(m_pktlen(m) < UINT16_MAX);
	m->m_pkthdr.mp_rlen = (uint16_t)m_pktlen(m);

	/* Only put the DATA_FIN-flag on the last mbuf of this mapping */
	if (dfin) {
		if (m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen < dsn + dlen - dfin) {
			m->m_pkthdr.pkt_flags &= ~PKTF_MPTCP_DFIN;
		} else {
			m->m_pkthdr.pkt_flags |= PKTF_MPTCP_DFIN;
		}
	}


	/* A valid mapping was processed; the subflow is now trustworthy */
	mpts->mpts_flags |= MPTSF_FULLY_ESTABLISHED;

	return 0;
}
1993 
1994 /*
1995  * Update the pid, upid, uuid of the subflow so, based on parent so
1996  */
1997 static void
mptcp_update_last_owner(struct socket * so,struct socket * mp_so)1998 mptcp_update_last_owner(struct socket *so, struct socket *mp_so)
1999 {
2000 	if (so->last_pid != mp_so->last_pid ||
2001 	    so->last_upid != mp_so->last_upid) {
2002 		so->last_upid = mp_so->last_upid;
2003 		so->last_pid = mp_so->last_pid;
2004 		uuid_copy(so->last_uuid, mp_so->last_uuid);
2005 	}
2006 	so_update_policy(so);
2007 }
2008 
/*
 * MPTCP subflow socket receive routine, derived from soreceive().
 *
 * Drains complete DSS mappings from the subflow's receive buffer into
 * *mp0 for the MPTCP layer.  Never blocks (MSG_DONTWAIT is forced):
 * returns EWOULDBLOCK when a mapping is not yet fully present, and EIO
 * after scheduling a RST on the subflow for mapping/checksum violations.
 */
static int
mptcp_subflow_soreceive(struct socket *so, struct sockaddr **psa,
    struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{
#pragma unused(uio)
	struct socket *mp_so;
	struct mptses *mpte;
	struct mptcb *mp_tp;
	int flags, error = 0;
	struct mbuf *m, **mp = mp0;
	struct tcpcb *tp = sototcpcb(so);

	mpte = tptomptp(sototcpcb(so))->mpt_mpte;
	mp_so = mptetoso(mpte);
	mp_tp = mpte->mpte_mptcb;

	VERIFY(so->so_proto->pr_flags & PR_CONNREQUIRED);

#ifdef MORE_LOCKING_DEBUG
	if (so->so_usecount == 1) {
		panic("%s: so=%x no other reference on socket", __func__, so);
		/* NOTREACHED */
	}
#endif
	/*
	 * We return all that is there in the subflow's socket receive buffer
	 * to the MPTCP layer, so we require that the caller passes in the
	 * expected parameters.
	 */
	if (mp == NULL || controlp != NULL) {
		return EINVAL;
	}

	*mp = NULL;
	if (psa != NULL) {
		*psa = NULL;
	}
	if (flagsp != NULL) {
		flags = *flagsp & ~MSG_EOR;
	} else {
		flags = 0;
	}

	if (flags & (MSG_PEEK | MSG_OOB | MSG_NEEDSA | MSG_WAITALL | MSG_WAITSTREAM)) {
		return EOPNOTSUPP;
	}

	/* This routine must never sleep; force non-blocking semantics */
	flags |= (MSG_DONTWAIT | MSG_NBIO);

	/*
	 * If a recv attempt is made on a previously-accepted socket
	 * that has been marked as inactive (disconnected), reject
	 * the request.
	 */
	if (so->so_flags & SOF_DEFUNCT) {
		struct sockbuf *sb = &so->so_rcv;

		error = ENOTCONN;
		/*
		 * This socket should have been disconnected and flushed
		 * prior to being returned from sodefunct(); there should
		 * be no data on its receive list, so panic otherwise.
		 */
		if (so->so_state & SS_DEFUNCT) {
			sb_empty_assert(sb, __func__);
		}
		return error;
	}

	/*
	 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
	 * and if so just return to the caller.  This could happen when
	 * soreceive() is called by a socket upcall function during the
	 * time the socket is freed.  The socket buffer would have been
	 * locked across the upcall, therefore we cannot put this thread
	 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
	 * we may livelock), because the lock on the socket buffer will
	 * only be released when the upcall routine returns to its caller.
	 * Because the socket has been officially closed, there can be
	 * no further read on it.
	 *
	 * A multipath subflow socket would have its SS_NOFDREF set by
	 * default, so check for SOF_MP_SUBFLOW socket flag; when the
	 * socket is closed for real, SOF_MP_SUBFLOW would be cleared.
	 */
	if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
	    (SS_NOFDREF | SS_CANTRCVMORE) && !(so->so_flags & SOF_MP_SUBFLOW)) {
		return 0;
	}

	/*
	 * For consistency with soreceive() semantics, we need to obey
	 * SB_LOCK in case some other code path has locked the buffer.
	 */
	error = sblock(&so->so_rcv, 0);
	if (error != 0) {
		return error;
	}

	m = so->so_rcv.sb_mb;
	if (m == NULL) {
		/*
		 * Panic if we notice inconsistencies in the socket's
		 * receive list; both sb_mb and sb_cc should correctly
		 * reflect the contents of the list, otherwise we may
		 * end up with false positives during select() or poll()
		 * which could put the application in a bad state.
		 */
		SB_MB_CHECK(&so->so_rcv);

		if (so->so_error != 0) {
			error = so->so_error;
			so->so_error = 0;
			goto release;
		}

		if (so->so_state & SS_CANTRCVMORE) {
			goto release;
		}

		if (!(so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING))) {
			error = ENOTCONN;
			goto release;
		}

		/*
		 * MSG_DONTWAIT is implicitly defined and this routine will
		 * never block, so return EWOULDBLOCK when there is nothing.
		 */
		error = EWOULDBLOCK;
		goto release;
	}

	mptcp_update_last_owner(so, mp_so);

	SBLASTRECORDCHK(&so->so_rcv, "mptcp_subflow_soreceive 1");
	SBLASTMBUFCHK(&so->so_rcv, "mptcp_subflow_soreceive 1");

	/* Consume one complete DSS mapping per iteration */
	while (m != NULL) {
		int dlen = 0, error_out = 0, off = 0;
		uint8_t dfin = 0;
		struct mbuf *start = m;
		uint64_t dsn;
		uint32_t sseq;
		uint16_t orig_dlen;
		uint16_t csum;

		VERIFY(m->m_nextpkt == NULL);

		if (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) {
fallback:
			/* Just move mbuf to MPTCP-level */

			sbfree(&so->so_rcv, m);

			if (mp != NULL) {
				*mp = m;
				mp = &m->m_next;
				so->so_rcv.sb_mb = m = m->m_next;
				*mp = NULL;
			}

			if (m != NULL) {
				so->so_rcv.sb_lastrecord = m;
			} else {
				SB_EMPTY_FIXUP(&so->so_rcv);
			}

			continue;
		} else if (!(m->m_flags & M_PKTHDR) || !(m->m_pkthdr.pkt_flags & PKTF_MPTCP)) {
			struct mptsub *mpts = sototcpcb(so)->t_mpsub;
			boolean_t found_mapping = false;
			int parsed_length = 0;
			struct mbuf *m_iter;

			/*
			 * No MPTCP-option in the header. Either fallback or
			 * wait for additional mappings.
			 */
			if (!(mpts->mpts_flags & MPTSF_FULLY_ESTABLISHED)) {
				/* data arrived without a DSS option mapping */

				/* initial subflow can fallback right after SYN handshake */
				if (mpts->mpts_flags & MPTSF_INITIAL_SUB) {
					mptcp_notify_mpfail(so);

					goto fallback;
				} else {
					os_log_error(mptcp_log_handle, "%s - %lx: No DSS on secondary subflow. Killing %d\n",
					    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
					    mpts->mpts_connid);
					soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);

					error = EIO;
					*mp0 = NULL;
					goto release;
				}
			}

			/* Thus, let's look for an mbuf with the mapping */
			m_iter = m->m_next;
			parsed_length = m->m_len;
			while (m_iter != NULL && parsed_length < UINT16_MAX) {
				if (!(m_iter->m_flags & M_PKTHDR) || !(m_iter->m_pkthdr.pkt_flags & PKTF_MPTCP)) {
					parsed_length += m_iter->m_len;
					m_iter = m_iter->m_next;
					continue;
				}

				found_mapping = true;

				/* Found an mbuf with a DSS-mapping */
				orig_dlen = dlen = m_iter->m_pkthdr.mp_rlen;
				dsn = m_iter->m_pkthdr.mp_dsn;
				sseq = m_iter->m_pkthdr.mp_rseq;
				csum = m_iter->m_pkthdr.mp_csum;

				/* A DATA_FIN occupies one DSS byte but carries no payload */
				if (m_iter->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN) {
					dfin = 1;
					dlen--;
				}

				break;
			}

			if (!found_mapping && parsed_length < UINT16_MAX) {
				/* Mapping not yet present, we can wait! */
				if (*mp0 == NULL) {
					error = EWOULDBLOCK;
				}
				goto release;
			} else if (!found_mapping && parsed_length >= UINT16_MAX) {
				os_log_error(mptcp_log_handle, "%s - %lx: Received more than 64KB without DSS mapping. Killing %d\n",
				    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
				    mpts->mpts_connid);
				/* Received 64KB without DSS-mapping. We should kill the subflow */
				soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);

				error = EIO;
				*mp0 = NULL;
				goto release;
			}
		} else {
			/* Head mbuf itself carries the DSS-mapping */
			orig_dlen = dlen = m->m_pkthdr.mp_rlen;
			dsn = m->m_pkthdr.mp_dsn;
			sseq = m->m_pkthdr.mp_rseq;
			csum = m->m_pkthdr.mp_csum;

			if (m->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN) {
				dfin = 1;
				dlen--;
			}
		}

		/* Now, see if we need to remove previous packets */
		if (SEQ_GT(sseq + tp->irs, tp->rcv_nxt - so->so_rcv.sb_cc)) {
			/* Ok, there is data in there that we don't need - let's throw it away! */
			int totrim = (int)sseq + tp->irs - (tp->rcv_nxt - so->so_rcv.sb_cc);

			sbdrop(&so->so_rcv, totrim);

			m = so->so_rcv.sb_mb;
		}

		/*
		 * Check if the full mapping is now present
		 */
		if ((int)so->so_rcv.sb_cc < dlen) {
			if (*mp0 == NULL) {
				error = EWOULDBLOCK;
			}
			goto release;
		}

		/* Now, get the full mapping */
		off = 0;
		while (dlen > 0) {
			if (mptcp_adj_rmap(so, m, off, dsn, sseq, orig_dlen, dfin)) {
				error_out = 1;
				error = EIO;
				dlen = 0;
				*mp0 = NULL;
				break;
			}

			dlen -= m->m_len;
			off += m->m_len;
			sbfree(&so->so_rcv, m);

			if (mp != NULL) {
				*mp = m;
				mp = &m->m_next;
				so->so_rcv.sb_mb = m = m->m_next;
				*mp = NULL;
			}

			/* Buffer exhausted before the mapping was complete */
			ASSERT(dlen == 0 || m);
			if (dlen != 0 && m == NULL) {
				/* "try" to gracefully recover on customer builds */
				error_out = 1;
				error = EIO;
				dlen  = 0;

				*mp0 = NULL;

				SB_EMPTY_FIXUP(&so->so_rcv);
				soevent(so, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);

				break;
			}
		}

		ASSERT(dlen == 0);
		if (dlen != 0) {
			/* "try" to gracefully recover on customer builds */
			error_out = 1;
			error = EIO;
			dlen = 0;

			*mp0 = NULL;

			SB_EMPTY_FIXUP(&so->so_rcv);
			soevent(so, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
		}

		if (m != NULL) {
			so->so_rcv.sb_lastrecord = m;
		} else {
			SB_EMPTY_FIXUP(&so->so_rcv);
		}

		if (error_out) {
			goto release;
		}

		/* Verify the DSS checksum over the mapping we just consumed */
		if (mptcp_validate_csum(sototcpcb(so), start, dsn, sseq, orig_dlen, csum, dfin)) {
			error = EIO;
			*mp0 = NULL;
			goto release;
		}

		SBLASTRECORDCHK(&so->so_rcv, "mptcp_subflow_soreceive 2");
		SBLASTMBUFCHK(&so->so_rcv, "mptcp_subflow_soreceive 2");
	}

	DTRACE_MPTCP3(subflow__receive, struct socket *, so,
	    struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd);

	if (flagsp != NULL) {
		*flagsp |= flags;
	}

release:
	sbunlock(&so->so_rcv, TRUE);

	return error;
}
2369 
/*
 * MPTCP subflow socket send routine, derived from sosend().
 *
 * Pushes a single pre-built, DSS-tagged mbuf chain ('top') down the
 * subflow's TCP send path, charging the transmission to the process
 * that owns the MPTCP socket.  Consumes 'top' on both success and
 * failure.  Returns 0 or an errno value.
 */
static int
mptcp_subflow_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags)
{
	struct socket *mp_so = mptetoso(tptomptp(sototcpcb(so))->mpt_mpte);
	boolean_t en_tracing = FALSE, proc_held = FALSE;
	struct proc *p = current_proc();
	int en_tracing_val;
	int sblocked = 1; /* Pretend as if it is already locked, so we won't relock it */
	int error;

	/* This path only ever sends a plain mbuf chain, nothing else */
	VERIFY(control == NULL);
	VERIFY(addr == NULL);
	VERIFY(uio == NULL);
	VERIFY(flags == 0);
	VERIFY((so->so_flags & SOF_CONTENT_FILTER) == 0);

	VERIFY(top->m_pkthdr.len > 0 && top->m_pkthdr.len <= UINT16_MAX);
	VERIFY(top->m_pkthdr.pkt_flags & PKTF_MPTCP);

	/*
	 * trace if tracing & network (vs. unix) sockets & and
	 * non-loopback
	 */
	if (ENTR_SHOULDTRACE &&
	    (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
		struct inpcb *inp = sotoinpcb(so);
		if (inp->inp_last_outifp != NULL &&
		    !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
			en_tracing = TRUE;
			en_tracing_val = top->m_pkthdr.len;
			KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_START,
			    (unsigned long)VM_KERNEL_ADDRPERM(so),
			    ((so->so_state & SS_NBIO) ? kEnTrFlagNonBlocking : 0),
			    (int64_t)en_tracing_val);
		}
	}

	mptcp_update_last_owner(so, mp_so);

	/*
	 * If the caller isn't the owning process, take a reference on the
	 * owner so the send is accounted correctly; released in 'out'.
	 */
	if (mp_so->last_pid != proc_pid(p)) {
		p = proc_find(mp_so->last_pid);
		if (p == PROC_NULL) {
			p = current_proc();
		} else {
			proc_held = TRUE;
		}
	}

#if NECP
	inp_update_necp_policy(sotoinpcb(so), NULL, NULL, 0);
#endif /* NECP */

	error = sosendcheck(so, NULL, top->m_pkthdr.len, 0, 1, 0, &sblocked);
	if (error) {
		goto out;
	}

	/* Hand the chain to TCP; pru_send consumes 'top' */
	error = (*so->so_proto->pr_usrreqs->pru_send)(so, 0, top, NULL, NULL, p);
	top = NULL;

out:
	/* Only reached with 'top' non-NULL when sosendcheck() failed */
	if (top != NULL) {
		m_freem(top);
	}

	if (proc_held) {
		proc_rele(p);
	}

	soclearfastopen(so);

	if (en_tracing) {
		KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_END,
		    (unsigned long)VM_KERNEL_ADDRPERM(so),
		    ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
		    (int64_t)en_tracing_val);
	}

	return error;
}
2454 
2455 /*
2456  * Subflow socket write upcall.
2457  *
2458  * Called when the associated subflow socket posted a read event.
2459  */
2460 static void
mptcp_subflow_wupcall(struct socket * so,void * arg,int waitf)2461 mptcp_subflow_wupcall(struct socket *so, void *arg, int waitf)
2462 {
2463 #pragma unused(so, waitf)
2464 	struct mptsub *mpts __single = arg;
2465 	struct mptses *mpte = mpts->mpts_mpte;
2466 
2467 	VERIFY(mpte != NULL);
2468 
2469 	if (mptcp_should_defer_upcall(mpte->mpte_mppcb)) {
2470 		if (!(mpte->mpte_mppcb->mpp_flags & MPP_WUPCALL)) {
2471 			mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_WWAKEUP;
2472 		}
2473 		return;
2474 	}
2475 
2476 	mptcp_output(mpte);
2477 }
2478 
2479 /*
2480  * Subflow socket control event upcall.
2481  */
2482 static void
mptcp_subflow_eupcall1(struct socket * so,void * arg,uint32_t events)2483 mptcp_subflow_eupcall1(struct socket *so, void *arg, uint32_t events)
2484 {
2485 #pragma unused(so)
2486 	struct mptsub *mpts __single = arg;
2487 	struct mptses *mpte = mpts->mpts_mpte;
2488 
2489 	socket_lock_assert_owned(mptetoso(mpte));
2490 
2491 	if ((mpts->mpts_evctl & events) == events) {
2492 		return;
2493 	}
2494 
2495 	mpts->mpts_evctl |= events;
2496 
2497 	if (mptcp_should_defer_upcall(mpte->mpte_mppcb)) {
2498 		mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_WORKLOOP;
2499 		return;
2500 	}
2501 
2502 	mptcp_subflow_workloop(mpte);
2503 }
2504 
/*
 * Establish an initial MPTCP connection (if first subflow and not yet
 * connected), or add a subflow to an existing MPTCP connection.
 *
 * On success, stores the new subflow's connection id in *pcid (if
 * non-NULL) and returns 0.  On failure returns an errno; the subflow
 * is freed or aborted as appropriate for how far setup progressed.
 * Note that once mptcp_subflow_socreate() has been called, ownership
 * of 'mpts' has transferred to it (see comment in the error path).
 */
int
mptcp_subflow_add(struct mptses *mpte, struct sockaddr *src,
    struct sockaddr *dst, uint32_t ifscope, sae_connid_t *pcid)
{
	socket_ref_t mp_so, so = NULL;
	struct mptcb *mp_tp;
	struct mptsub *mpts = NULL;
	int af, error = 0;

	mp_so = mptetoso(mpte);
	mp_tp = mpte->mpte_mptcb;

	socket_lock_assert_owned(mp_so);

	if (mp_tp->mpt_state >= MPTCPS_CLOSE_WAIT) {
		/* If the remote end sends Data FIN, refuse subflow adds */
		os_log_error(mptcp_log_handle, "%s - %lx: state %u\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mp_tp->mpt_state);
		error = ENOTCONN;
		goto out_err;
	}

	if (mpte->mpte_numflows > MPTCP_MAX_NUM_SUBFLOWS) {
		error = EOVERFLOW;
		goto out_err;
	}

	mpts = mptcp_subflow_alloc();
	if (mpts == NULL) {
		os_log_error(mptcp_log_handle, "%s - %lx: malloc subflow failed\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
		error = ENOMEM;
		goto out_err;
	}

	/* Validate and copy the (optional) source address */
	if (src) {
		if (src->sa_family != AF_INET && src->sa_family != AF_INET6) {
			error = EAFNOSUPPORT;
			goto out_err;
		}

		if (src->sa_family == AF_INET &&
		    src->sa_len != sizeof(struct sockaddr_in)) {
			error = EINVAL;
			goto out_err;
		}

		if (src->sa_family == AF_INET6 &&
		    src->sa_len != sizeof(struct sockaddr_in6)) {
			error = EINVAL;
			goto out_err;
		}

		mpts->mpts_src = SA(alloc_sockaddr(src->sa_len, Z_WAITOK | Z_NOFAIL));

		SOCKADDR_COPY(src, mpts->mpts_src, src->sa_len);
	}

	/* Validate and copy the (mandatory) destination address */
	if (dst->sa_family != AF_INET && dst->sa_family != AF_INET6) {
		error = EAFNOSUPPORT;
		goto out_err;
	}

	if (dst->sa_family == AF_INET &&
	    dst->sa_len != sizeof(mpts->__mpts_dst_v4)) {
		error = EINVAL;
		goto out_err;
	}

	if (dst->sa_family == AF_INET6 &&
	    dst->sa_len != sizeof(mpts->__mpts_dst_v6)) {
		error = EINVAL;
		goto out_err;
	}

	SOCKADDR_COPY(dst, &mpts->mpts_dst, dst->sa_len);

	af = mpts->mpts_dst.sa_family;

	/* Reject an interface scope beyond the current interface table */
	ifnet_head_lock_shared();
	if ((ifscope > (unsigned)if_index)) {
		ifnet_head_done();
		error = ENXIO;
		goto out_err;
	}
	ifnet_head_done();

	mpts->mpts_ifscope = ifscope;

	/* create the subflow socket */
	if ((error = mptcp_subflow_socreate(mpte, mpts, af, &so)) != 0) {
		/*
		 * Returning (error) and not cleaning up, because up to here
		 * all we did is creating mpts.
		 *
		 * And the contract is that the call to mptcp_subflow_socreate,
		 * moves ownership of mpts to mptcp_subflow_socreate.
		 */
		return error;
	}

	/*
	 * We may be called from within the kernel. Still need to account this
	 * one to the real app.
	 */
	mptcp_update_last_owner(mpts->mpts_socket, mp_so);

	/*
	 * Increment the counter, while avoiding 0 (SAE_CONNID_ANY) and
	 * -1 (SAE_CONNID_ALL).
	 */
	mpte->mpte_connid_last++;
	if (mpte->mpte_connid_last == SAE_CONNID_ALL ||
	    mpte->mpte_connid_last == SAE_CONNID_ANY) {
		mpte->mpte_connid_last++;
	}

	mpts->mpts_connid = mpte->mpte_connid_last;

	mpts->mpts_rel_seq = 1;

	/* Allocate a unique address id per subflow */
	mpte->mpte_addrid_last++;
	if (mpte->mpte_addrid_last == 0) {
		mpte->mpte_addrid_last++;
	}

	/* register for subflow socket read/write events */
	sock_setupcalls_locked(so, NULL, NULL, mptcp_subflow_wupcall, mpts, 1);

	/* Register for subflow socket control events */
	sock_catchevents_locked(so, mptcp_subflow_eupcall1, mpts,
	    SO_FILT_HINT_CONNRESET | SO_FILT_HINT_CANTRCVMORE |
	    SO_FILT_HINT_TIMEOUT | SO_FILT_HINT_NOSRCADDR |
	    SO_FILT_HINT_IFDENIED | SO_FILT_HINT_CONNECTED |
	    SO_FILT_HINT_DISCONNECTED | SO_FILT_HINT_MPFAILOVER |
	    SO_FILT_HINT_MPSTATUS | SO_FILT_HINT_MUSTRST |
	    SO_FILT_HINT_MPCANTRCVMORE | SO_FILT_HINT_ADAPTIVE_RTIMO |
	    SO_FILT_HINT_ADAPTIVE_WTIMO | SO_FILT_HINT_MP_SUB_ERROR);

	/* sanity check */
	VERIFY(!(mpts->mpts_flags &
	    (MPTSF_CONNECTING | MPTSF_CONNECTED | MPTSF_CONNECT_PENDING)));

	/*
	 * Indicate to the TCP subflow whether or not it should establish
	 * the initial MPTCP connection, or join an existing one.  Fill
	 * in the connection request structure with additional info needed
	 * by the underlying TCP (to be used in the TCP options, etc.)
	 */
	if (mp_tp->mpt_state < MPTCPS_ESTABLISHED && mpte->mpte_numflows == 1) {
		mpts->mpts_flags |= MPTSF_INITIAL_SUB;

		if (mp_tp->mpt_state == MPTCPS_CLOSED) {
			mptcp_init_local_parms(mpte, dst);
		}
		soisconnecting(mp_so);

		/* If fastopen is requested, set state in mpts */
		if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
			mpts->mpts_flags |= MPTSF_TFO_REQD;
		}
	} else {
		/* Joins must wait until the peer is ready for them */
		if (!(mp_tp->mpt_flags & MPTCPF_JOIN_READY)) {
			mpts->mpts_flags |= MPTSF_CONNECT_PENDING;
		}
	}

	mpts->mpts_flags |= MPTSF_CONNECTING;

	/* connect right away if first attempt, or if join can be done now */
	if (!(mpts->mpts_flags & MPTSF_CONNECT_PENDING)) {
		error = mptcp_subflow_soconnectx(mpte, mpts);
	}

	if (error) {
		goto out_err_close;
	}

	if (pcid) {
		*pcid = mpts->mpts_connid;
	}

	return 0;

out_err_close:
	/* Subflow socket exists; tear it down via the abort path */
	mptcp_subflow_abort(mpts, error);

	return error;

out_err:
	/* Only 'mpts' itself was created so far; free it directly */
	if (mpts) {
		mptcp_subflow_free(mpts);
	}

	return error;
}
2706 
2707 void
mptcpstats_update(struct mptcp_itf_stats * stats __counted_by (stats_count),uint16_t stats_count,const struct mptsub * mpts)2708 mptcpstats_update(struct mptcp_itf_stats *stats __counted_by(stats_count), uint16_t stats_count, const struct mptsub *mpts)
2709 {
2710 	int index = mptcpstats_get_index(stats, stats_count, mpts);
2711 
2712 	if (index != -1) {
2713 		struct inpcb *inp = sotoinpcb(mpts->mpts_socket);
2714 
2715 		stats[index].mpis_txbytes += inp->inp_mstat.ms_total.ts_txbytes;
2716 		stats[index].mpis_rxbytes += inp->inp_mstat.ms_total.ts_rxbytes;
2717 
2718 		stats[index].mpis_wifi_txbytes += inp->inp_mstat.ms_wifi_infra.ts_txbytes +
2719 		    inp->inp_mstat.ms_wifi_non_infra.ts_txbytes;
2720 		stats[index].mpis_wifi_rxbytes += inp->inp_mstat.ms_wifi_infra.ts_rxbytes +
2721 		    inp->inp_mstat.ms_wifi_non_infra.ts_rxbytes;
2722 
2723 		stats[index].mpis_wired_txbytes += inp->inp_mstat.ms_wired.ts_txbytes;
2724 		stats[index].mpis_wired_rxbytes += inp->inp_mstat.ms_wired.ts_rxbytes;
2725 
2726 		stats[index].mpis_cell_txbytes += inp->inp_mstat.ms_cellular.ts_txbytes;
2727 		stats[index].mpis_cell_rxbytes += inp->inp_mstat.ms_cellular.ts_rxbytes;
2728 	}
2729 }
2730 
/*
 * Delete/remove a subflow from an MPTCP.  The underlying subflow socket
 * will no longer be accessible after a subflow is deleted, thus this
 * should occur only after the subflow socket has been disconnected.
 */
void
mptcp_subflow_del(struct mptses *mpte, struct mptsub *mpts)
{
	struct socket *mp_so = mptetoso(mpte);
	struct socket *so = mpts->mpts_socket;
	struct tcpcb *tp = sototcpcb(so);

	socket_lock_assert_owned(mp_so);
	VERIFY(mpts->mpts_mpte == mpte);
	VERIFY(mpte->mpte_numflows != 0);
	VERIFY(mp_so->so_usecount > 0);

	/* Fold this subflow's byte counts into the session stats */
	mptcpstats_update(mpte->mpte_itfstats, MPTCP_ITFSTATS_SIZE, mpts);

	mptcp_unset_cellicon(mpte, mpts, 1);

	/* Remember the final byte counts before the inpcb goes away */
	mpte->mpte_init_rxbytes = sotoinpcb(so)->inp_mstat.ms_total.ts_rxbytes;
	mpte->mpte_init_txbytes = sotoinpcb(so)->inp_mstat.ms_total.ts_txbytes;

	/* Unlink the subflow from the session */
	TAILQ_REMOVE(&mpte->mpte_subflows, mpts, mpts_entry);
	mpte->mpte_numflows--;
	if (mpte->mpte_active_sub == mpts) {
		mpte->mpte_active_sub = NULL;
	}

	/*
	 * Drop references held by this subflow socket; there
	 * will be no further upcalls made from this point.
	 */
	sock_setupcalls_locked(so, NULL, NULL, NULL, NULL, 0);
	sock_catchevents_locked(so, NULL, NULL, 0);

	mptcp_detach_mptcb_from_subf(mpte->mpte_mptcb, so);

	mp_so->so_usecount--;           /* for subflow socket */
	mpts->mpts_mpte = NULL;
	mpts->mpts_socket = NULL;

	mptcp_subflow_remref(mpts);             /* for MPTCP subflow list */
	mptcp_subflow_remref(mpts);             /* for subflow socket */

	/* Sever the TCP-side back-pointers into MPTCP state */
	so->so_flags &= ~SOF_MP_SUBFLOW;
	tp->t_mptcb = NULL;
	tp->t_mpsub = NULL;
}
2781 
2782 void
mptcp_subflow_shutdown(struct mptses * mpte,struct mptsub * mpts)2783 mptcp_subflow_shutdown(struct mptses *mpte, struct mptsub *mpts)
2784 {
2785 	struct socket *so = mpts->mpts_socket;
2786 	struct mptcb *mp_tp = mpte->mpte_mptcb;
2787 	int send_dfin = 0;
2788 
2789 	if (mp_tp->mpt_state > MPTCPS_CLOSE_WAIT) {
2790 		send_dfin = 1;
2791 	}
2792 
2793 	if (!(so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) &&
2794 	    (so->so_state & SS_ISCONNECTED)) {
2795 		if (send_dfin) {
2796 			mptcp_send_dfin(so);
2797 		}
2798 		soshutdownlock(so, SHUT_WR);
2799 	}
2800 }
2801 
2802 static void
mptcp_subflow_abort(struct mptsub * mpts,int error)2803 mptcp_subflow_abort(struct mptsub *mpts, int error)
2804 {
2805 	struct socket *so = mpts->mpts_socket;
2806 	struct tcpcb *tp = sototcpcb(so);
2807 
2808 	if (mpts->mpts_flags & MPTSF_DISCONNECTED) {
2809 		return;
2810 	}
2811 
2812 	if (tp->t_state != TCPS_CLOSED) {
2813 		tcp_drop(tp, error);
2814 	}
2815 
2816 	mptcp_subflow_eupcall1(so, mpts, SO_FILT_HINT_DISCONNECTED);
2817 }
2818 
/*
 * Disconnect a subflow socket.
 *
 * Marks the subflow MPTSF_DISCONNECTING, defuncts it when the MPTCP
 * socket itself is defunct, otherwise performs an orderly shutdown
 * (preceded by a DATA_FIN when past CLOSE_WAIT) and disconnect.
 * Always posts a DISCONNECTED event at the end.
 */
void
mptcp_subflow_disconnect(struct mptses *mpte, struct mptsub *mpts)
{
	struct socket *so, *mp_so;
	struct mptcb *mp_tp;
	int send_dfin = 0;

	so = mpts->mpts_socket;
	mp_tp = mpte->mpte_mptcb;
	mp_so = mptetoso(mpte);

	socket_lock_assert_owned(mp_so);

	/* Already on its way out; nothing more to do */
	if (mpts->mpts_flags & (MPTSF_DISCONNECTING | MPTSF_DISCONNECTED)) {
		return;
	}

	mptcp_unset_cellicon(mpte, mpts, 1);

	mpts->mpts_flags |= MPTSF_DISCONNECTING;

	if (mp_tp->mpt_state > MPTCPS_CLOSE_WAIT) {
		send_dfin = 1;
	}

	/* Propagate a defunct MPTCP socket down to the subflow */
	if (mp_so->so_flags & SOF_DEFUNCT) {
		errno_t ret;

		ret = sosetdefunct(NULL, so, SHUTDOWN_SOCKET_LEVEL_DISCONNECT_ALL, TRUE);
		if (ret == 0) {
			ret = sodefunct(NULL, so, SHUTDOWN_SOCKET_LEVEL_DISCONNECT_ALL);

			if (ret != 0) {
				os_log_error(mptcp_log_handle, "%s - %lx: sodefunct failed with %d\n",
				    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), ret);
			}
		} else {
			os_log_error(mptcp_log_handle, "%s - %lx: sosetdefunct failed with %d\n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), ret);
		}
	}

	if (!(so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) &&
	    (so->so_state & SS_ISCONNECTED)) {
		if (send_dfin) {
			mptcp_send_dfin(so);
		}

		(void) soshutdownlock(so, SHUT_RD);
		(void) soshutdownlock(so, SHUT_WR);
		(void) sodisconnectlocked(so);
	}

	/*
	 * Generate a disconnect event for this subflow socket, in case
	 * the lower layer doesn't do it; this is needed because the
	 * subflow socket deletion relies on it.
	 */
	mptcp_subflow_eupcall1(so, mpts, SO_FILT_HINT_DISCONNECTED);
}
2882 
/*
 * Subflow socket input.
 *
 * Pulls whatever data is available from the subflow's receive buffer
 * (via sock_receive_internal(), which ends up in the subflow soreceive
 * routine) and feeds it to mptcp_input().  Also maintains the cellular
 * data icon state based on which interface delivered the data.
 */
static void
mptcp_subflow_input(struct mptses *mpte, struct mptsub *mpts)
{
	struct socket *mp_so = mptetoso(mpte);
	mbuf_ref_t m = NULL;
	struct socket *so;
	int error, wakeup = 0;

	/* Guard against re-entering the input path */
	VERIFY(!(mpte->mpte_mppcb->mpp_flags & MPP_INSIDE_INPUT));
	mpte->mpte_mppcb->mpp_flags |= MPP_INSIDE_INPUT;

	DTRACE_MPTCP2(subflow__input, struct mptses *, mpte,
	    struct mptsub *, mpts);

	if (!(mpts->mpts_flags & MPTSF_CONNECTED)) {
		goto out;
	}

	so = mpts->mpts_socket;

	error = sock_receive_internal(so, NULL, &m, 0, NULL);
	if (error != 0 && error != EWOULDBLOCK) {
		os_log_error(mptcp_log_handle, "%s - %lx: cid %d error %d\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpts->mpts_connid, error);
		if (error == ENODATA) {
			/*
			 * Don't ignore ENODATA so as to discover
			 * nasty middleboxes.
			 */
			mp_so->so_error = ENODATA;

			wakeup = 1;
			goto out;
		}
	}

	/* In fallback, make sure to accept data on all but one subflow */
	if (m && (mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
	    !(mpts->mpts_flags & MPTSF_ACTIVE)) {
		m_freem(m);
		goto out;
	}

	if (m != NULL) {
		if (IFNET_IS_CELLULAR(sotoinpcb(so)->inp_last_outifp)) {
			mptcp_set_cellicon(mpte, mpts);

			mpte->mpte_used_cell = 1;
		} else {
			/*
			 * If during the past MPTCP_CELLICON_TOGGLE_RATE seconds we didn't
			 * explicitly set the cellicon, then we unset it again.
			 */
			if (TSTMP_LT(mpte->mpte_last_cellicon_set + MPTCP_CELLICON_TOGGLE_RATE, tcp_now)) {
				mptcp_unset_cellicon(mpte, NULL, 1);
			}

			mpte->mpte_used_wifi = 1;
		}

		/* Hand the received chain up to the MPTCP layer */
		mptcp_input(mpte, m);
	}

out:
	if (wakeup) {
		mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_RWAKEUP;
	}

	/* Run any upcalls that were deferred while we were inside input */
	mptcp_handle_deferred_upcalls(mpte->mpte_mppcb, MPP_INSIDE_INPUT);
}
2956 
/*
 * Entry point for subflow read events: runs input processing for every
 * live subflow of the MPTCP session the given subflow socket belongs to.
 * If an upcall is already in progress higher up the stack, only records
 * that a read-wakeup is needed and returns.
 */
void
mptcp_handle_input(struct socket *so)
{
	struct mptsub *mpts, *tmpts;
	struct mptses *mpte;

	/* Only MPTCP subflow sockets are handled here */
	if (!(so->so_flags & SOF_MP_SUBFLOW)) {
		return;
	}

	mpts = sototcpcb(so)->t_mpsub;
	mpte = mpts->mpts_mpte;

	socket_lock_assert_owned(mptetoso(mpte));

	/* Defer to the in-progress upcall; it will do the work when unwinding */
	if (mptcp_should_defer_upcall(mpte->mpte_mppcb)) {
		if (!(mpte->mpte_mppcb->mpp_flags & MPP_INPUT_HANDLE)) {
			mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_RWAKEUP;
		}
		return;
	}

	mpte->mpte_mppcb->mpp_flags |= MPP_INPUT_HANDLE;
	TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
		if (mpts->mpts_socket->so_usecount == 0) {
			/* Will be removed soon by tcp_garbage_collect */
			continue;
		}

		/* Hold both the subflow and its socket across the input call */
		mptcp_subflow_addref(mpts);
		mpts->mpts_socket->so_usecount++;

		mptcp_subflow_input(mpte, mpts);

		mptcp_subflow_remref(mpts);             /* ours */

		VERIFY(mpts->mpts_socket->so_usecount != 0);
		mpts->mpts_socket->so_usecount--;
	}

	/* Clears MPP_INPUT_HANDLE and runs any upcalls deferred meanwhile */
	mptcp_handle_deferred_upcalls(mpte->mpte_mppcb, MPP_INPUT_HANDLE);
}
2999 
3000 static boolean_t
mptcp_search_seq_in_sub(struct mbuf * m,struct socket * so)3001 mptcp_search_seq_in_sub(struct mbuf *m, struct socket *so)
3002 {
3003 	struct mbuf *so_m = so->so_snd.sb_mb;
3004 	uint64_t dsn = m->m_pkthdr.mp_dsn;
3005 
3006 	while (so_m) {
3007 		VERIFY(so_m->m_flags & M_PKTHDR);
3008 		VERIFY(so_m->m_pkthdr.pkt_flags & PKTF_MPTCP);
3009 
3010 		/* Part of the segment is covered, don't reinject here */
3011 		if (so_m->m_pkthdr.mp_dsn <= dsn &&
3012 		    so_m->m_pkthdr.mp_dsn + so_m->m_pkthdr.mp_rlen > dsn) {
3013 			return TRUE;
3014 		}
3015 
3016 		so_m = so_m->m_next;
3017 	}
3018 
3019 	return FALSE;
3020 }
3021 
3022 /*
3023  * Subflow socket output.
3024  *
3025  * Called for sending data from MPTCP to the underlying subflow socket.
3026  */
3027 int
mptcp_subflow_output(struct mptses * mpte,struct mptsub * mpts,int flags)3028 mptcp_subflow_output(struct mptses *mpte, struct mptsub *mpts, int flags)
3029 {
3030 	struct mptcb *mp_tp = mpte->mpte_mptcb;
3031 	struct mbuf *sb_mb, *m, *mpt_mbuf = NULL, *head = NULL, *tail = NULL;
3032 	struct socket *mp_so, *so;
3033 	struct tcpcb *tp;
3034 	uint64_t mpt_dsn = 0, off = 0;
3035 	int sb_cc = 0, error = 0, wakeup = 0;
3036 	uint16_t dss_csum;
3037 	uint16_t tot_sent = 0;
3038 	boolean_t reinjected = FALSE;
3039 
3040 	mp_so = mptetoso(mpte);
3041 	so = mpts->mpts_socket;
3042 	tp = sototcpcb(so);
3043 
3044 	socket_lock_assert_owned(mp_so);
3045 
3046 	VERIFY(!(mpte->mpte_mppcb->mpp_flags & MPP_INSIDE_OUTPUT));
3047 	mpte->mpte_mppcb->mpp_flags |= MPP_INSIDE_OUTPUT;
3048 
3049 	VERIFY(!INP_WAIT_FOR_IF_FEEDBACK(sotoinpcb(so)));
3050 	VERIFY((mpts->mpts_flags & MPTSF_MP_CAPABLE) ||
3051 	    (mpts->mpts_flags & MPTSF_MP_DEGRADED) ||
3052 	    (mpts->mpts_flags & MPTSF_TFO_REQD));
3053 	VERIFY(mptcp_subflow_cwnd_space(mpts->mpts_socket) > 0);
3054 
3055 	DTRACE_MPTCP2(subflow__output, struct mptses *, mpte,
3056 	    struct mptsub *, mpts);
3057 
3058 	/* Remove Addr Option is not sent reliably as per I-D */
3059 	if (mpte->mpte_flags & MPTE_SND_REM_ADDR) {
3060 		tp->t_rem_aid = mpte->mpte_lost_aid;
3061 		tp->t_mpflags |= TMPF_SND_REM_ADDR;
3062 		mpte->mpte_flags &= ~MPTE_SND_REM_ADDR;
3063 	}
3064 
3065 	/*
3066 	 * The mbuf chains containing the metadata (as well as pointing to
3067 	 * the user data sitting at the MPTCP output queue) would then be
3068 	 * sent down to the subflow socket.
3069 	 *
3070 	 * Some notes on data sequencing:
3071 	 *
3072 	 *   a. Each mbuf must be a M_PKTHDR.
3073 	 *   b. MPTCP metadata is stored in the mptcp_pktinfo structure
3074 	 *	in the mbuf pkthdr structure.
3075 	 *   c. Each mbuf containing the MPTCP metadata must have its
3076 	 *	pkt_flags marked with the PKTF_MPTCP flag.
3077 	 */
3078 
3079 	if (mpte->mpte_reinjectq) {
3080 		sb_mb = mpte->mpte_reinjectq;
3081 	} else {
3082 		sb_mb = mp_so->so_snd.sb_mb;
3083 	}
3084 
3085 	if (sb_mb == NULL) {
3086 		os_log_error(mptcp_log_handle, "%s - %lx: No data in MPTCP-sendbuffer! smax %u snxt %u suna %u state %u flags %#x\n",
3087 		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
3088 		    (uint32_t)mp_tp->mpt_sndmax, (uint32_t)mp_tp->mpt_sndnxt,
3089 		    (uint32_t)mp_tp->mpt_snduna, mp_tp->mpt_state, mp_so->so_flags1);
3090 
3091 		/* Fix it to prevent looping */
3092 		if (MPTCP_SEQ_LT(mp_tp->mpt_sndnxt, mp_tp->mpt_snduna)) {
3093 			mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;
3094 		}
3095 		goto out;
3096 	}
3097 
3098 	VERIFY(sb_mb->m_pkthdr.pkt_flags & PKTF_MPTCP);
3099 
3100 	if (sb_mb->m_pkthdr.mp_rlen == 0 &&
3101 	    !(so->so_state & SS_ISCONNECTED) &&
3102 	    (so->so_flags1 & SOF1_PRECONNECT_DATA)) {
3103 		tp->t_mpflags |= TMPF_TFO_REQUEST;
3104 
3105 		/* Opting to call pru_send as no mbuf at subflow level */
3106 		error = (*so->so_proto->pr_usrreqs->pru_send)(so, 0, NULL, NULL,
3107 		    NULL, current_proc());
3108 
3109 		goto done_sending;
3110 	}
3111 
3112 	mpt_dsn = sb_mb->m_pkthdr.mp_dsn;
3113 
3114 	/* First, drop acknowledged data */
3115 	if (MPTCP_SEQ_LT(mpt_dsn, mp_tp->mpt_snduna)) {
3116 		os_log_error(mptcp_log_handle, "%s - %lx: dropping data, should have been done earlier "
3117 		    "dsn %u suna %u reinject? %u\n",
3118 		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), (uint32_t)mpt_dsn,
3119 		    (uint32_t)mp_tp->mpt_snduna, !!mpte->mpte_reinjectq);
3120 		if (mpte->mpte_reinjectq) {
3121 			mptcp_clean_reinjectq(mpte);
3122 		} else {
3123 			uint64_t len = 0;
3124 			len = mp_tp->mpt_snduna - mpt_dsn;
3125 			sbdrop(&mp_so->so_snd, (int)len);
3126 			wakeup = 1;
3127 		}
3128 	}
3129 
3130 	/* Check again because of above sbdrop */
3131 	if (mp_so->so_snd.sb_mb == NULL && mpte->mpte_reinjectq == NULL) {
3132 		os_log_error(mptcp_log_handle, "%s - $%lx: send-buffer is empty\n",
3133 		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
3134 		goto out;
3135 	}
3136 
3137 	/*
3138 	 * In degraded mode, we don't receive data acks, so force free
3139 	 * mbufs less than snd_nxt
3140 	 */
3141 	if ((mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
3142 	    (mp_tp->mpt_flags & MPTCPF_POST_FALLBACK_SYNC) &&
3143 	    mp_so->so_snd.sb_mb) {
3144 		mpt_dsn = mp_so->so_snd.sb_mb->m_pkthdr.mp_dsn;
3145 		if (MPTCP_SEQ_LT(mpt_dsn, mp_tp->mpt_snduna)) {
3146 			uint64_t len = 0;
3147 			len = mp_tp->mpt_snduna - mpt_dsn;
3148 			sbdrop(&mp_so->so_snd, (int)len);
3149 			wakeup = 1;
3150 
3151 			os_log_error(mptcp_log_handle, "%s - %lx: dropping data in degraded mode, should have been done earlier dsn %u sndnxt %u suna %u\n",
3152 			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
3153 			    (uint32_t)mpt_dsn, (uint32_t)mp_tp->mpt_sndnxt, (uint32_t)mp_tp->mpt_snduna);
3154 		}
3155 	}
3156 
3157 	if ((mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
3158 	    !(mp_tp->mpt_flags & MPTCPF_POST_FALLBACK_SYNC)) {
3159 		mp_tp->mpt_flags |= MPTCPF_POST_FALLBACK_SYNC;
3160 		so->so_flags1 |= SOF1_POST_FALLBACK_SYNC;
3161 	}
3162 
3163 	/*
3164 	 * Adjust the top level notion of next byte used for retransmissions
3165 	 * and sending FINs.
3166 	 */
3167 	if (MPTCP_SEQ_LT(mp_tp->mpt_sndnxt, mp_tp->mpt_snduna)) {
3168 		mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;
3169 	}
3170 
3171 	/* Now determine the offset from which to start transmitting data */
3172 	if (mpte->mpte_reinjectq) {
3173 		sb_mb = mpte->mpte_reinjectq;
3174 	} else {
3175 dont_reinject:
3176 		sb_mb = mp_so->so_snd.sb_mb;
3177 	}
3178 	if (sb_mb == NULL) {
3179 		os_log_error(mptcp_log_handle, "%s - %lx: send-buffer is still empty\n", __func__,
3180 		    (unsigned long)VM_KERNEL_ADDRPERM(mpte));
3181 		goto out;
3182 	}
3183 
3184 	if (sb_mb == mpte->mpte_reinjectq) {
3185 		sb_cc = sb_mb->m_pkthdr.mp_rlen;
3186 		off = 0;
3187 
3188 		if (mptcp_search_seq_in_sub(sb_mb, so)) {
3189 			if (mptcp_can_send_more(mp_tp, TRUE)) {
3190 				goto dont_reinject;
3191 			}
3192 
3193 			error = ECANCELED;
3194 			goto out;
3195 		}
3196 
3197 		reinjected = TRUE;
3198 	} else if (flags & MPTCP_SUBOUT_PROBING) {
3199 		sb_cc = sb_mb->m_pkthdr.mp_rlen;
3200 		off = 0;
3201 	} else {
3202 		sb_cc = min(mp_so->so_snd.sb_cc, mp_tp->mpt_sndwnd);
3203 
3204 		/*
3205 		 * With TFO, there might be no data at all, thus still go into this
3206 		 * code-path here.
3207 		 */
3208 		if ((mp_so->so_flags1 & SOF1_PRECONNECT_DATA) ||
3209 		    MPTCP_SEQ_LT(mp_tp->mpt_sndnxt, mp_tp->mpt_sndmax)) {
3210 			off = mp_tp->mpt_sndnxt - mp_tp->mpt_snduna;
3211 			sb_cc -= off;
3212 		} else {
3213 			os_log_error(mptcp_log_handle, "%s - %lx: this should not happen: sndnxt %u sndmax %u\n",
3214 			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), (uint32_t)mp_tp->mpt_sndnxt,
3215 			    (uint32_t)mp_tp->mpt_sndmax);
3216 
3217 			goto out;
3218 		}
3219 	}
3220 
3221 	sb_cc = min(sb_cc, mptcp_subflow_cwnd_space(so));
3222 	if (sb_cc <= 0) {
3223 		os_log_error(mptcp_log_handle, "%s - %lx: sb_cc is %d, mp_so->sb_cc %u, sndwnd %u,sndnxt %u sndmax %u cwnd %u\n",
3224 		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), sb_cc, mp_so->so_snd.sb_cc, mp_tp->mpt_sndwnd,
3225 		    (uint32_t)mp_tp->mpt_sndnxt, (uint32_t)mp_tp->mpt_sndmax,
3226 		    mptcp_subflow_cwnd_space(so));
3227 	}
3228 
3229 	sb_cc = min(sb_cc, UINT16_MAX);
3230 
3231 	/*
3232 	 * Create a DSN mapping for the data we are about to send. It all
3233 	 * has the same mapping.
3234 	 */
3235 	if (reinjected) {
3236 		mpt_dsn = sb_mb->m_pkthdr.mp_dsn;
3237 	} else {
3238 		mpt_dsn = mp_tp->mpt_snduna + off;
3239 	}
3240 
3241 	mpt_mbuf = sb_mb;
3242 	while (mpt_mbuf && reinjected == FALSE &&
3243 	    (mpt_mbuf->m_pkthdr.mp_rlen == 0 ||
3244 	    mpt_mbuf->m_pkthdr.mp_rlen <= (uint32_t)off)) {
3245 		off -= mpt_mbuf->m_pkthdr.mp_rlen;
3246 		mpt_mbuf = mpt_mbuf->m_next;
3247 	}
3248 	VERIFY((mpt_mbuf == NULL) || (mpt_mbuf->m_pkthdr.pkt_flags & PKTF_MPTCP));
3249 
3250 	head = tail = NULL;
3251 
3252 	while (tot_sent < sb_cc) {
3253 		int32_t mlen;
3254 
3255 		mlen = mpt_mbuf->m_len;
3256 		mlen -= off;
3257 		mlen = MIN(mlen, sb_cc - tot_sent);
3258 
3259 		if (mlen < 0) {
3260 			os_log_error(mptcp_log_handle, "%s - %lx: mlen %d mp_rlen %u off %u sb_cc %u tot_sent %u\n",
3261 			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mlen, mpt_mbuf->m_pkthdr.mp_rlen,
3262 			    (uint32_t)off, sb_cc, tot_sent);
3263 			goto out;
3264 		}
3265 
3266 		if (mlen == 0) {
3267 			goto next;
3268 		}
3269 
3270 		m = m_copym_mode(mpt_mbuf, (int)off, mlen, M_DONTWAIT, NULL, NULL,
3271 		    M_COPYM_MUST_COPY_HDR);
3272 		if (m == NULL) {
3273 			os_log_error(mptcp_log_handle, "%s - %lx: m_copym_mode failed\n", __func__,
3274 			    (unsigned long)VM_KERNEL_ADDRPERM(mpte));
3275 			error = ENOBUFS;
3276 			break;
3277 		}
3278 
3279 		/* Create a DSN mapping for the data (m_copym does it) */
3280 		VERIFY(m->m_flags & M_PKTHDR);
3281 		VERIFY(m->m_next == NULL);
3282 
3283 		m->m_pkthdr.pkt_flags |= PKTF_MPTCP;
3284 		m->m_pkthdr.pkt_flags &= ~PKTF_MPSO;
3285 		m->m_pkthdr.mp_dsn = mpt_dsn;
3286 		m->m_pkthdr.mp_rseq = mpts->mpts_rel_seq;
3287 		m->m_pkthdr.len = mlen;
3288 
3289 		if (head == NULL) {
3290 			head = tail = m;
3291 		} else {
3292 			tail->m_next = m;
3293 			tail = m;
3294 		}
3295 
3296 		tot_sent += mlen;
3297 		off = 0;
3298 next:
3299 		mpt_mbuf = mpt_mbuf->m_next;
3300 	}
3301 
3302 	if (reinjected) {
3303 		if (sb_cc < sb_mb->m_pkthdr.mp_rlen) {
3304 			struct mbuf *n = sb_mb;
3305 
3306 			while (n) {
3307 				n->m_pkthdr.mp_dsn += sb_cc;
3308 				n->m_pkthdr.mp_rlen -= sb_cc;
3309 				n = n->m_next;
3310 			}
3311 			m_adj(sb_mb, sb_cc);
3312 		} else {
3313 			mpte->mpte_reinjectq = sb_mb->m_nextpkt;
3314 			m_freem(sb_mb);
3315 		}
3316 	}
3317 
3318 	if (head && (mp_tp->mpt_flags & MPTCPF_CHECKSUM)) {
3319 		dss_csum = mptcp_output_csum(head, mpt_dsn, mpts->mpts_rel_seq,
3320 		    tot_sent);
3321 	}
3322 
3323 	/* Now, let's update rel-seq and the data-level length */
3324 	mpts->mpts_rel_seq += tot_sent;
3325 	m = head;
3326 	while (m) {
3327 		if (mp_tp->mpt_flags & MPTCPF_CHECKSUM) {
3328 			m->m_pkthdr.mp_csum = dss_csum;
3329 		}
3330 		m->m_pkthdr.mp_rlen = tot_sent;
3331 		m = m->m_next;
3332 	}
3333 
3334 	if (head != NULL) {
3335 		if ((mpts->mpts_flags & MPTSF_TFO_REQD) &&
3336 		    (tp->t_tfo_stats == 0)) {
3337 			tp->t_mpflags |= TMPF_TFO_REQUEST;
3338 		}
3339 
3340 		error = so->so_proto->pr_usrreqs->pru_sosend(so, NULL, NULL, head, NULL, 0);
3341 		head = NULL;
3342 	}
3343 
3344 done_sending:
3345 	if (error == 0 ||
3346 	    (error == EWOULDBLOCK && (tp->t_mpflags & TMPF_TFO_REQUEST))) {
3347 		uint64_t new_sndnxt = mp_tp->mpt_sndnxt + tot_sent;
3348 
3349 		if (mpts->mpts_probesoon && mpts->mpts_maxseg && tot_sent) {
3350 			tcpstat.tcps_mp_num_probes++;
3351 			if ((uint32_t)tot_sent < mpts->mpts_maxseg) {
3352 				mpts->mpts_probecnt += 1;
3353 			} else {
3354 				mpts->mpts_probecnt +=
3355 				    tot_sent / mpts->mpts_maxseg;
3356 			}
3357 		}
3358 
3359 		if (!reinjected && !(flags & MPTCP_SUBOUT_PROBING)) {
3360 			if (MPTCP_DATASEQ_HIGH32(new_sndnxt) >
3361 			    MPTCP_DATASEQ_HIGH32(mp_tp->mpt_sndnxt)) {
3362 				mp_tp->mpt_flags |= MPTCPF_SND_64BITDSN;
3363 			}
3364 			mp_tp->mpt_sndnxt = new_sndnxt;
3365 		}
3366 
3367 		mptcp_cancel_timer(mp_tp, MPTT_REXMT);
3368 
3369 		/* Must be here as mptcp_can_send_more() checks for this */
3370 		soclearfastopen(mp_so);
3371 
3372 		if (IFNET_IS_CELLULAR(sotoinpcb(so)->inp_last_outifp)) {
3373 			mptcp_set_cellicon(mpte, mpts);
3374 
3375 			mpte->mpte_used_cell = 1;
3376 		} else {
3377 			/*
3378 			 * If during the past MPTCP_CELLICON_TOGGLE_RATE seconds we didn't
3379 			 * explicitly set the cellicon, then we unset it again.
3380 			 */
3381 			if (TSTMP_LT(mpte->mpte_last_cellicon_set + MPTCP_CELLICON_TOGGLE_RATE, tcp_now)) {
3382 				mptcp_unset_cellicon(mpte, NULL, 1);
3383 			}
3384 
3385 			mpte->mpte_used_wifi = 1;
3386 		}
3387 
3388 		/*
3389 		 * Don't propagate EWOULDBLOCK - it's already taken care of
3390 		 * in mptcp_usr_send for TFO.
3391 		 */
3392 		error = 0;
3393 	} else {
3394 		/* We need to revert our change to mpts_rel_seq */
3395 		mpts->mpts_rel_seq -= tot_sent;
3396 
3397 		os_log_error(mptcp_log_handle, "%s - %lx: %u error %d len %d subflags %#x sostate %#x soerror %u hiwat %u lowat %u\n",
3398 		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpts->mpts_connid, error, tot_sent, so->so_flags, so->so_state, so->so_error, so->so_snd.sb_hiwat, so->so_snd.sb_lowat);
3399 	}
3400 out:
3401 
3402 	if (head != NULL) {
3403 		m_freem(head);
3404 	}
3405 
3406 	if (wakeup) {
3407 		mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_WWAKEUP;
3408 	}
3409 
3410 	mptcp_handle_deferred_upcalls(mpte->mpte_mppcb, MPP_INSIDE_OUTPUT);
3411 	return error;
3412 }
3413 
3414 static void
mptcp_add_reinjectq(struct mptses * mpte,struct mbuf * m)3415 mptcp_add_reinjectq(struct mptses *mpte, struct mbuf *m)
3416 {
3417 	struct mbuf *n, *prev = NULL;
3418 
3419 	n = mpte->mpte_reinjectq;
3420 
3421 	/* First, look for an mbuf n, whose data-sequence-number is bigger or
3422 	 * equal than m's sequence number.
3423 	 */
3424 	while (n) {
3425 		if (MPTCP_SEQ_GEQ(n->m_pkthdr.mp_dsn, m->m_pkthdr.mp_dsn)) {
3426 			break;
3427 		}
3428 
3429 		prev = n;
3430 
3431 		n = n->m_nextpkt;
3432 	}
3433 
3434 	if (n) {
3435 		/* m is already fully covered by the next mbuf in the queue */
3436 		if (n->m_pkthdr.mp_dsn == m->m_pkthdr.mp_dsn &&
3437 		    n->m_pkthdr.mp_rlen >= m->m_pkthdr.mp_rlen) {
3438 			os_log(mptcp_log_handle, "%s - %lx: dsn %u dlen %u rseq %u fully covered with len %u\n",
3439 			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
3440 			    (uint32_t)m->m_pkthdr.mp_dsn, m->m_pkthdr.mp_rlen,
3441 			    m->m_pkthdr.mp_rseq, n->m_pkthdr.mp_rlen);
3442 			goto dont_queue;
3443 		}
3444 
3445 		/* m is covering the next mbuf entirely, thus we remove this guy */
3446 		if (m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen >= n->m_pkthdr.mp_dsn + n->m_pkthdr.mp_rlen) {
3447 			struct mbuf *tmp = n->m_nextpkt;
3448 
3449 			os_log(mptcp_log_handle, "%s - %lx: m (dsn %u len %u) is covering existing mbuf (dsn %u len %u)\n",
3450 			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
3451 			    (uint32_t)m->m_pkthdr.mp_dsn, m->m_pkthdr.mp_rlen,
3452 			    (uint32_t)n->m_pkthdr.mp_dsn, n->m_pkthdr.mp_rlen);
3453 
3454 			m->m_nextpkt = NULL;
3455 			if (prev == NULL) {
3456 				mpte->mpte_reinjectq = tmp;
3457 			} else {
3458 				prev->m_nextpkt = tmp;
3459 			}
3460 
3461 			m_freem(n);
3462 			n = tmp;
3463 		}
3464 	}
3465 
3466 	if (prev) {
3467 		/* m is already fully covered by the previous mbuf in the queue */
3468 		if (prev->m_pkthdr.mp_dsn + prev->m_pkthdr.mp_rlen >= m->m_pkthdr.mp_dsn + m->m_pkthdr.len) {
3469 			os_log(mptcp_log_handle, "%s - %lx: prev (dsn %u len %u) covers us (dsn %u len %u)\n",
3470 			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
3471 			    (uint32_t)prev->m_pkthdr.mp_dsn, prev->m_pkthdr.mp_rlen,
3472 			    (uint32_t)m->m_pkthdr.mp_dsn, m->m_pkthdr.mp_rlen);
3473 			goto dont_queue;
3474 		}
3475 	}
3476 
3477 	if (prev == NULL) {
3478 		mpte->mpte_reinjectq = m;
3479 	} else {
3480 		prev->m_nextpkt = m;
3481 	}
3482 
3483 	m->m_nextpkt = n;
3484 
3485 	return;
3486 
3487 dont_queue:
3488 	m_freem(m);
3489 	return;
3490 }
3491 
3492 static struct mbuf *
mptcp_lookup_dsn(struct mptses * mpte,uint64_t dsn)3493 mptcp_lookup_dsn(struct mptses *mpte, uint64_t dsn)
3494 {
3495 	struct socket *mp_so = mptetoso(mpte);
3496 	struct mbuf *m;
3497 
3498 	m = mp_so->so_snd.sb_mb;
3499 
3500 	while (m) {
3501 		/* If this segment covers what we are looking for, return it. */
3502 		if (MPTCP_SEQ_LEQ(m->m_pkthdr.mp_dsn, dsn) &&
3503 		    MPTCP_SEQ_GT(m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen, dsn)) {
3504 			break;
3505 		}
3506 
3507 
3508 		/* Segment is no more in the queue */
3509 		if (MPTCP_SEQ_GT(m->m_pkthdr.mp_dsn, dsn)) {
3510 			return NULL;
3511 		}
3512 
3513 		m = m->m_next;
3514 	}
3515 
3516 	return m;
3517 }
3518 
/*
 * Duplicate the first 'len' bytes worth of mbufs of one DSN mapping,
 * preserving the MPTCP metadata (DSN, mapping length, relative seq)
 * carried in each packet header.  Used to build reinjection copies.
 *
 * Returns the copied chain, or NULL if a copy failed (any partial
 * copy is freed).
 */
static struct mbuf *
mptcp_copy_mbuf_list(struct mptses *mpte, struct mbuf *m, int len)
{
	struct mbuf *top = NULL, *tail = NULL;
	uint64_t dsn;
	uint32_t dlen, rseq;

	/* All mbufs of one mapping must carry identical DSN metadata */
	dsn = m->m_pkthdr.mp_dsn;
	dlen = m->m_pkthdr.mp_rlen;
	rseq = m->m_pkthdr.mp_rseq;

	while (len > 0) {
		struct mbuf *n;

		VERIFY((m->m_flags & M_PKTHDR) && (m->m_pkthdr.pkt_flags & PKTF_MPTCP));

		/* Copy one mbuf, forcing the pkthdr (and thus mp_* fields) along */
		n = m_copym_mode(m, 0, m->m_len, M_DONTWAIT, NULL, NULL, M_COPYM_MUST_COPY_HDR);
		if (n == NULL) {
			os_log_error(mptcp_log_handle, "%s - %lx: m_copym_mode returned NULL\n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
			goto err;
		}

		VERIFY(n->m_flags & M_PKTHDR);
		VERIFY(n->m_next == NULL);
		VERIFY(n->m_pkthdr.mp_dsn == dsn);
		VERIFY(n->m_pkthdr.mp_rlen == dlen);
		VERIFY(n->m_pkthdr.mp_rseq == rseq);
		VERIFY(n->m_len == m->m_len);

		n->m_pkthdr.pkt_flags |= (PKTF_MPSO | PKTF_MPTCP);

		if (top == NULL) {
			top = n;
		}

		if (tail != NULL) {
			tail->m_next = n;
		}

		tail = n;

		len -= m->m_len;
		m = m->m_next;
	}

	return top;

err:
	if (top) {
		m_freem(top);
	}

	return NULL;
}
3574 
/*
 * Scan the subflow's send buffer and queue copies of all mappings that
 * are not yet acknowledged at the MPTCP data level onto the session's
 * reinject queue, so they can be retransmitted over another subflow
 * (failover).  Mappings already marked PKTF_MPTCP_REINJ are skipped.
 */
static void
mptcp_reinject_mbufs(struct socket *so)
{
	struct tcpcb *tp = sototcpcb(so);
	struct mptsub *mpts = tp->t_mpsub;
	struct mptcb *mp_tp = tptomptp(tp);
	struct mptses *mpte = mp_tp->mpt_mpte;;
	struct sockbuf *sb = &so->so_snd;
	struct mbuf *m;

	m = sb->sb_mb;
	while (m) {
		struct mbuf *n = m->m_next, *orig = m;
		bool set_reinject_flag = false;

		VERIFY((m->m_flags & M_PKTHDR) && (m->m_pkthdr.pkt_flags & PKTF_MPTCP));

		/* Already queued for reinjection once - don't do it twice */
		if (m->m_pkthdr.pkt_flags & PKTF_MPTCP_REINJ) {
			goto next;
		}

		/* Has it all already been acknowledged at the data-level? */
		if (MPTCP_SEQ_GEQ(mp_tp->mpt_snduna, m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen)) {
			goto next;
		}

		/* Part of this has already been acknowledged - lookup in the
		 * MPTCP-socket for the segment.
		 */
		if (SEQ_GT(tp->snd_una - mpts->mpts_iss, m->m_pkthdr.mp_rseq)) {
			m = mptcp_lookup_dsn(mpte, m->m_pkthdr.mp_dsn);
			if (m == NULL) {
				goto next;
			}
		}

		/* Copy the mbuf with headers (aka, DSN-numbers) */
		m = mptcp_copy_mbuf_list(mpte, m, m->m_pkthdr.mp_rlen);
		if (m == NULL) {
			/* Copy failed (e.g. out of mbufs) - stop here */
			break;
		}

		VERIFY(m->m_nextpkt == NULL);

		/* Now, add to the reinject-queue, eliminating overlapping
		 * segments
		 */
		mptcp_add_reinjectq(mpte, m);

		set_reinject_flag = true;
		orig->m_pkthdr.pkt_flags |= PKTF_MPTCP_REINJ;

next:
		/* mp_rlen can cover multiple mbufs, so advance to the end of it. */
		while (n) {
			VERIFY((n->m_flags & M_PKTHDR) && (n->m_pkthdr.pkt_flags & PKTF_MPTCP));

			if (n->m_pkthdr.mp_dsn != orig->m_pkthdr.mp_dsn) {
				break;
			}

			/* Mark the whole mapping, not just its first mbuf */
			if (set_reinject_flag) {
				n->m_pkthdr.pkt_flags |= PKTF_MPTCP_REINJ;
			}
			n = n->m_next;
		}

		m = n;
	}
}
3645 
/*
 * Drop segments at the front of the reinject queue that have been fully
 * acknowledged at the MPTCP data level (entirely below mpt_snduna).
 * Stops at the first segment that still has unacknowledged bytes.
 */
void
mptcp_clean_reinjectq(struct mptses *mpte)
{
	struct mptcb *mp_tp = mpte->mpte_mptcb;

	socket_lock_assert_owned(mptetoso(mpte));

	while (mpte->mpte_reinjectq) {
		struct mbuf *m = mpte->mpte_reinjectq;

		/* Keep the segment if any byte of it is still unacked */
		if (MPTCP_SEQ_GEQ(m->m_pkthdr.mp_dsn, mp_tp->mpt_snduna) ||
		    MPTCP_SEQ_GT(m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen, mp_tp->mpt_snduna)) {
			break;
		}

		mpte->mpte_reinjectq = m->m_nextpkt;
		m->m_nextpkt = NULL;
		m_freem(m);
	}
}
3666 
3667 static ev_ret_t
mptcp_subflow_propagate_ev(struct mptses * mpte,struct mptsub * mpts,uint32_t * p_mpsofilt_hint,uint32_t event)3668 mptcp_subflow_propagate_ev(struct mptses *mpte, struct mptsub *mpts,
3669     uint32_t *p_mpsofilt_hint, uint32_t event)
3670 {
3671 	struct socket *mp_so, *so;
3672 	struct mptcb *mp_tp;
3673 
3674 	mp_so = mptetoso(mpte);
3675 	mp_tp = mpte->mpte_mptcb;
3676 	so = mpts->mpts_socket;
3677 
3678 	/*
3679 	 * We got an event for this subflow that might need to be propagated,
3680 	 * based on the state of the MPTCP connection.
3681 	 */
3682 	if (mp_tp->mpt_state < MPTCPS_ESTABLISHED ||
3683 	    (!(mp_tp->mpt_flags & MPTCPF_JOIN_READY) && !(mpts->mpts_flags & MPTSF_MP_READY)) ||
3684 	    ((mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) && (mpts->mpts_flags & MPTSF_ACTIVE))) {
3685 		mp_so->so_error = so->so_error;
3686 		*p_mpsofilt_hint |= event;
3687 	}
3688 
3689 	return MPTS_EVRET_OK;
3690 }
3691 
3692 /*
3693  * Handle SO_FILT_HINT_NOSRCADDR subflow socket event.
3694  */
3695 static ev_ret_t
mptcp_subflow_nosrcaddr_ev(struct mptses * mpte,struct mptsub * mpts,uint32_t * p_mpsofilt_hint,uint32_t event)3696 mptcp_subflow_nosrcaddr_ev(struct mptses *mpte, struct mptsub *mpts,
3697     uint32_t *p_mpsofilt_hint, uint32_t event)
3698 {
3699 	struct socket *mp_so;
3700 	struct tcpcb *tp;
3701 
3702 	mp_so = mptetoso(mpte);
3703 	tp = intotcpcb(sotoinpcb(mpts->mpts_socket));
3704 
3705 	/*
3706 	 * This overwrites any previous mpte_lost_aid to avoid storing
3707 	 * too much state when the typical case has only two subflows.
3708 	 */
3709 	mpte->mpte_flags |= MPTE_SND_REM_ADDR;
3710 	mpte->mpte_lost_aid = tp->t_local_aid;
3711 
3712 	/*
3713 	 * The subflow connection has lost its source address.
3714 	 */
3715 	mptcp_subflow_abort(mpts, EADDRNOTAVAIL);
3716 
3717 	if (mp_so->so_flags & SOF_NOADDRAVAIL) {
3718 		mptcp_subflow_propagate_ev(mpte, mpts, p_mpsofilt_hint, event);
3719 	}
3720 
3721 	return MPTS_EVRET_DELETE;
3722 }
3723 
3724 static ev_ret_t
mptcp_subflow_mpsuberror_ev(struct mptses * mpte,struct mptsub * mpts,uint32_t * p_mpsofilt_hint,uint32_t event)3725 mptcp_subflow_mpsuberror_ev(struct mptses *mpte, struct mptsub *mpts,
3726     uint32_t *p_mpsofilt_hint, uint32_t event)
3727 {
3728 #pragma unused(event, p_mpsofilt_hint)
3729 	struct socket *so, *mp_so;
3730 
3731 	so = mpts->mpts_socket;
3732 
3733 	if (so->so_error != ENODATA) {
3734 		return MPTS_EVRET_OK;
3735 	}
3736 
3737 
3738 	mp_so = mptetoso(mpte);
3739 
3740 	mp_so->so_error = ENODATA;
3741 
3742 	sorwakeup(mp_so);
3743 	sowwakeup(mp_so);
3744 
3745 	return MPTS_EVRET_OK;
3746 }
3747 
3748 
3749 /*
3750  * Handle SO_FILT_HINT_MPCANTRCVMORE subflow socket event that
3751  * indicates that the remote side sent a Data FIN
3752  */
3753 static ev_ret_t
mptcp_subflow_mpcantrcvmore_ev(struct mptses * mpte,struct mptsub * mpts,uint32_t * p_mpsofilt_hint,uint32_t event)3754 mptcp_subflow_mpcantrcvmore_ev(struct mptses *mpte, struct mptsub *mpts,
3755     uint32_t *p_mpsofilt_hint, uint32_t event)
3756 {
3757 #pragma unused(event, mpts)
3758 	struct mptcb *mp_tp = mpte->mpte_mptcb;
3759 
3760 	/*
3761 	 * We got a Data FIN for the MPTCP connection.
3762 	 * The FIN may arrive with data. The data is handed up to the
3763 	 * mptcp socket and the user is notified so that it may close
3764 	 * the socket if needed.
3765 	 */
3766 	if (mp_tp->mpt_state == MPTCPS_CLOSE_WAIT) {
3767 		*p_mpsofilt_hint |= SO_FILT_HINT_CANTRCVMORE;
3768 	}
3769 
3770 	return MPTS_EVRET_OK; /* keep the subflow socket around */
3771 }
3772 
3773 /*
3774  * Handle SO_FILT_HINT_MPFAILOVER subflow socket event
3775  */
3776 static ev_ret_t
mptcp_subflow_failover_ev(struct mptses * mpte,struct mptsub * mpts,uint32_t * p_mpsofilt_hint,uint32_t event)3777 mptcp_subflow_failover_ev(struct mptses *mpte, struct mptsub *mpts,
3778     uint32_t *p_mpsofilt_hint, uint32_t event)
3779 {
3780 #pragma unused(event, p_mpsofilt_hint)
3781 	struct mptsub *mpts_alt = NULL;
3782 	struct socket *alt_so = NULL;
3783 	struct socket *mp_so;
3784 	int altpath_exists = 0;
3785 
3786 	mp_so = mptetoso(mpte);
3787 	os_log_info(mptcp_log_handle, "%s - %lx\n", __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
3788 
3789 	mptcp_reinject_mbufs(mpts->mpts_socket);
3790 
3791 	mpts_alt = mptcp_get_subflow(mpte, NULL);
3792 
3793 	/* If there is no alternate eligible subflow, ignore the failover hint. */
3794 	if (mpts_alt == NULL || mpts_alt == mpts) {
3795 		os_log(mptcp_log_handle, "%s - %lx no alternate path\n", __func__,
3796 		    (unsigned long)VM_KERNEL_ADDRPERM(mpte));
3797 
3798 		goto done;
3799 	}
3800 
3801 	altpath_exists = 1;
3802 	alt_so = mpts_alt->mpts_socket;
3803 	if (mpts_alt->mpts_flags & MPTSF_FAILINGOVER) {
3804 		/* All data acknowledged and no RTT spike */
3805 		if (alt_so->so_snd.sb_cc == 0 && mptcp_no_rto_spike(alt_so)) {
3806 			mpts_alt->mpts_flags &= ~MPTSF_FAILINGOVER;
3807 		} else {
3808 			/* no alternate path available */
3809 			altpath_exists = 0;
3810 		}
3811 	}
3812 
3813 	if (altpath_exists) {
3814 		mpts_alt->mpts_flags |= MPTSF_ACTIVE;
3815 
3816 		mpte->mpte_active_sub = mpts_alt;
3817 		mpts->mpts_flags |= MPTSF_FAILINGOVER;
3818 		mpts->mpts_flags &= ~MPTSF_ACTIVE;
3819 
3820 		os_log_info(mptcp_log_handle, "%s - %lx: switched from %d to %d\n",
3821 		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpts->mpts_connid, mpts_alt->mpts_connid);
3822 
3823 		mptcpstats_inc_switch(mpte, mpts);
3824 
3825 		sowwakeup(alt_so);
3826 	} else {
3827 done:
3828 		mpts->mpts_socket->so_flags &= ~SOF_MP_TRYFAILOVER;
3829 	}
3830 
3831 	return MPTS_EVRET_OK;
3832 }
3833 
3834 /*
3835  * Handle SO_FILT_HINT_IFDENIED subflow socket event.
3836  */
3837 static ev_ret_t
mptcp_subflow_ifdenied_ev(struct mptses * mpte,struct mptsub * mpts,uint32_t * p_mpsofilt_hint,uint32_t event)3838 mptcp_subflow_ifdenied_ev(struct mptses *mpte, struct mptsub *mpts,
3839     uint32_t *p_mpsofilt_hint, uint32_t event)
3840 {
3841 	/*
3842 	 * The subflow connection cannot use the outgoing interface, let's
3843 	 * close this subflow.
3844 	 */
3845 	mptcp_subflow_abort(mpts, EPERM);
3846 
3847 	mptcp_subflow_propagate_ev(mpte, mpts, p_mpsofilt_hint, event);
3848 
3849 	return MPTS_EVRET_DELETE;
3850 }
3851 
3852 /*
3853  * https://tools.ietf.org/html/rfc6052#section-2
3854  * https://tools.ietf.org/html/rfc6147#section-5.2
3855  */
3856 static boolean_t
mptcp_desynthesize_ipv6_addr(struct mptses * mpte,const struct in6_addr * addr0,const struct ipv6_prefix * prefix,struct in_addr * addrv4_0)3857 mptcp_desynthesize_ipv6_addr(struct mptses *mpte, const struct in6_addr *addr0,
3858     const struct ipv6_prefix *prefix,
3859     struct in_addr *addrv4_0)
3860 {
3861 	char buf[MAX_IPv4_STR_LEN];
3862 	const struct in6_addr *addr = addr0;
3863 	const char *ptr = (const char *)addr;
3864 	struct in_addr *addrv4 = addrv4_0;
3865 	char *ptrv4 = (char *)addrv4;
3866 
3867 	if (memcmp(ptr, &prefix->ipv6_prefix, prefix->prefix_len) != 0) {
3868 		return false;
3869 	}
3870 
3871 	switch (prefix->prefix_len) {
3872 	case NAT64_PREFIX_LEN_96:
3873 		memcpy(ptrv4, ptr + 12, 4);
3874 		break;
3875 	case NAT64_PREFIX_LEN_64:
3876 		memcpy(ptrv4, ptr + 9, 4);
3877 		break;
3878 	case NAT64_PREFIX_LEN_56:
3879 		memcpy(ptrv4, ptr + 7, 1);
3880 		memcpy(ptrv4 + 1, ptr + 9, 3);
3881 		break;
3882 	case NAT64_PREFIX_LEN_48:
3883 		memcpy(ptrv4, ptr + 6, 2);
3884 		memcpy(ptrv4 + 2, ptr + 9, 2);
3885 		break;
3886 	case NAT64_PREFIX_LEN_40:
3887 		memcpy(ptrv4, ptr + 5, 3);
3888 		memcpy(ptrv4 + 3, ptr + 9, 1);
3889 		break;
3890 	case NAT64_PREFIX_LEN_32:
3891 		memcpy(ptrv4, ptr + 4, 4);
3892 		break;
3893 	default:
3894 		panic("NAT64-prefix len is wrong: %u",
3895 		    prefix->prefix_len);
3896 	}
3897 
3898 	os_log_info(mptcp_log_handle, "%s - %lx: desynthesized to %s\n", __func__,
3899 	    (unsigned long)VM_KERNEL_ADDRPERM(mpte),
3900 	    inet_ntop(AF_INET, (void *)addrv4, buf, sizeof(buf)));
3901 
3902 	return true;
3903 }
3904 
/*
 * After an IPv6 subflow connected, check whether the destination is a
 * NAT64-synthesized address and, if so, recover the original IPv4
 * destination for use by future subflows on IPv4-capable interfaces.
 */
static void
mptcp_handle_ipv6_connection(struct mptses *mpte, const struct mptsub *mpts)
{
	struct ipv6_prefix nat64prefixes[NAT64_MAX_NUM_PREFIXES];
	struct socket *so = mpts->mpts_socket;
	struct ifnet *ifp;
	int j;

	/* Subflow IPs will be steered directly by the server - no need to
	 * desynthesize.
	 */
	if (mpte->mpte_flags & MPTE_UNICAST_IP) {
		return;
	}

	ifp = sotoinpcb(so)->inp_last_outifp;

	/* No NAT64 prefix on this interface - nothing to desynthesize */
	if (ifnet_get_nat64prefix(ifp, nat64prefixes) == ENOENT) {
		return;
	}

	/* Try each configured prefix until one matches the destination */
	for (j = 0; j < NAT64_MAX_NUM_PREFIXES; j++) {
		int success;

		if (nat64prefixes[j].prefix_len == 0) {
			continue;
		}

		success = mptcp_desynthesize_ipv6_addr(mpte,
		    &mpte->__mpte_dst_v6.sin6_addr,
		    &nat64prefixes[j],
		    &mpte->mpte_sub_dst_v4.sin_addr);
		if (success) {
			mpte->mpte_sub_dst_v4.sin_len = sizeof(mpte->mpte_sub_dst_v4);
			mpte->mpte_sub_dst_v4.sin_family = AF_INET;
			mpte->mpte_sub_dst_v4.sin_port = mpte->__mpte_dst_v6.sin6_port;

			/*
			 * We connected to a NAT64'ed address. Let's remove it
			 * from the potential IPs to use. Whenever we are back on
			 * that network and need to connect, we can synthesize again.
			 *
			 * Otherwise, on different IPv6 networks we will attempt
			 * to connect to that NAT64 address...
			 */
			memset(&mpte->mpte_sub_dst_v6, 0, sizeof(mpte->mpte_sub_dst_v6));
			break;
		}
	}
}
3955 
/*
 * A subflow failed to establish MPTCP on its current destination port.
 * If an alternate port is configured and not yet tried, launch a new
 * subflow to it; otherwise mark the subflow's interface as not
 * supporting MPTCP so we stop retrying on it.
 */
static void
mptcp_try_alternate_port(struct mptses *mpte, struct mptsub *mpts)
{
	struct inpcb *inp;

	if (!mptcp_ok_to_create_subflows(mpte->mpte_mptcb)) {
		return;
	}

	inp = sotoinpcb(mpts->mpts_socket);
	if (inp == NULL) {
		return;
	}

	/* Should we try the alternate port? */
	if (mpte->mpte_alternate_port &&
	    inp->inp_fport != mpte->mpte_alternate_port) {
		union sockaddr_in_4_6 dst;
		struct sockaddr_in *dst_in = SIN(&dst);

		/* Same destination address, alternate port */
		SOCKADDR_COPY(&mpts->mpts_dst, &dst, mpts->mpts_dst.sa_len);

		dst_in->sin_port = mpte->mpte_alternate_port;

		mptcp_subflow_add(mpte, NULL, SA(&dst), mpts->mpts_ifscope, NULL);
	} else { /* Else, we tried all we could, mark this interface as non-MPTCP */
		unsigned int i;

		if (inp->inp_last_outifp == NULL) {
			return;
		}

		/* Find this subflow's interface in the info table and flag it */
		for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
			struct mpt_itf_info *info =  &mpte->mpte_itfinfo[i];

			if (inp->inp_last_outifp->if_index == info->ifindex) {
				info->no_mptcp_support = 1;
				break;
			}
		}
	}
}
3998 
/* If TFO data is successfully acked, it must be dropped from the mptcp so */
static void
mptcp_drop_tfo_data(struct mptses *mpte, struct mptsub *mpts)
{
	struct socket *mp_so = mptetoso(mpte);
	struct socket *so = mpts->mpts_socket;
	struct tcpcb *tp = intotcpcb(sotoinpcb(so));
	struct mptcb *mp_tp = mpte->mpte_mptcb;

	/* If data was sent with SYN, rewind state */
	if (tp->t_tfo_stats & TFO_S_SYN_DATA_ACKED) {
		/* MPTCP-level bytes sent but not yet acked */
		u_int64_t mp_droplen = mp_tp->mpt_sndnxt - mp_tp->mpt_snduna;
		/*
		 * TCP-level SYN-data bytes the peer acked; the -1 accounts
		 * for the SYN occupying one unit of sequence space.
		 */
		unsigned int tcp_droplen = tp->snd_una - tp->iss - 1;

		VERIFY(mp_droplen <= (UINT_MAX));
		VERIFY(mp_droplen >= tcp_droplen);

		mpts->mpts_flags &= ~MPTSF_TFO_REQD;
		mpts->mpts_iss += tcp_droplen;
		tp->t_mpflags &= ~TMPF_TFO_REQUEST;

		if (mp_droplen > tcp_droplen) {
			/* handle partial TCP ack */
			mp_so->so_flags1 |= SOF1_TFO_REWIND;
			mp_tp->mpt_sndnxt = mp_tp->mpt_snduna + (mp_droplen - tcp_droplen);
			mp_droplen = tcp_droplen;
		} else {
			/* all data on SYN was acked */
			mpts->mpts_rel_seq = 1;
			mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;
		}
		mp_tp->mpt_sndmax -= tcp_droplen;

		if (mp_droplen != 0) {
			/* Drop the acked bytes from the MPTCP send buffer */
			VERIFY(mp_so->so_snd.sb_mb != NULL);
			sbdrop(&mp_so->so_snd, (int)mp_droplen);
		}
	}
}
4038 
4039 /*
4040  * Handle SO_FILT_HINT_CONNECTED subflow socket event.
4041  */
4042 static ev_ret_t
mptcp_subflow_connected_ev(struct mptses * mpte,struct mptsub * mpts,uint32_t * p_mpsofilt_hint,uint32_t event)4043 mptcp_subflow_connected_ev(struct mptses *mpte, struct mptsub *mpts,
4044     uint32_t *p_mpsofilt_hint, uint32_t event)
4045 {
4046 #pragma unused(event, p_mpsofilt_hint)
4047 	struct socket *mp_so, *so;
4048 	struct inpcb *inp;
4049 	struct tcpcb *tp;
4050 	struct mptcb *mp_tp;
4051 	int af;
4052 	boolean_t mpok = FALSE;
4053 
4054 	mp_so = mptetoso(mpte);
4055 	mp_tp = mpte->mpte_mptcb;
4056 	so = mpts->mpts_socket;
4057 	tp = sototcpcb(so);
4058 	af = mpts->mpts_dst.sa_family;
4059 
4060 	if (mpts->mpts_flags & MPTSF_CONNECTED) {
4061 		return MPTS_EVRET_OK;
4062 	}
4063 
4064 	if ((mpts->mpts_flags & MPTSF_DISCONNECTED) ||
4065 	    (mpts->mpts_flags & MPTSF_DISCONNECTING)) {
4066 		return MPTS_EVRET_OK;
4067 	}
4068 
4069 	/*
4070 	 * The subflow connection has been connected.  Find out whether it
4071 	 * is connected as a regular TCP or as a MPTCP subflow.  The idea is:
4072 	 *
4073 	 *   a. If MPTCP connection is not yet established, then this must be
4074 	 *	the first subflow connection.  If MPTCP failed to negotiate,
4075 	 *	fallback to regular TCP by degrading this subflow.
4076 	 *
4077 	 *   b. If MPTCP connection has been established, then this must be
4078 	 *	one of the subsequent subflow connections. If MPTCP failed
4079 	 *	to negotiate, disconnect the connection.
4080 	 *
4081 	 * Right now, we simply unblock any waiters at the MPTCP socket layer
4082 	 * if the MPTCP connection has not been established.
4083 	 */
4084 
4085 	if (so->so_state & SS_ISDISCONNECTED) {
4086 		/*
4087 		 * With MPTCP joins, a connection is connected at the subflow
4088 		 * level, but the 4th ACK from the server elevates the MPTCP
4089 		 * subflow to connected state. So there is a small window
4090 		 * where the subflow could get disconnected before the
4091 		 * connected event is processed.
4092 		 */
4093 		return MPTS_EVRET_OK;
4094 	}
4095 
4096 	if (mpts->mpts_flags & MPTSF_TFO_REQD) {
4097 		mptcp_drop_tfo_data(mpte, mpts);
4098 	}
4099 
4100 	mpts->mpts_flags &= ~(MPTSF_CONNECTING | MPTSF_TFO_REQD);
4101 	mpts->mpts_flags |= MPTSF_CONNECTED;
4102 
4103 	if (tp->t_mpflags & TMPF_MPTCP_TRUE) {
4104 		mpts->mpts_flags |= MPTSF_MP_CAPABLE;
4105 	}
4106 
4107 	tp->t_mpflags &= ~TMPF_TFO_REQUEST;
4108 
4109 	/* get/verify the outbound interface */
4110 	inp = sotoinpcb(so);
4111 
4112 	mpts->mpts_maxseg = tp->t_maxseg;
4113 
4114 	mpok = (mpts->mpts_flags & MPTSF_MP_CAPABLE);
4115 
4116 	if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
4117 		mp_tp->mpt_state = MPTCPS_ESTABLISHED;
4118 		mpte->mpte_associd = mpts->mpts_connid;
4119 		DTRACE_MPTCP2(state__change,
4120 		    struct mptcb *, mp_tp,
4121 		    uint32_t, 0 /* event */);
4122 
4123 		if (SOCK_DOM(so) == AF_INET) {
4124 			in_getsockaddr_s(so, &mpte->__mpte_src_v4);
4125 		} else {
4126 			in6_getsockaddr_s(so, &mpte->__mpte_src_v6);
4127 		}
4128 
4129 		mpts->mpts_flags |= MPTSF_ACTIVE;
4130 
4131 		/* case (a) above */
4132 		if (!mpok) {
4133 			tcpstat.tcps_mpcap_fallback++;
4134 
4135 			tp->t_mpflags |= TMPF_INFIN_SENT;
4136 			mptcp_notify_mpfail(so);
4137 		} else {
4138 			if (IFNET_IS_CELLULAR(inp->inp_last_outifp) &&
4139 			    mptcp_subflows_need_backup_flag(mpte)) {
4140 				tp->t_mpflags |= (TMPF_BACKUP_PATH | TMPF_SND_MPPRIO);
4141 			} else {
4142 				mpts->mpts_flags |= MPTSF_PREFERRED;
4143 			}
4144 			mpts->mpts_flags |= MPTSF_MPCAP_CTRSET;
4145 			mpte->mpte_nummpcapflows++;
4146 
4147 			if (SOCK_DOM(so) == AF_INET6) {
4148 				mptcp_handle_ipv6_connection(mpte, mpts);
4149 			}
4150 
4151 			mptcp_check_subflows_and_add(mpte);
4152 
4153 			if (IFNET_IS_CELLULAR(inp->inp_last_outifp)) {
4154 				mpte->mpte_initial_cell = 1;
4155 			}
4156 
4157 			mpte->mpte_handshake_success = 1;
4158 		}
4159 
4160 		mp_tp->mpt_sndwnd = tp->snd_wnd;
4161 		mp_tp->mpt_sndwl1 = mp_tp->mpt_rcvnxt;
4162 		mp_tp->mpt_sndwl2 = mp_tp->mpt_snduna;
4163 		soisconnected(mp_so);
4164 	} else if (mpok) {
4165 		/*
4166 		 * case (b) above
4167 		 * In case of additional flows, the MPTCP socket is not
4168 		 * MPTSF_MP_CAPABLE until an ACK is received from server
4169 		 * for 3-way handshake.  TCP would have guaranteed that this
4170 		 * is an MPTCP subflow.
4171 		 */
4172 		if (IFNET_IS_CELLULAR(inp->inp_last_outifp) &&
4173 		    !(tp->t_mpflags & TMPF_BACKUP_PATH) &&
4174 		    mptcp_subflows_need_backup_flag(mpte)) {
4175 			tp->t_mpflags |= (TMPF_BACKUP_PATH | TMPF_SND_MPPRIO);
4176 			mpts->mpts_flags &= ~MPTSF_PREFERRED;
4177 		} else {
4178 			mpts->mpts_flags |= MPTSF_PREFERRED;
4179 		}
4180 
4181 		mpts->mpts_flags |= MPTSF_MPCAP_CTRSET;
4182 		mpte->mpte_nummpcapflows++;
4183 
4184 		mpts->mpts_rel_seq = 1;
4185 
4186 		mptcp_check_subflows_and_remove(mpte);
4187 	} else {
4188 		mptcp_try_alternate_port(mpte, mpts);
4189 
4190 		tcpstat.tcps_join_fallback++;
4191 		if (IFNET_IS_CELLULAR(inp->inp_last_outifp)) {
4192 			tcpstat.tcps_mptcp_cell_proxy++;
4193 		} else {
4194 			tcpstat.tcps_mptcp_wifi_proxy++;
4195 		}
4196 
4197 		soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
4198 
4199 		return MPTS_EVRET_OK;
4200 	}
4201 
4202 	/* This call, just to "book" an entry in the stats-table for this ifindex */
4203 	mptcpstats_get_index(mpte->mpte_itfstats, MPTCP_ITFSTATS_SIZE, mpts);
4204 
4205 	mptcp_output(mpte);
4206 
4207 	return MPTS_EVRET_OK; /* keep the subflow socket around */
4208 }
4209 
/*
 * Handle SO_FILT_HINT_DISCONNECTED subflow socket event.
 *
 * Returns MPTS_EVRET_DELETE so the workloop closes the subflow socket.
 */
static ev_ret_t
mptcp_subflow_disconnected_ev(struct mptses *mpte, struct mptsub *mpts,
    uint32_t *p_mpsofilt_hint, uint32_t event)
{
#pragma unused(event, p_mpsofilt_hint)
	struct socket *mp_so, *so;
	struct mptcb *mp_tp;

	mp_so = mptetoso(mpte);
	mp_tp = mpte->mpte_mptcb;
	so = mpts->mpts_socket;

	/* Already processed; just ask for deletion again */
	if (mpts->mpts_flags & MPTSF_DISCONNECTED) {
		return MPTS_EVRET_DELETE;
	}

	mpts->mpts_flags |= MPTSF_DISCONNECTED;

	/* The subflow connection has been disconnected. */

	if (mpts->mpts_flags & MPTSF_MPCAP_CTRSET) {
		/* Undo the accounting done when the subflow became MP-capable */
		mpte->mpte_nummpcapflows--;
		if (mpte->mpte_active_sub == mpts) {
			mpte->mpte_active_sub = NULL;
		}
		mpts->mpts_flags &= ~MPTSF_MPCAP_CTRSET;
	} else {
		/*
		 * A secondary subflow that never reached connected state:
		 * give the alternate port a chance before giving up on it.
		 */
		if (so->so_flags & SOF_MP_SEC_SUBFLOW &&
		    !(mpts->mpts_flags & MPTSF_CONNECTED)) {
			mptcp_try_alternate_port(mpte, mpts);
		}
	}

	/*
	 * Drop the whole MPTCP connection if it never got established, or
	 * if we fell back to TCP and this was the active (fallback) subflow.
	 */
	if (mp_tp->mpt_state < MPTCPS_ESTABLISHED ||
	    ((mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) && (mpts->mpts_flags & MPTSF_ACTIVE))) {
		mptcp_drop(mpte, mp_tp, so->so_error);
	}

	/*
	 * Clear flags that are used by getconninfo to return state.
	 * Retain like MPTSF_DELETEOK for internal purposes.
	 */
	mpts->mpts_flags &= ~(MPTSF_CONNECTING | MPTSF_CONNECT_PENDING |
	    MPTSF_CONNECTED | MPTSF_DISCONNECTING | MPTSF_PREFERRED |
	    MPTSF_MP_CAPABLE | MPTSF_MP_READY | MPTSF_MP_DEGRADED | MPTSF_ACTIVE);

	return MPTS_EVRET_DELETE;
}
4261 
/*
 * Handle SO_FILT_HINT_MPSTATUS subflow socket event
 *
 * Mirrors the subflow TCP's MPTCP negotiation flags into the mptsub flags
 * and translates them into a workloop verdict: DISCONNECT_FALLBACK when
 * the connection degraded to plain TCP, CONNECT_PENDING when the subflow
 * is ready for additional joins, OK otherwise.
 */
static ev_ret_t
mptcp_subflow_mpstatus_ev(struct mptses *mpte, struct mptsub *mpts,
    uint32_t *p_mpsofilt_hint, uint32_t event)
{
#pragma unused(event, p_mpsofilt_hint)
	ev_ret_t ret = MPTS_EVRET_OK;
	struct socket *mp_so, *so;
	struct mptcb *mp_tp;

	mp_so = mptetoso(mpte);
	mp_tp = mpte->mpte_mptcb;
	so = mpts->mpts_socket;
	struct inpcb *inp = sotoinpcb(so);
	struct tcpcb *tp = intotcpcb(inp);

	/* Reflect TMPF_MPTCP_TRUE into the subflow's capability flag */
	if (sototcpcb(so)->t_mpflags & TMPF_MPTCP_TRUE) {
		mpts->mpts_flags |= MPTSF_MP_CAPABLE;
	} else {
		mpts->mpts_flags &= ~MPTSF_MP_CAPABLE;
	}

	if (sototcpcb(so)->t_mpflags & TMPF_TCP_FALLBACK) {
		/* Degradation is one-shot: skip the fallback work if already done */
		if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
			goto done;
		}
		mpts->mpts_flags |= MPTSF_MP_DEGRADED;
	} else {
		mpts->mpts_flags &= ~MPTSF_MP_DEGRADED;
	}

	if (sototcpcb(so)->t_mpflags & TMPF_MPTCP_READY) {
		mpts->mpts_flags |= MPTSF_MP_READY;
	} else {
		mpts->mpts_flags &= ~MPTSF_MP_READY;
	}

	if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
		/* Subflow fell back: the whole connection falls back to TCP */
		mp_tp->mpt_flags |= MPTCPF_FALLBACK_TO_TCP;
		mp_tp->mpt_flags &= ~MPTCPF_JOIN_READY;
		/* Remember that MPTCP failed here for future version choice */
		tcp_cache_update_mptcp_version(tp, FALSE);
	}

	if (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) {
		ret = MPTS_EVRET_DISCONNECT_FALLBACK;

		/* Reinjected data is meaningless once we are plain TCP */
		m_freem_list(mpte->mpte_reinjectq);
		mpte->mpte_reinjectq = NULL;
	} else if (mpts->mpts_flags & MPTSF_MP_READY) {
		mp_tp->mpt_flags |= MPTCPF_JOIN_READY;
		ret = MPTS_EVRET_CONNECT_PENDING;
	}

done:
	return ret;
}
4320 
/*
 * Handle SO_FILT_HINT_MUSTRST subflow socket event
 *
 * The subflow received an invalid option or an MP_FASTCLOSE and must be
 * reset.  Sends a TH_RST on the subflow (if not already closed); on a
 * FASTCLOSE of a non-fallback connection, aborts all other subflows and
 * drops the whole MPTCP connection.  Returns MPTS_EVRET_DELETE so the
 * workloop closes this subflow socket.
 */
static ev_ret_t
mptcp_subflow_mustrst_ev(struct mptses *mpte, struct mptsub *mpts,
    uint32_t *p_mpsofilt_hint, uint32_t event)
{
#pragma unused(event)
	struct socket *mp_so, *so;
	struct mptcb *mp_tp;
	boolean_t is_fastclose;

	mp_so = mptetoso(mpte);
	mp_tp = mpte->mpte_mptcb;
	so = mpts->mpts_socket;

	/* We got an invalid option or a fast close */
	struct inpcb *inp = sotoinpcb(so);
	struct tcpcb *tp = NULL;

	tp = intotcpcb(inp);
	so->so_error = ECONNABORTED;

	is_fastclose = !!(tp->t_mpflags & TMPF_FASTCLOSERCV);

	tp->t_mpflags |= TMPF_RESET;

	if (tp->t_state != TCPS_CLOSED) {
		/* Build a packet template and send a RST on this subflow */
		mbuf_ref_t m;
		struct tcptemp *t_template = tcp_maketemplate(tp, &m, NULL, NULL);

		if (t_template) {
			struct tcp_respond_args tra;

			bzero(&tra, sizeof(tra));
			/* Scope the RST to the bound interface, if any */
			if (inp->inp_flags & INP_BOUND_IF) {
				tra.ifscope = inp->inp_boundifp->if_index;
			} else {
				tra.ifscope = IFSCOPE_NONE;
			}
			tra.awdl_unrestricted = 1;

			tcp_respond(tp, t_template->tt_ipgen, sizeof(t_template->tt_ipgen),
			    &t_template->tt_t, (struct mbuf *)NULL,
			    tp->rcv_nxt, tp->snd_una, 0, TH_RST, NULL, 0, 0, 0, &tra, false);
			(void) m_free(m);
		}
	}

	/* FASTCLOSE on a still-multipath connection tears everything down */
	if (!(mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) && is_fastclose) {
		struct mptsub *iter, *tmp;

		*p_mpsofilt_hint |= SO_FILT_HINT_CONNRESET;

		mp_so->so_error = ECONNRESET;

		/* Abort every other subflow; this one is aborted below */
		TAILQ_FOREACH_SAFE(iter, &mpte->mpte_subflows, mpts_entry, tmp) {
			if (iter == mpts) {
				continue;
			}
			mptcp_subflow_abort(iter, ECONNABORTED);
		}

		/*
		 * mptcp_drop is being called after processing the events, to fully
		 * close the MPTCP connection
		 */
		mptcp_drop(mpte, mp_tp, mp_so->so_error);
	}

	mptcp_subflow_abort(mpts, ECONNABORTED);

	/* Speed up garbage collection of this connection */
	if (mp_tp->mpt_gc_ticks == MPT_GC_TICKS) {
		mp_tp->mpt_gc_ticks = MPT_GC_TICKS_FAST;
	}

	return MPTS_EVRET_DELETE;
}
4399 
4400 static ev_ret_t
mptcp_subflow_adaptive_rtimo_ev(struct mptses * mpte,struct mptsub * mpts,uint32_t * p_mpsofilt_hint,uint32_t event)4401 mptcp_subflow_adaptive_rtimo_ev(struct mptses *mpte, struct mptsub *mpts,
4402     uint32_t *p_mpsofilt_hint, uint32_t event)
4403 {
4404 #pragma unused(event)
4405 	bool found_active = false;
4406 
4407 	mpts->mpts_flags |= MPTSF_READ_STALL;
4408 
4409 	TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
4410 		struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
4411 
4412 		if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
4413 		    TCPS_HAVERCVDFIN2(tp->t_state)) {
4414 			continue;
4415 		}
4416 
4417 		if (!(mpts->mpts_flags & MPTSF_READ_STALL)) {
4418 			found_active = true;
4419 			break;
4420 		}
4421 	}
4422 
4423 	if (!found_active) {
4424 		*p_mpsofilt_hint |= SO_FILT_HINT_ADAPTIVE_RTIMO;
4425 	}
4426 
4427 	return MPTS_EVRET_OK;
4428 }
4429 
4430 static ev_ret_t
mptcp_subflow_adaptive_wtimo_ev(struct mptses * mpte,struct mptsub * mpts,uint32_t * p_mpsofilt_hint,uint32_t event)4431 mptcp_subflow_adaptive_wtimo_ev(struct mptses *mpte, struct mptsub *mpts,
4432     uint32_t *p_mpsofilt_hint, uint32_t event)
4433 {
4434 #pragma unused(event)
4435 	bool found_active = false;
4436 
4437 	mpts->mpts_flags |= MPTSF_WRITE_STALL;
4438 
4439 	TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
4440 		struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
4441 
4442 		if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
4443 		    tp->t_state > TCPS_CLOSE_WAIT) {
4444 			continue;
4445 		}
4446 
4447 		if (!(mpts->mpts_flags & MPTSF_WRITE_STALL)) {
4448 			found_active = true;
4449 			break;
4450 		}
4451 	}
4452 
4453 	if (!found_active) {
4454 		*p_mpsofilt_hint |= SO_FILT_HINT_ADAPTIVE_WTIMO;
4455 	}
4456 
4457 	return MPTS_EVRET_OK;
4458 }
4459 
/*
 * Issues SOPT_SET on an MPTCP subflow socket; socket must already be locked,
 * caller must ensure that the option can be issued on subflow sockets, via
 * MPOF_SUBFLOW_OK flag.
 *
 * Returns 0 on success (or when the option is deliberately skipped, e.g.
 * address-family mismatch or non-applicable cell-fallback marking), else
 * the error from sosetoptlock().
 */
int
mptcp_subflow_sosetopt(struct mptses *mpte, struct mptsub *mpts, struct mptopt *mpo)
{
	struct socket *mp_so, *so;
	struct sockopt sopt;
	int error;

	VERIFY(mpo->mpo_flags & MPOF_SUBFLOW_OK);

	mp_so = mptetoso(mpte);
	so = mpts->mpts_socket;

	/* Don't try to apply an IP or IPv6 option on an IPv6 or IP socket */
	if (mpo->mpo_level == IPPROTO_IP && SOCK_CHECK_DOM(so, PF_INET6)) {
		return 0;
	}
	if (mpo->mpo_level == IPPROTO_IPV6 && SOCK_CHECK_DOM(so, PF_INET)) {
		return 0;
	}

	socket_lock_assert_owned(mp_so);

	/*
	 * SO_MARK_CELLFALLBACK on an already-established connection only
	 * applies when this new subflow really is a cell fallback.
	 */
	if (mpte->mpte_mptcb->mpt_state >= MPTCPS_ESTABLISHED &&
	    mpo->mpo_level == SOL_SOCKET &&
	    mpo->mpo_name == SO_MARK_CELLFALLBACK) {
		struct ifnet *ifp = ifindex2ifnet[mpts->mpts_ifscope];

		/*
		 * When we open a new subflow, mark it as cell fallback, if
		 * this subflow goes over cell.
		 *
		 * (except for first-party apps)
		 */

		if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
			return 0;
		}

		/* Not routed over cellular: not a cell fallback */
		if (sotoinpcb(so)->inp_last_outifp &&
		    !IFNET_IS_CELLULAR(sotoinpcb(so)->inp_last_outifp)) {
			return 0;
		}

		/*
		 * This here is an OR, because if the app is not binding to the
		 * interface, then it definitely is not a cell-fallback
		 * connection.
		 */
		if (mpts->mpts_ifscope == IFSCOPE_NONE || ifp == NULL ||
		    !IFNET_IS_CELLULAR(ifp)) {
			return 0;
		}
	}

	/* The option is being applied now; it is no longer pending */
	mpo->mpo_flags &= ~MPOF_INTERIM;

	bzero(&sopt, sizeof(sopt));
	sopt.sopt_dir = SOPT_SET;
	sopt.sopt_level = mpo->mpo_level;
	sopt.sopt_name = mpo->mpo_name;
	sopt.sopt_val = CAST_USER_ADDR_T(&mpo->mpo_intval);
	sopt.sopt_valsize = sizeof(int);
	sopt.sopt_p = kernproc;

	error = sosetoptlock(so, &sopt, 0);
	if (error) {
		os_log_error(mptcp_log_handle, "%s - %lx: sopt %s "
		    "val %d set error %d\n", __func__,
		    (unsigned long)VM_KERNEL_ADDRPERM(mpte),
		    mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name),
		    mpo->mpo_intval, error);
	}
	return error;
}
4539 
4540 /*
4541  * Issues SOPT_GET on an MPTCP subflow socket; socket must already be locked,
4542  * caller must ensure that the option can be issued on subflow sockets, via
4543  * MPOF_SUBFLOW_OK flag.
4544  */
4545 int
mptcp_subflow_sogetopt(struct mptses * mpte,struct socket * so,struct mptopt * mpo)4546 mptcp_subflow_sogetopt(struct mptses *mpte, struct socket *so,
4547     struct mptopt *mpo)
4548 {
4549 	struct socket *mp_so;
4550 	struct sockopt sopt;
4551 	int error;
4552 
4553 	VERIFY(mpo->mpo_flags & MPOF_SUBFLOW_OK);
4554 	mp_so = mptetoso(mpte);
4555 
4556 	socket_lock_assert_owned(mp_so);
4557 
4558 	bzero(&sopt, sizeof(sopt));
4559 	sopt.sopt_dir = SOPT_GET;
4560 	sopt.sopt_level = mpo->mpo_level;
4561 	sopt.sopt_name = mpo->mpo_name;
4562 	sopt.sopt_val = CAST_USER_ADDR_T(&mpo->mpo_intval);
4563 	sopt.sopt_valsize = sizeof(int);
4564 	sopt.sopt_p = kernproc;
4565 
4566 	error = sogetoptlock(so, &sopt, 0);     /* already locked */
4567 	if (error) {
4568 		os_log_error(mptcp_log_handle,
4569 		    "%s - %lx: sopt %s get error %d\n",
4570 		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
4571 		    mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name), error);
4572 	}
4573 	return error;
4574 }
4575 
4576 
/*
 * MPTCP garbage collector.
 *
 * This routine is called by the MP domain on-demand, periodic callout,
 * which is triggered when a MPTCP socket is closed.  The callout will
 * repeat as long as this routine returns a non-zero value.
 *
 * Walks all MPTCP PCBs; frees those that are dead and unreferenced,
 * counts the rest as still "active" so the callout reschedules.
 * Caller must hold mppi_lock.
 */
static uint32_t
mptcp_gc(struct mppcbinfo *mppi)
{
	struct mppcb *mpp, *tmpp;
	uint32_t active = 0;

	LCK_MTX_ASSERT(&mppi->mppi_lock, LCK_MTX_ASSERT_OWNED);

	TAILQ_FOREACH_SAFE(mpp, &mppi->mppi_pcbs, mpp_entry, tmpp) {
		struct socket *mp_so;
		struct mptses *mpte;
		struct mptcb *mp_tp;

		mp_so = mpp->mpp_socket;
		mpte = mptompte(mpp);
		mp_tp = mpte->mpte_mptcb;

		/* Socket busy elsewhere: retry on the next callout */
		if (!mpp_try_lock(mpp)) {
			active++;
			continue;
		}

		VERIFY(mpp->mpp_flags & MPP_ATTACHED);

		/* check again under the lock */
		if (mp_so->so_usecount > 0) {
			boolean_t wakeup = FALSE;
			struct mptsub *mpts, *tmpts;

			/*
			 * Closing connection: count down the grace period,
			 * and once expired nudge the subflows with a
			 * DISCONNECTED event so references get released.
			 */
			if (mp_tp->mpt_state >= MPTCPS_FIN_WAIT_1) {
				if (mp_tp->mpt_gc_ticks > 0) {
					mp_tp->mpt_gc_ticks--;
				}
				if (mp_tp->mpt_gc_ticks == 0) {
					wakeup = TRUE;
				}
			}
			if (wakeup) {
				TAILQ_FOREACH_SAFE(mpts,
				    &mpte->mpte_subflows, mpts_entry, tmpts) {
					mptcp_subflow_eupcall1(mpts->mpts_socket,
					    mpts, SO_FILT_HINT_DISCONNECTED);
				}
			}
			socket_unlock(mp_so, 0);
			active++;
			continue;
		}

		/* usecount == 0 but PCB not marked dead is a logic error */
		if (mpp->mpp_state != MPPCB_STATE_DEAD) {
			panic("%s - %lx: skipped state "
			    "[u=%d,r=%d,s=%d]\n", __func__,
			    (unsigned long)VM_KERNEL_ADDRPERM(mpte),
			    mp_so->so_usecount, mp_so->so_retaincnt,
			    mpp->mpp_state);
		}

		if (mp_tp->mpt_state == MPTCPS_TIME_WAIT) {
			mptcp_close(mpte, mp_tp);
		}

		/* Free the session, the PCB and finally the socket itself */
		mptcp_session_destroy(mpte);

		DTRACE_MPTCP4(dispose, struct socket *, mp_so,
		    struct sockbuf *, &mp_so->so_rcv,
		    struct sockbuf *, &mp_so->so_snd,
		    struct mppcb *, mpp);

		mptcp_pcbdispose(mpp);
		sodealloc(mp_so);
	}

	return active;
}
4658 
/*
 * Drop a MPTCP connection, reporting the specified error.
 *
 * Sets so_error on the MPTCP socket (substituting the recorded soft error
 * for ETIMEDOUT, mirroring TCP's tcp_drop behavior) and closes the
 * connection.  Returns the result of mptcp_close() (NULL).
 */
struct mptses *
mptcp_drop(struct mptses *mpte, struct mptcb *mp_tp, u_short errno)
{
	struct socket *mp_so = mptetoso(mpte);

	VERIFY(mpte->mpte_mptcb == mp_tp);

	socket_lock_assert_owned(mp_so);

	DTRACE_MPTCP2(state__change, struct mptcb *, mp_tp,
	    uint32_t, 0 /* event */);

	/* Prefer a previously recorded soft error over a bare timeout */
	if (errno == ETIMEDOUT && mp_tp->mpt_softerror != 0) {
		errno = mp_tp->mpt_softerror;
	}
	mp_so->so_error = errno;

	return mptcp_close(mpte, mp_tp);
}
4681 
/*
 * Close a MPTCP control block.
 *
 * Moves the connection to MPTCPS_TERMINATE, frees queued segments, marks
 * the MPTCP socket disconnected and initiates disconnect on every subflow.
 * Always returns NULL (mirroring tcp_close()'s convention for callers that
 * reassign their pointer).
 */
struct mptses *
mptcp_close(struct mptses *mpte, struct mptcb *mp_tp)
{
	struct mptsub *mpts = NULL, *tmpts = NULL;
	struct socket *mp_so = mptetoso(mpte);

	socket_lock_assert_owned(mp_so);
	VERIFY(mpte->mpte_mptcb == mp_tp);

	mp_tp->mpt_state = MPTCPS_TERMINATE;

	/* Release the MPTCP-level reassembly/segment queue */
	mptcp_freeq(mp_tp);

	soisdisconnected(mp_so);

	/* Clean up all subflows */
	TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
		mptcp_subflow_disconnect(mpte, mpts);
	}

	return NULL;
}
4707 
/*
 * Post a DISCONNECTED event on the given (subflow) socket so the event
 * machinery tears it down.
 */
void
mptcp_notify_close(struct socket *so)
{
	soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_DISCONNECTED));
}
4713 
/*
 * Entry of the subflow event dispatch table: maps a socket filter hint
 * bit to its handler.  The handler may set bits in *p_mpsofilt_hint to
 * propagate events to the MPTCP socket, and returns an ev_ret_t verdict
 * consumed by the workloop.
 */
typedef struct mptcp_subflow_event_entry {
	uint32_t    sofilt_hint_mask;   /* SO_FILT_HINT_* bit handled by this entry */
	ev_ret_t    (*sofilt_hint_ev_hdlr)(
		struct mptses *mpte,
		struct mptsub *mpts,
		uint32_t *p_mpsofilt_hint,
		uint32_t event);
} mptsub_ev_entry_t;
4722 
/*
 * XXX The order of the event handlers below is really
 * really important. Think twice before changing it.
 *
 * mptcp_subflow_events() invokes the handlers in table order, so earlier
 * entries (errors, failover, resets) run before e.g. CONNECTED and
 * DISCONNECTED processing.
 */
static mptsub_ev_entry_t mpsub_ev_entry_tbl[] = {
	{
		.sofilt_hint_mask = SO_FILT_HINT_MP_SUB_ERROR,
		.sofilt_hint_ev_hdlr = mptcp_subflow_mpsuberror_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_MPCANTRCVMORE,
		.sofilt_hint_ev_hdlr =  mptcp_subflow_mpcantrcvmore_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_MPFAILOVER,
		.sofilt_hint_ev_hdlr = mptcp_subflow_failover_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_CONNRESET,
		.sofilt_hint_ev_hdlr = mptcp_subflow_propagate_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_MUSTRST,
		.sofilt_hint_ev_hdlr = mptcp_subflow_mustrst_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_CANTRCVMORE,
		.sofilt_hint_ev_hdlr = mptcp_subflow_propagate_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_TIMEOUT,
		.sofilt_hint_ev_hdlr = mptcp_subflow_propagate_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_NOSRCADDR,
		.sofilt_hint_ev_hdlr = mptcp_subflow_nosrcaddr_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_IFDENIED,
		.sofilt_hint_ev_hdlr = mptcp_subflow_ifdenied_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_CONNECTED,
		.sofilt_hint_ev_hdlr = mptcp_subflow_connected_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_MPSTATUS,
		.sofilt_hint_ev_hdlr = mptcp_subflow_mpstatus_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_DISCONNECTED,
		.sofilt_hint_ev_hdlr = mptcp_subflow_disconnected_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_ADAPTIVE_RTIMO,
		.sofilt_hint_ev_hdlr = mptcp_subflow_adaptive_rtimo_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_ADAPTIVE_WTIMO,
		.sofilt_hint_ev_hdlr = mptcp_subflow_adaptive_wtimo_ev,
	},
};
4785 
/*
 * Subflow socket control events.
 *
 * Called for handling events related to the underlying subflow socket.
 *
 * Dispatches each pending hint bit in mpts->mpts_evctl through
 * mpsub_ev_entry_tbl (in table order), clearing bits as they are handled.
 * Returns the most severe handler verdict: negative verdicts short-circuit
 * further dispatch except for DISCONNECTED, which always runs.
 */
static ev_ret_t
mptcp_subflow_events(struct mptses *mpte, struct mptsub *mpts,
    uint32_t *p_mpsofilt_hint)
{
	ev_ret_t ret = MPTS_EVRET_OK;
	int i, mpsub_ev_entry_count = sizeof(mpsub_ev_entry_tbl) /
	    sizeof(mpsub_ev_entry_tbl[0]);

	/* bail if there's nothing to process */
	if (!mpts->mpts_evctl) {
		return ret;
	}

	/* Any terminal/error event also triggers failover evaluation */
	if (mpts->mpts_evctl & (SO_FILT_HINT_CONNRESET | SO_FILT_HINT_MUSTRST |
	    SO_FILT_HINT_CANTSENDMORE | SO_FILT_HINT_TIMEOUT |
	    SO_FILT_HINT_NOSRCADDR | SO_FILT_HINT_IFDENIED |
	    SO_FILT_HINT_DISCONNECTED)) {
		mpts->mpts_evctl |= SO_FILT_HINT_MPFAILOVER;
	}

	DTRACE_MPTCP3(subflow__events, struct mptses *, mpte,
	    struct mptsub *, mpts, uint32_t, mpts->mpts_evctl);

	/*
	 * Process all the socket filter hints and reset the hint
	 * once it is handled
	 */
	for (i = 0; i < mpsub_ev_entry_count && mpts->mpts_evctl; i++) {
		/*
		 * Always execute the DISCONNECTED event, because it will wakeup
		 * the app.
		 */
		if ((mpts->mpts_evctl & mpsub_ev_entry_tbl[i].sofilt_hint_mask) &&
		    (ret >= MPTS_EVRET_OK ||
		    mpsub_ev_entry_tbl[i].sofilt_hint_mask == SO_FILT_HINT_DISCONNECTED)) {
			mpts->mpts_evctl &= ~mpsub_ev_entry_tbl[i].sofilt_hint_mask;
			ev_ret_t error =
			    mpsub_ev_entry_tbl[i].sofilt_hint_ev_hdlr(mpte, mpts, p_mpsofilt_hint, mpsub_ev_entry_tbl[i].sofilt_hint_mask);
			/* Keep the strongest verdict seen so far */
			ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
		}
	}

	return ret;
}
4835 
/*
 * MPTCP workloop.
 *
 * Drains pending events on every subflow, acts on the per-subflow verdicts
 * (delete / connect-pending / disconnect-fallback), and propagates
 * accumulated hints to the MPTCP socket.  Re-entrant calls while the loop
 * is running only set MPTE_WORKLOOP_RELAUNCH; the running instance
 * relaunches itself before returning.  Called with the MPTCP socket locked.
 */
void
mptcp_subflow_workloop(struct mptses *mpte)
{
	boolean_t connect_pending = FALSE, disconnect_fallback = FALSE;
	uint32_t mpsofilt_hint_mask = SO_FILT_HINT_LOCKED;
	struct mptsub *mpts, *tmpts;
	struct socket *mp_so;

	mp_so = mptetoso(mpte);

	socket_lock_assert_owned(mp_so);

	/* Already running: ask the running instance to go around once more */
	if (mpte->mpte_flags & MPTE_IN_WORKLOOP) {
		mpte->mpte_flags |= MPTE_WORKLOOP_RELAUNCH;
		return;
	}
	mpte->mpte_flags |= MPTE_IN_WORKLOOP;

relaunch:
	mpte->mpte_flags &= ~MPTE_WORKLOOP_RELAUNCH;

	TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
		ev_ret_t ret;

		if (mpts->mpts_socket->so_usecount == 0) {
			/* Will be removed soon by tcp_garbage_collect */
			continue;
		}

		/* Hold the subflow and its socket across event processing */
		mptcp_subflow_addref(mpts);
		mpts->mpts_socket->so_usecount++;

		ret = mptcp_subflow_events(mpte, mpts, &mpsofilt_hint_mask);

		/*
		 * If MPTCP socket is closed, disconnect all subflows.
		 * This will generate a disconnect event which will
		 * be handled during the next iteration, causing a
		 * non-zero error to be returned above.
		 */
		if (mp_so->so_flags & SOF_PCBCLEARING) {
			mptcp_subflow_disconnect(mpte, mpts);
		}

		switch (ret) {
		case MPTS_EVRET_OK:
			/* nothing to do */
			break;
		case MPTS_EVRET_DELETE:
			mptcp_subflow_soclose(mpts);
			break;
		case MPTS_EVRET_CONNECT_PENDING:
			connect_pending = TRUE;
			break;
		case MPTS_EVRET_DISCONNECT_FALLBACK:
			disconnect_fallback = TRUE;
			break;
		default:
			break;
		}
		mptcp_subflow_remref(mpts);             /* ours */

		VERIFY(mpts->mpts_socket->so_usecount != 0);
		mpts->mpts_socket->so_usecount--;
	}

	/* Forward any hints the handlers accumulated to the MPTCP socket */
	if (mpsofilt_hint_mask != SO_FILT_HINT_LOCKED) {
		VERIFY(mpsofilt_hint_mask & SO_FILT_HINT_LOCKED);

		if (mpsofilt_hint_mask & SO_FILT_HINT_CANTRCVMORE) {
			mp_so->so_state |= SS_CANTRCVMORE;
			sorwakeup(mp_so);
		}

		soevent(mp_so, mpsofilt_hint_mask);
	}

	if (!connect_pending && !disconnect_fallback) {
		goto exit;
	}

	/* Second pass: act on fallback/join-pending verdicts */
	TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
		if (disconnect_fallback) {
			struct socket *so = NULL;
			struct inpcb *inp = NULL;
			struct tcpcb *tp = NULL;

			if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
				continue;
			}

			mpts->mpts_flags |= MPTSF_MP_DEGRADED;

			if (mpts->mpts_flags & (MPTSF_DISCONNECTING |
			    MPTSF_DISCONNECTED)) {
				continue;
			}

			so = mpts->mpts_socket;

			/*
			 * The MPTCP connection has degraded to a fallback
			 * mode, so there is no point in keeping this subflow
			 * regardless of its MPTCP-readiness state, unless it
			 * is the primary one which we use for fallback.  This
			 * assumes that the subflow used for fallback is the
			 * ACTIVE one.
			 */

			inp = sotoinpcb(so);
			tp = intotcpcb(inp);
			tp->t_mpflags &=
			    ~(TMPF_MPTCP_READY | TMPF_MPTCP_TRUE);
			tp->t_mpflags |= TMPF_TCP_FALLBACK;

			soevent(so, SO_FILT_HINT_MUSTRST);
		} else if (connect_pending) {
			/*
			 * The MPTCP connection has progressed to a state
			 * where it supports full multipath semantics; allow
			 * additional joins to be attempted for all subflows
			 * that are in the PENDING state.
			 */
			if (mpts->mpts_flags & MPTSF_CONNECT_PENDING) {
				int error = mptcp_subflow_soconnectx(mpte, mpts);

				if (error) {
					mptcp_subflow_abort(mpts, error);
				}
			}
		}
	}

exit:
	/* Someone asked for another round while we were running */
	if (mpte->mpte_flags & MPTE_WORKLOOP_RELAUNCH) {
		goto relaunch;
	}

	mpte->mpte_flags &= ~MPTE_IN_WORKLOOP;
}
4979 
4980 /*
4981  * Protocol pr_lock callback.
4982  */
4983 int
mptcp_lock(struct socket * mp_so,int refcount,void * lr)4984 mptcp_lock(struct socket *mp_so, int refcount, void *lr)
4985 {
4986 	struct mppcb *mpp = mpsotomppcb(mp_so);
4987 	lr_ref_t lr_saved = TCP_INIT_LR_SAVED(lr);
4988 
4989 	if (mpp == NULL) {
4990 		panic("%s: so=%p NO PCB! lr=%p lrh= %s", __func__,
4991 		    mp_so, lr_saved, solockhistory_nr(mp_so));
4992 		/* NOTREACHED */
4993 	}
4994 	mpp_lock(mpp);
4995 
4996 	if (mp_so->so_usecount < 0) {
4997 		panic("%s: so=%p so_pcb=%p lr=%p ref=%x lrh= %s", __func__,
4998 		    mp_so, mp_so->so_pcb, lr_saved, mp_so->so_usecount,
4999 		    solockhistory_nr(mp_so));
5000 		/* NOTREACHED */
5001 	}
5002 	if (refcount != 0) {
5003 		mp_so->so_usecount++;
5004 		mpp->mpp_inside++;
5005 	}
5006 	mp_so->lock_lr[mp_so->next_lock_lr] = lr_saved;
5007 	mp_so->next_lock_lr = (mp_so->next_lock_lr + 1) % SO_LCKDBG_MAX;
5008 
5009 	return 0;
5010 }
5011 
5012 /*
5013  * Protocol pr_unlock callback.
5014  */
5015 int
mptcp_unlock(struct socket * mp_so,int refcount,void * lr)5016 mptcp_unlock(struct socket *mp_so, int refcount, void *lr)
5017 {
5018 	struct mppcb *mpp = mpsotomppcb(mp_so);
5019 	lr_ref_t lr_saved = TCP_INIT_LR_SAVED(lr);
5020 
5021 	if (mpp == NULL) {
5022 		panic("%s: so=%p NO PCB usecount=%x lr=%p lrh= %s", __func__,
5023 		    mp_so, mp_so->so_usecount, lr_saved,
5024 		    solockhistory_nr(mp_so));
5025 		/* NOTREACHED */
5026 	}
5027 	socket_lock_assert_owned(mp_so);
5028 
5029 	if (refcount != 0) {
5030 		mp_so->so_usecount--;
5031 		mpp->mpp_inside--;
5032 	}
5033 
5034 	if (mp_so->so_usecount < 0) {
5035 		panic("%s: so=%p usecount=%x lrh= %s", __func__,
5036 		    mp_so, mp_so->so_usecount, solockhistory_nr(mp_so));
5037 		/* NOTREACHED */
5038 	}
5039 	if (mpp->mpp_inside < 0) {
5040 		panic("%s: mpp=%p inside=%x lrh= %s", __func__,
5041 		    mpp, mpp->mpp_inside, solockhistory_nr(mp_so));
5042 		/* NOTREACHED */
5043 	}
5044 	mp_so->unlock_lr[mp_so->next_unlock_lr] = lr_saved;
5045 	mp_so->next_unlock_lr = (mp_so->next_unlock_lr + 1) % SO_LCKDBG_MAX;
5046 	mpp_unlock(mpp);
5047 
5048 	return 0;
5049 }
5050 
5051 /*
5052  * Protocol pr_getlock callback.
5053  */
5054 lck_mtx_t *
mptcp_getlock(struct socket * mp_so,int flags)5055 mptcp_getlock(struct socket *mp_so, int flags)
5056 {
5057 	struct mppcb *mpp = mpsotomppcb(mp_so);
5058 
5059 	if (mpp == NULL) {
5060 		panic("%s: so=%p NULL so_pcb %s", __func__, mp_so,
5061 		    solockhistory_nr(mp_so));
5062 		/* NOTREACHED */
5063 	}
5064 	if (mp_so->so_usecount < 0) {
5065 		panic("%s: so=%p usecount=%x lrh= %s", __func__,
5066 		    mp_so, mp_so->so_usecount, solockhistory_nr(mp_so));
5067 		/* NOTREACHED */
5068 	}
5069 	return mpp_getlock(mpp, flags);
5070 }
5071 
5072 void
mptcp_get_rands(mptcp_addr_id addr_id,struct mptcb * mp_tp,u_int32_t * lrand,u_int32_t * rrand)5073 mptcp_get_rands(mptcp_addr_id addr_id, struct mptcb *mp_tp, u_int32_t *lrand,
5074     u_int32_t *rrand)
5075 {
5076 	struct mptcp_subf_auth_entry *sauth_entry;
5077 
5078 	LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
5079 		if (sauth_entry->msae_laddr_id == addr_id) {
5080 			if (lrand) {
5081 				*lrand = sauth_entry->msae_laddr_rand;
5082 			}
5083 			if (rrand) {
5084 				*rrand = sauth_entry->msae_raddr_rand;
5085 			}
5086 			break;
5087 		}
5088 	}
5089 }
5090 
5091 void
mptcp_set_raddr_rand(mptcp_addr_id laddr_id,struct mptcb * mp_tp,mptcp_addr_id raddr_id,u_int32_t raddr_rand)5092 mptcp_set_raddr_rand(mptcp_addr_id laddr_id, struct mptcb *mp_tp,
5093     mptcp_addr_id raddr_id, u_int32_t raddr_rand)
5094 {
5095 	struct mptcp_subf_auth_entry *sauth_entry;
5096 
5097 	LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
5098 		if (sauth_entry->msae_laddr_id == laddr_id) {
5099 			if ((sauth_entry->msae_raddr_id != 0) &&
5100 			    (sauth_entry->msae_raddr_id != raddr_id)) {
5101 				os_log_error(mptcp_log_handle, "%s - %lx: mismatched"
5102 				    " address ids %d %d \n", __func__, (unsigned long)VM_KERNEL_ADDRPERM(mp_tp->mpt_mpte),
5103 				    raddr_id, sauth_entry->msae_raddr_id);
5104 				return;
5105 			}
5106 			sauth_entry->msae_raddr_id = raddr_id;
5107 			if ((sauth_entry->msae_raddr_rand != 0) &&
5108 			    (sauth_entry->msae_raddr_rand != raddr_rand)) {
5109 				os_log_error(mptcp_log_handle, "%s - %lx: "
5110 				    "dup SYN_ACK %d %d \n",
5111 				    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mp_tp->mpt_mpte),
5112 				    raddr_rand, sauth_entry->msae_raddr_rand);
5113 				return;
5114 			}
5115 			sauth_entry->msae_raddr_rand = raddr_rand;
5116 			return;
5117 		}
5118 	}
5119 }
5120 
5121 /*
5122  * SHA-256 support for MPTCP
5123  */
5124 
5125 static void
mptcp_do_sha256(mptcp_key_t * key,char sha_digest[SHA256_DIGEST_LENGTH])5126 mptcp_do_sha256(mptcp_key_t *key, char sha_digest[SHA256_DIGEST_LENGTH])
5127 {
5128 	const unsigned char *sha2_base;
5129 	int sha2_size;
5130 
5131 	sha2_base = (const unsigned char *) key;
5132 	sha2_size = sizeof(mptcp_key_t);
5133 
5134 	SHA256_CTX sha_ctx;
5135 	SHA256_Init(&sha_ctx);
5136 	SHA256_Update(&sha_ctx, sha2_base, sha2_size);
5137 	SHA256_Final(sha_digest, &sha_ctx);
5138 }
5139 
5140 void
mptcp_hmac_sha256(mptcp_key_t key1,mptcp_key_t key2,u_char * msg __sized_by (msg_len),uint16_t msg_len,u_char digest[SHA256_DIGEST_LENGTH])5141 mptcp_hmac_sha256(mptcp_key_t key1, mptcp_key_t key2,
5142     u_char *msg __sized_by(msg_len), uint16_t msg_len, u_char digest[SHA256_DIGEST_LENGTH])
5143 {
5144 	SHA256_CTX sha_ctx;
5145 	mptcp_key_t key_ipad[8] = {0}; /* key XOR'd with inner pad */
5146 	mptcp_key_t key_opad[8] = {0}; /* key XOR'd with outer pad */
5147 	int i;
5148 
5149 	bzero(digest, SHA256_DIGEST_LENGTH);
5150 
5151 	/* Set up the Key for HMAC */
5152 	key_ipad[0] = key1;
5153 	key_ipad[1] = key2;
5154 
5155 	key_opad[0] = key1;
5156 	key_opad[1] = key2;
5157 
5158 	/* Key is 512 block length, so no need to compute hash */
5159 
5160 	/* Compute SHA1(Key XOR opad, SHA1(Key XOR ipad, data)) */
5161 
5162 	for (i = 0; i < 8; i++) {
5163 		key_ipad[i] ^= 0x3636363636363636;
5164 		key_opad[i] ^= 0x5c5c5c5c5c5c5c5c;
5165 	}
5166 
5167 	/* Perform inner SHA256 */
5168 	SHA256_Init(&sha_ctx);
5169 	SHA256_Update(&sha_ctx, (unsigned char *)key_ipad, sizeof(key_ipad));
5170 	SHA256_Update(&sha_ctx, msg, msg_len);
5171 	SHA256_Final(digest, &sha_ctx);
5172 
5173 	/* Perform outer SHA256 */
5174 	SHA256_Init(&sha_ctx);
5175 	SHA256_Update(&sha_ctx, (unsigned char *)key_opad, sizeof(key_opad));
5176 	SHA256_Update(&sha_ctx, (unsigned char *)digest, SHA256_DIGEST_LENGTH);
5177 	SHA256_Final(digest, &sha_ctx);
5178 }
5179 
5180 /*
5181  * SHA1 support for MPTCP
5182  */
5183 
5184 static void
mptcp_do_sha1(mptcp_key_t * key,char sha_digest[SHA1_RESULTLEN])5185 mptcp_do_sha1(mptcp_key_t *key, char sha_digest[SHA1_RESULTLEN])
5186 {
5187 	SHA1_CTX sha1ctxt;
5188 	const unsigned char *sha1_base;
5189 	int sha1_size;
5190 
5191 	sha1_base = (const unsigned char *) key;
5192 	sha1_size = sizeof(mptcp_key_t);
5193 	SHA1Init(&sha1ctxt);
5194 	SHA1Update(&sha1ctxt, sha1_base, sha1_size);
5195 	SHA1Final(sha_digest, &sha1ctxt);
5196 }
5197 
5198 void
mptcp_hmac_sha1(mptcp_key_t key1,mptcp_key_t key2,u_int32_t rand1,u_int32_t rand2,u_char digest[SHA1_RESULTLEN])5199 mptcp_hmac_sha1(mptcp_key_t key1, mptcp_key_t key2,
5200     u_int32_t rand1, u_int32_t rand2, u_char digest[SHA1_RESULTLEN])
5201 {
5202 	SHA1_CTX  sha1ctxt;
5203 	mptcp_key_t key_ipad[8] = {0}; /* key XOR'd with inner pad */
5204 	mptcp_key_t key_opad[8] = {0}; /* key XOR'd with outer pad */
5205 	u_int32_t data[2];
5206 	int i;
5207 
5208 	bzero(digest, SHA1_RESULTLEN);
5209 
5210 	/* Set up the Key for HMAC */
5211 	key_ipad[0] = key1;
5212 	key_ipad[1] = key2;
5213 
5214 	key_opad[0] = key1;
5215 	key_opad[1] = key2;
5216 
5217 	/* Set up the message for HMAC */
5218 	data[0] = rand1;
5219 	data[1] = rand2;
5220 
5221 	/* Key is 512 block length, so no need to compute hash */
5222 
5223 	/* Compute SHA1(Key XOR opad, SHA1(Key XOR ipad, data)) */
5224 
5225 	for (i = 0; i < 8; i++) {
5226 		key_ipad[i] ^= 0x3636363636363636;
5227 		key_opad[i] ^= 0x5c5c5c5c5c5c5c5c;
5228 	}
5229 
5230 	/* Perform inner SHA1 */
5231 	SHA1Init(&sha1ctxt);
5232 	SHA1Update(&sha1ctxt, (unsigned char *)key_ipad, sizeof(key_ipad));
5233 	SHA1Update(&sha1ctxt, (unsigned char *)data, sizeof(data));
5234 	SHA1Final(digest, &sha1ctxt);
5235 
5236 	/* Perform outer SHA1 */
5237 	SHA1Init(&sha1ctxt);
5238 	SHA1Update(&sha1ctxt, (unsigned char *)key_opad, sizeof(key_opad));
5239 	SHA1Update(&sha1ctxt, (unsigned char *)digest, SHA1_RESULTLEN);
5240 	SHA1Final(digest, &sha1ctxt);
5241 }
5242 
5243 /*
5244  * corresponds to MAC-B = MAC (Key=(Key-B+Key-A), Msg=(R-B+R-A))
5245  * corresponds to MAC-A = MAC (Key=(Key-A+Key-B), Msg=(R-A+R-B))
5246  */
5247 void
mptcp_get_mpjoin_hmac(mptcp_addr_id aid,struct mptcb * mp_tp,u_char * digest __sized_by (digest_len),uint8_t digest_len)5248 mptcp_get_mpjoin_hmac(mptcp_addr_id aid, struct mptcb *mp_tp, u_char *digest __sized_by(digest_len), uint8_t digest_len)
5249 {
5250 	uint32_t lrand, rrand;
5251 
5252 	lrand = rrand = 0;
5253 	mptcp_get_rands(aid, mp_tp, &lrand, &rrand);
5254 
5255 	u_char full_digest[MAX(SHA1_RESULTLEN, SHA256_DIGEST_LENGTH)] = {0};
5256 	if (mp_tp->mpt_version == MPTCP_VERSION_0) {
5257 		mptcp_hmac_sha1(mp_tp->mpt_localkey, mp_tp->mpt_remotekey, lrand, rrand, full_digest);
5258 	} else {
5259 		uint32_t data[2];
5260 		data[0] = lrand;
5261 		data[1] = rrand;
5262 		mptcp_hmac_sha256(mp_tp->mpt_localkey, mp_tp->mpt_remotekey, (u_char*)data, 8, full_digest);
5263 	}
5264 	bcopy(full_digest, digest, digest_len);
5265 }
5266 
5267 /*
5268  * Authentication data generation
5269  */
5270 static void
mptcp_generate_token(char * sha_digest __sized_by (sha_digest_len),int sha_digest_len,caddr_t token __sized_by (token_len),int token_len)5271 mptcp_generate_token(char *sha_digest __sized_by(sha_digest_len), int sha_digest_len, caddr_t token __sized_by(token_len),
5272     int token_len)
5273 {
5274 	VERIFY(token_len == sizeof(u_int32_t));
5275 	VERIFY(sha_digest_len == SHA1_RESULTLEN ||
5276 	    sha_digest_len == SHA256_DIGEST_LENGTH);
5277 
5278 	/* Most significant 32 bits of the SHA1/SHA256 hash */
5279 	bcopy(sha_digest, token, sizeof(u_int32_t));
5280 	return;
5281 }
5282 
5283 static void
mptcp_generate_idsn(char * sha_digest __sized_by (sha_digest_len),int sha_digest_len,caddr_t idsn __sized_by (idsn_len),int idsn_len,uint8_t mp_version)5284 mptcp_generate_idsn(char *sha_digest __sized_by(sha_digest_len), int sha_digest_len, caddr_t idsn __sized_by(idsn_len),
5285     int idsn_len, uint8_t mp_version)
5286 {
5287 	VERIFY(idsn_len == sizeof(u_int64_t));
5288 	VERIFY(sha_digest_len == SHA1_RESULTLEN ||
5289 	    sha_digest_len == SHA256_DIGEST_LENGTH);
5290 	VERIFY(mp_version == MPTCP_VERSION_0 || mp_version == MPTCP_VERSION_1);
5291 
5292 	/*
5293 	 * Least significant 64 bits of the hash
5294 	 */
5295 
5296 	if (mp_version == MPTCP_VERSION_0) {
5297 		idsn[7] = sha_digest[12];
5298 		idsn[6] = sha_digest[13];
5299 		idsn[5] = sha_digest[14];
5300 		idsn[4] = sha_digest[15];
5301 		idsn[3] = sha_digest[16];
5302 		idsn[2] = sha_digest[17];
5303 		idsn[1] = sha_digest[18];
5304 		idsn[0] = sha_digest[19];
5305 	} else {
5306 		idsn[7] = sha_digest[24];
5307 		idsn[6] = sha_digest[25];
5308 		idsn[5] = sha_digest[26];
5309 		idsn[4] = sha_digest[27];
5310 		idsn[3] = sha_digest[28];
5311 		idsn[2] = sha_digest[29];
5312 		idsn[1] = sha_digest[30];
5313 		idsn[0] = sha_digest[31];
5314 	}
5315 	return;
5316 }
5317 
5318 static void
mptcp_conn_properties(struct mptcb * mp_tp)5319 mptcp_conn_properties(struct mptcb *mp_tp)
5320 {
5321 	/* Set DSS checksum flag */
5322 	if (mptcp_dss_csum) {
5323 		mp_tp->mpt_flags |= MPTCPF_CHECKSUM;
5324 	}
5325 
5326 	/* Set up receive window */
5327 	mp_tp->mpt_rcvwnd = mptcp_sbspace(mp_tp);
5328 
5329 	/* Set up gc ticks */
5330 	mp_tp->mpt_gc_ticks = MPT_GC_TICKS;
5331 }
5332 
5333 static void
mptcp_init_local_parms(struct mptses * mpte,struct sockaddr * dst)5334 mptcp_init_local_parms(struct mptses *mpte, struct sockaddr* dst)
5335 {
5336 	struct mptcb *mp_tp = mpte->mpte_mptcb;
5337 	char key_digest[MAX(SHA1_RESULTLEN, SHA256_DIGEST_LENGTH)];
5338 	uint16_t digest_len;
5339 
5340 	if (mpte->mpte_flags & MPTE_FORCE_V0 || !mptcp_enable_v1) {
5341 		mp_tp->mpt_version = MPTCP_VERSION_0;
5342 	} else if (mpte->mpte_flags & MPTE_FORCE_V1 && mptcp_enable_v1) {
5343 		mp_tp->mpt_version = MPTCP_VERSION_1;
5344 	} else {
5345 		mp_tp->mpt_version = tcp_cache_get_mptcp_version(dst);
5346 	}
5347 	VERIFY(mp_tp->mpt_version == MPTCP_VERSION_0 ||
5348 	    mp_tp->mpt_version == MPTCP_VERSION_1);
5349 
5350 	read_frandom(&mp_tp->mpt_localkey, sizeof(mp_tp->mpt_localkey));
5351 	if (mp_tp->mpt_version == MPTCP_VERSION_0) {
5352 		digest_len = SHA1_RESULTLEN;
5353 		mptcp_do_sha1(&mp_tp->mpt_localkey, key_digest);
5354 	} else {
5355 		digest_len = SHA256_DIGEST_LENGTH;
5356 		mptcp_do_sha256(&mp_tp->mpt_localkey, key_digest);
5357 	}
5358 
5359 	mptcp_generate_token(key_digest, digest_len,
5360 	    (caddr_t)&mp_tp->mpt_localtoken, sizeof(mp_tp->mpt_localtoken));
5361 	mptcp_generate_idsn(key_digest, digest_len,
5362 	    (caddr_t)&mp_tp->mpt_local_idsn, sizeof(u_int64_t), mp_tp->mpt_version);
5363 	/* The subflow SYN is also first MPTCP byte */
5364 	mp_tp->mpt_snduna = mp_tp->mpt_sndmax = mp_tp->mpt_local_idsn + 1;
5365 	mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;
5366 
5367 	mptcp_conn_properties(mp_tp);
5368 }
5369 
5370 int
mptcp_init_remote_parms(struct mptcb * mp_tp)5371 mptcp_init_remote_parms(struct mptcb *mp_tp)
5372 {
5373 	/* Setup local and remote tokens and Initial DSNs */
5374 	char remote_digest[MAX(SHA1_RESULTLEN, SHA256_DIGEST_LENGTH)];
5375 	uint16_t digest_len;
5376 
5377 	if (mp_tp->mpt_version == MPTCP_VERSION_0) {
5378 		digest_len = SHA1_RESULTLEN;
5379 		mptcp_do_sha1(&mp_tp->mpt_remotekey, remote_digest);
5380 	} else if (mp_tp->mpt_version == MPTCP_VERSION_1) {
5381 		digest_len = SHA256_DIGEST_LENGTH;
5382 		mptcp_do_sha256(&mp_tp->mpt_remotekey, remote_digest);
5383 	} else {
5384 		return -1;
5385 	}
5386 
5387 	mptcp_generate_token(remote_digest, digest_len,
5388 	    (caddr_t)&mp_tp->mpt_remotetoken, sizeof(mp_tp->mpt_remotetoken));
5389 	mptcp_generate_idsn(remote_digest, digest_len,
5390 	    (caddr_t)&mp_tp->mpt_remote_idsn, sizeof(u_int64_t), mp_tp->mpt_version);
5391 	mp_tp->mpt_rcvnxt = mp_tp->mpt_remote_idsn + 1;
5392 	mp_tp->mpt_rcvadv = mp_tp->mpt_rcvnxt + mp_tp->mpt_rcvwnd;
5393 	return 0;
5394 }
5395 
5396 static void
mptcp_send_dfin(struct socket * so)5397 mptcp_send_dfin(struct socket *so)
5398 {
5399 	struct tcpcb *tp = NULL;
5400 	struct inpcb *inp = NULL;
5401 
5402 	inp = sotoinpcb(so);
5403 	if (!inp) {
5404 		return;
5405 	}
5406 
5407 	tp = intotcpcb(inp);
5408 	if (!tp) {
5409 		return;
5410 	}
5411 
5412 	if (!(tp->t_mpflags & TMPF_RESET)) {
5413 		tp->t_mpflags |= TMPF_SEND_DFIN;
5414 	}
5415 }
5416 
5417 /*
5418  * Data Sequence Mapping routines
5419  */
5420 void
mptcp_insert_dsn(struct mppcb * mpp,struct mbuf * m)5421 mptcp_insert_dsn(struct mppcb *mpp, struct mbuf *m)
5422 {
5423 	struct mptcb *mp_tp;
5424 
5425 	if (m == NULL) {
5426 		return;
5427 	}
5428 
5429 	mp_tp = &__container_of(mpp, struct mpp_mtp, mpp)->mtcb;
5430 
5431 	while (m) {
5432 		VERIFY(m->m_flags & M_PKTHDR);
5433 		m->m_pkthdr.pkt_flags |= (PKTF_MPTCP | PKTF_MPSO);
5434 		m->m_pkthdr.mp_dsn = mp_tp->mpt_sndmax;
5435 		VERIFY(m_pktlen(m) >= 0 && m_pktlen(m) < UINT16_MAX);
5436 		m->m_pkthdr.mp_rlen = (uint16_t)m_pktlen(m);
5437 		mp_tp->mpt_sndmax += m_pktlen(m);
5438 		m = m->m_next;
5439 	}
5440 }
5441 
/*
 * After a fallback to regular TCP, derive the MPTCP-level cumulative
 * DATA_ACK implied by `len' subflow bytes being acked starting at mbuf
 * chain `m', and feed it to mptcp_data_ack_rcvd() when it is new.
 */
void
mptcp_fallback_sbdrop(struct socket *so, struct mbuf *m, int len)
{
	struct mptcb *mp_tp = tptomptp(sototcpcb(so));
	uint64_t data_ack;
	uint64_t dsn;

	VERIFY(len >= 0);

	if (!m || len == 0) {
		return;
	}

	/*
	 * Walk the chain, optimistically assuming that each mapping we
	 * traverse gets fully acked (dsn + rlen); the two cases below
	 * correct data_ack when the ack ends at or inside a mapping.
	 */
	while (m && len > 0) {
		VERIFY(m->m_flags & M_PKTHDR);
		VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);

		data_ack = m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen;
		dsn = m->m_pkthdr.mp_dsn;

		len -= m->m_len;
		m = m->m_next;
	}

	if (m && len == 0) {
		/*
		 * If there is one more mbuf in the chain, it automatically means
		 * that up to m->mp_dsn has been ack'ed.
		 *
		 * This means, we actually correct data_ack back down (compared
		 * to what we set inside the loop - dsn + data_len). Because in
		 * the loop we are "optimistic" and assume that the full mapping
		 * will be acked. If that's not the case and we get out of the
		 * loop with m != NULL, it means only up to m->mp_dsn has been
		 * really acked.
		 */
		data_ack = m->m_pkthdr.mp_dsn;
	}

	if (len < 0) {
		/*
		 * If len is negative, meaning we acked in the middle of an mbuf,
		 * only up to this mbuf's data-sequence number has been acked
		 * at the MPTCP-level.
		 */
		data_ack = dsn;
	}

	/* We can have data in the subflow's send-queue that is being acked,
	 * while the DATA_ACK has already advanced. Thus, we should check whether
	 * or not the DATA_ACK is actually new here.
	 */
	if (MPTCP_SEQ_LEQ(data_ack, mp_tp->mpt_sndmax) &&
	    MPTCP_SEQ_GEQ(data_ack, mp_tp->mpt_snduna)) {
		mptcp_data_ack_rcvd(mp_tp, sototcpcb(so), data_ack);
	}
}
5499 
/*
 * Adjust the mbuf DSN mappings of `so's send buffer before `len' acked
 * bytes are dropped from it.  Mappings fully covered by the drop are
 * exhausted (rlen set to 0); the mapping containing the ack point is
 * advanced in place.  During a TFO rewind the sequence fields were
 * already advanced once, so only the remaining length is adjusted.
 */
void
mptcp_preproc_sbdrop(struct socket *so, struct mbuf *m, unsigned int len)
{
	int rewinding = 0;

	/* TFO makes things complicated. */
	if (so->so_flags1 & SOF1_TFO_REWIND) {
		rewinding = 1;
		so->so_flags1 &= ~SOF1_TFO_REWIND;
	}

	/* Mappings are only walked when `so' is not a subflow socket,
	 * or when rewinding after TFO */
	while (m && (!(so->so_flags & SOF_MP_SUBFLOW) || rewinding)) {
		u_int32_t sub_len;
		VERIFY(m->m_flags & M_PKTHDR);
		VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);

		sub_len = m->m_pkthdr.mp_rlen;

		if (sub_len < len) {
			/* This mapping is fully acked; exhaust it */
			m->m_pkthdr.mp_dsn += sub_len;
			if (!(m->m_pkthdr.pkt_flags & PKTF_MPSO)) {
				m->m_pkthdr.mp_rseq += sub_len;
			}
			m->m_pkthdr.mp_rlen = 0;
			len -= sub_len;
		} else {
			/* sub_len >= len */
			if (rewinding == 0) {
				m->m_pkthdr.mp_dsn += len;
			}
			if (!(m->m_pkthdr.pkt_flags & PKTF_MPSO)) {
				if (rewinding == 0) {
					m->m_pkthdr.mp_rseq += len;
				}
			}
			m->m_pkthdr.mp_rlen -= len;
			break;
		}
		m = m->m_next;
	}

	if (so->so_flags & SOF_MP_SUBFLOW &&
	    !(sototcpcb(so)->t_mpflags & TMPF_TFO_REQUEST) &&
	    !(sototcpcb(so)->t_mpflags & TMPF_RCVD_DACK)) {
		/*
		 * Received an ack without receiving a DATA_ACK.
		 * Need to fallback to regular TCP (or destroy this subflow).
		 */
		sototcpcb(so)->t_mpflags |= TMPF_INFIN_SENT;
		mptcp_notify_mpfail(so);
	}
}
5552 
5553 /* Obtain the DSN mapping stored in the mbuf */
5554 void
mptcp_output_getm_dsnmap32(struct socket * so,int off,uint32_t * dsn,uint32_t * relseq,uint16_t * data_len,uint16_t * dss_csum)5555 mptcp_output_getm_dsnmap32(struct socket *so, int off,
5556     uint32_t *dsn, uint32_t *relseq, uint16_t *data_len, uint16_t *dss_csum)
5557 {
5558 	u_int64_t dsn64;
5559 
5560 	mptcp_output_getm_dsnmap64(so, off, &dsn64, relseq, data_len, dss_csum);
5561 	*dsn = (u_int32_t)MPTCP_DATASEQ_LOW32(dsn64);
5562 }
5563 
/*
 * Return the DSN mapping (64-bit DSN, relative subflow sequence, mapping
 * length, and DSS checksum) of the mbuf covering byte offset `off' of
 * `so's send buffer.  All outputs are zeroed when a defunct socket has
 * an empty send buffer.
 */
void
mptcp_output_getm_dsnmap64(struct socket *so, int off, uint64_t *dsn,
    uint32_t *relseq, uint16_t *data_len,
    uint16_t *dss_csum)
{
	struct mbuf *m = so->so_snd.sb_mb;

	VERIFY(off >= 0);

	/* A defunct socket may legitimately have nothing queued */
	if (m == NULL && (so->so_flags & SOF_DEFUNCT)) {
		*dsn = 0;
		*relseq = 0;
		*data_len = 0;
		*dss_csum = 0;
		return;
	}

	/*
	 * In the subflow socket, the DSN sequencing can be discontiguous,
	 * but the subflow sequence mapping is contiguous. Use the subflow
	 * sequence property to find the right mbuf and corresponding dsn
	 * mapping.
	 */

	while (m) {
		VERIFY(m->m_flags & M_PKTHDR);
		VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);

		if (off >= m->m_len) {
			off -= m->m_len;
			m = m->m_next;
		} else {
			break;
		}
	}

	/*
	 * NOTE(review): this assumes `off' always falls inside the chain so
	 * that m != NULL here; an out-of-range `off' would dereference NULL
	 * below — confirm callers guarantee this.
	 */
	VERIFY(off >= 0);
	VERIFY(m->m_pkthdr.mp_rlen <= UINT16_MAX);

	*dsn = m->m_pkthdr.mp_dsn;
	*relseq = m->m_pkthdr.mp_rseq;
	*data_len = m->m_pkthdr.mp_rlen;
	*dss_csum = m->m_pkthdr.mp_csum;
}
5608 
5609 void
mptcp_output_getm_data_level_details(struct socket * so,int off,uint16_t * data_len,uint16_t * dss_csum)5610 mptcp_output_getm_data_level_details(struct socket *so, int off, uint16_t *data_len, uint16_t *dss_csum)
5611 {
5612 	uint64_t dsn;
5613 	uint32_t relseq;
5614 
5615 	mptcp_output_getm_dsnmap64(so, off, &dsn, &relseq, data_len, dss_csum);
5616 }
5617 
5618 /*
5619  * Note that this is called only from tcp_input() via mptcp_input_preproc()
5620  * tcp_input() may trim data after the dsn mapping is inserted into the mbuf.
5621  * When it trims data tcp_input calls m_adj() which does not remove the
5622  * m_pkthdr even if the m_len becomes 0 as a result of trimming the mbuf.
5623  * The dsn map insertion cannot be delayed after trim, because data can be in
5624  * the reassembly queue for a while and the DSN option info in tp will be
5625  * overwritten for every new packet received.
5626  * The dsn map will be adjusted just prior to appending to subflow sockbuf
5627  * with mptcp_adj_rmap()
5628  */
/*
 * Copy the DSS mapping parsed from the packet's TCP options (stashed in
 * the subflow's mpts_rcv_map by option processing) into the mbuf's packet
 * header, and schedule a DATA_ACK.  After a fallback, a plain TCP FIN is
 * tagged as a DATA_FIN instead.
 */
void
mptcp_insert_rmap(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th)
{
	/* The mbuf must not already carry a mapping */
	VERIFY(m->m_flags & M_PKTHDR);
	VERIFY(!(m->m_pkthdr.pkt_flags & PKTF_MPTCP));

	if (tp->t_mpflags & TMPF_EMBED_DSN) {
		m->m_pkthdr.mp_dsn = tp->t_mpsub->mpts_rcv_map.mpt_dsn;
		m->m_pkthdr.mp_rseq = tp->t_mpsub->mpts_rcv_map.mpt_sseq;
		m->m_pkthdr.mp_rlen = tp->t_mpsub->mpts_rcv_map.mpt_len;
		m->m_pkthdr.mp_csum = tp->t_mpsub->mpts_rcv_map.mpt_csum;
		if (tp->t_mpsub->mpts_rcv_map.mpt_dfin) {
			m->m_pkthdr.pkt_flags |= PKTF_MPTCP_DFIN;
		}

		m->m_pkthdr.pkt_flags |= PKTF_MPTCP;

		/* The mapping is consumed; request a DATA_ACK in response */
		tp->t_mpflags &= ~TMPF_EMBED_DSN;
		tp->t_mpflags |= TMPF_MPTCP_ACKNOW;
	} else if (tp->t_mpflags & TMPF_TCP_FALLBACK) {
		if (th->th_flags & TH_FIN) {
			m->m_pkthdr.pkt_flags |= PKTF_MPTCP_DFIN;
		}
	}
}
5654 
5655 /*
5656  * Following routines help with failure detection and failover of data
5657  * transfer from one subflow to another.
5658  */
5659 void
mptcp_act_on_txfail(struct socket * so)5660 mptcp_act_on_txfail(struct socket *so)
5661 {
5662 	struct tcpcb *tp = NULL;
5663 	struct inpcb *inp = sotoinpcb(so);
5664 
5665 	if (inp == NULL) {
5666 		return;
5667 	}
5668 
5669 	tp = intotcpcb(inp);
5670 	if (tp == NULL) {
5671 		return;
5672 	}
5673 
5674 	if (so->so_flags & SOF_MP_TRYFAILOVER) {
5675 		return;
5676 	}
5677 
5678 	so->so_flags |= SOF_MP_TRYFAILOVER;
5679 	soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPFAILOVER));
5680 }
5681 
5682 /*
5683  * Support for MP_FAIL option
5684  */
5685 int
mptcp_get_map_for_dsn(struct socket * so,uint64_t dsn_fail,uint32_t * tcp_seq)5686 mptcp_get_map_for_dsn(struct socket *so, uint64_t dsn_fail, uint32_t *tcp_seq)
5687 {
5688 	struct mbuf *m = so->so_snd.sb_mb;
5689 	uint16_t datalen;
5690 	uint64_t dsn;
5691 	int off = 0;
5692 
5693 	if (m == NULL) {
5694 		return -1;
5695 	}
5696 
5697 	while (m != NULL) {
5698 		VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);
5699 		VERIFY(m->m_flags & M_PKTHDR);
5700 		dsn = m->m_pkthdr.mp_dsn;
5701 		datalen = m->m_pkthdr.mp_rlen;
5702 		if (MPTCP_SEQ_LEQ(dsn, dsn_fail) &&
5703 		    (MPTCP_SEQ_GEQ(dsn + datalen, dsn_fail))) {
5704 			off = (int)(dsn_fail - dsn);
5705 			*tcp_seq = m->m_pkthdr.mp_rseq + off;
5706 			return 0;
5707 		}
5708 
5709 		m = m->m_next;
5710 	}
5711 
5712 	/*
5713 	 * If there was no mbuf data and a fallback to TCP occurred, there's
5714 	 * not much else to do.
5715 	 */
5716 
5717 	os_log_error(mptcp_log_handle, "%s: %llu not found \n", __func__, dsn_fail);
5718 	return -1;
5719 }
5720 
5721 /*
5722  * Support for sending contiguous MPTCP bytes in subflow
5723  * Also for preventing sending data with ACK in 3-way handshake
5724  */
/*
 * Given byte offset `off' into the subflow send buffer, return how many
 * bytes of the DSS mapping covering `off' remain from `off' onward, so
 * that a transmission never crosses a mapping boundary.
 */
int32_t
mptcp_adj_sendlen(struct socket *so, int32_t off)
{
	struct tcpcb *tp = sototcpcb(so);
	struct mptsub *mpts = tp->t_mpsub;
	uint64_t mdss_dsn;
	uint32_t mdss_subflow_seq;
	int mdss_subflow_off;	/* send-buffer offset of the mapping's start */
	uint16_t mdss_data_len;
	uint16_t dss_csum;

	/* A defunct subflow with an empty send buffer has nothing to send */
	if (so->so_snd.sb_mb == NULL && (so->so_flags & SOF_DEFUNCT)) {
		return 0;
	}

	mptcp_output_getm_dsnmap64(so, off, &mdss_dsn, &mdss_subflow_seq,
	    &mdss_data_len, &dss_csum);

	/*
	 * We need to compute how much of the mapping still remains.
	 * So, we compute the offset in the send-buffer of the dss-sub-seq.
	 */
	mdss_subflow_off = (mdss_subflow_seq + mpts->mpts_iss) - tp->snd_una;

	/*
	 * When TFO is used, we are sending the mpts->mpts_iss although the relative
	 * seq has been set to 1 (while it should be 0).
	 */
	if (tp->t_mpflags & TMPF_TFO_REQUEST) {
		mdss_subflow_off--;
	}

	VERIFY(off >= mdss_subflow_off);

	/* Bytes of this mapping at or beyond `off' */
	return mdss_data_len - (off - mdss_subflow_off);
}
5761 
5762 static uint32_t
mptcp_get_maxseg(struct mptses * mpte)5763 mptcp_get_maxseg(struct mptses *mpte)
5764 {
5765 	struct mptsub *mpts;
5766 	uint32_t maxseg = 0;
5767 
5768 	TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
5769 		struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
5770 
5771 		if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
5772 		    TCPS_HAVERCVDFIN2(tp->t_state)) {
5773 			continue;
5774 		}
5775 
5776 		if (tp->t_maxseg > maxseg) {
5777 			maxseg = tp->t_maxseg;
5778 		}
5779 	}
5780 
5781 	return maxseg;
5782 }
5783 
5784 static uint8_t
mptcp_get_rcvscale(struct mptses * mpte)5785 mptcp_get_rcvscale(struct mptses *mpte)
5786 {
5787 	struct mptsub *mpts;
5788 	uint8_t rcvscale = UINT8_MAX;
5789 
5790 	TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
5791 		struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
5792 
5793 		if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
5794 		    TCPS_HAVERCVDFIN2(tp->t_state)) {
5795 			continue;
5796 		}
5797 
5798 		if (tp->rcv_scale < rcvscale) {
5799 			rcvscale = tp->rcv_scale;
5800 		}
5801 	}
5802 
5803 	return rcvscale;
5804 }
5805 
5806 /* Similar to tcp_sbrcv_reserve */
5807 static void
mptcp_sbrcv_reserve(struct mptcb * mp_tp,struct sockbuf * sbrcv,u_int32_t newsize,u_int32_t idealsize)5808 mptcp_sbrcv_reserve(struct mptcb *mp_tp, struct sockbuf *sbrcv,
5809     u_int32_t newsize, u_int32_t idealsize)
5810 {
5811 	uint8_t rcvscale = mptcp_get_rcvscale(mp_tp->mpt_mpte);
5812 
5813 	if (rcvscale == UINT8_MAX) {
5814 		return;
5815 	}
5816 
5817 	/* newsize should not exceed max */
5818 	newsize = min(newsize, tcp_autorcvbuf_max);
5819 
5820 	/* The receive window scale negotiated at the
5821 	 * beginning of the connection will also set a
5822 	 * limit on the socket buffer size
5823 	 */
5824 	newsize = min(newsize, TCP_MAXWIN << rcvscale);
5825 
5826 	/* Set new socket buffer size */
5827 	if (newsize > sbrcv->sb_hiwat &&
5828 	    (sbreserve(sbrcv, newsize) == 1)) {
5829 		sbrcv->sb_idealsize = min(max(sbrcv->sb_idealsize,
5830 		    (idealsize != 0) ? idealsize : newsize), tcp_autorcvbuf_max);
5831 
5832 		/* Again check the limit set by the advertised
5833 		 * window scale
5834 		 */
5835 		sbrcv->sb_idealsize = min(sbrcv->sb_idealsize,
5836 		    TCP_MAXWIN << rcvscale);
5837 	}
5838 }
5839 
5840 void
mptcp_sbrcv_grow(struct mptcb * mp_tp)5841 mptcp_sbrcv_grow(struct mptcb *mp_tp)
5842 {
5843 	struct mptses *mpte = mp_tp->mpt_mpte;
5844 	struct socket *mp_so = mpte->mpte_mppcb->mpp_socket;
5845 	struct sockbuf *sbrcv = &mp_so->so_rcv;
5846 	uint32_t hiwat_sum = 0;
5847 	uint32_t ideal_sum = 0;
5848 	struct mptsub *mpts;
5849 
5850 	/*
5851 	 * Do not grow the receive socket buffer if
5852 	 * - auto resizing is disabled, globally or on this socket
5853 	 * - the high water mark already reached the maximum
5854 	 * - the stream is in background and receive side is being
5855 	 * throttled
5856 	 * - if there are segments in reassembly queue indicating loss,
5857 	 * do not need to increase recv window during recovery as more
5858 	 * data is not going to be sent. A duplicate ack sent during
5859 	 * recovery should not change the receive window
5860 	 */
5861 	if (tcp_do_autorcvbuf == 0 ||
5862 	    (sbrcv->sb_flags & SB_AUTOSIZE) == 0 ||
5863 	    sbrcv->sb_hiwat >= tcp_autorcvbuf_max ||
5864 	    (mp_so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ||
5865 	    !LIST_EMPTY(&mp_tp->mpt_segq)) {
5866 		/* Can not resize the socket buffer, just return */
5867 		return;
5868 	}
5869 
5870 	/*
5871 	 * Ideally, we want the rbuf to be (sum_i {bw_i} * rtt_max * 2)
5872 	 *
5873 	 * But, for this we first need accurate receiver-RTT estimations, which
5874 	 * we currently don't have.
5875 	 *
5876 	 * Let's use a dummy algorithm for now, just taking the sum of all
5877 	 * subflow's receive-buffers. It's too low, but that's all we can get
5878 	 * for now.
5879 	 */
5880 
5881 	TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
5882 		hiwat_sum += mpts->mpts_socket->so_rcv.sb_hiwat;
5883 		ideal_sum += mpts->mpts_socket->so_rcv.sb_idealsize;
5884 	}
5885 
5886 	mptcp_sbrcv_reserve(mp_tp, sbrcv, hiwat_sum, ideal_sum);
5887 }
5888 
5889 /*
5890  * Determine if we can grow the recieve socket buffer to avoid sending
5891  * a zero window update to the peer. We allow even socket buffers that
5892  * have fixed size (set by the application) to grow if the resource
5893  * constraints are met. They will also be trimmed after the application
5894  * reads data.
5895  *
5896  * Similar to tcp_sbrcv_grow_rwin
5897  */
static void
mptcp_sbrcv_grow_rwin(struct mptcb *mp_tp, struct sockbuf *sb)
{
	struct socket *mp_so = mp_tp->mpt_mpte->mpte_mppcb->mpp_socket;
	/* Grow in increments of 16 * maxseg. */
	u_int32_t rcvbufinc = mptcp_get_maxseg(mp_tp->mpt_mpte) << 4;
	u_int32_t rcvbuf = sb->sb_hiwat;

	/* Do not grow while receive-side background throttling is active. */
	if (tcp_recv_bg == 1 || IS_TCP_RECV_BG(mp_so)) {
		return;
	}

	/*
	 * Grow only when auto-resizing is enabled, the connection is not in
	 * extended background-idle state, the buffer is nearly full and
	 * there is still headroom below both the global max and the ideal
	 * size (plus one increment).
	 */
	if (tcp_do_autorcvbuf == 1 &&
	    /* Diff to tcp_sbrcv_grow_rwin */
	    (mp_so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) == 0 &&
	    (rcvbuf - sb->sb_cc) < rcvbufinc &&
	    rcvbuf < tcp_autorcvbuf_max &&
	    (sb->sb_idealsize > 0 &&
	    sb->sb_hiwat <= (sb->sb_idealsize + rcvbufinc))) {
		sbreserve(sb, min((sb->sb_hiwat + rcvbufinc), tcp_autorcvbuf_max));
	}
}
5919 
5920 /* Similar to tcp_sbspace */
int32_t
mptcp_sbspace(struct mptcb *mp_tp)
{
	struct sockbuf *sb = &mp_tp->mpt_mpte->mpte_mppcb->mpp_socket->so_rcv;
	uint32_t rcvbuf;
	int32_t space;
	int32_t pending = 0;

	socket_lock_assert_owned(mptetoso(mp_tp->mpt_mpte));

	/* Opportunistically grow the buffer to avoid advertising a zero window. */
	mptcp_sbrcv_grow_rwin(mp_tp, sb);

	/* hiwat might have changed */
	rcvbuf = sb->sb_hiwat;

	/* Available space is bounded by both byte count and mbuf accounting. */
	space =  ((int32_t) imin((rcvbuf - sb->sb_cc),
	    (sb->sb_mbmax - sb->sb_mbcnt)));
	if (space < 0) {
		space = 0;
	}

#if CONTENT_FILTER
	/* Compensate for data being processed by content filters */
	pending = cfil_sock_data_space(sb);
#endif /* CONTENT_FILTER */
	if (pending > space) {
		space = 0;
	} else {
		space -= pending;
	}

	return space;
}
5954 
5955 /*
5956  * Support Fallback to Regular TCP
5957  */
5958 void
mptcp_notify_mpready(struct socket * so)5959 mptcp_notify_mpready(struct socket *so)
5960 {
5961 	struct tcpcb *tp = NULL;
5962 
5963 	if (so == NULL) {
5964 		return;
5965 	}
5966 
5967 	tp = intotcpcb(sotoinpcb(so));
5968 
5969 	if (tp == NULL) {
5970 		return;
5971 	}
5972 
5973 	DTRACE_MPTCP4(multipath__ready, struct socket *, so,
5974 	    struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd,
5975 	    struct tcpcb *, tp);
5976 
5977 	if (!(tp->t_mpflags & TMPF_MPTCP_TRUE)) {
5978 		return;
5979 	}
5980 
5981 	if (tp->t_mpflags & TMPF_MPTCP_READY) {
5982 		return;
5983 	}
5984 
5985 	tp->t_mpflags &= ~TMPF_TCP_FALLBACK;
5986 	tp->t_mpflags |= TMPF_MPTCP_READY;
5987 
5988 	soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPSTATUS));
5989 }
5990 
5991 void
mptcp_notify_mpfail(struct socket * so)5992 mptcp_notify_mpfail(struct socket *so)
5993 {
5994 	struct tcpcb *tp = NULL;
5995 
5996 	if (so == NULL) {
5997 		return;
5998 	}
5999 
6000 	tp = intotcpcb(sotoinpcb(so));
6001 
6002 	if (tp == NULL) {
6003 		return;
6004 	}
6005 
6006 	DTRACE_MPTCP4(multipath__failed, struct socket *, so,
6007 	    struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd,
6008 	    struct tcpcb *, tp);
6009 
6010 	if (tp->t_mpflags & TMPF_TCP_FALLBACK) {
6011 		return;
6012 	}
6013 
6014 	tp->t_mpflags &= ~(TMPF_MPTCP_READY | TMPF_MPTCP_TRUE);
6015 	tp->t_mpflags |= TMPF_TCP_FALLBACK;
6016 
6017 	soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPSTATUS));
6018 }
6019 
6020 /*
6021  * Keepalive helper function
6022  */
6023 boolean_t
mptcp_ok_to_keepalive(struct mptcb * mp_tp)6024 mptcp_ok_to_keepalive(struct mptcb *mp_tp)
6025 {
6026 	boolean_t ret = 1;
6027 
6028 	socket_lock_assert_owned(mptetoso(mp_tp->mpt_mpte));
6029 
6030 	if (mp_tp->mpt_state >= MPTCPS_CLOSE_WAIT) {
6031 		ret = 0;
6032 	}
6033 	return ret;
6034 }
6035 
6036 /*
6037  * MPTCP t_maxseg adjustment function
6038  */
int
mptcp_adj_mss(struct tcpcb *tp, boolean_t mtudisc)
{
	int mss_lower = 0;
	struct mptcb *mp_tp = tptomptp(tp);

/*
 * Space consumed in each segment by the most common MPTCP option
 * (DSS + data-ack), including 2 bytes of padding either for the
 * checksum field or for 32-bit alignment + EOL.
 */
#define MPTCP_COMPUTE_LEN {                             \
	mss_lower = sizeof (struct mptcp_dss_ack_opt);  \
	if (mp_tp->mpt_flags & MPTCPF_CHECKSUM)         \
	        mss_lower += 2;                         \
	else                                            \
	/* adjust to 32-bit boundary + EOL */   \
	        mss_lower += 2;                         \
}
	if (mp_tp == NULL) {
		return 0;
	}

	socket_lock_assert_owned(mptetoso(mp_tp->mpt_mpte));

	/*
	 * For the first subflow and subsequent subflows, adjust mss for
	 * most common MPTCP option size, for case where tcp_mss is called
	 * during option processing and MTU discovery.
	 */
	if (!mtudisc) {
		if (tp->t_mpflags & TMPF_MPTCP_TRUE &&
		    !(tp->t_mpflags & TMPF_JOINED_FLOW)) {
			MPTCP_COMPUTE_LEN;
		}

		if (tp->t_mpflags & TMPF_PREESTABLISHED &&
		    tp->t_mpflags & TMPF_SENT_JOIN) {
			MPTCP_COMPUTE_LEN;
		}
	} else {
		if (tp->t_mpflags & TMPF_MPTCP_TRUE) {
			MPTCP_COMPUTE_LEN;
		}
	}

	return mss_lower;
}
6082 
/*
 * Fill one mptcp_flow_t record for the pcblist sysctl from a subflow's
 * socket and its mptsub state.
 */
static void
fill_mptcp_subflow(struct socket *so, mptcp_flow_t *flow, struct mptsub *mpts)
{
	struct inpcb *inp;

	/* TCP-level connection info first, then the address 4-tuple. */
	tcp_getconninfo(so, &flow->flow_ci);
	inp = sotoinpcb(so);
	if ((inp->inp_vflag & INP_IPV6) != 0) {
		flow->flow_src.ss_family = AF_INET6;
		flow->flow_dst.ss_family = AF_INET6;
		flow->flow_src.ss_len = sizeof(struct sockaddr_in6);
		flow->flow_dst.ss_len = sizeof(struct sockaddr_in6);
		SIN6(&flow->flow_src)->sin6_port = inp->in6p_lport;
		SIN6(&flow->flow_dst)->sin6_port = inp->in6p_fport;
		SIN6(&flow->flow_src)->sin6_addr = inp->in6p_laddr;
		SIN6(&flow->flow_dst)->sin6_addr = inp->in6p_faddr;
	} else if ((inp->inp_vflag & INP_IPV4) != 0) {
		flow->flow_src.ss_family = AF_INET;
		flow->flow_dst.ss_family = AF_INET;
		flow->flow_src.ss_len = sizeof(struct sockaddr_in);
		flow->flow_dst.ss_len = sizeof(struct sockaddr_in);
		SIN(&flow->flow_src)->sin_port = inp->inp_lport;
		SIN(&flow->flow_dst)->sin_port = inp->inp_fport;
		SIN(&flow->flow_src)->sin_addr = inp->inp_laddr;
		SIN(&flow->flow_dst)->sin_addr = inp->inp_faddr;
	}
	/* Per-flow metadata consumed by userland tools. */
	flow->flow_len = sizeof(*flow);
	flow->flow_tcpci_offset = offsetof(mptcp_flow_t, flow_ci);
	flow->flow_flags = mpts->mpts_flags;
	flow->flow_cid = mpts->mpts_connid;
	flow->flow_relseq = mpts->mpts_rel_seq;
	flow->flow_soerror = mpts->mpts_socket->so_error;
	flow->flow_probecnt = mpts->mpts_probecnt;
}
6117 
/*
 * Sysctl handler for net.inet.mptcp.pcblist: walks all MPTCP PCBs under
 * mppi_lock and emits one conninfo_mptcp_t followed by its flow records.
 */
static int
mptcp_pcblist SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
	int error = 0, f;
	size_t len;
	struct mppcb *mpp;
	struct mptses *mpte;
	struct mptcb *mp_tp;
	struct mptsub *mpts;
	struct socket *so;
	conninfo_mptcp_t mptcpci;
	mptcp_flow_t *flows = NULL;

	/* Read-only sysctl - writes are rejected. */
	if (req->newptr != USER_ADDR_NULL) {
		return EPERM;
	}

	lck_mtx_lock(&mtcbinfo.mppi_lock);
	if (req->oldptr == USER_ADDR_NULL) {
		/*
		 * Size probe: report an estimate with ~12% slack so the
		 * caller's buffer survives connections added in between.
		 */
		size_t n = mtcbinfo.mppi_count;
		lck_mtx_unlock(&mtcbinfo.mppi_lock);
		req->oldidx = (n + n / 8) * sizeof(conninfo_mptcp_t) +
		    4 * (n + n / 8)  * sizeof(mptcp_flow_t);
		return 0;
	}
	TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
		flows = NULL;
		socket_lock(mpp->mpp_socket, 1);
		VERIFY(mpp->mpp_flags & MPP_ATTACHED);
		mpte = mptompte(mpp);

		socket_lock_assert_owned(mptetoso(mpte));
		mp_tp = mpte->mpte_mptcb;

		/* Snapshot the MPTCP-level state into the output record. */
		bzero(&mptcpci, sizeof(mptcpci));
		mptcpci.mptcpci_state = mp_tp->mpt_state;
		mptcpci.mptcpci_flags = mp_tp->mpt_flags;
		mptcpci.mptcpci_ltoken = mp_tp->mpt_localtoken;
		mptcpci.mptcpci_rtoken = mp_tp->mpt_remotetoken;
		mptcpci.mptcpci_notsent_lowat = mp_tp->mpt_notsent_lowat;
		mptcpci.mptcpci_snduna = mp_tp->mpt_snduna;
		mptcpci.mptcpci_sndnxt = mp_tp->mpt_sndnxt;
		mptcpci.mptcpci_sndmax = mp_tp->mpt_sndmax;
		mptcpci.mptcpci_lidsn = mp_tp->mpt_local_idsn;
		mptcpci.mptcpci_sndwnd = mp_tp->mpt_sndwnd;
		mptcpci.mptcpci_rcvnxt = mp_tp->mpt_rcvnxt;
		mptcpci.mptcpci_ridsn = mp_tp->mpt_remote_idsn;
		mptcpci.mptcpci_rcvwnd = mp_tp->mpt_rcvwnd;

		mptcpci.mptcpci_nflows = mpte->mpte_numflows;
		mptcpci.mptcpci_mpte_flags = mpte->mpte_flags;
		mptcpci.mptcpci_mpte_addrid = mpte->mpte_addrid_last;
		mptcpci.mptcpci_flow_offset =
		    offsetof(conninfo_mptcp_t, mptcpci_flows);

		len = sizeof(*flows) * mpte->mpte_numflows;
		if (mpte->mpte_numflows != 0) {
			flows = kalloc_data(len, Z_WAITOK | Z_ZERO);
			if (flows == NULL) {
				socket_unlock(mpp->mpp_socket, 1);
				break;
			}
			/*
			 * conninfo_mptcp_t embeds one flow record; the rest
			 * are appended, so output the header without its
			 * embedded flow and send the flows separately below.
			 */
			mptcpci.mptcpci_len = sizeof(mptcpci) +
			    sizeof(*flows) * (mptcpci.mptcpci_nflows - 1);
			error = SYSCTL_OUT(req, &mptcpci,
			    sizeof(mptcpci) - sizeof(mptcp_flow_t));
		} else {
			mptcpci.mptcpci_len = sizeof(mptcpci);
			error = SYSCTL_OUT(req, &mptcpci, sizeof(mptcpci));
		}
		if (error) {
			socket_unlock(mpp->mpp_socket, 1);
			kfree_data(flows, len);
			break;
		}
		/* Fill the flow array while the socket is still locked. */
		f = 0;
		TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
			so = mpts->mpts_socket;
			fill_mptcp_subflow(so, &flows[f], mpts);
			f++;
		}
		socket_unlock(mpp->mpp_socket, 1);
		if (flows) {
			error = SYSCTL_OUT(req, flows, len);
			kfree_data(flows, len);
			if (error) {
				break;
			}
		}
	}
	lck_mtx_unlock(&mtcbinfo.mppi_lock);

	return error;
}
6213 
/* Read-only sysctl node backed by mptcp_pcblist() above. */
SYSCTL_PROC(_net_inet_mptcp, OID_AUTO, pcblist, CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, mptcp_pcblist, "S,conninfo_mptcp_t",
    "List of active MPTCP connections");
6217 
6218 /*
6219  * Set notsent lowat mark on the MPTCB
6220  */
6221 int
mptcp_set_notsent_lowat(struct mptses * mpte,int optval)6222 mptcp_set_notsent_lowat(struct mptses *mpte, int optval)
6223 {
6224 	struct mptcb *mp_tp = NULL;
6225 	int error = 0;
6226 
6227 	if (mpte->mpte_mppcb->mpp_flags & MPP_ATTACHED) {
6228 		mp_tp = mpte->mpte_mptcb;
6229 	}
6230 
6231 	if (mp_tp) {
6232 		mp_tp->mpt_notsent_lowat = optval;
6233 	} else {
6234 		error = EINVAL;
6235 	}
6236 
6237 	return error;
6238 }
6239 
6240 u_int32_t
mptcp_get_notsent_lowat(struct mptses * mpte)6241 mptcp_get_notsent_lowat(struct mptses *mpte)
6242 {
6243 	struct mptcb *mp_tp = NULL;
6244 
6245 	if (mpte->mpte_mppcb->mpp_flags & MPP_ATTACHED) {
6246 		mp_tp = mpte->mpte_mptcb;
6247 	}
6248 
6249 	if (mp_tp) {
6250 		return mp_tp->mpt_notsent_lowat;
6251 	} else {
6252 		return 0;
6253 	}
6254 }
6255 
/*
 * Returns 1 when the MPTCP socket is writable under the not-sent low-water
 * mark policy (see SO_NOTSENT_LOWAT), 0 otherwise.
 */
int
mptcp_notsent_lowat_check(struct socket *so)
{
	struct mptses *mpte;
	struct mppcb *mpp;
	struct mptcb *mp_tp;
	struct mptsub *mpts;

	int notsent = 0;

	mpp = mpsotomppcb(so);
	if (mpp == NULL || mpp->mpp_state == MPPCB_STATE_DEAD) {
		return 0;
	}

	mpte = mptompte(mpp);
	socket_lock_assert_owned(mptetoso(mpte));
	mp_tp = mpte->mpte_mptcb;

	notsent = so->so_snd.sb_cc;

	/*
	 * Writable if the send buffer is empty, or if the amount of data
	 * not yet handed to the subflows is at or below the low-water mark.
	 */
	if ((notsent == 0) ||
	    ((notsent - (mp_tp->mpt_sndnxt - mp_tp->mpt_snduna)) <=
	    mp_tp->mpt_notsent_lowat)) {
		return 1;
	}

	/* When Nagle's algorithm is not disabled, it is better
	 * to wakeup the client even before there is atleast one
	 * maxseg of data to write.
	 */
	TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
		int retval = 0;
		/* Only the first active subflow is consulted. */
		if (mpts->mpts_flags & MPTSF_ACTIVE) {
			struct socket *subf_so = mpts->mpts_socket;
			struct tcpcb *tp = intotcpcb(sotoinpcb(subf_so));

			notsent = so->so_snd.sb_cc -
			    (tp->snd_nxt - tp->snd_una);

			if ((tp->t_flags & TF_NODELAY) == 0 &&
			    notsent > 0 && (notsent <= (int)tp->t_maxseg)) {
				retval = 1;
			}
			return retval;
		}
	}
	return 0;
}
6305 
/*
 * Kernel-control connect callback for the Symptoms advisory socket.
 * Records the client's unit so advisories can be sent back to it.
 */
static errno_t
mptcp_symptoms_ctl_connect(kern_ctl_ref kctlref, struct sockaddr_ctl *sac,
    void **unitinfo)
{
#pragma unused(kctlref, sac, unitinfo)

	/* Only a single Symptoms client is expected at a time. */
	if (OSIncrementAtomic(&mptcp_kern_skt_inuse) > 0) {
		os_log_error(mptcp_log_handle, "%s: MPTCP kernel-control socket for Symptoms already open!", __func__);
	}

	/* Remembered for replies - see mptcp_ask_symptoms(). */
	mptcp_kern_skt_unit = sac->sc_unit;

	return 0;
}
6320 
/*
 * Symptoms granted cell access for the app identified by `uuid'.
 * Re-evaluate subflows of every MPTCP session belonging to that app,
 * temporarily marking the session as access-granted (and, for weak RSSI,
 * as cell-prohibited) while doing so.
 */
static void
mptcp_allow_uuid(uuid_t uuid, int32_t rssi)
{
	struct mppcb *mpp;

	/* Iterate over all MPTCP connections */

	lck_mtx_lock(&mtcbinfo.mppi_lock);

	TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
		struct socket *mp_so = mpp->mpp_socket;
		struct mptses *mpte = mpp->mpp_pcbe;

		socket_lock(mp_so, 1);

		/* Match against the effective UUID (delegated or last owner). */
		if (mp_so->so_flags & SOF_DELEGATED &&
		    uuid_compare(uuid, mp_so->e_uuid)) {
			goto next;
		} else if (!(mp_so->so_flags & SOF_DELEGATED) &&
		    uuid_compare(uuid, mp_so->last_uuid)) {
			goto next;
		}

		os_log(mptcp_log_handle, "%s - %lx: Got allowance for useApp with rssi %d\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), rssi);

		mpte->mpte_flags |= MPTE_ACCESS_GRANTED;

		/* Higher value means better signal - cell is not needed then. */
		if (rssi > MPTCP_TARGET_BASED_RSSI_THRESHOLD) {
			mpte->mpte_flags |= MPTE_CELL_PROHIBITED;
		}

		mptcp_check_subflows_and_add(mpte);
		mptcp_remove_subflows(mpte);

		/* The grant only applies to this evaluation pass. */
		mpte->mpte_flags &= ~(MPTE_ACCESS_GRANTED | MPTE_CELL_PROHIBITED);

next:
		socket_unlock(mp_so, 1);
	}

	lck_mtx_unlock(&mtcbinfo.mppi_lock);
}
6364 
/*
 * Symptoms reported a Wi-Fi status change: re-evaluate subflow membership
 * for every session whose service type reacts to Wi-Fi quality.
 */
static void
mptcp_wifi_status_changed(void)
{
	struct mppcb *mpp;

	/* Iterate over all MPTCP connections */

	lck_mtx_lock(&mtcbinfo.mppi_lock);

	TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
		struct socket *mp_so = mpp->mpp_socket;
		struct mptses *mpte = mpp->mpp_pcbe;

		socket_lock(mp_so, 1);

		/* Only handover- and urgency-mode are purely driven by Symptom's Wi-Fi status */
		if (mpte->mpte_svctype != MPTCP_SVCTYPE_HANDOVER &&
		    mpte->mpte_svctype != MPTCP_SVCTYPE_PURE_HANDOVER &&
		    mpte->mpte_svctype != MPTCP_SVCTYPE_TARGET_BASED) {
			goto next;
		}

		mptcp_check_subflows_and_add(mpte);
		mptcp_check_subflows_and_remove(mpte);

next:
		socket_unlock(mp_so, 1);
	}

	lck_mtx_unlock(&mtcbinfo.mppi_lock);
}
6396 
/* State shared between mptcp_find_proc() and its proc-iterate callbacks. */
struct mptcp_uuid_search_info {
	uuid_t target_uuid;       /* executable UUID being searched for */
	proc_t found_proc;        /* matching proc, or PROC_NULL */
	boolean_t is_proc_found;  /* set once; stops further filter matches */
};
6402 
6403 static int
mptcp_find_proc_filter(proc_t p,void * arg)6404 mptcp_find_proc_filter(proc_t p, void *arg)
6405 {
6406 	struct mptcp_uuid_search_info *info = (struct mptcp_uuid_search_info *)arg;
6407 	int found;
6408 
6409 	if (info->is_proc_found) {
6410 		return 0;
6411 	}
6412 
6413 	/*
6414 	 * uuid_compare returns 0 if the uuids are matching, but the proc-filter
6415 	 * expects != 0 for a matching filter.
6416 	 */
6417 	found = uuid_compare(proc_executableuuid_addr(p), info->target_uuid) == 0;
6418 	if (found) {
6419 		info->is_proc_found = true;
6420 	}
6421 
6422 	return found;
6423 }
6424 
6425 static int
mptcp_find_proc_callout(proc_t p,void * arg)6426 mptcp_find_proc_callout(proc_t p, void * arg)
6427 {
6428 	struct mptcp_uuid_search_info *info = (struct mptcp_uuid_search_info *)arg;
6429 
6430 	if (uuid_compare(proc_executableuuid_addr(p), info->target_uuid) == 0) {
6431 		info->found_proc = p;
6432 		return PROC_CLAIMED_DONE;
6433 	}
6434 
6435 	return PROC_RETURNED;
6436 }
6437 
6438 static proc_t
mptcp_find_proc(const uuid_t uuid)6439 mptcp_find_proc(const uuid_t uuid)
6440 {
6441 	struct mptcp_uuid_search_info info;
6442 
6443 	uuid_copy(info.target_uuid, uuid);
6444 	info.found_proc = PROC_NULL;
6445 	info.is_proc_found = false;
6446 
6447 	proc_iterate(PROC_ALLPROCLIST, mptcp_find_proc_callout, &info,
6448 	    mptcp_find_proc_filter, &info);
6449 
6450 	return info.found_proc;
6451 }
6452 
6453 void
mptcp_ask_symptoms(struct mptses * mpte)6454 mptcp_ask_symptoms(struct mptses *mpte)
6455 {
6456 	struct mptcp_symptoms_ask_uuid ask;
6457 	struct socket *mp_so;
6458 	struct proc *p = PROC_NULL;
6459 	int pid, prio, err;
6460 
6461 	if (mptcp_kern_skt_unit == 0) {
6462 		os_log_error(mptcp_log_handle, "%s - %lx: skt_unit is still 0\n",
6463 		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
6464 		return;
6465 	}
6466 
6467 	mp_so = mptetoso(mpte);
6468 
6469 	if (mp_so->so_flags & SOF_DELEGATED) {
6470 		if (mpte->mpte_epid != 0) {
6471 			p = proc_find(mpte->mpte_epid);
6472 			if (p != PROC_NULL) {
6473 				/* We found a pid, check its UUID */
6474 				if (uuid_compare(mp_so->e_uuid, proc_executableuuid_addr(p))) {
6475 					/* It's not the same - we need to look for the real proc */
6476 					proc_rele(p);
6477 					p = PROC_NULL;
6478 				}
6479 			}
6480 		}
6481 
6482 		if (p == PROC_NULL) {
6483 			p = mptcp_find_proc(mp_so->e_uuid);
6484 			if (p == PROC_NULL) {
6485 				uuid_string_t uuid_string;
6486 				uuid_unparse(mp_so->e_uuid, uuid_string);
6487 
6488 				os_log_error(mptcp_log_handle, "%s - %lx: Couldn't find proc for uuid %s\n",
6489 				    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), uuid_string);
6490 
6491 				return;
6492 			}
6493 			mpte->mpte_epid = proc_pid(p);
6494 		}
6495 
6496 		pid = mpte->mpte_epid;
6497 		uuid_copy(ask.uuid, mp_so->e_uuid);
6498 	} else {
6499 		pid = mp_so->last_pid;
6500 
6501 		p = proc_find(pid);
6502 		if (p == PROC_NULL) {
6503 			os_log_error(mptcp_log_handle, "%s - %lx: Couldn't find proc for pid %u\n",
6504 			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), pid);
6505 			return;
6506 		}
6507 
6508 		uuid_copy(ask.uuid, mp_so->last_uuid);
6509 	}
6510 
6511 
6512 	ask.cmd = MPTCP_SYMPTOMS_ASK_UUID;
6513 
6514 	prio = proc_get_effective_task_policy(proc_task(p), TASK_POLICY_ROLE);
6515 
6516 	if (prio == TASK_BACKGROUND_APPLICATION || prio == TASK_NONUI_APPLICATION ||
6517 	    prio == TASK_DARWINBG_APPLICATION) {
6518 		ask.priority = MPTCP_SYMPTOMS_BACKGROUND;
6519 	} else if (prio == TASK_FOREGROUND_APPLICATION) {
6520 		ask.priority = MPTCP_SYMPTOMS_FOREGROUND;
6521 	} else {
6522 		ask.priority = MPTCP_SYMPTOMS_UNKNOWN;
6523 	}
6524 
6525 	err = ctl_enqueuedata(mptcp_kern_ctrl_ref, mptcp_kern_skt_unit,
6526 	    &ask, sizeof(ask), CTL_DATA_EOR);
6527 
6528 	os_log(mptcp_log_handle, "%s - %lx: asked symptoms about pid %u, taskprio %u, prio %u, err %d\n",
6529 	    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), pid, prio, ask.priority, err);
6530 
6531 
6532 	proc_rele(p);
6533 }
6534 
/*
 * Kernel-control disconnect callback: drop the in-use count taken in
 * mptcp_symptoms_ctl_connect().
 */
static errno_t
mptcp_symptoms_ctl_disconnect(kern_ctl_ref kctlref, u_int32_t kcunit,
    void *unitinfo)
{
#pragma unused(kctlref, kcunit, unitinfo)

	OSDecrementAtomic(&mptcp_kern_skt_inuse);

	return 0;
}
6545 
6546 static errno_t
mptcp_symptoms_ctl_send(kern_ctl_ref kctlref,u_int32_t kcunit,void * unitinfo,mbuf_t m,int flags)6547 mptcp_symptoms_ctl_send(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo,
6548     mbuf_t m, int flags)
6549 {
6550 #pragma unused(kctlref, unitinfo, flags)
6551 	symptoms_advisory_t *sa = NULL;
6552 
6553 	if (kcunit != mptcp_kern_skt_unit) {
6554 		os_log_error(mptcp_log_handle, "%s: kcunit %u is different from expected one %u\n",
6555 		    __func__, kcunit, mptcp_kern_skt_unit);
6556 	}
6557 
6558 	if (mbuf_pkthdr_len(m) < sizeof(*sa)) {
6559 		mbuf_freem(m);
6560 		return EINVAL;
6561 	}
6562 
6563 	if (mbuf_len(m) < sizeof(*sa)) {
6564 		os_log_error(mptcp_log_handle, "%s: mbuf is %lu but need %lu\n",
6565 		    __func__, mbuf_len(m), sizeof(*sa));
6566 		mbuf_freem(m);
6567 		return EINVAL;
6568 	}
6569 
6570 	sa = mtod(m, void *);
6571 
6572 	if (sa->sa_nwk_status != SYMPTOMS_ADVISORY_USEAPP) {
6573 		os_log(mptcp_log_handle, "%s: wifi new,old: %d,%d, cell new, old: %d,%d\n", __func__,
6574 		    sa->sa_wifi_status, mptcp_advisory.sa_wifi_status,
6575 		    sa->sa_cell_status, mptcp_advisory.sa_cell_status);
6576 
6577 		if (sa->sa_wifi_status != mptcp_advisory.sa_wifi_status) {
6578 			mptcp_advisory.sa_wifi_status = sa->sa_wifi_status;
6579 			mptcp_wifi_status_changed();
6580 		}
6581 	} else {
6582 		struct mptcp_symptoms_answer answer;
6583 		errno_t err;
6584 
6585 		/* We temporarily allow different sizes for ease of submission */
6586 		if (mbuf_len(m) != sizeof(uuid_t) + sizeof(*sa) &&
6587 		    mbuf_len(m) != sizeof(answer)) {
6588 			os_log_error(mptcp_log_handle, "%s: mbuf is %lu but need %lu or %lu\n",
6589 			    __func__, mbuf_len(m), sizeof(uuid_t) + sizeof(*sa),
6590 			    sizeof(answer));
6591 			mbuf_free(m);
6592 			return EINVAL;
6593 		}
6594 
6595 		memset(&answer, 0, sizeof(answer));
6596 
6597 		err = mbuf_copydata(m, 0, mbuf_len(m), &answer);
6598 		if (err) {
6599 			os_log_error(mptcp_log_handle, "%s: mbuf_copydata returned %d\n", __func__, err);
6600 			mbuf_free(m);
6601 			return err;
6602 		}
6603 
6604 		mptcp_allow_uuid(answer.uuid, answer.rssi);
6605 	}
6606 
6607 	mbuf_freem(m);
6608 	return 0;
6609 }
6610 
6611 void
mptcp_control_register(void)6612 mptcp_control_register(void)
6613 {
6614 	/* Set up the advisory control socket */
6615 	struct kern_ctl_reg mptcp_kern_ctl;
6616 
6617 	bzero(&mptcp_kern_ctl, sizeof(mptcp_kern_ctl));
6618 	strlcpy(mptcp_kern_ctl.ctl_name, MPTCP_KERN_CTL_NAME,
6619 	    sizeof(mptcp_kern_ctl.ctl_name));
6620 	mptcp_kern_ctl.ctl_connect = mptcp_symptoms_ctl_connect;
6621 	mptcp_kern_ctl.ctl_disconnect = mptcp_symptoms_ctl_disconnect;
6622 	mptcp_kern_ctl.ctl_send = mptcp_symptoms_ctl_send;
6623 	mptcp_kern_ctl.ctl_flags = CTL_FLAG_PRIVILEGED;
6624 
6625 	(void)ctl_register(&mptcp_kern_ctl, &mptcp_kern_ctrl_ref);
6626 }
6627 
/*
 * Classify the current Wi-Fi quality for this session, combining the
 * Symptoms advisory with the session's entitlement and service type.
 */
mptcp_wifi_quality_t
mptcp_wifi_quality_for_session(struct mptses *mpte)
{
	if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
		/* First-party, non-handover: trust the advisory directly. */
		if (mpte->mpte_svctype != MPTCP_SVCTYPE_HANDOVER &&
		    mptcp_advisory.sa_wifi_status) {
			return symptoms_is_wifi_lossy() ? MPTCP_WIFI_QUALITY_BAD : MPTCP_WIFI_QUALITY_GOOD;
		}

		/*
		 * If it's a first-party app and we don't have any info
		 * about the Wi-Fi state, let's be pessimistic.
		 */
		return MPTCP_WIFI_QUALITY_UNSURE;
	} else {
		if (symptoms_is_wifi_lossy()) {
			return MPTCP_WIFI_QUALITY_BAD;
		}

		/*
		 * If we are target-based (meaning, we allow to be more lax on
		 * the when wifi is considered bad), we only *know* about the state once
		 * we got the allowance from Symptoms (MPTE_ACCESS_GRANTED).
		 *
		 * If RSSI is not bad enough, MPTE_CELL_PROHIBITED will then
		 * be set.
		 *
		 * In any other case (while in target-mode), consider WiFi bad
		 * and we are going to ask for allowance from Symptoms anyway.
		 */
		if (mpte->mpte_svctype == MPTCP_SVCTYPE_TARGET_BASED) {
			if (mpte->mpte_flags & MPTE_ACCESS_GRANTED &&
			    mpte->mpte_flags & MPTE_CELL_PROHIBITED) {
				return MPTCP_WIFI_QUALITY_GOOD;
			}

			return MPTCP_WIFI_QUALITY_BAD;
		}

		return MPTCP_WIFI_QUALITY_GOOD;
	}
}
6670 
6671 boolean_t
symptoms_is_wifi_lossy(void)6672 symptoms_is_wifi_lossy(void)
6673 {
6674 	return (mptcp_advisory.sa_wifi_status & SYMPTOMS_ADVISORY_WIFI_OK) ? false : true;
6675 }
6676 
6677 int
mptcp_freeq(struct mptcb * mp_tp)6678 mptcp_freeq(struct mptcb *mp_tp)
6679 {
6680 	struct protosw *proto = mptetoso(mp_tp->mpt_mpte)->so_proto;
6681 	struct tseg_qent *q;
6682 	int count = 0;
6683 	int rv = 0;
6684 
6685 	while ((q = LIST_FIRST(&mp_tp->mpt_segq)) != NULL) {
6686 		LIST_REMOVE(q, tqe_q);
6687 		m_freem(q->tqe_m);
6688 		tcp_reass_qent_free(proto, q);
6689 		count++;
6690 		rv = 1;
6691 	}
6692 	mp_tp->mpt_reassqlen = 0;
6693 
6694 	if (count > 0) {
6695 		OSAddAtomic(-count, &mptcp_reass_total_qlen);
6696 	}
6697 
6698 	return rv;
6699 }
6700 
6701 static int
mptcp_post_event(u_int32_t event_code,int value)6702 mptcp_post_event(u_int32_t event_code, int value)
6703 {
6704 	struct kev_mptcp_data event_data;
6705 	struct kev_msg ev_msg;
6706 
6707 	memset(&ev_msg, 0, sizeof(ev_msg));
6708 
6709 	ev_msg.vendor_code      = KEV_VENDOR_APPLE;
6710 	ev_msg.kev_class        = KEV_NETWORK_CLASS;
6711 	ev_msg.kev_subclass     = KEV_MPTCP_SUBCLASS;
6712 	ev_msg.event_code       = event_code;
6713 
6714 	event_data.value = value;
6715 
6716 	ev_msg.dv[0].data_ptr    = &event_data;
6717 	ev_msg.dv[0].data_length = sizeof(event_data);
6718 
6719 	return kev_post_msg(&ev_msg);
6720 }
6721 
/*
 * Account a cell-using subflow and, if this is the first user system-wide,
 * post the event that turns on the cellular data icon. A global refcount
 * (mptcp_cellicon_refcount) tracks all cell-using subflows; per-session
 * increments are tracked in mpte_cellicon_increments for teardown.
 */
static void
mptcp_set_cellicon(struct mptses *mpte, struct mptsub *mpts)
{
	struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
	int error;

	/* First-party apps (Siri) don't flip the cellicon */
	if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
		return;
	}

	/* Subflow is disappearing - don't set it on this one */
	if (mpts->mpts_flags & (MPTSF_DISCONNECTING | MPTSF_DISCONNECTED)) {
		return;
	}

	/* Fallen back connections are not triggering the cellicon */
	if (mpte->mpte_mptcb->mpt_flags & MPTCPF_FALLBACK_TO_TCP) {
		return;
	}

	/* Remember the last time we set the cellicon. Needed for debouncing */
	mpte->mpte_last_cellicon_set = tcp_now;

	/* Arm the subflow timer that re-evaluates the icon after the toggle rate. */
	tp->t_timer[TCPT_CELLICON] = tcp_offset_from_start(tp,
	    MPTCP_CELLICON_TOGGLE_RATE);
	tcp_sched_timers(tp);

	if (mpts->mpts_flags & MPTSF_CELLICON_SET &&
	    mpte->mpte_cellicon_increments != 0) {
		if (mptcp_cellicon_refcount == 0) {
			os_log_error(mptcp_log_handle, "%s - %lx: Cell should be set (count is %u), but it's zero!\n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpte->mpte_cellicon_increments);

			/* Continue, so that the icon gets set... */
		} else {
			/*
			 * In this case, the cellicon is already set. No need to bump it
			 * even higher
			 */

			return;
		}
	}

	/* When tearing down this subflow, we need to decrement the
	 * reference counter
	 */
	mpts->mpts_flags |= MPTSF_CELLICON_SET;

	/* This counter, so that when a session gets destroyed we decrement
	 * the reference counter by whatever is left
	 */
	mpte->mpte_cellicon_increments++;

	/* OSIncrementAtomic returns the previous value; non-zero means already on. */
	if (OSIncrementAtomic(&mptcp_cellicon_refcount)) {
		/* If cellicon is already set, get out of here! */
		return;
	}

	error = mptcp_post_event(KEV_MPTCP_CELLUSE, 1);

	if (error) {
		os_log_error(mptcp_log_handle, "%s - %lx: Setting cellicon failed with %d\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error);
	} else {
		os_log(mptcp_log_handle, "%s - %lx: successfully set the cellicon\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
	}
}
6792 
6793 void
mptcp_clear_cellicon(void)6794 mptcp_clear_cellicon(void)
6795 {
6796 	int error = mptcp_post_event(KEV_MPTCP_CELLUSE, 0);
6797 
6798 	if (error) {
6799 		os_log_error(mptcp_log_handle, "%s: Unsetting cellicon failed with %d\n",
6800 		    __func__, error);
6801 	} else {
6802 		os_log(mptcp_log_handle, "%s: successfully unset the cellicon\n",
6803 		    __func__);
6804 	}
6805 }
6806 
6807 /*
6808  * Returns true if the icon has been flipped to WiFi.
6809  */
static boolean_t
__mptcp_unset_cellicon(uint32_t val)
{
	VERIFY(val < INT32_MAX);
	/*
	 * OSAddAtomic returns the value *before* the subtraction, so only
	 * the 1 -> 0 transition clears the icon.
	 * NOTE(review): with val > 1 the counter can reach (or cross) zero
	 * while the previous value was != 1, in which case the icon is not
	 * cleared here - presumably callers guarantee val matches the
	 * outstanding increments; confirm against mptcp_unset_cellicon().
	 */
	if (OSAddAtomic((int32_t)-val, &mptcp_cellicon_refcount) != 1) {
		return false;
	}

	mptcp_clear_cellicon();

	return true;
}
6822 
/*
 * Release `val' cell-icon references held by this session (optionally tied
 * to subflow `mpts'); clears the icon when the global refcount drops to 0.
 */
void
mptcp_unset_cellicon(struct mptses *mpte, struct mptsub *mpts, uint32_t val)
{
	/* First-party apps (Siri) don't flip the cellicon */
	if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
		return;
	}

	if (mpte->mpte_cellicon_increments == 0) {
		/* This flow never used cell - get out of here! */
		return;
	}

	if (mptcp_cellicon_refcount == 0) {
		os_log_error(mptcp_log_handle, "%s - %lx: Cell is off, but should be at least %u\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpte->mpte_cellicon_increments);

		return;
	}

	if (mpts) {
		/* Only decrement for a subflow that actually holds a reference. */
		if (!(mpts->mpts_flags & MPTSF_CELLICON_SET)) {
			return;
		}

		mpts->mpts_flags &= ~MPTSF_CELLICON_SET;
	}

	/* Never release more references than this session is holding. */
	if (mpte->mpte_cellicon_increments < val) {
		os_log_error(mptcp_log_handle, "%s - %lx: Increments is %u but want to dec by %u.\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpte->mpte_cellicon_increments, val);
		val = mpte->mpte_cellicon_increments;
	}

	mpte->mpte_cellicon_increments -= val;

	if (__mptcp_unset_cellicon(val) == false) {
		return;
	}

	/* All flows are gone - our counter should be at zero too! */
	if (mpte->mpte_cellicon_increments != 0) {
		os_log_error(mptcp_log_handle, "%s - %lx: Inconsistent state! Cell refcount is zero but increments are at %u\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpte->mpte_cellicon_increments);
	}
}
6869 
6870 void
mptcp_reset_rexmit_state(struct tcpcb * tp)6871 mptcp_reset_rexmit_state(struct tcpcb *tp)
6872 {
6873 	struct mptsub *mpts;
6874 	struct inpcb *inp;
6875 	struct socket *so;
6876 
6877 	inp = tp->t_inpcb;
6878 	if (inp == NULL) {
6879 		return;
6880 	}
6881 
6882 	so = inp->inp_socket;
6883 	if (so == NULL) {
6884 		return;
6885 	}
6886 
6887 	if (!(so->so_flags & SOF_MP_SUBFLOW)) {
6888 		return;
6889 	}
6890 
6891 	mpts = tp->t_mpsub;
6892 
6893 	mpts->mpts_flags &= ~MPTSF_WRITE_STALL;
6894 	so->so_flags &= ~SOF_MP_TRYFAILOVER;
6895 }
6896 
6897 void
mptcp_reset_keepalive(struct tcpcb * tp)6898 mptcp_reset_keepalive(struct tcpcb *tp)
6899 {
6900 	struct mptsub *mpts = tp->t_mpsub;
6901 
6902 	mpts->mpts_flags &= ~MPTSF_READ_STALL;
6903 }
6904 
/* Allocate the combined mpp_mtp container and return its embedded mppcb. */
static struct mppcb *
mtcp_alloc(void)
{
	return &kalloc_type(struct mpp_mtp, Z_WAITOK | Z_ZERO | Z_NOFAIL)->mpp;
}
6910 
/* Free an mppcb allocated by mtcp_alloc() by recovering its container. */
static void
mtcp_free(struct mppcb *mpp)
{
	struct mpp_mtp *mtp = __container_of(mpp, struct mpp_mtp, mpp);

	kfree_type(struct mpp_mtp, mtp);
}
6918 
/*
 * Protocol pr_init callback.
 *
 * One-time MPTCP initialization: builds shadow protosw/usrreqs entries for
 * TCP subflows (v4 and v6), sets up the MPTCP PCB info (mtcbinfo), and
 * attaches it to the MP domain so garbage collection runs.  Idempotent via
 * an atomic first-caller guard.
 */
void
mptcp_init(struct protosw *pp, struct domain *dp)
{
#pragma unused(dp)
	static int mptcp_initialized = 0;
	struct protosw *prp;
	struct ip6protosw *prp6;

	VERIFY((pp->pr_flags & (PR_INITIALIZED | PR_ATTACHED)) == PR_ATTACHED);

	/* do this only once */
	if (!os_atomic_cmpxchg(&mptcp_initialized, 0, 1, relaxed)) {
		return;
	}

	/* Until Symptoms reports otherwise, assume Wi-Fi is usable. */
	mptcp_advisory.sa_wifi_status = SYMPTOMS_ADVISORY_WIFI_OK;

	/*
	 * Since PF_MULTIPATH gets initialized after PF_INET/INET6,
	 * we must be able to find IPPROTO_TCP entries for both.
	 */
	prp = pffindproto_locked(PF_INET, IPPROTO_TCP, SOCK_STREAM);
	VERIFY(prp != NULL);
	/*
	 * Shadow the real TCP protosw/usrreqs, then override the receive,
	 * send, and OOB entry points so subflow sockets are steered through
	 * the MPTCP-aware implementations.
	 */
	bcopy(prp, &mptcp_subflow_protosw, sizeof(*prp));
	bcopy(prp->pr_usrreqs, &mptcp_subflow_usrreqs,
	    sizeof(mptcp_subflow_usrreqs));
	/* Shadow copy is not on any protocol list; clear stale linkage. */
	mptcp_subflow_protosw.pr_entry.tqe_next = NULL;
	mptcp_subflow_protosw.pr_entry.tqe_prev = NULL;
	mptcp_subflow_protosw.pr_usrreqs = &mptcp_subflow_usrreqs;
	mptcp_subflow_usrreqs.pru_soreceive = mptcp_subflow_soreceive;
	mptcp_subflow_usrreqs.pru_sosend = mptcp_subflow_sosend;
	mptcp_subflow_usrreqs.pru_rcvoob = pru_rcvoob_notsupp;
	/*
	 * Socket filters shouldn't attach/detach to/from this protosw
	 * since pr_protosw is to be used instead, which points to the
	 * real protocol; if they do, it is a bug and we should panic.
	 */
	mptcp_subflow_protosw.pr_filter_head.tqh_first =
	    __unsafe_forge_single(struct socket_filter *, 0xdeadbeefdeadbeef);
	mptcp_subflow_protosw.pr_filter_head.tqh_last =
	    __unsafe_forge_single(struct socket_filter **, 0xdeadbeefdeadbeef);

	/* Same shadowing dance for the IPv6 TCP entry. */
	prp6 = (struct ip6protosw *)pffindproto_locked(PF_INET6,
	    IPPROTO_TCP, SOCK_STREAM);
	VERIFY(prp6 != NULL);
	bcopy(prp6, &mptcp_subflow_protosw6, sizeof(*prp6));
	bcopy(prp6->pr_usrreqs, &mptcp_subflow_usrreqs6,
	    sizeof(mptcp_subflow_usrreqs6));
	mptcp_subflow_protosw6.pr_entry.tqe_next = NULL;
	mptcp_subflow_protosw6.pr_entry.tqe_prev = NULL;
	mptcp_subflow_protosw6.pr_usrreqs = &mptcp_subflow_usrreqs6;
	mptcp_subflow_usrreqs6.pru_soreceive = mptcp_subflow_soreceive;
	mptcp_subflow_usrreqs6.pru_sosend = mptcp_subflow_sosend;
	mptcp_subflow_usrreqs6.pru_rcvoob = pru_rcvoob_notsupp;
	/*
	 * Socket filters shouldn't attach/detach to/from this protosw
	 * since pr_protosw is to be used instead, which points to the
	 * real protocol; if they do, it is a bug and we should panic.
	 */
	mptcp_subflow_protosw6.pr_filter_head.tqh_first =
	    __unsafe_forge_single(struct socket_filter *, 0xdeadbeefdeadbeef);
	mptcp_subflow_protosw6.pr_filter_head.tqh_last =
	    __unsafe_forge_single(struct socket_filter **, 0xdeadbeefdeadbeef);

	/* MPTCP PCB info: alloc/free hooks and the list of MPTCP PCBs. */
	bzero(&mtcbinfo, sizeof(mtcbinfo));
	TAILQ_INIT(&mtcbinfo.mppi_pcbs);
	mtcbinfo.mppi_alloc = mtcp_alloc;
	mtcbinfo.mppi_free  = mtcp_free;

	mtcbinfo.mppi_lock_grp = lck_grp_alloc_init("mppcb", LCK_GRP_ATTR_NULL);
	lck_attr_setdefault(&mtcbinfo.mppi_lock_attr);
	lck_mtx_init(&mtcbinfo.mppi_lock, mtcbinfo.mppi_lock_grp,
	    &mtcbinfo.mppi_lock_attr);

	mtcbinfo.mppi_gc = mptcp_gc;
	mtcbinfo.mppi_timer = mptcp_timer;

	/* attach to MP domain for garbage collection to take place */
	mp_pcbinfo_attach(&mtcbinfo);

	mptcp_log_handle = os_log_create("com.apple.xnu.net.mptcp", "mptcp");
}
7004