xref: /xnu-8020.121.3/bsd/netinet/mptcp.c (revision fdd8201d7b966f0c3ea610489d29bd841d358941)
1 /*
2  * Copyright (c) 2012-2021 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 /*
30  * A note on the MPTCP/NECP-interactions:
31  *
32  * MPTCP uses NECP-callbacks to get notified of interface/policy events.
33  * MPTCP registers to these events at the MPTCP-layer for interface-events
34  * through a call to necp_client_register_multipath_cb.
35  * To get per-flow events (aka per TCP-subflow), we register to it with
36  * necp_client_register_socket_flow. Both registrations happen by using the
37  * necp-client-uuid that comes from the app.
38  *
39  * The locking is rather tricky. In general, we expect the lock-ordering to
40  * happen from necp-fd -> necp-client -> mpp_lock.
41  *
42  * There are however some subtleties.
43  *
44  * 1. When registering the multipath_cb, we are holding the mpp_lock. This is
45  * safe, because it is the very first time this MPTCP-connection goes into NECP.
46  * As we go into NECP we take the NECP-locks and thus are guaranteed that no
47  * NECP-locks will deadlock us. Because these NECP-events will also first take
48  * the NECP-locks. Either they win the race and thus won't find our
49  * MPTCP-connection. Or, MPTCP wins the race and thus it will safely install
50  * the callbacks while holding the NECP lock.
51  *
52  * 2. When registering the subflow-callbacks we must unlock the mpp_lock. This,
53  * because we have already registered callbacks and we might race against an
54  * NECP-event that will match on our socket. So, we have to unlock to be safe.
55  *
56  * 3. When removing the multipath_cb, we do it in mp_pcbdispose(). The
57  * so_usecount has reached 0. We must be careful to not remove the mpp_socket
58  * pointers before we unregistered the callback. Because, again we might be
59  * racing against an NECP-event. Unregistering must happen with an unlocked
60  * mpp_lock, because of the lock-ordering constraint. It could be that
61  * before we had a chance to unregister an NECP-event triggers. That's why
62  * we need to check for the so_usecount in mptcp_session_necp_cb. If we get
63  * there while the socket is being garbage-collected, the use-count will go
64  * down to 0 and we exit. Removal of the multipath_cb again happens by taking
65  * the NECP-locks so any running NECP-events will finish first and exit cleanly.
66  *
67  * 4. When removing the subflow-callback, we do it in in_pcbdispose(). Again,
68  * the socket-lock must be unlocked for lock-ordering constraints. This gets a
69  * bit tricky here, as in tcp_garbage_collect we hold the mp_so and so lock.
70  * So, we drop the mp_so-lock as soon as the subflow is unlinked with
71  * mptcp_subflow_del. Then, in in_pcbdispose we drop the subflow-lock.
72  * If an NECP-event was waiting on the lock in mptcp_subflow_necp_cb, when it
73  * gets it, it will realize that the subflow became non-MPTCP and retry (see
74  * tcp_lock). Then it waits again on the subflow-lock. When we drop this lock
75  * in in_pcbdispose, and enter necp_inpcb_dispose, this one will have to wait
76  * for the NECP-lock (held by the other thread that is taking care of the NECP-
77  * event). So, the event now finally gets the subflow-lock and then hits an
78  * so_usecount that is 0 and exits. Eventually, we can remove the subflow from
79  * the NECP callback.
80  */
81 
82 #include <sys/param.h>
83 #include <sys/systm.h>
84 #include <sys/kernel.h>
85 #include <sys/mbuf.h>
86 #include <sys/mcache.h>
87 #include <sys/socket.h>
88 #include <sys/socketvar.h>
89 #include <sys/syslog.h>
90 #include <sys/protosw.h>
91 
92 #include <kern/zalloc.h>
93 #include <kern/locks.h>
94 
95 #include <mach/sdt.h>
96 
97 #include <net/if.h>
98 #include <netinet/in.h>
99 #include <netinet/in_var.h>
100 #include <netinet/tcp.h>
101 #include <netinet/tcp_fsm.h>
102 #include <netinet/tcp_seq.h>
103 #include <netinet/tcp_var.h>
104 #include <netinet/mptcp_var.h>
105 #include <netinet/mptcp.h>
106 #include <netinet/mptcp_seq.h>
107 #include <netinet/mptcp_opt.h>
108 #include <netinet/mptcp_timer.h>
109 
/*
 * Global MPTCP switch, exported read-write as net.inet.mptcp.enable.
 * Its consumers are outside this part of the file.
 */
110 int mptcp_enable = 1;
111 SYSCTL_INT(_net_inet_mptcp, OID_AUTO, enable, CTLFLAG_RW | CTLFLAG_LOCKED,
112     &mptcp_enable, 0, "Enable Multipath TCP Support");
113 
114 /*
115  * Number of times to try negotiating MPTCP on SYN retransmissions.
116  * We haven't seen any reports of a middlebox that is dropping all SYN-segments
117  * that have an MPTCP-option. Thus, let's be generous and retransmit it 4 times.
118  */
119 int mptcp_mpcap_retries = 4;
120 SYSCTL_INT(_net_inet_mptcp, OID_AUTO, mptcp_cap_retr,
121     CTLFLAG_RW | CTLFLAG_LOCKED,
122     &mptcp_mpcap_retries, 0, "Number of MP Capable SYN Retries");
123 
124 /*
125  * By default, DSS checksum is turned off, revisit if we ever do
126  * MPTCP for non SSL Traffic.
127  */
128 int mptcp_dss_csum = 0;
129 SYSCTL_INT(_net_inet_mptcp, OID_AUTO, dss_csum, CTLFLAG_RW | CTLFLAG_LOCKED,
130     &mptcp_dss_csum, 0, "Enable DSS checksum");
131 
132 /*
133  * When mptcp_fail_thresh number of retransmissions are sent, subflow failover
134  * is attempted on a different path.
135  */
/*
 * mptcp_subflow_is_slow() compares a subflow's t_rxtshift against this
 * value, doubling it for the two handover service types.
 */
136 int mptcp_fail_thresh = 1;
137 SYSCTL_INT(_net_inet_mptcp, OID_AUTO, fail, CTLFLAG_RW | CTLFLAG_LOCKED,
138     &mptcp_fail_thresh, 0, "Failover threshold");
139 
140 /*
141  * MPTCP subflows have TCP keepalives set to ON. Set a conservative keeptime
142  * as carrier networks mostly have a 30 minute to 60 minute NAT Timeout.
143  * Some carrier networks have a timeout of 10 or 15 minutes.
144  */
145 int mptcp_subflow_keeptime = 60 * 14;
146 SYSCTL_INT(_net_inet_mptcp, OID_AUTO, keepalive, CTLFLAG_RW | CTLFLAG_LOCKED,
147     &mptcp_subflow_keeptime, 0, "Keepalive in seconds");
148 
/*
 * Smoothed-RTT threshold used by mptcp_get_subflow() for the interactive
 * service type: the value is shifted left by TCP_RTT_SHIFT and compared
 * against the t_srtt of the two candidate subflows.
 */
149 int mptcp_rtthist_rtthresh = 600;
150 SYSCTL_INT(_net_inet_mptcp, OID_AUTO, rtthist_thresh, CTLFLAG_RW | CTLFLAG_LOCKED,
151     &mptcp_rtthist_rtthresh, 0, "Rtt threshold");
152 
/*
 * Retransmit-timeout threshold used by mptcp_get_subflow() for the
 * interactive service type: compared against the t_rxtcur of the two
 * candidate subflows.
 */
153 int mptcp_rtothresh = 1500;
154 SYSCTL_INT(_net_inet_mptcp, OID_AUTO, rto_thresh, CTLFLAG_RW | CTLFLAG_LOCKED,
155     &mptcp_rtothresh, 0, "RTO threshold");
156 
157 /*
158  * Probe the preferred path, when it is not in use
159  */
/*
 * mptcp_output() probes the preferred subflow once
 * (tcp_now - mpts_probesoon) exceeds this value; 0 disables probing.
 */
160 uint32_t mptcp_probeto = 1000;
161 SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, probeto, CTLFLAG_RW | CTLFLAG_LOCKED,
162     &mptcp_probeto, 0, "Disable probing by setting to 0");
163 
/*
 * Once the preferred subflow's mpts_probecnt reaches this value,
 * mptcp_output() resets that subflow's probe state.
 */
164 uint32_t mptcp_probecnt = 5;
165 SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, probecnt, CTLFLAG_RW | CTLFLAG_LOCKED,
166     &mptcp_probecnt, 0, "Number of probe writes");
167 
/*
 * Exported read-write as net.inet.mptcp.enable_v1 ("Enable or disable v1").
 * Its consumers are outside this part of the file.
 */
168 uint32_t mptcp_enable_v1 = 1;
169 SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, enable_v1, CTLFLAG_RW | CTLFLAG_LOCKED,
170     &mptcp_enable_v1, 0, "Enable or disable v1");
171 
172 static int
173 sysctl_mptcp_version_check SYSCTL_HANDLER_ARGS
174 {
175 #pragma unused(arg1, arg2)
176 	int error;
177 	int new_value = *(int *)oidp->oid_arg1;
178 	int old_value = *(int *)oidp->oid_arg1;
179 
180 	error = sysctl_handle_int(oidp, &new_value, 0, req);
181 	if (!error) {
182 		if (new_value != MPTCP_VERSION_0 && new_value != MPTCP_VERSION_1) {
183 			return EINVAL;
184 		}
185 		*(int *)oidp->oid_arg1 = new_value;
186 	}
187 
188 	os_log(OS_LOG_DEFAULT,
189 	    "%s:%u sysctl net.inet.tcp.mptcp_preferred_version: %d -> %d)",
190 	    proc_best_name(current_proc()), proc_selfpid(),
191 	    old_value, *(int *)oidp->oid_arg1);
192 
193 	return error;
194 }
195 
/*
 * Preferred MPTCP protocol version.  Exposed under net.inet.tcp (not
 * net.inet.mptcp) through sysctl_mptcp_version_check(), which only accepts
 * MPTCP_VERSION_0 or MPTCP_VERSION_1.  The sysctl description string is
 * empty.
 */
196 int mptcp_preferred_version = MPTCP_VERSION_0;
197 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, mptcp_preferred_version,
198     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
199     &mptcp_preferred_version, 0, &sysctl_mptcp_version_check, "I", "");
200 
/*
 * Read-only counter of entries currently held in MPTCP reassembly queues.
 * mptcp_reass() and mptcp_reass_present() adjust it with the OS*Atomic
 * helpers.
 */
201 int mptcp_reass_total_qlen = 0;
202 SYSCTL_INT(_net_inet_mptcp, OID_AUTO, reass_qlen,
203     CTLFLAG_RD | CTLFLAG_LOCKED, &mptcp_reass_total_qlen, 0,
204     "Total number of MPTCP segments in reassembly queues");
205 
206 static int
mptcp_reass_present(struct socket * mp_so)207 mptcp_reass_present(struct socket *mp_so)
208 {
209 	struct mptses *mpte = mpsotompte(mp_so);
210 	struct mptcb *mp_tp = mpte->mpte_mptcb;
211 	struct tseg_qent *q;
212 	int dowakeup = 0;
213 	int flags = 0;
214 	int count = 0;
215 
216 	/*
217 	 * Present data to user, advancing rcv_nxt through
218 	 * completed sequence space.
219 	 */
220 	if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
221 		return flags;
222 	}
223 	q = LIST_FIRST(&mp_tp->mpt_segq);
224 	if (!q || q->tqe_m->m_pkthdr.mp_dsn != mp_tp->mpt_rcvnxt) {
225 		return flags;
226 	}
227 
228 	/*
229 	 * If there is already another thread doing reassembly for this
230 	 * connection, it is better to let it finish the job --
231 	 * (radar 16316196)
232 	 */
233 	if (mp_tp->mpt_flags & MPTCPF_REASS_INPROG) {
234 		return flags;
235 	}
236 
237 	mp_tp->mpt_flags |= MPTCPF_REASS_INPROG;
238 
239 	do {
240 		mp_tp->mpt_rcvnxt += q->tqe_len;
241 		LIST_REMOVE(q, tqe_q);
242 		if (mp_so->so_state & SS_CANTRCVMORE) {
243 			m_freem(q->tqe_m);
244 		} else {
245 			flags = !!(q->tqe_m->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN);
246 			if (sbappendstream_rcvdemux(mp_so, q->tqe_m)) {
247 				dowakeup = 1;
248 			}
249 		}
250 		zfree(tcp_reass_zone, q);
251 		mp_tp->mpt_reassqlen--;
252 		count++;
253 		q = LIST_FIRST(&mp_tp->mpt_segq);
254 	} while (q && q->tqe_m->m_pkthdr.mp_dsn == mp_tp->mpt_rcvnxt);
255 	mp_tp->mpt_flags &= ~MPTCPF_REASS_INPROG;
256 
257 	if (count > 0) {
258 		OSAddAtomic(-count, &mptcp_reass_total_qlen);
259 	}
260 	if (dowakeup) {
261 		sorwakeup(mp_so); /* done with socket lock held */
262 	}
263 	return flags;
264 }
265 
266 static int
mptcp_reass(struct socket * mp_so,struct pkthdr * phdr,int * tlenp,struct mbuf * m)267 mptcp_reass(struct socket *mp_so, struct pkthdr *phdr, int *tlenp, struct mbuf *m)
268 {
269 	struct mptcb *mp_tp = mpsotomppcb(mp_so)->mpp_pcbe->mpte_mptcb;
270 	u_int64_t mb_dsn = phdr->mp_dsn;
271 	struct tseg_qent *q;
272 	struct tseg_qent *p = NULL;
273 	struct tseg_qent *nq;
274 	struct tseg_qent *te = NULL;
275 	uint32_t qlimit;
276 
277 	/*
278 	 * Limit the number of segments in the reassembly queue to prevent
279 	 * holding on to too many segments (and thus running out of mbufs).
280 	 * Make sure to let the missing segment through which caused this
281 	 * queue.  Always keep one global queue entry spare to be able to
282 	 * process the missing segment.
283 	 */
284 	qlimit = MIN(MAX(100, mp_so->so_rcv.sb_hiwat >> 10),
285 	    (tcp_autorcvbuf_max >> 10));
286 	if (mb_dsn != mp_tp->mpt_rcvnxt &&
287 	    (mp_tp->mpt_reassqlen + 1) >= qlimit) {
288 		tcpstat.tcps_mptcp_rcvmemdrop++;
289 		m_freem(m);
290 		*tlenp = 0;
291 		return 0;
292 	}
293 
294 	/* Allocate a new queue entry. If we can't, just drop the pkt. XXX */
295 	te = zalloc_flags(tcp_reass_zone, Z_WAITOK | Z_NOFAIL);
296 
297 	mp_tp->mpt_reassqlen++;
298 	OSIncrementAtomic(&mptcp_reass_total_qlen);
299 
300 	/*
301 	 * Find a segment which begins after this one does.
302 	 */
303 	LIST_FOREACH(q, &mp_tp->mpt_segq, tqe_q) {
304 		if (MPTCP_SEQ_GT(q->tqe_m->m_pkthdr.mp_dsn, mb_dsn)) {
305 			break;
306 		}
307 		p = q;
308 	}
309 
310 	/*
311 	 * If there is a preceding segment, it may provide some of
312 	 * our data already.  If so, drop the data from the incoming
313 	 * segment.  If it provides all of our data, drop us.
314 	 */
315 	if (p != NULL) {
316 		int64_t i;
317 		/* conversion to int (in i) handles seq wraparound */
318 		i = p->tqe_m->m_pkthdr.mp_dsn + p->tqe_len - mb_dsn;
319 		if (i > 0) {
320 			if (i >= *tlenp) {
321 				tcpstat.tcps_mptcp_rcvduppack++;
322 				m_freem(m);
323 				zfree(tcp_reass_zone, te);
324 				te = NULL;
325 				mp_tp->mpt_reassqlen--;
326 				OSDecrementAtomic(&mptcp_reass_total_qlen);
327 				/*
328 				 * Try to present any queued data
329 				 * at the left window edge to the user.
330 				 * This is needed after the 3-WHS
331 				 * completes.
332 				 */
333 				goto out;
334 			}
335 			VERIFY(i <= INT_MAX);
336 			m_adj(m, (int)i);
337 			*tlenp -= i;
338 			phdr->mp_dsn += i;
339 		}
340 	}
341 
342 	tcpstat.tcps_mp_oodata++;
343 
344 	/*
345 	 * While we overlap succeeding segments trim them or,
346 	 * if they are completely covered, dequeue them.
347 	 */
348 	while (q) {
349 		int64_t i = (mb_dsn + *tlenp) - q->tqe_m->m_pkthdr.mp_dsn;
350 		if (i <= 0) {
351 			break;
352 		}
353 
354 		if (i < q->tqe_len) {
355 			q->tqe_m->m_pkthdr.mp_dsn += i;
356 			q->tqe_len -= i;
357 
358 			VERIFY(i <= INT_MAX);
359 			m_adj(q->tqe_m, (int)i);
360 			break;
361 		}
362 
363 		nq = LIST_NEXT(q, tqe_q);
364 		LIST_REMOVE(q, tqe_q);
365 		m_freem(q->tqe_m);
366 		zfree(tcp_reass_zone, q);
367 		mp_tp->mpt_reassqlen--;
368 		OSDecrementAtomic(&mptcp_reass_total_qlen);
369 		q = nq;
370 	}
371 
372 	/* Insert the new segment queue entry into place. */
373 	te->tqe_m = m;
374 	te->tqe_th = NULL;
375 	te->tqe_len = *tlenp;
376 
377 	if (p == NULL) {
378 		LIST_INSERT_HEAD(&mp_tp->mpt_segq, te, tqe_q);
379 	} else {
380 		LIST_INSERT_AFTER(p, te, tqe_q);
381 	}
382 
383 out:
384 	return mptcp_reass_present(mp_so);
385 }
386 
387 /*
388  * MPTCP input, called when data has been read from a subflow socket.
389  */
390 void
mptcp_input(struct mptses * mpte,struct mbuf * m)391 mptcp_input(struct mptses *mpte, struct mbuf *m)
392 {
393 	struct socket *mp_so;
394 	struct mptcb *mp_tp = NULL;
395 	int count = 0, wakeup = 0;
396 	struct mbuf *save = NULL, *prev = NULL;
397 	struct mbuf *freelist = NULL, *tail = NULL;
398 
399 	VERIFY(m->m_flags & M_PKTHDR);
400 
401 	mp_so = mptetoso(mpte);
402 	mp_tp = mpte->mpte_mptcb;
403 
404 	socket_lock_assert_owned(mp_so);
405 
406 	DTRACE_MPTCP(input);
407 
408 	mp_tp->mpt_rcvwnd = mptcp_sbspace(mp_tp);
409 
410 	/*
411 	 * Each mbuf contains MPTCP Data Sequence Map
412 	 * Process the data for reassembly, delivery to MPTCP socket
413 	 * client, etc.
414 	 *
415 	 */
416 	count = mp_so->so_rcv.sb_cc;
417 
418 	/*
419 	 * In the degraded fallback case, data is accepted without DSS map
420 	 */
421 	if (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) {
422 		struct mbuf *iter;
423 		int mb_dfin;
424 fallback:
425 		mb_dfin = 0;
426 		mptcp_sbrcv_grow(mp_tp);
427 
428 		iter = m;
429 		while (iter) {
430 			if ((iter->m_flags & M_PKTHDR) &&
431 			    (iter->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN)) {
432 				mb_dfin = 1;
433 			}
434 
435 			if ((iter->m_flags & M_PKTHDR) && m_pktlen(iter) == 0) {
436 				/* Don't add zero-length packets, so jump it! */
437 				if (prev == NULL) {
438 					m = iter->m_next;
439 					m_free(iter);
440 					iter = m;
441 				} else {
442 					prev->m_next = iter->m_next;
443 					m_free(iter);
444 					iter = prev->m_next;
445 				}
446 
447 				/* It was a zero-length packet so next one must be a pkthdr */
448 				VERIFY(iter == NULL || iter->m_flags & M_PKTHDR);
449 			} else {
450 				prev = iter;
451 				iter = iter->m_next;
452 			}
453 		}
454 
455 		/*
456 		 * assume degraded flow as this may be the first packet
457 		 * without DSS, and the subflow state is not updated yet.
458 		 */
459 		if (sbappendstream_rcvdemux(mp_so, m)) {
460 			sorwakeup(mp_so);
461 		}
462 
463 		DTRACE_MPTCP5(receive__degraded, struct mbuf *, m,
464 		    struct socket *, mp_so,
465 		    struct sockbuf *, &mp_so->so_rcv,
466 		    struct sockbuf *, &mp_so->so_snd,
467 		    struct mptses *, mpte);
468 		count = mp_so->so_rcv.sb_cc - count;
469 
470 		mp_tp->mpt_rcvnxt += count;
471 
472 		if (mb_dfin) {
473 			mptcp_close_fsm(mp_tp, MPCE_RECV_DATA_FIN);
474 			socantrcvmore(mp_so);
475 		}
476 		return;
477 	}
478 
479 	do {
480 		u_int64_t mb_dsn;
481 		int32_t mb_datalen;
482 		int64_t todrop;
483 		int mb_dfin = 0;
484 
485 		VERIFY(m->m_flags & M_PKTHDR);
486 
487 		/* If fallback occurs, mbufs will not have PKTF_MPTCP set */
488 		if (!(m->m_pkthdr.pkt_flags & PKTF_MPTCP)) {
489 			goto fallback;
490 		}
491 
492 		save = m->m_next;
493 		/*
494 		 * A single TCP packet formed of multiple mbufs
495 		 * holds DSS mapping in the first mbuf of the chain.
496 		 * Other mbufs in the chain may have M_PKTHDR set
497 		 * even though they belong to the same TCP packet
498 		 * and therefore use the DSS mapping stored in the
499 		 * first mbuf of the mbuf chain. mptcp_input() can
500 		 * get an mbuf chain with multiple TCP packets.
501 		 */
502 		while (save && (!(save->m_flags & M_PKTHDR) ||
503 		    !(save->m_pkthdr.pkt_flags & PKTF_MPTCP))) {
504 			prev = save;
505 			save = save->m_next;
506 		}
507 		if (prev) {
508 			prev->m_next = NULL;
509 		} else {
510 			m->m_next = NULL;
511 		}
512 
513 		mb_dsn = m->m_pkthdr.mp_dsn;
514 		mb_datalen = m->m_pkthdr.mp_rlen;
515 
516 		todrop = (mb_dsn + mb_datalen) - (mp_tp->mpt_rcvnxt + mp_tp->mpt_rcvwnd);
517 		if (todrop > 0) {
518 			tcpstat.tcps_mptcp_rcvpackafterwin++;
519 
520 			os_log_info(mptcp_log_handle, "%s - %lx: dropping dsn %u dlen %u rcvnxt %u rcvwnd %u todrop %lld\n",
521 			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
522 			    (uint32_t)mb_dsn, mb_datalen, (uint32_t)mp_tp->mpt_rcvnxt,
523 			    mp_tp->mpt_rcvwnd, todrop);
524 
525 			if (todrop >= mb_datalen) {
526 				if (freelist == NULL) {
527 					freelist = m;
528 				} else {
529 					tail->m_next = m;
530 				}
531 
532 				if (prev != NULL) {
533 					tail = prev;
534 				} else {
535 					tail = m;
536 				}
537 
538 				m = save;
539 				prev = save = NULL;
540 				continue;
541 			} else {
542 				VERIFY(todrop <= INT_MAX);
543 				m_adj(m, (int)-todrop);
544 				mb_datalen -= todrop;
545 				m->m_pkthdr.mp_rlen -= todrop;
546 			}
547 
548 			/*
549 			 * We drop from the right edge of the mbuf, thus the
550 			 * DATA_FIN is dropped as well
551 			 */
552 			m->m_pkthdr.pkt_flags &= ~PKTF_MPTCP_DFIN;
553 		}
554 
555 		if (MPTCP_SEQ_LT(mb_dsn, mp_tp->mpt_rcvnxt)) {
556 			if (MPTCP_SEQ_LEQ((mb_dsn + mb_datalen),
557 			    mp_tp->mpt_rcvnxt)) {
558 				if (freelist == NULL) {
559 					freelist = m;
560 				} else {
561 					tail->m_next = m;
562 				}
563 
564 				if (prev != NULL) {
565 					tail = prev;
566 				} else {
567 					tail = m;
568 				}
569 
570 				m = save;
571 				prev = save = NULL;
572 				continue;
573 			} else {
574 				VERIFY((mp_tp->mpt_rcvnxt - mb_dsn) <= INT_MAX);
575 				m_adj(m, (int)(mp_tp->mpt_rcvnxt - mb_dsn));
576 				mb_datalen -= (mp_tp->mpt_rcvnxt - mb_dsn);
577 				mb_dsn = mp_tp->mpt_rcvnxt;
578 				VERIFY(mb_datalen >= 0 && mb_datalen <= USHRT_MAX);
579 				m->m_pkthdr.mp_rlen = (uint16_t)mb_datalen;
580 				m->m_pkthdr.mp_dsn = mb_dsn;
581 			}
582 		}
583 
584 		if (MPTCP_SEQ_GT(mb_dsn, mp_tp->mpt_rcvnxt) ||
585 		    !LIST_EMPTY(&mp_tp->mpt_segq)) {
586 			mb_dfin = mptcp_reass(mp_so, &m->m_pkthdr, &mb_datalen, m);
587 
588 			goto next;
589 		}
590 		mb_dfin = !!(m->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN);
591 
592 		mptcp_sbrcv_grow(mp_tp);
593 
594 		if (sbappendstream_rcvdemux(mp_so, m)) {
595 			wakeup = 1;
596 		}
597 
598 		DTRACE_MPTCP6(receive, struct mbuf *, m, struct socket *, mp_so,
599 		    struct sockbuf *, &mp_so->so_rcv,
600 		    struct sockbuf *, &mp_so->so_snd,
601 		    struct mptses *, mpte,
602 		    struct mptcb *, mp_tp);
603 		count = mp_so->so_rcv.sb_cc - count;
604 		tcpstat.tcps_mp_rcvtotal++;
605 		tcpstat.tcps_mp_rcvbytes += count;
606 
607 		mp_tp->mpt_rcvnxt += count;
608 
609 next:
610 		if (mb_dfin) {
611 			mptcp_close_fsm(mp_tp, MPCE_RECV_DATA_FIN);
612 			socantrcvmore(mp_so);
613 		}
614 		m = save;
615 		prev = save = NULL;
616 		count = mp_so->so_rcv.sb_cc;
617 	} while (m);
618 
619 	if (freelist) {
620 		m_freem(freelist);
621 	}
622 
623 	if (wakeup) {
624 		sorwakeup(mp_so);
625 	}
626 }
627 
628 boolean_t
mptcp_can_send_more(struct mptcb * mp_tp,boolean_t ignore_reinject)629 mptcp_can_send_more(struct mptcb *mp_tp, boolean_t ignore_reinject)
630 {
631 	struct socket *mp_so = mptetoso(mp_tp->mpt_mpte);
632 
633 	/*
634 	 * Always send if there is data in the reinject-queue.
635 	 */
636 	if (!ignore_reinject && mp_tp->mpt_mpte->mpte_reinjectq) {
637 		return TRUE;
638 	}
639 
640 	/*
641 	 * Don't send, if:
642 	 *
643 	 * 1. snd_nxt >= snd_max : Means, basically everything has been sent.
644 	 *    Except when using TFO, we might be doing a 0-byte write.
645 	 * 2. snd_una + snd_wnd <= snd_nxt: No space in the receiver's window
646 	 * 3. snd_nxt + 1 == snd_max and we are closing: A DATA_FIN is scheduled.
647 	 */
648 
649 	if (!(mp_so->so_flags1 & SOF1_PRECONNECT_DATA) && MPTCP_SEQ_GEQ(mp_tp->mpt_sndnxt, mp_tp->mpt_sndmax)) {
650 		return FALSE;
651 	}
652 
653 	if (MPTCP_SEQ_LEQ(mp_tp->mpt_snduna + mp_tp->mpt_sndwnd, mp_tp->mpt_sndnxt)) {
654 		return FALSE;
655 	}
656 
657 	if (mp_tp->mpt_sndnxt + 1 == mp_tp->mpt_sndmax && mp_tp->mpt_state > MPTCPS_CLOSE_WAIT) {
658 		return FALSE;
659 	}
660 
661 	if (mp_tp->mpt_state >= MPTCPS_FIN_WAIT_2) {
662 		return FALSE;
663 	}
664 
665 	return TRUE;
666 }
667 
668 /*
669  * MPTCP output.
670  */
671 int
mptcp_output(struct mptses * mpte)672 mptcp_output(struct mptses *mpte)
673 {
674 	struct mptcb *mp_tp;
675 	struct mptsub *mpts;
676 	struct mptsub *mpts_tried = NULL;
677 	struct socket *mp_so;
678 	struct mptsub *preferred_mpts = NULL;
679 	uint64_t old_snd_nxt;
680 	int error = 0;
681 
682 	mp_so = mptetoso(mpte);
683 	mp_tp = mpte->mpte_mptcb;
684 
685 	socket_lock_assert_owned(mp_so);
686 
687 	if (mp_so->so_flags & SOF_DEFUNCT) {
688 		return 0;
689 	}
690 
691 	VERIFY(!(mpte->mpte_mppcb->mpp_flags & MPP_WUPCALL));
692 	mpte->mpte_mppcb->mpp_flags |= MPP_WUPCALL;
693 
694 	old_snd_nxt = mp_tp->mpt_sndnxt;
695 	while (mptcp_can_send_more(mp_tp, FALSE)) {
696 		/* get the "best" subflow to be used for transmission */
697 		mpts = mptcp_get_subflow(mpte, &preferred_mpts);
698 		if (mpts == NULL) {
699 			mptcplog((LOG_INFO, "%s: no subflow\n", __func__),
700 			    MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
701 			break;
702 		}
703 
704 		/* In case there's just one flow, we reattempt later */
705 		if (mpts_tried != NULL &&
706 		    (mpts == mpts_tried || (mpts->mpts_flags & MPTSF_FAILINGOVER))) {
707 			mpts_tried->mpts_flags &= ~MPTSF_FAILINGOVER;
708 			mpts_tried->mpts_flags |= MPTSF_ACTIVE;
709 			mptcp_start_timer(mpte, MPTT_REXMT);
710 			break;
711 		}
712 
713 		/*
714 		 * Automatic sizing of send socket buffer. Increase the send
715 		 * socket buffer size if all of the following criteria are met
716 		 *	1. the receiver has enough buffer space for this data
717 		 *	2. send buffer is filled to 7/8th with data (so we actually
718 		 *	   have data to make use of it);
719 		 */
720 		if ((mp_so->so_snd.sb_flags & (SB_AUTOSIZE | SB_TRIM)) == SB_AUTOSIZE &&
721 		    tcp_cansbgrow(&mp_so->so_snd)) {
722 			if ((mp_tp->mpt_sndwnd / 4 * 5) >= mp_so->so_snd.sb_hiwat &&
723 			    mp_so->so_snd.sb_cc >= (mp_so->so_snd.sb_hiwat / 8 * 7)) {
724 				if (sbreserve(&mp_so->so_snd,
725 				    min(mp_so->so_snd.sb_hiwat + tcp_autosndbuf_inc,
726 				    tcp_autosndbuf_max)) == 1) {
727 					mp_so->so_snd.sb_idealsize = mp_so->so_snd.sb_hiwat;
728 				}
729 			}
730 		}
731 
732 		DTRACE_MPTCP3(output, struct mptses *, mpte, struct mptsub *, mpts,
733 		    struct socket *, mp_so);
734 		error = mptcp_subflow_output(mpte, mpts, 0);
735 		if (error) {
736 			/* can be a temporary loss of source address or other error */
737 			mpts->mpts_flags |= MPTSF_FAILINGOVER;
738 			mpts->mpts_flags &= ~MPTSF_ACTIVE;
739 			mpts_tried = mpts;
740 			if (error != ECANCELED) {
741 				os_log_error(mptcp_log_handle, "%s - %lx: Error = %d mpts_flags %#x\n",
742 				    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
743 				    error, mpts->mpts_flags);
744 			}
745 			break;
746 		}
747 		/* The model is to have only one active flow at a time */
748 		mpts->mpts_flags |= MPTSF_ACTIVE;
749 		mpts->mpts_probesoon = mpts->mpts_probecnt = 0;
750 
751 		/* Allows us to update the smoothed rtt */
752 		if (mptcp_probeto && mpts != preferred_mpts && preferred_mpts != NULL) {
753 			if (preferred_mpts->mpts_probesoon) {
754 				if ((tcp_now - preferred_mpts->mpts_probesoon) > mptcp_probeto) {
755 					mptcp_subflow_output(mpte, preferred_mpts, MPTCP_SUBOUT_PROBING);
756 					if (preferred_mpts->mpts_probecnt >= mptcp_probecnt) {
757 						preferred_mpts->mpts_probesoon = 0;
758 						preferred_mpts->mpts_probecnt = 0;
759 					}
760 				}
761 			} else {
762 				preferred_mpts->mpts_probesoon = tcp_now;
763 				preferred_mpts->mpts_probecnt = 0;
764 			}
765 		}
766 
767 		if (mpte->mpte_active_sub == NULL) {
768 			mpte->mpte_active_sub = mpts;
769 		} else if (mpte->mpte_active_sub != mpts) {
770 			mpte->mpte_active_sub->mpts_flags &= ~MPTSF_ACTIVE;
771 			mpte->mpte_active_sub = mpts;
772 
773 			mptcpstats_inc_switch(mpte, mpts);
774 		}
775 	}
776 
777 	if (mp_tp->mpt_state > MPTCPS_CLOSE_WAIT) {
778 		if (mp_tp->mpt_sndnxt + 1 == mp_tp->mpt_sndmax &&
779 		    mp_tp->mpt_snduna == mp_tp->mpt_sndnxt) {
780 			mptcp_finish_usrclosed(mpte);
781 		}
782 	}
783 
784 	mptcp_handle_deferred_upcalls(mpte->mpte_mppcb, MPP_WUPCALL);
785 
786 	/* subflow errors should not be percolated back up */
787 	return 0;
788 }
789 
790 
791 static struct mptsub *
mptcp_choose_subflow(struct mptsub * mpts,struct mptsub * curbest,int * currtt)792 mptcp_choose_subflow(struct mptsub *mpts, struct mptsub *curbest, int *currtt)
793 {
794 	struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
795 
796 	/*
797 	 * Lower RTT? Take it, if it's our first one, or
798 	 * it doesn't has any loss, or the current one has
799 	 * loss as well.
800 	 */
801 	if (tp->t_srtt && *currtt > tp->t_srtt &&
802 	    (curbest == NULL || tp->t_rxtshift == 0 ||
803 	    sototcpcb(curbest->mpts_socket)->t_rxtshift)) {
804 		*currtt = tp->t_srtt;
805 		return mpts;
806 	}
807 
808 	/*
809 	 * If we find a subflow without loss, take it always!
810 	 */
811 	if (curbest &&
812 	    sototcpcb(curbest->mpts_socket)->t_rxtshift &&
813 	    tp->t_rxtshift == 0) {
814 		*currtt = tp->t_srtt;
815 		return mpts;
816 	}
817 
818 	return curbest != NULL ? curbest : mpts;
819 }
820 
821 static struct mptsub *
mptcp_return_subflow(struct mptsub * mpts)822 mptcp_return_subflow(struct mptsub *mpts)
823 {
824 	if (mpts && mptcp_subflow_cwnd_space(mpts->mpts_socket) <= 0) {
825 		return NULL;
826 	}
827 
828 	return mpts;
829 }
830 
831 static boolean_t
mptcp_subflow_is_slow(struct mptses * mpte,struct mptsub * mpts)832 mptcp_subflow_is_slow(struct mptses *mpte, struct mptsub *mpts)
833 {
834 	struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
835 	int fail_thresh = mptcp_fail_thresh;
836 
837 	if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER || mpte->mpte_svctype == MPTCP_SVCTYPE_PURE_HANDOVER) {
838 		fail_thresh *= 2;
839 	}
840 
841 	return tp->t_rxtshift >= fail_thresh &&
842 	       (mptetoso(mpte)->so_snd.sb_cc || mpte->mpte_reinjectq);
843 }
844 
845 /*
846  * Return the most eligible subflow to be used for sending data.
847  */
848 struct mptsub *
mptcp_get_subflow(struct mptses * mpte,struct mptsub ** preferred)849 mptcp_get_subflow(struct mptses *mpte, struct mptsub **preferred)
850 {
851 	struct tcpcb *besttp, *secondtp;
852 	struct inpcb *bestinp, *secondinp;
853 	struct mptsub *mpts;
854 	struct mptsub *best = NULL;
855 	struct mptsub *second_best = NULL;
856 	int exp_rtt = INT_MAX, cheap_rtt = INT_MAX;
857 
858 	/*
859 	 * First Step:
860 	 * Choose the best subflow for cellular and non-cellular interfaces.
861 	 */
862 
863 	TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
864 		struct socket *so = mpts->mpts_socket;
865 		struct tcpcb *tp = sototcpcb(so);
866 		struct inpcb *inp = sotoinpcb(so);
867 
868 		mptcplog((LOG_DEBUG, "%s mpts %u mpts_flags %#x, suspended %u sostate %#x tpstate %u cellular %d rtt %u rxtshift %u cheap %u exp %u cwnd %d\n",
869 		    __func__, mpts->mpts_connid, mpts->mpts_flags,
870 		    INP_WAIT_FOR_IF_FEEDBACK(inp), so->so_state, tp->t_state,
871 		    inp->inp_last_outifp ? IFNET_IS_CELLULAR(inp->inp_last_outifp) : -1,
872 		    tp->t_srtt, tp->t_rxtshift, cheap_rtt, exp_rtt,
873 		    mptcp_subflow_cwnd_space(so)),
874 		    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
875 
876 		/*
877 		 * First, the hard conditions to reject subflows
878 		 * (e.g., not connected,...)
879 		 */
880 		if (inp->inp_last_outifp == NULL) {
881 			continue;
882 		}
883 
884 		if (INP_WAIT_FOR_IF_FEEDBACK(inp)) {
885 			continue;
886 		}
887 
888 		/* There can only be one subflow in degraded state */
889 		if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
890 			best = mpts;
891 			break;
892 		}
893 
894 		/*
895 		 * If this subflow is waiting to finally send, do it!
896 		 */
897 		if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
898 			return mptcp_return_subflow(mpts);
899 		}
900 
901 		/*
902 		 * Only send if the subflow is MP_CAPABLE. The exceptions to
903 		 * this rule (degraded or TFO) have been taken care of above.
904 		 */
905 		if (!(mpts->mpts_flags & MPTSF_MP_CAPABLE)) {
906 			continue;
907 		}
908 
909 		if ((so->so_state & SS_ISDISCONNECTED) ||
910 		    !(so->so_state & SS_ISCONNECTED) ||
911 		    !TCPS_HAVEESTABLISHED(tp->t_state) ||
912 		    tp->t_state > TCPS_CLOSE_WAIT) {
913 			continue;
914 		}
915 
916 		/*
917 		 * Second, the soft conditions to find the subflow with best
918 		 * conditions for each set (aka cellular vs non-cellular)
919 		 */
920 		if (IFNET_IS_CELLULAR(inp->inp_last_outifp)) {
921 			second_best = mptcp_choose_subflow(mpts, second_best,
922 			    &exp_rtt);
923 		} else {
924 			best = mptcp_choose_subflow(mpts, best, &cheap_rtt);
925 		}
926 	}
927 
928 	/*
929 	 * If there is no preferred or backup subflow, and there is no active
930 	 * subflow use the last usable subflow.
931 	 */
932 	if (best == NULL) {
933 		return mptcp_return_subflow(second_best);
934 	}
935 
936 	if (second_best == NULL) {
937 		return mptcp_return_subflow(best);
938 	}
939 
940 	besttp = sototcpcb(best->mpts_socket);
941 	bestinp = sotoinpcb(best->mpts_socket);
942 	secondtp = sototcpcb(second_best->mpts_socket);
943 	secondinp = sotoinpcb(second_best->mpts_socket);
944 
945 	if (preferred != NULL) {
946 		*preferred = mptcp_return_subflow(best);
947 	}
948 
949 	/*
950 	 * Second Step: Among best and second_best. Choose the one that is
951 	 * most appropriate for this particular service-type.
952 	 */
953 	if (mpte->mpte_svctype == MPTCP_SVCTYPE_PURE_HANDOVER) {
954 		return mptcp_return_subflow(best);
955 	} else if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER) {
956 		/*
957 		 * Only handover if Symptoms tells us to do so.
958 		 */
959 		if (!IFNET_IS_CELLULAR(bestinp->inp_last_outifp) &&
960 		    mptcp_is_wifi_unusable_for_session(mpte) != 0 && mptcp_subflow_is_slow(mpte, best)) {
961 			return mptcp_return_subflow(second_best);
962 		}
963 
964 		return mptcp_return_subflow(best);
965 	} else if (mpte->mpte_svctype == MPTCP_SVCTYPE_INTERACTIVE) {
966 		int rtt_thresh = mptcp_rtthist_rtthresh << TCP_RTT_SHIFT;
967 		int rto_thresh = mptcp_rtothresh;
968 
969 		/* Adjust with symptoms information */
970 		if (!IFNET_IS_CELLULAR(bestinp->inp_last_outifp) &&
971 		    mptcp_is_wifi_unusable_for_session(mpte) != 0) {
972 			rtt_thresh /= 2;
973 			rto_thresh /= 2;
974 		}
975 
976 		if (besttp->t_srtt && secondtp->t_srtt &&
977 		    besttp->t_srtt >= rtt_thresh &&
978 		    secondtp->t_srtt < rtt_thresh) {
979 			tcpstat.tcps_mp_sel_rtt++;
980 			mptcplog((LOG_DEBUG, "%s: best cid %d at rtt %d,  second cid %d at rtt %d\n", __func__,
981 			    best->mpts_connid, besttp->t_srtt >> TCP_RTT_SHIFT,
982 			    second_best->mpts_connid,
983 			    secondtp->t_srtt >> TCP_RTT_SHIFT),
984 			    MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
985 			return mptcp_return_subflow(second_best);
986 		}
987 
988 		if (mptcp_subflow_is_slow(mpte, best) &&
989 		    secondtp->t_rxtshift == 0) {
990 			return mptcp_return_subflow(second_best);
991 		}
992 
993 		/* Compare RTOs, select second_best if best's rto exceeds rtothresh */
994 		if (besttp->t_rxtcur && secondtp->t_rxtcur &&
995 		    besttp->t_rxtcur >= rto_thresh &&
996 		    secondtp->t_rxtcur < rto_thresh) {
997 			tcpstat.tcps_mp_sel_rto++;
998 			mptcplog((LOG_DEBUG, "%s: best cid %d at rto %d, second cid %d at rto %d\n", __func__,
999 			    best->mpts_connid, besttp->t_rxtcur,
1000 			    second_best->mpts_connid, secondtp->t_rxtcur),
1001 			    MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
1002 
1003 			return mptcp_return_subflow(second_best);
1004 		}
1005 
1006 		/*
1007 		 * None of the above conditions for sending on the secondary
1008 		 * were true. So, let's schedule on the best one, if he still
1009 		 * has some space in the congestion-window.
1010 		 */
1011 		return mptcp_return_subflow(best);
1012 	} else if (mpte->mpte_svctype >= MPTCP_SVCTYPE_AGGREGATE) {
1013 		struct mptsub *tmp;
1014 
1015 		/*
1016 		 * We only care about RTT when aggregating
1017 		 */
1018 		if (besttp->t_srtt > secondtp->t_srtt) {
1019 			tmp = best;
1020 			best = second_best;
1021 			besttp = secondtp;
1022 			bestinp = secondinp;
1023 
1024 			second_best = tmp;
1025 			secondtp = sototcpcb(second_best->mpts_socket);
1026 			secondinp = sotoinpcb(second_best->mpts_socket);
1027 		}
1028 
1029 		/* Is there still space in the congestion window? */
1030 		if (mptcp_subflow_cwnd_space(bestinp->inp_socket) <= 0) {
1031 			return mptcp_return_subflow(second_best);
1032 		}
1033 
1034 		return mptcp_return_subflow(best);
1035 	} else {
1036 		panic("Unknown service-type configured for MPTCP");
1037 	}
1038 
1039 	return NULL;
1040 }
1041 
1042 static const char *
mptcp_event_to_str(uint32_t event)1043 mptcp_event_to_str(uint32_t event)
1044 {
1045 	const char *c = "UNDEFINED";
1046 	switch (event) {
1047 	case MPCE_CLOSE:
1048 		c = "MPCE_CLOSE";
1049 		break;
1050 	case MPCE_RECV_DATA_ACK:
1051 		c = "MPCE_RECV_DATA_ACK";
1052 		break;
1053 	case MPCE_RECV_DATA_FIN:
1054 		c = "MPCE_RECV_DATA_FIN";
1055 		break;
1056 	}
1057 	return c;
1058 }
1059 
1060 static const char *
mptcp_state_to_str(mptcp_state_t state)1061 mptcp_state_to_str(mptcp_state_t state)
1062 {
1063 	const char *c = "UNDEFINED";
1064 	switch (state) {
1065 	case MPTCPS_CLOSED:
1066 		c = "MPTCPS_CLOSED";
1067 		break;
1068 	case MPTCPS_LISTEN:
1069 		c = "MPTCPS_LISTEN";
1070 		break;
1071 	case MPTCPS_ESTABLISHED:
1072 		c = "MPTCPS_ESTABLISHED";
1073 		break;
1074 	case MPTCPS_CLOSE_WAIT:
1075 		c = "MPTCPS_CLOSE_WAIT";
1076 		break;
1077 	case MPTCPS_FIN_WAIT_1:
1078 		c = "MPTCPS_FIN_WAIT_1";
1079 		break;
1080 	case MPTCPS_CLOSING:
1081 		c = "MPTCPS_CLOSING";
1082 		break;
1083 	case MPTCPS_LAST_ACK:
1084 		c = "MPTCPS_LAST_ACK";
1085 		break;
1086 	case MPTCPS_FIN_WAIT_2:
1087 		c = "MPTCPS_FIN_WAIT_2";
1088 		break;
1089 	case MPTCPS_TIME_WAIT:
1090 		c = "MPTCPS_TIME_WAIT";
1091 		break;
1092 	case MPTCPS_TERMINATE:
1093 		c = "MPTCPS_TERMINATE";
1094 		break;
1095 	}
1096 	return c;
1097 }
1098 
1099 void
mptcp_close_fsm(struct mptcb * mp_tp,uint32_t event)1100 mptcp_close_fsm(struct mptcb *mp_tp, uint32_t event)
1101 {
1102 	struct socket *mp_so = mptetoso(mp_tp->mpt_mpte);
1103 
1104 	socket_lock_assert_owned(mp_so);
1105 
1106 	mptcp_state_t old_state = mp_tp->mpt_state;
1107 
1108 	DTRACE_MPTCP2(state__change, struct mptcb *, mp_tp,
1109 	    uint32_t, event);
1110 
1111 	switch (mp_tp->mpt_state) {
1112 	case MPTCPS_CLOSED:
1113 	case MPTCPS_LISTEN:
1114 		mp_tp->mpt_state = MPTCPS_TERMINATE;
1115 		break;
1116 
1117 	case MPTCPS_ESTABLISHED:
1118 		if (event == MPCE_CLOSE) {
1119 			mp_tp->mpt_state = MPTCPS_FIN_WAIT_1;
1120 			mp_tp->mpt_sndmax += 1; /* adjust for Data FIN */
1121 		} else if (event == MPCE_RECV_DATA_FIN) {
1122 			mp_tp->mpt_rcvnxt += 1; /* adj remote data FIN */
1123 			mp_tp->mpt_state = MPTCPS_CLOSE_WAIT;
1124 		}
1125 		break;
1126 
1127 	case MPTCPS_CLOSE_WAIT:
1128 		if (event == MPCE_CLOSE) {
1129 			mp_tp->mpt_state = MPTCPS_LAST_ACK;
1130 			mp_tp->mpt_sndmax += 1; /* adjust for Data FIN */
1131 		}
1132 		break;
1133 
1134 	case MPTCPS_FIN_WAIT_1:
1135 		if (event == MPCE_RECV_DATA_ACK) {
1136 			mp_tp->mpt_state = MPTCPS_FIN_WAIT_2;
1137 		} else if (event == MPCE_RECV_DATA_FIN) {
1138 			mp_tp->mpt_rcvnxt += 1; /* adj remote data FIN */
1139 			mp_tp->mpt_state = MPTCPS_CLOSING;
1140 		}
1141 		break;
1142 
1143 	case MPTCPS_CLOSING:
1144 		if (event == MPCE_RECV_DATA_ACK) {
1145 			mp_tp->mpt_state = MPTCPS_TIME_WAIT;
1146 		}
1147 		break;
1148 
1149 	case MPTCPS_LAST_ACK:
1150 		if (event == MPCE_RECV_DATA_ACK) {
1151 			mptcp_close(mp_tp->mpt_mpte, mp_tp);
1152 		}
1153 		break;
1154 
1155 	case MPTCPS_FIN_WAIT_2:
1156 		if (event == MPCE_RECV_DATA_FIN) {
1157 			mp_tp->mpt_rcvnxt += 1; /* adj remote data FIN */
1158 			mp_tp->mpt_state = MPTCPS_TIME_WAIT;
1159 		}
1160 		break;
1161 
1162 	case MPTCPS_TIME_WAIT:
1163 	case MPTCPS_TERMINATE:
1164 		break;
1165 
1166 	default:
1167 		VERIFY(0);
1168 		/* NOTREACHED */
1169 	}
1170 	DTRACE_MPTCP2(state__change, struct mptcb *, mp_tp,
1171 	    uint32_t, event);
1172 	mptcplog((LOG_INFO, "%s: %s to %s on event %s\n", __func__,
1173 	    mptcp_state_to_str(old_state),
1174 	    mptcp_state_to_str(mp_tp->mpt_state),
1175 	    mptcp_event_to_str(event)),
1176 	    MPTCP_STATE_DBG, MPTCP_LOGLVL_LOG);
1177 }
1178 
1179 /* If you change this function, match up mptcp_update_rcv_state_f */
1180 void
mptcp_update_dss_rcv_state(struct mptcp_dsn_opt * dss_info,struct tcpcb * tp,uint16_t csum)1181 mptcp_update_dss_rcv_state(struct mptcp_dsn_opt *dss_info, struct tcpcb *tp,
1182     uint16_t csum)
1183 {
1184 	struct mptcb *mp_tp = tptomptp(tp);
1185 	u_int64_t full_dsn = 0;
1186 
1187 	NTOHL(dss_info->mdss_dsn);
1188 	NTOHL(dss_info->mdss_subflow_seqn);
1189 	NTOHS(dss_info->mdss_data_len);
1190 
1191 	/* XXX for autosndbuf grow sb here */
1192 	MPTCP_EXTEND_DSN(mp_tp->mpt_rcvnxt, dss_info->mdss_dsn, full_dsn);
1193 	mptcp_update_rcv_state_meat(mp_tp, tp,
1194 	    full_dsn, dss_info->mdss_subflow_seqn, dss_info->mdss_data_len,
1195 	    csum);
1196 }
1197 
1198 void
mptcp_update_rcv_state_meat(struct mptcb * mp_tp,struct tcpcb * tp,u_int64_t full_dsn,u_int32_t seqn,u_int16_t mdss_data_len,uint16_t csum)1199 mptcp_update_rcv_state_meat(struct mptcb *mp_tp, struct tcpcb *tp,
1200     u_int64_t full_dsn, u_int32_t seqn, u_int16_t mdss_data_len,
1201     uint16_t csum)
1202 {
1203 	if (mdss_data_len == 0) {
1204 		os_log_error(mptcp_log_handle, "%s - %lx: Infinite Mapping.\n",
1205 		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mp_tp->mpt_mpte));
1206 
1207 		if ((mp_tp->mpt_flags & MPTCPF_CHECKSUM) && (csum != 0)) {
1208 			os_log_error(mptcp_log_handle, "%s - %lx: Bad checksum %x \n",
1209 			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mp_tp->mpt_mpte), csum);
1210 		}
1211 		mptcp_notify_mpfail(tp->t_inpcb->inp_socket);
1212 		return;
1213 	}
1214 
1215 	mptcp_notify_mpready(tp->t_inpcb->inp_socket);
1216 
1217 	tp->t_rcv_map.mpt_dsn = full_dsn;
1218 	tp->t_rcv_map.mpt_sseq = seqn;
1219 	tp->t_rcv_map.mpt_len = mdss_data_len;
1220 	tp->t_rcv_map.mpt_csum = csum;
1221 	tp->t_mpflags |= TMPF_EMBED_DSN;
1222 }
1223 
1224 
1225 static int
mptcp_validate_dss_map(struct socket * so,struct tcpcb * tp,struct mbuf * m,int hdrlen)1226 mptcp_validate_dss_map(struct socket *so, struct tcpcb *tp, struct mbuf *m,
1227     int hdrlen)
1228 {
1229 	u_int32_t datalen;
1230 
1231 	if (!(m->m_pkthdr.pkt_flags & PKTF_MPTCP)) {
1232 		return 0;
1233 	}
1234 
1235 	datalen = m->m_pkthdr.mp_rlen;
1236 
1237 	/* unacceptable DSS option, fallback to TCP */
1238 	if (m->m_pkthdr.len > ((int) datalen + hdrlen)) {
1239 		os_log_error(mptcp_log_handle, "%s - %lx: mbuf len %d, MPTCP expected %d",
1240 		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(tptomptp(tp)->mpt_mpte), m->m_pkthdr.len, datalen);
1241 	} else {
1242 		return 0;
1243 	}
1244 	tp->t_mpflags |= TMPF_SND_MPFAIL;
1245 	mptcp_notify_mpfail(so);
1246 	m_freem(m);
1247 	return -1;
1248 }
1249 
1250 int
mptcp_input_preproc(struct tcpcb * tp,struct mbuf * m,struct tcphdr * th,int drop_hdrlen)1251 mptcp_input_preproc(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
1252     int drop_hdrlen)
1253 {
1254 	mptcp_insert_rmap(tp, m, th);
1255 	if (mptcp_validate_dss_map(tp->t_inpcb->inp_socket, tp, m,
1256 	    drop_hdrlen) != 0) {
1257 		return -1;
1258 	}
1259 	return 0;
1260 }
1261 
1262 static uint16_t
mptcp_input_csum(struct tcpcb * tp,struct mbuf * m,uint64_t dsn,uint32_t sseq,uint16_t dlen,uint16_t csum,int dfin)1263 mptcp_input_csum(struct tcpcb *tp, struct mbuf *m, uint64_t dsn, uint32_t sseq,
1264     uint16_t dlen, uint16_t csum, int dfin)
1265 {
1266 	struct mptcb *mp_tp = tptomptp(tp);
1267 	int real_len = dlen - dfin;
1268 	uint32_t sum = 0;
1269 
1270 	VERIFY(real_len >= 0);
1271 
1272 	if (mp_tp == NULL) {
1273 		return 0;
1274 	}
1275 
1276 	if (!(mp_tp->mpt_flags & MPTCPF_CHECKSUM)) {
1277 		return 0;
1278 	}
1279 
1280 	if (tp->t_mpflags & TMPF_TCP_FALLBACK) {
1281 		return 0;
1282 	}
1283 
1284 	/*
1285 	 * The remote side may send a packet with fewer bytes than the
1286 	 * claimed DSS checksum length.
1287 	 */
1288 	if ((int)m_length2(m, NULL) < real_len) {
1289 		return 0xffff;
1290 	}
1291 
1292 	if (real_len != 0) {
1293 		sum = m_sum16(m, 0, real_len);
1294 	}
1295 
1296 	sum += in_pseudo64(htonll(dsn), htonl(sseq), htons(dlen) + csum);
1297 	ADDCARRY(sum);
1298 
1299 	DTRACE_MPTCP3(checksum__result, struct tcpcb *, tp, struct mbuf *, m,
1300 	    uint32_t, sum);
1301 
1302 	return ~sum & 0xffff;
1303 }
1304 
1305 /*
1306  * MPTCP Checksum support
1307  * The checksum is calculated whenever the MPTCP DSS option is included
1308  * in the TCP packet. The checksum includes the sum of the MPTCP psuedo
1309  * header and the actual data indicated by the length specified in the
1310  * DSS option.
1311  */
1312 
1313 int
mptcp_validate_csum(struct tcpcb * tp,struct mbuf * m,uint64_t dsn,uint32_t sseq,uint16_t dlen,uint16_t csum,int dfin)1314 mptcp_validate_csum(struct tcpcb *tp, struct mbuf *m, uint64_t dsn,
1315     uint32_t sseq, uint16_t dlen, uint16_t csum, int dfin)
1316 {
1317 	uint16_t mptcp_csum;
1318 
1319 	mptcp_csum = mptcp_input_csum(tp, m, dsn, sseq, dlen, csum, dfin);
1320 	if (mptcp_csum) {
1321 		tp->t_mpflags |= TMPF_SND_MPFAIL;
1322 		mptcp_notify_mpfail(tp->t_inpcb->inp_socket);
1323 		m_freem(m);
1324 		tcpstat.tcps_mp_badcsum++;
1325 		return -1;
1326 	}
1327 	return 0;
1328 }
1329 
1330 uint16_t
mptcp_output_csum(struct mbuf * m,uint64_t dss_val,uint32_t sseq,uint16_t dlen)1331 mptcp_output_csum(struct mbuf *m, uint64_t dss_val, uint32_t sseq, uint16_t dlen)
1332 {
1333 	uint32_t sum = 0;
1334 
1335 	if (dlen) {
1336 		sum = m_sum16(m, 0, dlen);
1337 	}
1338 
1339 	dss_val = mptcp_hton64(dss_val);
1340 	sseq = htonl(sseq);
1341 	dlen = htons(dlen);
1342 	sum += in_pseudo64(dss_val, sseq, dlen);
1343 
1344 	ADDCARRY(sum);
1345 	sum = ~sum & 0xffff;
1346 	DTRACE_MPTCP2(checksum__result, struct mbuf *, m, uint32_t, sum);
1347 	mptcplog((LOG_DEBUG, "%s: sum = %x \n", __func__, sum),
1348 	    MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
1349 
1350 	return (uint16_t)sum;
1351 }
1352 
1353 /*
1354  * When WiFi signal starts fading, there's more loss and RTT spikes.
1355  * Check if there has been a large spike by comparing against
1356  * a tolerable RTT spike threshold.
1357  */
1358 boolean_t
mptcp_no_rto_spike(struct socket * so)1359 mptcp_no_rto_spike(struct socket *so)
1360 {
1361 	struct tcpcb *tp = intotcpcb(sotoinpcb(so));
1362 	int32_t spike = 0;
1363 
1364 	if (tp->t_rxtcur > mptcp_rtothresh) {
1365 		spike = tp->t_rxtcur - mptcp_rtothresh;
1366 
1367 		mptcplog((LOG_DEBUG, "%s: spike = %d rto = %d best = %d cur = %d\n",
1368 		    __func__, spike,
1369 		    tp->t_rxtcur, tp->t_rttbest >> TCP_RTT_SHIFT,
1370 		    tp->t_rttcur),
1371 		    (MPTCP_SOCKET_DBG | MPTCP_SENDER_DBG), MPTCP_LOGLVL_LOG);
1372 	}
1373 
1374 	if (spike > 0) {
1375 		return FALSE;
1376 	} else {
1377 		return TRUE;
1378 	}
1379 }
1380 
1381 void
mptcp_handle_deferred_upcalls(struct mppcb * mpp,uint32_t flag)1382 mptcp_handle_deferred_upcalls(struct mppcb *mpp, uint32_t flag)
1383 {
1384 	VERIFY(mpp->mpp_flags & flag);
1385 	mpp->mpp_flags &= ~flag;
1386 
1387 	if (mptcp_should_defer_upcall(mpp)) {
1388 		return;
1389 	}
1390 
1391 	if (mpp->mpp_flags & MPP_SHOULD_WORKLOOP) {
1392 		mpp->mpp_flags &= ~MPP_SHOULD_WORKLOOP;
1393 
1394 		mptcp_subflow_workloop(mpp->mpp_pcbe);
1395 	}
1396 
1397 	if (mpp->mpp_flags & MPP_SHOULD_RWAKEUP) {
1398 		mpp->mpp_flags &= ~MPP_SHOULD_RWAKEUP;
1399 
1400 		sorwakeup(mpp->mpp_socket);
1401 	}
1402 
1403 	if (mpp->mpp_flags & MPP_SHOULD_WWAKEUP) {
1404 		mpp->mpp_flags &= ~MPP_SHOULD_WWAKEUP;
1405 
1406 		sowwakeup(mpp->mpp_socket);
1407 	}
1408 }
1409 
1410 static void
mptcp_reset_itfinfo(struct mpt_itf_info * info)1411 mptcp_reset_itfinfo(struct mpt_itf_info *info)
1412 {
1413 	memset(info, 0, sizeof(*info));
1414 }
1415 
1416 void
mptcp_session_necp_cb(void * handle,int action,uint32_t interface_index,uint32_t necp_flags,__unused bool * viable)1417 mptcp_session_necp_cb(void *handle, int action, uint32_t interface_index,
1418     uint32_t necp_flags, __unused bool *viable)
1419 {
1420 	boolean_t has_v4 = !!(necp_flags & NECP_CLIENT_RESULT_FLAG_HAS_IPV4);
1421 	boolean_t has_v6 = !!(necp_flags & NECP_CLIENT_RESULT_FLAG_HAS_IPV6);
1422 	boolean_t has_nat64 = !!(necp_flags & NECP_CLIENT_RESULT_FLAG_HAS_NAT64);
1423 	boolean_t low_power = !!(necp_flags & NECP_CLIENT_RESULT_FLAG_INTERFACE_LOW_POWER);
1424 	struct mppcb *mp = (struct mppcb *)handle;
1425 	struct mptses *mpte = mptompte(mp);
1426 	struct socket *mp_so;
1427 	struct mptcb *mp_tp;
1428 	uint32_t i, ifindex;
1429 	struct ifnet *ifp;
1430 	int locked = 0;
1431 
1432 	ifindex = interface_index;
1433 	VERIFY(ifindex != IFSCOPE_NONE);
1434 
1435 	/* About to be garbage-collected (see note about MPTCP/NECP interactions) */
1436 	if (mp->mpp_socket->so_usecount == 0) {
1437 		return;
1438 	}
1439 
1440 	mp_so = mptetoso(mpte);
1441 
1442 	if (action != NECP_CLIENT_CBACTION_INITIAL) {
1443 		socket_lock(mp_so, 1);
1444 		locked = 1;
1445 
1446 		/* Check again, because it might have changed while waiting */
1447 		if (mp->mpp_socket->so_usecount == 0) {
1448 			goto out;
1449 		}
1450 	}
1451 
1452 	socket_lock_assert_owned(mp_so);
1453 
1454 	mp_tp = mpte->mpte_mptcb;
1455 
1456 	ifnet_head_lock_shared();
1457 	ifp = ifindex2ifnet[ifindex];
1458 	ifnet_head_done();
1459 
1460 	os_log(mptcp_log_handle, "%s - %lx: action: %u ifindex %u delegated to %u usecount %u mpt_flags %#x state %u v4 %u v6 %u nat64 %u power %u\n",
1461 	    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), action, ifindex,
1462 	    ifp && ifp->if_delegated.ifp ? ifp->if_delegated.ifp->if_index : IFSCOPE_NONE,
1463 	    mp->mpp_socket->so_usecount, mp_tp->mpt_flags, mp_tp->mpt_state,
1464 	    has_v4, has_v6, has_nat64, low_power);
1465 
1466 	/* No need on fallen back sockets */
1467 	if (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) {
1468 		goto out;
1469 	}
1470 
1471 	/*
1472 	 * When the interface goes in low-power mode we don't want to establish
1473 	 * new subflows on it. Thus, mark it internally as non-viable.
1474 	 */
1475 	if (low_power) {
1476 		action = NECP_CLIENT_CBACTION_NONVIABLE;
1477 	}
1478 
1479 	if (action == NECP_CLIENT_CBACTION_NONVIABLE) {
1480 		for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
1481 			if (mpte->mpte_itfinfo[i].ifindex == IFSCOPE_NONE) {
1482 				continue;
1483 			}
1484 
1485 			if (mpte->mpte_itfinfo[i].ifindex == ifindex) {
1486 				mptcp_reset_itfinfo(&mpte->mpte_itfinfo[i]);
1487 			}
1488 		}
1489 
1490 		mptcp_sched_create_subflows(mpte);
1491 	} else if (action == NECP_CLIENT_CBACTION_VIABLE ||
1492 	    action == NECP_CLIENT_CBACTION_INITIAL) {
1493 		int found_slot = 0, slot_index = -1;
1494 		struct sockaddr *dst;
1495 
1496 		if (ifp == NULL) {
1497 			goto out;
1498 		}
1499 
1500 		if (IFNET_IS_COMPANION_LINK(ifp)) {
1501 			goto out;
1502 		}
1503 
1504 		if (IFNET_IS_EXPENSIVE(ifp) &&
1505 		    (mp_so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE)) {
1506 			goto out;
1507 		}
1508 
1509 		if (IFNET_IS_CONSTRAINED(ifp) &&
1510 		    (mp_so->so_restrictions & SO_RESTRICT_DENY_CONSTRAINED)) {
1511 			goto out;
1512 		}
1513 
1514 		if (IFNET_IS_CELLULAR(ifp) &&
1515 		    (mp_so->so_restrictions & SO_RESTRICT_DENY_CELLULAR)) {
1516 			goto out;
1517 		}
1518 
1519 		if (IS_INTF_CLAT46(ifp)) {
1520 			has_v4 = FALSE;
1521 		}
1522 
1523 		/* Look for the slot on where to store/update the interface-info. */
1524 		for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
1525 			/* Found a potential empty slot where we can put it */
1526 			if (mpte->mpte_itfinfo[i].ifindex == 0) {
1527 				found_slot = 1;
1528 				slot_index = i;
1529 			}
1530 
1531 			/*
1532 			 * The interface is already in our array. Check if we
1533 			 * need to update it.
1534 			 */
1535 			if (mpte->mpte_itfinfo[i].ifindex == ifindex &&
1536 			    (mpte->mpte_itfinfo[i].has_v4_conn != has_v4 ||
1537 			    mpte->mpte_itfinfo[i].has_v6_conn != has_v6 ||
1538 			    mpte->mpte_itfinfo[i].has_nat64_conn != has_nat64)) {
1539 				found_slot = 1;
1540 				slot_index = i;
1541 				break;
1542 			}
1543 
1544 			if (mpte->mpte_itfinfo[i].ifindex == ifindex) {
1545 				/*
1546 				 * Ok, it's already there and we don't need
1547 				 * to update it
1548 				 */
1549 				goto out;
1550 			}
1551 		}
1552 
1553 		dst = mptcp_get_session_dst(mpte, has_v6, has_v4);
1554 		if (dst && dst->sa_family == AF_INET &&
1555 		    has_v6 && !has_nat64 && !has_v4) {
1556 			if (found_slot) {
1557 				mpte->mpte_itfinfo[slot_index].ifindex = ifindex;
1558 				mpte->mpte_itfinfo[slot_index].has_v4_conn = has_v4;
1559 				mpte->mpte_itfinfo[slot_index].has_v6_conn = has_v6;
1560 				mpte->mpte_itfinfo[slot_index].has_nat64_conn = has_nat64;
1561 			}
1562 			goto out;
1563 		}
1564 
1565 		if (found_slot == 0) {
1566 			int new_size = mpte->mpte_itfinfo_size * 2;
1567 			struct mpt_itf_info *info = kalloc_data(sizeof(*info) * new_size, Z_ZERO);
1568 
1569 			if (info == NULL) {
1570 				os_log_error(mptcp_log_handle, "%s - %lx: malloc failed for %u\n",
1571 				    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), new_size);
1572 				goto out;
1573 			}
1574 
1575 			memcpy(info, mpte->mpte_itfinfo, mpte->mpte_itfinfo_size * sizeof(*info));
1576 
1577 			if (mpte->mpte_itfinfo_size > MPTE_ITFINFO_SIZE) {
1578 				kfree_data(mpte->mpte_itfinfo,
1579 				    sizeof(*info) * mpte->mpte_itfinfo_size);
1580 			}
1581 
1582 			/* We allocated a new one, thus the first must be empty */
1583 			slot_index = mpte->mpte_itfinfo_size;
1584 
1585 			mpte->mpte_itfinfo = info;
1586 			mpte->mpte_itfinfo_size = new_size;
1587 		}
1588 
1589 		VERIFY(slot_index >= 0 && slot_index < (int)mpte->mpte_itfinfo_size);
1590 		mpte->mpte_itfinfo[slot_index].ifindex = ifindex;
1591 		mpte->mpte_itfinfo[slot_index].has_v4_conn = has_v4;
1592 		mpte->mpte_itfinfo[slot_index].has_v6_conn = has_v6;
1593 		mpte->mpte_itfinfo[slot_index].has_nat64_conn = has_nat64;
1594 
1595 		mptcp_sched_create_subflows(mpte);
1596 	}
1597 
1598 out:
1599 	if (locked) {
1600 		socket_unlock(mp_so, 1);
1601 	}
1602 }
1603 
1604 void
mptcp_set_restrictions(struct socket * mp_so)1605 mptcp_set_restrictions(struct socket *mp_so)
1606 {
1607 	struct mptses *mpte = mpsotompte(mp_so);
1608 	uint32_t i;
1609 
1610 	socket_lock_assert_owned(mp_so);
1611 
1612 	ifnet_head_lock_shared();
1613 
1614 	for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
1615 		struct mpt_itf_info *info = &mpte->mpte_itfinfo[i];
1616 		uint32_t ifindex = info->ifindex;
1617 		struct ifnet *ifp;
1618 
1619 		if (ifindex == IFSCOPE_NONE) {
1620 			continue;
1621 		}
1622 
1623 		ifp = ifindex2ifnet[ifindex];
1624 		if (ifp == NULL) {
1625 			continue;
1626 		}
1627 
1628 		if (IFNET_IS_EXPENSIVE(ifp) &&
1629 		    (mp_so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE)) {
1630 			info->ifindex = IFSCOPE_NONE;
1631 		}
1632 
1633 		if (IFNET_IS_CONSTRAINED(ifp) &&
1634 		    (mp_so->so_restrictions & SO_RESTRICT_DENY_CONSTRAINED)) {
1635 			info->ifindex = IFSCOPE_NONE;
1636 		}
1637 
1638 		if (IFNET_IS_CELLULAR(ifp) &&
1639 		    (mp_so->so_restrictions & SO_RESTRICT_DENY_CELLULAR)) {
1640 			info->ifindex = IFSCOPE_NONE;
1641 		}
1642 	}
1643 
1644 	ifnet_head_done();
1645 }
1646 
1647 #define DUMP_BUF_CHK() {        \
1648 	clen -= k;              \
1649 	if (clen < 1)           \
1650 	        goto done;      \
1651 	c += k;                 \
1652 }
1653 
1654 int
dump_mptcp_reass_qlen(char * str,int str_len)1655 dump_mptcp_reass_qlen(char *str, int str_len)
1656 {
1657 	char *c = str;
1658 	int k, clen = str_len;
1659 
1660 	if (mptcp_reass_total_qlen != 0) {
1661 		k = scnprintf(c, clen, "\nmptcp reass qlen %d\n", mptcp_reass_total_qlen);
1662 		DUMP_BUF_CHK();
1663 	}
1664 
1665 done:
1666 	return str_len - clen;
1667 }
1668