/*
 * Copyright (c) 2012-2021 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

/*
 * A note on the MPTCP/NECP-interactions:
 *
 * MPTCP uses NECP-callbacks to get notified of interface/policy events.
 * MPTCP registers for interface-events at the MPTCP-layer
 * through a call to necp_client_register_multipath_cb.
 * To get per-flow events (aka per TCP-subflow), we register with
 * necp_client_register_socket_flow. Both registrations happen by using the
 * necp-client-uuid that comes from the app.
 *
 * The locking is rather tricky. In general, we expect the lock-ordering to
 * be necp-fd -> necp-client -> mpp_lock.
 *
 * There are however some subtleties.
 *
 * 1. When registering the multipath_cb, we are holding the mpp_lock. This is
 * safe, because it is the very first time this MPTCP-connection goes into NECP.
 * As we go into NECP we take the NECP-locks and thus are guaranteed that no
 * NECP-locks will deadlock us, because these NECP-events will also first take
 * the NECP-locks. Either they win the race and thus won't find our
 * MPTCP-connection, or MPTCP wins the race and thus it will safely install
 * the callbacks while holding the NECP lock.
 *
 * 2. When registering the subflow-callbacks we must unlock the mpp_lock,
 * because we have already registered callbacks and we might race against an
 * NECP-event that will match on our socket. So, we have to unlock to be safe.
 *
 * 3. When removing the multipath_cb, we do it in mp_pcbdispose(), once the
 * so_usecount has reached 0. We must be careful not to remove the mpp_socket
 * pointers before we have unregistered the callback, because, again, we might
 * be racing against an NECP-event. Unregistering must happen with an unlocked
 * mpp_lock, because of the lock-ordering constraint. It could be that an
 * NECP-event triggers before we had a chance to unregister. That's why
 * we need to check the so_usecount in mptcp_session_necp_cb. If we get
 * there while the socket is being garbage-collected, the use-count will have
 * gone down to 0 and we exit. Removal of the multipath_cb again happens by
 * taking the NECP-locks, so any running NECP-events will finish first and
 * exit cleanly.
 *
 * 4. When removing the subflow-callback, we do it in in_pcbdispose(). Again,
 * the socket-lock must be unlocked for lock-ordering constraints. This gets a
 * bit tricky here, as in tcp_garbage_collect we hold the mp_so and so lock.
 * So, we drop the mp_so-lock as soon as the subflow is unlinked with
 * mptcp_subflow_del. Then, in in_pcbdispose we drop the subflow-lock.
 * If an NECP-event was waiting on the lock in mptcp_subflow_necp_cb, when it
 * gets it, it will realize that the subflow became non-MPTCP and retry (see
 * tcp_lock). Then it waits again on the subflow-lock. When we drop this lock
 * in in_pcbdispose, and enter necp_inpcb_dispose, this one will have to wait
 * for the NECP-lock (held by the other thread that is taking care of the NECP-
 * event). So, the event now finally gets the subflow-lock and then hits an
 * so_usecount that is 0 and exits. Eventually, we can remove the subflow from
 * the NECP callback.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/mbuf.h>
#include <sys/mcache.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/syslog.h>
#include <sys/protosw.h>

#include <kern/zalloc.h>
#include <kern/locks.h>

#include <mach/sdt.h>

#include <net/droptap.h>
#include <net/if.h>
#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_var.h>
#include <netinet/mptcp_var.h>
#include <netinet/mptcp.h>
#include <netinet/mptcp_seq.h>
#include <netinet/mptcp_opt.h>
#include <netinet/mptcp_timer.h>

int mptcp_enable = 1;
SYSCTL_INT(_net_inet_mptcp, OID_AUTO, enable, CTLFLAG_RW | CTLFLAG_LOCKED,
    &mptcp_enable, 0, "Enable Multipath TCP Support");
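
/*
 * The tunables in this file are published as sysctls, mostly under the
 * net.inet.mptcp node (mptcp_preferred_version below sits under
 * net.inet.tcp). For example:
 *
 *	sysctl -w net.inet.mptcp.enable=1
 */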

/*
 * Number of times to try negotiating MPTCP on SYN retransmissions.
 * We haven't seen any reports of a middlebox that is dropping all SYN-segments
 * that have an MPTCP-option. Thus, let's be generous and retransmit it 4 times.
 */
int mptcp_mpcap_retries = 4;
SYSCTL_INT(_net_inet_mptcp, OID_AUTO, mptcp_cap_retr,
    CTLFLAG_RW | CTLFLAG_LOCKED,
    &mptcp_mpcap_retries, 0, "Number of MP Capable SYN Retries");

/*
 * By default, DSS checksum is turned off; revisit if we ever do
 * MPTCP for non-SSL traffic.
 */
int mptcp_dss_csum = 0;
SYSCTL_INT(_net_inet_mptcp, OID_AUTO, dss_csum, CTLFLAG_RW | CTLFLAG_LOCKED,
    &mptcp_dss_csum, 0, "Enable DSS checksum");

/*
 * When mptcp_fail_thresh number of retransmissions are sent, subflow failover
 * is attempted on a different path.
 */
int mptcp_fail_thresh = 1;
SYSCTL_INT(_net_inet_mptcp, OID_AUTO, fail, CTLFLAG_RW | CTLFLAG_LOCKED,
    &mptcp_fail_thresh, 0, "Failover threshold");

/*
 * MPTCP subflows have TCP keepalives set to ON. Set a conservative keeptime,
 * as carrier networks mostly have a 30-minute to 60-minute NAT timeout.
 * Some carrier networks have a timeout of 10 or 15 minutes.
 */
int mptcp_subflow_keeptime = 60 * 14;
SYSCTL_INT(_net_inet_mptcp, OID_AUTO, keepalive, CTLFLAG_RW | CTLFLAG_LOCKED,
    &mptcp_subflow_keeptime, 0, "Keepalive in seconds");

int mptcp_rtthist_rtthresh = 600;
SYSCTL_INT(_net_inet_mptcp, OID_AUTO, rtthist_thresh, CTLFLAG_RW | CTLFLAG_LOCKED,
    &mptcp_rtthist_rtthresh, 0, "Rtt threshold");

int mptcp_rtothresh = 1500;
SYSCTL_INT(_net_inet_mptcp, OID_AUTO, rto_thresh, CTLFLAG_RW | CTLFLAG_LOCKED,
    &mptcp_rtothresh, 0, "RTO threshold");

/*
 * Probe the preferred path when it is not in use.
 */
uint32_t mptcp_probeto = 1000;
SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, probeto, CTLFLAG_RW | CTLFLAG_LOCKED,
    &mptcp_probeto, 0, "Disable probing by setting to 0");

uint32_t mptcp_probecnt = 5;
SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, probecnt, CTLFLAG_RW | CTLFLAG_LOCKED,
    &mptcp_probecnt, 0, "Number of probe writes");

uint32_t mptcp_enable_v1 = 1;
SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, enable_v1, CTLFLAG_RW | CTLFLAG_LOCKED,
    &mptcp_enable_v1, 0, "Enable or disable v1");

static int
sysctl_mptcp_version_check SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2)
	int error;
	int new_value = *(int *)oidp->oid_arg1;
	int old_value = *(int *)oidp->oid_arg1;

	error = sysctl_handle_int(oidp, &new_value, 0, req);
	if (!error) {
		if (new_value != MPTCP_VERSION_0 && new_value != MPTCP_VERSION_1) {
			return EINVAL;
		}
		*(int *)oidp->oid_arg1 = new_value;
	}

	os_log(OS_LOG_DEFAULT,
	    "%s:%u sysctl net.inet.tcp.mptcp_preferred_version: %d -> %d",
	    proc_best_name(current_proc()), proc_selfpid(),
	    old_value, *(int *)oidp->oid_arg1);

	return error;
}

int mptcp_preferred_version = MPTCP_VERSION_1;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, mptcp_preferred_version,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
    &mptcp_preferred_version, 0, &sysctl_mptcp_version_check, "I", "");

int mptcp_reass_total_qlen = 0;
SYSCTL_INT(_net_inet_mptcp, OID_AUTO, reass_qlen,
    CTLFLAG_RD | CTLFLAG_LOCKED, &mptcp_reass_total_qlen, 0,
    "Total number of MPTCP segments in reassembly queues");

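/*
 * Deliver contiguous, in-order data from the reassembly queue to the MPTCP
 * socket, advancing mpt_rcvnxt through completed sequence space. Returns
 * non-zero if the last segment handed up carried a DATA_FIN.
 */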
static int
mptcp_reass_present(struct socket *mp_so)
{
	struct mptses *mpte = mpsotompte(mp_so);
	struct mptcb *mp_tp = mpte->mpte_mptcb;
	struct tseg_qent *q;
	int dowakeup = 0;
	int flags = 0;
	int count = 0;

	/*
	 * Present data to user, advancing rcv_nxt through
	 * completed sequence space.
	 */
	if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
		return flags;
	}
	q = LIST_FIRST(&mp_tp->mpt_segq);
	if (!q || q->tqe_m->m_pkthdr.mp_dsn != mp_tp->mpt_rcvnxt) {
		return flags;
	}

	/*
	 * If there is already another thread doing reassembly for this
	 * connection, it is better to let it finish the job --
	 * (radar 16316196)
	 */
	if (mp_tp->mpt_flags & MPTCPF_REASS_INPROG) {
		return flags;
	}

	mp_tp->mpt_flags |= MPTCPF_REASS_INPROG;

	do {
		mp_tp->mpt_rcvnxt += q->tqe_len;
		LIST_REMOVE(q, tqe_q);
		if (mp_so->so_state & SS_CANTRCVMORE) {
			m_freem(q->tqe_m);
		} else {
			flags = !!(q->tqe_m->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN);
			if (sbappendstream_rcvdemux(mp_so, q->tqe_m)) {
				dowakeup = 1;
			}
		}
		tcp_reass_qent_free(mp_so->so_proto, q);
		mp_tp->mpt_reassqlen--;
		count++;
		q = LIST_FIRST(&mp_tp->mpt_segq);
	} while (q && q->tqe_m->m_pkthdr.mp_dsn == mp_tp->mpt_rcvnxt);
	mp_tp->mpt_flags &= ~MPTCPF_REASS_INPROG;

	if (count > 0) {
		OSAddAtomic(-count, &mptcp_reass_total_qlen);
	}
	if (dowakeup) {
		sorwakeup(mp_so); /* done with socket lock held */
	}
	return flags;
}

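/*
 * Queue an out-of-order segment, keyed by its 64-bit data sequence number
 * (DSN), trimming any overlap with segments already queued, then try to
 * deliver whatever has become contiguous. The logic follows tcp_reass(),
 * operating at the MPTCP data level instead of the subflow level.
 */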
static int
mptcp_reass(struct socket *mp_so, struct pkthdr *phdr, int *tlenp, struct mbuf *m)
{
	struct mptcb *mp_tp = mpsotomppcb(mp_so)->mpp_pcbe->mpte_mptcb;
	u_int64_t mb_dsn = phdr->mp_dsn;
	struct tseg_qent *q;
	struct tseg_qent *p = NULL;
	struct tseg_qent *nq;
	struct tseg_qent *te = NULL;
	uint32_t qlimit;

	/*
	 * Limit the number of segments in the reassembly queue to prevent
	 * holding on to too many segments (and thus running out of mbufs).
	 * Make sure to let through the missing segment that caused this
	 * queue to build up.  Always keep one global queue entry spare to be
	 * able to process that missing segment.
	 */
	qlimit = MIN(MAX(100, mp_so->so_rcv.sb_hiwat >> 10),
	    (tcp_autorcvbuf_max >> 10));
	if (mb_dsn != mp_tp->mpt_rcvnxt &&
	    (mp_tp->mpt_reassqlen + 1) >= qlimit) {
		tcpstat.tcps_mptcp_rcvmemdrop++;
		m_freem(m);
		*tlenp = 0;
		return 0;
	}

	/* Allocate a new queue entry. If we can't, just drop the pkt. XXX */
	te = tcp_reass_qent_alloc(mp_so->so_proto);
	if (te == NULL) {
		m_drop_list(m, NULL,
		    DROPTAP_FLAG_DIR_IN | DROPTAP_FLAG_L2_MISSING,
		    DROP_REASON_MPTCP_REASSEMBLY_ALLOC, NULL, 0);
		*tlenp = 0;
		return 0;
	}

	mp_tp->mpt_reassqlen++;
	OSIncrementAtomic(&mptcp_reass_total_qlen);

	/*
	 * Find a segment which begins after this one does.
	 */
	LIST_FOREACH(q, &mp_tp->mpt_segq, tqe_q) {
		if (MPTCP_SEQ_GT(q->tqe_m->m_pkthdr.mp_dsn, mb_dsn)) {
			break;
		}
		p = q;
	}

	/*
	 * If there is a preceding segment, it may provide some of
	 * our data already.  If so, drop the data from the incoming
	 * segment.  If it provides all of our data, drop us.
	 */
	if (p != NULL) {
		int64_t i;
		/* conversion to int (in i) handles seq wraparound */
		i = p->tqe_m->m_pkthdr.mp_dsn + p->tqe_len - mb_dsn;
		if (i > 0) {
			if (i >= *tlenp) {
				tcpstat.tcps_mptcp_rcvduppack++;
				m_freem(m);
				tcp_reass_qent_free(mp_so->so_proto, te);
				te = NULL;
				mp_tp->mpt_reassqlen--;
				OSDecrementAtomic(&mptcp_reass_total_qlen);
				/*
				 * Try to present any queued data
				 * at the left window edge to the user.
				 * This is needed after the 3-WHS
				 * completes.
				 */
				goto out;
			}
			VERIFY(i <= INT_MAX);
			m_adj(m, (int)i);
			*tlenp -= i;
			phdr->mp_dsn += i;
		}
	}

	tcpstat.tcps_mp_oodata++;

	/*
	 * While we overlap succeeding segments trim them or,
	 * if they are completely covered, dequeue them.
	 */
	while (q) {
		int64_t i = (mb_dsn + *tlenp) - q->tqe_m->m_pkthdr.mp_dsn;
		if (i <= 0) {
			break;
		}

		if (i < q->tqe_len) {
			q->tqe_m->m_pkthdr.mp_dsn += i;
			q->tqe_len -= i;

			VERIFY(i <= INT_MAX);
			m_adj(q->tqe_m, (int)i);
			break;
		}

		nq = LIST_NEXT(q, tqe_q);
		LIST_REMOVE(q, tqe_q);
		m_freem(q->tqe_m);
		tcp_reass_qent_free(mp_so->so_proto, q);
		mp_tp->mpt_reassqlen--;
		OSDecrementAtomic(&mptcp_reass_total_qlen);
		q = nq;
	}

	/* Insert the new segment queue entry into place. */
	te->tqe_m = m;
	te->tqe_th = NULL;
	te->tqe_len = *tlenp;

	if (p == NULL) {
		LIST_INSERT_HEAD(&mp_tp->mpt_segq, te, tqe_q);
	} else {
		LIST_INSERT_AFTER(p, te, tqe_q);
	}

out:
	return mptcp_reass_present(mp_so);
}

/*
 * MPTCP input, called when data has been read from a subflow socket.
 */
void
mptcp_input(struct mptses *mpte, struct mbuf *m)
{
	struct socket *mp_so;
	struct mptcb *mp_tp = NULL;
	int count = 0, wakeup = 0;
	struct mbuf *save = NULL, *prev = NULL;
	struct mbuf *freelist = NULL, *tail = NULL;

	ASSERT(m->m_flags & M_PKTHDR);
	if (__improbable((m->m_flags & M_PKTHDR) == 0)) {
		m_drop_list(m, NULL, DROPTAP_FLAG_DIR_IN | DROPTAP_FLAG_L2_MISSING, DROP_REASON_MPTCP_INPUT_MALFORMED, NULL, 0);
		return;
	}

	mp_so = mptetoso(mpte);
	mp_tp = mpte->mpte_mptcb;

	socket_lock_assert_owned(mp_so);

	DTRACE_MPTCP(input);

	mp_tp->mpt_rcvwnd = imax(mptcp_sbspace(mp_tp), (int)(mp_tp->mpt_rcvadv - mp_tp->mpt_rcvnxt));

	/*
	 * Each mbuf contains an MPTCP Data Sequence Map.
	 * Process the data for reassembly, delivery to the MPTCP socket
	 * client, etc.
	 */
	count = mp_so->so_rcv.sb_cc;

	/*
	 * In the degraded fallback case, data is accepted without a DSS map.
	 */
	if (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) {
		struct mbuf *iter;
		int mb_dfin;
fallback:
		mb_dfin = 0;
		mptcp_sbrcv_grow(mp_tp);

		iter = m;
		while (iter) {
			if ((iter->m_flags & M_PKTHDR) &&
			    (iter->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN)) {
				mb_dfin = 1;
			}

			if ((iter->m_flags & M_PKTHDR) && m_pktlen(iter) == 0) {
				/* Don't add zero-length packets, so skip this one */
				if (prev == NULL) {
					m = iter->m_next;
					m_free(iter);
					iter = m;
				} else {
					prev->m_next = iter->m_next;
					m_free(iter);
					iter = prev->m_next;
				}

				/* It was a zero-length packet so next one must be a pkthdr */
				VERIFY(iter == NULL || iter->m_flags & M_PKTHDR);
			} else {
				prev = iter;
				iter = iter->m_next;
			}
		}

		/*
		 * Assume a degraded flow, as this may be the first packet
		 * without DSS, and the subflow state is not updated yet.
		 */
		if (sbappendstream_rcvdemux(mp_so, m)) {
			sorwakeup(mp_so);
		}

		DTRACE_MPTCP5(receive__degraded, struct mbuf *, m,
		    struct socket *, mp_so,
		    struct sockbuf *, &mp_so->so_rcv,
		    struct sockbuf *, &mp_so->so_snd,
		    struct mptses *, mpte);
		count = mp_so->so_rcv.sb_cc - count;

		mp_tp->mpt_rcvnxt += count;

		if (mb_dfin) {
			mptcp_close_fsm(mp_tp, MPCE_RECV_DATA_FIN);
			socantrcvmore(mp_so);
		}
		return;
	}

	do {
		u_int64_t mb_dsn;
		int32_t mb_datalen;
		int64_t todrop;
		int mb_dfin = 0;

		VERIFY(m->m_flags & M_PKTHDR);

		/* If fallback occurs, mbufs will not have PKTF_MPTCP set */
		if (!(m->m_pkthdr.pkt_flags & PKTF_MPTCP)) {
			goto fallback;
		}

		save = m->m_next;
		/*
		 * A single TCP packet formed of multiple mbufs
		 * holds the DSS mapping in the first mbuf of the chain.
		 * Other mbufs in the chain may have M_PKTHDR set
		 * even though they belong to the same TCP packet
		 * and therefore use the DSS mapping stored in the
		 * first mbuf of the mbuf chain. mptcp_input() can
		 * get an mbuf chain with multiple TCP packets.
		 */
		while (save && (!(save->m_flags & M_PKTHDR) ||
		    !(save->m_pkthdr.pkt_flags & PKTF_MPTCP))) {
			prev = save;
			save = save->m_next;
		}
		if (prev) {
			prev->m_next = NULL;
		} else {
			m->m_next = NULL;
		}

		mb_dsn = m->m_pkthdr.mp_dsn;
		mb_datalen = m->m_pkthdr.mp_rlen;

		todrop = (mb_dsn + mb_datalen) - (mp_tp->mpt_rcvnxt + mp_tp->mpt_rcvwnd);
		if (todrop > 0) {
			tcpstat.tcps_mptcp_rcvpackafterwin++;

			os_log_info(mptcp_log_handle, "%s - %lx: dropping dsn %u dlen %u rcvnxt %u rcvwnd %u todrop %lld\n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
			    (uint32_t)mb_dsn, mb_datalen, (uint32_t)mp_tp->mpt_rcvnxt,
			    mp_tp->mpt_rcvwnd, todrop);

			if (todrop >= mb_datalen) {
				if (freelist == NULL) {
					freelist = m;
				} else {
					tail->m_next = m;
				}

				if (prev != NULL) {
					tail = prev;
				} else {
					tail = m;
				}

				m = save;
				prev = save = NULL;
				continue;
			} else {
				VERIFY(todrop <= INT_MAX);
				m_adj(m, (int)-todrop);
				mb_datalen -= todrop;
				m->m_pkthdr.mp_rlen -= todrop;
			}

			/*
			 * We drop from the right edge of the mbuf, thus the
			 * DATA_FIN is dropped as well
			 */
			m->m_pkthdr.pkt_flags &= ~PKTF_MPTCP_DFIN;
		}

		if (MPTCP_SEQ_LT(mb_dsn, mp_tp->mpt_rcvnxt)) {
			if (MPTCP_SEQ_LEQ((mb_dsn + mb_datalen),
			    mp_tp->mpt_rcvnxt)) {
				if (freelist == NULL) {
					freelist = m;
				} else {
					tail->m_next = m;
				}

				if (prev != NULL) {
					tail = prev;
				} else {
					tail = m;
				}

				m = save;
				prev = save = NULL;
				continue;
			} else {
				VERIFY((mp_tp->mpt_rcvnxt - mb_dsn) <= INT_MAX);
				m_adj(m, (int)(mp_tp->mpt_rcvnxt - mb_dsn));
				mb_datalen -= (mp_tp->mpt_rcvnxt - mb_dsn);
				mb_dsn = mp_tp->mpt_rcvnxt;
				VERIFY(mb_datalen >= 0 && mb_datalen <= USHRT_MAX);
				m->m_pkthdr.mp_rlen = (uint16_t)mb_datalen;
				m->m_pkthdr.mp_dsn = mb_dsn;
			}
		}

		if (MPTCP_SEQ_GT(mb_dsn, mp_tp->mpt_rcvnxt) ||
		    !LIST_EMPTY(&mp_tp->mpt_segq)) {
			mb_dfin = mptcp_reass(mp_so, &m->m_pkthdr, &mb_datalen, m);

			goto next;
		}
		mb_dfin = !!(m->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN);

		mptcp_sbrcv_grow(mp_tp);

		if (sbappendstream_rcvdemux(mp_so, m)) {
			wakeup = 1;
		}

		DTRACE_MPTCP6(receive, struct mbuf *, m, struct socket *, mp_so,
		    struct sockbuf *, &mp_so->so_rcv,
		    struct sockbuf *, &mp_so->so_snd,
		    struct mptses *, mpte,
		    struct mptcb *, mp_tp);
		count = mp_so->so_rcv.sb_cc - count;
		tcpstat.tcps_mp_rcvtotal++;
		tcpstat.tcps_mp_rcvbytes += count;

		mp_tp->mpt_rcvnxt += count;

next:
		if (mb_dfin) {
			mptcp_close_fsm(mp_tp, MPCE_RECV_DATA_FIN);
			socantrcvmore(mp_so);
		}
		m = save;
		prev = save = NULL;
		count = mp_so->so_rcv.sb_cc;
	} while (m);

	if (freelist) {
		m_freem(freelist);
	}

	if (wakeup) {
		sorwakeup(mp_so);
	}
}

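/*
 * Decide whether the connection-level send path may transmit more data.
 * Note that the DATA_FIN occupies one data sequence number past the last
 * byte of payload (see mptcp_close_fsm), which is why "snd_nxt + 1 ==
 * snd_max" below means that everything but the DATA_FIN has been sent.
 */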
boolean_t
mptcp_can_send_more(struct mptcb *mp_tp, boolean_t ignore_reinject)
{
	struct socket *mp_so = mptetoso(mp_tp->mpt_mpte);

	/*
	 * Always send if there is data in the reinject-queue.
	 */
	if (!ignore_reinject && mp_tp->mpt_mpte->mpte_reinjectq) {
		return TRUE;
	}

	/*
	 * Don't send if:
	 *
	 * 1. snd_nxt >= snd_max: basically everything has been sent.
	 *    Except when using TFO, we might be doing a 0-byte write.
	 * 2. snd_una + snd_wnd <= snd_nxt: no space in the receiver's window.
	 * 3. snd_nxt + 1 == snd_max and we are closing: a DATA_FIN is scheduled.
	 */

	if (!(mp_so->so_flags1 & SOF1_PRECONNECT_DATA) && MPTCP_SEQ_GEQ(mp_tp->mpt_sndnxt, mp_tp->mpt_sndmax)) {
		return FALSE;
	}

	if (MPTCP_SEQ_LEQ(mp_tp->mpt_snduna + mp_tp->mpt_sndwnd, mp_tp->mpt_sndnxt)) {
		return FALSE;
	}

	if (mp_tp->mpt_sndnxt + 1 == mp_tp->mpt_sndmax && mp_tp->mpt_state > MPTCPS_CLOSE_WAIT) {
		return FALSE;
	}

	if (mp_tp->mpt_state >= MPTCPS_FIN_WAIT_2) {
		return FALSE;
	}

	return TRUE;
}

/*
 * MPTCP output.
 */
int
mptcp_output(struct mptses *mpte)
{
	struct mptcb *mp_tp;
	struct mptsub *mpts;
	struct mptsub *mpts_tried = NULL;
	struct socket *mp_so;
	struct mptsub *preferred_mpts __single = NULL;
	uint64_t old_snd_nxt;
	int error = 0;

	mp_so = mptetoso(mpte);
	mp_tp = mpte->mpte_mptcb;

	socket_lock_assert_owned(mp_so);

	if (mp_so->so_flags & SOF_DEFUNCT) {
		return 0;
	}

	VERIFY(!(mpte->mpte_mppcb->mpp_flags & MPP_WUPCALL));
	mpte->mpte_mppcb->mpp_flags |= MPP_WUPCALL;

	old_snd_nxt = mp_tp->mpt_sndnxt;
	while (mptcp_can_send_more(mp_tp, FALSE)) {
		/* get the "best" subflow to be used for transmission */
		mpts = mptcp_get_subflow(mpte, &preferred_mpts);
		if (mpts == NULL) {
			break;
		}

		/* In case there's just one flow, we reattempt later */
		if (mpts_tried != NULL &&
		    (mpts == mpts_tried || (mpts->mpts_flags & MPTSF_FAILINGOVER))) {
			mpts_tried->mpts_flags &= ~MPTSF_FAILINGOVER;
			mpts_tried->mpts_flags |= MPTSF_ACTIVE;
			mptcp_start_timer(mpte, MPTT_REXMT);
			break;
		}

		/*
		 * Automatic sizing of the send socket buffer. Increase the send
		 * socket buffer size if all of the following criteria are met:
		 *	1. the receiver has enough buffer space for this data
		 *	2. the send buffer is filled to 7/8th with data (so we
		 *	   actually have data to make use of it)
		 */
		if ((mp_so->so_snd.sb_flags & (SB_AUTOSIZE | SB_TRIM)) == SB_AUTOSIZE) {
			if ((mp_tp->mpt_sndwnd / 4 * 5) >= mp_so->so_snd.sb_hiwat &&
			    mp_so->so_snd.sb_cc >= (mp_so->so_snd.sb_hiwat / 8 * 7)) {
				if (sbreserve(&mp_so->so_snd,
				    min(mp_so->so_snd.sb_hiwat + tcp_autosndbuf_inc,
				    tcp_autosndbuf_max)) == 1) {
					mp_so->so_snd.sb_idealsize = mp_so->so_snd.sb_hiwat;
				}
			}
		}

		DTRACE_MPTCP3(output, struct mptses *, mpte, struct mptsub *, mpts,
		    struct socket *, mp_so);
		error = mptcp_subflow_output(mpte, mpts, 0);
		if (error) {
			/* can be a temporary loss of source address or other error */
			mpts->mpts_flags |= MPTSF_FAILINGOVER;
			mpts->mpts_flags &= ~MPTSF_ACTIVE;
			mpts_tried = mpts;
			if (error != ECANCELED) {
				os_log_error(mptcp_log_handle, "%s - %lx: Error = %d mpts_flags %#x\n",
				    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
				    error, mpts->mpts_flags);
			}
			break;
		}
		/* The model is to have only one active flow at a time */
		mpts->mpts_flags |= MPTSF_ACTIVE;
		mpts->mpts_probesoon = mpts->mpts_probecnt = 0;

		/* Allows us to update the smoothed rtt */
		if (mptcp_probeto && mpts != preferred_mpts && preferred_mpts != NULL) {
			if (preferred_mpts->mpts_probesoon) {
				if ((tcp_now - preferred_mpts->mpts_probesoon) > mptcp_probeto) {
					mptcp_subflow_output(mpte, preferred_mpts, MPTCP_SUBOUT_PROBING);
					if (preferred_mpts->mpts_probecnt >= mptcp_probecnt) {
						preferred_mpts->mpts_probesoon = 0;
						preferred_mpts->mpts_probecnt = 0;
					}
				}
			} else {
				preferred_mpts->mpts_probesoon = tcp_now;
				preferred_mpts->mpts_probecnt = 0;
			}
		}

		if (mpte->mpte_active_sub == NULL) {
			mpte->mpte_active_sub = mpts;
		} else if (mpte->mpte_active_sub != mpts) {
			mpte->mpte_active_sub->mpts_flags &= ~MPTSF_ACTIVE;
			mpte->mpte_active_sub = mpts;

			mptcpstats_inc_switch(mpte, mpts);
		}
	}

	if (mp_tp->mpt_state > MPTCPS_CLOSE_WAIT) {
		if (mp_tp->mpt_sndnxt + 1 == mp_tp->mpt_sndmax &&
		    mp_tp->mpt_snduna == mp_tp->mpt_sndnxt) {
			mptcp_finish_usrclosed(mpte);
		}
	}

	mptcp_handle_deferred_upcalls(mpte->mpte_mppcb, MPP_WUPCALL);

	/* subflow errors should not be percolated back up */
	return 0;
}


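/*
 * Compare a candidate subflow against the current best within one set
 * (cellular or non-cellular): prefer the lower smoothed RTT, but let a
 * loss-free subflow (t_rxtshift == 0) win over one that is retransmitting.
 * *currtt carries the best RTT seen so far.
 */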
static struct mptsub *
mptcp_choose_subflow(struct mptsub *mpts, struct mptsub *curbest, int *currtt)
{
	struct tcpcb *tp = sototcpcb(mpts->mpts_socket);

	/*
	 * Lower RTT? Take it, if it's our first one, or if it doesn't have
	 * any loss, or if the current one has loss as well.
	 */
	if (tp->t_srtt && *currtt > tp->t_srtt &&
	    (curbest == NULL || tp->t_rxtshift == 0 ||
	    sototcpcb(curbest->mpts_socket)->t_rxtshift)) {
		*currtt = tp->t_srtt;
		return mpts;
	}

	/*
	 * If we find a subflow without loss, always take it!
	 */
	if (curbest &&
	    sototcpcb(curbest->mpts_socket)->t_rxtshift &&
	    tp->t_rxtshift == 0) {
		*currtt = tp->t_srtt;
		return mpts;
	}

	return curbest != NULL ? curbest : mpts;
}

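/*
 * Gate a scheduling decision on congestion-window space: hand back the
 * subflow only if it can still take data, NULL otherwise.
 */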
static struct mptsub *
mptcp_return_subflow(struct mptsub *mpts)
{
	if (mpts && mptcp_subflow_cwnd_space(mpts->mpts_socket) <= 0) {
		return NULL;
	}

	return mpts;
}

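/*
 * A subflow counts as "slow" once it has retransmitted at least
 * mptcp_fail_thresh times (twice that for the handover modes) while there
 * is still data waiting to be sent or reinjected.
 */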
static boolean_t
mptcp_subflow_is_slow(struct mptses *mpte, struct mptsub *mpts)
{
	struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
	int fail_thresh = mptcp_fail_thresh;

	if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER || mpte->mpte_svctype == MPTCP_SVCTYPE_PURE_HANDOVER) {
		fail_thresh *= 2;
	}

	return tp->t_rxtshift >= fail_thresh &&
	       (mptetoso(mpte)->so_snd.sb_cc || mpte->mpte_reinjectq);
}

/*
 * Return the most eligible subflow to be used for sending data.
 */
struct mptsub *
mptcp_get_subflow(struct mptses *mpte, struct mptsub **preferred)
{
	struct tcpcb *besttp, *secondtp;
	struct inpcb *bestinp, *secondinp;
	struct mptsub *mpts;
	struct mptsub *best = NULL;
	struct mptsub *second_best = NULL;
	int exp_rtt = INT_MAX, cheap_rtt = INT_MAX;

	/*
	 * First Step:
	 * Choose the best subflow for cellular and non-cellular interfaces.
	 */

	TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
		struct socket *so = mpts->mpts_socket;
		struct tcpcb *tp = sototcpcb(so);
		struct inpcb *inp = sotoinpcb(so);

		/*
		 * First, the hard conditions to reject subflows
		 * (e.g., not connected, ...)
		 */
		if (inp->inp_last_outifp == NULL) {
			continue;
		}

		if (INP_WAIT_FOR_IF_FEEDBACK(inp)) {
			continue;
		}

		/* There can only be one subflow in degraded state */
		if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
			best = mpts;
			break;
		}

		/*
		 * If this subflow is waiting to finally send, do it!
		 */
		if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
			return mptcp_return_subflow(mpts);
		}

		/*
		 * Only send if the subflow is MP_CAPABLE. The exceptions to
		 * this rule (degraded or TFO) have been taken care of above.
		 */
		if (!(mpts->mpts_flags & MPTSF_MP_CAPABLE)) {
			continue;
		}

		if ((so->so_state & SS_ISDISCONNECTED) ||
		    !(so->so_state & SS_ISCONNECTED) ||
		    !TCPS_HAVEESTABLISHED(tp->t_state) ||
		    tp->t_state > TCPS_CLOSE_WAIT) {
			continue;
		}

		/*
		 * Second, the soft conditions to find the subflow with the
		 * best conditions for each set (aka cellular vs non-cellular).
		 */
		if (IFNET_IS_CELLULAR(inp->inp_last_outifp)) {
			second_best = mptcp_choose_subflow(mpts, second_best,
			    &exp_rtt);
		} else {
			best = mptcp_choose_subflow(mpts, best, &cheap_rtt);
		}
	}

	/*
	 * If there is no preferred or backup subflow, and there is no active
	 * subflow, use the last usable subflow.
	 */
	if (best == NULL) {
		return mptcp_return_subflow(second_best);
	}

	if (second_best == NULL) {
		return mptcp_return_subflow(best);
	}

	besttp = sototcpcb(best->mpts_socket);
	bestinp = sotoinpcb(best->mpts_socket);
	secondtp = sototcpcb(second_best->mpts_socket);
	secondinp = sotoinpcb(second_best->mpts_socket);

	if (preferred != NULL) {
		*preferred = mptcp_return_subflow(best);
	}

	/*
	 * Second Step: Among best and second_best, choose the one that is
	 * most appropriate for this particular service-type.
	 */
	if (mpte->mpte_svctype == MPTCP_SVCTYPE_PURE_HANDOVER) {
		return mptcp_return_subflow(best);
	} else if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER) {
		/*
		 * Only handover if Symptoms tells us to do so.
		 */
		if (!IFNET_IS_CELLULAR(bestinp->inp_last_outifp) &&
		    mptcp_wifi_quality_for_session(mpte) != MPTCP_WIFI_QUALITY_GOOD &&
		    mptcp_subflow_is_slow(mpte, best)) {
			return mptcp_return_subflow(second_best);
		}

		return mptcp_return_subflow(best);
	} else if (mpte->mpte_svctype == MPTCP_SVCTYPE_INTERACTIVE) {
		int rtt_thresh = mptcp_rtthist_rtthresh << TCP_RTT_SHIFT;
		int rto_thresh = mptcp_rtothresh;

		/* Adjust with symptoms information */
		if (!IFNET_IS_CELLULAR(bestinp->inp_last_outifp) &&
		    mptcp_wifi_quality_for_session(mpte) != MPTCP_WIFI_QUALITY_GOOD) {
			rtt_thresh /= 2;
			rto_thresh /= 2;
		}

		if (besttp->t_srtt && secondtp->t_srtt &&
		    besttp->t_srtt >= rtt_thresh &&
		    secondtp->t_srtt < rtt_thresh) {
			tcpstat.tcps_mp_sel_rtt++;
			return mptcp_return_subflow(second_best);
		}

		if (mptcp_subflow_is_slow(mpte, best) &&
		    secondtp->t_rxtshift == 0) {
			return mptcp_return_subflow(second_best);
		}

		/* Compare RTOs, select second_best if best's rto exceeds rtothresh */
		if (besttp->t_rxtcur && secondtp->t_rxtcur &&
		    besttp->t_rxtcur >= rto_thresh &&
		    secondtp->t_rxtcur < rto_thresh) {
			tcpstat.tcps_mp_sel_rto++;

			return mptcp_return_subflow(second_best);
		}

		/*
		 * None of the above conditions for sending on the secondary
		 * were true. So, let's schedule on the best one, if it still
		 * has some space in the congestion-window.
		 */
		return mptcp_return_subflow(best);
	} else if (mpte->mpte_svctype >= MPTCP_SVCTYPE_AGGREGATE) {
		struct mptsub *tmp;

		/*
		 * We only care about RTT when aggregating
		 */
		if (besttp->t_srtt > secondtp->t_srtt) {
			tmp = best;
			best = second_best;
			besttp = secondtp;
			bestinp = secondinp;

			second_best = tmp;
			secondtp = sototcpcb(second_best->mpts_socket);
			secondinp = sotoinpcb(second_best->mpts_socket);
		}

		/* Is there still space in the congestion window? */
		if (mptcp_subflow_cwnd_space(bestinp->inp_socket) <= 0) {
			return mptcp_return_subflow(second_best);
		}

		return mptcp_return_subflow(best);
	} else {
		panic("Unknown service-type configured for MPTCP");
	}

	return NULL;
}

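/*
 * Connection-level state machine for the DATA_FIN exchange. It mirrors the
 * TCP FIN transitions (FIN_WAIT_1, CLOSING, LAST_ACK, TIME_WAIT, ...) but
 * is driven by MPCE_* events at the data-sequence level: a DATA_FIN we send
 * occupies one data sequence number (hence mpt_sndmax += 1), and a received
 * one advances mpt_rcvnxt by 1.
 */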
void
mptcp_close_fsm(struct mptcb *mp_tp, uint32_t event)
{
	struct socket *mp_so = mptetoso(mp_tp->mpt_mpte);

	socket_lock_assert_owned(mp_so);

	DTRACE_MPTCP2(state__change, struct mptcb *, mp_tp,
	    uint32_t, event);

	switch (mp_tp->mpt_state) {
	case MPTCPS_CLOSED:
	case MPTCPS_LISTEN:
		mp_tp->mpt_state = MPTCPS_TERMINATE;
		break;

	case MPTCPS_ESTABLISHED:
		if (event == MPCE_CLOSE) {
			mp_tp->mpt_state = MPTCPS_FIN_WAIT_1;
			mp_tp->mpt_sndmax += 1; /* adjust for Data FIN */
		} else if (event == MPCE_RECV_DATA_FIN) {
			mp_tp->mpt_rcvnxt += 1; /* adj remote data FIN */
			mp_tp->mpt_state = MPTCPS_CLOSE_WAIT;
		}
		break;

	case MPTCPS_CLOSE_WAIT:
		if (event == MPCE_CLOSE) {
			mp_tp->mpt_state = MPTCPS_LAST_ACK;
			mp_tp->mpt_sndmax += 1; /* adjust for Data FIN */
		}
		break;

	case MPTCPS_FIN_WAIT_1:
		if (event == MPCE_RECV_DATA_ACK) {
			mp_tp->mpt_state = MPTCPS_FIN_WAIT_2;
		} else if (event == MPCE_RECV_DATA_FIN) {
			mp_tp->mpt_rcvnxt += 1; /* adj remote data FIN */
			mp_tp->mpt_state = MPTCPS_CLOSING;
		}
		break;

	case MPTCPS_CLOSING:
		if (event == MPCE_RECV_DATA_ACK) {
			mp_tp->mpt_state = MPTCPS_TIME_WAIT;
		}
		break;

	case MPTCPS_LAST_ACK:
		if (event == MPCE_RECV_DATA_ACK) {
			mptcp_close(mp_tp->mpt_mpte, mp_tp);
		}
		break;

	case MPTCPS_FIN_WAIT_2:
		if (event == MPCE_RECV_DATA_FIN) {
			mp_tp->mpt_rcvnxt += 1; /* adj remote data FIN */
			mp_tp->mpt_state = MPTCPS_TIME_WAIT;
		}
		break;

	case MPTCPS_TIME_WAIT:
	case MPTCPS_TERMINATE:
		break;

	default:
		VERIFY(0);
		/* NOTREACHED */
	}
	DTRACE_MPTCP2(state__change, struct mptcb *, mp_tp,
	    uint32_t, event);
}

/* If you change this function, match up mptcp_update_rcv_state_f */
void
mptcp_update_dss_rcv_state(struct mptcp_dsn_opt *dss_info, struct tcpcb *tp,
    uint16_t csum)
{
	struct mptcb *mp_tp = tptomptp(tp);
	u_int64_t full_dsn = 0;

	NTOHL(dss_info->mdss_dsn);
	NTOHL(dss_info->mdss_subflow_seqn);
	NTOHS(dss_info->mdss_data_len);

	/* XXX for autosndbuf grow sb here */
	MPTCP_EXTEND_DSN(mp_tp->mpt_rcvnxt, dss_info->mdss_dsn, full_dsn);
	mptcp_update_rcv_state_meat(mp_tp, tp,
	    full_dsn, dss_info->mdss_subflow_seqn, dss_info->mdss_data_len,
	    csum);
}

void
mptcp_update_rcv_state_meat(struct mptcb *mp_tp, struct tcpcb *tp,
    u_int64_t full_dsn, u_int32_t seqn, u_int16_t mdss_data_len,
    uint16_t csum)
{
	struct mptsub *mpts = tp->t_mpsub;

	if (mdss_data_len == 0) {
		os_log_error(mptcp_log_handle, "%s - %lx: Infinite Mapping.\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mp_tp->mpt_mpte));

		if ((mp_tp->mpt_flags & MPTCPF_CHECKSUM) && (csum != 0)) {
			os_log_error(mptcp_log_handle, "%s - %lx: Bad checksum %x \n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mp_tp->mpt_mpte), csum);
		}
		mptcp_notify_mpfail(tp->t_inpcb->inp_socket);
		return;
	}

	mptcp_notify_mpready(tp->t_inpcb->inp_socket);

	mpts->mpts_rcv_map.mpt_dsn = full_dsn;
	mpts->mpts_rcv_map.mpt_sseq = seqn;
	mpts->mpts_rcv_map.mpt_len = mdss_data_len;
	mpts->mpts_rcv_map.mpt_csum = csum;
	tp->t_mpflags |= TMPF_EMBED_DSN;
}

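/*
 * Verify the DSS checksum: fold the payload sum together with the MPTCP
 * pseudo-header (64-bit DSN, 32-bit subflow sequence number, 16-bit
 * data-level length, and the 16-bit checksum field itself; cf. the DSS
 * option in RFC 8684, Section 3.3). A DATA_FIN consumes one data sequence
 * number but carries no payload, hence real_len = dlen - dfin. Returns 0
 * if the checksum is valid or does not apply, non-zero otherwise.
 */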
static uint16_t
mptcp_input_csum(struct tcpcb *tp, struct mbuf *m, uint64_t dsn, uint32_t sseq,
    uint16_t dlen, uint16_t csum, int dfin)
{
	struct mptcb *mp_tp = tptomptp(tp);
	int real_len = dlen - dfin;
	uint32_t sum = 0;

	VERIFY(real_len >= 0);

	if (mp_tp == NULL) {
		return 0;
	}

	if (!(mp_tp->mpt_flags & MPTCPF_CHECKSUM)) {
		return 0;
	}

	if (tp->t_mpflags & TMPF_TCP_FALLBACK) {
		return 0;
	}

	/*
	 * The remote side may send a packet with fewer bytes than the
	 * claimed DSS checksum length.
	 */
	if ((int)m_length2(m, NULL) < real_len) {
		return 0xffff;
	}

	if (real_len != 0) {
		sum = m_sum16(m, 0, real_len);
	}

	sum += in_pseudo64(htonll(dsn), htonl(sseq), htons(dlen) + csum);
	ADDCARRY(sum);

	DTRACE_MPTCP3(checksum__result, struct tcpcb *, tp, struct mbuf *, m,
	    uint32_t, sum);

	return ~sum & 0xffff;
}

/*
 * MPTCP Checksum support
 * The checksum is calculated whenever the MPTCP DSS option is included
 * in the TCP packet. The checksum includes the sum of the MPTCP pseudo
 * header and the actual data indicated by the length specified in the
 * DSS option.
 */

int
mptcp_validate_csum(struct tcpcb *tp, struct mbuf *m, uint64_t dsn,
    uint32_t sseq, uint16_t dlen, uint16_t csum, int dfin)
{
	uint16_t mptcp_csum;

	mptcp_csum = mptcp_input_csum(tp, m, dsn, sseq, dlen, csum, dfin);
	if (mptcp_csum) {
		tp->t_mpflags |= TMPF_SND_MPFAIL;
		mptcp_notify_mpfail(tp->t_inpcb->inp_socket);
		m_freem(m);
		tcpstat.tcps_mp_badcsum++;
		return -1;
	}
	return 0;
}

uint16_t
mptcp_output_csum(struct mbuf *m, uint64_t dss_val, uint32_t sseq, uint16_t dlen)
{
	uint32_t sum = 0;

	if (dlen) {
		sum = m_sum16(m, 0, dlen);
	}

	dss_val = mptcp_hton64(dss_val);
	sseq = htonl(sseq);
	dlen = htons(dlen);
	sum += in_pseudo64(dss_val, sseq, dlen);

	ADDCARRY(sum);
	sum = ~sum & 0xffff;
	DTRACE_MPTCP2(checksum__result, struct mbuf *, m, uint32_t, sum);

	return (uint16_t)sum;
}

/*
 * When the WiFi signal starts fading, there's more loss and RTT spikes.
 * Check if there has been a large spike by comparing against
 * a tolerable RTT spike threshold.
 */
boolean_t
mptcp_no_rto_spike(struct socket *so)
{
	struct tcpcb *tp = intotcpcb(sotoinpcb(so));
	int32_t spike = 0;

	if (tp->t_rxtcur > mptcp_rtothresh) {
		spike = tp->t_rxtcur - mptcp_rtothresh;
	}

	if (spike > 0) {
		return FALSE;
	} else {
		return TRUE;
	}
}

void
mptcp_handle_deferred_upcalls(struct mppcb *mpp, uint32_t flag)
{
	VERIFY(mpp->mpp_flags & flag);
	mpp->mpp_flags &= ~flag;

	if (mptcp_should_defer_upcall(mpp)) {
		return;
	}

	if (mpp->mpp_flags & MPP_SHOULD_WORKLOOP) {
		mpp->mpp_flags &= ~MPP_SHOULD_WORKLOOP;

		mptcp_subflow_workloop(mpp->mpp_pcbe);
	}

	if (mpp->mpp_flags & MPP_SHOULD_RWAKEUP) {
		mpp->mpp_flags &= ~MPP_SHOULD_RWAKEUP;

		sorwakeup(mpp->mpp_socket);
	}

	if (mpp->mpp_flags & MPP_SHOULD_WWAKEUP) {
		mpp->mpp_flags &= ~MPP_SHOULD_WWAKEUP;

		sowwakeup(mpp->mpp_socket);
	}
}

static void
mptcp_reset_itfinfo(struct mpt_itf_info *info)
{
	memset(info, 0, sizeof(*info));
}

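/*
 * NECP callback for the MPTCP session. Tracks which interfaces are viable
 * for new subflows in the mpte_itfinfo array: non-viable (or low-power)
 * interfaces are removed, viable ones are added or updated (growing the
 * array if needed), and subflow creation is (re-)scheduled. See the note
 * on MPTCP/NECP-interactions at the top of this file for the locking and
 * use-count subtleties.
 */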
void
mptcp_session_necp_cb(void *handle, int action, uint32_t interface_index,
    uint32_t necp_flags, __unused bool *viable)
{
	boolean_t has_v4 = !!(necp_flags & NECP_CLIENT_RESULT_FLAG_HAS_IPV4);
	boolean_t has_v6 = !!(necp_flags & NECP_CLIENT_RESULT_FLAG_HAS_IPV6);
	boolean_t has_nat64 = !!(necp_flags & NECP_CLIENT_RESULT_FLAG_HAS_NAT64);
	boolean_t low_power = !!(necp_flags & NECP_CLIENT_RESULT_FLAG_INTERFACE_LOW_POWER);
	struct mppcb *mp = (struct mppcb *)handle;
	struct mptses *mpte = mptompte(mp);
	struct socket *mp_so;
	struct mptcb *mp_tp;
	uint32_t i, ifindex;
	struct ifnet *ifp;
	int locked = 0;

	ifindex = interface_index;
	VERIFY(ifindex != IFSCOPE_NONE);

	/* About to be garbage-collected (see note about MPTCP/NECP interactions) */
	if (mp->mpp_socket->so_usecount == 0) {
		return;
	}

	mp_so = mptetoso(mpte);

	if (action != NECP_CLIENT_CBACTION_INITIAL) {
		socket_lock(mp_so, 1);
		locked = 1;

		/* Check again, because it might have changed while waiting */
		if (mp->mpp_socket->so_usecount == 0) {
			goto out;
		}
	}

	socket_lock_assert_owned(mp_so);

	mp_tp = mpte->mpte_mptcb;

	ifnet_head_lock_shared();
	ifp = ifindex2ifnet[ifindex];
	ifnet_head_done();

	os_log(mptcp_log_handle, "%s - %lx: action: %u ifindex %u delegated to %u usecount %u mpt_flags %#x state %u v4 %u v6 %u nat64 %u power %u\n",
	    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), action, ifindex,
	    ifp && ifp->if_delegated.ifp ? ifp->if_delegated.ifp->if_index : IFSCOPE_NONE,
	    mp->mpp_socket->so_usecount, mp_tp->mpt_flags, mp_tp->mpt_state,
	    has_v4, has_v6, has_nat64, low_power);

	/* No need on sockets that have fallen back to TCP */
	if (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) {
		goto out;
	}

	/*
	 * When the interface goes into low-power mode we don't want to establish
	 * new subflows on it. Thus, mark it internally as non-viable.
	 */
	if (low_power) {
		action = NECP_CLIENT_CBACTION_NONVIABLE;
	}

	if (action == NECP_CLIENT_CBACTION_INITIAL) {
		mpte->mpte_flags |= MPTE_ITFINFO_INIT;
	}

	if (action == NECP_CLIENT_CBACTION_NONVIABLE) {
		for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
			if (mpte->mpte_itfinfo[i].ifindex == IFSCOPE_NONE) {
				continue;
			}

			if (mpte->mpte_itfinfo[i].ifindex == ifindex) {
				mptcp_reset_itfinfo(&mpte->mpte_itfinfo[i]);
			}
		}

		mptcp_sched_create_subflows(mpte);
	} else if (action == NECP_CLIENT_CBACTION_VIABLE ||
	    action == NECP_CLIENT_CBACTION_INITIAL) {
		int found_slot = 0, slot_index = -1;
		struct sockaddr *dst;

		if (ifp == NULL) {
			goto out;
		}

		if (IFNET_IS_COMPANION_LINK(ifp)) {
			goto out;
		}

		if (IFNET_IS_EXPENSIVE(ifp) &&
		    (mp_so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE)) {
			goto out;
		}

		if (IFNET_IS_CONSTRAINED(ifp) &&
		    (mp_so->so_restrictions & SO_RESTRICT_DENY_CONSTRAINED)) {
			goto out;
		}

		if (IFNET_IS_CELLULAR(ifp) &&
		    (mp_so->so_restrictions & SO_RESTRICT_DENY_CELLULAR)) {
			goto out;
		}

		if (IS_INTF_CLAT46(ifp)) {
			has_v4 = FALSE;
		}

		/* Look for the slot where to store/update the interface-info. */
		for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
			/* Found a potential empty slot where we can put it */
			if (mpte->mpte_itfinfo[i].ifindex == 0) {
				found_slot = 1;
				slot_index = i;
			}

			/*
			 * The interface is already in our array. Check if we
			 * need to update it.
			 */
			if (mpte->mpte_itfinfo[i].ifindex == ifindex &&
			    (mpte->mpte_itfinfo[i].has_v4_conn != has_v4 ||
			    mpte->mpte_itfinfo[i].has_v6_conn != has_v6 ||
			    mpte->mpte_itfinfo[i].has_nat64_conn != has_nat64)) {
				found_slot = 1;
				slot_index = i;
				break;
			}

			if (mpte->mpte_itfinfo[i].ifindex == ifindex) {
				/*
				 * Ok, it's already there and we don't need
				 * to update it
				 */
				goto out;
			}
		}

		dst = mptcp_get_session_dst(mpte, has_v6, has_v4);
		if (dst && dst->sa_family == AF_INET &&
		    has_v6 && !has_nat64 && !has_v4) {
			if (found_slot) {
				mpte->mpte_itfinfo[slot_index].ifindex = ifindex;
				mpte->mpte_itfinfo[slot_index].has_v4_conn = has_v4;
				mpte->mpte_itfinfo[slot_index].has_v6_conn = has_v6;
				mpte->mpte_itfinfo[slot_index].has_nat64_conn = has_nat64;
			}
			goto out;
		}

		if (found_slot == 0) {
			int new_size = mpte->mpte_itfinfo_size * 2;
			struct mpt_itf_info *info = kalloc_data(sizeof(*info) * new_size, Z_ZERO);

			if (info == NULL) {
				os_log_error(mptcp_log_handle, "%s - %lx: malloc failed for %u\n",
				    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), new_size);
				goto out;
			}

			memcpy(info, mpte->mpte_itfinfo, mpte->mpte_itfinfo_size * sizeof(*info));

			if (mpte->mpte_itfinfo_size > MPTE_ITFINFO_SIZE) {
				kfree_data_counted_by(mpte->mpte_itfinfo, mpte->mpte_itfinfo_size);
			}

			/* We allocated a new array, thus the first new slot must be empty */
			slot_index = mpte->mpte_itfinfo_size;

			mpte->mpte_itfinfo = info;
			mpte->mpte_itfinfo_size = new_size;
		}

		VERIFY(slot_index >= 0 && slot_index < (int)mpte->mpte_itfinfo_size);
		mpte->mpte_itfinfo[slot_index].ifindex = ifindex;
		mpte->mpte_itfinfo[slot_index].has_v4_conn = has_v4;
		mpte->mpte_itfinfo[slot_index].has_v6_conn = has_v6;
		mpte->mpte_itfinfo[slot_index].has_nat64_conn = has_nat64;

		mptcp_sched_create_subflows(mpte);
	}

out:
	if (locked) {
		socket_unlock(mp_so, 1);
	}
}

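/*
 * Re-apply the socket's interface restrictions (cellular, expensive,
 * constrained) to the known-interface array, dropping entries the socket
 * is no longer allowed to use.
 */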
void
mptcp_set_restrictions(struct socket *mp_so)
{
	struct mptses *mpte = mpsotompte(mp_so);
	uint32_t i;

	socket_lock_assert_owned(mp_so);

	ifnet_head_lock_shared();

	for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
		struct mpt_itf_info *info = &mpte->mpte_itfinfo[i];
		uint32_t ifindex = info->ifindex;
		struct ifnet *ifp;

		if (ifindex == IFSCOPE_NONE) {
			continue;
		}

		ifp = ifindex2ifnet[ifindex];
		if (ifp == NULL) {
			continue;
		}

		if (IFNET_IS_EXPENSIVE(ifp) &&
		    (mp_so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE)) {
			info->ifindex = IFSCOPE_NONE;
		}

		if (IFNET_IS_CONSTRAINED(ifp) &&
		    (mp_so->so_restrictions & SO_RESTRICT_DENY_CONSTRAINED)) {
			info->ifindex = IFSCOPE_NONE;
		}

		if (IFNET_IS_CELLULAR(ifp) &&
		    (mp_so->so_restrictions & SO_RESTRICT_DENY_CELLULAR)) {
			info->ifindex = IFSCOPE_NONE;
		}
	}

	ifnet_head_done();
}
1535