/*
 * Copyright (c) 2012-2021 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

/*
 * A note on the MPTCP/NECP interactions:
 *
 * MPTCP uses NECP callbacks to get notified of interface/policy events.
 * MPTCP registers for interface events at the MPTCP layer through a call
 * to necp_client_register_multipath_cb.
 * To get per-flow events (aka per TCP-subflow), we register with
 * necp_client_register_socket_flow. Both registrations happen by using the
 * necp-client-uuid that comes from the app.
 *
 * The locking is rather tricky. In general, we expect the lock-ordering to
 * be necp-fd -> necp-client -> mpp_lock.
 *
 * There are however some subtleties.
 *
 * 1. When registering the multipath_cb, we are holding the mpp_lock. This is
 * safe, because it is the very first time this MPTCP-connection goes into NECP.
 * As we go into NECP we take the NECP-locks and thus are guaranteed that no
 * NECP-locks will deadlock us, because the NECP-events will also first take
 * the NECP-locks. Either NECP wins the race and thus won't find our
 * MPTCP-connection, or MPTCP wins the race and safely installs the callbacks
 * while holding the NECP lock.
 *
 * 2. When registering the subflow-callbacks we must unlock the mpp_lock,
 * because we have already registered callbacks and we might race against an
 * NECP-event that will match on our socket. So, we have to unlock to be safe.
 *
 * 3. When removing the multipath_cb, we do it in mp_pcbdispose(), once the
 * so_usecount has reached 0. We must be careful not to remove the mpp_socket
 * pointers before we have unregistered the callback, because, again, we might
 * be racing against an NECP-event. Unregistering must happen with an unlocked
 * mpp_lock, because of the lock-ordering constraint. An NECP-event could
 * therefore trigger before we have had a chance to unregister. That's why
 * we need to check the so_usecount in mptcp_session_necp_cb. If we get
 * there while the socket is being garbage-collected, the use-count will go
 * down to 0 and we exit. Removal of the multipath_cb again happens by taking
 * the NECP-locks so any running NECP-events will finish first and exit cleanly.
 *
 * 4. When removing the subflow-callback, we do it in in_pcbdispose(). Again,
 * the socket-lock must be unlocked for lock-ordering constraints. This gets a
 * bit tricky here, as in tcp_garbage_collect we hold the mp_so and so lock.
 * So, we drop the mp_so-lock as soon as the subflow is unlinked with
 * mptcp_subflow_del. Then, in in_pcbdispose we drop the subflow-lock.
 * If an NECP-event was waiting on the lock in mptcp_subflow_necp_cb, when it
 * gets it, it will realize that the subflow became non-MPTCP and retry (see
 * tcp_lock). Then it waits again on the subflow-lock. When we drop this lock
 * in in_pcbdispose, and enter necp_inpcb_dispose, this one will have to wait
 * for the NECP-lock (held by the other thread that is taking care of the NECP-
 * event). So, the event now finally gets the subflow-lock and then hits an
 * so_usecount that is 0 and exits. Eventually, we can remove the subflow from
 * the NECP callback.
 */
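
/*
 * A rough sketch of the resulting call pattern (illustrative pseudo-code
 * only; socket_lock/socket_unlock stand in for taking and dropping the
 * mpp_lock):
 *
 *	// 1. multipath_cb registration -- holding the mpp_lock is safe
 *	//    here, as this connection is not yet known to NECP.
 *	socket_lock(mp_so, 0);
 *	necp_client_register_multipath_cb(...);	// takes NECP-locks inside
 *
 *	// 2. subflow registration -- drop the mpp_lock first, to respect
 *	//    the necp-fd -> necp-client -> mpp_lock ordering.
 *	socket_unlock(mp_so, 0);
 *	necp_client_register_socket_flow(...);
 *	socket_lock(mp_so, 0);
 */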

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/mbuf.h>
#include <sys/mcache.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/syslog.h>
#include <sys/protosw.h>

#include <kern/zalloc.h>
#include <kern/locks.h>

#include <mach/sdt.h>

#include <net/droptap.h>
#include <net/if.h>
#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_var.h>
#include <netinet/mptcp_var.h>
#include <netinet/mptcp.h>
#include <netinet/mptcp_seq.h>
#include <netinet/mptcp_opt.h>
#include <netinet/mptcp_timer.h>

int mptcp_enable = 1;
SYSCTL_INT(_net_inet_mptcp, OID_AUTO, enable, CTLFLAG_RW | CTLFLAG_LOCKED,
    &mptcp_enable, 0, "Enable Multipath TCP Support");

/*
 * Number of times to try negotiating MPTCP on SYN retransmissions.
 * We haven't seen any reports of a middlebox that drops all SYN-segments
 * carrying an MPTCP-option. Thus, let's be generous and retransmit it 4 times.
 */
int mptcp_mpcap_retries = 4;
SYSCTL_INT(_net_inet_mptcp, OID_AUTO, mptcp_cap_retr,
    CTLFLAG_RW | CTLFLAG_LOCKED,
    &mptcp_mpcap_retries, 0, "Number of MP Capable SYN Retries");

/*
 * By default, DSS checksum is turned off; revisit if we ever do
 * MPTCP for non-SSL traffic.
 */
int mptcp_dss_csum = 0;
SYSCTL_INT(_net_inet_mptcp, OID_AUTO, dss_csum, CTLFLAG_RW | CTLFLAG_LOCKED,
    &mptcp_dss_csum, 0, "Enable DSS checksum");

/*
 * When mptcp_fail_thresh retransmissions have been sent, subflow failover
 * is attempted on a different path.
 */
int mptcp_fail_thresh = 1;
SYSCTL_INT(_net_inet_mptcp, OID_AUTO, fail, CTLFLAG_RW | CTLFLAG_LOCKED,
    &mptcp_fail_thresh, 0, "Failover threshold");

/*
 * MPTCP subflows have TCP keepalives set to ON. Set a conservative keeptime
 * as carrier networks mostly have a 30 minute to 60 minute NAT timeout.
 * Some carrier networks have a timeout of 10 or 15 minutes.
 */
int mptcp_subflow_keeptime = 60 * 14;
SYSCTL_INT(_net_inet_mptcp, OID_AUTO, keepalive, CTLFLAG_RW | CTLFLAG_LOCKED,
    &mptcp_subflow_keeptime, 0, "Keepalive in seconds");

int mptcp_rtthist_rtthresh = 600;
SYSCTL_INT(_net_inet_mptcp, OID_AUTO, rtthist_thresh, CTLFLAG_RW | CTLFLAG_LOCKED,
    &mptcp_rtthist_rtthresh, 0, "RTT threshold");

int mptcp_rtothresh = 1500;
SYSCTL_INT(_net_inet_mptcp, OID_AUTO, rto_thresh, CTLFLAG_RW | CTLFLAG_LOCKED,
    &mptcp_rtothresh, 0, "RTO threshold");

/*
 * Probe the preferred path when it is not in use.
 */
uint32_t mptcp_probeto = 1000;
SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, probeto, CTLFLAG_RW | CTLFLAG_LOCKED,
    &mptcp_probeto, 0, "Disable probing by setting to 0");

uint32_t mptcp_probecnt = 5;
SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, probecnt, CTLFLAG_RW | CTLFLAG_LOCKED,
    &mptcp_probecnt, 0, "Number of probe writes");

uint32_t mptcp_enable_v1 = 1;
SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, enable_v1, CTLFLAG_RW | CTLFLAG_LOCKED,
    &mptcp_enable_v1, 0, "Enable or disable v1");

static int
sysctl_mptcp_version_check SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2)
	int error;
	int new_value = *(int *)oidp->oid_arg1;
	int old_value = *(int *)oidp->oid_arg1;

	error = sysctl_handle_int(oidp, &new_value, 0, req);
	if (!error) {
		if (new_value != MPTCP_VERSION_0 && new_value != MPTCP_VERSION_1) {
			return EINVAL;
		}
		*(int *)oidp->oid_arg1 = new_value;
	}

	os_log(OS_LOG_DEFAULT,
	    "%s:%u sysctl net.inet.tcp.mptcp_preferred_version: %d -> %d",
	    proc_best_name(current_proc()), proc_selfpid(),
	    old_value, *(int *)oidp->oid_arg1);

	return error;
}

int mptcp_preferred_version = MPTCP_VERSION_1;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, mptcp_preferred_version,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
    &mptcp_preferred_version, 0, &sysctl_mptcp_version_check, "I", "");

int mptcp_reass_total_qlen = 0;
SYSCTL_INT(_net_inet_mptcp, OID_AUTO, reass_qlen,
    CTLFLAG_RD | CTLFLAG_LOCKED, &mptcp_reass_total_qlen, 0,
    "Total number of MPTCP segments in reassembly queues");

static int
mptcp_reass_present(struct socket *mp_so)
{
	struct mptses *mpte = mpsotompte(mp_so);
	struct mptcb *mp_tp = mpte->mpte_mptcb;
	struct tseg_qent *q;
	int dowakeup = 0;
	int flags = 0;
	int count = 0;

	/*
	 * Present data to user, advancing rcv_nxt through
	 * completed sequence space.
	 */
	if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
		return flags;
	}
	q = LIST_FIRST(&mp_tp->mpt_segq);
	if (!q || q->tqe_m->m_pkthdr.mp_dsn != mp_tp->mpt_rcvnxt) {
		return flags;
	}

	/*
	 * If there is already another thread doing reassembly for this
	 * connection, it is better to let it finish the job --
	 * (radar 16316196)
	 */
	if (mp_tp->mpt_flags & MPTCPF_REASS_INPROG) {
		return flags;
	}

	mp_tp->mpt_flags |= MPTCPF_REASS_INPROG;

	do {
		mp_tp->mpt_rcvnxt += q->tqe_len;
		LIST_REMOVE(q, tqe_q);
		if (mp_so->so_state & SS_CANTRCVMORE) {
			m_freem(q->tqe_m);
		} else {
			flags = !!(q->tqe_m->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN);
			if (sbappendstream_rcvdemux(mp_so, q->tqe_m)) {
				dowakeup = 1;
			}
		}
		zfree(tcp_reass_zone, q);
		mp_tp->mpt_reassqlen--;
		count++;
		q = LIST_FIRST(&mp_tp->mpt_segq);
	} while (q && q->tqe_m->m_pkthdr.mp_dsn == mp_tp->mpt_rcvnxt);
	mp_tp->mpt_flags &= ~MPTCPF_REASS_INPROG;

	if (count > 0) {
		OSAddAtomic(-count, &mptcp_reass_total_qlen);
	}
	if (dowakeup) {
		sorwakeup(mp_so); /* done with socket lock held */
	}
	return flags;
}

static int
mptcp_reass(struct socket *mp_so, struct pkthdr *phdr, int *tlenp, struct mbuf *m)
{
	struct mptcb *mp_tp = mpsotomppcb(mp_so)->mpp_pcbe->mpte_mptcb;
	u_int64_t mb_dsn = phdr->mp_dsn;
	struct tseg_qent *q;
	struct tseg_qent *p = NULL;
	struct tseg_qent *nq;
	struct tseg_qent *te = NULL;
	uint32_t qlimit;

	/*
	 * Limit the number of segments in the reassembly queue to prevent
	 * holding on to too many segments (and thus running out of mbufs).
	 * Make sure to let through the missing segment that caused this
	 * queue to build up. Always keep one global queue entry spare to be
	 * able to process that missing segment.
	 */
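	/*
	 * Worked example with made-up numbers: if sb_hiwat were 512KB, then
	 * sb_hiwat >> 10 == 512 and MAX(100, 512) == 512; if
	 * tcp_autorcvbuf_max were 2MB, tcp_autorcvbuf_max >> 10 == 2048,
	 * so qlimit = MIN(512, 2048) = 512 reassembly-queue entries.
	 */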
	qlimit = MIN(MAX(100, mp_so->so_rcv.sb_hiwat >> 10),
	    (tcp_autorcvbuf_max >> 10));
	if (mb_dsn != mp_tp->mpt_rcvnxt &&
	    (mp_tp->mpt_reassqlen + 1) >= qlimit) {
		tcpstat.tcps_mptcp_rcvmemdrop++;
		m_freem(m);
		*tlenp = 0;
		return 0;
	}

	/* Allocate a new queue entry. If we can't, just drop the pkt. XXX */
	te = zalloc_flags(tcp_reass_zone, Z_WAITOK | Z_NOFAIL);

	mp_tp->mpt_reassqlen++;
	OSIncrementAtomic(&mptcp_reass_total_qlen);

	/*
	 * Find a segment which begins after this one does.
	 */
	LIST_FOREACH(q, &mp_tp->mpt_segq, tqe_q) {
		if (MPTCP_SEQ_GT(q->tqe_m->m_pkthdr.mp_dsn, mb_dsn)) {
			break;
		}
		p = q;
	}

	/*
	 * If there is a preceding segment, it may provide some of
	 * our data already. If so, drop the data from the incoming
	 * segment. If it provides all of our data, drop us.
	 */
	if (p != NULL) {
		int64_t i;
		/* conversion to int (in i) handles seq wraparound */
		i = p->tqe_m->m_pkthdr.mp_dsn + p->tqe_len - mb_dsn;
		if (i > 0) {
			if (i >= *tlenp) {
				tcpstat.tcps_mptcp_rcvduppack++;
				m_freem(m);
				zfree(tcp_reass_zone, te);
				te = NULL;
				mp_tp->mpt_reassqlen--;
				OSDecrementAtomic(&mptcp_reass_total_qlen);
				/*
				 * Try to present any queued data
				 * at the left window edge to the user.
				 * This is needed after the 3-WHS
				 * completes.
				 */
				goto out;
			}
			VERIFY(i <= INT_MAX);
			m_adj(m, (int)i);
			*tlenp -= i;
			phdr->mp_dsn += i;
		}
	}

	tcpstat.tcps_mp_oodata++;

	/*
	 * While we overlap succeeding segments trim them or,
	 * if they are completely covered, dequeue them.
	 */
	while (q) {
		int64_t i = (mb_dsn + *tlenp) - q->tqe_m->m_pkthdr.mp_dsn;
		if (i <= 0) {
			break;
		}

		if (i < q->tqe_len) {
			q->tqe_m->m_pkthdr.mp_dsn += i;
			q->tqe_len -= i;

			VERIFY(i <= INT_MAX);
			m_adj(q->tqe_m, (int)i);
			break;
		}

		nq = LIST_NEXT(q, tqe_q);
		LIST_REMOVE(q, tqe_q);
		m_freem(q->tqe_m);
		zfree(tcp_reass_zone, q);
		mp_tp->mpt_reassqlen--;
		OSDecrementAtomic(&mptcp_reass_total_qlen);
		q = nq;
	}

	/* Insert the new segment queue entry into place. */
	te->tqe_m = m;
	te->tqe_th = NULL;
	te->tqe_len = *tlenp;

	if (p == NULL) {
		LIST_INSERT_HEAD(&mp_tp->mpt_segq, te, tqe_q);
	} else {
		LIST_INSERT_AFTER(p, te, tqe_q);
	}

out:
	return mptcp_reass_present(mp_so);
}


/*
 * MPTCP input, called when data has been read from a subflow socket.
 */
void
mptcp_input(struct mptses *mpte, struct mbuf *m)
{
	struct socket *mp_so;
	struct mptcb *mp_tp = NULL;
	int count = 0, wakeup = 0;
	struct mbuf *save = NULL, *prev = NULL;
	struct mbuf *freelist = NULL, *tail = NULL;

	ASSERT(m->m_flags & M_PKTHDR);
	if (__improbable((m->m_flags & M_PKTHDR) == 0)) {
		m_drop_list(m, NULL, DROPTAP_FLAG_DIR_IN | DROPTAP_FLAG_L2_MISSING, DROP_REASON_MPTCP_INPUT_MALFORMED, NULL, 0);
		return;
	}

	mp_so = mptetoso(mpte);
	mp_tp = mpte->mpte_mptcb;

	socket_lock_assert_owned(mp_so);

	DTRACE_MPTCP(input);

	mp_tp->mpt_rcvwnd = mptcp_sbspace(mp_tp);

	/*
	 * Each mbuf contains an MPTCP Data Sequence Map.
	 * Process the data for reassembly, delivery to the MPTCP socket
	 * client, etc.
	 */
	count = mp_so->so_rcv.sb_cc;

	/*
	 * In the degraded fallback case, data is accepted without a DSS map.
	 */
	if (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) {
		struct mbuf *iter;
		int mb_dfin;
fallback:
		mb_dfin = 0;
		mptcp_sbrcv_grow(mp_tp);

		iter = m;
		while (iter) {
			if ((iter->m_flags & M_PKTHDR) &&
			    (iter->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN)) {
				mb_dfin = 1;
			}

			if ((iter->m_flags & M_PKTHDR) && m_pktlen(iter) == 0) {
				/* Don't add zero-length packets, so skip it! */
				if (prev == NULL) {
					m = iter->m_next;
					m_free(iter);
					iter = m;
				} else {
					prev->m_next = iter->m_next;
					m_free(iter);
					iter = prev->m_next;
				}

				/* It was a zero-length packet, so the next one must be a pkthdr */
				VERIFY(iter == NULL || iter->m_flags & M_PKTHDR);
			} else {
				prev = iter;
				iter = iter->m_next;
			}
		}

		/*
		 * Assume a degraded flow, as this may be the first packet
		 * without DSS, and the subflow state is not updated yet.
		 */
		if (sbappendstream_rcvdemux(mp_so, m)) {
			sorwakeup(mp_so);
		}

		DTRACE_MPTCP5(receive__degraded, struct mbuf *, m,
		    struct socket *, mp_so,
		    struct sockbuf *, &mp_so->so_rcv,
		    struct sockbuf *, &mp_so->so_snd,
		    struct mptses *, mpte);
		count = mp_so->so_rcv.sb_cc - count;

		mp_tp->mpt_rcvnxt += count;

		if (mb_dfin) {
			mptcp_close_fsm(mp_tp, MPCE_RECV_DATA_FIN);
			socantrcvmore(mp_so);
		}
		return;
	}

	do {
		u_int64_t mb_dsn;
		int32_t mb_datalen;
		int64_t todrop;
		int mb_dfin = 0;

		VERIFY(m->m_flags & M_PKTHDR);

		/* If fallback occurs, mbufs will not have PKTF_MPTCP set */
		if (!(m->m_pkthdr.pkt_flags & PKTF_MPTCP)) {
			goto fallback;
		}

		save = m->m_next;
		/*
		 * A single TCP packet formed of multiple mbufs
		 * holds the DSS mapping in the first mbuf of the chain.
		 * Other mbufs in the chain may have M_PKTHDR set
		 * even though they belong to the same TCP packet
		 * and therefore use the DSS mapping stored in the
		 * first mbuf of the mbuf chain. mptcp_input() can
		 * get an mbuf chain with multiple TCP packets.
		 */
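		/*
		 * Illustrative layout of such a chain ([H] marks M_PKTHDR,
		 * [H+DSS] additionally carries PKTF_MPTCP and thus a DSS
		 * mapping; not actual code):
		 *
		 *	[H+DSS]->[ ]->[H]->[ ]  ->  [H+DSS]->[ ]
		 *	\____________________/      \___________/
		 *	   first TCP packet          next packet
		 *
		 * The loop below advances to the next PKTF_MPTCP mbuf and
		 * cuts the chain there, so that m covers exactly one mapping.
		 */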
		while (save && (!(save->m_flags & M_PKTHDR) ||
		    !(save->m_pkthdr.pkt_flags & PKTF_MPTCP))) {
			prev = save;
			save = save->m_next;
		}
		if (prev) {
			prev->m_next = NULL;
		} else {
			m->m_next = NULL;
		}

		mb_dsn = m->m_pkthdr.mp_dsn;
		mb_datalen = m->m_pkthdr.mp_rlen;

		todrop = (mb_dsn + mb_datalen) - (mp_tp->mpt_rcvnxt + mp_tp->mpt_rcvwnd);
		if (todrop > 0) {
			tcpstat.tcps_mptcp_rcvpackafterwin++;

			os_log_info(mptcp_log_handle, "%s - %lx: dropping dsn %u dlen %u rcvnxt %u rcvwnd %u todrop %lld\n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
			    (uint32_t)mb_dsn, mb_datalen, (uint32_t)mp_tp->mpt_rcvnxt,
			    mp_tp->mpt_rcvwnd, todrop);

			if (todrop >= mb_datalen) {
				if (freelist == NULL) {
					freelist = m;
				} else {
					tail->m_next = m;
				}

				if (prev != NULL) {
					tail = prev;
				} else {
					tail = m;
				}

				m = save;
				prev = save = NULL;
				continue;
			} else {
				VERIFY(todrop <= INT_MAX);
				m_adj(m, (int)-todrop);
				mb_datalen -= todrop;
				m->m_pkthdr.mp_rlen -= todrop;
			}

			/*
			 * We drop from the right edge of the mbuf, thus the
			 * DATA_FIN is dropped as well
			 */
			m->m_pkthdr.pkt_flags &= ~PKTF_MPTCP_DFIN;
		}

		if (MPTCP_SEQ_LT(mb_dsn, mp_tp->mpt_rcvnxt)) {
			if (MPTCP_SEQ_LEQ((mb_dsn + mb_datalen),
			    mp_tp->mpt_rcvnxt)) {
				if (freelist == NULL) {
					freelist = m;
				} else {
					tail->m_next = m;
				}

				if (prev != NULL) {
					tail = prev;
				} else {
					tail = m;
				}

				m = save;
				prev = save = NULL;
				continue;
			} else {
				VERIFY((mp_tp->mpt_rcvnxt - mb_dsn) <= INT_MAX);
				m_adj(m, (int)(mp_tp->mpt_rcvnxt - mb_dsn));
				mb_datalen -= (mp_tp->mpt_rcvnxt - mb_dsn);
				mb_dsn = mp_tp->mpt_rcvnxt;
				VERIFY(mb_datalen >= 0 && mb_datalen <= USHRT_MAX);
				m->m_pkthdr.mp_rlen = (uint16_t)mb_datalen;
				m->m_pkthdr.mp_dsn = mb_dsn;
			}
		}

		if (MPTCP_SEQ_GT(mb_dsn, mp_tp->mpt_rcvnxt) ||
		    !LIST_EMPTY(&mp_tp->mpt_segq)) {
			mb_dfin = mptcp_reass(mp_so, &m->m_pkthdr, &mb_datalen, m);

			goto next;
		}
		mb_dfin = !!(m->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN);

		mptcp_sbrcv_grow(mp_tp);

		if (sbappendstream_rcvdemux(mp_so, m)) {
			wakeup = 1;
		}

		DTRACE_MPTCP6(receive, struct mbuf *, m, struct socket *, mp_so,
		    struct sockbuf *, &mp_so->so_rcv,
		    struct sockbuf *, &mp_so->so_snd,
		    struct mptses *, mpte,
		    struct mptcb *, mp_tp);
		count = mp_so->so_rcv.sb_cc - count;
		tcpstat.tcps_mp_rcvtotal++;
		tcpstat.tcps_mp_rcvbytes += count;

		mp_tp->mpt_rcvnxt += count;

next:
		if (mb_dfin) {
			mptcp_close_fsm(mp_tp, MPCE_RECV_DATA_FIN);
			socantrcvmore(mp_so);
		}
		m = save;
		prev = save = NULL;
		count = mp_so->so_rcv.sb_cc;
	} while (m);

	if (freelist) {
		m_freem(freelist);
	}

	if (wakeup) {
		sorwakeup(mp_so);
	}
}


boolean_t
mptcp_can_send_more(struct mptcb *mp_tp, boolean_t ignore_reinject)
{
	struct socket *mp_so = mptetoso(mp_tp->mpt_mpte);

	/*
	 * Always send if there is data in the reinject-queue.
	 */
	if (!ignore_reinject && mp_tp->mpt_mpte->mpte_reinjectq) {
		return TRUE;
	}

	/*
	 * Don't send, if:
	 *
	 * 1. snd_nxt >= snd_max: basically everything has been sent.
	 *    Except when using TFO, we might be doing a 0-byte write.
	 * 2. snd_una + snd_wnd <= snd_nxt: no space in the receiver's window.
	 * 3. snd_nxt + 1 == snd_max and we are closing: a DATA_FIN is scheduled.
	 */

	if (!(mp_so->so_flags1 & SOF1_PRECONNECT_DATA) && MPTCP_SEQ_GEQ(mp_tp->mpt_sndnxt, mp_tp->mpt_sndmax)) {
		return FALSE;
	}

	if (MPTCP_SEQ_LEQ(mp_tp->mpt_snduna + mp_tp->mpt_sndwnd, mp_tp->mpt_sndnxt)) {
		return FALSE;
	}

	if (mp_tp->mpt_sndnxt + 1 == mp_tp->mpt_sndmax && mp_tp->mpt_state > MPTCPS_CLOSE_WAIT) {
		return FALSE;
	}

	if (mp_tp->mpt_state >= MPTCPS_FIN_WAIT_2) {
		return FALSE;
	}

	return TRUE;
}


/*
 * MPTCP output.
 */
int
mptcp_output(struct mptses *mpte)
{
	struct mptcb *mp_tp;
	struct mptsub *mpts;
	struct mptsub *mpts_tried = NULL;
	struct socket *mp_so;
	struct mptsub *preferred_mpts = NULL;
	uint64_t old_snd_nxt;
	int error = 0;

	mp_so = mptetoso(mpte);
	mp_tp = mpte->mpte_mptcb;

	socket_lock_assert_owned(mp_so);

	if (mp_so->so_flags & SOF_DEFUNCT) {
		return 0;
	}

	VERIFY(!(mpte->mpte_mppcb->mpp_flags & MPP_WUPCALL));
	mpte->mpte_mppcb->mpp_flags |= MPP_WUPCALL;

	old_snd_nxt = mp_tp->mpt_sndnxt;
	while (mptcp_can_send_more(mp_tp, FALSE)) {
		/* get the "best" subflow to be used for transmission */
		mpts = mptcp_get_subflow(mpte, &preferred_mpts);
		if (mpts == NULL) {
			break;
		}

		/* In case there's just one flow, we reattempt later */
		if (mpts_tried != NULL &&
		    (mpts == mpts_tried || (mpts->mpts_flags & MPTSF_FAILINGOVER))) {
			mpts_tried->mpts_flags &= ~MPTSF_FAILINGOVER;
			mpts_tried->mpts_flags |= MPTSF_ACTIVE;
			mptcp_start_timer(mpte, MPTT_REXMT);
			break;
		}

		/*
		 * Automatic sizing of the send socket buffer. Increase the send
		 * socket buffer size if all of the following criteria are met:
		 * 1. the receiver has enough buffer space for this data
		 * 2. the send buffer is filled to 7/8th with data (so we
		 *    actually have data to make use of it)
		 */
		if ((mp_so->so_snd.sb_flags & (SB_AUTOSIZE | SB_TRIM)) == SB_AUTOSIZE) {
			if ((mp_tp->mpt_sndwnd / 4 * 5) >= mp_so->so_snd.sb_hiwat &&
			    mp_so->so_snd.sb_cc >= (mp_so->so_snd.sb_hiwat / 8 * 7)) {
				if (sbreserve(&mp_so->so_snd,
				    min(mp_so->so_snd.sb_hiwat + tcp_autosndbuf_inc,
				    tcp_autosndbuf_max)) == 1) {
					mp_so->so_snd.sb_idealsize = mp_so->so_snd.sb_hiwat;
				}
			}
		}

		DTRACE_MPTCP3(output, struct mptses *, mpte, struct mptsub *, mpts,
		    struct socket *, mp_so);
		error = mptcp_subflow_output(mpte, mpts, 0);
		if (error) {
			/* can be a temporary loss of source address or other error */
			mpts->mpts_flags |= MPTSF_FAILINGOVER;
			mpts->mpts_flags &= ~MPTSF_ACTIVE;
			mpts_tried = mpts;
			if (error != ECANCELED) {
				os_log_error(mptcp_log_handle, "%s - %lx: Error = %d mpts_flags %#x\n",
				    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
				    error, mpts->mpts_flags);
			}
			break;
		}
		/* The model is to have only one active flow at a time */
		mpts->mpts_flags |= MPTSF_ACTIVE;
		mpts->mpts_probesoon = mpts->mpts_probecnt = 0;

		/* Allows us to update the smoothed rtt */
		if (mptcp_probeto && mpts != preferred_mpts && preferred_mpts != NULL) {
			if (preferred_mpts->mpts_probesoon) {
				if ((tcp_now - preferred_mpts->mpts_probesoon) > mptcp_probeto) {
					mptcp_subflow_output(mpte, preferred_mpts, MPTCP_SUBOUT_PROBING);
					if (preferred_mpts->mpts_probecnt >= mptcp_probecnt) {
						preferred_mpts->mpts_probesoon = 0;
						preferred_mpts->mpts_probecnt = 0;
					}
				}
			} else {
				preferred_mpts->mpts_probesoon = tcp_now;
				preferred_mpts->mpts_probecnt = 0;
			}
		}

		if (mpte->mpte_active_sub == NULL) {
			mpte->mpte_active_sub = mpts;
		} else if (mpte->mpte_active_sub != mpts) {
			mpte->mpte_active_sub->mpts_flags &= ~MPTSF_ACTIVE;
			mpte->mpte_active_sub = mpts;

			mptcpstats_inc_switch(mpte, mpts);
		}
	}

	if (mp_tp->mpt_state > MPTCPS_CLOSE_WAIT) {
		if (mp_tp->mpt_sndnxt + 1 == mp_tp->mpt_sndmax &&
		    mp_tp->mpt_snduna == mp_tp->mpt_sndnxt) {
			mptcp_finish_usrclosed(mpte);
		}
	}

	mptcp_handle_deferred_upcalls(mpte->mpte_mppcb, MPP_WUPCALL);

	/* subflow errors should not be percolated back up */
	return 0;
}



static struct mptsub *
mptcp_choose_subflow(struct mptsub *mpts, struct mptsub *curbest, int *currtt)
{
	struct tcpcb *tp = sototcpcb(mpts->mpts_socket);

	/*
	 * Lower RTT? Take it, if it's our first one, or
	 * it doesn't have any loss, or the current one has
	 * loss as well.
	 */
	if (tp->t_srtt && *currtt > tp->t_srtt &&
	    (curbest == NULL || tp->t_rxtshift == 0 ||
	    sototcpcb(curbest->mpts_socket)->t_rxtshift)) {
		*currtt = tp->t_srtt;
		return mpts;
	}

	/*
	 * If we find a subflow without loss, always take it!
	 */
	if (curbest &&
	    sototcpcb(curbest->mpts_socket)->t_rxtshift &&
	    tp->t_rxtshift == 0) {
		*currtt = tp->t_srtt;
		return mpts;
	}

	return curbest != NULL ? curbest : mpts;
}

static struct mptsub *
mptcp_return_subflow(struct mptsub *mpts)
{
	if (mpts && mptcp_subflow_cwnd_space(mpts->mpts_socket) <= 0) {
		return NULL;
	}

	return mpts;
}

static boolean_t
mptcp_subflow_is_slow(struct mptses *mpte, struct mptsub *mpts)
{
	struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
	int fail_thresh = mptcp_fail_thresh;

	if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER || mpte->mpte_svctype == MPTCP_SVCTYPE_PURE_HANDOVER) {
		fail_thresh *= 2;
	}

	return tp->t_rxtshift >= fail_thresh &&
	       (mptetoso(mpte)->so_snd.sb_cc || mpte->mpte_reinjectq);
}


/*
 * Return the most eligible subflow to be used for sending data.
 */
struct mptsub *
mptcp_get_subflow(struct mptses *mpte, struct mptsub **preferred)
{
	struct tcpcb *besttp, *secondtp;
	struct inpcb *bestinp, *secondinp;
	struct mptsub *mpts;
	struct mptsub *best = NULL;
	struct mptsub *second_best = NULL;
	int exp_rtt = INT_MAX, cheap_rtt = INT_MAX;

	/*
	 * First Step:
	 * Choose the best subflow for cellular and non-cellular interfaces.
	 */

	TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
		struct socket *so = mpts->mpts_socket;
		struct tcpcb *tp = sototcpcb(so);
		struct inpcb *inp = sotoinpcb(so);

		/*
		 * First, the hard conditions to reject subflows
		 * (e.g., not connected, ...)
		 */
		if (inp->inp_last_outifp == NULL) {
			continue;
		}

		if (INP_WAIT_FOR_IF_FEEDBACK(inp)) {
			continue;
		}

		/* There can only be one subflow in degraded state */
		if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
			best = mpts;
			break;
		}

		/*
		 * If this subflow is waiting to finally send, do it!
		 */
		if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
			return mptcp_return_subflow(mpts);
		}

		/*
		 * Only send if the subflow is MP_CAPABLE. The exceptions to
		 * this rule (degraded or TFO) have been taken care of above.
		 */
		if (!(mpts->mpts_flags & MPTSF_MP_CAPABLE)) {
			continue;
		}

		if ((so->so_state & SS_ISDISCONNECTED) ||
		    !(so->so_state & SS_ISCONNECTED) ||
		    !TCPS_HAVEESTABLISHED(tp->t_state) ||
		    tp->t_state > TCPS_CLOSE_WAIT) {
			continue;
		}

		/*
		 * Second, the soft conditions to find the subflow with the
		 * best conditions for each set (aka cellular vs non-cellular)
		 */
		if (IFNET_IS_CELLULAR(inp->inp_last_outifp)) {
			second_best = mptcp_choose_subflow(mpts, second_best,
			    &exp_rtt);
		} else {
			best = mptcp_choose_subflow(mpts, best, &cheap_rtt);
		}
	}

	/*
	 * If there is no preferred or backup subflow, and there is no active
	 * subflow, use the last usable subflow.
	 */
	if (best == NULL) {
		return mptcp_return_subflow(second_best);
	}

	if (second_best == NULL) {
		return mptcp_return_subflow(best);
	}

	besttp = sototcpcb(best->mpts_socket);
	bestinp = sotoinpcb(best->mpts_socket);
	secondtp = sototcpcb(second_best->mpts_socket);
	secondinp = sotoinpcb(second_best->mpts_socket);

	if (preferred != NULL) {
		*preferred = mptcp_return_subflow(best);
	}

	/*
	 * Second Step: Among best and second_best, choose the one that is
	 * most appropriate for this particular service-type.
	 */
	if (mpte->mpte_svctype == MPTCP_SVCTYPE_PURE_HANDOVER) {
		return mptcp_return_subflow(best);
	} else if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER) {
		/*
		 * Only handover if Symptoms tells us to do so.
		 */
		if (!IFNET_IS_CELLULAR(bestinp->inp_last_outifp) &&
		    mptcp_wifi_quality_for_session(mpte) != MPTCP_WIFI_QUALITY_GOOD &&
		    mptcp_subflow_is_slow(mpte, best)) {
			return mptcp_return_subflow(second_best);
		}

		return mptcp_return_subflow(best);
	} else if (mpte->mpte_svctype == MPTCP_SVCTYPE_INTERACTIVE) {
		int rtt_thresh = mptcp_rtthist_rtthresh << TCP_RTT_SHIFT;
		int rto_thresh = mptcp_rtothresh;

		/* Adjust with symptoms information */
		if (!IFNET_IS_CELLULAR(bestinp->inp_last_outifp) &&
		    mptcp_wifi_quality_for_session(mpte) != MPTCP_WIFI_QUALITY_GOOD) {
			rtt_thresh /= 2;
			rto_thresh /= 2;
		}

		if (besttp->t_srtt && secondtp->t_srtt &&
		    besttp->t_srtt >= rtt_thresh &&
		    secondtp->t_srtt < rtt_thresh) {
			tcpstat.tcps_mp_sel_rtt++;
			return mptcp_return_subflow(second_best);
		}

		if (mptcp_subflow_is_slow(mpte, best) &&
		    secondtp->t_rxtshift == 0) {
			return mptcp_return_subflow(second_best);
		}

		/* Compare RTOs, select second_best if best's rto exceeds rtothresh */
		if (besttp->t_rxtcur && secondtp->t_rxtcur &&
		    besttp->t_rxtcur >= rto_thresh &&
		    secondtp->t_rxtcur < rto_thresh) {
			tcpstat.tcps_mp_sel_rto++;

			return mptcp_return_subflow(second_best);
		}

		/*
		 * None of the above conditions for sending on the secondary
		 * were true. So, let's schedule on the best one, if it still
		 * has some space in its congestion-window.
		 */
		return mptcp_return_subflow(best);
	} else if (mpte->mpte_svctype >= MPTCP_SVCTYPE_AGGREGATE) {
		struct mptsub *tmp;

		/*
		 * We only care about RTT when aggregating
		 */
		if (besttp->t_srtt > secondtp->t_srtt) {
			tmp = best;
			best = second_best;
			besttp = secondtp;
			bestinp = secondinp;

			second_best = tmp;
			secondtp = sototcpcb(second_best->mpts_socket);
			secondinp = sotoinpcb(second_best->mpts_socket);
		}

		/* Is there still space in the congestion window? */
		if (mptcp_subflow_cwnd_space(bestinp->inp_socket) <= 0) {
			return mptcp_return_subflow(second_best);
		}

		return mptcp_return_subflow(best);
	} else {
		panic("Unknown service-type configured for MPTCP");
	}

	return NULL;
}


void
mptcp_close_fsm(struct mptcb *mp_tp, uint32_t event)
{
	struct socket *mp_so = mptetoso(mp_tp->mpt_mpte);

	socket_lock_assert_owned(mp_so);

	DTRACE_MPTCP2(state__change, struct mptcb *, mp_tp,
	    uint32_t, event);

	switch (mp_tp->mpt_state) {
	case MPTCPS_CLOSED:
	case MPTCPS_LISTEN:
		mp_tp->mpt_state = MPTCPS_TERMINATE;
		break;

	case MPTCPS_ESTABLISHED:
		if (event == MPCE_CLOSE) {
			mp_tp->mpt_state = MPTCPS_FIN_WAIT_1;
			mp_tp->mpt_sndmax += 1; /* adjust for Data FIN */
		} else if (event == MPCE_RECV_DATA_FIN) {
			mp_tp->mpt_rcvnxt += 1; /* adj remote data FIN */
			mp_tp->mpt_state = MPTCPS_CLOSE_WAIT;
		}
		break;

	case MPTCPS_CLOSE_WAIT:
		if (event == MPCE_CLOSE) {
			mp_tp->mpt_state = MPTCPS_LAST_ACK;
			mp_tp->mpt_sndmax += 1; /* adjust for Data FIN */
		}
		break;

	case MPTCPS_FIN_WAIT_1:
		if (event == MPCE_RECV_DATA_ACK) {
			mp_tp->mpt_state = MPTCPS_FIN_WAIT_2;
		} else if (event == MPCE_RECV_DATA_FIN) {
			mp_tp->mpt_rcvnxt += 1; /* adj remote data FIN */
			mp_tp->mpt_state = MPTCPS_CLOSING;
		}
		break;

	case MPTCPS_CLOSING:
		if (event == MPCE_RECV_DATA_ACK) {
			mp_tp->mpt_state = MPTCPS_TIME_WAIT;
		}
		break;

	case MPTCPS_LAST_ACK:
		if (event == MPCE_RECV_DATA_ACK) {
			mptcp_close(mp_tp->mpt_mpte, mp_tp);
		}
		break;

	case MPTCPS_FIN_WAIT_2:
		if (event == MPCE_RECV_DATA_FIN) {
			mp_tp->mpt_rcvnxt += 1; /* adj remote data FIN */
			mp_tp->mpt_state = MPTCPS_TIME_WAIT;
		}
		break;

	case MPTCPS_TIME_WAIT:
	case MPTCPS_TERMINATE:
		break;

	default:
		VERIFY(0);
		/* NOTREACHED */
	}
	DTRACE_MPTCP2(state__change, struct mptcb *, mp_tp,
	    uint32_t, event);
}


/* If you change this function, match up mptcp_update_rcv_state_f */
void
mptcp_update_dss_rcv_state(struct mptcp_dsn_opt *dss_info, struct tcpcb *tp,
    uint16_t csum)
{
	struct mptcb *mp_tp = tptomptp(tp);
	u_int64_t full_dsn = 0;

	NTOHL(dss_info->mdss_dsn);
	NTOHL(dss_info->mdss_subflow_seqn);
	NTOHS(dss_info->mdss_data_len);

	/* XXX for autosndbuf grow sb here */
	MPTCP_EXTEND_DSN(mp_tp->mpt_rcvnxt, dss_info->mdss_dsn, full_dsn);
	mptcp_update_rcv_state_meat(mp_tp, tp,
	    full_dsn, dss_info->mdss_subflow_seqn, dss_info->mdss_data_len,
	    csum);
}

void
mptcp_update_rcv_state_meat(struct mptcb *mp_tp, struct tcpcb *tp,
    u_int64_t full_dsn, u_int32_t seqn, u_int16_t mdss_data_len,
    uint16_t csum)
{
	if (mdss_data_len == 0) {
		os_log_error(mptcp_log_handle, "%s - %lx: Infinite Mapping.\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mp_tp->mpt_mpte));

		if ((mp_tp->mpt_flags & MPTCPF_CHECKSUM) && (csum != 0)) {
			os_log_error(mptcp_log_handle, "%s - %lx: Bad checksum %x \n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mp_tp->mpt_mpte), csum);
		}
		mptcp_notify_mpfail(tp->t_inpcb->inp_socket);
		return;
	}

	mptcp_notify_mpready(tp->t_inpcb->inp_socket);

	tp->t_rcv_map.mpt_dsn = full_dsn;
	tp->t_rcv_map.mpt_sseq = seqn;
	tp->t_rcv_map.mpt_len = mdss_data_len;
	tp->t_rcv_map.mpt_csum = csum;
	tp->t_mpflags |= TMPF_EMBED_DSN;
}



static uint16_t
mptcp_input_csum(struct tcpcb *tp, struct mbuf *m, uint64_t dsn, uint32_t sseq,
    uint16_t dlen, uint16_t csum, int dfin)
{
	struct mptcb *mp_tp = tptomptp(tp);
	int real_len = dlen - dfin;
	uint32_t sum = 0;

	VERIFY(real_len >= 0);

	if (mp_tp == NULL) {
		return 0;
	}

	if (!(mp_tp->mpt_flags & MPTCPF_CHECKSUM)) {
		return 0;
	}

	if (tp->t_mpflags & TMPF_TCP_FALLBACK) {
		return 0;
	}

	/*
	 * The remote side may send a packet with fewer bytes than the
	 * claimed DSS checksum length.
	 */
	if ((int)m_length2(m, NULL) < real_len) {
		return 0xffff;
	}

	if (real_len != 0) {
		sum = m_sum16(m, 0, real_len);
	}

	sum += in_pseudo64(htonll(dsn), htonl(sseq), htons(dlen) + csum);
	ADDCARRY(sum);

	DTRACE_MPTCP3(checksum__result, struct tcpcb *, tp, struct mbuf *, m,
	    uint32_t, sum);

	return ~sum & 0xffff;
}


/*
 * MPTCP checksum support
 * The checksum is calculated whenever the MPTCP DSS option is included
 * in the TCP packet. The checksum includes the sum of the MPTCP pseudo-
 * header and the actual data indicated by the length specified in the
 * DSS option.
 */
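/*
 * Layout of the DSS pseudo-header covered by the checksum (see RFC 6824 /
 * RFC 8684, Section 3.3):
 *
 *	+--------------------------------------------------+
 *	|          Data Sequence Number (8 octets)         |
 *	+--------------------------------------------------+
 *	|        Subflow Sequence Number (4 octets)        |
 *	+-------------------------+------------------------+
 *	|  Data-Level Length (2)  |   Checksum/Zeros (2)   |
 *	+-------------------------+------------------------+
 *
 * The in_pseudo64() calls above and below fold exactly these fields, in
 * network byte order, into the 16-bit one's-complement sum of the payload.
 */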

int
mptcp_validate_csum(struct tcpcb *tp, struct mbuf *m, uint64_t dsn,
    uint32_t sseq, uint16_t dlen, uint16_t csum, int dfin)
{
	uint16_t mptcp_csum;

	mptcp_csum = mptcp_input_csum(tp, m, dsn, sseq, dlen, csum, dfin);
	if (mptcp_csum) {
		tp->t_mpflags |= TMPF_SND_MPFAIL;
		mptcp_notify_mpfail(tp->t_inpcb->inp_socket);
		m_freem(m);
		tcpstat.tcps_mp_badcsum++;
		return -1;
	}
	return 0;
}

uint16_t
mptcp_output_csum(struct mbuf *m, uint64_t dss_val, uint32_t sseq, uint16_t dlen)
{
	uint32_t sum = 0;

	if (dlen) {
		sum = m_sum16(m, 0, dlen);
	}

	dss_val = mptcp_hton64(dss_val);
	sseq = htonl(sseq);
	dlen = htons(dlen);
	sum += in_pseudo64(dss_val, sseq, dlen);

	ADDCARRY(sum);
	sum = ~sum & 0xffff;
	DTRACE_MPTCP2(checksum__result, struct mbuf *, m, uint32_t, sum);

	return (uint16_t)sum;
}


/*
 * When WiFi signal starts fading, there's more loss and RTT spikes.
 * Check if there has been a large spike by comparing against
 * a tolerable RTT spike threshold.
 */
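/*
 * Worked example (illustrative numbers only): with the default
 * mptcp_rtothresh of 1500, a subflow whose current retransmission
 * timeout t_rxtcur has grown to 2000 yields spike = 500 > 0, so the
 * function below reports the spike by returning FALSE.
 */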
boolean_t
mptcp_no_rto_spike(struct socket *so)
{
	struct tcpcb *tp = intotcpcb(sotoinpcb(so));
	int32_t spike = 0;

	if (tp->t_rxtcur > mptcp_rtothresh) {
		spike = tp->t_rxtcur - mptcp_rtothresh;
	}

	if (spike > 0) {
		return FALSE;
	} else {
		return TRUE;
	}
}


void
mptcp_handle_deferred_upcalls(struct mppcb *mpp, uint32_t flag)
{
	VERIFY(mpp->mpp_flags & flag);
	mpp->mpp_flags &= ~flag;

	if (mptcp_should_defer_upcall(mpp)) {
		return;
	}

	if (mpp->mpp_flags & MPP_SHOULD_WORKLOOP) {
		mpp->mpp_flags &= ~MPP_SHOULD_WORKLOOP;

		mptcp_subflow_workloop(mpp->mpp_pcbe);
	}

	if (mpp->mpp_flags & MPP_SHOULD_RWAKEUP) {
		mpp->mpp_flags &= ~MPP_SHOULD_RWAKEUP;

		sorwakeup(mpp->mpp_socket);
	}

	if (mpp->mpp_flags & MPP_SHOULD_WWAKEUP) {
		mpp->mpp_flags &= ~MPP_SHOULD_WWAKEUP;

		sowwakeup(mpp->mpp_socket);
	}
}

static void
mptcp_reset_itfinfo(struct mpt_itf_info *info)
{
	memset(info, 0, sizeof(*info));
}


void
mptcp_session_necp_cb(void *handle, int action, uint32_t interface_index,
    uint32_t necp_flags, __unused bool *viable)
{
	boolean_t has_v4 = !!(necp_flags & NECP_CLIENT_RESULT_FLAG_HAS_IPV4);
	boolean_t has_v6 = !!(necp_flags & NECP_CLIENT_RESULT_FLAG_HAS_IPV6);
	boolean_t has_nat64 = !!(necp_flags & NECP_CLIENT_RESULT_FLAG_HAS_NAT64);
	boolean_t low_power = !!(necp_flags & NECP_CLIENT_RESULT_FLAG_INTERFACE_LOW_POWER);
	struct mppcb *mp = (struct mppcb *)handle;
	struct mptses *mpte = mptompte(mp);
	struct socket *mp_so;
	struct mptcb *mp_tp;
	uint32_t i, ifindex;
	struct ifnet *ifp;
	int locked = 0;

	ifindex = interface_index;
	VERIFY(ifindex != IFSCOPE_NONE);

	/* About to be garbage-collected (see note about MPTCP/NECP interactions) */
	if (mp->mpp_socket->so_usecount == 0) {
		return;
	}

	mp_so = mptetoso(mpte);

	if (action != NECP_CLIENT_CBACTION_INITIAL) {
		socket_lock(mp_so, 1);
		locked = 1;

		/* Check again, because it might have changed while waiting */
		if (mp->mpp_socket->so_usecount == 0) {
			goto out;
		}
	}

	socket_lock_assert_owned(mp_so);

	mp_tp = mpte->mpte_mptcb;

	ifnet_head_lock_shared();
	ifp = ifindex2ifnet[ifindex];
	ifnet_head_done();

	os_log(mptcp_log_handle, "%s - %lx: action: %u ifindex %u delegated to %u usecount %u mpt_flags %#x state %u v4 %u v6 %u nat64 %u power %u\n",
	    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), action, ifindex,
	    ifp && ifp->if_delegated.ifp ? ifp->if_delegated.ifp->if_index : IFSCOPE_NONE,
	    mp->mpp_socket->so_usecount, mp_tp->mpt_flags, mp_tp->mpt_state,
	    has_v4, has_v6, has_nat64, low_power);

	/* No need on fallen back sockets */
	if (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) {
		goto out;
	}

	/*
	 * When the interface goes into low-power mode we don't want to
	 * establish new subflows on it. Thus, mark it internally as non-viable.
	 */
	if (low_power) {
		action = NECP_CLIENT_CBACTION_NONVIABLE;
	}

	if (action == NECP_CLIENT_CBACTION_INITIAL) {
		mpte->mpte_flags |= MPTE_ITFINFO_INIT;
	}

	if (action == NECP_CLIENT_CBACTION_NONVIABLE) {
		for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
			if (mpte->mpte_itfinfo[i].ifindex == IFSCOPE_NONE) {
				continue;
			}

			if (mpte->mpte_itfinfo[i].ifindex == ifindex) {
				mptcp_reset_itfinfo(&mpte->mpte_itfinfo[i]);
			}
		}

		mptcp_sched_create_subflows(mpte);
	} else if (action == NECP_CLIENT_CBACTION_VIABLE ||
	    action == NECP_CLIENT_CBACTION_INITIAL) {
		int found_slot = 0, slot_index = -1;
		struct sockaddr *dst;

		if (ifp == NULL) {
			goto out;
		}

		if (IFNET_IS_COMPANION_LINK(ifp)) {
			goto out;
		}

		if (IFNET_IS_EXPENSIVE(ifp) &&
		    (mp_so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE)) {
			goto out;
		}

		if (IFNET_IS_CONSTRAINED(ifp) &&
		    (mp_so->so_restrictions & SO_RESTRICT_DENY_CONSTRAINED)) {
			goto out;
		}

		if (IFNET_IS_CELLULAR(ifp) &&
		    (mp_so->so_restrictions & SO_RESTRICT_DENY_CELLULAR)) {
			goto out;
		}

		if (IS_INTF_CLAT46(ifp)) {
			has_v4 = FALSE;
		}

		/* Look for the slot on where to store/update the interface-info. */
		for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
			/* Found a potential empty slot where we can put it */
			if (mpte->mpte_itfinfo[i].ifindex == 0) {
				found_slot = 1;
				slot_index = i;
			}

			/*
			 * The interface is already in our array. Check if we
			 * need to update it.
			 */
			if (mpte->mpte_itfinfo[i].ifindex == ifindex &&
			    (mpte->mpte_itfinfo[i].has_v4_conn != has_v4 ||
			    mpte->mpte_itfinfo[i].has_v6_conn != has_v6 ||
			    mpte->mpte_itfinfo[i].has_nat64_conn != has_nat64)) {
				found_slot = 1;
				slot_index = i;
				break;
			}

			if (mpte->mpte_itfinfo[i].ifindex == ifindex) {
				/*
				 * Ok, it's already there and we don't need
				 * to update it
				 */
				goto out;
			}
		}

		dst = mptcp_get_session_dst(mpte, has_v6, has_v4);
		if (dst && dst->sa_family == AF_INET &&
		    has_v6 && !has_nat64 && !has_v4) {
			if (found_slot) {
				mpte->mpte_itfinfo[slot_index].ifindex = ifindex;
				mpte->mpte_itfinfo[slot_index].has_v4_conn = has_v4;
				mpte->mpte_itfinfo[slot_index].has_v6_conn = has_v6;
				mpte->mpte_itfinfo[slot_index].has_nat64_conn = has_nat64;
			}
			goto out;
		}

		if (found_slot == 0) {
			int new_size = mpte->mpte_itfinfo_size * 2;
			struct mpt_itf_info *info = kalloc_data(sizeof(*info) * new_size, Z_ZERO);

			if (info == NULL) {
				os_log_error(mptcp_log_handle, "%s - %lx: malloc failed for %u\n",
				    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), new_size);
				goto out;
			}

			memcpy(info, mpte->mpte_itfinfo, mpte->mpte_itfinfo_size * sizeof(*info));

			if (mpte->mpte_itfinfo_size > MPTE_ITFINFO_SIZE) {
				kfree_data(mpte->mpte_itfinfo,
				    sizeof(*info) * mpte->mpte_itfinfo_size);
			}

			/* We allocated a new array, thus the first new slot must be empty */
			slot_index = mpte->mpte_itfinfo_size;

			mpte->mpte_itfinfo = info;
			mpte->mpte_itfinfo_size = new_size;
		}

		VERIFY(slot_index >= 0 && slot_index < (int)mpte->mpte_itfinfo_size);
		mpte->mpte_itfinfo[slot_index].ifindex = ifindex;
		mpte->mpte_itfinfo[slot_index].has_v4_conn = has_v4;
		mpte->mpte_itfinfo[slot_index].has_v6_conn = has_v6;
		mpte->mpte_itfinfo[slot_index].has_nat64_conn = has_nat64;

		mptcp_sched_create_subflows(mpte);
	}

out:
	if (locked) {
		socket_unlock(mp_so, 1);
	}
}


void
mptcp_set_restrictions(struct socket *mp_so)
{
	struct mptses *mpte = mpsotompte(mp_so);
	uint32_t i;

	socket_lock_assert_owned(mp_so);

	ifnet_head_lock_shared();

	for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
		struct mpt_itf_info *info = &mpte->mpte_itfinfo[i];
		uint32_t ifindex = info->ifindex;
		struct ifnet *ifp;

		if (ifindex == IFSCOPE_NONE) {
			continue;
		}

		ifp = ifindex2ifnet[ifindex];
		if (ifp == NULL) {
			continue;
		}

		if (IFNET_IS_EXPENSIVE(ifp) &&
		    (mp_so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE)) {
			info->ifindex = IFSCOPE_NONE;
		}

		if (IFNET_IS_CONSTRAINED(ifp) &&
		    (mp_so->so_restrictions & SO_RESTRICT_DENY_CONSTRAINED)) {
			info->ifindex = IFSCOPE_NONE;
		}

		if (IFNET_IS_CELLULAR(ifp) &&
		    (mp_so->so_restrictions & SO_RESTRICT_DENY_CELLULAR)) {
			info->ifindex = IFSCOPE_NONE;
		}
	}

	ifnet_head_done();
}


#define DUMP_BUF_CHK() {        \
	clen -= k;              \
	if (clen < 1)           \
		goto done;      \
	c += k;                 \
}
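/*
 * DUMP_BUF_CHK() expects three locals in the calling scope: c (the current
 * write position), clen (the remaining buffer space) and k (the number of
 * bytes just written by scnprintf), plus a "done:" label to jump to once
 * the buffer is exhausted -- see dump_mptcp_reass_qlen() below for the
 * usage pattern.
 */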

int
dump_mptcp_reass_qlen(char *str, int str_len)
{
	char *c = str;
	int k, clen = str_len;

	if (mptcp_reass_total_qlen != 0) {
		k = scnprintf(c, clen, "\nmptcp reass qlen %d\n", mptcp_reass_total_qlen);
		DUMP_BUF_CHK();
	}

done:
	return str_len - clen;
}