1 /*
2 * Copyright (c) 2012-2021 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 /*
30 * A note on the MPTCP/NECP-interactions:
31 *
32 * MPTCP uses NECP-callbacks to get notified of interface/policy events.
33 * MPTCP registers to these events at the MPTCP-layer for interface-events
34 * through a call to necp_client_register_multipath_cb.
35 * To get per-flow events (aka per TCP-subflow), we register to it with
36 * necp_client_register_socket_flow. Both registrations happen by using the
37 * necp-client-uuid that comes from the app.
38 *
 * The locking is rather tricky. In general, we expect the lock-ordering to
 * be necp-fd -> necp-client -> mpp_lock.
41 *
42 * There are however some subtleties.
43 *
44 * 1. When registering the multipath_cb, we are holding the mpp_lock. This is
45 * safe, because it is the very first time this MPTCP-connection goes into NECP.
46 * As we go into NECP we take the NECP-locks and thus are guaranteed that no
47 * NECP-locks will deadlock us. Because these NECP-events will also first take
48 * the NECP-locks. Either they win the race and thus won't find our
49 * MPTCP-connection. Or, MPTCP wins the race and thus it will safely install
50 * the callbacks while holding the NECP lock.
51 *
 * 2. When registering the subflow-callbacks we must unlock the mpp_lock. This
 * is because we have already registered callbacks and we might race against an
 * NECP-event that will match on our socket. So, we have to unlock to be safe.
55 *
56 * 3. When removing the multipath_cb, we do it in mp_pcbdispose(). The
57 * so_usecount has reached 0. We must be careful to not remove the mpp_socket
58 * pointers before we unregistered the callback. Because, again we might be
59 * racing against an NECP-event. Unregistering must happen with an unlocked
60 * mpp_lock, because of the lock-ordering constraint. It could be that
61 * before we had a chance to unregister an NECP-event triggers. That's why
62 * we need to check for the so_usecount in mptcp_session_necp_cb. If we get
63 * there while the socket is being garbage-collected, the use-count will go
64 * down to 0 and we exit. Removal of the multipath_cb again happens by taking
65 * the NECP-locks so any running NECP-events will finish first and exit cleanly.
66 *
67 * 4. When removing the subflow-callback, we do it in in_pcbdispose(). Again,
68 * the socket-lock must be unlocked for lock-ordering constraints. This gets a
69 * bit tricky here, as in tcp_garbage_collect we hold the mp_so and so lock.
70 * So, we drop the mp_so-lock as soon as the subflow is unlinked with
71 * mptcp_subflow_del. Then, in in_pcbdispose we drop the subflow-lock.
72 * If an NECP-event was waiting on the lock in mptcp_subflow_necp_cb, when it
73 * gets it, it will realize that the subflow became non-MPTCP and retry (see
74 * tcp_lock). Then it waits again on the subflow-lock. When we drop this lock
75 * in in_pcbdispose, and enter necp_inpcb_dispose, this one will have to wait
76 * for the NECP-lock (held by the other thread that is taking care of the NECP-
77 * event). So, the event now finally gets the subflow-lock and then hits an
78 * so_usecount that is 0 and exits. Eventually, we can remove the subflow from
79 * the NECP callback.
80 */
81
82 #include <sys/param.h>
83 #include <sys/systm.h>
84 #include <sys/kernel.h>
85 #include <sys/mbuf.h>
86 #include <sys/mcache.h>
87 #include <sys/socket.h>
88 #include <sys/socketvar.h>
89 #include <sys/syslog.h>
90 #include <sys/protosw.h>
91
92 #include <kern/zalloc.h>
93 #include <kern/locks.h>
94
95 #include <mach/sdt.h>
96
97 #include <net/if.h>
98 #include <netinet/in.h>
99 #include <netinet/in_var.h>
100 #include <netinet/tcp.h>
101 #include <netinet/tcp_fsm.h>
102 #include <netinet/tcp_seq.h>
103 #include <netinet/tcp_var.h>
104 #include <netinet/mptcp_var.h>
105 #include <netinet/mptcp.h>
106 #include <netinet/mptcp_seq.h>
107 #include <netinet/mptcp_opt.h>
108 #include <netinet/mptcp_timer.h>
109
/*
 * Tunable registered as "enable" under _net_inet_mptcp. Per its sysctl
 * description it enables Multipath TCP support; it is not read in the
 * portion of the file reviewed here.
 */
int mptcp_enable = 1;
SYSCTL_INT(_net_inet_mptcp, OID_AUTO, enable, CTLFLAG_RW | CTLFLAG_LOCKED,
    &mptcp_enable, 0, "Enable Multipath TCP Support");

/*
 * Number of times to try negotiating MPTCP on SYN retransmissions.
 * We haven't seen any reports of a middlebox that is dropping all SYN-segments
 * that have an MPTCP-option. Thus, let's be generous and retransmit it 4 times.
 */
int mptcp_mpcap_retries = 4;
SYSCTL_INT(_net_inet_mptcp, OID_AUTO, mptcp_cap_retr,
    CTLFLAG_RW | CTLFLAG_LOCKED,
    &mptcp_mpcap_retries, 0, "Number of MP Capable SYN Retries");

/*
 * By default, DSS checksum is turned off, revisit if we ever do
 * MPTCP for non SSL Traffic.
 */
int mptcp_dss_csum = 0;
SYSCTL_INT(_net_inet_mptcp, OID_AUTO, dss_csum, CTLFLAG_RW | CTLFLAG_LOCKED,
    &mptcp_dss_csum, 0, "Enable DSS checksum");

/*
 * When mptcp_fail_thresh number of retransmissions are sent, subflow failover
 * is attempted on a different path.
 *
 * mptcp_subflow_is_slow() compares a subflow's t_rxtshift against this
 * value (doubled for the handover service types).
 */
int mptcp_fail_thresh = 1;
SYSCTL_INT(_net_inet_mptcp, OID_AUTO, fail, CTLFLAG_RW | CTLFLAG_LOCKED,
    &mptcp_fail_thresh, 0, "Failover threshold");

/*
 * MPTCP subflows have TCP keepalives set to ON. Set a conservative keeptime
 * as carrier networks mostly have a 30 minute to 60 minute NAT Timeout.
 * Some carrier networks have a timeout of 10 or 15 minutes.
 */
int mptcp_subflow_keeptime = 60 * 14;
SYSCTL_INT(_net_inet_mptcp, OID_AUTO, keepalive, CTLFLAG_RW | CTLFLAG_LOCKED,
    &mptcp_subflow_keeptime, 0, "Keepalive in seconds");

/*
 * RTT threshold used by mptcp_get_subflow() for the interactive service
 * type: the value is shifted left by TCP_RTT_SHIFT and compared against
 * the candidates' t_srtt (and halved when Wi-Fi is reported unusable).
 */
int mptcp_rtthist_rtthresh = 600;
SYSCTL_INT(_net_inet_mptcp, OID_AUTO, rtthist_thresh, CTLFLAG_RW | CTLFLAG_LOCKED,
    &mptcp_rtthist_rtthresh, 0, "Rtt threshold");

/*
 * RTO threshold used by mptcp_get_subflow() for the interactive service
 * type: compared against the candidates' t_rxtcur (and halved when Wi-Fi
 * is reported unusable).
 */
int mptcp_rtothresh = 1500;
SYSCTL_INT(_net_inet_mptcp, OID_AUTO, rto_thresh, CTLFLAG_RW | CTLFLAG_LOCKED,
    &mptcp_rtothresh, 0, "RTO threshold");

/*
 * Probe the preferred path, when it is not in use.
 *
 * mptcp_output() probes the preferred subflow once
 * (tcp_now - mpts_probesoon) exceeds this value; a value of 0 disables
 * probing altogether.
 */
uint32_t mptcp_probeto = 1000;
SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, probeto, CTLFLAG_RW | CTLFLAG_LOCKED,
    &mptcp_probeto, 0, "Disable probing by setting to 0");

/*
 * Once the preferred subflow's mpts_probecnt reaches this value,
 * mptcp_output() resets that subflow's probing state.
 */
uint32_t mptcp_probecnt = 5;
SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, probecnt, CTLFLAG_RW | CTLFLAG_LOCKED,
    &mptcp_probecnt, 0, "Number of probe writes");

/*
 * Per its sysctl description, enables or disables MPTCP v1; it is not
 * read in the portion of the file reviewed here.
 */
uint32_t mptcp_enable_v1 = 1;
SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, enable_v1, CTLFLAG_RW | CTLFLAG_LOCKED,
    &mptcp_enable_v1, 0, "Enable or disable v1");
171
172 static int
173 sysctl_mptcp_version_check SYSCTL_HANDLER_ARGS
174 {
175 #pragma unused(arg1, arg2)
176 int error;
177 int new_value = *(int *)oidp->oid_arg1;
178 int old_value = *(int *)oidp->oid_arg1;
179
180 error = sysctl_handle_int(oidp, &new_value, 0, req);
181 if (!error) {
182 if (new_value != MPTCP_VERSION_0 && new_value != MPTCP_VERSION_1) {
183 return EINVAL;
184 }
185 *(int *)oidp->oid_arg1 = new_value;
186 }
187
188 os_log(OS_LOG_DEFAULT,
189 "%s:%u sysctl net.inet.tcp.mptcp_preferred_version: %d -> %d)",
190 proc_best_name(current_proc()), proc_selfpid(),
191 old_value, *(int *)oidp->oid_arg1);
192
193 return error;
194 }
195
/*
 * MPTCP version tunable, defaulting to MPTCP_VERSION_0. Unlike the other
 * tunables in this file it is registered under _net_inet_tcp, and writes
 * go through sysctl_mptcp_version_check(), which only accepts
 * MPTCP_VERSION_0 or MPTCP_VERSION_1.
 */
int mptcp_preferred_version = MPTCP_VERSION_0;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, mptcp_preferred_version,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
    &mptcp_preferred_version, 0, &sysctl_mptcp_version_check, "I", "");
200
201
202 static int
mptcp_reass_present(struct socket * mp_so)203 mptcp_reass_present(struct socket *mp_so)
204 {
205 struct mptses *mpte = mpsotompte(mp_so);
206 struct mptcb *mp_tp = mpte->mpte_mptcb;
207 struct tseg_qent *q;
208 int dowakeup = 0;
209 int flags = 0;
210
211 /*
212 * Present data to user, advancing rcv_nxt through
213 * completed sequence space.
214 */
215 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
216 return flags;
217 }
218 q = LIST_FIRST(&mp_tp->mpt_segq);
219 if (!q || q->tqe_m->m_pkthdr.mp_dsn != mp_tp->mpt_rcvnxt) {
220 return flags;
221 }
222
223 /*
224 * If there is already another thread doing reassembly for this
225 * connection, it is better to let it finish the job --
226 * (radar 16316196)
227 */
228 if (mp_tp->mpt_flags & MPTCPF_REASS_INPROG) {
229 return flags;
230 }
231
232 mp_tp->mpt_flags |= MPTCPF_REASS_INPROG;
233
234 do {
235 mp_tp->mpt_rcvnxt += q->tqe_len;
236 LIST_REMOVE(q, tqe_q);
237 if (mp_so->so_state & SS_CANTRCVMORE) {
238 m_freem(q->tqe_m);
239 } else {
240 flags = !!(q->tqe_m->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN);
241 if (sbappendstream_rcvdemux(mp_so, q->tqe_m)) {
242 dowakeup = 1;
243 }
244 }
245 zfree(tcp_reass_zone, q);
246 mp_tp->mpt_reassqlen--;
247 q = LIST_FIRST(&mp_tp->mpt_segq);
248 } while (q && q->tqe_m->m_pkthdr.mp_dsn == mp_tp->mpt_rcvnxt);
249 mp_tp->mpt_flags &= ~MPTCPF_REASS_INPROG;
250
251 if (dowakeup) {
252 sorwakeup(mp_so); /* done with socket lock held */
253 }
254 return flags;
255 }
256
257 static int
mptcp_reass(struct socket * mp_so,struct pkthdr * phdr,int * tlenp,struct mbuf * m)258 mptcp_reass(struct socket *mp_so, struct pkthdr *phdr, int *tlenp, struct mbuf *m)
259 {
260 struct mptcb *mp_tp = mpsotomppcb(mp_so)->mpp_pcbe->mpte_mptcb;
261 u_int64_t mb_dsn = phdr->mp_dsn;
262 struct tseg_qent *q;
263 struct tseg_qent *p = NULL;
264 struct tseg_qent *nq;
265 struct tseg_qent *te = NULL;
266 uint32_t qlimit;
267
268 /*
269 * Limit the number of segments in the reassembly queue to prevent
270 * holding on to too many segments (and thus running out of mbufs).
271 * Make sure to let the missing segment through which caused this
272 * queue. Always keep one global queue entry spare to be able to
273 * process the missing segment.
274 */
275 qlimit = MIN(MAX(100, mp_so->so_rcv.sb_hiwat >> 10),
276 (tcp_autorcvbuf_max >> 10));
277 if (mb_dsn != mp_tp->mpt_rcvnxt &&
278 (mp_tp->mpt_reassqlen + 1) >= qlimit) {
279 tcpstat.tcps_mptcp_rcvmemdrop++;
280 m_freem(m);
281 *tlenp = 0;
282 return 0;
283 }
284
285 /* Allocate a new queue entry. If we can't, just drop the pkt. XXX */
286 te = zalloc_flags(tcp_reass_zone, Z_WAITOK | Z_NOFAIL);
287
288 mp_tp->mpt_reassqlen++;
289
290 /*
291 * Find a segment which begins after this one does.
292 */
293 LIST_FOREACH(q, &mp_tp->mpt_segq, tqe_q) {
294 if (MPTCP_SEQ_GT(q->tqe_m->m_pkthdr.mp_dsn, mb_dsn)) {
295 break;
296 }
297 p = q;
298 }
299
300 /*
301 * If there is a preceding segment, it may provide some of
302 * our data already. If so, drop the data from the incoming
303 * segment. If it provides all of our data, drop us.
304 */
305 if (p != NULL) {
306 int64_t i;
307 /* conversion to int (in i) handles seq wraparound */
308 i = p->tqe_m->m_pkthdr.mp_dsn + p->tqe_len - mb_dsn;
309 if (i > 0) {
310 if (i >= *tlenp) {
311 tcpstat.tcps_mptcp_rcvduppack++;
312 m_freem(m);
313 zfree(tcp_reass_zone, te);
314 te = NULL;
315 mp_tp->mpt_reassqlen--;
316 /*
317 * Try to present any queued data
318 * at the left window edge to the user.
319 * This is needed after the 3-WHS
320 * completes.
321 */
322 goto out;
323 }
324 VERIFY(i <= INT_MAX);
325 m_adj(m, (int)i);
326 *tlenp -= i;
327 phdr->mp_dsn += i;
328 }
329 }
330
331 tcpstat.tcps_mp_oodata++;
332
333 /*
334 * While we overlap succeeding segments trim them or,
335 * if they are completely covered, dequeue them.
336 */
337 while (q) {
338 int64_t i = (mb_dsn + *tlenp) - q->tqe_m->m_pkthdr.mp_dsn;
339 if (i <= 0) {
340 break;
341 }
342
343 if (i < q->tqe_len) {
344 q->tqe_m->m_pkthdr.mp_dsn += i;
345 q->tqe_len -= i;
346
347 VERIFY(i <= INT_MAX);
348 m_adj(q->tqe_m, (int)i);
349 break;
350 }
351
352 nq = LIST_NEXT(q, tqe_q);
353 LIST_REMOVE(q, tqe_q);
354 m_freem(q->tqe_m);
355 zfree(tcp_reass_zone, q);
356 mp_tp->mpt_reassqlen--;
357 q = nq;
358 }
359
360 /* Insert the new segment queue entry into place. */
361 te->tqe_m = m;
362 te->tqe_th = NULL;
363 te->tqe_len = *tlenp;
364
365 if (p == NULL) {
366 LIST_INSERT_HEAD(&mp_tp->mpt_segq, te, tqe_q);
367 } else {
368 LIST_INSERT_AFTER(p, te, tqe_q);
369 }
370
371 out:
372 return mptcp_reass_present(mp_so);
373 }
374
/*
 * MPTCP input, called when data has been read from a subflow socket.
 *
 * mpte - MPTCP session the data belongs to; its MPTCP socket lock must be
 *        held (asserted below).
 * m    - mbuf chain read from the subflow. The first mbuf must carry a
 *        packet header. The chain may contain several TCP packets; each
 *        one starts at an mbuf with M_PKTHDR and PKTF_MPTCP set.
 *
 * In fallback mode (MPTCPF_FALLBACK_TO_TCP, or as soon as an mbuf without
 * PKTF_MPTCP is seen) the chain is appended to the MPTCP receive buffer
 * as a plain byte stream. Otherwise each packet is clipped against the
 * receive window and against mpt_rcvnxt, and is then either appended
 * directly (in-order, empty reassembly queue) or handed to mptcp_reass().
 * Packets that are dropped entirely are collected on a local list and
 * freed once the whole chain has been processed.
 */
void
mptcp_input(struct mptses *mpte, struct mbuf *m)
{
	struct socket *mp_so;
	struct mptcb *mp_tp = NULL;
	int count = 0, wakeup = 0;
	/*
	 * save: first mbuf of the next packet in the chain.
	 * prev: last mbuf of the current packet (NULL if it is a single mbuf).
	 */
	struct mbuf *save = NULL, *prev = NULL;
	/* Packets dropped as a whole, chained via m_next; tail is the last mbuf. */
	struct mbuf *freelist = NULL, *tail = NULL;

	VERIFY(m->m_flags & M_PKTHDR);

	mp_so = mptetoso(mpte);
	mp_tp = mpte->mpte_mptcb;

	socket_lock_assert_owned(mp_so);

	DTRACE_MPTCP(input);

	mp_tp->mpt_rcvwnd = mptcp_sbspace(mp_tp);

	/*
	 * Each mbuf contains MPTCP Data Sequence Map
	 * Process the data for reassembly, delivery to MPTCP socket
	 * client, etc.
	 *
	 * Snapshot the receive-buffer byte count so that the number of
	 * bytes actually appended can be derived afterwards.
	 */
	count = mp_so->so_rcv.sb_cc;

	/*
	 * In the degraded fallback case, data is accepted without DSS map
	 */
	if (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) {
		struct mbuf *iter;
		int mb_dfin;
		/*
		 * Also entered via goto from the loop below when an mbuf
		 * without PKTF_MPTCP shows up; prev is NULL at that point.
		 */
fallback:
		mb_dfin = 0;
		mptcp_sbrcv_grow(mp_tp);

		iter = m;
		while (iter) {
			if ((iter->m_flags & M_PKTHDR) &&
			    (iter->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN)) {
				mb_dfin = 1;
			}

			if ((iter->m_flags & M_PKTHDR) && m_pktlen(iter) == 0) {
				/* Don't add zero-length packets, so jump it! */
				if (prev == NULL) {
					/* Unlink from the head of the chain. */
					m = iter->m_next;
					m_free(iter);
					iter = m;
				} else {
					/* Unlink from the middle/end of the chain. */
					prev->m_next = iter->m_next;
					m_free(iter);
					iter = prev->m_next;
				}

				/* It was a zero-length packet so next one must be a pkthdr */
				VERIFY(iter == NULL || iter->m_flags & M_PKTHDR);
			} else {
				prev = iter;
				iter = iter->m_next;
			}
		}

		/*
		 * assume degraded flow as this may be the first packet
		 * without DSS, and the subflow state is not updated yet.
		 */
		if (sbappendstream_rcvdemux(mp_so, m)) {
			sorwakeup(mp_so);
		}

		DTRACE_MPTCP5(receive__degraded, struct mbuf *, m,
		    struct socket *, mp_so,
		    struct sockbuf *, &mp_so->so_rcv,
		    struct sockbuf *, &mp_so->so_snd,
		    struct mptses *, mpte);
		/* Bytes that actually made it into the receive buffer. */
		count = mp_so->so_rcv.sb_cc - count;

		mp_tp->mpt_rcvnxt += count;

		if (mb_dfin) {
			mptcp_close_fsm(mp_tp, MPCE_RECV_DATA_FIN);
			socantrcvmore(mp_so);
		}
		return;
	}

	/* Regular path: process the chain one TCP packet at a time. */
	do {
		u_int64_t mb_dsn;
		int32_t mb_datalen;
		int64_t todrop;
		int mb_dfin = 0;

		VERIFY(m->m_flags & M_PKTHDR);

		/* If fallback occurs, mbufs will not have PKTF_MPTCP set */
		if (!(m->m_pkthdr.pkt_flags & PKTF_MPTCP)) {
			goto fallback;
		}

		save = m->m_next;
		/*
		 * A single TCP packet formed of multiple mbufs
		 * holds DSS mapping in the first mbuf of the chain.
		 * Other mbufs in the chain may have M_PKTHDR set
		 * even though they belong to the same TCP packet
		 * and therefore use the DSS mapping stored in the
		 * first mbuf of the mbuf chain. mptcp_input() can
		 * get an mbuf chain with multiple TCP packets.
		 */
		while (save && (!(save->m_flags & M_PKTHDR) ||
		    !(save->m_pkthdr.pkt_flags & PKTF_MPTCP))) {
			prev = save;
			save = save->m_next;
		}
		/* Detach the current packet from the remainder of the chain. */
		if (prev) {
			prev->m_next = NULL;
		} else {
			m->m_next = NULL;
		}

		mb_dsn = m->m_pkthdr.mp_dsn;
		mb_datalen = m->m_pkthdr.mp_rlen;

		/* Number of bytes lying beyond the right edge of the receive window. */
		todrop = (mb_dsn + mb_datalen) - (mp_tp->mpt_rcvnxt + mp_tp->mpt_rcvwnd);
		if (todrop > 0) {
			tcpstat.tcps_mptcp_rcvpackafterwin++;

			os_log_info(mptcp_log_handle, "%s - %lx: dropping dsn %u dlen %u rcvnxt %u rcvwnd %u todrop %lld\n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
			    (uint32_t)mb_dsn, mb_datalen, (uint32_t)mp_tp->mpt_rcvnxt,
			    mp_tp->mpt_rcvwnd, todrop);

			if (todrop >= mb_datalen) {
				/* Entirely outside the window: queue the packet for freeing. */
				if (freelist == NULL) {
					freelist = m;
				} else {
					tail->m_next = m;
				}

				if (prev != NULL) {
					tail = prev;
				} else {
					tail = m;
				}

				m = save;
				prev = save = NULL;
				continue;
			} else {
				/* Negative length: trim todrop bytes off the tail. */
				VERIFY(todrop <= INT_MAX);
				m_adj(m, (int)-todrop);
				mb_datalen -= todrop;
				m->m_pkthdr.mp_rlen -= todrop;
			}

			/*
			 * We drop from the right edge of the mbuf, thus the
			 * DATA_FIN is dropped as well
			 */
			m->m_pkthdr.pkt_flags &= ~PKTF_MPTCP_DFIN;
		}

		if (MPTCP_SEQ_LT(mb_dsn, mp_tp->mpt_rcvnxt)) {
			if (MPTCP_SEQ_LEQ((mb_dsn + mb_datalen),
			    mp_tp->mpt_rcvnxt)) {
				/* Complete duplicate: queue the packet for freeing. */
				if (freelist == NULL) {
					freelist = m;
				} else {
					tail->m_next = m;
				}

				if (prev != NULL) {
					tail = prev;
				} else {
					tail = m;
				}

				m = save;
				prev = save = NULL;
				continue;
			} else {
				/* Partial duplicate: trim the already-received front. */
				VERIFY((mp_tp->mpt_rcvnxt - mb_dsn) <= INT_MAX);
				m_adj(m, (int)(mp_tp->mpt_rcvnxt - mb_dsn));
				mb_datalen -= (mp_tp->mpt_rcvnxt - mb_dsn);
				mb_dsn = mp_tp->mpt_rcvnxt;
				VERIFY(mb_datalen >= 0 && mb_datalen <= USHRT_MAX);
				m->m_pkthdr.mp_rlen = (uint16_t)mb_datalen;
				m->m_pkthdr.mp_dsn = mb_dsn;
			}
		}

		/*
		 * Out-of-order data, or in-order data while segments are
		 * still queued: go through the reassembly queue.
		 */
		if (MPTCP_SEQ_GT(mb_dsn, mp_tp->mpt_rcvnxt) ||
		    !LIST_EMPTY(&mp_tp->mpt_segq)) {
			mb_dfin = mptcp_reass(mp_so, &m->m_pkthdr, &mb_datalen, m);

			goto next;
		}
		/* Read the flag before the mbuf is handed to the socket buffer. */
		mb_dfin = !!(m->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN);

		mptcp_sbrcv_grow(mp_tp);

		if (sbappendstream_rcvdemux(mp_so, m)) {
			wakeup = 1;
		}

		DTRACE_MPTCP6(receive, struct mbuf *, m, struct socket *, mp_so,
		    struct sockbuf *, &mp_so->so_rcv,
		    struct sockbuf *, &mp_so->so_snd,
		    struct mptses *, mpte,
		    struct mptcb *, mp_tp);
		/* Bytes appended by this packet. */
		count = mp_so->so_rcv.sb_cc - count;
		tcpstat.tcps_mp_rcvtotal++;
		tcpstat.tcps_mp_rcvbytes += count;

		mp_tp->mpt_rcvnxt += count;

next:
		if (mb_dfin) {
			mptcp_close_fsm(mp_tp, MPCE_RECV_DATA_FIN);
			socantrcvmore(mp_so);
		}
		/* Advance to the next packet and re-snapshot the buffer size. */
		m = save;
		prev = save = NULL;
		count = mp_so->so_rcv.sb_cc;
	} while (m);

	if (freelist) {
		m_freem(freelist);
	}

	/* A single wakeup covers everything appended directly above. */
	if (wakeup) {
		sorwakeup(mp_so);
	}
}
615
616 boolean_t
mptcp_can_send_more(struct mptcb * mp_tp,boolean_t ignore_reinject)617 mptcp_can_send_more(struct mptcb *mp_tp, boolean_t ignore_reinject)
618 {
619 struct socket *mp_so = mptetoso(mp_tp->mpt_mpte);
620
621 /*
622 * Always send if there is data in the reinject-queue.
623 */
624 if (!ignore_reinject && mp_tp->mpt_mpte->mpte_reinjectq) {
625 return TRUE;
626 }
627
628 /*
629 * Don't send, if:
630 *
631 * 1. snd_nxt >= snd_max : Means, basically everything has been sent.
632 * Except when using TFO, we might be doing a 0-byte write.
633 * 2. snd_una + snd_wnd <= snd_nxt: No space in the receiver's window
634 * 3. snd_nxt + 1 == snd_max and we are closing: A DATA_FIN is scheduled.
635 */
636
637 if (!(mp_so->so_flags1 & SOF1_PRECONNECT_DATA) && MPTCP_SEQ_GEQ(mp_tp->mpt_sndnxt, mp_tp->mpt_sndmax)) {
638 return FALSE;
639 }
640
641 if (MPTCP_SEQ_LEQ(mp_tp->mpt_snduna + mp_tp->mpt_sndwnd, mp_tp->mpt_sndnxt)) {
642 return FALSE;
643 }
644
645 if (mp_tp->mpt_sndnxt + 1 == mp_tp->mpt_sndmax && mp_tp->mpt_state > MPTCPS_CLOSE_WAIT) {
646 return FALSE;
647 }
648
649 if (mp_tp->mpt_state >= MPTCPS_FIN_WAIT_2) {
650 return FALSE;
651 }
652
653 return TRUE;
654 }
655
656 /*
657 * MPTCP output.
658 */
659 int
mptcp_output(struct mptses * mpte)660 mptcp_output(struct mptses *mpte)
661 {
662 struct mptcb *mp_tp;
663 struct mptsub *mpts;
664 struct mptsub *mpts_tried = NULL;
665 struct socket *mp_so;
666 struct mptsub *preferred_mpts = NULL;
667 uint64_t old_snd_nxt;
668 int error = 0;
669
670 mp_so = mptetoso(mpte);
671 mp_tp = mpte->mpte_mptcb;
672
673 socket_lock_assert_owned(mp_so);
674
675 if (mp_so->so_flags & SOF_DEFUNCT) {
676 return 0;
677 }
678
679 VERIFY(!(mpte->mpte_mppcb->mpp_flags & MPP_WUPCALL));
680 mpte->mpte_mppcb->mpp_flags |= MPP_WUPCALL;
681
682 old_snd_nxt = mp_tp->mpt_sndnxt;
683 while (mptcp_can_send_more(mp_tp, FALSE)) {
684 /* get the "best" subflow to be used for transmission */
685 mpts = mptcp_get_subflow(mpte, &preferred_mpts);
686 if (mpts == NULL) {
687 mptcplog((LOG_INFO, "%s: no subflow\n", __func__),
688 MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
689 break;
690 }
691
692 /* In case there's just one flow, we reattempt later */
693 if (mpts_tried != NULL &&
694 (mpts == mpts_tried || (mpts->mpts_flags & MPTSF_FAILINGOVER))) {
695 mpts_tried->mpts_flags &= ~MPTSF_FAILINGOVER;
696 mpts_tried->mpts_flags |= MPTSF_ACTIVE;
697 mptcp_start_timer(mpte, MPTT_REXMT);
698 break;
699 }
700
701 /*
702 * Automatic sizing of send socket buffer. Increase the send
703 * socket buffer size if all of the following criteria are met
704 * 1. the receiver has enough buffer space for this data
705 * 2. send buffer is filled to 7/8th with data (so we actually
706 * have data to make use of it);
707 */
708 if ((mp_so->so_snd.sb_flags & (SB_AUTOSIZE | SB_TRIM)) == SB_AUTOSIZE &&
709 tcp_cansbgrow(&mp_so->so_snd)) {
710 if ((mp_tp->mpt_sndwnd / 4 * 5) >= mp_so->so_snd.sb_hiwat &&
711 mp_so->so_snd.sb_cc >= (mp_so->so_snd.sb_hiwat / 8 * 7)) {
712 if (sbreserve(&mp_so->so_snd,
713 min(mp_so->so_snd.sb_hiwat + tcp_autosndbuf_inc,
714 tcp_autosndbuf_max)) == 1) {
715 mp_so->so_snd.sb_idealsize = mp_so->so_snd.sb_hiwat;
716 }
717 }
718 }
719
720 DTRACE_MPTCP3(output, struct mptses *, mpte, struct mptsub *, mpts,
721 struct socket *, mp_so);
722 error = mptcp_subflow_output(mpte, mpts, 0);
723 if (error) {
724 /* can be a temporary loss of source address or other error */
725 mpts->mpts_flags |= MPTSF_FAILINGOVER;
726 mpts->mpts_flags &= ~MPTSF_ACTIVE;
727 mpts_tried = mpts;
728 if (error != ECANCELED) {
729 os_log_error(mptcp_log_handle, "%s - %lx: Error = %d mpts_flags %#x\n",
730 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
731 error, mpts->mpts_flags);
732 }
733 break;
734 }
735 /* The model is to have only one active flow at a time */
736 mpts->mpts_flags |= MPTSF_ACTIVE;
737 mpts->mpts_probesoon = mpts->mpts_probecnt = 0;
738
739 /* Allows us to update the smoothed rtt */
740 if (mptcp_probeto && mpts != preferred_mpts && preferred_mpts != NULL) {
741 if (preferred_mpts->mpts_probesoon) {
742 if ((tcp_now - preferred_mpts->mpts_probesoon) > mptcp_probeto) {
743 mptcp_subflow_output(mpte, preferred_mpts, MPTCP_SUBOUT_PROBING);
744 if (preferred_mpts->mpts_probecnt >= mptcp_probecnt) {
745 preferred_mpts->mpts_probesoon = 0;
746 preferred_mpts->mpts_probecnt = 0;
747 }
748 }
749 } else {
750 preferred_mpts->mpts_probesoon = tcp_now;
751 preferred_mpts->mpts_probecnt = 0;
752 }
753 }
754
755 if (mpte->mpte_active_sub == NULL) {
756 mpte->mpte_active_sub = mpts;
757 } else if (mpte->mpte_active_sub != mpts) {
758 mpte->mpte_active_sub->mpts_flags &= ~MPTSF_ACTIVE;
759 mpte->mpte_active_sub = mpts;
760
761 mptcpstats_inc_switch(mpte, mpts);
762 }
763 }
764
765 if (mp_tp->mpt_state > MPTCPS_CLOSE_WAIT) {
766 if (mp_tp->mpt_sndnxt + 1 == mp_tp->mpt_sndmax &&
767 mp_tp->mpt_snduna == mp_tp->mpt_sndnxt) {
768 mptcp_finish_usrclosed(mpte);
769 }
770 }
771
772 mptcp_handle_deferred_upcalls(mpte->mpte_mppcb, MPP_WUPCALL);
773
774 /* subflow errors should not be percolated back up */
775 return 0;
776 }
777
778
779 static struct mptsub *
mptcp_choose_subflow(struct mptsub * mpts,struct mptsub * curbest,int * currtt)780 mptcp_choose_subflow(struct mptsub *mpts, struct mptsub *curbest, int *currtt)
781 {
782 struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
783
784 /*
785 * Lower RTT? Take it, if it's our first one, or
786 * it doesn't has any loss, or the current one has
787 * loss as well.
788 */
789 if (tp->t_srtt && *currtt > tp->t_srtt &&
790 (curbest == NULL || tp->t_rxtshift == 0 ||
791 sototcpcb(curbest->mpts_socket)->t_rxtshift)) {
792 *currtt = tp->t_srtt;
793 return mpts;
794 }
795
796 /*
797 * If we find a subflow without loss, take it always!
798 */
799 if (curbest &&
800 sototcpcb(curbest->mpts_socket)->t_rxtshift &&
801 tp->t_rxtshift == 0) {
802 *currtt = tp->t_srtt;
803 return mpts;
804 }
805
806 return curbest != NULL ? curbest : mpts;
807 }
808
809 static struct mptsub *
mptcp_return_subflow(struct mptsub * mpts)810 mptcp_return_subflow(struct mptsub *mpts)
811 {
812 if (mpts && mptcp_subflow_cwnd_space(mpts->mpts_socket) <= 0) {
813 return NULL;
814 }
815
816 return mpts;
817 }
818
819 static boolean_t
mptcp_subflow_is_slow(struct mptses * mpte,struct mptsub * mpts)820 mptcp_subflow_is_slow(struct mptses *mpte, struct mptsub *mpts)
821 {
822 struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
823 int fail_thresh = mptcp_fail_thresh;
824
825 if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER || mpte->mpte_svctype == MPTCP_SVCTYPE_PURE_HANDOVER) {
826 fail_thresh *= 2;
827 }
828
829 return tp->t_rxtshift >= fail_thresh &&
830 (mptetoso(mpte)->so_snd.sb_cc || mpte->mpte_reinjectq);
831 }
832
/*
 * Return the most eligible subflow to be used for sending data.
 *
 * mpte      - MPTCP session whose subflow list is scanned.
 * preferred - optional out-parameter. It is written only when both a
 *             non-cellular and a cellular candidate were found, and then
 *             receives the non-cellular candidate, or NULL when that
 *             candidate has no congestion-window space.
 *
 * Every return path goes through mptcp_return_subflow(), so the result is
 * NULL whenever the selected subflow has no congestion-window space (or
 * when no subflow is eligible at all).
 */
struct mptsub *
mptcp_get_subflow(struct mptses *mpte, struct mptsub **preferred)
{
	struct tcpcb *besttp, *secondtp;
	struct inpcb *bestinp, *secondinp;
	struct mptsub *mpts;
	/* best: non-cellular candidate; second_best: cellular candidate. */
	struct mptsub *best = NULL;
	struct mptsub *second_best = NULL;
	/* Lowest RTT seen so far for the cellular / non-cellular set. */
	int exp_rtt = INT_MAX, cheap_rtt = INT_MAX;

	/*
	 * First Step:
	 * Choose the best subflow for cellular and non-cellular interfaces.
	 */

	TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
		struct socket *so = mpts->mpts_socket;
		struct tcpcb *tp = sototcpcb(so);
		struct inpcb *inp = sotoinpcb(so);

		mptcplog((LOG_DEBUG, "%s mpts %u mpts_flags %#x, suspended %u sostate %#x tpstate %u cellular %d rtt %u rxtshift %u cheap %u exp %u cwnd %d\n",
		    __func__, mpts->mpts_connid, mpts->mpts_flags,
		    INP_WAIT_FOR_IF_FEEDBACK(inp), so->so_state, tp->t_state,
		    inp->inp_last_outifp ? IFNET_IS_CELLULAR(inp->inp_last_outifp) : -1,
		    tp->t_srtt, tp->t_rxtshift, cheap_rtt, exp_rtt,
		    mptcp_subflow_cwnd_space(so)),
		    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);

		/*
		 * First, the hard conditions to reject subflows
		 * (e.g., not connected,...)
		 */
		if (inp->inp_last_outifp == NULL) {
			continue;
		}

		if (INP_WAIT_FOR_IF_FEEDBACK(inp)) {
			continue;
		}

		/* There can only be one subflow in degraded state */
		if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
			best = mpts;
			break;
		}

		/*
		 * If this subflow is waiting to finally send, do it!
		 */
		if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
			return mptcp_return_subflow(mpts);
		}

		/*
		 * Only send if the subflow is MP_CAPABLE. The exceptions to
		 * this rule (degraded or TFO) have been taken care of above.
		 */
		if (!(mpts->mpts_flags & MPTSF_MP_CAPABLE)) {
			continue;
		}

		/* Skip subflows that are not (or no longer) able to carry data. */
		if ((so->so_state & SS_ISDISCONNECTED) ||
		    !(so->so_state & SS_ISCONNECTED) ||
		    !TCPS_HAVEESTABLISHED(tp->t_state) ||
		    tp->t_state > TCPS_CLOSE_WAIT) {
			continue;
		}

		/*
		 * Second, the soft conditions to find the subflow with best
		 * conditions for each set (aka cellular vs non-cellular)
		 */
		if (IFNET_IS_CELLULAR(inp->inp_last_outifp)) {
			second_best = mptcp_choose_subflow(mpts, second_best,
			    &exp_rtt);
		} else {
			best = mptcp_choose_subflow(mpts, best, &cheap_rtt);
		}
	}

	/*
	 * If only one of the two sets produced a candidate, there is nothing
	 * to arbitrate: use whichever exists (possibly none).
	 */
	if (best == NULL) {
		return mptcp_return_subflow(second_best);
	}

	if (second_best == NULL) {
		return mptcp_return_subflow(best);
	}

	besttp = sototcpcb(best->mpts_socket);
	bestinp = sotoinpcb(best->mpts_socket);
	secondtp = sototcpcb(second_best->mpts_socket);
	secondinp = sotoinpcb(second_best->mpts_socket);

	if (preferred != NULL) {
		*preferred = mptcp_return_subflow(best);
	}

	/*
	 * Second Step: Among best and second_best. Choose the one that is
	 * most appropriate for this particular service-type.
	 */
	if (mpte->mpte_svctype == MPTCP_SVCTYPE_PURE_HANDOVER) {
		return mptcp_return_subflow(best);
	} else if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER) {
		/*
		 * Only handover if Symptoms tells us to do so.
		 */
		if (!IFNET_IS_CELLULAR(bestinp->inp_last_outifp) &&
		    mptcp_is_wifi_unusable_for_session(mpte) != 0 && mptcp_subflow_is_slow(mpte, best)) {
			return mptcp_return_subflow(second_best);
		}

		return mptcp_return_subflow(best);
	} else if (mpte->mpte_svctype == MPTCP_SVCTYPE_INTERACTIVE) {
		int rtt_thresh = mptcp_rtthist_rtthresh << TCP_RTT_SHIFT;
		int rto_thresh = mptcp_rtothresh;

		/* Adjust with symptoms information */
		if (!IFNET_IS_CELLULAR(bestinp->inp_last_outifp) &&
		    mptcp_is_wifi_unusable_for_session(mpte) != 0) {
			rtt_thresh /= 2;
			rto_thresh /= 2;
		}

		/* Compare RTTs, select second_best if only best's RTT exceeds the threshold */
		if (besttp->t_srtt && secondtp->t_srtt &&
		    besttp->t_srtt >= rtt_thresh &&
		    secondtp->t_srtt < rtt_thresh) {
			tcpstat.tcps_mp_sel_rtt++;
			mptcplog((LOG_DEBUG, "%s: best cid %d at rtt %d,  second cid %d at rtt %d\n", __func__,
			    best->mpts_connid, besttp->t_srtt >> TCP_RTT_SHIFT,
			    second_best->mpts_connid,
			    secondtp->t_srtt >> TCP_RTT_SHIFT),
			    MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
			return mptcp_return_subflow(second_best);
		}

		/* best is retransmitting while second_best is loss-free. */
		if (mptcp_subflow_is_slow(mpte, best) &&
		    secondtp->t_rxtshift == 0) {
			return mptcp_return_subflow(second_best);
		}

		/* Compare RTOs, select second_best if best's rto exceeds rtothresh */
		if (besttp->t_rxtcur && secondtp->t_rxtcur &&
		    besttp->t_rxtcur >= rto_thresh &&
		    secondtp->t_rxtcur < rto_thresh) {
			tcpstat.tcps_mp_sel_rto++;
			mptcplog((LOG_DEBUG, "%s: best cid %d at rto %d, second cid %d at rto %d\n", __func__,
			    best->mpts_connid, besttp->t_rxtcur,
			    second_best->mpts_connid, secondtp->t_rxtcur),
			    MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);

			return mptcp_return_subflow(second_best);
		}

		/*
		 * None of the above conditions for sending on the secondary
		 * were true. So, let's schedule on the best one, if it still
		 * has some space in the congestion-window.
		 */
		return mptcp_return_subflow(best);
	} else if (mpte->mpte_svctype >= MPTCP_SVCTYPE_AGGREGATE) {
		struct mptsub *tmp;

		/*
		 * We only care about RTT when aggregating: make "best" the
		 * subflow with the lower smoothed RTT.
		 */
		if (besttp->t_srtt > secondtp->t_srtt) {
			tmp = best;
			best = second_best;
			besttp = secondtp;
			bestinp = secondinp;

			second_best = tmp;
			secondtp = sototcpcb(second_best->mpts_socket);
			secondinp = sotoinpcb(second_best->mpts_socket);
		}

		/* Is there still space in the congestion window? */
		if (mptcp_subflow_cwnd_space(bestinp->inp_socket) <= 0) {
			return mptcp_return_subflow(second_best);
		}

		return mptcp_return_subflow(best);
	} else {
		panic("Unknown service-type configured for MPTCP");
	}

	return NULL;
}
1029
1030 static const char *
mptcp_event_to_str(uint32_t event)1031 mptcp_event_to_str(uint32_t event)
1032 {
1033 const char *c = "UNDEFINED";
1034 switch (event) {
1035 case MPCE_CLOSE:
1036 c = "MPCE_CLOSE";
1037 break;
1038 case MPCE_RECV_DATA_ACK:
1039 c = "MPCE_RECV_DATA_ACK";
1040 break;
1041 case MPCE_RECV_DATA_FIN:
1042 c = "MPCE_RECV_DATA_FIN";
1043 break;
1044 }
1045 return c;
1046 }
1047
1048 static const char *
mptcp_state_to_str(mptcp_state_t state)1049 mptcp_state_to_str(mptcp_state_t state)
1050 {
1051 const char *c = "UNDEFINED";
1052 switch (state) {
1053 case MPTCPS_CLOSED:
1054 c = "MPTCPS_CLOSED";
1055 break;
1056 case MPTCPS_LISTEN:
1057 c = "MPTCPS_LISTEN";
1058 break;
1059 case MPTCPS_ESTABLISHED:
1060 c = "MPTCPS_ESTABLISHED";
1061 break;
1062 case MPTCPS_CLOSE_WAIT:
1063 c = "MPTCPS_CLOSE_WAIT";
1064 break;
1065 case MPTCPS_FIN_WAIT_1:
1066 c = "MPTCPS_FIN_WAIT_1";
1067 break;
1068 case MPTCPS_CLOSING:
1069 c = "MPTCPS_CLOSING";
1070 break;
1071 case MPTCPS_LAST_ACK:
1072 c = "MPTCPS_LAST_ACK";
1073 break;
1074 case MPTCPS_FIN_WAIT_2:
1075 c = "MPTCPS_FIN_WAIT_2";
1076 break;
1077 case MPTCPS_TIME_WAIT:
1078 c = "MPTCPS_TIME_WAIT";
1079 break;
1080 case MPTCPS_TERMINATE:
1081 c = "MPTCPS_TERMINATE";
1082 break;
1083 }
1084 return c;
1085 }
1086
1087 void
mptcp_close_fsm(struct mptcb * mp_tp,uint32_t event)1088 mptcp_close_fsm(struct mptcb *mp_tp, uint32_t event)
1089 {
1090 struct socket *mp_so = mptetoso(mp_tp->mpt_mpte);
1091
1092 socket_lock_assert_owned(mp_so);
1093
1094 mptcp_state_t old_state = mp_tp->mpt_state;
1095
1096 DTRACE_MPTCP2(state__change, struct mptcb *, mp_tp,
1097 uint32_t, event);
1098
1099 switch (mp_tp->mpt_state) {
1100 case MPTCPS_CLOSED:
1101 case MPTCPS_LISTEN:
1102 mp_tp->mpt_state = MPTCPS_TERMINATE;
1103 break;
1104
1105 case MPTCPS_ESTABLISHED:
1106 if (event == MPCE_CLOSE) {
1107 mp_tp->mpt_state = MPTCPS_FIN_WAIT_1;
1108 mp_tp->mpt_sndmax += 1; /* adjust for Data FIN */
1109 } else if (event == MPCE_RECV_DATA_FIN) {
1110 mp_tp->mpt_rcvnxt += 1; /* adj remote data FIN */
1111 mp_tp->mpt_state = MPTCPS_CLOSE_WAIT;
1112 }
1113 break;
1114
1115 case MPTCPS_CLOSE_WAIT:
1116 if (event == MPCE_CLOSE) {
1117 mp_tp->mpt_state = MPTCPS_LAST_ACK;
1118 mp_tp->mpt_sndmax += 1; /* adjust for Data FIN */
1119 }
1120 break;
1121
1122 case MPTCPS_FIN_WAIT_1:
1123 if (event == MPCE_RECV_DATA_ACK) {
1124 mp_tp->mpt_state = MPTCPS_FIN_WAIT_2;
1125 } else if (event == MPCE_RECV_DATA_FIN) {
1126 mp_tp->mpt_rcvnxt += 1; /* adj remote data FIN */
1127 mp_tp->mpt_state = MPTCPS_CLOSING;
1128 }
1129 break;
1130
1131 case MPTCPS_CLOSING:
1132 if (event == MPCE_RECV_DATA_ACK) {
1133 mp_tp->mpt_state = MPTCPS_TIME_WAIT;
1134 }
1135 break;
1136
1137 case MPTCPS_LAST_ACK:
1138 if (event == MPCE_RECV_DATA_ACK) {
1139 mptcp_close(mp_tp->mpt_mpte, mp_tp);
1140 }
1141 break;
1142
1143 case MPTCPS_FIN_WAIT_2:
1144 if (event == MPCE_RECV_DATA_FIN) {
1145 mp_tp->mpt_rcvnxt += 1; /* adj remote data FIN */
1146 mp_tp->mpt_state = MPTCPS_TIME_WAIT;
1147 }
1148 break;
1149
1150 case MPTCPS_TIME_WAIT:
1151 case MPTCPS_TERMINATE:
1152 break;
1153
1154 default:
1155 VERIFY(0);
1156 /* NOTREACHED */
1157 }
1158 DTRACE_MPTCP2(state__change, struct mptcb *, mp_tp,
1159 uint32_t, event);
1160 mptcplog((LOG_INFO, "%s: %s to %s on event %s\n", __func__,
1161 mptcp_state_to_str(old_state),
1162 mptcp_state_to_str(mp_tp->mpt_state),
1163 mptcp_event_to_str(event)),
1164 MPTCP_STATE_DBG, MPTCP_LOGLVL_LOG);
1165 }
1166
1167 /* If you change this function, match up mptcp_update_rcv_state_f */
1168 void
mptcp_update_dss_rcv_state(struct mptcp_dsn_opt * dss_info,struct tcpcb * tp,uint16_t csum)1169 mptcp_update_dss_rcv_state(struct mptcp_dsn_opt *dss_info, struct tcpcb *tp,
1170 uint16_t csum)
1171 {
1172 struct mptcb *mp_tp = tptomptp(tp);
1173 u_int64_t full_dsn = 0;
1174
1175 NTOHL(dss_info->mdss_dsn);
1176 NTOHL(dss_info->mdss_subflow_seqn);
1177 NTOHS(dss_info->mdss_data_len);
1178
1179 /* XXX for autosndbuf grow sb here */
1180 MPTCP_EXTEND_DSN(mp_tp->mpt_rcvnxt, dss_info->mdss_dsn, full_dsn);
1181 mptcp_update_rcv_state_meat(mp_tp, tp,
1182 full_dsn, dss_info->mdss_subflow_seqn, dss_info->mdss_data_len,
1183 csum);
1184 }
1185
1186 void
mptcp_update_rcv_state_meat(struct mptcb * mp_tp,struct tcpcb * tp,u_int64_t full_dsn,u_int32_t seqn,u_int16_t mdss_data_len,uint16_t csum)1187 mptcp_update_rcv_state_meat(struct mptcb *mp_tp, struct tcpcb *tp,
1188 u_int64_t full_dsn, u_int32_t seqn, u_int16_t mdss_data_len,
1189 uint16_t csum)
1190 {
1191 if (mdss_data_len == 0) {
1192 os_log_error(mptcp_log_handle, "%s - %lx: Infinite Mapping.\n",
1193 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mp_tp->mpt_mpte));
1194
1195 if ((mp_tp->mpt_flags & MPTCPF_CHECKSUM) && (csum != 0)) {
1196 os_log_error(mptcp_log_handle, "%s - %lx: Bad checksum %x \n",
1197 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mp_tp->mpt_mpte), csum);
1198 }
1199 mptcp_notify_mpfail(tp->t_inpcb->inp_socket);
1200 return;
1201 }
1202
1203 mptcp_notify_mpready(tp->t_inpcb->inp_socket);
1204
1205 tp->t_rcv_map.mpt_dsn = full_dsn;
1206 tp->t_rcv_map.mpt_sseq = seqn;
1207 tp->t_rcv_map.mpt_len = mdss_data_len;
1208 tp->t_rcv_map.mpt_csum = csum;
1209 tp->t_mpflags |= TMPF_EMBED_DSN;
1210 }
1211
1212
1213 static int
mptcp_validate_dss_map(struct socket * so,struct tcpcb * tp,struct mbuf * m,int hdrlen)1214 mptcp_validate_dss_map(struct socket *so, struct tcpcb *tp, struct mbuf *m,
1215 int hdrlen)
1216 {
1217 u_int32_t datalen;
1218
1219 if (!(m->m_pkthdr.pkt_flags & PKTF_MPTCP)) {
1220 return 0;
1221 }
1222
1223 datalen = m->m_pkthdr.mp_rlen;
1224
1225 /* unacceptable DSS option, fallback to TCP */
1226 if (m->m_pkthdr.len > ((int) datalen + hdrlen)) {
1227 os_log_error(mptcp_log_handle, "%s - %lx: mbuf len %d, MPTCP expected %d",
1228 __func__, (unsigned long)VM_KERNEL_ADDRPERM(tptomptp(tp)->mpt_mpte), m->m_pkthdr.len, datalen);
1229 } else {
1230 return 0;
1231 }
1232 tp->t_mpflags |= TMPF_SND_MPFAIL;
1233 mptcp_notify_mpfail(so);
1234 m_freem(m);
1235 return -1;
1236 }
1237
1238 int
mptcp_input_preproc(struct tcpcb * tp,struct mbuf * m,struct tcphdr * th,int drop_hdrlen)1239 mptcp_input_preproc(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
1240 int drop_hdrlen)
1241 {
1242 mptcp_insert_rmap(tp, m, th);
1243 if (mptcp_validate_dss_map(tp->t_inpcb->inp_socket, tp, m,
1244 drop_hdrlen) != 0) {
1245 return -1;
1246 }
1247 return 0;
1248 }
1249
1250 static uint16_t
mptcp_input_csum(struct tcpcb * tp,struct mbuf * m,uint64_t dsn,uint32_t sseq,uint16_t dlen,uint16_t csum,int dfin)1251 mptcp_input_csum(struct tcpcb *tp, struct mbuf *m, uint64_t dsn, uint32_t sseq,
1252 uint16_t dlen, uint16_t csum, int dfin)
1253 {
1254 struct mptcb *mp_tp = tptomptp(tp);
1255 int real_len = dlen - dfin;
1256 uint32_t sum = 0;
1257
1258 VERIFY(real_len >= 0);
1259
1260 if (mp_tp == NULL) {
1261 return 0;
1262 }
1263
1264 if (!(mp_tp->mpt_flags & MPTCPF_CHECKSUM)) {
1265 return 0;
1266 }
1267
1268 if (tp->t_mpflags & TMPF_TCP_FALLBACK) {
1269 return 0;
1270 }
1271
1272 /*
1273 * The remote side may send a packet with fewer bytes than the
1274 * claimed DSS checksum length.
1275 */
1276 if ((int)m_length2(m, NULL) < real_len) {
1277 return 0xffff;
1278 }
1279
1280 if (real_len != 0) {
1281 sum = m_sum16(m, 0, real_len);
1282 }
1283
1284 sum += in_pseudo64(htonll(dsn), htonl(sseq), htons(dlen) + csum);
1285 ADDCARRY(sum);
1286
1287 DTRACE_MPTCP3(checksum__result, struct tcpcb *, tp, struct mbuf *, m,
1288 uint32_t, sum);
1289
1290 return ~sum & 0xffff;
1291 }
1292
1293 /*
1294 * MPTCP Checksum support
1295 * The checksum is calculated whenever the MPTCP DSS option is included
1296 * in the TCP packet. The checksum includes the sum of the MPTCP psuedo
1297 * header and the actual data indicated by the length specified in the
1298 * DSS option.
1299 */
1300
1301 int
mptcp_validate_csum(struct tcpcb * tp,struct mbuf * m,uint64_t dsn,uint32_t sseq,uint16_t dlen,uint16_t csum,int dfin)1302 mptcp_validate_csum(struct tcpcb *tp, struct mbuf *m, uint64_t dsn,
1303 uint32_t sseq, uint16_t dlen, uint16_t csum, int dfin)
1304 {
1305 uint16_t mptcp_csum;
1306
1307 mptcp_csum = mptcp_input_csum(tp, m, dsn, sseq, dlen, csum, dfin);
1308 if (mptcp_csum) {
1309 tp->t_mpflags |= TMPF_SND_MPFAIL;
1310 mptcp_notify_mpfail(tp->t_inpcb->inp_socket);
1311 m_freem(m);
1312 tcpstat.tcps_mp_badcsum++;
1313 return -1;
1314 }
1315 return 0;
1316 }
1317
1318 uint16_t
mptcp_output_csum(struct mbuf * m,uint64_t dss_val,uint32_t sseq,uint16_t dlen)1319 mptcp_output_csum(struct mbuf *m, uint64_t dss_val, uint32_t sseq, uint16_t dlen)
1320 {
1321 uint32_t sum = 0;
1322
1323 if (dlen) {
1324 sum = m_sum16(m, 0, dlen);
1325 }
1326
1327 dss_val = mptcp_hton64(dss_val);
1328 sseq = htonl(sseq);
1329 dlen = htons(dlen);
1330 sum += in_pseudo64(dss_val, sseq, dlen);
1331
1332 ADDCARRY(sum);
1333 sum = ~sum & 0xffff;
1334 DTRACE_MPTCP2(checksum__result, struct mbuf *, m, uint32_t, sum);
1335 mptcplog((LOG_DEBUG, "%s: sum = %x \n", __func__, sum),
1336 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
1337
1338 return (uint16_t)sum;
1339 }
1340
1341 /*
1342 * When WiFi signal starts fading, there's more loss and RTT spikes.
1343 * Check if there has been a large spike by comparing against
1344 * a tolerable RTT spike threshold.
1345 */
1346 boolean_t
mptcp_no_rto_spike(struct socket * so)1347 mptcp_no_rto_spike(struct socket *so)
1348 {
1349 struct tcpcb *tp = intotcpcb(sotoinpcb(so));
1350 int32_t spike = 0;
1351
1352 if (tp->t_rxtcur > mptcp_rtothresh) {
1353 spike = tp->t_rxtcur - mptcp_rtothresh;
1354
1355 mptcplog((LOG_DEBUG, "%s: spike = %d rto = %d best = %d cur = %d\n",
1356 __func__, spike,
1357 tp->t_rxtcur, tp->t_rttbest >> TCP_RTT_SHIFT,
1358 tp->t_rttcur),
1359 (MPTCP_SOCKET_DBG | MPTCP_SENDER_DBG), MPTCP_LOGLVL_LOG);
1360 }
1361
1362 if (spike > 0) {
1363 return FALSE;
1364 } else {
1365 return TRUE;
1366 }
1367 }
1368
1369 void
mptcp_handle_deferred_upcalls(struct mppcb * mpp,uint32_t flag)1370 mptcp_handle_deferred_upcalls(struct mppcb *mpp, uint32_t flag)
1371 {
1372 VERIFY(mpp->mpp_flags & flag);
1373 mpp->mpp_flags &= ~flag;
1374
1375 if (mptcp_should_defer_upcall(mpp)) {
1376 return;
1377 }
1378
1379 if (mpp->mpp_flags & MPP_SHOULD_WORKLOOP) {
1380 mpp->mpp_flags &= ~MPP_SHOULD_WORKLOOP;
1381
1382 mptcp_subflow_workloop(mpp->mpp_pcbe);
1383 }
1384
1385 if (mpp->mpp_flags & MPP_SHOULD_RWAKEUP) {
1386 mpp->mpp_flags &= ~MPP_SHOULD_RWAKEUP;
1387
1388 sorwakeup(mpp->mpp_socket);
1389 }
1390
1391 if (mpp->mpp_flags & MPP_SHOULD_WWAKEUP) {
1392 mpp->mpp_flags &= ~MPP_SHOULD_WWAKEUP;
1393
1394 sowwakeup(mpp->mpp_socket);
1395 }
1396 }
1397
1398 static void
mptcp_reset_itfinfo(struct mpt_itf_info * info)1399 mptcp_reset_itfinfo(struct mpt_itf_info *info)
1400 {
1401 memset(info, 0, sizeof(*info));
1402 }
1403
1404 void
mptcp_session_necp_cb(void * handle,int action,uint32_t interface_index,uint32_t necp_flags,__unused bool * viable)1405 mptcp_session_necp_cb(void *handle, int action, uint32_t interface_index,
1406 uint32_t necp_flags, __unused bool *viable)
1407 {
1408 boolean_t has_v4 = !!(necp_flags & NECP_CLIENT_RESULT_FLAG_HAS_IPV4);
1409 boolean_t has_v6 = !!(necp_flags & NECP_CLIENT_RESULT_FLAG_HAS_IPV6);
1410 boolean_t has_nat64 = !!(necp_flags & NECP_CLIENT_RESULT_FLAG_HAS_NAT64);
1411 boolean_t low_power = !!(necp_flags & NECP_CLIENT_RESULT_FLAG_INTERFACE_LOW_POWER);
1412 struct mppcb *mp = (struct mppcb *)handle;
1413 struct mptses *mpte = mptompte(mp);
1414 struct socket *mp_so;
1415 struct mptcb *mp_tp;
1416 uint32_t i, ifindex;
1417 struct ifnet *ifp;
1418 int locked = 0;
1419
1420 ifindex = interface_index;
1421 VERIFY(ifindex != IFSCOPE_NONE);
1422
1423 /* About to be garbage-collected (see note about MPTCP/NECP interactions) */
1424 if (mp->mpp_socket->so_usecount == 0) {
1425 return;
1426 }
1427
1428 mp_so = mptetoso(mpte);
1429
1430 if (action != NECP_CLIENT_CBACTION_INITIAL) {
1431 socket_lock(mp_so, 1);
1432 locked = 1;
1433
1434 /* Check again, because it might have changed while waiting */
1435 if (mp->mpp_socket->so_usecount == 0) {
1436 goto out;
1437 }
1438 }
1439
1440 socket_lock_assert_owned(mp_so);
1441
1442 mp_tp = mpte->mpte_mptcb;
1443
1444 ifnet_head_lock_shared();
1445 ifp = ifindex2ifnet[ifindex];
1446 ifnet_head_done();
1447
1448 os_log(mptcp_log_handle, "%s - %lx: action: %u ifindex %u delegated to %u usecount %u mpt_flags %#x state %u v4 %u v6 %u nat64 %u power %u\n",
1449 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), action, ifindex,
1450 ifp && ifp->if_delegated.ifp ? ifp->if_delegated.ifp->if_index : IFSCOPE_NONE,
1451 mp->mpp_socket->so_usecount, mp_tp->mpt_flags, mp_tp->mpt_state,
1452 has_v4, has_v6, has_nat64, low_power);
1453
1454 /* No need on fallen back sockets */
1455 if (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) {
1456 goto out;
1457 }
1458
1459 /*
1460 * When the interface goes in low-power mode we don't want to establish
1461 * new subflows on it. Thus, mark it internally as non-viable.
1462 */
1463 if (low_power) {
1464 action = NECP_CLIENT_CBACTION_NONVIABLE;
1465 }
1466
1467 if (action == NECP_CLIENT_CBACTION_NONVIABLE) {
1468 for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
1469 if (mpte->mpte_itfinfo[i].ifindex == IFSCOPE_NONE) {
1470 continue;
1471 }
1472
1473 if (mpte->mpte_itfinfo[i].ifindex == ifindex) {
1474 mptcp_reset_itfinfo(&mpte->mpte_itfinfo[i]);
1475 }
1476 }
1477
1478 mptcp_sched_create_subflows(mpte);
1479 } else if (action == NECP_CLIENT_CBACTION_VIABLE ||
1480 action == NECP_CLIENT_CBACTION_INITIAL) {
1481 int found_slot = 0, slot_index = -1;
1482 struct sockaddr *dst;
1483
1484 if (ifp == NULL) {
1485 goto out;
1486 }
1487
1488 if (IFNET_IS_COMPANION_LINK(ifp)) {
1489 goto out;
1490 }
1491
1492 if (IFNET_IS_EXPENSIVE(ifp) &&
1493 (mp_so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE)) {
1494 goto out;
1495 }
1496
1497 if (IFNET_IS_CONSTRAINED(ifp) &&
1498 (mp_so->so_restrictions & SO_RESTRICT_DENY_CONSTRAINED)) {
1499 goto out;
1500 }
1501
1502 if (IFNET_IS_CELLULAR(ifp) &&
1503 (mp_so->so_restrictions & SO_RESTRICT_DENY_CELLULAR)) {
1504 goto out;
1505 }
1506
1507 if (IS_INTF_CLAT46(ifp)) {
1508 has_v4 = FALSE;
1509 }
1510
1511 /* Look for the slot on where to store/update the interface-info. */
1512 for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
1513 /* Found a potential empty slot where we can put it */
1514 if (mpte->mpte_itfinfo[i].ifindex == 0) {
1515 found_slot = 1;
1516 slot_index = i;
1517 }
1518
1519 /*
1520 * The interface is already in our array. Check if we
1521 * need to update it.
1522 */
1523 if (mpte->mpte_itfinfo[i].ifindex == ifindex &&
1524 (mpte->mpte_itfinfo[i].has_v4_conn != has_v4 ||
1525 mpte->mpte_itfinfo[i].has_v6_conn != has_v6 ||
1526 mpte->mpte_itfinfo[i].has_nat64_conn != has_nat64)) {
1527 found_slot = 1;
1528 slot_index = i;
1529 break;
1530 }
1531
1532 if (mpte->mpte_itfinfo[i].ifindex == ifindex) {
1533 /*
1534 * Ok, it's already there and we don't need
1535 * to update it
1536 */
1537 goto out;
1538 }
1539 }
1540
1541 dst = mptcp_get_session_dst(mpte, has_v6, has_v4);
1542 if (dst && dst->sa_family == AF_INET &&
1543 has_v6 && !has_nat64 && !has_v4) {
1544 if (found_slot) {
1545 mpte->mpte_itfinfo[slot_index].ifindex = ifindex;
1546 mpte->mpte_itfinfo[slot_index].has_v4_conn = has_v4;
1547 mpte->mpte_itfinfo[slot_index].has_v6_conn = has_v6;
1548 mpte->mpte_itfinfo[slot_index].has_nat64_conn = has_nat64;
1549 }
1550 goto out;
1551 }
1552
1553 if (found_slot == 0) {
1554 int new_size = mpte->mpte_itfinfo_size * 2;
1555 struct mpt_itf_info *info = kalloc_data(sizeof(*info) * new_size, Z_ZERO);
1556
1557 if (info == NULL) {
1558 os_log_error(mptcp_log_handle, "%s - %lx: malloc failed for %u\n",
1559 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), new_size);
1560 goto out;
1561 }
1562
1563 memcpy(info, mpte->mpte_itfinfo, mpte->mpte_itfinfo_size * sizeof(*info));
1564
1565 if (mpte->mpte_itfinfo_size > MPTE_ITFINFO_SIZE) {
1566 kfree_data(mpte->mpte_itfinfo,
1567 sizeof(*info) * mpte->mpte_itfinfo_size);
1568 }
1569
1570 /* We allocated a new one, thus the first must be empty */
1571 slot_index = mpte->mpte_itfinfo_size;
1572
1573 mpte->mpte_itfinfo = info;
1574 mpte->mpte_itfinfo_size = new_size;
1575 }
1576
1577 VERIFY(slot_index >= 0 && slot_index < (int)mpte->mpte_itfinfo_size);
1578 mpte->mpte_itfinfo[slot_index].ifindex = ifindex;
1579 mpte->mpte_itfinfo[slot_index].has_v4_conn = has_v4;
1580 mpte->mpte_itfinfo[slot_index].has_v6_conn = has_v6;
1581 mpte->mpte_itfinfo[slot_index].has_nat64_conn = has_nat64;
1582
1583 mptcp_sched_create_subflows(mpte);
1584 }
1585
1586 out:
1587 if (locked) {
1588 socket_unlock(mp_so, 1);
1589 }
1590 }
1591
1592 void
mptcp_set_restrictions(struct socket * mp_so)1593 mptcp_set_restrictions(struct socket *mp_so)
1594 {
1595 struct mptses *mpte = mpsotompte(mp_so);
1596 uint32_t i;
1597
1598 socket_lock_assert_owned(mp_so);
1599
1600 ifnet_head_lock_shared();
1601
1602 for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
1603 struct mpt_itf_info *info = &mpte->mpte_itfinfo[i];
1604 uint32_t ifindex = info->ifindex;
1605 struct ifnet *ifp;
1606
1607 if (ifindex == IFSCOPE_NONE) {
1608 continue;
1609 }
1610
1611 ifp = ifindex2ifnet[ifindex];
1612 if (ifp == NULL) {
1613 continue;
1614 }
1615
1616 if (IFNET_IS_EXPENSIVE(ifp) &&
1617 (mp_so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE)) {
1618 info->ifindex = IFSCOPE_NONE;
1619 }
1620
1621 if (IFNET_IS_CONSTRAINED(ifp) &&
1622 (mp_so->so_restrictions & SO_RESTRICT_DENY_CONSTRAINED)) {
1623 info->ifindex = IFSCOPE_NONE;
1624 }
1625
1626 if (IFNET_IS_CELLULAR(ifp) &&
1627 (mp_so->so_restrictions & SO_RESTRICT_DENY_CELLULAR)) {
1628 info->ifindex = IFSCOPE_NONE;
1629 }
1630 }
1631
1632 ifnet_head_done();
1633 }
1634