1 /*
2 * Copyright (c) 2012-2021 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 #include <kern/locks.h>
30 #include <kern/policy_internal.h>
31 #include <kern/zalloc.h>
32
33 #include <mach/sdt.h>
34
35 #include <sys/domain.h>
36 #include <sys/kdebug.h>
37 #include <sys/kern_control.h>
38 #include <sys/kernel.h>
39 #include <sys/mbuf.h>
40 #include <sys/mcache.h>
41 #include <sys/param.h>
42 #include <sys/proc.h>
43 #include <sys/protosw.h>
44 #include <sys/resourcevar.h>
45 #include <sys/socket.h>
46 #include <sys/socketvar.h>
47 #include <sys/sysctl.h>
48 #include <sys/syslog.h>
49 #include <sys/systm.h>
50
51 #include <net/content_filter.h>
52 #include <net/if.h>
53 #include <net/if_var.h>
54 #include <netinet/in.h>
55 #include <netinet/in_pcb.h>
56 #include <netinet/in_var.h>
57 #include <netinet/tcp.h>
58 #include <netinet/tcp_cache.h>
59 #include <netinet/tcp_fsm.h>
60 #include <netinet/tcp_seq.h>
61 #include <netinet/tcp_var.h>
62 #include <netinet/mptcp_var.h>
63 #include <netinet/mptcp.h>
64 #include <netinet/mptcp_opt.h>
65 #include <netinet/mptcp_seq.h>
66 #include <netinet/mptcp_timer.h>
67 #include <libkern/crypto/sha1.h>
68 #include <libkern/crypto/sha2.h>
69 #include <netinet6/in6_pcb.h>
70 #include <netinet6/ip6protosw.h>
71 #include <dev/random/randomdev.h>
72 #include <net/sockaddr_utils.h>
73
74 /*
75 * Notes on MPTCP implementation.
76 *
77 * MPTCP is implemented as <SOCK_STREAM,IPPROTO_TCP> protocol in PF_MULTIPATH
78 * communication domain. The structure mtcbinfo describes the MPTCP instance
79 * of a Multipath protocol in that domain. It is used to keep track of all
80 * MPTCP PCB instances in the system, and is protected by the global lock
81 * mppi_lock.
82 *
83 * An MPTCP socket is opened by calling socket(PF_MULTIPATH, SOCK_STREAM,
84 * IPPROTO_TCP). Upon success, a Multipath PCB gets allocated and along with
85 * it comes an MPTCP Session and an MPTCP PCB. All three structures are
86 * allocated from the same memory block, and each structure has a pointer
87 * to the adjacent ones. The layout is defined by the mpp_mtp structure.
88 * The socket lock (mpp_lock) is used to protect accesses to the Multipath
89 * PCB (mppcb) as well as the MPTCP Session (mptses).
90 *
91 * The MPTCP Session is an MPTCP-specific extension to the Multipath PCB;
92 *
93 * A functioning MPTCP Session consists of one or more subflow sockets. Each
94 * subflow socket is essentially a regular PF_INET/PF_INET6 TCP socket, and is
95 * represented by the mptsub structure. Because each subflow requires access
96 * to the MPTCP Session, the MPTCP socket's so_usecount is bumped up for each
97 * subflow. This gets decremented prior to the subflow's destruction.
98 *
99 * To handle events (read, write, control) from the subflows, we do direct
100 * upcalls into the specific function.
101 *
102 * The whole MPTCP connection is protected by a single lock, the MPTCP socket's
103 * lock. Incoming data on a subflow also ends up taking this single lock. To
104 * achieve the latter, tcp_lock/unlock has been changed to rather use the lock
105 * of the MPTCP-socket.
106 *
107 * An MPTCP socket will be destroyed when its so_usecount drops to zero; this
108 * work is done by the MPTCP garbage collector which is invoked on demand by
109 * the PF_MULTIPATH garbage collector. This process will take place once all
110 * of the subflows have been destroyed.
111 */
112
113 static void mptcp_subflow_abort(struct mptsub *, int);
114
115 static void mptcp_send_dfin(struct socket *so);
116 static void mptcp_set_cellicon(struct mptses *mpte, struct mptsub *mpts);
117 static int mptcp_freeq(struct mptcb *mp_tp);
118
119 /*
120 * Possible return values for subflow event handlers. Note that success
121 * values must be greater or equal than MPTS_EVRET_OK. Values less than that
122 * indicate errors or actions which require immediate attention; they will
123 * prevent the rest of the handlers from processing their respective events
124 * until the next round of events processing.
125 */
126 typedef enum {
127 MPTS_EVRET_DELETE = 1, /* delete this subflow */
128 MPTS_EVRET_OK = 2, /* OK */
129 MPTS_EVRET_CONNECT_PENDING = 3, /* resume pended connects */
130 MPTS_EVRET_DISCONNECT_FALLBACK = 4, /* abort all but preferred */
131 } ev_ret_t;
132
133 static void mptcp_do_sha1(mptcp_key_t *, char sha_digest[SHA1_RESULTLEN]);
134 static void mptcp_do_sha256(mptcp_key_t *, char sha_digest[SHA256_DIGEST_LENGTH]);
135
136 static void mptcp_init_local_parms(struct mptses *, struct sockaddr *);
137
138 static KALLOC_TYPE_DEFINE(mptsub_zone, struct mptsub, NET_KT_DEFAULT);
139 static KALLOC_TYPE_DEFINE(mptopt_zone, struct mptopt, NET_KT_DEFAULT);
140 static KALLOC_TYPE_DEFINE(mpt_subauth_zone, struct mptcp_subf_auth_entry,
141 NET_KT_DEFAULT);
142
143 struct mppcbinfo mtcbinfo;
144
145 SYSCTL_DECL(_net_inet);
146
147 SYSCTL_NODE(_net_inet, OID_AUTO, mptcp, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "MPTCP");
148
149 SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, pcbcount, CTLFLAG_RD | CTLFLAG_LOCKED,
150 &mtcbinfo.mppi_count, 0, "Number of active PCBs");
151
152
153 static int mptcp_alternate_port = 0;
154 SYSCTL_INT(_net_inet_mptcp, OID_AUTO, alternate_port, CTLFLAG_RW | CTLFLAG_LOCKED,
155 &mptcp_alternate_port, 0, "Set alternate port for MPTCP connections");
156
157 static struct protosw mptcp_subflow_protosw;
158 static struct pr_usrreqs mptcp_subflow_usrreqs;
159 static struct ip6protosw mptcp_subflow_protosw6;
160 static struct pr_usrreqs mptcp_subflow_usrreqs6;
161
162 static uint8_t mptcp_create_subflows_scheduled;
163
164 /* Using Symptoms Advisory to detect poor WiFi or poor Cell */
165 static kern_ctl_ref mptcp_kern_ctrl_ref = NULL;
166 static uint32_t mptcp_kern_skt_inuse = 0;
167 static uint32_t mptcp_kern_skt_unit;
168 static symptoms_advisory_t mptcp_advisory;
169
170 uint32_t mptcp_cellicon_refcount = 0;
171
172 os_log_t mptcp_log_handle;
173
174 int
mptcpstats_get_index_by_ifindex(struct mptcp_itf_stats * stats __counted_by (stats_count),uint16_t stats_count,u_short ifindex,boolean_t create)175 mptcpstats_get_index_by_ifindex(struct mptcp_itf_stats *stats __counted_by(stats_count), uint16_t stats_count, u_short ifindex, boolean_t create)
176 {
177 int i, index = -1;
178
179 VERIFY(stats_count <= MPTCP_ITFSTATS_SIZE);
180
181 for (i = 0; i < MPTCP_ITFSTATS_SIZE; i++) {
182 if (create && stats[i].ifindex == IFSCOPE_NONE) {
183 if (index < 0) {
184 index = i;
185 }
186 continue;
187 }
188
189 if (stats[i].ifindex == ifindex) {
190 index = i;
191 return index;
192 }
193 }
194
195 if (index != -1) {
196 stats[index].ifindex = ifindex;
197 }
198
199 return index;
200 }
201
202 static int
mptcpstats_get_index(struct mptcp_itf_stats * stats __counted_by (stats_count),uint16_t stats_count,const struct mptsub * mpts)203 mptcpstats_get_index(struct mptcp_itf_stats *stats __counted_by(stats_count), uint16_t stats_count, const struct mptsub *mpts)
204 {
205 const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
206 int index;
207
208 VERIFY(stats_count <= MPTCP_ITFSTATS_SIZE);
209
210 if (ifp == NULL) {
211 os_log_error(mptcp_log_handle, "%s - %lx: no ifp on subflow, state %u flags %#x\n",
212 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpts->mpts_mpte),
213 sototcpcb(mpts->mpts_socket)->t_state, mpts->mpts_flags);
214 return -1;
215 }
216
217 index = mptcpstats_get_index_by_ifindex(stats, MPTCP_ITFSTATS_SIZE, ifp->if_index, true);
218
219 if (index != -1) {
220 if (stats[index].is_expensive == 0) {
221 stats[index].is_expensive = IFNET_IS_CELLULAR(ifp);
222 }
223 }
224
225 return index;
226 }
227
228 void
mptcpstats_inc_switch(struct mptses * mpte,const struct mptsub * mpts)229 mptcpstats_inc_switch(struct mptses *mpte, const struct mptsub *mpts)
230 {
231 int index;
232
233 tcpstat.tcps_mp_switches++;
234 mpte->mpte_subflow_switches++;
235
236 index = mptcpstats_get_index(mpte->mpte_itfstats, MPTCP_ITFSTATS_SIZE, mpts);
237
238 if (index != -1) {
239 mpte->mpte_itfstats[index].switches++;
240 }
241 }
242
243 /*
244 * Flushes all recorded socket options from an MP socket.
245 */
246 static void
mptcp_flush_sopts(struct mptses * mpte)247 mptcp_flush_sopts(struct mptses *mpte)
248 {
249 struct mptopt *mpo, *tmpo;
250
251 TAILQ_FOREACH_SAFE(mpo, &mpte->mpte_sopts, mpo_entry, tmpo) {
252 mptcp_sopt_remove(mpte, mpo);
253 mptcp_sopt_free(mpo);
254 }
255 VERIFY(TAILQ_EMPTY(&mpte->mpte_sopts));
256 }
257
/*
 * Create an MPTCP session, called as a result of opening a MPTCP socket.
 *
 * The Multipath PCB (mppcb), the MPTCP session (mptses) and the MPTCP
 * PCB (mptcb) all live inside a single mpp_mtp allocation; this routine
 * initializes the session and PCB portions and cross-links them with
 * the already-initialized mppcb.  Returns 0 (cannot fail).
 */
int
mptcp_session_create(struct mppcb *mpp)
{
	struct mpp_mtp *mtp;
	struct mppcbinfo *mppi;
	struct mptses *mpte;
	struct mptcb *mp_tp;

	VERIFY(mpp != NULL);
	mppi = mpp->mpp_pcbinfo;
	VERIFY(mppi != NULL);

	/* Recover the enclosing combined allocation from the mppcb */
	mtp = __container_of(mpp, struct mpp_mtp, mpp);
	mpte = &mtp->mpp_ses;
	mp_tp = &mtp->mtcb;

	/* MPTCP Multipath PCB Extension */
	bzero(mpte, sizeof(*mpte));
	VERIFY(mpp->mpp_pcbe == NULL);
	/* Cross-link the extension with its PCB and protocol block */
	mpp->mpp_pcbe = mpte;
	mpte->mpte_mppcb = mpp;
	mpte->mpte_mptcb = mp_tp;

	TAILQ_INIT(&mpte->mpte_sopts);
	TAILQ_INIT(&mpte->mpte_subflows);
	/* No association / connection IDs have been handed out yet */
	mpte->mpte_associd = SAE_ASSOCID_ANY;
	mpte->mpte_connid_last = SAE_CONNID_ANY;

	mptcp_init_urgency_timer(mpte);

	/* Start with the embedded, fixed-size interface-info array */
	mpte->mpte_itfinfo = &mpte->_mpte_itfinfo[0];
	mpte->mpte_itfinfo_size = MPTE_ITFINFO_SIZE;

	/* Honor the sysctl'ed alternate port when it is a valid port number */
	if (mptcp_alternate_port > 0 && mptcp_alternate_port < UINT16_MAX) {
		mpte->mpte_alternate_port = htons((uint16_t)mptcp_alternate_port);
	}

	mpte->mpte_last_cellicon_set = tcp_now;

	/* MPTCP Protocol Control Block */
	bzero(mp_tp, sizeof(*mp_tp));
	mp_tp->mpt_mpte = mpte;
	mp_tp->mpt_state = MPTCPS_CLOSED;

	DTRACE_MPTCP1(session__create, struct mppcb *, mpp);

	return 0;
}
309
310 struct sockaddr *
mptcp_get_session_dst(struct mptses * mpte,boolean_t ipv6,boolean_t ipv4)311 mptcp_get_session_dst(struct mptses *mpte, boolean_t ipv6, boolean_t ipv4)
312 {
313 if (ipv6 && mpte->mpte_sub_dst_v6.sin6_family == AF_INET6) {
314 return SA(&mpte->mpte_sub_dst_v6);
315 }
316
317 if (ipv4 && mpte->mpte_sub_dst_v4.sin_family == AF_INET) {
318 return SA(&mpte->mpte_sub_dst_v4);
319 }
320
321 /* The interface has neither IPv4 nor IPv6 routes. Give our best guess,
322 * meaning we prefer IPv6 over IPv4.
323 */
324 if (mpte->mpte_sub_dst_v6.sin6_family == AF_INET6) {
325 return SA(&mpte->mpte_sub_dst_v6);
326 }
327
328 if (mpte->mpte_sub_dst_v4.sin_family == AF_INET) {
329 return SA(&mpte->mpte_sub_dst_v4);
330 }
331
332 /* We don't yet have a unicast IP */
333 return NULL;
334 }
335
336 static void
mptcpstats_get_bytes(struct mptses * mpte,boolean_t initial_cell,uint64_t * cellbytes,uint64_t * allbytes)337 mptcpstats_get_bytes(struct mptses *mpte, boolean_t initial_cell,
338 uint64_t *cellbytes, uint64_t *allbytes)
339 {
340 int64_t mycellbytes = 0;
341 uint64_t myallbytes = 0;
342 int i;
343
344 for (i = 0; i < MPTCP_ITFSTATS_SIZE; i++) {
345 if (mpte->mpte_itfstats[i].is_expensive) {
346 mycellbytes += mpte->mpte_itfstats[i].mpis_txbytes;
347 mycellbytes += mpte->mpte_itfstats[i].mpis_rxbytes;
348 }
349
350 myallbytes += mpte->mpte_itfstats[i].mpis_txbytes;
351 myallbytes += mpte->mpte_itfstats[i].mpis_rxbytes;
352 }
353
354 if (initial_cell) {
355 mycellbytes -= mpte->mpte_init_txbytes;
356 mycellbytes -= mpte->mpte_init_rxbytes;
357 }
358
359 if (mycellbytes < 0) {
360 os_log_error(mptcp_log_handle, "%s - %lx: cellbytes is %lld\n",
361 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mycellbytes);
362 *cellbytes = 0;
363 *allbytes = 0;
364 } else {
365 *cellbytes = mycellbytes;
366 *allbytes = myallbytes;
367 }
368 }
369
/*
 * Final statistics accounting for a session that is being destroyed:
 * bump the attempt/success counters for the session's service type
 * (first-party apps are accounted separately), record interface
 * transitions (cell<->wifi), and accumulate the session's byte counts
 * into the per-service-type totals.
 */
static void
mptcpstats_session_wrapup(struct mptses *mpte)
{
	/* Whether the initial subflow was established over cellular */
	boolean_t cell = mpte->mpte_initial_cell;

	switch (mpte->mpte_svctype) {
	case MPTCP_SVCTYPE_HANDOVER:
		if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
			tcpstat.tcps_mptcp_fp_handover_attempt++;

			if (cell && mpte->mpte_handshake_success) {
				tcpstat.tcps_mptcp_fp_handover_success_cell++;

				/* Started on cell, later also used WiFi */
				if (mpte->mpte_used_wifi) {
					tcpstat.tcps_mptcp_handover_wifi_from_cell++;
				}
			} else if (mpte->mpte_handshake_success) {
				tcpstat.tcps_mptcp_fp_handover_success_wifi++;

				/* Started on WiFi, later handed over to cell */
				if (mpte->mpte_used_cell) {
					tcpstat.tcps_mptcp_handover_cell_from_wifi++;
				}
			}
		} else {
			tcpstat.tcps_mptcp_handover_attempt++;

			if (cell && mpte->mpte_handshake_success) {
				tcpstat.tcps_mptcp_handover_success_cell++;

				if (mpte->mpte_used_wifi) {
					tcpstat.tcps_mptcp_handover_wifi_from_cell++;
				}
			} else if (mpte->mpte_handshake_success) {
				tcpstat.tcps_mptcp_handover_success_wifi++;

				if (mpte->mpte_used_cell) {
					tcpstat.tcps_mptcp_handover_cell_from_wifi++;
				}
			}
		}

		/* Byte totals are only meaningful for established sessions */
		if (mpte->mpte_handshake_success) {
			uint64_t cellbytes;
			uint64_t allbytes;

			mptcpstats_get_bytes(mpte, cell, &cellbytes, &allbytes);

			tcpstat.tcps_mptcp_handover_cell_bytes += cellbytes;
			tcpstat.tcps_mptcp_handover_all_bytes += allbytes;
		}
		break;
	case MPTCP_SVCTYPE_INTERACTIVE:
		if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
			tcpstat.tcps_mptcp_fp_interactive_attempt++;

			if (mpte->mpte_handshake_success) {
				tcpstat.tcps_mptcp_fp_interactive_success++;

				/* Started on WiFi but had to bring in cell */
				if (!cell && mpte->mpte_used_cell) {
					tcpstat.tcps_mptcp_interactive_cell_from_wifi++;
				}
			}
		} else {
			tcpstat.tcps_mptcp_interactive_attempt++;

			if (mpte->mpte_handshake_success) {
				tcpstat.tcps_mptcp_interactive_success++;

				if (!cell && mpte->mpte_used_cell) {
					tcpstat.tcps_mptcp_interactive_cell_from_wifi++;
				}
			}
		}

		if (mpte->mpte_handshake_success) {
			uint64_t cellbytes;
			uint64_t allbytes;

			mptcpstats_get_bytes(mpte, cell, &cellbytes, &allbytes);

			tcpstat.tcps_mptcp_interactive_cell_bytes += cellbytes;
			tcpstat.tcps_mptcp_interactive_all_bytes += allbytes;
		}
		break;
	case MPTCP_SVCTYPE_AGGREGATE:
		if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
			tcpstat.tcps_mptcp_fp_aggregate_attempt++;

			if (mpte->mpte_handshake_success) {
				tcpstat.tcps_mptcp_fp_aggregate_success++;
			}
		} else {
			tcpstat.tcps_mptcp_aggregate_attempt++;

			if (mpte->mpte_handshake_success) {
				tcpstat.tcps_mptcp_aggregate_success++;
			}
		}

		if (mpte->mpte_handshake_success) {
			uint64_t cellbytes;
			uint64_t allbytes;

			mptcpstats_get_bytes(mpte, cell, &cellbytes, &allbytes);

			tcpstat.tcps_mptcp_aggregate_cell_bytes += cellbytes;
			tcpstat.tcps_mptcp_aggregate_all_bytes += allbytes;
		}
		break;
	}

	/* Session started on cell but managed to move back to WiFi */
	if (cell && mpte->mpte_handshake_success && mpte->mpte_used_wifi) {
		tcpstat.tcps_mptcp_back_to_wifi++;
	}

	if (mpte->mpte_triggered_cell) {
		tcpstat.tcps_mptcp_triggered_cell++;
	}
}
489
/*
 * Destroy an MPTCP session.
 *
 * Called once all subflows are gone; wraps up statistics, releases the
 * cell icon, flushes recorded socket options, frees the (possibly
 * grown) interface-info array, and drops queued/reinjected mbufs.
 */
static void
mptcp_session_destroy(struct mptses *mpte)
{
	struct mptcb *mp_tp = mpte->mpte_mptcb;

	VERIFY(mp_tp != NULL);
	/* No subflows may remain by the time the session dies */
	VERIFY(TAILQ_EMPTY(&mpte->mpte_subflows) && mpte->mpte_numflows == 0);

	mptcpstats_session_wrapup(mpte);
	/* Release every cell-icon reference this session still holds */
	mptcp_unset_cellicon(mpte, NULL, mpte->mpte_cellicon_increments);
	mptcp_flush_sopts(mpte);

	/* Only free when the array was grown beyond the embedded one */
	if (mpte->mpte_itfinfo_size > MPTE_ITFINFO_SIZE) {
		kfree_data_counted_by(mpte->mpte_itfinfo, mpte->mpte_itfinfo_size);
	}
	mpte->mpte_itfinfo = NULL;
	mpte->mpte_itfinfo_size = 0;

	/* Drop the receive reassembly queue and pending reinjections */
	mptcp_freeq(mp_tp);
	m_freem_list(mpte->mpte_reinjectq);

	os_log(mptcp_log_handle, "%s - %lx: Destroying session\n",
	    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
}
517
518 boolean_t
mptcp_ok_to_create_subflows(struct mptcb * mp_tp)519 mptcp_ok_to_create_subflows(struct mptcb *mp_tp)
520 {
521 return mp_tp->mpt_state >= MPTCPS_ESTABLISHED &&
522 mp_tp->mpt_state < MPTCPS_FIN_WAIT_1 &&
523 !(mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP);
524 }
525
/*
 * Synthesize an IPv6 address from an IPv4 address and a NAT64 prefix,
 * following the RFC 6052 address format: the IPv4 octets are embedded
 * at a position determined by the prefix length, skipping octet 8
 * (the reserved "u" octet) for prefixes shorter than /96.
 *
 * `addr0' holds the NAT64 prefix on entry and the synthesized address
 * on success.  Returns 0 on success, -1 when the IPv4 address must not
 * be translated (special-purpose ranges, or private/shared space under
 * the well-known 64:ff9b::/96 prefix, per RFC 6052 section 3.1).
 * Panics on an invalid prefix length.
 */
static int
mptcp_synthesize_nat64(struct in6_addr *addr0, uint32_t len,
    const struct in_addr *addrv4_0)
{
	static const struct in6_addr well_known_prefix = {
		.__u6_addr.__u6_addr8 = {0x00, 0x64, 0xff, 0x9b, 0x00, 0x00,
			                 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
			                 0x00, 0x00, 0x00, 0x00},
	};
	/* Byte views for splicing the IPv4 address into the prefix */
	struct in6_addr *addr = addr0;
	char *ptr = (char *)addr;
	const struct in_addr *addrv4 = addrv4_0;
	const char *ptrv4 = (const char *)addrv4;

	/* Refuse to translate special-purpose IPv4 ranges */
	if (IN_ZERONET(ntohl(addrv4->s_addr)) ||        // 0.0.0.0/8 Source hosts on local network
	    IN_LOOPBACK(ntohl(addrv4->s_addr)) ||       // 127.0.0.0/8 Loopback
	    IN_LINKLOCAL(ntohl(addrv4->s_addr)) ||      // 169.254.0.0/16 Link Local
	    IN_DS_LITE(ntohl(addrv4->s_addr)) ||        // 192.0.0.0/29 DS-Lite
	    IN_6TO4_RELAY_ANYCAST(ntohl(addrv4->s_addr)) || // 192.88.99.0/24 6to4 Relay Anycast
	    IN_MULTICAST(ntohl(addrv4->s_addr)) ||      // 224.0.0.0/4 Multicast
	    INADDR_BROADCAST == addrv4->s_addr) {       // 255.255.255.255/32 Limited Broadcast
		return -1;
	}

	/* Check for the well-known prefix */
	if (len == NAT64_PREFIX_LEN_96 &&
	    IN6_ARE_ADDR_EQUAL(addr, &well_known_prefix)) {
		if (IN_PRIVATE(ntohl(addrv4->s_addr)) ||    // 10.0.0.0/8, 172.16.0.0/12, 192.168.0.0/16 Private-Use
		    IN_SHARED_ADDRESS_SPACE(ntohl(addrv4->s_addr))) { // 100.64.0.0/10 Shared Address Space
			return -1;
		}
	}

	/* Embed the IPv4 octets at the RFC 6052 position for this prefix */
	switch (len) {
	case NAT64_PREFIX_LEN_96:
		memcpy(ptr + 12, ptrv4, 4);
		break;
	case NAT64_PREFIX_LEN_64:
		memcpy(ptr + 9, ptrv4, 4);
		break;
	case NAT64_PREFIX_LEN_56:
		/* IPv4 address straddles the reserved octet 8 */
		memcpy(ptr + 7, ptrv4, 1);
		memcpy(ptr + 9, ptrv4 + 1, 3);
		break;
	case NAT64_PREFIX_LEN_48:
		memcpy(ptr + 6, ptrv4, 2);
		memcpy(ptr + 9, ptrv4 + 2, 2);
		break;
	case NAT64_PREFIX_LEN_40:
		memcpy(ptr + 5, ptrv4, 3);
		memcpy(ptr + 9, ptrv4 + 3, 1);
		break;
	case NAT64_PREFIX_LEN_32:
		memcpy(ptr + 4, ptrv4, 4);
		break;
	default:
		panic("NAT64-prefix len is wrong: %u", len);
	}

	return 0;
}
587
/*
 * Ask the cellular radio manager (via NECP) to bring up the cellular
 * interface on behalf of this session.  On success, remember that we
 * triggered cell so it can be accounted at session wrap-up.
 *
 * The MP socket lock is dropped around the NECP call to avoid holding
 * it across the radio-manager assertion; the session must therefore be
 * revalidated by callers sensitive to state changes across this call.
 */
static void
mptcp_trigger_cell_bringup(struct mptses *mpte)
{
	struct socket *mp_so = mptetoso(mpte);

	if (!uuid_is_null(mpsotomppcb(mp_so)->necp_client_uuid)) {
		uuid_string_t uuidstr;
		int err;

		/* Drop the socket lock across the NECP radio-manager call */
		socket_unlock(mp_so, 0);
		err = necp_client_assert_bb_radio_manager(mpsotomppcb(mp_so)->necp_client_uuid,
		    TRUE);
		socket_lock(mp_so, 0);

		if (err == 0) {
			mpte->mpte_triggered_cell = 1;
		}

		uuid_unparse_upper(mpsotomppcb(mp_so)->necp_client_uuid, uuidstr);
		os_log_info(mptcp_log_handle, "%s - %lx: asked irat to bringup cell for uuid %s, err %d\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), uuidstr, err);
	} else {
		os_log_info(mptcp_log_handle, "%s - %lx: UUID is already null\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
	}
}
614
615 static boolean_t
mptcp_subflow_disconnecting(struct mptsub * mpts)616 mptcp_subflow_disconnecting(struct mptsub *mpts)
617 {
618 if (mpts->mpts_socket->so_state & SS_ISDISCONNECTED) {
619 return true;
620 }
621
622 if (mpts->mpts_flags & (MPTSF_DISCONNECTING | MPTSF_DISCONNECTED | MPTSF_CLOSE_REQD)) {
623 return true;
624 }
625
626 if (sototcpcb(mpts->mpts_socket)->t_state == TCPS_CLOSED) {
627 return true;
628 }
629
630 return false;
631 }
632
633 /*
634 * In Handover mode, only create cell subflow if
635 * - Symptoms marked WiFi as weak:
636 * Here, if we are sending data, then we can check the RTO-state. That is a
637 * stronger signal of WiFi quality than the Symptoms indicator.
638 * If however we are not sending any data, the only thing we can do is guess
639 * and thus bring up Cell.
640 *
641 * - Symptoms marked WiFi as unknown:
642 * In this state we don't know what the situation is and thus remain
643 * conservative, only bringing up cell if there are retransmissions going on.
644 */
645 static boolean_t
mptcp_handover_use_cellular(struct mptses * mpte,struct tcpcb * tp)646 mptcp_handover_use_cellular(struct mptses *mpte, struct tcpcb *tp)
647 {
648 mptcp_wifi_quality_t wifi_quality = mptcp_wifi_quality_for_session(mpte);
649
650 if (wifi_quality == MPTCP_WIFI_QUALITY_GOOD) {
651 /* WiFi is good - don't use cell */
652 return false;
653 }
654
655 if (wifi_quality == MPTCP_WIFI_QUALITY_UNSURE) {
656 /*
657 * We are in unknown state, only use Cell if we have confirmed
658 * that WiFi is bad.
659 */
660 if (mptetoso(mpte)->so_snd.sb_cc != 0 && tp->t_rxtshift >= mptcp_fail_thresh * 2) {
661 return true;
662 } else {
663 return false;
664 }
665 }
666
667 if (wifi_quality == MPTCP_WIFI_QUALITY_BAD) {
668 /*
669 * WiFi is confirmed to be bad from Symptoms-Framework.
670 * If we are sending data, check the RTOs.
671 * Otherwise, be pessimistic and use Cell.
672 */
673 if (mptetoso(mpte)->so_snd.sb_cc != 0) {
674 if (tp->t_rxtshift >= mptcp_fail_thresh * 2) {
675 return true;
676 } else {
677 return false;
678 }
679 } else {
680 return true;
681 }
682 }
683
684 return false;
685 }
686
/*
 * Walk the session's known interfaces and create any subflow that
 * policy says should exist but does not yet:
 *
 *  - skip interfaces without MPTCP support, and cellular interfaces
 *    when WiFi is good in (pure-)handover mode;
 *  - skip interfaces that already carry a live subflow, and in
 *    handover/target-based mode suppress cell entirely while a
 *    suitable WiFi subflow is working;
 *  - ask Symptoms for permission before bringing up cell for
 *    non-entitled apps;
 *  - synthesize a NAT64 destination when only an IPv4 server address
 *    is known on a v6-only interface.
 *
 * If no cellular interface is usable but policy wants one, ask the
 * radio manager to bring cell up.
 */
void
mptcp_check_subflows_and_add(struct mptses *mpte)
{
	struct mptcb *mp_tp = mpte->mpte_mptcb;
	boolean_t cellular_viable = FALSE;
	boolean_t want_cellular = TRUE;
	uint32_t i;

	if (!mptcp_ok_to_create_subflows(mp_tp)) {
		os_log_debug(mptcp_log_handle, "%s - %lx: not a good time for subflows, state %u flags %#x",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mp_tp->mpt_state, mp_tp->mpt_flags);
		return;
	}

	/* Just to see if we have an IP-address available */
	if (mptcp_get_session_dst(mpte, false, false) == NULL) {
		return;
	}

	for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
		boolean_t need_to_ask_symptoms = FALSE, found = FALSE;
		struct mpt_itf_info *info;
		struct sockaddr_in6 nat64pre;
		struct sockaddr *dst;
		struct mptsub *mpts;
		struct ifnet *ifp;
		uint32_t ifindex;

		info = &mpte->mpte_itfinfo[i];

		ifindex = info->ifindex;
		if (ifindex == IFSCOPE_NONE) {
			continue;
		}

		os_log(mptcp_log_handle, "%s - %lx: itf %u no support %u hasv4 %u has v6 %u hasnat64 %u\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), info->ifindex, info->no_mptcp_support,
		    info->has_v4_conn, info->has_v6_conn, info->has_nat64_conn);

		if (info->no_mptcp_support) {
			continue;
		}

		/* Resolve the ifindex to an ifnet under the head lock */
		ifnet_head_lock_shared();
		ifp = ifindex2ifnet[ifindex];
		ifnet_head_done();

		if (ifp == NULL) {
			continue;
		}

		if (IFNET_IS_CELLULAR(ifp)) {
			cellular_viable = TRUE;

			/* In handover modes, good WiFi means no cell subflow */
			if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER ||
			    mpte->mpte_svctype == MPTCP_SVCTYPE_PURE_HANDOVER) {
				if (mptcp_wifi_quality_for_session(mpte) == MPTCP_WIFI_QUALITY_GOOD) {
					continue;
				}
			}
		}

		/* Check existing subflows before creating one on this itf */
		TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
			const struct ifnet *subifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
			struct tcpcb *tp = sototcpcb(mpts->mpts_socket);

			if (subifp == NULL) {
				continue;
			}

			/*
			 * If there is at least one functioning subflow on WiFi
			 * and we are checking for the cell interface, then
			 * we always need to ask symptoms for permission as
			 * cell is triggered even if WiFi is available.
			 */
			if (!IFNET_IS_CELLULAR(subifp) &&
			    !mptcp_subflow_disconnecting(mpts) &&
			    IFNET_IS_CELLULAR(ifp)) {
				need_to_ask_symptoms = TRUE;
			}

			if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER || mpte->mpte_svctype == MPTCP_SVCTYPE_PURE_HANDOVER) {
				os_log(mptcp_log_handle,
				    "%s - %lx: %s: cell %u wifi-state %d flags %#x rxt %u first-party %u sb_cc %u ifindex %u this %u rtt %u rttvar %u rto %u\n",
				    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
				    mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER ? "handover" : "pure-handover",
				    IFNET_IS_CELLULAR(subifp),
				    mptcp_wifi_quality_for_session(mpte),
				    mpts->mpts_flags,
				    tp->t_rxtshift,
				    !!(mpte->mpte_flags & MPTE_FIRSTPARTY),
				    mptetoso(mpte)->so_snd.sb_cc,
				    ifindex, subifp->if_index,
				    tp->t_srtt >> TCP_RTT_SHIFT,
				    tp->t_rttvar >> TCP_RTTVAR_SHIFT,
				    tp->t_rxtcur);

				if (!IFNET_IS_CELLULAR(subifp) &&
				    !mptcp_subflow_disconnecting(mpts) &&
				    (mpts->mpts_flags & MPTSF_CONNECTED) &&
				    !mptcp_handover_use_cellular(mpte, tp)) {
					found = TRUE;

					/* We found a proper subflow on WiFi - no need for cell */
					want_cellular = FALSE;
					break;
				}
			} else if (mpte->mpte_svctype == MPTCP_SVCTYPE_TARGET_BASED) {
				uint64_t time_now = mach_continuous_time();

				os_log(mptcp_log_handle,
				    "%s - %lx: target-based: %llu now %llu wifi quality %d cell %u sostat %#x mpts_flags %#x tcp-state %u\n",
				    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpte->mpte_time_target,
				    time_now, mptcp_wifi_quality_for_session(mpte),
				    IFNET_IS_CELLULAR(subifp), mpts->mpts_socket->so_state,
				    mpts->mpts_flags, sototcpcb(mpts->mpts_socket)->t_state);

				/* WiFi suffices while before the target time or WiFi is good */
				if (!IFNET_IS_CELLULAR(subifp) &&
				    !mptcp_subflow_disconnecting(mpts) &&
				    (mpte->mpte_time_target == 0 ||
				    (int64_t)(mpte->mpte_time_target - time_now) > 0 ||
				    mptcp_wifi_quality_for_session(mpte) == MPTCP_WIFI_QUALITY_GOOD)) {
					found = TRUE;

					want_cellular = FALSE;
					break;
				}
			}

			if (subifp->if_index == ifindex &&
			    !mptcp_subflow_disconnecting(mpts)) {
				/*
				 * We found a subflow on this interface.
				 * No need to create a new one.
				 */
				found = TRUE;
				break;
			}
		}

		if (found) {
			continue;
		}

		/* Non-entitled apps need Symptoms' permission to use cell */
		if (need_to_ask_symptoms &&
		    !(mpte->mpte_flags & MPTE_FIRSTPARTY) &&
		    !(mpte->mpte_flags & MPTE_ACCESS_GRANTED) &&
		    mptcp_developer_mode == 0) {
			mptcp_ask_symptoms(mpte);
			return;
		}

		dst = mptcp_get_session_dst(mpte, info->has_v6_conn, info->has_v4_conn);

		/* v4 server on a v6-only itf: synthesize a NAT64 destination */
		if (dst->sa_family == AF_INET &&
		    !info->has_v4_conn && info->has_nat64_conn) {
			struct ipv6_prefix nat64prefixes[NAT64_MAX_NUM_PREFIXES];
			int error, j;

			SOCKADDR_ZERO(&nat64pre, sizeof(struct sockaddr_in6));

			error = ifnet_get_nat64prefix(ifp, nat64prefixes);
			if (error) {
				os_log_error(mptcp_log_handle, "%s - %lx: no NAT64-prefix on itf %s, error %d\n",
				    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), ifp->if_name, error);
				continue;
			}

			/* Pick the first populated prefix slot */
			for (j = 0; j < NAT64_MAX_NUM_PREFIXES; j++) {
				if (nat64prefixes[j].prefix_len != 0) {
					break;
				}
			}

			VERIFY(j < NAT64_MAX_NUM_PREFIXES);

			error = mptcp_synthesize_nat64(&nat64prefixes[j].ipv6_prefix,
			    nat64prefixes[j].prefix_len,
			    &SIN(dst)->sin_addr);
			if (error != 0) {
				os_log_error(mptcp_log_handle, "%s - %lx: cannot synthesize this addr\n",
				    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
				continue;
			}

			memcpy(&nat64pre.sin6_addr,
			    &nat64prefixes[j].ipv6_prefix,
			    sizeof(nat64pre.sin6_addr));
			nat64pre.sin6_len = sizeof(struct sockaddr_in6);
			nat64pre.sin6_family = AF_INET6;
			nat64pre.sin6_port = SIN(dst)->sin_port;
			nat64pre.sin6_flowinfo = 0;
			nat64pre.sin6_scope_id = 0;

			dst = SA(&nat64pre);
		}

		/* Don't create a subflow for a family the itf can't reach */
		if (dst->sa_family == AF_INET && !info->has_v4_conn) {
			continue;
		}
		if (dst->sa_family == AF_INET6 && !info->has_v6_conn) {
			continue;
		}

		mptcp_subflow_add(mpte, NULL, dst, ifindex, NULL);
	}

	if (!cellular_viable && want_cellular) {
		/* Trigger Cell Bringup */
		mptcp_trigger_cell_bringup(mpte);
	}
}
900
901 static void
mptcp_remove_cell_subflows(struct mptses * mpte)902 mptcp_remove_cell_subflows(struct mptses *mpte)
903 {
904 struct mptsub *mpts, *tmpts;
905
906 TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
907 const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
908
909 if (ifp == NULL || !IFNET_IS_CELLULAR(ifp)) {
910 continue;
911 }
912
913 os_log(mptcp_log_handle, "%s - %lx: removing cell subflow\n",
914 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
915
916 soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
917 }
918
919 return;
920 }
921
922 static void
mptcp_remove_wifi_subflows(struct mptses * mpte)923 mptcp_remove_wifi_subflows(struct mptses *mpte)
924 {
925 struct mptsub *mpts, *tmpts;
926
927 TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
928 const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
929
930 if (ifp == NULL || IFNET_IS_CELLULAR(ifp)) {
931 continue;
932 }
933
934 os_log(mptcp_log_handle, "%s - %lx: removing wifi subflow\n",
935 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
936
937 soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
938 }
939
940 return;
941 }
942
/*
 * Pure-handover mode: keep exactly one "side" of the connection alive.
 */
static void
mptcp_pure_handover_subflows_remove(struct mptses *mpte)
{
	mptcp_wifi_quality_t wifi_quality = mptcp_wifi_quality_for_session(mpte);
	boolean_t found_working_wifi_subflow = false;
	boolean_t found_working_cell_subflow = false;

	struct mptsub *mpts;

	/*
	 * Look for a subflow that is on a non-cellular interface in connected
	 * state.
	 *
	 * In that case, remove all cellular subflows.
	 *
	 * If however there is no working WiFi subflow (and WiFi quality is
	 * not good), keep cell: remove the WiFi subflows instead when a
	 * working cellular subflow exists, or leave everything in place.
	 */
	TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
		const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
		struct socket *so;
		struct tcpcb *tp;

		if (ifp == NULL) {
			continue;
		}

		so = mpts->mpts_socket;
		tp = sototcpcb(so);

		/* Only fully established, non-disconnecting subflows count */
		if (!(mpts->mpts_flags & MPTSF_CONNECTED) ||
		    tp->t_state != TCPS_ESTABLISHED ||
		    mptcp_subflow_disconnecting(mpts)) {
			continue;
		}

		if (IFNET_IS_CELLULAR(ifp)) {
			found_working_cell_subflow = true;
		} else {
			os_log_debug(mptcp_log_handle, "%s - %lx: rxt %u sb_cc %u wifi quality %d\n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), tp->t_rxtshift, mptetoso(mpte)->so_snd.sb_cc, wifi_quality);
			/* WiFi subflow is "working" if policy wouldn't use cell */
			if (!mptcp_handover_use_cellular(mpte, tp)) {
				found_working_wifi_subflow = true;
			}
		}
	}

	/*
	 * Couldn't find a working subflow, let's not remove those on a cellular
	 * interface.
	 */
	os_log_debug(mptcp_log_handle, "%s - %lx: Found Wi-Fi: %u Found Cellular %u",
	    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
	    found_working_wifi_subflow, found_working_cell_subflow);
	if (!found_working_wifi_subflow && wifi_quality != MPTCP_WIFI_QUALITY_GOOD) {
		/* Cell is carrying the connection - drop WiFi instead */
		if (found_working_cell_subflow) {
			mptcp_remove_wifi_subflows(mpte);
		}
		return;
	}

	mptcp_remove_cell_subflows(mpte);
}
1005
1006 static void
mptcp_handover_subflows_remove(struct mptses * mpte)1007 mptcp_handover_subflows_remove(struct mptses *mpte)
1008 {
1009 mptcp_wifi_quality_t wifi_quality = mptcp_wifi_quality_for_session(mpte);
1010 boolean_t found_working_subflow = false;
1011 struct mptsub *mpts;
1012
1013 /*
1014 * Look for a subflow that is on a non-cellular interface
1015 * and actually works (aka, no retransmission timeout).
1016 */
1017 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
1018 const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
1019 struct socket *so;
1020 struct tcpcb *tp;
1021
1022 if (ifp == NULL || IFNET_IS_CELLULAR(ifp)) {
1023 continue;
1024 }
1025
1026 so = mpts->mpts_socket;
1027 tp = sototcpcb(so);
1028
1029 if (!(mpts->mpts_flags & MPTSF_CONNECTED) ||
1030 tp->t_state != TCPS_ESTABLISHED) {
1031 continue;
1032 }
1033
1034 os_log_debug(mptcp_log_handle, "%s - %lx: rxt %u sb_cc %u wifi quality %d\n",
1035 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), tp->t_rxtshift, mptetoso(mpte)->so_snd.sb_cc, wifi_quality);
1036
1037 if (!mptcp_handover_use_cellular(mpte, tp)) {
1038 found_working_subflow = true;
1039 break;
1040 }
1041 }
1042
1043 /*
1044 * Couldn't find a working subflow, let's not remove those on a cellular
1045 * interface.
1046 */
1047 if (!found_working_subflow) {
1048 return;
1049 }
1050
1051 mptcp_remove_cell_subflows(mpte);
1052 }
1053
1054 static void
mptcp_targetbased_subflows_remove(struct mptses * mpte)1055 mptcp_targetbased_subflows_remove(struct mptses *mpte)
1056 {
1057 uint64_t time_now = mach_continuous_time();
1058 struct mptsub *mpts;
1059
1060 if (mpte->mpte_time_target != 0 &&
1061 (int64_t)(mpte->mpte_time_target - time_now) <= 0 &&
1062 mptcp_wifi_quality_for_session(mpte) != MPTCP_WIFI_QUALITY_GOOD) {
1063 /* WiFi is bad and we are below the target - don't remove any subflows */
1064 return;
1065 }
1066
1067 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
1068 const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
1069
1070 if (ifp == NULL || IFNET_IS_CELLULAR(ifp)) {
1071 continue;
1072 }
1073
1074 /* We have a functioning subflow on WiFi. No need for cell! */
1075 if (mpts->mpts_flags & MPTSF_CONNECTED &&
1076 !mptcp_subflow_disconnecting(mpts)) {
1077 mptcp_remove_cell_subflows(mpte);
1078 break;
1079 }
1080 }
1081 }
1082
1083 /*
1084 * Based on the MPTCP Service-type and the state of the subflows, we
1085 * will destroy subflows here.
1086 */
1087 void
mptcp_check_subflows_and_remove(struct mptses * mpte)1088 mptcp_check_subflows_and_remove(struct mptses *mpte)
1089 {
1090 if (!mptcp_ok_to_create_subflows(mpte->mpte_mptcb)) {
1091 return;
1092 }
1093
1094 socket_lock_assert_owned(mptetoso(mpte));
1095
1096 if (mpte->mpte_svctype == MPTCP_SVCTYPE_PURE_HANDOVER) {
1097 mptcp_pure_handover_subflows_remove(mpte);
1098 }
1099
1100 if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER) {
1101 mptcp_handover_subflows_remove(mpte);
1102 }
1103
1104 if (mpte->mpte_svctype == MPTCP_SVCTYPE_TARGET_BASED) {
1105 mptcp_targetbased_subflows_remove(mpte);
1106 }
1107 }
1108
1109 static void
mptcp_remove_subflows(struct mptses * mpte)1110 mptcp_remove_subflows(struct mptses *mpte)
1111 {
1112 struct mptsub *mpts, *tmpts;
1113
1114 if (!mptcp_ok_to_create_subflows(mpte->mpte_mptcb)) {
1115 return;
1116 }
1117
1118 TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
1119 const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
1120 boolean_t found = false;
1121 uint32_t ifindex;
1122 uint32_t i;
1123
1124 if (mpts->mpts_flags & MPTSF_CLOSE_REQD) {
1125 mpts->mpts_flags &= ~MPTSF_CLOSE_REQD;
1126
1127 os_log(mptcp_log_handle, "%s - %lx: itf %u close_reqd last itf %d\n",
1128 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpts->mpts_ifscope,
1129 ifp ? ifp->if_index : -1);
1130 soevent(mpts->mpts_socket,
1131 SO_FILT_HINT_LOCKED | SO_FILT_HINT_NOSRCADDR);
1132
1133 continue;
1134 }
1135
1136 if (ifp == NULL && mpts->mpts_ifscope == IFSCOPE_NONE) {
1137 continue;
1138 }
1139
1140 if (ifp) {
1141 ifindex = ifp->if_index;
1142 } else {
1143 ifindex = mpts->mpts_ifscope;
1144 }
1145
1146 for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
1147 if (mpte->mpte_itfinfo[i].ifindex == IFSCOPE_NONE) {
1148 continue;
1149 }
1150
1151 if (mpte->mpte_itfinfo[i].ifindex == ifindex) {
1152 if (mpts->mpts_dst.sa_family == AF_INET6 &&
1153 (mpte->mpte_itfinfo[i].has_v6_conn || mpte->mpte_itfinfo[i].has_nat64_conn)) {
1154 found = true;
1155 break;
1156 }
1157
1158 if (mpts->mpts_dst.sa_family == AF_INET &&
1159 mpte->mpte_itfinfo[i].has_v4_conn) {
1160 found = true;
1161 break;
1162 }
1163 }
1164 }
1165
1166 if (!found) {
1167 os_log(mptcp_log_handle, "%s - %lx: itf %u killing %#x\n",
1168 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
1169 ifindex, mpts->mpts_flags);
1170
1171 soevent(mpts->mpts_socket,
1172 SO_FILT_HINT_LOCKED | SO_FILT_HINT_NOSRCADDR);
1173 }
1174 }
1175 }
1176
/*
 * Timeout callback, scheduled via mptcp_sched_create_subflows().
 *
 * Walks every MPTCP connection and, for those flagged with
 * MPP_CREATE_SUBFLOWS (and whose interface info is initialized),
 * creates missing subflows and removes stale ones.
 */
static void
mptcp_create_subflows(__unused void *arg)
{
	struct mppcb *mpp;

	/*
	 * Start with clearing, because we might be processing connections
	 * while a new event comes in.
	 */
	if (OSTestAndClear(0x01, &mptcp_create_subflows_scheduled)) {
		os_log_error(mptcp_log_handle, "%s: bit was already cleared!\n", __func__);
	}

	/* Iterate over all MPTCP connections */

	lck_mtx_lock(&mtcbinfo.mppi_lock);

	TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
		struct socket *mp_so = mpp->mpp_socket;
		struct mptses *mpte = mpp->mpp_pcbe;

		socket_lock(mp_so, 1);
		/* Skip connections that are not flagged or not ready yet. */
		if (!(mpp->mpp_flags & MPP_CREATE_SUBFLOWS) ||
		    !(mpte->mpte_flags & MPTE_ITFINFO_INIT)) {
			socket_unlock(mp_so, 1);
			continue;
		}

		/* The scheduler took a use count; it must still be held. */
		VERIFY(mp_so->so_usecount > 0);

		mpp->mpp_flags &= ~MPP_CREATE_SUBFLOWS;

		mptcp_check_subflows_and_add(mpte);
		mptcp_remove_subflows(mpte);

		mp_so->so_usecount--; /* See mptcp_sched_create_subflows */
		socket_unlock(mp_so, 1);
	}

	lck_mtx_unlock(&mtcbinfo.mppi_lock);
}
1218
1219 /*
1220 * We need this because we are coming from an NECP-event. This event gets posted
1221 * while holding NECP-locks. The creation of the subflow however leads us back
1222 * into NECP (e.g., to add the necp_cb and also from tcp_connect).
1223 * So, we would deadlock there as we already hold the NECP-lock.
1224 *
1225 * So, let's schedule this separately. It also gives NECP the chance to make
1226 * progress, without having to wait for MPTCP to finish its subflow creation.
1227 */
1228 void
mptcp_sched_create_subflows(struct mptses * mpte)1229 mptcp_sched_create_subflows(struct mptses *mpte)
1230 {
1231 struct mppcb *mpp = mpte->mpte_mppcb;
1232 struct mptcb *mp_tp = mpte->mpte_mptcb;
1233 struct socket *mp_so = mpp->mpp_socket;
1234
1235 if (!mptcp_ok_to_create_subflows(mp_tp)) {
1236 os_log_debug(mptcp_log_handle, "%s - %lx: not a good time for subflows, state %u flags %#x",
1237 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mp_tp->mpt_state, mp_tp->mpt_flags);
1238 return;
1239 }
1240
1241 if (!(mpp->mpp_flags & MPP_CREATE_SUBFLOWS)) {
1242 mp_so->so_usecount++; /* To prevent it from being free'd in-between */
1243 mpp->mpp_flags |= MPP_CREATE_SUBFLOWS;
1244 }
1245
1246 if (OSTestAndSet(0x01, &mptcp_create_subflows_scheduled)) {
1247 return;
1248 }
1249
1250 /* Do the call in 100ms to allow NECP to schedule it on all sockets */
1251 timeout(mptcp_create_subflows, NULL, hz / 10);
1252 }
1253
/*
 * Allocate an MPTCP socket option structure.
 *
 * Returns a zero-filled entry from mptopt_zone. The allocation may
 * block (Z_WAITOK), so the result is never NULL.
 */
struct mptopt *
mptcp_sopt_alloc(void)
{
	return zalloc_flags(mptopt_zone, Z_WAITOK | Z_ZERO);
}
1262
/*
 * Free an MPTCP socket option structure.
 *
 * The option must have been removed from its session's list first
 * (i.e. MPOF_ATTACHED must be clear; see mptcp_sopt_remove()).
 */
void
mptcp_sopt_free(struct mptopt *mpo)
{
	VERIFY(!(mpo->mpo_flags & MPOF_ATTACHED));

	zfree(mptopt_zone, mpo);
}
1273
/*
 * Add a socket option to the MPTCP socket option list.
 *
 * Marks the option as attached and appends it to the session's list;
 * ownership of 'mpo' passes to the list. The MPTCP socket lock must
 * be held by the caller.
 */
void
mptcp_sopt_insert(struct mptses *mpte, struct mptopt *mpo)
{
	socket_lock_assert_owned(mptetoso(mpte));
	mpo->mpo_flags |= MPOF_ATTACHED;
	TAILQ_INSERT_TAIL(&mpte->mpte_sopts, mpo, mpo_entry);
}
1284
/*
 * Remove a socket option from the MPTCP socket option list.
 *
 * Clears MPOF_ATTACHED and unlinks the option; the caller becomes
 * responsible for freeing it (mptcp_sopt_free()). The MPTCP socket
 * lock must be held by the caller.
 */
void
mptcp_sopt_remove(struct mptses *mpte, struct mptopt *mpo)
{
	socket_lock_assert_owned(mptetoso(mpte));
	VERIFY(mpo->mpo_flags & MPOF_ATTACHED);
	mpo->mpo_flags &= ~MPOF_ATTACHED;
	TAILQ_REMOVE(&mpte->mpte_sopts, mpo, mpo_entry);
}
1296
1297 /*
1298 * Search for an existing <sopt_level,sopt_name> socket option.
1299 */
1300 struct mptopt *
mptcp_sopt_find(struct mptses * mpte,struct sockopt * sopt)1301 mptcp_sopt_find(struct mptses *mpte, struct sockopt *sopt)
1302 {
1303 struct mptopt *mpo;
1304
1305 socket_lock_assert_owned(mptetoso(mpte));
1306
1307 TAILQ_FOREACH(mpo, &mpte->mpte_sopts, mpo_entry) {
1308 if (mpo->mpo_level == sopt->sopt_level &&
1309 mpo->mpo_name == sopt->sopt_name) {
1310 break;
1311 }
1312 }
1313 return mpo;
1314 }
1315
/*
 * Allocate a MPTCP subflow structure.
 *
 * Returns a zero-filled entry from mptsub_zone. The allocation may
 * block (Z_WAITOK), so the result is never NULL.
 */
static struct mptsub *
mptcp_subflow_alloc(void)
{
	return zalloc_flags(mptsub_zone, Z_WAITOK | Z_ZERO);
}
1324
/*
 * Deallocate a subflow structure, called when all of the references held
 * on it have been released. This implies that the subflow has been deleted.
 */
static void
mptcp_subflow_free(struct mptsub *mpts)
{
	/* Must be fully detached from session and socket before freeing. */
	VERIFY(mpts->mpts_refcnt == 0);
	VERIFY(mpts->mpts_mpte == NULL);
	VERIFY(mpts->mpts_socket == NULL);

	/* Release the cached source address, if one was recorded. */
	free_sockaddr(mpts->mpts_src);

	zfree(mptsub_zone, mpts);
}
1340
/*
 * Take a reference on a subflow structure.
 *
 * Panics if the reference count wraps around to zero.
 */
static void
mptcp_subflow_addref(struct mptsub *mpts)
{
	if (++mpts->mpts_refcnt == 0) {
		panic("%s: mpts %p wraparound refcnt", __func__, mpts);
		/* NOTREACHED */
	}
}
1349
1350 static void
mptcp_subflow_remref(struct mptsub * mpts)1351 mptcp_subflow_remref(struct mptsub *mpts)
1352 {
1353 if (mpts->mpts_refcnt == 0) {
1354 panic("%s: mpts %p negative refcnt", __func__, mpts);
1355 /* NOTREACHED */
1356 }
1357 if (--mpts->mpts_refcnt > 0) {
1358 return;
1359 }
1360
1361 /* callee will unlock and destroy lock */
1362 mptcp_subflow_free(mpts);
1363 }
1364
/*
 * Attach a subflow socket to an MPTCP session.
 *
 * Links the subflow's TCP PCB to the MPTCP control block, marks the
 * socket as a subflow, inserts the subflow into the session's list,
 * and takes two references on it: one for the list membership and one
 * for the subflow socket back-pointer (tp->t_mpsub).
 */
static void
mptcp_subflow_attach(struct mptses *mpte, struct mptsub *mpts, struct socket *so)
{
	struct socket *mp_so = mpte->mpte_mppcb->mpp_socket;
	struct tcpcb *tp = sototcpcb(so);

	/*
	 * From this moment on, the subflow is linked to the MPTCP-connection.
	 * Locking,... happens now at the MPTCP-layer
	 */
	tp->t_mptcb = mpte->mpte_mptcb;
	so->so_flags |= SOF_MP_SUBFLOW;
	/* The MPTCP socket holds one use count per attached subflow. */
	mp_so->so_usecount++;

	/*
	 * Insert the subflow into the list, and associate the MPTCP PCB
	 * as well as the subflow socket. From this point on, removing
	 * the subflow needs to be done via mptcp_subflow_del().
	 */
	TAILQ_INSERT_TAIL(&mpte->mpte_subflows, mpts, mpts_entry);
	mpte->mpte_numflows++;

	mpts->mpts_mpte = mpte;
	mpts->mpts_socket = so;
	tp->t_mpsub = mpts;
	mptcp_subflow_addref(mpts); /* for being in MPTCP subflow list */
	mptcp_subflow_addref(mpts); /* for subflow socket */
}
1393
/*
 * NECP callback installed on each subflow's inpcb.
 *
 * Reacts to the subflow's path becoming non-viable (an interface in
 * low-power mode is treated the same way) by flagging the subflow for
 * closure (MPTSF_CLOSE_REQD) and scheduling subflow re-creation.
 */
static void
mptcp_subflow_necp_cb(void *handle, __unused int action,
    __unused uint32_t interface_index,
    uint32_t necp_flags, bool *viable)
{
	boolean_t low_power = !!(necp_flags & NECP_CLIENT_RESULT_FLAG_INTERFACE_LOW_POWER);
	struct inpcb *inp = (struct inpcb *)handle;
	struct socket *so = inp->inp_socket;
	struct mptsub *mpts;
	struct mptses *mpte;

	/* Treat a low-power interface the same as a non-viable path. */
	if (low_power) {
		action = NECP_CLIENT_CBACTION_NONVIABLE;
	}

	/* Only non-viability events are handled here. */
	if (action != NECP_CLIENT_CBACTION_NONVIABLE) {
		return;
	}

	/*
	 * The socket is being garbage-collected. There is nothing to be done
	 * here.
	 */
	if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) == WNT_STOPUSING) {
		return;
	}

	socket_lock(so, 1);

	/* Check again after we acquired the lock. */
	if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) {
		goto out;
	}

	mpte = tptomptp(sototcpcb(so))->mpt_mpte;
	mpts = sototcpcb(so)->t_mpsub;

	os_log_debug(mptcp_log_handle, "%s - %lx: Subflow on itf %u became non-viable, power %u",
	    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpts->mpts_ifscope, low_power);

	/* Request teardown; mptcp_remove_subflows() will act on this flag. */
	mpts->mpts_flags |= MPTSF_CLOSE_REQD;

	mptcp_sched_create_subflows(mpte);

	/*
	 * NOTE(review): for these service types the flow is reported back
	 * as viable — presumably so NECP leaves it alive until MPTCP tears
	 * it down itself; confirm against the NECP caller.
	 */
	if ((mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER ||
	    mpte->mpte_svctype == MPTCP_SVCTYPE_PURE_HANDOVER ||
	    mpte->mpte_svctype == MPTCP_SVCTYPE_TARGET_BASED) &&
	    viable != NULL) {
		*viable = 1;
	}

out:
	socket_unlock(so, 1);
}
1448
1449 /*
1450 * Create an MPTCP subflow socket.
1451 */
1452 static int
mptcp_subflow_socreate(struct mptses * mpte,struct mptsub * mpts,int dom,struct socket ** so)1453 mptcp_subflow_socreate(struct mptses *mpte, struct mptsub *mpts, int dom,
1454 struct socket **so)
1455 {
1456 lck_mtx_t *subflow_mtx;
1457 struct mptopt smpo, *mpo, *tmpo;
1458 struct proc *p;
1459 struct socket *mp_so;
1460 struct mppcb *mpp;
1461 int error;
1462
1463 *so = NULL;
1464
1465 mp_so = mptetoso(mpte);
1466 mpp = mpsotomppcb(mp_so);
1467
1468 p = proc_find(mp_so->last_pid);
1469 if (p == PROC_NULL) {
1470 os_log_error(mptcp_log_handle, "%s - %lx: Couldn't find proc for pid %u\n",
1471 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mp_so->last_pid);
1472
1473 mptcp_subflow_free(mpts);
1474 return ESRCH;
1475 }
1476
1477 /*
1478 * Create the subflow socket (multipath subflow, non-blocking.)
1479 *
1480 * This will cause SOF_MP_SUBFLOW socket flag to be set on the subflow
1481 * socket; it will be cleared when the socket is peeled off or closed.
1482 * It also indicates to the underlying TCP to handle MPTCP options.
1483 * A multipath subflow socket implies SS_NOFDREF state.
1484 */
1485
1486 /*
1487 * Unlock, because tcp_usr_attach ends up in in_pcballoc, which takes
1488 * the ipi-lock. We cannot hold the socket-lock at that point.
1489 */
1490 socket_unlock(mp_so, 0);
1491 error = socreate_internal(dom, so, SOCK_STREAM, IPPROTO_TCP, p,
1492 SOCF_MPTCP, PROC_NULL);
1493 socket_lock(mp_so, 0);
1494 if (error) {
1495 os_log_error(mptcp_log_handle, "%s - %lx: unable to create subflow socket error %d\n",
1496 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error);
1497
1498 proc_rele(p);
1499
1500 mptcp_subflow_free(mpts);
1501 return error;
1502 }
1503
1504 /*
1505 * We need to protect the setting of SOF_MP_SUBFLOW with a lock, because
1506 * this marks the moment of lock-switch from the TCP-lock to the MPTCP-lock.
1507 * Which is why we also need to get the lock with pr_getlock, as after
1508 * setting the flag, socket_unlock will work on the MPTCP-level lock.
1509 */
1510 subflow_mtx = ((*so)->so_proto->pr_getlock)(*so, 0);
1511 lck_mtx_lock(subflow_mtx);
1512
1513 /*
1514 * Must be the first thing we do, to make sure all pointers for this
1515 * subflow are set.
1516 */
1517 mptcp_subflow_attach(mpte, mpts, *so);
1518
1519 /*
1520 * A multipath subflow socket is used internally in the kernel,
1521 * therefore it does not have a file desciptor associated by
1522 * default.
1523 */
1524 (*so)->so_state |= SS_NOFDREF;
1525
1526 lck_mtx_unlock(subflow_mtx);
1527
1528 /* prevent the socket buffers from being compressed */
1529 (*so)->so_rcv.sb_flags |= SB_NOCOMPRESS;
1530 (*so)->so_snd.sb_flags |= SB_NOCOMPRESS;
1531
1532 /* Inherit preconnect and TFO data flags */
1533 if (mp_so->so_flags1 & SOF1_PRECONNECT_DATA) {
1534 (*so)->so_flags1 |= SOF1_PRECONNECT_DATA;
1535 }
1536 if (mp_so->so_flags1 & SOF1_DATA_IDEMPOTENT) {
1537 (*so)->so_flags1 |= SOF1_DATA_IDEMPOTENT;
1538 }
1539 if (mp_so->so_flags1 & SOF1_DATA_AUTHENTICATED) {
1540 (*so)->so_flags1 |= SOF1_DATA_AUTHENTICATED;
1541 }
1542
1543 /* Inherit uuid and create the related flow. */
1544 if (!uuid_is_null(mpp->necp_client_uuid)) {
1545 struct mptcb *mp_tp = mpte->mpte_mptcb;
1546
1547 sotoinpcb(*so)->necp_cb = mptcp_subflow_necp_cb;
1548
1549 /*
1550 * A note on the unlock: With MPTCP, we do multiple times a
1551 * necp_client_register_socket_flow. This is problematic,
1552 * because now the lock-ordering guarantee (first necp-locks,
1553 * then socket-locks) is no more respected. So, we need to
1554 * unlock here.
1555 */
1556 socket_unlock(mp_so, 0);
1557 error = necp_client_register_socket_flow(mp_so->last_pid,
1558 mpp->necp_client_uuid, sotoinpcb(*so));
1559 socket_lock(mp_so, 0);
1560
1561 if (error) {
1562 os_log_error(mptcp_log_handle, "%s - %lx: necp_client_register_socket_flow failed with error %d\n",
1563 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error);
1564
1565 goto out_err;
1566 }
1567
1568 /* Possible state-change during the unlock above */
1569 if (mp_tp->mpt_state >= MPTCPS_TIME_WAIT ||
1570 (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP)) {
1571 os_log_error(mptcp_log_handle, "%s - %lx: state changed during unlock: %u flags %#x\n",
1572 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
1573 mp_tp->mpt_state, mp_tp->mpt_flags);
1574
1575 error = EINVAL;
1576 goto out_err;
1577 }
1578
1579 uuid_copy(sotoinpcb(*so)->necp_client_uuid, mpp->necp_client_uuid);
1580 }
1581
1582 if (mpp->inp_necp_attributes.inp_domain != NULL) {
1583 char *buffer = NULL;
1584 size_t string_size = strlen(mpp->inp_necp_attributes.inp_domain);
1585 buffer = kalloc_data(string_size + 1, Z_WAITOK | Z_ZERO);
1586 if (buffer != NULL) {
1587 sotoinpcb(*so)->inp_necp_attributes.inp_domain = strlcpy_ret(buffer, mpp->inp_necp_attributes.inp_domain, string_size + 1);
1588 } else {
1589 sotoinpcb(*so)->inp_necp_attributes.inp_domain = NULL;
1590 }
1591 }
1592 if (mpp->inp_necp_attributes.inp_account != NULL) {
1593 char *buffer = NULL;
1594 size_t string_size = strlen(mpp->inp_necp_attributes.inp_account);
1595 buffer = kalloc_data(string_size + 1, Z_WAITOK | Z_ZERO);
1596 if (buffer != NULL) {
1597 sotoinpcb(*so)->inp_necp_attributes.inp_account = strlcpy_ret(buffer, mpp->inp_necp_attributes.inp_account, string_size + 1);
1598 } else {
1599 sotoinpcb(*so)->inp_necp_attributes.inp_account = NULL;
1600 }
1601 }
1602
1603 if (mpp->inp_necp_attributes.inp_domain_owner != NULL) {
1604 char *buffer = NULL;
1605 size_t string_size = strlen(mpp->inp_necp_attributes.inp_domain_owner);
1606 buffer = kalloc_data(string_size + 1, Z_WAITOK | Z_ZERO);
1607 if (buffer != NULL) {
1608 sotoinpcb(*so)->inp_necp_attributes.inp_domain_owner = strlcpy_ret(buffer, mpp->inp_necp_attributes.inp_domain_owner, string_size + 1);
1609 } else {
1610 sotoinpcb(*so)->inp_necp_attributes.inp_domain_owner = NULL;
1611 }
1612 }
1613
1614 if (mpp->inp_necp_attributes.inp_tracker_domain != NULL) {
1615 char *buffer = NULL;
1616 size_t string_size = strlen(mpp->inp_necp_attributes.inp_tracker_domain);
1617 buffer = kalloc_data(string_size + 1, Z_WAITOK | Z_ZERO);
1618 if (buffer != NULL) {
1619 sotoinpcb(*so)->inp_necp_attributes.inp_tracker_domain = strlcpy_ret(buffer, mpp->inp_necp_attributes.inp_tracker_domain, string_size + 1);
1620 } else {
1621 sotoinpcb(*so)->inp_necp_attributes.inp_tracker_domain = NULL;
1622 }
1623 }
1624
1625 /* Needs to happen prior to the delegation! */
1626 (*so)->last_pid = mp_so->last_pid;
1627
1628 if (mp_so->so_flags & SOF_DELEGATED) {
1629 if (mpte->mpte_epid) {
1630 error = so_set_effective_pid(*so, mpte->mpte_epid, p, false);
1631 if (error) {
1632 os_log_error(mptcp_log_handle, "%s - %lx: so_set_effective_pid failed with error %d\n",
1633 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error);
1634 goto out_err;
1635 }
1636 }
1637 if (!uuid_is_null(mpte->mpte_euuid)) {
1638 error = so_set_effective_uuid(*so, mpte->mpte_euuid, p, false);
1639 if (error) {
1640 os_log_error(mptcp_log_handle, "%s - %lx: so_set_effective_uuid failed with error %d\n",
1641 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error);
1642 goto out_err;
1643 }
1644 }
1645 }
1646
1647 /* inherit the other socket options */
1648 bzero(&smpo, sizeof(smpo));
1649 smpo.mpo_flags |= MPOF_SUBFLOW_OK;
1650 smpo.mpo_level = SOL_SOCKET;
1651 smpo.mpo_intval = 1;
1652
1653 /* disable SIGPIPE */
1654 smpo.mpo_name = SO_NOSIGPIPE;
1655 if ((error = mptcp_subflow_sosetopt(mpte, mpts, &smpo)) != 0) {
1656 goto out_err;
1657 }
1658
1659 /* find out if the subflow's source address goes away */
1660 smpo.mpo_name = SO_NOADDRERR;
1661 if ((error = mptcp_subflow_sosetopt(mpte, mpts, &smpo)) != 0) {
1662 goto out_err;
1663 }
1664
1665 if (mpte->mpte_mptcb->mpt_state >= MPTCPS_ESTABLISHED) {
1666 /*
1667 * On secondary subflows we might need to set the cell-fallback
1668 * flag (see conditions in mptcp_subflow_sosetopt).
1669 */
1670 smpo.mpo_level = SOL_SOCKET;
1671 smpo.mpo_name = SO_MARK_CELLFALLBACK;
1672 smpo.mpo_intval = 1;
1673 if ((error = mptcp_subflow_sosetopt(mpte, mpts, &smpo)) != 0) {
1674 goto out_err;
1675 }
1676 }
1677
1678 /* replay setsockopt(2) on the subflow sockets for eligible options */
1679 TAILQ_FOREACH_SAFE(mpo, &mpte->mpte_sopts, mpo_entry, tmpo) {
1680 int interim;
1681
1682 if (!(mpo->mpo_flags & MPOF_SUBFLOW_OK)) {
1683 continue;
1684 }
1685
1686 /*
1687 * Skip those that are handled internally; these options
1688 * should not have been recorded and marked with the
1689 * MPOF_SUBFLOW_OK by mptcp_setopt(), but just in case.
1690 */
1691 if (mpo->mpo_level == SOL_SOCKET &&
1692 (mpo->mpo_name == SO_NOSIGPIPE ||
1693 mpo->mpo_name == SO_NOADDRERR ||
1694 mpo->mpo_name == SO_KEEPALIVE)) {
1695 continue;
1696 }
1697
1698 interim = (mpo->mpo_flags & MPOF_INTERIM);
1699 if (mptcp_subflow_sosetopt(mpte, mpts, mpo) != 0 && interim) {
1700 os_log_error(mptcp_log_handle, "%s - %lx: sopt %s val %d interim record removed\n",
1701 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
1702 mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name),
1703 mpo->mpo_intval);
1704 mptcp_sopt_remove(mpte, mpo);
1705 mptcp_sopt_free(mpo);
1706 continue;
1707 }
1708 }
1709
1710 /*
1711 * We need to receive everything that the subflow socket has,
1712 * so use a customized socket receive function. We will undo
1713 * this when the socket is peeled off or closed.
1714 */
1715 switch (dom) {
1716 case PF_INET:
1717 (*so)->so_proto = &mptcp_subflow_protosw;
1718 break;
1719 case PF_INET6:
1720 (*so)->so_proto = (struct protosw *)&mptcp_subflow_protosw6;
1721 break;
1722 default:
1723 VERIFY(0);
1724 /* NOTREACHED */
1725 }
1726
1727 proc_rele(p);
1728
1729 DTRACE_MPTCP3(subflow__create, struct mptses *, mpte,
1730 int, dom, int, error);
1731
1732 return 0;
1733
1734 out_err:
1735 mptcp_subflow_abort(mpts, error);
1736
1737 proc_rele(p);
1738
1739 return error;
1740 }
1741
1742 /*
1743 * Close an MPTCP subflow socket.
1744 *
1745 * Note that this may be called on an embryonic subflow, and the only
1746 * thing that is guaranteed valid is the protocol-user request.
1747 */
1748 static void
mptcp_subflow_soclose(struct mptsub * mpts)1749 mptcp_subflow_soclose(struct mptsub *mpts)
1750 {
1751 struct socket *so = mpts->mpts_socket;
1752
1753 if (mpts->mpts_flags & MPTSF_CLOSED) {
1754 return;
1755 }
1756
1757 VERIFY(so != NULL);
1758 VERIFY(so->so_flags & SOF_MP_SUBFLOW);
1759 VERIFY((so->so_state & (SS_NBIO | SS_NOFDREF)) == (SS_NBIO | SS_NOFDREF));
1760
1761 DTRACE_MPTCP5(subflow__close, struct mptsub *, mpts,
1762 struct socket *, so,
1763 struct sockbuf *, &so->so_rcv,
1764 struct sockbuf *, &so->so_snd,
1765 struct mptses *, mpts->mpts_mpte);
1766
1767 mpts->mpts_flags |= MPTSF_CLOSED;
1768
1769 if (so->so_retaincnt == 0) {
1770 soclose_locked(so);
1771
1772 return;
1773 } else {
1774 VERIFY(so->so_usecount > 0);
1775 so->so_usecount--;
1776 }
1777
1778 return;
1779 }
1780
1781 static void
mptcp_attach_to_subf(struct socket * so,struct mptcb * mp_tp,uint8_t addr_id)1782 mptcp_attach_to_subf(struct socket *so, struct mptcb *mp_tp, uint8_t addr_id)
1783 {
1784 struct tcpcb *tp = sototcpcb(so);
1785 struct mptcp_subf_auth_entry *sauth_entry;
1786
1787 /*
1788 * The address ID of the first flow is implicitly 0.
1789 */
1790 if (mp_tp->mpt_state == MPTCPS_CLOSED) {
1791 tp->t_local_aid = 0;
1792 } else {
1793 tp->t_local_aid = addr_id;
1794 tp->t_mpflags |= (TMPF_PREESTABLISHED | TMPF_JOINED_FLOW);
1795 so->so_flags |= SOF_MP_SEC_SUBFLOW;
1796 }
1797 sauth_entry = zalloc(mpt_subauth_zone);
1798 sauth_entry->msae_laddr_id = tp->t_local_aid;
1799 sauth_entry->msae_raddr_id = 0;
1800 sauth_entry->msae_raddr_rand = 0;
1801 try_again:
1802 sauth_entry->msae_laddr_rand = RandomULong();
1803 if (sauth_entry->msae_laddr_rand == 0) {
1804 goto try_again;
1805 }
1806 LIST_INSERT_HEAD(&mp_tp->mpt_subauth_list, sauth_entry, msae_next);
1807 }
1808
1809 static void
mptcp_detach_mptcb_from_subf(struct mptcb * mp_tp,struct socket * so)1810 mptcp_detach_mptcb_from_subf(struct mptcb *mp_tp, struct socket *so)
1811 {
1812 struct mptcp_subf_auth_entry *sauth_entry;
1813 struct tcpcb *tp = NULL;
1814 int found = 0;
1815
1816 tp = sototcpcb(so);
1817 if (tp == NULL) {
1818 return;
1819 }
1820
1821 LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
1822 if (sauth_entry->msae_laddr_id == tp->t_local_aid) {
1823 found = 1;
1824 break;
1825 }
1826 }
1827 if (found) {
1828 LIST_REMOVE(sauth_entry, msae_next);
1829 }
1830
1831 if (found) {
1832 zfree(mpt_subauth_zone, sauth_entry);
1833 }
1834 }
1835
/*
 * Connect an MPTCP subflow socket.
 *
 * Note that in the pending connect case, the subflow socket may have been
 * bound to an interface and/or a source IP address which may no longer be
 * around by the time this routine is called; in that case the connect attempt
 * will most likely fail.
 *
 * Returns 0 on success, or an errno (ESRCH if the owning process has
 * gone away, otherwise whatever soconnectxlocked() reports).
 */
static int
mptcp_subflow_soconnectx(struct mptses *mpte, struct mptsub *mpts)
{
	char dbuf[MAX_IPv6_STR_LEN];
	struct socket *mp_so, *so;
	struct mptcb *mp_tp;
	struct sockaddr *dst;
	struct proc *p;
	int af, error, dport;

	mp_so = mptetoso(mpte);
	mp_tp = mpte->mpte_mptcb;
	so = mpts->mpts_socket;
	af = mpts->mpts_dst.sa_family;
	dst = &mpts->mpts_dst;

	VERIFY((mpts->mpts_flags & (MPTSF_CONNECTING | MPTSF_CONNECTED)) == MPTSF_CONNECTING);
	VERIFY(mpts->mpts_socket != NULL);
	VERIFY(af == AF_INET || af == AF_INET6);

	/* Render the destination address/port for the log line below. */
	if (af == AF_INET) {
		inet_ntop(af, &SIN(dst)->sin_addr.s_addr, dbuf, sizeof(dbuf));
		dport = ntohs(SIN(dst)->sin_port);
	} else {
		inet_ntop(af, &SIN6(dst)->sin6_addr, dbuf, sizeof(dbuf));
		dport = ntohs(SIN6(dst)->sin6_port);
	}

	os_log(mptcp_log_handle,
	    "%s - %lx: ifindex %u dst %s:%d pended %u\n", __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
	    mpts->mpts_ifscope, dbuf, dport, !!(mpts->mpts_flags & MPTSF_CONNECT_PENDING));

	/* The connect is attributed to the process owning the MPTCP socket. */
	p = proc_find(mp_so->last_pid);
	if (p == PROC_NULL) {
		os_log_error(mptcp_log_handle, "%s - %lx: Couldn't find proc for pid %u\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mp_so->last_pid);

		return ESRCH;
	}

	mpts->mpts_flags &= ~MPTSF_CONNECT_PENDING;

	/* Assign the subflow's local address ID and auth entry. */
	mptcp_attach_to_subf(so, mpte->mpte_mptcb, mpte->mpte_addrid_last);

	/* connect the subflow socket */
	error = soconnectxlocked(so, mpts->mpts_src, &mpts->mpts_dst,
	    p, mpts->mpts_ifscope,
	    mpte->mpte_associd, NULL, 0, NULL, 0, NULL, NULL);

	/* Record the subflow's initial send sequence number. */
	mpts->mpts_iss = sototcpcb(so)->iss;

	/* See tcp_connect_complete */
	if (mp_tp->mpt_state < MPTCPS_ESTABLISHED &&
	    (mp_so->so_flags1 & SOF1_PRECONNECT_DATA)) {
		mp_tp->mpt_sndwnd = sototcpcb(so)->snd_wnd;
	}

	/* Allocate a unique address id per subflow */
	mpte->mpte_addrid_last++;
	if (mpte->mpte_addrid_last == 0) {
		/* ID 0 is reserved for the first subflow; skip it on wrap. */
		mpte->mpte_addrid_last++;
	}

	proc_rele(p);

	DTRACE_MPTCP3(subflow__connect, struct mptses *, mpte,
	    struct mptsub *, mpts, int, error);
	if (error) {
		os_log_error(mptcp_log_handle, "%s - %lx: connectx failed with error %d ifscope %u\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error, mpts->mpts_ifscope);
	}

	return error;
}
1918
/*
 * Adjust the MPTCP receive mapping on an mbuf received on a subflow.
 *
 * Validates that a mapping re-announced for an already-stamped mbuf is
 * consistent, splits the mbuf when it extends past the right edge of
 * the mapping (dlen minus the DATA_FIN octet), and stamps the mbuf's
 * packet header with the DSN/subflow-sequence/length/DATA_FIN info.
 *
 * Returns 0 on success; -1 after raising SO_FILT_HINT_MUSTRST on the
 * subflow (inconsistent second mapping, or m_split() failure).
 */
static int
mptcp_adj_rmap(struct socket *so, struct mbuf *m, int off, uint64_t dsn,
    uint32_t rseq, uint16_t dlen, uint8_t dfin)
{
	struct mptsub *mpts = sototcpcb(so)->t_mpsub;

	/* Empty or header-less mbufs carry no mapping to adjust. */
	if (m_pktlen(m) == 0) {
		return 0;
	}

	if (!(m->m_flags & M_PKTHDR)) {
		return 0;
	}

	if (m->m_pkthdr.pkt_flags & PKTF_MPTCP) {
		/*
		 * Already stamped: with a non-zero offset, the newly
		 * announced mapping must match the recorded one exactly.
		 */
		if (off && (dsn != m->m_pkthdr.mp_dsn ||
		    rseq != m->m_pkthdr.mp_rseq ||
		    dlen != m->m_pkthdr.mp_rlen ||
		    dfin != !!(m->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN))) {
			os_log_error(mptcp_log_handle, "%s - %lx: Received incorrect second mapping: DSN: %u - %u , SSN: %u - %u, DLEN: %u - %u, DFIN: %u - %u\n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpts->mpts_mpte),
			    (uint32_t)dsn, (uint32_t)m->m_pkthdr.mp_dsn,
			    rseq, m->m_pkthdr.mp_rseq,
			    dlen, m->m_pkthdr.mp_rlen,
			    dfin, !!(m->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN));

			soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
			return -1;
		}
	}

	/* If mbuf is beyond right edge of the mapping, we need to split */
	if (m_pktlen(m) > dlen - dfin - off) {
		struct mbuf *new = m_split(m, dlen - dfin - off, M_DONTWAIT);
		if (new == NULL) {
			os_log_error(mptcp_log_handle, "%s - %lx: m_split failed dlen %u dfin %u off %d pktlen %d, killing subflow %d",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpts->mpts_mpte),
			    dlen, dfin, off, m_pktlen(m),
			    mpts->mpts_connid);

			soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
			return -1;
		}

		/* Re-link the split-off tail into the receive buffer. */
		m->m_next = new;
		sballoc(&so->so_rcv, new);
		/* Undo, as sballoc will add to it as well */
		so->so_rcv.sb_cc -= new->m_len;

		if (so->so_rcv.sb_mbtail == m) {
			so->so_rcv.sb_mbtail = new;
		}
	}

	/* Stamp the (possibly shortened) mbuf with its mapping. */
	m->m_pkthdr.pkt_flags |= PKTF_MPTCP;
	m->m_pkthdr.mp_dsn = dsn + off;
	m->m_pkthdr.mp_rseq = rseq + off;
	VERIFY(m_pktlen(m) < UINT16_MAX);
	m->m_pkthdr.mp_rlen = (uint16_t)m_pktlen(m);

	/* Only put the DATA_FIN-flag on the last mbuf of this mapping */
	if (dfin) {
		if (m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen < dsn + dlen - dfin) {
			m->m_pkthdr.pkt_flags &= ~PKTF_MPTCP_DFIN;
		} else {
			m->m_pkthdr.pkt_flags |= PKTF_MPTCP_DFIN;
		}
	}


	mpts->mpts_flags |= MPTSF_FULLY_ESTABLISHED;

	return 0;
}
1993
1994 /*
1995 * Update the pid, upid, uuid of the subflow so, based on parent so
1996 */
1997 static void
mptcp_update_last_owner(struct socket * so,struct socket * mp_so)1998 mptcp_update_last_owner(struct socket *so, struct socket *mp_so)
1999 {
2000 if (so->last_pid != mp_so->last_pid ||
2001 so->last_upid != mp_so->last_upid) {
2002 so->last_upid = mp_so->last_upid;
2003 so->last_pid = mp_so->last_pid;
2004 uuid_copy(so->last_uuid, mp_so->last_uuid);
2005 }
2006 so_update_policy(so);
2007 }
2008
/*
 * MPTCP subflow socket receive routine, derived from soreceive().
 *
 * Drains the subflow's receive buffer and hands complete DSS mappings up
 * to the MPTCP layer through *mp0.  The caller must pass mp0 != NULL and
 * controlp == NULL; uio is unused.  This routine never blocks: it returns
 * EWOULDBLOCK when a mapping is not yet fully present, and EIO when the
 * subflow must be killed (missing DSS option, >64KB without a mapping,
 * or checksum failure).
 */
static int
mptcp_subflow_soreceive(struct socket *so, struct sockaddr **psa,
    struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{
#pragma unused(uio)
	struct socket *mp_so;
	struct mptses *mpte;
	struct mptcb *mp_tp;
	int flags, error = 0;
	struct mbuf *m, **mp = mp0;
	struct tcpcb *tp = sototcpcb(so);

	mpte = tptomptp(sototcpcb(so))->mpt_mpte;
	mp_so = mptetoso(mpte);
	mp_tp = mpte->mpte_mptcb;

	VERIFY(so->so_proto->pr_flags & PR_CONNREQUIRED);

#ifdef MORE_LOCKING_DEBUG
	if (so->so_usecount == 1) {
		panic("%s: so=%x no other reference on socket", __func__, so);
		/* NOTREACHED */
	}
#endif
	/*
	 * We return all that is there in the subflow's socket receive buffer
	 * to the MPTCP layer, so we require that the caller passes in the
	 * expected parameters.
	 */
	if (mp == NULL || controlp != NULL) {
		return EINVAL;
	}

	*mp = NULL;
	if (psa != NULL) {
		*psa = NULL;
	}
	if (flagsp != NULL) {
		flags = *flagsp & ~MSG_EOR;
	} else {
		flags = 0;
	}

	/* None of these receive modes make sense for subflow draining */
	if (flags & (MSG_PEEK | MSG_OOB | MSG_NEEDSA | MSG_WAITALL | MSG_WAITSTREAM)) {
		return EOPNOTSUPP;
	}

	flags |= (MSG_DONTWAIT | MSG_NBIO);

	/*
	 * If a recv attempt is made on a previously-accepted socket
	 * that has been marked as inactive (disconnected), reject
	 * the request.
	 */
	if (so->so_flags & SOF_DEFUNCT) {
		struct sockbuf *sb = &so->so_rcv;

		error = ENOTCONN;
		/*
		 * This socket should have been disconnected and flushed
		 * prior to being returned from sodefunct(); there should
		 * be no data on its receive list, so panic otherwise.
		 */
		if (so->so_state & SS_DEFUNCT) {
			sb_empty_assert(sb, __func__);
		}
		return error;
	}

	/*
	 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
	 * and if so just return to the caller. This could happen when
	 * soreceive() is called by a socket upcall function during the
	 * time the socket is freed. The socket buffer would have been
	 * locked across the upcall, therefore we cannot put this thread
	 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
	 * we may livelock), because the lock on the socket buffer will
	 * only be released when the upcall routine returns to its caller.
	 * Because the socket has been officially closed, there can be
	 * no further read on it.
	 *
	 * A multipath subflow socket would have its SS_NOFDREF set by
	 * default, so check for SOF_MP_SUBFLOW socket flag; when the
	 * socket is closed for real, SOF_MP_SUBFLOW would be cleared.
	 */
	if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
	    (SS_NOFDREF | SS_CANTRCVMORE) && !(so->so_flags & SOF_MP_SUBFLOW)) {
		return 0;
	}

	/*
	 * For consistency with soreceive() semantics, we need to obey
	 * SB_LOCK in case some other code path has locked the buffer.
	 */
	error = sblock(&so->so_rcv, 0);
	if (error != 0) {
		return error;
	}

	m = so->so_rcv.sb_mb;
	if (m == NULL) {
		/*
		 * Panic if we notice inconsistencies in the socket's
		 * receive list; both sb_mb and sb_cc should correctly
		 * reflect the contents of the list, otherwise we may
		 * end up with false positives during select() or poll()
		 * which could put the application in a bad state.
		 */
		SB_MB_CHECK(&so->so_rcv);

		if (so->so_error != 0) {
			error = so->so_error;
			so->so_error = 0;
			goto release;
		}

		if (so->so_state & SS_CANTRCVMORE) {
			goto release;
		}

		if (!(so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING))) {
			error = ENOTCONN;
			goto release;
		}

		/*
		 * MSG_DONTWAIT is implicitly defined and this routine will
		 * never block, so return EWOULDBLOCK when there is nothing.
		 */
		error = EWOULDBLOCK;
		goto release;
	}

	mptcp_update_last_owner(so, mp_so);

	SBLASTRECORDCHK(&so->so_rcv, "mptcp_subflow_soreceive 1");
	SBLASTMBUFCHK(&so->so_rcv, "mptcp_subflow_soreceive 1");

	/* Walk the receive buffer, handing up one DSS mapping per iteration */
	while (m != NULL) {
		int dlen = 0, error_out = 0, off = 0;
		uint8_t dfin = 0;
		struct mbuf *start = m;
		uint64_t dsn;
		uint32_t sseq;
		uint16_t orig_dlen;
		uint16_t csum;

		VERIFY(m->m_nextpkt == NULL);

		if (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) {
fallback:
			/* Just move mbuf to MPTCP-level */

			sbfree(&so->so_rcv, m);

			if (mp != NULL) {
				*mp = m;
				mp = &m->m_next;
				so->so_rcv.sb_mb = m = m->m_next;
				*mp = NULL;
			}

			if (m != NULL) {
				so->so_rcv.sb_lastrecord = m;
			} else {
				SB_EMPTY_FIXUP(&so->so_rcv);
			}

			continue;
		} else if (!(m->m_flags & M_PKTHDR) || !(m->m_pkthdr.pkt_flags & PKTF_MPTCP)) {
			struct mptsub *mpts = sototcpcb(so)->t_mpsub;
			boolean_t found_mapping = false;
			int parsed_length = 0;
			struct mbuf *m_iter;

			/*
			 * No MPTCP-option in the header. Either fallback or
			 * wait for additional mappings.
			 */
			if (!(mpts->mpts_flags & MPTSF_FULLY_ESTABLISHED)) {
				/* data arrived without a DSS option mapping */

				/* initial subflow can fallback right after SYN handshake */
				if (mpts->mpts_flags & MPTSF_INITIAL_SUB) {
					mptcp_notify_mpfail(so);

					goto fallback;
				} else {
					os_log_error(mptcp_log_handle, "%s - %lx: No DSS on secondary subflow. Killing %d\n",
					    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
					    mpts->mpts_connid);
					soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);

					error = EIO;
					*mp0 = NULL;
					goto release;
				}
			}

			/* Thus, let's look for an mbuf with the mapping */
			m_iter = m->m_next;
			parsed_length = m->m_len;
			while (m_iter != NULL && parsed_length < UINT16_MAX) {
				if (!(m_iter->m_flags & M_PKTHDR) || !(m_iter->m_pkthdr.pkt_flags & PKTF_MPTCP)) {
					parsed_length += m_iter->m_len;
					m_iter = m_iter->m_next;
					continue;
				}

				found_mapping = true;

				/* Found an mbuf with a DSS-mapping */
				orig_dlen = dlen = m_iter->m_pkthdr.mp_rlen;
				dsn = m_iter->m_pkthdr.mp_dsn;
				sseq = m_iter->m_pkthdr.mp_rseq;
				csum = m_iter->m_pkthdr.mp_csum;

				/* DATA_FIN consumes one byte of data-sequence space */
				if (m_iter->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN) {
					dfin = 1;
					dlen--;
				}

				break;
			}

			if (!found_mapping && parsed_length < UINT16_MAX) {
				/* Mapping not yet present, we can wait! */
				if (*mp0 == NULL) {
					error = EWOULDBLOCK;
				}
				goto release;
			} else if (!found_mapping && parsed_length >= UINT16_MAX) {
				os_log_error(mptcp_log_handle, "%s - %lx: Received more than 64KB without DSS mapping. Killing %d\n",
				    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
				    mpts->mpts_connid);
				/* Received 64KB without DSS-mapping. We should kill the subflow */
				soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);

				error = EIO;
				*mp0 = NULL;
				goto release;
			}
		} else {
			/* Mapping is carried on this mbuf itself */
			orig_dlen = dlen = m->m_pkthdr.mp_rlen;
			dsn = m->m_pkthdr.mp_dsn;
			sseq = m->m_pkthdr.mp_rseq;
			csum = m->m_pkthdr.mp_csum;

			if (m->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN) {
				dfin = 1;
				dlen--;
			}
		}

		/* Now, see if we need to remove previous packets */
		if (SEQ_GT(sseq + tp->irs, tp->rcv_nxt - so->so_rcv.sb_cc)) {
			/* Ok, there is data in there that we don't need - let's throw it away! */
			int totrim = (int)sseq + tp->irs - (tp->rcv_nxt - so->so_rcv.sb_cc);

			sbdrop(&so->so_rcv, totrim);

			m = so->so_rcv.sb_mb;
		}

		/*
		 * Check if the full mapping is now present
		 */
		if ((int)so->so_rcv.sb_cc < dlen) {
			if (*mp0 == NULL) {
				error = EWOULDBLOCK;
			}
			goto release;
		}

		/* Now, get the full mapping */
		off = 0;
		while (dlen > 0) {
			/* Stamp per-mbuf DSS metadata; splits the tail mbuf if needed */
			if (mptcp_adj_rmap(so, m, off, dsn, sseq, orig_dlen, dfin)) {
				error_out = 1;
				error = EIO;
				dlen = 0;
				*mp0 = NULL;
				break;
			}

			dlen -= m->m_len;
			off += m->m_len;
			sbfree(&so->so_rcv, m);

			if (mp != NULL) {
				*mp = m;
				mp = &m->m_next;
				so->so_rcv.sb_mb = m = m->m_next;
				*mp = NULL;
			}

			ASSERT(dlen == 0 || m);
			if (dlen != 0 && m == NULL) {
				/* "try" to gracefully recover on customer builds */
				error_out = 1;
				error = EIO;
				dlen = 0;

				*mp0 = NULL;

				SB_EMPTY_FIXUP(&so->so_rcv);
				soevent(so, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);

				break;
			}
		}

		ASSERT(dlen == 0);
		if (dlen != 0) {
			/* "try" to gracefully recover on customer builds */
			error_out = 1;
			error = EIO;
			dlen = 0;

			*mp0 = NULL;

			SB_EMPTY_FIXUP(&so->so_rcv);
			soevent(so, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
		}

		if (m != NULL) {
			so->so_rcv.sb_lastrecord = m;
		} else {
			SB_EMPTY_FIXUP(&so->so_rcv);
		}

		if (error_out) {
			goto release;
		}

		/* Verify the DSS checksum over the mapping we just collected */
		if (mptcp_validate_csum(sototcpcb(so), start, dsn, sseq, orig_dlen, csum, dfin)) {
			error = EIO;
			*mp0 = NULL;
			goto release;
		}

		SBLASTRECORDCHK(&so->so_rcv, "mptcp_subflow_soreceive 2");
		SBLASTMBUFCHK(&so->so_rcv, "mptcp_subflow_soreceive 2");
	}

	DTRACE_MPTCP3(subflow__receive, struct socket *, so,
	    struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd);

	if (flagsp != NULL) {
		*flagsp |= flags;
	}

release:
	sbunlock(&so->so_rcv, TRUE);

	return error;
}
2369
/*
 * MPTCP subflow socket send routine, derived from sosend().
 *
 * Pushes a single PKTF_MPTCP-tagged mbuf chain ('top') down to the
 * subflow's TCP via pru_send.  addr/uio/control must be NULL and flags
 * must be 0; 'top' is always consumed (sent or freed) on return.
 * Temporarily switches to the MPTCP socket's owning process for
 * accounting if it differs from the current one.
 */
static int
mptcp_subflow_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags)
{
	struct socket *mp_so = mptetoso(tptomptp(sototcpcb(so))->mpt_mpte);
	boolean_t en_tracing = FALSE, proc_held = FALSE;
	struct proc *p = current_proc();
	int en_tracing_val;
	int sblocked = 1;        /* Pretend as if it is already locked, so we won't relock it */
	int error;

	VERIFY(control == NULL);
	VERIFY(addr == NULL);
	VERIFY(uio == NULL);
	VERIFY(flags == 0);
	VERIFY((so->so_flags & SOF_CONTENT_FILTER) == 0);

	VERIFY(top->m_pkthdr.len > 0 && top->m_pkthdr.len <= UINT16_MAX);
	VERIFY(top->m_pkthdr.pkt_flags & PKTF_MPTCP);

	/*
	 * trace if tracing & network (vs. unix) sockets & and
	 * non-loopback
	 */
	if (ENTR_SHOULDTRACE &&
	    (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
		struct inpcb *inp = sotoinpcb(so);
		if (inp->inp_last_outifp != NULL &&
		    !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
			en_tracing = TRUE;
			en_tracing_val = top->m_pkthdr.len;
			KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_START,
			    (unsigned long)VM_KERNEL_ADDRPERM(so),
			    ((so->so_state & SS_NBIO) ? kEnTrFlagNonBlocking : 0),
			    (int64_t)en_tracing_val);
		}
	}

	mptcp_update_last_owner(so, mp_so);

	/* Attribute the send to the MPTCP socket's owner, not the caller */
	if (mp_so->last_pid != proc_pid(p)) {
		p = proc_find(mp_so->last_pid);
		if (p == PROC_NULL) {
			p = current_proc();
		} else {
			proc_held = TRUE;
		}
	}

#if NECP
	inp_update_necp_policy(sotoinpcb(so), NULL, NULL, 0);
#endif /* NECP */

	error = sosendcheck(so, NULL, top->m_pkthdr.len, 0, 1, 0, &sblocked);
	if (error) {
		goto out;
	}

	/* pru_send consumes 'top'; EJUSTRETURN means "taken, nothing sent yet" */
	error = (*so->so_proto->pr_usrreqs->pru_send)(so, 0, top, NULL, NULL, p);
	if (error == EJUSTRETURN) {
		error = 0;
	}
	top = NULL;

out:
	if (top != NULL) {
		m_freem(top);
	}

	if (proc_held) {
		proc_rele(p);
	}

	soclearfastopen(so);

	if (en_tracing) {
		KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_END,
		    (unsigned long)VM_KERNEL_ADDRPERM(so),
		    ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
		    (int64_t)en_tracing_val);
	}

	return error;
}
2457
2458 /*
2459 * Subflow socket write upcall.
2460 *
2461 * Called when the associated subflow socket posted a read event.
2462 */
2463 static void
mptcp_subflow_wupcall(struct socket * so,void * arg,int waitf)2464 mptcp_subflow_wupcall(struct socket *so, void *arg, int waitf)
2465 {
2466 #pragma unused(so, waitf)
2467 struct mptsub *mpts __single = arg;
2468 struct mptses *mpte = mpts->mpts_mpte;
2469
2470 VERIFY(mpte != NULL);
2471
2472 if (mptcp_should_defer_upcall(mpte->mpte_mppcb)) {
2473 if (!(mpte->mpte_mppcb->mpp_flags & MPP_WUPCALL)) {
2474 mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_WWAKEUP;
2475 }
2476 return;
2477 }
2478
2479 mptcp_output(mpte);
2480 }
2481
2482 /*
2483 * Subflow socket control event upcall.
2484 */
2485 static void
mptcp_subflow_eupcall1(struct socket * so,void * arg,uint32_t events)2486 mptcp_subflow_eupcall1(struct socket *so, void *arg, uint32_t events)
2487 {
2488 #pragma unused(so)
2489 struct mptsub *mpts __single = arg;
2490 struct mptses *mpte = mpts->mpts_mpte;
2491
2492 socket_lock_assert_owned(mptetoso(mpte));
2493
2494 if ((mpts->mpts_evctl & events) == events) {
2495 return;
2496 }
2497
2498 mpts->mpts_evctl |= events;
2499
2500 if (mptcp_should_defer_upcall(mpte->mpte_mppcb)) {
2501 mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_WORKLOOP;
2502 return;
2503 }
2504
2505 mptcp_subflow_workloop(mpte);
2506 }
2507
/*
 * Establish an initial MPTCP connection (if first subflow and not yet
 * connected), or add a subflow to an existing MPTCP connection.
 *
 * src may be NULL (unspecified local address); dst is required.  On
 * success the new subflow's connection id is returned via *pcid (if
 * non-NULL).  Ownership of the allocated mptsub transfers to
 * mptcp_subflow_socreate() once that call is made; before that, error
 * paths free it here.
 */
int
mptcp_subflow_add(struct mptses *mpte, struct sockaddr *src,
    struct sockaddr *dst, uint32_t ifscope, sae_connid_t *pcid)
{
	socket_ref_t mp_so, so = NULL;
	struct mptcb *mp_tp;
	struct mptsub *mpts = NULL;
	int af, error = 0;

	mp_so = mptetoso(mpte);
	mp_tp = mpte->mpte_mptcb;

	socket_lock_assert_owned(mp_so);

	if (mp_tp->mpt_state >= MPTCPS_CLOSE_WAIT) {
		/* If the remote end sends Data FIN, refuse subflow adds */
		os_log_error(mptcp_log_handle, "%s - %lx: state %u\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mp_tp->mpt_state);
		error = ENOTCONN;
		goto out_err;
	}

	if (mpte->mpte_numflows > MPTCP_MAX_NUM_SUBFLOWS) {
		error = EOVERFLOW;
		goto out_err;
	}

	mpts = mptcp_subflow_alloc();
	if (mpts == NULL) {
		os_log_error(mptcp_log_handle, "%s - %lx: malloc subflow failed\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
		error = ENOMEM;
		goto out_err;
	}

	/* Validate and copy the (optional) source address */
	if (src) {
		if (src->sa_family != AF_INET && src->sa_family != AF_INET6) {
			error = EAFNOSUPPORT;
			goto out_err;
		}

		if (src->sa_family == AF_INET &&
		    src->sa_len != sizeof(struct sockaddr_in)) {
			error = EINVAL;
			goto out_err;
		}

		if (src->sa_family == AF_INET6 &&
		    src->sa_len != sizeof(struct sockaddr_in6)) {
			error = EINVAL;
			goto out_err;
		}

		mpts->mpts_src = SA(alloc_sockaddr(src->sa_len, Z_WAITOK | Z_NOFAIL));

		SOCKADDR_COPY(src, mpts->mpts_src, src->sa_len);
	}

	/* Validate and copy the destination address */
	if (dst->sa_family != AF_INET && dst->sa_family != AF_INET6) {
		error = EAFNOSUPPORT;
		goto out_err;
	}

	if (dst->sa_family == AF_INET &&
	    dst->sa_len != sizeof(mpts->__mpts_dst_v4)) {
		error = EINVAL;
		goto out_err;
	}

	if (dst->sa_family == AF_INET6 &&
	    dst->sa_len != sizeof(mpts->__mpts_dst_v6)) {
		error = EINVAL;
		goto out_err;
	}

	SOCKADDR_COPY(dst, &mpts->mpts_dst, dst->sa_len);

	af = mpts->mpts_dst.sa_family;

	/* Reject interface scopes that don't name an existing interface */
	ifnet_head_lock_shared();
	if ((ifscope > (unsigned)if_index)) {
		ifnet_head_done();
		error = ENXIO;
		goto out_err;
	}
	ifnet_head_done();

	mpts->mpts_ifscope = ifscope;

	/* create the subflow socket */
	if ((error = mptcp_subflow_socreate(mpte, mpts, af, &so)) != 0) {
		/*
		 * Returning (error) and not cleaning up, because up to here
		 * all we did is creating mpts.
		 *
		 * And the contract is that the call to mptcp_subflow_socreate,
		 * moves ownership of mpts to mptcp_subflow_socreate.
		 */
		return error;
	}

	/*
	 * We may be called from within the kernel. Still need to account this
	 * one to the real app.
	 */
	mptcp_update_last_owner(mpts->mpts_socket, mp_so);

	/*
	 * Increment the counter, while avoiding 0 (SAE_CONNID_ANY) and
	 * -1 (SAE_CONNID_ALL).
	 */
	mpte->mpte_connid_last++;
	if (mpte->mpte_connid_last == SAE_CONNID_ALL ||
	    mpte->mpte_connid_last == SAE_CONNID_ANY) {
		mpte->mpte_connid_last++;
	}

	mpts->mpts_connid = mpte->mpte_connid_last;

	mpts->mpts_rel_seq = 1;

	/* Allocate a unique address id per subflow */
	mpte->mpte_addrid_last++;
	if (mpte->mpte_addrid_last == 0) {
		mpte->mpte_addrid_last++;
	}

	/* register for subflow socket read/write events */
	sock_setupcalls_locked(so, NULL, NULL, mptcp_subflow_wupcall, mpts, 1);

	/* Register for subflow socket control events */
	sock_catchevents_locked(so, mptcp_subflow_eupcall1, mpts,
	    SO_FILT_HINT_CONNRESET | SO_FILT_HINT_CANTRCVMORE |
	    SO_FILT_HINT_TIMEOUT | SO_FILT_HINT_NOSRCADDR |
	    SO_FILT_HINT_IFDENIED | SO_FILT_HINT_CONNECTED |
	    SO_FILT_HINT_DISCONNECTED | SO_FILT_HINT_MPFAILOVER |
	    SO_FILT_HINT_MPSTATUS | SO_FILT_HINT_MUSTRST |
	    SO_FILT_HINT_MPCANTRCVMORE | SO_FILT_HINT_ADAPTIVE_RTIMO |
	    SO_FILT_HINT_ADAPTIVE_WTIMO | SO_FILT_HINT_MP_SUB_ERROR);

	/* sanity check */
	VERIFY(!(mpts->mpts_flags &
	    (MPTSF_CONNECTING | MPTSF_CONNECTED | MPTSF_CONNECT_PENDING)));

	/*
	 * Indicate to the TCP subflow whether or not it should establish
	 * the initial MPTCP connection, or join an existing one. Fill
	 * in the connection request structure with additional info needed
	 * by the underlying TCP (to be used in the TCP options, etc.)
	 */
	if (mp_tp->mpt_state < MPTCPS_ESTABLISHED && mpte->mpte_numflows == 1) {
		mpts->mpts_flags |= MPTSF_INITIAL_SUB;

		if (mp_tp->mpt_state == MPTCPS_CLOSED) {
			mptcp_init_local_parms(mpte, dst);
		}
		soisconnecting(mp_so);

		/* If fastopen is requested, set state in mpts */
		if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
			mpts->mpts_flags |= MPTSF_TFO_REQD;
		}
	} else {
		if (!(mp_tp->mpt_flags & MPTCPF_JOIN_READY)) {
			mpts->mpts_flags |= MPTSF_CONNECT_PENDING;
		}
	}

	mpts->mpts_flags |= MPTSF_CONNECTING;

	/* connect right away if first attempt, or if join can be done now */
	if (!(mpts->mpts_flags & MPTSF_CONNECT_PENDING)) {
		error = mptcp_subflow_soconnectx(mpte, mpts);
	}

	if (error) {
		goto out_err_close;
	}

	if (pcid) {
		*pcid = mpts->mpts_connid;
	}

	return 0;

out_err_close:
	mptcp_subflow_abort(mpts, error);

	return error;

out_err:
	if (mpts) {
		mptcp_subflow_free(mpts);
	}

	return error;
}
2709
2710 void
mptcpstats_update(struct mptcp_itf_stats * stats __counted_by (stats_count),uint16_t stats_count,const struct mptsub * mpts)2711 mptcpstats_update(struct mptcp_itf_stats *stats __counted_by(stats_count), uint16_t stats_count, const struct mptsub *mpts)
2712 {
2713 int index = mptcpstats_get_index(stats, stats_count, mpts);
2714
2715 if (index != -1) {
2716 struct inpcb *inp = sotoinpcb(mpts->mpts_socket);
2717
2718 stats[index].mpis_txbytes += inp->inp_mstat.ms_total.ts_txbytes;
2719 stats[index].mpis_rxbytes += inp->inp_mstat.ms_total.ts_rxbytes;
2720
2721 stats[index].mpis_wifi_txbytes += inp->inp_mstat.ms_wifi_infra.ts_txbytes +
2722 inp->inp_mstat.ms_wifi_non_infra.ts_txbytes;
2723 stats[index].mpis_wifi_rxbytes += inp->inp_mstat.ms_wifi_infra.ts_rxbytes +
2724 inp->inp_mstat.ms_wifi_non_infra.ts_rxbytes;
2725
2726 stats[index].mpis_wired_txbytes += inp->inp_mstat.ms_wired.ts_txbytes;
2727 stats[index].mpis_wired_rxbytes += inp->inp_mstat.ms_wired.ts_rxbytes;
2728
2729 stats[index].mpis_cell_txbytes += inp->inp_mstat.ms_cellular.ts_txbytes;
2730 stats[index].mpis_cell_rxbytes += inp->inp_mstat.ms_cellular.ts_rxbytes;
2731 }
2732 }
2733
/*
 * Delete/remove a subflow from an MPTCP. The underlying subflow socket
 * will no longer be accessible after a subflow is deleted, thus this
 * should occur only after the subflow socket has been disconnected.
 *
 * Drops the two references held on the mptsub (subflow list + subflow
 * socket) and the MPTCP socket's usecount taken for this subflow, so
 * the mptsub may be freed by the last remref below.
 */
void
mptcp_subflow_del(struct mptses *mpte, struct mptsub *mpts)
{
	struct socket *mp_so = mptetoso(mpte);
	struct socket *so = mpts->mpts_socket;
	struct tcpcb *tp = sototcpcb(so);

	socket_lock_assert_owned(mp_so);
	VERIFY(mpts->mpts_mpte == mpte);
	VERIFY(mpte->mpte_numflows != 0);
	VERIFY(mp_so->so_usecount > 0);

	/* Capture final per-interface stats before the subflow goes away */
	mptcpstats_update(mpte->mpte_itfstats, MPTCP_ITFSTATS_SIZE, mpts);

	mptcp_unset_cellicon(mpte, mpts, 1);

	mpte->mpte_init_rxbytes = sotoinpcb(so)->inp_mstat.ms_total.ts_rxbytes;
	mpte->mpte_init_txbytes = sotoinpcb(so)->inp_mstat.ms_total.ts_txbytes;

	TAILQ_REMOVE(&mpte->mpte_subflows, mpts, mpts_entry);
	mpte->mpte_numflows--;
	if (mpte->mpte_active_sub == mpts) {
		mpte->mpte_active_sub = NULL;
	}

	/*
	 * Drop references held by this subflow socket; there
	 * will be no further upcalls made from this point.
	 */
	sock_setupcalls_locked(so, NULL, NULL, NULL, NULL, 0);
	sock_catchevents_locked(so, NULL, NULL, 0);

	mptcp_detach_mptcb_from_subf(mpte->mpte_mptcb, so);

	mp_so->so_usecount--;           /* for subflow socket */
	mpts->mpts_mpte = NULL;
	mpts->mpts_socket = NULL;

	mptcp_subflow_remref(mpts);     /* for MPTCP subflow list */
	mptcp_subflow_remref(mpts);     /* for subflow socket */

	/* Unhook the TCP pcb from MPTCP; 'so' is now a plain TCP socket */
	so->so_flags &= ~SOF_MP_SUBFLOW;
	tp->t_mptcb = NULL;
	tp->t_mpsub = NULL;
}
2784
2785 void
mptcp_subflow_shutdown(struct mptses * mpte,struct mptsub * mpts)2786 mptcp_subflow_shutdown(struct mptses *mpte, struct mptsub *mpts)
2787 {
2788 struct socket *so = mpts->mpts_socket;
2789 struct mptcb *mp_tp = mpte->mpte_mptcb;
2790 int send_dfin = 0;
2791
2792 if (mp_tp->mpt_state > MPTCPS_CLOSE_WAIT) {
2793 send_dfin = 1;
2794 }
2795
2796 if (!(so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) &&
2797 (so->so_state & SS_ISCONNECTED)) {
2798 if (send_dfin) {
2799 mptcp_send_dfin(so);
2800 }
2801 soshutdownlock(so, SHUT_WR);
2802 }
2803 }
2804
2805 static void
mptcp_subflow_abort(struct mptsub * mpts,int error)2806 mptcp_subflow_abort(struct mptsub *mpts, int error)
2807 {
2808 struct socket *so = mpts->mpts_socket;
2809 struct tcpcb *tp = sototcpcb(so);
2810
2811 if (mpts->mpts_flags & MPTSF_DISCONNECTED) {
2812 return;
2813 }
2814
2815 if (tp->t_state != TCPS_CLOSED) {
2816 tcp_drop(tp, error);
2817 }
2818
2819 mptcp_subflow_eupcall1(so, mpts, SO_FILT_HINT_DISCONNECTED);
2820 }
2821
/*
 * Disconnect a subflow socket.
 *
 * Marks the subflow MPTSF_DISCONNECTING, optionally sends a DATA_FIN
 * (when the MPTCP state is past CLOSE_WAIT), defuncts the subflow if
 * the MPTCP socket itself is defunct, performs an orderly shutdown and
 * disconnect, and finally posts a DISCONNECTED event.  Idempotent for
 * subflows already (being) disconnected.
 */
void
mptcp_subflow_disconnect(struct mptses *mpte, struct mptsub *mpts)
{
	struct socket *so, *mp_so;
	struct mptcb *mp_tp;
	int send_dfin = 0;

	so = mpts->mpts_socket;
	mp_tp = mpte->mpte_mptcb;
	mp_so = mptetoso(mpte);

	socket_lock_assert_owned(mp_so);

	if (mpts->mpts_flags & (MPTSF_DISCONNECTING | MPTSF_DISCONNECTED)) {
		return;
	}

	mptcp_unset_cellicon(mpte, mpts, 1);

	mpts->mpts_flags |= MPTSF_DISCONNECTING;

	if (mp_tp->mpt_state > MPTCPS_CLOSE_WAIT) {
		send_dfin = 1;
	}

	/* Propagate defunct status from the MPTCP socket to the subflow */
	if (mp_so->so_flags & SOF_DEFUNCT) {
		errno_t ret;

		ret = sosetdefunct(NULL, so, SHUTDOWN_SOCKET_LEVEL_DISCONNECT_ALL, TRUE);
		if (ret == 0) {
			ret = sodefunct(NULL, so, SHUTDOWN_SOCKET_LEVEL_DISCONNECT_ALL);

			if (ret != 0) {
				os_log_error(mptcp_log_handle, "%s - %lx: sodefunct failed with %d\n",
				    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), ret);
			}
		} else {
			os_log_error(mptcp_log_handle, "%s - %lx: sosetdefunct failed with %d\n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), ret);
		}
	}

	if (!(so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) &&
	    (so->so_state & SS_ISCONNECTED)) {
		if (send_dfin) {
			mptcp_send_dfin(so);
		}

		(void) soshutdownlock(so, SHUT_RD);
		(void) soshutdownlock(so, SHUT_WR);
		(void) sodisconnectlocked(so);
	}

	/*
	 * Generate a disconnect event for this subflow socket, in case
	 * the lower layer doesn't do it; this is needed because the
	 * subflow socket deletion relies on it.
	 */
	mptcp_subflow_eupcall1(so, mpts, SO_FILT_HINT_DISCONNECTED);
}
2885
/*
 * Subflow socket input.
 *
 * Pulls whatever is available from the subflow's receive buffer (via
 * sock_receive_internal, which ends up in mptcp_subflow_soreceive) and
 * feeds it into mptcp_input().  Also drives the cellular-icon policy
 * based on which interface the data arrived on.
 */
static void
mptcp_subflow_input(struct mptses *mpte, struct mptsub *mpts)
{
	struct socket *mp_so = mptetoso(mpte);
	mbuf_ref_t m = NULL;
	struct socket *so;
	int error, wakeup = 0;

	/* Guard against re-entry; cleared via the deferred-upcall handler below */
	VERIFY(!(mpte->mpte_mppcb->mpp_flags & MPP_INSIDE_INPUT));
	mpte->mpte_mppcb->mpp_flags |= MPP_INSIDE_INPUT;

	DTRACE_MPTCP2(subflow__input, struct mptses *, mpte,
	    struct mptsub *, mpts);

	if (!(mpts->mpts_flags & MPTSF_CONNECTED)) {
		goto out;
	}

	so = mpts->mpts_socket;

	error = sock_receive_internal(so, NULL, &m, 0, NULL);
	if (error != 0 && error != EWOULDBLOCK) {
		os_log_error(mptcp_log_handle, "%s - %lx: cid %d error %d\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpts->mpts_connid, error);
		if (error == ENODATA) {
			/*
			 * Don't ignore ENODATA so as to discover
			 * nasty middleboxes.
			 */
			mp_so->so_error = ENODATA;

			wakeup = 1;
			goto out;
		}
	}

	/* In fallback, make sure to accept data on all but one subflow */
	if (m && (mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
	    !(mpts->mpts_flags & MPTSF_ACTIVE)) {
		m_freem(m);
		goto out;
	}

	if (m != NULL) {
		if (IFNET_IS_CELLULAR(sotoinpcb(so)->inp_last_outifp)) {
			mptcp_set_cellicon(mpte, mpts);

			mpte->mpte_used_cell = 1;
		} else {
			/*
			 * If during the past MPTCP_CELLICON_TOGGLE_RATE seconds we didn't
			 * explicitly set the cellicon, then we unset it again.
			 */
			if (TSTMP_LT(mpte->mpte_last_cellicon_set + MPTCP_CELLICON_TOGGLE_RATE, tcp_now)) {
				mptcp_unset_cellicon(mpte, NULL, 1);
			}

			mpte->mpte_used_wifi = 1;
		}

		mptcp_input(mpte, m);
	}

out:
	if (wakeup) {
		mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_RWAKEUP;
	}

	mptcp_handle_deferred_upcalls(mpte->mpte_mppcb, MPP_INSIDE_INPUT);
}
2959
/*
 * Entry point for subflow read events: run mptcp_subflow_input() over
 * every live subflow of the session owning 'so'.  If upcalls are being
 * deferred, only records that a read wakeup is pending.  No-op for
 * non-subflow sockets.
 */
void
mptcp_handle_input(struct socket *so)
{
	struct mptsub *mpts, *tmpts;
	struct mptses *mpte;

	if (!(so->so_flags & SOF_MP_SUBFLOW)) {
		return;
	}

	mpts = sototcpcb(so)->t_mpsub;
	mpte = mpts->mpts_mpte;

	socket_lock_assert_owned(mptetoso(mpte));

	if (mptcp_should_defer_upcall(mpte->mpte_mppcb)) {
		if (!(mpte->mpte_mppcb->mpp_flags & MPP_INPUT_HANDLE)) {
			mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_RWAKEUP;
		}
		return;
	}

	mpte->mpte_mppcb->mpp_flags |= MPP_INPUT_HANDLE;
	TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
		if (mpts->mpts_socket->so_usecount == 0) {
			/* Will be removed soon by tcp_garbage_collect */
			continue;
		}

		/* Pin both the subflow and its socket across the input call */
		mptcp_subflow_addref(mpts);
		mpts->mpts_socket->so_usecount++;

		mptcp_subflow_input(mpte, mpts);

		mptcp_subflow_remref(mpts);             /* ours */

		VERIFY(mpts->mpts_socket->so_usecount != 0);
		mpts->mpts_socket->so_usecount--;
	}

	mptcp_handle_deferred_upcalls(mpte->mpte_mppcb, MPP_INPUT_HANDLE);
}
3002
3003 static boolean_t
mptcp_search_seq_in_sub(struct mbuf * m,struct socket * so)3004 mptcp_search_seq_in_sub(struct mbuf *m, struct socket *so)
3005 {
3006 struct mbuf *so_m = so->so_snd.sb_mb;
3007 uint64_t dsn = m->m_pkthdr.mp_dsn;
3008
3009 while (so_m) {
3010 VERIFY(so_m->m_flags & M_PKTHDR);
3011 VERIFY(so_m->m_pkthdr.pkt_flags & PKTF_MPTCP);
3012
3013 /* Part of the segment is covered, don't reinject here */
3014 if (so_m->m_pkthdr.mp_dsn <= dsn &&
3015 so_m->m_pkthdr.mp_dsn + so_m->m_pkthdr.mp_rlen > dsn) {
3016 return TRUE;
3017 }
3018
3019 so_m = so_m->m_next;
3020 }
3021
3022 return FALSE;
3023 }
3024
3025 /*
3026 * Subflow socket output.
3027 *
3028 * Called for sending data from MPTCP to the underlying subflow socket.
3029 */
3030 int
mptcp_subflow_output(struct mptses * mpte,struct mptsub * mpts,int flags)3031 mptcp_subflow_output(struct mptses *mpte, struct mptsub *mpts, int flags)
3032 {
3033 struct mptcb *mp_tp = mpte->mpte_mptcb;
3034 struct mbuf *sb_mb, *m, *mpt_mbuf = NULL, *head = NULL, *tail = NULL;
3035 struct socket *mp_so, *so;
3036 struct tcpcb *tp;
3037 uint64_t mpt_dsn = 0, off = 0;
3038 int sb_cc = 0, error = 0, wakeup = 0;
3039 uint16_t dss_csum;
3040 uint16_t tot_sent = 0;
3041 boolean_t reinjected = FALSE;
3042
3043 mp_so = mptetoso(mpte);
3044 so = mpts->mpts_socket;
3045 tp = sototcpcb(so);
3046
3047 socket_lock_assert_owned(mp_so);
3048
3049 VERIFY(!(mpte->mpte_mppcb->mpp_flags & MPP_INSIDE_OUTPUT));
3050 mpte->mpte_mppcb->mpp_flags |= MPP_INSIDE_OUTPUT;
3051
3052 VERIFY(!INP_WAIT_FOR_IF_FEEDBACK(sotoinpcb(so)));
3053 VERIFY((mpts->mpts_flags & MPTSF_MP_CAPABLE) ||
3054 (mpts->mpts_flags & MPTSF_MP_DEGRADED) ||
3055 (mpts->mpts_flags & MPTSF_TFO_REQD));
3056 VERIFY(mptcp_subflow_cwnd_space(mpts->mpts_socket) > 0);
3057
3058 DTRACE_MPTCP2(subflow__output, struct mptses *, mpte,
3059 struct mptsub *, mpts);
3060
3061 /* Remove Addr Option is not sent reliably as per I-D */
3062 if (mpte->mpte_flags & MPTE_SND_REM_ADDR) {
3063 tp->t_rem_aid = mpte->mpte_lost_aid;
3064 tp->t_mpflags |= TMPF_SND_REM_ADDR;
3065 mpte->mpte_flags &= ~MPTE_SND_REM_ADDR;
3066 }
3067
3068 /*
3069 * The mbuf chains containing the metadata (as well as pointing to
3070 * the user data sitting at the MPTCP output queue) would then be
3071 * sent down to the subflow socket.
3072 *
3073 * Some notes on data sequencing:
3074 *
3075 * a. Each mbuf must be a M_PKTHDR.
3076 * b. MPTCP metadata is stored in the mptcp_pktinfo structure
3077 * in the mbuf pkthdr structure.
3078 * c. Each mbuf containing the MPTCP metadata must have its
3079 * pkt_flags marked with the PKTF_MPTCP flag.
3080 */
3081
3082 if (mpte->mpte_reinjectq) {
3083 sb_mb = mpte->mpte_reinjectq;
3084 } else {
3085 sb_mb = mp_so->so_snd.sb_mb;
3086 }
3087
3088 if (sb_mb == NULL) {
3089 os_log_error(mptcp_log_handle, "%s - %lx: No data in MPTCP-sendbuffer! smax %u snxt %u suna %u state %u flags %#x\n",
3090 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
3091 (uint32_t)mp_tp->mpt_sndmax, (uint32_t)mp_tp->mpt_sndnxt,
3092 (uint32_t)mp_tp->mpt_snduna, mp_tp->mpt_state, mp_so->so_flags1);
3093
3094 /* Fix it to prevent looping */
3095 if (MPTCP_SEQ_LT(mp_tp->mpt_sndnxt, mp_tp->mpt_snduna)) {
3096 mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;
3097 }
3098 goto out;
3099 }
3100
3101 VERIFY(sb_mb->m_pkthdr.pkt_flags & PKTF_MPTCP);
3102
3103 if (sb_mb->m_pkthdr.mp_rlen == 0 &&
3104 !(so->so_state & SS_ISCONNECTED) &&
3105 (so->so_flags1 & SOF1_PRECONNECT_DATA)) {
3106 tp->t_mpflags |= TMPF_TFO_REQUEST;
3107
3108 /* Opting to call pru_send as no mbuf at subflow level */
3109 error = (*so->so_proto->pr_usrreqs->pru_send)(so, 0, NULL, NULL,
3110 NULL, current_proc());
3111 if (error == EJUSTRETURN) {
3112 error = 0;
3113 }
3114
3115 goto done_sending;
3116 }
3117
3118 mpt_dsn = sb_mb->m_pkthdr.mp_dsn;
3119
3120 /* First, drop acknowledged data */
3121 if (MPTCP_SEQ_LT(mpt_dsn, mp_tp->mpt_snduna)) {
3122 os_log_error(mptcp_log_handle, "%s - %lx: dropping data, should have been done earlier "
3123 "dsn %u suna %u reinject? %u\n",
3124 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), (uint32_t)mpt_dsn,
3125 (uint32_t)mp_tp->mpt_snduna, !!mpte->mpte_reinjectq);
3126 if (mpte->mpte_reinjectq) {
3127 mptcp_clean_reinjectq(mpte);
3128 } else {
3129 uint64_t len = 0;
3130 len = mp_tp->mpt_snduna - mpt_dsn;
3131 sbdrop(&mp_so->so_snd, (int)len);
3132 wakeup = 1;
3133 }
3134 }
3135
3136 /* Check again because of above sbdrop */
3137 if (mp_so->so_snd.sb_mb == NULL && mpte->mpte_reinjectq == NULL) {
3138 os_log_error(mptcp_log_handle, "%s - $%lx: send-buffer is empty\n",
3139 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
3140 goto out;
3141 }
3142
3143 /*
3144 * In degraded mode, we don't receive data acks, so force free
3145 * mbufs less than snd_nxt
3146 */
3147 if ((mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
3148 (mp_tp->mpt_flags & MPTCPF_POST_FALLBACK_SYNC) &&
3149 mp_so->so_snd.sb_mb) {
3150 mpt_dsn = mp_so->so_snd.sb_mb->m_pkthdr.mp_dsn;
3151 if (MPTCP_SEQ_LT(mpt_dsn, mp_tp->mpt_snduna)) {
3152 uint64_t len = 0;
3153 len = mp_tp->mpt_snduna - mpt_dsn;
3154 sbdrop(&mp_so->so_snd, (int)len);
3155 wakeup = 1;
3156
3157 os_log_error(mptcp_log_handle, "%s - %lx: dropping data in degraded mode, should have been done earlier dsn %u sndnxt %u suna %u\n",
3158 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
3159 (uint32_t)mpt_dsn, (uint32_t)mp_tp->mpt_sndnxt, (uint32_t)mp_tp->mpt_snduna);
3160 }
3161 }
3162
3163 if ((mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
3164 !(mp_tp->mpt_flags & MPTCPF_POST_FALLBACK_SYNC)) {
3165 mp_tp->mpt_flags |= MPTCPF_POST_FALLBACK_SYNC;
3166 so->so_flags1 |= SOF1_POST_FALLBACK_SYNC;
3167 }
3168
3169 /*
3170 * Adjust the top level notion of next byte used for retransmissions
3171 * and sending FINs.
3172 */
3173 if (MPTCP_SEQ_LT(mp_tp->mpt_sndnxt, mp_tp->mpt_snduna)) {
3174 mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;
3175 }
3176
3177 /* Now determine the offset from which to start transmitting data */
3178 if (mpte->mpte_reinjectq) {
3179 sb_mb = mpte->mpte_reinjectq;
3180 } else {
3181 dont_reinject:
3182 sb_mb = mp_so->so_snd.sb_mb;
3183 }
3184 if (sb_mb == NULL) {
3185 os_log_error(mptcp_log_handle, "%s - %lx: send-buffer is still empty\n", __func__,
3186 (unsigned long)VM_KERNEL_ADDRPERM(mpte));
3187 goto out;
3188 }
3189
3190 if (sb_mb == mpte->mpte_reinjectq) {
3191 sb_cc = sb_mb->m_pkthdr.mp_rlen;
3192 off = 0;
3193
3194 if (mptcp_search_seq_in_sub(sb_mb, so)) {
3195 if (mptcp_can_send_more(mp_tp, TRUE)) {
3196 goto dont_reinject;
3197 }
3198
3199 error = ECANCELED;
3200 goto out;
3201 }
3202
3203 reinjected = TRUE;
3204 } else if (flags & MPTCP_SUBOUT_PROBING) {
3205 sb_cc = sb_mb->m_pkthdr.mp_rlen;
3206 off = 0;
3207 } else {
3208 sb_cc = min(mp_so->so_snd.sb_cc, mp_tp->mpt_sndwnd);
3209
3210 /*
3211 * With TFO, there might be no data at all, thus still go into this
3212 * code-path here.
3213 */
3214 if ((mp_so->so_flags1 & SOF1_PRECONNECT_DATA) ||
3215 MPTCP_SEQ_LT(mp_tp->mpt_sndnxt, mp_tp->mpt_sndmax)) {
3216 off = mp_tp->mpt_sndnxt - mp_tp->mpt_snduna;
3217 sb_cc -= off;
3218 } else {
3219 os_log_error(mptcp_log_handle, "%s - %lx: this should not happen: sndnxt %u sndmax %u\n",
3220 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), (uint32_t)mp_tp->mpt_sndnxt,
3221 (uint32_t)mp_tp->mpt_sndmax);
3222
3223 goto out;
3224 }
3225 }
3226
3227 sb_cc = min(sb_cc, mptcp_subflow_cwnd_space(so));
3228 if (sb_cc <= 0) {
3229 os_log_error(mptcp_log_handle, "%s - %lx: sb_cc is %d, mp_so->sb_cc %u, sndwnd %u,sndnxt %u sndmax %u cwnd %u\n",
3230 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), sb_cc, mp_so->so_snd.sb_cc, mp_tp->mpt_sndwnd,
3231 (uint32_t)mp_tp->mpt_sndnxt, (uint32_t)mp_tp->mpt_sndmax,
3232 mptcp_subflow_cwnd_space(so));
3233 }
3234
3235 sb_cc = min(sb_cc, UINT16_MAX);
3236
3237 /*
3238 * Create a DSN mapping for the data we are about to send. It all
3239 * has the same mapping.
3240 */
3241 if (reinjected) {
3242 mpt_dsn = sb_mb->m_pkthdr.mp_dsn;
3243 } else {
3244 mpt_dsn = mp_tp->mpt_snduna + off;
3245 }
3246
3247 mpt_mbuf = sb_mb;
3248 while (mpt_mbuf && reinjected == FALSE &&
3249 (mpt_mbuf->m_pkthdr.mp_rlen == 0 ||
3250 mpt_mbuf->m_pkthdr.mp_rlen <= (uint32_t)off)) {
3251 off -= mpt_mbuf->m_pkthdr.mp_rlen;
3252 mpt_mbuf = mpt_mbuf->m_next;
3253 }
3254 VERIFY((mpt_mbuf == NULL) || (mpt_mbuf->m_pkthdr.pkt_flags & PKTF_MPTCP));
3255
3256 head = tail = NULL;
3257
3258 while (tot_sent < sb_cc) {
3259 int32_t mlen;
3260
3261 mlen = mpt_mbuf->m_len;
3262 mlen -= off;
3263 mlen = MIN(mlen, sb_cc - tot_sent);
3264
3265 if (mlen < 0) {
3266 os_log_error(mptcp_log_handle, "%s - %lx: mlen %d mp_rlen %u off %u sb_cc %u tot_sent %u\n",
3267 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mlen, mpt_mbuf->m_pkthdr.mp_rlen,
3268 (uint32_t)off, sb_cc, tot_sent);
3269 goto out;
3270 }
3271
3272 if (mlen == 0) {
3273 goto next;
3274 }
3275
3276 m = m_copym_mode(mpt_mbuf, (int)off, mlen, M_DONTWAIT, NULL, NULL,
3277 M_COPYM_MUST_COPY_HDR);
3278 if (m == NULL) {
3279 os_log_error(mptcp_log_handle, "%s - %lx: m_copym_mode failed\n", __func__,
3280 (unsigned long)VM_KERNEL_ADDRPERM(mpte));
3281 error = ENOBUFS;
3282 break;
3283 }
3284
3285 /* Create a DSN mapping for the data (m_copym does it) */
3286 VERIFY(m->m_flags & M_PKTHDR);
3287 VERIFY(m->m_next == NULL);
3288
3289 m->m_pkthdr.pkt_flags |= PKTF_MPTCP;
3290 m->m_pkthdr.pkt_flags &= ~PKTF_MPSO;
3291 m->m_pkthdr.mp_dsn = mpt_dsn;
3292 m->m_pkthdr.mp_rseq = mpts->mpts_rel_seq;
3293 m->m_pkthdr.len = mlen;
3294
3295 if (head == NULL) {
3296 head = tail = m;
3297 } else {
3298 tail->m_next = m;
3299 tail = m;
3300 }
3301
3302 tot_sent += mlen;
3303 off = 0;
3304 next:
3305 mpt_mbuf = mpt_mbuf->m_next;
3306 }
3307
3308 if (reinjected) {
3309 if (sb_cc < sb_mb->m_pkthdr.mp_rlen) {
3310 struct mbuf *n = sb_mb;
3311
3312 while (n) {
3313 n->m_pkthdr.mp_dsn += sb_cc;
3314 n->m_pkthdr.mp_rlen -= sb_cc;
3315 n = n->m_next;
3316 }
3317 m_adj(sb_mb, sb_cc);
3318 } else {
3319 mpte->mpte_reinjectq = sb_mb->m_nextpkt;
3320 m_freem(sb_mb);
3321 }
3322 }
3323
3324 if (head && (mp_tp->mpt_flags & MPTCPF_CHECKSUM)) {
3325 dss_csum = mptcp_output_csum(head, mpt_dsn, mpts->mpts_rel_seq,
3326 tot_sent);
3327 }
3328
3329 /* Now, let's update rel-seq and the data-level length */
3330 mpts->mpts_rel_seq += tot_sent;
3331 m = head;
3332 while (m) {
3333 if (mp_tp->mpt_flags & MPTCPF_CHECKSUM) {
3334 m->m_pkthdr.mp_csum = dss_csum;
3335 }
3336 m->m_pkthdr.mp_rlen = tot_sent;
3337 m = m->m_next;
3338 }
3339
3340 if (head != NULL) {
3341 if ((mpts->mpts_flags & MPTSF_TFO_REQD) &&
3342 (tp->t_tfo_stats == 0)) {
3343 tp->t_mpflags |= TMPF_TFO_REQUEST;
3344 }
3345
3346 error = so->so_proto->pr_usrreqs->pru_sosend(so, NULL, NULL, head, NULL, 0);
3347 head = NULL;
3348 }
3349
3350 done_sending:
3351 if (error == 0 ||
3352 (error == EWOULDBLOCK && (tp->t_mpflags & TMPF_TFO_REQUEST))) {
3353 uint64_t new_sndnxt = mp_tp->mpt_sndnxt + tot_sent;
3354
3355 if (mpts->mpts_probesoon && mpts->mpts_maxseg && tot_sent) {
3356 tcpstat.tcps_mp_num_probes++;
3357 if ((uint32_t)tot_sent < mpts->mpts_maxseg) {
3358 mpts->mpts_probecnt += 1;
3359 } else {
3360 mpts->mpts_probecnt +=
3361 tot_sent / mpts->mpts_maxseg;
3362 }
3363 }
3364
3365 if (!reinjected && !(flags & MPTCP_SUBOUT_PROBING)) {
3366 if (MPTCP_DATASEQ_HIGH32(new_sndnxt) >
3367 MPTCP_DATASEQ_HIGH32(mp_tp->mpt_sndnxt)) {
3368 mp_tp->mpt_flags |= MPTCPF_SND_64BITDSN;
3369 }
3370 mp_tp->mpt_sndnxt = new_sndnxt;
3371 }
3372
3373 mptcp_cancel_timer(mp_tp, MPTT_REXMT);
3374
3375 /* Must be here as mptcp_can_send_more() checks for this */
3376 soclearfastopen(mp_so);
3377
3378 if (IFNET_IS_CELLULAR(sotoinpcb(so)->inp_last_outifp)) {
3379 mptcp_set_cellicon(mpte, mpts);
3380
3381 mpte->mpte_used_cell = 1;
3382 } else {
3383 /*
3384 * If during the past MPTCP_CELLICON_TOGGLE_RATE seconds we didn't
3385 * explicitly set the cellicon, then we unset it again.
3386 */
3387 if (TSTMP_LT(mpte->mpte_last_cellicon_set + MPTCP_CELLICON_TOGGLE_RATE, tcp_now)) {
3388 mptcp_unset_cellicon(mpte, NULL, 1);
3389 }
3390
3391 mpte->mpte_used_wifi = 1;
3392 }
3393
3394 /*
3395 * Don't propagate EWOULDBLOCK - it's already taken care of
3396 * in mptcp_usr_send for TFO.
3397 */
3398 error = 0;
3399 } else {
3400 /* We need to revert our change to mpts_rel_seq */
3401 mpts->mpts_rel_seq -= tot_sent;
3402
3403 os_log_error(mptcp_log_handle, "%s - %lx: %u error %d len %d subflags %#x sostate %#x soerror %u hiwat %u lowat %u\n",
3404 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpts->mpts_connid, error, tot_sent, so->so_flags, so->so_state, so->so_error, so->so_snd.sb_hiwat, so->so_snd.sb_lowat);
3405 }
3406 out:
3407
3408 if (head != NULL) {
3409 m_freem(head);
3410 }
3411
3412 if (wakeup) {
3413 mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_WWAKEUP;
3414 }
3415
3416 mptcp_handle_deferred_upcalls(mpte->mpte_mppcb, MPP_INSIDE_OUTPUT);
3417 return error;
3418 }
3419
/*
 * Enqueue mbuf chain 'm' on the MPTCP session's reinject queue, which is
 * kept sorted by data-sequence number (DSN).  Overlap with existing
 * entries is resolved so that no queued entry is fully covered by another.
 * Ownership of 'm' is always consumed: it is either linked into the queue
 * or freed here.
 */
static void
mptcp_add_reinjectq(struct mptses *mpte, struct mbuf *m)
{
	struct mbuf *n, *prev = NULL;

	n = mpte->mpte_reinjectq;

	/* First, look for an mbuf n, whose data-sequence-number is bigger or
	 * equal than m's sequence number.
	 */
	while (n) {
		if (MPTCP_SEQ_GEQ(n->m_pkthdr.mp_dsn, m->m_pkthdr.mp_dsn)) {
			break;
		}

		prev = n;

		n = n->m_nextpkt;
	}

	if (n) {
		/* m is already fully covered by the next mbuf in the queue */
		if (n->m_pkthdr.mp_dsn == m->m_pkthdr.mp_dsn &&
		    n->m_pkthdr.mp_rlen >= m->m_pkthdr.mp_rlen) {
			os_log(mptcp_log_handle, "%s - %lx: dsn %u dlen %u rseq %u fully covered with len %u\n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
			    (uint32_t)m->m_pkthdr.mp_dsn, m->m_pkthdr.mp_rlen,
			    m->m_pkthdr.mp_rseq, n->m_pkthdr.mp_rlen);
			goto dont_queue;
		}

		/* m is covering the next mbuf entirely, thus we remove this guy */
		if (m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen >= n->m_pkthdr.mp_dsn + n->m_pkthdr.mp_rlen) {
			struct mbuf *tmp = n->m_nextpkt;

			os_log(mptcp_log_handle, "%s - %lx: m (dsn %u len %u) is covering existing mbuf (dsn %u len %u)\n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
			    (uint32_t)m->m_pkthdr.mp_dsn, m->m_pkthdr.mp_rlen,
			    (uint32_t)n->m_pkthdr.mp_dsn, n->m_pkthdr.mp_rlen);

			/* Unlink n (m takes its place below) and free it. */
			m->m_nextpkt = NULL;
			if (prev == NULL) {
				mpte->mpte_reinjectq = tmp;
			} else {
				prev->m_nextpkt = tmp;
			}

			m_freem(n);
			n = tmp;
		}
	}

	if (prev) {
		/* m is already fully covered by the previous mbuf in the queue */
		/*
		 * NOTE(review): the right-hand side uses m_pkthdr.len here while
		 * the analogous checks above use mp_rlen — presumably the two are
		 * equal for reinjected chains; confirm before relying on it.
		 */
		if (prev->m_pkthdr.mp_dsn + prev->m_pkthdr.mp_rlen >= m->m_pkthdr.mp_dsn + m->m_pkthdr.len) {
			os_log(mptcp_log_handle, "%s - %lx: prev (dsn %u len %u) covers us (dsn %u len %u)\n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
			    (uint32_t)prev->m_pkthdr.mp_dsn, prev->m_pkthdr.mp_rlen,
			    (uint32_t)m->m_pkthdr.mp_dsn, m->m_pkthdr.mp_rlen);
			goto dont_queue;
		}
	}

	/* Link m between prev and n (either of which may be NULL). */
	if (prev == NULL) {
		mpte->mpte_reinjectq = m;
	} else {
		prev->m_nextpkt = m;
	}

	m->m_nextpkt = n;

	return;

dont_queue:
	m_freem(m);
	return;
}
3497
3498 static struct mbuf *
mptcp_lookup_dsn(struct mptses * mpte,uint64_t dsn)3499 mptcp_lookup_dsn(struct mptses *mpte, uint64_t dsn)
3500 {
3501 struct socket *mp_so = mptetoso(mpte);
3502 struct mbuf *m;
3503
3504 m = mp_so->so_snd.sb_mb;
3505
3506 while (m) {
3507 /* If this segment covers what we are looking for, return it. */
3508 if (MPTCP_SEQ_LEQ(m->m_pkthdr.mp_dsn, dsn) &&
3509 MPTCP_SEQ_GT(m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen, dsn)) {
3510 break;
3511 }
3512
3513
3514 /* Segment is no more in the queue */
3515 if (MPTCP_SEQ_GT(m->m_pkthdr.mp_dsn, dsn)) {
3516 return NULL;
3517 }
3518
3519 m = m->m_next;
3520 }
3521
3522 return m;
3523 }
3524
3525 static struct mbuf *
mptcp_copy_mbuf_list(struct mptses * mpte,struct mbuf * m,int len)3526 mptcp_copy_mbuf_list(struct mptses *mpte, struct mbuf *m, int len)
3527 {
3528 struct mbuf *top = NULL, *tail = NULL;
3529 uint64_t dsn;
3530 uint32_t dlen, rseq;
3531
3532 dsn = m->m_pkthdr.mp_dsn;
3533 dlen = m->m_pkthdr.mp_rlen;
3534 rseq = m->m_pkthdr.mp_rseq;
3535
3536 while (len > 0) {
3537 struct mbuf *n;
3538
3539 VERIFY((m->m_flags & M_PKTHDR) && (m->m_pkthdr.pkt_flags & PKTF_MPTCP));
3540
3541 n = m_copym_mode(m, 0, m->m_len, M_DONTWAIT, NULL, NULL, M_COPYM_MUST_COPY_HDR);
3542 if (n == NULL) {
3543 os_log_error(mptcp_log_handle, "%s - %lx: m_copym_mode returned NULL\n",
3544 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
3545 goto err;
3546 }
3547
3548 VERIFY(n->m_flags & M_PKTHDR);
3549 VERIFY(n->m_next == NULL);
3550 VERIFY(n->m_pkthdr.mp_dsn == dsn);
3551 VERIFY(n->m_pkthdr.mp_rlen == dlen);
3552 VERIFY(n->m_pkthdr.mp_rseq == rseq);
3553 VERIFY(n->m_len == m->m_len);
3554
3555 n->m_pkthdr.pkt_flags |= (PKTF_MPSO | PKTF_MPTCP);
3556
3557 if (top == NULL) {
3558 top = n;
3559 }
3560
3561 if (tail != NULL) {
3562 tail->m_next = n;
3563 }
3564
3565 tail = n;
3566
3567 len -= m->m_len;
3568 m = m->m_next;
3569 }
3570
3571 return top;
3572
3573 err:
3574 if (top) {
3575 m_freem(top);
3576 }
3577
3578 return NULL;
3579 }
3580
/*
 * Walk the subflow's send buffer and copy every segment that has not yet
 * been reinjected and not yet been acknowledged at the MPTCP data level
 * onto the session's reinject queue, so another subflow can retransmit it.
 * Segments already marked PKTF_MPTCP_REINJ are skipped.
 */
static void
mptcp_reinject_mbufs(struct socket *so)
{
	struct tcpcb *tp = sototcpcb(so);
	struct mptsub *mpts = tp->t_mpsub;
	struct mptcb *mp_tp = tptomptp(tp);
	struct mptses *mpte = mp_tp->mpt_mpte;
	struct sockbuf *sb = &so->so_snd;
	struct mbuf *m;

	m = sb->sb_mb;
	while (m) {
		/* 'orig' remembers the sb entry; 'm' may be repointed below. */
		struct mbuf *n = m->m_next, *orig = m;
		bool set_reinject_flag = false;

		VERIFY((m->m_flags & M_PKTHDR) && (m->m_pkthdr.pkt_flags & PKTF_MPTCP));

		/* Already queued for reinjection on an earlier pass? */
		if (m->m_pkthdr.pkt_flags & PKTF_MPTCP_REINJ) {
			goto next;
		}

		/* Has it all already been acknowledged at the data-level? */
		if (MPTCP_SEQ_GEQ(mp_tp->mpt_snduna, m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen)) {
			goto next;
		}

		/* Part of this has already been acknowledged - lookup in the
		 * MPTCP-socket for the segment.
		 */
		if (SEQ_GT(tp->snd_una - mpts->mpts_iss, m->m_pkthdr.mp_rseq)) {
			m = mptcp_lookup_dsn(mpte, m->m_pkthdr.mp_dsn);
			if (m == NULL) {
				goto next;
			}
		}

		/* Copy the mbuf with headers (aka, DSN-numbers) */
		m = mptcp_copy_mbuf_list(mpte, m, m->m_pkthdr.mp_rlen);
		if (m == NULL) {
			/* Allocation failure - give up; retried on a later pass. */
			break;
		}

		VERIFY(m->m_nextpkt == NULL);

		/* Now, add to the reinject-queue, eliminating overlapping
		 * segments
		 */
		mptcp_add_reinjectq(mpte, m);

		set_reinject_flag = true;
		orig->m_pkthdr.pkt_flags |= PKTF_MPTCP_REINJ;

next:
		/* mp_rlen can cover multiple mbufs, so advance to the end of it. */
		while (n) {
			VERIFY((n->m_flags & M_PKTHDR) && (n->m_pkthdr.pkt_flags & PKTF_MPTCP));

			/* A different DSN marks the start of the next mapping. */
			if (n->m_pkthdr.mp_dsn != orig->m_pkthdr.mp_dsn) {
				break;
			}

			if (set_reinject_flag) {
				n->m_pkthdr.pkt_flags |= PKTF_MPTCP_REINJ;
			}
			n = n->m_next;
		}

		m = n;
	}
}
3651
3652 void
mptcp_clean_reinjectq(struct mptses * mpte)3653 mptcp_clean_reinjectq(struct mptses *mpte)
3654 {
3655 struct mptcb *mp_tp = mpte->mpte_mptcb;
3656
3657 socket_lock_assert_owned(mptetoso(mpte));
3658
3659 while (mpte->mpte_reinjectq) {
3660 struct mbuf *m = mpte->mpte_reinjectq;
3661
3662 if (MPTCP_SEQ_GEQ(m->m_pkthdr.mp_dsn, mp_tp->mpt_snduna) ||
3663 MPTCP_SEQ_GT(m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen, mp_tp->mpt_snduna)) {
3664 break;
3665 }
3666
3667 mpte->mpte_reinjectq = m->m_nextpkt;
3668 m->m_nextpkt = NULL;
3669 m_freem(m);
3670 }
3671 }
3672
3673 static ev_ret_t
mptcp_subflow_propagate_ev(struct mptses * mpte,struct mptsub * mpts,uint32_t * p_mpsofilt_hint,uint32_t event)3674 mptcp_subflow_propagate_ev(struct mptses *mpte, struct mptsub *mpts,
3675 uint32_t *p_mpsofilt_hint, uint32_t event)
3676 {
3677 struct socket *mp_so, *so;
3678 struct mptcb *mp_tp;
3679
3680 mp_so = mptetoso(mpte);
3681 mp_tp = mpte->mpte_mptcb;
3682 so = mpts->mpts_socket;
3683
3684 /*
3685 * We got an event for this subflow that might need to be propagated,
3686 * based on the state of the MPTCP connection.
3687 */
3688 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED ||
3689 (!(mp_tp->mpt_flags & MPTCPF_JOIN_READY) && !(mpts->mpts_flags & MPTSF_MP_READY)) ||
3690 ((mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) && (mpts->mpts_flags & MPTSF_ACTIVE))) {
3691 mp_so->so_error = so->so_error;
3692 *p_mpsofilt_hint |= event;
3693 }
3694
3695 return MPTS_EVRET_OK;
3696 }
3697
3698 /*
3699 * Handle SO_FILT_HINT_NOSRCADDR subflow socket event.
3700 */
3701 static ev_ret_t
mptcp_subflow_nosrcaddr_ev(struct mptses * mpte,struct mptsub * mpts,uint32_t * p_mpsofilt_hint,uint32_t event)3702 mptcp_subflow_nosrcaddr_ev(struct mptses *mpte, struct mptsub *mpts,
3703 uint32_t *p_mpsofilt_hint, uint32_t event)
3704 {
3705 struct socket *mp_so;
3706 struct tcpcb *tp;
3707
3708 mp_so = mptetoso(mpte);
3709 tp = intotcpcb(sotoinpcb(mpts->mpts_socket));
3710
3711 /*
3712 * This overwrites any previous mpte_lost_aid to avoid storing
3713 * too much state when the typical case has only two subflows.
3714 */
3715 mpte->mpte_flags |= MPTE_SND_REM_ADDR;
3716 mpte->mpte_lost_aid = tp->t_local_aid;
3717
3718 /*
3719 * The subflow connection has lost its source address.
3720 */
3721 mptcp_subflow_abort(mpts, EADDRNOTAVAIL);
3722
3723 if (mp_so->so_flags & SOF_NOADDRAVAIL) {
3724 mptcp_subflow_propagate_ev(mpte, mpts, p_mpsofilt_hint, event);
3725 }
3726
3727 return MPTS_EVRET_DELETE;
3728 }
3729
3730 static ev_ret_t
mptcp_subflow_mpsuberror_ev(struct mptses * mpte,struct mptsub * mpts,uint32_t * p_mpsofilt_hint,uint32_t event)3731 mptcp_subflow_mpsuberror_ev(struct mptses *mpte, struct mptsub *mpts,
3732 uint32_t *p_mpsofilt_hint, uint32_t event)
3733 {
3734 #pragma unused(event, p_mpsofilt_hint)
3735 struct socket *so, *mp_so;
3736
3737 so = mpts->mpts_socket;
3738
3739 if (so->so_error != ENODATA) {
3740 return MPTS_EVRET_OK;
3741 }
3742
3743
3744 mp_so = mptetoso(mpte);
3745
3746 mp_so->so_error = ENODATA;
3747
3748 sorwakeup(mp_so);
3749 sowwakeup(mp_so);
3750
3751 return MPTS_EVRET_OK;
3752 }
3753
3754
3755 /*
3756 * Handle SO_FILT_HINT_MPCANTRCVMORE subflow socket event that
3757 * indicates that the remote side sent a Data FIN
3758 */
3759 static ev_ret_t
mptcp_subflow_mpcantrcvmore_ev(struct mptses * mpte,struct mptsub * mpts,uint32_t * p_mpsofilt_hint,uint32_t event)3760 mptcp_subflow_mpcantrcvmore_ev(struct mptses *mpte, struct mptsub *mpts,
3761 uint32_t *p_mpsofilt_hint, uint32_t event)
3762 {
3763 #pragma unused(event, mpts)
3764 struct mptcb *mp_tp = mpte->mpte_mptcb;
3765
3766 /*
3767 * We got a Data FIN for the MPTCP connection.
3768 * The FIN may arrive with data. The data is handed up to the
3769 * mptcp socket and the user is notified so that it may close
3770 * the socket if needed.
3771 */
3772 if (mp_tp->mpt_state == MPTCPS_CLOSE_WAIT) {
3773 *p_mpsofilt_hint |= SO_FILT_HINT_CANTRCVMORE;
3774 }
3775
3776 return MPTS_EVRET_OK; /* keep the subflow socket around */
3777 }
3778
3779 /*
3780 * Handle SO_FILT_HINT_MPFAILOVER subflow socket event
3781 */
3782 static ev_ret_t
mptcp_subflow_failover_ev(struct mptses * mpte,struct mptsub * mpts,uint32_t * p_mpsofilt_hint,uint32_t event)3783 mptcp_subflow_failover_ev(struct mptses *mpte, struct mptsub *mpts,
3784 uint32_t *p_mpsofilt_hint, uint32_t event)
3785 {
3786 #pragma unused(event, p_mpsofilt_hint)
3787 struct mptsub *mpts_alt = NULL;
3788 struct socket *alt_so = NULL;
3789 struct socket *mp_so;
3790 int altpath_exists = 0;
3791
3792 mp_so = mptetoso(mpte);
3793 os_log_info(mptcp_log_handle, "%s - %lx\n", __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
3794
3795 mptcp_reinject_mbufs(mpts->mpts_socket);
3796
3797 mpts_alt = mptcp_get_subflow(mpte, NULL);
3798
3799 /* If there is no alternate eligible subflow, ignore the failover hint. */
3800 if (mpts_alt == NULL || mpts_alt == mpts) {
3801 os_log(mptcp_log_handle, "%s - %lx no alternate path\n", __func__,
3802 (unsigned long)VM_KERNEL_ADDRPERM(mpte));
3803
3804 goto done;
3805 }
3806
3807 altpath_exists = 1;
3808 alt_so = mpts_alt->mpts_socket;
3809 if (mpts_alt->mpts_flags & MPTSF_FAILINGOVER) {
3810 /* All data acknowledged and no RTT spike */
3811 if (alt_so->so_snd.sb_cc == 0 && mptcp_no_rto_spike(alt_so)) {
3812 mpts_alt->mpts_flags &= ~MPTSF_FAILINGOVER;
3813 } else {
3814 /* no alternate path available */
3815 altpath_exists = 0;
3816 }
3817 }
3818
3819 if (altpath_exists) {
3820 mpts_alt->mpts_flags |= MPTSF_ACTIVE;
3821
3822 mpte->mpte_active_sub = mpts_alt;
3823 mpts->mpts_flags |= MPTSF_FAILINGOVER;
3824 mpts->mpts_flags &= ~MPTSF_ACTIVE;
3825
3826 os_log_info(mptcp_log_handle, "%s - %lx: switched from %d to %d\n",
3827 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpts->mpts_connid, mpts_alt->mpts_connid);
3828
3829 mptcpstats_inc_switch(mpte, mpts);
3830
3831 sowwakeup(alt_so);
3832 } else {
3833 done:
3834 mpts->mpts_socket->so_flags &= ~SOF_MP_TRYFAILOVER;
3835 }
3836
3837 return MPTS_EVRET_OK;
3838 }
3839
3840 /*
3841 * Handle SO_FILT_HINT_IFDENIED subflow socket event.
3842 */
3843 static ev_ret_t
mptcp_subflow_ifdenied_ev(struct mptses * mpte,struct mptsub * mpts,uint32_t * p_mpsofilt_hint,uint32_t event)3844 mptcp_subflow_ifdenied_ev(struct mptses *mpte, struct mptsub *mpts,
3845 uint32_t *p_mpsofilt_hint, uint32_t event)
3846 {
3847 /*
3848 * The subflow connection cannot use the outgoing interface, let's
3849 * close this subflow.
3850 */
3851 mptcp_subflow_abort(mpts, EPERM);
3852
3853 mptcp_subflow_propagate_ev(mpte, mpts, p_mpsofilt_hint, event);
3854
3855 return MPTS_EVRET_DELETE;
3856 }
3857
3858 /*
3859 * https://tools.ietf.org/html/rfc6052#section-2
3860 * https://tools.ietf.org/html/rfc6147#section-5.2
3861 */
3862 static boolean_t
mptcp_desynthesize_ipv6_addr(struct mptses * mpte,const struct in6_addr * addr0,const struct ipv6_prefix * prefix,struct in_addr * addrv4_0)3863 mptcp_desynthesize_ipv6_addr(struct mptses *mpte, const struct in6_addr *addr0,
3864 const struct ipv6_prefix *prefix,
3865 struct in_addr *addrv4_0)
3866 {
3867 char buf[MAX_IPv4_STR_LEN];
3868 const struct in6_addr *addr = addr0;
3869 const char *ptr = (const char *)addr;
3870 struct in_addr *addrv4 = addrv4_0;
3871 char *ptrv4 = (char *)addrv4;
3872
3873 if (memcmp(ptr, &prefix->ipv6_prefix, prefix->prefix_len) != 0) {
3874 return false;
3875 }
3876
3877 switch (prefix->prefix_len) {
3878 case NAT64_PREFIX_LEN_96:
3879 memcpy(ptrv4, ptr + 12, 4);
3880 break;
3881 case NAT64_PREFIX_LEN_64:
3882 memcpy(ptrv4, ptr + 9, 4);
3883 break;
3884 case NAT64_PREFIX_LEN_56:
3885 memcpy(ptrv4, ptr + 7, 1);
3886 memcpy(ptrv4 + 1, ptr + 9, 3);
3887 break;
3888 case NAT64_PREFIX_LEN_48:
3889 memcpy(ptrv4, ptr + 6, 2);
3890 memcpy(ptrv4 + 2, ptr + 9, 2);
3891 break;
3892 case NAT64_PREFIX_LEN_40:
3893 memcpy(ptrv4, ptr + 5, 3);
3894 memcpy(ptrv4 + 3, ptr + 9, 1);
3895 break;
3896 case NAT64_PREFIX_LEN_32:
3897 memcpy(ptrv4, ptr + 4, 4);
3898 break;
3899 default:
3900 panic("NAT64-prefix len is wrong: %u",
3901 prefix->prefix_len);
3902 }
3903
3904 os_log_info(mptcp_log_handle, "%s - %lx: desynthesized to %s\n", __func__,
3905 (unsigned long)VM_KERNEL_ADDRPERM(mpte),
3906 inet_ntop(AF_INET, (void *)addrv4, buf, sizeof(buf)));
3907
3908 return true;
3909 }
3910
3911 static void
mptcp_handle_ipv6_connection(struct mptses * mpte,const struct mptsub * mpts)3912 mptcp_handle_ipv6_connection(struct mptses *mpte, const struct mptsub *mpts)
3913 {
3914 struct ipv6_prefix nat64prefixes[NAT64_MAX_NUM_PREFIXES];
3915 struct socket *so = mpts->mpts_socket;
3916 struct ifnet *ifp;
3917 int j;
3918
3919 /* Subflow IPs will be steered directly by the server - no need to
3920 * desynthesize.
3921 */
3922 if (mpte->mpte_flags & MPTE_UNICAST_IP) {
3923 return;
3924 }
3925
3926 ifp = sotoinpcb(so)->inp_last_outifp;
3927
3928 if (ifnet_get_nat64prefix(ifp, nat64prefixes) == ENOENT) {
3929 return;
3930 }
3931
3932 for (j = 0; j < NAT64_MAX_NUM_PREFIXES; j++) {
3933 int success;
3934
3935 if (nat64prefixes[j].prefix_len == 0) {
3936 continue;
3937 }
3938
3939 success = mptcp_desynthesize_ipv6_addr(mpte,
3940 &mpte->__mpte_dst_v6.sin6_addr,
3941 &nat64prefixes[j],
3942 &mpte->mpte_sub_dst_v4.sin_addr);
3943 if (success) {
3944 mpte->mpte_sub_dst_v4.sin_len = sizeof(mpte->mpte_sub_dst_v4);
3945 mpte->mpte_sub_dst_v4.sin_family = AF_INET;
3946 mpte->mpte_sub_dst_v4.sin_port = mpte->__mpte_dst_v6.sin6_port;
3947
3948 /*
3949 * We connected to a NAT64'ed address. Let's remove it
3950 * from the potential IPs to use. Whenever we are back on
3951 * that network and need to connect, we can synthesize again.
3952 *
3953 * Otherwise, on different IPv6 networks we will attempt
3954 * to connect to that NAT64 address...
3955 */
3956 memset(&mpte->mpte_sub_dst_v6, 0, sizeof(mpte->mpte_sub_dst_v6));
3957 break;
3958 }
3959 }
3960 }
3961
3962 static void
mptcp_try_alternate_port(struct mptses * mpte,struct mptsub * mpts)3963 mptcp_try_alternate_port(struct mptses *mpte, struct mptsub *mpts)
3964 {
3965 struct inpcb *inp;
3966
3967 if (!mptcp_ok_to_create_subflows(mpte->mpte_mptcb)) {
3968 return;
3969 }
3970
3971 inp = sotoinpcb(mpts->mpts_socket);
3972 if (inp == NULL) {
3973 return;
3974 }
3975
3976 /* Should we try the alternate port? */
3977 if (mpte->mpte_alternate_port &&
3978 inp->inp_fport != mpte->mpte_alternate_port) {
3979 union sockaddr_in_4_6 dst;
3980 struct sockaddr_in *dst_in = SIN(&dst);
3981
3982 SOCKADDR_COPY(&mpts->mpts_dst, &dst, mpts->mpts_dst.sa_len);
3983
3984 dst_in->sin_port = mpte->mpte_alternate_port;
3985
3986 mptcp_subflow_add(mpte, NULL, SA(&dst), mpts->mpts_ifscope, NULL);
3987 } else { /* Else, we tried all we could, mark this interface as non-MPTCP */
3988 unsigned int i;
3989
3990 if (inp->inp_last_outifp == NULL) {
3991 return;
3992 }
3993
3994 for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
3995 struct mpt_itf_info *info = &mpte->mpte_itfinfo[i];
3996
3997 if (inp->inp_last_outifp->if_index == info->ifindex) {
3998 info->no_mptcp_support = 1;
3999 break;
4000 }
4001 }
4002 }
4003 }
4004
/* If TFO data is succesfully acked, it must be dropped from the mptcp so */
static void
mptcp_drop_tfo_data(struct mptses *mpte, struct mptsub *mpts)
{
	struct socket *mp_so = mptetoso(mpte);
	struct socket *so = mpts->mpts_socket;
	struct tcpcb *tp = intotcpcb(sotoinpcb(so));
	struct mptcb *mp_tp = mpte->mpte_mptcb;

	/* If data was sent with SYN, rewind state */
	if (tp->t_tfo_stats & TFO_S_SYN_DATA_ACKED) {
		/* Bytes sent but not yet acked at the MPTCP data level */
		u_int64_t mp_droplen = mp_tp->mpt_sndnxt - mp_tp->mpt_snduna;
		/* Bytes the subflow acked beyond its SYN (the SYN takes one seq) */
		unsigned int tcp_droplen = tp->snd_una - tp->iss - 1;

		VERIFY(mp_droplen <= (UINT_MAX));
		VERIFY(mp_droplen >= tcp_droplen);

		mpts->mpts_flags &= ~MPTSF_TFO_REQD;
		mpts->mpts_iss += tcp_droplen;
		tp->t_mpflags &= ~TMPF_TFO_REQUEST;

		if (mp_droplen > tcp_droplen) {
			/* handle partial TCP ack */
			mp_so->so_flags1 |= SOF1_TFO_REWIND;
			mp_tp->mpt_sndnxt = mp_tp->mpt_snduna + (mp_droplen - tcp_droplen);
			/* Only the TCP-acked part is dropped from the send buffer. */
			mp_droplen = tcp_droplen;
		} else {
			/* all data on SYN was acked */
			mpts->mpts_rel_seq = 1;
			mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;
		}
		mp_tp->mpt_sndmax -= tcp_droplen;

		if (mp_droplen != 0) {
			/* Remove the acked bytes from the MPTCP-level send buffer. */
			VERIFY(mp_so->so_snd.sb_mb != NULL);
			sbdrop(&mp_so->so_snd, (int)mp_droplen);
		}
	}
}
4044
4045 /*
4046 * Handle SO_FILT_HINT_CONNECTED subflow socket event.
4047 */
4048 static ev_ret_t
mptcp_subflow_connected_ev(struct mptses * mpte,struct mptsub * mpts,uint32_t * p_mpsofilt_hint,uint32_t event)4049 mptcp_subflow_connected_ev(struct mptses *mpte, struct mptsub *mpts,
4050 uint32_t *p_mpsofilt_hint, uint32_t event)
4051 {
4052 #pragma unused(event, p_mpsofilt_hint)
4053 struct socket *mp_so, *so;
4054 struct inpcb *inp;
4055 struct tcpcb *tp;
4056 struct mptcb *mp_tp;
4057 int af;
4058 boolean_t mpok = FALSE;
4059
4060 mp_so = mptetoso(mpte);
4061 mp_tp = mpte->mpte_mptcb;
4062 so = mpts->mpts_socket;
4063 tp = sototcpcb(so);
4064 af = mpts->mpts_dst.sa_family;
4065
4066 if (mpts->mpts_flags & MPTSF_CONNECTED) {
4067 return MPTS_EVRET_OK;
4068 }
4069
4070 if ((mpts->mpts_flags & MPTSF_DISCONNECTED) ||
4071 (mpts->mpts_flags & MPTSF_DISCONNECTING)) {
4072 return MPTS_EVRET_OK;
4073 }
4074
4075 /*
4076 * The subflow connection has been connected. Find out whether it
4077 * is connected as a regular TCP or as a MPTCP subflow. The idea is:
4078 *
4079 * a. If MPTCP connection is not yet established, then this must be
4080 * the first subflow connection. If MPTCP failed to negotiate,
4081 * fallback to regular TCP by degrading this subflow.
4082 *
4083 * b. If MPTCP connection has been established, then this must be
4084 * one of the subsequent subflow connections. If MPTCP failed
4085 * to negotiate, disconnect the connection.
4086 *
4087 * Right now, we simply unblock any waiters at the MPTCP socket layer
4088 * if the MPTCP connection has not been established.
4089 */
4090
4091 if (so->so_state & SS_ISDISCONNECTED) {
4092 /*
4093 * With MPTCP joins, a connection is connected at the subflow
4094 * level, but the 4th ACK from the server elevates the MPTCP
4095 * subflow to connected state. So there is a small window
4096 * where the subflow could get disconnected before the
4097 * connected event is processed.
4098 */
4099 return MPTS_EVRET_OK;
4100 }
4101
4102 if (mpts->mpts_flags & MPTSF_TFO_REQD) {
4103 mptcp_drop_tfo_data(mpte, mpts);
4104 }
4105
4106 mpts->mpts_flags &= ~(MPTSF_CONNECTING | MPTSF_TFO_REQD);
4107 mpts->mpts_flags |= MPTSF_CONNECTED;
4108
4109 if (tp->t_mpflags & TMPF_MPTCP_TRUE) {
4110 mpts->mpts_flags |= MPTSF_MP_CAPABLE;
4111 }
4112
4113 tp->t_mpflags &= ~TMPF_TFO_REQUEST;
4114
4115 /* get/verify the outbound interface */
4116 inp = sotoinpcb(so);
4117
4118 mpts->mpts_maxseg = tp->t_maxseg;
4119
4120 mpok = (mpts->mpts_flags & MPTSF_MP_CAPABLE);
4121
4122 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
4123 mp_tp->mpt_state = MPTCPS_ESTABLISHED;
4124 mpte->mpte_associd = mpts->mpts_connid;
4125 DTRACE_MPTCP2(state__change,
4126 struct mptcb *, mp_tp,
4127 uint32_t, 0 /* event */);
4128
4129 if (SOCK_DOM(so) == AF_INET) {
4130 in_getsockaddr_s(so, &mpte->__mpte_src_v4);
4131 } else {
4132 in6_getsockaddr_s(so, &mpte->__mpte_src_v6);
4133 }
4134
4135 mpts->mpts_flags |= MPTSF_ACTIVE;
4136
4137 /* case (a) above */
4138 if (!mpok) {
4139 tcpstat.tcps_mpcap_fallback++;
4140
4141 tp->t_mpflags |= TMPF_INFIN_SENT;
4142 mptcp_notify_mpfail(so);
4143 } else {
4144 if (IFNET_IS_CELLULAR(inp->inp_last_outifp) &&
4145 mptcp_subflows_need_backup_flag(mpte)) {
4146 tp->t_mpflags |= (TMPF_BACKUP_PATH | TMPF_SND_MPPRIO);
4147 } else {
4148 mpts->mpts_flags |= MPTSF_PREFERRED;
4149 }
4150 mpts->mpts_flags |= MPTSF_MPCAP_CTRSET;
4151 mpte->mpte_nummpcapflows++;
4152
4153 if (SOCK_DOM(so) == AF_INET6) {
4154 mptcp_handle_ipv6_connection(mpte, mpts);
4155 }
4156
4157 mptcp_check_subflows_and_add(mpte);
4158
4159 if (IFNET_IS_CELLULAR(inp->inp_last_outifp)) {
4160 mpte->mpte_initial_cell = 1;
4161 }
4162
4163 mpte->mpte_handshake_success = 1;
4164 }
4165
4166 mp_tp->mpt_sndwnd = tp->snd_wnd;
4167 mp_tp->mpt_sndwl1 = mp_tp->mpt_rcvnxt;
4168 mp_tp->mpt_sndwl2 = mp_tp->mpt_snduna;
4169 soisconnected(mp_so);
4170 } else if (mpok) {
4171 /*
4172 * case (b) above
4173 * In case of additional flows, the MPTCP socket is not
4174 * MPTSF_MP_CAPABLE until an ACK is received from server
4175 * for 3-way handshake. TCP would have guaranteed that this
4176 * is an MPTCP subflow.
4177 */
4178 if (IFNET_IS_CELLULAR(inp->inp_last_outifp) &&
4179 !(tp->t_mpflags & TMPF_BACKUP_PATH) &&
4180 mptcp_subflows_need_backup_flag(mpte)) {
4181 tp->t_mpflags |= (TMPF_BACKUP_PATH | TMPF_SND_MPPRIO);
4182 mpts->mpts_flags &= ~MPTSF_PREFERRED;
4183 } else {
4184 mpts->mpts_flags |= MPTSF_PREFERRED;
4185 }
4186
4187 mpts->mpts_flags |= MPTSF_MPCAP_CTRSET;
4188 mpte->mpte_nummpcapflows++;
4189
4190 mpts->mpts_rel_seq = 1;
4191
4192 mptcp_check_subflows_and_remove(mpte);
4193 } else {
4194 mptcp_try_alternate_port(mpte, mpts);
4195
4196 tcpstat.tcps_join_fallback++;
4197 if (IFNET_IS_CELLULAR(inp->inp_last_outifp)) {
4198 tcpstat.tcps_mptcp_cell_proxy++;
4199 } else {
4200 tcpstat.tcps_mptcp_wifi_proxy++;
4201 }
4202
4203 soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
4204
4205 return MPTS_EVRET_OK;
4206 }
4207
4208 /* This call, just to "book" an entry in the stats-table for this ifindex */
4209 mptcpstats_get_index(mpte->mpte_itfstats, MPTCP_ITFSTATS_SIZE, mpts);
4210
4211 mptcp_output(mpte);
4212
4213 return MPTS_EVRET_OK; /* keep the subflow socket around */
4214 }
4215
4216 /*
4217 * Handle SO_FILT_HINT_DISCONNECTED subflow socket event.
4218 */
static ev_ret_t
mptcp_subflow_disconnected_ev(struct mptses *mpte, struct mptsub *mpts,
    uint32_t *p_mpsofilt_hint, uint32_t event)
{
#pragma unused(event, p_mpsofilt_hint)
	struct socket *mp_so, *so;
	struct mptcb *mp_tp;

	mp_so = mptetoso(mpte);
	mp_tp = mpte->mpte_mptcb;
	so = mpts->mpts_socket;

	/* Already marked disconnected: nothing left to do but delete. */
	if (mpts->mpts_flags & MPTSF_DISCONNECTED) {
		return MPTS_EVRET_DELETE;
	}

	mpts->mpts_flags |= MPTSF_DISCONNECTED;

	/* The subflow connection has been disconnected. */

	if (mpts->mpts_flags & MPTSF_MPCAP_CTRSET) {
		/* Subflow was counted as MP-capable: unaccount it. */
		mpte->mpte_nummpcapflows--;
		if (mpte->mpte_active_sub == mpts) {
			mpte->mpte_active_sub = NULL;
		}
		mpts->mpts_flags &= ~MPTSF_MPCAP_CTRSET;
	} else {
		/*
		 * A secondary subflow that never reached the connected
		 * state: retry the join on an alternate port.
		 */
		if (so->so_flags & SOF_MP_SEC_SUBFLOW &&
		    !(mpts->mpts_flags & MPTSF_CONNECTED)) {
			mptcp_try_alternate_port(mpte, mpts);
		}
	}

	/*
	 * Drop the whole MPTCP connection if it never got established,
	 * or if we had fallen back to TCP on this (active) subflow.
	 */
	if (mp_tp->mpt_state < MPTCPS_ESTABLISHED ||
	    ((mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) && (mpts->mpts_flags & MPTSF_ACTIVE))) {
		mptcp_drop(mpte, mp_tp, so->so_error);
	}

	/*
	 * Clear flags that are used by getconninfo to return state.
	 * Retain like MPTSF_DELETEOK for internal purposes.
	 */
	mpts->mpts_flags &= ~(MPTSF_CONNECTING | MPTSF_CONNECT_PENDING |
	    MPTSF_CONNECTED | MPTSF_DISCONNECTING | MPTSF_PREFERRED |
	    MPTSF_MP_CAPABLE | MPTSF_MP_READY | MPTSF_MP_DEGRADED | MPTSF_ACTIVE);

	return MPTS_EVRET_DELETE;
}
4267
4268 /*
4269 * Handle SO_FILT_HINT_MPSTATUS subflow socket event
4270 */
static ev_ret_t
mptcp_subflow_mpstatus_ev(struct mptses *mpte, struct mptsub *mpts,
    uint32_t *p_mpsofilt_hint, uint32_t event)
{
#pragma unused(event, p_mpsofilt_hint)
	ev_ret_t ret = MPTS_EVRET_OK;
	struct socket *mp_so, *so;
	struct mptcb *mp_tp;

	mp_so = mptetoso(mpte);
	mp_tp = mpte->mpte_mptcb;
	so = mpts->mpts_socket;
	struct inpcb *inp = sotoinpcb(so);
	struct tcpcb *tp = intotcpcb(inp);

	/* Mirror the TCP-level MPTCP negotiation result into subflow flags. */
	if (sototcpcb(so)->t_mpflags & TMPF_MPTCP_TRUE) {
		mpts->mpts_flags |= MPTSF_MP_CAPABLE;
	} else {
		mpts->mpts_flags &= ~MPTSF_MP_CAPABLE;
	}

	if (sototcpcb(so)->t_mpflags & TMPF_TCP_FALLBACK) {
		/* Already degraded; nothing further to update. */
		if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
			goto done;
		}
		mpts->mpts_flags |= MPTSF_MP_DEGRADED;
	} else {
		mpts->mpts_flags &= ~MPTSF_MP_DEGRADED;
	}

	if (sototcpcb(so)->t_mpflags & TMPF_MPTCP_READY) {
		mpts->mpts_flags |= MPTSF_MP_READY;
	} else {
		mpts->mpts_flags &= ~MPTSF_MP_READY;
	}

	if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
		/* Record the fallback at the MPTCP level and in the TCP cache. */
		mp_tp->mpt_flags |= MPTCPF_FALLBACK_TO_TCP;
		mp_tp->mpt_flags &= ~MPTCPF_JOIN_READY;
		tcp_cache_update_mptcp_version(tp, FALSE);
	}

	if (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) {
		ret = MPTS_EVRET_DISCONNECT_FALLBACK;

		/* Reinjection is meaningless once we fell back to plain TCP. */
		m_freem_list(mpte->mpte_reinjectq);
		mpte->mpte_reinjectq = NULL;
	} else if (mpts->mpts_flags & MPTSF_MP_READY) {
		mp_tp->mpt_flags |= MPTCPF_JOIN_READY;
		ret = MPTS_EVRET_CONNECT_PENDING;
	}

done:
	return ret;
}
4326
4327 /*
4328 * Handle SO_FILT_HINT_MUSTRST subflow socket event
4329 */
static ev_ret_t
mptcp_subflow_mustrst_ev(struct mptses *mpte, struct mptsub *mpts,
    uint32_t *p_mpsofilt_hint, uint32_t event)
{
#pragma unused(event)
	struct socket *mp_so, *so;
	struct mptcb *mp_tp;
	boolean_t is_fastclose;

	mp_so = mptetoso(mpte);
	mp_tp = mpte->mpte_mptcb;
	so = mpts->mpts_socket;

	/* We got an invalid option or a fast close */
	struct inpcb *inp = sotoinpcb(so);
	struct tcpcb *tp = NULL;

	tp = intotcpcb(inp);
	so->so_error = ECONNABORTED;

	is_fastclose = !!(tp->t_mpflags & TMPF_FASTCLOSERCV);

	tp->t_mpflags |= TMPF_RESET;

	/* Send an explicit RST if this subflow is not yet closed. */
	if (tp->t_state != TCPS_CLOSED) {
		mbuf_ref_t m;
		struct tcptemp *t_template = tcp_maketemplate(tp, &m, NULL, NULL);

		if (t_template) {
			struct tcp_respond_args tra;

			bzero(&tra, sizeof(tra));
			/* Scope the RST to the bound interface, if any. */
			if (inp->inp_flags & INP_BOUND_IF) {
				tra.ifscope = inp->inp_boundifp->if_index;
			} else {
				tra.ifscope = IFSCOPE_NONE;
			}
			tra.awdl_unrestricted = 1;

			tcp_respond(tp, t_template->tt_ipgen, sizeof(t_template->tt_ipgen),
			    &t_template->tt_t, (struct mbuf *)NULL,
			    tp->rcv_nxt, tp->snd_una, 0, TH_RST, NULL, 0, 0, 0, &tra, false);
			(void) m_free(m);
		}
	}

	/*
	 * A FASTCLOSE received before any fallback tears down the whole
	 * MPTCP connection: abort every other subflow and report a
	 * connection reset to the application.
	 */
	if (!(mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) && is_fastclose) {
		struct mptsub *iter, *tmp;

		*p_mpsofilt_hint |= SO_FILT_HINT_CONNRESET;

		mp_so->so_error = ECONNRESET;

		TAILQ_FOREACH_SAFE(iter, &mpte->mpte_subflows, mpts_entry, tmp) {
			if (iter == mpts) {
				continue;
			}
			mptcp_subflow_abort(iter, ECONNABORTED);
		}

		/*
		 * mptcp_drop is being called after processing the events, to fully
		 * close the MPTCP connection
		 */
		mptcp_drop(mpte, mp_tp, mp_so->so_error);
	}

	mptcp_subflow_abort(mpts, ECONNABORTED);

	/* Speed up garbage collection of this connection. */
	if (mp_tp->mpt_gc_ticks == MPT_GC_TICKS) {
		mp_tp->mpt_gc_ticks = MPT_GC_TICKS_FAST;
	}

	return MPTS_EVRET_DELETE;
}
4405
4406 static ev_ret_t
mptcp_subflow_adaptive_rtimo_ev(struct mptses * mpte,struct mptsub * mpts,uint32_t * p_mpsofilt_hint,uint32_t event)4407 mptcp_subflow_adaptive_rtimo_ev(struct mptses *mpte, struct mptsub *mpts,
4408 uint32_t *p_mpsofilt_hint, uint32_t event)
4409 {
4410 #pragma unused(event)
4411 bool found_active = false;
4412
4413 mpts->mpts_flags |= MPTSF_READ_STALL;
4414
4415 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
4416 struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
4417
4418 if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
4419 TCPS_HAVERCVDFIN2(tp->t_state)) {
4420 continue;
4421 }
4422
4423 if (!(mpts->mpts_flags & MPTSF_READ_STALL)) {
4424 found_active = true;
4425 break;
4426 }
4427 }
4428
4429 if (!found_active) {
4430 *p_mpsofilt_hint |= SO_FILT_HINT_ADAPTIVE_RTIMO;
4431 }
4432
4433 return MPTS_EVRET_OK;
4434 }
4435
4436 static ev_ret_t
mptcp_subflow_adaptive_wtimo_ev(struct mptses * mpte,struct mptsub * mpts,uint32_t * p_mpsofilt_hint,uint32_t event)4437 mptcp_subflow_adaptive_wtimo_ev(struct mptses *mpte, struct mptsub *mpts,
4438 uint32_t *p_mpsofilt_hint, uint32_t event)
4439 {
4440 #pragma unused(event)
4441 bool found_active = false;
4442
4443 mpts->mpts_flags |= MPTSF_WRITE_STALL;
4444
4445 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
4446 struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
4447
4448 if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
4449 tp->t_state > TCPS_CLOSE_WAIT) {
4450 continue;
4451 }
4452
4453 if (!(mpts->mpts_flags & MPTSF_WRITE_STALL)) {
4454 found_active = true;
4455 break;
4456 }
4457 }
4458
4459 if (!found_active) {
4460 *p_mpsofilt_hint |= SO_FILT_HINT_ADAPTIVE_WTIMO;
4461 }
4462
4463 return MPTS_EVRET_OK;
4464 }
4465
4466 /*
4467 * Issues SOPT_SET on an MPTCP subflow socket; socket must already be locked,
4468 * caller must ensure that the option can be issued on subflow sockets, via
4469 * MPOF_SUBFLOW_OK flag.
4470 */
int
mptcp_subflow_sosetopt(struct mptses *mpte, struct mptsub *mpts, struct mptopt *mpo)
{
	struct socket *mp_so, *so;
	struct sockopt sopt;
	int error;

	VERIFY(mpo->mpo_flags & MPOF_SUBFLOW_OK);

	mp_so = mptetoso(mpte);
	so = mpts->mpts_socket;

	/* Don't try to apply an IP or IPv6 option on an IPv6 or IP socket */
	if (mpo->mpo_level == IPPROTO_IP && SOCK_CHECK_DOM(so, PF_INET6)) {
		return 0;
	}
	if (mpo->mpo_level == IPPROTO_IPV6 && SOCK_CHECK_DOM(so, PF_INET)) {
		return 0;
	}

	socket_lock_assert_owned(mp_so);

	if (mpte->mpte_mptcb->mpt_state >= MPTCPS_ESTABLISHED &&
	    mpo->mpo_level == SOL_SOCKET &&
	    mpo->mpo_name == SO_MARK_CELLFALLBACK) {
		struct ifnet *ifp = ifindex2ifnet[mpts->mpts_ifscope];

		/*
		 * When we open a new subflow, mark it as cell fallback, if
		 * this subflow goes over cell.
		 *
		 * (except for first-party apps)
		 */

		if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
			return 0;
		}

		/* Outbound interface known and not cellular: not a fallback. */
		if (sotoinpcb(so)->inp_last_outifp &&
		    !IFNET_IS_CELLULAR(sotoinpcb(so)->inp_last_outifp)) {
			return 0;
		}

		/*
		 * This here is an OR, because if the app is not binding to the
		 * interface, then it definitely is not a cell-fallback
		 * connection.
		 */
		if (mpts->mpts_ifscope == IFSCOPE_NONE || ifp == NULL ||
		    !IFNET_IS_CELLULAR(ifp)) {
			return 0;
		}
	}

	/* The option is actually being applied now; no longer interim. */
	mpo->mpo_flags &= ~MPOF_INTERIM;

	bzero(&sopt, sizeof(sopt));
	sopt.sopt_dir = SOPT_SET;
	sopt.sopt_level = mpo->mpo_level;
	sopt.sopt_name = mpo->mpo_name;
	sopt.sopt_val = CAST_USER_ADDR_T(&mpo->mpo_intval);
	sopt.sopt_valsize = sizeof(int);
	sopt.sopt_p = kernproc;

	error = sosetoptlock(so, &sopt, 0);
	if (error) {
		os_log_error(mptcp_log_handle, "%s - %lx: sopt %s "
		    "val %d set error %d\n", __func__,
		    (unsigned long)VM_KERNEL_ADDRPERM(mpte),
		    mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name),
		    mpo->mpo_intval, error);
	}
	return error;
}
4545
4546 /*
4547 * Issues SOPT_GET on an MPTCP subflow socket; socket must already be locked,
4548 * caller must ensure that the option can be issued on subflow sockets, via
4549 * MPOF_SUBFLOW_OK flag.
4550 */
4551 int
mptcp_subflow_sogetopt(struct mptses * mpte,struct socket * so,struct mptopt * mpo)4552 mptcp_subflow_sogetopt(struct mptses *mpte, struct socket *so,
4553 struct mptopt *mpo)
4554 {
4555 struct socket *mp_so;
4556 struct sockopt sopt;
4557 int error;
4558
4559 VERIFY(mpo->mpo_flags & MPOF_SUBFLOW_OK);
4560 mp_so = mptetoso(mpte);
4561
4562 socket_lock_assert_owned(mp_so);
4563
4564 bzero(&sopt, sizeof(sopt));
4565 sopt.sopt_dir = SOPT_GET;
4566 sopt.sopt_level = mpo->mpo_level;
4567 sopt.sopt_name = mpo->mpo_name;
4568 sopt.sopt_val = CAST_USER_ADDR_T(&mpo->mpo_intval);
4569 sopt.sopt_valsize = sizeof(int);
4570 sopt.sopt_p = kernproc;
4571
4572 error = sogetoptlock(so, &sopt, 0); /* already locked */
4573 if (error) {
4574 os_log_error(mptcp_log_handle,
4575 "%s - %lx: sopt %s get error %d\n",
4576 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
4577 mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name), error);
4578 }
4579 return error;
4580 }
4581
4582
4583 /*
4584 * MPTCP garbage collector.
4585 *
4586 * This routine is called by the MP domain on-demand, periodic callout,
4587 * which is triggered when a MPTCP socket is closed. The callout will
4588 * repeat as long as this routine returns a non-zero value.
4589 */
static uint32_t
mptcp_gc(struct mppcbinfo *mppi)
{
	struct mppcb *mpp, *tmpp;
	uint32_t active = 0;

	LCK_MTX_ASSERT(&mppi->mppi_lock, LCK_MTX_ASSERT_OWNED);

	TAILQ_FOREACH_SAFE(mpp, &mppi->mppi_pcbs, mpp_entry, tmpp) {
		struct socket *mp_so;
		struct mptses *mpte;
		struct mptcb *mp_tp;

		mp_so = mpp->mpp_socket;
		mpte = mptompte(mpp);
		mp_tp = mpte->mpte_mptcb;

		/* Couldn't take the PCB lock without blocking: retry later. */
		if (!mpp_try_lock(mpp)) {
			active++;
			continue;
		}

		VERIFY(mpp->mpp_flags & MPP_ATTACHED);

		/* check again under the lock */
		if (mp_so->so_usecount > 0) {
			boolean_t wakeup = FALSE;
			struct mptsub *mpts, *tmpts;

			/*
			 * Count down the grace period; once it hits zero,
			 * nudge all subflows with a DISCONNECTED event so
			 * their remaining references get dropped.
			 */
			if (mp_tp->mpt_state >= MPTCPS_FIN_WAIT_1) {
				if (mp_tp->mpt_gc_ticks > 0) {
					mp_tp->mpt_gc_ticks--;
				}
				if (mp_tp->mpt_gc_ticks == 0) {
					wakeup = TRUE;
				}
			}
			if (wakeup) {
				TAILQ_FOREACH_SAFE(mpts,
				    &mpte->mpte_subflows, mpts_entry, tmpts) {
					mptcp_subflow_eupcall1(mpts->mpts_socket,
					    mpts, SO_FILT_HINT_DISCONNECTED);
				}
			}
			socket_unlock(mp_so, 0);
			active++;
			continue;
		}

		/* With no references left, the PCB must already be dead. */
		if (mpp->mpp_state != MPPCB_STATE_DEAD) {
			panic("%s - %lx: skipped state "
			    "[u=%d,r=%d,s=%d]\n", __func__,
			    (unsigned long)VM_KERNEL_ADDRPERM(mpte),
			    mp_so->so_usecount, mp_so->so_retaincnt,
			    mpp->mpp_state);
		}

		if (mp_tp->mpt_state == MPTCPS_TIME_WAIT) {
			mptcp_close(mpte, mp_tp);
		}

		mptcp_session_destroy(mpte);

		DTRACE_MPTCP4(dispose, struct socket *, mp_so,
		    struct sockbuf *, &mp_so->so_rcv,
		    struct sockbuf *, &mp_so->so_snd,
		    struct mppcb *, mpp);

		mptcp_pcbdispose(mpp);
		sodealloc(mp_so);
	}

	return active;
}
4664
4665 /*
4666 * Drop a MPTCP connection, reporting the specified error.
4667 */
4668 struct mptses *
mptcp_drop(struct mptses * mpte,struct mptcb * mp_tp,u_short errno)4669 mptcp_drop(struct mptses *mpte, struct mptcb *mp_tp, u_short errno)
4670 {
4671 struct socket *mp_so = mptetoso(mpte);
4672
4673 VERIFY(mpte->mpte_mptcb == mp_tp);
4674
4675 socket_lock_assert_owned(mp_so);
4676
4677 DTRACE_MPTCP2(state__change, struct mptcb *, mp_tp,
4678 uint32_t, 0 /* event */);
4679
4680 if (errno == ETIMEDOUT && mp_tp->mpt_softerror != 0) {
4681 errno = mp_tp->mpt_softerror;
4682 }
4683 mp_so->so_error = errno;
4684
4685 return mptcp_close(mpte, mp_tp);
4686 }
4687
4688 /*
4689 * Close a MPTCP control block.
4690 */
struct mptses *
mptcp_close(struct mptses *mpte, struct mptcb *mp_tp)
{
	struct mptsub *mpts = NULL, *tmpts = NULL;
	struct socket *mp_so = mptetoso(mpte);

	socket_lock_assert_owned(mp_so);
	VERIFY(mpte->mpte_mptcb == mp_tp);

	mp_tp->mpt_state = MPTCPS_TERMINATE;

	/* Release any segments still queued at the MPTCP level. */
	mptcp_freeq(mp_tp);

	soisdisconnected(mp_so);

	/* Clean up all subflows */
	TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
		mptcp_subflow_disconnect(mpte, mpts);
	}

	return NULL;
}
4713
/* Post a DISCONNECTED event on the given (already locked) socket. */
void
mptcp_notify_close(struct socket *so)
{
	soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_DISCONNECTED));
}
4719
/*
 * One table entry mapping a SO_FILT_HINT_* event bit to its handler.
 */
typedef struct mptcp_subflow_event_entry {
	uint32_t sofilt_hint_mask;      /* event bit(s) this entry handles */
	ev_ret_t (*sofilt_hint_ev_hdlr)(        /* handler for the event */
		struct mptses *mpte,
		struct mptsub *mpts,
		uint32_t *p_mpsofilt_hint,
		uint32_t event);
} mptsub_ev_entry_t;
4728
4729 /*
4730 * XXX The order of the event handlers below is really
4731 * really important. Think twice before changing it.
4732 */
static mptsub_ev_entry_t mpsub_ev_entry_tbl[] = {
	/* Subflow-fatal and error events come first. */
	{
		.sofilt_hint_mask = SO_FILT_HINT_MP_SUB_ERROR,
		.sofilt_hint_ev_hdlr = mptcp_subflow_mpsuberror_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_MPCANTRCVMORE,
		.sofilt_hint_ev_hdlr = mptcp_subflow_mpcantrcvmore_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_MPFAILOVER,
		.sofilt_hint_ev_hdlr = mptcp_subflow_failover_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_CONNRESET,
		.sofilt_hint_ev_hdlr = mptcp_subflow_propagate_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_MUSTRST,
		.sofilt_hint_ev_hdlr = mptcp_subflow_mustrst_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_CANTRCVMORE,
		.sofilt_hint_ev_hdlr = mptcp_subflow_propagate_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_TIMEOUT,
		.sofilt_hint_ev_hdlr = mptcp_subflow_propagate_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_NOSRCADDR,
		.sofilt_hint_ev_hdlr = mptcp_subflow_nosrcaddr_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_IFDENIED,
		.sofilt_hint_ev_hdlr = mptcp_subflow_ifdenied_ev,
	},
	/* Connection progress and status updates. */
	{
		.sofilt_hint_mask = SO_FILT_HINT_CONNECTED,
		.sofilt_hint_ev_hdlr = mptcp_subflow_connected_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_MPSTATUS,
		.sofilt_hint_ev_hdlr = mptcp_subflow_mpstatus_ev,
	},
	/* DISCONNECTED runs late; mptcp_subflow_events always executes it. */
	{
		.sofilt_hint_mask = SO_FILT_HINT_DISCONNECTED,
		.sofilt_hint_ev_hdlr = mptcp_subflow_disconnected_ev,
	},
	/* Adaptive read/write timeouts are advisory and handled last. */
	{
		.sofilt_hint_mask = SO_FILT_HINT_ADAPTIVE_RTIMO,
		.sofilt_hint_ev_hdlr = mptcp_subflow_adaptive_rtimo_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_ADAPTIVE_WTIMO,
		.sofilt_hint_ev_hdlr = mptcp_subflow_adaptive_wtimo_ev,
	},
};
4791
4792 /*
4793 * Subflow socket control events.
4794 *
4795 * Called for handling events related to the underlying subflow socket.
4796 */
static ev_ret_t
mptcp_subflow_events(struct mptses *mpte, struct mptsub *mpts,
    uint32_t *p_mpsofilt_hint)
{
	ev_ret_t ret = MPTS_EVRET_OK;
	int i, mpsub_ev_entry_count = sizeof(mpsub_ev_entry_tbl) /
	    sizeof(mpsub_ev_entry_tbl[0]);

	/* bail if there's nothing to process */
	if (!mpts->mpts_evctl) {
		return ret;
	}

	/*
	 * Any event that can make this subflow unusable also implies a
	 * failover attempt onto another subflow.
	 */
	if (mpts->mpts_evctl & (SO_FILT_HINT_CONNRESET | SO_FILT_HINT_MUSTRST |
	    SO_FILT_HINT_CANTSENDMORE | SO_FILT_HINT_TIMEOUT |
	    SO_FILT_HINT_NOSRCADDR | SO_FILT_HINT_IFDENIED |
	    SO_FILT_HINT_DISCONNECTED)) {
		mpts->mpts_evctl |= SO_FILT_HINT_MPFAILOVER;
	}

	DTRACE_MPTCP3(subflow__events, struct mptses *, mpte,
	    struct mptsub *, mpts, uint32_t, mpts->mpts_evctl);

	/*
	 * Process all the socket filter hints and reset the hint
	 * once it is handled
	 */
	for (i = 0; i < mpsub_ev_entry_count && mpts->mpts_evctl; i++) {
		/*
		 * Always execute the DISCONNECTED event, because it will wakeup
		 * the app.
		 */
		if ((mpts->mpts_evctl & mpsub_ev_entry_tbl[i].sofilt_hint_mask) &&
		    (ret >= MPTS_EVRET_OK ||
		    mpsub_ev_entry_tbl[i].sofilt_hint_mask == SO_FILT_HINT_DISCONNECTED)) {
			mpts->mpts_evctl &= ~mpsub_ev_entry_tbl[i].sofilt_hint_mask;
			ev_ret_t error =
			    mpsub_ev_entry_tbl[i].sofilt_hint_ev_hdlr(mpte, mpts, p_mpsofilt_hint, mpsub_ev_entry_tbl[i].sofilt_hint_mask);
			/* Keep the most demanding non-error return value. */
			ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
		}
	}

	return ret;
}
4841
4842 /*
4843 * MPTCP workloop.
4844 */
void
mptcp_subflow_workloop(struct mptses *mpte)
{
	boolean_t connect_pending = FALSE, disconnect_fallback = FALSE;
	uint32_t mpsofilt_hint_mask = SO_FILT_HINT_LOCKED;
	struct mptsub *mpts, *tmpts;
	struct socket *mp_so;

	mp_so = mptetoso(mpte);

	socket_lock_assert_owned(mp_so);

	/*
	 * Single-entrant: if the workloop is already running, just flag a
	 * relaunch and let the running instance pick up the new work.
	 */
	if (mpte->mpte_flags & MPTE_IN_WORKLOOP) {
		mpte->mpte_flags |= MPTE_WORKLOOP_RELAUNCH;
		return;
	}
	mpte->mpte_flags |= MPTE_IN_WORKLOOP;

relaunch:
	mpte->mpte_flags &= ~MPTE_WORKLOOP_RELAUNCH;

	TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
		ev_ret_t ret;

		if (mpts->mpts_socket->so_usecount == 0) {
			/* Will be removed soon by tcp_garbage_collect */
			continue;
		}

		/* Hold the subflow and its socket across event processing. */
		mptcp_subflow_addref(mpts);
		mpts->mpts_socket->so_usecount++;

		ret = mptcp_subflow_events(mpte, mpts, &mpsofilt_hint_mask);

		/*
		 * If MPTCP socket is closed, disconnect all subflows.
		 * This will generate a disconnect event which will
		 * be handled during the next iteration, causing a
		 * non-zero error to be returned above.
		 */
		if (mp_so->so_flags & SOF_PCBCLEARING) {
			mptcp_subflow_disconnect(mpte, mpts);
		}

		switch (ret) {
		case MPTS_EVRET_OK:
			/* nothing to do */
			break;
		case MPTS_EVRET_DELETE:
			mptcp_subflow_soclose(mpts);
			break;
		case MPTS_EVRET_CONNECT_PENDING:
			connect_pending = TRUE;
			break;
		case MPTS_EVRET_DISCONNECT_FALLBACK:
			disconnect_fallback = TRUE;
			break;
		default:
			break;
		}
		mptcp_subflow_remref(mpts);             /* ours */

		VERIFY(mpts->mpts_socket->so_usecount != 0);
		mpts->mpts_socket->so_usecount--;
	}

	/* Propagate the accumulated event hints up to the MPTCP socket. */
	if (mpsofilt_hint_mask != SO_FILT_HINT_LOCKED) {
		VERIFY(mpsofilt_hint_mask & SO_FILT_HINT_LOCKED);

		if (mpsofilt_hint_mask & SO_FILT_HINT_CANTRCVMORE) {
			mp_so->so_state |= SS_CANTRCVMORE;
			sorwakeup(mp_so);
		}

		soevent(mp_so, mpsofilt_hint_mask);
	}

	if (!connect_pending && !disconnect_fallback) {
		goto exit;
	}

	/* Second pass: apply fallback degradation or pending joins. */
	TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
		if (disconnect_fallback) {
			struct socket *so = NULL;
			struct inpcb *inp = NULL;
			struct tcpcb *tp = NULL;

			if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
				continue;
			}

			mpts->mpts_flags |= MPTSF_MP_DEGRADED;

			if (mpts->mpts_flags & (MPTSF_DISCONNECTING |
			    MPTSF_DISCONNECTED)) {
				continue;
			}

			so = mpts->mpts_socket;

			/*
			 * The MPTCP connection has degraded to a fallback
			 * mode, so there is no point in keeping this subflow
			 * regardless of its MPTCP-readiness state, unless it
			 * is the primary one which we use for fallback. This
			 * assumes that the subflow used for fallback is the
			 * ACTIVE one.
			 */

			inp = sotoinpcb(so);
			tp = intotcpcb(inp);
			tp->t_mpflags &=
			    ~(TMPF_MPTCP_READY | TMPF_MPTCP_TRUE);
			tp->t_mpflags |= TMPF_TCP_FALLBACK;

			soevent(so, SO_FILT_HINT_MUSTRST);
		} else if (connect_pending) {
			/*
			 * The MPTCP connection has progressed to a state
			 * where it supports full multipath semantics; allow
			 * additional joins to be attempted for all subflows
			 * that are in the PENDING state.
			 */
			if (mpts->mpts_flags & MPTSF_CONNECT_PENDING) {
				int error = mptcp_subflow_soconnectx(mpte, mpts);

				if (error) {
					mptcp_subflow_abort(mpts, error);
				}
			}
		}
	}

exit:
	/* A relaunch was requested while we were running: go again. */
	if (mpte->mpte_flags & MPTE_WORKLOOP_RELAUNCH) {
		goto relaunch;
	}

	mpte->mpte_flags &= ~MPTE_IN_WORKLOOP;
}
4985
4986 /*
4987 * Protocol pr_lock callback.
4988 */
int
mptcp_lock(struct socket *mp_so, int refcount, void *lr)
{
	struct mppcb *mpp = mpsotomppcb(mp_so);
	lr_ref_t lr_saved = TCP_INIT_LR_SAVED(lr);

	if (mpp == NULL) {
		panic("%s: so=%p NO PCB! lr=%p lrh= %s", __func__,
		    mp_so, lr_saved, solockhistory_nr(mp_so));
		/* NOTREACHED */
	}
	mpp_lock(mpp);

	/* A negative use count indicates refcounting corruption. */
	if (mp_so->so_usecount < 0) {
		panic("%s: so=%p so_pcb=%p lr=%p ref=%x lrh= %s", __func__,
		    mp_so, mp_so->so_pcb, lr_saved, mp_so->so_usecount,
		    solockhistory_nr(mp_so));
		/* NOTREACHED */
	}
	if (refcount != 0) {
		mp_so->so_usecount++;
		mpp->mpp_inside++;
	}
	/* Record the caller's return address for lock debugging. */
	mp_so->lock_lr[mp_so->next_lock_lr] = lr_saved;
	mp_so->next_lock_lr = (mp_so->next_lock_lr + 1) % SO_LCKDBG_MAX;

	return 0;
}
5017
5018 /*
5019 * Protocol pr_unlock callback.
5020 */
int
mptcp_unlock(struct socket *mp_so, int refcount, void *lr)
{
	struct mppcb *mpp = mpsotomppcb(mp_so);
	lr_ref_t lr_saved = TCP_INIT_LR_SAVED(lr);

	if (mpp == NULL) {
		panic("%s: so=%p NO PCB usecount=%x lr=%p lrh= %s", __func__,
		    mp_so, mp_so->so_usecount, lr_saved,
		    solockhistory_nr(mp_so));
		/* NOTREACHED */
	}
	socket_lock_assert_owned(mp_so);

	if (refcount != 0) {
		mp_so->so_usecount--;
		mpp->mpp_inside--;
	}

	/* Either count going negative indicates refcounting corruption. */
	if (mp_so->so_usecount < 0) {
		panic("%s: so=%p usecount=%x lrh= %s", __func__,
		    mp_so, mp_so->so_usecount, solockhistory_nr(mp_so));
		/* NOTREACHED */
	}
	if (mpp->mpp_inside < 0) {
		panic("%s: mpp=%p inside=%x lrh= %s", __func__,
		    mpp, mpp->mpp_inside, solockhistory_nr(mp_so));
		/* NOTREACHED */
	}
	/* Record the caller's return address for lock debugging. */
	mp_so->unlock_lr[mp_so->next_unlock_lr] = lr_saved;
	mp_so->next_unlock_lr = (mp_so->next_unlock_lr + 1) % SO_LCKDBG_MAX;
	mpp_unlock(mpp);

	return 0;
}
5056
5057 /*
5058 * Protocol pr_getlock callback.
5059 */
lck_mtx_t *
mptcp_getlock(struct socket *mp_so, int flags)
{
	struct mppcb *mpp = mpsotomppcb(mp_so);

	if (mpp == NULL) {
		panic("%s: so=%p NULL so_pcb %s", __func__, mp_so,
		    solockhistory_nr(mp_so));
		/* NOTREACHED */
	}
	/* A negative use count indicates refcounting corruption. */
	if (mp_so->so_usecount < 0) {
		panic("%s: so=%p usecount=%x lrh= %s", __func__,
		    mp_so, mp_so->so_usecount, solockhistory_nr(mp_so));
		/* NOTREACHED */
	}
	return mpp_getlock(mpp, flags);
}
5077
5078 void
mptcp_get_rands(mptcp_addr_id addr_id,struct mptcb * mp_tp,u_int32_t * lrand,u_int32_t * rrand)5079 mptcp_get_rands(mptcp_addr_id addr_id, struct mptcb *mp_tp, u_int32_t *lrand,
5080 u_int32_t *rrand)
5081 {
5082 struct mptcp_subf_auth_entry *sauth_entry;
5083
5084 LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
5085 if (sauth_entry->msae_laddr_id == addr_id) {
5086 if (lrand) {
5087 *lrand = sauth_entry->msae_laddr_rand;
5088 }
5089 if (rrand) {
5090 *rrand = sauth_entry->msae_raddr_rand;
5091 }
5092 break;
5093 }
5094 }
5095 }
5096
void
mptcp_set_raddr_rand(mptcp_addr_id laddr_id, struct mptcb *mp_tp,
    mptcp_addr_id raddr_id, u_int32_t raddr_rand)
{
	struct mptcp_subf_auth_entry *sauth_entry;

	/* Record the peer's address id and random for this local id. */
	LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
		if (sauth_entry->msae_laddr_id == laddr_id) {
			/* A different remote id was already recorded: bail. */
			if ((sauth_entry->msae_raddr_id != 0) &&
			    (sauth_entry->msae_raddr_id != raddr_id)) {
				os_log_error(mptcp_log_handle, "%s - %lx: mismatched"
				    " address ids %d %d \n", __func__, (unsigned long)VM_KERNEL_ADDRPERM(mp_tp->mpt_mpte),
				    raddr_id, sauth_entry->msae_raddr_id);
				return;
			}
			sauth_entry->msae_raddr_id = raddr_id;
			/* A different random implies a duplicate SYN/ACK. */
			if ((sauth_entry->msae_raddr_rand != 0) &&
			    (sauth_entry->msae_raddr_rand != raddr_rand)) {
				os_log_error(mptcp_log_handle, "%s - %lx: "
				    "dup SYN_ACK %d %d \n",
				    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mp_tp->mpt_mpte),
				    raddr_rand, sauth_entry->msae_raddr_rand);
				return;
			}
			sauth_entry->msae_raddr_rand = raddr_rand;
			return;
		}
	}
}
5126
5127 /*
5128 * SHA-256 support for MPTCP
5129 */
5130
5131 static void
mptcp_do_sha256(mptcp_key_t * key,char sha_digest[SHA256_DIGEST_LENGTH])5132 mptcp_do_sha256(mptcp_key_t *key, char sha_digest[SHA256_DIGEST_LENGTH])
5133 {
5134 const unsigned char *sha2_base;
5135 int sha2_size;
5136
5137 sha2_base = (const unsigned char *) key;
5138 sha2_size = sizeof(mptcp_key_t);
5139
5140 SHA256_CTX sha_ctx;
5141 SHA256_Init(&sha_ctx);
5142 SHA256_Update(&sha_ctx, sha2_base, sha2_size);
5143 SHA256_Final(sha_digest, &sha_ctx);
5144 }
5145
5146 void
mptcp_hmac_sha256(mptcp_key_t key1,mptcp_key_t key2,u_char * msg __sized_by (msg_len),uint16_t msg_len,u_char digest[SHA256_DIGEST_LENGTH])5147 mptcp_hmac_sha256(mptcp_key_t key1, mptcp_key_t key2,
5148 u_char *msg __sized_by(msg_len), uint16_t msg_len, u_char digest[SHA256_DIGEST_LENGTH])
5149 {
5150 SHA256_CTX sha_ctx;
5151 mptcp_key_t key_ipad[8] = {0}; /* key XOR'd with inner pad */
5152 mptcp_key_t key_opad[8] = {0}; /* key XOR'd with outer pad */
5153 int i;
5154
5155 bzero(digest, SHA256_DIGEST_LENGTH);
5156
5157 /* Set up the Key for HMAC */
5158 key_ipad[0] = key1;
5159 key_ipad[1] = key2;
5160
5161 key_opad[0] = key1;
5162 key_opad[1] = key2;
5163
5164 /* Key is 512 block length, so no need to compute hash */
5165
5166 /* Compute SHA1(Key XOR opad, SHA1(Key XOR ipad, data)) */
5167
5168 for (i = 0; i < 8; i++) {
5169 key_ipad[i] ^= 0x3636363636363636;
5170 key_opad[i] ^= 0x5c5c5c5c5c5c5c5c;
5171 }
5172
5173 /* Perform inner SHA256 */
5174 SHA256_Init(&sha_ctx);
5175 SHA256_Update(&sha_ctx, (unsigned char *)key_ipad, sizeof(key_ipad));
5176 SHA256_Update(&sha_ctx, msg, msg_len);
5177 SHA256_Final(digest, &sha_ctx);
5178
5179 /* Perform outer SHA256 */
5180 SHA256_Init(&sha_ctx);
5181 SHA256_Update(&sha_ctx, (unsigned char *)key_opad, sizeof(key_opad));
5182 SHA256_Update(&sha_ctx, (unsigned char *)digest, SHA256_DIGEST_LENGTH);
5183 SHA256_Final(digest, &sha_ctx);
5184 }
5185
5186 /*
5187 * SHA1 support for MPTCP
5188 */
5189
5190 static void
mptcp_do_sha1(mptcp_key_t * key,char sha_digest[SHA1_RESULTLEN])5191 mptcp_do_sha1(mptcp_key_t *key, char sha_digest[SHA1_RESULTLEN])
5192 {
5193 SHA1_CTX sha1ctxt;
5194 const unsigned char *sha1_base;
5195 int sha1_size;
5196
5197 sha1_base = (const unsigned char *) key;
5198 sha1_size = sizeof(mptcp_key_t);
5199 SHA1Init(&sha1ctxt);
5200 SHA1Update(&sha1ctxt, sha1_base, sha1_size);
5201 SHA1Final(sha_digest, &sha1ctxt);
5202 }
5203
/*
 * HMAC-SHA1 for MPTCPv0 MP_JOIN authentication.
 *
 * The key is the 128-bit concatenation (key1 || key2); the message is
 * the 64-bit concatenation (rand1 || rand2).  The full 20-byte
 * (SHA1_RESULTLEN) HMAC is written to digest.
 */
void
mptcp_hmac_sha1(mptcp_key_t key1, mptcp_key_t key2,
    u_int32_t rand1, u_int32_t rand2, u_char digest[SHA1_RESULTLEN])
{
	SHA1_CTX sha1ctxt;
	mptcp_key_t key_ipad[8] = {0}; /* key XOR'd with inner pad */
	mptcp_key_t key_opad[8] = {0}; /* key XOR'd with outer pad */
	u_int32_t data[2];
	int i;

	bzero(digest, SHA1_RESULTLEN);

	/* Set up the Key for HMAC */
	key_ipad[0] = key1;
	key_ipad[1] = key2;

	key_opad[0] = key1;
	key_opad[1] = key2;

	/* Set up the message for HMAC */
	data[0] = rand1;
	data[1] = rand2;

	/*
	 * The 16-byte key is shorter than SHA1's 64-byte (512-bit) block,
	 * so it is used zero-padded (the arrays above are zero-initialized)
	 * rather than pre-hashed.
	 */

	/* Compute SHA1(Key XOR opad, SHA1(Key XOR ipad, data)) */

	for (i = 0; i < 8; i++) {
		key_ipad[i] ^= 0x3636363636363636;
		key_opad[i] ^= 0x5c5c5c5c5c5c5c5c;
	}

	/* Perform inner SHA1 */
	SHA1Init(&sha1ctxt);
	SHA1Update(&sha1ctxt, (unsigned char *)key_ipad, sizeof(key_ipad));
	SHA1Update(&sha1ctxt, (unsigned char *)data, sizeof(data));
	SHA1Final(digest, &sha1ctxt);

	/* Perform outer SHA1; digest is both input (inner hash) and output */
	SHA1Init(&sha1ctxt);
	SHA1Update(&sha1ctxt, (unsigned char *)key_opad, sizeof(key_opad));
	SHA1Update(&sha1ctxt, (unsigned char *)digest, SHA1_RESULTLEN);
	SHA1Final(digest, &sha1ctxt);
}
5248
/*
 * Compute the MP_JOIN HMAC for the subflow identified by aid:
 * corresponds to MAC-B = MAC (Key=(Key-B+Key-A), Msg=(R-B+R-A))
 * corresponds to MAC-A = MAC (Key=(Key-A+Key-B), Msg=(R-A+R-B))
 *
 * Uses HMAC-SHA1 for MPTCPv0 and HMAC-SHA256 otherwise, then copies the
 * leading digest_len bytes of the full MAC into the caller's buffer
 * (MP_JOIN carries a truncated MAC).
 */
void
mptcp_get_mpjoin_hmac(mptcp_addr_id aid, struct mptcb *mp_tp, u_char *digest __sized_by(digest_len), uint8_t digest_len)
{
	uint32_t lrand, rrand;

	lrand = rrand = 0;
	/* Fetch the local and remote randoms exchanged on this subflow. */
	mptcp_get_rands(aid, mp_tp, &lrand, &rrand);

	u_char full_digest[MAX(SHA1_RESULTLEN, SHA256_DIGEST_LENGTH)] = {0};
	if (mp_tp->mpt_version == MPTCP_VERSION_0) {
		mptcp_hmac_sha1(mp_tp->mpt_localkey, mp_tp->mpt_remotekey, lrand, rrand, full_digest);
	} else {
		uint32_t data[2];
		data[0] = lrand;
		data[1] = rrand;
		mptcp_hmac_sha256(mp_tp->mpt_localkey, mp_tp->mpt_remotekey, (u_char*)data, 8, full_digest);
	}
	/* NOTE(review): assumes digest_len <= sizeof(full_digest) — confirm callers. */
	bcopy(full_digest, digest, digest_len);
}
5272
5273 /*
5274 * Authentication data generation
5275 */
5276 static void
mptcp_generate_token(char * sha_digest __sized_by (sha_digest_len),int sha_digest_len,caddr_t token __sized_by (token_len),int token_len)5277 mptcp_generate_token(char *sha_digest __sized_by(sha_digest_len), int sha_digest_len, caddr_t token __sized_by(token_len),
5278 int token_len)
5279 {
5280 VERIFY(token_len == sizeof(u_int32_t));
5281 VERIFY(sha_digest_len == SHA1_RESULTLEN ||
5282 sha_digest_len == SHA256_DIGEST_LENGTH);
5283
5284 /* Most significant 32 bits of the SHA1/SHA256 hash */
5285 bcopy(sha_digest, token, sizeof(u_int32_t));
5286 return;
5287 }
5288
5289 static void
mptcp_generate_idsn(char * sha_digest __sized_by (sha_digest_len),int sha_digest_len,caddr_t idsn __sized_by (idsn_len),int idsn_len,uint8_t mp_version)5290 mptcp_generate_idsn(char *sha_digest __sized_by(sha_digest_len), int sha_digest_len, caddr_t idsn __sized_by(idsn_len),
5291 int idsn_len, uint8_t mp_version)
5292 {
5293 VERIFY(idsn_len == sizeof(u_int64_t));
5294 VERIFY(sha_digest_len == SHA1_RESULTLEN ||
5295 sha_digest_len == SHA256_DIGEST_LENGTH);
5296 VERIFY(mp_version == MPTCP_VERSION_0 || mp_version == MPTCP_VERSION_1);
5297
5298 /*
5299 * Least significant 64 bits of the hash
5300 */
5301
5302 if (mp_version == MPTCP_VERSION_0) {
5303 idsn[7] = sha_digest[12];
5304 idsn[6] = sha_digest[13];
5305 idsn[5] = sha_digest[14];
5306 idsn[4] = sha_digest[15];
5307 idsn[3] = sha_digest[16];
5308 idsn[2] = sha_digest[17];
5309 idsn[1] = sha_digest[18];
5310 idsn[0] = sha_digest[19];
5311 } else {
5312 idsn[7] = sha_digest[24];
5313 idsn[6] = sha_digest[25];
5314 idsn[5] = sha_digest[26];
5315 idsn[4] = sha_digest[27];
5316 idsn[3] = sha_digest[28];
5317 idsn[2] = sha_digest[29];
5318 idsn[1] = sha_digest[30];
5319 idsn[0] = sha_digest[31];
5320 }
5321 return;
5322 }
5323
5324 static void
mptcp_conn_properties(struct mptcb * mp_tp)5325 mptcp_conn_properties(struct mptcb *mp_tp)
5326 {
5327 /* Set DSS checksum flag */
5328 if (mptcp_dss_csum) {
5329 mp_tp->mpt_flags |= MPTCPF_CHECKSUM;
5330 }
5331
5332 /* Set up receive window */
5333 mp_tp->mpt_rcvwnd = mptcp_sbspace(mp_tp);
5334
5335 /* Set up gc ticks */
5336 mp_tp->mpt_gc_ticks = MPT_GC_TICKS;
5337 }
5338
/*
 * Initialize the local side of a new MPTCP connection: select the MPTCP
 * version, generate a random local key, and derive the local token and
 * initial data sequence number (IDSN) from its hash.
 */
static void
mptcp_init_local_parms(struct mptses *mpte, struct sockaddr* dst)
{
	struct mptcb *mp_tp = mpte->mpte_mptcb;
	char key_digest[MAX(SHA1_RESULTLEN, SHA256_DIGEST_LENGTH)];
	uint16_t digest_len;

	/* Explicit per-session overrides win; otherwise consult the cache. */
	if (mpte->mpte_flags & MPTE_FORCE_V0 || !mptcp_enable_v1) {
		mp_tp->mpt_version = MPTCP_VERSION_0;
	} else if (mpte->mpte_flags & MPTE_FORCE_V1 && mptcp_enable_v1) {
		mp_tp->mpt_version = MPTCP_VERSION_1;
	} else {
		mp_tp->mpt_version = tcp_cache_get_mptcp_version(dst);
	}
	VERIFY(mp_tp->mpt_version == MPTCP_VERSION_0 ||
	    mp_tp->mpt_version == MPTCP_VERSION_1);

	/* v0 derives token/IDSN from SHA1 of the key; v1 uses SHA256. */
	read_frandom(&mp_tp->mpt_localkey, sizeof(mp_tp->mpt_localkey));
	if (mp_tp->mpt_version == MPTCP_VERSION_0) {
		digest_len = SHA1_RESULTLEN;
		mptcp_do_sha1(&mp_tp->mpt_localkey, key_digest);
	} else {
		digest_len = SHA256_DIGEST_LENGTH;
		mptcp_do_sha256(&mp_tp->mpt_localkey, key_digest);
	}

	mptcp_generate_token(key_digest, digest_len,
	    (caddr_t)&mp_tp->mpt_localtoken, sizeof(mp_tp->mpt_localtoken));
	mptcp_generate_idsn(key_digest, digest_len,
	    (caddr_t)&mp_tp->mpt_local_idsn, sizeof(u_int64_t), mp_tp->mpt_version);
	/* The subflow SYN is also first MPTCP byte */
	mp_tp->mpt_snduna = mp_tp->mpt_sndmax = mp_tp->mpt_local_idsn + 1;
	mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;

	mptcp_conn_properties(mp_tp);
}
5375
/*
 * Initialize the remote side of the connection once the peer's key is
 * known: derive the remote token and IDSN from the hash of the remote
 * key and set up the receive sequence state.
 *
 * Returns 0 on success, -1 if the negotiated version is unrecognized.
 */
int
mptcp_init_remote_parms(struct mptcb *mp_tp)
{
	/* Setup local and remote tokens and Initial DSNs */
	char remote_digest[MAX(SHA1_RESULTLEN, SHA256_DIGEST_LENGTH)];
	uint16_t digest_len;

	if (mp_tp->mpt_version == MPTCP_VERSION_0) {
		digest_len = SHA1_RESULTLEN;
		mptcp_do_sha1(&mp_tp->mpt_remotekey, remote_digest);
	} else if (mp_tp->mpt_version == MPTCP_VERSION_1) {
		digest_len = SHA256_DIGEST_LENGTH;
		mptcp_do_sha256(&mp_tp->mpt_remotekey, remote_digest);
	} else {
		return -1;
	}

	mptcp_generate_token(remote_digest, digest_len,
	    (caddr_t)&mp_tp->mpt_remotetoken, sizeof(mp_tp->mpt_remotetoken));
	mptcp_generate_idsn(remote_digest, digest_len,
	    (caddr_t)&mp_tp->mpt_remote_idsn, sizeof(u_int64_t), mp_tp->mpt_version);
	/* The peer's first data byte follows its IDSN. */
	mp_tp->mpt_rcvnxt = mp_tp->mpt_remote_idsn + 1;
	mp_tp->mpt_rcvadv = mp_tp->mpt_rcvnxt + mp_tp->mpt_rcvwnd;
	return 0;
}
5401
5402 static void
mptcp_send_dfin(struct socket * so)5403 mptcp_send_dfin(struct socket *so)
5404 {
5405 struct tcpcb *tp = NULL;
5406 struct inpcb *inp = NULL;
5407
5408 inp = sotoinpcb(so);
5409 if (!inp) {
5410 return;
5411 }
5412
5413 tp = intotcpcb(inp);
5414 if (!tp) {
5415 return;
5416 }
5417
5418 if (!(tp->t_mpflags & TMPF_RESET)) {
5419 tp->t_mpflags |= TMPF_SEND_DFIN;
5420 }
5421 }
5422
5423 /*
5424 * Data Sequence Mapping routines
5425 */
5426 void
mptcp_insert_dsn(struct mppcb * mpp,struct mbuf * m)5427 mptcp_insert_dsn(struct mppcb *mpp, struct mbuf *m)
5428 {
5429 struct mptcb *mp_tp;
5430
5431 if (m == NULL) {
5432 return;
5433 }
5434
5435 mp_tp = &__container_of(mpp, struct mpp_mtp, mpp)->mtcb;
5436
5437 while (m) {
5438 VERIFY(m->m_flags & M_PKTHDR);
5439 m->m_pkthdr.pkt_flags |= (PKTF_MPTCP | PKTF_MPSO);
5440 m->m_pkthdr.mp_dsn = mp_tp->mpt_sndmax;
5441 VERIFY(m_pktlen(m) >= 0 && m_pktlen(m) < UINT16_MAX);
5442 m->m_pkthdr.mp_rlen = (uint16_t)m_pktlen(m);
5443 mp_tp->mpt_sndmax += m_pktlen(m);
5444 m = m->m_next;
5445 }
5446 }
5447
/*
 * On a fallen-back subflow, translate a subflow-level ack that drops
 * `len' bytes from the send buffer into the implied MPTCP-level
 * DATA_ACK, and advance the MPTCP send state if that DATA_ACK is new.
 */
void
mptcp_fallback_sbdrop(struct socket *so, struct mbuf *m, int len)
{
	struct mptcb *mp_tp = tptomptp(sototcpcb(so));
	uint64_t data_ack;
	uint64_t dsn;

	VERIFY(len >= 0);

	if (!m || len == 0) {
		return;
	}

	/*
	 * Walk the acked prefix.  Optimistically assume each traversed
	 * mapping is fully acked (dsn + rlen); the corrections below
	 * handle partial acks.
	 */
	while (m && len > 0) {
		VERIFY(m->m_flags & M_PKTHDR);
		VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);

		data_ack = m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen;
		dsn = m->m_pkthdr.mp_dsn;

		len -= m->m_len;
		m = m->m_next;
	}

	if (m && len == 0) {
		/*
		 * If there is one more mbuf in the chain, it automatically means
		 * that up to m->mp_dsn has been ack'ed.
		 *
		 * This means, we actually correct data_ack back down (compared
		 * to what we set inside the loop - dsn + data_len). Because in
		 * the loop we are "optimistic" and assume that the full mapping
		 * will be acked. If that's not the case and we get out of the
		 * loop with m != NULL, it means only up to m->mp_dsn has been
		 * really acked.
		 */
		data_ack = m->m_pkthdr.mp_dsn;
	}

	if (len < 0) {
		/*
		 * If len is negative, meaning we acked in the middle of an mbuf,
		 * only up to this mbuf's data-sequence number has been acked
		 * at the MPTCP-level.
		 */
		data_ack = dsn;
	}

	/* We can have data in the subflow's send-queue that is being acked,
	 * while the DATA_ACK has already advanced. Thus, we should check whether
	 * or not the DATA_ACK is actually new here.
	 */
	if (MPTCP_SEQ_LEQ(data_ack, mp_tp->mpt_sndmax) &&
	    MPTCP_SEQ_GEQ(data_ack, mp_tp->mpt_snduna)) {
		mptcp_data_ack_rcvd(mp_tp, sototcpcb(so), data_ack);
	}
}
5505
/*
 * Before `len' bytes are dropped from a socket's send buffer, slide the
 * DSN mappings of the affected mbufs forward so the remaining data keeps
 * a consistent mapping.  Only applies to non-subflow sockets, or to a
 * subflow replaying a TFO rewind (where only the lengths are adjusted).
 */
void
mptcp_preproc_sbdrop(struct socket *so, struct mbuf *m, unsigned int len)
{
	int rewinding = 0;

	/* TFO makes things complicated. */
	if (so->so_flags1 & SOF1_TFO_REWIND) {
		rewinding = 1;
		so->so_flags1 &= ~SOF1_TFO_REWIND;
	}

	while (m && (!(so->so_flags & SOF_MP_SUBFLOW) || rewinding)) {
		u_int32_t sub_len;
		VERIFY(m->m_flags & M_PKTHDR);
		VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);

		sub_len = m->m_pkthdr.mp_rlen;

		if (sub_len < len) {
			/* Whole mapping consumed: zero it and keep walking. */
			m->m_pkthdr.mp_dsn += sub_len;
			if (!(m->m_pkthdr.pkt_flags & PKTF_MPSO)) {
				m->m_pkthdr.mp_rseq += sub_len;
			}
			m->m_pkthdr.mp_rlen = 0;
			len -= sub_len;
		} else {
			/* sub_len >= len */
			/* During a TFO rewind only the length shrinks; dsn/rseq stay. */
			if (rewinding == 0) {
				m->m_pkthdr.mp_dsn += len;
			}
			if (!(m->m_pkthdr.pkt_flags & PKTF_MPSO)) {
				if (rewinding == 0) {
					m->m_pkthdr.mp_rseq += len;
				}
			}
			m->m_pkthdr.mp_rlen -= len;
			break;
		}
		m = m->m_next;
	}

	if (so->so_flags & SOF_MP_SUBFLOW &&
	    !(sototcpcb(so)->t_mpflags & TMPF_TFO_REQUEST) &&
	    !(sototcpcb(so)->t_mpflags & TMPF_RCVD_DACK)) {
		/*
		 * Received an ack without receiving a DATA_ACK.
		 * Need to fallback to regular TCP (or destroy this subflow).
		 */
		sototcpcb(so)->t_mpflags |= TMPF_INFIN_SENT;
		mptcp_notify_mpfail(so);
	}
}
5558
5559 /* Obtain the DSN mapping stored in the mbuf */
5560 void
mptcp_output_getm_dsnmap32(struct socket * so,int off,uint32_t * dsn,uint32_t * relseq,uint16_t * data_len,uint16_t * dss_csum)5561 mptcp_output_getm_dsnmap32(struct socket *so, int off,
5562 uint32_t *dsn, uint32_t *relseq, uint16_t *data_len, uint16_t *dss_csum)
5563 {
5564 u_int64_t dsn64;
5565
5566 mptcp_output_getm_dsnmap64(so, off, &dsn64, relseq, data_len, dss_csum);
5567 *dsn = (u_int32_t)MPTCP_DATASEQ_LOW32(dsn64);
5568 }
5569
5570 void
mptcp_output_getm_dsnmap64(struct socket * so,int off,uint64_t * dsn,uint32_t * relseq,uint16_t * data_len,uint16_t * dss_csum)5571 mptcp_output_getm_dsnmap64(struct socket *so, int off, uint64_t *dsn,
5572 uint32_t *relseq, uint16_t *data_len,
5573 uint16_t *dss_csum)
5574 {
5575 struct mbuf *m = so->so_snd.sb_mb;
5576
5577 VERIFY(off >= 0);
5578
5579 if (m == NULL && (so->so_flags & SOF_DEFUNCT)) {
5580 *dsn = 0;
5581 *relseq = 0;
5582 *data_len = 0;
5583 *dss_csum = 0;
5584 return;
5585 }
5586
5587 /*
5588 * In the subflow socket, the DSN sequencing can be discontiguous,
5589 * but the subflow sequence mapping is contiguous. Use the subflow
5590 * sequence property to find the right mbuf and corresponding dsn
5591 * mapping.
5592 */
5593
5594 while (m) {
5595 VERIFY(m->m_flags & M_PKTHDR);
5596 VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);
5597
5598 if (off >= m->m_len) {
5599 off -= m->m_len;
5600 m = m->m_next;
5601 } else {
5602 break;
5603 }
5604 }
5605
5606 VERIFY(off >= 0);
5607 VERIFY(m->m_pkthdr.mp_rlen <= UINT16_MAX);
5608
5609 *dsn = m->m_pkthdr.mp_dsn;
5610 *relseq = m->m_pkthdr.mp_rseq;
5611 *data_len = m->m_pkthdr.mp_rlen;
5612 *dss_csum = m->m_pkthdr.mp_csum;
5613 }
5614
5615 void
mptcp_output_getm_data_level_details(struct socket * so,int off,uint16_t * data_len,uint16_t * dss_csum)5616 mptcp_output_getm_data_level_details(struct socket *so, int off, uint16_t *data_len, uint16_t *dss_csum)
5617 {
5618 uint64_t dsn;
5619 uint32_t relseq;
5620
5621 mptcp_output_getm_dsnmap64(so, off, &dsn, &relseq, data_len, dss_csum);
5622 }
5623
/*
 * Note that this is called only from tcp_input() via mptcp_input_preproc()
 * tcp_input() may trim data after the dsn mapping is inserted into the mbuf.
 * When it trims data tcp_input calls m_adj() which does not remove the
 * m_pkthdr even if the m_len becomes 0 as a result of trimming the mbuf.
 * The dsn map insertion cannot be delayed after trim, because data can be in
 * the reassembly queue for a while and the DSN option info in tp will be
 * overwritten for every new packet received.
 * The dsn map will be adjusted just prior to appending to subflow sockbuf
 * with mptcp_adj_rmap()
 */
void
mptcp_insert_rmap(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th)
{
	VERIFY(m->m_flags & M_PKTHDR);
	/* The mapping must not have been stamped on this mbuf already. */
	VERIFY(!(m->m_pkthdr.pkt_flags & PKTF_MPTCP));

	if (tp->t_mpflags & TMPF_EMBED_DSN) {
		/* Copy the parsed DSS mapping from the subflow into the mbuf. */
		m->m_pkthdr.mp_dsn = tp->t_mpsub->mpts_rcv_map.mpt_dsn;
		m->m_pkthdr.mp_rseq = tp->t_mpsub->mpts_rcv_map.mpt_sseq;
		m->m_pkthdr.mp_rlen = tp->t_mpsub->mpts_rcv_map.mpt_len;
		m->m_pkthdr.mp_csum = tp->t_mpsub->mpts_rcv_map.mpt_csum;
		if (tp->t_mpsub->mpts_rcv_map.mpt_dfin) {
			m->m_pkthdr.pkt_flags |= PKTF_MPTCP_DFIN;
		}

		m->m_pkthdr.pkt_flags |= PKTF_MPTCP;

		tp->t_mpflags &= ~TMPF_EMBED_DSN;
		tp->t_mpflags |= TMPF_MPTCP_ACKNOW;
	} else if (tp->t_mpflags & TMPF_TCP_FALLBACK) {
		/* After fallback a TCP FIN doubles as the DATA_FIN. */
		if (th->th_flags & TH_FIN) {
			m->m_pkthdr.pkt_flags |= PKTF_MPTCP_DFIN;
		}
	}
}
5660
5661 /*
5662 * Following routines help with failure detection and failover of data
5663 * transfer from one subflow to another.
5664 */
5665 void
mptcp_act_on_txfail(struct socket * so)5666 mptcp_act_on_txfail(struct socket *so)
5667 {
5668 struct tcpcb *tp = NULL;
5669 struct inpcb *inp = sotoinpcb(so);
5670
5671 if (inp == NULL) {
5672 return;
5673 }
5674
5675 tp = intotcpcb(inp);
5676 if (tp == NULL) {
5677 return;
5678 }
5679
5680 if (so->so_flags & SOF_MP_TRYFAILOVER) {
5681 return;
5682 }
5683
5684 so->so_flags |= SOF_MP_TRYFAILOVER;
5685 soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPFAILOVER));
5686 }
5687
5688 /*
5689 * Support for MP_FAIL option
5690 */
5691 int
mptcp_get_map_for_dsn(struct socket * so,uint64_t dsn_fail,uint32_t * tcp_seq)5692 mptcp_get_map_for_dsn(struct socket *so, uint64_t dsn_fail, uint32_t *tcp_seq)
5693 {
5694 struct mbuf *m = so->so_snd.sb_mb;
5695 uint16_t datalen;
5696 uint64_t dsn;
5697 int off = 0;
5698
5699 if (m == NULL) {
5700 return -1;
5701 }
5702
5703 while (m != NULL) {
5704 VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);
5705 VERIFY(m->m_flags & M_PKTHDR);
5706 dsn = m->m_pkthdr.mp_dsn;
5707 datalen = m->m_pkthdr.mp_rlen;
5708 if (MPTCP_SEQ_LEQ(dsn, dsn_fail) &&
5709 (MPTCP_SEQ_GEQ(dsn + datalen, dsn_fail))) {
5710 off = (int)(dsn_fail - dsn);
5711 *tcp_seq = m->m_pkthdr.mp_rseq + off;
5712 return 0;
5713 }
5714
5715 m = m->m_next;
5716 }
5717
5718 /*
5719 * If there was no mbuf data and a fallback to TCP occurred, there's
5720 * not much else to do.
5721 */
5722
5723 os_log_error(mptcp_log_handle, "%s: %llu not found \n", __func__, dsn_fail);
5724 return -1;
5725 }
5726
/*
 * Support for sending contiguous MPTCP bytes in subflow
 * Also for preventing sending data with ACK in 3-way handshake
 *
 * Returns how many bytes of the mapping covering send-buffer offset
 * `off' remain from that offset onward, so a segment never straddles
 * two DSN mappings.
 */
int32_t
mptcp_adj_sendlen(struct socket *so, int32_t off)
{
	struct tcpcb *tp = sototcpcb(so);
	struct mptsub *mpts = tp->t_mpsub;
	uint64_t mdss_dsn;
	uint32_t mdss_subflow_seq;
	int mdss_subflow_off;
	uint16_t mdss_data_len;
	uint16_t dss_csum;

	/* Defunct socket with empty send buffer: nothing to send. */
	if (so->so_snd.sb_mb == NULL && (so->so_flags & SOF_DEFUNCT)) {
		return 0;
	}

	mptcp_output_getm_dsnmap64(so, off, &mdss_dsn, &mdss_subflow_seq,
	    &mdss_data_len, &dss_csum);

	/*
	 * We need to compute how much of the mapping still remains.
	 * So, we compute the offset in the send-buffer of the dss-sub-seq.
	 */
	mdss_subflow_off = (mdss_subflow_seq + mpts->mpts_iss) - tp->snd_una;

	/*
	 * When TFO is used, we are sending the mpts->mpts_iss although the relative
	 * seq has been set to 1 (while it should be 0).
	 */
	if (tp->t_mpflags & TMPF_TFO_REQUEST) {
		mdss_subflow_off--;
	}

	VERIFY(off >= mdss_subflow_off);

	/* Bytes of this mapping still ahead of `off'. */
	return mdss_data_len - (off - mdss_subflow_off);
}
5767
5768 static uint32_t
mptcp_get_maxseg(struct mptses * mpte)5769 mptcp_get_maxseg(struct mptses *mpte)
5770 {
5771 struct mptsub *mpts;
5772 uint32_t maxseg = 0;
5773
5774 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
5775 struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
5776
5777 if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
5778 TCPS_HAVERCVDFIN2(tp->t_state)) {
5779 continue;
5780 }
5781
5782 if (tp->t_maxseg > maxseg) {
5783 maxseg = tp->t_maxseg;
5784 }
5785 }
5786
5787 return maxseg;
5788 }
5789
5790 static uint8_t
mptcp_get_rcvscale(struct mptses * mpte)5791 mptcp_get_rcvscale(struct mptses *mpte)
5792 {
5793 struct mptsub *mpts;
5794 uint8_t rcvscale = UINT8_MAX;
5795
5796 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
5797 struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
5798
5799 if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
5800 TCPS_HAVERCVDFIN2(tp->t_state)) {
5801 continue;
5802 }
5803
5804 if (tp->rcv_scale < rcvscale) {
5805 rcvscale = tp->rcv_scale;
5806 }
5807 }
5808
5809 return rcvscale;
5810 }
5811
/* Similar to tcp_sbrcv_reserve
 *
 * Grow the MPTCP receive socket buffer toward `newsize', capped by the
 * autorcvbuf maximum and by the window implied by the smallest subflow
 * receive-window scale.  No-op when no usable subflow exists.
 */
static void
mptcp_sbrcv_reserve(struct mptcb *mp_tp, struct sockbuf *sbrcv,
    u_int32_t newsize, u_int32_t idealsize)
{
	uint8_t rcvscale = mptcp_get_rcvscale(mp_tp->mpt_mpte);

	/* UINT8_MAX means no established subflow to take a scale from. */
	if (rcvscale == UINT8_MAX) {
		return;
	}

	/* newsize should not exceed max */
	newsize = min(newsize, tcp_autorcvbuf_max);

	/* The receive window scale negotiated at the
	 * beginning of the connection will also set a
	 * limit on the socket buffer size
	 */
	newsize = min(newsize, TCP_MAXWIN << rcvscale);

	/* Set new socket buffer size */
	if (newsize > sbrcv->sb_hiwat &&
	    (sbreserve(sbrcv, newsize) == 1)) {
		sbrcv->sb_idealsize = min(max(sbrcv->sb_idealsize,
		    (idealsize != 0) ? idealsize : newsize), tcp_autorcvbuf_max);

		/* Again check the limit set by the advertised
		 * window scale
		 */
		sbrcv->sb_idealsize = min(sbrcv->sb_idealsize,
		    TCP_MAXWIN << rcvscale);
	}
}
5845
/*
 * Auto-grow the MPTCP-level receive socket buffer, sizing it to the sum
 * of all subflows' receive buffers (a crude stand-in for the ideal
 * bandwidth-delay-product sizing; see comment below).
 */
void
mptcp_sbrcv_grow(struct mptcb *mp_tp)
{
	struct mptses *mpte = mp_tp->mpt_mpte;
	struct socket *mp_so = mpte->mpte_mppcb->mpp_socket;
	struct sockbuf *sbrcv = &mp_so->so_rcv;
	uint32_t hiwat_sum = 0;
	uint32_t ideal_sum = 0;
	struct mptsub *mpts;

	/*
	 * Do not grow the receive socket buffer if
	 * - auto resizing is disabled, globally or on this socket
	 * - the high water mark already reached the maximum
	 * - the stream is in background and receive side is being
	 * throttled
	 * - if there are segments in reassembly queue indicating loss,
	 * do not need to increase recv window during recovery as more
	 * data is not going to be sent. A duplicate ack sent during
	 * recovery should not change the receive window
	 */
	if (tcp_do_autorcvbuf == 0 ||
	    (sbrcv->sb_flags & SB_AUTOSIZE) == 0 ||
	    sbrcv->sb_hiwat >= tcp_autorcvbuf_max ||
	    (mp_so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ||
	    !LIST_EMPTY(&mp_tp->mpt_segq)) {
		/* Can not resize the socket buffer, just return */
		return;
	}

	/*
	 * Ideally, we want the rbuf to be (sum_i {bw_i} * rtt_max * 2)
	 *
	 * But, for this we first need accurate receiver-RTT estimations, which
	 * we currently don't have.
	 *
	 * Let's use a dummy algorithm for now, just taking the sum of all
	 * subflow's receive-buffers. It's too low, but that's all we can get
	 * for now.
	 */

	TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
		hiwat_sum += mpts->mpts_socket->so_rcv.sb_hiwat;
		ideal_sum += mpts->mpts_socket->so_rcv.sb_idealsize;
	}

	mptcp_sbrcv_reserve(mp_tp, sbrcv, hiwat_sum, ideal_sum);
}
5894
/*
 * Determine if we can grow the recieve socket buffer to avoid sending
 * a zero window update to the peer. We allow even socket buffers that
 * have fixed size (set by the application) to grow if the resource
 * constraints are met. They will also be trimmed after the application
 * reads data.
 *
 * Similar to tcp_sbrcv_grow_rwin
 */
static void
mptcp_sbrcv_grow_rwin(struct mptcb *mp_tp, struct sockbuf *sb)
{
	struct socket *mp_so = mp_tp->mpt_mpte->mpte_mppcb->mpp_socket;
	/* Grow in increments of 16 x the largest subflow MSS. */
	u_int32_t rcvbufinc = mptcp_get_maxseg(mp_tp->mpt_mpte) << 4;
	u_int32_t rcvbuf = sb->sb_hiwat;

	/* Background-throttled receivers do not grow their window. */
	if (tcp_recv_bg == 1 || IS_TCP_RECV_BG(mp_so)) {
		return;
	}

	if (tcp_do_autorcvbuf == 1 &&
	    /* Diff to tcp_sbrcv_grow_rwin */
	    (mp_so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) == 0 &&
	    (rcvbuf - sb->sb_cc) < rcvbufinc &&
	    rcvbuf < tcp_autorcvbuf_max &&
	    (sb->sb_idealsize > 0 &&
	    sb->sb_hiwat <= (sb->sb_idealsize + rcvbufinc))) {
		sbreserve(sb, min((sb->sb_hiwat + rcvbufinc), tcp_autorcvbuf_max));
	}
}
5925
/* Similar to tcp_sbspace
 *
 * Return the usable space in the MPTCP-level receive buffer, after
 * opportunistically growing it to avoid a zero-window advertisement
 * and after accounting for data held by content filters.
 */
int32_t
mptcp_sbspace(struct mptcb *mp_tp)
{
	struct sockbuf *sb = &mp_tp->mpt_mpte->mpte_mppcb->mpp_socket->so_rcv;
	uint32_t rcvbuf;
	int32_t space;
	int32_t pending = 0;

	socket_lock_assert_owned(mptetoso(mp_tp->mpt_mpte));

	mptcp_sbrcv_grow_rwin(mp_tp, sb);

	/* hiwat might have changed */
	rcvbuf = sb->sb_hiwat;

	/* Space is bounded by both byte count and mbuf accounting. */
	space = ((int32_t) imin((rcvbuf - sb->sb_cc),
	    (sb->sb_mbmax - sb->sb_mbcnt)));
	if (space < 0) {
		space = 0;
	}

#if CONTENT_FILTER
	/* Compensate for data being processed by content filters */
	pending = cfil_sock_data_space(sb);
#endif /* CONTENT_FILTER */
	if (pending > space) {
		space = 0;
	} else {
		space -= pending;
	}

	return space;
}
5960
5961 /*
5962 * Support Fallback to Regular TCP
5963 */
5964 void
mptcp_notify_mpready(struct socket * so)5965 mptcp_notify_mpready(struct socket *so)
5966 {
5967 struct tcpcb *tp = NULL;
5968
5969 if (so == NULL) {
5970 return;
5971 }
5972
5973 tp = intotcpcb(sotoinpcb(so));
5974
5975 if (tp == NULL) {
5976 return;
5977 }
5978
5979 DTRACE_MPTCP4(multipath__ready, struct socket *, so,
5980 struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd,
5981 struct tcpcb *, tp);
5982
5983 if (!(tp->t_mpflags & TMPF_MPTCP_TRUE)) {
5984 return;
5985 }
5986
5987 if (tp->t_mpflags & TMPF_MPTCP_READY) {
5988 return;
5989 }
5990
5991 tp->t_mpflags &= ~TMPF_TCP_FALLBACK;
5992 tp->t_mpflags |= TMPF_MPTCP_READY;
5993
5994 soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPSTATUS));
5995 }
5996
5997 void
mptcp_notify_mpfail(struct socket * so)5998 mptcp_notify_mpfail(struct socket *so)
5999 {
6000 struct tcpcb *tp = NULL;
6001
6002 if (so == NULL) {
6003 return;
6004 }
6005
6006 tp = intotcpcb(sotoinpcb(so));
6007
6008 if (tp == NULL) {
6009 return;
6010 }
6011
6012 DTRACE_MPTCP4(multipath__failed, struct socket *, so,
6013 struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd,
6014 struct tcpcb *, tp);
6015
6016 if (tp->t_mpflags & TMPF_TCP_FALLBACK) {
6017 return;
6018 }
6019
6020 tp->t_mpflags &= ~(TMPF_MPTCP_READY | TMPF_MPTCP_TRUE);
6021 tp->t_mpflags |= TMPF_TCP_FALLBACK;
6022
6023 soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPSTATUS));
6024 }
6025
6026 /*
6027 * Keepalive helper function
6028 */
6029 boolean_t
mptcp_ok_to_keepalive(struct mptcb * mp_tp)6030 mptcp_ok_to_keepalive(struct mptcb *mp_tp)
6031 {
6032 boolean_t ret = 1;
6033
6034 socket_lock_assert_owned(mptetoso(mp_tp->mpt_mpte));
6035
6036 if (mp_tp->mpt_state >= MPTCPS_CLOSE_WAIT) {
6037 ret = 0;
6038 }
6039 return ret;
6040 }
6041
/*
 * MPTCP t_maxseg adjustment function
 *
 * Returns the number of bytes by which the subflow MSS should be
 * lowered to leave room for the common-case MPTCP DSS option.
 */
int
mptcp_adj_mss(struct tcpcb *tp, boolean_t mtudisc)
{
	int mss_lower = 0;
	struct mptcb *mp_tp = tptomptp(tp);

/*
 * Reserve space for a DSS+DATA_ACK option.  Note: both branches below
 * add the same 2 bytes — either for the DSS checksum, or for padding
 * to a 32-bit boundary plus EOL when checksums are off.
 */
#define MPTCP_COMPUTE_LEN { \
	mss_lower = sizeof (struct mptcp_dss_ack_opt); \
	if (mp_tp->mpt_flags & MPTCPF_CHECKSUM) \
	        mss_lower += 2; \
	else \
	        /* adjust to 32-bit boundary + EOL */ \
	        mss_lower += 2; \
}
	if (mp_tp == NULL) {
		return 0;
	}

	socket_lock_assert_owned(mptetoso(mp_tp->mpt_mpte));

	/*
	 * For the first subflow and subsequent subflows, adjust mss for
	 * most common MPTCP option size, for case where tcp_mss is called
	 * during option processing and MTU discovery.
	 */
	if (!mtudisc) {
		if (tp->t_mpflags & TMPF_MPTCP_TRUE &&
		    !(tp->t_mpflags & TMPF_JOINED_FLOW)) {
			MPTCP_COMPUTE_LEN;
		}

		if (tp->t_mpflags & TMPF_PREESTABLISHED &&
		    tp->t_mpflags & TMPF_SENT_JOIN) {
			MPTCP_COMPUTE_LEN;
		}
	} else {
		if (tp->t_mpflags & TMPF_MPTCP_TRUE) {
			MPTCP_COMPUTE_LEN;
		}
	}

	return mss_lower;
}
6088
/*
 * Populate one mptcp_flow_t entry from a subflow socket, for the
 * pcblist sysctl: connection info, local/remote endpoints (v4 or v6),
 * and per-subflow MPTCP state.
 */
static void
fill_mptcp_subflow(struct socket *so, mptcp_flow_t *flow, struct mptsub *mpts)
{
	struct inpcb *inp;

	tcp_getconninfo(so, &flow->flow_ci);
	inp = sotoinpcb(so);
	if ((inp->inp_vflag & INP_IPV6) != 0) {
		flow->flow_src.ss_family = AF_INET6;
		flow->flow_dst.ss_family = AF_INET6;
		flow->flow_src.ss_len = sizeof(struct sockaddr_in6);
		flow->flow_dst.ss_len = sizeof(struct sockaddr_in6);
		SIN6(&flow->flow_src)->sin6_port = inp->in6p_lport;
		SIN6(&flow->flow_dst)->sin6_port = inp->in6p_fport;
		SIN6(&flow->flow_src)->sin6_addr = inp->in6p_laddr;
		SIN6(&flow->flow_dst)->sin6_addr = inp->in6p_faddr;
	} else if ((inp->inp_vflag & INP_IPV4) != 0) {
		flow->flow_src.ss_family = AF_INET;
		flow->flow_dst.ss_family = AF_INET;
		flow->flow_src.ss_len = sizeof(struct sockaddr_in);
		flow->flow_dst.ss_len = sizeof(struct sockaddr_in);
		SIN(&flow->flow_src)->sin_port = inp->inp_lport;
		SIN(&flow->flow_dst)->sin_port = inp->inp_fport;
		SIN(&flow->flow_src)->sin_addr = inp->inp_laddr;
		SIN(&flow->flow_dst)->sin_addr = inp->inp_faddr;
	}
	/* Self-describing record: size and offset of the embedded conninfo. */
	flow->flow_len = sizeof(*flow);
	flow->flow_tcpci_offset = offsetof(mptcp_flow_t, flow_ci);
	flow->flow_flags = mpts->mpts_flags;
	flow->flow_cid = mpts->mpts_connid;
	flow->flow_relseq = mpts->mpts_rel_seq;
	flow->flow_soerror = mpts->mpts_socket->so_error;
	flow->flow_probecnt = mpts->mpts_probecnt;
}
6123
/*
 * sysctl handler: dump every MPTCP connection as a conninfo_mptcp_t
 * followed by one mptcp_flow_t per subflow.  Read-only (writes are
 * rejected with EPERM).
 */
static int
mptcp_pcblist SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
	int error = 0, f;
	size_t len;
	struct mppcb *mpp;
	struct mptses *mpte;
	struct mptcb *mp_tp;
	struct mptsub *mpts;
	struct socket *so;
	conninfo_mptcp_t mptcpci;
	mptcp_flow_t *flows = NULL;

	if (req->newptr != USER_ADDR_NULL) {
		return EPERM;
	}

	lck_mtx_lock(&mtcbinfo.mppi_lock);
	if (req->oldptr == USER_ADDR_NULL) {
		/*
		 * Size probe: report an estimate with ~12% headroom and
		 * room for 4 flows per connection.
		 */
		size_t n = mtcbinfo.mppi_count;
		lck_mtx_unlock(&mtcbinfo.mppi_lock);
		req->oldidx = (n + n / 8) * sizeof(conninfo_mptcp_t) +
		    4 * (n + n / 8) * sizeof(mptcp_flow_t);
		return 0;
	}
	TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
		flows = NULL;
		socket_lock(mpp->mpp_socket, 1);
		VERIFY(mpp->mpp_flags & MPP_ATTACHED);
		mpte = mptompte(mpp);

		socket_lock_assert_owned(mptetoso(mpte));
		mp_tp = mpte->mpte_mptcb;

		/* Snapshot the connection-level state. */
		bzero(&mptcpci, sizeof(mptcpci));
		mptcpci.mptcpci_state = mp_tp->mpt_state;
		mptcpci.mptcpci_flags = mp_tp->mpt_flags;
		mptcpci.mptcpci_ltoken = mp_tp->mpt_localtoken;
		mptcpci.mptcpci_rtoken = mp_tp->mpt_remotetoken;
		mptcpci.mptcpci_notsent_lowat = mp_tp->mpt_notsent_lowat;
		mptcpci.mptcpci_snduna = mp_tp->mpt_snduna;
		mptcpci.mptcpci_sndnxt = mp_tp->mpt_sndnxt;
		mptcpci.mptcpci_sndmax = mp_tp->mpt_sndmax;
		mptcpci.mptcpci_lidsn = mp_tp->mpt_local_idsn;
		mptcpci.mptcpci_sndwnd = mp_tp->mpt_sndwnd;
		mptcpci.mptcpci_rcvnxt = mp_tp->mpt_rcvnxt;
		mptcpci.mptcpci_ridsn = mp_tp->mpt_remote_idsn;
		mptcpci.mptcpci_rcvwnd = mp_tp->mpt_rcvwnd;

		mptcpci.mptcpci_nflows = mpte->mpte_numflows;
		mptcpci.mptcpci_mpte_flags = mpte->mpte_flags;
		mptcpci.mptcpci_mpte_addrid = mpte->mpte_addrid_last;
		mptcpci.mptcpci_flow_offset =
		    offsetof(conninfo_mptcp_t, mptcpci_flows);

		len = sizeof(*flows) * mpte->mpte_numflows;
		if (mpte->mpte_numflows != 0) {
			flows = kalloc_data(len, Z_WAITOK | Z_ZERO);
			if (flows == NULL) {
				socket_unlock(mpp->mpp_socket, 1);
				break;
			}
			/*
			 * The record embeds one flow; account for the
			 * remaining (nflows - 1) appended afterwards.
			 */
			mptcpci.mptcpci_len = sizeof(mptcpci) +
			    sizeof(*flows) * (mptcpci.mptcpci_nflows - 1);
			error = SYSCTL_OUT(req, &mptcpci,
			    sizeof(mptcpci) - sizeof(mptcp_flow_t));
		} else {
			mptcpci.mptcpci_len = sizeof(mptcpci);
			error = SYSCTL_OUT(req, &mptcpci, sizeof(mptcpci));
		}
		if (error) {
			socket_unlock(mpp->mpp_socket, 1);
			kfree_data(flows, len);
			break;
		}
		/* Fill and emit one entry per subflow. */
		f = 0;
		TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
			so = mpts->mpts_socket;
			fill_mptcp_subflow(so, &flows[f], mpts);
			f++;
		}
		socket_unlock(mpp->mpp_socket, 1);
		if (flows) {
			error = SYSCTL_OUT(req, flows, len);
			kfree_data(flows, len);
			if (error) {
				break;
			}
		}
	}
	lck_mtx_unlock(&mtcbinfo.mppi_lock);

	return error;
}
6219
/* net.inet.mptcp.pcblist — read-only dump served by mptcp_pcblist above. */
SYSCTL_PROC(_net_inet_mptcp, OID_AUTO, pcblist, CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, mptcp_pcblist, "S,conninfo_mptcp_t",
    "List of active MPTCP connections");
6223
6224 /*
6225 * Set notsent lowat mark on the MPTCB
6226 */
6227 int
mptcp_set_notsent_lowat(struct mptses * mpte,int optval)6228 mptcp_set_notsent_lowat(struct mptses *mpte, int optval)
6229 {
6230 struct mptcb *mp_tp = NULL;
6231 int error = 0;
6232
6233 if (mpte->mpte_mppcb->mpp_flags & MPP_ATTACHED) {
6234 mp_tp = mpte->mpte_mptcb;
6235 }
6236
6237 if (mp_tp) {
6238 mp_tp->mpt_notsent_lowat = optval;
6239 } else {
6240 error = EINVAL;
6241 }
6242
6243 return error;
6244 }
6245
6246 u_int32_t
mptcp_get_notsent_lowat(struct mptses * mpte)6247 mptcp_get_notsent_lowat(struct mptses *mpte)
6248 {
6249 struct mptcb *mp_tp = NULL;
6250
6251 if (mpte->mpte_mppcb->mpp_flags & MPP_ATTACHED) {
6252 mp_tp = mpte->mpte_mptcb;
6253 }
6254
6255 if (mp_tp) {
6256 return mp_tp->mpt_notsent_lowat;
6257 } else {
6258 return 0;
6259 }
6260 }
6261
/*
 * Decide whether a write event may be delivered to the application, given
 * the TCP_NOTSENT_LOWAT-style mark stored on the MPTCB.
 *
 * Returns 1 to allow the wakeup (send buffer empty, unsent data at or
 * below the lowat mark, or - on the first active subflow with Nagle
 * enabled - less than one maxseg still pending), 0 to suppress it.
 */
int
mptcp_notsent_lowat_check(struct socket *so)
{
	struct mptses *mpte;
	struct mppcb *mpp;
	struct mptcb *mp_tp;
	struct mptsub *mpts;

	int notsent = 0;

	mpp = mpsotomppcb(so);
	if (mpp == NULL || mpp->mpp_state == MPPCB_STATE_DEAD) {
		return 0;
	}

	mpte = mptompte(mpp);
	socket_lock_assert_owned(mptetoso(mpte));
	mp_tp = mpte->mpte_mptcb;

	notsent = so->so_snd.sb_cc;

	/*
	 * Bytes between snduna and sndnxt are already handed to subflows;
	 * only the not-yet-scheduled remainder counts against the mark.
	 */
	if ((notsent == 0) ||
	    ((notsent - (mp_tp->mpt_sndnxt - mp_tp->mpt_snduna)) <=
	    mp_tp->mpt_notsent_lowat)) {
		return 1;
	}

	/* When Nagle's algorithm is not disabled, it is better
	 * to wakeup the client even before there is atleast one
	 * maxseg of data to write.
	 */
	TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
		int retval = 0;
		if (mpts->mpts_flags & MPTSF_ACTIVE) {
			struct socket *subf_so = mpts->mpts_socket;
			struct tcpcb *tp = intotcpcb(sotoinpcb(subf_so));

			/* Unsent data as seen from this subflow's TCP view. */
			notsent = so->so_snd.sb_cc -
			    (tp->snd_nxt - tp->snd_una);

			if ((tp->t_flags & TF_NODELAY) == 0 &&
			    notsent > 0 && (notsent <= (int)tp->t_maxseg)) {
				retval = 1;
			}
			/* Decision is based on the first active subflow only. */
			return retval;
		}
	}
	return 0;
}
6311
/*
 * Kernel-control connect handler for the Symptoms advisory socket.
 * Records the client's control unit so that queries (mptcp_ask_symptoms)
 * can be enqueued towards it.
 */
static errno_t
mptcp_symptoms_ctl_connect(kern_ctl_ref kctlref, struct sockaddr_ctl *sac,
    void **unitinfo)
{
#pragma unused(kctlref, sac, unitinfo)

	/* Only a single Symptoms client is expected at any time. */
	if (OSIncrementAtomic(&mptcp_kern_skt_inuse) > 0) {
		os_log_error(mptcp_log_handle, "%s: MPTCP kernel-control socket for Symptoms already open!", __func__);
	}

	mptcp_kern_skt_unit = sac->sc_unit;

	return 0;
}
6326
/*
 * Symptoms granted network access for the app identified by "uuid".
 * Walk every MPTCP session belonging to that app and re-evaluate its
 * subflows while the temporary access flags are set on the session.
 * Called with the advisory message's RSSI value.
 */
static void
mptcp_allow_uuid(uuid_t uuid, int32_t rssi)
{
	struct mppcb *mpp;

	/* Iterate over all MPTCP connections */

	lck_mtx_lock(&mtcbinfo.mppi_lock);

	TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
		struct socket *mp_so = mpp->mpp_socket;
		struct mptses *mpte = mpp->mpp_pcbe;

		socket_lock(mp_so, 1);

		/*
		 * Skip sessions owned by other apps: match against the
		 * effective UUID for delegated sockets, last_uuid otherwise
		 * (uuid_compare returns 0 on equality).
		 */
		if (mp_so->so_flags & SOF_DELEGATED &&
		    uuid_compare(uuid, mp_so->e_uuid)) {
			goto next;
		} else if (!(mp_so->so_flags & SOF_DELEGATED) &&
		    uuid_compare(uuid, mp_so->last_uuid)) {
			goto next;
		}

		os_log(mptcp_log_handle, "%s - %lx: Got allowance for useApp with rssi %d\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), rssi);

		mpte->mpte_flags |= MPTE_ACCESS_GRANTED;

		/* RSSI above the target threshold prohibits cell for this session. */
		if (rssi > MPTCP_TARGET_BASED_RSSI_THRESHOLD) {
			mpte->mpte_flags |= MPTE_CELL_PROHIBITED;
		}

		mptcp_check_subflows_and_add(mpte);
		mptcp_remove_subflows(mpte);

		/* The flags only steer the two calls above; clear them again. */
		mpte->mpte_flags &= ~(MPTE_ACCESS_GRANTED | MPTE_CELL_PROHIBITED);

next:
		socket_unlock(mp_so, 1);
	}

	lck_mtx_unlock(&mtcbinfo.mppi_lock);
}
6370
/*
 * Symptoms reported a Wi-Fi status transition: walk all MPTCP sessions
 * and let the handover/target-based ones re-evaluate their subflow set.
 */
static void
mptcp_wifi_status_changed(void)
{
	struct mppcb *mpp;

	/* Iterate over all MPTCP connections */

	lck_mtx_lock(&mtcbinfo.mppi_lock);

	TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
		struct socket *mp_so = mpp->mpp_socket;
		struct mptses *mpte = mpp->mpp_pcbe;

		socket_lock(mp_so, 1);

		/* Only handover- and urgency-mode are purely driven by Symptom's Wi-Fi status */
		if (mpte->mpte_svctype != MPTCP_SVCTYPE_HANDOVER &&
		    mpte->mpte_svctype != MPTCP_SVCTYPE_PURE_HANDOVER &&
		    mpte->mpte_svctype != MPTCP_SVCTYPE_TARGET_BASED) {
			goto next;
		}

		mptcp_check_subflows_and_add(mpte);
		mptcp_check_subflows_and_remove(mpte);

next:
		socket_unlock(mp_so, 1);
	}

	lck_mtx_unlock(&mtcbinfo.mppi_lock);
}
6402
/*
 * Shared state between the proc-iterator filter and callout used by
 * mptcp_find_proc() to locate a process by its executable UUID.
 */
struct mptcp_uuid_search_info {
	uuid_t target_uuid;             /* executable UUID being searched for */
	proc_t found_proc;              /* matching proc, PROC_NULL if none */
	boolean_t is_proc_found;        /* set by the filter on first match */
};
6408
6409 static int
mptcp_find_proc_filter(proc_t p,void * arg)6410 mptcp_find_proc_filter(proc_t p, void *arg)
6411 {
6412 struct mptcp_uuid_search_info *info = (struct mptcp_uuid_search_info *)arg;
6413 int found;
6414
6415 if (info->is_proc_found) {
6416 return 0;
6417 }
6418
6419 /*
6420 * uuid_compare returns 0 if the uuids are matching, but the proc-filter
6421 * expects != 0 for a matching filter.
6422 */
6423 found = uuid_compare(proc_executableuuid_addr(p), info->target_uuid) == 0;
6424 if (found) {
6425 info->is_proc_found = true;
6426 }
6427
6428 return found;
6429 }
6430
6431 static int
mptcp_find_proc_callout(proc_t p,void * arg)6432 mptcp_find_proc_callout(proc_t p, void * arg)
6433 {
6434 struct mptcp_uuid_search_info *info = (struct mptcp_uuid_search_info *)arg;
6435
6436 if (uuid_compare(proc_executableuuid_addr(p), info->target_uuid) == 0) {
6437 info->found_proc = p;
6438 return PROC_CLAIMED_DONE;
6439 }
6440
6441 return PROC_RETURNED;
6442 }
6443
6444 static proc_t
mptcp_find_proc(const uuid_t uuid)6445 mptcp_find_proc(const uuid_t uuid)
6446 {
6447 struct mptcp_uuid_search_info info;
6448
6449 uuid_copy(info.target_uuid, uuid);
6450 info.found_proc = PROC_NULL;
6451 info.is_proc_found = false;
6452
6453 proc_iterate(PROC_ALLPROCLIST, mptcp_find_proc_callout, &info,
6454 mptcp_find_proc_filter, &info);
6455
6456 return info.found_proc;
6457 }
6458
/*
 * Send a MPTCP_SYMPTOMS_ASK_UUID query for this session's owning app to
 * symptomsd over the kernel-control socket, carrying the app's UUID and a
 * coarse task priority (foreground/background/unknown).  Silently returns
 * if Symptoms never connected or the owning process cannot be found.
 */
void
mptcp_ask_symptoms(struct mptses *mpte)
{
	struct mptcp_symptoms_ask_uuid ask;
	struct socket *mp_so;
	struct proc *p = PROC_NULL;
	int pid, prio, err;

	if (mptcp_kern_skt_unit == 0) {
		os_log_error(mptcp_log_handle, "%s - %lx: skt_unit is still 0\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
		return;
	}

	mp_so = mptetoso(mpte);

	if (mp_so->so_flags & SOF_DELEGATED) {
		/* Try the cached effective pid first, falling back to a
		 * UUID-based search of the proc list. */
		if (mpte->mpte_epid != 0) {
			p = proc_find(mpte->mpte_epid);
			if (p != PROC_NULL) {
				/* We found a pid, check its UUID */
				if (uuid_compare(mp_so->e_uuid, proc_executableuuid_addr(p))) {
					/* It's not the same - we need to look for the real proc */
					proc_rele(p);
					p = PROC_NULL;
				}
			}
		}

		if (p == PROC_NULL) {
			p = mptcp_find_proc(mp_so->e_uuid);
			if (p == PROC_NULL) {
				uuid_string_t uuid_string;
				uuid_unparse(mp_so->e_uuid, uuid_string);

				os_log_error(mptcp_log_handle, "%s - %lx: Couldn't find proc for uuid %s\n",
				    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), uuid_string);

				return;
			}
			mpte->mpte_epid = proc_pid(p);
		}

		pid = mpte->mpte_epid;
		uuid_copy(ask.uuid, mp_so->e_uuid);
	} else {
		/* Non-delegated socket: use the last known pid/UUID. */
		pid = mp_so->last_pid;

		p = proc_find(pid);
		if (p == PROC_NULL) {
			os_log_error(mptcp_log_handle, "%s - %lx: Couldn't find proc for pid %u\n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), pid);
			return;
		}

		uuid_copy(ask.uuid, mp_so->last_uuid);
	}


	ask.cmd = MPTCP_SYMPTOMS_ASK_UUID;

	/* Map the task role onto the coarse Symptoms priority classes. */
	prio = proc_get_effective_task_policy(proc_task(p), TASK_POLICY_ROLE);

	if (prio == TASK_BACKGROUND_APPLICATION || prio == TASK_NONUI_APPLICATION ||
	    prio == TASK_DARWINBG_APPLICATION) {
		ask.priority = MPTCP_SYMPTOMS_BACKGROUND;
	} else if (prio == TASK_FOREGROUND_APPLICATION) {
		ask.priority = MPTCP_SYMPTOMS_FOREGROUND;
	} else {
		ask.priority = MPTCP_SYMPTOMS_UNKNOWN;
	}

	err = ctl_enqueuedata(mptcp_kern_ctrl_ref, mptcp_kern_skt_unit,
	    &ask, sizeof(ask), CTL_DATA_EOR);

	os_log(mptcp_log_handle, "%s - %lx: asked symptoms about pid %u, taskprio %u, prio %u, err %d\n",
	    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), pid, prio, ask.priority, err);


	/* Drop the reference taken by proc_find()/mptcp_find_proc(). */
	proc_rele(p);
}
6540
/*
 * Kernel-control disconnect handler: drop the in-use count taken in
 * mptcp_symptoms_ctl_connect().
 */
static errno_t
mptcp_symptoms_ctl_disconnect(kern_ctl_ref kctlref, u_int32_t kcunit,
    void *unitinfo)
{
#pragma unused(kctlref, kcunit, unitinfo)

	OSDecrementAtomic(&mptcp_kern_skt_inuse);

	return 0;
}
6551
6552 static errno_t
mptcp_symptoms_ctl_send(kern_ctl_ref kctlref,u_int32_t kcunit,void * unitinfo,mbuf_t m,int flags)6553 mptcp_symptoms_ctl_send(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo,
6554 mbuf_t m, int flags)
6555 {
6556 #pragma unused(kctlref, unitinfo, flags)
6557 symptoms_advisory_t *sa = NULL;
6558
6559 if (kcunit != mptcp_kern_skt_unit) {
6560 os_log_error(mptcp_log_handle, "%s: kcunit %u is different from expected one %u\n",
6561 __func__, kcunit, mptcp_kern_skt_unit);
6562 }
6563
6564 if (mbuf_pkthdr_len(m) < sizeof(*sa)) {
6565 mbuf_freem(m);
6566 return EINVAL;
6567 }
6568
6569 if (mbuf_len(m) < sizeof(*sa)) {
6570 os_log_error(mptcp_log_handle, "%s: mbuf is %lu but need %lu\n",
6571 __func__, mbuf_len(m), sizeof(*sa));
6572 mbuf_freem(m);
6573 return EINVAL;
6574 }
6575
6576 sa = mtod(m, void *);
6577
6578 if (sa->sa_nwk_status != SYMPTOMS_ADVISORY_USEAPP) {
6579 os_log(mptcp_log_handle, "%s: wifi new,old: %d,%d, cell new, old: %d,%d\n", __func__,
6580 sa->sa_wifi_status, mptcp_advisory.sa_wifi_status,
6581 sa->sa_cell_status, mptcp_advisory.sa_cell_status);
6582
6583 if (sa->sa_wifi_status != mptcp_advisory.sa_wifi_status) {
6584 mptcp_advisory.sa_wifi_status = sa->sa_wifi_status;
6585 mptcp_wifi_status_changed();
6586 }
6587 } else {
6588 struct mptcp_symptoms_answer answer;
6589 errno_t err;
6590
6591 /* We temporarily allow different sizes for ease of submission */
6592 if (mbuf_len(m) != sizeof(uuid_t) + sizeof(*sa) &&
6593 mbuf_len(m) != sizeof(answer)) {
6594 os_log_error(mptcp_log_handle, "%s: mbuf is %lu but need %lu or %lu\n",
6595 __func__, mbuf_len(m), sizeof(uuid_t) + sizeof(*sa),
6596 sizeof(answer));
6597 mbuf_free(m);
6598 return EINVAL;
6599 }
6600
6601 memset(&answer, 0, sizeof(answer));
6602
6603 err = mbuf_copydata(m, 0, mbuf_len(m), &answer);
6604 if (err) {
6605 os_log_error(mptcp_log_handle, "%s: mbuf_copydata returned %d\n", __func__, err);
6606 mbuf_free(m);
6607 return err;
6608 }
6609
6610 mptcp_allow_uuid(answer.uuid, answer.rssi);
6611 }
6612
6613 mbuf_freem(m);
6614 return 0;
6615 }
6616
6617 void
mptcp_control_register(void)6618 mptcp_control_register(void)
6619 {
6620 /* Set up the advisory control socket */
6621 struct kern_ctl_reg mptcp_kern_ctl;
6622
6623 bzero(&mptcp_kern_ctl, sizeof(mptcp_kern_ctl));
6624 strlcpy(mptcp_kern_ctl.ctl_name, MPTCP_KERN_CTL_NAME,
6625 sizeof(mptcp_kern_ctl.ctl_name));
6626 mptcp_kern_ctl.ctl_connect = mptcp_symptoms_ctl_connect;
6627 mptcp_kern_ctl.ctl_disconnect = mptcp_symptoms_ctl_disconnect;
6628 mptcp_kern_ctl.ctl_send = mptcp_symptoms_ctl_send;
6629 mptcp_kern_ctl.ctl_flags = CTL_FLAG_PRIVILEGED;
6630
6631 (void)ctl_register(&mptcp_kern_ctl, &mptcp_kern_ctrl_ref);
6632 }
6633
6634 mptcp_wifi_quality_t
mptcp_wifi_quality_for_session(struct mptses * mpte)6635 mptcp_wifi_quality_for_session(struct mptses *mpte)
6636 {
6637 if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
6638 if (mpte->mpte_svctype != MPTCP_SVCTYPE_HANDOVER &&
6639 mptcp_advisory.sa_wifi_status) {
6640 return symptoms_is_wifi_lossy() ? MPTCP_WIFI_QUALITY_BAD : MPTCP_WIFI_QUALITY_GOOD;
6641 }
6642
6643 /*
6644 * If it's a first-party app and we don't have any info
6645 * about the Wi-Fi state, let's be pessimistic.
6646 */
6647 return MPTCP_WIFI_QUALITY_UNSURE;
6648 } else {
6649 if (symptoms_is_wifi_lossy()) {
6650 return MPTCP_WIFI_QUALITY_BAD;
6651 }
6652
6653 /*
6654 * If we are target-based (meaning, we allow to be more lax on
6655 * the when wifi is considered bad), we only *know* about the state once
6656 * we got the allowance from Symptoms (MPTE_ACCESS_GRANTED).
6657 *
6658 * If RSSI is not bad enough, MPTE_CELL_PROHIBITED will then
6659 * be set.
6660 *
6661 * In any other case (while in target-mode), consider WiFi bad
6662 * and we are going to ask for allowance from Symptoms anyway.
6663 */
6664 if (mpte->mpte_svctype == MPTCP_SVCTYPE_TARGET_BASED) {
6665 if (mpte->mpte_flags & MPTE_ACCESS_GRANTED &&
6666 mpte->mpte_flags & MPTE_CELL_PROHIBITED) {
6667 return MPTCP_WIFI_QUALITY_GOOD;
6668 }
6669
6670 return MPTCP_WIFI_QUALITY_BAD;
6671 }
6672
6673 return MPTCP_WIFI_QUALITY_GOOD;
6674 }
6675 }
6676
6677 boolean_t
symptoms_is_wifi_lossy(void)6678 symptoms_is_wifi_lossy(void)
6679 {
6680 return (mptcp_advisory.sa_wifi_status & SYMPTOMS_ADVISORY_WIFI_OK) ? false : true;
6681 }
6682
6683 int
mptcp_freeq(struct mptcb * mp_tp)6684 mptcp_freeq(struct mptcb *mp_tp)
6685 {
6686 struct protosw *proto = mptetoso(mp_tp->mpt_mpte)->so_proto;
6687 struct tseg_qent *q;
6688 int count = 0;
6689 int rv = 0;
6690
6691 while ((q = LIST_FIRST(&mp_tp->mpt_segq)) != NULL) {
6692 LIST_REMOVE(q, tqe_q);
6693 m_freem(q->tqe_m);
6694 tcp_reass_qent_free(proto, q);
6695 count++;
6696 rv = 1;
6697 }
6698 mp_tp->mpt_reassqlen = 0;
6699
6700 if (count > 0) {
6701 OSAddAtomic(-count, &mptcp_reass_total_qlen);
6702 }
6703
6704 return rv;
6705 }
6706
6707 static int
mptcp_post_event(u_int32_t event_code,int value)6708 mptcp_post_event(u_int32_t event_code, int value)
6709 {
6710 struct kev_mptcp_data event_data;
6711 struct kev_msg ev_msg;
6712
6713 memset(&ev_msg, 0, sizeof(ev_msg));
6714
6715 ev_msg.vendor_code = KEV_VENDOR_APPLE;
6716 ev_msg.kev_class = KEV_NETWORK_CLASS;
6717 ev_msg.kev_subclass = KEV_MPTCP_SUBCLASS;
6718 ev_msg.event_code = event_code;
6719
6720 event_data.value = value;
6721
6722 ev_msg.dv[0].data_ptr = &event_data;
6723 ev_msg.dv[0].data_length = sizeof(event_data);
6724
6725 return kev_post_msg(&ev_msg);
6726 }
6727
/*
 * Turn the cellular-usage indicator on, on behalf of subflow "mpts" of
 * session "mpte".  The icon is reference-counted globally
 * (mptcp_cellicon_refcount); each subflow/session contributes at most one
 * increment, tracked via MPTSF_CELLICON_SET and mpte_cellicon_increments.
 */
static void
mptcp_set_cellicon(struct mptses *mpte, struct mptsub *mpts)
{
	struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
	int error;

	/* First-party apps (Siri) don't flip the cellicon */
	if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
		return;
	}

	/* Subflow is disappearing - don't set it on this one */
	if (mpts->mpts_flags & (MPTSF_DISCONNECTING | MPTSF_DISCONNECTED)) {
		return;
	}

	/* Fallen back connections are not triggering the cellicon */
	if (mpte->mpte_mptcb->mpt_flags & MPTCPF_FALLBACK_TO_TCP) {
		return;
	}

	/* Remember the last time we set the cellicon. Needed for debouncing */
	mpte->mpte_last_cellicon_set = tcp_now;

	/* Arm the subflow timer that re-evaluates the icon state later. */
	tp->t_timer[TCPT_CELLICON] = tcp_offset_from_start(tp,
	    MPTCP_CELLICON_TOGGLE_RATE);
	tcp_sched_timers(tp);

	if (mpts->mpts_flags & MPTSF_CELLICON_SET &&
	    mpte->mpte_cellicon_increments != 0) {
		if (mptcp_cellicon_refcount == 0) {
			os_log_error(mptcp_log_handle, "%s - %lx: Cell should be set (count is %u), but it's zero!\n",
			    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpte->mpte_cellicon_increments);

			/* Continue, so that the icon gets set... */
		} else {
			/*
			 * In this case, the cellicon is already set. No need to bump it
			 * even higher
			 */

			return;
		}
	}

	/* When tearing down this subflow, we need to decrement the
	 * reference counter
	 */
	mpts->mpts_flags |= MPTSF_CELLICON_SET;

	/* This counter, so that when a session gets destroyed we decrement
	 * the reference counter by whatever is left
	 */
	mpte->mpte_cellicon_increments++;

	/* OSIncrementAtomic returns the previous value: non-zero means the
	 * system-wide icon is already on. */
	if (OSIncrementAtomic(&mptcp_cellicon_refcount)) {
		/* If cellicon is already set, get out of here! */
		return;
	}

	error = mptcp_post_event(KEV_MPTCP_CELLUSE, 1);

	if (error) {
		os_log_error(mptcp_log_handle, "%s - %lx: Setting cellicon failed with %d\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error);
	} else {
		os_log(mptcp_log_handle, "%s - %lx: successfully set the cellicon\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
	}
}
6798
6799 void
mptcp_clear_cellicon(void)6800 mptcp_clear_cellicon(void)
6801 {
6802 int error = mptcp_post_event(KEV_MPTCP_CELLUSE, 0);
6803
6804 if (error) {
6805 os_log_error(mptcp_log_handle, "%s: Unsetting cellicon failed with %d\n",
6806 __func__, error);
6807 } else {
6808 os_log(mptcp_log_handle, "%s: successfully unset the cellicon\n",
6809 __func__);
6810 }
6811 }
6812
6813 /*
6814 * Returns true if the icon has been flipped to WiFi.
6815 */
6816 static boolean_t
__mptcp_unset_cellicon(uint32_t val)6817 __mptcp_unset_cellicon(uint32_t val)
6818 {
6819 VERIFY(val < INT32_MAX);
6820 if (OSAddAtomic((int32_t)-val, &mptcp_cellicon_refcount) != 1) {
6821 return false;
6822 }
6823
6824 mptcp_clear_cellicon();
6825
6826 return true;
6827 }
6828
/*
 * Release "val" cellicon references held by session "mpte".  When "mpts"
 * is non-NULL, only proceed if that subflow actually contributed a
 * reference (MPTSF_CELLICON_SET).  The icon itself is cleared by
 * __mptcp_unset_cellicon() once the global refcount drops to zero.
 */
void
mptcp_unset_cellicon(struct mptses *mpte, struct mptsub *mpts, uint32_t val)
{
	/* First-party apps (Siri) don't flip the cellicon */
	if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
		return;
	}

	if (mpte->mpte_cellicon_increments == 0) {
		/* This flow never used cell - get out of here! */
		return;
	}

	if (mptcp_cellicon_refcount == 0) {
		os_log_error(mptcp_log_handle, "%s - %lx: Cell is off, but should be at least %u\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpte->mpte_cellicon_increments);

		return;
	}

	if (mpts) {
		if (!(mpts->mpts_flags & MPTSF_CELLICON_SET)) {
			return;
		}

		mpts->mpts_flags &= ~MPTSF_CELLICON_SET;
	}

	/* Never decrement by more than this session contributed. */
	if (mpte->mpte_cellicon_increments < val) {
		os_log_error(mptcp_log_handle, "%s - %lx: Increments is %u but want to dec by %u.\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpte->mpte_cellicon_increments, val);
		val = mpte->mpte_cellicon_increments;
	}

	mpte->mpte_cellicon_increments -= val;

	if (__mptcp_unset_cellicon(val) == false) {
		return;
	}

	/* All flows are gone - our counter should be at zero too! */
	if (mpte->mpte_cellicon_increments != 0) {
		os_log_error(mptcp_log_handle, "%s - %lx: Inconsistent state! Cell refcount is zero but increments are at %u\n",
		    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpte->mpte_cellicon_increments);
	}
}
6875
6876 void
mptcp_reset_rexmit_state(struct tcpcb * tp)6877 mptcp_reset_rexmit_state(struct tcpcb *tp)
6878 {
6879 struct mptsub *mpts;
6880 struct inpcb *inp;
6881 struct socket *so;
6882
6883 inp = tp->t_inpcb;
6884 if (inp == NULL) {
6885 return;
6886 }
6887
6888 so = inp->inp_socket;
6889 if (so == NULL) {
6890 return;
6891 }
6892
6893 if (!(so->so_flags & SOF_MP_SUBFLOW)) {
6894 return;
6895 }
6896
6897 mpts = tp->t_mpsub;
6898
6899 mpts->mpts_flags &= ~MPTSF_WRITE_STALL;
6900 so->so_flags &= ~SOF_MP_TRYFAILOVER;
6901 }
6902
6903 void
mptcp_reset_keepalive(struct tcpcb * tp)6904 mptcp_reset_keepalive(struct tcpcb *tp)
6905 {
6906 struct mptsub *mpts = tp->t_mpsub;
6907
6908 mpts->mpts_flags &= ~MPTSF_READ_STALL;
6909 }
6910
/*
 * mppi_alloc callback: allocate an MPTCP PCB embedded inside its
 * containing struct mpp_mtp (Z_NOFAIL means this never returns NULL).
 */
static struct mppcb *
mtcp_alloc(void)
{
	return &kalloc_type(struct mpp_mtp, Z_WAITOK | Z_ZERO | Z_NOFAIL)->mpp;
}
6916
/*
 * mppi_free callback: recover the enclosing struct mpp_mtp from the
 * embedded mppcb and free the allocation made by mtcp_alloc().
 */
static void
mtcp_free(struct mppcb *mpp)
{
	struct mpp_mtp *mtp = __container_of(mpp, struct mpp_mtp, mpp);

	kfree_type(struct mpp_mtp, mtp);
}
6924
6925 /*
6926 * Protocol pr_init callback.
6927 */
6928 void
mptcp_init(struct protosw * pp,struct domain * dp)6929 mptcp_init(struct protosw *pp, struct domain *dp)
6930 {
6931 #pragma unused(dp)
6932 static int mptcp_initialized = 0;
6933 struct protosw *prp;
6934 struct ip6protosw *prp6;
6935
6936 VERIFY((pp->pr_flags & (PR_INITIALIZED | PR_ATTACHED)) == PR_ATTACHED);
6937
6938 /* do this only once */
6939 if (!os_atomic_cmpxchg(&mptcp_initialized, 0, 1, relaxed)) {
6940 return;
6941 }
6942
6943 mptcp_advisory.sa_wifi_status = SYMPTOMS_ADVISORY_WIFI_OK;
6944
6945 /*
6946 * Since PF_MULTIPATH gets initialized after PF_INET/INET6,
6947 * we must be able to find IPPROTO_TCP entries for both.
6948 */
6949 prp = pffindproto_locked(PF_INET, IPPROTO_TCP, SOCK_STREAM);
6950 VERIFY(prp != NULL);
6951 bcopy(prp, &mptcp_subflow_protosw, sizeof(*prp));
6952 bcopy(prp->pr_usrreqs, &mptcp_subflow_usrreqs,
6953 sizeof(mptcp_subflow_usrreqs));
6954 mptcp_subflow_protosw.pr_entry.tqe_next = NULL;
6955 mptcp_subflow_protosw.pr_entry.tqe_prev = NULL;
6956 mptcp_subflow_protosw.pr_usrreqs = &mptcp_subflow_usrreqs;
6957 mptcp_subflow_usrreqs.pru_soreceive = mptcp_subflow_soreceive;
6958 mptcp_subflow_usrreqs.pru_sosend = mptcp_subflow_sosend;
6959 mptcp_subflow_usrreqs.pru_rcvoob = pru_rcvoob_notsupp;
6960 /*
6961 * Socket filters shouldn't attach/detach to/from this protosw
6962 * since pr_protosw is to be used instead, which points to the
6963 * real protocol; if they do, it is a bug and we should panic.
6964 */
6965 mptcp_subflow_protosw.pr_filter_head.tqh_first =
6966 __unsafe_forge_single(struct socket_filter *, 0xdeadbeefdeadbeef);
6967 mptcp_subflow_protosw.pr_filter_head.tqh_last =
6968 __unsafe_forge_single(struct socket_filter **, 0xdeadbeefdeadbeef);
6969
6970 prp6 = (struct ip6protosw *)pffindproto_locked(PF_INET6,
6971 IPPROTO_TCP, SOCK_STREAM);
6972 VERIFY(prp6 != NULL);
6973 bcopy(prp6, &mptcp_subflow_protosw6, sizeof(*prp6));
6974 bcopy(prp6->pr_usrreqs, &mptcp_subflow_usrreqs6,
6975 sizeof(mptcp_subflow_usrreqs6));
6976 mptcp_subflow_protosw6.pr_entry.tqe_next = NULL;
6977 mptcp_subflow_protosw6.pr_entry.tqe_prev = NULL;
6978 mptcp_subflow_protosw6.pr_usrreqs = &mptcp_subflow_usrreqs6;
6979 mptcp_subflow_usrreqs6.pru_soreceive = mptcp_subflow_soreceive;
6980 mptcp_subflow_usrreqs6.pru_sosend = mptcp_subflow_sosend;
6981 mptcp_subflow_usrreqs6.pru_rcvoob = pru_rcvoob_notsupp;
6982 /*
6983 * Socket filters shouldn't attach/detach to/from this protosw
6984 * since pr_protosw is to be used instead, which points to the
6985 * real protocol; if they do, it is a bug and we should panic.
6986 */
6987 mptcp_subflow_protosw6.pr_filter_head.tqh_first =
6988 __unsafe_forge_single(struct socket_filter *, 0xdeadbeefdeadbeef);
6989 mptcp_subflow_protosw6.pr_filter_head.tqh_last =
6990 __unsafe_forge_single(struct socket_filter **, 0xdeadbeefdeadbeef);
6991
6992 bzero(&mtcbinfo, sizeof(mtcbinfo));
6993 TAILQ_INIT(&mtcbinfo.mppi_pcbs);
6994 mtcbinfo.mppi_alloc = mtcp_alloc;
6995 mtcbinfo.mppi_free = mtcp_free;
6996
6997 mtcbinfo.mppi_lock_grp = lck_grp_alloc_init("mppcb", LCK_GRP_ATTR_NULL);
6998 lck_attr_setdefault(&mtcbinfo.mppi_lock_attr);
6999 lck_mtx_init(&mtcbinfo.mppi_lock, mtcbinfo.mppi_lock_grp,
7000 &mtcbinfo.mppi_lock_attr);
7001
7002 mtcbinfo.mppi_gc = mptcp_gc;
7003 mtcbinfo.mppi_timer = mptcp_timer;
7004
7005 /* attach to MP domain for garbage collection to take place */
7006 mp_pcbinfo_attach(&mtcbinfo);
7007
7008 mptcp_log_handle = os_log_create("com.apple.xnu.net.mptcp", "mptcp");
7009 }
7010